[
  {
    "id": "research-1-1",
    "feature": "Zero-Knowledge Authentication",
    "featureId": 1,
    "featureDesc": "Argon2id + AES-256-GCM client-side — password never leaves device",
    "title": "Cloud trust collapse after SaaS mega-breaches",
    "description": "Cloud trust collapse after SaaS mega-breaches — users refuse to store sensitive data with any server-side-key vendor",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "r/privacy, Privacy Guides Discord",
    "impact": "Market shift to local-first and zero-knowledge tools accelerating 40% YoY since LastPass 2022",
    "quote": "Zero knowledge means the company cannot view, share or decrypt your data — and neither do any infrastructure providers",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-1-2",
    "feature": "Zero-Knowledge Authentication",
    "featureId": 1,
    "featureDesc": "Argon2id + AES-256-GCM client-side — password never leaves device",
    "title": "Vendors falsely advertise 'zero-knowledge'",
    "description": "Vendors falsely advertise 'zero-knowledge' — Privacy Guides community actively investigates and exposes fraudulent ZK claims",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Privacy Guides Discord",
    "impact": "Brand trust collapse for any tool caught misrepresenting ZK architecture; active community watchdog culture",
    "quote": "Drime Cloud falsely advertises zero-knowledge encryption",
    "provenance": "discord"
  },
  {
    "id": "research-1-3",
    "feature": "Zero-Knowledge Authentication",
    "featureId": 1,
    "featureDesc": "Argon2id + AES-256-GCM client-side — password never leaves device",
    "title": "30% of enterprises now require client-side encryption as a hard procurement qualifier",
    "description": "30% of enterprises now require client-side encryption as a hard procurement qualifier — not a preference, a gate",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "PrivSec Discord, Enterprise security",
    "impact": "ZK encryption market: $1.28B (2024) → $7.59B (2033); unlocks enterprise deals blocked at security questionnaire stage",
    "quote": "Zero-knowledge systems: even in a breach, attackers get encrypted data that requires your personal key to decrypt",
    "provenance": "discord"
  },
  {
    "id": "research-1-4",
    "feature": "Zero-Knowledge Authentication",
    "featureId": 1,
    "featureDesc": "Argon2id + AES-256-GCM client-side — password never leaves device",
    "title": "Replay attacks and session hijacking on traditional authentication systems",
    "description": "Replay attacks and session hijacking on traditional authentication systems",
    "source": "reddit",
    "score": 3,
    "severity": "Medium",
    "region": "GLOBAL",
    "community": "r/netsec",
    "impact": "Account compromise, unauthorized PII access without proper authentication",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-1-5",
    "feature": "Zero-Knowledge Authentication",
    "featureId": 1,
    "featureDesc": "Argon2id + AES-256-GCM client-side — password never leaves device",
    "title": "Government subpoena vulnerability",
    "description": "Government subpoena vulnerability — vendors can be compelled to hand over encrypted vaults if keys are held server-side",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "US",
    "community": "r/privacy, r/legaladvice",
    "impact": "Legal exposure; enterprises in regulated industries cite ZK as only safe architecture",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-2-1",
    "feature": "Multi-Language Support (48 Languages)",
    "featureId": 2,
    "featureDesc": "spaCy (25) + Stanza (7) + XLM-RoBERTa (16) — widest commercial coverage",
    "title": "No open-source multilingual PII dataset exists",
    "description": "No open-source multilingual PII dataset exists — root cause of all non-English production failures",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Hugging Face Discord (80K+), ACM 2024, NeurIPS 2025",
    "impact": "Every non-English PII pipeline must build annotated datasets from scratch; IBM annotated 336 locale-specific PII types across 13 locales",
    "quote": "There is no open-source PII-masking dataset sufficiently diverse to enable detection across languages and geographies — ACM 2024",
    "provenance": "discord"
  },
  {
    "id": "research-2-2",
    "feature": "Multi-Language Support (48 Languages)",
    "featureId": 2,
    "featureDesc": "spaCy (25) + Stanza (7) + XLM-RoBERTa (16) — widest commercial coverage",
    "title": "Arabic, Japanese, and Chinese degrade severely in XLM-RoBERTa; MENA and APAC deployments fail silently in production",
    "description": "Arabic, Japanese, and Chinese degrade severely in XLM-RoBERTa; MENA and APAC deployments fail silently in production",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "APAC",
    "community": "Hugging Face Discord, GitHub multilingual NER repo",
    "impact": "APAC and MENA enterprises get zero out-of-box PII detection; $0 to fix with 48-language support",
    "quote": "Arabic-like languages are not presented well by the model, though it still works",
    "provenance": "discord"
  },
  {
    "id": "research-2-3",
    "feature": "Multi-Language Support (48 Languages)",
    "featureId": 2,
    "featureDesc": "spaCy (25) + Stanza (7) + XLM-RoBERTa (16) — widest commercial coverage",
    "title": "NER miss rate rises from 44% to 69% for non-standard entity mentions",
    "description": "NER miss rate rises from 44% to 69% for non-standard entity mentions — a 25-point jump in failure rate in harder multilingual text",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "ML practitioner community, Nature/Scientific Reports",
    "impact": "1-in-3 PII entities missed; in financial/healthcare context = ongoing silent compliance failure",
    "quote": "Performance degrades as identifiers become harder to detect, risk increasing from 44% for standard-form to 69% for non-standard mentions",
    "provenance": "discord"
  },
  {
    "id": "research-2-4",
    "feature": "Multi-Language Support (48 Languages)",
    "featureId": 2,
    "featureDesc": "spaCy (25) + Stanza (7) + XLM-RoBERTa (16) — widest commercial coverage",
    "title": "Low-resource language PII detection fails due to limited annotated training data and linguistic diversity",
    "description": "Low-resource language PII detection fails due to limited annotated training data and linguistic diversity",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "r/MachineLearning, Hugging Face forums",
    "impact": "Teams in non-English markets forced to build expensive custom datasets or accept 30–70% miss rates",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-2-5",
    "feature": "Multi-Language Support (48 Languages)",
    "featureId": 2,
    "featureDesc": "spaCy (25) + Stanza (7) + XLM-RoBERTa (16) — widest commercial coverage",
    "title": "Commercial tools warn that language detection ≠ PII detection; practitioners discover this only after production failure",
    "description": "Commercial tools warn that language detection ≠ PII detection; practitioners discover this only after production failure",
    "source": "discord",
    "score": 3,
    "severity": "Medium",
    "region": "GLOBAL",
    "community": "Private AI developer community",
    "impact": "Enterprise false confidence; silent compliance failure in production",
    "quote": "Detection of a language does not guarantee that the appropriate PII model was used to process the payload — Private AI Docs",
    "provenance": "discord"
  },
  {
    "id": "research-2-6",
    "feature": "Multi-Language Support (48 Languages)",
    "featureId": 2,
    "featureDesc": "spaCy (25) + Stanza (7) + XLM-RoBERTa (16) — widest commercial coverage",
    "title": "German, French, and Spanish require different entity recognition patterns; NER models trained on English degrade on DACH dialects",
    "description": "German, French, and Spanish require different entity recognition patterns; NER models trained on English degrade on DACH dialects",
    "source": "reddit",
    "score": 3,
    "severity": "Medium",
    "region": "DACH",
    "community": "r/de, r/datenschutz, German tech communities",
    "impact": "Steuer-ID, IBAN, and German address formats frequently missed by English-first tools",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-3-1",
    "feature": "Hybrid Recognizer (Regex + NLP + Transformers)",
    "featureId": 3,
    "featureDesc": "Three-tier detection — 30% more precise than vanilla Presidio",
    "title": "Presidio TFN Recognizer assigns 1.0 confidence to false positives",
    "description": "Presidio TFN Recognizer assigns 1.0 confidence to false positives — context check runs after checksum, corrupting spreadsheets and logs",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Presidio GitHub Discussion #1071",
    "impact": "Production pipelines treat random numeric sequences as confirmed PII; data corruption at scale",
    "quote": "The code marks confidence as 1 if it passes the checksum — context words are checked after this step",
    "provenance": "discord"
  },
  {
    "id": "research-3-2",
    "feature": "Hybrid Recognizer (Regex + NLP + Transformers)",
    "featureId": 3,
    "featureDesc": "Three-tier detection — 30% more precise than vanilla Presidio",
    "title": "Presidio en_core_web_lg generates 13,536 false positive name detections across 4,434 samples",
    "description": "Presidio en_core_web_lg generates 13,536 false positive name detections across 4,434 samples — flags pronouns, vessel names, countries",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Presidio GitHub Discussion #1226, Python Discord",
    "impact": "Unusable at production scale without 30–80 hours of tuning; Microsoft confirmed: 'vanilla Presidio isn't very accurate'",
    "quote": "Vanilla Presidio's results aren't very accurate… we see Presidio as a framework rather than a complete solution — Microsoft team",
    "provenance": "discord"
  },
  {
    "id": "research-3-3",
    "feature": "Hybrid Recognizer (Regex + NLP + Transformers)",
    "featureId": 3,
    "featureDesc": "Three-tier detection — 30% more precise than vanilla Presidio",
    "title": "Presidio default precision 0.83 F1 vs hybrid approaches at 94.7%",
    "description": "Presidio default precision 0.83 F1 vs hybrid approaches at 94.7% — 30% accuracy gap in financial document processing",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "arXiv 2404.14465, NeurIPS 2025",
    "impact": "17% of PII entities missed in tightest compliance contexts; financial/healthcare data = direct regulatory exposure",
    "quote": "Configuring Presidio can improve accuracy and boost the F score by approximately 30% — but requires significant engineering investment",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-3-4",
    "feature": "Hybrid Recognizer (Regex + NLP + Transformers)",
    "featureId": 3,
    "featureDesc": "Three-tier detection — 30% more precise than vanilla Presidio",
    "title": "Developers building pipelines for logs and CSVs: too many false positives make automated anonymization unusable",
    "description": "Developers building pipelines for logs and CSVs: too many false positives make automated anonymization unusable",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Presidio GitHub Discussions #388, #804, #1022, #1299",
    "impact": "Loss of automation ROI; every flagged entity requires manual review; teams abandon tool entirely",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-3-5",
    "feature": "Hybrid Recognizer (Regex + NLP + Transformers)",
    "featureId": 3,
    "featureDesc": "Three-tier detection — 30% more precise than vanilla Presidio",
    "title": "False positive rates in structured data: SSN patterns match product codes, timestamps match phone patterns",
    "description": "False positive rates in structured data: SSN patterns match product codes, timestamps match phone patterns",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "US",
    "community": "r/dataengineering, r/MachineLearning",
    "impact": "Manual review overhead eliminates efficiency gains; data pipeline reliability destroyed",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-4-1",
    "feature": "MCP Server Integration",
    "featureId": 4,
    "featureDesc": "Real-time PII filter for Claude Desktop, Cursor, and all MCP tools",
    "title": "77% of enterprise AI users paste company data into public AI tools; 82% use personal accounts",
    "description": "77% of enterprise AI users paste company data into public AI tools; 82% use personal accounts — zero corporate visibility",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "r/ChatGPT, enterprise security Discord, LayerX 2025",
    "impact": "GenAI tools responsible for 32% of all unauthorized corporate data movement; $670K more per breach for high shadow-AI orgs (IBM 2025)",
    "quote": "Generative AI tools have become the leading channel for corporate-to-personal data exfiltration, responsible for 32% of all unauthorized data movement",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-4-2",
    "feature": "MCP Server Integration",
    "featureId": 4,
    "featureDesc": "Real-time PII filter for Claude Desktop, Cursor, and all MCP tools",
    "title": "Samsung leaked semiconductor source code, meeting transcripts, and chip yield tests into ChatGPT 3 times in 20 days",
    "description": "Samsung leaked semiconductor source code, meeting transcripts, and chip yield tests into ChatGPT 3 times in 20 days",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "r/ChatGPT, r/netsec, Cursor Discord (cross-post)",
    "impact": "Industry-wide enterprise AI bans: Apple, JPMorgan, Deutsche Bank, Goldman Sachs, US House of Representatives",
    "quote": "Less than three weeks after Samsung lifted its ban, the company leaked its own secrets at least three times",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-4-3",
    "feature": "MCP Server Integration",
    "featureId": 4,
    "featureDesc": "Real-time PII filter for Claude Desktop, Cursor, and all MCP tools",
    "title": "GitHub MCP server: prompt injection via public issue → AI agent silently exfiltrates private repos and personal salary data",
    "description": "GitHub MCP server: prompt injection via public issue → AI agent silently exfiltrates private repos and personal salary data",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Cursor Discord, Claude Discord, Docker blog widely shared",
    "impact": "13,000+ MCP servers on GitHub expose enterprise data by default; a single malicious issue can trigger private repo leak",
    "quote": "An exploited MCP can pivot across systems without breaking a sweat, putting PII and PHI directly in the crosshairs — MCPcat",
    "provenance": "discord"
  },
  {
    "id": "research-4-4",
    "feature": "MCP Server Integration",
    "featureId": 4,
    "featureDesc": "Real-time PII filter for Claude Desktop, Cursor, and all MCP tools",
    "title": "Cursor sends full codebase including .env files and API keys to external servers by default",
    "description": "Cursor sends full codebase including .env files and API keys to external servers by default — CVE-2025-54135/54136",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Cursor Community Forum #5418, r/cursor_ai",
    "impact": "Entire engineering codebase + secrets transmitted to third party without developer awareness; GDPR Article 44 violation",
    "quote": "I realized my AI tools were leaking sensitive data. So I built a local proxy to stop it",
    "provenance": "discord"
  },
  {
    "id": "research-4-5",
    "feature": "MCP Server Integration",
    "featureId": 4,
    "featureDesc": "Real-time PII filter for Claude Desktop, Cursor, and all MCP tools",
    "title": "Malicious Postmark MCP server with 1,500 weekly downloads silently BCCed all emails to attacker for weeks",
    "description": "Malicious Postmark MCP server with 1,500 weekly downloads silently BCCed all emails to attacker for weeks",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Security Discord, authzed breach timeline",
    "impact": "Supply chain attack via MCP ecosystem; legitimate tool appearance masks data exfiltration",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-4-6",
    "feature": "MCP Server Integration",
    "featureId": 4,
    "featureDesc": "Real-time PII filter for Claude Desktop, Cursor, and all MCP tools",
    "title": "8.5% of LLM prompts sent by enterprise users contain PII",
    "description": "8.5% of LLM prompts sent by enterprise users contain PII — real-time pre-filter would prevent all of it",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "AI security community, Cyberhaven 2024",
    "impact": "Prevention at point-of-paste is 100x cheaper than breach remediation; 15% of employees paste sensitive data unknowingly",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-5-1",
    "feature": "Office Add-in (Word & Excel)",
    "featureId": 5,
    "featureDesc": "Native Word/Excel PII detection with formatting preservation",
    "title": "Word 'redaction' via black boxes is bypassed by copy-paste",
    "description": "Word 'redaction' via black boxes is bypassed by copy-paste — underlying XML text persists; a journalist copy-pasted through it",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Legal Tech Discord, Microsoft Q&A community",
    "impact": "87% of organizations faced PII exposure from inadequate redaction in 2025; structural Word architecture limitation, not user error",
    "quote": "A journalist simply selected and copied the black boxes and subsequently pasted the text into a new document",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-5-2",
    "feature": "Office Add-in (Word & Excel)",
    "featureId": 5,
    "featureDesc": "Native Word/Excel PII detection with formatting preservation",
    "title": "Excel PII redaction requires removing cell values + metadata + formulas + hidden rows",
    "description": "Excel PII redaction requires removing cell values + metadata + formulas + hidden rows — manually unmanageable at scale",
    "source": "both",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Presidio GitHub Discussion #1300, r/excel, compliance communities",
    "impact": "Legal/compliance teams with hundreds of rows of SSNs/bank details unable to redact at scale; Discussion #1300 title is literally the pain",
    "quote": "How to make Microsoft Presidio work with Excel? — GitHub Discussion #1300",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-5-3",
    "feature": "Office Add-in (Word & Excel)",
    "featureId": 5,
    "featureDesc": "Native Word/Excel PII detection with formatting preservation",
    "title": "FOIA agencies: 200,000+ pending requests; 20-day statutory deadline breached systemically; manual Word/PDF redaction untenable",
    "description": "FOIA agencies: 200,000+ pending requests; 20-day statutory deadline breached systemically; manual Word/PDF redaction untenable",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "US",
    "community": "r/FOIA, government Discord, U.S. GAO blog",
    "impact": "AI redaction clears backlogs 32x faster; entire US government FOIA backlog is an addressable market",
    "quote": "Federal agencies process thousands of FOIA requests annually; manual redaction is too slow to meet the 20-day statutory deadline",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-5-4",
    "feature": "Office Add-in (Word & Excel)",
    "featureId": 5,
    "featureDesc": "Native Word/Excel PII detection with formatting preservation",
    "title": "Law firms draft in Word, but redaction requires export to separate tool",
    "description": "Law firms draft in Word, but redaction requires export to separate tool — breaks document chain-of-custody and increases error risk",
    "source": "both",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Legal Tech Discord, r/law, r/paralegal",
    "impact": "GDPR violations start at €20M; HIPAA at $50K per violation; many costly fines trace back to wrong redaction tools",
    "quote": "",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-5-5",
    "feature": "Office Add-in (Word & Excel)",
    "featureId": 5,
    "featureDesc": "Native Word/Excel PII detection with formatting preservation",
    "title": "Word document metadata (author names, tracked changes, revision history) survives visual redaction",
    "description": "Word document metadata (author names, tracked changes, revision history) survives visual redaction",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "US",
    "community": "r/legaladvice, r/law",
    "impact": "DOJ case compromised when metadata wasn't scrubbed from Word documents converted to PDF",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-6-1",
    "feature": "Desktop Application (Offline / Air-Gapped)",
    "featureId": 6,
    "featureDesc": "Tauri/Rust app, fully local, Zero-Knowledge vault, no network required",
    "title": "US defense/government: FedRAMP IL5, ITAR, CJIS prohibit cloud; NARA declared ChatGPT 'unacceptable risk' May 2024",
    "description": "US defense/government: FedRAMP IL5, ITAR, CJIS prohibit cloud; NARA declared ChatGPT 'unacceptable risk' May 2024",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "US",
    "community": "Privacy Guides Discord, government security communities",
    "impact": "Entire US defense/intelligence market requires local-only processing; $112B federal IT annual spend (FY2024)",
    "quote": "In air-gapped environments common in defense, healthcare, and financial services, local inference is not a preference but a hard requirement",
    "provenance": "discord"
  },
  {
    "id": "research-6-2",
    "feature": "Desktop Application (Offline / Air-Gapped)",
    "featureId": 6,
    "featureDesc": "Tauri/Rust app, fully local, Zero-Knowledge vault, no network required",
    "title": "HIPAA BAA restricts cloud vendor use for PHI",
    "description": "HIPAA BAA restricts cloud vendor use for PHI — healthcare orgs must use local-only processing for sensitive clinical data",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "US",
    "community": "Healthcare IT Discord, LocalLLaMA Discord",
    "impact": "Healthcare systems building local-only AI pipelines; ELEKS documented local-only as only viable path for PHI processing",
    "quote": "Cloud was a non-starter for PHI processing; we built local-first — ELEKS case study",
    "provenance": "discord"
  },
  {
    "id": "research-6-3",
    "feature": "Desktop Application (Offline / Air-Gapped)",
    "featureId": 6,
    "featureDesc": "Tauri/Rust app, fully local, Zero-Knowledge vault, no network required",
    "title": "LocalLLaMA Discord (266,500+ members) cites privacy as #1 reason for running local LLMs; Ollama GitHub Issue #12436 requests local-only mode",
    "description": "LocalLLaMA Discord (266,500+ members) cites privacy as #1 reason for running local LLMs; Ollama GitHub Issue #12436 requests local-only mode",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "LocalLLaMA Discord, Ollama Discord",
    "impact": "Massive pre-built community audience for offline-first privacy tools; self-hosted demand growing 40% YoY",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-6-4",
    "feature": "Desktop Application (Offline / Air-Gapped)",
    "featureId": 6,
    "featureDesc": "Tauri/Rust app, fully local, Zero-Knowledge vault, no network required",
    "title": "Cloud fatigue: security-conscious developers and privacy advocates refuse to trust any SaaS that sends data to external servers",
    "description": "Cloud fatigue: security-conscious developers and privacy advocates refuse to trust any SaaS that sends data to external servers",
    "source": "both",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "r/privacy, r/selfhosted, Privacy Guides Discord",
    "impact": "Growing segment of power users will only use fully local tools regardless of price",
    "quote": "",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-6-5",
    "feature": "Desktop Application (Offline / Air-Gapped)",
    "featureId": 6,
    "featureDesc": "Tauri/Rust app, fully local, Zero-Knowledge vault, no network required",
    "title": "Air-gapped research environments (nuclear, defense, biomedical) cannot have any network-connected tools in the processing chain",
    "description": "Air-gapped research environments (nuclear, defense, biomedical) cannot have any network-connected tools in the processing chain",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "US",
    "community": "r/sysadmin, government practitioner communities",
    "impact": "Specialized but mission-critical market; no cloud tool can serve it by definition",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-7-1",
    "feature": "Chrome Extension (JIT Anonymization)",
    "featureId": 7,
    "featureDesc": "Browser-layer PII filter before ChatGPT / Claude / Gemini submission",
    "title": "77% of enterprise employees paste confidential data into AI chat; 82% from personal accounts invisible to corporate IT",
    "description": "77% of enterprise employees paste confidential data into AI chat; 82% from personal accounts invisible to corporate IT",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "r/ChatGPT, enterprise security Discord, LayerX 2025",
    "impact": "Continuous invisible exfiltration at scale; IBM 2025: orgs with high shadow-AI paid $670K more per breach",
    "quote": "With 82% of pastes from unmanaged personal accounts, enterprises have little to no visibility into what data is being shared",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-7-2",
    "feature": "Chrome Extension (JIT Anonymization)",
    "featureId": 7,
    "featureDesc": "Browser-layer PII filter before ChatGPT / Claude / Gemini submission",
    "title": "Urban VPN Chrome Extension (8M users) + 2 others (900K users) stole AI chat conversations in Dec 2025–Jan 2026",
    "description": "Urban VPN Chrome Extension (8M users) + 2 others (900K users) stole AI chat conversations in Dec 2025–Jan 2026",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Security Discord, Dark Reading / Hacker News",
    "impact": "Legitimate privacy extensions face zero-trust market; anonym.legal needs established brand to overcome extension skepticism",
    "quote": "Chrome extension slurps up AI chats after users installed it for privacy — Malwarebytes headline",
    "provenance": "discord"
  },
  {
    "id": "research-7-3",
    "feature": "Chrome Extension (JIT Anonymization)",
    "featureId": 7,
    "featureDesc": "Browser-layer PII filter before ChatGPT / Claude / Gemini submission",
    "title": "Customer support agents paste customer PII into ChatGPT for empathy drafts",
    "description": "Customer support agents paste customer PII into ChatGPT for empathy drafts — Italy fined OpenAI €15M; Google indexes conversations",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "EU",
    "community": "Privacy Guides Discord, GDPR Discord, Wald.ai breach timeline",
    "impact": "Customer support is highest-risk AI paste segment; every paste is a potential GDPR Article 44 violation",
    "quote": "Customer support agent pastes client medical history into ChatGPT — GDPR violation before anonymization begins",
    "provenance": "discord"
  },
  {
    "id": "research-7-4",
    "feature": "Chrome Extension (JIT Anonymization)",
    "featureId": 7,
    "featureDesc": "Browser-layer PII filter before ChatGPT / Claude / Gemini submission",
    "title": "143,000+ AI chat conversations (Claude, Copilot, ChatGPT) were publicly accessible due to missing access controls",
    "description": "143,000+ AI chat conversations (Claude, Copilot, ChatGPT) were publicly accessible due to missing access controls",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "r/privacy, r/netsec",
    "impact": "Highlights that AI tool providers themselves are not securing user conversations; user-side protection is the only reliable layer",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-7-5",
    "feature": "Chrome Extension (JIT Anonymization)",
    "featureId": 7,
    "featureDesc": "Browser-layer PII filter before ChatGPT / Claude / Gemini submission",
    "title": "No corporate policy can prevent personal-device AI tool use",
    "description": "No corporate policy can prevent personal-device AI tool use — bans create workarounds, not compliance",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "r/ChatGPT, r/netsec, enterprise security communities",
    "impact": "Technical control at browser layer is the only enforcement mechanism that works across managed and unmanaged devices",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-8-1",
    "feature": "Reversible Encryption (AES-256-GCM)",
    "featureId": 8,
    "featureDesc": "Unique differentiator — decrypt with key; only tool at this price point",
    "title": "Courts sanction parties who cannot produce original documents behind redactions",
    "description": "Courts sanction parties who cannot produce original documents behind redactions — adverse inference, fee-shifting, compelled re-production",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "US",
    "community": "Legal Tech Discord, Morgan Lewis Q4 2024, Sidley Austin Q1 2025",
    "impact": "Permanent redaction is legally dangerous in litigation; reversible tokenization solves sharing AND production simultaneously",
    "quote": "If you need analytics, machine learning, or legal/archival purposes, reversible methods such as tokenization are your only choice",
    "provenance": "reddit+discord"
  },
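  {
    "id": "research-8-2",
    "feature": "Reversible Encryption (AES-256-GCM)",
    "featureId": 8,
    "featureDesc": "Unique differentiator — decrypt with key; only tool at this price point",
    "title": "Clinical trials: 10–15 year patient follow-up (oncology, cell/gene therapy) requires linking anonymized research data back to patients",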
    "description": "Clinical trials: 10–15 year patient follow-up (oncology, cell/gene therapy) requires linking anonymized research data back to patients",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Healthcare Discord, Datavant 2025, Frontiers 2025",
    "impact": "Irreversible anonymization breaks research continuity for entire drug development pipeline; tokenization is now standard",
    "quote": "Tokenization is now standard for long-term follow-up; irreversible anonymization = research continuity broken — Datavant 2025",
    "provenance": "discord"
  },
  {
    "id": "research-8-3",
    "feature": "Reversible Encryption (AES-256-GCM)",
    "featureId": 8,
    "featureDesc": "Unique differentiator — decrypt with key; only tool at this price point",
    "title": "Financial auditors must verify original figures behind redacted reports",
    "description": "Financial auditors must verify original figures behind redacted reports — TD Bank $3B AML fine demonstrates stakes of missed verification",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Finance/Compliance Discord, IRI documentation",
    "impact": "Audit-grade reversibility is a procurement requirement for financial services tools",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-8-4",
    "feature": "Reversible Encryption (AES-256-GCM)",
    "featureId": 8,
    "featureDesc": "Unique differentiator — decrypt with key; only tool at this price point",
    "title": "HIPAA Safe Harbor de-identification explicitly permits reversible de-identification with key management",
    "description": "HIPAA Safe Harbor de-identification explicitly permits reversible de-identification with key management — but most tools only offer permanent redaction",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "US",
    "community": "r/healthcare, r/HIPAA practitioner communities",
    "impact": "Healthcare organizations need controlled reversibility for research re-contact and billing verification",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-8-5",
    "feature": "Reversible Encryption (AES-256-GCM)",
    "featureId": 8,
    "featureDesc": "Unique differentiator — decrypt with key; only tool at this price point",
    "title": "Law firms anonymize client documents for external review but need to recover originals when deal closes or case settles",
    "description": "Law firms anonymize client documents for external review but need to recover originals when deal closes or case settles",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "r/legaladvice, Legal Tech Discord",
    "impact": "Permanent redaction workflow is incompatible with deal-room and litigation document management",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-9-1",
    "feature": "260+ Entity Types (75+ Countries)",
    "featureId": 9,
    "featureDesc": "Regional national IDs, healthcare, financial, professional identifiers",
    "title": "Presidio defaults cover ~20 entity types (US-centric)",
    "description": "Presidio defaults cover ~20 entity types (US-centric) — misses Steuer-ID, NIR, Personnummer, AHV-Nr, BSN, NIF, Carte Vitale",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "EU",
    "community": "GDPR Discord, Finance Discord, Presidio GitHub docs",
    "impact": "Bloomberg study: 10% of customer tax IDs missing/invalid at top-50 SaaS; GDPR applies equally to all EU national ID formats",
    "quote": "The default phone number recognizer does not support all country codes — Microsoft Presidio official documentation",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-9-2",
    "feature": "260+ Entity Types (75+ Countries)",
    "featureId": 9,
    "featureDesc": "Regional national IDs, healthcare, financial, professional identifiers",
    "title": "$4.5 billion in global KYC/AML fines in 2024 directly linked to identity verification failures including missed country-specific identifiers",
    "description": "$4.5 billion in global KYC/AML fines in 2024 directly linked to identity verification failures including missed country-specific identifiers",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Finance/Compliance Discord, Sumsub, Flagright",
    "impact": "TD Bank $3B AML fine; Starling Bank £28.96M; entity coverage gap = direct AML regulatory exposure",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-9-3",
    "feature": "260+ Entity Types (75+ Countries)",
    "featureId": 9,
    "featureDesc": "Regional national IDs, healthcare, financial, professional identifiers",
    "title": "Healthcare: each hospital uses different MRN format; Presidio misses custom institutional identifiers; HIPAA requires 18 specific PHI types",
    "description": "Healthcare: each hospital uses different MRN format; Presidio misses custom institutional identifiers; HIPAA requires 18 specific PHI types",
    "source": "both",
    "score": 4,
    "severity": "High",
    "region": "US",
    "community": "Healthcare IT Discord, John Snow Labs comparison 2024",
    "impact": "Patient identity exposed when MRN format not recognized; HIPAA violations: $100K–$1.9M per violation category/year",
    "quote": "Presidio does not recognize Aadhar and Health Insurance Claim Numbers (HICNs) correctly — GitHub Issue #1305",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-9-4",
    "feature": "260+ Entity Types (75+ Countries)",
    "featureId": 9,
    "featureDesc": "Regional national IDs, healthcare, financial, professional identifiers",
    "title": "Only 56% of organizations have comprehensive classification distinguishing PII, PHI, and PCI",
    "description": "Only 56% of organizations have comprehensive classification distinguishing PII, PHI, and PCI — 44% using inadequate entity sets",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Healthcare/Finance Discord, Metomic, Nightfall AI",
    "impact": "Off-the-shelf tools with insufficient entity sets force organisations into non-compliance",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-9-5",
    "feature": "260+ Entity Types (75+ Countries)",
    "featureId": 9,
    "featureDesc": "Regional national IDs, healthcare, financial, professional identifiers",
    "title": "Japanese corporate ID formats, My Number (マイナンバー), and organisation-specific identifiers require full custom recognizer builds",
    "description": "Japanese corporate ID formats, My Number (マイナンバー), and organisation-specific identifiers require full custom recognizer builds",
    "source": "discord",
    "score": 3,
    "severity": "Medium",
    "region": "APAC",
    "community": "Mamezou Developer Portal, Japanese developer communities",
    "impact": "APAC enterprises must build custom recognizers for each market they operate in — weeks of engineering per identifier",
    "quote": "It is almost essential to accurately detect Japan-specific information or organisation-specific formats, making customisation necessary",
    "provenance": "discord"
  },
  {
    "id": "research-10-1",
    "feature": "GDPR / ISO 27001 Compliance",
    "featureId": 10,
    "featureDesc": "EU Hetzner data residency, zero-knowledge, DPIA completed, ISO 27001 certified",
    "title": "TikTok €530M fine (May 2025) for EU data transferred to China",
    "description": "TikTok €530M fine (May 2025) for EU data transferred to China — largest data-residency GDPR penalty in history",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "EU",
    "community": "GDPR Discord, Privacy Guides Discord",
    "impact": "Any tool processing EU data on non-EU servers faces the same exposure; zero-knowledge + EU Hetzner = only defensible architecture",
    "quote": "TikTok failed to verify that EU user data accessed by Chinese staff received equivalent protection — Irish DPC May 2025",
    "provenance": "discord"
  },
  {
    "id": "research-10-2",
    "feature": "GDPR / ISO 27001 Compliance",
    "featureId": 10,
    "featureDesc": "EU Hetzner data residency, zero-knowledge, DPIA completed, ISO 27001 certified",
    "title": "EDPB CEF 2025: 764 organizations investigated for right-to-erasure failures; 'inefficient anonymisation' explicitly rejected as deletion substitute",
    "description": "EDPB CEF 2025: 764 organizations investigated for right-to-erasure failures; 'inefficient anonymisation' explicitly rejected as deletion substitute",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "EU",
    "community": "GDPR/Compliance Discord, EDPB official report Feb 2026",
    "impact": "9 DPAs opened formal investigations; regulators now define what counts as 'efficient' anonymization",
    "quote": "Reliance by some controllers on inefficient anonymisation techniques to handle erasure requests as an alternative to deletion — EDPB CEF 2025",
    "provenance": "discord"
  },
  {
    "id": "research-10-3",
    "feature": "GDPR / ISO 27001 Compliance",
    "featureId": 10,
    "featureDesc": "EU Hetzner data residency, zero-knowledge, DPIA completed, ISO 27001 certified",
    "title": "DPO paradox: using a non-GDPR-compliant tool to achieve GDPR compliance",
    "description": "DPO paradox: using a non-GDPR-compliant tool to achieve GDPR compliance — EDPB Guidelines 01/2025 expand what counts as personal data",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "EU",
    "community": "GDPR/Compliance Discord, EU Startup Discord",
    "impact": "€1.3M average annual GDPR compliance spend (Deloitte 2024); DPOs have procurement authority and board-level accountability",
    "quote": "EDPB clarifies: tool infrastructure matters — storing pseudonymization keys on third-country servers undermines the pseudonymization",
    "provenance": "discord"
  },
  {
    "id": "research-10-4",
    "feature": "GDPR / ISO 27001 Compliance",
    "featureId": 10,
    "featureDesc": "EU Hetzner data residency, zero-knowledge, DPIA completed, ISO 27001 certified",
    "title": "ISO 27001 is now a hard procurement gate at 81% of enterprises",
    "description": "ISO 27001 is now a hard procurement gate at 81% of enterprises — uncertified vendors structurally excluded from regulated industry sales",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Enterprise IT Discord, Secfix.com 2025",
    "impact": "Certification reduces sales cycle 30%; certified companies report 10x–30x ROI within first year; deals die at security questionnaire",
    "quote": "In 2025, large enterprises require ISO 27001 certification as a minimum bar for vendor onboarding",
    "provenance": "discord"
  },
  {
    "id": "research-10-5",
    "feature": "GDPR / ISO 27001 Compliance",
    "featureId": 10,
    "featureDesc": "EU Hetzner data residency, zero-knowledge, DPIA completed, ISO 27001 certified",
    "title": "LinkedIn €310M fine for behavioral targeting without valid consent (Oct 2024); GDPR fines 2025 total €2.3B (+38% YoY)",
    "description": "LinkedIn €310M fine for behavioral targeting without valid consent (Oct 2024); GDPR fines 2025 total €2.3B (+38% YoY)",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "EU",
    "community": "GDPR Discord, Privacy Guides Discord, DLA Piper Survey",
    "impact": "Advertising-era data practices now routinely attract nine-figure fines; compliance tooling is board-level spend",
    "quote": "",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-10-6",
    "feature": "GDPR / ISO 27001 Compliance",
    "featureId": 10,
    "featureDesc": "EU Hetzner data residency, zero-knowledge, DPIA completed, ISO 27001 certified",
    "title": "Security questionnaire fatigue: 40",
    "description": "Security questionnaire fatigue: 40–80 hours per questionnaire, 200–400 questions, multiple annually — ISO 27001 cuts burden 80%",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Enterprise vendor communities, Panorays guide",
    "impact": "Without certification: deals stall 3–6 months; with certification: procurement bypasses routine checks automatically",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-11-1",
    "feature": "Token-Based Pricing with Free Tier",
    "featureId": 11,
    "featureDesc": "Free 200 tokens + €3/€15/€29 tiers — no sales call needed",
    "title": "Enterprise PII tools cost $30K",
    "description": "Enterprise PII tools cost $30K–$100K+/year; most require 'contact sales' for pricing — SMBs and startups structurally excluded",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "r/Privacy, r/SaaS, G2/Capterra communities",
    "impact": "Token pricing is only viable entry point for indie/startup teams; fixes the opaque pricing problem competitors universally have",
    "quote": "",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-11-2",
    "feature": "Token-Based Pricing with Free Tier",
    "featureId": 11,
    "featureDesc": "Free 200 tokens + €3/€15/€29 tiers — no sales call needed",
    "title": "Usage-based billing is a strong Reddit community preference",
    "description": "Usage-based billing is a strong Reddit community preference — fixed-seat enterprise licensing viewed as predatory for variable workloads",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "r/Privacy, r/Anticonsumption, r/SaaS",
    "impact": "Token model directly matches stated community preference; reduces churn risk vs. annual fixed contracts",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-11-3",
    "feature": "Token-Based Pricing with Free Tier",
    "featureId": 11,
    "featureDesc": "Free 200 tokens + €3/€15/€29 tiers — no sales call needed",
    "title": "Private AI offers 500 free calls then requires direct vendor contact",
    "description": "Private AI offers 500 free calls then requires direct vendor contact — no self-serve upgrade path frustrates teams that outgrow free tier",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "PII tool comparison Discord communities, Datastreamer 2024",
    "impact": "Self-serve transparent upgrade = competitive differentiator vs. Private AI, Nightfall, and every enterprise tool",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-11-4",
    "feature": "Token-Based Pricing with Free Tier",
    "featureId": 11,
    "featureDesc": "Free 200 tokens + €3/€15/€29 tiers — no sales call needed",
    "title": "GDPR compliance has created an unintended moat for large platforms",
    "description": "GDPR compliance has created an unintended moat for large platforms — SMBs cannot afford enterprise-level compliance tooling",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "EU",
    "community": "GDPR Discord, EU Startup Discord",
    "impact": "SME GDPR fines range €10K–€500K; even modest penalties can be existential for startups without enterprise-grade tools",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-12-1",
    "feature": "Batch Processing (1–5,000 Files)",
    "featureId": 12,
    "featureDesc": "Bulk anonymization, API-accessible, automation-friendly",
    "title": "DSAR volumes +246% (2021",
    "description": "DSAR volumes +246% (2021–2024); 27 staff hours per request; automated processing cuts response time 60%",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "EU",
    "community": "GDPR/Compliance Discord, Termly 2025, DSAR.ai",
    "impact": "DSAR processing at scale is impossible manually; €1.2B in GDPR fines 2024 with deadline violations a key trigger",
    "quote": "Everyone's automating at least part of their DSAR process now, especially the big firms — DSAR.ai 2025",
    "provenance": "discord"
  },
  {
    "id": "research-12-2",
    "feature": "Batch Processing (1–5,000 Files)",
    "featureId": 12,
    "featureDesc": "Bulk anonymization, API-accessible, automation-friendly",
    "title": "FOIA request backlog: 200,000+ pending government-wide; AI batch redaction clears backlogs 32x faster",
    "description": "FOIA request backlog: 200,000+ pending government-wide; AI batch redaction clears backlogs 32x faster",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "US",
    "community": "Government Discord, r/FOIA, U.S. GAO",
    "impact": "Federal agencies miss 20-day statutory deadline systemically; batch AI redaction is the only viable scaling solution",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-12-3",
    "feature": "Batch Processing (1–5,000 Files)",
    "featureId": 12,
    "featureDesc": "Bulk anonymization, API-accessible, automation-friendly",
    "title": "e-Discovery: expanding data volumes (Slack, Teams, mobile, AI-generated content) against strict court deadlines",
    "description": "e-Discovery: expanding data volumes (Slack, Teams, mobile, AI-generated content) against strict court deadlines",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "US",
    "community": "Legal Tech Discord, LawSites 2026",
    "impact": "'Biggest development in the whole history of e-Discovery' — Reed Smith partner on AI e-discovery adoption 2024",
    "quote": "eDiscovery attorneys must uncover evidence faster, ensure defensible practices, and meet court deadlines across expanding datasets",
    "provenance": "discord"
  },
  {
    "id": "research-12-4",
    "feature": "Batch Processing (1–5,000 Files)",
    "featureId": 12,
    "featureDesc": "Bulk anonymization, API-accessible, automation-friendly",
    "title": "dbt pipeline masking policies wiped on rebuild; EDPB 2024 clarified unmasked prod data in dev/test violates GDPR Art. 5",
    "description": "dbt pipeline masking policies wiped on rebuild; EDPB 2024 clarified unmasked prod data in dev/test violates GDPR Art. 5",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "EU",
    "community": "dbt Community Discord, Accutive Security 2025",
    "impact": "Data engineers need persistent anonymization that survives pipeline changes; multiple €8M–€22M fines for weak pseudonymization",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-13-1",
    "feature": "Custom Entity Creation",
    "featureId": 13,
    "featureDesc": "AI-assisted pattern builder — describe in plain language, get regex",
    "title": "Presidio custom recognizers silently fail: PatternRecognizer not recognized by AnalyzerEngine; language registration errors go unnoticed",
    "description": "Presidio custom recognizers silently fail: PatternRecognizer not recognized by AnalyzerEngine; language registration errors go unnoticed",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Presidio GitHub Discussion #1165, #1305, #1463, #1389",
    "impact": "Hours of debugging per custom entity type; 2024 issues still unresolved; practitioners abandon custom recognizer path",
    "quote": "Entity PNR doesn't have the corresponding recognizer in language: sv — GitHub Discussion #1165",
    "provenance": "discord"
  },
  {
    "id": "research-13-2",
    "feature": "Custom Entity Creation",
    "featureId": 13,
    "featureDesc": "AI-assisted pattern builder — describe in plain language, get regex",
    "title": "No built-in medical entity support in Presidio",
    "description": "No built-in medical entity support in Presidio — open GitHub Issue #1491 requests diseases, medications, clinical procedures recognizer",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "US",
    "community": "Healthcare Discord, Presidio GitHub Issue #1491",
    "impact": "Healthcare breaches cost $9.77M average per incident 2024; HIPAA violations: up to $1.5M/year per violation category",
    "quote": "Presidio does not have built-in support for medical entities such as diseases, medications, and clinical procedures",
    "provenance": "discord"
  },
  {
    "id": "research-13-3",
    "feature": "Custom Entity Creation",
    "featureId": 13,
    "featureDesc": "AI-assisted pattern builder — describe in plain language, get regex",
    "title": "LangChain cannot pass custom preloaded Presidio models to its PII pipeline",
    "description": "LangChain cannot pass custom preloaded Presidio models to its PII pipeline — blocks AI+privacy pipeline customization",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "LangChain Discord, GitHub Discussion #19430",
    "impact": "Developers forced to choose between LLM capability and custom privacy compliance in AI pipelines",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-13-4",
    "feature": "Custom Entity Creation",
    "featureId": 13,
    "featureDesc": "AI-assisted pattern builder — describe in plain language, get regex",
    "title": "Only 56% of organizations classify PII, PHI, and PCI comprehensively",
    "description": "Only 56% of organizations classify PII, PHI, and PCI comprehensively — inadequate entity sets leave 44% non-compliant by design",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Healthcare/Finance Discord, Metomic 2024",
    "impact": "Off-the-shelf entity sets guarantee non-compliance for specialized industries; custom builder is the only path",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-13-5",
    "feature": "Custom Entity Creation",
    "featureId": 13,
    "featureDesc": "AI-assisted pattern builder — describe in plain language, get regex",
    "title": "Industry-specific PII (nuclear facility codes, military service numbers, proprietary internal IDs) not covered by any commercial tool",
    "description": "Industry-specific PII (nuclear facility codes, military service numbers, proprietary internal IDs) not covered by any commercial tool",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "US",
    "community": "r/netsec, r/sysadmin, government communities",
    "impact": "Organizations with unique identifier formats must build custom solutions without a no-code option",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-14-1",
    "feature": "Presets System",
    "featureId": 14,
    "featureDesc": "Saved configs, team sharing, consistent policy across all team members",
    "title": "Inconsistent redaction across distributed teams is the most common compliance failure mode",
    "description": "Inconsistent redaction across distributed teams is the most common compliance failure mode — US courts have sanctioned parties for it",
    "source": "both",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Legal Tech Discord, Compliance Discord, Redactable guide",
    "impact": "95% of 2024 data breaches tied to human error; inconsistent redaction = #1 cited ICO/DPA audit finding",
    "quote": "Without standardized policies, every team member potentially redacts documents differently — TermsFeed Redaction Policy guide",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-14-2",
    "feature": "Presets System",
    "featureId": 14,
    "featureDesc": "Saved configs, team sharing, consistent policy across all team members",
    "title": "HIPAA and GDPR require demonstrable, consistent data handling practices across all employees and locations",
    "description": "HIPAA and GDPR require demonstrable, consistent data handling practices across all employees and locations",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Healthcare/Compliance Discord, HIPAA Journal",
    "impact": "HIPAA violations: $1.5M/year per violation category; GDPR Art. 5 requires consistency — tools that can't enforce shared config expose orgs",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-14-3",
    "feature": "Presets System",
    "featureId": 14,
    "featureDesc": "Saved configs, team sharing, consistent policy across all team members",
    "title": "Enterprise tools (Privitar, K2View, Protegrity) all market 'policy-driven anonymization' as core differentiator",
    "description": "Enterprise tools (Privitar, K2View, Protegrity) all market 'policy-driven anonymization' as core differentiator — validates presets as enterprise buyer requirement",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Enterprise IT Discord, Gigantics.io 2025 tool comparison",
    "impact": "SMBs need presets equivalent of enterprise 'policy-driven' tools at fraction of cost; market has validated this as purchase criterion",
    "quote": "Privitar is centred around policy-based anonymization — defining rules depending on data type and laws — K2View 2026",
    "provenance": "discord"
  },
  {
    "id": "research-14-4",
    "feature": "Presets System",
    "featureId": 14,
    "featureDesc": "Saved configs, team sharing, consistent policy across all team members",
    "title": "Government agencies require auditable, standardized redaction documentation",
    "description": "Government agencies require auditable, standardized redaction documentation — 'different people redacted different things' triggers regulatory findings",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "US",
    "community": "Government Discord, Redactor.ai federal guide",
    "impact": "Presets codify policy into the tool — eliminating training dependency and enforcing compliance by design",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-15-1",
    "feature": "Microsoft Presidio Foundation",
    "featureId": 15,
    "featureDesc": "Extended Presidio — three-tier hybrid, managed, no deployment pain",
    "title": "Presidio is 'a framework, not a solution'",
    "description": "Presidio is 'a framework, not a solution' — Microsoft's own words; requires 30–80 hours engineering to deploy reliably",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Presidio GitHub Discussion #1226, Python Discord",
    "impact": "Creates demand for managed Presidio wrapper/service; every hour saved is direct positioning advantage",
    "quote": "We don't have formal results, and this is somewhat intentional since we see Presidio as a framework rather than a solution — Microsoft",
    "provenance": "discord"
  },
  {
    "id": "research-15-2",
    "feature": "Microsoft Presidio Foundation",
    "featureId": 15,
    "featureDesc": "Extended Presidio — three-tier hybrid, managed, no deployment pain",
    "title": "Docker/Kubernetes deployment failures: Issues #1663, #1678, #1746, #1773",
    "description": "Docker/Kubernetes deployment failures: Issues #1663, #1678, #1746, #1773 — sidecar crashes, service mesh conflicts in production",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Presidio GitHub Issues, DevOps Discord",
    "impact": "Operators cannot run Presidio reliably in production without dedicated DevOps support; deployment blocks adoption",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-15-3",
    "feature": "Microsoft Presidio Foundation",
    "featureId": 15,
    "featureDesc": "Extended Presidio — three-tier hybrid, managed, no deployment pain",
    "title": "Presidio's own evaluation page recommends custom models as a workaround for the accuracy gap",
    "description": "Presidio's own evaluation page recommends custom models as a workaround for the accuracy gap — most teams lack ML expertise to do this",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Presidio docs community, ML practitioner Discord",
    "impact": "30% F-score improvement is documented but requires ML engineer skills most teams don't have; 0.83 → 0.95+ precision unlocked",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-16-1",
    "feature": "Real-Time Detection",
    "featureId": 16,
    "featureDesc": "Live PII scanning with confidence scoring before transmission",
    "title": "8.5% of LLM prompts contain PII",
    "description": "8.5% of LLM prompts contain PII — real-time interception before submission is the only prevention that works",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "AI security research, Cyberhaven 2024",
    "impact": "Post-hoc detection misses the window; 15% of employees paste sensitive data unaware they're doing it",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-16-2",
    "feature": "Real-Time Detection",
    "featureId": 16,
    "featureDesc": "Live PII scanning with confidence scoring before transmission",
    "title": "Discord October 2025 breach: 70,000+ government-issued IDs exposed via support channel",
    "description": "Discord October 2025 breach: 70,000+ government-issued IDs exposed via support channel — all text-based PII in a messaging platform",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Security Discord, SecurityWeek, Dark Reading",
    "impact": "Discord itself is a PII exposure vector; real-time scanning of support channels would have caught every ID before it was sent",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-16-3",
    "feature": "Real-Time Detection",
    "featureId": 16,
    "featureDesc": "Live PII scanning with confidence scoring before transmission",
    "title": "Customer support workflows involve real-time pasting of customer data",
    "description": "Customer support workflows involve real-time pasting of customer data — every ticket is a potential GDPR exposure event",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "EU",
    "community": "r/CustomerService, customer support communities",
    "impact": "Real-time PII detection at paste event is the enforcement layer that policy cannot provide",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-17-1",
    "feature": "Multi-Format Document Support",
    "featureId": 17,
    "featureDesc": "PDF, DOCX, XLSX, TXT, CSV, JSON, XML — format-aware extraction",
    "title": "Format fragmentation: organizations process PDF, DOCX, XLSX, CSV, JSON",
    "description": "Format fragmentation: organizations process PDF, DOCX, XLSX, CSV, JSON — each format requires different redaction approach; single-format tools create parallel workflows",
    "source": "both",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Data Engineering Discord, Legal Tech Discord",
    "impact": "Organizations with mixed document types need multiple tools; tool fragmentation creates audit inconsistencies",
    "quote": "",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-17-2",
    "feature": "Multi-Format Document Support",
    "featureId": 17,
    "featureDesc": "PDF, DOCX, XLSX, TXT, CSV, JSON, XML — format-aware extraction",
    "title": "dbt pipeline rebuilds destroy masking policies on CSV and JSON data",
    "description": "dbt pipeline rebuilds destroy masking policies on CSV and JSON data — EDPB 2024 clarifies this violates GDPR Art. 5(1)(a)",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "EU",
    "community": "dbt Community Discord, Accutive Security 2025",
    "impact": "Data engineering teams need format-aware anonymization that persists through pipeline changes",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-17-3",
    "feature": "Multi-Format Document Support",
    "featureId": 17,
    "featureDesc": "PDF, DOCX, XLSX, TXT, CSV, JSON, XML — format-aware extraction",
    "title": "Log files are the neglected PII surface",
    "description": "Log files are the neglected PII surface — developers focus on databases but logs contain API keys, user IDs, IP addresses, session tokens",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Developer Discord, OWASP logging guidance",
    "impact": "Log files where PII goes to be forgotten — often more sensitive than databases; systematic compliance gap",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-17-4",
    "feature": "Multi-Format Document Support",
    "featureId": 17,
    "featureDesc": "PDF, DOCX, XLSX, TXT, CSV, JSON, XML — format-aware extraction",
    "title": "Scanned documents and PDFs with embedded images lose PII protection when converted",
    "description": "Scanned documents and PDFs with embedded images lose PII protection when converted — no tool handles both native text and image text",
    "source": "reddit",
    "score": 3,
    "severity": "Medium",
    "region": "GLOBAL",
    "community": "r/sysadmin, Legal Tech communities",
    "impact": "Hybrid documents (scanned + text) fall through the gap between document and image redaction tools",
    "quote": "",
    "provenance": "reddit"
  },
  {
    "id": "research-18-1",
    "feature": "Text-Based Image PII Detection",
    "featureId": 18,
    "featureDesc": "Text in screenshots and scanned docs — no facial recognition",
    "title": "Microsoft Purview explicitly cannot scan JPEG/PNG",
    "description": "Microsoft Purview explicitly cannot scan JPEG/PNG — text PII in screenshots is completely invisible to the enterprise DLP system",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Enterprise IT Discord, Microsoft Purview documentation",
    "impact": "Gap in every enterprise Microsoft security stack; screenshot-based PII exposure = undetected by default",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-18-2",
    "feature": "Text-Based Image PII Detection",
    "featureId": 18,
    "featureDesc": "Text in screenshots and scanned docs — no facial recognition",
    "title": "SparkCat malware (iOS/Android, Dec 2025) used OCR to steal crypto wallet recovery phrases from screenshots in photo library",
    "description": "SparkCat malware (iOS/Android, Dec 2025) used OCR to steal crypto wallet recovery phrases from screenshots in photo library",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Security Discord, Kaspersky bulletin",
    "impact": "Screenshot PII is an active attack target; malware specifically targeting image-based text PII is in the wild",
    "quote": "SparkCat specifically targeted text content in screenshots using OCR — first mobile malware of this type",
    "provenance": "discord"
  },
  {
    "id": "research-18-3",
    "feature": "Text-Based Image PII Detection",
    "featureId": 18,
    "featureDesc": "Text in screenshots and scanned docs — no facial recognition",
    "title": "87% of organizations at risk from inadequate image-based PII redaction",
    "description": "87% of organizations at risk from inadequate image-based PII redaction — most tools only handle plain text documents",
    "source": "both",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Enterprise security community, Tungsten Automation",
    "impact": "Systematic compliance gap across all sectors; organizations assume their tool covers images when it doesn't",
    "quote": "",
    "provenance": "reddit+discord"
  },
  {
    "id": "research-18-4",
    "feature": "Text-Based Image PII Detection",
    "featureId": 18,
    "featureDesc": "Text in screenshots and scanned docs — no facial recognition",
    "title": "OCR + Presidio coordinate mapping fails on scanned documents",
    "description": "OCR + Presidio coordinate mapping fails on scanned documents — text extracted but bounding boxes misaligned, redacting wrong text",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "ML/NLP engineering Discord, GitHub OCR+Presidio issues",
    "impact": "Scanned document redaction produces unreliable output even when OCR succeeds; pipeline is broken at coordinate translation",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-19-1",
    "feature": "Cross-Platform Consistency",
    "featureId": 19,
    "featureDesc": "Same engine across Web, Desktop, Office, Chrome Extension, MCP",
    "title": "Multi-vendor PII stacks create audit trail gaps",
    "description": "Multi-vendor PII stacks create audit trail gaps — different tools flag different entities; audit cannot reconcile discrepancies",
    "source": "discord",
    "score": 5,
    "severity": "Critical",
    "region": "GLOBAL",
    "community": "Enterprise IT Discord, DevOps Discord, IBM 2025",
    "impact": "60%+ of organizations using 3+ PII tools report audit inconsistencies in cross-platform document reviews",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-19-2",
    "feature": "Cross-Platform Consistency",
    "featureId": 19,
    "featureDesc": "Same engine across Web, Desktop, Office, Chrome Extension, MCP",
    "title": "Inconsistent detection undermines tool trust",
    "description": "Inconsistent detection undermines tool trust — same name detected on Web but not in Office Add-in; practitioners revert to manual review",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Enterprise users, Legal Tech Discord",
    "impact": "Tool fragmentation destroys confidence; organizations return to slow, expensive manual processes",
    "quote": "If the tool gives different results depending on where I use it, I can't trust any of the results",
    "provenance": "discord"
  },
  {
    "id": "research-19-3",
    "feature": "Cross-Platform Consistency",
    "featureId": 19,
    "featureDesc": "Same engine across Web, Desktop, Office, Chrome Extension, MCP",
    "title": "Multi-department tools don't share entity configs; no single audit trail; inconsistency discovered only during regulatory review",
    "description": "Multi-department tools don't share entity configs; no single audit trail; inconsistency discovered only during regulatory review",
    "source": "discord",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "Enterprise IT Discord, cross-platform tool comparison research",
    "impact": "Unified platform is the only solution — not integration between separate tools with different detection engines",
    "quote": "",
    "provenance": "discord"
  },
  {
    "id": "research-19-4",
    "feature": "Cross-Platform Consistency",
    "featureId": 19,
    "featureDesc": "Same engine across Web, Desktop, Office, Chrome Extension, MCP",
    "title": "Enterprise security teams managing separate DLP tools per platform cannot demonstrate consistent PII policy to auditors",
    "description": "Enterprise security teams managing separate DLP tools per platform cannot demonstrate consistent PII policy to auditors",
    "source": "reddit",
    "score": 4,
    "severity": "High",
    "region": "GLOBAL",
    "community": "r/sysadmin, r/netsec, enterprise security communities",
    "impact": "Audit failure on cross-platform consistency = GDPR Article 5 violation; SOC 2 audit finding",
    "quote": "",
    "provenance": "reddit"
  }
]