{
  "meta": {
    "version": "2.0.0",
    "generated": "2026-03-06",
    "solutions": 42,
    "painPoints": 60,
    "scoring": "0=not addressed, 1=partial, 2=fully addressed"
  },
  "solutions": [
    {
      "id": 0,
      "name": "Microsoft Presidio",
      "short": "Presidio",
      "tier": 1,
      "type": "open-source",
      "url": "https://microsoft.github.io/presidio/",
      "github": "https://github.com/microsoft/presidio",
      "entities": "~20 default",
      "langs": 6,
      "detection": "NER (spaCy/Stanza/Transformers) + regex",
      "methods": ["Redact", "Replace", "Mask", "Hash", "Encrypt"],
      "deploy": ["Self-hosted", "Docker", "API"],
      "formats": ["Text", "CSV", "Images"],
      "pricing": { "tier": "Free", "range": "$0 + engineering" },
      "compliance": [],
      "airGap": true,
      "color": "#4fc3f7",
      "strengths": [
        "Microsoft-backed open-source with active community",
        "Extensible recognizer framework for custom entities",
        "Image redaction via Tesseract OCR",
        "Used as detection backend by multiple platforms",
        "Well-documented Python SDK"
      ],
      "limitations": [
        "Only ~20 default entity types",
        "Limited language models (6–8 languages)",
        "No native PDF/DOCX processing",
        "No GUI or desktop application",
        "Requires Python engineering to deploy"
      ],
      "ppCoverage": [1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,2,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1],
      "sources": ["https://microsoft.github.io/presidio/", "https://github.com/microsoft/presidio"]
    },
    {
      "id": 1,
      "name": "ARX Data Anonymization",
      "short": "ARX",
      "tier": 1,
      "type": "open-source",
      "url": "https://arx.deidentifier.org/",
      "github": "https://github.com/arx-deidentifier/arx",
      "entities": "N/A (tabular)",
      "langs": 0,
      "detection": "Statistical (user-defined quasi-identifiers)",
      "methods": ["Generalize", "Suppress", "k-Anonymity", "l-Diversity", "t-Closeness", "DP"],
      "deploy": ["Desktop", "Java library"],
      "formats": ["CSV", "Excel", "Database"],
      "pricing": { "tier": "Free", "range": "$0" },
      "compliance": ["HIPAA Safe Harbor"],
      "airGap": true,
      "color": "#81c784",
      "strengths": [
        "Best-in-class statistical anonymization",
        "Risk quantification and utility measurement",
        "Academic research backing",
        "Desktop GUI for non-developers"
      ],
      "limitations": [
        "Tabular data only — no text or document support",
        "No NER or entity detection",
        "Java-only, limited API",
        "No real-time processing capability",
        "Limited recent development activity"
      ],
      "ppCoverage": [0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0],
      "sources": ["https://arx.deidentifier.org/"]
    },
    {
      "id": 2,
      "name": "Gretel.ai",
      "short": "Gretel",
      "tier": 1,
      "type": "commercial",
      "url": "https://gretel.ai/",
      "github": "https://github.com/gretelai",
      "entities": "~40+",
      "langs": 3,
      "detection": "Transformer NER + regex patterns",
      "methods": ["Replace", "Redact", "Hash", "Synthesize", "Mask"],
      "deploy": ["SaaS", "Hybrid VPC", "Docker"],
      "formats": ["CSV", "JSON", "Parquet", "SQL", "Text"],
      "pricing": { "tier": "Freemium", "range": "$0–$300+/mo" },
      "compliance": ["SOC 2 Type II", "HIPAA BAA"],
      "airGap": false,
      "color": "#7e57c2",
      "strengths": [
        "Best-in-class synthetic data generation",
        "Modern ML pipeline with good DX",
        "Hybrid VPC deployment option",
        "Active development and funding"
      ],
      "limitations": [
        "Primarily structured/tabular data focus",
        "Limited document anonymization",
        "English-centric NER",
        "SaaS pricing scales with volume",
        "No desktop or browser tool"
      ],
      "ppCoverage": [1,1,1,1,0,0,1,1,1,1,0,1,1,0,1,2,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0],
      "sources": ["https://gretel.ai/docs", "https://gretel.ai/pricing"]
    },
    {
      "id": 3,
      "name": "Privitar",
      "short": "Privitar",
      "tier": 1,
      "type": "enterprise",
      "url": "https://www.privitar.com/",
      "github": null,
      "entities": "100+",
      "langs": 5,
      "detection": "ML classification + pattern matching",
      "methods": ["Mask", "Generalize", "Hash", "Encrypt", "Tokenize", "Suppress", "Synthesize", "k-Anonymity", "DP"],
      "deploy": ["On-premise", "Private cloud", "Kubernetes"],
      "formats": ["Database", "Spark", "Hadoop", "Cloud stores"],
      "pricing": { "tier": "Enterprise", "range": "$200K–$500K/yr" },
      "compliance": ["SOC 2", "ISO 27001", "GDPR", "HIPAA"],
      "airGap": true,
      "color": "#26a69a",
      "strengths": [
        "Enterprise-grade data privacy platform",
        "Strong statistical anonymization",
        "Policy-driven approach",
        "Kubernetes-native deployment"
      ],
      "limitations": [
        "No public pricing — enterprise sales only",
        "Primarily structured/tabular data",
        "No document or PDF anonymization",
        "No individual/SMB offering",
        "Acquired by Informatica (2024)"
      ],
      "ppCoverage": [1,2,1,1,0,0,1,1,1,1,1,1,1,0,0,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1,0,0],
      "sources": ["https://www.privitar.com/"]
    },
    {
      "id": 4,
      "name": "BigID",
      "short": "BigID",
      "tier": 2,
      "type": "enterprise",
      "url": "https://bigid.com/",
      "github": null,
      "entities": "100+",
      "langs": 10,
      "detection": "ML classification + NER + correlation",
      "methods": ["Mask", "Tokenize", "Delete"],
      "deploy": ["SaaS", "On-premise", "Hybrid"],
      "formats": ["100+ data sources", "Databases", "Files", "Cloud", "SaaS apps"],
      "pricing": { "tier": "Enterprise", "range": "$100K–$300K/yr" },
      "compliance": ["SOC 2 Type II", "ISO 27701", "GDPR", "CCPA", "HIPAA"],
      "airGap": false,
      "color": "#42a5f5",
      "strengths": [
        "Industry-leading data discovery and classification",
        "ML-powered correlation across data sources",
        "100+ connectors for data sources",
        "DSAR automation capabilities",
        "Strong analyst ratings (Gartner, Forrester)"
      ],
      "limitations": [
        "Primarily discovery — limited built-in anonymization",
        "Very expensive ($100K+ entry)",
        "Complex multi-month implementation",
        "Requires professional services",
        "Not for individual or SMB use"
      ],
      "ppCoverage": [1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0],
      "sources": ["https://bigid.com/"]
    },
    {
      "id": 5,
      "name": "OneTrust",
      "short": "OneTrust",
      "tier": 2,
      "type": "enterprise",
      "url": "https://www.onetrust.com/",
      "github": null,
      "entities": "200+",
      "langs": 100,
      "detection": "ML classification + pattern matching",
      "methods": ["Redact", "Mask"],
      "deploy": ["SaaS"],
      "formats": ["Websites", "Mobile", "SaaS", "Databases", "Cloud"],
      "pricing": { "tier": "Enterprise", "range": "$50K–$300K/yr" },
      "compliance": ["SOC 2", "ISO 27001", "ISO 27701", "GDPR", "CCPA", "LGPD"],
      "airGap": false,
      "color": "#66bb6a",
      "strengths": [
        "Market leader in privacy management",
        "Broadest regulatory coverage (consent, DSAR, cookie)",
        "200+ integrations",
        "Strong GRC and risk capabilities"
      ],
      "limitations": [
        "Not an anonymization tool — governance focused",
        "Very expensive",
        "Complex multi-month implementation",
        "Limited PII transformation capabilities",
        "Cloud-only platform"
      ],
      "ppCoverage": [0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0],
      "sources": ["https://www.onetrust.com/"]
    },
    {
      "id": 6,
      "name": "Protegrity",
      "short": "Protegrity",
      "tier": 2,
      "type": "enterprise",
      "url": "https://www.protegrity.com/",
      "github": null,
      "entities": "Configurable",
      "langs": 0,
      "detection": "Policy-driven classification",
      "methods": ["Tokenize", "Encrypt", "Mask", "Hash"],
      "deploy": ["On-premise", "Cloud", "Hybrid"],
      "formats": ["Databases", "Hadoop", "Mainframes", "Cloud stores"],
      "pricing": { "tier": "Enterprise", "range": "$200K–$1M+/yr" },
      "compliance": ["PCI-DSS", "GDPR", "HIPAA", "SOC 2"],
      "airGap": true,
      "color": "#ef5350",
      "strengths": [
        "Best-in-class tokenization and FPE",
        "Financial services specialization",
        "Hardware security module integration",
        "Mainframe and legacy support"
      ],
      "limitations": [
        "Exclusively enterprise",
        "Primarily structured data tokenization",
        "No document/text anonymization",
        "No public pricing",
        "Niche market focus"
      ],
      "ppCoverage": [1,1,1,1,0,0,1,1,0,1,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0],
      "sources": ["https://www.protegrity.com/"]
    },
    {
      "id": 7,
      "name": "Informatica",
      "short": "Informatica",
      "tier": 2,
      "type": "enterprise",
      "url": "https://www.informatica.com/",
      "github": null,
      "entities": "100+",
      "langs": 20,
      "detection": "ML (CLAIRE AI) + profiling + patterns",
      "methods": ["Mask", "Tokenize", "Encrypt", "Generalize", "Synthesize"],
      "deploy": ["SaaS", "On-premise", "Hybrid"],
      "formats": ["100+ connectors", "Databases", "Files", "Cloud", "Mainframes"],
      "pricing": { "tier": "Enterprise", "range": "$100K–$500K/yr" },
      "compliance": ["SOC 2", "ISO 27001", "GDPR", "HIPAA", "PCI-DSS"],
      "airGap": false,
      "color": "#ff7043",
      "strengths": [
        "Comprehensive data management platform",
        "CLAIRE AI for intelligent classification",
        "Strong test data management",
        "Broadest connector ecosystem",
        "Acquired Privitar for enhanced privacy"
      ],
      "limitations": [
        "Not a dedicated anonymization tool",
        "Extremely expensive",
        "Complex multi-year implementations",
        "Requires specialist consultants",
        "Overkill for document anonymization"
      ],
      "ppCoverage": [1,1,1,1,0,0,1,1,1,1,1,1,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1,0,0],
      "sources": ["https://www.informatica.com/"]
    },
    {
      "id": 8,
      "name": "Spirion",
      "short": "Spirion",
      "tier": 2,
      "type": "enterprise",
      "url": "https://www.spirion.com/",
      "github": null,
      "entities": "300+",
      "langs": 2,
      "detection": "AnyFind: pattern matching + context + validation",
      "methods": ["Redact", "Mask", "Quarantine", "Delete", "Encrypt"],
      "deploy": ["On-premise", "Cloud console", "Endpoint agents"],
      "formats": ["Office", "PDF", "PST", "ZIP", "Databases", "Endpoints"],
      "pricing": { "tier": "Enterprise", "range": "$50K–$150K/yr" },
      "compliance": ["GDPR", "CCPA", "HIPAA", "PCI-DSS", "FERPA"],
      "airGap": true,
      "color": "#ab47bc",
      "strengths": [
        "Strong endpoint PII scanning with validation",
        "Broad file format support",
        "Remediation actions (not just discovery)",
        "FERPA/education sector focus"
      ],
      "limitations": [
        "US-centric PII types",
        "Primarily scanning/discovery",
        "No text-level NER",
        "Aging user interface",
        "Limited API capabilities"
      ],
      "ppCoverage": [1,0,1,1,1,0,1,1,0,0,1,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0],
      "sources": ["https://www.spirion.com/"]
    },
    {
      "id": 9,
      "name": "Google Cloud DLP",
      "short": "Google DLP",
      "tier": 3,
      "type": "cloud",
      "url": "https://cloud.google.com/sensitive-data-protection",
      "github": null,
      "entities": "150+",
      "langs": 25,
      "detection": "ML + regex + dictionary + context",
      "methods": ["Redact", "Replace", "Mask", "Hash", "Encrypt", "Bucketing", "Date-shift"],
      "deploy": ["Cloud API"],
      "formats": ["Text", "Images", "BigQuery", "Cloud Storage"],
      "pricing": { "tier": "Pay-per-use", "range": "$1–3/GB" },
      "compliance": ["SOC 1/2/3", "ISO 27001", "HIPAA BAA", "FedRAMP", "PCI-DSS"],
      "airGap": false,
      "color": "#fdd835",
      "strengths": [
        "Most comprehensive cloud DLP API",
        "150+ built-in infoTypes",
        "Image redaction support",
        "Format-preserving encryption",
        "Strong compliance certifications"
      ],
      "limitations": [
        "Cloud-only — no offline or air-gap",
        "GCP vendor lock-in",
        "Costs scale with data volume",
        "No desktop or browser tool",
        "Requires development effort"
      ],
      "ppCoverage": [1,1,2,2,1,0,1,2,1,1,1,1,1,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0],
      "sources": ["https://cloud.google.com/sensitive-data-protection/docs"]
    },
    {
      "id": 10,
      "name": "AWS Comprehend / Macie",
      "short": "AWS",
      "tier": 3,
      "type": "cloud",
      "url": "https://aws.amazon.com/comprehend/",
      "github": null,
      "entities": "~20 (Comprehend) + 100+ (Macie)",
      "langs": 5,
      "detection": "NLP/ML (Comprehend) + pattern matching (Macie)",
      "methods": ["Redact"],
      "deploy": ["Cloud API"],
      "formats": ["Text", "S3 objects", "CSV", "JSON", "PDF"],
      "pricing": { "tier": "Pay-per-use", "range": "$0.0001/unit" },
      "compliance": ["SOC 1/2/3", "ISO 27001", "HIPAA BAA", "FedRAMP", "PCI-DSS"],
      "airGap": false,
      "color": "#ff9800",
      "strengths": [
        "Deep AWS ecosystem integration",
        "Macie automated S3 scanning",
        "Custom entity recognition (Comprehend)",
        "Pay-per-use pricing model",
        "Strong compliance certifications"
      ],
      "limitations": [
        "Limited PII entity types (Comprehend)",
        "English-centric PII detection",
        "Two separate services — no unified pipeline",
        "No built-in de-identification pipeline",
        "AWS vendor lock-in"
      ],
      "ppCoverage": [1,0,1,1,1,0,0,1,0,0,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
      "sources": ["https://docs.aws.amazon.com/comprehend/"]
    },
    {
      "id": 11,
      "name": "Azure Information Protection",
      "short": "Azure IP",
      "tier": 3,
      "type": "cloud",
      "url": "https://learn.microsoft.com/en-us/purview/",
      "github": null,
      "entities": "300+",
      "langs": 40,
      "detection": "Regex + keyword + ML trainable classifiers + fingerprinting",
      "methods": ["Encrypt", "Restrict", "Label"],
      "deploy": ["SaaS", "On-premise scanner"],
      "formats": ["Office", "PDF", "Email", "Teams", "SharePoint", "Endpoints"],
      "pricing": { "tier": "Per-user", "range": "$12–57/user/mo" },
      "compliance": ["SOC 1/2", "ISO 27001", "HIPAA BAA", "FedRAMP"],
      "airGap": false,
      "color": "#29b6f6",
      "strengths": [
        "Deepest Microsoft 365 integration",
        "300+ sensitive information types",
        "Sensitivity labels with encryption",
        "Endpoint DLP for Windows/Mac",
        "Massive enterprise adoption"
      ],
      "limitations": [
        "Microsoft ecosystem lock-in",
        "DLP/classification — no text-level anonymization",
        "Per-user licensing expensive at scale",
        "Complex policy management",
        "Cannot redact/replace PII in text content"
      ],
      "ppCoverage": [1,0,1,1,1,0,1,1,1,0,0,1,1,0,0,0,1,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
      "sources": ["https://learn.microsoft.com/en-us/purview/"]
    },
    {
      "id": 12,
      "name": "spaCy",
      "short": "spaCy",
      "tier": 4,
      "type": "open-source",
      "url": "https://spacy.io/",
      "github": "https://github.com/explosion/spaCy",
      "entities": "4–18 (NER)",
      "langs": 25,
      "detection": "CNN / Transformer NER",
      "methods": [],
      "deploy": ["Python library", "Docker"],
      "formats": ["Text"],
      "pricing": { "tier": "Free", "range": "$0" },
      "compliance": [],
      "airGap": true,
      "color": "#26c6da",
      "strengths": [
        "Industry standard for production NLP",
        "Excellent speed/accuracy trade-off",
        "25+ language models",
        "Used as Presidio detection backend",
        "Well-documented with commercial support"
      ],
      "limitations": [
        "NER only — zero anonymization capability",
        "Entity types are NER labels, not PII-specific",
        "No regex/pattern matching built-in",
        "Text-only input (no PDF/DOCX/images)",
        "Requires building complete pipeline"
      ],
      "ppCoverage": [0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
      "sources": ["https://spacy.io/models"]
    },
    {
      "id": 13,
      "name": "Stanza",
      "short": "Stanza",
      "tier": 4,
      "type": "open-source",
      "url": "https://stanfordnlp.github.io/stanza/",
      "github": "https://github.com/stanfordnlp/stanza",
      "entities": "4–18 (NER)",
      "langs": 70,
      "detection": "BiLSTM-CRF + Charlm embeddings",
      "methods": [],
      "deploy": ["Python library"],
      "formats": ["Text"],
      "pricing": { "tier": "Free", "range": "$0" },
      "compliance": [],
      "airGap": true,
      "color": "#78909c",
      "strengths": [
        "Broadest language coverage (70+)",
        "Stanford NLP academic backing",
        "Biomedical NER models",
        "Used as Presidio detection backend"
      ],
      "limitations": [
        "NER only — zero anonymization capability",
        "Slower than spaCy for production",
        "Smaller community and ecosystem",
        "Text-only input",
        "Limited industry adoption"
      ],
      "ppCoverage": [0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
      "sources": ["https://stanfordnlp.github.io/stanza/"]
    },
    {
      "id": 14,
      "name": "Hugging Face NER",
      "short": "HF NER",
      "tier": 4,
      "type": "open-source",
      "url": "https://huggingface.co/models?pipeline_tag=token-classification",
      "github": "https://github.com/huggingface/transformers",
      "entities": "4–18 (per model)",
      "langs": 100,
      "detection": "Transformer NER (BERT, RoBERTa, XLM-R, DeBERTa)",
      "methods": [],
      "deploy": ["Python library", "Inference API", "Docker"],
      "formats": ["Text"],
      "pricing": { "tier": "Free", "range": "$0 (Pro $9/mo)" },
      "compliance": [],
      "airGap": true,
      "color": "#ffb74d",
      "strengths": [
        "Largest NER model selection (5,000+)",
        "State-of-the-art accuracy",
        "100+ languages via multilingual models",
        "Active community and model cards"
      ],
      "limitations": [
        "NER only — zero anonymization capability",
        "Models vary wildly in quality",
        "No standardized PII entity taxonomy",
        "Requires ML expertise to select/fine-tune",
        "Heavy compute requirements (GPU recommended)"
      ],
      "ppCoverage": [0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
      "sources": ["https://huggingface.co/models?pipeline_tag=token-classification"]
    }
  ]
}
