"""PII detection. Two complementary detectors, merged into one span list: 1. GLiNER (zero-shot NER, PII-tuned) for fuzzy entities: people, orgs, addresses, dates of birth, etc. 2. Regex recognizers for high-precision structured identifiers that NER models get wrong: emails, phones, IBANs, credit cards, IPs, etc. Everything runs locally (CPU is fine for GLiNER). No external calls. """ from __future__ import annotations import re from dataclasses import dataclass from functools import lru_cache from typing import List # Labels GLiNER is asked to find. Tuned for documents / ID cards. DEFAULT_LABELS = [ "person", "organization", "address", "date of birth", "passport number", "driver license number", "national id number", "bank account number", "phone number", "email address", ] # GLiNER PII-tuned checkpoint. _GLINER_MODEL = "urchade/gliner_multi_pii-v1" @dataclass class Entity: start: int end: int text: str label: str score: float source: str # "gliner" or "regex" # --- Regex recognizers (high precision, deterministic) ----------------------- # Ordered most-specific first; on an equal-score overlap the earlier # recognizer wins, so the greedy phone pattern is deliberately last. _REGEX_RECOGNIZERS = [ ("email address", re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")), ("iban", re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")), ("national id number", re.compile(r"\b\d{3}-\d{2}-\d{4}\b")), # US SSN shape ("credit card number", re.compile(r"\b(?:\d[ -]?){13,16}\b")), ("ip address", re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")), ("phone number", re.compile(r"(? List[Entity]: out: List[Entity] = [] for label, pattern in _REGEX_RECOGNIZERS: for m in pattern.finditer(text): span = m.group().strip() if len(span) < 4: continue # Phone recognizer: ignore dates and short digit runs (serials). if label == "phone number": if _DATE_RE.match(span): continue if sum(c.isdigit() for c in span) < 7: continue out.append( Entity( start=m.start(), end=m.start() + len(span), text=span, label=label, score=1.0, source="regex", ) ) return out # --- GLiNER ------------------------------------------------------------------ @lru_cache(maxsize=1) def gliner_available() -> bool: """True if GLiNER can be imported. Lets the app fall back to regex-only (and tell the user) instead of crashing when GLiNER isn't installed.""" try: import gliner # noqa: F401 return True except Exception: return False @lru_cache(maxsize=1) def _load_gliner(): from gliner import GLiNER return GLiNER.from_pretrained(_GLINER_MODEL) def _gliner_entities(text: str, labels: List[str], threshold: float) -> List[Entity]: model = _load_gliner() preds = model.predict_entities(text, labels, threshold=threshold) return [ Entity( start=p["start"], end=p["end"], text=p["text"], label=p["label"], score=float(p.get("score", 0.0)), source="gliner", ) for p in preds ] # --- Merge ------------------------------------------------------------------- def _resolve_overlaps(entities: List[Entity]) -> List[Entity]: """Keep the highest-scoring entity when spans overlap; regex (score 1.0) wins ties, which is what we want for structured identifiers.""" ordered = sorted(entities, key=lambda e: (-e.score, e.start)) kept: List[Entity] = [] for e in ordered: if any(not (e.end <= k.start or e.start >= k.end) for k in kept): continue kept.append(e) return sorted(kept, key=lambda e: e.start) def detect_entities( text: str, labels: List[str] | None = None, threshold: float = 0.45, use_gliner: bool = True, ) -> List[Entity]: """Return de-duplicated PII spans found in `text`, sorted by position.""" if not text or not text.strip(): return [] labels = labels or DEFAULT_LABELS found: List[Entity] = _regex_entities(text) if use_gliner and gliner_available(): found.extend(_gliner_entities(text, labels, threshold)) return _resolve_overlaps(found)