Spaces:
Running on Zero
Running on Zero
| """PII detection. | |
| Two complementary detectors, merged into one span list: | |
| 1. GLiNER (zero-shot NER, PII-tuned) for fuzzy entities: people, orgs, | |
| addresses, dates of birth, etc. | |
| 2. Regex recognizers for high-precision structured identifiers that NER | |
| models get wrong: emails, phones, IBANs, credit cards, IPs, etc. | |
| Everything runs locally (CPU is fine for GLiNER). No external calls. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from dataclasses import dataclass | |
| from functools import lru_cache | |
| from typing import List | |
# Entity types GLiNER is prompted with by default. The selection is tuned for
# documents and ID cards.
DEFAULT_LABELS = (
    # People, organisations and personal details.
    ["person", "organization", "address", "date of birth"]
    # Issued identifiers.
    + [
        "passport number",
        "driver license number",
        "national id number",
        "bank account number",
    ]
    # Contact details.
    + ["phone number", "email address"]
)

# PII-tuned GLiNER checkpoint, passed to GLiNER.from_pretrained().
_GLINER_MODEL = "urchade/gliner_multi_pii-v1"
@dataclass
class Entity:
    """One detected PII span.

    Declared as a dataclass so instances can be built with the keyword
    arguments the detectors use and get field-wise equality and a readable
    repr.

    Attributes:
        start: Offset of the first character of the span in the scanned text.
        end: Offset one past the last character of the span.
        text: The matched substring.
        label: Entity type, e.g. "person" or "email address".
        score: Detector confidence; regex matches are created with 1.0.
        source: Detector that produced the span, "gliner" or "regex".
    """

    start: int
    end: int
    text: str
    label: str
    score: float
    source: str  # "gliner" or "regex"
# --- Regex recognizers (high precision, deterministic) -----------------------
# Listed from most to least specific. When two matches with the same score
# overlap, the recognizer listed first wins, which is why the greedy phone
# pattern comes last.
_REGEX_RECOGNIZERS = [
    (label, re.compile(pattern))
    for label, pattern in (
        ("email address", r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
        ("iban", r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b"),
        # US SSN shape
        ("national id number", r"\b\d{3}-\d{2}-\d{4}\b"),
        ("credit card number", r"\b(?:\d[ -]?){13,16}\b"),
        ("ip address", r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
        (
            "phone number",
            r"(?<!\w)(?:\+?\d{1,3}[\s.-]?)?(?:\(?\d{2,4}\)?[\s.-]?){2,4}\d{2,4}(?!\w)",
        ),
    )
]

# Year-first and year-last date shapes; used to reject phone-pattern matches
# that are really dates.
_DATE_RE = re.compile(
    r"^\d{4}[./-]\d{1,2}[./-]\d{1,2}$"
    r"|^\d{1,2}[./-]\d{1,2}[./-]\d{2,4}$"
)
def _regex_entities(text: str) -> List[Entity]:
    """Collect matches from every regex recognizer in `text`.

    Each accepted match becomes an `Entity` with score 1.0 and source
    "regex". Matches are trimmed of surrounding whitespace and of a trailing
    " " / "-" separator, and spans shorter than four characters are dropped.
    Phone-number matches are also dropped when they look like a date or
    contain fewer than seven digits.

    Args:
        text: The text to scan.

    Returns:
        One entity per accepted match, grouped by recognizer in
        `_REGEX_RECOGNIZERS` order. Overlapping matches from different
        recognizers are all returned; nothing is de-duplicated here.
    """
    out: List[Entity] = []
    for label, pattern in _REGEX_RECOGNIZERS:
        is_phone = label == "phone number"
        for m in pattern.finditer(text):
            raw = m.group()
            # Offsets must describe the trimmed span, so account for any
            # leading whitespace instead of assuming the match starts clean.
            start = m.start() + (len(raw) - len(raw.lstrip()))
            # The credit-card pattern may end on one of its optional
            # separators (e.g. "4111111111111111-x" matches with the "-");
            # keep only the identifier itself.
            span = raw.strip().rstrip(" -")
            if len(span) < 4:
                continue
            # Phone recognizer: ignore dates and short digit runs (serials).
            if is_phone:
                if _DATE_RE.match(span):
                    continue
                if sum(c.isdigit() for c in span) < 7:
                    continue
            out.append(
                Entity(
                    start=start,
                    end=start + len(span),
                    text=span,
                    label=label,
                    score=1.0,
                    source="regex",
                )
            )
    return out
# --- GLiNER ------------------------------------------------------------------
def gliner_available() -> bool:
    """Report whether the `gliner` package can be imported.

    Callers use this to fall back to regex-only detection (and say so to the
    user) rather than crash when GLiNER is not installed.

    Returns:
        True if `import gliner` succeeds, False if it raises any `Exception`.
    """
    try:
        import gliner  # noqa: F401
    except Exception:
        # Any import-time failure counts as "not available".
        return False
    else:
        return True
@lru_cache(maxsize=1)
def _load_gliner():
    """Load the GLiNER model named by `_GLINER_MODEL` and return it.

    The result is memoized so the checkpoint is loaded once per process
    rather than on every detection call. `gliner` is imported lazily here so
    the module can still be imported when the package is missing.

    Returns:
        The object returned by `GLiNER.from_pretrained(_GLINER_MODEL)`.
    """
    from gliner import GLiNER

    return GLiNER.from_pretrained(_GLINER_MODEL)
def _gliner_entities(text: str, labels: List[str], threshold: float) -> List[Entity]:
    """Run GLiNER over `text` and wrap its predictions as entities.

    Args:
        text: The text to scan.
        labels: Entity labels passed to the model.
        threshold: Forwarded to `predict_entities` as its `threshold` argument.

    Returns:
        One `Entity` per prediction, in the order the model returned them,
        each tagged with source "gliner". A prediction with no "score" key
        gets a score of 0.0.
    """
    predictions = _load_gliner().predict_entities(text, labels, threshold=threshold)

    entities: List[Entity] = []
    for pred in predictions:
        entities.append(
            Entity(
                start=pred["start"],
                end=pred["end"],
                text=pred["text"],
                label=pred["label"],
                score=float(pred.get("score", 0.0)),
                source="gliner",
            )
        )
    return entities
# --- Merge -------------------------------------------------------------------
def _resolve_overlaps(entities: List[Entity]) -> List[Entity]:
    """Drop overlapping spans, keeping the best-scoring one in each clash.

    Candidates are considered from highest score down. Ties go to the
    earlier start, then to the earlier position in `entities`. A candidate is
    kept only if it shares no characters with a span already kept; spans that
    merely touch (one ends where the next starts) do not clash. Regex matches
    carry score 1.0, so they win ties for structured identifiers.

    Args:
        entities: Candidate spans from all detectors.

    Returns:
        A new list of the surviving spans, sorted by start offset.
    """
    by_priority = sorted(entities, key=lambda ent: (-ent.score, ent.start))

    winners: List[Entity] = []
    for candidate in by_priority:
        # Disjoint from a winner means ending at/before it or starting at/after it.
        if all(
            candidate.end <= w.start or candidate.start >= w.end for w in winners
        ):
            winners.append(candidate)

    winners.sort(key=lambda ent: ent.start)
    return winners
def detect_entities(
    text: str,
    labels: List[str] | None = None,
    threshold: float = 0.45,
    use_gliner: bool = True,
) -> List[Entity]:
    """Find PII spans in `text` and return them without overlaps, in order.

    Regex recognizers always run. GLiNER runs as well when `use_gliner` is
    true and the `gliner` package can be imported. The combined candidates
    are then reduced to a non-overlapping set.

    Args:
        text: The text to scan. Empty or whitespace-only input yields [].
        labels: Labels to ask GLiNER for; `DEFAULT_LABELS` when None or empty.
        threshold: Score threshold forwarded to GLiNER.
        use_gliner: Set to False to run the regex recognizers only.

    Returns:
        The surviving entities sorted by start offset.
    """
    if not (text and text.strip()):
        return []

    candidates: List[Entity] = _regex_entities(text)
    if use_gliner and gliner_available():
        active_labels = labels or DEFAULT_LABELS
        candidates.extend(_gliner_entities(text, active_labels, threshold))
    return _resolve_overlaps(candidates)