"""Layer 1 — deterministic detectors: regex + validators + entropy. Each detector returns spans as dicts: {"start": int, "end": int, "type": str, "value": str, "source": "regex", "confidence": 0.99} High precision is the goal: numeric PII (Aadhaar, card) must pass real checksums (Verhoeff / Luhn) so random 12/16-digit numbers are NOT flagged. """ import math import re import string # --------------------------------------------------------------------------- # Verhoeff checksum (Aadhaar) — identical tables to Appendix A # --------------------------------------------------------------------------- _D = [ [0,1,2,3,4,5,6,7,8,9],[1,2,3,4,0,6,7,8,9,5],[2,3,4,0,1,7,8,9,5,6], [3,4,0,1,2,8,9,5,6,7],[4,0,1,2,3,9,5,6,7,8],[5,9,8,7,6,0,4,3,2,1], [6,5,9,8,7,1,0,4,3,2],[7,6,5,9,8,2,1,0,4,3],[8,7,6,5,9,3,2,1,0,4], [9,8,7,6,5,4,3,2,1,0]] _P = [ [0,1,2,3,4,5,6,7,8,9],[1,5,7,6,2,8,3,0,9,4],[5,8,0,3,7,9,6,1,4,2], [8,9,1,6,0,4,3,5,2,7],[9,4,5,3,1,2,6,8,7,0],[4,2,8,6,5,7,3,9,0,1], [2,7,9,3,8,0,6,4,1,5],[7,0,4,6,9,1,3,2,5,8]] _INV = [0,4,3,2,1,5,6,7,8,9] def verhoeff_validate(number: str) -> bool: """Validate a Verhoeff checksum (last digit is the check digit).""" c = 0 for i, item in enumerate(reversed(number)): c = _D[c][_P[i % 8][int(item)]] return c == 0 # --------------------------------------------------------------------------- # Luhn checksum (cards) # --------------------------------------------------------------------------- def luhn_validate(card: str) -> bool: s = 0 for i, ch in enumerate(reversed(card)): d = int(ch) if i % 2 == 1: d *= 2 if d > 9: d -= 9 s += d return s % 10 == 0 # --------------------------------------------------------------------------- # Shannon entropy (secrets) # --------------------------------------------------------------------------- def shannon_entropy(s: str) -> float: if not s: return 0.0 probs = [s.count(c) / len(s) for c in set(s)] return -sum(p * math.log2(p) for p in probs) # A small dictionary-word check so plain English words of length 
# >= 20
# (rare, but possible in compound text) don't get flagged as secrets.
_COMMON_WORDS = {
    "responsibility",
    "internationalization",
    "characterization",
    "telecommunications",
    "incomprehensibility",
    "disproportionately",
}


def _is_dictionary_word(token: str) -> bool:
    """Return True when *token*, lower-cased, is listed in ``_COMMON_WORDS``."""
    return token.lower() in _COMMON_WORDS


# ---------------------------------------------------------------------------
# Regex patterns
# ---------------------------------------------------------------------------
_EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")

# FIXME(review): SOURCE CORRUPTED HERE — restore from version control.
# The text between the opening of the _PHONE_RE pattern and the tail of
# _find_entropy_secrets is missing from this file (everything from a "<" up
# to a later ">" appears to have been stripped). The surviving fragment is
# preserved verbatim below so nothing further is lost; it is NOT valid code.
#
# --- BEGIN corrupted fragment ---
# _PHONE_RE = re.compile(r"(? threshold and not _is_dictionary_word(token) and not token.isdigit():
#     spans.append({
#         "start": m.start(), "end": m.end(), "type": "SECRET", "value": token,
#         "source": "entropy",
#         "confidence": round(min(0.5 + (ent - threshold) * 0.15, 0.95), 2),
#     })
# return spans
# --- END corrupted fragment ---
#
# Names used further down in this file that must be defined by the missing
# text: _PHONE_RE, _PAN_RE, _IFSC_RE, _IP_RE, _DOB_RE, _AWS_KEY_RE,
# _GITHUB_TOKEN_RE, _SLACK_TOKEN_RE, _JWT_RE, _PRIVATE_KEY_RE,
# _GENERIC_TOKEN_RE, _find_aadhaar, _find_card, _find_upi,
# _find_entropy_secrets. Until they are restored, detect_pii() and
# detect_secrets() raise NameError when called.


def _find_generic_tokens(text):
    """Return SECRET spans for capture group 1 of ``_GENERIC_TOKEN_RE``.

    Captured values shorter than 8 characters are ignored. Span offsets
    are those of group 1, not of the whole match.
    """
    spans = []
    for m in _GENERIC_TOKEN_RE.finditer(text):
        value = m.group(1)
        # group(1) is None when the group did not take part in the match;
        # skip that case instead of failing on len(None).
        if value is None or len(value) < 8:
            continue
        start, end = m.span(1)
        spans.append({
            "start": start,
            "end": end,
            "type": "SECRET",
            "value": value,
            "source": "regex",
            "confidence": 0.9,
        })
    return spans


def _simple_finditer(pattern, type_name, text, confidence=0.99, source="regex"):
    """Return one span dict per non-overlapping match of *pattern* in *text*.

    Args:
        pattern: Compiled regular expression.
        type_name: Value stored under the ``"type"`` key of each span.
        text: String to scan.
        confidence: Value stored under ``"confidence"``.
        source: Value stored under ``"source"``.

    Returns:
        List of span dicts covering each full match (group 0), in the
        order the matches occur in *text*.
    """
    return [
        {
            "start": m.start(),
            "end": m.end(),
            "type": type_name,
            "value": m.group(0),
            "source": source,
            "confidence": confidence,
        }
        for m in pattern.finditer(text)
    ]


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def detect_pii(text: str) -> list[dict]:
    """Detect structured + context-free PII (Layer 1, PII categories).

    Returns the concatenated spans of every PII detector, grouped by
    detector in the order they are invoked below.
    """
    spans = []
    spans += _simple_finditer(_EMAIL_RE, "EMAIL", text)
    spans += _find_aadhaar(text)
    spans += _simple_finditer(_PAN_RE, "PAN", text)
    spans += _simple_finditer(_IFSC_RE, "IFSC", text)
    spans += _find_card(text)
    spans += _simple_finditer(_IP_RE, "IP", text, confidence=0.95)
    spans += _simple_finditer(_DOB_RE, "DOB", text, confidence=0.9)
    spans += _find_upi(text)
    # Phone runs last, so its spans follow the Aadhaar/card spans in the
    # returned list. NOTE(review): nothing here removes phone matches that
    # overlap those spans — presumably a later stage resolves overlaps;
    # confirm against the caller.
    spans += _simple_finditer(_PHONE_RE, "PHONE", text, confidence=0.95)
    return spans


def detect_secrets(text: str) -> list[dict]:
    """Detect credentials / secrets (Layer 1, secret categories).

    Returns the concatenated spans of every secret detector, grouped by
    detector in the order they are invoked below.
    """
    spans = []
    spans += _simple_finditer(_AWS_KEY_RE, "AWS_KEY", text)
    spans += _simple_finditer(_GITHUB_TOKEN_RE, "GITHUB_TOKEN", text)
    spans += _simple_finditer(_SLACK_TOKEN_RE, "SLACK_TOKEN", text)
    spans += _simple_finditer(_JWT_RE, "JWT", text)
    spans += _simple_finditer(_PRIVATE_KEY_RE, "PRIVATE_KEY", text, confidence=0.99)
    spans += _find_generic_tokens(text)
    spans += _find_entropy_secrets(text)
    return spans


def detect_all(text: str, pii: bool = True, secrets: bool = True) -> list[dict]:
    """Run all enabled detector groups and return raw (possibly overlapping) spans.

    Args:
        text: String to scan.
        pii: When true, include the spans from ``detect_pii``.
        secrets: When true, include the spans from ``detect_secrets``.

    Returns:
        PII spans followed by secret spans; an empty list when both
        groups are disabled.
    """
    spans = []
    if pii:
        spans += detect_pii(text)
    if secrets:
        spans += detect_secrets(text)
    return spans


# Alias matching the integration doc's naming (`detectors.detect(text)`).
detect = detect_all