Spaces:
Sleeping
Sleeping
"""Layer 1 — deterministic detectors: regex + validators + entropy.

Each detector returns spans as dicts of the form:
    {"start": int, "end": int, "type": str, "value": str,
     "source": "regex", "confidence": 0.99}
("source" is "regex" or "entropy"; "confidence" varies by detector.)

High precision is the goal: numeric PII (Aadhaar, card) must pass real
checksums (Verhoeff / Luhn) so random 12/16-digit numbers are NOT flagged.
"""
| import math | |
| import re | |
| import string | |
| # --------------------------------------------------------------------------- | |
| # Verhoeff checksum (Aadhaar) — identical tables to Appendix A | |
| # --------------------------------------------------------------------------- | |
# Verhoeff lookup tables. Kept as plain lists of lists so the module-level
# names keep the same type and contents as before.
#
# _D is indexed as _D[running_checksum][permuted_digit].
_D = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
    [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
    [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
    [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
    [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
    [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
    [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
    [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
    [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
]
# _P is indexed as _P[position_from_right % 8][digit].
_P = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
    [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
    [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
    [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
    [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
    [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
    [7, 0, 4, 6, 9, 1, 3, 2, 5, 8],
]
# Not read by verhoeff_validate; retained so the name stays available.
_INV = [0, 4, 3, 2, 1, 5, 6, 7, 8, 9]


def verhoeff_validate(number: str) -> bool:
    """Validate a Verhoeff checksum (last digit is the check digit).

    Args:
        number: String of decimal digits, check digit last.

    Returns:
        True when the running checksum over all digits ends at 0.
        False for an empty string or a string containing any non-decimal
        character (previously "" validated as True and non-digits raised
        ValueError).
    """
    # str.isdecimal() is False for "" and for anything int() could not parse
    # digit-by-digit, so the loop below can never raise.
    if not number.isdecimal():
        return False
    c = 0
    # Digits are processed right-to-left; the permutation row cycles every 8.
    for i, item in enumerate(reversed(number)):
        c = _D[c][_P[i % 8][int(item)]]
    return c == 0
| # --------------------------------------------------------------------------- | |
| # Luhn checksum (cards) | |
| # --------------------------------------------------------------------------- | |
def luhn_validate(card: str) -> bool:
    """Validate a Luhn (mod 10) checksum.

    Args:
        card: String of decimal digits with separators already removed.

    Returns:
        True when the Luhn sum is a multiple of 10. False for an empty
        string or a string containing any non-decimal character (previously
        "" validated as True and non-digits raised ValueError).
    """
    # str.isdecimal() is False for "" and for any character int() would
    # reject, so the loop below can never raise.
    if not card.isdecimal():
        return False
    total = 0
    # Walk right-to-left; every second digit (odd index) is doubled, and a
    # two-digit product is reduced by 9 (same as summing its digits).
    for i, ch in enumerate(reversed(card)):
        d = int(ch)
        if i % 2 == 1:
            d *= 2
            if d > 9:
                d -= 9
        total += d
    return total % 10 == 0
| # --------------------------------------------------------------------------- | |
| # Shannon entropy (secrets) | |
| # --------------------------------------------------------------------------- | |
def shannon_entropy(s: str) -> float:
    """Return the Shannon entropy of *s* in bits per character.

    An empty (falsy) input yields 0.0. Otherwise each distinct character
    contributes -p * log2(p), where p is its relative frequency in *s*.
    """
    if not s:
        return 0.0
    length = len(s)
    frequencies = (s.count(symbol) / length for symbol in set(s))
    weighted_logs = (freq * math.log2(freq) for freq in frequencies)
    return -sum(weighted_logs)
# Allow-list of long plain-English words. Tokens whose lowercase form is in
# this set are treated as dictionary words and are not reported as
# entropy-based secrets.
_COMMON_WORDS = {
    "characterization",
    "disproportionately",
    "incomprehensibility",
    "internationalization",
    "responsibility",
    "telecommunications",
}
def _is_dictionary_word(token: str) -> bool:
    """Return True when *token*, lowercased, is listed in _COMMON_WORDS."""
    lowered = token.lower()
    return lowered in _COMMON_WORDS
| # --------------------------------------------------------------------------- | |
| # Regex patterns | |
| # --------------------------------------------------------------------------- | |
# E-mail: local part of word chars / ". + -", "@", a domain label, then at
# least one dot-separated suffix.
_EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")
# Phone: 10 digits starting 6-9, optionally prefixed by "91"/"+91" and one
# "-" or whitespace character; never embedded in a longer digit run.
_PHONE_RE = re.compile(r"(?<!\d)(?:\+?91[-\s]?)?[6-9]\d{9}(?!\d)")
# Aadhaar candidate: 12 digits as 4-4-4 with an optional single whitespace
# character between groups (the checksum is verified by the caller).
_AADHAAR_RE = re.compile(r"(?<!\d)\d{4}\s?\d{4}\s?\d{4}(?!\d)")
# PAN: 5 uppercase letters, 4 digits, 1 uppercase letter.
_PAN_RE = re.compile(r"\b[A-Z]{5}[0-9]{4}[A-Z]\b")
# IFSC: 4 uppercase letters, a literal "0", then 6 uppercase alphanumerics.
_IFSC_RE = re.compile(r"\b[A-Z]{4}0[A-Z0-9]{6}\b")
# Card candidate: 13-19 digits, each optionally separated by one space or
# hyphen (the checksum is verified by the caller).
_CARD_RE = re.compile(r"(?<!\d)\d(?:[ -]?\d){12,18}(?!\d)")
# IPv4: four dot-separated octets, each limited to 0-255 (leading zeros are
# still accepted). Previously any 1-3 digit group matched, so strings such
# as "999.999.999.999" were reported as IPs.
_IP_RE = re.compile(
    r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}"
    r"(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\b"
)
# Date of birth: DD/MM/YYYY or DD-MM-YYYY with years 1900-2099.
_DOB_RE = re.compile(r"\b(?:0[1-9]|[12]\d|3[01])[/-](?:0[1-9]|1[0-2])[/-](?:19|20)\d{2}\b")
# UPI handle candidate: "<id>@<letters>". The trailing negative lookahead
# rejects handles that continue as a domain name ("user@paytm.com",
# "user@paytm-mail.com") so ordinary e-mail addresses are not matched; a
# sentence-ending "." after the handle is still allowed.
_UPI_RE = re.compile(r"\b[\w.\-]{2,256}@(?:[a-zA-Z]{3,64})\b(?![.\-]\w)")
# Secrets
_AWS_KEY_RE = re.compile(r"\b(?:AKIA|ASIA)[A-Z0-9]{16}\b")
_GITHUB_TOKEN_RE = re.compile(r"\bgh[pousr]_[A-Za-z0-9]{36,255}\b")
_SLACK_TOKEN_RE = re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,72}\b")
_JWT_RE = re.compile(r"\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b")
# PEM private key block; DOTALL lets ".*?" span the multi-line key body.
_PRIVATE_KEY_RE = re.compile(
    r"-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----.*?-----END [A-Z0-9 ]*PRIVATE KEY-----",
    re.DOTALL,
)
# Generic API-token-like assignments: api_key="...", token: '...', secret=...
# Group 1 captures the value only (at least 16 characters, quotes excluded).
_GENERIC_TOKEN_RE = re.compile(
    r"(?i)\b(?:api[_-]?key|api[_-]?secret|access[_-]?token|secret[_-]?key|"
    r"auth[_-]?token|password|token)\s*[:=]\s*[\"']?([A-Za-z0-9/_+\-]{16,})[\"']?"
)
# Indian UPI handles (specific bank suffixes, narrower than generic email-ish)
_UPI_SUFFIXES = (
    "okhdfcbank", "oksbi", "okicici", "okaxis", "ybl", "paytm", "ibl", "axl", "apl",
)
def _find_aadhaar(text):
    """Return AADHAAR spans for 12-digit candidates that pass Verhoeff.

    Args:
        text: The string to scan.

    Returns:
        A list of span dicts (start, end, type, value, source, confidence).
        "value" is the matched text including any separators; start/end are
        offsets of that match in *text*.
    """
    spans = []
    for m in _AADHAAR_RE.finditer(text):
        # _AADHAAR_RE permits any single whitespace character (\s) between
        # the 4-digit groups, so strip all whitespace here. Removing only " "
        # left tabs/newlines in place and the length check below then
        # silently dropped those candidates.
        digits = re.sub(r"\s", "", m.group(0))
        if len(digits) == 12 and verhoeff_validate(digits):
            spans.append({
                "start": m.start(), "end": m.end(), "type": "AADHAAR",
                "value": m.group(0), "source": "regex", "confidence": 0.99,
            })
    return spans
def _find_card(text):
    """Return CARD spans for digit runs that pass the Luhn check.

    Each _CARD_RE match has its spaces and hyphens removed; the remaining
    digits must number 13-19 and satisfy luhn_validate. The reported value
    is the matched text with its separators intact.
    """
    found = []
    for match in _CARD_RE.finditer(text):
        raw = match.group(0)
        digits = raw.replace(" ", "").replace("-", "")
        if not 13 <= len(digits) <= 19:
            continue
        if not luhn_validate(digits):
            continue
        found.append({
            "start": match.start(),
            "end": match.end(),
            "type": "CARD",
            "value": raw,
            "source": "regex",
            "confidence": 0.99,
        })
    return found
def _find_upi(text):
    """Return UPI spans for handles whose suffix is in _UPI_SUFFIXES.

    The part after the last "@" is compared case-insensitively against the
    known suffix list; handles with any other suffix are ignored.
    """
    found = []
    for match in _UPI_RE.finditer(text):
        candidate = match.group(0)
        provider = candidate.rpartition("@")[2].lower()
        if provider not in _UPI_SUFFIXES:
            continue
        found.append({
            "start": match.start(),
            "end": match.end(),
            "type": "UPI",
            "value": candidate,
            "source": "regex",
            "confidence": 0.97,
        })
    return found
def _find_entropy_secrets(text, threshold=4.0, min_len=20):
    """Return SECRET spans for long, high-entropy tokens.

    A token is a run of at least *min_len* characters drawn from
    [A-Za-z0-9+/=_-]. It is reported when its Shannon entropy exceeds
    *threshold*, it is not a listed dictionary word, and it is not all
    digits. Confidence starts at 0.5, grows by 0.15 per bit of entropy
    above the threshold, and is capped at 0.95.
    """
    candidate_re = re.compile(r"[A-Za-z0-9+/=_\-]{%d,}" % min_len)
    found = []
    for match in candidate_re.finditer(text):
        token = match.group(0)
        ent = shannon_entropy(token)
        if not ent > threshold:
            continue
        if _is_dictionary_word(token):
            continue
        if token.isdigit():
            continue
        score = min(0.5 + (ent - threshold) * 0.15, 0.95)
        found.append({
            "start": match.start(),
            "end": match.end(),
            "type": "SECRET",
            "value": token,
            "source": "entropy",
            "confidence": round(score, 2),
        })
    return found
def _find_generic_tokens(text):
    """Return SECRET spans for values in key/token/password assignments.

    Args:
        text: The string to scan.

    Returns:
        A list of span dicts. start/end/value describe capture group 1 of
        _GENERIC_TOKEN_RE (the assigned value only), not the whole
        "name = value" match.
    """
    spans = []
    for m in _GENERIC_TOKEN_RE.finditer(text):
        # No minimum-length check is needed: group 1 of _GENERIC_TOKEN_RE
        # only matches runs of 16 or more characters, so the former
        # `len(value) < 8` guard could never trigger.
        start, end = m.span(1)
        spans.append({
            "start": start, "end": end, "type": "SECRET",
            "value": m.group(1), "source": "regex", "confidence": 0.9,
        })
    return spans
def _simple_finditer(pattern, type_name, text, confidence=0.99, source="regex"):
    """Turn every match of *pattern* in *text* into a span dict.

    No validation is applied: each match becomes one span covering the
    whole match, tagged with *type_name*, *source* and *confidence*.
    """
    spans = []
    for match in pattern.finditer(text):
        span = {
            "start": match.start(),
            "end": match.end(),
            "type": type_name,
            "value": match.group(0),
            "source": source,
            "confidence": confidence,
        }
        spans.append(span)
    return spans
| # --------------------------------------------------------------------------- | |
| # Public API | |
| # --------------------------------------------------------------------------- | |
def detect_pii(text: str) -> list[dict]:
    """Detect structured + context-free PII (Layer 1, PII categories).

    Runs every PII detector over the full *text* and concatenates their
    spans in a fixed order: EMAIL, AADHAAR, PAN, IFSC, CARD, IP, DOB, UPI,
    PHONE.
    """
    per_detector = [
        _simple_finditer(_EMAIL_RE, "EMAIL", text),
        _find_aadhaar(text),
        _simple_finditer(_PAN_RE, "PAN", text),
        _simple_finditer(_IFSC_RE, "IFSC", text),
        _find_card(text),
        _simple_finditer(_IP_RE, "IP", text, confidence=0.95),
        _simple_finditer(_DOB_RE, "DOB", text, confidence=0.9),
        _find_upi(text),
        # PHONE is appended last. Nothing here removes overlaps: the phone
        # pattern scans the whole text, so its spans may coincide with
        # AADHAAR/CARD spans and are returned as-is.
        _simple_finditer(_PHONE_RE, "PHONE", text, confidence=0.95),
    ]
    return [span for group in per_detector for span in group]
def detect_secrets(text: str) -> list[dict]:
    """Detect credentials / secrets (Layer 1, secret categories).

    Spans are concatenated in a fixed order: AWS keys, GitHub tokens, Slack
    tokens, JWTs, private-key blocks, generic token assignments, then
    entropy-based candidates.
    """
    found = []
    # Format-specific credentials matched by pattern alone.
    found.extend(_simple_finditer(_AWS_KEY_RE, "AWS_KEY", text))
    found.extend(_simple_finditer(_GITHUB_TOKEN_RE, "GITHUB_TOKEN", text))
    found.extend(_simple_finditer(_SLACK_TOKEN_RE, "SLACK_TOKEN", text))
    found.extend(_simple_finditer(_JWT_RE, "JWT", text))
    found.extend(
        _simple_finditer(_PRIVATE_KEY_RE, "PRIVATE_KEY", text, confidence=0.99)
    )
    # Heuristic detectors: keyword assignments, then high-entropy tokens.
    found.extend(_find_generic_tokens(text))
    found.extend(_find_entropy_secrets(text))
    return found
def detect_all(text: str, pii: bool = True, secrets: bool = True) -> list[dict]:
    """Run all enabled detector groups and return raw (possibly overlapping) spans.

    PII spans (when *pii* is true) come first, followed by secret spans
    (when *secrets* is true). No de-duplication or overlap resolution is
    performed here.
    """
    collected = []
    for enabled, detector in ((pii, detect_pii), (secrets, detect_secrets)):
        if enabled:
            collected.extend(detector(text))
    return collected


# Alias matching the integration doc's naming (`detectors.detect(text)`).
detect = detect_all