# PrivacyShield / detectors.py
# Source: uploaded by perceptron01 ("Upload 6 files", commit e431b8d, verified), 8.36 kB.
"""Layer 1 — deterministic detectors: regex + validators + entropy.
Each detector returns spans as dicts:
{"start": int, "end": int, "type": str, "value": str,
"source": "regex", "confidence": 0.99}
High precision is the goal: numeric PII (Aadhaar, card) must pass real
checksums (Verhoeff / Luhn) so random 12/16-digit numbers are NOT flagged.
"""
import math
import re
import string
from collections import Counter
# ---------------------------------------------------------------------------
# Verhoeff checksum (Aadhaar) — identical tables to Appendix A
# ---------------------------------------------------------------------------
# Multiplication table of the Verhoeff scheme: _D[c][x] folds the permuted
# digit x into the running checksum c.
_D = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
    [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
    [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
    [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
    [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
    [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
    [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
    [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
    [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
]
# Permutation table, indexed by (position from the right modulo 8, digit).
_P = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
    [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
    [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
    [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
    [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
    [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
    [7, 0, 4, 6, 9, 1, 3, 2, 5, 8],
]
# Inverse table; not used by verhoeff_validate itself.
_INV = [0, 4, 3, 2, 1, 5, 6, 7, 8, 9]


def verhoeff_validate(number: str) -> bool:
    """Validate a Verhoeff checksum (last digit is the check digit).

    Args:
        number: The full digit string, check digit included.

    Returns:
        True when the checksum over all digits is zero. Empty strings and
        strings containing anything other than decimal digits are rejected
        with False instead of passing vacuously or raising ValueError.
    """
    # isdecimal() is False for "" and for any character int() cannot parse,
    # so the loop below can never raise and "" can never validate.
    if not number.isdecimal():
        return False
    checksum = 0
    # Digits are processed right-to-left; the position selects the permutation.
    for position, digit in enumerate(reversed(number)):
        checksum = _D[checksum][_P[position % 8][int(digit)]]
    return checksum == 0
# ---------------------------------------------------------------------------
# Luhn checksum (cards)
# ---------------------------------------------------------------------------
def luhn_validate(card: str) -> bool:
    """Validate a digit string with the Luhn (mod 10) checksum.

    Args:
        card: The candidate number as a string of digits, separators removed.

    Returns:
        True when the Luhn sum is a multiple of 10. Empty strings and strings
        containing non-digit characters return False rather than passing
        vacuously or raising ValueError.
    """
    # isdecimal() is False for "" and for any character int() cannot parse.
    if not card.isdecimal():
        return False
    total = 0
    for index, ch in enumerate(reversed(card)):
        digit = int(ch)
        # Every second digit from the right is doubled; a two-digit result is
        # reduced to the sum of its digits (equivalent to subtracting 9).
        if index % 2 == 1:
            digit *= 2
            if digit > 9:
                digit -= 9
        total += digit
    return total % 10 == 0
# ---------------------------------------------------------------------------
# Shannon entropy (secrets)
# ---------------------------------------------------------------------------
def shannon_entropy(s: str) -> float:
    """Return the Shannon entropy of *s* in bits per character.

    Args:
        s: The string to measure.

    Returns:
        0.0 for an empty string, otherwise the entropy of the character
        frequency distribution of *s* (always non-negative).
    """
    if not s:
        return 0.0
    length = len(s)
    entropy = 0.0
    # Counter tallies every character in a single pass and iterates in
    # first-seen order, so the floating-point sum is reproducible across runs.
    for count in Counter(s).values():
        p = count / length
        # Subtracting each term (instead of negating the final sum) yields
        # 0.0, not -0.0, for single-symbol strings.
        entropy -= p * math.log2(p)
    return entropy
# Allow-list of long plain-English words. A token that equals one of these
# (ignoring case) is ordinary text and should not be reported as a secret.
_COMMON_WORDS = {
    "responsibility",
    "internationalization",
    "characterization",
    "telecommunications",
    "incomprehensibility",
    "disproportionately",
}


def _is_dictionary_word(token: str) -> bool:
    """Return True when *token*, lower-cased, is in the common-word allow-list."""
    lowered = token.lower()
    return lowered in _COMMON_WORDS
# ---------------------------------------------------------------------------
# Regex patterns
# ---------------------------------------------------------------------------
# E-mail address: local part, "@", then a domain that contains at least one dot.
_EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")
# Phone: optional "+91"/"91" prefix (optionally followed by a hyphen or one
# whitespace character), then ten digits starting with 6-9. The lookarounds
# keep the match from touching another digit on either side.
_PHONE_RE = re.compile(r"(?<!\d)(?:\+?91[-\s]?)?[6-9]\d{9}(?!\d)")
# Aadhaar candidate: 12 digits written as three groups of four, each gap being
# at most one whitespace character; not adjacent to other digits.
_AADHAAR_RE = re.compile(r"(?<!\d)\d{4}\s?\d{4}\s?\d{4}(?!\d)")
# PAN: five uppercase letters, four digits, one uppercase letter.
_PAN_RE = re.compile(r"\b[A-Z]{5}[0-9]{4}[A-Z]\b")
# IFSC: four uppercase letters, a literal "0", then six uppercase alphanumerics.
_IFSC_RE = re.compile(r"\b[A-Z]{4}0[A-Z0-9]{6}\b")
# Card candidate: 13-19 digits, consecutive digits optionally separated by a
# single space or hyphen; not adjacent to other digits.
_CARD_RE = re.compile(r"(?<!\d)\d(?:[ -]?\d){12,18}(?!\d)")
# IPv4 address: four dot-separated octets. Each octet is limited to 0-255
# (leading zeros tolerated) so strings such as "999.300.1.1" are not reported.
_IP_OCTET = r"(?:25[0-5]|2[0-4]\d|[01]?\d?\d)"
_IP_RE = re.compile(r"\b(?:%s\.){3}%s\b" % (_IP_OCTET, _IP_OCTET))
# Date of birth: two-digit day (01-31), two-digit month (01-12) and a four-digit
# year starting with 19 or 20, each separated by "/" or "-". The pattern does
# not check that the day is valid for the given month.
_DOB_RE = re.compile(r"\b(?:0[1-9]|[12]\d|3[01])[/-](?:0[1-9]|1[0-2])[/-](?:19|20)\d{2}\b")
# UPI-style handle "<id>@<provider>": an id of 2-256 word/dot/hyphen characters
# and an alphabetic provider of 3-64 letters. The trailing negative lookahead
# rejects matches that continue into a dotted domain (e.g. "care@paytm.com"),
# because those are e-mail addresses rather than UPI handles.
_UPI_RE = re.compile(r"\b[\w.\-]{2,256}@(?:[a-zA-Z]{3,64})\b(?!\.\w)")
# Secrets
# AWS access key id: "AKIA" or "ASIA" followed by 16 uppercase alphanumerics.
_AWS_KEY_RE = re.compile(r"\b(?:AKIA|ASIA)[A-Z0-9]{16}\b")
# GitHub token: "gh" + one of p/o/u/s/r + "_" + 36-255 alphanumerics.
_GITHUB_TOKEN_RE = re.compile(r"\bgh[pousr]_[A-Za-z0-9]{36,255}\b")
# Slack token: "xox" + one of b/a/p/r/s + "-" + 10-72 alphanumerics or hyphens.
_SLACK_TOKEN_RE = re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,72}\b")
# JWT: three dot-separated segments of letters, digits, "_" and "-"; the first
# two segments must begin with "eyJ".
_JWT_RE = re.compile(r"\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b")
# PEM private-key block, from the BEGIN marker through the first END marker.
# re.DOTALL lets "." span the newlines inside the key body; ".*?" is
# non-greedy so two separate keys in one text are matched separately.
_PRIVATE_KEY_RE = re.compile(
r"-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----.*?-----END [A-Z0-9 ]*PRIVATE KEY-----",
re.DOTALL,
)
# Generic API-token-like assignments: api_key="...", token: '...', secret_key=...
# Case-insensitive. Matches one of the listed keywords, then ":" or "=", an
# optional opening quote, and captures the value as group 1. The value must be
# at least 16 characters drawn from letters, digits, "/", "_", "+" and "-".
_GENERIC_TOKEN_RE = re.compile(
r"(?i)\b(?:api[_-]?key|api[_-]?secret|access[_-]?token|secret[_-]?key|"
r"auth[_-]?token|password|token)\s*[:=]\s*[\"']?([A-Za-z0-9/_+\-]{16,})[\"']?"
)
# Provider suffixes (the part after "@") that identify an Indian UPI handle.
# This list is deliberately narrower than the generic "id@provider" pattern.
_UPI_SUFFIXES = (
    "okhdfcbank",
    "oksbi",
    "okicici",
    "okaxis",
    "ybl",
    "paytm",
    "ibl",
    "axl",
    "apl",
)
def _find_aadhaar(text):
    """Return AADHAAR spans for 12-digit candidates that pass the Verhoeff check.

    Args:
        text: The text to scan.

    Returns:
        A list of span dicts with keys "start", "end", "type", "value",
        "source" and "confidence". "value" is the text exactly as matched,
        including any separators.
    """
    spans = []
    for m in _AADHAAR_RE.finditer(text):
        candidate = m.group(0)
        # The pattern accepts any whitespace character (\s) between the digit
        # groups, so remove all whitespace, not only spaces. Otherwise a
        # tab- or newline-separated number keeps its separators, fails the
        # length check below and is silently missed.
        digits = "".join(candidate.split())
        if len(digits) == 12 and verhoeff_validate(digits):
            spans.append({
                "start": m.start(), "end": m.end(), "type": "AADHAAR",
                "value": candidate, "source": "regex", "confidence": 0.99,
            })
    return spans
def _find_card(text):
    """Return CARD spans for digit runs that pass the Luhn check.

    Candidates come from the card pattern; spaces and hyphens are removed
    before the length (13-19 digits) and Luhn checks. "value" in each span is
    the text exactly as matched, separators included.
    """
    hits = []
    for match in _CARD_RE.finditer(text):
        raw = match.group(0)
        digits = raw.replace(" ", "").replace("-", "")
        if not 13 <= len(digits) <= 19:
            continue
        if not luhn_validate(digits):
            continue
        hits.append({
            "start": match.start(),
            "end": match.end(),
            "type": "CARD",
            "value": raw,
            "source": "regex",
            "confidence": 0.99,
        })
    return hits
def _find_upi(text):
    """Return UPI spans for handles whose provider suffix is on the known list.

    The provider is the part after the last "@", compared case-insensitively
    against the UPI suffix list. Handles with any other suffix are ignored.
    """
    results = []
    for match in _UPI_RE.finditer(text):
        handle = match.group(0)
        provider = handle.rpartition("@")[2]
        if provider.lower() not in _UPI_SUFFIXES:
            continue
        results.append({
            "start": match.start(),
            "end": match.end(),
            "type": "UPI",
            "value": handle,
            "source": "regex",
            "confidence": 0.97,
        })
    return results
def _find_entropy_secrets(text, threshold=4.0, min_len=20):
    """Return SECRET spans for long, high-entropy tokens.

    A token is a run of at least *min_len* characters from letters, digits,
    "+", "/", "=", "_" and "-". It is reported when its Shannon entropy is
    strictly above *threshold*, it is not a known dictionary word, and it is
    not made up of digits only. Confidence starts at 0.5, grows by 0.15 per
    bit of entropy above the threshold, and is capped at 0.95.
    """
    token_pattern = r"[A-Za-z0-9+/=_\-]{%d,}" % min_len
    found = []
    for match in re.finditer(token_pattern, text):
        token = match.group(0)
        entropy = shannon_entropy(token)
        if not entropy > threshold:
            continue
        if _is_dictionary_word(token):
            continue
        if token.isdigit():
            continue
        score = min(0.5 + (entropy - threshold) * 0.15, 0.95)
        found.append({
            "start": match.start(),
            "end": match.end(),
            "type": "SECRET",
            "value": token,
            "source": "entropy",
            "confidence": round(score, 2),
        })
    return found
def _find_generic_tokens(text):
    """Return SECRET spans for values assigned to credential-like keywords.

    Args:
        text: The text to scan.

    Returns:
        A list of span dicts. Each span covers only the captured value
        (group 1 of the generic-token pattern), not the keyword or quotes.
    """
    # No extra length filter is needed: the pattern's capture group already
    # requires at least 16 characters, so a "shorter than 8" check could
    # never trigger.
    return [
        {
            "start": m.start(1), "end": m.end(1), "type": "SECRET",
            "value": m.group(1), "source": "regex", "confidence": 0.9,
        }
        for m in _GENERIC_TOKEN_RE.finditer(text)
    ]
def _simple_finditer(pattern, type_name, text, confidence=0.99, source="regex"):
    """Turn every match of *pattern* in *text* into a span dict.

    Each span records the match boundaries, the matched text as "value", and
    the supplied *type_name*, *source* and *confidence* unchanged.
    """
    spans = []
    for match in pattern.finditer(text):
        begin, finish = match.span()
        spans.append({
            "start": begin,
            "end": finish,
            "type": type_name,
            "value": match.group(0),
            "source": source,
            "confidence": confidence,
        })
    return spans
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def detect_pii(text: str) -> list[dict]:
    """Detect structured + context-free PII (Layer 1, PII categories).

    Runs every PII detector over the full *text* and returns their spans
    concatenated in detector order: EMAIL, AADHAAR, PAN, IFSC, CARD, IP, DOB,
    UPI, PHONE.
    """
    groups = [
        _simple_finditer(_EMAIL_RE, "EMAIL", text),
        _find_aadhaar(text),
        _simple_finditer(_PAN_RE, "PAN", text),
        _simple_finditer(_IFSC_RE, "IFSC", text),
        _find_card(text),
        _simple_finditer(_IP_RE, "IP", text, confidence=0.95),
        _simple_finditer(_DOB_RE, "DOB", text, confidence=0.9),
        _find_upi(text),
        # Phone spans are emitted last, after Aadhaar/card spans.
        # NOTE(review): ordering alone does not suppress overlaps, because
        # every detector scans the whole text independently. A phone span can
        # still overlap an Aadhaar or card span; presumably a later stage
        # resolves that - confirm against the caller.
        _simple_finditer(_PHONE_RE, "PHONE", text, confidence=0.95),
    ]
    return [span for group in groups for span in group]
def detect_secrets(text: str) -> list[dict]:
    """Detect credentials / secrets (Layer 1, secret categories).

    Returns the spans of all secret detectors concatenated in this order:
    AWS_KEY, GITHUB_TOKEN, SLACK_TOKEN, JWT, PRIVATE_KEY, generic keyword
    assignments, then entropy-based tokens.
    """
    groups = [
        _simple_finditer(_AWS_KEY_RE, "AWS_KEY", text),
        _simple_finditer(_GITHUB_TOKEN_RE, "GITHUB_TOKEN", text),
        _simple_finditer(_SLACK_TOKEN_RE, "SLACK_TOKEN", text),
        _simple_finditer(_JWT_RE, "JWT", text),
        _simple_finditer(_PRIVATE_KEY_RE, "PRIVATE_KEY", text, confidence=0.99),
        _find_generic_tokens(text),
        _find_entropy_secrets(text),
    ]
    return [span for group in groups for span in group]
def detect_all(text: str, pii: bool = True, secrets: bool = True) -> list[dict]:
    """Run the enabled detector groups and return their raw spans.

    PII spans (when *pii* is true) come first, followed by secret spans (when
    *secrets* is true). Spans are not de-duplicated and may overlap.
    """
    collected: list[dict] = []
    if pii:
        collected.extend(detect_pii(text))
    if secrets:
        collected.extend(detect_secrets(text))
    return collected


# Alias matching the integration doc's naming (`detectors.detect(text)`).
detect = detect_all