Spaces:

build-small-hackathon
/

PrivacyShield

Sleeping

App Files Files Community

PrivacyShield / detectors.py

perceptron01

Upload 6 files

e431b8d verified 16 days ago

Raw

History Blame Contribute Delete

8.36 kB

	"""Layer 1 — deterministic detectors: regex + validators + entropy.

	Each detector returns spans as dicts:
	{"start": int, "end": int, "type": str, "value": str,
	"source": "regex", "confidence": 0.99}

	High precision is the goal: numeric PII (Aadhaar, card) must pass real
	checksums (Verhoeff / Luhn) so random 12/16-digit numbers are NOT flagged.
	"""
	import math
	import re
	import string

	# ---------------------------------------------------------------------------
	# Verhoeff checksum (Aadhaar) — identical tables to Appendix A
	# ---------------------------------------------------------------------------
	# Verhoeff multiplication table, indexed as _D[running checksum][permuted digit].
	_D = [
	    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
	    [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
	    [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
	    [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
	    [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
	    [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
	    [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
	    [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
	    [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
	    [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
	]
	# Verhoeff permutation table, indexed as _P[position from the right % 8][digit].
	_P = [
	    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
	    [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
	    [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
	    [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
	    [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
	    [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
	    [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
	    [7, 0, 4, 6, 9, 1, 3, 2, 5, 8],
	]
	# Verhoeff inverse table; validation below does not reference it.
	_INV = [0, 4, 3, 2, 1, 5, 6, 7, 8, 9]


	def verhoeff_validate(number: str) -> bool:
	    """Validate a Verhoeff checksum (last digit is the check digit).

	    Args:
	        number: Digit string that includes the trailing check digit.

	    Returns:
	        True when folding every digit (right to left) through the
	        permutation and multiplication tables ends at 0. Empty strings and
	        strings containing anything other than decimal digits return False.
	    """
	    # An empty string would otherwise "pass" because the loop never runs and
	    # the checksum stays 0; non-digits would raise ValueError in int().
	    if not number.isdecimal():
	        return False
	    checksum = 0
	    for position, digit in enumerate(reversed(number)):
	        checksum = _D[checksum][_P[position % 8][int(digit)]]
	    return checksum == 0


	# ---------------------------------------------------------------------------
	# Luhn checksum (cards)
	# ---------------------------------------------------------------------------
	def luhn_validate(card: str) -> bool:
	    """Return True when *card* is a digit string that passes the Luhn check.

	    Args:
	        card: Candidate number with separators already removed.

	    Returns:
	        True when the Luhn sum is a multiple of 10. Empty strings and strings
	        containing anything other than decimal digits return False.
	    """
	    # Without this guard "" sums to 0 and passes, and a stray separator
	    # raises ValueError from int().
	    if not card.isdecimal():
	        return False
	    total = 0
	    for index, char in enumerate(reversed(card)):
	        digit = int(char)
	        # Every second digit counted from the right is doubled; a two-digit
	        # product is reduced by 9 (same as adding its digits).
	        if index % 2 == 1:
	            digit *= 2
	            if digit > 9:
	                digit -= 9
	        total += digit
	    return total % 10 == 0


	# ---------------------------------------------------------------------------
	# Shannon entropy (secrets)
	# ---------------------------------------------------------------------------
	def shannon_entropy(s: str) -> float:
	    """Return the Shannon entropy of *s* in bits per character.

	    An empty string yields 0.0.
	    """
	    length = len(s)
	    if length == 0:
	        return 0.0
	    # Tally each distinct character in a single pass over the string.
	    occurrences = {}
	    for char in s:
	        occurrences[char] = occurrences.get(char, 0) + 1
	    return -sum(
	        (count / length) * math.log2(count / length)
	        for count in occurrences.values()
	    )


	# A small dictionary-word check so plain English words of length >= 20
	# (rare, but possible in compound text) don't get flagged as secrets.
	_COMMON_WORDS = {
	"responsibility", "internationalization", "characterization",
	"telecommunications", "incomprehensibility", "disproportionately",
	}


	def _is_dictionary_word(token: str) -> bool:
	return token.lower() in _COMMON_WORDS


	# ---------------------------------------------------------------------------
	# Regex patterns
	# ---------------------------------------------------------------------------
	_EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")
	_PHONE_RE = re.compile(r"(?<!\d)(?:\+?91[-\s]?)?[6-9]\d{9}(?!\d)")
	_AADHAAR_RE = re.compile(r"(?<!\d)\d{4}\s?\d{4}\s?\d{4}(?!\d)")
	_PAN_RE = re.compile(r"\b[A-Z]{5}[0-9]{4}[A-Z]\b")
	_IFSC_RE = re.compile(r"\b[A-Z]{4}0[A-Z0-9]{6}\b")
	_CARD_RE = re.compile(r"(?<!\d)\d(?:[ -]?\d){12,18}(?!\d)")
	_IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
	_DOB_RE = re.compile(r"\b(?:0[1-9]\|[12]\d\|3[01])[/-](?:0[1-9]\|1[0-2])[/-](?:19\|20)\d{2}\b")
	_UPI_RE = re.compile(r"\b[\w.\-]{2,256}@(?:[a-zA-Z]{3,64})\b")

	# Secrets
	_AWS_KEY_RE = re.compile(r"\b(?:AKIA\|ASIA)[A-Z0-9]{16}\b")
	_GITHUB_TOKEN_RE = re.compile(r"\bgh[pousr]_[A-Za-z0-9]{36,255}\b")
	_SLACK_TOKEN_RE = re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,72}\b")
	_JWT_RE = re.compile(r"\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b")
	_PRIVATE_KEY_RE = re.compile(
	r"-----BEGIN [A-Z0-9 ]PRIVATE KEY-----.?-----END [A-Z0-9 ]*PRIVATE KEY-----",
	re.DOTALL,
	)
	# Generic API-token-like assignments: api_key="...", token: '...', secret=...
	_GENERIC_TOKEN_RE = re.compile(
	r"(?i)\b(?:api[_-]?key\|api[_-]?secret\|access[_-]?token\|secret[_-]?key\|"
	r"auth[_-]?token\|password\|token)\s[:=]\s[\"']?([A-Za-z0-9/_+\-]{16,})[\"']?"
	)

	# Indian UPI handles (specific bank suffixes, narrower than generic email-ish)
	_UPI_SUFFIXES = (
	"okhdfcbank", "oksbi", "okicici", "okaxis", "ybl", "paytm", "ibl", "axl", "apl",
	)


	def _find_aadhaar(text):
	    """Return AADHAAR spans for 12-digit candidates that pass Verhoeff.

	    Args:
	        text: Text to scan.

	    Returns:
	        A list of span dicts (start, end, type, value, source, confidence),
	        one per regex match whose digits validate. "value" is the matched
	        text including any separators.
	    """
	    spans = []
	    for m in _AADHAAR_RE.finditer(text):
	        # The pattern accepts any whitespace character between the groups
	        # (tab, newline, ...), so strip all whitespace rather than only
	        # " "; otherwise such matches fail the length check and are missed.
	        digits = re.sub(r"\s", "", m.group(0))
	        if len(digits) == 12 and verhoeff_validate(digits):
	            spans.append({
	                "start": m.start(), "end": m.end(), "type": "AADHAAR",
	                "value": m.group(0), "source": "regex", "confidence": 0.99,
	            })
	    return spans


	def _find_card(text):
	    """Return CARD spans for digit runs in *text* that pass the Luhn check.

	    Spaces and hyphens inside a match are removed before validation; the
	    reported "value" is the matched text with its separators intact.
	    """
	    accepted_lengths = (13, 14, 15, 16, 17, 18, 19)
	    found = []
	    for match in _CARD_RE.finditer(text):
	        raw = match.group(0)
	        digits = re.sub(r"[ -]", "", raw)
	        if len(digits) not in accepted_lengths:
	            continue
	        if not luhn_validate(digits):
	            continue
	        found.append({
	            "start": match.start(),
	            "end": match.end(),
	            "type": "CARD",
	            "value": raw,
	            "source": "regex",
	            "confidence": 0.99,
	        })
	    return found


	def _find_upi(text):
	    """Return UPI spans for handles in *text* whose suffix is allow-listed.

	    The part after the last "@" is lower-cased and must be one of
	    ``_UPI_SUFFIXES``; other matches of the handle pattern are ignored.
	    """
	    found = []
	    for match in _UPI_RE.finditer(text):
	        handle = match.group(0)
	        provider = handle.rsplit("@", 1)[-1].lower()
	        if provider not in _UPI_SUFFIXES:
	            continue
	        found.append({
	            "start": match.start(),
	            "end": match.end(),
	            "type": "UPI",
	            "value": handle,
	            "source": "regex",
	            "confidence": 0.97,
	        })
	    return found


	def _find_entropy_secrets(text, threshold=4.0, min_len=20):
	spans = []
	for m in re.finditer(r"[A-Za-z0-9+/=_\-]{%d,}" % min_len, text):
	token = m.group(0)
	ent = shannon_entropy(token)
	if ent > threshold and not _is_dictionary_word(token) and not token.isdigit():
	spans.append({
	"start": m.start(), "end": m.end(), "type": "SECRET",
	"value": token, "source": "entropy",
	"confidence": round(min(0.5 + (ent - threshold) * 0.15, 0.95), 2),
	})
	return spans


	def _find_generic_tokens(text):
	    """Return SECRET spans for values assigned to key/token/password names.

	    Only the captured value (group 1 of ``_GENERIC_TOKEN_RE``) is reported;
	    start/end point at the value, not at the whole assignment.
	    """
	    found = []
	    for match in _GENERIC_TOKEN_RE.finditer(text):
	        secret = match.group(1)
	        # Length floor kept as a safeguard independent of the pattern.
	        if len(secret) >= 8:
	            value_start, value_end = match.span(1)
	            found.append({
	                "start": value_start,
	                "end": value_end,
	                "type": "SECRET",
	                "value": secret,
	                "source": "regex",
	                "confidence": 0.9,
	            })
	    return found


	def _simple_finditer(pattern, type_name, text, confidence=0.99, source="regex"):
	return [
	{
	"start": m.start(), "end": m.end(), "type": type_name,
	"value": m.group(0), "source": source, "confidence": confidence,
	}
	for m in pattern.finditer(text)
	]


	# ---------------------------------------------------------------------------
	# Public API
	# ---------------------------------------------------------------------------
	def detect_pii(text: str) -> list[dict]:
	    """Run every PII detector over *text* and return the combined spans.

	    Results are concatenated in detector order: EMAIL, AADHAAR, PAN, IFSC,
	    CARD, IP, DOB, UPI, PHONE.
	    """
	    groups = (
	        _simple_finditer(_EMAIL_RE, "EMAIL", text),
	        _find_aadhaar(text),
	        _simple_finditer(_PAN_RE, "PAN", text),
	        _simple_finditer(_IFSC_RE, "IFSC", text),
	        _find_card(text),
	        _simple_finditer(_IP_RE, "IP", text, confidence=0.95),
	        _simple_finditer(_DOB_RE, "DOB", text, confidence=0.9),
	        _find_upi(text),
	        # Phone numbers are scanned last. Nothing is removed here, so a
	        # PHONE span can still overlap an AADHAAR/CARD span over the same
	        # digits.
	        _simple_finditer(_PHONE_RE, "PHONE", text, confidence=0.95),
	    )
	    return [span for group in groups for span in group]


	def detect_secrets(text: str) -> list[dict]:
	    """Run every credential/secret detector over *text*.

	    Results are concatenated in detector order: AWS key, GitHub token, Slack
	    token, JWT, private key, generic token assignments, entropy-based tokens.
	    """
	    groups = (
	        _simple_finditer(_AWS_KEY_RE, "AWS_KEY", text),
	        _simple_finditer(_GITHUB_TOKEN_RE, "GITHUB_TOKEN", text),
	        _simple_finditer(_SLACK_TOKEN_RE, "SLACK_TOKEN", text),
	        _simple_finditer(_JWT_RE, "JWT", text),
	        _simple_finditer(_PRIVATE_KEY_RE, "PRIVATE_KEY", text, confidence=0.99),
	        _find_generic_tokens(text),
	        _find_entropy_secrets(text),
	    )
	    return [span for group in groups for span in group]


	def detect_all(text: str, pii: bool = True, secrets: bool = True) -> list[dict]:
	    """Run the enabled detector groups over *text*.

	    Args:
	        text: Text to scan.
	        pii: Include the PII detectors when true.
	        secrets: Include the secret detectors when true.

	    Returns:
	        Raw spans, PII first and then secrets; spans may overlap.
	    """
	    found = []
	    if pii:
	        found.extend(detect_pii(text))
	    if secrets:
	        found.extend(detect_secrets(text))
	    return found


	# Alias so callers can use the integration doc's name: `detectors.detect(text)`.
	detect = detect_all