# personabot-api — app/security/sanitizer.py
# Deployed via GitHub Actions (deploy 85f07db, commit 3d134a6)
"""
backend/app/security/sanitizer.py
Input sanitisation and lightweight PII redaction for user queries.
Issue 4 resolution: Presidio was replaced with six compiled regex patterns.
WHY Presidio was removed
─────────────────────────
Presidio uses spaCy-based NLP internally: named entity recognition, pattern
matching, and context analysis. This added 50-100ms to every request before
any business logic ran. For a personal portfolio chatbot, the realistic PII
risk is near zero β€” no legitimate user submits their credit card number or SSN
to a developer's portfolio assistant. The threat model does not justify the
latency cost or the large spaCy model in the Docker image.
Six regex patterns cover every plausible PII type for this use case and run
in microseconds, not milliseconds. If Presidio is ever reconsidered, the
latency cost must be measured and documented before reintroduction.
DO NOT reintroduce Presidio or spaCy without explicit justification.
"""
import re
# LLM token delimiters that attackers embed in queries to escape the system prompt
# or inject new instructions. Strip them before any further processing.
# Covers ChatML-style tags (<|system|>, <|im_start|>), Llama-style markers
# ([INST], <<SYS>>), and ad-hoc "--- system ---" / "### instruction" framing,
# in any letter case.
_RE_INJECT_TOKENS = re.compile(
    r"(<\|\s*(system|user|assistant|im_start|im_end)\s*\|>"
    r"|<<\s*sys\s*>>"
    r"|\[/?\s*inst\s*\]"
    r"|\[/?\s*system\s*\]"
    r"|---\s*system\s*---"
    # Match both an actual newline and the literal two-character text "\n"
    # before "### instruction". The original r"\\n###" matched only the
    # literal backslash-n, so a query containing a real newline followed by
    # "### instruction" slipped through untouched.
    r"|(?:\\n|\n)###\s*instruction)",
    re.IGNORECASE,
)
# Six compiled patterns covering plausible PII in portfolio chatbot input.
# Each pattern carries its own comment so replacements stay self-documenting.
# Patterns are ordered cheapest-first (no backtracking before complex ones).
_PII_PATTERNS: tuple[re.Pattern, ...] = (
    # Email address
    re.compile(r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b"),
    # IPv4 address (before phone to avoid 4-octet false positives in phone patterns)
    re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
    # UK phone: 07xxx xxxxxx, +44 7xxx xxxxxx, 01xxx xxxxxx, etc.
    # NOTE: \b can never match immediately before "+" (both neighbours are
    # non-word chars), so the original r"\b(?:\+44...)" silently disabled the
    # +44 branch. (?<!\w) gives the same left anchoring for the "0" branch
    # while allowing "+44" at the start of a token.
    re.compile(r"(?<!\w)(?:\+44\s?|0)(?:\d\s?){9,10}\b"),
    # UK National Insurance number: two letters, six digits, one letter (A–D)
    re.compile(r"\b[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]\b", re.IGNORECASE),
    # UK sort code: xx-xx-xx or xxxxxx (6 digits)
    re.compile(r"\b\d{2}[-\s]?\d{2}[-\s]?\d{2}\b"),
    # Credit card: 13–19 digit sequences with optional spaces/dashes
    re.compile(r"\b(?:\d[ \-]?){13,19}\b"),
)
def sanitize_input(text: str) -> str:
    """
    Sanitise raw user input before it reaches the classifier or LLM.

    1. Strip null bytes and non-printable control characters (keep \\n, \\t).
    2. Remove LLM token-injection delimiters (<|system|>, <<SYS>>, [INST], etc.).
    3. Collapse 3+ consecutive whitespace characters to a single space.
    4. Truncate to 500 chars.

    Returns the cleaned string; any falsy input yields "".
    """
    if not text:
        return ""
    # Remove control characters except \n (\x0a) and \t (\x09). The original
    # class skipped \r (\x0d), contradicting the "keep \n, \t" contract above;
    # \x0b-\x1f covers \x0b, \x0c, \r and \x0e-\x1f in one range.
    text = re.sub(r'[\x00-\x08\x0b-\x1f\x7f]', '', text)
    # Strip injection delimiters before they reach the classifier or LLM
    text = _RE_INJECT_TOKENS.sub('', text)
    # Collapse only runs of 3+ whitespace so normal spacing and single
    # newlines survive intact.
    text = re.sub(r'\s{3,}', ' ', text)
    # Hard length cap keeps downstream prompts bounded.
    return text[:500]
def redact_pii(text: str) -> str:
    """
    Replace any PII matched by the six compiled regexes with "[REDACTED]".

    Covered types: email address, IPv4 address, UK phone number, UK National
    Insurance number, UK sort code, and credit card number. Pure regex work —
    no NLP model, no spaCy, no network calls — so a query is processed in
    microseconds.

    PERSON entities are deliberately left intact: visitors are expected to
    name Darshan Chheda in their queries, and redacting that breaks retrieval.
    """
    if not text:
        return text
    redacted = text
    for matcher in _PII_PATTERNS:
        redacted = matcher.sub("[REDACTED]", redacted)
    return redacted