""" backend/app/security/sanitizer.py Input sanitisation and lightweight PII redaction for user queries. Issue 4 resolution: Presidio was replaced with six compiled regex patterns. WHY Presidio was removed ───────────────────────── Presidio uses spaCy-based NLP internally: named entity recognition, pattern matching, and context analysis. This added 50-100ms to every request before any business logic ran. For a personal portfolio chatbot, the realistic PII risk is near zero — no legitimate user submits their credit card number or SSN to a developer's portfolio assistant. The threat model does not justify the latency cost or the large spaCy model in the Docker image. Six regex patterns cover every plausible PII type for this use case and run in microseconds, not milliseconds. If Presidio is ever reconsidered, the latency cost must be measured and documented before reintroduction. DO NOT reintroduce Presidio or spaCy without explicit justification. """ import re # LLM token delimiters that attackers embed in queries to escape the system prompt # or inject new instructions. Strip them before any further processing. _RE_INJECT_TOKENS = re.compile( r"(<\|\s*(system|user|assistant|im_start|im_end)\s*\|>" r"|<<\s*sys\s*>>" r"|\[/?\s*inst\s*\]" r"|\[/?\s*system\s*\]" r"|---\s*system\s*---" r"|\\n###\s*instruction)", re.IGNORECASE, ) # Six compiled patterns covering plausible PII in portfolio chatbot input. # Named capturing groups make the replacements self-documenting. # Patterns are ordered cheapest-first (no backtracking before complex ones). _PII_PATTERNS: tuple[re.Pattern, ...] = ( # Email address re.compile(r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b"), # IPv4 address (before phone to avoid 4-octet false positives in phone patterns) re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"), # UK phone: 07xxx xxxxxx, +44 7xxx xxxxxx, 01xxx xxxxxx, etc. re.compile(r"\b(?:\+44\s?|0)(?:\d\s?){9,10}\b"), # UK National Insurance number: two letters, six digits, one letter (A–D) re.compile(r"\b[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]\b", re.IGNORECASE), # UK sort code: xx-xx-xx or xxxxxx (6 digits) re.compile(r"\b\d{2}[-\s]?\d{2}[-\s]?\d{2}\b"), # Credit card: 13–19 digit sequences with optional spaces/dashes re.compile(r"\b(?:\d[ \-]?){13,19}\b"), ) def sanitize_input(text: str) -> str: """ 1. Strip null bytes and non-printable control characters (keep \\n, \\t). 2. Remove LLM token-injection delimiters (<|system|>, <>, [INST], etc.). 3. Collapse 3+ consecutive whitespace to a single space. 4. Truncate to 500 chars. """ if not text: return "" # Remove control characters except \n and \t text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text) # Strip injection delimiters before they reach the classifier or LLM text = _RE_INJECT_TOKENS.sub('', text) text = re.sub(r'\s{3,}', ' ', text) text = text[:500] return text def redact_pii(text: str) -> str: """ Detect and redact PII using six lightweight compiled regex patterns. Patterns cover: email address, IPv4 address, UK phone number, UK National Insurance number, UK sort code, and credit card number. Runs in microseconds per query — no NLP model, no spaCy, no network calls. PERSON entities are intentionally not redacted: visitors are expected to name Darshan Chheda in their queries. Redacting that breaks retrieval. """ if not text: return text for pattern in _PII_PATTERNS: text = pattern.sub("[REDACTED]", text) return text