Spaces:
Running
Running
| """ | |
| backend/app/security/sanitizer.py | |
| Input sanitisation and lightweight PII redaction for user queries. | |
| Issue 4 resolution: Presidio was replaced with six compiled regex patterns. | |
| WHY Presidio was removed | |
| ───────────────────────── | |
| Presidio uses spaCy-based NLP internally: named entity recognition, pattern | |
| matching, and context analysis. This added 50-100ms to every request before | |
| any business logic ran. For a personal portfolio chatbot, the realistic PII | |
| risk is near zero — no legitimate user submits their credit card number or SSN | |
| to a developer's portfolio assistant. The threat model does not justify the | |
| latency cost or the large spaCy model in the Docker image. | |
| Six regex patterns cover every plausible PII type for this use case and run | |
| in microseconds, not milliseconds. If Presidio is ever reconsidered, the | |
| latency cost must be measured and documented before reintroduction. | |
| DO NOT reintroduce Presidio or spaCy without explicit justification. | |
| """ | |
| import re | |
# LLM chat-template delimiters that attackers embed in queries to escape the
# system prompt or inject new instructions. Strip them before any further
# processing. Matching is case-insensitive and tolerates padding whitespace
# inside each delimiter.
_RE_INJECT_TOKENS = re.compile(
    # <|system|>, <|user|>, <|assistant|>, <|im_start|>, <|im_end|>
    r"(<\|\s*(system|user|assistant|im_start|im_end)\s*\|>"
    # <<SYS>> and its closing form <</SYS>>
    r"|<<\s*/?\s*sys\s*>>"
    # [INST] / [/INST]
    r"|\[/?\s*inst\s*\]"
    # [system] / [/system]
    r"|\[/?\s*system\s*\]"
    # --- system ---
    r"|---\s*system\s*---"
    # "### Instruction" header introduced either by a real newline character
    # or by the two-character sequence backslash + "n" typed literally.
    r"|(?:\\n|\n)###\s*instruction)",
    re.IGNORECASE,
)
# Six compiled patterns covering plausible PII in portfolio chatbot input.
# Every match is replaced wholesale, so no capture groups are needed.
#
# ORDER MATTERS: redact_pii applies the patterns one after another to the same
# string. Longer digit-run patterns must run before shorter ones, otherwise a
# short pattern redacts a fragment of a longer number and the remaining digits
# leak (e.g. the 6-digit middle group of a 4-6-5 card number looks exactly
# like a sort code).
#
# Digit-run patterns are written as "digit, then (separator? digit) repeated"
# so that a match always ends on a digit and never swallows the whitespace
# that follows the number.
_PII_PATTERNS: tuple[re.Pattern, ...] = (
    # Email address
    re.compile(r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b"),
    # IPv4 address (before the digit-run patterns so that octets are not
    # picked up as phone or sort-code digits)
    re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
    # Credit card: 13–19 digits with an optional space/dash between digits
    re.compile(r"\b\d(?:[ \-]?\d){12,18}\b"),
    # UK phone: 07xxx xxxxxx, +44 7xxx xxxxxx, 01xxx xxxxxx, etc.
    # "+" is not a word character, so a leading \b would reject "+44" after a
    # space or at the start of the string; only the "0" prefix is anchored.
    re.compile(r"(?:\+44\s?|\b0)\d(?:\s?\d){8,9}\b"),
    # UK National Insurance number: two letters, six digits, one letter (A–D)
    re.compile(r"\b[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]\b", re.IGNORECASE),
    # UK sort code: xx-xx-xx or xxxxxx (6 digits) — shortest pattern, so last
    re.compile(r"\b\d{2}[-\s]?\d{2}[-\s]?\d{2}\b"),
)
def sanitize_input(text: str, max_length: int = 500) -> str:
    """
    Clean a raw user query before it reaches the classifier or the LLM.

    Steps, in order:
      1. Strip null bytes and non-printable control characters
         (tab, newline and carriage return are kept).
      2. Remove LLM token-injection delimiters (<|system|>, <<SYS>>, [INST], etc.).
      3. Collapse 3+ consecutive whitespace characters to a single space.
      4. Truncate to ``max_length`` characters.
      5. Re-scan the truncated text for delimiters until none remain.

    Step 5 exists because deleting a delimiter can splice its neighbours into
    a new one: "<|sys<|system|>tem|>" becomes "<|system|>" after one pass.
    The re-scan runs on the already-truncated text, so its cost is bounded by
    ``max_length`` no matter how large the raw input is.

    Args:
        text: Raw user input. Falsy values (empty string, None) yield "".
        max_length: Maximum length of the returned string; defaults to 500.
            Negative values are treated as 0.

    Returns:
        The sanitised string, at most ``max_length`` characters long.
    """
    if not text:
        return ""

    # Remove control characters except \t (0x09), \n (0x0a) and \r (0x0d).
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)

    # Strip injection delimiters before they reach the classifier or LLM.
    text = _RE_INJECT_TOKENS.sub('', text)
    text = re.sub(r'\s{3,}', ' ', text)
    text = text[:max(0, max_length)]

    # Repeat the strip on the bounded result until it is stable. Each pass
    # that removes something shortens the text, so the loop terminates.
    removed = 1
    while removed:
        text, removed = _RE_INJECT_TOKENS.subn('', text)
        if removed:
            # Removing a delimiter may leave a fresh run of whitespace behind.
            text = re.sub(r'\s{3,}', ' ', text)

    return text
def redact_pii(text: str) -> str:
    """
    Replace every PII match in ``text`` with the literal marker "[REDACTED]".

    Each compiled pattern in ``_PII_PATTERNS`` is applied in tuple order to
    the output of the previous one. The patterns cover: email address, IPv4
    address, UK phone number, UK National Insurance number, UK sort code, and
    credit card number. Only compiled regexes are used here: no NLP model,
    no spaCy, no network calls.

    PERSON entities are intentionally not redacted: visitors are expected to
    name Darshan Chheda in their queries. Redacting that breaks retrieval.

    Falsy input (empty string, None) is handed back unchanged.
    """
    if text:
        redacted = text
        for rule in _PII_PATTERNS:
            redacted = rule.sub("[REDACTED]", redacted)
        return redacted
    return text