# personabot-api — app/security/sanitizer.py
# Deployed via GitHub Actions (deploy 85f07db, commit 3d134a6)
"""
backend/app/security/sanitizer.py
Input sanitisation and lightweight PII redaction for user queries.
Issue 4 resolution: Presidio was replaced with six compiled regex patterns.
WHY Presidio was removed
─────────────────────────
Presidio uses spaCy-based NLP internally: named entity recognition, pattern
matching, and context analysis. This added 50-100ms to every request before
any business logic ran. For a personal portfolio chatbot, the realistic PII
risk is near zero β€” no legitimate user submits their credit card number or SSN
to a developer's portfolio assistant. The threat model does not justify the
latency cost or the large spaCy model in the Docker image.
Six regex patterns cover every plausible PII type for this use case and run
in microseconds, not milliseconds. If Presidio is ever reconsidered, the
latency cost must be measured and documented before reintroduction.
DO NOT reintroduce Presidio or spaCy without explicit justification.
"""
import re
# LLM token delimiters that attackers embed in queries to escape the system prompt
# or inject new instructions. Strip them before any further processing.
# Covers ChatML-style tags (<|system|>, <|im_start|>), Llama-style markers
# ([INST], <<SYS>>), and ad-hoc "--- system ---" / "### instruction" framing,
# in any letter case.
_RE_INJECT_TOKENS = re.compile(
    r"(<\|\s*(system|user|assistant|im_start|im_end)\s*\|>"
    r"|<<\s*sys\s*>>"
    r"|\[/?\s*inst\s*\]"
    r"|\[/?\s*system\s*\]"
    r"|---\s*system\s*---"
    # Match both an actual newline and the literal two-character text "\n"
    # before "### instruction". The original r"\\n###" matched only the
    # literal backslash-n, so a query containing a real newline followed by
    # "### instruction" slipped through untouched.
    r"|(?:\\n|\n)###\s*instruction)",
    re.IGNORECASE,
)
# Six compiled patterns covering plausible PII in portfolio chatbot input.
# Each pattern carries its own comment so replacements stay self-documenting.
# Patterns are ordered cheapest-first (no backtracking before complex ones).
_PII_PATTERNS: tuple[re.Pattern, ...] = (
    # Email address
    re.compile(r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b"),
    # IPv4 address (before phone to avoid 4-octet false positives in phone patterns)
    re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
    # UK phone: 07xxx xxxxxx, +44 7xxx xxxxxx, 01xxx xxxxxx, etc.
    # NOTE: \b can never match immediately before "+" (both neighbours are
    # non-word chars), so the original r"\b(?:\+44...)" silently disabled the
    # +44 branch. (?<!\w) gives the same left anchoring for the "0" branch
    # while allowing "+44" at the start of a token.
    re.compile(r"(?<!\w)(?:\+44\s?|0)(?:\d\s?){9,10}\b"),
    # UK National Insurance number: two letters, six digits, one letter (A–D)
    re.compile(r"\b[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]\b", re.IGNORECASE),
    # UK sort code: xx-xx-xx or xxxxxx (6 digits)
    re.compile(r"\b\d{2}[-\s]?\d{2}[-\s]?\d{2}\b"),
    # Credit card: 13–19 digit sequences with optional spaces/dashes
    re.compile(r"\b(?:\d[ \-]?){13,19}\b"),
)
def sanitize_input(text: str) -> str:
    """
    Sanitise raw user input before it reaches the classifier or LLM.

    1. Strip null bytes and non-printable control characters (keep \\n, \\t).
    2. Remove LLM token-injection delimiters (<|system|>, <<SYS>>, [INST], etc.).
    3. Collapse 3+ consecutive whitespace characters to a single space.
    4. Truncate to 500 chars.

    Returns the cleaned string; any falsy input yields "".
    """
    if not text:
        return ""
    # Remove control characters except \n (\x0a) and \t (\x09). The original
    # class skipped \r (\x0d), contradicting the "keep \n, \t" contract above;
    # \x0b-\x1f covers \x0b, \x0c, \r and \x0e-\x1f in one range.
    text = re.sub(r'[\x00-\x08\x0b-\x1f\x7f]', '', text)
    # Strip injection delimiters before they reach the classifier or LLM
    text = _RE_INJECT_TOKENS.sub('', text)
    # Collapse only runs of 3+ whitespace so normal spacing and single
    # newlines survive intact.
    text = re.sub(r'\s{3,}', ' ', text)
    # Hard length cap keeps downstream prompts bounded.
    return text[:500]
def redact_pii(text: str) -> str:
    """
    Replace any PII matched by the six compiled regexes with "[REDACTED]".

    Covered types: email address, IPv4 address, UK phone number, UK National
    Insurance number, UK sort code, and credit card number. Pure regex work —
    no NLP model, no spaCy, no network calls — so a query is processed in
    microseconds.

    PERSON entities are deliberately left intact: visitors are expected to
    name Darshan Chheda in their queries, and redacting that breaks retrieval.
    """
    if not text:
        return text
    redacted = text
    for matcher in _PII_PATTERNS:
        redacted = matcher.sub("[REDACTED]", redacted)
    return redacted