plg4-dev-server / backend /utils /sanitization.py
Jesse Johnson
New commit for backend deployment: 2025-09-25_13-24-03
c59d808
# Data Sanitization Utilities
import re
import html
from typing import Optional
from backend.config.logging_config import get_logger
# Setup logging
logger = get_logger("sanitization")
class DataSanitizer:
    """Simple data sanitization utility for recipe chatbot inputs.

    The public entry point is :meth:`sanitize_input`. It validates the raw
    length, strips the patterns in ``HARMFUL_PATTERNS`` from the raw text,
    HTML-escapes what is left and finally collapses whitespace.
    """

    # Configuration constants (lengths are measured on the raw, unstripped input)
    MAX_MESSAGE_LENGTH = 1000
    MIN_MESSAGE_LENGTH = 1

    # Simple patterns for basic protection. They are applied case-insensitively
    # and with DOTALL, against the *unescaped* text.
    HARMFUL_PATTERNS = [
        r'<script[^>]*>.*?</script>',  # Script tags
        r'javascript:',  # JavaScript URLs
        # NOTE(review): no word boundary, so ordinary text such as
        # "seasoning =" is also affected ("oning =" is removed).
        r'on\w+\s*=',  # Event handlers
    ]

    @classmethod
    def sanitize_input(cls, text: str) -> str:
        """
        Sanitize user input for recipe chatbot

        Args:
            text: Raw user input

        Returns:
            Sanitized text

        Raises:
            ValueError: If the input is empty, violates the length limits,
                or is empty once sanitization has been applied
        """
        if not text:
            raise ValueError("Input cannot be empty")

        logger.debug(f"🧼 Sanitizing input: '{text[:50]}...'")

        # Step 1: Basic validation
        cls._validate_length(text)

        # Steps 2-4: remove harmful patterns, HTML-encode, normalize whitespace
        sanitized = cls._apply_sanitization_steps(text)

        # Step 5: Final validation
        if not sanitized.strip():
            raise ValueError("Input cannot be empty after sanitization")

        logger.debug("✅ Input sanitized successfully")
        return sanitized.strip()

    @classmethod
    def _apply_sanitization_steps(cls, text: str) -> str:
        """Run the text transformations of the pipeline (no validation, no logging).

        Harmful patterns are removed *before* HTML escaping: once ``<`` and
        ``>`` have been turned into ``&lt;``/``&gt;`` the script-tag pattern
        can no longer match, which would make that pattern dead code.

        Args:
            text: Raw user input

        Returns:
            Text with harmful patterns removed, HTML-escaped, and with
            whitespace runs collapsed; may be an empty string
        """
        stripped = text.strip()
        without_harmful = cls._remove_harmful_content(stripped)
        # Escaping happens last so any markup that survives is rendered inert.
        escaped = html.escape(without_harmful)
        return cls._normalize_whitespace(escaped)

    @classmethod
    def _validate_length(cls, text: str) -> None:
        """Validate input length.

        Raises:
            ValueError: If ``len(text)`` is below ``MIN_MESSAGE_LENGTH`` or
                above ``MAX_MESSAGE_LENGTH``
        """
        if len(text) < cls.MIN_MESSAGE_LENGTH:
            raise ValueError(f"Input too short (minimum {cls.MIN_MESSAGE_LENGTH} character)")
        if len(text) > cls.MAX_MESSAGE_LENGTH:
            raise ValueError(f"Input too long (maximum {cls.MAX_MESSAGE_LENGTH} characters)")

    @classmethod
    def _remove_harmful_content(cls, text: str) -> str:
        """Remove every occurrence of the ``HARMFUL_PATTERNS`` from ``text``.

        The patterns are re-applied until the text stops changing, because a
        single pass can be defeated by nesting (removing the inner match of
        ``javajavascript:script:`` leaves a fresh ``javascript:`` behind).
        """
        flags = re.IGNORECASE | re.DOTALL
        previous = None
        # Each pass only deletes characters, so the loop always terminates.
        while previous != text:
            previous = text
            for pattern in cls.HARMFUL_PATTERNS:
                text = re.sub(pattern, '', text, flags=flags)
        return text

    @classmethod
    def _normalize_whitespace(cls, text: str) -> str:
        """Collapse each whitespace run to a single space and trim both ends."""
        # Replace multiple whitespace with single space
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
# Convenience function for easy import
def sanitize_user_input(text: str) -> str:
    """Module-level shortcut for ``DataSanitizer.sanitize_input``.

    Intended for any user-supplied text (chat messages, demo prompts, etc.).

    Args:
        text: Raw user input

    Returns:
        The value returned by ``DataSanitizer.sanitize_input(text)``

    Raises:
        ValueError: Propagated unchanged from ``DataSanitizer.sanitize_input``
    """
    sanitized = DataSanitizer.sanitize_input(text)
    return sanitized
def clean(s: Optional[str]) -> Optional[str]:
    """Collapse whitespace in ``s`` and drop a trailing "click here …" tail.

    Args:
        s: Text to clean; may be ``None`` or empty

    Returns:
        The cleaned text with no leading or trailing whitespace, or ``None``
        when the input is ``None``/empty or nothing remains after cleaning
    """
    if not s:
        return None
    # Every whitespace run (newlines included) becomes one space, so the
    # ".*" below reaches the end of the text even without DOTALL.
    s = re.sub(r"\s+", " ", s)
    # Remove "click here" (case-insensitive, whole words) and all that follows.
    s = re.sub(r"\bclick here\b.*", "", s, flags=re.I)
    # Strip only after the removal: cutting the tail exposes the space that
    # preceded "click here", which would otherwise be left dangling.
    return s.strip() or None