File size: 3,045 Bytes
c59d808
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Data Sanitization Utilities
import re
import html
from typing import Optional
from backend.config.logging_config import get_logger

# Module-level logger shared by everything in this file; obtained from the
# project's logging configuration helper under the name "sanitization".
logger = get_logger("sanitization")



class DataSanitizer:
    """Simple data sanitization utility for recipe chatbot inputs.

    The sanitization pipeline is: strip surrounding whitespace, remove a
    small set of known-dangerous markup fragments, HTML-escape whatever is
    left, and finally collapse runs of whitespace.
    """

    # Configuration constants (lengths are checked against the raw input)
    MAX_MESSAGE_LENGTH = 1000
    MIN_MESSAGE_LENGTH = 1

    # Simple patterns for basic protection. They are written against raw
    # (un-escaped) markup, so they must be applied BEFORE html.escape().
    HARMFUL_PATTERNS = [
        r'<script[^>]*>.*?</script>',  # Script tags
        r'javascript:',               # JavaScript URLs
        r'on\w+\s*=',                # Event handlers
    ]

    @classmethod
    def sanitize_input(cls, text: str) -> str:
        """
        Sanitize user input for recipe chatbot

        Args:
            text: Raw user input

        Returns:
            Sanitized text: harmful fragments removed, HTML special
            characters escaped, whitespace collapsed and trimmed.

        Raises:
            ValueError: If the input is empty, violates the length limits,
                or is empty once sanitization has been applied.
        """
        if not text:
            raise ValueError("Input cannot be empty")

        logger.debug(f"🧼 Sanitizing input: '{text[:50]}...'")

        # Step 1: Basic validation on the raw input
        cls._validate_length(text)

        # Steps 2-4: strip, remove harmful content, escape, normalize
        sanitized = cls._scrub(text)

        # Step 5: Final validation (``_scrub`` already returns trimmed text)
        if not sanitized:
            raise ValueError("Input cannot be empty after sanitization")

        logger.debug("✅ Input sanitized successfully")
        return sanitized

    @classmethod
    def _scrub(cls, text: str) -> str:
        """Apply the pure text transformations of the pipeline.

        Harmful patterns are removed before HTML escaping: once ``<`` and
        ``>`` have been turned into entities the script-tag pattern could
        never match. Escaping afterwards neutralizes any markup the
        patterns do not cover.
        """
        stripped = text.strip()
        without_harmful = cls._remove_harmful_content(stripped)
        escaped = html.escape(without_harmful)
        return cls._normalize_whitespace(escaped)

    @classmethod
    def _validate_length(cls, text: str) -> None:
        """Validate input length.

        Raises:
            ValueError: If ``text`` is shorter than ``MIN_MESSAGE_LENGTH``
                or longer than ``MAX_MESSAGE_LENGTH``.
        """
        if len(text) < cls.MIN_MESSAGE_LENGTH:
            raise ValueError(f"Input too short (minimum {cls.MIN_MESSAGE_LENGTH} character)")

        if len(text) > cls.MAX_MESSAGE_LENGTH:
            raise ValueError(f"Input too long (maximum {cls.MAX_MESSAGE_LENGTH} characters)")

    @classmethod
    def _remove_harmful_content(cls, text: str) -> str:
        """Remove every occurrence of ``HARMFUL_PATTERNS`` from ``text``.

        Matching is case-insensitive and ``.`` spans newlines. The patterns
        are re-applied until the text stops changing, because deleting one
        match can splice its neighbours into a new match (for example
        ``javajavascript:script:`` collapses to ``javascript:``).
        """
        previous = None
        # Each pass either shortens the text or leaves it untouched, so the
        # loop always terminates.
        while previous != text:
            previous = text
            for pattern in cls.HARMFUL_PATTERNS:
                text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)
        return text

    @classmethod
    def _normalize_whitespace(cls, text: str) -> str:
        """Collapse each run of whitespace to one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

# Convenience function for easy import
def sanitize_user_input(text: str) -> str:
    """Sanitize any user input (chat messages, demo prompts, etc.).

    Module-level shortcut that delegates to ``DataSanitizer.sanitize_input``
    and returns its result unchanged; any exception it raises propagates.
    """
    sanitized = DataSanitizer.sanitize_input(text)
    return sanitized

def clean(s: Optional[str]) -> Optional[str]:
    """Normalize whitespace in ``s`` and drop a trailing "click here" tail.

    Args:
        s: Text to clean; may be ``None`` or empty.

    Returns:
        The text with whitespace runs collapsed to single spaces, everything
        from the first whole-word "click here" (case-insensitive) onward
        removed, and surrounding whitespace trimmed; ``None`` when the input
        is ``None``/empty or nothing remains after cleaning.
    """
    if not s:
        return None
    collapsed = re.sub(r"\s+", " ", s)
    # Whitespace is already collapsed, so no newlines remain and ``.*``
    # reaches the end of the text.
    truncated = re.sub(r"\bclick here\b.*", "", collapsed, flags=re.I)
    # Strip AFTER truncating so the space that preceded "click here" does
    # not survive as trailing whitespace.
    return truncated.strip() or None