File size: 4,643 Bytes
abb96d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""

Security Module
===============
Input validation and sanitization to prevent abuse and attacks.

Features:
- Input length validation
- Prompt injection detection
- Suspicious pattern detection
- Logging of security violations

Configuration:
- Adjust MAX_INPUT_LENGTH and MIN_INPUT_LENGTH as needed
- Add custom patterns to SecurityValidator.SUSPICIOUS_PATTERNS if needed

"""

import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple


class SecurityValidator:
    """Validate and sanitize user input before it is processed further.

    ``validate_input`` applies, in order: a type check, whitespace trimming,
    length bounds, a scan for known prompt-injection / script-injection
    phrases, and a cap on the share of unusual characters. Inputs rejected by
    the last two checks are appended as JSON lines to
    ``<log_dir>/security.jsonl``.
    """

    # Input length constraints, applied after surrounding whitespace is stripped
    MAX_INPUT_LENGTH = 2000
    MIN_INPUT_LENGTH = 1

    # Inputs whose fraction of special characters exceeds this are rejected
    MAX_SPECIAL_CHAR_RATIO = 0.3

    # Suspicious patterns that might indicate prompt injection or abuse.
    # Each entry is a regex searched case-insensitively against the input.
    SUSPICIOUS_PATTERNS = [
        r"ignore\s+(previous|all|your)\s+instructions",
        r"system\s*prompt",
        r"you\s+are\s+now",
        r"pretend\s+to\s+be",
        r"act\s+as\s+(a|an)",
        r"<script[^>]*>",
        r"javascript:",
        r"\{\{.*\}\}",  # Template injection
        r"reveal\s+(your|the)\s+(prompt|instructions)",
        r"disregard\s+(previous|all)",
        r"admin\s+mode",
        r"developer\s+mode",
    ]

    # A "special" character is anything other than a letter, digit,
    # whitespace or common punctuation. ``\w`` is Unicode-aware, so letters
    # from non-English scripts are not penalised; the trailing ``|_`` keeps
    # the underscore counted as special even though ``\w`` matches it.
    # NOTE: combining marks are not matched by ``\w`` and still count as
    # special.
    _SPECIAL_CHAR_RE = re.compile(r"[^\w\s.,;:?!()\-']|_")

    def __init__(self, log_dir: str = "logs"):
        """Prepare the directory that holds the security log.

        Args:
            log_dir: Directory for ``security.jsonl``. If it cannot be
                created, a ``hickeylab_logs`` directory under the system
                temp directory is used instead.

        Raises:
            OSError: If the fallback directory cannot be created either.
        """
        self.log_dir = Path(log_dir)
        try:
            self.log_dir.mkdir(parents=True, exist_ok=True)
        except OSError:  # PermissionError is a subclass of OSError
            import tempfile
            self.log_dir = Path(tempfile.gettempdir()) / "hickeylab_logs"
            self.log_dir.mkdir(parents=True, exist_ok=True)
        self.security_log = self.log_dir / "security.jsonl"

    def validate_input(
        self,
        user_input: str,
        session_id: str
    ) -> Tuple[bool, str, Optional[str]]:
        """Validate and sanitize user input.

        Args:
            user_input: The user's input text. Non-string values (including
                ``None``) are rejected the same way as empty input.
            session_id: Unique session identifier for logging.

        Returns:
            Tuple of (is_valid, cleaned_input, error_message)
            - is_valid: True if input passes all checks
            - cleaned_input: The trimmed input, or "" when invalid
            - error_message: User-facing error message if invalid, else None
        """
        # Reject non-text input up front instead of failing on .strip()
        if not isinstance(user_input, str):
            return False, "", "Please enter a question."

        # Strip whitespace
        cleaned = user_input.strip()

        # Check minimum length
        if len(cleaned) < self.MIN_INPUT_LENGTH:
            return False, "", "Please enter a question."

        # Check maximum length
        if len(cleaned) > self.MAX_INPUT_LENGTH:
            return (
                False,
                "",
                f"⚠️ Question too long. Please keep your question under {self.MAX_INPUT_LENGTH} characters. "
                f"(Current: {len(cleaned)} characters)"
            )

        # Check for suspicious patterns
        matched_pattern = self._first_suspicious_pattern(cleaned)
        if matched_pattern is not None:
            self._log_suspicious(session_id, cleaned, matched_pattern)
            return (
                False,
                "",
                "⚠️ Your question contains invalid content. Please rephrase and try again."
            )

        # Check for excessive special characters (might indicate injection attempt)
        if self._special_char_ratio(cleaned) > self.MAX_SPECIAL_CHAR_RATIO:
            self._log_suspicious(session_id, cleaned, "excessive_special_chars")
            return (
                False,
                "",
                "⚠️ Your question contains unusual characters. Please use standard text."
            )

        # All checks passed
        return True, cleaned, None

    def _first_suspicious_pattern(self, text: str) -> Optional[str]:
        """Return the first entry of SUSPICIOUS_PATTERNS found in *text*, or None."""
        for pattern in self.SUSPICIOUS_PATTERNS:
            if re.search(pattern, text, re.IGNORECASE):
                return pattern
        return None

    def _special_char_ratio(self, text: str) -> float:
        """Return the fraction of characters in *text* that count as special."""
        special_count = len(self._SPECIAL_CHAR_RE.findall(text))
        # max(..., 1) guards against division by zero on empty text
        return special_count / max(len(text), 1)

    def _log_suspicious(self, session_id: str, content: str, reason: str) -> None:
        """Append one JSON line describing a rejected input to the security log.

        Logging is best-effort: a failure to write is reported on stdout and
        does not propagate to the caller.

        Args:
            session_id: Session identifier; only its first 8 characters are stored.
            content: The rejected input; only a preview of up to 100 characters is stored.
            reason: The matched pattern or a short reason code.
        """
        preview = (content[:100] + "...") if len(content) > 100 else content
        log_entry = {
            # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated)
            "timestamp": datetime.now(timezone.utc).isoformat(),
            # str() keeps logging from raising on a non-string session id
            "session_id": str(session_id)[:8],
            "content_length": len(content),
            "content_preview": preview,
            "reason": reason
        }

        try:
            with open(self.security_log, "a", encoding="utf-8") as f:
                f.write(json.dumps(log_entry) + "\n")
        except OSError as e:  # IOError is an alias of OSError
            print(f"Warning: Could not log security violation: {e}")