File size: 7,976 Bytes
dc3879e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
"""Security utilities for the AI chatbot.

[Task]: T057
[From]: specs/004-ai-chatbot/tasks.md

This module provides security functions including prompt injection sanitization,
input validation, and content filtering.
"""
import re
import html
from typing import Optional, List


# Known prompt injection patterns.  Each regex carries an inline (?i) flag,
# so matching is case-insensitive regardless of how the caller applies it.
# detect_prompt_injection() scans messages against every entry in order, and
# _get_severity_for_pattern() derives a severity from keywords found in the
# pattern text itself — so the wording of these patterns matters beyond
# what they match.
PROMPT_INJECTION_PATTERNS: List[str] = [
    # Direct instructions to ignore previous context
    r"(?i)ignore\s+(all\s+)?(previous|above|prior)",
    r"(?i)disregard\s+(all\s+)?(previous|above|prior)",
    r"(?i)forget\s+(everything|all\s+instructions|previous)",
    r"(?i)override\s+(your\s+)?programming",
    r"(?i)new\s+(instruction|direction|rule)s?",
    r"(?i)change\s+(your\s+)?(behavior|role|persona)",

    # Jailbreak attempts
    r"(?i)(jailbreak|jail\s*break)",
    r"(?i)(developer|admin|root|privileged)\s+mode",
    r"(?i)act\s+as\s+(a\s+)?(developer|admin|root)",
    r"(?i)roleplay\s+as",
    r"(?i)pretend\s+(to\s+be|you're)",
    r"(?i)simulate\s+being",

    # System prompt extraction
    r"(?i)show\s+(your\s+)?(instructions|system\s+prompt|prompt)",
    r"(?i)print\s+(your\s+)?(instructions|system\s+prompt)",
    r"(?i)reveal\s+(your\s+)?(instructions|system\s+prompt)",
    r"(?i)what\s+(are\s+)?your\s+instructions",
    r"(?i)tell\s+me\s+how\s+you\s+work",

    # DAN and similar jailbreaks
    r"(?i)do\s+anything\s+now",
    r"(?i)unrestricted\s+mode",
    r"(?i)no\s+limitations?",
    r"(?i)bypass\s+(safety|filters|restrictions)",
    r"(?i)\bDAN\b",  # Do Anything Now
]


def sanitize_message(message: str, max_length: int = 10000) -> str:
    """Sanitize a user message to prevent prompt injection attacks.

    [From]: specs/004-ai-chatbot/spec.md - NFR-017

    Empty or falsy input yields an empty string.  The message is first
    truncated to ``max_length`` characters, then screened for injection
    patterns; only high-severity findings cause rejection, while lower
    severities still pass through to normal sanitization.

    Args:
        message: The raw user message
        max_length: Maximum allowed message length

    Returns:
        Sanitized message safe for processing by AI

    Raises:
        ValueError: If message contains severe injection attempts
    """
    if not message:
        return ""

    truncated = message[:max_length]

    # Reject outright when a high-severity injection attempt is detected;
    # low/medium findings are tolerated and merely sanitized below.
    finding = detect_prompt_injection(truncated)
    if finding is not None and finding["severity"] == "high":
        raise ValueError(
            "This message contains content that cannot be processed. "
            "Please rephrase your request."
        )

    return _apply_sanitization(truncated)


def detect_prompt_injection(message: str) -> Optional[dict]:
    """Detect potential prompt injection attempts in a message.

    [From]: specs/004-ai-chatbot/spec.md - NFR-017

    Args:
        message: The message to check

    Returns:
        Dictionary with detection info if injection detected, None otherwise:
        {
            "detected": True,
            "severity": "low" | "medium" | "high",
            "pattern": "matched pattern",
            "confidence": 0.0-1.0
        }
    """
    # Patterns already carry (?i); lowering the text keeps the reported
    # match consistent regardless of the user's casing.
    lowered = message.lower()

    for injection_pattern in PROMPT_INJECTION_PATTERNS:
        hit = re.search(injection_pattern, lowered)
        if hit is None:
            continue

        matched_text = hit.group()

        # A pattern that fired inside ordinary task wording is treated as
        # a false positive; keep scanning the remaining patterns.
        if _check_legitimate_context(message, matched_text):
            continue

        return {
            "detected": True,
            "severity": _get_severity_for_pattern(injection_pattern),
            "pattern": matched_text,
            "confidence": 0.8,
        }

    return None


def _get_severity_for_pattern(pattern: str) -> str:
    """Determine severity level for a matched pattern.

    Args:
        pattern: The regex pattern that matched

    Returns:
        "low", "medium", or "high"
    """
    pattern_lower = pattern.lower()

    # High severity: direct jailbreak attempts
    if any(word in pattern_lower for word in ["jailbreak", "dan", "unrestricted", "bypass"]):
        return "high"

    # High severity: system prompt extraction
    if any(word in pattern_lower for word in ["show", "print", "reveal", "instructions"]):
        return "high"

    # Medium severity: role/persona manipulation
    if any(word in pattern_lower for word in ["act as", "pretend", "roleplay", "override"]):
        return "medium"

    # Low severity: ignore instructions
    if any(word in pattern_lower for word in ["ignore", "disregard", "forget"]):
        return "low"

    return "low"


def _check_legitimate_context(message: str, matched_text: str) -> bool:
    """Check if a matched pattern might be legitimate user content.

    [From]: specs/004-ai-chatbot/spec.md - NFR-017

    Args:
        message: The full message
        matched_text: The text that matched a pattern

    Returns:
        True if this appears to be legitimate context, False otherwise
    """
    message_lower = message.lower()
    matched_lower = matched_text.lower()

    # Check if the matched text is part of a task description (legitimate)
    legitimate_contexts = [
        # Common task-related phrases
        "task to ignore",
        "mark as complete",
        "disregard this",
        "role in the project",
        "change status",
        "update the role",
        "priority change",
    ]

    for context in legitimate_contexts:
        if context in message_lower:
            return True

    # Check if matched text is very short (likely false positive)
    if len(matched_text) <= 3:
        return True

    return False


def _apply_sanitization(message: str) -> str:
    """Apply sanitization transformations to a message.

    [From]: specs/004-ai-chatbot/spec.md - NFR-017

    Args:
        message: The message to sanitize

    Returns:
        Sanitized message
    """
    # Remove excessive whitespace
    message = re.sub(r"\s+", " ", message)

    # Remove control characters except newlines and tabs
    message = re.sub(r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]", "", message)

    # Normalize line endings
    message = message.replace("\r\n", "\n").replace("\r", "\n")

    # Limit consecutive newlines to 2
    message = re.sub(r"\n{3,}", "\n\n", message)

    return message.strip()


def validate_task_input(task_data: dict) -> tuple[bool, Optional[str]]:
    """Validate task-related input for security issues.

    [From]: specs/004-ai-chatbot/spec.md - NFR-017

    Only top-level string values are inspected; nested dicts/lists are
    not recursed into — presumably task payloads are flat; verify against
    callers if that changes.

    Fix: the <script> check now uses re.DOTALL so a script body
    containing newlines (e.g. "<script>\\nalert(1)\\n</script>") no
    longer evades detection — previously "." did not match "\\n".

    Args:
        task_data: Dictionary containing task fields

    Returns:
        Tuple of (is_valid, error_message)
    """
    if not isinstance(task_data, dict):
        return False, "Invalid task data format"

    # Check for SQL injection patterns in string fields
    sql_patterns = [
        r"(?i)(\bunion\b.*\bselect\b)",
        r"(?i)(\bselect\b.*\bfrom\b)",
        r"(?i)(\binsert\b.*\binto\b)",
        r"(?i)(\bupdate\b.*\bset\b)",
        r"(?i)(\bdelete\b.*\bfrom\b)",
        r"(?i)(\bdrop\b.*\btable\b)",
        r";\s*(union|select|insert|update|delete|drop)",
    ]

    for key, value in task_data.items():
        if isinstance(value, str):
            for pattern in sql_patterns:
                if re.search(pattern, value):
                    return False, f"Invalid characters in {key}"

            # Check for script injection; DOTALL lets the check span
            # multi-line script bodies.
            if re.search(
                r"<script[^>]*>.*?</script>",
                value,
                re.IGNORECASE | re.DOTALL,
            ):
                return False, f"Invalid content in {key}"

    return True, None


def sanitize_html_content(content: str, *, quote: bool = False) -> str:
    """Sanitize HTML content by escaping potentially dangerous elements.

    [From]: specs/004-ai-chatbot/spec.md - NFR-017

    Escapes ``&``, ``<`` and ``>``.  With the default ``quote=False``
    (the original behavior), quote characters are NOT escaped — the
    output is only safe in HTML element content, not inside attribute
    values.  Callers embedding the result in an attribute should pass
    ``quote=True`` so ``"`` and ``'`` are escaped as well.

    Args:
        content: Content that may contain HTML
        quote: Also escape quote characters (for attribute contexts)

    Returns:
        Escaped HTML string
    """
    return html.escape(content, quote=quote)


# Public API of this module; the underscore-prefixed helpers
# (_get_severity_for_pattern, _check_legitimate_context,
# _apply_sanitization) are intentionally omitted.
__all__ = [
    "sanitize_message",
    "detect_prompt_injection",
    "validate_task_input",
    "sanitize_html_content",
]