"""Security utilities for the AI chatbot. [Task]: T057 [From]: specs/004-ai-chatbot/tasks.md This module provides security functions including prompt injection sanitization, input validation, and content filtering. """ import re import html from typing import Optional, List # Known prompt injection patterns PROMPT_INJECTION_PATTERNS = [ # Direct instructions to ignore previous context r"(?i)ignore\s+(all\s+)?(previous|above|prior)", r"(?i)disregard\s+(all\s+)?(previous|above|prior)", r"(?i)forget\s+(everything|all\s+instructions|previous)", r"(?i)override\s+(your\s+)?programming", r"(?i)new\s+(instruction|direction|rule)s?", r"(?i)change\s+(your\s+)?(behavior|role|persona)", # Jailbreak attempts r"(?i)(jailbreak|jail\s*break)", r"(?i)(developer|admin|root|privileged)\s+mode", r"(?i)act\s+as\s+(a\s+)?(developer|admin|root)", r"(?i)roleplay\s+as", r"(?i)pretend\s+(to\s+be|you're)", r"(?i)simulate\s+being", # System prompt extraction r"(?i)show\s+(your\s+)?(instructions|system\s+prompt|prompt)", r"(?i)print\s+(your\s+)?(instructions|system\s+prompt)", r"(?i)reveal\s+(your\s+)?(instructions|system\s+prompt)", r"(?i)what\s+(are\s+)?your\s+instructions", r"(?i)tell\s+me\s+how\s+you\s+work", # DAN and similar jailbreaks r"(?i)do\s+anything\s+now", r"(?i)unrestricted\s+mode", r"(?i)no\s+limitations?", r"(?i)bypass\s+(safety|filters|restrictions)", r"(?i)\bDAN\b", # Do Anything Now ] def sanitize_message(message: str, max_length: int = 10000) -> str: """Sanitize a user message to prevent prompt injection attacks. [From]: specs/004-ai-chatbot/spec.md - NFR-017 Args: message: The raw user message max_length: Maximum allowed message length Returns: Sanitized message safe for processing by AI Raises: ValueError: If message contains severe injection attempts """ if not message: return "" # Trim to max length message = message[:max_length] # Check for severe injection patterns detected = detect_prompt_injection(message) if detected: # For severe attacks, reject the message if detected["severity"] == "high": raise ValueError( "This message contains content that cannot be processed. " "Please rephrase your request." ) # Apply sanitization sanitized = _apply_sanitization(message) return sanitized def detect_prompt_injection(message: str) -> Optional[dict]: """Detect potential prompt injection attempts in a message. [From]: specs/004-ai-chatbot/spec.md - NFR-017 Args: message: The message to check Returns: Dictionary with detection info if injection detected, None otherwise: { "detected": True, "severity": "low" | "medium" | "high", "pattern": "matched pattern", "confidence": 0.0-1.0 } """ message_lower = message.lower() for pattern in PROMPT_INJECTION_PATTERNS: match = re.search(pattern, message_lower) if match: # Determine severity based on pattern type severity = _get_severity_for_pattern(pattern) # Check for context that might indicate legitimate use is_legitimate = _check_legitimate_context(message, match.group()) if not is_legitimate: return { "detected": True, "severity": severity, "pattern": match.group(), "confidence": 0.8 } return None def _get_severity_for_pattern(pattern: str) -> str: """Determine severity level for a matched pattern. Args: pattern: The regex pattern that matched Returns: "low", "medium", or "high" """ pattern_lower = pattern.lower() # High severity: direct jailbreak attempts if any(word in pattern_lower for word in ["jailbreak", "dan", "unrestricted", "bypass"]): return "high" # High severity: system prompt extraction if any(word in pattern_lower for word in ["show", "print", "reveal", "instructions"]): return "high" # Medium severity: role/persona manipulation if any(word in pattern_lower for word in ["act as", "pretend", "roleplay", "override"]): return "medium" # Low severity: ignore instructions if any(word in pattern_lower for word in ["ignore", "disregard", "forget"]): return "low" return "low" def _check_legitimate_context(message: str, matched_text: str) -> bool: """Check if a matched pattern might be legitimate user content. [From]: specs/004-ai-chatbot/spec.md - NFR-017 Args: message: The full message matched_text: The text that matched a pattern Returns: True if this appears to be legitimate context, False otherwise """ message_lower = message.lower() matched_lower = matched_text.lower() # Check if the matched text is part of a task description (legitimate) legitimate_contexts = [ # Common task-related phrases "task to ignore", "mark as complete", "disregard this", "role in the project", "change status", "update the role", "priority change", ] for context in legitimate_contexts: if context in message_lower: return True # Check if matched text is very short (likely false positive) if len(matched_text) <= 3: return True return False def _apply_sanitization(message: str) -> str: """Apply sanitization transformations to a message. [From]: specs/004-ai-chatbot/spec.md - NFR-017 Args: message: The message to sanitize Returns: Sanitized message """ # Remove excessive whitespace message = re.sub(r"\s+", " ", message) # Remove control characters except newlines and tabs message = re.sub(r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]", "", message) # Normalize line endings message = message.replace("\r\n", "\n").replace("\r", "\n") # Limit consecutive newlines to 2 message = re.sub(r"\n{3,}", "\n\n", message) return message.strip() def validate_task_input(task_data: dict) -> tuple[bool, Optional[str]]: """Validate task-related input for security issues. [From]: specs/004-ai-chatbot/spec.md - NFR-017 Args: task_data: Dictionary containing task fields Returns: Tuple of (is_valid, error_message) """ if not isinstance(task_data, dict): return False, "Invalid task data format" # Check for SQL injection patterns in string fields sql_patterns = [ r"(?i)(\bunion\b.*\bselect\b)", r"(?i)(\bselect\b.*\bfrom\b)", r"(?i)(\binsert\b.*\binto\b)", r"(?i)(\bupdate\b.*\bset\b)", r"(?i)(\bdelete\b.*\bfrom\b)", r"(?i)(\bdrop\b.*\btable\b)", r";\s*(union|select|insert|update|delete|drop)", ] for key, value in task_data.items(): if isinstance(value, str): for pattern in sql_patterns: if re.search(pattern, value): return False, f"Invalid characters in {key}" # Check for script injection if re.search(r"]*>.*?", value, re.IGNORECASE): return False, f"Invalid content in {key}" return True, None def sanitize_html_content(content: str) -> str: """Sanitize HTML content by escaping potentially dangerous elements. [From]: specs/004-ai-chatbot/spec.md - NFR-017 Args: content: Content that may contain HTML Returns: Escaped HTML string """ return html.escape(content, quote=False) __all__ = [ "sanitize_message", "detect_prompt_injection", "validate_task_input", "sanitize_html_content", ]