""" Output Sanitization Module Sanitizes HTML content, user-generated strings, and API responses to prevent XSS and other injection attacks. """ from __future__ import annotations import html import re import logging from typing import Any, Optional logger = logging.getLogger(__name__) # ============================================================================= # HTML SANITIZATION # ============================================================================= # Allowed HTML tags for rich text ALLOWED_TAGS = frozenset({ "p", "br", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "strong", "b", "em", "i", "u", "s", "strike", "ul", "ol", "li", "blockquote", "pre", "code", "a", "img", "table", "thead", "tbody", "tr", "th", "td", "figure", "figcaption", "div", "span", }) # Allowed attributes by tag ALLOWED_ATTRS = { "a": {"href", "title", "target", "rel"}, "img": {"src", "alt", "title", "width", "height"}, "code": {"class"}, "pre": {"class"}, "div": {"class", "id"}, "span": {"class"}, "table": {"class"}, "td": {"colspan", "rowspan"}, "th": {"colspan", "rowspan"}, } # Dangerous attribute patterns DANGEROUS_ATTR_PATTERNS = ( re.compile(r"javascript:", re.IGNORECASE), re.compile(r"vbscript:", re.IGNORECASE), re.compile(r"data:", re.IGNORECASE), re.compile(r"on\w+\s*=", re.IGNORECASE), # onclick, onload, etc. ) def sanitize_html(html_content: str, allow_all_tags: bool = False) -> str: """ Sanitize HTML content to prevent XSS. Args: html_content: Raw HTML to sanitize allow_all_tags: If True, only remove dangerous attrs (for trusted content) Returns: Sanitized HTML string """ if not html_content: return "" # Check for dangerous patterns for pattern in DANGEROUS_ATTR_PATTERNS: html_content = pattern.sub("", html_content) # Remove script and style tags completely html_content = re.sub( r"]*>.*?", "", html_content, flags=re.IGNORECASE | re.DOTALL ) html_content = re.sub( r"]*>.*?", "", html_content, flags=re.IGNORECASE | re.DOTALL ) # Remove event handlers from all tags html_content = re.sub( r'\s+on\w+\s*=\s*["\'][^"\']*["\']', "", html_content, flags=re.IGNORECASE ) return html_content def escape_html(text: str) -> str: """ Escape HTML special characters. Args: text: Plain text to escape Returns: HTML-escaped text """ if not text: return "" return html.escape(text, quote=True) def unescape_html(text: str) -> str: """ Unescape HTML entities. Args: text: HTML-escaped text Returns: Unescaped text """ if not text: return "" return html.unescape(text) # ============================================================================= # STRING SANITIZATION # ============================================================================= def sanitize_filename(filename: str, max_length: int = 100) -> str: """ Sanitize a string for use as a filename. Args: filename: Raw filename max_length: Maximum filename length Returns: Safe filename """ if not filename: return "untitled" # Remove or replace dangerous characters safe = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename) # Remove leading/trailing spaces and dots safe = safe.strip(". ") # Collapse multiple underscores safe = re.sub(r'_+', '_', safe) # Truncate if len(safe) > max_length: safe = safe[:max_length] # Ensure not empty if not safe: return "untitled" return safe def sanitize_for_json(text: str) -> str: """ Sanitize text for safe JSON embedding. Args: text: Raw text Returns: JSON-safe text """ if not text: return "" # Escape control characters text = re.sub(r'[\x00-\x1f\x7f]', '', text) # Escape problematic characters text = text.replace('\\', '\\\\') text = text.replace('"', '\\"') text = text.replace('\n', '\\n') text = text.replace('\r', '\\r') text = text.replace('\t', '\\t') return text def truncate_text(text: str, max_length: int = 1000, suffix: str = "...") -> str: """ Truncate text to maximum length. Args: text: Text to truncate max_length: Maximum length suffix: Suffix to add when truncated Returns: Truncated text """ if not text: return "" if len(text) <= max_length: return text return text[:max_length - len(suffix)] + suffix # ============================================================================= # URL SANITIZATION # ============================================================================= def sanitize_url(url: str) -> Optional[str]: """ Sanitize a URL for safe use. Args: url: Raw URL Returns: Sanitized URL or None if invalid """ if not url: return None url = url.strip() # Check for dangerous schemes lower_url = url.lower() if any(lower_url.startswith(s) for s in ["javascript:", "vbscript:", "data:", "file:"]): logger.warning(f"Blocked dangerous URL scheme: {url[:50]}") return None # Ensure valid scheme if not url.startswith(("http://", "https://")): # Try adding https url = "https://" + url return url def sanitize_href(href: str) -> str: """ Sanitize an href attribute value. Args: href: Raw href value Returns: Sanitized href or '#' if invalid """ sanitized = sanitize_url(href) return sanitized if sanitized else "#" # ============================================================================= # API RESPONSE SANITIZATION # ============================================================================= def sanitize_api_response(data: dict[str, Any]) -> dict[str, Any]: """ Sanitize an API response for safe rendering. Args: data: Raw API response dict Returns: Sanitized response """ if not isinstance(data, dict): return {} sanitized = {} for key, value in data.items(): if isinstance(value, str): # Escape potential HTML in string values sanitized[key] = escape_html(value) elif isinstance(value, dict): sanitized[key] = sanitize_api_response(value) elif isinstance(value, list): sanitized[key] = [ sanitize_api_response(item) if isinstance(item, dict) else escape_html(item) if isinstance(item, str) else item for item in value ] else: sanitized[key] = value return sanitized def sanitize_markdown(markdown: str) -> str: """ Sanitize markdown content. Removes potentially dangerous content while preserving formatting. Args: markdown: Raw markdown Returns: Sanitized markdown """ if not markdown: return "" # Remove HTML comments markdown = re.sub(r'', '', markdown, flags=re.DOTALL) # Remove inline scripts markdown = re.sub(r']*>.*?', '', markdown, flags=re.DOTALL | re.IGNORECASE) # Remove inline styles that could be malicious markdown = re.sub(r']*>.*?', '', markdown, flags=re.DOTALL | re.IGNORECASE) # Remove javascript: links markdown = re.sub(r'\[([^\]]+)\]\(javascript:[^)]*\)', r'\1', markdown, flags=re.IGNORECASE) return markdown