Spaces:
Running
Running
"""
Output Sanitization Module

Sanitizes HTML content, user-generated strings, and API responses
to prevent XSS and other injection attacks.
"""
from __future__ import annotations

import html
import logging
import re
from html.parser import HTMLParser
from typing import Any, Optional
| logger = logging.getLogger(__name__) | |
| # ============================================================================= | |
| # HTML SANITIZATION | |
| # ============================================================================= | |
# Tags that may appear in rich text when ``allow_all_tags`` is False.
ALLOWED_TAGS = frozenset({
    "p", "br", "hr",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "strong", "b", "em", "i", "u", "s", "strike",
    "ul", "ol", "li",
    "blockquote", "pre", "code",
    "a", "img",
    "table", "thead", "tbody", "tr", "th", "td",
    "figure", "figcaption",
    "div", "span",
})

# Attributes that may be kept on each tag when ``allow_all_tags`` is False.
# Tags missing from this mapping keep no attributes at all.
ALLOWED_ATTRS = {
    "a": {"href", "title", "target", "rel"},
    "img": {"src", "alt", "title", "width", "height"},
    "code": {"class"},
    "pre": {"class"},
    "div": {"class", "id"},
    "span": {"class"},
    "table": {"class"},
    "td": {"colspan", "rowspan"},
    "th": {"colspan", "rowspan"},
}

# Legacy raw-text patterns, kept only so that code importing this name keeps
# working. sanitize_html() no longer applies them to the raw markup: doing so
# mangled ordinary prose ("metadata: x", "one = 1") and could be defeated by
# nesting ("javajavascript:script:").
DANGEROUS_ATTR_PATTERNS = (
    re.compile(r"javascript:", re.IGNORECASE),
    re.compile(r"vbscript:", re.IGNORECASE),
    re.compile(r"data:", re.IGNORECASE),
    re.compile(r"on\w+\s*=", re.IGNORECASE),  # onclick, onload, etc.
)

# Elements removed together with everything between their start and end tag.
_DROP_WITH_CONTENT = frozenset({"script", "style"})

# Attribute values beginning with one of these schemes are discarded.
_DANGEROUS_SCHEMES = ("javascript:", "vbscript:", "data:")

# Whitespace/control characters are removed before the scheme check so that
# obfuscated values such as "java\tscript:" are still recognised.
_SCHEME_NOISE = re.compile(r"[\x00-\x20\x7f]+")

# Conservative shape for tag/attribute names that are echoed back verbatim.
_SAFE_NAME = re.compile(r"[a-z_][a-z0-9_.:-]*")


class _HTMLSanitizer(HTMLParser):
    """Re-serialize parsed HTML, keeping only vetted tags and attributes."""

    def __init__(self, allow_all_tags: bool) -> None:
        # convert_charrefs=True hands text to handle_data() already unescaped,
        # so every text node can be uniformly re-escaped on output.
        super().__init__(convert_charrefs=True)
        self._allow_all_tags = allow_all_tags
        self._chunks: list[str] = []
        # Name of the <script>/<style> element currently being skipped.
        self._skipping: Optional[str] = None

    def result(self) -> str:
        """Return the sanitized markup accumulated so far."""
        return "".join(self._chunks)

    def _tag_allowed(self, tag: str) -> bool:
        """Tell whether *tag* may be written to the output."""
        if self._allow_all_tags:
            return _SAFE_NAME.fullmatch(tag) is not None
        return tag in ALLOWED_TAGS

    def _render_attrs(self, tag: str, attrs: list[tuple[str, Optional[str]]]) -> str:
        """Serialize the attributes of *tag* that survive filtering."""
        permitted = ALLOWED_ATTRS.get(tag, frozenset())
        rendered = []
        for name, value in attrs:
            # Event handlers (onclick, onerror, ...) are never allowed.
            if name.startswith("on") or _SAFE_NAME.fullmatch(name) is None:
                continue
            if not self._allow_all_tags and name not in permitted:
                continue
            if value is None:  # boolean attribute, e.g. <td hidden>
                rendered.append(f" {name}")
                continue
            # The parser has already decoded character references, so an
            # entity-encoded "javascript:" is caught here as well.
            probe = _SCHEME_NOISE.sub("", value).lower()
            if probe.startswith(_DANGEROUS_SCHEMES):
                continue
            rendered.append(f' {name}="{html.escape(value, quote=True)}"')
        return "".join(rendered)

    def _emit_open(self, tag: str, attrs: list[tuple[str, Optional[str]]], closer: str) -> None:
        """Write a start (or self-closing) tag if the tag is permitted."""
        if self._tag_allowed(tag):
            self._chunks.append(f"<{tag}{self._render_attrs(tag, attrs)}{closer}>")

    def handle_starttag(self, tag: str, attrs: list[tuple[str, Optional[str]]]) -> None:
        if self._skipping is not None:
            return
        if tag in _DROP_WITH_CONTENT:
            self._skipping = tag
            return
        self._emit_open(tag, attrs, "")

    def handle_startendtag(self, tag: str, attrs: list[tuple[str, Optional[str]]]) -> None:
        if self._skipping is not None or tag in _DROP_WITH_CONTENT:
            return
        self._emit_open(tag, attrs, " /")

    def handle_endtag(self, tag: str) -> None:
        if self._skipping is not None:
            if tag == self._skipping:
                self._skipping = None
            return
        if tag not in _DROP_WITH_CONTENT and self._tag_allowed(tag):
            self._chunks.append(f"</{tag}>")

    def handle_data(self, data: str) -> None:
        if self._skipping is None:
            # Text is always re-escaped, so it can never introduce markup.
            self._chunks.append(html.escape(data, quote=False))

    # Comments, doctype declarations and processing instructions fall through
    # to HTMLParser's no-op handlers and are therefore dropped from the output.


def sanitize_html(html_content: str, allow_all_tags: bool = False) -> str:
    """
    Sanitize HTML content to prevent XSS.

    The markup is parsed and rebuilt from scratch rather than patched with
    regular expressions: text nodes are re-escaped, ``<script>``/``<style>``
    elements are removed together with their content, ``on*`` event-handler
    attributes and attribute values starting with ``javascript:``,
    ``vbscript:`` or ``data:`` are dropped, and comments are discarded.

    Args:
        html_content: Raw HTML to sanitize
        allow_all_tags: If False (default), only tags in ALLOWED_TAGS and
            attributes in ALLOWED_ATTRS are kept; other tags are removed but
            their text content is preserved. If True, every tag and attribute
            is kept apart from the dangerous ones listed above (for trusted
            content).
    Returns:
        Sanitized HTML string ("" for empty input)
    """
    if not html_content:
        return ""
    sanitizer = _HTMLSanitizer(allow_all_tags)
    sanitizer.feed(html_content)
    # close() flushes any buffered trailing text through the handlers.
    sanitizer.close()
    return sanitizer.result()
def escape_html(text: str) -> str:
    """
    Replace HTML-significant characters with their entity equivalents.

    Both double and single quotes are escaped, so the result is also safe
    inside quoted attribute values.

    Args:
        text: Plain text to escape
    Returns:
        HTML-escaped text, or "" when *text* is empty or otherwise falsy
    """
    return html.escape(text, quote=True) if text else ""
def unescape_html(text: str) -> str:
    """
    Turn HTML character references back into the characters they stand for.

    Args:
        text: HTML-escaped text
    Returns:
        Unescaped text, or "" when *text* is empty or otherwise falsy
    """
    if text:
        return html.unescape(text)
    return ""
| # ============================================================================= | |
| # STRING SANITIZATION | |
| # ============================================================================= | |
def sanitize_filename(filename: str, max_length: int = 100) -> str:
    """
    Sanitize a string for use as a filename.

    Characters that are reserved on common filesystems (``< > : " / \\ | ? *``
    and ASCII control characters) are replaced with underscores, runs of
    underscores are collapsed, and leading/trailing dots and spaces are
    removed.

    Args:
        filename: Raw filename
        max_length: Maximum filename length; values below zero are treated
            as zero
    Returns:
        Safe filename, or "untitled" if nothing usable remains
    """
    if not filename:
        return "untitled"

    # Replace dangerous characters, then tidy up the result.
    safe = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", filename)
    safe = safe.strip(". ")
    safe = re.sub(r"_+", "_", safe)

    # Clamp so a negative limit cannot slice from the wrong end, and strip
    # again because cutting the string can expose a new trailing dot/space.
    safe = safe[:max(max_length, 0)].rstrip(". ")

    return safe or "untitled"
# Translation table for sanitize_for_json(): every ASCII control character is
# deleted by default, then the entries below override that for the characters
# that have a JSON escape sequence.
_JSON_TRANSLATION = {codepoint: None for codepoint in (*range(0x20), 0x7F)}
_JSON_TRANSLATION.update({
    ord("\\"): "\\\\",
    ord('"'): '\\"',
    ord("\n"): "\\n",
    ord("\r"): "\\r",
    ord("\t"): "\\t",
})


def sanitize_for_json(text: str) -> str:
    """
    Sanitize text for safe embedding inside a JSON string literal.

    Backslashes, double quotes, newlines, carriage returns and tabs are
    replaced by their JSON escape sequences; all other ASCII control
    characters (0x00-0x1F and 0x7F) are removed. The surrounding quotes are
    not added.

    Args:
        text: Raw text
    Returns:
        JSON-safe text, or "" when *text* is empty or otherwise falsy
    """
    if not text:
        return ""
    # A single translate() pass avoids the ordering trap of chained
    # replacements, where stripping control characters first would delete
    # "\n", "\r" and "\t" before they could be escaped.
    return text.translate(_JSON_TRANSLATION)
def truncate_text(text: str, max_length: int = 1000, suffix: str = "...") -> str:
    """
    Truncate text to maximum length.

    When the text has to be shortened, the suffix counts towards
    ``max_length``. If ``max_length`` is too small to hold the suffix, the
    text is cut hard at ``max_length`` characters without a suffix so the
    result never exceeds the limit.

    Args:
        text: Text to truncate
        max_length: Maximum length of the returned string
        suffix: Suffix to add when truncated
    Returns:
        Truncated text, or "" when *text* is empty or otherwise falsy
    """
    if not text:
        return ""
    if len(text) <= max_length:
        return text

    keep = max_length - len(suffix)
    if keep < 0:
        # No room for the suffix; a negative slice bound here would count
        # from the end of the text and return far more than max_length.
        return text[:max(max_length, 0)]
    return text[:keep] + suffix
| # ============================================================================= | |
| # URL SANITIZATION | |
| # ============================================================================= | |
# ASCII control characters (tab, newline, NUL, ...) are removed before any
# scheme check so that values like "java\tscript:" cannot slip past it.
_URL_CONTROL_CHARS = re.compile(r"[\x00-\x1f\x7f]")

# Schemes that must never be returned to callers.
_BLOCKED_URL_SCHEMES = ("javascript:", "vbscript:", "data:", "file:")


def sanitize_url(url: str) -> Optional[str]:
    """
    Sanitize a URL for safe use.

    Control characters and surrounding whitespace are removed, URLs with a
    blocked scheme (javascript:, vbscript:, data:, file:) are rejected, and
    anything that does not already start with http:// or https://
    (case-insensitively) is given an https scheme.

    Args:
        url: Raw URL
    Returns:
        Sanitized URL or None if invalid
    """
    if not url:
        return None

    url = _URL_CONTROL_CHARS.sub("", url).strip()
    if not url:
        return None

    lower_url = url.lower()
    if lower_url.startswith(_BLOCKED_URL_SCHEMES):
        logger.warning("Blocked dangerous URL scheme: %s", url[:50])
        return None

    # Schemes are case-insensitive, so "HTTP://host" is already valid.
    if lower_url.startswith(("http://", "https://")):
        return url

    # Protocol-relative URL: only the scheme is missing.
    if url.startswith("//"):
        return "https:" + url

    return "https://" + url
def sanitize_href(href: str) -> str:
    """
    Produce a value that is safe to place in an href attribute.

    Delegates to sanitize_url() and substitutes a harmless placeholder when
    that function rejects the input.

    Args:
        href: Raw href value
    Returns:
        Sanitized href or '#' if invalid
    """
    return sanitize_url(href) or "#"
| # ============================================================================= | |
| # API RESPONSE SANITIZATION | |
| # ============================================================================= | |
def _sanitize_api_value(value: Any) -> Any:
    """Escape strings found anywhere inside *value*, recursing into dicts and lists."""
    if isinstance(value, str):
        return escape_html(value)
    if isinstance(value, dict):
        return sanitize_api_response(value)
    if isinstance(value, list):
        # Recurse so that lists nested inside lists are sanitized as well.
        return [_sanitize_api_value(item) for item in value]
    return value


def sanitize_api_response(data: dict[str, Any]) -> dict[str, Any]:
    """
    Sanitize an API response for safe rendering.

    Every string value is HTML-escaped, at any nesting depth of dicts and
    lists. Keys and non-string scalar values are passed through unchanged.
    The input is not modified; a new structure is returned.

    Args:
        data: Raw API response dict
    Returns:
        Sanitized response, or {} if *data* is not a dict
    """
    if not isinstance(data, dict):
        return {}
    return {key: _sanitize_api_value(value) for key, value in data.items()}
# HTML comments, including multi-line ones.
_MD_HTML_COMMENT = re.compile(r"<!--.*?-->", re.DOTALL)

# <script>/<style> elements with their content. The closing tag may carry
# whitespace or junk before ">" ("</script >"), which browsers still accept.
_MD_SCRIPT_BLOCK = re.compile(
    r"<script[^>]*>.*?</script[^>]*>", re.DOTALL | re.IGNORECASE
)
_MD_STYLE_BLOCK = re.compile(
    r"<style[^>]*>.*?</style[^>]*>", re.DOTALL | re.IGNORECASE
)

# Markdown links whose target uses a script-capable scheme. Leading
# whitespace is tolerated, and one level of parentheses inside the target is
# consumed so that "[x](javascript:alert(1))" leaves no stray ")".
_MD_DANGEROUS_LINK = re.compile(
    r"\[([^\]]*)\]\(\s*(?:javascript|vbscript|data)\s*:(?:[^()]|\([^()]*\))*\)",
    re.IGNORECASE,
)


def sanitize_markdown(markdown: str) -> str:
    """
    Sanitize markdown content.

    Removes potentially dangerous content while preserving formatting:
    HTML comments, ``<script>`` and ``<style>`` elements are deleted, and
    links pointing at ``javascript:``, ``vbscript:`` or ``data:`` targets are
    reduced to their link text.

    Args:
        markdown: Raw markdown
    Returns:
        Sanitized markdown, or "" when *markdown* is empty or otherwise falsy
    """
    if not markdown:
        return ""

    markdown = _MD_HTML_COMMENT.sub("", markdown)
    markdown = _MD_SCRIPT_BLOCK.sub("", markdown)
    markdown = _MD_STYLE_BLOCK.sub("", markdown)
    # Keep the visible link text, drop the dangerous target.
    markdown = _MD_DANGEROUS_LINK.sub(r"\1", markdown)
    return markdown