"""
Output Sanitization Module
Sanitizes HTML content, user-generated strings, and API responses
to prevent XSS and other injection attacks.
"""
from __future__ import annotations

import html
import logging
import re
from html.parser import HTMLParser
from typing import Any, Optional
logger = logging.getLogger(__name__)
# =============================================================================
# HTML SANITIZATION
# =============================================================================
# Allowed HTML tags for rich text
ALLOWED_TAGS = frozenset({
    "p", "br", "hr",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "strong", "b", "em", "i", "u", "s", "strike",
    "ul", "ol", "li",
    "blockquote", "pre", "code",
    "a", "img",
    "table", "thead", "tbody", "tr", "th", "td",
    "figure", "figcaption",
    "div", "span",
})

# Allowed attributes by tag
ALLOWED_ATTRS = {
    "a": {"href", "title", "target", "rel"},
    "img": {"src", "alt", "title", "width", "height"},
    "code": {"class"},
    "pre": {"class"},
    "div": {"class", "id"},
    "span": {"class"},
    "table": {"class"},
    "td": {"colspan", "rowspan"},
    "th": {"colspan", "rowspan"},
}

# Dangerous attribute patterns.
# NOTE: kept unchanged for any code that imports this constant.
# sanitize_html() no longer applies these to the raw document: doing so
# corrupted ordinary text ("Metadata:" lost "data:") and was bypassable
# ("oonerror=nerror=" collapsed back into "onerror=").
DANGEROUS_ATTR_PATTERNS = (
    re.compile(r"javascript:", re.IGNORECASE),
    re.compile(r"vbscript:", re.IGNORECASE),
    re.compile(r"data:", re.IGNORECASE),
    re.compile(r"on\w+\s*=", re.IGNORECASE),  # onclick, onload, etc.
)

# Elements that are removed together with everything inside them.
_DROP_CONTENT_TAGS = frozenset({"script", "style"})

# HTML void elements: they never get a closing tag in the output.
_VOID_TAGS = frozenset({
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "source", "track", "wbr",
})

# URL schemes that must never survive in an attribute value.
_BLOCKED_SCHEMES = ("javascript:", "vbscript:", "data:")

# Conservative shape for tag/attribute names emitted in allow_all_tags mode.
_SAFE_NAME = re.compile(r"[a-z_:][a-z0-9:_.-]*\Z")

# Characters browsers ignore when resolving a URL scheme ("java\tscript:").
_URL_NOISE = re.compile(r"[\x00-\x20\x7f]+")


class _SanitizingParser(HTMLParser):
    """Re-serialize parsed HTML, emitting only safe tags, attributes and text."""

    def __init__(self, allow_all_tags: bool) -> None:
        # convert_charrefs=True hands us decoded text, which is re-escaped on
        # output so entity-encoded markup can never turn into live markup.
        super().__init__(convert_charrefs=True)
        self._allow_all_tags = allow_all_tags
        self._chunks = []
        self._dropping = 0  # > 0 while inside <script> / <style>

    def output(self) -> str:
        """Return the sanitized markup collected so far."""
        return "".join(self._chunks)

    def _tag_is_safe(self, tag: str) -> bool:
        if self._allow_all_tags:
            return _SAFE_NAME.match(tag) is not None
        return tag in ALLOWED_TAGS

    def _attr_is_safe(self, tag: str, name: str, value) -> bool:
        # Event handlers and inline documents are never acceptable.
        if name.startswith("on") or name == "srcdoc":
            return False
        if self._allow_all_tags:
            if _SAFE_NAME.match(name) is None:
                return False
        elif name not in ALLOWED_ATTRS.get(tag, ()):
            return False
        if value is None:  # boolean attribute such as "hidden"
            return True
        # The parser has already decoded entities; strip the whitespace and
        # control characters browsers skip before comparing the scheme.
        compact = _URL_NOISE.sub("", value).lower()
        return not compact.startswith(_BLOCKED_SCHEMES)

    def _render_attrs(self, tag: str, attrs) -> str:
        rendered = []
        seen = set()
        for name, value in attrs:
            # Browsers honour the first occurrence of a duplicated attribute.
            if name in seen or not self._attr_is_safe(tag, name, value):
                continue
            seen.add(name)
            if value is None:
                rendered.append(f" {name}")
            else:
                rendered.append(f' {name}="{html.escape(value, quote=True)}"')
        return "".join(rendered)

    def handle_starttag(self, tag, attrs):
        if tag in _DROP_CONTENT_TAGS:
            self._dropping += 1
            return
        if self._dropping or not self._tag_is_safe(tag):
            return
        self._chunks.append(f"<{tag}{self._render_attrs(tag, attrs)}>")

    def handle_endtag(self, tag):
        if tag in _DROP_CONTENT_TAGS:
            self._dropping = max(self._dropping - 1, 0)
            return
        if self._dropping or tag in _VOID_TAGS or not self._tag_is_safe(tag):
            return
        self._chunks.append(f"</{tag}>")

    def handle_data(self, data):
        if not self._dropping:
            self._chunks.append(html.escape(data, quote=False))

    # Comments, doctypes, processing instructions and marked sections are
    # dropped: the inherited handlers for them do nothing.


def sanitize_html(html_content: str, allow_all_tags: bool = False) -> str:
    """
    Sanitize HTML content to prevent XSS.

    The input is parsed with the standard-library HTML parser and written
    back out from scratch, so only constructs that are explicitly accepted
    can reach the result:

    * ``<script>`` and ``<style>`` elements are removed with their content.
    * Tags outside ``ALLOWED_TAGS`` are removed but their text is kept.
    * Attributes outside ``ALLOWED_ATTRS`` for the tag are removed.
    * ``on*`` event handlers, ``srcdoc`` and values using the
      ``javascript:``, ``vbscript:`` or ``data:`` schemes are always removed.
    * Text and attribute values are re-escaped; comments are dropped.

    The output is normalized (double-quoted attributes, ``<br>`` instead of
    ``<br/>``, ``&`` written as ``&amp;``) rather than byte-identical to the
    input.

    Args:
        html_content: Raw HTML to sanitize
        allow_all_tags: If True, only remove dangerous attrs (for trusted
            content); ``<script>``/``<style>`` are still removed.
    Returns:
        Sanitized HTML string ("" for empty input)
    """
    if not html_content:
        return ""
    parser = _SanitizingParser(allow_all_tags)
    try:
        parser.feed(html_content)
        parser.close()
    except (AssertionError, NotImplementedError):
        # Some CPython releases raise these from the declaration scanner on
        # malformed "<![...": fail closed by escaping the whole input.
        logging.getLogger(__name__).warning(
            "HTML parser rejected input; returning fully escaped content"
        )
        return html.escape(html_content, quote=True)
    return parser.output()
def escape_html(text: str) -> str:
    """
    Convert HTML-significant characters in ``text`` to entity references.

    Double and single quotes are escaped too, so the result can be placed
    inside a quoted attribute value.

    Args:
        text: Plain text to escape; any falsy value is treated as empty
    Returns:
        HTML-escaped text, or "" for falsy input
    """
    return html.escape(text, quote=True) if text else ""
def unescape_html(text: str) -> str:
    """
    Turn HTML character references in ``text`` back into characters.

    Args:
        text: HTML-escaped text; any falsy value is treated as empty
    Returns:
        Unescaped text, or "" for falsy input
    """
    return html.unescape(text) if text else ""
# =============================================================================
# STRING SANITIZATION
# =============================================================================
# Device names Windows reserves in every directory, with or without an
# extension ("NUL.txt" is as unusable as "NUL").
_WINDOWS_RESERVED_NAMES = frozenset(
    {"CON", "PRN", "AUX", "NUL"}
    | {f"COM{n}" for n in range(1, 10)}
    | {f"LPT{n}" for n in range(1, 10)}
)


def sanitize_filename(filename: str, max_length: int = 100) -> str:
    """
    Sanitize a string for use as a filename.

    Path separators, characters Windows forbids and control characters are
    replaced by underscores; leading/trailing dots and spaces are removed;
    Windows reserved device names are prefixed with an underscore.

    Args:
        filename: Raw filename
        max_length: Maximum filename length; values below 1 leave no room
            for a name, so "untitled" is returned
    Returns:
        Safe filename, or "untitled" when nothing usable remains
    """
    if not filename:
        return "untitled"
    # Remove or replace dangerous characters
    safe = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", filename)
    # Remove leading/trailing spaces and dots
    safe = safe.strip(". ")
    # Collapse multiple underscores
    safe = re.sub(r"_+", "_", safe)
    # Windows resolves "CON", "nul.txt", ... to devices regardless of case.
    stem = safe.partition(".")[0].rstrip(" ")
    if stem.upper() in _WINDOWS_RESERVED_NAMES:
        safe = "_" + safe
    # Truncate, then strip again: the cut can expose a trailing dot or space
    # ("ab.cd" cut to 3 would otherwise become "ab.").  Clamping avoids a
    # negative slice index silently trimming from the wrong end.
    safe = safe[:max(max_length, 0)].rstrip(". ")
    # Ensure not empty
    return safe or "untitled"
# One-pass translation table: delete every control character (0x00-0x1F and
# 0x7F), then override the entries that have a JSON escape sequence.
_JSON_TRANSLATION = str.maketrans({
    **{chr(code): None for code in (*range(0x20), 0x7F)},
    "\\": "\\\\",
    '"': '\\"',
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
})


def sanitize_for_json(text: str) -> str:
    """
    Sanitize text for safe embedding inside a JSON string literal.

    Backslash, double quote, newline, carriage return and tab are replaced
    by their JSON escape sequences; every other control character is
    removed.  (Previously newline, carriage return and tab were deleted
    before they could be escaped, silently joining lines together.)

    Args:
        text: Raw text
    Returns:
        JSON-safe text without surrounding quotes, or "" for falsy input
    """
    if not text:
        return ""
    return text.translate(_JSON_TRANSLATION)
def truncate_text(text: str, max_length: int = 1000, suffix: str = "...") -> str:
    """
    Truncate text to maximum length.

    The result never exceeds ``max_length`` characters.  When the text is
    cut, ``suffix`` replaces the tail so the total still fits; if the suffix
    itself is longer than ``max_length`` the text is cut without a suffix.

    Args:
        text: Text to truncate
        max_length: Maximum length of the returned string
        suffix: Suffix to add when truncated
    Returns:
        Truncated text ("" for falsy input or a non-positive max_length)
    """
    if not text or max_length <= 0:
        return ""
    if len(text) <= max_length:
        return text
    if len(suffix) > max_length:
        # No room for the suffix: a negative slice index here used to yield
        # a result longer than max_length.
        return text[:max_length]
    return text[:max_length - len(suffix)] + suffix
# =============================================================================
# URL SANITIZATION
# =============================================================================
# Schemes that can execute script, smuggle markup or read local files.
_BLOCKED_URL_SCHEMES = ("javascript:", "vbscript:", "data:", "file:")

# C0 control characters and space, which browsers trim from both ends of a URL.
_URL_EDGE_CHARS = "".join(chr(code) for code in range(0x21))


def sanitize_url(url: str) -> Optional[str]:
    """
    Sanitize a URL for safe use.

    The URL is normalized the way browsers read it (tabs and newlines
    removed, surrounding whitespace/control characters trimmed) before the
    scheme is inspected, so "java\\tscript:..." is recognized as
    "javascript:".  URLs without an http/https scheme get "https://"
    prepended; protocol-relative "//host/path" becomes "https://host/path".

    Args:
        url: Raw URL
    Returns:
        Sanitized URL, or None if the URL is empty or uses a blocked scheme
    """
    if not url:
        return None
    url = re.sub(r"[\t\n\r]", "", url).strip().strip(_URL_EDGE_CHARS)
    if not url:
        # Whitespace-only input used to come back as the bare string "https://".
        return None
    # Check for dangerous schemes
    lower_url = url.lower()
    if lower_url.startswith(_BLOCKED_URL_SCHEMES):
        # %r keeps attacker-controlled bytes from forging extra log lines.
        logger.warning("Blocked dangerous URL scheme: %r", url[:50])
        return None
    # Ensure valid scheme (schemes are case-insensitive: "HTTP://" is fine)
    if lower_url.startswith(("http://", "https://")):
        return url
    if url.startswith("//"):
        return "https:" + url
    # Try adding https
    return "https://" + url
def sanitize_href(href: str) -> str:
    """
    Produce a value that is safe to place in an ``href`` attribute.

    Delegates to ``sanitize_url`` and substitutes a placeholder when that
    function returns nothing usable.

    Args:
        href: Raw href value
    Returns:
        Sanitized href or '#' if invalid
    """
    return sanitize_url(href) or "#"
# =============================================================================
# API RESPONSE SANITIZATION
# =============================================================================
def _sanitize_api_value(value: Any) -> Any:
    """Return ``value`` with every string nested inside it HTML-escaped."""
    if isinstance(value, str):
        return escape_html(value)
    if isinstance(value, dict):
        return sanitize_api_response(value)
    if isinstance(value, list):
        # Recurse so lists nested inside lists are sanitized as well;
        # previously only the first list level was inspected.
        return [_sanitize_api_value(item) for item in value]
    return value


def sanitize_api_response(data: dict[str, Any]) -> dict[str, Any]:
    """
    Sanitize an API response for safe rendering.

    String values are HTML-escaped at any depth of nested dicts and lists.
    Keys and non-string scalars are left untouched.  The input is not
    modified; new dicts and lists are built for the result.

    Args:
        data: Raw API response dict
    Returns:
        Sanitized response, or {} if ``data`` is not a dict
    """
    if not isinstance(data, dict):
        return {}
    return {key: _sanitize_api_value(value) for key, value in data.items()}
_MD_HTML_COMMENT = re.compile(r"<!--.*?-->", re.DOTALL)
_MD_SCRIPT_BLOCK = re.compile(
    r"<script\b[^>]*>.*?</script\b[^>]*>", re.DOTALL | re.IGNORECASE
)
_MD_STYLE_BLOCK = re.compile(
    r"<style\b[^>]*>.*?</style\b[^>]*>", re.DOTALL | re.IGNORECASE
)
# [text](javascript:...) / [text](vbscript:...), tolerating leading
# whitespace, an opening "<", and one level of nested parentheses so that
# "alert(1)" does not leave a stray ")" behind.  Falls back to "up to the
# first ')'" when the parentheses are unbalanced.
_MD_SCRIPT_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*<?\s*(?:javascript|vbscript):"
    r"(?:(?:[^()]|\([^()]*\))*|[^)]*)\)",
    re.IGNORECASE,
)


def sanitize_markdown(markdown: str) -> str:
    """
    Sanitize markdown content.

    Removes HTML comments, ``<script>`` and ``<style>`` blocks, and replaces
    links whose target is a ``javascript:`` or ``vbscript:`` URL with their
    link text.  All other markdown is preserved.  Removal is repeated until
    nothing changes, so fragments cannot be spliced together into a new
    ``<script>`` block by a single pass.

    This is a best-effort pre-filter: HTML rendered from the markdown should
    still be passed through an HTML sanitizer.

    Args:
        markdown: Raw markdown
    Returns:
        Sanitized markdown ("" for falsy input)
    """
    if not markdown:
        return ""
    previous = None
    # Every substitution shortens the text, so this loop always terminates.
    while previous != markdown:
        previous = markdown
        # Remove HTML comments
        markdown = _MD_HTML_COMMENT.sub("", markdown)
        # Remove inline scripts
        markdown = _MD_SCRIPT_BLOCK.sub("", markdown)
        # Remove inline styles that could be malicious
        markdown = _MD_STYLE_BLOCK.sub("", markdown)
        # Remove javascript: links, keeping the link text
        markdown = _MD_SCRIPT_LINK.sub(r"\1", markdown)
    return markdown