Spaces:

T0X1N
/

Medium-MCP

Running

Medium-MCP / src /sanitization.py

Nikhil Pravin Pise

feat: implement comprehensive improvement plan (Phases 1-5)

e98cc10 3 months ago

8.04 kB

	"""
	Output Sanitization Module

	Sanitizes HTML content, user-generated strings, and API responses
	to prevent XSS and other injection attacks.
	"""

	from __future__ import annotations

	import html
	import re
	import logging
	from typing import Any, Optional


	logger = logging.getLogger(__name__)


	# =============================================================================
	# HTML SANITIZATION
	# =============================================================================

	# Tags permitted in rich-text output. One row per group: text blocks and
	# separators, headings, inline emphasis, lists, quotes/code, links/media,
	# tables, figures, and generic containers.
	ALLOWED_TAGS = frozenset(
	    (
	        "p br hr "
	        "h1 h2 h3 h4 h5 h6 "
	        "strong b em i u s strike "
	        "ul ol li "
	        "blockquote pre code "
	        "a img "
	        "table thead tbody tr th td "
	        "figure figcaption "
	        "div span"
	    ).split()
	)

	# Per-tag attribute allowlist: maps a tag name to the set of attribute
	# names permitted on that tag. Tags without an entry have no listed
	# attributes.
	ALLOWED_ATTRS = dict(
	    a={"href", "title", "target", "rel"},
	    img={"src", "alt", "title", "width", "height"},
	    code={"class"},
	    pre={"class"},
	    div={"class", "id"},
	    span={"class"},
	    table={"class"},
	    td={"colspan", "rowspan"},
	    th={"colspan", "rowspan"},
	)

	# Substrings stripped from HTML wherever they occur: script-capable URL
	# schemes and the ``on<event>=`` prefix of inline event-handler attributes.
	DANGEROUS_ATTR_PATTERNS = (
	    re.compile(r"javascript:", re.IGNORECASE),
	    re.compile(r"vbscript:", re.IGNORECASE),
	    re.compile(r"data:", re.IGNORECASE),
	    re.compile(r"on\w+\s*=", re.IGNORECASE),  # onclick, onload, etc.
	)

	# Complete <script>/<style> elements, including everything between the tags.
	_SCRIPT_STYLE_BLOCK_PATTERNS = (
	    re.compile(r"<script\b[^>]*>.*?</script\s*>", re.IGNORECASE | re.DOTALL),
	    re.compile(r"<style\b[^>]*>.*?</style\s*>", re.IGNORECASE | re.DOTALL),
	)

	# Unpaired <script>/<style> opening or closing tags left after block removal.
	_STRAY_SCRIPT_STYLE_TAG = re.compile(
	    r"</?(?:script|style)\b[^>]*>", re.IGNORECASE
	)

	# A whole inline event-handler attribute: its name plus a double-quoted,
	# single-quoted, or unquoted value.
	_EVENT_HANDLER_ATTR = re.compile(
	    r"""\s+on\w+\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)""",
	    re.IGNORECASE,
	)


	def _strip_dangerous_html_once(content: str) -> str:
	    """Run one removal pass over ``content`` and return the result."""
	    for block_pattern in _SCRIPT_STYLE_BLOCK_PATTERNS:
	        content = block_pattern.sub("", content)
	    content = _STRAY_SCRIPT_STYLE_TAG.sub("", content)

	    # Drop complete handler attributes *before* the generic patterns run;
	    # otherwise ``on\w+\s*=`` strips only the name and leaves the handler
	    # value behind inside the tag.
	    content = _EVENT_HANDLER_ATTR.sub("", content)

	    for pattern in DANGEROUS_ATTR_PATTERNS:
	        content = pattern.sub("", content)
	    return content


	def sanitize_html(html_content: str, allow_all_tags: bool = False) -> str:
	    """
	    Sanitize HTML content to prevent XSS.

	    Removes ``<script>``/``<style>`` elements together with their content,
	    any unpaired ``<script>``/``<style>`` tags, inline ``on*`` event-handler
	    attributes, and every match of ``DANGEROUS_ATTR_PATTERNS``. The patterns
	    are applied to the whole string, so matching text outside of tags is
	    removed as well.

	    This is a regex-based filter, not an HTML parser; it does not filter
	    tags against an allowlist.

	    Args:
	        html_content: Raw HTML to sanitize
	        allow_all_tags: Accepted for backward compatibility. It currently
	            has no effect: tags are never filtered, only the dangerous
	            constructs listed above are removed.

	    Returns:
	        Sanitized HTML string ("" when the input is empty)
	    """
	    if not html_content:
	        return ""

	    # Repeat until nothing changes: a single pass can be defeated by nesting
	    # (removing the inner match of "javajavascript:script:" re-creates
	    # "javascript:"). Every pass only deletes text, so this terminates.
	    previous = None
	    while previous != html_content:
	        previous = html_content
	        html_content = _strip_dangerous_html_once(html_content)

	    return html_content


	def escape_html(text: str) -> str:
	    """
	    Replace HTML-significant characters in ``text`` with entities.

	    Quotes are escaped too, so the result can be placed inside an
	    attribute value.

	    Args:
	        text: Plain text to escape

	    Returns:
	        HTML-escaped text, or "" when ``text`` is empty or falsy
	    """
	    return html.escape(text, quote=True) if text else ""


	def unescape_html(text: str) -> str:
	    """
	    Turn HTML entities in ``text`` back into the characters they stand for.

	    Args:
	        text: HTML-escaped text

	    Returns:
	        Unescaped text, or "" when ``text`` is empty or falsy
	    """
	    return html.unescape(text) if text else ""


	# =============================================================================
	# STRING SANITIZATION
	# =============================================================================


	def sanitize_filename(filename: str, max_length: int = 100) -> str:
	    """
	    Sanitize a string for use as a filename.

	    Characters that are unsafe in filenames (``< > : " / \\ | ? *`` and
	    ASCII control characters) become underscores, runs of underscores are
	    collapsed, and leading/trailing dots and spaces are removed. The result
	    is then cut to ``max_length`` characters.

	    Args:
	        filename: Raw filename
	        max_length: Maximum filename length; negative values are treated
	            as 0

	    Returns:
	        Safe filename, or "untitled" when nothing usable remains
	    """
	    if not filename:
	        return "untitled"

	    # Replace dangerous characters with underscores
	    safe = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", filename)

	    # Remove leading/trailing spaces and dots
	    safe = safe.strip(". ")

	    # Collapse multiple underscores
	    safe = re.sub(r"_+", "_", safe)

	    # Truncate. A negative limit would slice from the end of the string, so
	    # clamp it; the cut itself can expose a trailing dot or space, so strip
	    # the right-hand side again afterwards.
	    limit = max(max_length, 0)
	    if len(safe) > limit:
	        safe = safe[:limit].rstrip(". ")

	    # Ensure not empty
	    return safe or "untitled"


	def sanitize_for_json(text: str) -> str:
	    """
	    Sanitize text for safe embedding inside a JSON string literal.

	    Backslashes, double quotes, newlines, carriage returns and tabs are
	    replaced by their JSON escape sequences. Any other ASCII control
	    character (and DEL) is removed.

	    Args:
	        text: Raw text

	    Returns:
	        JSON-safe text (without surrounding quotes), or "" for empty input
	    """
	    if not text:
	        return ""

	    # Backslash must be escaped first so the backslashes introduced by the
	    # later replacements are not doubled.
	    text = text.replace("\\", "\\\\")
	    text = text.replace('"', '\\"')

	    # Escape the whitespace controls *before* stripping control characters;
	    # stripping first would delete them and make these replacements no-ops.
	    text = text.replace("\n", "\\n")
	    text = text.replace("\r", "\\r")
	    text = text.replace("\t", "\\t")

	    # Drop the remaining control characters
	    return re.sub(r"[\x00-\x1f\x7f]", "", text)


	def truncate_text(text: str, max_length: int = 1000, suffix: str = "...") -> str:
	    """
	    Truncate text to maximum length.

	    Text that already fits is returned unchanged. Longer text is cut so
	    that the cut text plus ``suffix`` is exactly ``max_length`` characters.
	    When ``suffix`` is longer than ``max_length`` the text is hard-cut to
	    ``max_length`` characters without a suffix.

	    Args:
	        text: Text to truncate
	        max_length: Maximum length of the returned string
	        suffix: Suffix to add when truncated

	    Returns:
	        Truncated text; "" for empty input or a non-positive ``max_length``
	    """
	    if not text:
	        return ""

	    if len(text) <= max_length:
	        return text

	    if max_length <= 0:
	        return ""

	    keep = max_length - len(suffix)
	    if keep < 0:
	        # A negative slice bound would count from the end of the text and
	        # produce a result longer than max_length; cut hard instead.
	        return text[:max_length]

	    return text[:keep] + suffix


	# =============================================================================
	# URL SANITIZATION
	# =============================================================================


	def sanitize_url(url: str) -> Optional[str]:
	    """
	    Sanitize a URL for safe use.

	    URLs using the ``javascript:``, ``vbscript:``, ``data:`` or ``file:``
	    scheme are rejected. Anything that does not already start with
	    ``http://`` or ``https://`` (compared case-insensitively) gets
	    ``https://`` prepended.

	    Args:
	        url: Raw URL

	    Returns:
	        Sanitized URL or None if invalid
	    """
	    if not url:
	        return None

	    url = url.strip()
	    if not url:
	        # Whitespace-only input would otherwise turn into a bare "https://".
	        return None

	    # Scheme checks run on a normalized copy: URL parsers ignore leading
	    # control characters and embedded tab/newline, so "java\tscript:" must
	    # be treated the same as "javascript:".
	    probe = re.sub(r"^[\x00-\x20]+|[\t\n\r]", "", url).lower()

	    # Check for dangerous schemes
	    if probe.startswith(("javascript:", "vbscript:", "data:", "file:")):
	        logger.warning("Blocked dangerous URL scheme: %s", url[:50])
	        return None

	    # Ensure valid scheme (case-insensitive, so "HTTP://x" is left alone)
	    if not probe.startswith(("http://", "https://")):
	        # Try adding https
	        url = "https://" + url

	    return url


	def sanitize_href(href: str) -> str:
	    """
	    Produce a value that is safe to place in an ``href`` attribute.

	    Delegates to ``sanitize_url`` and substitutes a placeholder when that
	    returns nothing usable.

	    Args:
	        href: Raw href value

	    Returns:
	        Sanitized href or '#' if invalid
	    """
	    return sanitize_url(href) or "#"


	# =============================================================================
	# API RESPONSE SANITIZATION
	# =============================================================================


	def sanitize_api_response(data: dict[str, Any]) -> dict[str, Any]:
	    """
	    Sanitize an API response for safe rendering.

	    Every string value is HTML-escaped, at any nesting depth of dicts and
	    lists. Keys and non-string scalar values are copied unchanged. The
	    input is not modified; new containers are returned.

	    Args:
	        data: Raw API response dict

	    Returns:
	        Sanitized response, or {} when ``data`` is not a dict
	    """
	    if not isinstance(data, dict):
	        return {}

	    return {key: _sanitize_api_value(value) for key, value in data.items()}


	def _sanitize_api_value(value: Any) -> Any:
	    """Escape one response value, recursing into nested dicts and lists."""
	    if isinstance(value, str):
	        # Escape potential HTML in string values
	        return escape_html(value)
	    if isinstance(value, dict):
	        return sanitize_api_response(value)
	    if isinstance(value, list):
	        # Recurse per item so strings in lists-of-lists are escaped too.
	        return [_sanitize_api_value(item) for item in value]
	    return value


	def sanitize_markdown(markdown: str) -> str:
	    """
	    Sanitize markdown content.

	    Removes potentially dangerous content while preserving formatting:
	    HTML comments, ``<script>`` and ``<style>`` elements (with their
	    content), and ``javascript:`` link targets. A link with a
	    ``javascript:`` target is replaced by its link text.

	    Args:
	        markdown: Raw markdown

	    Returns:
	        Sanitized markdown, or "" for empty input
	    """
	    if not markdown:
	        return ""

	    # Remove HTML comments
	    markdown = re.sub(r"<!--.*?-->", "", markdown, flags=re.DOTALL)

	    # Remove inline scripts and styles, including everything between the
	    # opening and closing tag (DOTALL lets the body span several lines).
	    for tag in ("script", "style"):
	        markdown = re.sub(
	            rf"<{tag}\b[^>]*>.*?</{tag}\s*>",
	            "",
	            markdown,
	            flags=re.DOTALL | re.IGNORECASE,
	        )

	    # Remove javascript: links, keeping only the link text. The target may
	    # start with whitespace and may contain one level of balanced
	    # parentheses, e.g. "javascript:alert(1)".
	    markdown = re.sub(
	        r"\[([^\]]+)\]\(\s*javascript:(?:[^()]|\([^()]*\))*\)",
	        r"\1",
	        markdown,
	        flags=re.IGNORECASE,
	    )

	    return markdown