"""
HTML Sanitizer Module
Provides XSS-safe HTML sanitization for the annotation platform. This module
allows legitimate span annotation HTML while blocking potentially dangerous
elements and attributes.
The sanitizer uses an allowlist approach - only explicitly permitted elements
and attributes are kept, everything else is escaped or removed.
Usage:
from potato.server_utils.html_sanitizer import sanitize_html
# In Jinja2 template:
{{ instance | sanitize_html }}
"""
import re
import html
import logging
from typing import Set, Dict, List, Tuple
from markupsafe import Markup
logger = logging.getLogger(__name__)
# Tag-name allowlist for sanitized HTML. Per the module docstring, anything
# not named here is escaped or removed rather than passed through.
ALLOWED_ELEMENTS: Set[str] = {
    # -- Annotation wrapper ------------------------------------------------
    "span",
    # -- Basic inline formatting (can already be present in source data) ---
    "b",
    "i",
    "u",
    "strong",
    "em",
    "mark",
    "s",
    "del",
    "ins",
    # -- Breaks and rules --------------------------------------------------
    "br",
    "hr",
    "wbr",
    # -- Dialogue / conversation layout ------------------------------------
    "div",
    # -- Structure for instructional content (Issue #120) ------------------
    "p",
    "ul",
    "ol",
    "li",
    "dl",
    "dt",
    "dd",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    # -- Tables ------------------------------------------------------------
    "table",
    "thead",
    "tbody",
    "tr",
    "th",
    "td",
    "caption",
    # -- Inline semantics --------------------------------------------------
    "sub",
    "sup",
    "small",
    "code",
    "pre",
    "blockquote",
    "abbr",
    "cite",
    "kbd",
    "samp",
    "var",
    # -- Collapsible sections ----------------------------------------------
    "details",
    "summary",
    # -- Hyperlinks (href is expected to be screened for dangerous patterns
    #    by the attribute sanitizer, which is outside this block) ----------
    "a",
    # -- Images and figures for instructional/survey content (Issue #129) --
    "img",
    "figure",
    "figcaption",
    # -- Audio/video: Potato is a multimedia tool, so media tags are kept.
    #    Their attribute allowlist contains no event handlers; src is
    #    expected to be screened like href. --------------------------------
    "video",
    "audio",
    "source",
    "track",
    # -- Ruby annotations for CJK text -------------------------------------
    "ruby",
    "rt",
    "rp",
}
# Per-element attribute allowlist, keyed by lower-case tag name.
# The '*' key holds an empty set; presumably it is the fallback for tags with
# no entry of their own (the lookup code is outside this block — confirm).
ALLOWED_ATTRIBUTES: Dict[str, Set[str]] = {
    # Span annotations: styling hooks plus annotation metadata.
    "span": {
        "class",
        "style",
        "data-annotation-id",
        "data-label",
        "schema",
        "title",
    },
    # Dialogue layout containers with speaker metadata.
    "div": {
        "class",
        "style",
        "data-speaker",
        "data-speaker-index",
    },
    "mark": {"class", "style"},
    # Hyperlinks. href is expected to be screened for dangerous patterns in
    # _sanitize_attributes (not visible in this block).
    "a": {"href", "title", "target", "rel"},
    # Table cells and the table itself.
    "td": {"colspan", "rowspan", "style"},
    "th": {"colspan", "rowspan", "style", "scope"},
    "table": {"class", "style"},
    # Ordered-list numbering controls.
    "ol": {"start", "type"},
    # Definition lists.
    "dl": {"class", "style"},
    "dt": {"class", "style"},
    "dd": {"class", "style"},
    # Block-level text containers that may need class/style hooks.
    "blockquote": {"class", "style"},
    "p": {"class", "style"},
    "pre": {"class", "style"},
    "code": {"class"},
    # Collapsible sections; 'open' controls the initial expanded state.
    "details": {"class", "style", "open"},
    "summary": {"class", "style"},
    # Abbreviation tooltip text.
    "abbr": {"title"},
    # Edit metadata on deletions/insertions.
    # NOTE(review): 'cite' carries a URL — confirm whether it is screened.
    "del": {"datetime", "cite"},
    "ins": {"datetime", "cite"},
    # Table captions.
    "caption": {"class", "style"},
    # Images. src is expected to be screened the same way as href.
    "img": {"src", "alt", "title", "style", "width", "height"},
    # Audio/video. src is expected to be screened; the boolean attributes
    # (controls/autoplay/loop/muted) carry no script risk.
    # NOTE(review): 'poster' (video) and 'srcset' (source) also carry URLs —
    # confirm they receive the same screening as src.
    "video": {
        "src",
        "controls",
        "width",
        "height",
        "style",
        "poster",
        "preload",
        "autoplay",
        "loop",
        "muted",
        "playsinline",
    },
    "audio": {"src", "controls", "preload", "autoplay", "loop", "muted"},
    "source": {"src", "type", "srcset", "media"},
    "track": {"src", "kind", "srclang", "label", "default"},
    # Figures and their captions for instructional content.
    "figure": {"class", "style"},
    "figcaption": {"class", "style"},
    # Default: no attributes at all.
    "*": set(),
}
# CSS property names that are permitted inside a style attribute.
ALLOWED_CSS_PROPERTIES: Set[str] = {
    # -- Colour ------------------------------------------------------------
    "background-color",
    "color",
    # -- Typography --------------------------------------------------------
    "font-weight",
    "font-style",
    "font-family",
    "font-size",
    "text-decoration",
    "text-align",
    "line-height",
    # -- Layout for dialogue / pairwise display ----------------------------
    "display",
    "width",
    "max-width",
    "padding",
    "padding-top",
    "padding-bottom",
    "padding-left",
    "padding-right",
    "margin",
    "margin-top",
    "margin-bottom",
    "margin-left",
    "margin-right",
    "box-sizing",
    "vertical-align",
    "gap",
    # -- Lists -------------------------------------------------------------
    "list-style-type",
    # -- Borders -----------------------------------------------------------
    "border",
    "border-radius",
    "border-collapse",
}
# Valueless boolean attributes — emitted as a bare name when present without a
# value (e.g.