""" HTML Sanitizer Module Provides XSS-safe HTML sanitization for the annotation platform. This module allows legitimate span annotation HTML while blocking potentially dangerous elements and attributes. The sanitizer uses an allowlist approach - only explicitly permitted elements and attributes are kept, everything else is escaped or removed. Usage: from potato.server_utils.html_sanitizer import sanitize_html # In Jinja2 template: {{ instance | sanitize_html }} """ import re import html import logging from typing import Set, Dict, List, Tuple from markupsafe import Markup logger = logging.getLogger(__name__) # Elements allowed in sanitized HTML ALLOWED_ELEMENTS: Set[str] = { # Span annotations 'span', # Basic formatting (may be in source data) 'b', 'i', 'u', 'strong', 'em', 'mark', 's', 'del', 'ins', # Line breaks and horizontal rules 'br', 'hr', 'wbr', # Dialogue/conversation layout elements 'div', # Structural elements for instructional content (Issue #120) 'p', 'ul', 'ol', 'li', 'dl', 'dt', 'dd', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', # Tables for formatted content 'table', 'thead', 'tbody', 'tr', 'th', 'td', 'caption', # Inline semantics 'sub', 'sup', 'small', 'code', 'pre', 'blockquote', 'abbr', 'cite', 'kbd', 'samp', 'var', # Collapsible sections 'details', 'summary', # Links (href sanitized against dangerous patterns) 'a', # Media and figures for instructional/survey content (Issue #129) 'img', 'figure', 'figcaption', # Audio/video media — src checked against dangerous patterns; only safe # attributes allowed (no event handlers). Potato is a multimedia tool. 'video', 'audio', 'source', 'track', # Ruby annotations for CJK text 'ruby', 'rt', 'rp', } # Attributes allowed per element ALLOWED_ATTRIBUTES: Dict[str, Set[str]] = { 'span': { 'class', 'style', 'data-annotation-id', 'data-label', 'schema', 'title', }, 'div': { 'class', 'style', 'data-speaker', 'data-speaker-index', }, 'mark': {'class', 'style'}, # Links — href checked against dangerous patterns in _sanitize_attributes 'a': {'href', 'title', 'target', 'rel'}, # Table elements 'td': {'colspan', 'rowspan', 'style'}, 'th': {'colspan', 'rowspan', 'style', 'scope'}, 'table': {'class', 'style'}, # Ordered lists 'ol': {'start', 'type'}, # Definition lists 'dl': {'class', 'style'}, 'dt': {'class', 'style'}, 'dd': {'class', 'style'}, # Block-level elements that may need class/style 'blockquote': {'class', 'style'}, 'p': {'class', 'style'}, 'pre': {'class', 'style'}, 'code': {'class'}, # Collapsible sections 'details': {'class', 'style', 'open'}, 'summary': {'class', 'style'}, # Abbreviations with tooltip 'abbr': {'title'}, # Deletion/insertion with optional metadata 'del': {'datetime', 'cite'}, 'ins': {'datetime', 'cite'}, # Table captions 'caption': {'class', 'style'}, # Images — src checked against dangerous patterns like href 'img': {'src', 'alt', 'title', 'style', 'width', 'height'}, # Audio/video media — src checked against dangerous patterns; boolean # attributes (controls/autoplay/loop/muted) carry no script risk. 'video': {'src', 'controls', 'width', 'height', 'style', 'poster', 'preload', 'autoplay', 'loop', 'muted', 'playsinline'}, 'audio': {'src', 'controls', 'preload', 'autoplay', 'loop', 'muted'}, 'source': {'src', 'type', 'srcset', 'media'}, 'track': {'src', 'kind', 'srclang', 'label', 'default'}, # Figures and captions for instructional content 'figure': {'class', 'style'}, 'figcaption': {'class', 'style'}, # Most elements get no attributes '*': set(), } # Allowed CSS properties in style attributes ALLOWED_CSS_PROPERTIES: Set[str] = { 'background-color', 'color', 'font-weight', 'font-style', 'font-family', 'font-size', 'text-decoration', 'text-align', 'line-height', # Layout properties for dialogue/pairwise display 'display', 'width', 'max-width', 'padding', 'padding-top', 'padding-bottom', 'padding-left', 'padding-right', 'margin', 'margin-top', 'margin-bottom', 'margin-left', 'margin-right', 'box-sizing', 'vertical-align', 'gap', # List styling 'list-style-type', # Borders 'border', 'border-radius', 'border-collapse', } # Valueless boolean attributes — emitted as a bare name when present without a # value (e.g.