"""HTML diff highlighting utilities."""

import difflib
import html

# Visible markers for whitespace differences so they don't become invisible
# inside colored diff spans.
#   ␣ (U+2423 OPEN BOX) for regular and non-breaking space
#   ↵ (U+21B5 DOWNWARDS ARROW WITH CORNER LEFTWARDS) for newline
#   → (U+2192 RIGHTWARDS ARROW) for tab
_WHITESPACE_MARKERS = {
    " ": "␣",
    "\t": "→",
    "\u00a0": "␣",  # non-breaking space
}
_NEWLINE_MARKER = "↵"

# Inline CSS applied to deleted / inserted segments.
_DEL_STYLE = "background:#ffecec;text-decoration:line-through;"
_INS_STYLE = "background:#e6ffec;"


def _visualize_whitespace(escaped: str) -> str:
    """Replace whitespace characters with visible markers.

    Operates on already HTML-escaped text (escaping never alters whitespace,
    so the markers line up with the raw characters).

    A newline is rendered as the ↵ marker followed by the real newline: the
    marker shows *where* the newline was inserted or removed, and the real
    newline is kept so the caller's final newline -> <br> pass still produces
    the line break.

    Args:
        escaped: HTML-escaped text of a diff segment.

    Returns:
        The text with space, tab, NBSP and newline made visible; every other
        character is passed through unchanged.
    """
    return "".join(
        _NEWLINE_MARKER + "\n" if ch == "\n" else _WHITESPACE_MARKERS.get(ch, ch)
        for ch in escaped
    )


def _render_segment(escaped: str) -> str:
    """Render a diff segment, marking whitespace only if it is all whitespace.

    Rationale: a pure-whitespace diff (e.g. a single inserted space) is
    otherwise invisible, so we show ␣/↵/→. When the segment already contains
    visible text, the colored background is enough; marking the incidental
    spaces would just add noise to word-level edits.

    Args:
        escaped: HTML-escaped text of a diff segment.

    Returns:
        The segment with whitespace markers if it is non-empty and consists
        solely of whitespace, otherwise the segment unchanged.
    """
    if escaped and not escaped.strip():
        return _visualize_whitespace(escaped)
    return escaped


def _styled_span(style: str, raw: str) -> str:
    """Escape *raw* and wrap it in a <span> carrying the inline *style*."""
    # NOTE(review): a plain <span> is used because the module's comments talk
    # about "diff spans"; confirm the tag against whatever consumes this HTML.
    return f'<span style="{style}">{_render_segment(html.escape(raw))}</span>'


def highlight_diff(original: str, corrected: str) -> str:
    """Generate HTML highlighting differences between two texts.

    Uses a character-level difflib.SequenceMatcher to produce inline HTML
    with strikethrough for deletions and green highlights for insertions.
    Whitespace-only changes are rendered with visible markers (␣ for space,
    → for tab, ↵ for newline) so spacing-only edits are perceivable.

    Args:
        original: Original text.
        corrected: Corrected text.

    Returns:
        HTML string with diff highlights. All text taken from the inputs is
        HTML-escaped, and newlines are converted to <br>.
    """
    # Diff the RAW text and escape each segment on output. Diffing text that
    # is already escaped lets an opcode boundary fall inside an entity
    # ("&amp;" vs "&lt;" share "&" and ";"), which emits broken entities.
    # Escaping per segment is just as safe: every character from the inputs
    # still passes through html.escape exactly once.
    matcher = difflib.SequenceMatcher(None, original, corrected)

    parts: list[str] = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            parts.append(html.escape(original[i1:i2]))
            continue
        # "replace" emits the deleted text followed by the inserted text.
        if tag in ("replace", "delete"):
            parts.append(_styled_span(_DEL_STYLE, original[i1:i2]))
        if tag in ("replace", "insert"):
            parts.append(_styled_span(_INS_STYLE, corrected[j1:j2]))

    # Convert the remaining real newlines (in unchanged text, in mixed
    # segments, or following a ↵ marker) to HTML line breaks.
    return "".join(parts).replace("\n", "<br>")