"""HTML diff highlighting utilities."""
import difflib
import html
# Visible markers for whitespace differences so they don't become invisible
# inside colored diff spans.
# ␣ (U+2423 OPEN BOX) for regular space
# ↵ (U+21B5 DOWNWARDS ARROW WITH CORNER LEFTWARDS) for newline
# → (U+2192 RIGHTWARDS ARROW) for tab
_WHITESPACE_MARKERS = {
" ": "␣",
"\t": "→",
"\u00a0": "␣", # non-breaking space
}
def _visualize_whitespace(escaped: str) -> str:
"""Replace whitespace chars with visible markers inside a diff span.
Operates on already HTML-escaped text. Newlines are handled separately
by the final `\n` -> `
` pass, but we mark them inline too so the
reader sees *where* a newline was inserted/removed before the
.
"""
parts: list[str] = []
for ch in escaped:
if ch == "\n":
parts.append(
'↵\n'
) # marker + real newline (for
)
elif ch in _WHITESPACE_MARKERS:
parts.append(f'{_WHITESPACE_MARKERS[ch]}')
else:
parts.append(ch)
return "".join(parts)
def _render_segment(escaped: str) -> str:
"""Render a diff segment, adding whitespace markers only when the whole
segment is whitespace.
Rationale: a pure-whitespace diff (e.g. a single inserted space) is
otherwise invisible, so we show ␣/↵/→. But when the segment already
contains visible text, the colored background is enough — marking the
incidental spaces would just add noise to word-level edits.
"""
if escaped and escaped.strip() == "":
return _visualize_whitespace(escaped)
return escaped
def highlight_diff(original: str, corrected: str) -> str:
    """Generate HTML highlighting differences between original and corrected text.

    Uses a character-level ``difflib.SequenceMatcher`` to produce inline
    HTML: deleted text is wrapped in a struck-through red span, inserted
    text in a green span, and unchanged text is emitted as-is. Segments that
    consist only of whitespace are rendered through ``_render_segment`` so
    spacing-only edits show visible markers (␣ for space, ↵ for newline,
    → for tab).

    The diff is computed on the raw text and every segment is HTML-escaped
    when it is emitted. Diffing already-escaped text would let the matcher
    split an entity in two (``&quot;`` vs ``&#x27;`` share ``&`` and ``;``),
    leaving tags in the middle of an entity and garbling the output.

    Args:
        original: Original text.
        corrected: Corrected text.

    Returns:
        HTML string with diff highlights. Safe against XSS: all text taken
        from the inputs is escaped, and the only markup comes from constants
        in this function.
    """
    del_style = "background:#ffecec;text-decoration:line-through;"
    ins_style = "background:#e6ffec;"

    def styled_span(style: str, raw_segment: str) -> str:
        # Escape per segment so an entity is never cut by a tag boundary.
        # NOTE(review): the wrapper element is assumed to be <span>; confirm
        # against the consuming template/CSS.
        body = _render_segment(html.escape(raw_segment))
        return f'<span style="{style}">{body}</span>'

    matcher = difflib.SequenceMatcher(None, original, corrected)
    result_parts: list[str] = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            result_parts.append(html.escape(original[i1:i2]))
            continue
        # "replace" emits the deletion first, then the insertion.
        if tag in ("replace", "delete"):
            result_parts.append(styled_span(del_style, original[i1:i2]))
        if tag in ("replace", "insert"):
            result_parts.append(styled_span(ins_style, corrected[j1:j2]))

    # Convert real newlines (inside or outside diff spans) to HTML line
    # breaks so the rendered diff keeps the text's line structure.
    return "".join(result_parts).replace("\n", "<br>")