File size: 3,686 Bytes
61d7017
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""HTML diff highlighting utilities."""

import difflib
import html

# Visible markers for whitespace differences so they don't become invisible
# inside colored diff spans.
#   ␣ (U+2423 OPEN BOX) for regular space
#   ↵ (U+21B5 DOWNWARDS ARROW WITH CORNER LEFTWARDS) for newline
#   → (U+2192 RIGHTWARDS ARROW) for tab
#
# NOTE: the newline marker (↵) is deliberately absent from this table.
# _visualize_whitespace() emits it itself, because a newline needs the marker
# AND the real "\n" kept after it; every character listed here is replaced
# outright by its marker.
_WHITESPACE_MARKERS: dict[str, str] = {
    " ": "␣",
    "\t": "→",
    "\u00a0": "␣",  # non-breaking space (shares the marker of a regular space)
}


def _visualize_whitespace(escaped: str) -> str:
    """Replace whitespace chars with visible markers inside a diff span.

    Operates on already HTML-escaped text. Newlines are handled separately
    by the final `\n` -> `<br>` pass, but we mark them inline too so the
    reader sees *where* a newline was inserted/removed before the <br>.
    """
    parts: list[str] = []
    for ch in escaped:
        if ch == "\n":
            parts.append(
                '<span style="opacity:0.6;">↵</span>\n'
            )  # marker + real newline (for <br>)
        elif ch in _WHITESPACE_MARKERS:
            parts.append(f'<span style="opacity:0.6;">{_WHITESPACE_MARKERS[ch]}</span>')
        else:
            parts.append(ch)
    return "".join(parts)


def _render_segment(escaped: str) -> str:
    """Render a diff segment, adding whitespace markers only when the whole
    segment is whitespace.

    Rationale: a pure-whitespace diff (e.g. a single inserted space) is
    otherwise invisible, so we show ␣/↵/→. But when the segment already
    contains visible text, the colored background is enough — marking the
    incidental spaces would just add noise to word-level edits.
    """
    if escaped and escaped.strip() == "":
        return _visualize_whitespace(escaped)
    return escaped


def highlight_diff(original: str, corrected: str) -> str:
    """Generate HTML highlighting differences between original and corrected text.

    Uses character-level difflib.SequenceMatcher to produce inline HTML
    with strikethrough for deletions and green highlights for insertions.
    Whitespace-only changes are rendered with visible markers (␣ for space,
    ↵ for newline, → for tab) so spacing-only edits are perceivable.

    The diff is computed on the raw text and every slice is HTML-escaped
    before it is placed in the output. Escaping first and diffing second
    would let the character-level matcher cut through an entity (e.g.
    ``<`` vs ``>`` becoming ``&`` + ``l``/``g`` + ``t;``), producing broken
    text in the rendered page.

    Args:
        original: Original text.
        corrected: Corrected text.

    Returns:
        HTML string with diff highlights; newlines are emitted as ``<br>``.
        Safe against XSS: all input-derived text is passed through
        ``html.escape`` before being embedded.
    """
    # Diff the RAW strings so one source character is always one diff unit;
    # entities are created per segment below and can never be split.
    matcher = difflib.SequenceMatcher(None, original, corrected)
    result_parts: list[str] = []

    del_style = "background:#ffecec;text-decoration:line-through;"
    ins_style = "background:#e6ffec;"

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        # Escape AFTER slicing. Escaping never alters whitespace, so the
        # whitespace-only detection in _render_segment is unaffected.
        orig_seg = html.escape(original[i1:i2])
        corr_seg = html.escape(corrected[j1:j2])

        if tag == "equal":
            result_parts.append(f"<span>{orig_seg}</span>")
            continue

        # "replace" emits the deleted text followed by the inserted text;
        # "delete" and "insert" emit only their own side.
        if tag in ("replace", "delete"):
            result_parts.append(
                f'<span style="{del_style}">{_render_segment(orig_seg)}</span>'
            )
        if tag in ("replace", "insert"):
            result_parts.append(
                f'<span style="{ins_style}">{_render_segment(corr_seg)}</span>'
            )

    # Every remaining real newline (in equal spans, in diff spans, and the one
    # kept after each ↵ marker) becomes an HTML line break.
    return "".join(result_parts).replace("\n", "<br>")