# tiny-press / core/diff.py
# Page header captured along with the file (not part of the program):
#   sriharsha-cr's picture | Project files | ebc3bf5 | raw | history blame | 7.02 kB
import difflib
import html as _h
def _word_diff(original: str, compressed: str) -> tuple[str, str]:
    """
    Diff two texts word by word and return both sides as annotated HTML.

    Each text is split on whitespace and the two word lists are aligned with
    ``difflib.SequenceMatcher`` (``autojunk`` disabled).  Every aligned run is
    re-joined with single spaces and HTML-escaped; runs that differ between
    the two sides are wrapped in a styled ``<mark>``.

    Returns (annotated_original_html, annotated_compressed_html).

    Colour key:
        original   - red strikethrough = dropped (or replaced on the other side)
        original   - plain             = survived unchanged
        compressed - amber             = rewritten (replaced)
        compressed - green             = inserted
        compressed - plain             = survived unchanged
    """
    dropped_style = (
        "background:#fee2e2;color:#b91c1c;"
        "text-decoration:line-through;padding:1px 3px;border-radius:3px"
    )
    rewritten_style = (
        "background:#fef9c3;color:#92400e;"
        "padding:1px 3px;border-radius:3px"
    )
    inserted_style = (
        "background:#dcfce7;color:#15803d;"
        "padding:1px 3px;border-radius:3px"
    )

    def highlight(style: str, escaped_text: str) -> str:
        # The text passed in is already HTML-escaped by the caller below.
        return f'<mark style="{style}">{escaped_text}</mark>'

    source_tokens = original.split()
    target_tokens = compressed.split()
    opcodes = difflib.SequenceMatcher(
        None, source_tokens, target_tokens, autojunk=False
    ).get_opcodes()

    left: list[str] = []
    right: list[str] = []
    for op, a_lo, a_hi, b_lo, b_hi in opcodes:
        left_text = _h.escape(" ".join(source_tokens[a_lo:a_hi]))
        right_text = _h.escape(" ".join(target_tokens[b_lo:b_hi]))

        if op == "equal":
            left.append(left_text)
            right.append(right_text)
            continue

        # Words missing from the compressed side are struck through on the
        # original side, whether they were simply removed or swapped out.
        if op in ("delete", "replace"):
            left.append(highlight(dropped_style, left_text))

        if op == "replace":
            right.append(highlight(rewritten_style, right_text))
        elif op == "insert":
            right.append(highlight(inserted_style, right_text))

    return " ".join(left), " ".join(right)
def _meta_text(value: object, placeholder: str = "—") -> str:
    """Return *value* as HTML-escaped text, or *placeholder* when it is None."""
    if value is None:
        return placeholder
    return _h.escape(str(value))


def _score_text(value: object, placeholder: str = "—") -> str:
    """Format a numeric score with four decimals, or *placeholder* if not numeric."""
    try:
        return f"{float(value):.4f}"
    except (TypeError, ValueError):
        return placeholder


def render_diff_html(record: dict) -> str:
    """Build a self-contained side-by-side diff HTML block for a compression run.

    Keys read from *record*:
        input_text, output_text            - the two texts to diff
        id, timestamp, model, tokenizer    - shown as metadata chips
        input_tokens, output_tokens,
        target_tokens, duration_ms         - shown as metadata chips
        compression_ratio, quality_score   - shown with four decimals
        feedback, feedback_comment         - optional feedback badge

    Returns an empty string when *record* is empty/None or when either text
    is missing or empty; otherwise returns the HTML fragment.

    Every metadata value is converted with ``str`` and HTML-escaped before it
    is placed in the markup.  A key that is absent, or present with the value
    ``None``, is rendered as an em dash, except that an absent
    ``compression_ratio`` / ``quality_score`` is shown as ``0.0000``.
    """
    if not record:
        return ""

    original = record.get("input_text") or ""
    compressed = record.get("output_text") or ""
    if not original or not compressed:
        return ""

    orig_html, comp_html = _word_diff(original, compressed)

    # dict.get's default only covers a *missing* key; a stored None would
    # otherwise reach html.escape / the float format spec and raise.
    model = _meta_text(record.get("model"))
    tokenizer = _meta_text(record.get("tokenizer"))
    ts = _meta_text(record.get("timestamp"))
    in_tok = _meta_text(record.get("input_tokens"))
    out_tok = _meta_text(record.get("output_tokens"))
    target_tok = _meta_text(record.get("target_tokens"))
    duration = _meta_text(record.get("duration_ms"))
    run_id = _meta_text(record.get("id"))
    ratio = _score_text(record.get("compression_ratio", 0))
    quality = _score_text(record.get("quality_score", 0))

    feedback_val = record.get("feedback")
    # str() guards against a non-string comment; empty/None collapses to "".
    feedback_note = _h.escape(str(record.get("feedback_comment") or ""))

    # Build optional feedback block
    if feedback_val is not None:
        is_positive = feedback_val == 1
        badge_bg = "#f0fdf4" if is_positive else "#fef2f2"
        badge_color = "#15803d" if is_positive else "#b91c1c"
        badge_text = "👍 Helpful" if is_positive else "👎 Not helpful"
        feedback_block = (
            f'<div style="display:flex;flex-wrap:wrap;align-items:center;gap:8px;'
            f'margin-top:10px;padding:8px 12px;border-radius:6px;background:{badge_bg}">'
            f'<span style="font-weight:600;font-size:0.8rem;color:{badge_color}">{badge_text}</span>'
        )
        if feedback_note:
            feedback_block += (
                f'<span style="font-size:0.8rem;color:#374151;font-style:italic">'
                f'"{feedback_note}"</span>'
            )
        feedback_block += "</div>"
    else:
        feedback_block = ""

    # Template lines are kept flush-left so the emitted markup carries no
    # leading indentation.
    return f"""
<div style="font-family:system-ui,sans-serif;margin-top:4px">
<!-- Primary meta chips -->
<div style="display:flex;flex-wrap:wrap;gap:6px;margin-bottom:6px;font-size:0.78rem">
<span style="background:#f3f4f6;padding:3px 9px;border-radius:12px;color:#374151">Run #{run_id}</span>
<span style="background:#f3f4f6;padding:3px 9px;border-radius:12px;color:#374151">{ts}</span>
<span style="background:#eff6ff;padding:3px 9px;border-radius:12px;color:#1d4ed8">{model}</span>
<span style="background:#f0fdf4;padding:3px 9px;border-radius:12px;color:#15803d">Quality {quality}</span>
<span style="background:#fff7ed;padding:3px 9px;border-radius:12px;color:#c2410c">Ratio {ratio}</span>
<span style="background:#faf5ff;padding:3px 9px;border-radius:12px;color:#7e22ce">⏱ {duration} ms</span>
</div>
<!-- Secondary meta chips -->
<div style="display:flex;flex-wrap:wrap;gap:6px;margin-bottom:12px;font-size:0.78rem">
<span style="background:#f3f4f6;padding:3px 9px;border-radius:12px;color:#374151">{in_tok} in → {out_tok} out (target {target_tok})</span>
<span style="background:#f3f4f6;padding:3px 9px;border-radius:12px;color:#374151">tokenizer: {tokenizer}</span>
</div>
<!-- Side-by-side panels -->
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px">
<!-- Original -->
<div style="border:1px solid #fecaca;border-radius:8px;overflow:hidden">
<div style="background:#fef2f2;padding:8px 14px;border-bottom:1px solid #fecaca;
display:flex;justify-content:space-between;align-items:center">
<span style="font-weight:700;font-size:0.8rem;color:#b91c1c;letter-spacing:.04em">ORIGINAL</span>
<span style="font-size:0.75rem;color:#6b7280">{in_tok} tokens</span>
</div>
<div style="padding:14px;line-height:1.8;font-size:0.875rem;color:#1a1a1a;
max-height:340px;overflow-y:auto;word-break:break-word">
{orig_html}
</div>
</div>
<!-- Compressed -->
<div style="border:1px solid #bbf7d0;border-radius:8px;overflow:hidden">
<div style="background:#f0fdf4;padding:8px 14px;border-bottom:1px solid #bbf7d0;
display:flex;justify-content:space-between;align-items:center">
<span style="font-weight:700;font-size:0.8rem;color:#15803d;letter-spacing:.04em">COMPRESSED</span>
<span style="font-size:0.75rem;color:#6b7280">{out_tok} tokens</span>
</div>
<div style="padding:14px;line-height:1.8;font-size:0.875rem;color:#1a1a1a;
max-height:340px;overflow-y:auto;word-break:break-word">
{comp_html}
</div>
</div>
</div>
{feedback_block}
<!-- Legend -->
<div style="display:flex;flex-wrap:wrap;gap:14px;margin-top:10px;font-size:0.75rem;color:#6b7280;align-items:center">
<mark style="background:#fee2e2;color:#b91c1c;text-decoration:line-through;padding:2px 7px;border-radius:3px">dropped</mark>
<mark style="background:#fef9c3;color:#92400e;padding:2px 7px;border-radius:3px">rewritten</mark>
<mark style="background:#dcfce7;color:#15803d;padding:2px 7px;border-radius:3px">inserted</mark>
<span>plain = unchanged</span>
</div>
</div>
"""