cmpatino HF Staff committed on
Commit
35f1842
·
1 Parent(s): f8df45d

Init version

Browse files
Files changed (2) hide show
  1. app.py +172 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import html
2
+
3
+ import gradio as gr
4
+ from datasets import load_dataset
5
+ from transformers import AutoTokenizer
6
+
7
+
8
def build_alignment_groups_from_ids(student_tokenizer, teacher_tokenizer, student_token_ids, teacher_token_ids):
    """Align two tokenizations of the same text into matching span groups.

    Greedily grows a decoded-text buffer on each side until both buffers hold
    the same substring, then emits the accumulated token indices on each side
    as one aligned group. Adapted from TRL's
    GoldTrainer._build_alignment_groups_from_ids.

    Args:
        student_tokenizer: Tokenizer exposing ``decode(ids, ...)``.
        teacher_tokenizer: Tokenizer exposing ``decode(ids, ...)``.
        student_token_ids: Token ids produced by the student tokenizer.
        teacher_token_ids: Token ids produced by the teacher tokenizer.

    Returns:
        Tuple ``(s_groups, t_groups)`` of parallel lists: ``s_groups[k]`` and
        ``t_groups[k]`` are the index lists of student/teacher tokens that
        decode to the same substring. A trailing unmatched remainder, if any,
        is emitted as a final (possibly one-sided) group.
    """

    def to_canonical_pieces(tok, ids):
        # Decode incrementally so each piece is exactly the text the k-th
        # token contributes in context (robust to tokenizers whose per-token
        # decode differs from the decode of the full sequence).
        pieces = []
        prev = ""
        for k in range(len(ids)):
            cur = tok.decode(ids[: k + 1], skip_special_tokens=False, clean_up_tokenization_spaces=False)
            pieces.append(cur[len(prev):])
            prev = cur
        return pieces

    s_pieces = to_canonical_pieces(student_tokenizer, student_token_ids)
    t_pieces = to_canonical_pieces(teacher_tokenizer, teacher_token_ids)

    i = j = 0
    s_buf = t_buf = ""
    s_group = []
    t_group = []
    s_groups = []
    t_groups = []

    def flush():
        # Emit the current aligned pair; only meaningful when both sides
        # contributed at least one token.
        if s_group and t_group:
            s_groups.append(s_group.copy())
            t_groups.append(t_group.copy())

    while i < len(s_pieces) or j < len(t_pieces):
        # Buffers match on non-empty text: close out this aligned group.
        if s_buf == t_buf and s_buf != "":
            flush()
            s_buf = t_buf = ""
            s_group = []
            t_group = []
            continue

        # Seed an empty buffer before any length comparison.
        if s_buf == "" and i < len(s_pieces):
            s_buf += s_pieces[i]
            s_group.append(i)
            i += 1
            continue
        if t_buf == "" and j < len(t_pieces):
            t_buf += t_pieces[j]
            t_group.append(j)
            j += 1
            continue

        # Grow the shorter buffer so both sides converge on the same text.
        if len(s_buf) <= len(t_buf):
            if i < len(s_pieces):
                s_buf += s_pieces[i]
                s_group.append(i)
                i += 1
            elif j < len(t_pieces):
                t_buf += t_pieces[j]
                t_group.append(j)
                j += 1
        else:
            if j < len(t_pieces):
                t_buf += t_pieces[j]
                t_group.append(j)
                j += 1
            elif i < len(s_pieces):
                s_buf += s_pieces[i]
                s_group.append(i)
                i += 1

    if s_buf == t_buf and s_group and t_group:
        flush()
    elif s_group or t_group:
        # Unmatched trailing remainder: emit as-is (one side may be empty)
        # so callers still see every consumed token index. The original's
        # extra "if not s_group: s_group = []" resets and conditional
        # copies were no-ops and have been removed.
        s_groups.append(s_group.copy())
        t_groups.append(t_group.copy())

    return s_groups, t_groups
88
+
89
+
90
def highlight_groups(student_tokenizer, student_token_ids, s_groups, t_groups):
    """Render the student-side text as HTML, color-coding misaligned regions.

    Args:
        student_tokenizer: Tokenizer used to decode student token ids.
        student_token_ids: Full list of student token ids for the text.
        s_groups: Student-side alignment groups (lists of indices into
            ``student_token_ids``).
        t_groups: Teacher-side alignment groups, parallel to ``s_groups``.

    Returns:
        An HTML string where a group spanning multiple student tokens is
        orange, multiple teacher tokens blue, and both purple; 1:1 aligned
        groups are left unstyled. All decoded text is HTML-escaped.
    """
    parts = []
    # Iterate the parallel group lists in lockstep instead of indexing by k.
    for s_idx, t_idx in zip(s_groups, t_groups):
        s_ids = [student_token_ids[idx] for idx in s_idx]
        text = student_tokenizer.decode(s_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)
        escaped = html.escape(text)

        s_multi = len(s_idx) > 1
        t_multi = len(t_idx) > 1

        if s_multi and t_multi:
            parts.append(f'<span style="background-color: #b388ff;">{escaped}</span>')
        elif s_multi:
            parts.append(f'<span style="background-color: #ffcc80;">{escaped}</span>')
        elif t_multi:
            parts.append(f'<span style="background-color: #90caf9;">{escaped}</span>')
        else:
            parts.append(escaped)

    return "".join(parts)
111
+
112
+
113
def process_texts(student_model_id, teacher_model_id, dataset_id, progress=gr.Progress()):
    """Load tokenizers and dataset, compute alignment, return highlighted HTML.

    Args:
        student_model_id: Hub id of the model whose tokenizer plays "student".
        teacher_model_id: Hub id of the model whose tokenizer plays "teacher".
        dataset_id: Hub id of a dataset with a "train" split whose rows carry
            a "messages" list of dicts with a "content" field.
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        An HTML string: a color legend followed by one bordered block per
        processed text with misaligned token regions highlighted.
    """
    progress(0, desc="Loading tokenizers...")
    student_tokenizer = AutoTokenizer.from_pretrained(student_model_id)
    teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_id)

    progress(0.1, desc="Loading dataset...")
    ds = load_dataset(dataset_id, split="train")
    # Cap the demo at 10 texts; smaller datasets yield fewer rows.
    n_rows = min(10, len(ds))
    rows = ds.select(range(n_rows))

    html_blocks = []
    for row_idx, row in enumerate(rows):
        # Scale progress into the 0.1-0.9 band reserved for per-text work,
        # using the actual row count (the original hard-coded "/10" in the
        # message and a /12 denominator, which disagreed with each other and
        # with datasets holding fewer than 10 rows).
        progress(0.1 + 0.8 * (row_idx + 1) / n_rows, desc=f"Processing text {row_idx + 1}/{n_rows}...")
        # Concatenate all chat turns into one plain string to tokenize.
        text = "".join(msg["content"] for msg in row["messages"])

        s_ids = student_tokenizer.encode(text, add_special_tokens=False)
        t_ids = teacher_tokenizer.encode(text, add_special_tokens=False)

        s_groups, t_groups = build_alignment_groups_from_ids(
            student_tokenizer, teacher_tokenizer, s_ids, t_ids
        )

        highlighted = highlight_groups(student_tokenizer, s_ids, s_groups, t_groups)
        html_blocks.append(
            f'<div style="border:1px solid #ccc; padding:10px; margin:10px 0; '
            f'border-radius:5px; white-space:pre-wrap; font-family:monospace; font-size:13px;">'
            f"<strong>Text {row_idx + 1}</strong> "
            f"(student tokens: {len(s_ids)}, teacher tokens: {len(t_ids)})<br><br>"
            f"{highlighted}</div>"
        )

    progress(1, desc="Done!")

    legend = (
        '<div style="margin-bottom:15px; font-family:sans-serif;">'
        "<strong>Legend:</strong> "
        '<span style="background-color:#ffcc80; padding:2px 8px; margin-right:8px;">Student misalignment (orange)</span>'
        '<span style="background-color:#90caf9; padding:2px 8px; margin-right:8px;">Teacher misalignment (blue)</span>'
        '<span style="background-color:#b388ff; padding:2px 8px;">Both (purple)</span>'
        "</div>"
    )

    return legend + "\n".join(html_blocks)
156
+
157
+
158
# --- Gradio UI ---------------------------------------------------------------
# Three textboxes (student model, teacher model, dataset id); clicking Submit
# runs process_texts and renders the returned highlighted-diff HTML below.
with gr.Blocks(title="Tokenization Diff") as demo:
    gr.Markdown("# Tokenization Diff\nVisualize where two tokenizers differ in how they tokenize text.")

    with gr.Row():
        student_model = gr.Textbox(label="Student Model", value="Qwen/Qwen3-8B")
        teacher_model = gr.Textbox(label="Teacher Model", value="deepseek-ai/DeepSeek-Math-V2")
        dataset_id = gr.Textbox(label="Dataset ID", value="lm-provers/FineProofs-SFT")

    submit_btn = gr.Button("Submit", variant="primary")
    output = gr.HTML(label="Tokenization Diff Output")

    # process_texts receives the three textbox values positionally; its
    # gr.Progress() parameter is injected by Gradio, not listed in inputs.
    submit_btn.click(fn=process_texts, inputs=[student_model, teacher_model, dataset_id], outputs=output)

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio>=6.6.0
2
+ transformers
3
+ datasets