Spaces:

cmpatino
/

tokenization_diff

Sleeping

App Files Files Community

cmpatino HF Staff committed 18 days ago

Commit

2597b43

1 Parent(s): 40112a6

Add tokenization details

Browse files

Files changed (1) hide show

app.py +110 -39

app.py CHANGED Viewed

@@ -87,9 +87,24 @@ def build_alignment_groups_from_ids(student_tokenizer, teacher_tokenizer, studen
     return s_groups, t_groups
-def highlight_groups(student_tokenizer, student_token_ids, s_groups, t_groups):
     """Build an HTML string with highlighted misalignment regions."""
     parts = []
     for k in range(len(s_groups)):
         s_ids = [student_token_ids[idx] for idx in s_groups[k]]
         text = student_tokenizer.decode(s_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)
@@ -99,64 +114,94 @@ def highlight_groups(student_tokenizer, student_token_ids, s_groups, t_groups):
         t_multi = len(t_groups[k]) > 1
         if s_multi and t_multi:
-            parts.append(f'<span style="background-color: #b388ff;">{escaped}</span>')
         elif s_multi:
-            parts.append(f'<span style="background-color: #ffcc80;">{escaped}</span>')
         elif t_multi:
-            parts.append(f'<span style="background-color: #90caf9;">{escaped}</span>')
         else:
             parts.append(escaped)
     return "".join(parts)
-def process_texts(student_model_id, teacher_model_id, dataset_id, progress=gr.Progress()):
-    """Load tokenizers and dataset, compute alignment, return highlighted HTML."""
     progress(0, desc="Loading tokenizers...")
     student_tokenizer = AutoTokenizer.from_pretrained(student_model_id)
     teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_id)
-    progress(0.1, desc="Loading dataset...")
-    ds = load_dataset(dataset_id, split="train")
     rows = ds.select(range(min(10, len(ds))))
-    def make_html_block(row, idx):
-        text = "".join(msg["content"] for msg in row["messages"])
-        s_ids = student_tokenizer.encode(text, add_special_tokens=False)
-        t_ids = teacher_tokenizer.encode(text, add_special_tokens=False)
-        s_groups, t_groups = build_alignment_groups_from_ids(
-            student_tokenizer, teacher_tokenizer, s_ids, t_ids
-        )
-        highlighted = highlight_groups(student_tokenizer, s_ids, s_groups, t_groups)
-        return {
-            "html_block": (
-                f'<div style="border:1px solid #ccc; padding:10px; margin:10px 0; '
-                f'border-radius:5px; white-space:pre-wrap; font-family:monospace; font-size:13px;">'
-                f"<strong>Text {idx + 1}</strong> "
-                f"(student tokens: {len(s_ids)}, teacher tokens: {len(t_ids)})<br><br>"
-                f"{highlighted}</div>"
-            )
-        }
-    progress(0.2, desc="Processing texts...")
-    rows = rows.map(make_html_block, num_proc=4, with_indices=True)
-    html_blocks = rows["html_block"]
-    progress(1, desc="Done!")
-    legend = (
-        '<div style="margin-bottom:15px; font-family:sans-serif;">'
-        "<strong>Legend:</strong> "
-        '<span style="background-color:#ffcc80; padding:2px 8px; margin-right:8px;">Student misalignment (orange)</span>'
-        '<span style="background-color:#90caf9; padding:2px 8px; margin-right:8px;">Teacher misalignment (blue)</span>'
-        '<span style="background-color:#b388ff; padding:2px 8px;">Both (purple)</span>'
-        "</div>"
-    )
-    return legend + "\n".join(html_blocks)
 with gr.Blocks(title="Tokenization Diff") as demo:
@@ -166,11 +211,37 @@ with gr.Blocks(title="Tokenization Diff") as demo:
         student_model = gr.Textbox(label="Student Model", value="Qwen/Qwen3-8B")
         teacher_model = gr.Textbox(label="Teacher Model", value="deepseek-ai/DeepSeek-Math-V2")
         dataset_id = gr.Textbox(label="Dataset ID", value="lm-provers/FineProofs-SFT")
     submit_btn = gr.Button("Submit", variant="primary")
     output = gr.HTML(label="Tokenization Diff Output")
-    submit_btn.click(fn=process_texts, inputs=[student_model, teacher_model, dataset_id], outputs=output)
 if __name__ == "__main__":
     demo.launch()

     return s_groups, t_groups
+def _decode_pieces(tokenizer, token_ids, indices):
+    """Return the decoded text of each token in a group, one string per index.
+
+    Each index in ``indices`` selects a single id from ``token_ids``; that id is
+    decoded on its own with special tokens kept and tokenization spaces left
+    untouched. The result preserves the order of ``indices``.
+    """
+    pieces = []
+    for position in indices:
+        single_token = [token_ids[position]]
+        pieces.append(
+            tokenizer.decode(
+                single_token,
+                skip_special_tokens=False,
+                clean_up_tokenization_spaces=False,
+            )
+        )
+    return pieces
+def _format_pieces(pieces):
+    """Format token pieces as a bracketed list of double-quoted strings.
+
+    For example, the pieces ``hel`` and ``lo`` are rendered as
+    ``["hel", "lo"]``. Backslashes and double quotes inside a piece are
+    backslash-escaped so that a piece containing a quote or a comma cannot be
+    mistaken for a boundary between two pieces.
+
+    Args:
+        pieces: Iterable of decoded token strings.
+
+    Returns:
+        One string of the form ``["p1", "p2", ...]``, or ``[]`` when there are
+        no pieces.
+    """
+    quoted = []
+    for piece in pieces:
+        # Escape backslashes first so the escapes added for quotes are not doubled.
+        safe = piece.replace("\\", "\\\\").replace('"', '\\"')
+        quoted.append(f'"{safe}"')
+    return f"[{', '.join(quoted)}]"
+def highlight_groups(student_tokenizer, teacher_tokenizer, student_token_ids, teacher_token_ids, s_groups, t_groups):
     """Build an HTML string with highlighted misalignment regions."""
     parts = []
+    first_purple = True
     for k in range(len(s_groups)):
         s_ids = [student_token_ids[idx] for idx in s_groups[k]]
         text = student_tokenizer.decode(s_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)
         t_multi = len(t_groups[k]) > 1
         if s_multi and t_multi:
+            if first_purple:
+                s_pieces = _decode_pieces(student_tokenizer, student_token_ids, s_groups[k])
+                t_pieces = _decode_pieces(teacher_tokenizer, teacher_token_ids, t_groups[k])
+                tooltip = html.escape(f'Student: {_format_pieces(s_pieces)} / Teacher: {_format_pieces(t_pieces)}')
+                parts.append(f'<span style="background-color: #b388ff;" title="{tooltip}">{escaped}</span>')
+                first_purple = False
+            else:
+                parts.append(f'<span style="background-color: #b388ff;">{escaped}</span>')
         elif s_multi:
+            s_pieces = _decode_pieces(student_tokenizer, student_token_ids, s_groups[k])
+            tooltip = html.escape(f'Student: {_format_pieces(s_pieces)}')
+            parts.append(f'<span style="background-color: #ffcc80;" title="{tooltip}">{escaped}</span>')
         elif t_multi:
+            t_pieces = _decode_pieces(teacher_tokenizer, teacher_token_ids, t_groups[k])
+            tooltip = html.escape(f'Teacher: {_format_pieces(t_pieces)}')
+            parts.append(f'<span style="background-color: #90caf9;" title="{tooltip}">{escaped}</span>')
         else:
             parts.append(escaped)
     return "".join(parts)
+def make_html_block(student_tokenizer, teacher_tokenizer, text, idx):
+    """Render one text as a bordered HTML card showing its tokenization diff.
+
+    The text is encoded by both tokenizers without special tokens, the two id
+    sequences are grouped with ``build_alignment_groups_from_ids``, and the
+    groups are coloured by ``highlight_groups``. The card header shows the
+    1-based text number (``idx + 1``) and both token counts.
+    """
+    student_ids = student_tokenizer.encode(text, add_special_tokens=False)
+    teacher_ids = teacher_tokenizer.encode(text, add_special_tokens=False)
+    student_groups, teacher_groups = build_alignment_groups_from_ids(
+        student_tokenizer, teacher_tokenizer, student_ids, teacher_ids
+    )
+    body = highlight_groups(
+        student_tokenizer, teacher_tokenizer, student_ids, teacher_ids, student_groups, teacher_groups
+    )
+    fragments = [
+        '<div style="border:1px solid #ccc; padding:10px; margin:10px 0; ',
+        'border-radius:5px; white-space:pre-wrap; font-family:monospace; font-size:13px;">',
+        f"<strong>Text {idx + 1}</strong> ",
+        f"(student tokens: {len(student_ids)}, teacher tokens: {len(teacher_ids)})<br><br>",
+        f"{body}</div>",
+    ]
+    return "".join(fragments)
+def process_texts(student_model_id, teacher_model_id, dataset_id, dataset_config, progress=gr.Progress()):
+    """Load tokenizers and dataset, compute first row only.
+
+    Args:
+        student_model_id: Model id passed to ``AutoTokenizer.from_pretrained``
+            for the student tokenizer.
+        teacher_model_id: Model id passed to ``AutoTokenizer.from_pretrained``
+            for the teacher tokenizer.
+        dataset_id: Dataset id passed to ``load_dataset``; its ``train`` split
+            is read and each row must have a ``messages`` list of dicts with a
+            ``content`` key.
+        dataset_config: Optional config name; blank or whitespace-only means
+            no config (``None``).
+        progress: Gradio progress callback.
+
+    Returns:
+        ``(student_tokenizer, teacher_tokenizer, texts, cache, idx, html)``
+        where ``texts`` holds at most the first 10 rows, ``cache`` maps a text
+        index to its rendered HTML block, ``idx`` is 0 and ``html`` is the
+        rendered first page. When the split has no rows, ``cache`` is empty and
+        ``html`` is the empty string.
+    """
     progress(0, desc="Loading tokenizers...")
     student_tokenizer = AutoTokenizer.from_pretrained(student_model_id)
     teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_id)
+    progress(0.5, desc="Loading dataset...")
+    # Treat a missing or whitespace-only config name as "no config".
+    config = (dataset_config or "").strip() or None
+    ds = load_dataset(dataset_id, name=config, split="train")
     rows = ds.select(range(min(10, len(ds))))
+    # Each text is the concatenation of every message's content in the row.
+    texts = ["".join(msg["content"] for msg in row["messages"]) for row in rows]
+    if not texts:
+        # Empty split: there is no first text to render, so return an empty
+        # page instead of failing on texts[0].
+        progress(1, desc="Done!")
+        return student_tokenizer, teacher_tokenizer, texts, {}, 0, render_page({}, 0, 0)
+    progress(0.8, desc="Processing first text...")
+    # Only the first text is rendered up front; the rest are rendered lazily
+    # by go_next and stored in the cache.
+    first_block = make_html_block(student_tokenizer, teacher_tokenizer, texts[0], 0)
+    cache = {0: first_block}
+    progress(1, desc="Done!")
+    return student_tokenizer, teacher_tokenizer, texts, cache, 0, render_page(cache, 0, len(texts))
+# Static HTML colour key that render_page places in front of every page.
+# The three background colours are the same hex values highlight_groups uses
+# for its spans: orange #ffcc80, blue #90caf9 and purple #b388ff.
+LEGEND = (
+    '<div style="margin-bottom:15px; font-family:sans-serif;">'
+    "<strong>Legend:</strong> "
+    '<span style="background-color:#ffcc80; padding:2px 8px; margin-right:8px;">Student token split (orange)</span>'
+    '<span style="background-color:#90caf9; padding:2px 8px; margin-right:8px;">Teacher token split (blue)</span>'
+    '<span style="background-color:#b388ff; padding:2px 8px;">Both (purple)</span>'
+    "</div>"
+)
+def render_page(cache, idx, total):
+    """Assemble the HTML for the page at ``idx``: legend, counter, cached block.
+
+    Returns an empty string when ``cache`` is empty or otherwise falsy.
+    Nothing is computed here: ``cache[idx]`` must already hold the rendered
+    block. The counter shows the 1-based position ``idx + 1`` out of ``total``.
+    """
+    if cache:
+        counter = f'<div style="font-family:sans-serif; margin-bottom:10px;">Text {idx + 1} of {total}</div>'
+        return "".join((LEGEND, counter, cache[idx]))
+    return ""
+def go_prev(cache, idx, texts):
+    """Step back one text, stopping at index 0, and re-render the page.
+
+    Returns ``(cache, new_idx, html)``; ``cache`` is passed through unchanged.
+    """
+    previous = idx - 1
+    if previous < 0:
+        previous = 0
+    page = render_page(cache, previous, len(texts))
+    return cache, previous, page
+def go_next(student_tokenizer, teacher_tokenizer, texts, cache, idx):
+    """Advance to the next text, rendering and caching it on first visit.
+
+    Args:
+        student_tokenizer: Student tokenizer passed to ``make_html_block``.
+        teacher_tokenizer: Teacher tokenizer passed to ``make_html_block``.
+        texts: List of loaded texts; may be empty if nothing has been loaded.
+        cache: Dict mapping a text index to its rendered HTML block; a newly
+            rendered block is stored in it in place.
+        idx: Index of the text currently shown.
+
+    Returns:
+        ``(cache, new_idx, html)``. The index stops at the last text. When
+        ``texts`` is empty, ``cache`` and ``idx`` are returned unchanged with
+        an empty page.
+    """
+    if not texts:
+        # Nothing loaded yet (e.g. "Next" clicked before "Submit"). Without
+        # this guard idx would become -1 and texts[-1] would raise IndexError.
+        return cache, idx, ""
+    idx = min(len(texts) - 1, idx + 1)
+    if idx not in cache:
+        # Render lazily: a text is only tokenized the first time it is shown.
+        cache[idx] = make_html_block(student_tokenizer, teacher_tokenizer, texts[idx], idx)
+    return cache, idx, render_page(cache, idx, len(texts))
 with gr.Blocks(title="Tokenization Diff") as demo:
         student_model = gr.Textbox(label="Student Model", value="Qwen/Qwen3-8B")
         teacher_model = gr.Textbox(label="Teacher Model", value="deepseek-ai/DeepSeek-Math-V2")
         dataset_id = gr.Textbox(label="Dataset ID", value="lm-provers/FineProofs-SFT")
+        dataset_config = gr.Textbox(label="Dataset Config", value="default")
     submit_btn = gr.Button("Submit", variant="primary")
+    student_tok_state = gr.State(None)
+    teacher_tok_state = gr.State(None)
+    texts_state = gr.State([])
+    cache_state = gr.State({})
+    idx_state = gr.State(0)
     output = gr.HTML(label="Tokenization Diff Output")
+    with gr.Row():
+        prev_btn = gr.Button("Previous")
+        next_btn = gr.Button("Next")
+    submit_btn.click(
+        fn=process_texts,
+        inputs=[student_model, teacher_model, dataset_id, dataset_config],
+        outputs=[student_tok_state, teacher_tok_state, texts_state, cache_state, idx_state, output],
+    )
+    prev_btn.click(
+        fn=go_prev,
+        inputs=[cache_state, idx_state, texts_state],
+        outputs=[cache_state, idx_state, output],
+    )
+    next_btn.click(
+        fn=go_next,
+        inputs=[student_tok_state, teacher_tok_state, texts_state, cache_state, idx_state],
+        outputs=[cache_state, idx_state, output],
+    )
 if __name__ == "__main__":
     demo.launch()