Spaces:

Jellyfish042
/

Compression-Lens

Sleeping

Jellyfish042 Claude Sonnet 4.5 committed on Jan 18

Commit

6fcf271

1 Parent(s): 98b6961

Improve UI and fix multi-token display in tooltips

- Increase max text length from 4000 to 8192 characters
- Rename model labels to use generic Model A/B instead of hardcoded names
- Fix tooltip to show all tokens in a byte range (not just one)
- Remove "UncheatableEval" title from HTML output
- Update data attributes from data-qwen/data-rwkv to data-model-a/data-model-b

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (2) hide show

app.py +1 -1
visualization/html_generator.py +12 -31

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ MODELS_DIR = SCRIPT_DIR / "models"
 SUPPORT_DIR = SCRIPT_DIR / "support"
 # Text length limits
-MAX_TEXT_LENGTH = 4000
 MIN_TEXT_LENGTH = 1
 # Global model cache

 SUPPORT_DIR = SCRIPT_DIR / "support"
 # Text length limits
+MAX_TEXT_LENGTH = 8192
 MIN_TEXT_LENGTH = 1
 # Global model cache

visualization/html_generator.py CHANGED Viewed

@@ -353,30 +353,12 @@ def generate_comparison_html(
         model_a_token_idx = find_token_for_byte(byte_start, model_a_token_ranges)
         model_b_token_idx = find_token_for_byte(byte_start, model_b_token_ranges)
-        # Build token info strings showing actual token IDs from the models
-        qwen_info_parts = []
-        rwkv_info_parts = []
-        # For Model A (Qwen), show the actual token ID if available
-        if model_a_token_idx is not None and topk_predictions_a and model_a_token_idx < len(topk_predictions_a):
-            actual_token_id = topk_predictions_a[model_a_token_idx][0]
-            token_text_decoded = decode_token(actual_token_id, tokenizer_a, model_type_a)
-            qwen_info_parts.append(f"[{actual_token_id}] {repr(token_text_decoded)}")
-        else:
-            # Fallback to display tokenizer info
-            qwen_info_parts = [f"[{idx}] {repr(s)}" for idx, s in token["qwen_tokens"]]
-        # For Model B (RWKV), show the actual token ID if available
-        if model_b_token_idx is not None and topk_predictions_b and model_b_token_idx < len(topk_predictions_b):
-            actual_token_id = topk_predictions_b[model_b_token_idx][0]
-            token_text_decoded = decode_token(actual_token_id, tokenizer_b, model_type_b)
-            rwkv_info_parts.append(f"[{actual_token_id}] {repr(token_text_decoded)}")
-        else:
-            # Fallback to display tokenizer info
-            rwkv_info_parts = [f"[{idx}] {repr(s)}" for idx, s in token["rwkv_tokens"]]
-        qwen_info = ", ".join(qwen_info_parts)
-        rwkv_info = ", ".join(rwkv_info_parts)
         raw_bytes = list(text_bytes[byte_start:byte_end])
         losses_a = byte_losses_a[byte_start:byte_end]
@@ -443,8 +425,8 @@ def generate_comparison_html(
         token_span_content = "".join(token_html_parts)
         data_attrs = (
-            f'data-qwen="{escape_for_attr(qwen_info)}" '
-            f'data-rwkv="{escape_for_attr(rwkv_info)}" '
             f'data-bytes="{escape_for_attr(bytes_str)}" '
             f'data-compression-a="{escape_for_attr(compression_a_str)}" '
             f'data-compression-b="{escape_for_attr(compression_b_str)}" '
@@ -475,7 +457,7 @@ def generate_comparison_html(
 <html>
 <head>
     <meta charset="UTF-8">
-    <title>UncheatableEval - Byte-wise Loss Comparison</title>
     <style>
         body {{
             font-family: Consolas, 'Courier New', monospace;
@@ -671,7 +653,6 @@ def generate_comparison_html(
     <svg id="svg-overlay"></svg>
     <div id="tooltip"></div>
     <div class="header">
-        <h1>UncheatableEval - Byte-wise Loss Comparison</h1>
         <div class="meta">
             <div>Model A: {model_a_name}</div>
             <div>Model B: {model_b_name}</div>
@@ -781,8 +762,8 @@ def generate_comparison_html(
         tokenSpans.forEach(token => {{
             token.addEventListener('mouseenter', (e) => {{
-                const qwen = token.getAttribute('data-qwen') || 'N/A';
-                const rwkv = token.getAttribute('data-rwkv') || 'N/A';
                 const bytes = token.getAttribute('data-bytes') || '';
                 const compressionA = token.getAttribute('data-compression-a') || '';
                 const compressionB = token.getAttribute('data-compression-b') || '';
@@ -829,8 +810,8 @@ def generate_comparison_html(
                     <div><span class="label">Compression A:</span> <span class="loss-a">${{compressionA || '(empty)'}}</span></div>
                     <div><span class="label">Compression B:</span> <span class="loss-b">${{compressionB || '(empty)'}}</span></div>
                     <hr style="border-color: #555; margin: 6px 0;">
-                    <div><span class="label">Qwen:</span> <span class="qwen">${{qwen || '(empty)'}}</span></div>
-                    <div><span class="label">RWKV:</span> <span class="rwkv">${{rwkv || '(empty)'}}</span></div>
                 `;
                 if (top5A || top5B) {{
                     tooltipHtml += '<div class="topk-section"><div class="topk-container">';

         model_a_token_idx = find_token_for_byte(byte_start, model_a_token_ranges)
         model_b_token_idx = find_token_for_byte(byte_start, model_b_token_ranges)
+        # Build token info strings showing all tokens in this byte range
+        # Model A (RWKV7) - show all tokens that overlap with this byte range
+        model_a_info = ", ".join([f"[{idx}] {repr(s)}" for idx, s in token["rwkv_tokens"]])
+        # Model B (Qwen3) - show all tokens that overlap with this byte range
+        model_b_info = ", ".join([f"[{idx}] {repr(s)}" for idx, s in token["qwen_tokens"]])
         raw_bytes = list(text_bytes[byte_start:byte_end])
         losses_a = byte_losses_a[byte_start:byte_end]
         token_span_content = "".join(token_html_parts)
         data_attrs = (
+            f'data-model-a="{escape_for_attr(model_a_info)}" '
+            f'data-model-b="{escape_for_attr(model_b_info)}" '
             f'data-bytes="{escape_for_attr(bytes_str)}" '
             f'data-compression-a="{escape_for_attr(compression_a_str)}" '
             f'data-compression-b="{escape_for_attr(compression_b_str)}" '
 <html>
 <head>
     <meta charset="UTF-8">
+    <title>Model Comparison</title>
     <style>
         body {{
             font-family: Consolas, 'Courier New', monospace;
     <svg id="svg-overlay"></svg>
     <div id="tooltip"></div>
     <div class="header">
         <div class="meta">
             <div>Model A: {model_a_name}</div>
             <div>Model B: {model_b_name}</div>
         tokenSpans.forEach(token => {{
             token.addEventListener('mouseenter', (e) => {{
+                const modelA = token.getAttribute('data-model-a') || 'N/A';
+                const modelB = token.getAttribute('data-model-b') || 'N/A';
                 const bytes = token.getAttribute('data-bytes') || '';
                 const compressionA = token.getAttribute('data-compression-a') || '';
                 const compressionB = token.getAttribute('data-compression-b') || '';
                     <div><span class="label">Compression A:</span> <span class="loss-a">${{compressionA || '(empty)'}}</span></div>
                     <div><span class="label">Compression B:</span> <span class="loss-b">${{compressionB || '(empty)'}}</span></div>
                     <hr style="border-color: #555; margin: 6px 0;">
+                    <div><span class="label">Model A:</span> <span class="model-a">${{modelA || '(empty)'}}</span></div>
+                    <div><span class="label">Model B:</span> <span class="model-b">${{modelB || '(empty)'}}</span></div>
                 `;
                 if (top5A || top5B) {{
                     tooltipHtml += '<div class="topk-section"><div class="topk-container">';