Spaces:
Sleeping
Sleeping
Commit ·
8629056
1
Parent(s): 24f1b39
Fix token display to show actual token IDs instead of positions
Browse files
- Show actual model token IDs from topk_predictions
- Decode token IDs using the correct tokenizer
- Display format: [token_id] 'decoded_text'
- Add fallback to display tokenizer info if token ID unavailable
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
visualization/html_generator.py
CHANGED
|
@@ -359,8 +359,34 @@ def generate_comparison_html(
|
|
| 359 |
byte_start = token["byte_start"]
|
| 360 |
byte_end = token["byte_end"]
|
| 361 |
|
| 362 |
-
|
| 363 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
raw_bytes = list(text_bytes[byte_start:byte_end])
|
| 366 |
losses_a = byte_losses_a[byte_start:byte_end]
|
|
|
|
| 359 |
byte_start = token["byte_start"]
|
| 360 |
byte_end = token["byte_end"]
|
| 361 |
|
| 362 |
+
# Get actual model token IDs for this byte range
|
| 363 |
+
model_a_token_idx = find_token_for_byte(byte_start, model_a_token_ranges)
|
| 364 |
+
model_b_token_idx = find_token_for_byte(byte_start, model_b_token_ranges)
|
| 365 |
+
|
| 366 |
+
# Build token info strings showing actual token IDs from the models
|
| 367 |
+
qwen_info_parts = []
|
| 368 |
+
rwkv_info_parts = []
|
| 369 |
+
|
| 370 |
+
# For Model A (Qwen), show the actual token ID if available
|
| 371 |
+
if model_a_token_idx is not None and topk_predictions_a and model_a_token_idx < len(topk_predictions_a):
|
| 372 |
+
actual_token_id = topk_predictions_a[model_a_token_idx][0]
|
| 373 |
+
token_text_decoded = decode_token(actual_token_id, tokenizer_a, model_type_a)
|
| 374 |
+
qwen_info_parts.append(f"[{actual_token_id}] {repr(token_text_decoded)}")
|
| 375 |
+
else:
|
| 376 |
+
# Fallback to display tokenizer info
|
| 377 |
+
qwen_info_parts = [f"[{idx}] {repr(s)}" for idx, s in token["qwen_tokens"]]
|
| 378 |
+
|
| 379 |
+
# For Model B (RWKV), show the actual token ID if available
|
| 380 |
+
if model_b_token_idx is not None and topk_predictions_b and model_b_token_idx < len(topk_predictions_b):
|
| 381 |
+
actual_token_id = topk_predictions_b[model_b_token_idx][0]
|
| 382 |
+
token_text_decoded = decode_token(actual_token_id, tokenizer_b, model_type_b)
|
| 383 |
+
rwkv_info_parts.append(f"[{actual_token_id}] {repr(token_text_decoded)}")
|
| 384 |
+
else:
|
| 385 |
+
# Fallback to display tokenizer info
|
| 386 |
+
rwkv_info_parts = [f"[{idx}] {repr(s)}" for idx, s in token["rwkv_tokens"]]
|
| 387 |
+
|
| 388 |
+
qwen_info = ", ".join(qwen_info_parts)
|
| 389 |
+
rwkv_info = ", ".join(rwkv_info_parts)
|
| 390 |
|
| 391 |
raw_bytes = list(text_bytes[byte_start:byte_end])
|
| 392 |
losses_a = byte_losses_a[byte_start:byte_end]
|