Jellyfish042 Claude Sonnet 4.5 committed on
Commit
8629056
·
1 Parent(s): 24f1b39

Fix token display to show actual token IDs instead of positions

Browse files

- Show actual model token IDs from topk_predictions
- Decode token IDs using the correct tokenizer
- Display format: [token_id] 'decoded_text'
- Add a fallback to the display tokenizer info if the token ID is unavailable

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. visualization/html_generator.py +28 -2
visualization/html_generator.py CHANGED
@@ -359,8 +359,34 @@ def generate_comparison_html(
359
  byte_start = token["byte_start"]
360
  byte_end = token["byte_end"]
361
 
362
- qwen_info = ", ".join([f"[{idx}] {repr(s)}" for idx, s in token["qwen_tokens"]])
363
- rwkv_info = ", ".join([f"[{idx}] {repr(s)}" for idx, s in token["rwkv_tokens"]])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
365
  raw_bytes = list(text_bytes[byte_start:byte_end])
366
  losses_a = byte_losses_a[byte_start:byte_end]
 
359
  byte_start = token["byte_start"]
360
  byte_end = token["byte_end"]
361
 
362
+ # Get actual model token IDs for this byte range
363
+ model_a_token_idx = find_token_for_byte(byte_start, model_a_token_ranges)
364
+ model_b_token_idx = find_token_for_byte(byte_start, model_b_token_ranges)
365
+
366
+ # Build token info strings showing actual token IDs from the models
367
+ qwen_info_parts = []
368
+ rwkv_info_parts = []
369
+
370
+ # For Model A (Qwen), show the actual token ID if available
371
+ if model_a_token_idx is not None and topk_predictions_a and model_a_token_idx < len(topk_predictions_a):
372
+ actual_token_id = topk_predictions_a[model_a_token_idx][0]
373
+ token_text_decoded = decode_token(actual_token_id, tokenizer_a, model_type_a)
374
+ qwen_info_parts.append(f"[{actual_token_id}] {repr(token_text_decoded)}")
375
+ else:
376
+ # Fallback to display tokenizer info
377
+ qwen_info_parts = [f"[{idx}] {repr(s)}" for idx, s in token["qwen_tokens"]]
378
+
379
+ # For Model B (RWKV), show the actual token ID if available
380
+ if model_b_token_idx is not None and topk_predictions_b and model_b_token_idx < len(topk_predictions_b):
381
+ actual_token_id = topk_predictions_b[model_b_token_idx][0]
382
+ token_text_decoded = decode_token(actual_token_id, tokenizer_b, model_type_b)
383
+ rwkv_info_parts.append(f"[{actual_token_id}] {repr(token_text_decoded)}")
384
+ else:
385
+ # Fallback to display tokenizer info
386
+ rwkv_info_parts = [f"[{idx}] {repr(s)}" for idx, s in token["rwkv_tokens"]]
387
+
388
+ qwen_info = ", ".join(qwen_info_parts)
389
+ rwkv_info = ", ".join(rwkv_info_parts)
390
 
391
  raw_bytes = list(text_bytes[byte_start:byte_end])
392
  losses_a = byte_losses_a[byte_start:byte_end]