Spaces:
Sleeping
Sleeping
Commit ·
8629056
1
Parent(s): 24f1b39
Fix token display to show actual token IDs instead of positions
Browse files
- Show actual model token IDs from topk_predictions
- Decode token IDs using the correct tokenizer
- Display format: [token_id] 'decoded_text'
- Add fallback to display tokenizer info if token ID unavailable
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
visualization/html_generator.py
CHANGED
|
@@ -359,8 +359,34 @@ def generate_comparison_html(
|
|
| 359 |
byte_start = token["byte_start"]
|
| 360 |
byte_end = token["byte_end"]
|
| 361 |
|
| 362 |
-
|
| 363 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
raw_bytes = list(text_bytes[byte_start:byte_end])
|
| 366 |
losses_a = byte_losses_a[byte_start:byte_end]
|
|
|
|
| 359 |
byte_start = token["byte_start"]
|
| 360 |
byte_end = token["byte_end"]
|
| 361 |
|
| 362 |
+
# Get actual model token IDs for this byte range
|
| 363 |
+
model_a_token_idx = find_token_for_byte(byte_start, model_a_token_ranges)
|
| 364 |
+
model_b_token_idx = find_token_for_byte(byte_start, model_b_token_ranges)
|
| 365 |
+
|
| 366 |
+
# Build token info strings showing actual token IDs from the models
|
| 367 |
+
qwen_info_parts = []
|
| 368 |
+
rwkv_info_parts = []
|
| 369 |
+
|
| 370 |
+
# For Model A (Qwen), show the actual token ID if available
|
| 371 |
+
if model_a_token_idx is not None and topk_predictions_a and model_a_token_idx < len(topk_predictions_a):
|
| 372 |
+
actual_token_id = topk_predictions_a[model_a_token_idx][0]
|
| 373 |
+
token_text_decoded = decode_token(actual_token_id, tokenizer_a, model_type_a)
|
| 374 |
+
qwen_info_parts.append(f"[{actual_token_id}] {repr(token_text_decoded)}")
|
| 375 |
+
else:
|
| 376 |
+
# Fallback to display tokenizer info
|
| 377 |
+
qwen_info_parts = [f"[{idx}] {repr(s)}" for idx, s in token["qwen_tokens"]]
|
| 378 |
+
|
| 379 |
+
# For Model B (RWKV), show the actual token ID if available
|
| 380 |
+
if model_b_token_idx is not None and topk_predictions_b and model_b_token_idx < len(topk_predictions_b):
|
| 381 |
+
actual_token_id = topk_predictions_b[model_b_token_idx][0]
|
| 382 |
+
token_text_decoded = decode_token(actual_token_id, tokenizer_b, model_type_b)
|
| 383 |
+
rwkv_info_parts.append(f"[{actual_token_id}] {repr(token_text_decoded)}")
|
| 384 |
+
else:
|
| 385 |
+
# Fallback to display tokenizer info
|
| 386 |
+
rwkv_info_parts = [f"[{idx}] {repr(s)}" for idx, s in token["rwkv_tokens"]]
|
| 387 |
+
|
| 388 |
+
qwen_info = ", ".join(qwen_info_parts)
|
| 389 |
+
rwkv_info = ", ".join(rwkv_info_parts)
|
| 390 |
|
| 391 |
raw_bytes = list(text_bytes[byte_start:byte_end])
|
| 392 |
losses_a = byte_losses_a[byte_start:byte_end]
|