Spaces:
Running
Commit
· 951e8dd
1 Parent(s): 52ba00f
Add comprehensive debug logging for Top 10 predictions
Browse files
- Add debug prints in app.py to track evaluation results
- Log tokenizer types and top5_predictions data structure
- Add detailed logging in html_generator.py for token mapping
- Track topk JSON generation process with error details
- Print pred structure and decoding attempts for debugging
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
- app.py +21 -3
- visualization/html_generator.py +45 -8
app.py
CHANGED
|
@@ -199,24 +199,41 @@ def run_evaluation(text: str, progress=gr.Progress()):
|
|
| 199 |
|
| 200 |
try:
|
| 201 |
# Step 1: Evaluate Qwen (using cached model)
|
| 202 |
-
progress(
|
| 203 |
result_qwen = evaluate_hf_single_sample(
|
| 204 |
_qwen_model,
|
| 205 |
_qwen_tokenizer,
|
| 206 |
text,
|
| 207 |
bos_mode="add_newline_token"
|
| 208 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
# Step 2: Evaluate RWKV7 (using cached model)
|
| 211 |
-
progress(
|
| 212 |
result_rwkv = evaluate_rwkv7_single_sample(
|
| 213 |
_rwkv_model,
|
| 214 |
_rwkv_tokenizer,
|
| 215 |
text
|
| 216 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
# Step 3: Generate visualization
|
| 219 |
-
progress(
|
|
|
|
|
|
|
|
|
|
| 220 |
html = generate_comparison_html(
|
| 221 |
text=text,
|
| 222 |
byte_losses_a=result_qwen["byte_wise_losses"],
|
|
@@ -230,6 +247,7 @@ def run_evaluation(text: str, progress=gr.Progress()):
|
|
| 230 |
model_type_a="hf",
|
| 231 |
model_type_b="rwkv7"
|
| 232 |
)
|
|
|
|
| 233 |
|
| 234 |
# Wrap HTML for iframe display
|
| 235 |
wrapped_html = wrap_html_in_iframe(html)
|
|
|
|
| 199 |
|
| 200 |
try:
|
| 201 |
# Step 1: Evaluate Qwen (using cached model)
|
| 202 |
+
progress(desc="Evaluating with Qwen3...")
|
| 203 |
result_qwen = evaluate_hf_single_sample(
|
| 204 |
_qwen_model,
|
| 205 |
_qwen_tokenizer,
|
| 206 |
text,
|
| 207 |
bos_mode="add_newline_token"
|
| 208 |
)
|
| 209 |
+
print(f"[DEBUG] Qwen evaluation complete")
|
| 210 |
+
print(f"[DEBUG] Qwen top5_predictions type: {type(result_qwen.get('top5_predictions'))}")
|
| 211 |
+
print(f"[DEBUG] Qwen top5_predictions length: {len(result_qwen.get('top5_predictions', []))}")
|
| 212 |
+
if result_qwen.get('top5_predictions'):
|
| 213 |
+
print(f"[DEBUG] Qwen first prediction sample: {result_qwen['top5_predictions'][0]}")
|
| 214 |
+
print(f"[DEBUG] Qwen tokenizer type: {type(result_qwen.get('tokenizer'))}")
|
| 215 |
+
print(f"[DEBUG] Qwen tokenizer: {result_qwen.get('tokenizer')}")
|
| 216 |
|
| 217 |
# Step 2: Evaluate RWKV7 (using cached model)
|
| 218 |
+
progress(desc="Evaluating with RWKV7...")
|
| 219 |
result_rwkv = evaluate_rwkv7_single_sample(
|
| 220 |
_rwkv_model,
|
| 221 |
_rwkv_tokenizer,
|
| 222 |
text
|
| 223 |
)
|
| 224 |
+
print(f"[DEBUG] RWKV evaluation complete")
|
| 225 |
+
print(f"[DEBUG] RWKV top5_predictions type: {type(result_rwkv.get('top5_predictions'))}")
|
| 226 |
+
print(f"[DEBUG] RWKV top5_predictions length: {len(result_rwkv.get('top5_predictions', []))}")
|
| 227 |
+
if result_rwkv.get('top5_predictions'):
|
| 228 |
+
print(f"[DEBUG] RWKV first prediction sample: {result_rwkv['top5_predictions'][0]}")
|
| 229 |
+
print(f"[DEBUG] RWKV tokenizer type: {type(result_rwkv.get('tokenizer'))}")
|
| 230 |
+
print(f"[DEBUG] RWKV tokenizer: {result_rwkv.get('tokenizer')}")
|
| 231 |
|
| 232 |
# Step 3: Generate visualization
|
| 233 |
+
progress(desc="Generating visualization...")
|
| 234 |
+
print(f"[DEBUG] Starting HTML generation...")
|
| 235 |
+
print(f"[DEBUG] Passing tokenizer_a: {result_qwen['tokenizer']}")
|
| 236 |
+
print(f"[DEBUG] Passing tokenizer_b: {result_rwkv['tokenizer']}")
|
| 237 |
html = generate_comparison_html(
|
| 238 |
text=text,
|
| 239 |
byte_losses_a=result_qwen["byte_wise_losses"],
|
|
|
|
| 247 |
model_type_a="hf",
|
| 248 |
model_type_b="rwkv7"
|
| 249 |
)
|
| 250 |
+
print(f"[DEBUG] HTML generation complete")
|
| 251 |
|
| 252 |
# Wrap HTML for iframe display
|
| 253 |
wrapped_html = wrap_html_in_iframe(html)
|
visualization/html_generator.py
CHANGED
|
@@ -274,6 +274,25 @@ def generate_comparison_html(
|
|
| 274 |
model_a_token_ranges = build_byte_to_token_map(text, tokenizer_a, model_type_a)
|
| 275 |
model_b_token_ranges = build_byte_to_token_map(text, tokenizer_b, model_type_b)
|
| 276 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
def get_tokens_for_range(byte_start, byte_end, token_list):
|
| 278 |
result = []
|
| 279 |
for idx, (t_start, t_end, t_str) in enumerate(token_list):
|
|
@@ -356,18 +375,36 @@ def generate_comparison_html(
|
|
| 356 |
model_a_token_idx = find_token_for_byte(byte_start, model_a_token_ranges)
|
| 357 |
if model_a_token_idx is not None and model_a_token_idx < len(topk_predictions_a):
|
| 358 |
pred = topk_predictions_a[model_a_token_idx]
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
if topk_predictions_b is not None and model_b_token_ranges:
|
| 366 |
model_b_token_idx = find_token_for_byte(byte_start, model_b_token_ranges)
|
| 367 |
if model_b_token_idx is not None and model_b_token_idx < len(topk_predictions_b):
|
| 368 |
pred = topk_predictions_b[model_b_token_idx]
|
| 369 |
-
|
| 370 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
|
| 372 |
token_deltas = deltas[byte_start:byte_end]
|
| 373 |
avg_token_delta = sum(token_deltas) / len(token_deltas) if token_deltas else 0
|
|
|
|
| 274 |
model_a_token_ranges = build_byte_to_token_map(text, tokenizer_a, model_type_a)
|
| 275 |
model_b_token_ranges = build_byte_to_token_map(text, tokenizer_b, model_type_b)
|
| 276 |
|
| 277 |
+
print(f"[DEBUG HTML] tokenizer_a: {tokenizer_a}")
|
| 278 |
+
print(f"[DEBUG HTML] tokenizer_b: {tokenizer_b}")
|
| 279 |
+
print(f"[DEBUG HTML] model_type_a: {model_type_a}")
|
| 280 |
+
print(f"[DEBUG HTML] model_type_b: {model_type_b}")
|
| 281 |
+
print(f"[DEBUG HTML] model_a_token_ranges length: {len(model_a_token_ranges)}")
|
| 282 |
+
print(f"[DEBUG HTML] model_b_token_ranges length: {len(model_b_token_ranges)}")
|
| 283 |
+
if model_a_token_ranges:
|
| 284 |
+
print(f"[DEBUG HTML] model_a first token range: {model_a_token_ranges[0]}")
|
| 285 |
+
if model_b_token_ranges:
|
| 286 |
+
print(f"[DEBUG HTML] model_b first token range: {model_b_token_ranges[0]}")
|
| 287 |
+
print(f"[DEBUG HTML] topk_predictions_a type: {type(topk_predictions_a)}")
|
| 288 |
+
print(f"[DEBUG HTML] topk_predictions_b type: {type(topk_predictions_b)}")
|
| 289 |
+
if topk_predictions_a:
|
| 290 |
+
print(f"[DEBUG HTML] topk_predictions_a length: {len(topk_predictions_a)}")
|
| 291 |
+
print(f"[DEBUG HTML] topk_predictions_a[0]: {topk_predictions_a[0]}")
|
| 292 |
+
if topk_predictions_b:
|
| 293 |
+
print(f"[DEBUG HTML] topk_predictions_b length: {len(topk_predictions_b)}")
|
| 294 |
+
print(f"[DEBUG HTML] topk_predictions_b[0]: {topk_predictions_b[0]}")
|
| 295 |
+
|
| 296 |
def get_tokens_for_range(byte_start, byte_end, token_list):
|
| 297 |
result = []
|
| 298 |
for idx, (t_start, t_end, t_str) in enumerate(token_list):
|
|
|
|
| 375 |
model_a_token_idx = find_token_for_byte(byte_start, model_a_token_ranges)
|
| 376 |
if model_a_token_idx is not None and model_a_token_idx < len(topk_predictions_a):
|
| 377 |
pred = topk_predictions_a[model_a_token_idx]
|
| 378 |
+
print(f"[DEBUG] Processing token at byte {byte_start}, model_a_token_idx={model_a_token_idx}")
|
| 379 |
+
print(f"[DEBUG] pred structure: {pred}")
|
| 380 |
+
try:
|
| 381 |
+
decoded_pred = [
|
| 382 |
+
pred[0],
|
| 383 |
+
pred[1],
|
| 384 |
+
[[tid, prob, decode_token(tid, tokenizer_a, model_type_a)] for tid, prob in pred[2]],
|
| 385 |
+
]
|
| 386 |
+
topk_a_json = json.dumps(decoded_pred, ensure_ascii=False)
|
| 387 |
+
print(f"[DEBUG] Successfully generated topk_a_json")
|
| 388 |
+
except Exception as e:
|
| 389 |
+
print(f"[DEBUG] Error generating topk_a_json: {e}")
|
| 390 |
+
print(f"[DEBUG] pred[2] type: {type(pred[2])}")
|
| 391 |
+
if len(pred) > 2:
|
| 392 |
+
print(f"[DEBUG] pred[2] content: {pred[2][:3]}") # First 3 items
|
| 393 |
if topk_predictions_b is not None and model_b_token_ranges:
|
| 394 |
model_b_token_idx = find_token_for_byte(byte_start, model_b_token_ranges)
|
| 395 |
if model_b_token_idx is not None and model_b_token_idx < len(topk_predictions_b):
|
| 396 |
pred = topk_predictions_b[model_b_token_idx]
|
| 397 |
+
print(f"[DEBUG] Processing token at byte {byte_start}, model_b_token_idx={model_b_token_idx}")
|
| 398 |
+
print(f"[DEBUG] pred structure: {pred}")
|
| 399 |
+
try:
|
| 400 |
+
decoded_pred = [pred[0], pred[1], [[tid, prob, decode_token(tid, tokenizer_b, model_type_b)] for tid, prob in pred[2]]]
|
| 401 |
+
topk_b_json = json.dumps(decoded_pred, ensure_ascii=False)
|
| 402 |
+
print(f"[DEBUG] Successfully generated topk_b_json")
|
| 403 |
+
except Exception as e:
|
| 404 |
+
print(f"[DEBUG] Error generating topk_b_json: {e}")
|
| 405 |
+
print(f"[DEBUG] pred[2] type: {type(pred[2])}")
|
| 406 |
+
if len(pred) > 2:
|
| 407 |
+
print(f"[DEBUG] pred[2] content: {pred[2][:3]}") # First 3 items
|
| 408 |
|
| 409 |
token_deltas = deltas[byte_start:byte_end]
|
| 410 |
avg_token_delta = sum(token_deltas) / len(token_deltas) if token_deltas else 0
|