Jellyfish042 Claude Sonnet 4.5 committed on
Commit
951e8dd
·
1 Parent(s): 52ba00f

Add comprehensive debug logging for Top 10 predictions

Browse files

- Add debug prints in app.py to track evaluation results
- Log tokenizer types and top5_predictions data structure
- Add detailed logging in html_generator.py for token mapping
- Track topk JSON generation process with error details
- Print pred structure and decoding attempts for debugging

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +21 -3
  2. visualization/html_generator.py +45 -8
app.py CHANGED
@@ -199,24 +199,41 @@ def run_evaluation(text: str, progress=gr.Progress()):
199
 
200
  try:
201
  # Step 1: Evaluate Qwen (using cached model)
202
- progress(0, desc="Evaluating with Qwen3...")
203
  result_qwen = evaluate_hf_single_sample(
204
  _qwen_model,
205
  _qwen_tokenizer,
206
  text,
207
  bos_mode="add_newline_token"
208
  )
 
 
 
 
 
 
 
209
 
210
  # Step 2: Evaluate RWKV7 (using cached model)
211
- progress(0, desc="Evaluating with RWKV7...")
212
  result_rwkv = evaluate_rwkv7_single_sample(
213
  _rwkv_model,
214
  _rwkv_tokenizer,
215
  text
216
  )
 
 
 
 
 
 
 
217
 
218
  # Step 3: Generate visualization
219
- progress(0, desc="Generating visualization...")
 
 
 
220
  html = generate_comparison_html(
221
  text=text,
222
  byte_losses_a=result_qwen["byte_wise_losses"],
@@ -230,6 +247,7 @@ def run_evaluation(text: str, progress=gr.Progress()):
230
  model_type_a="hf",
231
  model_type_b="rwkv7"
232
  )
 
233
 
234
  # Wrap HTML for iframe display
235
  wrapped_html = wrap_html_in_iframe(html)
 
199
 
200
  try:
201
  # Step 1: Evaluate Qwen (using cached model)
202
+ progress(desc="Evaluating with Qwen3...")
203
  result_qwen = evaluate_hf_single_sample(
204
  _qwen_model,
205
  _qwen_tokenizer,
206
  text,
207
  bos_mode="add_newline_token"
208
  )
209
+ print(f"[DEBUG] Qwen evaluation complete")
210
+ print(f"[DEBUG] Qwen top5_predictions type: {type(result_qwen.get('top5_predictions'))}")
211
+ print(f"[DEBUG] Qwen top5_predictions length: {len(result_qwen.get('top5_predictions', []))}")
212
+ if result_qwen.get('top5_predictions'):
213
+ print(f"[DEBUG] Qwen first prediction sample: {result_qwen['top5_predictions'][0]}")
214
+ print(f"[DEBUG] Qwen tokenizer type: {type(result_qwen.get('tokenizer'))}")
215
+ print(f"[DEBUG] Qwen tokenizer: {result_qwen.get('tokenizer')}")
216
 
217
  # Step 2: Evaluate RWKV7 (using cached model)
218
+ progress(desc="Evaluating with RWKV7...")
219
  result_rwkv = evaluate_rwkv7_single_sample(
220
  _rwkv_model,
221
  _rwkv_tokenizer,
222
  text
223
  )
224
+ print(f"[DEBUG] RWKV evaluation complete")
225
+ print(f"[DEBUG] RWKV top5_predictions type: {type(result_rwkv.get('top5_predictions'))}")
226
+ print(f"[DEBUG] RWKV top5_predictions length: {len(result_rwkv.get('top5_predictions', []))}")
227
+ if result_rwkv.get('top5_predictions'):
228
+ print(f"[DEBUG] RWKV first prediction sample: {result_rwkv['top5_predictions'][0]}")
229
+ print(f"[DEBUG] RWKV tokenizer type: {type(result_rwkv.get('tokenizer'))}")
230
+ print(f"[DEBUG] RWKV tokenizer: {result_rwkv.get('tokenizer')}")
231
 
232
  # Step 3: Generate visualization
233
+ progress(desc="Generating visualization...")
234
+ print(f"[DEBUG] Starting HTML generation...")
235
+ print(f"[DEBUG] Passing tokenizer_a: {result_qwen['tokenizer']}")
236
+ print(f"[DEBUG] Passing tokenizer_b: {result_rwkv['tokenizer']}")
237
  html = generate_comparison_html(
238
  text=text,
239
  byte_losses_a=result_qwen["byte_wise_losses"],
 
247
  model_type_a="hf",
248
  model_type_b="rwkv7"
249
  )
250
+ print(f"[DEBUG] HTML generation complete")
251
 
252
  # Wrap HTML for iframe display
253
  wrapped_html = wrap_html_in_iframe(html)
visualization/html_generator.py CHANGED
@@ -274,6 +274,25 @@ def generate_comparison_html(
274
  model_a_token_ranges = build_byte_to_token_map(text, tokenizer_a, model_type_a)
275
  model_b_token_ranges = build_byte_to_token_map(text, tokenizer_b, model_type_b)
276
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  def get_tokens_for_range(byte_start, byte_end, token_list):
278
  result = []
279
  for idx, (t_start, t_end, t_str) in enumerate(token_list):
@@ -356,18 +375,36 @@ def generate_comparison_html(
356
  model_a_token_idx = find_token_for_byte(byte_start, model_a_token_ranges)
357
  if model_a_token_idx is not None and model_a_token_idx < len(topk_predictions_a):
358
  pred = topk_predictions_a[model_a_token_idx]
359
- decoded_pred = [
360
- pred[0],
361
- pred[1],
362
- [[tid, prob, decode_token(tid, tokenizer_a, model_type_a)] for tid, prob in pred[2]],
363
- ]
364
- topk_a_json = json.dumps(decoded_pred, ensure_ascii=False)
 
 
 
 
 
 
 
 
 
365
  if topk_predictions_b is not None and model_b_token_ranges:
366
  model_b_token_idx = find_token_for_byte(byte_start, model_b_token_ranges)
367
  if model_b_token_idx is not None and model_b_token_idx < len(topk_predictions_b):
368
  pred = topk_predictions_b[model_b_token_idx]
369
- decoded_pred = [pred[0], pred[1], [[tid, prob, decode_token(tid, tokenizer_b, model_type_b)] for tid, prob in pred[2]]]
370
- topk_b_json = json.dumps(decoded_pred, ensure_ascii=False)
 
 
 
 
 
 
 
 
 
371
 
372
  token_deltas = deltas[byte_start:byte_end]
373
  avg_token_delta = sum(token_deltas) / len(token_deltas) if token_deltas else 0
 
274
  model_a_token_ranges = build_byte_to_token_map(text, tokenizer_a, model_type_a)
275
  model_b_token_ranges = build_byte_to_token_map(text, tokenizer_b, model_type_b)
276
 
277
+ print(f"[DEBUG HTML] tokenizer_a: {tokenizer_a}")
278
+ print(f"[DEBUG HTML] tokenizer_b: {tokenizer_b}")
279
+ print(f"[DEBUG HTML] model_type_a: {model_type_a}")
280
+ print(f"[DEBUG HTML] model_type_b: {model_type_b}")
281
+ print(f"[DEBUG HTML] model_a_token_ranges length: {len(model_a_token_ranges)}")
282
+ print(f"[DEBUG HTML] model_b_token_ranges length: {len(model_b_token_ranges)}")
283
+ if model_a_token_ranges:
284
+ print(f"[DEBUG HTML] model_a first token range: {model_a_token_ranges[0]}")
285
+ if model_b_token_ranges:
286
+ print(f"[DEBUG HTML] model_b first token range: {model_b_token_ranges[0]}")
287
+ print(f"[DEBUG HTML] topk_predictions_a type: {type(topk_predictions_a)}")
288
+ print(f"[DEBUG HTML] topk_predictions_b type: {type(topk_predictions_b)}")
289
+ if topk_predictions_a:
290
+ print(f"[DEBUG HTML] topk_predictions_a length: {len(topk_predictions_a)}")
291
+ print(f"[DEBUG HTML] topk_predictions_a[0]: {topk_predictions_a[0]}")
292
+ if topk_predictions_b:
293
+ print(f"[DEBUG HTML] topk_predictions_b length: {len(topk_predictions_b)}")
294
+ print(f"[DEBUG HTML] topk_predictions_b[0]: {topk_predictions_b[0]}")
295
+
296
  def get_tokens_for_range(byte_start, byte_end, token_list):
297
  result = []
298
  for idx, (t_start, t_end, t_str) in enumerate(token_list):
 
375
  model_a_token_idx = find_token_for_byte(byte_start, model_a_token_ranges)
376
  if model_a_token_idx is not None and model_a_token_idx < len(topk_predictions_a):
377
  pred = topk_predictions_a[model_a_token_idx]
378
+ print(f"[DEBUG] Processing token at byte {byte_start}, model_a_token_idx={model_a_token_idx}")
379
+ print(f"[DEBUG] pred structure: {pred}")
380
+ try:
381
+ decoded_pred = [
382
+ pred[0],
383
+ pred[1],
384
+ [[tid, prob, decode_token(tid, tokenizer_a, model_type_a)] for tid, prob in pred[2]],
385
+ ]
386
+ topk_a_json = json.dumps(decoded_pred, ensure_ascii=False)
387
+ print(f"[DEBUG] Successfully generated topk_a_json")
388
+ except Exception as e:
389
+ print(f"[DEBUG] Error generating topk_a_json: {e}")
390
+ print(f"[DEBUG] pred[2] type: {type(pred[2])}")
391
+ if len(pred) > 2:
392
+ print(f"[DEBUG] pred[2] content: {pred[2][:3]}") # First 3 items
393
  if topk_predictions_b is not None and model_b_token_ranges:
394
  model_b_token_idx = find_token_for_byte(byte_start, model_b_token_ranges)
395
  if model_b_token_idx is not None and model_b_token_idx < len(topk_predictions_b):
396
  pred = topk_predictions_b[model_b_token_idx]
397
+ print(f"[DEBUG] Processing token at byte {byte_start}, model_b_token_idx={model_b_token_idx}")
398
+ print(f"[DEBUG] pred structure: {pred}")
399
+ try:
400
+ decoded_pred = [pred[0], pred[1], [[tid, prob, decode_token(tid, tokenizer_b, model_type_b)] for tid, prob in pred[2]]]
401
+ topk_b_json = json.dumps(decoded_pred, ensure_ascii=False)
402
+ print(f"[DEBUG] Successfully generated topk_b_json")
403
+ except Exception as e:
404
+ print(f"[DEBUG] Error generating topk_b_json: {e}")
405
+ print(f"[DEBUG] pred[2] type: {type(pred[2])}")
406
+ if len(pred) > 2:
407
+ print(f"[DEBUG] pred[2] content: {pred[2][:3]}") # First 3 items
408
 
409
  token_deltas = deltas[byte_start:byte_end]
410
  avg_token_delta = sum(token_deltas) / len(token_deltas) if token_deltas else 0