Luigi committed on
Commit
c0889b4
·
1 Parent(s): 451a43d

feat: Add comprehensive generation metrics to JSON export

Browse files

- Added timing: time_to_first_token, total_processing_time, model_load_time
- Added tokens: n_ctx, input_tokens, output_tokens, thinking_tokens, total_tokens
- Added performance: generation_speed_tps, prefill_speed_tps
- Added file_info: filename, size_bytes, original_char_count
- Added truncation_info: was_truncated, original/final char counts
- Updated download_summary_json with organized metrics structure

Files changed (1) hide show
  1. app.py +165 -24
app.py CHANGED
@@ -291,7 +291,7 @@ def update_reasoning_visibility(model_key):
291
  return gr.update(visible=supports_toggle)
292
 
293
 
294
- def download_summary_json(summary, thinking, model_key, language):
295
  """Generate JSON file with summary and metadata."""
296
  import json
297
  from datetime import datetime
@@ -307,6 +307,32 @@ def download_summary_json(summary, thinking, model_key, language):
307
  "summary": summary
308
  }
309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  filename = f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
311
  with open(filename, 'w', encoding='utf-8') as f:
312
  json.dump(data, f, ensure_ascii=False, indent=2)
@@ -425,10 +451,10 @@ def summarize_streaming(
425
  top_p: float = None,
426
  top_k: int = None,
427
  output_language: str = "en",
428
- ) -> Generator[Tuple[str, str, str], None, None]:
429
  """
430
  Stream summary generation from uploaded file.
431
-
432
  Args:
433
  file_obj: Gradio file object
434
  model_key: Model identifier from AVAILABLE_MODELS
@@ -437,10 +463,29 @@ def summarize_streaming(
437
  top_p: Nucleus sampling parameter (uses model default if None)
438
  top_k: Top-k sampling parameter (uses model default if None)
439
  output_language: Target language for summary ("en" or "zh-TW")
440
-
441
  Yields:
442
- Tuple of (thinking_text, summary_text, info_text)
443
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  global llm, converter
445
 
446
  model = AVAILABLE_MODELS[model_key]
@@ -453,26 +498,50 @@ def summarize_streaming(
453
  # Read uploaded file
454
  try:
455
  path = file_obj.name if hasattr(file_obj, 'name') else file_obj
 
 
 
 
 
456
  with open(path, 'r', encoding='utf-8') as f:
457
  transcript = f.read()
 
 
 
 
 
 
 
458
  except Exception as e:
459
- yield ("", f"Error reading file: {e}", "")
460
  return
461
-
462
  if not transcript.strip():
463
- yield ("", "Error: File is empty", "")
464
  return
465
-
466
  # Calculate context and check truncation
467
  n_ctx, warning = calculate_n_ctx(model_key, transcript, max_tokens)
468
-
 
469
  # Truncate if needed (estimate max chars from available tokens)
470
  available_tokens = usable_max - max_tokens - 512
471
  max_bytes = available_tokens * 3 # Reverse estimate: tokens * 3 bytes
472
  encoded = transcript.encode('utf-8')
473
- if len(encoded) > max_bytes:
 
 
 
474
  transcript = encoded[:max_bytes].decode('utf-8', errors='ignore')
475
  transcript += "\n\n[Content truncated to fit model context]"
 
 
 
 
 
 
 
 
476
 
477
  # Build info text
478
  input_tokens = estimate_tokens(transcript)
@@ -485,12 +554,14 @@ def summarize_streaming(
485
  if warning:
486
  info += f"\n\n{warning}"
487
 
488
- # Load model (no-op if already loaded)
 
489
  try:
490
  llm, load_msg = load_model(model_key)
491
  logger.info(load_msg)
 
492
  except Exception as e:
493
- yield ("", f"Error loading model: {e}", "")
494
  return
495
 
496
  # Prepare system prompt with reasoning toggle for Qwen3 models
@@ -537,6 +608,29 @@ def summarize_streaming(
537
  current_summary = ""
538
 
539
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
  # Apply model-specific inference settings
541
  stream = llm.create_chat_completion(
542
  messages=messages,
@@ -548,32 +642,76 @@ def summarize_streaming(
548
  repeat_penalty=repeat_penalty,
549
  stream=True,
550
  )
551
-
 
 
552
  for chunk in stream:
553
  if 'choices' in chunk and len(chunk['choices']) > 0:
554
  delta = chunk['choices'][0].get('delta', {})
555
  content = delta.get('content', '')
556
  if content:
 
 
 
 
 
 
 
557
  if output_language == "zh-TW":
558
  converted = converter.convert(content)
559
  full_response += converted
560
  else:
561
  full_response += content
562
-
563
  thinking, summary = parse_thinking_blocks(full_response, streaming=True)
564
  current_thinking = thinking or ""
565
  current_summary = summary or ""
566
- yield (current_thinking, current_summary, info)
567
-
568
- # Final parse
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
569
  thinking, summary = parse_thinking_blocks(full_response)
570
- yield (thinking or "", summary or "", info)
571
-
 
 
 
 
 
 
 
 
572
  llm.reset()
573
-
574
  except Exception as e:
575
  logger.error(f"Generation error: {e}")
576
- yield (current_thinking, current_summary + f"\n\nError: {e}", info)
 
577
 
578
 
579
  # Custom CSS for better UI
@@ -853,6 +991,9 @@ def create_interface():
853
  variant="primary",
854
  elem_classes=["submit-btn"]
855
  )
 
 
 
856
 
857
  # Model info section (dynamic)
858
  with gr.Group():
@@ -894,7 +1035,7 @@ def create_interface():
894
  submit_btn.click(
895
  fn=summarize_streaming,
896
  inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector],
897
- outputs=[thinking_output, summary_output, info_output],
898
  show_progress="full"
899
  )
900
 
@@ -922,7 +1063,7 @@ def create_interface():
922
  # Download button
923
  download_btn.click(
924
  fn=download_summary_json,
925
- inputs=[summary_output, thinking_output, model_dropdown, language_selector],
926
  outputs=[gr.File(label="Download")]
927
  )
928
 
 
291
  return gr.update(visible=supports_toggle)
292
 
293
 
294
+ def download_summary_json(summary, thinking, model_key, language, metrics):
295
  """Generate JSON file with summary and metadata."""
296
  import json
297
  from datetime import datetime
 
307
  "summary": summary
308
  }
309
 
310
+ # Add generation metrics if available
311
+ if metrics and isinstance(metrics, dict):
312
+ data["generation_metrics"] = {
313
+ "settings_used": metrics.get("settings", {}),
314
+ "timing": {
315
+ "time_to_first_token_ms": round(metrics.get("time_to_first_token_ms", 0), 2) if metrics.get("time_to_first_token_ms") else None,
316
+ "total_processing_time_ms": round(metrics.get("total_processing_time_ms", 0), 2) if metrics.get("total_processing_time_ms") else None,
317
+ "model_load_time_ms": round(metrics.get("model_load_time_ms", 0), 2) if metrics.get("model_load_time_ms") else None,
318
+ },
319
+ "tokens": {
320
+ "n_ctx": metrics.get("n_ctx"),
321
+ "input_tokens": metrics.get("input_tokens"),
322
+ "output_tokens": metrics.get("output_tokens"),
323
+ "thinking_tokens": metrics.get("thinking_tokens"),
324
+ "total_tokens": metrics.get("total_tokens"),
325
+ "generation_tokens": metrics.get("generation_tokens"),
326
+ "prefill_tokens": metrics.get("prefill_tokens")
327
+ },
328
+ "performance": {
329
+ "generation_speed_tps": round(metrics.get("generation_speed_tps", 0), 2) if metrics.get("generation_speed_tps") else None,
330
+ "prefill_speed_tps": round(metrics.get("prefill_speed_tps", 0), 2) if metrics.get("prefill_speed_tps") else None
331
+ },
332
+ "file_info": metrics.get("file_info", {}),
333
+ "truncation_info": metrics.get("truncation_info", {})
334
+ }
335
+
336
  filename = f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
337
  with open(filename, 'w', encoding='utf-8') as f:
338
  json.dump(data, f, ensure_ascii=False, indent=2)
 
451
  top_p: float = None,
452
  top_k: int = None,
453
  output_language: str = "en",
454
+ ) -> Generator[Tuple[str, str, str, dict], None, None]:
455
  """
456
  Stream summary generation from uploaded file.
457
+
458
  Args:
459
  file_obj: Gradio file object
460
  model_key: Model identifier from AVAILABLE_MODELS
 
463
  top_p: Nucleus sampling parameter (uses model default if None)
464
  top_k: Top-k sampling parameter (uses model default if None)
465
  output_language: Target language for summary ("en" or "zh-TW")
466
+
467
  Yields:
468
+ Tuple of (thinking_text, summary_text, info_text, metrics_dict)
469
  """
470
+ import time
471
+
472
+ metrics = {
473
+ "start_time": None,
474
+ "time_to_first_token_ms": None,
475
+ "generation_start_time": None,
476
+ "generation_end_time": None,
477
+ "model_load_time_ms": None,
478
+ "total_tokens": 0,
479
+ "generation_tokens": 0,
480
+ "prefill_tokens": 0,
481
+ "input_tokens": 0,
482
+ "output_tokens": 0,
483
+ "thinking_tokens": 0,
484
+ "n_ctx": 0,
485
+ "settings": {},
486
+ "file_info": {},
487
+ "truncation_info": {},
488
+ }
489
  global llm, converter
490
 
491
  model = AVAILABLE_MODELS[model_key]
 
498
  # Read uploaded file
499
  try:
500
  path = file_obj.name if hasattr(file_obj, 'name') else file_obj
501
+ # Get file metadata
502
+ import os
503
+ file_size = os.path.getsize(path)
504
+ file_name = os.path.basename(path)
505
+
506
  with open(path, 'r', encoding='utf-8') as f:
507
  transcript = f.read()
508
+
509
+ # Store file info
510
+ metrics["file_info"] = {
511
+ "filename": file_name,
512
+ "size_bytes": file_size,
513
+ "original_char_count": len(transcript),
514
+ }
515
  except Exception as e:
516
+ yield ("", f"Error reading file: {e}", "", metrics)
517
  return
518
+
519
  if not transcript.strip():
520
+ yield ("", "Error: File is empty", "", metrics)
521
  return
522
+
523
  # Calculate context and check truncation
524
  n_ctx, warning = calculate_n_ctx(model_key, transcript, max_tokens)
525
+ metrics["n_ctx"] = n_ctx
526
+
527
  # Truncate if needed (estimate max chars from available tokens)
528
  available_tokens = usable_max - max_tokens - 512
529
  max_bytes = available_tokens * 3 # Reverse estimate: tokens * 3 bytes
530
  encoded = transcript.encode('utf-8')
531
+ was_truncated = len(encoded) > max_bytes
532
+ original_length = len(transcript)
533
+
534
+ if was_truncated:
535
  transcript = encoded[:max_bytes].decode('utf-8', errors='ignore')
536
  transcript += "\n\n[Content truncated to fit model context]"
537
+
538
+ # Store truncation info
539
+ metrics["truncation_info"] = {
540
+ "was_truncated": was_truncated,
541
+ "original_char_count": original_length,
542
+ "final_char_count": len(transcript),
543
+ "original_token_estimate": estimate_tokens(transcript) if not was_truncated else estimate_tokens(encoded[:max_bytes].decode('utf-8', errors='ignore')),
544
+ }
545
 
546
  # Build info text
547
  input_tokens = estimate_tokens(transcript)
 
554
  if warning:
555
  info += f"\n\n{warning}"
556
 
557
+ # Load model (no-op if already loaded) with timing
558
+ model_load_start = time.time()
559
  try:
560
  llm, load_msg = load_model(model_key)
561
  logger.info(load_msg)
562
+ metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
563
  except Exception as e:
564
+ yield ("", f"Error loading model: {e}", "", metrics)
565
  return
566
 
567
  # Prepare system prompt with reasoning toggle for Qwen3 models
 
608
  current_summary = ""
609
 
610
  try:
611
+ # Record generation settings
612
+ metrics["settings"] = {
613
+ "model": model_key,
614
+ "max_tokens": max_tokens,
615
+ "temperature": effective_temperature,
616
+ "top_p": final_top_p,
617
+ "top_k": final_top_k,
618
+ "repeat_penalty": repeat_penalty,
619
+ "enable_reasoning": enable_reasoning,
620
+ "output_language": output_language,
621
+ "n_ctx": metrics["n_ctx"],
622
+ }
623
+
624
+ # Calculate exact input tokens (system + user prompts)
625
+ system_tokens = estimate_tokens(system_content)
626
+ user_tokens = estimate_tokens(user_content)
627
+ metrics["input_tokens"] = system_tokens + user_tokens
628
+
629
+ # Start timing
630
+ metrics["start_time"] = time.time()
631
+ first_token_time = None
632
+ token_count = 0
633
+
634
  # Apply model-specific inference settings
635
  stream = llm.create_chat_completion(
636
  messages=messages,
 
642
  repeat_penalty=repeat_penalty,
643
  stream=True,
644
  )
645
+
646
+ metrics["generation_start_time"] = time.time()
647
+
648
  for chunk in stream:
649
  if 'choices' in chunk and len(chunk['choices']) > 0:
650
  delta = chunk['choices'][0].get('delta', {})
651
  content = delta.get('content', '')
652
  if content:
653
+ # Track time to first token
654
+ if first_token_time is None:
655
+ first_token_time = time.time()
656
+ metrics["time_to_first_token_ms"] = (first_token_time - metrics["start_time"]) * 1000
657
+
658
+ token_count += 1
659
+
660
  if output_language == "zh-TW":
661
  converted = converter.convert(content)
662
  full_response += converted
663
  else:
664
  full_response += content
665
+
666
  thinking, summary = parse_thinking_blocks(full_response, streaming=True)
667
  current_thinking = thinking or ""
668
  current_summary = summary or ""
669
+ yield (current_thinking, current_summary, info, metrics)
670
+
671
+ # Final timing calculations
672
+ metrics["generation_end_time"] = time.time()
673
+ metrics["generation_tokens"] = token_count
674
+ metrics["total_tokens"] = token_count
675
+
676
+ # Calculate speeds
677
+ generation_duration = metrics["generation_end_time"] - metrics["generation_start_time"]
678
+ if generation_duration > 0:
679
+ metrics["generation_speed_tps"] = token_count / generation_duration
680
+ else:
681
+ metrics["generation_speed_tps"] = 0.0
682
+
683
+ # Prefill = time from start to first token
684
+ if metrics["time_to_first_token_ms"]:
685
+ prefill_seconds = metrics["time_to_first_token_ms"] / 1000
686
+ # Estimate prefill tokens (input tokens processed before first output)
687
+ input_tokens = estimate_tokens(transcript)
688
+ metrics["prefill_tokens"] = input_tokens
689
+ if prefill_seconds > 0:
690
+ metrics["prefill_speed_tps"] = input_tokens / prefill_seconds
691
+ else:
692
+ metrics["prefill_speed_tps"] = 0.0
693
+
694
+ # Total processing time
695
+ metrics["total_processing_time_ms"] = (metrics["generation_end_time"] - metrics["start_time"]) * 1000
696
+
697
+ # Final parse and token counts
698
  thinking, summary = parse_thinking_blocks(full_response)
699
+
700
+ # Calculate output tokens
701
+ metrics["output_tokens"] = estimate_tokens(summary) if summary else 0
702
+ metrics["thinking_tokens"] = estimate_tokens(thinking) if thinking else 0
703
+
704
+ # Update totals
705
+ metrics["total_tokens"] = metrics["input_tokens"] + metrics["output_tokens"] + metrics["thinking_tokens"]
706
+
707
+ yield (thinking or "", summary or "", info, metrics)
708
+
709
  llm.reset()
710
+
711
  except Exception as e:
712
  logger.error(f"Generation error: {e}")
713
+ metrics["error"] = str(e)
714
+ yield (current_thinking, current_summary + f"\n\nError: {e}", info, metrics)
715
 
716
 
717
  # Custom CSS for better UI
 
991
  variant="primary",
992
  elem_classes=["submit-btn"]
993
  )
994
+
995
+ # Hidden state to store generation metrics
996
+ metrics_state = gr.State(value={})
997
 
998
  # Model info section (dynamic)
999
  with gr.Group():
 
1035
  submit_btn.click(
1036
  fn=summarize_streaming,
1037
  inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector],
1038
+ outputs=[thinking_output, summary_output, info_output, metrics_state],
1039
  show_progress="full"
1040
  )
1041
 
 
1063
  # Download button
1064
  download_btn.click(
1065
  fn=download_summary_json,
1066
+ inputs=[summary_output, thinking_output, model_dropdown, language_selector, metrics_state],
1067
  outputs=[gr.File(label="Download")]
1068
  )
1069