Spaces:
Running
Running
feat: Add comprehensive generation metrics to JSON export
Browse files
- Added timing: time_to_first_token, total_processing_time, model_load_time
- Added tokens: n_ctx, input_tokens, output_tokens, thinking_tokens, total_tokens
- Added performance: generation_speed_tps, prefill_speed_tps
- Added file_info: filename, size_bytes, original_char_count
- Added truncation_info: was_truncated, original/final char counts
- Updated download_summary_json with organized metrics structure
app.py
CHANGED
|
@@ -291,7 +291,7 @@ def update_reasoning_visibility(model_key):
|
|
| 291 |
return gr.update(visible=supports_toggle)
|
| 292 |
|
| 293 |
|
| 294 |
-
def download_summary_json(summary, thinking, model_key, language):
|
| 295 |
"""Generate JSON file with summary and metadata."""
|
| 296 |
import json
|
| 297 |
from datetime import datetime
|
|
@@ -307,6 +307,32 @@ def download_summary_json(summary, thinking, model_key, language):
|
|
| 307 |
"summary": summary
|
| 308 |
}
|
| 309 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
filename = f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
|
| 311 |
with open(filename, 'w', encoding='utf-8') as f:
|
| 312 |
json.dump(data, f, ensure_ascii=False, indent=2)
|
|
@@ -425,10 +451,10 @@ def summarize_streaming(
|
|
| 425 |
top_p: float = None,
|
| 426 |
top_k: int = None,
|
| 427 |
output_language: str = "en",
|
| 428 |
-
) -> Generator[Tuple[str, str, str], None, None]:
|
| 429 |
"""
|
| 430 |
Stream summary generation from uploaded file.
|
| 431 |
-
|
| 432 |
Args:
|
| 433 |
file_obj: Gradio file object
|
| 434 |
model_key: Model identifier from AVAILABLE_MODELS
|
|
@@ -437,10 +463,29 @@ def summarize_streaming(
|
|
| 437 |
top_p: Nucleus sampling parameter (uses model default if None)
|
| 438 |
top_k: Top-k sampling parameter (uses model default if None)
|
| 439 |
output_language: Target language for summary ("en" or "zh-TW")
|
| 440 |
-
|
| 441 |
Yields:
|
| 442 |
-
Tuple of (thinking_text, summary_text, info_text)
|
| 443 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
global llm, converter
|
| 445 |
|
| 446 |
model = AVAILABLE_MODELS[model_key]
|
|
@@ -453,26 +498,50 @@ def summarize_streaming(
|
|
| 453 |
# Read uploaded file
|
| 454 |
try:
|
| 455 |
path = file_obj.name if hasattr(file_obj, 'name') else file_obj
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
with open(path, 'r', encoding='utf-8') as f:
|
| 457 |
transcript = f.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
except Exception as e:
|
| 459 |
-
yield ("", f"Error reading file: {e}", "")
|
| 460 |
return
|
| 461 |
-
|
| 462 |
if not transcript.strip():
|
| 463 |
-
yield ("", "Error: File is empty", "")
|
| 464 |
return
|
| 465 |
-
|
| 466 |
# Calculate context and check truncation
|
| 467 |
n_ctx, warning = calculate_n_ctx(model_key, transcript, max_tokens)
|
| 468 |
-
|
|
|
|
| 469 |
# Truncate if needed (estimate max chars from available tokens)
|
| 470 |
available_tokens = usable_max - max_tokens - 512
|
| 471 |
max_bytes = available_tokens * 3 # Reverse estimate: tokens * 3 bytes
|
| 472 |
encoded = transcript.encode('utf-8')
|
| 473 |
-
|
|
|
|
|
|
|
|
|
|
| 474 |
transcript = encoded[:max_bytes].decode('utf-8', errors='ignore')
|
| 475 |
transcript += "\n\n[Content truncated to fit model context]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
|
| 477 |
# Build info text
|
| 478 |
input_tokens = estimate_tokens(transcript)
|
|
@@ -485,12 +554,14 @@ def summarize_streaming(
|
|
| 485 |
if warning:
|
| 486 |
info += f"\n\n{warning}"
|
| 487 |
|
| 488 |
-
# Load model (no-op if already loaded)
|
|
|
|
| 489 |
try:
|
| 490 |
llm, load_msg = load_model(model_key)
|
| 491 |
logger.info(load_msg)
|
|
|
|
| 492 |
except Exception as e:
|
| 493 |
-
yield ("", f"Error loading model: {e}", "")
|
| 494 |
return
|
| 495 |
|
| 496 |
# Prepare system prompt with reasoning toggle for Qwen3 models
|
|
@@ -537,6 +608,29 @@ def summarize_streaming(
|
|
| 537 |
current_summary = ""
|
| 538 |
|
| 539 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
# Apply model-specific inference settings
|
| 541 |
stream = llm.create_chat_completion(
|
| 542 |
messages=messages,
|
|
@@ -548,32 +642,76 @@ def summarize_streaming(
|
|
| 548 |
repeat_penalty=repeat_penalty,
|
| 549 |
stream=True,
|
| 550 |
)
|
| 551 |
-
|
|
|
|
|
|
|
| 552 |
for chunk in stream:
|
| 553 |
if 'choices' in chunk and len(chunk['choices']) > 0:
|
| 554 |
delta = chunk['choices'][0].get('delta', {})
|
| 555 |
content = delta.get('content', '')
|
| 556 |
if content:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
if output_language == "zh-TW":
|
| 558 |
converted = converter.convert(content)
|
| 559 |
full_response += converted
|
| 560 |
else:
|
| 561 |
full_response += content
|
| 562 |
-
|
| 563 |
thinking, summary = parse_thinking_blocks(full_response, streaming=True)
|
| 564 |
current_thinking = thinking or ""
|
| 565 |
current_summary = summary or ""
|
| 566 |
-
yield (current_thinking, current_summary, info)
|
| 567 |
-
|
| 568 |
-
# Final
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 569 |
thinking, summary = parse_thinking_blocks(full_response)
|
| 570 |
-
|
| 571 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
llm.reset()
|
| 573 |
-
|
| 574 |
except Exception as e:
|
| 575 |
logger.error(f"Generation error: {e}")
|
| 576 |
-
|
|
|
|
| 577 |
|
| 578 |
|
| 579 |
# Custom CSS for better UI
|
|
@@ -853,6 +991,9 @@ def create_interface():
|
|
| 853 |
variant="primary",
|
| 854 |
elem_classes=["submit-btn"]
|
| 855 |
)
|
|
|
|
|
|
|
|
|
|
| 856 |
|
| 857 |
# Model info section (dynamic)
|
| 858 |
with gr.Group():
|
|
@@ -894,7 +1035,7 @@ def create_interface():
|
|
| 894 |
submit_btn.click(
|
| 895 |
fn=summarize_streaming,
|
| 896 |
inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector],
|
| 897 |
-
outputs=[thinking_output, summary_output, info_output],
|
| 898 |
show_progress="full"
|
| 899 |
)
|
| 900 |
|
|
@@ -922,7 +1063,7 @@ def create_interface():
|
|
| 922 |
# Download button
|
| 923 |
download_btn.click(
|
| 924 |
fn=download_summary_json,
|
| 925 |
-
inputs=[summary_output, thinking_output, model_dropdown, language_selector],
|
| 926 |
outputs=[gr.File(label="Download")]
|
| 927 |
)
|
| 928 |
|
|
|
|
| 291 |
return gr.update(visible=supports_toggle)
|
| 292 |
|
| 293 |
|
| 294 |
+
def download_summary_json(summary, thinking, model_key, language, metrics):
|
| 295 |
"""Generate JSON file with summary and metadata."""
|
| 296 |
import json
|
| 297 |
from datetime import datetime
|
|
|
|
| 307 |
"summary": summary
|
| 308 |
}
|
| 309 |
|
| 310 |
+
# Add generation metrics if available
|
| 311 |
+
if metrics and isinstance(metrics, dict):
|
| 312 |
+
data["generation_metrics"] = {
|
| 313 |
+
"settings_used": metrics.get("settings", {}),
|
| 314 |
+
"timing": {
|
| 315 |
+
"time_to_first_token_ms": round(metrics.get("time_to_first_token_ms", 0), 2) if metrics.get("time_to_first_token_ms") else None,
|
| 316 |
+
"total_processing_time_ms": round(metrics.get("total_processing_time_ms", 0), 2) if metrics.get("total_processing_time_ms") else None,
|
| 317 |
+
"model_load_time_ms": round(metrics.get("model_load_time_ms", 0), 2) if metrics.get("model_load_time_ms") else None,
|
| 318 |
+
},
|
| 319 |
+
"tokens": {
|
| 320 |
+
"n_ctx": metrics.get("n_ctx"),
|
| 321 |
+
"input_tokens": metrics.get("input_tokens"),
|
| 322 |
+
"output_tokens": metrics.get("output_tokens"),
|
| 323 |
+
"thinking_tokens": metrics.get("thinking_tokens"),
|
| 324 |
+
"total_tokens": metrics.get("total_tokens"),
|
| 325 |
+
"generation_tokens": metrics.get("generation_tokens"),
|
| 326 |
+
"prefill_tokens": metrics.get("prefill_tokens")
|
| 327 |
+
},
|
| 328 |
+
"performance": {
|
| 329 |
+
"generation_speed_tps": round(metrics.get("generation_speed_tps", 0), 2) if metrics.get("generation_speed_tps") else None,
|
| 330 |
+
"prefill_speed_tps": round(metrics.get("prefill_speed_tps", 0), 2) if metrics.get("prefill_speed_tps") else None
|
| 331 |
+
},
|
| 332 |
+
"file_info": metrics.get("file_info", {}),
|
| 333 |
+
"truncation_info": metrics.get("truncation_info", {})
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
filename = f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
|
| 337 |
with open(filename, 'w', encoding='utf-8') as f:
|
| 338 |
json.dump(data, f, ensure_ascii=False, indent=2)
|
|
|
|
| 451 |
top_p: float = None,
|
| 452 |
top_k: int = None,
|
| 453 |
output_language: str = "en",
|
| 454 |
+
) -> Generator[Tuple[str, str, str, dict], None, None]:
|
| 455 |
"""
|
| 456 |
Stream summary generation from uploaded file.
|
| 457 |
+
|
| 458 |
Args:
|
| 459 |
file_obj: Gradio file object
|
| 460 |
model_key: Model identifier from AVAILABLE_MODELS
|
|
|
|
| 463 |
top_p: Nucleus sampling parameter (uses model default if None)
|
| 464 |
top_k: Top-k sampling parameter (uses model default if None)
|
| 465 |
output_language: Target language for summary ("en" or "zh-TW")
|
| 466 |
+
|
| 467 |
Yields:
|
| 468 |
+
Tuple of (thinking_text, summary_text, info_text, metrics_dict)
|
| 469 |
"""
|
| 470 |
+
import time
|
| 471 |
+
|
| 472 |
+
metrics = {
|
| 473 |
+
"start_time": None,
|
| 474 |
+
"time_to_first_token_ms": None,
|
| 475 |
+
"generation_start_time": None,
|
| 476 |
+
"generation_end_time": None,
|
| 477 |
+
"model_load_time_ms": None,
|
| 478 |
+
"total_tokens": 0,
|
| 479 |
+
"generation_tokens": 0,
|
| 480 |
+
"prefill_tokens": 0,
|
| 481 |
+
"input_tokens": 0,
|
| 482 |
+
"output_tokens": 0,
|
| 483 |
+
"thinking_tokens": 0,
|
| 484 |
+
"n_ctx": 0,
|
| 485 |
+
"settings": {},
|
| 486 |
+
"file_info": {},
|
| 487 |
+
"truncation_info": {},
|
| 488 |
+
}
|
| 489 |
global llm, converter
|
| 490 |
|
| 491 |
model = AVAILABLE_MODELS[model_key]
|
|
|
|
| 498 |
# Read uploaded file
|
| 499 |
try:
|
| 500 |
path = file_obj.name if hasattr(file_obj, 'name') else file_obj
|
| 501 |
+
# Get file metadata
|
| 502 |
+
import os
|
| 503 |
+
file_size = os.path.getsize(path)
|
| 504 |
+
file_name = os.path.basename(path)
|
| 505 |
+
|
| 506 |
with open(path, 'r', encoding='utf-8') as f:
|
| 507 |
transcript = f.read()
|
| 508 |
+
|
| 509 |
+
# Store file info
|
| 510 |
+
metrics["file_info"] = {
|
| 511 |
+
"filename": file_name,
|
| 512 |
+
"size_bytes": file_size,
|
| 513 |
+
"original_char_count": len(transcript),
|
| 514 |
+
}
|
| 515 |
except Exception as e:
|
| 516 |
+
yield ("", f"Error reading file: {e}", "", metrics)
|
| 517 |
return
|
| 518 |
+
|
| 519 |
if not transcript.strip():
|
| 520 |
+
yield ("", "Error: File is empty", "", metrics)
|
| 521 |
return
|
| 522 |
+
|
| 523 |
# Calculate context and check truncation
|
| 524 |
n_ctx, warning = calculate_n_ctx(model_key, transcript, max_tokens)
|
| 525 |
+
metrics["n_ctx"] = n_ctx
|
| 526 |
+
|
| 527 |
# Truncate if needed (estimate max chars from available tokens)
|
| 528 |
available_tokens = usable_max - max_tokens - 512
|
| 529 |
max_bytes = available_tokens * 3 # Reverse estimate: tokens * 3 bytes
|
| 530 |
encoded = transcript.encode('utf-8')
|
| 531 |
+
was_truncated = len(encoded) > max_bytes
|
| 532 |
+
original_length = len(transcript)
|
| 533 |
+
|
| 534 |
+
if was_truncated:
|
| 535 |
transcript = encoded[:max_bytes].decode('utf-8', errors='ignore')
|
| 536 |
transcript += "\n\n[Content truncated to fit model context]"
|
| 537 |
+
|
| 538 |
+
# Store truncation info
|
| 539 |
+
metrics["truncation_info"] = {
|
| 540 |
+
"was_truncated": was_truncated,
|
| 541 |
+
"original_char_count": original_length,
|
| 542 |
+
"final_char_count": len(transcript),
|
| 543 |
+
"original_token_estimate": estimate_tokens(transcript) if not was_truncated else estimate_tokens(encoded[:max_bytes].decode('utf-8', errors='ignore')),
|
| 544 |
+
}
|
| 545 |
|
| 546 |
# Build info text
|
| 547 |
input_tokens = estimate_tokens(transcript)
|
|
|
|
| 554 |
if warning:
|
| 555 |
info += f"\n\n{warning}"
|
| 556 |
|
| 557 |
+
# Load model (no-op if already loaded) with timing
|
| 558 |
+
model_load_start = time.time()
|
| 559 |
try:
|
| 560 |
llm, load_msg = load_model(model_key)
|
| 561 |
logger.info(load_msg)
|
| 562 |
+
metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
|
| 563 |
except Exception as e:
|
| 564 |
+
yield ("", f"Error loading model: {e}", "", metrics)
|
| 565 |
return
|
| 566 |
|
| 567 |
# Prepare system prompt with reasoning toggle for Qwen3 models
|
|
|
|
| 608 |
current_summary = ""
|
| 609 |
|
| 610 |
try:
|
| 611 |
+
# Record generation settings
|
| 612 |
+
metrics["settings"] = {
|
| 613 |
+
"model": model_key,
|
| 614 |
+
"max_tokens": max_tokens,
|
| 615 |
+
"temperature": effective_temperature,
|
| 616 |
+
"top_p": final_top_p,
|
| 617 |
+
"top_k": final_top_k,
|
| 618 |
+
"repeat_penalty": repeat_penalty,
|
| 619 |
+
"enable_reasoning": enable_reasoning,
|
| 620 |
+
"output_language": output_language,
|
| 621 |
+
"n_ctx": metrics["n_ctx"],
|
| 622 |
+
}
|
| 623 |
+
|
| 624 |
+
# Calculate exact input tokens (system + user prompts)
|
| 625 |
+
system_tokens = estimate_tokens(system_content)
|
| 626 |
+
user_tokens = estimate_tokens(user_content)
|
| 627 |
+
metrics["input_tokens"] = system_tokens + user_tokens
|
| 628 |
+
|
| 629 |
+
# Start timing
|
| 630 |
+
metrics["start_time"] = time.time()
|
| 631 |
+
first_token_time = None
|
| 632 |
+
token_count = 0
|
| 633 |
+
|
| 634 |
# Apply model-specific inference settings
|
| 635 |
stream = llm.create_chat_completion(
|
| 636 |
messages=messages,
|
|
|
|
| 642 |
repeat_penalty=repeat_penalty,
|
| 643 |
stream=True,
|
| 644 |
)
|
| 645 |
+
|
| 646 |
+
metrics["generation_start_time"] = time.time()
|
| 647 |
+
|
| 648 |
for chunk in stream:
|
| 649 |
if 'choices' in chunk and len(chunk['choices']) > 0:
|
| 650 |
delta = chunk['choices'][0].get('delta', {})
|
| 651 |
content = delta.get('content', '')
|
| 652 |
if content:
|
| 653 |
+
# Track time to first token
|
| 654 |
+
if first_token_time is None:
|
| 655 |
+
first_token_time = time.time()
|
| 656 |
+
metrics["time_to_first_token_ms"] = (first_token_time - metrics["start_time"]) * 1000
|
| 657 |
+
|
| 658 |
+
token_count += 1
|
| 659 |
+
|
| 660 |
if output_language == "zh-TW":
|
| 661 |
converted = converter.convert(content)
|
| 662 |
full_response += converted
|
| 663 |
else:
|
| 664 |
full_response += content
|
| 665 |
+
|
| 666 |
thinking, summary = parse_thinking_blocks(full_response, streaming=True)
|
| 667 |
current_thinking = thinking or ""
|
| 668 |
current_summary = summary or ""
|
| 669 |
+
yield (current_thinking, current_summary, info, metrics)
|
| 670 |
+
|
| 671 |
+
# Final timing calculations
|
| 672 |
+
metrics["generation_end_time"] = time.time()
|
| 673 |
+
metrics["generation_tokens"] = token_count
|
| 674 |
+
metrics["total_tokens"] = token_count
|
| 675 |
+
|
| 676 |
+
# Calculate speeds
|
| 677 |
+
generation_duration = metrics["generation_end_time"] - metrics["generation_start_time"]
|
| 678 |
+
if generation_duration > 0:
|
| 679 |
+
metrics["generation_speed_tps"] = token_count / generation_duration
|
| 680 |
+
else:
|
| 681 |
+
metrics["generation_speed_tps"] = 0.0
|
| 682 |
+
|
| 683 |
+
# Prefill = time from start to first token
|
| 684 |
+
if metrics["time_to_first_token_ms"]:
|
| 685 |
+
prefill_seconds = metrics["time_to_first_token_ms"] / 1000
|
| 686 |
+
# Estimate prefill tokens (input tokens processed before first output)
|
| 687 |
+
input_tokens = estimate_tokens(transcript)
|
| 688 |
+
metrics["prefill_tokens"] = input_tokens
|
| 689 |
+
if prefill_seconds > 0:
|
| 690 |
+
metrics["prefill_speed_tps"] = input_tokens / prefill_seconds
|
| 691 |
+
else:
|
| 692 |
+
metrics["prefill_speed_tps"] = 0.0
|
| 693 |
+
|
| 694 |
+
# Total processing time
|
| 695 |
+
metrics["total_processing_time_ms"] = (metrics["generation_end_time"] - metrics["start_time"]) * 1000
|
| 696 |
+
|
| 697 |
+
# Final parse and token counts
|
| 698 |
thinking, summary = parse_thinking_blocks(full_response)
|
| 699 |
+
|
| 700 |
+
# Calculate output tokens
|
| 701 |
+
metrics["output_tokens"] = estimate_tokens(summary) if summary else 0
|
| 702 |
+
metrics["thinking_tokens"] = estimate_tokens(thinking) if thinking else 0
|
| 703 |
+
|
| 704 |
+
# Update totals
|
| 705 |
+
metrics["total_tokens"] = metrics["input_tokens"] + metrics["output_tokens"] + metrics["thinking_tokens"]
|
| 706 |
+
|
| 707 |
+
yield (thinking or "", summary or "", info, metrics)
|
| 708 |
+
|
| 709 |
llm.reset()
|
| 710 |
+
|
| 711 |
except Exception as e:
|
| 712 |
logger.error(f"Generation error: {e}")
|
| 713 |
+
metrics["error"] = str(e)
|
| 714 |
+
yield (current_thinking, current_summary + f"\n\nError: {e}", info, metrics)
|
| 715 |
|
| 716 |
|
| 717 |
# Custom CSS for better UI
|
|
|
|
| 991 |
variant="primary",
|
| 992 |
elem_classes=["submit-btn"]
|
| 993 |
)
|
| 994 |
+
|
| 995 |
+
# Hidden state to store generation metrics
|
| 996 |
+
metrics_state = gr.State(value={})
|
| 997 |
|
| 998 |
# Model info section (dynamic)
|
| 999 |
with gr.Group():
|
|
|
|
| 1035 |
submit_btn.click(
|
| 1036 |
fn=summarize_streaming,
|
| 1037 |
inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector],
|
| 1038 |
+
outputs=[thinking_output, summary_output, info_output, metrics_state],
|
| 1039 |
show_progress="full"
|
| 1040 |
)
|
| 1041 |
|
|
|
|
| 1063 |
# Download button
|
| 1064 |
download_btn.click(
|
| 1065 |
fn=download_summary_json,
|
| 1066 |
+
inputs=[summary_output, thinking_output, model_dropdown, language_selector, metrics_state],
|
| 1067 |
outputs=[gr.File(label="Download")]
|
| 1068 |
)
|
| 1069 |
|