Luigi committed on
Commit
c0889b4
·
1 Parent(s): 451a43d

feat: Add comprehensive generation metrics to JSON export

Browse files

- Added timing: time_to_first_token, total_processing_time, model_load_time
- Added tokens: n_ctx, input_tokens, output_tokens, thinking_tokens, total_tokens
- Added performance: generation_speed_tps, prefill_speed_tps
- Added file_info: filename, size_bytes, original_char_count
- Added truncation_info: was_truncated, original/final char counts
- Updated download_summary_json with organized metrics structure

Files changed (1) hide show
  1. app.py +165 -24
app.py CHANGED
@@ -291,7 +291,7 @@ def update_reasoning_visibility(model_key):
291
  return gr.update(visible=supports_toggle)
292
 
293
 
294
- def download_summary_json(summary, thinking, model_key, language):
295
  """Generate JSON file with summary and metadata."""
296
  import json
297
  from datetime import datetime
@@ -307,6 +307,32 @@ def download_summary_json(summary, thinking, model_key, language):
307
  "summary": summary
308
  }
309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  filename = f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
311
  with open(filename, 'w', encoding='utf-8') as f:
312
  json.dump(data, f, ensure_ascii=False, indent=2)
@@ -425,10 +451,10 @@ def summarize_streaming(
425
  top_p: float = None,
426
  top_k: int = None,
427
  output_language: str = "en",
428
- ) -> Generator[Tuple[str, str, str], None, None]:
429
  """
430
  Stream summary generation from uploaded file.
431
-
432
  Args:
433
  file_obj: Gradio file object
434
  model_key: Model identifier from AVAILABLE_MODELS
@@ -437,10 +463,29 @@ def summarize_streaming(
437
  top_p: Nucleus sampling parameter (uses model default if None)
438
  top_k: Top-k sampling parameter (uses model default if None)
439
  output_language: Target language for summary ("en" or "zh-TW")
440
-
441
  Yields:
442
- Tuple of (thinking_text, summary_text, info_text)
443
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  global llm, converter
445
 
446
  model = AVAILABLE_MODELS[model_key]
@@ -453,26 +498,50 @@ def summarize_streaming(
453
  # Read uploaded file
454
  try:
455
  path = file_obj.name if hasattr(file_obj, 'name') else file_obj
 
 
 
 
 
456
  with open(path, 'r', encoding='utf-8') as f:
457
  transcript = f.read()
 
 
 
 
 
 
 
458
  except Exception as e:
459
- yield ("", f"Error reading file: {e}", "")
460
  return
461
-
462
  if not transcript.strip():
463
- yield ("", "Error: File is empty", "")
464
  return
465
-
466
  # Calculate context and check truncation
467
  n_ctx, warning = calculate_n_ctx(model_key, transcript, max_tokens)
468
-
 
469
  # Truncate if needed (estimate max chars from available tokens)
470
  available_tokens = usable_max - max_tokens - 512
471
  max_bytes = available_tokens * 3 # Reverse estimate: tokens * 3 bytes
472
  encoded = transcript.encode('utf-8')
473
- if len(encoded) > max_bytes:
 
 
 
474
  transcript = encoded[:max_bytes].decode('utf-8', errors='ignore')
475
  transcript += "\n\n[Content truncated to fit model context]"
 
 
 
 
 
 
 
 
476
 
477
  # Build info text
478
  input_tokens = estimate_tokens(transcript)
@@ -485,12 +554,14 @@ def summarize_streaming(
485
  if warning:
486
  info += f"\n\n{warning}"
487
 
488
- # Load model (no-op if already loaded)
 
489
  try:
490
  llm, load_msg = load_model(model_key)
491
  logger.info(load_msg)
 
492
  except Exception as e:
493
- yield ("", f"Error loading model: {e}", "")
494
  return
495
 
496
  # Prepare system prompt with reasoning toggle for Qwen3 models
@@ -537,6 +608,29 @@ def summarize_streaming(
537
  current_summary = ""
538
 
539
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
  # Apply model-specific inference settings
541
  stream = llm.create_chat_completion(
542
  messages=messages,
@@ -548,32 +642,76 @@ def summarize_streaming(
548
  repeat_penalty=repeat_penalty,
549
  stream=True,
550
  )
551
-
 
 
552
  for chunk in stream:
553
  if 'choices' in chunk and len(chunk['choices']) > 0:
554
  delta = chunk['choices'][0].get('delta', {})
555
  content = delta.get('content', '')
556
  if content:
 
 
 
 
 
 
 
557
  if output_language == "zh-TW":
558
  converted = converter.convert(content)
559
  full_response += converted
560
  else:
561
  full_response += content
562
-
563
  thinking, summary = parse_thinking_blocks(full_response, streaming=True)
564
  current_thinking = thinking or ""
565
  current_summary = summary or ""
566
- yield (current_thinking, current_summary, info)
567
-
568
- # Final parse
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
569
  thinking, summary = parse_thinking_blocks(full_response)
570
- yield (thinking or "", summary or "", info)
571
-
 
 
 
 
 
 
 
 
572
  llm.reset()
573
-
574
  except Exception as e:
575
  logger.error(f"Generation error: {e}")
576
- yield (current_thinking, current_summary + f"\n\nError: {e}", info)
 
577
 
578
 
579
  # Custom CSS for better UI
@@ -853,6 +991,9 @@ def create_interface():
853
  variant="primary",
854
  elem_classes=["submit-btn"]
855
  )
 
 
 
856
 
857
  # Model info section (dynamic)
858
  with gr.Group():
@@ -894,7 +1035,7 @@ def create_interface():
894
  submit_btn.click(
895
  fn=summarize_streaming,
896
  inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector],
897
- outputs=[thinking_output, summary_output, info_output],
898
  show_progress="full"
899
  )
900
 
@@ -922,7 +1063,7 @@ def create_interface():
922
  # Download button
923
  download_btn.click(
924
  fn=download_summary_json,
925
- inputs=[summary_output, thinking_output, model_dropdown, language_selector],
926
  outputs=[gr.File(label="Download")]
927
  )
928
 
 
291
  return gr.update(visible=supports_toggle)
292
 
293
 
294
+ def download_summary_json(summary, thinking, model_key, language, metrics):
295
  """Generate JSON file with summary and metadata."""
296
  import json
297
  from datetime import datetime
 
307
  "summary": summary
308
  }
309
 
310
+ # Add generation metrics if available
311
+ if metrics and isinstance(metrics, dict):
312
+ data["generation_metrics"] = {
313
+ "settings_used": metrics.get("settings", {}),
314
+ "timing": {
315
+ "time_to_first_token_ms": round(metrics.get("time_to_first_token_ms", 0), 2) if metrics.get("time_to_first_token_ms") else None,
316
+ "total_processing_time_ms": round(metrics.get("total_processing_time_ms", 0), 2) if metrics.get("total_processing_time_ms") else None,
317
+ "model_load_time_ms": round(metrics.get("model_load_time_ms", 0), 2) if metrics.get("model_load_time_ms") else None,
318
+ },
319
+ "tokens": {
320
+ "n_ctx": metrics.get("n_ctx"),
321
+ "input_tokens": metrics.get("input_tokens"),
322
+ "output_tokens": metrics.get("output_tokens"),
323
+ "thinking_tokens": metrics.get("thinking_tokens"),
324
+ "total_tokens": metrics.get("total_tokens"),
325
+ "generation_tokens": metrics.get("generation_tokens"),
326
+ "prefill_tokens": metrics.get("prefill_tokens")
327
+ },
328
+ "performance": {
329
+ "generation_speed_tps": round(metrics.get("generation_speed_tps", 0), 2) if metrics.get("generation_speed_tps") else None,
330
+ "prefill_speed_tps": round(metrics.get("prefill_speed_tps", 0), 2) if metrics.get("prefill_speed_tps") else None
331
+ },
332
+ "file_info": metrics.get("file_info", {}),
333
+ "truncation_info": metrics.get("truncation_info", {})
334
+ }
335
+
336
  filename = f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
337
  with open(filename, 'w', encoding='utf-8') as f:
338
  json.dump(data, f, ensure_ascii=False, indent=2)
 
451
  top_p: float = None,
452
  top_k: int = None,
453
  output_language: str = "en",
454
+ ) -> Generator[Tuple[str, str, str, dict], None, None]:
455
  """
456
  Stream summary generation from uploaded file.
457
+
458
  Args:
459
  file_obj: Gradio file object
460
  model_key: Model identifier from AVAILABLE_MODELS
 
463
  top_p: Nucleus sampling parameter (uses model default if None)
464
  top_k: Top-k sampling parameter (uses model default if None)
465
  output_language: Target language for summary ("en" or "zh-TW")
466
+
467
  Yields:
468
+ Tuple of (thinking_text, summary_text, info_text, metrics_dict)
469
  """
470
+ import time
471
+
472
+ metrics = {
473
+ "start_time": None,
474
+ "time_to_first_token_ms": None,
475
+ "generation_start_time": None,
476
+ "generation_end_time": None,
477
+ "model_load_time_ms": None,
478
+ "total_tokens": 0,
479
+ "generation_tokens": 0,
480
+ "prefill_tokens": 0,
481
+ "input_tokens": 0,
482
+ "output_tokens": 0,
483
+ "thinking_tokens": 0,
484
+ "n_ctx": 0,
485
+ "settings": {},
486
+ "file_info": {},
487
+ "truncation_info": {},
488
+ }
489
  global llm, converter
490
 
491
  model = AVAILABLE_MODELS[model_key]
 
498
  # Read uploaded file
499
  try:
500
  path = file_obj.name if hasattr(file_obj, 'name') else file_obj
501
+ # Get file metadata
502
+ import os
503
+ file_size = os.path.getsize(path)
504
+ file_name = os.path.basename(path)
505
+
506
  with open(path, 'r', encoding='utf-8') as f:
507
  transcript = f.read()
508
+
509
+ # Store file info
510
+ metrics["file_info"] = {
511
+ "filename": file_name,
512
+ "size_bytes": file_size,
513
+ "original_char_count": len(transcript),
514
+ }
515
  except Exception as e:
516
+ yield ("", f"Error reading file: {e}", "", metrics)
517
  return
518
+
519
  if not transcript.strip():
520
+ yield ("", "Error: File is empty", "", metrics)
521
  return
522
+
523
  # Calculate context and check truncation
524
  n_ctx, warning = calculate_n_ctx(model_key, transcript, max_tokens)
525
+ metrics["n_ctx"] = n_ctx
526
+
527
  # Truncate if needed (estimate max chars from available tokens)
528
  available_tokens = usable_max - max_tokens - 512
529
  max_bytes = available_tokens * 3 # Reverse estimate: tokens * 3 bytes
530
  encoded = transcript.encode('utf-8')
531
+ was_truncated = len(encoded) > max_bytes
532
+ original_length = len(transcript)
533
+
534
+ if was_truncated:
535
  transcript = encoded[:max_bytes].decode('utf-8', errors='ignore')
536
  transcript += "\n\n[Content truncated to fit model context]"
537
+
538
+ # Store truncation info
539
+ metrics["truncation_info"] = {
540
+ "was_truncated": was_truncated,
541
+ "original_char_count": original_length,
542
+ "final_char_count": len(transcript),
543
+ "original_token_estimate": estimate_tokens(transcript) if not was_truncated else estimate_tokens(encoded[:max_bytes].decode('utf-8', errors='ignore')),
544
+ }
545
 
546
  # Build info text
547
  input_tokens = estimate_tokens(transcript)
 
554
  if warning:
555
  info += f"\n\n{warning}"
556
 
557
+ # Load model (no-op if already loaded) with timing
558
+ model_load_start = time.time()
559
  try:
560
  llm, load_msg = load_model(model_key)
561
  logger.info(load_msg)
562
+ metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
563
  except Exception as e:
564
+ yield ("", f"Error loading model: {e}", "", metrics)
565
  return
566
 
567
  # Prepare system prompt with reasoning toggle for Qwen3 models
 
608
  current_summary = ""
609
 
610
  try:
611
+ # Record generation settings
612
+ metrics["settings"] = {
613
+ "model": model_key,
614
+ "max_tokens": max_tokens,
615
+ "temperature": effective_temperature,
616
+ "top_p": final_top_p,
617
+ "top_k": final_top_k,
618
+ "repeat_penalty": repeat_penalty,
619
+ "enable_reasoning": enable_reasoning,
620
+ "output_language": output_language,
621
+ "n_ctx": metrics["n_ctx"],
622
+ }
623
+
624
+ # Calculate exact input tokens (system + user prompts)
625
+ system_tokens = estimate_tokens(system_content)
626
+ user_tokens = estimate_tokens(user_content)
627
+ metrics["input_tokens"] = system_tokens + user_tokens
628
+
629
+ # Start timing
630
+ metrics["start_time"] = time.time()
631
+ first_token_time = None
632
+ token_count = 0
633
+
634
  # Apply model-specific inference settings
635
  stream = llm.create_chat_completion(
636
  messages=messages,
 
642
  repeat_penalty=repeat_penalty,
643
  stream=True,
644
  )
645
+
646
+ metrics["generation_start_time"] = time.time()
647
+
648
  for chunk in stream:
649
  if 'choices' in chunk and len(chunk['choices']) > 0:
650
  delta = chunk['choices'][0].get('delta', {})
651
  content = delta.get('content', '')
652
  if content:
653
+ # Track time to first token
654
+ if first_token_time is None:
655
+ first_token_time = time.time()
656
+ metrics["time_to_first_token_ms"] = (first_token_time - metrics["start_time"]) * 1000
657
+
658
+ token_count += 1
659
+
660
  if output_language == "zh-TW":
661
  converted = converter.convert(content)
662
  full_response += converted
663
  else:
664
  full_response += content
665
+
666
  thinking, summary = parse_thinking_blocks(full_response, streaming=True)
667
  current_thinking = thinking or ""
668
  current_summary = summary or ""
669
+ yield (current_thinking, current_summary, info, metrics)
670
+
671
+ # Final timing calculations
672
+ metrics["generation_end_time"] = time.time()
673
+ metrics["generation_tokens"] = token_count
674
+ metrics["total_tokens"] = token_count
675
+
676
+ # Calculate speeds
677
+ generation_duration = metrics["generation_end_time"] - metrics["generation_start_time"]
678
+ if generation_duration > 0:
679
+ metrics["generation_speed_tps"] = token_count / generation_duration
680
+ else:
681
+ metrics["generation_speed_tps"] = 0.0
682
+
683
+ # Prefill = time from start to first token
684
+ if metrics["time_to_first_token_ms"]:
685
+ prefill_seconds = metrics["time_to_first_token_ms"] / 1000
686
+ # Estimate prefill tokens (input tokens processed before first output)
687
+ input_tokens = estimate_tokens(transcript)
688
+ metrics["prefill_tokens"] = input_tokens
689
+ if prefill_seconds > 0:
690
+ metrics["prefill_speed_tps"] = input_tokens / prefill_seconds
691
+ else:
692
+ metrics["prefill_speed_tps"] = 0.0
693
+
694
+ # Total processing time
695
+ metrics["total_processing_time_ms"] = (metrics["generation_end_time"] - metrics["start_time"]) * 1000
696
+
697
+ # Final parse and token counts
698
  thinking, summary = parse_thinking_blocks(full_response)
699
+
700
+ # Calculate output tokens
701
+ metrics["output_tokens"] = estimate_tokens(summary) if summary else 0
702
+ metrics["thinking_tokens"] = estimate_tokens(thinking) if thinking else 0
703
+
704
+ # Update totals
705
+ metrics["total_tokens"] = metrics["input_tokens"] + metrics["output_tokens"] + metrics["thinking_tokens"]
706
+
707
+ yield (thinking or "", summary or "", info, metrics)
708
+
709
  llm.reset()
710
+
711
  except Exception as e:
712
  logger.error(f"Generation error: {e}")
713
+ metrics["error"] = str(e)
714
+ yield (current_thinking, current_summary + f"\n\nError: {e}", info, metrics)
715
 
716
 
717
  # Custom CSS for better UI
 
991
  variant="primary",
992
  elem_classes=["submit-btn"]
993
  )
994
+
995
+ # Hidden state to store generation metrics
996
+ metrics_state = gr.State(value={})
997
 
998
  # Model info section (dynamic)
999
  with gr.Group():
 
1035
  submit_btn.click(
1036
  fn=summarize_streaming,
1037
  inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector],
1038
+ outputs=[thinking_output, summary_output, info_output, metrics_state],
1039
  show_progress="full"
1040
  )
1041
 
 
1063
  # Download button
1064
  download_btn.click(
1065
  fn=download_summary_json,
1066
+ inputs=[summary_output, thinking_output, model_dropdown, language_selector, metrics_state],
1067
  outputs=[gr.File(label="Download")]
1068
  )
1069