Spaces:
Running
Running
feat: enrich JSON export with rich debug info for all 3 pipeline stages
Browse filesAdd debug_info section to Advanced Mode JSON export with:
- Preprocessing: stats and noise phrases removed
- Extraction: per-window full LLM responses and parse details
- Deduplication: duplicate groups with similarity scores
- Synthesis: full input items and prompts
Enables debugging pipeline failures causing bad final summaries.
- app.py +50 -13
- meeting_summarizer/extraction.py +461 -30
- meeting_summarizer/trace.py +182 -2
app.py
CHANGED
|
@@ -1453,7 +1453,7 @@ def summarize_advanced(
|
|
| 1453 |
"""
|
| 1454 |
from meeting_summarizer.trace import Tracer
|
| 1455 |
from meeting_summarizer.extraction import (
|
| 1456 |
-
EmbeddingModel, Window,
|
| 1457 |
stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
|
| 1458 |
)
|
| 1459 |
|
|
@@ -1482,8 +1482,23 @@ def summarize_advanced(
|
|
| 1482 |
"""Count tokens using the extraction model's tokenizer."""
|
| 1483 |
return len(extraction_llm.tokenize(text.encode('utf-8')))
|
| 1484 |
|
| 1485 |
-
#
|
| 1486 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1487 |
lines = [l.strip() for l in transcript.split('\n') if l.strip()]
|
| 1488 |
|
| 1489 |
# Reserve tokens for system prompt (~200) and output (~1024)
|
|
@@ -1508,6 +1523,14 @@ def summarize_advanced(
|
|
| 1508 |
end_turn=line_num - 1,
|
| 1509 |
token_count=current_tokens
|
| 1510 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1511 |
window_id += 1
|
| 1512 |
|
| 1513 |
# Start new window with overlap
|
|
@@ -1528,6 +1551,14 @@ def summarize_advanced(
|
|
| 1528 |
end_turn=len(lines) - 1,
|
| 1529 |
token_count=current_tokens
|
| 1530 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1531 |
|
| 1532 |
total_windows = len(windows)
|
| 1533 |
yield {"stage": "extraction", "ticker": f"Created {total_windows} windows", "thinking": "", "summary": ""}
|
|
@@ -1640,6 +1671,7 @@ def summarize_advanced(
|
|
| 1640 |
|
| 1641 |
# Get trace stats and add model names for download JSON
|
| 1642 |
trace_stats = tracer.get_summary_stats()
|
|
|
|
| 1643 |
ext_config = get_model_config(extraction_model_key, "extraction")
|
| 1644 |
syn_config = get_model_config(synthesis_model_key, "synthesis")
|
| 1645 |
trace_stats["extraction_model"] = ext_config.get("name", extraction_model_key)
|
|
@@ -1652,7 +1684,8 @@ def summarize_advanced(
|
|
| 1652 |
"thinking": final_thinking,
|
| 1653 |
"summary": final_summary,
|
| 1654 |
"trace_stats": trace_stats,
|
| 1655 |
-
"trace_json": tracer.get_trace_json()
|
|
|
|
| 1656 |
}
|
| 1657 |
|
| 1658 |
except Exception as e:
|
|
@@ -1685,6 +1718,8 @@ def download_summary_json(summary, thinking, model_key, language, metrics):
|
|
| 1685 |
if is_advanced:
|
| 1686 |
# Advanced Mode: embed trace data and use pipeline model names
|
| 1687 |
trace_stats = metrics.get("trace_stats", {})
|
|
|
|
|
|
|
| 1688 |
data = {
|
| 1689 |
"metadata": {
|
| 1690 |
"generated_at": datetime.now().isoformat(),
|
|
@@ -1707,6 +1742,7 @@ def download_summary_json(summary, thinking, model_key, language, metrics):
|
|
| 1707 |
"synthesis_success": trace_stats.get("synthesis_success", False),
|
| 1708 |
"total_elapsed_seconds": trace_stats.get("total_elapsed_seconds", 0),
|
| 1709 |
},
|
|
|
|
| 1710 |
"trace": metrics.get("trace_json", [])
|
| 1711 |
}
|
| 1712 |
else:
|
|
@@ -3187,8 +3223,8 @@ def create_interface():
|
|
| 3187 |
supports_toggle = config.get("supports_toggle", False)
|
| 3188 |
|
| 3189 |
if supports_toggle:
|
| 3190 |
-
# Hybrid model
|
| 3191 |
-
return gr.update(visible=True, value=
|
| 3192 |
elif config.get("supports_reasoning", False):
|
| 3193 |
# Thinking-only model (none currently in extraction)
|
| 3194 |
return gr.update(visible=True, value=True, interactive=False, label="🧠 Reasoning Mode (Always On)")
|
|
@@ -3478,17 +3514,18 @@ def create_interface():
|
|
| 3478 |
|
| 3479 |
# Format info message
|
| 3480 |
info_msg = f"""**Advanced Mode Complete**
|
| 3481 |
-
- Total Windows: {trace_stats.get('total_windows', 0)}
|
| 3482 |
-
- Items Extracted: {trace_stats.get('total_items_extracted', 0)}
|
| 3483 |
-
- Items After Dedup: {trace_stats.get('total_items_after_dedup', 0)}
|
| 3484 |
-
- Duplicates Removed: {trace_stats.get('total_duplicates_removed', 0)}
|
| 3485 |
-
- Total Time: {trace_stats.get('total_elapsed_seconds', 0):.1f}s"""
|
| 3486 |
|
| 3487 |
-
# Store trace for download
|
| 3488 |
metrics = {
|
| 3489 |
"mode": "advanced",
|
| 3490 |
"trace_stats": trace_stats,
|
| 3491 |
-
"trace_json": update.get("trace_json", [])
|
|
|
|
| 3492 |
}
|
| 3493 |
|
| 3494 |
yield (thinking, summary, info_msg, metrics, "Advanced Mode (3-Model Pipeline)")
|
|
|
|
| 1453 |
"""
|
| 1454 |
from meeting_summarizer.trace import Tracer
|
| 1455 |
from meeting_summarizer.extraction import (
|
| 1456 |
+
EmbeddingModel, Window, preprocess_transcript,
|
| 1457 |
stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
|
| 1458 |
)
|
| 1459 |
|
|
|
|
| 1482 |
"""Count tokens using the extraction model's tokenizer."""
|
| 1483 |
return len(extraction_llm.tokenize(text.encode('utf-8')))
|
| 1484 |
|
| 1485 |
+
# Preprocess transcript: strip CSV format, remove noise/repetition
|
| 1486 |
+
raw_line_count = len(transcript.split('\n'))
|
| 1487 |
+
raw_char_count = len(transcript)
|
| 1488 |
+
transcript, noise_phrases = preprocess_transcript(transcript)
|
| 1489 |
+
cleaned_line_count = len(transcript.split('\n'))
|
| 1490 |
+
cleaned_char_count = len(transcript)
|
| 1491 |
+
|
| 1492 |
+
# Log preprocessing info to tracer
|
| 1493 |
+
tracer.log_preprocessing(
|
| 1494 |
+
original_line_count=raw_line_count,
|
| 1495 |
+
cleaned_line_count=cleaned_line_count,
|
| 1496 |
+
original_char_count=raw_char_count,
|
| 1497 |
+
cleaned_char_count=cleaned_char_count,
|
| 1498 |
+
noise_phrases_removed=noise_phrases
|
| 1499 |
+
)
|
| 1500 |
+
|
| 1501 |
+
# Create windows from preprocessed transcript
|
| 1502 |
lines = [l.strip() for l in transcript.split('\n') if l.strip()]
|
| 1503 |
|
| 1504 |
# Reserve tokens for system prompt (~200) and output (~1024)
|
|
|
|
| 1523 |
end_turn=line_num - 1,
|
| 1524 |
token_count=current_tokens
|
| 1525 |
))
|
| 1526 |
+
# Log window to tracer for debugging
|
| 1527 |
+
tracer.log_window(
|
| 1528 |
+
window_id=window_id,
|
| 1529 |
+
content=window_content,
|
| 1530 |
+
token_count=current_tokens,
|
| 1531 |
+
start_turn=line_num - len(current_window),
|
| 1532 |
+
end_turn=line_num - 1
|
| 1533 |
+
)
|
| 1534 |
window_id += 1
|
| 1535 |
|
| 1536 |
# Start new window with overlap
|
|
|
|
| 1551 |
end_turn=len(lines) - 1,
|
| 1552 |
token_count=current_tokens
|
| 1553 |
))
|
| 1554 |
+
# Log window to tracer for debugging
|
| 1555 |
+
tracer.log_window(
|
| 1556 |
+
window_id=window_id,
|
| 1557 |
+
content=window_content,
|
| 1558 |
+
token_count=current_tokens,
|
| 1559 |
+
start_turn=len(lines) - len(current_window),
|
| 1560 |
+
end_turn=len(lines) - 1
|
| 1561 |
+
)
|
| 1562 |
|
| 1563 |
total_windows = len(windows)
|
| 1564 |
yield {"stage": "extraction", "ticker": f"Created {total_windows} windows", "thinking": "", "summary": ""}
|
|
|
|
| 1671 |
|
| 1672 |
# Get trace stats and add model names for download JSON
|
| 1673 |
trace_stats = tracer.get_summary_stats()
|
| 1674 |
+
debug_json = tracer.get_debug_json()
|
| 1675 |
ext_config = get_model_config(extraction_model_key, "extraction")
|
| 1676 |
syn_config = get_model_config(synthesis_model_key, "synthesis")
|
| 1677 |
trace_stats["extraction_model"] = ext_config.get("name", extraction_model_key)
|
|
|
|
| 1684 |
"thinking": final_thinking,
|
| 1685 |
"summary": final_summary,
|
| 1686 |
"trace_stats": trace_stats,
|
| 1687 |
+
"trace_json": tracer.get_trace_json(),
|
| 1688 |
+
"debug_json": debug_json
|
| 1689 |
}
|
| 1690 |
|
| 1691 |
except Exception as e:
|
|
|
|
| 1718 |
if is_advanced:
|
| 1719 |
# Advanced Mode: embed trace data and use pipeline model names
|
| 1720 |
trace_stats = metrics.get("trace_stats", {})
|
| 1721 |
+
debug_info = metrics.get("debug_json", {})
|
| 1722 |
+
|
| 1723 |
data = {
|
| 1724 |
"metadata": {
|
| 1725 |
"generated_at": datetime.now().isoformat(),
|
|
|
|
| 1742 |
"synthesis_success": trace_stats.get("synthesis_success", False),
|
| 1743 |
"total_elapsed_seconds": trace_stats.get("total_elapsed_seconds", 0),
|
| 1744 |
},
|
| 1745 |
+
"debug_info": debug_info,
|
| 1746 |
"trace": metrics.get("trace_json", [])
|
| 1747 |
}
|
| 1748 |
else:
|
|
|
|
| 3223 |
supports_toggle = config.get("supports_toggle", False)
|
| 3224 |
|
| 3225 |
if supports_toggle:
|
| 3226 |
+
# Hybrid model — default reasoning ON for better extraction quality
|
| 3227 |
+
return gr.update(visible=True, value=True, interactive=True, label="🧠 Enable Reasoning for Extraction")
|
| 3228 |
elif config.get("supports_reasoning", False):
|
| 3229 |
# Thinking-only model (none currently in extraction)
|
| 3230 |
return gr.update(visible=True, value=True, interactive=False, label="🧠 Reasoning Mode (Always On)")
|
|
|
|
| 3514 |
|
| 3515 |
# Format info message
|
| 3516 |
info_msg = f"""**Advanced Mode Complete**
|
| 3517 |
+
- Total Windows: {trace_stats.get('total_windows', 0)}
|
| 3518 |
+
- Items Extracted: {trace_stats.get('total_items_extracted', 0)}
|
| 3519 |
+
- Items After Dedup: {trace_stats.get('total_items_after_dedup', 0)}
|
| 3520 |
+
- Duplicates Removed: {trace_stats.get('total_duplicates_removed', 0)}
|
| 3521 |
+
- Total Time: {trace_stats.get('total_elapsed_seconds', 0):.1f}s"""
|
| 3522 |
|
| 3523 |
+
# Store trace and debug info for download
|
| 3524 |
metrics = {
|
| 3525 |
"mode": "advanced",
|
| 3526 |
"trace_stats": trace_stats,
|
| 3527 |
+
"trace_json": update.get("trace_json", []),
|
| 3528 |
+
"debug_json": update.get("debug_json", {})
|
| 3529 |
}
|
| 3530 |
|
| 3531 |
yield (thinking, summary, info_msg, metrics, "Advanced Mode (3-Model Pipeline)")
|
meeting_summarizer/extraction.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
Advanced Extraction Pipeline
|
| 3 |
|
| 4 |
Provides:
|
|
|
|
| 5 |
1. EMBEDDING_MODELS registry (4 models for deduplication)
|
| 6 |
2. NativeTokenizer - Count tokens without llama.cpp
|
| 7 |
3. EmbeddingModel - Load/compute embeddings
|
|
@@ -11,11 +12,13 @@ Provides:
|
|
| 11 |
7. stream_synthesize_executive_summary - Stage 3: Synthesis
|
| 12 |
"""
|
| 13 |
|
|
|
|
|
|
|
| 14 |
import re
|
| 15 |
import json
|
| 16 |
import time
|
| 17 |
import logging
|
| 18 |
-
from typing import Dict, List, Any, Tuple, Generator, Optional
|
| 19 |
from dataclasses import dataclass
|
| 20 |
import numpy as np
|
| 21 |
from llama_cpp import Llama
|
|
@@ -23,6 +26,233 @@ from llama_cpp import Llama
|
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# ===== EMBEDDING MODELS REGISTRY =====
|
| 27 |
|
| 28 |
EMBEDDING_MODELS = {
|
|
@@ -281,39 +511,172 @@ def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
|
|
| 281 |
|
| 282 |
# ===== HELPER FUNCTIONS =====
|
| 283 |
|
| 284 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
"""
|
| 286 |
Attempt to parse extraction JSON from LLM output.
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
| 288 |
Args:
|
| 289 |
text: Raw LLM output
|
| 290 |
-
|
|
|
|
|
|
|
| 291 |
Returns:
|
| 292 |
-
Parsed dict or None if
|
| 293 |
"""
|
| 294 |
# Remove markdown code blocks
|
| 295 |
text = re.sub(r'```json\s*', '', text)
|
| 296 |
text = re.sub(r'```\s*$', '', text)
|
| 297 |
text = text.strip()
|
| 298 |
-
|
|
|
|
|
|
|
| 299 |
try:
|
| 300 |
data = json.loads(text)
|
| 301 |
-
|
| 302 |
-
# Validate schema
|
| 303 |
-
required_keys = {"action_items", "decisions", "key_points", "open_questions"}
|
| 304 |
-
if not isinstance(data, dict) or not required_keys.issubset(data.keys()):
|
| 305 |
-
return None
|
| 306 |
-
|
| 307 |
-
# Validate all values are lists
|
| 308 |
-
for key in required_keys:
|
| 309 |
-
if not isinstance(data[key], list):
|
| 310 |
-
return None
|
| 311 |
-
|
| 312 |
-
return data
|
| 313 |
-
|
| 314 |
except json.JSONDecodeError:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
return None
|
| 316 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
|
| 318 |
def _sample_llm_response(text: str, max_chars: int = 400) -> str:
|
| 319 |
"""Sample LLM response for trace logging."""
|
|
@@ -478,10 +841,11 @@ def stream_extract_from_window(
|
|
| 478 |
token_count = 0
|
| 479 |
|
| 480 |
try:
|
|
|
|
| 481 |
settings = model_config["inference_settings"]
|
| 482 |
stream = extraction_llm.create_chat_completion(
|
| 483 |
messages=messages,
|
| 484 |
-
max_tokens=
|
| 485 |
temperature=settings["temperature"],
|
| 486 |
top_p=settings["top_p"],
|
| 487 |
top_k=settings["top_k"],
|
|
@@ -522,7 +886,7 @@ def stream_extract_from_window(
|
|
| 522 |
# Calculate metrics
|
| 523 |
elapsed = time.time() - start_time
|
| 524 |
tps = token_count / elapsed if elapsed > 0 else 0
|
| 525 |
-
eta = int((
|
| 526 |
|
| 527 |
# Get item counts
|
| 528 |
items_found = {k: len(v) for k, v in partial_items.items()}
|
|
@@ -559,12 +923,14 @@ def stream_extract_from_window(
|
|
| 559 |
else:
|
| 560 |
json_text = full_response
|
| 561 |
|
| 562 |
-
final_items = _try_parse_extraction_json(json_text)
|
| 563 |
|
| 564 |
if not final_items:
|
|
|
|
|
|
|
| 565 |
error_msg = f"Failed to parse JSON from window {window_id}"
|
| 566 |
debug_output = f"{error_msg}\n\nRaw LLM output:\n{full_response[:1000]}\n"
|
| 567 |
-
logger.
|
| 568 |
print(f"\n{'='*80}\n{debug_output}{'='*80}\n", flush=True)
|
| 569 |
tracer.log_extraction(
|
| 570 |
window_id=window_id,
|
|
@@ -572,7 +938,23 @@ def stream_extract_from_window(
|
|
| 572 |
llm_response=_sample_llm_response(full_response),
|
| 573 |
error=error_msg
|
| 574 |
)
|
| 575 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
|
| 577 |
# Log success
|
| 578 |
tracer.log_extraction(
|
|
@@ -583,6 +965,27 @@ def stream_extract_from_window(
|
|
| 583 |
error=None
|
| 584 |
)
|
| 585 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 586 |
# Final ticker
|
| 587 |
elapsed = time.time() - start_time
|
| 588 |
tps = token_count / elapsed if elapsed > 0 else 0
|
|
@@ -644,26 +1047,41 @@ def deduplicate_items(
|
|
| 644 |
emb = embedding_model.embed(item)
|
| 645 |
embeddings.append(emb)
|
| 646 |
|
| 647 |
-
# Mark duplicates
|
| 648 |
keep_indices = []
|
|
|
|
|
|
|
| 649 |
for i in range(len(items)):
|
| 650 |
is_duplicate = False
|
|
|
|
|
|
|
| 651 |
|
| 652 |
# Compare with all previously kept items
|
| 653 |
for j in keep_indices:
|
| 654 |
similarity = cosine_similarity(embeddings[i], embeddings[j])
|
| 655 |
if similarity >= similarity_threshold:
|
| 656 |
is_duplicate = True
|
|
|
|
|
|
|
| 657 |
break
|
| 658 |
|
| 659 |
if not is_duplicate:
|
| 660 |
keep_indices.append(i)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
|
| 662 |
# Keep only unique items
|
| 663 |
unique_items = [items[i] for i in keep_indices]
|
| 664 |
deduplicated[category] = unique_items
|
| 665 |
|
| 666 |
-
# Log deduplication
|
| 667 |
duplicates_removed = original_count - len(unique_items)
|
| 668 |
tracer.log_deduplication(
|
| 669 |
category=category,
|
|
@@ -671,7 +1089,10 @@ def deduplicate_items(
|
|
| 671 |
deduplicated_count=len(unique_items),
|
| 672 |
duplicates_removed=duplicates_removed,
|
| 673 |
similarity_threshold=similarity_threshold,
|
| 674 |
-
embedding_model=embedding_model.model_key
|
|
|
|
|
|
|
|
|
|
| 675 |
)
|
| 676 |
|
| 677 |
logger.info(f"Dedup {category}: {original_count} → {len(unique_items)} ({duplicates_removed} removed)")
|
|
@@ -772,15 +1193,22 @@ def stream_synthesize_executive_summary(
|
|
| 772 |
else:
|
| 773 |
summary_text = full_summary
|
| 774 |
|
| 775 |
-
# Log synthesis
|
| 776 |
tracer.log_synthesis(
|
| 777 |
synthesis_model=model_config["name"],
|
| 778 |
input_item_counts=item_counts,
|
| 779 |
output_summary=_sample_llm_response(summary_text),
|
| 780 |
thinking=_sample_llm_response(thinking_content) if thinking_content else None,
|
| 781 |
-
error=None
|
|
|
|
|
|
|
|
|
|
| 782 |
)
|
| 783 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 784 |
yield (summary_text, thinking_content, True)
|
| 785 |
|
| 786 |
except Exception as e:
|
|
@@ -789,6 +1217,9 @@ def stream_synthesize_executive_summary(
|
|
| 789 |
input_item_counts=item_counts,
|
| 790 |
output_summary="",
|
| 791 |
thinking=None,
|
| 792 |
-
error=str(e)
|
|
|
|
|
|
|
|
|
|
| 793 |
)
|
| 794 |
raise
|
|
|
|
| 2 |
Advanced Extraction Pipeline
|
| 3 |
|
| 4 |
Provides:
|
| 5 |
+
0. preprocess_transcript - Clean noisy CSV transcripts before extraction
|
| 6 |
1. EMBEDDING_MODELS registry (4 models for deduplication)
|
| 7 |
2. NativeTokenizer - Count tokens without llama.cpp
|
| 8 |
3. EmbeddingModel - Load/compute embeddings
|
|
|
|
| 12 |
7. stream_synthesize_executive_summary - Stage 3: Synthesis
|
| 13 |
"""
|
| 14 |
|
| 15 |
+
import csv
|
| 16 |
+
import io
|
| 17 |
import re
|
| 18 |
import json
|
| 19 |
import time
|
| 20 |
import logging
|
| 21 |
+
from typing import Dict, List, Any, Tuple, Generator, Optional, Set
|
| 22 |
from dataclasses import dataclass
|
| 23 |
import numpy as np
|
| 24 |
from llama_cpp import Llama
|
|
|
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
| 28 |
|
| 29 |
+
# ===== TRANSCRIPT PREPROCESSING =====
|
| 30 |
+
|
| 31 |
+
def preprocess_transcript(transcript_text: str) -> Tuple[str, List[str]]:
|
| 32 |
+
"""
|
| 33 |
+
Clean noisy transcript text before extraction.
|
| 34 |
+
|
| 35 |
+
Handles:
|
| 36 |
+
1. CSV format detection and text column extraction
|
| 37 |
+
2. Speaker label prefixing (for context)
|
| 38 |
+
3. Collapsing consecutive duplicate lines
|
| 39 |
+
4. Collapsing repeated phrases within lines
|
| 40 |
+
5. Filtering lines that are pure noise (no meaningful content)
|
| 41 |
+
|
| 42 |
+
Args:
|
| 43 |
+
transcript_text: Raw transcript (CSV or plain text)
|
| 44 |
+
|
| 45 |
+
Returns:
|
| 46 |
+
Tuple of (cleaned_dialogue_text, noise_phrases_list)
|
| 47 |
+
- cleaned_dialogue_text: Cleaned dialogue text with speaker labels
|
| 48 |
+
- noise_phrases_list: List of noise phrases detected and removed
|
| 49 |
+
"""
|
| 50 |
+
raw_lines = transcript_text.strip().split('\n')
|
| 51 |
+
if not raw_lines:
|
| 52 |
+
return "", []
|
| 53 |
+
|
| 54 |
+
# Step 1: Detect CSV format and extract dialogue
|
| 55 |
+
dialogue_lines = _extract_dialogue_from_csv(raw_lines)
|
| 56 |
+
|
| 57 |
+
# Step 2: Collapse consecutive duplicate lines
|
| 58 |
+
deduped_lines = _collapse_consecutive_duplicates(dialogue_lines)
|
| 59 |
+
|
| 60 |
+
# Step 3: Clean repeated phrases within each line
|
| 61 |
+
cleaned_lines = []
|
| 62 |
+
for line in deduped_lines:
|
| 63 |
+
cleaned = _collapse_repeated_phrases(line)
|
| 64 |
+
if cleaned:
|
| 65 |
+
cleaned_lines.append(cleaned)
|
| 66 |
+
|
| 67 |
+
# Step 4: Filter lines that are pure noise
|
| 68 |
+
meaningful_lines, noise_phrases = _filter_noise_lines(cleaned_lines)
|
| 69 |
+
|
| 70 |
+
result = '\n'.join(meaningful_lines)
|
| 71 |
+
if result != transcript_text.strip():
|
| 72 |
+
original_len = len(transcript_text.strip())
|
| 73 |
+
cleaned_len = len(result)
|
| 74 |
+
reduction = ((original_len - cleaned_len) / original_len * 100) if original_len > 0 else 0
|
| 75 |
+
logger.info(
|
| 76 |
+
f"Transcript preprocessed: {original_len} → {cleaned_len} chars "
|
| 77 |
+
f"({reduction:.0f}% reduction, {len(meaningful_lines)} lines)"
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
return result, list(noise_phrases)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _extract_dialogue_from_csv(lines: List[str]) -> List[str]:
|
| 84 |
+
"""
|
| 85 |
+
Detect CSV format and extract speaker-prefixed dialogue lines.
|
| 86 |
+
|
| 87 |
+
If the first line looks like a CSV header (start,end,speaker,text),
|
| 88 |
+
parse as CSV and return 'SPEAKER_XX: text' lines.
|
| 89 |
+
Otherwise return lines as-is.
|
| 90 |
+
"""
|
| 91 |
+
# Check for CSV header
|
| 92 |
+
first_line = lines[0].strip().lower()
|
| 93 |
+
is_csv = first_line.startswith('start,end,speaker,text') or (
|
| 94 |
+
',' in first_line and any(
|
| 95 |
+
kw in first_line for kw in ['speaker', 'start', 'text']
|
| 96 |
+
)
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
if not is_csv:
|
| 100 |
+
return [l.strip() for l in lines if l.strip()]
|
| 101 |
+
|
| 102 |
+
# Parse CSV, skipping header
|
| 103 |
+
dialogue = []
|
| 104 |
+
csv_text = '\n'.join(lines)
|
| 105 |
+
reader = csv.reader(io.StringIO(csv_text))
|
| 106 |
+
|
| 107 |
+
for i, row in enumerate(reader):
|
| 108 |
+
if i == 0:
|
| 109 |
+
# Skip header row
|
| 110 |
+
continue
|
| 111 |
+
if len(row) >= 4:
|
| 112 |
+
speaker = row[2].strip()
|
| 113 |
+
text = row[3].strip().strip('"')
|
| 114 |
+
if text:
|
| 115 |
+
dialogue.append(f"{speaker}: {text}")
|
| 116 |
+
elif len(row) >= 1:
|
| 117 |
+
# Fallback: take whatever text is there
|
| 118 |
+
text = ','.join(row).strip()
|
| 119 |
+
if text:
|
| 120 |
+
dialogue.append(text)
|
| 121 |
+
|
| 122 |
+
return dialogue
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _collapse_consecutive_duplicates(lines: List[str]) -> List[str]:
|
| 126 |
+
"""Remove consecutive duplicate lines (exact match)."""
|
| 127 |
+
if not lines:
|
| 128 |
+
return []
|
| 129 |
+
|
| 130 |
+
result = [lines[0]]
|
| 131 |
+
for line in lines[1:]:
|
| 132 |
+
if line != result[-1]:
|
| 133 |
+
result.append(line)
|
| 134 |
+
return result
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def _collapse_repeated_phrases(line: str, max_repeats: int = 2) -> str:
|
| 138 |
+
"""
|
| 139 |
+
Collapse repeated phrases within a single line.
|
| 140 |
+
|
| 141 |
+
Detects patterns like 'ABC。ABC。ABC。' and reduces to 'ABC。'
|
| 142 |
+
Works with Chinese punctuation boundaries.
|
| 143 |
+
"""
|
| 144 |
+
if not line:
|
| 145 |
+
return line
|
| 146 |
+
|
| 147 |
+
# Split by Chinese/standard sentence boundaries
|
| 148 |
+
# Keep the delimiter attached to the preceding segment
|
| 149 |
+
segments = re.split(r'(?<=[。!?;\.\!\?\;])', line)
|
| 150 |
+
segments = [s.strip() for s in segments if s.strip()]
|
| 151 |
+
|
| 152 |
+
if len(segments) <= 1:
|
| 153 |
+
return line
|
| 154 |
+
|
| 155 |
+
# Collapse consecutive identical segments
|
| 156 |
+
deduped = [segments[0]]
|
| 157 |
+
repeat_count = 1
|
| 158 |
+
for seg in segments[1:]:
|
| 159 |
+
if seg == deduped[-1]:
|
| 160 |
+
repeat_count += 1
|
| 161 |
+
if repeat_count <= max_repeats:
|
| 162 |
+
deduped.append(seg)
|
| 163 |
+
else:
|
| 164 |
+
deduped.append(seg)
|
| 165 |
+
repeat_count = 1
|
| 166 |
+
|
| 167 |
+
return ''.join(deduped)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _filter_noise_lines(
|
| 171 |
+
lines: List[str],
|
| 172 |
+
min_unique_chars: int = 5,
|
| 173 |
+
noise_phrase_threshold: int = 5
|
| 174 |
+
) -> Tuple[List[str], Set[str]]:
|
| 175 |
+
"""
|
| 176 |
+
Filter out lines that are pure noise (ASR hallucination loops).
|
| 177 |
+
|
| 178 |
+
A line is noise if:
|
| 179 |
+
- It has fewer than min_unique_chars unique non-punctuation characters
|
| 180 |
+
- Its content is entirely composed of a single phrase that repeats
|
| 181 |
+
across the transcript more than noise_phrase_threshold times
|
| 182 |
+
|
| 183 |
+
Args:
|
| 184 |
+
lines: Preprocessed dialogue lines
|
| 185 |
+
min_unique_chars: Minimum unique chars to keep a line
|
| 186 |
+
noise_phrase_threshold: A phrase appearing more than this many times
|
| 187 |
+
across the transcript is considered noise
|
| 188 |
+
|
| 189 |
+
Returns:
|
| 190 |
+
Tuple of (filtered_lines, noise_phrases)
|
| 191 |
+
- filtered_lines: Lines that are not pure noise
|
| 192 |
+
- noise_phrases: Set of noise phrases detected
|
| 193 |
+
"""
|
| 194 |
+
if not lines:
|
| 195 |
+
return [], set()
|
| 196 |
+
|
| 197 |
+
_punct_re = re.compile(
|
| 198 |
+
r'[\s\u3000\uff0c\u3002\uff01\uff1f\u3001\uff1b\uff1a'
|
| 199 |
+
r'\u201c\u201d\u2018\u2019'
|
| 200 |
+
r'\uff08\uff09()\.,!?;:"\'\s]'
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
def strip_speaker(line: str) -> str:
|
| 204 |
+
return re.sub(r'^SPEAKER_\d+:\s*', '', line)
|
| 205 |
+
|
| 206 |
+
def get_content(text: str) -> str:
|
| 207 |
+
return _punct_re.sub('', text)
|
| 208 |
+
|
| 209 |
+
# Step 1: Split each line into sentence-level segments and count
|
| 210 |
+
# how many times each segment appears across the entire transcript.
|
| 211 |
+
# This catches ASR hallucination like "並且請留意下方的資訊欄" which
|
| 212 |
+
# may repeat within a line and across many lines.
|
| 213 |
+
segment_counts: Dict[str, int] = {}
|
| 214 |
+
for line in lines:
|
| 215 |
+
text = strip_speaker(line)
|
| 216 |
+
# Split on Chinese sentence boundaries
|
| 217 |
+
segments = re.split(r'[。!?;\.\!\?\;]', text)
|
| 218 |
+
seen_in_line: set = set()
|
| 219 |
+
for seg in segments:
|
| 220 |
+
seg_content = get_content(seg)
|
| 221 |
+
if len(seg_content) >= 3 and seg_content not in seen_in_line:
|
| 222 |
+
seen_in_line.add(seg_content)
|
| 223 |
+
segment_counts[seg_content] = segment_counts.get(seg_content, 0) + 1
|
| 224 |
+
|
| 225 |
+
# Step 2: Find noise phrases (segments appearing in too many lines)
|
| 226 |
+
noise_phrases = {
|
| 227 |
+
phrase for phrase, count in segment_counts.items()
|
| 228 |
+
if count >= noise_phrase_threshold
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
# Step 3: For each line, check if it's purely noise
|
| 232 |
+
meaningful = []
|
| 233 |
+
for line in lines:
|
| 234 |
+
text = strip_speaker(line)
|
| 235 |
+
content = get_content(text)
|
| 236 |
+
|
| 237 |
+
# Skip if too few unique characters
|
| 238 |
+
if len(set(content)) < min_unique_chars:
|
| 239 |
+
continue
|
| 240 |
+
|
| 241 |
+
# Check if the line is entirely composed of noise phrases.
|
| 242 |
+
# Remove all noise phrase occurrences and see if anything meaningful remains.
|
| 243 |
+
remaining = content
|
| 244 |
+
for noise in noise_phrases:
|
| 245 |
+
remaining = remaining.replace(noise, '')
|
| 246 |
+
|
| 247 |
+
# If nothing meaningful remains after removing noise, skip this line
|
| 248 |
+
if len(remaining.strip()) < min_unique_chars:
|
| 249 |
+
continue
|
| 250 |
+
|
| 251 |
+
meaningful.append(line)
|
| 252 |
+
|
| 253 |
+
return meaningful, noise_phrases
|
| 254 |
+
|
| 255 |
+
|
| 256 |
# ===== EMBEDDING MODELS REGISTRY =====
|
| 257 |
|
| 258 |
EMBEDDING_MODELS = {
|
|
|
|
| 511 |
|
| 512 |
# ===== HELPER FUNCTIONS =====
|
| 513 |
|
| 514 |
+
def _repair_truncated_json(text: str) -> str:
|
| 515 |
+
"""
|
| 516 |
+
Attempt to repair truncated JSON by closing open brackets/strings.
|
| 517 |
+
|
| 518 |
+
Handles cases where max_tokens cuts off the response mid-JSON,
|
| 519 |
+
e.g. a string never closed, an array never closed, etc.
|
| 520 |
+
|
| 521 |
+
Args:
|
| 522 |
+
text: Truncated JSON string
|
| 523 |
+
|
| 524 |
+
Returns:
|
| 525 |
+
Repaired JSON string (best effort)
|
| 526 |
+
"""
|
| 527 |
+
in_string = False
|
| 528 |
+
escape_next = False
|
| 529 |
+
stack = [] # tracks open { and [
|
| 530 |
+
|
| 531 |
+
for char in text:
|
| 532 |
+
if escape_next:
|
| 533 |
+
escape_next = False
|
| 534 |
+
continue
|
| 535 |
+
if char == '\\' and in_string:
|
| 536 |
+
escape_next = True
|
| 537 |
+
continue
|
| 538 |
+
if char == '"' and not escape_next:
|
| 539 |
+
in_string = not in_string
|
| 540 |
+
continue
|
| 541 |
+
if in_string:
|
| 542 |
+
continue
|
| 543 |
+
if char in ('{', '['):
|
| 544 |
+
stack.append(char)
|
| 545 |
+
elif char == '}' and stack and stack[-1] == '{':
|
| 546 |
+
stack.pop()
|
| 547 |
+
elif char == ']' and stack and stack[-1] == '[':
|
| 548 |
+
stack.pop()
|
| 549 |
+
|
| 550 |
+
repair = ""
|
| 551 |
+
if in_string:
|
| 552 |
+
repair += '"'
|
| 553 |
+
for opener in reversed(stack):
|
| 554 |
+
if opener == '[':
|
| 555 |
+
repair += ']'
|
| 556 |
+
elif opener == '{':
|
| 557 |
+
repair += '}'
|
| 558 |
+
|
| 559 |
+
return text + repair
|
| 560 |
+
|
| 561 |
+
|
| 562 |
+
def _normalize_item_to_string(item: Any) -> str:
|
| 563 |
+
"""
|
| 564 |
+
Normalize an extracted item to a plain string.
|
| 565 |
+
|
| 566 |
+
Models may output items as strings or as dicts with various fields
|
| 567 |
+
(e.g. {"assigned_to": "X", "due_date": "Y"}). This flattens them
|
| 568 |
+
to a single descriptive string.
|
| 569 |
+
|
| 570 |
+
Args:
|
| 571 |
+
item: A string or dict from the extraction JSON
|
| 572 |
+
|
| 573 |
+
Returns:
|
| 574 |
+
A plain string representation
|
| 575 |
+
"""
|
| 576 |
+
if isinstance(item, str):
|
| 577 |
+
return item.strip()
|
| 578 |
+
|
| 579 |
+
if isinstance(item, dict):
|
| 580 |
+
parts = []
|
| 581 |
+
for key, value in item.items():
|
| 582 |
+
if value and isinstance(value, str) and value.strip():
|
| 583 |
+
parts.append(f"{key}: {value.strip()}")
|
| 584 |
+
return '; '.join(parts) if parts else str(item)
|
| 585 |
+
|
| 586 |
+
return str(item)
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
def _normalize_extraction_items(data: Dict[str, list]) -> Dict[str, List[str]]:
|
| 590 |
+
"""
|
| 591 |
+
Normalize all extracted items to plain strings.
|
| 592 |
+
|
| 593 |
+
Args:
|
| 594 |
+
data: Parsed extraction dict (values may contain dicts or strings)
|
| 595 |
+
|
| 596 |
+
Returns:
|
| 597 |
+
Dict with all values as lists of strings
|
| 598 |
+
"""
|
| 599 |
+
required_keys = {"action_items", "decisions", "key_points", "open_questions"}
|
| 600 |
+
normalized: Dict[str, List[str]] = {}
|
| 601 |
+
|
| 602 |
+
for key in required_keys:
|
| 603 |
+
items = data.get(key, [])
|
| 604 |
+
if not isinstance(items, list):
|
| 605 |
+
normalized[key] = []
|
| 606 |
+
continue
|
| 607 |
+
normalized[key] = [
|
| 608 |
+
s for s in (_normalize_item_to_string(item) for item in items) if s
|
| 609 |
+
]
|
| 610 |
+
|
| 611 |
+
return normalized
|
| 612 |
+
|
| 613 |
+
|
| 614 |
+
def _try_parse_extraction_json(
    text: str, log_repair: bool = False
) -> Optional[Dict[str, List[str]]]:
    """
    Attempt to parse extraction JSON from LLM output.

    Handles truncated JSON (from max_tokens cutoff) by repairing
    unclosed brackets/strings. Normalizes item formats (dicts -> strings).

    Args:
        text: Raw LLM output
        log_repair: If True, log when repair was needed (use only for
            final parse, not streaming chunks)

    Returns:
        Parsed and normalized dict, or None if unrecoverable
    """
    # Strip markdown code fences before attempting any parse.
    cleaned = re.sub(r'```json\s*', '', text)
    cleaned = re.sub(r'```\s*$', '', cleaned)
    cleaned = cleaned.strip()

    data = None

    # Attempt 1: the output may already be valid JSON.
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        data = None

    # Attempt 2: repair a payload truncated by the max_tokens cutoff.
    if data is None:
        try:
            data = json.loads(_repair_truncated_json(cleaned))
        except json.JSONDecodeError:
            data = None
        else:
            if log_repair:
                logger.info("Successfully parsed JSON after repair (output was truncated)")

    # Attempt 3: skip leading prose, repair from the first '{' onward.
    if data is None:
        brace_pos = cleaned.find('{')
        if brace_pos < 0:
            return None
        try:
            data = json.loads(_repair_truncated_json(cleaned[brace_pos:]))
        except json.JSONDecodeError:
            return None
        if log_repair:
            logger.info("Successfully parsed JSON from substring after repair")

    # Schema validation: must be an object containing all four categories.
    required_keys = {"action_items", "decisions", "key_points", "open_questions"}
    if not isinstance(data, dict) or not required_keys.issubset(data.keys()):
        return None

    # Every category value must itself be a list.
    if any(not isinstance(data[key], list) for key in required_keys):
        return None

    # Flatten dict-shaped items into plain strings.
    return _normalize_extraction_items(data)
|
| 679 |
+
|
| 680 |
|
| 681 |
def _sample_llm_response(text: str, max_chars: int = 400) -> str:
|
| 682 |
"""Sample LLM response for trace logging."""
|
|
|
|
| 841 |
token_count = 0
|
| 842 |
|
| 843 |
try:
|
| 844 |
+
max_gen_tokens = 1024
|
| 845 |
settings = model_config["inference_settings"]
|
| 846 |
stream = extraction_llm.create_chat_completion(
|
| 847 |
messages=messages,
|
| 848 |
+
max_tokens=max_gen_tokens,
|
| 849 |
temperature=settings["temperature"],
|
| 850 |
top_p=settings["top_p"],
|
| 851 |
top_k=settings["top_k"],
|
|
|
|
| 886 |
# Calculate metrics
|
| 887 |
elapsed = time.time() - start_time
|
| 888 |
tps = token_count / elapsed if elapsed > 0 else 0
|
| 889 |
+
eta = int((max_gen_tokens - token_count) / tps) if tps > 0 else 0
|
| 890 |
|
| 891 |
# Get item counts
|
| 892 |
items_found = {k: len(v) for k, v in partial_items.items()}
|
|
|
|
| 923 |
else:
|
| 924 |
json_text = full_response
|
| 925 |
|
| 926 |
+
final_items = _try_parse_extraction_json(json_text, log_repair=True)
|
| 927 |
|
| 928 |
if not final_items:
|
| 929 |
+
# Graceful degradation: log warning but don't crash the pipeline.
|
| 930 |
+
# Other windows may still succeed and produce useful data.
|
| 931 |
error_msg = f"Failed to parse JSON from window {window_id}"
|
| 932 |
debug_output = f"{error_msg}\n\nRaw LLM output:\n{full_response[:1000]}\n"
|
| 933 |
+
logger.warning(debug_output)
|
| 934 |
print(f"\n{'='*80}\n{debug_output}{'='*80}\n", flush=True)
|
| 935 |
tracer.log_extraction(
|
| 936 |
window_id=window_id,
|
|
|
|
| 938 |
llm_response=_sample_llm_response(full_response),
|
| 939 |
error=error_msg
|
| 940 |
)
|
| 941 |
+
# Yield empty result instead of crashing
|
| 942 |
+
empty_items = {
|
| 943 |
+
"action_items": [], "decisions": [],
|
| 944 |
+
"key_points": [], "open_questions": []
|
| 945 |
+
}
|
| 946 |
+
ticker = format_progress_ticker(
|
| 947 |
+
current_window=window_id,
|
| 948 |
+
total_windows=total_windows,
|
| 949 |
+
window_tokens=window.token_count,
|
| 950 |
+
max_tokens=4096,
|
| 951 |
+
items_found={k: 0 for k in empty_items},
|
| 952 |
+
tokens_per_sec=0,
|
| 953 |
+
eta_seconds=0,
|
| 954 |
+
current_snippet=f"⚠️ Window {window_id} parse failed, continuing..."
|
| 955 |
+
)
|
| 956 |
+
yield (ticker, thinking_content, empty_items, True)
|
| 957 |
+
return
|
| 958 |
|
| 959 |
# Log success
|
| 960 |
tracer.log_extraction(
|
|
|
|
| 965 |
error=None
|
| 966 |
)
|
| 967 |
|
| 968 |
+
# Log detailed extraction info for debugging
|
| 969 |
+
json_repaired = False
|
| 970 |
+
parse_attempts = 1
|
| 971 |
+
|
| 972 |
+
# Check if the JSON was repaired by examining the parse function
|
| 973 |
+
# This is a heuristic - the actual parse_attempts would be tracked inside _try_parse_extraction_json
|
| 974 |
+
try:
|
| 975 |
+
json.loads(full_response)
|
| 976 |
+
except json.JSONDecodeError:
|
| 977 |
+
json_repaired = True
|
| 978 |
+
parse_attempts = 2
|
| 979 |
+
|
| 980 |
+
tracer.log_extraction_detail(
|
| 981 |
+
window_id=window_id,
|
| 982 |
+
extracted_items=final_items,
|
| 983 |
+
full_llm_response=full_response,
|
| 984 |
+
full_thinking=thinking_content,
|
| 985 |
+
json_repaired=json_repaired,
|
| 986 |
+
parse_attempts=parse_attempts
|
| 987 |
+
)
|
| 988 |
+
|
| 989 |
# Final ticker
|
| 990 |
elapsed = time.time() - start_time
|
| 991 |
tps = token_count / elapsed if elapsed > 0 else 0
|
|
|
|
| 1047 |
emb = embedding_model.embed(item)
|
| 1048 |
embeddings.append(emb)
|
| 1049 |
|
| 1050 |
+
# Mark duplicates and track duplicate groups
|
| 1051 |
keep_indices = []
|
| 1052 |
+
duplicate_groups = []
|
| 1053 |
+
|
| 1054 |
for i in range(len(items)):
|
| 1055 |
is_duplicate = False
|
| 1056 |
+
duplicate_of_idx = -1
|
| 1057 |
+
similarity_score = 0.0
|
| 1058 |
|
| 1059 |
# Compare with all previously kept items
|
| 1060 |
for j in keep_indices:
|
| 1061 |
similarity = cosine_similarity(embeddings[i], embeddings[j])
|
| 1062 |
if similarity >= similarity_threshold:
|
| 1063 |
is_duplicate = True
|
| 1064 |
+
duplicate_of_idx = j
|
| 1065 |
+
similarity_score = similarity
|
| 1066 |
break
|
| 1067 |
|
| 1068 |
if not is_duplicate:
|
| 1069 |
keep_indices.append(i)
|
| 1070 |
+
else:
|
| 1071 |
+
# Record duplicate group for debugging
|
| 1072 |
+
duplicate_groups.append({
|
| 1073 |
+
"duplicate_item": items[i],
|
| 1074 |
+
"duplicate_index": i,
|
| 1075 |
+
"kept_item": items[duplicate_of_idx],
|
| 1076 |
+
"kept_index": duplicate_of_idx,
|
| 1077 |
+
"similarity": round(similarity_score, 3),
|
| 1078 |
+
})
|
| 1079 |
|
| 1080 |
# Keep only unique items
|
| 1081 |
unique_items = [items[i] for i in keep_indices]
|
| 1082 |
deduplicated[category] = unique_items
|
| 1083 |
|
| 1084 |
+
# Log deduplication with full details
|
| 1085 |
duplicates_removed = original_count - len(unique_items)
|
| 1086 |
tracer.log_deduplication(
|
| 1087 |
category=category,
|
|
|
|
| 1089 |
deduplicated_count=len(unique_items),
|
| 1090 |
duplicates_removed=duplicates_removed,
|
| 1091 |
similarity_threshold=similarity_threshold,
|
| 1092 |
+
embedding_model=embedding_model.model_key,
|
| 1093 |
+
original_items=items,
|
| 1094 |
+
deduplicated_items=unique_items,
|
| 1095 |
+
duplicate_groups=duplicate_groups
|
| 1096 |
)
|
| 1097 |
|
| 1098 |
logger.info(f"Dedup {category}: {original_count} → {len(unique_items)} ({duplicates_removed} removed)")
|
|
|
|
| 1193 |
else:
|
| 1194 |
summary_text = full_summary
|
| 1195 |
|
| 1196 |
+
# Log synthesis with full details
|
| 1197 |
tracer.log_synthesis(
|
| 1198 |
synthesis_model=model_config["name"],
|
| 1199 |
input_item_counts=item_counts,
|
| 1200 |
output_summary=_sample_llm_response(summary_text),
|
| 1201 |
thinking=_sample_llm_response(thinking_content) if thinking_content else None,
|
| 1202 |
+
error=None,
|
| 1203 |
+
input_items=deduplicated_items,
|
| 1204 |
+
system_prompt=system_prompt,
|
| 1205 |
+
user_prompt=user_prompt
|
| 1206 |
)
|
| 1207 |
|
| 1208 |
+
# Also store full outputs in synthesis_details directly
|
| 1209 |
+
tracer.synthesis_details["full_output_summary"] = summary_text
|
| 1210 |
+
tracer.synthesis_details["full_thinking"] = thinking_content
|
| 1211 |
+
|
| 1212 |
yield (summary_text, thinking_content, True)
|
| 1213 |
|
| 1214 |
except Exception as e:
|
|
|
|
| 1217 |
input_item_counts=item_counts,
|
| 1218 |
output_summary="",
|
| 1219 |
thinking=None,
|
| 1220 |
+
error=str(e),
|
| 1221 |
+
input_items=deduplicated_items,
|
| 1222 |
+
system_prompt=system_prompt,
|
| 1223 |
+
user_prompt=user_prompt
|
| 1224 |
)
|
| 1225 |
raise
|
meeting_summarizer/trace.py
CHANGED
|
@@ -27,6 +27,11 @@ class Tracer:
|
|
| 27 |
self.enabled = enabled
|
| 28 |
self.trace_entries: List[Dict[str, Any]] = []
|
| 29 |
self.start_time = time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def log_extraction(
|
| 32 |
self,
|
|
@@ -71,7 +76,10 @@ class Tracer:
|
|
| 71 |
deduplicated_count: int,
|
| 72 |
duplicates_removed: int,
|
| 73 |
similarity_threshold: float,
|
| 74 |
-
embedding_model: str
|
|
|
|
|
|
|
|
|
|
| 75 |
) -> None:
|
| 76 |
"""
|
| 77 |
Log deduplication operation for a category.
|
|
@@ -83,6 +91,9 @@ class Tracer:
|
|
| 83 |
duplicates_removed: Number of duplicates removed
|
| 84 |
similarity_threshold: Similarity threshold used
|
| 85 |
embedding_model: Embedding model used
|
|
|
|
|
|
|
|
|
|
| 86 |
"""
|
| 87 |
if not self.enabled:
|
| 88 |
return
|
|
@@ -101,6 +112,13 @@ class Tracer:
|
|
| 101 |
}
|
| 102 |
|
| 103 |
self.trace_entries.append(entry)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
logger.debug(f"[Trace] Deduplication {category}: {original_count} → {deduplicated_count} ({duplicates_removed} removed)")
|
| 105 |
|
| 106 |
def log_synthesis(
|
|
@@ -109,7 +127,10 @@ class Tracer:
|
|
| 109 |
input_item_counts: Dict[str, int],
|
| 110 |
output_summary: str,
|
| 111 |
thinking: Optional[str] = None,
|
| 112 |
-
error: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
| 113 |
) -> None:
|
| 114 |
"""
|
| 115 |
Log synthesis operation.
|
|
@@ -120,6 +141,9 @@ class Tracer:
|
|
| 120 |
output_summary: Generated summary (sampled)
|
| 121 |
thinking: Thinking/reasoning content (sampled, if applicable)
|
| 122 |
error: Error message if synthesis failed
|
|
|
|
|
|
|
|
|
|
| 123 |
"""
|
| 124 |
if not self.enabled:
|
| 125 |
return
|
|
@@ -137,6 +161,15 @@ class Tracer:
|
|
| 137 |
}
|
| 138 |
|
| 139 |
self.trace_entries.append(entry)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
logger.debug(f"[Trace] Synthesis: {entry['success']}")
|
| 141 |
|
| 142 |
def get_trace_jsonl(self) -> str:
|
|
@@ -163,6 +196,53 @@ class Tracer:
|
|
| 163 |
|
| 164 |
return self.trace_entries
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
def get_summary_stats(self) -> Dict[str, Any]:
|
| 167 |
"""
|
| 168 |
Get summary statistics from trace.
|
|
@@ -195,3 +275,103 @@ class Tracer:
|
|
| 195 |
"synthesis_success": synthesis_entries[0]["success"] if synthesis_entries else False,
|
| 196 |
"total_elapsed_seconds": round(time.time() - self.start_time, 2),
|
| 197 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
self.enabled = enabled
|
| 28 |
self.trace_entries: List[Dict[str, Any]] = []
|
| 29 |
self.start_time = time.time()
|
| 30 |
+
self.preprocessing_info: Dict[str, Any] = {}
|
| 31 |
+
self.windows_info: List[Dict[str, Any]] = []
|
| 32 |
+
self.extraction_details: Dict[int, Dict[str, Any]] = {}
|
| 33 |
+
self.deduplication_details: Dict[str, Dict[str, Any]] = {}
|
| 34 |
+
self.synthesis_details: Dict[str, Any] = {}
|
| 35 |
|
| 36 |
def log_extraction(
|
| 37 |
self,
|
|
|
|
| 76 |
deduplicated_count: int,
|
| 77 |
duplicates_removed: int,
|
| 78 |
similarity_threshold: float,
|
| 79 |
+
embedding_model: str,
|
| 80 |
+
original_items: Optional[List[str]] = None,
|
| 81 |
+
deduplicated_items: Optional[List[str]] = None,
|
| 82 |
+
duplicate_groups: Optional[List[Dict[str, Any]]] = None
|
| 83 |
) -> None:
|
| 84 |
"""
|
| 85 |
Log deduplication operation for a category.
|
|
|
|
| 91 |
duplicates_removed: Number of duplicates removed
|
| 92 |
similarity_threshold: Similarity threshold used
|
| 93 |
embedding_model: Embedding model used
|
| 94 |
+
original_items: Original items list (full)
|
| 95 |
+
deduplicated_items: Deduplicated items list (full)
|
| 96 |
+
duplicate_groups: List of duplicate groups with similarity scores
|
| 97 |
"""
|
| 98 |
if not self.enabled:
|
| 99 |
return
|
|
|
|
| 112 |
}
|
| 113 |
|
| 114 |
self.trace_entries.append(entry)
|
| 115 |
+
|
| 116 |
+
self.deduplication_details[category] = {
|
| 117 |
+
"original_items": original_items or [],
|
| 118 |
+
"deduplicated_items": deduplicated_items or [],
|
| 119 |
+
"duplicate_groups": duplicate_groups or [],
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
logger.debug(f"[Trace] Deduplication {category}: {original_count} → {deduplicated_count} ({duplicates_removed} removed)")
|
| 123 |
|
| 124 |
def log_synthesis(
|
|
|
|
| 127 |
input_item_counts: Dict[str, int],
|
| 128 |
output_summary: str,
|
| 129 |
thinking: Optional[str] = None,
|
| 130 |
+
error: Optional[str] = None,
|
| 131 |
+
input_items: Optional[Dict[str, List[str]]] = None,
|
| 132 |
+
system_prompt: Optional[str] = None,
|
| 133 |
+
user_prompt: Optional[str] = None
|
| 134 |
) -> None:
|
| 135 |
"""
|
| 136 |
Log synthesis operation.
|
|
|
|
| 141 |
output_summary: Generated summary (sampled)
|
| 142 |
thinking: Thinking/reasoning content (sampled, if applicable)
|
| 143 |
error: Error message if synthesis failed
|
| 144 |
+
input_items: Full input items dict
|
| 145 |
+
system_prompt: System prompt used
|
| 146 |
+
user_prompt: User prompt used
|
| 147 |
"""
|
| 148 |
if not self.enabled:
|
| 149 |
return
|
|
|
|
| 161 |
}
|
| 162 |
|
| 163 |
self.trace_entries.append(entry)
|
| 164 |
+
|
| 165 |
+
self.synthesis_details = {
|
| 166 |
+
"input_items": input_items or {},
|
| 167 |
+
"system_prompt": system_prompt or "",
|
| 168 |
+
"user_prompt": user_prompt or "",
|
| 169 |
+
"full_output_summary": output_summary or "",
|
| 170 |
+
"full_thinking": thinking or "",
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
logger.debug(f"[Trace] Synthesis: {entry['success']}")
|
| 174 |
|
| 175 |
def get_trace_jsonl(self) -> str:
|
|
|
|
| 196 |
|
| 197 |
return self.trace_entries
|
| 198 |
|
| 199 |
+
def get_debug_json(self) -> Dict[str, Any]:
|
| 200 |
+
"""
|
| 201 |
+
Get full debug information including detailed logs from all stages.
|
| 202 |
+
|
| 203 |
+
Returns:
|
| 204 |
+
Dict with rich debug information for all 3 stages
|
| 205 |
+
"""
|
| 206 |
+
if not self.enabled:
|
| 207 |
+
return {}
|
| 208 |
+
|
| 209 |
+
return {
|
| 210 |
+
"preprocessing": self.preprocessing_info,
|
| 211 |
+
"windows": self.windows_info,
|
| 212 |
+
"extraction": {
|
| 213 |
+
"details": self.extraction_details,
|
| 214 |
+
"summary": {
|
| 215 |
+
"total_windows": len(self.windows_info),
|
| 216 |
+
"total_items": sum(
|
| 217 |
+
sum(d["item_counts"].values())
|
| 218 |
+
for d in self.extraction_details.values()
|
| 219 |
+
),
|
| 220 |
+
"windows_with_repaired_json": sum(
|
| 221 |
+
1 for d in self.extraction_details.values()
|
| 222 |
+
if d.get("json_repaired", False)
|
| 223 |
+
),
|
| 224 |
+
}
|
| 225 |
+
},
|
| 226 |
+
"deduplication": {
|
| 227 |
+
"details": self.deduplication_details,
|
| 228 |
+
"summary": {
|
| 229 |
+
"total_original_items": sum(
|
| 230 |
+
len(d.get("original_items", []))
|
| 231 |
+
for d in self.deduplication_details.values()
|
| 232 |
+
),
|
| 233 |
+
"total_deduplicated_items": sum(
|
| 234 |
+
len(d.get("deduplicated_items", []))
|
| 235 |
+
for d in self.deduplication_details.values()
|
| 236 |
+
),
|
| 237 |
+
"total_duplicates_removed": sum(
|
| 238 |
+
len(d.get("original_items", [])) - len(d.get("deduplicated_items", []))
|
| 239 |
+
for d in self.deduplication_details.values()
|
| 240 |
+
),
|
| 241 |
+
}
|
| 242 |
+
},
|
| 243 |
+
"synthesis": self.synthesis_details,
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
def get_summary_stats(self) -> Dict[str, Any]:
|
| 247 |
"""
|
| 248 |
Get summary statistics from trace.
|
|
|
|
| 275 |
"synthesis_success": synthesis_entries[0]["success"] if synthesis_entries else False,
|
| 276 |
"total_elapsed_seconds": round(time.time() - self.start_time, 2),
|
| 277 |
}
|
| 278 |
+
|
| 279 |
+
def log_preprocessing(
|
| 280 |
+
self,
|
| 281 |
+
original_line_count: int,
|
| 282 |
+
cleaned_line_count: int,
|
| 283 |
+
original_char_count: int,
|
| 284 |
+
cleaned_char_count: int,
|
| 285 |
+
noise_phrases_removed: List[str],
|
| 286 |
+
detection_method: str = "segment_level"
|
| 287 |
+
) -> None:
|
| 288 |
+
"""
|
| 289 |
+
Log transcript preprocessing information.
|
| 290 |
+
|
| 291 |
+
Args:
|
| 292 |
+
original_line_count: Number of lines before preprocessing
|
| 293 |
+
cleaned_line_count: Number of lines after preprocessing
|
| 294 |
+
original_char_count: Character count before preprocessing
|
| 295 |
+
cleaned_char_count: Character count after preprocessing
|
| 296 |
+
noise_phrases_removed: List of noise phrases detected and removed
|
| 297 |
+
detection_method: Method used for noise detection
|
| 298 |
+
"""
|
| 299 |
+
if not self.enabled:
|
| 300 |
+
return
|
| 301 |
+
|
| 302 |
+
self.preprocessing_info = {
|
| 303 |
+
"original_line_count": original_line_count,
|
| 304 |
+
"cleaned_line_count": cleaned_line_count,
|
| 305 |
+
"original_char_count": original_char_count,
|
| 306 |
+
"cleaned_char_count": cleaned_char_count,
|
| 307 |
+
"lines_removed": original_line_count - cleaned_line_count,
|
| 308 |
+
"chars_removed": original_char_count - cleaned_char_count,
|
| 309 |
+
"line_reduction_pct": round((1 - cleaned_line_count / original_line_count) * 100, 1) if original_line_count > 0 else 0.0,
|
| 310 |
+
"char_reduction_pct": round((1 - cleaned_char_count / original_char_count) * 100, 1) if original_char_count > 0 else 0.0,
|
| 311 |
+
"noise_phrases_removed": noise_phrases_removed,
|
| 312 |
+
"detection_method": detection_method,
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
logger.debug(f"[Trace] Preprocessing: {original_line_count} → {cleaned_line_count} lines ({self.preprocessing_info['line_reduction_pct']}% reduction)")
|
| 316 |
+
|
| 317 |
+
def log_window(
|
| 318 |
+
self,
|
| 319 |
+
window_id: int,
|
| 320 |
+
content: str,
|
| 321 |
+
token_count: int,
|
| 322 |
+
start_turn: int,
|
| 323 |
+
end_turn: int
|
| 324 |
+
) -> None:
|
| 325 |
+
"""
|
| 326 |
+
Log window information.
|
| 327 |
+
|
| 328 |
+
Args:
|
| 329 |
+
window_id: Window identifier
|
| 330 |
+
content: Window content (may be truncated for storage)
|
| 331 |
+
token_count: Number of tokens in window
|
| 332 |
+
start_turn: Starting line number
|
| 333 |
+
end_turn: Ending line number
|
| 334 |
+
"""
|
| 335 |
+
if not self.enabled:
|
| 336 |
+
return
|
| 337 |
+
|
| 338 |
+
self.windows_info.append({
|
| 339 |
+
"window_id": window_id,
|
| 340 |
+
"content": content,
|
| 341 |
+
"token_count": token_count,
|
| 342 |
+
"start_turn": start_turn,
|
| 343 |
+
"end_turn": end_turn,
|
| 344 |
+
"line_count": end_turn - start_turn + 1,
|
| 345 |
+
})
|
| 346 |
+
|
| 347 |
+
def log_extraction_detail(
|
| 348 |
+
self,
|
| 349 |
+
window_id: int,
|
| 350 |
+
extracted_items: Dict[str, List[str]],
|
| 351 |
+
full_llm_response: str,
|
| 352 |
+
full_thinking: Optional[str],
|
| 353 |
+
json_repaired: bool,
|
| 354 |
+
parse_attempts: int
|
| 355 |
+
) -> None:
|
| 356 |
+
"""
|
| 357 |
+
Log detailed extraction information for a window.
|
| 358 |
+
|
| 359 |
+
Args:
|
| 360 |
+
window_id: Window identifier
|
| 361 |
+
extracted_items: Full extracted items dict
|
| 362 |
+
full_llm_response: Complete LLM response
|
| 363 |
+
full_thinking: Complete thinking content (if any)
|
| 364 |
+
json_repaired: Whether JSON was repaired during parsing
|
| 365 |
+
parse_attempts: Number of parse attempts required
|
| 366 |
+
"""
|
| 367 |
+
if not self.enabled:
|
| 368 |
+
return
|
| 369 |
+
|
| 370 |
+
self.extraction_details[window_id] = {
|
| 371 |
+
"extracted_items": extracted_items,
|
| 372 |
+
"full_llm_response": full_llm_response,
|
| 373 |
+
"full_thinking": full_thinking,
|
| 374 |
+
"json_repaired": json_repaired,
|
| 375 |
+
"parse_attempts": parse_attempts,
|
| 376 |
+
"item_counts": {k: len(v) for k, v in extracted_items.items()},
|
| 377 |
+
}
|