Luigi committed on
Commit
7c9ccb7
·
1 Parent(s): 27363be

feat: enrich JSON export with rich debug info for all 3 pipeline stages

Browse files

Add debug_info section to Advanced Mode JSON export with:
- Preprocessing: stats and noise phrases removed
- Extraction: per-window full LLM responses and parse details
- Deduplication: duplicate groups with similarity scores
- Synthesis: full input items and prompts

Enables debugging pipeline failures causing bad final summaries.

Files changed (3) hide show
  1. app.py +50 -13
  2. meeting_summarizer/extraction.py +461 -30
  3. meeting_summarizer/trace.py +182 -2
app.py CHANGED
@@ -1453,7 +1453,7 @@ def summarize_advanced(
1453
  """
1454
  from meeting_summarizer.trace import Tracer
1455
  from meeting_summarizer.extraction import (
1456
- EmbeddingModel, Window,
1457
  stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
1458
  )
1459
 
@@ -1482,8 +1482,23 @@ def summarize_advanced(
1482
  """Count tokens using the extraction model's tokenizer."""
1483
  return len(extraction_llm.tokenize(text.encode('utf-8')))
1484
 
1485
- # Create windows from transcript (simple split by turns for now)
1486
- # In production, this would be more sophisticated
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1487
  lines = [l.strip() for l in transcript.split('\n') if l.strip()]
1488
 
1489
  # Reserve tokens for system prompt (~200) and output (~1024)
@@ -1508,6 +1523,14 @@ def summarize_advanced(
1508
  end_turn=line_num - 1,
1509
  token_count=current_tokens
1510
  ))
 
 
 
 
 
 
 
 
1511
  window_id += 1
1512
 
1513
  # Start new window with overlap
@@ -1528,6 +1551,14 @@ def summarize_advanced(
1528
  end_turn=len(lines) - 1,
1529
  token_count=current_tokens
1530
  ))
 
 
 
 
 
 
 
 
1531
 
1532
  total_windows = len(windows)
1533
  yield {"stage": "extraction", "ticker": f"Created {total_windows} windows", "thinking": "", "summary": ""}
@@ -1640,6 +1671,7 @@ def summarize_advanced(
1640
 
1641
  # Get trace stats and add model names for download JSON
1642
  trace_stats = tracer.get_summary_stats()
 
1643
  ext_config = get_model_config(extraction_model_key, "extraction")
1644
  syn_config = get_model_config(synthesis_model_key, "synthesis")
1645
  trace_stats["extraction_model"] = ext_config.get("name", extraction_model_key)
@@ -1652,7 +1684,8 @@ def summarize_advanced(
1652
  "thinking": final_thinking,
1653
  "summary": final_summary,
1654
  "trace_stats": trace_stats,
1655
- "trace_json": tracer.get_trace_json()
 
1656
  }
1657
 
1658
  except Exception as e:
@@ -1685,6 +1718,8 @@ def download_summary_json(summary, thinking, model_key, language, metrics):
1685
  if is_advanced:
1686
  # Advanced Mode: embed trace data and use pipeline model names
1687
  trace_stats = metrics.get("trace_stats", {})
 
 
1688
  data = {
1689
  "metadata": {
1690
  "generated_at": datetime.now().isoformat(),
@@ -1707,6 +1742,7 @@ def download_summary_json(summary, thinking, model_key, language, metrics):
1707
  "synthesis_success": trace_stats.get("synthesis_success", False),
1708
  "total_elapsed_seconds": trace_stats.get("total_elapsed_seconds", 0),
1709
  },
 
1710
  "trace": metrics.get("trace_json", [])
1711
  }
1712
  else:
@@ -3187,8 +3223,8 @@ def create_interface():
3187
  supports_toggle = config.get("supports_toggle", False)
3188
 
3189
  if supports_toggle:
3190
- # Hybrid model
3191
- return gr.update(visible=True, value=False, interactive=True, label="🧠 Enable Reasoning for Extraction")
3192
  elif config.get("supports_reasoning", False):
3193
  # Thinking-only model (none currently in extraction)
3194
  return gr.update(visible=True, value=True, interactive=False, label="🧠 Reasoning Mode (Always On)")
@@ -3478,17 +3514,18 @@ def create_interface():
3478
 
3479
  # Format info message
3480
  info_msg = f"""**Advanced Mode Complete**
3481
- - Total Windows: {trace_stats.get('total_windows', 0)}
3482
- - Items Extracted: {trace_stats.get('total_items_extracted', 0)}
3483
- - Items After Dedup: {trace_stats.get('total_items_after_dedup', 0)}
3484
- - Duplicates Removed: {trace_stats.get('total_duplicates_removed', 0)}
3485
- - Total Time: {trace_stats.get('total_elapsed_seconds', 0):.1f}s"""
3486
 
3487
- # Store trace for download
3488
  metrics = {
3489
  "mode": "advanced",
3490
  "trace_stats": trace_stats,
3491
- "trace_json": update.get("trace_json", [])
 
3492
  }
3493
 
3494
  yield (thinking, summary, info_msg, metrics, "Advanced Mode (3-Model Pipeline)")
 
1453
  """
1454
  from meeting_summarizer.trace import Tracer
1455
  from meeting_summarizer.extraction import (
1456
+ EmbeddingModel, Window, preprocess_transcript,
1457
  stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
1458
  )
1459
 
 
1482
  """Count tokens using the extraction model's tokenizer."""
1483
  return len(extraction_llm.tokenize(text.encode('utf-8')))
1484
 
1485
+ # Preprocess transcript: strip CSV format, remove noise/repetition
1486
+ raw_line_count = len(transcript.split('\n'))
1487
+ raw_char_count = len(transcript)
1488
+ transcript, noise_phrases = preprocess_transcript(transcript)
1489
+ cleaned_line_count = len(transcript.split('\n'))
1490
+ cleaned_char_count = len(transcript)
1491
+
1492
+ # Log preprocessing info to tracer
1493
+ tracer.log_preprocessing(
1494
+ original_line_count=raw_line_count,
1495
+ cleaned_line_count=cleaned_line_count,
1496
+ original_char_count=raw_char_count,
1497
+ cleaned_char_count=cleaned_char_count,
1498
+ noise_phrases_removed=noise_phrases
1499
+ )
1500
+
1501
+ # Create windows from preprocessed transcript
1502
  lines = [l.strip() for l in transcript.split('\n') if l.strip()]
1503
 
1504
  # Reserve tokens for system prompt (~200) and output (~1024)
 
1523
  end_turn=line_num - 1,
1524
  token_count=current_tokens
1525
  ))
1526
+ # Log window to tracer for debugging
1527
+ tracer.log_window(
1528
+ window_id=window_id,
1529
+ content=window_content,
1530
+ token_count=current_tokens,
1531
+ start_turn=line_num - len(current_window),
1532
+ end_turn=line_num - 1
1533
+ )
1534
  window_id += 1
1535
 
1536
  # Start new window with overlap
 
1551
  end_turn=len(lines) - 1,
1552
  token_count=current_tokens
1553
  ))
1554
+ # Log window to tracer for debugging
1555
+ tracer.log_window(
1556
+ window_id=window_id,
1557
+ content=window_content,
1558
+ token_count=current_tokens,
1559
+ start_turn=len(lines) - len(current_window),
1560
+ end_turn=len(lines) - 1
1561
+ )
1562
 
1563
  total_windows = len(windows)
1564
  yield {"stage": "extraction", "ticker": f"Created {total_windows} windows", "thinking": "", "summary": ""}
 
1671
 
1672
  # Get trace stats and add model names for download JSON
1673
  trace_stats = tracer.get_summary_stats()
1674
+ debug_json = tracer.get_debug_json()
1675
  ext_config = get_model_config(extraction_model_key, "extraction")
1676
  syn_config = get_model_config(synthesis_model_key, "synthesis")
1677
  trace_stats["extraction_model"] = ext_config.get("name", extraction_model_key)
 
1684
  "thinking": final_thinking,
1685
  "summary": final_summary,
1686
  "trace_stats": trace_stats,
1687
+ "trace_json": tracer.get_trace_json(),
1688
+ "debug_json": debug_json
1689
  }
1690
 
1691
  except Exception as e:
 
1718
  if is_advanced:
1719
  # Advanced Mode: embed trace data and use pipeline model names
1720
  trace_stats = metrics.get("trace_stats", {})
1721
+ debug_info = metrics.get("debug_json", {})
1722
+
1723
  data = {
1724
  "metadata": {
1725
  "generated_at": datetime.now().isoformat(),
 
1742
  "synthesis_success": trace_stats.get("synthesis_success", False),
1743
  "total_elapsed_seconds": trace_stats.get("total_elapsed_seconds", 0),
1744
  },
1745
+ "debug_info": debug_info,
1746
  "trace": metrics.get("trace_json", [])
1747
  }
1748
  else:
 
3223
  supports_toggle = config.get("supports_toggle", False)
3224
 
3225
  if supports_toggle:
3226
+ # Hybrid model — default reasoning ON for better extraction quality
3227
+ return gr.update(visible=True, value=True, interactive=True, label="🧠 Enable Reasoning for Extraction")
3228
  elif config.get("supports_reasoning", False):
3229
  # Thinking-only model (none currently in extraction)
3230
  return gr.update(visible=True, value=True, interactive=False, label="🧠 Reasoning Mode (Always On)")
 
3514
 
3515
  # Format info message
3516
  info_msg = f"""**Advanced Mode Complete**
3517
+ - Total Windows: {trace_stats.get('total_windows', 0)}
3518
+ - Items Extracted: {trace_stats.get('total_items_extracted', 0)}
3519
+ - Items After Dedup: {trace_stats.get('total_items_after_dedup', 0)}
3520
+ - Duplicates Removed: {trace_stats.get('total_duplicates_removed', 0)}
3521
+ - Total Time: {trace_stats.get('total_elapsed_seconds', 0):.1f}s"""
3522
 
3523
+ # Store trace and debug info for download
3524
  metrics = {
3525
  "mode": "advanced",
3526
  "trace_stats": trace_stats,
3527
+ "trace_json": update.get("trace_json", []),
3528
+ "debug_json": update.get("debug_json", {})
3529
  }
3530
 
3531
  yield (thinking, summary, info_msg, metrics, "Advanced Mode (3-Model Pipeline)")
meeting_summarizer/extraction.py CHANGED
@@ -2,6 +2,7 @@
2
  Advanced Extraction Pipeline
3
 
4
  Provides:
 
5
  1. EMBEDDING_MODELS registry (4 models for deduplication)
6
  2. NativeTokenizer - Count tokens without llama.cpp
7
  3. EmbeddingModel - Load/compute embeddings
@@ -11,11 +12,13 @@ Provides:
11
  7. stream_synthesize_executive_summary - Stage 3: Synthesis
12
  """
13
 
 
 
14
  import re
15
  import json
16
  import time
17
  import logging
18
- from typing import Dict, List, Any, Tuple, Generator, Optional
19
  from dataclasses import dataclass
20
  import numpy as np
21
  from llama_cpp import Llama
@@ -23,6 +26,233 @@ from llama_cpp import Llama
23
  logger = logging.getLogger(__name__)
24
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  # ===== EMBEDDING MODELS REGISTRY =====
27
 
28
  EMBEDDING_MODELS = {
@@ -281,39 +511,172 @@ def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
281
 
282
  # ===== HELPER FUNCTIONS =====
283
 
284
- def _try_parse_extraction_json(text: str) -> Optional[Dict[str, List[str]]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  """
286
  Attempt to parse extraction JSON from LLM output.
287
-
 
 
 
288
  Args:
289
  text: Raw LLM output
290
-
 
 
291
  Returns:
292
- Parsed dict or None if invalid
293
  """
294
  # Remove markdown code blocks
295
  text = re.sub(r'```json\s*', '', text)
296
  text = re.sub(r'```\s*$', '', text)
297
  text = text.strip()
298
-
 
 
299
  try:
300
  data = json.loads(text)
301
-
302
- # Validate schema
303
- required_keys = {"action_items", "decisions", "key_points", "open_questions"}
304
- if not isinstance(data, dict) or not required_keys.issubset(data.keys()):
305
- return None
306
-
307
- # Validate all values are lists
308
- for key in required_keys:
309
- if not isinstance(data[key], list):
310
- return None
311
-
312
- return data
313
-
314
  except json.JSONDecodeError:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  return None
316
 
 
 
 
 
 
 
 
 
317
 
318
  def _sample_llm_response(text: str, max_chars: int = 400) -> str:
319
  """Sample LLM response for trace logging."""
@@ -478,10 +841,11 @@ def stream_extract_from_window(
478
  token_count = 0
479
 
480
  try:
 
481
  settings = model_config["inference_settings"]
482
  stream = extraction_llm.create_chat_completion(
483
  messages=messages,
484
- max_tokens=512,
485
  temperature=settings["temperature"],
486
  top_p=settings["top_p"],
487
  top_k=settings["top_k"],
@@ -522,7 +886,7 @@ def stream_extract_from_window(
522
  # Calculate metrics
523
  elapsed = time.time() - start_time
524
  tps = token_count / elapsed if elapsed > 0 else 0
525
- eta = int((1024 - token_count) / tps) if tps > 0 else 0
526
 
527
  # Get item counts
528
  items_found = {k: len(v) for k, v in partial_items.items()}
@@ -559,12 +923,14 @@ def stream_extract_from_window(
559
  else:
560
  json_text = full_response
561
 
562
- final_items = _try_parse_extraction_json(json_text)
563
 
564
  if not final_items:
 
 
565
  error_msg = f"Failed to parse JSON from window {window_id}"
566
  debug_output = f"{error_msg}\n\nRaw LLM output:\n{full_response[:1000]}\n"
567
- logger.error(debug_output)
568
  print(f"\n{'='*80}\n{debug_output}{'='*80}\n", flush=True)
569
  tracer.log_extraction(
570
  window_id=window_id,
@@ -572,7 +938,23 @@ def stream_extract_from_window(
572
  llm_response=_sample_llm_response(full_response),
573
  error=error_msg
574
  )
575
- raise ValueError(error_msg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
 
577
  # Log success
578
  tracer.log_extraction(
@@ -583,6 +965,27 @@ def stream_extract_from_window(
583
  error=None
584
  )
585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
  # Final ticker
587
  elapsed = time.time() - start_time
588
  tps = token_count / elapsed if elapsed > 0 else 0
@@ -644,26 +1047,41 @@ def deduplicate_items(
644
  emb = embedding_model.embed(item)
645
  embeddings.append(emb)
646
 
647
- # Mark duplicates
648
  keep_indices = []
 
 
649
  for i in range(len(items)):
650
  is_duplicate = False
 
 
651
 
652
  # Compare with all previously kept items
653
  for j in keep_indices:
654
  similarity = cosine_similarity(embeddings[i], embeddings[j])
655
  if similarity >= similarity_threshold:
656
  is_duplicate = True
 
 
657
  break
658
 
659
  if not is_duplicate:
660
  keep_indices.append(i)
 
 
 
 
 
 
 
 
 
661
 
662
  # Keep only unique items
663
  unique_items = [items[i] for i in keep_indices]
664
  deduplicated[category] = unique_items
665
 
666
- # Log deduplication
667
  duplicates_removed = original_count - len(unique_items)
668
  tracer.log_deduplication(
669
  category=category,
@@ -671,7 +1089,10 @@ def deduplicate_items(
671
  deduplicated_count=len(unique_items),
672
  duplicates_removed=duplicates_removed,
673
  similarity_threshold=similarity_threshold,
674
- embedding_model=embedding_model.model_key
 
 
 
675
  )
676
 
677
  logger.info(f"Dedup {category}: {original_count} → {len(unique_items)} ({duplicates_removed} removed)")
@@ -772,15 +1193,22 @@ def stream_synthesize_executive_summary(
772
  else:
773
  summary_text = full_summary
774
 
775
- # Log synthesis
776
  tracer.log_synthesis(
777
  synthesis_model=model_config["name"],
778
  input_item_counts=item_counts,
779
  output_summary=_sample_llm_response(summary_text),
780
  thinking=_sample_llm_response(thinking_content) if thinking_content else None,
781
- error=None
 
 
 
782
  )
783
 
 
 
 
 
784
  yield (summary_text, thinking_content, True)
785
 
786
  except Exception as e:
@@ -789,6 +1217,9 @@ def stream_synthesize_executive_summary(
789
  input_item_counts=item_counts,
790
  output_summary="",
791
  thinking=None,
792
- error=str(e)
 
 
 
793
  )
794
  raise
 
2
  Advanced Extraction Pipeline
3
 
4
  Provides:
5
+ 0. preprocess_transcript - Clean noisy CSV transcripts before extraction
6
  1. EMBEDDING_MODELS registry (4 models for deduplication)
7
  2. NativeTokenizer - Count tokens without llama.cpp
8
  3. EmbeddingModel - Load/compute embeddings
 
12
  7. stream_synthesize_executive_summary - Stage 3: Synthesis
13
  """
14
 
15
+ import csv
16
+ import io
17
  import re
18
  import json
19
  import time
20
  import logging
21
+ from typing import Dict, List, Any, Tuple, Generator, Optional, Set
22
  from dataclasses import dataclass
23
  import numpy as np
24
  from llama_cpp import Llama
 
26
  logger = logging.getLogger(__name__)
27
 
28
 
29
+ # ===== TRANSCRIPT PREPROCESSING =====
30
+
31
def preprocess_transcript(transcript_text: str) -> Tuple[str, List[str]]:
    """
    Clean a noisy raw transcript before the extraction stage.

    Pipeline: (1) detect CSV exports and pull out speaker-labelled
    dialogue, (2) drop consecutive duplicate lines, (3) collapse
    phrases repeated within a line, (4) remove lines that consist
    only of noise (ASR hallucination loops).

    Args:
        transcript_text: Raw transcript (CSV or plain text)

    Returns:
        Tuple of (cleaned_dialogue_text, noise_phrases_list)
        - cleaned_dialogue_text: Cleaned dialogue text with speaker labels
        - noise_phrases_list: List of noise phrases detected and removed
    """
    source_lines = transcript_text.strip().split('\n')
    if not source_lines:
        return "", []

    # Stage 1: CSV detection / dialogue extraction
    dialogue = _extract_dialogue_from_csv(source_lines)

    # Stage 2: drop exact consecutive repeats
    without_dupes = _collapse_consecutive_duplicates(dialogue)

    # Stage 3: collapse phrase repetition inside each line, keeping
    # only lines that still have content afterwards
    collapsed = [
        c for c in (_collapse_repeated_phrases(l) for l in without_dupes) if c
    ]

    # Stage 4: remove pure-noise lines
    kept_lines, noise_phrases = _filter_noise_lines(collapsed)

    cleaned = '\n'.join(kept_lines)
    stripped_original = transcript_text.strip()
    if cleaned != stripped_original:
        before = len(stripped_original)
        after = len(cleaned)
        pct = ((before - after) / before * 100) if before > 0 else 0
        logger.info(
            f"Transcript preprocessed: {before} → {after} chars "
            f"({pct:.0f}% reduction, {len(kept_lines)} lines)"
        )

    return cleaned, list(noise_phrases)
81
+
82
+
83
+ def _extract_dialogue_from_csv(lines: List[str]) -> List[str]:
84
+ """
85
+ Detect CSV format and extract speaker-prefixed dialogue lines.
86
+
87
+ If the first line looks like a CSV header (start,end,speaker,text),
88
+ parse as CSV and return 'SPEAKER_XX: text' lines.
89
+ Otherwise return lines as-is.
90
+ """
91
+ # Check for CSV header
92
+ first_line = lines[0].strip().lower()
93
+ is_csv = first_line.startswith('start,end,speaker,text') or (
94
+ ',' in first_line and any(
95
+ kw in first_line for kw in ['speaker', 'start', 'text']
96
+ )
97
+ )
98
+
99
+ if not is_csv:
100
+ return [l.strip() for l in lines if l.strip()]
101
+
102
+ # Parse CSV, skipping header
103
+ dialogue = []
104
+ csv_text = '\n'.join(lines)
105
+ reader = csv.reader(io.StringIO(csv_text))
106
+
107
+ for i, row in enumerate(reader):
108
+ if i == 0:
109
+ # Skip header row
110
+ continue
111
+ if len(row) >= 4:
112
+ speaker = row[2].strip()
113
+ text = row[3].strip().strip('"')
114
+ if text:
115
+ dialogue.append(f"{speaker}: {text}")
116
+ elif len(row) >= 1:
117
+ # Fallback: take whatever text is there
118
+ text = ','.join(row).strip()
119
+ if text:
120
+ dialogue.append(text)
121
+
122
+ return dialogue
123
+
124
+
125
+ def _collapse_consecutive_duplicates(lines: List[str]) -> List[str]:
126
+ """Remove consecutive duplicate lines (exact match)."""
127
+ if not lines:
128
+ return []
129
+
130
+ result = [lines[0]]
131
+ for line in lines[1:]:
132
+ if line != result[-1]:
133
+ result.append(line)
134
+ return result
135
+
136
+
137
+ def _collapse_repeated_phrases(line: str, max_repeats: int = 2) -> str:
138
+ """
139
+ Collapse repeated phrases within a single line.
140
+
141
+ Detects patterns like 'ABC。ABC。ABC。' and reduces to 'ABC。'
142
+ Works with Chinese punctuation boundaries.
143
+ """
144
+ if not line:
145
+ return line
146
+
147
+ # Split by Chinese/standard sentence boundaries
148
+ # Keep the delimiter attached to the preceding segment
149
+ segments = re.split(r'(?<=[。!?;\.\!\?\;])', line)
150
+ segments = [s.strip() for s in segments if s.strip()]
151
+
152
+ if len(segments) <= 1:
153
+ return line
154
+
155
+ # Collapse consecutive identical segments
156
+ deduped = [segments[0]]
157
+ repeat_count = 1
158
+ for seg in segments[1:]:
159
+ if seg == deduped[-1]:
160
+ repeat_count += 1
161
+ if repeat_count <= max_repeats:
162
+ deduped.append(seg)
163
+ else:
164
+ deduped.append(seg)
165
+ repeat_count = 1
166
+
167
+ return ''.join(deduped)
168
+
169
+
170
+ def _filter_noise_lines(
171
+ lines: List[str],
172
+ min_unique_chars: int = 5,
173
+ noise_phrase_threshold: int = 5
174
+ ) -> Tuple[List[str], Set[str]]:
175
+ """
176
+ Filter out lines that are pure noise (ASR hallucination loops).
177
+
178
+ A line is noise if:
179
+ - It has fewer than min_unique_chars unique non-punctuation characters
180
+ - Its content is entirely composed of a single phrase that repeats
181
+ across the transcript more than noise_phrase_threshold times
182
+
183
+ Args:
184
+ lines: Preprocessed dialogue lines
185
+ min_unique_chars: Minimum unique chars to keep a line
186
+ noise_phrase_threshold: A phrase appearing more than this many times
187
+ across the transcript is considered noise
188
+
189
+ Returns:
190
+ Tuple of (filtered_lines, noise_phrases)
191
+ - filtered_lines: Lines that are not pure noise
192
+ - noise_phrases: Set of noise phrases detected
193
+ """
194
+ if not lines:
195
+ return [], set()
196
+
197
+ _punct_re = re.compile(
198
+ r'[\s\u3000\uff0c\u3002\uff01\uff1f\u3001\uff1b\uff1a'
199
+ r'\u201c\u201d\u2018\u2019'
200
+ r'\uff08\uff09()\.,!?;:"\'\s]'
201
+ )
202
+
203
+ def strip_speaker(line: str) -> str:
204
+ return re.sub(r'^SPEAKER_\d+:\s*', '', line)
205
+
206
+ def get_content(text: str) -> str:
207
+ return _punct_re.sub('', text)
208
+
209
+ # Step 1: Split each line into sentence-level segments and count
210
+ # how many times each segment appears across the entire transcript.
211
+ # This catches ASR hallucination like "並且請留意下方的資訊欄" which
212
+ # may repeat within a line and across many lines.
213
+ segment_counts: Dict[str, int] = {}
214
+ for line in lines:
215
+ text = strip_speaker(line)
216
+ # Split on Chinese sentence boundaries
217
+ segments = re.split(r'[。!?;\.\!\?\;]', text)
218
+ seen_in_line: set = set()
219
+ for seg in segments:
220
+ seg_content = get_content(seg)
221
+ if len(seg_content) >= 3 and seg_content not in seen_in_line:
222
+ seen_in_line.add(seg_content)
223
+ segment_counts[seg_content] = segment_counts.get(seg_content, 0) + 1
224
+
225
+ # Step 2: Find noise phrases (segments appearing in too many lines)
226
+ noise_phrases = {
227
+ phrase for phrase, count in segment_counts.items()
228
+ if count >= noise_phrase_threshold
229
+ }
230
+
231
+ # Step 3: For each line, check if it's purely noise
232
+ meaningful = []
233
+ for line in lines:
234
+ text = strip_speaker(line)
235
+ content = get_content(text)
236
+
237
+ # Skip if too few unique characters
238
+ if len(set(content)) < min_unique_chars:
239
+ continue
240
+
241
+ # Check if the line is entirely composed of noise phrases.
242
+ # Remove all noise phrase occurrences and see if anything meaningful remains.
243
+ remaining = content
244
+ for noise in noise_phrases:
245
+ remaining = remaining.replace(noise, '')
246
+
247
+ # If nothing meaningful remains after removing noise, skip this line
248
+ if len(remaining.strip()) < min_unique_chars:
249
+ continue
250
+
251
+ meaningful.append(line)
252
+
253
+ return meaningful, noise_phrases
254
+
255
+
256
  # ===== EMBEDDING MODELS REGISTRY =====
257
 
258
  EMBEDDING_MODELS = {
 
511
 
512
  # ===== HELPER FUNCTIONS =====
513
 
514
+ def _repair_truncated_json(text: str) -> str:
515
+ """
516
+ Attempt to repair truncated JSON by closing open brackets/strings.
517
+
518
+ Handles cases where max_tokens cuts off the response mid-JSON,
519
+ e.g. a string never closed, an array never closed, etc.
520
+
521
+ Args:
522
+ text: Truncated JSON string
523
+
524
+ Returns:
525
+ Repaired JSON string (best effort)
526
+ """
527
+ in_string = False
528
+ escape_next = False
529
+ stack = [] # tracks open { and [
530
+
531
+ for char in text:
532
+ if escape_next:
533
+ escape_next = False
534
+ continue
535
+ if char == '\\' and in_string:
536
+ escape_next = True
537
+ continue
538
+ if char == '"' and not escape_next:
539
+ in_string = not in_string
540
+ continue
541
+ if in_string:
542
+ continue
543
+ if char in ('{', '['):
544
+ stack.append(char)
545
+ elif char == '}' and stack and stack[-1] == '{':
546
+ stack.pop()
547
+ elif char == ']' and stack and stack[-1] == '[':
548
+ stack.pop()
549
+
550
+ repair = ""
551
+ if in_string:
552
+ repair += '"'
553
+ for opener in reversed(stack):
554
+ if opener == '[':
555
+ repair += ']'
556
+ elif opener == '{':
557
+ repair += '}'
558
+
559
+ return text + repair
560
+
561
+
562
+ def _normalize_item_to_string(item: Any) -> str:
563
+ """
564
+ Normalize an extracted item to a plain string.
565
+
566
+ Models may output items as strings or as dicts with various fields
567
+ (e.g. {"assigned_to": "X", "due_date": "Y"}). This flattens them
568
+ to a single descriptive string.
569
+
570
+ Args:
571
+ item: A string or dict from the extraction JSON
572
+
573
+ Returns:
574
+ A plain string representation
575
+ """
576
+ if isinstance(item, str):
577
+ return item.strip()
578
+
579
+ if isinstance(item, dict):
580
+ parts = []
581
+ for key, value in item.items():
582
+ if value and isinstance(value, str) and value.strip():
583
+ parts.append(f"{key}: {value.strip()}")
584
+ return '; '.join(parts) if parts else str(item)
585
+
586
+ return str(item)
587
+
588
+
589
+ def _normalize_extraction_items(data: Dict[str, list]) -> Dict[str, List[str]]:
590
+ """
591
+ Normalize all extracted items to plain strings.
592
+
593
+ Args:
594
+ data: Parsed extraction dict (values may contain dicts or strings)
595
+
596
+ Returns:
597
+ Dict with all values as lists of strings
598
+ """
599
+ required_keys = {"action_items", "decisions", "key_points", "open_questions"}
600
+ normalized: Dict[str, List[str]] = {}
601
+
602
+ for key in required_keys:
603
+ items = data.get(key, [])
604
+ if not isinstance(items, list):
605
+ normalized[key] = []
606
+ continue
607
+ normalized[key] = [
608
+ s for s in (_normalize_item_to_string(item) for item in items) if s
609
+ ]
610
+
611
+ return normalized
612
+
613
+
614
def _try_parse_extraction_json(
    text: str, log_repair: bool = False
) -> Optional[Dict[str, List[str]]]:
    """
    Parse the extraction JSON emitted by the LLM, tolerating truncation.

    Tries three strategies in order: parse verbatim, parse after
    repairing unclosed strings/brackets (max_tokens cutoff), and parse
    a repaired substring starting at the first '{'. On success the
    items are normalized (dicts flattened to strings).

    Args:
        text: Raw LLM output
        log_repair: If True, log when repair was needed (use only for
            final parse, not streaming chunks)

    Returns:
        Parsed and normalized dict, or None if unrecoverable
    """
    # Strip markdown fences around the JSON payload
    text = re.sub(r'```json\s*', '', text)
    text = re.sub(r'```\s*$', '', text)
    text = text.strip()

    parsed = None

    # Strategy 1: verbatim
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None

    # Strategy 2: repair truncated output and retry
    if parsed is None:
        try:
            parsed = json.loads(_repair_truncated_json(text))
        except json.JSONDecodeError:
            parsed = None
        else:
            if log_repair:
                logger.info("Successfully parsed JSON after repair (output was truncated)")

    # Strategy 3: repair from the first opening brace onward
    if parsed is None:
        brace = re.search(r'\{', text)
        if brace is None:
            return None
        try:
            parsed = json.loads(_repair_truncated_json(text[brace.start():]))
        except json.JSONDecodeError:
            return None
        if log_repair:
            logger.info("Successfully parsed JSON from substring after repair")

    # Schema check: must be a dict holding all four category keys
    required_keys = {"action_items", "decisions", "key_points", "open_questions"}
    if not isinstance(parsed, dict) or not required_keys.issubset(parsed.keys()):
        return None

    # Every category value must be a list
    if any(not isinstance(parsed[key], list) for key in required_keys):
        return None

    # Flatten dict-shaped items into strings
    return _normalize_extraction_items(parsed)
679
+
680
 
681
  def _sample_llm_response(text: str, max_chars: int = 400) -> str:
682
  """Sample LLM response for trace logging."""
 
841
  token_count = 0
842
 
843
  try:
844
+ max_gen_tokens = 1024
845
  settings = model_config["inference_settings"]
846
  stream = extraction_llm.create_chat_completion(
847
  messages=messages,
848
+ max_tokens=max_gen_tokens,
849
  temperature=settings["temperature"],
850
  top_p=settings["top_p"],
851
  top_k=settings["top_k"],
 
886
  # Calculate metrics
887
  elapsed = time.time() - start_time
888
  tps = token_count / elapsed if elapsed > 0 else 0
889
+ eta = int((max_gen_tokens - token_count) / tps) if tps > 0 else 0
890
 
891
  # Get item counts
892
  items_found = {k: len(v) for k, v in partial_items.items()}
 
923
  else:
924
  json_text = full_response
925
 
926
+ final_items = _try_parse_extraction_json(json_text, log_repair=True)
927
 
928
  if not final_items:
929
+ # Graceful degradation: log warning but don't crash the pipeline.
930
+ # Other windows may still succeed and produce useful data.
931
  error_msg = f"Failed to parse JSON from window {window_id}"
932
  debug_output = f"{error_msg}\n\nRaw LLM output:\n{full_response[:1000]}\n"
933
+ logger.warning(debug_output)
934
  print(f"\n{'='*80}\n{debug_output}{'='*80}\n", flush=True)
935
  tracer.log_extraction(
936
  window_id=window_id,
 
938
  llm_response=_sample_llm_response(full_response),
939
  error=error_msg
940
  )
941
+ # Yield empty result instead of crashing
942
+ empty_items = {
943
+ "action_items": [], "decisions": [],
944
+ "key_points": [], "open_questions": []
945
+ }
946
+ ticker = format_progress_ticker(
947
+ current_window=window_id,
948
+ total_windows=total_windows,
949
+ window_tokens=window.token_count,
950
+ max_tokens=4096,
951
+ items_found={k: 0 for k in empty_items},
952
+ tokens_per_sec=0,
953
+ eta_seconds=0,
954
+ current_snippet=f"⚠️ Window {window_id} parse failed, continuing..."
955
+ )
956
+ yield (ticker, thinking_content, empty_items, True)
957
+ return
958
 
959
  # Log success
960
  tracer.log_extraction(
 
965
  error=None
966
  )
967
 
968
+ # Log detailed extraction info for debugging
969
+ json_repaired = False
970
+ parse_attempts = 1
971
+
972
+ # Check if the JSON was repaired by examining the parse function
973
+ # This is a heuristic - the actual parse_attempts would be tracked inside _try_parse_extraction_json
974
+ try:
975
+ json.loads(full_response)
976
+ except json.JSONDecodeError:
977
+ json_repaired = True
978
+ parse_attempts = 2
979
+
980
+ tracer.log_extraction_detail(
981
+ window_id=window_id,
982
+ extracted_items=final_items,
983
+ full_llm_response=full_response,
984
+ full_thinking=thinking_content,
985
+ json_repaired=json_repaired,
986
+ parse_attempts=parse_attempts
987
+ )
988
+
989
  # Final ticker
990
  elapsed = time.time() - start_time
991
  tps = token_count / elapsed if elapsed > 0 else 0
 
1047
  emb = embedding_model.embed(item)
1048
  embeddings.append(emb)
1049
 
1050
+ # Mark duplicates and track duplicate groups
1051
  keep_indices = []
1052
+ duplicate_groups = []
1053
+
1054
  for i in range(len(items)):
1055
  is_duplicate = False
1056
+ duplicate_of_idx = -1
1057
+ similarity_score = 0.0
1058
 
1059
  # Compare with all previously kept items
1060
  for j in keep_indices:
1061
  similarity = cosine_similarity(embeddings[i], embeddings[j])
1062
  if similarity >= similarity_threshold:
1063
  is_duplicate = True
1064
+ duplicate_of_idx = j
1065
+ similarity_score = similarity
1066
  break
1067
 
1068
  if not is_duplicate:
1069
  keep_indices.append(i)
1070
+ else:
1071
+ # Record duplicate group for debugging
1072
+ duplicate_groups.append({
1073
+ "duplicate_item": items[i],
1074
+ "duplicate_index": i,
1075
+ "kept_item": items[duplicate_of_idx],
1076
+ "kept_index": duplicate_of_idx,
1077
+ "similarity": round(similarity_score, 3),
1078
+ })
1079
 
1080
  # Keep only unique items
1081
  unique_items = [items[i] for i in keep_indices]
1082
  deduplicated[category] = unique_items
1083
 
1084
+ # Log deduplication with full details
1085
  duplicates_removed = original_count - len(unique_items)
1086
  tracer.log_deduplication(
1087
  category=category,
 
1089
  deduplicated_count=len(unique_items),
1090
  duplicates_removed=duplicates_removed,
1091
  similarity_threshold=similarity_threshold,
1092
+ embedding_model=embedding_model.model_key,
1093
+ original_items=items,
1094
+ deduplicated_items=unique_items,
1095
+ duplicate_groups=duplicate_groups
1096
  )
1097
 
1098
  logger.info(f"Dedup {category}: {original_count} → {len(unique_items)} ({duplicates_removed} removed)")
 
1193
  else:
1194
  summary_text = full_summary
1195
 
1196
+ # Log synthesis with full details
1197
  tracer.log_synthesis(
1198
  synthesis_model=model_config["name"],
1199
  input_item_counts=item_counts,
1200
  output_summary=_sample_llm_response(summary_text),
1201
  thinking=_sample_llm_response(thinking_content) if thinking_content else None,
1202
+ error=None,
1203
+ input_items=deduplicated_items,
1204
+ system_prompt=system_prompt,
1205
+ user_prompt=user_prompt
1206
  )
1207
 
1208
+ # Also store full outputs in synthesis_details directly
1209
+ tracer.synthesis_details["full_output_summary"] = summary_text
1210
+ tracer.synthesis_details["full_thinking"] = thinking_content
1211
+
1212
  yield (summary_text, thinking_content, True)
1213
 
1214
  except Exception as e:
 
1217
  input_item_counts=item_counts,
1218
  output_summary="",
1219
  thinking=None,
1220
+ error=str(e),
1221
+ input_items=deduplicated_items,
1222
+ system_prompt=system_prompt,
1223
+ user_prompt=user_prompt
1224
  )
1225
  raise
meeting_summarizer/trace.py CHANGED
@@ -27,6 +27,11 @@ class Tracer:
27
  self.enabled = enabled
28
  self.trace_entries: List[Dict[str, Any]] = []
29
  self.start_time = time.time()
 
 
 
 
 
30
 
31
  def log_extraction(
32
  self,
@@ -71,7 +76,10 @@ class Tracer:
71
  deduplicated_count: int,
72
  duplicates_removed: int,
73
  similarity_threshold: float,
74
- embedding_model: str
 
 
 
75
  ) -> None:
76
  """
77
  Log deduplication operation for a category.
@@ -83,6 +91,9 @@ class Tracer:
83
  duplicates_removed: Number of duplicates removed
84
  similarity_threshold: Similarity threshold used
85
  embedding_model: Embedding model used
 
 
 
86
  """
87
  if not self.enabled:
88
  return
@@ -101,6 +112,13 @@ class Tracer:
101
  }
102
 
103
  self.trace_entries.append(entry)
 
 
 
 
 
 
 
104
  logger.debug(f"[Trace] Deduplication {category}: {original_count} → {deduplicated_count} ({duplicates_removed} removed)")
105
 
106
  def log_synthesis(
@@ -109,7 +127,10 @@ class Tracer:
109
  input_item_counts: Dict[str, int],
110
  output_summary: str,
111
  thinking: Optional[str] = None,
112
- error: Optional[str] = None
 
 
 
113
  ) -> None:
114
  """
115
  Log synthesis operation.
@@ -120,6 +141,9 @@ class Tracer:
120
  output_summary: Generated summary (sampled)
121
  thinking: Thinking/reasoning content (sampled, if applicable)
122
  error: Error message if synthesis failed
 
 
 
123
  """
124
  if not self.enabled:
125
  return
@@ -137,6 +161,15 @@ class Tracer:
137
  }
138
 
139
  self.trace_entries.append(entry)
 
 
 
 
 
 
 
 
 
140
  logger.debug(f"[Trace] Synthesis: {entry['success']}")
141
 
142
  def get_trace_jsonl(self) -> str:
@@ -163,6 +196,53 @@ class Tracer:
163
 
164
  return self.trace_entries
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  def get_summary_stats(self) -> Dict[str, Any]:
167
  """
168
  Get summary statistics from trace.
@@ -195,3 +275,103 @@ class Tracer:
195
  "synthesis_success": synthesis_entries[0]["success"] if synthesis_entries else False,
196
  "total_elapsed_seconds": round(time.time() - self.start_time, 2),
197
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  self.enabled = enabled
28
  self.trace_entries: List[Dict[str, Any]] = []
29
  self.start_time = time.time()
30
+ self.preprocessing_info: Dict[str, Any] = {}
31
+ self.windows_info: List[Dict[str, Any]] = []
32
+ self.extraction_details: Dict[int, Dict[str, Any]] = {}
33
+ self.deduplication_details: Dict[str, Dict[str, Any]] = {}
34
+ self.synthesis_details: Dict[str, Any] = {}
35
 
36
  def log_extraction(
37
  self,
 
76
  deduplicated_count: int,
77
  duplicates_removed: int,
78
  similarity_threshold: float,
79
+ embedding_model: str,
80
+ original_items: Optional[List[str]] = None,
81
+ deduplicated_items: Optional[List[str]] = None,
82
+ duplicate_groups: Optional[List[Dict[str, Any]]] = None
83
  ) -> None:
84
  """
85
  Log deduplication operation for a category.
 
91
  duplicates_removed: Number of duplicates removed
92
  similarity_threshold: Similarity threshold used
93
  embedding_model: Embedding model used
94
+ original_items: Original items list (full)
95
+ deduplicated_items: Deduplicated items list (full)
96
+ duplicate_groups: List of duplicate groups with similarity scores
97
  """
98
  if not self.enabled:
99
  return
 
112
  }
113
 
114
  self.trace_entries.append(entry)
115
+
116
+ self.deduplication_details[category] = {
117
+ "original_items": original_items or [],
118
+ "deduplicated_items": deduplicated_items or [],
119
+ "duplicate_groups": duplicate_groups or [],
120
+ }
121
+
122
  logger.debug(f"[Trace] Deduplication {category}: {original_count} → {deduplicated_count} ({duplicates_removed} removed)")
123
 
124
  def log_synthesis(
 
127
  input_item_counts: Dict[str, int],
128
  output_summary: str,
129
  thinking: Optional[str] = None,
130
+ error: Optional[str] = None,
131
+ input_items: Optional[Dict[str, List[str]]] = None,
132
+ system_prompt: Optional[str] = None,
133
+ user_prompt: Optional[str] = None
134
  ) -> None:
135
  """
136
  Log synthesis operation.
 
141
  output_summary: Generated summary (sampled)
142
  thinking: Thinking/reasoning content (sampled, if applicable)
143
  error: Error message if synthesis failed
144
+ input_items: Full input items dict
145
+ system_prompt: System prompt used
146
+ user_prompt: User prompt used
147
  """
148
  if not self.enabled:
149
  return
 
161
  }
162
 
163
  self.trace_entries.append(entry)
164
+
165
+ self.synthesis_details = {
166
+ "input_items": input_items or {},
167
+ "system_prompt": system_prompt or "",
168
+ "user_prompt": user_prompt or "",
169
+ "full_output_summary": output_summary or "",
170
+ "full_thinking": thinking or "",
171
+ }
172
+
173
  logger.debug(f"[Trace] Synthesis: {entry['success']}")
174
 
175
  def get_trace_jsonl(self) -> str:
 
196
 
197
  return self.trace_entries
198
 
199
+ def get_debug_json(self) -> Dict[str, Any]:
200
+ """
201
+ Get full debug information including detailed logs from all stages.
202
+
203
+ Returns:
204
+ Dict with rich debug information for all 3 stages
205
+ """
206
+ if not self.enabled:
207
+ return {}
208
+
209
+ return {
210
+ "preprocessing": self.preprocessing_info,
211
+ "windows": self.windows_info,
212
+ "extraction": {
213
+ "details": self.extraction_details,
214
+ "summary": {
215
+ "total_windows": len(self.windows_info),
216
+ "total_items": sum(
217
+ sum(d["item_counts"].values())
218
+ for d in self.extraction_details.values()
219
+ ),
220
+ "windows_with_repaired_json": sum(
221
+ 1 for d in self.extraction_details.values()
222
+ if d.get("json_repaired", False)
223
+ ),
224
+ }
225
+ },
226
+ "deduplication": {
227
+ "details": self.deduplication_details,
228
+ "summary": {
229
+ "total_original_items": sum(
230
+ len(d.get("original_items", []))
231
+ for d in self.deduplication_details.values()
232
+ ),
233
+ "total_deduplicated_items": sum(
234
+ len(d.get("deduplicated_items", []))
235
+ for d in self.deduplication_details.values()
236
+ ),
237
+ "total_duplicates_removed": sum(
238
+ len(d.get("original_items", [])) - len(d.get("deduplicated_items", []))
239
+ for d in self.deduplication_details.values()
240
+ ),
241
+ }
242
+ },
243
+ "synthesis": self.synthesis_details,
244
+ }
245
+
246
  def get_summary_stats(self) -> Dict[str, Any]:
247
  """
248
  Get summary statistics from trace.
 
275
  "synthesis_success": synthesis_entries[0]["success"] if synthesis_entries else False,
276
  "total_elapsed_seconds": round(time.time() - self.start_time, 2),
277
  }
278
+
279
+ def log_preprocessing(
280
+ self,
281
+ original_line_count: int,
282
+ cleaned_line_count: int,
283
+ original_char_count: int,
284
+ cleaned_char_count: int,
285
+ noise_phrases_removed: List[str],
286
+ detection_method: str = "segment_level"
287
+ ) -> None:
288
+ """
289
+ Log transcript preprocessing information.
290
+
291
+ Args:
292
+ original_line_count: Number of lines before preprocessing
293
+ cleaned_line_count: Number of lines after preprocessing
294
+ original_char_count: Character count before preprocessing
295
+ cleaned_char_count: Character count after preprocessing
296
+ noise_phrases_removed: List of noise phrases detected and removed
297
+ detection_method: Method used for noise detection
298
+ """
299
+ if not self.enabled:
300
+ return
301
+
302
+ self.preprocessing_info = {
303
+ "original_line_count": original_line_count,
304
+ "cleaned_line_count": cleaned_line_count,
305
+ "original_char_count": original_char_count,
306
+ "cleaned_char_count": cleaned_char_count,
307
+ "lines_removed": original_line_count - cleaned_line_count,
308
+ "chars_removed": original_char_count - cleaned_char_count,
309
+ "line_reduction_pct": round((1 - cleaned_line_count / original_line_count) * 100, 1) if original_line_count > 0 else 0.0,
310
+ "char_reduction_pct": round((1 - cleaned_char_count / original_char_count) * 100, 1) if original_char_count > 0 else 0.0,
311
+ "noise_phrases_removed": noise_phrases_removed,
312
+ "detection_method": detection_method,
313
+ }
314
+
315
+ logger.debug(f"[Trace] Preprocessing: {original_line_count} → {cleaned_line_count} lines ({self.preprocessing_info['line_reduction_pct']}% reduction)")
316
+
317
+ def log_window(
318
+ self,
319
+ window_id: int,
320
+ content: str,
321
+ token_count: int,
322
+ start_turn: int,
323
+ end_turn: int
324
+ ) -> None:
325
+ """
326
+ Log window information.
327
+
328
+ Args:
329
+ window_id: Window identifier
330
+ content: Window content (may be truncated for storage)
331
+ token_count: Number of tokens in window
332
+ start_turn: Starting line number
333
+ end_turn: Ending line number
334
+ """
335
+ if not self.enabled:
336
+ return
337
+
338
+ self.windows_info.append({
339
+ "window_id": window_id,
340
+ "content": content,
341
+ "token_count": token_count,
342
+ "start_turn": start_turn,
343
+ "end_turn": end_turn,
344
+ "line_count": end_turn - start_turn + 1,
345
+ })
346
+
347
+ def log_extraction_detail(
348
+ self,
349
+ window_id: int,
350
+ extracted_items: Dict[str, List[str]],
351
+ full_llm_response: str,
352
+ full_thinking: Optional[str],
353
+ json_repaired: bool,
354
+ parse_attempts: int
355
+ ) -> None:
356
+ """
357
+ Log detailed extraction information for a window.
358
+
359
+ Args:
360
+ window_id: Window identifier
361
+ extracted_items: Full extracted items dict
362
+ full_llm_response: Complete LLM response
363
+ full_thinking: Complete thinking content (if any)
364
+ json_repaired: Whether JSON was repaired during parsing
365
+ parse_attempts: Number of parse attempts required
366
+ """
367
+ if not self.enabled:
368
+ return
369
+
370
+ self.extraction_details[window_id] = {
371
+ "extracted_items": extracted_items,
372
+ "full_llm_response": full_llm_response,
373
+ "full_thinking": full_thinking,
374
+ "json_repaired": json_repaired,
375
+ "parse_attempts": parse_attempts,
376
+ "item_counts": {k: len(v) for k, v in extracted_items.items()},
377
+ }