Luigi committed on
Commit
7c9ccb7
·
1 Parent(s): 27363be

feat: enrich JSON export with rich debug info for all 3 pipeline stages

Browse files

Add debug_info section to Advanced Mode JSON export with:
- Preprocessing: stats and noise phrases removed
- Extraction: per-window full LLM responses and parse details
- Deduplication: duplicate groups with similarity scores
- Synthesis: full input items and prompts

Enables debugging pipeline failures causing bad final summaries.

Files changed (3) hide show
  1. app.py +50 -13
  2. meeting_summarizer/extraction.py +461 -30
  3. meeting_summarizer/trace.py +182 -2
app.py CHANGED
@@ -1453,7 +1453,7 @@ def summarize_advanced(
1453
  """
1454
  from meeting_summarizer.trace import Tracer
1455
  from meeting_summarizer.extraction import (
1456
- EmbeddingModel, Window,
1457
  stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
1458
  )
1459
 
@@ -1482,8 +1482,23 @@ def summarize_advanced(
1482
  """Count tokens using the extraction model's tokenizer."""
1483
  return len(extraction_llm.tokenize(text.encode('utf-8')))
1484
 
1485
- # Create windows from transcript (simple split by turns for now)
1486
- # In production, this would be more sophisticated
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1487
  lines = [l.strip() for l in transcript.split('\n') if l.strip()]
1488
 
1489
  # Reserve tokens for system prompt (~200) and output (~1024)
@@ -1508,6 +1523,14 @@ def summarize_advanced(
1508
  end_turn=line_num - 1,
1509
  token_count=current_tokens
1510
  ))
 
 
 
 
 
 
 
 
1511
  window_id += 1
1512
 
1513
  # Start new window with overlap
@@ -1528,6 +1551,14 @@ def summarize_advanced(
1528
  end_turn=len(lines) - 1,
1529
  token_count=current_tokens
1530
  ))
 
 
 
 
 
 
 
 
1531
 
1532
  total_windows = len(windows)
1533
  yield {"stage": "extraction", "ticker": f"Created {total_windows} windows", "thinking": "", "summary": ""}
@@ -1640,6 +1671,7 @@ def summarize_advanced(
1640
 
1641
  # Get trace stats and add model names for download JSON
1642
  trace_stats = tracer.get_summary_stats()
 
1643
  ext_config = get_model_config(extraction_model_key, "extraction")
1644
  syn_config = get_model_config(synthesis_model_key, "synthesis")
1645
  trace_stats["extraction_model"] = ext_config.get("name", extraction_model_key)
@@ -1652,7 +1684,8 @@ def summarize_advanced(
1652
  "thinking": final_thinking,
1653
  "summary": final_summary,
1654
  "trace_stats": trace_stats,
1655
- "trace_json": tracer.get_trace_json()
 
1656
  }
1657
 
1658
  except Exception as e:
@@ -1685,6 +1718,8 @@ def download_summary_json(summary, thinking, model_key, language, metrics):
1685
  if is_advanced:
1686
  # Advanced Mode: embed trace data and use pipeline model names
1687
  trace_stats = metrics.get("trace_stats", {})
 
 
1688
  data = {
1689
  "metadata": {
1690
  "generated_at": datetime.now().isoformat(),
@@ -1707,6 +1742,7 @@ def download_summary_json(summary, thinking, model_key, language, metrics):
1707
  "synthesis_success": trace_stats.get("synthesis_success", False),
1708
  "total_elapsed_seconds": trace_stats.get("total_elapsed_seconds", 0),
1709
  },
 
1710
  "trace": metrics.get("trace_json", [])
1711
  }
1712
  else:
@@ -3187,8 +3223,8 @@ def create_interface():
3187
  supports_toggle = config.get("supports_toggle", False)
3188
 
3189
  if supports_toggle:
3190
- # Hybrid model
3191
- return gr.update(visible=True, value=False, interactive=True, label="🧠 Enable Reasoning for Extraction")
3192
  elif config.get("supports_reasoning", False):
3193
  # Thinking-only model (none currently in extraction)
3194
  return gr.update(visible=True, value=True, interactive=False, label="🧠 Reasoning Mode (Always On)")
@@ -3478,17 +3514,18 @@ def create_interface():
3478
 
3479
  # Format info message
3480
  info_msg = f"""**Advanced Mode Complete**
3481
- - Total Windows: {trace_stats.get('total_windows', 0)}
3482
- - Items Extracted: {trace_stats.get('total_items_extracted', 0)}
3483
- - Items After Dedup: {trace_stats.get('total_items_after_dedup', 0)}
3484
- - Duplicates Removed: {trace_stats.get('total_duplicates_removed', 0)}
3485
- - Total Time: {trace_stats.get('total_elapsed_seconds', 0):.1f}s"""
3486
 
3487
- # Store trace for download
3488
  metrics = {
3489
  "mode": "advanced",
3490
  "trace_stats": trace_stats,
3491
- "trace_json": update.get("trace_json", [])
 
3492
  }
3493
 
3494
  yield (thinking, summary, info_msg, metrics, "Advanced Mode (3-Model Pipeline)")
 
1453
  """
1454
  from meeting_summarizer.trace import Tracer
1455
  from meeting_summarizer.extraction import (
1456
+ EmbeddingModel, Window, preprocess_transcript,
1457
  stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
1458
  )
1459
 
 
1482
  """Count tokens using the extraction model's tokenizer."""
1483
  return len(extraction_llm.tokenize(text.encode('utf-8')))
1484
 
1485
+ # Preprocess transcript: strip CSV format, remove noise/repetition
1486
+ raw_line_count = len(transcript.split('\n'))
1487
+ raw_char_count = len(transcript)
1488
+ transcript, noise_phrases = preprocess_transcript(transcript)
1489
+ cleaned_line_count = len(transcript.split('\n'))
1490
+ cleaned_char_count = len(transcript)
1491
+
1492
+ # Log preprocessing info to tracer
1493
+ tracer.log_preprocessing(
1494
+ original_line_count=raw_line_count,
1495
+ cleaned_line_count=cleaned_line_count,
1496
+ original_char_count=raw_char_count,
1497
+ cleaned_char_count=cleaned_char_count,
1498
+ noise_phrases_removed=noise_phrases
1499
+ )
1500
+
1501
+ # Create windows from preprocessed transcript
1502
  lines = [l.strip() for l in transcript.split('\n') if l.strip()]
1503
 
1504
  # Reserve tokens for system prompt (~200) and output (~1024)
 
1523
  end_turn=line_num - 1,
1524
  token_count=current_tokens
1525
  ))
1526
+ # Log window to tracer for debugging
1527
+ tracer.log_window(
1528
+ window_id=window_id,
1529
+ content=window_content,
1530
+ token_count=current_tokens,
1531
+ start_turn=line_num - len(current_window),
1532
+ end_turn=line_num - 1
1533
+ )
1534
  window_id += 1
1535
 
1536
  # Start new window with overlap
 
1551
  end_turn=len(lines) - 1,
1552
  token_count=current_tokens
1553
  ))
1554
+ # Log window to tracer for debugging
1555
+ tracer.log_window(
1556
+ window_id=window_id,
1557
+ content=window_content,
1558
+ token_count=current_tokens,
1559
+ start_turn=len(lines) - len(current_window),
1560
+ end_turn=len(lines) - 1
1561
+ )
1562
 
1563
  total_windows = len(windows)
1564
  yield {"stage": "extraction", "ticker": f"Created {total_windows} windows", "thinking": "", "summary": ""}
 
1671
 
1672
  # Get trace stats and add model names for download JSON
1673
  trace_stats = tracer.get_summary_stats()
1674
+ debug_json = tracer.get_debug_json()
1675
  ext_config = get_model_config(extraction_model_key, "extraction")
1676
  syn_config = get_model_config(synthesis_model_key, "synthesis")
1677
  trace_stats["extraction_model"] = ext_config.get("name", extraction_model_key)
 
1684
  "thinking": final_thinking,
1685
  "summary": final_summary,
1686
  "trace_stats": trace_stats,
1687
+ "trace_json": tracer.get_trace_json(),
1688
+ "debug_json": debug_json
1689
  }
1690
 
1691
  except Exception as e:
 
1718
  if is_advanced:
1719
  # Advanced Mode: embed trace data and use pipeline model names
1720
  trace_stats = metrics.get("trace_stats", {})
1721
+ debug_info = metrics.get("debug_json", {})
1722
+
1723
  data = {
1724
  "metadata": {
1725
  "generated_at": datetime.now().isoformat(),
 
1742
  "synthesis_success": trace_stats.get("synthesis_success", False),
1743
  "total_elapsed_seconds": trace_stats.get("total_elapsed_seconds", 0),
1744
  },
1745
+ "debug_info": debug_info,
1746
  "trace": metrics.get("trace_json", [])
1747
  }
1748
  else:
 
3223
  supports_toggle = config.get("supports_toggle", False)
3224
 
3225
  if supports_toggle:
3226
+ # Hybrid model — default reasoning ON for better extraction quality
3227
+ return gr.update(visible=True, value=True, interactive=True, label="🧠 Enable Reasoning for Extraction")
3228
  elif config.get("supports_reasoning", False):
3229
  # Thinking-only model (none currently in extraction)
3230
  return gr.update(visible=True, value=True, interactive=False, label="🧠 Reasoning Mode (Always On)")
 
3514
 
3515
  # Format info message
3516
  info_msg = f"""**Advanced Mode Complete**
3517
+ - Total Windows: {trace_stats.get('total_windows', 0)}
3518
+ - Items Extracted: {trace_stats.get('total_items_extracted', 0)}
3519
+ - Items After Dedup: {trace_stats.get('total_items_after_dedup', 0)}
3520
+ - Duplicates Removed: {trace_stats.get('total_duplicates_removed', 0)}
3521
+ - Total Time: {trace_stats.get('total_elapsed_seconds', 0):.1f}s"""
3522
 
3523
+ # Store trace and debug info for download
3524
  metrics = {
3525
  "mode": "advanced",
3526
  "trace_stats": trace_stats,
3527
+ "trace_json": update.get("trace_json", []),
3528
+ "debug_json": update.get("debug_json", {})
3529
  }
3530
 
3531
  yield (thinking, summary, info_msg, metrics, "Advanced Mode (3-Model Pipeline)")
meeting_summarizer/extraction.py CHANGED
@@ -2,6 +2,7 @@
2
  Advanced Extraction Pipeline
3
 
4
  Provides:
 
5
  1. EMBEDDING_MODELS registry (4 models for deduplication)
6
  2. NativeTokenizer - Count tokens without llama.cpp
7
  3. EmbeddingModel - Load/compute embeddings
@@ -11,11 +12,13 @@ Provides:
11
  7. stream_synthesize_executive_summary - Stage 3: Synthesis
12
  """
13
 
 
 
14
  import re
15
  import json
16
  import time
17
  import logging
18
- from typing import Dict, List, Any, Tuple, Generator, Optional
19
  from dataclasses import dataclass
20
  import numpy as np
21
  from llama_cpp import Llama
@@ -23,6 +26,233 @@ from llama_cpp import Llama
23
  logger = logging.getLogger(__name__)
24
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  # ===== EMBEDDING MODELS REGISTRY =====
27
 
28
  EMBEDDING_MODELS = {
@@ -281,39 +511,172 @@ def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
281
 
282
  # ===== HELPER FUNCTIONS =====
283
 
284
- def _try_parse_extraction_json(text: str) -> Optional[Dict[str, List[str]]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  """
286
  Attempt to parse extraction JSON from LLM output.
287
-
 
 
 
288
  Args:
289
  text: Raw LLM output
290
-
 
 
291
  Returns:
292
- Parsed dict or None if invalid
293
  """
294
  # Remove markdown code blocks
295
  text = re.sub(r'```json\s*', '', text)
296
  text = re.sub(r'```\s*$', '', text)
297
  text = text.strip()
298
-
 
 
299
  try:
300
  data = json.loads(text)
301
-
302
- # Validate schema
303
- required_keys = {"action_items", "decisions", "key_points", "open_questions"}
304
- if not isinstance(data, dict) or not required_keys.issubset(data.keys()):
305
- return None
306
-
307
- # Validate all values are lists
308
- for key in required_keys:
309
- if not isinstance(data[key], list):
310
- return None
311
-
312
- return data
313
-
314
  except json.JSONDecodeError:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  return None
316
 
 
 
 
 
 
 
 
 
317
 
318
  def _sample_llm_response(text: str, max_chars: int = 400) -> str:
319
  """Sample LLM response for trace logging."""
@@ -478,10 +841,11 @@ def stream_extract_from_window(
478
  token_count = 0
479
 
480
  try:
 
481
  settings = model_config["inference_settings"]
482
  stream = extraction_llm.create_chat_completion(
483
  messages=messages,
484
- max_tokens=512,
485
  temperature=settings["temperature"],
486
  top_p=settings["top_p"],
487
  top_k=settings["top_k"],
@@ -522,7 +886,7 @@ def stream_extract_from_window(
522
  # Calculate metrics
523
  elapsed = time.time() - start_time
524
  tps = token_count / elapsed if elapsed > 0 else 0
525
- eta = int((1024 - token_count) / tps) if tps > 0 else 0
526
 
527
  # Get item counts
528
  items_found = {k: len(v) for k, v in partial_items.items()}
@@ -559,12 +923,14 @@ def stream_extract_from_window(
559
  else:
560
  json_text = full_response
561
 
562
- final_items = _try_parse_extraction_json(json_text)
563
 
564
  if not final_items:
 
 
565
  error_msg = f"Failed to parse JSON from window {window_id}"
566
  debug_output = f"{error_msg}\n\nRaw LLM output:\n{full_response[:1000]}\n"
567
- logger.error(debug_output)
568
  print(f"\n{'='*80}\n{debug_output}{'='*80}\n", flush=True)
569
  tracer.log_extraction(
570
  window_id=window_id,
@@ -572,7 +938,23 @@ def stream_extract_from_window(
572
  llm_response=_sample_llm_response(full_response),
573
  error=error_msg
574
  )
575
- raise ValueError(error_msg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
 
577
  # Log success
578
  tracer.log_extraction(
@@ -583,6 +965,27 @@ def stream_extract_from_window(
583
  error=None
584
  )
585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
  # Final ticker
587
  elapsed = time.time() - start_time
588
  tps = token_count / elapsed if elapsed > 0 else 0
@@ -644,26 +1047,41 @@ def deduplicate_items(
644
  emb = embedding_model.embed(item)
645
  embeddings.append(emb)
646
 
647
- # Mark duplicates
648
  keep_indices = []
 
 
649
  for i in range(len(items)):
650
  is_duplicate = False
 
 
651
 
652
  # Compare with all previously kept items
653
  for j in keep_indices:
654
  similarity = cosine_similarity(embeddings[i], embeddings[j])
655
  if similarity >= similarity_threshold:
656
  is_duplicate = True
 
 
657
  break
658
 
659
  if not is_duplicate:
660
  keep_indices.append(i)
 
 
 
 
 
 
 
 
 
661
 
662
  # Keep only unique items
663
  unique_items = [items[i] for i in keep_indices]
664
  deduplicated[category] = unique_items
665
 
666
- # Log deduplication
667
  duplicates_removed = original_count - len(unique_items)
668
  tracer.log_deduplication(
669
  category=category,
@@ -671,7 +1089,10 @@ def deduplicate_items(
671
  deduplicated_count=len(unique_items),
672
  duplicates_removed=duplicates_removed,
673
  similarity_threshold=similarity_threshold,
674
- embedding_model=embedding_model.model_key
 
 
 
675
  )
676
 
677
  logger.info(f"Dedup {category}: {original_count} → {len(unique_items)} ({duplicates_removed} removed)")
@@ -772,15 +1193,22 @@ def stream_synthesize_executive_summary(
772
  else:
773
  summary_text = full_summary
774
 
775
- # Log synthesis
776
  tracer.log_synthesis(
777
  synthesis_model=model_config["name"],
778
  input_item_counts=item_counts,
779
  output_summary=_sample_llm_response(summary_text),
780
  thinking=_sample_llm_response(thinking_content) if thinking_content else None,
781
- error=None
 
 
 
782
  )
783
 
 
 
 
 
784
  yield (summary_text, thinking_content, True)
785
 
786
  except Exception as e:
@@ -789,6 +1217,9 @@ def stream_synthesize_executive_summary(
789
  input_item_counts=item_counts,
790
  output_summary="",
791
  thinking=None,
792
- error=str(e)
 
 
 
793
  )
794
  raise
 
2
  Advanced Extraction Pipeline
3
 
4
  Provides:
5
+ 0. preprocess_transcript - Clean noisy CSV transcripts before extraction
6
  1. EMBEDDING_MODELS registry (4 models for deduplication)
7
  2. NativeTokenizer - Count tokens without llama.cpp
8
  3. EmbeddingModel - Load/compute embeddings
 
12
  7. stream_synthesize_executive_summary - Stage 3: Synthesis
13
  """
14
 
15
+ import csv
16
+ import io
17
  import re
18
  import json
19
  import time
20
  import logging
21
+ from typing import Dict, List, Any, Tuple, Generator, Optional, Set
22
  from dataclasses import dataclass
23
  import numpy as np
24
  from llama_cpp import Llama
 
26
  logger = logging.getLogger(__name__)
27
 
28
 
29
+ # ===== TRANSCRIPT PREPROCESSING =====
30
+
31
def preprocess_transcript(transcript_text: str) -> Tuple[str, List[str]]:
    """
    Clean a noisy raw transcript before the extraction stage.

    Pipeline: (1) detect CSV exports and pull out speaker-labelled
    dialogue, (2) drop consecutive duplicate lines, (3) collapse
    phrases repeated within a line, (4) remove lines that consist
    only of noise (ASR hallucination loops).

    Args:
        transcript_text: Raw transcript (CSV or plain text)

    Returns:
        Tuple of (cleaned_dialogue_text, noise_phrases_list)
        - cleaned_dialogue_text: Cleaned dialogue text with speaker labels
        - noise_phrases_list: List of noise phrases detected and removed
    """
    source_lines = transcript_text.strip().split('\n')
    if not source_lines:
        return "", []

    # Stage 1: CSV detection / dialogue extraction
    dialogue = _extract_dialogue_from_csv(source_lines)

    # Stage 2: drop exact consecutive repeats
    without_dupes = _collapse_consecutive_duplicates(dialogue)

    # Stage 3: collapse phrase repetition inside each line, keeping
    # only lines that still have content afterwards
    collapsed = [
        c for c in (_collapse_repeated_phrases(l) for l in without_dupes) if c
    ]

    # Stage 4: remove pure-noise lines
    kept_lines, noise_phrases = _filter_noise_lines(collapsed)

    cleaned = '\n'.join(kept_lines)
    stripped_original = transcript_text.strip()
    if cleaned != stripped_original:
        before = len(stripped_original)
        after = len(cleaned)
        pct = ((before - after) / before * 100) if before > 0 else 0
        logger.info(
            f"Transcript preprocessed: {before} → {after} chars "
            f"({pct:.0f}% reduction, {len(kept_lines)} lines)"
        )

    return cleaned, list(noise_phrases)
81
+
82
+
83
+ def _extract_dialogue_from_csv(lines: List[str]) -> List[str]:
84
+ """
85
+ Detect CSV format and extract speaker-prefixed dialogue lines.
86
+
87
+ If the first line looks like a CSV header (start,end,speaker,text),
88
+ parse as CSV and return 'SPEAKER_XX: text' lines.
89
+ Otherwise return lines as-is.
90
+ """
91
+ # Check for CSV header
92
+ first_line = lines[0].strip().lower()
93
+ is_csv = first_line.startswith('start,end,speaker,text') or (
94
+ ',' in first_line and any(
95
+ kw in first_line for kw in ['speaker', 'start', 'text']
96
+ )
97
+ )
98
+
99
+ if not is_csv:
100
+ return [l.strip() for l in lines if l.strip()]
101
+
102
+ # Parse CSV, skipping header
103
+ dialogue = []
104
+ csv_text = '\n'.join(lines)
105
+ reader = csv.reader(io.StringIO(csv_text))
106
+
107
+ for i, row in enumerate(reader):
108
+ if i == 0:
109
+ # Skip header row
110
+ continue
111
+ if len(row) >= 4:
112
+ speaker = row[2].strip()
113
+ text = row[3].strip().strip('"')
114
+ if text:
115
+ dialogue.append(f"{speaker}: {text}")
116
+ elif len(row) >= 1:
117
+ # Fallback: take whatever text is there
118
+ text = ','.join(row).strip()
119
+ if text:
120
+ dialogue.append(text)
121
+
122
+ return dialogue
123
+
124
+
125
+ def _collapse_consecutive_duplicates(lines: List[str]) -> List[str]:
126
+ """Remove consecutive duplicate lines (exact match)."""
127
+ if not lines:
128
+ return []
129
+
130
+ result = [lines[0]]
131
+ for line in lines[1:]:
132
+ if line != result[-1]:
133
+ result.append(line)
134
+ return result
135
+
136
+
137
+ def _collapse_repeated_phrases(line: str, max_repeats: int = 2) -> str:
138
+ """
139
+ Collapse repeated phrases within a single line.
140
+
141
+ Detects patterns like 'ABC。ABC。ABC。' and reduces to 'ABC。'
142
+ Works with Chinese punctuation boundaries.
143
+ """
144
+ if not line:
145
+ return line
146
+
147
+ # Split by Chinese/standard sentence boundaries
148
+ # Keep the delimiter attached to the preceding segment
149
+ segments = re.split(r'(?<=[。!?;\.\!\?\;])', line)
150
+ segments = [s.strip() for s in segments if s.strip()]
151
+
152
+ if len(segments) <= 1:
153
+ return line
154
+
155
+ # Collapse consecutive identical segments
156
+ deduped = [segments[0]]
157
+ repeat_count = 1
158
+ for seg in segments[1:]:
159
+ if seg == deduped[-1]:
160
+ repeat_count += 1
161
+ if repeat_count <= max_repeats:
162
+ deduped.append(seg)
163
+ else:
164
+ deduped.append(seg)
165
+ repeat_count = 1
166
+
167
+ return ''.join(deduped)
168
+
169
+
170
+ def _filter_noise_lines(
171
+ lines: List[str],
172
+ min_unique_chars: int = 5,
173
+ noise_phrase_threshold: int = 5
174
+ ) -> Tuple[List[str], Set[str]]:
175
+ """
176
+ Filter out lines that are pure noise (ASR hallucination loops).
177
+
178
+ A line is noise if:
179
+ - It has fewer than min_unique_chars unique non-punctuation characters
180
+ - Its content is entirely composed of a single phrase that repeats
181
+ across the transcript more than noise_phrase_threshold times
182
+
183
+ Args:
184
+ lines: Preprocessed dialogue lines
185
+ min_unique_chars: Minimum unique chars to keep a line
186
+ noise_phrase_threshold: A phrase appearing more than this many times
187
+ across the transcript is considered noise
188
+
189
+ Returns:
190
+ Tuple of (filtered_lines, noise_phrases)
191
+ - filtered_lines: Lines that are not pure noise
192
+ - noise_phrases: Set of noise phrases detected
193
+ """
194
+ if not lines:
195
+ return [], set()
196
+
197
+ _punct_re = re.compile(
198
+ r'[\s\u3000\uff0c\u3002\uff01\uff1f\u3001\uff1b\uff1a'
199
+ r'\u201c\u201d\u2018\u2019'
200
+ r'\uff08\uff09()\.,!?;:"\'\s]'
201
+ )
202
+
203
+ def strip_speaker(line: str) -> str:
204
+ return re.sub(r'^SPEAKER_\d+:\s*', '', line)
205
+
206
+ def get_content(text: str) -> str:
207
+ return _punct_re.sub('', text)
208
+
209
+ # Step 1: Split each line into sentence-level segments and count
210
+ # how many times each segment appears across the entire transcript.
211
+ # This catches ASR hallucination like "並且請留意下方的資訊欄" which
212
+ # may repeat within a line and across many lines.
213
+ segment_counts: Dict[str, int] = {}
214
+ for line in lines:
215
+ text = strip_speaker(line)
216
+ # Split on Chinese sentence boundaries
217
+ segments = re.split(r'[。!?;\.\!\?\;]', text)
218
+ seen_in_line: set = set()
219
+ for seg in segments:
220
+ seg_content = get_content(seg)
221
+ if len(seg_content) >= 3 and seg_content not in seen_in_line:
222
+ seen_in_line.add(seg_content)
223
+ segment_counts[seg_content] = segment_counts.get(seg_content, 0) + 1
224
+
225
+ # Step 2: Find noise phrases (segments appearing in too many lines)
226
+ noise_phrases = {
227
+ phrase for phrase, count in segment_counts.items()
228
+ if count >= noise_phrase_threshold
229
+ }
230
+
231
+ # Step 3: For each line, check if it's purely noise
232
+ meaningful = []
233
+ for line in lines:
234
+ text = strip_speaker(line)
235
+ content = get_content(text)
236
+
237
+ # Skip if too few unique characters
238
+ if len(set(content)) < min_unique_chars:
239
+ continue
240
+
241
+ # Check if the line is entirely composed of noise phrases.
242
+ # Remove all noise phrase occurrences and see if anything meaningful remains.
243
+ remaining = content
244
+ for noise in noise_phrases:
245
+ remaining = remaining.replace(noise, '')
246
+
247
+ # If nothing meaningful remains after removing noise, skip this line
248
+ if len(remaining.strip()) < min_unique_chars:
249
+ continue
250
+
251
+ meaningful.append(line)
252
+
253
+ return meaningful, noise_phrases
254
+
255
+
256
  # ===== EMBEDDING MODELS REGISTRY =====
257
 
258
  EMBEDDING_MODELS = {
 
511
 
512
  # ===== HELPER FUNCTIONS =====
513
 
514
+ def _repair_truncated_json(text: str) -> str:
515
+ """
516
+ Attempt to repair truncated JSON by closing open brackets/strings.
517
+
518
+ Handles cases where max_tokens cuts off the response mid-JSON,
519
+ e.g. a string never closed, an array never closed, etc.
520
+
521
+ Args:
522
+ text: Truncated JSON string
523
+
524
+ Returns:
525
+ Repaired JSON string (best effort)
526
+ """
527
+ in_string = False
528
+ escape_next = False
529
+ stack = [] # tracks open { and [
530
+
531
+ for char in text:
532
+ if escape_next:
533
+ escape_next = False
534
+ continue
535
+ if char == '\\' and in_string:
536
+ escape_next = True
537
+ continue
538
+ if char == '"' and not escape_next:
539
+ in_string = not in_string
540
+ continue
541
+ if in_string:
542
+ continue
543
+ if char in ('{', '['):
544
+ stack.append(char)
545
+ elif char == '}' and stack and stack[-1] == '{':
546
+ stack.pop()
547
+ elif char == ']' and stack and stack[-1] == '[':
548
+ stack.pop()
549
+
550
+ repair = ""
551
+ if in_string:
552
+ repair += '"'
553
+ for opener in reversed(stack):
554
+ if opener == '[':
555
+ repair += ']'
556
+ elif opener == '{':
557
+ repair += '}'
558
+
559
+ return text + repair
560
+
561
+
562
+ def _normalize_item_to_string(item: Any) -> str:
563
+ """
564
+ Normalize an extracted item to a plain string.
565
+
566
+ Models may output items as strings or as dicts with various fields
567
+ (e.g. {"assigned_to": "X", "due_date": "Y"}). This flattens them
568
+ to a single descriptive string.
569
+
570
+ Args:
571
+ item: A string or dict from the extraction JSON
572
+
573
+ Returns:
574
+ A plain string representation
575
+ """
576
+ if isinstance(item, str):
577
+ return item.strip()
578
+
579
+ if isinstance(item, dict):
580
+ parts = []
581
+ for key, value in item.items():
582
+ if value and isinstance(value, str) and value.strip():
583
+ parts.append(f"{key}: {value.strip()}")
584
+ return '; '.join(parts) if parts else str(item)
585
+
586
+ return str(item)
587
+
588
+
589
+ def _normalize_extraction_items(data: Dict[str, list]) -> Dict[str, List[str]]:
590
+ """
591
+ Normalize all extracted items to plain strings.
592
+
593
+ Args:
594
+ data: Parsed extraction dict (values may contain dicts or strings)
595
+
596
+ Returns:
597
+ Dict with all values as lists of strings
598
+ """
599
+ required_keys = {"action_items", "decisions", "key_points", "open_questions"}
600
+ normalized: Dict[str, List[str]] = {}
601
+
602
+ for key in required_keys:
603
+ items = data.get(key, [])
604
+ if not isinstance(items, list):
605
+ normalized[key] = []
606
+ continue
607
+ normalized[key] = [
608
+ s for s in (_normalize_item_to_string(item) for item in items) if s
609
+ ]
610
+
611
+ return normalized
612
+
613
+
614
def _try_parse_extraction_json(
    text: str, log_repair: bool = False
) -> Optional[Dict[str, List[str]]]:
    """
    Parse the extraction JSON emitted by the LLM, tolerating truncation.

    Tries three strategies in order: parse verbatim, parse after
    repairing unclosed strings/brackets (max_tokens cutoff), and parse
    a repaired substring starting at the first '{'. On success the
    items are normalized (dicts flattened to strings).

    Args:
        text: Raw LLM output
        log_repair: If True, log when repair was needed (use only for
            final parse, not streaming chunks)

    Returns:
        Parsed and normalized dict, or None if unrecoverable
    """
    # Strip markdown fences around the JSON payload
    text = re.sub(r'```json\s*', '', text)
    text = re.sub(r'```\s*$', '', text)
    text = text.strip()

    parsed = None

    # Strategy 1: verbatim
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None

    # Strategy 2: repair truncated output and retry
    if parsed is None:
        try:
            parsed = json.loads(_repair_truncated_json(text))
        except json.JSONDecodeError:
            parsed = None
        else:
            if log_repair:
                logger.info("Successfully parsed JSON after repair (output was truncated)")

    # Strategy 3: repair from the first opening brace onward
    if parsed is None:
        brace = re.search(r'\{', text)
        if brace is None:
            return None
        try:
            parsed = json.loads(_repair_truncated_json(text[brace.start():]))
        except json.JSONDecodeError:
            return None
        if log_repair:
            logger.info("Successfully parsed JSON from substring after repair")

    # Schema check: must be a dict holding all four category keys
    required_keys = {"action_items", "decisions", "key_points", "open_questions"}
    if not isinstance(parsed, dict) or not required_keys.issubset(parsed.keys()):
        return None

    # Every category value must be a list
    if any(not isinstance(parsed[key], list) for key in required_keys):
        return None

    # Flatten dict-shaped items into strings
    return _normalize_extraction_items(parsed)
679
+
680
 
681
  def _sample_llm_response(text: str, max_chars: int = 400) -> str:
682
  """Sample LLM response for trace logging."""
 
841
  token_count = 0
842
 
843
  try:
844
+ max_gen_tokens = 1024
845
  settings = model_config["inference_settings"]
846
  stream = extraction_llm.create_chat_completion(
847
  messages=messages,
848
+ max_tokens=max_gen_tokens,
849
  temperature=settings["temperature"],
850
  top_p=settings["top_p"],
851
  top_k=settings["top_k"],
 
886
  # Calculate metrics
887
  elapsed = time.time() - start_time
888
  tps = token_count / elapsed if elapsed > 0 else 0
889
+ eta = int((max_gen_tokens - token_count) / tps) if tps > 0 else 0
890
 
891
  # Get item counts
892
  items_found = {k: len(v) for k, v in partial_items.items()}
 
923
  else:
924
  json_text = full_response
925
 
926
+ final_items = _try_parse_extraction_json(json_text, log_repair=True)
927
 
928
  if not final_items:
929
+ # Graceful degradation: log warning but don't crash the pipeline.
930
+ # Other windows may still succeed and produce useful data.
931
  error_msg = f"Failed to parse JSON from window {window_id}"
932
  debug_output = f"{error_msg}\n\nRaw LLM output:\n{full_response[:1000]}\n"
933
+ logger.warning(debug_output)
934
  print(f"\n{'='*80}\n{debug_output}{'='*80}\n", flush=True)
935
  tracer.log_extraction(
936
  window_id=window_id,
 
938
  llm_response=_sample_llm_response(full_response),
939
  error=error_msg
940
  )
941
+ # Yield empty result instead of crashing
942
+ empty_items = {
943
+ "action_items": [], "decisions": [],
944
+ "key_points": [], "open_questions": []
945
+ }
946
+ ticker = format_progress_ticker(
947
+ current_window=window_id,
948
+ total_windows=total_windows,
949
+ window_tokens=window.token_count,
950
+ max_tokens=4096,
951
+ items_found={k: 0 for k in empty_items},
952
+ tokens_per_sec=0,
953
+ eta_seconds=0,
954
+ current_snippet=f"⚠️ Window {window_id} parse failed, continuing..."
955
+ )
956
+ yield (ticker, thinking_content, empty_items, True)
957
+ return
958
 
959
  # Log success
960
  tracer.log_extraction(
 
965
  error=None
966
  )
967
 
968
+ # Log detailed extraction info for debugging
969
+ json_repaired = False
970
+ parse_attempts = 1
971
+
972
+ # Check if the JSON was repaired by examining the parse function
973
+ # This is a heuristic - the actual parse_attempts would be tracked inside _try_parse_extraction_json
974
+ try:
975
+ json.loads(full_response)
976
+ except json.JSONDecodeError:
977
+ json_repaired = True
978
+ parse_attempts = 2
979
+
980
+ tracer.log_extraction_detail(
981
+ window_id=window_id,
982
+ extracted_items=final_items,
983
+ full_llm_response=full_response,
984
+ full_thinking=thinking_content,
985
+ json_repaired=json_repaired,
986
+ parse_attempts=parse_attempts
987
+ )
988
+
989
  # Final ticker
990
  elapsed = time.time() - start_time
991
  tps = token_count / elapsed if elapsed > 0 else 0
 
1047
  emb = embedding_model.embed(item)
1048
  embeddings.append(emb)
1049
 
1050
+ # Mark duplicates and track duplicate groups
1051
  keep_indices = []
1052
+ duplicate_groups = []
1053
+
1054
  for i in range(len(items)):
1055
  is_duplicate = False
1056
+ duplicate_of_idx = -1
1057
+ similarity_score = 0.0
1058
 
1059
  # Compare with all previously kept items
1060
  for j in keep_indices:
1061
  similarity = cosine_similarity(embeddings[i], embeddings[j])
1062
  if similarity >= similarity_threshold:
1063
  is_duplicate = True
1064
+ duplicate_of_idx = j
1065
+ similarity_score = similarity
1066
  break
1067
 
1068
  if not is_duplicate:
1069
  keep_indices.append(i)
1070
+ else:
1071
+ # Record duplicate group for debugging
1072
+ duplicate_groups.append({
1073
+ "duplicate_item": items[i],
1074
+ "duplicate_index": i,
1075
+ "kept_item": items[duplicate_of_idx],
1076
+ "kept_index": duplicate_of_idx,
1077
+ "similarity": round(similarity_score, 3),
1078
+ })
1079
 
1080
  # Keep only unique items
1081
  unique_items = [items[i] for i in keep_indices]
1082
  deduplicated[category] = unique_items
1083
 
1084
+ # Log deduplication with full details
1085
  duplicates_removed = original_count - len(unique_items)
1086
  tracer.log_deduplication(
1087
  category=category,
 
1089
  deduplicated_count=len(unique_items),
1090
  duplicates_removed=duplicates_removed,
1091
  similarity_threshold=similarity_threshold,
1092
+ embedding_model=embedding_model.model_key,
1093
+ original_items=items,
1094
+ deduplicated_items=unique_items,
1095
+ duplicate_groups=duplicate_groups
1096
  )
1097
 
1098
  logger.info(f"Dedup {category}: {original_count} → {len(unique_items)} ({duplicates_removed} removed)")
 
1193
  else:
1194
  summary_text = full_summary
1195
 
1196
+ # Log synthesis with full details
1197
  tracer.log_synthesis(
1198
  synthesis_model=model_config["name"],
1199
  input_item_counts=item_counts,
1200
  output_summary=_sample_llm_response(summary_text),
1201
  thinking=_sample_llm_response(thinking_content) if thinking_content else None,
1202
+ error=None,
1203
+ input_items=deduplicated_items,
1204
+ system_prompt=system_prompt,
1205
+ user_prompt=user_prompt
1206
  )
1207
 
1208
+ # Also store full outputs in synthesis_details directly
1209
+ tracer.synthesis_details["full_output_summary"] = summary_text
1210
+ tracer.synthesis_details["full_thinking"] = thinking_content
1211
+
1212
  yield (summary_text, thinking_content, True)
1213
 
1214
  except Exception as e:
 
1217
  input_item_counts=item_counts,
1218
  output_summary="",
1219
  thinking=None,
1220
+ error=str(e),
1221
+ input_items=deduplicated_items,
1222
+ system_prompt=system_prompt,
1223
+ user_prompt=user_prompt
1224
  )
1225
  raise
meeting_summarizer/trace.py CHANGED
@@ -27,6 +27,11 @@ class Tracer:
27
  self.enabled = enabled
28
  self.trace_entries: List[Dict[str, Any]] = []
29
  self.start_time = time.time()
 
 
 
 
 
30
 
31
  def log_extraction(
32
  self,
@@ -71,7 +76,10 @@ class Tracer:
71
  deduplicated_count: int,
72
  duplicates_removed: int,
73
  similarity_threshold: float,
74
- embedding_model: str
 
 
 
75
  ) -> None:
76
  """
77
  Log deduplication operation for a category.
@@ -83,6 +91,9 @@ class Tracer:
83
  duplicates_removed: Number of duplicates removed
84
  similarity_threshold: Similarity threshold used
85
  embedding_model: Embedding model used
 
 
 
86
  """
87
  if not self.enabled:
88
  return
@@ -101,6 +112,13 @@ class Tracer:
101
  }
102
 
103
  self.trace_entries.append(entry)
 
 
 
 
 
 
 
104
  logger.debug(f"[Trace] Deduplication {category}: {original_count} → {deduplicated_count} ({duplicates_removed} removed)")
105
 
106
  def log_synthesis(
@@ -109,7 +127,10 @@ class Tracer:
109
  input_item_counts: Dict[str, int],
110
  output_summary: str,
111
  thinking: Optional[str] = None,
112
- error: Optional[str] = None
 
 
 
113
  ) -> None:
114
  """
115
  Log synthesis operation.
@@ -120,6 +141,9 @@ class Tracer:
120
  output_summary: Generated summary (sampled)
121
  thinking: Thinking/reasoning content (sampled, if applicable)
122
  error: Error message if synthesis failed
 
 
 
123
  """
124
  if not self.enabled:
125
  return
@@ -137,6 +161,15 @@ class Tracer:
137
  }
138
 
139
  self.trace_entries.append(entry)
 
 
 
 
 
 
 
 
 
140
  logger.debug(f"[Trace] Synthesis: {entry['success']}")
141
 
142
  def get_trace_jsonl(self) -> str:
@@ -163,6 +196,53 @@ class Tracer:
163
 
164
  return self.trace_entries
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  def get_summary_stats(self) -> Dict[str, Any]:
167
  """
168
  Get summary statistics from trace.
@@ -195,3 +275,103 @@ class Tracer:
195
  "synthesis_success": synthesis_entries[0]["success"] if synthesis_entries else False,
196
  "total_elapsed_seconds": round(time.time() - self.start_time, 2),
197
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  self.enabled = enabled
28
  self.trace_entries: List[Dict[str, Any]] = []
29
  self.start_time = time.time()
30
+ self.preprocessing_info: Dict[str, Any] = {}
31
+ self.windows_info: List[Dict[str, Any]] = []
32
+ self.extraction_details: Dict[int, Dict[str, Any]] = {}
33
+ self.deduplication_details: Dict[str, Dict[str, Any]] = {}
34
+ self.synthesis_details: Dict[str, Any] = {}
35
 
36
  def log_extraction(
37
  self,
 
76
  deduplicated_count: int,
77
  duplicates_removed: int,
78
  similarity_threshold: float,
79
+ embedding_model: str,
80
+ original_items: Optional[List[str]] = None,
81
+ deduplicated_items: Optional[List[str]] = None,
82
+ duplicate_groups: Optional[List[Dict[str, Any]]] = None
83
  ) -> None:
84
  """
85
  Log deduplication operation for a category.
 
91
  duplicates_removed: Number of duplicates removed
92
  similarity_threshold: Similarity threshold used
93
  embedding_model: Embedding model used
94
+ original_items: Original items list (full)
95
+ deduplicated_items: Deduplicated items list (full)
96
+ duplicate_groups: List of duplicate groups with similarity scores
97
  """
98
  if not self.enabled:
99
  return
 
112
  }
113
 
114
  self.trace_entries.append(entry)
115
+
116
+ self.deduplication_details[category] = {
117
+ "original_items": original_items or [],
118
+ "deduplicated_items": deduplicated_items or [],
119
+ "duplicate_groups": duplicate_groups or [],
120
+ }
121
+
122
  logger.debug(f"[Trace] Deduplication {category}: {original_count} → {deduplicated_count} ({duplicates_removed} removed)")
123
 
124
  def log_synthesis(
 
127
  input_item_counts: Dict[str, int],
128
  output_summary: str,
129
  thinking: Optional[str] = None,
130
+ error: Optional[str] = None,
131
+ input_items: Optional[Dict[str, List[str]]] = None,
132
+ system_prompt: Optional[str] = None,
133
+ user_prompt: Optional[str] = None
134
  ) -> None:
135
  """
136
  Log synthesis operation.
 
141
  output_summary: Generated summary (sampled)
142
  thinking: Thinking/reasoning content (sampled, if applicable)
143
  error: Error message if synthesis failed
144
+ input_items: Full input items dict
145
+ system_prompt: System prompt used
146
+ user_prompt: User prompt used
147
  """
148
  if not self.enabled:
149
  return
 
161
  }
162
 
163
  self.trace_entries.append(entry)
164
+
165
+ self.synthesis_details = {
166
+ "input_items": input_items or {},
167
+ "system_prompt": system_prompt or "",
168
+ "user_prompt": user_prompt or "",
169
+ "full_output_summary": output_summary or "",
170
+ "full_thinking": thinking or "",
171
+ }
172
+
173
  logger.debug(f"[Trace] Synthesis: {entry['success']}")
174
 
175
  def get_trace_jsonl(self) -> str:
 
196
 
197
  return self.trace_entries
198
 
199
+ def get_debug_json(self) -> Dict[str, Any]:
200
+ """
201
+ Get full debug information including detailed logs from all stages.
202
+
203
+ Returns:
204
+ Dict with rich debug information for all 3 stages
205
+ """
206
+ if not self.enabled:
207
+ return {}
208
+
209
+ return {
210
+ "preprocessing": self.preprocessing_info,
211
+ "windows": self.windows_info,
212
+ "extraction": {
213
+ "details": self.extraction_details,
214
+ "summary": {
215
+ "total_windows": len(self.windows_info),
216
+ "total_items": sum(
217
+ sum(d["item_counts"].values())
218
+ for d in self.extraction_details.values()
219
+ ),
220
+ "windows_with_repaired_json": sum(
221
+ 1 for d in self.extraction_details.values()
222
+ if d.get("json_repaired", False)
223
+ ),
224
+ }
225
+ },
226
+ "deduplication": {
227
+ "details": self.deduplication_details,
228
+ "summary": {
229
+ "total_original_items": sum(
230
+ len(d.get("original_items", []))
231
+ for d in self.deduplication_details.values()
232
+ ),
233
+ "total_deduplicated_items": sum(
234
+ len(d.get("deduplicated_items", []))
235
+ for d in self.deduplication_details.values()
236
+ ),
237
+ "total_duplicates_removed": sum(
238
+ len(d.get("original_items", [])) - len(d.get("deduplicated_items", []))
239
+ for d in self.deduplication_details.values()
240
+ ),
241
+ }
242
+ },
243
+ "synthesis": self.synthesis_details,
244
+ }
245
+
246
  def get_summary_stats(self) -> Dict[str, Any]:
247
  """
248
  Get summary statistics from trace.
 
275
  "synthesis_success": synthesis_entries[0]["success"] if synthesis_entries else False,
276
  "total_elapsed_seconds": round(time.time() - self.start_time, 2),
277
  }
278
+
279
+ def log_preprocessing(
280
+ self,
281
+ original_line_count: int,
282
+ cleaned_line_count: int,
283
+ original_char_count: int,
284
+ cleaned_char_count: int,
285
+ noise_phrases_removed: List[str],
286
+ detection_method: str = "segment_level"
287
+ ) -> None:
288
+ """
289
+ Log transcript preprocessing information.
290
+
291
+ Args:
292
+ original_line_count: Number of lines before preprocessing
293
+ cleaned_line_count: Number of lines after preprocessing
294
+ original_char_count: Character count before preprocessing
295
+ cleaned_char_count: Character count after preprocessing
296
+ noise_phrases_removed: List of noise phrases detected and removed
297
+ detection_method: Method used for noise detection
298
+ """
299
+ if not self.enabled:
300
+ return
301
+
302
+ self.preprocessing_info = {
303
+ "original_line_count": original_line_count,
304
+ "cleaned_line_count": cleaned_line_count,
305
+ "original_char_count": original_char_count,
306
+ "cleaned_char_count": cleaned_char_count,
307
+ "lines_removed": original_line_count - cleaned_line_count,
308
+ "chars_removed": original_char_count - cleaned_char_count,
309
+ "line_reduction_pct": round((1 - cleaned_line_count / original_line_count) * 100, 1) if original_line_count > 0 else 0.0,
310
+ "char_reduction_pct": round((1 - cleaned_char_count / original_char_count) * 100, 1) if original_char_count > 0 else 0.0,
311
+ "noise_phrases_removed": noise_phrases_removed,
312
+ "detection_method": detection_method,
313
+ }
314
+
315
+ logger.debug(f"[Trace] Preprocessing: {original_line_count} → {cleaned_line_count} lines ({self.preprocessing_info['line_reduction_pct']}% reduction)")
316
+
317
+ def log_window(
318
+ self,
319
+ window_id: int,
320
+ content: str,
321
+ token_count: int,
322
+ start_turn: int,
323
+ end_turn: int
324
+ ) -> None:
325
+ """
326
+ Log window information.
327
+
328
+ Args:
329
+ window_id: Window identifier
330
+ content: Window content (may be truncated for storage)
331
+ token_count: Number of tokens in window
332
+ start_turn: Starting line number
333
+ end_turn: Ending line number
334
+ """
335
+ if not self.enabled:
336
+ return
337
+
338
+ self.windows_info.append({
339
+ "window_id": window_id,
340
+ "content": content,
341
+ "token_count": token_count,
342
+ "start_turn": start_turn,
343
+ "end_turn": end_turn,
344
+ "line_count": end_turn - start_turn + 1,
345
+ })
346
+
347
+ def log_extraction_detail(
348
+ self,
349
+ window_id: int,
350
+ extracted_items: Dict[str, List[str]],
351
+ full_llm_response: str,
352
+ full_thinking: Optional[str],
353
+ json_repaired: bool,
354
+ parse_attempts: int
355
+ ) -> None:
356
+ """
357
+ Log detailed extraction information for a window.
358
+
359
+ Args:
360
+ window_id: Window identifier
361
+ extracted_items: Full extracted items dict
362
+ full_llm_response: Complete LLM response
363
+ full_thinking: Complete thinking content (if any)
364
+ json_repaired: Whether JSON was repaired during parsing
365
+ parse_attempts: Number of parse attempts required
366
+ """
367
+ if not self.enabled:
368
+ return
369
+
370
+ self.extraction_details[window_id] = {
371
+ "extracted_items": extracted_items,
372
+ "full_llm_response": full_llm_response,
373
+ "full_thinking": full_thinking,
374
+ "json_repaired": json_repaired,
375
+ "parse_attempts": parse_attempts,
376
+ "item_counts": {k: len(v) for k, v in extracted_items.items()},
377
+ }