Spaces:

Luigi
/

tiny-scribe

Running

Luigi committed on Feb 5

Commit

9129005

1 Parent(s): 7c9ccb7

fix: add examples to extraction prompts to prevent empty JSON output

Changed both reasoning and schema extraction prompts to include example items instead of empty arrays.
This prevents Qwen3/LFM2-Extract from copying the schema template and returning {}.

Debug JSON now shows 36 items extracted instead of 0.

Files changed (2) hide show

app.py +16 -8
meeting_summarizer/extraction.py +70 -70

app.py CHANGED Viewed

@@ -8,22 +8,30 @@ UI Version: 2.0 - Enhanced with modern styling and UX improvements
 """
 import os
-import re
 import gc
-import json
 import time
-from typing import Tuple, Generator, Optional, Dict, Any, List
-import gradio as gr
-from llama_cpp import Llama
-from opencc import OpenCC
 import logging
 from huggingface_hub import list_repo_files, hf_hub_download
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
-# Configure logging
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Global model instance
 llm = None
 converter = None

 """
 import os
 import gc
 import time
 import logging
+import re
+import json
+from typing import Dict, List, Any, Optional, Generator, Tuple
+from datetime import datetime
+from opencc import OpenCC
+from llama_cpp import Llama
+import gradio as gr
 from huggingface_hub import list_repo_files, hf_hub_download
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
+from meeting_summarizer.trace import Tracer
+from meeting_summarizer.extraction import (
+    EmbeddingModel, Window, preprocess_transcript,
+    stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
+)
 logger = logging.getLogger(__name__)
+# Increase Hugging Face timeout to handle slow connections
+os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'  # 5 minutes
 # Global model instance
 llm = None
 converter = None

meeting_summarizer/extraction.py CHANGED Viewed

@@ -691,100 +691,100 @@ def _build_schema_extraction_prompt(output_language: str) -> str:
     """Build concise schema-based extraction prompt (optimized for LFM2-Extract and non-reasoning models)."""
     if output_language == "zh-TW":
         return """以 JSON 格式返回資料，使用以下架構：
 {
-  "action_items": [],
-  "decisions": [],
-  "key_points": [],
-  "open_questions": []
 }
-action_items: 包含負責人和截止日期的具體行動項目
-decisions: 包合理由的決策
-key_points: 重要討論要點
-open_questions: 未解決的問題或疑慮
-從使用者提供的逐字稿中提取。逐字稿可能包含重複、雜訊或不完整內容，請專注於有意義的對話內容，忽略重複的詞句。"""
     else:
         return """Return data as a JSON object with the following schema:
 {
-  "action_items": [],
-  "decisions": [],
-  "key_points": [],
-  "open_questions": []
 }
-action_items: Specific action items with owner and deadline
-decisions: Decisions made with rationale
-key_points: Important discussion points
-open_questions: Unresolved questions or concerns
-Extract from the transcript provided by the user. The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases."""
 def _build_reasoning_extraction_prompt(output_language: str) -> str:
     """Build verbose extraction prompt with reasoning instructions (for hybrid models like Qwen3)."""
     if output_language == "zh-TW":
         return """你是會議分析助手。
-使用你的推理能力分析內容後再進行提取。
-你的推理應該：
 1. 識別關鍵決策點和行動項目
 2. 區分明確決策與一般討論
 3. 適當分類資訊（行動 vs 要點 vs 問題）
-逐字稿可能包含重複、雜訊或不完整內容，請專注於有意義的對話內容，忽略重複的詞句。
-推理後，以 JSON 格式返回資料，使用以下架構：
 {
-  "action_items": [],
-  "decisions": [],
-  "key_points": [],
-  "open_questions": []
 }
-action_items: 包含負責人和截止日期的具體行動項目
-decisions: 包合理由的決策
-key_points: 重要討論要點
-open_questions: 未解決的問題或疑慮
-規則：
-- 每個項目必須是完整、獨立的句子
-- 在每個項目中包含上下文（誰、什麼、何時）
-- 如果類別沒有項目，使用空陣列 []
-- 僅輸出 JSON，無 markdown，無解釋"""
     else:
         return """You are a meeting analysis assistant.
-Use your reasoning capabilities to analyze the content before extracting.
-Your reasoning should:
 1. Identify key decision points and action items
 2. Distinguish explicit decisions from general discussion
 3. Categorize information appropriately (action vs point vs question)
-The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases.
-After reasoning, return data as a JSON object with the following schema:
 {
-  "action_items": [],
-  "decisions": [],
-  "key_points": [],
-  "open_questions": []
 }
-action_items: Specific action items with owner and deadline
-decisions: Decisions made with rationale
-key_points: Important discussion points
-open_questions: Unresolved questions or concerns
-Rules:
-- Each item must be a complete, standalone sentence
-- Include context (who, what, when) in each item
-- If a category has no items, use empty array []
-- Output ONLY JSON, no markdown, no explanations"""
 # ===== CORE PIPELINE FUNCTIONS =====

     """Build concise schema-based extraction prompt (optimized for LFM2-Extract and non-reasoning models)."""
     if output_language == "zh-TW":
         return """以 JSON 格式返回資料，使用以下架構：
 {
+  "action_items": ["具體行動項目1"],
+  "decisions": ["決策1"],
+  "key_points": ["要點1"],
+  "open_questions": ["問題1"]
 }
+ action_items: 包含負責人和截止日期的具體行動項目
+ decisions: 包合理由的決策
+ key_points: 重要討論要點
+ open_questions: 未解決的問題或疑慮
+ 從使用者提供的逐字稿中提取。逐字稿可能包含重複、雜訊或不完整內容，請專注於有意義的對話內容，忽略重複的詞句。"""
     else:
         return """Return data as a JSON object with the following schema:
 {
+  "action_items": ["action item 1"],
+  "decisions": ["decision 1"],
+  "key_points": ["point 1"],
+  "open_questions": ["question 1"]
 }
+ action_items: Specific action items with owner and deadline
+ decisions: Decisions made with rationale
+ key_points: Important discussion points
+ open_questions: Unresolved questions or concerns
+ Extract from the transcript provided by the user. The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases."""
 def _build_reasoning_extraction_prompt(output_language: str) -> str:
     """Build verbose extraction prompt with reasoning instructions (for hybrid models like Qwen3)."""
     if output_language == "zh-TW":
         return """你是會議分析助手。
+ 使用你的推理能力分析內容後再進行提取。
+ 你的推理應該：
 1. 識別關鍵決策點和行動項目
 2. 區分明確決策與一般討論
 3. 適當分類資訊（行動 vs 要點 vs 問題）
+ 逐字稿可能包含重複、雜訊或不完整內容，請專注於有意義的對話內容，忽略重複的詞句。
+ 推理後，以 JSON 格式返回資料，使用以下架構：
 {
+  "action_items": ["具體行動項目1", "具體行動項目2"],
+  "decisions": ["決策1", "決策2"],
+  "key_points": ["要點1", "要點2"],
+  "open_questions": ["問題1", "問題2"]
 }
+ action_items: 包含負責人和截止日期的具體行動項目
+ decisions: 包合理由的決策
+ key_points: 重要討論要點
+ open_questions: 未解決的問題或疑慮
+ 規則：
+ - 每個項目必須是完整、獨立的句子
+ - 在每個項目中包含上下文（誰、什麼、何時）
+ - 如果類別沒有項目，使用空陣列 []
+ - 僅輸出 JSON，無 markdown，無解釋"""
     else:
         return """You are a meeting analysis assistant.
+ Use your reasoning capabilities to analyze the content before extracting.
+ Your reasoning should:
 1. Identify key decision points and action items
 2. Distinguish explicit decisions from general discussion
 3. Categorize information appropriately (action vs point vs question)
+ The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases.
+ After reasoning, return data as a JSON object with the following schema:
 {
+  "action_items": ["action item 1", "action item 2"],
+  "decisions": ["decision 1", "decision 2"],
+  "key_points": ["point 1", "point 2"],
+  "open_questions": ["question 1", "question 2"]
 }
+ action_items: Specific action items with owner and deadline
+ decisions: Decisions made with rationale
+ key_points: Important discussion points
+ open_questions: Unresolved questions or concerns
+ Rules:
+ - Each item must be a complete, standalone sentence
+ - Include context (who, what, when) in each item
+ - If a category has no items, use empty array []
+ - Output ONLY JSON, no markdown, no explanations"""
 # ===== CORE PIPELINE FUNCTIONS =====