Luigi committed on
Commit
9129005
·
1 Parent(s): 7c9ccb7

fix: add examples to extraction prompts to prevent empty JSON output

Browse files

Changed both the reasoning and schema extraction prompts to include example items instead of empty arrays.
This prevents Qwen3/LFM2-Extract from copying the schema template and returning {}.

Debug JSON now shows 36 items extracted instead of 0.

Files changed (2) hide show
  1. app.py +16 -8
  2. meeting_summarizer/extraction.py +70 -70
app.py CHANGED
@@ -8,22 +8,30 @@ UI Version: 2.0 - Enhanced with modern styling and UX improvements
8
  """
9
 
10
  import os
11
- import re
12
  import gc
13
- import json
14
  import time
15
- from typing import Tuple, Generator, Optional, Dict, Any, List
16
- import gradio as gr
17
- from llama_cpp import Llama
18
- from opencc import OpenCC
19
  import logging
 
 
 
 
 
 
 
20
  from huggingface_hub import list_repo_files, hf_hub_download
21
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
22
 
23
- # Configure logging
24
- logging.basicConfig(level=logging.INFO)
 
 
 
 
25
  logger = logging.getLogger(__name__)
26
 
 
 
 
27
  # Global model instance
28
  llm = None
29
  converter = None
 
8
  """
9
 
10
  import os
 
11
  import gc
 
12
  import time
 
 
 
 
13
  import logging
14
+ import re
15
+ import json
16
+ from typing import Dict, List, Any, Optional, Generator, Tuple
17
+ from datetime import datetime
18
+ from opencc import OpenCC
19
+ from llama_cpp import Llama
20
+ import gradio as gr
21
  from huggingface_hub import list_repo_files, hf_hub_download
22
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
23
 
24
+ from meeting_summarizer.trace import Tracer
25
+ from meeting_summarizer.extraction import (
26
+ EmbeddingModel, Window, preprocess_transcript,
27
+ stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
28
+ )
29
+
30
  logger = logging.getLogger(__name__)
31
 
32
+ # Increase Hugging Face timeout to handle slow connections
33
+ os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300' # 5 minutes
34
+
35
  # Global model instance
36
  llm = None
37
  converter = None
meeting_summarizer/extraction.py CHANGED
@@ -691,100 +691,100 @@ def _build_schema_extraction_prompt(output_language: str) -> str:
691
  """Build concise schema-based extraction prompt (optimized for LFM2-Extract and non-reasoning models)."""
692
  if output_language == "zh-TW":
693
  return """以 JSON 格式返回資料,使用以下架構:
694
-
695
  {
696
- "action_items": [],
697
- "decisions": [],
698
- "key_points": [],
699
- "open_questions": []
700
  }
701
-
702
- action_items: 包含負責人和截止日期的具體行動項目
703
- decisions: 包合理由的決策
704
- key_points: 重要討論要點
705
- open_questions: 未解決的問題或疑慮
706
-
707
- 從使用者提供的逐字稿中提取。逐字稿可能包含重複、雜訊或不完整內容,請專注於有意義的對話內容,忽略重複的詞句。"""
708
  else:
709
  return """Return data as a JSON object with the following schema:
710
-
711
  {
712
- "action_items": [],
713
- "decisions": [],
714
- "key_points": [],
715
- "open_questions": []
716
  }
717
-
718
- action_items: Specific action items with owner and deadline
719
- decisions: Decisions made with rationale
720
- key_points: Important discussion points
721
- open_questions: Unresolved questions or concerns
722
-
723
- Extract from the transcript provided by the user. The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases."""
724
 
725
 
726
  def _build_reasoning_extraction_prompt(output_language: str) -> str:
727
  """Build verbose extraction prompt with reasoning instructions (for hybrid models like Qwen3)."""
728
  if output_language == "zh-TW":
729
  return """你是會議分析助手。
730
-
731
- 使用你的推理能力分析內容後再進行提取。
732
-
733
- 你的推理應該:
734
  1. 識別關鍵決策點和行動項目
735
  2. 區分明確決策與一般討論
736
  3. 適當分類資訊(行動 vs 要點 vs 問題)
737
-
738
- 逐字稿可能包含重複、雜訊或不完整內容,請專注於有意義的對話內容,忽略重複的詞句。
739
-
740
- 推理後,以 JSON 格式返回資料,使用以下架構:
741
  {
742
- "action_items": [],
743
- "decisions": [],
744
- "key_points": [],
745
- "open_questions": []
746
  }
747
-
748
- action_items: 包含負責人和截止日期的具體行動項目
749
- decisions: 包合理由的決策
750
- key_points: 重要討論要點
751
- open_questions: 未解決的問題或疑慮
752
-
753
- 規則:
754
- - 每個項目必須是完整、獨立的句子
755
- - 在每個項目中包含上下文(誰、什麼、何時)
756
- - 如果類別沒有項目,使用空陣列 []
757
- - 僅輸出 JSON,無 markdown,無解釋"""
758
  else:
759
  return """You are a meeting analysis assistant.
760
-
761
- Use your reasoning capabilities to analyze the content before extracting.
762
-
763
- Your reasoning should:
764
  1. Identify key decision points and action items
765
  2. Distinguish explicit decisions from general discussion
766
  3. Categorize information appropriately (action vs point vs question)
767
-
768
- The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases.
769
-
770
- After reasoning, return data as a JSON object with the following schema:
771
  {
772
- "action_items": [],
773
- "decisions": [],
774
- "key_points": [],
775
- "open_questions": []
776
  }
777
-
778
- action_items: Specific action items with owner and deadline
779
- decisions: Decisions made with rationale
780
- key_points: Important discussion points
781
- open_questions: Unresolved questions or concerns
782
-
783
- Rules:
784
- - Each item must be a complete, standalone sentence
785
- - Include context (who, what, when) in each item
786
- - If a category has no items, use empty array []
787
- - Output ONLY JSON, no markdown, no explanations"""
788
 
789
 
790
  # ===== CORE PIPELINE FUNCTIONS =====
 
691
  """Build concise schema-based extraction prompt (optimized for LFM2-Extract and non-reasoning models)."""
692
  if output_language == "zh-TW":
693
  return """以 JSON 格式返回資料,使用以下架構:
694
+
695
  {
696
+ "action_items": ["具體行動項目1"],
697
+ "decisions": ["決策1"],
698
+ "key_points": ["要點1"],
699
+ "open_questions": ["問題1"]
700
  }
701
+
702
+ action_items: 包含負責人和截止日期的具體行動項目
703
+ decisions: 包合理由的決策
704
+ key_points: 重要討論要點
705
+ open_questions: 未解決的問題或疑慮
706
+
707
+ 從使用者提供的逐字稿中提取。逐字稿可能包含重複、雜訊或不完整內容,請專注於有意義的對話內容,忽略重複的詞句。"""
708
  else:
709
  return """Return data as a JSON object with the following schema:
710
+
711
  {
712
+ "action_items": ["action item 1"],
713
+ "decisions": ["decision 1"],
714
+ "key_points": ["point 1"],
715
+ "open_questions": ["question 1"]
716
  }
717
+
718
+ action_items: Specific action items with owner and deadline
719
+ decisions: Decisions made with rationale
720
+ key_points: Important discussion points
721
+ open_questions: Unresolved questions or concerns
722
+
723
+ Extract from the transcript provided by the user. The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases."""
724
 
725
 
726
  def _build_reasoning_extraction_prompt(output_language: str) -> str:
727
  """Build verbose extraction prompt with reasoning instructions (for hybrid models like Qwen3)."""
728
  if output_language == "zh-TW":
729
  return """你是會議分析助手。
730
+
731
+ 使用你的推理能力分析內容後再進行提取。
732
+
733
+ 你的推理應該:
734
  1. 識別關鍵決策點和行動項目
735
  2. 區分明確決策與一般討論
736
  3. 適當分類資訊(行動 vs 要點 vs 問題)
737
+
738
+ 逐字稿可能包含重複、雜訊或不完整內容,請專注於有意義的對話內容,忽略重複的詞句。
739
+
740
+ 推理後,以 JSON 格式返回資料,使用以下架構:
741
  {
742
+ "action_items": ["具體行動項目1", "具體行動項目2"],
743
+ "decisions": ["決策1", "決策2"],
744
+ "key_points": ["要點1", "要點2"],
745
+ "open_questions": ["問題1", "問題2"]
746
  }
747
+
748
+ action_items: 包含負責人和截止日期的具體行動項目
749
+ decisions: 包合理由的決策
750
+ key_points: 重要討論要點
751
+ open_questions: 未解決的問題或疑慮
752
+
753
+ 規則:
754
+ - 每個項目必須是完整、獨立的句子
755
+ - 在每個項目中包含上下文(誰、什麼、何時)
756
+ - 如果類別沒有項目,使用空陣列 []
757
+ - 僅輸出 JSON,無 markdown,無解釋"""
758
  else:
759
  return """You are a meeting analysis assistant.
760
+
761
+ Use your reasoning capabilities to analyze the content before extracting.
762
+
763
+ Your reasoning should:
764
  1. Identify key decision points and action items
765
  2. Distinguish explicit decisions from general discussion
766
  3. Categorize information appropriately (action vs point vs question)
767
+
768
+ The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases.
769
+
770
+ After reasoning, return data as a JSON object with the following schema:
771
  {
772
+ "action_items": ["action item 1", "action item 2"],
773
+ "decisions": ["decision 1", "decision 2"],
774
+ "key_points": ["point 1", "point 2"],
775
+ "open_questions": ["question 1", "question 2"]
776
  }
777
+
778
+ action_items: Specific action items with owner and deadline
779
+ decisions: Decisions made with rationale
780
+ key_points: Important discussion points
781
+ open_questions: Unresolved questions or concerns
782
+
783
+ Rules:
784
+ - Each item must be a complete, standalone sentence
785
+ - Include context (who, what, when) in each item
786
+ - If a category has no items, use empty array []
787
+ - Output ONLY JSON, no markdown, no explanations"""
788
 
789
 
790
  # ===== CORE PIPELINE FUNCTIONS =====