Spaces:
Running
Running
fix: add examples to extraction prompts to prevent empty JSON output
Browse files
Changed both reasoning and schema extraction prompts to include example items instead of empty arrays.
This prevents Qwen3/LFM2-Extract from copying the schema template and returning {}.
Debug JSON now shows 36 items extracted instead of 0.
- app.py +16 -8
- meeting_summarizer/extraction.py +70 -70
app.py
CHANGED
|
@@ -8,22 +8,30 @@ UI Version: 2.0 - Enhanced with modern styling and UX improvements
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
import os
|
| 11 |
-
import re
|
| 12 |
import gc
|
| 13 |
-
import json
|
| 14 |
import time
|
| 15 |
-
from typing import Tuple, Generator, Optional, Dict, Any, List
|
| 16 |
-
import gradio as gr
|
| 17 |
-
from llama_cpp import Llama
|
| 18 |
-
from opencc import OpenCC
|
| 19 |
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
from huggingface_hub import list_repo_files, hf_hub_download
|
| 21 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
logger = logging.getLogger(__name__)
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
# Global model instance
|
| 28 |
llm = None
|
| 29 |
converter = None
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
import os
|
|
|
|
| 11 |
import gc
|
|
|
|
| 12 |
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
import logging
|
| 14 |
+
import re
|
| 15 |
+
import json
|
| 16 |
+
from typing import Dict, List, Any, Optional, Generator, Tuple
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from opencc import OpenCC
|
| 19 |
+
from llama_cpp import Llama
|
| 20 |
+
import gradio as gr
|
| 21 |
from huggingface_hub import list_repo_files, hf_hub_download
|
| 22 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
| 23 |
|
| 24 |
+
from meeting_summarizer.trace import Tracer
|
| 25 |
+
from meeting_summarizer.extraction import (
|
| 26 |
+
EmbeddingModel, Window, preprocess_transcript,
|
| 27 |
+
stream_extract_from_window, deduplicate_items, stream_synthesize_executive_summary
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
logger = logging.getLogger(__name__)
|
| 31 |
|
| 32 |
+
# Increase Hugging Face timeout to handle slow connections
|
| 33 |
+
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300' # 5 minutes
|
| 34 |
+
|
| 35 |
# Global model instance
|
| 36 |
llm = None
|
| 37 |
converter = None
|
meeting_summarizer/extraction.py
CHANGED
|
@@ -691,100 +691,100 @@ def _build_schema_extraction_prompt(output_language: str) -> str:
|
|
| 691 |
"""Build concise schema-based extraction prompt (optimized for LFM2-Extract and non-reasoning models)."""
|
| 692 |
if output_language == "zh-TW":
|
| 693 |
return """以 JSON 格式返回資料,使用以下架構:
|
| 694 |
-
|
| 695 |
{
|
| 696 |
-
"action_items": [],
|
| 697 |
-
"decisions": [],
|
| 698 |
-
"key_points": [],
|
| 699 |
-
"open_questions": []
|
| 700 |
}
|
| 701 |
-
|
| 702 |
-
action_items: 包含負責人和截止日期的具體行動項目
|
| 703 |
-
decisions: 包合理由的決策
|
| 704 |
-
key_points: 重要討論要點
|
| 705 |
-
open_questions: 未解決的問題或疑慮
|
| 706 |
-
|
| 707 |
-
從使用者提供的逐字稿中提取。逐字稿可能包含重複、雜訊或不完整內容,請專注於有意義的對話內容,忽略重複的詞句。"""
|
| 708 |
else:
|
| 709 |
return """Return data as a JSON object with the following schema:
|
| 710 |
-
|
| 711 |
{
|
| 712 |
-
"action_items": [],
|
| 713 |
-
"decisions": [],
|
| 714 |
-
"key_points": [],
|
| 715 |
-
"open_questions": []
|
| 716 |
}
|
| 717 |
-
|
| 718 |
-
action_items: Specific action items with owner and deadline
|
| 719 |
-
decisions: Decisions made with rationale
|
| 720 |
-
key_points: Important discussion points
|
| 721 |
-
open_questions: Unresolved questions or concerns
|
| 722 |
-
|
| 723 |
-
Extract from the transcript provided by the user. The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases."""
|
| 724 |
|
| 725 |
|
| 726 |
def _build_reasoning_extraction_prompt(output_language: str) -> str:
|
| 727 |
"""Build verbose extraction prompt with reasoning instructions (for hybrid models like Qwen3)."""
|
| 728 |
if output_language == "zh-TW":
|
| 729 |
return """你是會議分析助手。
|
| 730 |
-
|
| 731 |
-
使用你的推理能力分析內容後再進行提取。
|
| 732 |
-
|
| 733 |
-
你的推理應該:
|
| 734 |
1. 識別關鍵決策點和行動項目
|
| 735 |
2. 區分明確決策與一般討論
|
| 736 |
3. 適當分類資訊(行動 vs 要點 vs 問題)
|
| 737 |
-
|
| 738 |
-
逐字稿可能包含重複、雜訊或不完整內容,請專注於有意義的對話內容,忽略重複的詞句。
|
| 739 |
-
|
| 740 |
-
推理後,以 JSON 格式返回資料,使用以下架構:
|
| 741 |
{
|
| 742 |
-
"action_items": [],
|
| 743 |
-
"decisions": [],
|
| 744 |
-
"key_points": [],
|
| 745 |
-
"open_questions": []
|
| 746 |
}
|
| 747 |
-
|
| 748 |
-
action_items: 包含負責人和截止日期的具體行動項目
|
| 749 |
-
decisions: 包合理由的決策
|
| 750 |
-
key_points: 重要討論要點
|
| 751 |
-
open_questions: 未解決的問題或疑慮
|
| 752 |
-
|
| 753 |
-
規則:
|
| 754 |
-
- 每個項目必須是完整、獨立的句子
|
| 755 |
-
- 在每個項目中包含上下文(誰、什麼、何時)
|
| 756 |
-
- 如果類別沒有項目,使用空陣列 []
|
| 757 |
-
- 僅輸出 JSON,無 markdown,無解釋"""
|
| 758 |
else:
|
| 759 |
return """You are a meeting analysis assistant.
|
| 760 |
-
|
| 761 |
-
Use your reasoning capabilities to analyze the content before extracting.
|
| 762 |
-
|
| 763 |
-
Your reasoning should:
|
| 764 |
1. Identify key decision points and action items
|
| 765 |
2. Distinguish explicit decisions from general discussion
|
| 766 |
3. Categorize information appropriately (action vs point vs question)
|
| 767 |
-
|
| 768 |
-
The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases.
|
| 769 |
-
|
| 770 |
-
After reasoning, return data as a JSON object with the following schema:
|
| 771 |
{
|
| 772 |
-
"action_items": [],
|
| 773 |
-
"decisions": [],
|
| 774 |
-
"key_points": [],
|
| 775 |
-
"open_questions": []
|
| 776 |
}
|
| 777 |
-
|
| 778 |
-
action_items: Specific action items with owner and deadline
|
| 779 |
-
decisions: Decisions made with rationale
|
| 780 |
-
key_points: Important discussion points
|
| 781 |
-
open_questions: Unresolved questions or concerns
|
| 782 |
-
|
| 783 |
-
Rules:
|
| 784 |
-
- Each item must be a complete, standalone sentence
|
| 785 |
-
- Include context (who, what, when) in each item
|
| 786 |
-
- If a category has no items, use empty array []
|
| 787 |
-
- Output ONLY JSON, no markdown, no explanations"""
|
| 788 |
|
| 789 |
|
| 790 |
# ===== CORE PIPELINE FUNCTIONS =====
|
|
|
|
| 691 |
"""Build concise schema-based extraction prompt (optimized for LFM2-Extract and non-reasoning models)."""
|
| 692 |
if output_language == "zh-TW":
|
| 693 |
return """以 JSON 格式返回資料,使用以下架構:
|
| 694 |
+
|
| 695 |
{
|
| 696 |
+
"action_items": ["具體行動項目1"],
|
| 697 |
+
"decisions": ["決策1"],
|
| 698 |
+
"key_points": ["要點1"],
|
| 699 |
+
"open_questions": ["問題1"]
|
| 700 |
}
|
| 701 |
+
|
| 702 |
+
action_items: 包含負責人和截止日期的具體行動項目
|
| 703 |
+
decisions: 包合理由的決策
|
| 704 |
+
key_points: 重要討論要點
|
| 705 |
+
open_questions: 未解決的問題或疑慮
|
| 706 |
+
|
| 707 |
+
從使用者提供的逐字稿中提取。逐字稿可能包含重複、雜訊或不完整內容,請專注於有意義的對話內容,忽略重複的詞句。"""
|
| 708 |
else:
|
| 709 |
return """Return data as a JSON object with the following schema:
|
| 710 |
+
|
| 711 |
{
|
| 712 |
+
"action_items": ["action item 1"],
|
| 713 |
+
"decisions": ["decision 1"],
|
| 714 |
+
"key_points": ["point 1"],
|
| 715 |
+
"open_questions": ["question 1"]
|
| 716 |
}
|
| 717 |
+
|
| 718 |
+
action_items: Specific action items with owner and deadline
|
| 719 |
+
decisions: Decisions made with rationale
|
| 720 |
+
key_points: Important discussion points
|
| 721 |
+
open_questions: Unresolved questions or concerns
|
| 722 |
+
|
| 723 |
+
Extract from the transcript provided by the user. The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases."""
|
| 724 |
|
| 725 |
|
| 726 |
def _build_reasoning_extraction_prompt(output_language: str) -> str:
|
| 727 |
"""Build verbose extraction prompt with reasoning instructions (for hybrid models like Qwen3)."""
|
| 728 |
if output_language == "zh-TW":
|
| 729 |
return """你是會議分析助手。
|
| 730 |
+
|
| 731 |
+
使用你的推理能力分析內容後再進行提取。
|
| 732 |
+
|
| 733 |
+
你的推理應該:
|
| 734 |
1. 識別關鍵決策點和行動項目
|
| 735 |
2. 區分明確決策與一般討論
|
| 736 |
3. 適當分類資訊(行動 vs 要點 vs 問題)
|
| 737 |
+
|
| 738 |
+
逐字稿可能包含重複、雜訊或不完整內容,請專注於有意義的對話內容,忽略重複的詞句。
|
| 739 |
+
|
| 740 |
+
推理後,以 JSON 格式返回資料,使用以下架構:
|
| 741 |
{
|
| 742 |
+
"action_items": ["具體行動項目1", "具體行動項目2"],
|
| 743 |
+
"decisions": ["決策1", "決策2"],
|
| 744 |
+
"key_points": ["要點1", "要點2"],
|
| 745 |
+
"open_questions": ["問題1", "問題2"]
|
| 746 |
}
|
| 747 |
+
|
| 748 |
+
action_items: 包含負責人和截止日期的具體行動項目
|
| 749 |
+
decisions: 包合理由的決策
|
| 750 |
+
key_points: 重要討論要點
|
| 751 |
+
open_questions: 未解決的問題或疑慮
|
| 752 |
+
|
| 753 |
+
規則:
|
| 754 |
+
- 每個項目必須是完整、獨立的句子
|
| 755 |
+
- 在每個項目中包含上下文(誰、什麼、何時)
|
| 756 |
+
- 如果類別沒有項目,使用空陣列 []
|
| 757 |
+
- 僅輸出 JSON,無 markdown,無解釋"""
|
| 758 |
else:
|
| 759 |
return """You are a meeting analysis assistant.
|
| 760 |
+
|
| 761 |
+
Use your reasoning capabilities to analyze the content before extracting.
|
| 762 |
+
|
| 763 |
+
Your reasoning should:
|
| 764 |
1. Identify key decision points and action items
|
| 765 |
2. Distinguish explicit decisions from general discussion
|
| 766 |
3. Categorize information appropriately (action vs point vs question)
|
| 767 |
+
|
| 768 |
+
The transcript may contain repetitions, noise, or incomplete sentences - focus on meaningful dialogue content and ignore repetitive phrases.
|
| 769 |
+
|
| 770 |
+
After reasoning, return data as a JSON object with the following schema:
|
| 771 |
{
|
| 772 |
+
"action_items": ["action item 1", "action item 2"],
|
| 773 |
+
"decisions": ["decision 1", "decision 2"],
|
| 774 |
+
"key_points": ["point 1", "point 2"],
|
| 775 |
+
"open_questions": ["question 1", "question 2"]
|
| 776 |
}
|
| 777 |
+
|
| 778 |
+
action_items: Specific action items with owner and deadline
|
| 779 |
+
decisions: Decisions made with rationale
|
| 780 |
+
key_points: Important discussion points
|
| 781 |
+
open_questions: Unresolved questions or concerns
|
| 782 |
+
|
| 783 |
+
Rules:
|
| 784 |
+
- Each item must be a complete, standalone sentence
|
| 785 |
+
- Include context (who, what, when) in each item
|
| 786 |
+
- If a category has no items, use empty array []
|
| 787 |
+
- Output ONLY JSON, no markdown, no explanations"""
|
| 788 |
|
| 789 |
|
| 790 |
# ===== CORE PIPELINE FUNCTIONS =====
|