Spaces:
Running
Running
fix: improve extraction success rate with Qwen3 models
Browse files
- Increase max_tokens from 1024 to max(2048, window//2) for larger windows
- Reduce window size from ~2600 to ~1800 tokens (n_ctx - 2300)
- Lower Qwen3 temperature to 0.1 for extraction (vs 0.3 for synthesis)
- Add empty JSON retry logic when model returns {}
- Update docs: remove LFM2-Extract models, set Qwen3 1.7B as default
Root cause: Qwen3 returns empty JSON for windows > 2400 tokens.
Fixes: larger token limit, smaller windows, near-greedy temperature (0.1), retry on empty JSON.
- app.py +22 -40
- docs/advanced-mode-implementation-plan.md +11 -14
- meeting_summarizer/extraction.py +91 -14
app.py
CHANGED
|
@@ -753,22 +753,6 @@ EXTRACTION_MODELS = {
|
|
| 753 |
"repeat_penalty": 1.0,
|
| 754 |
},
|
| 755 |
},
|
| 756 |
-
"lfm2_extract_350m": {
|
| 757 |
-
"name": "LFM2-Extract 350M (Specialized)",
|
| 758 |
-
"repo_id": "LiquidAI/LFM2-350M-Extract-GGUF",
|
| 759 |
-
"filename": "*Q8_0.gguf",
|
| 760 |
-
"max_context": 32768,
|
| 761 |
-
"default_n_ctx": 4096,
|
| 762 |
-
"params_size": "350M",
|
| 763 |
-
"supports_reasoning": False,
|
| 764 |
-
"supports_toggle": False,
|
| 765 |
-
"inference_settings": {
|
| 766 |
-
"temperature": 0.0, # LFM2-Extract: use temp=0 (greedy decoding)
|
| 767 |
-
"top_p": 0.9,
|
| 768 |
-
"top_k": 30,
|
| 769 |
-
"repeat_penalty": 1.0,
|
| 770 |
-
},
|
| 771 |
-
},
|
| 772 |
"bitcpm4_500m": {
|
| 773 |
"name": "BitCPM4 0.5B (128K Context)",
|
| 774 |
"repo_id": "openbmb/BitCPM4-0.5B-GGUF",
|
|
@@ -865,25 +849,9 @@ EXTRACTION_MODELS = {
|
|
| 865 |
"repeat_penalty": 1.0,
|
| 866 |
},
|
| 867 |
},
|
| 868 |
-
"lfm2_extract_1.2b": {
|
| 869 |
-
"name": "LFM2-Extract 1.2B (Specialized) ⭐",
|
| 870 |
-
"repo_id": "LiquidAI/LFM2-1.2B-Extract-GGUF",
|
| 871 |
-
"filename": "*Q8_0.gguf",
|
| 872 |
-
"max_context": 32768,
|
| 873 |
-
"default_n_ctx": 4096,
|
| 874 |
-
"params_size": "1.2B",
|
| 875 |
-
"supports_reasoning": False,
|
| 876 |
-
"supports_toggle": False,
|
| 877 |
-
"inference_settings": {
|
| 878 |
-
"temperature": 0.0, # LFM2-Extract: use temp=0 (greedy decoding)
|
| 879 |
-
"top_p": 0.9,
|
| 880 |
-
"top_k": 30,
|
| 881 |
-
"repeat_penalty": 1.0,
|
| 882 |
-
},
|
| 883 |
-
},
|
| 884 |
}
|
| 885 |
|
| 886 |
-
DEFAULT_EXTRACTION_MODEL = "
|
| 887 |
|
| 888 |
|
| 889 |
# ===== ADVANCED MODE: SYNTHESIS MODELS REGISTRY (16 models, 1B-30B) =====
|
|
@@ -1509,8 +1477,8 @@ def summarize_advanced(
|
|
| 1509 |
# Create windows from preprocessed transcript
|
| 1510 |
lines = [l.strip() for l in transcript.split('\n') if l.strip()]
|
| 1511 |
|
| 1512 |
-
# Reserve tokens for system prompt (~200) and output (~
|
| 1513 |
-
max_window_tokens = extraction_n_ctx -
|
| 1514 |
|
| 1515 |
# Simple windowing: split into chunks based on token count
|
| 1516 |
windows = []
|
|
@@ -1608,13 +1576,27 @@ def summarize_advanced(
|
|
| 1608 |
|
| 1609 |
yield {"stage": "deduplication", "ticker": load_msg, "thinking": "", "summary": ""}
|
| 1610 |
|
| 1611 |
-
# Deduplicate
|
| 1612 |
-
deduplicated_items =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1613 |
all_items=all_items,
|
| 1614 |
embedding_model=embedding_model,
|
| 1615 |
similarity_threshold=similarity_threshold,
|
| 1616 |
tracer=tracer
|
| 1617 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1618 |
|
| 1619 |
# Unload embedding model
|
| 1620 |
embedding_model.unload()
|
|
@@ -1631,7 +1613,7 @@ def summarize_advanced(
|
|
| 1631 |
}
|
| 1632 |
|
| 1633 |
# ===== STAGE 3: SYNTHESIS =====
|
| 1634 |
-
yield {"stage": "synthesis", "ticker": "", "thinking": "", "summary": "
|
| 1635 |
|
| 1636 |
synthesis_llm, load_msg = load_model_for_role(
|
| 1637 |
model_key=synthesis_model_key,
|
|
@@ -1639,7 +1621,7 @@ def summarize_advanced(
|
|
| 1639 |
n_threads=n_threads
|
| 1640 |
)
|
| 1641 |
|
| 1642 |
-
yield {"stage": "synthesis", "ticker": "", "thinking": "", "summary":
|
| 1643 |
|
| 1644 |
# Synthesize
|
| 1645 |
synthesis_config = get_model_config(synthesis_model_key, "synthesis")
|
|
|
|
| 753 |
"repeat_penalty": 1.0,
|
| 754 |
},
|
| 755 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 756 |
"bitcpm4_500m": {
|
| 757 |
"name": "BitCPM4 0.5B (128K Context)",
|
| 758 |
"repo_id": "openbmb/BitCPM4-0.5B-GGUF",
|
|
|
|
| 849 |
"repeat_penalty": 1.0,
|
| 850 |
},
|
| 851 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 852 |
}
|
| 853 |
|
| 854 |
+
DEFAULT_EXTRACTION_MODEL = "qwen3_1.7b_q4"
|
| 855 |
|
| 856 |
|
| 857 |
# ===== ADVANCED MODE: SYNTHESIS MODELS REGISTRY (16 models, 1B-30B) =====
|
|
|
|
| 1477 |
# Create windows from preprocessed transcript
|
| 1478 |
lines = [l.strip() for l in transcript.split('\n') if l.strip()]
|
| 1479 |
|
| 1480 |
+
# Reserve tokens for system prompt (~200) and output (~2048)
|
| 1481 |
+
max_window_tokens = extraction_n_ctx - 2300 # Target ~1800 tokens per window
|
| 1482 |
|
| 1483 |
# Simple windowing: split into chunks based on token count
|
| 1484 |
windows = []
|
|
|
|
| 1576 |
|
| 1577 |
yield {"stage": "deduplication", "ticker": load_msg, "thinking": "", "summary": ""}
|
| 1578 |
|
| 1579 |
+
# Deduplicate - now a generator for progress updates
|
| 1580 |
+
deduplicated_items = {"action_items": [], "decisions": [], "key_points": [], "open_questions": []}
|
| 1581 |
+
categories_processed = 0
|
| 1582 |
+
total_categories = len([k for k, v in all_items.items() if v])
|
| 1583 |
+
|
| 1584 |
+
for intermediate_dedup in deduplicate_items(
|
| 1585 |
all_items=all_items,
|
| 1586 |
embedding_model=embedding_model,
|
| 1587 |
similarity_threshold=similarity_threshold,
|
| 1588 |
tracer=tracer
|
| 1589 |
+
):
|
| 1590 |
+
deduplicated_items = intermediate_dedup
|
| 1591 |
+
categories_processed += 1
|
| 1592 |
+
|
| 1593 |
+
current_total = sum(len(v) for v in deduplicated_items.values())
|
| 1594 |
+
yield {
|
| 1595 |
+
"stage": "deduplication",
|
| 1596 |
+
"ticker": f"Deduplicating: {categories_processed}/{total_categories} categories processed ({current_total} items so far)...",
|
| 1597 |
+
"thinking": "",
|
| 1598 |
+
"summary": ""
|
| 1599 |
+
}
|
| 1600 |
|
| 1601 |
# Unload embedding model
|
| 1602 |
embedding_model.unload()
|
|
|
|
| 1613 |
}
|
| 1614 |
|
| 1615 |
# ===== STAGE 3: SYNTHESIS =====
|
| 1616 |
+
yield {"stage": "synthesis", "ticker": "", "thinking": "Loading synthesis model...", "summary": ""}
|
| 1617 |
|
| 1618 |
synthesis_llm, load_msg = load_model_for_role(
|
| 1619 |
model_key=synthesis_model_key,
|
|
|
|
| 1621 |
n_threads=n_threads
|
| 1622 |
)
|
| 1623 |
|
| 1624 |
+
yield {"stage": "synthesis", "ticker": "", "thinking": f"✅ {load_msg}", "summary": ""}
|
| 1625 |
|
| 1626 |
# Synthesize
|
| 1627 |
synthesis_config = get_model_config(synthesis_model_key, "synthesis")
|
docs/advanced-mode-implementation-plan.md
CHANGED
|
@@ -49,7 +49,7 @@ Stage 3: SYNTHESIS → Generate executive summary from deduplicated items
|
|
| 49 |
| **New Code** | ~1,800 lines |
|
| 50 |
| **Modified Code** | ~60 lines |
|
| 51 |
| **Total Models** | 33 unique (13 + 4 + 16) |
|
| 52 |
-
| **Default Models** | `
|
| 53 |
| **Memory Strategy** | Sequential load/unload (safe for HF Spaces Free Tier) |
|
| 54 |
|
| 55 |
---
|
|
@@ -57,9 +57,9 @@ Stage 3: SYNTHESIS → Generate executive summary from deduplicated items
|
|
| 57 |
## Design Decisions
|
| 58 |
|
| 59 |
### Q1: Extraction Model List Composition (REVISION)
|
| 60 |
-
**Decision:** Option A -
|
| 61 |
|
| 62 |
-
**Rationale:**
|
| 63 |
|
| 64 |
### Q1a: Synthesis Model Selection (NEW)
|
| 65 |
**Decision:** Restrict to models ≤4GB (max 4B parameters)
|
|
@@ -78,11 +78,11 @@ Stage 3: SYNTHESIS → Generate executive summary from deduplicated items
|
|
| 78 |
|
| 79 |
### Q4: Default Models
|
| 80 |
**Decision:**
|
| 81 |
-
- Extraction: `
|
| 82 |
- Embedding: `granite-107m` (fastest, good enough)
|
| 83 |
- Synthesis: `qwen3_1.7b_q4` (larger than extraction, better quality)
|
| 84 |
|
| 85 |
-
**Rationale:** Balanced defaults optimized for quality and speed
|
| 86 |
|
| 87 |
### Q5: Model Key Naming
|
| 88 |
**Decision:** Keep same keys (no prefix like `adv_synth_`)
|
|
@@ -129,7 +129,7 @@ Stage 3: SYNTHESIS → Generate executive summary from deduplicated items
|
|
| 129 |
- ✅ 2 hybrid models with reasoning toggle
|
| 130 |
- ✅ All models verified on HuggingFace
|
| 131 |
|
| 132 |
-
**Complete Registry (
|
| 133 |
|
| 134 |
```python
|
| 135 |
EXTRACTION_MODELS = {
|
|
@@ -457,7 +457,7 @@ with gr.TabItem("🧠 Advanced Mode (3-Model Pipeline)"):
|
|
| 457 |
with gr.Row():
|
| 458 |
extraction_model = gr.Dropdown(
|
| 459 |
choices=list(EXTRACTION_MODELS.keys()),
|
| 460 |
-
value="
|
| 461 |
label="🔍 Stage 1: Extraction Model (≤1.7B)",
|
| 462 |
info="Extracts structured items (action_items, decisions, key_points, questions) from windows"
|
| 463 |
)
|
|
@@ -1193,14 +1193,14 @@ print('✅ All model registries validated!')
|
|
| 1193 |
3. Verify default models selected
|
| 1194 |
4. Adjust extraction_n_ctx slider (2K → 8K)
|
| 1195 |
5. Select qwen3_600m_q4 for extraction → reasoning checkbox appears
|
| 1196 |
-
6. Select
|
| 1197 |
7. Select qwen3_4b_thinking_q3 for synthesis → reasoning locked ON
|
| 1198 |
8. Verify model info panels update on selection
|
| 1199 |
|
| 1200 |
### Phase 3: Pipeline Test - min.txt (Quick)
|
| 1201 |
|
| 1202 |
**Configuration:**
|
| 1203 |
-
- Extraction: `
|
| 1204 |
- Extraction n_ctx: 4096 (default)
|
| 1205 |
- Embedding: `granite-107m` (default)
|
| 1206 |
- Synthesis: `qwen3_1.7b_q4` (default)
|
|
@@ -1235,7 +1235,7 @@ print('✅ All model registries validated!')
|
|
| 1235 |
### Phase 5: Pipeline Test - full.txt (Production)
|
| 1236 |
|
| 1237 |
**Configuration:**
|
| 1238 |
-
- Extraction: `
|
| 1239 |
- Extraction n_ctx: 4096 (default)
|
| 1240 |
- Embedding: `qwen-600m` (highest quality)
|
| 1241 |
- Synthesis: `qwen3_4b_thinking_q3` (4B thinking model)
|
|
@@ -1316,7 +1316,6 @@ print('✅ All model registries validated!')
|
|
| 1316 |
|
| 1317 |
| Risk | Probability | Impact | Mitigation |
|
| 1318 |
|-------|-------------|--------|------------|
|
| 1319 |
-
| **LFM2-Extract models don't exist on HuggingFace** | Medium | High | Verify repo availability before implementation; prepare fallback to qwen3_600m_q4 |
|
| 1320 |
| **Memory overflow on HF Spaces Free Tier** | Low | High | Sequential loading/unloading tested; add memory monitoring |
|
| 1321 |
| **Reasoning output breaks JSON parsing** | Medium | Medium | Robust thinking block parsing with fallback; strict error handling |
|
| 1322 |
| **User n_ctx slider causes OOM** | Low | Medium | Cap at MAX_USABLE_CTX (32K); show warning if user sets too high |
|
|
@@ -1327,7 +1326,7 @@ print('✅ All model registries validated!')
|
|
| 1327 |
|
| 1328 |
## Appendix: Model Comparison Tables
|
| 1329 |
|
| 1330 |
-
### Extraction Models (
|
| 1331 |
|
| 1332 |
| Model | Size | Context | Reasoning | Settings |
|
| 1333 |
|--------|------|---------|-----------|----------|
|
|
@@ -1335,14 +1334,12 @@ print('✅ All model registries validated!')
|
|
| 1335 |
| gemma3_270m | 270M | 32K | No | temp=0.3 |
|
| 1336 |
| ernie_300m | 300M | 131K | No | temp=0.2 |
|
| 1337 |
| granite_350m | 350M | 32K | No | temp=0.1 |
|
| 1338 |
-
| lfm2_350m | 350M | 32K | No | temp=0.2 |
|
| 1339 |
| bitcpm4_500m | 500M | 128K | No | temp=0.2 |
|
| 1340 |
| hunyuan_500m | 500M | 256K | No | temp=0.2 |
|
| 1341 |
| qwen3_600m_q4 | 600M | 32K | **Hybrid** | temp=0.3 |
|
| 1342 |
| granite_3_1_1b_q8 | 1B | 128K | No | temp=0.3 |
|
| 1343 |
| falcon_h1_1.5b_q4 | 1.5B | 32K | No | temp=0.2 |
|
| 1344 |
| qwen3_1.7b_q4 | 1.7B | 32K | **Hybrid** | temp=0.3 |
|
| 1345 |
-
| lfm2_extract_350m | 350M | 32K | No | temp=0.2 |
|
| 1346 |
| lfm2_extract_1.2b | 1.2B | 32K | No | temp=0.2 |
|
| 1347 |
|
| 1348 |
### Synthesis Models (16)
|
|
|
|
| 49 |
| **New Code** | ~1,800 lines |
|
| 50 |
| **Modified Code** | ~60 lines |
|
| 51 |
| **Total Models** | 33 unique (13 + 4 + 16) |
|
| 52 |
+
| **Default Models** | `qwen3_1.7b_q4`, `granite-107m`, `qwen3_1.7b_q4` |
|
| 53 |
| **Memory Strategy** | Sequential load/unload (safe for HF Spaces Free Tier) |
|
| 54 |
|
| 55 |
---
|
|
|
|
| 57 |
## Design Decisions
|
| 58 |
|
| 59 |
### Q1: Extraction Model List Composition (REVISION)
|
| 60 |
+
**Decision:** Option A - 11 models (≤1.7B), excluding LFM2-Extract models
|
| 61 |
|
| 62 |
+
**Rationale:** The list contains 11 models and excludes the specialized LFM2-Extract models, which were removed after testing showed an 85.7% failure rate due to hallucination and schema non-compliance. They were replaced with Qwen3 models, which support reasoning and handle Chinese content better.
|
| 63 |
|
| 64 |
### Q1a: Synthesis Model Selection (NEW)
|
| 65 |
**Decision:** Restrict to models ≤4GB (max 4B parameters)
|
|
|
|
| 78 |
|
| 79 |
### Q4: Default Models
|
| 80 |
**Decision:**
|
| 81 |
+
- Extraction: `qwen3_1.7b_q4` (supports reasoning, better Chinese understanding)
|
| 82 |
- Embedding: `granite-107m` (fastest, good enough)
|
| 83 |
- Synthesis: `qwen3_1.7b_q4` (now the same model as the extraction default; chosen for better quality)
|
| 84 |
|
| 85 |
+
**Rationale:** Balanced defaults optimized for quality and speed. Qwen3 1.7B chosen over LFM2-Extract based on empirical testing showing superior extraction success rate and schema compliance.
|
| 86 |
|
| 87 |
### Q5: Model Key Naming
|
| 88 |
**Decision:** Keep same keys (no prefix like `adv_synth_`)
|
|
|
|
| 129 |
- ✅ 2 hybrid models with reasoning toggle
|
| 130 |
- ✅ All models verified on HuggingFace
|
| 131 |
|
| 132 |
+
**Complete Registry (LFM2-Extract models removed after testing):**
|
| 133 |
|
| 134 |
```python
|
| 135 |
EXTRACTION_MODELS = {
|
|
|
|
| 457 |
with gr.Row():
|
| 458 |
extraction_model = gr.Dropdown(
|
| 459 |
choices=list(EXTRACTION_MODELS.keys()),
|
| 460 |
+
value="qwen3_1.7b_q4", # ⭐ DEFAULT
|
| 461 |
label="🔍 Stage 1: Extraction Model (≤1.7B)",
|
| 462 |
info="Extracts structured items (action_items, decisions, key_points, questions) from windows"
|
| 463 |
)
|
|
|
|
| 1193 |
3. Verify default models selected
|
| 1194 |
4. Adjust extraction_n_ctx slider (2K → 8K)
|
| 1195 |
5. Select qwen3_600m_q4 for extraction → reasoning checkbox appears
|
| 1196 |
+
6. Select qwen3_1.7b_q4 for extraction → reasoning checkbox visible (Qwen3 supports reasoning)
|
| 1197 |
7. Select qwen3_4b_thinking_q3 for synthesis → reasoning locked ON
|
| 1198 |
8. Verify model info panels update on selection
|
| 1199 |
|
| 1200 |
### Phase 3: Pipeline Test - min.txt (Quick)
|
| 1201 |
|
| 1202 |
**Configuration:**
|
| 1203 |
+
- Extraction: `qwen3_1.7b_q4` (default)
|
| 1204 |
- Extraction n_ctx: 4096 (default)
|
| 1205 |
- Embedding: `granite-107m` (default)
|
| 1206 |
- Synthesis: `qwen3_1.7b_q4` (default)
|
|
|
|
| 1235 |
### Phase 5: Pipeline Test - full.txt (Production)
|
| 1236 |
|
| 1237 |
**Configuration:**
|
| 1238 |
+
- Extraction: `qwen3_1.7b_q4` (high quality, reasoning enabled)
|
| 1239 |
- Extraction n_ctx: 4096 (default)
|
| 1240 |
- Embedding: `qwen-600m` (highest quality)
|
| 1241 |
- Synthesis: `qwen3_4b_thinking_q3` (4B thinking model)
|
|
|
|
| 1316 |
|
| 1317 |
| Risk | Probability | Impact | Mitigation |
|
| 1318 |
|-------|-------------|--------|------------|
|
|
|
|
| 1319 |
| **Memory overflow on HF Spaces Free Tier** | Low | High | Sequential loading/unloading tested; add memory monitoring |
|
| 1320 |
| **Reasoning output breaks JSON parsing** | Medium | Medium | Robust thinking block parsing with fallback; strict error handling |
|
| 1321 |
| **User n_ctx slider causes OOM** | Low | Medium | Cap at MAX_USABLE_CTX (32K); show warning if user sets too high |
|
|
|
|
| 1326 |
|
| 1327 |
## Appendix: Model Comparison Tables
|
| 1328 |
|
| 1329 |
+
### Extraction Models (11)
|
| 1330 |
|
| 1331 |
| Model | Size | Context | Reasoning | Settings |
|
| 1332 |
|--------|------|---------|-----------|----------|
|
|
|
|
| 1334 |
| gemma3_270m | 270M | 32K | No | temp=0.3 |
|
| 1335 |
| ernie_300m | 300M | 131K | No | temp=0.2 |
|
| 1336 |
| granite_350m | 350M | 32K | No | temp=0.1 |
|
|
|
|
| 1337 |
| bitcpm4_500m | 500M | 128K | No | temp=0.2 |
|
| 1338 |
| hunyuan_500m | 500M | 256K | No | temp=0.2 |
|
| 1339 |
| qwen3_600m_q4 | 600M | 32K | **Hybrid** | temp=0.3 |
|
| 1340 |
| granite_3_1_1b_q8 | 1B | 128K | No | temp=0.3 |
|
| 1341 |
| falcon_h1_1.5b_q4 | 1.5B | 32K | No | temp=0.2 |
|
| 1342 |
| qwen3_1.7b_q4 | 1.7B | 32K | **Hybrid** | temp=0.3 |
|
|
|
|
| 1343 |
| lfm2_extract_1.2b | 1.2B | 32K | No | temp=0.2 |
|
| 1344 |
|
| 1345 |
### Synthesis Models (16)
|
meeting_summarizer/extraction.py
CHANGED
|
@@ -688,7 +688,7 @@ def _sample_llm_response(text: str, max_chars: int = 400) -> str:
|
|
| 688 |
# ===== EXTRACTION PROMPT BUILDERS =====
|
| 689 |
|
| 690 |
def _build_schema_extraction_prompt(output_language: str) -> str:
|
| 691 |
-
"""Build concise schema-based extraction prompt (optimized for
|
| 692 |
if output_language == "zh-TW":
|
| 693 |
return """以 JSON 格式返回資料,使用以下架構:
|
| 694 |
|
|
@@ -817,13 +817,13 @@ def stream_extract_from_window(
|
|
| 817 |
|
| 818 |
# Build system prompt based on model type and reasoning mode
|
| 819 |
# For reasoning-enabled hybrid models, use the verbose prompt with reasoning
|
| 820 |
-
# instructions. For non-reasoning models
|
| 821 |
-
#
|
| 822 |
if enable_reasoning and supports_reasoning:
|
| 823 |
# Verbose prompt with reasoning instructions (hybrid models like Qwen3)
|
| 824 |
system_prompt = _build_reasoning_extraction_prompt(output_language)
|
| 825 |
else:
|
| 826 |
-
# Concise
|
| 827 |
system_prompt = _build_schema_extraction_prompt(output_language)
|
| 828 |
|
| 829 |
user_prompt = f"Transcript:\n\n{window.content}"
|
|
@@ -841,8 +841,14 @@ def stream_extract_from_window(
|
|
| 841 |
token_count = 0
|
| 842 |
|
| 843 |
try:
|
| 844 |
-
max_gen_tokens =
|
| 845 |
-
settings = model_config["inference_settings"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 846 |
stream = extraction_llm.create_chat_completion(
|
| 847 |
messages=messages,
|
| 848 |
max_tokens=max_gen_tokens,
|
|
@@ -925,6 +931,26 @@ def stream_extract_from_window(
|
|
| 925 |
|
| 926 |
final_items = _try_parse_extraction_json(json_text, log_repair=True)
|
| 927 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 928 |
if not final_items:
|
| 929 |
# Graceful degradation: log warning but don't crash the pipeline.
|
| 930 |
# Other windows may still succeed and produce useful data.
|
|
@@ -954,6 +980,16 @@ def stream_extract_from_window(
|
|
| 954 |
current_snippet=f"⚠️ Window {window_id} parse failed, continuing..."
|
| 955 |
)
|
| 956 |
yield (ticker, thinking_content, empty_items, True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 957 |
return
|
| 958 |
|
| 959 |
# Log success
|
|
@@ -1019,7 +1055,7 @@ def deduplicate_items(
|
|
| 1019 |
embedding_model: EmbeddingModel,
|
| 1020 |
similarity_threshold: float,
|
| 1021 |
tracer: Any
|
| 1022 |
-
) -> Dict[str, List[str]]:
|
| 1023 |
"""
|
| 1024 |
Deduplicate items across all categories using embeddings.
|
| 1025 |
|
|
@@ -1029,8 +1065,8 @@ def deduplicate_items(
|
|
| 1029 |
similarity_threshold: Cosine similarity threshold (0.0-1.0)
|
| 1030 |
tracer: Tracer instance
|
| 1031 |
|
| 1032 |
-
|
| 1033 |
-
|
| 1034 |
"""
|
| 1035 |
deduplicated = {}
|
| 1036 |
|
|
@@ -1096,8 +1132,11 @@ def deduplicate_items(
|
|
| 1096 |
)
|
| 1097 |
|
| 1098 |
logger.info(f"Dedup {category}: {original_count} → {len(unique_items)} ({duplicates_removed} removed)")
|
|
|
|
|
|
|
|
|
|
| 1099 |
|
| 1100 |
-
return
|
| 1101 |
|
| 1102 |
|
| 1103 |
def stream_synthesize_executive_summary(
|
|
@@ -1134,11 +1173,49 @@ def stream_synthesize_executive_summary(
|
|
| 1134 |
items_text += f"{i}. {item}\n"
|
| 1135 |
|
| 1136 |
if output_language == "zh-TW":
|
| 1137 |
-
system_prompt = "你是執行摘要專家。生成簡潔的執行摘要。
|
| 1138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1139 |
else:
|
| 1140 |
-
system_prompt = "You are an executive summary expert. Generate concise summaries.
|
| 1141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1142 |
|
| 1143 |
messages = [
|
| 1144 |
{"role": "system", "content": system_prompt},
|
|
|
|
| 688 |
# ===== EXTRACTION PROMPT BUILDERS =====
|
| 689 |
|
| 690 |
def _build_schema_extraction_prompt(output_language: str) -> str:
|
| 691 |
+
"""Build concise schema-based extraction prompt (optimized for non-reasoning models)."""
|
| 692 |
if output_language == "zh-TW":
|
| 693 |
return """以 JSON 格式返回資料,使用以下架構:
|
| 694 |
|
|
|
|
| 817 |
|
| 818 |
# Build system prompt based on model type and reasoning mode
|
| 819 |
# For reasoning-enabled hybrid models, use the verbose prompt with reasoning
|
| 820 |
+
# instructions. For non-reasoning models, use the concise schema-based prompt
|
| 821 |
+
# optimized for structured extraction.
|
| 822 |
if enable_reasoning and supports_reasoning:
|
| 823 |
# Verbose prompt with reasoning instructions (hybrid models like Qwen3)
|
| 824 |
system_prompt = _build_reasoning_extraction_prompt(output_language)
|
| 825 |
else:
|
| 826 |
+
# Concise schema-based prompt for non-reasoning models
|
| 827 |
system_prompt = _build_schema_extraction_prompt(output_language)
|
| 828 |
|
| 829 |
user_prompt = f"Transcript:\n\n{window.content}"
|
|
|
|
| 841 |
token_count = 0
|
| 842 |
|
| 843 |
try:
|
| 844 |
+
max_gen_tokens = max(2048, window.token_count // 2)
|
| 845 |
+
settings = model_config["inference_settings"].copy()
|
| 846 |
+
|
| 847 |
+
# Qwen3 models need lower temperature for extraction (not synthesis)
|
| 848 |
+
# to avoid empty JSON output on larger windows
|
| 849 |
+
if "qwen3" in model_config["repo_id"].lower():
|
| 850 |
+
settings["temperature"] = 0.1
|
| 851 |
+
|
| 852 |
stream = extraction_llm.create_chat_completion(
|
| 853 |
messages=messages,
|
| 854 |
max_tokens=max_gen_tokens,
|
|
|
|
| 931 |
|
| 932 |
final_items = _try_parse_extraction_json(json_text, log_repair=True)
|
| 933 |
|
| 934 |
+
# Detect and retry on empty JSON (Qwen3 returns {} for large windows)
|
| 935 |
+
if final_items and not any(final_items.values()):
|
| 936 |
+
logger.warning(f"Window {window_id}: Model returned empty JSON, retrying with stricter prompt...")
|
| 937 |
+
|
| 938 |
+
# Re-parse with strict instruction prepended
|
| 939 |
+
strict_instruction = "\n\nIMPORTANT: Extract at least one item per category. Empty JSON is not acceptable."
|
| 940 |
+
modified_response = full_response + strict_instruction
|
| 941 |
+
|
| 942 |
+
# Try to extract JSON from modified response
|
| 943 |
+
if enable_reasoning and supports_reasoning:
|
| 944 |
+
thinking_match = re.search(r'<think(?:ing)?>(.*?)</think(?:ing)?>', modified_response, re.DOTALL)
|
| 945 |
+
if thinking_match:
|
| 946 |
+
json_text = modified_response[:thinking_match.start()] + modified_response[thinking_match.end():]
|
| 947 |
+
else:
|
| 948 |
+
json_text = modified_response
|
| 949 |
+
else:
|
| 950 |
+
json_text = modified_response
|
| 951 |
+
|
| 952 |
+
final_items = _try_parse_extraction_json(json_text, log_repair=True)
|
| 953 |
+
|
| 954 |
if not final_items:
|
| 955 |
# Graceful degradation: log warning but don't crash the pipeline.
|
| 956 |
# Other windows may still succeed and produce useful data.
|
|
|
|
| 980 |
current_snippet=f"⚠️ Window {window_id} parse failed, continuing..."
|
| 981 |
)
|
| 982 |
yield (ticker, thinking_content, empty_items, True)
|
| 983 |
+
|
| 984 |
+
# Log failed extraction to tracer for debugging
|
| 985 |
+
tracer.log_extraction_detail(
|
| 986 |
+
window_id=window_id,
|
| 987 |
+
extracted_items=empty_items,
|
| 988 |
+
full_llm_response=full_response,
|
| 989 |
+
full_thinking=thinking_content,
|
| 990 |
+
json_repaired=False,
|
| 991 |
+
parse_attempts=1
|
| 992 |
+
)
|
| 993 |
return
|
| 994 |
|
| 995 |
# Log success
|
|
|
|
| 1055 |
embedding_model: EmbeddingModel,
|
| 1056 |
similarity_threshold: float,
|
| 1057 |
tracer: Any
|
| 1058 |
+
) -> Generator[Dict[str, List[str]], None, None]:
|
| 1059 |
"""
|
| 1060 |
Deduplicate items across all categories using embeddings.
|
| 1061 |
|
|
|
|
| 1065 |
similarity_threshold: Cosine similarity threshold (0.0-1.0)
|
| 1066 |
tracer: Tracer instance
|
| 1067 |
|
| 1068 |
+
Yields:
|
| 1069 |
+
Intermediate deduplication results for progress tracking
|
| 1070 |
"""
|
| 1071 |
deduplicated = {}
|
| 1072 |
|
|
|
|
| 1132 |
)
|
| 1133 |
|
| 1134 |
logger.info(f"Dedup {category}: {original_count} → {len(unique_items)} ({duplicates_removed} removed)")
|
| 1135 |
+
|
| 1136 |
+
# Yield intermediate results for progress tracking
|
| 1137 |
+
yield deduplicated
|
| 1138 |
|
| 1139 |
+
return
|
| 1140 |
|
| 1141 |
|
| 1142 |
def stream_synthesize_executive_summary(
|
|
|
|
| 1173 |
items_text += f"{i}. {item}\n"
|
| 1174 |
|
| 1175 |
if output_language == "zh-TW":
|
| 1176 |
+
system_prompt = """你是執行摘要專家。請根據提供的結構化資訊,生成簡潔、專業的執行摘要。
|
| 1177 |
+
|
| 1178 |
+
輸出格式要求:
|
| 1179 |
+
- 使用 **粗體標題** 標記各部分
|
| 1180 |
+
- 使用項目符號(bullet points)列出重點
|
| 1181 |
+
- 每個重點簡短有力,不超過一句話
|
| 1182 |
+
- 不要說明、不要解釋、不要自我分析
|
| 1183 |
+
- 直接輸出執行摘要,不要說"首先..."、"我注意到..."等開頭語
|
| 1184 |
+
|
| 1185 |
+
示例格式:
|
| 1186 |
+
**核心重點**
|
| 1187 |
+
- DDR4 今年是供應鏈優先項目
|
| 1188 |
+
- 預計 2027 年 Q1 開始產出
|
| 1189 |
+
|
| 1190 |
+
**主要行動**
|
| 1191 |
+
- 協調三星、海力士等供應商
|
| 1192 |
+
- 優先保障嵌入式市場供給
|
| 1193 |
+
|
| 1194 |
+
**未解決問題**
|
| 1195 |
+
- DDR4 產能不足仍待解決"""
|
| 1196 |
+
user_prompt = f"基於以下結構化資訊生成執行摘要:\n{items_text}\n\n請按照上述格式要求,直接輸出執行摘要。"
|
| 1197 |
else:
|
| 1198 |
+
system_prompt = """You are an executive summary expert. Generate concise, professional executive summaries based on structured information.
|
| 1199 |
+
|
| 1200 |
+
Output format requirements:
|
| 1201 |
+
- Use **BOLD HEADERS** for each section
|
| 1202 |
+
- Use bullet points for key points
|
| 1203 |
+
- Keep each point brief and powerful (one sentence max)
|
| 1204 |
+
- NO explanations, NO analysis, NO self-reflection
|
| 1205 |
+
- Start directly with the summary, do NOT use phrases like "First I need to...", "Let me analyze...", "I noticed..."
|
| 1206 |
+
|
| 1207 |
+
Example format:
|
| 1208 |
+
**Key Priorities**
|
| 1209 |
+
- DDR4 is supply chain priority this year
|
| 1210 |
+
- Production expected to start in 2027 Q1
|
| 1211 |
+
|
| 1212 |
+
**Main Actions**
|
| 1213 |
+
- Coordinate with Samsung, Hynix, and other suppliers
|
| 1214 |
+
- Prioritize embedded market supply
|
| 1215 |
+
|
| 1216 |
+
**Open Issues**
|
| 1217 |
+
- DDR4 capacity shortage still needs resolution"""
|
| 1218 |
+
user_prompt = f"Generate an executive summary based on these structured items:\n{items_text}\n\nPlease follow the format requirements above and output the executive summary directly."
|
| 1219 |
|
| 1220 |
messages = [
|
| 1221 |
{"role": "system", "content": system_prompt},
|