Spaces:

Luigi
/

tiny-scribe

Running

Luigi committed on Feb 6

Commit

126dfa5

1 Parent(s): f03d62b

Add extraction benchmark and results for small models

- Created benchmark_extraction.py to test 8 small models individually
- Tests show 75% success rate (6/8 models) with fixes applied
- Granite-4.0 350M: 8 items in 14.4s
- Qwen3 0.6B: 9 items in 13.0s with reasoning
- BitCPM4 0.5B: fastest at 4.6s
- Falcon-H1 100M and Gemma-3 270M failed (model/repo issues)

Fixes include:
- Loosened schema validation (accept partial JSON)
- Added concrete extraction examples to prompts
- Improved reasoning prompts with bullet-point format

Files changed (4) hide show

benchmark_extraction.py +215 -0
extraction_benchmark_results.json +186 -0
summary_20260206_013146.json +0 -0
summary_20260206_014542.json +0 -0

benchmark_extraction.py ADDED Viewed

	@@ -0,0 +1,215 @@

+#!/usr/bin/env python3
+"""
+Benchmark script for testing extraction models individually.
+Tests each model on a single small window to verify extraction works.
+"""
+import json
+import time
+from typing import Dict, List, Tuple, Optional
+import sys
+sys.path.insert(0, '/home/luigi/tiny-scribe')
+from meeting_summarizer.extraction import (
+    _build_schema_extraction_prompt,
+    _build_reasoning_extraction_prompt,
+    _try_parse_extraction_json,
+)
+from llama_cpp import Llama
+# Test window - small excerpt from transcripts/full.txt
+# A short speaker-labelled meeting excerpt (Traditional Chinese). test_model()
+# sends it verbatim as the user message, paired with a 'zh-TW' system prompt,
+# and main() reports its length in characters.
+TEST_WINDOW = """SPEAKER_02: 三星在去年Q3的時候已經告訴，今年，它所有的產出50會在AI跟Service上面。25在Mobile20在PCM那模組廠就是PCMOthers這一塊。所以26年的供給已經會比25年的供給在PCMOthers這塊少了15那再加上現在的狀況。所以我們覺得看起來應該缺到了8年，再加上現在昨天我不知道昨天你們看到SanDisk有一個這不是只有DDRName也是這樣Name你知道。
+SPEAKER_03: 我想請教一下，以現在來講第四三一，對於就是說三星他們減產，或是甚至於後面可能會停產的。這樣的狀況跟凱力士也差不多的情況。
+SPEAKER_02: 對於這塊，你們怎麼應？該是這樣說他們就算減產或停產，vivo是不會停的，顆粒會停，它的成品會停，但vivo是不會停的。"""
+# Small models to test (< 2B parameters)
+# Each entry is consumed by test_model():
+#   name               - label used in console output and in the results file
+#   repo_id, filename  - passed to Llama.from_pretrained(); filename is a glob
+#                        pattern selecting one quantized GGUF file in the repo
+#   temperature        - sampling temperature passed to create_chat_completion()
+#   supports_reasoning - True selects the reasoning extraction prompt,
+#                        False selects the schema extraction prompt
+TEST_MODELS = [
+    {
+        "name": "Falcon-H1 100M",
+        # NOTE(review): the recorded benchmark results report
+        # "repository not found" for this repo_id - verify the repo name.
+        "repo_id": "tiiuae/Falcon-H1-100M-Base-GGUF",
+        "filename": "*Q8_0.gguf",
+        "temperature": 0.1,
+        "supports_reasoning": False,
+    },
+    {
+        "name": "Gemma-3 270M",
+        # NOTE(review): the recorded benchmark results report
+        # "repository not found" for this repo_id - verify the repo name.
+        "repo_id": "google/gemma-3-270m-it-GGUF",
+        "filename": "*Q4_K_M.gguf",
+        "temperature": 0.1,
+        "supports_reasoning": False,
+    },
+    {
+        "name": "Granite-4.0 350M",
+        "repo_id": "unsloth/granite-4.0-h-350m-GGUF",
+        "filename": "*Q8_0.gguf",
+        "temperature": 0.1,
+        "supports_reasoning": False,
+    },
+    {
+        "name": "BitCPM4 0.5B",
+        "repo_id": "openbmb/BitCPM4-0.5B-GGUF",
+        "filename": "*q4_0.gguf",
+        "temperature": 0.1,
+        "supports_reasoning": False,
+    },
+    {
+        "name": "Qwen3 0.6B",
+        "repo_id": "unsloth/Qwen3-0.6B-GGUF",
+        "filename": "*Q4_0.gguf",
+        "temperature": 0.1,
+        "supports_reasoning": True,
+    },
+    {
+        "name": "Granite 3.1 1B",
+        "repo_id": "bartowski/granite-3.1-1b-a400m-instruct-GGUF",
+        "filename": "*Q8_0.gguf",
+        "temperature": 0.1,
+        "supports_reasoning": False,
+    },
+    {
+        "name": "Falcon-H1 1.5B",
+        "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
+        "filename": "*Q4_K_M.gguf",
+        "temperature": 0.1,
+        "supports_reasoning": False,
+    },
+    {
+        "name": "Qwen3 1.7B",
+        "repo_id": "unsloth/Qwen3-1.7B-GGUF",
+        "filename": "*Q4_0.gguf",
+        "temperature": 0.1,
+        "supports_reasoning": True,
+    },
+]
+def test_model(model_config: Dict) -> Dict:
+    """Test a single model on the test window."""
+    print(f"\n{'='*60}")
+    print(f"Testing: {model_config['name']}")
+    print(f"{'='*60}")
+    result = {
+        "model": model_config['name'],
+        "repo_id": model_config['repo_id'],
+        "success": False,
+        "items_extracted": 0,
+        "response": "",
+        "error": "",
+        "time_seconds": 0,
+    }
+    try:
+        # Load model
+        print(f"Loading {model_config['name']}...")
+        start_time = time.time()
+        llm = Llama.from_pretrained(
+            repo_id=model_config['repo_id'],
+            filename=model_config['filename'],
+            n_ctx=4096,
+            verbose=False,
+        )
+        # Build prompt
+        supports_reasoning = model_config.get('supports_reasoning', False)
+        if supports_reasoning:
+            system_prompt = _build_reasoning_extraction_prompt('zh-TW')
+        else:
+            system_prompt = _build_schema_extraction_prompt('zh-TW')
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": f"Transcript:\n\n{TEST_WINDOW}"}
+        ]
+        # Run extraction
+        print("Running extraction...")
+        response = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=1024,
+            temperature=model_config['temperature'],
+            top_p=0.9,
+            top_k=30,
+        )
+        result['time_seconds'] = time.time() - start_time
+        # Get response text
+        full_response = response["choices"][0]["message"]["content"]
+        result['response'] = full_response[:500] + "..." if len(full_response) > 500 else full_response
+        print(f"\nRaw response (first 300 chars):")
+        print(full_response[:300])
+        # Parse JSON
+        parsed = _try_parse_extraction_json(full_response, log_repair=True)
+        if parsed:
+            total_items = sum(len(v) for v in parsed.values())
+            result['success'] = True
+            result['items_extracted'] = total_items
+            result['parsed_data'] = parsed
+            print(f"\n✅ SUCCESS - Extracted {total_items} items:")
+            for key, items in parsed.items():
+                print(f"  {key}: {len(items)} items")
+                for item in items[:2]:  # Show first 2 items
+                    print(f"    - {item[:80]}...")
+        else:
+            result['error'] = "Failed to parse JSON"
+            print(f"\n❌ FAILED - Could not parse JSON")
+    except Exception as e:
+        result['error'] = str(e)
+        result['time_seconds'] = time.time() - start_time if 'start_time' in locals() else 0
+        print(f"\n❌ ERROR: {e}")
+    return result
+def main():
+    """Run benchmark on all test models."""
+    print("=" * 60)
+    print("EXTRACTION MODEL BENCHMARK")
+    print("=" * 60)
+    print(f"\nTest window size: {len(TEST_WINDOW)} characters")
+    print(f"Models to test: {len(TEST_MODELS)}")
+    results = []
+    for model_config in TEST_MODELS:
+        result = test_model(model_config)
+        results.append(result)
+        # Small delay between models
+        time.sleep(2)
+    # Summary
+    print("\n" + "=" * 60)
+    print("BENCHMARK SUMMARY")
+    print("=" * 60)
+    successful = [r for r in results if r['success']]
+    failed = [r for r in results if not r['success']]
+    print(f"\nSuccessful: {len(successful)}/{len(results)}")
+    print(f"Failed: {len(failed)}/{len(results)}")
+    print("\nSuccessful Models:")
+    for r in successful:
+        print(f"  ✅ {r['model']}: {r['items_extracted']} items ({r['time_seconds']:.1f}s)")
+    print("\nFailed Models:")
+    for r in failed:
+        print(f"  ❌ {r['model']}: {r['error']}")
+    # Save results
+    with open('extraction_benchmark_results.json', 'w', encoding='utf-8') as f:
+        json.dump(results, f, ensure_ascii=False, indent=2)
+    print("\nResults saved to: extraction_benchmark_results.json")
+if __name__ == "__main__":
+    main()

extraction_benchmark_results.json ADDED Viewed

	@@ -0,0 +1,186 @@

+[
+  {
+    "model": "Falcon-H1 100M",
+    "repo_id": "tiiuae/Falcon-H1-100M-Base-GGUF",
+    "success": false,
+    "items_extracted": 0,
+    "response": "",
+    "error": "tiiuae/Falcon-H1-100M-Base-GGUF (repository not found)",
+    "time_seconds": 1.1495630741119385
+  },
+  {
+    "model": "Gemma-3 270M",
+    "repo_id": "google/gemma-3-270m-it-GGUF",
+    "success": false,
+    "items_extracted": 0,
+    "response": "",
+    "error": "google/gemma-3-270m-it-GGUF (repository not found)",
+    "time_seconds": 0.8675286769866943
+  },
+  {
+    "model": "Granite-4.0 350M",
+    "repo_id": "unsloth/granite-4.0-h-350m-GGUF",
+    "success": true,
+    "items_extracted": 8,
+    "response": "{\n  \"action_items\": [\n    \"對三星進行供應調整\",\n    \"確認供應數量\"\n  ],\n  \"decisions\": [\n    \"優先供應大客戶\",\n    \"暫停接受新訂單\"\n  ],\n  \"key_points\": [\n    \"AI和Service上的供應需求\",\n    \"PCMOthers模組廠供應數量\"\n  ],\n  \"open_questions\": [\n    \"三星是否有其他供應策略\",\n    \"是否有其他供應方的應對策略\"\n  ]\n}",
+    "error": "",
+    "time_seconds": 14.357566356658936,
+    "parsed_data": {
+      "decisions": [
+        "優先供應大客戶",
+        "暫停接受新訂單"
+      ],
+      "action_items": [
+        "對三星進行供應調整",
+        "確認供應數量"
+      ],
+      "key_points": [
+        "AI和Service上的供應需求",
+        "PCMOthers模組廠供應數量"
+      ],
+      "open_questions": [
+        "三星是否有其他供應策略",
+        "是否有其他供應方的應對策略"
+      ]
+    }
+  },
+  {
+    "model": "BitCPM4 0.5B",
+    "repo_id": "openbmb/BitCPM4-0.5B-GGUF",
+    "success": true,
+    "items_extracted": 9,
+    "response": "{\n  \"action_items\": [\"今年的AI和Service供应\", \"今年的Mobile20供应\", \"今年的PCM供应\"],\n  \"decisions\": [\"是否继续供应AI和Service\", \"是否停止供应AI和Service\", \"是否继续供应Mobile20和PCM\"],\n  \"key_points\": [\"明年是否继续供应AI和Service\", \"明年是否继续供应Mobile20和PCM\"],\n  \"open_questions\": [\"是否继续供应AI和Service\"]\n}",
+    "error": "",
+    "time_seconds": 4.5774476528167725,
+    "parsed_data": {
+      "decisions": [
+        "是否继续供应AI和Service",
+        "是否停止供应AI和Service",
+        "是否继续供应Mobile20和PCM"
+      ],
+      "action_items": [
+        "今年的AI和Service供应",
+        "今年的Mobile20供应",
+        "今年的PCM供应"
+      ],
+      "key_points": [
+        "明年是否继续供应AI和Service",
+        "明年是否继续供应Mobile20和PCM"
+      ],
+      "open_questions": [
+        "是否继续供应AI和Service"
+      ]
+    }
+  },
+  {
+    "model": "Qwen3 0.6B",
+    "repo_id": "unsloth/Qwen3-0.6B-GGUF",
+    "success": true,
+    "items_extracted": 9,
+    "response": "<think>\n好的，我需要处理用户提供的会议 transcript 并按照要求提取关键資訊。首先，我要仔细阅读对话内容，理解讨论的主题和涉及的公司或产品。\n\n首先，用户提到三星在去年Q3时计划将产出50%用于AI和服务，而25年在Mobile20的PCM模块厂是PCMOthers。26年的供应比25年在PCMOthers少了15%。现在的情况是，三星可能缺到了8年，加上昨天看到的SanDisk的情况，说明可能需要考虑供应链问题。接下来，SPEAKER_03提到凯力士的情况类似，而SPEAKER_02回应说vivo不会停产，但PCMOthers会停，但vivo不会停。这里可能涉及到供应链调整和产品线调整。\n\n接下来，我需要按照用户的要求提取关键資訊。首先，action_items是具体的行动项目，比如讨论PCMOthers的供应情况和调整策略。decisions包括优先考虑供应大客户和暂停新订单。key_points包括DDR4的缺货和AI需求占比，以及两家公司的策略调整。open_questions可能包括价格预测和测试场良率确认。\n\n需要确保每个类别有1-5个项目，并且是具体的完整...",
+    "error": "",
+    "time_seconds": 13.032857418060303,
+    "parsed_data": {
+      "decisions": [
+        "優先供應大客戶浪潮",
+        "暫停接受新訂單"
+      ],
+      "action_items": [
+        "與三星討論PCMOthers的供應調整",
+        "確認LPDDR4供應數量"
+      ],
+      "key_points": [
+        "DDR4缺貨持續到2028年",
+        "AI需求占全球產能45%",
+        "美光可能跟進SanDisk付款條件"
+      ],
+      "open_questions": [
+        "Q2價格漲幅預估",
+        "深圳測試場良率確認"
+      ]
+    }
+  },
+  {
+    "model": "Granite 3.1 1B",
+    "repo_id": "bartowski/granite-3.1-1b-a400m-instruct-GGUF",
+    "success": true,
+    "items_extracted": 12,
+    "response": "{\n  \"action_items\": [\"三星減產PCMOthers塊\", \"vivo不會停產\", \"SanDisk可能停產\"],\n  \"decisions\": [\"三星減產PCMOthers塊\", \"vivo不會停產\", \"SanDisk可能停產\"],\n  \"key_points\": [\"PCMOthers塊供應量將減少8年\", \"vivo不會停產\", \"SanDisk可能停產\"],\n  \"open_questions\": [\"是否會停產PCMOthers塊\", \"是否會停產vivo\", \"SanDisk是否會停產\"]\n}",
+    "error": "",
+    "time_seconds": 11.540041446685791,
+    "parsed_data": {
+      "decisions": [
+        "三星減產PCMOthers塊",
+        "vivo不會停產",
+        "SanDisk可能停產"
+      ],
+      "action_items": [
+        "三星減產PCMOthers塊",
+        "vivo不會停產",
+        "SanDisk可能停產"
+      ],
+      "key_points": [
+        "PCMOthers塊供應量將減少8年",
+        "vivo不會停產",
+        "SanDisk可能停產"
+      ],
+      "open_questions": [
+        "是否會停產PCMOthers塊",
+        "是否會停產vivo",
+        "SanDisk是否會停產"
+      ]
+    }
+  },
+  {
+    "model": "Falcon-H1 1.5B",
+    "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
+    "success": true,
+    "items_extracted": 4,
+    "response": "```json\n{\n  \"action_items\": [],\n  \"decisions\": [],\n  \"key_points\": [\"三星產出50%將投入AI和服務領域\", \"PCMOthers分區25%將投入Mobile，20%投入PCM模組廠\"],\n  \"open_questions\": [\"對於三星減產或停產的影響\", \"DDRName產品與SanDisk的競爭情況\"]\n}\n```",
+    "error": "",
+    "time_seconds": 24.45098614692688,
+    "parsed_data": {
+      "decisions": [],
+      "action_items": [],
+      "key_points": [
+        "三星產出50%將投入AI和服務領域",
+        "PCMOthers分區25%將投入Mobile，20%投入PCM模組廠"
+      ],
+      "open_questions": [
+        "對於三星減產或停產的影響",
+        "DDRName產品與SanDisk的競爭情況"
+      ]
+    }
+  },
+  {
+    "model": "Qwen3 1.7B",
+    "repo_id": "unsloth/Qwen3-1.7B-GGUF",
+    "success": true,
+    "items_extracted": 12,
+    "response": "<think>\n好的，我需要分析用户提供的会议逐字稿，并按照要求提取关键信息。首先，我要理解对话内容。会议涉及三星的产能调整，特别是PCMOthers板块的减产，以及这对供应和需求的影响。\n\n接下来，识别决策点。会议中提到三星在2026年减少PCMOthers的产能，这可能影响供应。另外，决定暂停接受新订单，以应对供应短缺。还有关于价格调整的讨论，但可能没有明确决策。\n\n行动项目方面，需要跟进三星的产能调整，确认LPDDR4的供应数量，以及与vivo的沟通。此外，可能需要确认价格调整的细节。\n\n关键点包括市场趋势（AI和Mobile需求增长），供应状况（短缺持续到2028年），策略调整（暂停新订单）。\n\n未解决的问题可能涉及价格涨幅预估和深圳测试场的良率确认。\n</think>\n\n{\n  \"action_items\": [\"與三星確認2026年PCMOthers產能調整細節\", \"確認LPDDR4供應數量及到貨時間\", \"與vivo協調成品供應穩定策略\"],\n  \"decisions\": [\"暫停接受新訂單以應對供應缺口\", \"三星2026年PCMOthers產能減產15%\"],\n  ...",
+    "error": "",
+    "time_seconds": 19.010337352752686,
+    "parsed_data": {
+      "decisions": [
+        "暫停接受新訂單以應對供應缺口",
+        "三星2026年PCMOthers產能減產15%"
+      ],
+      "action_items": [
+        "與三星確認2026年PCMOthers產能調整細節",
+        "確認LPDDR4供應數量及到貨時間",
+        "與vivo協調成品供應穩定策略"
+      ],
+      "key_points": [
+        "DDR4缺貨持續至2028年",
+        "AI需求占全球產能45%",
+        "美光可能跟進SanDisk付款條件",
+        "價格調整需與vivo協商"
+      ],
+      "open_questions": [
+        "Q2價格漲幅預估",
+        "深圳測試場良率確認",
+        "vivo供應鏈風險評估"
+      ]
+    }
+  }
+]

summary_20260206_013146.json ADDED Viewed

The diff for this file is too large to render. See raw diff

summary_20260206_014542.json ADDED Viewed

The diff for this file is too large to render. See raw diff