#!/usr/bin/env python3
"""
Benchmark script for testing extraction models individually.

Tests each model on a single small window to verify extraction works.
Results are printed to stdout and saved to extraction_benchmark_results.json.
"""

import json
import sys
import time
from typing import Dict

# NOTE(review): hard-coded absolute path ties this script to one machine —
# consider deriving the project root from __file__ instead.
sys.path.insert(0, '/home/luigi/tiny-scribe')

from meeting_summarizer.extraction import (
    _build_schema_extraction_prompt,
    _build_reasoning_extraction_prompt,
    _try_parse_extraction_json,
)

from llama_cpp import Llama

# Test window - small excerpt from transcripts/full.txt
TEST_WINDOW = """SPEAKER_02: 三星在去年Q3的時候已經告訴,今年,它所有的產出50會在AI跟Service上面。25在Mobile20在PCM那模組廠就是PCMOthers這一塊。所以26年的供給已經會比25年的供給在PCMOthers這塊少了15那再加上現在的狀況。所以我們覺得看起來應該缺到了8年,再加上現在昨天我不知道昨天你們看到SanDisk有一個這不是只有DDRName也是這樣Name你知道。
SPEAKER_03: 我想請教一下,以現在來講第四三一,對於就是說三星他們減產,或是甚至於後面可能會停產的。這樣的狀況跟凱力士也差不多的情況。
SPEAKER_02: 對於這塊,你們怎麼應?該是這樣說他們就算減產或停產,vivo是不會停的,顆粒會停,它的成品會停,但vivo是不會停的。"""

# Small models to test (< 2B parameters).
# Each entry: HF repo, GGUF filename glob, sampling temperature, and whether
# the model should get the reasoning-style extraction prompt.
TEST_MODELS = [
    {
        "name": "Falcon-H1 100M",
        "repo_id": "tiiuae/Falcon-H1-100M-Base-GGUF",
        "filename": "*Q8_0.gguf",
        "temperature": 0.1,
        "supports_reasoning": False,
    },
    {
        "name": "Gemma-3 270M",
        "repo_id": "google/gemma-3-270m-it-GGUF",
        "filename": "*Q4_K_M.gguf",
        "temperature": 0.1,
        "supports_reasoning": False,
    },
    {
        "name": "Granite-4.0 350M",
        "repo_id": "unsloth/granite-4.0-h-350m-GGUF",
        "filename": "*Q8_0.gguf",
        "temperature": 0.1,
        "supports_reasoning": False,
    },
    {
        "name": "BitCPM4 0.5B",
        "repo_id": "openbmb/BitCPM4-0.5B-GGUF",
        "filename": "*q4_0.gguf",
        "temperature": 0.1,
        "supports_reasoning": False,
    },
    {
        "name": "Qwen3 0.6B",
        "repo_id": "unsloth/Qwen3-0.6B-GGUF",
        "filename": "*Q4_0.gguf",
        "temperature": 0.1,
        "supports_reasoning": True,
    },
    {
        "name": "Granite 3.1 1B",
        "repo_id": "bartowski/granite-3.1-1b-a400m-instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "temperature": 0.1,
        "supports_reasoning": False,
    },
    {
        "name": "Falcon-H1 1.5B",
        "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "temperature": 0.1,
        "supports_reasoning": False,
    },
    {
        "name": "Qwen3 1.7B",
        "repo_id": "unsloth/Qwen3-1.7B-GGUF",
        "filename": "*Q4_0.gguf",
        "temperature": 0.1,
        "supports_reasoning": True,
    },
]


def test_model(model_config: Dict) -> Dict:
    """Test a single model on the test window.

    Loads the model from Hugging Face, runs one chat-completion extraction
    over TEST_WINDOW, and attempts to parse the response as JSON.

    Args:
        model_config: One entry from TEST_MODELS.

    Returns:
        Result dict with keys: model, repo_id, success, items_extracted,
        response (truncated to 500 chars), error, time_seconds, and — on
        success — parsed_data.
    """
    print(f"\n{'='*60}")
    print(f"Testing: {model_config['name']}")
    print(f"{'='*60}")

    result = {
        "model": model_config['name'],
        "repo_id": model_config['repo_id'],
        "success": False,
        "items_extracted": 0,
        "response": "",
        "error": "",
        "time_seconds": 0,
    }

    llm = None
    start_time = None
    try:
        # Load model
        print(f"Loading {model_config['name']}...")
        start_time = time.time()
        llm = Llama.from_pretrained(
            repo_id=model_config['repo_id'],
            filename=model_config['filename'],
            n_ctx=4096,
            verbose=False,
        )

        # Build prompt — reasoning-capable models get a different system prompt.
        supports_reasoning = model_config.get('supports_reasoning', False)
        if supports_reasoning:
            system_prompt = _build_reasoning_extraction_prompt('zh-TW')
        else:
            system_prompt = _build_schema_extraction_prompt('zh-TW')

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Transcript:\n\n{TEST_WINDOW}"}
        ]

        # Run extraction
        print("Running extraction...")
        response = llm.create_chat_completion(
            messages=messages,
            max_tokens=1024,
            temperature=model_config['temperature'],
            top_p=0.9,
            top_k=30,
        )
        result['time_seconds'] = time.time() - start_time

        # Get response text (truncate the stored copy to keep results small)
        full_response = response["choices"][0]["message"]["content"]
        result['response'] = (
            full_response[:500] + "..." if len(full_response) > 500 else full_response
        )

        print("\nRaw response (first 300 chars):")
        print(full_response[:300])

        # Parse JSON
        # NOTE(review): an empty-but-valid extraction ({}) is falsy and would be
        # reported as a parse failure here — confirm _try_parse_extraction_json
        # returns None (not {}) on genuine failure.
        parsed = _try_parse_extraction_json(full_response, log_repair=True)
        if parsed:
            total_items = sum(len(v) for v in parsed.values())
            result['success'] = True
            result['items_extracted'] = total_items
            result['parsed_data'] = parsed
            print(f"\n✅ SUCCESS - Extracted {total_items} items:")
            for key, items in parsed.items():
                print(f"  {key}: {len(items)} items")
                for item in items[:2]:  # Show first 2 items
                    # str() guards against non-string items (e.g. nested dicts),
                    # which are not sliceable.
                    print(f"    - {str(item)[:80]}...")
        else:
            result['error'] = "Failed to parse JSON"
            print("\n❌ FAILED - Could not parse JSON")

    except Exception as e:
        result['error'] = str(e)
        result['time_seconds'] = time.time() - start_time if start_time is not None else 0
        print(f"\n❌ ERROR: {e}")
    finally:
        # Drop the reference so the model's memory can be reclaimed before the
        # next (possibly larger) model is loaded.
        del llm

    return result


def main():
    """Run benchmark on all test models and save results to JSON."""
    print("=" * 60)
    print("EXTRACTION MODEL BENCHMARK")
    print("=" * 60)
    print(f"\nTest window size: {len(TEST_WINDOW)} characters")
    print(f"Models to test: {len(TEST_MODELS)}")

    results = []
    for model_config in TEST_MODELS:
        result = test_model(model_config)
        results.append(result)
        # Small delay between models
        time.sleep(2)

    # Summary
    print("\n" + "=" * 60)
    print("BENCHMARK SUMMARY")
    print("=" * 60)

    successful = [r for r in results if r['success']]
    failed = [r for r in results if not r['success']]

    print(f"\nSuccessful: {len(successful)}/{len(results)}")
    print(f"Failed: {len(failed)}/{len(results)}")

    print("\nSuccessful Models:")
    for r in successful:
        print(f"  ✅ {r['model']}: {r['items_extracted']} items ({r['time_seconds']:.1f}s)")

    print("\nFailed Models:")
    for r in failed:
        print(f"  ❌ {r['model']}: {r['error']}")

    # Save results (ensure_ascii=False keeps the Chinese transcript readable)
    with open('extraction_benchmark_results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("\nResults saved to: extraction_benchmark_results.json")


if __name__ == "__main__":
    main()