Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Benchmark all 4 embedding models for Chinese deduplication | |
| Tests: granite-107m, granite-278m, gemma-300m, qwen-600m | |
| """ | |
| import sys | |
| import os | |
| import time | |
| import json | |
| import numpy as np | |
| # Add project path | |
| sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | |
| from meeting_summarizer.extraction import EmbeddingModel, EMBEDDING_MODELS | |
# Test pairs: Chinese text that should/shouldn't match.
# Each entry: two texts, the expected dedup decision, and a category tag
# ("exact", "different", "related", "extended") used only for display.
TEST_PAIRS = [
    # Exact duplicates (should match)
    {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配", "should_match": True, "type": "exact"},
    {"text1": "優先供應大客戶浪潮", "text2": "優先供應大客戶浪潮", "should_match": True, "type": "exact"},
    {"text1": "DDR4缺貨持續到2028年", "text2": "DDR4缺貨持續到2028年", "should_match": True, "type": "exact"},
    # Different items (should NOT match)
    {"text1": "與三星討論Q3產能分配", "text2": "確認LPDDR4供應數量", "should_match": False, "type": "different"},
    {"text1": "優先供應大客戶浪潮", "text2": "與浪潮討論大客戶付款能力", "should_match": False, "type": "related"},
    {"text1": "DDR4缺貨持續到2028年", "text2": "AI需求占全球產能45%", "should_match": False, "type": "different"},
    {"text1": "Q2價格漲幅預估", "text2": "深圳測試場良率確認", "should_match": False, "type": "different"},
    # Edge cases: topically close or prefix-extended pairs that must stay distinct
    {"text1": "ModuleHouse為嵌入式產品", "text2": "中興、創惟啟興也是重要客戶", "should_match": False, "type": "different"},
    {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配及價格", "should_match": False, "type": "extended"},
]
def test_embedding_model(model_key, threshold=0.85):
    """Benchmark one embedding model against TEST_PAIRS.

    Loads the model, embeds each text pair, compares cosine similarity
    against *threshold*, and tallies accuracy / false positives /
    false negatives.

    Args:
        model_key: Key into EMBEDDING_MODELS selecting the model config.
        threshold: Cosine-similarity cutoff for declaring a duplicate.

    Returns:
        A results dict (per-test details plus aggregate counts and
        'accuracy'), or None if loading/embedding raised.
    """
    config = EMBEDDING_MODELS[model_key]
    print(f"\n{'='*70}")
    print(f"Testing: {config['name']}")
    print(f"Repo: {config['repo_id']}")
    print(f"Dimensions: {config['embedding_dim']}")
    print(f"{'='*70}")
    model = None
    try:
        # Load model and time it
        start = time.time()
        model = EmbeddingModel(model_key, n_threads=2)
        msg = model.load()
        load_time = time.time() - start
        print(f"✓ Loaded in {load_time:.2f}s")
        print(f"  Status: {msg}")
        results = {
            "model_key": model_key,
            "model_name": config['name'],
            "dimensions": config['embedding_dim'],
            "load_time": load_time,
            "threshold": threshold,
            "tests": [],
            "correct": 0,
            "false_positives": 0,
            "false_negatives": 0,
        }
        # Test each pair
        for i, test in enumerate(TEST_PAIRS, 1):
            emb1 = model.embed(test['text1'])
            emb2 = model.embed(test['text2'])
            # Cosine similarity; vectors are assumed already L2-normalized
            # by EmbeddingModel.embed — TODO confirm against extraction module.
            similarity = float(np.dot(emb1, emb2))
            predicted = similarity >= threshold
            # Classify the prediction: wrong+predicted => false positive,
            # wrong+not predicted => false negative.
            is_correct = predicted == test['should_match']
            if is_correct:
                results['correct'] += 1
            elif predicted:
                results['false_positives'] += 1
            else:
                results['false_negatives'] += 1
            results['tests'].append({
                "id": i,
                "type": test['type'],
                "similarity": similarity,
                "predicted": predicted,
                "expected": test['should_match'],
                "correct": is_correct,
            })
            status = "✅" if is_correct else "❌"
            print(f"{status} Test {i} ({test['type'][:10]:<10}): sim={similarity:.3f}, "
                  f"match={predicted}, expected={test['should_match']}")
        total = len(TEST_PAIRS)
        results['accuracy'] = results['correct'] / total
        print(f"\n📊 {config['name']} Results:")
        print(f"   Accuracy: {results['accuracy']:.1%} ({results['correct']}/{total})")
        print(f"   False Positives: {results['false_positives']}")
        print(f"   False Negatives: {results['false_negatives']}")
        return results
    except Exception as e:
        print(f"❌ Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
    finally:
        # Fix: original only unloaded on success, leaking the loaded model
        # when embed()/load() raised. Unload unconditionally, best-effort.
        if model is not None:
            try:
                model.unload()
            except Exception:
                pass  # cleanup failure must not mask the benchmark outcome
def main():
    """Run the benchmark over every configured model and report a ranking.

    Prints a per-model summary, writes all results to
    'embedding_benchmark_all_models.json', and announces the most
    accurate model. No return value.
    """
    # Single source of truth for the cutoff: previously 0.85 was repeated
    # as a magic number in the banner, the call default, and the JSON.
    threshold = 0.85
    print("="*70)
    print("EMBEDDING MODEL BENCHMARK - All 4 Models")
    print("Chinese Transcript Deduplication")
    print("="*70)
    print(f"\nTest pairs: {len(TEST_PAIRS)}")
    print(f"Similarity threshold: {threshold}")
    print(f"\nModels to test: {len(EMBEDDING_MODELS)}")
    for key, cfg in EMBEDDING_MODELS.items():
        print(f"  • {cfg['name']} ({cfg['embedding_dim']}d)")
    all_results = []
    for model_key in EMBEDDING_MODELS:
        result = test_embedding_model(model_key, threshold=threshold)
        if result:
            all_results.append(result)
    # Summary
    print("\n" + "="*70)
    print("FINAL COMPARISON")
    print("="*70)
    if not all_results:
        print("❌ No models successfully tested")
        return
    # Rank best-first by accuracy
    all_results.sort(key=lambda x: x['accuracy'], reverse=True)
    print(f"\n{'Rank':<6}{'Model':<30}{'Dims':<8}{'Accuracy':<12}{'Load(s)':<10}")
    print("-"*70)
    for i, r in enumerate(all_results, 1):
        print(f"{i:<6}{r['model_name']:<30}{r['dimensions']:<8}{r['accuracy']:.1%}        {r['load_time']:.2f}")
    # Persist full results for later comparison
    output_file = "embedding_benchmark_all_models.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({
            "benchmark_info": {
                "test_pairs": len(TEST_PAIRS),
                "threshold": threshold,
                "models_tested": len(all_results)
            },
            "results": all_results
        }, f, indent=2, ensure_ascii=False)
    print(f"\n💾 Results saved to: {output_file}")
    # Best model
    best = all_results[0]
    print(f"\n🏆 Best: {best['model_name']}")
    print(f"   Accuracy: {best['accuracy']:.1%}")
    print(f"   Dimensions: {best['dimensions']}")
| if __name__ == "__main__": | |
| main() | |