#!/usr/bin/env python3
"""
Benchmark all 4 embedding models for Chinese deduplication
Tests: granite-107m, granite-278m, gemma-300m, qwen-600m
"""
import sys
import os
import time
import json
import numpy as np
# Add project path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from meeting_summarizer.extraction import EmbeddingModel, EMBEDDING_MODELS
# Test pairs: Chinese transcript snippets used as a dedup ground truth.
# Each entry holds two texts, whether a deduper should treat them as the
# same item, and a category tag ("exact", "different", "related", "extended")
# used only for labeling the per-test printout.
TEST_PAIRS = [
    # Exact duplicates (should match)
    {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配", "should_match": True, "type": "exact"},
    {"text1": "優先供應大客戶浪潮", "text2": "優先供應大客戶浪潮", "should_match": True, "type": "exact"},
    {"text1": "DDR4缺貨持續到2028年", "text2": "DDR4缺貨持續到2028年", "should_match": True, "type": "exact"},
    # Different items (should NOT match)
    {"text1": "與三星討論Q3產能分配", "text2": "確認LPDDR4供應數量", "should_match": False, "type": "different"},
    {"text1": "優先供應大客戶浪潮", "text2": "與浪潮討論大客戶付款能力", "should_match": False, "type": "related"},
    {"text1": "DDR4缺貨持續到2028年", "text2": "AI需求占全球產能45%", "should_match": False, "type": "different"},
    {"text1": "Q2價格漲幅預估", "text2": "深圳測試場良率確認", "should_match": False, "type": "different"},
    # Edge cases: related entities / a strict-superset sentence must NOT collapse
    {"text1": "ModuleHouse為嵌入式產品", "text2": "中興、創惟啟興也是重要客戶", "should_match": False, "type": "different"},
    {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配及價格", "should_match": False, "type": "extended"},
]
def test_embedding_model(model_key, threshold=0.85):
    """Benchmark one embedding model against TEST_PAIRS.

    Loads the model, embeds each text pair, predicts "duplicate" when the
    cosine similarity reaches `threshold`, and tallies accuracy plus
    false positives/negatives.

    Args:
        model_key: Key into EMBEDDING_MODELS selecting the model config.
        threshold: Cosine-similarity cutoff for calling a pair a duplicate.

    Returns:
        A results dict (per-test records, accuracy, load time, ...) or
        None if loading/embedding failed.
    """
    config = EMBEDDING_MODELS[model_key]
    print(f"\n{'='*70}")
    print(f"Testing: {config['name']}")
    print(f"Repo: {config['repo_id']}")
    print(f"Dimensions: {config['embedding_dim']}")
    print(f"{'='*70}")
    model = None
    try:
        # Load model and time it
        start = time.time()
        model = EmbeddingModel(model_key, n_threads=2)
        msg = model.load()
        load_time = time.time() - start
        print(f"✓ Loaded in {load_time:.2f}s")
        print(f" Status: {msg}")
        results = {
            "model_key": model_key,
            "model_name": config['name'],
            "dimensions": config['embedding_dim'],
            "load_time": load_time,
            "threshold": threshold,
            "tests": [],
            "correct": 0,
            "false_positives": 0,
            "false_negatives": 0,
        }
        # Test each pair
        for i, test in enumerate(TEST_PAIRS, 1):
            emb1 = model.embed(test['text1'])
            emb2 = model.embed(test['text2'])
            # Vectors are already normalized, so the dot product IS the
            # cosine similarity. float() detaches it from numpy for JSON.
            similarity = float(np.dot(emb1, emb2))
            predicted = similarity >= threshold
            is_correct = predicted == test['should_match']
            if is_correct:
                results['correct'] += 1
            elif predicted:
                # wrong AND predicted True => expected was False
                results['false_positives'] += 1
            else:
                results['false_negatives'] += 1
            results['tests'].append({
                "id": i,
                "type": test['type'],
                "similarity": similarity,
                "predicted": predicted,
                "expected": test['should_match'],
                "correct": is_correct
            })
            status = "✅" if is_correct else "❌"
            print(f"{status} Test {i} ({test['type'][:10]:<10}): sim={similarity:.3f}, "
                  f"match={predicted}, expected={test['should_match']}")
        # Aggregate accuracy over the fixed test set
        total = len(TEST_PAIRS)
        results['accuracy'] = results['correct'] / total
        print(f"\n📊 {config['name']} Results:")
        print(f" Accuracy: {results['accuracy']:.1%} ({results['correct']}/{total})")
        print(f" False Positives: {results['false_positives']}")
        print(f" False Negatives: {results['false_negatives']}")
        return results
    except Exception as e:
        # Benchmark must keep going if one candidate model fails to
        # load or embed; report and signal failure with None.
        print(f"❌ Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
    finally:
        # BUG FIX: unload in finally — previously a failure mid-benchmark
        # skipped unload(), leaving the model resident while the next
        # candidate loaded on top of it.
        if model is not None:
            model.unload()
def main():
    """Benchmark every registered embedding model and print a ranking.

    Runs test_embedding_model() for each entry in EMBEDDING_MODELS, sorts
    the survivors by accuracy, saves the full results to a JSON file, and
    announces the best model.
    """
    # Single source of truth for the cutoff — previously the value 0.85 was
    # duplicated in the banner, the (implicit) call default, and the JSON
    # metadata, which could silently drift apart.
    threshold = 0.85
    print("="*70)
    print("EMBEDDING MODEL BENCHMARK - All 4 Models")
    print("Chinese Transcript Deduplication")
    print("="*70)
    print(f"\nTest pairs: {len(TEST_PAIRS)}")
    print(f"Similarity threshold: {threshold}")
    print(f"\nModels to test: {len(EMBEDDING_MODELS)}")
    for key, cfg in EMBEDDING_MODELS.items():
        print(f" • {cfg['name']} ({cfg['embedding_dim']}d)")
    all_results = []
    for model_key in EMBEDDING_MODELS.keys():
        # test_embedding_model returns None on failure; keep only successes
        result = test_embedding_model(model_key, threshold)
        if result:
            all_results.append(result)
    # Summary
    print("\n" + "="*70)
    print("FINAL COMPARISON")
    print("="*70)
    if all_results:
        # Rank best-first by accuracy
        all_results.sort(key=lambda x: x['accuracy'], reverse=True)
        print(f"\n{'Rank':<6}{'Model':<30}{'Dims':<8}{'Accuracy':<12}{'Load(s)':<10}")
        print("-"*70)
        for i, r in enumerate(all_results, 1):
            print(f"{i:<6}{r['model_name']:<30}{r['dimensions']:<8}{r['accuracy']:.1%} {r['load_time']:.2f}")
        # Persist full results; ensure_ascii=False keeps the Chinese
        # test text readable in the output file.
        output_file = "embedding_benchmark_all_models.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "benchmark_info": {
                    "test_pairs": len(TEST_PAIRS),
                    "threshold": threshold,
                    "models_tested": len(all_results)
                },
                "results": all_results
            }, f, indent=2, ensure_ascii=False)
        print(f"\n💾 Results saved to: {output_file}")
        # Best model (list is sorted best-first)
        best = all_results[0]
        print(f"\n🏆 Best: {best['model_name']}")
        print(f" Accuracy: {best['accuracy']:.1%}")
        print(f" Dimensions: {best['dimensions']}")
    else:
        print("❌ No models successfully tested")
if __name__ == "__main__":
    main()