File size: 6,543 Bytes
de2e4cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env python3
"""
Benchmark all 4 embedding models for Chinese deduplication
Tests: granite-107m, granite-278m, gemma-300m, qwen-600m
"""

import sys
import os
import time
import json
import numpy as np

# Add project path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from meeting_summarizer.extraction import EmbeddingModel, EMBEDDING_MODELS

# Test pairs: Chinese text that should/shouldn't match
# Test pairs: Chinese text that should/shouldn't match.
# Each entry: two texts, the expected duplicate verdict, and a category tag
# ("exact", "different", "related", "extended") used only for display.
TEST_PAIRS = [
    # Exact duplicates (should match)
    {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配", "should_match": True, "type": "exact"},
    {"text1": "優先供應大客戶浪潮", "text2": "優先供應大客戶浪潮", "should_match": True, "type": "exact"},
    {"text1": "DDR4缺貨持續到2028年", "text2": "DDR4缺貨持續到2028年", "should_match": True, "type": "exact"},
    
    # Different items (should NOT match)
    {"text1": "與三星討論Q3產能分配", "text2": "確認LPDDR4供應數量", "should_match": False, "type": "different"},
    {"text1": "優先供應大客戶浪潮", "text2": "與浪潮討論大客戶付款能力", "should_match": False, "type": "related"},
    {"text1": "DDR4缺貨持續到2028年", "text2": "AI需求占全球產能45%", "should_match": False, "type": "different"},
    {"text1": "Q2價格漲幅預估", "text2": "深圳測試場良率確認", "should_match": False, "type": "different"},
    
    # Edge cases: unrelated pair, and an extended variant that must NOT
    # be collapsed into its shorter prefix
    {"text1": "ModuleHouse為嵌入式產品", "text2": "中興、創惟啟興也是重要客戶", "should_match": False, "type": "different"},
    {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配及價格", "should_match": False, "type": "extended"},
]

def test_embedding_model(model_key, threshold=0.85):
    """Benchmark a single embedding model on the Chinese dedup test pairs.

    Loads the model identified by *model_key*, embeds each pair in
    TEST_PAIRS, and classifies a pair as a duplicate when the cosine
    similarity of its embeddings is >= *threshold*.

    Args:
        model_key: Key into EMBEDDING_MODELS selecting the model config.
        threshold: Cosine-similarity cutoff for predicting a duplicate.

    Returns:
        A results dict (per-test records plus accuracy / FP / FN counts),
        or None if loading or testing raised an exception.
    """
    config = EMBEDDING_MODELS[model_key]
    print(f"\n{'='*70}")
    print(f"Testing: {config['name']}")
    print(f"Repo: {config['repo_id']}")
    print(f"Dimensions: {config['embedding_dim']}")
    print(f"{'='*70}")
    
    try:
        # Load model and time it
        start = time.time()
        model = EmbeddingModel(model_key, n_threads=2)
        msg = model.load()
        load_time = time.time() - start
        
        print(f"✓ Loaded in {load_time:.2f}s")
        print(f"  Status: {msg}")
        
        results = {
            "model_key": model_key,
            "model_name": config['name'],
            "dimensions": config['embedding_dim'],
            "load_time": load_time,
            "threshold": threshold,
            "tests": [],
            "correct": 0,
            "false_positives": 0,
            "false_negatives": 0,
        }
        
        try:
            # Test each pair
            for i, test in enumerate(TEST_PAIRS, 1):
                # Get embeddings
                emb1 = model.embed(test['text1'])
                emb2 = model.embed(test['text2'])
                
                # Cosine similarity reduces to a dot product because the
                # vectors are assumed already L2-normalized by the model
                # -- TODO confirm against EmbeddingModel.embed
                similarity = float(np.dot(emb1, emb2))
                predicted = similarity >= threshold
                
                # Tally correctness: a wrong "duplicate" verdict is a false
                # positive; a missed duplicate is a false negative
                is_correct = predicted == test['should_match']
                if is_correct:
                    results['correct'] += 1
                elif predicted and not test['should_match']:
                    results['false_positives'] += 1
                else:
                    results['false_negatives'] += 1
                
                # Store result (similarity is already a Python float)
                results['tests'].append({
                    "id": i,
                    "type": test['type'],
                    "similarity": similarity,
                    "predicted": predicted,
                    "expected": test['should_match'],
                    "correct": is_correct
                })
                
                status = "✅" if is_correct else "❌"
                print(f"{status} Test {i} ({test['type'][:10]:<10}): sim={similarity:.3f}, "
                      f"match={predicted}, expected={test['should_match']}")
            
            # Calculate accuracy
            total = len(TEST_PAIRS)
            results['accuracy'] = results['correct'] / total
            
            print(f"\n📊 {config['name']} Results:")
            print(f"  Accuracy: {results['accuracy']:.1%} ({results['correct']}/{total})")
            print(f"  False Positives: {results['false_positives']}")
            print(f"  False Negatives: {results['false_negatives']}")
        finally:
            # Always release the model once loaded, even if a test
            # iteration raised -- otherwise the model would leak
            model.unload()
        
        return results
        
    except Exception as e:
        # Best-effort benchmark: report the failure and keep going with
        # the remaining models (caller skips None results)
        print(f"❌ Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

def main():
    """Run the benchmark across every configured model and report a ranking."""
    print("="*70)
    print("EMBEDDING MODEL BENCHMARK - All 4 Models")
    print("Chinese Transcript Deduplication")
    print("="*70)
    print(f"\nTest pairs: {len(TEST_PAIRS)}")
    print(f"Similarity threshold: 0.85")
    print(f"\nModels to test: {len(EMBEDDING_MODELS)}")
    for cfg in EMBEDDING_MODELS.values():
        print(f"  • {cfg['name']} ({cfg['embedding_dim']}d)")

    # Benchmark each model; failed runs return None and are dropped
    all_results = [
        result
        for result in (test_embedding_model(key) for key in EMBEDDING_MODELS)
        if result
    ]

    # Summary
    print("\n" + "="*70)
    print("FINAL COMPARISON")
    print("="*70)

    if not all_results:
        print("❌ No models successfully tested")
        return

    # Rank models best-first by accuracy
    all_results = sorted(all_results, key=lambda r: r['accuracy'], reverse=True)

    print(f"\n{'Rank':<6}{'Model':<30}{'Dims':<8}{'Accuracy':<12}{'Load(s)':<10}")
    print("-"*70)

    for rank, entry in enumerate(all_results, 1):
        print(f"{rank:<6}{entry['model_name']:<30}{entry['dimensions']:<8}{entry['accuracy']:.1%}      {entry['load_time']:.2f}")

    # Persist the full run for later comparison
    output_file = "embedding_benchmark_all_models.json"
    payload = {
        "benchmark_info": {
            "test_pairs": len(TEST_PAIRS),
            "threshold": 0.85,
            "models_tested": len(all_results)
        },
        "results": all_results
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Results saved to: {output_file}")

    # Winner is the first entry after the accuracy sort
    best = all_results[0]
    print(f"\n🏆 Best: {best['model_name']}")
    print(f"   Accuracy: {best['accuracy']:.1%}")
    print(f"   Dimensions: {best['dimensions']}")

# Script entry point: run the full benchmark when executed directly.
if __name__ == "__main__":
    main()