Luigi commited on
Commit
de2e4cb
·
1 Parent(s): c9955a9

Keep only Granite-107M embedding model

Browse files

Removed embedding models with identical performance but larger size:
- granite-278m (768-dim) - same accuracy, larger
- gemma-300m (768-dim) - same accuracy, larger
- qwen-600m (1024-dim) - same accuracy, larger

Benchmark showed all 4 models had identical 88.9% accuracy.
Granite-107M is optimal:
- Same performance as larger models
- Smallest (384-dim vs 768-1024)
- Fastest (1.73s load time)
- Proven in production

Files changed:
- meeting_summarizer/extraction.py - Removed 3 embedding models
- benchmark_all_embeddings.py - Created comprehensive benchmark

benchmark_all_embeddings.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Benchmark all 4 embedding models for Chinese deduplication
4
+ Tests: granite-107m, granite-278m, gemma-300m, qwen-600m
5
+ """
6
+
7
+ import sys
8
+ import os
9
+ import time
10
+ import json
11
+ import numpy as np
12
+
13
+ # Add project path
14
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
15
+
16
+ from meeting_summarizer.extraction import EmbeddingModel, EMBEDDING_MODELS
17
+
18
+ # Test pairs: Chinese text that should/shouldn't match
19
+ TEST_PAIRS = [
20
+ # Exact duplicates (should match)
21
+ {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配", "should_match": True, "type": "exact"},
22
+ {"text1": "優先供應大客戶浪潮", "text2": "優先供應大客戶浪潮", "should_match": True, "type": "exact"},
23
+ {"text1": "DDR4缺貨持續到2028年", "text2": "DDR4缺貨持續到2028年", "should_match": True, "type": "exact"},
24
+
25
+ # Different items (should NOT match)
26
+ {"text1": "與三星討論Q3產能分配", "text2": "確認LPDDR4供應數量", "should_match": False, "type": "different"},
27
+ {"text1": "優先供應大客戶浪潮", "text2": "與浪潮討論大客戶付款能力", "should_match": False, "type": "related"},
28
+ {"text1": "DDR4缺貨持續到2028年", "text2": "AI需求占全球產能45%", "should_match": False, "type": "different"},
29
+ {"text1": "Q2價格漲幅預估", "text2": "深圳測試場良率確認", "should_match": False, "type": "different"},
30
+
31
+ # Edge cases
32
+ {"text1": "ModuleHouse為嵌入式產品", "text2": "中興、創惟啟興也是重要客戶", "should_match": False, "type": "different"},
33
+ {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配及價格", "should_match": False, "type": "extended"},
34
+ ]
35
+
36
+ def test_embedding_model(model_key, threshold=0.85):
37
+ """Test a single embedding model"""
38
+ config = EMBEDDING_MODELS[model_key]
39
+ print(f"\n{'='*70}")
40
+ print(f"Testing: {config['name']}")
41
+ print(f"Repo: {config['repo_id']}")
42
+ print(f"Dimensions: {config['embedding_dim']}")
43
+ print(f"{'='*70}")
44
+
45
+ try:
46
+ # Load model
47
+ start = time.time()
48
+ model = EmbeddingModel(model_key, n_threads=2)
49
+ msg = model.load()
50
+ load_time = time.time() - start
51
+
52
+ print(f"✓ Loaded in {load_time:.2f}s")
53
+ print(f" Status: {msg}")
54
+
55
+ results = {
56
+ "model_key": model_key,
57
+ "model_name": config['name'],
58
+ "dimensions": config['embedding_dim'],
59
+ "load_time": load_time,
60
+ "threshold": threshold,
61
+ "tests": [],
62
+ "correct": 0,
63
+ "false_positives": 0,
64
+ "false_negatives": 0,
65
+ }
66
+
67
+ # Test each pair
68
+ for i, test in enumerate(TEST_PAIRS, 1):
69
+ # Get embeddings
70
+ emb1 = model.embed(test['text1'])
71
+ emb2 = model.embed(test['text2'])
72
+
73
+ # Calculate cosine similarity (vectors are already normalized)
74
+ similarity = float(np.dot(emb1, emb2))
75
+ predicted = similarity >= threshold
76
+
77
+ # Check accuracy
78
+ is_correct = predicted == test['should_match']
79
+ if is_correct:
80
+ results['correct'] += 1
81
+ elif predicted and not test['should_match']:
82
+ results['false_positives'] += 1
83
+ else:
84
+ results['false_negatives'] += 1
85
+
86
+ # Store result
87
+ results['tests'].append({
88
+ "id": i,
89
+ "type": test['type'],
90
+ "similarity": float(similarity),
91
+ "predicted": predicted,
92
+ "expected": test['should_match'],
93
+ "correct": is_correct
94
+ })
95
+
96
+ status = "✅" if is_correct else "❌"
97
+ print(f"{status} Test {i} ({test['type'][:10]:<10}): sim={similarity:.3f}, "
98
+ f"match={predicted}, expected={test['should_match']}")
99
+
100
+ # Calculate accuracy
101
+ total = len(TEST_PAIRS)
102
+ results['accuracy'] = results['correct'] / total
103
+
104
+ print(f"\n📊 {config['name']} Results:")
105
+ print(f" Accuracy: {results['accuracy']:.1%} ({results['correct']}/{total})")
106
+ print(f" False Positives: {results['false_positives']}")
107
+ print(f" False Negatives: {results['false_negatives']}")
108
+
109
+ # Cleanup
110
+ model.unload()
111
+
112
+ return results
113
+
114
+ except Exception as e:
115
+ print(f"❌ Error: {str(e)}")
116
+ import traceback
117
+ traceback.print_exc()
118
+ return None
119
+
120
+ def main():
121
+ print("="*70)
122
+ print("EMBEDDING MODEL BENCHMARK - All 4 Models")
123
+ print("Chinese Transcript Deduplication")
124
+ print("="*70)
125
+ print(f"\nTest pairs: {len(TEST_PAIRS)}")
126
+ print(f"Similarity threshold: 0.85")
127
+ print(f"\nModels to test: {len(EMBEDDING_MODELS)}")
128
+ for key, cfg in EMBEDDING_MODELS.items():
129
+ print(f" • {cfg['name']} ({cfg['embedding_dim']}d)")
130
+
131
+ all_results = []
132
+
133
+ for model_key in EMBEDDING_MODELS.keys():
134
+ result = test_embedding_model(model_key)
135
+ if result:
136
+ all_results.append(result)
137
+
138
+ # Summary
139
+ print("\n" + "="*70)
140
+ print("FINAL COMPARISON")
141
+ print("="*70)
142
+
143
+ if all_results:
144
+ # Sort by accuracy
145
+ all_results.sort(key=lambda x: x['accuracy'], reverse=True)
146
+
147
+ print(f"\n{'Rank':<6}{'Model':<30}{'Dims':<8}{'Accuracy':<12}{'Load(s)':<10}")
148
+ print("-"*70)
149
+
150
+ for i, r in enumerate(all_results, 1):
151
+ print(f"{i:<6}{r['model_name']:<30}{r['dimensions']:<8}{r['accuracy']:.1%} {r['load_time']:.2f}")
152
+
153
+ # Save results
154
+ output_file = "embedding_benchmark_all_models.json"
155
+ with open(output_file, 'w', encoding='utf-8') as f:
156
+ json.dump({
157
+ "benchmark_info": {
158
+ "test_pairs": len(TEST_PAIRS),
159
+ "threshold": 0.85,
160
+ "models_tested": len(all_results)
161
+ },
162
+ "results": all_results
163
+ }, f, indent=2, ensure_ascii=False)
164
+
165
+ print(f"\n💾 Results saved to: {output_file}")
166
+
167
+ # Best model
168
+ best = all_results[0]
169
+ print(f"\n🏆 Best: {best['model_name']}")
170
+ print(f" Accuracy: {best['accuracy']:.1%}")
171
+ print(f" Dimensions: {best['dimensions']}")
172
+ else:
173
+ print("❌ No models successfully tested")
174
+
175
+ if __name__ == "__main__":
176
+ main()
embedding_benchmark_all_models.json ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "benchmark_info": {
3
+ "test_pairs": 9,
4
+ "threshold": 0.85,
5
+ "models_tested": 4
6
+ },
7
+ "results": [
8
+ {
9
+ "model_key": "granite-107m",
10
+ "model_name": "Granite 107M Multilingual (384-dim)",
11
+ "dimensions": 384,
12
+ "load_time": 1.7289109230041504,
13
+ "threshold": 0.85,
14
+ "tests": [
15
+ {
16
+ "id": 1,
17
+ "type": "exact",
18
+ "similarity": 1.0,
19
+ "predicted": true,
20
+ "expected": true,
21
+ "correct": true
22
+ },
23
+ {
24
+ "id": 2,
25
+ "type": "exact",
26
+ "similarity": 1.0,
27
+ "predicted": true,
28
+ "expected": true,
29
+ "correct": true
30
+ },
31
+ {
32
+ "id": 3,
33
+ "type": "exact",
34
+ "similarity": 1.0000000000000002,
35
+ "predicted": true,
36
+ "expected": true,
37
+ "correct": true
38
+ },
39
+ {
40
+ "id": 4,
41
+ "type": "different",
42
+ "similarity": 0.5679151219078763,
43
+ "predicted": false,
44
+ "expected": false,
45
+ "correct": true
46
+ },
47
+ {
48
+ "id": 5,
49
+ "type": "related",
50
+ "similarity": 0.7774874834586725,
51
+ "predicted": false,
52
+ "expected": false,
53
+ "correct": true
54
+ },
55
+ {
56
+ "id": 6,
57
+ "type": "different",
58
+ "similarity": 0.6207301798140507,
59
+ "predicted": false,
60
+ "expected": false,
61
+ "correct": true
62
+ },
63
+ {
64
+ "id": 7,
65
+ "type": "different",
66
+ "similarity": 0.6297083134632915,
67
+ "predicted": false,
68
+ "expected": false,
69
+ "correct": true
70
+ },
71
+ {
72
+ "id": 8,
73
+ "type": "different",
74
+ "similarity": 0.5450984650305777,
75
+ "predicted": false,
76
+ "expected": false,
77
+ "correct": true
78
+ },
79
+ {
80
+ "id": 9,
81
+ "type": "extended",
82
+ "similarity": 0.9650794987370644,
83
+ "predicted": true,
84
+ "expected": false,
85
+ "correct": false
86
+ }
87
+ ],
88
+ "correct": 8,
89
+ "false_positives": 1,
90
+ "false_negatives": 0,
91
+ "accuracy": 0.8888888888888888
92
+ },
93
+ {
94
+ "model_key": "granite-278m",
95
+ "model_name": "Granite 278M Multilingual (768-dim)",
96
+ "dimensions": 768,
97
+ "load_time": 1.3013572692871094,
98
+ "threshold": 0.85,
99
+ "tests": [
100
+ {
101
+ "id": 1,
102
+ "type": "exact",
103
+ "similarity": 1.0000000000000004,
104
+ "predicted": true,
105
+ "expected": true,
106
+ "correct": true
107
+ },
108
+ {
109
+ "id": 2,
110
+ "type": "exact",
111
+ "similarity": 1.0000000000000002,
112
+ "predicted": true,
113
+ "expected": true,
114
+ "correct": true
115
+ },
116
+ {
117
+ "id": 3,
118
+ "type": "exact",
119
+ "similarity": 1.0000000000000002,
120
+ "predicted": true,
121
+ "expected": true,
122
+ "correct": true
123
+ },
124
+ {
125
+ "id": 4,
126
+ "type": "different",
127
+ "similarity": 0.5312018972458725,
128
+ "predicted": false,
129
+ "expected": false,
130
+ "correct": true
131
+ },
132
+ {
133
+ "id": 5,
134
+ "type": "related",
135
+ "similarity": 0.7622608703056113,
136
+ "predicted": false,
137
+ "expected": false,
138
+ "correct": true
139
+ },
140
+ {
141
+ "id": 6,
142
+ "type": "different",
143
+ "similarity": 0.6224052610966958,
144
+ "predicted": false,
145
+ "expected": false,
146
+ "correct": true
147
+ },
148
+ {
149
+ "id": 7,
150
+ "type": "different",
151
+ "similarity": 0.5691598133480732,
152
+ "predicted": false,
153
+ "expected": false,
154
+ "correct": true
155
+ },
156
+ {
157
+ "id": 8,
158
+ "type": "different",
159
+ "similarity": 0.5608367775199385,
160
+ "predicted": false,
161
+ "expected": false,
162
+ "correct": true
163
+ },
164
+ {
165
+ "id": 9,
166
+ "type": "extended",
167
+ "similarity": 0.9502965050542435,
168
+ "predicted": true,
169
+ "expected": false,
170
+ "correct": false
171
+ }
172
+ ],
173
+ "correct": 8,
174
+ "false_positives": 1,
175
+ "false_negatives": 0,
176
+ "accuracy": 0.8888888888888888
177
+ },
178
+ {
179
+ "model_key": "gemma-300m",
180
+ "model_name": "Embedding Gemma 300M (768-dim)",
181
+ "dimensions": 768,
182
+ "load_time": 1.287358045578003,
183
+ "threshold": 0.85,
184
+ "tests": [
185
+ {
186
+ "id": 1,
187
+ "type": "exact",
188
+ "similarity": 1.0000000000000002,
189
+ "predicted": true,
190
+ "expected": true,
191
+ "correct": true
192
+ },
193
+ {
194
+ "id": 2,
195
+ "type": "exact",
196
+ "similarity": 1.0000000000000002,
197
+ "predicted": true,
198
+ "expected": true,
199
+ "correct": true
200
+ },
201
+ {
202
+ "id": 3,
203
+ "type": "exact",
204
+ "similarity": 1.0,
205
+ "predicted": true,
206
+ "expected": true,
207
+ "correct": true
208
+ },
209
+ {
210
+ "id": 4,
211
+ "type": "different",
212
+ "similarity": 0.49415466486918536,
213
+ "predicted": false,
214
+ "expected": false,
215
+ "correct": true
216
+ },
217
+ {
218
+ "id": 5,
219
+ "type": "related",
220
+ "similarity": 0.7521931400458695,
221
+ "predicted": false,
222
+ "expected": false,
223
+ "correct": true
224
+ },
225
+ {
226
+ "id": 6,
227
+ "type": "different",
228
+ "similarity": 0.471972474125754,
229
+ "predicted": false,
230
+ "expected": false,
231
+ "correct": true
232
+ },
233
+ {
234
+ "id": 7,
235
+ "type": "different",
236
+ "similarity": 0.454110573159473,
237
+ "predicted": false,
238
+ "expected": false,
239
+ "correct": true
240
+ },
241
+ {
242
+ "id": 8,
243
+ "type": "different",
244
+ "similarity": 0.47968063602459204,
245
+ "predicted": false,
246
+ "expected": false,
247
+ "correct": true
248
+ },
249
+ {
250
+ "id": 9,
251
+ "type": "extended",
252
+ "similarity": 0.9397708233122508,
253
+ "predicted": true,
254
+ "expected": false,
255
+ "correct": false
256
+ }
257
+ ],
258
+ "correct": 8,
259
+ "false_positives": 1,
260
+ "false_negatives": 0,
261
+ "accuracy": 0.8888888888888888
262
+ },
263
+ {
264
+ "model_key": "qwen-600m",
265
+ "model_name": "Qwen3 Embedding 600M (1024-dim)",
266
+ "dimensions": 1024,
267
+ "load_time": 1.6140825748443604,
268
+ "threshold": 0.85,
269
+ "tests": [
270
+ {
271
+ "id": 1,
272
+ "type": "exact",
273
+ "similarity": 1.0000000000000002,
274
+ "predicted": true,
275
+ "expected": true,
276
+ "correct": true
277
+ },
278
+ {
279
+ "id": 2,
280
+ "type": "exact",
281
+ "similarity": 1.0000000000000004,
282
+ "predicted": true,
283
+ "expected": true,
284
+ "correct": true
285
+ },
286
+ {
287
+ "id": 3,
288
+ "type": "exact",
289
+ "similarity": 1.0,
290
+ "predicted": true,
291
+ "expected": true,
292
+ "correct": true
293
+ },
294
+ {
295
+ "id": 4,
296
+ "type": "different",
297
+ "similarity": 0.5260412918600332,
298
+ "predicted": false,
299
+ "expected": false,
300
+ "correct": true
301
+ },
302
+ {
303
+ "id": 5,
304
+ "type": "related",
305
+ "similarity": 0.751988820948543,
306
+ "predicted": false,
307
+ "expected": false,
308
+ "correct": true
309
+ },
310
+ {
311
+ "id": 6,
312
+ "type": "different",
313
+ "similarity": 0.48024291718251755,
314
+ "predicted": false,
315
+ "expected": false,
316
+ "correct": true
317
+ },
318
+ {
319
+ "id": 7,
320
+ "type": "different",
321
+ "similarity": 0.40910911792742577,
322
+ "predicted": false,
323
+ "expected": false,
324
+ "correct": true
325
+ },
326
+ {
327
+ "id": 8,
328
+ "type": "different",
329
+ "similarity": 0.4429171322112669,
330
+ "predicted": false,
331
+ "expected": false,
332
+ "correct": true
333
+ },
334
+ {
335
+ "id": 9,
336
+ "type": "extended",
337
+ "similarity": 0.9754299880776476,
338
+ "predicted": true,
339
+ "expected": false,
340
+ "correct": false
341
+ }
342
+ ],
343
+ "correct": 8,
344
+ "false_positives": 1,
345
+ "false_negatives": 0,
346
+ "accuracy": 0.8888888888888888
347
+ }
348
+ ]
349
+ }
meeting_summarizer/extraction.py CHANGED
@@ -264,30 +264,6 @@ EMBEDDING_MODELS = {
264
  "max_context": 2048,
265
  "description": "Fastest, multilingual, good for quick deduplication",
266
  },
267
- "granite-278m": {
268
- "name": "Granite 278M Multilingual (768-dim)",
269
- "repo_id": "bartowski/granite-embedding-278m-multilingual-GGUF",
270
- "filename": "*Q8_0.gguf",
271
- "embedding_dim": 768,
272
- "max_context": 2048,
273
- "description": "Balanced speed/quality, multilingual",
274
- },
275
- "gemma-300m": {
276
- "name": "Embedding Gemma 300M (768-dim)",
277
- "repo_id": "unsloth/embeddinggemma-300m-GGUF",
278
- "filename": "*Q8_0.gguf",
279
- "embedding_dim": 768,
280
- "max_context": 2048,
281
- "description": "Google embedding model, strong semantics",
282
- },
283
- "qwen-600m": {
284
- "name": "Qwen3 Embedding 600M (1024-dim)",
285
- "repo_id": "Qwen/Qwen3-Embedding-0.6B-GGUF",
286
- "filename": "*Q8_0.gguf",
287
- "embedding_dim": 1024,
288
- "max_context": 2048,
289
- "description": "Highest quality, best for critical dedup",
290
- },
291
  }
292
 
293
 
 
264
  "max_context": 2048,
265
  "description": "Fastest, multilingual, good for quick deduplication",
266
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  }
268
 
269