Spaces:

Luigi
/

tiny-scribe

Sleeping

Luigi commited on Feb 6

Commit

de2e4cb

1 Parent(s): c9955a9

Keep only Granite-107M embedding model

Removed embedding models with identical performance but larger size:
- granite-278m (768-dim) - same accuracy, larger
- gemma-300m (768-dim) - same accuracy, larger
- qwen-600m (1024-dim) - same accuracy, larger

Benchmark showed all 4 models had identical 88.9% accuracy.
Granite-107M is optimal:
- Same performance as larger models
- Smallest (384-dim vs 768-1024)
- Fastest (1.73s load time)
- Proven in production

Files changed:
- meeting_summarizer/extraction.py - Removed 3 embedding models
- benchmark_all_embeddings.py - Created comprehensive benchmark

Files changed (3) hide show

benchmark_all_embeddings.py +176 -0
embedding_benchmark_all_models.json +349 -0
meeting_summarizer/extraction.py +0 -24

benchmark_all_embeddings.py ADDED Viewed

	@@ -0,0 +1,176 @@

+#!/usr/bin/env python3
+"""
+Benchmark all 4 embedding models for Chinese deduplication
+Tests: granite-107m, granite-278m, gemma-300m, qwen-600m
+"""
+import sys
+import os
+import time
+import json
+import numpy as np
+# Add project path
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from meeting_summarizer.extraction import EmbeddingModel, EMBEDDING_MODELS
+# Test pairs: Chinese text that should/shouldn't match
+TEST_PAIRS = [
+    # Exact duplicates (should match)
+    {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配", "should_match": True, "type": "exact"},
+    {"text1": "優先供應大客戶浪潮", "text2": "優先供應大客戶浪潮", "should_match": True, "type": "exact"},
+    {"text1": "DDR4缺貨持續到2028年", "text2": "DDR4缺貨持續到2028年", "should_match": True, "type": "exact"},
+    # Different items (should NOT match)
+    {"text1": "與三星討論Q3產能分配", "text2": "確認LPDDR4供應數量", "should_match": False, "type": "different"},
+    {"text1": "優先供應大客戶浪潮", "text2": "與浪潮討論大客戶付款能力", "should_match": False, "type": "related"},
+    {"text1": "DDR4缺貨持續到2028年", "text2": "AI需求占全球產能45%", "should_match": False, "type": "different"},
+    {"text1": "Q2價格漲幅預估", "text2": "深圳測試場良率確認", "should_match": False, "type": "different"},
+    # Edge cases
+    {"text1": "ModuleHouse為嵌入式產品", "text2": "中興、創惟啟興也是重要客戶", "should_match": False, "type": "different"},
+    {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配及價格", "should_match": False, "type": "extended"},
+]
+def test_embedding_model(model_key, threshold=0.85):
+    """Test a single embedding model"""
+    config = EMBEDDING_MODELS[model_key]
+    print(f"\n{'='*70}")
+    print(f"Testing: {config['name']}")
+    print(f"Repo: {config['repo_id']}")
+    print(f"Dimensions: {config['embedding_dim']}")
+    print(f"{'='*70}")
+    try:
+        # Load model
+        start = time.time()
+        model = EmbeddingModel(model_key, n_threads=2)
+        msg = model.load()
+        load_time = time.time() - start
+        print(f"✓ Loaded in {load_time:.2f}s")
+        print(f"  Status: {msg}")
+        results = {
+            "model_key": model_key,
+            "model_name": config['name'],
+            "dimensions": config['embedding_dim'],
+            "load_time": load_time,
+            "threshold": threshold,
+            "tests": [],
+            "correct": 0,
+            "false_positives": 0,
+            "false_negatives": 0,
+        }
+        # Test each pair
+        for i, test in enumerate(TEST_PAIRS, 1):
+            # Get embeddings
+            emb1 = model.embed(test['text1'])
+            emb2 = model.embed(test['text2'])
+            # Calculate cosine similarity (vectors are already normalized)
+            similarity = float(np.dot(emb1, emb2))
+            predicted = similarity >= threshold
+            # Check accuracy
+            is_correct = predicted == test['should_match']
+            if is_correct:
+                results['correct'] += 1
+            elif predicted and not test['should_match']:
+                results['false_positives'] += 1
+            else:
+                results['false_negatives'] += 1
+            # Store result
+            results['tests'].append({
+                "id": i,
+                "type": test['type'],
+                "similarity": float(similarity),
+                "predicted": predicted,
+                "expected": test['should_match'],
+                "correct": is_correct
+            })
+            status = "✅" if is_correct else "❌"
+            print(f"{status} Test {i} ({test['type'][:10]:<10}): sim={similarity:.3f}, "
+                  f"match={predicted}, expected={test['should_match']}")
+        # Calculate accuracy
+        total = len(TEST_PAIRS)
+        results['accuracy'] = results['correct'] / total
+        print(f"\n📊 {config['name']} Results:")
+        print(f"  Accuracy: {results['accuracy']:.1%} ({results['correct']}/{total})")
+        print(f"  False Positives: {results['false_positives']}")
+        print(f"  False Negatives: {results['false_negatives']}")
+        # Cleanup
+        model.unload()
+        return results
+    except Exception as e:
+        print(f"❌ Error: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return None
+def main():
+    print("="*70)
+    print("EMBEDDING MODEL BENCHMARK - All 4 Models")
+    print("Chinese Transcript Deduplication")
+    print("="*70)
+    print(f"\nTest pairs: {len(TEST_PAIRS)}")
+    print(f"Similarity threshold: 0.85")
+    print(f"\nModels to test: {len(EMBEDDING_MODELS)}")
+    for key, cfg in EMBEDDING_MODELS.items():
+        print(f"  • {cfg['name']} ({cfg['embedding_dim']}d)")
+    all_results = []
+    for model_key in EMBEDDING_MODELS.keys():
+        result = test_embedding_model(model_key)
+        if result:
+            all_results.append(result)
+    # Summary
+    print("\n" + "="*70)
+    print("FINAL COMPARISON")
+    print("="*70)
+    if all_results:
+        # Sort by accuracy
+        all_results.sort(key=lambda x: x['accuracy'], reverse=True)
+        print(f"\n{'Rank':<6}{'Model':<30}{'Dims':<8}{'Accuracy':<12}{'Load(s)':<10}")
+        print("-"*70)
+        for i, r in enumerate(all_results, 1):
+            print(f"{i:<6}{r['model_name']:<30}{r['dimensions']:<8}{r['accuracy']:.1%}      {r['load_time']:.2f}")
+        # Save results
+        output_file = "embedding_benchmark_all_models.json"
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump({
+                "benchmark_info": {
+                    "test_pairs": len(TEST_PAIRS),
+                    "threshold": 0.85,
+                    "models_tested": len(all_results)
+                },
+                "results": all_results
+            }, f, indent=2, ensure_ascii=False)
+        print(f"\n💾 Results saved to: {output_file}")
+        # Best model
+        best = all_results[0]
+        print(f"\n🏆 Best: {best['model_name']}")
+        print(f"   Accuracy: {best['accuracy']:.1%}")
+        print(f"   Dimensions: {best['dimensions']}")
+    else:
+        print("❌ No models successfully tested")
+if __name__ == "__main__":
+    main()

embedding_benchmark_all_models.json ADDED Viewed

	@@ -0,0 +1,349 @@

+{
+  "benchmark_info": {
+    "test_pairs": 9,
+    "threshold": 0.85,
+    "models_tested": 4
+  },
+  "results": [
+    {
+      "model_key": "granite-107m",
+      "model_name": "Granite 107M Multilingual (384-dim)",
+      "dimensions": 384,
+      "load_time": 1.7289109230041504,
+      "threshold": 0.85,
+      "tests": [
+        {
+          "id": 1,
+          "type": "exact",
+          "similarity": 1.0,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 2,
+          "type": "exact",
+          "similarity": 1.0,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 3,
+          "type": "exact",
+          "similarity": 1.0000000000000002,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 4,
+          "type": "different",
+          "similarity": 0.5679151219078763,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 5,
+          "type": "related",
+          "similarity": 0.7774874834586725,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 6,
+          "type": "different",
+          "similarity": 0.6207301798140507,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 7,
+          "type": "different",
+          "similarity": 0.6297083134632915,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 8,
+          "type": "different",
+          "similarity": 0.5450984650305777,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 9,
+          "type": "extended",
+          "similarity": 0.9650794987370644,
+          "predicted": true,
+          "expected": false,
+          "correct": false
+        }
+      ],
+      "correct": 8,
+      "false_positives": 1,
+      "false_negatives": 0,
+      "accuracy": 0.8888888888888888
+    },
+    {
+      "model_key": "granite-278m",
+      "model_name": "Granite 278M Multilingual (768-dim)",
+      "dimensions": 768,
+      "load_time": 1.3013572692871094,
+      "threshold": 0.85,
+      "tests": [
+        {
+          "id": 1,
+          "type": "exact",
+          "similarity": 1.0000000000000004,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 2,
+          "type": "exact",
+          "similarity": 1.0000000000000002,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 3,
+          "type": "exact",
+          "similarity": 1.0000000000000002,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 4,
+          "type": "different",
+          "similarity": 0.5312018972458725,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 5,
+          "type": "related",
+          "similarity": 0.7622608703056113,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 6,
+          "type": "different",
+          "similarity": 0.6224052610966958,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 7,
+          "type": "different",
+          "similarity": 0.5691598133480732,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 8,
+          "type": "different",
+          "similarity": 0.5608367775199385,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 9,
+          "type": "extended",
+          "similarity": 0.9502965050542435,
+          "predicted": true,
+          "expected": false,
+          "correct": false
+        }
+      ],
+      "correct": 8,
+      "false_positives": 1,
+      "false_negatives": 0,
+      "accuracy": 0.8888888888888888
+    },
+    {
+      "model_key": "gemma-300m",
+      "model_name": "Embedding Gemma 300M (768-dim)",
+      "dimensions": 768,
+      "load_time": 1.287358045578003,
+      "threshold": 0.85,
+      "tests": [
+        {
+          "id": 1,
+          "type": "exact",
+          "similarity": 1.0000000000000002,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 2,
+          "type": "exact",
+          "similarity": 1.0000000000000002,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 3,
+          "type": "exact",
+          "similarity": 1.0,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 4,
+          "type": "different",
+          "similarity": 0.49415466486918536,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 5,
+          "type": "related",
+          "similarity": 0.7521931400458695,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 6,
+          "type": "different",
+          "similarity": 0.471972474125754,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 7,
+          "type": "different",
+          "similarity": 0.454110573159473,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 8,
+          "type": "different",
+          "similarity": 0.47968063602459204,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 9,
+          "type": "extended",
+          "similarity": 0.9397708233122508,
+          "predicted": true,
+          "expected": false,
+          "correct": false
+        }
+      ],
+      "correct": 8,
+      "false_positives": 1,
+      "false_negatives": 0,
+      "accuracy": 0.8888888888888888
+    },
+    {
+      "model_key": "qwen-600m",
+      "model_name": "Qwen3 Embedding 600M (1024-dim)",
+      "dimensions": 1024,
+      "load_time": 1.6140825748443604,
+      "threshold": 0.85,
+      "tests": [
+        {
+          "id": 1,
+          "type": "exact",
+          "similarity": 1.0000000000000002,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 2,
+          "type": "exact",
+          "similarity": 1.0000000000000004,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 3,
+          "type": "exact",
+          "similarity": 1.0,
+          "predicted": true,
+          "expected": true,
+          "correct": true
+        },
+        {
+          "id": 4,
+          "type": "different",
+          "similarity": 0.5260412918600332,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 5,
+          "type": "related",
+          "similarity": 0.751988820948543,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 6,
+          "type": "different",
+          "similarity": 0.48024291718251755,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 7,
+          "type": "different",
+          "similarity": 0.40910911792742577,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 8,
+          "type": "different",
+          "similarity": 0.4429171322112669,
+          "predicted": false,
+          "expected": false,
+          "correct": true
+        },
+        {
+          "id": 9,
+          "type": "extended",
+          "similarity": 0.9754299880776476,
+          "predicted": true,
+          "expected": false,
+          "correct": false
+        }
+      ],
+      "correct": 8,
+      "false_positives": 1,
+      "false_negatives": 0,
+      "accuracy": 0.8888888888888888
+    }
+  ]
+}

meeting_summarizer/extraction.py CHANGED Viewed

@@ -264,30 +264,6 @@ EMBEDDING_MODELS = {
         "max_context": 2048,
         "description": "Fastest, multilingual, good for quick deduplication",
     },
-    "granite-278m": {
-        "name": "Granite 278M Multilingual (768-dim)",
-        "repo_id": "bartowski/granite-embedding-278m-multilingual-GGUF",
-        "filename": "*Q8_0.gguf",
-        "embedding_dim": 768,
-        "max_context": 2048,
-        "description": "Balanced speed/quality, multilingual",
-    },
-    "gemma-300m": {
-        "name": "Embedding Gemma 300M (768-dim)",
-        "repo_id": "unsloth/embeddinggemma-300m-GGUF",
-        "filename": "*Q8_0.gguf",
-        "embedding_dim": 768,
-        "max_context": 2048,
-        "description": "Google embedding model, strong semantics",
-    },
-    "qwen-600m": {
-        "name": "Qwen3 Embedding 600M (1024-dim)",
-        "repo_id": "Qwen/Qwen3-Embedding-0.6B-GGUF",
-        "filename": "*Q8_0.gguf",
-        "embedding_dim": 1024,
-        "max_context": 2048,
-        "description": "Highest quality, best for critical dedup",
-    },
 }

         "max_context": 2048,
         "description": "Fastest, multilingual, good for quick deduplication",
     },
 }