#!/usr/bin/env python3
"""
Benchmark all 4 embedding models for Chinese deduplication
Tests: granite-107m, granite-278m, gemma-300m, qwen-600m
"""
import sys
import os
import time
import json
import numpy as np
# Add project path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from meeting_summarizer.extraction import EmbeddingModel, EMBEDDING_MODELS
# Test pairs: Chinese transcript snippets used as a dedup ground truth.
# Each entry holds two texts, whether a deduper should treat them as the
# same item, and a category tag ("exact", "different", "related", "extended")
# used only for labeling the per-test printout.
TEST_PAIRS = [
    # Exact duplicates (should match)
    {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配", "should_match": True, "type": "exact"},
    {"text1": "優先供應大客戶浪潮", "text2": "優先供應大客戶浪潮", "should_match": True, "type": "exact"},
    {"text1": "DDR4缺貨持續到2028年", "text2": "DDR4缺貨持續到2028年", "should_match": True, "type": "exact"},
    # Different items (should NOT match)
    {"text1": "與三星討論Q3產能分配", "text2": "確認LPDDR4供應數量", "should_match": False, "type": "different"},
    {"text1": "優先供應大客戶浪潮", "text2": "與浪潮討論大客戶付款能力", "should_match": False, "type": "related"},
    {"text1": "DDR4缺貨持續到2028年", "text2": "AI需求占全球產能45%", "should_match": False, "type": "different"},
    {"text1": "Q2價格漲幅預估", "text2": "深圳測試場良率確認", "should_match": False, "type": "different"},
    # Edge cases: related entities / a strict-superset sentence must NOT collapse
    {"text1": "ModuleHouse為嵌入式產品", "text2": "中興、創惟啟興也是重要客戶", "should_match": False, "type": "different"},
    {"text1": "與三星討論Q3產能分配", "text2": "與三星討論Q3產能分配及價格", "should_match": False, "type": "extended"},
]
def test_embedding_model(model_key, threshold=0.85):
    """Benchmark one embedding model against TEST_PAIRS.

    Loads the model, embeds each text pair, predicts "duplicate" when the
    cosine similarity reaches `threshold`, and tallies accuracy plus
    false positives/negatives.

    Args:
        model_key: Key into EMBEDDING_MODELS selecting the model config.
        threshold: Cosine-similarity cutoff for calling a pair a duplicate.

    Returns:
        A results dict (per-test records, accuracy, load time, ...) or
        None if loading/embedding failed.
    """
    config = EMBEDDING_MODELS[model_key]
    print(f"\n{'='*70}")
    print(f"Testing: {config['name']}")
    print(f"Repo: {config['repo_id']}")
    print(f"Dimensions: {config['embedding_dim']}")
    print(f"{'='*70}")
    model = None
    try:
        # Load model and time it
        start = time.time()
        model = EmbeddingModel(model_key, n_threads=2)
        msg = model.load()
        load_time = time.time() - start
        print(f"✓ Loaded in {load_time:.2f}s")
        print(f" Status: {msg}")
        results = {
            "model_key": model_key,
            "model_name": config['name'],
            "dimensions": config['embedding_dim'],
            "load_time": load_time,
            "threshold": threshold,
            "tests": [],
            "correct": 0,
            "false_positives": 0,
            "false_negatives": 0,
        }
        # Test each pair
        for i, test in enumerate(TEST_PAIRS, 1):
            emb1 = model.embed(test['text1'])
            emb2 = model.embed(test['text2'])
            # Vectors are already normalized, so the dot product IS the
            # cosine similarity. float() detaches it from numpy for JSON.
            similarity = float(np.dot(emb1, emb2))
            predicted = similarity >= threshold
            is_correct = predicted == test['should_match']
            if is_correct:
                results['correct'] += 1
            elif predicted:
                # wrong AND predicted True => expected was False
                results['false_positives'] += 1
            else:
                results['false_negatives'] += 1
            results['tests'].append({
                "id": i,
                "type": test['type'],
                "similarity": similarity,
                "predicted": predicted,
                "expected": test['should_match'],
                "correct": is_correct
            })
            status = "✅" if is_correct else "❌"
            print(f"{status} Test {i} ({test['type'][:10]:<10}): sim={similarity:.3f}, "
                  f"match={predicted}, expected={test['should_match']}")
        # Aggregate accuracy over the fixed test set
        total = len(TEST_PAIRS)
        results['accuracy'] = results['correct'] / total
        print(f"\n📊 {config['name']} Results:")
        print(f" Accuracy: {results['accuracy']:.1%} ({results['correct']}/{total})")
        print(f" False Positives: {results['false_positives']}")
        print(f" False Negatives: {results['false_negatives']}")
        return results
    except Exception as e:
        # Benchmark must keep going if one candidate model fails to
        # load or embed; report and signal failure with None.
        print(f"❌ Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
    finally:
        # BUG FIX: unload in finally — previously a failure mid-benchmark
        # skipped unload(), leaving the model resident while the next
        # candidate loaded on top of it.
        if model is not None:
            model.unload()
def main():
    """Benchmark every registered embedding model and print a ranking.

    Runs test_embedding_model() for each entry in EMBEDDING_MODELS, sorts
    the survivors by accuracy, saves the full results to a JSON file, and
    announces the best model.
    """
    # Single source of truth for the cutoff — previously the value 0.85 was
    # duplicated in the banner, the (implicit) call default, and the JSON
    # metadata, which could silently drift apart.
    threshold = 0.85
    print("="*70)
    print("EMBEDDING MODEL BENCHMARK - All 4 Models")
    print("Chinese Transcript Deduplication")
    print("="*70)
    print(f"\nTest pairs: {len(TEST_PAIRS)}")
    print(f"Similarity threshold: {threshold}")
    print(f"\nModels to test: {len(EMBEDDING_MODELS)}")
    for key, cfg in EMBEDDING_MODELS.items():
        print(f" • {cfg['name']} ({cfg['embedding_dim']}d)")
    all_results = []
    for model_key in EMBEDDING_MODELS.keys():
        # test_embedding_model returns None on failure; keep only successes
        result = test_embedding_model(model_key, threshold)
        if result:
            all_results.append(result)
    # Summary
    print("\n" + "="*70)
    print("FINAL COMPARISON")
    print("="*70)
    if all_results:
        # Rank best-first by accuracy
        all_results.sort(key=lambda x: x['accuracy'], reverse=True)
        print(f"\n{'Rank':<6}{'Model':<30}{'Dims':<8}{'Accuracy':<12}{'Load(s)':<10}")
        print("-"*70)
        for i, r in enumerate(all_results, 1):
            print(f"{i:<6}{r['model_name']:<30}{r['dimensions']:<8}{r['accuracy']:.1%} {r['load_time']:.2f}")
        # Persist full results; ensure_ascii=False keeps the Chinese
        # test text readable in the output file.
        output_file = "embedding_benchmark_all_models.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "benchmark_info": {
                    "test_pairs": len(TEST_PAIRS),
                    "threshold": threshold,
                    "models_tested": len(all_results)
                },
                "results": all_results
            }, f, indent=2, ensure_ascii=False)
        print(f"\n💾 Results saved to: {output_file}")
        # Best model (list is sorted best-first)
        best = all_results[0]
        print(f"\n🏆 Best: {best['model_name']}")
        print(f" Accuracy: {best['accuracy']:.1%}")
        print(f" Dimensions: {best['dimensions']}")
    else:
        print("❌ No models successfully tested")
if __name__ == "__main__":
    main()