Spaces:

Luigi
/

tiny-scribe

Running

File size: 5,608 Bytes

c9955a9

#!/usr/bin/env python3
"""
Embedding Model Analysis for Chinese Deduplication

Current: Granite-107M (already working)
Alternatives to consider:
- BGE-M3 (better multilingual, larger)
- Multilingual-E5 (Microsoft, proven)

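Note: The alternatives are normally loaded via sentence-transformers rather
than GGUF (a GGUF repo is listed for BGE-M3, but whether it works with
llama-cpp is unverified).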
"""

import json

# Profile of the embedding model currently in use. All figures are stored as
# display strings because they are only printed or dumped to JSON.
_CURRENT_MODEL = {
    "name": "Granite-107M-Multilingual",
    "repo": "ibm-granite/granite-embedding-107m-multilingual",
    "params": "107M",
    "pros": [
        "Already integrated and working",
        "Fast (107M parameters)",
        "Proven in production tests",
        "Correctly deduplicated Gemma-3 (47.8% dupes)",
        "0% false positives with Qwen2.5 1.5B",
    ],
    "cons": [
        "Smaller model (107M vs 500M+)",
        "May miss nuanced similarities",
    ],
    # One entry per extraction run, keyed by the extractor that produced it.
    "test_results": {
        "qwen2.5_1.5b_extraction": {
            "duplicate_rate": "0%",
            "deduplication_accuracy": "100%",
            "note": "Extraction already unique per window",
        },
        "gemma3_1b_extraction": {
            "duplicate_rate": "47.8%",
            "deduplication_accuracy": "100%",
            "note": "Correctly identified all duplicates",
        },
    },
}

# Candidate replacement models. "gguf_repo" is optional and only present
# when a GGUF repository is recorded for the model.
_ALTERNATIVES = {
    "bge_m3": {
        "name": "BGE-M3",
        "repo": "BAAI/bge-m3",
        "gguf_repo": "lm-kit/bge-m3-gguf",
        "params": "568M",
        "pros": [
            "SOTA on MTEB Chinese benchmarks",
            "Larger model (568M vs 107M)",
            "Better semantic understanding",
        ],
        "cons": [
            "5x larger (slower)",
            "Requires sentence-transformers (not GGUF)",
            "Unknown if GGUF version works with llama-cpp",
        ],
        "recommendation": "Worth testing if accuracy issues arise",
    },
    "multilingual_e5": {
        "name": "Multilingual-E5-Large",
        "repo": "intfloat/multilingual-e5-large",
        "params": "560M",
        "pros": [
            "Microsoft-backed, widely tested",
            "Excellent for multilingual",
            "Good for Chinese text",
        ],
        "cons": [
            "5x larger than Granite-107M",
            "Requires sentence-transformers",
            "No GGUF version readily available",
        ],
        "recommendation": "Consider if switching to sentence-transformers",
    },
}

# Overall verdict, the conditions that would justify revisiting it, and the
# suggested similarity thresholds.
_RECOMMENDATION = {
    "current_status": "KEEP Granite-107M",
    "rationale": [
        "Working correctly in production",
        "Fast enough for real-time use",
        "Zero false positives in tests",
        "Simple GGUF integration",
    ],
    "when_to_upgrade": [
        "If false positives/negatives appear in production",
        "If need better semantic matching (not just exact duplicates)",
        "If processing very long texts (need better context understanding)",
    ],
    "suggested_thresholds": {
        "strict": 0.90,
        "default": 0.85,
        "lenient": 0.80,
    },
}

# Complete report data. Section order matters: it is the order in which the
# sections appear in the JSON dump.
ANALYSIS = {
    "current_model": _CURRENT_MODEL,
    "alternatives": _ALTERNATIVES,
    "recommendation": _RECOMMENDATION,
}

def _print_bullets(header, items, indent):
    """Print ``header`` followed by one ``•`` bullet line per entry in ``items``.

    ``indent`` is the whitespace prefix written before each bullet.
    """
    print(header)
    for item in items:
        print(f"{indent}• {item}")


def print_analysis(*, analysis=None, output_path="embedding_model_analysis.json"):
    """Print the embedding-model report and save the same data as JSON.

    Writes a human-readable report to stdout (current model, alternatives,
    final recommendation, suggested thresholds) and then dumps the analysis
    dict to ``output_path`` as UTF-8 JSON.

    Args:
        analysis: Report data with the same layout as the module-level
            ``ANALYSIS`` dict (``current_model``, ``alternatives`` and
            ``recommendation`` sections). Defaults to ``ANALYSIS``.
        output_path: Destination of the JSON dump. The default matches the
            filename that was previously hard-coded, so calling
            ``print_analysis()`` with no arguments behaves as before.

    Raises:
        KeyError: If ``analysis`` lacks one of the keys the report reads.
        OSError: If ``output_path`` cannot be opened for writing.
    """
    if analysis is None:
        analysis = ANALYSIS

    rule = "=" * 70

    print(rule)
    print("EMBEDDING MODEL ANALYSIS - Chinese Transcript Deduplication")
    print(rule)

    # Current model
    curr = analysis["current_model"]
    print(f"\n🎯 CURRENT MODEL: {curr['name']}")
    print(f"   Repository: {curr['repo']}")
    print(f"   Parameters: {curr['params']}")

    _print_bullets("\n   ✅ Pros:", curr["pros"], "      ")
    _print_bullets("\n   ⚠️  Cons:", curr["cons"], "      ")

    print("\n   📊 Test Results:")
    for test_name, results in curr["test_results"].items():
        print(f"      {test_name}:")
        print(f"        - Duplicate rate: {results['duplicate_rate']}")
        print(f"        - Accuracy: {results['deduplication_accuracy']}")

    # Alternatives
    print("\n" + rule)
    print("ALTERNATIVES (if needed)")
    print(rule)

    # Only the entries are displayed; the dict keys are internal identifiers.
    for alt in analysis["alternatives"].values():
        print(f"\n🔍 {alt['name']} ({alt['params']})")
        print(f"   Repo: {alt['repo']}")
        # The GGUF repository is optional per alternative.
        if "gguf_repo" in alt:
            print(f"   GGUF: {alt['gguf_repo']}")

        _print_bullets("   ✅ Pros:", alt["pros"], "      ")
        _print_bullets("   ⚠️  Cons:", alt["cons"], "      ")

        print(f"   💡 Recommendation: {alt['recommendation']}")

    # Final recommendation
    rec = analysis["recommendation"]
    print("\n" + rule)
    print("FINAL RECOMMENDATION")
    print(rule)
    print(f"\n✅ {rec['current_status']}")

    _print_bullets("\n📝 Rationale:", rec["rationale"], "   ")
    _print_bullets("\n🔄 When to Consider Upgrade:", rec["when_to_upgrade"], "   ")

    print("\n⚙️  Suggested Similarity Thresholds:")
    for thresh_type, value in rec["suggested_thresholds"].items():
        print(f"   • {thresh_type.capitalize()}: {value}")

    # Save to file. ensure_ascii=False keeps non-ASCII text readable in the
    # dump, which is why the file is opened explicitly as UTF-8.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(analysis, f, indent=2, ensure_ascii=False)

    # Report the path actually written so the message cannot drift from it.
    print(f"\n💾 Analysis saved to: {output_path}")

# Script entry point: print the report and write the JSON dump only when this
# file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    print_analysis()