tiny-scribe / analyze_embeddings.py
Luigi's picture
Add embedding model analysis documentation
c9955a9
#!/usr/bin/env python3
"""
Embedding Model Analysis for Chinese Deduplication
Current: Granite-107M (already working)
Alternatives to consider:
- BGE-M3 (better multilingual, larger)
- Multilingual-E5 (Microsoft, proven)
Note: Both alternatives are expected to need sentence-transformers; a GGUF
build of BGE-M3 is listed below but is untested with llama-cpp.
"""
import json
# Profile of the embedding model currently in use, including the results of
# the deduplication tests recorded for it.
_CURRENT_MODEL = {
    "name": "Granite-107M-Multilingual",
    "repo": "ibm-granite/granite-embedding-107m-multilingual",
    "params": "107M",
    "pros": [
        "Already integrated and working",
        "Fast (107M parameters)",
        "Proven in production tests",
        "Correctly deduplicated Gemma-3 (47.8% dupes)",
        "0% false positives with Qwen2.5 1.5B",
    ],
    "cons": [
        "Smaller model (107M vs 500M+)",
        "May miss nuanced similarities",
    ],
    "test_results": {
        "qwen2.5_1.5b_extraction": {
            "duplicate_rate": "0%",
            "deduplication_accuracy": "100%",
            "note": "Extraction already unique per window",
        },
        "gemma3_1b_extraction": {
            "duplicate_rate": "47.8%",
            "deduplication_accuracy": "100%",
            "note": "Correctly identified all duplicates",
        },
    },
}

# Candidate replacement models, keyed by a short identifier. "gguf_repo" is
# optional and only present when a GGUF build is listed for the model.
_ALTERNATIVES = {
    "bge_m3": {
        "name": "BGE-M3",
        "repo": "BAAI/bge-m3",
        "gguf_repo": "lm-kit/bge-m3-gguf",
        "params": "568M",
        "pros": [
            "SOTA on MTEB Chinese benchmarks",
            "Larger model (568M vs 107M)",
            "Better semantic understanding",
        ],
        "cons": [
            "5x larger (slower)",
            "Requires sentence-transformers (not GGUF)",
            "Unknown if GGUF version works with llama-cpp",
        ],
        "recommendation": "Worth testing if accuracy issues arise",
    },
    "multilingual_e5": {
        "name": "Multilingual-E5-Large",
        "repo": "intfloat/multilingual-e5-large",
        "params": "560M",
        "pros": [
            "Microsoft-backed, widely tested",
            "Excellent for multilingual",
            "Good for Chinese text",
        ],
        "cons": [
            "5x larger than Granite-107M",
            "Requires sentence-transformers",
            "No GGUF version readily available",
        ],
        "recommendation": "Consider if switching to sentence-transformers",
    },
}

# Final verdict: what to keep, why, when to revisit, and the suggested
# similarity thresholds.
_RECOMMENDATION = {
    "current_status": "KEEP Granite-107M",
    "rationale": [
        "Working correctly in production",
        "Fast enough for real-time use",
        "Zero false positives in tests",
        "Simple GGUF integration",
    ],
    "when_to_upgrade": [
        "If false positives/negatives appear in production",
        "If need better semantic matching (not just exact duplicates)",
        "If processing very long texts (need better context understanding)",
    ],
    "suggested_thresholds": {
        "strict": 0.90,
        "default": 0.85,
        "lenient": 0.80,
    },
}

# Complete analysis document. Section order is significant: it determines
# the key order of the JSON dump written from this dict.
ANALYSIS = {
    "current_model": _CURRENT_MODEL,
    "alternatives": _ALTERNATIVES,
    "recommendation": _RECOMMENDATION,
}
def print_analysis(output_path="embedding_model_analysis.json"):
    """Print the embedding-model analysis report and save it as JSON.

    Writes a human-readable summary of the module-level ``ANALYSIS`` dict to
    stdout in three sections (current model, alternatives, final
    recommendation), then dumps the whole dict to ``output_path``.

    Args:
        output_path: Where the JSON dump is written. Defaults to
            ``embedding_model_analysis.json``, resolved against the current
            working directory, so calling with no arguments behaves exactly
            as before.

    Raises:
        OSError: If ``output_path`` cannot be opened for writing.
    """
    rule = "=" * 70

    print(rule)
    print("EMBEDDING MODEL ANALYSIS - Chinese Transcript Deduplication")
    print(rule)

    # Current model
    curr = ANALYSIS["current_model"]
    print(f"\n🎯 CURRENT MODEL: {curr['name']}")
    print(f" Repository: {curr['repo']}")
    print(f" Parameters: {curr['params']}")
    print("\n ✅ Pros:")
    for pro in curr['pros']:
        print(f" • {pro}")
    print("\n ⚠️ Cons:")
    for con in curr['cons']:
        print(f" • {con}")
    print("\n 📊 Test Results:")
    for test_name, results in curr['test_results'].items():
        print(f" {test_name}:")
        print(f" - Duplicate rate: {results['duplicate_rate']}")
        print(f" - Accuracy: {results['deduplication_accuracy']}")

    # Alternatives
    print("\n" + rule)
    print("ALTERNATIVES (if needed)")
    print(rule)
    # Only the entries are displayed; the dict keys are internal identifiers.
    for alt in ANALYSIS["alternatives"].values():
        print(f"\n🔍 {alt['name']} ({alt['params']})")
        print(f" Repo: {alt['repo']}")
        # Not every alternative lists a GGUF build.
        if 'gguf_repo' in alt:
            print(f" GGUF: {alt['gguf_repo']}")
        print(" ✅ Pros:")
        for pro in alt['pros']:
            print(f" • {pro}")
        print(" ⚠️ Cons:")
        for con in alt['cons']:
            print(f" • {con}")
        print(f" 💡 Recommendation: {alt['recommendation']}")

    # Final recommendation
    rec = ANALYSIS["recommendation"]
    print("\n" + rule)
    print("FINAL RECOMMENDATION")
    print(rule)
    print(f"\n✅ {rec['current_status']}")
    print("\n📝 Rationale:")
    for r in rec['rationale']:
        print(f" • {r}")
    print("\n🔄 When to Consider Upgrade:")
    for when in rec['when_to_upgrade']:
        print(f" • {when}")
    print("\n⚙️ Suggested Similarity Thresholds:")
    for thresh_type, value in rec['suggested_thresholds'].items():
        print(f" • {thresh_type.capitalize()}: {value}")

    # Save to file. ensure_ascii=False keeps non-ASCII text readable in the
    # dump, which is why the file is opened explicitly as UTF-8.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(ANALYSIS, f, indent=2, ensure_ascii=False)
    # Report the path actually written rather than a second hard-coded copy.
    print(f"\n💾 Analysis saved to: {output_path}")
# Script entry point: produce the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    print_analysis()