Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Embedding Model Analysis for Chinese Deduplication | |
| Current: Granite-107M (already working) | |
| Alternatives to consider: | |
| - BGE-M3 (better multilingual, larger) | |
| - Multilingual-E5 (Microsoft, proven) | |
| Note: These require sentence-transformers, not GGUF | |
| """ | |
| import json | |
# Static analysis report comparing embedding models for Chinese-transcript
# deduplication. Consumed by print_analysis(), which pretty-prints it and
# dumps it to JSON. Key order is preserved in the JSON output, so do not
# reorder entries casually.
ANALYSIS = {
    # The model currently wired into the pipeline (GGUF, llama-cpp friendly).
    "current_model": {
        "name": "Granite-107M-Multilingual",
        "repo": "ibm-granite/granite-embedding-107m-multilingual",
        "params": "107M",
        "pros": [
            "Already integrated and working",
            "Fast (107M parameters)",
            "Proven in production tests",
            "Correctly deduplicated Gemma-3 (47.8% dupes)",
            "0% false positives with Qwen2.5 1.5B"
        ],
        "cons": [
            "Smaller model (107M vs 500M+)",
            "May miss nuanced similarities"
        ],
        # Empirical results from dedup runs against two extraction models.
        "test_results": {
            "qwen2.5_1.5b_extraction": {
                "duplicate_rate": "0%",
                "deduplication_accuracy": "100%",
                "note": "Extraction already unique per window"
            },
            "gemma3_1b_extraction": {
                "duplicate_rate": "47.8%",
                "deduplication_accuracy": "100%",
                "note": "Correctly identified all duplicates"
            }
        }
    },
    # Candidate replacements; both require sentence-transformers rather
    # than the GGUF path used today (see module docstring).
    "alternatives": {
        "bge_m3": {
            "name": "BGE-M3",
            "repo": "BAAI/bge-m3",
            # Optional key: print_analysis() only prints it when present.
            "gguf_repo": "lm-kit/bge-m3-gguf",
            "params": "568M",
            "pros": [
                "SOTA on MTEB Chinese benchmarks",
                "Larger model (568M vs 107M)",
                "Better semantic understanding"
            ],
            "cons": [
                "5x larger (slower)",
                "Requires sentence-transformers (not GGUF)",
                "Unknown if GGUF version works with llama-cpp"
            ],
            "recommendation": "Worth testing if accuracy issues arise"
        },
        "multilingual_e5": {
            "name": "Multilingual-E5-Large",
            "repo": "intfloat/multilingual-e5-large",
            "params": "560M",
            "pros": [
                "Microsoft-backed, widely tested",
                "Excellent for multilingual",
                "Good for Chinese text"
            ],
            "cons": [
                "5x larger than Granite-107M",
                "Requires sentence-transformers",
                "No GGUF version readily available"
            ],
            "recommendation": "Consider if switching to sentence-transformers"
        }
    },
    # Bottom line: stay on the current model; thresholds are cosine-similarity
    # cutoffs suggested for the dedup step.
    "recommendation": {
        "current_status": "KEEP Granite-107M",
        "rationale": [
            "Working correctly in production",
            "Fast enough for real-time use",
            "Zero false positives in tests",
            "Simple GGUF integration"
        ],
        "when_to_upgrade": [
            "If false positives/negatives appear in production",
            "If need better semantic matching (not just exact duplicates)",
            "If processing very long texts (need better context understanding)"
        ],
        "suggested_thresholds": {
            "strict": 0.90,
            "default": 0.85,
            "lenient": 0.80
        }
    }
}
def print_analysis(output_path="embedding_model_analysis.json"):
    """Pretty-print the embedding-model analysis and save it as JSON.

    Walks the module-level ANALYSIS dict section by section (current model,
    alternatives, final recommendation) and writes the full dict to disk.

    Args:
        output_path: Destination for the JSON dump. Defaults to the original
            hard-coded filename so existing callers are unaffected.
    """
    print("=" * 70)
    print("EMBEDDING MODEL ANALYSIS - Chinese Transcript Deduplication")
    print("=" * 70)

    # --- Current model section ---
    curr = ANALYSIS["current_model"]
    print(f"\n🎯 CURRENT MODEL: {curr['name']}")
    print(f" Repository: {curr['repo']}")
    print(f" Parameters: {curr['params']}")
    print("\n ✅ Pros:")
    for pro in curr['pros']:
        print(f" • {pro}")
    print("\n ⚠️ Cons:")
    for con in curr['cons']:
        print(f" • {con}")
    print("\n 📊 Test Results:")
    for test_name, results in curr['test_results'].items():
        print(f" {test_name}:")
        print(f" - Duplicate rate: {results['duplicate_rate']}")
        print(f" - Accuracy: {results['deduplication_accuracy']}")

    # --- Alternatives section ---
    print("\n" + "=" * 70)
    print("ALTERNATIVES (if needed)")
    print("=" * 70)
    for key, alt in ANALYSIS["alternatives"].items():
        print(f"\n🔍 {alt['name']} ({alt['params']})")
        print(f" Repo: {alt['repo']}")
        # 'gguf_repo' is optional (only BGE-M3 defines it).
        if 'gguf_repo' in alt:
            print(f" GGUF: {alt['gguf_repo']}")
        print(" ✅ Pros:")
        for pro in alt['pros']:
            print(f" • {pro}")
        print(" ⚠️ Cons:")
        for con in alt['cons']:
            print(f" • {con}")
        print(f" 💡 Recommendation: {alt['recommendation']}")

    # --- Final recommendation section ---
    rec = ANALYSIS["recommendation"]
    print("\n" + "=" * 70)
    print("FINAL RECOMMENDATION")
    print("=" * 70)
    print(f"\n✅ {rec['current_status']}")
    print("\n📝 Rationale:")
    for r in rec['rationale']:
        print(f" • {r}")
    print("\n🔄 When to Consider Upgrade:")
    for when in rec['when_to_upgrade']:
        print(f" • {when}")
    print("\n⚙️ Suggested Similarity Thresholds:")
    for thresh_type, value in rec['suggested_thresholds'].items():
        print(f" • {thresh_type.capitalize()}: {value}")

    # Persist the full analysis; ensure_ascii=False keeps any non-ASCII
    # (emoji/Chinese) readable in the JSON file.
    # Bug fix: the saved-to message previously hard-coded the filename as an
    # f-string with no placeholder; it now reflects the actual output path.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(ANALYSIS, f, indent=2, ensure_ascii=False)
    print(f"\n💾 Analysis saved to: {output_path}")
| if __name__ == "__main__": | |
| print_analysis() | |