#!/usr/bin/env python3
"""Embedding model analysis for Chinese transcript deduplication.

Holds a static comparison of the embedding model currently in use
(Granite-107M) against two larger alternatives (BGE-M3 and
Multilingual-E5), prints it as a human-readable report, and saves the
same data as JSON.

Note: the alternatives are recorded as requiring sentence-transformers
rather than GGUF.
"""

import json

# File the report data is written to when no explicit path is given.
DEFAULT_OUTPUT_PATH = "embedding_model_analysis.json"

# Horizontal rule printed above and below every section title.
_RULE = "=" * 70

# Static analysis data.  The structure is dumped to JSON verbatim, so every
# key and value here is part of the output format.
ANALYSIS = {
    "current_model": {
        "name": "Granite-107M-Multilingual",
        "repo": "ibm-granite/granite-embedding-107m-multilingual",
        "params": "107M",
        "pros": [
            "Already integrated and working",
            "Fast (107M parameters)",
            "Proven in production tests",
            "Correctly deduplicated Gemma-3 (47.8% dupes)",
            "0% false positives with Qwen2.5 1.5B",
        ],
        "cons": [
            "Smaller model (107M vs 500M+)",
            "May miss nuanced similarities",
        ],
        "test_results": {
            "qwen2.5_1.5b_extraction": {
                "duplicate_rate": "0%",
                "deduplication_accuracy": "100%",
                "note": "Extraction already unique per window",
            },
            "gemma3_1b_extraction": {
                "duplicate_rate": "47.8%",
                "deduplication_accuracy": "100%",
                "note": "Correctly identified all duplicates",
            },
        },
    },
    "alternatives": {
        "bge_m3": {
            "name": "BGE-M3",
            "repo": "BAAI/bge-m3",
            "gguf_repo": "lm-kit/bge-m3-gguf",
            "params": "568M",
            "pros": [
                "SOTA on MTEB Chinese benchmarks",
                "Larger model (568M vs 107M)",
                "Better semantic understanding",
            ],
            "cons": [
                "5x larger (slower)",
                "Requires sentence-transformers (not GGUF)",
                "Unknown if GGUF version works with llama-cpp",
            ],
            "recommendation": "Worth testing if accuracy issues arise",
        },
        "multilingual_e5": {
            "name": "Multilingual-E5-Large",
            "repo": "intfloat/multilingual-e5-large",
            "params": "560M",
            "pros": [
                "Microsoft-backed, widely tested",
                "Excellent for multilingual",
                "Good for Chinese text",
            ],
            "cons": [
                "5x larger than Granite-107M",
                "Requires sentence-transformers",
                "No GGUF version readily available",
            ],
            "recommendation": "Consider if switching to sentence-transformers",
        },
    },
    "recommendation": {
        "current_status": "KEEP Granite-107M",
        "rationale": [
            "Working correctly in production",
            "Fast enough for real-time use",
            "Zero false positives in tests",
            "Simple GGUF integration",
        ],
        "when_to_upgrade": [
            "If false positives/negatives appear in production",
            "If need better semantic matching (not just exact duplicates)",
            "If processing very long texts (need better context understanding)",
        ],
        "suggested_thresholds": {
            "strict": 0.90,
            "default": 0.85,
            "lenient": 0.80,
        },
    },
}


def _print_banner(title, *, leading_blank=True):
    """Print *title* framed by two rules, optionally preceded by a blank line."""
    print("\n" + _RULE if leading_blank else _RULE)
    print(title)
    print(_RULE)


def _print_bullets(items):
    """Print every entry of *items* as one bullet line."""
    for item in items:
        print(f" • {item}")


def _print_current_model(model):
    """Print name, repository, size, pros/cons and test results of *model*."""
    print(f"\n🎯 CURRENT MODEL: {model['name']}")
    print(f" Repository: {model['repo']}")
    print(f" Parameters: {model['params']}")

    print("\n ✅ Pros:")
    _print_bullets(model["pros"])
    print("\n ⚠️ Cons:")
    _print_bullets(model["cons"])

    print("\n 📊 Test Results:")
    for test_name, results in model["test_results"].items():
        print(f" {test_name}:")
        print(f" - Duplicate rate: {results['duplicate_rate']}")
        print(f" - Accuracy: {results['deduplication_accuracy']}")


def _print_alternatives(alternatives):
    """Print one summary per candidate model in the *alternatives* mapping."""
    # Only the entries are displayed; the mapping keys are internal ids.
    for alt in alternatives.values():
        print(f"\n🔍 {alt['name']} ({alt['params']})")
        print(f" Repo: {alt['repo']}")
        # Not every alternative has a GGUF build, so this key is optional.
        if "gguf_repo" in alt:
            print(f" GGUF: {alt['gguf_repo']}")
        print(" ✅ Pros:")
        _print_bullets(alt["pros"])
        print(" ⚠️ Cons:")
        _print_bullets(alt["cons"])
        print(f" 💡 Recommendation: {alt['recommendation']}")


def _print_recommendation(rec):
    """Print the verdict, its rationale, upgrade triggers and thresholds."""
    print(f"\n✅ {rec['current_status']}")

    print("\n📝 Rationale:")
    _print_bullets(rec["rationale"])

    print("\n🔄 When to Consider Upgrade:")
    _print_bullets(rec["when_to_upgrade"])

    print("\n⚙️ Suggested Similarity Thresholds:")
    for thresh_type, value in rec["suggested_thresholds"].items():
        print(f" • {thresh_type.capitalize()}: {value}")


def print_analysis(output_path=DEFAULT_OUTPUT_PATH):
    """Print the full analysis report and save ``ANALYSIS`` as JSON.

    Args:
        output_path: Destination of the JSON dump.  Defaults to
            ``embedding_model_analysis.json`` in the current working
            directory, which is what the script always used before the
            path became configurable.

    Raises:
        OSError: If ``output_path`` cannot be opened for writing.
    """
    _print_banner(
        "EMBEDDING MODEL ANALYSIS - Chinese Transcript Deduplication",
        leading_blank=False,
    )
    _print_current_model(ANALYSIS["current_model"])

    _print_banner("ALTERNATIVES (if needed)")
    _print_alternatives(ANALYSIS["alternatives"])

    _print_banner("FINAL RECOMMENDATION")
    _print_recommendation(ANALYSIS["recommendation"])

    # ensure_ascii=False keeps any non-ASCII text readable in the file, which
    # is why the file is opened with an explicit UTF-8 encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(ANALYSIS, f, indent=2, ensure_ascii=False)
    print(f"\n💾 Analysis saved to: {output_path}")


if __name__ == "__main__":
    print_analysis()