#!/usr/bin/env python3
# Extracted from a web file view (revision c9955a9, file size 5,608 bytes).
"""
Embedding Model Analysis for Chinese Deduplication
Current: Granite-107M (already working)
Alternatives to consider:
- BGE-M3 (better multilingual, larger)
- Multilingual-E5 (Microsoft, proven)
Note: These require sentence-transformers, not GGUF
"""
import json
# Assessment of the embedding model currently used for deduplication,
# including the recorded results of the two extraction test runs.
_CURRENT_MODEL = {
    "name": "Granite-107M-Multilingual",
    "repo": "ibm-granite/granite-embedding-107m-multilingual",
    "params": "107M",
    "pros": [
        "Already integrated and working",
        "Fast (107M parameters)",
        "Proven in production tests",
        "Correctly deduplicated Gemma-3 (47.8% dupes)",
        "0% false positives with Qwen2.5 1.5B",
    ],
    "cons": [
        "Smaller model (107M vs 500M+)",
        "May miss nuanced similarities",
    ],
    "test_results": {
        "qwen2.5_1.5b_extraction": {
            "duplicate_rate": "0%",
            "deduplication_accuracy": "100%",
            "note": "Extraction already unique per window",
        },
        "gemma3_1b_extraction": {
            "duplicate_rate": "47.8%",
            "deduplication_accuracy": "100%",
            "note": "Correctly identified all duplicates",
        },
    },
}

# Candidate replacement models, keyed by a short identifier. Only entries
# that have a known GGUF build carry a "gguf_repo" field.
_ALTERNATIVES = {
    "bge_m3": {
        "name": "BGE-M3",
        "repo": "BAAI/bge-m3",
        "gguf_repo": "lm-kit/bge-m3-gguf",
        "params": "568M",
        "pros": [
            "SOTA on MTEB Chinese benchmarks",
            "Larger model (568M vs 107M)",
            "Better semantic understanding",
        ],
        "cons": [
            "5x larger (slower)",
            "Requires sentence-transformers (not GGUF)",
            "Unknown if GGUF version works with llama-cpp",
        ],
        "recommendation": "Worth testing if accuracy issues arise",
    },
    "multilingual_e5": {
        "name": "Multilingual-E5-Large",
        "repo": "intfloat/multilingual-e5-large",
        "params": "560M",
        "pros": [
            "Microsoft-backed, widely tested",
            "Excellent for multilingual",
            "Good for Chinese text",
        ],
        "cons": [
            "5x larger than Granite-107M",
            "Requires sentence-transformers",
            "No GGUF version readily available",
        ],
        "recommendation": "Consider if switching to sentence-transformers",
    },
}

# Overall verdict, the conditions that would justify revisiting it, and the
# suggested similarity thresholds.
_RECOMMENDATION = {
    "current_status": "KEEP Granite-107M",
    "rationale": [
        "Working correctly in production",
        "Fast enough for real-time use",
        "Zero false positives in tests",
        "Simple GGUF integration",
    ],
    "when_to_upgrade": [
        "If false positives/negatives appear in production",
        "If need better semantic matching (not just exact duplicates)",
        "If processing very long texts (need better context understanding)",
    ],
    "suggested_thresholds": {
        "strict": 0.90,
        "default": 0.85,
        "lenient": 0.80,
    },
}

# Complete report data. Section order is significant: it is the order in
# which the sections are serialised when the dict is dumped to JSON.
ANALYSIS = {
    "current_model": _CURRENT_MODEL,
    "alternatives": _ALTERNATIVES,
    "recommendation": _RECOMMENDATION,
}
def print_analysis(analysis=None, output_path="embedding_model_analysis.json"):
    """Print the embedding-model report and optionally save its data as JSON.

    The report has three parts: the current model (with its pros, cons and
    test results), each alternative model, and the final recommendation.

    Args:
        analysis: Report data laid out like the module-level ``ANALYSIS``
            dict, i.e. with ``current_model``, ``alternatives`` and
            ``recommendation`` sections. Defaults to ``ANALYSIS``.
        output_path: File the report data is written to as UTF-8 JSON.
            Pass ``None`` to print the report without writing a file.

    Raises:
        KeyError: If ``analysis`` lacks a field the report reads.
        OSError: If ``output_path`` cannot be opened for writing.
    """
    if analysis is None:
        analysis = ANALYSIS

    banner = "=" * 70  # section separator, built once and reused

    print(banner)
    print("EMBEDDING MODEL ANALYSIS - Chinese Transcript Deduplication")
    print(banner)

    # Current model
    curr = analysis["current_model"]
    print(f"\n🎯 CURRENT MODEL: {curr['name']}")
    print(f" Repository: {curr['repo']}")
    print(f" Parameters: {curr['params']}")
    print("\n ✅ Pros:")
    for pro in curr['pros']:
        print(f" • {pro}")
    print("\n ⚠️ Cons:")
    for con in curr['cons']:
        print(f" • {con}")
    print("\n 📊 Test Results:")
    for test_name, results in curr['test_results'].items():
        print(f" {test_name}:")
        print(f" - Duplicate rate: {results['duplicate_rate']}")
        print(f" - Accuracy: {results['deduplication_accuracy']}")

    # Alternatives
    print("\n" + banner)
    print("ALTERNATIVES (if needed)")
    print(banner)
    # Only the entries are displayed; their dict keys are not part of the report.
    for alt in analysis["alternatives"].values():
        print(f"\n🔍 {alt['name']} ({alt['params']})")
        print(f" Repo: {alt['repo']}")
        # The GGUF repository is optional and only shown when recorded.
        if 'gguf_repo' in alt:
            print(f" GGUF: {alt['gguf_repo']}")
        print(" ✅ Pros:")
        for pro in alt['pros']:
            print(f" • {pro}")
        print(" ⚠️ Cons:")
        for con in alt['cons']:
            print(f" • {con}")
        print(f" 💡 Recommendation: {alt['recommendation']}")

    # Final recommendation
    rec = analysis["recommendation"]
    print("\n" + banner)
    print("FINAL RECOMMENDATION")
    print(banner)
    print(f"\n✅ {rec['current_status']}")
    print("\n📝 Rationale:")
    for reason in rec['rationale']:
        print(f" • {reason}")
    print("\n🔄 When to Consider Upgrade:")
    for when in rec['when_to_upgrade']:
        print(f" • {when}")
    print("\n⚙️ Suggested Similarity Thresholds:")
    for thresh_type, value in rec['suggested_thresholds'].items():
        print(f" • {thresh_type.capitalize()}: {value}")

    # Save to file (skipped when the caller only wants the printed report)
    if output_path is None:
        return
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(analysis, f, indent=2, ensure_ascii=False)
    # Report the path actually written rather than a duplicated literal.
    print(f"\n💾 Analysis saved to: {output_path}")
# Script entry point: produce the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    print_analysis()
|