Spaces:

Luigi
/

tiny-scribe

Running

App Files Files Community

tiny-scribe / analyze_embeddings.py

Luigi

Add embedding model analysis documentation

c9955a9 about 1 month ago

raw

history blame contribute delete

5.61 kB

	#!/usr/bin/env python3
	"""
	Embedding Model Analysis for Chinese Deduplication

	Current: Granite-107M (already working)
	Alternatives to consider:
	- BGE-M3 (better multilingual, larger)
	- Multilingual-E5 (Microsoft, proven)

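	Note: These are normally loaded through sentence-transformers rather than GGUF.
	A GGUF conversion of BGE-M3 exists (lm-kit/bge-m3-gguf), but its
	compatibility with llama-cpp is unknown.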
	"""

	import json

	# Facts recorded about the embedding model that is currently in use.
	_CURRENT_MODEL = {
	    "name": "Granite-107M-Multilingual",
	    "repo": "ibm-granite/granite-embedding-107m-multilingual",
	    "params": "107M",
	    "pros": [
	        "Already integrated and working",
	        "Fast (107M parameters)",
	        "Proven in production tests",
	        "Correctly deduplicated Gemma-3 (47.8% dupes)",
	        "0% false positives with Qwen2.5 1.5B",
	    ],
	    "cons": [
	        "Smaller model (107M vs 500M+)",
	        "May miss nuanced similarities",
	    ],
	    # Keyed by the extraction run that produced the deduplicated items.
	    "test_results": {
	        "qwen2.5_1.5b_extraction": {
	            "duplicate_rate": "0%",
	            "deduplication_accuracy": "100%",
	            "note": "Extraction already unique per window",
	        },
	        "gemma3_1b_extraction": {
	            "duplicate_rate": "47.8%",
	            "deduplication_accuracy": "100%",
	            "note": "Correctly identified all duplicates",
	        },
	    },
	}

	# Candidate replacements; only some entries carry a "gguf_repo" key.
	_ALTERNATIVES = {
	    "bge_m3": {
	        "name": "BGE-M3",
	        "repo": "BAAI/bge-m3",
	        "gguf_repo": "lm-kit/bge-m3-gguf",
	        "params": "568M",
	        "pros": [
	            "SOTA on MTEB Chinese benchmarks",
	            "Larger model (568M vs 107M)",
	            "Better semantic understanding",
	        ],
	        "cons": [
	            "5x larger (slower)",
	            "Requires sentence-transformers (not GGUF)",
	            "Unknown if GGUF version works with llama-cpp",
	        ],
	        "recommendation": "Worth testing if accuracy issues arise",
	    },
	    "multilingual_e5": {
	        "name": "Multilingual-E5-Large",
	        "repo": "intfloat/multilingual-e5-large",
	        "params": "560M",
	        "pros": [
	            "Microsoft-backed, widely tested",
	            "Excellent for multilingual",
	            "Good for Chinese text",
	        ],
	        "cons": [
	            "5x larger than Granite-107M",
	            "Requires sentence-transformers",
	            "No GGUF version readily available",
	        ],
	        "recommendation": "Consider if switching to sentence-transformers",
	    },
	}

	# Final verdict, upgrade triggers, and suggested similarity cut-offs.
	_RECOMMENDATION = {
	    "current_status": "KEEP Granite-107M",
	    "rationale": [
	        "Working correctly in production",
	        "Fast enough for real-time use",
	        "Zero false positives in tests",
	        "Simple GGUF integration",
	    ],
	    "when_to_upgrade": [
	        "If false positives/negatives appear in production",
	        "If need better semantic matching (not just exact duplicates)",
	        "If processing very long texts (need better context understanding)",
	    ],
	    "suggested_thresholds": {
	        "strict": 0.90,
	        "default": 0.85,
	        "lenient": 0.80,
	    },
	}

	# Complete analysis document; key order is the order it is printed and
	# serialized in.
	ANALYSIS = {
	    "current_model": _CURRENT_MODEL,
	    "alternatives": _ALTERNATIVES,
	    "recommendation": _RECOMMENDATION,
	}

	def print_analysis(output_path: str = "embedding_model_analysis.json") -> None:
	    """Print the embedding-model analysis and save it as JSON.

	    Writes a report built from the module-level ``ANALYSIS`` dictionary to
	    stdout in three sections (current model, alternatives, final
	    recommendation), then dumps the same dictionary to ``output_path``.

	    Args:
	        output_path: File the JSON dump is written to. Defaults to
	            ``embedding_model_analysis.json``, resolved against the current
	            working directory, which matches the previously hard-coded
	            location. An existing file is overwritten.

	    Raises:
	        OSError: If ``output_path`` cannot be opened for writing.
	        KeyError: If ``ANALYSIS`` lacks one of the keys the report reads.
	    """
	    rule = "=" * 70

	    print(rule)
	    print("EMBEDDING MODEL ANALYSIS - Chinese Transcript Deduplication")
	    print(rule)

	    # Current model
	    curr = ANALYSIS["current_model"]
	    print(f"\n🎯 CURRENT MODEL: {curr['name']}")
	    print(f" Repository: {curr['repo']}")
	    print(f" Parameters: {curr['params']}")

	    print("\n ✅ Pros:")
	    for pro in curr['pros']:
	        print(f" • {pro}")

	    print("\n ⚠️ Cons:")
	    for con in curr['cons']:
	        print(f" • {con}")

	    print("\n 📊 Test Results:")
	    for test_name, results in curr['test_results'].items():
	        print(f" {test_name}:")
	        print(f" - Duplicate rate: {results['duplicate_rate']}")
	        print(f" - Accuracy: {results['deduplication_accuracy']}")

	    # Alternatives
	    print("\n" + rule)
	    print("ALTERNATIVES (if needed)")
	    print(rule)

	    # Only the entries are needed; the dictionary keys are never displayed.
	    for alt in ANALYSIS["alternatives"].values():
	        print(f"\n🔍 {alt['name']} ({alt['params']})")
	        print(f" Repo: {alt['repo']}")
	        # "gguf_repo" is optional, so the line is printed only when present.
	        if 'gguf_repo' in alt:
	            print(f" GGUF: {alt['gguf_repo']}")

	        print(" ✅ Pros:")
	        for pro in alt['pros']:
	            print(f" • {pro}")

	        print(" ⚠️ Cons:")
	        for con in alt['cons']:
	            print(f" • {con}")

	        print(f" 💡 Recommendation: {alt['recommendation']}")

	    # Final recommendation
	    rec = ANALYSIS["recommendation"]
	    print("\n" + rule)
	    print("FINAL RECOMMENDATION")
	    print(rule)
	    print(f"\n✅ {rec['current_status']}")

	    print("\n📝 Rationale:")
	    for r in rec['rationale']:
	        print(f" • {r}")

	    print("\n🔄 When to Consider Upgrade:")
	    for when in rec['when_to_upgrade']:
	        print(f" • {when}")

	    print("\n⚙️ Suggested Similarity Thresholds:")
	    for thresh_type, value in rec['suggested_thresholds'].items():
	        print(f" • {thresh_type.capitalize()}: {value}")

	    # Save to file. ensure_ascii=False keeps non-ASCII text readable in the
	    # dump, which is why the file is opened explicitly as UTF-8.
	    with open(output_path, 'w', encoding='utf-8') as f:
	        json.dump(ANALYSIS, f, indent=2, ensure_ascii=False)

	    # Report the path that was actually written instead of a second
	    # hard-coded copy of the filename.
	    print(f"\n💾 Analysis saved to: {output_path}")

	# Script entry point: produce the report only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    print_analysis()