e5-math-smart-binary / usage_examples.md

Smart Binary E5-Math Model - MRR: 0.9526 (+0.0414), Hit@3: 1.0000 (+0.0000) - 2025-07-02

18d7e32 verified 7 months ago

11.2 kB

	# Smart Binary Model: Usage Examples

	## 1. Basic Retrieval Example
	```python
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np

	# Load the fine-tuned smart binary model from the Hugging Face Hub.
	model = SentenceTransformer('ThanhLe0125/e5-math-smart-binary')

	# One example query and four candidate passages, written with the
	# "query: " / "passage: " prefixes used throughout these examples.
	query = "query: Định nghĩa đạo hàm của hàm số"
	chunks = [
	    "passage: Đạo hàm của hàm số f(x) tại x₀ là giới hạn của tỉ số...",
	    "passage: Các quy tắc tính đạo hàm: (xⁿ)' = nxⁿ⁻¹, (sin x)' = cos x...",
	    "passage: Tích phân xác định của hàm số trên đoạn [a,b]...",
	    "passage: Phương trình vi phân bậc nhất có dạng y' + P(x)y = Q(x)",
	]
	# Hand-assigned relevance label for each passage, in the same order as `chunks`.
	chunk_labels = ("CORRECT", "RELATED", "IRRELEVANT", "IRRELEVANT")

	# Embed the query and the passages, then score every passage against the query.
	query_emb = model.encode([query])
	chunk_embs = model.encode(chunks)
	similarities = cosine_similarity(query_emb, chunk_embs)[0]

	print("Smart Binary Rankings:")
	# argsort is ascending, so reverse it to list the best match first.
	ranked_indices = np.argsort(similarities)[::-1]
	for rank, idx in enumerate(ranked_indices, 1):
	    chunk_type = chunk_labels[idx]
	    score = similarities[idx]
	    preview = chunks[idx][:70]
	    print(f"Rank {rank}: {chunk_type} (Score: {score:.4f})")
	    print(f" {preview}...")
	    print()

	# Expected smart binary results (approximate scores quoted by the example):
	# Rank 1: CORRECT (Score: ~0.87)
	# Rank 2: RELATED (Score: ~0.65)
	# Rank 3: IRRELEVANT (Score: ~0.25)
	# Rank 4: IRRELEVANT (Score: ~0.20)
	```

	## 2. Batch Processing Multiple Queries
	```python
	# Several Vietnamese math queries, each already carrying the "query: " prefix.
	queries = [
	    "query: Cách giải phương trình bậc hai",
	    "query: Định nghĩa hàm số đồng biến",
	    "query: Công thức tính thể tích hình cầu",
	]

	# Candidate passages shared by every query.
	math_content_pool = [
	    "passage: Phương trình bậc hai ax² + bx + c = 0 có nghiệm x = (-b ± √Δ)/2a",
	    "passage: Hàm số đồng biến trên khoảng I khi f'(x) > 0 với mọi x ∈ I",
	    "passage: Thể tích hình cầu bán kính R là V = (4/3)πR³",
	    "passage: Diện tích hình tròn bán kính r là S = πr²",
	    "passage: Định lý Pythagoras: a² + b² = c² trong tam giác vuông",
	]

	# The passage pool is the same for every query, so embed it once instead of
	# once per loop iteration; the queries are embedded together as one batch.
	# `model` is the SentenceTransformer loaded in the basic retrieval example.
	chunk_embs = model.encode(math_content_pool)
	query_embs = model.encode(queries)
	# Row i holds the similarities of queries[i] against every passage.
	similarity_matrix = cosine_similarity(query_embs, chunk_embs)

	for query, similarities in zip(queries, similarity_matrix):
	    print(f"\nQuery: {query.replace('query: ', '')}")

	    # Indices of the three highest-scoring passages, best first.
	    top_3_indices = similarities.argsort()[::-1][:3]

	    for rank, idx in enumerate(top_3_indices, 1):
	        score = similarities[idx]
	        confidence = "HIGH" if score > 0.8 else "MEDIUM" if score > 0.5 else "LOW"
	        print(f" {rank}. [{confidence}] {score:.3f} - {math_content_pool[idx]}")
	```

	## 3. Production Class Implementation
	```python
	class SmartBinaryMathRetriever:
	    """Rank text chunks against a query with a SentenceTransformer model.

	    Queries and chunks are given the E5 ``query: `` / ``passage: `` prefixes
	    when they do not already start with them, embedded with the model, and
	    scored by cosine similarity.
	    """

	    # (lower bound, label) pairs, checked from the highest bound down.
	    _CONFIDENCE_LEVELS = (
	        (0.85, "VERY_HIGH"),
	        (0.7, "HIGH"),
	        (0.5, "MEDIUM"),
	        (0.3, "LOW"),
	    )

	    def __init__(self, model_name='ThanhLe0125/e5-math-smart-binary'):
	        """Load the SentenceTransformer model identified by ``model_name``."""
	        self.model = SentenceTransformer(model_name)
	        print(f"Smart Binary Model loaded: {model_name}")

	    @staticmethod
	    def _format_query(query):
	        """Return ``query`` with the ``query: `` prefix, adding it if absent."""
	        return query if query.startswith("query:") else f"query: {query}"

	    @staticmethod
	    def _format_chunks(chunks):
	        """Return ``chunks`` with the ``passage: `` prefix added where absent."""
	        return [
	            chunk if chunk.startswith("passage:") else f"passage: {chunk}"
	            for chunk in chunks
	        ]

	    def _rank(self, similarities, chunks, top_k, min_confidence):
	        """Turn one row of similarity scores into a ranked list of result dicts.

	        Scores below ``min_confidence`` are dropped, the rest are sorted from
	        highest to lowest, cut to ``top_k`` entries and numbered from 1.
	        """
	        results = [
	            {
	                'chunk_index': idx,
	                'chunk': chunks[idx],  # the caller's text, without the added prefix
	                'similarity': float(similarity),
	                'confidence_level': self._get_confidence_level(similarity),
	            }
	            for idx, similarity in enumerate(similarities)
	            if similarity >= min_confidence
	        ]
	        results.sort(key=lambda item: item['similarity'], reverse=True)
	        results = results[:top_k]
	        for rank, result in enumerate(results, 1):
	            result['rank'] = rank
	        return results

	    def retrieve_with_confidence(self, query, chunks, top_k=5, min_confidence=0.3):
	        """
	        Retrieve the chunks most similar to ``query``, with confidence labels.

	        Args:
	            query: Vietnamese math question
	            chunks: List of educational content
	            top_k: Number of results to return
	            min_confidence: Minimum similarity threshold

	        Returns:
	            A list of dicts with the keys ``chunk_index``, ``chunk``,
	            ``similarity``, ``confidence_level`` and ``rank``, ordered from
	            the highest similarity down. The list is empty when ``chunks``
	            is empty or no chunk reaches ``min_confidence``.
	        """
	        # There is nothing to rank, and an empty embedding array cannot be
	        # passed to cosine_similarity.
	        if len(chunks) == 0:
	            return []

	        query_emb = self.model.encode([self._format_query(query)])
	        chunk_embs = self.model.encode(self._format_chunks(chunks))
	        similarities = cosine_similarity(query_emb, chunk_embs)[0]
	        return self._rank(similarities, chunks, top_k, min_confidence)

	    def _get_confidence_level(self, similarity):
	        """Convert similarity to confidence level"""
	        for lower_bound, label in self._CONFIDENCE_LEVELS:
	            if similarity >= lower_bound:
	                return label
	        return "VERY_LOW"

	    def batch_retrieve(self, queries, chunk_pool, top_k_per_query=3, min_confidence=0.3):
	        """Rank ``chunk_pool`` against every query, embedding the pool only once.

	        Args:
	            queries: List of query strings
	            chunk_pool: List of chunks shared by all queries
	            top_k_per_query: Number of results to keep for each query
	            min_confidence: Minimum similarity threshold

	        Returns:
	            A dict mapping each query string, as passed in, to its ranked
	            result list in the format of ``retrieve_with_confidence``.
	        """
	        if len(queries) == 0:
	            return {}
	        if len(chunk_pool) == 0:
	            return {query: [] for query in queries}

	        # The pool is identical for every query, so embed it a single time
	        # and score all queries with one similarity-matrix computation.
	        chunk_embs = self.model.encode(self._format_chunks(chunk_pool))
	        query_embs = self.model.encode([self._format_query(query) for query in queries])
	        similarity_matrix = cosine_similarity(query_embs, chunk_embs)

	        return {
	            query: self._rank(row, chunk_pool, top_k_per_query, min_confidence)
	            for query, row in zip(queries, similarity_matrix)
	        }

	# Usage example: build the retriever once, then reuse it for queries.
	retriever = SmartBinaryMathRetriever()

	# A single query and its candidate chunks, given without any prefix.
	query = "Cách tính đạo hàm của hàm hợp"
	chunks = [
	    "Đạo hàm hàm hợp: (f(g(x)))' = f'(g(x)) × g'(x)",
	    "Ví dụ: Tính đạo hàm của (x² + 1)³",
	    "Tích phân từng phần: ∫u dv = uv - ∫v du",
	]

	# Keep at most three results whose similarity is at least 0.2.
	results = retriever.retrieve_with_confidence(
	    query,
	    chunks,
	    top_k=3,
	    min_confidence=0.2,
	)

	print("Smart Binary Retrieval Results:")
	for result in results:
	    rank_line = f"Rank {result['rank']}: {result['confidence_level']}"
	    similarity_line = f" Similarity: {result['similarity']:.4f}"
	    content_line = f" Content: {result['chunk'][:60]}..."
	    print(rank_line)
	    print(similarity_line)
	    print(content_line)
	    print()
	```

	## 4. Comparison and Evaluation
	```python
	# Compare the smart binary model with the base model
	def compare_models(query, chunks):
	    """Print each chunk's cosine similarity to ``query`` under both models.

	    Loads ``intfloat/multilingual-e5-base`` and
	    ``ThanhLe0125/e5-math-smart-binary``, embeds the prefixed query and
	    chunks with each, and prints the two scores and their difference for
	    every chunk.

	    Args:
	        query: Query text without the ``query: `` prefix
	        chunks: List of chunk texts without the ``passage: `` prefix
	    """
	    # Load both models.
	    base_model = SentenceTransformer('intfloat/multilingual-e5-base')
	    smart_binary_model = SentenceTransformer('ThanhLe0125/e5-math-smart-binary')

	    # Add the query / passage prefixes.
	    e5_query = [f"query: {query}"]
	    e5_passages = [f"passage: {chunk}" for chunk in chunks]

	    # Embed with each model: queries first, then passages.
	    query_emb_base = base_model.encode(e5_query)
	    query_emb_smart = smart_binary_model.encode(e5_query)
	    chunk_embs_base = base_model.encode(e5_passages)
	    chunk_embs_smart = smart_binary_model.encode(e5_passages)

	    # One row of scores per model, aligned with `chunks`.
	    similarities_base = cosine_similarity(query_emb_base, chunk_embs_base)[0]
	    similarities_smart = cosine_similarity(query_emb_smart, chunk_embs_smart)[0]

	    # Side-by-side report.
	    print(f"Query: {query}")
	    print("=" * 50)

	    scored_chunks = zip(chunks, similarities_base, similarities_smart)
	    for number, (chunk, base_score, smart_score) in enumerate(scored_chunks, 1):
	        improvement = smart_score - base_score

	        print(f"Chunk {number}:")
	        print(f" Base Model: {base_score:.4f}")
	        print(f" Smart Binary: {smart_score:.4f}")
	        print(f" Improvement: {improvement:+.4f}")
	        print(f" Content: {chunk[:50]}...")
	        print()

	# Example comparison: one query against a correct, a related and an
	# irrelevant chunk (in that order).
	comparison_query = "Định nghĩa hàm số liên tục"
	comparison_chunks = [
	    "Hàm số f liên tục tại x₀ nếu lim(x→x₀) f(x) = f(x₀)",
	    "Ví dụ hàm số liên tục: f(x) = x², g(x) = sin(x)",
	    "Phương trình vi phân có nghiệm tổng quát y = Ce^x",
	]
	compare_models(comparison_query, comparison_chunks)
	```

	## 5. Advanced Analytics
	```python
	def analyze_smart_binary_performance(queries, chunks, ground_truth):
	    """
	    Compute and print MRR and Hit@1/3/5 for the smart binary model.

	    Every query is scored against the same chunk list by cosine similarity;
	    the rank of its ground-truth chunk feeds the metrics. A query whose
	    ground-truth index matches no chunk counts as a miss: it contributes 0
	    to MRR and to every Hit@K, so all metrics share the same denominator.

	    Args:
	        queries: List of test queries
	        chunks: List of content chunks
	        ground_truth: List of correct chunk indices for each query

	    Returns:
	        Dict with the keys ``mrr``, ``hit_at_1``, ``hit_at_3`` and
	        ``hit_at_5``. All values are 0.0 when ``queries`` or ``chunks``
	        is empty.
	    """
	    total_queries = len(queries)
	    hit_counts = {1: 0, 3: 0, 5: 0}
	    reciprocal_ranks = []

	    if total_queries and len(chunks):
	        model = SentenceTransformer('ThanhLe0125/e5-math-smart-binary')

	        # The chunk pool is the same for every query: embed it once rather
	        # than once per query, and embed the queries as a single batch.
	        chunk_embs = model.encode([f"passage: {chunk}" for chunk in chunks])
	        query_embs = model.encode([f"query: {query}" for query in queries])
	        # Row i holds the similarities of queries[i] against every chunk.
	        similarity_matrix = cosine_similarity(query_embs, chunk_embs)

	        for i, similarities in enumerate(similarity_matrix):
	            # Chunk indices ordered from most to least similar.
	            ranked_indices = similarities.argsort()[::-1]
	            positions = np.flatnonzero(ranked_indices == ground_truth[i])

	            if positions.size == 0:
	                # Ground-truth index is not a valid chunk index: a miss.
	                reciprocal_ranks.append(0.0)
	                continue

	            correct_rank = int(positions[0]) + 1  # ranks are 1-based
	            reciprocal_ranks.append(1.0 / correct_rank)
	            for k in hit_counts:
	                if correct_rank <= k:
	                    hit_counts[k] += 1
	    else:
	        # Without chunks nothing can be ranked, so every query is a miss.
	        reciprocal_ranks = [0.0] * total_queries

	    # Guard the averages so an empty query list reports zeros instead of
	    # dividing by zero.
	    avg_mrr = float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0
	    hit_1_rate = hit_counts[1] / total_queries if total_queries else 0.0
	    hit_3_rate = hit_counts[3] / total_queries if total_queries else 0.0
	    hit_5_rate = hit_counts[5] / total_queries if total_queries else 0.0

	    print("Smart Binary Model Performance Analysis:")
	    print(f" MRR (Mean Reciprocal Rank): {avg_mrr:.4f}")
	    print(f" Hit@1 (Accuracy): {hit_1_rate:.4f} ({hit_counts[1]}/{total_queries})")
	    print(f" Hit@3: {hit_3_rate:.4f} ({hit_counts[3]}/{total_queries})")
	    print(f" Hit@5: {hit_5_rate:.4f} ({hit_counts[5]}/{total_queries})")

	    return {
	        'mrr': avg_mrr,
	        'hit_at_1': hit_1_rate,
	        'hit_at_3': hit_3_rate,
	        'hit_at_5': hit_5_rate,
	    }

	# Example usage: three queries whose correct chunks sit at indices 0, 1
	# and 2 of the pool, followed by two chunks that answer none of them.
	test_queries = [
	    "Công thức tính đạo hàm",
	    "Định nghĩa tích phân",
	    "Cách giải phương trình bậc hai",
	]

	relevant_chunks = [
	    "Đạo hàm của hàm số f(x) = lim[h→0] (f(x+h)-f(x))/h",  # answers query 1
	    "Tích phân của f(x) trên [a,b] = ∫[a,b] f(x)dx",  # answers query 2
	    "Nghiệm phương trình ax²+bx+c=0 là x = (-b±√Δ)/2a",  # answers query 3
	]
	distractor_chunks = [
	    "Định lý vi phân trung bình",
	    "Công thức Taylor",
	]
	test_chunks = relevant_chunks + distractor_chunks

	# Query i is answered by chunk i, giving [0, 1, 2].
	ground_truth = list(range(len(test_queries)))

	performance = analyze_smart_binary_performance(
	    test_queries,
	    test_chunks,
	    ground_truth,
	)
	```

	These examples demonstrate the smart binary model's balanced approach to precision and recall, which makes it well suited to retrieving Vietnamese mathematical content.