# Enterprise_RAG/src/pipeline/context_opt.py
# Context-optimization helpers for the RAG pipeline: MMR-based selection
# and document de-duplication.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def maximal_marginal_relevance(query_embedding: np.ndarray, doc_embeddings: np.ndarray, lambda_mult: float = 0.5, top_k: int = 5):
    """Select up to ``top_k`` document indices via Maximal Marginal Relevance.

    MMR balances relevance to the query against redundancy with documents
    already selected:

        score(d) = lambda_mult * sim(q, d) - (1 - lambda_mult) * max_s sim(d, s)

    Args:
        query_embedding: Query vector; flattened to 1-D before use.
        doc_embeddings: 2-D array-like, one embedding per row.
        lambda_mult: Relevance/diversity trade-off in [0, 1];
            1.0 = pure relevance, 0.0 = pure diversity.
        top_k: Maximum number of documents to select.

    Returns:
        List of selected row indices in selection order (shorter than
        ``top_k`` when fewer documents are available).
    """
    if len(doc_embeddings) == 0:
        return []
    docs = np.asarray(doc_embeddings, dtype=float)
    query = np.asarray(query_embedding, dtype=float).reshape(-1)
    # L2-normalize once so every cosine similarity is a plain dot product.
    # Zero-norm vectors get norm 1 (i.e. similarity 0), matching sklearn's
    # cosine_similarity behaviour.
    doc_norms = np.linalg.norm(docs, axis=1)
    doc_norms[doc_norms == 0.0] = 1.0
    unit_docs = docs / doc_norms[:, np.newaxis]
    query_norm = np.linalg.norm(query)
    unit_query = query / (query_norm if query_norm != 0.0 else 1.0)
    # Hoisted loop-invariant work: relevance and pairwise similarities are
    # computed once up front instead of once per outer iteration (the
    # original issued O(top_k * n) cosine_similarity calls).
    relevance = unit_docs @ unit_query
    pairwise = unit_docs @ unit_docs.T
    selected_indices = []
    candidate_indices = list(range(len(docs)))
    # Bound by the candidate count: iterating past it could only produce
    # no-op rounds (the original skipped them via best_idx == -1).
    for _ in range(min(top_k, len(docs))):
        best_score = -np.inf
        best_idx = -1
        for idx in candidate_indices:
            # Redundancy penalty: closest similarity to anything chosen so far.
            if selected_indices:
                div_score = pairwise[idx, selected_indices].max()
            else:
                div_score = 0.0
            mmr_score = lambda_mult * relevance[idx] - (1 - lambda_mult) * div_score
            # Strict '>' keeps the original first-candidate tie-breaking.
            if mmr_score > best_score:
                best_score = mmr_score
                best_idx = idx
        selected_indices.append(best_idx)
        candidate_indices.remove(best_idx)
    return selected_indices
def deduplicate_docs(docs: list[dict], threshold: float = 0.95) -> list[dict]:
    """Drop documents whose content exactly matches an earlier one.

    Keeps the first occurrence of each content string; order is preserved.
    ``threshold`` is reserved for a future fuzzy-similarity mode and is
    currently unused — matching is exact.

    Args:
        docs: Items that are either raw strings or dicts with a 'content' key.
        threshold: Unused placeholder for near-duplicate similarity cutoff.

    Returns:
        A new list containing only the first occurrence of each content.
    """
    kept: list[dict] = []
    seen_contents: set = set()
    for entry in docs:
        # Accept both plain strings and {'content': ...} dicts.
        text = entry if isinstance(entry, str) else entry.get('content', '')
        if text in seen_contents:
            continue
        seen_contents.add(text)
        kept.append(entry)
    return kept