"""Helpers for re-ranking and de-duplicating retrieved documents."""

import numpy as np

try:
    # Nothing below calls scikit-learn any more; the name is still imported
    # when the package is available so that code importing
    # ``cosine_similarity`` from this module keeps working.
    from sklearn.metrics.pairwise import cosine_similarity  # noqa: F401
except ImportError:
    pass


def _unit_rows(matrix: np.ndarray) -> np.ndarray:
    """Return *matrix* with every row scaled to unit L2 norm (zero rows stay zero)."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Dividing by 1 instead of 0 keeps all-zero rows at zero, so their cosine
    # similarity to anything is 0 rather than NaN.
    norms[norms == 0.0] = 1.0
    return matrix / norms


def maximal_marginal_relevance(
    query_embedding: np.ndarray,
    doc_embeddings: np.ndarray,
    lambda_mult: float = 0.5,
    top_k: int = 5,
) -> list[int]:
    """Greedily pick documents that are relevant to the query yet mutually diverse.

    At every step the remaining document with the highest score

        lambda_mult * cos(query, doc) - (1 - lambda_mult) * max cos(doc, selected)

    is chosen. The second term is 0 while nothing has been selected yet.

    Args:
        query_embedding: Query vector; it is flattened to one dimension.
        doc_embeddings: One embedding per row. Any array-like accepted by
            ``np.asarray`` works.
        lambda_mult: Trade-off weight. 1.0 ranks purely by relevance; lower
            values penalise similarity to already selected documents more.
        top_k: Maximum number of documents to select.

    Returns:
        Row indices into ``doc_embeddings`` in selection order. The list has
        at most ``min(top_k, number of documents)`` entries and is empty when
        there are no documents or ``top_k`` is not positive.
    """
    docs = np.asarray(doc_embeddings, dtype=float)
    if len(docs) == 0:
        return []

    n_docs = len(docs)
    unit_docs = _unit_rows(docs.reshape(n_docs, -1))
    unit_query = _unit_rows(np.asarray(query_embedding, dtype=float).reshape(1, -1))[0]

    # Relevance to the query does not change between picks: compute it once.
    relevance = unit_docs @ unit_query

    # Highest similarity of each document to any selected one. Starts at 0
    # because the first pick carries no redundancy penalty.
    redundancy = np.zeros(n_docs)
    available = np.ones(n_docs, dtype=bool)
    selected: list[int] = []

    # There is nothing left to pick once every document has been selected.
    for _ in range(min(top_k, n_docs)):
        scores = lambda_mult * relevance - (1 - lambda_mult) * redundancy
        # Already selected documents and NaN scores must never win.
        scores = np.where(available & ~np.isnan(scores), scores, -np.inf)

        # argmax returns the first maximum, so ties go to the lowest index.
        best = int(np.argmax(scores))
        if scores[best] == -np.inf:
            break  # no remaining document has a usable score

        selected.append(best)
        available[best] = False

        similarity_to_best = unit_docs @ unit_docs[best]
        if len(selected) == 1:
            # Replace the initial zeros outright: similarities can be
            # negative and must not be floored at 0.
            redundancy = similarity_to_best
        else:
            redundancy = np.maximum(redundancy, similarity_to_best)

    return selected


def deduplicate_docs(docs: list[dict], threshold: float = 0.95) -> list[dict]:
    """Drop documents whose content exactly matches an earlier document.

    Only exact string equality is checked; no fuzzy or near-duplicate
    matching is performed.

    Args:
        docs: Documents to filter. Each item is either a plain string, which
            is its own content, or a dict whose ``'content'`` entry is
            compared. Dicts without ``'content'`` all share the key ``''``,
            so only the first of them is kept.
        threshold: Currently unused; the function never reads it.

    Returns:
        The documents in their original order, keeping the first occurrence
        of each distinct content value.
    """
    seen = set()
    unique_docs = []
    for doc in docs:
        content = doc if isinstance(doc, str) else doc.get('content', '')
        if content in seen:
            continue
        seen.add(content)
        unique_docs.append(doc)
    return unique_docs