import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def maximal_marginal_relevance(query_embedding: np.ndarray, doc_embeddings: np.ndarray, lambda_mult: float = 0.5, top_k: int = 5) -> list[int]:
    """Select up to ``top_k`` doc indices that are relevant to the query yet diverse.

    Greedy Maximal Marginal Relevance: at each step pick the candidate with the
    highest ``lambda_mult * relevance - (1 - lambda_mult) * max_sim_to_selected``.

    Args:
        query_embedding: 1-D (or 1xD) query vector.
        doc_embeddings: 2-D array, one document embedding per row.
        lambda_mult: trade-off in [0, 1]; 1.0 = pure relevance, 0.0 = pure diversity.
        top_k: maximum number of indices to return (clamped to the doc count).

    Returns:
        Indices into ``doc_embeddings`` in selection order; empty list if no docs.
    """
    n_docs = len(doc_embeddings)
    if n_docs == 0:
        return []

    def _row_normalize(m: np.ndarray) -> np.ndarray:
        # L2-normalize rows; zero-norm rows are left as zeros (cosine sim 0),
        # matching sklearn's cosine_similarity convention.
        m = np.atleast_2d(np.asarray(m, dtype=float))
        norms = np.linalg.norm(m, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return m / norms

    docs_n = _row_normalize(doc_embeddings)
    query_n = _row_normalize(np.asarray(query_embedding).reshape(1, -1))

    # Precompute all similarities ONCE. The original recomputed the same
    # query-doc cosine similarity on every outer iteration and rebuilt the
    # selected-doc similarities per candidate (O(top_k * n) pairwise calls).
    rel_scores = docs_n @ query_n[0]      # relevance of each doc to the query
    doc_sims = docs_n @ docs_n.T          # doc-doc cosine similarity matrix

    selected_indices: list[int] = []
    candidate_indices = list(range(n_docs))
    for _ in range(min(top_k, n_docs)):   # clamp: never loop past the doc count
        best_score = -np.inf
        best_idx = -1
        for idx in candidate_indices:
            # Diversity penalty = max similarity to anything already chosen.
            div_score = doc_sims[idx, selected_indices].max() if selected_indices else 0.0
            mmr_score = lambda_mult * rel_scores[idx] - (1 - lambda_mult) * div_score
            if mmr_score > best_score:
                best_score = mmr_score
                best_idx = idx
        selected_indices.append(best_idx)
        candidate_indices.remove(best_idx)
    return selected_indices
def deduplicate_docs(docs: list[dict], threshold: float = 0.95) -> list[dict]:
"""
Remove near-duplicates based on content string similarity (simple)
or just exact match for now to be fast.
"""
seen = set()
unique_docs = []
for doc in docs:
# Assuming doc is a string or dict with 'content'
content = doc if isinstance(doc, str) else doc.get('content', '')
if content not in seen:
seen.add(content)
unique_docs.append(doc)
return unique_docs