File size: 1,960 Bytes
9c4c212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def _normalize_rows(matrix: np.ndarray) -> np.ndarray:
    """Scale each row of a 2-D array to unit L2 norm, leaving all-zero rows as zeros."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # A zero row has no direction; dividing by 1 keeps it at zero so that its
    # cosine similarity with anything is 0 instead of NaN.
    norms[norms == 0.0] = 1.0
    return matrix / norms


def maximal_marginal_relevance(query_embedding: np.ndarray, doc_embeddings: np.ndarray, lambda_mult: float = 0.5, top_k: int = 5):
    """
    Greedily select documents that are relevant to the query but diverse from each other.

    At every step the not-yet-selected document with the highest score

        lambda_mult * cos(query, doc) - (1 - lambda_mult) * max cos(doc, selected)

    is picked; the second term is 0 while nothing has been selected yet.
    Ties go to the lowest index.

    Args:
        query_embedding: Query vector; it is flattened to one dimension.
        doc_embeddings: One embedding per row; each row is flattened.
        lambda_mult: Weight of the relevance term; ``1 - lambda_mult`` weights
            the redundancy penalty.
        top_k: Maximum number of documents to select.

    Returns:
        List of row indices into ``doc_embeddings`` in selection order. Its
        length is ``min(top_k, len(doc_embeddings))``, or 0 when ``top_k`` is
        not positive.

    Raises:
        ValueError: If an embedding contains NaN or infinity, or if the query
            and document dimensions do not match.
    """
    num_docs = len(doc_embeddings)
    # There is nothing left to pick once every document has been selected, so
    # never iterate more often than there are documents.
    num_to_pick = min(top_k, num_docs)
    if num_to_pick <= 0:
        return []

    docs = np.asarray(doc_embeddings, dtype=float).reshape(num_docs, -1)
    query = np.asarray(query_embedding, dtype=float).reshape(1, -1)
    if not (np.isfinite(docs).all() and np.isfinite(query).all()):
        raise ValueError("Embeddings must not contain NaN or infinity.")

    unit_docs = _normalize_rows(docs)
    unit_query = _normalize_rows(query)[0]

    # Relevance to the query does not depend on what has been selected, so it
    # is computed once for all documents instead of once per step.
    relevance = unit_docs @ unit_query

    selected_indices = []
    is_selected = np.zeros(num_docs, dtype=bool)
    # Highest similarity of each document to any selected one; the zeros stand
    # for the "nothing selected yet" case of the first step only.
    redundancy = np.zeros(num_docs)

    for _ in range(num_to_pick):
        scores = lambda_mult * relevance - (1 - lambda_mult) * redundancy
        # Already selected documents must never win again.
        scores[is_selected] = -np.inf
        # argmax returns the first maximum, i.e. the lowest index on ties.
        best_idx = int(np.argmax(scores))
        selected_indices.append(best_idx)
        is_selected[best_idx] = True

        sims_to_best = unit_docs @ unit_docs[best_idx]
        if len(selected_indices) == 1:
            # Replace the placeholder zeros: real similarities may be negative.
            redundancy = sims_to_best
        else:
            redundancy = np.maximum(redundancy, sims_to_best)

    return selected_indices

def deduplicate_docs(docs: list[dict], threshold: float = 0.95) -> list[dict]:
    """
    Drop documents whose content exactly matches an earlier document.

    A document may be a plain string (the string itself is the content) or an
    object with ``.get``, in which case the value under 'content' is used and
    a missing key counts as empty content. The first occurrence of each
    content is kept and the input order is preserved.

    ``threshold`` is accepted but not used: only exact matches are removed,
    there is no near-duplicate detection.
    """
    kept = []
    known_contents = set()
    for entry in docs:
        if isinstance(entry, str):
            text = entry
        else:
            text = entry.get('content', '')
        if text in known_contents:
            continue
        known_contents.add(text)
        kept.append(entry)
    return kept