# Source: clusd-search/src/baselines.py — commit 4b3b4fa ("CluSD end-to-end app", Ishika-max), 1.29 kB
import numpy as np
def dense_only_retrieve(query_emb, doc_embeddings, top_k=1000):
    """Rank every document by its dot product with the query.

    Args:
        query_emb: Query vector.  Assumed to be 1-D with the same width as
            the rows of ``doc_embeddings`` -- TODO confirm against callers.
        doc_embeddings: Document matrix, one row per document (assumed 2-D).
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(doc_index, score)`` tuples, highest score first, where
        ``doc_index`` is a plain ``int`` row index into ``doc_embeddings``
        and ``score`` is a plain ``float``.
    """
    scores = np.dot(doc_embeddings, query_emb)
    # argsort is ascending, so reverse it to walk from best to worst.
    ranking = np.argsort(scores)[::-1]

    hits = []
    for doc_idx in ranking[:top_k]:
        hits.append((int(doc_idx), float(scores[doc_idx])))
    return hits
def ivf_retrieve(query_emb, cluster_manager, doc_embeddings, top_clusters=5):
    """Score only the documents belonging to the clusters closest to the query.

    The query is compared against ``cluster_manager.centroids`` by dot
    product, the ``top_clusters`` best-scoring clusters are selected, and
    every document listed for those clusters in
    ``cluster_manager.cluster_to_docs`` is scored against the query.

    Args:
        query_emb: Query vector (assumed 1-D -- TODO confirm against callers).
        cluster_manager: Object exposing ``centroids`` (one row per cluster)
            and ``cluster_to_docs`` (indexable by cluster id, yielding the
            document ids in that cluster).
        doc_embeddings: Document matrix indexed by document id.
        top_clusters: Number of best-scoring clusters to visit.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by descending score, with
        ``doc_id`` as a plain ``int`` and ``score`` as a plain ``float``.
        Returns ``[]`` when the selected clusters contain no documents.
    """
    centroid_scores = np.dot(cluster_manager.centroids, query_emb)
    # argsort is ascending; reverse to visit the best-matching clusters first.
    nearest_clusters = np.argsort(centroid_scores)[::-1][:top_clusters]

    candidate_docs = []
    for cluster_id in nearest_clusters:
        # Cast to int so callers get plain Python ints even when the
        # per-cluster id lists are NumPy arrays (np.int64 is not
        # JSON-serialisable and differs from what dense_only_retrieve returns).
        candidate_docs.extend(
            int(doc_id) for doc_id in cluster_manager.cluster_to_docs[cluster_id]
        )

    if not candidate_docs:
        return []

    candidate_scores = np.dot(doc_embeddings[candidate_docs], query_emb)
    results = [
        (doc_id, float(score))
        for doc_id, score in zip(candidate_docs, candidate_scores)
    ]
    results.sort(key=lambda pair: pair[1], reverse=True)
    return results
def rerank_retrieve(query_emb, sparse_results, doc_embeddings, top_sparse=1000):
    """Re-score the head of a sparse result list with dense dot products.

    Args:
        query_emb: Query vector (assumed 1-D -- TODO confirm against callers).
        sparse_results: Sequence of result records whose first element is a
            document id; only the first ``top_sparse`` entries are used.
        doc_embeddings: Document matrix indexed by document id.
        top_sparse: How many leading sparse results to re-rank.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by descending dense
        score, with ids passed through unchanged from ``sparse_results``.
        Returns ``[]`` when there are no candidates.
    """
    shortlist = [hit[0] for hit in sparse_results[:top_sparse]]
    if len(shortlist) == 0:
        return []

    dense_scores = np.dot(doc_embeddings[shortlist], query_emb)
    # sorted() is stable, so equal scores keep their sparse-list order.
    return sorted(
        zip(shortlist, map(float, dense_scores)),
        key=lambda pair: pair[1],
        reverse=True,
    )