clusd-search / src /sparse.py
Ishika-max
CluSD end-to-end app
4b3b4fa
Raw
History Blame Contribute Delete
500 Bytes
import numpy as np
from rank_bm25 import BM25Okapi
class SparseRetriever:
    """BM25 lexical retriever over an in-memory list of documents.

    Documents and queries are tokenized identically (lower-cased, then split
    on whitespace) and scored with ``rank_bm25.BM25Okapi``.
    """

    def __init__(self, doc_texts):
        """Tokenize the corpus and build the BM25 index.

        Args:
            doc_texts: Iterable of document strings. Result ids returned by
                ``retrieve`` are indices into the score array produced by
                the index, which is expected to follow this iteration order.
        """
        # NOTE(review): behaviour for an empty corpus is whatever BM25Okapi
        # does with an empty list -- confirm before relying on it.
        self.tokenized_corpus = [self._tokenize(text) for text in doc_texts]
        self.bm25 = BM25Okapi(self.tokenized_corpus)

    @staticmethod
    def _tokenize(text):
        """Lower-case ``text`` and split it on runs of whitespace."""
        return text.lower().split()

    def retrieve(self, query_text: str, top_k=1000):
        """Return the highest-scoring documents for a query.

        Args:
            query_text: Raw query string; tokenized like the corpus.
            top_k: Maximum number of results. ``None`` returns every
                document; zero or a negative value returns an empty list.

        Returns:
            A list of ``(doc_index, score)`` tuples (``int``, ``float``)
            sorted by descending score. Documents with equal scores are
            ordered by ascending index so the ranking is deterministic.
        """
        # A negative slice bound would silently drop results from the tail
        # instead of limiting the result count, so reject it up front.
        if top_k is not None and top_k <= 0:
            return []

        scores = np.asarray(self.bm25.get_scores(self._tokenize(query_text)))

        # Stable sort on the negated scores: descending by score, and ties
        # (common for documents sharing no term with the query) keep their
        # original index order rather than an unspecified one.
        ranked = np.argsort(-scores, kind="stable")[:top_k]
        return [(int(idx), float(scores[idx])) for idx in ranked]