from rank_bm25 import BM25Okapi from .utils import tokenize class BM25Index: def __init__(self, docs): self.docs = docs self.corpus_tokens = [tokenize(d.text) for d in docs] self.bm25 = BM25Okapi(self.corpus_tokens) def search(self, query: str, k: int = 50): q = tokenize(query) scores = self.bm25.get_scores(q) top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k] return [(self.docs[i], float(scores[i])) for i in top]