File size: 523 Bytes
b7f3196 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
from rank_bm25 import BM25Okapi
from .utils import tokenize
class BM25Index:
    """BM25 ranking index over a collection of documents.

    Each document's ``.text`` is tokenized with ``tokenize`` and the
    resulting token lists are handed to ``BM25Okapi``.
    """

    def __init__(self, docs):
        """Tokenize *docs* and build the BM25 scorer.

        Args:
            docs: Indexable sequence of objects exposing a ``.text``
                attribute. It is stored as-is (not copied) and indexed by
                position when returning search hits.
        """
        self.docs = docs
        self.corpus_tokens = [tokenize(d.text) for d in docs]
        # BM25Okapi divides by the number of documents while computing the
        # average document length, so an empty corpus would raise
        # ZeroDivisionError. Leave the scorer unset; search() then returns
        # no hits.
        self.bm25 = BM25Okapi(self.corpus_tokens) if self.corpus_tokens else None

    def search(self, query: str, k: int = 50):
        """Return up to *k* documents ranked by BM25 score for *query*.

        Args:
            query: Raw query string; tokenized with ``tokenize``.
            k: Maximum number of hits to return. Values <= 0 yield an
                empty list.

        Returns:
            A list of ``(doc, score)`` tuples in descending score order,
            with ``score`` converted to a plain ``float``.
        """
        # Guard before any scoring work: a negative k would otherwise slice
        # as "all but the last |k|" rather than "nothing". `None` is let
        # through because `[:None]` keeps every hit, as it always did.
        if self.bm25 is None or (k is not None and k <= 0):
            return []
        q = tokenize(query)
        scores = self.bm25.get_scores(q)
        top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
        return [(self.docs[i], float(scores[i])) for i in top]
|