File size: 523 Bytes
b7f3196
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import heapq

from rank_bm25 import BM25Okapi

from .utils import tokenize

class BM25Index:
    """BM25 ranking index over a collection of documents.

    Each document's ``text`` attribute is tokenized with the package's
    ``tokenize`` helper and the resulting token lists are handed to
    ``rank_bm25.BM25Okapi``.
    """

    def __init__(self, docs):
        """Tokenize ``docs`` and build the BM25 model.

        Args:
            docs: Iterable of objects exposing a ``text`` attribute.
        """
        # Snapshot into a list: the documents are iterated here and indexed
        # by position in ``search``, so a one-shot iterable (generator)
        # would otherwise be exhausted or unindexable.
        self.docs = list(docs)
        self.corpus_tokens = [tokenize(d.text) for d in self.docs]
        # BM25Okapi cannot be built over an empty corpus (it divides by the
        # corpus size), so an empty index simply carries no model.
        self.bm25 = BM25Okapi(self.corpus_tokens) if self.corpus_tokens else None

    def search(self, query: str, k: int = 50) -> list:
        """Return the ``k`` best-scoring documents for ``query``.

        Args:
            query: Raw query string; tokenized with the same ``tokenize``
                helper used for the corpus.
            k: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            A list of ``(doc, score)`` tuples ordered by descending score,
            where ``score`` is a plain ``float``. Documents with equal
            scores keep their original corpus order. The list is empty
            when the index holds no documents.
        """
        # Guard first: a negative ``k`` used as a slice bound would drop
        # results from the tail of the ranking instead of returning none.
        if k <= 0 or self.bm25 is None:
            return []
        scores = self.bm25.get_scores(tokenize(query))
        # nlargest is documented as equivalent to
        # sorted(..., reverse=True)[:k], without ordering the full corpus.
        top = heapq.nlargest(k, range(len(scores)), key=lambda i: scores[i])
        return [(self.docs[i], float(scores[i])) for i in top]