File size: 579 Bytes
c35e65d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class TfidfRetriever:
    """Similarity lookup over a fixed corpus using TF-IDF vectors.

    The corpus is vectorised once at construction time with unigram and
    bigram features; queries are then ranked against it by cosine
    similarity.
    """

    def __init__(self, train_texts):
        """Fit the vectoriser on *train_texts* and keep the texts for lookup.

        Args:
            train_texts: Iterable of strings forming the corpus. It is
                copied into a list before fitting, so a one-shot iterator
                (e.g. a generator) is safe to pass.
        """
        # Materialise first: fitting directly on a generator would exhaust
        # it and leave ``self.texts`` empty, so every later lookup would
        # fail with an IndexError.
        self.texts = list(train_texts)
        self.vec = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
        self.X = self.vec.fit_transform(self.texts)

    def topk(self, query_text, k=1):
        """Return the corpus texts most similar to *query_text*.

        Args:
            query_text: The query string to vectorise and compare.
            k: Maximum number of results. Values <= 0 yield an empty list;
                ``None`` returns the whole corpus, ranked.

        Returns:
            A list of ``(text, similarity)`` tuples ordered from most to
            least similar, with ``similarity`` as a Python ``float``. The
            list holds at most ``k`` entries (fewer if the corpus is
            smaller).
        """
        # A negative k would otherwise slice as "all but the last |k|",
        # which is never what a caller asking for "top k" means.
        if k is not None and k <= 0:
            return []
        q = self.vec.transform([query_text])
        sims = cosine_similarity(q, self.X)[0]
        # Stable sort so that equal scores keep corpus order, making the
        # result deterministic across runs and platforms.
        order = np.argsort(-sims, kind="stable")[:k]
        return [(self.texts[i], float(sims[i])) for i in order]