File size: 579 Bytes
c35e65d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class TfidfRetriever:
    """Retrieve the stored texts most similar to a query.

    The corpus is vectorized once at construction time with unigram and
    bigram TF-IDF features. Queries are projected into the same feature
    space and ranked by cosine similarity against every stored text.
    """

    def __init__(self, train_texts):
        """Fit the TF-IDF vectorizer and remember the corpus.

        Args:
            train_texts: Iterable of strings forming the retrieval corpus.
                One-shot iterables such as generators are accepted.
        """
        # Materialize first: a generator would be exhausted by
        # fit_transform, leaving self.texts empty and out of step with
        # the rows of self.X.
        self.texts = list(train_texts)
        self.vec = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
        self.X = self.vec.fit_transform(self.texts)

    def topk(self, query_text, k=1):
        """Return the *k* stored texts most similar to *query_text*.

        Args:
            query_text: The query string to match against the corpus.
            k: Maximum number of results. Values larger than the corpus
                size return every text; ``None`` also returns every text;
                zero or negative values return an empty list.

        Returns:
            A list of ``(text, similarity)`` tuples ordered by descending
            cosine similarity, where ``similarity`` is a Python float.
            Texts with equal similarity keep their corpus order.
        """
        # A negative k would otherwise slice off the *worst* matches
        # instead of limiting the result, so treat it like k == 0.
        if k is not None and k <= 0:
            return []

        q = self.vec.transform([query_text])
        sims = cosine_similarity(q, self.X)[0]
        # Stable sort keeps tied scores in corpus order, making the
        # ranking deterministic.
        idxs = np.argsort(-sims, kind="stable")[:k]
        return [(self.texts[i], float(sims[i])) for i in idxs]