| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
class TfidfRetriever:
    """Nearest-text lookup over a fixed corpus using TF-IDF cosine similarity.

    The corpus is vectorized once at construction time with a
    ``TfidfVectorizer`` configured for unigrams and bigrams
    (``ngram_range=(1, 2)``, ``min_df=1``). Queries are projected into the
    same vector space and ranked by cosine similarity against every
    corpus row.

    Attributes set by ``__init__``:
        vec:   the fitted ``TfidfVectorizer``.
        X:     the TF-IDF matrix returned by ``fit_transform`` for the corpus.
        texts: the corpus as a list, in the same order as the rows of ``X``.
    """

    def __init__(self, train_texts):
        """Fit the vectorizer on ``train_texts`` and store the corpus matrix.

        Args:
            train_texts: iterable of text documents. It is copied into a
                list up front, so generators and other one-shot iterables
                are supported.
        """
        # Materialize before fitting: the corpus is needed twice (once to
        # fit, once to map row indices back to text). Consuming a one-shot
        # iterable in fit_transform first would leave `texts` empty and
        # out of step with the rows of `X`.
        self.texts = list(train_texts)
        self.vec = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
        self.X = self.vec.fit_transform(self.texts)

    def topk(self, query_text, k=1):
        """Return the ``k`` corpus texts most similar to ``query_text``.

        Args:
            query_text: the query document to vectorize with the fitted
                vectorizer.
            k: maximum number of results. Values larger than the corpus
                size return every corpus text; values ``<= 0`` return an
                empty list.

        Returns:
            A list of ``(text, score)`` tuples ordered by descending cosine
            similarity, where ``score`` is a Python ``float``. Texts with
            equal scores keep their corpus order.
        """
        # Guard explicitly: a negative k in the slice below would mean
        # "all but the last |k|" rather than "no results".
        if k <= 0:
            return []

        q = self.vec.transform([query_text])
        # cosine_similarity returns a (1, n_texts) array; take the only row.
        sims = cosine_similarity(q, self.X)[0]
        # Negate to sort descending; a stable sort keeps tie order
        # deterministic (lower corpus index first).
        idxs = np.argsort(-sims, kind="stable")[:k]
        return [(self.texts[i], float(sims[i])) for i in idxs]