Add selection.py
Browse files- selection.py +15 -0
selection.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 3 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 4 |
+
|
| 5 |
+
class TfidfRetriever:
    """Retrieve the training texts most similar to a query using TF-IDF.

    The corpus is vectorized once at construction time with unigram and
    bigram TF-IDF features; queries are projected into the same feature
    space and ranked by cosine similarity.

    Attributes:
        texts: The training texts, in the order they were supplied.
        vec: The fitted ``TfidfVectorizer``.
        X: TF-IDF matrix of ``texts`` (one row per text).
    """

    def __init__(self, train_texts):
        """Fit the vectorizer on ``train_texts``.

        Args:
            train_texts: Iterable of raw text documents. May be a one-shot
                iterable (e.g. a generator); it is consumed exactly once.

        Raises:
            ValueError: If ``train_texts`` is a single string rather than
                an iterable of documents, or if the vectorizer cannot
                build a vocabulary from the documents.
        """
        if isinstance(train_texts, str):
            # Materializing a bare string would silently split it into
            # characters; reject it the way scikit-learn itself does.
            raise ValueError(
                "Iterable over raw text documents expected, "
                "string object received."
            )
        # Materialize BEFORE fitting: fitting consumes the iterable, so a
        # generator would otherwise leave ``self.texts`` empty and out of
        # sync with the rows of ``self.X``.
        self.texts = list(train_texts)
        self.vec = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
        self.X = self.vec.fit_transform(self.texts)

    def topk(self, query_text, k=1):
        """Return the ``k`` training texts most similar to ``query_text``.

        Args:
            query_text: The raw query string.
            k: Maximum number of results. Values larger than the corpus
                size return every text; values ``<= 0`` return ``[]``.

        Returns:
            A list of ``(text, similarity)`` tuples sorted by descending
            cosine similarity. Ties are broken by corpus order.
        """
        if k <= 0:
            # A negative k would otherwise slice off the *last* |k| results
            # instead of returning nothing.
            return []
        q = self.vec.transform([query_text])
        sims = cosine_similarity(q, self.X)[0]
        # Stable sort keeps equal-similarity texts in corpus order, making
        # the ranking deterministic.
        idxs = np.argsort(-sims, kind="stable")[:k]
        return [(self.texts[i], float(sims[i])) for i in idxs]
|