import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class ExampleSelector:
    """Retrieve the stored (text, label) examples most similar to a query.

    The texts are embedded with a ``TfidfVectorizer`` fitted at construction
    time. Queries are transformed with the same vectorizer and ranked against
    the stored embeddings by cosine similarity.
    """

    def __init__(self, df):
        """Fit the vectorizer on ``df["text"]`` and cache texts and labels.

        Args:
            df: Object supporting ``df["text"]`` and ``df["label"]`` lookups
                whose results provide ``.tolist()`` (presumably a pandas
                DataFrame -- confirm against callers).
        """
        # Materialise the text column once; the same list is used both to fit
        # the vectorizer and to look up results later.
        texts = df["text"].tolist()

        self.vectorizer = TfidfVectorizer()
        self.embeddings = self.vectorizer.fit_transform(texts)
        self.texts = texts
        self.labels = df["label"].tolist()
        self.df = df

    def most_similar(self, query, k=1):
        """Return up to ``k`` stored examples ranked by similarity to ``query``.

        Args:
            query: Text to compare against the fitted examples.
            k: Maximum number of examples to return. ``0`` yields an empty
                list; values larger than the number of stored texts return
                every stored example.

        Returns:
            A list of ``(text, label)`` tuples ordered from most to least
            similar.

        Raises:
            ValueError: If ``k`` is negative. A negative value would otherwise
                be interpreted as a slice bound and silently drop the
                least-similar entries instead of limiting the result size.
        """
        if k < 0:
            raise ValueError(f"k must be non-negative, got {k}")

        q_emb = self.vectorizer.transform([query])
        sims = cosine_similarity(q_emb, self.embeddings).flatten()
        # argsort is ascending, so reverse it to rank the highest similarity
        # first, then keep the leading k indices.
        idxs = np.argsort(sims)[::-1][:k]
        return [(self.texts[i], self.labels[i]) for i in idxs]