# ecopus's picture
# Add selection.py
# 3704e18 verified
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np # Import numpy
class ExampleSelector:
    """Pick the labelled examples whose text is most similar to a query.

    A ``TfidfVectorizer`` is fitted on the ``"text"`` column of the given
    frame, and queries are ranked against those vectors with cosine
    similarity.

    Attributes:
        vectorizer: The ``TfidfVectorizer`` fitted on the example texts.
        embeddings: TF-IDF matrix returned by ``fit_transform``, one row per
            example, in the same order as ``texts``.
        texts: The ``"text"`` column as a plain list.
        labels: The ``"label"`` column as a plain list.
        df: The frame exactly as it was passed in.
    """

    def __init__(self, df):
        """Fit the TF-IDF vectorizer on ``df["text"]`` and cache the columns.

        Args:
            df: Object supporting ``df["text"]`` and ``df["label"]`` lookups
                whose results expose ``.tolist()`` (presumably a pandas
                DataFrame -- confirm against callers).
        """
        # Materialise the text column once; it is used both for fitting and
        # for looking up results later.
        texts = df["text"].tolist()
        self.vectorizer = TfidfVectorizer()
        self.embeddings = self.vectorizer.fit_transform(texts)
        self.texts = texts
        self.labels = df["label"].tolist()
        self.df = df

    def most_similar(self, query, k=1):
        """Return the ``k`` examples most similar to ``query``.

        Args:
            query: Text to compare against the stored examples.
            k: Maximum number of examples to return. ``0`` yields an empty
                list; ``None`` returns every example, ranked.

        Returns:
            A list of ``(text, label)`` tuples ordered from most to least
            similar. It holds fewer than ``k`` items when fewer examples
            are stored.

        Raises:
            ValueError: If ``k`` is negative.
        """
        # A negative k would make the slice below drop the *worst* matches
        # instead of limiting the result, so reject it explicitly.
        if k is not None and k < 0:
            raise ValueError(f"k must be non-negative, got {k}")
        if k == 0:
            return []

        # Embed the query with the vocabulary learned from the examples.
        q_emb = self.vectorizer.transform([query])
        # Cosine similarity between the query and every stored example.
        sims = cosine_similarity(q_emb, self.embeddings).flatten()
        # argsort is ascending; reverse it to rank best matches first.
        idxs = np.argsort(sims)[::-1][:k]
        return [(self.texts[i], self.labels[i]) for i in idxs]