import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class ExampleSelector:
    """Retrieve the labelled examples most similar to a query string.

    The example texts are embedded once with a TF-IDF vectorizer fitted on
    the ``"text"`` column of ``df``; queries are embedded with that same
    fitted vectorizer and ranked by cosine similarity.
    """

    def __init__(self, df):
        """Fit the vectorizer and cache the example texts and labels.

        Args:
            df: Table-like object whose ``"text"`` and ``"label"`` columns
                support ``.tolist()``. The two columns are read positionally,
                so row ``i`` of ``texts`` pairs with row ``i`` of ``labels``.
        """
        self.df = df
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.vectorizer = TfidfVectorizer()
        # Fit on the example texts; one embedding row per example.
        self.embeddings = self.vectorizer.fit_transform(self.texts)

    def most_similar(self, query: str, k: int = 1) -> list:
        """Return the ``k`` examples most similar to ``query``.

        Args:
            query: Text to compare against the stored examples.
            k: Maximum number of examples to return. Values larger than the
                number of stored examples return all of them; ``None``
                returns every example, ranked.

        Returns:
            A list of ``(text, label)`` tuples ordered from most to least
            similar. Examples with equal similarity keep their original
            order, so results are deterministic.

        Raises:
            ValueError: If ``k`` is negative.
        """
        # A negative k would otherwise slice off the *worst* |k| matches and
        # silently return the rest, which is never what the caller meant.
        if k is not None and k < 0:
            raise ValueError(f"k must be non-negative, got {k}")

        # Embed the query with the vectorizer fitted on the examples.
        query_embedding = self.vectorizer.transform([query])
        sims = cosine_similarity(query_embedding, self.embeddings).flatten()

        # Stable sort on the negated scores: descending similarity, with ties
        # broken by example order. TF-IDF scores tie often (every example
        # sharing no term with the query scores 0), and the default sort gives
        # no guarantee about the order of equal elements.
        ranked = np.argsort(-sims, kind="stable")[:k]
        return [(self.texts[i], self.labels[i]) for i in ranked]