Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import faiss | |
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
| from sklearn.metrics.pairwise import cosine_similarity | |
class AmharicChatbot:
    """Retrieval-based FAQ bot: answers by nearest stored question.

    A CSV with ``question`` and ``answer`` columns is loaded, every question
    is embedded with the ``intfloat/multilingual-e5-small`` sentence encoder,
    and the embeddings are stored in a FAISS index. ``get_answer`` embeds the
    user question, looks up the closest stored question and returns its
    answer when the cosine similarity reaches ``threshold``; otherwise the
    ``OUT_OF_SCOPE`` sentinel is returned.
    """

    # Sentinel returned whenever no sufficiently similar question exists.
    OUT_OF_SCOPE = "__OUT_OF_SCOPE__"

    def __init__(self, csv_path, threshold=0.80):
        """Load the Q/A table, load the encoder and build the search index.

        Args:
            csv_path: Path (or anything ``pandas.read_csv`` accepts) of a CSV
                file containing ``question`` and ``answer`` columns.
            threshold: Minimum cosine similarity between the user question
                and the best stored question for an answer to be returned.
        """
        self.df = pd.read_csv(csv_path)
        self.model = SentenceTransformer("intfloat/multilingual-e5-small")
        self.threshold = threshold
        self.build_index()

    @staticmethod
    def _normalize(vectors):
        """Return *vectors* as a contiguous float32 matrix with unit-length rows.

        All-zero rows are left as zeros instead of being divided by zero, so
        they can never produce NaN similarities.
        """
        matrix = np.ascontiguousarray(np.atleast_2d(vectors), dtype="float32")
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    def build_index(self):
        """Embed every usable question and (re)build the FAISS index.

        Rows whose question is missing or blank are skipped rather than
        crashing the string concatenation; ``self._row_positions`` maps each
        index entry back to its row position in ``self.df``.

        Raises:
            KeyError: If the table lacks a ``question`` or ``answer`` column.
        """
        missing = [c for c in ("question", "answer") if c not in self.df.columns]
        if missing:
            raise KeyError(f"CSV is missing required column(s): {missing}")

        questions = self.df["question"]
        usable = (
            questions.notna() & (questions.astype(str).str.strip() != "")
        ).to_numpy()
        self._row_positions = np.flatnonzero(usable)
        # NOTE(review): stored questions use the "passage: " prefix and user
        # questions the "query: " prefix, as in the original code; confirm
        # against the E5 model card that this pairing is the intended one for
        # question-to-question matching.
        texts = ["passage: " + str(q) for q in questions[usable]]

        if not texts:
            # Nothing to search: get_answer() will always report out-of-scope.
            self.embeddings = np.empty((0, 0), dtype="float32")
            self.index = None
            return

        raw = self.model.encode(texts, show_progress_bar=True)
        # Unit-length rows make inner product equal cosine similarity, so the
        # index ranks by the same metric the threshold is expressed in.
        self.embeddings = self._normalize(raw)
        self.index = faiss.IndexFlatIP(self.embeddings.shape[1])
        self.index.add(self.embeddings)

    def get_answer(self, user_question, k=1):
        """Return the answer of the stored question closest to *user_question*.

        Args:
            user_question: The text typed by the user.
            k: Number of neighbours requested from the index (values below 1
                are treated as 1). Only the best neighbour decides the answer.

        Returns:
            The ``answer`` cell of the best-matching row, or ``OUT_OF_SCOPE``
            when the question is empty, the index holds no entries, or the
            best cosine similarity is below ``self.threshold``.
        """
        if self.index is None or user_question is None:
            return self.OUT_OF_SCOPE
        if not str(user_question).strip():
            return self.OUT_OF_SCOPE

        query = self._normalize(self.model.encode([f"query: {user_question}"]))
        scores, ids = self.index.search(query, max(int(k), 1))

        # FAISS pads missing neighbours with id -1 instead of returning a
        # shorter array, so a length check cannot detect "no result".
        top_idx = int(ids[0][0])
        if top_idx < 0:
            return self.OUT_OF_SCOPE

        # With unit vectors the inner-product score is the cosine similarity.
        # The negated >= also rejects a NaN score.
        score = float(scores[0][0])
        if not score >= self.threshold:
            return self.OUT_OF_SCOPE

        return self.df.iloc[int(self._row_positions[top_idx])]["answer"]