# proj_chatbot / utils.py
# musaashaikh's picture
# Update utils.py
# ec69ee6 verified
import kagglehub
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gc
# Step 1: Download and Load the Dataset
def load_dataset(dataset_path="./questions_answers.csv"):
    """Load the physics question/answer dataset from a local CSV file.

    Args:
        dataset_path: Path to the CSV file. Defaults to the file shipped
            alongside this module; presumably the pre-downloaded Kaggle
            dataset "nkenyor/physics-question-and-answer-brief".

    Returns:
        pandas.DataFrame with (at least) a "question" column, as consumed
        by setup_faiss_index().
    """
    # The Kaggle download is intentionally disabled; the CSV is expected to
    # already exist locally (e.g. committed with the Space).
    # kagglehub.dataset_download("nkenyor/physics-question-and-answer-brief")
    return pd.read_csv(dataset_path)
# Step 2: Create Embeddings and FAISS Index
def setup_faiss_index(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    index_path="physics_faiss.index",
    questions_path="questions.npy",
):
    """Embed all dataset questions and persist a FAISS L2 index for them.

    Args:
        model_name: SentenceTransformer model used to embed the questions.
        index_path: File the FAISS index is written to.
        questions_path: File the question strings are saved to (``.npy``).

    Returns:
        Tuple of (faiss index, list of question strings, embedder model).
    """
    df = load_dataset()
    questions = df["question"].tolist()

    # Free the DataFrame before loading the embedding model to keep peak
    # memory down (relevant on small Spaces instances).
    del df
    gc.collect()

    embedder = SentenceTransformer(model_name)
    question_embeddings = embedder.encode(questions, convert_to_numpy=True)

    # Exact L2 (Euclidean) index over the embedding dimension.
    index = faiss.IndexFlatL2(question_embeddings.shape[1])
    index.add(question_embeddings)

    # Persist both artifacts so retrieve_best_match() can reload them later.
    faiss.write_index(index, index_path)
    np.save(questions_path, np.array(questions))

    return index, questions, embedder
# Step 3: Load the FAISS index and retrieve answers
def retrieve_best_match(user_query, top_k=1):
    """Return the stored question most similar to ``user_query``.

    Args:
        user_query: Free-text query to match against the indexed questions.
        top_k: Number of nearest neighbours to request from FAISS; only the
            single best match is returned regardless.

    Returns:
        The best-matching question string from the saved dataset.
    """
    # Loading the FAISS index, the questions array, and (especially) the
    # SentenceTransformer model is expensive; the original code did this on
    # every call. Cache the loaded resources on the function object so
    # repeated queries within one process skip the reload.
    # NOTE(review): the cache assumes the on-disk index does not change
    # mid-process; restart (or clear the attribute) after re-indexing.
    if not hasattr(retrieve_best_match, "_resources"):
        retrieve_best_match._resources = (
            faiss.read_index("physics_faiss.index"),
            np.load("questions.npy", allow_pickle=True),
            SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
        )
    index, questions, embedder = retrieve_best_match._resources

    query_embedding = embedder.encode([user_query], convert_to_numpy=True)
    _, retrieved_indices = index.search(query_embedding, top_k)

    # Return only the top hit's question text.
    return questions[retrieved_indices[0][0]]