"""Physics Q&A retrieval: embed dataset questions, build a FAISS index, retrieve matches."""

import gc
from functools import lru_cache

import kagglehub  # used only by the (currently commented-out) dataset download
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Shared configuration — hoisted so every step agrees on the same model and artifacts
# (the original duplicated these strings across functions).
DATASET_PATH = "./questions_answers.csv"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
INDEX_PATH = "physics_faiss.index"
QUESTIONS_PATH = "questions.npy"


# Step 1: Download and Load the Dataset
def load_dataset():
    """Load the physics Q&A CSV from DATASET_PATH into a DataFrame.

    The kagglehub download is left commented out: the CSV is expected to
    already exist locally. Uncomment the line below to fetch it fresh.
    """
    # kagglehub.dataset_download("nkenyor/physics-question-and-answer-brief")
    dataset_path = DATASET_PATH
    df = pd.read_csv(dataset_path)
    return df


# Step 2: Create Embeddings and FAISS Index
def setup_faiss_index():
    """Embed every dataset question and persist a FAISS L2 index to disk.

    Returns:
        tuple: (index, questions, embedder) — the FAISS index, the list of
        question strings, and the loaded SentenceTransformer model.
    """
    df = load_dataset()  # Load dataset
    questions = df["question"].tolist()
    del df  # release the DataFrame before the memory-heavy embedding step
    gc.collect()  # Force garbage collection

    # Load embedding model
    embedder = SentenceTransformer(EMBEDDING_MODEL)

    # Convert dataset questions into embeddings
    question_embeddings = embedder.encode(questions, convert_to_numpy=True)

    # Exact (brute-force) L2 index over the embedding vectors.
    index = faiss.IndexFlatL2(question_embeddings.shape[1])
    index.add(question_embeddings)

    # Persist artifacts so retrieval can run later without re-embedding.
    faiss.write_index(index, INDEX_PATH)
    np.save(QUESTIONS_PATH, np.array(questions))

    return index, questions, embedder


@lru_cache(maxsize=1)
def _load_retrieval_resources():
    """Load the persisted index, questions array, and embedding model once.

    Caching fixes the original behavior of re-reading the FAISS index from
    disk and re-loading the transformer model on every single query.
    """
    index = faiss.read_index(INDEX_PATH)
    questions = np.load(QUESTIONS_PATH, allow_pickle=True)
    embedder = SentenceTransformer(EMBEDDING_MODEL)
    return index, questions, embedder


# Step 3: Load the FAISS index and retrieve answers
def retrieve_best_match(user_query, top_k=1):
    """Return the dataset question most similar to *user_query*.

    Args:
        user_query: free-text query string.
        top_k: how many nearest neighbours FAISS searches for. NOTE: only the
            single best match is returned regardless of top_k, preserving the
            original interface.

    Returns:
        The best-matching question string from the indexed dataset.
    """
    index, questions, embedder = _load_retrieval_resources()
    query_embedding = embedder.encode([user_query], convert_to_numpy=True)
    _, retrieved_indices = index.search(query_embedding, top_k)
    return questions[retrieved_indices[0][0]]  # Return the best-matching question