Spaces:
Runtime error
Runtime error
File size: 1,659 Bytes
a1f1a38 e35733f a1f1a38 7cc1a64 7f0994a a1f1a38 ec69ee6 e35733f a1f1a38 fad2a9f ec69ee6 a1f1a38 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | import kagglehub
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gc
# Step 1: Download and Load the Dataset
def load_dataset(dataset_path="./questions_answers.csv"):
    """Load the question/answer dataset from a CSV file.

    Args:
        dataset_path: Path of the CSV file to read. Defaults to
            ``./questions_answers.csv`` so existing zero-argument callers
            behave exactly as before.

    Returns:
        A ``pandas.DataFrame`` holding the parsed CSV contents.

    Raises:
        FileNotFoundError: Propagated from ``pandas.read_csv`` when
            ``dataset_path`` does not exist.
    """
    # The Kaggle download
    # (kagglehub.dataset_download("nkenyor/physics-question-and-answer-brief"))
    # is disabled; a local CSV copy is read instead.
    return pd.read_csv(dataset_path)
# Step 2: Create Embeddings and FAISS Index
def setup_faiss_index():
    """Embed every dataset question and build a FAISS L2 index over them.

    The dataset is read with ``load_dataset()``, the ``question`` column is
    encoded with the ``all-MiniLM-L6-v2`` sentence-transformer, and the
    resulting vectors are stored in a flat (exact) L2 index.

    Side effects:
        Writes ``physics_faiss.index`` (the FAISS index) and
        ``questions.npy`` (the question texts, in index order) to the
        current working directory.

    Returns:
        A tuple ``(index, questions, embedder)``: the populated FAISS index,
        the list of question strings in the same order as the index rows,
        and the loaded embedding model.

    Raises:
        KeyError: If the dataset has no ``question`` column.
        ValueError: If the dataset contains no usable questions.
    """
    df = load_dataset()
    # Drop missing cells and coerce the rest to str so that only text is
    # handed to the encoder; row order of the remaining questions is kept.
    questions = df["question"].dropna().astype(str).tolist()
    # The DataFrame is no longer needed; release it before loading the model
    # to keep peak memory down.
    del df
    gc.collect()

    if not questions:
        raise ValueError("The dataset contains no questions to index.")

    # Load the embedding model.
    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # Convert the dataset questions into embeddings. FAISS only accepts
    # C-contiguous float32 matrices, so make that explicit.
    question_embeddings = embedder.encode(questions, convert_to_numpy=True)
    question_embeddings = np.ascontiguousarray(
        question_embeddings, dtype=np.float32
    )

    # Create the FAISS index; its dimensionality is the embedding width.
    index = faiss.IndexFlatL2(question_embeddings.shape[1])
    index.add(question_embeddings)

    # Persist the index and the questions so lookups can reload them later.
    faiss.write_index(index, "physics_faiss.index")
    np.save("questions.npy", np.array(questions))
    return index, questions, embedder
# Step 3: Load the FAISS index and retrieve the best-matching question
# Loaded embedding models, keyed by model name, so that repeated queries do
# not re-instantiate the (expensive) SentenceTransformer each time.
_EMBEDDER_CACHE = {}


def _get_embedder(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Return the SentenceTransformer for ``model_name``, loading it once."""
    if model_name not in _EMBEDDER_CACHE:
        _EMBEDDER_CACHE[model_name] = SentenceTransformer(model_name)
    return _EMBEDDER_CACHE[model_name]


def retrieve_best_match(user_query, top_k=1):
    """Return the stored question that is closest to ``user_query``.

    The index and question list are re-read from ``physics_faiss.index`` and
    ``questions.npy`` on every call, so a rebuilt index is picked up
    immediately; only the embedding model is cached between calls.

    Args:
        user_query: The free-text query to match.
        top_k: Number of neighbours requested from FAISS. Only the single
            best match is returned regardless of this value.

    Returns:
        The best-matching question, as an element of the loaded
        ``questions.npy`` array.

    Raises:
        IndexError: If the index holds no vectors, so no match exists.
    """
    index = faiss.read_index("physics_faiss.index")
    # NOTE(review): allow_pickle=True will unpickle arbitrary objects from
    # questions.npy. It is kept for compatibility with existing files; only
    # load files produced by a trusted source.
    questions = np.load("questions.npy", allow_pickle=True)

    query_embedding = _get_embedder().encode([user_query], convert_to_numpy=True)
    # FAISS only accepts C-contiguous float32 query matrices.
    query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)
    _, retrieved_indices = index.search(query_embedding, top_k)

    best = int(retrieved_indices[0][0])
    # FAISS reports -1 when it has no neighbour to return; without this check
    # questions[-1] would be indexed instead of signalling the failure.
    if best < 0:
        raise IndexError("No match found: the FAISS index is empty.")
    return questions[best]  # Return the best-matching question
|