# proj_chatbot / utils.py
# musaashaikh's picture
# Update utils.py
# ec69ee6 verified
import kagglehub
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gc
# Step 1: Download and Load the Dataset
def load_dataset(dataset_path="./questions_answers.csv"):
    """Load the physics question/answer dataset from a local CSV file.

    Args:
        dataset_path: Path to the CSV file. Defaults to the file shipped
            alongside this module; presumably the pre-downloaded Kaggle
            dataset "nkenyor/physics-question-and-answer-brief".

    Returns:
        pandas.DataFrame with (at least) a "question" column, as consumed
        by setup_faiss_index().
    """
    # The Kaggle download is intentionally disabled; the CSV is expected to
    # already exist locally (e.g. committed with the Space).
    # kagglehub.dataset_download("nkenyor/physics-question-and-answer-brief")
    return pd.read_csv(dataset_path)
# Step 2: Create Embeddings and FAISS Index
def setup_faiss_index(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    index_path="physics_faiss.index",
    questions_path="questions.npy",
):
    """Embed all dataset questions and persist a FAISS L2 index for them.

    Args:
        model_name: SentenceTransformer model used to embed the questions.
        index_path: File the FAISS index is written to.
        questions_path: File the question strings are saved to (``.npy``).

    Returns:
        Tuple of (faiss index, list of question strings, embedder model).
    """
    df = load_dataset()
    questions = df["question"].tolist()

    # Free the DataFrame before loading the embedding model to keep peak
    # memory down (relevant on small Spaces instances).
    del df
    gc.collect()

    embedder = SentenceTransformer(model_name)
    question_embeddings = embedder.encode(questions, convert_to_numpy=True)

    # Exact L2 (Euclidean) index over the embedding dimension.
    index = faiss.IndexFlatL2(question_embeddings.shape[1])
    index.add(question_embeddings)

    # Persist both artifacts so retrieve_best_match() can reload them later.
    faiss.write_index(index, index_path)
    np.save(questions_path, np.array(questions))

    return index, questions, embedder
# Step 3: Load the FAISS index and retrieve answers
def retrieve_best_match(user_query, top_k=1):
    """Return the stored question most similar to ``user_query``.

    Args:
        user_query: Free-text query to match against the indexed questions.
        top_k: Number of nearest neighbours to request from FAISS; only the
            single best match is returned regardless.

    Returns:
        The best-matching question string from the saved dataset.
    """
    # Loading the FAISS index, the questions array, and (especially) the
    # SentenceTransformer model is expensive; the original code did this on
    # every call. Cache the loaded resources on the function object so
    # repeated queries within one process skip the reload.
    # NOTE(review): the cache assumes the on-disk index does not change
    # mid-process; restart (or clear the attribute) after re-indexing.
    if not hasattr(retrieve_best_match, "_resources"):
        retrieve_best_match._resources = (
            faiss.read_index("physics_faiss.index"),
            np.load("questions.npy", allow_pickle=True),
            SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
        )
    index, questions, embedder = retrieve_best_match._resources

    query_embedding = embedder.encode([user_query], convert_to_numpy=True)
    _, retrieved_indices = index.search(query_embedding, top_k)

    # Return only the top hit's question text.
    return questions[retrieved_indices[0][0]]