Spaces:

Vineetha00
/

llm-chat-assistant

Runtime error

llm-chat-assistant / process_dataset.py

Update process_dataset.py

d66eafd verified 12 months ago

1.59 kB

	import os
	import pandas as pd
	import faiss
	import pickle
	from sentence_transformers import SentenceTransformer

	def chunk_dataset_rows(file_path="data/medical_qa.csv"):
	    """Turn each usable row of a question/answer CSV into one text chunk.

	    The CSV is read with ``pd.read_csv``. If ``file_path`` does not exist,
	    a small built-in two-row sample is used instead so the pipeline can
	    still run without the data file.

	    Args:
	        file_path: Path to a CSV expected to have ``question`` and
	            ``answer`` columns.

	    Returns:
	        A list of strings, one per kept row, formatted as a ``Q: ...`` line
	        followed by an ``A: ...`` line. Rows whose question or answer is
	        not a ``str`` (for example NaN from an empty cell) are skipped.
	        A column that is absent altogether is treated as the empty string.
	    """
	    if os.path.exists(file_path):
	        df = pd.read_csv(file_path)
	    else:
	        # Fallback: sample data if the CSV file is missing.
	        df = pd.DataFrame(
	            {
	                "question": [
	                    "What are the symptoms of diabetes?",
	                    "How is hypertension treated?",
	                ],
	                "answer": [
	                    "Symptoms include increased thirst and frequent urination.",
	                    "Treatment includes lifestyle changes and medication.",
	                ],
	            }
	        )

	    # Plain dict records avoid building a Series per row (as iterrows does)
	    # and keep the "missing column -> empty string" default via dict.get.
	    pairs = (
	        (record.get("question", ""), record.get("answer", ""))
	        for record in df.to_dict(orient="records")
	    )
	    return [
	        f"Q: {question}\nA: {answer}"
	        for question, answer in pairs
	        if isinstance(question, str) and isinstance(answer, str)
	    ]

	def process_medical_dataset(
	    file_path="data/medical_qa.csv",
	    index_dir="faiss_index",
	    model_name="sentence-transformers/all-MiniLM-L6-v2",
	):
	    """Embed the Q&A chunks and persist a FAISS index alongside the chunks.

	    Chunks come from ``chunk_dataset_rows(file_path)``, are embedded with a
	    ``SentenceTransformer`` model, added to a ``faiss.IndexFlatL2`` index,
	    and written to ``<index_dir>/index.faiss``. The chunk list is pickled to
	    ``<index_dir>/index.pkl`` in the same order it was embedded.

	    Args:
	        file_path: CSV passed through to ``chunk_dataset_rows``.
	        index_dir: Directory that receives ``index.faiss`` and ``index.pkl``;
	            created if it does not exist.
	        model_name: Name of the sentence-transformers model to load.

	    Raises:
	        ValueError: If no chunks could be built, since an index cannot be
	            sized without at least one embedding.
	    """
	    chunks = chunk_dataset_rows(file_path)
	    if not chunks:
	        # Fail before loading the model; otherwise embeddings[0] below would
	        # raise an opaque IndexError.
	        raise ValueError(
	            f"No question/answer chunks could be built from {file_path!r}; "
	            "nothing to index."
	        )

	    model = SentenceTransformer(model_name)
	    embeddings = model.encode(chunks)

	    # Create FAISS index sized to the embedding dimension.
	    dim = embeddings[0].shape[0]
	    index = faiss.IndexFlatL2(dim)
	    index.add(embeddings)

	    # Save index and chunks side by side.
	    os.makedirs(index_dir, exist_ok=True)
	    faiss.write_index(index, os.path.join(index_dir, "index.faiss"))
	    # NOTE: index.pkl is a pickle; only load it back from a trusted location.
	    with open(os.path.join(index_dir, "index.pkl"), "wb") as f:
	        pickle.dump(chunks, f)

	    print("✅ Medical dataset processed and indexed.")