llm-chat-assistant / process_dataset.py
Vineetha00's picture
Update process_dataset.py
d66eafd verified
import os
import pandas as pd
import faiss
import pickle
from sentence_transformers import SentenceTransformer
def chunk_dataset_rows(file_path="data/medical_qa.csv"):
    """Turn each question/answer row of the dataset into one text chunk.

    The CSV at ``file_path`` is loaded with pandas. If no file exists at
    that path, a small built-in two-row sample is used instead so the rest
    of the pipeline can still run.

    Args:
        file_path: Path to a CSV file; the ``question`` and ``answer``
            columns are the ones read from each row.

    Returns:
        A list of strings shaped like ``"Q: <question>\\nA: <answer>"``.
        Rows where either field is not a ``str`` (for example a missing
        cell that pandas loaded as NaN) are left out.
    """
    if os.path.exists(file_path):
        frame = pd.read_csv(file_path)
    else:
        # Fallback: sample data if the CSV file is missing
        frame = pd.DataFrame(
            {
                "question": [
                    "What are the symptoms of diabetes?",
                    "How is hypertension treated?"
                ],
                "answer": [
                    "Symptoms include increased thirst and frequent urination.",
                    "Treatment includes lifestyle changes and medication."
                ],
            }
        )

    # A column that is absent from the frame falls back to "" via .get().
    qa_pairs = (
        (record.get("question", ""), record.get("answer", ""))
        for _, record in frame.iterrows()
    )
    return [
        f"Q: {q_text}\nA: {a_text}"
        for q_text, a_text in qa_pairs
        if isinstance(q_text, str) and isinstance(a_text, str)
    ]
def process_medical_dataset(
    file_path="data/medical_qa.csv",
    index_dir="faiss_index",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """Embed the medical Q&A chunks and save a FAISS index alongside them.

    The dataset is converted to text chunks with ``chunk_dataset_rows``,
    each chunk is encoded with a SentenceTransformer model, the vectors are
    added to a ``faiss.IndexFlatL2`` index, and both the index and the
    chunk list are written into ``index_dir``.

    Args:
        file_path: CSV file passed to ``chunk_dataset_rows``.
        index_dir: Directory that receives ``index.faiss`` (the FAISS
            index) and ``index.pkl`` (the pickled list of chunks). It is
            created if it does not exist.
        model_name: Name of the SentenceTransformer model used to encode
            the chunks.

    Raises:
        ValueError: If the dataset produced no chunks, since there is
            nothing to embed and the index dimension cannot be derived.
    """
    chunks = chunk_dataset_rows(file_path)
    if not chunks:
        # Fail early with a clear message instead of an IndexError on
        # embeddings[0], and before paying the cost of loading the model.
        raise ValueError(
            f"No usable question/answer rows found in {file_path!r}; "
            "nothing to index."
        )

    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks)

    # Create FAISS index; the dimension is taken from the first vector,
    # which is guaranteed to exist because chunks is non-empty.
    dim = embeddings[0].shape[0]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    # Save index and chunks side by side; position i in the pickled list
    # is the text for the i-th vector added to the index.
    os.makedirs(index_dir, exist_ok=True)
    faiss.write_index(index, os.path.join(index_dir, "index.faiss"))
    with open(os.path.join(index_dir, "index.pkl"), "wb") as f:
        pickle.dump(chunks, f)
    print("βœ… Medical dataset processed and indexed.")