import os
import pickle

import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer

# Defaults shared by both functions so the path/model/output names live in one place.
_DEFAULT_DATASET_PATH = "data/medical_qa.csv"
_DEFAULT_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
_DEFAULT_INDEX_DIR = "faiss_index"


def chunk_dataset_rows(file_path: str = _DEFAULT_DATASET_PATH) -> list:
    """Turn each question/answer row of the dataset into one text chunk.

    If ``file_path`` does not exist, a small built-in two-row sample is used
    instead of raising, so callers always get a usable dataset. Note that
    this fallback is silent.

    Args:
        file_path: Path to a CSV file read with ``pandas.read_csv``. The
            ``question`` and ``answer`` columns are used; a missing column
            is treated as an empty string for every row.

    Returns:
        A list of strings formatted as ``"Q: <question>\\nA: <answer>"``.
        Rows where either value is not a ``str`` (for example NaN from an
        empty CSV cell) are skipped.
    """
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
    else:
        # Fallback: sample data if the CSV file is missing.
        df = pd.DataFrame(
            {
                "question": [
                    "What are the symptoms of diabetes?",
                    "How is hypertension treated?",
                ],
                "answer": [
                    "Symptoms include increased thirst and frequent urination.",
                    "Treatment includes lifestyle changes and medication.",
                ],
            }
        )

    chunks = []
    for _, row in df.iterrows():
        question = row.get("question", "")
        answer = row.get("answer", "")
        # Non-string values (e.g. NaN for blank cells) cannot form a useful
        # chunk, so such rows are dropped rather than stringified.
        if isinstance(question, str) and isinstance(answer, str):
            chunks.append(f"Q: {question}\nA: {answer}")
    return chunks


def process_medical_dataset(
    file_path: str = _DEFAULT_DATASET_PATH,
    model_name: str = _DEFAULT_MODEL_NAME,
    index_dir: str = _DEFAULT_INDEX_DIR,
) -> None:
    """Embed the dataset chunks and persist a FAISS index plus the chunks.

    Writes two files into ``index_dir`` (created if needed):
    ``index.faiss`` (the FAISS ``IndexFlatL2`` index) and ``index.pkl``
    (the pickled list of chunk strings, in the same order as the vectors).

    Args:
        file_path: CSV file passed to ``chunk_dataset_rows``.
        model_name: Name passed to ``SentenceTransformer``.
        index_dir: Directory that receives ``index.faiss`` and ``index.pkl``.

    Raises:
        ValueError: If the dataset yields no chunks, since an index cannot
            be built from zero vectors.
    """
    chunks = chunk_dataset_rows(file_path)
    if not chunks:
        # Fail before loading the model; previously this surfaced later as
        # an opaque IndexError on ``embeddings[0]``.
        raise ValueError(
            f"No question/answer chunks could be built from {file_path!r}; "
            "nothing to index."
        )

    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks, convert_to_numpy=True)
    # FAISS expects float32 input; this is a no-op when already float32.
    embeddings = embeddings.astype("float32", copy=False)

    # Create FAISS index
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    # Save index and chunks
    os.makedirs(index_dir, exist_ok=True)
    faiss.write_index(index, os.path.join(index_dir, "index.faiss"))
    # NOTE: the chunk list is stored with pickle; only unpickle this file
    # when it comes from a trusted source.
    with open(os.path.join(index_dir, "index.pkl"), "wb") as f:
        pickle.dump(chunks, f)

    print("✅ Medical dataset processed and indexed.")