import os
import pandas as pd
import faiss
import pickle
from sentence_transformers import SentenceTransformer

def chunk_dataset_rows(file_path="data/medical_qa.csv"):
    """Build one "Q: ...\\nA: ..." text chunk per question/answer row.

    The rows are read from the CSV at *file_path*. If that file does not
    exist, a small built-in two-row sample is used instead so callers
    still get usable data.

    Args:
        file_path: Path to a CSV file expected to have ``question`` and
            ``answer`` columns.

    Returns:
        A list of strings of the form ``"Q: <question>\\nA: <answer>"``,
        in row order. Rows where either cell is not a ``str`` (for
        example an empty cell read as NaN, or a numeric cell) are
        skipped. A column that is absent altogether is treated as an
        empty string for every row.
    """
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
    else:
        # Fallback: sample data if the CSV file is missing.
        df = pd.DataFrame(
            {
                "question": [
                    "What are the symptoms of diabetes?",
                    "How is hypertension treated?",
                ],
                "answer": [
                    "Symptoms include increased thirst and frequent urination.",
                    "Treatment includes lifestyle changes and medication.",
                ],
            }
        )

    def _column_values(name):
        # A missing column contributes "" for every row, so the other
        # column's text is still emitted rather than the row being dropped.
        if name in df.columns:
            return df[name].tolist()
        return [""] * len(df)

    # Walk the two columns in lockstep instead of using DataFrame.iterrows(),
    # which materialises a Series per row and is slow on large files.
    return [
        f"Q: {question}\nA: {answer}"
        for question, answer in zip(
            _column_values("question"), _column_values("answer")
        )
        if isinstance(question, str) and isinstance(answer, str)
    ]

def process_medical_dataset(
    file_path="data/medical_qa.csv",
    index_dir="faiss_index",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """Embed the Q/A chunks and persist a FAISS index alongside them.

    The chunks come from ``chunk_dataset_rows(file_path)``. They are
    encoded with a SentenceTransformer model, added to a flat L2 FAISS
    index, and two files are written into *index_dir*:
    ``index.faiss`` (the index) and ``index.pkl`` (the pickled list of
    chunk strings, in the same order as the indexed vectors).

    Args:
        file_path: CSV file handed to ``chunk_dataset_rows``.
        index_dir: Directory that receives the output files; created if
            it does not exist.
        model_name: Name of the SentenceTransformer model to load.

    Raises:
        ValueError: If no chunks were produced, since an index cannot be
            built from zero vectors.
    """
    chunks = chunk_dataset_rows(file_path)
    if not chunks:
        # Fail before loading the model; otherwise the first-row lookup on
        # an empty embedding array would raise an unhelpful IndexError.
        raise ValueError(f"No question/answer chunks found in {file_path!r}")

    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks)

    # Create FAISS index sized to the embedding width (second axis of the
    # array returned for a list of inputs).
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    # Save index and chunks
    os.makedirs(index_dir, exist_ok=True)
    faiss.write_index(index, os.path.join(index_dir, "index.faiss"))
    with open(os.path.join(index_dir, "index.pkl"), "wb") as f:
        pickle.dump(chunks, f)

    print("✅ Medical dataset processed and indexed.")