"""Physics Q&A retrieval: embed dataset questions, build a FAISS index, retrieve matches."""

import gc
from functools import lru_cache

import kagglehub  # used only by the (currently commented-out) dataset download
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Shared configuration — hoisted so every step agrees on the same model and artifacts
# (the original duplicated these strings across functions).
DATASET_PATH = "./questions_answers.csv"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
INDEX_PATH = "physics_faiss.index"
QUESTIONS_PATH = "questions.npy"


# Step 1: Download and Load the Dataset
def load_dataset():
    """Load the physics Q&A CSV from DATASET_PATH into a DataFrame.

    The kagglehub download is left commented out: the CSV is expected to
    already exist locally. Uncomment the line below to fetch it fresh.
    """
    # kagglehub.dataset_download("nkenyor/physics-question-and-answer-brief")
    dataset_path = DATASET_PATH
    df = pd.read_csv(dataset_path)
    return df


# Step 2: Create Embeddings and FAISS Index
def setup_faiss_index():
    """Embed every dataset question and persist a FAISS L2 index to disk.

    Returns:
        tuple: (index, questions, embedder) — the FAISS index, the list of
        question strings, and the loaded SentenceTransformer model.
    """
    df = load_dataset()  # Load dataset
    questions = df["question"].tolist()
    del df  # release the DataFrame before the memory-heavy embedding step
    gc.collect()  # Force garbage collection

    # Load embedding model
    embedder = SentenceTransformer(EMBEDDING_MODEL)

    # Convert dataset questions into embeddings
    question_embeddings = embedder.encode(questions, convert_to_numpy=True)

    # Exact (brute-force) L2 index over the embedding vectors.
    index = faiss.IndexFlatL2(question_embeddings.shape[1])
    index.add(question_embeddings)

    # Persist artifacts so retrieval can run later without re-embedding.
    faiss.write_index(index, INDEX_PATH)
    np.save(QUESTIONS_PATH, np.array(questions))

    return index, questions, embedder


@lru_cache(maxsize=1)
def _load_retrieval_resources():
    """Load the persisted index, questions array, and embedding model once.

    Caching fixes the original behavior of re-reading the FAISS index from
    disk and re-loading the transformer model on every single query.
    """
    index = faiss.read_index(INDEX_PATH)
    questions = np.load(QUESTIONS_PATH, allow_pickle=True)
    embedder = SentenceTransformer(EMBEDDING_MODEL)
    return index, questions, embedder


# Step 3: Load the FAISS index and retrieve answers
def retrieve_best_match(user_query, top_k=1):
    """Return the dataset question most similar to *user_query*.

    Args:
        user_query: free-text query string.
        top_k: how many nearest neighbours FAISS searches for. NOTE: only the
            single best match is returned regardless of top_k, preserving the
            original interface.

    Returns:
        The best-matching question string from the indexed dataset.
    """
    index, questions, embedder = _load_retrieval_resources()
    query_embedding = embedder.encode([user_query], convert_to_numpy=True)
    _, retrieved_indices = index.search(query_embedding, top_k)
    return questions[retrieved_indices[0][0]]  # Return the best-matching question