File size: 1,659 Bytes
a1f1a38
 
 
 
 
e35733f
a1f1a38
 
 
7cc1a64
7f0994a
a1f1a38
 
 
ec69ee6
e35733f
a1f1a38
 
 
fad2a9f
ec69ee6
 
 
a1f1a38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import kagglehub
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gc

# Step 1: Download and Load the Dataset
def load_dataset(path="./questions_answers.csv"):
    """Load the physics Q&A dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the local copy of the
            Kaggle "physics-question-and-answer-brief" dataset.

    Returns:
        pandas.DataFrame with the CSV contents (downstream code expects a
        "question" column; see setup_faiss_index).
    """
    # NOTE(review): the Kaggle download is intentionally disabled — the CSV
    # is assumed to already exist locally. Re-enable if it must be fetched:
    # kagglehub.dataset_download("nkenyor/physics-question-and-answer-brief")
    df = pd.read_csv(path)
    return df



# Step 2: Create Embeddings and FAISS Index
def setup_faiss_index(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    index_path="physics_faiss.index",
    questions_path="questions.npy",
):
    """Embed all dataset questions and persist a FAISS L2 index over them.

    Args:
        model_name: SentenceTransformer model used for embedding.
        index_path: Destination file for the serialized FAISS index.
        questions_path: Destination .npy file for the raw question strings
            (kept so retrieval can map index hits back to text).

    Returns:
        (index, questions, embedder): the in-memory FAISS index, the list of
        question strings, and the loaded embedding model.
    """
    df = load_dataset()
    questions = df["question"].tolist()

    # The DataFrame is no longer needed; free it before loading the model
    # to keep peak memory down.
    del df
    gc.collect()

    # Load the embedding model and encode every question in one batch.
    embedder = SentenceTransformer(model_name)
    question_embeddings = embedder.encode(questions, convert_to_numpy=True)

    # Exact L2 (Euclidean) index over the embedding dimensionality.
    index = faiss.IndexFlatL2(question_embeddings.shape[1])
    index.add(question_embeddings)

    # Persist both artifacts so retrieve_best_match can run in a later process.
    faiss.write_index(index, index_path)
    np.save(questions_path, np.array(questions))

    return index, questions, embedder

# Step 3: Load the FAISS index and retrieve answers
def retrieve_best_match(user_query, top_k=1, index=None, questions=None, embedder=None):
    """Return the stored question most similar to *user_query*.

    By default this reloads the FAISS index, the questions array, and the
    SentenceTransformer model from disk on every call — correct but slow.
    Callers that query repeatedly should load those once and pass them in.

    Args:
        user_query: Free-text query to match against indexed questions.
        top_k: Number of nearest neighbours to request from FAISS
            (only the single best hit is returned).
        index: Optional preloaded FAISS index; loaded from
            "physics_faiss.index" when None.
        questions: Optional preloaded array of question strings; loaded from
            "questions.npy" when None.
        embedder: Optional preloaded SentenceTransformer; instantiated
            when None.

    Returns:
        The best-matching stored question string.
    """
    if index is None:
        index = faiss.read_index("physics_faiss.index")
    if questions is None:
        questions = np.load("questions.npy", allow_pickle=True)
    if embedder is None:
        embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    query_embedding = embedder.encode([user_query], convert_to_numpy=True)
    _, retrieved_indices = index.search(query_embedding, top_k)
    # FAISS returns shape (n_queries, top_k); take the top hit of query 0.
    return questions[retrieved_indices[0][0]]