Spaces:

WillyCodesInit
/

finSmart.ai

Sleeping

WillyCodesInit committed on May 7, 2025

Commit

f143c26

verified ·

1 Parent(s): 308e42a

Update utils.py

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -1,25 +1,38 @@
-import pandas as pd
-from sentence_transformers import SentenceTransformer
-import faiss
 import numpy as np
-def load_dataset(path):
-    df = pd.read_csv(path)
-    df = df.dropna(subset=['question', 'answer'])
     return df
-def embed_questions(df):
-    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
-    embeddings = embed_model.encode(df["question"].tolist(), convert_to_tensor=False)
-    index = faiss.IndexFlatL2(embeddings[0].shape[0])
-    index.add(np.array(embeddings))
-    return embed_model, index
-def retrieve_context(query, embed_model, index, df, k=3):
-    query_embedding = embed_model.encode([query])[0]
-    distances, indices = index.search(np.array([query_embedding]), k)
-    results = []
-    for i in indices[0]:
-        if i < len(df):
-            results.append(f"Q: {df.iloc[i]['question']}\nA: {df.iloc[i]['answer']}")
-    return "\n\n".join(results)

 import numpy as np
+import faiss
+import json
+from sentence_transformers import SentenceTransformer
+def load_dataset(file_path):
+    """
+    Loads the dataset (CSV file) and returns a DataFrame of the rows that have both a question and an answer.
+    """
+    import pandas as pd
+    df = pd.read_csv(file_path)
+    df.dropna(subset=["question", "answer"], inplace=True)  # Remove any rows with missing questions/answers
     return df
+def embed_questions(df, model_name='all-MiniLM-L6-v2'):
+    """
+    Embeds the combined question/answer strings using the sentence transformer model and builds a FAISS index over them.
+    """
+    model = SentenceTransformer(model_name)
+    qa_pairs = [f"Q: {q.strip()} A: {a.strip()}" for q, a in zip(df["question"], df["answer"])]
+    embeddings = model.encode(qa_pairs, show_progress_bar=True)
+    embeddings = np.array(embeddings).astype("float32")
+    # Create FAISS index
+    index = faiss.IndexFlatL2(embeddings.shape[1])  # Flat index that searches by L2 (Euclidean) distance, not cosine similarity
+    index.add(embeddings)
+    # Return QA pairs and the index
+    return qa_pairs, index
+def retrieve_context(query, embed_model, index, qa_pairs, top_k=3):
+    """
+    Retrieves the most relevant context from the dataset for a given query.
+    """
+    query_embedding = embed_model.encode([query])
+    D, I = index.search(np.array(query_embedding).astype("float32"), top_k)
+    retrieved_qa_pairs = [qa_pairs[i] for i in I[0]]
+    return "\n".join([f"- {pair}" for pair in retrieved_qa_pairs])