WillyCodesInit committed on
Commit
89da9cc
·
verified ·
1 Parent(s): b02256f

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +73 -22
utils.py CHANGED
@@ -1,32 +1,83 @@
 
1
  import pandas as pd
2
  import numpy as np
3
- import faiss
4
  from sentence_transformers import SentenceTransformer
5
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
6
 
7
- # Load your CSV with 'question' and 'answer' columns
8
- df = pd.read_csv("train_data.csv")
9
- qa_pairs = df["question"] + " | " + df["answer"]
 
10
 
11
- # Sentence Transformer for embeddings
12
- embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
13
- embeddings = embedding_model.encode(qa_pairs.tolist(), convert_to_numpy=True)
14
 
15
- # FAISS index
16
- dimension = embeddings.shape[1]
17
- index = faiss.IndexFlatL2(dimension)
18
- index.add(embeddings)
 
 
 
 
 
19
 
20
- # FLAN-T5
21
- tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
22
- model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
 
 
 
 
 
 
 
23
 
24
- def ask_finance_bot(user_query, top_k=3):
 
 
 
 
 
 
 
 
 
 
25
  query_embedding = embedding_model.encode([user_query])
26
- D, I = index.search(np.array(query_embedding), top_k)
27
- context = "\n".join([qa_pairs[i] for i in I[0]])
28
 
29
- prompt = f"Context:\n{context}\n\nQuestion: {user_query}\nAnswer:"
30
- inputs = tokenizer(prompt, return_tensors="pt")
31
- outputs = model.generate(**inputs, max_new_tokens=2045)
32
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
7
 
8
+ # Initialize model and tokenizer
9
+ model_name = "google/flan-t5-base" # You can use a different model if needed
10
+ model = AutoModelForCausalLM.from_pretrained(model_name)
11
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
12
 
13
+ # Sentence transformer model to encode questions for similarity
14
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
15
 
16
+ # Load question-answer data from CSV
17
+ def load_qa_data_from_csv(file_path):
18
+ """
19
+ Reads a CSV file containing question-answer pairs.
20
+ Assumes the CSV file has columns 'question' and 'answer'.
21
+ """
22
+ data = pd.read_csv(file_path)
23
+ qa_pairs = list(zip(data['question'], data['answer']))
24
+ return qa_pairs
25
 
26
+ # Load question-answer data from JSON
27
+ def load_qa_data_from_json(file_path):
28
+ """
29
+ Reads a JSON file containing question-answer pairs.
30
+ """
31
+ with open(file_path, 'r') as file:
32
+ data = json.load(file)
33
+
34
+ qa_pairs = [(item['question'], item['answer']) for item in data]
35
+ return qa_pairs
36
 
37
+ # Check if the question is related to finance
38
+ def is_valid_finance_question(question):
39
+ # Here you can refine the check to use model verification as well
40
+ # For now, we are doing a simple check based on keywords
41
+ finance_keywords = ['finance', 'investment', 'bank', 'insurance', 'credit', 'budget', 'economy', 'inflation',
42
+ 'debt', 'interest', 'mortgage', 'pension', 'retirement', 'savings']
43
+ return any(keyword in question.lower() for keyword in finance_keywords)
44
+
45
+ # Generate the response for a valid financial question
46
+ def ask_finance_bot(user_query, qa_pairs):
47
+ # Embed the user query
48
  query_embedding = embedding_model.encode([user_query])
 
 
49
 
50
+ # Assuming 'index' here is a pre-built FAISS index or similar structure
51
+ # For this example, using a basic search from qa_pairs
52
+ retrieved_qa_pairs = qa_pairs[:3] # Take top 3 for now, or improve with vector search
53
+
54
+ # Temperature control to avoid repetition if same question is asked frequently
55
+ temperature = 0.7
56
+
57
+ instruction = (
58
+ "You are a highly knowledgeable AI assistant specializing strictly in finance.\n"
59
+ "Strictly answer only financially related topics.\n"
60
+ "Do not answer anything outside finance.\n"
61
+ "Always provide accurate, objective, and concise answers to financial questions.\n"
62
+ )
63
+
64
+ # Create the prompt for the model
65
+ prompt = f"{instruction}\n\nUser query: {user_query}\nAnswer:"
66
+
67
+ input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
68
+ output_ids = model.generate(
69
+ **input_ids,
70
+ max_new_tokens=256,
71
+ temperature=temperature,
72
+ top_p=0.9,
73
+ pad_token_id=tokenizer.eos_token_id
74
+ )
75
+
76
+ response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
77
+ answer_text = response.split("Answer:")[-1].strip()
78
+
79
+ if is_valid_finance_question(answer_text):
80
+ return answer_text
81
+ else:
82
+ return "I'm specialized in finance and can't help with that."
83
+