import json

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Initialize model and tokenizer.
# NOTE: flan-t5 is an encoder-decoder (seq2seq) model, so it must be loaded
# with AutoModelForSeq2SeqLM — AutoModelForCausalLM raises a ValueError for
# T5-family configs.
model_name = "google/flan-t5-base"  # You can use a different model if needed
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sentence-transformer model used to embed questions for similarity retrieval.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


def load_qa_data_from_csv(file_path):
    """Load question-answer pairs from a CSV file.

    Assumes the CSV file has columns 'question' and 'answer'.

    Args:
        file_path: Path to the CSV file.

    Returns:
        A list of (question, answer) tuples.

    Raises:
        KeyError: If the expected columns are missing.
    """
    data = pd.read_csv(file_path)
    return list(zip(data['question'], data['answer']))


def load_qa_data_from_json(file_path):
    """Load question-answer pairs from a JSON file.

    Assumes the file contains a list of objects, each with 'question'
    and 'answer' keys.

    Args:
        file_path: Path to the JSON file.

    Returns:
        A list of (question, answer) tuples.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)
    return [(item['question'], item['answer']) for item in data]


def is_valid_finance_question(question):
    """Return True if *question* appears to be finance-related.

    This is a simple keyword heuristic; it could be refined with a
    model-based classifier later.

    Args:
        question: The text to classify.

    Returns:
        True if any finance keyword occurs in the lower-cased text.
    """
    finance_keywords = ['finance', 'investment', 'bank', 'insurance', 'credit',
                        'budget', 'economy', 'inflation', 'debt', 'interest',
                        'mortgage', 'pension', 'retirement', 'savings']
    return any(keyword in question.lower() for keyword in finance_keywords)


def _retrieve_similar_pairs(user_query, qa_pairs, top_k=3):
    """Return the *top_k* Q-A pairs most similar to *user_query*.

    Uses cosine similarity over sentence-transformer embeddings. This is a
    brute-force search; swap in a FAISS index for large corpora.

    Args:
        user_query: The user's question text.
        qa_pairs: List of (question, answer) tuples to search.
        top_k: Maximum number of pairs to return.

    Returns:
        Up to *top_k* (question, answer) tuples, most similar first.
        Empty list when *qa_pairs* is empty.
    """
    if not qa_pairs:
        return []
    query_emb = embedding_model.encode([user_query])[0]
    question_embs = embedding_model.encode([q for q, _ in qa_pairs])
    # Cosine similarity; epsilon guards against zero-norm embeddings.
    sims = question_embs @ query_emb / (
        np.linalg.norm(question_embs, axis=1) * np.linalg.norm(query_emb) + 1e-8
    )
    top_idx = np.argsort(sims)[::-1][:top_k]
    return [qa_pairs[i] for i in top_idx]


def ask_finance_bot(user_query, qa_pairs):
    """Answer a finance question, refusing off-topic queries.

    The topicality check is applied to the *user query* before any
    generation happens — checking the generated answer instead (as a
    naive implementation might) both wastes a model call and
    misclassifies off-topic questions whose answers happen to mention
    finance terms.

    Args:
        user_query: The user's question.
        qa_pairs: List of (question, answer) tuples used as retrieval
            context for the prompt.

    Returns:
        The model's answer string, or a fixed refusal message when the
        query is not finance-related.
    """
    # Guard first: do not spend a generation call on off-topic queries.
    if not is_valid_finance_question(user_query):
        return "I'm specialized in finance and can't help with that."

    # Retrieve the most similar known Q-A pairs as few-shot context.
    retrieved_qa_pairs = _retrieve_similar_pairs(user_query, qa_pairs, top_k=3)
    context = "\n".join(
        f"Q: {q}\nA: {a}" for q, a in retrieved_qa_pairs
    )

    # Mild temperature to avoid verbatim repetition when the same
    # question is asked frequently.
    temperature = 0.7
    instruction = (
        "You are a highly knowledgeable AI assistant specializing strictly in finance.\n"
        "Strictly answer only financially related topics.\n"
        "Do not answer anything outside finance.\n"
        "Always provide accurate, objective, and concise answers to financial questions.\n"
    )

    prompt = f"{instruction}\n"
    if context:
        prompt += f"\nRelevant examples:\n{context}\n"
    prompt += f"\nUser query: {user_query}\nAnswer:"

    input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        **input_ids,
        max_new_tokens=256,
        do_sample=True,  # required for temperature/top_p to take effect
        temperature=temperature,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Seq2seq models emit only the completion, but strip an echoed
    # "Answer:" prefix defensively in case the model repeats the prompt.
    return response.split("Answer:")[-1].strip()