Spaces:

Vikrant26
/

Finance_Bot

Sleeping

App Files Files Community

Finance_Bot / rag.py

Vikrant26

Upload 6 files

65cdc34 verified about 1 year ago

raw

history blame contribute delete

4.92 kB

	from typing import List
	import google.generativeai as genai
	from langchain.embeddings.base import Embeddings
	from langchain_community.vectorstores import FAISS
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from PyPDF2 import PdfReader
	import pandas as pd
	import os

	class CustomGoogleEmbeddings(Embeddings):
	    """Embedding adapter that calls Google Generative AI's ``embed_content``.

	    Every text is truncated to ``max_chars`` characters before it is sent.
	    When a request fails, the error is printed and a zero vector of length
	    ``dimension`` is substituted, so callers always receive one vector per
	    input text.

	    Args:
	        model: Embedding model name passed to ``embed_content``.
	        dimension: Length of the zero vector used when a request fails.
	            NOTE(review): 768 is assumed to match the output size of the
	            default model - confirm before using a different ``model``.
	        max_chars: Maximum number of characters of each text that is embedded.
	    """

	    # Class-level defaults so the fallback settings exist even when an
	    # instance is created without running __init__.
	    dimension = 768
	    max_chars = 2048

	    def __init__(self, model='models/embedding-001', dimension=768, max_chars=2048):
	        self.client = genai
	        self.model = model
	        self.dimension = dimension
	        self.max_chars = max_chars

	    def _embed(self, text: str, task_type: str, error_label: str) -> List[float]:
	        """Embed a single text; on any error print it and return a zero vector.

	        Args:
	            text: Text to embed (truncated to ``max_chars`` characters).
	            task_type: Value forwarded as ``task_type`` to ``embed_content``.
	            error_label: Prefix of the message printed when the request fails.

	        Returns:
	            The ``'embedding'`` entry of the response, or ``[0.0] * dimension``
	            if the request raised.
	        """
	        # Truncate outside the try block so a non-string input still raises
	        # instead of being silently replaced by a zero vector.
	        clipped = text[:self.max_chars]
	        try:
	            return self.client.embed_content(
	                model=self.model,
	                content=clipped,
	                task_type=task_type,
	            )['embedding']
	        except Exception as e:
	            # Deliberate best-effort: one failed request must not abort a batch.
	            print(f"{error_label}: {e}")
	            return [0.0] * self.dimension

	    def embed_documents(self, texts: List[str]) -> List[List[float]]:
	        """Return one embedding per text, in input order, using the
	        ``retrieval_document`` task type."""
	        return [
	            self._embed(text, "retrieval_document", "Embedding error")
	            for text in texts
	        ]

	    def embed_query(self, text: str) -> List[float]:
	        """Return the embedding of a search query using the
	        ``retrieval_query`` task type."""
	        return self._embed(text, "retrieval_query", "Query embedding error")

	class RAGProcessor:
	    """Retrieval-augmented question answering over PDF financial documents.

	    The processor extracts text from PDFs, tags lines that mention financial
	    keywords, splits the text into overlapping chunks, indexes the chunks in a
	    FAISS vector store and answers questions with a generative model using the
	    retrieved chunks as context.

	    Args:
	        model_name: Name passed to ``genai.GenerativeModel``.
	    """

	    # Lower-case keywords; a line containing any of them (case-insensitively)
	    # is prefixed with "FINANCIAL_ENTRY: " during extraction.
	    FINANCIAL_KEYWORDS = (
	        'revenue', 'profit', 'loss', 'expenses', 'income',
	        'cost', 'margin', 'ebitda', 'tax',
	    )

	    def __init__(self, model_name: str = 'gemini-pro'):
	        self.embeddings = CustomGoogleEmbeddings()
	        self.text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=1000,
	            chunk_overlap=200,
	            separators=["\n\n", "\n", ".", ",", " ", ""],
	        )
	        # The API key is read from the GOOGLE_API_KEY environment variable.
	        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
	        self.model = genai.GenerativeModel(model_name)

	    def extract_text_from_pdf(self, pdf_file) -> str:
	        """Extract the text of a PDF and tag lines that look like financial entries.

	        Args:
	            pdf_file: Anything accepted by ``PdfReader``.

	        Returns:
	            The extracted text, one line per output line, where each line that
	            contains one of ``FINANCIAL_KEYWORDS`` is prefixed with
	            ``"FINANCIAL_ENTRY: "``. Returns ``""`` if reading the PDF raises;
	            the error is printed rather than propagated.
	        """
	        try:
	            pdf_reader = PdfReader(pdf_file)
	            # `or ""` keeps a page that yields no text from failing the whole
	            # file; each page is followed by a blank line.
	            text = "".join(
	                (page.extract_text() or "") + "\n\n" for page in pdf_reader.pages
	            )
	        except Exception as e:
	            print(f"Error extracting text from PDF: {e}")
	            return ""

	        tagged_lines = []
	        for line in text.split('\n'):
	            lowered = line.lower()
	            if any(keyword in lowered for keyword in self.FINANCIAL_KEYWORDS):
	                tagged_lines.append(f"FINANCIAL_ENTRY: {line}\n")
	            else:
	                tagged_lines.append(line + "\n")
	        return "".join(tagged_lines)

	    def process_documents(self, pdf_files: List[str]) -> FAISS:
	        """Extract, chunk and index several PDFs into one FAISS vector store.

	        Args:
	            pdf_files: PDFs to process; each item is handed to
	                ``extract_text_from_pdf``.

	        Returns:
	            A FAISS vector store built from the text chunks of all PDFs.

	        Raises:
	            ValueError: If no text chunks were produced (no files given, or no
	                text could be extracted from any of them).
	            Exception: Any error raised while building the vector store is
	                printed and re-raised unchanged.
	        """
	        combined_text = "".join(
	            self.extract_text_from_pdf(pdf) for pdf in pdf_files
	        )
	        text_chunks = self.text_splitter.split_text(combined_text)

	        try:
	            if not text_chunks:
	                # Extraction errors are swallowed upstream, so fail here with
	                # a clear message instead of an opaque indexing error.
	                raise ValueError(
	                    "No text could be extracted from the provided PDF files"
	                )
	            return FAISS.from_texts(text_chunks, embedding=self.embeddings)
	        except Exception as e:
	            print(f"Error creating vector store: {e}")
	            raise

	    def generate_response(self, question: str, vector_store: FAISS) -> str:
	        """Answer a question using chunks retrieved from the vector store.

	        Args:
	            question: The user's question.
	            vector_store: Store searched for the chunks most similar to the
	                question (``k=4``).

	        Returns:
	            The model's response text, or a string starting with
	            ``"Error generating response: "`` if generation raises. Errors
	            raised by the similarity search itself are not caught here.
	        """
	        docs = vector_store.similarity_search(question, k=4)
	        context = "\n".join(doc.page_content for doc in docs)

	        # The prompt text is kept flush with the margin because its whitespace
	        # is part of the string sent to the model.
	        prompt = f"""
	You are a financial analyst assistant. Using the following financial data context,
	answer the question accurately and professionally. Include specific numbers and
	calculations when relevant.

	Context: {context}

	Question: {question}

	If the context doesn't contain enough information to answer accurately,
	please state that clearly. Focus on P&L related information and financial metrics.
	When providing financial figures, please format them clearly with appropriate units
	(e.g., "$1,234,567" or "1.2M" for millions).
	"""

	        try:
	            response = self.model.generate_content(prompt)
	            return response.text
	        except Exception as e:
	            return f"Error generating response: {e}"