# "Spaces: Sleeping" — HuggingFace Spaces status banner captured with the
# page scrape; kept as a comment so the file remains valid Python.
import gradio as gr
import PyPDF2
import io
from together import Together
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms.base import LLM
from typing import List, Optional
import traceback
# ---------------------------
# WRAP TOGETHER API AS LLM
# ---------------------------
class TogetherLLM(LLM):
    """LangChain-compatible wrapper around the Together chat-completions API.

    Implements the two members LangChain's ``LLM`` base class requires:
    the ``_llm_type`` property and the ``_call`` method.
    """

    client: Together = None
    model: str = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
    temperature: float = 0.3
    max_tokens: int = 1000

    def __init__(self, client, model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
                 temperature=0.3, max_tokens=1000, **kwargs):
        super().__init__(**kwargs)
        # object.__setattr__ bypasses pydantic validation so the
        # non-pydantic Together client can be stored on the instance.
        object.__setattr__(self, 'client', client)
        object.__setattr__(self, 'model', model)
        object.__setattr__(self, 'temperature', temperature)
        object.__setattr__(self, 'max_tokens', max_tokens)

    @property
    def _llm_type(self) -> str:
        # FIX: LangChain declares _llm_type as a @property on the LLM base
        # class; defining it as a plain method breaks serialization/identity
        # checks that read it as an attribute.
        return "together-llm"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Send *prompt* as a single user message and return the reply text.

        API failures are returned as an error string rather than raised, so
        the surrounding chain keeps running.
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=self.max_tokens,
                temperature=self.temperature,
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            return f"Error generating response: {str(e)}"

    class Config:
        # Allow the non-pydantic Together client as a field type.
        arbitrary_types_allowed = True
# ---------------------------
# PDF TEXT EXTRACTION
# ---------------------------
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, one Document per page with page metadata.

    Accepts any of:
      * a plain path string (newer Gradio ``gr.File`` passes the temp path),
      * a Gradio upload object exposing ``.name``,
      * a binary file-like object exposing ``.read``,
      * raw PDF bytes.

    Returns a list of langchain ``Document`` objects; pages that fail or
    contain no text get a placeholder Document so page numbering stays
    complete. On total failure returns a single error Document (page -1).
    """
    docs = []
    try:
        print("Starting PDF extraction...")
        # Normalize every accepted input type to raw bytes.
        if isinstance(pdf_file, str):
            # FIX: a bare path string has neither .name nor .read and
            # previously fell through to io.BytesIO(str) -> TypeError.
            with open(pdf_file, 'rb') as file:
                pdf_content = file.read()
        elif hasattr(pdf_file, 'name'):
            # File uploaded through Gradio (temp-file wrapper object)
            with open(pdf_file.name, 'rb') as file:
                pdf_content = file.read()
        elif hasattr(pdf_file, "read"):
            pdf_content = pdf_file.read()
            # Rewind so the caller can reuse the stream if it wants to.
            if hasattr(pdf_file, "seek"):
                pdf_file.seek(0)
        else:
            # Assume raw bytes were passed directly.
            pdf_content = pdf_file
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
        print(f"PDF has {len(pdf_reader.pages)} pages")
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            try:
                page_text = page.extract_text()
                if page_text and page_text.strip():
                    docs.append(Document(
                        page_content=page_text.strip(),
                        metadata={"page": page_num, "source": "financial_policy"}
                    ))
                    print(f"Extracted text from page {page_num}: {len(page_text)} characters")
                else:
                    # Keep a placeholder so downstream code sees every page.
                    docs.append(Document(
                        page_content="[No extractable text found on this page]",
                        metadata={"page": page_num, "source": "financial_policy"}
                    ))
            except Exception as e:
                # A single bad page must not abort the whole extraction.
                print(f"Error extracting page {page_num}: {str(e)}")
                docs.append(Document(
                    page_content=f"[Error extracting page {page_num}: {str(e)}]",
                    metadata={"page": page_num, "source": "financial_policy"}
                ))
        print(f"Total documents extracted: {len(docs)}")
        return docs
    except Exception as e:
        print(f"Error in PDF extraction: {str(e)}")
        traceback.print_exc()
        return [Document(page_content=f"Error extracting text: {str(e)}", metadata={"page": -1})]
# ---------------------------
# BUILD KNOWLEDGE BASE (FAISS)
# ---------------------------
def build_vector_db(docs):
    """Convert extracted documents into FAISS vector DB"""
    try:
        print("Building vector database...")
        # Chunk pages into ~1000-char pieces with 100-char overlap,
        # preferring paragraph/sentence boundaries.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        chunks = splitter.split_documents(docs)
        print(f"Split into {len(chunks)} chunks")
        # CPU-only MiniLM sentence embeddings.
        embedder = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
        )
        print("Embeddings model loaded")
        # Build the in-memory FAISS index over all chunks.
        index = FAISS.from_documents(chunks, embedder)
        print("Vector database created successfully")
        return index
    except Exception as e:
        # Signal failure with None; caller reports it to the UI.
        print(f"Error building vector database: {str(e)}")
        traceback.print_exc()
        return None
# ---------------------------
# CHATBOT PIPELINE
# ---------------------------
def create_chatbot(api_key, db):
    """Set up ConversationalRetrievalChain with memory"""
    try:
        print("Creating chatbot...")
        together_client = Together(api_key=api_key)
        wrapped_llm = TogetherLLM(client=together_client)
        # Retrieve the 4 most similar chunks per question.
        doc_retriever = db.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 4},
        )
        # output_key="answer" tells the memory which chain output to store.
        chat_memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer",
        )
        chain = ConversationalRetrievalChain.from_llm(
            llm=wrapped_llm,
            retriever=doc_retriever,
            memory=chat_memory,
            return_source_documents=True,
            verbose=True,
        )
        print("Chatbot created successfully")
        return chain
    except Exception as e:
        # None signals failure; the UI handler reports it.
        print(f"Error creating chatbot: {str(e)}")
        traceback.print_exc()
        return None
# ---------------------------
# GRADIO APP
# ---------------------------
def create_app():
    """Build the Gradio Blocks UI: PDF upload/processing on the left,
    a retrieval-augmented chat panel on the right.

    NOTE(review): the source arrived with mojibake where emoji originally
    stood (Thai characters in place of emoji codepoints); plausible emoji
    have been restored below — confirm against the original deployment.
    """
    with gr.Blocks(title="📄 Financial Policy Document Chatbot", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 📄 Financial Policy Document Chatbot")
        gr.Markdown("""
        Upload a financial policy PDF document and ask questions about its content.
        The chatbot will provide answers with page references from the document.
        """)
        with gr.Row():
            with gr.Column(scale=1):
                api_key_input = gr.Textbox(
                    label="Together API Key",
                    placeholder="Enter your Together API key here...",
                    type="password",
                )
                pdf_file = gr.File(
                    label="Upload Financial Policy PDF",
                    file_types=[".pdf"],
                )
                process_button = gr.Button("🔄 Process PDF", variant="primary")
                status_message = gr.Textbox(label="Status", interactive=False, lines=3)
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(label="Chat with Financial Policy Document", height=500)
                with gr.Row():
                    question = gr.Textbox(
                        label="Ask a question about the document",
                        placeholder="Example: What is the budget allocation for infrastructure?",
                        lines=2,
                        scale=4,
                    )
                    submit_button = gr.Button("🔍 Ask", variant="secondary", scale=1)
                gr.Markdown("""
                **Sample Questions:**
                - What is the debt policy outlined in the document?
                - How much budget is allocated for infrastructure?
                - What are the revenue sources mentioned?
                - What are the key financial objectives?
                """)

        # Session state: the FAISS index and the QA chain built from it.
        db_state = gr.State()
        qa_chain_state = gr.State()

        def process_pdf_handler(pdf_file, api_key):
            """Stream status updates while the PDF is extracted, indexed,
            and wired into a ConversationalRetrievalChain.

            FIX: this function contains ``yield`` and is therefore a
            generator — the original early-validation ``return msg, None,
            None`` statements were swallowed (a generator's return value
            never reaches Gradio), so users saw no error. Every exit now
            yields its status first.
            """
            try:
                if pdf_file is None:
                    yield "⚠️ Please upload a PDF file.", None, None
                    return
                if not api_key or api_key.strip() == "":
                    yield "⚠️ Please enter your Together API key.", None, None
                    return
                yield "🔄 Processing PDF... This may take a few moments.", None, None
                # Extract text from PDF
                docs = extract_text_from_pdf(pdf_file)
                if not docs:
                    yield "⚠️ No text could be extracted from the PDF.", None, None
                    return
                # Keep only pages whose extraction actually produced text.
                valid_docs = [
                    doc for doc in docs
                    if not doc.page_content.startswith("[Error")
                    and not doc.page_content.startswith("[No extractable")
                ]
                if len(valid_docs) == 0:
                    yield "⚠️ No readable text found in the PDF.", None, None
                    return
                yield f"🔄 Extracted text from {len(docs)} pages. Building search database...", None, None
                db = build_vector_db(docs)
                if db is None:
                    yield "⚠️ Failed to build search database.", None, None
                    return
                yield "🔄 Search database created. Setting up chatbot...", None, None
                qa_chain = create_chatbot(api_key, db)
                if qa_chain is None:
                    yield "⚠️ Failed to create chatbot.", None, None
                    return
                yield (f"✅ Successfully processed PDF with {len(docs)} pages. "
                       f"Ready to answer questions!"), db, qa_chain
            except Exception as e:
                error_msg = f"❌ Error processing PDF: {str(e)}"
                print(f"Process PDF Error: {str(e)}")
                traceback.print_exc()
                yield error_msg, None, None

        def chat_handler(user_question, qa_chain, history):
            """Answer one question via the QA chain, appending page
            references from the retrieved chunks.

            Returns (updated_history, cleared_input). FIX: the original
            returned three values mapped to ``[chatbot, chatbot, question]``
            — the same component wired twice with *different* histories;
            the chatbot is now a single output.
            """
            if not user_question or user_question.strip() == "":
                return history, ""
            if qa_chain is None:
                return history + [(user_question, "⚠️ Please process a PDF document first.")], ""
            try:
                result = qa_chain({"question": user_question})
                answer = result["answer"]
                # Append page references from the retrieved source chunks.
                if result.get("source_documents"):
                    pages = [doc.metadata["page"]
                             for doc in result["source_documents"]
                             if "page" in doc.metadata]
                    if pages:
                        unique_pages = sorted(set(pages))
                        if len(unique_pages) == 1:
                            answer += f"\n\n📄 **Reference:** Page {unique_pages[0]}"
                        else:
                            answer += f"\n\n📄 **References:** Pages {', '.join(map(str, unique_pages))}"
                return history + [(user_question, answer)], ""
            except Exception as e:
                error_response = f"❌ Error processing question: {str(e)}"
                print(f"Chat Error: {str(e)}")
                traceback.print_exc()
                return history + [(user_question, error_response)], ""

        # Bind events. Both the button and Enter-in-textbox submit a question.
        process_button.click(
            fn=process_pdf_handler,
            inputs=[pdf_file, api_key_input],
            outputs=[status_message, db_state, qa_chain_state],
        )
        submit_button.click(
            fn=chat_handler,
            inputs=[question, qa_chain_state, chatbot],
            outputs=[chatbot, question],
        )
        question.submit(
            fn=chat_handler,
            inputs=[question, qa_chain_state, chatbot],
            outputs=[chatbot, question],
        )
    return app
# ---------------------------
# MAIN EXECUTION
# ---------------------------
if __name__ == "__main__":
    # Build the UI and serve it publicly on port 7860 with a share link.
    application = create_app()
    application.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        debug=True,
    )