# Hugging Face Spaces scrape residue removed (page status header: "Spaces: Sleeping").
| import gradio as gr | |
| import os | |
| import uuid | |
| import shutil | |
| import fitz | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.prompts import PromptTemplate | |
| from langchain_core.runnables import RunnablePassthrough | |
| from langchain_core.output_parsers import StrOutputParser | |
| import tempfile | |
# Constants
LLM_MODEL = "gemini-1.5-flash"
# BUG FIX: the previous value "BAAI/bge-large-en-v1.5" is a HuggingFace
# sentence-transformers checkpoint, but this constant is passed to
# GoogleGenerativeAIEmbeddings, which only accepts Google model names —
# the embedding API call would fail at runtime with an unknown-model error.
EMBEDDING_MODEL = "models/embedding-001"
# Per-process scratch root for the per-session Chroma vector stores.
CHROMA_DB_PATH = os.path.join(tempfile.gettempdir(), "chroma_db")
class PDFChatbot:
    """Gradio-facing controller: indexes one uploaded PDF into a per-session
    Chroma vector store and answers questions about it via a RAG chain."""

    def __init__(self):
        # Each controller starts with a fresh (empty) session.
        self.state = SessionState()

    def process_pdf(self, pdf_file):
        """Extract text from *pdf_file*, chunk it, and build the vector store.

        Raises:
            gr.Error: for user-visible failures (oversized file, or any
                error while reading / embedding the document).
        """
        if self.state.is_db_ready():
            print("Database is already ready.")
            return
        file_size_mb = os.path.getsize(pdf_file.name) / (1024 * 1024)
        if file_size_mb >= 75:
            print("File size exceeds the 75 MB limit.")
            # BUG FIX: the original constructed gr.Error without raising it,
            # so the user never saw the message and processing silently stopped.
            raise gr.Error("File size exceeds the 75 MB limit. Please upload a smaller PDF.")
        # Fresh session per document so stale stores are never reused.
        self.state = SessionState()
        try:
            # Context manager guarantees the document is closed even on error
            # (the original leaked the handle if get_text() raised).
            with fitz.open(pdf_file.name) as doc:
                text = "".join(page.get_text() for page in doc)
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            docs = text_splitter.create_documents([text])
            embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)
            self.state.db = Chroma.from_documents(
                documents=docs,
                embedding=embeddings,
                persist_directory=self.state.vector_store_path,
            )
            print("PDF processed successfully! Database is ready.")
        except Exception as e:
            # Remove the partially written store before surfacing the error.
            if os.path.exists(self.state.vector_store_path):
                shutil.rmtree(self.state.vector_store_path)
            print(f"An error occurred: {str(e)}")
            # BUG FIX: the original swallowed the exception, leaving the user
            # with no feedback; surface it in the UI instead.
            raise gr.Error(f"Failed to process the PDF: {e}") from e

    def chat_with_pdf(self, message, history):
        """Answer *message* with RAG over the indexed PDF.

        *history* is unused but required by the gr.ChatInterface callback
        signature. Yields a single response string (generator form keeps the
        Gradio streaming interface happy).
        """
        print("Chat interface called. Checking if database is ready...")
        if not self.state.is_db_ready():
            print("Database is not ready.")
            yield "Error: Database not ready."
            return
        print("Database is ready. Retrieving relevant documents...")
        retriever = self.state.db.as_retriever()
        llm = ChatGoogleGenerativeAI(model=LLM_MODEL, temperature=0.7)
        prompt_template = PromptTemplate(
            template="""
You are a helpful assistant for a PDF document.
Answer the user's question based on the following context.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Context: {context}
Question: {question}
""",
            input_variables=["context", "question"],
        )
        rag_chain = (
            {"context": retriever, "question": RunnablePassthrough()}
            | prompt_template
            | llm
            | StrOutputParser()
        )
        yield rag_chain.invoke(message)

    def is_db_ready(self):
        """Return True once the current session's vector store exists."""
        return self.state.db is not None
class SessionState:
    """Holds one upload session: a unique id, the on-disk location of its
    Chroma vector store, and the store handle itself."""

    def __init__(self):
        # Set by PDFChatbot.process_pdf once indexing succeeds; None until then.
        self.db = None
        # A fresh UUID per session keeps concurrent uploads from colliding
        # in the shared CHROMA_DB_PATH directory.
        fresh_id = str(uuid.uuid4())
        self.session_id = fresh_id
        self.vector_store_path = os.path.join(CHROMA_DB_PATH, fresh_id)

    def is_db_ready(self):
        """Report whether a vector store has been attached to this session."""
        return self.db is not None
# Set the Google API key from environment variables — fail fast before
# building any UI so the misconfiguration is obvious at startup.
if "GOOGLE_API_KEY" not in os.environ:
    # RuntimeError (an Exception subclass, so existing handlers still match)
    # instead of the bare Exception the original raised.
    raise RuntimeError("Please set the GOOGLE_API_KEY environment variable.")

with gr.Blocks(title="PDF Chatbot") as demo:
    chatbot = PDFChatbot()
    gr.Markdown(
        """
        # PDF Chatbot
        Upload a PDF to start a conversation with your document.
        """
    )
    with gr.Row():
        file_upload_input = gr.File(
            file_types=[".pdf"],
            label="Upload your PDF document",
            interactive=True,
        )
    # Chat stays hidden until a document has been indexed successfully.
    with gr.Row(visible=False) as chat_row:
        chat_interface = gr.ChatInterface(
            fn=chatbot.chat_with_pdf,
            chatbot=gr.Chatbot(type="messages"),
            textbox=gr.Textbox(placeholder="Type your question here...", scale=7),
            examples=[["What is the main topic of the document?"], ["Summarize the key findings."], ["Who are the authors?"]],
            title="Chat Interface",
            theme="soft",
        )

    def process_and_show_chat(file):
        """Index the uploaded file, then reveal the chat only on success.

        BUG FIX: the original unconditionally revealed the chat row and locked
        the upload widget even when process_pdf failed (e.g. the >= 75 MB early
        return), stranding the user with a non-functional chat and a disabled
        uploader. Gate both updates on the database actually being ready.
        """
        chatbot.process_pdf(file)
        ready = chatbot.is_db_ready()
        return gr.update(visible=ready), gr.update(interactive=not ready)

    file_upload_input.upload(
        fn=process_and_show_chat,
        inputs=[file_upload_input],
        outputs=[chat_row, file_upload_input],
    )

demo.launch()