# pdfchatbot/app.py
# Author: Himanshu Kumar Vishwakarma
# (repository metadata: branch "rapp", commit 6288d51)
import gradio as gr
from PyPDF2 import PdfReader
import docx
import os
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import HuggingFaceHub
# Initialize conversation state
# Module-level state shared by the Gradio callbacks below:
#   conversation — set by get_conversation_chain() once files are processed;
#                  None means "no documents ingested yet".
#   chat_history — last chat_history payload returned by the chain.
conversation = None
chat_history = []
def get_pdf_text(pdf_docs):
    """Extract plain text from an iterable of uploaded PDF files.

    Args:
        pdf_docs: Iterable of file paths / file-like objects accepted by
            ``PdfReader``.

    Returns:
        The concatenated text of every readable page, or ``None`` when no
        text could be extracted at all (empty input, scanned/image-only
        pages, or unreadable files).
    """
    pages = []  # collect page texts and join once — avoids quadratic `+=`
    for pdf in pdf_docs:
        try:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                # extract_text() can return None/"" for image-only pages.
                if page_text:
                    pages.append(page_text)
        except Exception as e:
            # Best-effort: skip unreadable files but keep processing the rest.
            print(f"Error reading PDF: {str(e)}")
    text = "\n".join(pages)
    return text if text.strip() else None
def get_text_chunks(text):
    """Break *text* into overlapping chunks suitable for embedding.

    Returns:
        A list of chunk strings; empty list when *text* is empty or None.
    """
    if not text:
        return []
    # 1000-char chunks with 200-char overlap, split on newlines.
    chunking = dict(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return CharacterTextSplitter(**chunking).split_text(text)
def get_vectorstore(text_chunks):
    """Embed *text_chunks* and index them in an in-memory FAISS store.

    Returns:
        A FAISS vector store, or ``None`` when there is nothing to index.
    """
    if not text_chunks:
        return None
    # Default HuggingFace sentence-transformer embeddings.
    return FAISS.from_texts(texts=text_chunks, embedding=HuggingFaceEmbeddings())
def get_conversation_chain(vectorstore):
    """Build the retrieval-augmented chat chain and store it globally.

    Combines a HuggingFace-hosted flan-t5-xxl model, a retriever over
    *vectorstore*, and buffer memory keyed as 'chat_history'. The chain is
    assigned to the module-level ``conversation`` and also returned.
    """
    global conversation
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    conversation = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
    return conversation
def process_files(files):
    """Run the full ingestion pipeline on the uploaded files.

    Extracts text, chunks it, builds the FAISS index, and wires up the
    global conversation chain.

    Args:
        files: List of uploaded PDF files from the Gradio File component.

    Returns:
        A human-readable status string for the Status textbox.
    """
    global conversation, chat_history
    if not files:
        return "Please upload files first"
    try:
        # 1) Extract raw text from every uploaded PDF.
        raw_text = get_pdf_text(files)
        if not raw_text:
            return "❌ Could not extract text from PDF(s). The file may be scanned or corrupted."
        # 2) Split into embedding-sized chunks.
        text_chunks = get_text_chunks(raw_text)
        if not text_chunks:
            return "❌ No valid text chunks could be created."
        # 3) Index the chunks.
        vectorstore = get_vectorstore(text_chunks)
        if not vectorstore:
            return "❌ Failed to create vector store."
        # 4) Build the Q&A chain (stored in the module-level `conversation`).
        get_conversation_chain(vectorstore)
        # Fixed mojibake: was "βœ…" (UTF-8 check mark decoded as Latin-1).
        return "✅ Files processed successfully! You can now ask questions."
    except Exception as e:
        # Surface any pipeline failure in the status box instead of crashing.
        return f"❌ Error processing files: {str(e)}"
def ask_question(question, history):
    """Answer *question* against the processed documents.

    Args:
        question: User's question string (may be empty).
        history: Current chatbot history as a list of (user, bot) tuples.

    Returns:
        A new history list with the (question, answer) pair appended;
        the input list is never mutated.
    """
    global conversation, chat_history
    # Ignore empty submissions.
    if not question:
        return history
    # No chain yet — the user has to process documents first.
    if not conversation:
        return history + [(question, "Please process files first")]
    try:
        response = conversation({"question": question})
        answer = response["answer"]
        chat_history = response["chat_history"]
    except Exception as e:
        return history + [(question, f"Error: {str(e)}")]
    return history + [(question, answer)]
# ---------------------------------------------------------------------------
# Gradio interface: upload/process controls on the left, chat on the right.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Fixed mojibake heading: was "πŸ“„" (UTF-8 "📄" decoded as Latin-1).
    gr.Markdown("# 📄 Chat with PDFs")
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload PDFs",
                file_types=[".pdf"],
                file_count="multiple"
            )
            process_btn = gr.Button("Process")
            status = gr.Textbox(label="Status")
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation")
            question = gr.Textbox(
                label="Your Question",
                placeholder="Ask about your documents..."
            )
            submit_btn = gr.Button("Submit")

    # Event wiring: Process updates the status box; both the Submit button
    # and pressing Enter in the textbox route the question to ask_question.
    process_btn.click(
        process_files,
        inputs=file_input,
        outputs=status
    )
    submit_btn.click(
        ask_question,
        inputs=[question, chatbot],
        outputs=[chatbot]
    )
    question.submit(
        ask_question,
        inputs=[question, chatbot],
        outputs=[chatbot]
    )

if __name__ == '__main__':
    # Load the HuggingFace Hub token (etc.) from .env before serving.
    # NOTE(review): env vars are only loaded when run as a script, not on
    # import — confirm that is intentional.
    load_dotenv()
    demo.launch()