# pdfchatbot/app.py
# Author: Himanshu Kumar Vishwakarma
# (repository metadata: branch "rapp", commit 6288d51)
import gradio as gr
from PyPDF2 import PdfReader
import docx
import os
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import HuggingFaceHub
# Initialize conversation state
# Module-level state shared by the Gradio callbacks below:
#   conversation — set by get_conversation_chain() once files are processed;
#                  None means "no documents ingested yet".
#   chat_history — last chat_history payload returned by the chain.
conversation = None
chat_history = []
def get_pdf_text(pdf_docs):
    """Extract plain text from an iterable of uploaded PDF files.

    Args:
        pdf_docs: Iterable of file paths / file-like objects accepted by
            ``PdfReader``.

    Returns:
        The concatenated text of every readable page, or ``None`` when no
        text could be extracted at all (empty input, scanned/image-only
        pages, or unreadable files).
    """
    pages = []  # collect page texts and join once — avoids quadratic `+=`
    for pdf in pdf_docs:
        try:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                # extract_text() can return None/"" for image-only pages.
                if page_text:
                    pages.append(page_text)
        except Exception as e:
            # Best-effort: skip unreadable files but keep processing the rest.
            print(f"Error reading PDF: {str(e)}")
    text = "\n".join(pages)
    return text if text.strip() else None
def get_text_chunks(text):
    """Break *text* into overlapping chunks suitable for embedding.

    Returns:
        A list of chunk strings; empty list when *text* is empty or None.
    """
    if not text:
        return []
    # 1000-char chunks with 200-char overlap, split on newlines.
    chunking = dict(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return CharacterTextSplitter(**chunking).split_text(text)
def get_vectorstore(text_chunks):
    """Embed *text_chunks* and index them in an in-memory FAISS store.

    Returns:
        A FAISS vector store, or ``None`` when there is nothing to index.
    """
    if not text_chunks:
        return None
    # Default HuggingFace sentence-transformer embeddings.
    return FAISS.from_texts(texts=text_chunks, embedding=HuggingFaceEmbeddings())
def get_conversation_chain(vectorstore):
    """Build the retrieval-augmented chat chain and store it globally.

    Combines a HuggingFace-hosted flan-t5-xxl model, a retriever over
    *vectorstore*, and buffer memory keyed as 'chat_history'. The chain is
    assigned to the module-level ``conversation`` and also returned.
    """
    global conversation
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    conversation = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
    return conversation
def process_files(files):
    """Run the full ingestion pipeline on the uploaded files.

    Extracts text, chunks it, builds the FAISS index, and wires up the
    global conversation chain.

    Args:
        files: List of uploaded PDF files from the Gradio File component.

    Returns:
        A human-readable status string for the Status textbox.
    """
    global conversation, chat_history
    if not files:
        return "Please upload files first"
    try:
        # 1) Extract raw text from every uploaded PDF.
        raw_text = get_pdf_text(files)
        if not raw_text:
            return "❌ Could not extract text from PDF(s). The file may be scanned or corrupted."
        # 2) Split into embedding-sized chunks.
        text_chunks = get_text_chunks(raw_text)
        if not text_chunks:
            return "❌ No valid text chunks could be created."
        # 3) Index the chunks.
        vectorstore = get_vectorstore(text_chunks)
        if not vectorstore:
            return "❌ Failed to create vector store."
        # 4) Build the Q&A chain (stored in the module-level `conversation`).
        get_conversation_chain(vectorstore)
        # Fixed mojibake: was "βœ…" (UTF-8 check mark decoded as Latin-1).
        return "✅ Files processed successfully! You can now ask questions."
    except Exception as e:
        # Surface any pipeline failure in the status box instead of crashing.
        return f"❌ Error processing files: {str(e)}"
def ask_question(question, history):
    """Answer *question* against the processed documents.

    Args:
        question: User's question string (may be empty).
        history: Current chatbot history as a list of (user, bot) tuples.

    Returns:
        A new history list with the (question, answer) pair appended;
        the input list is never mutated.
    """
    global conversation, chat_history
    # Ignore empty submissions.
    if not question:
        return history
    # No chain yet — the user has to process documents first.
    if not conversation:
        return history + [(question, "Please process files first")]
    try:
        response = conversation({"question": question})
        answer = response["answer"]
        chat_history = response["chat_history"]
    except Exception as e:
        return history + [(question, f"Error: {str(e)}")]
    return history + [(question, answer)]
# ---------------------------------------------------------------------------
# Gradio interface: upload/process controls on the left, chat on the right.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Fixed mojibake heading: was "πŸ“„" (UTF-8 "📄" decoded as Latin-1).
    gr.Markdown("# 📄 Chat with PDFs")
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload PDFs",
                file_types=[".pdf"],
                file_count="multiple"
            )
            process_btn = gr.Button("Process")
            status = gr.Textbox(label="Status")
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation")
            question = gr.Textbox(
                label="Your Question",
                placeholder="Ask about your documents..."
            )
            submit_btn = gr.Button("Submit")

    # Event wiring: Process updates the status box; both the Submit button
    # and pressing Enter in the textbox route the question to ask_question.
    process_btn.click(
        process_files,
        inputs=file_input,
        outputs=status
    )
    submit_btn.click(
        ask_question,
        inputs=[question, chatbot],
        outputs=[chatbot]
    )
    question.submit(
        ask_question,
        inputs=[question, chatbot],
        outputs=[chatbot]
    )

if __name__ == '__main__':
    # Load the HuggingFace Hub token (etc.) from .env before serving.
    # NOTE(review): env vars are only loaded when run as a script, not on
    # import — confirm that is intentional.
    load_dotenv()
    demo.launch()