Spaces:

himanshukumar378
/

pdfchatbot

Sleeping

App Files Community

Himanshu kumar Vishwakrma committed on Jul 20, 2025

Commit

6288d51

1 Parent(s): b96bff0

rapp

Browse files

Files changed (1) hide show

app.py +57 -41

app.py CHANGED Viewed

@@ -10,36 +10,29 @@ from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
 from langchain_community.llms import HuggingFaceHub
-# Initialize global variables
 conversation = None
 chat_history = []
-process_complete = False
-def get_pdf_text(pdf_file):
-    """Extract text from PDF"""
-    reader = PdfReader(pdf_file)
     text = ""
-    for page in reader.pages:
-        text += page.extract_text() or ""
-    return text
-def get_docx_text(docx_file):
-    """Extract text from DOCX"""
-    doc = docx.Document(docx_file)
-    return "\n".join([para.text for para in doc.paragraphs])
-def get_files_text(files):
-    """Process multiple files"""
-    text = ""
-    for file in files:
-        if file.name.endswith(".pdf"):
-            text += get_pdf_text(file)
-        elif file.name.endswith(".docx"):
-            text += get_docx_text(file)
-    return text
 def get_text_chunks(text):
     """Split text into chunks"""
     text_splitter = CharacterTextSplitter(
         separator="\n",
         chunk_size=1000,
@@ -49,51 +42,74 @@ def get_text_chunks(text):
     return text_splitter.split_text(text)
 def get_vectorstore(text_chunks):
-    """Create vector store from text"""
     embeddings = HuggingFaceEmbeddings()
-    return FAISS.from_texts(text_chunks, embeddings)
 def get_conversation_chain(vectorstore):
-    """Initialize conversation chain"""
     llm = HuggingFaceHub(
-        repo_id="google/flan-t5-large",
-        model_kwargs={"temperature": 0.5, "max_length": 512}
     )
     memory = ConversationBufferMemory(
         memory_key='chat_history',
         return_messages=True
     )
-    return ConversationalRetrievalChain.from_llm(
         llm=llm,
         retriever=vectorstore.as_retriever(),
         memory=memory
     )
 def process_files(files):
     """Handle file processing"""
-    global conversation, process_complete
     if not files:
         return "Please upload files first"
     try:
-        raw_text = get_files_text(files)
         text_chunks = get_text_chunks(raw_text)
         vectorstore = get_vectorstore(text_chunks)
-        conversation = get_conversation_chain(vectorstore)
-        process_complete = True
         return "✅ Files processed successfully! You can now ask questions."
     except Exception as e:
-        return f"❌ Error: {str(e)}"
 def ask_question(question, history):
     """Handle question answering"""
     global conversation, chat_history
-    if not process_complete:
-        return history + [(question, "Please process files first")]
     if not question:
         return history
     try:
         response = conversation({"question": question})
         answer = response["answer"]
@@ -104,16 +120,16 @@ def ask_question(question, history):
 # Gradio Interface
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📄 PDF/DOCX Chatbot")
     with gr.Row():
         with gr.Column(scale=1):
             file_input = gr.File(
-                label="Upload Files",
-                file_types=[".pdf", ".docx"],
                 file_count="multiple"
             )
-            process_btn = gr.Button("Process Files")
             status = gr.Textbox(label="Status")
         with gr.Column(scale=2):
@@ -143,6 +159,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         outputs=[chatbot]
     )
-if __name__ == "__main__":
     load_dotenv()
     demo.launch()

 from langchain.memory import ConversationBufferMemory
 from langchain_community.llms import HuggingFaceHub
+# Module-level conversation state shared by the handlers below
 conversation = None  # stays None until get_conversation_chain() assigns a chain; ask_question() checks it
 chat_history = []  # declared global in process_files() and ask_question()
+def get_pdf_text(pdf_docs):
+    """Concatenate the extractable text of every PDF in ``pdf_docs``.
+
+    Each item is passed to ``PdfReader``; the text of every page that yields
+    a non-empty string is appended, followed by a newline. If reading an item
+    raises, the error is printed, the rest of that item is skipped, and the
+    loop continues with the next item (text already collected is kept).
+
+    Returns the accumulated text, or ``None`` when it is empty or
+    whitespace-only.
+    """
     text = ""
+    for pdf in pdf_docs:
+        try:
+            pdf_reader = PdfReader(pdf)
+            for page in pdf_reader.pages:
+                page_text = page.extract_text()
+                if page_text:  # skip pages for which extract_text() returned nothing
+                    text += page_text + "\n"
+        except Exception as e:  # best-effort: report the failure and move on to the next PDF
+            print(f"Error reading PDF: {str(e)}")
+    return text if text.strip() else None  # None tells the caller nothing usable was extracted
 def get_text_chunks(text):
     """Split text into chunks"""
+    if not text:
+        return []
     text_splitter = CharacterTextSplitter(
         separator="\n",
         chunk_size=1000,
     return text_splitter.split_text(text)
 def get_vectorstore(text_chunks):
+    """Build a FAISS vector store from ``text_chunks``.
+
+    Returns ``None`` when ``text_chunks`` is falsy (empty list or ``None``);
+    otherwise returns the result of ``FAISS.from_texts`` called with a
+    ``HuggingFaceEmbeddings`` instance constructed without arguments.
+    """
+    if not text_chunks:  # nothing to index
+        return None
     embeddings = HuggingFaceEmbeddings()  # constructed with no arguments
+    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
 def get_conversation_chain(vectorstore):
+    """Build the retrieval chat chain over ``vectorstore`` and publish it.
+
+    Creates a ``HuggingFaceHub`` LLM and a ``ConversationBufferMemory``, then
+    combines them with ``vectorstore.as_retriever()`` through
+    ``ConversationalRetrievalChain.from_llm``. The resulting chain is stored
+    in the module-level ``conversation`` variable and also returned.
+
+    ``vectorstore`` must not be ``None``: ``as_retriever()`` is called on it
+    unconditionally.
+    """
+    global conversation
     llm = HuggingFaceHub(
+        repo_id="google/flan-t5-xxl",
+        model_kwargs={"temperature":0.5, "max_length":512}
     )
     memory = ConversationBufferMemory(
         memory_key='chat_history',
         return_messages=True
     )
+    conversation = ConversationalRetrievalChain.from_llm(  # assigns the module-level name declared global above
         llm=llm,
         retriever=vectorstore.as_retriever(),
         memory=memory
     )
+    return conversation
 def process_files(files):
     """Run the upload-to-chain pipeline and return a status message.
 
     Steps: extract text with ``get_pdf_text``, split it with
     ``get_text_chunks``, index it with ``get_vectorstore``, then call
     ``get_conversation_chain``. A falsy ``files`` argument or a falsy result
     from any of the first three steps ends the run early with an explanatory
     message; any exception raised inside the ``try`` is caught and returned
     as an error message. The success message is returned only when every
     step completed.
     """
+    global conversation, chat_history  # NOTE(review): neither name is assigned in this function body
     if not files:
         return "Please upload files first"
     try:
+        # Step 1: extract raw text; get_pdf_text returns None when nothing was extracted
+        raw_text = get_pdf_text(files)
+        if not raw_text:
+            return "❌ Could not extract text from PDF(s). The file may be scanned or corrupted."
+        # Step 2: split the raw text into chunks
         text_chunks = get_text_chunks(raw_text)
+        if not text_chunks:
+            return "❌ No valid text chunks could be created."
+        # Step 3: build the vector store from the chunks
         vectorstore = get_vectorstore(text_chunks)
+        if not vectorstore:
+            return "❌ Failed to create vector store."
+        # Step 4: build the chain; the return value is ignored because the callee stores it in the global `conversation`
+        get_conversation_chain(vectorstore)
         return "✅ Files processed successfully! You can now ask questions."
     except Exception as e:  # surface any failure as a status string instead of raising
+        return f"❌ Error processing files: {str(e)}"
 def ask_question(question, history):
     """Handle question answering"""
     global conversation, chat_history
     if not question:
         return history
+    if not conversation:
+        return history + [(question, "Please process files first")]
     try:
         response = conversation({"question": question})
         answer = response["answer"]
 # Gradio Interface
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 📄 Chat with PDFs")
     with gr.Row():
         with gr.Column(scale=1):
             file_input = gr.File(
+                label="Upload PDFs",
+                file_types=[".pdf"],
                 file_count="multiple"
             )
+            process_btn = gr.Button("Process")
             status = gr.Textbox(label="Status")
         with gr.Column(scale=2):
         outputs=[chatbot]
     )
+if __name__ == '__main__':  # runs only when the file is executed as a script
     load_dotenv()  # presumably python-dotenv's loader; its import is not visible in this view
     demo.launch()  # launches the gr.Blocks app bound to `demo`