Update app.py
app.py
CHANGED
@@ -1,266 +1,87 @@
-
-import gradio as gr
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import FAISS
-# The next imports were truncated in the page extraction; reconstructed from usage below
-from langchain.document_loaders import PyPDFLoader
-from langchain.chains import RetrievalQA
-from langchain.llms import HuggingFaceHub
-from langchain.prompts import PromptTemplate
-import os
-import tempfile
-import datetime
-
-class PDFChatbot:  # class definition lines lost in extraction; name inferred from usage
-    def __init__(self):
-        self.vectorstore = None
-        self.qa_chain = None
-        self.embeddings = HuggingFaceEmbeddings()
-        self.is_ready = False
-        self.chat_history = []
-
-    def process_pdf(self, pdf_file):
-        """Process uploaded PDF file with enhanced error handling"""
-        try:
-            if pdf_file is None:
-                return "Please select a PDF file first!"
-
-            # Save uploaded file
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
-                tmp_file.write(pdf_file)
-                tmp_path = tmp_file.name
-
-            # Load and process PDF
-            loader = PyPDFLoader(tmp_path)
-            documents = loader.load()
-
-            # Clean up
-            os.unlink(tmp_path)
-
-            if not documents:
-                return "No content could be extracted from the PDF."
-
-            # Split text
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=800,
-                chunk_overlap=150,
-                length_function=len,
-            )
-
-            chunks = text_splitter.split_documents(documents)
-
-            # Create vector store
-            self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
-            self.setup_qa_chain()
-
-            self.is_ready = True
-            self.chat_history = []
-
-            return f"✅ Success! Processed {len(documents)} pages into {len(chunks)} chunks. You can now ask questions!"
-
-        except Exception as e:
-            return f"❌ Error: {str(e)}"
-
-    def setup_qa_chain(self):
-        """Setup QA chain with enhanced prompt"""
-        llm = HuggingFaceHub(
-            repo_id="google/flan-t5-small",
-            model_kwargs={"temperature": 0.2, "max_length": 512, "repetition_penalty": 1.1}
-        )
-
-        # Prompt body partially lost in extraction; Context/Question scaffolding reconstructed
-        prompt_template = """As an AI assistant, provide accurate answers based on the given context.
-
-Context: {context}
-
-Question: {question}
-
-Guidelines:
-- If the answer isn't in the context, say "I cannot find this information in the document"
-- Use bullet points for lists when appropriate
-- Be helpful and professional
-
-Answer:"""
-
-        PROMPT = PromptTemplate(
-            template=prompt_template,
-            input_variables=["context", "question"]
-        )
-
-        self.qa_chain = RetrievalQA.from_chain_type(
-            llm=llm,
-            chain_type="stuff",
-            retriever=self.vectorstore.as_retriever(
-                search_type="similarity",
-                search_kwargs={"k": 4}
-            ),
-            chain_type_kwargs={"prompt": PROMPT},
-            return_source_documents=True
-        )
-
-    def ask_question(self, question, history):
-        """Ask question with enhanced response formatting"""
-        if not self.is_ready:
-            return "Please upload and process a PDF first!", history
-
-        if not question.strip():
-            return "", history
-
-        try:
-            # Add timestamp
-            timestamp = datetime.datetime.now().strftime("%H:%M:%S")
-
-            result = self.qa_chain({"query": question})
-            answer = result["result"]
-
-            # Format response
-            formatted_response = f"**{timestamp}**\n\n{answer}\n\n---\n**Sources:**"
-
-            for i, doc in enumerate(result["source_documents"][:3]):
-                page_num = doc.metadata.get('page', 0) + 1  # convert 0-indexed page to 1-indexed (an 'N/A' default would crash on + 1)
-                content = doc.page_content.replace('\n', ' ').strip()
-                preview = content[:120] + "..." if len(content) > 120 else content
-                formatted_response += f"\n• Page {page_num}: {preview}"
-
-            # Update history
-            history.append((question, formatted_response))
-            self.chat_history = history
-
-            return "", history
-
-        except Exception as e:
-            error_msg = f"Error processing your question: {str(e)}"
-            history.append((question, error_msg))
-            return "", history
-
-    def clear_chat(self):
-        """Clear chat history"""
-        self.chat_history = []
-        return []
-
-# Create chatbot instance
-enhanced_chatbot = PDFChatbot()
-
-# Create Gradio interface (title markdown lost in extraction)
-with gr.Blocks() as demo:
-    gr.Markdown("""
-    ...
-    """)
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            with gr.Group():
-                gr.Markdown("### 📄 Document Upload")
-                pdf_input = gr.File(
-                    label="Upload PDF File",
-                    file_types=[".pdf"],
-                    type="binary"
-                )
-                upload_btn = gr.Button("Process Document", variant="primary")
-                status_output = gr.Textbox(label="Status", interactive=False)
-
-            with gr.Group():
-                gr.Markdown("### ⚙️ Settings")
-                chunk_size = gr.Slider(
-                    minimum=500,
-                    maximum=2000,
-                    value=800,
-                    step=100,
-                    label="Chunk Size"
-                )
-                temperature = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=0.2,
-                    step=0.1,
-                    label="Temperature"
-                )
-
-        with gr.Column(scale=2):
-            gr.Markdown("### 💬 Chat Interface")
-            chatbot = gr.Chatbot(height=450, show_copy_button=True)
-
-            with gr.Row():
-                question_box = gr.Textbox(
-                    placeholder="Ask a question about the PDF...",
-                    label="Your Question",
-                    scale=4
-                )
-                ask_btn = gr.Button("Ask", scale=1)
-
-            with gr.Row():
-                clear_btn = gr.Button("Clear Chat", variant="secondary")
-                export_btn = gr.Button("Export Chat", variant="secondary")
-
-            # Examples
-            gr.Examples(
-                examples=[
-                    "What is the main purpose of this document?",
-                    "Summarize the key points in bullet form",
-                    "What are the main findings or conclusions?",
-                    "List any recommendations mentioned"
-                ],
-                inputs=question_box,
-                label="Example Questions"
-            )
-
-    # Event handlers
-    upload_btn.click(
-        fn=enhanced_chatbot.process_pdf,
-        inputs=pdf_input,
-        outputs=status_output
-    )
-
-    def ask_question_wrapper(question, history):
-        return enhanced_chatbot.ask_question(question, history)
-
-    ask_btn.click(
-        fn=ask_question_wrapper,
-        inputs=[question_box, chatbot],
-        outputs=[question_box, chatbot]
-    )
-
-    question_box.submit(
-        fn=ask_question_wrapper,
-        inputs=[question_box, chatbot],
-        outputs=[question_box, chatbot]
-    )
-
-    clear_btn.click(
-        fn=enhanced_chatbot.clear_chat,
-        inputs=[],
-        outputs=chatbot
-    )
-
-    # Export functionality
-    def export_chat():
-        if not enhanced_chatbot.chat_history:
-            return "No chat history to export!"
-
-        export_text = "PDF Chatbot Conversation Export\n"
-        export_text += "=" * 40 + "\n\n"
-
-        for i, (question, answer) in enumerate(enhanced_chatbot.chat_history, 1):
-            export_text += f"Q{i}: {question}\n"
-            export_text += f"A{i}: {answer}\n"
-            export_text += "-" * 30 + "\n"
-
-        return export_text
-
-    export_btn.click(
-        fn=export_chat,
-        inputs=[],
-        outputs=gr.Textbox(label="Exported Chat", lines=20)
-    )
-
-if __name__ == "__main__":
-    demo.launch(
-        share=True
-    )
+# app.py
+import os
+import gradio as gr
+
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.document_loaders import TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain.chains import RetrievalQA
+from langchain.llms import HuggingFacePipeline
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+
+# Optional: Set HF Token if needed
+# os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'hf_XXXX'
+
+# Initialize embedding model
+embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+# Load HF model (lightweight for CPU)
+model_name = "google/flan-t5-small"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+# Wrap in pipeline
+pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
+llm = HuggingFacePipeline(pipeline=pipe)
+
+def process_file(file_path):
+    # Load & split document
+    loader = TextLoader(file_path)
+    documents = loader.load()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    docs = text_splitter.split_documents(documents)
+
+    # Create vector DB
+    vector_db = FAISS.from_documents(docs, embedding_model)
+    retriever = vector_db.as_retriever()
+
+    # Setup RetrievalQA chain
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=retriever
+    )
+
+    return qa_chain
+
+# Store the QA chain globally (across UI events)
+qa_chain = None
+
+def upload_and_prepare(file):
+    global qa_chain
+    # type="filepath" hands the handler a plain string path, so use it directly
+    # (a str has no .name attribute, which the earlier file.name call assumed)
+    qa_chain = process_file(file)
+    return "✅ Document processed. You can now ask questions!"
+
+def ask_question(query):
+    if not qa_chain:
+        return "❌ Please upload a document first."
+    response = qa_chain.invoke({"query": query})
+    return response["result"]
+
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("## 🧠 Ask Questions About Your Document (LangChain + Hugging Face)")
+
+    with gr.Row():
+        file_input = gr.File(label="📁 Upload .txt File", type="filepath")
+        upload_btn = gr.Button("🔄 Process Document")
+
+    upload_output = gr.Textbox(label="📢 Status", interactive=False)
+
+    with gr.Row():
+        query_input = gr.Textbox(label="❓ Your Question")
+        query_btn = gr.Button("🧠 Get Answer")
+
+    answer_output = gr.Textbox(label="✅ Answer", lines=4)
+
+    upload_btn.click(upload_and_prepare, inputs=file_input, outputs=upload_output)
+    query_btn.click(ask_question, inputs=query_input, outputs=answer_output)
+
+# For local dev use: demo.launch()
+# For HF Spaces
+if __name__ == "__main__":
+    demo.launch()
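
For a quick smoke test outside the Gradio UI, here is a minimal sketch of the same retrieval pipeline, assuming the new file is saved as app.py with a sample.txt beside it (both the file name and the query are illustrative):

    # Build the FAISS index and RetrievalQA chain headlessly, then query it once.
    # Importing app is safe here: demo.launch() only runs under the __main__ guard.
    from app import process_file

    chain = process_file("sample.txt")  # hypothetical plain-text document
    result = chain.invoke({"query": "What is this document about?"})
    print(result["result"])  # RetrievalQA returns a dict; the answer is under "result"
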