manabb commited on
Commit
bbe6774
·
verified ·
1 Parent(s): e4e5c5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +179 -80
app.py CHANGED
@@ -1,87 +1,186 @@
1
  # app.py
2
- import os
3
  import gradio as gr
4
-
5
- from langchain.vectorstores import FAISS
6
  from langchain.embeddings import HuggingFaceEmbeddings
7
- from langchain.document_loaders import TextLoader
8
- from langchain_text_splitters import RecursiveCharacterTextSplitter
9
  from langchain.chains import RetrievalQA
10
- from langchain.llms import HuggingFacePipeline
11
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
12
-
13
- # Optional: Set HF Token if needed
14
- # os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'hf_XXXX'
15
-
16
- # Initialize embedding model
17
- embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
18
-
19
- # Load HF model (lightweight for CPU)
20
- model_name = "google/flan-t5-small"
21
- tokenizer = AutoTokenizer.from_pretrained(model_name)
22
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
23
-
24
- # Wrap in pipeline
25
- pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
26
- llm = HuggingFacePipeline(pipeline=pipe)
27
-
28
- def process_file(file_path):
29
- # Load & split document
30
- loader = TextLoader(file_path)
31
- documents = loader.load()
32
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
33
- docs = text_splitter.split_documents(documents)
34
-
35
- # Create vector DB
36
- vector_db = FAISS.from_documents(docs, embedding_model)
37
- retriever = vector_db.as_retriever()
38
-
39
- # Setup RetrievalQA chain
40
- qa_chain = RetrievalQA.from_chain_type(
41
- llm=llm,
42
- chain_type="stuff",
43
- retriever=retriever
44
- )
45
-
46
- return qa_chain
47
-
48
- # Store the QA chain globally (across UI events)
49
- qa_chain = None
50
-
51
- def upload_and_prepare(file):
52
- global qa_chain
53
- # qa_chain = process_file(file)
54
- qa_chain = process_file(file.name)
55
- return "✅ Document processed. You can now ask questions!"
56
-
57
- def ask_question(query):
58
- if not qa_chain:
59
- return " Please upload a document first."
60
- response = qa_chain.invoke({"query": query})
61
- return response["result"]
62
-
63
- # Gradio UI
64
- with gr.Blocks() as demo:
65
- gr.Markdown("## 🧠 Ask Questions About Your Document (LangChain + Hugging Face)")
66
-
67
- with gr.Row():
68
- file_input = gr.File(label="📄 Upload .txt File", type="filepath")
69
- upload_btn = gr.Button("🔄 Process Document")
70
-
71
- upload_output = gr.Textbox(label="📁 Status", interactive=False)
72
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  with gr.Row():
74
- query_input = gr.Textbox(label="❓ Your Question")
75
- query_btn = gr.Button("🧠 Get Answer")
76
-
77
- answer_output = gr.Textbox(label=" Answer", lines=4)
78
-
79
- upload_btn.click(upload_and_prepare, inputs=file_input, outputs=upload_output)
80
- query_btn.click(ask_question, inputs=query_input, outputs=answer_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
- # For local dev use: demo.launch()
83
- # For HF Spaces
84
  if __name__ == "__main__":
85
- demo.launch()
86
-
87
-
 
1
  # app.py
 
2
  import gradio as gr
3
+ from langchain.document_loaders import PyPDFLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.llms import HuggingFaceHub
8
  from langchain.chains import RetrievalQA
9
+ from langchain.prompts import PromptTemplate
10
+ import os
11
+ import tempfile
12
+ import datetime
13
+
14
class PDFChatbotWithGradio:
    """RAG chatbot over a single uploaded PDF.

    Pipeline: load PDF -> split into overlapping chunks -> embed into a
    FAISS vector store -> answer questions with a RetrievalQA chain backed
    by a Hugging Face Hub model, citing source pages in each answer.
    """

    def __init__(self):
        # Populated lazily by process_pdf() / setup_qa_chain().
        self.vectorstore = None
        self.qa_chain = None
        # Sentence-transformer embeddings used to index PDF chunks.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        # Guard flag: questions are refused until a PDF has been processed.
        self.is_ready = False

    def process_pdf(self, file_obj):
        """Process an uploaded PDF file (handles the Gradio File object).

        Returns a human-readable status string for the status textbox;
        errors are reported as strings rather than raised, so the UI
        never crashes on a bad upload.
        """
        try:
            if file_obj is None:
                return "Please select a PDF file first!"

            # Gradio's File component returns an object (NamedString) whose
            # 'name' attribute is the path of the uploaded temp file.
            file_path = file_obj.name

            loader = PyPDFLoader(file_path)
            documents = loader.load()

            if not documents:
                return "No content could be extracted from the PDF."

            # Overlapping chunks keep answers coherent across chunk edges.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200
            )
            chunks = text_splitter.split_documents(documents)

            # Build the similarity index and the QA chain on top of it.
            self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
            self.setup_qa_chain()

            self.is_ready = True
            return f"PDF processed successfully! Loaded {len(documents)} pages and created {len(chunks)} chunks."

        except Exception as e:
            return f"Error processing PDF: {str(e)}"

    def setup_qa_chain(self):
        """Set up the question-answering chain over the current vectorstore."""
        # NOTE(review): HuggingFaceHub requires HUGGINGFACEHUB_API_TOKEN in
        # the environment — confirm it is configured on the Space.
        llm = HuggingFaceHub(
            repo_id="google/flan-t5-small",
            model_kwargs={"temperature": 0.1, "max_length": 512}
        )

        # Custom prompt keeps the model grounded in retrieved context.
        prompt_template = """You are a helpful assistant that answers questions based on the provided context.

Context: {context}

Question: {question}

Please provide a clear and concise answer based on the context above.
If the answer cannot be found in the context, say "I don't know based on the document."

Answer: """

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        self.qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            # k=3: answer from the three most similar chunks.
            retriever=self.vectorstore.as_retriever(search_kwargs={"k": 3}),
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True
        )

    def ask_question(self, question, history):
        """Answer *question* and append the (question, answer) pair to *history*.

        Returns (textbox_value, history): the first element clears the input
        textbox on success. FIX: the not-ready message is now appended to the
        chat history (matching the error path) instead of being returned as
        the first element, where Gradio would overwrite the user's typed
        question in the input textbox.
        """
        if not self.is_ready:
            history.append((question, "Please upload and process a PDF first!"))
            return "", history

        if not question.strip():
            return "", history

        try:
            # .invoke() replaces the deprecated Chain.__call__ form.
            result = self.qa_chain.invoke({"query": question})
            answer = result["result"]

            # Append up to two source citations with page + preview.
            response = f"{answer}\n\n**Sources:**"
            for i, doc in enumerate(result["source_documents"][:2]):
                page_num = doc.metadata.get('page', 'N/A')
                if isinstance(page_num, int):
                    page_num += 1  # Convert to 1-indexed for user readability
                content_preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
                response += f"\n{i+1}. Page {page_num}: {content_preview}"

            history.append((question, response))
            return "", history

        except Exception as e:
            error_msg = f"Error: {str(e)}"
            history.append((question, error_msg))
            return "", history
125
# Single chatbot instance shared by all Gradio event handlers.
chatbot = PDFChatbotWithGradio()

# ----- Gradio interface -----
with gr.Blocks(title="PDF Chatbot Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 PDF Chatbot Agent")
    gr.Markdown("Upload a PDF document and ask questions about its content!")

    with gr.Row():
        # Left column: upload + processing controls.
        with gr.Column(scale=1):
            pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
            upload_status = gr.Textbox(label="Upload Status", interactive=False)
            process_btn = gr.Button("Process PDF", variant="primary")

        # Right column: the chat itself.
        with gr.Column(scale=2):
            chatbot_interface = gr.Chatbot(label="Chat", height=400)
            question_input = gr.Textbox(label="Your Question", placeholder="Ask a question about the PDF...")
            with gr.Row():
                submit_btn = gr.Button("Ask Question")
                clear_btn = gr.Button("Clear Chat")

    # Event wiring. The bound method is passed directly — the former
    # ask_and_clear wrapper only delegated to chatbot.ask_question.
    process_btn.click(
        fn=chatbot.process_pdf,
        inputs=pdf_upload,
        outputs=upload_status,
    )

    # Both the button click and pressing Enter submit a question.
    for trigger in (submit_btn.click, question_input.submit):
        trigger(
            fn=chatbot.ask_question,
            inputs=[question_input, chatbot_interface],
            outputs=[question_input, chatbot_interface],
        )

    clear_btn.click(fn=lambda: [], inputs=[], outputs=chatbot_interface)

    gr.Examples(
        examples=[
            "What is the main topic of this document?",
            "Can you summarize the key points?",
            "What are the main conclusions?",
            "List the important findings mentioned.",
        ],
        inputs=question_input,
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(share=True)