Spaces:

manabb
/

CharBotWithPDF

Running

App Files Community

manabb committed on Dec 24, 2025

Commit

11bfceb

verified ·

1 Parent(s): 3b7db7f

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -179

app.py CHANGED Viewed

@@ -1,186 +1,90 @@
 # app.py
 import gradio as gr
-from langchain.document_loaders import PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
-from langchain.llms import HuggingFaceHub
 from langchain.chains import RetrievalQA
-from langchain.prompts import PromptTemplate
-import os
-import tempfile
-import datetime
-class PDFChatbotWithGradio:
-    def __init__(self):
-        self.vectorstore = None
-        self.qa_chain = None
-        self.embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2"
-        )
-        self.is_ready = False
-    def process_pdf(self, file_obj):
-        """Process uploaded PDF file - fixed to handle Gradio File object"""
-        try:
-            if file_obj is None:
-                return "Please select a PDF file first!"
-            # Extract the file path from Gradio's NamedString object
-            # Gradio File component returns an object with 'name' attribute
-            file_path = file_obj.name
-            # Load PDF using the file path
-            loader = PyPDFLoader(file_path)
-            documents = loader.load()
-            if not documents:
-                return "No content could be extracted from the PDF."
-            # Split text into chunks
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=1000,
-                chunk_overlap=200
-            )
-            chunks = text_splitter.split_documents(documents)
-            # Create vector store
-            self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
-            # Setup QA chain
-            self.setup_qa_chain()
-            self.is_ready = True
-            return f"PDF processed successfully! Loaded {len(documents)} pages and created {len(chunks)} chunks."
-        except Exception as e:
-            return f"Error processing PDF: {str(e)}"
-    def setup_qa_chain(self):
-        """Set up the question-answering chain"""
-        # Initialize the language model
-        llm = HuggingFaceHub(
-            repo_id="google/flan-t5-small",
-            model_kwargs={"temperature": 0.1, "max_length": 512}
-        )
-        # Custom prompt template
-        prompt_template = """You are a helpful assistant that answers questions based on the provided context.
-        Context: {context}
-        Question: {question}
-        Please provide a clear and concise answer based on the context above.
-        If the answer cannot be found in the context, say "I don't know based on the document."
-        Answer: """
-        PROMPT = PromptTemplate(
-            template=prompt_template,
-            input_variables=["context", "question"]
-        )
-        # Create retrieval QA chain
-        self.qa_chain = RetrievalQA.from_chain_type(
-            llm=llm,
-            chain_type="stuff",
-            retriever=self.vectorstore.as_retriever(search_kwargs={"k": 3}),
-            chain_type_kwargs={"prompt": PROMPT},
-            return_source_documents=True
-        )
-    def ask_question(self, question, history):
-        """Ask a question and get answer from the chatbot"""
-        if not self.is_ready:
-            return "Please upload and process a PDF first!", history
-        if not question.strip():
-            return "", history
-        try:
-            result = self.qa_chain({"query": question})
-            answer = result["result"]
-            # Format response with sources
-            response = f"{answer}\n\n**Sources:**"
-            for i, doc in enumerate(result["source_documents"][:2]):
-                page_num = doc.metadata.get('page', 'N/A')
-                if isinstance(page_num, int):
-                    page_num += 1  # Convert to 1-indexed for user readability
-                content_preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
-                response += f"\n{i+1}. Page {page_num}: {content_preview}"
-            # Update chat history
-            history.append((question, response))
-            return "", history
-        except Exception as e:
-            error_msg = f"Error: {str(e)}"
-            history.append((question, error_msg))
-            return "", history
-# Create chatbot instance
-chatbot = PDFChatbotWithGradio()
-# Create Gradio interface
-with gr.Blocks(title="PDF Chatbot Agent", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📄 PDF Chatbot Agent")
-    gr.Markdown("Upload a PDF document and ask questions about its content!")
-    with gr.Row():
-        with gr.Column(scale=1):
-            pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
-            upload_status = gr.Textbox(label="Upload Status", interactive=False)
-            process_btn = gr.Button("Process PDF", variant="primary")
-        with gr.Column(scale=2):
-            chatbot_interface = gr.Chatbot(label="Chat", height=400)
-            question_input = gr.Textbox(label="Your Question", placeholder="Ask a question about the PDF...")
-            with gr.Row():
-                submit_btn = gr.Button("Ask Question")
-                clear_btn = gr.Button("Clear Chat")
-    # Event handlers
-    process_btn.click(
-        fn=chatbot.process_pdf,
-        inputs=pdf_upload,
-        outputs=upload_status
-    )
-    def ask_and_clear(question, history):
-        return chatbot.ask_question(question, history)
-    submit_btn.click(
-        fn=ask_and_clear,
-        inputs=[question_input, chatbot_interface],
-        outputs=[question_input, chatbot_interface]
-    )
-    question_input.submit(
-        fn=ask_and_clear,
-        inputs=[question_input, chatbot_interface],
-        outputs=[question_input, chatbot_interface]
-    )
-    clear_btn.click(
-        fn=lambda: [],
-        inputs=[],
-        outputs=chatbot_interface
-    )
-    gr.Examples(
-        examples=[
-            "What is the main topic of this document?",
-            "Can you summarize the key points?",
-            "What are the main conclusions?",
-            "List the important findings mentioned."
-        ],
-        inputs=question_input
     )
-# Launch the application
 if __name__ == "__main__":
-    demo.launch(share=True)

 # app.py
+import os
 import gradio as gr
 from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.document_loaders import TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain.chains import RetrievalQA
+from langchain.llms import HuggingFacePipeline
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+from langchain.document_loaders import PyPDFLoader
+# Optional: Set HF Token if needed
+# os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'hf_XXXX'
+# Initialize embedding model
+embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+# Load HF model (lightweight for CPU)
+model_name = "google/flan-t5-small"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+# Wrap in pipeline
+pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
+llm = HuggingFacePipeline(pipeline=pipe)
+def process_file(file_path):
+    # Load & split document
+    #loader = TextLoader(file_path)
+    loader = PyPDFLoader(file_path)
+    documents = loader.load()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    docs = text_splitter.split_documents(documents)
+    # Create vector DB
+    vector_db = FAISS.from_documents(docs, embedding_model)
+    retriever = vector_db.as_retriever()
+    # Setup RetrievalQA chain
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=retriever
     )
+    return qa_chain
+# Store the QA chain globally (across UI events)
+qa_chain = None
+def upload_and_prepare(file):
+    global qa_chain
+    # qa_chain = process_file(file)
+    qa_chain = process_file(file.name)
+    return "✅ Document processed. You can now ask questions!"
+def ask_question(query):
+    if not qa_chain:
+        return "❌ Please upload a document first."
+    response = qa_chain.invoke({"query": query})
+    return response["result"]
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("## 🧠 Ask Questions About Your Document (LangChain + Hugging Face)")
+    with gr.Row():
+        file_input = gr.File(label="📄 Upload .txt File", type="filepath")
+        upload_btn = gr.Button("🔄 Process Document")
+    upload_output = gr.Textbox(label="📁 Status", interactive=False)
+    with gr.Row():
+        query_input = gr.Textbox(label="❓ Your Question")
+        query_btn = gr.Button("🧠 Get Answer")
+    answer_output = gr.Textbox(label="✅ Answer", lines=4)
+    upload_btn.click(upload_and_prepare, inputs=file_input, outputs=upload_output)
+    query_btn.click(ask_question, inputs=query_input, outputs=answer_output)
+# For local dev use: demo.launch()
+# For HF Spaces
 if __name__ == "__main__":
+    demo.launch()