Spaces:

msmaje
/

ragmodel

Sleeping

App Files Files Community

msmaje committed on Jul 2, 2025

Commit

08b18a5

verified ·

1 Parent(s): c28a344

Create app.py

Browse files

Files changed (1) hide show

app.py +455 -0

app.py ADDED Viewed

	@@ -0,0 +1,455 @@

+import gradio as gr
+import os
+import tempfile
+import shutil
+from pathlib import Path
+import logging
+import zipfile
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# The LangChain stack is imported defensively: an ImportError is logged and
+# recorded in LANGCHAIN_AVAILABLE instead of aborting the module import.
+try:
+    from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
+    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from langchain_community.embeddings import HuggingFaceEmbeddings
+    from langchain_community.vectorstores import FAISS
+    from langchain.prompts import PromptTemplate
+    from langchain.chains import RetrievalQA
+    from langchain_community.llms import HuggingFaceHub
+    LANGCHAIN_AVAILABLE = True
+except ImportError as e:
+    logger.error(f"LangChain import error: {e}")
+    LANGCHAIN_AVAILABLE = False
+# Global variables for the RAG system
+# FAISS index; assigned by the PDF-processing functions.
+vectorstore = None
+# RetrievalQA chain; stays None until PDFs have been processed.
+retrieval_qa = None
+# Embedding model; assigned by initialize_models().
+embedding_model = None
+# Check for pre-existing PDF folder
+PDF_FOLDER_PATH = "./pdfs"  # Default folder for PDFs in the space
+# True when the folder exists and contains at least one directory entry of
+# any kind (not only PDFs). Evaluated once, at import time.
+PRELOADED_PDFS = os.path.exists(PDF_FOLDER_PATH) and len(os.listdir(PDF_FOLDER_PATH)) > 0
+def initialize_models():
+    """Initialize the embedding model and LLM"""
+    global embedding_model
+    try:
+        # Initialize embedding model
+        embedding_model = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-MiniLM-L6-v2",
+            model_kwargs={'device': 'cpu'}
+        )
+        # Get HuggingFace token from environment
+        hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+        if not hf_token:
+            return False, "❌ HuggingFace API token not found in environment variables"
+        # Initialize LLM
+        llm = HuggingFaceHub(
+            repo_id="microsoft/DialoGPT-medium",
+            model_kwargs={"temperature": 0.7, "max_new_tokens": 512},
+            huggingfacehub_api_token=hf_token
+        )
+        return True, "✅ Models initialized successfully"
+    except Exception as e:
+        logger.error(f"Model initialization error: {e}")
+        return False, f"❌ Error initializing models: {str(e)}"
+def load_preloaded_pdfs(chunk_size=1000, chunk_overlap=200):
+    """Load PDFs from the pre-existing folder"""
+    global vectorstore, retrieval_qa, embedding_model
+    if not LANGCHAIN_AVAILABLE:
+        return "❌ LangChain is not available. Please check the installation."
+    if not PRELOADED_PDFS:
+        return "❌ No pre-loaded PDFs found in ./pdfs folder."
+    try:
+        # Initialize models if not already done
+        if embedding_model is None:
+            success, message = initialize_models()
+            if not success:
+                return message
+        # Load documents from pre-existing folder
+        loader = PyPDFDirectoryLoader(PDF_FOLDER_PATH)
+        documents = loader.load()
+        if not documents:
+            return "❌ No documents were loaded from the PDFs folder."
+        # Split documents into chunks
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=int(chunk_size),
+            chunk_overlap=int(chunk_overlap)
+        )
+        chunks = text_splitter.split_documents(documents)
+        # Create vector store
+        vectorstore = FAISS.from_documents(chunks, embedding_model)
+        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
+        # Setup prompt template
+        prompt_template = """
+Use the following context to answer the question. If you cannot find the answer in the context, say "I don't have enough information to answer this question."
+Context:
+{context}
+Question: {question}
+Helpful Answer:
+"""
+        prompt = PromptTemplate(
+            input_variables=["context", "question"],
+            template=prompt_template
+        )
+        # Initialize LLM
+        hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+        llm = HuggingFaceHub(
+            repo_id="google/flan-t5-base",
+            model_kwargs={"temperature": 0.7, "max_new_tokens": 512},
+            huggingfacehub_api_token=hf_token
+        )
+        # Create RetrievalQA chain
+        retrieval_qa = RetrievalQA.from_chain_type(
+            llm=llm,
+            chain_type="stuff",
+            retriever=retriever,
+            return_source_documents=True,
+            chain_type_kwargs={"prompt": prompt}
+        )
+        pdf_files = [f for f in os.listdir(PDF_FOLDER_PATH) if f.endswith('.pdf')]
+        return f"✅ Successfully processed {len(documents)} documents from {len(pdf_files)} PDF files into {len(chunks)} chunks. Ready for questions!"
+    except Exception as e:
+        logger.error(f"Pre-loaded PDF processing error: {e}")
+        return f"❌ Error processing pre-loaded PDFs: {str(e)}"
+def extract_zip_to_pdfs(zip_file):
+    """Extract uploaded ZIP file to PDFs folder"""
+    if not zip_file:
+        return "❌ Please upload a ZIP file."
+    try:
+        # Create PDFs directory if it doesn't exist
+        os.makedirs(PDF_FOLDER_PATH, exist_ok=True)
+        # Extract ZIP file
+        with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
+            # Extract only PDF files
+            pdf_files = [f for f in zip_ref.namelist() if f.lower().endswith('.pdf')]
+            if not pdf_files:
+                return "❌ No PDF files found in the ZIP archive."
+            for pdf_file in pdf_files:
+                # Extract to PDFs folder
+                zip_ref.extract(pdf_file, PDF_FOLDER_PATH)
+                # If file is in a subfolder, move it to the root of PDFs folder
+                extracted_path = os.path.join(PDF_FOLDER_PATH, pdf_file)
+                if os.path.dirname(pdf_file):  # File is in a subfolder
+                    new_path = os.path.join(PDF_FOLDER_PATH, os.path.basename(pdf_file))
+                    shutil.move(extracted_path, new_path)
+                    # Clean up empty directories
+                    try:
+                        os.rmdir(os.path.dirname(extracted_path))
+                    except:
+                        pass
+        global PRELOADED_PDFS
+        PRELOADED_PDFS = True
+        return f"✅ Successfully extracted {len(pdf_files)} PDF files. Now click 'Load Pre-existing PDFs' to process them."
+    except Exception as e:
+        return f"❌ Error extracting ZIP file: {str(e)}"
+def process_pdfs(pdf_files, chunk_size, chunk_overlap):
+    """Process uploaded PDF files and create vector store"""
+    global vectorstore, retrieval_qa, embedding_model
+    if not LANGCHAIN_AVAILABLE:
+        return "❌ LangChain is not available. Please check the installation."
+    if not pdf_files:
+        return "❌ Please upload at least one PDF file or use pre-loaded PDFs."
+    try:
+        # Initialize models if not already done
+        if embedding_model is None:
+            success, message = initialize_models()
+            if not success:
+                return message
+        # Create temporary directory for PDFs
+        temp_dir = tempfile.mkdtemp()
+        # Save uploaded files to temp directory
+        for pdf_file in pdf_files:
+            if pdf_file is not None:
+                temp_path = os.path.join(temp_dir, os.path.basename(pdf_file.name))
+                shutil.copy2(pdf_file.name, temp_path)
+        # Load documents
+        loader = PyPDFDirectoryLoader(temp_dir)
+        documents = loader.load()
+        if not documents:
+            return "❌ No documents were loaded. Please check your PDF files."
+        # Split documents into chunks
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=int(chunk_size),
+            chunk_overlap=int(chunk_overlap)
+        )
+        chunks = text_splitter.split_documents(documents)
+        # Create vector store
+        vectorstore = FAISS.from_documents(chunks, embedding_model)
+        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
+        # Setup prompt template
+        prompt_template = """
+Use the following context to answer the question. If you cannot find the answer in the context, say "I don't have enough information to answer this question."
+Context:
+{context}
+Question: {question}
+Helpful Answer:
+"""
+        prompt = PromptTemplate(
+            input_variables=["context", "question"],
+            template=prompt_template
+        )
+        # Initialize LLM
+        hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+        llm = HuggingFaceHub(
+            repo_id="google/flan-t5-base",
+            model_kwargs={"temperature": 0.7, "max_new_tokens": 512},
+            huggingfacehub_api_token=hf_token
+        )
+        # Create RetrievalQA chain
+        retrieval_qa = RetrievalQA.from_chain_type(
+            llm=llm,
+            chain_type="stuff",
+            retriever=retriever,
+            return_source_documents=True,
+            chain_type_kwargs={"prompt": prompt}
+        )
+        # Clean up temp directory
+        shutil.rmtree(temp_dir)
+        return f"✅ Successfully processed {len(documents)} documents into {len(chunks)} chunks. Ready for questions!"
+    except Exception as e:
+        logger.error(f"PDF processing error: {e}")
+        return f"❌ Error processing PDFs: {str(e)}"
+def answer_question(question):
+    """Answer a question using the RAG system"""
+    global retrieval_qa
+    if not question.strip():
+        return "❌ Please enter a question.", ""
+    if retrieval_qa is None:
+        return "❌ Please upload and process PDF files first.", ""
+    try:
+        # Get answer from RAG system
+        result = retrieval_qa({"query": question})
+        answer = result["result"]
+        # Format source documents
+        sources = []
+        for i, doc in enumerate(result.get("source_documents", []), 1):
+            source = doc.metadata.get("source", "Unknown")
+            page = doc.metadata.get("page", "Unknown")
+            content_preview = doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
+            sources.append(f"**Source {i}:**\n- File: {Path(source).name}\n- Page: {page}\n- Preview: {content_preview}\n")
+        sources_text = "\n".join(sources) if sources else "No sources found."
+        return answer, sources_text
+    except Exception as e:
+        logger.error(f"Question answering error: {e}")
+        return f"❌ Error answering question: {str(e)}", ""
+def create_interface():
+    """Create the Gradio interface"""
+    with gr.Blocks(title="PDF RAG System", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("""
+        # 📚 PDF Question Answering System
+        Upload your PDF documents and ask questions about their content!
+        **Instructions:**
+        1. **Option A**: Upload individual PDF files and click "Process PDFs"
+        2. **Option B**: Upload a ZIP file containing PDFs and extract them
+        3. **Option C**: Use pre-loaded PDFs (if available in ./pdfs folder)
+        4. Ask questions about your documents
+        """)
+        # Check for pre-loaded PDFs
+        if PRELOADED_PDFS:
+            gr.Markdown("🎉 **Pre-loaded PDFs detected!** You can use the 'Load Pre-existing PDFs' button.")
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### 📄 Upload & Settings")
+                with gr.Tabs():
+                    with gr.TabItem("📁 Individual PDFs"):
+                        pdf_files = gr.File(
+                            label="Upload PDF Files",
+                            file_count="multiple",
+                            file_types=[".pdf"],
+                            height=150
+                        )
+                        process_btn = gr.Button("🔄 Process PDFs", variant="primary")
+                    with gr.TabItem("🗂️ ZIP Upload"):
+                        zip_file = gr.File(
+                            label="Upload ZIP File (containing PDFs)",
+                            file_count="single",
+                            file_types=[".zip"],
+                            height=100
+                        )
+                        extract_btn = gr.Button("📦 Extract ZIP to PDFs Folder", variant="secondary")
+                        extract_output = gr.Textbox(label="Extraction Status", lines=2)
+                    with gr.TabItem("💾 Pre-loaded"):
+                        if PRELOADED_PDFS:
+                            pdf_list = [f for f in os.listdir(PDF_FOLDER_PATH) if f.endswith('.pdf')]
+                            gr.Markdown(f"**Found {len(pdf_list)} PDF files:**")
+                            for pdf in pdf_list[:10]:  # Show first 10
+                                gr.Markdown(f"- {pdf}")
+                            if len(pdf_list) > 10:
+                                gr.Markdown(f"... and {len(pdf_list) - 10} more files")
+                        else:
+                            gr.Markdown("No pre-loaded PDFs found. Place PDF files in `./pdfs/` folder.")
+                        preload_btn = gr.Button("📚 Load Pre-existing PDFs", variant="primary",
+                                              interactive=PRELOADED_PDFS)
+                with gr.Row():
+                    chunk_size = gr.Slider(
+                        minimum=200,
+                        maximum=2000,
+                        value=1000,
+                        step=100,
+                        label="Chunk Size"
+                    )
+                    chunk_overlap = gr.Slider(
+                        minimum=0,
+                        maximum=500,
+                        value=200,
+                        step=50,
+                        label="Chunk Overlap"
+                    )
+                process_output = gr.Textbox(label="Processing Status", lines=4)
+            with gr.Column(scale=2):
+                gr.Markdown("### ❓ Ask Questions")
+                question_input = gr.Textbox(
+                    label="Your Question",
+                    placeholder="What would you like to know about your documents?",
+                    lines=2
+                )
+                ask_btn = gr.Button("🤔 Ask Question", variant="secondary")
+                with gr.Row():
+                    with gr.Column():
+                        answer_output = gr.Textbox(
+                            label="Answer",
+                            lines=8,
+                            max_lines=15
+                        )
+                    with gr.Column():
+                        sources_output = gr.Textbox(
+                            label="Sources",
+                            lines=8,
+                            max_lines=15
+                        )
+        # Event handlers
+        process_btn.click(
+            fn=process_pdfs,
+            inputs=[pdf_files, chunk_size, chunk_overlap],
+            outputs=[process_output]
+        )
+        preload_btn.click(
+            fn=load_preloaded_pdfs,
+            inputs=[chunk_size, chunk_overlap],
+            outputs=[process_output]
+        )
+        extract_btn.click(
+            fn=extract_zip_to_pdfs,
+            inputs=[zip_file],
+            outputs=[extract_output]
+        )
+        ask_btn.click(
+            fn=answer_question,
+            inputs=[question_input],
+            outputs=[answer_output, sources_output]
+        )
+        question_input.submit(
+            fn=answer_question,
+            inputs=[question_input],
+            outputs=[answer_output, sources_output]
+        )
+        # Example questions
+        gr.Markdown("""
+        ### 💡 Example Questions:
+        - What are the main topics covered in these documents?
+        - Can you summarize the key findings?
+        - What data is available for [specific topic]?
+        - What are the differences between [X] and [Y]?
+        - What are the differences in the uninsured rate by state in 2022?
+        """)
+    return demo
+if __name__ == "__main__":
+    # Check if running on HuggingFace Spaces
+    if os.getenv("SPACE_ID"):
+        demo = create_interface()
+        demo.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=False
+        )
+    else:
+        # Local development
+        demo = create_interface()
+        demo.launch(share=True)