Spaces:

mahmoudalrefaey
/

PDFPal-PDF-chatbot

Sleeping

App Files Files Community

mahmoudalrefaey committed on Jul 10, 2025

Commit

4b500d0

verified ·

1 Parent(s): b6babda

Upload 2 files

Browse files

Files changed (2) hide show

app.py +363 -0
config.py +108 -0

app.py ADDED Viewed

	@@ -0,0 +1,363 @@

+"""
+PDFPal - A lightweight, chat-based RAG application
+Built with free, local models and deployable via Gradio
+"""
+import os
+import tempfile
+import gradio as gr
+import time
+from typing import List, Dict, Any
+from pathlib import Path
+# Import our custom modules
+from modules.pdf_processor import PDFProcessor
+from modules.embedding_manager import EmbeddingManager
+from modules.llm_manager import LLMManager
+from modules.rag_pipeline import RAGPipeline
+from modules.chat_manager import ChatManager
+from config import Config
+class PDFPalApp:
+    """Main PDFPal application using Gradio"""
+    def __init__(self):
+        """Initialize the PDFPal application"""
+        self.chat_manager = ChatManager()
+        self.rag_pipeline = None
+        self.uploaded_files = []
+        self.current_model = Config.DEFAULT_LLM_MODEL
+        # Initialize components
+        self.pdf_processor = PDFProcessor()
+        self.embedding_manager = EmbeddingManager()
+        self.llm_manager = None
+        # Create Gradio interface
+        self.interface = self._create_interface()
+    def _create_interface(self):
+        """Create the Gradio interface"""
+        # Custom CSS for better styling
+        css = """
+        .gradio-container {
+            max-width: 1200px !important;
+            margin: auto !important;
+        }
+        .chat-container {
+            height: 600px;
+            overflow-y: auto;
+            border: 1px solid #e0e0e0;
+            border-radius: 8px;
+            padding: 20px;
+            background: #fafafa;
+        }
+        .file-upload {
+            border: 2px dashed #007bff;
+            border-radius: 8px;
+            padding: 20px;
+            text-align: center;
+            background: #f8f9fa;
+        }
+        """
+        with gr.Blocks(css=css, title="PDFPal - AI Chatbot", theme=gr.themes.Soft()) as interface:
+            # Header
+            gr.Markdown("""
+            # 📚 PDFPal - AI Chatbot
+            **Chat with your PDF documents using local AI models!**
+            Upload one or more PDF files and start asking questions in natural language.
+            """)
+            with gr.Row():
+                with gr.Column(scale=1):
+                    # Sidebar for configuration
+                    gr.Markdown("### ⚙️ Configuration")
+                    # Model selection
+                    model_dropdown = gr.Dropdown(
+                        choices=Config.get_model_names(),
+                        value=Config.get_recommended_model(),
+                        label="🤖 Language Model",
+                        info="Choose a lightweight local model"
+                    )
+                    # Advanced settings
+                    with gr.Accordion("🔧 Advanced Settings", open=False):
+                        chunk_size = gr.Slider(
+                            minimum=500, maximum=2000, value=800, step=100,
+                            label="Chunk Size", info="Size of text chunks (smaller = faster)"
+                        )
+                        chunk_overlap = gr.Slider(
+                            minimum=50, maximum=500, value=100, step=50,
+                            label="Chunk Overlap", info="Overlap between chunks"
+                        )
+                        max_tokens = gr.Slider(
+                            minimum=100, maximum=1000, value=300, step=50,
+                            label="Max Response Tokens", info="Maximum response length (smaller = faster)"
+                        )
+                        temperature = gr.Slider(
+                            minimum=0.0, maximum=1.0, value=0.7, step=0.1,
+                            label="Temperature", info="Creativity level"
+                        )
+                    # File upload section
+                    gr.Markdown("### 📁 Upload Documents")
+                    file_upload = gr.File(
+                        file_count="multiple",
+                        file_types=[".pdf"],
+                        label="Choose PDF files"
+                    )
+                    process_btn = gr.Button("🔄 Process Documents", variant="primary")
+                    process_status = gr.Textbox(label="Status", interactive=False)
+                    # Model info
+                    model_info = gr.JSON(label="Model Information", visible=False)
+                with gr.Column(scale=2):
+                    # Chat interface
+                    gr.Markdown("### 💬 Chat Interface")
+                    # Chat history display
+                    chat_history = gr.Chatbot(
+                        label="Conversation",
+                        height=500,
+                        show_label=False,
+                        container=True,
+                        bubble_full_width=False
+                    )
+                    # Chat input
+                    with gr.Row():
+                        chat_input = gr.Textbox(
+                            placeholder="Ask a question about your documents...",
+                            label="Your Question",
+                            scale=4
+                        )
+                        send_btn = gr.Button("Send", variant="primary", scale=1)
+                    # Clear chat button
+                    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
+                    # Export options
+                    with gr.Row():
+                        export_json_btn = gr.Button("📄 Export JSON")
+                        export_txt_btn = gr.Button("📝 Export Text")
+                    # Statistics
+                    stats_display = gr.JSON(label="Chat Statistics", visible=False)
+            # Event handlers
+            model_dropdown.change(
+                fn=self._change_model,
+                inputs=[model_dropdown],
+                outputs=[model_info, process_status]
+            )
+            process_btn.click(
+                fn=self._process_documents,
+                inputs=[file_upload, chunk_size, chunk_overlap, model_dropdown],
+                outputs=[process_status, model_info]
+            )
+            send_btn.click(
+                fn=self._send_message,
+                inputs=[chat_input, max_tokens, temperature],
+                outputs=[chat_history, chat_input, stats_display],
+                show_progress=True
+            )
+            chat_input.submit(
+                fn=self._send_message,
+                inputs=[chat_input, max_tokens, temperature],
+                outputs=[chat_history, chat_input, stats_display],
+                show_progress=True
+            )
+            clear_btn.click(
+                fn=self._clear_chat,
+                outputs=[chat_history, stats_display]
+            )
+            export_json_btn.click(
+                fn=self._export_conversation_json,
+                outputs=[gr.File()]
+            )
+            export_txt_btn.click(
+                fn=self._export_conversation_text,
+                outputs=[gr.File()]
+            )
+        return interface
+    def _change_model(self, model_name):
+        """Change the language model"""
+        try:
+            self.current_model = model_name
+            self.llm_manager = LLMManager(model_name=model_name)
+            model_info = self.llm_manager.get_model_info()
+            return model_info, f"✅ Model changed to {model_name}"
+        except Exception as e:
+            return {}, f"❌ Error changing model: {str(e)}"
+    def _process_documents(self, files, chunk_size, chunk_overlap, model_name):
+        """Process uploaded PDF documents"""
+        if not files:
+            return "⚠️ Please upload PDF files first", {}
+        try:
+            # Update processor settings
+            self.pdf_processor = PDFProcessor(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+            # Initialize LLM manager
+            self.llm_manager = LLMManager(model_name=model_name)
+            # Process all files
+            all_chunks = []
+            self.uploaded_files = []
+            for file in files:
+                # Handle different file object types from Gradio
+                if hasattr(file, 'read'):
+                    # File-like object
+                    file_content = file.read()
+                    file_name = getattr(file, 'name', f'file_{len(self.uploaded_files)}.pdf')
+                elif isinstance(file, str):
+                    # File path string
+                    with open(file, 'rb') as f:
+                        file_content = f.read()
+                    file_name = os.path.basename(file)
+                else:
+                    # Try to get content as bytes
+                    file_content = bytes(file) if hasattr(file, '__bytes__') else str(file).encode()
+                    file_name = f'file_{len(self.uploaded_files)}.pdf'
+                # Save uploaded file temporarily
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+                    tmp_file.write(file_content)
+                    tmp_path = tmp_file.name
+                try:
+                    # Process PDF
+                    chunks = self.pdf_processor.process_pdf(tmp_path)
+                    all_chunks.extend(chunks)
+                    self.uploaded_files.append(file_name)
+                finally:
+                    # Clean up temporary file
+                    os.unlink(tmp_path)
+            if all_chunks:
+                # Create knowledge base
+                knowledge_base = self.embedding_manager.create_knowledge_base(all_chunks)
+                # Initialize RAG pipeline
+                self.rag_pipeline = RAGPipeline(
+                    knowledge_base=knowledge_base,
+                    llm_manager=self.llm_manager
+                )
+                model_info = self.llm_manager.get_model_info()
+                status = f"✅ Processed {len(all_chunks)} text chunks from {len(files)} file(s)"
+                return status, model_info
+            else:
+                return "❌ No text could be extracted from the uploaded files", {}
+        except Exception as e:
+            return f"❌ Error processing files: {str(e)}", {}
+    def _send_message(self, message, max_tokens, temperature):
+        """Send a message and get response"""
+        start_time = time.time()
+        if not message.strip():
+            return self.chat_manager.get_messages(), "", {}
+        if not self.rag_pipeline:
+            # Add user message
+            self.chat_manager.add_message("user", message)
+            # Add error response
+            error_msg = "⚠️ Please upload and process documents first!"
+            self.chat_manager.add_message("assistant", error_msg)
+            return self.chat_manager.get_messages(), "", self.chat_manager.get_statistics()
+        try:
+            # Add user message
+            self.chat_manager.add_message("user", message)
+            # Get AI response with timing
+            response_start = time.time()
+            response = self.rag_pipeline.get_response(
+                message,
+                max_tokens=max_tokens,
+                temperature=temperature
+            )
+            response_time = time.time() - response_start
+            # Add AI response
+            self.chat_manager.add_message("assistant", response)
+            # Add performance info to statistics
+            total_time = time.time() - start_time
+            stats = self.chat_manager.get_statistics()
+            stats.update({
+                "response_time_seconds": round(response_time, 2),
+                "total_time_seconds": round(total_time, 2),
+                "performance_note": f"Response generated in {round(response_time, 2)}s"
+            })
+            return self.chat_manager.get_messages(), "", stats
+        except Exception as e:
+            error_msg = f"❌ Error: {str(e)}"
+            self.chat_manager.add_message("assistant", error_msg)
+            return self.chat_manager.get_messages(), "", self.chat_manager.get_statistics()
+    def _clear_chat(self):
+        """Clear chat history"""
+        self.chat_manager.clear_history()
+        return [], {}
+    def _export_conversation_json(self):
+        """Export conversation as JSON"""
+        try:
+            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.json')
+            self.chat_manager.save_conversation(temp_file.name)
+            return temp_file.name
+        except Exception as e:
+            return None
+    def _export_conversation_text(self):
+        """Export conversation as text"""
+        try:
+            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.txt')
+            self.chat_manager.export_conversation_text(temp_file.name)
+            return temp_file.name
+        except Exception as e:
+            return None
+    def launch(self, **kwargs):
+        """Launch the Gradio interface"""
+        return self.interface.launch(**kwargs)
+def main():
+    """Main entry point"""
+    app = PDFPalApp()
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        debug=True
+    )
+if __name__ == "__main__":
+    main()

config.py ADDED Viewed

	@@ -0,0 +1,108 @@

+"""
+Configuration file for PDFPal RAG application
+Centralized settings for easy customization
+"""
+import os
+from typing import Dict, Any
+class Config:
+    """Application configuration"""
+    # Application settings
+    APP_NAME = "PDFPal - AI Chatbot"
+    APP_VERSION = "1.0.0"
+    DEBUG = os.getenv("DEBUG", "False").lower() == "true"
+    # Model configurations
+    DEFAULT_LLM_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+    DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
+    # Available models
+    AVAILABLE_MODELS = {
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0": {
+            "name": "TinyLlama 1.1B",
+            "description": "Fast and efficient 1.1B parameter model",
+            "size": "1.1B",
+            "recommended": True
+        },
+        "microsoft/DialoGPT-medium": {
+            "name": "DialoGPT Medium",
+            "description": "Conversational model optimized for chat",
+            "size": "345M",
+            "recommended": False
+        },
+        "microsoft/phi-2": {
+            "name": "Phi-2",
+            "description": "High-quality 2.7B parameter model",
+            "size": "2.7B",
+            "recommended": False
+        }
+    }
+    # Default processing settings
+    DEFAULT_CHUNK_SIZE = 800  # Reduced from 1000 for faster processing
+    DEFAULT_CHUNK_OVERLAP = 100  # Reduced from 200
+    DEFAULT_MAX_TOKENS = 300  # Reduced from 500 for faster generation
+    DEFAULT_TEMPERATURE = 0.7
+    # RAG settings
+    DEFAULT_RETRIEVAL_K = 2  # Reduced from 4 for faster retrieval
+    DEFAULT_SEARCH_TYPE = "similarity"
+    # Chat settings
+    MAX_CHAT_HISTORY = 50  # Reduced from 100
+    MAX_CONTEXT_MESSAGES = 3  # Reduced from 5
+    # File settings
+    MAX_FILE_SIZE_MB = 25  # Reduced from 50
+    SUPPORTED_FILE_TYPES = ["pdf"]
+    # Performance settings
+    CACHE_RESPONSES = True
+    MAX_RETRIEVAL_DOCS = 1  # Reduced from 2 for fastest retrieval
+    ENABLE_RESPONSE_CACHING = True
+    USE_FASTER_MODEL = True
+    OPTIMIZE_FOR_SPEED = True
+    ENABLE_GPU = True
+    ENABLE_QUANTIZATION = True
+    CACHE_DIR = os.getenv("CACHE_DIR", ".cache")
+    # UI settings
+    SIDEBAR_EXPANDED = True
+    PAGE_LAYOUT = "wide"
+    @classmethod
+    def get_model_config(cls, model_name: str) -> Dict[str, Any]:
+        """Get configuration for a specific model"""
+        return cls.AVAILABLE_MODELS.get(model_name, cls.AVAILABLE_MODELS[cls.DEFAULT_LLM_MODEL])
+    @classmethod
+    def get_recommended_model(cls) -> str:
+        """Get the recommended model name"""
+        for model_name, config in cls.AVAILABLE_MODELS.items():
+            if config.get("recommended", False):
+                return model_name
+        return cls.DEFAULT_LLM_MODEL
+    @classmethod
+    def get_model_names(cls) -> list:
+        """Get list of available model names"""
+        return list(cls.AVAILABLE_MODELS.keys())
+    @classmethod
+    def validate_model_name(cls, model_name: str) -> bool:
+        """Validate if a model name is supported"""
+        return model_name in cls.AVAILABLE_MODELS
+    @classmethod
+    def get_ui_config(cls) -> Dict[str, Any]:
+        """Get UI configuration"""
+        return {
+            "page_title": cls.APP_NAME,
+            "page_icon": "📚",
+            "layout": cls.PAGE_LAYOUT,
+            "initial_sidebar_state": "expanded" if cls.SIDEBAR_EXPANDED else "collapsed"
+        }