Adieee5 committed on
Commit
e9ce2a7
·
verified ·
1 Parent(s): f53c091

Upload 8 files

Browse files
Files changed (8) hide show
  1. Dockerfile +25 -0
  2. app.py +332 -0
  3. config.py +22 -0
  4. pdf_processor.py +98 -0
  5. rag_engine.py +90 -0
  6. requirements.txt +7 -0
  7. vector_db/.DS_Store +0 -0
  8. vector_store.py +115 -0
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

# Stream Python stdout/stderr straight to the container log (no buffering)
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# Install compilers needed to build native wheels (e.g. for faiss/PyMuPDF);
# --no-install-recommends keeps the image small, and clearing the apt lists
# avoids baking the package index into the layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies in their own layer, before copying the code,
# so dependency installation is cached across source-only changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create runtime directories for uploaded PDFs and the FAISS index
RUN mkdir -p uploads vector_db

# Gradio's default port, matched by app.py's launch(server_port=7860)
EXPOSE 7860

# Run the application
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import uuid
4
+ import tempfile
5
+ from typing import List, Tuple, Optional
6
+ from config import Config
7
+ from pdf_processor import PDFProcessor
8
+ from vector_store import VectorStore
9
+ from rag_engine import RAGEngine
10
+
11
+ # Initialize components
12
# Module-level singletons: every Gradio callback below closes over these
# three objects rather than constructing them per request.

# Splits extracted PDF text into overlapping chunks (sizes from Config).
pdf_processor = PDFProcessor(
    chunk_size=Config.CHUNK_SIZE,
    chunk_overlap=Config.CHUNK_OVERLAP
)

# Embeds chunks and persists a FAISS index under Config.VECTOR_DB_PATH.
vector_store = VectorStore(
    model_name=Config.EMBEDDING_MODEL,
    vector_db_path=Config.VECTOR_DB_PATH
)

# Answers questions by retrieving from vector_store and calling the LLM.
rag_engine = RAGEngine(vector_store)
23
+
24
def upload_and_process_pdfs(files: List[tempfile._TemporaryFileWrapper]) -> str:
    """Index every uploaded PDF into the vector store.

    Returns a human-readable status string for the upload textbox; errors
    are reported in the string rather than raised (Gradio renders it).
    """
    if not files:
        return "❌ No files uploaded."

    try:
        processed_names = []
        chunk_total = 0

        for upload in files:
            if upload is None:
                continue

            path = upload.name
            name = os.path.basename(path)

            # Anything that is not a PDF is silently skipped.
            if not name.lower().endswith('.pdf'):
                continue

            # Extract, chunk, and embed this document.
            doc_chunks = pdf_processor.extract_text_from_pdf(path)
            vector_store.add_documents(doc_chunks)

            processed_names.append(name)
            chunk_total += len(doc_chunks)

        if not processed_names:
            return "❌ No valid PDF files found."

        stats = vector_store.get_stats()
        return (
            f"✅ Successfully processed {len(processed_names)} PDF(s):\n"
            f"📄 Files: {', '.join(processed_names)}\n"
            f"📊 Total chunks created: {chunk_total}\n"
            f"🗃️ Database now contains {stats['total_documents']} total documents"
        )

    except Exception as e:
        return f"❌ Error processing files: {str(e)}"
64
+
65
def get_database_stats() -> str:
    """Render the vector store's current statistics as Markdown."""
    info = vector_store.get_stats()
    return (
        "📊 **Database Statistics**\n\n"
        f"📄 Total Documents: {info['total_documents']}\n"
        f"🔍 Index Size: {info['index_size']}\n"
        f"📏 Vector Dimension: {info.get('dimension', 'N/A')}"
    )
72
+
73
def clear_database() -> str:
    """Wipe the entire vector database; return a status string for the UI."""
    try:
        vector_store.clear_index()
    except Exception as err:
        return f"❌ Error clearing database: {str(err)}"
    return "✅ Database cleared successfully!"
80
+
81
def respond(message: str, chat_history: List[dict]) -> Tuple[str, List[dict]]:
    """Handle one chat turn in Gradio's messages format.

    Appends the user message and the assistant's RAG answer (or an error
    string) to chat_history, and returns "" to clear the input box.
    """
    if not message.strip():
        return "", chat_history

    try:
        # Run the RAG pipeline.
        result = rag_engine.generate_answer(message, top_k=Config.TOP_K)

        answer = result['answer']
        found_sources = result.get('sources', [])

        # Append up to three source citations under the answer.
        if found_sources:
            answer += "\n\n**📚 Sources:**\n"
            for idx, src in enumerate(found_sources[:3], 1):
                answer += f"{idx}. 📄 **{src['source_file']}** (Page {src['page_number']})\n"
                answer += f" 📝 _{src['content_preview']}_\n"
    except Exception as exc:
        # Errors are shown to the user as the assistant's reply.
        answer = f"❌ Error: {str(exc)}"

    chat_history.append({"role": "user", "content": message})
    chat_history.append({"role": "assistant", "content": answer})

    return "", chat_history
116
+
117
def create_interface():
    """Create the Gradio interface.

    Builds a three-tab Blocks app — document management, chat, and system
    information — and wires all event handlers. Returns the un-launched
    Blocks object; the caller is responsible for .launch().
    """

    with gr.Blocks(title="PDF RAG System") as interface:

        # Header
        gr.Markdown("# 🤖 PDF RAG Assistant")
        gr.Markdown("Upload PDFs and ask intelligent questions about their content using AI")

        with gr.Tabs():

            # Tab 1: Document Management
            with gr.Tab("📁 Document Management"):

                with gr.Row():
                    with gr.Column(scale=2):
                        gr.Markdown("## 📤 Upload PDF Documents")
                        gr.Markdown("Drag and drop your PDF files or click to browse")

                        file_upload = gr.File(
                            file_count="multiple",
                            file_types=[".pdf"],
                            label="Select PDF files to upload"
                        )

                        upload_btn = gr.Button(
                            "🚀 Process PDFs",
                            variant="primary",
                            size="lg"
                        )

                        upload_status = gr.Textbox(
                            label="📊 Upload Status",
                            interactive=False,
                            max_lines=8
                        )

                    with gr.Column(scale=1):
                        gr.Markdown("## 🗄️ Database Management")

                        # Rendered once at build time; the handlers below
                        # refresh it after uploads / clears.
                        stats_display = gr.Markdown(get_database_stats())

                        with gr.Row():
                            refresh_btn = gr.Button("🔄 Refresh", size="sm", variant="secondary")
                            clear_btn = gr.Button("🗑️ Clear Database", size="sm", variant="stop")

                        clear_status = gr.Textbox(
                            label="🔧 Database Status",
                            interactive=False,
                            max_lines=3
                        )

                # Event handlers for document management
                def update_stats_display():
                    # Thin wrapper so .then() re-queries the stats lazily
                    # instead of capturing a stale string.
                    return get_database_stats()

                # Upload, then refresh the stats panel.
                upload_btn.click(
                    fn=upload_and_process_pdfs,
                    inputs=[file_upload],
                    outputs=[upload_status]
                ).then(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

                refresh_btn.click(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

                # Clear the database, then refresh the stats panel.
                clear_btn.click(
                    fn=clear_database,
                    outputs=[clear_status]
                ).then(
                    fn=update_stats_display,
                    outputs=[stats_display]
                )

            # Tab 2: Chat Interface
            with gr.Tab("💬 AI Assistant"):

                gr.Markdown("## 🤖 Ask questions about your uploaded documents")
                gr.Markdown("**💡 Tips:** Upload PDFs first, then ask specific questions about their content for detailed answers with source references.")

                # Create chat interface with messages format (list of
                # {"role": ..., "content": ...} dicts), seeded with a greeting.
                chatbot = gr.Chatbot(
                    height=500,
                    show_label=False,
                    type="messages",
                    value=[{
                        "role": "assistant",
                        "content": "👋 **Welcome to PDF RAG Assistant!**\n\nI'm here to help you analyze and understand your PDF documents. \n\n📋 **Getting started:**\n1. Upload PDFs in the 'Document Management' tab\n2. Come back here and ask me questions\n3. I'll provide detailed answers with source references\n\n🚀 **Ready to get started?**"
                    }]
                )

                with gr.Row():
                    msg_input = gr.Textbox(
                        placeholder="💭 Ask a question about your documents...",
                        label="Your Question",
                        lines=2,
                        scale=4
                    )
                    send_btn = gr.Button(
                        "📨 Send",
                        variant="primary",
                        size="lg",
                        scale=1
                    )

                clear_chat_btn = gr.Button(
                    "🧹 Clear Chat",
                    variant="secondary",
                    size="sm"
                )

                # Event handlers for chat: button click and Enter both send.
                send_btn.click(
                    fn=respond,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot]
                )

                msg_input.submit(
                    fn=respond,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot]
                )

                # Reset the history to a fresh greeting message.
                clear_chat_btn.click(
                    fn=lambda: [{
                        "role": "assistant",
                        "content": "👋 **Welcome back!**\n\nI'm ready to help you with your PDF documents again. What would you like to know?"
                    }],
                    outputs=[chatbot]
                )

            # Tab 3: System Information (static, built once at startup)
            with gr.Tab("ℹ️ System Information"):

                gr.Markdown("# ⚙️ System Configuration & Information")

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("## 🔧 Current Settings")

                        settings_info = f"""
**🧠 Embedding Model:** `{Config.EMBEDDING_MODEL}`

**📝 Chunk Size:** {Config.CHUNK_SIZE} characters

**🔗 Chunk Overlap:** {Config.CHUNK_OVERLAP} characters

**🎯 Search Results:** Top {Config.TOP_K} most relevant chunks

**📁 Max File Size:** 16MB per PDF
"""
                        gr.Markdown(settings_info)

                    with gr.Column():
                        gr.Markdown("## 🚀 Key Features")

                        features_info = """
✅ Multiple PDF upload and processing

✅ Intelligent text chunking

✅ Vector similarity search using FAISS

✅ AI-powered Q&A with Google Gemini

✅ Source attribution with page numbers

✅ Persistent vector database storage

✅ Real-time chat interface

✅ Responsive modern UI
"""
                        gr.Markdown(features_info)

                gr.Markdown("## 🛠️ Technology Stack")

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("**🖥️ Framework:** Gradio 4.44+")
                        gr.Markdown("**📄 PDF Processing:** PyMuPDF")
                    with gr.Column():
                        gr.Markdown("**🧮 Embeddings:** Sentence Transformers")
                        gr.Markdown("**🗃️ Vector Database:** FAISS")
                    with gr.Column():
                        gr.Markdown("**🤖 Language Model:** Google Gemini 1.5")

                gr.Markdown("## 📝 Quick Start Guide")

                guide_info = """
**1.** Upload Documents - Go to 'Document Management' tab and upload your PDF files

**2.** Process & Index - Wait for the system to extract text and create embeddings

**3.** Ask Questions - Switch to 'AI Assistant' tab and start asking questions

**4.** Get Intelligent Answers - Receive detailed responses with source references and page numbers
"""
                gr.Markdown(guide_info)

    return interface
323
+
324
if __name__ == "__main__":
    # Build the UI and bind to all interfaces so the app is reachable from
    # outside the Docker container on the exposed port (7860).
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
config.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ load_dotenv()
5
+
6
class Config:
    """Central application settings, read once at import time."""

    # Google Gemini API key; must be supplied via the environment or .env.
    GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')

    # NOTE(review): placeholder default — override SECRET_KEY in deployment.
    SECRET_KEY = os.environ.get('SECRET_KEY', 'your-secret-key-here')

    UPLOAD_FOLDER = 'uploads'                # where uploaded PDFs land
    VECTOR_DB_PATH = 'vector_db'             # FAISS index + metadata directory
    MAX_CONTENT_LENGTH = 16 * 1024 * 1024    # 16 MB upload cap

    # Retrieval settings
    EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
    CHUNK_SIZE = 1000      # characters per chunk
    CHUNK_OVERLAP = 200    # characters shared between consecutive chunks

    TOP_K = 5              # chunks retrieved per query

    ALLOWED_EXTENSIONS = {'pdf'}

# Ensure the runtime directories exist at import time. These calls must sit
# at module level (after the class statement): the name `Config` is only
# bound once the class body has finished executing.
os.makedirs(Config.UPLOAD_FOLDER, exist_ok=True)
os.makedirs(Config.VECTOR_DB_PATH, exist_ok=True)
pdf_processor.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import os
3
+ import re
4
+ from typing import List, Dict, Any
5
+ from dataclasses import dataclass
6
+
7
@dataclass
class DocumentChunk:
    """One chunk of extracted PDF text plus provenance for citations."""
    content: str              # the chunk's cleaned text
    metadata: Dict[str, Any]  # filename / page_number / chunk_length
    page_number: int          # 1-based page the chunk came from
    source_file: str          # basename of the originating PDF
13
+
14
class PDFProcessor:
    """Extracts text from PDFs and splits it into overlapping chunks."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        # Target chunk length, and how many trailing characters of each
        # chunk carry over into the next one to preserve context.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, pdf_path: str) -> List[DocumentChunk]:
        """Extract text from PDF and return chunks with metadata.

        Raises:
            Exception: wrapping any PyMuPDF/IO failure, with the path and
                the original cause chained via ``from``.
        """
        chunks: List[DocumentChunk] = []

        try:
            doc = fitz.open(pdf_path)
            try:
                filename = os.path.basename(pdf_path)

                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    text = page.get_text()

                    # Skip blank pages entirely.
                    if text.strip():
                        cleaned_text = self._clean_text(text)
                        page_chunks = self._create_chunks(cleaned_text, page_num + 1, filename)
                        chunks.extend(page_chunks)
            finally:
                # BUGFIX: always release the document handle, even when a
                # page fails mid-loop (the original leaked it on error).
                doc.close()

            return chunks

        except Exception as e:
            raise Exception(f"Error processing PDF {pdf_path}: {str(e)}") from e

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text."""
        # Collapse all whitespace runs, drop characters outside the basic
        # word/punctuation set, and squash repeated punctuation (e.g. "!!!").
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:()\[\]{}"-]', '', text)
        text = re.sub(r'([.,!?;:]){2,}', r'\1', text)

        return text.strip()

    def _create_chunks(self, text: str, page_number: int, filename: str) -> List[DocumentChunk]:
        """Split text into overlapping, sentence-aligned chunks."""
        chunks = []
        # Split on sentence-ending punctuation so chunk boundaries fall
        # between sentences rather than inside them.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        current_chunk = ""
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence)

            if current_length + sentence_length > self.chunk_size and current_chunk:
                chunks.append(self._make_chunk(current_chunk, page_number, filename))

                # Seed the next chunk with the tail of this one for context.
                overlap_text = self._get_overlap_text(current_chunk)
                current_chunk = overlap_text + " " + sentence
                current_length = len(current_chunk)
            else:
                current_chunk += " " + sentence if current_chunk else sentence
                current_length = len(current_chunk)

        # Flush whatever remains as the final chunk.
        if current_chunk.strip():
            chunks.append(self._make_chunk(current_chunk, page_number, filename))

        return chunks

    def _make_chunk(self, text: str, page_number: int, filename: str) -> DocumentChunk:
        """Wrap chunk text in a DocumentChunk with standard metadata."""
        return DocumentChunk(
            content=text.strip(),
            metadata={
                'filename': filename,
                'page_number': page_number,
                'chunk_length': len(text)
            },
            page_number=page_number,
            source_file=filename
        )

    def _get_overlap_text(self, text: str) -> str:
        """Get overlap text from the end of current chunk."""
        if len(text) <= self.chunk_overlap:
            return text
        return text[-self.chunk_overlap:]
rag_engine.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ from typing import List, Dict, Any
3
+ from vector_store import VectorStore
4
+ from config import Config
5
+
6
class RAGEngine:
    """Retrieval-augmented generation: fetches the most similar chunks from
    the vector store, then asks Gemini to answer strictly from that context.
    """

    def __init__(self, vector_store: VectorStore):
        # Retrieval backend that supplies context chunks for each query.
        self.vector_store = vector_store

        # Configure the Gemini client; the key comes from the environment.
        genai.configure(api_key=Config.GEMINI_API_KEY) # type: ignore
        self.model = genai.GenerativeModel('gemini-2.0-flash-lite') # type: ignore

    def generate_answer(self, query: str, top_k: int = 5) -> Dict[str, Any]:
        """Generate answer using RAG pipeline.

        Returns a dict with 'answer', 'sources', and 'context_used' keys
        (plus 'query' on success, or 'error' on failure). Never raises —
        failures are reported inside the returned dict so the UI can render
        them directly.
        """
        try:

            search_results = self.vector_store.search(query, top_k)

            # Nothing indexed, or nothing similar enough to the query.
            if not search_results:
                return {
                    'answer': "I couldn't find any relevant information in the uploaded documents to answer your question.",
                    'sources': [],
                    'context_used': ""
                }


            context_parts = []
            sources = []

            for i, result in enumerate(search_results):
                # Label each chunk so the model can cite "[Context N]".
                context_parts.append(f"[Context {i+1}]: {result['content']}")
                sources.append({
                    'source_file': result['source_file'],
                    'page_number': result['page_number'],
                    'similarity_score': result['similarity_score'],
                    # Truncated preview used for display in the chat UI.
                    'content_preview': result['content'][:200] + "..." if len(result['content']) > 200 else result['content']
                })

            context = "\n\n".join(context_parts)
            prompt = self._create_prompt(query, context)
            response = self.model.generate_content(prompt)

            return {
                'answer': response.text,
                'sources': sources,
                'context_used': context,
                'query': query
            }

        except Exception as e:
            # Deliberately broad: the caller renders this dict in the UI,
            # so any failure becomes a visible error message, not a crash.
            return {
                'answer': f"An error occurred while generating the answer: {str(e)}",
                'sources': [],
                'context_used': "",
                'error': str(e)
            }

    def _create_prompt(self, query: str, context: str) -> str:
        """Create a prompt for the language model."""
        prompt = f"""You are an AI assistant that answers questions based on provided document context.

Instructions:
1. Answer the question using ONLY the information provided in the context below
2. If the context doesn't contain enough information to answer the question, say so clearly
3. Be concise but comprehensive in your answer
4. If you reference specific information, mention which context section it comes from
5. Do not make up information that's not in the provided context

Context from documents:
{context}

Question: {query}

Answer:"""

        return prompt

    def get_conversation_response(self, query: str) -> str:
        """Get a simple text response for conversation interface."""
        result = self.generate_answer(query)

        answer = result['answer']
        sources = result.get('sources', [])

        # Append at most the top three citations to keep replies short.
        if sources:
            answer += "\n\n**Sources:**\n"
            for i, source in enumerate(sources[:3], 1):
                answer += f"{i}. {source['source_file']} (Page {source['page_number']})\n"

        return answer
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
gradio                  # web UI framework (app.py)
PyMuPDF                 # PDF text extraction, imported as fitz (pdf_processor.py)
sentence-transformers   # embedding model (vector_store.py)
faiss-cpu               # vector similarity index (vector_store.py)
google-generativeai     # Gemini LLM client (rag_engine.py)
python-dotenv           # .env loading (config.py)
numpy                   # embedding array handling (vector_store.py)
vector_db/.DS_Store ADDED
Binary file (6.15 kB). View file
 
vector_store.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ import pickle
4
+ import os
5
+ from typing import List, Dict, Any, Tuple
6
+ from sentence_transformers import SentenceTransformer
7
+ from pdf_processor import DocumentChunk
8
+
9
class VectorStore:
    """FAISS-backed store of chunk embeddings with on-disk persistence.

    Uses IndexFlatIP over L2-normalized vectors, so the inner product
    equals cosine similarity. The index and its parallel metadata list are
    persisted under ``vector_db_path`` and reloaded on construction.
    """

    def __init__(self, model_name: str, vector_db_path: str):
        # Sentence-transformers model used for both documents and queries.
        self.model = SentenceTransformer(model_name)
        self.vector_db_path = vector_db_path
        self.index_path = os.path.join(vector_db_path, 'faiss_index.bin')
        self.metadata_path = os.path.join(vector_db_path, 'metadata.pkl')

        self.index = None   # faiss.IndexFlatIP, created lazily on first add
        self.metadata = []  # one dict per stored chunk, parallel to index rows
        self.load_index()

    def load_index(self):
        """Load existing FAISS index and metadata, or start empty."""
        try:
            if os.path.exists(self.index_path) and os.path.exists(self.metadata_path):
                self.index = faiss.read_index(self.index_path)
                # NOTE(review): pickle is acceptable only because this file
                # is written by save_index() below; never load untrusted pickles.
                with open(self.metadata_path, 'rb') as f:
                    self.metadata = pickle.load(f)

                print(f"Loaded existing index with {len(self.metadata)} documents")
            else:
                print("No existing index found. Will create new one.")
        except Exception as e:
            # Corrupt files fall back to an empty store rather than crashing.
            print(f"Error loading index: {e}")
            self.index = None
            self.metadata = []

    def add_documents(self, chunks: List[DocumentChunk]):
        """Embed chunks, add them to the index, and persist to disk."""
        if not chunks:
            return

        texts = [chunk.content for chunk in chunks]
        embeddings = self.model.encode(texts, convert_to_tensor=False)
        embeddings = np.array(embeddings).astype('float32')

        if self.index is None:
            dimension = embeddings.shape[1]
            self.index = faiss.IndexFlatIP(dimension)

        # BUGFIX: normalize EVERY batch, not only the one that creates the
        # index — IndexFlatIP scores raw inner products, so unnormalized
        # batches would get incomparable similarity scores at search time.
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)  # type: ignore

        # Keep the metadata list aligned row-for-row with the index.
        for chunk in chunks:
            self.metadata.append({
                'content': chunk.content,
                'metadata': chunk.metadata,
                'page_number': chunk.page_number,
                'source_file': chunk.source_file
            })

        self.save_index()
        print(f"Added {len(chunks)} chunks to vector store")

    def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return up to top_k metadata dicts with 'similarity_score' added."""
        if self.index is None or len(self.metadata) == 0:
            return []

        query_embedding = self.model.encode([query], convert_to_tensor=False)
        query_embedding = np.array(query_embedding).astype('float32')
        # Normalize the query so inner product == cosine similarity.
        faiss.normalize_L2(query_embedding)
        scores, indices = self.index.search(query_embedding, min(top_k, len(self.metadata)))  # type: ignore

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx != -1:  # FAISS pads with -1 when fewer results exist
                result = self.metadata[idx].copy()
                result['similarity_score'] = float(score)
                results.append(result)

        return results

    def save_index(self):
        """Save FAISS index and metadata to disk (best effort, never raises)."""
        try:
            if self.index is not None:
                faiss.write_index(self.index, self.index_path)

            with open(self.metadata_path, 'wb') as f:
                pickle.dump(self.metadata, f)

        except Exception as e:
            print(f"Error saving index: {e}")

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the vector store."""
        if self.index is None:
            return {'total_documents': 0, 'index_size': 0}

        return {
            'total_documents': len(self.metadata),
            'index_size': self.index.ntotal,
            'dimension': self.index.d
        }

    def clear_index(self):
        """Drop the in-memory index/metadata and delete the on-disk files."""
        self.index = None
        self.metadata = []
        if os.path.exists(self.index_path):
            os.remove(self.index_path)
        if os.path.exists(self.metadata_path):
            os.remove(self.metadata_path)

        print("Index cleared successfully")