Spaces:

sairika
/

Rag-based-api-task

Runtime error

App Files Files Community

sairika committed on Aug 7, 2025

Commit

f48e31e

verified ·

1 Parent(s): efaba82

Update app.py

Browse files

Files changed (1) hide show

app.py +324 -352

app.py CHANGED Viewed

@@ -1,408 +1,380 @@
 import os
-import io
 import base64
-import sqlite3
-import pandas as pd
-from typing import List, Optional, Dict, Any
 from pathlib import Path
-import asyncio
-import uuid
-from fastapi import FastAPI, File, UploadFile, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-import uvicorn
-# Document processing
-import PyPDF2
-import pdfplumber
-from docx import Document
-import pytesseract
-from PIL import Image
-# ML/AI components
-import torch
-from sentence_transformers import SentenceTransformer
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
-import faiss
-import numpy as np
-import pickle
-# Configuration
-class Config:
-    UPLOAD_DIR = "uploads"
-    VECTOR_STORE_DIR = "vector_store"
-    CHUNK_SIZE = 500
-    CHUNK_OVERLAP = 50
-    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
-    # Hugging Face Models (Free)
-    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-    LLM_MODEL = "microsoft/DialoGPT-medium"  # For conversational responses
-    # Alternative: "google/flan-t5-base" for better text generation
 config = Config()
-# Ensure directories exist
-os.makedirs(config.UPLOAD_DIR, exist_ok=True)
-os.makedirs(config.VECTOR_STORE_DIR, exist_ok=True)
-# Pydantic models
-class QueryRequest(BaseModel):
-    question: str
-    image_base64: Optional[str] = None
-    file_id: Optional[str] = None
-class QueryResponse(BaseModel):
-    answer: str
-    context: List[str]
-    sources: List[Dict[str, Any]]
-    confidence: float
-class UploadResponse(BaseModel):
-    file_id: str
-    filename: str
-    file_type: str
-    chunks_created: int
-    message: str
-# Document Processor Class
-class DocumentProcessor:
-    def __init__(self):
-        self.embedding_model = SentenceTransformer(config.EMBEDDING_MODEL)
-    def extract_text_from_pdf(self, file_path: str) -> str:
-        """Extract text from PDF using pdfplumber"""
-        text = ""
-        try:
-            with pdfplumber.open(file_path) as pdf:
-                for page in pdf.pages:
-                    page_text = page.extract_text()
-                    if page_text:
-                        text += page_text + "\n"
-        except Exception as e:
-            # Fallback to PyPDF2
-            with open(file_path, 'rb') as file:
-                pdf_reader = PyPDF2.PdfReader(file)
-                for page in pdf_reader.pages:
-                    text += page.extract_text() + "\n"
-        return text
-    def extract_text_from_docx(self, file_path: str) -> str:
-        """Extract text from Word document"""
-        doc = Document(file_path)
-        text = ""
-        for paragraph in doc.paragraphs:
-            text += paragraph.text + "\n"
-        return text
-    def extract_text_from_image(self, image_data: bytes) -> str:
-        """Extract text from image using OCR"""
-        try:
-            image = Image.open(io.BytesIO(image_data))
-            text = pytesseract.image_to_string(image)
-            return text
-        except Exception as e:
-            raise HTTPException(status_code=400, f"OCR failed: {str(e)}")
-    def extract_text_from_csv(self, file_path: str) -> str:
-        """Extract text from CSV"""
-        df = pd.read_csv(file_path)
-        return df.to_string()
-    def extract_text_from_db(self, file_path: str) -> str:
-        """Extract text from SQLite database"""
-        conn = sqlite3.connect(file_path)
-        text = ""
-        # Get all table names
-        cursor = conn.cursor()
-        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
-        tables = cursor.fetchall()
-        for table in tables:
-            table_name = table[0]
-            df = pd.read_sql_query(f"SELECT * FROM {table_name}", conn)
-            text += f"Table: {table_name}\n"
-            text += df.to_string() + "\n\n"
-        conn.close()
-        return text
-    def chunk_text(self, text: str) -> List[str]:
-        """Split text into chunks with overlap"""
-        chunks = []
-        words = text.split()
-        for i in range(0, len(words), config.CHUNK_SIZE - config.CHUNK_OVERLAP):
-            chunk = " ".join(words[i:i + config.CHUNK_SIZE])
-            chunks.append(chunk)
-        return chunks
-    def process_document(self, file_path: str, file_type: str) -> List[str]:
-        """Process document based on file type"""
-        text = ""
-        if file_type.lower() == '.pdf':
-            text = self.extract_text_from_pdf(file_path)
-        elif file_type.lower() == '.docx':
-            text = self.extract_text_from_docx(file_path)
-        elif file_type.lower() == '.txt':
-            with open(file_path, 'r', encoding='utf-8') as f:
-                text = f.read()
-        elif file_type.lower() in ['.jpg', '.jpeg', '.png']:
-            with open(file_path, 'rb') as f:
-                text = self.extract_text_from_image(f.read())
-        elif file_type.lower() == '.csv':
-            text = self.extract_text_from_csv(file_path)
-        elif file_type.lower() == '.db':
-            text = self.extract_text_from_db(file_path)
-        else:
-            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_type}")
-        return self.chunk_text(text)
-# Vector Store Class
-class VectorStore:
-    def __init__(self, embedding_model: SentenceTransformer):
-        self.embedding_model = embedding_model
-        self.dimension = 384  # all-MiniLM-L6-v2 embedding dimension
-        self.index = faiss.IndexFlatIP(self.dimension)  # Inner product for similarity
-        self.chunks = []
-        self.metadata = []
-    def add_documents(self, chunks: List[str], file_id: str, filename: str):
-        """Add documents to vector store"""
-        embeddings = self.embedding_model.encode(chunks)
-        # Normalize embeddings for inner product similarity
-        faiss.normalize_L2(embeddings)
-        self.index.add(embeddings.astype(np.float32))
-        for i, chunk in enumerate(chunks):
-            self.chunks.append(chunk)
-            self.metadata.append({
-                'file_id': file_id,
-                'filename': filename,
-                'chunk_index': i,
-                'text': chunk
-            })
-    def search(self, query: str, k: int = 5) -> List[Dict]:
-        """Search for similar documents"""
-        query_embedding = self.embedding_model.encode([query])
-        faiss.normalize_L2(query_embedding)
-        scores, indices = self.index.search(query_embedding.astype(np.float32), k)
-        results = []
-        for score, idx in zip(scores[0], indices[0]):
-            if idx != -1:  # Valid index
-                results.append({
-                    'text': self.chunks[idx],
-                    'metadata': self.metadata[idx],
-                    'score': float(score)
-                })
-        return results
-    def save(self, path: str):
-        """Save vector store to disk"""
-        faiss.write_index(self.index, f"{path}/index.faiss")
-        with open(f"{path}/data.pkl", 'wb') as f:
-            pickle.dump({
-                'chunks': self.chunks,
-                'metadata': self.metadata
-            }, f)
-    def load(self, path: str):
-        """Load vector store from disk"""
-        if os.path.exists(f"{path}/index.faiss"):
-            self.index = faiss.read_index(f"{path}/index.faiss")
-            with open(f"{path}/data.pkl", 'rb') as f:
-                data = pickle.load(f)
-                self.chunks = data['chunks']
-                self.metadata = data['metadata']
-# LLM Handler Class
-class LLMHandler:
-    def __init__(self):
-        # Using Flan-T5 for better text generation
-        self.model_name = "google/flan-t5-base"
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-        self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
-        self.generator = pipeline(
-            "text2text-generation",
-            model=self.model,
-            tokenizer=self.tokenizer,
-            max_length=512,
-            temperature=0.7,
-            do_sample=True
-        )
-    def generate_answer(self, question: str, context: List[str]) -> str:
-        """Generate answer using LLM"""
-        # Construct prompt
-        context_text = "\n".join(context[:3])  # Use top 3 contexts
-        prompt = f"""Based on the following context, answer the question accurately and concisely.
-Context:
-{context_text}
-Question: {question}
-Answer:"""
-        try:
-            response = self.generator(
-                prompt,
-                max_length=200,
-                num_return_sequences=1,
-                pad_token_id=self.tokenizer.eos_token_id
-            )
-            answer = response[0]['generated_text']
-            # Clean up the answer
-            if "Answer:" in answer:
-                answer = answer.split("Answer:")[-1].strip()
-            return answer
-        except Exception as e:
-            return f"I apologize, but I encountered an error generating the answer: {str(e)}"
-# Initialize components
-document_processor = DocumentProcessor()
-vector_store = VectorStore(document_processor.embedding_model)
-llm_handler = LLMHandler()
-# Load existing vector store if available
-vector_store.load(config.VECTOR_STORE_DIR)
-# FastAPI app
-app = FastAPI(
-    title="Smart RAG API",
-    description="Retrieval-Augmented Generation API for document Q&A",
-    version="1.0.0"
-)
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-@app.post("/upload", response_model=UploadResponse)
-async def upload_file(file: UploadFile = File(...)):
-    """Upload and process a document"""
-    # Validate file size
-    file_content = await file.read()
-    if len(file_content) > config.MAX_FILE_SIZE:
-        raise HTTPException(status_code=413, detail="File too large")
-    # Generate file ID
-    file_id = str(uuid.uuid4())
-    file_extension = Path(file.filename).suffix.lower()
-    # Save file
-    file_path = os.path.join(config.UPLOAD_DIR, f"{file_id}_{file.filename}")
-    with open(file_path, "wb") as f:
-        f.write(file_content)
-    try:
-        # Process document
-        chunks = document_processor.process_document(file_path, file_extension)
-        # Add to vector store
-        vector_store.add_documents(chunks, file_id, file.filename)
-        # Save vector store
-        vector_store.save(config.VECTOR_STORE_DIR)
-        return UploadResponse(
-            file_id=file_id,
-            filename=file.filename,
-            file_type=file_extension,
-            chunks_created=len(chunks),
-            message="File uploaded and processed successfully"
-        )
-    except Exception as e:
-        # Clean up file on error
-        os.remove(file_path)
-        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
-@app.post("/query", response_model=QueryResponse)
-async def query_documents(request: QueryRequest):
-    """Query documents with a question"""
-    question = request.question
-    # Handle image-based questions
-    if request.image_base64:
-        try:
-            # Decode base64 image
-            image_data = base64.b64decode(request.image_base64)
-            # Extract text from image
-            ocr_text = document_processor.extract_text_from_image(image_data)
-            # Combine question with OCR text
-            question = f"{request.question} [Image content: {ocr_text}]"
-        except Exception as e:
-            raise HTTPException(status_code=400, detail=f"Image processing failed: {str(e)}")
-    # Search vector store
-    search_results = vector_store.search(question, k=5)
-    if not search_results:
-        raise HTTPException(status_code=404, detail="No relevant documents found")
-    # Extract context and sources
-    contexts = [result['text'] for result in search_results]
-    sources = [result['metadata'] for result in search_results]
-    # Generate answer
-    answer = llm_handler.generate_answer(request.question, contexts)
-    # Calculate confidence (average similarity score)
-    confidence = sum(result['score'] for result in search_results) / len(search_results)
-    return QueryResponse(
-        answer=answer,
-        context=contexts,
-        sources=sources,
-        confidence=confidence
     )
-@app.get("/health")
-async def health_check():
-    """Health check endpoint"""
-    return {
-        "status": "healthy",
-        "documents_indexed": len(vector_store.chunks),
-        "model_loaded": llm_handler.model is not None
-    }
-@app.get("/")
-async def root():
-    """Root endpoint with API information"""
-    return {
-        "message": "Smart RAG API",
-        "version": "1.0.0",
-        "endpoints": {
-            "/upload": "POST - Upload documents",
-            "/query": "POST - Query documents",
-            "/health": "GET - Health check"
-        }
-    }
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)

+import gradio as gr
 import os
+import tempfile
 import base64
+from typing import List, Tuple, Optional
+import json
 from pathlib import Path
+# Import our modules
+from src.document_processor import DocumentProcessor
+from src.vector_store import VectorStore
+from src.llm_handler import LLMHandler
+from src.utils import setup_directories, get_file_icon
+from config import Config
+# Initialize configuration
 config = Config()
+# Setup directories
+setup_directories()
+# Initialize components
+print("🚀 Initializing Smart RAG API components...")
+document_processor = DocumentProcessor()
+vector_store = VectorStore(document_processor.embedding_model)
+llm_handler = LLMHandler()
+# Load existing vector store
+try:
+    vector_store.load(config.VECTOR_STORE_DIR)
+    print(f"✅ Loaded existing vector store with {len(vector_store.chunks)} documents")
+except:
+    print("📝 Starting with empty vector store")
+# Global state for uploaded files
+uploaded_files = []
+def process_uploaded_file(file_path: str) -> Tuple[str, str]:
+    """Process uploaded file and return status message and file info"""
+    try:
+        if file_path is None:
+            return "❌ No file uploaded", ""
+        file_name = Path(file_path).name
+        file_extension = Path(file_path).suffix.lower()
+        # Check file size
+        file_size = os.path.getsize(file_path)
+        if file_size > config.MAX_FILE_SIZE:
+            return f"�� File too large. Maximum size: {config.MAX_FILE_SIZE/1024/1024:.1f}MB", ""
+        # Process document
+        print(f"📄 Processing {file_name}...")
+        chunks = document_processor.process_document(file_path, file_extension)
+        if not chunks:
+            return "❌ No text content found in the file", ""
+        # Generate file ID
+        file_id = f"file_{len(uploaded_files)}"
+        # Add to vector store
+        vector_store.add_documents(chunks, file_id, file_name)
+        # Save vector store
+        vector_store.save(config.VECTOR_STORE_DIR)
+        # Track uploaded file
+        file_info = {
+            'id': file_id,
+            'name': file_name,
+            'type': file_extension,
+            'chunks': len(chunks),
+            'size': file_size
+        }
+        uploaded_files.append(file_info)
+        # Create status message
+        icon = get_file_icon(file_extension)
+        status_msg = f"✅ Successfully processed: {file_name}"
+        file_details = f"""
+{icon} **{file_name}**
+- Type: {file_extension.upper()}
+- Size: {file_size/1024:.1f} KB
+- Chunks created: {len(chunks)}
+- File ID: {file_id}
+        """
+        return status_msg, file_details
+    except Exception as e:
+        error_msg = f"❌ Error processing file: {str(e)}"
+        print(error_msg)
+        return error_msg, ""
+def answer_question(question: str, image_input=None) -> Tuple[str, str, str]:
+    """Answer question based on uploaded documents"""
+    try:
+        if not question.strip():
+            return "❌ Please enter a question", "", ""
+        if len(vector_store.chunks) == 0:
+            return "❌ No documents uploaded yet. Please upload a document first.", "", ""
+        # Handle image input if provided
+        processed_question = question
+        if image_input is not None:
+            try:
+                # Convert image to base64 and extract text
+                import tempfile
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
+                    image_input.save(tmp_file.name)
+                    # Extract text from image
+                    with open(tmp_file.name, 'rb') as img_file:
+                        ocr_text = document_processor.extract_text_from_image(img_file.read())
+                    os.unlink(tmp_file.name)
+                    if ocr_text.strip():
+                        processed_question = f"{question}\n\nImage content: {ocr_text}"
+            except Exception as e:
+                print(f"Image processing error: {e}")
+        # Search vector store
+        search_results = vector_store.search(processed_question, k=5)
+        if not search_results:
+            return "❌ No relevant information found in uploaded documents", "", ""
+        # Extract context and sources
+        contexts = [result['text'] for result in search_results]
+        sources = [result['metadata'] for result in search_results]
+        # Generate answer
+        answer = llm_handler.generate_answer(question, contexts)
+        # Format context
+        context_display = "\n\n".join([
+            f"**Context {i+1}** (Score: {result['score']:.3f}):\n{result['text'][:300]}..."
+            for i, result in enumerate(search_results[:3])
+        ])
+        # Format sources
+        sources_display = "\n".join([
+            f"• **{source['filename']}** (Chunk {source['chunk_index']})"
+            for source in sources[:3]
+        ])
+        return answer, context_display, sources_display
+    except Exception as e:
+        error_msg = f"❌ Error generating answer: {str(e)}"
+        print(error_msg)
+        return error_msg, "", ""
+def get_uploaded_files_status():
+    """Get status of all uploaded files"""
+    if not uploaded_files:
+        return "📭 No files uploaded yet"
+    status = f"📚 **{len(uploaded_files)} files uploaded** ({len(vector_store.chunks)} total chunks)\n\n"
+    for file_info in uploaded_files:
+        icon = get_file_icon(file_info['type'])
+        status += f"{icon} **{file_info['name']}** ({file_info['chunks']} chunks)\n"
+    return status
+def clear_all_documents():
+    """Clear all uploaded documents"""
+    global uploaded_files
+    try:
+        # Reset vector store
+        vector_store.reset()
+        # Clear uploaded files list
+        uploaded_files = []
+        # Save empty vector store
+        vector_store.save(config.VECTOR_STORE_DIR)
+        return "✅ All documents cleared successfully", "📭 No files uploaded"
+    except Exception as e:
+        return f"❌ Error clearing documents: {str(e)}", get_uploaded_files_status()
+# Custom CSS passed to gr.Blocks(css=...); .answer-box, .context-box and .sources-box are attached via elem_classes, the other classes are not referenced in the visible code
+custom_css = """
+.gradio-container {
+    max-width: 1200px !important;
+}
+.file-upload-area {
+    border: 2px dashed #ccc;
+    border-radius: 10px;
+    padding: 20px;
+    text-align: center;
+    transition: border-color 0.3s ease;
+}
+.file-upload-area:hover {
+    border-color: #007bff;
+}
+.status-success {
+    color: #28a745;
+    font-weight: bold;
+}
+.status-error {
+    color: #dc3545;
+    font-weight: bold;
+}
+.answer-box {
+    background: #f8f9fa;
+    border-left: 4px solid #007bff;
+    padding: 15px;
+    border-radius: 5px;
+    margin: 10px 0;
+}
+.context-box {
+    background: #fff3cd;
+    border-left: 4px solid #ffc107;
+    padding: 15px;
+    border-radius: 5px;
+    margin: 10px 0;
+    max-height: 300px;
+    overflow-y: auto;
+}
+.sources-box {
+    background: #d4edda;
+    border-left: 4px solid #28a745;
+    padding: 15px;
+    border-radius: 5px;
+    margin: 10px 0;
+}
+"""
+# Build the Gradio Blocks UI: upload column on the left, Q&A column on the right, event handlers wired at the end
+with gr.Blocks(css=custom_css, title="Smart RAG API", theme=gr.themes.Soft()) as demo:
+    # Header: title and supported-format summary shown at the top of the page
+    gr.Markdown("""
+    # 🤖 Smart RAG API
+    ### Intelligent Document Q&A System
+    Upload documents (PDF, DOCX, TXT, Images, CSV, SQLite) and ask questions about their content!
+    **Supported formats**: PDF, Word, Text, Images (with OCR), CSV, SQLite databases
+    """)
+    with gr.Row():
+        # Left column (scale=1) - file upload and file-management buttons
+        with gr.Column(scale=1):
+            gr.Markdown("## 📤 Upload Documents")
+            file_input = gr.File(
+                label="Choose File",
+                file_types=[".pdf", ".docx", ".txt", ".jpg", ".jpeg", ".png", ".csv", ".db"],
+                type="filepath"
+            )
+            upload_btn = gr.Button("📄 Process Document", variant="primary", size="lg")
+            upload_status = gr.Markdown("📭 No files uploaded yet")
+            file_details = gr.Markdown("")
+            gr.Markdown("---")
+            # File management: status refresh and clear-all buttons side by side
+            with gr.Row():
+                refresh_btn = gr.Button("🔄 Refresh Status", size="sm")
+                clear_btn = gr.Button("🗑️ Clear All", size="sm", variant="secondary")
+        # Right column (scale=2) - question input, optional image, and results
+        with gr.Column(scale=2):
+            gr.Markdown("## ❓ Ask Questions")
+            question_input = gr.Textbox(
+                label="Your Question",
+                placeholder="What is this document about?",
+                lines=2
+            )
+            image_input = gr.Image(
+                label="Upload Image (Optional)",
+                type="pil",
+                height=150
+            )
+            ask_btn = gr.Button("🔍 Get Answer", variant="primary", size="lg")
+            # Results: answer panel plus a collapsed accordion for context and sources
+            gr.Markdown("### 💡 Answer")
+            answer_output = gr.Markdown(
+                value="Ask a question to see the answer here...",
+                elem_classes=["answer-box"]  # styled by .answer-box in custom_css
+            )
+            with gr.Accordion("📋 Context & Sources", open=False):
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("**📄 Context Used:**")
+                        context_output = gr.Markdown(elem_classes=["context-box"])  # styled by .context-box in custom_css
+                    with gr.Column():
+                        gr.Markdown("**📚 Sources:**")
+                        sources_output = gr.Markdown(elem_classes=["sources-box"])  # styled by .sources-box in custom_css
+    # Example questions (static Markdown, not wired to any handler)
+    gr.Markdown("""
+    ## 💡 Example Questions
+    Try asking questions like:
+    - "What is the main topic of this document?"
+    - "Summarize the key points"
+    - "What are the important dates mentioned?"
+    - "Who are the people mentioned in the document?"
+    - "What are the financial figures?"
+    """)
+    # Sample file types (static Markdown inside a collapsed accordion)
+    with gr.Accordion("📁 Sample Files for Testing", open=False):
+        gr.Markdown("""
+        You can test the system with these types of documents:
+        - **PDF**: Research papers, reports, invoices
+        - **Word**: Documents, proposals, contracts
+        - **Text**: Plain text files, logs, notes
+        - **Images**: Screenshots, scanned documents, diagrams
+        - **CSV**: Data tables, spreadsheets
+        - **Database**: SQLite files with structured data
+        """)
+    # Event handlers: each click maps a module-level function onto the listed output components
+    upload_btn.click(
+        fn=process_uploaded_file,
+        inputs=[file_input],
+        outputs=[upload_status, file_details]
+    )
+    ask_btn.click(
+        fn=answer_question,
+        inputs=[question_input, image_input],
+        outputs=[answer_output, context_output, sources_output]
+    )
+    refresh_btn.click(
+        fn=get_uploaded_files_status,
+        outputs=[upload_status]
+    )
+    clear_btn.click(
+        fn=clear_all_documents,
+        outputs=[upload_status, file_details]  # receives the two strings returned by clear_all_documents, in order
+    )
+    # On file selection change, overwrite upload_status with the current uploaded-files summary
+    file_input.change(
+        fn=lambda: get_uploaded_files_status(),
+        outputs=[upload_status]
     )
+# Launch configuration
 if __name__ == "__main__":
+    print("🚀 Launching Smart RAG API...")
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True,  # Creates public link
+        show_error=True,
+        show_tips=True,
+        enable_queue=True
+    )