Spaces:

sairika
/

Rag-based-api-task

Runtime error

App Files Files Community

sairika committed on Aug 7, 2025

Commit

efaba82

verified ·

1 Parent(s): 06320e3

Create app.py

Browse files

Files changed (1) hide show

app.py +408 -0

app.py ADDED Viewed

	@@ -0,0 +1,408 @@

+import os
+import io
+import base64
+import sqlite3
+import pandas as pd
+from typing import List, Optional, Dict, Any
+from pathlib import Path
+import asyncio
+import uuid
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+import uvicorn
+# Document processing
+import PyPDF2
+import pdfplumber
+from docx import Document
+import pytesseract
+from PIL import Image
+# ML/AI components
+import torch
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+import faiss
+import numpy as np
+import pickle
+# Configuration
+class Config:
+    """Static settings shared by the document processor, vector store and API."""
+    UPLOAD_DIR = "uploads"  # directory uploaded files are written to
+    VECTOR_STORE_DIR = "vector_store"  # directory the vector store is saved to / loaded from
+    CHUNK_SIZE = 500  # chunk length, counted in whitespace-separated words
+    CHUNK_OVERLAP = 50  # words shared by two consecutive chunks
+    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
+    # Hugging Face Models (Free)
+    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+    # NOTE(review): LLM_MODEL is not read anywhere in this file; LLMHandler
+    # hard-codes "google/flan-t5-base" instead.
+    LLM_MODEL = "microsoft/DialoGPT-medium"  # For conversational responses
+    # Alternative: "google/flan-t5-base" for better text generation
+config = Config()
+# Ensure directories exist
+os.makedirs(config.UPLOAD_DIR, exist_ok=True)
+os.makedirs(config.VECTOR_STORE_DIR, exist_ok=True)
+# Pydantic models
+class QueryRequest(BaseModel):
+    # Request body of POST /query.
+    question: str  # natural-language question to answer
+    # Optional base64-encoded image; /query runs OCR on it and appends the
+    # recognised text to the search query.
+    image_base64: Optional[str] = None
+    # Optional identifier of an uploaded file.
+    file_id: Optional[str] = None
+class QueryResponse(BaseModel):
+    # Response body of POST /query.
+    answer: str  # text produced by the LLM handler
+    context: List[str]  # texts of the retrieved chunks
+    sources: List[Dict[str, Any]]  # metadata dict of each retrieved chunk
+    confidence: float  # mean similarity score of the retrieved chunks
+class UploadResponse(BaseModel):
+    # Response body of POST /upload.
+    file_id: str  # UUID generated for the uploaded file
+    filename: str  # name of the uploaded file
+    file_type: str  # lower-cased file extension, including the leading dot
+    chunks_created: int  # number of text chunks produced from the file
+    message: str  # human-readable status message
+# Document Processor Class
+class DocumentProcessor:
+    def __init__(self):
+        self.embedding_model = SentenceTransformer(config.EMBEDDING_MODEL)
+    def extract_text_from_pdf(self, file_path: str) -> str:
+        """Extract text from PDF using pdfplumber"""
+        text = ""
+        try:
+            with pdfplumber.open(file_path) as pdf:
+                for page in pdf.pages:
+                    page_text = page.extract_text()
+                    if page_text:
+                        text += page_text + "\n"
+        except Exception as e:
+            # Fallback to PyPDF2
+            with open(file_path, 'rb') as file:
+                pdf_reader = PyPDF2.PdfReader(file)
+                for page in pdf_reader.pages:
+                    text += page.extract_text() + "\n"
+        return text
+    def extract_text_from_docx(self, file_path: str) -> str:
+        """Extract text from Word document"""
+        doc = Document(file_path)
+        text = ""
+        for paragraph in doc.paragraphs:
+            text += paragraph.text + "\n"
+        return text
+    def extract_text_from_image(self, image_data: bytes) -> str:
+        """Extract text from image using OCR"""
+        try:
+            image = Image.open(io.BytesIO(image_data))
+            text = pytesseract.image_to_string(image)
+            return text
+        except Exception as e:
+            raise HTTPException(status_code=400, f"OCR failed: {str(e)}")
+    def extract_text_from_csv(self, file_path: str) -> str:
+        """Extract text from CSV"""
+        df = pd.read_csv(file_path)
+        return df.to_string()
+    def extract_text_from_db(self, file_path: str) -> str:
+        """Extract text from SQLite database"""
+        conn = sqlite3.connect(file_path)
+        text = ""
+        # Get all table names
+        cursor = conn.cursor()
+        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+        tables = cursor.fetchall()
+        for table in tables:
+            table_name = table[0]
+            df = pd.read_sql_query(f"SELECT * FROM {table_name}", conn)
+            text += f"Table: {table_name}\n"
+            text += df.to_string() + "\n\n"
+        conn.close()
+        return text
+    def chunk_text(self, text: str) -> List[str]:
+        """Split text into chunks with overlap"""
+        chunks = []
+        words = text.split()
+        for i in range(0, len(words), config.CHUNK_SIZE - config.CHUNK_OVERLAP):
+            chunk = " ".join(words[i:i + config.CHUNK_SIZE])
+            chunks.append(chunk)
+        return chunks
+    def process_document(self, file_path: str, file_type: str) -> List[str]:
+        """Process document based on file type"""
+        text = ""
+        if file_type.lower() == '.pdf':
+            text = self.extract_text_from_pdf(file_path)
+        elif file_type.lower() == '.docx':
+            text = self.extract_text_from_docx(file_path)
+        elif file_type.lower() == '.txt':
+            with open(file_path, 'r', encoding='utf-8') as f:
+                text = f.read()
+        elif file_type.lower() in ['.jpg', '.jpeg', '.png']:
+            with open(file_path, 'rb') as f:
+                text = self.extract_text_from_image(f.read())
+        elif file_type.lower() == '.csv':
+            text = self.extract_text_from_csv(file_path)
+        elif file_type.lower() == '.db':
+            text = self.extract_text_from_db(file_path)
+        else:
+            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_type}")
+        return self.chunk_text(text)
+# Vector Store Class
+class VectorStore:
+    def __init__(self, embedding_model: SentenceTransformer):
+        self.embedding_model = embedding_model
+        self.dimension = 384  # all-MiniLM-L6-v2 embedding dimension
+        self.index = faiss.IndexFlatIP(self.dimension)  # Inner product for similarity
+        self.chunks = []
+        self.metadata = []
+    def add_documents(self, chunks: List[str], file_id: str, filename: str):
+        """Add documents to vector store"""
+        embeddings = self.embedding_model.encode(chunks)
+        # Normalize embeddings for inner product similarity
+        faiss.normalize_L2(embeddings)
+        self.index.add(embeddings.astype(np.float32))
+        for i, chunk in enumerate(chunks):
+            self.chunks.append(chunk)
+            self.metadata.append({
+                'file_id': file_id,
+                'filename': filename,
+                'chunk_index': i,
+                'text': chunk
+            })
+    def search(self, query: str, k: int = 5) -> List[Dict]:
+        """Search for similar documents"""
+        query_embedding = self.embedding_model.encode([query])
+        faiss.normalize_L2(query_embedding)
+        scores, indices = self.index.search(query_embedding.astype(np.float32), k)
+        results = []
+        for score, idx in zip(scores[0], indices[0]):
+            if idx != -1:  # Valid index
+                results.append({
+                    'text': self.chunks[idx],
+                    'metadata': self.metadata[idx],
+                    'score': float(score)
+                })
+        return results
+    def save(self, path: str):
+        """Save vector store to disk"""
+        faiss.write_index(self.index, f"{path}/index.faiss")
+        with open(f"{path}/data.pkl", 'wb') as f:
+            pickle.dump({
+                'chunks': self.chunks,
+                'metadata': self.metadata
+            }, f)
+    def load(self, path: str):
+        """Load vector store from disk"""
+        if os.path.exists(f"{path}/index.faiss"):
+            self.index = faiss.read_index(f"{path}/index.faiss")
+            with open(f"{path}/data.pkl", 'rb') as f:
+                data = pickle.load(f)
+                self.chunks = data['chunks']
+                self.metadata = data['metadata']
+# LLM Handler Class
+class LLMHandler:
+    def __init__(self):
+        # Using Flan-T5 for better text generation
+        self.model_name = "google/flan-t5-base"
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
+        self.generator = pipeline(
+            "text2text-generation",
+            model=self.model,
+            tokenizer=self.tokenizer,
+            max_length=512,
+            temperature=0.7,
+            do_sample=True
+        )
+    def generate_answer(self, question: str, context: List[str]) -> str:
+        """Generate answer using LLM"""
+        # Construct prompt
+        context_text = "\n".join(context[:3])  # Use top 3 contexts
+        prompt = f"""Based on the following context, answer the question accurately and concisely.
+Context:
+{context_text}
+Question: {question}
+Answer:"""
+        try:
+            response = self.generator(
+                prompt,
+                max_length=200,
+                num_return_sequences=1,
+                pad_token_id=self.tokenizer.eos_token_id
+            )
+            answer = response[0]['generated_text']
+            # Clean up the answer
+            if "Answer:" in answer:
+                answer = answer.split("Answer:")[-1].strip()
+            return answer
+        except Exception as e:
+            return f"I apologize, but I encountered an error generating the answer: {str(e)}"
+# Initialize components
+# These are built once at import time; constructing them loads the embedding
+# model and the LLM.
+document_processor = DocumentProcessor()
+# The vector store shares the processor's embedding model instance.
+vector_store = VectorStore(document_processor.embedding_model)
+llm_handler = LLMHandler()
+# Load existing vector store if available
+vector_store.load(config.VECTOR_STORE_DIR)
+# FastAPI app
+app = FastAPI(
+    title="Smart RAG API",
+    description="Retrieval-Augmented Generation API for document Q&A",
+    version="1.0.0"
+)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@app.post("/upload", response_model=UploadResponse)
+async def upload_file(file: UploadFile = File(...)):
+    """Upload and process a document"""
+    # Validate file size
+    file_content = await file.read()
+    if len(file_content) > config.MAX_FILE_SIZE:
+        raise HTTPException(status_code=413, detail="File too large")
+    # Generate file ID
+    file_id = str(uuid.uuid4())
+    file_extension = Path(file.filename).suffix.lower()
+    # Save file
+    file_path = os.path.join(config.UPLOAD_DIR, f"{file_id}_{file.filename}")
+    with open(file_path, "wb") as f:
+        f.write(file_content)
+    try:
+        # Process document
+        chunks = document_processor.process_document(file_path, file_extension)
+        # Add to vector store
+        vector_store.add_documents(chunks, file_id, file.filename)
+        # Save vector store
+        vector_store.save(config.VECTOR_STORE_DIR)
+        return UploadResponse(
+            file_id=file_id,
+            filename=file.filename,
+            file_type=file_extension,
+            chunks_created=len(chunks),
+            message="File uploaded and processed successfully"
+        )
+    except Exception as e:
+        # Clean up file on error
+        os.remove(file_path)
+        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
+@app.post("/query", response_model=QueryResponse)
+async def query_documents(request: QueryRequest):
+    """Query documents with a question"""
+    question = request.question
+    # Handle image-based questions
+    if request.image_base64:
+        try:
+            # Decode base64 image
+            image_data = base64.b64decode(request.image_base64)
+            # Extract text from image
+            ocr_text = document_processor.extract_text_from_image(image_data)
+            # Combine question with OCR text
+            question = f"{request.question} [Image content: {ocr_text}]"
+        except Exception as e:
+            raise HTTPException(status_code=400, detail=f"Image processing failed: {str(e)}")
+    # Search vector store
+    search_results = vector_store.search(question, k=5)
+    if not search_results:
+        raise HTTPException(status_code=404, detail="No relevant documents found")
+    # Extract context and sources
+    contexts = [result['text'] for result in search_results]
+    sources = [result['metadata'] for result in search_results]
+    # Generate answer
+    answer = llm_handler.generate_answer(request.question, contexts)
+    # Calculate confidence (average similarity score)
+    confidence = sum(result['score'] for result in search_results) / len(search_results)
+    return QueryResponse(
+        answer=answer,
+        context=contexts,
+        sources=sources,
+        confidence=confidence
+    )
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {
+        "status": "healthy",
+        "documents_indexed": len(vector_store.chunks),
+        "model_loaded": llm_handler.model is not None
+    }
+@app.get("/")
+async def root():
+    """Root endpoint with API information"""
+    return {
+        "message": "Smart RAG API",
+        "version": "1.0.0",
+        "endpoints": {
+            "/upload": "POST - Upload documents",
+            "/query": "POST - Query documents",
+            "/health": "GET - Health check"
+        }
+    }
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)