Spaces:

Hamza4100
/

multi-pdf-rag-api

Sleeping

App Files Files Community

Hamza4100 committed on Jan 15

Commit

6ad61bb

verified ·

1 Parent(s): d6eb4cd

Upload 7 files

Browse files

Files changed (7) hide show

Dockerfile +28 -0
auth.py +86 -0
hf_storage.py +226 -0
main.py +364 -0
rag_engine.py +784 -0
requirements.txt +43 -0
start.sh +3 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,28 @@

+FROM python:3.10-slim
+# The application logs with print(); disable stdout buffering so messages
+# show up in the container logs as they are emitted.
+ENV PYTHONUNBUFFERED=1
+WORKDIR /app
+# Install system dependencies (skip recommended extras to keep the image small)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first so the dependency layer is cached across code changes
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application files
+COPY main.py .
+COPY rag_engine.py .
+COPY auth.py .
+COPY hf_storage.py .
+COPY start.sh .
+# Make start script executable
+RUN chmod +x start.sh
+# Expose port 7860 (HF Spaces default)
+EXPOSE 7860
+# Run the FastAPI app
+CMD ["./start.sh"]

auth.py ADDED Viewed

	@@ -0,0 +1,86 @@

+"""
+API Authentication Module
+=========================
+Implements API key-based authentication for the FastAPI backend.
+"""
+import os
+import hashlib
+from typing import Optional
+from fastapi import Header, HTTPException, status
+class AuthManager:
+    """Manages API key authentication and user identification."""
+    def __init__(self):
+        """Initialize auth manager with API keys from environment."""
+        api_keys_str = os.environ.get("API_KEYS", "")
+        self.valid_api_keys = set(
+            key.strip() for key in api_keys_str.split(",") if key.strip()
+        )
+        if not self.valid_api_keys:
+            print("⚠️ WARNING: No API keys configured! Set API_KEYS environment variable.")
+        else:
+            print(f"✅ Auth Manager initialized with {len(self.valid_api_keys)} API key(s)")
+    def derive_user_id(self, api_key: str) -> str:
+        """
+        Derive a stable user ID from API key using SHA256.
+        Args:
+            api_key: The API key
+        Returns:
+            12-character user ID derived from key hash
+        """
+        hash_bytes = hashlib.sha256(api_key.encode()).digest()
+        return hash_bytes.hex()[:12]
+    def validate_api_key(self, api_key: Optional[str]) -> str:
+        """
+        Validate API key and return user ID.
+        Args:
+            api_key: API key from request header
+        Returns:
+            user_id: Derived user identifier
+        Raises:
+            HTTPException: If API key is invalid or missing
+        """
+        if not api_key:
+            raise HTTPException(
+                status_code=status.HTTP_401_UNAUTHORIZED,
+                detail="Missing X-API-KEY header"
+            )
+        if api_key not in self.valid_api_keys:
+            raise HTTPException(
+                status_code=status.HTTP_401_UNAUTHORIZED,
+                detail="Invalid API key"
+            )
+        return self.derive_user_id(api_key)
+# Global auth manager instance
+auth_manager = AuthManager()
+async def get_current_user(x_api_key: Optional[str] = Header(None, alias="X-API-KEY")) -> str:
+    """
+    FastAPI dependency for extracting authenticated user ID.
+    Args:
+        x_api_key: API key from X-API-KEY header
+    Returns:
+        user_id: Authenticated user identifier
+    Raises:
+        HTTPException: If authentication fails
+    """
+    return auth_manager.validate_api_key(x_api_key)

hf_storage.py ADDED Viewed

	@@ -0,0 +1,226 @@

+"""
+Hugging Face Storage Manager
+=============================
+Handles syncing storage files and uploaded PDFs with HF private repository.
+Functions:
+- sync_storage_from_hf() → Download storage/ and uploaded_pdfs/ on startup
+- push_storage_to_hf() → Upload storage/ and uploaded_pdfs/ after changes
+"""
+import os
+from typing import Optional
+from huggingface_hub import HfApi, hf_hub_download, login
+class HFStorageManager:
+    """Manages persistent storage sync with Hugging Face repository."""
+    def __init__(self, hf_token: Optional[str], hf_repo: str):
+        """
+        Initialize HF Storage Manager.
+        Args:
+            hf_token: Hugging Face API token with write access
+            hf_repo: HF repository ID (e.g., "username/repo-name")
+        """
+        self.hf_token = hf_token
+        self.hf_repo = hf_repo
+        self.enabled = bool(hf_token and hf_repo)
+        self.api = None
+        if self.enabled:
+            try:
+                login(token=hf_token, add_to_git_credential=True)
+                self.api = HfApi()
+                print(f"✅ HF Storage Manager initialized: {hf_repo}")
+            except Exception as e:
+                print(f"⚠️ HF login failed: {e}")
+                self.enabled = False
+        else:
+            print("⚠️ HF Storage disabled (HF_TOKEN or HF_REPO not set)")
+    def sync_storage_from_hf(self, user_id: str) -> bool:
+        """
+        Download storage files and uploaded PDFs for a specific user from HF repo.
+        Downloads:
+          - users/{user_id}/storage/faiss.index
+          - users/{user_id}/storage/metadata.json
+          - users/{user_id}/storage/documents.json
+          - users/{user_id}/uploaded_pdfs/*.pdf
+        Args:
+            user_id: User identifier (12-char hash from API key)
+        Returns:
+            bool: True if sync successful, False otherwise
+        """
+        if not self.enabled:
+            print(f"⚠️ HF Storage sync skipped for user {user_id} (disabled)")
+            return False
+        try:
+            # Setup local directories for this user
+            base_dir = os.path.dirname(os.path.abspath(__file__))
+            user_base = os.path.join(base_dir, "users", user_id)
+            storage_dir = os.path.join(user_base, "storage")
+            uploaded_pdfs_dir = os.path.join(user_base, "uploaded_pdfs")
+            os.makedirs(storage_dir, exist_ok=True)
+            os.makedirs(uploaded_pdfs_dir, exist_ok=True)
+            print(f"📥 Syncing storage for user {user_id} from HF repo: {self.hf_repo}")
+            # Download storage files (FAISS index and metadata)
+            storage_files = ["faiss.index", "metadata.json", "documents.json"]
+            downloaded_count = 0
+            for filename in storage_files:
+                try:
+                    downloaded_path = hf_hub_download(
+                        repo_id=self.hf_repo,
+                        filename=f"users/{user_id}/storage/{filename}",
+                        token=self.hf_token,
+                        repo_type="model",
+                        local_dir=base_dir,
+                        local_dir_use_symlinks=False
+                    )
+                    downloaded_count += 1
+                    print(f"  ✓ Downloaded: users/{user_id}/storage/{filename}")
+                except Exception as e:
+                    # File doesn't exist yet in HF repo (first run is okay)
+                    print(f"  ⚠️ Could not download users/{user_id}/storage/{filename}: {str(e)[:100]}")
+            # Download all uploaded PDF files for this user
+            try:
+                # List all files in user's uploaded_pdfs/ folder
+                files_in_repo = self.api.list_repo_files(
+                    repo_id=self.hf_repo,
+                    token=self.hf_token
+                )
+                pdf_files = [
+                    f for f in files_in_repo
+                    if f.startswith(f"users/{user_id}/uploaded_pdfs/") and f.endswith(".pdf")
+                ]
+                print(f"  Found {len(pdf_files)} PDF files for user {user_id}")
+                for pdf_file in pdf_files:
+                    try:
+                        hf_hub_download(
+                            repo_id=self.hf_repo,
+                            filename=pdf_file,
+                            token=self.hf_token,
+                            repo_type="model",
+                            local_dir=base_dir,
+                            local_dir_use_symlinks=False
+                        )
+                        print(f"  ✓ Downloaded: {pdf_file}")
+                    except Exception as e:
+                        print(f"  ⚠️ Could not download {pdf_file}: {str(e)[:100]}")
+            except Exception as e:
+                # uploaded_pdfs folder doesn't exist yet in repo
+                print(f"  ⚠️ Could not list PDF files for user {user_id}: {str(e)[:100]}")
+            print(f"✅ HF Storage sync complete for user {user_id} ({downloaded_count} storage files)")
+            return True
+        except Exception as e:
+            print(f"❌ HF Storage sync failed for user {user_id}: {e}")
+            return False
+    def push_storage_to_hf(self, user_id: str, commit_message: str = "Update storage") -> bool:
+        """
+        Upload storage files and uploaded PDFs for a specific user to HF repo.
+        Uploads:
+          - users/{user_id}/storage/ folder (FAISS index and metadata)
+          - users/{user_id}/uploaded_pdfs/ folder (PDF files)
+        Args:
+            user_id: User identifier (12-char hash from API key)
+            commit_message: Commit message for the upload
+        Returns:
+            bool: True if push successful, False otherwise
+        """
+        if not self.enabled:
+            print(f"⚠️ HF Storage push skipped for user {user_id} (disabled)")
+            return False
+        try:
+            base_dir = os.path.dirname(os.path.abspath(__file__))
+            user_base = os.path.join(base_dir, "users", user_id)
+            storage_dir = os.path.join(user_base, "storage")
+            uploaded_pdfs_dir = os.path.join(user_base, "uploaded_pdfs")
+            print(f"📤 Pushing storage for user {user_id} to HF repo: {self.hf_repo}")
+            upload_count = 0
+            # Upload storage folder (FAISS index and metadata)
+            if os.path.exists(storage_dir) and os.listdir(storage_dir):
+                try:
+                    self.api.upload_folder(
+                        folder_path=storage_dir,
+                        repo_id=self.hf_repo,
+                        path_in_repo=f"users/{user_id}/storage",
+                        token=self.hf_token,
+                        repo_type="model",
+                        commit_message=f"[User {user_id}] {commit_message}"
+                    )
+                    upload_count += 1
+                    print(f"  ✓ Uploaded: users/{user_id}/storage/ folder")
+                except Exception as e:
+                    print(f"  ❌ Failed to upload storage for user {user_id}: {str(e)[:100]}")
+            # Upload uploaded_pdfs folder
+            if os.path.exists(uploaded_pdfs_dir) and os.listdir(uploaded_pdfs_dir):
+                try:
+                    self.api.upload_folder(
+                        folder_path=uploaded_pdfs_dir,
+                        repo_id=self.hf_repo,
+                        path_in_repo=f"users/{user_id}/uploaded_pdfs",
+                        token=self.hf_token,
+                        repo_type="model",
+                        commit_message=f"[User {user_id}] {commit_message}"
+                    )
+                    upload_count += 1
+                    print(f"  ✓ Uploaded: users/{user_id}/uploaded_pdfs/ folder")
+                except Exception as e:
+                    print(f"  ❌ Failed to upload PDFs for user {user_id}: {str(e)[:100]}")
+            print(f"✅ HF Storage push complete for user {user_id} ({upload_count} folders)")
+            return True
+        except Exception as e:
+            print(f"❌ HF Storage push failed for user {user_id}: {e}")
+            return False
+# ============================================
+# CONVENIENCE FUNCTIONS
+# ============================================
+def create_hf_storage_manager(
+    hf_token: Optional[str] = None,
+    hf_repo: Optional[str] = None
+) -> HFStorageManager:
+    """
+    Create and return an HF Storage Manager instance.
+    Args:
+        hf_token: HF token (reads from env if not provided)
+        hf_repo: HF repo ID (reads from env if not provided)
+    Returns:
+        HFStorageManager instance
+    """
+    if hf_token is None:
+        hf_token = os.environ.get("HF_TOKEN")
+    if hf_repo is None:
+        hf_repo = os.environ.get("HF_REPO", "Hamza4100/multi-pdf-storage")
+    return HFStorageManager(hf_token=hf_token, hf_repo=hf_repo)

main.py ADDED Viewed

	@@ -0,0 +1,364 @@

+"""
+FastAPI Backend for Multi-PDF RAG System with Per-User Storage
+===============================================================
+Secure multi-user API with:
+- API key authentication
+- Per-user storage isolation
+- PDF upload and management
+- RAG-based question answering
+- HF persistent storage
+"""
+import os
+import asyncio
+from typing import List, Optional, Dict
+from fastapi import FastAPI, UploadFile, File, HTTPException, Depends
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from threading import Lock
+from rag_engine import RAGEngine
+from hf_storage import create_hf_storage_manager
+from auth import get_current_user
+from dotenv import load_dotenv
+# ============================================
+# CONFIGURATION
+# ============================================
+# Called before the os.environ reads below (presumably so values from a
+# local .env file are visible to them).
+load_dotenv()
+GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
+HF_TOKEN = os.environ.get("HF_TOKEN")
+HF_REPO = os.environ.get("HF_REPO", "Hamza4100/multi-pdf-storage")
+# Fail fast at import time: the RAG engine cannot be built without a Gemini key.
+if not GEMINI_API_KEY:
+    raise RuntimeError("❌ GEMINI_API_KEY not set")
+# Shared storage manager, used by the per-user engine manager and the endpoints.
+hf_storage = create_hf_storage_manager(hf_token=HF_TOKEN, hf_repo=HF_REPO)
+app = FastAPI(
+    title="Multi-PDF RAG System",
+    description="Secure multi-user RAG API with persistent storage",
+    version="2.0.0"
+)
+# NOTE(review): wildcard origins are combined with allow_credentials=True.
+# Requests are authenticated with the X-API-KEY header rather than cookies,
+# so credentials support looks unnecessary - confirm, then consider
+# allow_credentials=False or an explicit origin list.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# ============================================
+# PER-USER RAG ENGINE MANAGER
+# ============================================
+class UserRAGManager:
+    """Manages per-user RAG engine instances with lazy loading."""
+    def __init__(self):
+        self.engines: Dict[str, RAGEngine] = {}
+        self.locks: Dict[str, Lock] = {}
+        self.global_lock = Lock()
+    def get_user_lock(self, user_id: str) -> Lock:
+        """Get or create lock for user."""
+        with self.global_lock:
+            if user_id not in self.locks:
+                self.locks[user_id] = Lock()
+            return self.locks[user_id]
+    async def get_engine(self, user_id: str) -> RAGEngine:
+        """Get or create RAG engine for user (lazy loading)."""
+        if user_id in self.engines:
+            return self.engines[user_id]
+        user_lock = self.get_user_lock(user_id)
+        with user_lock:
+            if user_id in self.engines:
+                return self.engines[user_id]
+            print(f"🔧 Initializing RAG for user {user_id}...")
+            # Sync from HF
+            await asyncio.to_thread(hf_storage.sync_storage_from_hf, user_id)
+            # User-specific paths
+            base_dir = os.path.dirname(os.path.abspath(__file__))
+            user_storage_dir = os.path.join(base_dir, "users", user_id, "storage")
+            # Initialize engine
+            engine = await asyncio.to_thread(
+                RAGEngine,
+                gemini_api_key=GEMINI_API_KEY,
+                storage_dir=user_storage_dir
+            )
+            self.engines[user_id] = engine
+            print(f"✅ RAG ready for user {user_id}")
+            return engine
+rag_manager = UserRAGManager()
+# ============================================
+# MODELS
+# ============================================
+# Response body of POST /upload.
+class UploadResponse(BaseModel):
+    document_id: str
+    filename: str
+    status: str
+    message: str
+    # Filled from the engine result's "pages" / "chunks" entries when present.
+    pages: Optional[int] = None
+    chunks: Optional[int] = None
+# Request body of POST /query.
+class QueryRequest(BaseModel):
+    question: str
+    # Forwarded unchanged to the engine's ask() as top_k.
+    top_k: Optional[int] = 5
+# Response body of POST /query.
+class QueryResponse(BaseModel):
+    answer: str
+    # Source entries as returned by the engine (empty list when it gives none).
+    sources: List[dict]
+# One entry of the GET /documents response.
+class DocumentInfo(BaseModel):
+    doc_id: str
+    filename: str
+    upload_timestamp: str
+    num_chunks: int
+    num_pages: int
+# Response body of GET /stats; fields mirror the engine's get_stats() keys.
+class StatsResponse(BaseModel):
+    total_documents: int
+    total_chunks: int
+    index_size: int
+# Response body of DELETE /documents/{doc_id}.
+class DeleteResponse(BaseModel):
+    status: str
+    message: str
+# ============================================
+# STARTUP
+# ============================================
+@app.on_event("startup")
+async def startup_event():
+    print("🚀 Multi-PDF RAG System v2.0")
+    print(f"📦 HF Storage: {'Enabled' if hf_storage.enabled else 'Disabled'}")
+    print("✅ Server ready (per-user lazy loading)")
+# ============================================
+# ENDPOINTS
+# ============================================
+@app.get("/health")
+async def health_check():
+    """Health check (no auth required)."""
+    return {"status": "ok"}
+@app.post("/upload", response_model=UploadResponse)
+async def upload_pdf(
+    file: UploadFile = File(...),
+    user_id: str = Depends(get_current_user)
+):
+    """
+    Upload PDF for authenticated user.
+    Requires: X-API-KEY header
+    """
+    # Validate PDF
+    if not file.filename.lower().endswith('.pdf'):
+        raise HTTPException(400, "Only PDF files allowed")
+    if file.content_type and file.content_type not in ['application/pdf']:
+        raise HTTPException(400, "Invalid MIME type")
+    # Read content
+    content = await file.read()
+    # Size limit (10MB)
+    if len(content) > 10 * 1024 * 1024:
+        raise HTTPException(413, "File too large (max 10MB)")
+    try:
+        engine = await rag_manager.get_engine(user_id)
+        user_lock = rag_manager.get_user_lock(user_id)
+        with user_lock:
+            result = await asyncio.to_thread(
+                engine.upload_document,
+                filename=file.filename,
+                file_content=content,
+                action="auto"
+            )
+            if result["status"] == "success":
+                # Save PDF to user's uploaded_pdfs folder
+                base_dir = os.path.dirname(os.path.abspath(__file__))
+                user_pdfs_dir = os.path.join(base_dir, "users", user_id, "uploaded_pdfs")
+                os.makedirs(user_pdfs_dir, exist_ok=True)
+                pdf_path = os.path.join(user_pdfs_dir, file.filename)
+                with open(pdf_path, "wb") as f:
+                    f.write(content)
+                await asyncio.to_thread(
+                    hf_storage.push_storage_to_hf,
+                    user_id,
+                    f"Upload {file.filename}"
+                )
+                print(f"✅ Upload success for user {user_id}: {file.filename}")
+                return UploadResponse(
+                    document_id=result.get("doc_id", ""),
+                    filename=file.filename,
+                    status="success",
+                    message="Uploaded successfully",
+                    pages=result.get("pages"),
+                    chunks=result.get("chunks")
+                )
+            else:
+                raise HTTPException(400, result.get("message", "Upload failed"))
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"❌ Upload error (user {user_id}): {e}")
+        raise HTTPException(500, "Upload failed")
+@app.post("/query", response_model=QueryResponse)
+async def query_documents(
+    request: QueryRequest,
+    user_id: str = Depends(get_current_user)
+):
+    """
+    Query user's documents using RAG.
+    Requires: X-API-KEY header
+    """
+    try:
+        engine = await rag_manager.get_engine(user_id)
+        result = await asyncio.to_thread(
+            engine.ask,
+            query=request.question,
+            top_k=request.top_k
+        )
+        print(f"✅ Query success for user {user_id}")
+        return QueryResponse(
+            answer=result["answer"],
+            sources=result.get("sources", [])
+        )
+    except Exception as e:
+        print(f"❌ Query error (user {user_id}): {e}")
+        raise HTTPException(500, "Query failed")
+@app.get("/documents", response_model=List[DocumentInfo])
+async def get_documents(user_id: str = Depends(get_current_user)):
+    """
+    Get all documents for authenticated user.
+    Requires: X-API-KEY header
+    """
+    try:
+        engine = await rag_manager.get_engine(user_id)
+        documents = await asyncio.to_thread(engine.get_all_documents)
+        return [
+            DocumentInfo(
+                doc_id=doc["doc_id"],
+                filename=doc["filename"],
+                upload_timestamp=doc["upload_timestamp"],
+                num_chunks=doc["num_chunks"],
+                num_pages=doc["num_pages"]
+            )
+            for doc in documents
+        ]
+    except Exception as e:
+        print(f"❌ Get documents error (user {user_id}): {e}")
+        raise HTTPException(500, "Failed to retrieve documents")
+@app.delete("/documents/{doc_id}", response_model=DeleteResponse)
+async def delete_document(
+    doc_id: str,
+    user_id: str = Depends(get_current_user)
+):
+    """
+    Delete document for authenticated user.
+    Requires: X-API-KEY header
+    """
+    try:
+        engine = await rag_manager.get_engine(user_id)
+        user_lock = rag_manager.get_user_lock(user_id)
+        with user_lock:
+            result = await asyncio.to_thread(engine.delete_document, doc_id)
+            if result["status"] == "success":
+                await asyncio.to_thread(
+                    hf_storage.push_storage_to_hf,
+                    user_id,
+                    f"Delete {doc_id}"
+                )
+                print(f"✅ Delete success for user {user_id}: {doc_id}")
+                return DeleteResponse(
+                    status="success",
+                    message=result["message"]
+                )
+            else:
+                raise HTTPException(404, result["message"])
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"❌ Delete error (user {user_id}): {e}")
+        raise HTTPException(500, "Deletion failed")
+@app.get("/stats", response_model=StatsResponse)
+async def get_stats(user_id: str = Depends(get_current_user)):
+    """
+    Get stats for authenticated user.
+    Requires: X-API-KEY header
+    """
+    try:
+        engine = await rag_manager.get_engine(user_id)
+        stats = await asyncio.to_thread(engine.get_stats)
+        return StatsResponse(
+            total_documents=stats["total_documents"],
+            total_chunks=stats["total_chunks"],
+            index_size=stats["index_size"]
+        )
+    except Exception as e:
+        print(f"❌ Stats error (user {user_id}): {e}")
+        raise HTTPException(500, "Failed to retrieve stats")
+# ============================================
+# MAIN
+# ============================================
+# Direct-run entry point: serves the app with uvicorn on $PORT (default 8000).
+# NOTE(review): reload=True enables auto-reload, which is normally a
+# development setting - confirm this path is not used for deployment.
+if __name__ == "__main__":
+    import uvicorn
+    port = int(os.environ.get("PORT", 8000))
+    uvicorn.run("main:app", host="0.0.0.0", port=port, reload=True)

rag_engine.py ADDED Viewed

	@@ -0,0 +1,784 @@

+"""
+RAG Engine Module
+=================
+Handles all RAG pipeline operations:
+- PDF text extraction
+- Text chunking with overlap
+- Embedding generation using SentenceTransformers
+- FAISS vector storage and retrieval
+- Metadata and document registry management
+- Persistence of embeddings and metadata
+"""
+import os
+import json
+import hashlib
+from datetime import datetime
+from typing import List, Dict, Tuple, Optional
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+import PyPDF2
+import google.generativeai as genai
+from PIL import Image
+import io
+# OCR imports (optional)
+try:
+    import pytesseract
+    OCR_AVAILABLE = True
+except ImportError:
+    OCR_AVAILABLE = False
+    print("Warning: pytesseract not installed. OCR functionality will be disabled.")
+# ============================================
+# CONFIGURATION
+# ============================================
+# Chunking parameters
+DEFAULT_CHUNK_SIZE = 200  # words per chunk
+DEFAULT_OVERLAP_SIZE = 50  # overlapping words
+# Retrieval parameters
+DEFAULT_TOP_K = 5  # number of chunks to retrieve
+# Embedding model
+EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+EMBEDDING_DIMENSION = 384
+class RAGEngine:
+    """
+    Main RAG Engine class that handles:
+    - Document processing and embedding
+    - FAISS index management
+    - Query processing and answer generation
+    - Persistence of all data
+    """
+    def __init__(self, gemini_api_key: str, storage_dir: Optional[str] = None):
+        """
+        Initialize the RAG Engine.
+        Args:
+            gemini_api_key: API key for Google Gemini
+            storage_dir: Optional custom storage directory for per-user isolation
+        """
+        # Set storage paths
+        if storage_dir is None:
+            storage_dir = os.path.join(os.path.dirname(__file__), "storage")
+        self.storage_dir = storage_dir
+        self.faiss_index_path = os.path.join(storage_dir, "faiss.index")
+        self.metadata_path = os.path.join(storage_dir, "metadata.json")
+        self.documents_path = os.path.join(storage_dir, "documents.json")
+        # Ensure storage directory exists
+        os.makedirs(storage_dir, exist_ok=True)
+        # Initialize embedding model
+        print("Loading embedding model...")
+        self.embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
+        # Initialize Gemini
+        genai.configure(api_key=gemini_api_key)
+        self.gemini_model = genai.GenerativeModel("gemini-2.5-flash")
+        # Initialize or load FAISS index
+        self.index: Optional[faiss.IndexFlatL2] = None
+        self.metadata: List[Dict] = []  # Stores chunk text, source, page
+        self.documents: Dict[str, Dict] = {}  # Document registry
+        # Load existing data if available
+        self._load_persistent_data()
+        print(f"RAG Engine initialized. Documents: {len(self.documents)}, Chunks: {len(self.metadata)}")
+    # ============================================
+    # PERSISTENCE METHODS
+    # ============================================
+    def _load_persistent_data(self):
+        """Load FAISS index, metadata, and document registry from disk."""
+        # Load document registry
+        if os.path.exists(self.documents_path):
+            with open(self.documents_path, "r", encoding="utf-8") as f:
+                self.documents = json.load(f)
+            print(f"Loaded {len(self.documents)} documents from registry")
+        # Load metadata
+        if os.path.exists(self.metadata_path):
+            with open(self.metadata_path, "r", encoding="utf-8") as f:
+                self.metadata = json.load(f)
+            print(f"Loaded {len(self.metadata)} chunks metadata")
+        # Load FAISS index
+        if os.path.exists(self.faiss_index_path) and len(self.metadata) > 0:
+            self.index = faiss.read_index(self.faiss_index_path)
+            print(f"Loaded FAISS index with {self.index.ntotal} vectors")
+        else:
+            # Create new empty index
+            self.index = faiss.IndexFlatL2(EMBEDDING_DIMENSION)
+            print("Created new FAISS index")
+    def _save_persistent_data(self):
+        """Save FAISS index, metadata, and document registry to disk."""
+        # Save document registry
+        with open(self.documents_path, "w", encoding="utf-8") as f:
+            json.dump(self.documents, f, indent=2, ensure_ascii=False)
+        # Save metadata
+        with open(self.metadata_path, "w", encoding="utf-8") as f:
+            json.dump(self.metadata, f, indent=2, ensure_ascii=False)
+        # Save FAISS index
+        if self.index is not None and self.index.ntotal > 0:
+            faiss.write_index(self.index, self.faiss_index_path)
+        print("Persistent data saved successfully")
+    # ============================================
+    # DOCUMENT PROCESSING METHODS
+    # ============================================
+    @staticmethod
+    def compute_file_hash(file_content: bytes) -> str:
+        """
+        Compute SHA-256 hash of file content.
+        Args:
+            file_content: Raw bytes of the file
+        Returns:
+            Hexadecimal hash string
+        """
+        return hashlib.sha256(file_content).hexdigest()
+    @staticmethod
+    def chunk_text_with_overlap(text: str, chunk_size: int = DEFAULT_CHUNK_SIZE,
+                                 overlap_size: int = DEFAULT_OVERLAP_SIZE) -> List[str]:
+        """
+        Split text into overlapping chunks.
+        Args:
+            text: Input text to chunk
+            chunk_size: Number of words per chunk
+            overlap_size: Number of overlapping words between chunks
+        Returns:
+            List of text chunks
+        """
+        words = text.split()
+        chunks = []
+        start = 0
+        while start < len(words):
+            end = start + chunk_size
+            chunk = " ".join(words[start:end])
+            if chunk.strip():  # Only add non-empty chunks
+                chunks.append(chunk)
+            start += chunk_size - overlap_size
+        return chunks
+    @staticmethod
+    def extract_text_from_image(image: Image.Image) -> str:
+        """
+        Extract text from an image using OCR.
+        Args:
+            image: PIL Image object
+        Returns:
+            Extracted text string
+        """
+        if not OCR_AVAILABLE:
+            return ""
+        try:
+            # Convert to RGB if needed
+            if image.mode != 'RGB':
+                image = image.convert('RGB')
+            # Run OCR
+            text = pytesseract.image_to_string(image, lang='eng')
+            return text.strip()
+        except Exception as e:
+            print(f"OCR error: {e}")
+            return ""
+    def extract_text_from_pdf(self, pdf_content: bytes) -> List[Dict]:
+        """
+        Extract text from PDF page by page, including OCR for images.
+        Args:
+            pdf_content: Raw bytes of PDF file
+        Returns:
+            List of dicts with page_num, text, and ocr_text
+        """
+        pages = []
+        try:
+            reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
+            for page_num, page in enumerate(reader.pages):
+                # Extract regular text
+                text = page.extract_text()
+                ocr_text = ""
+                # Extract images and apply OCR
+                if OCR_AVAILABLE:
+                    try:
+                        # Get images from page
+                        if '/XObject' in page['/Resources']:
+                            xObject = page['/Resources']['/XObject'].get_object()
+                            for obj in xObject:
+                                if xObject[obj]['/Subtype'] == '/Image':
+                                    try:
+                                        # Extract image data
+                                        size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
+                                        data = xObject[obj].get_data()
+                                        # Try to create image
+                                        if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
+                                            mode = "RGB"
+                                        elif xObject[obj]['/ColorSpace'] == '/DeviceGray':
+                                            mode = "L"
+                                        else:
+                                            mode = "RGB"  # Default
+                                        try:
+                                            image = Image.frombytes(mode, size, data)
+                                            # Apply OCR
+                                            img_text = self.extract_text_from_image(image)
+                                            if img_text:
+                                                ocr_text += img_text + "\n"
+                                        except Exception as img_error:
+                                            # Try with PIL's open if frombytes fails
+                                            try:
+                                                image = Image.open(io.BytesIO(data))
+                                                img_text = self.extract_text_from_image(image)
+                                                if img_text:
+                                                    ocr_text += img_text + "\n"
+                                            except:
+                                                pass
+                                    except Exception as e:
+                                        # Skip this image if extraction fails
+                                        continue
+                    except Exception as e:
+                        print(f"Error extracting images from page {page_num + 1}: {e}")
+                # Combine regular text and OCR text
+                combined_text = ""
+                if text and text.strip():
+                    combined_text += text.strip()
+                if ocr_text.strip():
+                    if combined_text:
+                        combined_text += "\n\n[Text from images:]\n" + ocr_text.strip()
+                    else:
+                        combined_text = ocr_text.strip()
+                if combined_text:
+                    pages.append({
+                        "page_num": page_num + 1,
+                        "text": combined_text,
+                        "has_ocr": bool(ocr_text.strip())
+                    })
+        except Exception as e:
+            print(f"Error extracting PDF text: {e}")
+            raise
+        return pages
+    def process_pdf(self, filename: str, file_content: bytes,
+                    chunk_size: int = DEFAULT_CHUNK_SIZE,
+                    overlap_size: int = DEFAULT_OVERLAP_SIZE) -> List[Dict]:
+        """
+        Process a PDF: extract text (including OCR), chunk it, and prepare metadata.
+        Args:
+            filename: Original filename
+            file_content: Raw bytes of PDF
+            chunk_size: Words per chunk
+            overlap_size: Overlap between chunks
+        Returns:
+            List of chunk metadata dicts
+        """
+        # Extract pages
+        pages = self.extract_text_from_pdf(file_content)
+        # Chunk each page
+        chunks_metadata = []
+        for page_info in pages:
+            page_chunks = self.chunk_text_with_overlap(
+                page_info["text"],
+                chunk_size,
+                overlap_size
+            )
+            for chunk_text in page_chunks:
+                chunks_metadata.append({
+                    "text": chunk_text,
+                    "source": filename,
+                    "page": page_info["page_num"],
+                    "has_ocr": page_info.get("has_ocr", False)
+                })
+        return chunks_metadata
+    # ============================================
+    # DUPLICATE DETECTION METHODS
+    # ============================================
+    def check_duplicate(self, file_hash: str) -> Optional[Dict]:
+        """
+        Check if a document with the same hash already exists.
+        Args:
+            file_hash: SHA-256 hash of the file
+        Returns:
+            Document info if duplicate found, None otherwise
+        """
+        for doc_id, doc_info in self.documents.items():
+            if doc_info.get("hash") == file_hash:
+                return {"doc_id": doc_id, **doc_info}
+        return None
+    def get_document_by_filename(self, filename: str) -> Optional[Dict]:
+        """
+        Get document info by filename.
+        Args:
+            filename: Original filename
+        Returns:
+            Document info if found, None otherwise
+        """
+        for doc_id, doc_info in self.documents.items():
+            if doc_info.get("filename") == filename:
+                return {"doc_id": doc_id, **doc_info}
+        return None
+    # ============================================
+    # EMBEDDING AND INDEXING METHODS
+    # ============================================
+    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
+        """
+        Generate embeddings for a list of texts.
+        Args:
+            texts: List of text strings
+        Returns:
+            Numpy array of embeddings
+        """
+        embeddings = self.embed_model.encode(texts)
+        return np.array(embeddings).astype("float32")
+    def add_to_index(self, chunks_metadata: List[Dict]) -> int:
+        """
+        Add new chunks to FAISS index and metadata.
+        Args:
+            chunks_metadata: List of chunk dicts with text, source, page
+        Returns:
+            Number of chunks added
+        """
+        if not chunks_metadata:
+            return 0
+        # Extract texts for embedding
+        texts = [c["text"] for c in chunks_metadata]
+        # Generate embeddings
+        embeddings = self.generate_embeddings(texts)
+        # Add to FAISS index
+        self.index.add(embeddings)
+        # Add to metadata
+        self.metadata.extend(chunks_metadata)
+        return len(chunks_metadata)
+    def remove_document_from_index(self, filename: str):
+        """
+        Remove all chunks of a document from the index.
+        Note: FAISS IndexFlatL2 doesn't support removal, so we rebuild.
+        Args:
+            filename: Filename of document to remove
+        """
+        # Filter out chunks from this document
+        remaining_metadata = [
+            m for m in self.metadata if m["source"] != filename
+        ]
+        if len(remaining_metadata) == len(self.metadata):
+            return  # Nothing to remove
+        # Rebuild index with remaining chunks
+        self.metadata = remaining_metadata
+        if self.metadata:
+            texts = [m["text"] for m in self.metadata]
+            embeddings = self.generate_embeddings(texts)
+            self.index = faiss.IndexFlatL2(EMBEDDING_DIMENSION)
+            self.index.add(embeddings)
+        else:
+            self.index = faiss.IndexFlatL2(EMBEDDING_DIMENSION)
+        print(f"Removed document '{filename}' from index")
+    # ============================================
+    # DOCUMENT UPLOAD METHODS
+    # ============================================
+    def upload_document(self, filename: str, file_content: bytes,
+                        action: str = "auto") -> Dict:
+        """
+        Upload and process a document.
+        Args:
+            filename: Original filename
+            file_content: Raw bytes of PDF
+            action: "auto", "use_existing", "replace", or "cancel"
+        Returns:
+            Result dict with status and info
+        """
+        # Compute hash
+        file_hash = self.compute_file_hash(file_content)
+        # Check for duplicate
+        existing_doc = self.check_duplicate(file_hash)
+        if existing_doc:
+            if action == "auto":
+                # Return duplicate warning
+                return {
+                    "status": "duplicate",
+                    "filename": filename,
+                    "existing_filename": existing_doc["filename"],
+                    "hash": file_hash,
+                    "message": f"Document already exists as '{existing_doc['filename']}'",
+                    "options": ["use_existing", "replace", "cancel"]
+                }
+            elif action == "use_existing":
+                return {
+                    "status": "success",
+                    "filename": existing_doc["filename"],
+                    "message": "Using existing document embeddings",
+                    "chunks": 0,
+                    "reused": True
+                }
+            elif action == "cancel":
+                return {
+                    "status": "cancelled",
+                    "filename": filename,
+                    "message": "Upload cancelled"
+                }
+            elif action == "replace":
+                # Remove old document and continue with upload
+                self.remove_document_from_index(existing_doc["filename"])
+                del self.documents[existing_doc["doc_id"]]
+        # Process new document
+        try:
+            chunks_metadata = self.process_pdf(filename, file_content)
+            if not chunks_metadata:
+                return {
+                    "status": "error",
+                    "filename": filename,
+                    "message": "No text could be extracted from PDF"
+                }
+            # Add to index
+            num_chunks = self.add_to_index(chunks_metadata)
+            # Register document
+            doc_id = f"doc_{len(self.documents) + 1}_{int(datetime.now().timestamp())}"
+            self.documents[doc_id] = {
+                "filename": filename,
+                "hash": file_hash,
+                "upload_timestamp": datetime.now().isoformat(),
+                "num_chunks": num_chunks,
+                "num_pages": max(c["page"] for c in chunks_metadata)
+            }
+            # Persist changes
+            self._save_persistent_data()
+            return {
+                "status": "success",
+                "filename": filename,
+                "message": f"Document processed successfully",
+                "chunks": num_chunks,
+                "pages": self.documents[doc_id]["num_pages"]
+            }
+        except Exception as e:
+            return {
+                "status": "error",
+                "filename": filename,
+                "message": f"Error processing document: {str(e)}"
+            }
+    # ============================================
+    # QUERY AND RETRIEVAL METHODS
+    # ============================================
+    def retrieve_relevant_chunks(self, query: str, top_k: int = DEFAULT_TOP_K) -> List[Dict]:
+        """
+        Retrieve most relevant chunks for a query.
+        Args:
+            query: User's question
+            top_k: Number of chunks to retrieve
+        Returns:
+            List of relevant chunks with metadata
+        """
+        if self.index is None or self.index.ntotal == 0:
+            return []
+        # Limit top_k to available chunks
+        top_k = min(top_k, self.index.ntotal)
+        # Embed query
+        query_embedding = self.embed_model.encode([query]).astype("float32")
+        # Search FAISS
+        distances, indices = self.index.search(query_embedding, k=top_k)
+        # Gather results
+        results = []
+        for i, idx in enumerate(indices[0]):
+            if idx < len(self.metadata):
+                results.append({
+                    **self.metadata[idx],
+                    "distance": float(distances[0][i]),
+                    "relevance_rank": i + 1
+                })
+        return results
+    def generate_answer(self, query: str, context_chunks: List[Dict]) -> str:
+        """
+        Generate answer using Gemini with retrieved context.
+        Args:
+            query: User's question
+            context_chunks: Retrieved relevant chunks
+        Returns:
+            Generated answer string
+        """
+        if not context_chunks:
+            return "I don't have enough information to answer this question. Please upload relevant documents first."
+        # Build context string
+        context_parts = []
+        for chunk in context_chunks:
+            context_parts.append(
+                f"[Source: {chunk['source']}, Page {chunk['page']}]\n{chunk['text']}"
+            )
+        context = "\n\n".join(context_parts)
+        # Create prompt
+        prompt = f"""You are a helpful assistant that answers questions based ONLY on the provided context.
+Do NOT make up information that is not in the context.
+If the context doesn't contain enough information to answer, say so clearly.
+You may summarize, combine, or rephrase information from the context to make your answer clear and helpful.
+CONTEXT:
+{context}
+QUESTION:
+{query}
+ANSWER:"""
+        try:
+            response = self.gemini_model.generate_content(prompt)
+            return response.text
+        except Exception as e:
+            return f"Error generating answer: {str(e)}"
+    def verify_sources(self, query: str, answer: str, context_chunks: List[Dict]) -> List[int]:
+        """
+        Verify which chunks actually support the generated answer.
+        Args:
+            query: User's question
+            answer: Generated answer
+            context_chunks: All retrieved chunks
+        Returns:
+            List of indices of chunks that support the answer
+        """
+        if not context_chunks:
+            return []
+        # Build context with numbered chunks
+        context_parts = []
+        for i, chunk in enumerate(context_chunks):
+            context_parts.append(
+                f"[{i}] Source: {chunk['source']}, Page {chunk['page']}\n{chunk['text']}"
+            )
+        context = "\n\n".join(context_parts)
+        # Create verification prompt
+        prompt = f"""You are a citation verification assistant. Given a question, an answer, and numbered source chunks, identify which chunks were actually used to generate the answer.
+Return ONLY a comma-separated list of chunk numbers that directly support the answer (e.g., "0,2,3").
+If no chunks support the answer, return "NONE".
+Do not include explanations or any other text.
+QUESTION:
+{query}
+ANSWER:
+{answer}
+NUMBERED CHUNKS:
+{context}
+CHUNK NUMBERS THAT SUPPORT THE ANSWER:"""
+        try:
+            response = self.gemini_model.generate_content(prompt)
+            result = response.text.strip()
+            # Parse the response
+            if result.upper() == "NONE":
+                return []
+            # Extract numbers
+            used_indices = []
+            for part in result.split(","):
+                try:
+                    idx = int(part.strip())
+                    if 0 <= idx < len(context_chunks):
+                        used_indices.append(idx)
+                except ValueError:
+                    continue
+            return used_indices
+        except Exception as e:
+            print(f"Error verifying sources: {e}")
+            # Fallback: return all chunks if verification fails
+            return list(range(len(context_chunks)))
+    def ask(self, query: str, top_k: int = DEFAULT_TOP_K) -> Dict:
+        """
+        Main query method: retrieve context, generate answer, and filter sources.
+        Args:
+            query: User's question
+            top_k: Number of chunks to retrieve
+        Returns:
+            Dict with answer and verified sources
+        """
+        # Retrieve relevant chunks
+        relevant_chunks = self.retrieve_relevant_chunks(query, top_k)
+        # Generate answer
+        answer = self.generate_answer(query, relevant_chunks)
+        # Verify which chunks actually support the answer
+        used_indices = self.verify_sources(query, answer, relevant_chunks)
+        # Filter sources to only those that support the answer
+        sources = []
+        seen = set()
+        for idx in used_indices:
+            if idx < len(relevant_chunks):
+                chunk = relevant_chunks[idx]
+                source_key = f"{chunk['source']}_{chunk['page']}"
+                if source_key not in seen:
+                    sources.append({
+                        "file": chunk["source"],
+                        "page": chunk["page"]
+                    })
+                    seen.add(source_key)
+        return {
+            "answer": answer,
+            "sources": sources,
+            "num_chunks_used": len(sources),
+            "num_chunks_retrieved": len(relevant_chunks)
+        }
+    # ============================================
+    # DOCUMENT MANAGEMENT METHODS
+    # ============================================
+    def get_all_documents(self) -> List[Dict]:
+        """
+        Get list of all uploaded documents.
+        Returns:
+            List of document info dicts
+        """
+        return [
+            {"doc_id": doc_id, **info}
+            for doc_id, info in self.documents.items()
+        ]
+    def delete_document(self, doc_id: str) -> Dict:
+        """
+        Delete a document and its embeddings.
+        Args:
+            doc_id: Document ID to delete
+        Returns:
+            Result dict
+        """
+        if doc_id not in self.documents:
+            return {
+                "status": "error",
+                "message": f"Document {doc_id} not found"
+            }
+        filename = self.documents[doc_id]["filename"]
+        # Remove from index
+        self.remove_document_from_index(filename)
+        # Remove from registry
+        del self.documents[doc_id]
+        # Persist changes
+        self._save_persistent_data()
+        return {
+            "status": "success",
+            "message": f"Document '{filename}' deleted successfully"
+        }
+    def get_stats(self) -> Dict:
+        """
+        Get system statistics.
+        Returns:
+            Dict with stats
+        """
+        return {
+            "total_documents": len(self.documents),
+            "total_chunks": len(self.metadata),
+            "index_size": self.index.ntotal if self.index else 0,
+            "embedding_model": EMBEDDING_MODEL_NAME,
+            "embedding_dimension": EMBEDDING_DIMENSION
+        }

requirements.txt ADDED Viewed

	@@ -0,0 +1,43 @@

+# FastAPI Multi-PDF RAG System - Backend Requirements
+# ===================================================
+# Core Web Framework
+fastapi==0.109.0
+uvicorn[standard]==0.27.0
+python-multipart==0.0.6
+# AI & Machine Learning
+sentence-transformers==2.7.0
+google-generativeai==0.3.2
+# Vector Database
+faiss-cpu==1.7.4
+numpy==1.26.3
+# PDF Processing
+PyPDF2==3.0.1
+PyMuPDF==1.23.8
+pytesseract==0.3.10
+# Image Processing
+Pillow==10.2.0
+# Hugging Face Integration
+huggingface-hub==0.20.3
+# Environment & Configuration
+python-dotenv==1.0.0
+# Data Processing
+pandas==2.1.4
+# HTTP Client (for testing)
+httpx==0.26.0
+# UI (optional - only if running Streamlit frontend locally)
+streamlit==1.29.0
+# Development & Testing (optional)
+pytest==7.4.3
+pytest-asyncio==0.21.1

start.sh ADDED Viewed

	@@ -0,0 +1,3 @@

+#!/bin/bash
+# Run FastAPI on HF expected port
+uvicorn main:app --host 0.0.0.0 --port 7860