Hamza4100 commited on
Commit
6ad61bb
·
verified ·
1 Parent(s): d6eb4cd

Upload 7 files

Browse files
Files changed (7) hide show
  1. Dockerfile +28 -0
  2. auth.py +86 -0
  3. hf_storage.py +226 -0
  4. main.py +364 -0
  5. rag_engine.py +784 -0
  6. requirements.txt +43 -0
  7. start.sh +3 -0
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies.
# git — presumably required for huggingface_hub git-based repo operations; confirm.
# --no-install-recommends keeps the image small; the apt cache is removed in the
# same layer so it never lands in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the pip layer is cached until requirements change
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY main.py .
COPY rag_engine.py .
COPY auth.py .
COPY hf_storage.py .
COPY start.sh .

# Make start script executable
RUN chmod +x start.sh

# Expose port 7860 (HF Spaces default)
EXPOSE 7860

# Run the FastAPI app
CMD ["./start.sh"]
auth.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ API Authentication Module
3
+ =========================
4
+ Implements API key-based authentication for the FastAPI backend.
5
+ """
6
+
7
import hashlib
import hmac
import os
from typing import Optional

from fastapi import Header, HTTPException, status
11
+
12
+
13
class AuthManager:
    """Manages API key authentication and user identification.

    Keys are loaded once from the comma-separated ``API_KEYS`` environment
    variable; each valid key maps to a stable, derived user ID.
    """

    def __init__(self):
        """Initialize auth manager with API keys from environment."""
        api_keys_str = os.environ.get("API_KEYS", "")
        # Ignore empty entries so trailing/double commas are harmless.
        self.valid_api_keys = set(
            key.strip() for key in api_keys_str.split(",") if key.strip()
        )

        if not self.valid_api_keys:
            print("⚠️ WARNING: No API keys configured! Set API_KEYS environment variable.")
        else:
            print(f"✅ Auth Manager initialized with {len(self.valid_api_keys)} API key(s)")

    def derive_user_id(self, api_key: str) -> str:
        """
        Derive a stable user ID from API key using SHA256.

        Args:
            api_key: The API key

        Returns:
            12-character user ID derived from key hash
        """
        hash_bytes = hashlib.sha256(api_key.encode()).digest()
        return hash_bytes.hex()[:12]

    def validate_api_key(self, api_key: Optional[str]) -> str:
        """
        Validate API key and return user ID.

        Args:
            api_key: API key from request header

        Returns:
            user_id: Derived user identifier

        Raises:
            HTTPException: If API key is invalid or missing
        """
        if not api_key:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Missing X-API-KEY header"
            )

        # Compare against every configured key in constant time so response
        # timing cannot leak how much of a guessed key matched.
        if not any(hmac.compare_digest(api_key, valid_key)
                   for valid_key in self.valid_api_keys):
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Invalid API key"
            )

        return self.derive_user_id(api_key)
67
+
68
+
69
# Module-level singleton used by the FastAPI dependency below
auth_manager = AuthManager()


async def get_current_user(x_api_key: Optional[str] = Header(None, alias="X-API-KEY")) -> str:
    """
    FastAPI dependency that authenticates a request via its X-API-KEY header.

    Args:
        x_api_key: API key read from the X-API-KEY header (None when absent)

    Returns:
        user_id: Stable identifier derived from the validated key

    Raises:
        HTTPException: 401 when the header is missing or the key is unknown
    """
    user_id = auth_manager.validate_api_key(x_api_key)
    return user_id
hf_storage.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Storage Manager
3
+ =============================
4
+ Handles syncing storage files and uploaded PDFs with HF private repository.
5
+
6
+ Functions:
7
+ - sync_storage_from_hf() → Download storage/ and uploaded_pdfs/ on startup
8
+ - push_storage_to_hf() → Upload storage/ and uploaded_pdfs/ after changes
9
+ """
10
+
11
+ import os
12
+ from typing import Optional
13
+ from huggingface_hub import HfApi, hf_hub_download, login
14
+
15
+
16
class HFStorageManager:
    """Manages persistent storage sync with Hugging Face repository.

    Per-user layout inside the repo:
        users/{user_id}/storage/        FAISS index + JSON metadata
        users/{user_id}/uploaded_pdfs/  raw uploaded PDFs
    """

    def __init__(self, hf_token: Optional[str], hf_repo: str):
        """
        Initialize HF Storage Manager.

        Args:
            hf_token: Hugging Face API token with write access
            hf_repo: HF repository ID (e.g., "username/repo-name")
        """
        self.hf_token = hf_token
        self.hf_repo = hf_repo
        # Sync/push become no-ops unless both token and repo are configured.
        self.enabled = bool(hf_token and hf_repo)
        self.api = None

        if self.enabled:
            try:
                login(token=hf_token, add_to_git_credential=True)
                self.api = HfApi()
                print(f"✅ HF Storage Manager initialized: {hf_repo}")
            except Exception as e:
                # Fail soft: the app keeps running with local-only storage.
                print(f"⚠️ HF login failed: {e}")
                self.enabled = False
        else:
            print("⚠️ HF Storage disabled (HF_TOKEN or HF_REPO not set)")

    def sync_storage_from_hf(self, user_id: str) -> bool:
        """
        Download storage files and uploaded PDFs for a specific user from HF repo.

        Downloads:
            - users/{user_id}/storage/faiss.index
            - users/{user_id}/storage/metadata.json
            - users/{user_id}/storage/documents.json
            - users/{user_id}/uploaded_pdfs/*.pdf

        Args:
            user_id: User identifier (12-char hash from API key)

        Returns:
            bool: True if sync successful, False otherwise
        """
        if not self.enabled:
            print(f"⚠️ HF Storage sync skipped for user {user_id} (disabled)")
            return False

        try:
            # Setup local directories for this user
            base_dir = os.path.dirname(os.path.abspath(__file__))
            user_base = os.path.join(base_dir, "users", user_id)
            storage_dir = os.path.join(user_base, "storage")
            uploaded_pdfs_dir = os.path.join(user_base, "uploaded_pdfs")
            os.makedirs(storage_dir, exist_ok=True)
            os.makedirs(uploaded_pdfs_dir, exist_ok=True)

            print(f"📥 Syncing storage for user {user_id} from HF repo: {self.hf_repo}")

            # Download storage files (FAISS index and metadata).
            # BUGFIX: the repo path must interpolate the loop variable
            # {filename}; the previous literal placeholder downloaded nothing.
            storage_files = ["faiss.index", "metadata.json", "documents.json"]
            downloaded_count = 0

            for filename in storage_files:
                try:
                    # local_dir=base_dir preserves the users/... layout on disk
                    hf_hub_download(
                        repo_id=self.hf_repo,
                        filename=f"users/{user_id}/storage/{filename}",
                        token=self.hf_token,
                        repo_type="model",
                        local_dir=base_dir,
                        local_dir_use_symlinks=False
                    )
                    downloaded_count += 1
                    print(f" ✓ Downloaded: users/{user_id}/storage/{filename}")
                except Exception as e:
                    # File doesn't exist yet in HF repo (first run is okay)
                    print(f" ⚠️ Could not download users/{user_id}/storage/{filename}: {str(e)[:100]}")

            # Download all uploaded PDF files for this user
            try:
                # List all files in user's uploaded_pdfs/ folder
                files_in_repo = self.api.list_repo_files(
                    repo_id=self.hf_repo,
                    token=self.hf_token
                )
                pdf_files = [
                    f for f in files_in_repo
                    if f.startswith(f"users/{user_id}/uploaded_pdfs/") and f.endswith(".pdf")
                ]

                print(f" Found {len(pdf_files)} PDF files for user {user_id}")

                for pdf_file in pdf_files:
                    try:
                        hf_hub_download(
                            repo_id=self.hf_repo,
                            filename=pdf_file,
                            token=self.hf_token,
                            repo_type="model",
                            local_dir=base_dir,
                            local_dir_use_symlinks=False
                        )
                        print(f" ✓ Downloaded: {pdf_file}")
                    except Exception as e:
                        print(f" ⚠️ Could not download {pdf_file}: {str(e)[:100]}")

            except Exception as e:
                # uploaded_pdfs folder doesn't exist yet in repo
                print(f" ⚠️ Could not list PDF files for user {user_id}: {str(e)[:100]}")

            print(f"✅ HF Storage sync complete for user {user_id} ({downloaded_count} storage files)")
            return True

        except Exception as e:
            print(f"❌ HF Storage sync failed for user {user_id}: {e}")
            return False

    def push_storage_to_hf(self, user_id: str, commit_message: str = "Update storage") -> bool:
        """
        Upload storage files and uploaded PDFs for a specific user to HF repo.

        Uploads:
            - users/{user_id}/storage/ folder (FAISS index and metadata)
            - users/{user_id}/uploaded_pdfs/ folder (PDF files)

        Args:
            user_id: User identifier (12-char hash from API key)
            commit_message: Commit message for the upload

        Returns:
            bool: True if push successful, False otherwise
        """
        if not self.enabled:
            print(f"⚠️ HF Storage push skipped for user {user_id} (disabled)")
            return False

        try:
            base_dir = os.path.dirname(os.path.abspath(__file__))
            user_base = os.path.join(base_dir, "users", user_id)
            storage_dir = os.path.join(user_base, "storage")
            uploaded_pdfs_dir = os.path.join(user_base, "uploaded_pdfs")

            print(f"📤 Pushing storage for user {user_id} to HF repo: {self.hf_repo}")

            upload_count = 0

            # Upload storage folder (FAISS index and metadata); skipped when
            # the folder is absent or empty.
            if os.path.exists(storage_dir) and os.listdir(storage_dir):
                try:
                    self.api.upload_folder(
                        folder_path=storage_dir,
                        repo_id=self.hf_repo,
                        path_in_repo=f"users/{user_id}/storage",
                        token=self.hf_token,
                        repo_type="model",
                        commit_message=f"[User {user_id}] {commit_message}"
                    )
                    upload_count += 1
                    print(f" ✓ Uploaded: users/{user_id}/storage/ folder")
                except Exception as e:
                    print(f" ❌ Failed to upload storage for user {user_id}: {str(e)[:100]}")

            # Upload uploaded_pdfs folder
            if os.path.exists(uploaded_pdfs_dir) and os.listdir(uploaded_pdfs_dir):
                try:
                    self.api.upload_folder(
                        folder_path=uploaded_pdfs_dir,
                        repo_id=self.hf_repo,
                        path_in_repo=f"users/{user_id}/uploaded_pdfs",
                        token=self.hf_token,
                        repo_type="model",
                        commit_message=f"[User {user_id}] {commit_message}"
                    )
                    upload_count += 1
                    print(f" ✓ Uploaded: users/{user_id}/uploaded_pdfs/ folder")
                except Exception as e:
                    print(f" ❌ Failed to upload PDFs for user {user_id}: {str(e)[:100]}")

            print(f"✅ HF Storage push complete for user {user_id} ({upload_count} folders)")
            return True

        except Exception as e:
            print(f"❌ HF Storage push failed for user {user_id}: {e}")
            return False
200
+
201
+
202
+ # ============================================
203
+ # CONVENIENCE FUNCTIONS
204
+ # ============================================
205
+
206
def create_hf_storage_manager(
    hf_token: Optional[str] = None,
    hf_repo: Optional[str] = None
) -> HFStorageManager:
    """
    Build an HF Storage Manager, falling back to environment configuration.

    Args:
        hf_token: HF token; defaults to the HF_TOKEN env var when omitted
        hf_repo: HF repo ID; defaults to the HF_REPO env var, or the
            built-in "Hamza4100/multi-pdf-storage" repository

    Returns:
        HFStorageManager instance
    """
    token = os.environ.get("HF_TOKEN") if hf_token is None else hf_token
    repo = (os.environ.get("HF_REPO", "Hamza4100/multi-pdf-storage")
            if hf_repo is None else hf_repo)
    return HFStorageManager(hf_token=token, hf_repo=repo)
main.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI Backend for Multi-PDF RAG System with Per-User Storage
3
+ ===============================================================
4
+ Secure multi-user API with:
5
+ - API key authentication
6
+ - Per-user storage isolation
7
+ - PDF upload and management
8
+ - RAG-based question answering
9
+ - HF persistent storage
10
+ """
11
+
12
+ import os
13
+ import asyncio
14
+ from typing import List, Optional, Dict
15
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Depends
16
+ from fastapi.middleware.cors import CORSMiddleware
17
+ from pydantic import BaseModel
18
+ from threading import Lock
19
+ from rag_engine import RAGEngine
20
+ from hf_storage import create_hf_storage_manager
21
+ from auth import get_current_user
22
+ from dotenv import load_dotenv
23
+
24
+ # ============================================
25
+ # CONFIGURATION
26
+ # ============================================
27
+
28
load_dotenv()

# Gemini key is mandatory (answer generation); HF token/repo are optional
# and only enable cross-restart persistence.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
HF_TOKEN = os.environ.get("HF_TOKEN")
HF_REPO = os.environ.get("HF_REPO", "Hamza4100/multi-pdf-storage")

# Fail fast at import time rather than on the first query.
if not GEMINI_API_KEY:
    raise RuntimeError("❌ GEMINI_API_KEY not set")

# Shared storage-sync helper; degrades to a no-op when HF vars are unset.
hf_storage = create_hf_storage_manager(hf_token=HF_TOKEN, hf_repo=HF_REPO)

app = FastAPI(
    title="Multi-PDF RAG System",
    description="Secure multi-user RAG API with persistent storage",
    version="2.0.0"
)

# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# disallowed by the CORS spec (credentialed requests need an explicit origin)
# — confirm whether credentialed cross-origin calls are actually required.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
52
+
53
+ # ============================================
54
+ # PER-USER RAG ENGINE MANAGER
55
+ # ============================================
56
+
57
class UserRAGManager:
    """Manages per-user RAG engine instances with lazy loading."""

    def __init__(self):
        # user_id -> fully initialized engine
        self.engines: Dict[str, RAGEngine] = {}
        # user_id -> lock serializing that user's index mutations
        self.locks: Dict[str, Lock] = {}
        # guards creation of the per-user locks themselves
        self.global_lock = Lock()

    def get_user_lock(self, user_id: str) -> Lock:
        """Get or create the per-user lock (thread-safe)."""
        with self.global_lock:
            if user_id not in self.locks:
                self.locks[user_id] = Lock()
            return self.locks[user_id]

    async def get_engine(self, user_id: str) -> RAGEngine:
        """Get or lazily create the RAG engine for a user.

        The slow initialization (HF sync + model load) runs in a worker
        thread, and the per-user lock is acquired *inside* that thread.
        BUGFIX: the previous version held the threading.Lock in the
        coroutine across ``await`` — a second request for the same user
        would then block the whole event loop on ``lock.acquire`` while
        the holder needed the loop to resume, deadlocking the server.
        """
        # Fast path: already initialized.
        if user_id in self.engines:
            return self.engines[user_id]

        user_lock = self.get_user_lock(user_id)

        def _initialize() -> RAGEngine:
            # Runs entirely in a worker thread, so blocking on the lock is
            # safe and never stalls the event loop.
            with user_lock:
                # Double-check inside the lock: another request may have won.
                if user_id in self.engines:
                    return self.engines[user_id]

                print(f"🔧 Initializing RAG for user {user_id}...")

                # Pull any persisted index/PDFs for this user from HF
                hf_storage.sync_storage_from_hf(user_id)

                # User-specific storage path
                base_dir = os.path.dirname(os.path.abspath(__file__))
                user_storage_dir = os.path.join(base_dir, "users", user_id, "storage")

                engine = RAGEngine(
                    gemini_api_key=GEMINI_API_KEY,
                    storage_dir=user_storage_dir
                )

                self.engines[user_id] = engine
                print(f"✅ RAG ready for user {user_id}")
                return engine

        return await asyncio.to_thread(_initialize)
103
+
104
+
105
+ rag_manager = UserRAGManager()
106
+
107
+ # ============================================
108
+ # MODELS
109
+ # ============================================
110
+
111
class UploadResponse(BaseModel):
    """Response body for POST /upload."""
    document_id: str  # ID assigned by the RAG engine ("" if not reported)
    filename: str  # client-supplied filename echoed back
    status: str  # "success" on completed ingestion
    message: str  # human-readable status message
    pages: Optional[int] = None  # page count, when the engine reports it
    chunks: Optional[int] = None  # chunk count, when the engine reports it
118
+
119
+
120
class QueryRequest(BaseModel):
    """Request body for POST /query."""
    question: str  # natural-language question to answer from the user's documents
    top_k: Optional[int] = 5  # number of chunks to retrieve as context
123
+
124
+
125
class QueryResponse(BaseModel):
    """Response body for POST /query."""
    answer: str  # generated answer text
    sources: List[dict]  # supporting source entries returned by the engine
128
+
129
+
130
class DocumentInfo(BaseModel):
    """One entry in the GET /documents listing."""
    doc_id: str  # engine-assigned document identifier
    filename: str  # original upload filename
    upload_timestamp: str  # timestamp string as stored in the registry
    num_chunks: int  # chunks indexed for this document
    num_pages: int  # pages extracted from the PDF
136
+
137
+
138
class StatsResponse(BaseModel):
    """Response body for GET /stats."""
    total_documents: int  # documents in the user's registry
    total_chunks: int  # chunks across all documents
    index_size: int  # index size as reported by the engine's get_stats
142
+
143
+
144
class DeleteResponse(BaseModel):
    """Response body for DELETE /documents/{doc_id}."""
    status: str  # "success" when the document was removed
    message: str  # detail message from the engine
147
+
148
+
149
+ # ============================================
150
+ # STARTUP
151
+ # ============================================
152
+
153
@app.on_event("startup")
async def startup_event():
    """Log a startup banner; per-user engines are created lazily on first request."""
    # NOTE(review): @app.on_event is deprecated in recent FastAPI releases in
    # favor of lifespan handlers — confirm the installed version before migrating.
    print("🚀 Multi-PDF RAG System v2.0")
    print(f"📦 HF Storage: {'Enabled' if hf_storage.enabled else 'Disabled'}")
    print("✅ Server ready (per-user lazy loading)")
158
+
159
+
160
+ # ============================================
161
+ # ENDPOINTS
162
+ # ============================================
163
+
164
@app.get("/health")
async def health_check():
    """Liveness probe; intentionally requires no API key."""
    payload = {"status": "ok"}
    return payload
168
+
169
+
170
@app.post("/upload", response_model=UploadResponse)
async def upload_pdf(
    file: UploadFile = File(...),
    user_id: str = Depends(get_current_user)
):
    """
    Upload a PDF for the authenticated user and index it into their RAG store.
    Requires: X-API-KEY header

    Raises:
        HTTPException: 400 non-PDF / engine rejection, 413 oversize,
            500 on unexpected failures.
    """
    # Validate PDF by extension and (when the client sends one) MIME type
    if not file.filename.lower().endswith('.pdf'):
        raise HTTPException(400, "Only PDF files allowed")

    if file.content_type and file.content_type not in ['application/pdf']:
        raise HTTPException(400, "Invalid MIME type")

    # Read content
    content = await file.read()

    # Size limit (10MB)
    if len(content) > 10 * 1024 * 1024:
        raise HTTPException(413, "File too large (max 10MB)")

    # SECURITY FIX: strip client-supplied directory components so a crafted
    # filename (e.g. "../../x.pdf") cannot escape the user's upload folder.
    safe_filename = os.path.basename(file.filename)

    try:
        engine = await rag_manager.get_engine(user_id)
        user_lock = rag_manager.get_user_lock(user_id)

        # Run ingestion in a worker thread and take the per-user lock *inside*
        # it: holding a threading.Lock across an await stalls the event loop
        # and can deadlock concurrent requests for the same user.
        def _ingest() -> dict:
            with user_lock:
                return engine.upload_document(
                    filename=file.filename,
                    file_content=content,
                    action="auto"
                )

        result = await asyncio.to_thread(_ingest)

        if result["status"] == "success":
            # Persist the raw PDF to the user's uploaded_pdfs folder
            base_dir = os.path.dirname(os.path.abspath(__file__))
            user_pdfs_dir = os.path.join(base_dir, "users", user_id, "uploaded_pdfs")
            os.makedirs(user_pdfs_dir, exist_ok=True)

            pdf_path = os.path.join(user_pdfs_dir, safe_filename)
            with open(pdf_path, "wb") as f:
                f.write(content)

            # Mirror the new state (index + PDFs) to the HF repo
            await asyncio.to_thread(
                hf_storage.push_storage_to_hf,
                user_id,
                f"Upload {file.filename}"
            )

            print(f"✅ Upload success for user {user_id}: {file.filename}")

            return UploadResponse(
                document_id=result.get("doc_id", ""),
                filename=file.filename,
                status="success",
                message="Uploaded successfully",
                pages=result.get("pages"),
                chunks=result.get("chunks")
            )
        else:
            raise HTTPException(400, result.get("message", "Upload failed"))

    except HTTPException:
        raise
    except Exception as e:
        print(f"❌ Upload error (user {user_id}): {e}")
        raise HTTPException(500, "Upload failed")
239
+
240
+
241
@app.post("/query", response_model=QueryResponse)
async def query_documents(
    request: QueryRequest,
    user_id: str = Depends(get_current_user)
):
    """
    Answer a question against the authenticated user's indexed documents.
    Requires: X-API-KEY header
    """
    try:
        engine = await rag_manager.get_engine(user_id)

        rag_output = await asyncio.to_thread(
            engine.ask,
            query=request.question,
            top_k=request.top_k
        )

        print(f"✅ Query success for user {user_id}")

        answer_text = rag_output["answer"]
        source_chunks = rag_output.get("sources", [])
        return QueryResponse(answer=answer_text, sources=source_chunks)

    except Exception as e:
        print(f"❌ Query error (user {user_id}): {e}")
        raise HTTPException(500, "Query failed")
269
+
270
+
271
@app.get("/documents", response_model=List[DocumentInfo])
async def get_documents(user_id: str = Depends(get_current_user)):
    """
    List every document the authenticated user has uploaded.
    Requires: X-API-KEY header
    """
    try:
        engine = await rag_manager.get_engine(user_id)
        registry = await asyncio.to_thread(engine.get_all_documents)

        listing = []
        for entry in registry:
            listing.append(
                DocumentInfo(
                    doc_id=entry["doc_id"],
                    filename=entry["filename"],
                    upload_timestamp=entry["upload_timestamp"],
                    num_chunks=entry["num_chunks"],
                    num_pages=entry["num_pages"]
                )
            )
        return listing

    except Exception as e:
        print(f"❌ Get documents error (user {user_id}): {e}")
        raise HTTPException(500, "Failed to retrieve documents")
295
+
296
+
297
@app.delete("/documents/{doc_id}", response_model=DeleteResponse)
async def delete_document(
    doc_id: str,
    user_id: str = Depends(get_current_user)
):
    """
    Delete a document for the authenticated user and sync the change to HF.
    Requires: X-API-KEY header

    Raises:
        HTTPException: 404 when the document is unknown, 500 on failures.
    """
    try:
        engine = await rag_manager.get_engine(user_id)
        user_lock = rag_manager.get_user_lock(user_id)

        # Run deletion in a worker thread and take the per-user lock *inside*
        # it: holding a threading.Lock across an await stalls the event loop
        # and can deadlock concurrent requests for the same user.
        def _delete() -> dict:
            with user_lock:
                return engine.delete_document(doc_id)

        result = await asyncio.to_thread(_delete)

        if result["status"] == "success":
            # Mirror the updated index to the HF repo
            await asyncio.to_thread(
                hf_storage.push_storage_to_hf,
                user_id,
                f"Delete {doc_id}"
            )

            print(f"✅ Delete success for user {user_id}: {doc_id}")

            return DeleteResponse(
                status="success",
                message=result["message"]
            )
        else:
            raise HTTPException(404, result["message"])

    except HTTPException:
        raise
    except Exception as e:
        print(f"❌ Delete error (user {user_id}): {e}")
        raise HTTPException(500, "Deletion failed")
334
+
335
+
336
@app.get("/stats", response_model=StatsResponse)
async def get_stats(user_id: str = Depends(get_current_user)):
    """
    Report index statistics for the authenticated user.
    Requires: X-API-KEY header
    """
    try:
        engine = await rag_manager.get_engine(user_id)
        snapshot = await asyncio.to_thread(engine.get_stats)

        return StatsResponse(
            total_documents=snapshot["total_documents"],
            total_chunks=snapshot["total_chunks"],
            index_size=snapshot["index_size"]
        )

    except Exception as e:
        print(f"❌ Stats error (user {user_id}): {e}")
        raise HTTPException(500, "Failed to retrieve stats")
355
+
356
+
357
+ # ============================================
358
+ # MAIN
359
+ # ============================================
360
+
361
if __name__ == "__main__":
    import uvicorn
    # PORT overrides the default dev port 8000.
    port = int(os.environ.get("PORT", 8000))
    # NOTE(review): reload=True is a development convenience (file-watcher,
    # double process) — disable for production deployments.
    uvicorn.run("main:app", host="0.0.0.0", port=port, reload=True)
rag_engine.py ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG Engine Module
3
+ =================
4
+ Handles all RAG pipeline operations:
5
+ - PDF text extraction
6
+ - Text chunking with overlap
7
+ - Embedding generation using SentenceTransformers
8
+ - FAISS vector storage and retrieval
9
+ - Metadata and document registry management
10
+ - Persistence of embeddings and metadata
11
+ """
12
+
13
+ import os
14
+ import json
15
+ import hashlib
16
+ from datetime import datetime
17
+ from typing import List, Dict, Tuple, Optional
18
+ import numpy as np
19
+ import faiss
20
+ from sentence_transformers import SentenceTransformer
21
+ import PyPDF2
22
+ import google.generativeai as genai
23
+ from PIL import Image
24
+ import io
25
+
26
# OCR imports (optional): pytesseract wraps the system Tesseract binary, so
# treat it as a soft dependency and degrade gracefully when it is missing.
try:
    import pytesseract

    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False
    print("Warning: pytesseract not installed. OCR functionality will be disabled.")
34
+
35
+ # ============================================
36
+ # CONFIGURATION
37
+ # ============================================
38
+
39
# Chunking parameters
DEFAULT_CHUNK_SIZE = 200  # words per chunk
DEFAULT_OVERLAP_SIZE = 50  # overlapping words between consecutive chunks

# Retrieval parameters
DEFAULT_TOP_K = 5  # number of chunks to retrieve per query

# Embedding model
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_DIMENSION = 384  # must match the chosen model's output width
49
+
50
+
51
+
52
+ class RAGEngine:
53
+ """
54
+ Main RAG Engine class that handles:
55
+ - Document processing and embedding
56
+ - FAISS index management
57
+ - Query processing and answer generation
58
+ - Persistence of all data
59
+ """
60
+
61
    def __init__(self, gemini_api_key: str, storage_dir: Optional[str] = None):
        """
        Initialize the RAG Engine.

        Loads the embedding model, configures Gemini, and restores any
        previously persisted index/metadata from ``storage_dir``.

        Args:
            gemini_api_key: API key for Google Gemini
            storage_dir: Optional custom storage directory for per-user isolation
        """
        # Set storage paths (default: ./storage next to this module)
        if storage_dir is None:
            storage_dir = os.path.join(os.path.dirname(__file__), "storage")

        self.storage_dir = storage_dir
        self.faiss_index_path = os.path.join(storage_dir, "faiss.index")
        self.metadata_path = os.path.join(storage_dir, "metadata.json")
        self.documents_path = os.path.join(storage_dir, "documents.json")

        # Ensure storage directory exists
        os.makedirs(storage_dir, exist_ok=True)

        # Initialize embedding model (downloads weights on first run)
        print("Loading embedding model...")
        self.embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

        # Initialize Gemini client used for answer generation
        genai.configure(api_key=gemini_api_key)
        self.gemini_model = genai.GenerativeModel("gemini-2.5-flash")

        # Initialize or load FAISS index
        self.index: Optional[faiss.IndexFlatL2] = None
        self.metadata: List[Dict] = []  # Stores chunk text, source, page
        self.documents: Dict[str, Dict] = {}  # Document registry

        # Load existing data if available
        self._load_persistent_data()

        print(f"RAG Engine initialized. Documents: {len(self.documents)}, Chunks: {len(self.metadata)}")
98
+
99
+ # ============================================
100
+ # PERSISTENCE METHODS
101
+ # ============================================
102
+
103
    def _load_persistent_data(self):
        """Load FAISS index, metadata, and document registry from disk.

        Missing files are treated as a fresh install; a new empty index is
        created when no usable index/metadata pair exists.
        """

        # Load document registry
        if os.path.exists(self.documents_path):
            with open(self.documents_path, "r", encoding="utf-8") as f:
                self.documents = json.load(f)
            print(f"Loaded {len(self.documents)} documents from registry")

        # Load metadata
        if os.path.exists(self.metadata_path):
            with open(self.metadata_path, "r", encoding="utf-8") as f:
                self.metadata = json.load(f)
            print(f"Loaded {len(self.metadata)} chunks metadata")

        # Load FAISS index only when matching chunk metadata exists, so the
        # index and metadata never go out of step after a partial sync.
        if os.path.exists(self.faiss_index_path) and len(self.metadata) > 0:
            self.index = faiss.read_index(self.faiss_index_path)
            print(f"Loaded FAISS index with {self.index.ntotal} vectors")
        else:
            # Create new empty index
            self.index = faiss.IndexFlatL2(EMBEDDING_DIMENSION)
            print("Created new FAISS index")
126
+
127
    def _save_persistent_data(self):
        """Save FAISS index, metadata, and document registry to disk."""

        # Save document registry
        with open(self.documents_path, "w", encoding="utf-8") as f:
            json.dump(self.documents, f, indent=2, ensure_ascii=False)

        # Save metadata
        with open(self.metadata_path, "w", encoding="utf-8") as f:
            json.dump(self.metadata, f, indent=2, ensure_ascii=False)

        # Save FAISS index. NOTE(review): an empty index is never written, so
        # a stale faiss.index file can linger on disk after all documents are
        # deleted — confirm deletion paths handle this.
        if self.index is not None and self.index.ntotal > 0:
            faiss.write_index(self.index, self.faiss_index_path)

        print("Persistent data saved successfully")
143
+
144
+ # ============================================
145
+ # DOCUMENT PROCESSING METHODS
146
+ # ============================================
147
+
148
+ @staticmethod
149
+ def compute_file_hash(file_content: bytes) -> str:
150
+ """
151
+ Compute SHA-256 hash of file content.
152
+
153
+ Args:
154
+ file_content: Raw bytes of the file
155
+
156
+ Returns:
157
+ Hexadecimal hash string
158
+ """
159
+ return hashlib.sha256(file_content).hexdigest()
160
+
161
+ @staticmethod
162
+ def chunk_text_with_overlap(text: str, chunk_size: int = DEFAULT_CHUNK_SIZE,
163
+ overlap_size: int = DEFAULT_OVERLAP_SIZE) -> List[str]:
164
+ """
165
+ Split text into overlapping chunks.
166
+
167
+ Args:
168
+ text: Input text to chunk
169
+ chunk_size: Number of words per chunk
170
+ overlap_size: Number of overlapping words between chunks
171
+
172
+ Returns:
173
+ List of text chunks
174
+ """
175
+ words = text.split()
176
+ chunks = []
177
+ start = 0
178
+
179
+ while start < len(words):
180
+ end = start + chunk_size
181
+ chunk = " ".join(words[start:end])
182
+ if chunk.strip(): # Only add non-empty chunks
183
+ chunks.append(chunk)
184
+ start += chunk_size - overlap_size
185
+
186
+ return chunks
187
+
188
    @staticmethod
    def extract_text_from_image(image: Image.Image) -> str:
        """
        Extract text from an image using Tesseract OCR.

        Returns "" when OCR is unavailable or fails; never raises.

        Args:
            image: PIL Image object

        Returns:
            Extracted text string (stripped), or "" on failure
        """
        if not OCR_AVAILABLE:
            return ""

        try:
            # Tesseract expects RGB input; convert other modes first
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Run OCR with the English language pack
            text = pytesseract.image_to_string(image, lang='eng')
            return text.strip()
        except Exception as e:
            print(f"OCR error: {e}")
            return ""
213
+
214
def extract_text_from_pdf(self, pdf_content: bytes) -> List[Dict]:
    """
    Extract text from a PDF page by page, including OCR for embedded images.

    Args:
        pdf_content: Raw bytes of the PDF file.

    Returns:
        List of dicts with keys "page_num" (1-based), "text" (native text
        plus any OCR text), and "has_ocr" (True when OCR contributed text).
        Pages that yield no text at all are omitted from the result.

    Raises:
        Re-raises any PyPDF2 parsing error after logging it; per-image OCR
        failures are swallowed so one bad image cannot sink the whole page.
    """
    pages = []

    try:
        reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
        for page_num, page in enumerate(reader.pages):
            # Native (selectable) text layer of the page.
            text = page.extract_text()
            ocr_text = ""

            # Walk the page's image XObjects and OCR each one.
            if OCR_AVAILABLE:
                try:
                    # Get images from page
                    if '/XObject' in page['/Resources']:
                        xObject = page['/Resources']['/XObject'].get_object()

                        for obj in xObject:
                            if xObject[obj]['/Subtype'] == '/Image':
                                try:
                                    # Extract raw image data and dimensions.
                                    size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
                                    data = xObject[obj].get_data()

                                    # Map the PDF color space to a PIL mode.
                                    # NOTE(review): other color spaces (e.g.
                                    # /DeviceCMYK, /Indexed) fall back to RGB,
                                    # which may mis-decode — confirm acceptable.
                                    if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
                                        mode = "RGB"
                                    elif xObject[obj]['/ColorSpace'] == '/DeviceGray':
                                        mode = "L"
                                    else:
                                        mode = "RGB"  # Default

                                    try:
                                        # Fast path: interpret as a raw pixel buffer.
                                        image = Image.frombytes(mode, size, data)
                                        # Apply OCR
                                        img_text = self.extract_text_from_image(image)
                                        if img_text:
                                            ocr_text += img_text + "\n"
                                    except Exception as img_error:
                                        # Fallback: data may be an encoded stream
                                        # (JPEG/PNG) that PIL can open directly.
                                        try:
                                            image = Image.open(io.BytesIO(data))
                                            img_text = self.extract_text_from_image(image)
                                            if img_text:
                                                ocr_text += img_text + "\n"
                                        except:
                                            pass
                                except Exception as e:
                                    # Skip this image if extraction fails
                                    continue
                except Exception as e:
                    print(f"Error extracting images from page {page_num + 1}: {e}")

            # Merge the native text and the OCR text for this page.
            combined_text = ""
            if text and text.strip():
                combined_text += text.strip()
            if ocr_text.strip():
                if combined_text:
                    combined_text += "\n\n[Text from images:]\n" + ocr_text.strip()
                else:
                    combined_text = ocr_text.strip()

            if combined_text:
                pages.append({
                    "page_num": page_num + 1,
                    "text": combined_text,
                    "has_ocr": bool(ocr_text.strip())
                })
    except Exception as e:
        print(f"Error extracting PDF text: {e}")
        raise

    return pages
297
+
298
def process_pdf(self, filename: str, file_content: bytes,
                chunk_size: int = DEFAULT_CHUNK_SIZE,
                overlap_size: int = DEFAULT_OVERLAP_SIZE) -> List[Dict]:
    """
    Process a PDF end-to-end: extract per-page text (including OCR),
    chunk it, and attach source metadata to every chunk.

    Args:
        filename: Original filename (recorded as each chunk's source).
        file_content: Raw bytes of the PDF.
        chunk_size: Words per chunk.
        overlap_size: Overlapping words between consecutive chunks.

    Returns:
        List of chunk metadata dicts (text, source, page, has_ocr).
    """
    chunks_metadata = []

    # One pass per extracted page; each page is chunked independently so
    # page numbers in citations stay accurate.
    for page_info in self.extract_text_from_pdf(file_content):
        for piece in self.chunk_text_with_overlap(
            page_info["text"], chunk_size, overlap_size
        ):
            chunks_metadata.append({
                "text": piece,
                "source": filename,
                "page": page_info["page_num"],
                "has_ocr": page_info.get("has_ocr", False),
            })

    return chunks_metadata
333
+
334
+ # ============================================
335
+ # DUPLICATE DETECTION METHODS
336
+ # ============================================
337
+
338
def check_duplicate(self, file_hash: str) -> Optional[Dict]:
    """
    Look up an already-indexed document by content hash.

    Args:
        file_hash: SHA-256 hash of the file.

    Returns:
        Dict with "doc_id" plus the stored document info if a document
        with the same hash exists, else None.
    """
    return next(
        (
            {"doc_id": doc_id, **doc_info}
            for doc_id, doc_info in self.documents.items()
            if doc_info.get("hash") == file_hash
        ),
        None,
    )
352
+
353
def get_document_by_filename(self, filename: str) -> Optional[Dict]:
    """
    Look up an already-indexed document by its original filename.

    Args:
        filename: Original filename.

    Returns:
        Dict with "doc_id" plus the stored document info if found,
        else None.
    """
    return next(
        (
            {"doc_id": doc_id, **doc_info}
            for doc_id, doc_info in self.documents.items()
            if doc_info.get("filename") == filename
        ),
        None,
    )
367
+
368
+ # ============================================
369
+ # EMBEDDING AND INDEXING METHODS
370
+ # ============================================
371
+
372
def generate_embeddings(self, texts: List[str]) -> np.ndarray:
    """
    Encode texts with the sentence-transformer model.

    Args:
        texts: List of text strings.

    Returns:
        Numpy float32 array of embeddings (FAISS expects float32).
    """
    vectors = self.embed_model.encode(texts)
    return np.asarray(vectors).astype("float32")
384
+
385
def add_to_index(self, chunks_metadata: List[Dict]) -> int:
    """
    Embed new chunks and append them to the FAISS index and metadata list.

    Args:
        chunks_metadata: Chunk dicts carrying text, source, page.

    Returns:
        Number of chunks added (0 for empty input).
    """
    if not chunks_metadata:
        return 0

    # Embedding row i corresponds to metadata entry i — both are appended
    # in the same order so FAISS ids stay aligned with self.metadata.
    embeddings = self.generate_embeddings([c["text"] for c in chunks_metadata])
    self.index.add(embeddings)
    self.metadata.extend(chunks_metadata)

    return len(chunks_metadata)
411
+
412
def remove_document_from_index(self, filename: str):
    """
    Remove all chunks of a document from the index.

    FAISS IndexFlatL2 doesn't support removal, so the index is rebuilt
    from the surviving chunks' embeddings.

    Args:
        filename: Filename of the document to remove.
    """
    # Keep only chunks that came from other documents.
    remaining_metadata = [
        m for m in self.metadata if m["source"] != filename
    ]

    if len(remaining_metadata) == len(self.metadata):
        return  # Nothing to remove

    self.metadata = remaining_metadata

    # Rebuild the index from scratch; empty metadata yields an empty index.
    self.index = faiss.IndexFlatL2(EMBEDDING_DIMENSION)
    if self.metadata:
        texts = [m["text"] for m in self.metadata]
        embeddings = self.generate_embeddings(texts)
        self.index.add(embeddings)

    # Bug fix: the message previously printed the literal text "(unknown)"
    # (an f-string with no placeholder) instead of the removed filename.
    print(f"Removed document '{filename}' from index")
440
+
441
+ # ============================================
442
+ # DOCUMENT UPLOAD METHODS
443
+ # ============================================
444
+
445
def upload_document(self, filename: str, file_content: bytes,
                    action: str = "auto") -> Dict:
    """
    Upload and process a document, with content-hash duplicate handling.

    Args:
        filename: Original filename.
        file_content: Raw bytes of the PDF.
        action: "auto" (report a duplicate and let the caller choose),
            "use_existing" (keep the already-indexed copy),
            "replace" (drop the old copy and index this upload), or
            "cancel" (abort the upload).

    Returns:
        Result dict whose "status" is one of "duplicate", "success",
        "cancelled", or "error", plus context fields for each case.
    """
    # Hash the raw bytes so identical content is detected even when the
    # file is re-uploaded under a different name.
    file_hash = self.compute_file_hash(file_content)

    # Check for duplicate
    existing_doc = self.check_duplicate(file_hash)

    if existing_doc:
        if action == "auto":
            # Don't decide for the caller: surface the conflict and the
            # valid follow-up actions.
            return {
                "status": "duplicate",
                "filename": filename,
                "existing_filename": existing_doc["filename"],
                "hash": file_hash,
                "message": f"Document already exists as '{existing_doc['filename']}'",
                "options": ["use_existing", "replace", "cancel"]
            }
        elif action == "use_existing":
            # Reuse the embeddings already in the index; nothing to add.
            return {
                "status": "success",
                "filename": existing_doc["filename"],
                "message": "Using existing document embeddings",
                "chunks": 0,
                "reused": True
            }
        elif action == "cancel":
            return {
                "status": "cancelled",
                "filename": filename,
                "message": "Upload cancelled"
            }
        elif action == "replace":
            # Remove old document and continue below with a fresh upload.
            self.remove_document_from_index(existing_doc["filename"])
            del self.documents[existing_doc["doc_id"]]

    # Process new document
    try:
        chunks_metadata = self.process_pdf(filename, file_content)

        if not chunks_metadata:
            return {
                "status": "error",
                "filename": filename,
                "message": "No text could be extracted from PDF"
            }

        # Add to index
        num_chunks = self.add_to_index(chunks_metadata)

        # Register document; the timestamp suffix keeps ids unique even
        # after deletions shrink len(self.documents).
        doc_id = f"doc_{len(self.documents) + 1}_{int(datetime.now().timestamp())}"
        self.documents[doc_id] = {
            "filename": filename,
            "hash": file_hash,
            "upload_timestamp": datetime.now().isoformat(),
            "num_chunks": num_chunks,
            # NOTE(review): highest page that produced text — pages with
            # no extractable text are omitted upstream, so this can
            # undercount the PDF's true page count.
            "num_pages": max(c["page"] for c in chunks_metadata)
        }

        # Persist changes
        self._save_persistent_data()

        return {
            "status": "success",
            "filename": filename,
            "message": f"Document processed successfully",
            "chunks": num_chunks,
            "pages": self.documents[doc_id]["num_pages"]
        }

    except Exception as e:
        return {
            "status": "error",
            "filename": filename,
            "message": f"Error processing document: {str(e)}"
        }
535
+
536
+ # ============================================
537
+ # QUERY AND RETRIEVAL METHODS
538
+ # ============================================
539
+
540
def retrieve_relevant_chunks(self, query: str, top_k: int = DEFAULT_TOP_K) -> List[Dict]:
    """
    Find the chunks most similar to a query via FAISS L2 search.

    Args:
        query: User's question.
        top_k: Number of chunks to retrieve (capped at the index size).

    Returns:
        Chunk metadata dicts augmented with "distance" and
        "relevance_rank" (1 = closest), best match first.
    """
    if self.index is None or self.index.ntotal == 0:
        return []

    # Never ask FAISS for more neighbors than it holds.
    k = min(top_k, self.index.ntotal)

    query_vec = self.embed_model.encode([query]).astype("float32")
    distances, indices = self.index.search(query_vec, k=k)

    results = []
    for rank, chunk_idx in enumerate(indices[0]):
        if chunk_idx < len(self.metadata):
            entry = dict(self.metadata[chunk_idx])
            entry["distance"] = float(distances[0][rank])
            entry["relevance_rank"] = rank + 1
            results.append(entry)

    return results
574
+
575
def generate_answer(self, query: str, context_chunks: List[Dict]) -> str:
    """
    Generate a grounded answer with Gemini from the retrieved context.

    Args:
        query: User's question.
        context_chunks: Retrieved relevant chunks.

    Returns:
        Generated answer string, or a fallback message when no context is
        available or the model call fails.
    """
    if not context_chunks:
        return "I don't have enough information to answer this question. Please upload relevant documents first."

    # Label every chunk with its provenance so the model can cite it.
    context = "\n\n".join(
        f"[Source: {chunk['source']}, Page {chunk['page']}]\n{chunk['text']}"
        for chunk in context_chunks
    )

    prompt = f"""You are a helpful assistant that answers questions based ONLY on the provided context.
Do NOT make up information that is not in the context.
If the context doesn't contain enough information to answer, say so clearly.
You may summarize, combine, or rephrase information from the context to make your answer clear and helpful.

CONTEXT:
{context}

QUESTION:
{query}

ANSWER:"""

    try:
        response = self.gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"Error generating answer: {str(e)}"
616
+
617
def verify_sources(self, query: str, answer: str, context_chunks: List[Dict]) -> List[int]:
    """
    Ask Gemini which retrieved chunks actually support the answer.

    Args:
        query: User's question.
        answer: Generated answer.
        context_chunks: All retrieved chunks.

    Returns:
        Indices into context_chunks that support the answer; all indices
        when verification itself fails (fail open rather than dropping
        citations).
    """
    if not context_chunks:
        return []

    # Number the chunks so the model can refer to them by index.
    numbered = [
        f"[{i}] Source: {chunk['source']}, Page {chunk['page']}\n{chunk['text']}"
        for i, chunk in enumerate(context_chunks)
    ]
    context = "\n\n".join(numbered)

    prompt = f"""You are a citation verification assistant. Given a question, an answer, and numbered source chunks, identify which chunks were actually used to generate the answer.

Return ONLY a comma-separated list of chunk numbers that directly support the answer (e.g., "0,2,3").
If no chunks support the answer, return "NONE".
Do not include explanations or any other text.

QUESTION:
{query}

ANSWER:
{answer}

NUMBERED CHUNKS:
{context}

CHUNK NUMBERS THAT SUPPORT THE ANSWER:"""

    try:
        result = self.gemini_model.generate_content(prompt).text.strip()

        if result.upper() == "NONE":
            return []

        # Parse the comma-separated indices, dropping anything that is not
        # a valid in-range integer.
        used_indices = []
        for token in result.split(","):
            try:
                idx = int(token.strip())
            except ValueError:
                continue
            if 0 <= idx < len(context_chunks):
                used_indices.append(idx)
        return used_indices
    except Exception as e:
        print(f"Error verifying sources: {e}")
        # Fallback: cite everything rather than nothing.
        return list(range(len(context_chunks)))
681
+
682
def ask(self, query: str, top_k: int = DEFAULT_TOP_K) -> Dict:
    """
    Answer a question end-to-end: retrieve context, generate an answer,
    then keep only the sources the answer actually relies on.

    Args:
        query: User's question.
        top_k: Number of chunks to retrieve.

    Returns:
        Dict with "answer", deduplicated "sources" (file + page),
        "num_chunks_used", and "num_chunks_retrieved".
    """
    relevant_chunks = self.retrieve_relevant_chunks(query, top_k)
    answer = self.generate_answer(query, relevant_chunks)

    # Second LLM pass decides which retrieved chunks actually back the answer.
    used_indices = self.verify_sources(query, answer, relevant_chunks)

    # Deduplicate supporting chunks by (source, page).
    sources = []
    seen = set()
    for idx in used_indices:
        if idx >= len(relevant_chunks):
            continue
        chunk = relevant_chunks[idx]
        source_key = f"{chunk['source']}_{chunk['page']}"
        if source_key in seen:
            continue
        seen.add(source_key)
        sources.append({"file": chunk["source"], "page": chunk["page"]})

    return {
        "answer": answer,
        "sources": sources,
        "num_chunks_used": len(sources),
        "num_chunks_retrieved": len(relevant_chunks),
    }
722
+
723
+ # ============================================
724
+ # DOCUMENT MANAGEMENT METHODS
725
+ # ============================================
726
+
727
def get_all_documents(self) -> List[Dict]:
    """
    List every registered document.

    Returns:
        One dict per document: its stored info plus a "doc_id" key.
    """
    docs = []
    for doc_id, info in self.documents.items():
        entry = {"doc_id": doc_id}
        entry.update(info)
        docs.append(entry)
    return docs
738
+
739
def delete_document(self, doc_id: str) -> Dict:
    """
    Delete a document: remove its chunks from the index, drop it from the
    registry, and persist the change.

    Args:
        doc_id: Document ID to delete.

    Returns:
        Result dict with "status" ("success"/"error") and "message".
    """
    if doc_id not in self.documents:
        return {
            "status": "error",
            "message": f"Document {doc_id} not found"
        }

    filename = self.documents[doc_id]["filename"]

    # Remove from index
    self.remove_document_from_index(filename)

    # Remove from registry
    del self.documents[doc_id]

    # Persist changes
    self._save_persistent_data()

    # Bug fix: the message previously contained the literal text
    # "(unknown)" (an f-string with no placeholder) instead of the
    # deleted document's filename.
    return {
        "status": "success",
        "message": f"Document '{filename}' deleted successfully"
    }
770
+
771
def get_stats(self) -> Dict:
    """
    Report system-wide statistics.

    Returns:
        Dict with document/chunk counts, live index size, and the
        embedding model's name and dimension.
    """
    index_size = self.index.ntotal if self.index else 0
    return {
        "total_documents": len(self.documents),
        "total_chunks": len(self.metadata),
        "index_size": index_size,
        "embedding_model": EMBEDDING_MODEL_NAME,
        "embedding_dimension": EMBEDDING_DIMENSION,
    }
requirements.txt ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FastAPI Multi-PDF RAG System - Backend Requirements
2
+ # ===================================================
3
+
4
+ # Core Web Framework
5
+ fastapi==0.109.0
6
+ uvicorn[standard]==0.27.0
7
+ python-multipart==0.0.6
8
+
9
+ # AI & Machine Learning
10
+ sentence-transformers==2.7.0
11
+ google-generativeai==0.3.2
12
+
13
+ # Vector Database
14
+ faiss-cpu==1.7.4
15
+ numpy==1.26.3
16
+
17
+ # PDF Processing
18
+ PyPDF2==3.0.1
19
+ PyMuPDF==1.23.8
20
+ pytesseract==0.3.10
21
+
22
+ # Image Processing
23
+ Pillow==10.2.0
24
+
25
+ # Hugging Face Integration
26
+ huggingface-hub==0.20.3
27
+
28
+ # Environment & Configuration
29
+ python-dotenv==1.0.0
30
+
31
+ # Data Processing
32
+ pandas==2.1.4
33
+
34
+ # HTTP Client (for testing)
35
+ httpx==0.26.0
36
+
37
+ # UI (optional - only if running Streamlit frontend locally)
38
+ streamlit==1.29.0
39
+
40
+ # Development & Testing (optional)
41
+ pytest==7.4.3
42
+ pytest-asyncio==0.21.1
43
+
start.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
#!/bin/bash
# Run FastAPI on the port HF Spaces expects (7860).
# `exec` replaces the shell with uvicorn so the server runs as PID 1 in the
# container and receives SIGTERM directly, allowing clean, prompt shutdown.
exec uvicorn main:app --host 0.0.0.0 --port 7860