Sameer-Handsome173 commited on
Commit
2d99efe
·
verified ·
1 Parent(s): 9b6b721

Upload 2 files

Browse files
Files changed (2) hide show
  1. query_service.py +373 -0
  2. split.py +330 -0
query_service.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import requests
4
+ import base64
5
+ from fastapi import FastAPI
6
+ from langchain_community.vectorstores import FAISS
7
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
8
+ from langchain_core.documents import Document
9
+
10
# Custom JSONFileStore
class JSONFileStore:
    """Minimal read-only docstore: one <key>.json file per document."""

    def __init__(self, store_path: str):
        self.store_path = store_path
        os.makedirs(self.store_path, exist_ok=True)

    def mget(self, keys: list[str]) -> list[Document]:
        """Retrieve multiple documents by their keys.

        Missing or unreadable entries come back as None so the result
        stays positionally aligned with *keys*.
        """
        results = []
        for key in keys:
            path = os.path.join(self.store_path, f"{key}.json")
            if not os.path.exists(path):
                results.append(None)
                continue
            try:
                with open(path, "r", encoding='utf-8') as fh:
                    payload = json.load(fh)
                results.append(Document(
                    page_content=payload["page_content"],
                    metadata=payload["metadata"],
                ))
            except Exception as e:
                print(f"Error loading {key}: {e}")
                results.append(None)
        return results
35
+
36
# FastAPI app serving the query side of the multimodal RAG pipeline.
app = FastAPI(title="🔍 Multimodal RAG Query Service")

# Paths
VECTOR_PATH = "./vectorstore/faiss_index"  # FAISS index written by the ingestion service
DOCSTORE_PATH = "./docstore"  # directory of <doc_id>.json original documents

# Final Answer API endpoint
# Remote LLM service that turns context + question (+ images) into an answer.
FINAL_ANSWER_URL = "https://sameer-handsome173-multi-modal.hf.space/final_answer"

# Initialize embedding function
# NOTE(review): presumably must match the model used at ingestion time,
# otherwise similarity search over the index is meaningless — confirm.
print("🔄 Loading embedding model...")
try:
    embedding_fn = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    print("✅ Embedding model loaded")
except Exception as e:
    print(f"❌ Error loading embeddings: {e}")
    raise  # fail fast: the service is useless without embeddings

# Load FAISS vectorstore
try:
    if os.path.exists(VECTOR_PATH):
        # allow_dangerous_deserialization: the index file is trusted here
        # because it is produced locally by the ingestion service.
        vectorstore = FAISS.load_local(
            VECTOR_PATH,
            embedding_fn,
            allow_dangerous_deserialization=True
        )
        print("✅ Loaded FAISS vectorstore")
    else:
        print("❌ Vectorstore not found! Please ingest documents first.")
        raise FileNotFoundError("Vectorstore not found")
except Exception as e:
    print(f"❌ Error loading vectorstore: {e}")
    raise

# Load JSONFileStore
try:
    if not os.path.exists(DOCSTORE_PATH):
        print("❌ Docstore not found! Please ingest documents first.")
        raise FileNotFoundError("Docstore not found")
    store = JSONFileStore(DOCSTORE_PATH)
    print("✅ Loaded JSONFileStore")
except Exception as e:
    print(f"❌ Error loading docstore: {e}")
    raise
80
+
81
def parse_docs(docs: "list[Document]") -> dict:
    """Split retrieved documents into texts, tables, and images.

    Args:
        docs: Documents whose ``metadata["type"]`` is "text", "table" or
            "image"; image documents additionally carry ``is_base64=True``.

    Returns:
        Dict with keys "images" (base64 strings), "texts" and "tables"
        (raw page_content strings).
    """
    images: list[str] = []
    texts: list[str] = []
    tables: list[str] = []

    for doc in docs:
        doc_type = doc.metadata.get("type", "text")

        if doc_type == "image" and doc.metadata.get("is_base64"):
            try:
                # Validate it's base64. validate=True rejects non-alphabet
                # characters; without it b64decode silently discards them,
                # so arbitrary text could be misclassified as an image.
                base64.b64decode(doc.page_content, validate=True)
                images.append(doc.page_content)
            except Exception:
                # If decoding fails, treat as text
                texts.append(doc.page_content)
        elif doc_type == "table":
            tables.append(doc.page_content)
        else:
            # Regular text
            texts.append(doc.page_content)

    return {
        "images": images,
        "texts": texts,
        "tables": tables
    }
111
+
112
def retrieve_documents(query: str, k: int = 3) -> list[Document]:
    """
    Retrieve documents:
    1. Search vectorstore for similar summaries
    2. Get unique doc_ids from results
    3. Retrieve original documents from docstore
    """
    try:
        hits = vectorstore.similarity_search(query, k=k)
        if not hits:
            print("⚠️ No similar documents found")
            return []

        # De-duplicate doc_ids while preserving first-seen order.
        seen = set()
        doc_ids = []
        for hit in hits:
            identifier = hit.metadata.get("doc_id")
            if identifier and identifier not in seen:
                seen.add(identifier)
                doc_ids.append(identifier)

        if not doc_ids:
            print("⚠️ No doc_ids found in metadata")
            return []

        print(f"🔑 Found {len(doc_ids)} unique doc_ids")

        # Swap summaries for the full original documents, dropping misses.
        fetched = store.mget(doc_ids)
        originals = [item for item in fetched if item is not None]
        print(f"📄 Retrieved {len(originals)} unique documents")

        return originals

    except Exception as e:
        print(f"❌ Error in retrieval: {e}")
        return []
148
+
149
def build_context_and_images(docs_by_type: dict) -> tuple[str, list[str]]:
    """
    Build context text from texts and tables, and collect image base64 strings.
    Returns: (context_text, list_of_base64_images)
    """
    # Text documents first, then tables, each under a numbered header.
    sections = [
        f"--- Text Document {idx} ---\n{body}"
        for idx, body in enumerate(docs_by_type["texts"], 1)
    ]
    sections.extend(
        f"--- Table {idx} ---\n{body}"
        for idx, body in enumerate(docs_by_type["tables"], 1)
    )
    context = "\n\n".join(sections).strip()
    return context, docs_by_type["images"]
172
+
173
def call_final_answer_endpoint(context: str, question: str, images_b64: list[str]) -> dict:
    """
    Call the /final_answer endpoint with context, question, and images.

    Args:
        context: Concatenated text/table context for the LLM.
        question: The user's question.
        images_b64: Base64-encoded JPEG images to attach (may be empty).

    Returns:
        The endpoint's JSON payload on success, otherwise a dict with an
        "error" key (and "details" when an HTTP status was received).
    """
    try:
        # Prepare form data
        data = {
            "context": context,
            "question": question
        }

        # Decode each base64 image into bytes for multipart upload;
        # undecodable entries are skipped with a warning.
        files = []
        for i, img_b64 in enumerate(images_b64):
            try:
                img_bytes = base64.b64decode(img_b64)
                files.append(("images", (f"image_{i}.jpg", img_bytes, "image/jpeg")))
            except Exception as e:
                print(f"⚠️ Error processing image {i}: {str(e)}")

        # requests treats files=None the same as omitting the argument, so
        # one call replaces the duplicated with-files / without-files branches.
        response = requests.post(
            FINAL_ANSWER_URL,
            data=data,
            files=files or None,
            timeout=150
        )

        if response.status_code == 200:
            return response.json()
        else:
            return {
                "error": f"API returned status {response.status_code}",
                "details": response.text
            }

    except Exception as e:
        return {
            "error": f"Error calling final_answer endpoint: {str(e)}"
        }
223
+
224
@app.get("/")
def home():
    """Service banner: confirms the query service is up and lists its routes."""
    routes = {
        "query": "POST /query?question=YOUR_QUESTION&k=5",
        "query_with_details": "POST /query_with_details?question=YOUR_QUESTION&k=5",
        "stats": "GET /stats",
    }
    capabilities = ["Text retrieval", "Table retrieval", "Image retrieval", "Multimodal querying"]
    return {
        "message": "✅ Multimodal RAG Query Service is running",
        "endpoints": routes,
        "features": capabilities,
    }
235
+
236
@app.get("/stats")
def get_stats():
    """Get system statistics"""
    try:
        vector_count = 0
        if hasattr(vectorstore, 'index'):
            vector_count = vectorstore.index.ntotal

        # Each stored document is one .json file in the docstore directory.
        docstore_files = 0
        if os.path.exists(DOCSTORE_PATH):
            docstore_files = sum(
                1 for name in os.listdir(DOCSTORE_PATH) if name.endswith('.json')
            )

        return {
            "status": "ready",
            "vectorstore_count": vector_count,
            "docstore_count": docstore_files
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
250
+
251
@app.post("/query")
async def query_rag(question: str, k: int = 5):
    """
    Query the Multimodal RAG system:
    1. Search vectorstore for relevant summaries
    2. Retrieve original documents (text + tables + images)
    3. Parse into texts, tables, and images
    4. Call final_answer endpoint with all content
    5. Return answer
    """
    try:
        print(f"\n🔍 Query: {question}")

        retrieved = retrieve_documents(question, k=k)

        # Nothing indexed (or nothing similar) — short-circuit.
        if not retrieved:
            return {
                "question": question,
                "answer": "No relevant documents found. Please ingest documents first.",
                "retrieved_docs": 0
            }

        grouped = parse_docs(retrieved)
        print(f"📊 Parsed: {len(grouped['texts'])} texts, {len(grouped['tables'])} tables, {len(grouped['images'])} images")

        context_text, images_b64 = build_context_and_images(grouped)

        print("🚀 Calling final_answer endpoint...")
        result = call_final_answer_endpoint(context_text, question, images_b64)

        preview = context_text[:300] if context_text else "No context"

        # Propagate downstream errors with enough context to debug.
        if "error" in result:
            return {
                "question": question,
                "error": result["error"],
                "details": result.get("details"),
                "retrieved_docs": len(retrieved),
                "context_preview": preview
            }

        return {
            "question": question,
            "answer": result.get("response", "No response generated"),
            "retrieved_docs": len(retrieved),
            "docs_info": {
                "texts": len(grouped['texts']),
                "tables": len(grouped['tables']),
                "images": len(grouped['images'])
            },
            "context_preview": preview
        }

    except Exception as e:
        import traceback
        return {
            "question": question,
            "error": str(e),
            "traceback": traceback.format_exc()
        }
314
+
315
@app.post("/query_with_details")
async def query_with_details(question: str, k: int = 5):
    """Query with detailed document information"""
    try:
        print(f"\n🔍 Detailed Query: {question}")

        retrieved = retrieve_documents(question, k=k)
        if not retrieved:
            return {
                "question": question,
                "answer": "No relevant documents found.",
                "retrieved_docs": []
            }

        grouped = parse_docs(retrieved)
        context_text, images_b64 = build_context_and_images(grouped)
        result = call_final_answer_endpoint(context_text, question, images_b64)

        # Per-document summaries; full base64 image payloads are elided
        # so the response stays small.
        docs_info = []
        for doc in retrieved:
            is_image = doc.metadata.get("type") == "image"
            docs_info.append({
                "doc_id": doc.metadata.get("doc_id"),
                "type": doc.metadata.get("type"),
                "source": doc.metadata.get("source"),
                "summary": doc.metadata.get("summary", "")[:200],
                "content": "[Base64 Image Data]" if is_image else doc.page_content[:300],
            })

        return {
            "question": question,
            "answer": result.get("response", result.get("error", "No response")),
            "retrieved_docs": docs_info,
            "stats": {
                "total_retrieved": len(retrieved),
                "texts": len(grouped['texts']),
                "tables": len(grouped['tables']),
                "images": len(grouped['images'])
            }
        }

    except Exception as e:
        import traceback
        return {
            "error": str(e),
            "traceback": traceback.format_exc()
        }
split.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import uuid
4
+ import requests
5
+ import base64
6
+ import fitz # PyMuPDF
7
+ from fastapi import FastAPI, UploadFile, File
8
+ from pypdf import PdfReader
9
+ import pdfplumber
10
+ from PIL import Image
11
+ import io
12
+ from langchain_community.vectorstores import FAISS
13
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
14
+ from langchain_core.documents import Document
15
+
16
+
17
# ================= JSON File Store =================
class JSONFileStore:
    """Persists Documents as one JSON file per key under store_path."""

    def __init__(self, store_path: str):
        self.store_path = store_path
        os.makedirs(self.store_path, exist_ok=True)

    def _path_for(self, key: str) -> str:
        # One <key>.json file per stored document.
        return os.path.join(self.store_path, f"{key}.json")

    def mset(self, key_value_pairs: list[tuple[str, Document]]) -> None:
        """Write each (key, Document) pair to its own JSON file."""
        for key, doc in key_value_pairs:
            serialized = {"page_content": doc.page_content, "metadata": doc.metadata}
            with open(self._path_for(key), "w", encoding="utf-8") as fh:
                json.dump(serialized, fh, ensure_ascii=False)

    def mget(self, keys: list[str]) -> list[Document]:
        """Read documents for *keys*; missing/unreadable entries become None."""
        results = []
        for key in keys:
            path = self._path_for(key)
            if not os.path.exists(path):
                results.append(None)
                continue
            try:
                with open(path, "r", encoding="utf-8") as fh:
                    payload = json.load(fh)
                results.append(
                    Document(
                        page_content=payload["page_content"],
                        metadata=payload["metadata"],
                    )
                )
            except Exception as e:
                print(f"Error loading {key}: {e}")
                results.append(None)
        return results
50
+
51
+
52
# ================= FastAPI Setup =================
app = FastAPI(title="🚀 Multimodal RAG Ingestion Service (Text + Tables + Images)")

VECTOR_PATH = "./vectorstore/faiss_index"  # FAISS index shared with the query service
DOCSTORE_PATH = "./docstore"  # one <doc_id>.json per original document
TEMP_DOCS_PATH = "./docs"  # scratch directory for uploaded PDFs

# Remote summarization endpoints: text goes to the Qwen route, images to
# the "smol" captioning route.
QWEN_TEXT_URL = "https://sameer-handsome173-multi-modal.hf.space/summarize_qwen"
BLIP_IMAGE_URL = "https://sameer-handsome173-multi-modal.hf.space/summarize_smol"

# NOTE(review): embedding model must match the one the query service loads,
# or its similarity searches over this index are meaningless — confirm.
print("🔄 Loading embedding model...")
embedding_fn = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
print("✅ Embedding model loaded")

# Load or create vectorstore
if os.path.exists(VECTOR_PATH):
    # allow_dangerous_deserialization: the index is trusted because this
    # service wrote it itself.
    vectorstore = FAISS.load_local(
        VECTOR_PATH, embedding_fn, allow_dangerous_deserialization=True
    )
    print("✅ Loaded existing FAISS vectorstore")
else:
    os.makedirs(os.path.dirname(VECTOR_PATH), exist_ok=True)
    # NOTE(review): the index is seeded with a dummy "init" text; that
    # entry can surface in similarity searches — confirm it is filtered.
    vectorstore = FAISS.from_texts(["init"], embedding_fn)
    print("✅ Created new FAISS vectorstore")

# Initialize JSON store
os.makedirs(DOCSTORE_PATH, exist_ok=True)
store = JSONFileStore(DOCSTORE_PATH)
print("✅ Initialized JSONFileStore")
81
+
82
+
83
# ================= Extraction Functions =================
def extract_tables_from_pdf(pdf_path: str) -> list[str]:
    """Extract every table in the PDF as a pipe-delimited text block."""
    tables = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                for table in page.extract_tables() or []:
                    # Header line, then one " | "-joined line per row.
                    lines = [f"Table from page {page_num + 1}:"]
                    for row in table:
                        if row:
                            lines.append(
                                " | ".join(str(cell) if cell else "" for cell in row)
                            )
                    tables.append("\n".join(lines) + "\n")
                    print(f"📊 Extracted table from page {page_num + 1}")
    except Exception as e:
        print(f"⚠️ Error extracting tables: {e}")
    return tables
103
+
104
+
105
def extract_text_from_pdf(pdf_path: str) -> list[dict]:
    """Extract text per page"""
    pages = []
    try:
        for index, page in enumerate(PdfReader(pdf_path).pages):
            raw = page.extract_text()
            # Skip pages with no extractable text.
            if raw and raw.strip():
                pages.append({"page": index + 1, "content": raw.strip()})
                print(f"📝 Extracted text from page {index+1}")
    except Exception as e:
        print(f"❌ Error extracting text: {e}")
    return pages
118
+
119
+
120
import hashlib

def extract_images_from_pdf(pdf_path: str) -> list[dict]:
    """Extract large, unique images from a PDF.

    Returns:
        A list of ``{"page": int, "image_b64": str}`` dicts. This is the
        shape the /ingest endpoint consumes (it indexes each item with
        ``item["page"]`` / ``item["image_b64"]``); the previous bare-string
        return made ingestion crash with a TypeError on string indexing.

    Images smaller than 100x100 px are skipped, as are byte-identical
    duplicates (matched by MD5 of the raw stream data).
    """
    extracted: list[dict] = []
    seen_hashes: set[str] = set()
    try:
        reader = PdfReader(pdf_path)
        for page_num, page in enumerate(reader.pages):
            if '/XObject' not in page['/Resources']:
                continue
            xObject = page['/Resources']['/XObject'].get_object()
            for obj in xObject:
                if xObject[obj]['/Subtype'] != '/Image':
                    continue
                try:
                    width = xObject[obj]['/Width']
                    height = xObject[obj]['/Height']
                    if width < 100 or height < 100:
                        continue  # skip small images (icons, bullets, ...)

                    data = xObject[obj].get_data()
                    digest = hashlib.md5(data).hexdigest()
                    if digest in seen_hashes:
                        continue  # skip duplicates
                    seen_hashes.add(digest)

                    # NOTE(review): assumes raw pixel data in /DeviceRGB or a
                    # palette; DCT/JPX-encoded streams or array color spaces
                    # will raise here and be skipped by the except below.
                    mode = "RGB" if xObject[obj]['/ColorSpace'] == '/DeviceRGB' else "P"
                    image = Image.frombytes(mode, (width, height), data)
                    buffered = io.BytesIO()
                    image.save(buffered, format="JPEG")
                    img_b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
                    extracted.append({"page": page_num + 1, "image_b64": img_b64})

                    print(f"📸 Extracted image from page {page_num+1} ({width}x{height})")
                except Exception as e:
                    print(f"⚠️ Error extracting image from page {page_num+1}: {e}")
    except Exception as e:
        print(f"❌ Error extracting images: {e}")
    return extracted
159
+
160
+
161
+
162
# ================= Summarization =================
def summarize_text(content: str) -> str:
    """Summarize *content* via the remote Qwen endpoint.

    Falls back to the first 200 characters on any failure (network error,
    non-200 status, or a response missing the "response" field).
    """
    fallback = content[:200]
    try:
        reply = requests.post(
            QWEN_TEXT_URL,
            data={"prompt": f"Summarize the following content:\n\n{content}"},
            timeout=30,
        )
        if reply.status_code == 200:
            return reply.json().get("response", fallback)
        return fallback
    except Exception as e:
        print(f"⚠️ Text summary fallback: {e}")
        return fallback
177
+
178
+
179
def summarize_image(image_b64: str) -> str:
    """Describe a base64-encoded image via the remote captioning endpoint.

    Returns a generic placeholder string on any failure.
    """
    try:
        payload = base64.b64decode(image_b64)
        response = requests.post(
            BLIP_IMAGE_URL,
            files={"image": ("image.jpg", payload, "image/jpeg")},
            data={"text": "Describe this image in detail"},
            timeout=30,
        )
        if response.status_code == 200:
            return response.json().get("response", "No image summary generated")
        return "Image extracted from PDF"
    except Exception as e:
        print(f"⚠️ Image summary fallback: {e}")
        return "Image extracted from PDF"
191
+
192
+
193
# ================= FastAPI Endpoints =================
@app.get("/")
def home():
    """Service banner: confirms the ingestion service is up and lists routes."""
    routes = {
        "ingest": "POST /ingest - Upload PDF file",
        "stats": "GET /stats - View system statistics",
    }
    return {
        "message": "✅ Multimodal RAG Ingestion Service is running",
        "endpoints": routes,
    }
203
+
204
+
205
@app.get("/stats")
def get_stats():
    """Report vector-index and docstore entry counts."""
    vector_count = vectorstore.index.ntotal if hasattr(vectorstore, "index") else 0

    # Each stored document is one .json file in the docstore directory.
    if os.path.exists(DOCSTORE_PATH):
        docstore_files = sum(
            1 for name in os.listdir(DOCSTORE_PATH) if name.endswith(".json")
        )
    else:
        docstore_files = 0

    return {
        "status": "healthy",
        "vectorstore_count": vector_count,
        "docstore_count": docstore_files,
    }
220
+
221
+
222
@app.post("/ingest")
async def ingest_pdf(file: UploadFile = File(...)):
    """Ingest a PDF: extract text/tables/images, summarize each piece,
    index the summaries in FAISS and persist the originals in the docstore.

    Returns a success summary dict, or {"error": ...} when the upload is
    not a PDF or yields no extractable content.
    """
    # Case-insensitive extension check so "REPORT.PDF" is accepted too.
    if not file.filename.lower().endswith(".pdf"):
        return {"error": "Only PDF files are supported"}

    os.makedirs(TEMP_DOCS_PATH, exist_ok=True)
    temp_path = os.path.join(TEMP_DOCS_PATH, file.filename)

    with open(temp_path, "wb") as f:
        f.write(await file.read())

    try:
        print(f"\n📄 Processing {file.filename}...")
        texts = extract_text_from_pdf(temp_path)
        images = extract_images_from_pdf(temp_path)
        tables = extract_tables_from_pdf(temp_path)

        print(f"📊 Found: {len(texts)} texts, {len(tables)} tables, {len(images)} images")

        if not texts and not tables and not images:
            return {"error": "No content extracted", "filename": file.filename}

        doc_ids, summaries, originals = [], [], []

        def _register(content: str, summary: str, metadata: dict) -> None:
            # Shared bookkeeping: one uuid keys both the summary (vector
            # index) and the original document (docstore).
            doc_id = str(uuid.uuid4())
            doc_ids.append(doc_id)
            summaries.append(summary)
            originals.append(
                Document(page_content=content, metadata={"doc_id": doc_id, **metadata})
            )

        # Texts (one document per page)
        for item in texts:
            content = item["content"]
            summary = summarize_text(content)
            _register(content, summary, {
                "type": "text",
                "page": item["page"],
                "source": file.filename,
                "summary": summary,
            })

        # Tables
        for table in tables:
            summary = summarize_text(f"Table content:\n{table}")
            _register(table, summary, {
                "type": "table",
                "source": file.filename,
                "summary": summary,
            })

        # Images — accept both {"page", "image_b64"} dicts and bare base64
        # strings (older extractor output), instead of crashing with a
        # TypeError when indexing a string by key.
        for item in images:
            if isinstance(item, dict):
                page_num = item.get("page")
                img_b64 = item["image_b64"]
            else:
                page_num = None
                img_b64 = item
            summary = summarize_image(img_b64)
            _register(img_b64, summary, {
                "type": "image",
                "page": page_num,
                "source": file.filename,
                "summary": summary,
                "is_base64": True,
            })

        # Store: summaries into FAISS, originals into the JSON docstore.
        vectorstore.add_texts(
            texts=summaries,
            metadatas=[{"doc_id": doc_id, "source": file.filename} for doc_id in doc_ids],
            ids=doc_ids,
        )
        store.mset(list(zip(doc_ids, originals)))
        vectorstore.save_local(VECTOR_PATH)
        print("✅ Saved to disk")

        return {
            "status": "success",
            "filename": file.filename,
            "processed": {
                "texts": len(texts),
                "tables": len(tables),
                "images": len(images),
                "total": len(originals),
            },
            "doc_ids_sample": doc_ids[:5],
            "message": f"✅ Processed {len(originals)} components from {file.filename}",
        }
    finally:
        # Always clean up the temp upload — the original leaked it on the
        # "No content extracted" early return and on any exception.
        if os.path.exists(temp_path):
            os.remove(temp_path)