Sameer-Handsome173 commited on
Commit
c56a43d
Β·
verified Β·
1 Parent(s): 31e3146

Update query_service.py

Browse files
Files changed (1) hide show
  1. query_service.py +342 -373
query_service.py CHANGED
@@ -1,373 +1,342 @@
1
- import os
2
- import json
3
- import requests
4
- import base64
5
- from fastapi import FastAPI
6
- from langchain_community.vectorstores import FAISS
7
- from langchain_community.embeddings import SentenceTransformerEmbeddings
8
- from langchain_core.documents import Document
9
-
10
- # Custom JSONFileStore
11
- class JSONFileStore:
12
- def __init__(self, store_path: str):
13
- self.store_path = store_path
14
- os.makedirs(self.store_path, exist_ok=True)
15
-
16
- def mget(self, keys: list[str]) -> list[Document]:
17
- """Retrieve multiple documents by their keys."""
18
- documents = []
19
- for key in keys:
20
- file_path = os.path.join(self.store_path, f"{key}.json")
21
- if os.path.exists(file_path):
22
- try:
23
- with open(file_path, "r", encoding='utf-8') as f:
24
- doc_dict = json.load(f)
25
- documents.append(Document(
26
- page_content=doc_dict["page_content"],
27
- metadata=doc_dict["metadata"]
28
- ))
29
- except Exception as e:
30
- print(f"Error loading {key}: {e}")
31
- documents.append(None)
32
- else:
33
- documents.append(None)
34
- return documents
35
-
36
- app = FastAPI(title="πŸ” Multimodal RAG Query Service")
37
-
38
- # Paths
39
- VECTOR_PATH = "./vectorstore/faiss_index"
40
- DOCSTORE_PATH = "./docstore"
41
-
42
- # Final Answer API endpoint
43
- FINAL_ANSWER_URL = "https://sameer-handsome173-multi-modal.hf.space/final_answer"
44
-
45
- # Initialize embedding function
46
- print("πŸ”„ Loading embedding model...")
47
- try:
48
- embedding_fn = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
49
- print("βœ… Embedding model loaded")
50
- except Exception as e:
51
- print(f"❌ Error loading embeddings: {e}")
52
- raise
53
-
54
- # Load FAISS vectorstore
55
- try:
56
- if os.path.exists(VECTOR_PATH):
57
- vectorstore = FAISS.load_local(
58
- VECTOR_PATH,
59
- embedding_fn,
60
- allow_dangerous_deserialization=True
61
- )
62
- print("βœ… Loaded FAISS vectorstore")
63
- else:
64
- print("❌ Vectorstore not found! Please ingest documents first.")
65
- raise FileNotFoundError("Vectorstore not found")
66
- except Exception as e:
67
- print(f"❌ Error loading vectorstore: {e}")
68
- raise
69
-
70
- # Load JSONFileStore
71
- try:
72
- if not os.path.exists(DOCSTORE_PATH):
73
- print("❌ Docstore not found! Please ingest documents first.")
74
- raise FileNotFoundError("Docstore not found")
75
- store = JSONFileStore(DOCSTORE_PATH)
76
- print("βœ… Loaded JSONFileStore")
77
- except Exception as e:
78
- print(f"❌ Error loading docstore: {e}")
79
- raise
80
-
81
- def parse_docs(docs: list[Document]) -> dict:
82
- """
83
- Split retrieved documents into texts, tables, and images.
84
- """
85
- images = []
86
- texts = []
87
- tables = []
88
-
89
- for doc in docs:
90
- doc_type = doc.metadata.get("type", "text")
91
-
92
- if doc_type == "image" and doc.metadata.get("is_base64"):
93
- try:
94
- # Validate it's base64
95
- base64.b64decode(doc.page_content)
96
- images.append(doc.page_content)
97
- except Exception:
98
- # If decoding fails, treat as text
99
- texts.append(doc.page_content)
100
- elif doc_type == "table":
101
- tables.append(doc.page_content)
102
- else:
103
- # Regular text
104
- texts.append(doc.page_content)
105
-
106
- return {
107
- "images": images,
108
- "texts": texts,
109
- "tables": tables
110
- }
111
-
112
- def retrieve_documents(query: str, k: int = 3) -> list[Document]:
113
- """
114
- Retrieve documents:
115
- 1. Search vectorstore for similar summaries
116
- 2. Get unique doc_ids from results
117
- 3. Retrieve original documents from docstore
118
- """
119
- try:
120
- similar_docs = vectorstore.similarity_search(query, k=k)
121
- if not similar_docs:
122
- print("⚠️ No similar documents found")
123
- return []
124
-
125
- # Ensure unique doc_ids
126
- doc_ids = []
127
- for doc in similar_docs:
128
- doc_id = doc.metadata.get("doc_id")
129
- if doc_id and doc_id not in doc_ids: # prevent duplicates
130
- doc_ids.append(doc_id)
131
-
132
- if not doc_ids:
133
- print("⚠️ No doc_ids found in metadata")
134
- return []
135
-
136
- print(f"πŸ”‘ Found {len(doc_ids)} unique doc_ids")
137
-
138
- # Retrieve original documents
139
- original_docs = store.mget(doc_ids)
140
- original_docs = [doc for doc in original_docs if doc is not None]
141
- print(f"πŸ“„ Retrieved {len(original_docs)} unique documents")
142
-
143
- return original_docs
144
-
145
- except Exception as e:
146
- print(f"❌ Error in retrieval: {e}")
147
- return []
148
-
149
- def build_context_and_images(docs_by_type: dict) -> tuple[str, list[str]]:
150
- """
151
- Build context text from texts and tables, and collect image base64 strings.
152
- Returns: (context_text, list_of_base64_images)
153
- """
154
- context_parts = []
155
-
156
- # Add text documents
157
- if docs_by_type["texts"]:
158
- for i, text_content in enumerate(docs_by_type["texts"], 1):
159
- context_parts.append(f"--- Text Document {i} ---\n{text_content}")
160
-
161
- # Add table documents
162
- if docs_by_type["tables"]:
163
- for i, table_content in enumerate(docs_by_type["tables"], 1):
164
- context_parts.append(f"--- Table {i} ---\n{table_content}")
165
-
166
- context_text = "\n\n".join(context_parts)
167
-
168
- # Get images
169
- images_b64 = docs_by_type["images"]
170
-
171
- return context_text.strip(), images_b64
172
-
173
- def call_final_answer_endpoint(context: str, question: str, images_b64: list[str]) -> dict:
174
- """
175
- Call the /final_answer endpoint with context, question, and images.
176
- """
177
- try:
178
- # Prepare form data
179
- data = {
180
- "context": context,
181
- "question": question
182
- }
183
-
184
- # Prepare image files if any
185
- files = []
186
- if images_b64:
187
- for i, img_b64 in enumerate(images_b64):
188
- try:
189
- # Decode base64 to bytes
190
- img_bytes = base64.b64decode(img_b64)
191
- # Add to files list
192
- files.append(("images", (f"image_{i}.jpg", img_bytes, "image/jpeg")))
193
- except Exception as e:
194
- print(f"⚠️ Error processing image {i}: {str(e)}")
195
-
196
- # Make request
197
- if files:
198
- response = requests.post(
199
- FINAL_ANSWER_URL,
200
- data=data,
201
- files=files,
202
- timeout=150
203
- )
204
- else:
205
- response = requests.post(
206
- FINAL_ANSWER_URL,
207
- data=data,
208
- timeout=150
209
- )
210
-
211
- if response.status_code == 200:
212
- return response.json()
213
- else:
214
- return {
215
- "error": f"API returned status {response.status_code}",
216
- "details": response.text
217
- }
218
-
219
- except Exception as e:
220
- return {
221
- "error": f"Error calling final_answer endpoint: {str(e)}"
222
- }
223
-
224
- @app.get("/")
225
- def home():
226
- return {
227
- "message": "βœ… Multimodal RAG Query Service is running",
228
- "endpoints": {
229
- "query": "POST /query?question=YOUR_QUESTION&k=5",
230
- "query_with_details": "POST /query_with_details?question=YOUR_QUESTION&k=5",
231
- "stats": "GET /stats"
232
- },
233
- "features": ["Text retrieval", "Table retrieval", "Image retrieval", "Multimodal querying"]
234
- }
235
-
236
- @app.get("/stats")
237
- def get_stats():
238
- """Get system statistics"""
239
- try:
240
- vector_count = vectorstore.index.ntotal if hasattr(vectorstore, 'index') else 0
241
- docstore_files = len([f for f in os.listdir(DOCSTORE_PATH) if f.endswith('.json')]) if os.path.exists(DOCSTORE_PATH) else 0
242
-
243
- return {
244
- "status": "ready",
245
- "vectorstore_count": vector_count,
246
- "docstore_count": docstore_files
247
- }
248
- except Exception as e:
249
- return {"status": "error", "error": str(e)}
250
-
251
- @app.post("/query")
252
- async def query_rag(question: str, k: int = 5):
253
- """
254
- Query the Multimodal RAG system:
255
- 1. Search vectorstore for relevant summaries
256
- 2. Retrieve original documents (text + tables + images)
257
- 3. Parse into texts, tables, and images
258
- 4. Call final_answer endpoint with all content
259
- 5. Return answer
260
- """
261
- try:
262
- print(f"\nπŸ” Query: {question}")
263
-
264
- # Retrieve documents
265
- docs = retrieve_documents(question, k=k)
266
-
267
- if not docs:
268
- return {
269
- "question": question,
270
- "answer": "No relevant documents found. Please ingest documents first.",
271
- "retrieved_docs": 0
272
- }
273
-
274
- # Parse documents into texts, tables, and images
275
- docs_by_type = parse_docs(docs)
276
- print(f"πŸ“Š Parsed: {len(docs_by_type['texts'])} texts, {len(docs_by_type['tables'])} tables, {len(docs_by_type['images'])} images")
277
-
278
- # Build context and collect images
279
- context_text, images_b64 = build_context_and_images(docs_by_type)
280
-
281
- # Call endpoint
282
- print("πŸš€ Calling final_answer endpoint...")
283
- result = call_final_answer_endpoint(context_text, question, images_b64)
284
-
285
- # Return response
286
- if "error" in result:
287
- return {
288
- "question": question,
289
- "error": result["error"],
290
- "details": result.get("details"),
291
- "retrieved_docs": len(docs),
292
- "context_preview": context_text[:300] if context_text else "No context"
293
- }
294
-
295
- return {
296
- "question": question,
297
- "answer": result.get("response", "No response generated"),
298
- "retrieved_docs": len(docs),
299
- "docs_info": {
300
- "texts": len(docs_by_type['texts']),
301
- "tables": len(docs_by_type['tables']),
302
- "images": len(docs_by_type['images'])
303
- },
304
- "context_preview": context_text[:300] if context_text else "No context"
305
- }
306
-
307
- except Exception as e:
308
- import traceback
309
- return {
310
- "question": question,
311
- "error": str(e),
312
- "traceback": traceback.format_exc()
313
- }
314
-
315
- @app.post("/query_with_details")
316
- async def query_with_details(question: str, k: int = 5):
317
- """Query with detailed document information"""
318
- try:
319
- print(f"\nπŸ” Detailed Query: {question}")
320
-
321
- # Retrieve documents
322
- docs = retrieve_documents(question, k=k)
323
-
324
- if not docs:
325
- return {
326
- "question": question,
327
- "answer": "No relevant documents found.",
328
- "retrieved_docs": []
329
- }
330
-
331
- # Parse documents
332
- docs_by_type = parse_docs(docs)
333
- context_text, images_b64 = build_context_and_images(docs_by_type)
334
-
335
- # Call endpoint
336
- result = call_final_answer_endpoint(context_text, question, images_b64)
337
-
338
- # Prepare document info (without full base64 images)
339
- docs_info = []
340
- for doc in docs:
341
- doc_info = {
342
- "doc_id": doc.metadata.get("doc_id"),
343
- "type": doc.metadata.get("type"),
344
- "source": doc.metadata.get("source"),
345
- "summary": doc.metadata.get("summary", "")[:200],
346
- }
347
-
348
- # Don't include full content for images
349
- if doc.metadata.get("type") == "image":
350
- doc_info["content"] = "[Base64 Image Data]"
351
- else:
352
- doc_info["content"] = doc.page_content[:300]
353
-
354
- docs_info.append(doc_info)
355
-
356
- return {
357
- "question": question,
358
- "answer": result.get("response", result.get("error", "No response")),
359
- "retrieved_docs": docs_info,
360
- "stats": {
361
- "total_retrieved": len(docs),
362
- "texts": len(docs_by_type['texts']),
363
- "tables": len(docs_by_type['tables']),
364
- "images": len(docs_by_type['images'])
365
- }
366
- }
367
-
368
- except Exception as e:
369
- import traceback
370
- return {
371
- "error": str(e),
372
- "traceback": traceback.format_exc()
373
- }
 
1
+ import os
2
+ import json
3
+ import requests
4
+ import base64
5
+ import re
6
+ from fastapi import FastAPI
7
+ from langchain_community.vectorstores import FAISS
8
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
9
+ from langchain_core.documents import Document
10
+
11
+ # ───────────────────────────────────────────────
12
+ # Configuration
13
+ # ───────────────────────────────────────────────
14
+ VECTOR_PATH = "./vectorstore/faiss_index"
15
+ DOCSTORE_PATH = "./docstore"
16
+ FINAL_ANSWER_URL = "https://sameer-handsome173-multi-modal.hf.space/final_answer"
17
+ EXTENDED_TIMEOUT = int(os.getenv("FINAL_ANSWER_TIMEOUT", 150))
18
+
19
+ app = FastAPI(title="πŸ” Multimodal RAG Query Service")
20
+
21
# ───────────────────────────────────────────────
# JSONFileStore
# ───────────────────────────────────────────────
class JSONFileStore:
    """Minimal key → Document store: one ``<key>.json`` file per document.

    Each file holds ``{"page_content": ..., "metadata": ...}`` as written by
    the ingestion service.
    """

    def __init__(self, store_path: str):
        # Directory is created eagerly so mget never has to care whether the
        # store exists yet.
        self.store_path = store_path
        os.makedirs(self.store_path, exist_ok=True)

    def mget(self, keys: list[str]) -> "list[Document | None]":
        """Retrieve multiple documents by their keys.

        Returns a list aligned with ``keys``: the slot is ``None`` when the
        key's file is missing or fails to load/parse, so callers can filter
        misses without losing positional correspondence.
        """
        documents = []
        for key in keys:
            file_path = os.path.join(self.store_path, f"{key}.json")
            if os.path.exists(file_path):
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        doc_dict = json.load(f)
                    documents.append(
                        Document(page_content=doc_dict["page_content"], metadata=doc_dict["metadata"])
                    )
                except Exception as e:
                    # Corrupt/unreadable entry: log and keep the placeholder.
                    print(f"Error loading {key}: {e}")
                    documents.append(None)
            else:
                documents.append(None)
        return documents
47
+
48
# ───────────────────────────────────────────────
# Initialize embeddings, vectorstore, docstore
# ───────────────────────────────────────────────
# NOTE: everything below runs at import time; any failure re-raises and
# aborts service startup (fail-fast, so the app never serves half-loaded).
print("🔄 Loading embedding model...")
try:
    # all-MiniLM-L6-v2: compact sentence-transformer; must match the model
    # used at ingestion time for the FAISS index to be meaningful.
    embedding_fn = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    print("✅ Embedding model loaded")
except Exception as e:
    print(f"❌ Error loading embeddings: {e}")
    raise

try:
    if os.path.exists(VECTOR_PATH):
        # allow_dangerous_deserialization: the FAISS index is pickled on disk;
        # acceptable here only because the index is produced by our own
        # ingestion step, not untrusted input.
        vectorstore = FAISS.load_local(VECTOR_PATH, embedding_fn, allow_dangerous_deserialization=True)
        print("✅ Loaded FAISS vectorstore")
    else:
        raise FileNotFoundError("Vectorstore not found")
except Exception as e:
    print(f"❌ Error loading vectorstore: {e}")
    raise

try:
    # Unlike JSONFileStore.__init__ (which would create the directory), a
    # missing docstore at startup means nothing was ingested — treat as fatal.
    if not os.path.exists(DOCSTORE_PATH):
        raise FileNotFoundError("Docstore not found")
    store = JSONFileStore(DOCSTORE_PATH)
    print("✅ Loaded JSONFileStore")
except Exception as e:
    print(f"❌ Error loading docstore: {e}")
    raise
77
+
78
# ───────────────────────────────────────────────
# Response cleaning helper
# ───────────────────────────────────────────────
def clean_response_text(text: str) -> str:
    """Normalise a raw model reply for display.

    Strips hashtags, URLs, and non-ASCII symbols (emojis), collapses
    stuttered word repetitions and whitespace runs, and chops an apologetic
    tail the model sometimes appends. Empty/None-ish input is returned as-is.
    """
    if not text:
        return text

    cleaned = text

    # Social-media noise first, so later passes see plain prose.
    for noise_pattern in (r"#\S+", r"http\S+"):
        cleaned = re.sub(noise_pattern, "", cleaned)

    # ASCII round-trip silently discards emojis and exotic symbols.
    cleaned = cleaned.encode("ascii", "ignore").decode()

    # "word word word" -> "word" (case-insensitive stutter removal).
    cleaned = re.sub(r"\b(\w+)( \1\b)+", r"\1", cleaned, flags=re.IGNORECASE)

    # Squash blank-line runs, then repeated spaces.
    cleaned = re.sub(r"\n{2,}", "\n", cleaned)
    cleaned = re.sub(r" {2,}", " ", cleaned).strip()

    # Drop an apology line; '.' does not cross newlines, so only the
    # matching line's remainder is removed.
    cleaned = re.sub(r"I'm sorry.*", "", cleaned, flags=re.IGNORECASE)

    return cleaned.strip()
104
+
105
# ───────────────────────────────────────────────
# Helpers for parsing, retrieval and final call
# ───────────────────────────────────────────────
def parse_docs(docs: "list[Document]") -> dict:
    """
    Split retrieved documents into images, texts, and tables.

    An "image" document is only routed to the images bucket when its payload
    actually decodes as base64; otherwise it is demoted to plain text, so a
    corrupt payload cannot silently vanish in the downstream image upload.

    Returns dict with lists: {"images": [...], "texts": [...], "tables": [...]}
    """
    images, texts, tables = [], [], []

    for doc in docs:
        doc_type = doc.metadata.get("type", "text")
        if doc_type == "image" and doc.metadata.get("is_base64", False):
            try:
                # Validate the payload really is base64 before trusting it.
                base64.b64decode(doc.page_content)
                images.append(doc.page_content)
            except Exception:
                # Corrupt payload: fall back to treating it as text.
                texts.append(doc.page_content)
        elif doc_type == "table":
            tables.append(doc.page_content)
        else:
            texts.append(doc.page_content)

    return {"images": images, "texts": texts, "tables": tables}
126
+
127
+
128
def retrieve_documents(query: str, k: int = 5) -> list[Document]:
    """
    Fetch the original documents behind the k most similar summaries.

    Pipeline: vector similarity search -> order-preserving de-dup of the
    doc_ids found in result metadata -> docstore lookup. Returns [] on any
    failure or empty stage; errors are logged, never raised.
    """
    try:
        similar_docs = vectorstore.similarity_search(query, k=k)
        if not similar_docs:
            print("⚠️ No similar documents found")
            return []

        # dict.fromkeys de-duplicates while preserving first-seen order.
        doc_ids = list(dict.fromkeys(
            d.metadata.get("doc_id")
            for d in similar_docs
            if d.metadata.get("doc_id")
        ))

        if not doc_ids:
            print("⚠️ No doc_ids found in metadata")
            return []

        print(f"🔑 Found {len(doc_ids)} unique doc_ids")

        # mget returns None placeholders for missing keys; drop them.
        original_docs = [d for d in store.mget(doc_ids) if d is not None]
        print(f"📄 Retrieved {len(original_docs)} unique documents")

        return original_docs

    except Exception as e:
        print(f"❌ Error in retrieval: {e}")
        return []
162
+
163
+
164
def build_context_and_images(docs_by_type: dict) -> tuple[str, list[str]]:
    """
    Assemble the prompt context from text and table documents, and hand the
    base64 image strings back untouched.

    Returns: (context_text, list_of_base64_images)
    """
    text_sections = [
        f"--- Text Document {i} ---\n{body}"
        for i, body in enumerate(docs_by_type.get("texts", []), 1)
    ]
    table_sections = [
        f"--- Table {i} ---\n{body}"
        for i, body in enumerate(docs_by_type.get("tables", []), 1)
    ]

    # Sections are blank-line separated; strip() guards against an all-empty
    # input yielding stray whitespace.
    context_text = "\n\n".join(text_sections + table_sections).strip()

    return context_text, docs_by_type.get("images", [])
183
+
184
+
185
def call_final_answer_endpoint(context: str, question: str, images_b64: list[str]) -> dict:
    """
    Call the /final_answer endpoint with context, question, and images.
    Uses extended timeout to allow for slow multimodal inference.

    Returns the endpoint's JSON payload on HTTP 200; otherwise a dict with
    an "error" key (plus "details" for non-200 responses). Never raises.
    """
    try:
        # Make prompt instruction clearer for concise output.
        # NOTE(review): the instruction is prepended to the question *field*,
        # so the remote service receives a modified question string.
        data = {
            "context": context,
            "question": f"Answer concisely and without hashtags or emojis.\n\nQuestion: {question}"
        }

        files = []
        if images_b64:
            for i, img_b64 in enumerate(images_b64):
                try:
                    img_bytes = base64.b64decode(img_b64)
                    # Filename/content-type are nominal; the bytes are what matter.
                    files.append(("images", (f"image_{i}.jpg", img_bytes, "image/jpeg")))
                except Exception as e:
                    # A bad image is skipped, not fatal to the whole request.
                    print(f"⚠️ Error decoding image {i}: {e}")

        # multipart/form-data when images are attached, plain form POST otherwise.
        if files:
            response = requests.post(FINAL_ANSWER_URL, data=data, files=files, timeout=EXTENDED_TIMEOUT)
        else:
            response = requests.post(FINAL_ANSWER_URL, data=data, timeout=EXTENDED_TIMEOUT)

        if response.status_code == 200:
            return response.json()
        else:
            return {"error": f"API returned status {response.status_code}", "details": response.text}

    except Exception as e:
        # Network failures, timeouts, and JSON decode errors all land here.
        return {"error": f"Error calling final_answer endpoint: {str(e)}"}
218
+
219
# ───────────────────────────────────────────────
# FastAPI endpoints
# ───────────────────────────────────────────────
@app.get("/")
def home():
    """Landing endpoint: reports service status, timeout, and available routes."""
    routes = {
        "query": "/query?question=Your+Question",
        "query_with_details": "/query_with_details?question=Your+Question",
        "stats": "/stats",
    }
    return {
        "message": "✅ Multimodal RAG Query Service is running",
        "timeout_seconds": EXTENDED_TIMEOUT,
        "endpoints": routes,
    }
233
+
234
+
235
@app.get("/stats")
def get_stats():
    """Report how many vectors and docstore JSON files are currently loaded."""
    try:
        vector_count = 0
        if hasattr(vectorstore, "index"):
            vector_count = vectorstore.index.ntotal

        docstore_files = 0
        if os.path.exists(DOCSTORE_PATH):
            docstore_files = sum(1 for name in os.listdir(DOCSTORE_PATH) if name.endswith(".json"))

        return {"status": "ready", "vectorstore_count": vector_count, "docstore_count": docstore_files}
    except Exception as e:
        return {"status": "error", "error": str(e)}
243
+
244
+
245
@app.post("/query")
async def query_rag(question: str, k: int = 5):
    """
    Query the Multimodal RAG system:
    1. Search vectorstore for relevant summaries
    2. Retrieve original documents (text + tables + images)
    3. Parse into texts, tables, and images
    4. Call final_answer endpoint with all content
    5. Return cleaned answer

    All failures are reported as JSON payloads (never raised), so the HTTP
    status is 200 even on errors — clients must check for an "error" key.
    """
    try:
        print(f"\n🔍 Query: {question}")

        docs = retrieve_documents(question, k=k)
        if not docs:
            # Empty retrieval also covers retrieval errors, which return [].
            return {"question": question, "answer": "No relevant documents found. Please ingest documents first.", "retrieved_docs": 0}

        docs_by_type = parse_docs(docs)
        print(f"📊 Parsed: {len(docs_by_type['texts'])} texts, {len(docs_by_type['tables'])} tables, {len(docs_by_type['images'])} images")

        context_text, images_b64 = build_context_and_images(docs_by_type)
        print("🚀 Calling final_answer endpoint...")
        result = call_final_answer_endpoint(context_text, question, images_b64)

        # call_final_answer_endpoint signals failure via an "error" key.
        if "error" in result:
            return {
                "question": question,
                "error": result["error"],
                "details": result.get("details"),
                "retrieved_docs": len(docs),
                "context_preview": context_text[:300] if context_text else "No context"
            }

        # Post-process the raw model reply (strip hashtags/emojis/stutters).
        cleaned_answer = clean_response_text(result.get("response", "No response generated"))

        return {
            "question": question,
            "answer": cleaned_answer,
            "retrieved_docs": len(docs),
            "docs_info": {
                "texts": len(docs_by_type["texts"]),
                "tables": len(docs_by_type["tables"]),
                "images": len(docs_by_type["images"]),
            },
            "context_preview": context_text[:300] if context_text else "No context",
        }

    except Exception as e:
        # Last-resort catch: ship the traceback back for debugging.
        import traceback
        return {"question": question, "error": str(e), "traceback": traceback.format_exc()}
295
+
296
+
297
@app.post("/query_with_details")
async def query_with_details(question: str, k: int = 5):
    """Query with detailed document information.

    Same pipeline as /query, but the response additionally lists per-document
    metadata (doc_id, type, source, truncated summary/content) instead of
    just counts. Failures are reported as JSON, never raised.
    """
    try:
        print(f"\n🔍 Detailed Query: {question}")

        docs = retrieve_documents(question, k=k)
        if not docs:
            return {"question": question, "answer": "No relevant documents found.", "retrieved_docs": []}

        docs_by_type = parse_docs(docs)
        context_text, images_b64 = build_context_and_images(docs_by_type)

        result = call_final_answer_endpoint(context_text, question, images_b64)

        if "error" in result:
            return {"question": question, "error": result["error"], "details": result.get("details")}

        # Build a per-document summary; never inline full base64 payloads.
        docs_info = []
        for doc in docs:
            doc_info = {
                "doc_id": doc.metadata.get("doc_id"),
                "type": doc.metadata.get("type"),
                "source": doc.metadata.get("source"),
                "summary": doc.metadata.get("summary", "")[:200],  # truncate for payload size
            }
            doc_info["content"] = "[Base64 Image Data]" if doc.metadata.get("type") == "image" else doc.page_content[:300]
            docs_info.append(doc_info)

        # Post-process the raw model reply (strip hashtags/emojis/stutters).
        cleaned_answer = clean_response_text(result.get("response", "No response generated"))

        return {
            "question": question,
            "answer": cleaned_answer,
            "retrieved_docs": docs_info,
            "stats": {
                "total_retrieved": len(docs),
                "texts": len(docs_by_type["texts"]),
                "tables": len(docs_by_type["tables"]),
                "images": len(docs_by_type["images"]),
            },
        }

    except Exception as e:
        # Last-resort catch: ship the traceback back for debugging.
        import traceback
        return {"error": str(e), "traceback": traceback.format_exc()}