Ryanfafa committed on
Commit
31341ac
·
verified ·
1 Parent(s): 8f75e88

Update rag_engine.py

Browse files
Files changed (1) hide show
  1. rag_engine.py +300 -40
rag_engine.py CHANGED
@@ -2,6 +2,19 @@
2
  rag_engine.py — Multimodal RAG Engine with Conversation Memory
3
  Supports: PDF, TXT, DOCX, CSV, XLSX, Images (JPG/PNG/WEBP)
4
  Memory: sliding window of last 6 exchanges
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  """
6
 
7
  import os
@@ -9,6 +22,7 @@ import re
9
  import io
10
  import json
11
  import time
 
12
  import tempfile
13
  import requests
14
  import logging
@@ -51,6 +65,11 @@ CANDIDATE_MODELS = [
51
  "HuggingFaceTB/SmolLM3-3B:hf-inference",
52
  ]
53
 
 
 
 
 
 
54
 
55
  def get_suffix(name: str) -> str:
56
  return Path(name).suffix.lower() or ".txt"
@@ -182,6 +201,11 @@ class RAGEngine:
182
  )]
183
 
184
  def _load_docx(self, data: bytes, filename: str) -> List[Document]:
 
 
 
 
 
185
  try:
186
  import docx2txt
187
  with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
@@ -192,7 +216,15 @@ class RAGEngine:
192
  finally:
193
  os.unlink(tmp_path)
194
  except ImportError:
 
 
 
 
195
  text = data.decode("utf-8", errors="replace")
 
 
 
 
196
  return [Document(page_content=text, metadata={"source": filename, "type": "docx"})]
197
 
198
  def _load_csv(self, data: bytes, filename: str) -> List[Document]:
@@ -200,6 +232,7 @@ class RAGEngine:
200
  df = pd.read_csv(io.BytesIO(data))
201
  docs = []
202
 
 
203
  summary = (
204
  f"File: {filename}\n"
205
  f"Shape: {df.shape[0]} rows × {df.shape[1]} columns\n"
@@ -208,15 +241,20 @@ class RAGEngine:
208
  )
209
  docs.append(Document(page_content=summary, metadata={"source": filename, "type": "csv_summary"}))
210
 
 
211
  try:
212
  stats = "Statistical summary:\n" + df.describe(include="all").to_string()
213
  docs.append(Document(page_content=stats, metadata={"source": filename, "type": "csv_stats"}))
214
- except Exception:
215
- pass
216
 
217
- for i in range(0, min(len(df), 500), 50):
218
- chunk = f"Rows {i}–{i+50}:\n{df.iloc[i:i+50].to_string(index=False)}"
219
- docs.append(Document(page_content=chunk, metadata={"source": filename, "type": "csv_rows"}))
 
 
 
 
220
 
221
  return docs
222
 
@@ -225,49 +263,261 @@ class RAGEngine:
225
  xl = pd.ExcelFile(io.BytesIO(data))
226
  docs = []
227
  for sheet in xl.sheet_names:
228
- df = xl.parse(sheet)
229
- text = (
230
- f"Sheet: {sheet} | {df.shape[0]} rows × {df.shape[1]} cols\n"
231
- f"Columns: {', '.join(str(c) for c in df.columns)}\n\n"
232
- f"{df.head(10).to_string(index=False)}"
233
- )
234
- docs.append(Document(page_content=text, metadata={"source": filename, "type": "excel", "sheet": sheet}))
 
 
 
 
 
235
  return docs
236
 
 
 
237
  def _load_image(self, data: bytes, filename: str) -> List[Document]:
238
- caption = self._caption_image(data, filename)
239
- text = (
240
- f"Image file: {filename}\n\n"
241
- f"AI-generated image description:\n{caption}\n\n"
242
- f"The above description represents the full visual content of this image."
243
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  return [Document(
245
  page_content=text,
246
- metadata={"source": filename, "type": "image", "caption": caption}
 
 
 
 
 
247
  )]
248
 
249
- def _caption_image(self, data: bytes, filename: str) -> str:
 
 
 
 
 
250
  hf_token = os.environ.get("HF_TOKEN", "")
251
  if not hf_token:
252
  return f"[Image: {filename}] — Add HF_TOKEN secret to enable AI image captioning."
253
- try:
254
- import base64
255
- resp = requests.post(
256
- "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large",
257
- headers={"Authorization": f"Bearer {hf_token}"},
258
- json={"inputs": base64.b64encode(data).decode()},
259
- timeout=30,
260
- )
261
- if resp.status_code == 200:
262
- result = resp.json()
263
- if isinstance(result, list) and result:
264
- caption = result[0].get("generated_text", "")
265
- if caption:
266
- logger.info(f"Image caption: {caption[:80]}")
267
- return caption
268
- except Exception as e:
269
- logger.warning(f"Caption failed: {e}")
270
- return f"[Image: {filename}] — Visual content uploaded (captioning unavailable)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  # ── Indexing ─────────────────────────────────────────────────────────────
273
 
@@ -335,9 +585,19 @@ class RAGEngine:
335
 
336
  doc_type_hint = ""
337
  if self._doc_type in {".jpg", ".jpeg", ".png", ".webp"}:
338
- doc_type_hint = "The document is an IMAGE described by an AI caption. Base your answer on the caption."
 
 
 
 
 
339
  elif self._doc_type in {".csv", ".xlsx", ".xls"}:
340
- doc_type_hint = "The document is tabular data (spreadsheet/CSV). Refer to column names and values precisely."
 
 
 
 
 
341
 
342
  system_prompt = (
343
  f"You are DocMind AI, an expert document analyst built by Ryan Farahani.\n"
 
2
  rag_engine.py — Multimodal RAG Engine with Conversation Memory
3
  Supports: PDF, TXT, DOCX, CSV, XLSX, Images (JPG/PNG/WEBP)
4
  Memory: sliding window of last 6 exchanges
5
+
6
+ FIXES applied (vs original):
7
+ 1. _caption_image: send raw bytes to BLIP API, not JSON-encoded base64.
8
+ The HF Inference API for image-to-text expects raw image bytes.
9
+ 2. Added _describe_image_with_vlm: uses a vision-language model via the
10
+ HF chat completions API to generate a detailed, multi-sentence
11
+ description — much richer than BLIP's one-line captions.
12
+ 3. _load_image: builds a richer document from both short caption + detailed
13
+ VLM description, giving RAG far more content to index and retrieve.
14
+ 4. _load_docx: broadened exception handling so a corrupt .docx doesn't
15
+ crash the ingestion; falls back to raw-text extraction.
16
+ 5. _load_csv / _load_excel: added try/except per section so partial
17
+ failures don't block the rest of the ingestion.
18
  """
19
 
20
  import os
 
22
  import io
23
  import json
24
  import time
25
+ import base64
26
  import tempfile
27
  import requests
28
  import logging
 
65
  "HuggingFaceTB/SmolLM3-3B:hf-inference",
66
  ]
67
 
68
+ # Vision-language models for detailed image descriptions
69
+ VLM_CAPTION_MODELS = [
70
+ "meta-llama/Llama-3.2-11B-Vision-Instruct",
71
+ ]
72
+
73
 
74
  def get_suffix(name: str) -> str:
75
  return Path(name).suffix.lower() or ".txt"
 
201
  )]
202
 
203
  def _load_docx(self, data: bytes, filename: str) -> List[Document]:
204
+ """
205
+ FIX: Catch *all* exceptions from docx2txt, not just ImportError.
206
+ A corrupt or password-protected .docx would otherwise crash ingestion.
207
+ """
208
+ text = ""
209
  try:
210
  import docx2txt
211
  with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
 
216
  finally:
217
  os.unlink(tmp_path)
218
  except ImportError:
219
+ logger.warning("docx2txt not installed — falling back to raw text extraction")
220
+ text = data.decode("utf-8", errors="replace")
221
+ except Exception as e:
222
+ logger.warning(f"docx2txt failed ({e}) — falling back to raw text extraction")
223
  text = data.decode("utf-8", errors="replace")
224
+
225
+ if not text or not text.strip():
226
+ text = f"[Document: {filename}] — Could not extract text content."
227
+
228
  return [Document(page_content=text, metadata={"source": filename, "type": "docx"})]
229
 
230
  def _load_csv(self, data: bytes, filename: str) -> List[Document]:
 
232
  df = pd.read_csv(io.BytesIO(data))
233
  docs = []
234
 
235
+ # Summary
236
  summary = (
237
  f"File: {filename}\n"
238
  f"Shape: {df.shape[0]} rows × {df.shape[1]} columns\n"
 
241
  )
242
  docs.append(Document(page_content=summary, metadata={"source": filename, "type": "csv_summary"}))
243
 
244
+ # Statistics (wrapped in try/except so partial failure doesn't block)
245
  try:
246
  stats = "Statistical summary:\n" + df.describe(include="all").to_string()
247
  docs.append(Document(page_content=stats, metadata={"source": filename, "type": "csv_stats"}))
248
+ except Exception as e:
249
+ logger.warning(f"CSV stats failed: {e}")
250
 
251
+ # Row chunks
252
+ try:
253
+ for i in range(0, min(len(df), 500), 50):
254
+ chunk = f"Rows {i}–{i+50}:\n{df.iloc[i:i+50].to_string(index=False)}"
255
+ docs.append(Document(page_content=chunk, metadata={"source": filename, "type": "csv_rows"}))
256
+ except Exception as e:
257
+ logger.warning(f"CSV row chunking failed: {e}")
258
 
259
  return docs
260
 
 
263
  xl = pd.ExcelFile(io.BytesIO(data))
264
  docs = []
265
  for sheet in xl.sheet_names:
266
+ try:
267
+ df = xl.parse(sheet)
268
+ text = (
269
+ f"Sheet: {sheet} | {df.shape[0]} rows × {df.shape[1]} cols\n"
270
+ f"Columns: {', '.join(str(c) for c in df.columns)}\n\n"
271
+ f"{df.head(10).to_string(index=False)}"
272
+ )
273
+ docs.append(Document(page_content=text, metadata={
274
+ "source": filename, "type": "excel", "sheet": sheet
275
+ }))
276
+ except Exception as e:
277
+ logger.warning(f"Excel sheet '{sheet}' failed: {e}")
278
  return docs
279
 
280
+ # ── IMAGE LOADING — FIXED ────────────────────────────────────────────────
281
+
282
  def _load_image(self, data: bytes, filename: str) -> List[Document]:
283
+ """
284
+ FIX: Build a much richer document from the image.
285
+ 1. Get a short caption from BLIP (raw bytes, not JSON+base64).
286
+ 2. Get a detailed description from a VLM (e.g. Llama-3.2-Vision).
287
+ 3. Combine both into a multi-paragraph document so RAG has enough
288
+ content to answer diverse questions about the image.
289
+ """
290
+ short_caption = self._caption_image_blip(data, filename)
291
+ detailed_caption = self._describe_image_with_vlm(data, filename, short_caption)
292
+
293
+ # Build a rich text document from the image analysis
294
+ sections = [
295
+ f"Image file: {filename}",
296
+ "",
297
+ f"=== Short Caption ===",
298
+ short_caption,
299
+ "",
300
+ f"=== Detailed Description ===",
301
+ detailed_caption,
302
+ "",
303
+ f"=== Summary ===",
304
+ f"This image ({filename}) shows: {short_caption}. "
305
+ f"{detailed_caption}",
306
+ ]
307
+ text = "\n".join(sections)
308
+
309
  return [Document(
310
  page_content=text,
311
+ metadata={
312
+ "source": filename,
313
+ "type": "image",
314
+ "caption": short_caption,
315
+ "detailed": detailed_caption[:500],
316
+ }
317
  )]
318
 
319
+ def _caption_image_blip(self, data: bytes, filename: str) -> str:
320
+ """
321
+ FIX: Send raw image bytes to the BLIP API, NOT JSON with base64.
322
+ The HuggingFace Inference API for image-to-text models expects the
323
+ raw binary image data as the request body.
324
+ """
325
  hf_token = os.environ.get("HF_TOKEN", "")
326
  if not hf_token:
327
  return f"[Image: {filename}] — Add HF_TOKEN secret to enable AI image captioning."
328
+
329
+ # List of captioning models to try (in order)
330
+ caption_models = [
331
+ "Salesforce/blip-image-captioning-large",
332
+ "Salesforce/blip-image-captioning-base",
333
+ "nlpconnect/vit-gpt2-image-captioning",
334
+ ]
335
+
336
+ for model_id in caption_models:
337
+ try:
338
+ logger.info(f"Trying BLIP caption with {model_id}...")
339
+ resp = requests.post(
340
+ f"https://api-inference.huggingface.co/models/{model_id}",
341
+ headers={"Authorization": f"Bearer {hf_token}"},
342
+ data=data, # ← FIX: raw bytes, NOT json={...}
343
+ timeout=30,
344
+ )
345
+ if resp.status_code == 200:
346
+ result = resp.json()
347
+ if isinstance(result, list) and result:
348
+ caption = result[0].get("generated_text", "")
349
+ if caption:
350
+ logger.info(f"BLIP caption ({model_id}): {caption[:80]}")
351
+ return caption
352
+ elif resp.status_code == 503:
353
+ # Model is loading — wait and retry once
354
+ logger.info(f"{model_id} is loading, waiting 10s...")
355
+ time.sleep(10)
356
+ resp2 = requests.post(
357
+ f"https://api-inference.huggingface.co/models/{model_id}",
358
+ headers={"Authorization": f"Bearer {hf_token}"},
359
+ data=data,
360
+ timeout=45,
361
+ )
362
+ if resp2.status_code == 200:
363
+ result = resp2.json()
364
+ if isinstance(result, list) and result:
365
+ caption = result[0].get("generated_text", "")
366
+ if caption:
367
+ logger.info(f"BLIP caption (retry {model_id}): {caption[:80]}")
368
+ return caption
369
+ else:
370
+ logger.warning(f"BLIP {model_id} returned {resp.status_code}: {resp.text[:100]}")
371
+ except Exception as e:
372
+ logger.warning(f"BLIP caption failed ({model_id}): {e}")
373
+ continue
374
+
375
+ return f"An image named {filename} was uploaded."
376
+
377
+ def _describe_image_with_vlm(self, data: bytes, filename: str, short_caption: str) -> str:
378
+ """
379
+ Use a Vision-Language Model via the HF chat completions API to get
380
+ a detailed multi-sentence description of the image.
381
+ Falls back gracefully if no VLM is available.
382
+ """
383
+ hf_token = os.environ.get("HF_TOKEN", "")
384
+ if not hf_token:
385
+ return short_caption
386
+
387
+ # Encode image as base64 data URI for the chat completions API
388
+ # Detect MIME type from magic bytes
389
+ mime = "image/jpeg"
390
+ if data[:8] == b'\x89PNG\r\n\x1a\n':
391
+ mime = "image/png"
392
+ elif data[:4] == b'RIFF' and data[8:12] == b'WEBP':
393
+ mime = "image/webp"
394
+
395
+ b64_image = base64.b64encode(data).decode("utf-8")
396
+ image_url = f"data:{mime};base64,{b64_image}"
397
+
398
+ headers = {
399
+ "Authorization": f"Bearer {hf_token}",
400
+ "Content-Type": "application/json",
401
+ }
402
+
403
+ for model_id in VLM_CAPTION_MODELS:
404
+ try:
405
+ logger.info(f"Trying VLM description with {model_id}...")
406
+ payload = {
407
+ "model": model_id,
408
+ "messages": [
409
+ {
410
+ "role": "user",
411
+ "content": [
412
+ {
413
+ "type": "image_url",
414
+ "image_url": {"url": image_url},
415
+ },
416
+ {
417
+ "type": "text",
418
+ "text": (
419
+ "Describe this image in detail. Include: "
420
+ "1) What objects, people, or scenes are visible. "
421
+ "2) Colors, positions, and spatial relationships. "
422
+ "3) Any text or writing visible in the image. "
423
+ "4) The overall mood, setting, or context. "
424
+ "5) Any notable details. "
425
+ "Be thorough and specific — your description will be "
426
+ "used to answer questions about this image later."
427
+ ),
428
+ },
429
+ ],
430
+ }
431
+ ],
432
+ "max_tokens": 600,
433
+ "temperature": 0.2,
434
+ "stream": False,
435
+ }
436
+
437
+ resp = requests.post(
438
+ HF_API_URL,
439
+ headers=headers,
440
+ data=json.dumps(payload),
441
+ timeout=60,
442
+ )
443
+
444
+ if resp.status_code == 200:
445
+ raw = resp.json()["choices"][0]["message"]["content"].strip()
446
+ description = _strip_thinking(raw)
447
+ if description and len(description) > 20:
448
+ logger.info(f"VLM description ({model_id}): {description[:100]}...")
449
+ return description
450
+ else:
451
+ logger.warning(f"VLM {model_id} returned {resp.status_code}: {resp.text[:150]}")
452
+ except Exception as e:
453
+ logger.warning(f"VLM description failed ({model_id}): {e}")
454
+ continue
455
+
456
+ # Fallback: use a text-only LLM to expand the BLIP caption
457
+ return self._expand_caption_with_llm(short_caption, filename)
458
+
459
+ def _expand_caption_with_llm(self, caption: str, filename: str) -> str:
460
+ """
461
+ If the VLM is unavailable, use a text-only LLM to expand the short
462
+ BLIP caption into a more detailed description that's useful for RAG.
463
+ """
464
+ hf_token = os.environ.get("HF_TOKEN", "")
465
+ if not hf_token or caption.startswith("[Image:"):
466
+ return caption
467
+
468
+ headers = {
469
+ "Authorization": f"Bearer {hf_token}",
470
+ "Content-Type": "application/json",
471
+ }
472
+
473
+ messages = [
474
+ {
475
+ "role": "system",
476
+ "content": (
477
+ "You are an image description assistant. Given a short AI-generated "
478
+ "caption of an image, expand it into a detailed paragraph describing "
479
+ "what the image likely contains. Include probable objects, colors, "
480
+ "spatial layout, and context. Be descriptive but stay grounded in "
481
+ "what the caption implies. Do not hallucinate specific details that "
482
+ "cannot be inferred from the caption."
483
+ ),
484
+ },
485
+ {
486
+ "role": "user",
487
+ "content": (
488
+ f"The image file is named '{filename}'. "
489
+ f"The AI caption is: \"{caption}\"\n\n"
490
+ f"Please provide a detailed expanded description of what this "
491
+ f"image likely shows."
492
+ ),
493
+ },
494
+ ]
495
+
496
+ for model_id in CANDIDATE_MODELS:
497
+ try:
498
+ resp = requests.post(
499
+ HF_API_URL,
500
+ headers=headers,
501
+ data=json.dumps({
502
+ "model": model_id,
503
+ "messages": messages,
504
+ "max_tokens": 400,
505
+ "temperature": 0.3,
506
+ "stream": False,
507
+ }),
508
+ timeout=45,
509
+ )
510
+ if resp.status_code == 200:
511
+ raw = resp.json()["choices"][0]["message"]["content"].strip()
512
+ expanded = _strip_thinking(raw)
513
+ if expanded and len(expanded) > 30:
514
+ logger.info(f"Expanded caption ({model_id}): {expanded[:80]}...")
515
+ return expanded
516
+ except Exception as e:
517
+ logger.warning(f"Caption expansion failed ({model_id}): {e}")
518
+ continue
519
+
520
+ return caption
521
 
522
  # ── Indexing ─────────────────────────────────────────────────────────────
523
 
 
585
 
586
  doc_type_hint = ""
587
  if self._doc_type in {".jpg", ".jpeg", ".png", ".webp"}:
588
+ doc_type_hint = (
589
+ "The document is an IMAGE. The context contains an AI-generated "
590
+ "description and caption of the image. Answer questions about the "
591
+ "image based on this description. Be specific about visual details "
592
+ "mentioned in the description."
593
+ )
594
  elif self._doc_type in {".csv", ".xlsx", ".xls"}:
595
+ doc_type_hint = (
596
+ "The document is tabular data (spreadsheet/CSV). Refer to column "
597
+ "names and values precisely."
598
+ )
599
+ elif self._doc_type in {".docx", ".doc"}:
600
+ doc_type_hint = "The document is a Word document."
601
 
602
  system_prompt = (
603
  f"You are DocMind AI, an expert document analyst built by Ryan Farahani.\n"