Update rag_engine.py

rag_engine.py  CHANGED  (+300 -40)
@@ -2,6 +2,19 @@
 rag_engine.py — Multimodal RAG Engine with Conversation Memory
 Supports: PDF, TXT, DOCX, CSV, XLSX, Images (JPG/PNG/WEBP)
 Memory: sliding window of last 6 exchanges
+
+FIXES applied (vs original):
+1. _caption_image: send raw bytes to BLIP API, not JSON-encoded base64.
+   The HF Inference API for image-to-text expects raw image bytes.
+2. Added _describe_image_with_vlm: uses a vision-language model via the
+   HF chat completions API to generate a detailed, multi-sentence
+   description — much richer than BLIP's one-line captions.
+3. _load_image: builds a richer document from both short caption + detailed
+   VLM description, giving RAG far more content to index and retrieve.
+4. _load_docx: broadened exception handling so a corrupt .docx doesn't
+   crash the ingestion; falls back to raw-text extraction.
+5. _load_csv / _load_excel: added try/except per section so partial
+   failures don't block the rest of the ingestion.
 """
 
 import os
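
Fix #1 is the crux of this commit. For concreteness, here is a minimal standalone sketch of the two request shapes (not part of the diff; photo.jpg is a placeholder file name and a valid HF_TOKEN secret is assumed):

import os
import requests

token = os.environ["HF_TOKEN"]
url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
headers = {"Authorization": f"Bearer {token}"}

with open("photo.jpg", "rb") as f:       # placeholder image file
    image_bytes = f.read()

# Broken shape (what the fix removes): a JSON body carrying base64 text is
# not what the image-to-text task expects, so captioning fails.
# requests.post(url, headers=headers, json={"inputs": encoded_base64_string})

# Fixed shape: the raw image bytes ARE the request body.
resp = requests.post(url, headers=headers, data=image_bytes, timeout=30)
print(resp.json())                       # e.g. [{"generated_text": "a dog on grass"}]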

@@ -9,6 +22,7 @@ import re
 import io
 import json
 import time
+import base64
 import tempfile
 import requests
 import logging

@@ -51,6 +65,11 @@ CANDIDATE_MODELS = [
     "HuggingFaceTB/SmolLM3-3B:hf-inference",
 ]
 
+# Vision-language models for detailed image descriptions
+VLM_CAPTION_MODELS = [
+    "meta-llama/Llama-3.2-11B-Vision-Instruct",
+]
+
 
 def get_suffix(name: str) -> str:
     return Path(name).suffix.lower() or ".txt"

@@ -182,6 +201,11 @@ class RAGEngine:
         )]
 
     def _load_docx(self, data: bytes, filename: str) -> List[Document]:
+        """
+        FIX: Catch *all* exceptions from docx2txt, not just ImportError.
+        A corrupt or password-protected .docx would otherwise crash ingestion.
+        """
+        text = ""
         try:
             import docx2txt
             with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:

@@ -192,7 +216,15 @@ class RAGEngine:
             finally:
                 os.unlink(tmp_path)
         except ImportError:
+            logger.warning("docx2txt not installed — falling back to raw text extraction")
+            text = data.decode("utf-8", errors="replace")
+        except Exception as e:
+            logger.warning(f"docx2txt failed ({e}) — falling back to raw text extraction")
             text = data.decode("utf-8", errors="replace")
+
+        if not text or not text.strip():
+            text = f"[Document: {filename}] — Could not extract text content."
+
         return [Document(page_content=text, metadata={"source": filename, "type": "docx"})]
 
     def _load_csv(self, data: bytes, filename: str) -> List[Document]:
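
Fix #4's reasoning in isolation: catching ImportError only covers a missing library, while a corrupt or password-protected file raises from inside docx2txt itself. A toy sketch (the ValueError is a stand-in for a real parser failure, not docx2txt's actual exception type):

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

def load(data: bytes) -> str:
    text = ""
    try:
        import docx2txt                    # ImportError if not installed
        raise ValueError("corrupt .docx")  # stand-in for a parser failure
    except ImportError:
        logger.warning("docx2txt not installed")
        text = data.decode("utf-8", errors="replace")
    except Exception as e:                 # the broadened handler
        logger.warning(f"docx2txt failed ({e})")
        text = data.decode("utf-8", errors="replace")
    return text or "[Document] Could not extract text content."

print(load(b"plain bytes"))                # falls back instead of crashing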

@@ -200,6 +232,7 @@
         df = pd.read_csv(io.BytesIO(data))
         docs = []
 
+        # Summary
         summary = (
             f"File: {filename}\n"
             f"Shape: {df.shape[0]} rows × {df.shape[1]} columns\n"

@@ -208,15 +241,20 @@
         )
         docs.append(Document(page_content=summary, metadata={"source": filename, "type": "csv_summary"}))
 
+        # Statistics (wrapped in try/except so partial failure doesn't block)
         try:
             stats = "Statistical summary:\n" + df.describe(include="all").to_string()
             docs.append(Document(page_content=stats, metadata={"source": filename, "type": "csv_stats"}))
-        except Exception:
+        except Exception as e:
+            logger.warning(f"CSV stats failed: {e}")
 
+        # Row chunks
+        try:
+            for i in range(0, min(len(df), 500), 50):
+                chunk = f"Rows {i}–{i+50}:\n{df.iloc[i:i+50].to_string(index=False)}"
+                docs.append(Document(page_content=chunk, metadata={"source": filename, "type": "csv_rows"}))
+        except Exception as e:
+            logger.warning(f"CSV row chunking failed: {e}")
 
         return docs
 
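
The new row-chunking loop indexes at most the first 500 rows of a CSV, 50 per chunk, so a file contributes at most ten csv_rows documents on top of the summary and stats. Checking the arithmetic on a toy frame:

import pandas as pd

df = pd.DataFrame({"x": range(1234)})            # toy frame standing in for a CSV

starts = list(range(0, min(len(df), 500), 50))   # same bounds as the loop above
print(starts)                                    # [0, 50, 100, ..., 450]
print(len(starts))                               # 10 chunk documents at most
print(len(df.iloc[450:500]))                     # 50 rows in the final chunk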

@@ -225,49 +263,261 @@
         xl = pd.ExcelFile(io.BytesIO(data))
         docs = []
         for sheet in xl.sheet_names:
+            try:
+                df = xl.parse(sheet)
+                text = (
+                    f"Sheet: {sheet} | {df.shape[0]} rows × {df.shape[1]} cols\n"
+                    f"Columns: {', '.join(str(c) for c in df.columns)}\n\n"
+                    f"{df.head(10).to_string(index=False)}"
+                )
+                docs.append(Document(page_content=text, metadata={
+                    "source": filename, "type": "excel", "sheet": sheet
+                }))
+            except Exception as e:
+                logger.warning(f"Excel sheet '{sheet}' failed: {e}")
         return docs
 
+    # ── IMAGE LOADING — FIXED ────────────────────────────────────────────────
+
     def _load_image(self, data: bytes, filename: str) -> List[Document]:
+        """
+        FIX: Build a much richer document from the image.
+        1. Get a short caption from BLIP (raw bytes, not JSON+base64).
+        2. Get a detailed description from a VLM (e.g. Llama-3.2-Vision).
+        3. Combine both into a multi-paragraph document so RAG has enough
+           content to answer diverse questions about the image.
+        """
+        short_caption = self._caption_image_blip(data, filename)
+        detailed_caption = self._describe_image_with_vlm(data, filename, short_caption)
+
+        # Build a rich text document from the image analysis
+        sections = [
+            f"Image file: {filename}",
+            "",
+            f"=== Short Caption ===",
+            short_caption,
+            "",
+            f"=== Detailed Description ===",
+            detailed_caption,
+            "",
+            f"=== Summary ===",
+            f"This image ({filename}) shows: {short_caption}. "
+            f"{detailed_caption}",
+        ]
+        text = "\n".join(sections)
+
         return [Document(
             page_content=text,
+            metadata={
+                "source": filename,
+                "type": "image",
+                "caption": short_caption,
+                "detailed": detailed_caption[:500],
+            }
         )]
 
+    def _caption_image_blip(self, data: bytes, filename: str) -> str:
+        """
+        FIX: Send raw image bytes to the BLIP API, NOT JSON with base64.
+        The HuggingFace Inference API for image-to-text models expects the
+        raw binary image data as the request body.
+        """
         hf_token = os.environ.get("HF_TOKEN", "")
         if not hf_token:
             return f"[Image: {filename}] — Add HF_TOKEN secret to enable AI image captioning."
+
+        # List of captioning models to try (in order)
+        caption_models = [
+            "Salesforce/blip-image-captioning-large",
+            "Salesforce/blip-image-captioning-base",
+            "nlpconnect/vit-gpt2-image-captioning",
+        ]
+
+        for model_id in caption_models:
+            try:
+                logger.info(f"Trying BLIP caption with {model_id}...")
+                resp = requests.post(
+                    f"https://api-inference.huggingface.co/models/{model_id}",
+                    headers={"Authorization": f"Bearer {hf_token}"},
+                    data=data,  # ← FIX: raw bytes, NOT json={...}
+                    timeout=30,
+                )
+                if resp.status_code == 200:
+                    result = resp.json()
+                    if isinstance(result, list) and result:
+                        caption = result[0].get("generated_text", "")
+                        if caption:
+                            logger.info(f"BLIP caption ({model_id}): {caption[:80]}")
+                            return caption
+                elif resp.status_code == 503:
+                    # Model is loading — wait and retry once
+                    logger.info(f"{model_id} is loading, waiting 10s...")
+                    time.sleep(10)
+                    resp2 = requests.post(
+                        f"https://api-inference.huggingface.co/models/{model_id}",
+                        headers={"Authorization": f"Bearer {hf_token}"},
+                        data=data,
+                        timeout=45,
+                    )
+                    if resp2.status_code == 200:
+                        result = resp2.json()
+                        if isinstance(result, list) and result:
+                            caption = result[0].get("generated_text", "")
+                            if caption:
+                                logger.info(f"BLIP caption (retry {model_id}): {caption[:80]}")
+                                return caption
+                else:
+                    logger.warning(f"BLIP {model_id} returned {resp.status_code}: {resp.text[:100]}")
+            except Exception as e:
+                logger.warning(f"BLIP caption failed ({model_id}): {e}")
+                continue
+
+        return f"An image named {filename} was uploaded."
+
+    def _describe_image_with_vlm(self, data: bytes, filename: str, short_caption: str) -> str:
+        """
+        Use a Vision-Language Model via the HF chat completions API to get
+        a detailed multi-sentence description of the image.
+        Falls back gracefully if no VLM is available.
+        """
+        hf_token = os.environ.get("HF_TOKEN", "")
+        if not hf_token:
+            return short_caption
+
+        # Encode image as base64 data URI for the chat completions API
+        # Detect MIME type from magic bytes
+        mime = "image/jpeg"
+        if data[:8] == b'\x89PNG\r\n\x1a\n':
+            mime = "image/png"
+        elif data[:4] == b'RIFF' and data[8:12] == b'WEBP':
+            mime = "image/webp"
+
+        b64_image = base64.b64encode(data).decode("utf-8")
+        image_url = f"data:{mime};base64,{b64_image}"
+
+        headers = {
+            "Authorization": f"Bearer {hf_token}",
+            "Content-Type": "application/json",
+        }
+
+        for model_id in VLM_CAPTION_MODELS:
+            try:
+                logger.info(f"Trying VLM description with {model_id}...")
+                payload = {
+                    "model": model_id,
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": image_url},
+                                },
+                                {
+                                    "type": "text",
+                                    "text": (
+                                        "Describe this image in detail. Include: "
+                                        "1) What objects, people, or scenes are visible. "
+                                        "2) Colors, positions, and spatial relationships. "
+                                        "3) Any text or writing visible in the image. "
+                                        "4) The overall mood, setting, or context. "
+                                        "5) Any notable details. "
+                                        "Be thorough and specific — your description will be "
+                                        "used to answer questions about this image later."
+                                    ),
+                                },
+                            ],
+                        }
+                    ],
+                    "max_tokens": 600,
+                    "temperature": 0.2,
+                    "stream": False,
+                }
+
+                resp = requests.post(
+                    HF_API_URL,
+                    headers=headers,
+                    data=json.dumps(payload),
+                    timeout=60,
+                )
+
+                if resp.status_code == 200:
+                    raw = resp.json()["choices"][0]["message"]["content"].strip()
+                    description = _strip_thinking(raw)
+                    if description and len(description) > 20:
+                        logger.info(f"VLM description ({model_id}): {description[:100]}...")
+                        return description
+                else:
+                    logger.warning(f"VLM {model_id} returned {resp.status_code}: {resp.text[:150]}")
+            except Exception as e:
+                logger.warning(f"VLM description failed ({model_id}): {e}")
+                continue
+
+        # Fallback: use a text-only LLM to expand the BLIP caption
+        return self._expand_caption_with_llm(short_caption, filename)
+
+    def _expand_caption_with_llm(self, caption: str, filename: str) -> str:
+        """
+        If the VLM is unavailable, use a text-only LLM to expand the short
+        BLIP caption into a more detailed description that's useful for RAG.
+        """
+        hf_token = os.environ.get("HF_TOKEN", "")
+        if not hf_token or caption.startswith("[Image:"):
+            return caption
+
+        headers = {
+            "Authorization": f"Bearer {hf_token}",
+            "Content-Type": "application/json",
+        }
+
+        messages = [
+            {
+                "role": "system",
+                "content": (
+                    "You are an image description assistant. Given a short AI-generated "
+                    "caption of an image, expand it into a detailed paragraph describing "
+                    "what the image likely contains. Include probable objects, colors, "
+                    "spatial layout, and context. Be descriptive but stay grounded in "
+                    "what the caption implies. Do not hallucinate specific details that "
+                    "cannot be inferred from the caption."
+                ),
+            },
+            {
+                "role": "user",
+                "content": (
+                    f"The image file is named '{filename}'. "
+                    f"The AI caption is: \"{caption}\"\n\n"
+                    f"Please provide a detailed expanded description of what this "
+                    f"image likely shows."
+                ),
+            },
+        ]
+
+        for model_id in CANDIDATE_MODELS:
+            try:
+                resp = requests.post(
+                    HF_API_URL,
+                    headers=headers,
+                    data=json.dumps({
+                        "model": model_id,
+                        "messages": messages,
+                        "max_tokens": 400,
+                        "temperature": 0.3,
+                        "stream": False,
+                    }),
+                    timeout=45,
+                )
+                if resp.status_code == 200:
+                    raw = resp.json()["choices"][0]["message"]["content"].strip()
+                    expanded = _strip_thinking(raw)
+                    if expanded and len(expanded) > 30:
+                        logger.info(f"Expanded caption ({model_id}): {expanded[:80]}...")
+                        return expanded
+            except Exception as e:
+                logger.warning(f"Caption expansion failed ({model_id}): {e}")
+                continue
+
+        return caption
 
     # ── Indexing ─────────────────────────────────────────────────────────────
 
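
The MIME sniffing in _describe_image_with_vlm relies on real file signatures: PNG files always begin with the eight bytes \x89PNG\r\n\x1a\n, and WEBP is a RIFF container with "WEBP" at offset 8. A self-contained check of the same logic and of the resulting data URI:

import base64

def sniff_mime(data: bytes) -> str:
    if data[:8] == b"\x89PNG\r\n\x1a\n":               # PNG signature
        return "image/png"
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":  # WEBP in a RIFF container
        return "image/webp"
    return "image/jpeg"                                # default, as in the commit

fake_png = b"\x89PNG\r\n\x1a\n" + b"\x00" * 16         # header only, enough to sniff
assert sniff_mime(fake_png) == "image/png"

data_uri = f"data:{sniff_mime(fake_png)};base64,{base64.b64encode(fake_png).decode()}"
print(data_uri[:40])                                   # data:image/png;base64,iVBORw0KGgo...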

@@ -335,9 +585,19 @@
 
         doc_type_hint = ""
         if self._doc_type in {".jpg", ".jpeg", ".png", ".webp"}:
-            doc_type_hint =
+            doc_type_hint = (
+                "The document is an IMAGE. The context contains an AI-generated "
+                "description and caption of the image. Answer questions about the "
+                "image based on this description. Be specific about visual details "
+                "mentioned in the description."
+            )
         elif self._doc_type in {".csv", ".xlsx", ".xls"}:
-            doc_type_hint =
+            doc_type_hint = (
+                "The document is tabular data (spreadsheet/CSV). Refer to column "
+                "names and values precisely."
+            )
+        elif self._doc_type in {".docx", ".doc"}:
+            doc_type_hint = "The document is a Word document."
 
         system_prompt = (
             f"You are DocMind AI, an expert document analyst built by Ryan Farahani.\n"
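
Routing into these branches presumably keys off the upload's file suffix via the get_suffix helper shown earlier in the diff; the hunk does not show where self._doc_type is assigned, so that wiring is an assumption. A quick trace of the helper itself:

from pathlib import Path

def get_suffix(name: str) -> str:   # same helper as in the diff above
    return Path(name).suffix.lower() or ".txt"

for name in ["Photo.PNG", "report.xlsx", "notes.docx", "README"]:
    print(f"{name} -> {get_suffix(name)}")
# Photo.PNG -> .png      (image hint branch)
# report.xlsx -> .xlsx   (tabular-data hint branch)
# notes.docx -> .docx    (Word-document hint branch)
# README -> .txt         (no hint)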