Spaces:

Faraz618
/

VisualRAG

Runtime error

App Files Files Community

Faraz618 committed on May 27

Commit

cbd040c

verified ·

1 Parent(s): 6fd2b8d

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -106

app.py CHANGED Viewed

@@ -1,85 +1,31 @@
 """
 VisualRAG — Multi-Modal AI System
 ==================================
-Stack : YOLOv8n · CLIP ViT-B/32 · FAISS · Zephyr-7B · Gradio 4.44.1
 Deploy: HuggingFace Spaces (CPU Basic — free tier)
-ROOT CAUSE OF "500 / Exit code 1" ERROR
-  gradio_client/utils.py line 863: `if "const" in schema:`
-  gr.Image generates `additionalProperties: True` (a Python bool) in its JSON
-  schema. The `in` operator on a bool raises TypeError. Gradio's API
-  introspector hits this on EVERY incoming request (including HF health checks),
-  accumulates errors, and eventually demo.launch() raises ValueError → exit 1.
-FIX STRATEGY (two layers)
-  Layer 1 — monkey-patch:  patch both `get_type` and `_json_schema_to_python_type`
-            in gradio_client.utils BEFORE gradio is imported. Must be the very
-            first code in the file. Recursive calls inside the original functions
-            look up names via module globals at call-time, so replacing the
-            module-level names makes the patch fully recursive.
-  Layer 2 — show_api=False: disables the /api route entirely so the introspector
-            never runs at all. Belt-and-suspenders.
-"""
-# ═══════════════════════════════════════════════════════════════════════════════
-# LAYER 1 — MONKEY-PATCH (must be FIRST, before any other import)
-# ═══════════════════════════════════════════════════════════════════════════════
-import gradio_client.utils as _gc_utils  # import the module, not just a symbol
-_orig_get_type = _gc_utils.get_type
-_orig_j2p      = _gc_utils._json_schema_to_python_type
-def _safe_get_type(schema):
-    """Guard get_type() against non-dict schema (e.g. bool from additionalProperties)."""
-    if not isinstance(schema, dict):
-        return "Any"
-    return _orig_get_type(schema)
-def _safe_j2p(schema, defs=None):
-    """
-    Guard _json_schema_to_python_type() against non-dict schema, and normalise
-    boolean additionalProperties to {} before delegating to the original.
-    Why this works recursively:
-      _orig_j2p() calls `_json_schema_to_python_type(...)` by NAME, which Python
-      resolves via the module's __dict__ at call-time. We replace that name with
-      _safe_j2p, so every recursive call from within _orig_j2p also passes
-      through our guard.
-    """
-    if not isinstance(schema, dict):
-        return "Any"
-    # Normalise `additionalProperties: True/False` → `additionalProperties: {}`
-    if isinstance(schema.get("additionalProperties"), bool):
-        schema = {
-            k: ({} if k == "additionalProperties" else v)
-            for k, v in schema.items()
-        }
-    return _orig_j2p(schema, defs)
-# Replace module-level names so all internal references pick up the safe versions
-_gc_utils.get_type                      = _safe_get_type
-_gc_utils._json_schema_to_python_type   = _safe_j2p
-print("✅ gradio_client monkey-patch applied.")
-# ═══════════════════════════════════════════════════════════════════════════════
 import json
 import os
 from datetime import datetime
 import faiss
 import numpy as np
 import torch
-import gradio as gr
 from PIL import Image
 from transformers import CLIPModel, CLIPProcessor
 from ultralytics import YOLO
-from huggingface_hub import InferenceClient
-# ── Model loading ──────────────────────────────────────────────────────────────
 print("⏳ Loading CLIP ViT-B/32 ...")
 CLIP_MODEL_ID  = "openai/clip-vit-base-patch32"
 clip_model     = CLIPModel.from_pretrained(CLIP_MODEL_ID)
@@ -90,20 +36,23 @@ print("⏳ Loading YOLOv8n ...")
 yolo = YOLO("yolov8n.pt")           # auto-downloads ~6 MB on first run
 print("⏳ Initialising LLM client ...")
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 llm = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=HF_TOKEN)
 print("✅ All models ready.")
-# ── FAISS vector store (in-memory) ─────────────────────────────────────────────
-EMBED_DIM   = 512
-faiss_index = faiss.IndexFlatIP(EMBED_DIM)
-image_store = []   # list[dict] parallel to FAISS rows
 # ── Embedding helpers ──────────────────────────────────────────────────────────
 def embed_image(pil_img: Image.Image) -> np.ndarray:
     inputs = clip_processor(images=pil_img, return_tensors="pt")
     with torch.no_grad():
         features = clip_model.get_image_features(**inputs)
@@ -113,6 +62,7 @@ def embed_image(pil_img: Image.Image) -> np.ndarray:
 def embed_text(text: str) -> np.ndarray:
     inputs = clip_processor(text=[text], return_tensors="pt",
                             padding=True, truncation=True)
     with torch.no_grad():
@@ -125,6 +75,7 @@ def embed_text(text: str) -> np.ndarray:
 # ── Detection pipeline ─────────────────────────────────────────────────────────
 def run_detection(pil_img: Image.Image):
     results   = yolo(np.array(pil_img))[0]
     annotated = Image.fromarray(results.plot())
@@ -147,6 +98,7 @@ def run_detection(pil_img: Image.Image):
 # ── Index pipeline ─────────────────────────────────────────────────────────────
 def index_image(image_np, note: str):
     if image_np is None:
         return None, "❌ Please upload an image first.", _badge()
@@ -165,7 +117,8 @@ def index_image(image_np, note: str):
         "ts":         datetime.now().strftime("%H:%M:%S"),
     })
-    return annotated, f"✅ Image #{len(image_store)-1} indexed · Found: {summary}", _badge()
 def _badge() -> str:
@@ -175,8 +128,9 @@ def _badge() -> str:
 # ── RAG query pipeline ─────────────────────────────────────────────────────────
 def query_images(text_query: str, top_k: int):
     if not image_store:
-        return None, "❌ No images indexed yet — upload images first.", ""
     if not text_query.strip():
         return None, "❌ Please enter a question.", ""
@@ -206,7 +160,7 @@ def query_images(text_query: str, top_k: int):
     prompt  = (
         "<|system|>\n"
         "You are a concise visual-AI assistant. "
-        "Answer the user's question using only the retrieved image context. "
         "If context is insufficient, say so.\n"
         "<|user|>\n"
         f"Retrieved context:\n{context}\n\n"
@@ -216,12 +170,14 @@ def query_images(text_query: str, top_k: int):
     try:
         answer = llm.text_generation(
-            prompt, max_new_tokens=300, temperature=0.2,
             repetition_penalty=1.1,
             stop_sequences=["<|user|>", "<|system|>"],
         ).strip()
     except Exception as exc:
-        answer = f"⚠️ LLM unavailable ({exc}).\n\nRaw context:\n{context}"
     best_idx   = int(idxs[0][0]) if len(idxs[0]) > 0 and idxs[0][0] >= 0 else None
     best_image = image_store[best_idx]["annotated"] if best_idx is not None else None
@@ -251,49 +207,54 @@ with gr.Blocks(title="VisualRAG", theme=gr.themes.Soft(primary_hue="blue"), css=
         <span class="badge">CLIP ViT-B/32</span>
         <span class="badge">FAISS</span>
         <span class="badge">Zephyr-7B</span>
-        <span class="badge">Gradio 4.44.1</span>
       </div>
     </div>
     """)
     with gr.Tabs():
         with gr.Tab("📤 Detect & Index"):
             gr.Markdown(
                 "Upload any image. YOLOv8n detects objects, then CLIP ViT-B/32 "
-                "encodes it into a 512-d embedding stored in FAISS."
             )
             with gr.Row():
                 with gr.Column(scale=1):
-                    img_in     = gr.Image(label="Upload image", type="numpy",
-                                          sources=["upload", "webcam"])
-                    note_in    = gr.Textbox(label="Context note (optional)",
-                                            placeholder="e.g. 'Warehouse camera, aisle 3'")
-                    index_btn  = gr.Button("🔍 Detect & Index", variant="primary", size="lg")
                 with gr.Column(scale=1):
                     det_out    = gr.Image(label="Detection result")
                     status_out = gr.Textbox(label="Status", interactive=False)
                     badge_out  = gr.Textbox(label="Vector store", interactive=False,
                                             value=_badge())
-            index_btn.click(fn=index_image, inputs=[img_in, note_in],
-                            outputs=[det_out, status_out, badge_out])
         with gr.Tab("💬 Query (RAG)"):
             gr.Markdown(
-                "Ask any question about indexed images. CLIP embeds the query, "
-                "FAISS retrieves similar images by cosine similarity, "
-                "Zephyr-7B generates a grounded answer."
             )
             with gr.Row():
                 with gr.Column(scale=1):
-                    query_in  = gr.Textbox(label="Your question",
-                                           placeholder="e.g. 'How many people are visible?'",
-                                           lines=3)
                     topk_sl   = gr.Slider(minimum=1, maximum=5, value=3, step=1,
                                           label="Top-K images to retrieve")
-                    query_btn = gr.Button("🔎 Search & Generate Answer",
-                                          variant="primary", size="lg")
                 with gr.Column(scale=1):
                     match_img = gr.Image(label="Best matching image")
                     llm_out   = gr.Textbox(label="AI Answer (RAG-grounded)",
@@ -301,35 +262,51 @@ with gr.Blocks(title="VisualRAG", theme=gr.themes.Soft(primary_hue="blue"), css=
             hits_out = gr.Textbox(label="Retrieval scores", interactive=False, lines=8)
-            query_btn.click(fn=query_images, inputs=[query_in, topk_sl],
-                            outputs=[match_img, llm_out, hits_out])
         with gr.Tab("🏗️ How it works"):
             gr.Markdown("""
 ## System overview
 ### Index pipeline
 ```
-Image → YOLOv8n detection → CLIP ViT-B/32 encoder → 512-d embedding
-      → L2 normalisation → FAISS IndexFlatIP (cosine similarity store)
 ```
 ### Query / RAG pipeline
 ```
-Text → CLIP text encoder → 512-d embedding → L2 norm
-     → FAISS k-NN search → Top-K retrieved context
-     → Zephyr-7B (HF Serverless API) → Natural language answer
 ```
-## Bug fixed in this version
-| Symptom | `500 — Exit code 1` on every Space start |
 |---|---|
-| Root cause | `gradio_client/utils.py`: `"const" in schema` where `schema` is `True` (bool) |
-| Trigger | `gr.Image` emits `additionalProperties: True` in its JSON Schema; the API introspector crashes on it |
-| Fix 1 | Monkey-patch `get_type` + `_json_schema_to_python_type` before importing gradio |
-| Fix 2 | `show_api=False` disables the `/api` route so introspection never runs at runtime |
-| Fix 3 | `opencv-python-headless` added to requirements (YOLO needs cv2) |
             """)
     gr.HTML("""
@@ -338,5 +315,4 @@ Text → CLIP text encoder → 512-d embedding → L2 norm
     </div>
     """)
-# LAYER 2 — show_api=False disables the /api route that triggers schema introspection
-demo.launch(server_name="0.0.0.0", show_api=False)

 """
 VisualRAG — Multi-Modal AI System
 ==================================
+Stack : YOLOv8n · CLIP ViT-B/32 · FAISS · Zephyr-7B · Gradio 4.40.0
 Deploy: HuggingFace Spaces (CPU Basic — free tier)
+Pipeline
+  Index : Image → YOLOv8 detection → CLIP embedding → FAISS vector store
+  Query : Text  → CLIP text embedding → cosine k-NN → LLM answer generation
+No monkey-patching needed with gradio 4.40.0 — the schema introspector
+bug and starlette TemplateResponse API mismatch only affect 4.44.x.
+"""
 import json
 import os
 from datetime import datetime
 import faiss
+import gradio as gr
 import numpy as np
 import torch
+from huggingface_hub import InferenceClient
 from PIL import Image
 from transformers import CLIPModel, CLIPProcessor
 from ultralytics import YOLO
+# ── Model loading (runs once at Space start-up) ────────────────────────────────
 print("⏳ Loading CLIP ViT-B/32 ...")
 CLIP_MODEL_ID  = "openai/clip-vit-base-patch32"
 clip_model     = CLIPModel.from_pretrained(CLIP_MODEL_ID)
 yolo = YOLO("yolov8n.pt")           # auto-downloads ~6 MB on first run
 print("⏳ Initialising LLM client ...")
+# Free HF Serverless Inference — LLM runs on HF servers, not in the Space.
+# Add HF_TOKEN as a Space Secret for higher rate limits.
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 llm = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=HF_TOKEN)
 print("✅ All models ready.")
+# ── FAISS vector store (in-memory, session-scoped) ─────────────────────────────
+EMBED_DIM   = 512                          # CLIP ViT-B/32 output dimension
+faiss_index = faiss.IndexFlatIP(EMBED_DIM) # cosine similarity via L2-normalised dot product
+image_store = []                           # parallel list: one dict per indexed image
 # ── Embedding helpers ──────────────────────────────────────────────────────────
 def embed_image(pil_img: Image.Image) -> np.ndarray:
+    """Return L2-normalised 512-d CLIP image embedding (shape 1×512)."""
     inputs = clip_processor(images=pil_img, return_tensors="pt")
     with torch.no_grad():
         features = clip_model.get_image_features(**inputs)
 def embed_text(text: str) -> np.ndarray:
+    """Return L2-normalised 512-d CLIP text embedding (shape 1×512)."""
     inputs = clip_processor(text=[text], return_tensors="pt",
                             padding=True, truncation=True)
     with torch.no_grad():
 # ── Detection pipeline ─────────────────────────────────────────────────────────
 def run_detection(pil_img: Image.Image):
+    """Run YOLOv8n → return (annotated PIL, detections list, summary string)."""
     results   = yolo(np.array(pil_img))[0]
     annotated = Image.fromarray(results.plot())
 # ── Index pipeline ─────────────────────────────────────────────────────────────
 def index_image(image_np, note: str):
+    """Detect → embed → store in FAISS. Called by the Index button."""
     if image_np is None:
         return None, "❌ Please upload an image first.", _badge()
         "ts":         datetime.now().strftime("%H:%M:%S"),
     })
+    msg = f"✅ Image #{len(image_store) - 1} indexed · Found: {summary}"
+    return annotated, msg, _badge()
 def _badge() -> str:
 # ── RAG query pipeline ─────────────────────────────────────────────────────────
 def query_images(text_query: str, top_k: int):
+    """Text → CLIP embed → FAISS k-NN → RAG prompt → Zephyr-7B answer."""
     if not image_store:
+        return None, "❌ No images indexed yet — upload images in the 'Detect & Index' tab first.", ""
     if not text_query.strip():
         return None, "❌ Please enter a question.", ""
     prompt  = (
         "<|system|>\n"
         "You are a concise visual-AI assistant. "
+        "Answer using only the retrieved image context below. "
         "If context is insufficient, say so.\n"
         "<|user|>\n"
         f"Retrieved context:\n{context}\n\n"
     try:
         answer = llm.text_generation(
+            prompt,
+            max_new_tokens=300,
+            temperature=0.2,
             repetition_penalty=1.1,
             stop_sequences=["<|user|>", "<|system|>"],
         ).strip()
     except Exception as exc:
+        answer = f"⚠️ LLM unavailable ({exc}).\n\nRaw retrieval context:\n{context}"
     best_idx   = int(idxs[0][0]) if len(idxs[0]) > 0 and idxs[0][0] >= 0 else None
     best_image = image_store[best_idx]["annotated"] if best_idx is not None else None
         <span class="badge">CLIP ViT-B/32</span>
         <span class="badge">FAISS</span>
         <span class="badge">Zephyr-7B</span>
+        <span class="badge">Gradio 4.40.0</span>
       </div>
     </div>
     """)
     with gr.Tabs():
+        # ── TAB 1: Detect & Index ──────────────────────────────────────────────
         with gr.Tab("📤 Detect & Index"):
             gr.Markdown(
                 "Upload any image. YOLOv8n detects objects, then CLIP ViT-B/32 "
+                "encodes it into a 512-d embedding stored in FAISS for later retrieval."
             )
             with gr.Row():
                 with gr.Column(scale=1):
+                    img_in    = gr.Image(label="Upload image", type="numpy")
+                    note_in   = gr.Textbox(label="Context note (optional)",
+                                           placeholder="e.g. 'Warehouse camera, aisle 3'")
+                    index_btn = gr.Button("🔍 Detect & Index", variant="primary")
                 with gr.Column(scale=1):
                     det_out    = gr.Image(label="Detection result")
                     status_out = gr.Textbox(label="Status", interactive=False)
                     badge_out  = gr.Textbox(label="Vector store", interactive=False,
                                             value=_badge())
+            index_btn.click(
+                fn=index_image,
+                inputs=[img_in, note_in],
+                outputs=[det_out, status_out, badge_out],
+            )
+        # ── TAB 2: Query (RAG) ─────────────────────────────────────────────────
         with gr.Tab("💬 Query (RAG)"):
             gr.Markdown(
+                "Ask any question about your indexed images. CLIP embeds the query, "
+                "FAISS retrieves the most similar images by cosine similarity, "
+                "and Zephyr-7B generates a grounded answer."
             )
             with gr.Row():
                 with gr.Column(scale=1):
+                    query_in  = gr.Textbox(
+                        label="Your question",
+                        placeholder="e.g. 'How many people are visible?' or 'Are there any vehicles?'",
+                        lines=3,
+                    )
                     topk_sl   = gr.Slider(minimum=1, maximum=5, value=3, step=1,
                                           label="Top-K images to retrieve")
+                    query_btn = gr.Button("🔎 Search & Generate Answer", variant="primary")
                 with gr.Column(scale=1):
                     match_img = gr.Image(label="Best matching image")
                     llm_out   = gr.Textbox(label="AI Answer (RAG-grounded)",
             hits_out = gr.Textbox(label="Retrieval scores", interactive=False, lines=8)
+            query_btn.click(
+                fn=query_images,
+                inputs=[query_in, topk_sl],
+                outputs=[match_img, llm_out, hits_out],
+            )
+        # ── TAB 3: How it works ────────────────────────────────────────────────
         with gr.Tab("🏗️ How it works"):
             gr.Markdown("""
 ## System overview
 ### Index pipeline
 ```
+Image → YOLOv8n detection (objects + confidence scores)
+      → CLIP ViT-B/32 image encoder  →  512-d embedding
+      → L2 normalisation
+      → FAISS IndexFlatIP  (cosine similarity store)
 ```
 ### Query / RAG pipeline
 ```
+Text query → CLIP text encoder  →  512-d query embedding
+           → L2 normalisation
+           → FAISS k-NN search (cosine similarity, top-K)
+           → RAG prompt = query + retrieved context
+           → Zephyr-7B-β  (HF Serverless Inference API)
+           → Natural language answer
 ```
+## Stack
+| Component | Technology |
 |---|---|
+| Object detection | YOLOv8n (Ultralytics) |
+| Visual embedding | CLIP ViT-B/32 (OpenAI via HF) |
+| Vector index | FAISS IndexFlatIP (cosine sim) |
+| LLM | Zephyr-7B-β (HF Serverless API) |
+| UI | Gradio 4.40.0 |
+## Why gradio 4.40.0
+Version 4.44.1 has three cascading runtime bugs on HF Spaces: a schema
+introspector TypeError, a non-existent gradio_client pin, and a starlette
+TemplateResponse API mismatch that causes a Jinja2 `unhashable type: dict`
+crash. Version 4.40.0 is widely deployed and has none of these issues.
             """)
     gr.HTML("""
     </div>
     """)
+demo.launch(server_name="0.0.0.0")