Faraz618 committed on
Commit
cbd040c
·
verified ·
1 Parent(s): 6fd2b8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -106
app.py CHANGED
@@ -1,85 +1,31 @@
1
  """
2
  VisualRAG — Multi-Modal AI System
3
  ==================================
4
- Stack : YOLOv8n · CLIP ViT-B/32 · FAISS · Zephyr-7B · Gradio 4.44.1
5
  Deploy: HuggingFace Spaces (CPU Basic — free tier)
6
 
7
- ROOT CAUSE OF "500 / Exit code 1" ERROR
8
- gradio_client/utils.py line 863: `if "const" in schema:`
9
- gr.Image generates `additionalProperties: True` (a Python bool) in its JSON
10
- schema. The `in` operator on a bool raises TypeError. Gradio's API
11
- introspector hits this on EVERY incoming request (including HF health checks),
12
- accumulates errors, and eventually demo.launch() raises ValueError → exit 1.
13
-
14
- FIX STRATEGY (two layers)
15
- Layer 1 — monkey-patch: patch both `get_type` and `_json_schema_to_python_type`
16
- in gradio_client.utils BEFORE gradio is imported. Must be the very
17
- first code in the file. Recursive calls inside the original functions
18
- look up names via module globals at call-time, so replacing the
19
- module-level names makes the patch fully recursive.
20
- Layer 2 — show_api=False: disables the /api route entirely so the introspector
21
- never runs at all. Belt-and-suspenders.
22
- """
23
-
24
- # ═══════════════════════════════════════════════════════════════════════════════
25
- # LAYER 1 β€” MONKEY-PATCH (must be FIRST, before any other import)
26
- # ═══════════════════════════════════════════════════════════════════════════════
27
- import gradio_client.utils as _gc_utils # import the module, not just a symbol
28
-
29
- _orig_get_type = _gc_utils.get_type
30
- _orig_j2p = _gc_utils._json_schema_to_python_type
31
-
32
-
33
- def _safe_get_type(schema):
34
- """Guard get_type() against non-dict schema (e.g. bool from additionalProperties)."""
35
- if not isinstance(schema, dict):
36
- return "Any"
37
- return _orig_get_type(schema)
38
-
39
-
40
- def _safe_j2p(schema, defs=None):
41
- """
42
- Guard _json_schema_to_python_type() against non-dict schema, and normalise
43
- boolean additionalProperties to {} before delegating to the original.
44
-
45
- Why this works recursively:
46
- _orig_j2p() calls `_json_schema_to_python_type(...)` by NAME, which Python
47
- resolves via the module's __dict__ at call-time. We replace that name with
48
- _safe_j2p, so every recursive call from within _orig_j2p also passes
49
- through our guard.
50
- """
51
- if not isinstance(schema, dict):
52
- return "Any"
53
- # Normalise `additionalProperties: True/False` β†’ `additionalProperties: {}`
54
- if isinstance(schema.get("additionalProperties"), bool):
55
- schema = {
56
- k: ({} if k == "additionalProperties" else v)
57
- for k, v in schema.items()
58
- }
59
- return _orig_j2p(schema, defs)
60
-
61
 
62
- # Replace module-level names so all internal references pick up the safe versions
63
- _gc_utils.get_type = _safe_get_type
64
- _gc_utils._json_schema_to_python_type = _safe_j2p
65
-
66
- print("βœ… gradio_client monkey-patch applied.")
67
- # ═══════════════════════════════════════════════════════════════════════════════
68
 
69
  import json
70
  import os
71
  from datetime import datetime
72
 
73
  import faiss
 
74
  import numpy as np
75
  import torch
76
- import gradio as gr
77
  from PIL import Image
78
  from transformers import CLIPModel, CLIPProcessor
79
  from ultralytics import YOLO
80
- from huggingface_hub import InferenceClient
81
 
82
- # ── Model loading ──────────────────────────────────────────────────────────────
83
  print("⏳ Loading CLIP ViT-B/32 ...")
84
  CLIP_MODEL_ID = "openai/clip-vit-base-patch32"
85
  clip_model = CLIPModel.from_pretrained(CLIP_MODEL_ID)
@@ -90,20 +36,23 @@ print("⏳ Loading YOLOv8n ...")
90
  yolo = YOLO("yolov8n.pt") # auto-downloads ~6 MB on first run
91
 
92
  print("⏳ Initialising LLM client ...")
 
 
93
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
94
  llm = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=HF_TOKEN)
95
 
96
  print("βœ… All models ready.")
97
 
98
- # ── FAISS vector store (in-memory) ─────────────────────────────────────────────
99
- EMBED_DIM = 512
100
- faiss_index = faiss.IndexFlatIP(EMBED_DIM)
101
- image_store = [] # list[dict] parallel to FAISS rows
102
 
103
 
104
  # ── Embedding helpers ──────────────────────────────────────────────────────────
105
 
106
  def embed_image(pil_img: Image.Image) -> np.ndarray:
 
107
  inputs = clip_processor(images=pil_img, return_tensors="pt")
108
  with torch.no_grad():
109
  features = clip_model.get_image_features(**inputs)
@@ -113,6 +62,7 @@ def embed_image(pil_img: Image.Image) -> np.ndarray:
113
 
114
 
115
  def embed_text(text: str) -> np.ndarray:
 
116
  inputs = clip_processor(text=[text], return_tensors="pt",
117
  padding=True, truncation=True)
118
  with torch.no_grad():
@@ -125,6 +75,7 @@ def embed_text(text: str) -> np.ndarray:
125
  # ── Detection pipeline ─────────────────────────────────────────────────────────
126
 
127
  def run_detection(pil_img: Image.Image):
 
128
  results = yolo(np.array(pil_img))[0]
129
  annotated = Image.fromarray(results.plot())
130
 
@@ -147,6 +98,7 @@ def run_detection(pil_img: Image.Image):
147
  # ── Index pipeline ─────────────────────────────────────────────────────────────
148
 
149
  def index_image(image_np, note: str):
 
150
  if image_np is None:
151
  return None, "❌ Please upload an image first.", _badge()
152
 
@@ -165,7 +117,8 @@ def index_image(image_np, note: str):
165
  "ts": datetime.now().strftime("%H:%M:%S"),
166
  })
167
 
168
- return annotated, f"βœ… Image #{len(image_store)-1} indexed Β· Found: {summary}", _badge()
 
169
 
170
 
171
  def _badge() -> str:
@@ -175,8 +128,9 @@ def _badge() -> str:
175
  # ── RAG query pipeline ─────────────────────────────────────────────────────────
176
 
177
  def query_images(text_query: str, top_k: int):
 
178
  if not image_store:
179
- return None, "❌ No images indexed yet β€” upload images first.", ""
180
  if not text_query.strip():
181
  return None, "❌ Please enter a question.", ""
182
 
@@ -206,7 +160,7 @@ def query_images(text_query: str, top_k: int):
206
  prompt = (
207
  "<|system|>\n"
208
  "You are a concise visual-AI assistant. "
209
- "Answer the user's question using only the retrieved image context. "
210
  "If context is insufficient, say so.\n"
211
  "<|user|>\n"
212
  f"Retrieved context:\n{context}\n\n"
@@ -216,12 +170,14 @@ def query_images(text_query: str, top_k: int):
216
 
217
  try:
218
  answer = llm.text_generation(
219
- prompt, max_new_tokens=300, temperature=0.2,
 
 
220
  repetition_penalty=1.1,
221
  stop_sequences=["<|user|>", "<|system|>"],
222
  ).strip()
223
  except Exception as exc:
224
- answer = f"⚠️ LLM unavailable ({exc}).\n\nRaw context:\n{context}"
225
 
226
  best_idx = int(idxs[0][0]) if len(idxs[0]) > 0 and idxs[0][0] >= 0 else None
227
  best_image = image_store[best_idx]["annotated"] if best_idx is not None else None
@@ -251,49 +207,54 @@ with gr.Blocks(title="VisualRAG", theme=gr.themes.Soft(primary_hue="blue"), css=
251
  <span class="badge">CLIP ViT-B/32</span>
252
  <span class="badge">FAISS</span>
253
  <span class="badge">Zephyr-7B</span>
254
- <span class="badge">Gradio 4.44.1</span>
255
  </div>
256
  </div>
257
  """)
258
 
259
  with gr.Tabs():
260
 
 
261
  with gr.Tab("πŸ“€ Detect & Index"):
262
  gr.Markdown(
263
  "Upload any image. YOLOv8n detects objects, then CLIP ViT-B/32 "
264
- "encodes it into a 512-d embedding stored in FAISS."
265
  )
266
  with gr.Row():
267
  with gr.Column(scale=1):
268
- img_in = gr.Image(label="Upload image", type="numpy",
269
- sources=["upload", "webcam"])
270
- note_in = gr.Textbox(label="Context note (optional)",
271
- placeholder="e.g. 'Warehouse camera, aisle 3'")
272
- index_btn = gr.Button("πŸ” Detect & Index", variant="primary", size="lg")
273
  with gr.Column(scale=1):
274
  det_out = gr.Image(label="Detection result")
275
  status_out = gr.Textbox(label="Status", interactive=False)
276
  badge_out = gr.Textbox(label="Vector store", interactive=False,
277
  value=_badge())
278
 
279
- index_btn.click(fn=index_image, inputs=[img_in, note_in],
280
- outputs=[det_out, status_out, badge_out])
 
 
 
281
 
 
282
  with gr.Tab("πŸ’¬ Query (RAG)"):
283
  gr.Markdown(
284
- "Ask any question about indexed images. CLIP embeds the query, "
285
- "FAISS retrieves similar images by cosine similarity, "
286
- "Zephyr-7B generates a grounded answer."
287
  )
288
  with gr.Row():
289
  with gr.Column(scale=1):
290
- query_in = gr.Textbox(label="Your question",
291
- placeholder="e.g. 'How many people are visible?'",
292
- lines=3)
 
 
293
  topk_sl = gr.Slider(minimum=1, maximum=5, value=3, step=1,
294
  label="Top-K images to retrieve")
295
- query_btn = gr.Button("πŸ”Ž Search & Generate Answer",
296
- variant="primary", size="lg")
297
  with gr.Column(scale=1):
298
  match_img = gr.Image(label="Best matching image")
299
  llm_out = gr.Textbox(label="AI Answer (RAG-grounded)",
@@ -301,35 +262,51 @@ with gr.Blocks(title="VisualRAG", theme=gr.themes.Soft(primary_hue="blue"), css=
301
 
302
  hits_out = gr.Textbox(label="Retrieval scores", interactive=False, lines=8)
303
 
304
- query_btn.click(fn=query_images, inputs=[query_in, topk_sl],
305
- outputs=[match_img, llm_out, hits_out])
 
 
 
306
 
 
307
  with gr.Tab("πŸ—οΈ How it works"):
308
  gr.Markdown("""
309
  ## System overview
310
 
311
  ### Index pipeline
312
  ```
313
- Image β†’ YOLOv8n detection β†’ CLIP ViT-B/32 encoder β†’ 512-d embedding
314
- β†’ L2 normalisation β†’ FAISS IndexFlatIP (cosine similarity store)
 
 
315
  ```
316
 
317
  ### Query / RAG pipeline
318
  ```
319
- Text β†’ CLIP text encoder β†’ 512-d embedding β†’ L2 norm
320
- β†’ FAISS k-NN search β†’ Top-K retrieved context
321
- β†’ Zephyr-7B (HF Serverless API) β†’ Natural language answer
 
 
 
322
  ```
323
 
324
- ## Bug fixed in this version
325
 
326
- | Symptom | `500 β€” Exit code 1` on every Space start |
327
  |---|---|
328
- | Root cause | `gradio_client/utils.py`: `"const" in schema` where `schema` is `True` (bool) |
329
- | Trigger | `gr.Image` emits `additionalProperties: True` in its JSON Schema; the API introspector crashes on it |
330
- | Fix 1 | Monkey-patch `get_type` + `_json_schema_to_python_type` before importing gradio |
331
- | Fix 2 | `show_api=False` disables the `/api` route so introspection never runs at runtime |
332
- | Fix 3 | `opencv-python-headless` added to requirements (YOLO needs cv2) |
 
 
 
 
 
 
 
333
  """)
334
 
335
  gr.HTML("""
@@ -338,5 +315,4 @@ Text β†’ CLIP text encoder β†’ 512-d embedding β†’ L2 norm
338
  </div>
339
  """)
340
 
341
- # LAYER 2 β€” show_api=False disables the /api route that triggers schema introspection
342
- demo.launch(server_name="0.0.0.0", show_api=False)
 
1
  """
2
  VisualRAG — Multi-Modal AI System
3
  ==================================
4
+ Stack : YOLOv8n · CLIP ViT-B/32 · FAISS · Zephyr-7B · Gradio 4.40.0
5
  Deploy: HuggingFace Spaces (CPU Basic — free tier)
6
 
7
+ Pipeline
8
+ Index : Image → YOLOv8 detection → CLIP embedding → FAISS vector store
9
+ Query : Text → CLIP text embedding → cosine k-NN → LLM answer generation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ No monkey-patching needed with gradio 4.40.0 — the schema introspector
12
+ bug and starlette TemplateResponse API mismatch only affect 4.44.x.
13
+ """
 
 
 
14
 
15
  import json
16
  import os
17
  from datetime import datetime
18
 
19
  import faiss
20
+ import gradio as gr
21
  import numpy as np
22
  import torch
23
+ from huggingface_hub import InferenceClient
24
  from PIL import Image
25
  from transformers import CLIPModel, CLIPProcessor
26
  from ultralytics import YOLO
 
27
 
28
+ # ── Model loading (runs once at Space start-up) ────────────────────────────────
29
  print("⏳ Loading CLIP ViT-B/32 ...")
30
  CLIP_MODEL_ID = "openai/clip-vit-base-patch32"
31
  clip_model = CLIPModel.from_pretrained(CLIP_MODEL_ID)
 
36
  yolo = YOLO("yolov8n.pt") # auto-downloads ~6 MB on first run
37
 
38
  print("⏳ Initialising LLM client ...")
39
+ # Free HF Serverless Inference β€” LLM runs on HF servers, not in the Space.
40
+ # Add HF_TOKEN as a Space Secret for higher rate limits.
41
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
42
  llm = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=HF_TOKEN)
43
 
44
  print("βœ… All models ready.")
45
 
46
+ # ── FAISS vector store (in-memory, session-scoped) ─────────────────────────────
47
+ EMBED_DIM = 512 # CLIP ViT-B/32 output dimension
48
+ faiss_index = faiss.IndexFlatIP(EMBED_DIM) # cosine similarity via L2-normalised dot product
49
+ image_store = [] # parallel list: one dict per indexed image
50
 
51
 
52
  # ── Embedding helpers ──────────────────────────────────────────────────────────
53
 
54
  def embed_image(pil_img: Image.Image) -> np.ndarray:
55
+ """Return L2-normalised 512-d CLIP image embedding (shape 1Γ—512)."""
56
  inputs = clip_processor(images=pil_img, return_tensors="pt")
57
  with torch.no_grad():
58
  features = clip_model.get_image_features(**inputs)
 
62
 
63
 
64
  def embed_text(text: str) -> np.ndarray:
65
+ """Return L2-normalised 512-d CLIP text embedding (shape 1Γ—512)."""
66
  inputs = clip_processor(text=[text], return_tensors="pt",
67
  padding=True, truncation=True)
68
  with torch.no_grad():
 
75
  # ── Detection pipeline ─────────────────────────────────────────────────────────
76
 
77
  def run_detection(pil_img: Image.Image):
78
+ """Run YOLOv8n β†’ return (annotated PIL, detections list, summary string)."""
79
  results = yolo(np.array(pil_img))[0]
80
  annotated = Image.fromarray(results.plot())
81
 
 
98
  # ── Index pipeline ─────────────────────────────────────────────────────────────
99
 
100
  def index_image(image_np, note: str):
101
+ """Detect β†’ embed β†’ store in FAISS. Called by the Index button."""
102
  if image_np is None:
103
  return None, "❌ Please upload an image first.", _badge()
104
 
 
117
  "ts": datetime.now().strftime("%H:%M:%S"),
118
  })
119
 
120
+ msg = f"βœ… Image #{len(image_store) - 1} indexed Β· Found: {summary}"
121
+ return annotated, msg, _badge()
122
 
123
 
124
  def _badge() -> str:
 
128
  # ── RAG query pipeline ─────────────────────────────────────────────────────────
129
 
130
  def query_images(text_query: str, top_k: int):
131
+ """Text β†’ CLIP embed β†’ FAISS k-NN β†’ RAG prompt β†’ Zephyr-7B answer."""
132
  if not image_store:
133
+ return None, "❌ No images indexed yet β€” upload images in the 'Detect & Index' tab first.", ""
134
  if not text_query.strip():
135
  return None, "❌ Please enter a question.", ""
136
 
 
160
  prompt = (
161
  "<|system|>\n"
162
  "You are a concise visual-AI assistant. "
163
+ "Answer using only the retrieved image context below. "
164
  "If context is insufficient, say so.\n"
165
  "<|user|>\n"
166
  f"Retrieved context:\n{context}\n\n"
 
170
 
171
  try:
172
  answer = llm.text_generation(
173
+ prompt,
174
+ max_new_tokens=300,
175
+ temperature=0.2,
176
  repetition_penalty=1.1,
177
  stop_sequences=["<|user|>", "<|system|>"],
178
  ).strip()
179
  except Exception as exc:
180
+ answer = f"⚠️ LLM unavailable ({exc}).\n\nRaw retrieval context:\n{context}"
181
 
182
  best_idx = int(idxs[0][0]) if len(idxs[0]) > 0 and idxs[0][0] >= 0 else None
183
  best_image = image_store[best_idx]["annotated"] if best_idx is not None else None
 
207
  <span class="badge">CLIP ViT-B/32</span>
208
  <span class="badge">FAISS</span>
209
  <span class="badge">Zephyr-7B</span>
210
+ <span class="badge">Gradio 4.40.0</span>
211
  </div>
212
  </div>
213
  """)
214
 
215
  with gr.Tabs():
216
 
217
+ # ── TAB 1: Detect & Index ──────────────────────────────────────────────
218
  with gr.Tab("πŸ“€ Detect & Index"):
219
  gr.Markdown(
220
  "Upload any image. YOLOv8n detects objects, then CLIP ViT-B/32 "
221
+ "encodes it into a 512-d embedding stored in FAISS for later retrieval."
222
  )
223
  with gr.Row():
224
  with gr.Column(scale=1):
225
+ img_in = gr.Image(label="Upload image", type="numpy")
226
+ note_in = gr.Textbox(label="Context note (optional)",
227
+ placeholder="e.g. 'Warehouse camera, aisle 3'")
228
+ index_btn = gr.Button("πŸ” Detect & Index", variant="primary")
 
229
  with gr.Column(scale=1):
230
  det_out = gr.Image(label="Detection result")
231
  status_out = gr.Textbox(label="Status", interactive=False)
232
  badge_out = gr.Textbox(label="Vector store", interactive=False,
233
  value=_badge())
234
 
235
+ index_btn.click(
236
+ fn=index_image,
237
+ inputs=[img_in, note_in],
238
+ outputs=[det_out, status_out, badge_out],
239
+ )
240
 
241
+ # ── TAB 2: Query (RAG) ─────────────────────────────────────────────────
242
  with gr.Tab("πŸ’¬ Query (RAG)"):
243
  gr.Markdown(
244
+ "Ask any question about your indexed images. CLIP embeds the query, "
245
+ "FAISS retrieves the most similar images by cosine similarity, "
246
+ "and Zephyr-7B generates a grounded answer."
247
  )
248
  with gr.Row():
249
  with gr.Column(scale=1):
250
+ query_in = gr.Textbox(
251
+ label="Your question",
252
+ placeholder="e.g. 'How many people are visible?' or 'Are there any vehicles?'",
253
+ lines=3,
254
+ )
255
  topk_sl = gr.Slider(minimum=1, maximum=5, value=3, step=1,
256
  label="Top-K images to retrieve")
257
+ query_btn = gr.Button("πŸ”Ž Search & Generate Answer", variant="primary")
 
258
  with gr.Column(scale=1):
259
  match_img = gr.Image(label="Best matching image")
260
  llm_out = gr.Textbox(label="AI Answer (RAG-grounded)",
 
262
 
263
  hits_out = gr.Textbox(label="Retrieval scores", interactive=False, lines=8)
264
 
265
+ query_btn.click(
266
+ fn=query_images,
267
+ inputs=[query_in, topk_sl],
268
+ outputs=[match_img, llm_out, hits_out],
269
+ )
270
 
271
+ # ── TAB 3: How it works ────────────────────────────────────────────────
272
  with gr.Tab("πŸ—οΈ How it works"):
273
  gr.Markdown("""
274
  ## System overview
275
 
276
  ### Index pipeline
277
  ```
278
+ Image β†’ YOLOv8n detection (objects + confidence scores)
279
+ β†’ CLIP ViT-B/32 image encoder β†’ 512-d embedding
280
+ β†’ L2 normalisation
281
+ β†’ FAISS IndexFlatIP (cosine similarity store)
282
  ```
283
 
284
  ### Query / RAG pipeline
285
  ```
286
+ Text query β†’ CLIP text encoder β†’ 512-d query embedding
287
+ β†’ L2 normalisation
288
+ β†’ FAISS k-NN search (cosine similarity, top-K)
289
+ β†’ RAG prompt = query + retrieved context
290
+ β†’ Zephyr-7B-Ξ² (HF Serverless Inference API)
291
+ β†’ Natural language answer
292
  ```
293
 
294
+ ## Stack
295
 
296
+ | Component | Technology |
297
  |---|---|
298
+ | Object detection | YOLOv8n (Ultralytics) |
299
+ | Visual embedding | CLIP ViT-B/32 (OpenAI via HF) |
300
+ | Vector index | FAISS IndexFlatIP (cosine sim) |
301
+ | LLM | Zephyr-7B-Ξ² (HF Serverless API) |
302
+ | UI | Gradio 4.40.0 |
303
+
304
+ ## Why gradio 4.40.0
305
+
306
+ Version 4.44.1 has three cascading runtime bugs on HF Spaces: a schema
307
+ introspector TypeError, a non-existent gradio_client pin, and a starlette
308
+ TemplateResponse API mismatch that causes a Jinja2 `unhashable type: dict`
309
+ crash. Version 4.40.0 is widely deployed and has none of these issues.
310
  """)
311
 
312
  gr.HTML("""
 
315
  </div>
316
  """)
317
 
318
+ demo.launch(server_name="0.0.0.0")