"""
VisualRAG — Multi-Modal AI System
==================================
Stack : YOLOv8n · CLIP ViT-B/32 · FAISS · Zephyr-7B · Gradio 4.40.0
Deploy: HuggingFace Spaces (CPU Basic — free tier)

Pipeline
  Index : Image → YOLOv8 detection → CLIP embedding → FAISS vector store
  Query : Text → CLIP text embedding → cosine k-NN → LLM answer generation

No monkey-patching needed with gradio 4.40.0 — the schema introspector
bug and starlette TemplateResponse API mismatch only affect 4.44.x.
"""

import json
import os
from collections import Counter
from datetime import datetime

import faiss
import gradio as gr
import numpy as np
import torch
from huggingface_hub import InferenceClient
from PIL import Image
from transformers import CLIPModel, CLIPProcessor
from ultralytics import YOLO

# ---------------------------------------------------------------------------
# Model loading — executed once, at import time
# ---------------------------------------------------------------------------
print("⏳ Loading CLIP ViT-B/32 ...")
CLIP_MODEL_ID = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(CLIP_MODEL_ID)
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_ID)
clip_model.eval()  # the model is only used for inference in this file

print("⏳ Loading YOLOv8n ...")
yolo = YOLO("yolov8n.pt")

print("⏳ Initialising LLM client ...")
# HF_TOKEN is optional: it is None when the environment variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
llm = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=HF_TOKEN)

print("✅ All models ready.")

# ---------------------------------------------------------------------------
# In-memory vector store
# ---------------------------------------------------------------------------
EMBED_DIM = 512  # width of the vectors stored in the FAISS index
# Inner-product index; the embedding helpers L2-normalise their output, so
# inner product on those vectors equals cosine similarity.
faiss_index = faiss.IndexFlatIP(EMBED_DIM)
# One record (dict) per indexed image, appended by index_image() in the same
# call that adds the vector to faiss_index, so list position == FAISS row id.
image_store = []
def embed_image(pil_img: Image.Image) -> np.ndarray:
    """Encode an image with CLIP and return its L2-normalised embedding.

    The image is preprocessed by ``clip_processor`` and passed through
    ``clip_model.get_image_features`` with gradient tracking disabled. The
    result is converted to a float32 NumPy array (512-d, shape 1×512) and
    normalised in place by FAISS before being returned.
    """
    pixel_inputs = clip_processor(images=pil_img, return_tensors="pt")

    with torch.no_grad():
        image_features = clip_model.get_image_features(**pixel_inputs)

    vector = image_features.numpy().astype("float32")
    faiss.normalize_L2(vector)  # in-place normalisation
    return vector
def embed_text(text: str) -> np.ndarray:
    """Encode a text string with CLIP and return its L2-normalised embedding.

    The text is tokenised (padded and truncated) by ``clip_processor`` and
    passed through ``clip_model.get_text_features`` with gradient tracking
    disabled. The result is converted to a float32 NumPy array (512-d,
    shape 1×512) and normalised in place by FAISS before being returned.
    """
    token_inputs = clip_processor(
        text=[text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    with torch.no_grad():
        text_features = clip_model.get_text_features(**token_inputs)

    vector = text_features.numpy().astype("float32")
    faiss.normalize_L2(vector)  # in-place normalisation
    return vector
def run_detection(pil_img: Image.Image):
    """Run YOLOv8n on an image and summarise the detected objects.

    Args:
        pil_img: Image to analyse. It is converted to RGB before inference.

    Returns:
        A 3-tuple ``(annotated, detections, summary)``:

        * ``annotated``  — PIL image with the detection overlay drawn on it.
        * ``detections`` — list of ``{"label": str, "confidence": float}``
          dicts, one per box, confidence rounded to 3 decimals.
        * ``summary``    — per-label counts such as ``"2 person, 1 dog"``, or
          ``"no objects detected"`` when there are no boxes.
    """
    # Ultralytics interprets NumPy input as BGR (OpenCV convention) but PIL
    # input as RGB. Passing the PIL image directly keeps the channel order
    # correct; feeding np.array(pil_img) would run the detector on an image
    # with red and blue swapped.
    results = yolo(pil_img.convert("RGB"))[0]

    # Results.plot() returns a BGR array; reverse the channel axis so the
    # PIL image built from it is RGB.
    annotated = Image.fromarray(np.ascontiguousarray(results.plot()[..., ::-1]))

    boxes = results.boxes if results.boxes is not None else []
    detections = [
        {
            "label": yolo.names[int(box.cls[0])],
            "confidence": round(float(box.conf[0]), 3),
        }
        for box in boxes
    ]

    # Counter keeps first-seen order, so the summary lists labels in the
    # order the detector reported them.
    counts = Counter(det["label"] for det in detections)
    summary = (
        ", ".join(f"{count} {label}" for label, count in counts.items())
        or "no objects detected"
    )

    return annotated, detections, summary
def index_image(image_np, note: str):
    """Detect objects in an image, embed it, and add it to the vector store.

    Called by the "Detect & Index" button.

    Args:
        image_np: Uploaded image as a NumPy array, or ``None`` when nothing
            was uploaded.
        note: Optional free-text context note; ``None`` or blank is stored
            as a dash placeholder.

    Returns:
        ``(annotated_image, status_message, badge_text)``. ``annotated_image``
        is ``None`` when no image was supplied.
    """
    if image_np is None:
        # NOTE(review): the leading "β" is residue of a mis-encoded symbol
        # whose original code point cannot be recovered from this file; it is
        # left unchanged.
        return None, "β Please upload an image first.", _badge()

    pil_img = Image.fromarray(image_np)
    annotated, detections, summary = run_detection(pil_img)
    embedding = embed_image(pil_img)

    # The FAISS row id and the image_store position must stay in lockstep:
    # query_images() uses the FAISS id to index image_store.
    image_id = len(image_store)
    faiss_index.add(embedding)
    image_store.append({
        "id": image_id,
        "image": pil_img.copy(),
        "annotated": annotated,
        "detections": detections,
        "summary": summary,
        "note": (note or "").strip() or "—",
        "ts": datetime.now().strftime("%H:%M:%S"),
    })

    msg = f"✅ Image #{image_id} indexed · Found: {summary}"
    return annotated, msg, _badge()
def _badge() -> str:
    """Return the status line showing how many images are in ``image_store``."""
    return f"📦 {len(image_store)} image(s) in vector store"
def _build_rag_prompt(context: str, question: str) -> str:
    """Assemble the system/user/assistant prompt sent to the LLM."""
    return (
        "<|system|>\n"
        "You are a concise visual-AI assistant. "
        "Answer using only the retrieved image context below. "
        "If context is insufficient, say so.\n"
        "<|user|>\n"
        f"Retrieved context:\n{context}\n\n"
        f"Question: {question}\n"
        "<|assistant|>\n"
    )


def query_images(text_query: str, top_k: int):
    """Answer a text question from the indexed images (retrieve, then generate).

    Pipeline: text → CLIP embedding → FAISS k-NN → RAG prompt → Zephyr-7B.

    Args:
        text_query: The user's question. ``None`` or blank yields a prompt to
            enter a question.
        top_k: Maximum number of images to retrieve; clamped to the range
            ``1 .. len(image_store)``.

    Returns:
        ``(best_image, answer, hits_json)``:

        * ``best_image`` — annotated image of the top-ranked hit, or ``None``.
        * ``answer``     — LLM answer, or a fallback message containing the
          raw retrieval context when the LLM call fails.
        * ``hits_json``  — JSON list of rank / image id / score / objects /
          note for every hit (empty string on the early-exit paths).
    """
    # NOTE(review): the leading "β" in the two messages below is residue of a
    # mis-encoded symbol whose original code point cannot be recovered from
    # this file; it is left unchanged.
    if not image_store:
        return None, "β No images indexed yet — upload images in the 'Detect & Index' tab first.", ""
    if not (text_query or "").strip():
        return None, "β Please enter a question.", ""

    query_emb = embed_text(text_query)
    # Never ask FAISS for fewer than one or more than the stored count.
    k = max(1, min(int(top_k), len(image_store)))
    scores, idxs = faiss_index.search(query_emb, k)

    hits, ctx_lines = [], []
    for rank, (score, idx) in enumerate(zip(scores[0], idxs[0]), start=1):
        if idx < 0:  # negative id: no neighbour was returned for this slot
            continue
        img_id = int(idx)
        item = image_store[img_id]
        hits.append({
            "rank": rank,
            "img_id": img_id,
            "score": round(float(score), 4),
            "objects": item["summary"],
            "note": item["note"],
        })
        ctx_lines.append(
            f"[Image #{img_id}] objects: {item['summary']} | "
            f"note: {item['note']} | indexed at: {item['ts']} | "
            f"cosine similarity: {score:.3f}"
        )

    context = "\n".join(ctx_lines)
    prompt = _build_rag_prompt(context, text_query)

    try:
        answer = llm.text_generation(
            prompt,
            max_new_tokens=300,
            temperature=0.2,
            repetition_penalty=1.1,
            stop_sequences=["<|user|>", "<|system|>"],
        ).strip()
    except Exception as exc:
        # Deliberate best-effort at the UI boundary: if the remote LLM call
        # fails for any reason, still show the user what was retrieved.
        answer = f"⚠️ LLM unavailable ({exc}).\n\nRaw retrieval context:\n{context}"

    best_image = image_store[hits[0]["img_id"]]["annotated"] if hits else None

    # ensure_ascii=False keeps non-ASCII notes readable in the results box
    # instead of rendering them as \uXXXX escapes.
    return best_image, answer, json.dumps(hits, indent=2, ensure_ascii=False)
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
# Custom styles for the hero header and the technology badges.
CSS = """
.hero{text-align:center;padding:20px 0 4px}
.hero h1{font-size:28px;margin:0}
.hero p{color:var(--color-subdued);margin:6px 0 0}
.badge-row{display:flex;gap:8px;justify-content:center;flex-wrap:wrap;margin-top:10px}
.badge{background:var(--color-background-secondary);border:1px solid var(--border-color-primary);
border-radius:20px;padding:3px 12px;font-size:12px;color:var(--color-text-body)}
"""

# NOTE(review): the lone "π" / "ποΈ" glyphs in the heading, button and tab
# labels below are residue of mis-encoded emoji whose original code points
# cannot be recovered from this file; they are left unchanged. Characters
# that could be decoded unambiguously (·, →, β, 📤, 💬) have been restored.
_HERO_HTML = """
<div class="hero">
  <h1>π VisualRAG</h1>
  <p>Multi-Modal AI · Object Detection + Visual Embeddings + RAG + LLM</p>
  <div class="badge-row">
    <span class="badge">YOLOv8</span>
    <span class="badge">CLIP ViT-B/32</span>
    <span class="badge">FAISS</span>
    <span class="badge">Zephyr-7B</span>
    <span class="badge">Gradio 4.40.0</span>
  </div>
</div>
"""

# Kept flush-left so the Markdown headings, fences and table render the same
# regardless of how the component treats leading indentation.
_HOW_IT_WORKS_MD = """
## System overview

### Index pipeline
```
Image → YOLOv8n detection (objects + confidence scores)
→ CLIP ViT-B/32 image encoder → 512-d embedding
→ L2 normalisation
→ FAISS IndexFlatIP (cosine similarity store)
```

### Query / RAG pipeline
```
Text query → CLIP text encoder → 512-d query embedding
→ L2 normalisation
→ FAISS k-NN search (cosine similarity, top-K)
→ RAG prompt = query + retrieved context
→ Zephyr-7B-β (HF Serverless Inference API)
→ Natural language answer
```

## Stack

| Component | Technology |
|---|---|
| Object detection | YOLOv8n (Ultralytics) |
| Visual embedding | CLIP ViT-B/32 (OpenAI via HF) |
| Vector index | FAISS IndexFlatIP (cosine sim) |
| LLM | Zephyr-7B-β (HF Serverless API) |
| UI | Gradio 4.40.0 |

## Why gradio 4.40.0

Version 4.44.1 has three cascading runtime bugs on HF Spaces: a schema
introspector TypeError, a non-existent gradio_client pin, and a starlette
TemplateResponse API mismatch that causes a Jinja2 `unhashable type: dict`
crash. Version 4.40.0 is widely deployed and has none of these issues.
"""

_FOOTER_HTML = """
<div style="text-align:center;padding:14px 0 4px;color:var(--color-subdued);font-size:12px">
VisualRAG · YOLOv8 + CLIP + FAISS + LLM · HuggingFace Spaces
</div>
"""

with gr.Blocks(title="VisualRAG", theme=gr.themes.Soft(primary_hue="blue"), css=CSS) as demo:

    gr.HTML(_HERO_HTML)

    with gr.Tabs():

        # Tab 1 — upload an image, run detection, store its embedding.
        with gr.Tab("📤 Detect & Index"):
            gr.Markdown(
                "Upload any image. YOLOv8n detects objects, then CLIP ViT-B/32 "
                "encodes it into a 512-d embedding stored in FAISS for later retrieval."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    img_in = gr.Image(label="Upload image", type="numpy")
                    note_in = gr.Textbox(
                        label="Context note (optional)",
                        placeholder="e.g. 'Warehouse camera, aisle 3'",
                    )
                    index_btn = gr.Button("π Detect & Index", variant="primary")
                with gr.Column(scale=1):
                    det_out = gr.Image(label="Detection result")
                    status_out = gr.Textbox(label="Status", interactive=False)
                    badge_out = gr.Textbox(
                        label="Vector store",
                        interactive=False,
                        value=_badge(),
                    )

            index_btn.click(
                fn=index_image,
                inputs=[img_in, note_in],
                outputs=[det_out, status_out, badge_out],
            )

        # Tab 2 — ask a question; retrieve matching images and generate an answer.
        with gr.Tab("💬 Query (RAG)"):
            gr.Markdown(
                "Ask any question about your indexed images. CLIP embeds the query, "
                "FAISS retrieves the most similar images by cosine similarity, "
                "and Zephyr-7B generates a grounded answer."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    query_in = gr.Textbox(
                        label="Your question",
                        placeholder="e.g. 'How many people are visible?' or 'Are there any vehicles?'",
                        lines=3,
                    )
                    topk_sl = gr.Slider(
                        minimum=1,
                        maximum=5,
                        value=3,
                        step=1,
                        label="Top-K images to retrieve",
                    )
                    query_btn = gr.Button("π Search & Generate Answer", variant="primary")
                with gr.Column(scale=1):
                    match_img = gr.Image(label="Best matching image")
                    llm_out = gr.Textbox(
                        label="AI Answer (RAG-grounded)",
                        lines=6,
                        interactive=False,
                    )

            hits_out = gr.Textbox(label="Retrieval scores", interactive=False, lines=8)

            query_btn.click(
                fn=query_images,
                inputs=[query_in, topk_sl],
                outputs=[match_img, llm_out, hits_out],
            )

        # Tab 3 — static documentation.
        with gr.Tab("ποΈ How it works"):
            gr.Markdown(_HOW_IT_WORKS_MD)

    gr.HTML(_FOOTER_HTML)


# Bind to all interfaces so the app is reachable from outside the container.
demo.launch(server_name="0.0.0.0")