Spaces:

jiang-cc
/

AD-Copilot

Running on Zero

App Files Files Community

jiang-cc committed on Apr 9

Commit

a3b3596

verified ·

1 Parent(s): 875cb52

feat: auto-visualize bounding boxes on test image when model outputs bbox JSON

Browse files

Files changed (1) hide show

app.py +85 -6

app.py CHANGED Viewed

@@ -2,14 +2,16 @@
 AD-Copilot Demo: Comparison-Aware Anomaly Detection with Vision-Language Model
 """
 import os
 import traceback
 import spaces
 import gradio as gr
 import torch
 from transformers import AutoModelForImageTextToText, AutoProcessor
 from qwen_vl_utils import process_vision_info
-from PIL import Image
 # ---------------------------------------------------------------------------
 # Model loading (happens once at Space startup; weights stay on CPU until
@@ -31,6 +33,75 @@ model = AutoModelForImageTextToText.from_pretrained(
 ).eval()
 # ---------------------------------------------------------------------------
 # Inference
 # ---------------------------------------------------------------------------
@@ -42,7 +113,7 @@ def predict(
     max_new_tokens: float,
 ):
     if reference_image is None or test_image is None:
-        return "Please upload both a reference (good) image and a test image."
     try:
         max_new_tokens = int(max_new_tokens)
@@ -88,11 +159,18 @@ def predict(
             skip_special_tokens=True,
             clean_up_tokenization_spaces=False,
         )[0]
-        return output
     except Exception as e:
         tb = traceback.format_exc()
         print(tb, flush=True)
-        return f"Error:\n{tb}"
 # ---------------------------------------------------------------------------
@@ -183,17 +261,18 @@ with gr.Blocks(theme=gr.themes.Soft(), title=TITLE) as demo:
         run_btn = gr.Button("Detect Anomaly", variant="primary", scale=2)
     output = gr.Textbox(label="Model Output", lines=4)
     run_btn.click(
         fn=predict,
         inputs=[ref_img, test_img, prompt, max_tokens],
-        outputs=output,
     )
     gr.Examples(
         examples=EXAMPLES,
         inputs=[ref_img, test_img, prompt, max_tokens],
-        outputs=output,
         fn=predict,
         cache_examples=False,
     )

 AD-Copilot Demo: Comparison-Aware Anomaly Detection with Vision-Language Model
 """
+import json
 import os
+import re
 import traceback
 import spaces
 import gradio as gr
 import torch
 from transformers import AutoModelForImageTextToText, AutoProcessor
 from qwen_vl_utils import process_vision_info
+from PIL import Image, ImageDraw, ImageFont
 # ---------------------------------------------------------------------------
 # Model loading (happens once at Space startup; weights stay on CPU until
 ).eval()
+# ---------------------------------------------------------------------------
+# BBox visualization
+# ---------------------------------------------------------------------------
+COLORS = [
+    "#FF4444", "#44AA44", "#4488FF", "#FF8800",
+    "#AA44FF", "#00CCCC", "#FF44AA", "#88AA00",
+]
def parse_bboxes(text):
    """Extract a list of bounding-box dicts from raw model output.

    Looks first for a JSON array inside a Markdown code fence (```json ... ```
    or an untagged ``` fence), then falls back to the first bare ``[{...}]``
    array found anywhere in the text.

    Args:
        text: Decoded model output. Anything that is not a string yields None.

    Returns:
        The dict entries of the parsed array when the first of them carries a
        "bbox_2d" key; otherwise None (no array found, invalid JSON, or no
        usable entry). Non-dict elements are dropped so that callers can rely
        on every returned item supporting ``.get``.
    """
    if not isinstance(text, str):
        return None

    # Prefer a fenced block: the closing fence anchors the lazy match, so the
    # inner coordinate arrays do not terminate it early.
    match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', text, re.DOTALL)
    if match is None:
        # Fall back to a bare JSON array of objects.
        match = re.search(r'(\[\s*\{.*?\}\s*\])', text, re.DOTALL)
    if match is None:
        return None

    try:
        parsed = json.loads(match.group(1))
    except json.JSONDecodeError:
        return None
    if not isinstance(parsed, list):
        return None

    # A fenced array such as [1, 2] is valid JSON but not a bbox list; testing
    # `"bbox_2d" in 1` would raise TypeError, so keep only dict entries.
    entries = [item for item in parsed if isinstance(item, dict)]
    if entries and "bbox_2d" in entries[0]:
        return entries
    return None
def draw_bboxes(image, bboxes):
    """Return a copy of *image* with labelled bounding boxes drawn on it.

    Args:
        image: Image to annotate (assumed to be a PIL image; it is copied via
            ``image.copy()`` and the original is left untouched).
        bboxes: Iterable of dicts holding a "bbox_2d" entry of the form
            [x1, y1, x2, y2] and an optional "label". Coordinates are passed
            to the drawing calls unchanged, apart from corner ordering.

    Returns:
        The annotated copy. Entries that are not dicts, or whose "bbox_2d" is
        not a sequence of four numbers, are skipped instead of raising, so one
        malformed entry does not discard the whole visualization.
    """
    img = image.copy()
    draw = ImageDraw.Draw(img)

    # Try to get a reasonable font; fall back to PIL's built-in bitmap font
    # when the DejaVu file is not installed.
    try:
        label_font = ImageFont.truetype(
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 13
        )
    except (IOError, OSError):
        label_font = ImageFont.load_default()

    for i, bbox_info in enumerate(bboxes):
        if not isinstance(bbox_info, dict):
            continue
        bbox = bbox_info.get("bbox_2d", [])
        if not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
            continue
        # bool is an int subclass; reject it along with strings, None, etc.
        if not all(
            isinstance(v, (int, float)) and not isinstance(v, bool) for v in bbox
        ):
            continue

        x1, y1, x2, y2 = bbox
        # Recent Pillow versions raise ValueError when a corner pair is
        # reversed, so normalize the ordering first.
        x1, x2 = min(x1, x2), max(x1, x2)
        y1, y2 = min(y1, y2), max(y1, y2)

        raw_label = bbox_info.get("label")
        label = f"defect_{i}" if raw_label is None else str(raw_label)
        color = COLORS[i % len(COLORS)]

        # Draw box with thicker outline: three nested 1px rectangles, each one
        # pixel further out.
        for w in range(3):
            draw.rectangle([x1 - w, y1 - w, x2 + w, y2 + w], outline=color)

        # Draw label background sized to the rendered text plus padding, kept
        # inside the image when the box touches the top edge.
        text_bbox = draw.textbbox((0, 0), label, font=label_font)
        tw = text_bbox[2] - text_bbox[0] + 8
        th = text_bbox[3] - text_bbox[1] + 6
        label_y = max(0, y1 - th - 2)
        draw.rectangle([x1, label_y, x1 + tw, label_y + th], fill=color)
        draw.text((x1 + 4, label_y + 2), label, fill="white", font=label_font)

    return img
 # ---------------------------------------------------------------------------
 # Inference
 # ---------------------------------------------------------------------------
     max_new_tokens: float,
 ):
     if reference_image is None or test_image is None:
+        return "Please upload both a reference (good) image and a test image.", None
     try:
         max_new_tokens = int(max_new_tokens)
             skip_special_tokens=True,
             clean_up_tokenization_spaces=False,
         )[0]
+        # Try to visualize bboxes if present
+        bboxes = parse_bboxes(output)
+        vis_image = None
+        if bboxes:
+            vis_image = draw_bboxes(test_image, bboxes)
+        return output, vis_image
     except Exception as e:
         tb = traceback.format_exc()
         print(tb, flush=True)
+        return f"Error:\n{tb}", None
 # ---------------------------------------------------------------------------
         run_btn = gr.Button("Detect Anomaly", variant="primary", scale=2)
     output = gr.Textbox(label="Model Output", lines=4)
+    vis_output = gr.Image(label="Detection Visualization", visible=True)
     run_btn.click(
         fn=predict,
         inputs=[ref_img, test_img, prompt, max_tokens],
+        outputs=[output, vis_output],
     )
     gr.Examples(
         examples=EXAMPLES,
         inputs=[ref_img, test_img, prompt, max_tokens],
+        outputs=[output, vis_output],
         fn=predict,
         cache_examples=False,
     )