Update app.py
app.py
CHANGED
@@ -1,8 +1,9 @@
 # ==========================================================
-#
-# - Uses
+# YOLOv8n Visualizer – "Inside Object Detection"
+# - Uses Ultralytics YOLOv8n (small, CPU-friendly)
 # - Shows detections + early/mid/late feature maps
-# -
+# - Simple vs Technical explanation
+# - Gradio 5 compatible, also OK on 6 (no theme arg)
 # ==========================================================
 
 import gradio as gr
@@ -10,11 +11,11 @@ import torch
 import numpy as np
 from PIL import Image
 
+from ultralytics import YOLO
+
 # ------------------- GLOBALS -------------------
 
-MODEL_NAME = "yolov5n"  # smallest YOLOv5 model (fast & light)
 DEVICE = "cpu"
-
 MODEL = None
 FEATURE_MAPS = {}  # {layer_name: tensor(B,C,H,W)}
 
@@ -23,34 +24,45 @@ FEATURE_MAPS = {}  # {layer_name: tensor(B,C,H,W)}
 
 def load_model():
     """
-    Load
-
+    Load YOLOv8n once and register forward hooks
+    on backbone/head layers to capture feature maps.
     """
     global MODEL, FEATURE_MAPS
     if MODEL is not None:
         return MODEL
 
-    #
-
-    model = torch.hub.load("ultralytics/yolov5", MODEL_NAME, pretrained=True)
-    model.to(DEVICE)
-    model.eval()
+    # This will download yolov8n.pt on first run and cache it
+    model = YOLO("yolov8n.pt")
 
-
+    # Ensure model on CPU
+    if hasattr(model, "to"):
+        model.to(DEVICE)
+    else:
+        model.model.to(DEVICE)
 
-
-    def hook(module, input, output):
-        # YOLO can run on GPU or CPU but we store CPU tensors for visualization
-        with torch.no_grad():
-            FEATURE_MAPS[name] = output.detach().cpu()
-    return hook
+    model.model.eval()
 
-
-
-
-
-
-
+    FEATURE_MAPS = {}
+
+    # Register hooks on layers in the detection model
+    # For YOLOv8, model.model.model is a list of blocks (backbone + head)
+    for idx, layer in enumerate(model.model.model):
+        def make_hook(name):
+            def hook(module, inputs, output):
+                # Handle tensors vs lists/tuples
+                with torch.no_grad():
+                    out = output
+                    if isinstance(out, (list, tuple)):
+                        # pick first tensor-like element
+                        out = next(
+                            (o for o in out if isinstance(o, torch.Tensor)),
+                            None
+                        )
+                    if isinstance(out, torch.Tensor):
+                        FEATURE_MAPS[name] = out.detach().cpu()
+            return hook
+
+        layer.register_forward_hook(make_hook(str(idx)))
 
     MODEL = model
     return MODEL
@@ -61,52 +73,56 @@ def load_model():
 def tensor_to_heatmap(fm, out_size):
     """
     Convert a feature map tensor (C,H,W) to a grayscale heatmap PIL image.
-    Steps:
     - average over channels
     - normalize to 0..1
-    -
+    - resize to out_size (W,H)
     """
     if fm.ndim != 3:
         return None
 
     fm_np = fm.numpy().astype(np.float32)  # (C,H,W)
-
-    heat = fm_np.mean(axis=0)
+    heat = fm_np.mean(axis=0)  # (H,W)
 
-    if np.
+    if not np.any(heat):
         heat = np.zeros_like(heat)
     else:
-        heat
+        heat -= heat.min()
         maxv = heat.max()
         if maxv > 0:
-            heat
+            heat /= maxv
 
-
-    pil = Image.fromarray(
+    img = (heat * 255).astype("uint8")
+    pil = Image.fromarray(img, mode="L")
     pil = pil.resize(out_size, Image.NEAREST)
     return pil
 
 
 def pick_feature_maps():
     """
-
-
-    Returns
+    Choose three feature maps: early, middle, late.
+    FEATURE_MAPS keys are stringified indices "0", "1", ...
+    Returns list[(name, fm_tensor(C,H,W))]
     """
     if not FEATURE_MAPS:
         return []
 
-    #
+    # sort by numeric layer index
     keys = sorted(FEATURE_MAPS.keys(), key=lambda x: int(x))
-    fms = [
+    fms = []
+    for k in keys:
+        t = FEATURE_MAPS[k]
+        if isinstance(t, torch.Tensor) and t.ndim == 4:
+            fms.append((k, t[0]))  # (name, (C,H,W))
+
+    if not fms:
+        return []
 
-    # pick early, mid, late
     idxs = [0, len(fms) // 2, len(fms) - 1]
-    idxs = sorted(
+    idxs = sorted(set(idxs))
 
     chosen = []
     for i in idxs:
-        chosen.append(
+        chosen.append(fms[i])
     return chosen
 
 
@@ -114,101 +130,106 @@ def pick_feature_maps():
 
 def analyze_yolo(img, conf_thres, iou_thres, simple_mode):
     """
-    Run
-    - detection
-    - early feature map
-    -
-    - late feature map heatmap
-    - explanation markdown
+    Run YOLOv8n on input image and produce:
+    - detection image with boxes
+    - early/mid/late feature map heatmaps
+    - explanation text (simple or technical)
     """
     if img is None:
         return (
-            None, #
-            None, # early
-            None, # mid
-            None, # late
+            None,  # detection image
+            None,  # early heatmap
+            None,  # mid heatmap
+            None,  # late heatmap
             "⚠️ Please upload an image first."
         )
 
     model = load_model()
 
-    # Clear old feature maps
+    # Clear old feature maps before forward
    FEATURE_MAPS.clear()
 
-    #
+    # Gradio gives PIL image (type="pil")
     pil = img
 
     # Configure thresholds
-
-
+    conf = float(conf_thres)
+    iou = float(iou_thres)
 
     with torch.no_grad():
-        results = model(
+        results = model(
+            pil,
+            conf=conf,
+            iou=iou,
+            verbose=False
+        )
 
-
-        rendered = results.render()[0]  # numpy array (H,W,C)
-        det_img = Image.fromarray(rendered)
+    res = results[0]
 
-    #
-
+    # res.plot() returns numpy array (H,W,3), BGR by default, but visually OK
+    det_np = res.plot()
+    det_img = Image.fromarray(det_np)
+
+    # Now FEATURE_MAPS should be filled by hooks
+    chosen = pick_feature_maps()
     W, H = pil.size
-    heatmaps = [None, None, None]
+    heatmaps = [None, None, None]
 
-    for idx, item in enumerate(
+    for idx, item in enumerate(chosen):
         name, fm = item
         hm = tensor_to_heatmap(fm, (W, H))
         heatmaps[idx] = hm
 
-    # Build
+    # Build explanation
     if simple_mode:
         explanation = (
             "🧠 **Simple explanation of what you see:**\n\n"
-            "
-            "
-            "
-            "**
-            "
-            "
-            "
-            "
+            "**Step 0 – Input image**: This is your original picture.\n\n"
+            "**Step 1 – Early layer heatmap**:\n"
+            "YOLO looks for very small details like edges, corners, and simple textures.\n\n"
+            "**Step 2 – Middle layer heatmap**:\n"
+            "It starts to see groups of pixels as shapes or parts of objects (like wheels, faces, etc.).\n\n"
+            "**Step 3 – Late layer heatmap**:\n"
+            "It focuses on whole objects and regions where it thinks something important is.\n\n"
+            "**Step 4 – Final detections**:\n"
+            "YOLO draws boxes and labels around what it believes are objects in the image.\n"
        )
     else:
         explanation = (
-            "🔬 **Technical explanation:**\n\n"
-            "- We
-            "- Forward hooks capture intermediate feature maps from
-            "- For each selected layer, we take the tensor `(C,H,W)
-            "  activation map `(H,W)`, normalize it
-            "- Early feature map → low-level features (edges, textures).\n"
-            "- Middle feature map → mid-level features (parts
-            "- Late feature map → high-level features (object-centric regions
+            "🔬 **Technical explanation of the visualization:**\n\n"
+            "- We use **YOLOv8n** (Ultralytics) running on CPU.\n"
+            "- Forward hooks capture intermediate feature maps from backbone/head blocks.\n"
+            "- For each selected layer, we take the tensor `(C,H,W)` and average over channels to\n"
+            "  obtain a 2D activation map `(H,W)`, then normalize it and upsample it to `(W_img,H_img)`.\n"
+            "- Early feature map → low-level features (edges, corners, local textures).\n"
+            "- Middle feature map → mid-level features (object parts & shapes).\n"
+            "- Late feature map → high-level features (object-centric regions that drive detection head).\n"
+            "- The detection image is produced by YOLO's standard post-processing (objectness, class\n"
+            "  scores, and Non-Maximum Suppression on bounding boxes).\n"
         )
 
-    #
-
-
-
-
-    explanation += "\n**Feature map shapes captured:**\n" + "\n".join(f"- {s}" for s in fm_shapes_info)
+    # Add feature map shapes
+    if chosen:
+        explanation += "\n**Captured feature map shapes (C,H,W):**\n"
+        for name, fm in chosen:
+            explanation += f"- Layer {name}: {tuple(fm.shape)}\n"
 
     return det_img, heatmaps[0], heatmaps[1], heatmaps[2], explanation
 
 
-# ------------------- GRADIO UI
+# ------------------- GRADIO UI -------------------
 
-with gr.Blocks(
-    title="YOLOv5n Visualizer – Inside Object Detection",
-    theme=gr.themes.Soft()
-) as demo:
+with gr.Blocks(title="YOLOv8n Visualizer – Inside Object Detection") as demo:
 
-    gr.Markdown("# 🧠
+    gr.Markdown("# 🧠 YOLOv8n Visualizer – Inside Object Detection")
     gr.Markdown(
-        "
-        "
-        "
-        "
-        "
-        "
+        "See what happens **inside** an object detection model.\n\n"
+        "**Steps shown:**\n"
+        "- **Step 0** – Input image\n"
+        "- **Step 1** – Early layer activation (edges & textures)\n"
+        "- **Step 2** – Middle layer activation (parts & shapes)\n"
+        "- **Step 3** – Late layer activation (objects)\n"
+        "- **Step 4** – Final detections (boxes & labels)\n"
     )
 
     with gr.Row():
@@ -218,29 +239,47 @@ with gr.Blocks(
                 type="pil"
             )
             conf_slider = gr.Slider(
-                0.1,
+                minimum=0.1,
+                maximum=0.9,
+                step=0.05,
+                value=0.25,
                 label="Confidence threshold"
             )
             iou_slider = gr.Slider(
-                0.1,
-
+                minimum=0.1,
+                maximum=0.9,
+                step=0.05,
+                value=0.45,
+                label="IoU threshold (NMS)"
             )
             simple_ck = gr.Checkbox(
-                label="Explain in simple terms (kids/elders)",
+                label="Explain in simple terms (for kids/elders)",
                 value=True
            )
             run_btn = gr.Button("Run YOLO & Visualize", variant="primary")
 
         with gr.Column(scale=1):
-            out_det = gr.Image(
+            out_det = gr.Image(
+                label="Step 4 – Final detections (YOLOv8n)",
+                interactive=False
+            )
             explanation_md = gr.Markdown(label="Explanation")
 
-    gr.Markdown("### 🔍 Steps
+    gr.Markdown("### 🔍 Steps 1–3: internal feature maps (what the network focuses on)")
 
     with gr.Row():
-        fm1 = gr.Image(
-
-
+        fm1 = gr.Image(
+            label="Step 1 – Early layer activation (edges & textures)",
+            interactive=False
+        )
+        fm2 = gr.Image(
+            label="Step 2 – Middle layer activation (parts & shapes)",
+            interactive=False
+        )
+        fm3 = gr.Image(
+            label="Step 3 – Late layer activation (objects)",
+            interactive=False
+        )
 
     run_btn.click(
         analyze_yolo,
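The hook factory added in load_model() is the heart of this change: every block in model.model.model gets a forward hook that stores its output under a stringified layer index. The same pattern can be exercised on its own; below is a minimal sketch, where the toy nn.Sequential and the `captured` dict are illustrative stand-ins rather than anything from app.py:

```python
import torch
import torch.nn as nn

captured = {}  # plays the role of FEATURE_MAPS: {layer_name: tensor}

def make_hook(name):
    # The factory makes each hook close over its own `name`; a plain
    # `def hook(...)` inside the loop would late-bind the loop variable
    # and every hook would overwrite the same key.
    def hook(module, inputs, output):
        if isinstance(output, torch.Tensor):
            captured[name] = output.detach().cpu()
    return hook

net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU(), nn.Conv2d(8, 16, 3))
for idx, layer in enumerate(net):
    layer.register_forward_hook(make_hook(str(idx)))

with torch.no_grad():
    net(torch.randn(1, 3, 64, 64))

print({k: tuple(v.shape) for k, v in captured.items()})
# {'0': (1, 8, 62, 62), '1': (1, 8, 62, 62), '2': (1, 16, 60, 60)}
```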
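tensor_to_heatmap boils a (C,H,W) activation down to one grayscale image in three arithmetic steps: channel mean, min/max normalization, byte conversion. The same arithmetic on a toy array, with intermediate values written out (the numbers are made up for illustration):

```python
import numpy as np

# two identical channels -> shape (C=2, H=2, W=2)
fm_np = np.array([[[1.0, 2.0], [3.0, 5.0]],
                  [[1.0, 2.0], [3.0, 5.0]]], dtype=np.float32)

heat = fm_np.mean(axis=0)            # [[1, 2], [3, 5]]      one value per pixel
heat -= heat.min()                   # [[0, 1], [2, 4]]      shift minimum to 0
heat /= heat.max()                   # [[0, 0.25], [0.5, 1]] scale maximum to 1
img = (heat * 255).astype("uint8")   # [[0, 63], [127, 255]] truncating cast
```

Passing img to Image.fromarray(..., mode="L") yields the grayscale tile, and the Image.NEAREST resize keeps the coarse per-cell blocks visible instead of smoothing them away.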
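The inference path itself (YOLO("yolov8n.pt"), then results[0].plot()) is the standard Ultralytics API. A self-contained sketch outside Gradio, assuming ultralytics is installed and an image exists at the hypothetical path test.jpg:

```python
from PIL import Image
from ultralytics import YOLO

model = YOLO("yolov8n.pt")  # downloads and caches the weights on first use

results = model(Image.open("test.jpg"), conf=0.25, iou=0.45, verbose=False)
res = results[0]

# plot() returns the annotated frame as a numpy array in BGR order;
# reversing the last axis gives correct colors in PIL (app.py skips this,
# hence the "visually OK" comment in the diff).
Image.fromarray(res.plot()[..., ::-1]).save("detections.jpg")

for box in res.boxes:  # one entry per detection surviving NMS
    print(res.names[int(box.cls)], float(box.conf))
```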