Spaces:

PraneshJs
/

InsideYolo

Paused

App Files Files Community

PraneshJs committed on Dec 4, 2025

Commit

34a6738

verified ·

1 Parent(s): 64245e7

Update app.py

Browse files

Files changed (1) hide show

app.py +232 -65

app.py CHANGED Viewed

@@ -1,9 +1,13 @@
 # ==========================================================
-#  YOLOv8n Visualizer — "Inside Object Detection"
 #  - Uses Ultralytics YOLOv8n (small, CPU-friendly)
-#  - Shows detections + early/mid/late feature maps
-#  - Simple vs Technical explanation
-#  - Gradio 5 compatible, also OK on 6 (no theme arg)
 # ==========================================================
 import gradio as gr
@@ -31,29 +35,24 @@ def load_model():
     if MODEL is not None:
         return MODEL
-    # This will download yolov8n.pt on first run and cache it
     model = YOLO("yolov8n.pt")
-    # Ensure model on CPU
     if hasattr(model, "to"):
         model.to(DEVICE)
     else:
         model.model.to(DEVICE)
     model.model.eval()
     FEATURE_MAPS = {}
-    # Register hooks on layers in the detection model
-    # For YOLOv8, model.model.model is a list of blocks (backbone + head)
     for idx, layer in enumerate(model.model.model):
         def make_hook(name):
             def hook(module, inputs, output):
-                # Handle tensors vs lists/tuples
                 with torch.no_grad():
                     out = output
                     if isinstance(out, (list, tuple)):
-                        # pick first tensor-like element
                         out = next(
                             (o for o in out if isinstance(o, torch.Tensor)),
                             None
@@ -73,14 +72,11 @@ def load_model():
 def tensor_to_heatmap(fm, out_size):
     """
     Convert a feature map tensor (C,H,W) to a grayscale heatmap PIL image.
-      - average over channels
-      - normalize to 0..1
-      - resize to out_size (W,H)
     """
     if fm.ndim != 3:
         return None
-    fm_np = fm.numpy().astype(np.float32)  # (C,H,W)
     heat = fm_np.mean(axis=0)  # (H,W)
     if not np.any(heat):
@@ -97,6 +93,22 @@ def tensor_to_heatmap(fm, out_size):
     return pil
 def pick_feature_maps():
     """
     Choose three feature maps: early, middle, late.
@@ -106,7 +118,6 @@ def pick_feature_maps():
     if not FEATURE_MAPS:
         return []
-    # sort by numeric layer index
     keys = sorted(FEATURE_MAPS.keys(), key=lambda x: int(x))
     fms = []
     for k in keys:
@@ -126,6 +137,50 @@ def pick_feature_maps():
     return chosen
 # ------------------- MAIN ANALYSIS FUNCTION -------------------
 def analyze_yolo(img, conf_thres, iou_thres, simple_mode):
@@ -133,103 +188,177 @@ def analyze_yolo(img, conf_thres, iou_thres, simple_mode):
     Run YOLOv8n on input image and produce:
       - detection image with boxes
       - early/mid/late feature map heatmaps
-      - explanation text (simple or technical)
     """
     if img is None:
         return (
-            None,  # detection image
-            None,  # early heatmap
-            None,  # mid heatmap
-            None,  # late heatmap
-            "⚠️ Please upload an image first."
         )
     model = load_model()
-    # Clear old feature maps before forward
     FEATURE_MAPS.clear()
-    # Gradio gives PIL image (type="pil")
     pil = img
-    # Configure thresholds
     conf = float(conf_thres)
     iou = float(iou_thres)
     with torch.no_grad():
-        results = model(
-            pil,
-            conf=conf,
-            iou=iou,
-            verbose=False
-        )
     res = results[0]
-    # res.plot() returns numpy array (H,W,3), BGR by default, but visually OK
-    det_np = res.plot()
     det_img = Image.fromarray(det_np)
-    # Now FEATURE_MAPS should be filled by hooks
     chosen = pick_feature_maps()
     W, H = pil.size
     heatmaps = [None, None, None]
     for idx, item in enumerate(chosen):
-        name, fm = item
         hm = tensor_to_heatmap(fm, (W, H))
         heatmaps[idx] = hm
-    # Build explanation
     if simple_mode:
         explanation = (
             "🧒 **Simple explanation of what you see:**\n\n"
-            "**Step 0 — Input image**: This is your original picture.\n\n"
-            "**Step 1 — Early layer heatmap**:\n"
-            "YOLO looks for very small details like edges, corners, and simple textures.\n\n"
-            "**Step 2 — Middle layer heatmap**:\n"
-            "It starts to see groups of pixels as shapes or parts of objects (like wheels, faces, etc.).\n\n"
-            "**Step 3 — Late layer heatmap**:\n"
-            "It focuses on whole objects and regions where it thinks something important is.\n\n"
-            "**Step 4 — Final detections**:\n"
-            "YOLO draws boxes and labels around what it believes are objects in the image.\n"
         )
     else:
         explanation = (
-            "🔬 **Technical explanation of the visualization:**\n\n"
-            "- We use **YOLOv8n** (Ultralytics) running on CPU.\n"
-            "- Forward hooks capture intermediate feature maps from backbone/head blocks.\n"
-            "- For each selected layer, we take the tensor `(C,H,W)` and average over channels to\n"
-            "  obtain a 2D activation map `(H,W)`, then normalize it and upsample it to `(W_img,H_img)`.\n"
-            "- Early feature map ≈ low-level features (edges, corners, local textures).\n"
-            "- Middle feature map ≈ mid-level features (object parts & shapes).\n"
-            "- Late feature map ≈ high-level features (object-centric regions that drive detection head).\n"
-            "- The detection image is produced by YOLO's standard post-processing (objectness, class\n"
-            "  scores, and Non-Maximum Suppression on bounding boxes).\n"
         )
-    # Add feature map shapes
     if chosen:
         explanation += "\n**Captured feature map shapes (C,H,W):**\n"
         for name, fm in chosen:
             explanation += f"- Layer {name}: {tuple(fm.shape)}\n"
-    return det_img, heatmaps[0], heatmaps[1], heatmaps[2], explanation
 # ------------------- GRADIO UI -------------------
-with gr.Blocks(title="YOLOv8n Visualizer — Inside Object Detection") as demo:
-    gr.Markdown("# 🧠 YOLOv8n Visualizer — Inside Object Detection")
     gr.Markdown(
-        "See what happens **inside** an object detection model.\n\n"
         "**Steps shown:**\n"
         "- **Step 0** — Input image\n"
         "- **Step 1** — Early layer activation (edges & textures)\n"
         "- **Step 2** — Middle layer activation (parts & shapes)\n"
         "- **Step 3** — Late layer activation (objects)\n"
         "- **Step 4** — Final detections (boxes & labels)\n"
     )
     with gr.Row():
@@ -253,7 +382,7 @@ with gr.Blocks(title="YOLOv8n Visualizer — Inside Object Detection") as demo:
                 label="IoU threshold (NMS)"
             )
             simple_ck = gr.Checkbox(
-                label="Explain in simple terms (for kids/elders)",
                 value=True
             )
             run_btn = gr.Button("Run YOLO & Visualize", variant="primary")
@@ -263,6 +392,10 @@ with gr.Blocks(title="YOLOv8n Visualizer — Inside Object Detection") as demo:
                 label="Step 4 — Final detections (YOLOv8n)",
                 interactive=False
             )
             explanation_md = gr.Markdown(label="Explanation")
     gr.Markdown("### 🔍 Steps 1–3: internal feature maps (what the network focuses on)")
@@ -281,10 +414,44 @@ with gr.Blocks(title="YOLOv8n Visualizer — Inside Object Detection") as demo:
             interactive=False
         )
     run_btn.click(
         analyze_yolo,
         inputs=[in_img, conf_slider, iou_slider, simple_ck],
-        outputs=[out_det, fm1, fm2, fm3, explanation_md]
     )
 demo.launch()

 # ==========================================================
+#  YOLOv8n Visualizer — Inside Object Detection (Advanced)
 #  - Uses Ultralytics YOLOv8n (small, CPU-friendly)
+#  - Step 0: Input image
+#  - Step 1: Early feature activation (edges/textures)
+#  - Step 2: Middle feature activation (parts/shapes)
+#  - Step 3: Late feature activation (objects)
+#  - Step 4: Final detections (boxes + labels)
+#  - Activation-CAM overlay (late layer heatmap on image)
+#  - Channel explorer for late layer (view individual channels)
 # ==========================================================
 import gradio as gr
     if MODEL is not None:
         return MODEL
     model = YOLO("yolov8n.pt")
+    # ensure on CPU
     if hasattr(model, "to"):
         model.to(DEVICE)
     else:
         model.model.to(DEVICE)
     model.model.eval()
     FEATURE_MAPS = {}
+    # model.model.model is the list of modules (backbone + head)
     for idx, layer in enumerate(model.model.model):
         def make_hook(name):
             def hook(module, inputs, output):
                 with torch.no_grad():
                     out = output
                     if isinstance(out, (list, tuple)):
                         out = next(
                             (o for o in out if isinstance(o, torch.Tensor)),
                             None
 def tensor_to_heatmap(fm, out_size):
     """
     Convert a feature map tensor (C,H,W) to a grayscale heatmap PIL image.
     """
     if fm.ndim != 3:
         return None
+    fm_np = fm.numpy().astype(np.float32)
     heat = fm_np.mean(axis=0)  # (H,W)
     if not np.any(heat):
     return pil
def heat_array_from_fm(fm):
    """
    Collapse a feature map into a 2D activation map scaled into 0..1.

    The leading (channel) axis is averaged away and the result is min-max
    normalized. The normalized numpy array is returned as-is; no PIL image
    is built and no resizing happens here.

    Args:
        fm: feature map laid out as (C,H,W). Either a torch tensor or
            anything np.asarray accepts (e.g. a numpy array). Tensors are
            detached and copied to CPU float32 first, so tensors that
            require grad, live on another device, or use reduced precision
            are handled as well.

    Returns:
        float32 numpy array of shape (H,W). It is all zeros when the
        channel-averaged map is entirely zero or constant.
    """
    # Tensor.numpy() refuses tensors that require grad or are not on the CPU,
    # so normalize the tensor before converting. Array-likes skip this step.
    if hasattr(fm, "detach"):
        fm = fm.detach().cpu().float().numpy()
    fm_np = np.asarray(fm, dtype=np.float32)

    # mean() allocates a new array, so the in-place ops below never touch
    # the caller's data.
    heat = fm_np.mean(axis=0)
    if not np.any(heat):
        return np.zeros_like(heat)

    heat -= heat.min()
    peak = heat.max()
    if peak > 0:  # a constant map has peak == 0 after the shift; leave zeros
        heat /= peak
    return heat
 def pick_feature_maps():
     """
     Choose three feature maps: early, middle, late.
     if not FEATURE_MAPS:
         return []
     keys = sorted(FEATURE_MAPS.keys(), key=lambda x: int(x))
     fms = []
     for k in keys:
     return chosen
def make_cam_overlay(base_pil, heat_01, alpha=0.45):
    """
    Build a simple activation-CAM overlay (colored heatmap blended over image).

    Args:
        base_pil: PIL image to draw on. It is converted to RGB first, so
            RGBA, grayscale and palette images work too.
        heat_01: 2D numpy array (H_fm, W_fm) with values expected in [0,1].
            Values outside that range are clipped. The map is resized
            bilinearly to the size of base_pil.
        alpha: blend weight of the heatmap; (1 - alpha) is the weight of
            the base image. Defaults to 0.45.

    Returns:
        RGB PIL image with the same size as base_pil.
    """
    # Force exactly three channels; otherwise the blend with the (H,W,3)
    # color map below fails to broadcast for RGBA / "L" / "P" inputs.
    base = np.asarray(base_pil.convert("RGB"), dtype=np.float32) / 255.0
    h, w = base.shape[:2]

    # Clip before the uint8 cast so out-of-range values saturate instead of
    # wrapping around. A 2D uint8 array is read as mode "L" automatically.
    heat_u8 = (np.clip(heat_01, 0.0, 1.0) * 255).astype("uint8")
    heat_img = Image.fromarray(heat_u8).resize((w, h), Image.BILINEAR)
    heat = np.asarray(heat_img, dtype=np.float32) / 255.0  # H,W

    # simple blue→red colormap: low activation is blue, high is red
    cam = np.stack([heat, np.zeros_like(heat), 1.0 - heat], axis=-1)  # H,W,3

    blended = (1.0 - alpha) * base + alpha * cam
    blended = np.clip(blended * 255.0, 0, 255).astype("uint8")
    return Image.fromarray(blended)
def single_channel_heatmap(channel_2d, out_size):
    """
    Render one 2D feature-map channel as a grayscale PIL heatmap.

    The channel is min-max normalized to 0..1, scaled to 0..255 and resized
    with nearest-neighbour sampling, so individual cells stay blocky.

    Args:
        channel_2d: 2D numpy array holding a single channel.
        out_size: target size handed to PIL's resize; callers in this file
            pass (W, H) of the input image.

    Returns:
        Mode "L" PIL image of size out_size. It is black when the channel
        is entirely zero or constant.
    """
    values = channel_2d.astype(np.float32)  # astype copies; input is untouched
    if np.any(values):
        values = values - values.min()
        peak = values.max()
        if peak > 0:
            values = values / peak
    else:
        values = np.zeros_like(values)

    gray = Image.fromarray((values * 255).astype("uint8"), mode="L")
    return gray.resize(out_size, Image.NEAREST)
 # ------------------- MAIN ANALYSIS FUNCTION -------------------
 def analyze_yolo(img, conf_thres, iou_thres, simple_mode):
     Run YOLOv8n on input image and produce:
       - detection image with boxes
       - early/mid/late feature map heatmaps
+      - activation-CAM overlay
+      - channel explorer state
+      - explanation markdown
     """
     if img is None:
         return (
+            None,  # det img
+            None,  # early
+            None,  # mid
+            None,  # late
+            None,  # cam overlay
+            "⚠️ Please upload an image first.",
+            "",    # channel info
+            gr.update(maximum=0, value=0),
+            None,  # channel heatmap
+            {}     # state
         )
     model = load_model()
     FEATURE_MAPS.clear()
     pil = img
     conf = float(conf_thres)
     iou = float(iou_thres)
     with torch.no_grad():
+        results = model(pil, conf=conf, iou=iou, verbose=False)
     res = results[0]
+    det_np = res.plot()  # numpy HWC
     det_img = Image.fromarray(det_np)
     chosen = pick_feature_maps()
     W, H = pil.size
     heatmaps = [None, None, None]
+    late_fm_np = None
+    late_name = None
     for idx, item in enumerate(chosen):
+        name, fm = item  # fm: (C,H,W)
         hm = tensor_to_heatmap(fm, (W, H))
         heatmaps[idx] = hm
+        if idx == len(chosen) - 1:
+            late_fm_np = fm.numpy().astype(np.float32)  # (C,H,W)
+            late_name = name
+    # Activation-CAM overlay (using late feature map mean)
+    cam_overlay = None
+    channel_slider_update = gr.update(maximum=0, value=0)
+    channel_info = ""
+    channel_heatmap_img = None
+    state = {}
+    if late_fm_np is not None:
+        C, H_fm, W_fm = late_fm_np.shape
+        late_fm_tensor = torch.from_numpy(late_fm_np)
+        heat_01 = heat_array_from_fm(late_fm_tensor)
+        cam_overlay = make_cam_overlay(pil, heat_01)
+        # Channel explorer: compute mean abs activation per channel
+        means = np.mean(np.abs(late_fm_np), axis=(1, 2))  # (C,)
+        order = np.argsort(means)[::-1]
+        top_k = order[: min(8, C)].tolist()
+        channel_info = (
+            f"Late layer **{late_name}** feature map: {C} channels of size {H_fm}×{W_fm}.\n"
+            f"Top active channels (by mean |activation|): {top_k}"
+        )
+        # default channel = strongest
+        default_ch = int(top_k[0]) if top_k else 0
+        channel_slider_update = gr.update(maximum=C - 1, value=default_ch)
+        # build heatmap for default channel
+        default_ch_map = late_fm_np[default_ch]
+        channel_heatmap_img = single_channel_heatmap(default_ch_map, (W, H))
+        # state for slider changes
+        state = {
+            "late_fm": late_fm_np,
+            "W": W,
+            "H": H,
+        }
+    # Explanation
     if simple_mode:
         explanation = (
             "🧒 **Simple explanation of what you see:**\n\n"
+            "- **Step 0 – Input image:** your original picture.\n"
+            "- **Step 1 – Early layer heatmap:** the model sees edges and tiny details.\n"
+            "- **Step 2 – Middle layer heatmap:** it starts seeing parts of objects and shapes.\n"
+            "- **Step 3 – Late layer heatmap:** it focuses on full objects and important regions.\n"
+            "- **Activation overlay:** colored map (blue→red) over the image showing *where* the model\n"
+            "  is looking the most in the final stage.\n"
+            "- **Channel explorer:** each channel is like a tiny specialist (e.g., vertical lines,\n"
+            "  corners, or specific textures). You can slide through channels to see different patterns.\n"
         )
     else:
         explanation = (
+            "🔬 **Technical explanation:**\n\n"
+            "- We run **YOLOv8n** (Ultralytics) on CPU.\n"
+            "- Forward hooks capture internal feature maps from several backbone/head blocks.\n"
+            "- For each chosen layer, we take `(C,H,W)` and average over channels to get a 2D activation\n"
+            "  map `(H,W)`, normalize it, and upsample it to image resolution.\n"
+            "- Early ≈ low-level features; Middle ≈ mid-level parts; Late ≈ high-level object-centric\n"
+            "  features.\n"
+            "- The activation overlay is a CAM-style visualization built from the **mean late-layer\n"
+            "  activation**, colored and blended with the original image (not full gradient-based Grad-CAM,\n"
+            "  but an activation-based approximation).\n"
+            "- In the channel explorer, channels are ranked by mean |activation|, and you can inspect each\n"
+            "  channel separately as a grayscale map, revealing different spatial patterns.\n"
         )
+    # Add feature map shapes if we have them
     if chosen:
         explanation += "\n**Captured feature map shapes (C,H,W):**\n"
         for name, fm in chosen:
             explanation += f"- Layer {name}: {tuple(fm.shape)}\n"
+    return (
+        det_img,
+        heatmaps[0],
+        heatmaps[1],
+        heatmaps[2],
+        cam_overlay,
+        explanation,
+        channel_info,
+        channel_slider_update,
+        channel_heatmap_img,
+        state,
+    )
+# ------------------- CHANNEL SLIDER UPDATE -------------------
def update_channel(state, ch_idx):
    """
    Slider callback: redraw the late-layer heatmap for the chosen channel.

    Args:
        state: dict with "late_fm" (array indexed as (C,H,W)) plus "W" and
            "H", the output image size. May be empty or None before the
            first run.
        ch_idx: channel index from the slider. Out-of-range values fall
            back to channel 0.

    Returns:
        A gr.update carrying the new heatmap image, or value=None when no
        feature map is available in state.
    """
    # Nothing captured yet: clear the image instead of failing.
    if not state or "late_fm" not in state:
        return gr.update(value=None)

    feature_map = state["late_fm"]  # (C,H,W)
    target_size = (state["W"], state["H"])
    n_channels = feature_map.shape[0]

    channel = int(ch_idx)
    if not 0 <= channel < n_channels:
        channel = 0

    heatmap = single_channel_heatmap(feature_map[channel], target_size)
    return gr.update(value=heatmap)
 # ------------------- GRADIO UI -------------------
+with gr.Blocks(title="YOLOv8n Visualizer — Inside Object Detection (Advanced)") as demo:
+    gr.Markdown("# 🧠 YOLOv8n Visualizer — Inside Object Detection (Advanced)")
     gr.Markdown(
+        "Explore what happens **inside** an object detection model.\n\n"
         "**Steps shown:**\n"
         "- **Step 0** — Input image\n"
         "- **Step 1** — Early layer activation (edges & textures)\n"
         "- **Step 2** — Middle layer activation (parts & shapes)\n"
         "- **Step 3** — Late layer activation (objects)\n"
         "- **Step 4** — Final detections (boxes & labels)\n"
+        "- **Activation overlay** — CAM-style heatmap over the image\n"
+        "- **Channel explorer** — inspect individual channels in the late layer\n"
     )
     with gr.Row():
                 label="IoU threshold (NMS)"
             )
             simple_ck = gr.Checkbox(
+                label="Explain in simple terms (kids/elders)",
                 value=True
             )
             run_btn = gr.Button("Run YOLO & Visualize", variant="primary")
                 label="Step 4 — Final detections (YOLOv8n)",
                 interactive=False
             )
+            cam_img = gr.Image(
+                label="Activation overlay (late layer focus)",
+                interactive=False
+            )
             explanation_md = gr.Markdown(label="Explanation")
     gr.Markdown("### 🔍 Steps 1–3: internal feature maps (what the network focuses on)")
             interactive=False
         )
+    gr.Markdown("### 🔬 Channel explorer (late layer)")
+    channel_info_md = gr.Markdown()
+    channel_slider = gr.Slider(
+        minimum=0,
+        maximum=0,
+        step=1,
+        value=0,
+        label="Channel index (late layer)"
+    )
+    channel_heatmap = gr.Image(
+        label="Selected channel heatmap (grayscale)",
+        interactive=False
+    )
+    state = gr.State()
     run_btn.click(
         analyze_yolo,
         inputs=[in_img, conf_slider, iou_slider, simple_ck],
+        outputs=[
+            out_det,
+            fm1,
+            fm2,
+            fm3,
+            cam_img,
+            explanation_md,
+            channel_info_md,
+            channel_slider,
+            channel_heatmap,
+            state,
+        ],
+    )
+    channel_slider.change(
+        update_channel,
+        inputs=[state, channel_slider],
+        outputs=[channel_heatmap],
     )
 demo.launch()