# ==========================================================
# YOLOv8n Visualizer — Inside Object Detection (Advanced)
# - Uses Ultralytics YOLOv8n (small, CPU-friendly)
# - Step 0: Input image
# - Step 1: Early feature activation (edges/textures)
# - Step 2: Middle feature activation (parts/shapes)
# - Step 3: Late feature activation (objects)
# - Step 4: Final detections (boxes + labels)
# - Activation-CAM overlay (late layer heatmap on image)
# - Channel explorer for late layer (view individual channels)
# ==========================================================

import gradio as gr
import torch
import numpy as np
from PIL import Image
from ultralytics import YOLO

# ------------------- GLOBALS -------------------
DEVICE = "cpu"
MODEL = None
FEATURE_MAPS = {}  # {layer_name: tensor(B,C,H,W)}


# ------------------- MODEL LOADING -------------------
def _make_capture_hook(name):
    """Return a forward hook that stores a layer's output in FEATURE_MAPS[name]."""

    def hook(module, inputs, output):
        out = output
        # Some layers return a list/tuple; keep the first tensor found in it.
        if isinstance(out, (list, tuple)):
            out = next((o for o in out if isinstance(o, torch.Tensor)), None)
        if isinstance(out, torch.Tensor):
            # FEATURE_MAPS is looked up at call time, so rebinding the global
            # in load_model() is picked up here.
            FEATURE_MAPS[name] = out.detach().cpu()

    return hook


def load_model():
    """
    Load YOLOv8n once and register forward hooks on every module of
    ``model.model.model`` so their outputs are captured in FEATURE_MAPS.

    Returns:
        The cached YOLO model (loaded on first call, reused afterwards).
    """
    global MODEL, FEATURE_MAPS
    if MODEL is not None:
        return MODEL

    model = YOLO("yolov8n.pt")

    # ensure on CPU
    if hasattr(model, "to"):
        model.to(DEVICE)
    else:
        model.model.to(DEVICE)
    model.model.eval()

    FEATURE_MAPS = {}

    # model.model.model is the list of modules (backbone + head);
    # hooks are keyed by the stringified module index.
    for idx, layer in enumerate(model.model.model):
        layer.register_forward_hook(_make_capture_hook(str(idx)))

    MODEL = model
    return MODEL


# ------------------- FEATURE MAP UTILITIES -------------------
def _normalize_01(arr):
    """
    Return a float32 copy of *arr* min-max scaled to [0, 1].

    An all-zero input yields zeros; a constant input also yields zeros
    (after subtracting the minimum there is nothing left to scale).
    The input array is never modified.
    """
    out = np.array(arr, dtype=np.float32)  # np.array always copies here
    if not np.any(out):
        return np.zeros_like(out)
    out -= out.min()
    peak = out.max()
    if peak > 0:
        out /= peak
    return out


def _gray_image_from_01(arr_01, out_size):
    """Turn a 2-D array in [0, 1] into a grayscale PIL image of *out_size* (W, H)."""
    # A 2-D uint8 array is interpreted by Pillow as mode "L".
    gray = Image.fromarray((arr_01 * 255).astype("uint8"))
    return gray.resize(out_size, Image.NEAREST)


def heat_array_from_fm(fm):
    """
    Average a (C,H,W) feature map over channels and normalize to 0..1.

    Args:
        fm: torch tensor or numpy array of shape (C,H,W).

    Returns:
        float32 numpy array (H,W) with values in [0, 1].
    """
    fm_np = fm.numpy() if isinstance(fm, torch.Tensor) else np.asarray(fm)
    return _normalize_01(fm_np.astype(np.float32).mean(axis=0))


def tensor_to_heatmap(fm, out_size):
    """
    Convert a feature map tensor (C,H,W) to a grayscale heatmap PIL image.

    Args:
        fm: feature map with exactly three dimensions (C,H,W).
        out_size: target (width, height) for the returned image.

    Returns:
        A mode-"L" PIL image, or None when *fm* is not 3-dimensional.
    """
    if fm.ndim != 3:
        return None
    return _gray_image_from_01(heat_array_from_fm(fm), out_size)


def pick_feature_maps():
    """
    Choose up to three feature maps: early, middle, late.

    FEATURE_MAPS keys are stringified indices "0", "1", ...; only 4-D
    tensors are considered, and the first batch element is taken.

    Returns:
        list[(name, fm_tensor(C,H,W))] ordered early → late; fewer than
        three entries when fewer distinct maps are available.
    """
    if not FEATURE_MAPS:
        return []

    candidates = [
        (key, FEATURE_MAPS[key][0])
        for key in sorted(FEATURE_MAPS, key=int)
        if isinstance(FEATURE_MAPS[key], torch.Tensor) and FEATURE_MAPS[key].ndim == 4
    ]
    if not candidates:
        return []

    # First, middle and last; the set removes duplicates for short lists.
    positions = sorted({0, len(candidates) // 2, len(candidates) - 1})
    return [candidates[i] for i in positions]


def make_cam_overlay(base_pil, heat_01, alpha=0.45):
    """
    Build a simple activation-CAM overlay (heatmap over image).

    Args:
        base_pil: PIL image to draw on; converted to RGB so that RGBA,
            grayscale or palette images blend correctly with the 3-channel
            colour map.
        heat_01: numpy (H_fm, W_fm) in [0,1]; resized to the image size.
        alpha: blend weight of the colour map (0 = image only, 1 = map only).

    Returns:
        RGB PIL image with the blue→red heatmap blended over the input.
    """
    base = np.asarray(base_pil.convert("RGB"), dtype=np.float32) / 255.0  # H,W,3
    h, w = base.shape[:2]

    heat_img = Image.fromarray((heat_01 * 255).astype("uint8")).resize(
        (w, h), Image.BILINEAR
    )
    heat = np.asarray(heat_img, dtype=np.float32) / 255.0  # H,W

    # simple blue→red colormap: red rises with activation, blue falls
    cam = np.stack([heat, np.zeros_like(heat), 1.0 - heat], axis=-1)  # H,W,3

    blended = (1 - alpha) * base + alpha * cam
    blended = np.clip(blended * 255.0, 0, 255).astype("uint8")
    return Image.fromarray(blended)


def single_channel_heatmap(channel_2d, out_size):
    """
    Convert a 2D channel (numpy array) to a grayscale PIL heatmap.

    Args:
        channel_2d: numpy array (H,W) with raw activations.
        out_size: target (width, height) for the returned image.

    Returns:
        A mode-"L" PIL image, min-max normalized and resized with NEAREST.
    """
    return _gray_image_from_01(_normalize_01(channel_2d), out_size)


# ------------------- MAIN ANALYSIS FUNCTION -------------------
def analyze_yolo(img, conf_thres, iou_thres, simple_mode):
    """
    Run YOLOv8n on the input image and produce, in this order:
      - detection image with boxes
      - early / mid / late feature map heatmaps
      - activation-CAM overlay
      - explanation markdown
      - channel info markdown
      - channel slider update
      - default channel heatmap
      - state dict for the channel explorer

    Args:
        img: PIL image from the UI, or None when nothing was uploaded.
        conf_thres: confidence threshold passed to the model.
        iou_thres: IoU threshold (NMS) passed to the model.
        simple_mode: True for the simple explanation, False for the technical one.

    Returns:
        A 10-tuple matching the outputs wired to the "Run" button.
    """
    if img is None:
        return (
            None,  # det img
            None,  # early
            None,  # mid
            None,  # late
            None,  # cam overlay
            "⚠️ Please upload an image first.",
            "",  # channel info
            gr.update(maximum=0, value=0),
            None,  # channel heatmap
            {},  # state
        )

    model = load_model()
    FEATURE_MAPS.clear()  # drop maps captured by a previous run

    pil = img
    conf = float(conf_thres)
    iou = float(iou_thres)

    with torch.no_grad():
        results = model(pil, conf=conf, iou=iou, verbose=False)
    res = results[0]

    # Results.plot() returns a BGR-ordered array; reverse the channel axis
    # so the PIL image shows the true colours.
    det_bgr = res.plot()  # numpy HWC
    det_img = Image.fromarray(np.ascontiguousarray(det_bgr[..., ::-1]))

    chosen = pick_feature_maps()
    W, H = pil.size

    heatmaps = [None, None, None]
    for slot, (_, fm) in enumerate(chosen):  # fm: (C,H,W)
        heatmaps[slot] = tensor_to_heatmap(fm, (W, H))

    late_fm_np = None
    late_name = None
    if chosen:
        # The last chosen map is the "late" layer.
        late_name, late_fm = chosen[-1]
        late_fm_np = late_fm.numpy().astype(np.float32)  # (C,H,W)

    # Activation-CAM overlay (using late feature map mean)
    cam_overlay = None
    channel_slider_update = gr.update(maximum=0, value=0)
    channel_info = ""
    channel_heatmap_img = None
    state = {}

    if late_fm_np is not None:
        C, H_fm, W_fm = late_fm_np.shape
        cam_overlay = make_cam_overlay(pil, heat_array_from_fm(late_fm_np))

        # Channel explorer: compute mean abs activation per channel
        means = np.mean(np.abs(late_fm_np), axis=(1, 2))  # (C,)
        order = np.argsort(means)[::-1]
        top_k = order[: min(8, C)].tolist()

        channel_info = (
            f"Late layer **{late_name}** feature map: {C} channels of size {H_fm}×{W_fm}.\n"
            f"Top active channels (by mean |activation|): {top_k}"
        )

        # default channel = strongest
        default_ch = int(top_k[0]) if top_k else 0
        channel_slider_update = gr.update(maximum=C - 1, value=default_ch)

        # build heatmap for default channel
        channel_heatmap_img = single_channel_heatmap(late_fm_np[default_ch], (W, H))

        # state for slider changes
        state = {
            "late_fm": late_fm_np,
            "W": W,
            "H": H,
        }

    # Explanation
    if simple_mode:
        explanation = (
            "🧒 **Simple explanation of what you see:**\n\n"
            "- **Step 0 – Input image:** your original picture.\n"
            "- **Step 1 – Early layer heatmap:** the model sees edges and tiny details.\n"
            "- **Step 2 – Middle layer heatmap:** it starts seeing parts of objects and shapes.\n"
            "- **Step 3 – Late layer heatmap:** it focuses on full objects and important regions.\n"
            "- **Activation overlay:** colored map (blue→red) over the image showing *where* the model\n"
            " is looking the most in the final stage.\n"
            "- **Channel explorer:** each channel is like a tiny specialist (e.g., vertical lines,\n"
            " corners, or specific textures). You can slide through channels to see different patterns.\n"
        )
    else:
        explanation = (
            "🔬 **Technical explanation:**\n\n"
            "- We run **YOLOv8n** (Ultralytics) on CPU.\n"
            "- Forward hooks capture internal feature maps from several backbone/head blocks.\n"
            "- For each chosen layer, we take `(C,H,W)` and average over channels to get a 2D activation\n"
            " map `(H,W)`, normalize it, and upsample it to image resolution.\n"
            "- Early ≈ low-level features; Middle ≈ mid-level parts; Late ≈ high-level object-centric\n"
            " features.\n"
            "- The activation overlay is a CAM-style visualization built from the **mean late-layer\n"
            " activation**, colored and blended with the original image (not full gradient-based Grad-CAM,\n"
            " but an activation-based approximation).\n"
            "- In the channel explorer, channels are ranked by mean |activation|, and you can inspect each\n"
            " channel separately as a grayscale map, revealing different spatial patterns.\n"
        )

    # Add feature map shapes if we have them
    if chosen:
        explanation += "\n**Captured feature map shapes (C,H,W):**\n"
        explanation += "".join(
            f"- Layer {name}: {tuple(fm.shape)}\n" for name, fm in chosen
        )

    return (
        det_img,
        heatmaps[0],
        heatmaps[1],
        heatmaps[2],
        cam_overlay,
        explanation,
        channel_info,
        channel_slider_update,
        channel_heatmap_img,
        state,
    )


# ------------------- CHANNEL SLIDER UPDATE -------------------
def update_channel(state, ch_idx):
    """
    When the slider moves, update the channel heatmap (late layer).

    Args:
        state: dict produced by analyze_yolo with keys "late_fm", "W", "H";
            may be None or empty before the first run.
        ch_idx: slider value; out-of-range or missing values fall back to 0.

    Returns:
        A gr.update carrying the new heatmap image (or None without state).
    """
    if not state or "late_fm" not in state:
        return gr.update(value=None)

    late_fm = state["late_fm"]  # (C,H,W)
    n_channels = late_fm.shape[0]

    idx = int(ch_idx) if ch_idx is not None else 0
    if not 0 <= idx < n_channels:
        idx = 0

    img = single_channel_heatmap(late_fm[idx], (state["W"], state["H"]))
    return gr.update(value=img)


# ------------------- GRADIO UI -------------------
with gr.Blocks(title="YOLOv8n Visualizer — Inside Object Detection (Advanced)") as demo:
    gr.Markdown("# 🧠 YOLOv8n Visualizer — Inside Object Detection (Advanced)")
    gr.Markdown(
        "Explore what happens **inside** an object detection model.\n\n"
        "**Steps shown:**\n"
        "- **Step 0** — Input image\n"
        "- **Step 1** — Early layer activation (edges & textures)\n"
        "- **Step 2** — Middle layer activation (parts & shapes)\n"
        "- **Step 3** — Late layer activation (objects)\n"
        "- **Step 4** — Final detections (boxes & labels)\n"
        "- **Activation overlay** — CAM-style heatmap over the image\n"
        "- **Channel explorer** — inspect individual channels in the late layer\n"
    )

    with gr.Row():
        with gr.Column(scale=1):
            in_img = gr.Image(
                label="Step 0 — Input image",
                type="pil"
            )
            conf_slider = gr.Slider(
                minimum=0.1,
                maximum=0.9,
                step=0.05,
                value=0.25,
                label="Confidence threshold"
            )
            iou_slider = gr.Slider(
                minimum=0.1,
                maximum=0.9,
                step=0.05,
                value=0.45,
                label="IoU threshold (NMS)"
            )
            simple_ck = gr.Checkbox(
                label="Explain in simple terms (kids/elders)",
                value=True
            )
            run_btn = gr.Button("Run YOLO & Visualize", variant="primary")

        with gr.Column(scale=1):
            out_det = gr.Image(
                label="Step 4 — Final detections (YOLOv8n)",
                interactive=False
            )
            cam_img = gr.Image(
                label="Activation overlay (late layer focus)",
                interactive=False
            )
            explanation_md = gr.Markdown(label="Explanation")

    gr.Markdown("### 🔍 Steps 1–3: internal feature maps (what the network focuses on)")
    with gr.Row():
        fm1 = gr.Image(
            label="Step 1 — Early layer activation (edges & textures)",
            interactive=False
        )
        fm2 = gr.Image(
            label="Step 2 — Middle layer activation (parts & shapes)",
            interactive=False
        )
        fm3 = gr.Image(
            label="Step 3 — Late layer activation (objects)",
            interactive=False
        )

    gr.Markdown("### 🔬 Channel explorer (late layer)")
    channel_info_md = gr.Markdown()
    channel_slider = gr.Slider(
        minimum=0,
        maximum=0,
        step=1,
        value=0,
        label="Channel index (late layer)"
    )
    channel_heatmap = gr.Image(
        label="Selected channel heatmap (grayscale)",
        interactive=False
    )

    # Carries the late-layer feature map between the run and slider events.
    state = gr.State()

    run_btn.click(
        analyze_yolo,
        inputs=[in_img, conf_slider, iou_slider, simple_ck],
        outputs=[
            out_det,
            fm1,
            fm2,
            fm3,
            cam_img,
            explanation_md,
            channel_info_md,
            channel_slider,
            channel_heatmap,
            state,
        ],
    )

    channel_slider.change(
        update_channel,
        inputs=[state, channel_slider],
        outputs=[channel_heatmap],
    )

demo.launch()