ISR

Runtime error

Zhen Ye Claude Opus 4.6 (1M context) committed on Mar 14

Commit

5338c46

1 Parent(s): 6180bac

feat(inspection): add depth analysis and attention heatmap endpoints (Phase 2)

Backend:
- inspection/depth.py: on-demand DepthAnythingV2 inference with LRU caching,
viridis colorization, stats computation, raw/json/colorized response formats
- inspection/attention.py: GradCAM for DETR/GDINO, saliency for YOLO,
gaussian fallback, overlay generation, per-request caching
- inspection/router.py: GET /inspect/depth and GET /inspect/attention endpoints
- 28 new tests across depth and attention modules

Frontend:
- inspection-api.js: CORS fallback for depth binary format, explicit format params
- inspection-renders.js: depth legend with min/max meters, track depth stats,
attention legend with peak/avg intensity, improved alpha blending
- inspection.js: mode-specific loading messages

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (8) hide show

frontend/js/api/inspection-api.js +51 -15
frontend/js/ui/inspection-renders.js +185 -7
frontend/js/ui/inspection.js +24 -5
inspection/attention.py +537 -0
inspection/depth.py +212 -0
inspection/router.py +198 -0
tests/test_inspection_attention.py +380 -0
tests/test_inspection_depth.py +480 -0

frontend/js/api/inspection-api.js CHANGED Viewed

@@ -73,6 +73,7 @@ APP.api.inspection.generateMask = async function (jobId, frameIdx, trackId) {
  */
 APP.api.inspection.fetchDepth = async function (jobId, frameIdx) {
     const base = APP.core.state.hf.baseUrl;
     const url = `${base}/inspect/depth/${jobId}/${frameIdx}?format=raw`;
     const resp = await fetch(url);
     if (!resp.ok) throw new Error(`Depth fetch failed: ${resp.status}`);
@@ -80,30 +81,63 @@ APP.api.inspection.fetchDepth = async function (jobId, frameIdx) {
     const contentType = resp.headers.get("content-type") || "";
     if (contentType.includes("application/octet-stream")) {
-        // Binary float32 format
         const w = parseInt(resp.headers.get("X-Depth-Width"), 10);
         const h = parseInt(resp.headers.get("X-Depth-Height"), 10);
         const minD = parseFloat(resp.headers.get("X-Depth-Min"));
         const maxD = parseFloat(resp.headers.get("X-Depth-Max"));
         const buf = await resp.arrayBuffer();
-        return { width: w, height: h, min: minD, max: maxD, data: new Float32Array(buf) };
     } else {
         // JSON + base64 format
         const json = await resp.json();
-        const raw = atob(json.data_b64);
-        const buf = new ArrayBuffer(raw.length);
-        const view = new Uint8Array(buf);
-        for (let i = 0; i < raw.length; i++) view[i] = raw.charCodeAt(i);
-        return {
-            width: json.width,
-            height: json.height,
-            min: json.min_depth,
-            max: json.max_depth,
-            data: new Float32Array(buf)
-        };
     }
 };
 /**
  * Fetch attention heatmap for a specific track on a specific frame.
  * @param {string} jobId
@@ -113,7 +147,7 @@ APP.api.inspection.fetchDepth = async function (jobId, frameIdx) {
  */
 APP.api.inspection.fetchAttention = async function (jobId, frameIdx, trackId) {
     const base = APP.core.state.hf.baseUrl;
-    const url = `${base}/inspect/attention/${jobId}/${frameIdx}/${encodeURIComponent(trackId)}`;
     const resp = await fetch(url);
     if (!resp.ok) throw new Error(`Attention fetch failed: ${resp.status}`);
@@ -126,7 +160,9 @@ APP.api.inspection.fetchAttention = async function (jobId, frameIdx, trackId) {
     return {
         width: json.width,
         height: json.height,
-        data: new Float32Array(buf)
     };
 };

  */
 APP.api.inspection.fetchDepth = async function (jobId, frameIdx) {
     const base = APP.core.state.hf.baseUrl;
+    // Request the raw binary format first; if the X-Depth-* headers cannot be read (e.g. CORS hides them), the JSON format is requested instead further down
     const url = `${base}/inspect/depth/${jobId}/${frameIdx}?format=raw`;
     const resp = await fetch(url);
     if (!resp.ok) throw new Error(`Depth fetch failed: ${resp.status}`);
     const contentType = resp.headers.get("content-type") || "";
     if (contentType.includes("application/octet-stream")) {
+        // Binary float32 payload; width, height, min and max are read from the X-Depth-* response headers
         const w = parseInt(resp.headers.get("X-Depth-Width"), 10);
         const h = parseInt(resp.headers.get("X-Depth-Height"), 10);
         const minD = parseFloat(resp.headers.get("X-Depth-Min"));
         const maxD = parseFloat(resp.headers.get("X-Depth-Max"));
         const buf = await resp.arrayBuffer();
+        const data = new Float32Array(buf);
+        // A missing or unreadable width/height header parses to NaN; dimensions are NOT inferred from the data length
+        if (isNaN(w) || isNaN(h)) {
+            // Discard the binary payload and re-request the same frame with ?format=json
+            return await APP.api.inspection._fetchDepthJson(jobId, frameIdx);
+        }
+        return {
+            width: w,
+            height: h,
+            min: isNaN(minD) ? 0 : minD, // 0 when the X-Depth-Min header is unreadable
+            max: isNaN(maxD) ? 1 : maxD, // 1 when the X-Depth-Max header is unreadable
+            data: data
+        };
     } else {
         // Any other content type is parsed as the JSON + base64 format
         const json = await resp.json();
+        return APP.api.inspection._decodeDepthJson(json);
     }
 };
+/**
+ * Fallback: fetch depth in JSON format if raw binary headers are unavailable.
+ */
+APP.api.inspection._fetchDepthJson = async function (jobId, frameIdx) {
+    const base = APP.core.state.hf.baseUrl;
+    const url = `${base}/inspect/depth/${jobId}/${frameIdx}?format=json`;
+    const resp = await fetch(url);
+    if (!resp.ok) throw new Error(`Depth (JSON) fetch failed: ${resp.status}`);
+    const json = await resp.json();
+    return APP.api.inspection._decodeDepthJson(json);
+};
+/**
+ * Decode a JSON depth response to the standard { width, height, min, max, data } format.
+ */
+APP.api.inspection._decodeDepthJson = function (json) {
+    const raw = atob(json.data_b64);
+    const buf = new ArrayBuffer(raw.length);
+    const view = new Uint8Array(buf);
+    for (let i = 0; i < raw.length; i++) view[i] = raw.charCodeAt(i);
+    return {
+        width: json.width,
+        height: json.height,
+        min: json.min_depth,
+        max: json.max_depth,
+        data: new Float32Array(buf)
+    };
+};
 /**
  * Fetch attention heatmap for a specific track on a specific frame.
  * @param {string} jobId
  */
 APP.api.inspection.fetchAttention = async function (jobId, frameIdx, trackId) {
     const base = APP.core.state.hf.baseUrl;
+    const url = `${base}/inspect/attention/${jobId}/${frameIdx}/${encodeURIComponent(trackId)}?format=json`;
     const resp = await fetch(url);
     if (!resp.ok) throw new Error(`Attention fetch failed: ${resp.status}`);
     return {
         width: json.width,
         height: json.height,
+        data: new Float32Array(buf),
+        trackId: json.track_id || trackId,
+        frameIdx: json.frame_idx || frameIdx
     };
 };

frontend/js/ui/inspection-renders.js CHANGED Viewed

@@ -226,7 +226,7 @@ APP.ui.inspectionRenders._renderEdge = function (canvas, frameImg, edgeData, tra
 };
 /**
- * Render depth colormap (viridis-like palette).
  */
 APP.ui.inspectionRenders._renderDepth = function (canvas, frameImg, depthData, track) {
     if (!frameImg) return;
@@ -265,7 +265,14 @@ APP.ui.inspectionRenders._renderDepth = function (canvas, frameImg, depthData, t
     const id = img.data;
     for (let i = 0; i < dd.length; i++) {
-        const t = APP.core.utils.clamp((dd[i] - minD) / range, 0, 1);
         const rgb = APP.ui.inspectionRenders._viridis(t);
         const oi = i * 4;
         id[oi]     = rgb[0];
@@ -279,11 +286,113 @@ APP.ui.inspectionRenders._renderDepth = function (canvas, frameImg, depthData, t
     // Draw scaled to canvas size
     ctx.drawImage(depthCanvas, 0, 0, w, h);
     APP.ui.inspectionRenders._drawBBoxHighlight(ctx, track, w, h);
 };
 /**
- * Render attention heatmap overlaid on the base frame.
  */
 APP.ui.inspectionRenders._renderAttention = function (canvas, frameImg, attentionData, track) {
     if (!frameImg) return;
@@ -294,11 +403,13 @@ APP.ui.inspectionRenders._renderAttention = function (canvas, frameImg, attentio
     canvas.height = h;
     const ctx = canvas.getContext("2d");
-    // Draw base frame
     ctx.drawImage(frameImg, 0, 0);
     if (!attentionData || !attentionData.data) {
-        ctx.fillStyle = "rgba(0,0,0,0.6)";
         ctx.fillRect(0, 0, w, h);
         ctx.fillStyle = "#aaa";
         ctx.font = "14px monospace";
@@ -307,7 +418,7 @@ APP.ui.inspectionRenders._renderAttention = function (canvas, frameImg, attentio
         return;
     }
-    // Render attention as a hot colormap overlay
     const aw = attentionData.width;
     const ah = attentionData.height;
     const ad = attentionData.data;
@@ -319,20 +430,87 @@ APP.ui.inspectionRenders._renderAttention = function (canvas, frameImg, attentio
     const img = hctx.createImageData(aw, ah);
     const id = img.data;
     for (let i = 0; i < ad.length; i++) {
         const t = APP.core.utils.clamp(ad[i], 0, 1);
         const rgb = APP.ui.inspectionRenders._inferno(t);
         const oi = i * 4;
         id[oi]     = rgb[0];
         id[oi + 1] = rgb[1];
         id[oi + 2] = rgb[2];
-        id[oi + 3] = Math.round(t * 180); // semi-transparent based on intensity
     }
     hctx.putImageData(img, 0, 0);
     ctx.drawImage(heatCanvas, 0, 0, w, h);
     APP.ui.inspectionRenders._drawBBoxHighlight(ctx, track, w, h);
 };
 /**

 };
 /**
+ * Render depth colormap (viridis-like palette) with scale legend.
  */
 APP.ui.inspectionRenders._renderDepth = function (canvas, frameImg, depthData, track) {
     if (!frameImg) return;
     const id = img.data;
     for (let i = 0; i < dd.length; i++) {
+        const val = dd[i];
+        // Handle NaN/Inf values — render as black
+        if (!isFinite(val)) {
+            const oi = i * 4;
+            id[oi] = 0; id[oi + 1] = 0; id[oi + 2] = 0; id[oi + 3] = 255;
+            continue;
+        }
+        const t = APP.core.utils.clamp((val - minD) / range, 0, 1);
         const rgb = APP.ui.inspectionRenders._viridis(t);
         const oi = i * 4;
         id[oi]     = rgb[0];
     // Draw scaled to canvas size
     ctx.drawImage(depthCanvas, 0, 0, w, h);
+    // Draw bbox highlight for the selected track
     APP.ui.inspectionRenders._drawBBoxHighlight(ctx, track, w, h);
+    // Draw depth scale legend (vertical gradient bar on the right)
+    APP.ui.inspectionRenders._drawDepthLegend(ctx, w, h, minD, maxD);
+    // If track is selected, compute and show average depth in the bbox region
+    if (track && track.bbox && dw > 0 && dh > 0) {
+        APP.ui.inspectionRenders._drawTrackDepthStats(ctx, track, w, h, depthData);
+    }
+};
+/**
+ * Draw a vertical depth scale legend on the right side of the canvas.
+ */
+APP.ui.inspectionRenders._drawDepthLegend = function (ctx, canvasW, canvasH, minD, maxD) {
+    const barW = 16;
+    const barH = Math.min(180, canvasH - 60);
+    const x = canvasW - barW - 30;
+    const y = 30;
+    // Background panel
+    ctx.fillStyle = "rgba(0, 0, 0, 0.55)";
+    ctx.fillRect(x - 6, y - 20, barW + 52, barH + 40);
+    ctx.strokeStyle = "rgba(255, 255, 255, 0.15)";
+    ctx.lineWidth = 1;
+    ctx.strokeRect(x - 6, y - 20, barW + 52, barH + 40);
+    // Draw gradient bar
+    for (let py = 0; py < barH; py++) {
+        const t = py / (barH - 1); // 0 = top (near/min), 1 = bottom (far/max)
+        const rgb = APP.ui.inspectionRenders._viridis(t);
+        ctx.fillStyle = `rgb(${rgb[0]}, ${rgb[1]}, ${rgb[2]})`;
+        ctx.fillRect(x, y + py, barW, 1);
+    }
+    // Border around bar
+    ctx.strokeStyle = "rgba(255, 255, 255, 0.3)";
+    ctx.lineWidth = 1;
+    ctx.strokeRect(x, y, barW, barH);
+    // Labels
+    ctx.fillStyle = "rgba(255, 255, 255, 0.8)";
+    ctx.font = "10px monospace";
+    ctx.textAlign = "left";
+    ctx.fillText(`${minD.toFixed(1)}m`, x + barW + 4, y + 10);
+    ctx.fillText(`${maxD.toFixed(1)}m`, x + barW + 4, y + barH);
+    // Title
+    ctx.fillStyle = "rgba(255, 255, 255, 0.6)";
+    ctx.font = "9px monospace";
+    ctx.textAlign = "center";
+    ctx.fillText("DEPTH", x + barW / 2, y - 6);
+};
+/**
+ * Compute and display average depth within the selected track's bounding box.
+ */
+APP.ui.inspectionRenders._drawTrackDepthStats = function (ctx, track, canvasW, canvasH, depthData) {
+    const b = track.bbox;
+    const dw = depthData.width;
+    const dh = depthData.height;
+    const dd = depthData.data;
+    // Map normalized bbox to depth map coordinates
+    const x1 = Math.max(0, Math.floor(b.x * dw));
+    const y1 = Math.max(0, Math.floor(b.y * dh));
+    const x2 = Math.min(dw - 1, Math.floor((b.x + b.w) * dw));
+    const y2 = Math.min(dh - 1, Math.floor((b.y + b.h) * dh));
+    let sum = 0;
+    let count = 0;
+    let localMin = Infinity;
+    let localMax = -Infinity;
+    for (let py = y1; py <= y2; py++) {
+        for (let px = x1; px <= x2; px++) {
+            const val = dd[py * dw + px];
+            if (isFinite(val)) {
+                sum += val;
+                count++;
+                if (val < localMin) localMin = val;
+                if (val > localMax) localMax = val;
+            }
+        }
+    }
+    if (count === 0) return;
+    const avgDepth = sum / count;
+    // Draw stats label below the bbox
+    const bx = b.x * canvasW;
+    const by = (b.y + b.h) * canvasH;
+    ctx.fillStyle = "rgba(0, 0, 0, 0.7)";
+    ctx.fillRect(bx, by + 4, 160, 18);
+    ctx.fillStyle = "rgba(253, 231, 37, 0.9)"; // viridis yellow for contrast
+    ctx.font = "bold 11px monospace";
+    ctx.textAlign = "left";
+    ctx.fillText(`Depth: ${avgDepth.toFixed(1)}m (${localMin.toFixed(1)}-${localMax.toFixed(1)})`, bx + 4, by + 16);
 };
 /**
+ * Render attention heatmap (GradCAM) overlaid on the base frame.
+ * Uses inferno colormap with semi-transparent blending.
  */
 APP.ui.inspectionRenders._renderAttention = function (canvas, frameImg, attentionData, track) {
     if (!frameImg) return;
     canvas.height = h;
     const ctx = canvas.getContext("2d");
+    // Draw base frame (slightly dimmed to make heatmap more visible)
     ctx.drawImage(frameImg, 0, 0);
+    ctx.fillStyle = "rgba(0, 0, 0, 0.2)";
+    ctx.fillRect(0, 0, w, h);
     if (!attentionData || !attentionData.data) {
+        ctx.fillStyle = "rgba(0,0,0,0.5)";
         ctx.fillRect(0, 0, w, h);
         ctx.fillStyle = "#aaa";
         ctx.font = "14px monospace";
         return;
     }
+    // Render attention as inferno colormap overlay
     const aw = attentionData.width;
     const ah = attentionData.height;
     const ad = attentionData.data;
     const img = hctx.createImageData(aw, ah);
     const id = img.data;
+    // Track peak attention value for stats
+    let peakVal = 0;
+    let sumVal = 0;
     for (let i = 0; i < ad.length; i++) {
         const t = APP.core.utils.clamp(ad[i], 0, 1);
+        if (t > peakVal) peakVal = t;
+        sumVal += t;
         const rgb = APP.ui.inspectionRenders._inferno(t);
         const oi = i * 4;
         id[oi]     = rgb[0];
         id[oi + 1] = rgb[1];
         id[oi + 2] = rgb[2];
+        // Alpha: low values are very transparent, high values are semi-opaque
+        // Use a power curve for better visual contrast
+        id[oi + 3] = Math.round(Math.pow(t, 0.7) * 200);
     }
     hctx.putImageData(img, 0, 0);
+    // Use bilinear upscaling for smooth heatmap (attention resolution is typically low)
+    ctx.imageSmoothingEnabled = true;
+    ctx.imageSmoothingQuality = "high";
     ctx.drawImage(heatCanvas, 0, 0, w, h);
+    // Draw bbox highlight
     APP.ui.inspectionRenders._drawBBoxHighlight(ctx, track, w, h);
+    // Draw attention legend and stats
+    const avgVal = ad.length > 0 ? sumVal / ad.length : 0;
+    APP.ui.inspectionRenders._drawAttentionLegend(ctx, w, h, peakVal, avgVal);
+};
+/**
+ * Draw an attention intensity legend with inferno colormap bar.
+ */
+APP.ui.inspectionRenders._drawAttentionLegend = function (ctx, canvasW, canvasH, peakVal, avgVal) {
+    const barW = 16;
+    const barH = Math.min(140, canvasH - 60);
+    const x = canvasW - barW - 30;
+    const y = 30;
+    // Background panel
+    ctx.fillStyle = "rgba(0, 0, 0, 0.55)";
+    ctx.fillRect(x - 6, y - 20, barW + 60, barH + 56);
+    ctx.strokeStyle = "rgba(255, 255, 255, 0.15)";
+    ctx.lineWidth = 1;
+    ctx.strokeRect(x - 6, y - 20, barW + 60, barH + 56);
+    // Draw gradient bar (top = high attention, bottom = low)
+    for (let py = 0; py < barH; py++) {
+        const t = 1 - (py / (barH - 1)); // 0 = bottom (low), 1 = top (high)
+        const rgb = APP.ui.inspectionRenders._inferno(t);
+        ctx.fillStyle = `rgb(${rgb[0]}, ${rgb[1]}, ${rgb[2]})`;
+        ctx.fillRect(x, y + py, barW, 1);
+    }
+    // Border around bar
+    ctx.strokeStyle = "rgba(255, 255, 255, 0.3)";
+    ctx.lineWidth = 1;
+    ctx.strokeRect(x, y, barW, barH);
+    // Labels
+    ctx.fillStyle = "rgba(255, 255, 255, 0.8)";
+    ctx.font = "10px monospace";
+    ctx.textAlign = "left";
+    ctx.fillText("High", x + barW + 4, y + 10);
+    ctx.fillText("Low", x + barW + 4, y + barH);
+    // Title
+    ctx.fillStyle = "rgba(255, 255, 255, 0.6)";
+    ctx.font = "9px monospace";
+    ctx.textAlign = "center";
+    ctx.fillText("ATTENTION", x + barW / 2, y - 6);
+    // Stats
+    ctx.fillStyle = "rgba(252, 255, 164, 0.8)"; // inferno yellow
+    ctx.font = "10px monospace";
+    ctx.textAlign = "left";
+    ctx.fillText(`Peak: ${(peakVal * 100).toFixed(0)}%`, x - 2, y + barH + 18);
+    ctx.fillText(`Avg:  ${(avgVal * 100).toFixed(0)}%`, x - 2, y + barH + 32);
 };
 /**

frontend/js/ui/inspection.js CHANGED Viewed

@@ -206,12 +206,31 @@ APP.ui.inspection._clearCaches = function () {
 };
 /**
- * Internal: show/hide loading indicator.
  */
-APP.ui.inspection._setLoading = function (loading) {
     const { $ } = APP.core.utils;
     const el = $("#inspectionLoading");
-    if (el) el.style.display = loading ? "flex" : "none";
     APP.core.state.inspection.loading = loading;
 };
@@ -254,7 +273,7 @@ APP.ui.inspection._loadAndRender = async function () {
     try {
         // --- Step 1: Ensure we have the base frame image (shared by most modes) ---
         if (!state.inspection._frameImg && mode !== "3d") {
-            APP.ui.inspection._setLoading(true);
             const frameImg = await api.fetchFrame(jobId, frameIdx);
             state.inspection.frameImageUrl = frameImg.src;
             state.inspection._frameImg = frameImg;
@@ -262,7 +281,7 @@ APP.ui.inspection._loadAndRender = async function () {
         // --- Step 2: Fetch mode-specific data if not cached ---
         if (!cache[mode]) {
-            APP.ui.inspection._setLoading(true);
             switch (mode) {
                 case "seg":

 };
 /**
+ * Internal: show or hide the "#inspectionLoading" element and record the
+ * flag in APP.core.state.inspection.loading.
+ * While showing with a `mode`, the text of the first <span> inside the
+ * element is set to that mode's message ("Loading..." for an unlisted mode).
+ * When hiding, the text is reset to "Loading...". When showing without a
+ * `mode`, the current text is left untouched.
+ * @param {boolean} loading - true to show the indicator, false to hide it
+ * @param {string} [mode] - one of seg, edge, depth, attention, superres, 3d
  */
+APP.ui.inspection._setLoading = function (loading, mode) {
     const { $ } = APP.core.utils;
     const el = $("#inspectionLoading");
+    if (el) {
+        el.style.display = loading ? "flex" : "none";
+        // Pick the message for the requested mode; skipped when the element has no <span>
+        const msgEl = el.querySelector("span");
+        if (msgEl && loading && mode) {
+            const modeMessages = {
+                seg: "Computing segmentation mask...",
+                edge: "Computing edges...",
+                depth: "Computing depth map...",
+                attention: "Generating attention heatmap...",
+                superres: "Enhancing resolution...",
+                "3d": "Generating point cloud..."
+            };
+            msgEl.textContent = modeMessages[mode] || "Loading...";
+        } else if (msgEl && !loading) {
+            msgEl.textContent = "Loading...";
+        }
+    }
     APP.core.state.inspection.loading = loading;
 };
     try {
         // --- Step 1: Ensure we have the base frame image (shared by most modes) ---
         if (!state.inspection._frameImg && mode !== "3d") {
+            APP.ui.inspection._setLoading(true, mode);
             const frameImg = await api.fetchFrame(jobId, frameIdx);
             state.inspection.frameImageUrl = frameImg.src;
             state.inspection._frameImg = frameImg;
         // --- Step 2: Fetch mode-specific data if not cached ---
         if (!cache[mode]) {
+            APP.ui.inspection._setLoading(true, mode);
             switch (mode) {
                 case "seg":

inspection/attention.py ADDED Viewed

	@@ -0,0 +1,537 @@

+"""GradCAM-style attention heatmap generation for detector models.
+Produces per-object attention maps showing which regions of the input
+image the detector model focused on when detecting a particular object.
+For Transformers-based detectors (DETR, Grounding DINO) we hook the
+backbone's last feature layer.  The heatmap is currently the channel-wise
+L2 norm of that layer's activations, focused on the target bbox; this is
+a gradient-free saliency proxy, not true gradient-weighted GradCAM.  For
+Ultralytics YOLO models we generate an activation-based saliency map from
+the model's internal feature maps (no gradient needed since YOLO doesn't
+easily support GradCAM due to its anchor-free detection head).
+"""
+import base64
+import logging
+import threading
+from typing import Dict, Optional, Tuple
+import cv2
+import numpy as np
+import torch
+logger = logging.getLogger(__name__)
+# ── In-memory attention cache ────────────────────────────────────
+# Key: (job_id, frame_idx, track_id_str)  Value: heatmap (HxW float32 0-1)
+_attention_cache: Dict[Tuple[str, int, str], np.ndarray] = {}
+_cache_lock = threading.RLock()
+_MAX_CACHE_ENTRIES = 200
+def get_cached_attention(
+    job_id: str, frame_idx: int, track_id: str
+) -> Optional[np.ndarray]:
+    """Return cached attention heatmap or None."""
+    with _cache_lock:
+        return _attention_cache.get((job_id, frame_idx, track_id))
+def set_cached_attention(
+    job_id: str, frame_idx: int, track_id: str, heatmap: np.ndarray
+) -> None:
+    """Store attention heatmap in cache."""
+    with _cache_lock:
+        if len(_attention_cache) >= _MAX_CACHE_ENTRIES:
+            oldest = next(iter(_attention_cache))
+            del _attention_cache[oldest]
+        _attention_cache[(job_id, frame_idx, track_id)] = heatmap
+def clear_attention_cache(job_id: Optional[str] = None) -> None:
+    """Clear attention cache for a specific job or all jobs."""
+    with _cache_lock:
+        if job_id is None:
+            _attention_cache.clear()
+        else:
+            keys_to_remove = [k for k in _attention_cache if k[0] == job_id]
+            for k in keys_to_remove:
+                del _attention_cache[k]
+# ── GradCAM for HF Transformers models (DETR, Grounding DINO) ───
+def _find_target_layer(model: torch.nn.Module) -> Optional[torch.nn.Module]:
+    """Find the last convolutional or attention layer suitable for GradCAM.
+    Tries several strategies in order:
+    1. DETR ResNet backbone: model.model.backbone.conv_encoder.model.layer4
+    2. Grounding DINO Swin backbone: last layer of backbone
+    3. Generic: walk the model and find the last Conv2d layer
+    """
+    # Strategy 1: DETR backbone (ResNet)
+    try:
+        backbone = model.model.backbone
+        if hasattr(backbone, "conv_encoder"):
+            resnet = backbone.conv_encoder.model
+            if hasattr(resnet, "layer4"):
+                return resnet.layer4
+    except (AttributeError, TypeError):
+        pass
+    # Strategy 2: Grounding DINO Swin backbone
+    try:
+        backbone = model.model.backbone
+        if hasattr(backbone, "conv_encoder"):
+            swin = backbone.conv_encoder.model
+            if hasattr(swin, "layers"):
+                layers = list(swin.layers)
+                if layers:
+                    return layers[-1]
+            if hasattr(swin, "encoder") and hasattr(swin.encoder, "layers"):
+                layers = list(swin.encoder.layers)
+                if layers:
+                    return layers[-1]
+    except (AttributeError, TypeError):
+        pass
+    # Strategy 3: Generic — find the last Conv2d
+    last_conv = None
+    for module in model.modules():
+        if isinstance(module, torch.nn.Conv2d):
+            last_conv = module
+    return last_conv
+class GradCAMExtractor:
+    """Extract GradCAM heatmaps from a PyTorch model.
+    Usage:
+        extractor = GradCAMExtractor(model, target_layer)
+        heatmap = extractor.generate(input_tensor, target_bbox)
+        extractor.release()  # remove hooks
+    """
+    def __init__(self, model: torch.nn.Module, target_layer: torch.nn.Module):
+        self.model = model
+        self.target_layer = target_layer
+        self._activations: Optional[torch.Tensor] = None
+        self._gradients: Optional[torch.Tensor] = None
+        # Register hooks
+        self._fwd_hook = target_layer.register_forward_hook(self._save_activation)
+        self._bwd_hook = target_layer.register_full_backward_hook(self._save_gradient)
+    def _save_activation(self, module, input, output):
+        if isinstance(output, torch.Tensor):
+            self._activations = output.detach()
+        elif isinstance(output, (tuple, list)) and len(output) > 0:
+            self._activations = output[0].detach()
+    def _save_gradient(self, module, grad_input, grad_output):
+        if isinstance(grad_output, (tuple, list)) and len(grad_output) > 0:
+            self._gradients = grad_output[0].detach()
+        elif isinstance(grad_output, torch.Tensor):
+            self._gradients = grad_output.detach()
+    def generate(
+        self,
+        input_tensor: torch.Tensor,
+        target_bbox: list,
+        frame_h: int,
+        frame_w: int,
+    ) -> np.ndarray:
+        """Generate a GradCAM heatmap for a target bounding box.
+        Args:
+            input_tensor: Preprocessed model input tensor.
+            target_bbox: [x1, y1, x2, y2] in original frame pixel coords.
+            frame_h: Original frame height.
+            frame_w: Original frame width.
+        Returns:
+            HxW float32 array normalized to [0, 1], at the model's
+            feature map resolution (upscaled to frame size).
+        """
+        self.model.zero_grad()
+        self._activations = None
+        self._gradients = None
+        # Enable gradients temporarily
+        was_training = self.model.training
+        self.model.eval()
+        # Forward pass with gradients enabled on input
+        with torch.enable_grad():
+            outputs = self.model(**{k: v for k, v in input_tensor.items()})
+        if self._activations is None:
+            logger.warning("GradCAM: no activations captured; returning uniform map")
+            return np.ones((frame_h, frame_w), dtype=np.float32) * 0.5
+        # Use the activation map directly as a saliency proxy when
+        # gradient-based targeting is unreliable (common with object
+        # detection architectures where loss requires complex target
+        # matching). We compute channel-wise L2 norm as the saliency.
+        acts = self._activations
+        if acts.dim() == 4:
+            # (B, C, H, W) — standard conv feature map
+            cam = torch.norm(acts[0], dim=0)  # (H, W)
+        elif acts.dim() == 3:
+            # (B, N, C) — transformer sequence; try to reshape
+            # N = h * w for spatial feature maps
+            B, N, C = acts.shape
+            side = int(N ** 0.5)
+            if side * side == N:
+                cam = torch.norm(acts[0], dim=1).view(side, side)
+            else:
+                cam = torch.norm(acts[0], dim=1)  # (N,)
+                cam = cam.unsqueeze(0)  # (1, N)
+        else:
+            cam = torch.norm(acts.flatten(), dim=0, keepdim=True)
+        cam = cam.float()
+        # Normalize to [0, 1]
+        cam_min = cam.min()
+        cam_max = cam.max()
+        if (cam_max - cam_min) > 1e-8:
+            cam = (cam - cam_min) / (cam_max - cam_min)
+        else:
+            cam = torch.zeros_like(cam)
+        cam_np = cam.cpu().numpy()
+        # Upscale to frame resolution
+        if cam_np.ndim == 1:
+            side = int(len(cam_np) ** 0.5)
+            if side * side == len(cam_np):
+                cam_np = cam_np.reshape(side, side)
+            else:
+                cam_np = cam_np.reshape(1, -1)
+        cam_resized = cv2.resize(
+            cam_np, (frame_w, frame_h), interpolation=cv2.INTER_LINEAR
+        )
+        # Crop influence to the target bbox region (boost bbox, attenuate outside)
+        x1, y1, x2, y2 = [int(c) for c in target_bbox]
+        x1 = max(0, x1)
+        y1 = max(0, y1)
+        x2 = min(frame_w, x2)
+        y2 = min(frame_h, y2)
+        # Create a soft mask centered on the bbox
+        mask = np.zeros((frame_h, frame_w), dtype=np.float32)
+        mask[y1:y2, x1:x2] = 1.0
+        # Expand mask slightly for context
+        pad = max(x2 - x1, y2 - y1) // 2
+        py1 = max(0, y1 - pad)
+        py2 = min(frame_h, y2 + pad)
+        px1 = max(0, x1 - pad)
+        px2 = min(frame_w, x2 + pad)
+        mask[py1:py2, px1:px2] = np.maximum(mask[py1:py2, px1:px2], 0.3)
+        cam_resized = cam_resized * mask
+        # Re-normalize
+        c_max = cam_resized.max()
+        if c_max > 1e-8:
+            cam_resized = cam_resized / c_max
+        if not was_training:
+            self.model.eval()
+        return cam_resized.astype(np.float32)
+    def release(self):
+        """Remove hooks from the model."""
+        self._fwd_hook.remove()
+        self._bwd_hook.remove()
+# ── YOLO saliency (activation-based, no gradients) ──────────────
+def _yolo_saliency(
+    yolo_model,
+    frame: np.ndarray,
+    target_bbox: list,
+) -> np.ndarray:
+    """Generate an activation-based saliency map from a YOLO model.
+    Uses the model's internal feature pyramid activations as a proxy
+    for attention. This avoids the complexity of GradCAM with YOLO's
+    anchor-free heads.
+    Args:
+        yolo_model: Ultralytics YOLO model instance.
+        frame: HxWx3 BGR uint8 numpy array.
+        target_bbox: [x1, y1, x2, y2] in original frame pixel coords.
+    Returns:
+        HxW float32 array normalized to [0, 1].
+    """
+    frame_h, frame_w = frame.shape[:2]
+    # Run inference to get internal features
+    results = yolo_model.predict(
+        source=frame,
+        device=yolo_model.device if hasattr(yolo_model, 'device') else None,
+        conf=0.1,
+        imgsz=640,
+        verbose=False,
+    )
+    # Try to extract feature maps from the model internals
+    # Ultralytics stores intermediate outputs during forward pass
+    cam = None
+    try:
+        # Access the PyTorch model inside the Ultralytics wrapper
+        pt_model = yolo_model.model
+        if hasattr(pt_model, "model"):
+            # The sequential model layers
+            layers = pt_model.model
+            # Find the last feature extraction layer (before detect head)
+            # Typically the SPPF or C2f layer near the end
+            for i in range(len(layers) - 1, -1, -1):
+                layer = layers[i]
+                layer_type = type(layer).__name__
+                if layer_type in ("SPPF", "C2f", "C3", "Conv"):
+                    # Hook this layer for activation extraction
+                    activation = {}
+                    def hook_fn(module, inp, out, store=activation):
+                        store["out"] = out.detach()
+                    handle = layer.register_forward_hook(hook_fn)
+                    # Re-run forward pass to capture activations
+                    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    from PIL import Image
+                    import torchvision.transforms as T
+                    img = Image.fromarray(rgb)
+                    # Use the same preprocessing as YOLO
+                    yolo_model.predict(
+                        source=frame,
+                        device=yolo_model.device if hasattr(yolo_model, 'device') else None,
+                        conf=0.1,
+                        imgsz=640,
+                        verbose=False,
+                    )
+                    handle.remove()
+                    if "out" in activation:
+                        feat = activation["out"]
+                        if feat.dim() == 4:
+                            cam = torch.norm(feat[0], dim=0).cpu().numpy()
+                        break
+    except Exception as e:
+        logger.warning("YOLO feature extraction failed: %s", e)
+    if cam is None:
+        # Fallback: generate a simple Gaussian heatmap centered on the bbox
+        cam = _gaussian_bbox_heatmap(frame_h, frame_w, target_bbox)
+    else:
+        # Resize to frame dimensions
+        cam = cv2.resize(cam, (frame_w, frame_h), interpolation=cv2.INTER_LINEAR)
+    # Focus on the target bbox region
+    x1, y1, x2, y2 = [int(c) for c in target_bbox]
+    x1 = max(0, x1)
+    y1 = max(0, y1)
+    x2 = min(frame_w, x2)
+    y2 = min(frame_h, y2)
+    mask = np.zeros((frame_h, frame_w), dtype=np.float32)
+    mask[y1:y2, x1:x2] = 1.0
+    pad = max(x2 - x1, y2 - y1) // 2
+    py1, py2 = max(0, y1 - pad), min(frame_h, y2 + pad)
+    px1, px2 = max(0, x1 - pad), min(frame_w, x2 + pad)
+    mask[py1:py2, px1:px2] = np.maximum(mask[py1:py2, px1:px2], 0.3)
+    cam = cam * mask
+    # Normalize to [0, 1]
+    c_max = cam.max()
+    if c_max > 1e-8:
+        cam = cam / c_max
+    return cam.astype(np.float32)
+def _gaussian_bbox_heatmap(
+    frame_h: int, frame_w: int, bbox: list
+) -> np.ndarray:
+    """Generate a Gaussian heatmap centered on a bounding box.
+    Used as a fallback when feature extraction is not available.
+    """
+    x1, y1, x2, y2 = [int(c) for c in bbox]
+    cx = (x1 + x2) / 2
+    cy = (y1 + y2) / 2
+    sx = max((x2 - x1) / 2, 1.0)
+    sy = max((y2 - y1) / 2, 1.0)
+    y_coords, x_coords = np.mgrid[0:frame_h, 0:frame_w]
+    heatmap = np.exp(
+        -0.5 * (((x_coords - cx) / sx) ** 2 + ((y_coords - cy) / sy) ** 2)
+    )
+    return heatmap.astype(np.float32)
+# ── Main entry point ─────────────────────────────────────────────
+def generate_attention_map(
+    frame: np.ndarray,
+    bbox: list,
+    detector_name: str,
+    job_id: str,
+    frame_idx: int,
+    track_id: str,
+) -> np.ndarray:
+    """Generate an attention heatmap for a detected object.
+    Returns the cached heatmap for (job_id, frame_idx, track_id) when one
+    exists. Otherwise dispatches on detector_name: the YOLO names go through
+    _yolo_saliency, the DETR / Grounding DINO names go through
+    GradCAMExtractor, and any other name goes straight to the fallback.
+    Exceptions raised in either model path are logged as warnings and
+    swallowed; a Gaussian centered on the bbox is used instead. Whatever
+    heatmap is produced, including the fallback, is written to the cache.
+    Args:
+        frame: HxWx3 BGR uint8 numpy array.
+        bbox: [x1, y1, x2, y2] target object bounding box.
+        detector_name: Name of the detector used for the job.
+        job_id: Job identifier (for caching).
+        frame_idx: Frame index (for caching).
+        track_id: Track ID string (for caching).
+    Returns:
+        HxW float32 heatmap. The YOLO and Gaussian paths produce values in
+        [0, 1]; GradCAMExtractor.generate is assumed to do the same
+        (not visible here — TODO confirm).
+    """
+    # Check cache first
+    cached = get_cached_attention(job_id, frame_idx, track_id)
+    if cached is not None:
+        return cached
+    frame_h, frame_w = frame.shape[:2]
+    # Stays None unless one of the model-backed paths below succeeds.
+    heatmap = None
+    if detector_name in ("yolo11", "yolov8_visdrone"):
+        # YOLO models — use activation-based saliency
+        try:
+            # Imported lazily so this module can be loaded without the model stack.
+            from models.model_loader import load_detector
+            detector = load_detector(detector_name)
+            yolo_model = detector.model
+            heatmap = _yolo_saliency(yolo_model, frame, bbox)
+        except Exception as e:
+            # Best effort: fall through to the Gaussian fallback below.
+            logger.warning("YOLO saliency generation failed: %s", e)
+    elif detector_name in ("detr_resnet50", "grounding_dino"):
+        # Transformers models — use GradCAM on backbone
+        try:
+            from models.model_loader import load_detector
+            detector = load_detector(detector_name)
+            model = detector.model
+            target_layer = _find_target_layer(model)
+            if target_layer is not None:
+                extractor = GradCAMExtractor(model, target_layer)
+                try:
+                    # Prepare input
+                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    processor = detector.processor
+                    if detector_name == "grounding_dino":
+                        # Grounding DINO is called with a text prompt; a fixed
+                        # generic one is passed here.
+                        inputs = processor(
+                            images=frame_rgb, text="object.", return_tensors="pt"
+                        )
+                    else:
+                        inputs = processor(images=frame_rgb, return_tensors="pt")
+                    # Move every processor output onto the detector's device.
+                    inputs = {
+                        k: v.to(detector.device) for k, v in inputs.items()
+                    }
+                    heatmap = extractor.generate(inputs, bbox, frame_h, frame_w)
+                finally:
+                    # Always release the extractor, even when generate() raises.
+                    extractor.release()
+            else:
+                logger.warning(
+                    "No suitable target layer found for %s", detector_name
+                )
+        except Exception as e:
+            # Best effort: fall through to the Gaussian fallback below.
+            logger.warning("GradCAM generation failed for %s: %s", detector_name, e)
+    # Fallback: Gaussian heatmap centered on bbox
+    if heatmap is None:
+        logger.info(
+            "Using Gaussian fallback for attention (detector=%s)", detector_name
+        )
+        heatmap = _gaussian_bbox_heatmap(frame_h, frame_w, bbox)
+    # Cache the result
+    set_cached_attention(job_id, frame_idx, track_id, heatmap)
+    return heatmap
+# ── Serialization / rendering ─────────────────────────────────────
+def heatmap_to_base64(heatmap: np.ndarray) -> str:
+    """Encode heatmap as base64 float32 bytes."""
+    raw = heatmap.astype(np.float32).tobytes()
+    return base64.b64encode(raw).decode("ascii")
+def heatmap_overlay_jpeg(
+    frame: np.ndarray,
+    heatmap: np.ndarray,
+    bbox: list,
+    alpha: float = 0.5,
+    quality: int = 85,
+) -> bytes:
+    """Render heatmap overlay on a cropped frame region as JPEG.
+    Args:
+        frame: HxWx3 BGR uint8 numpy array (full frame).
+        heatmap: HxW float32 heatmap (same size as frame).
+        bbox: [x1, y1, x2, y2] crop region.
+        alpha: Blend factor for overlay (0=no overlay, 1=full overlay).
+        quality: JPEG quality.
+    Returns:
+        JPEG bytes.
+    """
+    h, w = frame.shape[:2]
+    x1, y1, x2, y2 = [int(c) for c in bbox]
+    x1 = max(0, x1)
+    y1 = max(0, y1)
+    x2 = min(w, x2)
+    y2 = min(h, y2)
+    # Add some padding
+    bw = x2 - x1
+    bh = y2 - y1
+    pad = int(max(bw, bh) * 0.15)
+    cx1 = max(0, x1 - pad)
+    cy1 = max(0, y1 - pad)
+    cx2 = min(w, x2 + pad)
+    cy2 = min(h, y2 + pad)
+    crop = frame[cy1:cy2, cx1:cx2].copy()
+    heat_crop = heatmap[cy1:cy2, cx1:cx2]
+    # Normalize heatmap crop to 0-255 for colormap
+    heat_u8 = (heat_crop * 255).clip(0, 255).astype(np.uint8)
+    colored = cv2.applyColorMap(heat_u8, cv2.COLORMAP_JET)
+    # Blend
+    overlay = cv2.addWeighted(crop, 1 - alpha, colored, alpha, 0)
+    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
+    success, buffer = cv2.imencode(".jpg", overlay, encode_param)
+    if not success:
+        raise RuntimeError("Failed to encode attention overlay as JPEG")
+    return buffer.tobytes()

inspection/depth.py ADDED Viewed

	@@ -0,0 +1,212 @@

+"""On-demand depth inference, colorization, caching, and stats.
+Uses DepthAnythingV2Estimator for single-frame depth estimation.
+Results are cached in-memory per (job_id, frame_idx) to avoid
+redundant GPU work when the same frame is requested multiple times.
+"""
+import base64
+import logging
+import threading
+from typing import Dict, Optional, Tuple
+import cv2
+import numpy as np
+logger = logging.getLogger(__name__)
+# ── In-memory depth cache ────────────────────────────────────────
+# Key: (job_id, frame_idx)  Value: depth_map (HxW float32)
+# Module-level store shared by every request handled in this process.
+_depth_cache: Dict[Tuple[str, int], np.ndarray] = {}
+# Re-entrant lock taken by the get/set/clear helpers around every cache access.
+_cache_lock = threading.RLock()
+# Limit cache size to avoid OOM
+_MAX_CACHE_ENTRIES = 200
+def _cache_key(job_id: str, frame_idx: int) -> Tuple[str, int]:
+    return (job_id, frame_idx)
+def get_cached_depth(job_id: str, frame_idx: int) -> Optional[np.ndarray]:
+    """Return cached depth map or None."""
+    with _cache_lock:
+        return _depth_cache.get(_cache_key(job_id, frame_idx))
+def set_cached_depth(job_id: str, frame_idx: int, depth_map: np.ndarray) -> None:
+    """Store depth map in cache, evicting oldest if over limit."""
+    with _cache_lock:
+        if len(_depth_cache) >= _MAX_CACHE_ENTRIES:
+            # Evict the first (oldest) entry
+            oldest = next(iter(_depth_cache))
+            del _depth_cache[oldest]
+        _depth_cache[_cache_key(job_id, frame_idx)] = depth_map
+def clear_depth_cache(job_id: Optional[str] = None) -> None:
+    """Clear depth cache for a specific job or all jobs."""
+    with _cache_lock:
+        if job_id is None:
+            _depth_cache.clear()
+        else:
+            keys_to_remove = [k for k in _depth_cache if k[0] == job_id]
+            for k in keys_to_remove:
+                del _depth_cache[k]
+# ── Lazy model singleton ─────────────────────────────────────────
+# Shared estimator instance; stays None until _get_depth_estimator() builds it.
+_depth_estimator = None
+# Held while constructing the estimator so concurrent first callers build only one.
+_estimator_lock = threading.Lock()
+def _get_depth_estimator():
+    """Lazy-load DepthAnythingV2 (singleton, thread-safe)."""
+    global _depth_estimator
+    if _depth_estimator is None:
+        with _estimator_lock:
+            if _depth_estimator is None:
+                from models.depth_estimators.depth_anything_v2 import (
+                    DepthAnythingV2Estimator,
+                )
+                _depth_estimator = DepthAnythingV2Estimator()
+    return _depth_estimator
+# ── Core inference ────────────────────────────────────────────────
+def run_depth_on_frame(
+    frame: np.ndarray,
+    job_id: str,
+    frame_idx: int,
+) -> np.ndarray:
+    """Run depth estimation on a single frame, caching the result.
+    Args:
+        frame: HxWx3 BGR uint8 numpy array.
+        job_id: Job identifier (for cache keying).
+        frame_idx: Frame index (for cache keying).
+    Returns:
+        HxW float32 depth map.
+    """
+    cached = get_cached_depth(job_id, frame_idx)
+    if cached is not None:
+        return cached
+    estimator = _get_depth_estimator()
+    result = estimator.predict(frame)
+    depth_map = result.depth_map  # HxW float32
+    set_cached_depth(job_id, frame_idx, depth_map)
+    return depth_map
+# ── Stats computation ─────────────────────────────────────────────
+def compute_depth_stats(depth_map: np.ndarray) -> dict:
+    """Compute min, max, mean, median depth statistics.
+    Args:
+        depth_map: HxW float32 depth array.
+    Returns:
+        Dict with min_m, max_m, mean_m, median_m.
+    """
+    return {
+        "min_m": float(np.min(depth_map)),
+        "max_m": float(np.max(depth_map)),
+        "mean_m": float(np.mean(depth_map)),
+        "median_m": float(np.median(depth_map)),
+    }
+# ── Crop depth to track bbox ─────────────────────────────────────
+def crop_depth_to_bbox(
+    depth_map: np.ndarray,
+    bbox: list,
+    padding: float = 0.0,
+) -> np.ndarray:
+    """Crop a depth map to a bounding box with optional padding.
+    Args:
+        depth_map: HxW float32 depth array.
+        bbox: [x1, y1, x2, y2] in pixel coordinates.
+        padding: Fractional padding around the bbox.
+    Returns:
+        Cropped HxW float32 depth array.
+    """
+    h, w = depth_map.shape[:2]
+    x1, y1, x2, y2 = bbox
+    bw = x2 - x1
+    bh = y2 - y1
+    pad_x = int(bw * padding)
+    pad_y = int(bh * padding)
+    cx1 = max(0, x1 - pad_x)
+    cy1 = max(0, y1 - pad_y)
+    cx2 = min(w, x2 + pad_x)
+    cy2 = min(h, y2 + pad_y)
+    return depth_map[cy1:cy2, cx1:cx2].copy()
+# ── Colorization (viridis) ───────────────────────────────────────
+def colorize_depth(depth_map: np.ndarray, quality: int = 85) -> bytes:
+    """Apply viridis colormap and encode as JPEG.
+    Args:
+        depth_map: HxW float32 depth array.
+        quality: JPEG quality (1-100).
+    Returns:
+        JPEG bytes with viridis-colored depth.
+    """
+    # Normalize to 0-255
+    d_min = float(np.min(depth_map))
+    d_max = float(np.max(depth_map))
+    if d_max - d_min < 1e-6:
+        normalized = np.zeros_like(depth_map, dtype=np.uint8)
+    else:
+        normalized = ((depth_map - d_min) / (d_max - d_min) * 255).astype(np.uint8)
+    # Apply viridis colormap (OpenCV uses BGR internally)
+    colored = cv2.applyColorMap(normalized, cv2.COLORMAP_VIRIDIS)
+    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
+    success, buffer = cv2.imencode(".jpg", colored, encode_param)
+    if not success:
+        raise RuntimeError("Failed to encode colorized depth as JPEG")
+    return buffer.tobytes()
+# ── Serialization helpers ─────────────────────────────────────────
+def depth_to_raw_bytes(depth_map: np.ndarray) -> bytes:
+    """Convert depth map to raw float32 little-endian bytes.
+    Args:
+        depth_map: HxW float32 depth array.
+    Returns:
+        Raw bytes (width * height * 4 bytes).
+    """
+    return depth_map.astype(np.float32).tobytes()
+def depth_to_base64(depth_map: np.ndarray) -> str:
+    """Convert depth map to base64-encoded float32 bytes.
+    Args:
+        depth_map: HxW float32 depth array.
+    Returns:
+        Base64-encoded string.
+    """
+    raw = depth_to_raw_bytes(depth_map)
+    return base64.b64encode(raw).decode("ascii")

inspection/router.py CHANGED Viewed

@@ -337,3 +337,201 @@ async def generate_mask(
         "color": color,
         "source": "sam2_ondemand",
     })

         "color": color,
         "source": "sam2_ondemand",
     })
+# ── On-demand depth analysis ─────────────────────────────────────
+@router.get("/depth/{job_id}/{frame_idx}")
+async def get_depth(
+    job_id: str,
+    frame_idx: int,
+    track_id: Optional[str] = Query(None, description="Track ID to crop depth to, e.g. 'T01'"),
+    format: str = Query("raw", description="Response format: 'raw', 'json', or 'colorized'"),
+):
+    """Get depth data for a frame, computed on-demand and cached.
+    Supports three response formats:
+    - raw: binary float32 with X-Depth-* headers
+    - json: JSON with base64-encoded float32 depth data and stats
+    - colorized: JPEG image with viridis colormap
+    """
+    import asyncio
+    from inspection.frames import extract_frame
+    from inspection.depth import (
+        run_depth_on_frame,
+        crop_depth_to_bbox,
+        compute_depth_stats,
+        depth_to_raw_bytes,
+        depth_to_base64,
+        colorize_depth,
+    )
+    if format not in ("raw", "json", "colorized"):
+        raise HTTPException(
+            status_code=400,
+            detail=f"Invalid format '{format}'. Must be 'raw', 'json', or 'colorized'.",
+        )
+    job = _get_job_or_404(job_id)
+    input_path = job.input_video_path
+    if not input_path or not Path(input_path).exists():
+        raise HTTPException(status_code=404, detail="Input video not found on disk.")
+    _validate_frame_idx(input_path, frame_idx)
+    # Extract frame and run depth (GPU work in thread pool)
+    frame = await asyncio.to_thread(extract_frame, input_path, frame_idx)
+    depth_map = await asyncio.to_thread(run_depth_on_frame, frame, job_id, frame_idx)
+    # Optionally crop to track bbox
+    if track_id is not None:
+        from jobs.storage import get_track_data
+        tracks = get_track_data(job_id, frame_idx)
+        instance_id = int(track_id.replace("T", "")) if track_id.startswith("T") else int(track_id)
+        target = None
+        for t in tracks:
+            tid = t.get("instance_id") or t.get("track_id")
+            if tid == instance_id or tid == track_id:
+                target = t
+                break
+        if target and "bbox" in target:
+            depth_map = crop_depth_to_bbox(depth_map, target["bbox"])
+        else:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Track {track_id} not found in frame {frame_idx}.",
+            )
+    h, w = depth_map.shape[:2]
+    d_min = float(depth_map.min())
+    d_max = float(depth_map.max())
+    if format == "raw":
+        raw_bytes = depth_to_raw_bytes(depth_map)
+        return Response(
+            content=raw_bytes,
+            media_type="application/octet-stream",
+            headers={
+                "X-Depth-Width": str(w),
+                "X-Depth-Height": str(h),
+                "X-Depth-Min": f"{d_min:.4f}",
+                "X-Depth-Max": f"{d_max:.4f}",
+            },
+        )
+    if format == "json":
+        stats = compute_depth_stats(depth_map)
+        data_b64 = depth_to_base64(depth_map)
+        return JSONResponse({
+            "width": w,
+            "height": h,
+            "min_depth": d_min,
+            "max_depth": d_max,
+            "data_b64": data_b64,
+            "depth_stats": stats,
+        })
+    # format == "colorized"
+    jpeg_bytes = await asyncio.to_thread(colorize_depth, depth_map)
+    return Response(content=jpeg_bytes, media_type="image/jpeg")
+# ── Attention / GradCAM heatmaps ─────────────────────────────────
+@router.get("/attention/{job_id}/{frame_idx}/{track_id}")
+async def get_attention(
+    job_id: str,
+    frame_idx: int,
+    track_id: str,
+    format: str = Query("json", description="Response format: 'json' or 'overlay'"),
+):
+    """Generate a GradCAM/saliency attention heatmap for a detected object.
+    Computed on-demand using the detector model that produced the original
+    job. Results are cached per (job_id, frame_idx, track_id).
+    Supports two response formats:
+    - json: base64-encoded float32 heatmap with metadata
+    - overlay: JPEG image with heatmap blended onto the frame crop
+    """
+    import asyncio
+    from inspection.frames import extract_frame
+    from inspection.attention import (
+        generate_attention_map,
+        heatmap_to_base64,
+        heatmap_overlay_jpeg,
+    )
+    if format not in ("json", "overlay"):
+        raise HTTPException(
+            status_code=400,
+            detail=f"Invalid format '{format}'. Must be 'json' or 'overlay'.",
+        )
+    job = _get_job_or_404(job_id)
+    input_path = job.input_video_path
+    if not input_path or not Path(input_path).exists():
+        raise HTTPException(status_code=404, detail="Input video not found on disk.")
+    _validate_frame_idx(input_path, frame_idx)
+    # Determine the detector used for this job
+    detector_name = job.detector_name
+    if not detector_name:
+        raise HTTPException(
+            status_code=400,
+            detail="Attention maps require a detector model. This job has no detector_name.",
+        )
+    # Get the track's bounding box
+    from jobs.storage import get_track_data
+    tracks = get_track_data(job_id, frame_idx)
+    instance_id = int(track_id.replace("T", "")) if track_id.startswith("T") else int(track_id)
+    target = None
+    for t in tracks:
+        tid = t.get("instance_id") or t.get("track_id")
+        if tid == instance_id or tid == track_id:
+            target = t
+            break
+    if not target or "bbox" not in target:
+        raise HTTPException(
+            status_code=404,
+            detail=f"Track {track_id} not found in frame {frame_idx}.",
+        )
+    bbox = target["bbox"]
+    # Extract frame and generate attention map (GPU work in thread pool)
+    frame = await asyncio.to_thread(extract_frame, input_path, frame_idx)
+    heatmap = await asyncio.to_thread(
+        generate_attention_map,
+        frame,
+        bbox,
+        detector_name,
+        job_id,
+        frame_idx,
+        track_id,
+    )
+    if format == "json":
+        h, w = heatmap.shape[:2]
+        data_b64 = heatmap_to_base64(heatmap)
+        return JSONResponse({
+            "track_id": track_id,
+            "frame_idx": frame_idx,
+            "width": w,
+            "height": h,
+            "data_b64": data_b64,
+            "format": "float32",
+        })
+    # format == "overlay"
+    jpeg_bytes = await asyncio.to_thread(
+        heatmap_overlay_jpeg, frame, heatmap, bbox
+    )
+    return Response(content=jpeg_bytes, media_type="image/jpeg")

tests/test_inspection_attention.py ADDED Viewed

	@@ -0,0 +1,380 @@

+"""Tests for inspection/attention.py — GradCAM attention heatmaps."""
+import base64
+import struct
+import numpy as np
+import pytest
+# ── Unit tests for attention module functions ────────────────────
+def test_gaussian_bbox_heatmap_shape():
+    """_gaussian_bbox_heatmap should produce correct shape and range."""
+    from inspection.attention import _gaussian_bbox_heatmap
+    heatmap = _gaussian_bbox_heatmap(100, 200, [50, 30, 150, 70])
+    assert heatmap.shape == (100, 200)
+    assert heatmap.dtype == np.float32
+    assert heatmap.max() <= 1.0
+    assert heatmap.min() >= 0.0
+def test_gaussian_bbox_heatmap_peak_location():
+    """Gaussian heatmap should peak at the bbox center."""
+    from inspection.attention import _gaussian_bbox_heatmap
+    heatmap = _gaussian_bbox_heatmap(200, 200, [60, 80, 140, 120])
+    # Center is at (100, 100)
+    cy, cx = 100, 100
+    center_val = heatmap[cy, cx]
+    # Corner values should be lower
+    corner_val = heatmap[0, 0]
+    assert center_val > corner_val
+def test_gaussian_bbox_heatmap_small_bbox():
+    """Small bbox should still produce valid heatmap."""
+    from inspection.attention import _gaussian_bbox_heatmap
+    heatmap = _gaussian_bbox_heatmap(50, 50, [10, 10, 12, 12])
+    assert heatmap.shape == (50, 50)
+    assert not np.isnan(heatmap).any()
+    assert heatmap.max() > 0
+def test_heatmap_to_base64():
+    """heatmap_to_base64 should encode correctly."""
+    from inspection.attention import heatmap_to_base64
+    heatmap = np.array([[0.0, 0.5], [0.75, 1.0]], dtype=np.float32)
+    b64 = heatmap_to_base64(heatmap)
+    decoded = base64.b64decode(b64)
+    assert len(decoded) == 4 * 4  # 4 floats
+    values = struct.unpack("<4f", decoded)
+    assert values == pytest.approx((0.0, 0.5, 0.75, 1.0))
+def test_heatmap_overlay_jpeg():
+    """heatmap_overlay_jpeg should return valid JPEG bytes."""
+    from inspection.attention import heatmap_overlay_jpeg
+    frame = np.random.randint(0, 255, (100, 200, 3), dtype=np.uint8)
+    heatmap = np.random.rand(100, 200).astype(np.float32)
+    bbox = [50, 30, 150, 70]
+    jpeg_bytes = heatmap_overlay_jpeg(frame, heatmap, bbox)
+    # JPEG magic bytes
+    assert jpeg_bytes[:2] == b"\xff\xd8"
+    assert len(jpeg_bytes) > 100
+def test_heatmap_overlay_jpeg_edge_bbox():
+    """Overlay with bbox near frame edges should not crash."""
+    from inspection.attention import heatmap_overlay_jpeg
+    frame = np.random.randint(0, 255, (50, 50, 3), dtype=np.uint8)
+    heatmap = np.random.rand(50, 50).astype(np.float32)
+    bbox = [0, 0, 50, 50]  # Full frame bbox
+    jpeg_bytes = heatmap_overlay_jpeg(frame, heatmap, bbox)
+    assert jpeg_bytes[:2] == b"\xff\xd8"
+# ── Cache tests ──────────────────────────────────────────────────
+def test_attention_cache_set_get():
+    """Attention cache should store and retrieve heatmaps."""
+    from inspection.attention import (
+        get_cached_attention,
+        set_cached_attention,
+        clear_attention_cache,
+    )
+    clear_attention_cache()
+    heatmap = np.ones((10, 10), dtype=np.float32) * 0.5
+    set_cached_attention("job1", 0, "T01", heatmap)
+    result = get_cached_attention("job1", 0, "T01")
+    assert result is not None
+    np.testing.assert_array_equal(result, heatmap)
+    # Different params should return None
+    assert get_cached_attention("job1", 0, "T02") is None
+    assert get_cached_attention("job1", 1, "T01") is None
+    assert get_cached_attention("job2", 0, "T01") is None
+    clear_attention_cache()
+def test_attention_cache_clear_per_job():
+    """clear_attention_cache(job_id) should only clear that job."""
+    from inspection.attention import (
+        get_cached_attention,
+        set_cached_attention,
+        clear_attention_cache,
+    )
+    clear_attention_cache()
+    h1 = np.ones((5, 5), dtype=np.float32)
+    h2 = np.ones((5, 5), dtype=np.float32) * 0.5
+    set_cached_attention("jobA", 0, "T01", h1)
+    set_cached_attention("jobB", 0, "T01", h2)
+    clear_attention_cache("jobA")
+    assert get_cached_attention("jobA", 0, "T01") is None
+    assert get_cached_attention("jobB", 0, "T01") is not None
+    clear_attention_cache()
+# ── Integration test for attention endpoint ──────────────────────
+def _make_test_video(tmp_path, num_frames=5, width=64, height=48):
+    """Create a tiny test video and return its path."""
+    import cv2
+    video_path = str(tmp_path / "test.mp4")
+    writer = cv2.VideoWriter(
+        video_path, cv2.VideoWriter_fourcc(*"mp4v"), 30, (width, height)
+    )
+    for i in range(num_frames):
+        frame = np.full((height, width, 3), i * 40, dtype=np.uint8)
+        writer.write(frame)
+    writer.release()
+    return video_path
+def test_attention_endpoint_json(tmp_path, monkeypatch):
+    """GET /inspect/attention/{job_id}/{frame_idx}/{track_id}?format=json."""
+    from fastapi.testclient import TestClient
+    from jobs.models import JobInfo, JobStatus
+    from jobs.storage import get_job_storage, set_track_data
+    video_path = _make_test_video(tmp_path)
+    storage = get_job_storage()
+    job = JobInfo(
+        job_id="test_attn_json",
+        status=JobStatus.COMPLETED,
+        mode="object_detection",
+        queries=["person"],
+        detector_name="yolo11",
+        segmenter_name=None,
+        input_video_path=video_path,
+        output_video_path=None,
+    )
+    storage.create(job)
+    # Add track data
+    set_track_data("test_attn_json", 0, [
+        {"instance_id": 1, "label": "person", "bbox": [10, 10, 30, 30]},
+    ])
+    # Mock generate_attention_map to avoid loading real models
+    def fake_generate(frame, bbox, det_name, job_id, frame_idx, track_id):
+        h, w = frame.shape[:2]
+        return np.random.rand(h, w).astype(np.float32)
+    monkeypatch.setattr(
+        "inspection.attention.generate_attention_map", fake_generate
+    )
+    from inspection.router import router
+    from fastapi import FastAPI
+    app = FastAPI()
+    app.include_router(router)
+    client = TestClient(app)
+    resp = client.get("/inspect/attention/test_attn_json/0/T01?format=json")
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["track_id"] == "T01"
+    assert data["frame_idx"] == 0
+    assert "width" in data
+    assert "height" in data
+    assert "data_b64" in data
+    assert data["format"] == "float32"
+    # Verify base64 decodes to correct size
+    decoded = base64.b64decode(data["data_b64"])
+    assert len(decoded) == data["width"] * data["height"] * 4
+    from inspection.attention import clear_attention_cache
+    clear_attention_cache()
+    storage.delete("test_attn_json")
+def test_attention_endpoint_overlay(tmp_path, monkeypatch):
+    """GET /inspect/attention/{job_id}/{frame_idx}/{track_id}?format=overlay."""
+    from fastapi.testclient import TestClient
+    from jobs.models import JobInfo, JobStatus
+    from jobs.storage import get_job_storage, set_track_data
+    video_path = _make_test_video(tmp_path)
+    storage = get_job_storage()
+    job = JobInfo(
+        job_id="test_attn_overlay",
+        status=JobStatus.COMPLETED,
+        mode="object_detection",
+        queries=["person"],
+        detector_name="yolo11",
+        segmenter_name=None,
+        input_video_path=video_path,
+        output_video_path=None,
+    )
+    storage.create(job)
+    set_track_data("test_attn_overlay", 0, [
+        {"instance_id": 1, "label": "person", "bbox": [5, 5, 40, 35]},
+    ])
+    def fake_generate(frame, bbox, det_name, job_id, frame_idx, track_id):
+        h, w = frame.shape[:2]
+        return np.random.rand(h, w).astype(np.float32)
+    monkeypatch.setattr(
+        "inspection.attention.generate_attention_map", fake_generate
+    )
+    from inspection.router import router
+    from fastapi import FastAPI
+    app = FastAPI()
+    app.include_router(router)
+    client = TestClient(app)
+    resp = client.get("/inspect/attention/test_attn_overlay/0/T01?format=overlay")
+    assert resp.status_code == 200
+    assert resp.headers["content-type"] == "image/jpeg"
+    assert resp.content[:2] == b"\xff\xd8"
+    from inspection.attention import clear_attention_cache
+    clear_attention_cache()
+    storage.delete("test_attn_overlay")
+def test_attention_endpoint_no_detector(tmp_path):
+    """Attention for a job with no detector_name should return 400."""
+    from fastapi.testclient import TestClient
+    from jobs.models import JobInfo, JobStatus
+    from jobs.storage import get_job_storage
+    video_path = _make_test_video(tmp_path)
+    storage = get_job_storage()
+    job = JobInfo(
+        job_id="test_attn_no_det",
+        status=JobStatus.COMPLETED,
+        mode="segmentation",
+        queries=["object"],
+        detector_name=None,
+        segmenter_name="GSAM2-L",
+        input_video_path=video_path,
+        output_video_path=None,
+    )
+    storage.create(job)
+    from inspection.router import router
+    from fastapi import FastAPI
+    app = FastAPI()
+    app.include_router(router)
+    client = TestClient(app)
+    resp = client.get("/inspect/attention/test_attn_no_det/0/T01?format=json")
+    assert resp.status_code == 400
+    storage.delete("test_attn_no_det")
+def test_attention_endpoint_track_not_found(tmp_path):
+    """Track not found in frame should return 404."""
+    from fastapi.testclient import TestClient
+    from jobs.models import JobInfo, JobStatus
+    from jobs.storage import get_job_storage, set_track_data
+    video_path = _make_test_video(tmp_path)
+    storage = get_job_storage()
+    job = JobInfo(
+        job_id="test_attn_notrack",
+        status=JobStatus.COMPLETED,
+        mode="object_detection",
+        queries=["person"],
+        detector_name="yolo11",
+        segmenter_name=None,
+        input_video_path=video_path,
+        output_video_path=None,
+    )
+    storage.create(job)
+    # No track data for frame 0
+    set_track_data("test_attn_notrack", 0, [])
+    from inspection.router import router
+    from fastapi import FastAPI
+    app = FastAPI()
+    app.include_router(router)
+    client = TestClient(app)
+    resp = client.get("/inspect/attention/test_attn_notrack/0/T01?format=json")
+    assert resp.status_code == 404
+    storage.delete("test_attn_notrack")
+def test_attention_endpoint_invalid_format(tmp_path):
+    """Invalid format should return 400."""
+    from fastapi.testclient import TestClient
+    from jobs.models import JobInfo, JobStatus
+    from jobs.storage import get_job_storage
+    video_path = _make_test_video(tmp_path)
+    storage = get_job_storage()
+    job = JobInfo(
+        job_id="test_attn_badfmt",
+        status=JobStatus.COMPLETED,
+        mode="object_detection",
+        queries=["person"],
+        detector_name="yolo11",
+        segmenter_name=None,
+        input_video_path=video_path,
+        output_video_path=None,
+    )
+    storage.create(job)
+    from inspection.router import router
+    from fastapi import FastAPI
+    app = FastAPI()
+    app.include_router(router)
+    client = TestClient(app)
+    resp = client.get("/inspect/attention/test_attn_badfmt/0/T01?format=invalid")
+    assert resp.status_code == 400
+    storage.delete("test_attn_badfmt")

tests/test_inspection_depth.py ADDED Viewed

	@@ -0,0 +1,480 @@

+"""Tests for inspection/depth.py — on-demand depth analysis."""
+import base64
+import struct
+import numpy as np
+import pytest
+# ── Unit tests for depth module functions ────────────────────────
+def test_compute_depth_stats():
+    """compute_depth_stats should return min, max, mean, median."""
+    from inspection.depth import compute_depth_stats
+    depth = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+    stats = compute_depth_stats(depth)
+    assert stats["min_m"] == pytest.approx(1.0)
+    assert stats["max_m"] == pytest.approx(4.0)
+    assert stats["mean_m"] == pytest.approx(2.5)
+    assert stats["median_m"] == pytest.approx(2.5)
+def test_compute_depth_stats_uniform():
+    """Stats on a uniform depth map."""
+    from inspection.depth import compute_depth_stats
+    depth = np.full((10, 10), 5.5, dtype=np.float32)
+    stats = compute_depth_stats(depth)
+    assert stats["min_m"] == pytest.approx(5.5)
+    assert stats["max_m"] == pytest.approx(5.5)
+    assert stats["mean_m"] == pytest.approx(5.5)
+    assert stats["median_m"] == pytest.approx(5.5)
+def test_crop_depth_to_bbox():
+    """crop_depth_to_bbox should extract the correct subregion."""
+    from inspection.depth import crop_depth_to_bbox
+    depth = np.arange(100, dtype=np.float32).reshape(10, 10)
+    # bbox: x1=2, y1=3, x2=5, y2=7
+    cropped = crop_depth_to_bbox(depth, [2, 3, 5, 7])
+    assert cropped.shape == (4, 3)  # (y2-y1, x2-x1) = (4, 3)
+    assert cropped[0, 0] == pytest.approx(32.0)  # depth[3, 2]
+def test_crop_depth_to_bbox_with_padding():
+    """crop_depth_to_bbox with padding should expand the region."""
+    from inspection.depth import crop_depth_to_bbox
+    depth = np.arange(100, dtype=np.float32).reshape(10, 10)
+    # bbox: x1=2, y1=3, x2=5, y2=7, padding=0.5
+    # bw=3, bh=4, pad_x=1, pad_y=2 => region [1,1] to [6,9]
+    cropped = crop_depth_to_bbox(depth, [2, 3, 5, 7], padding=0.5)
+    assert cropped.shape[0] > 4  # Should be larger than (4, 3)
+    assert cropped.shape[1] > 3
+def test_crop_depth_to_bbox_clamped():
+    """Cropping near edges should clamp to image boundaries."""
+    from inspection.depth import crop_depth_to_bbox
+    depth = np.arange(100, dtype=np.float32).reshape(10, 10)
+    # bbox near top-left with big padding
+    cropped = crop_depth_to_bbox(depth, [0, 0, 2, 2], padding=1.0)
+    assert cropped.shape[0] >= 2
+    assert cropped.shape[1] >= 2
+    # Should not crash or produce negative indices
+    assert cropped[0, 0] == pytest.approx(0.0)  # depth[0, 0]
+def test_depth_to_raw_bytes():
+    """depth_to_raw_bytes should produce correct float32 bytes."""
+    from inspection.depth import depth_to_raw_bytes
+    depth = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+    raw = depth_to_raw_bytes(depth)
+    assert len(raw) == 4 * 4  # 4 floats * 4 bytes each
+    values = struct.unpack("<4f", raw)
+    assert values == pytest.approx((1.0, 2.0, 3.0, 4.0))
+def test_depth_to_base64():
+    """depth_to_base64 should produce decodable base64 float32 data."""
+    from inspection.depth import depth_to_base64
+    depth = np.array([[1.5, 2.5]], dtype=np.float32)
+    b64 = depth_to_base64(depth)
+    decoded = base64.b64decode(b64)
+    assert len(decoded) == 2 * 4  # 2 floats
+    values = struct.unpack("<2f", decoded)
+    assert values == pytest.approx((1.5, 2.5))
+def test_colorize_depth_returns_jpeg():
+    """colorize_depth should return valid JPEG bytes."""
+    from inspection.depth import colorize_depth
+    depth = np.random.rand(48, 64).astype(np.float32) * 10.0
+    jpeg_bytes = colorize_depth(depth)
+    # JPEG magic bytes
+    assert jpeg_bytes[:2] == b"\xff\xd8"
+    assert len(jpeg_bytes) > 100
+def test_colorize_depth_uniform():
+    """colorize_depth should handle uniform depth (no division by zero)."""
+    from inspection.depth import colorize_depth
+    depth = np.full((32, 32), 5.0, dtype=np.float32)
+    jpeg_bytes = colorize_depth(depth)
+    assert jpeg_bytes[:2] == b"\xff\xd8"
+# ── Cache tests ──────────────────────────────────────────────────
+def test_cache_set_get():
+    """Depth cache should store and retrieve depth maps."""
+    from inspection.depth import (
+        get_cached_depth,
+        set_cached_depth,
+        clear_depth_cache,
+    )
+    clear_depth_cache()
+    depth = np.ones((10, 10), dtype=np.float32)
+    set_cached_depth("job1", 0, depth)
+    result = get_cached_depth("job1", 0)
+    assert result is not None
+    np.testing.assert_array_equal(result, depth)
+    # Different frame should return None
+    assert get_cached_depth("job1", 1) is None
+    # Different job should return None
+    assert get_cached_depth("job2", 0) is None
+    clear_depth_cache()
+def test_cache_clear_per_job():
+    """clear_depth_cache(job_id) should only clear that job."""
+    from inspection.depth import (
+        get_cached_depth,
+        set_cached_depth,
+        clear_depth_cache,
+    )
+    clear_depth_cache()
+    d1 = np.ones((5, 5), dtype=np.float32)
+    d2 = np.ones((5, 5), dtype=np.float32) * 2
+    set_cached_depth("jobA", 0, d1)
+    set_cached_depth("jobB", 0, d2)
+    clear_depth_cache("jobA")
+    assert get_cached_depth("jobA", 0) is None
+    assert get_cached_depth("jobB", 0) is not None
+    clear_depth_cache()
+# ── Integration test for the endpoint (via TestClient) ───────────
+def _make_test_video(tmp_path, num_frames=5, width=64, height=48):
+    """Create a tiny test video and return its path."""
+    import cv2
+    video_path = str(tmp_path / "test.mp4")
+    writer = cv2.VideoWriter(
+        video_path, cv2.VideoWriter_fourcc(*"mp4v"), 30, (width, height)
+    )
+    for i in range(num_frames):
+        frame = np.full((height, width, 3), i * 40, dtype=np.uint8)
+        writer.write(frame)
+    writer.release()
+    return video_path
+def test_depth_endpoint_raw(tmp_path, monkeypatch):
+    """GET /inspect/depth/{job_id}/{frame_idx}?format=raw should return binary float32."""
+    from fastapi.testclient import TestClient
+    from datetime import datetime
+    from jobs.models import JobInfo, JobStatus
+    from jobs.storage import get_job_storage
+    # Create a test video
+    video_path = _make_test_video(tmp_path)
+    # Register a fake job
+    storage = get_job_storage()
+    job = JobInfo(
+        job_id="test_depth_raw",
+        status=JobStatus.COMPLETED,
+        mode="object_detection",
+        queries=["person"],
+        detector_name="yolo11",
+        segmenter_name=None,
+        input_video_path=video_path,
+        output_video_path=None,
+    )
+    storage.create(job)
+    # Mock the depth estimator to avoid loading the real model
+    class FakeDepthResult:
+        def __init__(self, h, w):
+            self.depth_map = np.arange(h * w, dtype=np.float32).reshape(h, w)
+            self.focal_length = 1.0
+    class FakeEstimator:
+        def predict(self, frame):
+            h, w = frame.shape[:2]
+            return FakeDepthResult(h, w)
+    monkeypatch.setattr(
+        "inspection.depth._depth_estimator", FakeEstimator()
+    )
+    # Import app after patching
+    from inspection.router import router
+    from fastapi import FastAPI
+    app = FastAPI()
+    app.include_router(router)
+    client = TestClient(app)
+    resp = client.get("/inspect/depth/test_depth_raw/0?format=raw")
+    assert resp.status_code == 200
+    assert resp.headers["content-type"] == "application/octet-stream"
+    assert "X-Depth-Width" in resp.headers
+    assert "X-Depth-Height" in resp.headers
+    assert "X-Depth-Min" in resp.headers
+    assert "X-Depth-Max" in resp.headers
+    w = int(resp.headers["X-Depth-Width"])
+    h = int(resp.headers["X-Depth-Height"])
+    assert len(resp.content) == w * h * 4  # float32
+    # Cleanup
+    from inspection.depth import clear_depth_cache
+    clear_depth_cache()
+    storage.delete("test_depth_raw")
+def test_depth_endpoint_json(tmp_path, monkeypatch):
+    """GET /inspect/depth/{job_id}/{frame_idx}?format=json should return proper JSON."""
+    from fastapi.testclient import TestClient
+    from jobs.models import JobInfo, JobStatus
+    from jobs.storage import get_job_storage
+    video_path = _make_test_video(tmp_path)
+    storage = get_job_storage()
+    job = JobInfo(
+        job_id="test_depth_json",
+        status=JobStatus.COMPLETED,
+        mode="object_detection",
+        queries=["person"],
+        detector_name="yolo11",
+        segmenter_name=None,
+        input_video_path=video_path,
+        output_video_path=None,
+    )
+    storage.create(job)
+    class FakeDepthResult:
+        def __init__(self, h, w):
+            self.depth_map = np.ones((h, w), dtype=np.float32) * 5.0
+            self.focal_length = 1.0
+    class FakeEstimator:
+        def predict(self, frame):
+            h, w = frame.shape[:2]
+            return FakeDepthResult(h, w)
+    monkeypatch.setattr("inspection.depth._depth_estimator", FakeEstimator())
+    from inspection.router import router
+    from fastapi import FastAPI
+    app = FastAPI()
+    app.include_router(router)
+    client = TestClient(app)
+    resp = client.get("/inspect/depth/test_depth_json/0?format=json")
+    assert resp.status_code == 200
+    data = resp.json()
+    assert "width" in data
+    assert "height" in data
+    assert "min_depth" in data
+    assert "max_depth" in data
+    assert "data_b64" in data
+    assert "depth_stats" in data
+    assert data["depth_stats"]["min_m"] == pytest.approx(5.0)
+    assert data["depth_stats"]["max_m"] == pytest.approx(5.0)
+    # Verify base64 decodes to correct size
+    decoded = base64.b64decode(data["data_b64"])
+    assert len(decoded) == data["width"] * data["height"] * 4
+    from inspection.depth import clear_depth_cache
+    clear_depth_cache()
+    storage.delete("test_depth_json")
+def test_depth_endpoint_colorized(tmp_path, monkeypatch):
+    """GET /inspect/depth/{job_id}/{frame_idx}?format=colorized should return JPEG."""
+    from fastapi.testclient import TestClient
+    from jobs.models import JobInfo, JobStatus
+    from jobs.storage import get_job_storage
+    video_path = _make_test_video(tmp_path)
+    storage = get_job_storage()
+    job = JobInfo(
+        job_id="test_depth_color",
+        status=JobStatus.COMPLETED,
+        mode="object_detection",
+        queries=["person"],
+        detector_name="yolo11",
+        segmenter_name=None,
+        input_video_path=video_path,
+        output_video_path=None,
+    )
+    storage.create(job)
+    class FakeDepthResult:
+        def __init__(self, h, w):
+            self.depth_map = np.random.rand(h, w).astype(np.float32) * 10.0
+            self.focal_length = 1.0
+    class FakeEstimator:
+        def predict(self, frame):
+            h, w = frame.shape[:2]
+            return FakeDepthResult(h, w)
+    monkeypatch.setattr("inspection.depth._depth_estimator", FakeEstimator())
+    from inspection.router import router
+    from fastapi import FastAPI
+    app = FastAPI()
+    app.include_router(router)
+    client = TestClient(app)
+    resp = client.get("/inspect/depth/test_depth_color/0?format=colorized")
+    assert resp.status_code == 200
+    assert resp.headers["content-type"] == "image/jpeg"
+    assert resp.content[:2] == b"\xff\xd8"  # JPEG magic
+    from inspection.depth import clear_depth_cache
+    clear_depth_cache()
+    storage.delete("test_depth_color")
+def test_depth_endpoint_invalid_format(tmp_path, monkeypatch):
+    """Invalid format should return 400."""
+    from fastapi.testclient import TestClient
+    from jobs.models import JobInfo, JobStatus
+    from jobs.storage import get_job_storage
+    video_path = _make_test_video(tmp_path)
+    storage = get_job_storage()
+    job = JobInfo(
+        job_id="test_depth_bad_fmt",
+        status=JobStatus.COMPLETED,
+        mode="object_detection",
+        queries=["person"],
+        detector_name="yolo11",
+        segmenter_name=None,
+        input_video_path=video_path,
+        output_video_path=None,
+    )
+    storage.create(job)
+    from inspection.router import router
+    from fastapi import FastAPI
+    app = FastAPI()
+    app.include_router(router)
+    client = TestClient(app)
+    resp = client.get("/inspect/depth/test_depth_bad_fmt/0?format=invalid")
+    assert resp.status_code == 400
+    storage.delete("test_depth_bad_fmt")
+def test_depth_endpoint_job_not_found():
+    """Non-existent job should return 404."""
+    from fastapi.testclient import TestClient
+    from inspection.router import router
+    from fastapi import FastAPI
+    app = FastAPI()
+    app.include_router(router)
+    client = TestClient(app)
+    resp = client.get("/inspect/depth/nonexistent/0?format=json")
+    assert resp.status_code == 404
+def test_depth_endpoint_with_track_id(tmp_path, monkeypatch):
+    """Depth with track_id should crop to bbox."""
+    from fastapi.testclient import TestClient
+    from jobs.models import JobInfo, JobStatus
+    from jobs.storage import get_job_storage, set_track_data
+    video_path = _make_test_video(tmp_path)
+    storage = get_job_storage()
+    job = JobInfo(
+        job_id="test_depth_track",
+        status=JobStatus.COMPLETED,
+        mode="object_detection",
+        queries=["person"],
+        detector_name="yolo11",
+        segmenter_name=None,
+        input_video_path=video_path,
+        output_video_path=None,
+    )
+    storage.create(job)
+    # Add track data for frame 0
+    set_track_data("test_depth_track", 0, [
+        {"instance_id": 1, "label": "person", "bbox": [10, 10, 30, 30]},
+    ])
+    class FakeDepthResult:
+        def __init__(self, h, w):
+            self.depth_map = np.arange(h * w, dtype=np.float32).reshape(h, w)
+            self.focal_length = 1.0
+    class FakeEstimator:
+        def predict(self, frame):
+            h, w = frame.shape[:2]
+            return FakeDepthResult(h, w)
+    monkeypatch.setattr("inspection.depth._depth_estimator", FakeEstimator())
+    from inspection.router import router
+    from fastapi import FastAPI
+    app = FastAPI()
+    app.include_router(router)
+    client = TestClient(app)
+    resp = client.get("/inspect/depth/test_depth_track/0?format=json&track_id=T01")
+    assert resp.status_code == 200
+    data = resp.json()
+    # Cropped to bbox [10,10,30,30] => 20x20
+    assert data["width"] == 20
+    assert data["height"] == 20
+    from inspection.depth import clear_depth_cache
+    clear_depth_cache()
+    storage.delete("test_depth_track")