Zhen Ye committed on
Commit
1c2827d
·
1 Parent(s): 5e832fe

added estimated/relative distance

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. LaserPerception/LaserPerception.js +96 -13
  3. app.py +1 -0
  4. inference.py +71 -0
.gitignore CHANGED
@@ -7,3 +7,4 @@ __pycache__/
7
  .env
8
  *.mdcheckpoints/
9
  checkpoints/
 
 
7
  .env
8
  *.mdcheckpoints/
9
  checkpoints/
10
+ *.md
LaserPerception/LaserPerception.js CHANGED
@@ -1725,10 +1725,16 @@
1725
  const y1 = bbox[1] || 0;
1726
  const x2 = bbox[2] || 0;
1727
  const y2 = bbox[3] || 0;
 
 
 
1728
  return {
1729
  bbox: [x1, y1, Math.max(1, x2 - x1), Math.max(1, y2 - y1)],
1730
  class: d.label || "drone",
1731
- score: d.score ?? 0
 
 
 
1732
  };
1733
  });
1734
  }
@@ -1748,10 +1754,16 @@
1748
  const y1 = bbox[1] || 0;
1749
  const x2 = bbox[2] || 0;
1750
  const y2 = bbox[3] || 0;
 
 
 
1751
  return {
1752
  bbox: [x1, y1, Math.max(1, x2 - x1), Math.max(1, y2 - y1)],
1753
  class: d.label || "object",
1754
- score: d.score ?? 0
 
 
 
1755
  };
1756
  });
1757
  }
@@ -1961,7 +1973,10 @@
1961
  baseDwell_s: null,
1962
  reqP_kW: null,
1963
  maxP_kW: null,
1964
- pkill: null
 
 
 
1965
  };
1966
  });
1967
 
@@ -2153,7 +2168,11 @@
2153
  div.className = "obj" + (d.id === state.selectedId ? " active" : "");
2154
  div.dataset.id = d.id;
2155
 
2156
- const rangeTxt = d.baseRange_m ? `${Math.round(d.baseRange_m)} m` : "—";
 
 
 
 
2157
  const dwellTxt = d.baseDwell_s ? `${d.baseDwell_s.toFixed(1)} s` : "—";
2158
  const pkTxt = (d.pkill != null) ? `${Math.round(d.pkill * 100)}%` : "—";
2159
 
@@ -2166,7 +2185,8 @@
2166
  <div style="display:flex; gap:8px; align-items:center; justify-content:flex-end;">${isMissionFocusLabel(d.label) ? `<span class="badge" style="border-color: rgba(34,211,238,.45); background: rgba(34,211,238,.08)">FOCUS</span>` : ""}<div class="badge"><span class="dot" style="width:7px;height:7px"></span><span>${Math.round(d.score * 100)}%</span></div></div>
2167
  </div>
2168
  <div class="meta">
2169
- <span class="badge">RANGE:${rangeTxt}</span>
 
2170
  <span class="badge">DWELL:${dwellTxt}</span>
2171
  <span class="badge">P(k):${pkTxt}</span>
2172
  <span class="badge">AIM:${escapeHtml(d.aim?.label || "center")}</span>
@@ -2476,6 +2496,9 @@
2476
  baseRange_m: d.baseRange_m || +rangeBase.value,
2477
  baseDwell_s: d.baseDwell_s || 4.0,
2478
  reqP_kW: d.reqP_kW || 35,
 
 
 
2479
  lastSeen: now(),
2480
  vx: 0, vy: 0,
2481
  dwellAccum: 0,
@@ -2507,6 +2530,9 @@
2507
  baseRange_m: +rangeBase.value,
2508
  baseDwell_s: 5.0,
2509
  reqP_kW: 40,
 
 
 
2510
  lastSeen: now(),
2511
  vx: 0, vy: 0,
2512
  dwellAccum: 0,
@@ -2585,7 +2611,10 @@
2585
  const detObjs = dets.map(d => ({
2586
  bbox: normBBox(d.bbox, w, h),
2587
  label: d.class,
2588
- score: d.score
 
 
 
2589
  }));
2590
 
2591
  // mark all tracks as unmatched
@@ -2624,6 +2653,11 @@
2624
 
2625
  tr.label = best.label || tr.label;
2626
  tr.score = best.score || tr.score;
 
 
 
 
 
2627
  tr.lastSeen = now();
2628
  }
2629
  }
@@ -2647,6 +2681,9 @@
2647
  baseRange_m: +rangeBase.value,
2648
  baseDwell_s: 5.5,
2649
  reqP_kW: 42,
 
 
 
2650
  lastSeen: now(),
2651
  vx: 0, vy: 0,
2652
  dwellAccum: 0,
@@ -2672,6 +2709,24 @@
2672
  });
2673
  }
2674
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2675
  function rangeFromArea(track) {
2676
  const w = videoEngage.videoWidth || state.frame.w;
2677
  const h = videoEngage.videoHeight || state.frame.h;
@@ -2681,6 +2736,11 @@
2681
  return clamp(track.baseRange_m * rel, 80, 16000);
2682
  }
2683
 
 
 
 
 
 
2684
  function dwellFromRange(track, range_m) {
2685
  const mp = maxPowerAtTarget(range_m);
2686
  const baseReq = track.reqP_kW || 40;
@@ -2751,8 +2811,12 @@
2751
 
2752
  // update dwell bar UI
2753
  const pct = clamp(tr.dwellAccum / Math.max(0.001, reqD), 0, 1) * 100;
 
 
 
 
2754
  dwellBar.style.width = `${pct.toFixed(0)}%`;
2755
- dwellText.textContent = `${tr.id} · ${tr.state} · ${(tr.dwellAccum).toFixed(1)}s / ${reqD.toFixed(1)}s · R=${Math.round(range)}m`;
2756
  }
2757
 
2758
  function pickTrackAt(x, y) {
@@ -2824,6 +2888,7 @@
2824
  const ay = b.y + b.h * tr.aimRel.rely;
2825
 
2826
  const range = rangeFromArea(tr);
 
2827
  const reqD = dwellFromRange(tr, range);
2828
 
2829
  const mp = maxPowerAtTarget(range);
@@ -2860,7 +2925,10 @@
2860
  }
2861
 
2862
  // label with distance + dwell + margin
2863
- const tag = `${tr.id} · R=${Math.round(range)}m · DWELL=${reqD.toFixed(1)}s · ΔP=${margin >= 0 ? "+" : ""}${margin.toFixed(1)}kW`;
 
 
 
2864
  ctx.font = "bold 14px " + getComputedStyle(document.body).fontFamily;
2865
  const tw = ctx.measureText(tag).width;
2866
  const tx = clamp(b.x, 6, w - tw - 12);
@@ -2914,6 +2982,12 @@
2914
 
2915
  alive.forEach(tr => {
2916
  const range = rangeFromArea(tr);
 
 
 
 
 
 
2917
  const reqD = dwellFromRange(tr, range);
2918
  const mp = maxPowerAtTarget(range);
2919
  const margin = mp.Ptar - (tr.reqP_kW || 0);
@@ -2930,7 +3004,8 @@
2930
  <div class="badge"><span class="dot" style="background:${margin >= 0 ? "var(--good)" : "var(--bad)"};box-shadow:none"></span><span>${margin >= 0 ? "+" : ""}${margin.toFixed(1)}kW</span></div>
2931
  </div>
2932
  <div class="meta">
2933
- <span class="badge">R:${Math.round(range)}m</span>
 
2934
  <span class="badge">DW:${reqD.toFixed(1)}s</span>
2935
  <span class="badge">Pk:${Math.round(pk * 100)}%</span>
2936
  <span class="badge">AP:${escapeHtml(tr.aimRel.label)}</span>
@@ -2992,8 +3067,12 @@
2992
  // tracks as blips
2993
  const tracks = state.tracker.tracks.filter(t => !t.killed);
2994
  tracks.forEach(tr => {
2995
- const range = rangeFromArea(tr);
2996
- const rr = clamp(range / Math.max(250, +rangeBase.value), 0.1, 3.5); // relative
 
 
 
 
2997
  const b = tr.bbox;
2998
 
2999
  // bearing from image position
@@ -3019,14 +3098,18 @@
3019
 
3020
  ctx.fillStyle = "rgba(255,255,255,.75)";
3021
  ctx.font = "11px " + getComputedStyle(document.body).fontFamily;
3022
- ctx.fillText(tr.id, px + 8, py + 4);
 
 
 
 
3023
  });
3024
 
3025
  // label
3026
  ctx.fillStyle = "rgba(255,255,255,.55)";
3027
  ctx.font = "11px " + getComputedStyle(document.body).fontFamily;
3028
  ctx.fillText("CENTER: OWN-SHIP", 10, 18);
3029
- ctx.fillText("BLIPS: RELATIVE RANGE + BEARING (from video kinematics)", 10, 36);
3030
  }
3031
 
3032
  // ========= Resizing overlays to match video viewports =========
 
1725
  const y1 = bbox[1] || 0;
1726
  const x2 = bbox[2] || 0;
1727
  const y2 = bbox[3] || 0;
1728
+ const depthEst = Number.isFinite(d.depth_est_m) ? d.depth_est_m : null;
1729
+ const depthRel = Number.isFinite(d.depth_rel) ? d.depth_rel : null;
1730
+ const depthValid = d.depth_valid === true && depthEst !== null;
1731
  return {
1732
  bbox: [x1, y1, Math.max(1, x2 - x1), Math.max(1, y2 - y1)],
1733
  class: d.label || "drone",
1734
+ score: d.score ?? 0,
1735
+ depth_est_m: depthEst,
1736
+ depth_rel: depthRel,
1737
+ depth_valid: depthValid
1738
  };
1739
  });
1740
  }
 
1754
  const y1 = bbox[1] || 0;
1755
  const x2 = bbox[2] || 0;
1756
  const y2 = bbox[3] || 0;
1757
+ const depthEst = Number.isFinite(d.depth_est_m) ? d.depth_est_m : null;
1758
+ const depthRel = Number.isFinite(d.depth_rel) ? d.depth_rel : null;
1759
+ const depthValid = d.depth_valid === true && depthEst !== null;
1760
  return {
1761
  bbox: [x1, y1, Math.max(1, x2 - x1), Math.max(1, y2 - y1)],
1762
  class: d.label || "object",
1763
+ score: d.score ?? 0,
1764
+ depth_est_m: depthEst,
1765
+ depth_rel: depthRel,
1766
+ depth_valid: depthValid
1767
  };
1768
  });
1769
  }
 
1973
  baseDwell_s: null,
1974
  reqP_kW: null,
1975
  maxP_kW: null,
1976
+ pkill: null,
1977
+ depth_est_m: Number.isFinite(d.depth_est_m) ? d.depth_est_m : null,
1978
+ depth_rel: Number.isFinite(d.depth_rel) ? d.depth_rel : null,
1979
+ depth_valid: d.depth_valid === true
1980
  };
1981
  });
1982
 
 
2168
  div.className = "obj" + (d.id === state.selectedId ? " active" : "");
2169
  div.dataset.id = d.id;
2170
 
2171
+ const rangeData = getDisplayRange(d, d.baseRange_m);
2172
+ const rangeTxt = Number.isFinite(rangeData.range) ? `${Math.round(rangeData.range)} m` : "—";
2173
+ const rangeSuffix = rangeTxt === "—" ? "" : ` (${rangeData.source})`;
2174
+ const relVal = getDisplayRel(d);
2175
+ const relTxt = relVal != null ? relVal.toFixed(2) : "—";
2176
  const dwellTxt = d.baseDwell_s ? `${d.baseDwell_s.toFixed(1)} s` : "—";
2177
  const pkTxt = (d.pkill != null) ? `${Math.round(d.pkill * 100)}%` : "—";
2178
 
 
2185
  <div style="display:flex; gap:8px; align-items:center; justify-content:flex-end;">${isMissionFocusLabel(d.label) ? `<span class="badge" style="border-color: rgba(34,211,238,.45); background: rgba(34,211,238,.08)">FOCUS</span>` : ""}<div class="badge"><span class="dot" style="width:7px;height:7px"></span><span>${Math.round(d.score * 100)}%</span></div></div>
2186
  </div>
2187
  <div class="meta">
2188
+ <span class="badge">RANGE:${rangeTxt}${rangeSuffix}</span>
2189
+ <span class="badge">REL:${relTxt}</span>
2190
  <span class="badge">DWELL:${dwellTxt}</span>
2191
  <span class="badge">P(k):${pkTxt}</span>
2192
  <span class="badge">AIM:${escapeHtml(d.aim?.label || "center")}</span>
 
2496
  baseRange_m: d.baseRange_m || +rangeBase.value,
2497
  baseDwell_s: d.baseDwell_s || 4.0,
2498
  reqP_kW: d.reqP_kW || 35,
2499
+ depth_est_m: Number.isFinite(d.depth_est_m) ? d.depth_est_m : null,
2500
+ depth_rel: Number.isFinite(d.depth_rel) ? d.depth_rel : null,
2501
+ depth_valid: d.depth_valid === true,
2502
  lastSeen: now(),
2503
  vx: 0, vy: 0,
2504
  dwellAccum: 0,
 
2530
  baseRange_m: +rangeBase.value,
2531
  baseDwell_s: 5.0,
2532
  reqP_kW: 40,
2533
+ depth_est_m: Number.isFinite(d.depth_est_m) ? d.depth_est_m : null,
2534
+ depth_rel: Number.isFinite(d.depth_rel) ? d.depth_rel : null,
2535
+ depth_valid: d.depth_valid === true,
2536
  lastSeen: now(),
2537
  vx: 0, vy: 0,
2538
  dwellAccum: 0,
 
2611
  const detObjs = dets.map(d => ({
2612
  bbox: normBBox(d.bbox, w, h),
2613
  label: d.class,
2614
+ score: d.score,
2615
+ depth_est_m: Number.isFinite(d.depth_est_m) ? d.depth_est_m : null,
2616
+ depth_rel: Number.isFinite(d.depth_rel) ? d.depth_rel : null,
2617
+ depth_valid: d.depth_valid === true
2618
  }));
2619
 
2620
  // mark all tracks as unmatched
 
2653
 
2654
  tr.label = best.label || tr.label;
2655
  tr.score = best.score || tr.score;
2656
+ if (best.depth_valid && Number.isFinite(best.depth_est_m)) {
2657
+ tr.depth_est_m = best.depth_est_m;
2658
+ tr.depth_rel = Number.isFinite(best.depth_rel) ? best.depth_rel : tr.depth_rel;
2659
+ tr.depth_valid = true;
2660
+ }
2661
  tr.lastSeen = now();
2662
  }
2663
  }
 
2681
  baseRange_m: +rangeBase.value,
2682
  baseDwell_s: 5.5,
2683
  reqP_kW: 42,
2684
+ depth_est_m: detObjs[i].depth_est_m,
2685
+ depth_rel: detObjs[i].depth_rel,
2686
+ depth_valid: detObjs[i].depth_valid,
2687
  lastSeen: now(),
2688
  vx: 0, vy: 0,
2689
  dwellAccum: 0,
 
2709
  });
2710
  }
2711
 
2712
/**
 * Reports whether an item carries a usable depth estimate.
 *
 * @param {object|null|undefined} item - Detection or track record; may be missing.
 * @returns {boolean} `true` only when `item.depth_valid` is exactly `true` and
 *   `item.depth_est_m` is a finite number; otherwise `false`.
 */
function hasValidDepth(item) {
  // Coerce the guard explicitly: a bare `item && ...` hands back the falsy
  // item itself (null / undefined / 0) rather than `false` when item is missing.
  return Boolean(item) && item.depth_valid === true && Number.isFinite(item.depth_est_m);
}
2715
+
2716
/**
 * Picks the range value to display for an item, together with its origin.
 *
 * @param {object|null|undefined} item - Detection or track record.
 * @param {*} fallbackRange - Value reported when the item has no valid depth;
 *   passed through unchanged.
 * @returns {{range: *, source: string}} `{ range: item.depth_est_m, source: "depth" }`
 *   when `hasValidDepth(item)` is truthy, else `{ range: fallbackRange, source: "area" }`.
 */
function getDisplayRange(item, fallbackRange) {
  const useDepth = hasValidDepth(item);
  return useDepth
    ? { range: item.depth_est_m, source: "depth" }
    : { range: fallbackRange, source: "area" };
}
2722
+
2723
/**
 * Reads the relative-depth value of an item for display.
 *
 * @param {object|null|undefined} item - Detection or track record.
 * @returns {number|null} `item.depth_rel` when the item is truthy and the
 *   value is a finite number; `null` in every other case.
 */
function getDisplayRel(item) {
  const rel = item ? item.depth_rel : undefined;
  return Number.isFinite(rel) ? rel : null;
}
2729
+
2730
  function rangeFromArea(track) {
2731
  const w = videoEngage.videoWidth || state.frame.w;
2732
  const h = videoEngage.videoHeight || state.frame.h;
 
2736
  return clamp(track.baseRange_m * rel, 80, 16000);
2737
  }
2738
 
2739
/**
 * Resolves the display range for a track: the result of `getDisplayRange`
 * with the track's `rangeFromArea` value supplied as the fallback.
 *
 * @param {object} track - Track record passed to both helpers.
 * @returns {{range: *, source: string}} Whatever `getDisplayRange` returns.
 */
function getTrackDisplayRange(track) {
  return getDisplayRange(track, rangeFromArea(track));
}
2743
+
2744
  function dwellFromRange(track, range_m) {
2745
  const mp = maxPowerAtTarget(range_m);
2746
  const baseReq = track.reqP_kW || 40;
 
2811
 
2812
  // update dwell bar UI
2813
  const pct = clamp(tr.dwellAccum / Math.max(0.001, reqD), 0, 1) * 100;
2814
+ const displayRange = getTrackDisplayRange(tr);
2815
+ const rangeLabel = Number.isFinite(displayRange.range)
2816
+ ? `${Math.round(displayRange.range)}m (${displayRange.source})`
2817
+ : "—";
2818
  dwellBar.style.width = `${pct.toFixed(0)}%`;
2819
+ dwellText.textContent = `${tr.id} · ${tr.state} · ${(tr.dwellAccum).toFixed(1)}s / ${reqD.toFixed(1)}s · R=${rangeLabel}`;
2820
  }
2821
 
2822
  function pickTrackAt(x, y) {
 
2888
  const ay = b.y + b.h * tr.aimRel.rely;
2889
 
2890
  const range = rangeFromArea(tr);
2891
+ const displayRange = getTrackDisplayRange(tr);
2892
  const reqD = dwellFromRange(tr, range);
2893
 
2894
  const mp = maxPowerAtTarget(range);
 
2925
  }
2926
 
2927
  // label with distance + dwell + margin
2928
+ const rangeTag = Number.isFinite(displayRange.range)
2929
+ ? `${Math.round(displayRange.range)}m (${displayRange.source})`
2930
+ : "—";
2931
+ const tag = `${tr.id} · R=${rangeTag} · DWELL=${reqD.toFixed(1)}s · ΔP=${margin >= 0 ? "+" : ""}${margin.toFixed(1)}kW`;
2932
  ctx.font = "bold 14px " + getComputedStyle(document.body).fontFamily;
2933
  const tw = ctx.measureText(tag).width;
2934
  const tx = clamp(b.x, 6, w - tw - 12);
 
2982
 
2983
  alive.forEach(tr => {
2984
  const range = rangeFromArea(tr);
2985
+ const displayRange = getTrackDisplayRange(tr);
2986
+ const rangeTxt = Number.isFinite(displayRange.range)
2987
+ ? `${Math.round(displayRange.range)}m (${displayRange.source})`
2988
+ : "—";
2989
+ const relVal = getDisplayRel(tr);
2990
+ const relTxt = relVal != null ? relVal.toFixed(2) : "—";
2991
  const reqD = dwellFromRange(tr, range);
2992
  const mp = maxPowerAtTarget(range);
2993
  const margin = mp.Ptar - (tr.reqP_kW || 0);
 
3004
  <div class="badge"><span class="dot" style="background:${margin >= 0 ? "var(--good)" : "var(--bad)"};box-shadow:none"></span><span>${margin >= 0 ? "+" : ""}${margin.toFixed(1)}kW</span></div>
3005
  </div>
3006
  <div class="meta">
3007
+ <span class="badge">R:${rangeTxt}</span>
3008
+ <span class="badge">REL:${relTxt}</span>
3009
  <span class="badge">DW:${reqD.toFixed(1)}s</span>
3010
  <span class="badge">Pk:${Math.round(pk * 100)}%</span>
3011
  <span class="badge">AP:${escapeHtml(tr.aimRel.label)}</span>
 
3067
  // tracks as blips
3068
  const tracks = state.tracker.tracks.filter(t => !t.killed);
3069
  tracks.forEach(tr => {
3070
+ const areaRange = rangeFromArea(tr);
3071
+ const displayRange = getTrackDisplayRange(tr);
3072
+ const relVal = getDisplayRel(tr);
3073
+ const rr = relVal != null
3074
+ ? clamp(0.2 + relVal * 3.0, 0.1, 3.2)
3075
+ : clamp(areaRange / Math.max(250, +rangeBase.value), 0.1, 3.5); // relative
3076
  const b = tr.bbox;
3077
 
3078
  // bearing from image position
 
3098
 
3099
  ctx.fillStyle = "rgba(255,255,255,.75)";
3100
  ctx.font = "11px " + getComputedStyle(document.body).fontFamily;
3101
+ const rangeLabel = Number.isFinite(displayRange.range)
3102
+ ? `${Math.round(displayRange.range)}m`
3103
+ : `${Math.round(areaRange)}m`;
3104
+ const relLabel = relVal != null ? ` · Rel=${relVal.toFixed(2)}` : "";
3105
+ ctx.fillText(`${tr.id} · R=${rangeLabel}${relLabel}`, px + 8, py + 4);
3106
  });
3107
 
3108
  // label
3109
  ctx.fillStyle = "rgba(255,255,255,.55)";
3110
  ctx.font = "11px " + getComputedStyle(document.body).fontFamily;
3111
  ctx.fillText("CENTER: OWN-SHIP", 10, 18);
3112
+ ctx.fillText("BLIPS: DEPTH RELATIVE RANGE + BEARING (area fallback)", 10, 36);
3113
  }
3114
 
3115
  // ========= Resizing overlays to match video viewports =========
app.py CHANGED
@@ -312,6 +312,7 @@ async def detect_async_endpoint(
312
  mode=mode,
313
  detector_name=detector_name,
314
  segmenter_name=segmenter,
 
315
  )
316
  cv2.imwrite(str(first_frame_path), processed_frame)
317
  except Exception:
 
312
  mode=mode,
313
  detector_name=detector_name,
314
  segmenter_name=segmenter,
315
+ depth_estimator_name=depth_estimator,
316
  )
317
  cv2.imwrite(str(first_frame_path), processed_frame)
318
  except Exception:
inference.py CHANGED
@@ -1,4 +1,5 @@
1
  import logging
 
2
  from threading import RLock
3
  from typing import Any, Dict, List, Optional, Sequence, Tuple
4
 
@@ -167,6 +168,7 @@ def _build_detection_records(
167
 
168
  _MODEL_LOCKS: Dict[str, RLock] = {}
169
  _MODEL_LOCKS_GUARD = RLock()
 
170
 
171
 
172
  def _get_model_lock(kind: str, name: str) -> RLock:
@@ -179,6 +181,67 @@ def _get_model_lock(kind: str, name: str) -> RLock:
179
  return lock
180
 
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  def infer_frame(
183
  frame: np.ndarray,
184
  queries: Sequence[str],
@@ -244,6 +307,8 @@ def process_first_frame(
244
  mode: str,
245
  detector_name: Optional[str] = None,
246
  segmenter_name: Optional[str] = None,
 
 
247
  ) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
248
  frame, _, _, _ = extract_first_frame(video_path)
249
  if mode == "segmentation":
@@ -254,6 +319,12 @@ def process_first_frame(
254
  processed, detections = infer_frame(
255
  frame, queries, detector_name=detector_name
256
  )
 
 
 
 
 
 
257
  return processed, detections
258
 
259
 
 
1
  import logging
2
+ import os
3
  from threading import RLock
4
  from typing import Any, Dict, List, Optional, Sequence, Tuple
5
 
 
168
 
169
  _MODEL_LOCKS: Dict[str, RLock] = {}
170
  _MODEL_LOCKS_GUARD = RLock()
171
+ _DEPTH_SCALE = float(os.getenv("DEPTH_SCALE", "1.0"))
172
 
173
 
174
  def _get_model_lock(kind: str, name: str) -> RLock:
 
181
  return lock
182
 
183
 
184
def _attach_depth_metrics(
    frame: np.ndarray,
    detections: List[Dict[str, Any]],
    depth_estimator_name: Optional[str],
    depth_scale: float,
) -> None:
    """Annotate ``detections`` in place with per-box depth statistics.

    Runs the named depth estimator once on ``frame`` and, for each detection,
    writes three keys:

    * ``depth_est_m``: median of the finite depth-map values inside the
      detection's bbox, multiplied by ``depth_scale``; ``None`` when unavailable.
    * ``depth_rel``: ``depth_est_m`` min-max normalised over the valid
      detections of this call; ``None`` when unavailable. With a single valid
      detection (or all-equal depths) the value is ``0.0``.
    * ``depth_valid``: ``True`` only when ``depth_est_m`` was computed.

    Args:
        frame: Image handed to ``estimator.predict``.
        detections: Detection records; each is expected to hold a ``"bbox"``
            of at least four values read as ``x1, y1, x2, y2``.
        depth_estimator_name: Estimator to load; falsy disables the step and
            leaves ``detections`` untouched.
        depth_scale: Multiplier applied to the raw median depth.

    Returns:
        None. ``detections`` is mutated in place.

    NOTE(review): bbox coordinates are used directly as depth-map pixel
    indices, which assumes the estimator returns a map at the same resolution
    as ``frame`` -- confirm against the estimator implementations.
    NOTE(review): the ``_m`` suffix suggests meters; whether the scaled value
    is metric depends on the estimator and ``depth_scale`` -- confirm.
    """
    if not detections or not depth_estimator_name:
        return

    from models.depth_estimators.model_loader import load_depth_estimator

    estimator = load_depth_estimator(depth_estimator_name)
    lock = _get_model_lock("depth", estimator.name)
    with lock:
        depth_result = estimator.predict(frame)

    # Initialise the depth keys before any early exit so that every failure
    # path after inference (empty map, bad bbox, no finite samples) leaves
    # the same record shape behind.
    for det in detections:
        det["depth_est_m"] = None
        det["depth_rel"] = None
        det["depth_valid"] = False

    depth_map = depth_result.depth_map
    if depth_map is None or depth_map.size == 0:
        return

    height, width = depth_map.shape[:2]
    valid_depths: List[float] = []

    for det in detections:
        bbox = det.get("bbox")
        if not bbox or len(bbox) < 4:
            continue

        # int() raises on NaN/inf and on non-numeric values; one malformed box
        # must not abort depth annotation for every other detection.
        try:
            coords = [float(coord) for coord in bbox[:4]]
        except (TypeError, ValueError):
            continue
        if not np.isfinite(coords).all():
            continue

        x1, y1, x2, y2 = [int(coord) for coord in coords]
        # Clamp to the map and force a window of at least one pixel.
        x1 = max(0, min(width - 1, x1))
        y1 = max(0, min(height - 1, y1))
        x2 = max(x1 + 1, min(width, x2))
        y2 = max(y1 + 1, min(height, y2))

        patch = depth_map[y1:y2, x1:x2]
        finite = patch[np.isfinite(patch)]
        if finite.size == 0:
            continue

        depth_raw = float(np.median(finite))
        depth_est = depth_raw * depth_scale
        det["depth_est_m"] = depth_est
        det["depth_valid"] = True
        valid_depths.append(depth_est)

    if not valid_depths:
        return

    min_depth = float(min(valid_depths))
    max_depth = float(max(valid_depths))
    # Floor the span so identical depths do not divide by zero.
    denom = max(max_depth - min_depth, 1e-6)

    for det in detections:
        if det.get("depth_valid"):
            det["depth_rel"] = (float(det["depth_est_m"]) - min_depth) / denom
243
+
244
+
245
  def infer_frame(
246
  frame: np.ndarray,
247
  queries: Sequence[str],
 
307
  mode: str,
308
  detector_name: Optional[str] = None,
309
  segmenter_name: Optional[str] = None,
310
+ depth_estimator_name: Optional[str] = None,
311
+ depth_scale: Optional[float] = None,
312
  ) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
313
  frame, _, _, _ = extract_first_frame(video_path)
314
  if mode == "segmentation":
 
319
  processed, detections = infer_frame(
320
  frame, queries, detector_name=detector_name
321
  )
322
+ _attach_depth_metrics(
323
+ frame,
324
+ detections,
325
+ depth_estimator_name,
326
+ _DEPTH_SCALE if depth_scale is None else depth_scale,
327
+ )
328
  return processed, detections
329
 
330