Zhen Ye committed on
Commit
78d352c
·
1 Parent(s): 89b854c

feat: Implement depth-based distance estimation and frontend integration

Browse files
app.py CHANGED
@@ -34,6 +34,7 @@ from datetime import timedelta
34
  from pathlib import Path
35
 
36
  import cv2
 
37
  from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
38
  from fastapi.middleware.cors import CORSMiddleware
39
  from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, RedirectResponse, StreamingResponse
@@ -368,7 +369,7 @@ async def detect_async_endpoint(
368
  active_depth = depth_estimator if enable_depth else None
369
 
370
  try:
371
- processed_frame, detections = process_first_frame(
372
  str(input_path),
373
  query_list,
374
  mode=mode,
@@ -380,6 +381,20 @@ async def detect_async_endpoint(
380
  enable_gpt=enable_gpt,
381
  )
382
  cv2.imwrite(str(first_frame_path), processed_frame)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  except Exception:
384
  logging.exception("First-frame processing failed.")
385
  shutil.rmtree(job_dir, ignore_errors=True)
 
34
  from pathlib import Path
35
 
36
  import cv2
37
+ import numpy as np
38
  from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
39
  from fastapi.middleware.cors import CORSMiddleware
40
  from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, RedirectResponse, StreamingResponse
 
369
  active_depth = depth_estimator if enable_depth else None
370
 
371
  try:
372
+ processed_frame, detections, depth_map = process_first_frame(
373
  str(input_path),
374
  query_list,
375
  mode=mode,
 
381
  enable_gpt=enable_gpt,
382
  )
383
  cv2.imwrite(str(first_frame_path), processed_frame)
384
+
385
+ if depth_map is not None:
386
+ # Simple visualization: Normalize and apply colormap
387
+ try:
388
+ d_min, d_max = np.min(depth_map), np.max(depth_map)
389
+ if d_max - d_min > 1e-6:
390
+ d_norm = (depth_map - d_min) / (d_max - d_min)
391
+ else:
392
+ d_norm = np.zeros_like(depth_map)
393
+ d_uint8 = (d_norm * 255).astype(np.uint8)
394
+ d_color = cv2.applyColorMap(d_uint8, cv2.COLORMAP_INFERNO)
395
+ cv2.imwrite(str(first_frame_depth_path), d_color)
396
+ except Exception as e:
397
+ logging.warning(f"Failed to save depth map: {e}")
398
  except Exception:
399
  logging.exception("First-frame processing failed.")
400
  shutil.rmtree(job_dir, ignore_errors=True)
frontend/js/core/tracker.js CHANGED
@@ -32,7 +32,9 @@ APP.core.tracker.matchAndUpdateTracks = function (dets, dtSec) {
32
  bbox: normBBox(d.bbox, w, h),
33
  label: d.class,
34
  score: d.score,
35
- depth_rel: Number.isFinite(d.depth_rel) ? d.depth_rel : null
 
 
36
  }));
37
 
38
  // mark all tracks as unmatched
@@ -89,6 +91,13 @@ APP.core.tracker.matchAndUpdateTracks = function (dets, dtSec) {
89
  if (Number.isFinite(best.depth_rel)) {
90
  tr.depth_rel = best.depth_rel;
91
  }
 
 
 
 
 
 
 
92
  tr.lastSeen = now();
93
  } else {
94
  // Decay velocity
@@ -118,6 +127,8 @@ APP.core.tracker.matchAndUpdateTracks = function (dets, dtSec) {
118
  baseDwell_s: 5.5,
119
  reqP_kW: 42,
120
  depth_rel: detObjs[i].depth_rel,
 
 
121
 
122
  // GPT properties
123
  gpt_distance_m: null,
@@ -182,7 +193,12 @@ APP.core.tracker.syncWithBackend = async function (frameIdx) {
182
  score: d.score,
183
  angle_deg: d.angle_deg,
184
  gpt_distance_m: d.gpt_distance_m,
 
 
185
  speed_kph: d.speed_kph,
 
 
 
186
 
187
  // Keep UI state fields
188
  lastSeen: Date.now(),
 
32
  bbox: normBBox(d.bbox, w, h),
33
  label: d.class,
34
  score: d.score,
35
+ depth_rel: Number.isFinite(d.depth_rel) ? d.depth_rel : null,
36
+ depth_est_m: d.depth_est_m,
37
+ depth_valid: d.depth_valid
38
  }));
39
 
40
  // mark all tracks as unmatched
 
91
  if (Number.isFinite(best.depth_rel)) {
92
  tr.depth_rel = best.depth_rel;
93
  }
94
+ if (best.depth_valid) {
95
+ // EMA Smoothing
96
+ const newD = best.depth_est_m;
97
+ if (tr.depth_est_m == null) tr.depth_est_m = newD;
98
+ else tr.depth_est_m = tr.depth_est_m * 0.7 + newD * 0.3;
99
+ tr.depth_valid = true;
100
+ }
101
  tr.lastSeen = now();
102
  } else {
103
  // Decay velocity
 
127
  baseDwell_s: 5.5,
128
  reqP_kW: 42,
129
  depth_rel: detObjs[i].depth_rel,
130
+ depth_est_m: detObjs[i].depth_est_m,
131
+ depth_valid: detObjs[i].depth_valid,
132
 
133
  // GPT properties
134
  gpt_distance_m: null,
 
193
  score: d.score,
194
  angle_deg: d.angle_deg,
195
  gpt_distance_m: d.gpt_distance_m,
196
+ angle_deg: d.angle_deg,
197
+ gpt_distance_m: d.gpt_distance_m,
198
  speed_kph: d.speed_kph,
199
+ depth_est_m: d.depth_est_m,
200
+ depth_rel: d.depth_rel,
201
+ depth_valid: d.depth_valid,
202
 
203
  // Keep UI state fields
204
  lastSeen: Date.now(),
frontend/js/main.js CHANGED
@@ -522,7 +522,10 @@ document.addEventListener("DOMContentLoaded", () => {
522
  reqP_kW: 40,
523
  maxP_kW: 0,
524
  pkill: 0,
525
- depth_rel: Number.isFinite(d.depth_rel) ? d.depth_rel : null,
 
 
 
526
  gpt_distance_m: d.gpt_distance_m || null,
527
  gpt_direction: d.gpt_direction || null,
528
  gpt_description: d.gpt_description || null
@@ -551,6 +554,9 @@ document.addEventListener("DOMContentLoaded", () => {
551
  baseDwell_s: d.baseDwell_s || 4.0,
552
  reqP_kW: d.reqP_kW || 35,
553
  depth_rel: d.depth_rel,
 
 
 
554
  gpt_distance_m: d.gpt_distance_m,
555
  gpt_direction: d.gpt_direction,
556
  gpt_description: d.gpt_description,
 
522
  reqP_kW: 40,
523
  maxP_kW: 0,
524
  pkill: 0,
525
+ // New depth fields
526
+ depth_est_m: (d.depth_est_m !== undefined && d.depth_est_m !== null) ? d.depth_est_m : null,
527
+ depth_rel: (d.depth_rel !== undefined && d.depth_rel !== null) ? d.depth_rel : null,
528
+ depth_valid: d.depth_valid ?? false,
529
  gpt_distance_m: d.gpt_distance_m || null,
530
  gpt_direction: d.gpt_direction || null,
531
  gpt_description: d.gpt_description || null
 
554
  baseDwell_s: d.baseDwell_s || 4.0,
555
  reqP_kW: d.reqP_kW || 35,
556
  depth_rel: d.depth_rel,
557
+ depth_est_m: d.depth_est_m,
558
+ depth_valid: d.depth_valid,
559
+ lastDepthBbox: d.depth_valid ? { ...d.bbox } : null,
560
  gpt_distance_m: d.gpt_distance_m,
561
  gpt_direction: d.gpt_direction,
562
  gpt_description: d.gpt_description,
frontend/js/ui/cards.js CHANGED
@@ -23,8 +23,12 @@ APP.ui.cards.renderFrameTrackList = function () {
23
  let rangeStr = "---";
24
  let bearingStr = "---";
25
 
26
- if (det.gpt_distance_m) {
 
 
27
  rangeStr = `${det.gpt_distance_m}m (GPT)`;
 
 
28
  }
29
 
30
  if (det.gpt_direction) {
 
23
  let rangeStr = "---";
24
  let bearingStr = "---";
25
 
26
+ if (det.depth_valid && det.depth_est_m != null) {
27
+ rangeStr = `${Math.round(det.depth_est_m)}m (Depth)`;
28
+ } else if (det.gpt_distance_m) {
29
  rangeStr = `${det.gpt_distance_m}m (GPT)`;
30
+ } else if (det.baseRange_m) {
31
+ rangeStr = `${Math.round(det.baseRange_m)}m (Area)`;
32
  }
33
 
34
  if (det.gpt_direction) {
frontend/js/ui/radar.js CHANGED
@@ -84,16 +84,24 @@ APP.ui.radar.render = function (canvasId, trackSource) {
84
  if (source) {
85
  source.forEach(det => {
86
  // Determine Range (pixels)
87
- let dist = 3000; // default unknown
 
 
88
 
89
- if (det.gpt_distance_m) {
90
- dist = det.gpt_distance_m;
 
 
 
 
 
 
 
 
 
 
91
  }
92
 
93
- // Linear scale: 0m -> 0px, 1500m -> R
94
- const maxRangeM = 1500;
95
- const rPx = (clamp(dist, 0, maxRangeM) / maxRangeM) * R;
96
-
97
  const bx = det.bbox.x + det.bbox.w * 0.5;
98
  const fw = state.frame.w || 1280;
99
  const tx = (bx / fw) - 0.5;
@@ -159,6 +167,7 @@ APP.ui.radar.render = function (canvasId, trackSource) {
159
  rotation = -Math.PI / 2;
160
  }
161
 
 
162
  ctx.rotate(rotation);
163
 
164
  const size = isSelected ? 8 : 6;
 
84
  if (source) {
85
  source.forEach(det => {
86
  // Determine Range (pixels)
87
+ let rPx;
88
+ let dist = 3000;
89
+ const maxRangeM = 1500;
90
 
91
+ if (det.depth_valid && det.depth_rel != null) {
92
+ // Use relative depth for accurate relative positioning (0.1 R to R)
93
+ rPx = (det.depth_rel * 0.9 + 0.1) * R;
94
+ dist = det.depth_est_m || 3000;
95
+ } else {
96
+ // Fallback to absolute metrics
97
+ if (det.gpt_distance_m) {
98
+ dist = det.gpt_distance_m;
99
+ } else if (det.baseRange_m) {
100
+ dist = det.baseRange_m;
101
+ }
102
+ rPx = (clamp(dist, 0, maxRangeM) / maxRangeM) * R;
103
  }
104
 
 
 
 
 
105
  const bx = det.bbox.x + det.bbox.w * 0.5;
106
  const fw = state.frame.w || 1280;
107
  const tx = (bx / fw) - 0.5;
 
167
  rotation = -Math.PI / 2;
168
  }
169
 
170
+ // Adjust rotation for canvas (clockwise from X+)
171
  ctx.rotate(rotation);
172
 
173
  const size = isSelected ? 8 : 6;
inference.py CHANGED
@@ -771,6 +771,46 @@ def extract_first_frame(video_path: str) -> Tuple[np.ndarray, float, int, int]:
771
  return frame, fps, width, height
772
 
773
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
774
  def process_first_frame(
775
  video_path: str,
776
  queries: List[str],
@@ -781,27 +821,51 @@ def process_first_frame(
781
  depth_scale: Optional[float] = None,
782
  enable_depth_estimator: bool = False,
783
  enable_gpt: bool = True, # ENABLED BY DEFAULT
784
- ) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
785
  frame, _, _, _ = extract_first_frame(video_path)
786
  if mode == "segmentation":
787
  processed, _ = infer_segmentation_frame(
788
  frame, text_queries=queries, segmenter_name=segmenter_name
789
  )
790
- return processed, []
791
 
792
  processed, detections = infer_frame(
793
  frame, queries, detector_name=detector_name
794
  )
795
 
796
- # 1. Legacy Depth Estimation (Optional)
797
- if enable_depth_estimator:
798
- logging.info("Running legacy depth estimation...")
799
- _attach_depth_metrics(
800
- frame,
801
- detections,
802
- depth_estimator_name,
803
- _DEPTH_SCALE if depth_scale is None else depth_scale,
804
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
805
 
806
  # 2. GPT-based Distance/Direction Estimation (Explicitly enabled)
807
  if enable_gpt:
 
771
  return frame, fps, width, height
772
 
773
 
774
def compute_depth_per_detection(
    depth_map: np.ndarray,
    detections: List[Dict],
    depth_scale: float = 1.0,
) -> List[Dict]:
    """Sample depth for each detection bbox and compute relative distances.

    For every detection, the median depth over the central 50% of its bbox
    is taken (robust against edge artifacts), multiplied by ``depth_scale``
    and stored in ``det["depth_est_m"]``. ``det["depth_rel"]`` is a per-frame
    normalization of the valid estimates into [0, 1]; a single valid
    detection gets the neutral value 0.5 since relative ordering is
    undefined for one sample.

    Bug fix vs. the original: the ``elif len(detections) == 1`` branch that
    was meant to assign 0.5 was unreachable (a lone valid detection made
    ``depths`` non-empty, so the normalization branch produced ~0.0 instead).

    Args:
        depth_map: 2-D array of per-pixel depth values (may contain NaN/inf).
        detections: list of dicts, each with a ``"bbox"`` of (x1, y1, x2, y2)
            pixel coordinates; mutated in place with depth fields.
        depth_scale: multiplier converting raw depth units to meters.

    Returns:
        The same ``detections`` list, annotated with ``depth_est_m``,
        ``depth_valid`` and ``depth_rel``.
    """
    valid_depths = []
    for det in detections:
        x1, y1, x2, y2 = det["bbox"]
        # Sample the central 50% region for robustness (avoids edge artifacts).
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
        hw, hh = max(1, (x2 - x1) // 4), max(1, (y2 - y1) // 4)
        y_lo, y_hi = max(0, cy - hh), min(depth_map.shape[0], cy + hh)
        x_lo, x_hi = max(0, cx - hw), min(depth_map.shape[1], cx + hw)
        region = depth_map[y_lo:y_hi, x_lo:x_hi]

        finite = region[np.isfinite(region)]
        if finite.size >= 10:  # require enough samples for a stable median
            det["depth_est_m"] = float(np.median(finite)) * depth_scale
            det["depth_valid"] = True
            valid_depths.append(det["depth_est_m"])
        else:
            det["depth_est_m"] = None
            det["depth_valid"] = False
            det["depth_rel"] = None

    if len(valid_depths) == 1:
        # Single valid detection: relative position is undefined, use neutral 0.5.
        for det in detections:
            if det.get("depth_valid"):
                det["depth_rel"] = 0.5
    elif valid_depths:
        # Per-frame relative normalization across all valid detections.
        min_d, max_d = min(valid_depths), max(valid_depths)
        span = max_d - min_d + 1e-6  # epsilon guards identical depths
        for det in detections:
            if det.get("depth_valid"):
                det["depth_rel"] = (det["depth_est_m"] - min_d) / span

    return detections
812
+
813
+
814
  def process_first_frame(
815
  video_path: str,
816
  queries: List[str],
 
821
  depth_scale: Optional[float] = None,
822
  enable_depth_estimator: bool = False,
823
  enable_gpt: bool = True, # ENABLED BY DEFAULT
824
+ ) -> Tuple[np.ndarray, List[Dict[str, Any]], Optional[np.ndarray]]:
825
  frame, _, _, _ = extract_first_frame(video_path)
826
  if mode == "segmentation":
827
  processed, _ = infer_segmentation_frame(
828
  frame, text_queries=queries, segmenter_name=segmenter_name
829
  )
830
+ return processed, [], None
831
 
832
  processed, detections = infer_frame(
833
  frame, queries, detector_name=detector_name
834
  )
835
 
836
+ # 1. Synchronous Depth Estimation (HF Backend)
837
+ depth_map = None
838
+ # If a specific depth estimator is requested OR if generic "enable" flag is on
839
+ should_run_depth = (depth_estimator_name is not None) or enable_depth_estimator
840
+
841
+ if should_run_depth and detections:
842
+ try:
843
+ # Resolve name: if none given, default to "depth"
844
+ d_name = depth_estimator_name if depth_estimator_name else "depth"
845
+ scale = depth_scale if depth_scale is not None else 1.0
846
+
847
+ logging.info(f"Running synchronous depth estimation with {d_name} (scale={scale})...")
848
+ estimator = load_depth_estimator(d_name)
849
+
850
+ # Run prediction
851
+ with _get_model_lock("depth", estimator.name):
852
+ result = estimator.predict(frame)
853
+
854
+ depth_map = result.depth_map
855
+
856
+ # Compute per-detection depth metrics
857
+ detections = compute_depth_per_detection(depth_map, detections, scale)
858
+
859
+ except Exception as e:
860
+ logging.exception(f"First frame depth failed: {e}")
861
+ # Mark all detections as depth_valid=False just in case
862
+ for det in detections:
863
+ det["depth_est_m"] = None
864
+ det["depth_rel"] = None
865
+ det["depth_valid"] = False
866
+
867
+ return processed, detections, depth_map
868
+
869
 
870
  # 2. GPT-based Distance/Direction Estimation (Explicitly enabled)
871
  if enable_gpt: