Spaces:

BiasLab2025
/

perception

Paused

App Files Files Community

Zhen Ye committed 21 days ago

Commit

78d352c

1 Parent(s): 89b854c

feat: Implement depth-based distance estimation and frontend integration

Browse files

Files changed (6) hide show

app.py +16 -1
frontend/js/core/tracker.js +17 -1
frontend/js/main.js +7 -1
frontend/js/ui/cards.js +5 -1
frontend/js/ui/radar.js +16 -7
inference.py +75 -11

app.py CHANGED Viewed

@@ -34,6 +34,7 @@ from datetime import timedelta
 from pathlib import Path
 import cv2
 from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, RedirectResponse, StreamingResponse
@@ -368,7 +369,7 @@ async def detect_async_endpoint(
     active_depth = depth_estimator if enable_depth else None
     try:
-        processed_frame, detections = process_first_frame(
             str(input_path),
             query_list,
             mode=mode,
@@ -380,6 +381,20 @@ async def detect_async_endpoint(
             enable_gpt=enable_gpt,
         )
         cv2.imwrite(str(first_frame_path), processed_frame)
     except Exception:
         logging.exception("First-frame processing failed.")
         shutil.rmtree(job_dir, ignore_errors=True)

 from pathlib import Path
 import cv2
+import numpy as np
 from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, RedirectResponse, StreamingResponse
     active_depth = depth_estimator if enable_depth else None
     try:
+        processed_frame, detections, depth_map = process_first_frame(
             str(input_path),
             query_list,
             mode=mode,
             enable_gpt=enable_gpt,
         )
         cv2.imwrite(str(first_frame_path), processed_frame)
+        if depth_map is not None:
+             # Simple visualization: Normalize and apply colormap
+             try:
+                 d_min, d_max = np.min(depth_map), np.max(depth_map)
+                 if d_max - d_min > 1e-6:
+                     d_norm = (depth_map - d_min) / (d_max - d_min)
+                 else:
+                     d_norm = np.zeros_like(depth_map)
+                 d_uint8 = (d_norm * 255).astype(np.uint8)
+                 d_color = cv2.applyColorMap(d_uint8, cv2.COLORMAP_INFERNO)
+                 cv2.imwrite(str(first_frame_depth_path), d_color)
+             except Exception as e:
+                 logging.warning(f"Failed to save depth map: {e}")
     except Exception:
         logging.exception("First-frame processing failed.")
         shutil.rmtree(job_dir, ignore_errors=True)

frontend/js/core/tracker.js CHANGED Viewed

@@ -32,7 +32,9 @@ APP.core.tracker.matchAndUpdateTracks = function (dets, dtSec) {
         bbox: normBBox(d.bbox, w, h),
         label: d.class,
         score: d.score,
-        depth_rel: Number.isFinite(d.depth_rel) ? d.depth_rel : null
     }));
     // mark all tracks as unmatched
@@ -89,6 +91,13 @@ APP.core.tracker.matchAndUpdateTracks = function (dets, dtSec) {
             if (Number.isFinite(best.depth_rel)) {
                 tr.depth_rel = best.depth_rel;
             }
             tr.lastSeen = now();
         } else {
             // Decay velocity
@@ -118,6 +127,8 @@ APP.core.tracker.matchAndUpdateTracks = function (dets, dtSec) {
                 baseDwell_s: 5.5,
                 reqP_kW: 42,
                 depth_rel: detObjs[i].depth_rel,
                 // GPT properties
                 gpt_distance_m: null,
@@ -182,7 +193,12 @@ APP.core.tracker.syncWithBackend = async function (frameIdx) {
                 score: d.score,
                 angle_deg: d.angle_deg,
                 gpt_distance_m: d.gpt_distance_m,
                 speed_kph: d.speed_kph,
                 // Keep UI state fields
                 lastSeen: Date.now(),

         bbox: normBBox(d.bbox, w, h),
         label: d.class,
         score: d.score,
+        depth_rel: Number.isFinite(d.depth_rel) ? d.depth_rel : null,
+        depth_est_m: d.depth_est_m,
+        depth_valid: d.depth_valid
     }));
     // mark all tracks as unmatched
             if (Number.isFinite(best.depth_rel)) {
                 tr.depth_rel = best.depth_rel;
             }
+            if (best.depth_valid) {
+                // EMA Smoothing
+                const newD = best.depth_est_m;
+                if (tr.depth_est_m == null) tr.depth_est_m = newD;
+                else tr.depth_est_m = tr.depth_est_m * 0.7 + newD * 0.3;
+                tr.depth_valid = true;
+            }
             tr.lastSeen = now();
         } else {
             // Decay velocity
                 baseDwell_s: 5.5,
                 reqP_kW: 42,
                 depth_rel: detObjs[i].depth_rel,
+                depth_est_m: detObjs[i].depth_est_m,
+                depth_valid: detObjs[i].depth_valid,
                 // GPT properties
                 gpt_distance_m: null,
                 score: d.score,
                 angle_deg: d.angle_deg,
                 gpt_distance_m: d.gpt_distance_m,
+                angle_deg: d.angle_deg,
+                gpt_distance_m: d.gpt_distance_m,
                 speed_kph: d.speed_kph,
+                depth_est_m: d.depth_est_m,
+                depth_rel: d.depth_rel,
+                depth_valid: d.depth_valid,
                 // Keep UI state fields
                 lastSeen: Date.now(),

frontend/js/main.js CHANGED Viewed

@@ -522,7 +522,10 @@ document.addEventListener("DOMContentLoaded", () => {
                 reqP_kW: 40,
                 maxP_kW: 0,
                 pkill: 0,
-                depth_rel: Number.isFinite(d.depth_rel) ? d.depth_rel : null,
                 gpt_distance_m: d.gpt_distance_m || null,
                 gpt_direction: d.gpt_direction || null,
                 gpt_description: d.gpt_description || null
@@ -551,6 +554,9 @@ document.addEventListener("DOMContentLoaded", () => {
             baseDwell_s: d.baseDwell_s || 4.0,
             reqP_kW: d.reqP_kW || 35,
             depth_rel: d.depth_rel,
             gpt_distance_m: d.gpt_distance_m,
             gpt_direction: d.gpt_direction,
             gpt_description: d.gpt_description,

                 reqP_kW: 40,
                 maxP_kW: 0,
                 pkill: 0,
+                // New depth fields
+                depth_est_m: (d.depth_est_m !== undefined && d.depth_est_m !== null) ? d.depth_est_m : null,
+                depth_rel: (d.depth_rel !== undefined && d.depth_rel !== null) ? d.depth_rel : null,
+                depth_valid: d.depth_valid ?? false,
                 gpt_distance_m: d.gpt_distance_m || null,
                 gpt_direction: d.gpt_direction || null,
                 gpt_description: d.gpt_description || null
             baseDwell_s: d.baseDwell_s || 4.0,
             reqP_kW: d.reqP_kW || 35,
             depth_rel: d.depth_rel,
+            depth_est_m: d.depth_est_m,
+            depth_valid: d.depth_valid,
+            lastDepthBbox: d.depth_valid ? { ...d.bbox } : null,
             gpt_distance_m: d.gpt_distance_m,
             gpt_direction: d.gpt_direction,
             gpt_description: d.gpt_description,

frontend/js/ui/cards.js CHANGED Viewed

@@ -23,8 +23,12 @@ APP.ui.cards.renderFrameTrackList = function () {
         let rangeStr = "---";
         let bearingStr = "---";
-        if (det.gpt_distance_m) {
             rangeStr = `${det.gpt_distance_m}m (GPT)`;
         }
         if (det.gpt_direction) {

         let rangeStr = "---";
         let bearingStr = "---";
+        if (det.depth_valid && det.depth_est_m != null) {
+            rangeStr = `${Math.round(det.depth_est_m)}m (Depth)`;
+        } else if (det.gpt_distance_m) {
             rangeStr = `${det.gpt_distance_m}m (GPT)`;
+        } else if (det.baseRange_m) {
+            rangeStr = `${Math.round(det.baseRange_m)}m (Area)`;
         }
         if (det.gpt_direction) {

frontend/js/ui/radar.js CHANGED Viewed

@@ -84,16 +84,24 @@ APP.ui.radar.render = function (canvasId, trackSource) {
     if (source) {
         source.forEach(det => {
             // Determine Range (pixels)
-            let dist = 3000; // default unknown
-            if (det.gpt_distance_m) {
-                dist = det.gpt_distance_m;
             }
-            // Linear scale: 0m -> 0px, 1500m -> R
-            const maxRangeM = 1500;
-            const rPx = (clamp(dist, 0, maxRangeM) / maxRangeM) * R;
             const bx = det.bbox.x + det.bbox.w * 0.5;
             const fw = state.frame.w || 1280;
             const tx = (bx / fw) - 0.5;
@@ -159,6 +167,7 @@ APP.ui.radar.render = function (canvasId, trackSource) {
                 rotation = -Math.PI / 2;
             }
             ctx.rotate(rotation);
             const size = isSelected ? 8 : 6;

     if (source) {
         source.forEach(det => {
             // Determine Range (pixels)
+            let rPx;
+            let dist = 3000;
+            const maxRangeM = 1500;
+            if (det.depth_valid && det.depth_rel != null) {
+                // Use relative depth for accurate relative positioning (0.1 R to R)
+                rPx = (det.depth_rel * 0.9 + 0.1) * R;
+                dist = det.depth_est_m || 3000;
+            } else {
+                // Fallback to absolute metrics
+                if (det.gpt_distance_m) {
+                    dist = det.gpt_distance_m;
+                } else if (det.baseRange_m) {
+                    dist = det.baseRange_m;
+                }
+                rPx = (clamp(dist, 0, maxRangeM) / maxRangeM) * R;
             }
             const bx = det.bbox.x + det.bbox.w * 0.5;
             const fw = state.frame.w || 1280;
             const tx = (bx / fw) - 0.5;
                 rotation = -Math.PI / 2;
             }
+            // Adjust rotation for canvas (clockwise from X+)
             ctx.rotate(rotation);
             const size = isSelected ? 8 : 6;

inference.py CHANGED Viewed

@@ -771,6 +771,46 @@ def extract_first_frame(video_path: str) -> Tuple[np.ndarray, float, int, int]:
     return frame, fps, width, height
 def process_first_frame(
     video_path: str,
     queries: List[str],
@@ -781,27 +821,51 @@ def process_first_frame(
     depth_scale: Optional[float] = None,
     enable_depth_estimator: bool = False,
     enable_gpt: bool = True,  # ENABLED BY DEFAULT
-) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
     frame, _, _, _ = extract_first_frame(video_path)
     if mode == "segmentation":
         processed, _ = infer_segmentation_frame(
             frame, text_queries=queries, segmenter_name=segmenter_name
         )
-        return processed, []
     processed, detections = infer_frame(
         frame, queries, detector_name=detector_name
     )
-    # 1. Legacy Depth Estimation (Optional)
-    if enable_depth_estimator:
-        logging.info("Running legacy depth estimation...")
-        _attach_depth_metrics(
-            frame,
-            detections,
-            depth_estimator_name,
-            _DEPTH_SCALE if depth_scale is None else depth_scale,
-        )
     # 2. GPT-based Distance/Direction Estimation (Explicitly enabled)
     if enable_gpt:

     return frame, fps, width, height
def compute_depth_per_detection(
    depth_map: np.ndarray,
    detections: List[Dict],
    depth_scale: float = 1.0
) -> List[Dict]:
    """Sample depth for each detection bbox, compute relative distances.

    For every detection the central half of its bounding box (a window of
    +/- one quarter of the box size around the box centre) is sampled from
    ``depth_map`` and the median of the finite samples, multiplied by
    ``depth_scale``, is stored on the detection.

    Args:
        depth_map: 2-D array indexed as ``[row, column]``.
        detections: Dicts carrying a ``"bbox"`` entry ``(x1, y1, x2, y2)`` in
            depth-map pixel coordinates. The dicts are updated in place.
        depth_scale: Multiplier applied to the sampled median depth.

    Returns:
        The same ``detections`` list. Each dict gains:
          * ``depth_est_m``: scaled median depth, or ``None`` when fewer than
            10 finite samples were available.
          * ``depth_valid``: whether ``depth_est_m`` could be computed.
          * ``depth_rel``: position of ``depth_est_m`` between the smallest
            and largest valid depth of this call (0 = smallest, ~1 = largest),
            ``0.5`` when only one detection has a valid depth, ``None`` for
            invalid detections.
    """
    map_h, map_w = depth_map.shape[0], depth_map.shape[1]
    depths = []
    for det in detections:
        # Detectors may emit float coordinates; slice bounds must be ints.
        x1, y1, x2, y2 = (int(v) for v in det["bbox"])
        # Sample central 50% region for robustness (avoids edge artifacts)
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
        hw, hh = max(1, (x2 - x1) // 4), max(1, (y2 - y1) // 4)
        # Clamp BOTH ends into the map. A negative end index would otherwise
        # wrap around and silently sample an unrelated part of the image.
        y_start = min(max(0, cy - hh), map_h)
        y_end = min(max(0, cy + hh), map_h)
        x_start = min(max(0, cx - hw), map_w)
        x_end = min(max(0, cx + hw), map_w)
        region = depth_map[y_start:y_end, x_start:x_end]
        valid = region[np.isfinite(region)]
        if len(valid) >= 10:
            det["depth_est_m"] = float(np.median(valid)) * depth_scale
            det["depth_valid"] = True
            depths.append(det["depth_est_m"])
        else:
            det["depth_est_m"] = None
            det["depth_valid"] = False
            det["depth_rel"] = None

    if len(depths) == 1:
        # Only one valid depth: there is nothing to normalise against, so
        # assign a neutral relative distance instead of collapsing to 0.
        for det in detections:
            if det.get("depth_valid"):
                det["depth_rel"] = 0.5
    elif depths:
        # Per-frame relative normalization
        min_d, max_d = min(depths), max(depths)
        span = max_d - min_d + 1e-6  # epsilon guards against division by zero
        for det in detections:
            if det.get("depth_valid"):
                det["depth_rel"] = (det["depth_est_m"] - min_d) / span
    return detections
 def process_first_frame(
     video_path: str,
     queries: List[str],
     depth_scale: Optional[float] = None,
     enable_depth_estimator: bool = False,
     enable_gpt: bool = True,  # ENABLED BY DEFAULT
+) -> Tuple[np.ndarray, List[Dict[str, Any]], Optional[np.ndarray]]:
     frame, _, _, _ = extract_first_frame(video_path)
     if mode == "segmentation":
         processed, _ = infer_segmentation_frame(
             frame, text_queries=queries, segmenter_name=segmenter_name
         )
+        return processed, [], None
     processed, detections = infer_frame(
         frame, queries, detector_name=detector_name
     )
+    # 1. Synchronous Depth Estimation (HF Backend)
+    depth_map = None
+    # If a specific depth estimator is requested OR if generic "enable" flag is on
+    should_run_depth = (depth_estimator_name is not None) or enable_depth_estimator
+    if should_run_depth and detections:
+        try:
+            # Resolve name: if none given, default to "depth"
+            d_name = depth_estimator_name if depth_estimator_name else "depth"
+            scale = depth_scale if depth_scale is not None else 1.0
+            logging.info(f"Running synchronous depth estimation with {d_name} (scale={scale})...")
+            estimator = load_depth_estimator(d_name)
+            # Run prediction
+            with _get_model_lock("depth", estimator.name):
+                 result = estimator.predict(frame)
+            depth_map = result.depth_map
+            # Compute per-detection depth metrics
+            detections = compute_depth_per_detection(depth_map, detections, scale)
+        except Exception as e:
+            logging.exception(f"First frame depth failed: {e}")
+            # Mark all detections as depth_valid=False just in case
+            for det in detections:
+                det["depth_est_m"] = None
+                det["depth_rel"] = None
+                det["depth_valid"] = False
+    return processed, detections, depth_map
     # 2. GPT-based Distance/Direction Estimation (Explicitly enabled)
     if enable_gpt: