Spaces:
Sleeping
Sleeping
Zhen Ye
committed on
Commit
·
391cb94
1
Parent(s):
970cc85
feat: fix depth inversion and add video overlays
Browse files- Inverted depth calculation (closer=smaller distance)
- Added bounding box overlays to depth visualization video
- Pipelined detection data to depth generator
- Updated frontend radar logic (implicitly fixed by backend correction)
- app.py +1 -1
- inference.py +86 -32
- jobs/background.py +5 -1
app.py
CHANGED
|
@@ -232,7 +232,7 @@ async def detect_endpoint(
|
|
| 232 |
# Run inference
|
| 233 |
try:
|
| 234 |
detector_name = "drone_yolo" if mode == "drone_detection" else detector
|
| 235 |
-
output_path = run_inference(
|
| 236 |
input_path,
|
| 237 |
output_path,
|
| 238 |
query_list,
|
|
|
|
| 232 |
# Run inference
|
| 233 |
try:
|
| 234 |
detector_name = "drone_yolo" if mode == "drone_detection" else detector
|
| 235 |
+
output_path, _ = run_inference(
|
| 236 |
input_path,
|
| 237 |
output_path,
|
| 238 |
query_list,
|
inference.py
CHANGED
|
@@ -240,8 +240,18 @@ def _attach_depth_metrics(
|
|
| 240 |
if finite.size == 0:
|
| 241 |
continue
|
| 242 |
|
| 243 |
-
depth_raw
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
det["depth_est_m"] = depth_est
|
| 246 |
det["depth_valid"] = True
|
| 247 |
valid_depths.append(depth_est)
|
|
@@ -395,7 +405,7 @@ def run_inference(
|
|
| 395 |
job_id: Optional[str] = None,
|
| 396 |
depth_estimator_name: Optional[str] = None,
|
| 397 |
depth_scale: float = 1.0,
|
| 398 |
-
) -> str:
|
| 399 |
"""
|
| 400 |
Run object detection inference on a video.
|
| 401 |
|
|
@@ -469,11 +479,12 @@ def run_inference(
|
|
| 469 |
detectors = None
|
| 470 |
|
| 471 |
processed_frames_map = {}
|
|
|
|
| 472 |
|
| 473 |
# Process frames
|
| 474 |
if detectors:
|
| 475 |
# Multi-GPU Parallel Processing
|
| 476 |
-
def process_frame_task(frame_idx: int, frame_data: np.ndarray) -> tuple[int, np.ndarray]:
|
| 477 |
# Determine which GPU to use based on frame index (round-robin)
|
| 478 |
gpu_idx = frame_idx % len(detectors)
|
| 479 |
detector_instance = detectors[gpu_idx]
|
|
@@ -485,7 +496,7 @@ def run_inference(
|
|
| 485 |
active_depth_name = depth_estimator_name if (frame_idx % 3 == 0) else None
|
| 486 |
active_depth_instance = depth_instance if (frame_idx % 3 == 0) else None
|
| 487 |
|
| 488 |
-
processed,
|
| 489 |
frame_data,
|
| 490 |
queries,
|
| 491 |
detector_name=None, # Use instance
|
|
@@ -494,7 +505,7 @@ def run_inference(
|
|
| 494 |
detector_instance=detector_instance,
|
| 495 |
depth_estimator_instance=active_depth_instance
|
| 496 |
)
|
| 497 |
-
return frame_idx, processed
|
| 498 |
|
| 499 |
# Thread pool with more workers than GPUs to keep them fed
|
| 500 |
max_workers = min(len(detectors) * 2, 8)
|
|
@@ -507,15 +518,18 @@ def run_inference(
|
|
| 507 |
futures.append(executor.submit(process_frame_task, idx, frame))
|
| 508 |
|
| 509 |
for future in futures:
|
| 510 |
-
idx, result_frame = future.result() # Wait for completion (in order or not, but we verify order)
|
| 511 |
processed_frames_map[idx] = result_frame
|
|
|
|
| 512 |
|
| 513 |
# Reassemble in order
|
| 514 |
processed_frames = [processed_frames_map[i] for i in range(len(processed_frames_map))]
|
|
|
|
| 515 |
|
| 516 |
else:
|
| 517 |
# Standard Single-Threaded Loop
|
| 518 |
processed_frames = []
|
|
|
|
| 519 |
for idx, frame in enumerate(frames):
|
| 520 |
# Check for cancellation every frame
|
| 521 |
_check_cancellation(job_id)
|
|
@@ -527,7 +541,7 @@ def run_inference(
|
|
| 527 |
# Run depth estimation every 3 frames if configured
|
| 528 |
active_depth = depth_estimator_name if (idx % 3 == 0) else None
|
| 529 |
|
| 530 |
-
processed_frame,
|
| 531 |
frame,
|
| 532 |
queries,
|
| 533 |
detector_name=active_detector,
|
|
@@ -535,12 +549,13 @@ def run_inference(
|
|
| 535 |
depth_scale=depth_scale
|
| 536 |
)
|
| 537 |
processed_frames.append(processed_frame)
|
|
|
|
| 538 |
|
| 539 |
# Write output video
|
| 540 |
write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
|
| 541 |
logging.info("Processed video written to: %s", output_video_path)
|
| 542 |
|
| 543 |
-
return output_video_path
|
| 544 |
|
| 545 |
|
| 546 |
def run_segmentation(
|
|
@@ -630,6 +645,7 @@ def run_segmentation(
|
|
| 630 |
def run_depth_inference(
|
| 631 |
input_video_path: str,
|
| 632 |
output_video_path: str,
|
|
|
|
| 633 |
max_frames: Optional[int] = None,
|
| 634 |
depth_estimator_name: str = "depth",
|
| 635 |
first_frame_depth_path: Optional[str] = None,
|
|
@@ -661,8 +677,8 @@ def run_depth_inference(
|
|
| 661 |
if max_frames is not None:
|
| 662 |
frames = frames[:max_frames]
|
| 663 |
|
| 664 |
-
# Process depth with stable normalization
|
| 665 |
-
processed_frames = process_frames_depth(frames, depth_estimator_name, job_id)
|
| 666 |
|
| 667 |
# Write output video
|
| 668 |
write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
|
|
@@ -680,6 +696,7 @@ def run_depth_inference(
|
|
| 680 |
def process_frames_depth(
|
| 681 |
frames: List[np.ndarray],
|
| 682 |
depth_estimator_name: str,
|
|
|
|
| 683 |
job_id: Optional[str] = None,
|
| 684 |
) -> List[np.ndarray]:
|
| 685 |
"""
|
|
@@ -791,32 +808,69 @@ def process_frames_depth(
|
|
| 791 |
|
| 792 |
if not np.isfinite(global_min) or not np.isfinite(global_max):
|
| 793 |
logging.warning("Depth percentiles are non-finite - using min/max fallback")
|
| 794 |
-
|
| 795 |
-
|
| 796 |
|
| 797 |
-
|
| 798 |
-
if abs(global_max - global_min) < 1e-6:
|
| 799 |
-
global_min = float(valid_depths.min())
|
| 800 |
-
global_max = float(valid_depths.max())
|
| 801 |
if abs(global_max - global_min) < 1e-6:
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
|
|
|
|
|
|
|
|
|
| 809 |
|
| 810 |
-
# Second pass:
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
|
|
|
| 815 |
|
| 816 |
-
|
| 817 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
|
| 819 |
-
return
|
| 820 |
|
| 821 |
|
| 822 |
def colorize_depth_map(
|
|
|
|
| 240 |
if finite.size == 0:
|
| 241 |
continue
|
| 242 |
|
| 243 |
+
if depth_raw <= 1e-6:
|
| 244 |
+
det["depth_est_m"] = None
|
| 245 |
+
det["depth_valid"] = False
|
| 246 |
+
continue
|
| 247 |
+
|
| 248 |
+
# Inverted depth: closer objects have higher raw values
|
| 249 |
+
# Distance = Scale / RawValue
|
| 250 |
+
try:
|
| 251 |
+
depth_est = depth_scale / depth_raw
|
| 252 |
+
except ZeroDivisionError:
|
| 253 |
+
continue
|
| 254 |
+
|
| 255 |
det["depth_est_m"] = depth_est
|
| 256 |
det["depth_valid"] = True
|
| 257 |
valid_depths.append(depth_est)
|
|
|
|
| 405 |
job_id: Optional[str] = None,
|
| 406 |
depth_estimator_name: Optional[str] = None,
|
| 407 |
depth_scale: float = 1.0,
|
| 408 |
+
) -> tuple[str, List[List[Dict[str, Any]]]]:
|
| 409 |
"""
|
| 410 |
Run object detection inference on a video.
|
| 411 |
|
|
|
|
| 479 |
detectors = None
|
| 480 |
|
| 481 |
processed_frames_map = {}
|
| 482 |
+
all_detections_map = {}
|
| 483 |
|
| 484 |
# Process frames
|
| 485 |
if detectors:
|
| 486 |
# Multi-GPU Parallel Processing
|
| 487 |
+
def process_frame_task(frame_idx: int, frame_data: np.ndarray) -> tuple[int, np.ndarray, List[Dict[str, Any]]]:
|
| 488 |
# Determine which GPU to use based on frame index (round-robin)
|
| 489 |
gpu_idx = frame_idx % len(detectors)
|
| 490 |
detector_instance = detectors[gpu_idx]
|
|
|
|
| 496 |
active_depth_name = depth_estimator_name if (frame_idx % 3 == 0) else None
|
| 497 |
active_depth_instance = depth_instance if (frame_idx % 3 == 0) else None
|
| 498 |
|
| 499 |
+
processed, frame_dets = infer_frame(
|
| 500 |
frame_data,
|
| 501 |
queries,
|
| 502 |
detector_name=None, # Use instance
|
|
|
|
| 505 |
detector_instance=detector_instance,
|
| 506 |
depth_estimator_instance=active_depth_instance
|
| 507 |
)
|
| 508 |
+
return frame_idx, processed, frame_dets
|
| 509 |
|
| 510 |
# Thread pool with more workers than GPUs to keep them fed
|
| 511 |
max_workers = min(len(detectors) * 2, 8)
|
|
|
|
| 518 |
futures.append(executor.submit(process_frame_task, idx, frame))
|
| 519 |
|
| 520 |
for future in futures:
|
| 521 |
+
idx, result_frame, result_dets = future.result() # Wait for completion (in order or not, but we verify order)
|
| 522 |
processed_frames_map[idx] = result_frame
|
| 523 |
+
all_detections_map[idx] = result_dets
|
| 524 |
|
| 525 |
# Reassemble in order
|
| 526 |
processed_frames = [processed_frames_map[i] for i in range(len(processed_frames_map))]
|
| 527 |
+
all_detections = [all_detections_map[i] for i in range(len(all_detections_map))]
|
| 528 |
|
| 529 |
else:
|
| 530 |
# Standard Single-Threaded Loop
|
| 531 |
processed_frames = []
|
| 532 |
+
all_detections = []
|
| 533 |
for idx, frame in enumerate(frames):
|
| 534 |
# Check for cancellation every frame
|
| 535 |
_check_cancellation(job_id)
|
|
|
|
| 541 |
# Run depth estimation every 3 frames if configured
|
| 542 |
active_depth = depth_estimator_name if (idx % 3 == 0) else None
|
| 543 |
|
| 544 |
+
processed_frame, frame_dets = infer_frame(
|
| 545 |
frame,
|
| 546 |
queries,
|
| 547 |
detector_name=active_detector,
|
|
|
|
| 549 |
depth_scale=depth_scale
|
| 550 |
)
|
| 551 |
processed_frames.append(processed_frame)
|
| 552 |
+
all_detections.append(frame_dets)
|
| 553 |
|
| 554 |
# Write output video
|
| 555 |
write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
|
| 556 |
logging.info("Processed video written to: %s", output_video_path)
|
| 557 |
|
| 558 |
+
return output_video_path, all_detections
|
| 559 |
|
| 560 |
|
| 561 |
def run_segmentation(
|
|
|
|
| 645 |
def run_depth_inference(
|
| 646 |
input_video_path: str,
|
| 647 |
output_video_path: str,
|
| 648 |
+
detections: Optional[List[List[Dict[str, Any]]]] = None,
|
| 649 |
max_frames: Optional[int] = None,
|
| 650 |
depth_estimator_name: str = "depth",
|
| 651 |
first_frame_depth_path: Optional[str] = None,
|
|
|
|
| 677 |
if max_frames is not None:
|
| 678 |
frames = frames[:max_frames]
|
| 679 |
|
| 680 |
+
# Process depth with stable normalization and overlay
|
| 681 |
+
processed_frames = process_frames_depth(frames, depth_estimator_name, detections=detections, job_id=job_id)
|
| 682 |
|
| 683 |
# Write output video
|
| 684 |
write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
|
|
|
|
| 696 |
def process_frames_depth(
|
| 697 |
frames: List[np.ndarray],
|
| 698 |
depth_estimator_name: str,
|
| 699 |
+
detections: Optional[List[List[Dict[str, Any]]]] = None,
|
| 700 |
job_id: Optional[str] = None,
|
| 701 |
) -> List[np.ndarray]:
|
| 702 |
"""
|
|
|
|
| 808 |
|
| 809 |
if not np.isfinite(global_min) or not np.isfinite(global_max):
|
| 810 |
logging.warning("Depth percentiles are non-finite - using min/max fallback")
|
| 811 |
+
global_min = float(valid_depths.min())
|
| 812 |
+
global_max = float(valid_depths.max())
|
| 813 |
|
| 814 |
+
# Handle edge case where min == max
|
|
|
|
|
|
|
|
|
|
| 815 |
if abs(global_max - global_min) < 1e-6:
|
| 816 |
+
global_min = float(valid_depths.min())
|
| 817 |
+
global_max = float(valid_depths.max())
|
| 818 |
+
if abs(global_max - global_min) < 1e-6:
|
| 819 |
+
global_max = global_min + 1.0
|
| 820 |
+
|
| 821 |
+
logging.info(
|
| 822 |
+
"Depth range: %.2f - %.2f meters (1st-99th percentile)",
|
| 823 |
+
global_min,
|
| 824 |
+
global_max,
|
| 825 |
+
)
|
| 826 |
|
| 827 |
+
# Second pass: Apply colormap and overlay detections
|
| 828 |
+
visualization_frames = []
|
| 829 |
+
|
| 830 |
+
# draw_boxes is defined in this module, so we can use it directly.
|
| 831 |
+
# Ensure cv2 is imported
|
| 832 |
+
import cv2
|
| 833 |
|
| 834 |
+
for i, depth_map in enumerate(depth_maps):
|
| 835 |
+
_check_cancellation(job_id)
|
| 836 |
+
|
| 837 |
+
# Norm: (val - min) / (max - min) -> 0..1
|
| 838 |
+
# Clip to ensure range
|
| 839 |
+
norm_map = np.clip(depth_map, global_min, global_max)
|
| 840 |
+
norm_map = (norm_map - global_min) / (global_max - global_min + 1e-6)
|
| 841 |
+
|
| 842 |
+
# Invert intensity? Usually Near(High val) -> Bright(1.0).
|
| 843 |
+
# Our val is high for near. So direct map is fine.
|
| 844 |
+
|
| 845 |
+
# Colorize
|
| 846 |
+
norm_map_u8 = (norm_map * 255).astype(np.uint8)
|
| 847 |
+
heatmap = cv2.applyColorMap(norm_map_u8, cv2.COLORMAP_INFERNO)
|
| 848 |
+
|
| 849 |
+
# Overlay detections if available
|
| 850 |
+
if detections and i < len(detections):
|
| 851 |
+
frame_dets = detections[i]
|
| 852 |
+
# Convert list of dicts to format for draw_boxes
|
| 853 |
+
if frame_dets:
|
| 854 |
+
boxes = []
|
| 855 |
+
labels = []
|
| 856 |
+
display_labels = []
|
| 857 |
+
|
| 858 |
+
for d in frame_dets:
|
| 859 |
+
boxes.append(d.get("bbox"))
|
| 860 |
+
# Create label "Class Dist"
|
| 861 |
+
lbl = d.get("label", "obj")
|
| 862 |
+
# If we have depth info that was calculated in inference:
|
| 863 |
+
if d.get("depth_est_m"):
|
| 864 |
+
lbl = f"{lbl} {int(d['depth_est_m'])}m"
|
| 865 |
+
|
| 866 |
+
labels.append(lbl) # used for color
|
| 867 |
+
display_labels.append(lbl)
|
| 868 |
+
|
| 869 |
+
heatmap = draw_boxes(heatmap, boxes, labels, text_queries=None, display_labels=display_labels)
|
| 870 |
+
|
| 871 |
+
visualization_frames.append(heatmap)
|
| 872 |
|
| 873 |
+
return visualization_frames
|
| 874 |
|
| 875 |
|
| 876 |
def colorize_depth_map(
|
jobs/background.py
CHANGED
|
@@ -33,7 +33,8 @@ async def process_video_async(job_id: str) -> None:
|
|
| 33 |
job_id,
|
| 34 |
)
|
| 35 |
else:
|
| 36 |
-
|
|
|
|
| 37 |
run_inference,
|
| 38 |
job.input_video_path,
|
| 39 |
job.output_video_path,
|
|
@@ -44,6 +45,8 @@ async def process_video_async(job_id: str) -> None:
|
|
| 44 |
job.depth_estimator_name,
|
| 45 |
job.depth_scale,
|
| 46 |
)
|
|
|
|
|
|
|
| 47 |
|
| 48 |
# Try to run depth estimation
|
| 49 |
try:
|
|
@@ -51,6 +54,7 @@ async def process_video_async(job_id: str) -> None:
|
|
| 51 |
run_depth_inference,
|
| 52 |
job.input_video_path,
|
| 53 |
str(get_depth_output_path(job_id)),
|
|
|
|
| 54 |
None, # max_frames
|
| 55 |
job.depth_estimator_name,
|
| 56 |
str(get_first_frame_depth_path(job_id)),
|
|
|
|
| 33 |
job_id,
|
| 34 |
)
|
| 35 |
else:
|
| 36 |
+
detections_list = None
|
| 37 |
+
result_pkg = await asyncio.to_thread(
|
| 38 |
run_inference,
|
| 39 |
job.input_video_path,
|
| 40 |
job.output_video_path,
|
|
|
|
| 45 |
job.depth_estimator_name,
|
| 46 |
job.depth_scale,
|
| 47 |
)
|
| 48 |
+
# run_inference now returns (path, detections)
|
| 49 |
+
detection_path, detections_list = result_pkg
|
| 50 |
|
| 51 |
# Try to run depth estimation
|
| 52 |
try:
|
|
|
|
| 54 |
run_depth_inference,
|
| 55 |
job.input_video_path,
|
| 56 |
str(get_depth_output_path(job_id)),
|
| 57 |
+
detections_list, # Pass detections for overlay
|
| 58 |
None, # max_frames
|
| 59 |
job.depth_estimator_name,
|
| 60 |
str(get_first_frame_depth_path(job_id)),
|