Spaces:
Sleeping
Sleeping
Zhen Ye
committed on
Commit
·
391cb94
1
Parent(s):
970cc85
feat: fix depth inversion and add video overlays
Browse files- Inverted depth calculation (closer=smaller distance)
- Added bounding box overlays to depth visualization video
- Pipelined detection data to depth generator
- Updated frontend radar logic (implicitly fixed by backend correction)
- app.py +1 -1
- inference.py +86 -32
- jobs/background.py +5 -1
app.py
CHANGED
|
@@ -232,7 +232,7 @@ async def detect_endpoint(
|
|
| 232 |
# Run inference
|
| 233 |
try:
|
| 234 |
detector_name = "drone_yolo" if mode == "drone_detection" else detector
|
| 235 |
-
output_path = run_inference(
|
| 236 |
input_path,
|
| 237 |
output_path,
|
| 238 |
query_list,
|
|
|
|
| 232 |
# Run inference
|
| 233 |
try:
|
| 234 |
detector_name = "drone_yolo" if mode == "drone_detection" else detector
|
| 235 |
+
output_path, _ = run_inference(
|
| 236 |
input_path,
|
| 237 |
output_path,
|
| 238 |
query_list,
|
inference.py
CHANGED
|
@@ -240,8 +240,18 @@ def _attach_depth_metrics(
|
|
| 240 |
if finite.size == 0:
|
| 241 |
continue
|
| 242 |
|
| 243 |
-
depth_raw
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
det["depth_est_m"] = depth_est
|
| 246 |
det["depth_valid"] = True
|
| 247 |
valid_depths.append(depth_est)
|
|
@@ -395,7 +405,7 @@ def run_inference(
|
|
| 395 |
job_id: Optional[str] = None,
|
| 396 |
depth_estimator_name: Optional[str] = None,
|
| 397 |
depth_scale: float = 1.0,
|
| 398 |
-
) -> str:
|
| 399 |
"""
|
| 400 |
Run object detection inference on a video.
|
| 401 |
|
|
@@ -469,11 +479,12 @@ def run_inference(
|
|
| 469 |
detectors = None
|
| 470 |
|
| 471 |
processed_frames_map = {}
|
|
|
|
| 472 |
|
| 473 |
# Process frames
|
| 474 |
if detectors:
|
| 475 |
# Multi-GPU Parallel Processing
|
| 476 |
-
def process_frame_task(frame_idx: int, frame_data: np.ndarray) -> tuple[int, np.ndarray]:
|
| 477 |
# Determine which GPU to use based on frame index (round-robin)
|
| 478 |
gpu_idx = frame_idx % len(detectors)
|
| 479 |
detector_instance = detectors[gpu_idx]
|
|
@@ -485,7 +496,7 @@ def run_inference(
|
|
| 485 |
active_depth_name = depth_estimator_name if (frame_idx % 3 == 0) else None
|
| 486 |
active_depth_instance = depth_instance if (frame_idx % 3 == 0) else None
|
| 487 |
|
| 488 |
-
processed,
|
| 489 |
frame_data,
|
| 490 |
queries,
|
| 491 |
detector_name=None, # Use instance
|
|
@@ -494,7 +505,7 @@ def run_inference(
|
|
| 494 |
detector_instance=detector_instance,
|
| 495 |
depth_estimator_instance=active_depth_instance
|
| 496 |
)
|
| 497 |
-
return frame_idx, processed
|
| 498 |
|
| 499 |
# Thread pool with more workers than GPUs to keep them fed
|
| 500 |
max_workers = min(len(detectors) * 2, 8)
|
|
@@ -507,15 +518,18 @@ def run_inference(
|
|
| 507 |
futures.append(executor.submit(process_frame_task, idx, frame))
|
| 508 |
|
| 509 |
for future in futures:
|
| 510 |
-
idx, result_frame = future.result() # Wait for completion (in order or not, but we verify order)
|
| 511 |
processed_frames_map[idx] = result_frame
|
|
|
|
| 512 |
|
| 513 |
# Reassemble in order
|
| 514 |
processed_frames = [processed_frames_map[i] for i in range(len(processed_frames_map))]
|
|
|
|
| 515 |
|
| 516 |
else:
|
| 517 |
# Standard Single-Threaded Loop
|
| 518 |
processed_frames = []
|
|
|
|
| 519 |
for idx, frame in enumerate(frames):
|
| 520 |
# Check for cancellation every frame
|
| 521 |
_check_cancellation(job_id)
|
|
@@ -527,7 +541,7 @@ def run_inference(
|
|
| 527 |
# Run depth estimation every 3 frames if configured
|
| 528 |
active_depth = depth_estimator_name if (idx % 3 == 0) else None
|
| 529 |
|
| 530 |
-
processed_frame,
|
| 531 |
frame,
|
| 532 |
queries,
|
| 533 |
detector_name=active_detector,
|
|
@@ -535,12 +549,13 @@ def run_inference(
|
|
| 535 |
depth_scale=depth_scale
|
| 536 |
)
|
| 537 |
processed_frames.append(processed_frame)
|
|
|
|
| 538 |
|
| 539 |
# Write output video
|
| 540 |
write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
|
| 541 |
logging.info("Processed video written to: %s", output_video_path)
|
| 542 |
|
| 543 |
-
return output_video_path
|
| 544 |
|
| 545 |
|
| 546 |
def run_segmentation(
|
|
@@ -630,6 +645,7 @@ def run_segmentation(
|
|
| 630 |
def run_depth_inference(
|
| 631 |
input_video_path: str,
|
| 632 |
output_video_path: str,
|
|
|
|
| 633 |
max_frames: Optional[int] = None,
|
| 634 |
depth_estimator_name: str = "depth",
|
| 635 |
first_frame_depth_path: Optional[str] = None,
|
|
@@ -661,8 +677,8 @@ def run_depth_inference(
|
|
| 661 |
if max_frames is not None:
|
| 662 |
frames = frames[:max_frames]
|
| 663 |
|
| 664 |
-
# Process depth with stable normalization
|
| 665 |
-
processed_frames = process_frames_depth(frames, depth_estimator_name, job_id)
|
| 666 |
|
| 667 |
# Write output video
|
| 668 |
write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
|
|
@@ -680,6 +696,7 @@ def run_depth_inference(
|
|
| 680 |
def process_frames_depth(
|
| 681 |
frames: List[np.ndarray],
|
| 682 |
depth_estimator_name: str,
|
|
|
|
| 683 |
job_id: Optional[str] = None,
|
| 684 |
) -> List[np.ndarray]:
|
| 685 |
"""
|
|
@@ -791,32 +808,69 @@ def process_frames_depth(
|
|
| 791 |
|
| 792 |
if not np.isfinite(global_min) or not np.isfinite(global_max):
|
| 793 |
logging.warning("Depth percentiles are non-finite - using min/max fallback")
|
| 794 |
-
|
| 795 |
-
|
| 796 |
|
| 797 |
-
|
| 798 |
-
if abs(global_max - global_min) < 1e-6:
|
| 799 |
-
global_min = float(valid_depths.min())
|
| 800 |
-
global_max = float(valid_depths.max())
|
| 801 |
if abs(global_max - global_min) < 1e-6:
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
|
|
|
|
|
|
|
|
|
| 809 |
|
| 810 |
-
# Second pass:
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
|
|
|
| 815 |
|
| 816 |
-
|
| 817 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
|
| 819 |
-
return
|
| 820 |
|
| 821 |
|
| 822 |
def colorize_depth_map(
|
|
|
|
| 240 |
if finite.size == 0:
|
| 241 |
continue
|
| 242 |
|
| 243 |
+
if depth_raw <= 1e-6:
|
| 244 |
+
det["depth_est_m"] = None
|
| 245 |
+
det["depth_valid"] = False
|
| 246 |
+
continue
|
| 247 |
+
|
| 248 |
+
# Inverted depth: closer objects have higher raw values
|
| 249 |
+
# Distance = Scale / RawValue
|
| 250 |
+
try:
|
| 251 |
+
depth_est = depth_scale / depth_raw
|
| 252 |
+
except ZeroDivisionError:
|
| 253 |
+
continue
|
| 254 |
+
|
| 255 |
det["depth_est_m"] = depth_est
|
| 256 |
det["depth_valid"] = True
|
| 257 |
valid_depths.append(depth_est)
|
|
|
|
| 405 |
job_id: Optional[str] = None,
|
| 406 |
depth_estimator_name: Optional[str] = None,
|
| 407 |
depth_scale: float = 1.0,
|
| 408 |
+
) -> tuple[str, List[List[Dict[str, Any]]]]:
|
| 409 |
"""
|
| 410 |
Run object detection inference on a video.
|
| 411 |
|
|
|
|
| 479 |
detectors = None
|
| 480 |
|
| 481 |
processed_frames_map = {}
|
| 482 |
+
all_detections_map = {}
|
| 483 |
|
| 484 |
# Process frames
|
| 485 |
if detectors:
|
| 486 |
# Multi-GPU Parallel Processing
|
| 487 |
+
def process_frame_task(frame_idx: int, frame_data: np.ndarray) -> tuple[int, np.ndarray, List[Dict[str, Any]]]:
|
| 488 |
# Determine which GPU to use based on frame index (round-robin)
|
| 489 |
gpu_idx = frame_idx % len(detectors)
|
| 490 |
detector_instance = detectors[gpu_idx]
|
|
|
|
| 496 |
active_depth_name = depth_estimator_name if (frame_idx % 3 == 0) else None
|
| 497 |
active_depth_instance = depth_instance if (frame_idx % 3 == 0) else None
|
| 498 |
|
| 499 |
+
processed, frame_dets = infer_frame(
|
| 500 |
frame_data,
|
| 501 |
queries,
|
| 502 |
detector_name=None, # Use instance
|
|
|
|
| 505 |
detector_instance=detector_instance,
|
| 506 |
depth_estimator_instance=active_depth_instance
|
| 507 |
)
|
| 508 |
+
return frame_idx, processed, frame_dets
|
| 509 |
|
| 510 |
# Thread pool with more workers than GPUs to keep them fed
|
| 511 |
max_workers = min(len(detectors) * 2, 8)
|
|
|
|
| 518 |
futures.append(executor.submit(process_frame_task, idx, frame))
|
| 519 |
|
| 520 |
for future in futures:
|
| 521 |
+
idx, result_frame, result_dets = future.result() # Wait for completion (in order or not, but we verify order)
|
| 522 |
processed_frames_map[idx] = result_frame
|
| 523 |
+
all_detections_map[idx] = result_dets
|
| 524 |
|
| 525 |
# Reassemble in order
|
| 526 |
processed_frames = [processed_frames_map[i] for i in range(len(processed_frames_map))]
|
| 527 |
+
all_detections = [all_detections_map[i] for i in range(len(all_detections_map))]
|
| 528 |
|
| 529 |
else:
|
| 530 |
# Standard Single-Threaded Loop
|
| 531 |
processed_frames = []
|
| 532 |
+
all_detections = []
|
| 533 |
for idx, frame in enumerate(frames):
|
| 534 |
# Check for cancellation every frame
|
| 535 |
_check_cancellation(job_id)
|
|
|
|
| 541 |
# Run depth estimation every 3 frames if configured
|
| 542 |
active_depth = depth_estimator_name if (idx % 3 == 0) else None
|
| 543 |
|
| 544 |
+
processed_frame, frame_dets = infer_frame(
|
| 545 |
frame,
|
| 546 |
queries,
|
| 547 |
detector_name=active_detector,
|
|
|
|
| 549 |
depth_scale=depth_scale
|
| 550 |
)
|
| 551 |
processed_frames.append(processed_frame)
|
| 552 |
+
all_detections.append(frame_dets)
|
| 553 |
|
| 554 |
# Write output video
|
| 555 |
write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
|
| 556 |
logging.info("Processed video written to: %s", output_video_path)
|
| 557 |
|
| 558 |
+
return output_video_path, all_detections
|
| 559 |
|
| 560 |
|
| 561 |
def run_segmentation(
|
|
|
|
| 645 |
def run_depth_inference(
|
| 646 |
input_video_path: str,
|
| 647 |
output_video_path: str,
|
| 648 |
+
detections: Optional[List[List[Dict[str, Any]]]] = None,
|
| 649 |
max_frames: Optional[int] = None,
|
| 650 |
depth_estimator_name: str = "depth",
|
| 651 |
first_frame_depth_path: Optional[str] = None,
|
|
|
|
| 677 |
if max_frames is not None:
|
| 678 |
frames = frames[:max_frames]
|
| 679 |
|
| 680 |
+
# Process depth with stable normalization and overlay
|
| 681 |
+
processed_frames = process_frames_depth(frames, depth_estimator_name, detections=detections, job_id=job_id)
|
| 682 |
|
| 683 |
# Write output video
|
| 684 |
write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
|
|
|
|
| 696 |
def process_frames_depth(
|
| 697 |
frames: List[np.ndarray],
|
| 698 |
depth_estimator_name: str,
|
| 699 |
+
detections: Optional[List[List[Dict[str, Any]]]] = None,
|
| 700 |
job_id: Optional[str] = None,
|
| 701 |
) -> List[np.ndarray]:
|
| 702 |
"""
|
|
|
|
| 808 |
|
| 809 |
if not np.isfinite(global_min) or not np.isfinite(global_max):
|
| 810 |
logging.warning("Depth percentiles are non-finite - using min/max fallback")
|
| 811 |
+
global_min = float(valid_depths.min())
|
| 812 |
+
global_max = float(valid_depths.max())
|
| 813 |
|
| 814 |
+
# Handle edge case where min == max
|
|
|
|
|
|
|
|
|
|
| 815 |
if abs(global_max - global_min) < 1e-6:
|
| 816 |
+
global_min = float(valid_depths.min())
|
| 817 |
+
global_max = float(valid_depths.max())
|
| 818 |
+
if abs(global_max - global_min) < 1e-6:
|
| 819 |
+
global_max = global_min + 1.0
|
| 820 |
+
|
| 821 |
+
logging.info(
|
| 822 |
+
"Depth range: %.2f - %.2f meters (1st-99th percentile)",
|
| 823 |
+
global_min,
|
| 824 |
+
global_max,
|
| 825 |
+
)
|
| 826 |
|
| 827 |
+
# Second pass: Apply colormap and overlay detections
|
| 828 |
+
visualization_frames = []
|
| 829 |
+
|
| 830 |
+
# draw_boxes is defined in this module, so we can use it directly.
|
| 831 |
+
# Ensure cv2 is imported
|
| 832 |
+
import cv2
|
| 833 |
|
| 834 |
+
for i, depth_map in enumerate(depth_maps):
|
| 835 |
+
_check_cancellation(job_id)
|
| 836 |
+
|
| 837 |
+
# Norm: (val - min) / (max - min) -> 0..1
|
| 838 |
+
# Clip to ensure range
|
| 839 |
+
norm_map = np.clip(depth_map, global_min, global_max)
|
| 840 |
+
norm_map = (norm_map - global_min) / (global_max - global_min + 1e-6)
|
| 841 |
+
|
| 842 |
+
# Invert intensity? Usually Near(High val) -> Bright(1.0).
|
| 843 |
+
# Our val is high for near. So direct map is fine.
|
| 844 |
+
|
| 845 |
+
# Colorize
|
| 846 |
+
norm_map_u8 = (norm_map * 255).astype(np.uint8)
|
| 847 |
+
heatmap = cv2.applyColorMap(norm_map_u8, cv2.COLORMAP_INFERNO)
|
| 848 |
+
|
| 849 |
+
# Overlay detections if available
|
| 850 |
+
if detections and i < len(detections):
|
| 851 |
+
frame_dets = detections[i]
|
| 852 |
+
# Convert list of dicts to format for draw_boxes
|
| 853 |
+
if frame_dets:
|
| 854 |
+
boxes = []
|
| 855 |
+
labels = []
|
| 856 |
+
display_labels = []
|
| 857 |
+
|
| 858 |
+
for d in frame_dets:
|
| 859 |
+
boxes.append(d.get("bbox"))
|
| 860 |
+
# Create label "Class Dist"
|
| 861 |
+
lbl = d.get("label", "obj")
|
| 862 |
+
# If we have depth info that was calculated in inference:
|
| 863 |
+
if d.get("depth_est_m"):
|
| 864 |
+
lbl = f"{lbl} {int(d['depth_est_m'])}m"
|
| 865 |
+
|
| 866 |
+
labels.append(lbl) # used for color
|
| 867 |
+
display_labels.append(lbl)
|
| 868 |
+
|
| 869 |
+
heatmap = draw_boxes(heatmap, boxes, labels, text_queries=None, display_labels=display_labels)
|
| 870 |
+
|
| 871 |
+
visualization_frames.append(heatmap)
|
| 872 |
|
| 873 |
+
return visualization_frames
|
| 874 |
|
| 875 |
|
| 876 |
def colorize_depth_map(
|
jobs/background.py
CHANGED
|
@@ -33,7 +33,8 @@ async def process_video_async(job_id: str) -> None:
|
|
| 33 |
job_id,
|
| 34 |
)
|
| 35 |
else:
|
| 36 |
-
|
|
|
|
| 37 |
run_inference,
|
| 38 |
job.input_video_path,
|
| 39 |
job.output_video_path,
|
|
@@ -44,6 +45,8 @@ async def process_video_async(job_id: str) -> None:
|
|
| 44 |
job.depth_estimator_name,
|
| 45 |
job.depth_scale,
|
| 46 |
)
|
|
|
|
|
|
|
| 47 |
|
| 48 |
# Try to run depth estimation
|
| 49 |
try:
|
|
@@ -51,6 +54,7 @@ async def process_video_async(job_id: str) -> None:
|
|
| 51 |
run_depth_inference,
|
| 52 |
job.input_video_path,
|
| 53 |
str(get_depth_output_path(job_id)),
|
|
|
|
| 54 |
None, # max_frames
|
| 55 |
job.depth_estimator_name,
|
| 56 |
str(get_first_frame_depth_path(job_id)),
|
|
|
|
| 33 |
job_id,
|
| 34 |
)
|
| 35 |
else:
|
| 36 |
+
detections_list = None
|
| 37 |
+
result_pkg = await asyncio.to_thread(
|
| 38 |
run_inference,
|
| 39 |
job.input_video_path,
|
| 40 |
job.output_video_path,
|
|
|
|
| 45 |
job.depth_estimator_name,
|
| 46 |
job.depth_scale,
|
| 47 |
)
|
| 48 |
+
# run_inference now returns (path, detections)
|
| 49 |
+
detection_path, detections_list = result_pkg
|
| 50 |
|
| 51 |
# Try to run depth estimation
|
| 52 |
try:
|
|
|
|
| 54 |
run_depth_inference,
|
| 55 |
job.input_video_path,
|
| 56 |
str(get_depth_output_path(job_id)),
|
| 57 |
+
detections_list, # Pass detections for overlay
|
| 58 |
None, # max_frames
|
| 59 |
job.depth_estimator_name,
|
| 60 |
str(get_first_frame_depth_path(job_id)),
|