unknown committed on
Commit
b8e1e8b
·
1 Parent(s): 63c1121

Add multi-mode HF Space app with CPU realtime profiles

Browse files
Files changed (6) hide show
  1. README.md +41 -0
  2. app.py +381 -0
  3. bbox3d_utils.py +2 -2
  4. depth_model.py +12 -7
  5. requirements.txt +3 -1
  6. run_space.bat +11 -0
README.md CHANGED
@@ -43,6 +43,47 @@ Run the main script:
43
  python run.py
44
  ```
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  ### Configuration Options
47
 
48
  You can modify the following parameters in `run.py`:
 
43
  python run.py
44
  ```
45
 
46
+ Run the Hugging Face Space app locally:
47
+
48
+ ```bash
49
+ python app.py
50
+ ```
51
+
52
+ On Windows, you can also run:
53
+
54
+ ```bash
55
+ run_space.bat
56
+ ```
57
+
58
+ ## Hugging Face Space (Webcam + CPU Realtime)
59
+
60
+ This repo now includes `app.py` for Gradio/Hugging Face Spaces with direct webcam streaming.
61
+
62
+ ### Modes
63
+
64
+ - **Depth V2 Realtime (CPU)**: YOLO + Depth Anything v2 + pseudo-3D boxes (+ optional BEV)
65
+ - **Depth V2 Balanced (CPU)**: Lower resolution/depth refresh profile for smoother CPU FPS
66
+ - **Depth V2 Quality (CPU)**: Higher quality depth profile (heavier on CPU)
67
+ - **Fast Detect (CPU)**: YOLO-only fast path for higher FPS on CPU
68
+ - **Ultra Fast Detect (CPU)**: Aggressive low-latency detect-only profile
69
+ - **Auto Optimize By Mode**: Apply recommended CPU settings per selected mode
70
+
71
+ ### Deploy steps
72
+
73
+ 1. Create a new **Gradio Space** on Hugging Face.
74
+ 2. Push this repository content to the Space.
75
+ 3. Keep `requirements.txt` and `app.py` at repo root.
76
+ 4. Hardware recommendation for smoother realtime on CPU:
77
+ - Pro account: choose a higher CPU tier with more vCPUs.
78
+ 5. Open the Space and allow browser webcam access.
79
+
80
+ ### Performance tuning for CPU
81
+
82
+ - Keep model at YOLO `nano`.
83
+ - Start with `Max Inference Side = 640`.
84
+ - In Depth mode, increase `Depth Refresh (frames)` to `3-5` for better FPS.
85
+ - Disable tracking and BEV if you need maximum realtime speed.
86
+
87
  ### Configuration Options
88
 
89
  You can modify the following parameters in `run.py`:
app.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import time
4
+ import threading
5
+ from collections import deque
6
+
7
+ import cv2
8
+ import gradio as gr
9
+ import numpy as np
10
+ import torch
11
+
12
+ from bbox3d_utils import BBox3DEstimator, BirdEyeView
13
+ from depth_model import DepthEstimator
14
+ from detection_model import ObjectDetector
15
+
16
+
17
# Display names for the selectable inference modes. These strings are both
# the UI labels (Radio choices) and the keys into MODE_PROFILES.
DEPTH_MODE = "Depth V2 Realtime (CPU)"
DEPTH_BALANCED_MODE = "Depth V2 Balanced (CPU)"
DEPTH_QUALITY_MODE = "Depth V2 Quality (CPU)"
FAST_MODE = "Fast Detect (CPU)"
ULTRA_FAST_MODE = "Ultra Fast Detect (CPU)"

# Mode ordering as presented in the UI mode selector.
MODE_OPTIONS = [
    DEPTH_MODE,
    DEPTH_BALANCED_MODE,
    DEPTH_QUALITY_MODE,
    FAST_MODE,
    ULTRA_FAST_MODE,
]

# Per-mode tuning profiles, applied in RealtimeEngine.process when
# "Auto Optimize By Mode" is enabled (max_det is always applied).
# Fields:
#   use_depth:      run the depth-estimation path (pseudo-3D boxes / BEV)
#   max_side:       longest image side used for inference resizing
#   depth_side:     longest side of the frame fed to the depth model
#                   (0 for detect-only modes, where depth never runs)
#   depth_interval: refresh the cached depth map every N frames
#   allow_tracking: whether the mode permits tracker usage at all
#   allow_bev:      whether the mode permits the bird's-eye-view overlay
#   max_det:        detector's maximum number of detections per frame
#   hud:            short label shown in the on-frame HUD text
MODE_PROFILES = {
    DEPTH_MODE: {
        "use_depth": True,
        "max_side": 640,
        "depth_side": 384,
        "depth_interval": 3,
        "allow_tracking": True,
        "allow_bev": True,
        "max_det": 120,
        "hud": "Depth Realtime",
    },
    DEPTH_BALANCED_MODE: {
        "use_depth": True,
        "max_side": 576,
        "depth_side": 320,
        "depth_interval": 4,
        "allow_tracking": True,
        "allow_bev": True,
        "max_det": 100,
        "hud": "Depth Balanced",
    },
    DEPTH_QUALITY_MODE: {
        "use_depth": True,
        "max_side": 768,
        "depth_side": 512,
        "depth_interval": 1,
        "allow_tracking": True,
        "allow_bev": True,
        "max_det": 150,
        "hud": "Depth Quality",
    },
    FAST_MODE: {
        "use_depth": False,
        "max_side": 640,
        "depth_side": 0,
        "depth_interval": 0,
        "allow_tracking": True,
        "allow_bev": False,
        "max_det": 100,
        "hud": "Fast Detect",
    },
    ULTRA_FAST_MODE: {
        "use_depth": False,
        "max_side": 416,
        "depth_side": 0,
        "depth_interval": 0,
        "allow_tracking": False,
        "allow_bev": False,
        "max_det": 80,
        "hud": "Ultra Fast",
    },
}
83
+
84
+
85
+ def _configure_cpu_runtime():
86
+ cpu_count = max(1, os.cpu_count() or 1)
87
+ thread_count = min(4, cpu_count)
88
+ os.environ.setdefault("OMP_NUM_THREADS", str(thread_count))
89
+ os.environ.setdefault("MKL_NUM_THREADS", str(thread_count))
90
+ torch.set_num_threads(thread_count)
91
+ if hasattr(torch, "set_num_interop_threads"):
92
+ torch.set_num_interop_threads(max(1, thread_count // 2))
93
+
94
+
95
class RealtimeEngine:
    """Stateful per-process inference engine for the Gradio webcam stream.

    Lazily constructs the detector and depth estimator on first use, caches
    the depth map across frames (refreshed every ``depth_interval`` frames),
    and serializes all frame processing behind a single lock so concurrent
    Gradio callbacks cannot interleave mutations of shared state.
    """

    def __init__(self):
        # Configure torch/OMP threading before any inference work.
        _configure_cpu_runtime()
        # Serializes process(); detector/depth state is not thread-safe.
        self.lock = threading.Lock()
        # Heavy models are created lazily (first frame), not at startup.
        self.detector = None
        self.depth_estimator = None
        self.bbox3d_estimator = BBox3DEstimator()
        self.bev = BirdEyeView(scale=55, size=(260, 260))
        # Frame counter driving the periodic depth-map refresh.
        self.frame_idx = 0
        # Last computed depth map, reused between refreshes.
        self.cached_depth_map = None
        # Rolling window of per-frame latencies for the HUD FPS estimate.
        self.latency_ms = deque(maxlen=30)
        # Longest side of the frame fed to the depth model (overridden by
        # the mode profile when auto-optimize is on).
        self.depth_input_side = 384

    def _ensure_detector(self):
        """Create the YOLO detector on first use (CPU, nano model)."""
        if self.detector is None:
            self.detector = ObjectDetector(
                model_size="nano",
                conf_thres=0.25,
                iou_thres=0.45,
                classes=None,
                device="cpu",
            )
            self.detector.model.overrides["max_det"] = 120

    @staticmethod
    def _profile(mode):
        """Return the tuning profile for *mode*, defaulting to DEPTH_MODE."""
        return MODE_PROFILES.get(mode, MODE_PROFILES[DEPTH_MODE])

    def _ensure_depth(self):
        """Create the depth estimator on first use (CPU, small model)."""
        if self.depth_estimator is None:
            self.depth_estimator = DepthEstimator(model_size="small", device="cpu")

    @staticmethod
    def _resize_for_inference(frame, max_side):
        """Downscale *frame* so its longest side is <= max_side.

        Returns the frame unchanged if it is already small enough; both
        dimensions are floored at 32 px to keep the models' inputs valid.
        """
        h, w = frame.shape[:2]
        longest = max(h, w)
        if longest <= max_side:
            return frame
        scale = max_side / float(longest)
        new_w = max(32, int(w * scale))
        new_h = max(32, int(h * scale))
        return cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_AREA)

    @staticmethod
    def _overlay_corner(base, overlay, size_ratio=0.26, anchor="tl"):
        """Paste *overlay* (aspect-preserved, white-bordered) into a corner of *base*.

        Mutates *base* in place. *anchor* is one of "tl", "tr", "bl", "br";
        any other value falls back to top-left. The overlay is scaled to
        size_ratio of the base height, clamped to at most half of each base
        dimension and at least 64 px.
        """
        h, w = base.shape[:2]
        target_h = max(64, int(h * size_ratio))
        # Preserve the overlay's aspect ratio when deriving the width.
        target_w = int((overlay.shape[1] / max(1, overlay.shape[0])) * target_h)
        target_w = max(64, min(target_w, w // 2))
        target_h = min(target_h, h // 2)
        resized = cv2.resize(overlay, (target_w, target_h), interpolation=cv2.INTER_AREA)

        if anchor == "tr":
            x0, y0 = w - target_w, 0
        elif anchor == "bl":
            x0, y0 = 0, h - target_h
        elif anchor == "br":
            x0, y0 = w - target_w, h - target_h
        else:
            x0, y0 = 0, 0

        base[y0:y0 + target_h, x0:x0 + target_w] = resized
        cv2.rectangle(base, (x0, y0), (x0 + target_w, y0 + target_h), (255, 255, 255), 1)

    def _draw_hud(self, frame, mode_name):
        """Draw the mode / FPS / latency HUD text onto *frame* in place.

        FPS is derived from the rolling mean latency (0.0 until any frame
        has been timed).
        """
        mean_latency = float(np.mean(self.latency_ms)) if self.latency_ms else 0.0
        fps = (1000.0 / mean_latency) if mean_latency > 0 else 0.0
        text = f"{mode_name} | CPU | FPS {fps:.1f} | Latency {mean_latency:.1f} ms"
        cv2.putText(frame, text, (10, 28), cv2.FONT_HERSHEY_SIMPLEX, 0.65, (0, 0, 255), 2)

    def _render_depth_mode(self, frame_bgr, enable_tracking, enable_bev, depth_interval, hud_name):
        """Full pipeline for depth modes: detect, depth, 3D boxes, overlays.

        The depth map is recomputed only every *depth_interval* frames (or on
        the first frame) and cached; between refreshes the cached map is
        reused for all detections. Returns the annotated BGR frame.
        """
        result_frame = frame_bgr.copy()
        # detect() returns (annotated_frame, detections); only the raw
        # detections are needed here since boxes are drawn in 3D below.
        _, detections = self.detector.detect(frame_bgr, track=enable_tracking)

        self.frame_idx += 1
        if self.cached_depth_map is None or (self.frame_idx % depth_interval == 0):
            # Run depth on a downscaled copy, then upsample back so the map
            # aligns pixel-for-pixel with the detection frame.
            depth_input = self._resize_for_inference(frame_bgr, self.depth_input_side)
            depth_map = self.depth_estimator.estimate_depth(depth_input)
            if depth_map.shape[:2] != frame_bgr.shape[:2]:
                depth_map = cv2.resize(
                    depth_map,
                    (frame_bgr.shape[1], frame_bgr.shape[0]),
                    interpolation=cv2.INTER_LINEAR,
                )
            self.cached_depth_map = depth_map
        depth_map = self.cached_depth_map
        depth_colored = self.depth_estimator.colorize_depth(depth_map)

        class_names = self.detector.get_class_names()
        boxes_3d = []
        active_ids = []

        for detection in detections:
            # Each detection is (bbox, score, class_id, object_id);
            # object_id is None when tracking is disabled.
            bbox, score, class_id, obj_id = detection
            class_name = class_names[class_id]
            if class_name.lower() in ["person", "cat", "dog"]:
                # Articulated subjects: sample depth at the box center
                # rather than aggregating the whole region.
                center_x = int((bbox[0] + bbox[2]) / 2.0)
                center_y = int((bbox[1] + bbox[3]) / 2.0)
                depth_value = self.depth_estimator.get_depth_at_point(depth_map, center_x, center_y)
                depth_method = "center"
            else:
                depth_value = self.depth_estimator.get_depth_in_region(depth_map, bbox, method="median")
                depth_method = "median"

            boxes_3d.append(
                {
                    "bbox_2d": bbox,
                    "depth_value": float(depth_value),
                    "depth_method": depth_method,
                    "class_name": class_name,
                    "object_id": obj_id,
                    "score": score,
                }
            )
            if obj_id is not None:
                active_ids.append(obj_id)

        # Drop per-object Kalman state for IDs no longer present.
        self.bbox3d_estimator.cleanup_trackers(active_ids)

        for box_3d in boxes_3d:
            result_frame = self.bbox3d_estimator.draw_box_3d(result_frame, box_3d, color=(0, 255, 255))

        if enable_bev:
            self.bev.reset()
            for box_3d in boxes_3d:
                self.bev.draw_box(box_3d)
            bev_img = self.bev.get_image()
            self._overlay_corner(result_frame, bev_img, size_ratio=0.30, anchor="bl")

        # Depth preview always shown in depth modes (top-left corner).
        self._overlay_corner(result_frame, depth_colored, size_ratio=0.24, anchor="tl")
        self._draw_hud(result_frame, hud_name)
        return result_frame

    def _render_fast_mode(self, frame_bgr, enable_tracking, hud_name):
        """Detect-only fast path: use the detector's own annotated frame."""
        annotated, _ = self.detector.detect(frame_bgr, track=enable_tracking)
        self._draw_hud(annotated, hud_name)
        return annotated

    def process(
        self,
        frame_rgb,
        mode,
        conf_threshold,
        iou_threshold,
        enable_tracking,
        enable_bev,
        auto_optimize,
        max_side,
        depth_interval,
    ):
        """Process one RGB webcam frame and return the annotated RGB frame.

        Returns None for a None input (Gradio sends None before the webcam
        starts). All work runs under the engine lock; per-frame latency is
        recorded into the rolling HUD window.

        Args:
            frame_rgb: input frame as an RGB array (or None).
            mode: one of MODE_OPTIONS (unknown values fall back to DEPTH_MODE).
            conf_threshold / iou_threshold: detector thresholds.
            enable_tracking / enable_bev: user toggles, additionally gated by
                the mode profile's allow_tracking / allow_bev flags.
            auto_optimize: when True, resolution and depth cadence come from
                the mode profile instead of the sliders.
            max_side: manual max inference side (used when not auto-optimizing).
            depth_interval: manual depth refresh cadence (clamped to >= 1).
        """
        if frame_rgb is None:
            return None

        with self.lock:
            start = time.perf_counter()
            profile = self._profile(mode)
            self._ensure_detector()
            # Push the current UI thresholds into the detector each frame.
            self.detector.model.overrides["conf"] = float(conf_threshold)
            self.detector.model.overrides["iou"] = float(iou_threshold)
            self.detector.model.overrides["max_det"] = int(profile["max_det"])

            if auto_optimize:
                effective_max_side = int(profile["max_side"])
                effective_depth_interval = int(profile["depth_interval"])
                # Only depth modes carry a meaningful depth_side; keep the
                # previous value for detect-only modes.
                self.depth_input_side = int(profile["depth_side"]) if profile["use_depth"] else self.depth_input_side
                effective_tracking = bool(enable_tracking and profile["allow_tracking"])
                effective_bev = bool(enable_bev and profile["allow_bev"])
            else:
                effective_max_side = int(max_side)
                # Clamp to >= 1: the depth path uses this as a modulus.
                effective_depth_interval = max(1, int(depth_interval))
                effective_tracking = bool(enable_tracking and profile["allow_tracking"])
                effective_bev = bool(enable_bev and profile["allow_bev"])

            # Gradio delivers RGB; the pipeline (OpenCV/models) works in BGR.
            frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
            frame_bgr = self._resize_for_inference(frame_bgr, effective_max_side)
            # Match the detector's inference size to the resized frame.
            self.detector.model.overrides["imgsz"] = int(max(frame_bgr.shape[:2]))

            if profile["use_depth"]:
                self._ensure_depth()
                out_bgr = self._render_depth_mode(
                    frame_bgr=frame_bgr,
                    enable_tracking=effective_tracking,
                    enable_bev=effective_bev,
                    depth_interval=effective_depth_interval,
                    hud_name=profile["hud"],
                )
            else:
                out_bgr = self._render_fast_mode(
                    frame_bgr=frame_bgr,
                    enable_tracking=effective_tracking,
                    hud_name=profile["hud"],
                )

            elapsed_ms = (time.perf_counter() - start) * 1000.0
            self.latency_ms.append(elapsed_ms)
            output_rgb = cv2.cvtColor(out_bgr, cv2.COLOR_BGR2RGB)
            return output_rgb
292
+
293
+
294
# Single shared engine instance for the whole process; its internal lock
# serializes concurrent Gradio stream callbacks.
engine = RealtimeEngine()
295
+
296
+
297
def process_frame(
    frame,
    mode,
    conf_threshold,
    iou_threshold,
    enable_tracking,
    enable_bev,
    auto_optimize,
    max_side,
    depth_interval,
):
    """Gradio stream callback: run the engine on one frame, never raise.

    Delegates to the shared RealtimeEngine. On any failure it logs the full
    traceback to the server log and returns a black error image (with the
    first 70 characters of the message) so the stream keeps running instead
    of crashing the UI.
    """
    try:
        return engine.process(
            frame_rgb=frame,
            mode=mode,
            conf_threshold=conf_threshold,
            iou_threshold=iou_threshold,
            enable_tracking=enable_tracking,
            enable_bev=enable_bev,
            auto_optimize=auto_optimize,
            max_side=max_side,
            depth_interval=depth_interval,
        )
    except Exception as exc:
        # Intentional broad catch at the UI boundary — but don't swallow the
        # details: keep the full traceback in the server logs for debugging.
        import traceback
        traceback.print_exc()
        error_img = np.zeros((360, 640, 3), dtype=np.uint8)
        cv2.putText(error_img, "Runtime error", (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)
        cv2.putText(error_img, str(exc)[:70], (20, 120), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2)
        return error_img
325
+
326
+
327
# --- Gradio UI definition -------------------------------------------------
# Built at import time so Hugging Face Spaces can pick up `demo` directly.
with gr.Blocks(title="YOLO-3D Realtime CPU (HF Space)") as demo:
    gr.Markdown(
        """
        # YOLO-3D Realtime CPU
        `Mode 1`: Depth V2 Realtime
        `Mode 2`: Depth V2 Balanced
        `Mode 3`: Depth V2 Quality
        `Mode 4`: Fast Detect
        `Mode 5`: Ultra Fast Detect
        """
    )

    # Mode selection and coarse feature toggles.
    with gr.Row():
        mode = gr.Radio(
            choices=MODE_OPTIONS,
            value=DEPTH_MODE,
            label="Inference Mode",
        )
        auto_optimize = gr.Checkbox(value=True, label="Auto Optimize By Mode")
        enable_tracking = gr.Checkbox(value=False, label="Tracking")
        enable_bev = gr.Checkbox(value=False, label="Bird Eye View (Depth modes)")

    # Fine-tuning sliders (max_side / depth_interval are ignored while
    # auto-optimize is on — see RealtimeEngine.process).
    with gr.Row():
        conf_threshold = gr.Slider(0.10, 0.80, value=0.25, step=0.05, label="Confidence")
        iou_threshold = gr.Slider(0.20, 0.80, value=0.45, step=0.05, label="IoU")
        max_side = gr.Slider(320, 960, value=640, step=32, label="Max Inference Side")
        depth_interval = gr.Slider(1, 6, value=3, step=1, label="Depth Refresh (frames)")

    # Live webcam input and annotated output, side by side.
    with gr.Row():
        webcam = gr.Image(sources=["webcam"], streaming=True, type="numpy", label="Webcam")
        output = gr.Image(streaming=True, type="numpy", label="Output")

    # Stream frames every 100 ms; trigger_mode="always_last" drops stale
    # frames under load, and concurrency_limit=1 matches the engine's
    # single-lock design.
    webcam.stream(
        fn=process_frame,
        inputs=[
            webcam,
            mode,
            conf_threshold,
            iou_threshold,
            enable_tracking,
            enable_bev,
            auto_optimize,
            max_side,
            depth_interval,
        ],
        outputs=output,
        show_progress="hidden",
        trigger_mode="always_last",
        stream_every=0.1,
        concurrency_limit=1,
    )


if __name__ == "__main__":
    # Small queue keeps memory bounded if the client outpaces inference.
    demo.queue(max_size=4).launch()
bbox3d_utils.py CHANGED
@@ -659,7 +659,7 @@ class BirdEyeView:
659
 
660
  # Draw distance markers specifically for 1-5 meter range
661
  # Use fixed steps of 1 meter with intermediate markers at 0.5 meters
662
- for dist in [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]:
663
  y = self.origin_y - int(dist * self.scale)
664
 
665
  if y < 20: # Skip if too close to top
@@ -796,4 +796,4 @@ class BirdEyeView:
796
  Returns:
797
  numpy.ndarray: BEV image
798
  """
799
- return self.bev_image
 
659
 
660
  # Draw distance markers specifically for 1-5 meter range
661
  # Use fixed steps of 1 meter with intermediate markers at 0.5 meters
662
+ for dist in [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]:
663
  y = self.origin_y - int(dist * self.scale)
664
 
665
  if y < 20: # Skip if too close to top
 
796
  Returns:
797
  numpy.ndarray: BEV image
798
  """
799
+ return self.bev_image
depth_model.py CHANGED
@@ -1,10 +1,9 @@
1
  import os
2
  import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
  import numpy as np
6
  import cv2
7
  from transformers import pipeline
 
8
  from PIL import Image
9
 
10
  class DepthEstimator:
@@ -29,16 +28,22 @@ class DepthEstimator:
29
  device = 'cpu'
30
 
31
  self.device = device
 
 
32
 
33
  # Set MPS fallback for operations not supported on Apple Silicon
34
  if self.device == 'mps':
35
  print("Using MPS device with CPU fallback for unsupported operations")
36
  os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
37
  # For Depth Anything v2, we'll use CPU directly due to MPS compatibility issues
38
- self.pipe_device = 'cpu'
39
  print("Forcing CPU for depth estimation pipeline due to MPS compatibility issues")
 
 
 
 
40
  else:
41
- self.pipe_device = self.device
42
 
43
  print(f"Using device: {self.device} for depth estimation (pipeline on {self.pipe_device})")
44
 
@@ -59,7 +64,7 @@ class DepthEstimator:
59
  # Fallback to CPU if there are issues
60
  print(f"Error loading model on {self.pipe_device}: {e}")
61
  print("Falling back to CPU for depth estimation")
62
- self.pipe_device = 'cpu'
63
  self.pipe = pipeline(task="depth-estimation", model=model_name, device=self.pipe_device)
64
  print(f"Loaded Depth Anything v2 {model_size} model on CPU (fallback)")
65
 
@@ -95,7 +100,7 @@ class DepthEstimator:
95
  print(f"MPS error during depth estimation: {e}")
96
  print("Temporarily falling back to CPU for this frame")
97
  # Create a CPU pipeline for this frame
98
- cpu_pipe = pipeline(task="depth-estimation", model=self.pipe.model.config._name_or_path, device='cpu')
99
  depth_result = cpu_pipe(pil_image)
100
  depth_map = depth_result["depth"]
101
 
@@ -181,4 +186,4 @@ class DepthEstimator:
181
  elif method == 'min':
182
  return float(np.min(region))
183
  else:
184
- return float(np.median(region))
 
1
  import os
2
  import torch
 
 
3
  import numpy as np
4
  import cv2
5
  from transformers import pipeline
6
+ from transformers.utils import logging as hf_logging
7
  from PIL import Image
8
 
9
  class DepthEstimator:
 
28
  device = 'cpu'
29
 
30
  self.device = device
31
+ hf_logging.set_verbosity_error()
32
+ hf_logging.disable_progress_bar()
33
 
34
  # Set MPS fallback for operations not supported on Apple Silicon
35
  if self.device == 'mps':
36
  print("Using MPS device with CPU fallback for unsupported operations")
37
  os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
38
  # For Depth Anything v2, we'll use CPU directly due to MPS compatibility issues
39
+ self.pipe_device = -1
40
  print("Forcing CPU for depth estimation pipeline due to MPS compatibility issues")
41
+ elif self.device == 'cpu':
42
+ self.pipe_device = -1
43
+ elif isinstance(self.device, str) and self.device.startswith('cuda'):
44
+ self.pipe_device = 0
45
  else:
46
+ self.pipe_device = -1
47
 
48
  print(f"Using device: {self.device} for depth estimation (pipeline on {self.pipe_device})")
49
 
 
64
  # Fallback to CPU if there are issues
65
  print(f"Error loading model on {self.pipe_device}: {e}")
66
  print("Falling back to CPU for depth estimation")
67
+ self.pipe_device = -1
68
  self.pipe = pipeline(task="depth-estimation", model=model_name, device=self.pipe_device)
69
  print(f"Loaded Depth Anything v2 {model_size} model on CPU (fallback)")
70
 
 
100
  print(f"MPS error during depth estimation: {e}")
101
  print("Temporarily falling back to CPU for this frame")
102
  # Create a CPU pipeline for this frame
103
+ cpu_pipe = pipeline(task="depth-estimation", model=self.pipe.model.config._name_or_path, device=-1)
104
  depth_result = cpu_pipe(pil_image)
105
  depth_map = depth_result["depth"]
106
 
 
186
  elif method == 'min':
187
  return float(np.min(region))
188
  else:
189
+ return float(np.median(region))
requirements.txt CHANGED
@@ -2,6 +2,8 @@ torch>=2.0.0
2
  torchvision>=0.15.0
3
  opencv-python>=4.7.0
4
  numpy>=1.22.0
 
 
5
  ultralytics>=8.0.0 # For YOLOv11
6
  timm>=0.9.2 # Required for Depth Anything v2
7
  matplotlib>=3.7.0
@@ -12,4 +14,4 @@ filterpy>=1.4.5 # For Kalman filtering in tracking
12
  lap>=0.4.0 # For Hungarian algorithm in tracking
13
  scikit-image>=0.20.0
14
  pyyaml>=6.0
15
- requests>=2.28.0
 
2
  torchvision>=0.15.0
3
  opencv-python>=4.7.0
4
  numpy>=1.22.0
5
+ gradio>=5.0.0
6
+ transformers>=4.40.0
7
  ultralytics>=8.0.0 # For YOLOv11
8
  timm>=0.9.2 # Required for Depth Anything v2
9
  matplotlib>=3.7.0
 
14
  lap>=0.4.0 # For Hungarian algorithm in tracking
15
  scikit-image>=0.20.0
16
  pyyaml>=6.0
17
+ requests>=2.28.0
run_space.bat ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off
rem Convenience launcher for Windows: install dependencies, then start the
rem Gradio Space app (app.py) locally.
setlocal

echo [YOLO-3D] Installing dependencies...
python -m pip install --upgrade pip
python -m pip install -r requirements.txt

echo [YOLO-3D] Starting Gradio app...
python app.py

endlocal