unknown committed on
Commit
b8e1e8b
·
1 Parent(s): 63c1121

Add multi-mode HF Space app with CPU realtime profiles

Browse files
Files changed (6) hide show
  1. README.md +41 -0
  2. app.py +381 -0
  3. bbox3d_utils.py +2 -2
  4. depth_model.py +12 -7
  5. requirements.txt +3 -1
  6. run_space.bat +11 -0
README.md CHANGED
@@ -43,6 +43,47 @@ Run the main script:
43
  python run.py
44
  ```
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  ### Configuration Options
47
 
48
  You can modify the following parameters in `run.py`:
 
43
  python run.py
44
  ```
45
 
46
+ Run the Hugging Face Space app locally:
47
+
48
+ ```bash
49
+ python app.py
50
+ ```
51
+
52
+ On Windows, you can also run:
53
+
54
+ ```bash
55
+ run_space.bat
56
+ ```
57
+
58
+ ## Hugging Face Space (Webcam + CPU Realtime)
59
+
60
+ This repo now includes `app.py` for Gradio/Hugging Face Spaces with direct webcam streaming.
61
+
62
+ ### Modes
63
+
64
+ - **Depth V2 Realtime (CPU)**: YOLO + Depth Anything v2 + pseudo-3D boxes (+ optional BEV)
65
+ - **Depth V2 Balanced (CPU)**: Lower resolution/depth refresh profile for smoother CPU FPS
66
+ - **Depth V2 Quality (CPU)**: Higher quality depth profile (heavier on CPU)
67
+ - **Fast Detect (CPU)**: YOLO-only fast path for higher FPS on CPU
68
+ - **Ultra Fast Detect (CPU)**: Aggressive low-latency detect-only profile
69
+ - **Auto Optimize By Mode**: Apply recommended CPU settings per selected mode
70
+
71
+ ### Deploy steps
72
+
73
+ 1. Create a new **Gradio Space** on Hugging Face.
74
+ 2. Push this repository content to the Space.
75
+ 3. Keep `requirements.txt` and `app.py` at repo root.
76
+ 4. Hardware recommendation for smoother realtime on CPU:
77
+ - Pro account: choose a higher CPU tier with more vCPUs.
78
+ 5. Open the Space and allow browser webcam access.
79
+
80
+ ### Performance tuning for CPU
81
+
82
+ - Keep model at YOLO `nano`.
83
+ - Start with `Max Inference Side = 640`.
84
+ - In Depth mode, increase `Depth Refresh (frames)` to `3-5` for better FPS.
85
+ - Disable tracking and BEV if you need maximum realtime speed.
86
+
87
  ### Configuration Options
88
 
89
  You can modify the following parameters in `run.py`:
app.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import time
4
+ import threading
5
+ from collections import deque
6
+
7
+ import cv2
8
+ import gradio as gr
9
+ import numpy as np
10
+ import torch
11
+
12
+ from bbox3d_utils import BBox3DEstimator, BirdEyeView
13
+ from depth_model import DepthEstimator
14
+ from detection_model import ObjectDetector
15
+
16
+
17
# Display names for the selectable inference modes. These strings are both
# the UI labels (Radio choices) and the keys into MODE_PROFILES.
DEPTH_MODE = "Depth V2 Realtime (CPU)"
DEPTH_BALANCED_MODE = "Depth V2 Balanced (CPU)"
DEPTH_QUALITY_MODE = "Depth V2 Quality (CPU)"
FAST_MODE = "Fast Detect (CPU)"
ULTRA_FAST_MODE = "Ultra Fast Detect (CPU)"

# Mode ordering as presented in the UI mode selector.
MODE_OPTIONS = [
    DEPTH_MODE,
    DEPTH_BALANCED_MODE,
    DEPTH_QUALITY_MODE,
    FAST_MODE,
    ULTRA_FAST_MODE,
]

# Per-mode tuning profiles, applied in RealtimeEngine.process when
# "Auto Optimize By Mode" is enabled (max_det is always applied).
# Fields:
#   use_depth:      run the depth-estimation path (pseudo-3D boxes / BEV)
#   max_side:       longest image side used for inference resizing
#   depth_side:     longest side of the frame fed to the depth model
#                   (0 for detect-only modes, where depth never runs)
#   depth_interval: refresh the cached depth map every N frames
#   allow_tracking: whether the mode permits tracker usage at all
#   allow_bev:      whether the mode permits the bird's-eye-view overlay
#   max_det:        detector's maximum number of detections per frame
#   hud:            short label shown in the on-frame HUD text
MODE_PROFILES = {
    DEPTH_MODE: {
        "use_depth": True,
        "max_side": 640,
        "depth_side": 384,
        "depth_interval": 3,
        "allow_tracking": True,
        "allow_bev": True,
        "max_det": 120,
        "hud": "Depth Realtime",
    },
    DEPTH_BALANCED_MODE: {
        "use_depth": True,
        "max_side": 576,
        "depth_side": 320,
        "depth_interval": 4,
        "allow_tracking": True,
        "allow_bev": True,
        "max_det": 100,
        "hud": "Depth Balanced",
    },
    DEPTH_QUALITY_MODE: {
        "use_depth": True,
        "max_side": 768,
        "depth_side": 512,
        "depth_interval": 1,
        "allow_tracking": True,
        "allow_bev": True,
        "max_det": 150,
        "hud": "Depth Quality",
    },
    FAST_MODE: {
        "use_depth": False,
        "max_side": 640,
        "depth_side": 0,
        "depth_interval": 0,
        "allow_tracking": True,
        "allow_bev": False,
        "max_det": 100,
        "hud": "Fast Detect",
    },
    ULTRA_FAST_MODE: {
        "use_depth": False,
        "max_side": 416,
        "depth_side": 0,
        "depth_interval": 0,
        "allow_tracking": False,
        "allow_bev": False,
        "max_det": 80,
        "hud": "Ultra Fast",
    },
}
83
+
84
+
85
+ def _configure_cpu_runtime():
86
+ cpu_count = max(1, os.cpu_count() or 1)
87
+ thread_count = min(4, cpu_count)
88
+ os.environ.setdefault("OMP_NUM_THREADS", str(thread_count))
89
+ os.environ.setdefault("MKL_NUM_THREADS", str(thread_count))
90
+ torch.set_num_threads(thread_count)
91
+ if hasattr(torch, "set_num_interop_threads"):
92
+ torch.set_num_interop_threads(max(1, thread_count // 2))
93
+
94
+
95
class RealtimeEngine:
    """Stateful per-process inference engine for the Gradio webcam stream.

    Lazily constructs the detector and depth estimator on first use, caches
    the depth map across frames (refreshed every ``depth_interval`` frames),
    and serializes all frame processing behind a single lock so concurrent
    Gradio callbacks cannot interleave mutations of shared state.
    """

    def __init__(self):
        # Configure torch/OMP threading before any inference work.
        _configure_cpu_runtime()
        # Serializes process(); detector/depth state is not thread-safe.
        self.lock = threading.Lock()
        # Heavy models are created lazily (first frame), not at startup.
        self.detector = None
        self.depth_estimator = None
        self.bbox3d_estimator = BBox3DEstimator()
        self.bev = BirdEyeView(scale=55, size=(260, 260))
        # Frame counter driving the periodic depth-map refresh.
        self.frame_idx = 0
        # Last computed depth map, reused between refreshes.
        self.cached_depth_map = None
        # Rolling window of per-frame latencies for the HUD FPS estimate.
        self.latency_ms = deque(maxlen=30)
        # Longest side of the frame fed to the depth model (overridden by
        # the mode profile when auto-optimize is on).
        self.depth_input_side = 384

    def _ensure_detector(self):
        """Create the YOLO detector on first use (CPU, nano model)."""
        if self.detector is None:
            self.detector = ObjectDetector(
                model_size="nano",
                conf_thres=0.25,
                iou_thres=0.45,
                classes=None,
                device="cpu",
            )
            self.detector.model.overrides["max_det"] = 120

    @staticmethod
    def _profile(mode):
        """Return the tuning profile for *mode*, defaulting to DEPTH_MODE."""
        return MODE_PROFILES.get(mode, MODE_PROFILES[DEPTH_MODE])

    def _ensure_depth(self):
        """Create the depth estimator on first use (CPU, small model)."""
        if self.depth_estimator is None:
            self.depth_estimator = DepthEstimator(model_size="small", device="cpu")

    @staticmethod
    def _resize_for_inference(frame, max_side):
        """Downscale *frame* so its longest side is <= max_side.

        Returns the frame unchanged if it is already small enough; both
        dimensions are floored at 32 px to keep the models' inputs valid.
        """
        h, w = frame.shape[:2]
        longest = max(h, w)
        if longest <= max_side:
            return frame
        scale = max_side / float(longest)
        new_w = max(32, int(w * scale))
        new_h = max(32, int(h * scale))
        return cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_AREA)

    @staticmethod
    def _overlay_corner(base, overlay, size_ratio=0.26, anchor="tl"):
        """Paste *overlay* (aspect-preserved, white-bordered) into a corner of *base*.

        Mutates *base* in place. *anchor* is one of "tl", "tr", "bl", "br";
        any other value falls back to top-left. The overlay is scaled to
        size_ratio of the base height, clamped to at most half of each base
        dimension and at least 64 px.
        """
        h, w = base.shape[:2]
        target_h = max(64, int(h * size_ratio))
        # Preserve the overlay's aspect ratio when deriving the width.
        target_w = int((overlay.shape[1] / max(1, overlay.shape[0])) * target_h)
        target_w = max(64, min(target_w, w // 2))
        target_h = min(target_h, h // 2)
        resized = cv2.resize(overlay, (target_w, target_h), interpolation=cv2.INTER_AREA)

        if anchor == "tr":
            x0, y0 = w - target_w, 0
        elif anchor == "bl":
            x0, y0 = 0, h - target_h
        elif anchor == "br":
            x0, y0 = w - target_w, h - target_h
        else:
            x0, y0 = 0, 0

        base[y0:y0 + target_h, x0:x0 + target_w] = resized
        cv2.rectangle(base, (x0, y0), (x0 + target_w, y0 + target_h), (255, 255, 255), 1)

    def _draw_hud(self, frame, mode_name):
        """Draw the mode / FPS / latency HUD text onto *frame* in place.

        FPS is derived from the rolling mean latency (0.0 until any frame
        has been timed).
        """
        mean_latency = float(np.mean(self.latency_ms)) if self.latency_ms else 0.0
        fps = (1000.0 / mean_latency) if mean_latency > 0 else 0.0
        text = f"{mode_name} | CPU | FPS {fps:.1f} | Latency {mean_latency:.1f} ms"
        cv2.putText(frame, text, (10, 28), cv2.FONT_HERSHEY_SIMPLEX, 0.65, (0, 0, 255), 2)

    def _render_depth_mode(self, frame_bgr, enable_tracking, enable_bev, depth_interval, hud_name):
        """Full pipeline for depth modes: detect, depth, 3D boxes, overlays.

        The depth map is recomputed only every *depth_interval* frames (or on
        the first frame) and cached; between refreshes the cached map is
        reused for all detections. Returns the annotated BGR frame.
        """
        result_frame = frame_bgr.copy()
        # detect() returns (annotated_frame, detections); only the raw
        # detections are needed here since boxes are drawn in 3D below.
        _, detections = self.detector.detect(frame_bgr, track=enable_tracking)

        self.frame_idx += 1
        if self.cached_depth_map is None or (self.frame_idx % depth_interval == 0):
            # Run depth on a downscaled copy, then upsample back so the map
            # aligns pixel-for-pixel with the detection frame.
            depth_input = self._resize_for_inference(frame_bgr, self.depth_input_side)
            depth_map = self.depth_estimator.estimate_depth(depth_input)
            if depth_map.shape[:2] != frame_bgr.shape[:2]:
                depth_map = cv2.resize(
                    depth_map,
                    (frame_bgr.shape[1], frame_bgr.shape[0]),
                    interpolation=cv2.INTER_LINEAR,
                )
            self.cached_depth_map = depth_map
        depth_map = self.cached_depth_map
        depth_colored = self.depth_estimator.colorize_depth(depth_map)

        class_names = self.detector.get_class_names()
        boxes_3d = []
        active_ids = []

        for detection in detections:
            # Each detection is (bbox, score, class_id, object_id);
            # object_id is None when tracking is disabled.
            bbox, score, class_id, obj_id = detection
            class_name = class_names[class_id]
            if class_name.lower() in ["person", "cat", "dog"]:
                # Articulated subjects: sample depth at the box center
                # rather than aggregating the whole region.
                center_x = int((bbox[0] + bbox[2]) / 2.0)
                center_y = int((bbox[1] + bbox[3]) / 2.0)
                depth_value = self.depth_estimator.get_depth_at_point(depth_map, center_x, center_y)
                depth_method = "center"
            else:
                depth_value = self.depth_estimator.get_depth_in_region(depth_map, bbox, method="median")
                depth_method = "median"

            boxes_3d.append(
                {
                    "bbox_2d": bbox,
                    "depth_value": float(depth_value),
                    "depth_method": depth_method,
                    "class_name": class_name,
                    "object_id": obj_id,
                    "score": score,
                }
            )
            if obj_id is not None:
                active_ids.append(obj_id)

        # Drop per-object Kalman state for IDs no longer present.
        self.bbox3d_estimator.cleanup_trackers(active_ids)

        for box_3d in boxes_3d:
            result_frame = self.bbox3d_estimator.draw_box_3d(result_frame, box_3d, color=(0, 255, 255))

        if enable_bev:
            self.bev.reset()
            for box_3d in boxes_3d:
                self.bev.draw_box(box_3d)
            bev_img = self.bev.get_image()
            self._overlay_corner(result_frame, bev_img, size_ratio=0.30, anchor="bl")

        # Depth preview always shown in depth modes (top-left corner).
        self._overlay_corner(result_frame, depth_colored, size_ratio=0.24, anchor="tl")
        self._draw_hud(result_frame, hud_name)
        return result_frame

    def _render_fast_mode(self, frame_bgr, enable_tracking, hud_name):
        """Detect-only fast path: use the detector's own annotated frame."""
        annotated, _ = self.detector.detect(frame_bgr, track=enable_tracking)
        self._draw_hud(annotated, hud_name)
        return annotated

    def process(
        self,
        frame_rgb,
        mode,
        conf_threshold,
        iou_threshold,
        enable_tracking,
        enable_bev,
        auto_optimize,
        max_side,
        depth_interval,
    ):
        """Process one RGB webcam frame and return the annotated RGB frame.

        Returns None for a None input (Gradio sends None before the webcam
        starts). All work runs under the engine lock; per-frame latency is
        recorded into the rolling HUD window.

        Args:
            frame_rgb: input frame as an RGB array (or None).
            mode: one of MODE_OPTIONS (unknown values fall back to DEPTH_MODE).
            conf_threshold / iou_threshold: detector thresholds.
            enable_tracking / enable_bev: user toggles, additionally gated by
                the mode profile's allow_tracking / allow_bev flags.
            auto_optimize: when True, resolution and depth cadence come from
                the mode profile instead of the sliders.
            max_side: manual max inference side (used when not auto-optimizing).
            depth_interval: manual depth refresh cadence (clamped to >= 1).
        """
        if frame_rgb is None:
            return None

        with self.lock:
            start = time.perf_counter()
            profile = self._profile(mode)
            self._ensure_detector()
            # Push the current UI thresholds into the detector each frame.
            self.detector.model.overrides["conf"] = float(conf_threshold)
            self.detector.model.overrides["iou"] = float(iou_threshold)
            self.detector.model.overrides["max_det"] = int(profile["max_det"])

            if auto_optimize:
                effective_max_side = int(profile["max_side"])
                effective_depth_interval = int(profile["depth_interval"])
                # Only depth modes carry a meaningful depth_side; keep the
                # previous value for detect-only modes.
                self.depth_input_side = int(profile["depth_side"]) if profile["use_depth"] else self.depth_input_side
                effective_tracking = bool(enable_tracking and profile["allow_tracking"])
                effective_bev = bool(enable_bev and profile["allow_bev"])
            else:
                effective_max_side = int(max_side)
                # Clamp to >= 1: the depth path uses this as a modulus.
                effective_depth_interval = max(1, int(depth_interval))
                effective_tracking = bool(enable_tracking and profile["allow_tracking"])
                effective_bev = bool(enable_bev and profile["allow_bev"])

            # Gradio delivers RGB; the pipeline (OpenCV/models) works in BGR.
            frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
            frame_bgr = self._resize_for_inference(frame_bgr, effective_max_side)
            # Match the detector's inference size to the resized frame.
            self.detector.model.overrides["imgsz"] = int(max(frame_bgr.shape[:2]))

            if profile["use_depth"]:
                self._ensure_depth()
                out_bgr = self._render_depth_mode(
                    frame_bgr=frame_bgr,
                    enable_tracking=effective_tracking,
                    enable_bev=effective_bev,
                    depth_interval=effective_depth_interval,
                    hud_name=profile["hud"],
                )
            else:
                out_bgr = self._render_fast_mode(
                    frame_bgr=frame_bgr,
                    enable_tracking=effective_tracking,
                    hud_name=profile["hud"],
                )

            elapsed_ms = (time.perf_counter() - start) * 1000.0
            self.latency_ms.append(elapsed_ms)
            output_rgb = cv2.cvtColor(out_bgr, cv2.COLOR_BGR2RGB)
            return output_rgb
292
+
293
+
294
# Single shared engine instance for the whole process; its internal lock
# serializes concurrent Gradio stream callbacks.
engine = RealtimeEngine()
295
+
296
+
297
def process_frame(
    frame,
    mode,
    conf_threshold,
    iou_threshold,
    enable_tracking,
    enable_bev,
    auto_optimize,
    max_side,
    depth_interval,
):
    """Gradio stream callback: run the engine on one frame, never raise.

    Delegates to the shared RealtimeEngine. On any failure it logs the full
    traceback to the server log and returns a black error image (with the
    first 70 characters of the message) so the stream keeps running instead
    of crashing the UI.
    """
    try:
        return engine.process(
            frame_rgb=frame,
            mode=mode,
            conf_threshold=conf_threshold,
            iou_threshold=iou_threshold,
            enable_tracking=enable_tracking,
            enable_bev=enable_bev,
            auto_optimize=auto_optimize,
            max_side=max_side,
            depth_interval=depth_interval,
        )
    except Exception as exc:
        # Intentional broad catch at the UI boundary — but don't swallow the
        # details: keep the full traceback in the server logs for debugging.
        import traceback
        traceback.print_exc()
        error_img = np.zeros((360, 640, 3), dtype=np.uint8)
        cv2.putText(error_img, "Runtime error", (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)
        cv2.putText(error_img, str(exc)[:70], (20, 120), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2)
        return error_img
325
+
326
+
327
# --- Gradio UI definition -------------------------------------------------
# Built at import time so Hugging Face Spaces can pick up `demo` directly.
with gr.Blocks(title="YOLO-3D Realtime CPU (HF Space)") as demo:
    gr.Markdown(
        """
        # YOLO-3D Realtime CPU
        `Mode 1`: Depth V2 Realtime
        `Mode 2`: Depth V2 Balanced
        `Mode 3`: Depth V2 Quality
        `Mode 4`: Fast Detect
        `Mode 5`: Ultra Fast Detect
        """
    )

    # Mode selection and coarse feature toggles.
    with gr.Row():
        mode = gr.Radio(
            choices=MODE_OPTIONS,
            value=DEPTH_MODE,
            label="Inference Mode",
        )
        auto_optimize = gr.Checkbox(value=True, label="Auto Optimize By Mode")
        enable_tracking = gr.Checkbox(value=False, label="Tracking")
        enable_bev = gr.Checkbox(value=False, label="Bird Eye View (Depth modes)")

    # Fine-tuning sliders (max_side / depth_interval are ignored while
    # auto-optimize is on — see RealtimeEngine.process).
    with gr.Row():
        conf_threshold = gr.Slider(0.10, 0.80, value=0.25, step=0.05, label="Confidence")
        iou_threshold = gr.Slider(0.20, 0.80, value=0.45, step=0.05, label="IoU")
        max_side = gr.Slider(320, 960, value=640, step=32, label="Max Inference Side")
        depth_interval = gr.Slider(1, 6, value=3, step=1, label="Depth Refresh (frames)")

    # Live webcam input and annotated output, side by side.
    with gr.Row():
        webcam = gr.Image(sources=["webcam"], streaming=True, type="numpy", label="Webcam")
        output = gr.Image(streaming=True, type="numpy", label="Output")

    # Stream frames every 100 ms; trigger_mode="always_last" drops stale
    # frames under load, and concurrency_limit=1 matches the engine's
    # single-lock design.
    webcam.stream(
        fn=process_frame,
        inputs=[
            webcam,
            mode,
            conf_threshold,
            iou_threshold,
            enable_tracking,
            enable_bev,
            auto_optimize,
            max_side,
            depth_interval,
        ],
        outputs=output,
        show_progress="hidden",
        trigger_mode="always_last",
        stream_every=0.1,
        concurrency_limit=1,
    )


if __name__ == "__main__":
    # Small queue keeps memory bounded if the client outpaces inference.
    demo.queue(max_size=4).launch()
bbox3d_utils.py CHANGED
@@ -659,7 +659,7 @@ class BirdEyeView:
659
 
660
  # Draw distance markers specifically for 1-5 meter range
661
  # Use fixed steps of 1 meter with intermediate markers at 0.5 meters
662
- for dist in [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]:
663
  y = self.origin_y - int(dist * self.scale)
664
 
665
  if y < 20: # Skip if too close to top
@@ -796,4 +796,4 @@ class BirdEyeView:
796
  Returns:
797
  numpy.ndarray: BEV image
798
  """
799
- return self.bev_image
 
659
 
660
  # Draw distance markers specifically for 1-5 meter range
661
  # Use fixed steps of 1 meter with intermediate markers at 0.5 meters
662
+ for dist in [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]:
663
  y = self.origin_y - int(dist * self.scale)
664
 
665
  if y < 20: # Skip if too close to top
 
796
  Returns:
797
  numpy.ndarray: BEV image
798
  """
799
+ return self.bev_image
depth_model.py CHANGED
@@ -1,10 +1,9 @@
1
  import os
2
  import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
  import numpy as np
6
  import cv2
7
  from transformers import pipeline
 
8
  from PIL import Image
9
 
10
  class DepthEstimator:
@@ -29,16 +28,22 @@ class DepthEstimator:
29
  device = 'cpu'
30
 
31
  self.device = device
 
 
32
 
33
  # Set MPS fallback for operations not supported on Apple Silicon
34
  if self.device == 'mps':
35
  print("Using MPS device with CPU fallback for unsupported operations")
36
  os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
37
  # For Depth Anything v2, we'll use CPU directly due to MPS compatibility issues
38
- self.pipe_device = 'cpu'
39
  print("Forcing CPU for depth estimation pipeline due to MPS compatibility issues")
 
 
 
 
40
  else:
41
- self.pipe_device = self.device
42
 
43
  print(f"Using device: {self.device} for depth estimation (pipeline on {self.pipe_device})")
44
 
@@ -59,7 +64,7 @@ class DepthEstimator:
59
  # Fallback to CPU if there are issues
60
  print(f"Error loading model on {self.pipe_device}: {e}")
61
  print("Falling back to CPU for depth estimation")
62
- self.pipe_device = 'cpu'
63
  self.pipe = pipeline(task="depth-estimation", model=model_name, device=self.pipe_device)
64
  print(f"Loaded Depth Anything v2 {model_size} model on CPU (fallback)")
65
 
@@ -95,7 +100,7 @@ class DepthEstimator:
95
  print(f"MPS error during depth estimation: {e}")
96
  print("Temporarily falling back to CPU for this frame")
97
  # Create a CPU pipeline for this frame
98
- cpu_pipe = pipeline(task="depth-estimation", model=self.pipe.model.config._name_or_path, device='cpu')
99
  depth_result = cpu_pipe(pil_image)
100
  depth_map = depth_result["depth"]
101
 
@@ -181,4 +186,4 @@ class DepthEstimator:
181
  elif method == 'min':
182
  return float(np.min(region))
183
  else:
184
- return float(np.median(region))
 
1
  import os
2
  import torch
 
 
3
  import numpy as np
4
  import cv2
5
  from transformers import pipeline
6
+ from transformers.utils import logging as hf_logging
7
  from PIL import Image
8
 
9
  class DepthEstimator:
 
28
  device = 'cpu'
29
 
30
  self.device = device
31
+ hf_logging.set_verbosity_error()
32
+ hf_logging.disable_progress_bar()
33
 
34
  # Set MPS fallback for operations not supported on Apple Silicon
35
  if self.device == 'mps':
36
  print("Using MPS device with CPU fallback for unsupported operations")
37
  os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
38
  # For Depth Anything v2, we'll use CPU directly due to MPS compatibility issues
39
+ self.pipe_device = -1
40
  print("Forcing CPU for depth estimation pipeline due to MPS compatibility issues")
41
+ elif self.device == 'cpu':
42
+ self.pipe_device = -1
43
+ elif isinstance(self.device, str) and self.device.startswith('cuda'):
44
+ self.pipe_device = 0
45
  else:
46
+ self.pipe_device = -1
47
 
48
  print(f"Using device: {self.device} for depth estimation (pipeline on {self.pipe_device})")
49
 
 
64
  # Fallback to CPU if there are issues
65
  print(f"Error loading model on {self.pipe_device}: {e}")
66
  print("Falling back to CPU for depth estimation")
67
+ self.pipe_device = -1
68
  self.pipe = pipeline(task="depth-estimation", model=model_name, device=self.pipe_device)
69
  print(f"Loaded Depth Anything v2 {model_size} model on CPU (fallback)")
70
 
 
100
  print(f"MPS error during depth estimation: {e}")
101
  print("Temporarily falling back to CPU for this frame")
102
  # Create a CPU pipeline for this frame
103
+ cpu_pipe = pipeline(task="depth-estimation", model=self.pipe.model.config._name_or_path, device=-1)
104
  depth_result = cpu_pipe(pil_image)
105
  depth_map = depth_result["depth"]
106
 
 
186
  elif method == 'min':
187
  return float(np.min(region))
188
  else:
189
+ return float(np.median(region))
requirements.txt CHANGED
@@ -2,6 +2,8 @@ torch>=2.0.0
2
  torchvision>=0.15.0
3
  opencv-python>=4.7.0
4
  numpy>=1.22.0
 
 
5
  ultralytics>=8.0.0 # For YOLOv11
6
  timm>=0.9.2 # Required for Depth Anything v2
7
  matplotlib>=3.7.0
@@ -12,4 +14,4 @@ filterpy>=1.4.5 # For Kalman filtering in tracking
12
  lap>=0.4.0 # For Hungarian algorithm in tracking
13
  scikit-image>=0.20.0
14
  pyyaml>=6.0
15
- requests>=2.28.0
 
2
  torchvision>=0.15.0
3
  opencv-python>=4.7.0
4
  numpy>=1.22.0
5
+ gradio>=5.0.0
6
+ transformers>=4.40.0
7
  ultralytics>=8.0.0 # For YOLOv11
8
  timm>=0.9.2 # Required for Depth Anything v2
9
  matplotlib>=3.7.0
 
14
  lap>=0.4.0 # For Hungarian algorithm in tracking
15
  scikit-image>=0.20.0
16
  pyyaml>=6.0
17
+ requests>=2.28.0
run_space.bat ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off
rem Convenience launcher for Windows: install dependencies, then start the
rem Gradio Space app (app.py) locally.
setlocal

echo [YOLO-3D] Installing dependencies...
python -m pip install --upgrade pip
python -m pip install -r requirements.txt

echo [YOLO-3D] Starting Gradio app...
python app.py

endlocal