Zhen Ye committed on
Commit
1c4206e
·
1 Parent(s): 78f99f1

added depth anything v2

Browse files
app.py CHANGED
@@ -16,6 +16,7 @@ from fastapi.staticfiles import StaticFiles
16
  import uvicorn
17
 
18
  from inference import process_first_frame, run_inference, run_segmentation
 
19
  from jobs.background import process_video_async
20
  from jobs.models import JobInfo, JobStatus
21
  from jobs.storage import (
@@ -259,6 +260,7 @@ async def detect_async_endpoint(
259
  queries: str = Form(""),
260
  detector: str = Form("hf_yolov8"),
261
  segmenter: str = Form("sam3"),
 
262
  ):
263
  if mode not in VALID_MODES:
264
  raise HTTPException(
@@ -289,6 +291,16 @@ async def detect_async_endpoint(
289
  if not query_list:
290
  query_list = _default_queries_for_mode(mode)
291
 
 
 
 
 
 
 
 
 
 
 
292
  detector_name = detector
293
  if mode == "drone_detection":
294
  detector_name = "drone_yolo"
@@ -318,7 +330,7 @@ async def detect_async_endpoint(
318
  output_video_path=str(output_path),
319
  first_frame_path=str(first_frame_path),
320
  first_frame_detections=detections,
321
- depth_estimator_name="depth_pro",
322
  depth_output_path=str(depth_output_path),
323
  first_frame_depth_path=str(first_frame_depth_path),
324
  )
 
16
  import uvicorn
17
 
18
  from inference import process_first_frame, run_inference, run_segmentation
19
+ from models.depth_estimators.model_loader import list_depth_estimators
20
  from jobs.background import process_video_async
21
  from jobs.models import JobInfo, JobStatus
22
  from jobs.storage import (
 
260
  queries: str = Form(""),
261
  detector: str = Form("hf_yolov8"),
262
  segmenter: str = Form("sam3"),
263
+ depth_estimator: str = Form("depth_pro"),
264
  ):
265
  if mode not in VALID_MODES:
266
  raise HTTPException(
 
291
  if not query_list:
292
  query_list = _default_queries_for_mode(mode)
293
 
294
+ available_depth_estimators = set(list_depth_estimators())
295
+ if depth_estimator not in available_depth_estimators:
296
+ raise HTTPException(
297
+ status_code=400,
298
+ detail=(
299
+ f"Invalid depth estimator '{depth_estimator}'. "
300
+ f"Must be one of: {', '.join(sorted(available_depth_estimators))}"
301
+ ),
302
+ )
303
+
304
  detector_name = detector
305
  if mode == "drone_detection":
306
  detector_name = "drone_yolo"
 
330
  output_video_path=str(output_path),
331
  first_frame_path=str(first_frame_path),
332
  first_frame_detections=detections,
333
+ depth_estimator_name=depth_estimator,
334
  depth_output_path=str(depth_output_path),
335
  first_frame_depth_path=str(first_frame_depth_path),
336
  )
demo.html CHANGED
@@ -285,12 +285,6 @@
285
  text-align: center;
286
  }
287
 
288
- .depth-status {
289
- margin-top: 8px;
290
- font-size: 0.85rem;
291
- color: #6b7280;
292
- }
293
-
294
  .spinner {
295
  border: 4px solid #e5e7eb;
296
  border-top: 4px solid #1f2933;
@@ -454,16 +448,6 @@
454
  <img id="firstFrameImage" class="frame-preview" alt="First frame preview">
455
  </div>
456
  </div>
457
- <div class="video-card">
458
- <div class="video-card-header">First Frame (Depth)</div>
459
- <div class="video-card-body">
460
- <div id="depthFramePlaceholder" class="frame-placeholder">
461
- Depth preview will appear after processing.
462
- </div>
463
- <img id="depthFrameImage" class="frame-preview hidden" alt="First frame depth preview">
464
- <div id="depthFrameStatus" class="depth-status"></div>
465
- </div>
466
- </div>
467
  <div class="video-card">
468
  <div class="video-card-header">Original Video</div>
469
  <div class="video-card-body">
@@ -479,16 +463,6 @@
479
  </a>
480
  </div>
481
  </div>
482
- <div class="video-card">
483
- <div class="video-card-header">Depth Video</div>
484
- <div class="video-card-body">
485
- <video id="depthVideo" controls autoplay loop class="hidden"></video>
486
- <a id="depthDownloadBtn" class="download-btn hidden" download="depth.mp4">
487
- Download Depth Video
488
- </a>
489
- <div id="depthVideoStatus" class="depth-status"></div>
490
- </div>
491
- </div>
492
  </div>
493
  </div>
494
  </div>
@@ -502,7 +476,6 @@
502
  let detectionVideoUrl = null;
503
  let depthVideoUrl = null;
504
  let detectionFirstFrameUrl = null;
505
- let depthFirstFrameUrl = null;
506
 
507
  // Elements
508
  const modeCards = document.querySelectorAll('.mode-card');
@@ -521,12 +494,6 @@
521
  const processedVideo = document.getElementById('processedVideo');
522
  const firstFrameImage = document.getElementById('firstFrameImage');
523
  const downloadBtn = document.getElementById('downloadBtn');
524
- const depthFrameImage = document.getElementById('depthFrameImage');
525
- const depthFramePlaceholder = document.getElementById('depthFramePlaceholder');
526
- const depthFrameStatus = document.getElementById('depthFrameStatus');
527
- const depthVideo = document.getElementById('depthVideo');
528
- const depthDownloadBtn = document.getElementById('depthDownloadBtn');
529
- const depthVideoStatus = document.getElementById('depthVideoStatus');
530
  const viewToggleContainer = document.getElementById('viewToggleContainer');
531
  const detectionViewBtn = document.getElementById('detectionViewBtn');
532
  const depthViewBtn = document.getElementById('depthViewBtn');
@@ -543,8 +510,6 @@
543
 
544
  if (detectionFirstFrameUrl) {
545
  firstFrameImage.src = detectionFirstFrameUrl;
546
- depthFrameImage.classList.add('hidden');
547
- depthFramePlaceholder.classList.remove('hidden');
548
  }
549
  if (detectionVideoUrl) {
550
  processedVideo.src = detectionVideoUrl;
@@ -556,11 +521,6 @@
556
  depthViewBtn.classList.add('active');
557
  detectionViewBtn.classList.remove('active');
558
 
559
- if (depthFirstFrameUrl) {
560
- firstFrameImage.src = depthFirstFrameUrl;
561
- depthFrameImage.classList.add('hidden');
562
- depthFramePlaceholder.classList.add('hidden');
563
- }
564
  if (depthVideoUrl) {
565
  processedVideo.src = depthVideoUrl;
566
  downloadBtn.href = depthVideoUrl;
@@ -643,25 +603,14 @@
643
  statusPoller = null;
644
  }
645
  firstFrameImage.removeAttribute('src');
646
- depthFrameImage.removeAttribute('src');
647
- depthFrameImage.classList.add('hidden');
648
- depthFramePlaceholder.classList.remove('hidden');
649
- depthFrameStatus.textContent = '';
650
  processedVideo.removeAttribute('src');
651
  processedVideo.load();
652
  downloadBtn.removeAttribute('href');
653
- depthVideo.removeAttribute('src');
654
- depthVideo.load();
655
- depthVideo.classList.add('hidden');
656
- depthDownloadBtn.removeAttribute('href');
657
- depthDownloadBtn.classList.add('hidden');
658
- depthVideoStatus.textContent = '';
659
  viewToggleContainer.classList.add('hidden');
660
  currentView = 'detection';
661
  detectionVideoUrl = null;
662
  depthVideoUrl = null;
663
  detectionFirstFrameUrl = null;
664
- depthFirstFrameUrl = null;
665
  statusLine.classList.add('hidden');
666
  statusLine.textContent = '';
667
 
@@ -749,51 +698,23 @@
749
  });
750
 
751
  async function loadDepthAssets(jobData) {
752
- if (!jobData.first_frame_depth_url || !jobData.depth_video_url) {
753
- depthFrameStatus.textContent = 'Depth endpoints not available for this job.';
754
- depthVideoStatus.textContent = 'Depth endpoints not available for this job.';
755
  return;
756
  }
757
 
758
- try {
759
- const frameResponse = await fetch(jobData.first_frame_depth_url);
760
- if (frameResponse.ok) {
761
- const frameBlob = await frameResponse.blob();
762
- depthFirstFrameUrl = URL.createObjectURL(frameBlob);
763
- depthFrameImage.src = depthFirstFrameUrl;
764
- depthFrameImage.classList.remove('hidden');
765
- depthFramePlaceholder.classList.add('hidden');
766
- } else {
767
- const error = await frameResponse.json();
768
- depthFrameStatus.textContent = error.detail || 'Depth preview unavailable.';
769
- }
770
- } catch (error) {
771
- depthFrameStatus.textContent = 'Depth preview failed to load.';
772
- }
773
-
774
  try {
775
  const depthResponse = await fetch(jobData.depth_video_url);
776
  if (depthResponse.ok) {
777
  const depthBlob = await depthResponse.blob();
778
  depthVideoUrl = URL.createObjectURL(depthBlob);
779
 
780
- // Keep depth video card hidden - using toggle instead
781
- depthVideo.src = depthVideoUrl;
782
- depthVideo.classList.add('hidden');
783
- depthDownloadBtn.classList.add('hidden');
784
-
785
  // Show toggle buttons now that we have both videos
786
  viewToggleContainer.classList.remove('hidden');
787
 
788
  // Start with detection view
789
  switchToView('detection');
790
- } else {
791
- const error = await depthResponse.json();
792
- depthVideoStatus.textContent = error.detail || 'Depth video unavailable.';
793
  }
794
- } catch (error) {
795
- depthVideoStatus.textContent = 'Depth video failed to load.';
796
- }
797
  }
798
 
799
  </script>
 
285
  text-align: center;
286
  }
287
 
 
 
 
 
 
 
288
  .spinner {
289
  border: 4px solid #e5e7eb;
290
  border-top: 4px solid #1f2933;
 
448
  <img id="firstFrameImage" class="frame-preview" alt="First frame preview">
449
  </div>
450
  </div>
 
 
 
 
 
 
 
 
 
 
451
  <div class="video-card">
452
  <div class="video-card-header">Original Video</div>
453
  <div class="video-card-body">
 
463
  </a>
464
  </div>
465
  </div>
 
 
 
 
 
 
 
 
 
 
466
  </div>
467
  </div>
468
  </div>
 
476
  let detectionVideoUrl = null;
477
  let depthVideoUrl = null;
478
  let detectionFirstFrameUrl = null;
 
479
 
480
  // Elements
481
  const modeCards = document.querySelectorAll('.mode-card');
 
494
  const processedVideo = document.getElementById('processedVideo');
495
  const firstFrameImage = document.getElementById('firstFrameImage');
496
  const downloadBtn = document.getElementById('downloadBtn');
 
 
 
 
 
 
497
  const viewToggleContainer = document.getElementById('viewToggleContainer');
498
  const detectionViewBtn = document.getElementById('detectionViewBtn');
499
  const depthViewBtn = document.getElementById('depthViewBtn');
 
510
 
511
  if (detectionFirstFrameUrl) {
512
  firstFrameImage.src = detectionFirstFrameUrl;
 
 
513
  }
514
  if (detectionVideoUrl) {
515
  processedVideo.src = detectionVideoUrl;
 
521
  depthViewBtn.classList.add('active');
522
  detectionViewBtn.classList.remove('active');
523
 
 
 
 
 
 
524
  if (depthVideoUrl) {
525
  processedVideo.src = depthVideoUrl;
526
  downloadBtn.href = depthVideoUrl;
 
603
  statusPoller = null;
604
  }
605
  firstFrameImage.removeAttribute('src');
 
 
 
 
606
  processedVideo.removeAttribute('src');
607
  processedVideo.load();
608
  downloadBtn.removeAttribute('href');
 
 
 
 
 
 
609
  viewToggleContainer.classList.add('hidden');
610
  currentView = 'detection';
611
  detectionVideoUrl = null;
612
  depthVideoUrl = null;
613
  detectionFirstFrameUrl = null;
 
614
  statusLine.classList.add('hidden');
615
  statusLine.textContent = '';
616
 
 
698
  });
699
 
700
  async function loadDepthAssets(jobData) {
701
+ if (!jobData.depth_video_url) {
 
 
702
  return;
703
  }
704
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705
  try {
706
  const depthResponse = await fetch(jobData.depth_video_url);
707
  if (depthResponse.ok) {
708
  const depthBlob = await depthResponse.blob();
709
  depthVideoUrl = URL.createObjectURL(depthBlob);
710
 
 
 
 
 
 
711
  // Show toggle buttons now that we have both videos
712
  viewToggleContainer.classList.remove('hidden');
713
 
714
  // Start with detection view
715
  switchToView('detection');
 
 
 
716
  }
717
+ } catch (error) {}
 
 
718
  }
719
 
720
  </script>
jobs/models.py CHANGED
@@ -27,7 +27,7 @@ class JobInfo:
27
  error: Optional[str] = None
28
  first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
29
  # Depth estimation fields
30
- depth_estimator_name: str = "depth_pro" # Always depth_pro for now
31
  depth_output_path: Optional[str] = None
32
  first_frame_depth_path: Optional[str] = None
33
  partial_success: bool = False # True if one component failed but job completed
 
27
  error: Optional[str] = None
28
  first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
29
  # Depth estimation fields
30
+ depth_estimator_name: str = "depth_pro"
31
  depth_output_path: Optional[str] = None
32
  first_frame_depth_path: Optional[str] = None
33
  partial_success: bool = False # True if one component failed but job completed
models/depth_estimators/depth_anything_v2.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ import numpy as np
4
+ import torch
5
+ from huggingface_hub import hf_hub_download
6
+
7
+ from .base import DepthEstimator, DepthResult
8
+
9
+
10
class DepthAnythingV2Estimator(DepthEstimator):
    """Depth-Anything V2 depth estimator.

    Builds the ``vitl`` encoder variant, downloads its checkpoint from the
    Hugging Face Hub at construction time and runs per-frame inference
    through :meth:`predict`.
    """

    name = "depth_anything_v2"

    def __init__(self) -> None:
        """Create the model, load its weights and move it to the device.

        Raises:
            ImportError: If the ``depth_anything_v2`` package cannot be
                imported.
        """
        # Imported here rather than at module level so a missing package
        # only fails when this estimator is actually instantiated.
        try:
            from depth_anything_v2.dpt import DepthAnythingV2
        except ImportError as exc:
            raise ImportError(
                "depth-anything-v2 package not installed. "
                "Install from https://github.com/DepthAnything/Depth-Anything-V2"
            ) from exc

        logging.info("Loading Depth-Anything V2 model from Hugging Face...")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = DepthAnythingV2(
            encoder="vitl",
            features=256,
            out_channels=[256, 512, 1024, 1024],
        )
        weights_path = hf_hub_download(
            repo_id="depth-anything/Depth-Anything-V2-Large",
            filename="depth_anything_v2_vitl.pth",
            repo_type="model",
        )
        # The checkpoint is a downloaded pickle file; weights_only=True makes
        # torch refuse to execute arbitrary pickled code while loading it.
        state_dict = torch.load(
            weights_path,
            map_location="cpu",
            weights_only=True,
        )
        self.model.load_state_dict(state_dict)
        self.model.to(self.device).eval()

        # Report the device that was actually selected above.
        if self.device.type == "cuda":
            logging.info("Depth-Anything V2 model loaded on GPU")
        else:
            logging.warning("Depth-Anything V2 model loaded on CPU (no CUDA available)")

    def predict(self, frame: np.ndarray) -> DepthResult:
        """
        Run depth estimation on a single frame.

        Inference errors are not propagated: they are logged with a
        traceback and an all-zero depth map of the frame's height and width
        is returned instead.

        Args:
            frame: HxWx3 BGR uint8 numpy array (OpenCV format)

        Returns:
            DepthResult with depth_map (HxW float32) and focal_length
            (always 1.0 for this estimator).
        """
        try:
            with torch.no_grad():
                try:
                    depth = self.model.infer_image(frame)
                except TypeError:
                    # NOTE(review): presumably a fallback for package
                    # versions whose infer_image needs an explicit device
                    # argument; confirm against the installed package.
                    depth = self.model.infer_image(frame, device=self.device)
        except Exception as exc:
            # logging.exception keeps the traceback, so the zero-filled
            # fallback below does not hide the root cause.
            logging.exception("Depth-Anything V2 inference failed: %s", exc)
            h, w = frame.shape[:2]
            return DepthResult(depth_map=np.zeros((h, w), dtype=np.float32), focal_length=1.0)

        depth_map = np.asarray(depth, dtype=np.float32)
        if depth_map.ndim != 2:
            # Drop singleton axes (e.g. a leading batch/channel dimension).
            depth_map = depth_map.squeeze()

        return DepthResult(depth_map=depth_map, focal_length=1.0)
models/depth_estimators/model_loader.py CHANGED
@@ -4,11 +4,13 @@ from functools import lru_cache
4
  from typing import Callable, Dict
5
 
6
  from .base import DepthEstimator
 
7
  from .depth_pro import DepthProEstimator
8
 
9
 
10
  # Registry of depth estimators
11
  _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
 
12
  "depth_pro": DepthProEstimator,
13
  }
14
 
 
4
  from typing import Callable, Dict
5
 
6
  from .base import DepthEstimator
7
+ from .depth_anything_v2 import DepthAnythingV2Estimator
8
  from .depth_pro import DepthProEstimator
9
 
10
 
11
  # Registry of depth estimators
12
  _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
13
+ "depth_anything_v2": DepthAnythingV2Estimator,
14
  "depth_pro": DepthProEstimator,
15
  }
16
 
requirements.txt CHANGED
@@ -8,6 +8,7 @@ accelerate
8
  pillow
9
  scipy
10
  huggingface-hub
 
11
  ultralytics
12
  timm
13
  ffmpeg-python
 
8
  pillow
9
  scipy
10
  huggingface-hub
11
+ depth-anything-v2 @ git+https://github.com/DepthAnything/Depth-Anything-V2.git
12
  ultralytics
13
  timm
14
  ffmpeg-python