Zhen Ye committed on
Commit
012b29b
·
1 Parent(s): 1c4206e

using depth model from transformers

Browse files
demo.html CHANGED
@@ -404,10 +404,21 @@
404
  </div>
405
  </div>
406
 
 
 
 
 
 
 
 
 
 
 
 
407
  <!-- Video Upload -->
408
  <div class="section">
409
  <div class="input-group">
410
- <label>3. Upload Video</label>
411
  <div class="file-input-wrapper">
412
  <label class="file-input-label" id="fileLabel" for="videoFile">
413
  Click to select video file (MP4)
@@ -621,6 +632,7 @@
621
  formData.append('queries', document.getElementById('queries').value);
622
  formData.append('detector', document.getElementById('detector').value);
623
  formData.append('segmenter', document.getElementById('segmenter').value);
 
624
 
625
  try {
626
  const response = await fetch('/detect/async', {
 
404
  </div>
405
  </div>
406
 
407
+ <!-- Depth Model Selection -->
408
+ <div class="section" id="depthModelSection">
409
+ <div class="input-group">
410
+ <label for="depthModel">3. Select Depth Model</label>
411
+ <select id="depthModel">
412
+ <option value="depth_pro">Depth Pro (Apple)</option>
413
+ <option value="depth_anything">Depth Anything (LiheYoung)</option>
414
+ </select>
415
+ </div>
416
+ </div>
417
+
418
  <!-- Video Upload -->
419
  <div class="section">
420
  <div class="input-group">
421
+ <label>4. Upload Video</label>
422
  <div class="file-input-wrapper">
423
  <label class="file-input-label" id="fileLabel" for="videoFile">
424
  Click to select video file (MP4)
 
632
  formData.append('queries', document.getElementById('queries').value);
633
  formData.append('detector', document.getElementById('detector').value);
634
  formData.append('segmenter', document.getElementById('segmenter').value);
635
+ formData.append('depth_estimator', document.getElementById('depthModel').value);
636
 
637
  try {
638
  const response = await fetch('/detect/async', {
models/depth_estimators/depth_anything_v2.py CHANGED
@@ -2,42 +2,25 @@ import logging
2
 
3
  import numpy as np
4
  import torch
5
- from huggingface_hub import hf_hub_download
 
6
 
7
  from .base import DepthEstimator, DepthResult
8
 
9
 
10
  class DepthAnythingV2Estimator(DepthEstimator):
11
- """Depth-Anything V2 depth estimator."""
12
 
13
  name = "depth_anything_v2"
14
 
15
  def __init__(self) -> None:
16
- try:
17
- from depth_anything_v2.dpt import DepthAnythingV2
18
- except ImportError as exc:
19
- raise ImportError(
20
- "depth-anything-v2 package not installed. "
21
- "Install from https://github.com/DepthAnything/Depth-Anything-V2"
22
- ) from exc
23
-
24
- logging.info("Loading Depth-Anything V2 model from Hugging Face...")
25
 
26
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
 
28
- self.model = DepthAnythingV2(
29
- encoder="vitl",
30
- features=256,
31
- out_channels=[256, 512, 1024, 1024],
32
- )
33
- weights_path = hf_hub_download(
34
- repo_id="depth-anything/Depth-Anything-V2-Large",
35
- filename="depth_anything_v2_vitl.pth",
36
- repo_type="model",
37
- )
38
- state_dict = torch.load(weights_path, map_location="cpu")
39
- self.model.load_state_dict(state_dict)
40
- self.model.to(self.device).eval()
41
 
42
  if torch.cuda.is_available():
43
  logging.info("Depth-Anything V2 model loaded on GPU")
@@ -55,18 +38,34 @@ class DepthAnythingV2Estimator(DepthEstimator):
55
  DepthResult with depth_map (HxW float32) and focal_length
56
  """
57
  try:
 
 
 
 
 
58
  with torch.no_grad():
59
- try:
60
- depth = self.model.infer_image(frame)
61
- except TypeError:
62
- depth = self.model.infer_image(frame, device=self.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  except Exception as exc:
64
- logging.error("Depth-Anything V2 inference failed: %s", exc)
65
  h, w = frame.shape[:2]
66
  return DepthResult(depth_map=np.zeros((h, w), dtype=np.float32), focal_length=1.0)
67
 
68
- depth_map = np.asarray(depth, dtype=np.float32)
69
- if depth_map.ndim != 2:
70
- depth_map = depth_map.squeeze()
71
-
72
  return DepthResult(depth_map=depth_map, focal_length=1.0)
 
2
 
3
  import numpy as np
4
  import torch
5
+ from PIL import Image
6
+ from transformers import AutoImageProcessor, AutoModelForDepthEstimation
7
 
8
  from .base import DepthEstimator, DepthResult
9
 
10
 
11
  class DepthAnythingV2Estimator(DepthEstimator):
12
+ """Depth-Anything depth estimator (Transformers-compatible)."""
13
 
14
  name = "depth_anything_v2"
15
 
16
  def __init__(self) -> None:
17
+ logging.info("Loading Depth-Anything model from Hugging Face (transformers)...")
 
 
 
 
 
 
 
 
18
 
19
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
 
21
+ model_id = "LiheYoung/depth-anything-large-hf"
22
+ self.image_processor = AutoImageProcessor.from_pretrained(model_id)
23
+ self.model = AutoModelForDepthEstimation.from_pretrained(model_id).to(self.device).eval()
 
 
 
 
 
 
 
 
 
 
24
 
25
  if torch.cuda.is_available():
26
  logging.info("Depth-Anything V2 model loaded on GPU")
 
38
  DepthResult with depth_map (HxW float32) and focal_length
39
  """
40
  try:
41
+ rgb_frame = frame[:, :, ::-1] # BGR -> RGB
42
+ pil_image = Image.fromarray(rgb_frame)
43
+ height, width = pil_image.height, pil_image.width
44
+
45
+ inputs = self.image_processor(images=pil_image, return_tensors="pt").to(self.device)
46
  with torch.no_grad():
47
+ outputs = self.model(**inputs)
48
+
49
+ raw_depth = outputs.predicted_depth
50
+ if raw_depth.dim() == 2:
51
+ raw_depth = raw_depth.unsqueeze(0).unsqueeze(0)
52
+ elif raw_depth.dim() == 3:
53
+ raw_depth = raw_depth.unsqueeze(1) if raw_depth.shape[0] == 1 else raw_depth.unsqueeze(0)
54
+
55
+ if raw_depth.shape[-2:] != (height, width):
56
+ import torch.nn.functional as F
57
+
58
+ raw_depth = F.interpolate(
59
+ raw_depth,
60
+ size=(height, width),
61
+ mode="bilinear",
62
+ align_corners=False,
63
+ )
64
+
65
+ depth_map = raw_depth.squeeze().cpu().numpy().astype(np.float32, copy=False)
66
  except Exception as exc:
67
+ logging.error("Depth-Anything inference failed: %s", exc)
68
  h, w = frame.shape[:2]
69
  return DepthResult(depth_map=np.zeros((h, w), dtype=np.float32), focal_length=1.0)
70
 
 
 
 
 
71
  return DepthResult(depth_map=depth_map, focal_length=1.0)
models/depth_estimators/model_loader.py CHANGED
@@ -10,6 +10,7 @@ from .depth_pro import DepthProEstimator
10
 
11
  # Registry of depth estimators
12
  _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
 
13
  "depth_anything_v2": DepthAnythingV2Estimator,
14
  "depth_pro": DepthProEstimator,
15
  }
 
10
 
11
  # Registry of depth estimators
12
  _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
13
+ "depth_anything": DepthAnythingV2Estimator,
14
  "depth_anything_v2": DepthAnythingV2Estimator,
15
  "depth_pro": DepthProEstimator,
16
  }
requirements.txt CHANGED
@@ -8,7 +8,6 @@ accelerate
8
  pillow
9
  scipy
10
  huggingface-hub
11
- depth-anything-v2 @ git+https://github.com/DepthAnything/Depth-Anything-V2.git
12
  ultralytics
13
  timm
14
  ffmpeg-python
 
8
  pillow
9
  scipy
10
  huggingface-hub
 
11
  ultralytics
12
  timm
13
  ffmpeg-python