Spaces:
Paused
Paused
Zhen Ye
committed on
Commit
·
012b29b
1
Parent(s):
1c4206e
using depth model from transformers
Browse files- demo.html +13 -1
- models/depth_estimators/depth_anything_v2.py +32 -33
- models/depth_estimators/model_loader.py +1 -0
- requirements.txt +0 -1
demo.html
CHANGED
|
@@ -404,10 +404,21 @@
|
|
| 404 |
</div>
|
| 405 |
</div>
|
| 406 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
<!-- Video Upload -->
|
| 408 |
<div class="section">
|
| 409 |
<div class="input-group">
|
| 410 |
-
<label>
|
| 411 |
<div class="file-input-wrapper">
|
| 412 |
<label class="file-input-label" id="fileLabel" for="videoFile">
|
| 413 |
Click to select video file (MP4)
|
|
@@ -621,6 +632,7 @@
|
|
| 621 |
formData.append('queries', document.getElementById('queries').value);
|
| 622 |
formData.append('detector', document.getElementById('detector').value);
|
| 623 |
formData.append('segmenter', document.getElementById('segmenter').value);
|
|
|
|
| 624 |
|
| 625 |
try {
|
| 626 |
const response = await fetch('/detect/async', {
|
|
|
|
| 404 |
</div>
|
| 405 |
</div>
|
| 406 |
|
| 407 |
+
<!-- Depth Model Selection -->
|
| 408 |
+
<div class="section" id="depthModelSection">
|
| 409 |
+
<div class="input-group">
|
| 410 |
+
<label for="depthModel">3. Select Depth Model</label>
|
| 411 |
+
<select id="depthModel">
|
| 412 |
+
<option value="depth_pro">Depth Pro (Apple)</option>
|
| 413 |
+
<option value="depth_anything">Depth Anything (LiheYoung)</option>
|
| 414 |
+
</select>
|
| 415 |
+
</div>
|
| 416 |
+
</div>
|
| 417 |
+
|
| 418 |
<!-- Video Upload -->
|
| 419 |
<div class="section">
|
| 420 |
<div class="input-group">
|
| 421 |
+
<label>4. Upload Video</label>
|
| 422 |
<div class="file-input-wrapper">
|
| 423 |
<label class="file-input-label" id="fileLabel" for="videoFile">
|
| 424 |
Click to select video file (MP4)
|
|
|
|
| 632 |
formData.append('queries', document.getElementById('queries').value);
|
| 633 |
formData.append('detector', document.getElementById('detector').value);
|
| 634 |
formData.append('segmenter', document.getElementById('segmenter').value);
|
| 635 |
+
formData.append('depth_estimator', document.getElementById('depthModel').value);
|
| 636 |
|
| 637 |
try {
|
| 638 |
const response = await fetch('/detect/async', {
|
models/depth_estimators/depth_anything_v2.py
CHANGED
|
@@ -2,42 +2,25 @@ import logging
|
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
import torch
|
| 5 |
-
from
|
|
|
|
| 6 |
|
| 7 |
from .base import DepthEstimator, DepthResult
|
| 8 |
|
| 9 |
|
| 10 |
class DepthAnythingV2Estimator(DepthEstimator):
|
| 11 |
-
"""Depth-Anything
|
| 12 |
|
| 13 |
name = "depth_anything_v2"
|
| 14 |
|
| 15 |
def __init__(self) -> None:
|
| 16 |
-
|
| 17 |
-
from depth_anything_v2.dpt import DepthAnythingV2
|
| 18 |
-
except ImportError as exc:
|
| 19 |
-
raise ImportError(
|
| 20 |
-
"depth-anything-v2 package not installed. "
|
| 21 |
-
"Install from https://github.com/DepthAnything/Depth-Anything-V2"
|
| 22 |
-
) from exc
|
| 23 |
-
|
| 24 |
-
logging.info("Loading Depth-Anything V2 model from Hugging Face...")
|
| 25 |
|
| 26 |
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
out_channels=[256, 512, 1024, 1024],
|
| 32 |
-
)
|
| 33 |
-
weights_path = hf_hub_download(
|
| 34 |
-
repo_id="depth-anything/Depth-Anything-V2-Large",
|
| 35 |
-
filename="depth_anything_v2_vitl.pth",
|
| 36 |
-
repo_type="model",
|
| 37 |
-
)
|
| 38 |
-
state_dict = torch.load(weights_path, map_location="cpu")
|
| 39 |
-
self.model.load_state_dict(state_dict)
|
| 40 |
-
self.model.to(self.device).eval()
|
| 41 |
|
| 42 |
if torch.cuda.is_available():
|
| 43 |
logging.info("Depth-Anything V2 model loaded on GPU")
|
|
@@ -55,18 +38,34 @@ class DepthAnythingV2Estimator(DepthEstimator):
|
|
| 55 |
DepthResult with depth_map (HxW float32) and focal_length
|
| 56 |
"""
|
| 57 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
with torch.no_grad():
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
except Exception as exc:
|
| 64 |
-
logging.error("Depth-Anything
|
| 65 |
h, w = frame.shape[:2]
|
| 66 |
return DepthResult(depth_map=np.zeros((h, w), dtype=np.float32), focal_length=1.0)
|
| 67 |
|
| 68 |
-
depth_map = np.asarray(depth, dtype=np.float32)
|
| 69 |
-
if depth_map.ndim != 2:
|
| 70 |
-
depth_map = depth_map.squeeze()
|
| 71 |
-
|
| 72 |
return DepthResult(depth_map=depth_map, focal_length=1.0)
|
|
|
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
import torch
|
| 5 |
+
from PIL import Image
|
| 6 |
+
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
|
| 7 |
|
| 8 |
from .base import DepthEstimator, DepthResult
|
| 9 |
|
| 10 |
|
| 11 |
class DepthAnythingV2Estimator(DepthEstimator):
|
| 12 |
+
"""Depth-Anything depth estimator (Transformers-compatible)."""
|
| 13 |
|
| 14 |
name = "depth_anything_v2"
|
| 15 |
|
| 16 |
def __init__(self) -> None:
|
| 17 |
+
logging.info("Loading Depth-Anything model from Hugging Face (transformers)...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 20 |
|
| 21 |
+
model_id = "LiheYoung/depth-anything-large-hf"
|
| 22 |
+
self.image_processor = AutoImageProcessor.from_pretrained(model_id)
|
| 23 |
+
self.model = AutoModelForDepthEstimation.from_pretrained(model_id).to(self.device).eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
if torch.cuda.is_available():
|
| 26 |
logging.info("Depth-Anything V2 model loaded on GPU")
|
|
|
|
| 38 |
DepthResult with depth_map (HxW float32) and focal_length
|
| 39 |
"""
|
| 40 |
try:
|
| 41 |
+
rgb_frame = frame[:, :, ::-1] # BGR -> RGB
|
| 42 |
+
pil_image = Image.fromarray(rgb_frame)
|
| 43 |
+
height, width = pil_image.height, pil_image.width
|
| 44 |
+
|
| 45 |
+
inputs = self.image_processor(images=pil_image, return_tensors="pt").to(self.device)
|
| 46 |
with torch.no_grad():
|
| 47 |
+
outputs = self.model(**inputs)
|
| 48 |
+
|
| 49 |
+
raw_depth = outputs.predicted_depth
|
| 50 |
+
if raw_depth.dim() == 2:
|
| 51 |
+
raw_depth = raw_depth.unsqueeze(0).unsqueeze(0)
|
| 52 |
+
elif raw_depth.dim() == 3:
|
| 53 |
+
raw_depth = raw_depth.unsqueeze(1) if raw_depth.shape[0] == 1 else raw_depth.unsqueeze(0)
|
| 54 |
+
|
| 55 |
+
if raw_depth.shape[-2:] != (height, width):
|
| 56 |
+
import torch.nn.functional as F
|
| 57 |
+
|
| 58 |
+
raw_depth = F.interpolate(
|
| 59 |
+
raw_depth,
|
| 60 |
+
size=(height, width),
|
| 61 |
+
mode="bilinear",
|
| 62 |
+
align_corners=False,
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
depth_map = raw_depth.squeeze().cpu().numpy().astype(np.float32, copy=False)
|
| 66 |
except Exception as exc:
|
| 67 |
+
logging.error("Depth-Anything inference failed: %s", exc)
|
| 68 |
h, w = frame.shape[:2]
|
| 69 |
return DepthResult(depth_map=np.zeros((h, w), dtype=np.float32), focal_length=1.0)
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
return DepthResult(depth_map=depth_map, focal_length=1.0)
|
models/depth_estimators/model_loader.py
CHANGED
|
@@ -10,6 +10,7 @@ from .depth_pro import DepthProEstimator
|
|
| 10 |
|
| 11 |
# Registry of depth estimators
|
| 12 |
_REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
|
|
|
|
| 13 |
"depth_anything_v2": DepthAnythingV2Estimator,
|
| 14 |
"depth_pro": DepthProEstimator,
|
| 15 |
}
|
|
|
|
| 10 |
|
| 11 |
# Registry of depth estimators
|
| 12 |
_REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
|
| 13 |
+
"depth_anything": DepthAnythingV2Estimator,
|
| 14 |
"depth_anything_v2": DepthAnythingV2Estimator,
|
| 15 |
"depth_pro": DepthProEstimator,
|
| 16 |
}
|
requirements.txt
CHANGED
|
@@ -8,7 +8,6 @@ accelerate
|
|
| 8 |
pillow
|
| 9 |
scipy
|
| 10 |
huggingface-hub
|
| 11 |
-
depth-anything-v2 @ git+https://github.com/DepthAnything/Depth-Anything-V2.git
|
| 12 |
ultralytics
|
| 13 |
timm
|
| 14 |
ffmpeg-python
|
|
|
|
| 8 |
pillow
|
| 9 |
scipy
|
| 10 |
huggingface-hub
|
|
|
|
| 11 |
ultralytics
|
| 12 |
timm
|
| 13 |
ffmpeg-python
|