Zhen Ye committed on
Commit
012b29b
·
1 Parent(s): 1c4206e

using depth model from transformers

Browse files
demo.html CHANGED
@@ -404,10 +404,21 @@
404
  </div>
405
  </div>
406
 
 
 
 
 
 
 
 
 
 
 
 
407
  <!-- Video Upload -->
408
  <div class="section">
409
  <div class="input-group">
410
- <label>3. Upload Video</label>
411
  <div class="file-input-wrapper">
412
  <label class="file-input-label" id="fileLabel" for="videoFile">
413
  Click to select video file (MP4)
@@ -621,6 +632,7 @@
621
  formData.append('queries', document.getElementById('queries').value);
622
  formData.append('detector', document.getElementById('detector').value);
623
  formData.append('segmenter', document.getElementById('segmenter').value);
 
624
 
625
  try {
626
  const response = await fetch('/detect/async', {
 
404
  </div>
405
  </div>
406
 
407
+ <!-- Depth Model Selection -->
408
+ <div class="section" id="depthModelSection">
409
+ <div class="input-group">
410
+ <label for="depthModel">3. Select Depth Model</label>
411
+ <select id="depthModel">
412
+ <option value="depth_pro">Depth Pro (Apple)</option>
413
+ <option value="depth_anything">Depth Anything (LiheYoung)</option>
414
+ </select>
415
+ </div>
416
+ </div>
417
+
418
  <!-- Video Upload -->
419
  <div class="section">
420
  <div class="input-group">
421
+ <label>4. Upload Video</label>
422
  <div class="file-input-wrapper">
423
  <label class="file-input-label" id="fileLabel" for="videoFile">
424
  Click to select video file (MP4)
 
632
  formData.append('queries', document.getElementById('queries').value);
633
  formData.append('detector', document.getElementById('detector').value);
634
  formData.append('segmenter', document.getElementById('segmenter').value);
635
+ formData.append('depth_estimator', document.getElementById('depthModel').value);
636
 
637
  try {
638
  const response = await fetch('/detect/async', {
models/depth_estimators/depth_anything_v2.py CHANGED
@@ -2,42 +2,25 @@ import logging
2
 
3
  import numpy as np
4
  import torch
5
- from huggingface_hub import hf_hub_download
 
6
 
7
  from .base import DepthEstimator, DepthResult
8
 
9
 
10
  class DepthAnythingV2Estimator(DepthEstimator):
11
- """Depth-Anything V2 depth estimator."""
12
 
13
  name = "depth_anything_v2"
14
 
15
  def __init__(self) -> None:
16
- try:
17
- from depth_anything_v2.dpt import DepthAnythingV2
18
- except ImportError as exc:
19
- raise ImportError(
20
- "depth-anything-v2 package not installed. "
21
- "Install from https://github.com/DepthAnything/Depth-Anything-V2"
22
- ) from exc
23
-
24
- logging.info("Loading Depth-Anything V2 model from Hugging Face...")
25
 
26
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
 
28
- self.model = DepthAnythingV2(
29
- encoder="vitl",
30
- features=256,
31
- out_channels=[256, 512, 1024, 1024],
32
- )
33
- weights_path = hf_hub_download(
34
- repo_id="depth-anything/Depth-Anything-V2-Large",
35
- filename="depth_anything_v2_vitl.pth",
36
- repo_type="model",
37
- )
38
- state_dict = torch.load(weights_path, map_location="cpu")
39
- self.model.load_state_dict(state_dict)
40
- self.model.to(self.device).eval()
41
 
42
  if torch.cuda.is_available():
43
  logging.info("Depth-Anything V2 model loaded on GPU")
@@ -55,18 +38,34 @@ class DepthAnythingV2Estimator(DepthEstimator):
55
  DepthResult with depth_map (HxW float32) and focal_length
56
  """
57
  try:
 
 
 
 
 
58
  with torch.no_grad():
59
- try:
60
- depth = self.model.infer_image(frame)
61
- except TypeError:
62
- depth = self.model.infer_image(frame, device=self.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  except Exception as exc:
64
- logging.error("Depth-Anything V2 inference failed: %s", exc)
65
  h, w = frame.shape[:2]
66
  return DepthResult(depth_map=np.zeros((h, w), dtype=np.float32), focal_length=1.0)
67
 
68
- depth_map = np.asarray(depth, dtype=np.float32)
69
- if depth_map.ndim != 2:
70
- depth_map = depth_map.squeeze()
71
-
72
  return DepthResult(depth_map=depth_map, focal_length=1.0)
 
2
 
3
  import numpy as np
4
  import torch
5
+ from PIL import Image
6
+ from transformers import AutoImageProcessor, AutoModelForDepthEstimation
7
 
8
  from .base import DepthEstimator, DepthResult
9
 
10
 
11
  class DepthAnythingV2Estimator(DepthEstimator):
12
+ """Depth-Anything depth estimator (Transformers-compatible)."""
13
 
14
  name = "depth_anything_v2"
15
 
16
  def __init__(self) -> None:
17
+ logging.info("Loading Depth-Anything model from Hugging Face (transformers)...")
 
 
 
 
 
 
 
 
18
 
19
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
 
21
+ model_id = "LiheYoung/depth-anything-large-hf"
22
+ self.image_processor = AutoImageProcessor.from_pretrained(model_id)
23
+ self.model = AutoModelForDepthEstimation.from_pretrained(model_id).to(self.device).eval()
 
 
 
 
 
 
 
 
 
 
24
 
25
  if torch.cuda.is_available():
26
  logging.info("Depth-Anything V2 model loaded on GPU")
 
38
  DepthResult with depth_map (HxW float32) and focal_length
39
  """
40
  try:
41
+ rgb_frame = frame[:, :, ::-1] # BGR -> RGB
42
+ pil_image = Image.fromarray(rgb_frame)
43
+ height, width = pil_image.height, pil_image.width
44
+
45
+ inputs = self.image_processor(images=pil_image, return_tensors="pt").to(self.device)
46
  with torch.no_grad():
47
+ outputs = self.model(**inputs)
48
+
49
+ raw_depth = outputs.predicted_depth
50
+ if raw_depth.dim() == 2:
51
+ raw_depth = raw_depth.unsqueeze(0).unsqueeze(0)
52
+ elif raw_depth.dim() == 3:
53
+ raw_depth = raw_depth.unsqueeze(1) if raw_depth.shape[0] == 1 else raw_depth.unsqueeze(0)
54
+
55
+ if raw_depth.shape[-2:] != (height, width):
56
+ import torch.nn.functional as F
57
+
58
+ raw_depth = F.interpolate(
59
+ raw_depth,
60
+ size=(height, width),
61
+ mode="bilinear",
62
+ align_corners=False,
63
+ )
64
+
65
+ depth_map = raw_depth.squeeze().cpu().numpy().astype(np.float32, copy=False)
66
  except Exception as exc:
67
+ logging.error("Depth-Anything inference failed: %s", exc)
68
  h, w = frame.shape[:2]
69
  return DepthResult(depth_map=np.zeros((h, w), dtype=np.float32), focal_length=1.0)
70
 
 
 
 
 
71
  return DepthResult(depth_map=depth_map, focal_length=1.0)
models/depth_estimators/model_loader.py CHANGED
@@ -10,6 +10,7 @@ from .depth_pro import DepthProEstimator
10
 
11
  # Registry of depth estimators
12
  _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
 
13
  "depth_anything_v2": DepthAnythingV2Estimator,
14
  "depth_pro": DepthProEstimator,
15
  }
 
10
 
11
  # Registry of depth estimators
12
  _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
13
+ "depth_anything": DepthAnythingV2Estimator,
14
  "depth_anything_v2": DepthAnythingV2Estimator,
15
  "depth_pro": DepthProEstimator,
16
  }
requirements.txt CHANGED
@@ -8,7 +8,6 @@ accelerate
8
  pillow
9
  scipy
10
  huggingface-hub
11
- depth-anything-v2 @ git+https://github.com/DepthAnything/Depth-Anything-V2.git
12
  ultralytics
13
  timm
14
  ffmpeg-python
 
8
  pillow
9
  scipy
10
  huggingface-hub
 
11
  ultralytics
12
  timm
13
  ffmpeg-python