Spaces:

BiasLab2025
/

perception

Running

App Files Files Community

Zhen Ye committed on Jan 11

Commit

1c4206e

1 Parent(s): 78f99f1

added Depth Anything V2

Browse files

Files changed (6) hide show

app.py +13 -1
demo.html +2 -81
jobs/models.py +1 -1
models/depth_estimators/depth_anything_v2.py +72 -0
models/depth_estimators/model_loader.py +2 -0
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -16,6 +16,7 @@ from fastapi.staticfiles import StaticFiles
 import uvicorn
 from inference import process_first_frame, run_inference, run_segmentation
 from jobs.background import process_video_async
 from jobs.models import JobInfo, JobStatus
 from jobs.storage import (
@@ -259,6 +260,7 @@ async def detect_async_endpoint(
     queries: str = Form(""),
     detector: str = Form("hf_yolov8"),
     segmenter: str = Form("sam3"),
 ):
     if mode not in VALID_MODES:
         raise HTTPException(
@@ -289,6 +291,16 @@ async def detect_async_endpoint(
     if not query_list:
         query_list = _default_queries_for_mode(mode)
     detector_name = detector
     if mode == "drone_detection":
         detector_name = "drone_yolo"
@@ -318,7 +330,7 @@ async def detect_async_endpoint(
         output_video_path=str(output_path),
         first_frame_path=str(first_frame_path),
         first_frame_detections=detections,
-        depth_estimator_name="depth_pro",
         depth_output_path=str(depth_output_path),
         first_frame_depth_path=str(first_frame_depth_path),
     )

 import uvicorn
 from inference import process_first_frame, run_inference, run_segmentation
+from models.depth_estimators.model_loader import list_depth_estimators
 from jobs.background import process_video_async
 from jobs.models import JobInfo, JobStatus
 from jobs.storage import (
     queries: str = Form(""),
     detector: str = Form("hf_yolov8"),
     segmenter: str = Form("sam3"),
+    depth_estimator: str = Form("depth_pro"),
 ):
     if mode not in VALID_MODES:
         raise HTTPException(
     if not query_list:
         query_list = _default_queries_for_mode(mode)
+    available_depth_estimators = set(list_depth_estimators())
+    if depth_estimator not in available_depth_estimators:
+        raise HTTPException(
+            status_code=400,
+            detail=(
+                f"Invalid depth estimator '{depth_estimator}'. "
+                f"Must be one of: {', '.join(sorted(available_depth_estimators))}"
+            ),
+        )
     detector_name = detector
     if mode == "drone_detection":
         detector_name = "drone_yolo"
         output_video_path=str(output_path),
         first_frame_path=str(first_frame_path),
         first_frame_detections=detections,
+        depth_estimator_name=depth_estimator,
         depth_output_path=str(depth_output_path),
         first_frame_depth_path=str(first_frame_depth_path),
     )

demo.html CHANGED Viewed

@@ -285,12 +285,6 @@
             text-align: center;
         }
-        .depth-status {
-            margin-top: 8px;
-            font-size: 0.85rem;
-            color: #6b7280;
-        }
         .spinner {
             border: 4px solid #e5e7eb;
             border-top: 4px solid #1f2933;
@@ -454,16 +448,6 @@
                             <img id="firstFrameImage" class="frame-preview" alt="First frame preview">
                         </div>
                     </div>
-                    <div class="video-card">
-                        <div class="video-card-header">First Frame (Depth)</div>
-                        <div class="video-card-body">
-                            <div id="depthFramePlaceholder" class="frame-placeholder">
-                                Depth preview will appear after processing.
-                            </div>
-                            <img id="depthFrameImage" class="frame-preview hidden" alt="First frame depth preview">
-                            <div id="depthFrameStatus" class="depth-status"></div>
-                        </div>
-                    </div>
                     <div class="video-card">
                         <div class="video-card-header">Original Video</div>
                         <div class="video-card-body">
@@ -479,16 +463,6 @@
                             </a>
                         </div>
                     </div>
-                    <div class="video-card">
-                        <div class="video-card-header">Depth Video</div>
-                        <div class="video-card-body">
-                            <video id="depthVideo" controls autoplay loop class="hidden"></video>
-                            <a id="depthDownloadBtn" class="download-btn hidden" download="depth.mp4">
-                                Download Depth Video
-                            </a>
-                            <div id="depthVideoStatus" class="depth-status"></div>
-                        </div>
-                    </div>
                 </div>
             </div>
         </div>
@@ -502,7 +476,6 @@
         let detectionVideoUrl = null;
         let depthVideoUrl = null;
         let detectionFirstFrameUrl = null;
-        let depthFirstFrameUrl = null;
         // Elements
         const modeCards = document.querySelectorAll('.mode-card');
@@ -521,12 +494,6 @@
         const processedVideo = document.getElementById('processedVideo');
         const firstFrameImage = document.getElementById('firstFrameImage');
         const downloadBtn = document.getElementById('downloadBtn');
-        const depthFrameImage = document.getElementById('depthFrameImage');
-        const depthFramePlaceholder = document.getElementById('depthFramePlaceholder');
-        const depthFrameStatus = document.getElementById('depthFrameStatus');
-        const depthVideo = document.getElementById('depthVideo');
-        const depthDownloadBtn = document.getElementById('depthDownloadBtn');
-        const depthVideoStatus = document.getElementById('depthVideoStatus');
         const viewToggleContainer = document.getElementById('viewToggleContainer');
         const detectionViewBtn = document.getElementById('detectionViewBtn');
         const depthViewBtn = document.getElementById('depthViewBtn');
@@ -543,8 +510,6 @@
                 if (detectionFirstFrameUrl) {
                     firstFrameImage.src = detectionFirstFrameUrl;
-                    depthFrameImage.classList.add('hidden');
-                    depthFramePlaceholder.classList.remove('hidden');
                 }
                 if (detectionVideoUrl) {
                     processedVideo.src = detectionVideoUrl;
@@ -556,11 +521,6 @@
                 depthViewBtn.classList.add('active');
                 detectionViewBtn.classList.remove('active');
-                if (depthFirstFrameUrl) {
-                    firstFrameImage.src = depthFirstFrameUrl;
-                    depthFrameImage.classList.add('hidden');
-                    depthFramePlaceholder.classList.add('hidden');
-                }
                 if (depthVideoUrl) {
                     processedVideo.src = depthVideoUrl;
                     downloadBtn.href = depthVideoUrl;
@@ -643,25 +603,14 @@
                 statusPoller = null;
             }
             firstFrameImage.removeAttribute('src');
-            depthFrameImage.removeAttribute('src');
-            depthFrameImage.classList.add('hidden');
-            depthFramePlaceholder.classList.remove('hidden');
-            depthFrameStatus.textContent = '';
             processedVideo.removeAttribute('src');
             processedVideo.load();
             downloadBtn.removeAttribute('href');
-            depthVideo.removeAttribute('src');
-            depthVideo.load();
-            depthVideo.classList.add('hidden');
-            depthDownloadBtn.removeAttribute('href');
-            depthDownloadBtn.classList.add('hidden');
-            depthVideoStatus.textContent = '';
             viewToggleContainer.classList.add('hidden');
             currentView = 'detection';
             detectionVideoUrl = null;
             depthVideoUrl = null;
             detectionFirstFrameUrl = null;
-            depthFirstFrameUrl = null;
             statusLine.classList.add('hidden');
             statusLine.textContent = '';
@@ -749,51 +698,23 @@
         });
         async function loadDepthAssets(jobData) {
-            if (!jobData.first_frame_depth_url || !jobData.depth_video_url) {
-                depthFrameStatus.textContent = 'Depth endpoints not available for this job.';
-                depthVideoStatus.textContent = 'Depth endpoints not available for this job.';
                 return;
             }
-            try {
-                const frameResponse = await fetch(jobData.first_frame_depth_url);
-                if (frameResponse.ok) {
-                    const frameBlob = await frameResponse.blob();
-                    depthFirstFrameUrl = URL.createObjectURL(frameBlob);
-                    depthFrameImage.src = depthFirstFrameUrl;
-                    depthFrameImage.classList.remove('hidden');
-                    depthFramePlaceholder.classList.add('hidden');
-                } else {
-                    const error = await frameResponse.json();
-                    depthFrameStatus.textContent = error.detail || 'Depth preview unavailable.';
-                }
-            } catch (error) {
-                depthFrameStatus.textContent = 'Depth preview failed to load.';
-            }
             try {
                 const depthResponse = await fetch(jobData.depth_video_url);
                 if (depthResponse.ok) {
                     const depthBlob = await depthResponse.blob();
                     depthVideoUrl = URL.createObjectURL(depthBlob);
-                    // Keep depth video card hidden - using toggle instead
-                    depthVideo.src = depthVideoUrl;
-                    depthVideo.classList.add('hidden');
-                    depthDownloadBtn.classList.add('hidden');
                     // Show toggle buttons now that we have both videos
                     viewToggleContainer.classList.remove('hidden');
                     // Start with detection view
                     switchToView('detection');
-                } else {
-                    const error = await depthResponse.json();
-                    depthVideoStatus.textContent = error.detail || 'Depth video unavailable.';
                 }
-            } catch (error) {
-                depthVideoStatus.textContent = 'Depth video failed to load.';
-            }
         }
     </script>

             text-align: center;
         }
         .spinner {
             border: 4px solid #e5e7eb;
             border-top: 4px solid #1f2933;
                             <img id="firstFrameImage" class="frame-preview" alt="First frame preview">
                         </div>
                     </div>
                     <div class="video-card">
                         <div class="video-card-header">Original Video</div>
                         <div class="video-card-body">
                             </a>
                         </div>
                     </div>
                 </div>
             </div>
         </div>
         let detectionVideoUrl = null;
         let depthVideoUrl = null;
         let detectionFirstFrameUrl = null;
         // Elements
         const modeCards = document.querySelectorAll('.mode-card');
         const processedVideo = document.getElementById('processedVideo');
         const firstFrameImage = document.getElementById('firstFrameImage');
         const downloadBtn = document.getElementById('downloadBtn');
         const viewToggleContainer = document.getElementById('viewToggleContainer');
         const detectionViewBtn = document.getElementById('detectionViewBtn');
         const depthViewBtn = document.getElementById('depthViewBtn');
                 if (detectionFirstFrameUrl) {
                     firstFrameImage.src = detectionFirstFrameUrl;
                 }
                 if (detectionVideoUrl) {
                     processedVideo.src = detectionVideoUrl;
                 depthViewBtn.classList.add('active');
                 detectionViewBtn.classList.remove('active');
                 if (depthVideoUrl) {
                     processedVideo.src = depthVideoUrl;
                     downloadBtn.href = depthVideoUrl;
                 statusPoller = null;
             }
             firstFrameImage.removeAttribute('src');
             processedVideo.removeAttribute('src');
             processedVideo.load();
             downloadBtn.removeAttribute('href');
             viewToggleContainer.classList.add('hidden');
             currentView = 'detection';
             detectionVideoUrl = null;
             depthVideoUrl = null;
             detectionFirstFrameUrl = null;
             statusLine.classList.add('hidden');
             statusLine.textContent = '';
         });
         async function loadDepthAssets(jobData) {
+            if (!jobData.depth_video_url) {
                 return;
             }
             try {
                 const depthResponse = await fetch(jobData.depth_video_url);
                 if (depthResponse.ok) {
                     const depthBlob = await depthResponse.blob();
                     depthVideoUrl = URL.createObjectURL(depthBlob);
                     // Show toggle buttons now that we have both videos
                     viewToggleContainer.classList.remove('hidden');
                     // Start with detection view
                     switchToView('detection');
                 }
+            } catch (error) {}
         }
     </script>

jobs/models.py CHANGED Viewed

@@ -27,7 +27,7 @@ class JobInfo:
     error: Optional[str] = None
     first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
     # Depth estimation fields
-    depth_estimator_name: str = "depth_pro"  # Always depth_pro for now
     depth_output_path: Optional[str] = None
     first_frame_depth_path: Optional[str] = None
     partial_success: bool = False  # True if one component failed but job completed

     error: Optional[str] = None
     first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
     # Depth estimation fields
+    depth_estimator_name: str = "depth_pro"
     depth_output_path: Optional[str] = None
     first_frame_depth_path: Optional[str] = None
     partial_success: bool = False  # True if one component failed but job completed

models/depth_estimators/depth_anything_v2.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import logging
+import numpy as np
+import torch
+from huggingface_hub import hf_hub_download
+from .base import DepthEstimator, DepthResult
+class DepthAnythingV2Estimator(DepthEstimator):
+    """Depth-Anything V2 depth estimator."""
+    name = "depth_anything_v2"
+    def __init__(self) -> None:
+        try:
+            from depth_anything_v2.dpt import DepthAnythingV2
+        except ImportError as exc:
+            raise ImportError(
+                "depth-anything-v2 package not installed. "
+                "Install from https://github.com/DepthAnything/Depth-Anything-V2"
+            ) from exc
+        logging.info("Loading Depth-Anything V2 model from Hugging Face...")
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = DepthAnythingV2(
+            encoder="vitl",
+            features=256,
+            out_channels=[256, 512, 1024, 1024],
+        )
+        weights_path = hf_hub_download(
+            repo_id="depth-anything/Depth-Anything-V2-Large",
+            filename="depth_anything_v2_vitl.pth",
+            repo_type="model",
+        )
+        state_dict = torch.load(weights_path, map_location="cpu")
+        self.model.load_state_dict(state_dict)
+        self.model.to(self.device).eval()
+        if torch.cuda.is_available():
+            logging.info("Depth-Anything V2 model loaded on GPU")
+        else:
+            logging.warning("Depth-Anything V2 model loaded on CPU (no CUDA available)")
+    def predict(self, frame: np.ndarray) -> DepthResult:
+        """
+        Run depth estimation on a single frame.
+        Args:
+            frame: HxWx3 BGR uint8 numpy array (OpenCV format)
+        Returns:
+            DepthResult with depth_map (HxW float32) and focal_length
+        """
+        try:
+            with torch.no_grad():
+                try:
+                    depth = self.model.infer_image(frame)
+                except TypeError:
+                    depth = self.model.infer_image(frame, device=self.device)
+        except Exception as exc:
+            logging.error("Depth-Anything V2 inference failed: %s", exc)
+            h, w = frame.shape[:2]
+            return DepthResult(depth_map=np.zeros((h, w), dtype=np.float32), focal_length=1.0)
+        depth_map = np.asarray(depth, dtype=np.float32)
+        if depth_map.ndim != 2:
+            depth_map = depth_map.squeeze()
+        return DepthResult(depth_map=depth_map, focal_length=1.0)

models/depth_estimators/model_loader.py CHANGED Viewed

@@ -4,11 +4,13 @@ from functools import lru_cache
 from typing import Callable, Dict
 from .base import DepthEstimator
 from .depth_pro import DepthProEstimator
 # Registry of depth estimators
 _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
     "depth_pro": DepthProEstimator,
 }

 from typing import Callable, Dict
 from .base import DepthEstimator
+from .depth_anything_v2 import DepthAnythingV2Estimator
 from .depth_pro import DepthProEstimator
 # Registry of depth estimators
 _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
+    "depth_anything_v2": DepthAnythingV2Estimator,
     "depth_pro": DepthProEstimator,
 }

requirements.txt CHANGED Viewed

@@ -8,6 +8,7 @@ accelerate
 pillow
 scipy
 huggingface-hub
 ultralytics
 timm
 ffmpeg-python

 pillow
 scipy
 huggingface-hub
+depth-anything-v2 @ git+https://github.com/DepthAnything/Depth-Anything-V2.git
 ultralytics
 timm
 ffmpeg-python