Zhen Ye committed on
Commit
b8fe2b6
·
1 Parent(s): 9803004

added apple depth pro

Browse files
app.py CHANGED
@@ -19,6 +19,8 @@ from inference import process_first_frame, run_inference, run_segmentation
19
  from jobs.background import process_video_async
20
  from jobs.models import JobInfo, JobStatus
21
  from jobs.storage import (
 
 
22
  get_first_frame_path,
23
  get_input_video_path,
24
  get_job_directory,
@@ -272,6 +274,8 @@ async def detect_async_endpoint(
272
  input_path = get_input_video_path(job_id)
273
  output_path = get_output_video_path(job_id)
274
  first_frame_path = get_first_frame_path(job_id)
 
 
275
 
276
  try:
277
  _save_upload_to_path(video, input_path)
@@ -314,6 +318,9 @@ async def detect_async_endpoint(
314
  output_video_path=str(output_path),
315
  first_frame_path=str(first_frame_path),
316
  first_frame_detections=detections,
 
 
 
317
  )
318
  get_job_storage().create(job)
319
  asyncio.create_task(process_video_async(job_id))
@@ -321,8 +328,10 @@ async def detect_async_endpoint(
321
  return {
322
  "job_id": job_id,
323
  "first_frame_url": f"/detect/first-frame/{job_id}",
 
324
  "status_url": f"/detect/status/{job_id}",
325
  "video_url": f"/detect/video/{job_id}",
 
326
  "status": job.status.value,
327
  "first_frame_detections": detections,
328
  }
@@ -396,5 +405,54 @@ async def detect_video(job_id: str):
396
  )
397
 
398
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  if __name__ == "__main__":
400
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
19
  from jobs.background import process_video_async
20
  from jobs.models import JobInfo, JobStatus
21
  from jobs.storage import (
22
+ get_depth_output_path,
23
+ get_first_frame_depth_path,
24
  get_first_frame_path,
25
  get_input_video_path,
26
  get_job_directory,
 
274
  input_path = get_input_video_path(job_id)
275
  output_path = get_output_video_path(job_id)
276
  first_frame_path = get_first_frame_path(job_id)
277
+ depth_output_path = get_depth_output_path(job_id)
278
+ first_frame_depth_path = get_first_frame_depth_path(job_id)
279
 
280
  try:
281
  _save_upload_to_path(video, input_path)
 
318
  output_video_path=str(output_path),
319
  first_frame_path=str(first_frame_path),
320
  first_frame_detections=detections,
321
+ depth_estimator_name="depth_pro",
322
+ depth_output_path=str(depth_output_path),
323
+ first_frame_depth_path=str(first_frame_depth_path),
324
  )
325
  get_job_storage().create(job)
326
  asyncio.create_task(process_video_async(job_id))
 
328
  return {
329
  "job_id": job_id,
330
  "first_frame_url": f"/detect/first-frame/{job_id}",
331
+ "first_frame_depth_url": f"/detect/first-frame-depth/{job_id}",
332
  "status_url": f"/detect/status/{job_id}",
333
  "video_url": f"/detect/video/{job_id}",
334
+ "depth_video_url": f"/detect/depth-video/{job_id}",
335
  "status": job.status.value,
336
  "first_frame_detections": detections,
337
  }
 
405
  )
406
 
407
 
408
@app.get("/detect/depth-video/{job_id}")
async def detect_depth_video(job_id: str):
    """Serve the rendered depth-estimation video for a job.

    Guard order: unknown job -> missing depth output -> terminal/in-flight
    status -> file existence. While the job is processing, a 202 JSON body
    is returned so clients can keep polling.
    """
    job = get_job_storage().get(job_id)
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found or expired.")

    if not job.depth_output_path:
        # Depth may have failed even though the job as a whole completed.
        if job.partial_success and job.depth_error:
            raise HTTPException(status_code=404, detail=f"Depth unavailable: {job.depth_error}")
        raise HTTPException(status_code=404, detail="No depth video for this job.")

    status = job.status
    if status == JobStatus.FAILED:
        raise HTTPException(status_code=500, detail=f"Job failed: {job.error}")
    if status == JobStatus.CANCELLED:
        raise HTTPException(status_code=410, detail="Job was cancelled")
    if status == JobStatus.PROCESSING:
        # Not an error: the client should retry later.
        return JSONResponse(
            status_code=202,
            content={"detail": "Video still processing", "status": "processing"},
        )

    depth_file = Path(job.depth_output_path)
    if not depth_file.exists():
        raise HTTPException(status_code=404, detail="Depth video file not found.")

    return FileResponse(
        path=job.depth_output_path,
        media_type="video/mp4",
        filename="depth.mp4",
    )
435
+
436
+
437
@app.get("/detect/first-frame-depth/{job_id}")
async def detect_first_frame_depth(job_id: str):
    """Serve the depth visualization of the video's first frame."""
    job = get_job_storage().get(job_id)
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found or expired.")

    frame_path = job.first_frame_depth_path
    if not frame_path:
        # Distinguish "depth failed" (partial success) from "never produced".
        if job.partial_success and job.depth_error:
            raise HTTPException(status_code=404, detail=f"Depth unavailable: {job.depth_error}")
        raise HTTPException(status_code=404, detail="First frame depth not found.")

    if not Path(frame_path).exists():
        raise HTTPException(status_code=404, detail="First frame depth file not found.")

    return FileResponse(
        path=frame_path,
        media_type="image/jpeg",
        filename="first_frame_depth.jpg",
    )
455
+
456
+
457
  if __name__ == "__main__":
458
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
demo.html CHANGED
@@ -238,6 +238,20 @@
238
  display: block;
239
  }
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  .download-btn {
242
  margin-top: 12px;
243
  padding: 10px 16px;
@@ -271,6 +285,12 @@
271
  text-align: center;
272
  }
273
 
 
 
 
 
 
 
274
  .spinner {
275
  border: 4px solid #e5e7eb;
276
  border-top: 4px solid #1f2933;
@@ -402,6 +422,16 @@
402
  <img id="firstFrameImage" class="frame-preview" alt="First frame preview">
403
  </div>
404
  </div>
 
 
 
 
 
 
 
 
 
 
405
  <div class="video-card">
406
  <div class="video-card-header">Original Video</div>
407
  <div class="video-card-body">
@@ -417,6 +447,16 @@
417
  </a>
418
  </div>
419
  </div>
 
 
 
 
 
 
 
 
 
 
420
  </div>
421
  </div>
422
  </div>
@@ -444,6 +484,12 @@
444
  const processedVideo = document.getElementById('processedVideo');
445
  const firstFrameImage = document.getElementById('firstFrameImage');
446
  const downloadBtn = document.getElementById('downloadBtn');
 
 
 
 
 
 
447
  let statusPoller = null;
448
  const statusLine = document.getElementById('statusLine');
449
  // Mode selection handler
@@ -512,9 +558,19 @@
512
  statusPoller = null;
513
  }
514
  firstFrameImage.removeAttribute('src');
 
 
 
 
515
  processedVideo.removeAttribute('src');
516
  processedVideo.load();
517
  downloadBtn.removeAttribute('href');
 
 
 
 
 
 
518
  statusLine.classList.add('hidden');
519
  statusLine.textContent = '';
520
 
@@ -568,6 +624,8 @@
568
  const videoUrl = URL.createObjectURL(blob);
569
  processedVideo.src = videoUrl;
570
  downloadBtn.href = videoUrl;
 
 
571
  } else if (statusData.status === 'failed') {
572
  clearInterval(statusPoller);
573
  statusPoller = null;
@@ -593,6 +651,47 @@
593
  }
594
  });
595
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
  </script>
597
  </body>
598
  </html>
 
238
  display: block;
239
  }
240
 
241
+ .frame-placeholder {
242
+ width: 100%;
243
+ border-radius: 8px;
244
+ background: #f3f4f6;
245
+ color: #6b7280;
246
+ display: flex;
247
+ align-items: center;
248
+ justify-content: center;
249
+ min-height: 200px;
250
+ font-size: 0.95rem;
251
+ text-align: center;
252
+ padding: 16px;
253
+ }
254
+
255
  .download-btn {
256
  margin-top: 12px;
257
  padding: 10px 16px;
 
285
  text-align: center;
286
  }
287
 
288
+ .depth-status {
289
+ margin-top: 8px;
290
+ font-size: 0.85rem;
291
+ color: #6b7280;
292
+ }
293
+
294
  .spinner {
295
  border: 4px solid #e5e7eb;
296
  border-top: 4px solid #1f2933;
 
422
  <img id="firstFrameImage" class="frame-preview" alt="First frame preview">
423
  </div>
424
  </div>
425
+ <div class="video-card">
426
+ <div class="video-card-header">First Frame (Depth)</div>
427
+ <div class="video-card-body">
428
+ <div id="depthFramePlaceholder" class="frame-placeholder">
429
+ Depth preview will appear after processing.
430
+ </div>
431
+ <img id="depthFrameImage" class="frame-preview hidden" alt="First frame depth preview">
432
+ <div id="depthFrameStatus" class="depth-status"></div>
433
+ </div>
434
+ </div>
435
  <div class="video-card">
436
  <div class="video-card-header">Original Video</div>
437
  <div class="video-card-body">
 
447
  </a>
448
  </div>
449
  </div>
450
+ <div class="video-card">
451
+ <div class="video-card-header">Depth Video</div>
452
+ <div class="video-card-body">
453
+ <video id="depthVideo" controls autoplay loop class="hidden"></video>
454
+ <a id="depthDownloadBtn" class="download-btn hidden" download="depth.mp4">
455
+ Download Depth Video
456
+ </a>
457
+ <div id="depthVideoStatus" class="depth-status"></div>
458
+ </div>
459
+ </div>
460
  </div>
461
  </div>
462
  </div>
 
484
  const processedVideo = document.getElementById('processedVideo');
485
  const firstFrameImage = document.getElementById('firstFrameImage');
486
  const downloadBtn = document.getElementById('downloadBtn');
487
+ const depthFrameImage = document.getElementById('depthFrameImage');
488
+ const depthFramePlaceholder = document.getElementById('depthFramePlaceholder');
489
+ const depthFrameStatus = document.getElementById('depthFrameStatus');
490
+ const depthVideo = document.getElementById('depthVideo');
491
+ const depthDownloadBtn = document.getElementById('depthDownloadBtn');
492
+ const depthVideoStatus = document.getElementById('depthVideoStatus');
493
  let statusPoller = null;
494
  const statusLine = document.getElementById('statusLine');
495
  // Mode selection handler
 
558
  statusPoller = null;
559
  }
560
  firstFrameImage.removeAttribute('src');
561
+ depthFrameImage.removeAttribute('src');
562
+ depthFrameImage.classList.add('hidden');
563
+ depthFramePlaceholder.classList.remove('hidden');
564
+ depthFrameStatus.textContent = '';
565
  processedVideo.removeAttribute('src');
566
  processedVideo.load();
567
  downloadBtn.removeAttribute('href');
568
+ depthVideo.removeAttribute('src');
569
+ depthVideo.load();
570
+ depthVideo.classList.add('hidden');
571
+ depthDownloadBtn.removeAttribute('href');
572
+ depthDownloadBtn.classList.add('hidden');
573
+ depthVideoStatus.textContent = '';
574
  statusLine.classList.add('hidden');
575
  statusLine.textContent = '';
576
 
 
624
  const videoUrl = URL.createObjectURL(blob);
625
  processedVideo.src = videoUrl;
626
  downloadBtn.href = videoUrl;
627
+
628
+ await loadDepthAssets(data);
629
  } else if (statusData.status === 'failed') {
630
  clearInterval(statusPoller);
631
  statusPoller = null;
 
651
  }
652
  });
653
 
654
+ async function loadDepthAssets(jobData) {
655
+ if (!jobData.first_frame_depth_url || !jobData.depth_video_url) {
656
+ depthFrameStatus.textContent = 'Depth endpoints not available for this job.';
657
+ depthVideoStatus.textContent = 'Depth endpoints not available for this job.';
658
+ return;
659
+ }
660
+
661
+ try {
662
+ const frameResponse = await fetch(jobData.first_frame_depth_url);
663
+ if (frameResponse.ok) {
664
+ const frameBlob = await frameResponse.blob();
665
+ const frameUrl = URL.createObjectURL(frameBlob);
666
+ depthFrameImage.src = frameUrl;
667
+ depthFrameImage.classList.remove('hidden');
668
+ depthFramePlaceholder.classList.add('hidden');
669
+ } else {
670
+ const error = await frameResponse.json();
671
+ depthFrameStatus.textContent = error.detail || 'Depth preview unavailable.';
672
+ }
673
+ } catch (error) {
674
+ depthFrameStatus.textContent = 'Depth preview failed to load.';
675
+ }
676
+
677
+ try {
678
+ const depthResponse = await fetch(jobData.depth_video_url);
679
+ if (depthResponse.ok) {
680
+ const depthBlob = await depthResponse.blob();
681
+ const depthUrl = URL.createObjectURL(depthBlob);
682
+ depthVideo.src = depthUrl;
683
+ depthVideo.classList.remove('hidden');
684
+ depthDownloadBtn.href = depthUrl;
685
+ depthDownloadBtn.classList.remove('hidden');
686
+ } else {
687
+ const error = await depthResponse.json();
688
+ depthVideoStatus.textContent = error.detail || 'Depth video unavailable.';
689
+ }
690
+ } catch (error) {
691
+ depthVideoStatus.textContent = 'Depth video failed to load.';
692
+ }
693
+ }
694
+
695
  </script>
696
  </body>
697
  </html>
inference.py CHANGED
@@ -347,3 +347,139 @@ def run_segmentation(
347
  logging.info("Segmented video written to: %s", output_video_path)
348
 
349
  return output_video_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  logging.info("Segmented video written to: %s", output_video_path)
348
 
349
  return output_video_path
350
+
351
+
352
def run_depth_inference(
    input_video_path: str,
    output_video_path: str,
    max_frames: Optional[int] = None,
    depth_estimator_name: str = "depth_pro",
    job_id: Optional[str] = None,
) -> str:
    """
    Run per-frame depth estimation on a video and write a colorized video.

    Args:
        input_video_path: Path to the input video.
        output_video_path: Path where the depth visualization video is written.
        max_frames: Optional cap on the number of frames processed (testing aid).
        depth_estimator_name: Registered depth estimator to use (default: depth_pro).
        job_id: Optional job ID; enables cooperative cancellation.

    Returns:
        ``output_video_path`` once the depth video has been written.

    Raises:
        ValueError: If the input video cannot be decoded.
    """
    try:
        frames, fps, width, height = extract_frames(input_video_path)
    except ValueError:
        # Log with traceback, then propagate unchanged to the caller.
        # (No `as exc` binding: the exception object was never used.)
        logging.exception("Failed to decode video at %s", input_video_path)
        raise

    logging.info("Using depth estimator: %s", depth_estimator_name)

    # Limit frames if requested (useful for quick tests).
    if max_frames is not None:
        frames = frames[:max_frames]

    # Two-pass depth processing with a video-wide normalization range
    # so the color scale stays stable across frames.
    processed_frames = process_frames_depth(frames, depth_estimator_name, job_id)

    write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
    logging.info("Depth video written to: %s", output_video_path)

    return output_video_path
392
+
393
+
394
def process_frames_depth(
    frames: List[np.ndarray],
    depth_estimator_name: str,
    job_id: Optional[str] = None,
) -> List[np.ndarray]:
    """
    Estimate depth for every frame and colorize with a video-wide range.

    Pass 1 runs the estimator on each frame and records the raw depth maps.
    Pass 2 colorizes every map against the global 1st/99th-percentile range,
    keeping the color scale fixed across the whole clip (avoids flicker).

    Args:
        frames: Frames as HxWx3 BGR uint8 arrays.
        depth_estimator_name: Name of the registered depth estimator.
        job_id: Optional job ID used for cooperative cancellation.

    Returns:
        One colorized depth frame (HxWx3 uint8) per input frame.
    """
    from models.depth_estimators.model_loader import load_depth_estimator

    estimator = load_depth_estimator(depth_estimator_name)

    # Pass 1: run inference and gather raw depth values for the global range.
    depth_maps = []
    all_values = []
    total = len(frames)
    for idx, frame in enumerate(frames):
        _check_cancellation(job_id)

        # Serialize access to the shared model instance.
        with _get_model_lock("depth", estimator.name):
            result = estimator.predict(frame)

        depth_maps.append(result.depth_map)
        all_values.append(result.depth_map.ravel())

        if idx % 10 == 0:
            logging.debug("Computed depth for frame %d/%d", idx + 1, total)

    # Percentile bounds clip outliers that would otherwise crush the range.
    all_depths = np.concatenate(all_values)
    global_min = np.percentile(all_depths, 1)
    global_max = np.percentile(all_depths, 99)

    logging.info(
        "Depth range: %.2f - %.2f meters (1st-99th percentile)",
        global_min,
        global_max,
    )

    # Pass 2: colorize with the stable global range.
    processed = []
    for idx, depth_map in enumerate(depth_maps):
        processed.append(colorize_depth_map(depth_map, global_min, global_max))

        if idx % 10 == 0:
            logging.debug("Colorized frame %d/%d", idx + 1, len(depth_maps))

    return processed
455
+
456
+
457
def colorize_depth_map(
    depth_map: np.ndarray,
    global_min: float,
    global_max: float,
) -> np.ndarray:
    """
    Convert a depth map to a color visualization using the TURBO colormap.

    Args:
        depth_map: HxW float32 depth in meters
        global_min: Minimum depth across entire video (for stable normalization)
        global_max: Maximum depth across entire video (for stable normalization)

    Returns:
        HxWx3 uint8 image in BGR channel order (``cv2.applyColorMap`` returns
        OpenCV's native BGR layout, not RGB).
    """
    import cv2

    if global_max - global_min < 1e-6:  # Handle uniform depth (avoid div-by-zero)
        depth_norm = np.zeros_like(depth_map, dtype=np.uint8)
    else:
        # Clip to the global range so outliers don't wrap or saturate oddly.
        depth_clipped = np.clip(depth_map, global_min, global_max)
        depth_norm = ((depth_clipped - global_min) / (global_max - global_min) * 255).astype(np.uint8)

    # TURBO gives a vibrant, near-perceptually-uniform mapping.
    colored = cv2.applyColorMap(depth_norm, cv2.COLORMAP_TURBO)

    return colored
jobs/background.py CHANGED
@@ -2,9 +2,11 @@ import asyncio
2
  import logging
3
  from datetime import datetime
4
 
 
 
5
  from jobs.models import JobStatus
6
- from jobs.storage import get_job_storage
7
- from inference import run_inference, run_segmentation
8
 
9
 
10
  async def process_video_async(job_id: str) -> None:
@@ -13,9 +15,15 @@ async def process_video_async(job_id: str) -> None:
13
  if not job:
14
  return
15
 
 
 
 
 
 
16
  try:
 
17
  if job.mode == "segmentation":
18
- output_path = await asyncio.to_thread(
19
  run_segmentation,
20
  job.input_video_path,
21
  job.output_video_path,
@@ -25,7 +33,7 @@ async def process_video_async(job_id: str) -> None:
25
  job_id,
26
  )
27
  else:
28
- output_path = await asyncio.to_thread(
29
  run_inference,
30
  job.input_video_path,
31
  job.output_video_path,
@@ -34,12 +42,52 @@ async def process_video_async(job_id: str) -> None:
34
  job.detector_name,
35
  job_id,
36
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  storage.update(
38
  job_id,
39
  status=JobStatus.COMPLETED,
40
  completed_at=datetime.utcnow(),
41
- output_video_path=output_path,
 
 
 
42
  )
 
43
  except RuntimeError as exc:
44
  # Handle cancellation specifically
45
  if "cancelled" in str(exc).lower():
 
2
  import logging
3
  from datetime import datetime
4
 
5
+ import torch
6
+
7
  from jobs.models import JobStatus
8
+ from jobs.storage import get_job_storage, get_depth_output_path
9
+ from inference import run_inference, run_segmentation, run_depth_inference
10
 
11
 
12
  async def process_video_async(job_id: str) -> None:
 
15
  if not job:
16
  return
17
 
18
+ detection_path = None
19
+ depth_path = None
20
+ depth_error = None
21
+ partial_success = False
22
+
23
  try:
24
+ # Run detection or segmentation first
25
  if job.mode == "segmentation":
26
+ detection_path = await asyncio.to_thread(
27
  run_segmentation,
28
  job.input_video_path,
29
  job.output_video_path,
 
33
  job_id,
34
  )
35
  else:
36
+ detection_path = await asyncio.to_thread(
37
  run_inference,
38
  job.input_video_path,
39
  job.output_video_path,
 
42
  job.detector_name,
43
  job_id,
44
  )
45
+
46
+ # Try to run depth estimation
47
+ try:
48
+ depth_path = await asyncio.to_thread(
49
+ run_depth_inference,
50
+ job.input_video_path,
51
+ str(get_depth_output_path(job_id)),
52
+ None, # max_frames
53
+ job.depth_estimator_name,
54
+ job_id,
55
+ )
56
+ logging.info("Depth estimation completed for job %s", job_id)
57
+ except (ImportError, ModuleNotFoundError) as exc:
58
+ logging.exception("Depth model not available for job %s", job_id)
59
+ depth_error = f"Depth model import failed: {exc}"
60
+ partial_success = True
61
+ except torch.cuda.OutOfMemoryError:
62
+ logging.exception("Depth estimation failed due to GPU OOM for job %s", job_id)
63
+ depth_error = "Depth estimation failed due to GPU memory limits"
64
+ partial_success = True
65
+ except RuntimeError as exc:
66
+ # Handle cancellation specifically for depth
67
+ if "cancelled" in str(exc).lower():
68
+ logging.info("Depth processing cancelled for job %s", job_id)
69
+ depth_error = "Depth processing cancelled"
70
+ partial_success = True
71
+ else:
72
+ logging.exception("Depth estimation failed for job %s", job_id)
73
+ depth_error = f"Depth processing error: {str(exc)}"
74
+ partial_success = True
75
+ except Exception as exc:
76
+ logging.exception("Depth estimation failed for job %s", job_id)
77
+ depth_error = f"Depth processing error: {str(exc)}"
78
+ partial_success = True
79
+
80
+ # Mark as completed (with or without depth)
81
  storage.update(
82
  job_id,
83
  status=JobStatus.COMPLETED,
84
  completed_at=datetime.utcnow(),
85
+ output_video_path=detection_path,
86
+ depth_output_path=depth_path,
87
+ partial_success=partial_success,
88
+ depth_error=depth_error,
89
  )
90
+
91
  except RuntimeError as exc:
92
  # Handle cancellation specifically
93
  if "cancelled" in str(exc).lower():
jobs/models.py CHANGED
@@ -26,3 +26,9 @@ class JobInfo:
26
  completed_at: Optional[datetime] = None
27
  error: Optional[str] = None
28
  first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
 
 
 
 
 
 
 
26
  completed_at: Optional[datetime] = None
27
  error: Optional[str] = None
28
  first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
29
+ # Depth estimation fields
30
+ depth_estimator_name: str = "depth_pro" # Always depth_pro for now
31
+ depth_output_path: Optional[str] = None
32
+ first_frame_depth_path: Optional[str] = None
33
+ partial_success: bool = False # True if one component failed but job completed
34
+ depth_error: Optional[str] = None # Error message if depth failed
jobs/storage.py CHANGED
@@ -25,6 +25,16 @@ def get_first_frame_path(job_id: str) -> Path:
25
  return get_job_directory(job_id) / "first_frame.jpg"
26
 
27
 
 
 
 
 
 
 
 
 
 
 
28
  class JobStorage:
29
  def __init__(self) -> None:
30
  self._jobs: Dict[str, JobInfo] = {}
 
25
  return get_job_directory(job_id) / "first_frame.jpg"
26
 
27
 
28
def get_depth_output_path(job_id: str) -> Path:
    """Return the path of the job's depth-estimation video output."""
    job_dir = get_job_directory(job_id)
    return job_dir / "depth.mp4"
31
+
32
+
33
def get_first_frame_depth_path(job_id: str) -> Path:
    """Return the path of the job's first-frame depth visualization."""
    job_dir = get_job_directory(job_id)
    return job_dir / "first_frame_depth.jpg"
36
+
37
+
38
  class JobStorage:
39
  def __init__(self) -> None:
40
  self._jobs: Dict[str, JobInfo] = {}
models/depth_estimators/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Depth estimation models for video processing."""
2
+
3
+ from .base import DepthEstimator, DepthResult
4
+ from .depth_pro import DepthProEstimator
5
+ from .model_loader import list_depth_estimators, load_depth_estimator
6
+
7
+ __all__ = [
8
+ "DepthEstimator",
9
+ "DepthResult",
10
+ "DepthProEstimator",
11
+ "load_depth_estimator",
12
+ "list_depth_estimators",
13
+ ]
models/depth_estimators/base.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import NamedTuple

import numpy as np


class DepthResult(NamedTuple):
    """Output of a single depth-estimation call.

    Fields:
        depth_map: HxW float32 array of per-pixel depth in meters.
        focal_length: Estimated focal length of the camera, in pixels.
    """
    depth_map: np.ndarray
    focal_length: float


class DepthEstimator:
    """Interface that all depth-estimation backends implement."""

    # Short identifier for the backend (e.g. "depth_pro"); set by subclasses.
    name: str

    def predict(self, frame: np.ndarray) -> DepthResult:
        """
        Estimate depth for a single frame.

        Args:
            frame: Input image as a numpy array (HxWxC, BGR format from OpenCV)

        Returns:
            DepthResult holding the depth map and estimated focal length.
        """
        raise NotImplementedError
models/depth_estimators/depth_pro.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ import numpy as np
4
+ import torch
5
+ from PIL import Image
6
+
7
+ from .base import DepthEstimator, DepthResult
8
+
9
+
10
class DepthProEstimator(DepthEstimator):
    """Apple Depth Pro monocular depth estimator."""

    name = "depth_pro"

    def __init__(self):
        """Load the Depth Pro model, preferring GPU when CUDA is available.

        Raises:
            ImportError: If the ``depth_pro`` package is not installed.
        """
        try:
            import depth_pro
        except ImportError as exc:
            raise ImportError(
                "depth_pro package not installed. "
                "Install with: pip install git+https://github.com/apple/ml-depth-pro.git"
            ) from exc

        logging.info("Loading Depth Pro model...")
        self.model, self.transform = depth_pro.create_model_and_transforms()
        self.model.eval()

        # Move model to GPU if available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if torch.cuda.is_available():
            self.model = self.model.cuda()
            logging.info("Depth Pro model loaded on GPU")
        else:
            logging.warning("Depth Pro model loaded on CPU (no CUDA available)")

    def predict(self, frame: np.ndarray) -> DepthResult:
        """
        Run depth estimation on a single frame.

        Args:
            frame: HxWx3 BGR uint8 numpy array (OpenCV format)

        Returns:
            DepthResult with depth_map (HxW float32 in meters) and focal_length
        """
        # BGR -> RGB. The reversed slice is a negative-stride view, which
        # PIL cannot reliably consume directly, so force a contiguous copy.
        rgb_frame = np.ascontiguousarray(frame[:, :, ::-1])

        # Convert to PIL Image for the model's preprocessing transform.
        pil_image = Image.fromarray(rgb_frame)

        image_tensor = self.transform(pil_image)
        image_tensor = image_tensor.to(self.device)

        # Inference only; no gradients needed. f_px=None lets the model
        # estimate the focal length itself.
        with torch.no_grad():
            prediction = self.model.infer(image_tensor, f_px=None)

        # prediction is a dict: {"depth": tensor, "focallength_px": tensor}
        depth_tensor = prediction["depth"]
        focal_length_tensor = prediction.get("focallength_px")

        # Convert to numpy; squeeze removes a batch dimension if present.
        depth_map = depth_tensor.cpu().numpy().squeeze()

        # Fall back to 1.0 when the model reports no focal length.
        if focal_length_tensor is not None:
            focal_length = float(focal_length_tensor.cpu().item())
        else:
            focal_length = 1.0

        return DepthResult(depth_map=depth_map, focal_length=focal_length)
models/depth_estimators/model_loader.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Registry and loader for depth estimators."""
2
+
3
+ from functools import lru_cache
4
+ from typing import Callable, Dict
5
+
6
+ from .base import DepthEstimator
7
+ from .depth_pro import DepthProEstimator
8
+
9
+
10
+ # Registry of depth estimators
11
+ _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
12
+ "depth_pro": DepthProEstimator,
13
+ }
14
+
15
+
16
@lru_cache(maxsize=None)
def _get_cached_depth_estimator(name: str) -> DepthEstimator:
    """Build a depth estimator once per name and reuse it afterwards.

    Args:
        name: Registered estimator name (e.g. "depth_pro").

    Returns:
        The cached estimator instance for ``name``.
    """
    return _create_depth_estimator(name)
28
+
29
+
30
def _create_depth_estimator(name: str) -> DepthEstimator:
    """Instantiate the estimator registered under ``name``.

    Args:
        name: Depth estimator name.

    Returns:
        A fresh depth estimator instance.

    Raises:
        KeyError: If ``name`` is not present in the registry.
    """
    factory = _REGISTRY.get(name)
    if factory is None:
        raise KeyError(
            f"Depth estimator '{name}' not found. Available: {list(_REGISTRY.keys())}"
        )
    return factory()
50
+
51
+
52
def load_depth_estimator(name: str = "depth_pro") -> DepthEstimator:
    """Return the shared, lazily-created depth estimator for ``name``.

    Args:
        name: Depth estimator name (default: "depth_pro").

    Returns:
        Cached depth estimator instance.
    """
    return _get_cached_depth_estimator(name)
63
+
64
+
65
def list_depth_estimators() -> list[str]:
    """Return the names of all registered depth estimators."""
    return [key for key in _REGISTRY]
requirements.txt CHANGED
@@ -11,3 +11,4 @@ huggingface-hub
11
  ultralytics
12
  timm
13
  ffmpeg-python
 
 
11
  ultralytics
12
  timm
13
  ffmpeg-python
14
+ depth-pro @ git+https://github.com/apple/ml-depth-pro.git