Zhen Ye committed on
Commit
5e832fe
·
1 Parent(s): 012b29b

added depth view

Browse files
LaserPerception/LaserPerception.html CHANGED
@@ -291,6 +291,10 @@
291
  <button id="btnRecompute" class="btn secondary">Recompute HEL</button>
292
  <button id="btnClear" class="btn secondary">Clear</button>
293
  </div>
 
 
 
 
294
  </div>
295
 
296
  <div class="panel panel-objects">
@@ -419,6 +423,7 @@
419
  <span class="chip" id="chipBeam">BEAM:OFF</span>
420
  <span class="chip" id="chipHz">DET:6Hz</span>
421
  <span class="chip" id="chipFeed" title="Toggle raw vs HF-processed feed (if available)">FEED:RAW</span>
 
422
  </div>
423
 
424
  <div class="mt-md">
 
291
  <button id="btnRecompute" class="btn secondary">Recompute HEL</button>
292
  <button id="btnClear" class="btn secondary">Clear</button>
293
  </div>
294
+
295
+ <div class="strip mt-md">
296
+ <span class="chip" id="chipFrameDepth" title="Toggle depth view of first frame (if available)">VIEW:DEFAULT</span>
297
+ </div>
298
  </div>
299
 
300
  <div class="panel panel-objects">
 
423
  <span class="chip" id="chipBeam">BEAM:OFF</span>
424
  <span class="chip" id="chipHz">DET:6Hz</span>
425
  <span class="chip" id="chipFeed" title="Toggle raw vs HF-processed feed (if available)">FEED:RAW</span>
426
+ <span class="chip" id="chipDepth" title="Toggle depth view (if available)">VIEW:DEFAULT</span>
427
  </div>
428
 
429
  <div class="mt-md">
LaserPerception/LaserPerception.js CHANGED
@@ -25,6 +25,8 @@
25
  videoFile: null,
26
  videoLoaded: false,
27
  useProcessedFeed: false,
 
 
28
  hasReasoned: false,
29
  isReasoning: false, // Flag to prevent concurrent Reason executions
30
 
@@ -42,6 +44,10 @@
42
  queries: [], // Mission objective used as query
43
  processedUrl: null,
44
  processedBlob: null,
 
 
 
 
45
  summary: null,
46
  busy: false,
47
  lastError: null
@@ -179,6 +185,8 @@
179
  const chipTracks = $("#chipTracks");
180
  const chipBeam = $("#chipBeam");
181
  const chipHz = $("#chipHz");
 
 
182
 
183
  const dwellText = $("#dwellText");
184
  const dwellBar = $("#dwellBar");
@@ -246,6 +254,29 @@
246
  log(`Engage feed set to: ${state.useProcessedFeed ? "HF" : "RAW"}`, "t");
247
  });
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
  // Refresh intel summary (unbiased)
251
  if (btnIntelRefresh) {
@@ -378,15 +409,27 @@
378
  if (state.hf.processedUrl && state.hf.processedUrl.startsWith("blob:")) {
379
  try { URL.revokeObjectURL(state.hf.processedUrl); } catch (_) { }
380
  }
 
 
 
 
 
 
381
  state.videoUrl = null;
382
  state.videoFile = null;
383
  state.videoLoaded = false;
384
  state.useProcessedFeed = false;
 
 
385
 
386
  state.hf.missionId = null;
387
  state.hf.plan = null;
388
  state.hf.processedUrl = null;
389
  state.hf.processedBlob = null;
 
 
 
 
390
  state.hf.summary = null;
391
  state.hf.busy = false;
392
  state.hf.lastError = null;
@@ -469,8 +512,18 @@
469
  if (state.hf.processedUrl && state.hf.processedUrl.startsWith("blob:")) {
470
  try { URL.revokeObjectURL(state.hf.processedUrl); } catch (_) { }
471
  }
 
 
 
 
 
 
472
  state.hf.processedUrl = null;
473
  state.hf.processedBlob = null;
 
 
 
 
474
  state.hf.asyncJobId = null;
475
  state.hf.firstFrameUrl = null;
476
  state.hf.firstFrameDetections = null;
@@ -483,6 +536,8 @@
483
  state.hf.lastError = null;
484
  state.hf.busy = false;
485
  state.useProcessedFeed = false;
 
 
486
  setHfStatus("idle");
487
  renderMissionContext();
488
  videoHidden.src = state.videoUrl;
@@ -702,6 +757,7 @@
702
  if (chipFeed) {
703
  chipFeed.textContent = state.useProcessedFeed ? "FEED:HF" : "FEED:RAW";
704
  }
 
705
  }
706
 
707
  function normalizeToken(s) {
@@ -812,6 +868,9 @@
812
  }
813
  // drone_detection uses drone_yolo automatically
814
 
 
 
 
815
  // Submit async job
816
  setHfStatus(`submitting ${mode} job...`);
817
  log(`Submitting ${mode} to ${state.hf.baseUrl || "(same-origin)"} (detector=${detector || "n/a"})`, "t");
@@ -837,6 +896,16 @@
837
  state.hf.videoUrl = `${state.hf.baseUrl}${data.video_url}`;
838
  state.hf.asyncStatus = data.status;
839
 
 
 
 
 
 
 
 
 
 
 
840
  // Display first frame immediately (if object detection, segmentation, or drone)
841
  if ((mode === "object_detection" || mode === "segmentation" || mode === "drone_detection") && state.hf.firstFrameUrl) {
842
  const count = Array.isArray(data.first_frame_detections) ? data.first_frame_detections.length : null;
@@ -961,6 +1030,8 @@
961
  setHfStatus("job completed, fetching video...");
962
  try {
963
  await fetchProcessedVideo();
 
 
964
  clearInterval(state.hf.asyncPollInterval);
965
  // Clear job ID to prevent cancel attempts after completion
966
  state.hf.asyncJobId = null;
@@ -1037,6 +1108,82 @@
1037
  log(`Processed video ready (${(blob.size / 1024 / 1024).toFixed(1)} MB)`);
1038
  }
1039
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1040
  function stopAsyncPolling() {
1041
  if (state.hf.asyncPollInterval) {
1042
  clearInterval(state.hf.asyncPollInterval);
@@ -1075,6 +1222,118 @@
1075
  }
1076
  }
1077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1078
  async function startHfPipeline() {
1079
  if (state.hf.busy) {
1080
  log("HF pipeline already running");
 
25
  videoFile: null,
26
  videoLoaded: false,
27
  useProcessedFeed: false,
28
+ useDepthFeed: false, // Flag for depth view (Tab 2 video)
29
+ useFrameDepthView: false, // Flag for first frame depth view (Tab 1)
30
  hasReasoned: false,
31
  isReasoning: false, // Flag to prevent concurrent Reason executions
32
 
 
44
  queries: [], // Mission objective used as query
45
  processedUrl: null,
46
  processedBlob: null,
47
+ depthVideoUrl: null, // Depth video URL
48
+ depthFirstFrameUrl: null, // First frame depth URL
49
+ depthBlob: null, // Depth video blob
50
+ depthFirstFrameBlob: null, // Depth first frame blob
51
  summary: null,
52
  busy: false,
53
  lastError: null
 
185
  const chipTracks = $("#chipTracks");
186
  const chipBeam = $("#chipBeam");
187
  const chipHz = $("#chipHz");
188
+ const chipDepth = $("#chipDepth");
189
+ const chipFrameDepth = $("#chipFrameDepth");
190
 
191
  const dwellText = $("#dwellText");
192
  const dwellBar = $("#dwellBar");
 
254
  log(`Engage feed set to: ${state.useProcessedFeed ? "HF" : "RAW"}`, "t");
255
  });
256
 
257
+ // Toggle depth view
258
+ chipDepth.addEventListener("click", async () => {
259
+ if (!state.videoLoaded) return;
260
+ if (!state.hf.depthVideoUrl) {
261
+ log("Depth video not ready yet. Run Reason and wait for depth processing.", "w");
262
+ return;
263
+ }
264
+ await toggleDepthView();
265
+ log(`View set to: ${state.useDepthFeed ? "DEPTH" : "DEFAULT"}`, "t");
266
+ });
267
+
268
+ // Toggle first frame depth view (Tab 1)
269
+ if (chipFrameDepth) {
270
+ chipFrameDepth.addEventListener("click", () => {
271
+ if (!state.videoLoaded) return;
272
+ if (!state.hf.depthFirstFrameUrl) {
273
+ log("First frame depth not ready yet. Run Reason and wait for depth processing.", "w");
274
+ return;
275
+ }
276
+ toggleFirstFrameDepthView();
277
+ log(`First frame view set to: ${state.useFrameDepthView ? "DEPTH" : "DEFAULT"}`, "t");
278
+ });
279
+ }
280
 
281
  // Refresh intel summary (unbiased)
282
  if (btnIntelRefresh) {
 
409
  if (state.hf.processedUrl && state.hf.processedUrl.startsWith("blob:")) {
410
  try { URL.revokeObjectURL(state.hf.processedUrl); } catch (_) { }
411
  }
412
+ if (state.hf.depthVideoUrl && state.hf.depthVideoUrl.startsWith("blob:")) {
413
+ try { URL.revokeObjectURL(state.hf.depthVideoUrl); } catch (_) { }
414
+ }
415
+ if (state.hf.depthFirstFrameUrl && state.hf.depthFirstFrameUrl.startsWith("blob:")) {
416
+ try { URL.revokeObjectURL(state.hf.depthFirstFrameUrl); } catch (_) { }
417
+ }
418
  state.videoUrl = null;
419
  state.videoFile = null;
420
  state.videoLoaded = false;
421
  state.useProcessedFeed = false;
422
+ state.useDepthFeed = false;
423
+ state.useFrameDepthView = false;
424
 
425
  state.hf.missionId = null;
426
  state.hf.plan = null;
427
  state.hf.processedUrl = null;
428
  state.hf.processedBlob = null;
429
+ state.hf.depthVideoUrl = null;
430
+ state.hf.depthBlob = null;
431
+ state.hf.depthFirstFrameUrl = null;
432
+ state.hf.depthFirstFrameBlob = null;
433
  state.hf.summary = null;
434
  state.hf.busy = false;
435
  state.hf.lastError = null;
 
512
  if (state.hf.processedUrl && state.hf.processedUrl.startsWith("blob:")) {
513
  try { URL.revokeObjectURL(state.hf.processedUrl); } catch (_) { }
514
  }
515
+ if (state.hf.depthVideoUrl && state.hf.depthVideoUrl.startsWith("blob:")) {
516
+ try { URL.revokeObjectURL(state.hf.depthVideoUrl); } catch (_) { }
517
+ }
518
+ if (state.hf.depthFirstFrameUrl && state.hf.depthFirstFrameUrl.startsWith("blob:")) {
519
+ try { URL.revokeObjectURL(state.hf.depthFirstFrameUrl); } catch (_) { }
520
+ }
521
  state.hf.processedUrl = null;
522
  state.hf.processedBlob = null;
523
+ state.hf.depthVideoUrl = null;
524
+ state.hf.depthBlob = null;
525
+ state.hf.depthFirstFrameUrl = null;
526
+ state.hf.depthFirstFrameBlob = null;
527
  state.hf.asyncJobId = null;
528
  state.hf.firstFrameUrl = null;
529
  state.hf.firstFrameDetections = null;
 
536
  state.hf.lastError = null;
537
  state.hf.busy = false;
538
  state.useProcessedFeed = false;
539
+ state.useDepthFeed = false;
540
+ state.useFrameDepthView = false;
541
  setHfStatus("idle");
542
  renderMissionContext();
543
  videoHidden.src = state.videoUrl;
 
757
  if (chipFeed) {
758
  chipFeed.textContent = state.useProcessedFeed ? "FEED:HF" : "FEED:RAW";
759
  }
760
+ updateDepthChip();
761
  }
762
 
763
  function normalizeToken(s) {
 
868
  }
869
  // drone_detection uses drone_yolo automatically
870
 
871
+ // Add depth_estimator parameter for depth processing
872
+ form.append("depth_estimator", "depth");
873
+
874
  // Submit async job
875
  setHfStatus(`submitting ${mode} job...`);
876
  log(`Submitting ${mode} to ${state.hf.baseUrl || "(same-origin)"} (detector=${detector || "n/a"})`, "t");
 
896
  state.hf.videoUrl = `${state.hf.baseUrl}${data.video_url}`;
897
  state.hf.asyncStatus = data.status;
898
 
899
+ // Store depth URLs if provided
900
+ if (data.depth_video_url) {
901
+ state.hf.depthVideoUrl = `${state.hf.baseUrl}${data.depth_video_url}`;
902
+ log("Depth video URL received", "t");
903
+ }
904
+ if (data.first_frame_depth_url) {
905
+ state.hf.depthFirstFrameUrl = `${state.hf.baseUrl}${data.first_frame_depth_url}`;
906
+ log("First frame depth URL received (will fetch when ready)", "t");
907
+ }
908
+
909
  // Display first frame immediately (if object detection, segmentation, or drone)
910
  if ((mode === "object_detection" || mode === "segmentation" || mode === "drone_detection") && state.hf.firstFrameUrl) {
911
  const count = Array.isArray(data.first_frame_detections) ? data.first_frame_detections.length : null;
 
1030
  setHfStatus("job completed, fetching video...");
1031
  try {
1032
  await fetchProcessedVideo();
1033
+ await fetchDepthVideo();
1034
+ await fetchDepthFirstFrame();
1035
  clearInterval(state.hf.asyncPollInterval);
1036
  // Clear job ID to prevent cancel attempts after completion
1037
  state.hf.asyncJobId = null;
 
1108
  log(`Processed video ready (${(blob.size / 1024 / 1024).toFixed(1)} MB)`);
1109
  }
1110
 
1111
+ async function fetchDepthVideo() {
1112
+ if (!state.hf.depthVideoUrl) {
1113
+ log("No depth video URL available", "w");
1114
+ return;
1115
+ }
1116
+
1117
+ try {
1118
+ const resp = await fetch(state.hf.depthVideoUrl, { cache: "no-store" });
1119
+
1120
+ if (!resp.ok) {
1121
+ if (resp.status === 202) {
1122
+ log("Depth video still processing", "w");
1123
+ return;
1124
+ }
1125
+ throw new Error(`Failed to fetch depth video: ${resp.statusText}`);
1126
+ }
1127
+
1128
+ const nullOrigin = (window.location && window.location.origin) === "null";
1129
+ if (nullOrigin) {
1130
+ state.hf.depthBlob = null;
1131
+ state.hf.depthVideoUrl = `${state.hf.depthVideoUrl}?t=${Date.now()}`;
1132
+ log("Depth video ready (streaming URL)");
1133
+ return;
1134
+ }
1135
+
1136
+ const blob = await resp.blob();
1137
+
1138
+ // Store the original URL before creating blob
1139
+ const originalUrl = state.hf.depthVideoUrl;
1140
+
1141
+ state.hf.depthBlob = blob;
1142
+ const blobUrl = URL.createObjectURL(blob);
1143
+ state.hf.depthVideoUrl = blobUrl;
1144
+
1145
+ log(`Depth video ready (${(blob.size / 1024 / 1024).toFixed(1)} MB) - Click VIEW chip to toggle`, "g");
1146
+ updateDepthChip();
1147
+ } catch (err) {
1148
+ log(`Error fetching depth video: ${err.message}`, "e");
1149
+ }
1150
+ }
1151
+
1152
+ async function fetchDepthFirstFrame() {
1153
+ if (!state.hf.depthFirstFrameUrl) {
1154
+ log("No depth first frame URL available", "w");
1155
+ return;
1156
+ }
1157
+
1158
+ try {
1159
+ const resp = await fetch(state.hf.depthFirstFrameUrl, { cache: "no-store" });
1160
+
1161
+ if (!resp.ok) {
1162
+ if (resp.status === 202) {
1163
+ log("Depth first frame still processing", "w");
1164
+ return;
1165
+ }
1166
+ throw new Error(`Failed to fetch depth first frame: ${resp.statusText}`);
1167
+ }
1168
+
1169
+ // Fetch as blob and create blob URL
1170
+ const blob = await resp.blob();
1171
+
1172
+ // Store the blob and create a blob URL
1173
+ state.hf.depthFirstFrameBlob = blob;
1174
+ const blobUrl = URL.createObjectURL(blob);
1175
+
1176
+ // Replace the server URL with the blob URL
1177
+ const originalUrl = state.hf.depthFirstFrameUrl;
1178
+ state.hf.depthFirstFrameUrl = blobUrl;
1179
+
1180
+ log(`✓ Depth first frame ready (${(blob.size / 1024).toFixed(1)} KB) - Click VIEW chip on Tab 1 to toggle`, "g");
1181
+ updateFirstFrameDepthChip();
1182
+ } catch (err) {
1183
+ log(`Error fetching depth first frame: ${err.message}`, "e");
1184
+ }
1185
+ }
1186
+
1187
  function stopAsyncPolling() {
1188
  if (state.hf.asyncPollInterval) {
1189
  clearInterval(state.hf.asyncPollInterval);
 
1222
  }
1223
  }
1224
 
1225
+ async function toggleDepthView() {
1226
+ state.useDepthFeed = !state.useDepthFeed;
1227
+ updateDepthChip();
1228
+
1229
+ if (!state.videoLoaded) return;
1230
+
1231
+ const wasPlaying = !videoEngage.paused;
1232
+ const t = videoEngage.currentTime || 0;
1233
+
1234
+ try { videoEngage.pause(); } catch (_) { }
1235
+
1236
+ let desiredSrc;
1237
+ if (state.useDepthFeed && state.hf.depthVideoUrl) {
1238
+ desiredSrc = state.hf.depthVideoUrl;
1239
+ } else if (state.useProcessedFeed && state.hf.processedUrl) {
1240
+ desiredSrc = state.hf.processedUrl;
1241
+ } else {
1242
+ desiredSrc = state.videoUrl;
1243
+ }
1244
+
1245
+ if (videoEngage.src !== desiredSrc) {
1246
+ videoEngage.src = desiredSrc;
1247
+ videoEngage.setAttribute('data-depth', state.useDepthFeed ? 'true' : 'false');
1248
+ log(`Video view switched to: ${state.useDepthFeed ? 'depth' : 'default'}`, "t");
1249
+ videoEngage.load();
1250
+ await waitVideoReady(videoEngage);
1251
+ try { videoEngage.currentTime = Math.min(t, (videoEngage.duration || t)); } catch (_) { }
1252
+ }
1253
+
1254
+ resizeOverlays();
1255
+ if (wasPlaying) {
1256
+ try { await videoEngage.play(); } catch (_) { }
1257
+ }
1258
+ }
1259
+
1260
+ function updateDepthChip() {
1261
+ if (chipDepth) {
1262
+ chipDepth.textContent = state.useDepthFeed ? "VIEW:DEPTH" : "VIEW:DEFAULT";
1263
+ }
1264
+ }
1265
+
1266
+ function toggleFirstFrameDepthView() {
1267
+ state.useFrameDepthView = !state.useFrameDepthView;
1268
+ updateFirstFrameDepthChip();
1269
+ displayFirstFrameWithDepth();
1270
+ }
1271
+
1272
+ function updateFirstFrameDepthChip() {
1273
+ if (chipFrameDepth) {
1274
+ chipFrameDepth.textContent = state.useFrameDepthView ? "VIEW:DEPTH" : "VIEW:DEFAULT";
1275
+ }
1276
+ }
1277
+
1278
+ function displayFirstFrameWithDepth() {
1279
+ // Determine which URL to use based on state
1280
+ let frameUrl;
1281
+ if (state.useFrameDepthView && state.hf.depthFirstFrameUrl) {
1282
+ // Check if we have a blob URL (starts with 'blob:')
1283
+ if (state.hf.depthFirstFrameUrl.startsWith('blob:')) {
1284
+ frameUrl = state.hf.depthFirstFrameUrl;
1285
+ } else {
1286
+ log("Depth first frame not ready yet. Please wait for processing to complete.", "w");
1287
+ state.useFrameDepthView = false; // Revert to default view
1288
+ updateFirstFrameDepthChip();
1289
+ frameUrl = state.hf.firstFrameUrl;
1290
+ }
1291
+ } else if (state.hf.firstFrameUrl) {
1292
+ frameUrl = state.hf.firstFrameUrl;
1293
+ } else {
1294
+ log("No first frame URL available", "w");
1295
+ return;
1296
+ }
1297
+
1298
+ if (!frameUrl) {
1299
+ log("No valid frame URL to display", "w");
1300
+ return;
1301
+ }
1302
+
1303
+ log(`Displaying ${state.useFrameDepthView ? 'depth' : 'default'} first frame`, "t");
1304
+
1305
+ // Load and display the frame
1306
+ const img = new Image();
1307
+ img.crossOrigin = "anonymous";
1308
+ img.src = frameUrl;
1309
+
1310
+ img.onload = () => {
1311
+ frameCanvas.width = img.width;
1312
+ frameCanvas.height = img.height;
1313
+ frameOverlay.width = img.width;
1314
+ frameOverlay.height = img.height;
1315
+
1316
+ const ctx = frameCanvas.getContext("2d");
1317
+ ctx.clearRect(0, 0, img.width, img.height);
1318
+ ctx.drawImage(img, 0, 0);
1319
+
1320
+ frameEmpty.style.display = "none";
1321
+ log(`✓ ${state.useFrameDepthView ? 'Depth' : 'Default'} first frame displayed (${img.width}×${img.height})`, "g");
1322
+ };
1323
+
1324
+ img.onerror = (err) => {
1325
+ console.error(`Failed to load ${state.useFrameDepthView ? 'depth' : 'default'} first frame:`, err);
1326
+ log(`✗ ${state.useFrameDepthView ? 'Depth' : 'Default'} first frame load failed - reverting to default view`, "e");
1327
+
1328
+ // If depth frame fails, revert to default
1329
+ if (state.useFrameDepthView) {
1330
+ state.useFrameDepthView = false;
1331
+ updateFirstFrameDepthChip();
1332
+ displayFirstFrameWithDepth(); // Retry with default view
1333
+ }
1334
+ };
1335
+ }
1336
+
1337
  async function startHfPipeline() {
1338
  if (state.hf.busy) {
1339
  log("HF pipeline already running");
app.py CHANGED
@@ -260,7 +260,7 @@ async def detect_async_endpoint(
260
  queries: str = Form(""),
261
  detector: str = Form("hf_yolov8"),
262
  segmenter: str = Form("sam3"),
263
- depth_estimator: str = Form("depth_pro"),
264
  ):
265
  if mode not in VALID_MODES:
266
  raise HTTPException(
 
260
  queries: str = Form(""),
261
  detector: str = Form("hf_yolov8"),
262
  segmenter: str = Form("sam3"),
263
+ depth_estimator: str = Form("depth"),
264
  ):
265
  if mode not in VALID_MODES:
266
  raise HTTPException(
demo.html CHANGED
@@ -409,8 +409,7 @@
409
  <div class="input-group">
410
  <label for="depthModel">3. Select Depth Model</label>
411
  <select id="depthModel">
412
- <option value="depth_pro">Depth Pro (Apple)</option>
413
- <option value="depth_anything">Depth Anything (LiheYoung)</option>
414
  </select>
415
  </div>
416
  </div>
 
409
  <div class="input-group">
410
  <label for="depthModel">3. Select Depth Model</label>
411
  <select id="depthModel">
412
+ <option value="depth">Depth</option>
 
413
  </select>
414
  </div>
415
  </div>
inference.py CHANGED
@@ -353,7 +353,7 @@ def run_depth_inference(
353
  input_video_path: str,
354
  output_video_path: str,
355
  max_frames: Optional[int] = None,
356
- depth_estimator_name: str = "depth_pro",
357
  first_frame_depth_path: Optional[str] = None,
358
  job_id: Optional[str] = None,
359
  ) -> str:
@@ -364,7 +364,7 @@ def run_depth_inference(
364
  input_video_path: Path to input video
365
  output_video_path: Path to write depth visualization video
366
  max_frames: Optional frame limit for testing
367
- depth_estimator_name: Depth estimator to use (default: depth_pro)
368
  first_frame_depth_path: Optional path to save the first depth visualization frame
369
  job_id: Optional job ID for cancellation support
370
 
 
353
  input_video_path: str,
354
  output_video_path: str,
355
  max_frames: Optional[int] = None,
356
+ depth_estimator_name: str = "depth",
357
  first_frame_depth_path: Optional[str] = None,
358
  job_id: Optional[str] = None,
359
  ) -> str:
 
364
  input_video_path: Path to input video
365
  output_video_path: Path to write depth visualization video
366
  max_frames: Optional frame limit for testing
367
+ depth_estimator_name: Depth estimator to use (default: depth)
368
  first_frame_depth_path: Optional path to save the first depth visualization frame
369
  job_id: Optional job ID for cancellation support
370
 
jobs/models.py CHANGED
@@ -27,7 +27,7 @@ class JobInfo:
27
  error: Optional[str] = None
28
  first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
29
  # Depth estimation fields
30
- depth_estimator_name: str = "depth_pro"
31
  depth_output_path: Optional[str] = None
32
  first_frame_depth_path: Optional[str] = None
33
  partial_success: bool = False # True if one component failed but job completed
 
27
  error: Optional[str] = None
28
  first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
29
  # Depth estimation fields
30
+ depth_estimator_name: str = "depth"
31
  depth_output_path: Optional[str] = None
32
  first_frame_depth_path: Optional[str] = None
33
  partial_success: bool = False # True if one component failed but job completed
models/depth_estimators/__init__.py CHANGED
@@ -1,13 +1,11 @@
1
  """Depth estimation models for video processing."""
2
 
3
  from .base import DepthEstimator, DepthResult
4
- from .depth_pro import DepthProEstimator
5
  from .model_loader import list_depth_estimators, load_depth_estimator
6
 
7
  __all__ = [
8
  "DepthEstimator",
9
  "DepthResult",
10
- "DepthProEstimator",
11
  "load_depth_estimator",
12
  "list_depth_estimators",
13
  ]
 
1
  """Depth estimation models for video processing."""
2
 
3
  from .base import DepthEstimator, DepthResult
 
4
  from .model_loader import list_depth_estimators, load_depth_estimator
5
 
6
  __all__ = [
7
  "DepthEstimator",
8
  "DepthResult",
 
9
  "load_depth_estimator",
10
  "list_depth_estimators",
11
  ]
models/depth_estimators/depth_anything_v2.py CHANGED
@@ -11,7 +11,7 @@ from .base import DepthEstimator, DepthResult
11
  class DepthAnythingV2Estimator(DepthEstimator):
12
  """Depth-Anything depth estimator (Transformers-compatible)."""
13
 
14
- name = "depth_anything_v2"
15
 
16
  def __init__(self) -> None:
17
  logging.info("Loading Depth-Anything model from Hugging Face (transformers)...")
 
11
  class DepthAnythingV2Estimator(DepthEstimator):
12
  """Depth-Anything depth estimator (Transformers-compatible)."""
13
 
14
+ name = "depth"
15
 
16
  def __init__(self) -> None:
17
  logging.info("Loading Depth-Anything model from Hugging Face (transformers)...")
models/depth_estimators/depth_pro.py DELETED
@@ -1,132 +0,0 @@
1
- import logging
2
-
3
- import numpy as np
4
- import torch
5
- from PIL import Image
6
-
7
- from .base import DepthEstimator, DepthResult
8
-
9
-
10
- class DepthProEstimator(DepthEstimator):
11
- """Apple Depth Pro depth estimator using Hugging Face transformers."""
12
-
13
- name = "depth_pro"
14
-
15
- def __init__(self):
16
- """Initialize Depth Pro model from Hugging Face."""
17
- try:
18
- from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation
19
- except ImportError as exc:
20
- raise ImportError(
21
- "transformers package not installed or doesn't include DepthPro. "
22
- "Update with: pip install transformers --upgrade"
23
- ) from exc
24
-
25
- logging.info("Loading Depth Pro model from Hugging Face...")
26
-
27
- # Set device
28
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
29
-
30
- # Load model and processor
31
- model_id = "apple/DepthPro-hf"
32
- self.image_processor = DepthProImageProcessorFast.from_pretrained(model_id)
33
- self.model = DepthProForDepthEstimation.from_pretrained(model_id).to(self.device)
34
- self.model.eval()
35
-
36
- if torch.cuda.is_available():
37
- logging.info("Depth Pro model loaded on GPU")
38
- else:
39
- logging.warning("Depth Pro model loaded on CPU (no CUDA available)")
40
-
41
- def predict(self, frame: np.ndarray) -> DepthResult:
42
- """
43
- Run depth estimation on a single frame.
44
-
45
- Args:
46
- frame: HxWx3 BGR uint8 numpy array (OpenCV format)
47
-
48
- Returns:
49
- DepthResult with depth_map (HxW float32 in meters) and focal_length
50
- """
51
- try:
52
- # Convert BGR to RGB
53
- rgb_frame = frame[:, :, ::-1] # BGR → RGB
54
-
55
- # Convert to PIL Image
56
- pil_image = Image.fromarray(rgb_frame)
57
- height, width = pil_image.height, pil_image.width
58
-
59
- # Preprocess image
60
- inputs = self.image_processor(images=pil_image, return_tensors="pt").to(self.device)
61
-
62
- # Run inference (no gradient needed)
63
- with torch.no_grad():
64
- outputs = self.model(**inputs)
65
-
66
- # Debug: Inspect output structure
67
- logging.debug(f"Model outputs type: {type(outputs)}")
68
- logging.debug(f"Model outputs keys: {outputs.keys() if hasattr(outputs, 'keys') else 'N/A'}")
69
-
70
- # Get raw depth prediction - the shape varies by model
71
- raw_depth = outputs.predicted_depth
72
-
73
- # Log the actual shape for debugging
74
- logging.info(f"Raw depth shape: {raw_depth.shape}, dtype: {raw_depth.dtype}")
75
-
76
- # Ensure we have a 4D tensor [B, C, H, W]
77
- if raw_depth.dim() == 2:
78
- # [H, W] -> [1, 1, H, W]
79
- raw_depth = raw_depth.unsqueeze(0).unsqueeze(0)
80
- elif raw_depth.dim() == 3:
81
- # [B, H, W] or [C, H, W] -> [1, 1, H, W]
82
- raw_depth = raw_depth.unsqueeze(1) if raw_depth.shape[0] == 1 else raw_depth.unsqueeze(0)
83
- elif raw_depth.dim() == 1:
84
- # This is unexpected - possibly a flattened output
85
- # Try to reshape based on expected output size
86
- expected_size = 1536 # Model's default output size
87
- raw_depth = raw_depth.reshape(1, 1, expected_size, expected_size)
88
-
89
- # Now resize to target size
90
- if raw_depth.shape[-2:] != (height, width):
91
- import torch.nn.functional as F
92
- raw_depth = F.interpolate(
93
- raw_depth,
94
- size=(height, width),
95
- mode='bilinear',
96
- align_corners=False
97
- )
98
-
99
- # Convert to numpy and remove batch/channel dims
100
- depth_map = raw_depth.squeeze().cpu().numpy() # Shape: [H, W]
101
-
102
- # Get focal length from outputs if available
103
- if hasattr(outputs, 'fov_deg') and outputs.fov_deg is not None:
104
- # Convert field of view to focal length
105
- fov_rad = float(outputs.fov_deg) * np.pi / 180.0
106
- focal_length = float(width / (2.0 * np.tan(fov_rad / 2.0)))
107
- else:
108
- focal_length = 1.0
109
-
110
- # Debug: Check for NaN values
111
- if np.isnan(depth_map).any():
112
- nan_count = np.isnan(depth_map).sum()
113
- total = depth_map.size
114
- logging.warning(
115
- f"Depth map contains {nan_count}/{total} ({100*nan_count/total:.1f}%) NaN values"
116
- )
117
- logging.warning(f"Depth map shape: {depth_map.shape}, dtype: {depth_map.dtype}")
118
- valid_depths = depth_map[np.isfinite(depth_map)]
119
- if len(valid_depths) > 0:
120
- logging.warning(
121
- f"Valid depth range: {valid_depths.min():.4f} - {valid_depths.max():.4f}"
122
- )
123
-
124
- return DepthResult(depth_map=depth_map, focal_length=focal_length)
125
-
126
- except Exception as e:
127
- logging.error(f"Depth estimation failed: {e}")
128
- logging.error(f"Frame shape: {frame.shape}")
129
- # Return a blank depth map as fallback
130
- h, w = frame.shape[:2]
131
- depth_map = np.zeros((h, w), dtype=np.float32)
132
- return DepthResult(depth_map=depth_map, focal_length=1.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/depth_estimators/model_loader.py CHANGED
@@ -5,14 +5,11 @@ from typing import Callable, Dict
5
 
6
  from .base import DepthEstimator
7
  from .depth_anything_v2 import DepthAnythingV2Estimator
8
- from .depth_pro import DepthProEstimator
9
 
10
 
11
  # Registry of depth estimators
12
  _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
13
- "depth_anything": DepthAnythingV2Estimator,
14
- "depth_anything_v2": DepthAnythingV2Estimator,
15
- "depth_pro": DepthProEstimator,
16
  }
17
 
18
 
@@ -22,7 +19,7 @@ def _get_cached_depth_estimator(name: str) -> DepthEstimator:
22
  Create and cache depth estimator instance.
23
 
24
  Args:
25
- name: Depth estimator name (e.g., "depth_pro")
26
 
27
  Returns:
28
  Depth estimator instance
@@ -52,12 +49,12 @@ def _create_depth_estimator(name: str) -> DepthEstimator:
52
  return estimator_class()
53
 
54
 
55
- def load_depth_estimator(name: str = "depth_pro") -> DepthEstimator:
56
  """
57
  Load depth estimator by name (with caching).
58
 
59
  Args:
60
- name: Depth estimator name (default: "depth_pro")
61
 
62
  Returns:
63
  Cached depth estimator instance
 
5
 
6
  from .base import DepthEstimator
7
  from .depth_anything_v2 import DepthAnythingV2Estimator
 
8
 
9
 
10
  # Registry of depth estimators
11
  _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
12
+ "depth": DepthAnythingV2Estimator,
 
 
13
  }
14
 
15
 
 
19
  Create and cache depth estimator instance.
20
 
21
  Args:
22
+ name: Depth estimator name (e.g., "depth")
23
 
24
  Returns:
25
  Depth estimator instance
 
49
  return estimator_class()
50
 
51
 
52
+ def load_depth_estimator(name: str = "depth") -> DepthEstimator:
53
  """
54
  Load depth estimator by name (with caching).
55
 
56
  Args:
57
+ name: Depth estimator name (default: "depth")
58
 
59
  Returns:
60
  Cached depth estimator instance