Spaces:
Sleeping
Sleeping
Zhen Ye
committed on
Commit
·
18ba97a
1
Parent(s):
91f3b56
fixed shape mismatch
Browse files
models/depth_estimators/depth_pro.py
CHANGED
|
@@ -48,56 +48,85 @@ class DepthProEstimator(DepthEstimator):
|
|
| 48 |
Returns:
|
| 49 |
DepthResult with depth_map (HxW float32 in meters) and focal_length
|
| 50 |
"""
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
raw_depth =
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
valid_depths = depth_map[np.isfinite(depth_map)]
|
| 98 |
-
if len(valid_depths) > 0:
|
| 99 |
-
logging.warning(
|
| 100 |
-
f"Valid depth range: {valid_depths.min():.4f} - {valid_depths.max():.4f}"
|
| 101 |
)
|
| 102 |
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
Returns:
|
| 49 |
DepthResult with depth_map (HxW float32 in meters) and focal_length
|
| 50 |
"""
|
| 51 |
+
try:
|
| 52 |
+
# Convert BGR to RGB
|
| 53 |
+
rgb_frame = frame[:, :, ::-1] # BGR → RGB
|
| 54 |
+
|
| 55 |
+
# Convert to PIL Image
|
| 56 |
+
pil_image = Image.fromarray(rgb_frame)
|
| 57 |
+
height, width = pil_image.height, pil_image.width
|
| 58 |
+
|
| 59 |
+
# Preprocess image
|
| 60 |
+
inputs = self.image_processor(images=pil_image, return_tensors="pt").to(self.device)
|
| 61 |
+
|
| 62 |
+
# Run inference (no gradient needed)
|
| 63 |
+
with torch.no_grad():
|
| 64 |
+
outputs = self.model(**inputs)
|
| 65 |
+
|
| 66 |
+
# Debug: Inspect output structure
|
| 67 |
+
logging.debug(f"Model outputs type: {type(outputs)}")
|
| 68 |
+
logging.debug(f"Model outputs keys: {outputs.keys() if hasattr(outputs, 'keys') else 'N/A'}")
|
| 69 |
+
|
| 70 |
+
# Get raw depth prediction - the shape varies by model
|
| 71 |
+
raw_depth = outputs.predicted_depth
|
| 72 |
+
|
| 73 |
+
# Log the actual shape for debugging
|
| 74 |
+
logging.info(f"Raw depth shape: {raw_depth.shape}, dtype: {raw_depth.dtype}")
|
| 75 |
+
|
| 76 |
+
# Ensure we have a 4D tensor [B, C, H, W]
|
| 77 |
+
if raw_depth.dim() == 2:
|
| 78 |
+
# [H, W] -> [1, 1, H, W]
|
| 79 |
+
raw_depth = raw_depth.unsqueeze(0).unsqueeze(0)
|
| 80 |
+
elif raw_depth.dim() == 3:
|
| 81 |
+
# [B, H, W] or [C, H, W] -> [1, 1, H, W]
|
| 82 |
+
raw_depth = raw_depth.unsqueeze(1) if raw_depth.shape[0] == 1 else raw_depth.unsqueeze(0)
|
| 83 |
+
elif raw_depth.dim() == 1:
|
| 84 |
+
# This is unexpected - possibly a flattened output
|
| 85 |
+
# Try to reshape based on expected output size
|
| 86 |
+
expected_size = 1536 # Model's default output size
|
| 87 |
+
raw_depth = raw_depth.reshape(1, 1, expected_size, expected_size)
|
| 88 |
+
|
| 89 |
+
# Now resize to target size
|
| 90 |
+
if raw_depth.shape[-2:] != (height, width):
|
| 91 |
+
import torch.nn.functional as F
|
| 92 |
+
raw_depth = F.interpolate(
|
| 93 |
+
raw_depth,
|
| 94 |
+
size=(height, width),
|
| 95 |
+
mode='bilinear',
|
| 96 |
+
align_corners=False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
)
|
| 98 |
|
| 99 |
+
# Convert to numpy and remove batch/channel dims
|
| 100 |
+
depth_map = raw_depth.squeeze().cpu().numpy() # Shape: [H, W]
|
| 101 |
+
|
| 102 |
+
# Get focal length from outputs if available
|
| 103 |
+
if hasattr(outputs, 'fov_deg') and outputs.fov_deg is not None:
|
| 104 |
+
# Convert field of view to focal length
|
| 105 |
+
fov_rad = float(outputs.fov_deg) * np.pi / 180.0
|
| 106 |
+
focal_length = float(width / (2.0 * np.tan(fov_rad / 2.0)))
|
| 107 |
+
else:
|
| 108 |
+
focal_length = 1.0
|
| 109 |
+
|
| 110 |
+
# Debug: Check for NaN values
|
| 111 |
+
if np.isnan(depth_map).any():
|
| 112 |
+
nan_count = np.isnan(depth_map).sum()
|
| 113 |
+
total = depth_map.size
|
| 114 |
+
logging.warning(
|
| 115 |
+
f"Depth map contains {nan_count}/{total} ({100*nan_count/total:.1f}%) NaN values"
|
| 116 |
+
)
|
| 117 |
+
logging.warning(f"Depth map shape: {depth_map.shape}, dtype: {depth_map.dtype}")
|
| 118 |
+
valid_depths = depth_map[np.isfinite(depth_map)]
|
| 119 |
+
if len(valid_depths) > 0:
|
| 120 |
+
logging.warning(
|
| 121 |
+
f"Valid depth range: {valid_depths.min():.4f} - {valid_depths.max():.4f}"
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
return DepthResult(depth_map=depth_map, focal_length=focal_length)
|
| 125 |
+
|
| 126 |
+
except Exception as e:
|
| 127 |
+
logging.error(f"Depth estimation failed: {e}")
|
| 128 |
+
logging.error(f"Frame shape: {frame.shape}")
|
| 129 |
+
# Return a blank depth map as fallback
|
| 130 |
+
h, w = frame.shape[:2]
|
| 131 |
+
depth_map = np.zeros((h, w), dtype=np.float32)
|
| 132 |
+
return DepthResult(depth_map=depth_map, focal_length=1.0)
|