"""
Depth estimation using Depth Anything V2 Small.

Loads the model via the HuggingFace transformers depth-estimation pipeline.
Returns uint8 depth maps normalised to 0-255 and resized to match the input.
"""

import numpy as np
import torch
from PIL import Image
from transformers import pipeline

from ..config import DEPTH_MODEL


class DepthEstimator:
    """Thin wrapper that runs Depth Anything V2 Small through a HF pipeline."""

    def __init__(self) -> None:
        """Build the depth-estimation pipeline on GPU when one is available.

        Falls back to CPU (with float32 weights) when CUDA is not available.
        """
        print("Loading Depth Anything V2 Small...")

        # Query CUDA once and derive every device-dependent setting from it.
        use_cuda = torch.cuda.is_available()

        # pipeline() expects a plain integer device index: 0 selects the first
        # CUDA GPU and -1 selects the CPU. The device_map={"": "cuda"} form
        # belongs to from_pretrained; passing it here was observed to leave
        # the pipeline's own device at -1 and trigger a device mismatch when
        # inputs are moved.
        if use_cuda:
            device_index = 0
            weights_dtype = torch.float16  # half precision on GPU
        else:
            device_index = -1
            weights_dtype = torch.float32  # full precision on CPU

        self.pipe = pipeline(
            task="depth-estimation",
            model=DEPTH_MODEL,
            device=device_index,
            torch_dtype=weights_dtype,
        )

        if use_cuda:
            allocated_mb = torch.cuda.memory_allocated() / 1024**2
            print(f" GPU memory allocated: {allocated_mb:.0f} MB")

    def estimate_depth(self, image: np.ndarray) -> np.ndarray:
        """Compute a min-max normalised depth map for one RGB frame.

        Args:
            image: uint8 RGB numpy array of shape (H, W, 3).

        Returns:
            uint8 numpy array of shape (H, W) with values in [0, 255].
            Higher values indicate objects closer to the camera.
        """
        height, width = image.shape[:2]
        frame = Image.fromarray(image)

        # No autograd bookkeeping is needed for a forward-only pass.
        with torch.inference_mode():
            prediction = self.pipe(frame)

        # The pipeline output is indexed like a dict; its "depth" entry is
        # used here as a PIL image.
        # NOTE(review): the original comment said this image is typically
        # mode "I" or "F"; recent transformers releases appear to return it
        # already quantised to 8-bit - confirm against the installed version.
        raw_depth: Image.Image = prediction["depth"]

        # Resize back to the input resolution *before* normalising, so the
        # bilinear filter interpolates the pipeline's own depth values.
        # PIL takes the target size as (width, height).
        resized = raw_depth.resize((width, height), Image.BILINEAR)
        depth = np.array(resized, dtype=np.float32)

        lowest = float(depth.min())
        highest = float(depth.max())

        # A perfectly flat map (max == min) would divide by zero; fall back
        # to a span of 1 so the output is simply all zeros in that case.
        if highest > lowest:
            span = highest - lowest
        else:
            span = 1.0

        shifted = depth - lowest
        scaled = shifted / span * 255
        return scaled.astype(np.uint8)