# DepthLens — src/models/depth.py
# Author: Rishabh Jain
# Initial upload — depth-aware scene description system (commit 5412d82)
"""
Depth estimation using Depth Anything V2 Small.
Loads the model via the HuggingFace transformers depth-estimation pipeline.
Returns uint8 depth maps normalised to 0-255 and resized to match the input.
"""
import numpy as np
import torch
from PIL import Image
from transformers import pipeline
from ..config import DEPTH_MODEL
class DepthEstimator:
    """Wrapper exposing Depth Anything V2 Small via the HF depth pipeline."""

    def __init__(self) -> None:
        """Initialise the depth-estimation pipeline on GPU if present, else CPU."""
        print("Loading Depth Anything V2 Small...")
        cuda_ok = torch.cuda.is_available()
        # pipeline() takes a plain integer device index: 0 selects the first
        # CUDA GPU and -1 forces CPU.  (device_map={"": "cuda"} belongs to
        # from_pretrained, not pipeline — passing it here would leave the
        # pipeline's internal device at -1/CPU and trigger a device mismatch
        # when input tensors are moved.)
        self.pipe = pipeline(
            task="depth-estimation",
            model=DEPTH_MODEL,
            device=0 if cuda_ok else -1,
            torch_dtype=torch.float16 if cuda_ok else torch.float32,
        )
        if cuda_ok:
            print(
                f" GPU memory allocated: "
                f"{torch.cuda.memory_allocated() / 1024**2:.0f} MB"
            )

    def estimate_depth(self, image: np.ndarray) -> np.ndarray:
        """Produce a normalised depth map for an RGB frame.

        Args:
            image: uint8 RGB numpy array of shape (H, W, 3).

        Returns:
            uint8 numpy array of shape (H, W) with values in [0, 255].
            Higher values indicate objects closer to the camera.
        """
        height, width = image.shape[:2]
        frame = Image.fromarray(image)
        with torch.inference_mode():
            prediction = self.pipe(frame)
        # The pipeline's result dict holds "depth" as a PIL Image, typically
        # mode "I" (32-bit int) or "F" (32-bit float).  Resize back to the
        # source resolution before normalising so the BILINEAR interpolation
        # operates on the native depth values.
        raw_depth: Image.Image = prediction["depth"]
        depth = np.array(
            raw_depth.resize((width, height), Image.BILINEAR), dtype=np.float32
        )
        lo = float(depth.min())
        hi = float(depth.max())
        # A perfectly flat scene would give hi == lo; clamp the span to at
        # least 1 so the division below never hits divide-by-zero.
        span = hi - lo if hi > lo else 1.0
        return ((depth - lo) / span * 255).astype(np.uint8)