"""Depth-estimation helpers built on the Hugging Face depth-estimation pipeline."""

import numpy as np
import torch
from PIL import Image

# Lazily-initialized pipeline shared across calls (the model is ~600 MB,
# so we never want to construct it more than once per process).
_depth_cache = None


def get_depth_model():
    """Return the cached depth-estimation pipeline, creating it on first use.

    Uses Apple's Depth Pro via transformers; placed on CUDA device 0 when
    available, otherwise CPU (device=-1). The `transformers` import is kept
    local so merely importing this module stays cheap.
    """
    global _depth_cache
    if _depth_cache is None:
        from transformers import pipeline as hf_pipeline

        # Depth Pro from Apple — best metric depth, ~600MB
        _depth_cache = hf_pipeline(
            "depth-estimation",
            model="apple/DepthPro-hf",
            device=0 if torch.cuda.is_available() else -1,
        )
    return _depth_cache


def estimate_depth(image: Image.Image) -> dict:
    """
    Returns {"depth": [[float]], "width": int, "height": int, "min": float, "max": float}
    Depth values are metric (meters) when Depth Pro is used.
    """
    pipe = get_depth_model()
    result = pipe(image)

    # BUG FIX: the pipeline's "depth" entry is a PIL image rescaled to 0-255
    # for visualization, which destroys the metric scale this function's
    # docstring promises. The raw metric values live in "predicted_depth"
    # (a torch tensor). Prefer it; fall back to "depth" only if absent.
    depth_map = result.get("predicted_depth", result.get("depth"))

    if isinstance(depth_map, Image.Image):
        arr = np.array(depth_map).astype(np.float32)
    elif isinstance(depth_map, torch.Tensor):
        # Drop leading batch/channel dims; move off GPU before numpy conversion
        # (np.array() on a CUDA tensor would raise).
        arr = depth_map.squeeze().detach().cpu().numpy().astype(np.float32)
    else:
        arr = np.array(depth_map, dtype=np.float32)

    # Resize to match the source image if needed. numpy shape is (H, W);
    # PIL.resize takes (W, H). Mode-"F" float32 arrays round-trip losslessly.
    if arr.shape[:2] != (image.height, image.width):
        depth_pil = Image.fromarray(arr).resize(
            (image.width, image.height), Image.BILINEAR
        )
        arr = np.array(depth_pil)

    dmin = float(arr.min())
    dmax = float(arr.max())
    return {
        "depth": arr.tolist(),
        "width": image.width,
        "height": image.height,
        "min": dmin,
        "max": dmax,
    }