Create handler.py
handler.py +81 -0
handler.py
ADDED
@@ -0,0 +1,81 @@
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
from PIL import Image
import torch
import torch.nn.functional as F
import io
import base64
import numpy as np


class EndpointHandler:
    def __init__(self, path=""):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load processor + model from the *endpoint repo*
        self.processor = AutoImageProcessor.from_pretrained(path)
        self.model = AutoModelForDepthEstimation.from_pretrained(path)
        self.model.to(self.device)
        self.model.eval()

    def __call__(self, data):
        """
        Expected request body: raw image bytes (recommended)
        Hugging Face Endpoints typically pass:
            data["inputs"] -> bytes
        """
        image_bytes = data.get("inputs", None)
        if image_bytes is None:
            raise ValueError('Missing "inputs". Send raw image bytes as the request body.')

        # Load image
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        orig_w, orig_h = image.size

        # Preprocess
        inputs = self.processor(images=image, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Inference
        with torch.no_grad():
            outputs = self.model(**inputs)
            predicted_depth = outputs.predicted_depth  # shape: [B, H, W] (or similar)

        # Upsample depth to original image size (as in the docs)
        # Make it [B,1,H,W] for interpolate
        depth = predicted_depth.unsqueeze(1)
        depth = F.interpolate(
            depth,
            size=(orig_h, orig_w),
            mode="bicubic",
            align_corners=False,
        )
        depth = depth.squeeze(1).squeeze(0)  # [H, W]
        depth_np = depth.detach().float().cpu().numpy()

        # ---- Make a nice visualization PNG (0..255) ----
        dmin, dmax = float(depth_np.min()), float(depth_np.max())
        denom = (dmax - dmin) if (dmax - dmin) > 1e-12 else 1.0
        depth_norm = (depth_np - dmin) / denom
        depth_uint8 = (depth_norm * 255.0).clip(0, 255).astype(np.uint8)

        depth_img = Image.fromarray(depth_uint8, mode="L")  # grayscale
        buf = io.BytesIO()
        depth_img.save(buf, format="PNG")
        depth_png_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")

        # ---- Optional: return raw depth as float16 bytes (compact) ----
        depth_f16 = depth_np.astype(np.float16)
        raw_bytes = depth_f16.tobytes()
        depth_raw_base64_f16 = base64.b64encode(raw_bytes).decode("utf-8")

        return {
            "type": "relative_depth",
            "width": orig_w,
            "height": orig_h,
            "depth_png_base64": depth_png_base64,
            "depth_raw_base64_f16": depth_raw_base64_f16,
            "raw_dtype": "float16",
            "raw_shape": [orig_h, orig_w],
            "viz_min": dmin,
            "viz_max": dmax,
        }
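For reference, a minimal client sketch for calling this handler once the endpoint is deployed. The endpoint URL, token, and input filename below are placeholders (assumptions, not part of this commit); it POSTs raw image bytes, as the docstring expects, and decodes both return formats:

    import base64
    import io

    import numpy as np
    import requests
    from PIL import Image

    ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
    HF_TOKEN = "hf_..."  # placeholder

    # Send raw image bytes as the request body (data["inputs"] on the handler side).
    with open("input.jpg", "rb") as f:
        image_bytes = f.read()

    resp = requests.post(
        ENDPOINT_URL,
        headers={
            "Authorization": f"Bearer {HF_TOKEN}",
            "Content-Type": "image/jpeg",
        },
        data=image_bytes,
    )
    resp.raise_for_status()
    result = resp.json()

    # Decode the normalized grayscale visualization PNG.
    viz = Image.open(io.BytesIO(base64.b64decode(result["depth_png_base64"])))
    viz.save("depth_viz.png")

    # Decode the raw float16 buffer back into an [H, W] depth array.
    raw = base64.b64decode(result["depth_raw_base64_f16"])
    depth = np.frombuffer(raw, dtype=np.float16).reshape(result["raw_shape"])

    print(depth.shape, float(depth.min()), float(depth.max()))

The PNG is only a min-max-normalized visualization; for downstream processing, decode the float16 buffer, which carries the actual relative-depth values at 2 bytes per pixel (`viz_min`/`viz_max` give the range used for the PNG).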