File size: 3,370 Bytes
e561725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Custom handler for Hugging Face Inference Endpoints.
Serves the Depth Anything V3 Metric Large model for depth estimation.
"""

import base64
import io
from typing import Any

import numpy as np
import torch
from PIL import Image


class EndpointHandler:
    """Depth-estimation handler for HF Inference Endpoints.

    Wraps the Depth Anything V3 Metric Large model: decodes base64 image
    payloads, runs inference, and returns base64-encoded float32 numpy
    arrays (depth, confidence, intrinsics, extrinsics).
    """

    def __init__(self, path: str = ""):
        """
        Initialize the depth estimation model.

        Args:
            path: Path to the model directory (provided by HF Inference
                Endpoints). When non-empty, weights are loaded from this
                local directory; otherwise they are fetched from the Hub
                repo "depth-anything/da3metric-large".
        """
        # Local import: the package is only available inside the endpoint image.
        from depth_anything_3.api import DepthAnything3

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Bug fix: `path` was previously ignored, forcing a Hub download on
        # every cold start even though the endpoint ships the weights locally.
        source = path or "depth-anything/da3metric-large"
        self.model = DepthAnything3.from_pretrained(source)
        self.model = self.model.to(device=self.device)
        # Disable dropout / batch-norm training behavior for inference.
        # NOTE(review): assumes DepthAnything3 is an nn.Module (it already
        # supports `.to(device=...)` above) — confirm against the package API.
        self.model.eval()

    def __call__(self, data: dict[str, Any]) -> dict[str, Any]:
        """
        Process incoming requests for depth estimation.

        Args:
            data: Request payload with 'inputs' containing either a base64
                string, a dict with an 'image' key, or a non-empty list of
                base64 strings.

        Returns:
            Dictionary with base64-encoded depth map, confidence,
            intrinsics and extrinsics arrays plus their shapes and the
            depth min/max range — or {"error": ...} on bad input.
        """
        inputs = data.get("inputs")

        try:
            if isinstance(inputs, str):
                # Single base64 image.
                images = [self._decode_image(inputs)]
            elif isinstance(inputs, dict) and "image" in inputs:
                # Dict with an "image" key.
                images = [self._decode_image(inputs["image"])]
            elif isinstance(inputs, list) and inputs:
                # Non-empty list of base64 images (an empty list would
                # otherwise reach the model and fail obscurely).
                images = [self._decode_image(img_b64) for img_b64 in inputs]
            else:
                return {"error": "Invalid input format. Expected base64 encoded image(s)."}
        except Exception as exc:
            # Malformed base64 / corrupt image data: report as a structured
            # error instead of crashing the endpoint with an HTTP 500.
            return {"error": f"Failed to decode input image(s): {exc}"}

        # Run inference without autograd bookkeeping.
        with torch.inference_mode():
            prediction = self.model.inference(images)

        # Move results to CPU numpy for serialization.
        depth = prediction.depth.cpu().numpy()  # [N, H, W]
        conf = prediction.conf.cpu().numpy()    # [N, H, W]
        intrinsics = prediction.intrinsics.cpu().numpy()  # [N, 3, 3]
        extrinsics = prediction.extrinsics.cpu().numpy()  # [N, 3, 4]

        # Return base64-encoded numpy arrays; shapes are included so the
        # client can sanity-check without decoding.
        return {
            "depth": self._encode_array(depth),
            "confidence": self._encode_array(conf),
            "intrinsics": self._encode_array(intrinsics),
            "extrinsics": self._encode_array(extrinsics),
            "shape": {
                "depth": list(depth.shape),
                "confidence": list(conf.shape),
                "intrinsics": list(intrinsics.shape),
                "extrinsics": list(extrinsics.shape),
            },
            "depth_range": {
                "min": float(depth.min()),
                "max": float(depth.max()),
            },
        }

    def _decode_image(self, img_b64: str) -> "Image.Image":
        """Decode one base64 string into an RGB PIL image."""
        image_data = base64.b64decode(img_b64)
        return Image.open(io.BytesIO(image_data)).convert("RGB")

    def _encode_array(self, arr: np.ndarray) -> str:
        """Encode a numpy array as a base64 string (.npy format, float32)."""
        buffer = io.BytesIO()
        np.save(buffer, arr.astype(np.float32))
        return base64.b64encode(buffer.getvalue()).decode("utf-8")