depth-anything-v2-large-api

@@ -5,76 +5,97 @@ import torch.nn.functional as F
 import io
 import base64
 import numpy as np
 class EndpointHandler:
     def __init__(self, path=""):
         """
         Load the image processor and depth-estimation model from ``path``.

         The model is moved to CUDA when ``torch.cuda.is_available()`` reports
         a device (otherwise it stays on the CPU) and is put into eval mode.

         Args:
             path: Passed unchanged to both ``from_pretrained`` calls
                 (presumably the model directory supplied by the serving
                 toolkit -- confirm against the caller).
         """
         # Prefer the GPU when torch can see one.
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.processor = AutoImageProcessor.from_pretrained(path)
         self.model = AutoModelForDepthEstimation.from_pretrained(path)
         self.model.to(self.device)
         # Inference only: switch the module to evaluation mode.
         self.model.eval()
-    def __call__(self, data):
         """
-        Supports both common endpoint input styles:
-          1) JSON: {"inputs": "<base64-encoded image bytes>"}  (recommended)
-          2) Raw bytes passed through as inputs (fallback)
         """
-        inputs = data.get("inputs", None)
-        if inputs is None:
-            raise ValueError('Missing "inputs". Send JSON {"inputs": "<base64>"} or raw bytes.')
-        # Decode inputs -> image_bytes
-        if isinstance(inputs, str):
-            # JSON base64 string
             try:
-                image_bytes = base64.b64decode(inputs)
-            except Exception as e:
-                raise ValueError(f'Failed to base64-decode "inputs" string: {e}')
-        elif isinstance(inputs, (bytes, bytearray)):
-            # raw bytes
-            image_bytes = bytes(inputs)
-        else:
-            raise ValueError(f'Unsupported inputs type: {type(inputs)}')
-        # Load image
         image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
         orig_w, orig_h = image.size
-        # Preprocess
         inputs_t = self.processor(images=image, return_tensors="pt")
         inputs_t = {k: v.to(self.device) for k, v in inputs_t.items()}
-        # Inference
         with torch.no_grad():
             outputs = self.model(**inputs_t)
             predicted_depth = outputs.predicted_depth  # [B, H, W]
-        # Upsample to original image size
         depth = predicted_depth.unsqueeze(1)  # [B,1,H,W]
         depth = F.interpolate(
-            depth,
-            size=(orig_h, orig_w),
-            mode="bicubic",
-            align_corners=False,
         )
         depth = depth.squeeze(1).squeeze(0)  # [H,W]
         depth_np = depth.detach().float().cpu().numpy()
-        # Visualization (0..255 grayscale)
         dmin, dmax = float(depth_np.min()), float(depth_np.max())
         denom = (dmax - dmin) if (dmax - dmin) > 1e-12 else 1.0
-        depth_norm = (depth_np - dmin) / denom
-        depth_uint8 = (depth_norm * 255.0).clip(0, 255).astype(np.uint8)
         depth_img = Image.fromarray(depth_uint8, mode="L")
         buf = io.BytesIO()
         depth_img.save(buf, format="PNG")
         depth_png_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
-        # Raw float16 depth (compact) — NOTE: relative depth, not meters
         depth_f16 = depth_np.astype(np.float16)
         depth_raw_base64_f16 = base64.b64encode(depth_f16.tobytes()).decode("utf-8")

 import io
 import base64
 import numpy as np
+import json
 class EndpointHandler:
     def __init__(self, path=""):
         """
         Load the image processor and depth-estimation model from ``path``.

         The model is moved to CUDA when ``torch.cuda.is_available()`` reports
         a device (otherwise it stays on the CPU) and is put into eval mode.

         Args:
             path: Passed unchanged to both ``from_pretrained`` calls
                 (presumably the model directory supplied by the serving
                 toolkit -- confirm against the caller).
         """
         # Prefer the GPU when torch can see one.
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.processor = AutoImageProcessor.from_pretrained(path)
         self.model = AutoModelForDepthEstimation.from_pretrained(path)
         self.model.to(self.device)
         # Inference only: switch the module to evaluation mode.
         self.model.eval()
+    def _coerce_to_image_bytes(self, obj):
         """
+        Accepts:
+          - bytes/bytearray: raw image bytes
+          - str: base64 string OR JSON string containing {"inputs": "..."} OR plain text (fallback)
+          - dict: expects dict["inputs"] (which can itself be str/bytes/etc)
+        Returns:
+          - image_bytes (bytes)
         """
+        # If toolkit passes dict
+        if isinstance(obj, dict):
+            if "inputs" not in obj:
+                raise ValueError(f'Missing "inputs" key. Keys={list(obj.keys())}')
+            return self._coerce_to_image_bytes(obj["inputs"])
+        # If toolkit passes raw bytes
+        if isinstance(obj, (bytes, bytearray)):
+            b = bytes(obj)
+            # Sometimes body is JSON bytes; try parse
+            try:
+                txt = b.decode("utf-8")
+                if txt.lstrip().startswith("{") and '"inputs"' in txt:
+                    return self._coerce_to_image_bytes(json.loads(txt))
+            except Exception:
+                pass
+            return b
+        # If toolkit passes str
+        if isinstance(obj, str):
+            s = obj.strip()
+            # Sometimes it's a JSON string
+            if s.startswith("{") and '"inputs"' in s:
+                try:
+                    return self._coerce_to_image_bytes(json.loads(s))
+                except Exception:
+                    pass
+            # Most common: base64 string of image bytes
             try:
+                return base64.b64decode(s, validate=False)
+            except Exception:
+                # Last resort: treat as utf-8 bytes (won't be a valid image, but avoids str->BytesIO crash)
+                return s.encode("utf-8")
+        raise ValueError(f"Unsupported request type: {type(obj)}")
+    def __call__(self, data):
+        image_bytes = self._coerce_to_image_bytes(data)
+        # Now guaranteed bytes
         image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
         orig_w, orig_h = image.size
         inputs_t = self.processor(images=image, return_tensors="pt")
         inputs_t = {k: v.to(self.device) for k, v in inputs_t.items()}
         with torch.no_grad():
             outputs = self.model(**inputs_t)
             predicted_depth = outputs.predicted_depth  # [B, H, W]
+        # Upsample to original size
         depth = predicted_depth.unsqueeze(1)  # [B,1,H,W]
         depth = F.interpolate(
+            depth, size=(orig_h, orig_w), mode="bicubic", align_corners=False
         )
         depth = depth.squeeze(1).squeeze(0)  # [H,W]
         depth_np = depth.detach().float().cpu().numpy()
+        # viz png
         dmin, dmax = float(depth_np.min()), float(depth_np.max())
         denom = (dmax - dmin) if (dmax - dmin) > 1e-12 else 1.0
+        depth_uint8 = (((depth_np - dmin) / denom) * 255.0).clip(0, 255).astype(np.uint8)
         depth_img = Image.fromarray(depth_uint8, mode="L")
         buf = io.BytesIO()
         depth_img.save(buf, format="PNG")
         depth_png_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
+        # raw float16 depth
         depth_f16 = depth_np.astype(np.float16)
         depth_raw_base64_f16 = base64.b64encode(depth_f16.tobytes()).decode("utf-8")