import base64
import io
import json

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
class EndpointHandler:
    """Inference endpoint for relative monocular depth estimation.

    Loads a Hugging Face depth-estimation model once at startup and, per
    request, decodes the input image, predicts a relative depth map, and
    returns it both as a min/max-normalized 8-bit PNG (for visualization)
    and as the raw float16 values (for downstream use).
    """

    def __init__(self, path=""):
        # `path` is the model repo id / local directory supplied by the
        # serving runtime.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = AutoImageProcessor.from_pretrained(path)
        self.model = AutoModelForDepthEstimation.from_pretrained(path)
        self.model.to(self.device)
        self.model.eval()

    def _coerce_to_image_bytes(self, obj):
        """Normalize the many request payload shapes into raw image bytes.

        Accepts:
            - dict: expects obj["inputs"] (itself any supported shape)
            - bytes/bytearray: raw image bytes, or a UTF-8 JSON envelope
              containing an "inputs" key
            - str: JSON with "inputs", a "data:...;base64,..." URI, a
              base64 payload, or (last resort) plain text encoded as UTF-8

        Returns:
            bytes: the decoded image payload.

        Raises:
            ValueError: for unsupported types or a dict without "inputs".
        """
        if isinstance(obj, dict):
            if "inputs" not in obj:
                raise ValueError(f'Missing "inputs" key. Keys={list(obj.keys())}')
            return self._coerce_to_image_bytes(obj["inputs"])

        if isinstance(obj, (bytes, bytearray)):
            b = bytes(obj)
            # Some runtimes deliver the JSON envelope as raw bytes; unwrap
            # it. Real image bytes almost never decode as UTF-8, so they
            # fall through to the plain `return b`.
            try:
                txt = b.decode("utf-8")
                if txt.lstrip().startswith("{") and '"inputs"' in txt:
                    return self._coerce_to_image_bytes(json.loads(txt))
            except Exception:
                pass
            return b

        if isinstance(obj, str):
            s = obj.strip()

            if s.startswith("{") and '"inputs"' in s:
                try:
                    return self._coerce_to_image_bytes(json.loads(s))
                except Exception:
                    pass

            # Support data URIs ("data:image/png;base64,...."): keep only
            # the payload after the first comma.
            if s.startswith("data:") and "," in s:
                s = s.split(",", 1)[1]

            # FIX: the original used validate=False, which silently discards
            # non-alphabet characters and can "decode" arbitrary text into
            # garbage bytes, making the UTF-8 fallback below unreachable in
            # most cases. Strip whitespace first (line-wrapped base64 is
            # common) and validate strictly so non-base64 input falls
            # through cleanly to the text fallback.
            try:
                return base64.b64decode("".join(s.split()), validate=True)
            except Exception:
                return s.encode("utf-8")

        raise ValueError(f"Unsupported request type: {type(obj)}")

    def __call__(self, data):
        """Run depth estimation on one request.

        Args:
            data: request payload (dict / bytes / str — see
                _coerce_to_image_bytes for the accepted shapes).

        Returns:
            dict with the normalized depth PNG (base64), the raw float16
            depth map (base64), its dtype/shape, and the min/max values
            used for normalization.
        """
        image_bytes = self._coerce_to_image_bytes(data)
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        orig_w, orig_h = image.size

        inputs_t = self.processor(images=image, return_tensors="pt")
        inputs_t = {k: v.to(self.device) for k, v in inputs_t.items()}

        # inference_mode: like no_grad, but also skips autograd
        # version-counter bookkeeping — strictly better for pure inference.
        with torch.inference_mode():
            outputs = self.model(**inputs_t)
            predicted_depth = outputs.predicted_depth

        # predicted_depth is (batch, H', W') at the model's working
        # resolution; add a channel dim so interpolate gets NCHW, resize
        # back to the input resolution, then drop both added dims.
        depth = predicted_depth.unsqueeze(1)
        depth = F.interpolate(
            depth, size=(orig_h, orig_w), mode="bicubic", align_corners=False
        )
        depth = depth.squeeze(1).squeeze(0)
        depth_np = depth.detach().float().cpu().numpy()

        # Min/max-normalize for an 8-bit visualization; guard against a
        # constant depth map (zero range) to avoid division by zero.
        dmin, dmax = float(depth_np.min()), float(depth_np.max())
        denom = (dmax - dmin) if (dmax - dmin) > 1e-12 else 1.0
        depth_uint8 = (((depth_np - dmin) / denom) * 255.0).clip(0, 255).astype(np.uint8)

        depth_img = Image.fromarray(depth_uint8, mode="L")
        buf = io.BytesIO()
        depth_img.save(buf, format="PNG")
        depth_png_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")

        # Also ship the raw (un-normalized) depth as float16 to halve the
        # payload size while keeping adequate precision.
        depth_f16 = depth_np.astype(np.float16)
        depth_raw_base64_f16 = base64.b64encode(depth_f16.tobytes()).decode("utf-8")

        return {
            "type": "relative_depth",
            "width": orig_w,
            "height": orig_h,
            "depth_png_base64": depth_png_base64,
            "depth_raw_base64_f16": depth_raw_base64_f16,
            "raw_dtype": "float16",
            "raw_shape": [orig_h, orig_w],
            "viz_min": dmin,
            "viz_max": dmax,
        }