import base64
import io
import json

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForDepthEstimation


class EndpointHandler:
    """Inference endpoint handler for relative depth estimation.

    Loads an ``AutoModelForDepthEstimation`` checkpoint from *path* and, per
    request, decodes the incoming image payload, runs the model, and returns
    the predicted depth map both as a min-max-normalized grayscale PNG and as
    raw float16 values (each base64-encoded).
    """

    def __init__(self, path=""):
        # Prefer GPU when available; the model is moved there once at startup.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = AutoImageProcessor.from_pretrained(path)
        self.model = AutoModelForDepthEstimation.from_pretrained(path)
        self.model.to(self.device)
        self.model.eval()

    def _coerce_to_image_bytes(self, obj):
        """Normalize the various request shapes the toolkit may pass into raw image bytes.

        Accepts:
            - bytes/bytearray: raw image bytes, or UTF-8 JSON bytes wrapping
              ``{"inputs": ...}``
            - str: a base64 string (optionally a ``data:`` URL), a JSON string
              containing an ``"inputs"`` key, or plain text (last-resort fallback)
            - dict: expects ``obj["inputs"]``, which may itself be any of the above

        Returns:
            bytes: the decoded image payload.

        Raises:
            ValueError: for an unsupported type, or a dict missing ``"inputs"``.
        """
        # Dict payload: unwrap the "inputs" key and recurse.
        if isinstance(obj, dict):
            if "inputs" not in obj:
                raise ValueError(f'Missing "inputs" key. Keys={list(obj.keys())}')
            return self._coerce_to_image_bytes(obj["inputs"])

        # Raw bytes: the body is sometimes JSON bytes; sniff and recurse if so.
        if isinstance(obj, (bytes, bytearray)):
            b = bytes(obj)
            try:
                txt = b.decode("utf-8")
                if txt.lstrip().startswith("{") and '"inputs"' in txt:
                    return self._coerce_to_image_bytes(json.loads(txt))
            except Exception:
                # Not valid UTF-8 / JSON -- treat as raw image bytes below.
                pass
            return b

        # String payload.
        if isinstance(obj, str):
            s = obj.strip()
            # Sometimes it's a JSON string wrapping the real payload.
            if s.startswith("{") and '"inputs"' in s:
                try:
                    return self._coerce_to_image_bytes(json.loads(s))
                except Exception:
                    pass
            # Strip a data-URL prefix ("data:image/png;base64,....") so the
            # lenient decode below only sees the encoded payload; otherwise
            # validate=False silently drops the prefix's non-alphabet chars
            # and corrupts the leading image bytes.
            if s.startswith("data:") and "base64," in s:
                s = s.split("base64,", 1)[1]
            # Most common case: base64 string of image bytes.
            try:
                return base64.b64decode(s, validate=False)
            except Exception:
                # Last resort: treat as UTF-8 bytes (won't be a valid image,
                # but avoids a str -> BytesIO crash downstream).
                return s.encode("utf-8")

        raise ValueError(f"Unsupported request type: {type(obj)}")

    def __call__(self, data):
        """Run depth estimation on one request and return a JSON-serializable dict."""
        image_bytes = self._coerce_to_image_bytes(data)  # guaranteed bytes now
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        orig_w, orig_h = image.size

        inputs_t = self.processor(images=image, return_tensors="pt")
        inputs_t = {k: v.to(self.device) for k, v in inputs_t.items()}

        with torch.no_grad():
            outputs = self.model(**inputs_t)
            predicted_depth = outputs.predicted_depth  # [B, H, W]

        # Upsample the prediction back to the original image resolution.
        depth = predicted_depth.unsqueeze(1)  # [B, 1, H, W]
        depth = F.interpolate(
            depth, size=(orig_h, orig_w), mode="bicubic", align_corners=False
        )
        depth = depth.squeeze(1).squeeze(0)  # [H, W]; assumes batch size 1
        depth_np = depth.detach().float().cpu().numpy()

        # 8-bit grayscale visualization PNG (min-max normalized; the clip
        # guards against bicubic overshoot at the extremes).
        dmin, dmax = float(depth_np.min()), float(depth_np.max())
        denom = (dmax - dmin) if (dmax - dmin) > 1e-12 else 1.0
        depth_uint8 = (((depth_np - dmin) / denom) * 255.0).clip(0, 255).astype(np.uint8)
        depth_img = Image.fromarray(depth_uint8, mode="L")
        buf = io.BytesIO()
        depth_img.save(buf, format="PNG")
        depth_png_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")

        # Raw float16 depth for clients that want the actual values.
        depth_f16 = depth_np.astype(np.float16)
        depth_raw_base64_f16 = base64.b64encode(depth_f16.tobytes()).decode("utf-8")

        return {
            "type": "relative_depth",
            "width": orig_w,
            "height": orig_h,
            "depth_png_base64": depth_png_base64,
            "depth_raw_base64_f16": depth_raw_base64_f16,
            "raw_dtype": "float16",
            "raw_shape": [orig_h, orig_w],
            "viz_min": dmin,
            "viz_max": dmax,
        }