"""GYF GPU serving lane — HF ZeroGPU Space (free-tier GPU per doctrine D7).

Serves the fashion encoder's GPU-heavy embedding as a tiny JSON API the local
GYF stack calls through ``perception.remote.RemoteEncoder``. Only the forward
pass runs here; retrieval scoring, ranking, and the M2 bake-off stay on the
caller's CPU, so this one small Space backs every GPU need (M2 embeddings, plus
the M4 skin-tone and M3 body-type photo endpoints, each its own ``@spaces.GPU``
function) without the catalog ever leaving the local machine.

``spaces`` must be imported before torch so ZeroGPU can intercept CUDA init; the
model is loaded on CPU and moved to ``cuda`` *inside* the GPU-decorated function,
which is how ZeroGPU allocates a GPU per request and releases it after.
"""

from __future__ import annotations

import base64
import io
from functools import lru_cache

# ``spaces`` has to come before anything that imports torch so ZeroGPU can
# intercept CUDA init; open_clip imports torch, so alphabetical order is not
# safe here.
import spaces  # isort: skip

import gradio as gr
import numpy as np
import open_clip
import torch
from PIL import Image

# Models the lane is allowed to serve. Mirrors the GYF model registry's `encoder`
# capability (incumbent + the M2 research candidates). Keep in sync with
# models.registry.json — only commercial-clean (Apache-2.0) weights belong here.
# `_load` tests membership against this set (exact string match) before it
# touches open_clip, so an id that is not listed verbatim is rejected.
ALLOWED_MODELS = {
    "hf://Marqo/marqo-fashionSigLIP",  # production incumbent
    "hf-hub:Marqo/marqo-fashionSigLIP",  # open_clip-prefixed alias (config default)
    "hf-hub:timm/ViT-B-16-SigLIP2",  # M2 research candidate
    "hf-hub:timm/ViT-SO400M-16-SigLIP2-384",  # M2 research candidate
}


def _l2_normalize(x: np.ndarray) -> np.ndarray:
    """Scale every vector along the last axis of ``x`` to unit Euclidean length.

    The divisor is floored at 1e-12, so an all-zero vector comes back as zeros
    instead of producing a division by zero.
    """
    lengths = np.linalg.norm(x, axis=-1, keepdims=True)
    safe_lengths = np.maximum(lengths, 1e-12)
    return x / safe_lengths


# The GYF registry spells Hub models ``hf://org/name``; open_clip only
# recognises the ``hf-hub:org/name`` scheme for Hub-hosted checkpoints.
_REGISTRY_HUB_PREFIX = "hf://"
_OPEN_CLIP_HUB_PREFIX = "hf-hub:"


def _open_clip_name(model_id: str) -> str:
    """Translate a registry-style ``hf://`` id into open_clip's ``hf-hub:`` form."""
    if model_id.startswith(_REGISTRY_HUB_PREFIX):
        return _OPEN_CLIP_HUB_PREFIX + model_id[len(_REGISTRY_HUB_PREFIX):]
    return model_id


@lru_cache(maxsize=4)
def _load_open_clip(name: str):
    """Build (model, preprocess, tokenizer) once per open_clip model name, on CPU."""
    model, preprocess = open_clip.create_model_from_pretrained(name)
    tokenizer = open_clip.get_tokenizer(name)
    return model.eval(), preprocess, tokenizer


def _load(model_id: str):
    """Load (model, preprocess, tokenizer) once per model, on CPU.

    Args:
        model_id: One of the ids listed verbatim in ``ALLOWED_MODELS``. Both the
            registry spelling (``hf://...``) and the open_clip spelling
            (``hf-hub:...``) are accepted.

    Returns:
        ``(model.eval(), preprocess, tokenizer)`` as produced by open_clip.

    Raises:
        gr.Error: if ``model_id`` is not in ``ALLOWED_MODELS``.
    """
    if model_id not in ALLOWED_MODELS:
        raise gr.Error(f"model '{model_id}' is not in this lane's allow-list")
    # Cache on the translated name so the ``hf://`` and ``hf-hub:`` aliases of
    # the same checkpoint share a single loaded model instead of two copies.
    return _load_open_clip(_open_clip_name(model_id))


def _decode_image(b64: str) -> Image.Image:
    """Turn a base64-encoded image payload into an RGB ``PIL.Image``."""
    raw_bytes = base64.b64decode(b64)
    picture = Image.open(io.BytesIO(raw_bytes))
    # Force three channels so palette / alpha / greyscale inputs look alike downstream.
    return picture.convert("RGB")


@spaces.GPU
def embed_images(model_id: str, images_b64: list[str]) -> dict:
    """Encode a batch of base64 images into unit-length embedding vectors.

    Args:
        model_id: Encoder id; resolved (and allow-list checked) by ``_load``.
        images_b64: Base64-encoded image payloads, one string per image.

    Returns:
        ``{'embeddings': [[...], ...], 'dim': int}`` with one L2-normalized
        float32 row per input image. A falsy input (empty list / ``None``)
        yields ``{'embeddings': [], 'dim': 0}`` without loading any model.
    """
    if not images_b64:
        return {"embeddings": [], "dim": 0}

    encoder, to_tensor, _unused_tokenizer = _load(model_id)
    # The model is only moved to the GPU inside this decorated call.
    encoder = encoder.to("cuda")

    decoded = map(_decode_image, images_b64)
    pixels = torch.stack([to_tensor(picture) for picture in decoded])
    pixels = pixels.to("cuda")

    with torch.no_grad():
        raw = encoder.encode_image(pixels)

    vectors = _l2_normalize(raw.cpu().numpy().astype(np.float32))
    return dict(embeddings=vectors.tolist(), dim=int(vectors.shape[1]))


@spaces.GPU
def embed_texts(model_id: str, texts: list[str]) -> dict:
    """Encode a batch of strings into unit-length embedding vectors.

    Args:
        model_id: Encoder id; resolved (and allow-list checked) by ``_load``.
        texts: The strings to embed.

    Returns:
        ``{'embeddings': [[...], ...], 'dim': int}`` with one L2-normalized
        float32 row per input string. A falsy input (empty list / ``None``)
        yields ``{'embeddings': [], 'dim': 0}`` without loading any model.
    """
    if not texts:
        return {"embeddings": [], "dim": 0}

    encoder, _unused_preprocess, tokenize = _load(model_id)
    # The model is only moved to the GPU inside this decorated call.
    encoder = encoder.to("cuda")

    token_ids = tokenize(list(texts))
    token_ids = token_ids.to("cuda")

    with torch.no_grad():
        raw = encoder.encode_text(token_ids)

    vectors = _l2_normalize(raw.cpu().numpy().astype(np.float32))
    return dict(embeddings=vectors.tolist(), dim=int(vectors.shape[1]))


# --- Skin-tone lane (M4): face-parse → CIELAB → MST, runs the vendored pipeline -
@lru_cache(maxsize=1)
def _skin_estimator():
    """Build the vendored face-parsing skin-tone estimator once and reuse it.

    The ``skintone`` import is deferred to the first call; ``lru_cache`` then
    hands back the same estimator instance on every later call.
    """
    from skintone import FaceParsingSkinToneEstimator as estimator_cls

    estimator = estimator_cls()
    return estimator


@spaces.GPU
def estimate_skin_tone(image_b64: str) -> dict:
    """Run the vendored skin-tone pipeline on one base64-encoded photo.

    Returns a JSON-friendly dict with the keys ``skin_tone``, ``undertone``,
    ``field_confidence`` (copied into a plain dict) and ``model_version``, all
    read straight off the pipeline's result object. Per the pipeline's contract
    (not visible in this file) it abstains with 'unknown' when no face/skin is
    found.
    """
    from skintone import estimate_skin_tone as run_pipeline

    photo = _decode_image(image_b64)
    estimator = _skin_estimator()
    result = run_pipeline(photo, estimator)

    payload = {
        "skin_tone": result.skin_tone,
        "undertone": result.undertone,
        "field_confidence": dict(result.field_confidence),
        "model_version": result.model_version,
    }
    return payload


# --- Body-type lane (M3): BiRefNet silhouette + RTMW keypoints → torso widths ----
# Commercial-clean + ZeroGPU-deployable (SAM 3D Body needs detectron2/pyrender/
# pytorch3d/conda — not pip-installable on a Space; Sapiens is CC-BY-NC). We segment
# the body silhouette with BiRefNet (MIT, SOTA high-res matting) and locate the
# shoulder/hip landmarks with RTMW whole-body 2D keypoints (Apache-2.0, rtmlib ONNX).
# The vendored pure geometry (bodyshape.silhouette_measurements) reads the *arm-robust
# torso width* at each keypoint-anchored height — pose-, crop-, and lighting-invariant,
# unlike the v1 raw-extent silhouette. The caller's CPU classifies the widths
# unchanged. See docs/plans/m3-body-type-rtmw-birefnet.md.
# Hugging Face repo id handed to AutoModelForImageSegmentation.from_pretrained.
_BODY_MODEL = "ZhengPeng7/BiRefNet"
# Side length (pixels) of the square the photo is resized to before BiRefNet.
_BIREFNET_SIZE = 1024
# Reported verbatim as 'model_version' in every estimate_body response.
_BODY_MODEL_VERSION = "rtmw-birefnet-v1"
# Below this fraction of image height the foreground isn't a full standing body
# (a face/upper-body selfie) — abstain rather than fabricate a silhouette class.
_MIN_BODY_HEIGHT_FRAC = 0.35

# Template for the abstention response. estimate_body returns dict(_BODY_ABSTAIN),
# i.e. a shallow copy: the nested empty dicts are shared, so treat them as read-only.
_BODY_ABSTAIN = {
    "measurements": {},
    "region_quality": {},
    "model_confidence": 0.0,
    "model_version": _BODY_MODEL_VERSION,
}


@lru_cache(maxsize=1)
def _load_body():
    """Fetch the BiRefNet segmentation model once and keep it cached.

    No device is chosen here; the caller moves the returned model to ``cuda``
    from inside the ``@spaces.GPU`` request. ``trust_remote_code=True`` means
    the model repository's own Python code is executed on load.
    """
    from transformers import AutoModelForImageSegmentation

    segmenter = AutoModelForImageSegmentation.from_pretrained(
        _BODY_MODEL,
        trust_remote_code=True,
    )
    # BiRefNet ships half-precision weights; pin float32 so it matches our float input.
    segmenter = segmenter.float()
    return segmenter.eval()


@lru_cache(maxsize=1)
def _load_pose():
    """Create the RTMW whole-body keypoint detector once and keep it cached.

    Built through rtmlib's ``Wholebody`` with the onnxruntime backend, targeting
    ``cuda`` in "performance" mode.
    """
    from rtmlib import Wholebody

    detector_options = {
        "mode": "performance",
        "backend": "onnxruntime",
        "device": "cuda",
    }
    return Wholebody(**detector_options)


def _silhouette_mask(image: Image.Image) -> np.ndarray:
    """Segment the photo's foreground with BiRefNet.

    The image is resized to a ``_BIREFNET_SIZE`` square, normalized and run
    through the model on ``cuda``. The last model output is passed through a
    sigmoid, scaled to 8-bit, resized back to the photo's size and thresholded
    at 127. Returns a boolean H×W array at the original resolution.
    """
    from torchvision import transforms

    to_square = transforms.Resize((_BIREFNET_SIZE, _BIREFNET_SIZE))
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    prepare = transforms.Compose([to_square, transforms.ToTensor(), normalize])

    segmenter = _load_body().to("cuda")
    batch = prepare(image).unsqueeze(0).to("cuda")
    with torch.no_grad():
        last_output = segmenter(batch)[-1]
        probability = last_output.sigmoid().cpu()[0, 0]  # (size, size) in [0,1]

    as_bytes = (probability.numpy() * 255).astype("uint8")
    resized = Image.fromarray(as_bytes).resize(image.size)
    return np.asarray(resized) > 127  # bool H×W


@spaces.GPU
def estimate_body(image_b64: str) -> dict:
    """Measure height-normalized torso widths from one base64-encoded photo.

    Pipeline: RTMW keypoints pick the subject (the detection with the highest
    mean keypoint score), BiRefNet supplies the foreground mask, and the vendored
    ``silhouette_measurements`` turns mask + keypoints + scores into widths.

    Returns ``{'measurements': {...}, 'region_quality': {...},
    'model_confidence': float, 'model_version': str}``. A copy of
    ``_BODY_ABSTAIN`` (empty measurements, confidence 0) is returned instead when:
    no person is detected; the mask is empty; the mask's vertical extent is below
    ``_MIN_BODY_HEIGHT_FRAC`` of the image height; or the geometry step yields no
    measurements / non-positive confidence. The caller is expected to map an
    abstention to an ``unknown`` body type.
    """
    from bodyshape import silhouette_measurements

    photo = _decode_image(image_b64)
    pixels = np.ascontiguousarray(np.asarray(photo))

    detector = _load_pose()
    keypoints, scores = detector(pixels)  # (P,K,2), (P,K)
    if keypoints is None or len(keypoints) == 0:
        return dict(_BODY_ABSTAIN)

    # Keep only the detection whose keypoints score highest on average.
    mean_scores = [person_scores.mean() for person_scores in scores]
    best = int(np.argmax(mean_scores))
    subject_kp = keypoints[best]
    subject_sc = scores[best]

    mask = _silhouette_mask(photo)
    occupied_rows = np.where(mask.any(axis=1))[0]
    if occupied_rows.size == 0:
        return dict(_BODY_ABSTAIN)

    # Vertical extent of the foreground, as a share of the full image height.
    extent = float(occupied_rows[-1] - occupied_rows[0] + 1)
    if extent / float(mask.shape[0]) < _MIN_BODY_HEIGHT_FRAC:
        return dict(_BODY_ABSTAIN)

    measurements, region_quality, confidence = silhouette_measurements(
        mask, subject_kp, subject_sc
    )
    if not measurements or confidence <= 0.0:
        return dict(_BODY_ABSTAIN)

    rounded_widths = {name: round(float(value), 6) for name, value in measurements.items()}
    rounded_quality = {name: round(float(value), 4) for name, value in region_quality.items()}
    return {
        "measurements": rounded_widths,
        "region_quality": rounded_quality,
        "model_confidence": round(float(confidence), 4),
        "model_version": _BODY_MODEL_VERSION,
    }


# Gradio UI + API surface. Each button's ``api_name`` is the endpoint name remote
# callers use; the tabs double as a manual smoke test in the browser.
with gr.Blocks(title="GYF GPU lane") as demo:
    gr.Markdown(
        "# GYF GPU serving lane\n"
        "Fashion encoder embeddings on free-tier ZeroGPU. Called by "
        "`perception.remote.RemoteEncoder`; also browsable here for a smoke test."
    )
    # Shared by both embedding tabs; must be an id from ALLOWED_MODELS.
    model_in = gr.Textbox(label="model_id", value="hf-hub:Marqo/marqo-fashionSigLIP")
    with gr.Tab("images"):
        imgs_in = gr.JSON(label="images_b64 (list of base64 PNG strings)")
        imgs_out = gr.JSON(label="embeddings")
        gr.Button("embed_images").click(
            embed_images, [model_in, imgs_in], imgs_out, api_name="embed_images"
        )
    with gr.Tab("texts"):
        txt_in = gr.JSON(label="texts (list of strings)")
        txt_out = gr.JSON(label="embeddings")
        gr.Button("embed_texts").click(
            embed_texts, [model_in, txt_in], txt_out, api_name="embed_texts"
        )
    with gr.Tab("skintone"):
        skin_in = gr.Textbox(label="image_b64 (base64 PNG)")
        skin_out = gr.JSON(label="skin tone")
        gr.Button("estimate_skin_tone").click(
            estimate_skin_tone, skin_in, skin_out, api_name="estimate_skin_tone"
        )
    with gr.Tab("body"):
        body_in = gr.Textbox(label="image_b64 (base64 PNG)")
        # estimate_body returns torso-width measurements, not a mesh.
        body_out = gr.JSON(label="body measurements")
        gr.Button("estimate_body").click(
            estimate_body, body_in, body_out, api_name="estimate_body"
        )


# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()