gyf-gpu / app.py
GetYourFit's picture
Upload folder using huggingface_hub
bdd0a45 verified
Raw
History Blame Contribute Delete
9.87 kB
"""GYF GPU serving lane — HF ZeroGPU Space (free-tier GPU per doctrine D7).
Serves the fashion encoder's GPU-heavy embedding as a tiny JSON API the local
GYF stack calls through ``perception.remote.RemoteEncoder``. Only the forward
pass runs here; retrieval scoring, ranking, and the M2 bake-off stay on the
caller's CPU, so this one small Space backs every GPU need (M2 embeddings now;
the M3/M4 photo modules can add more ``@spaces.GPU`` endpoints later) without the
catalog ever leaving the local machine.
``spaces`` must be imported before torch so ZeroGPU can intercept CUDA init; the
model is loaded on CPU and moved to ``cuda`` *inside* the GPU-decorated function,
which is how ZeroGPU allocates a GPU per request and releases it after.
"""
from __future__ import annotations
import base64
import io
from functools import lru_cache
import gradio as gr
import numpy as np
import open_clip
import spaces
import torch
from PIL import Image
# Models the lane is allowed to serve. Mirrors the GYF model registry's `encoder`
# capability (incumbent + the M2 research candidates). Keep in sync with
# models.registry.json — only commercial-clean (Apache-2.0) weights belong here.
ALLOWED_MODELS = {
"hf://Marqo/marqo-fashionSigLIP", # production incumbent
"hf-hub:Marqo/marqo-fashionSigLIP", # open_clip-prefixed alias (config default)
"hf-hub:timm/ViT-B-16-SigLIP2", # M2 research candidate
"hf-hub:timm/ViT-SO400M-16-SigLIP2-384", # M2 research candidate
}
def _l2_normalize(x: np.ndarray) -> np.ndarray:
norms = np.linalg.norm(x, axis=-1, keepdims=True)
return x / np.clip(norms, 1e-12, None)
@lru_cache(maxsize=4)
def _load(model_id: str):
"""Load (model, preprocess, tokenizer) once per model_id, on CPU."""
if model_id not in ALLOWED_MODELS:
raise gr.Error(f"model '{model_id}' is not in this lane's allow-list")
model, preprocess = open_clip.create_model_from_pretrained(model_id)
tokenizer = open_clip.get_tokenizer(model_id)
return model.eval(), preprocess, tokenizer
def _decode_image(b64: str) -> Image.Image:
return Image.open(io.BytesIO(base64.b64decode(b64))).convert("RGB")
@spaces.GPU
def embed_images(model_id: str, images_b64: list[str]) -> dict:
"""Embed base64-PNG images → {'embeddings': [[...]], 'dim': int} (L2-normalized)."""
if not images_b64:
return {"embeddings": [], "dim": 0}
model, preprocess, _ = _load(model_id)
model = model.to("cuda")
batch = torch.stack([preprocess(_decode_image(b)) for b in images_b64]).to("cuda")
with torch.no_grad():
feats = model.encode_image(batch)
emb = _l2_normalize(feats.cpu().numpy().astype(np.float32))
return {"embeddings": emb.tolist(), "dim": int(emb.shape[1])}
@spaces.GPU
def embed_texts(model_id: str, texts: list[str]) -> dict:
"""Embed text strings → {'embeddings': [[...]], 'dim': int} (L2-normalized)."""
if not texts:
return {"embeddings": [], "dim": 0}
model, _, tokenizer = _load(model_id)
model = model.to("cuda")
tokens = tokenizer(list(texts)).to("cuda")
with torch.no_grad():
feats = model.encode_text(tokens)
emb = _l2_normalize(feats.cpu().numpy().astype(np.float32))
return {"embeddings": emb.tolist(), "dim": int(emb.shape[1])}
# --- Skin-tone lane (M4): face-parse → CIELAB → MST, runs the vendored pipeline -
@lru_cache(maxsize=1)
def _skin_estimator():
"""The real face-parsing skin-tone estimator (vendored under skintone/)."""
from skintone import FaceParsingSkinToneEstimator
return FaceParsingSkinToneEstimator()
@spaces.GPU
def estimate_skin_tone(image_b64: str) -> dict:
"""One photo → {'skin_tone': 'mstN', 'undertone': str, 'field_confidence': {...},
'model_version': str}. Abstains ('unknown') honestly when no face/skin is found."""
from skintone import estimate_skin_tone as _run
est = _run(_decode_image(image_b64), _skin_estimator())
return {
"skin_tone": est.skin_tone,
"undertone": est.undertone,
"field_confidence": dict(est.field_confidence),
"model_version": est.model_version,
}
# --- Body-type lane (M3): BiRefNet silhouette + RTMW keypoints → torso widths ----
# Commercial-clean + ZeroGPU-deployable (SAM 3D Body needs detectron2/pyrender/
# pytorch3d/conda — not pip-installable on a Space; Sapiens is CC-BY-NC). We segment
# the body silhouette with BiRefNet (MIT, SOTA high-res matting) and locate the
# shoulder/hip landmarks with RTMW whole-body 2D keypoints (Apache-2.0, rtmlib ONNX).
# The vendored pure geometry (bodyshape.silhouette_measurements) reads the *arm-robust
# torso width* at each keypoint-anchored height — pose-, crop-, and lighting-invariant,
# unlike the v1 raw-extent silhouette. The caller's CPU classifies the widths
# unchanged. See docs/plans/m3-body-type-rtmw-birefnet.md.
_BODY_MODEL = "ZhengPeng7/BiRefNet"
_BIREFNET_SIZE = 1024
_BODY_MODEL_VERSION = "rtmw-birefnet-v1"
# Below this fraction of image height the foreground isn't a full standing body
# (a face/upper-body selfie) — abstain rather than fabricate a silhouette class.
_MIN_BODY_HEIGHT_FRAC = 0.35
_BODY_ABSTAIN = {
"measurements": {},
"region_quality": {},
"model_confidence": 0.0,
"model_version": _BODY_MODEL_VERSION,
}
@lru_cache(maxsize=1)
def _load_body():
"""Load BiRefNet once on CPU (moved to GPU inside the @spaces.GPU call)."""
from transformers import AutoModelForImageSegmentation
model = AutoModelForImageSegmentation.from_pretrained(_BODY_MODEL, trust_remote_code=True)
# BiRefNet ships half-precision weights; pin float32 so it matches our float input.
return model.float().eval()
@lru_cache(maxsize=1)
def _load_pose():
"""Load the RTMW whole-body keypoint detector once (ONNX, GPU-backed)."""
from rtmlib import Wholebody
return Wholebody(mode="performance", backend="onnxruntime", device="cuda")
def _silhouette_mask(image: Image.Image) -> np.ndarray:
"""BiRefNet foreground mask (bool, original H×W) for the largest subject."""
from torchvision import transforms
tfm = transforms.Compose(
[
transforms.Resize((_BIREFNET_SIZE, _BIREFNET_SIZE)),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
model = _load_body().to("cuda")
x = tfm(image).unsqueeze(0).to("cuda")
with torch.no_grad():
pred = model(x)[-1].sigmoid().cpu()[0, 0] # (size, size) in [0,1]
mask = Image.fromarray((pred.numpy() * 255).astype("uint8")).resize(image.size)
return np.asarray(mask) > 127 # bool H×W
@spaces.GPU
def estimate_body(image_b64: str) -> dict:
"""One photo → height-normalized torso widths for the body-type taxonomy.
Returns ``{'measurements': {...}, 'region_quality': {...}, 'model_confidence':
float, 'model_version': str}``. RTMW gives the shoulder/hip landmark heights;
BiRefNet gives the silhouette; the vendored ``silhouette_measurements`` reads the
arm-robust torso width at each landmark. Abstains (confidence 0, empty
measurements) when no plausible full-body / front-facing subject is found — the
caller turns that into ``unknown`` body type and the manual field is the fallback.
"""
from bodyshape import silhouette_measurements
image = _decode_image(image_b64)
rgb = np.ascontiguousarray(np.asarray(image))
keypoints, scores = _load_pose()(rgb) # (P,K,2), (P,K)
if keypoints is None or len(keypoints) == 0:
return dict(_BODY_ABSTAIN)
subject = int(np.argmax([s.mean() for s in scores]))
kp, sc = keypoints[subject], scores[subject]
mask = _silhouette_mask(image)
rows = np.where(mask.any(axis=1))[0]
if rows.size == 0:
return dict(_BODY_ABSTAIN)
if float(rows[-1] - rows[0] + 1) / float(mask.shape[0]) < _MIN_BODY_HEIGHT_FRAC:
return dict(_BODY_ABSTAIN)
measurements, region_quality, confidence = silhouette_measurements(mask, kp, sc)
if not measurements or confidence <= 0.0:
return dict(_BODY_ABSTAIN)
return {
"measurements": {k: round(float(v), 6) for k, v in measurements.items()},
"region_quality": {k: round(float(v), 4) for k, v in region_quality.items()},
"model_confidence": round(float(confidence), 4),
"model_version": _BODY_MODEL_VERSION,
}
with gr.Blocks(title="GYF GPU lane") as demo:
gr.Markdown(
"# GYF GPU serving lane\n"
"Fashion encoder embeddings on free-tier ZeroGPU. Called by "
"`perception.remote.RemoteEncoder`; also browsable here for a smoke test."
)
model_in = gr.Textbox(label="model_id", value="hf-hub:Marqo/marqo-fashionSigLIP")
with gr.Tab("images"):
imgs_in = gr.JSON(label="images_b64 (list of base64 PNG strings)")
imgs_out = gr.JSON(label="embeddings")
gr.Button("embed_images").click(
embed_images, [model_in, imgs_in], imgs_out, api_name="embed_images"
)
with gr.Tab("texts"):
txt_in = gr.JSON(label="texts (list of strings)")
txt_out = gr.JSON(label="embeddings")
gr.Button("embed_texts").click(
embed_texts, [model_in, txt_in], txt_out, api_name="embed_texts"
)
with gr.Tab("skintone"):
skin_in = gr.Textbox(label="image_b64 (base64 PNG)")
skin_out = gr.JSON(label="skin tone")
gr.Button("estimate_skin_tone").click(
estimate_skin_tone, skin_in, skin_out, api_name="estimate_skin_tone"
)
with gr.Tab("body"):
body_in = gr.Textbox(label="image_b64 (base64 PNG)")
body_out = gr.JSON(label="mesh")
gr.Button("estimate_body").click(
estimate_body, body_in, body_out, api_name="estimate_body"
)
if __name__ == "__main__":
demo.launch()