"""HF Space probe: Gradio SDK + ZeroGPU. Exposes a /rewrite Gradio API endpoint that returns: rewritten: str examples: list[str] latency_s: float input_tokens: int output_tokens: int cold_load_s: float uptime_s: float Plus a /healthz endpoint via gr.routes for the probe to poll. """ from __future__ import annotations import threading import time from pathlib import Path import gradio as gr try: import spaces _HAS_SPACES = True except ImportError: _HAS_SPACES = False from rewriter import Rewriter DATA_DIR = Path(__file__).parent / "data" _BOOT_AT = time.time() _REWRITER: Rewriter | None = None _LOAD_LOCK = threading.Lock() _LOAD_LATENCY: float | None = None _FIRST_INFERENCE_AT: float | None = None def get_rewriter() -> Rewriter: global _REWRITER, _LOAD_LATENCY with _LOAD_LOCK: if _REWRITER is None: t0 = time.time() _REWRITER = Rewriter(DATA_DIR) _LOAD_LATENCY = time.time() - t0 print(f"[rewriter] singleton ready in {_LOAD_LATENCY:.1f}s on {_REWRITER.device}", flush=True) return _REWRITER def _do_rewrite(prompt: str) -> dict: return get_rewriter().rewrite(prompt) # On ZeroGPU, wrap inference so HF allocates a GPU burst. 
if _HAS_SPACES:
    # Rebind the name so every caller below goes through the wrapped version.
    _do_rewrite = spaces.GPU(duration=30)(_do_rewrite)


def _mark_first_inference() -> None:
    """Record the wall-clock time of the first completed inference (once)."""
    global _FIRST_INFERENCE_AT
    if _FIRST_INFERENCE_AT is None:
        # Wall clock on purpose: healthz reports this relative to _BOOT_AT,
        # which is also a time.time() value.
        _FIRST_INFERENCE_AT = time.time()


def rewrite_api(prompt: str) -> dict:
    """Gradio API entry point, registered below with ``api_name="rewrite"``.

    Args:
        prompt: The user's motion prompt. ``None``, empty, or whitespace-only
            input is rejected without running inference.

    Returns:
        For a valid prompt, the dict returned by ``_do_rewrite`` merged with
        ``wall_latency_s``, ``cold_load_s`` and ``uptime_s``. For an empty
        prompt, a dict with zeroed fields and ``"error": "empty prompt"``.
    """
    if not prompt or not prompt.strip():
        return {
            "rewritten": "",
            "examples": [],
            "latency_s": 0.0,
            "input_tokens": 0,
            "output_tokens": 0,
            # Present on the success path too, so clients see one schema.
            "wall_latency_s": 0.0,
            "cold_load_s": _LOAD_LATENCY or 0.0,
            "uptime_s": time.time() - _BOOT_AT,
            "error": "empty prompt",
        }
    # Monotonic clock for the interval so clock adjustments cannot skew it.
    t0 = time.perf_counter()
    out = _do_rewrite(prompt)
    _mark_first_inference()
    return {
        **out,
        "wall_latency_s": time.perf_counter() - t0,
        "cold_load_s": _LOAD_LATENCY or 0.0,
        "uptime_s": time.time() - _BOOT_AT,
    }


def healthz_api() -> dict:
    """Report readiness and timing metadata for the probe to poll.

    Returns:
        A dict with ``ready`` (whether the rewriter singleton exists),
        ``uptime_s``, ``load_latency_s``, ``first_inference_at_uptime_s``
        (``None`` until an inference has completed), ``device`` and
        ``model_id`` (both ``None`` until the rewriter is loaded).
    """
    rewriter = _REWRITER
    first_at = _FIRST_INFERENCE_AT
    loaded = rewriter is not None
    return {
        "ready": loaded,
        "uptime_s": time.time() - _BOOT_AT,
        "load_latency_s": _LOAD_LATENCY,
        "first_inference_at_uptime_s": (first_at - _BOOT_AT) if first_at is not None else None,
        "device": rewriter.device if loaded else None,
        "model_id": rewriter.model_id if loaded else None,
    }


def gradio_rewrite(prompt: str):
    """UI click handler: rewrite ``prompt`` and format the result for display.

    Args:
        prompt: Contents of the prompt textbox; may be ``None`` or blank.

    Returns:
        A ``(rewritten, examples, latency_s)`` tuple matching the three output
        components. Blank input yields ``("(empty)", "(empty)", 0.0)``.
    """
    # Guard None as well as blank text, mirroring rewrite_api.
    if not prompt or not prompt.strip():
        return "(empty)", "(empty)", 0.0
    out = _do_rewrite(prompt)
    # UI-triggered inference counts towards the healthz first-inference stamp.
    _mark_first_inference()
    examples_str = "\n".join(f" · {e}" for e in out["examples"])
    return out["rewritten"], examples_str, out["latency_s"]


# Eager-load the rewriter at module import so the first Gradio call doesn't pay the load cost.
# We do this AFTER the spaces decorator is wired up so ZeroGPU is initialised.
get_rewriter()


with gr.Blocks(title="AnimoFlow Rewriter Probe") as demo:
    gr.Markdown("# AnimoFlow Rewriter Probe — Qwen2.5-1.5B-Instruct + RAFSL on ZeroGPU")
    gr.Markdown(
        "Type a motion prompt in any language. The rewriter normalises it to a "
        "HumanML3D-style English caption. Powered by Qwen2.5-1.5B-Instruct + a multilingual "
        "MiniLM retriever over the 52K HumanML3D caption corpus."
    )
    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(label="Your prompt (any language)", placeholder="一个人向前走", lines=2)
            btn = gr.Button("Rewrite", variant="primary")
        with gr.Column():
            out_text = gr.Textbox(label="Rewritten (HumanML3D-style English)", lines=2)
            out_examples = gr.Textbox(label="Retrieved exemplars", lines=4)
            out_latency = gr.Number(label="Latency (s)", precision=3)
    btn.click(gradio_rewrite, inputs=[inp], outputs=[out_text, out_examples, out_latency], api_name="rewrite_ui")

    # Pure API endpoint with JSON dict response — what the probe + clients should hit.
    with gr.Row(visible=False):
        api_in = gr.Textbox()
        api_out = gr.JSON()
        api_btn = gr.Button()
        api_btn.click(rewrite_api, inputs=[api_in], outputs=[api_out], api_name="rewrite")

    # Hidden components exist only to register the health-check API endpoint.
    with gr.Row(visible=False):
        hz_btn = gr.Button()
        hz_out = gr.JSON()
        hz_btn.click(healthz_api, inputs=[], outputs=[hz_out], api_name="healthz")


if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)