LongCat-Video-Avatar-1.5-2nd

Running on Zero

App Files Files Community

victor (HF Staff) committed 18 days ago

Commit

3e2d0e9

verified ·

1 Parent(s): 5e87129

refactor: defer model loads inside @spaces.GPU (avoid CPU OOM)

Browse files

Files changed (1) hide show

app.py +116 -135

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """Gradio ZeroGPU Space for LongCat-Video-Avatar 1.5 (single-person AI2V).
-Loads the INT8-quantized DiT + DMD2 8-step LoRA + Whisper-Large-v3 audio encoder
-and exposes one inference function: reference image + audio + prompt -> mp4.
 """
 # IMPORTANT: spaces must be imported before torch (per HF guide).
@@ -27,7 +27,7 @@ WEIGHTS_DIR = Path(os.environ.get("WEIGHTS_DIR", DEFAULT_WEIGHTS))
 WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
 BASE_DIR = WEIGHTS_DIR / "LongCat-Video"
 AVATAR_DIR = WEIGHTS_DIR / "LongCat-Video-Avatar-1.5"
-print(f"[boot] WEIGHTS_DIR={WEIGHTS_DIR}")
 # Make vendored package importable
 sys.path.insert(0, str(Path(__file__).parent.resolve()))
@@ -42,15 +42,13 @@ from PIL import Image
 # ---------------------------------------------------------------------------
 # 0) Replace xformers.memory_efficient_attention with a PyTorch-SDPA shim.
-#    xformers wheels for torch 2.12+cu130 aren't available; SDPA is always
-#    in-tree, fast on Blackwell, and matches the inputs the model passes.
 # ---------------------------------------------------------------------------
 def _install_sdpa_shim():
-    import xformers.ops  # the package exists; only its CUDA ext is broken
-    # Replace BlockDiagonalMask with a thin record so we don't depend on
-    # the xformers internal layout for cumulative seq starts.
     class _BDShim:
         def __init__(self, q_seqlen, kv_seqlen):
             self.q_seqlen = list(q_seqlen)
@@ -63,46 +61,40 @@ def _install_sdpa_shim():
     xformers.ops.fmha.attn_bias.BlockDiagonalMask = _BDShim
     def _meff(q, k, v, attn_bias=None, op=None, **_):
-        # xformers convention: q, k, v are [B, M, H, D]
         if attn_bias is None:
             q_ = q.transpose(1, 2).contiguous()
             k_ = k.transpose(1, 2).contiguous()
             v_ = v.transpose(1, 2).contiguous()
-            out = F.scaled_dot_product_attention(q_, k_, v_)
-            return out.transpose(1, 2)
         if isinstance(attn_bias, _BDShim):
-            # Variable-length cross-attention: batch elements concatenated
-            # along seq dim. Loop per-element using SDPA.
-            outs = []
-            q_off = k_off = 0
             for q_len, k_len in zip(attn_bias.q_seqlen, attn_bias.kv_seqlen):
                 q_b = q[:, q_off:q_off + q_len].transpose(1, 2).contiguous()
                 k_b = k[:, k_off:k_off + k_len].transpose(1, 2).contiguous()
                 v_b = v[:, k_off:k_off + k_len].transpose(1, 2).contiguous()
-                o = F.scaled_dot_product_attention(q_b, k_b, v_b)
-                outs.append(o.transpose(1, 2))
                 q_off += q_len
                 k_off += k_len
             return torch.cat(outs, dim=1)
         raise NotImplementedError(f"Unsupported attn_bias in SDPA shim: {type(attn_bias)}")
     xformers.ops.memory_efficient_attention = _meff
-    print("[boot] installed xformers→SDPA shim")
 _install_sdpa_shim()
 # ---------------------------------------------------------------------------
-# 1) Download weights (one-time per container if /data is persistent)
 # ---------------------------------------------------------------------------
 def _ensure_weights():
     token = os.environ.get("HF_TOKEN")
-    # We only need text_encoder / vae / tokenizer from the base LongCat-Video repo.
     base_marker = BASE_DIR / "vae" / "config.json"
     if not base_marker.exists():
-        print("[boot] downloading LongCat-Video (vae/text_encoder/tokenizer)…")
         snapshot_download(
             "meituan-longcat/LongCat-Video",
             local_dir=str(BASE_DIR),
@@ -126,7 +118,7 @@ def _ensure_weights():
     avatar_marker = AVATAR_DIR / "base_model_int8" / "config.json"
     if not avatar_marker.exists():
-        print("[boot] downloading LongCat-Video-Avatar-1.5 (INT8 + lora + whisper + vocal_separator)…")
         snapshot_download(
             "meituan-longcat/LongCat-Video-Avatar-1.5",
             local_dir=str(AVATAR_DIR),
@@ -136,28 +128,25 @@ def _ensure_weights():
                 "lora/*",
                 "scheduler/*",
                 "vocal_separator/*",
-                # Whisper-Large-v3: only the bf16 safetensors + tokenizer/config files
                 "whisper-large-v3/model.safetensors",
                 "whisper-large-v3/*.json",
                 "whisper-large-v3/*.txt",
             ],
             ignore_patterns=[
-                # Drop the fp32 sharded copies, flax, TF, and pickled legacy weights
                 "whisper-large-v3/model.fp32*",
                 "whisper-large-v3/flax_model*",
                 "whisper-large-v3/tf_model*",
                 "whisper-large-v3/pytorch_model*",
             ],
         )
-    print("[boot] weights ready.")
 _ensure_weights()
 # ---------------------------------------------------------------------------
-# 2) Patch DiT config: prefer xformers, disable flash-attn (not buildable on
-#    ZeroGPU's Blackwell sm_120). Both base and int8 configs share these flags.
 # ---------------------------------------------------------------------------
 def _patch_dit_config():
@@ -175,87 +164,87 @@ def _patch_dit_config():
         changed = True
     if changed:
         cfg_path.write_text(json.dumps(cfg, indent=2))
-        print(f"[boot] patched {cfg_path.name} -> xformers backend")
 _patch_dit_config()
 # ---------------------------------------------------------------------------
-# 3) Load models on CPU at module level (spaces moves them to GPU on demand)
 # ---------------------------------------------------------------------------
-from transformers import AutoTokenizer, UMT5EncoderModel  # noqa: E402
-from longcat_video.pipeline_longcat_video_avatar import LongCatVideoAvatarPipeline  # noqa: E402
-from longcat_video.modules.scheduling_flow_match_euler_discrete import (  # noqa: E402
-    FlowMatchEulerDiscreteScheduler,
-)
-from longcat_video.modules.autoencoder_kl_wan import AutoencoderKLWan  # noqa: E402
-from longcat_video.modules.quantization import load_quantized_dit  # noqa: E402
-from longcat_video.audio_process import (  # noqa: E402
-    get_audio_encoder,
-    get_audio_feature_extractor,
-)
-from longcat_video.audio_process.torch_utils import save_video_ffmpeg  # noqa: E402
-CP_SPLIT_HW = [1, 1]  # single-GPU, no context-parallel split
-print("[boot] loading tokenizer + text_encoder (UMT5-XXL)…")
-tokenizer = AutoTokenizer.from_pretrained(str(BASE_DIR), subfolder="tokenizer")
-text_encoder = UMT5EncoderModel.from_pretrained(
-    str(BASE_DIR),
-    subfolder="text_encoder",
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-)
-print("[boot] loading VAE (Wan)…")
-vae = AutoencoderKLWan.from_pretrained(str(BASE_DIR), subfolder="vae", torch_dtype=torch.bfloat16)
-print("[boot] loading scheduler…")
-scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(str(AVATAR_DIR), subfolder="scheduler")
-print("[boot] loading INT8 DiT + DMD2 LoRA…")
-dit = load_quantized_dit(str(AVATAR_DIR), subfolder="base_model_int8", cp_split_hw=CP_SPLIT_HW)
-_lora_path = AVATAR_DIR / "lora" / "dmd_lora.safetensors"
-if _lora_path.exists():
-    dit.load_lora(str(_lora_path), "dmd", multiplier=1.0, lora_network_dim=128, lora_network_alpha=64)
-    dit.enable_loras(["dmd"])
-    print("[boot] DMD2 LoRA enabled (8-step distillation)")
-print("[boot] loading Whisper-Large-v3 audio encoder…")
-audio_encoder = get_audio_encoder(str(AVATAR_DIR / "whisper-large-v3"), "avatar-v1.5")
-audio_feature_extractor = get_audio_feature_extractor(str(AVATAR_DIR / "whisper-large-v3"), "avatar-v1.5")
-print("[boot] loading vocal separator (Kim_Vocal_2)…")
-from audio_separator.separator import Separator  # noqa: E402
-VOCAL_TMP = Path("/tmp/vocal_out")
-VOCAL_TMP.mkdir(parents=True, exist_ok=True)
-vocal_separator = Separator(
-    output_dir=str(VOCAL_TMP / "vocals"),
-    output_single_stem="vocals",
-    model_file_dir=str(AVATAR_DIR / "vocal_separator"),
-)
-vocal_separator.load_model("Kim_Vocal_2.onnx")
-print("[boot] assembling pipeline…")
-pipe = LongCatVideoAvatarPipeline(
-    tokenizer=tokenizer,
-    text_encoder=text_encoder,
-    vae=vae,
-    scheduler=scheduler,
-    dit=dit,
-    audio_encoder=audio_encoder,
-    audio_feature_extractor=audio_feature_extractor,
-    model_type="avatar-v1.5",
-)
-print("[boot] ready.")
 # ---------------------------------------------------------------------------
-# 4) Inference helpers
 # ---------------------------------------------------------------------------
 NEGATIVE_PROMPT = (
@@ -269,13 +258,14 @@ NEGATIVE_PROMPT = (
 def _extract_vocal(src: str) -> str:
-    """Run the vocal separator; return path to vocals-only wav, or src if it fails."""
     try:
-        outputs = vocal_separator.separate(src)
         if outputs:
-            return str((VOCAL_TMP / "vocals" / outputs[0]).resolve())
     except Exception as e:
-        print(f"[vocal] separation failed, using raw audio: {e}")
     return src
@@ -283,13 +273,12 @@ def _extract_vocal(src: str) -> str:
 # 5) GPU-bound inference function
 # ---------------------------------------------------------------------------
-@spaces.GPU(duration=300)
 def generate(
     image_path: str,
     audio_path: str,
     prompt: str,
     resolution: str,
-    audio_cfg: float,
     seed: int,
     progress=gr.Progress(track_tqdm=True),
 ):
@@ -297,75 +286,73 @@ def generate(
         raise gr.Error("Please upload a reference image.")
     if not audio_path:
         raise gr.Error("Please upload an audio clip.")
-    if not prompt or not prompt.strip():
-        prompt = "A person is talking naturally."
-    # Move pipeline onto GPU for the duration of this call.
-    # NB: pipe.to() moves dit / text_encoder / vae but not audio_encoder,
-    # so we move Whisper explicitly here.
-    pipe.to("cuda")
-    audio_encoder.to("cuda")
-    width, height = (832, 480) if resolution == "480p" else (1280, 768)
     save_fps = 25
     audio_stride = 1
-    num_frames = 93  # one 93-frame segment (~3.7s @ 25fps)
-    import librosa  # local import to keep boot fast
     # 1) Vocal isolation
-    progress(0.05, desc="Isolating vocals…")
     vocal_path = _extract_vocal(audio_path)
-    # 2) Pad audio to required duration
     speech, sr = librosa.load(vocal_path, sr=16000)
-    target_duration = num_frames / save_fps
-    pad = math.ceil((target_duration - len(speech) / sr) * sr)
     if pad > 0:
         speech = np.concatenate([speech, np.zeros(pad, dtype=speech.dtype)])
     # 3) Whisper audio embedding
-    progress(0.15, desc="Encoding audio (Whisper-Large-v3)…")
-    full_audio_emb = pipe.get_audio_embedding(
         speech, fps=save_fps * audio_stride, device="cuda", sample_rate=sr, model_type="avatar-v1.5"
     )
     if torch.isnan(full_audio_emb).any():
-        raise gr.Error("Audio embedding contains NaN — try a cleaner audio clip.")
-    # 4) Build per-frame windowed audio tensor: [1, T, 5, 5, D]
-    indices = torch.arange(2 * 2 + 1) - 2  # 5-frame window centered on each latent frame
     center = torch.arange(0, audio_stride * num_frames, audio_stride).unsqueeze(1) + indices.unsqueeze(0)
     center = torch.clamp(center, min=0, max=full_audio_emb.shape[0] - 1)
     audio_emb = full_audio_emb[center][None, ...].to("cuda")
-    # 5) Run AI2V generation (8 steps thanks to DMD2 LoRA)
     progress(0.30, desc="Generating video (DMD2 8-step)…")
     image = Image.open(image_path).convert("RGB")
     generator = torch.Generator(device="cuda").manual_seed(int(seed))
-    output, _latent = pipe.generate_ai2v(
         image=image,
-        prompt=prompt.strip(),
         negative_prompt=NEGATIVE_PROMPT,
         resolution=resolution,
         num_frames=num_frames,
         num_inference_steps=8,
         text_guidance_scale=1.0,
-        audio_guidance_scale=float(audio_cfg),
         output_type="both",
         generator=generator,
         audio_emb=audio_emb,
         use_distill=True,
     )
-    # 6) Save with audio
     progress(0.92, desc="Muxing audio + video…")
     frames = (output[0] * 255).astype(np.uint8)
     out_tensor = torch.from_numpy(frames)
     out_base = Path(tempfile.gettempdir()) / f"longcat_{uuid.uuid4().hex[:8]}"
     save_video_ffmpeg(out_tensor, str(out_base), audio_path, fps=save_fps, quality=5)
     out_path = f"{out_base}.mp4"
-    print(f"[gen] wrote {out_path}")
     return out_path
@@ -383,7 +370,6 @@ if (EXAMPLE_DIR / "man.png").exists() and (EXAMPLE_DIR / "man.mp3").exists():
         "their mouth. Wearing a vibrant red jacket with gold embroidery, the singer is speaking "
         "while smoke swirls around them, creating a dynamic and atmospheric scene.",
         "480p",
-        4.0,
         42,
     ])
@@ -395,6 +381,8 @@ with gr.Blocks(title="LongCat-Video-Avatar 1.5") as demo:
         Upload a **reference image** + **audio clip** + a short **text prompt**.
         The model generates a ~3.7-second lip-synced video using Meituan's
         LongCat-Video-Avatar 1.5 (INT8 DiT + DMD2 8-step distilled).
         """
     )
     with gr.Row():
@@ -409,13 +397,6 @@ with gr.Blocks(title="LongCat-Video-Avatar 1.5") as demo:
             with gr.Row():
                 resolution = gr.Radio(["480p", "720p"], value="480p", label="Resolution")
                 seed = gr.Number(value=42, precision=0, label="Seed")
-            audio_cfg = gr.Slider(
-                1.0,
-                6.0,
-                value=4.0,
-                step=0.5,
-                label="Audio CFG (higher = stronger lip sync, 3–5 recommended)",
-            )
             go = gr.Button("Generate", variant="primary")
         with gr.Column():
             video_out = gr.Video(label="Output", autoplay=True)
@@ -423,7 +404,7 @@ with gr.Blocks(title="LongCat-Video-Avatar 1.5") as demo:
     if EXAMPLES:
         gr.Examples(
             examples=EXAMPLES,
-            inputs=[image_in, audio_in, prompt, resolution, audio_cfg, seed],
             outputs=video_out,
             fn=generate,
             cache_examples=False,
@@ -431,7 +412,7 @@ with gr.Blocks(title="LongCat-Video-Avatar 1.5") as demo:
     go.click(
         generate,
-        inputs=[image_in, audio_in, prompt, resolution, audio_cfg, seed],
         outputs=video_out,
     )

 """Gradio ZeroGPU Space for LongCat-Video-Avatar 1.5 (single-person AI2V).
+Lazy-loads the INT8 DiT + DMD2 8-step LoRA + Whisper-Large-v3 inside the first
+@spaces.GPU call (CPU RAM on ZeroGPU is too small for 22GB UMT5-XXL + 14GB DiT).
 """
 # IMPORTANT: spaces must be imported before torch (per HF guide).
 WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
 BASE_DIR = WEIGHTS_DIR / "LongCat-Video"
 AVATAR_DIR = WEIGHTS_DIR / "LongCat-Video-Avatar-1.5"
+print(f"[boot] WEIGHTS_DIR={WEIGHTS_DIR}", flush=True)
 # Make vendored package importable
 sys.path.insert(0, str(Path(__file__).parent.resolve()))
 # ---------------------------------------------------------------------------
 # 0) Replace xformers.memory_efficient_attention with a PyTorch-SDPA shim.
+#    xformers wheels for torch 2.12+cu130 aren't published; SDPA is always
+#    in-tree and matches the inputs the model passes.
 # ---------------------------------------------------------------------------
 def _install_sdpa_shim():
+    # Monkey-patch xformers.ops so that memory_efficient_attention and
+    # BlockDiagonalMask are served by the torch SDPA-based stand-ins below.
+    import xformers.ops
+    # Stand-in for BlockDiagonalMask: keeps the sequence lengths it is given
+    # (q_seqlen here; kv_seqlen is read back inside _meff).
     class _BDShim:
         def __init__(self, q_seqlen, kv_seqlen):
             self.q_seqlen = list(q_seqlen)
     xformers.ops.fmha.attn_bias.BlockDiagonalMask = _BDShim
+    # `op` and any extra keyword arguments are accepted but never used.
     def _meff(q, k, v, attn_bias=None, op=None, **_):
+        # xformers convention: [B, M, H, D]; SDPA wants [B, H, M, D].
         if attn_bias is None:
             q_ = q.transpose(1, 2).contiguous()
             k_ = k.transpose(1, 2).contiguous()
             v_ = v.transpose(1, 2).contiguous()
+            return F.scaled_dot_product_attention(q_, k_, v_).transpose(1, 2)
         if isinstance(attn_bias, _BDShim):
+            # Walk the (q_len, k_len) pairs, slice each segment off axis 1,
+            # attend within that segment only, then concatenate along axis 1.
+            outs, q_off, k_off = [], 0, 0
             for q_len, k_len in zip(attn_bias.q_seqlen, attn_bias.kv_seqlen):
                 q_b = q[:, q_off:q_off + q_len].transpose(1, 2).contiguous()
                 k_b = k[:, k_off:k_off + k_len].transpose(1, 2).contiguous()
                 v_b = v[:, k_off:k_off + k_len].transpose(1, 2).contiguous()
+                outs.append(F.scaled_dot_product_attention(q_b, k_b, v_b).transpose(1, 2))
                 q_off += q_len
                 k_off += k_len
             return torch.cat(outs, dim=1)
+        # Any other attn_bias type is rejected rather than silently ignored.
         raise NotImplementedError(f"Unsupported attn_bias in SDPA shim: {type(attn_bias)}")
     xformers.ops.memory_efficient_attention = _meff
+    print("[boot] installed xformers→SDPA shim", flush=True)
 _install_sdpa_shim()
 # ---------------------------------------------------------------------------
+# 1) Download weights (one-time per container thanks to /data bucket)
 # ---------------------------------------------------------------------------
 def _ensure_weights():
+    # Download each model repo only when its marker config file is missing.
     token = os.environ.get("HF_TOKEN")
+    # BASE_DIR/vae/config.json marks the base LongCat-Video repo as present.
     base_marker = BASE_DIR / "vae" / "config.json"
     if not base_marker.exists():
+        print("[boot] downloading LongCat-Video (vae/text_encoder/tokenizer)…", flush=True)
         snapshot_download(
             "meituan-longcat/LongCat-Video",
             local_dir=str(BASE_DIR),
+    # AVATAR_DIR/base_model_int8/config.json marks the avatar repo as present.
     avatar_marker = AVATAR_DIR / "base_model_int8" / "config.json"
     if not avatar_marker.exists():
+        print("[boot] downloading LongCat-Video-Avatar-1.5 (INT8 + lora + whisper + vocal_separator)…", flush=True)
         snapshot_download(
             "meituan-longcat/LongCat-Video-Avatar-1.5",
             local_dir=str(AVATAR_DIR),
                 "lora/*",
                 "scheduler/*",
                 "vocal_separator/*",
+                # Whisper: only model.safetensors plus the *.json / *.txt files.
                 "whisper-large-v3/model.safetensors",
                 "whisper-large-v3/*.json",
                 "whisper-large-v3/*.txt",
             ],
             ignore_patterns=[
+                # Skip the model.fp32*, flax, tf and pytorch_model* weight copies.
                 "whisper-large-v3/model.fp32*",
                 "whisper-large-v3/flax_model*",
                 "whisper-large-v3/tf_model*",
                 "whisper-large-v3/pytorch_model*",
             ],
         )
+    print("[boot] weights ready.", flush=True)
 _ensure_weights()
 # ---------------------------------------------------------------------------
+# 2) Patch DiT config: prefer xformers (now our SDPA shim) over flash-attn.
 # ---------------------------------------------------------------------------
 def _patch_dit_config():
         changed = True
     if changed:
         cfg_path.write_text(json.dumps(cfg, indent=2))
+        print(f"[boot] patched {cfg_path.name} -> xformers/SDPA backend", flush=True)
 _patch_dit_config()
 # ---------------------------------------------------------------------------
+# 3) Lazy pipeline cache. Built inside the first @spaces.GPU call.
 # ---------------------------------------------------------------------------
+# Cached LongCatVideoAvatarPipeline; stays None until _build_pipeline() runs.
+_PIPE = None
+# Cached audio_separator Separator; stays None until _build_vocal_separator() runs.
+_VOCAL = None
+def _build_pipeline():
+    """Load every model component and cache the assembled pipeline in ``_PIPE``.
+
+    Intended to run inside the first ``@spaces.GPU`` call. The UMT5 text
+    encoder is loaded with ``device_map="cuda"``; the VAE, the INT8 DiT and
+    the Whisper audio encoder are moved with ``.to("cuda")`` after loading.
+    The DMD2 LoRA is applied when ``lora/dmd_lora.safetensors`` exists under
+    ``AVATAR_DIR``; otherwise a warning is printed and loading continues.
+
+    Returns nothing; the result is published through the module-level ``_PIPE``.
+    """
+    global _PIPE
+    # Function-local import, like the model imports below, so the timing code
+    # does not depend on the module's top-level import block.
+    import time
+    print("[load] building pipeline (first call may take ~60s)…", flush=True)
+    # perf_counter is monotonic, which is what an elapsed-time report needs.
+    t0 = time.perf_counter()
+    from transformers import AutoTokenizer, UMT5EncoderModel
+    from longcat_video.pipeline_longcat_video_avatar import LongCatVideoAvatarPipeline
+    from longcat_video.modules.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
+    from longcat_video.modules.autoencoder_kl_wan import AutoencoderKLWan
+    from longcat_video.modules.quantization import load_quantized_dit
+    from longcat_video.audio_process import get_audio_encoder, get_audio_feature_extractor
+    cp_split_hw = [1, 1]
+    dtype = torch.bfloat16
+    tokenizer = AutoTokenizer.from_pretrained(str(BASE_DIR), subfolder="tokenizer")
+    text_encoder = UMT5EncoderModel.from_pretrained(
+        str(BASE_DIR), subfolder="text_encoder", torch_dtype=dtype, device_map="cuda"
+    )
+    vae = AutoencoderKLWan.from_pretrained(
+        str(BASE_DIR), subfolder="vae", torch_dtype=dtype
+    ).to("cuda")
+    scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(str(AVATAR_DIR), subfolder="scheduler")
+    dit = load_quantized_dit(str(AVATAR_DIR), subfolder="base_model_int8", cp_split_hw=cp_split_hw).to("cuda")
+    lora_path = AVATAR_DIR / "lora" / "dmd_lora.safetensors"
+    if lora_path.exists():
+        dit.load_lora(str(lora_path), "dmd", multiplier=1.0, lora_network_dim=128, lora_network_alpha=64)
+        dit.enable_loras(["dmd"])
+        print("[load] DMD2 8-step LoRA enabled", flush=True)
+    else:
+        # generate() always samples with 8 steps and use_distill=True, so a
+        # missing LoRA should be visible in the logs instead of passing silently.
+        print(f"[load] WARNING: DMD2 LoRA not found at {lora_path}; continuing without it", flush=True)
+    audio_encoder = get_audio_encoder(str(AVATAR_DIR / "whisper-large-v3"), "avatar-v1.5").to("cuda", dtype=dtype)
+    audio_feature_extractor = get_audio_feature_extractor(str(AVATAR_DIR / "whisper-large-v3"), "avatar-v1.5")
+    _PIPE = LongCatVideoAvatarPipeline(
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        vae=vae,
+        scheduler=scheduler,
+        dit=dit,
+        audio_encoder=audio_encoder,
+        audio_feature_extractor=audio_feature_extractor,
+        model_type="avatar-v1.5",
+    )
+    # The components are already on the GPU; record that on the pipeline object.
+    _PIPE.device = "cuda"
+    print(f"[load] pipeline ready in {time.perf_counter() - t0:.1f}s", flush=True)
+def _build_vocal_separator():
+    """Create the vocal separator and cache it in the module-level ``_VOCAL``.
+
+    The separator is configured for a single "vocals" stem, writes under
+    /tmp/vocal_out/vocals, and loads Kim_Vocal_2.onnx from
+    ``AVATAR_DIR / "vocal_separator"``. ``_VOCAL`` is assigned only after the
+    model has loaded, so a failed load leaves it unchanged and the next call
+    tries again instead of reusing an unloaded separator.
+    """
+    global _VOCAL
+    from audio_separator.separator import Separator
+    stems_dir = Path("/tmp/vocal_out") / "vocals"
+    stems_dir.mkdir(parents=True, exist_ok=True)
+    separator = Separator(
+        output_dir=str(stems_dir),
+        output_single_stem="vocals",
+        model_file_dir=str(AVATAR_DIR / "vocal_separator"),
+    )
+    separator.load_model("Kim_Vocal_2.onnx")
+    # Publish last: never cache a separator whose model failed to load.
+    _VOCAL = separator
+    print("[load] vocal separator ready", flush=True)
 # ---------------------------------------------------------------------------
+# 4) Inference helper
 # ---------------------------------------------------------------------------
 NEGATIVE_PROMPT = (
 def _extract_vocal(src: str) -> str:
+    # Best-effort vocal isolation. Returns the path of the separated vocals
+    # file, or `src` unchanged when no separator is loaded, when separation
+    # yields no outputs, or when it raises.
+    if _VOCAL is None:
+        return src
     try:
+        outputs = _VOCAL.separate(src)
         if outputs:
+            # Joined onto the same /tmp/vocal_out/vocals directory that
+            # _build_vocal_separator passes as output_dir.
+            # NOTE(review): presumably outputs[0] is a file name relative to
+            # that directory — confirm against audio_separator.
+            return str((Path("/tmp/vocal_out") / "vocals" / outputs[0]).resolve())
     except Exception as e:
+        print(f"[vocal] separation failed, using raw audio: {e}", flush=True)
     return src
 # 5) GPU-bound inference function
 # ---------------------------------------------------------------------------
+@spaces.GPU(duration=420)
 def generate(
     image_path: str,
     audio_path: str,
     prompt: str,
     resolution: str,
     seed: int,
     progress=gr.Progress(track_tqdm=True),
 ):
+    """Generate one 93-frame lip-synced clip and return the path of the .mp4.
+
+    image_path / audio_path: reference image and driving audio files.
+    prompt: text prompt; empty or whitespace-only falls back to a default.
+    resolution: passed through to the pipeline unchanged.
+    seed: seeds the CUDA generator used for sampling.
+
+    Raises gr.Error when an input is missing or the audio embedding has NaNs.
+    The pipeline and the vocal separator are built on first use.
+    """
         raise gr.Error("Please upload a reference image.")
     if not audio_path:
         raise gr.Error("Please upload an audio clip.")
+    # Strip first, then fall back: a whitespace-only prompt must also get the
+    # default instead of reaching the model as an empty string.
+    prompt = (prompt or "").strip() or "A person is talking naturally."
+    progress(0.02, desc="Warming up models (one-time on cold start)…")
+    if _PIPE is None:
+        _build_pipeline()
+    if _VOCAL is None:
+        # Vocal isolation is best-effort (_extract_vocal returns the raw audio
+        # when _VOCAL is None), so a separator that fails to load must not
+        # abort the whole generation.
+        try:
+            _build_vocal_separator()
+        except Exception as e:
+            print(f"[vocal] separator unavailable, using raw audio: {e}", flush=True)
+    from longcat_video.audio_process.torch_utils import save_video_ffmpeg
+    import librosa
     save_fps = 25
     audio_stride = 1
+    # One 93-frame segment (~3.7 s at 25 fps).
+    num_frames = 93
     # 1) Vocal isolation
+    progress(0.10, desc="Isolating vocals…")
     vocal_path = _extract_vocal(audio_path)
+    # 2) Pad audio to target duration
     speech, sr = librosa.load(vocal_path, sr=16000)
+    pad = math.ceil((num_frames / save_fps - len(speech) / sr) * sr)
     if pad > 0:
         speech = np.concatenate([speech, np.zeros(pad, dtype=speech.dtype)])
     # 3) Whisper audio embedding
+    progress(0.20, desc="Encoding audio (Whisper-Large-v3)…")
+    full_audio_emb = _PIPE.get_audio_embedding(
         speech, fps=save_fps * audio_stride, device="cuda", sample_rate=sr, model_type="avatar-v1.5"
     )
     if torch.isnan(full_audio_emb).any():
+        raise gr.Error("Audio embedding contains NaN — try a different audio clip.")
+    # 4) Build windowed audio tensor: [1, T, 5, 5, D]
+    # Offsets -2..+2: a 5-entry window around each frame index, clamped below
+    # to the valid range of the embedding.
+    indices = torch.arange(2 * 2 + 1) - 2
     center = torch.arange(0, audio_stride * num_frames, audio_stride).unsqueeze(1) + indices.unsqueeze(0)
     center = torch.clamp(center, min=0, max=full_audio_emb.shape[0] - 1)
     audio_emb = full_audio_emb[center][None, ...].to("cuda")
+    # 5) Generate (8-step distilled, both CFG=1.0 → 1 forward per step)
     progress(0.30, desc="Generating video (DMD2 8-step)…")
     image = Image.open(image_path).convert("RGB")
     generator = torch.Generator(device="cuda").manual_seed(int(seed))
+    output, _ = _PIPE.generate_ai2v(
         image=image,
+        prompt=prompt,
         negative_prompt=NEGATIVE_PROMPT,
         resolution=resolution,
         num_frames=num_frames,
         num_inference_steps=8,
         text_guidance_scale=1.0,
+        audio_guidance_scale=1.0,
         output_type="both",
         generator=generator,
         audio_emb=audio_emb,
         use_distill=True,
     )
+    # 6) Mux + save
     progress(0.92, desc="Muxing audio + video…")
     frames = (output[0] * 255).astype(np.uint8)
     out_tensor = torch.from_numpy(frames)
     out_base = Path(tempfile.gettempdir()) / f"longcat_{uuid.uuid4().hex[:8]}"
     save_video_ffmpeg(out_tensor, str(out_base), audio_path, fps=save_fps, quality=5)
     out_path = f"{out_base}.mp4"
+    print(f"[gen] wrote {out_path}", flush=True)
     return out_path
         "their mouth. Wearing a vibrant red jacket with gold embroidery, the singer is speaking "
         "while smoke swirls around them, creating a dynamic and atmospheric scene.",
         "480p",
         42,
     ])
         Upload a **reference image** + **audio clip** + a short **text prompt**.
         The model generates a ~3.7-second lip-synced video using Meituan's
         LongCat-Video-Avatar 1.5 (INT8 DiT + DMD2 8-step distilled).
+        *First call is slow: ~60s to warm up models on GPU.*
         """
     )
     with gr.Row():
             with gr.Row():
                 resolution = gr.Radio(["480p", "720p"], value="480p", label="Resolution")
                 seed = gr.Number(value=42, precision=0, label="Seed")
             go = gr.Button("Generate", variant="primary")
         with gr.Column():
             video_out = gr.Video(label="Output", autoplay=True)
     if EXAMPLES:
         gr.Examples(
             examples=EXAMPLES,
+            inputs=[image_in, audio_in, prompt, resolution, seed],
             outputs=video_out,
             fn=generate,
             cache_examples=False,
     go.click(
         generate,
+        inputs=[image_in, audio_in, prompt, resolution, seed],
         outputs=video_out,
     )