BoxOfColors Claude Opus 4.6 committed on
Commit
a4226e1
·
1 Parent(s): 1b4e6b8

perf: comprehensive optimization pass — caching, dedup, cleanup

Browse files

- Complete .npy migration: segment wavs stored as file paths instead of
serialized arrays, removing all .tolist() calls
- Cache CAVP+onset features (TARO) and text features (HunyuanFoley) in
seg_meta so regen skips re-extraction (~5-7s saved per TARO regen,
~2-3s per HunyuanFoley regen)
- Extract shared model loading into helper functions (_load_taro_models,
_load_taro_feature_extractors, _load_mmaudio_models, _load_hunyuan_model)
to deduplicate generate/regen code
- Batch ffmpeg segment extraction before sample loop (MMAudio, HunyuanFoley)
so clips are extracted once and reused across samples
- Add temp directory registry with automatic cleanup of old dirs (keeps
last 10, removes older ones on new generation)
- Make TARO inference cache thread-safe with threading.Lock

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +232 -220
app.py CHANGED
@@ -11,7 +11,6 @@ Supported models
11
  import os
12
  import sys
13
  import json
14
- import base64
15
  import tempfile
16
  import random
17
  import threading
@@ -111,6 +110,121 @@ def strip_audio_from_video(video_path: str, output_path: str):
111
  overwrite_output=True, quiet=True
112
  )
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
115
  """Mux a silent video with an audio file into *output_path* (stream-copy video, encode audio)."""
116
  ffmpeg.output(
@@ -193,6 +307,8 @@ GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than t
193
 
194
  _TARO_CACHE_MAXLEN = 16 # evict oldest entries beyond this limit
195
  _TARO_INFERENCE_CACHE: dict = {} # keyed by (video_file, seed, cfg, steps, mode, crossfade_s)
 
 
196
 
197
 
198
  def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
@@ -308,50 +424,14 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
308
  if _taro_dir not in sys.path:
309
  sys.path.insert(0, _taro_dir)
310
 
311
- # Imports are inside the GPU context so the Space only pays for GPU time here
312
- from TARO.cavp_util import Extract_CAVP_Features
313
- from TARO.onset_util import VideoOnsetNet, extract_onset
314
- from TARO.models import MMDiT
315
- from TARO.samplers import euler_sampler, euler_maruyama_sampler
316
- from diffusers import AutoencoderKL
317
- from transformers import SpeechT5HifiGan
318
-
319
- # -- Load CAVP encoder (uses checkpoint from our HF repo) --
320
- extract_cavp = Extract_CAVP_Features(
321
- device=device,
322
- config_path="TARO/cavp/cavp.yaml",
323
- ckpt_path=cavp_ckpt_path,
324
- )
325
-
326
- # -- Load onset detection model --
327
- # Key remapping matches the original TARO infer.py exactly
328
- raw_sd = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
329
- onset_sd = {}
330
- for k, v in raw_sd.items():
331
- if "model.net.model" in k:
332
- k = k.replace("model.net.model", "net.model")
333
- elif "model.fc." in k:
334
- k = k.replace("model.fc", "fc")
335
- onset_sd[k] = v
336
- onset_model = VideoOnsetNet(pretrained=False).to(device)
337
- onset_model.load_state_dict(onset_sd)
338
- onset_model.eval()
339
-
340
- # -- Load TARO MMDiT --
341
- # Architecture params match TARO/train.py: adm_in_channels=120 (onset dim),
342
- # z_dims=[768] (CAVP dim), encoder_depth=4
343
- model = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
344
- model.load_state_dict(torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"])
345
- model.eval().to(weight_dtype)
346
 
347
- # -- Load AudioLDM2 VAE + vocoder only (saves ~3-4 GB vs loading the full pipeline) --
348
- # TARO only needs VAE and vocoder for decoding; the text encoder and UNet are never used.
349
- vae = AutoencoderKL.from_pretrained("cvssp/audioldm2", subfolder="vae").to(device).eval()
350
- vocoder = SpeechT5HifiGan.from_pretrained("cvssp/audioldm2", subfolder="vocoder").to(device)
351
- latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
352
 
353
  # -- Prepare silent video (shared across all samples) --
354
- tmp_dir = tempfile.mkdtemp()
355
  silent_video = os.path.join(tmp_dir, "silent_input.mp4")
356
  strip_audio_from_video(video_file, silent_video)
357
 
@@ -366,9 +446,11 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
366
  sample_seed = seed_val + sample_idx
367
  cache_key = (video_file, sample_seed, float(cfg_scale), int(num_steps), mode, crossfade_s)
368
 
369
- if cache_key in _TARO_INFERENCE_CACHE:
 
 
370
  print(f"[TARO] Sample {sample_idx+1}: cache hit.")
371
- wavs = _TARO_INFERENCE_CACHE[cache_key]["wavs"]
372
  else:
373
  set_global_seed(sample_seed)
374
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
@@ -392,19 +474,25 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
392
  print(f"[TARO] Inference done: {_n_segs} seg(s) × {int(num_steps)} steps in "
393
  f"{_t_infer_elapsed:.1f}s wall → {_secs_per_step:.3f}s/step "
394
  f"(current constant={TARO_SECS_PER_STEP})")
395
- _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
396
- # Evict oldest entries if cache exceeds max size
397
- while len(_TARO_INFERENCE_CACHE) > _TARO_CACHE_MAXLEN:
398
- _TARO_INFERENCE_CACHE.pop(next(iter(_TARO_INFERENCE_CACHE)))
399
 
400
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
401
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
402
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(final_wav)).unsqueeze(0), TARO_SR)
403
  video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
404
  mux_video_audio(silent_video, audio_path, video_path)
 
 
 
 
 
 
405
  seg_meta = {
406
  "segments": segments,
407
- "wavs": [w.copy() for w in wavs],
408
  "audio_path": audio_path,
409
  "video_path": video_path,
410
  "silent_video": silent_video,
@@ -413,6 +501,8 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
413
  "crossfade_s": crossfade_s,
414
  "crossfade_db": crossfade_db,
415
  "total_dur_s": total_dur_s,
 
 
416
  }
417
  outputs.append((video_path, audio_path, seg_meta))
418
 
@@ -456,10 +546,8 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
456
  if _mmaudio_dir not in sys.path:
457
  sys.path.insert(0, _mmaudio_dir)
458
 
459
- from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
460
  from mmaudio.model.flow_matching import FlowMatching
461
- from mmaudio.model.networks import get_my_mmaudio
462
- from mmaudio.model.utils.features_utils import FeaturesUtils
463
 
464
  seed_val = int(seed_val)
465
  num_samples = int(num_samples)
@@ -469,33 +557,9 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
469
  device = "cuda" if torch.cuda.is_available() else "cpu"
470
  dtype = torch.bfloat16
471
 
472
- # Use large_44k_v2 variant; override paths to our consolidated HF checkpoint repo
473
- model_cfg = all_model_cfg["large_44k_v2"]
474
- # Patch checkpoint paths to our downloaded files
475
- from pathlib import Path as _Path
476
- model_cfg.model_path = _Path(mmaudio_model_path)
477
- model_cfg.vae_path = _Path(mmaudio_vae_path)
478
- model_cfg.synchformer_ckpt = _Path(mmaudio_synchformer_path)
479
- # large_44k_v2 is 44k mode, no BigVGAN vocoder needed
480
- model_cfg.bigvgan_16k_path = None
481
- seq_cfg = model_cfg.seq_cfg # CONFIG_44K: 8 s, 44100 Hz
482
-
483
- # Load network weights
484
- net = get_my_mmaudio(model_cfg.model_name).to(device, dtype).eval()
485
- net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))
486
-
487
- # Load feature utilities: CLIP (auto-downloaded from apple/DFN5B-CLIP-ViT-H-14-384),
488
- # Synchformer (from our repo), VAE (from our repo), no BigVGAN for 44k mode
489
- feature_utils = FeaturesUtils(
490
- tod_vae_ckpt=str(model_cfg.vae_path),
491
- synchformer_ckpt=str(model_cfg.synchformer_ckpt),
492
- enable_conditions=True,
493
- mode=model_cfg.mode, # "44k"
494
- bigvgan_vocoder_ckpt=None,
495
- need_vae_encoder=False,
496
- ).to(device, dtype).eval()
497
 
498
- tmp_dir = tempfile.mkdtemp()
499
  outputs = []
500
 
501
  # Strip original audio so the muxed output only contains the generated track
@@ -510,6 +574,16 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
510
 
511
  sr = seq_cfg.sampling_rate # 44100
512
 
 
 
 
 
 
 
 
 
 
 
513
  for sample_idx in range(num_samples):
514
  rng = torch.Generator(device=device)
515
  if seed_val >= 0:
@@ -522,11 +596,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
522
 
523
  for seg_i, (seg_start, seg_end) in enumerate(segments):
524
  seg_dur = seg_end - seg_start
525
- # Trim a clean video clip for this segment (stream-copy, no re-encode)
526
- seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
527
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
528
- seg_path, vcodec="copy", an=None
529
- ).run(overwrite_output=True, quiet=True)
530
 
531
  fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
532
  video_info = load_video(seg_path, seg_dur)
@@ -575,9 +645,10 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
575
 
576
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
577
  mux_video_audio(silent_video, audio_path, video_path)
 
578
  seg_meta = {
579
  "segments": segments,
580
- "wavs": [w.copy() for w in seg_audios],
581
  "audio_path": audio_path,
582
  "video_path": video_path,
583
  "silent_video": silent_video,
@@ -631,7 +702,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
631
  if _hf_path not in sys.path:
632
  sys.path.insert(0, _hf_path)
633
 
634
- from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
635
  from hunyuanvideo_foley.utils.feature_utils import feature_process
636
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video
637
 
@@ -645,25 +716,9 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
645
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
646
  model_size = model_size.lower() # "xl" or "xxl"
647
 
648
- config_map = {
649
- "xl": "HunyuanVideo-Foley/configs/hunyuanvideo-foley-xl.yaml",
650
- "xxl": "HunyuanVideo-Foley/configs/hunyuanvideo-foley-xxl.yaml",
651
- }
652
- config_path = config_map.get(model_size, config_map["xxl"])
653
 
654
- # hf_hub_download preserves the repo subfolder, so weights land in
655
- # HUNYUAN_MODEL_DIR/HunyuanVideo-Foley/ — pass that as the weights dir.
656
- hunyuan_weights_dir = str(HUNYUAN_MODEL_DIR / "HunyuanVideo-Foley")
657
- print(f"[HunyuanFoley] Loading {model_size.upper()} model from {hunyuan_weights_dir}")
658
- model_dict, cfg = load_model(
659
- hunyuan_weights_dir,
660
- config_path,
661
- device,
662
- enable_offload=False,
663
- model_size=model_size,
664
- )
665
-
666
- tmp_dir = tempfile.mkdtemp()
667
  outputs = []
668
 
669
  # Strip original audio so the muxed output only contains the generated track
@@ -690,6 +745,16 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
690
  neg_prompt=negative_prompt if negative_prompt else None,
691
  )
692
 
 
 
 
 
 
 
 
 
 
 
693
  # Generate audio per segment, then stitch
694
  for sample_idx in range(num_samples):
695
  seg_wavs = []
@@ -697,10 +762,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
697
  _t_hny_start = time.perf_counter()
698
  for seg_i, (seg_start, seg_end) in enumerate(segments):
699
  seg_dur = seg_end - seg_start
700
- seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
701
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
702
- seg_path, vcodec="copy", an=None
703
- ).run(overwrite_output=True, quiet=True)
704
 
705
  # feature_process returns (visual_feats, text_feats, audio_len).
706
  # We discard the returned text_feats (_) and use the pre-computed
@@ -750,9 +812,13 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
750
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
751
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
752
  merge_audio_video(audio_path, silent_video, video_path)
 
 
 
 
753
  seg_meta = {
754
  "segments": segments,
755
- "wavs": [w.copy() for w in seg_wavs],
756
  "audio_path": audio_path,
757
  "video_path": video_path,
758
  "silent_video": silent_video,
@@ -761,6 +827,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
761
  "crossfade_s": crossfade_s,
762
  "crossfade_db": crossfade_db,
763
  "total_dur_s": total_dur_s,
 
764
  }
765
  outputs.append((video_path, audio_path, seg_meta))
766
 
@@ -781,7 +848,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
781
  """Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
782
  Returns (video_path, audio_path, updated_meta, waveform_html).
783
  """
784
- wavs = [w.copy() for w in meta["wavs"]]
785
  wavs[seg_idx]= new_wav
786
  crossfade_s = float(meta["crossfade_s"])
787
  crossfade_db = float(meta["crossfade_db"])
@@ -830,15 +897,14 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
830
  else:
831
  mux_video_audio(silent_video, audio_path, video_path)
832
 
 
 
833
  updated_meta = dict(meta)
834
- updated_meta["wavs"] = wavs
835
  updated_meta["audio_path"] = audio_path
836
  updated_meta["video_path"] = video_path
837
 
838
- # Serialise for embedding in waveform HTML data-state (wavs as lists for JSON)
839
- _serialised_meta = dict(updated_meta)
840
- _serialised_meta["wavs"] = [w.tolist() for w in wavs]
841
- state_json_new = json.dumps(_serialised_meta)
842
 
843
  waveform_html = _build_waveform_html(audio_path, segments, slot_id, "",
844
  state_json=state_json_new,
@@ -872,36 +938,27 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
872
  if _taro_dir not in sys.path:
873
  sys.path.insert(0, _taro_dir)
874
 
875
- from TARO.cavp_util import Extract_CAVP_Features
876
- from TARO.onset_util import VideoOnsetNet, extract_onset
877
- from TARO.models import MMDiT
878
- from TARO.samplers import euler_sampler, euler_maruyama_sampler
879
- from diffusers import AutoencoderKL
880
- from transformers import SpeechT5HifiGan
881
 
882
- silent_video = meta["silent_video"]
883
- tmp_dir = tempfile.mkdtemp()
 
 
 
 
 
 
 
 
 
 
 
 
 
884
 
885
- extract_cavp = Extract_CAVP_Features(device=device, config_path="TARO/cavp/cavp.yaml", ckpt_path=cavp_ckpt_path)
886
- raw_sd = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
887
- onset_sd = {}
888
- for k, v in raw_sd.items():
889
- if "model.net.model" in k: k = k.replace("model.net.model", "net.model")
890
- elif "model.fc." in k: k = k.replace("model.fc", "fc")
891
- onset_sd[k] = v
892
- onset_model = VideoOnsetNet(pretrained=False).to(device)
893
- onset_model.load_state_dict(onset_sd)
894
- onset_model.eval()
895
- model_net = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
896
- model_net.load_state_dict(torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"])
897
- model_net.eval().to(weight_dtype)
898
- vae = AutoencoderKL.from_pretrained("cvssp/audioldm2", subfolder="vae").to(device).eval()
899
- vocoder = SpeechT5HifiGan.from_pretrained("cvssp/audioldm2", subfolder="vocoder").to(device)
900
- latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
901
 
902
- cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
903
  set_global_seed(random.randint(0, 2**32 - 1))
904
- onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
905
 
906
  new_wav = _taro_infer_segment(
907
  model_net, vae, vocoder, cavp_feats, onset_feats,
@@ -910,14 +967,9 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
910
  euler_sampler, euler_maruyama_sampler,
911
  )
912
 
913
- # Deserialise stored wavs from lists back to numpy arrays (json roundtrip)
914
- stored_wavs = [np.array(w, dtype=np.float32) for w in meta["wavs"]]
915
- meta["wavs"] = stored_wavs
916
-
917
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
918
  new_wav, seg_idx, meta, slot_id
919
  )
920
- updated_meta["wavs"] = [w.tolist() for w in updated_meta["wavs"]]
921
  return video_path, audio_path, json.dumps(updated_meta), waveform_html
922
 
923
 
@@ -944,30 +996,13 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
944
  if _mmaudio_dir not in sys.path:
945
  sys.path.insert(0, _mmaudio_dir)
946
 
947
- from mmaudio.eval_utils import all_model_cfg, generate, load_video
948
- from mmaudio.model.flow_matching import FlowMatching
949
- from mmaudio.model.networks import get_my_mmaudio
950
- from mmaudio.model.utils.features_utils import FeaturesUtils
951
- from pathlib import Path as _Path
952
 
953
  device = "cuda" if torch.cuda.is_available() else "cpu"
954
  dtype = torch.bfloat16
955
 
956
- model_cfg = all_model_cfg["large_44k_v2"]
957
- model_cfg.model_path = _Path(mmaudio_model_path)
958
- model_cfg.vae_path = _Path(mmaudio_vae_path)
959
- model_cfg.synchformer_ckpt = _Path(mmaudio_synchformer_path)
960
- model_cfg.bigvgan_16k_path = None
961
- seq_cfg = model_cfg.seq_cfg
962
-
963
- net = get_my_mmaudio(model_cfg.model_name).to(device, dtype).eval()
964
- net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))
965
- feature_utils = FeaturesUtils(
966
- tod_vae_ckpt=str(model_cfg.vae_path),
967
- synchformer_ckpt=str(model_cfg.synchformer_ckpt),
968
- enable_conditions=True, mode=model_cfg.mode,
969
- bigvgan_vocoder_ckpt=None, need_vae_encoder=False,
970
- ).to(device, dtype).eval()
971
 
972
  sr = seq_cfg.sampling_rate
973
  silent_video = meta["silent_video"]
@@ -999,14 +1034,11 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
999
  seg_samples = int(round(seg_dur * sr))
1000
  new_wav = new_wav[:, :seg_samples]
1001
 
1002
- stored_wavs = [np.array(w, dtype=np.float32) for w in meta["wavs"]]
1003
- meta["wavs"] = stored_wavs
1004
  meta["sr"] = sr
1005
 
1006
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1007
  new_wav, seg_idx, meta, slot_id
1008
  )
1009
- updated_meta["wavs"] = [w.tolist() for w in updated_meta["wavs"]]
1010
  return video_path, audio_path, json.dumps(updated_meta), waveform_html
1011
 
1012
 
@@ -1035,19 +1067,11 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1035
  if _hf_path not in sys.path:
1036
  sys.path.insert(0, _hf_path)
1037
 
1038
- from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
1039
- from hunyuanvideo_foley.utils.feature_utils import feature_process
1040
 
1041
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1042
- model_size = model_size.lower()
1043
- config_map = {
1044
- "xl": "HunyuanVideo-Foley/configs/hunyuanvideo-foley-xl.yaml",
1045
- "xxl": "HunyuanVideo-Foley/configs/hunyuanvideo-foley-xxl.yaml",
1046
- }
1047
- config_path = config_map.get(model_size, config_map["xxl"])
1048
- hunyuan_weights_dir = str(HUNYUAN_MODEL_DIR / "HunyuanVideo-Foley")
1049
- model_dict, cfg = load_model(hunyuan_weights_dir, config_path, device,
1050
- enable_offload=False, model_size=model_size)
1051
 
1052
  set_global_seed(random.randint(0, 2**32 - 1))
1053
 
@@ -1058,10 +1082,19 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1058
  seg_path, vcodec="copy", an=None
1059
  ).run(overwrite_output=True, quiet=True)
1060
 
1061
- visual_feats, text_feats, seg_audio_len = feature_process(
1062
- seg_path, prompt if prompt else "", model_dict, cfg,
1063
- neg_prompt=negative_prompt if negative_prompt else None,
1064
- )
 
 
 
 
 
 
 
 
 
1065
  audio_batch, sr = denoise_process(
1066
  visual_feats, text_feats, seg_audio_len, model_dict, cfg,
1067
  guidance_scale=float(guidance_scale),
@@ -1072,14 +1105,11 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1072
  seg_samples = int(round(seg_dur * sr))
1073
  new_wav = new_wav[:, :seg_samples]
1074
 
1075
- stored_wavs = [np.array(w, dtype=np.float32) for w in meta["wavs"]]
1076
- meta["wavs"] = stored_wavs
1077
  meta["sr"] = sr
1078
 
1079
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1080
  new_wav, seg_idx, meta, slot_id
1081
  )
1082
- updated_meta["wavs"] = [w.tolist() for w in updated_meta["wavs"]]
1083
  return video_path, audio_path, json.dumps(updated_meta), waveform_html
1084
 
1085
 
@@ -1093,7 +1123,7 @@ def _pad_outputs(outputs: list) -> list:
1093
  Each entry in *outputs* must be a (video_path, audio_path, seg_meta) tuple where
1094
  seg_meta = {"segments": [...], "audio_path": str, "video_path": str,
1095
  "sr": int, "model": str, "crossfade_s": float,
1096
- "crossfade_db": float, "wavs": list[np.ndarray]}
1097
  """
1098
  result = []
1099
  for i in range(MAX_SLOTS):
@@ -1180,9 +1210,9 @@ def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
1180
  if not audio_path or not os.path.exists(audio_path):
1181
  return "<p style='color:#888;font-size:12px'>No audio yet.</p>"
1182
 
1183
- with open(audio_path, "rb") as f:
1184
- b64 = base64.b64encode(f.read()).decode()
1185
- data_uri = f"data:audio/wav;base64,{b64}"
1186
 
1187
  segs_json = json.dumps(segments)
1188
 
@@ -1365,17 +1395,15 @@ def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
1365
  }}
1366
  }}, 50);
1367
 
1368
- // ── Decode audio ───────────────────────────────────────────────────
1369
- const b64str = '{b64}';
1370
- const bin = atob(b64str);
1371
- const buf = new Uint8Array(bin.length);
1372
- for (let i=0; i<bin.length; i++) buf[i]=bin.charCodeAt(i);
1373
-
1374
- const AudioCtx = window.AudioContext || window.webkitAudioContext;
1375
- if (AudioCtx) {{
1376
- const tmpCtx = new AudioCtx({{sampleRate:44100}});
1377
- try {{
1378
- tmpCtx.decodeAudioData(buf.buffer.slice(0),
1379
  function(ab) {{
1380
  try {{ tmpCtx.close(); }} catch(e) {{}}
1381
  function tryDraw() {{
@@ -1387,8 +1415,8 @@ def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
1387
  }},
1388
  function(err) {{}}
1389
  );
1390
- }} catch(e) {{}}
1391
- }}
1392
  }})();
1393
  </script>
1394
  </body>
@@ -1415,7 +1443,7 @@ def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
1415
  </div>
1416
  <div style="display:flex;align-items:center;gap:8px;margin-top:6px;">
1417
  <span style="color:#888;font-size:11px;">Click a segment to regenerate &nbsp;|&nbsp; Playhead syncs to video</span>
1418
- <a href="{data_uri}" download="audio_{slot_id}.wav"
1419
  style="margin-left:auto;background:#333;color:#eee;border:1px solid #555;
1420
  border-radius:4px;padding:3px 10px;font-size:12px;text-decoration:none;">
1421
  &#8595; Audio</a>{f'''
@@ -1875,12 +1903,6 @@ with gr.Blocks(title="Generate Audio for Video", css=_SLOT_CSS, js=_GLOBAL_JS) a
1875
 
1876
  def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
1877
  flat = generate_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n)
1878
- # Serialise wavs in meta to JSON-safe lists
1879
- for i in range(MAX_SLOTS):
1880
- meta = flat[i * 3 + 2]
1881
- if meta is not None:
1882
- meta["wavs"] = [w.tolist() for w in meta["wavs"]]
1883
- flat[i * 3 + 2] = meta
1884
  return _unpack_outputs(flat, n, "taro")
1885
 
1886
  # Split group visibility into a separate .then() to avoid Gradio 5 SSR
@@ -1977,11 +1999,6 @@ with gr.Blocks(title="Generate Audio for Video", css=_SLOT_CSS, js=_GLOBAL_JS) a
1977
 
1978
  def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
1979
  flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n)
1980
- for i in range(MAX_SLOTS):
1981
- meta = flat[i * 3 + 2]
1982
- if meta is not None:
1983
- meta["wavs"] = [w.tolist() for w in meta["wavs"]]
1984
- flat[i * 3 + 2] = meta
1985
  return _unpack_outputs(flat, n, "mma")
1986
 
1987
  (mma_btn.click(
@@ -2069,11 +2086,6 @@ with gr.Blocks(title="Generate Audio for Video", css=_SLOT_CSS, js=_GLOBAL_JS) a
2069
 
2070
  def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
2071
  flat = generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n)
2072
- for i in range(MAX_SLOTS):
2073
- meta = flat[i * 3 + 2]
2074
- if meta is not None:
2075
- meta["wavs"] = [w.tolist() for w in meta["wavs"]]
2076
- flat[i * 3 + 2] = meta
2077
  return _unpack_outputs(flat, n, "hf")
2078
 
2079
  (hf_btn.click(
 
11
  import os
12
  import sys
13
  import json
 
14
  import tempfile
15
  import random
16
  import threading
 
110
  overwrite_output=True, quiet=True
111
  )
112
 
113
# ------------------------------------------------------------------ #
# Temp directory registry — tracks dirs for cleanup on new generation #
# ------------------------------------------------------------------ #
_TEMP_DIRS: list = []   # tmp_dir paths created by generate_*, oldest first
_TEMP_DIRS_MAX = 10     # keep at most this many; older ones get cleaned up


def _register_tmp_dir(tmp_dir: str) -> str:
    """Register *tmp_dir* for deferred cleanup and return it unchanged.

    Keeps at most ``_TEMP_DIRS_MAX`` directories alive; once the limit is
    exceeded, the oldest registered directories are removed from disk.
    """
    import shutil  # local import: only needed on the eviction path
    _TEMP_DIRS.append(tmp_dir)
    while len(_TEMP_DIRS) > _TEMP_DIRS_MAX:
        old = _TEMP_DIRS.pop(0)
        # ignore_errors=True already makes rmtree best-effort (it swallows
        # OSErrors internally), so the former try/except wrapper was dead code.
        shutil.rmtree(old, ignore_errors=True)
        print(f"[cleanup] Removed old temp dir: {old}")
    return tmp_dir
131
+
132
+
133
def _save_seg_wavs(wavs: list, tmp_dir: str, prefix: str) -> list:
    """Persist segment wav arrays as .npy files and return their paths.

    Storing file paths (instead of serialised arrays) keeps large float
    arrays out of the JSON/HTML data-state.
    """
    targets = [os.path.join(tmp_dir, f"{prefix}_seg{i}.npy") for i in range(len(wavs))]
    for target, wav in zip(targets, wavs):
        np.save(target, wav)
    return targets
142
+
143
+
144
def _load_seg_wavs(paths: list) -> list:
    """Load segment wav arrays back from their .npy file paths."""
    return list(map(np.load, paths))
147
+
148
+
149
+ # ------------------------------------------------------------------ #
150
+ # Shared model-loading helpers (deduplicate generate / regen code) #
151
+ # ------------------------------------------------------------------ #
152
+
153
def _load_taro_models(device, weight_dtype):
    """Load TARO MMDiT + AudioLDM2 VAE/vocoder. Returns (model_net, vae, vocoder, latents_scale)."""
    from TARO.models import MMDiT
    from diffusers import AutoencoderKL
    from transformers import SpeechT5HifiGan

    # Diffusion transformer: restore EMA weights, then freeze in eval mode
    # at the requested precision.
    net = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
    ckpt = torch.load(taro_ckpt_path, map_location=device, weights_only=False)
    net.load_state_dict(ckpt["ema"])
    net.eval().to(weight_dtype)

    # AudioLDM2 decoding stack: VAE + HiFi-GAN vocoder, plus the latent
    # scaling constant used when decoding.
    vae = AutoencoderKL.from_pretrained("cvssp/audioldm2", subfolder="vae").to(device).eval()
    vocoder = SpeechT5HifiGan.from_pretrained("cvssp/audioldm2", subfolder="vocoder").to(device)
    scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
    return net, vae, vocoder, scale
166
+
167
+
168
def _load_taro_feature_extractors(device):
    """Load CAVP + onset extractors. Returns (extract_cavp, onset_model)."""
    from TARO.cavp_util import Extract_CAVP_Features
    from TARO.onset_util import VideoOnsetNet

    cavp = Extract_CAVP_Features(
        device=device, config_path="TARO/cavp/cavp.yaml", ckpt_path=cavp_ckpt_path,
    )

    # Remap checkpoint keys onto VideoOnsetNet's module layout
    # (mirrors the key renaming done in the original TARO infer.py).
    state = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
    remapped = {}
    for key, tensor in state.items():
        if "model.net.model" in key:
            key = key.replace("model.net.model", "net.model")
        elif "model.fc." in key:
            key = key.replace("model.fc", "fc")
        remapped[key] = tensor

    onset_net = VideoOnsetNet(pretrained=False).to(device)
    onset_net.load_state_dict(remapped)
    onset_net.eval()
    return cavp, onset_net
186
+
187
+
188
def _load_mmaudio_models(device, dtype):
    """Load MMAudio net + feature_utils. Returns (net, feature_utils, model_cfg, seq_cfg)."""
    from mmaudio.eval_utils import all_model_cfg
    from mmaudio.model.networks import get_my_mmaudio
    from mmaudio.model.utils.features_utils import FeaturesUtils
    from pathlib import Path as _Path

    # large_44k_v2 variant, with all checkpoint paths redirected to our
    # downloaded files; 44k mode needs no BigVGAN vocoder.
    model_cfg = all_model_cfg["large_44k_v2"]
    model_cfg.model_path = _Path(mmaudio_model_path)
    model_cfg.vae_path = _Path(mmaudio_vae_path)
    model_cfg.synchformer_ckpt = _Path(mmaudio_synchformer_path)
    model_cfg.bigvgan_16k_path = None
    seq_cfg = model_cfg.seq_cfg

    # Network weights.
    mmaudio_net = get_my_mmaudio(model_cfg.model_name).to(device, dtype).eval()
    mmaudio_net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))

    # Conditioning/feature extraction stack (no VAE encoder needed here).
    extractor = FeaturesUtils(
        tod_vae_ckpt=str(model_cfg.vae_path),
        synchformer_ckpt=str(model_cfg.synchformer_ckpt),
        enable_conditions=True,
        mode=model_cfg.mode,
        bigvgan_vocoder_ckpt=None,
        need_vae_encoder=False,
    ).to(device, dtype).eval()
    return mmaudio_net, extractor, model_cfg, seq_cfg
211
+
212
+
213
def _load_hunyuan_model(device, model_size):
    """Load HunyuanFoley model dict + config. Returns (model_dict, cfg)."""
    from hunyuanvideo_foley.utils.model_utils import load_model

    size = model_size.lower()
    # Any unrecognised size string falls back to the XXL config.
    configs = {
        "xl": "HunyuanVideo-Foley/configs/hunyuanvideo-foley-xl.yaml",
        "xxl": "HunyuanVideo-Foley/configs/hunyuanvideo-foley-xxl.yaml",
    }
    cfg_path = configs.get(size, configs["xxl"])
    # hf_hub_download preserves the repo subfolder, so weights land under
    # HUNYUAN_MODEL_DIR/HunyuanVideo-Foley/ — pass that as the weights dir.
    weights_dir = str(HUNYUAN_MODEL_DIR / "HunyuanVideo-Foley")
    print(f"[HunyuanFoley] Loading {size.upper()} model from {weights_dir}")
    return load_model(weights_dir, cfg_path, device,
                      enable_offload=False, model_size=size)
226
+
227
+
228
  def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
229
  """Mux a silent video with an audio file into *output_path* (stream-copy video, encode audio)."""
230
  ffmpeg.output(
 
307
 
308
_TARO_CACHE_MAXLEN = 16  # evict oldest entries beyond this limit
_TARO_INFERENCE_CACHE: dict = {}  # keyed by (video_file, seed, cfg, steps, mode, crossfade_s)
# Guards cache reads/evictions across concurrent generations. `threading` is
# already imported at the top of the file, so the mid-file re-import was
# redundant and has been dropped.
_TARO_CACHE_LOCK = threading.Lock()
312
 
313
 
314
  def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
 
424
  if _taro_dir not in sys.path:
425
  sys.path.insert(0, _taro_dir)
426
 
427
+ from TARO.onset_util import extract_onset
428
+ from TARO.samplers import euler_sampler, euler_maruyama_sampler
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
 
430
+ extract_cavp, onset_model = _load_taro_feature_extractors(device)
431
+ model, vae, vocoder, latents_scale = _load_taro_models(device, weight_dtype)
 
 
 
432
 
433
  # -- Prepare silent video (shared across all samples) --
434
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
435
  silent_video = os.path.join(tmp_dir, "silent_input.mp4")
436
  strip_audio_from_video(video_file, silent_video)
437
 
 
446
  sample_seed = seed_val + sample_idx
447
  cache_key = (video_file, sample_seed, float(cfg_scale), int(num_steps), mode, crossfade_s)
448
 
449
+ with _TARO_CACHE_LOCK:
450
+ cached = _TARO_INFERENCE_CACHE.get(cache_key)
451
+ if cached is not None:
452
  print(f"[TARO] Sample {sample_idx+1}: cache hit.")
453
+ wavs = cached["wavs"]
454
  else:
455
  set_global_seed(sample_seed)
456
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
 
474
  print(f"[TARO] Inference done: {_n_segs} seg(s) × {int(num_steps)} steps in "
475
  f"{_t_infer_elapsed:.1f}s wall → {_secs_per_step:.3f}s/step "
476
  f"(current constant={TARO_SECS_PER_STEP})")
477
+ with _TARO_CACHE_LOCK:
478
+ _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
479
+ while len(_TARO_INFERENCE_CACHE) > _TARO_CACHE_MAXLEN:
480
+ _TARO_INFERENCE_CACHE.pop(next(iter(_TARO_INFERENCE_CACHE)))
481
 
482
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
483
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
484
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(final_wav)).unsqueeze(0), TARO_SR)
485
  video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
486
  mux_video_audio(silent_video, audio_path, video_path)
487
+ wav_paths = _save_seg_wavs(wavs, tmp_dir, f"taro_{sample_idx}")
488
+ # Cache CAVP + onset features so regen can skip re-extraction (~5-7s saved)
489
+ cavp_path = os.path.join(tmp_dir, f"taro_{sample_idx}_cavp.npy")
490
+ onset_path = os.path.join(tmp_dir, f"taro_{sample_idx}_onset.npy")
491
+ np.save(cavp_path, cavp_feats)
492
+ np.save(onset_path, onset_feats)
493
  seg_meta = {
494
  "segments": segments,
495
+ "wav_paths": wav_paths,
496
  "audio_path": audio_path,
497
  "video_path": video_path,
498
  "silent_video": silent_video,
 
501
  "crossfade_s": crossfade_s,
502
  "crossfade_db": crossfade_db,
503
  "total_dur_s": total_dur_s,
504
+ "cavp_path": cavp_path,
505
+ "onset_path": onset_path,
506
  }
507
  outputs.append((video_path, audio_path, seg_meta))
508
 
 
546
  if _mmaudio_dir not in sys.path:
547
  sys.path.insert(0, _mmaudio_dir)
548
 
549
+ from mmaudio.eval_utils import generate, load_video, make_video
550
  from mmaudio.model.flow_matching import FlowMatching
 
 
551
 
552
  seed_val = int(seed_val)
553
  num_samples = int(num_samples)
 
557
  device = "cuda" if torch.cuda.is_available() else "cpu"
558
  dtype = torch.bfloat16
559
 
560
+ net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
 
562
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
563
  outputs = []
564
 
565
  # Strip original audio so the muxed output only contains the generated track
 
574
 
575
  sr = seq_cfg.sampling_rate # 44100
576
 
577
+ # Pre-extract all segment clips once (shared across samples, saves ffmpeg overhead)
578
+ seg_clip_paths = []
579
+ for seg_i, (seg_start, seg_end) in enumerate(segments):
580
+ seg_dur = seg_end - seg_start
581
+ seg_path = os.path.join(tmp_dir, f"mma_seg_{seg_i}.mp4")
582
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
583
+ seg_path, vcodec="copy", an=None
584
+ ).run(overwrite_output=True, quiet=True)
585
+ seg_clip_paths.append(seg_path)
586
+
587
  for sample_idx in range(num_samples):
588
  rng = torch.Generator(device=device)
589
  if seed_val >= 0:
 
596
 
597
  for seg_i, (seg_start, seg_end) in enumerate(segments):
598
  seg_dur = seg_end - seg_start
599
+ seg_path = seg_clip_paths[seg_i]
 
 
 
 
600
 
601
  fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
602
  video_info = load_video(seg_path, seg_dur)
 
645
 
646
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
647
  mux_video_audio(silent_video, audio_path, video_path)
648
+ wav_paths = _save_seg_wavs(seg_audios, tmp_dir, f"mmaudio_{sample_idx}")
649
  seg_meta = {
650
  "segments": segments,
651
+ "wav_paths": wav_paths,
652
  "audio_path": audio_path,
653
  "video_path": video_path,
654
  "silent_video": silent_video,
 
702
  if _hf_path not in sys.path:
703
  sys.path.insert(0, _hf_path)
704
 
705
+ from hunyuanvideo_foley.utils.model_utils import denoise_process
706
  from hunyuanvideo_foley.utils.feature_utils import feature_process
707
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video
708
 
 
716
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
717
  model_size = model_size.lower() # "xl" or "xxl"
718
 
719
+ model_dict, cfg = _load_hunyuan_model(device, model_size)
 
 
 
 
720
 
721
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
 
 
 
 
 
 
 
 
 
 
 
 
722
  outputs = []
723
 
724
  # Strip original audio so the muxed output only contains the generated track
 
745
  neg_prompt=negative_prompt if negative_prompt else None,
746
  )
747
 
748
+ # Pre-extract all segment clips once (shared across samples, saves ffmpeg overhead)
749
+ hny_seg_clip_paths = []
750
+ for seg_i, (seg_start, seg_end) in enumerate(segments):
751
+ seg_dur = seg_end - seg_start
752
+ seg_path = os.path.join(tmp_dir, f"hny_seg_{seg_i}.mp4")
753
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
754
+ seg_path, vcodec="copy", an=None
755
+ ).run(overwrite_output=True, quiet=True)
756
+ hny_seg_clip_paths.append(seg_path)
757
+
758
  # Generate audio per segment, then stitch
759
  for sample_idx in range(num_samples):
760
  seg_wavs = []
 
762
  _t_hny_start = time.perf_counter()
763
  for seg_i, (seg_start, seg_end) in enumerate(segments):
764
  seg_dur = seg_end - seg_start
765
+ seg_path = hny_seg_clip_paths[seg_i]
 
 
 
766
 
767
  # feature_process returns (visual_feats, text_feats, audio_len).
768
  # We discard the returned text_feats (_) and use the pre-computed
 
812
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
813
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
814
  merge_audio_video(audio_path, silent_video, video_path)
815
+ wav_paths = _save_seg_wavs(seg_wavs, tmp_dir, f"hunyuan_{sample_idx}")
816
+ # Cache text features so regen can skip text encoding (~2-3s saved)
817
+ text_feats_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}_text_feats.pt")
818
+ torch.save(text_feats, text_feats_path)
819
  seg_meta = {
820
  "segments": segments,
821
+ "wav_paths": wav_paths,
822
  "audio_path": audio_path,
823
  "video_path": video_path,
824
  "silent_video": silent_video,
 
827
  "crossfade_s": crossfade_s,
828
  "crossfade_db": crossfade_db,
829
  "total_dur_s": total_dur_s,
830
+ "text_feats_path": text_feats_path,
831
  }
832
  outputs.append((video_path, audio_path, seg_meta))
833
 
 
848
  """Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
849
  Returns (video_path, audio_path, updated_meta, waveform_html).
850
  """
851
+ wavs = _load_seg_wavs(meta["wav_paths"])
852
  wavs[seg_idx]= new_wav
853
  crossfade_s = float(meta["crossfade_s"])
854
  crossfade_db = float(meta["crossfade_db"])
 
897
  else:
898
  mux_video_audio(silent_video, audio_path, video_path)
899
 
900
+ # Save updated segment wavs to .npy files
901
+ updated_wav_paths = _save_seg_wavs(wavs, tmp_dir, os.path.splitext(_base_clean)[0])
902
  updated_meta = dict(meta)
903
+ updated_meta["wav_paths"] = updated_wav_paths
904
  updated_meta["audio_path"] = audio_path
905
  updated_meta["video_path"] = video_path
906
 
907
+ state_json_new = json.dumps(updated_meta)
 
 
 
908
 
909
  waveform_html = _build_waveform_html(audio_path, segments, slot_id, "",
910
  state_json=state_json_new,
 
938
  if _taro_dir not in sys.path:
939
  sys.path.insert(0, _taro_dir)
940
 
941
+ from TARO.samplers import euler_sampler, euler_maruyama_sampler
 
 
 
 
 
942
 
943
+ # Load cached CAVP + onset features if available (saves ~5-7s of GPU work)
944
+ cavp_path = meta.get("cavp_path")
945
+ onset_path = meta.get("onset_path")
946
+ if cavp_path and os.path.exists(cavp_path) and onset_path and os.path.exists(onset_path):
947
+ print("[TARO regen] Loading cached CAVP + onset features")
948
+ cavp_feats = np.load(cavp_path)
949
+ onset_feats = np.load(onset_path)
950
+ else:
951
+ print("[TARO regen] Cache miss — re-extracting CAVP + onset features")
952
+ from TARO.onset_util import extract_onset
953
+ extract_cavp, onset_model = _load_taro_feature_extractors(device)
954
+ silent_video = meta["silent_video"]
955
+ tmp_dir = tempfile.mkdtemp()
956
+ cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
957
+ onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
958
 
959
+ model_net, vae, vocoder, latents_scale = _load_taro_models(device, weight_dtype)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
960
 
 
961
  set_global_seed(random.randint(0, 2**32 - 1))
 
962
 
963
  new_wav = _taro_infer_segment(
964
  model_net, vae, vocoder, cavp_feats, onset_feats,
 
967
  euler_sampler, euler_maruyama_sampler,
968
  )
969
 
 
 
 
 
970
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
971
  new_wav, seg_idx, meta, slot_id
972
  )
 
973
  return video_path, audio_path, json.dumps(updated_meta), waveform_html
974
 
975
 
 
996
  if _mmaudio_dir not in sys.path:
997
  sys.path.insert(0, _mmaudio_dir)
998
 
999
+ from mmaudio.eval_utils import generate, load_video
1000
+ from mmaudio.model.flow_matching import FlowMatching
 
 
 
1001
 
1002
  device = "cuda" if torch.cuda.is_available() else "cpu"
1003
  dtype = torch.bfloat16
1004
 
1005
+ net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1006
 
1007
  sr = seq_cfg.sampling_rate
1008
  silent_video = meta["silent_video"]
 
1034
  seg_samples = int(round(seg_dur * sr))
1035
  new_wav = new_wav[:, :seg_samples]
1036
 
 
 
1037
  meta["sr"] = sr
1038
 
1039
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1040
  new_wav, seg_idx, meta, slot_id
1041
  )
 
1042
  return video_path, audio_path, json.dumps(updated_meta), waveform_html
1043
 
1044
 
 
1067
  if _hf_path not in sys.path:
1068
  sys.path.insert(0, _hf_path)
1069
 
1070
+ from hunyuanvideo_foley.utils.model_utils import denoise_process
1071
+ from hunyuanvideo_foley.utils.feature_utils import feature_process
1072
 
1073
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1074
+ model_dict, cfg = _load_hunyuan_model(device, model_size)
 
 
 
 
 
 
 
 
1075
 
1076
  set_global_seed(random.randint(0, 2**32 - 1))
1077
 
 
1082
  seg_path, vcodec="copy", an=None
1083
  ).run(overwrite_output=True, quiet=True)
1084
 
1085
+ # Load cached text features if available (saves ~2-3s text encoding)
1086
+ text_feats_path = meta.get("text_feats_path")
1087
+ if text_feats_path and os.path.exists(text_feats_path):
1088
+ print("[HunyuanFoley regen] Loading cached text features, extracting visual only")
1089
+ from hunyuanvideo_foley.utils.feature_utils import encode_video_features
1090
+ visual_feats, seg_audio_len = encode_video_features(seg_path, model_dict)
1091
+ text_feats = torch.load(text_feats_path, map_location=device, weights_only=False)
1092
+ else:
1093
+ print("[HunyuanFoley regen] Cache miss — extracting text + visual features")
1094
+ visual_feats, text_feats, seg_audio_len = feature_process(
1095
+ seg_path, prompt if prompt else "", model_dict, cfg,
1096
+ neg_prompt=negative_prompt if negative_prompt else None,
1097
+ )
1098
  audio_batch, sr = denoise_process(
1099
  visual_feats, text_feats, seg_audio_len, model_dict, cfg,
1100
  guidance_scale=float(guidance_scale),
 
1105
  seg_samples = int(round(seg_dur * sr))
1106
  new_wav = new_wav[:, :seg_samples]
1107
 
 
 
1108
  meta["sr"] = sr
1109
 
1110
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1111
  new_wav, seg_idx, meta, slot_id
1112
  )
 
1113
  return video_path, audio_path, json.dumps(updated_meta), waveform_html
1114
 
1115
 
 
1123
  Each entry in *outputs* must be a (video_path, audio_path, seg_meta) tuple where
1124
  seg_meta = {"segments": [...], "audio_path": str, "video_path": str,
1125
  "sr": int, "model": str, "crossfade_s": float,
1126
+ "crossfade_db": float, "wav_paths": list[str]}
1127
  """
1128
  result = []
1129
  for i in range(MAX_SLOTS):
 
1210
  if not audio_path or not os.path.exists(audio_path):
1211
  return "<p style='color:#888;font-size:12px'>No audio yet.</p>"
1212
 
1213
+ # Serve audio via Gradio's file API instead of base64-encoding the entire
1214
+ # WAV inline. For a 25s stereo 44.1kHz track this saves ~5 MB per slot.
1215
+ audio_url = f"/gradio_api/file={audio_path}"
1216
 
1217
  segs_json = json.dumps(segments)
1218
 
 
1395
  }}
1396
  }}, 50);
1397
 
1398
+ // ── Fetch + decode audio from Gradio file API ──────────────────────
1399
+ const audioUrl = '{audio_url}';
1400
+ fetch(audioUrl)
1401
+ .then(function(r) {{ return r.arrayBuffer(); }})
1402
+ .then(function(arrayBuf) {{
1403
+ const AudioCtx = window.AudioContext || window.webkitAudioContext;
1404
+ if (!AudioCtx) return;
1405
+ const tmpCtx = new AudioCtx({{sampleRate:44100}});
1406
+ tmpCtx.decodeAudioData(arrayBuf,
 
 
1407
  function(ab) {{
1408
  try {{ tmpCtx.close(); }} catch(e) {{}}
1409
  function tryDraw() {{
 
1415
  }},
1416
  function(err) {{}}
1417
  );
1418
+ }})
1419
+ .catch(function(e) {{}});
1420
  }})();
1421
  </script>
1422
  </body>
 
1443
  </div>
1444
  <div style="display:flex;align-items:center;gap:8px;margin-top:6px;">
1445
  <span style="color:#888;font-size:11px;">Click a segment to regenerate &nbsp;|&nbsp; Playhead syncs to video</span>
1446
+ <a href="{audio_url}" download="audio_{slot_id}.wav"
1447
  style="margin-left:auto;background:#333;color:#eee;border:1px solid #555;
1448
  border-radius:4px;padding:3px 10px;font-size:12px;text-decoration:none;">
1449
  &#8595; Audio</a>{f'''
 
1903
 
1904
  def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
1905
  flat = generate_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n)
 
 
 
 
 
 
1906
  return _unpack_outputs(flat, n, "taro")
1907
 
1908
  # Split group visibility into a separate .then() to avoid Gradio 5 SSR
 
1999
 
2000
  def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
2001
  flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n)
 
 
 
 
 
2002
  return _unpack_outputs(flat, n, "mma")
2003
 
2004
  (mma_btn.click(
 
2086
 
2087
  def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
2088
  flat = generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n)
 
 
 
 
 
2089
  return _unpack_outputs(flat, n, "hf")
2090
 
2091
  (hf_btn.click(