Jack Wu committed on
Commit
6cf4573
·
1 Parent(s): 53f384c

Restructure app.py: multi-model support (TARO, MMAudio, HunyuanFoley)

Browse files

- Split into three tabbed UI sections, one per model
- Updated checkpoint repo/folder paths to JackIsNotInTheBox/Generate_Audio_for_Video_Checkpoints
with TARO/, MMAudio/, HunyuanFoley/ subfolders
- TARO: preserve exact infer.py invocation (CAVP+VideoOnsetNet+MMDiT+AudioLDM2 decoder)
with sliding-window segmentation; fix samplers tuple return indexing
- MMAudio: use official load_video()+generate() pipeline from gradio_demo.py;
override model paths to our HF checkpoint repo; large_44k_v2 variant
- HunyuanFoley: use official load_model()+feature_process()+denoise_process()
pipeline; batch inference for multiple samples; xl/xxl size selector
- Add model-specific optimal duration constants from source configs
- Shared slot UI helper; MAX_SLOTS=8 maintained across all tabs

TARO/README.md ADDED
File without changes
{cavp β†’ TARO/cavp}/cavp.yaml RENAMED
File without changes
{cavp β†’ TARO/cavp}/model/cavp_model.py RENAMED
File without changes
{cavp β†’ TARO/cavp}/model/cavp_modules.py RENAMED
File without changes
cavp_util.py β†’ TARO/cavp_util.py RENAMED
File without changes
dataset.py β†’ TARO/dataset.py RENAMED
File without changes
infer.py β†’ TARO/infer.py RENAMED
File without changes
loss.py β†’ TARO/loss.py RENAMED
File without changes
models.py β†’ TARO/models.py RENAMED
File without changes
onset_util.py β†’ TARO/onset_util.py RENAMED
File without changes
{preprocess β†’ TARO/preprocess}/extract_cavp.py RENAMED
File without changes
{preprocess β†’ TARO/preprocess}/extract_fbank.py RENAMED
File without changes
{preprocess β†’ TARO/preprocess}/extract_mel.py RENAMED
File without changes
{preprocess β†’ TARO/preprocess}/extract_onset.py RENAMED
File without changes
samplers.py β†’ TARO/samplers.py RENAMED
File without changes
train.py β†’ TARO/train.py RENAMED
@@ -18,10 +18,10 @@ from accelerate import Accelerator
18
  from accelerate.logging import get_logger
19
  from accelerate.utils import ProjectConfiguration, set_seed
20
 
21
- from models import MMDiT
22
- from loss import SILoss
23
 
24
- from dataset import audio_video_spec_fullset_Dataset_Train, collate_fn_taro
25
  from diffusers import AudioLDM2Pipeline
26
  import wandb
27
 
 
18
  from accelerate.logging import get_logger
19
  from accelerate.utils import ProjectConfiguration, set_seed
20
 
21
+ from TARO.models import MMDiT
22
+ from TARO.loss import SILoss
23
 
24
+ from TARO.dataset import audio_video_spec_fullset_Dataset_Train, collate_fn_taro
25
  from diffusers import AudioLDM2Pipeline
26
  import wandb
27
 
train.sh β†’ TARO/train.sh RENAMED
File without changes
app.py CHANGED
@@ -1,8 +1,24 @@
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import subprocess
3
  import sys
4
- from math import ceil, floor
 
 
 
5
 
 
 
 
6
  try:
7
  import mmcv
8
  print("mmcv already installed")
@@ -13,133 +29,169 @@ except ImportError:
13
 
14
  import torch
15
  import numpy as np
16
- import random
17
  import soundfile as sf
18
  import ffmpeg
19
- import tempfile
20
  import spaces
21
  import gradio as gr
22
  from huggingface_hub import hf_hub_download
23
 
24
- REPO_ID = "JackIsNotInTheBox/Taro_checkpoints"
25
- CACHE_DIR = "/tmp/taro_ckpts"
26
- os.makedirs(CACHE_DIR, exist_ok=True)
27
-
28
- print("Downloading checkpoints...")
29
- cavp_ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="cavp_epoch66.ckpt", cache_dir=CACHE_DIR)
30
- onset_ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="onset_model.ckpt", cache_dir=CACHE_DIR)
31
- taro_ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="taro_ckpt.pt", cache_dir=CACHE_DIR)
32
- print("Checkpoints downloaded.")
33
-
34
- # Model constants
35
- SR = 16000
36
- TRUNCATE = 131072
37
- FPS = 4
38
- TRUNCATE_FRAME = int(FPS * TRUNCATE / SR) # 32 cavp frames per model window
39
- TRUNCATE_ONSET = 120 # onset frames per model window
40
- MODEL_DUR = TRUNCATE / SR # 8.192 s
41
- MAX_SLOTS = 8 # max sample output slots in UI
42
- SECS_PER_STEP = 2.5 # estimated seconds of GPU time per diffusion step
43
-
44
- # ------------------------------------------------------------------ #
45
- # Inference cache #
46
- # Key: (video_path, seed, cfg_scale, num_steps, mode, crossfade_s) #
47
- # Value: {"wavs": [...], "total_dur_s": float, #
48
- # "tmp_dir": str, "silent_video": str} #
49
- # ------------------------------------------------------------------ #
50
- _INFERENCE_CACHE = {}
51
 
 
 
 
52
 
53
- def set_global_seed(seed):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  np.random.seed(seed % (2**32))
55
  random.seed(seed)
56
  torch.manual_seed(seed)
57
  torch.cuda.manual_seed(seed)
58
  torch.backends.cudnn.deterministic = True
59
 
 
 
60
 
61
- def strip_audio_from_video(video_path, output_path):
62
- """Strip any existing audio from a video file, outputting a silent video."""
63
- (
64
- ffmpeg
65
- .input(video_path)
66
- .output(output_path, vcodec="libx264", an=None)
67
- .run(overwrite_output=True, quiet=True)
68
- )
69
-
70
-
71
- def get_video_duration(video_path):
72
- """Read video duration in seconds using ffprobe (no GPU needed)."""
73
  probe = ffmpeg.probe(video_path)
74
  return float(probe["format"]["duration"])
75
 
 
 
 
 
 
76
 
77
- def build_segments(total_dur_s, crossfade_s):
78
- """
79
- Build list of (seg_start_s, seg_end_s) segment windows.
80
-
81
- For videos <= MODEL_DUR: single segment [0, total_dur_s].
82
- For longer videos: advance by step_s = MODEL_DUR - crossfade_s each time.
83
- The LAST segment is always anchored at [total_dur_s - MODEL_DUR, total_dur_s]
84
- so it is a full-length window with no zero-padding, giving the best quality
85
- at the tail end of the video.
86
- """
87
- if total_dur_s <= MODEL_DUR:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  return [(0.0, total_dur_s)]
89
-
90
- step_s = MODEL_DUR - crossfade_s
91
- segments = []
92
- seg_start = 0.0
93
  while True:
94
- seg_end = seg_start + MODEL_DUR
95
- if seg_end >= total_dur_s:
96
- # Replace this segment with a full-length tail-anchored window
97
- seg_start = max(0.0, total_dur_s - MODEL_DUR)
98
  segments.append((seg_start, total_dur_s))
99
  break
100
- segments.append((seg_start, seg_start + MODEL_DUR))
101
  seg_start += step_s
102
-
103
  return segments
104
 
105
 
106
- def calc_max_samples(total_dur_s, num_steps, crossfade_s):
107
- """Estimate max samples that fit within the 600s ZeroGPU budget."""
108
- num_segments = len(build_segments(total_dur_s, crossfade_s))
109
- time_per_seg = num_steps * SECS_PER_STEP
110
- budget = 600.0
111
- max_s = floor(budget / (num_segments * time_per_seg))
112
  return max(1, min(max_s, MAX_SLOTS))
113
 
114
 
115
- def infer_segment(model, vae, vocoder, cavp_feats_full, onset_feats_full,
116
- seg_start_s, seg_end_s,
117
- device, weight_dtype,
118
- cfg_scale, num_steps, mode,
119
- latents_scale,
120
- euler_sampler, euler_maruyama_sampler):
121
- """Run one model inference pass. Returns wav trimmed to segment duration."""
 
 
 
122
  # CAVP features (4 fps)
123
- cavp_start = int(round(seg_start_s * FPS))
124
- cavp_slice = cavp_feats_full[cavp_start : cavp_start + TRUNCATE_FRAME]
125
- if cavp_slice.shape[0] < TRUNCATE_FRAME:
126
  pad = np.zeros(
127
- (TRUNCATE_FRAME - cavp_slice.shape[0],) + cavp_slice.shape[1:],
128
  dtype=cavp_slice.dtype,
129
  )
130
  cavp_slice = np.concatenate([cavp_slice, pad], axis=0)
131
- video_feats = torch.from_numpy(cavp_slice).unsqueeze(0).to(device).to(weight_dtype)
132
 
133
- # Onset features
134
- onset_fps = TRUNCATE_ONSET / MODEL_DUR
135
  onset_start = int(round(seg_start_s * onset_fps))
136
- onset_slice = onset_feats_full[onset_start : onset_start + TRUNCATE_ONSET]
137
- if onset_slice.shape[0] < TRUNCATE_ONSET:
138
- pad_len = TRUNCATE_ONSET - onset_slice.shape[0]
139
- onset_slice = np.pad(onset_slice, ((0, pad_len),), mode="constant", constant_values=0)
140
- onset_feats_t = torch.from_numpy(onset_slice).unsqueeze(0).to(device).to(weight_dtype)
 
 
 
 
 
 
141
 
142
- z = torch.randn(1, model.in_channels, 204, 16, device=device).to(weight_dtype)
143
  sampling_kwargs = dict(
144
  model=model,
145
  latents=z,
@@ -153,169 +205,119 @@ def infer_segment(model, vae, vocoder, cavp_feats_full, onset_feats_full,
153
  path_type="linear",
154
  )
155
  with torch.no_grad():
156
- if mode == "sde":
157
- samples = euler_maruyama_sampler(**sampling_kwargs)
158
- else:
159
- samples = euler_sampler(**sampling_kwargs)
160
 
 
161
  samples = vae.decode(samples / latents_scale).sample
162
  wav = vocoder(samples.squeeze().float()).detach().cpu().numpy()
163
- seg_samples = int(round((seg_end_s - seg_start_s) * SR))
164
  return wav[:seg_samples]
165
 
166
 
167
- def crossfade_join(wav_a, wav_b, crossfade_s, db_boost):
168
- """
169
- Join two wav arrays with a crossfade.
170
- Both signals are scaled by gain = 10^(db_boost/20) in the overlap region
171
- and summed, producing a +db_boost bump at the midpoint.
172
- """
173
- cf_samples = int(round(crossfade_s * SR))
174
- cf_samples = min(cf_samples, len(wav_a), len(wav_b))
175
- if cf_samples <= 0:
176
  return np.concatenate([wav_a, wav_b])
177
-
178
  gain = 10 ** (db_boost / 20.0)
179
- overlap = wav_a[-cf_samples:] * gain + wav_b[:cf_samples] * gain
180
-
181
- return np.concatenate([wav_a[:-cf_samples], overlap, wav_b[cf_samples:]])
182
-
183
-
184
- def stitch_wavs(wavs, crossfade_s, db_boost, total_dur_s):
185
- """Stitch segment wavs with crossfades and clip to total_dur_s."""
186
- if len(wavs) == 1:
187
- final_wav = wavs[0]
188
- else:
189
- final_wav = wavs[0]
190
- for nw in wavs[1:]:
191
- final_wav = crossfade_join(final_wav, nw, crossfade_s, db_boost)
192
- return final_wav[:int(round(total_dur_s * SR))]
193
-
194
-
195
- def mux_video_audio(silent_video, audio_path, output_path):
196
- input_v = ffmpeg.input(silent_video)
197
- input_a = ffmpeg.input(audio_path)
198
- (
199
- ffmpeg
200
- .output(input_v, input_a, output_path,
201
- vcodec="libx264", acodec="aac", strict="experimental")
202
- .run(overwrite_output=True, quiet=True)
203
- )
204
 
205
 
206
- # ------------------------------------------------------------------ #
207
- # UI helpers (no GPU) #
208
- # ------------------------------------------------------------------ #
209
-
210
- def on_video_upload(video_file, num_steps, crossfade_s):
211
- """Called when video is uploaded or sliders change. Updates samples slider."""
212
- if video_file is None:
213
- return gr.update(maximum=MAX_SLOTS, value=1)
214
- try:
215
- D = get_video_duration(video_file)
216
- max_s = calc_max_samples(D, int(num_steps), float(crossfade_s))
217
- except Exception:
218
- max_s = MAX_SLOTS
219
- return gr.update(maximum=max_s, value=min(1, max_s))
220
 
221
 
222
- def get_random_seed():
223
- return random.randint(0, 2**32 - 1)
224
-
225
-
226
- # ------------------------------------------------------------------ #
227
- # Main inference #
228
- # ------------------------------------------------------------------ #
229
-
230
  @spaces.GPU(duration=600)
231
- def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode,
232
- crossfade_s, crossfade_db, num_samples):
233
- global _INFERENCE_CACHE
 
234
 
235
  seed_val = int(seed_val)
236
  crossfade_s = float(crossfade_s)
237
  crossfade_db = float(crossfade_db)
238
  num_samples = int(num_samples)
239
-
240
  if seed_val < 0:
241
  seed_val = random.randint(0, 2**32 - 1)
242
 
243
- # Load models once (shared across all samples this call)
244
  torch.set_grad_enabled(False)
245
  device = "cuda" if torch.cuda.is_available() else "cpu"
246
  weight_dtype = torch.bfloat16
247
 
248
- from cavp_util import Extract_CAVP_Features
249
- from onset_util import VideoOnsetNet, extract_onset
250
- from models import MMDiT
251
- from samplers import euler_sampler, euler_maruyama_sampler
252
- from diffusers import AudioLDM2Pipeline
 
253
 
 
254
  extract_cavp = Extract_CAVP_Features(
255
- device=device, config_path="./cavp/cavp.yaml", ckpt_path=cavp_ckpt_path
 
 
256
  )
257
 
258
- state_dict = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
259
- new_state_dict = {}
260
- for key, value in state_dict.items():
261
- if "model.net.model" in key:
262
- new_key = key.replace("model.net.model", "net.model")
263
- elif "model.fc." in key:
264
- new_key = key.replace("model.fc", "fc")
265
- else:
266
- new_key = key
267
- new_state_dict[new_key] = value
268
- onset_model = VideoOnsetNet(False).to(device)
269
- onset_model.load_state_dict(new_state_dict)
270
  onset_model.eval()
271
 
 
 
 
272
  model = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
273
- ckpt = torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"]
274
- model.load_state_dict(ckpt)
275
- model.eval()
276
- model.to(weight_dtype)
277
-
278
- model_audioldm = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
279
- vae = model_audioldm.vae.to(device)
280
- vae.eval()
281
- vocoder = model_audioldm.vocoder.to(device)
282
-
283
  latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
284
 
285
- # Prepare silent video (shared across all samples)
286
  tmp_dir = tempfile.mkdtemp()
287
  silent_video = os.path.join(tmp_dir, "silent_input.mp4")
288
  strip_audio_from_video(video_file, silent_video)
289
 
290
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
291
- total_frames = cavp_feats.shape[0]
292
- total_dur_s = total_frames / FPS
293
- segments = build_segments(total_dur_s, crossfade_s)
294
-
295
- # ------------------------------------------------------------------ #
296
- # Generate N samples #
297
- # ------------------------------------------------------------------ #
298
- outputs = [] # list of (video_path, audio_path)
299
 
 
300
  for sample_idx in range(num_samples):
301
  sample_seed = seed_val + sample_idx
302
- cache_key = (video_file, sample_seed, float(cfg_scale),
303
- int(num_steps), mode, crossfade_s)
304
 
305
- if cache_key in _INFERENCE_CACHE:
306
- print(f"Sample {sample_idx+1}: cache hit, re-stitching.")
307
- cached = _INFERENCE_CACHE[cache_key]
308
- wavs = cached["wavs"]
309
  else:
310
  set_global_seed(sample_seed)
311
- onset_feats = extract_onset(
312
- silent_video, onset_model, tmp_path=tmp_dir, device=device
313
- )
314
-
315
  wavs = []
316
  for seg_start_s, seg_end_s in segments:
317
- print(f" Sample {sample_idx+1} | segment {seg_start_s:.2f}s – {seg_end_s:.2f}s")
318
- wav = infer_segment(
319
  model, vae, vocoder,
320
  cavp_feats, onset_feats,
321
  seg_start_s, seg_end_s,
@@ -325,117 +327,421 @@ def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode,
325
  euler_sampler, euler_maruyama_sampler,
326
  )
327
  wavs.append(wav)
 
328
 
329
- _INFERENCE_CACHE[cache_key] = {"wavs": wavs}
 
 
 
 
 
330
 
331
- # Stitch
332
- final_wav = stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s)
333
 
334
- audio_path = os.path.join(tmp_dir, f"output_{sample_idx}.wav")
335
- sf.write(audio_path, final_wav, SR)
336
 
337
- video_path = os.path.join(tmp_dir, f"output_{sample_idx}.mp4")
338
- mux_video_audio(silent_video, audio_path, video_path)
 
 
 
 
 
 
 
 
 
339
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  outputs.append((video_path, audio_path))
341
 
342
- # ------------------------------------------------------------------ #
343
- # Return flat list of (video, audio) pairs padded with None #
344
- # so Gradio output list length is always MAX_SLOTS * 2 #
345
- # ------------------------------------------------------------------ #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  result = []
347
  for i in range(MAX_SLOTS):
348
  if i < len(outputs):
349
- result.append(outputs[i][0]) # video
350
- result.append(outputs[i][1]) # audio
351
  else:
352
- result.append(None)
353
- result.append(None)
354
  return result
355
 
356
 
357
- # ------------------------------------------------------------------ #
358
- # Build gr.Blocks UI #
359
- # ------------------------------------------------------------------ #
 
 
 
 
 
 
 
360
 
361
- with gr.Blocks(title="TARO: Video-to-Audio Synthesis") as demo:
 
 
 
 
 
 
 
 
 
362
  gr.Markdown(
363
- "# TARO: Video-to-Audio Synthesis (ICCV 2025)\n"
364
- "Upload a video and generate synchronized audio. "
365
- "Optimal clip duration is 8.2s. Longer videos are automatically "
366
- "split into overlapping segments and stitched with a crossfade."
 
 
 
367
  )
368
 
369
- with gr.Row():
370
- with gr.Column():
371
- video_input = gr.Video(label="Input Video")
372
- seed_input = gr.Number(label="Seed", value=get_random_seed, precision=0)
373
- cfg_input = gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5)
374
- steps_input = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1)
375
- mode_input = gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde")
376
- cf_dur_input = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1)
377
- cf_db_input = gr.Textbox(label="Crossfade Boost (dB)", value="3")
378
- samples_input = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS,
379
- value=1, step=1)
380
- run_btn = gr.Button("Generate", variant="primary")
381
-
382
- with gr.Column():
383
- # All MAX_SLOTS slots pre-built.
384
- # Slot 0 is always visible (shows loading progress during inference).
385
- # Slots 1-N become visible when user drags the Generations slider.
386
- slot_videos = []
387
- slot_audios = []
388
- slot_grps = []
389
- for i in range(MAX_SLOTS):
390
- with gr.Group(visible=(i == 0)) as grp:
391
- sv = gr.Video(label=f"Generation {i+1} β€” Video")
392
- sa = gr.Audio(label=f"Generation {i+1} β€” Audio")
393
- slot_grps.append(grp)
394
- slot_videos.append(sv)
395
- slot_audios.append(sa)
396
-
397
- # -------------------------------------------------------------- #
398
- # Events #
399
- # -------------------------------------------------------------- #
400
-
401
- # Update Generations slider max on video upload / steps / crossfade change
402
- def _update_samples_slider(video_file, num_steps, crossfade_s):
403
- return on_video_upload(video_file, num_steps, crossfade_s)
404
-
405
- for trigger in [video_input, steps_input, cf_dur_input]:
406
- trigger.change(
407
- fn=_update_samples_slider,
408
- inputs=[video_input, steps_input, cf_dur_input],
409
- outputs=[samples_input],
410
- )
 
 
411
 
412
- # Show/hide output slots instantly when Generations slider is dragged
413
- def _update_slot_visibility(num_samples):
414
- n = int(num_samples)
415
- return [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
 
 
 
 
 
 
 
 
 
 
416
 
417
- samples_input.change(
418
- fn=_update_slot_visibility,
419
- inputs=[samples_input],
420
- outputs=slot_grps,
421
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
 
423
- # Main generate: calls inference then populates slots
424
- def _generate_and_update(video_file, seed_val, cfg_scale, num_steps, mode,
425
- crossfade_s, crossfade_db, num_samples):
426
- flat = generate_audio(video_file, seed_val, cfg_scale, num_steps, mode,
427
- crossfade_s, crossfade_db, num_samples)
428
- n = int(num_samples)
429
- grp_updates = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
430
- video_updates = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
431
- audio_updates = [gr.update(value=flat[i * 2 + 1]) for i in range(MAX_SLOTS)]
432
- return grp_updates + video_updates + audio_updates
433
-
434
- run_btn.click(
435
- fn=_generate_and_update,
436
- inputs=[video_input, seed_input, cfg_input, steps_input, mode_input,
437
- cf_dur_input, cf_db_input, samples_input],
438
- outputs=slot_grps + slot_videos + slot_audios,
439
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
 
441
  demo.queue().launch()
 
1
+ """
2
+ Generate Audio for Video β€” multi-model Gradio app.
3
+
4
+ Supported models
5
+ ----------------
6
+ TARO – video-conditioned diffusion via CAVP + onset features (16 kHz, 8.192 s window)
7
+ MMAudio – multimodal flow-matching with CLIP/Synchformer + text prompt (44 kHz, 8 s window)
8
+ HunyuanFoley – text-guided foley via SigLIP2 + Synchformer + CLAP (48 kHz, up to 15 s)
9
+ """
10
+
11
  import os
12
  import subprocess
13
  import sys
14
+ import tempfile
15
+ import random
16
+ from math import floor
17
+ from pathlib import Path
18
 
19
+ # ------------------------------------------------------------------ #
20
+ # mmcv bootstrap (needed by TARO's CAVP encoder) #
21
+ # ------------------------------------------------------------------ #
22
  try:
23
  import mmcv
24
  print("mmcv already installed")
 
29
 
30
  import torch
31
  import numpy as np
 
32
  import soundfile as sf
33
  import ffmpeg
 
34
  import spaces
35
  import gradio as gr
36
  from huggingface_hub import hf_hub_download
37
 
38
+ # ================================================================== #
39
+ # CHECKPOINT CONFIGURATION #
40
+ # ================================================================== #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
+ CKPT_REPO_ID = "JackIsNotInTheBox/Generate_Audio_for_Video_Checkpoints"
43
+ CACHE_DIR = "/tmp/model_ckpts"
44
+ os.makedirs(CACHE_DIR, exist_ok=True)
45
 
46
+ # ---- TARO checkpoints (in TARO/ subfolder of the HF repo) ----
47
+ print("Downloading TARO checkpoints…")
48
+ cavp_ckpt_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="TARO/cavp_epoch66.ckpt", cache_dir=CACHE_DIR)
49
+ onset_ckpt_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="TARO/onset_model.ckpt", cache_dir=CACHE_DIR)
50
+ taro_ckpt_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="TARO/taro_ckpt.pt", cache_dir=CACHE_DIR)
51
+ print("TARO checkpoints downloaded.")
52
+
53
+ # ---- MMAudio checkpoints (in MMAudio/ subfolder) ----
54
+ # MMAudio normally auto-downloads from its own HF repo, but we
55
+ # override the paths so it pulls from our consolidated repo instead.
56
+ MMAUDIO_WEIGHTS_DIR = Path(CACHE_DIR) / "MMAudio" / "weights"
57
+ MMAUDIO_EXT_DIR = Path(CACHE_DIR) / "MMAudio" / "ext_weights"
58
+ MMAUDIO_WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
59
+ MMAUDIO_EXT_DIR.mkdir(parents=True, exist_ok=True)
60
+
61
+ print("Downloading MMAudio checkpoints…")
62
+ mmaudio_model_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="MMAudio/mmaudio_large_44k_v2.pth", cache_dir=CACHE_DIR, local_dir=str(MMAUDIO_WEIGHTS_DIR), local_dir_use_symlinks=False)
63
+ mmaudio_vae_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="MMAudio/v1-44.pth", cache_dir=CACHE_DIR, local_dir=str(MMAUDIO_EXT_DIR), local_dir_use_symlinks=False)
64
+ mmaudio_synchformer_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="MMAudio/synchformer_state_dict.pth", cache_dir=CACHE_DIR, local_dir=str(MMAUDIO_EXT_DIR), local_dir_use_symlinks=False)
65
+ print("MMAudio checkpoints downloaded.")
66
+
67
+ # ---- HunyuanVideoFoley checkpoints (in HunyuanFoley/ subfolder) ----
68
+ HUNYUAN_MODEL_DIR = Path(CACHE_DIR) / "HunyuanFoley"
69
+ HUNYUAN_MODEL_DIR.mkdir(parents=True, exist_ok=True)
70
+
71
+ print("Downloading HunyuanVideoFoley checkpoints…")
72
+ hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanFoley/hunyuanvideo_foley.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
73
+ hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanFoley/vae_128d_48k.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
74
+ hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanFoley/synchformer_state_dict.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
75
+ print("HunyuanVideoFoley checkpoints downloaded.")
76
+
77
+ # ================================================================== #
78
+ # SHARED CONSTANTS / HELPERS #
79
+ # ================================================================== #
80
+
81
+ MAX_SLOTS = 8 # max parallel generation slots shown in UI
82
+
83
+ def set_global_seed(seed: int):
84
  np.random.seed(seed % (2**32))
85
  random.seed(seed)
86
  torch.manual_seed(seed)
87
  torch.cuda.manual_seed(seed)
88
  torch.backends.cudnn.deterministic = True
89
 
90
+ def get_random_seed() -> int:
91
+ return random.randint(0, 2**32 - 1)
92
 
93
+ def get_video_duration(video_path: str) -> float:
94
+ """Return video duration in seconds (CPU only)."""
 
 
 
 
 
 
 
 
 
 
95
  probe = ffmpeg.probe(video_path)
96
  return float(probe["format"]["duration"])
97
 
98
+ def strip_audio_from_video(video_path: str, output_path: str):
99
+ """Write a silent copy of *video_path* to *output_path*."""
100
+ ffmpeg.input(video_path).output(output_path, vcodec="libx264", an=None).run(
101
+ overwrite_output=True, quiet=True
102
+ )
103
 
104
+ def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
105
+ """Mux a silent video with an audio file into *output_path*."""
106
+ ffmpeg.output(
107
+ ffmpeg.input(silent_video),
108
+ ffmpeg.input(audio_path),
109
+ output_path,
110
+ vcodec="libx264", acodec="aac", strict="experimental",
111
+ ).run(overwrite_output=True, quiet=True)
112
+
113
+
114
+ # ================================================================== #
115
+ # TARO #
116
+ # ================================================================== #
117
+ # Constants sourced from TARO/infer.py and TARO/models.py:
118
+ # SR=16000, TRUNCATE=131072 β†’ 8.192 s window
119
+ # TRUNCATE_FRAME = 4 fps Γ— 131072/16000 = 32 CAVP frames per window
120
+ # TRUNCATE_ONSET = 120 onset frames per window
121
+ # latent shape: (1, 8, 204, 16) β€” fixed by MMDiT architecture
122
+ # latents_scale: [0.18215]*8 β€” AudioLDM2 VAE scale factor
123
+ # ================================================================== #
124
+
125
+ TARO_SR = 16000
126
+ TARO_TRUNCATE = 131072
127
+ TARO_FPS = 4
128
+ TARO_TRUNCATE_FRAME = int(TARO_FPS * TARO_TRUNCATE / TARO_SR) # 32
129
+ TARO_TRUNCATE_ONSET = 120
130
+ TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
131
+ TARO_SECS_PER_STEP = 2.5 # estimated GPU-seconds per diffusion step
132
+
133
+ _TARO_INFERENCE_CACHE: dict = {}
134
+
135
+
136
+ def _taro_build_segments(total_dur_s: float, crossfade_s: float) -> list:
137
+ """Sliding-window segmentation for videos longer than one TARO window."""
138
+ if total_dur_s <= TARO_MODEL_DUR:
139
  return [(0.0, total_dur_s)]
140
+ step_s = TARO_MODEL_DUR - crossfade_s
141
+ segments, seg_start = [], 0.0
 
 
142
  while True:
143
+ if seg_start + TARO_MODEL_DUR >= total_dur_s:
144
+ seg_start = max(0.0, total_dur_s - TARO_MODEL_DUR)
 
 
145
  segments.append((seg_start, total_dur_s))
146
  break
147
+ segments.append((seg_start, seg_start + TARO_MODEL_DUR))
148
  seg_start += step_s
 
149
  return segments
150
 
151
 
152
+ def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
153
+ n_segs = len(_taro_build_segments(total_dur_s, crossfade_s))
154
+ time_per_seg = num_steps * TARO_SECS_PER_STEP
155
+ max_s = floor(600.0 / (n_segs * time_per_seg))
 
 
156
  return max(1, min(max_s, MAX_SLOTS))
157
 
158
 
159
+ def _taro_infer_segment(
160
+ model, vae, vocoder,
161
+ cavp_feats_full, onset_feats_full,
162
+ seg_start_s: float, seg_end_s: float,
163
+ device, weight_dtype,
164
+ cfg_scale: float, num_steps: int, mode: str,
165
+ latents_scale,
166
+ euler_sampler, euler_maruyama_sampler,
167
+ ) -> np.ndarray:
168
+ """Single-segment TARO inference. Returns wav array trimmed to segment length."""
169
  # CAVP features (4 fps)
170
+ cavp_start = int(round(seg_start_s * TARO_FPS))
171
+ cavp_slice = cavp_feats_full[cavp_start : cavp_start + TARO_TRUNCATE_FRAME]
172
+ if cavp_slice.shape[0] < TARO_TRUNCATE_FRAME:
173
  pad = np.zeros(
174
+ (TARO_TRUNCATE_FRAME - cavp_slice.shape[0],) + cavp_slice.shape[1:],
175
  dtype=cavp_slice.dtype,
176
  )
177
  cavp_slice = np.concatenate([cavp_slice, pad], axis=0)
178
+ video_feats = torch.from_numpy(cavp_slice).unsqueeze(0).to(device, weight_dtype)
179
 
180
+ # Onset features (onset_fps = TRUNCATE_ONSET / MODEL_DUR β‰ˆ 14.65 fps)
181
+ onset_fps = TARO_TRUNCATE_ONSET / TARO_MODEL_DUR
182
  onset_start = int(round(seg_start_s * onset_fps))
183
+ onset_slice = onset_feats_full[onset_start : onset_start + TARO_TRUNCATE_ONSET]
184
+ if onset_slice.shape[0] < TARO_TRUNCATE_ONSET:
185
+ onset_slice = np.pad(
186
+ onset_slice,
187
+ ((0, TARO_TRUNCATE_ONSET - onset_slice.shape[0]),),
188
+ mode="constant",
189
+ )
190
+ onset_feats_t = torch.from_numpy(onset_slice).unsqueeze(0).to(device, weight_dtype)
191
+
192
+ # Latent noise β€” shape matches MMDiT architecture (in_channels=8, 204Γ—16 spatial)
193
+ z = torch.randn(1, model.in_channels, 204, 16, device=device, dtype=weight_dtype)
194
 
 
195
  sampling_kwargs = dict(
196
  model=model,
197
  latents=z,
 
205
  path_type="linear",
206
  )
207
  with torch.no_grad():
208
+ samples = (euler_maruyama_sampler if mode == "sde" else euler_sampler)(**sampling_kwargs)
209
+ # samplers return (output_tensor, zs) β€” index [0] for the audio latent
210
+ if isinstance(samples, tuple):
211
+ samples = samples[0]
212
 
213
+ # Decode: AudioLDM2 VAE β†’ mel β†’ vocoder β†’ waveform
214
  samples = vae.decode(samples / latents_scale).sample
215
  wav = vocoder(samples.squeeze().float()).detach().cpu().numpy()
216
+ seg_samples = int(round((seg_end_s - seg_start_s) * TARO_SR))
217
  return wav[:seg_samples]
218
 
219
 
220
+ def _crossfade_join(wav_a: np.ndarray, wav_b: np.ndarray,
221
+ crossfade_s: float, db_boost: float) -> np.ndarray:
222
+ cf = int(round(crossfade_s * TARO_SR))
223
+ cf = min(cf, len(wav_a), len(wav_b))
224
+ if cf <= 0:
 
 
 
 
225
  return np.concatenate([wav_a, wav_b])
 
226
  gain = 10 ** (db_boost / 20.0)
227
+ overlap = wav_a[-cf:] * gain + wav_b[:cf] * gain
228
+ return np.concatenate([wav_a[:-cf], overlap, wav_b[cf:]])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
 
231
+ def _stitch_wavs(wavs: list, crossfade_s: float, db_boost: float, total_dur_s: float) -> np.ndarray:
232
+ out = wavs[0]
233
+ for nw in wavs[1:]:
234
+ out = _crossfade_join(out, nw, crossfade_s, db_boost)
235
+ return out[:int(round(total_dur_s * TARO_SR))]
 
 
 
 
 
 
 
 
 
236
 
237
 
 
 
 
 
 
 
 
 
238
@spaces.GPU(duration=600)
def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
                  crossfade_s, crossfade_db, num_samples):
    """TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window.

    Loads all TARO components (CAVP encoder, onset net, MMDiT, AudioLDM2
    decoder) inside the GPU context, extracts features from the input video
    once, then generates ``num_samples`` audio tracks; each sample runs the
    diffusion sampler per sliding-window segment and the segments are
    crossfade-stitched. Per-sample results are memoized in
    ``_TARO_INFERENCE_CACHE`` keyed on (video, seed, cfg, steps, mode,
    crossfade). Returns a flat [video, audio] * MAX_SLOTS list via
    ``_pad_outputs``.
    """
    global _TARO_INFERENCE_CACHE

    # Normalize UI inputs (sliders/textboxes arrive as floats/strings).
    seed_val = int(seed_val)
    crossfade_s = float(crossfade_s)
    crossfade_db = float(crossfade_db)
    num_samples = int(num_samples)
    if seed_val < 0:
        # -1 in the UI means "pick a random base seed".
        seed_val = random.randint(0, 2**32 - 1)

    torch.set_grad_enabled(False)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    weight_dtype = torch.bfloat16

    # Imports are inside the GPU context so the Space only pays for GPU time here
    from TARO.cavp_util import Extract_CAVP_Features
    from TARO.onset_util import VideoOnsetNet, extract_onset
    from TARO.models import MMDiT
    from TARO.samplers import euler_sampler, euler_maruyama_sampler
    from diffusers import AudioLDM2Pipeline

    # -- Load CAVP encoder (uses checkpoint from our HF repo) --
    extract_cavp = Extract_CAVP_Features(
        device=device,
        config_path="TARO/cavp/cavp.yaml",
        ckpt_path=cavp_ckpt_path,
    )

    # -- Load onset detection model --
    # Key remapping matches the original TARO infer.py exactly
    raw_sd = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
    onset_sd = {}
    for k, v in raw_sd.items():
        if "model.net.model" in k:
            k = k.replace("model.net.model", "net.model")
        elif "model.fc." in k:
            k = k.replace("model.fc", "fc")
        onset_sd[k] = v
    onset_model = VideoOnsetNet(pretrained=False).to(device)
    onset_model.load_state_dict(onset_sd)
    onset_model.eval()

    # -- Load TARO MMDiT --
    # Architecture params match TARO/train.py: adm_in_channels=120 (onset dim),
    # z_dims=[768] (CAVP dim), encoder_depth=4
    model = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
    model.load_state_dict(torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"])
    model.eval().to(weight_dtype)

    # -- Load AudioLDM2 VAE + vocoder (decoder pipeline only) --
    # TARO uses AudioLDM2's VAE and vocoder for decoding; no encoder needed at inference
    audioldm2 = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
    vae = audioldm2.vae.to(device).eval()
    vocoder = audioldm2.vocoder.to(device)

    # NOTE(review): latents_scale is defined but not passed in the visible
    # call below -- presumably it is one of the arguments on the diff lines
    # elided from this hunk. Confirm against the full file.
    latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)

    # -- Prepare silent video (shared across all samples) --
    tmp_dir = tempfile.mkdtemp()
    silent_video = os.path.join(tmp_dir, "silent_input.mp4")
    strip_audio_from_video(video_file, silent_video)

    # CAVP features determine the clip length: one feature row per frame.
    cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
    total_dur_s = cavp_feats.shape[0] / TARO_FPS
    segments = _taro_build_segments(total_dur_s, crossfade_s)

    outputs = []
    for sample_idx in range(num_samples):
        # Each sample gets a deterministic seed offset from the base seed.
        sample_seed = seed_val + sample_idx
        cache_key = (video_file, sample_seed, float(cfg_scale), int(num_steps), mode, crossfade_s)

        if cache_key in _TARO_INFERENCE_CACHE:
            print(f"[TARO] Sample {sample_idx+1}: cache hit.")
            wavs = _TARO_INFERENCE_CACHE[cache_key]["wavs"]
        else:
            set_global_seed(sample_seed)
            onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)

            wavs = []
            for seg_start_s, seg_end_s in segments:
                print(f"[TARO] Sample {sample_idx+1} | {seg_start_s:.2f}s – {seg_end_s:.2f}s")
                wav = _taro_infer_segment(
                    model, vae, vocoder,
                    cavp_feats, onset_feats,
                    seg_start_s, seg_end_s,
                    # NOTE(review): three argument lines (likely seed/cfg/steps/
                    # mode and latents_scale) are collapsed out of this diff
                    # hunk -- verify the call site in the full file.
                    euler_sampler, euler_maruyama_sampler,
                )
                wavs.append(wav)
            # Cache raw segment wavs so re-stitching with a different dB boost
            # does not require re-running diffusion.
            _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}

        final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s)
        audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
        sf.write(audio_path, final_wav, TARO_SR)
        video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
        mux_video_audio(silent_video, audio_path, video_path)
        outputs.append((video_path, audio_path))

    return _pad_outputs(outputs)
 
340
 
 
 
341
 
342
+ # ================================================================== #
343
+ # MMAudio #
344
+ # ================================================================== #
345
+ # Constants sourced from MMAudio/mmaudio/model/sequence_config.py:
346
+ # CONFIG_44K: duration=8.0 s, sampling_rate=44100
347
+ # CLIP encoder: 8 fps, 384Γ—384 px
348
+ # Synchformer: 25 fps, 224Γ—224 px
349
+ # Default variant: large_44k_v2
350
+ # MMAudio uses flow-matching (FlowMatching with euler inference).
351
+ # generate() handles all feature extraction + decoding internally.
352
+ # ================================================================== #
353
 
354
@spaces.GPU(duration=600)
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
                     cfg_strength, num_steps, duration, num_samples):
    """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s window, text-guided.

    Args:
        video_file: path to the input video.
        prompt: text prompt steering the generation.
        negative_prompt: optional negative text prompt ("" disables it).
        seed_val: base seed; -1 makes every sample non-deterministic.
        cfg_strength: classifier-free-guidance strength.
        num_steps: Euler step count for the flow-matching sampler.
        duration: requested seconds of video to process.
        num_samples: number of generations to produce.

    Returns:
        Flat [video, audio] * MAX_SLOTS list via _pad_outputs().
    """
    # Heavy imports stay inside the GPU-decorated function so the Space only
    # pays for GPU time here.
    import torchaudio
    from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
    from mmaudio.model.flow_matching import FlowMatching
    from mmaudio.model.networks import get_my_mmaudio
    from mmaudio.model.utils.features_utils import FeaturesUtils

    # Normalize UI inputs once, up front.
    seed_val = int(seed_val)
    num_samples = int(num_samples)
    num_steps = int(num_steps)
    duration = float(duration)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.bfloat16

    # Use large_44k_v2 variant; override paths to our consolidated HF checkpoint repo
    model_cfg = all_model_cfg["large_44k_v2"]
    from pathlib import Path as _Path
    model_cfg.model_path = _Path(mmaudio_model_path)
    model_cfg.vae_path = _Path(mmaudio_vae_path)
    model_cfg.synchformer_ckpt = _Path(mmaudio_synchformer_path)
    # large_44k_v2 is 44k mode, no BigVGAN vocoder needed
    model_cfg.bigvgan_16k_path = None
    seq_cfg = model_cfg.seq_cfg  # CONFIG_44K: 8 s, 44100 Hz

    # Load network weights
    net = get_my_mmaudio(model_cfg.model_name).to(device, dtype).eval()
    net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))

    # Load feature utilities: CLIP (auto-downloaded from apple/DFN5B-CLIP-ViT-H-14-384),
    # Synchformer (from our repo), VAE (from our repo), no BigVGAN for 44k mode
    feature_utils = FeaturesUtils(
        tod_vae_ckpt=str(model_cfg.vae_path),
        synchformer_ckpt=str(model_cfg.synchformer_ckpt),
        enable_conditions=True,
        mode=model_cfg.mode,  # "44k"
        bigvgan_vocoder_ckpt=None,
        need_vae_encoder=False,
    ).to(device, dtype).eval()

    # FIX: video decoding and sequence-length setup are independent of the
    # sample index, so hoist them out of the per-sample loop -- the original
    # re-decoded the video for every sample.
    # load_video() resamples to 8 fps (CLIP) and 25 fps (Synchformer) on the fly
    video_info = load_video(video_file, duration)
    clip_frames = video_info.clip_frames.unsqueeze(0)  # (1, T_clip, C, H, W)
    sync_frames = video_info.sync_frames.unsqueeze(0)  # (1, T_sync, C, H, W)
    actual_dur = video_info.duration_sec

    seq_cfg.duration = actual_dur
    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    tmp_dir = tempfile.mkdtemp()
    outputs = []

    for sample_idx in range(num_samples):
        # Per-sample RNG: deterministic offsets from the base seed, or fully
        # random when the user asked for seed == -1.
        rng = torch.Generator(device=device)
        if seed_val >= 0:
            rng.manual_seed(seed_val + sample_idx)
        else:
            rng.seed()

        # Kept inside the loop to match the original per-sample construction
        # (FlowMatching is cheap to build; avoids any cross-sample state).
        fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)

        print(f"[MMAudio] Sample {sample_idx+1} | duration={actual_dur:.2f}s | prompt='{prompt}'")

        audios = generate(
            clip_frames,
            sync_frames,
            [prompt],
            negative_text=[negative_prompt] if negative_prompt else None,
            feature_utils=feature_utils,
            net=net,
            fm=fm,
            rng=rng,
            cfg_strength=float(cfg_strength),
        )
        audio = audios.float().cpu()[0]  # (C, T)

        audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
        torchaudio.save(audio_path, audio, seq_cfg.sampling_rate)

        video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
        make_video(video_info, video_path, audio, sampling_rate=seq_cfg.sampling_rate)
        outputs.append((video_path, audio_path))

    return _pad_outputs(outputs)
441
+
442
+
443
+ # ================================================================== #
444
+ # HunyuanVideoFoley #
445
+ # ================================================================== #
446
+ # Constants sourced from HunyuanVideo-Foley/hunyuanvideo_foley/constants.py
447
+ # and configs/hunyuanvideo-foley-xxl.yaml:
448
+ # sample_rate = 48000 Hz (from DAC VAE)
449
+ # audio_frame_rate = 50 (latent fps, xxl config)
450
+ # max video duration = 15 s
451
+ # SigLIP2 fps = 8, Synchformer fps = 25
452
+ # CLAP text encoder: laion/larger_clap_general (auto-downloaded from HF Hub)
453
+ # Default guidance_scale=4.5, num_inference_steps=50
454
+ # ================================================================== #
455
+
456
# Hard cap from the HunyuanVideo-Foley source configs (max video duration).
# NOTE(review): defined but not referenced in the visible code -- confirm
# that the UI or generate_hunyuan actually enforces this limit.
HUNYUAN_MAX_DUR = 15.0  # seconds
457
+
458
@spaces.GPU(duration=600)
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
                     guidance_scale, num_steps, model_size, num_samples):
    """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.

    Runs the official load_model() / feature_process() / denoise_process()
    pipeline once, generating all requested samples as a single batch, then
    muxes each generated track back onto the input video.
    """
    import sys
    import torchaudio

    # Make the vendored HunyuanVideo-Foley checkout importable as a package.
    pkg_root = str(Path("HunyuanVideo-Foley").resolve())
    if pkg_root not in sys.path:
        sys.path.insert(0, pkg_root)

    from hunyuanvideo_foley.utils.feature_utils import feature_process
    from hunyuanvideo_foley.utils.media_utils import merge_audio_video
    from hunyuanvideo_foley.utils.model_utils import denoise_process, load_model

    seed_val = int(seed_val)
    num_samples = int(num_samples)
    if seed_val >= 0:
        # Negative seed means "leave RNG state alone" (random every run).
        set_global_seed(seed_val)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    size_key = model_size.lower()  # "xl" or "xxl"

    configs = {
        "xl": "HunyuanVideo-Foley/configs/hunyuanvideo-foley-xl.yaml",
        "xxl": "HunyuanVideo-Foley/configs/hunyuanvideo-foley-xxl.yaml",
    }
    cfg_file = configs.get(size_key, configs["xxl"])

    print(f"[HunyuanFoley] Loading {size_key.upper()} model from {HUNYUAN_MODEL_DIR}")
    # load_model() handles: HunyuanVideoFoley main model, DAC-VAE, SigLIP2, CLAP, Synchformer
    # CLAP (laion/larger_clap_general) and SigLIP2 (google/siglip2-base-patch16-512) are
    # downloaded from HuggingFace Hub automatically by load_model().
    model_dict, cfg = load_model(
        str(HUNYUAN_MODEL_DIR),
        cfg_file,
        device,
        enable_offload=False,
        model_size=size_key,
    )

    work_dir = tempfile.mkdtemp()
    results = []

    # feature_process() extracts SigLIP2 visual features + Synchformer sync features
    # + CLAP text embeddings — exactly as in HunyuanVideo-Foley/gradio_app.py
    visual_feats, text_feats, audio_len_in_s = feature_process(
        video_file,
        prompt if prompt else "",
        model_dict,
        cfg,
        neg_prompt=negative_prompt if negative_prompt else None,
    )
    print(f"[HunyuanFoley] Audio length: {audio_len_in_s:.2f}s | generating {num_samples} sample(s)")

    # denoise_process() runs the flow-matching diffusion loop and decodes with
    # DAC-VAE; batch_size=num_samples produces every sample in one pass.
    audio, sample_rate = denoise_process(
        visual_feats,
        text_feats,
        audio_len_in_s,
        model_dict,
        cfg,
        guidance_scale=float(guidance_scale),
        num_inference_steps=int(num_steps),
        batch_size=num_samples,
    )

    # audio shape: (batch, channels, samples) — write each batch entry out.
    for idx in range(num_samples):
        wav_path = os.path.join(work_dir, f"hunyuan_{idx}.wav")
        torchaudio.save(wav_path, audio[idx], sample_rate)
        mp4_path = os.path.join(work_dir, f"hunyuan_{idx}.mp4")
        merge_audio_video(wav_path, video_file, mp4_path)
        results.append((mp4_path, wav_path))

    return _pad_outputs(results)
534
+
535
+
536
+ # ================================================================== #
537
+ # SHARED UI HELPERS #
538
+ # ================================================================== #
539
+
540
def _pad_outputs(outputs: list) -> list:
    """Flatten (video, audio) pairs and pad to MAX_SLOTS * 2 with None."""
    flat = []
    for slot in range(MAX_SLOTS):
        pair = outputs[slot] if slot < len(outputs) else (None, None)
        flat.extend(pair)
    return flat
549
 
550
 
551
def _on_video_upload_taro(video_file, num_steps, crossfade_s):
    """Recompute the TARO 'Generations' slider cap when inputs change.

    With no video selected the cap reverts to MAX_SLOTS; otherwise the cap
    comes from _taro_calc_max_samples for the clip's duration. Any probing
    failure falls back to MAX_SLOTS rather than blocking the UI.
    """
    if video_file is None:
        return gr.update(maximum=MAX_SLOTS, value=1)
    try:
        clip_len = get_video_duration(video_file)
        cap = _taro_calc_max_samples(clip_len, int(num_steps), float(crossfade_s))
    except Exception:
        cap = MAX_SLOTS
    return gr.update(maximum=cap, value=min(1, cap))
560
+
561
 
562
def _update_slot_visibility(n):
    """Show the first n output slot groups and hide the remainder."""
    visible_count = int(n)
    updates = []
    for slot in range(MAX_SLOTS):
        updates.append(gr.update(visible=slot < visible_count))
    return updates
565
+
566
+
567
+ # ================================================================== #
568
+ # GRADIO UI #
569
+ # ================================================================== #
570
+
571
# Top-level Gradio UI: one tab per model, each with its own input column and
# MAX_SLOTS output slots. Component-creation order inside the context
# managers defines the page layout, so statement order here is behavior.
with gr.Blocks(title="Video-to-Audio Generation") as demo:
    gr.Markdown(
        "# Video-to-Audio Generation\n"
        "Choose a model and upload a video to generate synchronized audio.\n\n"
        "| Model | Sample rate | Optimal duration | Notes |\n"
        "|-------|------------|-----------------|-------|\n"
        "| **TARO** | 16 kHz | 8.2 s | Video-only, sliding window for longer clips |\n"
        "| **MMAudio** | 44.1 kHz | 8 s | Text prompt supported |\n"
        "| **HunyuanFoley** | 48 kHz | up to 15 s | Text-guided foley, highest fidelity |"
    )

    with gr.Tabs():

        # ---------------------------------------------------------- #
        # Tab 1 — TARO                                               #
        # ---------------------------------------------------------- #
        with gr.Tab("TARO"):
            gr.Markdown(
                "**TARO** — Video-conditioned diffusion (ICCV 2025). No text prompt needed. "
                "8.192 s model window; longer videos are split into overlapping segments "
                "and stitched with a crossfade."
            )
            with gr.Row():
                with gr.Column():
                    taro_video = gr.Video(label="Input Video")
                    taro_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed, precision=0)
                    taro_cfg = gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5)
                    taro_steps = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1)
                    taro_mode = gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde")
                    taro_cf_dur = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1)
                    # NOTE(review): free-text box cast with float() downstream;
                    # non-numeric input will raise at generation time.
                    taro_cf_db = gr.Textbox(label="Crossfade Boost (dB)", value="3")
                    taro_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
                    taro_btn = gr.Button("Generate", variant="primary")

                with gr.Column():
                    # Pre-create all MAX_SLOTS output groups; only slot 0 is
                    # visible until the user raises the sample count.
                    taro_slot_grps, taro_slot_vids, taro_slot_auds = [], [], []
                    for i in range(MAX_SLOTS):
                        with gr.Group(visible=(i == 0)) as g:
                            sv = gr.Video(label=f"Generation {i+1} — Video")
                            sa = gr.Audio(label=f"Generation {i+1} — Audio")
                        taro_slot_grps.append(g)
                        taro_slot_vids.append(sv)
                        taro_slot_auds.append(sa)

            # Re-probe the sample cap whenever the video, step count, or
            # crossfade duration changes (they all affect GPU-time budget).
            for trigger in [taro_video, taro_steps, taro_cf_dur]:
                trigger.change(
                    fn=_on_video_upload_taro,
                    inputs=[taro_video, taro_steps, taro_cf_dur],
                    outputs=[taro_samples],
                )
            taro_samples.change(
                fn=_update_slot_visibility,
                inputs=[taro_samples],
                outputs=taro_slot_grps,
            )

            # Adapter: flatten the generator's [video, audio]*MAX_SLOTS list
            # into visibility + value updates for every slot component.
            # NOTE(review): the three _run_* adapters below are structurally
            # identical — candidates for a shared factory helper.
            def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
                flat = generate_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n)
                n = int(n)
                grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
                vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
                aud_upd = [gr.update(value=flat[i * 2 + 1]) for i in range(MAX_SLOTS)]
                return grp_upd + vid_upd + aud_upd

            taro_btn.click(
                fn=_run_taro,
                inputs=[taro_video, taro_seed, taro_cfg, taro_steps, taro_mode,
                        taro_cf_dur, taro_cf_db, taro_samples],
                outputs=taro_slot_grps + taro_slot_vids + taro_slot_auds,
            )

        # ---------------------------------------------------------- #
        # Tab 2 — MMAudio                                            #
        # ---------------------------------------------------------- #
        with gr.Tab("MMAudio"):
            gr.Markdown(
                "**MMAudio** — Multimodal flow-matching (CVPR 2025). "
                "Supports a text prompt for additional control. "
                "Native window is 8 s at 44.1 kHz. "
                "Duration slider lets you control how many seconds are processed."
            )
            with gr.Row():
                with gr.Column():
                    mma_video = gr.Video(label="Input Video")
                    mma_prompt = gr.Textbox(label="Prompt", placeholder="e.g. footsteps on gravel")
                    mma_neg = gr.Textbox(label="Negative Prompt", placeholder="music, speech")
                    mma_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed, precision=0)
                    mma_cfg = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
                    mma_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
                    mma_dur = gr.Slider(label="Duration (s)", minimum=1, maximum=10, value=8, step=0.5)
                    mma_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
                    mma_btn = gr.Button("Generate", variant="primary")

                with gr.Column():
                    mma_slot_grps, mma_slot_vids, mma_slot_auds = [], [], []
                    for i in range(MAX_SLOTS):
                        with gr.Group(visible=(i == 0)) as g:
                            sv = gr.Video(label=f"Generation {i+1} — Video")
                            sa = gr.Audio(label=f"Generation {i+1} — Audio")
                        mma_slot_grps.append(g)
                        mma_slot_vids.append(sv)
                        mma_slot_auds.append(sa)

            mma_samples.change(
                fn=_update_slot_visibility,
                inputs=[mma_samples],
                outputs=mma_slot_grps,
            )

            def _run_mmaudio(video, prompt, neg, seed, cfg, steps, dur, n):
                flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, dur, n)
                n = int(n)
                grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
                vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
                aud_upd = [gr.update(value=flat[i * 2 + 1]) for i in range(MAX_SLOTS)]
                return grp_upd + vid_upd + aud_upd

            mma_btn.click(
                fn=_run_mmaudio,
                inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
                        mma_cfg, mma_steps, mma_dur, mma_samples],
                outputs=mma_slot_grps + mma_slot_vids + mma_slot_auds,
            )

        # ---------------------------------------------------------- #
        # Tab 3 — HunyuanVideoFoley                                  #
        # ---------------------------------------------------------- #
        with gr.Tab("HunyuanFoley"):
            gr.Markdown(
                "**HunyuanVideo-Foley** (Tencent Hunyuan). "
                "Professional-grade text-guided foley at 48 kHz, up to 15 s. "
                "Requires a text prompt describing the desired sound."
            )
            with gr.Row():
                with gr.Column():
                    hf_video = gr.Video(label="Input Video")
                    hf_prompt = gr.Textbox(label="Prompt", placeholder="e.g. rain hitting a metal roof")
                    hf_neg = gr.Textbox(label="Negative Prompt", value="noisy, harsh")
                    hf_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed, precision=0)
                    hf_guidance = gr.Slider(label="Guidance Scale", minimum=1, maximum=10, value=4.5, step=0.5)
                    hf_steps = gr.Slider(label="Steps", minimum=10, maximum=100, value=50, step=5)
                    hf_size = gr.Radio(label="Model Size", choices=["xl", "xxl"], value="xxl")
                    hf_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
                    hf_btn = gr.Button("Generate", variant="primary")

                with gr.Column():
                    hf_slot_grps, hf_slot_vids, hf_slot_auds = [], [], []
                    for i in range(MAX_SLOTS):
                        with gr.Group(visible=(i == 0)) as g:
                            sv = gr.Video(label=f"Generation {i+1} — Video")
                            sa = gr.Audio(label=f"Generation {i+1} — Audio")
                        hf_slot_grps.append(g)
                        hf_slot_vids.append(sv)
                        hf_slot_auds.append(sa)

            hf_samples.change(
                fn=_update_slot_visibility,
                inputs=[hf_samples],
                outputs=hf_slot_grps,
            )

            def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, n):
                flat = generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, n)
                n = int(n)
                grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
                vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
                aud_upd = [gr.update(value=flat[i * 2 + 1]) for i in range(MAX_SLOTS)]
                return grp_upd + vid_upd + aud_upd

            hf_btn.click(
                fn=_run_hunyuan,
                inputs=[hf_video, hf_prompt, hf_neg, hf_seed,
                        hf_guidance, hf_steps, hf_size, hf_samples],
                outputs=hf_slot_grps + hf_slot_vids + hf_slot_auds,
            )

# Queue serializes GPU-bound requests on the Space; launch starts the server.
demo.queue().launch()