Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

App Files Files Community

JackIsNotInTheBox committed 11 days ago

Commit

160db86

1 Parent(s): 5806ea4

Multi-sample support, last-segment tail anchor fix, dynamic samples cap, gr.Blocks UI, duration=600

Browse files

Files changed (1) hide show

app.py +317 -193

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import subprocess
 import sys
 try:
     import mmcv
@@ -30,13 +31,23 @@ onset_ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="onset_model.ckpt",
 taro_ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="taro_ckpt.pt", cache_dir=CACHE_DIR)
 print("Checkpoints downloaded.")
 # ------------------------------------------------------------------ #
-# Inference cache: keyed by (video_path, seed, cfg_scale,            #
-#                             num_steps, mode, crossfade_s)           #
-# Stores the raw per-segment wavs so that only the dB value can be   #
-# changed without re-running the model.                               #
 # ------------------------------------------------------------------ #
-_INFERENCE_CACHE = {}   # key -> {"wavs": [...], "sr": int}
 def set_global_seed(seed):
@@ -57,33 +68,74 @@ def strip_audio_from_video(video_path, output_path):
     )
 def infer_segment(model, vae, vocoder, cavp_feats_full, onset_feats_full,
                   seg_start_s, seg_end_s,
-                  sr, fps, truncate_frame, truncate_onset, model_dur,
-                  latents_scale, device, weight_dtype,
                   cfg_scale, num_steps, mode,
                   euler_sampler, euler_maruyama_sampler):
-    """
-    Run one model inference pass for the video window starting at seg_start_s.
-    Returns a numpy float32 wav array trimmed to (seg_end_s - seg_start_s).
-    """
-    # CAVP features at fps (4 fps)
-    cavp_start = int(round(seg_start_s * fps))
-    cavp_slice = cavp_feats_full[cavp_start : cavp_start + truncate_frame]
-    if cavp_slice.shape[0] < truncate_frame:
         pad = np.zeros(
-            (truncate_frame - cavp_slice.shape[0],) + cavp_slice.shape[1:],
             dtype=cavp_slice.dtype,
         )
         cavp_slice = np.concatenate([cavp_slice, pad], axis=0)
     video_feats = torch.from_numpy(cavp_slice).unsqueeze(0).to(device).to(weight_dtype)
-    # Onset features at truncate_onset / model_dur frames per second
-    onset_fps   = truncate_onset / model_dur
     onset_start = int(round(seg_start_s * onset_fps))
-    onset_slice = onset_feats_full[onset_start : onset_start + truncate_onset]
-    if onset_slice.shape[0] < truncate_onset:
-        pad_len = truncate_onset - onset_slice.shape[0]
         onset_slice = np.pad(onset_slice, ((0, pad_len),), mode="constant", constant_values=0)
     onset_feats_t = torch.from_numpy(onset_slice).unsqueeze(0).to(device).to(weight_dtype)
@@ -108,214 +160,286 @@ def infer_segment(model, vae, vocoder, cavp_feats_full, onset_feats_full,
     samples = vae.decode(samples / latents_scale).sample
     wav = vocoder(samples.squeeze().float()).detach().cpu().numpy()
-    seg_samples = int(round((seg_end_s - seg_start_s) * sr))
     return wav[:seg_samples]
-def crossfade_join(wav_a, wav_b, crossfade_s, db_boost, sr):
     """
-    Join wav_a and wav_b with a crossfade_s-second crossfade.
-    db_boost controls the gain applied to both signals in the overlap region:
-        gain = 10 ** (db_boost / 20)
-    At +3 dB (gain ≈ 1.414), the two summed unity signals produce +3 dB at midpoint.
-    At 0 dB (gain = 1.0), each signal is kept at full amplitude — same as +3 dB sum
-    since both are 1.0.  The parameter lets the user tune the blend level freely.
-    The crossfade window is the last crossfade_s seconds of wav_a overlapping with
-    the first crossfade_s seconds of wav_b.  Both are scaled by gain and summed.
     """
-    cf_samples = int(round(crossfade_s * sr))
-    # Guard: if either wav is shorter than the crossfade window, shrink the window
-    cf_samples = min(cf_samples, len(wav_a), len(wav_b))
     if cf_samples <= 0:
         return np.concatenate([wav_a, wav_b])
-    gain = 10 ** (db_boost / 20.0)
-    tail_a  = wav_a[-cf_samples:] * gain
-    head_b  = wav_b[:cf_samples]  * gain
-    overlap = tail_a + head_b
-    return np.concatenate([
-        wav_a[:-cf_samples],
-        overlap,
-        wav_b[cf_samples:],
-    ])
-def stitch_wavs(wavs, crossfade_s, db_boost, sr, total_dur_s):
-    """Stitch a list of wav arrays using crossfade_join, then clip to total_dur_s."""
     if len(wavs) == 1:
         final_wav = wavs[0]
     else:
         final_wav = wavs[0]
-        for next_wav in wavs[1:]:
-            final_wav = crossfade_join(final_wav, next_wav, crossfade_s, db_boost, sr)
-    target_samples = int(round(total_dur_s * sr))
-    return final_wav[:target_samples]
-@spaces.GPU(duration=300)
 def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode,
-                   crossfade_s, crossfade_db):
     global _INFERENCE_CACHE
     seed_val     = int(seed_val)
     crossfade_s  = float(crossfade_s)
     crossfade_db = float(crossfade_db)
     if seed_val < 0:
         seed_val = random.randint(0, 2**32 - 1)
-    sr            = 16000
-    truncate      = 131072
-    fps           = 4
-    truncate_frame = int(fps * truncate / sr)
-    truncate_onset = 120
-    model_dur     = truncate / sr       # 8.192 s
-    step_s        = model_dur - crossfade_s
-    # Cache key covers everything that affects segmentation and inference
-    cache_key = (video_file, seed_val, float(cfg_scale), int(num_steps), mode,
-                 crossfade_s)
-    if cache_key in _INFERENCE_CACHE:
-        print("Cache hit — skipping inference, re-stitching with new dB value.")
-        cached      = _INFERENCE_CACHE[cache_key]
-        wavs        = cached["wavs"]
-        total_dur_s = cached["total_dur_s"]
-        tmp_dir     = cached["tmp_dir"]
-        silent_video = cached["silent_video"]
-    else:
-        set_global_seed(seed_val)
-        torch.set_grad_enabled(False)
-        device       = "cuda" if torch.cuda.is_available() else "cpu"
-        weight_dtype = torch.bfloat16
-        from cavp_util import Extract_CAVP_Features
-        from onset_util import VideoOnsetNet, extract_onset
-        from models import MMDiT
-        from samplers import euler_sampler, euler_maruyama_sampler
-        from diffusers import AudioLDM2Pipeline
-        extract_cavp = Extract_CAVP_Features(
-            device=device, config_path="./cavp/cavp.yaml", ckpt_path=cavp_ckpt_path
-        )
-        state_dict = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
-        new_state_dict = {}
-        for key, value in state_dict.items():
-            if "model.net.model" in key:
-                new_key = key.replace("model.net.model", "net.model")
-            elif "model.fc." in key:
-                new_key = key.replace("model.fc", "fc")
-            else:
-                new_key = key
-            new_state_dict[new_key] = value
-        onset_model = VideoOnsetNet(False).to(device)
-        onset_model.load_state_dict(new_state_dict)
-        onset_model.eval()
-        model = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
-        ckpt  = torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"]
-        model.load_state_dict(ckpt)
-        model.eval()
-        model.to(weight_dtype)
-        model_audioldm = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
-        vae     = model_audioldm.vae.to(device)
-        vae.eval()
-        vocoder = model_audioldm.vocoder.to(device)
-        tmp_dir      = tempfile.mkdtemp()
-        silent_video = os.path.join(tmp_dir, "silent_input.mp4")
-        strip_audio_from_video(video_file, silent_video)
-        cavp_feats  = extract_cavp(silent_video, tmp_path=tmp_dir)
-        onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
-        latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
-        total_frames = cavp_feats.shape[0]
-        total_dur_s  = total_frames / fps
-        # Build segment list
-        segments = []
-        seg_start = 0.0
-        while True:
-            seg_end = min(seg_start + model_dur, total_dur_s)
-            segments.append((seg_start, seg_end))
-            if seg_end >= total_dur_s:
-                break
-            seg_start += step_s
-        # Run inference for every segment
-        wavs = []
-        for seg_start_s, seg_end_s in segments:
-            print(f"Inferring segment {seg_start_s:.2f}s – {seg_end_s:.2f}s ...")
-            wav = infer_segment(
-                model, vae, vocoder,
-                cavp_feats, onset_feats,
-                seg_start_s, seg_end_s,
-                sr, fps, truncate_frame, truncate_onset, model_dur,
-                latents_scale, device, weight_dtype,
-                cfg_scale, num_steps, mode,
-                euler_sampler, euler_maruyama_sampler,
-            )
-            wavs.append(wav)
-        # Store in cache
-        _INFERENCE_CACHE[cache_key] = {
-            "wavs":         wavs,
-            "total_dur_s":  total_dur_s,
-            "tmp_dir":      tmp_dir,
-            "silent_video": silent_video,
-        }
-    # Stitch with current crossfade params
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    final_wav = stitch_wavs(wavs, crossfade_s, crossfade_db, sr, total_dur_s)
-    audio_path   = os.path.join(tmp_dir, "output.wav")
-    sf.write(audio_path, final_wav, sr)
-    output_video = os.path.join(tmp_dir, "output.mp4")
-    input_v = ffmpeg.input(silent_video)
-    input_a = ffmpeg.input(audio_path)
-    (
-        ffmpeg
-        .output(input_v, input_a, output_video,
-                vcodec="libx264", acodec="aac", strict="experimental")
-        .run(overwrite_output=True, quiet=True)
-    )
-    return output_video, audio_path
-def get_random_seed():
-    return random.randint(0, 2**32 - 1)
-demo = gr.Interface(
-    fn=generate_audio,
-    inputs=[
-        gr.Video(label="Input Video"),
-        gr.Number(label="Seed", value=get_random_seed, precision=0),
-        gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5),
-        gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1),
-        gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde"),
-        gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1),
-        gr.Textbox(label="Crossfade Boost (dB)", value="3"),
-    ],
-    outputs=[
-        gr.Video(label="Output Video with Audio"),
-        gr.Audio(label="Generated Audio"),
-    ],
-    title="TARO: Video-to-Audio Synthesis (ICCV 2025)",
-    description="Upload a video and generate synchronized audio using TARO. Optimal clip duration is 8.2s. Longer videos are automatically split into overlapping segments and stitched with a crossfade.",
-)
 demo.queue().launch()

 import os
 import subprocess
 import sys
+from math import ceil, floor
 try:
     import mmcv
 taro_ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="taro_ckpt.pt", cache_dir=CACHE_DIR)
 print("Checkpoints downloaded.")
+# Model constants
+SR            = 16000                        # audio sample rate (Hz); used to convert seconds <-> samples
+TRUNCATE      = 131072                       # audio samples per model window (TRUNCATE / SR = MODEL_DUR)
+FPS           = 4                            # CAVP feature frames per second of video
+TRUNCATE_FRAME = int(FPS * TRUNCATE / SR)   # 32 cavp frames per model window
+TRUNCATE_ONSET = 120                         # onset frames per model window
+MODEL_DUR     = TRUNCATE / SR               # 8.192 s
+MAX_SLOTS     = 8                            # max sample output slots in UI
+SECS_PER_STEP = 2.5                          # estimated seconds of GPU time per diffusion step
 # ------------------------------------------------------------------ #
+# Inference cache                                                      #
+# Key: (video_path, seed, cfg_scale, num_steps, mode, crossfade_s)    #
+# Value: {"wavs": [...]}  (raw per-segment wav arrays only).        #
+# crossfade_db is not in the key: a hit re-stitches with a new dB.  #
 # ------------------------------------------------------------------ #
+_INFERENCE_CACHE = {}
 def set_global_seed(seed):
     )
+def get_video_duration(video_path):
+    """
+    Read video duration in seconds using ffprobe (no GPU needed).
+
+    Returns the container-level ``format.duration`` field of the probe
+    result converted to ``float``.  Any error raised by ``ffmpeg.probe`` or
+    by a missing ``format``/``duration`` entry propagates to the caller.
+    """
+    probe = ffmpeg.probe(video_path)
+    return float(probe["format"]["duration"])
+def build_segments(total_dur_s, crossfade_s):
+    """
+    Build list of (seg_start_s, seg_end_s) segment windows.
+    For videos <= MODEL_DUR: single segment [0, total_dur_s].
+    For longer videos: advance by step_s = MODEL_DUR - crossfade_s each time.
+    The LAST segment is always anchored at [total_dur_s - MODEL_DUR, total_dur_s]
+    so it is a full-length window with no zero-padding, giving the best quality
+    at the tail end of the video.
+    """
+    if total_dur_s <= MODEL_DUR:
+        return [(0.0, total_dur_s)]
+    step_s = MODEL_DUR - crossfade_s
+    segments = []
+    seg_start = 0.0
+    while True:
+        seg_end = seg_start + MODEL_DUR
+        if seg_end >= total_dur_s:
+            # Replace this segment with a full-length tail-anchored window
+            seg_start = max(0.0, total_dur_s - MODEL_DUR)
+            segments.append((seg_start, total_dur_s))
+            break
+        segments.append((seg_start, seg_start + MODEL_DUR))
+        seg_start += step_s
+    return segments
+def calc_max_samples(total_dur_s, num_steps, crossfade_s):
+    """
+    Estimate max samples that fit within the 600s ZeroGPU budget.
+
+    The estimate is budget / (num_segments * num_steps * SECS_PER_STEP); it
+    only accounts for diffusion steps, not for model loading or feature
+    extraction.  The result is clamped to the range [1, MAX_SLOTS].
+
+    Raises ZeroDivisionError when num_steps is 0.
+    """
+    num_segments = len(build_segments(total_dur_s, crossfade_s))
+    time_per_seg = num_steps * SECS_PER_STEP
+    # Hard-coded to match the duration passed to @spaces.GPU on generate_audio.
+    budget = 600.0
+    max_s = floor(budget / (num_segments * time_per_seg))
+    return max(1, min(max_s, MAX_SLOTS))
 def infer_segment(model, vae, vocoder, cavp_feats_full, onset_feats_full,
                   seg_start_s, seg_end_s,
+                  device, weight_dtype,
                   cfg_scale, num_steps, mode,
+                  latents_scale,
                   euler_sampler, euler_maruyama_sampler):
+    """Run one model inference pass. Returns wav trimmed to segment duration."""
+    # CAVP features (4 fps)
+    cavp_start = int(round(seg_start_s * FPS))
+    cavp_slice = cavp_feats_full[cavp_start : cavp_start + TRUNCATE_FRAME]
+    if cavp_slice.shape[0] < TRUNCATE_FRAME:
         pad = np.zeros(
+            (TRUNCATE_FRAME - cavp_slice.shape[0],) + cavp_slice.shape[1:],
             dtype=cavp_slice.dtype,
         )
         cavp_slice = np.concatenate([cavp_slice, pad], axis=0)
     video_feats = torch.from_numpy(cavp_slice).unsqueeze(0).to(device).to(weight_dtype)
+    # Onset features
+    onset_fps   = TRUNCATE_ONSET / MODEL_DUR
     onset_start = int(round(seg_start_s * onset_fps))
+    onset_slice = onset_feats_full[onset_start : onset_start + TRUNCATE_ONSET]
+    if onset_slice.shape[0] < TRUNCATE_ONSET:
+        pad_len = TRUNCATE_ONSET - onset_slice.shape[0]
         onset_slice = np.pad(onset_slice, ((0, pad_len),), mode="constant", constant_values=0)
     onset_feats_t = torch.from_numpy(onset_slice).unsqueeze(0).to(device).to(weight_dtype)
     samples = vae.decode(samples / latents_scale).sample
     wav = vocoder(samples.squeeze().float()).detach().cpu().numpy()
+    seg_samples = int(round((seg_end_s - seg_start_s) * SR))
     return wav[:seg_samples]
+def crossfade_join(wav_a, wav_b, crossfade_s, db_boost):
     """
+    Join two wav arrays by overlapping the tail of wav_a with the head of wav_b.
+
+    The overlap is crossfade_s seconds long (shrunk to the shorter input if
+    needed).  Inside the overlap both signals are multiplied by the same
+    constant gain = 10^(db_boost/20) and summed; no fade-in/fade-out ramp is
+    applied, so the gain is uniform across the whole overlap region.
+
+    Returns an array of length len(wav_a) + len(wav_b) - overlap.  When the
+    overlap is zero or negative the inputs are simply concatenated.
     """
+    cf_samples = int(round(crossfade_s * SR))
+    # Guard: never overlap more samples than either input actually has.
+    cf_samples  = min(cf_samples, len(wav_a), len(wav_b))
     if cf_samples <= 0:
         return np.concatenate([wav_a, wav_b])
+    gain    = 10 ** (db_boost / 20.0)
+    overlap = wav_a[-cf_samples:] * gain + wav_b[:cf_samples] * gain
+    return np.concatenate([wav_a[:-cf_samples], overlap, wav_b[cf_samples:]])
+def stitch_wavs(wavs, crossfade_s, db_boost, total_dur_s):
+    """Stitch segment wavs with crossfades and clip to total_dur_s."""
     if len(wavs) == 1:
         final_wav = wavs[0]
     else:
         final_wav = wavs[0]
+        for nw in wavs[1:]:
+            final_wav = crossfade_join(final_wav, nw, crossfade_s, db_boost)
+    return final_wav[:int(round(total_dur_s * SR))]
+def mux_video_audio(silent_video, audio_path, output_path):
+    """
+    Combine a video file and an audio file into output_path via ffmpeg.
+
+    The output is requested with vcodec="libx264" and acodec="aac"; an
+    existing file at output_path is overwritten.  Returns None.
+    """
+    input_v = ffmpeg.input(silent_video)
+    input_a = ffmpeg.input(audio_path)
+    (
+        ffmpeg
+        .output(input_v, input_a, output_path,
+                vcodec="libx264", acodec="aac", strict="experimental")
+        # NOTE(review): quiet=True presumably hides ffmpeg's console output,
+        # which also hides diagnostics on failure -- confirm.
+        .run(overwrite_output=True, quiet=True)
+    )
+# ------------------------------------------------------------------ #
+# UI helpers (no GPU)                                                  #
+# ------------------------------------------------------------------ #
+def on_video_upload(video_file, num_steps, crossfade_s):
+    """
+    Called when video is uploaded or sliders change. Updates samples slider.
+
+    Returns a gr.update that sets the slider's maximum to the estimated
+    number of samples that fit in the GPU budget and resets its value.
+    With no video, the maximum is MAX_SLOTS.
+    """
+    if video_file is None:
+        return gr.update(maximum=MAX_SLOTS, value=1)
+    try:
+        D     = get_video_duration(video_file)
+        max_s = calc_max_samples(D, int(num_steps), float(crossfade_s))
+    except Exception:
+        # Best effort: if probing or the estimate fails, fall back to the
+        # largest slot count rather than breaking the UI.
+        max_s = MAX_SLOTS
+    # NOTE(review): min(1, max_s) is 1 for every max_s >= 1, so the slider
+    # value is reset to 1 on each trigger -- confirm this is intended.
+    return gr.update(maximum=max_s, value=min(1, max_s))
+def get_random_seed():
+    """Return a random integer seed in the inclusive range [0, 2**32 - 1]."""
+    return random.randint(0, 2**32 - 1)
+# ------------------------------------------------------------------ #
+# Main inference                                                       #
+# ------------------------------------------------------------------ #
+@spaces.GPU(duration=600)
 def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode,
+                   crossfade_s, crossfade_db, num_samples):
+    """
+    Generate num_samples audio tracks for video_file and mux each with the video.
+
+    Sample i uses seed seed_val + i; a negative seed_val is replaced by a
+    random seed first.  Per-segment wavs are cached in _INFERENCE_CACHE under
+    (video_file, seed, cfg_scale, num_steps, mode, crossfade_s), so a repeat
+    call that only changes crossfade_db re-stitches without re-running the
+    diffusion model.
+
+    Returns a flat list of length MAX_SLOTS * 2 laid out as
+    [video_0, audio_0, video_1, audio_1, ...], padded with None for unused
+    slots.
+    """
     global _INFERENCE_CACHE
     seed_val     = int(seed_val)
     crossfade_s  = float(crossfade_s)
     crossfade_db = float(crossfade_db)
+    num_samples  = int(num_samples)
     if seed_val < 0:
         seed_val = random.randint(0, 2**32 - 1)
+    # Load models once (shared across all samples this call)
+    # NOTE(review): loading happens before the cache lookup below, so a call
+    # served entirely from cache still pays the model-loading cost.
+    torch.set_grad_enabled(False)
+    device       = "cuda" if torch.cuda.is_available() else "cpu"
+    weight_dtype = torch.bfloat16
+    from cavp_util import Extract_CAVP_Features
+    from onset_util import VideoOnsetNet, extract_onset
+    from models import MMDiT
+    from samplers import euler_sampler, euler_maruyama_sampler
+    from diffusers import AudioLDM2Pipeline
+    extract_cavp = Extract_CAVP_Features(
+        device=device, config_path="./cavp/cavp.yaml", ckpt_path=cavp_ckpt_path
+    )
+    state_dict = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
+    # Rename checkpoint keys so they match the attribute names of VideoOnsetNet.
+    new_state_dict = {}
+    for key, value in state_dict.items():
+        if "model.net.model" in key:
+            new_key = key.replace("model.net.model", "net.model")
+        elif "model.fc." in key:
+            new_key = key.replace("model.fc", "fc")
+        else:
+            new_key = key
+        new_state_dict[new_key] = value
+    onset_model = VideoOnsetNet(False).to(device)
+    onset_model.load_state_dict(new_state_dict)
+    onset_model.eval()
+    model = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
+    ckpt  = torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"]
+    model.load_state_dict(ckpt)
+    model.eval()
+    model.to(weight_dtype)
+    model_audioldm = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
+    vae     = model_audioldm.vae.to(device)
+    vae.eval()
+    vocoder = model_audioldm.vocoder.to(device)
+    latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
+    # Prepare silent video (shared across all samples)
+    # NOTE(review): the directory from mkdtemp is not removed in this
+    # function; the returned output paths point into it.
+    tmp_dir      = tempfile.mkdtemp()
+    silent_video = os.path.join(tmp_dir, "silent_input.mp4")
+    strip_audio_from_video(video_file, silent_video)
+    cavp_feats  = extract_cavp(silent_video, tmp_path=tmp_dir)
+    # Duration is derived from the number of extracted CAVP frames, not from
+    # container metadata.
+    total_frames = cavp_feats.shape[0]
+    total_dur_s  = total_frames / FPS
+    segments     = build_segments(total_dur_s, crossfade_s)
+    # ------------------------------------------------------------------ #
+    # Generate N samples                                                   #
+    # ------------------------------------------------------------------ #
+    outputs = []   # list of (video_path, audio_path)
+    for sample_idx in range(num_samples):
+        sample_seed = seed_val + sample_idx
+        # crossfade_db is intentionally absent from the key: it only affects
+        # stitching, which is redone on every call.
+        cache_key   = (video_file, sample_seed, float(cfg_scale),
+                       int(num_steps), mode, crossfade_s)
+        if cache_key in _INFERENCE_CACHE:
+            print(f"Sample {sample_idx+1}: cache hit, re-stitching.")
+            cached      = _INFERENCE_CACHE[cache_key]
+            wavs        = cached["wavs"]
+        else:
+            set_global_seed(sample_seed)
+            # NOTE(review): none of the arguments passed here vary with
+            # sample_idx; confirm whether extract_onset depends on the seed
+            # before hoisting it out of the loop.
+            onset_feats = extract_onset(
+                silent_video, onset_model, tmp_path=tmp_dir, device=device
+            )
+            wavs = []
+            for seg_start_s, seg_end_s in segments:
+                print(f"  Sample {sample_idx+1} | segment {seg_start_s:.2f}s – {seg_end_s:.2f}s")
+                wav = infer_segment(
+                    model, vae, vocoder,
+                    cavp_feats, onset_feats,
+                    seg_start_s, seg_end_s,
+                    device, weight_dtype,
+                    cfg_scale, num_steps, mode,
+                    latents_scale,
+                    euler_sampler, euler_maruyama_sampler,
+                )
+                wavs.append(wav)
+            # Only the per-segment wavs are cached; duration, segments and
+            # temp paths are recomputed on every call.
+            _INFERENCE_CACHE[cache_key] = {"wavs": wavs}
+        # Stitch
+        final_wav  = stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s)
+        audio_path = os.path.join(tmp_dir, f"output_{sample_idx}.wav")
+        sf.write(audio_path, final_wav, SR)
+        video_path = os.path.join(tmp_dir, f"output_{sample_idx}.mp4")
+        mux_video_audio(silent_video, audio_path, video_path)
+        outputs.append((video_path, audio_path))
+    # ------------------------------------------------------------------ #
+    # Return flat list of (video, audio) pairs padded with None           #
+    # so Gradio output list length is always MAX_SLOTS * 2                #
+    # ------------------------------------------------------------------ #
+    result = []
+    for i in range(MAX_SLOTS):
+        if i < len(outputs):
+            result.append(outputs[i][0])   # video
+            result.append(outputs[i][1])   # audio
+        else:
+            result.append(None)
+            result.append(None)
+    return result
+# ------------------------------------------------------------------ #
+# Build gr.Blocks UI                                                   #
+# ------------------------------------------------------------------ #
+with gr.Blocks(title="TARO: Video-to-Audio Synthesis") as demo:
+    gr.Markdown(
+        "# TARO: Video-to-Audio Synthesis (ICCV 2025)\n"
+        "Upload a video and generate synchronized audio. "
+        "Optimal clip duration is 8.2s. Longer videos are automatically "
+        "split into overlapping segments and stitched with a crossfade."
+    )
+    with gr.Row():
+        with gr.Column():
+            video_input   = gr.Video(label="Input Video")
+            seed_input    = gr.Number(label="Seed", value=get_random_seed, precision=0)
+            cfg_input     = gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5)
+            steps_input   = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1)
+            mode_input    = gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde")
+            cf_dur_input  = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1)
+            cf_db_input   = gr.Textbox(label="Crossfade Boost (dB)", value="3")
+            samples_input = gr.Slider(label="Number of Samples", minimum=1, maximum=MAX_SLOTS,
+                                      value=1, step=1)
+            run_btn       = gr.Button("Generate", variant="primary")
+        with gr.Column():
+            # Pre-build MAX_SLOTS output slots; hide all initially
+            slot_videos = []
+            slot_audios = []
+            for i in range(MAX_SLOTS):
+                with gr.Group(visible=False) as grp:
+                    sv = gr.Video(label=f"Sample {i+1} — Video")
+                    sa = gr.Audio(label=f"Sample {i+1} — Audio")
+                slot_videos.append((grp, sv))
+                slot_audios.append((grp, sa))
+    # ------------------------------------------------------------------ #
+    # Events                                                              #
+    # ------------------------------------------------------------------ #
+    # Update samples slider max when video uploaded or relevant sliders change
+    def _update_samples_slider(video_file, num_steps, crossfade_s):
+        return on_video_upload(video_file, num_steps, crossfade_s)
+    for trigger in [video_input, steps_input, cf_dur_input]:
+        trigger.change(
+            fn=_update_samples_slider,
+            inputs=[video_input, steps_input, cf_dur_input],
+            outputs=[samples_input],
+        )
+    # Collect all output components (flat: grp_visible, video, audio per slot)
+    all_outputs = []
+    for grp, sv in slot_videos:
+        all_outputs.append(grp)
+    for _, sa in slot_audios:
+        all_outputs.append(sa)
+    # Actually build properly: interleaved group + video + audio
+    all_outputs = []
+    slot_video_comps = [sv for _, sv in slot_videos]
+    slot_audio_comps = [sa for _, sa in slot_audios]
+    slot_grp_comps   = [grp for grp, _ in slot_videos]
+    def _generate_and_update(video_file, seed_val, cfg_scale, num_steps, mode,
+                              crossfade_s, crossfade_db, num_samples):
+        flat = generate_audio(video_file, seed_val, cfg_scale, num_steps, mode,
+                               crossfade_s, crossfade_db, num_samples)
+        num_samples = int(num_samples)
+        # flat = [vid0, aud0, vid1, aud1, ...]
+        grp_updates   = []
+        video_updates = []
+        audio_updates = []
+        for i in range(MAX_SLOTS):
+            visible = i < num_samples
+            vid = flat[i * 2]
+            aud = flat[i * 2 + 1]
+            grp_updates.append(gr.update(visible=visible))
+            video_updates.append(gr.update(value=vid))
+            audio_updates.append(gr.update(value=aud))
+        return grp_updates + video_updates + audio_updates
+    run_btn.click(
+        fn=_generate_and_update,
+        inputs=[video_input, seed_input, cfg_input, steps_input, mode_input,
+                cf_dur_input, cf_db_input, samples_input],
+        outputs=slot_grp_comps + slot_video_comps + slot_audio_comps,
+    )
 demo.queue().launch()