BoxOfColors committed on
Commit
02a1f95
·
1 Parent(s): 09846c9
Files changed (1) hide show
  1. app.py +745 -29
app.py CHANGED
@@ -10,6 +10,8 @@ Supported models
10
 
11
  import os
12
  import sys
 
 
13
  import tempfile
14
  import random
15
  from pathlib import Path
@@ -383,7 +385,19 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
383
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(final_wav)).unsqueeze(0), TARO_SR)
384
  video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
385
  mux_video_audio(silent_video, audio_path, video_path)
386
- outputs.append((video_path, audio_path))
 
 
 
 
 
 
 
 
 
 
 
 
387
 
388
  return _pad_outputs(outputs)
389
 
@@ -544,7 +558,19 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
544
 
545
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
546
  mux_video_audio(silent_video, audio_path, video_path)
547
- outputs.append((video_path, audio_path))
 
 
 
 
 
 
 
 
 
 
 
 
548
 
549
  return _pad_outputs(outputs)
550
 
@@ -707,45 +733,633 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
707
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
708
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
709
  merge_audio_video(audio_path, silent_video, video_path)
710
- outputs.append((video_path, audio_path))
 
 
 
 
 
 
 
 
 
 
 
 
711
 
712
  return _pad_outputs(outputs)
713
 
714
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
715
  # ================================================================== #
716
  # SHARED UI HELPERS #
717
  # ================================================================== #
718
 
719
  def _pad_outputs(outputs: list) -> list:
720
- """Flatten (video, audio) pairs and pad to MAX_SLOTS * 2 with None."""
 
 
 
 
 
 
721
  result = []
722
  for i in range(MAX_SLOTS):
723
  if i < len(outputs):
724
- result.extend(outputs[i])
725
  else:
726
- result.extend([None, None])
727
  return result
728
 
729
 
730
- def _make_output_slots() -> tuple:
731
- """Build MAX_SLOTS video+audio output groups. Returns (grps, vids, auds)."""
732
- grps, vids, auds = [], [], []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
733
  for i in range(MAX_SLOTS):
734
  with gr.Group(visible=(i == 0)) as g:
 
735
  vids.append(gr.Video(label=f"Generation {i+1} — Video"))
736
- auds.append(gr.Audio(label=f"Generation {i+1} — Audio"))
 
 
 
 
 
 
 
 
 
 
 
737
  grps.append(g)
738
- return grps, vids, auds
739
 
740
 
741
- def _unpack_outputs(flat: list, n: int) -> list:
742
- """Turn a flat _pad_outputs list into Gradio update lists for grps+vids+auds."""
 
 
 
 
743
  n = int(n)
744
- return (
745
- [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)] +
746
- [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)] +
747
- [gr.update(value=flat[i * 2 + 1]) for i in range(MAX_SLOTS)]
748
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
749
 
750
 
751
  def _on_video_upload_taro(video_file, num_steps, crossfade_s):
@@ -798,7 +1412,9 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
798
  taro_btn = gr.Button("Generate", variant="primary")
799
 
800
  with gr.Column():
801
- taro_slot_grps, taro_slot_vids, taro_slot_auds = _make_output_slots()
 
 
802
 
803
  for trigger in [taro_video, taro_steps, taro_cf_dur]:
804
  trigger.change(
@@ -813,15 +1429,49 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
813
  )
814
 
815
  def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
816
- return _unpack_outputs(generate_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n), n)
 
 
 
 
 
 
 
817
 
818
  taro_btn.click(
819
  fn=_run_taro,
820
  inputs=[taro_video, taro_seed, taro_cfg, taro_steps, taro_mode,
821
  taro_cf_dur, taro_cf_db, taro_samples],
822
- outputs=taro_slot_grps + taro_slot_vids + taro_slot_auds,
823
  )
824
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
825
  # ---------------------------------------------------------- #
826
  # Tab 2 — MMAudio #
827
  # ---------------------------------------------------------- #
@@ -840,7 +1490,9 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
840
  mma_btn = gr.Button("Generate", variant="primary")
841
 
842
  with gr.Column():
843
- mma_slot_grps, mma_slot_vids, mma_slot_auds = _make_output_slots()
 
 
844
 
845
  mma_samples.change(
846
  fn=_update_slot_visibility,
@@ -849,15 +1501,47 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
849
  )
850
 
851
  def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
852
- return _unpack_outputs(generate_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n), n)
 
 
 
 
 
 
853
 
854
  mma_btn.click(
855
  fn=_run_mmaudio,
856
  inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
857
  mma_cfg, mma_steps, mma_cf_dur, mma_cf_db, mma_samples],
858
- outputs=mma_slot_grps + mma_slot_vids + mma_slot_auds,
859
  )
860
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
861
  # ---------------------------------------------------------- #
862
  # Tab 3 — HunyuanVideoFoley #
863
  # ---------------------------------------------------------- #
@@ -877,7 +1561,9 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
877
  hf_btn = gr.Button("Generate", variant="primary")
878
 
879
  with gr.Column():
880
- hf_slot_grps, hf_slot_vids, hf_slot_auds = _make_output_slots()
 
 
881
 
882
  hf_samples.change(
883
  fn=_update_slot_visibility,
@@ -886,18 +1572,48 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
886
  )
887
 
888
  def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
889
- return _unpack_outputs(generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n), n)
 
 
 
 
 
 
890
 
891
  hf_btn.click(
892
  fn=_run_hunyuan,
893
  inputs=[hf_video, hf_prompt, hf_neg, hf_seed,
894
  hf_guidance, hf_steps, hf_size, hf_cf_dur, hf_cf_db, hf_samples],
895
- outputs=hf_slot_grps + hf_slot_vids + hf_slot_auds,
896
  )
897
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
898
  # ---- Cross-tab video sync ----
899
- # When any tab's video changes, push the value to the other two tabs.
900
- # Clearing (value=None) also propagates so the X button clears all.
901
  _sync = lambda v: (gr.update(value=v), gr.update(value=v))
902
  taro_video.change(fn=_sync, inputs=[taro_video], outputs=[mma_video, hf_video])
903
  mma_video.change(fn=_sync, inputs=[mma_video], outputs=[taro_video, hf_video])
 
10
 
11
  import os
12
  import sys
13
+ import json
14
+ import base64
15
  import tempfile
16
  import random
17
  from pathlib import Path
 
385
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(final_wav)).unsqueeze(0), TARO_SR)
386
  video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
387
  mux_video_audio(silent_video, audio_path, video_path)
388
+ seg_meta = {
389
+ "segments": segments,
390
+ "wavs": [w.copy() for w in wavs],
391
+ "audio_path": audio_path,
392
+ "video_path": video_path,
393
+ "silent_video": silent_video,
394
+ "sr": TARO_SR,
395
+ "model": "taro",
396
+ "crossfade_s": crossfade_s,
397
+ "crossfade_db": crossfade_db,
398
+ "total_dur_s": total_dur_s,
399
+ }
400
+ outputs.append((video_path, audio_path, seg_meta))
401
 
402
  return _pad_outputs(outputs)
403
 
 
558
 
559
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
560
  mux_video_audio(silent_video, audio_path, video_path)
561
+ seg_meta = {
562
+ "segments": segments,
563
+ "wavs": [w.copy() for w in seg_audios],
564
+ "audio_path": audio_path,
565
+ "video_path": video_path,
566
+ "silent_video": silent_video,
567
+ "sr": sr,
568
+ "model": "mmaudio",
569
+ "crossfade_s": crossfade_s,
570
+ "crossfade_db": crossfade_db,
571
+ "total_dur_s": total_dur_s,
572
+ }
573
+ outputs.append((video_path, audio_path, seg_meta))
574
 
575
  return _pad_outputs(outputs)
576
 
 
733
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
734
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
735
  merge_audio_video(audio_path, silent_video, video_path)
736
+ seg_meta = {
737
+ "segments": segments,
738
+ "wavs": [w.copy() for w in seg_wavs],
739
+ "audio_path": audio_path,
740
+ "video_path": video_path,
741
+ "silent_video": silent_video,
742
+ "sr": sr,
743
+ "model": "hunyuan",
744
+ "crossfade_s": crossfade_s,
745
+ "crossfade_db": crossfade_db,
746
+ "total_dur_s": total_dur_s,
747
+ }
748
+ outputs.append((video_path, audio_path, seg_meta))
749
 
750
  return _pad_outputs(outputs)
751
 
752
 
753
+ # ================================================================== #
754
+ # SEGMENT REGENERATION HELPERS #
755
+ # ================================================================== #
756
+ # Each regen function:
757
+ # 1. Runs inference for ONE segment (random seed, current settings)
758
+ # 2. Splices the new wav into the stored wavs list
759
+ # 3. Re-stitches the full track, re-saves .wav and re-muxes .mp4
760
+ # 4. Returns (new_video_path, new_audio_path, updated_seg_meta, new_waveform_html)
761
+ # ================================================================== #
762
+
763
def _splice_and_save(new_wav, seg_idx, meta, slot_id):
    """Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.

    Args:
        new_wav: numpy array for the regenerated segment (mono 1-D or
            stereo 2-D, matching the other entries in ``meta["wavs"]``).
        seg_idx: index of the segment being replaced.
        meta: seg_meta dict as produced by the generate_* functions, with
            "wavs" already deserialised back to numpy arrays.
        slot_id: UI slot identifier used to build element ids for the
            waveform HTML / hidden regen-trigger textbox.

    Returns:
        (video_path, audio_path, updated_meta, waveform_html)
    """
    # Work on copies so the caller's meta["wavs"] is never mutated.
    wavs = [w.copy() for w in meta["wavs"]]
    wavs[seg_idx] = new_wav
    crossfade_s = float(meta["crossfade_s"])
    crossfade_db = float(meta["crossfade_db"])
    sr = int(meta["sr"])
    total_dur_s = float(meta["total_dur_s"])
    silent_video = meta["silent_video"]
    segments = meta["segments"]
    model = meta["model"]

    # Stitch (works for both mono and stereo): crossfade consecutive
    # segments, then trim to the exact total duration in samples.
    stereo = wavs[0].ndim == 2
    full_wav = wavs[0]
    for nw in wavs[1:]:
        full_wav = _cf_join(full_wav, nw, crossfade_s, crossfade_db, sr)
    n_total = int(round(total_dur_s * sr))
    if stereo:
        full_wav = full_wav[:, :n_total]
    else:
        full_wav = full_wav[:n_total]

    # Save new audio over the previous take (path reused in-place).
    # NOTE: removed an unused `tmp_dir = os.path.dirname(...)` local here.
    audio_path = meta["audio_path"]  # overwrite in-place
    if stereo:
        torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
    else:
        torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)).unsqueeze(0), sr)

    # Re-mux video
    video_path = meta["video_path"]  # overwrite in-place
    if model == "hunyuan":
        # HunyuanFoley uses its own merge_audio_video
        _hf_path = str(Path("HunyuanVideo-Foley").resolve())
        if _hf_path not in sys.path:
            sys.path.insert(0, _hf_path)
        from hunyuanvideo_foley.utils.media_utils import merge_audio_video
        merge_audio_video(audio_path, silent_video, video_path)
    else:
        mux_video_audio(silent_video, audio_path, video_path)

    # Return a fresh meta so the caller's dict is left untouched.
    updated_meta = dict(meta)
    updated_meta["wavs"] = wavs
    updated_meta["audio_path"] = audio_path
    updated_meta["video_path"] = video_path

    # Rebuild the waveform widget so the UI reflects the new audio.
    hidden_el_id = f"regen_trigger_{slot_id}"
    waveform_html = _build_waveform_html(audio_path, segments, slot_id, hidden_el_id)
    return video_path, audio_path, updated_meta, waveform_html
816
+
817
+
818
def _taro_regen_duration(video_file, seg_idx, seg_meta_json,
                         seed_val, cfg_scale, num_steps, mode,
                         crossfade_s, crossfade_db):
    """Estimate the @spaces.GPU reservation (seconds) for a one-segment TARO regen."""
    # Only one segment is regenerated: per-step cost plus fixed load overhead.
    secs = int(num_steps) * TARO_SECS_PER_STEP + TARO_LOAD_OVERHEAD
    # Clamp the estimate into [60, GPU_DURATION_CAP].
    result = max(60, int(secs))
    if result > GPU_DURATION_CAP:
        result = GPU_DURATION_CAP
    print(f"[duration] TARO regen: 1 seg × {int(num_steps)} steps → {secs:.0f}s → capped {result}s")
    return result
825
+
826
+
827
@spaces.GPU(duration=_taro_regen_duration)
def regen_taro_segment(video_file, seg_idx, seg_meta_json,
                       seed_val, cfg_scale, num_steps, mode,
                       crossfade_s, crossfade_db, slot_id):
    """Regenerate one TARO segment with a fresh random seed.

    Args:
        video_file: not read here; present so the signature matches the
            Gradio input list wired to this handler.
        seg_idx: index of the segment to redo within meta["segments"].
        seg_meta_json: JSON string from a previous generate/regen call; its
            "wavs" entries are plain lists and are converted back to numpy.
        seed_val: not read — regeneration always draws a fresh random seed.
        cfg_scale: classifier-free guidance scale for the sampler.
        num_steps: number of sampling steps.
        mode: sampler mode forwarded to _taro_infer_segment.
        crossfade_s, crossfade_db: not read here; the stitch settings used
            by _splice_and_save come from the stored meta instead.
        slot_id: UI slot identifier, forwarded for waveform element ids.

    Returns:
        (video_path, audio_path, updated_meta_json, waveform_html)
    """
    meta = json.loads(seg_meta_json)
    seg_idx = int(seg_idx)
    seg_start_s, seg_end_s = meta["segments"][seg_idx]

    torch.set_grad_enabled(False)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    weight_dtype = torch.bfloat16

    # Make the vendored TARO package importable before the imports below.
    _taro_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "TARO")
    if _taro_dir not in sys.path:
        sys.path.insert(0, _taro_dir)

    from TARO.cavp_util import Extract_CAVP_Features
    from TARO.onset_util import VideoOnsetNet, extract_onset
    from TARO.models import MMDiT
    from TARO.samplers import euler_sampler, euler_maruyama_sampler
    from diffusers import AudioLDM2Pipeline

    silent_video = meta["silent_video"]
    tmp_dir = tempfile.mkdtemp()

    # Rebuild the full model stack — this runs inside a fresh @spaces.GPU worker.
    extract_cavp = Extract_CAVP_Features(device=device, config_path="TARO/cavp/cavp.yaml", ckpt_path=cavp_ckpt_path)
    raw_sd = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
    # Strip trainer-style key prefixes from the onset checkpoint.
    onset_sd = {}
    for k, v in raw_sd.items():
        if "model.net.model" in k: k = k.replace("model.net.model", "net.model")
        elif "model.fc." in k: k = k.replace("model.fc", "fc")
        onset_sd[k] = v
    onset_model = VideoOnsetNet(pretrained=False).to(device)
    onset_model.load_state_dict(onset_sd)
    onset_model.eval()
    model_net = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
    model_net.load_state_dict(torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"])
    model_net.eval().to(weight_dtype)
    audioldm2 = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
    vae = audioldm2.vae.to(device).eval()
    vocoder = audioldm2.vocoder.to(device)
    latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)

    cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
    # Fresh random seed: a regen is meant to produce a different take.
    set_global_seed(random.randint(0, 2**32 - 1))
    onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)

    new_wav = _taro_infer_segment(
        model_net, vae, vocoder, cavp_feats, onset_feats,
        seg_start_s, seg_end_s, device, weight_dtype,
        float(cfg_scale), int(num_steps), mode, latents_scale,
        euler_sampler, euler_maruyama_sampler,
    )

    # Deserialise stored wavs from lists back to numpy arrays (json roundtrip)
    stored_wavs = [np.array(w, dtype=np.float32) for w in meta["wavs"]]
    meta["wavs"] = stored_wavs

    video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
        new_wav, seg_idx, meta, slot_id
    )
    # Back to JSON-safe lists before the meta goes through Gradio state.
    updated_meta["wavs"] = [w.tolist() for w in updated_meta["wavs"]]
    return video_path, audio_path, json.dumps(updated_meta), waveform_html
891
+
892
+
893
def _mmaudio_regen_duration(video_file, seg_idx, seg_meta_json,
                            prompt, negative_prompt, seed_val,
                            cfg_strength, num_steps, crossfade_s, crossfade_db):
    """Estimate the @spaces.GPU reservation (seconds) for a one-segment MMAudio regen."""
    # Single-segment estimate: sampling cost plus fixed model-load overhead.
    estimate = int(num_steps) * MMAUDIO_SECS_PER_STEP + MMAUDIO_LOAD_OVERHEAD
    secs = estimate
    # Never reserve less than a minute nor more than the cap.
    result = min(GPU_DURATION_CAP, max(60, int(secs)))
    print(f"[duration] MMAudio regen: 1 seg × {int(num_steps)} steps → {secs:.0f}s → capped {result}s")
    return result
900
+
901
+
902
@spaces.GPU(duration=_mmaudio_regen_duration)
def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
                          prompt, negative_prompt, seed_val,
                          cfg_strength, num_steps, crossfade_s, crossfade_db, slot_id):
    """Regenerate one MMAudio segment with a fresh random seed.

    Args:
        video_file: not read here; kept so the signature matches the UI inputs.
        seg_idx: index of the segment to redo within meta["segments"].
        seg_meta_json: JSON string from a previous generate/regen call.
        prompt, negative_prompt: text conditioning for MMAudio.
        seed_val: not read — a fresh random seed is always drawn.
        cfg_strength, num_steps: sampling settings.
        crossfade_s, crossfade_db: not read here; stitch settings come from meta.
        slot_id: UI slot identifier, forwarded for waveform element ids.

    Returns:
        (video_path, audio_path, updated_meta_json, waveform_html)
    """
    meta = json.loads(seg_meta_json)
    seg_idx = int(seg_idx)
    seg_start, seg_end = meta["segments"][seg_idx]
    seg_dur = seg_end - seg_start

    # Make the vendored MMAudio package importable before the imports below.
    _mmaudio_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "MMAudio")
    if _mmaudio_dir not in sys.path:
        sys.path.insert(0, _mmaudio_dir)

    from mmaudio.eval_utils import all_model_cfg, generate, load_video
    from mmaudio.model.flow_matching import FlowMatching
    from mmaudio.model.networks import get_my_mmaudio
    from mmaudio.model.utils.features_utils import FeaturesUtils
    from pathlib import Path as _Path

    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.bfloat16

    # Point the stock config at the locally downloaded checkpoints.
    model_cfg = all_model_cfg["large_44k_v2"]
    model_cfg.model_path = _Path(mmaudio_model_path)
    model_cfg.vae_path = _Path(mmaudio_vae_path)
    model_cfg.synchformer_ckpt = _Path(mmaudio_synchformer_path)
    model_cfg.bigvgan_16k_path = None
    seq_cfg = model_cfg.seq_cfg

    net = get_my_mmaudio(model_cfg.model_name).to(device, dtype).eval()
    net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))
    feature_utils = FeaturesUtils(
        tod_vae_ckpt=str(model_cfg.vae_path),
        synchformer_ckpt=str(model_cfg.synchformer_ckpt),
        enable_conditions=True, mode=model_cfg.mode,
        bigvgan_vocoder_ckpt=None, need_vae_encoder=False,
    ).to(device, dtype).eval()

    sr = seq_cfg.sampling_rate
    silent_video = meta["silent_video"]
    tmp_dir = tempfile.mkdtemp()
    # Cut just the target segment out of the silent video (stream copy, no re-encode).
    seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
    ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
        seg_path, vcodec="copy", an=None
    ).run(overwrite_output=True, quiet=True)

    # Fresh random seed: a regen is meant to produce a different take.
    rng = torch.Generator(device=device)
    rng.manual_seed(random.randint(0, 2**32 - 1))

    fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=int(num_steps))
    video_info = load_video(seg_path, seg_dur)
    clip_frames = video_info.clip_frames.unsqueeze(0)
    sync_frames = video_info.sync_frames.unsqueeze(0)
    # Use the decoded clip's actual duration to size the sequence lengths.
    actual_dur = video_info.duration_sec
    seq_cfg.duration = actual_dur
    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    with torch.no_grad():
        audios = generate(
            clip_frames, sync_frames, [prompt],
            negative_text=[negative_prompt] if negative_prompt else None,
            feature_utils=feature_utils, net=net, fm=fm, rng=rng,
            cfg_strength=float(cfg_strength),
        )
    new_wav = audios.float().cpu()[0].numpy()
    # Trim to the exact requested segment length in samples.
    seg_samples = int(round(seg_dur * sr))
    new_wav = new_wav[:, :seg_samples]

    # Deserialise stored wavs (json roundtrip turned them into lists).
    stored_wavs = [np.array(w, dtype=np.float32) for w in meta["wavs"]]
    meta["wavs"] = stored_wavs
    meta["sr"] = sr

    video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
        new_wav, seg_idx, meta, slot_id
    )
    # Back to JSON-safe lists before the meta goes through Gradio state.
    updated_meta["wavs"] = [w.tolist() for w in updated_meta["wavs"]]
    return video_path, audio_path, json.dumps(updated_meta), waveform_html
+ return video_path, audio_path, json.dumps(updated_meta), waveform_html
980
+
981
+
982
def _hunyuan_regen_duration(video_file, seg_idx, seg_meta_json,
                            prompt, negative_prompt, seed_val,
                            guidance_scale, num_steps, model_size,
                            crossfade_s, crossfade_db):
    """Estimate the @spaces.GPU reservation (seconds) for a one-segment HunyuanFoley regen."""
    # One segment only: per-step cost plus fixed model-load overhead.
    steps = int(num_steps)
    secs = steps * HUNYUAN_SECS_PER_STEP + HUNYUAN_LOAD_OVERHEAD
    # Clamp into [60, GPU_DURATION_CAP].
    floored = max(60, int(secs))
    result = GPU_DURATION_CAP if floored > GPU_DURATION_CAP else floored
    print(f"[duration] HunyuanFoley regen: 1 seg × {int(num_steps)} steps → {secs:.0f}s → capped {result}s")
    return result
990
+
991
+
992
@spaces.GPU(duration=_hunyuan_regen_duration)
def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
                          prompt, negative_prompt, seed_val,
                          guidance_scale, num_steps, model_size,
                          crossfade_s, crossfade_db, slot_id):
    """Regenerate one HunyuanFoley segment with a fresh random seed.

    Args:
        video_file: not read here; kept so the signature matches the UI inputs.
        seg_idx: index of the segment to redo within meta["segments"].
        seg_meta_json: JSON string from a previous generate/regen call.
        prompt, negative_prompt: text conditioning (empty strings allowed).
        seed_val: not read — a fresh random seed is always drawn.
        guidance_scale, num_steps: denoising settings.
        model_size: "xl" or "xxl"; anything else falls back to "xxl".
        crossfade_s, crossfade_db: not read here; stitch settings come from meta.
        slot_id: UI slot identifier, forwarded for waveform element ids.

    Returns:
        (video_path, audio_path, updated_meta_json, waveform_html)
    """
    meta = json.loads(seg_meta_json)
    seg_idx = int(seg_idx)
    seg_start, seg_end = meta["segments"][seg_idx]
    seg_dur = seg_end - seg_start

    # Make the vendored HunyuanVideo-Foley package importable.
    _hf_path = str(Path("HunyuanVideo-Foley").resolve())
    if _hf_path not in sys.path:
        sys.path.insert(0, _hf_path)

    from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
    from hunyuanvideo_foley.utils.feature_utils import feature_process

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_size = model_size.lower()
    config_map = {
        "xl": "HunyuanVideo-Foley/configs/hunyuanvideo-foley-xl.yaml",
        "xxl": "HunyuanVideo-Foley/configs/hunyuanvideo-foley-xxl.yaml",
    }
    # Unknown sizes fall back to the xxl config.
    config_path = config_map.get(model_size, config_map["xxl"])
    hunyuan_weights_dir = str(HUNYUAN_MODEL_DIR / "HunyuanVideo-Foley")
    model_dict, cfg = load_model(hunyuan_weights_dir, config_path, device,
                                 enable_offload=False, model_size=model_size)

    # Fresh random seed: a regen is meant to produce a different take.
    set_global_seed(random.randint(0, 2**32 - 1))

    silent_video = meta["silent_video"]
    tmp_dir = tempfile.mkdtemp()
    # Cut just the target segment out of the silent video (stream copy, no re-encode).
    seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
    ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
        seg_path, vcodec="copy", an=None
    ).run(overwrite_output=True, quiet=True)

    visual_feats, text_feats, seg_audio_len = feature_process(
        seg_path, prompt if prompt else "", model_dict, cfg,
        neg_prompt=negative_prompt if negative_prompt else None,
    )
    audio_batch, sr = denoise_process(
        visual_feats, text_feats, seg_audio_len, model_dict, cfg,
        guidance_scale=float(guidance_scale),
        num_inference_steps=int(num_steps),
        batch_size=1,
    )
    new_wav = audio_batch[0].float().cpu().numpy()
    # Trim to the exact requested segment length in samples.
    seg_samples = int(round(seg_dur * sr))
    new_wav = new_wav[:, :seg_samples]

    # Deserialise stored wavs (json roundtrip turned them into lists).
    stored_wavs = [np.array(w, dtype=np.float32) for w in meta["wavs"]]
    meta["wavs"] = stored_wavs
    meta["sr"] = sr

    video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
        new_wav, seg_idx, meta, slot_id
    )
    # Back to JSON-safe lists before the meta goes through Gradio state.
    updated_meta["wavs"] = [w.tolist() for w in updated_meta["wavs"]]
    return video_path, audio_path, json.dumps(updated_meta), waveform_html
1053
+
1054
+
1055
  # ================================================================== #
1056
  # SHARED UI HELPERS #
1057
  # ================================================================== #
1058
 
1059
def _pad_outputs(outputs: list) -> list:
    """Flatten (video, audio, seg_meta) triples and pad to MAX_SLOTS * 3 with None.

    Each entry in *outputs* must be a (video_path, audio_path, seg_meta) tuple where
    seg_meta = {"segments": [...], "audio_path": str, "video_path": str,
                "sr": int, "model": str, "crossfade_s": float,
                "crossfade_db": float, "wavs": list[np.ndarray]}
    """
    flat = []
    for slot in range(MAX_SLOTS):
        # Real triple for filled slots, (None, None, None) for the rest.
        triple = outputs[slot] if slot < len(outputs) else (None, None, None)
        flat.extend(triple)
    return flat
1074
 
1075
 
1076
+ # ------------------------------------------------------------------ #
1077
+ # WaveSurfer waveform + segment marker HTML builder #
1078
+ # ------------------------------------------------------------------ #
1079
+
1080
# Pinned WaveSurfer v7 core + Regions plugin, loaded lazily from the CDN.
_WAVESURFER_CDN = "https://cdnjs.cloudflare.com/ajax/libs/wavesurfer.js/7.8.7/wavesurfer.min.js"
_REGIONS_CDN = "https://cdnjs.cloudflare.com/ajax/libs/wavesurfer.js/7.8.7/plugins/regions.min.js"


def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
                         hidden_input_id: str) -> str:
    """Return a self-contained HTML block with a WaveSurfer waveform,
    segment boundary markers, a play/pause button, and a download link.

    Clicking a region shows a small popup near the cursor with a
    "Regenerate" button. Clicking elsewhere dismisses the popup.
    Clicking "Regenerate" fires the hidden Gradio textbox to trigger Python.

    Args:
        audio_path: absolute path to the .wav file
        segments: list of (start_s, end_s) tuples
        slot_id: unique string id for this slot (e.g. "taro_0")
        hidden_input_id: elem_id of the hidden gr.Textbox to fire
    """
    if not audio_path or not os.path.exists(audio_path):
        return "<p style='color:#888;font-size:12px'>No audio yet.</p>"

    # Inline the audio as a base64 data URI so the HTML needs no file serving.
    with open(audio_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    data_uri = f"data:audio/wav;base64,{b64}"

    segs_json = json.dumps(segments)

    # Per-segment overlay colours; cycled via modulo if there are more segments.
    colors = ["rgba(100,180,255,0.22)", "rgba(255,160,100,0.22)",
              "rgba(120,220,140,0.22)", "rgba(220,120,220,0.22)",
              "rgba(255,220,80,0.22)", "rgba(80,220,220,0.22)",
              "rgba(255,100,100,0.22)", "rgba(180,255,180,0.22)"]

    # NOTE: doubled braces ({{ }}) below are literal JS braces inside the f-string.
    return f"""
    <div id="wf_container_{slot_id}"
         style="background:#1a1a1a;border-radius:8px;padding:10px;margin-top:6px;position:relative;">
      <div id="wf_{slot_id}" style="width:100%;min-height:80px;"></div>
      <div style="display:flex;align-items:center;gap:8px;margin-top:6px;">
        <button id="wf_playbtn_{slot_id}" onclick="wf_toggle_{slot_id}()"
                style="background:#333;color:#eee;border:1px solid #555;border-radius:4px;
                       padding:3px 10px;font-size:12px;cursor:pointer;">&#9654; Play</button>
        <span style="color:#888;font-size:11px;">Click a segment to regenerate</span>
        <a href="{data_uri}" download="audio_{slot_id}.wav"
           style="margin-left:auto;background:#333;color:#eee;border:1px solid #555;
                  border-radius:4px;padding:3px 10px;font-size:12px;text-decoration:none;">
          &#8595; Download</a>
      </div>
      <div id="wf_seglabel_{slot_id}"
           style="color:#aaa;font-size:11px;margin-top:4px;min-height:16px;"></div>

      <!-- Popup that appears on segment click -->
      <div id="wf_popup_{slot_id}"
           style="display:none;position:fixed;z-index:9999;
                  background:#2a2a2a;border:1px solid #555;border-radius:6px;
                  padding:8px 12px;box-shadow:0 4px 16px rgba(0,0,0,0.5);">
        <div id="wf_popup_label_{slot_id}"
             style="color:#ccc;font-size:11px;margin-bottom:6px;white-space:nowrap;"></div>
        <button id="wf_regen_btn_{slot_id}"
                style="background:#1d6fa5;color:#fff;border:none;border-radius:4px;
                       padding:5px 14px;font-size:12px;cursor:pointer;width:100%;">
          &#10227; Regenerate
        </button>
      </div>
    </div>
    <script>
    (function() {{
      // Guard against double-init on Gradio re-renders
      if (window["_wf_init_{slot_id}"]) return;
      window["_wf_init_{slot_id}"] = true;

      let _pendingSegIdx_{slot_id} = null;

      function fireRegen(idx) {{
        const popup = document.getElementById('wf_popup_{slot_id}');
        if (popup) popup.style.display = 'none';
        const lbl = document.getElementById('wf_seglabel_{slot_id}');
        const segs = {segs_json};
        if (lbl) lbl.textContent = 'Regenerating Seg ' + (idx+1) +
          ' (' + segs[idx][0].toFixed(2) + 's \u2013 ' + segs[idx][1].toFixed(2) + 's)\u2026';
        // Trigger Gradio via the hidden textbox
        const el = document.getElementById('{hidden_input_id}');
        if (el) {{
          const input = el.querySelector('input, textarea');
          if (input) {{
            const setter =
              Object.getOwnPropertyDescriptor(window.HTMLInputElement.prototype, 'value').set ||
              Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype, 'value').set;
            setter.call(input, '{slot_id}|' + idx);
            input.dispatchEvent(new Event('input', {{ bubbles: true }}));
          }}
        }}
      }}

      function showPopup(idx, mouseX, mouseY) {{
        _pendingSegIdx_{slot_id} = idx;
        const segs = {segs_json};
        const popup = document.getElementById('wf_popup_{slot_id}');
        const plbl = document.getElementById('wf_popup_label_{slot_id}');
        if (plbl) plbl.textContent =
          'Seg ' + (idx+1) + ' (' + segs[idx][0].toFixed(2) + 's \u2013 ' + segs[idx][1].toFixed(2) + 's)';
        if (popup) {{
          popup.style.display = 'block';
          // Position near cursor, keep inside viewport
          const vw = window.innerWidth, vh = window.innerHeight;
          let x = mouseX + 10, y = mouseY + 10;
          popup.style.left = x + 'px';
          popup.style.top = y + 'px';
          // nudge back if off screen
          requestAnimationFrame(function() {{
            const r = popup.getBoundingClientRect();
            if (r.right > vw - 8) popup.style.left = (vw - r.width - 8) + 'px';
            if (r.bottom > vh - 8) popup.style.top = (vh - r.height - 8) + 'px';
          }});
        }}
      }}

      function hidePopup() {{
        const popup = document.getElementById('wf_popup_{slot_id}');
        if (popup) popup.style.display = 'none';
        _pendingSegIdx_{slot_id} = null;
      }}

      // Wire the Regenerate button
      document.addEventListener('DOMContentLoaded', function() {{
        const btn = document.getElementById('wf_regen_btn_{slot_id}');
        if (btn) btn.addEventListener('click', function(e) {{
          e.stopPropagation();
          if (_pendingSegIdx_{slot_id} !== null) fireRegen(_pendingSegIdx_{slot_id});
        }});
      }});
      // Also wire immediately in case DOM already loaded
      (function tryWireBtn() {{
        const btn = document.getElementById('wf_regen_btn_{slot_id}');
        if (btn) {{
          btn.onclick = function(e) {{
            e.stopPropagation();
            if (_pendingSegIdx_{slot_id} !== null) fireRegen(_pendingSegIdx_{slot_id});
          }};
        }} else {{
          setTimeout(tryWireBtn, 100);
        }}
      }})();

      // Dismiss popup on click outside
      document.addEventListener('click', function(e) {{
        const popup = document.getElementById('wf_popup_{slot_id}');
        if (popup && popup.style.display !== 'none') {{
          if (!popup.contains(e.target)) hidePopup();
        }}
      }}, true);

      function loadWS() {{
        if (!window.WaveSurfer || !window.WaveSurfer.Regions) {{
          setTimeout(loadWS, 200);
          return;
        }}
        const RegionsPlugin = window.WaveSurfer.Regions.create();
        const ws = WaveSurfer.create({{
          container: '#wf_{slot_id}',
          waveColor: '#4a9eff',
          progressColor:'#1a5fa8',
          height: 80,
          barWidth: 2,
          barGap: 1,
          barRadius: 2,
          backend: 'WebAudio',
          url: '{data_uri}',
          plugins: [RegionsPlugin],
        }});
        window["_wf_ws_{slot_id}"] = ws;
        window["wf_toggle_{slot_id}"] = function() {{ ws.playPause(); }};

        const segments = {segs_json};
        const colors = {json.dumps(colors)};

        ws.on('ready', function() {{
          segments.forEach(function(seg, idx) {{
            RegionsPlugin.addRegion({{
              id: 'seg_' + idx,
              start: seg[0],
              end: seg[1],
              color: colors[idx % colors.length],
              drag: false,
              resize: false,
              content: 'Seg ' + (idx + 1),
            }});
          }});
        }});

        RegionsPlugin.on('region-clicked', function(region, e) {{
          e.stopPropagation();
          const idx = parseInt(region.id.replace('seg_', ''));
          showPopup(idx, e.clientX, e.clientY);
        }});

        ws.on('play', function() {{
          const b = document.getElementById('wf_playbtn_{slot_id}');
          if (b) b.textContent = '\u23f8 Pause';
        }});
        ws.on('pause', function() {{
          const b = document.getElementById('wf_playbtn_{slot_id}');
          if (b) b.textContent = '\u25b6 Play';
        }});
        ws.on('finish', function() {{
          const b = document.getElementById('wf_playbtn_{slot_id}');
          if (b) b.textContent = '\u25b6 Play';
        }});
      }}

      if (!document.getElementById('wavesurfer_script')) {{
        const s = document.createElement('script');
        s.id = 'wavesurfer_script';
        s.src = '{_WAVESURFER_CDN}';
        s.onload = function() {{
          const r = document.createElement('script');
          r.id = 'wavesurfer_regions_script';
          r.src = '{_REGIONS_CDN}';
          r.onload = loadWS;
          document.head.appendChild(r);
        }};
        document.head.appendChild(s);
      }} else {{
        loadWS();
      }}
    }})();
    </script>
    """
1306
+
1307
+
1308
def _make_output_slots(tab_prefix: str) -> tuple:
    """Create MAX_SLOTS output groups for one tab (only slot 0 visible).

    Per slot: a video player, a waveform HTML panel, a hidden textbox the
    front-end JS writes "<slot_id>|<seg_idx>" into to request a segment
    regeneration, and a gr.State holding the slot's segment metadata.

    Returns:
        (grps, vids, waveforms, regen_triggers, seg_states) — five parallel
        lists of Gradio components, one entry per slot.
    """
    grps = []
    vids = []
    waveforms = []
    regen_triggers = []
    seg_states = []
    placeholder = "<p style='color:#888;font-size:12px'>Generate audio to see waveform.</p>"
    for slot_idx in range(MAX_SLOTS):
        with gr.Group(visible=(slot_idx == 0)) as group:
            slot_id = f"{tab_prefix}_{slot_idx}"
            vids.append(gr.Video(label=f"Generation {slot_idx + 1} — Video"))
            waveforms.append(gr.HTML(
                value=placeholder,
                label=f"Generation {slot_idx + 1} — Waveform",
            ))
            # Hidden textbox: the in-page JS sets its value to fire a
            # per-segment regeneration event on the Python side.
            regen_triggers.append(gr.Textbox(
                value="",
                visible=False,
                elem_id=f"regen_trigger_{slot_id}",
                label=f"regen_trigger_{slot_id}",
            ))
            seg_states.append(gr.State(value=None))
        grps.append(group)
    return grps, vids, waveforms, regen_triggers, seg_states
1333
 
1334
 
1335
def _unpack_outputs(flat: list, n: int, tab_prefix: str) -> list:
    """Convert a flat _pad_outputs list into Gradio update lists.

    `flat` holds MAX_SLOTS * 3 entries laid out as
    [vid0, aud0, meta0, vid1, aud1, meta1, ...].

    Returns the concatenated updates for: slot-group visibility, videos,
    waveform HTML panels, and per-slot segment states (in that order —
    callers pass matching output lists in the same order).
    """
    n = int(n)
    empty_wave = "<p style='color:#888;font-size:12px'>Generate audio to see waveform.</p>"
    grp_updates = [gr.update(visible=(slot < n)) for slot in range(MAX_SLOTS)]
    vid_updates = []
    wave_updates = []
    state_updates = []
    for slot in range(MAX_SLOTS):
        vid_path, aud_path, meta = flat[slot * 3: slot * 3 + 3]
        vid_updates.append(gr.update(value=vid_path))
        if aud_path and meta:
            sid = f"{tab_prefix}_{slot}"
            html = _build_waveform_html(
                aud_path, meta["segments"], sid, f"regen_trigger_{sid}"
            )
            wave_updates.append(gr.update(value=html))
            state_updates.append(meta)
        else:
            wave_updates.append(gr.update(value=empty_wave))
            state_updates.append(None)
    return grp_updates + vid_updates + wave_updates + state_updates
1363
 
1364
 
1365
  def _on_video_upload_taro(video_file, num_steps, crossfade_s):
 
1412
  taro_btn = gr.Button("Generate", variant="primary")
1413
 
1414
  with gr.Column():
1415
+ (taro_slot_grps, taro_slot_vids,
1416
+ taro_slot_waves, taro_slot_rtrigs,
1417
+ taro_slot_states) = _make_output_slots("taro")
1418
 
1419
  for trigger in [taro_video, taro_steps, taro_cf_dur]:
1420
  trigger.change(
 
1429
  )
1430
 
1431
    def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
        """Run TARO generation and adapt the flat output for the UI slots.

        numpy wavs inside each slot's meta are converted to plain lists so
        the meta survives a round-trip through gr.State / json.dumps.
        """
        flat = generate_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n)
        # Serialise wavs in meta to JSON-safe lists
        for i in range(MAX_SLOTS):
            meta = flat[i * 3 + 2]
            if meta is not None:
                meta["wavs"] = [w.tolist() for w in meta["wavs"]]
            flat[i * 3 + 2] = meta
        return _unpack_outputs(flat, n, "taro")

    # Output order must match what _unpack_outputs returns:
    # groups, then videos, then waveforms, then segment states.
    taro_btn.click(
        fn=_run_taro,
        inputs=[taro_video, taro_seed, taro_cfg, taro_steps, taro_mode,
                taro_cf_dur, taro_cf_db, taro_samples],
        outputs=taro_slot_grps + taro_slot_vids + taro_slot_waves + taro_slot_states,
    )

    # Per-slot regen trigger wiring for TARO
    for _i, _rtrig in enumerate(taro_slot_rtrigs):
        _slot_id = f"taro_{_i}"
        # Factory binds _sid per iteration, avoiding the late-binding
        # closure bug.  NOTE(review): _si is never used inside _do.
        def _make_taro_regen(_si, _sid):
            def _do(trigger_val, video, seed, cfg, steps, mode, cf_dur, cf_db, state):
                """Regenerate one audio segment when JS fires the trigger."""
                # Ignore spurious triggers: empty value, missing state, or a
                # value addressed to a different slot.
                if not trigger_val or not state:
                    return gr.update(), gr.update(), state, gr.update()
                parts = trigger_val.split("|")
                if len(parts) != 2 or parts[0] != _sid:
                    return gr.update(), gr.update(), state, gr.update()
                seg_idx = int(parts[1])
                meta_json = json.dumps(state)
                # NOTE(review): the returned audio path `aud` is unused.
                vid, aud, new_meta_json, html = regen_taro_segment(
                    video, seg_idx, meta_json,
                    seed, cfg, steps, mode, cf_dur, cf_db, _sid,
                )
                new_meta = json.loads(new_meta_json)
                # Clear the trigger textbox so the same value can fire again.
                return gr.update(value=vid), gr.update(value=html), new_meta, gr.update(value="")
            return _do
        _rtrig.change(
            fn=_make_taro_regen(_i, _slot_id),
            inputs=[_rtrig, taro_video, taro_seed, taro_cfg, taro_steps,
                    taro_mode, taro_cf_dur, taro_cf_db, taro_slot_states[_i]],
            outputs=[taro_slot_vids[_i], taro_slot_waves[_i],
                     taro_slot_states[_i], _rtrig],
        )
1474
+
1475
  # ---------------------------------------------------------- #
1476
  # Tab 2 — MMAudio #
1477
  # ---------------------------------------------------------- #
 
1490
  mma_btn = gr.Button("Generate", variant="primary")
1491
 
1492
  with gr.Column():
1493
+ (mma_slot_grps, mma_slot_vids,
1494
+ mma_slot_waves, mma_slot_rtrigs,
1495
+ mma_slot_states) = _make_output_slots("mma")
1496
 
1497
  mma_samples.change(
1498
  fn=_update_slot_visibility,
 
1501
  )
1502
 
1503
    def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
        """Run MMAudio generation and adapt the flat output for the UI slots."""
        flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n)
        # Convert numpy wavs to plain lists so meta is JSON/gr.State safe.
        for i in range(MAX_SLOTS):
            meta = flat[i * 3 + 2]
            if meta is not None:
                meta["wavs"] = [w.tolist() for w in meta["wavs"]]
            flat[i * 3 + 2] = meta
        return _unpack_outputs(flat, n, "mma")

    # Output order must match what _unpack_outputs returns.
    mma_btn.click(
        fn=_run_mmaudio,
        inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
                mma_cfg, mma_steps, mma_cf_dur, mma_cf_db, mma_samples],
        outputs=mma_slot_grps + mma_slot_vids + mma_slot_waves + mma_slot_states,
    )

    # Per-slot regen trigger wiring for MMAudio.
    for _i, _rtrig in enumerate(mma_slot_rtrigs):
        _slot_id = f"mma_{_i}"
        # Factory binds _sid per iteration (late-binding-closure guard).
        # NOTE(review): _si is never used inside _do.
        def _make_mma_regen(_si, _sid):
            def _do(trigger_val, video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, state):
                """Regenerate one audio segment when JS fires the trigger."""
                if not trigger_val or not state:
                    return gr.update(), gr.update(), state, gr.update()
                parts = trigger_val.split("|")
                if len(parts) != 2 or parts[0] != _sid:
                    return gr.update(), gr.update(), state, gr.update()
                seg_idx = int(parts[1])
                meta_json = json.dumps(state)
                # NOTE(review): the returned audio path `aud` is unused.
                vid, aud, new_meta_json, html = regen_mmaudio_segment(
                    video, seg_idx, meta_json,
                    prompt, neg, seed, cfg, steps, cf_dur, cf_db, _sid,
                )
                new_meta = json.loads(new_meta_json)
                # Clear the trigger textbox so the same value can fire again.
                return gr.update(value=vid), gr.update(value=html), new_meta, gr.update(value="")
            return _do
        _rtrig.change(
            fn=_make_mma_regen(_i, _slot_id),
            inputs=[_rtrig, mma_video, mma_prompt, mma_neg, mma_seed,
                    mma_cfg, mma_steps, mma_cf_dur, mma_cf_db, mma_slot_states[_i]],
            outputs=[mma_slot_vids[_i], mma_slot_waves[_i],
                     mma_slot_states[_i], _rtrig],
        )
1544
+
1545
  # ---------------------------------------------------------- #
1546
  # Tab 3 — HunyuanVideoFoley #
1547
  # ---------------------------------------------------------- #
 
1561
  hf_btn = gr.Button("Generate", variant="primary")
1562
 
1563
  with gr.Column():
1564
+ (hf_slot_grps, hf_slot_vids,
1565
+ hf_slot_waves, hf_slot_rtrigs,
1566
+ hf_slot_states) = _make_output_slots("hf")
1567
 
1568
  hf_samples.change(
1569
  fn=_update_slot_visibility,
 
1572
  )
1573
 
1574
    def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
        """Run HunyuanVideoFoley generation and adapt the flat output for the UI slots."""
        flat = generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n)
        # Convert numpy wavs to plain lists so meta is JSON/gr.State safe.
        for i in range(MAX_SLOTS):
            meta = flat[i * 3 + 2]
            if meta is not None:
                meta["wavs"] = [w.tolist() for w in meta["wavs"]]
            flat[i * 3 + 2] = meta
        return _unpack_outputs(flat, n, "hf")

    # Output order must match what _unpack_outputs returns.
    hf_btn.click(
        fn=_run_hunyuan,
        inputs=[hf_video, hf_prompt, hf_neg, hf_seed,
                hf_guidance, hf_steps, hf_size, hf_cf_dur, hf_cf_db, hf_samples],
        outputs=hf_slot_grps + hf_slot_vids + hf_slot_waves + hf_slot_states,
    )

    # Per-slot regen trigger wiring for HunyuanVideoFoley.
    for _i, _rtrig in enumerate(hf_slot_rtrigs):
        _slot_id = f"hf_{_i}"
        # Factory binds _sid per iteration (late-binding-closure guard).
        # NOTE(review): _si is never used inside _do.
        def _make_hf_regen(_si, _sid):
            def _do(trigger_val, video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, state):
                """Regenerate one audio segment when JS fires the trigger."""
                if not trigger_val or not state:
                    return gr.update(), gr.update(), state, gr.update()
                parts = trigger_val.split("|")
                if len(parts) != 2 or parts[0] != _sid:
                    return gr.update(), gr.update(), state, gr.update()
                seg_idx = int(parts[1])
                meta_json = json.dumps(state)
                # NOTE(review): the returned audio path `aud` is unused.
                vid, aud, new_meta_json, html = regen_hunyuan_segment(
                    video, seg_idx, meta_json,
                    prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, _sid,
                )
                new_meta = json.loads(new_meta_json)
                # Clear the trigger textbox so the same value can fire again.
                return gr.update(value=vid), gr.update(value=html), new_meta, gr.update(value="")
            return _do
        _rtrig.change(
            fn=_make_hf_regen(_i, _slot_id),
            inputs=[_rtrig, hf_video, hf_prompt, hf_neg, hf_seed,
                    hf_guidance, hf_steps, hf_size, hf_cf_dur, hf_cf_db, hf_slot_states[_i]],
            outputs=[hf_slot_vids[_i], hf_slot_waves[_i],
                     hf_slot_states[_i], _rtrig],
        )
1615
+
1616
  # ---- Cross-tab video sync ----
 
 
1617
  _sync = lambda v: (gr.update(value=v), gr.update(value=v))
1618
  taro_video.change(fn=_sync, inputs=[taro_video], outputs=[mma_video, hf_video])
1619
  mma_video.change(fn=_sync, inputs=[mma_video], outputs=[taro_video, hf_video])