BoxOfColors committed on
Commit
51979c2
·
1 Parent(s): d141e30

refactor: extract shared helpers to reduce technical debt

Browse files

- Replace _splice_and_save inline stitch with _stitch_wavs (fixes latent stereo bug)
- Extract _build_seg_meta helper (deduplicates 3 identical dict constructions)
- Extract _cpu_preprocess helper (deduplicates 3 identical pre-processing blocks)
- Extract _save_wav helper (deduplicates mono/stereo torchaudio.save logic)
- Extract _log_inference_timing helper (deduplicates 3 identical timing blocks)
- Remove redundant 'from pathlib import Path as _Path' in _load_mmaudio_models
- Remove unnecessary 'global _TARO_INFERENCE_CACHE' statement

Files changed (1) hide show
  1. app.py +228 -246
app.py CHANGED
@@ -8,6 +8,7 @@ Supported models
8
  HunyuanFoley – text-guided foley via SigLIP2 + Synchformer + CLAP (48 kHz, up to 15 s)
9
  """
10
 
 
11
  import os
12
  import sys
13
  import json
@@ -78,6 +79,41 @@ print("CLAP model pre-downloaded.")
78
  MAX_SLOTS = 8 # max parallel generation slots shown in UI
79
  MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video ≤ ~64 s at 8 s/seg)
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  # Per-slot reentrant locks — prevent concurrent regens on the same slot from
82
  # producing a race condition where the second regen reads stale state
83
  # (the shared seg_state textbox hasn't been updated yet by the first regen).
@@ -91,11 +127,12 @@ def _get_slot_lock(slot_id: str) -> threading.Lock:
91
  _SLOT_LOCKS[slot_id] = threading.Lock()
92
  return _SLOT_LOCKS[slot_id]
93
 
94
- def set_global_seed(seed: int):
95
  np.random.seed(seed % (2**32))
96
  random.seed(seed)
97
  torch.manual_seed(seed)
98
- torch.cuda.manual_seed(seed)
 
99
 
100
  def get_random_seed() -> int:
101
  return random.randint(0, 2**32 - 1)
@@ -105,7 +142,7 @@ def get_video_duration(video_path: str) -> float:
105
  probe = ffmpeg.probe(video_path)
106
  return float(probe["format"]["duration"])
107
 
108
- def strip_audio_from_video(video_path: str, output_path: str):
109
  """Write a silent copy of *video_path* to *output_path* (stream-copy, no re-encode)."""
110
  ffmpeg.input(video_path).output(output_path, vcodec="copy", an=None).run(
111
  overwrite_output=True, quiet=True
@@ -130,7 +167,7 @@ def _register_tmp_dir(tmp_dir: str) -> str:
130
  return tmp_dir
131
 
132
 
133
- def _save_seg_wavs(wavs: list, tmp_dir: str, prefix: str) -> list:
134
  """Save a list of numpy wav arrays to .npy files, return list of paths.
135
  This avoids serialising large float arrays into JSON/HTML data-state."""
136
  paths = []
@@ -141,7 +178,7 @@ def _save_seg_wavs(wavs: list, tmp_dir: str, prefix: str) -> list:
141
  return paths
142
 
143
 
144
- def _load_seg_wavs(paths: list) -> list:
145
  """Load segment wav arrays from .npy file paths."""
146
  return [np.load(p) for p in paths]
147
 
@@ -190,12 +227,11 @@ def _load_mmaudio_models(device, dtype):
190
  from mmaudio.eval_utils import all_model_cfg
191
  from mmaudio.model.networks import get_my_mmaudio
192
  from mmaudio.model.utils.features_utils import FeaturesUtils
193
- from pathlib import Path as _Path
194
 
195
  model_cfg = all_model_cfg["large_44k_v2"]
196
- model_cfg.model_path = _Path(mmaudio_model_path)
197
- model_cfg.vae_path = _Path(mmaudio_vae_path)
198
- model_cfg.synchformer_ckpt = _Path(mmaudio_synchformer_path)
199
  model_cfg.bigvgan_16k_path = None
200
  seq_cfg = model_cfg.seq_cfg
201
 
@@ -225,7 +261,7 @@ def _load_hunyuan_model(device, model_size):
225
  enable_offload=False, model_size=model_size)
226
 
227
 
228
- def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
229
  """Mux a silent video with an audio file into *output_path* (stream-copy video, encode audio)."""
230
  ffmpeg.output(
231
  ffmpeg.input(silent_video),
@@ -240,7 +276,7 @@ def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
240
  # Used by all three models (TARO, MMAudio, HunyuanFoley). #
241
  # ------------------------------------------------------------------ #
242
 
243
- def _build_segments(total_dur_s: float, window_s: float, crossfade_s: float) -> list:
244
  """Return list of (start, end) pairs covering *total_dur_s* with a sliding
245
  window of *window_s* and *crossfade_s* overlap between consecutive segments."""
246
  if total_dur_s <= window_s:
@@ -460,12 +496,66 @@ def _taro_infer_segment(
460
  return wav[:seg_samples]
461
 
462
 
463
- def _stitch_wavs(wavs: list, crossfade_s: float, db_boost: float,
464
  total_dur_s: float, sr: int) -> np.ndarray:
 
 
465
  out = wavs[0]
466
  for nw in wavs[1:]:
467
  out = _cf_join(out, nw, crossfade_s, db_boost, sr)
468
- return out[:int(round(total_dur_s * sr))]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
 
470
 
471
  @spaces.GPU(duration=_taro_duration)
@@ -473,8 +563,6 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
473
  crossfade_s, crossfade_db, num_samples):
474
  """GPU-only TARO inference — model loading + feature extraction + diffusion.
475
  Returns list of (wavs_list, onset_feats) per sample."""
476
- global _TARO_INFERENCE_CACHE
477
-
478
  seed_val = int(seed_val)
479
  crossfade_s = float(crossfade_s)
480
  num_samples = int(num_samples)
@@ -482,13 +570,9 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
482
  seed_val = random.randint(0, 2**32 - 1)
483
 
484
  torch.set_grad_enabled(False)
485
- device = "cuda" if torch.cuda.is_available() else "cpu"
486
- weight_dtype = torch.bfloat16
487
-
488
- _taro_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "TARO")
489
- if _taro_dir not in sys.path:
490
- sys.path.insert(0, _taro_dir)
491
 
 
492
  from TARO.onset_util import extract_onset
493
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
494
 
@@ -500,9 +584,16 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
500
  total_dur_s = ctx["total_dur_s"]
501
 
502
  extract_cavp, onset_model = _load_taro_feature_extractors(device)
503
- model, vae, vocoder, latents_scale = _load_taro_models(device, weight_dtype)
504
-
505
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
 
 
 
 
 
 
 
 
 
506
 
507
  results = [] # list of (wavs, onset_feats) per sample
508
  for sample_idx in range(num_samples):
@@ -516,7 +607,6 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
516
  results.append((cached["wavs"], cavp_feats, None))
517
  else:
518
  set_global_seed(sample_seed)
519
- onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
520
  wavs = []
521
  _t_infer_start = time.perf_counter()
522
  for seg_start_s, seg_end_s in segments:
@@ -531,12 +621,8 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
531
  euler_sampler, euler_maruyama_sampler,
532
  )
533
  wavs.append(wav)
534
- _t_infer_elapsed = time.perf_counter() - _t_infer_start
535
- _n_segs = len(segments)
536
- _secs_per_step = _t_infer_elapsed / (_n_segs * int(num_steps)) if _n_segs * int(num_steps) > 0 else 0
537
- print(f"[TARO] Inference done: {_n_segs} seg(s) × {int(num_steps)} steps in "
538
- f"{_t_infer_elapsed:.1f}s wall → {_secs_per_step:.3f}s/step "
539
- f"(current constant={TARO_SECS_PER_STEP})")
540
  with _TARO_CACHE_LOCK:
541
  _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
542
  while len(_TARO_INFERENCE_CACHE) > _TARO_CACHE_MAXLEN:
@@ -563,11 +649,8 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
563
  num_samples = int(num_samples)
564
 
565
  # ── CPU pre-processing (no GPU needed) ──
566
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
567
- silent_video = os.path.join(tmp_dir, "silent_input.mp4")
568
- strip_audio_from_video(video_file, silent_video)
569
- total_dur_s = get_video_duration(video_file)
570
- segments = _build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s)
571
 
572
  # Pass pre-computed CPU results to the GPU function via context
573
  _taro_gpu_infer._cpu_ctx = {
@@ -588,7 +671,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
588
  for sample_idx, (wavs, cavp_feats, onset_feats) in enumerate(results):
589
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
590
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
591
- torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(final_wav)).unsqueeze(0), TARO_SR)
592
  video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
593
  mux_video_audio(silent_video, audio_path, video_path)
594
  wav_paths = _save_seg_wavs(wavs, tmp_dir, f"taro_{sample_idx}")
@@ -598,20 +681,12 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
598
  if onset_feats is not None:
599
  np.save(onset_path, onset_feats)
600
  first_cavp_saved = True
601
- seg_meta = {
602
- "segments": segments,
603
- "wav_paths": wav_paths,
604
- "audio_path": audio_path,
605
- "video_path": video_path,
606
- "silent_video": silent_video,
607
- "sr": TARO_SR,
608
- "model": "taro",
609
- "crossfade_s": crossfade_s,
610
- "crossfade_db": crossfade_db,
611
- "total_dur_s": total_dur_s,
612
- "cavp_path": cavp_path,
613
- "onset_path": onset_path,
614
- }
615
  outputs.append((video_path, audio_path, seg_meta))
616
 
617
  return _pad_outputs(outputs)
@@ -643,10 +718,7 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
643
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
644
  """GPU-only MMAudio inference — model loading + flow-matching generation.
645
  Returns list of (seg_audios, sr) per sample."""
646
- _mmaudio_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "MMAudio")
647
- if _mmaudio_dir not in sys.path:
648
- sys.path.insert(0, _mmaudio_dir)
649
-
650
  from mmaudio.eval_utils import generate, load_video
651
  from mmaudio.model.flow_matching import FlowMatching
652
 
@@ -654,8 +726,7 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
654
  num_samples = int(num_samples)
655
  crossfade_s = float(crossfade_s)
656
 
657
- device = "cuda" if torch.cuda.is_available() else "cpu"
658
- dtype = torch.bfloat16
659
 
660
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
661
 
@@ -709,12 +780,8 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
709
  wav = wav[:, :seg_samples]
710
  seg_audios.append(wav)
711
 
712
- _t_mma_elapsed = time.perf_counter() - _t_mma_start
713
- _n_segs_mma = len(segments)
714
- _secs_per_step_mma = _t_mma_elapsed / (_n_segs_mma * int(num_steps)) if _n_segs_mma * int(num_steps) > 0 else 0
715
- print(f"[MMAudio] Inference done: {_n_segs_mma} seg(s) × {int(num_steps)} steps in "
716
- f"{_t_mma_elapsed:.1f}s wall → {_secs_per_step_mma:.3f}s/step "
717
- f"(current constant={MMAUDIO_SECS_PER_STEP})")
718
  results.append((seg_audios, sr))
719
 
720
  # Free GPU memory between samples to prevent VRAM fragmentation
@@ -735,21 +802,14 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
735
  crossfade_db = float(crossfade_db)
736
 
737
  # ── CPU pre-processing ──
738
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
739
- silent_video = os.path.join(tmp_dir, "silent_input.mp4")
740
- strip_audio_from_video(video_file, silent_video)
741
- total_dur_s = get_video_duration(video_file)
742
- segments = _build_segments(total_dur_s, MMAUDIO_WINDOW, crossfade_s)
743
  print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
744
 
745
- seg_clip_paths = []
746
- for seg_i, (seg_start, seg_end) in enumerate(segments):
747
- seg_dur = seg_end - seg_start
748
- seg_path = os.path.join(tmp_dir, f"mma_seg_{seg_i}.mp4")
749
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
750
- seg_path, vcodec="copy", an=None
751
- ).run(overwrite_output=True, quiet=True)
752
- seg_clip_paths.append(seg_path)
753
 
754
  _mmaudio_gpu_infer._cpu_ctx = {
755
  "segments": segments, "seg_clip_paths": seg_clip_paths,
@@ -762,28 +822,19 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
762
  # ── CPU post-processing ──
763
  outputs = []
764
  for sample_idx, (seg_audios, sr) in enumerate(results):
765
- full_wav = seg_audios[0]
766
- for nw in seg_audios[1:]:
767
- full_wav = _cf_join(full_wav, nw, crossfade_s, crossfade_db, sr)
768
- full_wav = full_wav[:, : int(round(total_dur_s * sr))]
769
 
770
  audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.wav")
771
- torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
772
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
773
  mux_video_audio(silent_video, audio_path, video_path)
774
  wav_paths = _save_seg_wavs(seg_audios, tmp_dir, f"mmaudio_{sample_idx}")
775
- seg_meta = {
776
- "segments": segments,
777
- "wav_paths": wav_paths,
778
- "audio_path": audio_path,
779
- "video_path": video_path,
780
- "silent_video": silent_video,
781
- "sr": sr,
782
- "model": "mmaudio",
783
- "crossfade_s": crossfade_s,
784
- "crossfade_db": crossfade_db,
785
- "total_dur_s": total_dur_s,
786
- }
787
  outputs.append((video_path, audio_path, seg_meta))
788
 
789
  return _pad_outputs(outputs)
@@ -816,10 +867,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
816
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
817
  """GPU-only HunyuanFoley inference — model loading + feature extraction + denoising.
818
  Returns list of (seg_wavs, sr, text_feats) per sample."""
819
- _hf_path = str(Path("HunyuanVideo-Foley").resolve())
820
- if _hf_path not in sys.path:
821
- sys.path.insert(0, _hf_path)
822
-
823
  from hunyuanvideo_foley.utils.model_utils import denoise_process
824
  from hunyuanvideo_foley.utils.feature_utils import feature_process
825
 
@@ -829,8 +877,9 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
829
  if seed_val >= 0:
830
  set_global_seed(seed_val)
831
 
832
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
833
- model_size = model_size.lower()
 
834
 
835
  model_dict, cfg = _load_hunyuan_model(device, model_size)
836
 
@@ -882,12 +931,8 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
882
  wav = wav[:, :seg_samples]
883
  seg_wavs.append(wav)
884
 
885
- _t_hny_elapsed = time.perf_counter() - _t_hny_start
886
- _n_segs_hny = len(segments)
887
- _secs_per_step_hny = _t_hny_elapsed / (_n_segs_hny * int(num_steps)) if _n_segs_hny * int(num_steps) > 0 else 0
888
- print(f"[HunyuanFoley] Inference done: {_n_segs_hny} seg(s) × {int(num_steps)} steps in "
889
- f"{_t_hny_elapsed:.1f}s wall → {_secs_per_step_hny:.3f}s/step "
890
- f"(current constant={HUNYUAN_SECS_PER_STEP})")
891
  results.append((seg_wavs, sr, text_feats))
892
 
893
  # Free GPU memory between samples to prevent VRAM fragmentation
@@ -908,28 +953,21 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
908
  crossfade_db = float(crossfade_db)
909
 
910
  # ── CPU pre-processing (no GPU needed) ──
911
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
912
- silent_video = os.path.join(tmp_dir, "silent_input.mp4")
913
- strip_audio_from_video(video_file, silent_video)
914
- total_dur_s = get_video_duration(silent_video)
915
- segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
916
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
917
 
918
  # Pre-extract dummy segment for text feature extraction (ffmpeg, CPU)
919
- dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
920
- ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
921
- dummy_seg_path, vcodec="copy", an=None
922
- ).run(overwrite_output=True, quiet=True)
923
 
924
  # Pre-extract all segment clips (ffmpeg, CPU)
925
- seg_clip_paths = []
926
- for seg_i, (seg_start, seg_end) in enumerate(segments):
927
- seg_dur = seg_end - seg_start
928
- seg_path = os.path.join(tmp_dir, f"hny_seg_{seg_i}.mp4")
929
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
930
- seg_path, vcodec="copy", an=None
931
- ).run(overwrite_output=True, quiet=True)
932
- seg_clip_paths.append(seg_path)
933
 
934
  _hunyuan_gpu_infer._cpu_ctx = {
935
  "segments": segments, "total_dur_s": total_dur_s,
@@ -942,38 +980,26 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
942
  crossfade_s, crossfade_db, num_samples)
943
 
944
  # ── CPU post-processing (no GPU needed) ──
945
- _hf_path = str(Path("HunyuanVideo-Foley").resolve())
946
- if _hf_path not in sys.path:
947
- sys.path.insert(0, _hf_path)
948
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video
949
 
950
  outputs = []
951
  for sample_idx, (seg_wavs, sr, text_feats) in enumerate(results):
952
- full_wav = seg_wavs[0]
953
- for nw in seg_wavs[1:]:
954
- full_wav = _cf_join(full_wav, nw, crossfade_s, crossfade_db, sr)
955
- full_wav = full_wav[:, : int(round(total_dur_s * sr))]
956
 
957
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
958
- torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
959
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
960
  merge_audio_video(audio_path, silent_video, video_path)
961
  wav_paths = _save_seg_wavs(seg_wavs, tmp_dir, f"hunyuan_{sample_idx}")
962
  text_feats_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}_text_feats.pt")
963
  torch.save(text_feats, text_feats_path)
964
- seg_meta = {
965
- "segments": segments,
966
- "wav_paths": wav_paths,
967
- "audio_path": audio_path,
968
- "video_path": video_path,
969
- "silent_video": silent_video,
970
- "sr": sr,
971
- "model": "hunyuan",
972
- "crossfade_s": crossfade_s,
973
- "crossfade_db": crossfade_db,
974
- "total_dur_s": total_dur_s,
975
- "text_feats_path": text_feats_path,
976
- }
977
  outputs.append((video_path, audio_path, seg_meta))
978
 
979
  return _pad_outputs(outputs)
@@ -1003,16 +1029,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
1003
  segments = meta["segments"]
1004
  model = meta["model"]
1005
 
1006
- # Stitch (works for both mono and stereo)
1007
- stereo = wavs[0].ndim == 2
1008
- full_wav = wavs[0]
1009
- for nw in wavs[1:]:
1010
- full_wav = _cf_join(full_wav, nw, crossfade_s, crossfade_db, sr)
1011
- n_total = int(round(total_dur_s * sr))
1012
- if stereo:
1013
- full_wav = full_wav[:, :n_total]
1014
- else:
1015
- full_wav = full_wav[:n_total]
1016
 
1017
  # Save new audio — use a new timestamped filename so Gradio / the browser
1018
  # treats it as a genuinely different file and reloads the video player.
@@ -1022,10 +1039,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
1022
  # Strip any previous timestamp suffix before adding a new one
1023
  _base_clean = _base.rsplit("_regen_", 1)[0]
1024
  audio_path = os.path.join(tmp_dir, f"{_base_clean}_regen_{_ts}.wav")
1025
- if stereo:
1026
- torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
1027
- else:
1028
- torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)).unsqueeze(0), sr)
1029
 
1030
  # Re-mux into a new video file so the browser is forced to reload it
1031
  _vid_base = os.path.splitext(os.path.basename(meta["video_path"]))[0]
@@ -1033,9 +1047,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
1033
  video_path = os.path.join(tmp_dir, f"{_vid_base_clean}_regen_{_ts}.mp4")
1034
  if model == "hunyuan":
1035
  # HunyuanFoley uses its own merge_audio_video
1036
- _hf_path = str(Path("HunyuanVideo-Foley").resolve())
1037
- if _hf_path not in sys.path:
1038
- sys.path.insert(0, _hf_path)
1039
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video
1040
  merge_audio_video(audio_path, silent_video, video_path)
1041
  else:
@@ -1072,13 +1084,9 @@ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
1072
  seg_start_s, seg_end_s = meta["segments"][seg_idx]
1073
 
1074
  torch.set_grad_enabled(False)
1075
- device = "cuda" if torch.cuda.is_available() else "cpu"
1076
- weight_dtype = torch.bfloat16
1077
-
1078
- _taro_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "TARO")
1079
- if _taro_dir not in sys.path:
1080
- sys.path.insert(0, _taro_dir)
1081
 
 
1082
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
1083
 
1084
  cavp_path = meta.get("cavp_path")
@@ -1095,6 +1103,10 @@ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
1095
  tmp_dir = tempfile.mkdtemp()
1096
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
1097
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
 
 
 
 
1098
 
1099
  model_net, vae, vocoder, latents_scale = _load_taro_models(device, weight_dtype)
1100
 
@@ -1143,15 +1155,11 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1143
  seg_start, seg_end = meta["segments"][seg_idx]
1144
  seg_dur = seg_end - seg_start
1145
 
1146
- _mmaudio_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "MMAudio")
1147
- if _mmaudio_dir not in sys.path:
1148
- sys.path.insert(0, _mmaudio_dir)
1149
-
1150
  from mmaudio.eval_utils import generate, load_video
1151
  from mmaudio.model.flow_matching import FlowMatching
1152
 
1153
- device = "cuda" if torch.cuda.is_available() else "cpu"
1154
- dtype = torch.bfloat16
1155
 
1156
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1157
  sr = seq_cfg.sampling_rate
@@ -1160,12 +1168,10 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1160
  seg_path = _regen_mmaudio_gpu._cpu_ctx.get("seg_path")
1161
  if not seg_path:
1162
  # Fallback: extract inside GPU (shouldn't happen)
1163
- silent_video = meta["silent_video"]
1164
- tmp_dir = tempfile.mkdtemp()
1165
- seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1166
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1167
- seg_path, vcodec="copy", an=None
1168
- ).run(overwrite_output=True, quiet=True)
1169
 
1170
  rng = torch.Generator(device=device)
1171
  rng.manual_seed(random.randint(0, 2**32 - 1))
@@ -1203,12 +1209,11 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1203
  seg_dur = seg_end - seg_start
1204
 
1205
  # CPU: pre-extract segment clip
1206
- silent_video = meta["silent_video"]
1207
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1208
- seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1209
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1210
- seg_path, vcodec="copy", an=None
1211
- ).run(overwrite_output=True, quiet=True)
1212
  _regen_mmaudio_gpu._cpu_ctx = {"seg_path": seg_path}
1213
 
1214
  # GPU: inference only
@@ -1243,14 +1248,12 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1243
  seg_start, seg_end = meta["segments"][seg_idx]
1244
  seg_dur = seg_end - seg_start
1245
 
1246
- _hf_path = str(Path("HunyuanVideo-Foley").resolve())
1247
- if _hf_path not in sys.path:
1248
- sys.path.insert(0, _hf_path)
1249
-
1250
  from hunyuanvideo_foley.utils.model_utils import denoise_process
1251
  from hunyuanvideo_foley.utils.feature_utils import feature_process
1252
 
1253
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
1254
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1255
 
1256
  set_global_seed(random.randint(0, 2**32 - 1))
@@ -1258,12 +1261,10 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1258
  # Use pre-extracted segment clip from wrapper
1259
  seg_path = _regen_hunyuan_gpu._cpu_ctx.get("seg_path")
1260
  if not seg_path:
1261
- silent_video = meta["silent_video"]
1262
- tmp_dir = tempfile.mkdtemp()
1263
- seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1264
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1265
- seg_path, vcodec="copy", an=None
1266
- ).run(overwrite_output=True, quiet=True)
1267
 
1268
  text_feats_path = meta.get("text_feats_path")
1269
  if text_feats_path and os.path.exists(text_feats_path):
@@ -1302,12 +1303,11 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1302
  seg_dur = seg_end - seg_start
1303
 
1304
  # CPU: pre-extract segment clip
1305
- silent_video = meta["silent_video"]
1306
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1307
- seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1308
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1309
- seg_path, vcodec="copy", an=None
1310
- ).run(overwrite_output=True, quiet=True)
1311
  _regen_hunyuan_gpu._cpu_ctx = {"seg_path": seg_path}
1312
 
1313
  # GPU: inference only
@@ -1374,6 +1374,19 @@ def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
1374
  return wav
1375
 
1376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1377
  def xregen_taro(seg_idx, state_json, slot_id,
1378
  seed_val, cfg_scale, num_steps, mode,
1379
  crossfade_s, crossfade_db,
@@ -1381,7 +1394,6 @@ def xregen_taro(seg_idx, state_json, slot_id,
1381
  """Cross-model regen: run TARO inference and splice into *slot_id*."""
1382
  meta = json.loads(state_json)
1383
  seg_idx = int(seg_idx)
1384
- slot_sr = int(meta["sr"])
1385
 
1386
  # Show pending waveform immediately
1387
  pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
@@ -1390,11 +1402,7 @@ def xregen_taro(seg_idx, state_json, slot_id,
1390
  new_wav_raw = _regen_taro_gpu(None, seg_idx, state_json,
1391
  seed_val, cfg_scale, num_steps, mode,
1392
  crossfade_s, crossfade_db, slot_id)
1393
- slot_wavs = _load_seg_wavs(meta["wav_paths"])
1394
- new_wav = _resample_to_slot_sr(new_wav_raw, TARO_SR, slot_sr, slot_wavs[0])
1395
- video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1396
- new_wav, seg_idx, meta, slot_id
1397
- )
1398
  yield gr.update(value=video_path), gr.update(value=waveform_html)
1399
 
1400
 
@@ -1406,30 +1414,22 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1406
  meta = json.loads(state_json)
1407
  seg_idx = int(seg_idx)
1408
  seg_start, seg_end = meta["segments"][seg_idx]
1409
- seg_dur = seg_end - seg_start
1410
- slot_sr = int(meta["sr"])
1411
 
1412
  # Show pending waveform immediately
1413
  pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
1414
  yield gr.update(), gr.update(value=pending_html)
1415
 
1416
- silent_video = meta["silent_video"]
1417
- tmp_dir = tempfile.mkdtemp()
1418
- seg_path = os.path.join(tmp_dir, "xregen_seg.mp4")
1419
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1420
- seg_path, vcodec="copy", an=None
1421
- ).run(overwrite_output=True, quiet=True)
1422
  _regen_mmaudio_gpu._cpu_ctx = {"seg_path": seg_path}
1423
 
1424
  new_wav_raw, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
1425
  prompt, negative_prompt, seed_val,
1426
  cfg_strength, num_steps,
1427
  crossfade_s, crossfade_db, slot_id)
1428
- slot_wavs = _load_seg_wavs(meta["wav_paths"])
1429
- new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
1430
- video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1431
- new_wav, seg_idx, meta, slot_id
1432
- )
1433
  yield gr.update(value=video_path), gr.update(value=waveform_html)
1434
 
1435
 
@@ -1442,30 +1442,22 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1442
  meta = json.loads(state_json)
1443
  seg_idx = int(seg_idx)
1444
  seg_start, seg_end = meta["segments"][seg_idx]
1445
- seg_dur = seg_end - seg_start
1446
- slot_sr = int(meta["sr"])
1447
 
1448
  # Show pending waveform immediately
1449
  pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
1450
  yield gr.update(), gr.update(value=pending_html)
1451
 
1452
- silent_video = meta["silent_video"]
1453
- tmp_dir = tempfile.mkdtemp()
1454
- seg_path = os.path.join(tmp_dir, "xregen_seg.mp4")
1455
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1456
- seg_path, vcodec="copy", an=None
1457
- ).run(overwrite_output=True, quiet=True)
1458
  _regen_hunyuan_gpu._cpu_ctx = {"seg_path": seg_path}
1459
 
1460
  new_wav_raw, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
1461
  prompt, negative_prompt, seed_val,
1462
  guidance_scale, num_steps, model_size,
1463
  crossfade_s, crossfade_db, slot_id)
1464
- slot_wavs = _load_seg_wavs(meta["wav_paths"])
1465
- new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
1466
- video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1467
- new_wav, seg_idx, meta, slot_id
1468
- )
1469
  yield gr.update(value=video_path), gr.update(value=waveform_html)
1470
 
1471
 
@@ -1567,10 +1559,7 @@ def _build_regen_pending_html(segments: list, regen_seg_idx: int, slot_id: str,
1567
  Renders a dark bar with the active segment highlighted in amber + a spinner.
1568
  """
1569
  segs_json = json.dumps(segments)
1570
- seg_colors = ["rgba(100,180,255,0.25)", "rgba(255,160,100,0.25)",
1571
- "rgba(120,220,140,0.25)", "rgba(220,120,220,0.25)",
1572
- "rgba(255,220,80,0.25)", "rgba(80,220,220,0.25)",
1573
- "rgba(255,100,100,0.25)", "rgba(180,255,180,0.25)"]
1574
  active_color = "rgba(255,180,0,0.55)"
1575
  duration = segments[-1][1] if segments else 1.0
1576
 
@@ -1637,11 +1626,7 @@ def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
1637
  audio_url = f"/gradio_api/file={audio_path}"
1638
 
1639
  segs_json = json.dumps(segments)
1640
-
1641
- seg_colors = ["rgba(100,180,255,0.35)", "rgba(255,160,100,0.35)",
1642
- "rgba(120,220,140,0.35)", "rgba(220,120,220,0.35)",
1643
- "rgba(255,220,80,0.35)", "rgba(80,220,220,0.35)",
1644
- "rgba(255,100,100,0.35)", "rgba(180,255,180,0.35)"]
1645
 
1646
  # NOTE: Gradio updates gr.HTML via innerHTML which does NOT execute <script> tags.
1647
  # Solution: put the entire waveform (canvas + JS) inside an <iframe srcdoc="...">.
@@ -1845,11 +1830,8 @@ def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
1845
  </html>"""
1846
 
1847
  # Escape for HTML attribute (srcdoc uses HTML entities)
1848
- import html as _html
1849
- srcdoc = _html.escape(iframe_inner, quote=True)
1850
-
1851
- import html as _html2
1852
- state_escaped = _html2.escape(state_json or "", quote=True)
1853
 
1854
  return f"""
1855
  <div id="wf_container_{slot_id}"
 
8
  HunyuanFoley – text-guided foley via SigLIP2 + Synchformer + CLAP (48 kHz, up to 15 s)
9
  """
10
 
11
+ import html as _html
12
  import os
13
  import sys
14
  import json
 
79
  MAX_SLOTS = 8 # max parallel generation slots shown in UI
80
  MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video ≤ ~64 s at 8 s/seg)
81
 
82
+ # Segment overlay palette — shared between _build_waveform_html and _build_regen_pending_html
83
+ SEG_COLORS = [
84
+ "rgba(100,180,255,{a})", "rgba(255,160,100,{a})",
85
+ "rgba(120,220,140,{a})", "rgba(220,120,220,{a})",
86
+ "rgba(255,220,80,{a})", "rgba(80,220,220,{a})",
87
+ "rgba(255,100,100,{a})", "rgba(180,255,180,{a})",
88
+ ]
89
+
90
+ # ------------------------------------------------------------------ #
91
+ # Micro-helpers that eliminate repeated boilerplate across the file #
92
+ # ------------------------------------------------------------------ #
93
+
94
+ def _ensure_syspath(subdir: str) -> str:
95
+ """Add *subdir* (relative to app.py) to sys.path if not already present.
96
+ Returns the absolute path for convenience."""
97
+ p = os.path.join(os.path.dirname(os.path.abspath(__file__)), subdir)
98
+ if p not in sys.path:
99
+ sys.path.insert(0, p)
100
+ return p
101
+
102
+
103
+ def _get_device_and_dtype() -> tuple:
104
+ """Return (device, weight_dtype) pair used by all GPU functions."""
105
+ device = "cuda" if torch.cuda.is_available() else "cpu"
106
+ return device, torch.bfloat16
107
+
108
+
109
+ def _extract_segment_clip(silent_video: str, seg_start: float, seg_dur: float,
110
+ output_path: str) -> str:
111
+ """Stream-copy a segment from *silent_video* to *output_path*. Returns *output_path*."""
112
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
113
+ output_path, vcodec="copy", an=None
114
+ ).run(overwrite_output=True, quiet=True)
115
+ return output_path
116
+
117
  # Per-slot reentrant locks — prevent concurrent regens on the same slot from
118
  # producing a race condition where the second regen reads stale state
119
  # (the shared seg_state textbox hasn't been updated yet by the first regen).
 
127
  _SLOT_LOCKS[slot_id] = threading.Lock()
128
  return _SLOT_LOCKS[slot_id]
129
 
130
+ def set_global_seed(seed: int) -> None:
131
  np.random.seed(seed % (2**32))
132
  random.seed(seed)
133
  torch.manual_seed(seed)
134
+ if torch.cuda.is_available():
135
+ torch.cuda.manual_seed(seed)
136
 
137
  def get_random_seed() -> int:
138
  return random.randint(0, 2**32 - 1)
 
142
  probe = ffmpeg.probe(video_path)
143
  return float(probe["format"]["duration"])
144
 
145
+ def strip_audio_from_video(video_path: str, output_path: str) -> None:
146
  """Write a silent copy of *video_path* to *output_path* (stream-copy, no re-encode)."""
147
  ffmpeg.input(video_path).output(output_path, vcodec="copy", an=None).run(
148
  overwrite_output=True, quiet=True
 
167
  return tmp_dir
168
 
169
 
170
+ def _save_seg_wavs(wavs: list[np.ndarray], tmp_dir: str, prefix: str) -> list[str]:
171
  """Save a list of numpy wav arrays to .npy files, return list of paths.
172
  This avoids serialising large float arrays into JSON/HTML data-state."""
173
  paths = []
 
178
  return paths
179
 
180
 
181
+ def _load_seg_wavs(paths: list[str]) -> list[np.ndarray]:
182
  """Load segment wav arrays from .npy file paths."""
183
  return [np.load(p) for p in paths]
184
 
 
227
  from mmaudio.eval_utils import all_model_cfg
228
  from mmaudio.model.networks import get_my_mmaudio
229
  from mmaudio.model.utils.features_utils import FeaturesUtils
 
230
 
231
  model_cfg = all_model_cfg["large_44k_v2"]
232
+ model_cfg.model_path = Path(mmaudio_model_path)
233
+ model_cfg.vae_path = Path(mmaudio_vae_path)
234
+ model_cfg.synchformer_ckpt = Path(mmaudio_synchformer_path)
235
  model_cfg.bigvgan_16k_path = None
236
  seq_cfg = model_cfg.seq_cfg
237
 
 
261
  enable_offload=False, model_size=model_size)
262
 
263
 
264
+ def mux_video_audio(silent_video: str, audio_path: str, output_path: str) -> None:
265
  """Mux a silent video with an audio file into *output_path* (stream-copy video, encode audio)."""
266
  ffmpeg.output(
267
  ffmpeg.input(silent_video),
 
276
  # Used by all three models (TARO, MMAudio, HunyuanFoley). #
277
  # ------------------------------------------------------------------ #
278
 
279
+ def _build_segments(total_dur_s: float, window_s: float, crossfade_s: float) -> list[tuple[float, float]]:
280
  """Return list of (start, end) pairs covering *total_dur_s* with a sliding
281
  window of *window_s* and *crossfade_s* overlap between consecutive segments."""
282
  if total_dur_s <= window_s:
 
496
  return wav[:seg_samples]
497
 
498
 
499
+ def _stitch_wavs(wavs: list[np.ndarray], crossfade_s: float, db_boost: float,
500
  total_dur_s: float, sr: int) -> np.ndarray:
501
+ """Crossfade-join a list of wav arrays and trim to *total_dur_s*.
502
+ Works for both mono (T,) and stereo (C, T) arrays."""
503
  out = wavs[0]
504
  for nw in wavs[1:]:
505
  out = _cf_join(out, nw, crossfade_s, db_boost, sr)
506
+ n = int(round(total_dur_s * sr))
507
+ return out[:, :n] if out.ndim == 2 else out[:n]
508
+
509
+
510
+ def _save_wav(path: str, wav: np.ndarray, sr: int) -> None:
511
+ """Save a numpy wav array (mono or stereo) to *path* via torchaudio."""
512
+ t = torch.from_numpy(np.ascontiguousarray(wav))
513
+ if t.ndim == 1:
514
+ t = t.unsqueeze(0)
515
+ torchaudio.save(path, t, sr)
516
+
517
+
518
+ def _log_inference_timing(label: str, elapsed: float, n_segs: int,
519
+ num_steps: int, constant: float) -> None:
520
+ """Print a standardised inference-timing summary line."""
521
+ total_steps = n_segs * num_steps
522
+ secs_per_step = elapsed / total_steps if total_steps > 0 else 0
523
+ print(f"[{label}] Inference done: {n_segs} seg(s) × {num_steps} steps in "
524
+ f"{elapsed:.1f}s wall → {secs_per_step:.3f}s/step "
525
+ f"(current constant={constant})")
526
+
527
+
528
+ def _build_seg_meta(*, segments, wav_paths, audio_path, video_path,
529
+ silent_video, sr, model, crossfade_s, crossfade_db,
530
+ total_dur_s, **extras) -> dict:
531
+ """Build the seg_meta dict shared by all three generate_* functions.
532
+ Model-specific keys are passed via **extras."""
533
+ meta = {
534
+ "segments": segments,
535
+ "wav_paths": wav_paths,
536
+ "audio_path": audio_path,
537
+ "video_path": video_path,
538
+ "silent_video": silent_video,
539
+ "sr": sr,
540
+ "model": model,
541
+ "crossfade_s": crossfade_s,
542
+ "crossfade_db": crossfade_db,
543
+ "total_dur_s": total_dur_s,
544
+ }
545
+ meta.update(extras)
546
+ return meta
547
+
548
+
549
+ def _cpu_preprocess(video_file: str, model_dur: float,
550
+ crossfade_s: float) -> tuple:
551
+ """Shared CPU pre-processing for all generate_* wrappers.
552
+ Returns (tmp_dir, silent_video, total_dur_s, segments)."""
553
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
554
+ silent_video = os.path.join(tmp_dir, "silent_input.mp4")
555
+ strip_audio_from_video(video_file, silent_video)
556
+ total_dur_s = get_video_duration(video_file)
557
+ segments = _build_segments(total_dur_s, model_dur, crossfade_s)
558
+ return tmp_dir, silent_video, total_dur_s, segments
559
 
560
 
561
  @spaces.GPU(duration=_taro_duration)
 
563
  crossfade_s, crossfade_db, num_samples):
564
  """GPU-only TARO inference — model loading + feature extraction + diffusion.
565
  Returns list of (wavs_list, onset_feats) per sample."""
 
 
566
  seed_val = int(seed_val)
567
  crossfade_s = float(crossfade_s)
568
  num_samples = int(num_samples)
 
570
  seed_val = random.randint(0, 2**32 - 1)
571
 
572
  torch.set_grad_enabled(False)
573
+ device, weight_dtype = _get_device_and_dtype()
 
 
 
 
 
574
 
575
+ _ensure_syspath("TARO")
576
  from TARO.onset_util import extract_onset
577
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
578
 
 
584
  total_dur_s = ctx["total_dur_s"]
585
 
586
  extract_cavp, onset_model = _load_taro_feature_extractors(device)
 
 
587
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
588
+ # Onset features depend only on the video — extract once for all samples
589
+ onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
590
+
591
+ # Free feature extractors before loading the heavier inference models
592
+ del extract_cavp, onset_model
593
+ if torch.cuda.is_available():
594
+ torch.cuda.empty_cache()
595
+
596
+ model, vae, vocoder, latents_scale = _load_taro_models(device, weight_dtype)
597
 
598
  results = [] # list of (wavs, onset_feats) per sample
599
  for sample_idx in range(num_samples):
 
607
  results.append((cached["wavs"], cavp_feats, None))
608
  else:
609
  set_global_seed(sample_seed)
 
610
  wavs = []
611
  _t_infer_start = time.perf_counter()
612
  for seg_start_s, seg_end_s in segments:
 
621
  euler_sampler, euler_maruyama_sampler,
622
  )
623
  wavs.append(wav)
624
+ _log_inference_timing("TARO", time.perf_counter() - _t_infer_start,
625
+ len(segments), int(num_steps), TARO_SECS_PER_STEP)
 
 
 
 
626
  with _TARO_CACHE_LOCK:
627
  _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
628
  while len(_TARO_INFERENCE_CACHE) > _TARO_CACHE_MAXLEN:
 
649
  num_samples = int(num_samples)
650
 
651
  # ── CPU pre-processing (no GPU needed) ──
652
+ tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
653
+ video_file, TARO_MODEL_DUR, crossfade_s)
 
 
 
654
 
655
  # Pass pre-computed CPU results to the GPU function via context
656
  _taro_gpu_infer._cpu_ctx = {
 
671
  for sample_idx, (wavs, cavp_feats, onset_feats) in enumerate(results):
672
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
673
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
674
+ _save_wav(audio_path, final_wav, TARO_SR)
675
  video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
676
  mux_video_audio(silent_video, audio_path, video_path)
677
  wav_paths = _save_seg_wavs(wavs, tmp_dir, f"taro_{sample_idx}")
 
681
  if onset_feats is not None:
682
  np.save(onset_path, onset_feats)
683
  first_cavp_saved = True
684
+ seg_meta = _build_seg_meta(
685
+ segments=segments, wav_paths=wav_paths, audio_path=audio_path,
686
+ video_path=video_path, silent_video=silent_video, sr=TARO_SR,
687
+ model="taro", crossfade_s=crossfade_s, crossfade_db=crossfade_db,
688
+ total_dur_s=total_dur_s, cavp_path=cavp_path, onset_path=onset_path,
689
+ )
 
 
 
 
 
 
 
 
690
  outputs.append((video_path, audio_path, seg_meta))
691
 
692
  return _pad_outputs(outputs)
 
718
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
719
  """GPU-only MMAudio inference — model loading + flow-matching generation.
720
  Returns list of (seg_audios, sr) per sample."""
721
+ _ensure_syspath("MMAudio")
 
 
 
722
  from mmaudio.eval_utils import generate, load_video
723
  from mmaudio.model.flow_matching import FlowMatching
724
 
 
726
  num_samples = int(num_samples)
727
  crossfade_s = float(crossfade_s)
728
 
729
+ device, dtype = _get_device_and_dtype()
 
730
 
731
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
732
 
 
780
  wav = wav[:, :seg_samples]
781
  seg_audios.append(wav)
782
 
783
+ _log_inference_timing("MMAudio", time.perf_counter() - _t_mma_start,
784
+ len(segments), int(num_steps), MMAUDIO_SECS_PER_STEP)
 
 
 
 
785
  results.append((seg_audios, sr))
786
 
787
  # Free GPU memory between samples to prevent VRAM fragmentation
 
802
  crossfade_db = float(crossfade_db)
803
 
804
  # ── CPU pre-processing ──
805
+ tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
806
+ video_file, MMAUDIO_WINDOW, crossfade_s)
 
 
 
807
  print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
808
 
809
+ seg_clip_paths = [
810
+ _extract_segment_clip(silent_video, s, e - s, os.path.join(tmp_dir, f"mma_seg_{i}.mp4"))
811
+ for i, (s, e) in enumerate(segments)
812
+ ]
 
 
 
 
813
 
814
  _mmaudio_gpu_infer._cpu_ctx = {
815
  "segments": segments, "seg_clip_paths": seg_clip_paths,
 
822
  # ── CPU post-processing ──
823
  outputs = []
824
  for sample_idx, (seg_audios, sr) in enumerate(results):
825
+ full_wav = _stitch_wavs(seg_audios, crossfade_s, crossfade_db, total_dur_s, sr)
 
 
 
826
 
827
  audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.wav")
828
+ _save_wav(audio_path, full_wav, sr)
829
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
830
  mux_video_audio(silent_video, audio_path, video_path)
831
  wav_paths = _save_seg_wavs(seg_audios, tmp_dir, f"mmaudio_{sample_idx}")
832
+ seg_meta = _build_seg_meta(
833
+ segments=segments, wav_paths=wav_paths, audio_path=audio_path,
834
+ video_path=video_path, silent_video=silent_video, sr=sr,
835
+ model="mmaudio", crossfade_s=crossfade_s, crossfade_db=crossfade_db,
836
+ total_dur_s=total_dur_s,
837
+ )
 
 
 
 
 
 
838
  outputs.append((video_path, audio_path, seg_meta))
839
 
840
  return _pad_outputs(outputs)
 
867
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
868
  """GPU-only HunyuanFoley inference — model loading + feature extraction + denoising.
869
  Returns list of (seg_wavs, sr, text_feats) per sample."""
870
+ _ensure_syspath("HunyuanVideo-Foley")
 
 
 
871
  from hunyuanvideo_foley.utils.model_utils import denoise_process
872
  from hunyuanvideo_foley.utils.feature_utils import feature_process
873
 
 
877
  if seed_val >= 0:
878
  set_global_seed(seed_val)
879
 
880
+ device, _ = _get_device_and_dtype()
881
+ device = torch.device(device)
882
+ model_size = model_size.lower()
883
 
884
  model_dict, cfg = _load_hunyuan_model(device, model_size)
885
 
 
931
  wav = wav[:, :seg_samples]
932
  seg_wavs.append(wav)
933
 
934
+ _log_inference_timing("HunyuanFoley", time.perf_counter() - _t_hny_start,
935
+ len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
 
 
 
 
936
  results.append((seg_wavs, sr, text_feats))
937
 
938
  # Free GPU memory between samples to prevent VRAM fragmentation
 
953
  crossfade_db = float(crossfade_db)
954
 
955
  # ── CPU pre-processing (no GPU needed) ──
956
+ tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
957
+ video_file, HUNYUAN_MAX_DUR, crossfade_s)
 
 
 
958
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
959
 
960
  # Pre-extract dummy segment for text feature extraction (ffmpeg, CPU)
961
+ dummy_seg_path = _extract_segment_clip(
962
+ silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
963
+ os.path.join(tmp_dir, "_seg_dummy.mp4"),
964
+ )
965
 
966
  # Pre-extract all segment clips (ffmpeg, CPU)
967
+ seg_clip_paths = [
968
+ _extract_segment_clip(silent_video, s, e - s, os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
969
+ for i, (s, e) in enumerate(segments)
970
+ ]
 
 
 
 
971
 
972
  _hunyuan_gpu_infer._cpu_ctx = {
973
  "segments": segments, "total_dur_s": total_dur_s,
 
980
  crossfade_s, crossfade_db, num_samples)
981
 
982
  # ── CPU post-processing (no GPU needed) ──
983
+ _ensure_syspath("HunyuanVideo-Foley")
 
 
984
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video
985
 
986
  outputs = []
987
  for sample_idx, (seg_wavs, sr, text_feats) in enumerate(results):
988
+ full_wav = _stitch_wavs(seg_wavs, crossfade_s, crossfade_db, total_dur_s, sr)
 
 
 
989
 
990
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
991
+ _save_wav(audio_path, full_wav, sr)
992
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
993
  merge_audio_video(audio_path, silent_video, video_path)
994
  wav_paths = _save_seg_wavs(seg_wavs, tmp_dir, f"hunyuan_{sample_idx}")
995
  text_feats_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}_text_feats.pt")
996
  torch.save(text_feats, text_feats_path)
997
+ seg_meta = _build_seg_meta(
998
+ segments=segments, wav_paths=wav_paths, audio_path=audio_path,
999
+ video_path=video_path, silent_video=silent_video, sr=sr,
1000
+ model="hunyuan", crossfade_s=crossfade_s, crossfade_db=crossfade_db,
1001
+ total_dur_s=total_dur_s, text_feats_path=text_feats_path,
1002
+ )
 
 
 
 
 
 
 
1003
  outputs.append((video_path, audio_path, seg_meta))
1004
 
1005
  return _pad_outputs(outputs)
 
1029
  segments = meta["segments"]
1030
  model = meta["model"]
1031
 
1032
+ full_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, sr)
 
 
 
 
 
 
 
 
 
1033
 
1034
  # Save new audio — use a new timestamped filename so Gradio / the browser
1035
  # treats it as a genuinely different file and reloads the video player.
 
1039
  # Strip any previous timestamp suffix before adding a new one
1040
  _base_clean = _base.rsplit("_regen_", 1)[0]
1041
  audio_path = os.path.join(tmp_dir, f"{_base_clean}_regen_{_ts}.wav")
1042
+ _save_wav(audio_path, full_wav, sr)
 
 
 
1043
 
1044
  # Re-mux into a new video file so the browser is forced to reload it
1045
  _vid_base = os.path.splitext(os.path.basename(meta["video_path"]))[0]
 
1047
  video_path = os.path.join(tmp_dir, f"{_vid_base_clean}_regen_{_ts}.mp4")
1048
  if model == "hunyuan":
1049
  # HunyuanFoley uses its own merge_audio_video
1050
+ _ensure_syspath("HunyuanVideo-Foley")
 
 
1051
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video
1052
  merge_audio_video(audio_path, silent_video, video_path)
1053
  else:
 
1084
  seg_start_s, seg_end_s = meta["segments"][seg_idx]
1085
 
1086
  torch.set_grad_enabled(False)
1087
+ device, weight_dtype = _get_device_and_dtype()
 
 
 
 
 
1088
 
1089
+ _ensure_syspath("TARO")
1090
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
1091
 
1092
  cavp_path = meta.get("cavp_path")
 
1103
  tmp_dir = tempfile.mkdtemp()
1104
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
1105
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
1106
+ # Free feature extractors before loading inference models
1107
+ del extract_cavp, onset_model
1108
+ if torch.cuda.is_available():
1109
+ torch.cuda.empty_cache()
1110
 
1111
  model_net, vae, vocoder, latents_scale = _load_taro_models(device, weight_dtype)
1112
 
 
1155
  seg_start, seg_end = meta["segments"][seg_idx]
1156
  seg_dur = seg_end - seg_start
1157
 
1158
+ _ensure_syspath("MMAudio")
 
 
 
1159
  from mmaudio.eval_utils import generate, load_video
1160
  from mmaudio.model.flow_matching import FlowMatching
1161
 
1162
+ device, dtype = _get_device_and_dtype()
 
1163
 
1164
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1165
  sr = seq_cfg.sampling_rate
 
1168
  seg_path = _regen_mmaudio_gpu._cpu_ctx.get("seg_path")
1169
  if not seg_path:
1170
  # Fallback: extract inside GPU (shouldn't happen)
1171
+ seg_path = _extract_segment_clip(
1172
+ meta["silent_video"], seg_start, seg_dur,
1173
+ os.path.join(tempfile.mkdtemp(), "regen_seg.mp4"),
1174
+ )
 
 
1175
 
1176
  rng = torch.Generator(device=device)
1177
  rng.manual_seed(random.randint(0, 2**32 - 1))
 
1209
  seg_dur = seg_end - seg_start
1210
 
1211
  # CPU: pre-extract segment clip
1212
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1213
+ seg_path = _extract_segment_clip(
1214
+ meta["silent_video"], seg_start, seg_dur,
1215
+ os.path.join(tmp_dir, "regen_seg.mp4"),
1216
+ )
 
1217
  _regen_mmaudio_gpu._cpu_ctx = {"seg_path": seg_path}
1218
 
1219
  # GPU: inference only
 
1248
  seg_start, seg_end = meta["segments"][seg_idx]
1249
  seg_dur = seg_end - seg_start
1250
 
1251
+ _ensure_syspath("HunyuanVideo-Foley")
 
 
 
1252
  from hunyuanvideo_foley.utils.model_utils import denoise_process
1253
  from hunyuanvideo_foley.utils.feature_utils import feature_process
1254
 
1255
+ device, _ = _get_device_and_dtype()
1256
+ device = torch.device(device)
1257
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1258
 
1259
  set_global_seed(random.randint(0, 2**32 - 1))
 
1261
  # Use pre-extracted segment clip from wrapper
1262
  seg_path = _regen_hunyuan_gpu._cpu_ctx.get("seg_path")
1263
  if not seg_path:
1264
+ seg_path = _extract_segment_clip(
1265
+ meta["silent_video"], seg_start, seg_dur,
1266
+ os.path.join(tempfile.mkdtemp(), "regen_seg.mp4"),
1267
+ )
 
 
1268
 
1269
  text_feats_path = meta.get("text_feats_path")
1270
  if text_feats_path and os.path.exists(text_feats_path):
 
1303
  seg_dur = seg_end - seg_start
1304
 
1305
  # CPU: pre-extract segment clip
1306
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1307
+ seg_path = _extract_segment_clip(
1308
+ meta["silent_video"], seg_start, seg_dur,
1309
+ os.path.join(tmp_dir, "regen_seg.mp4"),
1310
+ )
 
1311
  _regen_hunyuan_gpu._cpu_ctx = {"seg_path": seg_path}
1312
 
1313
  # GPU: inference only
 
1374
  return wav
1375
 
1376
 
1377
+ def _xregen_splice(new_wav_raw: np.ndarray, src_sr: int,
1378
+ meta: dict, seg_idx: int, slot_id: str) -> tuple:
1379
+ """Shared epilogue for all xregen_* functions: resample → splice → save.
1380
+ Returns (video_path, waveform_html)."""
1381
+ slot_sr = int(meta["sr"])
1382
+ slot_wavs = _load_seg_wavs(meta["wav_paths"])
1383
+ new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
1384
+ video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1385
+ new_wav, seg_idx, meta, slot_id
1386
+ )
1387
+ return video_path, waveform_html
1388
+
1389
+
1390
  def xregen_taro(seg_idx, state_json, slot_id,
1391
  seed_val, cfg_scale, num_steps, mode,
1392
  crossfade_s, crossfade_db,
 
1394
  """Cross-model regen: run TARO inference and splice into *slot_id*."""
1395
  meta = json.loads(state_json)
1396
  seg_idx = int(seg_idx)
 
1397
 
1398
  # Show pending waveform immediately
1399
  pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
 
1402
  new_wav_raw = _regen_taro_gpu(None, seg_idx, state_json,
1403
  seed_val, cfg_scale, num_steps, mode,
1404
  crossfade_s, crossfade_db, slot_id)
1405
+ video_path, waveform_html = _xregen_splice(new_wav_raw, TARO_SR, meta, seg_idx, slot_id)
 
 
 
 
1406
  yield gr.update(value=video_path), gr.update(value=waveform_html)
1407
 
1408
 
 
1414
  meta = json.loads(state_json)
1415
  seg_idx = int(seg_idx)
1416
  seg_start, seg_end = meta["segments"][seg_idx]
 
 
1417
 
1418
  # Show pending waveform immediately
1419
  pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
1420
  yield gr.update(), gr.update(value=pending_html)
1421
 
1422
+ seg_path = _extract_segment_clip(
1423
+ meta["silent_video"], seg_start, seg_end - seg_start,
1424
+ os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1425
+ )
 
 
1426
  _regen_mmaudio_gpu._cpu_ctx = {"seg_path": seg_path}
1427
 
1428
  new_wav_raw, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
1429
  prompt, negative_prompt, seed_val,
1430
  cfg_strength, num_steps,
1431
  crossfade_s, crossfade_db, slot_id)
1432
+ video_path, waveform_html = _xregen_splice(new_wav_raw, src_sr, meta, seg_idx, slot_id)
 
 
 
 
1433
  yield gr.update(value=video_path), gr.update(value=waveform_html)
1434
 
1435
 
 
1442
  meta = json.loads(state_json)
1443
  seg_idx = int(seg_idx)
1444
  seg_start, seg_end = meta["segments"][seg_idx]
 
 
1445
 
1446
  # Show pending waveform immediately
1447
  pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
1448
  yield gr.update(), gr.update(value=pending_html)
1449
 
1450
+ seg_path = _extract_segment_clip(
1451
+ meta["silent_video"], seg_start, seg_end - seg_start,
1452
+ os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1453
+ )
 
 
1454
  _regen_hunyuan_gpu._cpu_ctx = {"seg_path": seg_path}
1455
 
1456
  new_wav_raw, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
1457
  prompt, negative_prompt, seed_val,
1458
  guidance_scale, num_steps, model_size,
1459
  crossfade_s, crossfade_db, slot_id)
1460
+ video_path, waveform_html = _xregen_splice(new_wav_raw, src_sr, meta, seg_idx, slot_id)
 
 
 
 
1461
  yield gr.update(value=video_path), gr.update(value=waveform_html)
1462
 
1463
 
 
1559
  Renders a dark bar with the active segment highlighted in amber + a spinner.
1560
  """
1561
  segs_json = json.dumps(segments)
1562
+ seg_colors = [c.format(a="0.25") for c in SEG_COLORS]
 
 
 
1563
  active_color = "rgba(255,180,0,0.55)"
1564
  duration = segments[-1][1] if segments else 1.0
1565
 
 
1626
  audio_url = f"/gradio_api/file={audio_path}"
1627
 
1628
  segs_json = json.dumps(segments)
1629
+ seg_colors = [c.format(a="0.35") for c in SEG_COLORS]
 
 
 
 
1630
 
1631
  # NOTE: Gradio updates gr.HTML via innerHTML which does NOT execute <script> tags.
1632
  # Solution: put the entire waveform (canvas + JS) inside an <iframe srcdoc="...">.
 
1830
  </html>"""
1831
 
1832
  # Escape for HTML attribute (srcdoc uses HTML entities)
1833
+ srcdoc = _html.escape(iframe_inner, quote=True)
1834
+ state_escaped = _html.escape(state_json or "", quote=True)
 
 
 
1835
 
1836
  return f"""
1837
  <div id="wf_container_{slot_id}"