Spaces:
Running on Zero
Cleanup: remove duplication and dead code across app.py
- Remove [DIAG] print left over from debugging
- Move torchaudio to top-level import; drop per-function import torchaudio
inside generate_mmaudio and generate_hunyuan
- Replace sf.write in generate_taro with torchaudio.save (consistent with
other two models); drop soundfile import
- Remove _taro_build_segments wrapper — was a one-liner; call _build_segments
with TARO_MODEL_DUR directly at both call sites
- Drop MMA_CF_S/MMA_CF_DB and CF_S/CF_DB local aliases in generate_mmaudio
and generate_hunyuan — cast crossfade_s/crossfade_db once at function top
- Extract _make_output_slots() — builds the 8-slot video+audio output column;
replaces identical 7-line loop duplicated across all 3 tabs
- Extract _unpack_outputs(flat, n) — turns _pad_outputs list into Gradio
update lists; replaces identical 4-line block in all 3 _run_* functions
|
@@ -15,9 +15,8 @@ from math import floor
|
|
| 15 |
from pathlib import Path
|
| 16 |
|
| 17 |
import torch
|
| 18 |
-
print(f"[DIAG] torch={torch.__version__} cuda={torch.version.cuda}")
|
| 19 |
import numpy as np
|
| 20 |
-
import
|
| 21 |
import ffmpeg
|
| 22 |
import spaces
|
| 23 |
import gradio as gr
|
|
@@ -171,13 +170,8 @@ TARO_SECS_PER_STEP = 2.5 # estimated GPU-seconds per diffusion step
|
|
| 171 |
_TARO_INFERENCE_CACHE: dict = {}
|
| 172 |
|
| 173 |
|
| 174 |
-
def _taro_build_segments(total_dur_s: float, crossfade_s: float) -> list:
|
| 175 |
-
"""Sliding-window segmentation using TARO's 8.192 s window."""
|
| 176 |
-
return _build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s)
|
| 177 |
-
|
| 178 |
-
|
| 179 |
def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
|
| 180 |
-
n_segs = len(
|
| 181 |
time_per_seg = num_steps * TARO_SECS_PER_STEP
|
| 182 |
max_s = floor(600.0 / (n_segs * time_per_seg))
|
| 183 |
return max(1, min(max_s, MAX_SLOTS))
|
|
@@ -325,7 +319,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 325 |
|
| 326 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
| 327 |
total_dur_s = cavp_feats.shape[0] / TARO_FPS
|
| 328 |
-
segments =
|
| 329 |
|
| 330 |
outputs = []
|
| 331 |
for sample_idx in range(num_samples):
|
|
@@ -355,7 +349,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 355 |
|
| 356 |
final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
|
| 357 |
audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
|
| 358 |
-
|
| 359 |
video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
|
| 360 |
mux_video_audio(silent_video, audio_path, video_path)
|
| 361 |
outputs.append((video_path, audio_path))
|
|
@@ -382,7 +376,6 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 382 |
cfg_strength, num_steps, num_samples,
|
| 383 |
crossfade_s=1.0, crossfade_db=3.0):
|
| 384 |
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
|
| 385 |
-
# MMAudio is a local package in ./MMAudio/ — add it to sys.path so imports work.
|
| 386 |
import sys as _sys, os as _os
|
| 387 |
_mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
|
| 388 |
if _mmaudio_dir not in _sys.path:
|
|
@@ -431,10 +424,10 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 431 |
|
| 432 |
# MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
|
| 433 |
# with a crossfade overlap and stitch the results into a full-length track.
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
segments
|
| 438 |
print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
|
| 439 |
|
| 440 |
sr = seq_cfg.sampling_rate # 44100
|
|
@@ -488,7 +481,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 488 |
# Crossfade-stitch all segments using shared equal-power helper
|
| 489 |
full_wav = seg_audios[0]
|
| 490 |
for nw in seg_audios[1:]:
|
| 491 |
-
full_wav = _cf_join(full_wav, nw,
|
| 492 |
full_wav = full_wav[:, : int(round(total_dur_s * sr))]
|
| 493 |
|
| 494 |
audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
|
|
@@ -521,7 +514,6 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 521 |
guidance_scale, num_steps, model_size, num_samples,
|
| 522 |
crossfade_s=2.0, crossfade_db=3.0):
|
| 523 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
|
| 524 |
-
import torchaudio
|
| 525 |
import sys as _sys
|
| 526 |
# Ensure HunyuanVideo-Foley package is importable
|
| 527 |
_hf_path = str(Path("HunyuanVideo-Foley").resolve())
|
|
@@ -564,10 +556,10 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 564 |
# HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
|
| 565 |
# input into overlapping segments, generate audio for each, then crossfade-
|
| 566 |
# stitch the results into a single full-length audio track.
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
segments
|
| 571 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 572 |
|
| 573 |
# Pre-encode text features once (same for every segment)
|
|
@@ -624,7 +616,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 624 |
# Crossfade-stitch all segments using shared equal-power helper
|
| 625 |
full_wav = seg_wavs[0]
|
| 626 |
for nw in seg_wavs[1:]:
|
| 627 |
-
full_wav = _cf_join(full_wav, nw,
|
| 628 |
# Trim to exact video duration
|
| 629 |
full_wav = full_wav[:, : int(round(total_dur_s * sr))]
|
| 630 |
|
|
@@ -652,6 +644,27 @@ def _pad_outputs(outputs: list) -> list:
|
|
| 652 |
return result
|
| 653 |
|
| 654 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
def _on_video_upload_taro(video_file, num_steps, crossfade_s):
|
| 656 |
if video_file is None:
|
| 657 |
return gr.update(maximum=MAX_SLOTS, value=1)
|
|
@@ -702,14 +715,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 702 |
taro_btn = gr.Button("Generate", variant="primary")
|
| 703 |
|
| 704 |
with gr.Column():
|
| 705 |
-
taro_slot_grps, taro_slot_vids, taro_slot_auds =
|
| 706 |
-
for i in range(MAX_SLOTS):
|
| 707 |
-
with gr.Group(visible=(i == 0)) as g:
|
| 708 |
-
sv = gr.Video(label=f"Generation {i+1} — Video")
|
| 709 |
-
sa = gr.Audio(label=f"Generation {i+1} — Audio")
|
| 710 |
-
taro_slot_grps.append(g)
|
| 711 |
-
taro_slot_vids.append(sv)
|
| 712 |
-
taro_slot_auds.append(sa)
|
| 713 |
|
| 714 |
for trigger in [taro_video, taro_steps, taro_cf_dur]:
|
| 715 |
trigger.change(
|
|
@@ -724,12 +730,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 724 |
)
|
| 725 |
|
| 726 |
def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
|
| 727 |
-
|
| 728 |
-
n = int(n)
|
| 729 |
-
grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
|
| 730 |
-
vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
|
| 731 |
-
aud_upd = [gr.update(value=flat[i * 2 + 1]) for i in range(MAX_SLOTS)]
|
| 732 |
-
return grp_upd + vid_upd + aud_upd
|
| 733 |
|
| 734 |
taro_btn.click(
|
| 735 |
fn=_run_taro,
|
|
@@ -756,14 +757,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 756 |
mma_btn = gr.Button("Generate", variant="primary")
|
| 757 |
|
| 758 |
with gr.Column():
|
| 759 |
-
mma_slot_grps, mma_slot_vids, mma_slot_auds =
|
| 760 |
-
for i in range(MAX_SLOTS):
|
| 761 |
-
with gr.Group(visible=(i == 0)) as g:
|
| 762 |
-
sv = gr.Video(label=f"Generation {i+1} — Video")
|
| 763 |
-
sa = gr.Audio(label=f"Generation {i+1} — Audio")
|
| 764 |
-
mma_slot_grps.append(g)
|
| 765 |
-
mma_slot_vids.append(sv)
|
| 766 |
-
mma_slot_auds.append(sa)
|
| 767 |
|
| 768 |
mma_samples.change(
|
| 769 |
fn=_update_slot_visibility,
|
|
@@ -772,13 +766,8 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 772 |
)
|
| 773 |
|
| 774 |
def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
n = int(n)
|
| 778 |
-
grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
|
| 779 |
-
vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
|
| 780 |
-
aud_upd = [gr.update(value=flat[i * 2 + 1]) for i in range(MAX_SLOTS)]
|
| 781 |
-
return grp_upd + vid_upd + aud_upd
|
| 782 |
|
| 783 |
mma_btn.click(
|
| 784 |
fn=_run_mmaudio,
|
|
@@ -806,14 +795,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 806 |
hf_btn = gr.Button("Generate", variant="primary")
|
| 807 |
|
| 808 |
with gr.Column():
|
| 809 |
-
hf_slot_grps, hf_slot_vids, hf_slot_auds =
|
| 810 |
-
for i in range(MAX_SLOTS):
|
| 811 |
-
with gr.Group(visible=(i == 0)) as g:
|
| 812 |
-
sv = gr.Video(label=f"Generation {i+1} — Video")
|
| 813 |
-
sa = gr.Audio(label=f"Generation {i+1} — Audio")
|
| 814 |
-
hf_slot_grps.append(g)
|
| 815 |
-
hf_slot_vids.append(sv)
|
| 816 |
-
hf_slot_auds.append(sa)
|
| 817 |
|
| 818 |
hf_samples.change(
|
| 819 |
fn=_update_slot_visibility,
|
|
@@ -822,13 +804,8 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 822 |
)
|
| 823 |
|
| 824 |
def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
n = int(n)
|
| 828 |
-
grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
|
| 829 |
-
vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
|
| 830 |
-
aud_upd = [gr.update(value=flat[i * 2 + 1]) for i in range(MAX_SLOTS)]
|
| 831 |
-
return grp_upd + vid_upd + aud_upd
|
| 832 |
|
| 833 |
hf_btn.click(
|
| 834 |
fn=_run_hunyuan,
|
|
|
|
| 15 |
from pathlib import Path
|
| 16 |
|
| 17 |
import torch
|
|
|
|
| 18 |
import numpy as np
|
| 19 |
+
import torchaudio
|
| 20 |
import ffmpeg
|
| 21 |
import spaces
|
| 22 |
import gradio as gr
|
|
|
|
| 170 |
_TARO_INFERENCE_CACHE: dict = {}
|
| 171 |
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float,
                           gpu_budget_s: float = 600.0) -> int:
    """Return the maximum number of samples that fit the GPU time budget.

    Cost model: the video is split into sliding-window segments of
    TARO_MODEL_DUR seconds, and each segment costs roughly
    ``num_steps * TARO_SECS_PER_STEP`` GPU-seconds per sample.

    Args:
        total_dur_s: Total video duration in seconds.
        num_steps: Diffusion steps per segment.
        crossfade_s: Crossfade overlap between segments, in seconds.
        gpu_budget_s: Total GPU-seconds available. Defaults to 600 —
            presumably the Space's per-call GPU quota; confirm against
            the deployment settings.

    Returns:
        Sample count clamped to the range [1, MAX_SLOTS].
    """
    n_segs = len(_build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s))
    time_per_seg = num_steps * TARO_SECS_PER_STEP
    max_s = floor(gpu_budget_s / (n_segs * time_per_seg))
    return max(1, min(max_s, MAX_SLOTS))
|
|
|
|
| 319 |
|
| 320 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
| 321 |
total_dur_s = cavp_feats.shape[0] / TARO_FPS
|
| 322 |
+
segments = _build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s)
|
| 323 |
|
| 324 |
outputs = []
|
| 325 |
for sample_idx in range(num_samples):
|
|
|
|
| 349 |
|
| 350 |
final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
|
| 351 |
audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
|
| 352 |
+
torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(final_wav)).unsqueeze(0), TARO_SR)
|
| 353 |
video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
|
| 354 |
mux_video_audio(silent_video, audio_path, video_path)
|
| 355 |
outputs.append((video_path, audio_path))
|
|
|
|
| 376 |
cfg_strength, num_steps, num_samples,
|
| 377 |
crossfade_s=1.0, crossfade_db=3.0):
|
| 378 |
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
|
|
|
|
| 379 |
import sys as _sys, os as _os
|
| 380 |
_mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
|
| 381 |
if _mmaudio_dir not in _sys.path:
|
|
|
|
| 424 |
|
| 425 |
# MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
|
| 426 |
# with a crossfade overlap and stitch the results into a full-length track.
|
| 427 |
+
crossfade_s = float(crossfade_s)
|
| 428 |
+
crossfade_db = float(crossfade_db)
|
| 429 |
+
total_dur_s = get_video_duration(video_file)
|
| 430 |
+
segments = _build_segments(total_dur_s, MMAUDIO_WINDOW, crossfade_s)
|
| 431 |
print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
|
| 432 |
|
| 433 |
sr = seq_cfg.sampling_rate # 44100
|
|
|
|
| 481 |
# Crossfade-stitch all segments using shared equal-power helper
|
| 482 |
full_wav = seg_audios[0]
|
| 483 |
for nw in seg_audios[1:]:
|
| 484 |
+
full_wav = _cf_join(full_wav, nw, crossfade_s, crossfade_db, sr)
|
| 485 |
full_wav = full_wav[:, : int(round(total_dur_s * sr))]
|
| 486 |
|
| 487 |
audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
|
|
|
|
| 514 |
guidance_scale, num_steps, model_size, num_samples,
|
| 515 |
crossfade_s=2.0, crossfade_db=3.0):
|
| 516 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
|
|
|
|
| 517 |
import sys as _sys
|
| 518 |
# Ensure HunyuanVideo-Foley package is importable
|
| 519 |
_hf_path = str(Path("HunyuanVideo-Foley").resolve())
|
|
|
|
| 556 |
# HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
|
| 557 |
# input into overlapping segments, generate audio for each, then crossfade-
|
| 558 |
# stitch the results into a single full-length audio track.
|
| 559 |
+
crossfade_s = float(crossfade_s)
|
| 560 |
+
crossfade_db = float(crossfade_db)
|
| 561 |
+
total_dur_s = get_video_duration(video_file)
|
| 562 |
+
segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
|
| 563 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 564 |
|
| 565 |
# Pre-encode text features once (same for every segment)
|
|
|
|
| 616 |
# Crossfade-stitch all segments using shared equal-power helper
|
| 617 |
full_wav = seg_wavs[0]
|
| 618 |
for nw in seg_wavs[1:]:
|
| 619 |
+
full_wav = _cf_join(full_wav, nw, crossfade_s, crossfade_db, sr)
|
| 620 |
# Trim to exact video duration
|
| 621 |
full_wav = full_wav[:, : int(round(total_dur_s * sr))]
|
| 622 |
|
|
|
|
| 644 |
return result
|
| 645 |
|
| 646 |
|
| 647 |
+
def _make_output_slots() -> tuple:
    """Create the MAX_SLOTS video+audio output groups for one tab.

    Only the first group starts visible; the others are toggled on
    demand by the slot-visibility handlers.

    Returns:
        A tuple ``(groups, videos, audios)`` of parallel lists.
    """
    groups: list = []
    videos: list = []
    audios: list = []
    for slot in range(MAX_SLOTS):
        with gr.Group(visible=(slot == 0)) as group:
            videos.append(gr.Video(label=f"Generation {slot+1} — Video"))
            audios.append(gr.Audio(label=f"Generation {slot+1} — Audio"))
        groups.append(group)
    return groups, videos, audios
|
| 656 |
+
|
| 657 |
+
|
| 658 |
+
def _unpack_outputs(flat: list, n: int) -> list:
    """Turn a flat _pad_outputs list into Gradio update lists.

    The returned list concatenates, in order: visibility updates for the
    slot groups, value updates for the video slots, and value updates
    for the audio slots (MAX_SLOTS entries each).
    """
    count = int(n)
    visibility = [gr.update(visible=(i < count)) for i in range(MAX_SLOTS)]
    videos = [gr.update(value=flat[2 * i]) for i in range(MAX_SLOTS)]
    audios = [gr.update(value=flat[2 * i + 1]) for i in range(MAX_SLOTS)]
    return visibility + videos + audios
|
| 666 |
+
|
| 667 |
+
|
| 668 |
def _on_video_upload_taro(video_file, num_steps, crossfade_s):
|
| 669 |
if video_file is None:
|
| 670 |
return gr.update(maximum=MAX_SLOTS, value=1)
|
|
|
|
| 715 |
taro_btn = gr.Button("Generate", variant="primary")
|
| 716 |
|
| 717 |
with gr.Column():
|
| 718 |
+
taro_slot_grps, taro_slot_vids, taro_slot_auds = _make_output_slots()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 719 |
|
| 720 |
for trigger in [taro_video, taro_steps, taro_cf_dur]:
|
| 721 |
trigger.change(
|
|
|
|
| 730 |
)
|
| 731 |
|
| 732 |
def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
    # Generate, then expand the flat (video, audio) list into slot updates.
    outputs = generate_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n)
    return _unpack_outputs(outputs, n)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
|
| 735 |
taro_btn.click(
|
| 736 |
fn=_run_taro,
|
|
|
|
| 757 |
mma_btn = gr.Button("Generate", variant="primary")
|
| 758 |
|
| 759 |
with gr.Column():
|
| 760 |
+
mma_slot_grps, mma_slot_vids, mma_slot_auds = _make_output_slots()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 761 |
|
| 762 |
mma_samples.change(
|
| 763 |
fn=_update_slot_visibility,
|
|
|
|
| 766 |
)
|
| 767 |
|
| 768 |
def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
    # Generate, then expand the flat (video, audio) list into slot updates.
    outputs = generate_mmaudio(video, prompt, neg, seed, cfg, steps, n,
                               crossfade_s=cf_dur, crossfade_db=cf_db)
    return _unpack_outputs(outputs, n)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 771 |
|
| 772 |
mma_btn.click(
|
| 773 |
fn=_run_mmaudio,
|
|
|
|
| 795 |
hf_btn = gr.Button("Generate", variant="primary")
|
| 796 |
|
| 797 |
with gr.Column():
|
| 798 |
+
hf_slot_grps, hf_slot_vids, hf_slot_auds = _make_output_slots()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 799 |
|
| 800 |
hf_samples.change(
|
| 801 |
fn=_update_slot_visibility,
|
|
|
|
| 804 |
)
|
| 805 |
|
| 806 |
def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
    # Generate, then expand the flat (video, audio) list into slot updates.
    outputs = generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, n,
                               crossfade_s=cf_dur, crossfade_db=cf_db)
    return _unpack_outputs(outputs, n)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 809 |
|
| 810 |
hf_btn.click(
|
| 811 |
fn=_run_hunyuan,
|