Spaces:
Running on Zero
Fix dynamic duration arg order mismatch + leftover torchaudio import
Browse files
Duration callables were receiving args in .click() positional order but
signatures had num_samples before crossfade_s/crossfade_db, causing
crossfade values to be interpreted as num_samples (inflating reservation).
Fix: reorder _mmaudio_duration, generate_mmaudio, _hunyuan_duration,
generate_hunyuan signatures to match .click() input order exactly:
...num_steps, crossfade_s, crossfade_db, num_samples
Update _run_mmaudio and _run_hunyuan wrappers to pass args positionally.
Move crossfade_s/crossfade_db casts to top of each generate function,
remove duplicate casts that were further down. Remove stray torchaudio
import inside generate_mmaudio (already top-level). Add diagnostic prints
to all three duration callables to log actual reservation in logs.
|
@@ -185,13 +185,14 @@ def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: floa
|
|
| 185 |
|
| 186 |
def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 187 |
crossfade_s, crossfade_db, num_samples):
|
| 188 |
-
"""Pre-GPU callable
|
| 189 |
try:
|
| 190 |
total_s = get_video_duration(video_file)
|
| 191 |
n_segs = len(_build_segments(total_s, TARO_MODEL_DUR, float(crossfade_s)))
|
| 192 |
except Exception:
|
| 193 |
n_segs = 1
|
| 194 |
secs = int(num_samples) * n_segs * int(num_steps) * TARO_SECS_PER_STEP + TARO_LOAD_OVERHEAD
|
|
|
|
| 195 |
return max(60, int(secs))
|
| 196 |
|
| 197 |
|
|
@@ -391,36 +392,36 @@ MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
|
|
| 391 |
|
| 392 |
|
| 393 |
def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
|
| 394 |
-
cfg_strength, num_steps,
|
| 395 |
-
|
| 396 |
-
"""Pre-GPU callable: returns the GPU seconds to reserve for this MMAudio run."""
|
| 397 |
try:
|
| 398 |
total_s = get_video_duration(video_file)
|
| 399 |
n_segs = len(_build_segments(total_s, MMAUDIO_WINDOW, float(crossfade_s)))
|
| 400 |
except Exception:
|
| 401 |
n_segs = 1
|
| 402 |
secs = int(num_samples) * n_segs * int(num_steps) * MMAUDIO_SECS_PER_STEP + MMAUDIO_LOAD_OVERHEAD
|
|
|
|
| 403 |
return max(60, int(secs))
|
| 404 |
|
| 405 |
|
| 406 |
@spaces.GPU(duration=_mmaudio_duration)
|
| 407 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 408 |
-
cfg_strength, num_steps,
|
| 409 |
-
crossfade_s=2.0, crossfade_db=3.0):
|
| 410 |
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
|
| 411 |
import sys as _sys, os as _os
|
| 412 |
_mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
|
| 413 |
if _mmaudio_dir not in _sys.path:
|
| 414 |
_sys.path.insert(0, _mmaudio_dir)
|
| 415 |
|
| 416 |
-
import torchaudio
|
| 417 |
from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
|
| 418 |
from mmaudio.model.flow_matching import FlowMatching
|
| 419 |
from mmaudio.model.networks import get_my_mmaudio
|
| 420 |
from mmaudio.model.utils.features_utils import FeaturesUtils
|
| 421 |
|
| 422 |
-
seed_val
|
| 423 |
-
num_samples
|
|
|
|
|
|
|
| 424 |
|
| 425 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 426 |
dtype = torch.bfloat16
|
|
@@ -456,8 +457,6 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 456 |
|
| 457 |
# MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
|
| 458 |
# with a crossfade overlap and stitch the results into a full-length track.
|
| 459 |
-
crossfade_s = float(crossfade_s)
|
| 460 |
-
crossfade_db = float(crossfade_db)
|
| 461 |
total_dur_s = get_video_duration(video_file)
|
| 462 |
segments = _build_segments(total_dur_s, MMAUDIO_WINDOW, crossfade_s)
|
| 463 |
print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
|
|
@@ -543,22 +542,21 @@ HUNYUAN_MAX_DUR = 15.0 # seconds
|
|
| 543 |
|
| 544 |
|
| 545 |
def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
|
| 546 |
-
guidance_scale, num_steps, model_size,
|
| 547 |
-
|
| 548 |
-
"""Pre-GPU callable: returns the GPU seconds to reserve for this HunyuanFoley run."""
|
| 549 |
try:
|
| 550 |
total_s = get_video_duration(video_file)
|
| 551 |
n_segs = len(_build_segments(total_s, HUNYUAN_MAX_DUR, float(crossfade_s)))
|
| 552 |
except Exception:
|
| 553 |
n_segs = 1
|
| 554 |
secs = int(num_samples) * n_segs * int(num_steps) * HUNYUAN_SECS_PER_STEP + HUNYUAN_LOAD_OVERHEAD
|
|
|
|
| 555 |
return max(60, int(secs))
|
| 556 |
|
| 557 |
|
| 558 |
@spaces.GPU(duration=_hunyuan_duration)
|
| 559 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 560 |
-
guidance_scale, num_steps, model_size,
|
| 561 |
-
crossfade_s=2.0, crossfade_db=3.0):
|
| 562 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
|
| 563 |
import sys as _sys
|
| 564 |
# Ensure HunyuanVideo-Foley package is importable
|
|
@@ -572,6 +570,8 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 572 |
|
| 573 |
seed_val = int(seed_val)
|
| 574 |
num_samples = int(num_samples)
|
|
|
|
|
|
|
| 575 |
if seed_val >= 0:
|
| 576 |
set_global_seed(seed_val)
|
| 577 |
|
|
@@ -602,8 +602,6 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 602 |
# HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
|
| 603 |
# input into overlapping segments, generate audio for each, then crossfade-
|
| 604 |
# stitch the results into a single full-length audio track.
|
| 605 |
-
crossfade_s = float(crossfade_s)
|
| 606 |
-
crossfade_db = float(crossfade_db)
|
| 607 |
total_dur_s = get_video_duration(video_file)
|
| 608 |
segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
|
| 609 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
|
@@ -812,8 +810,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 812 |
)
|
| 813 |
|
| 814 |
def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
|
| 815 |
-
return _unpack_outputs(generate_mmaudio(video, prompt, neg, seed, cfg, steps, n,
|
| 816 |
-
crossfade_s=cf_dur, crossfade_db=cf_db), n)
|
| 817 |
|
| 818 |
mma_btn.click(
|
| 819 |
fn=_run_mmaudio,
|
|
@@ -850,8 +847,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 850 |
)
|
| 851 |
|
| 852 |
def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
|
| 853 |
-
return _unpack_outputs(generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, n,
|
| 854 |
-
crossfade_s=cf_dur, crossfade_db=cf_db), n)
|
| 855 |
|
| 856 |
hf_btn.click(
|
| 857 |
fn=_run_hunyuan,
|
|
|
|
| 185 |
|
| 186 |
def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
                   crossfade_s, crossfade_db, num_samples):
    """Pre-GPU callable — must match _run_taro's input order exactly.

    Estimates the GPU seconds to reserve: per-step cost scaled by the number
    of samples, segments, and diffusion steps, plus model-load overhead.
    """
    n_segs = 1  # safe fallback when the video can't be probed yet
    try:
        total_s = get_video_duration(video_file)
        n_segs = len(_build_segments(total_s, TARO_MODEL_DUR, float(crossfade_s)))
    except Exception:
        pass
    secs = TARO_LOAD_OVERHEAD + TARO_SECS_PER_STEP * int(num_steps) * n_segs * int(num_samples)
    # Diagnostic: shows up in the Space logs so the actual reservation is auditable.
    print(f"[duration] TARO: {int(num_samples)}s × {n_segs}seg × {int(num_steps)}steps = {secs:.0f}s reserved")
    # Never reserve less than one minute.
    return max(60, int(secs))
|
| 197 |
|
| 198 |
|
|
|
|
| 392 |
|
| 393 |
|
| 394 |
def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
                      cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
    """Pre-GPU callable — must match _run_mmaudio's input order exactly.

    Returns the GPU seconds to reserve for an MMAudio run: per-step cost
    scaled by samples, 8 s segments, and steps, plus model-load overhead.
    """
    n_segs = 1  # safe fallback when the video can't be probed yet
    try:
        total_s = get_video_duration(video_file)
        n_segs = len(_build_segments(total_s, MMAUDIO_WINDOW, float(crossfade_s)))
    except Exception:
        pass
    secs = MMAUDIO_LOAD_OVERHEAD + MMAUDIO_SECS_PER_STEP * int(num_steps) * n_segs * int(num_samples)
    # Diagnostic: shows up in the Space logs so the actual reservation is auditable.
    print(f"[duration] MMAudio: {int(num_samples)}s × {n_segs}seg × {int(num_steps)}steps = {secs:.0f}s reserved")
    # Never reserve less than one minute.
    return max(60, int(secs))
|
| 405 |
|
| 406 |
|
| 407 |
@spaces.GPU(duration=_mmaudio_duration)
|
| 408 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 409 |
+
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
|
|
|
|
| 410 |
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
|
| 411 |
import sys as _sys, os as _os
|
| 412 |
_mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
|
| 413 |
if _mmaudio_dir not in _sys.path:
|
| 414 |
_sys.path.insert(0, _mmaudio_dir)
|
| 415 |
|
|
|
|
| 416 |
from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
|
| 417 |
from mmaudio.model.flow_matching import FlowMatching
|
| 418 |
from mmaudio.model.networks import get_my_mmaudio
|
| 419 |
from mmaudio.model.utils.features_utils import FeaturesUtils
|
| 420 |
|
| 421 |
+
seed_val = int(seed_val)
|
| 422 |
+
num_samples = int(num_samples)
|
| 423 |
+
crossfade_s = float(crossfade_s)
|
| 424 |
+
crossfade_db = float(crossfade_db)
|
| 425 |
|
| 426 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 427 |
dtype = torch.bfloat16
|
|
|
|
| 457 |
|
| 458 |
# MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
|
| 459 |
# with a crossfade overlap and stitch the results into a full-length track.
|
|
|
|
|
|
|
| 460 |
total_dur_s = get_video_duration(video_file)
|
| 461 |
segments = _build_segments(total_dur_s, MMAUDIO_WINDOW, crossfade_s)
|
| 462 |
print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
|
|
|
|
| 542 |
|
| 543 |
|
| 544 |
def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
                      guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
    """Pre-GPU callable — must match _run_hunyuan's input order exactly.

    Returns the GPU seconds to reserve for a HunyuanFoley run: per-step cost
    scaled by samples, 15 s segments, and steps, plus model-load overhead.
    """
    n_segs = 1  # safe fallback when the video can't be probed yet
    try:
        total_s = get_video_duration(video_file)
        n_segs = len(_build_segments(total_s, HUNYUAN_MAX_DUR, float(crossfade_s)))
    except Exception:
        pass
    secs = HUNYUAN_LOAD_OVERHEAD + HUNYUAN_SECS_PER_STEP * int(num_steps) * n_segs * int(num_samples)
    # Diagnostic: shows up in the Space logs so the actual reservation is auditable.
    print(f"[duration] HunyuanFoley: {int(num_samples)}s × {n_segs}seg × {int(num_steps)}steps = {secs:.0f}s reserved")
    # Never reserve less than one minute.
    return max(60, int(secs))
|
| 555 |
|
| 556 |
|
| 557 |
@spaces.GPU(duration=_hunyuan_duration)
|
| 558 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 559 |
+
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
|
|
|
| 560 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
|
| 561 |
import sys as _sys
|
| 562 |
# Ensure HunyuanVideo-Foley package is importable
|
|
|
|
| 570 |
|
| 571 |
seed_val = int(seed_val)
|
| 572 |
num_samples = int(num_samples)
|
| 573 |
+
crossfade_s = float(crossfade_s)
|
| 574 |
+
crossfade_db = float(crossfade_db)
|
| 575 |
if seed_val >= 0:
|
| 576 |
set_global_seed(seed_val)
|
| 577 |
|
|
|
|
| 602 |
# HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
|
| 603 |
# input into overlapping segments, generate audio for each, then crossfade-
|
| 604 |
# stitch the results into a single full-length audio track.
|
|
|
|
|
|
|
| 605 |
total_dur_s = get_video_duration(video_file)
|
| 606 |
segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
|
| 607 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
|
|
|
| 810 |
)
|
| 811 |
|
| 812 |
def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
    # Forward the UI inputs positionally — this order must match both
    # generate_mmaudio's signature and the _mmaudio_duration callable.
    outputs = generate_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n)
    return _unpack_outputs(outputs, n)
|
|
|
|
| 814 |
|
| 815 |
mma_btn.click(
|
| 816 |
fn=_run_mmaudio,
|
|
|
|
| 847 |
)
|
| 848 |
|
| 849 |
def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
    # Forward the UI inputs positionally — this order must match both
    # generate_hunyuan's signature and the _hunyuan_duration callable.
    outputs = generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n)
    return _unpack_outputs(outputs, n)
|
|
|
|
| 851 |
|
| 852 |
hf_btn.click(
|
| 853 |
fn=_run_hunyuan,
|