BoxOfColors committed on
Commit
679c604
·
1 Parent(s): 537e4ca

Fix dynamic duration arg order mismatch + leftover torchaudio import

Browse files

Duration callables receive their args in .click() positional order, but their
signatures had num_samples before crossfade_s/crossfade_db, so the crossfade_s
value was interpreted as num_samples (inflating the GPU reservation).

Fix: reorder _mmaudio_duration, generate_mmaudio, _hunyuan_duration,
generate_hunyuan signatures to match .click() input order exactly:
...num_steps, crossfade_s, crossfade_db, num_samples

Update _run_mmaudio and _run_hunyuan wrappers to pass args positionally.
Move crossfade_s/crossfade_db casts to top of each generate function,
remove duplicate casts that were further down. Remove stray torchaudio
import inside generate_mmaudio (already imported at top level). Add diagnostic
prints to all three duration callables so the computed reservation shows up in the logs.

Files changed (1) hide show
  1. app.py +18 -22
app.py CHANGED
@@ -185,13 +185,14 @@ def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: floa
185
 
186
  def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
187
  crossfade_s, crossfade_db, num_samples):
188
- """Pre-GPU callable: returns the GPU seconds to reserve for this TARO run."""
189
  try:
190
  total_s = get_video_duration(video_file)
191
  n_segs = len(_build_segments(total_s, TARO_MODEL_DUR, float(crossfade_s)))
192
  except Exception:
193
  n_segs = 1
194
  secs = int(num_samples) * n_segs * int(num_steps) * TARO_SECS_PER_STEP + TARO_LOAD_OVERHEAD
 
195
  return max(60, int(secs))
196
 
197
 
@@ -391,36 +392,36 @@ MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
391
 
392
 
393
  def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
394
- cfg_strength, num_steps, num_samples,
395
- crossfade_s=2.0, crossfade_db=3.0):
396
- """Pre-GPU callable: returns the GPU seconds to reserve for this MMAudio run."""
397
  try:
398
  total_s = get_video_duration(video_file)
399
  n_segs = len(_build_segments(total_s, MMAUDIO_WINDOW, float(crossfade_s)))
400
  except Exception:
401
  n_segs = 1
402
  secs = int(num_samples) * n_segs * int(num_steps) * MMAUDIO_SECS_PER_STEP + MMAUDIO_LOAD_OVERHEAD
 
403
  return max(60, int(secs))
404
 
405
 
406
  @spaces.GPU(duration=_mmaudio_duration)
407
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
408
- cfg_strength, num_steps, num_samples,
409
- crossfade_s=2.0, crossfade_db=3.0):
410
  """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
411
  import sys as _sys, os as _os
412
  _mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
413
  if _mmaudio_dir not in _sys.path:
414
  _sys.path.insert(0, _mmaudio_dir)
415
 
416
- import torchaudio
417
  from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
418
  from mmaudio.model.flow_matching import FlowMatching
419
  from mmaudio.model.networks import get_my_mmaudio
420
  from mmaudio.model.utils.features_utils import FeaturesUtils
421
 
422
- seed_val = int(seed_val)
423
- num_samples = int(num_samples)
 
 
424
 
425
  device = "cuda" if torch.cuda.is_available() else "cpu"
426
  dtype = torch.bfloat16
@@ -456,8 +457,6 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
456
 
457
  # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
458
  # with a crossfade overlap and stitch the results into a full-length track.
459
- crossfade_s = float(crossfade_s)
460
- crossfade_db = float(crossfade_db)
461
  total_dur_s = get_video_duration(video_file)
462
  segments = _build_segments(total_dur_s, MMAUDIO_WINDOW, crossfade_s)
463
  print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
@@ -543,22 +542,21 @@ HUNYUAN_MAX_DUR = 15.0 # seconds
543
 
544
 
545
  def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
546
- guidance_scale, num_steps, model_size, num_samples,
547
- crossfade_s=2.0, crossfade_db=3.0):
548
- """Pre-GPU callable: returns the GPU seconds to reserve for this HunyuanFoley run."""
549
  try:
550
  total_s = get_video_duration(video_file)
551
  n_segs = len(_build_segments(total_s, HUNYUAN_MAX_DUR, float(crossfade_s)))
552
  except Exception:
553
  n_segs = 1
554
  secs = int(num_samples) * n_segs * int(num_steps) * HUNYUAN_SECS_PER_STEP + HUNYUAN_LOAD_OVERHEAD
 
555
  return max(60, int(secs))
556
 
557
 
558
  @spaces.GPU(duration=_hunyuan_duration)
559
  def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
560
- guidance_scale, num_steps, model_size, num_samples,
561
- crossfade_s=2.0, crossfade_db=3.0):
562
  """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
563
  import sys as _sys
564
  # Ensure HunyuanVideo-Foley package is importable
@@ -572,6 +570,8 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
572
 
573
  seed_val = int(seed_val)
574
  num_samples = int(num_samples)
 
 
575
  if seed_val >= 0:
576
  set_global_seed(seed_val)
577
 
@@ -602,8 +602,6 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
602
  # HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
603
  # input into overlapping segments, generate audio for each, then crossfade-
604
  # stitch the results into a single full-length audio track.
605
- crossfade_s = float(crossfade_s)
606
- crossfade_db = float(crossfade_db)
607
  total_dur_s = get_video_duration(video_file)
608
  segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
609
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
@@ -812,8 +810,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
812
  )
813
 
814
  def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
815
- return _unpack_outputs(generate_mmaudio(video, prompt, neg, seed, cfg, steps, n,
816
- crossfade_s=cf_dur, crossfade_db=cf_db), n)
817
 
818
  mma_btn.click(
819
  fn=_run_mmaudio,
@@ -850,8 +847,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
850
  )
851
 
852
  def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
853
- return _unpack_outputs(generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, n,
854
- crossfade_s=cf_dur, crossfade_db=cf_db), n)
855
 
856
  hf_btn.click(
857
  fn=_run_hunyuan,
 
185
 
186
  def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
187
  crossfade_s, crossfade_db, num_samples):
188
+ """Pre-GPU callable must match _run_taro's input order exactly."""
189
  try:
190
  total_s = get_video_duration(video_file)
191
  n_segs = len(_build_segments(total_s, TARO_MODEL_DUR, float(crossfade_s)))
192
  except Exception:
193
  n_segs = 1
194
  secs = int(num_samples) * n_segs * int(num_steps) * TARO_SECS_PER_STEP + TARO_LOAD_OVERHEAD
195
+ print(f"[duration] TARO: {int(num_samples)}s × {n_segs}seg × {int(num_steps)}steps = {secs:.0f}s reserved")
196
  return max(60, int(secs))
197
 
198
 
 
392
 
393
 
394
  def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
395
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
396
+ """Pre-GPU callable — must match _run_mmaudio's input order exactly."""
 
397
  try:
398
  total_s = get_video_duration(video_file)
399
  n_segs = len(_build_segments(total_s, MMAUDIO_WINDOW, float(crossfade_s)))
400
  except Exception:
401
  n_segs = 1
402
  secs = int(num_samples) * n_segs * int(num_steps) * MMAUDIO_SECS_PER_STEP + MMAUDIO_LOAD_OVERHEAD
403
+ print(f"[duration] MMAudio: {int(num_samples)}s × {n_segs}seg × {int(num_steps)}steps = {secs:.0f}s reserved")
404
  return max(60, int(secs))
405
 
406
 
407
  @spaces.GPU(duration=_mmaudio_duration)
408
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
409
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
 
410
  """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
411
  import sys as _sys, os as _os
412
  _mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
413
  if _mmaudio_dir not in _sys.path:
414
  _sys.path.insert(0, _mmaudio_dir)
415
 
 
416
  from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
417
  from mmaudio.model.flow_matching import FlowMatching
418
  from mmaudio.model.networks import get_my_mmaudio
419
  from mmaudio.model.utils.features_utils import FeaturesUtils
420
 
421
+ seed_val = int(seed_val)
422
+ num_samples = int(num_samples)
423
+ crossfade_s = float(crossfade_s)
424
+ crossfade_db = float(crossfade_db)
425
 
426
  device = "cuda" if torch.cuda.is_available() else "cpu"
427
  dtype = torch.bfloat16
 
457
 
458
  # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
459
  # with a crossfade overlap and stitch the results into a full-length track.
 
 
460
  total_dur_s = get_video_duration(video_file)
461
  segments = _build_segments(total_dur_s, MMAUDIO_WINDOW, crossfade_s)
462
  print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
 
542
 
543
 
544
  def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
545
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
546
+ """Pre-GPU callable — must match _run_hunyuan's input order exactly."""
 
547
  try:
548
  total_s = get_video_duration(video_file)
549
  n_segs = len(_build_segments(total_s, HUNYUAN_MAX_DUR, float(crossfade_s)))
550
  except Exception:
551
  n_segs = 1
552
  secs = int(num_samples) * n_segs * int(num_steps) * HUNYUAN_SECS_PER_STEP + HUNYUAN_LOAD_OVERHEAD
553
+ print(f"[duration] HunyuanFoley: {int(num_samples)}s × {n_segs}seg × {int(num_steps)}steps = {secs:.0f}s reserved")
554
  return max(60, int(secs))
555
 
556
 
557
  @spaces.GPU(duration=_hunyuan_duration)
558
  def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
559
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
 
560
  """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
561
  import sys as _sys
562
  # Ensure HunyuanVideo-Foley package is importable
 
570
 
571
  seed_val = int(seed_val)
572
  num_samples = int(num_samples)
573
+ crossfade_s = float(crossfade_s)
574
+ crossfade_db = float(crossfade_db)
575
  if seed_val >= 0:
576
  set_global_seed(seed_val)
577
 
 
602
  # HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
603
  # input into overlapping segments, generate audio for each, then crossfade-
604
  # stitch the results into a single full-length audio track.
 
 
605
  total_dur_s = get_video_duration(video_file)
606
  segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
607
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
 
810
  )
811
 
812
  def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
813
+ return _unpack_outputs(generate_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n), n)
 
814
 
815
  mma_btn.click(
816
  fn=_run_mmaudio,
 
847
  )
848
 
849
  def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
850
+ return _unpack_outputs(generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n), n)
 
851
 
852
  hf_btn.click(
853
  fn=_run_hunyuan,