Spaces:
Running on Zero
Fix dynamic duration arg order mismatch + leftover torchaudio import
Browse files
Duration callables were receiving args in .click() positional order but
signatures had num_samples before crossfade_s/crossfade_db, causing
crossfade values to be interpreted as num_samples (inflating reservation).
Fix: reorder _mmaudio_duration, generate_mmaudio, _hunyuan_duration,
generate_hunyuan signatures to match .click() input order exactly:
...num_steps, crossfade_s, crossfade_db, num_samples
Update _run_mmaudio and _run_hunyuan wrappers to pass args positionally.
Move crossfade_s/crossfade_db casts to top of each generate function,
remove duplicate casts that were further down. Remove stray torchaudio
import inside generate_mmaudio (already top-level). Add diagnostic prints
to all three duration callables to log actual reservation in logs.
|
@@ -185,13 +185,14 @@ def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: floa
|
|
| 185 |
|
| 186 |
def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 187 |
crossfade_s, crossfade_db, num_samples):
|
| 188 |
-
"""Pre-GPU callable
|
| 189 |
try:
|
| 190 |
total_s = get_video_duration(video_file)
|
| 191 |
n_segs = len(_build_segments(total_s, TARO_MODEL_DUR, float(crossfade_s)))
|
| 192 |
except Exception:
|
| 193 |
n_segs = 1
|
| 194 |
secs = int(num_samples) * n_segs * int(num_steps) * TARO_SECS_PER_STEP + TARO_LOAD_OVERHEAD
|
|
|
|
| 195 |
return max(60, int(secs))
|
| 196 |
|
| 197 |
|
|
@@ -391,36 +392,36 @@ MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
|
|
| 391 |
|
| 392 |
|
| 393 |
def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
|
| 394 |
-
cfg_strength, num_steps,
|
| 395 |
-
|
| 396 |
-
"""Pre-GPU callable: returns the GPU seconds to reserve for this MMAudio run."""
|
| 397 |
try:
|
| 398 |
total_s = get_video_duration(video_file)
|
| 399 |
n_segs = len(_build_segments(total_s, MMAUDIO_WINDOW, float(crossfade_s)))
|
| 400 |
except Exception:
|
| 401 |
n_segs = 1
|
| 402 |
secs = int(num_samples) * n_segs * int(num_steps) * MMAUDIO_SECS_PER_STEP + MMAUDIO_LOAD_OVERHEAD
|
|
|
|
| 403 |
return max(60, int(secs))
|
| 404 |
|
| 405 |
|
| 406 |
@spaces.GPU(duration=_mmaudio_duration)
|
| 407 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 408 |
-
cfg_strength, num_steps,
|
| 409 |
-
crossfade_s=2.0, crossfade_db=3.0):
|
| 410 |
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
|
| 411 |
import sys as _sys, os as _os
|
| 412 |
_mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
|
| 413 |
if _mmaudio_dir not in _sys.path:
|
| 414 |
_sys.path.insert(0, _mmaudio_dir)
|
| 415 |
|
| 416 |
-
import torchaudio
|
| 417 |
from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
|
| 418 |
from mmaudio.model.flow_matching import FlowMatching
|
| 419 |
from mmaudio.model.networks import get_my_mmaudio
|
| 420 |
from mmaudio.model.utils.features_utils import FeaturesUtils
|
| 421 |
|
| 422 |
-
seed_val
|
| 423 |
-
num_samples
|
|
|
|
|
|
|
| 424 |
|
| 425 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 426 |
dtype = torch.bfloat16
|
|
@@ -456,8 +457,6 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 456 |
|
| 457 |
# MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
|
| 458 |
# with a crossfade overlap and stitch the results into a full-length track.
|
| 459 |
-
crossfade_s = float(crossfade_s)
|
| 460 |
-
crossfade_db = float(crossfade_db)
|
| 461 |
total_dur_s = get_video_duration(video_file)
|
| 462 |
segments = _build_segments(total_dur_s, MMAUDIO_WINDOW, crossfade_s)
|
| 463 |
print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
|
|
@@ -543,22 +542,21 @@ HUNYUAN_MAX_DUR = 15.0 # seconds
|
|
| 543 |
|
| 544 |
|
| 545 |
def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
|
| 546 |
-
guidance_scale, num_steps, model_size,
|
| 547 |
-
|
| 548 |
-
"""Pre-GPU callable: returns the GPU seconds to reserve for this HunyuanFoley run."""
|
| 549 |
try:
|
| 550 |
total_s = get_video_duration(video_file)
|
| 551 |
n_segs = len(_build_segments(total_s, HUNYUAN_MAX_DUR, float(crossfade_s)))
|
| 552 |
except Exception:
|
| 553 |
n_segs = 1
|
| 554 |
secs = int(num_samples) * n_segs * int(num_steps) * HUNYUAN_SECS_PER_STEP + HUNYUAN_LOAD_OVERHEAD
|
|
|
|
| 555 |
return max(60, int(secs))
|
| 556 |
|
| 557 |
|
| 558 |
@spaces.GPU(duration=_hunyuan_duration)
|
| 559 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 560 |
-
guidance_scale, num_steps, model_size,
|
| 561 |
-
crossfade_s=2.0, crossfade_db=3.0):
|
| 562 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
|
| 563 |
import sys as _sys
|
| 564 |
# Ensure HunyuanVideo-Foley package is importable
|
|
@@ -572,6 +570,8 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 572 |
|
| 573 |
seed_val = int(seed_val)
|
| 574 |
num_samples = int(num_samples)
|
|
|
|
|
|
|
| 575 |
if seed_val >= 0:
|
| 576 |
set_global_seed(seed_val)
|
| 577 |
|
|
@@ -602,8 +602,6 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 602 |
# HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
|
| 603 |
# input into overlapping segments, generate audio for each, then crossfade-
|
| 604 |
# stitch the results into a single full-length audio track.
|
| 605 |
-
crossfade_s = float(crossfade_s)
|
| 606 |
-
crossfade_db = float(crossfade_db)
|
| 607 |
total_dur_s = get_video_duration(video_file)
|
| 608 |
segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
|
| 609 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
|
@@ -812,8 +810,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 812 |
)
|
| 813 |
|
| 814 |
def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
|
| 815 |
-
return _unpack_outputs(generate_mmaudio(video, prompt, neg, seed, cfg, steps, n,
|
| 816 |
-
crossfade_s=cf_dur, crossfade_db=cf_db), n)
|
| 817 |
|
| 818 |
mma_btn.click(
|
| 819 |
fn=_run_mmaudio,
|
|
@@ -850,8 +847,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 850 |
)
|
| 851 |
|
| 852 |
def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
|
| 853 |
-
return _unpack_outputs(generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, n,
|
| 854 |
-
crossfade_s=cf_dur, crossfade_db=cf_db), n)
|
| 855 |
|
| 856 |
hf_btn.click(
|
| 857 |
fn=_run_hunyuan,
|
|
|
|
| 185 |
|
| 186 |
def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
                   crossfade_s, crossfade_db, num_samples):
    """Pre-GPU callable — must match _run_taro's input order exactly.

    Estimates the GPU seconds to reserve: per-step cost scaled by the number
    of samples, segments, and diffusion steps, plus model-load overhead.
    """
    n_segs = 1  # safe fallback when the video can't be probed yet
    try:
        total_s = get_video_duration(video_file)
        n_segs = len(_build_segments(total_s, TARO_MODEL_DUR, float(crossfade_s)))
    except Exception:
        pass
    secs = TARO_LOAD_OVERHEAD + TARO_SECS_PER_STEP * int(num_steps) * n_segs * int(num_samples)
    # Diagnostic: shows up in the Space logs so the actual reservation is auditable.
    print(f"[duration] TARO: {int(num_samples)}s × {n_segs}seg × {int(num_steps)}steps = {secs:.0f}s reserved")
    # Never reserve less than one minute.
    return max(60, int(secs))
|
| 197 |
|
| 198 |
|
|
|
|
| 392 |
|
| 393 |
|
| 394 |
def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
                      cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
    """Pre-GPU callable — must match _run_mmaudio's input order exactly.

    Returns the GPU seconds to reserve for an MMAudio run: per-step cost
    scaled by samples, 8 s segments, and steps, plus model-load overhead.
    """
    n_segs = 1  # safe fallback when the video can't be probed yet
    try:
        total_s = get_video_duration(video_file)
        n_segs = len(_build_segments(total_s, MMAUDIO_WINDOW, float(crossfade_s)))
    except Exception:
        pass
    secs = MMAUDIO_LOAD_OVERHEAD + MMAUDIO_SECS_PER_STEP * int(num_steps) * n_segs * int(num_samples)
    # Diagnostic: shows up in the Space logs so the actual reservation is auditable.
    print(f"[duration] MMAudio: {int(num_samples)}s × {n_segs}seg × {int(num_steps)}steps = {secs:.0f}s reserved")
    # Never reserve less than one minute.
    return max(60, int(secs))
|
| 405 |
|
| 406 |
|
| 407 |
@spaces.GPU(duration=_mmaudio_duration)
|
| 408 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 409 |
+
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
|
|
|
|
| 410 |
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
|
| 411 |
import sys as _sys, os as _os
|
| 412 |
_mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
|
| 413 |
if _mmaudio_dir not in _sys.path:
|
| 414 |
_sys.path.insert(0, _mmaudio_dir)
|
| 415 |
|
|
|
|
| 416 |
from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
|
| 417 |
from mmaudio.model.flow_matching import FlowMatching
|
| 418 |
from mmaudio.model.networks import get_my_mmaudio
|
| 419 |
from mmaudio.model.utils.features_utils import FeaturesUtils
|
| 420 |
|
| 421 |
+
seed_val = int(seed_val)
|
| 422 |
+
num_samples = int(num_samples)
|
| 423 |
+
crossfade_s = float(crossfade_s)
|
| 424 |
+
crossfade_db = float(crossfade_db)
|
| 425 |
|
| 426 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 427 |
dtype = torch.bfloat16
|
|
|
|
| 457 |
|
| 458 |
# MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
|
| 459 |
# with a crossfade overlap and stitch the results into a full-length track.
|
|
|
|
|
|
|
| 460 |
total_dur_s = get_video_duration(video_file)
|
| 461 |
segments = _build_segments(total_dur_s, MMAUDIO_WINDOW, crossfade_s)
|
| 462 |
print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
|
|
|
|
| 542 |
|
| 543 |
|
| 544 |
def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
                      guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
    """Pre-GPU callable — must match _run_hunyuan's input order exactly.

    Returns the GPU seconds to reserve for a HunyuanFoley run: per-step cost
    scaled by samples, 15 s segments, and steps, plus model-load overhead.
    """
    n_segs = 1  # safe fallback when the video can't be probed yet
    try:
        total_s = get_video_duration(video_file)
        n_segs = len(_build_segments(total_s, HUNYUAN_MAX_DUR, float(crossfade_s)))
    except Exception:
        pass
    secs = HUNYUAN_LOAD_OVERHEAD + HUNYUAN_SECS_PER_STEP * int(num_steps) * n_segs * int(num_samples)
    # Diagnostic: shows up in the Space logs so the actual reservation is auditable.
    print(f"[duration] HunyuanFoley: {int(num_samples)}s × {n_segs}seg × {int(num_steps)}steps = {secs:.0f}s reserved")
    # Never reserve less than one minute.
    return max(60, int(secs))
|
| 555 |
|
| 556 |
|
| 557 |
@spaces.GPU(duration=_hunyuan_duration)
|
| 558 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 559 |
+
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
|
|
|
| 560 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
|
| 561 |
import sys as _sys
|
| 562 |
# Ensure HunyuanVideo-Foley package is importable
|
|
|
|
| 570 |
|
| 571 |
seed_val = int(seed_val)
|
| 572 |
num_samples = int(num_samples)
|
| 573 |
+
crossfade_s = float(crossfade_s)
|
| 574 |
+
crossfade_db = float(crossfade_db)
|
| 575 |
if seed_val >= 0:
|
| 576 |
set_global_seed(seed_val)
|
| 577 |
|
|
|
|
| 602 |
# HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
|
| 603 |
# input into overlapping segments, generate audio for each, then crossfade-
|
| 604 |
# stitch the results into a single full-length audio track.
|
|
|
|
|
|
|
| 605 |
total_dur_s = get_video_duration(video_file)
|
| 606 |
segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
|
| 607 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
|
|
|
| 810 |
)
|
| 811 |
|
| 812 |
def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
    # Forward the UI inputs positionally — this order must match both
    # generate_mmaudio's signature and the _mmaudio_duration callable.
    outputs = generate_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n)
    return _unpack_outputs(outputs, n)
|
|
|
|
| 814 |
|
| 815 |
mma_btn.click(
|
| 816 |
fn=_run_mmaudio,
|
|
|
|
| 847 |
)
|
| 848 |
|
| 849 |
def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
    # Forward the UI inputs positionally — this order must match both
    # generate_hunyuan's signature and the _hunyuan_duration callable.
    outputs = generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n)
    return _unpack_outputs(outputs, n)
|
|
|
|
| 851 |
|
| 852 |
hf_btn.click(
|
| 853 |
fn=_run_hunyuan,
|