Spaces:
Running on Zero
Running on Zero
Commit ·
07afd9c
1
Parent(s): 8697d49
Dynamic GPU duration: reserve only what each run actually needs
Browse files
Replace fixed @spaces.GPU(duration=600) on all three models with per-call
duration callables (_taro_duration, _mmaudio_duration, _hunyuan_duration).
Each callable runs before GPU allocation (no GPU cost) and computes:
num_samples x num_segments x num_steps x secs_per_step + load_overhead
num_segments is derived from the actual video duration + crossfade setting,
so a 5s clip with 1 sample reserves ~60s instead of 600s. Worst-case long
videos with many samples still get the time they need.
Per-step estimates: TARO=2.5s, MMAudio=2.5s, HunyuanFoley=5.0s.
Load overheads: TARO=30s, MMAudio=20s, HunyuanFoley=30s. Floor is 60s.
app.py
CHANGED
|
@@ -167,6 +167,12 @@ TARO_TRUNCATE_ONSET = 120
|
|
| 167 |
TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
|
| 168 |
TARO_SECS_PER_STEP = 2.5 # estimated GPU-seconds per diffusion step
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
_TARO_INFERENCE_CACHE: dict = {}
|
| 171 |
|
| 172 |
|
|
@@ -177,6 +183,18 @@ def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: floa
|
|
| 177 |
return max(1, min(max_s, MAX_SLOTS))
|
| 178 |
|
| 179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
def _taro_infer_segment(
|
| 181 |
model, vae, vocoder,
|
| 182 |
cavp_feats_full, onset_feats_full,
|
|
@@ -246,7 +264,7 @@ def _stitch_wavs(wavs: list, crossfade_s: float, db_boost: float,
|
|
| 246 |
return out[:int(round(total_dur_s * sr))]
|
| 247 |
|
| 248 |
|
| 249 |
-
@spaces.GPU(duration=
|
| 250 |
def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 251 |
crossfade_s, crossfade_db, num_samples):
|
| 252 |
"""TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window."""
|
|
@@ -371,7 +389,21 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 371 |
|
| 372 |
MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
|
| 373 |
|
| 374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 376 |
cfg_strength, num_steps, num_samples,
|
| 377 |
crossfade_s=1.0, crossfade_db=3.0):
|
|
@@ -509,7 +541,21 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 509 |
|
| 510 |
HUNYUAN_MAX_DUR = 15.0 # seconds
|
| 511 |
|
| 512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 514 |
guidance_scale, num_steps, model_size, num_samples,
|
| 515 |
crossfade_s=2.0, crossfade_db=3.0):
|
|
|
|
| 167 |
TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
|
| 168 |
TARO_SECS_PER_STEP = 2.5 # estimated GPU-seconds per diffusion step
|
| 169 |
|
| 170 |
+
TARO_LOAD_OVERHEAD = 30 # seconds: model load + CAVP feature extraction
|
| 171 |
+
MMAUDIO_SECS_PER_STEP = 2.5 # estimated GPU-seconds per flow-matching step
|
| 172 |
+
MMAUDIO_LOAD_OVERHEAD = 20
|
| 173 |
+
HUNYUAN_SECS_PER_STEP = 5.0 # estimated GPU-seconds per denoising step (heavier model)
|
| 174 |
+
HUNYUAN_LOAD_OVERHEAD = 30
|
| 175 |
+
|
| 176 |
_TARO_INFERENCE_CACHE: dict = {}
|
| 177 |
|
| 178 |
|
|
|
|
| 183 |
return max(1, min(max_s, MAX_SLOTS))
|
| 184 |
|
| 185 |
|
| 186 |
+
def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 187 |
+
crossfade_s, crossfade_db, num_samples):
|
| 188 |
+
"""Pre-GPU callable: returns the GPU seconds to reserve for this TARO run."""
|
| 189 |
+
try:
|
| 190 |
+
total_s = get_video_duration(video_file)
|
| 191 |
+
n_segs = len(_build_segments(total_s, TARO_MODEL_DUR, float(crossfade_s)))
|
| 192 |
+
except Exception:
|
| 193 |
+
n_segs = 1
|
| 194 |
+
secs = int(num_samples) * n_segs * int(num_steps) * TARO_SECS_PER_STEP + TARO_LOAD_OVERHEAD
|
| 195 |
+
return max(60, int(secs))
|
| 196 |
+
|
| 197 |
+
|
| 198 |
def _taro_infer_segment(
|
| 199 |
model, vae, vocoder,
|
| 200 |
cavp_feats_full, onset_feats_full,
|
|
|
|
| 264 |
return out[:int(round(total_dur_s * sr))]
|
| 265 |
|
| 266 |
|
| 267 |
+
@spaces.GPU(duration=_taro_duration)
|
| 268 |
def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 269 |
crossfade_s, crossfade_db, num_samples):
|
| 270 |
"""TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window."""
|
|
|
|
| 389 |
|
| 390 |
MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
|
| 391 |
|
| 392 |
+
|
| 393 |
+
def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
                      cfg_strength, num_steps, num_samples,
                      crossfade_s=1.0, crossfade_db=3.0):
    """Pre-GPU callable: returns the GPU seconds to reserve for this MMAudio run.

    Runs before GPU allocation (no GPU cost), so it must never raise: if the
    video cannot be probed, fall back to a single segment estimate.
    """
    try:
        clip_len = get_video_duration(video_file)
        segments = _build_segments(clip_len, MMAUDIO_WINDOW, float(crossfade_s))
        segment_count = len(segments)
    except Exception:
        segment_count = 1  # best-effort: probing failed, assume one window
    estimate = (int(num_samples) * segment_count * int(num_steps)
                * MMAUDIO_SECS_PER_STEP + MMAUDIO_LOAD_OVERHEAD)
    return max(60, int(estimate))  # never reserve below the 60 s floor
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
@spaces.GPU(duration=_mmaudio_duration)
|
| 407 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 408 |
cfg_strength, num_steps, num_samples,
|
| 409 |
crossfade_s=1.0, crossfade_db=3.0):
|
|
|
|
| 541 |
|
| 542 |
HUNYUAN_MAX_DUR = 15.0 # seconds
|
| 543 |
|
| 544 |
+
|
| 545 |
+
def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
                      guidance_scale, num_steps, model_size, num_samples,
                      crossfade_s=2.0, crossfade_db=3.0):
    """Pre-GPU callable: returns the GPU seconds to reserve for this HunyuanFoley run.

    Runs before GPU allocation (no GPU cost), so it must never raise: if the
    video cannot be probed, fall back to a single segment estimate.
    """
    try:
        clip_len = get_video_duration(video_file)
        segments = _build_segments(clip_len, HUNYUAN_MAX_DUR, float(crossfade_s))
        segment_count = len(segments)
    except Exception:
        segment_count = 1  # best-effort: probing failed, assume one window
    estimate = (int(num_samples) * segment_count * int(num_steps)
                * HUNYUAN_SECS_PER_STEP + HUNYUAN_LOAD_OVERHEAD)
    return max(60, int(estimate))  # never reserve below the 60 s floor
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
@spaces.GPU(duration=_hunyuan_duration)
|
| 559 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 560 |
guidance_scale, num_steps, model_size, num_samples,
|
| 561 |
crossfade_s=2.0, crossfade_db=3.0):
|