Spaces:
Running on Zero
Running on Zero
Commit ·
07afd9c
1
Parent(s): 8697d49
Dynamic GPU duration: reserve only what each run actually needs
Browse files
Replace fixed @spaces.GPU(duration=600) on all three models with per-call
duration callables (_taro_duration, _mmaudio_duration, _hunyuan_duration).
Each callable runs before GPU allocation (no GPU cost) and computes:
num_samples x num_segments x num_steps x secs_per_step + load_overhead
num_segments is derived from the actual video duration + crossfade setting,
so a 5s clip with 1 sample reserves ~60s instead of 600s. Worst-case long
videos with many samples still get the time they need.
Per-step estimates: TARO=2.5s, MMAudio=2.5s, HunyuanFoley=5.0s.
Load overheads: TARO=30s, MMAudio=20s, HunyuanFoley=30s. Floor is 60s.
app.py
CHANGED
|
@@ -167,6 +167,12 @@ TARO_TRUNCATE_ONSET = 120
|
|
| 167 |
TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
|
| 168 |
TARO_SECS_PER_STEP = 2.5 # estimated GPU-seconds per diffusion step
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
_TARO_INFERENCE_CACHE: dict = {}
|
| 171 |
|
| 172 |
|
|
@@ -177,6 +183,18 @@ def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: floa
|
|
| 177 |
return max(1, min(max_s, MAX_SLOTS))
|
| 178 |
|
| 179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
def _taro_infer_segment(
|
| 181 |
model, vae, vocoder,
|
| 182 |
cavp_feats_full, onset_feats_full,
|
|
@@ -246,7 +264,7 @@ def _stitch_wavs(wavs: list, crossfade_s: float, db_boost: float,
|
|
| 246 |
return out[:int(round(total_dur_s * sr))]
|
| 247 |
|
| 248 |
|
| 249 |
-
@spaces.GPU(duration=
|
| 250 |
def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 251 |
crossfade_s, crossfade_db, num_samples):
|
| 252 |
"""TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window."""
|
|
@@ -371,7 +389,21 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 371 |
|
| 372 |
MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
|
| 373 |
|
| 374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 376 |
cfg_strength, num_steps, num_samples,
|
| 377 |
crossfade_s=1.0, crossfade_db=3.0):
|
|
@@ -509,7 +541,21 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 509 |
|
| 510 |
HUNYUAN_MAX_DUR = 15.0 # seconds
|
| 511 |
|
| 512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 514 |
guidance_scale, num_steps, model_size, num_samples,
|
| 515 |
crossfade_s=2.0, crossfade_db=3.0):
|
|
|
|
| 167 |
TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
|
| 168 |
TARO_SECS_PER_STEP = 2.5 # estimated GPU-seconds per diffusion step
|
| 169 |
|
| 170 |
+
TARO_LOAD_OVERHEAD = 30 # seconds: model load + CAVP feature extraction
|
| 171 |
+
MMAUDIO_SECS_PER_STEP = 2.5 # estimated GPU-seconds per flow-matching step
|
| 172 |
+
MMAUDIO_LOAD_OVERHEAD = 20
|
| 173 |
+
HUNYUAN_SECS_PER_STEP = 5.0 # estimated GPU-seconds per denoising step (heavier model)
|
| 174 |
+
HUNYUAN_LOAD_OVERHEAD = 30
|
| 175 |
+
|
| 176 |
_TARO_INFERENCE_CACHE: dict = {}
|
| 177 |
|
| 178 |
|
|
|
|
| 183 |
return max(1, min(max_s, MAX_SLOTS))
|
| 184 |
|
| 185 |
|
| 186 |
+
def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 187 |
+
crossfade_s, crossfade_db, num_samples):
|
| 188 |
+
"""Pre-GPU callable: returns the GPU seconds to reserve for this TARO run."""
|
| 189 |
+
try:
|
| 190 |
+
total_s = get_video_duration(video_file)
|
| 191 |
+
n_segs = len(_build_segments(total_s, TARO_MODEL_DUR, float(crossfade_s)))
|
| 192 |
+
except Exception:
|
| 193 |
+
n_segs = 1
|
| 194 |
+
secs = int(num_samples) * n_segs * int(num_steps) * TARO_SECS_PER_STEP + TARO_LOAD_OVERHEAD
|
| 195 |
+
return max(60, int(secs))
|
| 196 |
+
|
| 197 |
+
|
| 198 |
def _taro_infer_segment(
|
| 199 |
model, vae, vocoder,
|
| 200 |
cavp_feats_full, onset_feats_full,
|
|
|
|
| 264 |
return out[:int(round(total_dur_s * sr))]
|
| 265 |
|
| 266 |
|
| 267 |
+
@spaces.GPU(duration=_taro_duration)
|
| 268 |
def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 269 |
crossfade_s, crossfade_db, num_samples):
|
| 270 |
"""TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window."""
|
|
|
|
| 389 |
|
| 390 |
MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
|
| 391 |
|
| 392 |
+
|
| 393 |
+
def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
                      cfg_strength, num_steps, num_samples,
                      crossfade_s=1.0, crossfade_db=3.0):
    """Pre-GPU callable: returns the GPU seconds to reserve for this MMAudio run.

    Runs before GPU allocation (no GPU cost), so it must never raise: if the
    video cannot be probed, fall back to a single segment estimate.
    """
    try:
        clip_len = get_video_duration(video_file)
        segments = _build_segments(clip_len, MMAUDIO_WINDOW, float(crossfade_s))
        segment_count = len(segments)
    except Exception:
        segment_count = 1  # best-effort: probing failed, assume one window
    estimate = (int(num_samples) * segment_count * int(num_steps)
                * MMAUDIO_SECS_PER_STEP + MMAUDIO_LOAD_OVERHEAD)
    return max(60, int(estimate))  # never reserve below the 60 s floor
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
@spaces.GPU(duration=_mmaudio_duration)
|
| 407 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 408 |
cfg_strength, num_steps, num_samples,
|
| 409 |
crossfade_s=1.0, crossfade_db=3.0):
|
|
|
|
| 541 |
|
| 542 |
HUNYUAN_MAX_DUR = 15.0 # seconds
|
| 543 |
|
| 544 |
+
|
| 545 |
+
def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
                      guidance_scale, num_steps, model_size, num_samples,
                      crossfade_s=2.0, crossfade_db=3.0):
    """Pre-GPU callable: returns the GPU seconds to reserve for this HunyuanFoley run.

    Runs before GPU allocation (no GPU cost), so it must never raise: if the
    video cannot be probed, fall back to a single segment estimate.
    """
    try:
        clip_len = get_video_duration(video_file)
        segments = _build_segments(clip_len, HUNYUAN_MAX_DUR, float(crossfade_s))
        segment_count = len(segments)
    except Exception:
        segment_count = 1  # best-effort: probing failed, assume one window
    estimate = (int(num_samples) * segment_count * int(num_steps)
                * HUNYUAN_SECS_PER_STEP + HUNYUAN_LOAD_OVERHEAD)
    return max(60, int(estimate))  # never reserve below the 60 s floor
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
@spaces.GPU(duration=_hunyuan_duration)
|
| 559 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 560 |
guidance_scale, num_steps, model_size, num_samples,
|
| 561 |
crossfade_s=2.0, crossfade_db=3.0):
|