BoxOfColors committed on
Commit
07afd9c
·
1 Parent(s): 8697d49

Dynamic GPU duration: reserve only what each run actually needs

Browse files

Replace fixed @spaces.GPU(duration=600) on all three models with per-call
duration callables (_taro_duration, _mmaudio_duration, _hunyuan_duration).

Each callable runs before GPU allocation (no GPU cost) and computes:
num_samples x num_segments x num_steps x secs_per_step + load_overhead

num_segments is derived from the actual video duration + crossfade setting,
so a 5s clip with 1 sample reserves ~60s instead of 600s. Worst-case long
videos with many samples still get the time they need.

Per-step estimates: TARO=2.5s, MMAudio=2.5s, HunyuanFoley=5.0s.
Load overheads: TARO=30s, MMAudio=20s, HunyuanFoley=30s. Floor is 60s.

Files changed (1) hide show
  1. app.py +49 -3
app.py CHANGED
@@ -167,6 +167,12 @@ TARO_TRUNCATE_ONSET = 120
167
  TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
168
  TARO_SECS_PER_STEP = 2.5 # estimated GPU-seconds per diffusion step
169
 
 
 
 
 
 
 
170
  _TARO_INFERENCE_CACHE: dict = {}
171
 
172
 
@@ -177,6 +183,18 @@ def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: floa
177
  return max(1, min(max_s, MAX_SLOTS))
178
 
179
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  def _taro_infer_segment(
181
  model, vae, vocoder,
182
  cavp_feats_full, onset_feats_full,
@@ -246,7 +264,7 @@ def _stitch_wavs(wavs: list, crossfade_s: float, db_boost: float,
246
  return out[:int(round(total_dur_s * sr))]
247
 
248
 
249
- @spaces.GPU(duration=600)
250
  def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
251
  crossfade_s, crossfade_db, num_samples):
252
  """TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window."""
@@ -371,7 +389,21 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
371
 
372
  MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
373
 
374
- @spaces.GPU(duration=600)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
376
  cfg_strength, num_steps, num_samples,
377
  crossfade_s=1.0, crossfade_db=3.0):
@@ -509,7 +541,21 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
509
 
510
  HUNYUAN_MAX_DUR = 15.0 # seconds
511
 
512
- @spaces.GPU(duration=600)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
514
  guidance_scale, num_steps, model_size, num_samples,
515
  crossfade_s=2.0, crossfade_db=3.0):
 
167
  TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
168
  TARO_SECS_PER_STEP = 2.5 # estimated GPU-seconds per diffusion step
169
 
170
+ TARO_LOAD_OVERHEAD = 30 # seconds: model load + CAVP feature extraction
171
+ MMAUDIO_SECS_PER_STEP = 2.5 # estimated GPU-seconds per flow-matching step
172
+ MMAUDIO_LOAD_OVERHEAD = 20
173
+ HUNYUAN_SECS_PER_STEP = 5.0 # estimated GPU-seconds per denoising step (heavier model)
174
+ HUNYUAN_LOAD_OVERHEAD = 30
175
+
176
  _TARO_INFERENCE_CACHE: dict = {}
177
 
178
 
 
183
  return max(1, min(max_s, MAX_SLOTS))
184
 
185
 
186
def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
                   crossfade_s, crossfade_db, num_samples):
    """Estimate how many GPU seconds to reserve for one TARO request.

    Used as the ``duration=`` callable of ``@spaces.GPU`` on
    ``generate_taro``, so it accepts exactly the same arguments. Only
    ``video_file``, ``crossfade_s``, ``num_steps`` and ``num_samples``
    influence the result; the remaining parameters are accepted and ignored.

    The estimate is
    ``samples * segments * steps * TARO_SECS_PER_STEP + TARO_LOAD_OVERHEAD``,
    truncated to an int and never lower than 60.

    Returns:
        int: number of seconds to reserve (at least 60).
    """
    try:
        clip_len = get_video_duration(video_file)
        windows = _build_segments(clip_len, TARO_MODEL_DUR, float(crossfade_s))
        segment_count = len(windows)
    except Exception:
        # Best-effort: if the segment count cannot be computed, budget for a
        # single segment rather than failing the request here.
        segment_count = 1

    total_steps = int(num_samples) * segment_count * int(num_steps)
    estimate = total_steps * TARO_SECS_PER_STEP + TARO_LOAD_OVERHEAD
    reserved = int(estimate)
    # Enforce the 60-second floor.
    return reserved if reserved > 60 else 60
196
+
197
+
198
  def _taro_infer_segment(
199
  model, vae, vocoder,
200
  cavp_feats_full, onset_feats_full,
 
264
  return out[:int(round(total_dur_s * sr))]
265
 
266
 
267
+ @spaces.GPU(duration=_taro_duration)
268
  def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
269
  crossfade_s, crossfade_db, num_samples):
270
  """TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window."""
 
389
 
390
  MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
391
 
392
+
393
def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
                      cfg_strength, num_steps, num_samples,
                      crossfade_s=1.0, crossfade_db=3.0):
    """Estimate how many GPU seconds to reserve for one MMAudio request.

    Used as the ``duration=`` callable of ``@spaces.GPU`` on
    ``generate_mmaudio``, so it accepts exactly the same arguments. Only
    ``video_file``, ``crossfade_s``, ``num_steps`` and ``num_samples``
    influence the result; the remaining parameters are accepted and ignored.

    The estimate is
    ``samples * segments * steps * MMAUDIO_SECS_PER_STEP + MMAUDIO_LOAD_OVERHEAD``,
    truncated to an int and never lower than 60.

    Returns:
        int: number of seconds to reserve (at least 60).
    """
    try:
        clip_len = get_video_duration(video_file)
        windows = _build_segments(clip_len, MMAUDIO_WINDOW, float(crossfade_s))
        segment_count = len(windows)
    except Exception:
        # Best-effort: if the segment count cannot be computed, budget for a
        # single segment rather than failing the request here.
        segment_count = 1

    total_steps = int(num_samples) * segment_count * int(num_steps)
    estimate = total_steps * MMAUDIO_SECS_PER_STEP + MMAUDIO_LOAD_OVERHEAD
    reserved = int(estimate)
    # Enforce the 60-second floor.
    return reserved if reserved > 60 else 60
404
+
405
+
406
+ @spaces.GPU(duration=_mmaudio_duration)
407
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
408
  cfg_strength, num_steps, num_samples,
409
  crossfade_s=1.0, crossfade_db=3.0):
 
541
 
542
  HUNYUAN_MAX_DUR = 15.0 # seconds
543
 
544
+
545
def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
                      guidance_scale, num_steps, model_size, num_samples,
                      crossfade_s=2.0, crossfade_db=3.0):
    """Estimate how many GPU seconds to reserve for one HunyuanFoley request.

    Used as the ``duration=`` callable of ``@spaces.GPU`` on
    ``generate_hunyuan``, so it accepts exactly the same arguments. Only
    ``video_file``, ``crossfade_s``, ``num_steps`` and ``num_samples``
    influence the result; the remaining parameters are accepted and ignored.

    The estimate is
    ``samples * segments * steps * HUNYUAN_SECS_PER_STEP + HUNYUAN_LOAD_OVERHEAD``,
    truncated to an int and never lower than 60.

    Returns:
        int: number of seconds to reserve (at least 60).
    """
    try:
        clip_len = get_video_duration(video_file)
        windows = _build_segments(clip_len, HUNYUAN_MAX_DUR, float(crossfade_s))
        segment_count = len(windows)
    except Exception:
        # Best-effort: if the segment count cannot be computed, budget for a
        # single segment rather than failing the request here.
        segment_count = 1

    total_steps = int(num_samples) * segment_count * int(num_steps)
    estimate = total_steps * HUNYUAN_SECS_PER_STEP + HUNYUAN_LOAD_OVERHEAD
    reserved = int(estimate)
    # Enforce the 60-second floor.
    return reserved if reserved > 60 else 60
556
+
557
+
558
+ @spaces.GPU(duration=_hunyuan_duration)
559
  def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
560
  guidance_scale, num_steps, model_size, num_samples,
561
  crossfade_s=2.0, crossfade_db=3.0):