Spaces:
Running on Zero
Running on Zero
Commit ·
39d7b17
1
Parent(s): 679c604
Lower GPU duration estimates and add 300s hard cap
Browse files
SECS_PER_STEP estimates were calibrated for a slower GPU. H200 is much faster:
TARO: 2.5 → 0.8 s/step
MMAudio: 2.5 → 0.8 s/step
HunyuanFoley: 5.0 → 2.0 s/step
Load overheads trimmed similarly (TARO 30→20, MMAudio 20→15, Hunyuan 30→20).
Add GPU_DURATION_CAP=300s — a single call can never reserve more than 5 min
regardless of video length or sample count, preventing quota exhaustion on
long videos. The diagnostic print now shows both the raw calc and capped value.
56s video / 1 sample / 25 steps: was 592s reserved, now 200s.
app.py
CHANGED
|
@@ -165,13 +165,14 @@ TARO_FPS = 4
|
|
| 165 |
TARO_TRUNCATE_FRAME = int(TARO_FPS * TARO_TRUNCATE / TARO_SR) # 32
|
| 166 |
TARO_TRUNCATE_ONSET = 120
|
| 167 |
TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
|
| 168 |
-
TARO_SECS_PER_STEP =
|
| 169 |
|
| 170 |
-
TARO_LOAD_OVERHEAD
|
| 171 |
-
MMAUDIO_SECS_PER_STEP =
|
| 172 |
-
MMAUDIO_LOAD_OVERHEAD =
|
| 173 |
-
HUNYUAN_SECS_PER_STEP =
|
| 174 |
-
HUNYUAN_LOAD_OVERHEAD =
|
|
|
|
| 175 |
|
| 176 |
_TARO_INFERENCE_CACHE: dict = {}
|
| 177 |
|
|
@@ -192,8 +193,9 @@ def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 192 |
except Exception:
|
| 193 |
n_segs = 1
|
| 194 |
secs = int(num_samples) * n_segs * int(num_steps) * TARO_SECS_PER_STEP + TARO_LOAD_OVERHEAD
|
| 195 |
-
|
| 196 |
-
|
|
|
|
| 197 |
|
| 198 |
|
| 199 |
def _taro_infer_segment(
|
|
@@ -400,8 +402,9 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
|
|
| 400 |
except Exception:
|
| 401 |
n_segs = 1
|
| 402 |
secs = int(num_samples) * n_segs * int(num_steps) * MMAUDIO_SECS_PER_STEP + MMAUDIO_LOAD_OVERHEAD
|
| 403 |
-
|
| 404 |
-
|
|
|
|
| 405 |
|
| 406 |
|
| 407 |
@spaces.GPU(duration=_mmaudio_duration)
|
|
@@ -550,8 +553,9 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
|
|
| 550 |
except Exception:
|
| 551 |
n_segs = 1
|
| 552 |
secs = int(num_samples) * n_segs * int(num_steps) * HUNYUAN_SECS_PER_STEP + HUNYUAN_LOAD_OVERHEAD
|
| 553 |
-
|
| 554 |
-
|
|
|
|
| 555 |
|
| 556 |
|
| 557 |
@spaces.GPU(duration=_hunyuan_duration)
|
|
|
|
| 165 |
TARO_TRUNCATE_FRAME = int(TARO_FPS * TARO_TRUNCATE / TARO_SR) # 32
|
| 166 |
TARO_TRUNCATE_ONSET = 120
|
| 167 |
TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
|
| 168 |
+
TARO_SECS_PER_STEP = 0.8 # estimated GPU-seconds per diffusion step on H200
|
| 169 |
|
| 170 |
+
TARO_LOAD_OVERHEAD = 20 # seconds: model load + CAVP feature extraction
|
| 171 |
+
MMAUDIO_SECS_PER_STEP = 0.8 # estimated GPU-seconds per flow-matching step on H200
|
| 172 |
+
MMAUDIO_LOAD_OVERHEAD = 15
|
| 173 |
+
HUNYUAN_SECS_PER_STEP = 2.0 # estimated GPU-seconds per denoising step on H200 (heavier model)
|
| 174 |
+
HUNYUAN_LOAD_OVERHEAD = 20
|
| 175 |
+
GPU_DURATION_CAP = 300  # hard cap per call — never reserve more than this
|
| 176 |
|
| 177 |
_TARO_INFERENCE_CACHE: dict = {}
|
| 178 |
|
|
|
|
| 193 |
except Exception:
|
| 194 |
n_segs = 1
|
| 195 |
secs = int(num_samples) * n_segs * int(num_steps) * TARO_SECS_PER_STEP + TARO_LOAD_OVERHEAD
|
| 196 |
+
result = min(GPU_DURATION_CAP, max(60, int(secs)))
|
| 197 |
+
print(f"[duration] TARO: {int(num_samples)}samp Γ {n_segs}seg Γ {int(num_steps)}steps β {secs:.0f}s β capped {result}s")
|
| 198 |
+
return result
|
| 199 |
|
| 200 |
|
| 201 |
def _taro_infer_segment(
|
|
|
|
| 402 |
except Exception:
|
| 403 |
n_segs = 1
|
| 404 |
secs = int(num_samples) * n_segs * int(num_steps) * MMAUDIO_SECS_PER_STEP + MMAUDIO_LOAD_OVERHEAD
|
| 405 |
+
result = min(GPU_DURATION_CAP, max(60, int(secs)))
|
| 406 |
+
print(f"[duration] MMAudio: {int(num_samples)}samp Γ {n_segs}seg Γ {int(num_steps)}steps β {secs:.0f}s β capped {result}s")
|
| 407 |
+
return result
|
| 408 |
|
| 409 |
|
| 410 |
@spaces.GPU(duration=_mmaudio_duration)
|
|
|
|
| 553 |
except Exception:
|
| 554 |
n_segs = 1
|
| 555 |
secs = int(num_samples) * n_segs * int(num_steps) * HUNYUAN_SECS_PER_STEP + HUNYUAN_LOAD_OVERHEAD
|
| 556 |
+
result = min(GPU_DURATION_CAP, max(60, int(secs)))
|
| 557 |
+
print(f"[duration] HunyuanFoley: {int(num_samples)}samp Γ {n_segs}seg Γ {int(num_steps)}steps β {secs:.0f}s β capped {result}s")
|
| 558 |
+
return result
|
| 559 |
|
| 560 |
|
| 561 |
@spaces.GPU(duration=_hunyuan_duration)
|