BoxOfColors committed on
Commit
a4f4e65
·
1 Parent(s): 5aeadc9

Calibrate GPU duration constants from measured H200 timings

Browse files
Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -166,13 +166,13 @@ TARO_FPS = 4
166
  TARO_TRUNCATE_FRAME = int(TARO_FPS * TARO_TRUNCATE / TARO_SR) # 32
167
  TARO_TRUNCATE_ONSET = 120
168
  TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
169
- TARO_SECS_PER_STEP = 0.8 # estimated GPU-seconds per diffusion step on H200
170
 
171
- TARO_LOAD_OVERHEAD = 20 # seconds: model load + CAVP feature extraction
172
- MMAUDIO_SECS_PER_STEP = 0.8 # estimated GPU-seconds per flow-matching step on H200
173
  MMAUDIO_LOAD_OVERHEAD = 15
174
- HUNYUAN_SECS_PER_STEP = 2.0 # estimated GPU-seconds per denoising step on H200 (heavier model)
175
- HUNYUAN_LOAD_OVERHEAD = 20
176
  GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than this
177
 
178
  _TARO_INFERENCE_CACHE: dict = {}
@@ -778,7 +778,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
778
  with gr.Column():
779
  taro_video = gr.Video(label="Input Video")
780
  taro_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
781
- taro_cfg = gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5)
782
  taro_steps = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1)
783
  taro_mode = gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde")
784
  taro_cf_dur = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1)
 
166
  TARO_TRUNCATE_FRAME = int(TARO_FPS * TARO_TRUNCATE / TARO_SR) # 32
167
  TARO_TRUNCATE_ONSET = 120
168
  TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
169
+ TARO_SECS_PER_STEP = 0.05 # measured 0.043s/step on H200 (8.2s video, 2 segs × 25 steps = 2.2s wall)
170
 
171
+ TARO_LOAD_OVERHEAD = 15 # seconds: model load + CAVP feature extraction
172
+ MMAUDIO_SECS_PER_STEP = 0.25 # measured 0.230s/step on H200 (8.3s video, 2 segs × 25 steps = 11.5s wall)
173
  MMAUDIO_LOAD_OVERHEAD = 15
174
+ HUNYUAN_SECS_PER_STEP = 0.35 # measured 0.328s/step on H200 (8.3s video, 1 seg × 50 steps = 16.4s wall)
175
+ HUNYUAN_LOAD_OVERHEAD = 55 # ~55s to load the 10GB XXL model weights into GPU
176
  GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than this
177
 
178
  _TARO_INFERENCE_CACHE: dict = {}
 
778
  with gr.Column():
779
  taro_video = gr.Video(label="Input Video")
780
  taro_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
781
+ taro_cfg = gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=7.5, step=0.5)
782
  taro_steps = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1)
783
  taro_mode = gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde")
784
  taro_cf_dur = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1)