BoxOfColors committed on
Commit
a4f4e65
·
1 Parent(s): 5aeadc9

Calibrate GPU duration constants from measured H200 timings

Browse files
Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -166,13 +166,13 @@ TARO_FPS = 4
166
  TARO_TRUNCATE_FRAME = int(TARO_FPS * TARO_TRUNCATE / TARO_SR) # 32
167
  TARO_TRUNCATE_ONSET = 120
168
  TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
169
- TARO_SECS_PER_STEP = 0.8 # estimated GPU-seconds per diffusion step on H200
170
 
171
- TARO_LOAD_OVERHEAD = 20 # seconds: model load + CAVP feature extraction
172
- MMAUDIO_SECS_PER_STEP = 0.8 # estimated GPU-seconds per flow-matching step on H200
173
  MMAUDIO_LOAD_OVERHEAD = 15
174
- HUNYUAN_SECS_PER_STEP = 2.0 # estimated GPU-seconds per denoising step on H200 (heavier model)
175
- HUNYUAN_LOAD_OVERHEAD = 20
176
  GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than this
177
 
178
  _TARO_INFERENCE_CACHE: dict = {}
@@ -778,7 +778,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
778
  with gr.Column():
779
  taro_video = gr.Video(label="Input Video")
780
  taro_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
781
- taro_cfg = gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5)
782
  taro_steps = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1)
783
  taro_mode = gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde")
784
  taro_cf_dur = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1)
 
166
  TARO_TRUNCATE_FRAME = int(TARO_FPS * TARO_TRUNCATE / TARO_SR) # 32
167
  TARO_TRUNCATE_ONSET = 120
168
  TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
169
+ TARO_SECS_PER_STEP = 0.05 # measured 0.043s/step on H200 (8.2s video, 2 segs × 25 steps = 2.2s wall)
170
 
171
+ TARO_LOAD_OVERHEAD = 15 # seconds: model load + CAVP feature extraction
172
+ MMAUDIO_SECS_PER_STEP = 0.25 # measured 0.230s/step on H200 (8.3s video, 2 segs × 25 steps = 11.5s wall)
173
  MMAUDIO_LOAD_OVERHEAD = 15
174
+ HUNYUAN_SECS_PER_STEP = 0.35 # measured 0.328s/step on H200 (8.3s video, 1 seg × 50 steps = 16.4s wall)
175
+ HUNYUAN_LOAD_OVERHEAD = 55 # ~55s to load the 10GB XXL model weights into GPU
176
  GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than this
177
 
178
  _TARO_INFERENCE_CACHE: dict = {}
 
778
  with gr.Column():
779
  taro_video = gr.Video(label="Input Video")
780
  taro_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
781
+ taro_cfg = gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=7.5, step=0.5)
782
  taro_steps = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1)
783
  taro_mode = gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde")
784
  taro_cf_dur = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1)