Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

BoxOfColors committed 4 days ago

Commit

15539fe

1 Parent(s): d9929fb

fix: move MMAUDIO_WINDOW and HUNYUAN_MAX_DUR before MODEL_CONFIGS

Both constants were defined after MODEL_CONFIGS referenced them,
causing NameError on startup. Moved them up alongside the other
per-model constants and removed the now-duplicate definitions.

Files changed (1) hide show

app.py +2 -2

app.py CHANGED Viewed

@@ -299,8 +299,10 @@ TARO_MODEL_DUR     = TARO_TRUNCATE / TARO_SR                    # 8.192 s
 TARO_SECS_PER_STEP = 0.05  # measured 0.043s/step on H200 (8.2s video, 2 segs × 25 steps = 2.2s wall)
 TARO_LOAD_OVERHEAD     = 15    # seconds: model load + CAVP feature extraction
 MMAUDIO_SECS_PER_STEP  = 0.25  # measured 0.230s/step on H200 (8.3s video, 2 segs × 25 steps = 11.5s wall)
 MMAUDIO_LOAD_OVERHEAD  = 15
 HUNYUAN_SECS_PER_STEP  = 0.35  # measured 0.328s/step on H200 (8.3s video, 1 seg × 50 steps = 16.4s wall)
 HUNYUAN_LOAD_OVERHEAD  = 55    # ~55s to load the 10GB XXL model weights into GPU
 GPU_DURATION_CAP       = 300   # hard cap per call — never reserve more than this
@@ -610,7 +612,6 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
 # generate() handles all feature extraction + decoding internally.
 # ================================================================== #
-MMAUDIO_WINDOW = 8.0   # seconds — MMAudio's fixed generation window
 def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
@@ -780,7 +781,6 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
 #   Default guidance_scale=4.5, num_inference_steps=50
 # ================================================================== #
-HUNYUAN_MAX_DUR = 15.0   # seconds
 def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,

 TARO_SECS_PER_STEP = 0.05  # measured 0.043s/step on H200 (8.2s video, 2 segs × 25 steps = 2.2s wall)
 TARO_LOAD_OVERHEAD     = 15    # seconds: model load + CAVP feature extraction
+MMAUDIO_WINDOW         = 8.0   # seconds — MMAudio's fixed generation window
 MMAUDIO_SECS_PER_STEP  = 0.25  # measured 0.230s/step on H200 (8.3s video, 2 segs × 25 steps = 11.5s wall)
 MMAUDIO_LOAD_OVERHEAD  = 15
+HUNYUAN_MAX_DUR        = 15.0  # seconds — HunyuanFoley max video duration
 HUNYUAN_SECS_PER_STEP  = 0.35  # measured 0.328s/step on H200 (8.3s video, 1 seg × 50 steps = 16.4s wall)
 HUNYUAN_LOAD_OVERHEAD  = 55    # ~55s to load the 10GB XXL model weights into GPU
 GPU_DURATION_CAP       = 300   # hard cap per call — never reserve more than this
 # generate() handles all feature extraction + decoding internally.
 # ================================================================== #
 def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
 #   Default guidance_scale=4.5, num_inference_steps=50
 # ================================================================== #
 def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,