Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

BoxOfColors committed 6 days ago

Commit

537e4ca

1 Parent(s): 07afd9c

Standardise crossfade defaults to 2s/3dB across all three models

The MMAudio crossfade_s default was 1.0 in the generate_mmaudio signature,
the _mmaudio_duration callable, and the UI slider — align all three to 2.0
to match TARO and HunyuanFoley.
All other model inference defaults (CFG, steps, mode) already matched the paper
recommendations and are unchanged: TARO 8.0/25/SDE, MMAudio 4.5/25, HunyuanFoley 4.5/50.

Files changed (1) hide show

app.py +3 -3

app.py CHANGED Viewed

@@ -392,7 +392,7 @@ MMAUDIO_WINDOW = 8.0   # seconds — MMAudio's fixed generation window
 def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
                       cfg_strength, num_steps, num_samples,
-                      crossfade_s=1.0, crossfade_db=3.0):
     """Pre-GPU callable: returns the GPU seconds to reserve for this MMAudio run."""
     try:
         total_s = get_video_duration(video_file)
@@ -406,7 +406,7 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
 @spaces.GPU(duration=_mmaudio_duration)
 def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
                      cfg_strength, num_steps, num_samples,
-                     crossfade_s=1.0, crossfade_db=3.0):
     """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
     import sys as _sys, os as _os
     _mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
@@ -797,7 +797,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
                     mma_seed     = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
                     mma_cfg      = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
                     mma_steps    = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
-                    mma_cf_dur   = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=1, step=0.1)
                     mma_cf_db    = gr.Textbox(label="Crossfade Boost (dB)", value="3")
                     mma_samples  = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
                     mma_btn      = gr.Button("Generate", variant="primary")

 def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
                       cfg_strength, num_steps, num_samples,
+                      crossfade_s=2.0, crossfade_db=3.0):
     """Pre-GPU callable: returns the GPU seconds to reserve for this MMAudio run."""
     try:
         total_s = get_video_duration(video_file)
 @spaces.GPU(duration=_mmaudio_duration)
 def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
                      cfg_strength, num_steps, num_samples,
+                     crossfade_s=2.0, crossfade_db=3.0):
     """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
     import sys as _sys, os as _os
     _mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
                     mma_seed     = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
                     mma_cfg      = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
                     mma_steps    = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
+                    mma_cf_dur   = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1)
                     mma_cf_db    = gr.Textbox(label="Crossfade Boost (dB)", value="3")
                     mma_samples  = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
                     mma_btn      = gr.Button("Generate", variant="primary")