Spaces:
Running on Zero
Running on Zero
Commit ·
95c5c55
1
Parent(s): 1c5fa8d
Add crossfade Duration/Boost controls to MMAudio and HunyuanFoley tabs
Browse files- Add Crossfade Duration (s) slider and Crossfade Boost (dB) textbox to both
MMAudio and HunyuanFoley UI tabs, matching TARO existing controls exactly
- Update generate_mmaudio() and generate_hunyuan() signatures to accept
crossfade_s and crossfade_db parameters (same defaults as before)
- Replace hardcoded MMA_CF_S/MMA_CF_DB and CF_S/CF_DB with user-supplied values
passed through _run_mmaudio and _run_hunyuan wrappers
- All 3 models now share an identical crossfade UI and the same equal-power crossfade implementation
app.py
CHANGED
|
@@ -360,7 +360,8 @@ MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
|
|
| 360 |
|
| 361 |
@spaces.GPU(duration=600)
|
| 362 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 363 |
-
cfg_strength, num_steps, num_samples
|
|
|
|
| 364 |
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
|
| 365 |
# MMAudio is a local package in ./MMAudio/ — add it to sys.path so imports work.
|
| 366 |
import sys as _sys, os as _os
|
|
@@ -412,8 +413,8 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 412 |
# MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
|
| 413 |
# with a 1 s crossfade overlap and stitch the results into a full-length track.
|
| 414 |
total_dur_s = get_video_duration(video_file)
|
| 415 |
-
MMA_CF_S =
|
| 416 |
-
MMA_CF_DB =
|
| 417 |
|
| 418 |
def _mma_build_segments(total_s, cf_s):
|
| 419 |
if total_s <= MMAUDIO_WINDOW:
|
|
@@ -524,7 +525,8 @@ HUNYUAN_MAX_DUR = 15.0 # seconds
|
|
| 524 |
|
| 525 |
@spaces.GPU(duration=600)
|
| 526 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 527 |
-
guidance_scale, num_steps, model_size, num_samples
|
|
|
|
| 528 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
|
| 529 |
import torchaudio
|
| 530 |
import sys as _sys
|
|
@@ -570,8 +572,8 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 570 |
# input into overlapping segments, generate audio for each, then crossfade-
|
| 571 |
# stitch the results into a single full-length audio track.
|
| 572 |
total_dur_s = get_video_duration(video_file)
|
| 573 |
-
CF_S =
|
| 574 |
-
CF_DB =
|
| 575 |
segments = _taro_build_segments(total_dur_s, CF_S) # reuse TARO helper
|
| 576 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 577 |
|
|
@@ -767,6 +769,8 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 767 |
mma_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
|
| 768 |
mma_cfg = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
|
| 769 |
mma_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
|
|
|
|
|
|
|
| 770 |
mma_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
|
| 771 |
mma_btn = gr.Button("Generate", variant="primary")
|
| 772 |
|
|
@@ -786,8 +790,9 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 786 |
outputs=mma_slot_grps,
|
| 787 |
)
|
| 788 |
|
| 789 |
-
def _run_mmaudio(video, prompt, neg, seed, cfg, steps, n):
|
| 790 |
-
flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, n
|
|
|
|
| 791 |
n = int(n)
|
| 792 |
grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
|
| 793 |
vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
|
|
@@ -797,7 +802,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 797 |
mma_btn.click(
|
| 798 |
fn=_run_mmaudio,
|
| 799 |
inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
|
| 800 |
-
mma_cfg, mma_steps, mma_samples],
|
| 801 |
outputs=mma_slot_grps + mma_slot_vids + mma_slot_auds,
|
| 802 |
)
|
| 803 |
|
|
@@ -814,6 +819,8 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 814 |
hf_guidance = gr.Slider(label="Guidance Scale", minimum=1, maximum=10, value=4.5, step=0.5)
|
| 815 |
hf_steps = gr.Slider(label="Steps", minimum=10, maximum=100, value=50, step=5)
|
| 816 |
hf_size = gr.Radio(label="Model Size", choices=["xl", "xxl"], value="xxl")
|
|
|
|
|
|
|
| 817 |
hf_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
|
| 818 |
hf_btn = gr.Button("Generate", variant="primary")
|
| 819 |
|
|
@@ -833,8 +840,9 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 833 |
outputs=hf_slot_grps,
|
| 834 |
)
|
| 835 |
|
| 836 |
-
def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, n):
|
| 837 |
-
flat = generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, n
|
|
|
|
| 838 |
n = int(n)
|
| 839 |
grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
|
| 840 |
vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
|
|
@@ -844,7 +852,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 844 |
hf_btn.click(
|
| 845 |
fn=_run_hunyuan,
|
| 846 |
inputs=[hf_video, hf_prompt, hf_neg, hf_seed,
|
| 847 |
-
hf_guidance, hf_steps, hf_size, hf_samples],
|
| 848 |
outputs=hf_slot_grps + hf_slot_vids + hf_slot_auds,
|
| 849 |
)
|
| 850 |
|
|
|
|
| 360 |
|
| 361 |
@spaces.GPU(duration=600)
|
| 362 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 363 |
+
cfg_strength, num_steps, num_samples,
|
| 364 |
+
crossfade_s=1.0, crossfade_db=3.0):
|
| 365 |
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
|
| 366 |
# MMAudio is a local package in ./MMAudio/ — add it to sys.path so imports work.
|
| 367 |
import sys as _sys, os as _os
|
|
|
|
| 413 |
# MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
|
| 414 |
# with a 1 s crossfade overlap and stitch the results into a full-length track.
|
| 415 |
total_dur_s = get_video_duration(video_file)
|
| 416 |
+
MMA_CF_S = float(crossfade_s)
|
| 417 |
+
MMA_CF_DB = float(crossfade_db)
|
| 418 |
|
| 419 |
def _mma_build_segments(total_s, cf_s):
|
| 420 |
if total_s <= MMAUDIO_WINDOW:
|
|
|
|
| 525 |
|
| 526 |
@spaces.GPU(duration=600)
|
| 527 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 528 |
+
guidance_scale, num_steps, model_size, num_samples,
|
| 529 |
+
crossfade_s=2.0, crossfade_db=3.0):
|
| 530 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
|
| 531 |
import torchaudio
|
| 532 |
import sys as _sys
|
|
|
|
| 572 |
# input into overlapping segments, generate audio for each, then crossfade-
|
| 573 |
# stitch the results into a single full-length audio track.
|
| 574 |
total_dur_s = get_video_duration(video_file)
|
| 575 |
+
CF_S = float(crossfade_s)
|
| 576 |
+
CF_DB = float(crossfade_db)
|
| 577 |
segments = _taro_build_segments(total_dur_s, CF_S) # reuse TARO helper
|
| 578 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 579 |
|
|
|
|
| 769 |
mma_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
|
| 770 |
mma_cfg = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
|
| 771 |
mma_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
|
| 772 |
+
mma_cf_dur = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=1, step=0.1)
|
| 773 |
+
mma_cf_db = gr.Textbox(label="Crossfade Boost (dB)", value="3")
|
| 774 |
mma_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
|
| 775 |
mma_btn = gr.Button("Generate", variant="primary")
|
| 776 |
|
|
|
|
| 790 |
outputs=mma_slot_grps,
|
| 791 |
)
|
| 792 |
|
| 793 |
+
def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
|
| 794 |
+
flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, n,
|
| 795 |
+
crossfade_s=cf_dur, crossfade_db=cf_db)
|
| 796 |
n = int(n)
|
| 797 |
grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
|
| 798 |
vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
|
|
|
|
| 802 |
mma_btn.click(
|
| 803 |
fn=_run_mmaudio,
|
| 804 |
inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
|
| 805 |
+
mma_cfg, mma_steps, mma_cf_dur, mma_cf_db, mma_samples],
|
| 806 |
outputs=mma_slot_grps + mma_slot_vids + mma_slot_auds,
|
| 807 |
)
|
| 808 |
|
|
|
|
| 819 |
hf_guidance = gr.Slider(label="Guidance Scale", minimum=1, maximum=10, value=4.5, step=0.5)
|
| 820 |
hf_steps = gr.Slider(label="Steps", minimum=10, maximum=100, value=50, step=5)
|
| 821 |
hf_size = gr.Radio(label="Model Size", choices=["xl", "xxl"], value="xxl")
|
| 822 |
+
hf_cf_dur = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1)
|
| 823 |
+
hf_cf_db = gr.Textbox(label="Crossfade Boost (dB)", value="3")
|
| 824 |
hf_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
|
| 825 |
hf_btn = gr.Button("Generate", variant="primary")
|
| 826 |
|
|
|
|
| 840 |
outputs=hf_slot_grps,
|
| 841 |
)
|
| 842 |
|
| 843 |
+
def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
|
| 844 |
+
flat = generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, n,
|
| 845 |
+
crossfade_s=cf_dur, crossfade_db=cf_db)
|
| 846 |
n = int(n)
|
| 847 |
grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
|
| 848 |
vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
|
|
|
|
| 852 |
hf_btn.click(
|
| 853 |
fn=_run_hunyuan,
|
| 854 |
inputs=[hf_video, hf_prompt, hf_neg, hf_seed,
|
| 855 |
+
hf_guidance, hf_steps, hf_size, hf_cf_dur, hf_cf_db, hf_samples],
|
| 856 |
outputs=hf_slot_grps + hf_slot_vids + hf_slot_auds,
|
| 857 |
)
|
| 858 |
|