BoxOfColors committed
Commit 95c5c55 · Parent: 1c5fa8d

Add crossfade Duration/Boost controls to MMAudio and HunyuanFoley tabs


- Add a Crossfade Duration (s) slider and a Crossfade Boost (dB) textbox to both the
  MMAudio and HunyuanFoley UI tabs, matching TARO's existing controls exactly
- Update the generate_mmaudio() and generate_hunyuan() signatures to accept
  crossfade_s and crossfade_db parameters (with the same defaults as before)
- Replace the hardcoded MMA_CF_S/MMA_CF_DB and CF_S/CF_DB constants with the
  user-supplied values passed through the _run_mmaudio and _run_hunyuan wrappers
- All 3 models now share identical crossfade UI and the same equal-power
  crossfade implementation (a sketch of that crossfade follows below)
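
Since the equal-power crossfade itself is not shown in this diff, here is a minimal
sketch of that kind of stitch, assuming mono float32 numpy segments; the function name
equal_power_crossfade is invented for illustration and the exact gain handling in
app.py may differ:

    import numpy as np

    def equal_power_crossfade(a, b, sr, cf_s=1.0, boost_db=3.0):
        """Stitch two mono segments, overlapping the tail of `a` with the head of `b`
        by cf_s seconds using cos/sin (equal-power) gains. Illustrative sketch only."""
        n = int(round(cf_s * sr))
        n = min(n, len(a), len(b))
        if n == 0:
            return np.concatenate([a, b])

        t = np.linspace(0.0, 1.0, n, endpoint=False)
        fade_out = np.cos(t * np.pi / 2.0)   # gain applied to the tail of `a`
        fade_in = np.sin(t * np.pi / 2.0)    # gain applied to the head of `b`
        boost = 10.0 ** (boost_db / 20.0)    # dB -> linear gain on the overlap region

        overlap = (a[-n:] * fade_out + b[:n] * fade_in) * boost
        return np.concatenate([a[:-n], overlap, b[n:]])

With cos/sin gains the summed power stays roughly constant across the overlap, which is
presumably why the boost is exposed as its own control rather than folded into the fade curve.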

Files changed (1)
  app.py  +20 -12

app.py CHANGED
@@ -360,7 +360,8 @@ MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
 
 @spaces.GPU(duration=600)
 def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
-                     cfg_strength, num_steps, num_samples):
+                     cfg_strength, num_steps, num_samples,
+                     crossfade_s=1.0, crossfade_db=3.0):
     """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
     # MMAudio is a local package in ./MMAudio/ — add it to sys.path so imports work.
     import sys as _sys, os as _os
@@ -412,8 +413,8 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
     # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
     # with a 1 s crossfade overlap and stitch the results into a full-length track.
     total_dur_s = get_video_duration(video_file)
-    MMA_CF_S = 1.0  # crossfade seconds between segments
-    MMA_CF_DB = 3.0
+    MMA_CF_S = float(crossfade_s)
+    MMA_CF_DB = float(crossfade_db)
 
     def _mma_build_segments(total_s, cf_s):
         if total_s <= MMAUDIO_WINDOW:
@@ -524,7 +525,8 @@ HUNYUAN_MAX_DUR = 15.0 # seconds
 
 @spaces.GPU(duration=600)
 def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
-                     guidance_scale, num_steps, model_size, num_samples):
+                     guidance_scale, num_steps, model_size, num_samples,
+                     crossfade_s=2.0, crossfade_db=3.0):
     """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
     import torchaudio
     import sys as _sys
@@ -570,8 +572,8 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
     # input into overlapping segments, generate audio for each, then crossfade-
     # stitch the results into a single full-length audio track.
     total_dur_s = get_video_duration(video_file)
-    CF_S = 2.0  # crossfade seconds between segments
-    CF_DB = 3.0  # crossfade boost in dB
+    CF_S = float(crossfade_s)
+    CF_DB = float(crossfade_db)
     segments = _taro_build_segments(total_dur_s, CF_S)  # reuse TARO helper
     print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
 
@@ -767,6 +769,8 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
         mma_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
         mma_cfg = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
         mma_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
+        mma_cf_dur = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=1, step=0.1)
+        mma_cf_db = gr.Textbox(label="Crossfade Boost (dB)", value="3")
         mma_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
         mma_btn = gr.Button("Generate", variant="primary")
 
@@ -786,8 +790,9 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
             outputs=mma_slot_grps,
         )
 
-        def _run_mmaudio(video, prompt, neg, seed, cfg, steps, n):
-            flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, n)
+        def _run_mmaudio(video, prompt, neg, seed, cfg, steps, cf_dur, cf_db, n):
+            flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, n,
+                                    crossfade_s=cf_dur, crossfade_db=cf_db)
             n = int(n)
             grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
             vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
@@ -797,7 +802,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
         mma_btn.click(
             fn=_run_mmaudio,
             inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
-                    mma_cfg, mma_steps, mma_samples],
+                    mma_cfg, mma_steps, mma_cf_dur, mma_cf_db, mma_samples],
             outputs=mma_slot_grps + mma_slot_vids + mma_slot_auds,
         )
 
@@ -814,6 +819,8 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
         hf_guidance = gr.Slider(label="Guidance Scale", minimum=1, maximum=10, value=4.5, step=0.5)
         hf_steps = gr.Slider(label="Steps", minimum=10, maximum=100, value=50, step=5)
         hf_size = gr.Radio(label="Model Size", choices=["xl", "xxl"], value="xxl")
+        hf_cf_dur = gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1)
+        hf_cf_db = gr.Textbox(label="Crossfade Boost (dB)", value="3")
         hf_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
         hf_btn = gr.Button("Generate", variant="primary")
 
@@ -833,8 +840,9 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
            outputs=hf_slot_grps,
         )
 
-        def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, n):
-            flat = generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, n)
+        def _run_hunyuan(video, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db, n):
+            flat = generate_hunyuan(video, prompt, neg, seed, guidance, steps, size, n,
+                                    crossfade_s=cf_dur, crossfade_db=cf_db)
             n = int(n)
             grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
             vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
@@ -844,7 +852,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
         hf_btn.click(
             fn=_run_hunyuan,
             inputs=[hf_video, hf_prompt, hf_neg, hf_seed,
-                    hf_guidance, hf_steps, hf_size, hf_samples],
+                    hf_guidance, hf_steps, hf_size, hf_cf_dur, hf_cf_db, hf_samples],
             outputs=hf_slot_grps + hf_slot_vids + hf_slot_auds,
         )
 
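
For context on the sliding-window comments in the hunks above, here is a hedged sketch of
the segment planning they describe; build_segments is an illustrative name, and the real
_mma_build_segments / _taro_build_segments helpers (not shown in this diff) may behave
differently, especially around the final, shorter segment:

    def build_segments(total_s, window_s, cf_s):
        """Plan (start, end) windows of length window_s that overlap by cf_s seconds.
        Sketch only; assumes cf_s is shorter than the generation window."""
        if total_s <= window_s:
            return [(0.0, total_s)]
        if cf_s >= window_s:
            raise ValueError("crossfade must be shorter than the generation window")
        segments, start = [], 0.0
        while start < total_s:
            end = min(start + window_s, total_s)
            segments.append((start, end))
            if end >= total_s:
                break
            start = end - cf_s  # back up by the crossfade overlap
        return segments

Each adjacent pair of windows then overlaps by cf_s seconds, which is exactly the region
the equal-power crossfade sketched under the commit message blends together.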