BoxOfColors committed on
Commit
2b2a599
·
1 Parent(s): f8b59b5

HunyuanFoley: sliding-window segmentation for videos longer than 15 s

Browse files

The model is hard-limited to 15 s per pass (MAX_VIDEO_DURATION_SECONDS=15
in constants.py, enforced in get_frames_av). For longer videos, slice the
input with ffmpeg into overlapping ≤15 s segments, run feature_process +
denoise_process on each, then crossfade-stitch all segment WAV files into a
single full-length audio track — the same strategy used for TARO. Text
features are encoded once from the first segment and reused for all segments.

Files changed (1) hide show
  1. app.py +71 -20
app.py CHANGED
@@ -507,33 +507,84 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
507
  tmp_dir = tempfile.mkdtemp()
508
  outputs = []
509
 
510
- # feature_process() extracts SigLIP2 visual features + Synchformer sync features
511
- # + CLAP text embeddings exactly as in HunyuanVideo-Foley/gradio_app.py
512
- visual_feats, text_feats, audio_len_in_s = feature_process(
513
- video_file,
 
 
 
 
 
 
 
 
 
 
 
 
514
  prompt if prompt else "",
515
  model_dict,
516
  cfg,
517
  neg_prompt=negative_prompt if negative_prompt else None,
518
  )
519
- print(f"[HunyuanFoley] Audio length: {audio_len_in_s:.2f}s | generating {num_samples} sample(s)")
520
-
521
- # denoise_process() runs the flow-matching diffusion loop and decodes with DAC-VAE
522
- # batch_size=num_samples generates all samples in one pass
523
- audio, sample_rate = denoise_process(
524
- visual_feats,
525
- text_feats,
526
- audio_len_in_s,
527
- model_dict,
528
- cfg,
529
- guidance_scale=float(guidance_scale),
530
- num_inference_steps=int(num_steps),
531
- batch_size=num_samples,
532
- )
533
- # audio shape: (batch, channels, samples)
534
  for sample_idx in range(num_samples):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
536
- torchaudio.save(audio_path, audio[sample_idx], sample_rate)
537
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
538
  merge_audio_video(audio_path, video_file, video_path)
539
  outputs.append((video_path, audio_path))
 
507
  tmp_dir = tempfile.mkdtemp()
508
  outputs = []
509
 
510
+ # HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
511
+ # input into overlapping segments, generate audio for each, then crossfade-
512
+ # stitch the results into a single full-length audio track.
513
+ total_dur_s = get_video_duration(video_file)
514
+ CF_S = 2.0 # crossfade seconds between segments
515
+ CF_DB = 3.0 # crossfade boost in dB
516
+ segments = _taro_build_segments(total_dur_s, CF_S) # reuse TARO helper
517
+ print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
518
+
519
+ # Pre-encode text features once (same for every segment)
520
+ _dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
521
+ ffmpeg.input(video_file, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
522
+ _dummy_seg_path, vcodec="libx264", acodec="aac", strict="experimental"
523
+ ).run(overwrite_output=True, quiet=True)
524
+ _, text_feats, _ = feature_process(
525
+ _dummy_seg_path,
526
  prompt if prompt else "",
527
  model_dict,
528
  cfg,
529
  neg_prompt=negative_prompt if negative_prompt else None,
530
  )
531
+
532
+ # Generate audio per segment, then stitch
 
 
 
 
 
 
 
 
 
 
 
 
 
533
  for sample_idx in range(num_samples):
534
+ seg_wavs = []
535
+ sr = 48000 # HunyuanFoley always outputs 48 kHz
536
+ for seg_i, (seg_start, seg_end) in enumerate(segments):
537
+ seg_dur = seg_end - seg_start
538
+ seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
539
+ ffmpeg.input(video_file, ss=seg_start, t=seg_dur).output(
540
+ seg_path, vcodec="libx264", acodec="aac", strict="experimental"
541
+ ).run(overwrite_output=True, quiet=True)
542
+
543
+ visual_feats, _, seg_audio_len = feature_process(
544
+ seg_path,
545
+ prompt if prompt else "",
546
+ model_dict,
547
+ cfg,
548
+ neg_prompt=negative_prompt if negative_prompt else None,
549
+ )
550
+ print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
551
+ f"{seg_start:.1f}–{seg_end:.1f}s → {seg_audio_len:.2f}s audio")
552
+
553
+ audio_batch, sr = denoise_process(
554
+ visual_feats,
555
+ text_feats,
556
+ seg_audio_len,
557
+ model_dict,
558
+ cfg,
559
+ guidance_scale=float(guidance_scale),
560
+ num_inference_steps=int(num_steps),
561
+ batch_size=1,
562
+ )
563
+ # audio_batch shape: (1, channels, samples) — take first (and only) sample
564
+ wav = audio_batch[0].float().cpu().numpy() # (channels, samples)
565
+ # Trim to exact segment length in samples
566
+ seg_samples = int(round(seg_dur * sr))
567
+ wav = wav[:, :seg_samples]
568
+ seg_wavs.append(wav)
569
+
570
+ # Stitch segments with crossfade (operates on (channels, samples) arrays)
571
+ def _cf_join_stereo(a, b, cf_s, db):
572
+ cf = int(round(cf_s * sr))
573
+ cf = min(cf, a.shape[1], b.shape[1])
574
+ if cf <= 0:
575
+ return np.concatenate([a, b], axis=1)
576
+ gain = 10 ** (db / 20.0)
577
+ overlap = a[:, -cf:] * gain + b[:, :cf] * gain
578
+ return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
579
+
580
+ full_wav = seg_wavs[0]
581
+ for nw in seg_wavs[1:]:
582
+ full_wav = _cf_join_stereo(full_wav, nw, CF_S, CF_DB)
583
+ # Trim to exact video duration
584
+ full_wav = full_wav[:, : int(round(total_dur_s * sr))]
585
+
586
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
587
+ torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
588
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
589
  merge_audio_video(audio_path, video_file, video_path)
590
  outputs.append((video_path, audio_path))