BoxOfColors Claude Sonnet 4.6 committed on
Commit
d141e30
·
1 Parent(s): af578ae

Optimize GPU usage and fix resource leaks

Browse files

- HunyuanFoley: extract visual features only per segment (reuse
text_feats from single initial extraction) — saves ~2-5s GPU/segment
- MMAudio/Hunyuan: add torch.cuda.empty_cache() between samples to
prevent VRAM fragmentation (TARO already had this)
- Regen duration floor lowered from 60s to 30s — single-segment regen
takes ~16-20s actual GPU, saving ~30s wasted ZeroGPU quota per call
- Register regen temp dirs with _register_tmp_dir for cleanup (was
leaking tmpfiles indefinitely)
- TARO: save CAVP/onset features once instead of per-sample (identical
data, depends only on video)
- Remove redundant import time in _splice_and_save

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +31 -19
app.py CHANGED
@@ -371,10 +371,12 @@ def _estimate_gpu_duration(model_key: str, num_samples: int, num_steps: int,
371
 
372
 
373
  def _estimate_regen_duration(model_key: str, num_steps: int) -> int:
374
- """Generic GPU duration estimator for single-segment regen."""
 
 
375
  cfg = MODEL_CONFIGS[model_key]
376
  secs = int(num_steps) * cfg["secs_per_step"] + cfg["load_overhead"]
377
- result = min(GPU_DURATION_CAP, max(60, int(secs)))
378
  print(f"[duration] {cfg['label']} regen: 1 seg × {int(num_steps)} steps → {secs:.0f}s → capped {result}s")
379
  return result
380
 
@@ -578,6 +580,10 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
578
  crossfade_s, crossfade_db, num_samples)
579
 
580
  # ── CPU post-processing (no GPU needed) ──
 
 
 
 
581
  outputs = []
582
  for sample_idx, (wavs, cavp_feats, onset_feats) in enumerate(results):
583
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
@@ -586,12 +592,12 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
586
  video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
587
  mux_video_audio(silent_video, audio_path, video_path)
588
  wav_paths = _save_seg_wavs(wavs, tmp_dir, f"taro_{sample_idx}")
589
- # Cache CAVP + onset features so regen can skip re-extraction (~5-7s saved)
590
- cavp_path = os.path.join(tmp_dir, f"taro_{sample_idx}_cavp.npy")
591
- onset_path = os.path.join(tmp_dir, f"taro_{sample_idx}_onset.npy")
592
- np.save(cavp_path, cavp_feats)
593
- if onset_feats is not None:
594
- np.save(onset_path, onset_feats)
595
  seg_meta = {
596
  "segments": segments,
597
  "wav_paths": wav_paths,
@@ -711,6 +717,10 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
711
  f"(current constant={MMAUDIO_SECS_PER_STEP})")
712
  results.append((seg_audios, sr))
713
 
 
 
 
 
714
  return results
715
 
716
  _mmaudio_gpu_infer._cpu_ctx = {}
@@ -839,6 +849,10 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
839
  neg_prompt=negative_prompt if negative_prompt else None,
840
  )
841
 
 
 
 
 
842
  results = []
843
  for sample_idx in range(num_samples):
844
  seg_wavs = []
@@ -848,13 +862,8 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
848
  seg_dur = seg_end - seg_start
849
  seg_path = seg_clip_paths[seg_i]
850
 
851
- visual_feats, _, seg_audio_len = feature_process(
852
- seg_path,
853
- prompt if prompt else "",
854
- model_dict,
855
- cfg,
856
- neg_prompt=negative_prompt if negative_prompt else None,
857
- )
858
  print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
859
  f"{seg_start:.1f}–{seg_end:.1f}s → {seg_audio_len:.2f}s audio")
860
 
@@ -881,6 +890,10 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
881
  f"(current constant={HUNYUAN_SECS_PER_STEP})")
882
  results.append((seg_wavs, sr, text_feats))
883
 
 
 
 
 
884
  return results
885
 
886
  _hunyuan_gpu_infer._cpu_ctx = {}
@@ -1003,8 +1016,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
1003
 
1004
  # Save new audio — use a new timestamped filename so Gradio / the browser
1005
  # treats it as a genuinely different file and reloads the video player.
1006
- import time as _time
1007
- _ts = int(_time.time() * 1000)
1008
  tmp_dir = os.path.dirname(meta["audio_path"])
1009
  _base = os.path.splitext(os.path.basename(meta["audio_path"]))[0]
1010
  # Strip any previous timestamp suffix before adding a new one
@@ -1192,7 +1204,7 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1192
 
1193
  # CPU: pre-extract segment clip
1194
  silent_video = meta["silent_video"]
1195
- tmp_dir = tempfile.mkdtemp()
1196
  seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1197
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1198
  seg_path, vcodec="copy", an=None
@@ -1291,7 +1303,7 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1291
 
1292
  # CPU: pre-extract segment clip
1293
  silent_video = meta["silent_video"]
1294
- tmp_dir = tempfile.mkdtemp()
1295
  seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1296
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1297
  seg_path, vcodec="copy", an=None
 
371
 
372
 
373
  def _estimate_regen_duration(model_key: str, num_steps: int) -> int:
374
+ """Generic GPU duration estimator for single-segment regen.
375
+ Uses a lower floor (30s) than initial generation since regen only runs
376
+ one segment — saves 30s of wasted ZeroGPU quota per regen call."""
377
  cfg = MODEL_CONFIGS[model_key]
378
  secs = int(num_steps) * cfg["secs_per_step"] + cfg["load_overhead"]
379
+ result = min(GPU_DURATION_CAP, max(30, int(secs)))
380
  print(f"[duration] {cfg['label']} regen: 1 seg × {int(num_steps)} steps → {secs:.0f}s → capped {result}s")
381
  return result
382
 
 
580
  crossfade_s, crossfade_db, num_samples)
581
 
582
  # ── CPU post-processing (no GPU needed) ──
583
+ # Cache CAVP + onset features once (same for all samples — they depend only on the video)
584
+ cavp_path = os.path.join(tmp_dir, "taro_cavp.npy")
585
+ onset_path = os.path.join(tmp_dir, "taro_onset.npy")
586
+ first_cavp_saved = False
587
  outputs = []
588
  for sample_idx, (wavs, cavp_feats, onset_feats) in enumerate(results):
589
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
 
592
  video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
593
  mux_video_audio(silent_video, audio_path, video_path)
594
  wav_paths = _save_seg_wavs(wavs, tmp_dir, f"taro_{sample_idx}")
595
+ # Save shared features once (not per-sample — they're identical)
596
+ if not first_cavp_saved:
597
+ np.save(cavp_path, cavp_feats)
598
+ if onset_feats is not None:
599
+ np.save(onset_path, onset_feats)
600
+ first_cavp_saved = True
601
  seg_meta = {
602
  "segments": segments,
603
  "wav_paths": wav_paths,
 
717
  f"(current constant={MMAUDIO_SECS_PER_STEP})")
718
  results.append((seg_audios, sr))
719
 
720
+ # Free GPU memory between samples to prevent VRAM fragmentation
721
+ if torch.cuda.is_available():
722
+ torch.cuda.empty_cache()
723
+
724
  return results
725
 
726
  _mmaudio_gpu_infer._cpu_ctx = {}
 
849
  neg_prompt=negative_prompt if negative_prompt else None,
850
  )
851
 
852
+ # Import visual-only feature extractor to avoid redundant text extraction
853
+ # per segment (text_feats already computed once above for the whole batch).
854
+ from hunyuanvideo_foley.utils.feature_utils import encode_video_features
855
+
856
  results = []
857
  for sample_idx in range(num_samples):
858
  seg_wavs = []
 
862
  seg_dur = seg_end - seg_start
863
  seg_path = seg_clip_paths[seg_i]
864
 
865
+ # Extract only visual features — reuse text_feats from above
866
+ visual_feats, seg_audio_len = encode_video_features(seg_path, model_dict)
 
 
 
 
 
867
  print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
868
  f"{seg_start:.1f}–{seg_end:.1f}s → {seg_audio_len:.2f}s audio")
869
 
 
890
  f"(current constant={HUNYUAN_SECS_PER_STEP})")
891
  results.append((seg_wavs, sr, text_feats))
892
 
893
+ # Free GPU memory between samples to prevent VRAM fragmentation
894
+ if torch.cuda.is_available():
895
+ torch.cuda.empty_cache()
896
+
897
  return results
898
 
899
  _hunyuan_gpu_infer._cpu_ctx = {}
 
1016
 
1017
  # Save new audio β€” use a new timestamped filename so Gradio / the browser
1018
  # treats it as a genuinely different file and reloads the video player.
1019
+ _ts = int(time.time() * 1000)
 
1020
  tmp_dir = os.path.dirname(meta["audio_path"])
1021
  _base = os.path.splitext(os.path.basename(meta["audio_path"]))[0]
1022
  # Strip any previous timestamp suffix before adding a new one
 
1204
 
1205
  # CPU: pre-extract segment clip
1206
  silent_video = meta["silent_video"]
1207
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1208
  seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1209
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1210
  seg_path, vcodec="copy", an=None
 
1303
 
1304
  # CPU: pre-extract segment clip
1305
  silent_video = meta["silent_video"]
1306
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1307
  seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1308
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1309
  seg_path, vcodec="copy", an=None