Spaces:
Running on Zero
Running on Zero
Commit ·
d141e30
1
Parent(s): af578ae
Optimize GPU usage and fix resource leaks
Browse files
- HunyuanFoley: extract visual features only per segment (reuse
text_feats from single initial extraction) — saves ~2-5s GPU/segment
- MMAudio/Hunyuan: add torch.cuda.empty_cache() between samples to
prevent VRAM fragmentation (TARO already had this)
- Regen duration floor lowered from 60s to 30s — single-segment regen
takes ~16-20s actual GPU, saving ~30s wasted ZeroGPU quota per call
- Register regen temp dirs with _register_tmp_dir for cleanup (was
leaking tmpfiles indefinitely)
- TARO: save CAVP/onset features once instead of per-sample (identical
data, depends only on video)
- Remove redundant import time in _splice_and_save
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -371,10 +371,12 @@ def _estimate_gpu_duration(model_key: str, num_samples: int, num_steps: int,
|
|
| 371 |
|
| 372 |
|
| 373 |
def _estimate_regen_duration(model_key: str, num_steps: int) -> int:
|
| 374 |
-
"""Generic GPU duration estimator for single-segment regen.
|
|
|
|
|
|
|
| 375 |
cfg = MODEL_CONFIGS[model_key]
|
| 376 |
secs = int(num_steps) * cfg["secs_per_step"] + cfg["load_overhead"]
|
| 377 |
-
result = min(GPU_DURATION_CAP, max(
|
| 378 |
print(f"[duration] {cfg['label']} regen: 1 seg × {int(num_steps)} steps → {secs:.0f}s → capped {result}s")
|
| 379 |
return result
|
| 380 |
|
|
@@ -578,6 +580,10 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 578 |
crossfade_s, crossfade_db, num_samples)
|
| 579 |
|
| 580 |
# ── CPU post-processing (no GPU needed) ──
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
outputs = []
|
| 582 |
for sample_idx, (wavs, cavp_feats, onset_feats) in enumerate(results):
|
| 583 |
final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
|
|
@@ -586,12 +592,12 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 586 |
video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
|
| 587 |
mux_video_audio(silent_video, audio_path, video_path)
|
| 588 |
wav_paths = _save_seg_wavs(wavs, tmp_dir, f"taro_{sample_idx}")
|
| 589 |
-
#
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
seg_meta = {
|
| 596 |
"segments": segments,
|
| 597 |
"wav_paths": wav_paths,
|
|
@@ -711,6 +717,10 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 711 |
f"(current constant={MMAUDIO_SECS_PER_STEP})")
|
| 712 |
results.append((seg_audios, sr))
|
| 713 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 714 |
return results
|
| 715 |
|
| 716 |
_mmaudio_gpu_infer._cpu_ctx = {}
|
|
@@ -839,6 +849,10 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 839 |
neg_prompt=negative_prompt if negative_prompt else None,
|
| 840 |
)
|
| 841 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 842 |
results = []
|
| 843 |
for sample_idx in range(num_samples):
|
| 844 |
seg_wavs = []
|
|
@@ -848,13 +862,8 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 848 |
seg_dur = seg_end - seg_start
|
| 849 |
seg_path = seg_clip_paths[seg_i]
|
| 850 |
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
prompt if prompt else "",
|
| 854 |
-
model_dict,
|
| 855 |
-
cfg,
|
| 856 |
-
neg_prompt=negative_prompt if negative_prompt else None,
|
| 857 |
-
)
|
| 858 |
print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
|
| 859 |
f"{seg_start:.1f}–{seg_end:.1f}s → {seg_audio_len:.2f}s audio")
|
| 860 |
|
|
@@ -881,6 +890,10 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 881 |
f"(current constant={HUNYUAN_SECS_PER_STEP})")
|
| 882 |
results.append((seg_wavs, sr, text_feats))
|
| 883 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
return results
|
| 885 |
|
| 886 |
_hunyuan_gpu_infer._cpu_ctx = {}
|
|
@@ -1003,8 +1016,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
|
|
| 1003 |
|
| 1004 |
# Save new audio — use a new timestamped filename so Gradio / the browser
|
| 1005 |
# treats it as a genuinely different file and reloads the video player.
|
| 1006 |
-
|
| 1007 |
-
_ts = int(_time.time() * 1000)
|
| 1008 |
tmp_dir = os.path.dirname(meta["audio_path"])
|
| 1009 |
_base = os.path.splitext(os.path.basename(meta["audio_path"]))[0]
|
| 1010 |
# Strip any previous timestamp suffix before adding a new one
|
|
@@ -1192,7 +1204,7 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1192 |
|
| 1193 |
# CPU: pre-extract segment clip
|
| 1194 |
silent_video = meta["silent_video"]
|
| 1195 |
-
tmp_dir = tempfile.mkdtemp()
|
| 1196 |
seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
|
| 1197 |
ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
|
| 1198 |
seg_path, vcodec="copy", an=None
|
|
@@ -1291,7 +1303,7 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1291 |
|
| 1292 |
# CPU: pre-extract segment clip
|
| 1293 |
silent_video = meta["silent_video"]
|
| 1294 |
-
tmp_dir = tempfile.mkdtemp()
|
| 1295 |
seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
|
| 1296 |
ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
|
| 1297 |
seg_path, vcodec="copy", an=None
|
|
|
|
| 371 |
|
| 372 |
|
| 373 |
def _estimate_regen_duration(model_key: str, num_steps: int) -> int:
|
| 374 |
+
"""Generic GPU duration estimator for single-segment regen.
|
| 375 |
+
Uses a lower floor (30s) than initial generation since regen only runs
|
| 376 |
+
one segment — saves 30s of wasted ZeroGPU quota per regen call."""
|
| 377 |
cfg = MODEL_CONFIGS[model_key]
|
| 378 |
secs = int(num_steps) * cfg["secs_per_step"] + cfg["load_overhead"]
|
| 379 |
+
result = min(GPU_DURATION_CAP, max(30, int(secs)))
|
| 380 |
print(f"[duration] {cfg['label']} regen: 1 seg × {int(num_steps)} steps → {secs:.0f}s → capped {result}s")
|
| 381 |
return result
|
| 382 |
|
|
|
|
| 580 |
crossfade_s, crossfade_db, num_samples)
|
| 581 |
|
| 582 |
# ── CPU post-processing (no GPU needed) ──
|
| 583 |
+
# Cache CAVP + onset features once (same for all samples — they depend only on the video)
|
| 584 |
+
cavp_path = os.path.join(tmp_dir, "taro_cavp.npy")
|
| 585 |
+
onset_path = os.path.join(tmp_dir, "taro_onset.npy")
|
| 586 |
+
first_cavp_saved = False
|
| 587 |
outputs = []
|
| 588 |
for sample_idx, (wavs, cavp_feats, onset_feats) in enumerate(results):
|
| 589 |
final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
|
|
|
|
| 592 |
video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
|
| 593 |
mux_video_audio(silent_video, audio_path, video_path)
|
| 594 |
wav_paths = _save_seg_wavs(wavs, tmp_dir, f"taro_{sample_idx}")
|
| 595 |
+
# Save shared features once (not per-sample — they're identical)
|
| 596 |
+
if not first_cavp_saved:
|
| 597 |
+
np.save(cavp_path, cavp_feats)
|
| 598 |
+
if onset_feats is not None:
|
| 599 |
+
np.save(onset_path, onset_feats)
|
| 600 |
+
first_cavp_saved = True
|
| 601 |
seg_meta = {
|
| 602 |
"segments": segments,
|
| 603 |
"wav_paths": wav_paths,
|
|
|
|
| 717 |
f"(current constant={MMAUDIO_SECS_PER_STEP})")
|
| 718 |
results.append((seg_audios, sr))
|
| 719 |
|
| 720 |
+
# Free GPU memory between samples to prevent VRAM fragmentation
|
| 721 |
+
if torch.cuda.is_available():
|
| 722 |
+
torch.cuda.empty_cache()
|
| 723 |
+
|
| 724 |
return results
|
| 725 |
|
| 726 |
_mmaudio_gpu_infer._cpu_ctx = {}
|
|
|
|
| 849 |
neg_prompt=negative_prompt if negative_prompt else None,
|
| 850 |
)
|
| 851 |
|
| 852 |
+
# Import visual-only feature extractor to avoid redundant text extraction
|
| 853 |
+
# per segment (text_feats already computed once above for the whole batch).
|
| 854 |
+
from hunyuanvideo_foley.utils.feature_utils import encode_video_features
|
| 855 |
+
|
| 856 |
results = []
|
| 857 |
for sample_idx in range(num_samples):
|
| 858 |
seg_wavs = []
|
|
|
|
| 862 |
seg_dur = seg_end - seg_start
|
| 863 |
seg_path = seg_clip_paths[seg_i]
|
| 864 |
|
| 865 |
+
# Extract only visual features — reuse text_feats from above
|
| 866 |
+
visual_feats, seg_audio_len = encode_video_features(seg_path, model_dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 867 |
print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
|
| 868 |
f"{seg_start:.1f}–{seg_end:.1f}s → {seg_audio_len:.2f}s audio")
|
| 869 |
|
|
|
|
| 890 |
f"(current constant={HUNYUAN_SECS_PER_STEP})")
|
| 891 |
results.append((seg_wavs, sr, text_feats))
|
| 892 |
|
| 893 |
+
# Free GPU memory between samples to prevent VRAM fragmentation
|
| 894 |
+
if torch.cuda.is_available():
|
| 895 |
+
torch.cuda.empty_cache()
|
| 896 |
+
|
| 897 |
return results
|
| 898 |
|
| 899 |
_hunyuan_gpu_infer._cpu_ctx = {}
|
|
|
|
| 1016 |
|
| 1017 |
# Save new audio — use a new timestamped filename so Gradio / the browser
|
| 1018 |
# treats it as a genuinely different file and reloads the video player.
|
| 1019 |
+
_ts = int(time.time() * 1000)
|
|
|
|
| 1020 |
tmp_dir = os.path.dirname(meta["audio_path"])
|
| 1021 |
_base = os.path.splitext(os.path.basename(meta["audio_path"]))[0]
|
| 1022 |
# Strip any previous timestamp suffix before adding a new one
|
|
|
|
| 1204 |
|
| 1205 |
# CPU: pre-extract segment clip
|
| 1206 |
silent_video = meta["silent_video"]
|
| 1207 |
+
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1208 |
seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
|
| 1209 |
ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
|
| 1210 |
seg_path, vcodec="copy", an=None
|
|
|
|
| 1303 |
|
| 1304 |
# CPU: pre-extract segment clip
|
| 1305 |
silent_video = meta["silent_video"]
|
| 1306 |
+
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1307 |
seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
|
| 1308 |
ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
|
| 1309 |
seg_path, vcodec="copy", an=None
|