Spaces:
Running on Zero
Running on Zero
Commit ·
e3d955b
1
Parent(s): 5cb2f31
Fix ZeroGPU tmp file isolation: extract clips inside GPU worker
Browse files
Pre-extracted tmp clips (/tmp/xregen_*_clip.mp4) created in the caller
process are invisible to the ZeroGPU GPU worker (separate process, fresh
/tmp). Fix: pass source_video + clip_start_s/clip_dur_s as positional
args; GPU fns extract the xregen clip internally before sub-segment clips.
Also convert all remaining silent_video/segments_json/total_dur_s kwargs
to positional args in gpu_infer calls (kwargs silently dropped by ZeroGPU).
app.py
CHANGED
|
@@ -1015,13 +1015,18 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
|
|
| 1015 |
@spaces.GPU(duration=_mmaudio_duration)
|
| 1016 |
def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1017 |
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
|
| 1018 |
-
silent_video, segments_json
|
|
|
|
| 1019 |
"""GPU-only MMAudio inference — model loading + flow-matching generation.
|
| 1020 |
Returns list of (seg_audios, sr) per sample.
|
| 1021 |
|
| 1022 |
-
|
| 1023 |
-
|
| 1024 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1025 |
"""
|
| 1026 |
_ensure_syspath("MMAudio")
|
| 1027 |
from mmaudio.eval_utils import generate, load_video
|
|
@@ -1035,9 +1040,19 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 1035 |
|
| 1036 |
net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
|
| 1037 |
|
| 1038 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1039 |
segments = json.loads(segments_json)
|
| 1040 |
-
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1041 |
seg_clip_paths = [
|
| 1042 |
_extract_segment_clip(silent_video, s, e - s,
|
| 1043 |
os.path.join(tmp_dir, f"mma_seg_{i}.mp4"))
|
|
@@ -1113,8 +1128,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 1113 |
results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1114 |
cfg_strength, num_steps, crossfade_s, crossfade_db,
|
| 1115 |
num_samples,
|
| 1116 |
-
silent_video
|
| 1117 |
-
segments_json=json.dumps(segments))
|
| 1118 |
|
| 1119 |
# ── CPU post-processing ──
|
| 1120 |
# Resample 44100 → 48000 and normalise tuples to (seg_wavs, ...)
|
|
@@ -1163,12 +1177,13 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
|
|
| 1163 |
@spaces.GPU(duration=_hunyuan_duration)
|
| 1164 |
def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1165 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1166 |
-
num_samples, silent_video, segments_json, total_dur_s
|
|
|
|
| 1167 |
"""GPU-only HunyuanFoley inference — model loading + feature extraction + denoising.
|
| 1168 |
Returns list of (seg_wavs, sr, text_feats) per sample.
|
| 1169 |
|
| 1170 |
-
|
| 1171 |
-
|
| 1172 |
"""
|
| 1173 |
_ensure_syspath("HunyuanVideo-Foley")
|
| 1174 |
from hunyuanvideo_foley.utils.model_utils import denoise_process
|
|
@@ -1185,9 +1200,18 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 1185 |
|
| 1186 |
model_dict, cfg = _load_hunyuan_model(device, model_size)
|
| 1187 |
|
| 1188 |
-
# Extract
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1189 |
segments = json.loads(segments_json)
|
| 1190 |
-
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1191 |
dummy_seg_path = _extract_segment_clip(
|
| 1192 |
silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
|
| 1193 |
os.path.join(tmp_dir, "_seg_dummy.mp4"),
|
|
@@ -1266,9 +1290,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1266 |
results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1267 |
guidance_scale, num_steps, model_size,
|
| 1268 |
crossfade_s, crossfade_db, num_samples,
|
| 1269 |
-
silent_video
|
| 1270 |
-
segments_json=json.dumps(segments),
|
| 1271 |
-
total_dur_s=total_dur_s)
|
| 1272 |
|
| 1273 |
# ── CPU post-processing (no GPU needed) ──
|
| 1274 |
def _hunyuan_extras(sample_idx, result, td):
|
|
@@ -1791,16 +1813,14 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
|
|
| 1791 |
|
| 1792 |
def _run():
|
| 1793 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
|
| 1794 |
-
|
| 1795 |
-
clip_path = _extract_segment_clip(
|
| 1796 |
-
_resolve_silent_video(meta), clip_start, clip_dur,
|
| 1797 |
-
os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
|
| 1798 |
-
)
|
| 1799 |
sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
|
| 1800 |
-
|
| 1801 |
-
|
| 1802 |
-
|
| 1803 |
-
|
|
|
|
|
|
|
| 1804 |
seg_wavs, sr = results[0]
|
| 1805 |
wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
|
| 1806 |
clip_dur, sr, sub_segs)
|
|
@@ -1823,18 +1843,13 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
|
|
| 1823 |
|
| 1824 |
def _run():
|
| 1825 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
|
| 1826 |
-
|
| 1827 |
-
clip_path = _extract_segment_clip(
|
| 1828 |
-
_resolve_silent_video(meta), clip_start, clip_dur,
|
| 1829 |
-
os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
|
| 1830 |
-
)
|
| 1831 |
sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
|
| 1832 |
-
results = _hunyuan_gpu_infer(
|
| 1833 |
guidance_scale, num_steps, model_size,
|
| 1834 |
crossfade_s, crossfade_db, 1,
|
| 1835 |
-
|
| 1836 |
-
|
| 1837 |
-
total_dur_s=clip_dur)
|
| 1838 |
seg_wavs, sr, _ = results[0]
|
| 1839 |
wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
|
| 1840 |
clip_dur, sr, sub_segs)
|
|
|
|
| 1015 |
@spaces.GPU(duration=_mmaudio_duration)
|
| 1016 |
def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1017 |
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
|
| 1018 |
+
silent_video, segments_json,
|
| 1019 |
+
clip_start_s=0.0, clip_dur_s=None):
|
| 1020 |
"""GPU-only MMAudio inference — model loading + flow-matching generation.
|
| 1021 |
Returns list of (seg_audios, sr) per sample.
|
| 1022 |
|
| 1023 |
+
All video paths and segment data are passed explicitly as positional args
|
| 1024 |
+
to survive ZeroGPU process isolation (kwargs are silently dropped).
|
| 1025 |
+
|
| 1026 |
+
When *clip_dur_s* is set, *silent_video* is the full source and a clip
|
| 1027 |
+
[clip_start_s, clip_start_s+clip_dur_s] is extracted first inside the
|
| 1028 |
+
GPU window (ffmpeg is CPU-safe here). This avoids passing pre-extracted
|
| 1029 |
+
tmp files that don't exist in the GPU worker's process.
|
| 1030 |
"""
|
| 1031 |
_ensure_syspath("MMAudio")
|
| 1032 |
from mmaudio.eval_utils import generate, load_video
|
|
|
|
| 1040 |
|
| 1041 |
net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
|
| 1042 |
|
| 1043 |
+
# If a clip window is specified, extract it now (inside the GPU fn, so the
|
| 1044 |
+
# file exists in this worker's /tmp).
|
| 1045 |
+
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1046 |
+
if clip_dur_s is not None:
|
| 1047 |
+
clip_dur_s = float(clip_dur_s)
|
| 1048 |
+
clip_path = _extract_segment_clip(
|
| 1049 |
+
silent_video, float(clip_start_s), clip_dur_s,
|
| 1050 |
+
os.path.join(tmp_dir, "mma_xregen_clip.mp4"),
|
| 1051 |
+
)
|
| 1052 |
+
silent_video = clip_path
|
| 1053 |
+
|
| 1054 |
+
# Extract per-segment clips from silent_video (now the correct clip source).
|
| 1055 |
segments = json.loads(segments_json)
|
|
|
|
| 1056 |
seg_clip_paths = [
|
| 1057 |
_extract_segment_clip(silent_video, s, e - s,
|
| 1058 |
os.path.join(tmp_dir, f"mma_seg_{i}.mp4"))
|
|
|
|
| 1128 |
results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1129 |
cfg_strength, num_steps, crossfade_s, crossfade_db,
|
| 1130 |
num_samples,
|
| 1131 |
+
silent_video, json.dumps(segments))
|
|
|
|
| 1132 |
|
| 1133 |
# ── CPU post-processing ──
|
| 1134 |
# Resample 44100 → 48000 and normalise tuples to (seg_wavs, ...)
|
|
|
|
| 1177 |
@spaces.GPU(duration=_hunyuan_duration)
|
| 1178 |
def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1179 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1180 |
+
num_samples, silent_video, segments_json, total_dur_s,
|
| 1181 |
+
clip_start_s=0.0, clip_dur_s=None):
|
| 1182 |
"""GPU-only HunyuanFoley inference — model loading + feature extraction + denoising.
|
| 1183 |
Returns list of (seg_wavs, sr, text_feats) per sample.
|
| 1184 |
|
| 1185 |
+
All paths passed explicitly as positional args to survive ZeroGPU isolation.
|
| 1186 |
+
When *clip_dur_s* is set, the clip is extracted inside the GPU window.
|
| 1187 |
"""
|
| 1188 |
_ensure_syspath("HunyuanVideo-Foley")
|
| 1189 |
from hunyuanvideo_foley.utils.model_utils import denoise_process
|
|
|
|
| 1200 |
|
| 1201 |
model_dict, cfg = _load_hunyuan_model(device, model_size)
|
| 1202 |
|
| 1203 |
+
# Extract xregen clip inside GPU fn if needed (tmp files from caller invisible here).
|
| 1204 |
+
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1205 |
+
if clip_dur_s is not None:
|
| 1206 |
+
clip_dur_s = float(clip_dur_s)
|
| 1207 |
+
clip_path = _extract_segment_clip(
|
| 1208 |
+
silent_video, float(clip_start_s), clip_dur_s,
|
| 1209 |
+
os.path.join(tmp_dir, "hny_xregen_clip.mp4"),
|
| 1210 |
+
)
|
| 1211 |
+
silent_video = clip_path
|
| 1212 |
+
total_dur_s = clip_dur_s
|
| 1213 |
+
|
| 1214 |
segments = json.loads(segments_json)
|
|
|
|
| 1215 |
dummy_seg_path = _extract_segment_clip(
|
| 1216 |
silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
|
| 1217 |
os.path.join(tmp_dir, "_seg_dummy.mp4"),
|
|
|
|
| 1290 |
results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1291 |
guidance_scale, num_steps, model_size,
|
| 1292 |
crossfade_s, crossfade_db, num_samples,
|
| 1293 |
+
silent_video, json.dumps(segments), total_dur_s)
|
|
|
|
|
|
|
| 1294 |
|
| 1295 |
# ── CPU post-processing (no GPU needed) ──
|
| 1296 |
def _hunyuan_extras(sample_idx, result, td):
|
|
|
|
| 1813 |
|
| 1814 |
def _run():
|
| 1815 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
|
| 1816 |
+
source_video = _resolve_silent_video(meta)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1817 |
sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
|
| 1818 |
+
# Pass clip_start_s/clip_dur_s so the GPU fn extracts the clip internally —
|
| 1819 |
+
# pre-extracted tmp files are invisible to the ZeroGPU worker process.
|
| 1820 |
+
results = _mmaudio_gpu_infer(source_video, prompt, negative_prompt, seed_val,
|
| 1821 |
+
cfg_strength, num_steps, crossfade_s, crossfade_db, 1,
|
| 1822 |
+
source_video, json.dumps(sub_segs),
|
| 1823 |
+
clip_start, clip_dur)
|
| 1824 |
seg_wavs, sr = results[0]
|
| 1825 |
wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
|
| 1826 |
clip_dur, sr, sub_segs)
|
|
|
|
| 1843 |
|
| 1844 |
def _run():
|
| 1845 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
|
| 1846 |
+
source_video = _resolve_silent_video(meta)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1847 |
sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
|
| 1848 |
+
results = _hunyuan_gpu_infer(source_video, prompt, negative_prompt, seed_val,
|
| 1849 |
guidance_scale, num_steps, model_size,
|
| 1850 |
crossfade_s, crossfade_db, 1,
|
| 1851 |
+
source_video, json.dumps(sub_segs), clip_dur,
|
| 1852 |
+
clip_start, clip_dur)
|
|
|
|
| 1853 |
seg_wavs, sr, _ = results[0]
|
| 1854 |
wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
|
| 1855 |
clip_dur, sr, sub_segs)
|