BoxOfColors committed on
Commit
e3d955b
·
1 Parent(s): 5cb2f31

Fix ZeroGPU tmp file isolation: extract clips inside GPU worker

Browse files

Pre-extracted tmp clips (/tmp/xregen_*_clip.mp4) created in the caller
process are invisible to the ZeroGPU GPU worker (separate process, fresh
/tmp). Fix: pass source_video + clip_start_s/clip_dur_s as positional
args; GPU fns extract the xregen clip internally before sub-segment clips.

Also convert all remaining silent_video/segments_json/total_dur_s kwargs
to positional args in gpu_infer calls (kwargs silently dropped by ZeroGPU).

Files changed (1) hide show
  1. app.py +49 -34
app.py CHANGED
@@ -1015,13 +1015,18 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
1015
  @spaces.GPU(duration=_mmaudio_duration)
1016
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1017
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
1018
- silent_video, segments_json):
 
1019
  """GPU-only MMAudio inference β€” model loading + flow-matching generation.
1020
  Returns list of (seg_audios, sr) per sample.
1021
 
1022
- *silent_video* and *segments_json* are passed explicitly to avoid
1023
- cross-process shared-state (ZeroGPU isolation). Segment clips are
1024
- extracted here via ffmpeg (CPU-safe inside GPU window).
 
 
 
 
1025
  """
1026
  _ensure_syspath("MMAudio")
1027
  from mmaudio.eval_utils import generate, load_video
@@ -1035,9 +1040,19 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1035
 
1036
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1037
 
1038
- # Extract segment clips inside GPU fn — ffmpeg is CPU-only, safe here.
 
 
 
 
 
 
 
 
 
 
 
1039
  segments = json.loads(segments_json)
1040
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1041
  seg_clip_paths = [
1042
  _extract_segment_clip(silent_video, s, e - s,
1043
  os.path.join(tmp_dir, f"mma_seg_{i}.mp4"))
@@ -1113,8 +1128,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1113
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1114
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1115
  num_samples,
1116
- silent_video=silent_video,
1117
- segments_json=json.dumps(segments))
1118
 
1119
  # ── CPU post-processing ──
1120
  # Resample 44100 → 48000 and normalise tuples to (seg_wavs, ...)
@@ -1163,12 +1177,13 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1163
  @spaces.GPU(duration=_hunyuan_duration)
1164
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1165
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1166
- num_samples, silent_video, segments_json, total_dur_s):
 
1167
  """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1168
  Returns list of (seg_wavs, sr, text_feats) per sample.
1169
 
1170
- *silent_video*, *segments_json*, and *total_dur_s* are passed explicitly
1171
- to avoid cross-process shared-state under ZeroGPU isolation.
1172
  """
1173
  _ensure_syspath("HunyuanVideo-Foley")
1174
  from hunyuanvideo_foley.utils.model_utils import denoise_process
@@ -1185,9 +1200,18 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1185
 
1186
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1187
 
1188
- # Extract segment clips inside GPU fn — ffmpeg is CPU-only, safe here.
 
 
 
 
 
 
 
 
 
 
1189
  segments = json.loads(segments_json)
1190
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1191
  dummy_seg_path = _extract_segment_clip(
1192
  silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
1193
  os.path.join(tmp_dir, "_seg_dummy.mp4"),
@@ -1266,9 +1290,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1266
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1267
  guidance_scale, num_steps, model_size,
1268
  crossfade_s, crossfade_db, num_samples,
1269
- silent_video=silent_video,
1270
- segments_json=json.dumps(segments),
1271
- total_dur_s=total_dur_s)
1272
 
1273
  # ── CPU post-processing (no GPU needed) ──
1274
  def _hunyuan_extras(sample_idx, result, td):
@@ -1791,16 +1813,14 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1791
 
1792
  def _run():
1793
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
1794
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1795
- clip_path = _extract_segment_clip(
1796
- _resolve_silent_video(meta), clip_start, clip_dur,
1797
- os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
1798
- )
1799
  sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
1800
- results = _mmaudio_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1801
- cfg_strength, num_steps, crossfade_s, crossfade_db, 1,
1802
- silent_video=clip_path,
1803
- segments_json=json.dumps(sub_segs))
 
 
1804
  seg_wavs, sr = results[0]
1805
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1806
  clip_dur, sr, sub_segs)
@@ -1823,18 +1843,13 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1823
 
1824
  def _run():
1825
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
1826
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1827
- clip_path = _extract_segment_clip(
1828
- _resolve_silent_video(meta), clip_start, clip_dur,
1829
- os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
1830
- )
1831
  sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
1832
- results = _hunyuan_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1833
  guidance_scale, num_steps, model_size,
1834
  crossfade_s, crossfade_db, 1,
1835
- silent_video=clip_path,
1836
- segments_json=json.dumps(sub_segs),
1837
- total_dur_s=clip_dur)
1838
  seg_wavs, sr, _ = results[0]
1839
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1840
  clip_dur, sr, sub_segs)
 
1015
  @spaces.GPU(duration=_mmaudio_duration)
1016
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1017
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
1018
+ silent_video, segments_json,
1019
+ clip_start_s=0.0, clip_dur_s=None):
1020
  """GPU-only MMAudio inference β€” model loading + flow-matching generation.
1021
  Returns list of (seg_audios, sr) per sample.
1022
 
1023
+ All video paths and segment data are passed explicitly as positional args
1024
+ to survive ZeroGPU process isolation (kwargs are silently dropped).
1025
+
1026
+ When *clip_dur_s* is set, *silent_video* is the full source and a clip
1027
+ [clip_start_s, clip_start_s+clip_dur_s] is extracted first inside the
1028
+ GPU window (ffmpeg is CPU-safe here). This avoids passing pre-extracted
1029
+ tmp files that don't exist in the GPU worker's process.
1030
  """
1031
  _ensure_syspath("MMAudio")
1032
  from mmaudio.eval_utils import generate, load_video
 
1040
 
1041
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1042
 
1043
+ # If a clip window is specified, extract it now (inside the GPU fn, so the
1044
+ # file exists in this worker's /tmp).
1045
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1046
+ if clip_dur_s is not None:
1047
+ clip_dur_s = float(clip_dur_s)
1048
+ clip_path = _extract_segment_clip(
1049
+ silent_video, float(clip_start_s), clip_dur_s,
1050
+ os.path.join(tmp_dir, "mma_xregen_clip.mp4"),
1051
+ )
1052
+ silent_video = clip_path
1053
+
1054
+ # Extract per-segment clips from silent_video (now the correct clip source).
1055
  segments = json.loads(segments_json)
 
1056
  seg_clip_paths = [
1057
  _extract_segment_clip(silent_video, s, e - s,
1058
  os.path.join(tmp_dir, f"mma_seg_{i}.mp4"))
 
1128
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1129
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1130
  num_samples,
1131
+ silent_video, json.dumps(segments))
 
1132
 
1133
  # ── CPU post-processing ──
1134
  # Resample 44100 → 48000 and normalise tuples to (seg_wavs, ...)
 
1177
  @spaces.GPU(duration=_hunyuan_duration)
1178
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1179
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1180
+ num_samples, silent_video, segments_json, total_dur_s,
1181
+ clip_start_s=0.0, clip_dur_s=None):
1182
  """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1183
  Returns list of (seg_wavs, sr, text_feats) per sample.
1184
 
1185
+ All paths passed explicitly as positional args to survive ZeroGPU isolation.
1186
+ When *clip_dur_s* is set, the clip is extracted inside the GPU window.
1187
  """
1188
  _ensure_syspath("HunyuanVideo-Foley")
1189
  from hunyuanvideo_foley.utils.model_utils import denoise_process
 
1200
 
1201
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1202
 
1203
+ # Extract xregen clip inside GPU fn if needed (tmp files from caller invisible here).
1204
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1205
+ if clip_dur_s is not None:
1206
+ clip_dur_s = float(clip_dur_s)
1207
+ clip_path = _extract_segment_clip(
1208
+ silent_video, float(clip_start_s), clip_dur_s,
1209
+ os.path.join(tmp_dir, "hny_xregen_clip.mp4"),
1210
+ )
1211
+ silent_video = clip_path
1212
+ total_dur_s = clip_dur_s
1213
+
1214
  segments = json.loads(segments_json)
 
1215
  dummy_seg_path = _extract_segment_clip(
1216
  silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
1217
  os.path.join(tmp_dir, "_seg_dummy.mp4"),
 
1290
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1291
  guidance_scale, num_steps, model_size,
1292
  crossfade_s, crossfade_db, num_samples,
1293
+ silent_video, json.dumps(segments), total_dur_s)
 
 
1294
 
1295
  # ── CPU post-processing (no GPU needed) ──
1296
  def _hunyuan_extras(sample_idx, result, td):
 
1813
 
1814
  def _run():
1815
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
1816
+ source_video = _resolve_silent_video(meta)
 
 
 
 
1817
  sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
1818
+ # Pass clip_start_s/clip_dur_s so the GPU fn extracts the clip internally —
1819
+ # pre-extracted tmp files are invisible to the ZeroGPU worker process.
1820
+ results = _mmaudio_gpu_infer(source_video, prompt, negative_prompt, seed_val,
1821
+ cfg_strength, num_steps, crossfade_s, crossfade_db, 1,
1822
+ source_video, json.dumps(sub_segs),
1823
+ clip_start, clip_dur)
1824
  seg_wavs, sr = results[0]
1825
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1826
  clip_dur, sr, sub_segs)
 
1843
 
1844
  def _run():
1845
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
1846
+ source_video = _resolve_silent_video(meta)
 
 
 
 
1847
  sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
1848
+ results = _hunyuan_gpu_infer(source_video, prompt, negative_prompt, seed_val,
1849
  guidance_scale, num_steps, model_size,
1850
  crossfade_s, crossfade_db, 1,
1851
+ source_video, json.dumps(sub_segs), clip_dur,
1852
+ clip_start, clip_dur)
 
1853
  seg_wavs, sr, _ = results[0]
1854
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1855
  clip_dur, sr, sub_segs)