BoxOfColors committed on
Commit
e3d955b
·
1 Parent(s): 5cb2f31

Fix ZeroGPU tmp file isolation: extract clips inside GPU worker

Browse files

Pre-extracted tmp clips (/tmp/xregen_*_clip.mp4) created in the caller
process are invisible to the ZeroGPU GPU worker (separate process, fresh
/tmp). Fix: pass source_video + clip_start_s/clip_dur_s as positional
args; GPU fns extract the xregen clip internally before sub-segment clips.

Also convert all remaining silent_video/segments_json/total_dur_s kwargs
to positional args in gpu_infer calls (kwargs silently dropped by ZeroGPU).

Files changed (1) hide show
  1. app.py +49 -34
app.py CHANGED
@@ -1015,13 +1015,18 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
1015
  @spaces.GPU(duration=_mmaudio_duration)
1016
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1017
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
1018
- silent_video, segments_json):
 
1019
  """GPU-only MMAudio inference β€” model loading + flow-matching generation.
1020
  Returns list of (seg_audios, sr) per sample.
1021
 
1022
- *silent_video* and *segments_json* are passed explicitly to avoid
1023
- cross-process shared-state (ZeroGPU isolation). Segment clips are
1024
- extracted here via ffmpeg (CPU-safe inside GPU window).
 
 
 
 
1025
  """
1026
  _ensure_syspath("MMAudio")
1027
  from mmaudio.eval_utils import generate, load_video
@@ -1035,9 +1040,19 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1035
 
1036
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1037
 
1038
- # Extract segment clips inside GPU fn — ffmpeg is CPU-only, safe here.
 
 
 
 
 
 
 
 
 
 
 
1039
  segments = json.loads(segments_json)
1040
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1041
  seg_clip_paths = [
1042
  _extract_segment_clip(silent_video, s, e - s,
1043
  os.path.join(tmp_dir, f"mma_seg_{i}.mp4"))
@@ -1113,8 +1128,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1113
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1114
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1115
  num_samples,
1116
- silent_video=silent_video,
1117
- segments_json=json.dumps(segments))
1118
 
1119
  # ── CPU post-processing ──
1120
  # Resample 44100 → 48000 and normalise tuples to (seg_wavs, ...)
@@ -1163,12 +1177,13 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1163
  @spaces.GPU(duration=_hunyuan_duration)
1164
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1165
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1166
- num_samples, silent_video, segments_json, total_dur_s):
 
1167
  """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1168
  Returns list of (seg_wavs, sr, text_feats) per sample.
1169
 
1170
- *silent_video*, *segments_json*, and *total_dur_s* are passed explicitly
1171
- to avoid cross-process shared-state under ZeroGPU isolation.
1172
  """
1173
  _ensure_syspath("HunyuanVideo-Foley")
1174
  from hunyuanvideo_foley.utils.model_utils import denoise_process
@@ -1185,9 +1200,18 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1185
 
1186
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1187
 
1188
- # Extract segment clips inside GPU fn — ffmpeg is CPU-only, safe here.
 
 
 
 
 
 
 
 
 
 
1189
  segments = json.loads(segments_json)
1190
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1191
  dummy_seg_path = _extract_segment_clip(
1192
  silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
1193
  os.path.join(tmp_dir, "_seg_dummy.mp4"),
@@ -1266,9 +1290,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1266
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1267
  guidance_scale, num_steps, model_size,
1268
  crossfade_s, crossfade_db, num_samples,
1269
- silent_video=silent_video,
1270
- segments_json=json.dumps(segments),
1271
- total_dur_s=total_dur_s)
1272
 
1273
  # ── CPU post-processing (no GPU needed) ──
1274
  def _hunyuan_extras(sample_idx, result, td):
@@ -1791,16 +1813,14 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1791
 
1792
  def _run():
1793
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
1794
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1795
- clip_path = _extract_segment_clip(
1796
- _resolve_silent_video(meta), clip_start, clip_dur,
1797
- os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
1798
- )
1799
  sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
1800
- results = _mmaudio_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1801
- cfg_strength, num_steps, crossfade_s, crossfade_db, 1,
1802
- silent_video=clip_path,
1803
- segments_json=json.dumps(sub_segs))
 
 
1804
  seg_wavs, sr = results[0]
1805
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1806
  clip_dur, sr, sub_segs)
@@ -1823,18 +1843,13 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1823
 
1824
  def _run():
1825
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
1826
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1827
- clip_path = _extract_segment_clip(
1828
- _resolve_silent_video(meta), clip_start, clip_dur,
1829
- os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
1830
- )
1831
  sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
1832
- results = _hunyuan_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1833
  guidance_scale, num_steps, model_size,
1834
  crossfade_s, crossfade_db, 1,
1835
- silent_video=clip_path,
1836
- segments_json=json.dumps(sub_segs),
1837
- total_dur_s=clip_dur)
1838
  seg_wavs, sr, _ = results[0]
1839
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1840
  clip_dur, sr, sub_segs)
 
1015
  @spaces.GPU(duration=_mmaudio_duration)
1016
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1017
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
1018
+ silent_video, segments_json,
1019
+ clip_start_s=0.0, clip_dur_s=None):
1020
  """GPU-only MMAudio inference β€” model loading + flow-matching generation.
1021
  Returns list of (seg_audios, sr) per sample.
1022
 
1023
+ All video paths and segment data are passed explicitly as positional args
1024
+ to survive ZeroGPU process isolation (kwargs are silently dropped).
1025
+
1026
+ When *clip_dur_s* is set, *silent_video* is the full source and a clip
1027
+ [clip_start_s, clip_start_s+clip_dur_s] is extracted first inside the
1028
+ GPU window (ffmpeg is CPU-safe here). This avoids passing pre-extracted
1029
+ tmp files that don't exist in the GPU worker's process.
1030
  """
1031
  _ensure_syspath("MMAudio")
1032
  from mmaudio.eval_utils import generate, load_video
 
1040
 
1041
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1042
 
1043
+ # If a clip window is specified, extract it now (inside the GPU fn, so the
1044
+ # file exists in this worker's /tmp).
1045
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1046
+ if clip_dur_s is not None:
1047
+ clip_dur_s = float(clip_dur_s)
1048
+ clip_path = _extract_segment_clip(
1049
+ silent_video, float(clip_start_s), clip_dur_s,
1050
+ os.path.join(tmp_dir, "mma_xregen_clip.mp4"),
1051
+ )
1052
+ silent_video = clip_path
1053
+
1054
+ # Extract per-segment clips from silent_video (now the correct clip source).
1055
  segments = json.loads(segments_json)
 
1056
  seg_clip_paths = [
1057
  _extract_segment_clip(silent_video, s, e - s,
1058
  os.path.join(tmp_dir, f"mma_seg_{i}.mp4"))
 
1128
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1129
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1130
  num_samples,
1131
+ silent_video, json.dumps(segments))
 
1132
 
1133
  # ── CPU post-processing ──
1134
  # Resample 44100 → 48000 and normalise tuples to (seg_wavs, ...)
 
1177
  @spaces.GPU(duration=_hunyuan_duration)
1178
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1179
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1180
+ num_samples, silent_video, segments_json, total_dur_s,
1181
+ clip_start_s=0.0, clip_dur_s=None):
1182
  """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1183
  Returns list of (seg_wavs, sr, text_feats) per sample.
1184
 
1185
+ All paths passed explicitly as positional args to survive ZeroGPU isolation.
1186
+ When *clip_dur_s* is set, the clip is extracted inside the GPU window.
1187
  """
1188
  _ensure_syspath("HunyuanVideo-Foley")
1189
  from hunyuanvideo_foley.utils.model_utils import denoise_process
 
1200
 
1201
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1202
 
1203
+ # Extract xregen clip inside GPU fn if needed (tmp files from caller invisible here).
1204
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1205
+ if clip_dur_s is not None:
1206
+ clip_dur_s = float(clip_dur_s)
1207
+ clip_path = _extract_segment_clip(
1208
+ silent_video, float(clip_start_s), clip_dur_s,
1209
+ os.path.join(tmp_dir, "hny_xregen_clip.mp4"),
1210
+ )
1211
+ silent_video = clip_path
1212
+ total_dur_s = clip_dur_s
1213
+
1214
  segments = json.loads(segments_json)
 
1215
  dummy_seg_path = _extract_segment_clip(
1216
  silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
1217
  os.path.join(tmp_dir, "_seg_dummy.mp4"),
 
1290
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1291
  guidance_scale, num_steps, model_size,
1292
  crossfade_s, crossfade_db, num_samples,
1293
+ silent_video, json.dumps(segments), total_dur_s)
 
 
1294
 
1295
  # ── CPU post-processing (no GPU needed) ──
1296
  def _hunyuan_extras(sample_idx, result, td):
 
1813
 
1814
  def _run():
1815
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
1816
+ source_video = _resolve_silent_video(meta)
 
 
 
 
1817
  sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
1818
+ # Pass clip_start_s/clip_dur_s so the GPU fn extracts the clip internally —
1819
+ # pre-extracted tmp files are invisible to the ZeroGPU worker process.
1820
+ results = _mmaudio_gpu_infer(source_video, prompt, negative_prompt, seed_val,
1821
+ cfg_strength, num_steps, crossfade_s, crossfade_db, 1,
1822
+ source_video, json.dumps(sub_segs),
1823
+ clip_start, clip_dur)
1824
  seg_wavs, sr = results[0]
1825
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1826
  clip_dur, sr, sub_segs)
 
1843
 
1844
  def _run():
1845
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
1846
+ source_video = _resolve_silent_video(meta)
 
 
 
 
1847
  sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
1848
+ results = _hunyuan_gpu_infer(source_video, prompt, negative_prompt, seed_val,
1849
  guidance_scale, num_steps, model_size,
1850
  crossfade_s, crossfade_db, 1,
1851
+ source_video, json.dumps(sub_segs), clip_dur,
1852
+ clip_start, clip_dur)
 
1853
  seg_wavs, sr, _ = results[0]
1854
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1855
  clip_dur, sr, sub_segs)