BoxOfColors Claude Sonnet 4.6 committed on
Commit
4d5093e
·
1 Parent(s): b3f7f32

Fix ZeroGPU isolation for MMAudio/HunyuanFoley regen + waveform contact-edge fix

Browse files

- Pass silent_video, segments_json, total_dur_s as explicit params to
_mmaudio_gpu_infer and _hunyuan_gpu_infer; extract segment clips inside
the GPU fn (ffmpeg is CPU-safe inside GPU window). Removes _ctx_store/
_ctx_load for both models β€” root cause of xregen GPU task aborts.
- xregen_mmaudio/xregen_hunyuan pass params directly without _ctx_store.
- Fix waveform color boundaries: use contact edges (seg[i][1]+seg[i+1][0])/2
instead of seg[i+1][0]+crossfade/2 (wrong with equal-spacing algo).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +69 -79
app.py CHANGED
@@ -994,7 +994,8 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
994
 
995
 
996
  def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
997
- cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
 
998
  """Pre-GPU callable β€” must match _mmaudio_gpu_infer's input order exactly."""
999
  return _estimate_gpu_duration("mmaudio", int(num_samples), int(num_steps),
1000
  video_file=video_file, crossfade_s=crossfade_s)
@@ -1002,9 +1003,15 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
1002
 
1003
  @spaces.GPU(duration=_mmaudio_duration)
1004
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1005
- cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
 
1006
  """GPU-only MMAudio inference β€” model loading + flow-matching generation.
1007
- Returns list of (seg_audios, sr) per sample."""
 
 
 
 
 
1008
  _ensure_syspath("MMAudio")
1009
  from mmaudio.eval_utils import generate, load_video
1010
  from mmaudio.model.flow_matching import FlowMatching
@@ -1017,9 +1024,14 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1017
 
1018
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1019
 
1020
- ctx = _ctx_load("mmaudio_gpu_infer")
1021
- segments = ctx["segments"]
1022
- seg_clip_paths = ctx["seg_clip_paths"]
 
 
 
 
 
1023
 
1024
  sr = seq_cfg.sampling_rate # 44100
1025
 
@@ -1086,17 +1098,12 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1086
  video_file, MMAUDIO_WINDOW, crossfade_s)
1087
  print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀8 s")
1088
 
1089
- seg_clip_paths = [
1090
- _extract_segment_clip(silent_video, s, e - s, os.path.join(tmp_dir, f"mma_seg_{i}.mp4"))
1091
- for i, (s, e) in enumerate(segments)
1092
- ]
1093
-
1094
- _ctx_store("mmaudio_gpu_infer", {"segments": segments, "seg_clip_paths": seg_clip_paths})
1095
-
1096
  # ── GPU inference only ──
1097
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1098
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1099
- num_samples)
 
 
1100
 
1101
  # ── CPU post-processing ──
1102
  # Resample 44100 β†’ 48000 and normalise tuples to (seg_wavs, ...)
@@ -1134,7 +1141,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1134
 
1135
  def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1136
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1137
- num_samples):
1138
  """Pre-GPU callable β€” must match _hunyuan_gpu_infer's input order exactly."""
1139
  return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
1140
  video_file=video_file, crossfade_s=crossfade_s)
@@ -1143,9 +1150,13 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1143
  @spaces.GPU(duration=_hunyuan_duration)
1144
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1145
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1146
- num_samples):
1147
  """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1148
- Returns list of (seg_wavs, sr, text_feats) per sample."""
 
 
 
 
1149
  _ensure_syspath("HunyuanVideo-Foley")
1150
  from hunyuanvideo_foley.utils.model_utils import denoise_process
1151
  from hunyuanvideo_foley.utils.feature_utils import feature_process
@@ -1153,6 +1164,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1153
  seed_val = _resolve_seed(seed_val)
1154
  num_samples = int(num_samples)
1155
  crossfade_s = float(crossfade_s)
 
1156
  set_global_seed(seed_val)
1157
 
1158
  device, _ = _get_device_and_dtype()
@@ -1160,11 +1172,18 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1160
 
1161
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1162
 
1163
- ctx = _ctx_load("hunyuan_gpu_infer")
1164
- segments = ctx["segments"]
1165
- total_dur_s = ctx["total_dur_s"]
1166
- dummy_seg_path = ctx["dummy_seg_path"]
1167
- seg_clip_paths = ctx["seg_clip_paths"]
 
 
 
 
 
 
 
1168
 
1169
  # Text feature extraction (GPU β€” runs once for all segments)
1170
  _, text_feats, _ = feature_process(
@@ -1230,27 +1249,13 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1230
  video_file, HUNYUAN_MAX_DUR, crossfade_s)
1231
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀15 s")
1232
 
1233
- # Pre-extract dummy segment for text feature extraction (ffmpeg, CPU)
1234
- dummy_seg_path = _extract_segment_clip(
1235
- silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
1236
- os.path.join(tmp_dir, "_seg_dummy.mp4"),
1237
- )
1238
-
1239
- # Pre-extract all segment clips (ffmpeg, CPU)
1240
- seg_clip_paths = [
1241
- _extract_segment_clip(silent_video, s, e - s, os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1242
- for i, (s, e) in enumerate(segments)
1243
- ]
1244
-
1245
- _ctx_store("hunyuan_gpu_infer", {
1246
- "segments": segments, "total_dur_s": total_dur_s,
1247
- "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
1248
- })
1249
-
1250
  # ── GPU inference only ──
1251
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1252
  guidance_scale, num_steps, model_size,
1253
- crossfade_s, crossfade_db, num_samples)
 
 
 
1254
 
1255
  # ── CPU post-processing (no GPU needed) ──
1256
  def _hunyuan_extras(sample_idx, result, td):
@@ -1753,19 +1758,11 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1753
  meta["silent_video"], clip_start, clip_dur,
1754
  os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
1755
  )
1756
- sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
1757
- seg_clip_paths = [
1758
- _extract_segment_clip(
1759
- clip_path, s, e - s,
1760
- os.path.join(tmp_dir, f"xregen_mma_sub_{i}.mp4"),
1761
- )
1762
- for i, (s, e) in enumerate(sub_segs)
1763
- ]
1764
- _ctx_store("mmaudio_gpu_infer", {
1765
- "segments": sub_segs, "seg_clip_paths": seg_clip_paths,
1766
- })
1767
- results = _mmaudio_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1768
- cfg_strength, num_steps, crossfade_s, crossfade_db, 1)
1769
  seg_wavs, sr = results[0]
1770
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1771
  clip_dur, sr, sub_segs)
@@ -1793,25 +1790,13 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1793
  meta["silent_video"], clip_start, clip_dur,
1794
  os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
1795
  )
1796
- sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
1797
- seg_clip_paths = [
1798
- _extract_segment_clip(
1799
- clip_path, s, e - s,
1800
- os.path.join(tmp_dir, f"xregen_hny_sub_{i}.mp4"),
1801
- )
1802
- for i, (s, e) in enumerate(sub_segs)
1803
- ]
1804
- dummy_seg_path = _extract_segment_clip(
1805
- clip_path, 0, min(clip_dur, HUNYUAN_MAX_DUR),
1806
- os.path.join(tmp_dir, "xregen_hny_dummy.mp4"),
1807
- )
1808
- _ctx_store("hunyuan_gpu_infer", {
1809
- "segments": sub_segs, "total_dur_s": clip_dur,
1810
- "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
1811
- })
1812
- results = _hunyuan_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1813
- guidance_scale, num_steps, model_size,
1814
- crossfade_s, crossfade_db, 1)
1815
  seg_wavs, sr, _ = results[0]
1816
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1817
  clip_dur, sr, sub_segs)
@@ -2066,14 +2051,19 @@ def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
2066
  ctx.fillStyle = '#1e1e2e';
2067
  ctx.fillRect(0, 0, W, H);
2068
 
 
 
 
 
 
 
 
 
2069
  segments.forEach(function(seg, idx) {{
2070
- // Color boundary = midpoint of the crossfade zone = where the blend is
2071
- // 50/50. This is also where the cut would land if crossfade were 0, and
2072
- // where the listener perceptually hears the transition to the next segment.
2073
- const x1 = (seg[0] / duration) * W;
2074
- const xEnd = idx + 1 < segments.length
2075
- ? ((segments[idx + 1][0] + crossfadeSec / 2) / duration) * W
2076
- : (seg[1] / duration) * W;
2077
  ctx.fillStyle = segColors[idx % segColors.length];
2078
  ctx.fillRect(x1, 0, xEnd - x1, H);
2079
  ctx.fillStyle = 'rgba(255,255,255,0.6)';
 
994
 
995
 
996
  def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
997
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
998
+ silent_video=None, segments_json=None):
999
  """Pre-GPU callable β€” must match _mmaudio_gpu_infer's input order exactly."""
1000
  return _estimate_gpu_duration("mmaudio", int(num_samples), int(num_steps),
1001
  video_file=video_file, crossfade_s=crossfade_s)
 
1003
 
1004
  @spaces.GPU(duration=_mmaudio_duration)
1005
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1006
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
1007
+ silent_video=None, segments_json=None):
1008
  """GPU-only MMAudio inference β€” model loading + flow-matching generation.
1009
+ Returns list of (seg_audios, sr) per sample.
1010
+
1011
+ *silent_video* and *segments_json* are passed explicitly to avoid
1012
+ cross-process shared-state (ZeroGPU isolation). Segment clips are
1013
+ extracted here via ffmpeg (CPU-safe inside GPU window).
1014
+ """
1015
  _ensure_syspath("MMAudio")
1016
  from mmaudio.eval_utils import generate, load_video
1017
  from mmaudio.model.flow_matching import FlowMatching
 
1024
 
1025
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1026
 
1027
+ # Extract segment clips inside GPU fn β€” ffmpeg is CPU-only, safe here.
1028
+ segments = json.loads(segments_json)
1029
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1030
+ seg_clip_paths = [
1031
+ _extract_segment_clip(silent_video, s, e - s,
1032
+ os.path.join(tmp_dir, f"mma_seg_{i}.mp4"))
1033
+ for i, (s, e) in enumerate(segments)
1034
+ ]
1035
 
1036
  sr = seq_cfg.sampling_rate # 44100
1037
 
 
1098
  video_file, MMAUDIO_WINDOW, crossfade_s)
1099
  print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀8 s")
1100
 
 
 
 
 
 
 
 
1101
  # ── GPU inference only ──
1102
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1103
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1104
+ num_samples,
1105
+ silent_video=silent_video,
1106
+ segments_json=json.dumps(segments))
1107
 
1108
  # ── CPU post-processing ──
1109
  # Resample 44100 β†’ 48000 and normalise tuples to (seg_wavs, ...)
 
1141
 
1142
  def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1143
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1144
+ num_samples, silent_video=None, segments_json=None, total_dur_s=None):
1145
  """Pre-GPU callable β€” must match _hunyuan_gpu_infer's input order exactly."""
1146
  return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
1147
  video_file=video_file, crossfade_s=crossfade_s)
 
1150
  @spaces.GPU(duration=_hunyuan_duration)
1151
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1152
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1153
+ num_samples, silent_video=None, segments_json=None, total_dur_s=None):
1154
  """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1155
+ Returns list of (seg_wavs, sr, text_feats) per sample.
1156
+
1157
+ *silent_video*, *segments_json*, and *total_dur_s* are passed explicitly
1158
+ to avoid cross-process shared-state under ZeroGPU isolation.
1159
+ """
1160
  _ensure_syspath("HunyuanVideo-Foley")
1161
  from hunyuanvideo_foley.utils.model_utils import denoise_process
1162
  from hunyuanvideo_foley.utils.feature_utils import feature_process
 
1164
  seed_val = _resolve_seed(seed_val)
1165
  num_samples = int(num_samples)
1166
  crossfade_s = float(crossfade_s)
1167
+ total_dur_s = float(total_dur_s)
1168
  set_global_seed(seed_val)
1169
 
1170
  device, _ = _get_device_and_dtype()
 
1172
 
1173
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1174
 
1175
+ # Extract segment clips inside GPU fn β€” ffmpeg is CPU-only, safe here.
1176
+ segments = json.loads(segments_json)
1177
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1178
+ dummy_seg_path = _extract_segment_clip(
1179
+ silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
1180
+ os.path.join(tmp_dir, "_seg_dummy.mp4"),
1181
+ )
1182
+ seg_clip_paths = [
1183
+ _extract_segment_clip(silent_video, s, e - s,
1184
+ os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1185
+ for i, (s, e) in enumerate(segments)
1186
+ ]
1187
 
1188
  # Text feature extraction (GPU β€” runs once for all segments)
1189
  _, text_feats, _ = feature_process(
 
1249
  video_file, HUNYUAN_MAX_DUR, crossfade_s)
1250
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀15 s")
1251
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1252
  # ── GPU inference only ──
1253
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1254
  guidance_scale, num_steps, model_size,
1255
+ crossfade_s, crossfade_db, num_samples,
1256
+ silent_video=silent_video,
1257
+ segments_json=json.dumps(segments),
1258
+ total_dur_s=total_dur_s)
1259
 
1260
  # ── CPU post-processing (no GPU needed) ──
1261
  def _hunyuan_extras(sample_idx, result, td):
 
1758
  meta["silent_video"], clip_start, clip_dur,
1759
  os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
1760
  )
1761
+ sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
1762
+ results = _mmaudio_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1763
+ cfg_strength, num_steps, crossfade_s, crossfade_db, 1,
1764
+ silent_video=clip_path,
1765
+ segments_json=json.dumps(sub_segs))
 
 
 
 
 
 
 
 
1766
  seg_wavs, sr = results[0]
1767
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1768
  clip_dur, sr, sub_segs)
 
1790
  meta["silent_video"], clip_start, clip_dur,
1791
  os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
1792
  )
1793
+ sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
1794
+ results = _hunyuan_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1795
+ guidance_scale, num_steps, model_size,
1796
+ crossfade_s, crossfade_db, 1,
1797
+ silent_video=clip_path,
1798
+ segments_json=json.dumps(sub_segs),
1799
+ total_dur_s=clip_dur)
 
 
 
 
 
 
 
 
 
 
 
 
1800
  seg_wavs, sr, _ = results[0]
1801
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1802
  clip_dur, sr, sub_segs)
 
2051
  ctx.fillStyle = '#1e1e2e';
2052
  ctx.fillRect(0, 0, W, H);
2053
 
2054
+ // Compute contact edges: midpoint of overlap between consecutive segments.
2055
+ // Each segment is colored from its left contact edge to its right contact edge.
2056
+ // First segment starts at 0; last segment ends at duration.
2057
+ const contactEdges = [];
2058
+ for (let i = 0; i < segments.length - 1; i++) {{
2059
+ contactEdges.push((segments[i][1] + segments[i+1][0]) / 2);
2060
+ }}
2061
+
2062
  segments.forEach(function(seg, idx) {{
2063
+ const x1 = idx === 0 ? 0 : (contactEdges[idx-1] / duration) * W;
2064
+ const xEnd = idx === segments.length - 1
2065
+ ? W
2066
+ : (contactEdges[idx] / duration) * W;
 
 
 
2067
  ctx.fillStyle = segColors[idx % segColors.length];
2068
  ctx.fillRect(x1, 0, xEnd - x1, H);
2069
  ctx.fillStyle = 'rgba(255,255,255,0.6)';