BoxOfColors Claude Sonnet 4.6 committed on
Commit
4d5093e
·
1 Parent(s): b3f7f32

Fix ZeroGPU isolation for MMAudio/HunyuanFoley regen + waveform contact-edge fix

Browse files

- Pass silent_video, segments_json, total_dur_s as explicit params to
_mmaudio_gpu_infer and _hunyuan_gpu_infer; extract segment clips inside
the GPU fn (ffmpeg is CPU-safe inside GPU window). Removes _ctx_store/
_ctx_load for both models β€” root cause of xregen GPU task aborts.
- xregen_mmaudio/xregen_hunyuan pass params directly without _ctx_store.
- Fix waveform color boundaries: use contact edges (seg[i][1]+seg[i+1][0])/2
instead of seg[i+1][0]+crossfade/2 (wrong with equal-spacing algo).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +69 -79
app.py CHANGED
@@ -994,7 +994,8 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
994
 
995
 
996
  def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
997
- cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
 
998
  """Pre-GPU callable β€” must match _mmaudio_gpu_infer's input order exactly."""
999
  return _estimate_gpu_duration("mmaudio", int(num_samples), int(num_steps),
1000
  video_file=video_file, crossfade_s=crossfade_s)
@@ -1002,9 +1003,15 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
1002
 
1003
  @spaces.GPU(duration=_mmaudio_duration)
1004
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1005
- cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
 
1006
  """GPU-only MMAudio inference β€” model loading + flow-matching generation.
1007
- Returns list of (seg_audios, sr) per sample."""
 
 
 
 
 
1008
  _ensure_syspath("MMAudio")
1009
  from mmaudio.eval_utils import generate, load_video
1010
  from mmaudio.model.flow_matching import FlowMatching
@@ -1017,9 +1024,14 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1017
 
1018
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1019
 
1020
- ctx = _ctx_load("mmaudio_gpu_infer")
1021
- segments = ctx["segments"]
1022
- seg_clip_paths = ctx["seg_clip_paths"]
 
 
 
 
 
1023
 
1024
  sr = seq_cfg.sampling_rate # 44100
1025
 
@@ -1086,17 +1098,12 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1086
  video_file, MMAUDIO_WINDOW, crossfade_s)
1087
  print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀8 s")
1088
 
1089
- seg_clip_paths = [
1090
- _extract_segment_clip(silent_video, s, e - s, os.path.join(tmp_dir, f"mma_seg_{i}.mp4"))
1091
- for i, (s, e) in enumerate(segments)
1092
- ]
1093
-
1094
- _ctx_store("mmaudio_gpu_infer", {"segments": segments, "seg_clip_paths": seg_clip_paths})
1095
-
1096
  # ── GPU inference only ──
1097
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1098
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1099
- num_samples)
 
 
1100
 
1101
  # ── CPU post-processing ──
1102
  # Resample 44100 β†’ 48000 and normalise tuples to (seg_wavs, ...)
@@ -1134,7 +1141,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1134
 
1135
  def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1136
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1137
- num_samples):
1138
  """Pre-GPU callable β€” must match _hunyuan_gpu_infer's input order exactly."""
1139
  return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
1140
  video_file=video_file, crossfade_s=crossfade_s)
@@ -1143,9 +1150,13 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1143
  @spaces.GPU(duration=_hunyuan_duration)
1144
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1145
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1146
- num_samples):
1147
  """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1148
- Returns list of (seg_wavs, sr, text_feats) per sample."""
 
 
 
 
1149
  _ensure_syspath("HunyuanVideo-Foley")
1150
  from hunyuanvideo_foley.utils.model_utils import denoise_process
1151
  from hunyuanvideo_foley.utils.feature_utils import feature_process
@@ -1153,6 +1164,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1153
  seed_val = _resolve_seed(seed_val)
1154
  num_samples = int(num_samples)
1155
  crossfade_s = float(crossfade_s)
 
1156
  set_global_seed(seed_val)
1157
 
1158
  device, _ = _get_device_and_dtype()
@@ -1160,11 +1172,18 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1160
 
1161
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1162
 
1163
- ctx = _ctx_load("hunyuan_gpu_infer")
1164
- segments = ctx["segments"]
1165
- total_dur_s = ctx["total_dur_s"]
1166
- dummy_seg_path = ctx["dummy_seg_path"]
1167
- seg_clip_paths = ctx["seg_clip_paths"]
 
 
 
 
 
 
 
1168
 
1169
  # Text feature extraction (GPU β€” runs once for all segments)
1170
  _, text_feats, _ = feature_process(
@@ -1230,27 +1249,13 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1230
  video_file, HUNYUAN_MAX_DUR, crossfade_s)
1231
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀15 s")
1232
 
1233
- # Pre-extract dummy segment for text feature extraction (ffmpeg, CPU)
1234
- dummy_seg_path = _extract_segment_clip(
1235
- silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
1236
- os.path.join(tmp_dir, "_seg_dummy.mp4"),
1237
- )
1238
-
1239
- # Pre-extract all segment clips (ffmpeg, CPU)
1240
- seg_clip_paths = [
1241
- _extract_segment_clip(silent_video, s, e - s, os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1242
- for i, (s, e) in enumerate(segments)
1243
- ]
1244
-
1245
- _ctx_store("hunyuan_gpu_infer", {
1246
- "segments": segments, "total_dur_s": total_dur_s,
1247
- "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
1248
- })
1249
-
1250
  # ── GPU inference only ──
1251
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1252
  guidance_scale, num_steps, model_size,
1253
- crossfade_s, crossfade_db, num_samples)
 
 
 
1254
 
1255
  # ── CPU post-processing (no GPU needed) ──
1256
  def _hunyuan_extras(sample_idx, result, td):
@@ -1753,19 +1758,11 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1753
  meta["silent_video"], clip_start, clip_dur,
1754
  os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
1755
  )
1756
- sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
1757
- seg_clip_paths = [
1758
- _extract_segment_clip(
1759
- clip_path, s, e - s,
1760
- os.path.join(tmp_dir, f"xregen_mma_sub_{i}.mp4"),
1761
- )
1762
- for i, (s, e) in enumerate(sub_segs)
1763
- ]
1764
- _ctx_store("mmaudio_gpu_infer", {
1765
- "segments": sub_segs, "seg_clip_paths": seg_clip_paths,
1766
- })
1767
- results = _mmaudio_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1768
- cfg_strength, num_steps, crossfade_s, crossfade_db, 1)
1769
  seg_wavs, sr = results[0]
1770
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1771
  clip_dur, sr, sub_segs)
@@ -1793,25 +1790,13 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1793
  meta["silent_video"], clip_start, clip_dur,
1794
  os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
1795
  )
1796
- sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
1797
- seg_clip_paths = [
1798
- _extract_segment_clip(
1799
- clip_path, s, e - s,
1800
- os.path.join(tmp_dir, f"xregen_hny_sub_{i}.mp4"),
1801
- )
1802
- for i, (s, e) in enumerate(sub_segs)
1803
- ]
1804
- dummy_seg_path = _extract_segment_clip(
1805
- clip_path, 0, min(clip_dur, HUNYUAN_MAX_DUR),
1806
- os.path.join(tmp_dir, "xregen_hny_dummy.mp4"),
1807
- )
1808
- _ctx_store("hunyuan_gpu_infer", {
1809
- "segments": sub_segs, "total_dur_s": clip_dur,
1810
- "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
1811
- })
1812
- results = _hunyuan_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1813
- guidance_scale, num_steps, model_size,
1814
- crossfade_s, crossfade_db, 1)
1815
  seg_wavs, sr, _ = results[0]
1816
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1817
  clip_dur, sr, sub_segs)
@@ -2066,14 +2051,19 @@ def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
2066
  ctx.fillStyle = '#1e1e2e';
2067
  ctx.fillRect(0, 0, W, H);
2068
 
 
 
 
 
 
 
 
 
2069
  segments.forEach(function(seg, idx) {{
2070
- // Color boundary = midpoint of the crossfade zone = where the blend is
2071
- // 50/50. This is also where the cut would land if crossfade were 0, and
2072
- // where the listener perceptually hears the transition to the next segment.
2073
- const x1 = (seg[0] / duration) * W;
2074
- const xEnd = idx + 1 < segments.length
2075
- ? ((segments[idx + 1][0] + crossfadeSec / 2) / duration) * W
2076
- : (seg[1] / duration) * W;
2077
  ctx.fillStyle = segColors[idx % segColors.length];
2078
  ctx.fillRect(x1, 0, xEnd - x1, H);
2079
  ctx.fillStyle = 'rgba(255,255,255,0.6)';
 
994
 
995
 
996
  def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
997
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
998
+ silent_video=None, segments_json=None):
999
  """Pre-GPU callable β€” must match _mmaudio_gpu_infer's input order exactly."""
1000
  return _estimate_gpu_duration("mmaudio", int(num_samples), int(num_steps),
1001
  video_file=video_file, crossfade_s=crossfade_s)
 
1003
 
1004
  @spaces.GPU(duration=_mmaudio_duration)
1005
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1006
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
1007
+ silent_video=None, segments_json=None):
1008
  """GPU-only MMAudio inference β€” model loading + flow-matching generation.
1009
+ Returns list of (seg_audios, sr) per sample.
1010
+
1011
+ *silent_video* and *segments_json* are passed explicitly to avoid
1012
+ cross-process shared-state (ZeroGPU isolation). Segment clips are
1013
+ extracted here via ffmpeg (CPU-safe inside GPU window).
1014
+ """
1015
  _ensure_syspath("MMAudio")
1016
  from mmaudio.eval_utils import generate, load_video
1017
  from mmaudio.model.flow_matching import FlowMatching
 
1024
 
1025
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1026
 
1027
+ # Extract segment clips inside GPU fn β€” ffmpeg is CPU-only, safe here.
1028
+ segments = json.loads(segments_json)
1029
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1030
+ seg_clip_paths = [
1031
+ _extract_segment_clip(silent_video, s, e - s,
1032
+ os.path.join(tmp_dir, f"mma_seg_{i}.mp4"))
1033
+ for i, (s, e) in enumerate(segments)
1034
+ ]
1035
 
1036
  sr = seq_cfg.sampling_rate # 44100
1037
 
 
1098
  video_file, MMAUDIO_WINDOW, crossfade_s)
1099
  print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀8 s")
1100
 
 
 
 
 
 
 
 
1101
  # ── GPU inference only ──
1102
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1103
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1104
+ num_samples,
1105
+ silent_video=silent_video,
1106
+ segments_json=json.dumps(segments))
1107
 
1108
  # ── CPU post-processing ──
1109
  # Resample 44100 β†’ 48000 and normalise tuples to (seg_wavs, ...)
 
1141
 
1142
  def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1143
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1144
+ num_samples, silent_video=None, segments_json=None, total_dur_s=None):
1145
  """Pre-GPU callable β€” must match _hunyuan_gpu_infer's input order exactly."""
1146
  return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
1147
  video_file=video_file, crossfade_s=crossfade_s)
 
1150
  @spaces.GPU(duration=_hunyuan_duration)
1151
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1152
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1153
+ num_samples, silent_video=None, segments_json=None, total_dur_s=None):
1154
  """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1155
+ Returns list of (seg_wavs, sr, text_feats) per sample.
1156
+
1157
+ *silent_video*, *segments_json*, and *total_dur_s* are passed explicitly
1158
+ to avoid cross-process shared-state under ZeroGPU isolation.
1159
+ """
1160
  _ensure_syspath("HunyuanVideo-Foley")
1161
  from hunyuanvideo_foley.utils.model_utils import denoise_process
1162
  from hunyuanvideo_foley.utils.feature_utils import feature_process
 
1164
  seed_val = _resolve_seed(seed_val)
1165
  num_samples = int(num_samples)
1166
  crossfade_s = float(crossfade_s)
1167
+ total_dur_s = float(total_dur_s)
1168
  set_global_seed(seed_val)
1169
 
1170
  device, _ = _get_device_and_dtype()
 
1172
 
1173
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1174
 
1175
+ # Extract segment clips inside GPU fn β€” ffmpeg is CPU-only, safe here.
1176
+ segments = json.loads(segments_json)
1177
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1178
+ dummy_seg_path = _extract_segment_clip(
1179
+ silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
1180
+ os.path.join(tmp_dir, "_seg_dummy.mp4"),
1181
+ )
1182
+ seg_clip_paths = [
1183
+ _extract_segment_clip(silent_video, s, e - s,
1184
+ os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1185
+ for i, (s, e) in enumerate(segments)
1186
+ ]
1187
 
1188
  # Text feature extraction (GPU β€” runs once for all segments)
1189
  _, text_feats, _ = feature_process(
 
1249
  video_file, HUNYUAN_MAX_DUR, crossfade_s)
1250
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀15 s")
1251
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1252
  # ── GPU inference only ──
1253
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1254
  guidance_scale, num_steps, model_size,
1255
+ crossfade_s, crossfade_db, num_samples,
1256
+ silent_video=silent_video,
1257
+ segments_json=json.dumps(segments),
1258
+ total_dur_s=total_dur_s)
1259
 
1260
  # ── CPU post-processing (no GPU needed) ──
1261
  def _hunyuan_extras(sample_idx, result, td):
 
1758
  meta["silent_video"], clip_start, clip_dur,
1759
  os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
1760
  )
1761
+ sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
1762
+ results = _mmaudio_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1763
+ cfg_strength, num_steps, crossfade_s, crossfade_db, 1,
1764
+ silent_video=clip_path,
1765
+ segments_json=json.dumps(sub_segs))
 
 
 
 
 
 
 
 
1766
  seg_wavs, sr = results[0]
1767
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1768
  clip_dur, sr, sub_segs)
 
1790
  meta["silent_video"], clip_start, clip_dur,
1791
  os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
1792
  )
1793
+ sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
1794
+ results = _hunyuan_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1795
+ guidance_scale, num_steps, model_size,
1796
+ crossfade_s, crossfade_db, 1,
1797
+ silent_video=clip_path,
1798
+ segments_json=json.dumps(sub_segs),
1799
+ total_dur_s=clip_dur)
 
 
 
 
 
 
 
 
 
 
 
 
1800
  seg_wavs, sr, _ = results[0]
1801
  wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1802
  clip_dur, sr, sub_segs)
 
2051
  ctx.fillStyle = '#1e1e2e';
2052
  ctx.fillRect(0, 0, W, H);
2053
 
2054
+ // Compute contact edges: midpoint of overlap between consecutive segments.
2055
+ // Each segment is colored from its left contact edge to its right contact edge.
2056
+ // First segment starts at 0; last segment ends at duration.
2057
+ const contactEdges = [];
2058
+ for (let i = 0; i < segments.length - 1; i++) {{
2059
+ contactEdges.push((segments[i][1] + segments[i+1][0]) / 2);
2060
+ }}
2061
+
2062
  segments.forEach(function(seg, idx) {{
2063
+ const x1 = idx === 0 ? 0 : (contactEdges[idx-1] / duration) * W;
2064
+ const xEnd = idx === segments.length - 1
2065
+ ? W
2066
+ : (contactEdges[idx] / duration) * W;
 
 
 
2067
  ctx.fillStyle = segColors[idx % segColors.length];
2068
  ctx.fillRect(x1, 0, xEnd - x1, H);
2069
  ctx.fillStyle = 'rgba(255,255,255,0.6)';