BoxOfColors committed on
Commit
d5399ac
·
1 Parent(s): e7175d4

Fix regen GPU fns: move seg clip extraction inside GPU scope

Browse files

_regen_mmaudio_gpu and _regen_hunyuan_gpu now call _extract_segment_clip
internally (pure ffmpeg, safe within GPU window) instead of relying on
_ctx_store/_ctx_load cross-process context passing.

Removes redundant _extract_segment_clip + _ctx_store calls from CPU
wrappers: regen_mmaudio_segment, regen_hunyuan_segment, xregen_mmaudio,
xregen_hunyuan.

Files changed (1) hide show
  1. app.py +16 -48
app.py CHANGED
@@ -1389,8 +1389,12 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1389
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1390
  sr = seq_cfg.sampling_rate
1391
 
1392
- seg_path = _ctx_load("regen_mmaudio_gpu").get("seg_path")
1393
- assert seg_path, "[MMAudio regen] seg_path not set wrapper must pre-extract segment clip"
 
 
 
 
1394
 
1395
  rng = torch.Generator(device=device)
1396
  rng.manual_seed(random.randint(0, 2**32 - 1))
@@ -1422,18 +1426,8 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1422
  """Regenerate one MMAudio segment. GPU inference + CPU splice/save."""
1423
  meta = json.loads(seg_meta_json)
1424
  seg_idx = int(seg_idx)
1425
- seg_start, seg_end = meta["segments"][seg_idx]
1426
- seg_dur = seg_end - seg_start
1427
 
1428
- # CPU: pre-extract segment clip
1429
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1430
- seg_path = _extract_segment_clip(
1431
- meta["silent_video"], seg_start, seg_dur,
1432
- os.path.join(tmp_dir, "regen_seg.mp4"),
1433
- )
1434
- _ctx_store("regen_mmaudio_gpu", {"seg_path": seg_path})
1435
-
1436
- # GPU: inference only
1437
  new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1438
  prompt, negative_prompt, seed_val,
1439
  cfg_strength, num_steps, crossfade_s, crossfade_db,
@@ -1481,11 +1475,13 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1481
 
1482
  set_global_seed(random.randint(0, 2**32 - 1))
1483
 
1484
- ctx = _ctx_load("regen_hunyuan_gpu")
1485
- seg_path = ctx.get("seg_path")
1486
- assert seg_path, "[HunyuanFoley regen] seg_path not set — wrapper must pre-extract segment clip"
 
 
1487
 
1488
- text_feats_path = ctx.get("text_feats_path", "")
1489
  if text_feats_path and os.path.exists(text_feats_path):
1490
  print("[HunyuanFoley regen] Loading cached text features from disk")
1491
  from hunyuanvideo_foley.utils.feature_utils import encode_video_features
@@ -1516,21 +1512,8 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1516
  """Regenerate one HunyuanFoley segment. GPU inference + CPU splice/save."""
1517
  meta = json.loads(seg_meta_json)
1518
  seg_idx = int(seg_idx)
1519
- seg_start, seg_end = meta["segments"][seg_idx]
1520
- seg_dur = seg_end - seg_start
1521
 
1522
- # CPU: pre-extract segment clip + pre-load cached text features
1523
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1524
- seg_path = _extract_segment_clip(
1525
- meta["silent_video"], seg_start, seg_dur,
1526
- os.path.join(tmp_dir, "regen_seg.mp4"),
1527
- )
1528
- _ctx_store("regen_hunyuan_gpu", {
1529
- "seg_path": seg_path,
1530
- "text_feats_path": meta.get("text_feats_path", ""),
1531
- })
1532
-
1533
- # GPU: inference only
1534
  new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1535
  prompt, negative_prompt, seed_val,
1536
  guidance_scale, num_steps, model_size,
@@ -1643,15 +1626,9 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1643
  request: gr.Request = None):
1644
  """Cross-model regen: run MMAudio inference and splice into *slot_id*."""
1645
  seg_idx = int(seg_idx)
1646
- meta = json.loads(state_json)
1647
- seg_start, seg_end = meta["segments"][seg_idx]
1648
 
1649
  def _run():
1650
- seg_path = _extract_segment_clip(
1651
- meta["silent_video"], seg_start, seg_end - seg_start,
1652
- os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1653
- )
1654
- _ctx_store("regen_mmaudio_gpu", {"seg_path": seg_path})
1655
  wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
1656
  prompt, negative_prompt, seed_val,
1657
  cfg_strength, num_steps,
@@ -1668,18 +1645,9 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1668
  request: gr.Request = None):
1669
  """Cross-model regen: run HunyuanFoley inference and splice into *slot_id*."""
1670
  seg_idx = int(seg_idx)
1671
- meta = json.loads(state_json)
1672
- seg_start, seg_end = meta["segments"][seg_idx]
1673
 
1674
  def _run():
1675
- seg_path = _extract_segment_clip(
1676
- meta["silent_video"], seg_start, seg_end - seg_start,
1677
- os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1678
- )
1679
- _ctx_store("regen_hunyuan_gpu", {
1680
- "seg_path": seg_path,
1681
- "text_feats_path": meta.get("text_feats_path", ""),
1682
- })
1683
  wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
1684
  prompt, negative_prompt, seed_val,
1685
  guidance_scale, num_steps, model_size,
 
1389
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1390
  sr = seq_cfg.sampling_rate
1391
 
1392
+ # Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
1393
+ # This avoids any cross-process context passing that fails under ZeroGPU isolation.
1394
+ seg_path = _extract_segment_clip(
1395
+ meta["silent_video"], seg_start, seg_dur,
1396
+ os.path.join(tempfile.mkdtemp(), "regen_seg.mp4"),
1397
+ )
1398
 
1399
  rng = torch.Generator(device=device)
1400
  rng.manual_seed(random.randint(0, 2**32 - 1))
 
1426
  """Regenerate one MMAudio segment. GPU inference + CPU splice/save."""
1427
  meta = json.loads(seg_meta_json)
1428
  seg_idx = int(seg_idx)
 
 
1429
 
1430
+ # GPU: inference (segment clip extraction happens inside the GPU function)
 
 
 
 
 
 
 
 
1431
  new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1432
  prompt, negative_prompt, seed_val,
1433
  cfg_strength, num_steps, crossfade_s, crossfade_db,
 
1475
 
1476
  set_global_seed(random.randint(0, 2**32 - 1))
1477
 
1478
+ # Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
1479
+ seg_path = _extract_segment_clip(
1480
+ meta["silent_video"], seg_start, seg_dur,
1481
+ os.path.join(tempfile.mkdtemp(), "regen_seg.mp4"),
1482
+ )
1483
 
1484
+ text_feats_path = meta.get("text_feats_path", "")
1485
  if text_feats_path and os.path.exists(text_feats_path):
1486
  print("[HunyuanFoley regen] Loading cached text features from disk")
1487
  from hunyuanvideo_foley.utils.feature_utils import encode_video_features
 
1512
  """Regenerate one HunyuanFoley segment. GPU inference + CPU splice/save."""
1513
  meta = json.loads(seg_meta_json)
1514
  seg_idx = int(seg_idx)
 
 
1515
 
1516
+ # GPU: inference (segment clip extraction happens inside the GPU function)
 
 
 
 
 
 
 
 
 
 
 
1517
  new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1518
  prompt, negative_prompt, seed_val,
1519
  guidance_scale, num_steps, model_size,
 
1626
  request: gr.Request = None):
1627
  """Cross-model regen: run MMAudio inference and splice into *slot_id*."""
1628
  seg_idx = int(seg_idx)
 
 
1629
 
1630
  def _run():
1631
+ # Segment clip extraction happens inside _regen_mmaudio_gpu
 
 
 
 
1632
  wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
1633
  prompt, negative_prompt, seed_val,
1634
  cfg_strength, num_steps,
 
1645
  request: gr.Request = None):
1646
  """Cross-model regen: run HunyuanFoley inference and splice into *slot_id*."""
1647
  seg_idx = int(seg_idx)
 
 
1648
 
1649
  def _run():
1650
+ # Segment clip extraction happens inside _regen_hunyuan_gpu
 
 
 
 
 
 
 
1651
  wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
1652
  prompt, negative_prompt, seed_val,
1653
  guidance_scale, num_steps, model_size,