BoxOfColors Claude Sonnet 4.6 committed on
Commit
ac67bf3
·
1 Parent(s): 4c173d1

fix: use UUID-keyed global dict + ctx_key param for ZeroGPU context passing

Browse files

ctx_json arg was being set to None by ZeroGPU because the duration callable
signature didn't include it — ZeroGPU validates/forwards args against the
duration fn signature and silently drops any extra params not present there.

Fix: add ctx_key='' to ALL duration callables (not just GPU fns) so the
param survives ZeroGPU's arg-forwarding pipeline. Use a UUID-keyed global
dict (_GPU_CTX) instead of JSON-encoding the full context — the UUID is a
tiny hex string that round-trips safely through any arg marshalling, and the
global dict is readable from any thread (unlike threading.local).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +63 -35
app.py CHANGED
@@ -124,10 +124,35 @@ print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s
124
  # SHARED CONSTANTS / HELPERS #
125
  # ================================================================== #
126
 
127
- # CPU β†’ GPU context passing: each wrapper serialises pre-computed CPU data
128
- # into a JSON string and passes it as the last argument (ctx_json) to the
129
- # @spaces.GPU function. ZeroGPU forwards all arguments to the GPU worker
130
- # unchanged, so no shared state or thread-local tricks are needed.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  MAX_SLOTS = 8 # max parallel generation slots shown in UI
133
  MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video ≀ ~64 s at 8 s/seg)
@@ -552,7 +577,7 @@ def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: floa
552
 
553
 
554
  def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
555
- crossfade_s, crossfade_db, num_samples):
556
  """Pre-GPU callable β€” must match _taro_gpu_infer's input order exactly."""
557
  return _estimate_gpu_duration("taro", int(num_samples), int(num_steps),
558
  video_file=video_file, crossfade_s=crossfade_s)
@@ -769,7 +794,7 @@ def _cpu_preprocess(video_file: str, model_dur: float,
769
 
770
  @spaces.GPU(duration=_taro_duration)
771
  def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
772
- crossfade_s, crossfade_db, num_samples, ctx_json="{}"):
773
  """GPU-only TARO inference β€” model loading + feature extraction + diffusion.
774
  Returns list of (wavs_list, onset_feats) per sample."""
775
  seed_val = int(seed_val)
@@ -785,7 +810,7 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
785
  from TARO.onset_util import extract_onset
786
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
787
 
788
- ctx = json.loads(ctx_json)
789
  tmp_dir = ctx["tmp_dir"]
790
  silent_video = ctx["silent_video"]
791
  segments = ctx["segments"]
@@ -857,14 +882,14 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
857
  tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
858
  video_file, TARO_MODEL_DUR, crossfade_s)
859
 
860
- ctx_json = json.dumps({
861
  "tmp_dir": tmp_dir, "silent_video": silent_video,
862
  "segments": segments, "total_dur_s": total_dur_s,
863
  })
864
 
865
  # ── GPU inference only ──
866
  results = _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
867
- crossfade_s, crossfade_db, num_samples, ctx_json)
868
 
869
  # ── CPU post-processing (no GPU needed) ──
870
  # Upsample 16kHz β†’ 48kHz and normalise result tuples to (seg_wavs, ...)
@@ -913,7 +938,8 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
913
 
914
 
915
  def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
916
- cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
 
917
  """Pre-GPU callable β€” must match _mmaudio_gpu_infer's input order exactly."""
918
  return _estimate_gpu_duration("mmaudio", int(num_samples), int(num_steps),
919
  video_file=video_file, crossfade_s=crossfade_s)
@@ -922,7 +948,7 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
922
  @spaces.GPU(duration=_mmaudio_duration)
923
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
924
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
925
- ctx_json="{}"):
926
  """GPU-only MMAudio inference β€” model loading + flow-matching generation.
927
  Returns list of (seg_audios, sr) per sample."""
928
  _ensure_syspath("MMAudio")
@@ -937,7 +963,7 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
937
 
938
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
939
 
940
- ctx = json.loads(ctx_json)
941
  segments = ctx["segments"]
942
  seg_clip_paths = ctx["seg_clip_paths"]
943
 
@@ -1016,12 +1042,12 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1016
  for i, (s, e) in enumerate(segments)
1017
  ]
1018
 
1019
- ctx_json = json.dumps({"segments": segments, "seg_clip_paths": seg_clip_paths})
1020
 
1021
  # ── GPU inference only ──
1022
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1023
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1024
- num_samples, ctx_json)
1025
 
1026
  # ── CPU post-processing ──
1027
  # Resample 44100 β†’ 48000 and normalise tuples to (seg_wavs, ...)
@@ -1058,7 +1084,8 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1058
 
1059
 
1060
  def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1061
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
 
1062
  """Pre-GPU callable β€” must match _hunyuan_gpu_infer's input order exactly."""
1063
  return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
1064
  video_file=video_file, crossfade_s=crossfade_s)
@@ -1067,7 +1094,7 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1067
  @spaces.GPU(duration=_hunyuan_duration)
1068
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1069
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1070
- num_samples, ctx_json="{}"):
1071
  """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1072
  Returns list of (seg_wavs, sr, text_feats) per sample."""
1073
  _ensure_syspath("HunyuanVideo-Foley")
@@ -1086,7 +1113,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1086
 
1087
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1088
 
1089
- ctx = json.loads(ctx_json)
1090
  segments = ctx["segments"]
1091
  total_dur_s = ctx["total_dur_s"]
1092
  dummy_seg_path = ctx["dummy_seg_path"]
@@ -1170,7 +1197,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1170
  for i, (s, e) in enumerate(segments)
1171
  ]
1172
 
1173
- ctx_json = json.dumps({
1174
  "segments": segments, "total_dur_s": total_dur_s,
1175
  "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
1176
  })
@@ -1178,7 +1205,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1178
  # ── GPU inference only ──
1179
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1180
  guidance_scale, num_steps, model_size,
1181
- crossfade_s, crossfade_db, num_samples, ctx_json)
1182
 
1183
  # ── CPU post-processing (no GPU needed) ──
1184
  def _hunyuan_extras(sample_idx, result, td):
@@ -1258,7 +1285,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
1258
 
1259
  def _taro_regen_duration(video_file, seg_idx, seg_meta_json,
1260
  seed_val, cfg_scale, num_steps, mode,
1261
- crossfade_s, crossfade_db, slot_id=None):
1262
  # If cached CAVP/onset features exist, skip ~10s feature-extractor overhead
1263
  try:
1264
  meta = json.loads(seg_meta_json)
@@ -1278,7 +1305,7 @@ def _taro_regen_duration(video_file, seg_idx, seg_meta_json,
1278
  @spaces.GPU(duration=_taro_regen_duration)
1279
  def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
1280
  seed_val, cfg_scale, num_steps, mode,
1281
- crossfade_s, crossfade_db, slot_id=None):
1282
  """GPU-only TARO regen β€” returns new_wav for a single segment."""
1283
  meta = json.loads(seg_meta_json)
1284
  seg_idx = int(seg_idx)
@@ -1344,7 +1371,8 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
1344
 
1345
  def _mmaudio_regen_duration(video_file, seg_idx, seg_meta_json,
1346
  prompt, negative_prompt, seed_val,
1347
- cfg_strength, num_steps, crossfade_s, crossfade_db, slot_id=None):
 
1348
  return _estimate_regen_duration("mmaudio", int(num_steps))
1349
 
1350
 
@@ -1352,7 +1380,7 @@ def _mmaudio_regen_duration(video_file, seg_idx, seg_meta_json,
1352
  def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1353
  prompt, negative_prompt, seed_val,
1354
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1355
- slot_id=None, ctx_json="{}"):
1356
  """GPU-only MMAudio regen β€” returns (new_wav, sr) for a single segment."""
1357
  meta = json.loads(seg_meta_json)
1358
  seg_idx = int(seg_idx)
@@ -1368,7 +1396,7 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1368
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1369
  sr = seq_cfg.sampling_rate
1370
 
1371
- seg_path = json.loads(ctx_json).get("seg_path")
1372
  assert seg_path, "[MMAudio regen] seg_path not set β€” wrapper must pre-extract segment clip"
1373
 
1374
  rng = torch.Generator(device=device)
@@ -1410,13 +1438,13 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1410
  meta["silent_video"], seg_start, seg_dur,
1411
  os.path.join(tmp_dir, "regen_seg.mp4"),
1412
  )
1413
- ctx_json = json.dumps({"seg_path": seg_path})
1414
 
1415
  # GPU: inference only
1416
  new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1417
  prompt, negative_prompt, seed_val,
1418
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1419
- slot_id, ctx_json)
1420
 
1421
  # Resample to 48kHz if needed (MMAudio outputs at 44100 Hz)
1422
  if sr != TARGET_SR:
@@ -1435,7 +1463,7 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1435
  def _hunyuan_regen_duration(video_file, seg_idx, seg_meta_json,
1436
  prompt, negative_prompt, seed_val,
1437
  guidance_scale, num_steps, model_size,
1438
- crossfade_s, crossfade_db, slot_id=None):
1439
  return _estimate_regen_duration("hunyuan", int(num_steps))
1440
 
1441
 
@@ -1443,7 +1471,7 @@ def _hunyuan_regen_duration(video_file, seg_idx, seg_meta_json,
1443
  def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1444
  prompt, negative_prompt, seed_val,
1445
  guidance_scale, num_steps, model_size,
1446
- crossfade_s, crossfade_db, slot_id=None, ctx_json="{}"):
1447
  """GPU-only HunyuanFoley regen β€” returns (new_wav, sr) for a single segment."""
1448
  meta = json.loads(seg_meta_json)
1449
  seg_idx = int(seg_idx)
@@ -1460,7 +1488,7 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1460
 
1461
  set_global_seed(random.randint(0, 2**32 - 1))
1462
 
1463
- ctx = json.loads(ctx_json)
1464
  seg_path = ctx.get("seg_path")
1465
  assert seg_path, "[HunyuanFoley regen] seg_path not set β€” wrapper must pre-extract segment clip"
1466
 
@@ -1504,7 +1532,7 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1504
  meta["silent_video"], seg_start, seg_dur,
1505
  os.path.join(tmp_dir, "regen_seg.mp4"),
1506
  )
1507
- ctx_json = json.dumps({
1508
  "seg_path": seg_path,
1509
  "text_feats_path": meta.get("text_feats_path", ""),
1510
  })
@@ -1513,7 +1541,7 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1513
  new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1514
  prompt, negative_prompt, seed_val,
1515
  guidance_scale, num_steps, model_size,
1516
- crossfade_s, crossfade_db, slot_id, ctx_json)
1517
 
1518
  meta["sr"] = sr
1519
 
@@ -1630,11 +1658,11 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1630
  meta["silent_video"], seg_start, seg_end - seg_start,
1631
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1632
  )
1633
- ctx_json = json.dumps({"seg_path": seg_path})
1634
  wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
1635
  prompt, negative_prompt, seed_val,
1636
  cfg_strength, num_steps,
1637
- crossfade_s, crossfade_db, slot_id, ctx_json)
1638
  return wav, src_sr
1639
 
1640
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
@@ -1655,14 +1683,14 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1655
  meta["silent_video"], seg_start, seg_end - seg_start,
1656
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1657
  )
1658
- ctx_json = json.dumps({
1659
  "seg_path": seg_path,
1660
  "text_feats_path": meta.get("text_feats_path", ""),
1661
  })
1662
  wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
1663
  prompt, negative_prompt, seed_val,
1664
  guidance_scale, num_steps, model_size,
1665
- crossfade_s, crossfade_db, slot_id, ctx_json)
1666
  return wav, src_sr
1667
 
1668
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
 
124
  # SHARED CONSTANTS / HELPERS #
125
  # ================================================================== #
126
 
127
+ # CPU β†’ GPU context passing via UUID-keyed global store.
128
+ #
129
+ # ZeroGPU dispatches @spaces.GPU functions on its own worker thread, so
130
+ # threading.local() doesn't work. Passing context as a function argument
131
+ # is the right idea, but ZeroGPU validates args against the *duration*
132
+ # callable's signature β€” any extra param not present in the duration fn
133
+ # gets dropped or set to None before the GPU fn runs.
134
+ #
135
+ # Solution: add ctx_key="" to BOTH the duration fn AND the GPU fn.
136
+ # The wrapper stores the context dict in _GPU_CTX[uuid] and passes the
137
+ # uuid string as ctx_key. The GPU fn does _GPU_CTX.pop(ctx_key).
138
+ # Since the dict is global (not thread-local), the GPU worker thread can
139
+ # read it regardless of which thread wrote it. The uuid ensures
140
+ # concurrent requests don't collide.
141
+ import uuid as _uuid_mod
142
+ _GPU_CTX: dict = {}
143
+ _GPU_CTX_LOCK = threading.Lock()
144
+
145
+ def _ctx_store(data: dict) -> str:
146
+ """Store *data* in the global context dict; return the UUID key."""
147
+ key = _uuid_mod.uuid4().hex
148
+ with _GPU_CTX_LOCK:
149
+ _GPU_CTX[key] = data
150
+ return key
151
+
152
+ def _ctx_load(key: str) -> dict:
153
+ """Pop and return the context dict for *key*."""
154
+ with _GPU_CTX_LOCK:
155
+ return _GPU_CTX.pop(key, {})
156
 
157
  MAX_SLOTS = 8 # max parallel generation slots shown in UI
158
  MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video ≀ ~64 s at 8 s/seg)
 
577
 
578
 
579
  def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
580
+ crossfade_s, crossfade_db, num_samples, ctx_key=""):
581
  """Pre-GPU callable β€” must match _taro_gpu_infer's input order exactly."""
582
  return _estimate_gpu_duration("taro", int(num_samples), int(num_steps),
583
  video_file=video_file, crossfade_s=crossfade_s)
 
794
 
795
  @spaces.GPU(duration=_taro_duration)
796
  def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
797
+ crossfade_s, crossfade_db, num_samples, ctx_key=""):
798
  """GPU-only TARO inference β€” model loading + feature extraction + diffusion.
799
  Returns list of (wavs_list, onset_feats) per sample."""
800
  seed_val = int(seed_val)
 
810
  from TARO.onset_util import extract_onset
811
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
812
 
813
+ ctx = _ctx_load(ctx_key)
814
  tmp_dir = ctx["tmp_dir"]
815
  silent_video = ctx["silent_video"]
816
  segments = ctx["segments"]
 
882
  tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
883
  video_file, TARO_MODEL_DUR, crossfade_s)
884
 
885
+ ctx_key = _ctx_store({
886
  "tmp_dir": tmp_dir, "silent_video": silent_video,
887
  "segments": segments, "total_dur_s": total_dur_s,
888
  })
889
 
890
  # ── GPU inference only ──
891
  results = _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
892
+ crossfade_s, crossfade_db, num_samples, ctx_key)
893
 
894
  # ── CPU post-processing (no GPU needed) ──
895
  # Upsample 16kHz β†’ 48kHz and normalise result tuples to (seg_wavs, ...)
 
938
 
939
 
940
  def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
941
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
942
+ ctx_key=""):
943
  """Pre-GPU callable β€” must match _mmaudio_gpu_infer's input order exactly."""
944
  return _estimate_gpu_duration("mmaudio", int(num_samples), int(num_steps),
945
  video_file=video_file, crossfade_s=crossfade_s)
 
948
  @spaces.GPU(duration=_mmaudio_duration)
949
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
950
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
951
+ ctx_key=""):
952
  """GPU-only MMAudio inference β€” model loading + flow-matching generation.
953
  Returns list of (seg_audios, sr) per sample."""
954
  _ensure_syspath("MMAudio")
 
963
 
964
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
965
 
966
+ ctx = _ctx_load(ctx_key)
967
  segments = ctx["segments"]
968
  seg_clip_paths = ctx["seg_clip_paths"]
969
 
 
1042
  for i, (s, e) in enumerate(segments)
1043
  ]
1044
 
1045
+ ctx_key = _ctx_store({"segments": segments, "seg_clip_paths": seg_clip_paths})
1046
 
1047
  # ── GPU inference only ──
1048
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1049
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1050
+ num_samples, ctx_key)
1051
 
1052
  # ── CPU post-processing ──
1053
  # Resample 44100 β†’ 48000 and normalise tuples to (seg_wavs, ...)
 
1084
 
1085
 
1086
  def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1087
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1088
+ num_samples, ctx_key=""):
1089
  """Pre-GPU callable β€” must match _hunyuan_gpu_infer's input order exactly."""
1090
  return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
1091
  video_file=video_file, crossfade_s=crossfade_s)
 
1094
  @spaces.GPU(duration=_hunyuan_duration)
1095
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1096
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1097
+ num_samples, ctx_key=""):
1098
  """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1099
  Returns list of (seg_wavs, sr, text_feats) per sample."""
1100
  _ensure_syspath("HunyuanVideo-Foley")
 
1113
 
1114
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1115
 
1116
+ ctx = _ctx_load(ctx_key)
1117
  segments = ctx["segments"]
1118
  total_dur_s = ctx["total_dur_s"]
1119
  dummy_seg_path = ctx["dummy_seg_path"]
 
1197
  for i, (s, e) in enumerate(segments)
1198
  ]
1199
 
1200
+ ctx_key = _ctx_store({
1201
  "segments": segments, "total_dur_s": total_dur_s,
1202
  "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
1203
  })
 
1205
  # ── GPU inference only ──
1206
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1207
  guidance_scale, num_steps, model_size,
1208
+ crossfade_s, crossfade_db, num_samples, ctx_key)
1209
 
1210
  # ── CPU post-processing (no GPU needed) ──
1211
  def _hunyuan_extras(sample_idx, result, td):
 
1285
 
1286
  def _taro_regen_duration(video_file, seg_idx, seg_meta_json,
1287
  seed_val, cfg_scale, num_steps, mode,
1288
+ crossfade_s, crossfade_db, slot_id=None, ctx_key=""):
1289
  # If cached CAVP/onset features exist, skip ~10s feature-extractor overhead
1290
  try:
1291
  meta = json.loads(seg_meta_json)
 
1305
  @spaces.GPU(duration=_taro_regen_duration)
1306
  def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
1307
  seed_val, cfg_scale, num_steps, mode,
1308
+ crossfade_s, crossfade_db, slot_id=None, ctx_key=""):
1309
  """GPU-only TARO regen β€” returns new_wav for a single segment."""
1310
  meta = json.loads(seg_meta_json)
1311
  seg_idx = int(seg_idx)
 
1371
 
1372
  def _mmaudio_regen_duration(video_file, seg_idx, seg_meta_json,
1373
  prompt, negative_prompt, seed_val,
1374
+ cfg_strength, num_steps, crossfade_s, crossfade_db,
1375
+ slot_id=None, ctx_key=""):
1376
  return _estimate_regen_duration("mmaudio", int(num_steps))
1377
 
1378
 
 
1380
  def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1381
  prompt, negative_prompt, seed_val,
1382
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1383
+ slot_id=None, ctx_key=""):
1384
  """GPU-only MMAudio regen β€” returns (new_wav, sr) for a single segment."""
1385
  meta = json.loads(seg_meta_json)
1386
  seg_idx = int(seg_idx)
 
1396
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1397
  sr = seq_cfg.sampling_rate
1398
 
1399
+ seg_path = _ctx_load(ctx_key).get("seg_path")
1400
  assert seg_path, "[MMAudio regen] seg_path not set β€” wrapper must pre-extract segment clip"
1401
 
1402
  rng = torch.Generator(device=device)
 
1438
  meta["silent_video"], seg_start, seg_dur,
1439
  os.path.join(tmp_dir, "regen_seg.mp4"),
1440
  )
1441
+ ctx_key = _ctx_store({"seg_path": seg_path})
1442
 
1443
  # GPU: inference only
1444
  new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1445
  prompt, negative_prompt, seed_val,
1446
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1447
+ slot_id, ctx_key)
1448
 
1449
  # Resample to 48kHz if needed (MMAudio outputs at 44100 Hz)
1450
  if sr != TARGET_SR:
 
1463
  def _hunyuan_regen_duration(video_file, seg_idx, seg_meta_json,
1464
  prompt, negative_prompt, seed_val,
1465
  guidance_scale, num_steps, model_size,
1466
+ crossfade_s, crossfade_db, slot_id=None, ctx_key=""):
1467
  return _estimate_regen_duration("hunyuan", int(num_steps))
1468
 
1469
 
 
1471
  def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1472
  prompt, negative_prompt, seed_val,
1473
  guidance_scale, num_steps, model_size,
1474
+ crossfade_s, crossfade_db, slot_id=None, ctx_key=""):
1475
  """GPU-only HunyuanFoley regen β€” returns (new_wav, sr) for a single segment."""
1476
  meta = json.loads(seg_meta_json)
1477
  seg_idx = int(seg_idx)
 
1488
 
1489
  set_global_seed(random.randint(0, 2**32 - 1))
1490
 
1491
+ ctx = _ctx_load(ctx_key)
1492
  seg_path = ctx.get("seg_path")
1493
  assert seg_path, "[HunyuanFoley regen] seg_path not set β€” wrapper must pre-extract segment clip"
1494
 
 
1532
  meta["silent_video"], seg_start, seg_dur,
1533
  os.path.join(tmp_dir, "regen_seg.mp4"),
1534
  )
1535
+ ctx_key = _ctx_store({
1536
  "seg_path": seg_path,
1537
  "text_feats_path": meta.get("text_feats_path", ""),
1538
  })
 
1541
  new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1542
  prompt, negative_prompt, seed_val,
1543
  guidance_scale, num_steps, model_size,
1544
+ crossfade_s, crossfade_db, slot_id, ctx_key)
1545
 
1546
  meta["sr"] = sr
1547
 
 
1658
  meta["silent_video"], seg_start, seg_end - seg_start,
1659
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1660
  )
1661
+ ctx_key = _ctx_store({"seg_path": seg_path})
1662
  wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
1663
  prompt, negative_prompt, seed_val,
1664
  cfg_strength, num_steps,
1665
+ crossfade_s, crossfade_db, slot_id, ctx_key)
1666
  return wav, src_sr
1667
 
1668
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
 
1683
  meta["silent_video"], seg_start, seg_end - seg_start,
1684
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1685
  )
1686
+ ctx_key = _ctx_store({
1687
  "seg_path": seg_path,
1688
  "text_feats_path": meta.get("text_feats_path", ""),
1689
  })
1690
  wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
1691
  prompt, negative_prompt, seed_val,
1692
  guidance_scale, num_steps, model_size,
1693
+ crossfade_s, crossfade_db, slot_id, ctx_key)
1694
  return wav, src_sr
1695
 
1696
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)