fix: pass CPU context as ctx_json argument to @spaces.GPU functions
ZeroGPU runs GPU functions on its own worker thread pool — thread-local
storage and per-thread dicts both fail because the writer and reader are
on different threads with different thread IDs.
The only reliable approach: pass context as a JSON string argument.
ZeroGPU forwards all arguments to the GPU worker unchanged.
Changes:
- Add ctx_json='{}' parameter to all 6 @spaces.GPU functions
(_taro_gpu_infer, _mmaudio_gpu_infer, _hunyuan_gpu_infer,
_regen_taro_gpu, _regen_mmaudio_gpu, _regen_hunyuan_gpu)
- Each wrapper serialises its pre-computed data to json.dumps({...})
and passes it as the last positional argument
- GPU functions parse with json.loads(ctx_json)
- TARO/HunyuanFoley regen: numpy/tensor features loaded directly from
disk paths already stored in seg_meta_json — no pre-serialisation needed
- Remove dead _preload_taro_regen_ctx, _preload_hunyuan_regen_ctx helpers
- Remove _CTX/_ctx_set/_ctx_get infrastructure (replaced by arg passing)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
|
@@ -124,32 +124,10 @@ print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s
|
|
| 124 |
# SHARED CONSTANTS / HELPERS #
|
| 125 |
# ================================================================== #
|
| 126 |
|
| 127 |
-
#
|
| 128 |
-
#
|
| 129 |
-
#
|
| 130 |
-
#
|
| 131 |
-
# ZeroGPU dispatches @spaces.GPU functions on its OWN worker thread, not
|
| 132 |
-
# the Gradio handler thread. threading.local() values are invisible across
|
| 133 |
-
# threads, so the GPU worker would always see an empty namespace.
|
| 134 |
-
#
|
| 135 |
-
# SOLUTION β a plain dict keyed by (caller_thread_id, context_name):
|
| 136 |
-
# The wrapper writes _CTX[(tid, key)] = value before calling the GPU fn.
|
| 137 |
-
# The GPU fn reads _CTX.get((tid, key)) β same tid because ZeroGPU runs
|
| 138 |
-
# the function synchronously on behalf of the calling thread (the caller
|
| 139 |
-
# blocks until the GPU task completes, so there is no concurrent write).
|
| 140 |
-
# Entries are deleted after the GPU fn reads them to avoid memory leaks.
|
| 141 |
-
_CTX: dict = {}
|
| 142 |
-
_CTX_LOCK = threading.Lock()
|
| 143 |
-
|
| 144 |
-
def _ctx_set(key: str, value) -> None:
|
| 145 |
-
tid = threading.get_ident()
|
| 146 |
-
with _CTX_LOCK:
|
| 147 |
-
_CTX[(tid, key)] = value
|
| 148 |
-
|
| 149 |
-
def _ctx_get(key: str, default=None):
|
| 150 |
-
tid = threading.get_ident()
|
| 151 |
-
with _CTX_LOCK:
|
| 152 |
-
return _CTX.pop((tid, key), default)
|
| 153 |
|
| 154 |
MAX_SLOTS = 8 # max parallel generation slots shown in UI
|
| 155 |
MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video β€ ~64 s at 8 s/seg)
|
|
@@ -791,7 +769,7 @@ def _cpu_preprocess(video_file: str, model_dur: float,
|
|
| 791 |
|
| 792 |
@spaces.GPU(duration=_taro_duration)
|
| 793 |
def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 794 |
-
crossfade_s, crossfade_db, num_samples):
|
| 795 |
"""GPU-only TARO inference β model loading + feature extraction + diffusion.
|
| 796 |
Returns list of (wavs_list, onset_feats) per sample."""
|
| 797 |
seed_val = int(seed_val)
|
|
@@ -807,8 +785,7 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 807 |
from TARO.onset_util import extract_onset
|
| 808 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 809 |
|
| 810 |
-
|
| 811 |
-
ctx = _ctx_get("taro_gen_ctx")
|
| 812 |
tmp_dir = ctx["tmp_dir"]
|
| 813 |
silent_video = ctx["silent_video"]
|
| 814 |
segments = ctx["segments"]
|
|
@@ -880,15 +857,14 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 880 |
tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
|
| 881 |
video_file, TARO_MODEL_DUR, crossfade_s)
|
| 882 |
|
| 883 |
-
|
| 884 |
-
_ctx_set("taro_gen_ctx", {
|
| 885 |
"tmp_dir": tmp_dir, "silent_video": silent_video,
|
| 886 |
"segments": segments, "total_dur_s": total_dur_s,
|
| 887 |
})
|
| 888 |
|
| 889 |
# ββ GPU inference only ββ
|
| 890 |
results = _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 891 |
-
crossfade_s, crossfade_db, num_samples)
|
| 892 |
|
| 893 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 894 |
# Upsample 16kHz β 48kHz and normalise result tuples to (seg_wavs, ...)
|
|
@@ -945,7 +921,8 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
|
|
| 945 |
|
| 946 |
@spaces.GPU(duration=_mmaudio_duration)
|
| 947 |
def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 948 |
-
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples
|
|
|
|
| 949 |
"""GPU-only MMAudio inference β model loading + flow-matching generation.
|
| 950 |
Returns list of (seg_audios, sr) per sample."""
|
| 951 |
_ensure_syspath("MMAudio")
|
|
@@ -960,7 +937,7 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 960 |
|
| 961 |
net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
|
| 962 |
|
| 963 |
-
ctx =
|
| 964 |
segments = ctx["segments"]
|
| 965 |
seg_clip_paths = ctx["seg_clip_paths"]
|
| 966 |
|
|
@@ -1039,13 +1016,12 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 1039 |
for i, (s, e) in enumerate(segments)
|
| 1040 |
]
|
| 1041 |
|
| 1042 |
-
|
| 1043 |
-
"segments": segments, "seg_clip_paths": seg_clip_paths,
|
| 1044 |
-
})
|
| 1045 |
|
| 1046 |
# ββ GPU inference only ββ
|
| 1047 |
results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1048 |
-
cfg_strength, num_steps, crossfade_s, crossfade_db,
|
|
|
|
| 1049 |
|
| 1050 |
# ββ CPU post-processing ββ
|
| 1051 |
# Resample 44100 β 48000 and normalise tuples to (seg_wavs, ...)
|
|
@@ -1090,7 +1066,8 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
|
|
| 1090 |
|
| 1091 |
@spaces.GPU(duration=_hunyuan_duration)
|
| 1092 |
def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1093 |
-
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
|
|
|
| 1094 |
"""GPU-only HunyuanFoley inference β model loading + feature extraction + denoising.
|
| 1095 |
Returns list of (seg_wavs, sr, text_feats) per sample."""
|
| 1096 |
_ensure_syspath("HunyuanVideo-Foley")
|
|
@@ -1109,7 +1086,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 1109 |
|
| 1110 |
model_dict, cfg = _load_hunyuan_model(device, model_size)
|
| 1111 |
|
| 1112 |
-
ctx =
|
| 1113 |
segments = ctx["segments"]
|
| 1114 |
total_dur_s = ctx["total_dur_s"]
|
| 1115 |
dummy_seg_path = ctx["dummy_seg_path"]
|
|
@@ -1193,7 +1170,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1193 |
for i, (s, e) in enumerate(segments)
|
| 1194 |
]
|
| 1195 |
|
| 1196 |
-
|
| 1197 |
"segments": segments, "total_dur_s": total_dur_s,
|
| 1198 |
"dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
|
| 1199 |
})
|
|
@@ -1201,7 +1178,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1201 |
# ββ GPU inference only ββ
|
| 1202 |
results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1203 |
guidance_scale, num_steps, model_size,
|
| 1204 |
-
crossfade_s, crossfade_db, num_samples)
|
| 1205 |
|
| 1206 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 1207 |
def _hunyuan_extras(sample_idx, result, td):
|
|
@@ -1230,27 +1207,6 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1230 |
# 4. Returns (new_video_path, new_audio_path, updated_seg_meta, new_waveform_html)
|
| 1231 |
# ================================================================== #
|
| 1232 |
|
| 1233 |
-
def _preload_taro_regen_ctx(meta: dict) -> dict:
|
| 1234 |
-
"""Pre-load TARO CAVP/onset features on CPU for regen.
|
| 1235 |
-
Returns a dict for _ctx_set("taro_regen_ctx", ...)."""
|
| 1236 |
-
cavp_path = meta.get("cavp_path", "")
|
| 1237 |
-
onset_path = meta.get("onset_path", "")
|
| 1238 |
-
ctx = {}
|
| 1239 |
-
if cavp_path and os.path.exists(cavp_path) and onset_path and os.path.exists(onset_path):
|
| 1240 |
-
ctx["cavp"] = np.load(cavp_path)
|
| 1241 |
-
ctx["onset"] = np.load(onset_path)
|
| 1242 |
-
return ctx
|
| 1243 |
-
|
| 1244 |
-
|
| 1245 |
-
def _preload_hunyuan_regen_ctx(meta: dict, seg_path: str) -> dict:
|
| 1246 |
-
"""Pre-load HunyuanFoley text features + segment path on CPU for regen.
|
| 1247 |
-
Returns a dict for _ctx_set("hunyuan_regen_ctx", ...)."""
|
| 1248 |
-
ctx = {"seg_path": seg_path}
|
| 1249 |
-
text_feats_path = meta.get("text_feats_path", "")
|
| 1250 |
-
if text_feats_path and os.path.exists(text_feats_path):
|
| 1251 |
-
ctx["text_feats"] = torch.load(text_feats_path, map_location="cpu", weights_only=False)
|
| 1252 |
-
return ctx
|
| 1253 |
-
|
| 1254 |
|
| 1255 |
def _splice_and_save(new_wav, seg_idx, meta, slot_id):
|
| 1256 |
"""Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
|
|
@@ -1334,12 +1290,13 @@ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1334 |
_ensure_syspath("TARO")
|
| 1335 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 1336 |
|
| 1337 |
-
#
|
| 1338 |
-
|
| 1339 |
-
|
| 1340 |
-
|
| 1341 |
-
|
| 1342 |
-
|
|
|
|
| 1343 |
else:
|
| 1344 |
print("[TARO regen] Cache miss β re-extracting CAVP + onset features")
|
| 1345 |
from TARO.onset_util import extract_onset
|
|
@@ -1348,7 +1305,6 @@ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1348 |
tmp_dir = tempfile.mkdtemp()
|
| 1349 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
| 1350 |
onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
|
| 1351 |
-
# Free feature extractors before loading inference models
|
| 1352 |
del extract_cavp, onset_model
|
| 1353 |
if torch.cuda.is_available():
|
| 1354 |
torch.cuda.empty_cache()
|
|
@@ -1372,10 +1328,7 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1372 |
meta = json.loads(seg_meta_json)
|
| 1373 |
seg_idx = int(seg_idx)
|
| 1374 |
|
| 1375 |
-
#
|
| 1376 |
-
_ctx_set("taro_regen_ctx", _preload_taro_regen_ctx(meta))
|
| 1377 |
-
|
| 1378 |
-
# GPU: inference only
|
| 1379 |
new_wav = _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
| 1380 |
seed_val, cfg_scale, num_steps, mode,
|
| 1381 |
crossfade_s, crossfade_db, slot_id)
|
|
@@ -1398,7 +1351,8 @@ def _mmaudio_regen_duration(video_file, seg_idx, seg_meta_json,
|
|
| 1398 |
@spaces.GPU(duration=_mmaudio_regen_duration)
|
| 1399 |
def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
| 1400 |
prompt, negative_prompt, seed_val,
|
| 1401 |
-
cfg_strength, num_steps, crossfade_s, crossfade_db,
|
|
|
|
| 1402 |
"""GPU-only MMAudio regen β returns (new_wav, sr) for a single segment."""
|
| 1403 |
meta = json.loads(seg_meta_json)
|
| 1404 |
seg_idx = int(seg_idx)
|
|
@@ -1414,8 +1368,7 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1414 |
net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
|
| 1415 |
sr = seq_cfg.sampling_rate
|
| 1416 |
|
| 1417 |
-
|
| 1418 |
-
seg_path = _ctx_get("mmaudio_regen_ctx", {}).get("seg_path")
|
| 1419 |
assert seg_path, "[MMAudio regen] seg_path not set β wrapper must pre-extract segment clip"
|
| 1420 |
|
| 1421 |
rng = torch.Generator(device=device)
|
|
@@ -1457,12 +1410,13 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1457 |
meta["silent_video"], seg_start, seg_dur,
|
| 1458 |
os.path.join(tmp_dir, "regen_seg.mp4"),
|
| 1459 |
)
|
| 1460 |
-
|
| 1461 |
|
| 1462 |
# GPU: inference only
|
| 1463 |
new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
| 1464 |
prompt, negative_prompt, seed_val,
|
| 1465 |
-
cfg_strength, num_steps, crossfade_s, crossfade_db,
|
|
|
|
| 1466 |
|
| 1467 |
# Resample to 48kHz if needed (MMAudio outputs at 44100 Hz)
|
| 1468 |
if sr != TARGET_SR:
|
|
@@ -1489,7 +1443,7 @@ def _hunyuan_regen_duration(video_file, seg_idx, seg_meta_json,
|
|
| 1489 |
def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
| 1490 |
prompt, negative_prompt, seed_val,
|
| 1491 |
guidance_scale, num_steps, model_size,
|
| 1492 |
-
crossfade_s, crossfade_db, slot_id=None):
|
| 1493 |
"""GPU-only HunyuanFoley regen β returns (new_wav, sr) for a single segment."""
|
| 1494 |
meta = json.loads(seg_meta_json)
|
| 1495 |
seg_idx = int(seg_idx)
|
|
@@ -1506,16 +1460,16 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1506 |
|
| 1507 |
set_global_seed(random.randint(0, 2**32 - 1))
|
| 1508 |
|
| 1509 |
-
|
| 1510 |
-
ctx = _ctx_get("hunyuan_regen_ctx", {})
|
| 1511 |
seg_path = ctx.get("seg_path")
|
| 1512 |
assert seg_path, "[HunyuanFoley regen] seg_path not set β wrapper must pre-extract segment clip"
|
| 1513 |
|
| 1514 |
-
|
| 1515 |
-
|
|
|
|
| 1516 |
from hunyuanvideo_foley.utils.feature_utils import encode_video_features
|
| 1517 |
visual_feats, seg_audio_len = encode_video_features(seg_path, model_dict)
|
| 1518 |
-
text_feats =
|
| 1519 |
else:
|
| 1520 |
print("[HunyuanFoley regen] Cache miss β extracting text + visual features")
|
| 1521 |
visual_feats, text_feats, seg_audio_len = feature_process(
|
|
@@ -1550,13 +1504,16 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1550 |
meta["silent_video"], seg_start, seg_dur,
|
| 1551 |
os.path.join(tmp_dir, "regen_seg.mp4"),
|
| 1552 |
)
|
| 1553 |
-
|
|
|
|
|
|
|
|
|
|
| 1554 |
|
| 1555 |
# GPU: inference only
|
| 1556 |
new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
| 1557 |
prompt, negative_prompt, seed_val,
|
| 1558 |
guidance_scale, num_steps, model_size,
|
| 1559 |
-
crossfade_s, crossfade_db, slot_id)
|
| 1560 |
|
| 1561 |
meta["sr"] = sr
|
| 1562 |
|
|
@@ -1650,7 +1607,7 @@ def xregen_taro(seg_idx, state_json, slot_id,
|
|
| 1650 |
meta = json.loads(state_json)
|
| 1651 |
|
| 1652 |
def _run():
|
| 1653 |
-
|
| 1654 |
wav = _regen_taro_gpu(None, seg_idx, state_json,
|
| 1655 |
seed_val, cfg_scale, num_steps, mode,
|
| 1656 |
crossfade_s, crossfade_db, slot_id)
|
|
@@ -1673,11 +1630,11 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
|
|
| 1673 |
meta["silent_video"], seg_start, seg_end - seg_start,
|
| 1674 |
os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
|
| 1675 |
)
|
| 1676 |
-
|
| 1677 |
wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
|
| 1678 |
prompt, negative_prompt, seed_val,
|
| 1679 |
cfg_strength, num_steps,
|
| 1680 |
-
crossfade_s, crossfade_db, slot_id)
|
| 1681 |
return wav, src_sr
|
| 1682 |
|
| 1683 |
yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
|
|
@@ -1698,11 +1655,14 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
|
|
| 1698 |
meta["silent_video"], seg_start, seg_end - seg_start,
|
| 1699 |
os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
|
| 1700 |
)
|
| 1701 |
-
|
|
|
|
|
|
|
|
|
|
| 1702 |
wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
|
| 1703 |
prompt, negative_prompt, seed_val,
|
| 1704 |
guidance_scale, num_steps, model_size,
|
| 1705 |
-
crossfade_s, crossfade_db, slot_id)
|
| 1706 |
return wav, src_sr
|
| 1707 |
|
| 1708 |
yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
|
|
|
|
| 124 |
# SHARED CONSTANTS / HELPERS #
|
| 125 |
# ================================================================== #
|
| 126 |
|
| 127 |
+
# CPU β GPU context passing: each wrapper serialises pre-computed CPU data
|
| 128 |
+
# into a JSON string and passes it as the last argument (ctx_json) to the
|
| 129 |
+
# @spaces.GPU function. ZeroGPU forwards all arguments to the GPU worker
|
| 130 |
+
# unchanged, so no shared state or thread-local tricks are needed.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
MAX_SLOTS = 8 # max parallel generation slots shown in UI
|
| 133 |
MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video β€ ~64 s at 8 s/seg)
|
|
|
|
| 769 |
|
| 770 |
@spaces.GPU(duration=_taro_duration)
|
| 771 |
def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 772 |
+
crossfade_s, crossfade_db, num_samples, ctx_json="{}"):
|
| 773 |
"""GPU-only TARO inference β model loading + feature extraction + diffusion.
|
| 774 |
Returns list of (wavs_list, onset_feats) per sample."""
|
| 775 |
seed_val = int(seed_val)
|
|
|
|
| 785 |
from TARO.onset_util import extract_onset
|
| 786 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 787 |
|
| 788 |
+
ctx = json.loads(ctx_json)
|
|
|
|
| 789 |
tmp_dir = ctx["tmp_dir"]
|
| 790 |
silent_video = ctx["silent_video"]
|
| 791 |
segments = ctx["segments"]
|
|
|
|
| 857 |
tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
|
| 858 |
video_file, TARO_MODEL_DUR, crossfade_s)
|
| 859 |
|
| 860 |
+
ctx_json = json.dumps({
|
|
|
|
| 861 |
"tmp_dir": tmp_dir, "silent_video": silent_video,
|
| 862 |
"segments": segments, "total_dur_s": total_dur_s,
|
| 863 |
})
|
| 864 |
|
| 865 |
# ββ GPU inference only ββ
|
| 866 |
results = _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 867 |
+
crossfade_s, crossfade_db, num_samples, ctx_json)
|
| 868 |
|
| 869 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 870 |
# Upsample 16kHz β 48kHz and normalise result tuples to (seg_wavs, ...)
|
|
|
|
| 921 |
|
| 922 |
@spaces.GPU(duration=_mmaudio_duration)
|
| 923 |
def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 924 |
+
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
|
| 925 |
+
ctx_json="{}"):
|
| 926 |
"""GPU-only MMAudio inference β model loading + flow-matching generation.
|
| 927 |
Returns list of (seg_audios, sr) per sample."""
|
| 928 |
_ensure_syspath("MMAudio")
|
|
|
|
| 937 |
|
| 938 |
net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
|
| 939 |
|
| 940 |
+
ctx = json.loads(ctx_json)
|
| 941 |
segments = ctx["segments"]
|
| 942 |
seg_clip_paths = ctx["seg_clip_paths"]
|
| 943 |
|
|
|
|
| 1016 |
for i, (s, e) in enumerate(segments)
|
| 1017 |
]
|
| 1018 |
|
| 1019 |
+
ctx_json = json.dumps({"segments": segments, "seg_clip_paths": seg_clip_paths})
|
|
|
|
|
|
|
| 1020 |
|
| 1021 |
# ββ GPU inference only ββ
|
| 1022 |
results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1023 |
+
cfg_strength, num_steps, crossfade_s, crossfade_db,
|
| 1024 |
+
num_samples, ctx_json)
|
| 1025 |
|
| 1026 |
# ββ CPU post-processing ββ
|
| 1027 |
# Resample 44100 β 48000 and normalise tuples to (seg_wavs, ...)
|
|
|
|
| 1066 |
|
| 1067 |
@spaces.GPU(duration=_hunyuan_duration)
|
| 1068 |
def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1069 |
+
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1070 |
+
num_samples, ctx_json="{}"):
|
| 1071 |
"""GPU-only HunyuanFoley inference β model loading + feature extraction + denoising.
|
| 1072 |
Returns list of (seg_wavs, sr, text_feats) per sample."""
|
| 1073 |
_ensure_syspath("HunyuanVideo-Foley")
|
|
|
|
| 1086 |
|
| 1087 |
model_dict, cfg = _load_hunyuan_model(device, model_size)
|
| 1088 |
|
| 1089 |
+
ctx = json.loads(ctx_json)
|
| 1090 |
segments = ctx["segments"]
|
| 1091 |
total_dur_s = ctx["total_dur_s"]
|
| 1092 |
dummy_seg_path = ctx["dummy_seg_path"]
|
|
|
|
| 1170 |
for i, (s, e) in enumerate(segments)
|
| 1171 |
]
|
| 1172 |
|
| 1173 |
+
ctx_json = json.dumps({
|
| 1174 |
"segments": segments, "total_dur_s": total_dur_s,
|
| 1175 |
"dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
|
| 1176 |
})
|
|
|
|
| 1178 |
# ββ GPU inference only ββ
|
| 1179 |
results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1180 |
guidance_scale, num_steps, model_size,
|
| 1181 |
+
crossfade_s, crossfade_db, num_samples, ctx_json)
|
| 1182 |
|
| 1183 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 1184 |
def _hunyuan_extras(sample_idx, result, td):
|
|
|
|
| 1207 |
# 4. Returns (new_video_path, new_audio_path, updated_seg_meta, new_waveform_html)
|
| 1208 |
# ================================================================== #
|
| 1209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1210 |
|
| 1211 |
def _splice_and_save(new_wav, seg_idx, meta, slot_id):
|
| 1212 |
"""Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
|
|
|
|
| 1290 |
_ensure_syspath("TARO")
|
| 1291 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 1292 |
|
| 1293 |
+
# Load cached CAVP/onset features from .npy files (CPU I/O, fast, outside GPU budget)
|
| 1294 |
+
cavp_path = meta.get("cavp_path", "")
|
| 1295 |
+
onset_path = meta.get("onset_path", "")
|
| 1296 |
+
if cavp_path and os.path.exists(cavp_path) and onset_path and os.path.exists(onset_path):
|
| 1297 |
+
print("[TARO regen] Loading cached CAVP + onset features from disk")
|
| 1298 |
+
cavp_feats = np.load(cavp_path)
|
| 1299 |
+
onset_feats = np.load(onset_path)
|
| 1300 |
else:
|
| 1301 |
print("[TARO regen] Cache miss β re-extracting CAVP + onset features")
|
| 1302 |
from TARO.onset_util import extract_onset
|
|
|
|
| 1305 |
tmp_dir = tempfile.mkdtemp()
|
| 1306 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
| 1307 |
onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
|
|
|
|
| 1308 |
del extract_cavp, onset_model
|
| 1309 |
if torch.cuda.is_available():
|
| 1310 |
torch.cuda.empty_cache()
|
|
|
|
| 1328 |
meta = json.loads(seg_meta_json)
|
| 1329 |
seg_idx = int(seg_idx)
|
| 1330 |
|
| 1331 |
+
# GPU: inference β CAVP/onset features loaded from disk paths in seg_meta_json
|
|
|
|
|
|
|
|
|
|
| 1332 |
new_wav = _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
| 1333 |
seed_val, cfg_scale, num_steps, mode,
|
| 1334 |
crossfade_s, crossfade_db, slot_id)
|
|
|
|
| 1351 |
@spaces.GPU(duration=_mmaudio_regen_duration)
|
| 1352 |
def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
| 1353 |
prompt, negative_prompt, seed_val,
|
| 1354 |
+
cfg_strength, num_steps, crossfade_s, crossfade_db,
|
| 1355 |
+
slot_id=None, ctx_json="{}"):
|
| 1356 |
"""GPU-only MMAudio regen β returns (new_wav, sr) for a single segment."""
|
| 1357 |
meta = json.loads(seg_meta_json)
|
| 1358 |
seg_idx = int(seg_idx)
|
|
|
|
| 1368 |
net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
|
| 1369 |
sr = seq_cfg.sampling_rate
|
| 1370 |
|
| 1371 |
+
seg_path = json.loads(ctx_json).get("seg_path")
|
|
|
|
| 1372 |
assert seg_path, "[MMAudio regen] seg_path not set β wrapper must pre-extract segment clip"
|
| 1373 |
|
| 1374 |
rng = torch.Generator(device=device)
|
|
|
|
| 1410 |
meta["silent_video"], seg_start, seg_dur,
|
| 1411 |
os.path.join(tmp_dir, "regen_seg.mp4"),
|
| 1412 |
)
|
| 1413 |
+
ctx_json = json.dumps({"seg_path": seg_path})
|
| 1414 |
|
| 1415 |
# GPU: inference only
|
| 1416 |
new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
| 1417 |
prompt, negative_prompt, seed_val,
|
| 1418 |
+
cfg_strength, num_steps, crossfade_s, crossfade_db,
|
| 1419 |
+
slot_id, ctx_json)
|
| 1420 |
|
| 1421 |
# Resample to 48kHz if needed (MMAudio outputs at 44100 Hz)
|
| 1422 |
if sr != TARGET_SR:
|
|
|
|
| 1443 |
def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
| 1444 |
prompt, negative_prompt, seed_val,
|
| 1445 |
guidance_scale, num_steps, model_size,
|
| 1446 |
+
crossfade_s, crossfade_db, slot_id=None, ctx_json="{}"):
|
| 1447 |
"""GPU-only HunyuanFoley regen β returns (new_wav, sr) for a single segment."""
|
| 1448 |
meta = json.loads(seg_meta_json)
|
| 1449 |
seg_idx = int(seg_idx)
|
|
|
|
| 1460 |
|
| 1461 |
set_global_seed(random.randint(0, 2**32 - 1))
|
| 1462 |
|
| 1463 |
+
ctx = json.loads(ctx_json)
|
|
|
|
| 1464 |
seg_path = ctx.get("seg_path")
|
| 1465 |
assert seg_path, "[HunyuanFoley regen] seg_path not set β wrapper must pre-extract segment clip"
|
| 1466 |
|
| 1467 |
+
text_feats_path = ctx.get("text_feats_path", "")
|
| 1468 |
+
if text_feats_path and os.path.exists(text_feats_path):
|
| 1469 |
+
print("[HunyuanFoley regen] Loading cached text features from disk")
|
| 1470 |
from hunyuanvideo_foley.utils.feature_utils import encode_video_features
|
| 1471 |
visual_feats, seg_audio_len = encode_video_features(seg_path, model_dict)
|
| 1472 |
+
text_feats = torch.load(text_feats_path, map_location=device, weights_only=False)
|
| 1473 |
else:
|
| 1474 |
print("[HunyuanFoley regen] Cache miss β extracting text + visual features")
|
| 1475 |
visual_feats, text_feats, seg_audio_len = feature_process(
|
|
|
|
| 1504 |
meta["silent_video"], seg_start, seg_dur,
|
| 1505 |
os.path.join(tmp_dir, "regen_seg.mp4"),
|
| 1506 |
)
|
| 1507 |
+
ctx_json = json.dumps({
|
| 1508 |
+
"seg_path": seg_path,
|
| 1509 |
+
"text_feats_path": meta.get("text_feats_path", ""),
|
| 1510 |
+
})
|
| 1511 |
|
| 1512 |
# GPU: inference only
|
| 1513 |
new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
| 1514 |
prompt, negative_prompt, seed_val,
|
| 1515 |
guidance_scale, num_steps, model_size,
|
| 1516 |
+
crossfade_s, crossfade_db, slot_id, ctx_json)
|
| 1517 |
|
| 1518 |
meta["sr"] = sr
|
| 1519 |
|
|
|
|
| 1607 |
meta = json.loads(state_json)
|
| 1608 |
|
| 1609 |
def _run():
|
| 1610 |
+
# CAVP/onset features are loaded from disk paths inside the GPU fn
|
| 1611 |
wav = _regen_taro_gpu(None, seg_idx, state_json,
|
| 1612 |
seed_val, cfg_scale, num_steps, mode,
|
| 1613 |
crossfade_s, crossfade_db, slot_id)
|
|
|
|
| 1630 |
meta["silent_video"], seg_start, seg_end - seg_start,
|
| 1631 |
os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
|
| 1632 |
)
|
| 1633 |
+
ctx_json = json.dumps({"seg_path": seg_path})
|
| 1634 |
wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
|
| 1635 |
prompt, negative_prompt, seed_val,
|
| 1636 |
cfg_strength, num_steps,
|
| 1637 |
+
crossfade_s, crossfade_db, slot_id, ctx_json)
|
| 1638 |
return wav, src_sr
|
| 1639 |
|
| 1640 |
yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
|
|
|
|
| 1655 |
meta["silent_video"], seg_start, seg_end - seg_start,
|
| 1656 |
os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
|
| 1657 |
)
|
| 1658 |
+
ctx_json = json.dumps({
|
| 1659 |
+
"seg_path": seg_path,
|
| 1660 |
+
"text_feats_path": meta.get("text_feats_path", ""),
|
| 1661 |
+
})
|
| 1662 |
wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
|
| 1663 |
prompt, negative_prompt, seed_val,
|
| 1664 |
guidance_scale, num_steps, model_size,
|
| 1665 |
+
crossfade_s, crossfade_db, slot_id, ctx_json)
|
| 1666 |
return wav, src_sr
|
| 1667 |
|
| 1668 |
yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
|