BoxOfColors Claude Sonnet 4.6 committed on
Commit
e152b28
·
1 Parent(s): 7592f82

fix: replace threading.local with caller-thread-id dict for ZeroGPU context passing

Browse files

ZeroGPU dispatches @spaces.GPU functions on its own worker thread pool, not
the Gradio handler thread. threading.local() values are invisible across
threads, causing AttributeError when the GPU worker reads the context.

Replace _tl (threading.local) with _CTX (dict keyed by caller thread ID)
+ _ctx_set/_ctx_get helpers. The caller writes context under (tid, key)
before invoking the GPU function; the GPU worker reads using the same tid
(ZeroGPU runs synchronously on behalf of the caller, so no concurrent write
occurs). Entries are popped on read to prevent unbounded growth.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +47 -28
app.py CHANGED
@@ -124,13 +124,32 @@ print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s
124
  # SHARED CONSTANTS / HELPERS #
125
  # ================================================================== #
126
 
127
- # Thread-local storage for CPU β†’ GPU context passing.
128
  # Replaces the fragile function-attribute pattern (_fn._cpu_ctx = {...}).
129
- # Each wrapper writes its context under a unique key before calling the
130
- # @spaces.GPU function; the GPU function reads it back. Using thread-local
131
- # storage means concurrent requests on different threads don't clobber
132
- # each other's context — the function-attribute approach was not thread-safe.
133
- _tl = threading.local()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  MAX_SLOTS = 8 # max parallel generation slots shown in UI
136
  MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video ≤ ~64 s at 8 s/seg)
@@ -788,8 +807,8 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
788
  from TARO.onset_util import extract_onset
789
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
790
 
791
- # Use pre-computed CPU results passed via thread-local storage
792
- ctx = _tl.taro_gen_ctx
793
  tmp_dir = ctx["tmp_dir"]
794
  silent_video = ctx["silent_video"]
795
  segments = ctx["segments"]
@@ -861,11 +880,11 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
861
  tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
862
  video_file, TARO_MODEL_DUR, crossfade_s)
863
 
864
- # Pass pre-computed CPU results to the GPU function via thread-local storage
865
- _tl.taro_gen_ctx = {
866
  "tmp_dir": tmp_dir, "silent_video": silent_video,
867
  "segments": segments, "total_dur_s": total_dur_s,
868
- }
869
 
870
  # ── GPU inference only ──
871
  results = _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
@@ -941,7 +960,7 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
941
 
942
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
943
 
944
- ctx = _tl.mmaudio_gen_ctx
945
  segments = ctx["segments"]
946
  seg_clip_paths = ctx["seg_clip_paths"]
947
 
@@ -1020,9 +1039,9 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1020
  for i, (s, e) in enumerate(segments)
1021
  ]
1022
 
1023
- _tl.mmaudio_gen_ctx = {
1024
  "segments": segments, "seg_clip_paths": seg_clip_paths,
1025
- }
1026
 
1027
  # ── GPU inference only ──
1028
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
@@ -1090,7 +1109,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1090
 
1091
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1092
 
1093
- ctx = _tl.hunyuan_gen_ctx
1094
  segments = ctx["segments"]
1095
  total_dur_s = ctx["total_dur_s"]
1096
  dummy_seg_path = ctx["dummy_seg_path"]
@@ -1174,10 +1193,10 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1174
  for i, (s, e) in enumerate(segments)
1175
  ]
1176
 
1177
- _tl.hunyuan_gen_ctx = {
1178
  "segments": segments, "total_dur_s": total_dur_s,
1179
  "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
1180
- }
1181
 
1182
  # ── GPU inference only ──
1183
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
@@ -1213,7 +1232,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1213
 
1214
  def _preload_taro_regen_ctx(meta: dict) -> dict:
1215
  """Pre-load TARO CAVP/onset features on CPU for regen.
1216
- Returns a dict for _tl.taro_regen_ctx (thread-local storage)."""
1217
  cavp_path = meta.get("cavp_path", "")
1218
  onset_path = meta.get("onset_path", "")
1219
  ctx = {}
@@ -1225,7 +1244,7 @@ def _preload_taro_regen_ctx(meta: dict) -> dict:
1225
 
1226
  def _preload_hunyuan_regen_ctx(meta: dict, seg_path: str) -> dict:
1227
  """Pre-load HunyuanFoley text features + segment path on CPU for regen.
1228
- Returns a dict for _tl.hunyuan_regen_ctx (thread-local storage)."""
1229
  ctx = {"seg_path": seg_path}
1230
  text_feats_path = meta.get("text_feats_path", "")
1231
  if text_feats_path and os.path.exists(text_feats_path):
@@ -1316,7 +1335,7 @@ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
1316
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
1317
 
1318
  # Use pre-loaded features from CPU wrapper (avoids np.load inside GPU window)
1319
- ctx = getattr(_tl, "taro_regen_ctx", {})
1320
  if "cavp" in ctx and "onset" in ctx:
1321
  print("[TARO regen] Using pre-loaded CAVP + onset features (CPU cache hit)")
1322
  cavp_feats = ctx["cavp"]
@@ -1354,7 +1373,7 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
1354
  seg_idx = int(seg_idx)
1355
 
1356
  # CPU: pre-load cached features so np.load doesn't happen inside GPU window
1357
- _tl.taro_regen_ctx = _preload_taro_regen_ctx(meta)
1358
 
1359
  # GPU: inference only
1360
  new_wav = _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
@@ -1396,7 +1415,7 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1396
  sr = seq_cfg.sampling_rate
1397
 
1398
  # Use pre-extracted segment clip from the CPU wrapper
1399
- seg_path = getattr(_tl, "mmaudio_regen_ctx", {}).get("seg_path")
1400
  assert seg_path, "[MMAudio regen] seg_path not set — wrapper must pre-extract segment clip"
1401
 
1402
  rng = torch.Generator(device=device)
@@ -1438,7 +1457,7 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1438
  meta["silent_video"], seg_start, seg_dur,
1439
  os.path.join(tmp_dir, "regen_seg.mp4"),
1440
  )
1441
- _tl.mmaudio_regen_ctx = {"seg_path": seg_path}
1442
 
1443
  # GPU: inference only
1444
  new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
@@ -1488,7 +1507,7 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1488
  set_global_seed(random.randint(0, 2**32 - 1))
1489
 
1490
  # Use pre-extracted segment clip + text_feats from CPU wrapper
1491
- ctx = getattr(_tl, "hunyuan_regen_ctx", {})
1492
  seg_path = ctx.get("seg_path")
1493
  assert seg_path, "[HunyuanFoley regen] seg_path not set — wrapper must pre-extract segment clip"
1494
 
@@ -1531,7 +1550,7 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1531
  meta["silent_video"], seg_start, seg_dur,
1532
  os.path.join(tmp_dir, "regen_seg.mp4"),
1533
  )
1534
- _tl.hunyuan_regen_ctx = _preload_hunyuan_regen_ctx(meta, seg_path)
1535
 
1536
  # GPU: inference only
1537
  new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
@@ -1631,7 +1650,7 @@ def xregen_taro(seg_idx, state_json, slot_id,
1631
  meta = json.loads(state_json)
1632
 
1633
  def _run():
1634
- _tl.taro_regen_ctx = _preload_taro_regen_ctx(meta)
1635
  wav = _regen_taro_gpu(None, seg_idx, state_json,
1636
  seed_val, cfg_scale, num_steps, mode,
1637
  crossfade_s, crossfade_db, slot_id)
@@ -1654,7 +1673,7 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1654
  meta["silent_video"], seg_start, seg_end - seg_start,
1655
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1656
  )
1657
- _tl.mmaudio_regen_ctx = {"seg_path": seg_path}
1658
  wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
1659
  prompt, negative_prompt, seed_val,
1660
  cfg_strength, num_steps,
@@ -1679,7 +1698,7 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1679
  meta["silent_video"], seg_start, seg_end - seg_start,
1680
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1681
  )
1682
- _tl.hunyuan_regen_ctx = _preload_hunyuan_regen_ctx(meta, seg_path)
1683
  wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
1684
  prompt, negative_prompt, seed_val,
1685
  guidance_scale, num_steps, model_size,
 
124
  # SHARED CONSTANTS / HELPERS #
125
  # ================================================================== #
126
 
127
+ # Per-caller-thread context store for CPU β†’ GPU context passing.
128
  # Replaces the fragile function-attribute pattern (_fn._cpu_ctx = {...}).
129
+ #
130
+ # WHY NOT threading.local():
131
+ # ZeroGPU dispatches @spaces.GPU functions on its OWN worker thread, not
132
+ # the Gradio handler thread. threading.local() values are invisible across
133
+ # threads, so the GPU worker would always see an empty namespace.
134
+ #
135
+ # SOLUTION β€” a plain dict keyed by (caller_thread_id, context_name):
136
+ # The wrapper writes _CTX[(tid, key)] = value before calling the GPU fn.
137
+ # The GPU fn reads _CTX.get((tid, key)) β€” same tid because ZeroGPU runs
138
+ # the function synchronously on behalf of the calling thread (the caller
139
+ # blocks until the GPU task completes, so there is no concurrent write).
140
+ # Entries are deleted after the GPU fn reads them to avoid memory leaks.
141
+ _CTX: dict = {}
142
+ _CTX_LOCK = threading.Lock()
143
+
144
+ def _ctx_set(key: str, value) -> None:
145
+ tid = threading.get_ident()
146
+ with _CTX_LOCK:
147
+ _CTX[(tid, key)] = value
148
+
149
+ def _ctx_get(key: str, default=None):
150
+ tid = threading.get_ident()
151
+ with _CTX_LOCK:
152
+ return _CTX.pop((tid, key), default)
153
 
154
  MAX_SLOTS = 8 # max parallel generation slots shown in UI
155
  MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video ≤ ~64 s at 8 s/seg)
 
807
  from TARO.onset_util import extract_onset
808
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
809
 
810
+ # Use pre-computed CPU results passed via cross-thread context store
811
+ ctx = _ctx_get("taro_gen_ctx")
812
  tmp_dir = ctx["tmp_dir"]
813
  silent_video = ctx["silent_video"]
814
  segments = ctx["segments"]
 
880
  tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
881
  video_file, TARO_MODEL_DUR, crossfade_s)
882
 
883
+ # Pass pre-computed CPU results to the GPU function via cross-thread context store
884
+ _ctx_set("taro_gen_ctx", {
885
  "tmp_dir": tmp_dir, "silent_video": silent_video,
886
  "segments": segments, "total_dur_s": total_dur_s,
887
+ })
888
 
889
  # ── GPU inference only ──
890
  results = _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
 
960
 
961
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
962
 
963
+ ctx = _ctx_get("mmaudio_gen_ctx")
964
  segments = ctx["segments"]
965
  seg_clip_paths = ctx["seg_clip_paths"]
966
 
 
1039
  for i, (s, e) in enumerate(segments)
1040
  ]
1041
 
1042
+ _ctx_set("mmaudio_gen_ctx", {
1043
  "segments": segments, "seg_clip_paths": seg_clip_paths,
1044
+ })
1045
 
1046
  # ── GPU inference only ──
1047
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
 
1109
 
1110
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1111
 
1112
+ ctx = _ctx_get("hunyuan_gen_ctx")
1113
  segments = ctx["segments"]
1114
  total_dur_s = ctx["total_dur_s"]
1115
  dummy_seg_path = ctx["dummy_seg_path"]
 
1193
  for i, (s, e) in enumerate(segments)
1194
  ]
1195
 
1196
+ _ctx_set("hunyuan_gen_ctx", {
1197
  "segments": segments, "total_dur_s": total_dur_s,
1198
  "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
1199
+ })
1200
 
1201
  # ── GPU inference only ──
1202
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
 
1232
 
1233
  def _preload_taro_regen_ctx(meta: dict) -> dict:
1234
  """Pre-load TARO CAVP/onset features on CPU for regen.
1235
+ Returns a dict for _ctx_set("taro_regen_ctx", ...)."""
1236
  cavp_path = meta.get("cavp_path", "")
1237
  onset_path = meta.get("onset_path", "")
1238
  ctx = {}
 
1244
 
1245
  def _preload_hunyuan_regen_ctx(meta: dict, seg_path: str) -> dict:
1246
  """Pre-load HunyuanFoley text features + segment path on CPU for regen.
1247
+ Returns a dict for _ctx_set("hunyuan_regen_ctx", ...)."""
1248
  ctx = {"seg_path": seg_path}
1249
  text_feats_path = meta.get("text_feats_path", "")
1250
  if text_feats_path and os.path.exists(text_feats_path):
 
1335
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
1336
 
1337
  # Use pre-loaded features from CPU wrapper (avoids np.load inside GPU window)
1338
+ ctx = _ctx_get("taro_regen_ctx", {})
1339
  if "cavp" in ctx and "onset" in ctx:
1340
  print("[TARO regen] Using pre-loaded CAVP + onset features (CPU cache hit)")
1341
  cavp_feats = ctx["cavp"]
 
1373
  seg_idx = int(seg_idx)
1374
 
1375
  # CPU: pre-load cached features so np.load doesn't happen inside GPU window
1376
+ _ctx_set("taro_regen_ctx", _preload_taro_regen_ctx(meta))
1377
 
1378
  # GPU: inference only
1379
  new_wav = _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
 
1415
  sr = seq_cfg.sampling_rate
1416
 
1417
  # Use pre-extracted segment clip from the CPU wrapper
1418
+ seg_path = _ctx_get("mmaudio_regen_ctx", {}).get("seg_path")
1419
  assert seg_path, "[MMAudio regen] seg_path not set — wrapper must pre-extract segment clip"
1420
 
1421
  rng = torch.Generator(device=device)
 
1457
  meta["silent_video"], seg_start, seg_dur,
1458
  os.path.join(tmp_dir, "regen_seg.mp4"),
1459
  )
1460
+ _ctx_set("mmaudio_regen_ctx", {"seg_path": seg_path})
1461
 
1462
  # GPU: inference only
1463
  new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
 
1507
  set_global_seed(random.randint(0, 2**32 - 1))
1508
 
1509
  # Use pre-extracted segment clip + text_feats from CPU wrapper
1510
+ ctx = _ctx_get("hunyuan_regen_ctx", {})
1511
  seg_path = ctx.get("seg_path")
1512
  assert seg_path, "[HunyuanFoley regen] seg_path not set — wrapper must pre-extract segment clip"
1513
 
 
1550
  meta["silent_video"], seg_start, seg_dur,
1551
  os.path.join(tmp_dir, "regen_seg.mp4"),
1552
  )
1553
+ _ctx_set("hunyuan_regen_ctx", _preload_hunyuan_regen_ctx(meta, seg_path))
1554
 
1555
  # GPU: inference only
1556
  new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
 
1650
  meta = json.loads(state_json)
1651
 
1652
  def _run():
1653
+ _ctx_set("taro_regen_ctx", _preload_taro_regen_ctx(meta))
1654
  wav = _regen_taro_gpu(None, seg_idx, state_json,
1655
  seed_val, cfg_scale, num_steps, mode,
1656
  crossfade_s, crossfade_db, slot_id)
 
1673
  meta["silent_video"], seg_start, seg_end - seg_start,
1674
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1675
  )
1676
+ _ctx_set("mmaudio_regen_ctx", {"seg_path": seg_path})
1677
  wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
1678
  prompt, negative_prompt, seed_val,
1679
  cfg_strength, num_steps,
 
1698
  meta["silent_video"], seg_start, seg_end - seg_start,
1699
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1700
  )
1701
+ _ctx_set("hunyuan_regen_ctx", _preload_hunyuan_regen_ctx(meta, seg_path))
1702
  wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
1703
  prompt, negative_prompt, seed_val,
1704
  guidance_scale, num_steps, model_size,