Spaces:

hetchyy
/

quranic-universal-aligner

Running on Zero

App Files Files Community

hetchyy committed on Apr 14

Commit

d46a954

verified ·

1 Parent(s): 3955329

Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

app.py +23 -0
config.py +13 -11
src/api/session_api.py +87 -0
src/core/cpu_worker_pool.py +525 -0
src/core/zero_gpu.py +20 -6
src/ui/event_wiring.py +16 -0

app.py CHANGED Viewed

@@ -54,6 +54,29 @@ else:
 from src.ui.interface import build_interface
 # =============================================================================
 # Module-level demo for Gradio hot-reload (`gradio app.py`)
 # =============================================================================

 from src.ui.interface import build_interface
+# =============================================================================
+# Persistent CPU worker pool — spawn BEFORE any GPU use if enabled.
+# This keeps the workers free of any inherited CUDA/ZeroGPU state.
+#
+# The pool is bootstrapped only when CPU_STRATEGY == "subprocess" and
+# CPU_WORKER_MODE == "persistent", and only when IS_CPU_WORKER is falsy.
+# Any exception raised during bootstrap is printed and swallowed so that
+# importing this module still succeeds.
+# =============================================================================
+try:
+    # Config values are imported under underscore aliases so they do not
+    # collide with other module-level names in this file.
+    from config import (
+        CPU_STRATEGY as _CPU_STRATEGY,
+        CPU_WORKER_MODE as _CPU_WORKER_MODE,
+        CPU_SUBPROCESS_CONCURRENCY as _CPU_SUBPROCESS_CONCURRENCY,
+        CPU_POOL_PRELOAD_LARGE as _CPU_POOL_PRELOAD_LARGE,
+    )
+    from src.core.zero_gpu import IS_CPU_WORKER as _IS_CPU_WORKER
+    if (
+        _CPU_STRATEGY == "subprocess"
+        and _CPU_WORKER_MODE == "persistent"
+        and not _IS_CPU_WORKER
+    ):
+        print(f"[APP] Bootstrapping persistent CPU pool: {_CPU_SUBPROCESS_CONCURRENCY} worker(s), preload_large={_CPU_POOL_PRELOAD_LARGE}")
+        # Imported lazily so the pool module is only loaded when the
+        # persistent mode is actually selected.
+        from src.core.cpu_worker_pool import start_pool as _start_pool
+        # One worker per concurrency slot; start_pool() waits for each worker's
+        # ready signal, so this call blocks module import until boot finishes.
+        _start_pool(_CPU_SUBPROCESS_CONCURRENCY, preload_large=_CPU_POOL_PRELOAD_LARGE)
+except Exception as _e:
+    # Deliberately best-effort: a failed bootstrap must not stop the app.
+    print(f"[APP] Persistent CPU pool bootstrap failed (non-fatal): {_e}")
 # =============================================================================
 # Module-level demo for Gradio hot-reload (`gradio app.py`)
 # =============================================================================

config.py CHANGED Viewed

@@ -48,14 +48,21 @@ SESSION_EXPIRY_SECONDS = 3600*5              # 5 hours — matches DELETE_CACHE_
 CPU_STRATEGY = os.environ.get("CPU_STRATEGY", "subprocess").lower()
 # Max seconds a subprocess CPU job can run before SIGKILL (used by "subprocess" and "both" strategies).
-CPU_SUBPROCESS_TIMEOUT = int(os.environ.get("CPU_SUBPROCESS_TIMEOUT", str(3600 * 4)))
-# Max concurrent CPU subprocesses on the main Space. Each subprocess loads its
-# own copy of the VAD + ASR models (~3.6 GB RAM). On zero-a10g with 48 GB RAM
-# and 8 vCPU we can safely run 2–3; pushing higher risks OOM and CPU thrash
-# that would also slow GPU dispatches on main.
 CPU_SUBPROCESS_CONCURRENCY = int(os.environ.get("CPU_SUBPROCESS_CONCURRENCY", "2"))
 # Model dtype for CPU inference.
 #   "bfloat16" — default. Routes attention through PyTorch's chunked CPU flash
 #                kernel (`_scaled_dot_product_flash_attention_for_cpu`), which
@@ -171,12 +178,7 @@ INFERENCE_BATCH_SIZE = 32      # Fixed segments per batch (used when BATCHING_ST
 # Dynamic batching constraints
 MAX_BATCH_SECONDS = 600      # GPU: max total audio seconds per batch (sum of durations)
-# CPU: tighter cap. SDPA materialises the QK^T tensor per encoder layer; once
-# (batch * heads * seq^2 * 2B) exceeds the L3 cache (~32 MB on zero-a10g Xeon),
-# every attention layer becomes DRAM-bound instead of cache-bound — on a 22 min
-# m4a we saw one batch (59 segs × 12.5 s) hit a ~24× slowdown vs its neighbours.
-# 300 s keeps QK^T comfortably under L3 at realistic (batch, seq) combinations.
-MAX_BATCH_SECONDS_CPU = 300
 MAX_PAD_WASTE = 0.2          # Max fraction of padded tensor that is wasted (0=no waste, 1=all waste)
 MIN_BATCH_SIZE = 8           # Minimum segments per batch (prevents underutilization)

 CPU_STRATEGY = os.environ.get("CPU_STRATEGY", "subprocess").lower()
 # Max seconds a subprocess CPU job can run before SIGKILL (used by "subprocess" and "both" strategies).
+CPU_SUBPROCESS_TIMEOUT = int(os.environ.get("CPU_SUBPROCESS_TIMEOUT", str(3600 * 2)))
+# Max concurrent CPU subprocesses on the main Space.
 CPU_SUBPROCESS_CONCURRENCY = int(os.environ.get("CPU_SUBPROCESS_CONCURRENCY", "2"))
+# CPU_WORKER_MODE — when CPU_STRATEGY="subprocess", chooses between:
+#   "spawn"      — legacy: fork a fresh subprocess per request (cpu_subprocess.py).
+#   "persistent" — new: route to a pool of long-lived workers (cpu_worker_pool.py).
+# Semaphore capacity stays = CPU_SUBPROCESS_CONCURRENCY either way.
+CPU_WORKER_MODE = os.environ.get("CPU_WORKER_MODE", "persistent").lower()
+# Whether the persistent pool preloads ASR Large at boot. If False, Large is
+# loaded on-demand inside the worker and cached there.
+CPU_POOL_PRELOAD_LARGE = os.environ.get("CPU_POOL_PRELOAD_LARGE", "1") == "1"
 # Model dtype for CPU inference.
 #   "bfloat16" — default. Routes attention through PyTorch's chunked CPU flash
 #                kernel (`_scaled_dot_product_flash_attention_for_cpu`), which
 # Dynamic batching constraints
 MAX_BATCH_SECONDS = 600      # GPU: max total audio seconds per batch (sum of durations)
+MAX_BATCH_SECONDS_CPU = 300  # CPU: tighter cap. SDPA materialises the QK^T tensor per encoder layer
 MAX_PAD_WASTE = 0.2          # Max fraction of padded tensor that is wasted (0=no waste, 1=all waste)
 MIN_BATCH_SIZE = 8           # Minimum segments per batch (prevents underutilization)

src/api/session_api.py CHANGED Viewed

@@ -957,6 +957,93 @@ def pool_status(hf_token):
     }
 # ---------------------------------------------------------------------------
 # Hidden debug endpoint
 # ---------------------------------------------------------------------------

     }
+def cpu_pool_kill(hf_token, worker_id):
+    """Kill a persistent worker for crash-recovery testing. HF-token-gated.
+
+    Args:
+        hf_token: Caller-supplied token. Must be non-empty and, when the
+            HF_TOKEN environment variable is set, equal to it.
+        worker_id: Index of the worker in the pool (anything `int()` accepts).
+
+    Returns:
+        On success a dict with keys ``worker_id``, ``pid``, ``was_alive``,
+        ``kill_sent``, ``send_err`` and ``alive_after``. On any failure
+        (bad token, bad id, import error) a dict ``{"error": <message>}``.
+    """
+    space_token = os.environ.get("HF_TOKEN", "")
+    # NOTE(review): when HF_TOKEN is unset any non-empty token is accepted —
+    # confirm this fail-open behaviour is intended outside local development.
+    if not hf_token or (space_token and hf_token != space_token):
+        return {"error": "Unauthorized"}
+    try:
+        from src.core.cpu_worker_pool import _get_pool
+        import signal as _signal
+        p = _get_pool()
+        wid = int(worker_id)
+        # Reject out-of-range ids explicitly: a negative id would otherwise
+        # index from the end of the list and kill a different worker than the
+        # one reported back to the caller.
+        if not 0 <= wid < len(p.workers):
+            return {"error": f"worker_id {wid} out of range (pool has {len(p.workers)} worker(s))"}
+        h = p.workers[wid]
+        pid = h.pid
+        was_alive = h.process is not None and h.process.is_alive()
+        if was_alive:
+            try:
+                os.kill(pid, _signal.SIGKILL)
+                sent = True
+                send_err = None
+            except Exception as ke:
+                sent = False
+                send_err = str(ke)
+            # Give the OS a moment to reap; join() returns as soon as the
+            # process has exited instead of always sleeping the full 0.3 s.
+            h.process.join(timeout=0.3)
+        else:
+            # Never signal a pid whose process is already gone: the OS may
+            # have recycled the pid for an unrelated process.
+            sent = False
+            send_err = "worker not alive; signal not sent"
+        alive_after = h.process is not None and h.process.is_alive()
+        return {
+            "worker_id": wid,
+            "pid": pid,
+            "was_alive": was_alive,
+            "kill_sent": sent,
+            "send_err": send_err,
+            "alive_after": alive_after,
+        }
+    except Exception as e:
+        return {"error": str(e)}
+def cpu_pool_status(hf_token):
+    """Report the state of the persistent CPU worker pool. HF-token-gated.
+
+    Diagnostic endpoint: returns the pool's own stats (per-worker boot
+    snapshots, load times, pids, job counts) augmented with a live RSS probe
+    per worker, the main process RSS, host memory figures and the cgroup
+    memory files. When the pool has not been started it returns a small dict
+    with ``started=False`` instead.
+    """
+    expected_token = os.environ.get("HF_TOKEN", "")
+    token_mismatch = bool(expected_token) and hf_token != expected_token
+    if not hf_token or token_mismatch:
+        return {"error": "Unauthorized"}
+    try:
+        from src.core.cpu_worker_pool import is_started, stats as pool_stats, probe_rss
+    except Exception as e:
+        return {"error": f"pool import failed: {e}"}
+    if not is_started():
+        return {"started": False, "note": "CPU_WORKER_MODE != persistent or pool not yet bootstrapped"}
+    def _read_stripped(file_path):
+        # Small helper: whole file as text, surrounding whitespace removed.
+        with open(file_path) as _f:
+            return _f.read().strip()
+    report = pool_stats()
+    # Live RSS per worker; a failing probe is recorded, not raised.
+    for worker in report.get("workers", []):
+        try:
+            worker["rss_now"] = probe_rss(worker["id"])
+        except Exception as e:
+            worker["rss_now_error"] = str(e)
+    # Main-process RSS and host memory (psutil may be unavailable).
+    try:
+        import psutil as _ps
+        report["main_rss"] = _ps.Process(os.getpid()).memory_info().rss
+        vm = _ps.virtual_memory()
+        report["host_mem"] = {"total": vm.total, "available": vm.available, "used": vm.used, "percent": vm.percent}
+    except Exception as e:
+        report["main_rss_error"] = str(e)
+    # cgroup (container) memory limit: try the v2 path, then the v1 path.
+    cgroup = {}
+    limit_paths = ("/sys/fs/cgroup/memory.max", "/sys/fs/cgroup/memory/memory.limit_in_bytes")
+    for path in limit_paths:
+        try:
+            cgroup[path] = _read_stripped(path)
+        except Exception as e:
+            cgroup[path] = f"err: {e}"
+    # Current usage is optional; silently omitted when the file is absent.
+    try:
+        cgroup["memory.current"] = _read_stripped("/sys/fs/cgroup/memory.current")
+    except Exception:
+        pass
+    report["cgroup"] = cgroup
+    return report
 # ---------------------------------------------------------------------------
 # Hidden debug endpoint
 # ---------------------------------------------------------------------------

src/core/cpu_worker_pool.py ADDED Viewed

	@@ -0,0 +1,525 @@

+"""Persistent CPU worker pool — prototype.
+Replaces the spawn-per-request path (cpu_subprocess.py) with long-lived
+workers that preload VAD + ASR (Base) once and stay ready. ASR Large is
+preloaded at boot when `preload_large` is set (config: CPU_POOL_PRELOAD_LARGE);
+otherwise it is loaded on demand in each worker (and cached there) so
+steady-state RAM stays predictable.
+Selected with env var CPU_WORKER_MODE=persistent (the config.py default).
+CPU_WORKER_MODE=spawn keeps the legacy spawn-per-request behavior.
+Semaphore capacity = CPU_SUBPROCESS_CONCURRENCY (shared with spawn path).
+A free-worker queue gives O(1) idle-worker pickup and guarantees at most
+one concurrent job per worker.
+"""
+from __future__ import annotations
+import importlib
+import multiprocessing as mp
+import os
+import queue as queue_mod
+import signal
+import sys
+import threading
+import time
+import traceback
+from dataclasses import dataclass, field
+from typing import Any, Optional
+# ---------------------------------------------------------------------------
+# Worker side
+# ---------------------------------------------------------------------------
+def _worker_loop(
+    worker_id: int,
+    extra_paths: list,
+    req_q: mp.Queue,
+    res_q: mp.Queue,
+    ready_ev,
+    preload_large: bool,
+):
+    """Long-lived worker body. Runs in a spawn-context process.
+    Steps:
+      1. Env hygiene — hide CUDA, disable ZeroGPU patches.
+      2. Import project modules, call force_cpu_mode().
+      3. Preload VAD + ASR Base (Large optional).
+      4. Signal `ready_ev`.
+      5. Loop: pull task → execute → push result.
+    Tasks are pickled tuples: (task_id, kind, payload)
+      kind="run":    payload=(func_module, func_name, args, kwargs)
+      kind="rss":    payload=None     (return rss bytes)
+      kind="load_large": payload=None (preload ASR Large if not cached)
+      kind="shutdown": payload=None   (exit loop)
+    Replies are tuples pushed to `res_q`: (task_id, "ok", result) or
+    (task_id, "error", (exc_type_name, message, traceback_text)). Boot uses
+    the reserved tags "__ready__" and "__boot_error__" in place of a task id.
+
+    Args:
+        worker_id: Index of this worker; used for log prefixes and the ready payload.
+        extra_paths: Parent's sys.path entries to add so project modules import.
+        req_q: Queue the parent puts tasks on.
+        res_q: Queue this worker puts replies on.
+        ready_ev: Event set once boot has completed successfully.
+        preload_large: Whether to load ASR Large during boot.
+    """
+    # ---- Env hygiene BEFORE any torch import ----------------------------
+    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+    os.environ["SPACES_ZERO_GPU"] = ""
+    # Suppress the HF download progress bars — the parent app sets this but
+    # the spawned child inherits only env, not module state.
+    os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
+    # Restore sys.path from parent so src/ is importable
+    for p in extra_paths:
+        if p and p not in sys.path:
+            sys.path.insert(0, p)
+    # Helpful process label for ps/htop
+    try:
+        import setproctitle  # type: ignore
+        setproctitle.setproctitle(f"cpu-worker-{worker_id}")
+    except Exception:
+        pass
+    def log(msg):
+        # flush=True so lines appear promptly in the parent's captured output.
+        print(f"[CPU-POOL/W{worker_id}] {msg}", flush=True)
+    # ---- RSS probe ------------------------------------------------------
+    def _rss_bytes() -> int:
+        """Resident set size of this process in bytes; 0 if it cannot be read."""
+        try:
+            import psutil  # type: ignore
+            return psutil.Process(os.getpid()).memory_info().rss
+        except Exception:
+            # Fallback without psutil: parse VmRSS (reported in kB) from /proc.
+            try:
+                with open(f"/proc/{os.getpid()}/status") as f:
+                    for line in f:
+                        if line.startswith("VmRSS:"):
+                            return int(line.split()[1]) * 1024
+            except Exception:
+                return 0
+            return 0
+    snapshots = {"start": _rss_bytes()}
+    t0 = time.time()
+    log(f"booted pid={os.getpid()} rss={snapshots['start']/1e6:.1f}MB")
+    # ---- Imports + force CPU mode --------------------------------------
+    try:
+        from src.core.zero_gpu import force_cpu_mode
+        force_cpu_mode()
+        snapshots["after_imports"] = _rss_bytes()
+        log(f"imports done +{(time.time()-t0):.1f}s rss={snapshots['after_imports']/1e6:.1f}MB")
+    except Exception as e:
+        log(f"FATAL imports: {e}\n{traceback.format_exc()}")
+        res_q.put(("__boot_error__", "error", (type(e).__name__, str(e), traceback.format_exc())))
+        return
+    load_times = {}
+    # ---- Preload VAD ---------------------------------------------------
+    try:
+        t = time.time()
+        from src.segmenter.segmenter_model import load_segmenter
+        load_segmenter()
+        load_times["vad"] = time.time() - t
+        snapshots["after_vad"] = _rss_bytes()
+        log(f"VAD loaded in {load_times['vad']:.2f}s rss={snapshots['after_vad']/1e6:.1f}MB")
+    except Exception as e:
+        log(f"VAD load failed: {e}")
+        res_q.put(("__boot_error__", "error", (type(e).__name__, str(e), traceback.format_exc())))
+        return
+    # ---- Preload ASR Base ---------------------------------------------
+    try:
+        t = time.time()
+        from src.alignment.phoneme_asr import load_phoneme_asr
+        load_phoneme_asr("Base")
+        load_times["asr_base"] = time.time() - t
+        snapshots["after_asr_base"] = _rss_bytes()
+        log(f"ASR Base loaded in {load_times['asr_base']:.2f}s rss={snapshots['after_asr_base']/1e6:.1f}MB")
+    except Exception as e:
+        log(f"ASR Base load failed: {e}")
+        res_q.put(("__boot_error__", "error", (type(e).__name__, str(e), traceback.format_exc())))
+        return
+    # ---- Preload caches (ngram index, phoneme chapters) ----------------
+    # Failure here is logged but does not abort boot.
+    try:
+        t = time.time()
+        from src.alignment.ngram_index import get_ngram_index
+        from src.alignment.phoneme_matcher_cache import preload_all_chapters
+        get_ngram_index()
+        preload_all_chapters()
+        load_times["caches"] = time.time() - t
+        snapshots["after_caches"] = _rss_bytes()
+        log(f"caches loaded in {load_times['caches']:.2f}s rss={snapshots['after_caches']/1e6:.1f}MB")
+    except Exception as e:
+        log(f"caches load failed (non-fatal): {e}")
+    # ---- Optionally preload ASR Large ---------------------------------
+    # Also non-fatal: the worker still becomes ready if Large fails to load.
+    if preload_large:
+        try:
+            t = time.time()
+            from src.alignment.phoneme_asr import load_phoneme_asr
+            load_phoneme_asr("Large")
+            load_times["asr_large"] = time.time() - t
+            snapshots["after_asr_large"] = _rss_bytes()
+            log(f"ASR Large loaded in {load_times['asr_large']:.2f}s rss={snapshots['after_asr_large']/1e6:.1f}MB")
+        except Exception as e:
+            log(f"ASR Large preload failed: {e}")
+    # ---- Warm up resampler --------------------------------------------
+    try:
+        import numpy as np, librosa
+        from config import RESAMPLE_TYPE
+        _ = librosa.resample(np.zeros(1600, dtype=np.float32),
+                             orig_sr=44100, target_sr=16000, res_type=RESAMPLE_TYPE)
+    except Exception:
+        pass
+    snapshots["ready"] = _rss_bytes()
+    total_boot = time.time() - t0
+    log(f"READY in {total_boot:.2f}s, final rss={snapshots['ready']/1e6:.1f}MB")
+    # Signal parent that this worker booted successfully
+    res_q.put(("__ready__", "ok", {
+        "worker_id": worker_id,
+        "pid": os.getpid(),
+        "snapshots": snapshots,
+        "load_times": load_times,
+        "boot_time": total_boot,
+    }))
+    ready_ev.set()
+    # ---- Main loop -----------------------------------------------------
+    while True:
+        try:
+            item = req_q.get()
+        except (EOFError, OSError, KeyboardInterrupt):
+            break
+        if item is None:
+            break
+        # A malformed item must not take the whole worker down: there is no
+        # task id to reply to, so log it and keep serving.
+        try:
+            task_id, kind, payload = item
+        except (TypeError, ValueError):
+            log(f"dropping malformed task item: {item!r}")
+            continue
+        try:
+            if kind == "shutdown":
+                break
+            elif kind == "rss":
+                res_q.put((task_id, "ok", _rss_bytes()))
+                continue
+            elif kind == "load_large":
+                try:
+                    from src.alignment.phoneme_asr import load_phoneme_asr
+                    t = time.time()
+                    load_phoneme_asr("Large")
+                    res_q.put((task_id, "ok", {"load_time": time.time() - t, "rss": _rss_bytes()}))
+                except Exception as e:
+                    res_q.put((task_id, "error", (type(e).__name__, str(e), traceback.format_exc())))
+                continue
+            elif kind == "run":
+                func_module, func_name, args, kwargs = payload
+                try:
+                    module = importlib.import_module(func_module)
+                    # The parent sends func.__qualname__, which is dotted for
+                    # methods ("Class.method"); walk it one attribute at a time
+                    # rather than doing a single getattr on the module.
+                    func = module
+                    for attr_name in func_name.split("."):
+                        func = getattr(func, attr_name)
+                    # Unwrap decorators so the undecorated function is the one
+                    # executed inside the worker.
+                    while hasattr(func, "__wrapped__"):
+                        func = func.__wrapped__
+                    result = func(*args, **kwargs)
+                    # NOTE(review): a result that fails to pickle is only
+                    # reported by the queue's feeder thread, so the parent
+                    # would wait until its deadline — confirm results are
+                    # always picklable.
+                    res_q.put((task_id, "ok", result))
+                except Exception as e:
+                    res_q.put((task_id, "error", (type(e).__name__, str(e), traceback.format_exc())))
+                continue
+            else:
+                res_q.put((task_id, "error", ("ValueError", f"unknown kind {kind!r}", "")))
+        except Exception as e:
+            # Catch-all so the loop survives
+            res_q.put((task_id, "error", (type(e).__name__, str(e), traceback.format_exc())))
+    log("exiting cleanly")
+# ---------------------------------------------------------------------------
+# Parent side
+# ---------------------------------------------------------------------------
+@dataclass
+class _WorkerHandle:
+    """Parent-side record describing one worker process and its channels."""
+    # Index of the worker in the pool's `workers` list.
+    worker_id: int
+    # Process object created by the pool's spawn context (None if unset).
+    process: Optional[Any] = None
+    # Queue the parent uses to send tasks to the worker.
+    req_q: Optional[Any] = None
+    # Queue the worker uses to send replies back to the parent.
+    res_q: Optional[Any] = None
+    # Event handed to the worker, which sets it once boot has finished.
+    ready_ev: Optional[Any] = None
+    # Boot-time RSS snapshots reported by the worker in its ready message.
+    snapshots: dict = field(default_factory=dict)
+    # Per-stage load durations reported by the worker in its ready message.
+    load_times: dict = field(default_factory=dict)
+    # Total boot duration in seconds as reported by the worker.
+    boot_time: float = 0.0
+    # OS pid of the worker process.
+    pid: Optional[int] = None
+    # Number of "run" tasks completed on this handle.
+    total_jobs: int = 0
+    # Per-worker lock, one fresh Lock per handle.
+    lock: threading.Lock = field(default_factory=threading.Lock)
+class _Pool:
+    """Parent-side manager for a fixed set of persistent CPU worker processes.
+
+    Each worker is a spawn-context process running `_worker_loop` with its own
+    request/result queue pair. `free_q` holds the ids of idle workers, so a
+    worker runs at most one job at a time. In addition every conversation on a
+    worker's queues happens under that handle's `lock`, so a diagnostic
+    request (`probe_rss`, `load_large`) can never consume the reply that
+    belongs to a job running on the same worker.
+    """
+    # Deadline in seconds applied by run() when the caller passes no timeout.
+    # NOTE(review): config.CPU_SUBPROCESS_TIMEOUT is not consulted here —
+    # confirm whether the two values should be unified.
+    _DEFAULT_TASK_TIMEOUT = 3600 * 4
+    def __init__(self):
+        self.ctx = mp.get_context("spawn")
+        self.workers: list[_WorkerHandle] = []
+        self.free_q: "queue_mod.Queue[int]" = queue_mod.Queue()
+        self._started = False
+        self._lock = threading.Lock()
+        self._task_counter = 0
+        self._preload_large = False
+        self._extra_paths: list[str] = []
+    # ---- lifecycle -------------------------------------------------------
+    def start(self, n_workers: int, preload_large: bool = False, boot_timeout: float = 600.0):
+        """Spawn `n_workers` workers and block until all are ready. Idempotent.
+
+        The pool is marked started only after every worker has reported
+        ready. If any worker fails to boot, all spawned workers are stopped,
+        the pool stays "not started" (so a later call can retry) and the
+        error is re-raised.
+
+        Raises:
+            RuntimeError: a worker died or reported a boot error.
+            TimeoutError: a worker did not become ready within `boot_timeout`.
+        """
+        with self._lock:
+            if self._started:
+                return
+            self._preload_large = preload_large
+            self._extra_paths = list(sys.path)
+            self._drain_free_q()
+            print(f"[CPU-POOL] Starting {n_workers} persistent worker(s) preload_large={preload_large}")
+            try:
+                # All workers are launched up front and boot concurrently.
+                for i in range(n_workers):
+                    self.workers.append(self._spawn_worker(i))
+                # Ready signals are then collected one worker at a time.
+                for h in self.workers:
+                    self._wait_ready(h, timeout=boot_timeout)
+            except BaseException:
+                # Leave no half-booted pool behind: a "started" pool with an
+                # empty free queue would make every run() block forever.
+                self._stop_workers(grace=0.0)
+                raise
+            for h in self.workers:
+                self.free_q.put(h.worker_id)
+            self._started = True
+            print(f"[CPU-POOL] All {n_workers} workers READY")
+    def _spawn_worker(self, worker_id: int) -> _WorkerHandle:
+        """Launch one daemon worker process and return its handle (not yet ready)."""
+        req_q = self.ctx.Queue()
+        res_q = self.ctx.Queue()
+        ready_ev = self.ctx.Event()
+        p = self.ctx.Process(
+            target=_worker_loop,
+            args=(worker_id, self._extra_paths, req_q, res_q, ready_ev, self._preload_large),
+            daemon=True,
+            name=f"cpu-worker-{worker_id}",
+        )
+        p.start()
+        return _WorkerHandle(
+            worker_id=worker_id,
+            process=p,
+            req_q=req_q,
+            res_q=res_q,
+            ready_ev=ready_ev,
+            pid=p.pid,
+        )
+    def _wait_ready(self, h: _WorkerHandle, timeout: float):
+        """Drain res_q until we see the __ready__ tag or a __boot_error__.
+
+        Raises:
+            RuntimeError: the worker died or reported a boot error.
+            TimeoutError: no ready signal arrived within `timeout` seconds.
+        """
+        deadline = time.time() + timeout
+        while time.time() < deadline:
+            try:
+                # Poll in slices of at most 10 s so a dead worker is noticed early.
+                tag, status, payload = h.res_q.get(timeout=max(0.0, min(10.0, deadline - time.time())))
+            except queue_mod.Empty:
+                if h.process is not None and not h.process.is_alive():
+                    raise RuntimeError(f"Worker {h.worker_id} died during boot (exit={h.process.exitcode})")
+                continue
+            if tag == "__ready__":
+                h.snapshots = payload.get("snapshots", {})
+                h.load_times = payload.get("load_times", {})
+                h.boot_time = payload.get("boot_time", 0.0)
+                h.pid = payload.get("pid", h.pid)
+                return
+            if tag == "__boot_error__":
+                exc_type, exc_msg, tb = payload
+                raise RuntimeError(f"Worker {h.worker_id} boot failed: {exc_type}: {exc_msg}\n{tb}")
+            # Unexpected tag during boot — ignore and keep waiting.
+        raise TimeoutError(f"Worker {h.worker_id} did not become ready within {timeout}s")
+    def _stop_workers(self, grace: float):
+        """Join each worker for up to `grace` seconds, kill stragglers, clear the list."""
+        for h in self.workers:
+            try:
+                if h.process is not None:
+                    h.process.join(timeout=grace)
+                    if h.process.is_alive():
+                        h.process.kill()
+                        h.process.join(timeout=2)
+            except Exception:
+                pass
+        self.workers.clear()
+    def _drain_free_q(self):
+        """Discard every id currently on the free-worker queue."""
+        while True:
+            try:
+                self.free_q.get_nowait()
+            except queue_mod.Empty:
+                return
+    def shutdown(self, timeout: float = 5.0):
+        """Ask every worker to exit, kill those that do not, and reset the pool."""
+        with self._lock:
+            if not self._started:
+                return
+            for h in self.workers:
+                try:
+                    h.req_q.put((0, "shutdown", None))
+                except Exception:
+                    pass
+            self._stop_workers(grace=timeout)
+            # Stale ids would otherwise survive into the next start() and let
+            # one worker be handed out twice.
+            self._drain_free_q()
+            self._started = False
+    # ---- task dispatch ---------------------------------------------------
+    def _next_task_id(self) -> int:
+        """Return a fresh task id (counter incremented under the pool lock)."""
+        with self._lock:
+            self._task_counter += 1
+            return self._task_counter
+    def _acquire_worker(self, timeout: Optional[float] = None) -> _WorkerHandle:
+        """Take an idle worker off the free queue, respawning it if it is dead.
+
+        Raises:
+            queue.Empty: no worker became free within `timeout`.
+        """
+        wid = self.free_q.get(timeout=timeout)
+        # Validate the worker is still alive; if not, respawn in-place.
+        h = self.workers[wid]
+        if h.process is None or not h.process.is_alive():
+            print(f"[CPU-POOL] Worker {wid} dead on acquire — respawning")
+            try:
+                self._respawn_worker(wid)
+            except BaseException:
+                # Hand the slot back so a later acquire can retry the respawn;
+                # otherwise pool capacity would shrink permanently.
+                self.free_q.put(wid)
+                raise
+            h = self.workers[wid]
+        return h
+    def _release_worker(self, h: _WorkerHandle):
+        """Mark the worker idle again."""
+        self.free_q.put(h.worker_id)
+    def _respawn_worker(self, worker_id: int):
+        """Replace a dead worker in-place. Blocks until ready."""
+        t0 = time.time()
+        new_h = self._spawn_worker(worker_id)
+        try:
+            self._wait_ready(new_h, timeout=600.0)
+        except BaseException:
+            # Do not leave a half-booted replacement process running.
+            try:
+                if new_h.process is not None and new_h.process.is_alive():
+                    new_h.process.kill()
+                    new_h.process.join(timeout=2)
+            except Exception:
+                pass
+            raise
+        self.workers[worker_id] = new_h
+        print(f"[CPU-POOL] Worker {worker_id} respawned in {time.time()-t0:.1f}s (new pid={new_h.pid})")
+    def run(self, func, args, kwargs, timeout: Optional[float] = None) -> Any:
+        """Run `func(*args, **kwargs)` on an idle worker and return its result.
+
+        The function is sent by module name and qualified name, not pickled.
+        `timeout` bounds both the wait for a free worker and the job itself;
+        when omitted the job deadline is `_DEFAULT_TASK_TIMEOUT`.
+
+        Raises:
+            RuntimeError: pool not started, the worker died mid-task, or the
+                function raised inside the worker.
+            TimeoutError: the job exceeded its deadline; the worker is killed
+                and replaced so the slot does not stay occupied.
+            queue.Empty: no worker became free within `timeout`.
+        """
+        if not self._started:
+            raise RuntimeError("Pool not started")
+        h = self._acquire_worker(timeout=timeout)
+        try:
+            # Hold the handle lock for the whole exchange so diagnostic
+            # requests cannot read this job's reply off res_q.
+            with h.lock:
+                task_id = self._next_task_id()
+                func_module = func.__module__
+                func_name = func.__qualname__
+                print(f"[CPU-POOL] dispatch task#{task_id} {func_module}.{func_name} -> W{h.worker_id} (pid={h.pid})")
+                t0 = time.time()
+                h.req_q.put((task_id, "run", (func_module, func_name, args, kwargs)))
+                # Drain res_q; tolerate process death.
+                limit = timeout or self._DEFAULT_TASK_TIMEOUT
+                deadline = time.time() + limit
+                while True:
+                    try:
+                        tag, status, payload = h.res_q.get(timeout=min(30.0, max(1.0, deadline - time.time())))
+                    except queue_mod.Empty:
+                        if not h.process.is_alive():
+                            # Worker died mid-task. The finally block below
+                            # respawns it; raise so the caller can retry.
+                            print(f"[CPU-POOL] Worker {h.worker_id} died mid-task (exit={h.process.exitcode})")
+                            raise RuntimeError(f"Worker {h.worker_id} died mid-task")
+                        if time.time() >= deadline:
+                            # The worker is still executing the job. Kill it so
+                            # the next job is not queued behind a runaway task.
+                            try:
+                                h.process.kill()
+                                h.process.join(timeout=5)
+                            except Exception as kill_err:
+                                print(f"[CPU-POOL] could not kill W{h.worker_id} after timeout: {kill_err}")
+                            raise TimeoutError(f"CPU pool task timed out after {limit}s")
+                        continue
+                    if tag == task_id:
+                        break
+                    # stray message (e.g. leftover rss reply). Drop.
+                    print(f"[CPU-POOL] W{h.worker_id} stray message tag={tag!r}, ignoring")
+                h.total_jobs += 1
+                dt = time.time() - t0
+                if status == "ok":
+                    print(f"[CPU-POOL] task#{task_id} ok in {dt:.2f}s on W{h.worker_id}")
+                    return payload
+                exc_type, exc_msg, tb = payload
+                print(f"[CPU-POOL] task#{task_id} error on W{h.worker_id}: {exc_type}: {exc_msg}\n{tb}")
+                raise RuntimeError(f"Worker error ({exc_type}): {exc_msg}")
+        finally:
+            wid = h.worker_id
+            dead = h.process is None or not h.process.is_alive()
+            # Replace a dead worker here, unless the pool was shut down or the
+            # handle was already swapped for a fresh one.
+            if dead and self._started and wid < len(self.workers) and self.workers[wid] is h:
+                try:
+                    self._respawn_worker(wid)
+                except Exception as e:
+                    print(f"[CPU-POOL] could not respawn W{wid}: {e}")
+            # Always return the slot exactly once. If the respawn failed,
+            # _acquire_worker retries it on the next acquire.
+            self.free_q.put(wid)
+    # ---- diagnostics -----------------------------------------------------
+    def _control_request(self, worker_id: int, kind: str, timeout: float, label: str):
+        """Send a payload-less control task and return its (status, payload).
+
+        Runs under the worker's handle lock, so it waits while a job is in
+        flight on that worker instead of racing it for replies.
+
+        Raises:
+            TimeoutError: the lock or the reply was not obtained in `timeout` s.
+        """
+        h = self.workers[worker_id]
+        deadline = time.time() + timeout
+        if not h.lock.acquire(timeout=max(0.0, timeout)):
+            raise TimeoutError(f"{label} timed out")
+        try:
+            task_id = self._next_task_id()
+            h.req_q.put((task_id, kind, None))
+            while True:
+                remaining = deadline - time.time()
+                if remaining <= 0:
+                    raise TimeoutError(f"{label} timed out")
+                try:
+                    tag, status, payload = h.res_q.get(timeout=remaining)
+                except queue_mod.Empty:
+                    raise TimeoutError(f"{label} timed out") from None
+                if tag == task_id:
+                    return status, payload
+                # Anything else is a stale reply from an earlier timed-out
+                # request; no job can be in flight while the lock is held.
+        finally:
+            h.lock.release()
+    def probe_rss(self, worker_id: int, timeout: float = 10.0) -> int:
+        """Return the RSS in bytes reported by the given worker.
+
+        Raises:
+            TimeoutError: the worker is busy or did not answer in `timeout` s.
+        """
+        _status, payload = self._control_request(worker_id, "rss", timeout, "rss probe")
+        return int(payload)
+    def load_large(self, worker_id: int, timeout: float = 300.0) -> dict:
+        """Ask the given worker to load ASR Large; return its reply dict.
+
+        Raises:
+            RuntimeError: the worker reported a load failure.
+            TimeoutError: the worker is busy or did not answer in `timeout` s.
+        """
+        status, payload = self._control_request(worker_id, "load_large", timeout, "load_large")
+        if status == "ok":
+            return payload
+        raise RuntimeError(f"load_large failed: {payload}")
+    def stats(self) -> dict:
+        """Return a snapshot of pool state and per-worker bookkeeping."""
+        return {
+            "started": self._started,
+            "n_workers": len(self.workers),
+            "workers": [
+                {
+                    "id": h.worker_id,
+                    "pid": h.pid,
+                    "alive": h.process is not None and h.process.is_alive(),
+                    "total_jobs": h.total_jobs,
+                    "boot_time": h.boot_time,
+                    "snapshots": dict(h.snapshots),
+                    "load_times": h.load_times,
+                }
+                for h in self.workers
+            ],
+        }
+# ---------------------------------------------------------------------------
+# Module-level singleton API
+# ---------------------------------------------------------------------------
+# Process-wide pool instance; created lazily by _get_pool().
+_POOL: Optional[_Pool] = None
+# Guards creation of _POOL so two threads cannot each build a pool.
+_START_LOCK = threading.Lock()
+def _get_pool() -> _Pool:
+    """Return the module-level `_Pool`, creating it on first use.
+
+    Only constructs the object; no worker processes are spawned here.
+    """
+    global _POOL
+    if _POOL is None:
+        with _START_LOCK:
+            # Re-check under the lock: another thread may have created the
+            # pool between the first check and acquiring the lock.
+            if _POOL is None:
+                _POOL = _Pool()
+    return _POOL
+def start_pool(n_workers: int, preload_large: bool = False):
+    """Spawn the persistent worker pool. Idempotent.
+
+    Delegates to the singleton pool's `start()` with `n_workers` workers.
+    """
+    _get_pool().start(n_workers, preload_large=preload_large)
+def is_started() -> bool:
+    """Return True if the pool object exists and its started flag is set."""
+    # Reads _POOL directly so that merely asking does not create a pool.
+    return _POOL is not None and _POOL._started
+def stats() -> dict:
+    """Return the singleton pool's stats dict (creates the pool object if absent)."""
+    return _get_pool().stats()
+def probe_rss(worker_id: int) -> int:
+    """Return the RSS in bytes reported by worker `worker_id`."""
+    return _get_pool().probe_rss(worker_id)
+def load_large(worker_id: int) -> dict:
+    """Ask worker `worker_id` to load ASR Large and return its reply dict."""
+    return _get_pool().load_large(worker_id)
+def shutdown():
+    """Shut the pool down if one has been created; otherwise do nothing."""
+    if _POOL is not None:
+        _POOL.shutdown()
+def run_on_persistent_worker(func, args, kwargs, timeout: Optional[float] = None):
+    """Run a function on a free persistent worker. Blocks until done.
+    Caller is responsible for concurrency gating (the wrapper in zero_gpu.py
+    uses the same semaphore as the spawn path).
+
+    Args:
+        func: Callable to execute; forwarded to the pool's `run()`.
+        args: Positional arguments for `func`.
+        kwargs: Keyword arguments for `func`.
+        timeout: Optional limit in seconds, passed through to `run()`.
+
+    Returns:
+        Whatever the pool's `run()` returns for this call.
+    """
+    return _get_pool().run(func, args, kwargs, timeout=timeout)

src/core/zero_gpu.py CHANGED Viewed

@@ -269,18 +269,32 @@ def gpu_with_fallback(duration=60):
                 if CPU_STRATEGY == "subprocess":
                     import time as _time
                     sem = _get_subprocess_semaphore()
                     _t_acq = _time.time()
                     sem.acquire()
                     _wait = _time.time() - _t_acq
                     try:
                         _check_cuda_fork_state(f"before CPU subprocess ({func.__name__})")
-                        print(
-                            f"[CPU] Running {func.__name__} in isolated subprocess "
-                            f"(CPU_STRATEGY=subprocess, queue_wait={_wait:.2f}s)"
-                        )
-                        from .cpu_subprocess import run_in_cpu_subprocess
-                        result = run_in_cpu_subprocess(func, args, kwargs)
                         _check_cuda_fork_state(f"after CPU subprocess ({func.__name__})")
                         return result
                     finally:

                 if CPU_STRATEGY == "subprocess":
                     import time as _time
+                    from config import CPU_WORKER_MODE
                     sem = _get_subprocess_semaphore()
                     _t_acq = _time.time()
                     sem.acquire()
                     _wait = _time.time() - _t_acq
                     try:
                         _check_cuda_fork_state(f"before CPU subprocess ({func.__name__})")
+                        if CPU_WORKER_MODE == "persistent":
+                            from .cpu_worker_pool import run_on_persistent_worker, is_started, start_pool
+                            if not is_started():
+                                # Lazy start fallback — slow first request.
+                                from config import CPU_SUBPROCESS_CONCURRENCY, CPU_POOL_PRELOAD_LARGE
+                                print("[CPU] Pool not started — lazy-starting now")
+                                start_pool(CPU_SUBPROCESS_CONCURRENCY, preload_large=CPU_POOL_PRELOAD_LARGE)
+                            print(
+                                f"[CPU] Running {func.__name__} on persistent worker "
+                                f"(CPU_WORKER_MODE=persistent, queue_wait={_wait:.2f}s)"
+                            )
+                            result = run_on_persistent_worker(func, args, kwargs)
+                        else:
+                            print(
+                                f"[CPU] Running {func.__name__} in isolated subprocess "
+                                f"(CPU_STRATEGY=subprocess, queue_wait={_wait:.2f}s)"
+                            )
+                            from .cpu_subprocess import run_in_cpu_subprocess
+                            result = run_in_cpu_subprocess(func, args, kwargs)
                         _check_cuda_fork_state(f"after CPU subprocess ({func.__name__})")
                         return result
                     finally:

src/ui/event_wiring.py CHANGED Viewed

@@ -17,6 +17,8 @@ from src.api.session_api import (
     debug_process,
     cpu_exec,
     pool_status,
 )
 from src.mfa import compute_mfa_timestamps
 from src.ui.progress_bar import pipeline_progress_bar_html
@@ -935,6 +937,20 @@ def _wire_api_endpoint(c):
         outputs=[c.api_result],
         api_name="pool_status",
     )
 def _wire_dev_tab(c):

     debug_process,
     cpu_exec,
     pool_status,
+    cpu_pool_status,
+    cpu_pool_kill,
 )
 from src.mfa import compute_mfa_timestamps
 from src.ui.progress_bar import pipeline_progress_bar_html
         outputs=[c.api_result],
         api_name="pool_status",
     )
+    # Persistent CPU worker pool status — HF-token-gated.
+    gr.Button(visible=False).click(
+        fn=cpu_pool_status,
+        inputs=[c.api_pool_status_token],
+        outputs=[c.api_result],
+        api_name="cpu_pool_status",
+    )
+    # Kill a persistent worker — crash-recovery test helper, HF-token-gated.
+    gr.Button(visible=False).click(
+        fn=cpu_pool_kill,
+        inputs=[c.api_pool_status_token, c.api_cpu_exec_module],  # reuse token + a string input
+        outputs=[c.api_result],
+        api_name="cpu_pool_kill",
+    )
 def _wire_dev_tab(c):