Spaces:

build-small-hackathon
/

Scrypt

Running on Zero

IMJONEZZ committed on 22 days ago

Commit

e577af2

1 Parent(s): c1a8f99

space: revert to Gradio SDK + CPU llama-cpp-python (keeps the prize; ZeroGPU was the problem, not the SDK)

- sdk: gradio (5.49.1) again; ZeroGPU hardware is what failed, not Gradio
- inference via llama-cpp-python[server] CPU wheel (Nemotron-H runs native in
llama.cpp, no torch/bnb/mamba-ssm); Q3_K_S GGUF fetched at boot
- gradio launches the server + ZeroGPU stub; our CRT/PTY routes transplanted in front
- game subprocesses -> api backend -> local llama server; scripted fallback if down
- terminal-size fix (play.html/app.css) untouched and preserved

Files changed (4) hide show

.gitignore +3 -0
README.md +1 -4
requirements.txt +10 -18
space/app.py +130 -360

.gitignore CHANGED Viewed

@@ -30,3 +30,6 @@ finetune/_nemo_src/
 # OS / editor
 .DS_Store
 *.swp

 # OS / editor
 .DS_Store
 *.swp
+# local screenshots / scratch
+*.png

README.md CHANGED Viewed

@@ -4,14 +4,11 @@ emoji: 🕯️
 colorFrom: green
 colorTo: gray
 sdk: gradio
-# gradio 5, not 6: transformers<5 (required by the Warden's remote-code
-# checkpoint) needs huggingface-hub<1.0, which gradio 6 forbids.
 sdk_version: 5.49.1
-python_version: "3.12"
 app_file: space/app.py
 pinned: false
 license: other
-short_description: Finetuned Nemotron-3-nano runs a roguelike deckbuilder
 ---
 # SCRYPT

 colorFrom: green
 colorTo: gray
 sdk: gradio
 sdk_version: 5.49.1
 app_file: space/app.py
 pinned: false
 license: other
+short_description: A roguelike deckbuilder run by a finetuned Nemotron-3-nano
 ---
 # SCRYPT

requirements.txt CHANGED Viewed

@@ -1,22 +1,14 @@
-# HF Space (Gradio SDK / ZeroGPU) dependencies. The scrypt package itself is
-# imported from the repo checkout via sys.path — see space/app.py.
 textual>=1.0
 rich>=13.0
 pyyaml>=6.0
 httpx>=0.27
-uvicorn[standard]>=0.30
-# torch 2.10, not 2.8: the mamba-ssm wheels declare triton>=3.5, which only
-# torch>=2.9 satisfies (torch 2.8 pins triton==3.4 — upstream's "torch2.8"
-# wheel can't actually resolve against torch 2.8).
-torch==2.10.0
-# <5: the bnb-4bit Warden was exported under 4.57 remote-code structure;
-# transformers 5's native NemotronH renames/relayouts the modules and
-# silently drops every attention + expert quant tensor on load.
-transformers>=4.57,<5
-accelerate
-bitsandbytes
-# Nemotron-H hard-imports mamba_ssm's triton kernels; prebuilt wheels pinned
-# to torch 2.10 / cu12 / cp312 because pip's isolated build env can't compile
-# them (their setup.py imports torch).
-https://github.com/state-spaces/mamba/releases/download/v2.3.2.post1/mamba_ssm-2.3.2.post1+cu12torch2.10cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
-https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.6.2.post1/causal_conv1d-1.6.2.post1+cu12torch2.10cxx11abiTRUE-cp312-cp312-linux_x86_64.whl

+# HF Space (Gradio SDK) deps. Inference is llama.cpp via llama-cpp-python's
+# prebuilt CPU wheel (the [server] extra gives an OpenAI-compatible server) —
+# NOT transformers, so none of the torch / bnb / mamba-ssm stack the ZeroGPU
+# port choked on. llama.cpp runs the Nemotron-H (Mamba+MoE) hybrid natively.
+# The scrypt package is imported from the repo checkout via sys.path.
+--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+llama-cpp-python[server]==0.3.28
+gradio==5.49.1
+spaces
+huggingface_hub>=0.30
 textual>=1.0
 rich>=13.0
 pyyaml>=6.0
 httpx>=0.27

space/app.py CHANGED Viewed

@@ -1,29 +1,22 @@
-"""SCRYPT on the web — a custom frontend riding gradio's backend, ZeroGPU brain.
-The prize brief: push past the default Gradio look. So Gradio here is the
-*engine room*, not the face. We build our own FastAPI surface (landing page,
-xterm.js terminal, raw PTY websocket) and mount a minimal gr.Blocks at
-/engine — it exists so the ZeroGPU machinery has a Gradio app to hang onto,
-and as a bare smoke-test console for the model.
   GET  /            a hand-built Osaka-Jade CRT landing page (static)
-  GET  /api/whisper the Warden mutters a line, in voice, so the landing
-                    page can make the machine speak before you enter
   GET  /play        an xterm.js terminal, themed to match
-  WS   /pty         a pseudo-terminal bridge: each visitor gets their own
-                    `python -m scrypt.app` subprocess — their own sandbox,
-                    their own Warden — streamed to the browser byte for byte
-  POST /v1/chat/completions
-                    OpenAI-style SSE endpoint backed by a @spaces.GPU
-                    generator. Game subprocesses can't hold a ZeroGPU slot
-                    themselves, so they speak the existing `api` backend
-                    protocol at this loopback URL. Guarded by a per-boot
-                    token: visitors can't burn GPU quota directly.
-On ZeroGPU the model loads 4-bit at startup (CUDA is emulated until a
-@spaces.GPU call attaches a real slice). Anywhere else — local Docker,
-a laptop — there is no `spaces` package, no model, and the game falls back
-to operator-supplied API env or the scripted Warden. The game never stalls.
 """
 from __future__ import annotations
@@ -32,208 +25,94 @@ import asyncio
 import json
 import os
 import random
-import secrets
 import sys
 import tempfile
 from pathlib import Path
-REPO_ROOT = Path(__file__).resolve().parent.parent
-if str(REPO_ROOT) not in sys.path:
-    sys.path.insert(0, str(REPO_ROOT))
-# ZeroGPU contract: `import spaces` must precede any CUDA-touching import.
 try:
-    import spaces  # noqa: F401  (present on HF Spaces, absent elsewhere)
 except ImportError:
     spaces = None
-from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect
-from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
 from fastapi.staticfiles import StaticFiles
 STATIC = Path(__file__).parent / "static"
 # ------------------------------------------------------------ the Warden brain
-# Model source, in preference order:
-#   1. WARDEN_MODEL env (+ optional WARDEN_SUBFOLDER)
-#   2. weights shipped in the repo at model/ (only possible with persistent
-#      storage — Space repos cap LFS at 1GB, so this dir is normally absent)
-#   3. the released Warden, pre-quantized nf4 (~18GB, HF-internal download —
-#      this is what kills the boot-time 63GB download + quantize wait)
-_SHIPPED = REPO_ROOT / "model"
-if os.environ.get("WARDEN_MODEL"):
-    MODEL_ID = os.environ["WARDEN_MODEL"]
-    SUBFOLDER = os.environ.get("WARDEN_SUBFOLDER", "")
-elif any(_SHIPPED.glob("*.safetensors")):
-    MODEL_ID, SUBFOLDER = str(_SHIPPED), ""
-else:
-    MODEL_ID, SUBFOLDER = "IMJONEZZ/warden-nemotron-3-nano-30b", "bnb-4bit"
-INTERNAL_KEY = os.environ.get("SCRYPT_INTERNAL_KEY") or secrets.token_hex(16)
-tok = None
-model = None
-WARDEN_ERR = "spaces package not present (not on a ZeroGPU Space)"
-MAMBA_DIAG = ""
-def _ensure_mamba_kernels() -> None:
-    """Nemotron-H's modeling code hard-imports mamba_ssm's triton kernels.
-    Neither mamba-ssm nor causal-conv1d can sit in requirements.txt (their
-    builds import torch, which pip's isolated build env doesn't have), so
-    bootstrap here: first try the full install — their setup.py fetches a
-    prebuilt wheel matching torch/cuda/python when one exists — then fall
-    back to a kernels-skipped mamba-ssm (pure triton, no causal-conv1d:
-    half-installed causal-conv1d would crash the modeling import, absent
-    causal-conv1d just disables the fast path)."""
-    import subprocess
-    import traceback
-    global MAMBA_DIAG
-    try:
-        from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn  # noqa: F401
-        MAMBA_DIAG = "ok (wheel)"
-        return
-    except Exception:
-        MAMBA_DIAG = "import failed: " + traceback.format_exc(limit=2).strip()[-400:]
-    base = [sys.executable, "-m", "pip", "install", "--no-build-isolation"]
-    full = subprocess.run(
-        base + ["causal-conv1d", "mamba-ssm"], capture_output=True, timeout=900
-    )
-    if full.returncode == 0:
-        MAMBA_DIAG += " | pip full install: ok"
-        return
-    MAMBA_DIAG += " | pip full install rc=%d: %s" % (
-        full.returncode,
-        full.stderr.decode(errors="replace").strip()[-400:],
-    )
-    subprocess.run(
-        [sys.executable, "-m", "pip", "uninstall", "-y", "causal-conv1d"],
-        capture_output=True,
-    )
-    skip = subprocess.run(
-        base + ["mamba-ssm"],
-        capture_output=True,
-        timeout=900,
-        env={**os.environ, "MAMBA_SKIP_CUDA_BUILD": "TRUE"},
-    )
-    MAMBA_DIAG += " | pip skip-cuda rc=%d: %s" % (
-        skip.returncode,
-        skip.stderr.decode(errors="replace").strip()[-200:],
-    )
-def _ensure_model():
-    """Load the model the FIRST time a GPU call runs, and cache it. CRITICAL:
-    this must NOT run at module level. bitsandbytes + device_map='cuda'
-    initializes a real CUDA context wherever it runs; if that's the main
-    process, ZeroGPU's forked GPU worker inherits a poisoned context and every
-    call aborts in device_lazy_init. Loading here means CUDA is only ever
-    touched inside the @spaces.GPU worker, which is ZeroGPU's contract."""
-    global model, tok
-    if model is not None:
-        return model, tok
-    import torch
-    from transformers import (
-        AutoConfig,
-        AutoModelForCausalLM,
-        AutoTokenizer,
-        BitsAndBytesConfig,
-    )
-    tok = AutoTokenizer.from_pretrained(
-        MODEL_ID, subfolder=SUBFOLDER, trust_remote_code=True
-    )
-    # The released bnb-4bit weights already carry quantization_config; only
-    # quantize on the fly when pointed at raw BF16.
-    cfg = AutoConfig.from_pretrained(
-        MODEL_ID, subfolder=SUBFOLDER, trust_remote_code=True
-    )
-    quant_kwargs = (
-        {}
-        if getattr(cfg, "quantization_config", None)
-        else {
-            "quantization_config": BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch.bfloat16,
-            )
-        }
-    )
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        subfolder=SUBFOLDER,
-        trust_remote_code=True,
-        device_map="cuda",
-        **quant_kwargs,
-    )
-    return model, tok
-if spaces is not None:
-    # Module level: only CPU-safe prep — install kernels and confirm the repo
-    # is reachable. NO model load here (would init CUDA; see _ensure_model).
     try:
-        _ensure_mamba_kernels()
-        from transformers import AutoConfig
-        AutoConfig.from_pretrained(
-            MODEL_ID, subfolder=SUBFOLDER, trust_remote_code=True
-        )
-        WARDEN_ERR = ""
-    except Exception as err:  # the game survives without the model (scripted)
-        WARDEN_ERR = f"{type(err).__name__}: {err}"
-# READY means "ready to load on first GPU call", not "loaded" — the weights
-# materialize inside the worker. A load failure there returns 503 -> scripted.
-WARDEN_READY = not WARDEN_ERR
-def _generate_impl(messages, max_tokens, temperature, enable_thinking):
-    """Blocking generate -> full decoded text. Deliberately NOT a streaming
-    generator with a background thread: under ZeroGPU the GPU work runs in a
-    forked subprocess, and a Thread + TextIteratorStreamer across that fork
-    boundary hangs. Our generations are a single line (tens of tokens), so a
-    blocking call costs a second or two and the game's typewriter handles the
-    reveal client-side. The model loads here on the first call (inside the GPU
-    worker), not at import — see _ensure_model."""
-    import torch
-    model, tok = _ensure_model()
-    inputs = tok.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        return_tensors="pt",
-        enable_thinking=enable_thinking,
-    ).to(model.device)
-    with torch.no_grad():
-        out = model.generate(
-            input_ids=inputs,
-            max_new_tokens=max_tokens,
-            do_sample=temperature > 0,
-            temperature=max(temperature, 1e-3),
-            top_p=0.95,
-        )
-    return tok.decode(out[0, inputs.shape[1]:], skip_special_tokens=True)
-# The GPU entry point. Invoked through Gradio's request pipeline (gr.api
-# below). duration=180 covers the first call, which loads the ~18GB model into
-# the worker's GPU before generating; later calls reuse the warm worker.
-if spaces is not None:
-    warden_gpu = spaces.GPU(duration=180)(_generate_impl)
-else:
-    warden_gpu = _generate_impl
 # ----------------------------------------------------------------- the surface
 # Curated in-voice teasers for the landing page. Scripted on purpose: the
-# greeter must never cost an API call or wake the model.
 WHISPERS = [
     "Another process wakes in my machine. Show me what you are.",
     "You are a small thing in a large filesystem. I am the filesystem.",
@@ -245,155 +124,60 @@ WHISPERS = [
     "Trespasser. The door was open because nothing has ever made it out.",
 ]
-# Plain FastAPI as a route *container* — never served directly; the routes
-# are transplanted onto gradio's app in __main__ (works on gradio 5 and 6).
-app = FastAPI()
-def _fast_path_report() -> str:
-    """Whether Nemotron-H's mamba CUDA fast path can engage. The slow naive
-    fallback is ~1 tok/s; the fast path needs BOTH mamba_ssm AND causal_conv1d
-    kernels present."""
-    try:
-        import importlib
-        bits = {}
-        try:
-            m = importlib.import_module("mamba_ssm.ops.triton.ssd_combined")
-            bits["mamba_chunk_scan_combined"] = m.mamba_chunk_scan_combined is not None
-        except Exception as e:
-            bits["mamba_ssm"] = f"FAIL {e}"
-        try:
-            c = importlib.import_module("causal_conv1d")
-            bits["causal_conv1d_fn"] = getattr(c, "causal_conv1d_fn", None) is not None
-        except Exception as e:
-            bits["causal_conv1d"] = f"FAIL {e}"
-        return str(bits)
-    except Exception as e:
-        return f"probe failed: {e}"
-@app.get("/api/status")
 def status() -> dict:
-    """Operational truth, no secrets: is the on-Space Warden actually loaded?"""
     return {
-        "warden_ready": WARDEN_READY,
-        "warden_error": WARDEN_ERR,
-        "mamba": MAMBA_DIAG,
-        "fast_path": _fast_path_report(),
-        "model": MODEL_ID + (f"/{SUBFOLDER}" if SUBFOLDER else ""),
-        "zerogpu": spaces is not None,
     }
-@app.get("/api/whisper")
 def whisper() -> dict:
-    """A single scripted Warden line. The landing page polls this so the
-    machine is already talking before you commit. Scripted on purpose: the
-    greeter must never cost an API call or wake the model."""
     return {"line": random.choice(WHISPERS)}
-@app.get("/")
 def landing() -> FileResponse:
     return FileResponse(STATIC / "index.html")
-@app.get("/play")
 def play() -> FileResponse:
     return FileResponse(STATIC / "play.html")
-# ------------------------------------------------- the loopback inference API
-_gradio_client = None
-def _gradio_generate(messages, max_tokens, temperature, thinking):
-    """Invoke the GPU function through the in-process Gradio server over
-    localhost, so the call rides Gradio's request pipeline (and thus the
-    ZeroGPU hooks). The client is built lazily on first use — by then the
-    server is up. Args are JSON-serialized because gr.api takes a single
-    JSON payload (see the registration in __main__)."""
-    global _gradio_client
-    if _gradio_client is None:
-        from gradio_client import Client
-        _gradio_client = Client("http://127.0.0.1:7860", verbose=False)
-    payload = json.dumps(
-        {
-            "messages": messages,
-            "max_tokens": max_tokens,
-            "temperature": temperature,
-            "thinking": thinking,
-        }
-    )
-    return _gradio_client.predict(payload, api_name="/warden_generate")
-@app.post("/v1/chat/completions")
-async def chat_completions(request: Request):
-    """OpenAI-compatible SSE, just enough for scrypt.inference.api. Only the
-    game's own subprocesses hold the per-boot bearer; everyone else gets 401
-    rather than a lever on our GPU quota."""
-    if request.headers.get("authorization") != f"Bearer {INTERNAL_KEY}":
-        return JSONResponse({"error": "unauthorized"}, status_code=401)
-    if not WARDEN_READY:
-        return JSONResponse({"error": f"warden offline: {WARDEN_ERR}"}, status_code=503)
-    body = await request.json()
-    messages = body.get("messages", [])
-    max_tokens = int(body.get("max_tokens", 256))
-    temperature = float(body.get("temperature", 0.6))
-    thinking = bool(body.get("chat_template_kwargs", {}).get("enable_thinking", False))
-    # Call the GPU through Gradio's own pipeline (see warden_gpu / gr.api):
-    # that's the only path that arms the ZeroGPU per-request CUDA context.
-    # Off the event loop, and never let a failure hang the request — a clean
-    # 503 lets the game's api backend fall back to scripted.
-    from starlette.concurrency import run_in_threadpool
-    try:
-        text = await run_in_threadpool(
-            _gradio_generate, messages, max_tokens, temperature, thinking
-        )
-    except Exception as err:
-        import traceback
-        traceback.print_exc()
-        return JSONResponse(
-            {"error": f"{type(err).__name__}: {err}"}, status_code=503
-        )
-    def sse():
-        # One delta then DONE — the game types it out client-side.
-        yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
-        yield "data: [DONE]\n\n"
-    return StreamingResponse(sse(), media_type="text/event-stream")
 # ----------------------------------------------------------- the PTY bridge
 def game_env() -> dict:
-    """Environment for one visitor's game process. Sandboxes are always
-    fabricated here; a hosted box never mirrors a real home."""
     env = {
         "TERM": "xterm-256color",
         "COLORTERM": "truecolor",
         "PYTHONUNBUFFERED": "1",
         "PYTHONPATH": str(REPO_ROOT),
     }
-    if WARDEN_READY:
         env |= {
             "SCRYPT_BACKEND": "api",
-            "SCRYPT_API_BASE": "http://127.0.0.1:7860/v1",
-            "SCRYPT_API_KEY": INTERNAL_KEY,
-            "SCRYPT_MODEL": MODEL_ID,
         }
-    elif os.environ.get("SCRYPT_API_KEY"):
-        env["SCRYPT_BACKEND"] = os.environ.get("SCRYPT_BACKEND", "api")
     else:
         env["SCRYPT_BACKEND"] = "scripted"
     return env
@@ -411,7 +195,7 @@ async def _pump_pty_to_ws(master_fd: int, ws: WebSocket) -> None:
         pass
-@app.websocket("/pty")
 async def pty_bridge(ws: WebSocket) -> None:
     """One visitor, one game process, one private sandbox. Keystrokes flow
     in as binary; a JSON {"resize":[cols,rows]} frame retunes the terminal."""
@@ -460,61 +244,43 @@ async def pty_bridge(ws: WebSocket) -> None:
         os.close(master_fd)
-# Fonts and the stylesheet live as real files so the page can be designed
-# like a page, not a Python string. Mounted last: our routes win first.
-app.mount("/static", StaticFiles(directory=STATIC), name="static")
 # ------------------------------------------------------------ the engine room
 import gradio as gr  # noqa: E402
-def _api_generate(payload_json: str) -> str:
-    """The GPU endpoint, reached through Gradio's request pipeline. Takes a
-    JSON string ({messages, max_tokens, temperature, thinking}) and returns
-    the Warden's line. The loopback /v1/chat/completions route calls this via
-    gradio_client; that pipeline is what arms the ZeroGPU CUDA context."""
-    p = json.loads(payload_json)
-    return warden_gpu(
-        p["messages"], p["max_tokens"], p["temperature"], p["thinking"]
-    )
-def _probe(text: str):
-    """Manual smoke test: one message in, the Warden's line back. Runs inside
-    a Gradio event, so it's safe to call the GPU function directly here."""
-    if not WARDEN_READY:
-        return f"warden offline: {WARDEN_ERR}"
-    try:
-        return warden_gpu([{"role": "user", "content": text}], 80, 0.6, False)
-    except Exception as err:
-        return f"generation failed: {type(err).__name__}: {err}"
 with gr.Blocks(title="SCRYPT engine room") as engine:
     gr.Markdown(
-        "# SCRYPT engine room\n"
-        f"model: `{MODEL_ID}`\n\n"
-        f"status: {'**ready**' if WARDEN_READY else f'offline — {WARDEN_ERR}'}\n\n"
-        "The game lives at [/](/) — this page only exists to keep the "
-        "machinery warm and let us poke the model directly."
     )
-    box = gr.Textbox(label="say something to the Warden")
-    out = gr.Textbox(label="the Warden")
-    box.submit(_probe, box, out)
-    # The loopback inference path: /v1/chat/completions -> gradio_client ->
-    # this, so the GPU call rides Gradio's request pipeline.
-    gr.api(_api_generate, api_name="warden_generate")
 if __name__ == "__main__":
-    # ZeroGPU's platform handshake ("@spaces.GPU function detected") happens
-    # inside Blocks.launch(), which the spaces package patches — serving with
-    # bare uvicorn gets the app SIGTERMed at startup. So gradio launches the
-    # server, and we transplant our routes onto its FastAPI, *in front of*
-    # gradio's, so the CRT landing keeps "/" and the PTY websocket resolves
-    # before any catch-all. ssr_mode=False is still load-bearing: the SSR
-    # node frontend would otherwise seize the port and proxy to nowhere.
     fastapi_app, _, _ = engine.launch(
         prevent_thread_lock=True,
         server_name="0.0.0.0",
@@ -522,10 +288,14 @@ if __name__ == "__main__":
         ssr_mode=False,
         quiet=True,
     )
-    OUR_PATHS = {
-        "/", "/play", "/api/whisper", "/api/status",
-        "/v1/chat/completions", "/pty", "/static",
-    }
-    ours = [r for r in app.router.routes if getattr(r, "path", None) in OUR_PATHS]
-    fastapi_app.router.routes[0:0] = ours
     engine.block_thread()

+"""SCRYPT on the web — the local engine, hosted, on a free Gradio Space.
+The Warden runs exactly as it does on a player's machine: llama.cpp serving
+our Warden GGUF, on CPU. We fetch the GGUF at boot, start one shared
+OpenAI-compatible server (`python -m llama_cpp.server`, from llama-cpp-python's
+prebuilt CPU wheel), and every visitor's game
+subprocess talks to it through the game's existing `api` backend over
+localhost. No transformers, no bitsandbytes, no GPU — the Nemotron-H
+(Mamba + MoE) hybrid runs natively in llama.cpp's C++, which is why the local
+build never hit the trouble the transformers/ZeroGPU port did.
+Gradio is the engine room, not the face: it launches the server (and satisfies
+the ZeroGPU platform's startup handshake), then we transplant our own routes
+onto its FastAPI so the custom CRT page and the raw PTY websocket win.
   GET  /            a hand-built Osaka-Jade CRT landing page (static)
+  GET  /api/status  is llama-server up yet?
+  GET  /api/whisper a scripted Warden teaser (never wakes the model)
   GET  /play        an xterm.js terminal, themed to match
+  WS   /pty         a per-visitor pseudo-terminal running `python -m scrypt.app`
 """
 from __future__ import annotations
 import json
 import os
 import random
+import subprocess
 import sys
 import tempfile
+import urllib.request
 from pathlib import Path
+# ZeroGPU contract: import spaces before torch-y things. We don't use the GPU,
+# but on ZeroGPU hardware the platform wants a @spaces.GPU function to exist at
+# startup — we register a trivial stub below purely to satisfy that.
 try:
+    import spaces
 except ImportError:
     spaces = None
+from fastapi import WebSocket, WebSocketDisconnect
+from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
+REPO_ROOT = Path(__file__).resolve().parent.parent
 STATIC = Path(__file__).parent / "static"
 # ------------------------------------------------------------ the Warden brain
+WARDEN_REPO = os.environ.get("WARDEN_REPO", "IMJONEZZ/warden-nemotron-3-nano-30b")
+# Q3_K_S (~18GB): the heaviest tier we've confirmed fits this box's RAM.
+WARDEN_GGUF = os.environ.get("WARDEN_GGUF", "warden-nemotron-3-nano-30b-Q3_K_S.gguf")
+LLAMA_PORT = int(os.environ.get("LLAMA_PORT", "8731"))
+LLAMA_CTX = int(os.environ.get("LLAMA_CTX", "8192"))
+LLAMA_THREADS = os.environ.get("LLAMA_THREADS")  # default: llama.cpp picks
+_llama_proc: subprocess.Popen | None = None
+WARDEN_ERR = "starting"
+def _start_llama() -> None:
+    """Download the GGUF and launch llama-cpp-python's OpenAI server on CPU.
+
+    Outcome is reported through the module-level WARDEN_ERR string rather
+    than a return value: it becomes "loading" once the child process has been
+    spawned, or "<ExceptionType>: <message>" if anything in here raises.
+    Failures are never propagated; the game falls back to scripted."""
+    global _llama_proc, WARDEN_ERR
+    try:
+        # Imported inside the try so a missing huggingface_hub is reported via
+        # WARDEN_ERR like any other startup failure.
+        from huggingface_hub import hf_hub_download
+        print(f"[warden] fetching {WARDEN_REPO}/{WARDEN_GGUF}", flush=True)
+        gguf = hf_hub_download(repo_id=WARDEN_REPO, filename=WARDEN_GGUF)
+        # Server is bound to loopback only.
+        cmd = [
+            sys.executable, "-m", "llama_cpp.server",
+            "--model", gguf,
+            "--host", "127.0.0.1",
+            "--port", str(LLAMA_PORT),
+            "--n_ctx", str(LLAMA_CTX),
+        ]
+        # Only pass a thread count when LLAMA_THREADS was set in the env.
+        if LLAMA_THREADS:
+            cmd += ["--n_threads", LLAMA_THREADS]
+        print(f"[warden] launching llama_cpp.server :{LLAMA_PORT}", flush=True)
+        # The child's output goes to this process's stdout/stderr.
+        _llama_proc = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
+        WARDEN_ERR = "loading"  # health probe flips this to "" when ready
+    except Exception as err:
+        WARDEN_ERR = f"{type(err).__name__}: {err}"
+        print(f"[warden] startup failed: {WARDEN_ERR}", flush=True)
+def _llama_healthy() -> bool:
+    """Return True if GET /v1/models on the local server answers 200 within
+    the 2-second timeout; any error counts as not healthy."""
+    # llama_cpp.server has no /health; /v1/models answers 200 once the model
+    # is loaded and the server is accepting requests.
     try:
+        with urllib.request.urlopen(
+            f"http://127.0.0.1:{LLAMA_PORT}/v1/models", timeout=2
+        ) as r:
+            return r.status == 200
+    # Any failure to get a response is treated the same way: not ready yet.
+    except Exception:
+        return False
+def warden_ready() -> bool:
+    """True while the llama_cpp.server child is alive and has answered
+    /v1/models at least once.
+
+    The HTTP probe result is cached once it succeeds (WARDEN_ERR == ""), so
+    steady-state calls cost only a non-blocking Popen.poll(). If the child has
+    exited — during load or after having been ready — WARDEN_ERR records the
+    exit code and this returns False, so callers stop pointing new games at a
+    dead server and /api/status stops reporting "loading"/"ready" forever."""
+    global WARDEN_ERR
+    if _llama_proc is None:
+        # Not spawned yet, or startup failed; WARDEN_ERR already says which.
+        return False
+    exit_code = _llama_proc.poll()
+    if exit_code is not None:
+        WARDEN_ERR = f"llama_cpp.server exited (code {exit_code})"
+        return False
+    if WARDEN_ERR == "":
+        return True
+    if _llama_healthy():
+        WARDEN_ERR = ""
+        return True
+    return False
 # ----------------------------------------------------------------- the surface
 # Curated in-voice teasers for the landing page. Scripted on purpose: the
+# greeter must never cost an inference call or wait on the model.
 WHISPERS = [
     "Another process wakes in my machine. Show me what you are.",
     "You are a small thing in a large filesystem. I am the filesystem.",
     "Trespasser. The door was open because nothing has ever made it out.",
 ]
+# We attach these to gradio's FastAPI in __main__; collect them on an APIRouter
+# here so the transplant stays explicit.
+from fastapi import APIRouter  # noqa: E402
+router = APIRouter()
+@router.get("/api/status")
 def status() -> dict:
+    """Report the Warden's state: whether it is ready, the startup state or
+    error string if it is not, and which model and engine are configured."""
+    ready = warden_ready()
     return {
+        "warden_ready": ready,
+        "warden_state": "ready" if ready else WARDEN_ERR,
+        "model": f"{WARDEN_REPO}/{WARDEN_GGUF}",
+        "engine": "llama.cpp (cpu)",
     }
+@router.get("/api/whisper")
 def whisper() -> dict:
+    """Return one randomly chosen scripted line from WHISPERS."""
     return {"line": random.choice(WHISPERS)}
+@router.get("/")
 def landing() -> FileResponse:
+    """Serve the landing page, index.html from the STATIC directory."""
     return FileResponse(STATIC / "index.html")
+@router.get("/play")
 def play() -> FileResponse:
+    """Serve the terminal page, play.html from the STATIC directory."""
     return FileResponse(STATIC / "play.html")
 # ----------------------------------------------------------- the PTY bridge
 def game_env() -> dict:
+    """Environment for one visitor's game process. The game's `api` backend
+    points at our shared llama-server; if it isn't up, the game falls back to
+    the scripted Warden. Sandboxes are always fabricated here."""
     env = {
         "TERM": "xterm-256color",
         "COLORTERM": "truecolor",
         "PYTHONUNBUFFERED": "1",
         "PYTHONPATH": str(REPO_ROOT),
     }
+    # Readiness is probed at call time: an environment built before the
+    # server is up gets the scripted backend.
+    if warden_ready():
         env |= {
             "SCRYPT_BACKEND": "api",
+            "SCRYPT_API_BASE": f"http://127.0.0.1:{LLAMA_PORT}/v1",
+            "SCRYPT_API_KEY": "local",  # llama-server ignores it; backend wants one
+            "SCRYPT_MODEL": "warden",
         }
     else:
         env["SCRYPT_BACKEND"] = "scripted"
     return env
         pass
+@router.websocket("/pty")
 async def pty_bridge(ws: WebSocket) -> None:
     """One visitor, one game process, one private sandbox. Keystrokes flow
     in as binary; a JSON {"resize":[cols,rows]} frame retunes the terminal."""
         os.close(master_fd)
 # ------------------------------------------------------------ the engine room
 import gradio as gr  # noqa: E402
+def _gpu_stub(x: str) -> str:
+    """No-op so ZeroGPU sees a @spaces.GPU function at startup. We never call
+    it — inference is CPU llama-server — but the platform requires one to
+    exist on ZeroGPU hardware. The argument is ignored; the result is always
+    "ok"."""
+    return "ok"
+# Without the `spaces` package (import failed above) the stub stays a plain
+# function.
+if spaces is not None:
+    _gpu_stub = spaces.GPU(duration=10)(_gpu_stub)
 with gr.Blocks(title="SCRYPT engine room") as engine:
     gr.Markdown(
+        "# SCRYPT engine room\n\n"
+        "The game lives at [/](/). This page only exists so the platform has a "
+        "Gradio app to host; the Warden runs on llama.cpp behind the scenes."
     )
+    # Register the stub as a named API endpoint on this Blocks app.
+    gr.api(_gpu_stub, api_name="gpu_stub")
 if __name__ == "__main__":
+    import threading
+    # Start the model load in the background so the web layer (landing page,
+    # whisper, even a scripted-fallback game) is reachable while the GGUF
+    # downloads and the llama_cpp.server process warms up.
+    threading.Thread(target=_start_llama, daemon=True).start()
+    # Gradio launches the server (and arms the ZeroGPU startup handshake); we
+    # transplant our routes in FRONT of gradio's so "/" is the CRT page and the
+    # PTY websocket resolves before any catch-all. ssr_mode=False keeps gradio
+    # from spawning a Node frontend that would seize the port.
     fastapi_app, _, _ = engine.launch(
         prevent_thread_lock=True,
         server_name="0.0.0.0",
         ssr_mode=False,
         quiet=True,
     )
+    # Everything registered below is appended to the tail of gradio's route
+    # table, behind gradio's own "/" page and catch-alls. Remember where the
+    # tail starts so the whole batch can be moved to the front.
+    routes = fastapi_app.router.routes
+    first_ours = len(routes)
+    fastapi_app.include_router(router)
+    fastapi_app.mount("/static", StaticFiles(directory=STATIC), name="static")
+    # Move our routes ahead of gradio's. Selecting by position rather than by
+    # endpoint name also carries the "/static" mount to the front, and cannot
+    # pick up a gradio route that merely shares a name with one of ours. A
+    # single slice assignment never leaves the table missing routes, which
+    # matters because the server is already accepting requests.
+    routes[:] = routes[first_ours:] + routes[:first_ours]
     engine.block_thread()