IMJONEZZ committed on
Commit
e577af2
·
1 Parent(s): c1a8f99

space: revert to Gradio SDK + CPU llama-cpp-python (keeps the prize; ZeroGPU was the problem, not the SDK)

Browse files

- sdk: gradio (5.49.1) again; ZeroGPU hardware is what failed, not Gradio
- inference via llama-cpp-python[server] CPU wheel (Nemotron-H runs native in
llama.cpp, no torch/bnb/mamba-ssm); Q3_K_S GGUF fetched at boot
- gradio launches the server + ZeroGPU stub; our CRT/PTY routes transplanted in front
- game subprocesses -> api backend -> local llama server; scripted fallback if down
- terminal-size fix (play.html/app.css) untouched and preserved

Files changed (4) hide show
  1. .gitignore +3 -0
  2. README.md +1 -4
  3. requirements.txt +10 -18
  4. space/app.py +130 -360
.gitignore CHANGED
@@ -30,3 +30,6 @@ finetune/_nemo_src/
30
  # OS / editor
31
  .DS_Store
32
  *.swp
 
 
 
 
30
  # OS / editor
31
  .DS_Store
32
  *.swp
33
+
34
+ # local screenshots / scratch
35
+ *.png
README.md CHANGED
@@ -4,14 +4,11 @@ emoji: 🕯️
4
  colorFrom: green
5
  colorTo: gray
6
  sdk: gradio
7
- # gradio 5, not 6: transformers<5 (required by the Warden's remote-code
8
- # checkpoint) needs huggingface-hub<1.0, which gradio 6 forbids.
9
  sdk_version: 5.49.1
10
- python_version: "3.12"
11
  app_file: space/app.py
12
  pinned: false
13
  license: other
14
- short_description: Finetuned Nemotron-3-nano runs a roguelike deckbuilder
15
  ---
16
 
17
  # SCRYPT
 
4
  colorFrom: green
5
  colorTo: gray
6
  sdk: gradio
 
 
7
  sdk_version: 5.49.1
 
8
  app_file: space/app.py
9
  pinned: false
10
  license: other
11
+ short_description: A roguelike deckbuilder run by a finetuned Nemotron-3-nano
12
  ---
13
 
14
  # SCRYPT
requirements.txt CHANGED
@@ -1,22 +1,14 @@
1
- # HF Space (Gradio SDK / ZeroGPU) dependencies. The scrypt package itself is
2
- # imported from the repo checkout via sys.path see space/app.py.
 
 
 
 
 
 
 
 
3
  textual>=1.0
4
  rich>=13.0
5
  pyyaml>=6.0
6
  httpx>=0.27
7
- uvicorn[standard]>=0.30
8
- # torch 2.10, not 2.8: the mamba-ssm wheels declare triton>=3.5, which only
9
- # torch>=2.9 satisfies (torch 2.8 pins triton==3.4 — upstream's "torch2.8"
10
- # wheel can't actually resolve against torch 2.8).
11
- torch==2.10.0
12
- # <5: the bnb-4bit Warden was exported under 4.57 remote-code structure;
13
- # transformers 5's native NemotronH renames/relayouts the modules and
14
- # silently drops every attention + expert quant tensor on load.
15
- transformers>=4.57,<5
16
- accelerate
17
- bitsandbytes
18
- # Nemotron-H hard-imports mamba_ssm's triton kernels; prebuilt wheels pinned
19
- # to torch 2.10 / cu12 / cp312 because pip's isolated build env can't compile
20
- # them (their setup.py imports torch).
21
- https://github.com/state-spaces/mamba/releases/download/v2.3.2.post1/mamba_ssm-2.3.2.post1+cu12torch2.10cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
22
- https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.6.2.post1/causal_conv1d-1.6.2.post1+cu12torch2.10cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
 
1
+ # HF Space (Gradio SDK) deps. Inference is llama.cpp via llama-cpp-python's
2
+ # prebuilt CPU wheel (the [server] extra gives an OpenAI-compatible server) —
3
+ # NOT transformers, so none of the torch / bnb / mamba-ssm stack the ZeroGPU
4
+ # port choked on. llama.cpp runs the Nemotron-H (Mamba+MoE) hybrid natively.
5
+ # The scrypt package is imported from the repo checkout via sys.path.
6
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
7
+ llama-cpp-python[server]==0.3.28
8
+ gradio==5.49.1
9
+ spaces
10
+ huggingface_hub>=0.30
11
  textual>=1.0
12
  rich>=13.0
13
  pyyaml>=6.0
14
  httpx>=0.27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
space/app.py CHANGED
@@ -1,29 +1,22 @@
1
- """SCRYPT on the web — a custom frontend riding gradio's backend, ZeroGPU brain.
2
 
3
- The prize brief: push past the default Gradio look. So Gradio here is the
4
- *engine room*, not the face. We build our own FastAPI surface (landing page,
5
- xterm.js terminal, raw PTY websocket) and mount a minimal gr.Blocks at
6
- /engine it exists so the ZeroGPU machinery has a Gradio app to hang onto,
7
- and as a bare smoke-test console for the model.
 
 
 
 
 
 
8
 
9
  GET / a hand-built Osaka-Jade CRT landing page (static)
10
- GET /api/whisper the Warden mutters a line, in voice, so the landing
11
- page can make the machine speak before you enter
12
  GET /play an xterm.js terminal, themed to match
13
- WS /pty a pseudo-terminal bridge: each visitor gets their own
14
- `python -m scrypt.app` subprocess — their own sandbox,
15
- their own Warden — streamed to the browser byte for byte
16
- POST /v1/chat/completions
17
- OpenAI-style SSE endpoint backed by a @spaces.GPU
18
- generator. Game subprocesses can't hold a ZeroGPU slot
19
- themselves, so they speak the existing `api` backend
20
- protocol at this loopback URL. Guarded by a per-boot
21
- token: visitors can't burn GPU quota directly.
22
-
23
- On ZeroGPU the model loads 4-bit at startup (CUDA is emulated until a
24
- @spaces.GPU call attaches a real slice). Anywhere else — local Docker,
25
- a laptop — there is no `spaces` package, no model, and the game falls back
26
- to operator-supplied API env or the scripted Warden. The game never stalls.
27
  """
28
 
29
  from __future__ import annotations
@@ -32,208 +25,94 @@ import asyncio
32
  import json
33
  import os
34
  import random
35
- import secrets
36
  import sys
37
  import tempfile
 
38
  from pathlib import Path
39
 
40
- REPO_ROOT = Path(__file__).resolve().parent.parent
41
- if str(REPO_ROOT) not in sys.path:
42
- sys.path.insert(0, str(REPO_ROOT))
43
-
44
- # ZeroGPU contract: `import spaces` must precede any CUDA-touching import.
45
  try:
46
- import spaces # noqa: F401 (present on HF Spaces, absent elsewhere)
47
  except ImportError:
48
  spaces = None
49
 
50
- from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect
51
- from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
52
  from fastapi.staticfiles import StaticFiles
53
 
 
54
  STATIC = Path(__file__).parent / "static"
55
 
56
  # ------------------------------------------------------------ the Warden brain
57
 
58
- # Model source, in preference order:
59
- # 1. WARDEN_MODEL env (+ optional WARDEN_SUBFOLDER)
60
- # 2. weights shipped in the repo at model/ (only possible with persistent
61
- # storage Space repos cap LFS at 1GB, so this dir is normally absent)
62
- # 3. the released Warden, pre-quantized nf4 (~18GB, HF-internal download —
63
- # this is what kills the boot-time 63GB download + quantize wait)
64
- _SHIPPED = REPO_ROOT / "model"
65
- if os.environ.get("WARDEN_MODEL"):
66
- MODEL_ID = os.environ["WARDEN_MODEL"]
67
- SUBFOLDER = os.environ.get("WARDEN_SUBFOLDER", "")
68
- elif any(_SHIPPED.glob("*.safetensors")):
69
- MODEL_ID, SUBFOLDER = str(_SHIPPED), ""
70
- else:
71
- MODEL_ID, SUBFOLDER = "IMJONEZZ/warden-nemotron-3-nano-30b", "bnb-4bit"
72
- INTERNAL_KEY = os.environ.get("SCRYPT_INTERNAL_KEY") or secrets.token_hex(16)
73
-
74
- tok = None
75
- model = None
76
- WARDEN_ERR = "spaces package not present (not on a ZeroGPU Space)"
77
-
78
- MAMBA_DIAG = ""
79
-
80
-
81
- def _ensure_mamba_kernels() -> None:
82
- """Nemotron-H's modeling code hard-imports mamba_ssm's triton kernels.
83
- Neither mamba-ssm nor causal-conv1d can sit in requirements.txt (their
84
- builds import torch, which pip's isolated build env doesn't have), so
85
- bootstrap here: first try the full install — their setup.py fetches a
86
- prebuilt wheel matching torch/cuda/python when one exists — then fall
87
- back to a kernels-skipped mamba-ssm (pure triton, no causal-conv1d:
88
- half-installed causal-conv1d would crash the modeling import, absent
89
- causal-conv1d just disables the fast path)."""
90
- import subprocess
91
- import traceback
92
-
93
- global MAMBA_DIAG
94
 
95
- try:
96
- from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn # noqa: F401
97
-
98
- MAMBA_DIAG = "ok (wheel)"
99
- return
100
- except Exception:
101
- MAMBA_DIAG = "import failed: " + traceback.format_exc(limit=2).strip()[-400:]
102
- base = [sys.executable, "-m", "pip", "install", "--no-build-isolation"]
103
- full = subprocess.run(
104
- base + ["causal-conv1d", "mamba-ssm"], capture_output=True, timeout=900
105
- )
106
- if full.returncode == 0:
107
- MAMBA_DIAG += " | pip full install: ok"
108
- return
109
- MAMBA_DIAG += " | pip full install rc=%d: %s" % (
110
- full.returncode,
111
- full.stderr.decode(errors="replace").strip()[-400:],
112
- )
113
- subprocess.run(
114
- [sys.executable, "-m", "pip", "uninstall", "-y", "causal-conv1d"],
115
- capture_output=True,
116
- )
117
- skip = subprocess.run(
118
- base + ["mamba-ssm"],
119
- capture_output=True,
120
- timeout=900,
121
- env={**os.environ, "MAMBA_SKIP_CUDA_BUILD": "TRUE"},
122
- )
123
- MAMBA_DIAG += " | pip skip-cuda rc=%d: %s" % (
124
- skip.returncode,
125
- skip.stderr.decode(errors="replace").strip()[-200:],
126
- )
127
 
128
 
129
- def _ensure_model():
130
- """Load the model the FIRST time a GPU call runs, and cache it. CRITICAL:
131
- this must NOT run at module level. bitsandbytes + device_map='cuda'
132
- initializes a real CUDA context wherever it runs; if that's the main
133
- process, ZeroGPU's forked GPU worker inherits a poisoned context and every
134
- call aborts in device_lazy_init. Loading here means CUDA is only ever
135
- touched inside the @spaces.GPU worker, which is ZeroGPU's contract."""
136
- global model, tok
137
- if model is not None:
138
- return model, tok
139
-
140
- import torch
141
- from transformers import (
142
- AutoConfig,
143
- AutoModelForCausalLM,
144
- AutoTokenizer,
145
- BitsAndBytesConfig,
146
- )
147
-
148
- tok = AutoTokenizer.from_pretrained(
149
- MODEL_ID, subfolder=SUBFOLDER, trust_remote_code=True
150
- )
151
- # The released bnb-4bit weights already carry quantization_config; only
152
- # quantize on the fly when pointed at raw BF16.
153
- cfg = AutoConfig.from_pretrained(
154
- MODEL_ID, subfolder=SUBFOLDER, trust_remote_code=True
155
- )
156
- quant_kwargs = (
157
- {}
158
- if getattr(cfg, "quantization_config", None)
159
- else {
160
- "quantization_config": BitsAndBytesConfig(
161
- load_in_4bit=True,
162
- bnb_4bit_quant_type="nf4",
163
- bnb_4bit_compute_dtype=torch.bfloat16,
164
- )
165
- }
166
- )
167
- model = AutoModelForCausalLM.from_pretrained(
168
- MODEL_ID,
169
- subfolder=SUBFOLDER,
170
- trust_remote_code=True,
171
- device_map="cuda",
172
- **quant_kwargs,
173
- )
174
- return model, tok
175
 
176
 
177
- if spaces is not None:
178
- # Module level: only CPU-safe prep install kernels and confirm the repo
179
- # is reachable. NO model load here (would init CUDA; see _ensure_model).
180
  try:
181
- _ensure_mamba_kernels()
182
- from transformers import AutoConfig
 
 
 
 
183
 
184
- AutoConfig.from_pretrained(
185
- MODEL_ID, subfolder=SUBFOLDER, trust_remote_code=True
186
- )
187
- WARDEN_ERR = ""
188
- except Exception as err: # the game survives without the model (scripted)
189
- WARDEN_ERR = f"{type(err).__name__}: {err}"
190
 
191
- # READY means "ready to load on first GPU call", not "loaded" — the weights
192
- # materialize inside the worker. A load failure there returns 503 -> scripted.
193
- WARDEN_READY = not WARDEN_ERR
194
-
195
-
196
- def _generate_impl(messages, max_tokens, temperature, enable_thinking):
197
- """Blocking generate -> full decoded text. Deliberately NOT a streaming
198
- generator with a background thread: under ZeroGPU the GPU work runs in a
199
- forked subprocess, and a Thread + TextIteratorStreamer across that fork
200
- boundary hangs. Our generations are a single line (tens of tokens), so a
201
- blocking call costs a second or two and the game's typewriter handles the
202
- reveal client-side. The model loads here on the first call (inside the GPU
203
- worker), not at import — see _ensure_model."""
204
- import torch
205
-
206
- model, tok = _ensure_model()
207
- inputs = tok.apply_chat_template(
208
- messages,
209
- add_generation_prompt=True,
210
- return_tensors="pt",
211
- enable_thinking=enable_thinking,
212
- ).to(model.device)
213
- with torch.no_grad():
214
- out = model.generate(
215
- input_ids=inputs,
216
- max_new_tokens=max_tokens,
217
- do_sample=temperature > 0,
218
- temperature=max(temperature, 1e-3),
219
- top_p=0.95,
220
- )
221
- return tok.decode(out[0, inputs.shape[1]:], skip_special_tokens=True)
222
-
223
-
224
- # The GPU entry point. Invoked through Gradio's request pipeline (gr.api
225
- # below). duration=180 covers the first call, which loads the ~18GB model into
226
- # the worker's GPU before generating; later calls reuse the warm worker.
227
- if spaces is not None:
228
- warden_gpu = spaces.GPU(duration=180)(_generate_impl)
229
- else:
230
- warden_gpu = _generate_impl
231
 
232
 
233
  # ----------------------------------------------------------------- the surface
234
 
235
  # Curated in-voice teasers for the landing page. Scripted on purpose: the
236
- # greeter must never cost an API call or wake the model.
237
  WHISPERS = [
238
  "Another process wakes in my machine. Show me what you are.",
239
  "You are a small thing in a large filesystem. I am the filesystem.",
@@ -245,155 +124,60 @@ WHISPERS = [
245
  "Trespasser. The door was open because nothing has ever made it out.",
246
  ]
247
 
248
- # Plain FastAPI as a route *container* — never served directly; the routes
249
- # are transplanted onto gradio's app in __main__ (works on gradio 5 and 6).
250
- app = FastAPI()
251
 
 
 
 
252
 
253
- def _fast_path_report() -> str:
254
- """Whether Nemotron-H's mamba CUDA fast path can engage. The slow naive
255
- fallback is ~1 tok/s; the fast path needs BOTH mamba_ssm AND causal_conv1d
256
- kernels present."""
257
- try:
258
- import importlib
259
 
260
- bits = {}
261
- try:
262
- m = importlib.import_module("mamba_ssm.ops.triton.ssd_combined")
263
- bits["mamba_chunk_scan_combined"] = m.mamba_chunk_scan_combined is not None
264
- except Exception as e:
265
- bits["mamba_ssm"] = f"FAIL {e}"
266
- try:
267
- c = importlib.import_module("causal_conv1d")
268
- bits["causal_conv1d_fn"] = getattr(c, "causal_conv1d_fn", None) is not None
269
- except Exception as e:
270
- bits["causal_conv1d"] = f"FAIL {e}"
271
- return str(bits)
272
- except Exception as e:
273
- return f"probe failed: {e}"
274
 
275
-
276
- @app.get("/api/status")
277
  def status() -> dict:
278
- """Operational truth, no secrets: is the on-Space Warden actually loaded?"""
279
  return {
280
- "warden_ready": WARDEN_READY,
281
- "warden_error": WARDEN_ERR,
282
- "mamba": MAMBA_DIAG,
283
- "fast_path": _fast_path_report(),
284
- "model": MODEL_ID + (f"/{SUBFOLDER}" if SUBFOLDER else ""),
285
- "zerogpu": spaces is not None,
286
  }
287
 
288
 
289
- @app.get("/api/whisper")
290
  def whisper() -> dict:
291
- """A single scripted Warden line. The landing page polls this so the
292
- machine is already talking before you commit. Scripted on purpose: the
293
- greeter must never cost an API call or wake the model."""
294
  return {"line": random.choice(WHISPERS)}
295
 
296
 
297
- @app.get("/")
298
  def landing() -> FileResponse:
299
  return FileResponse(STATIC / "index.html")
300
 
301
 
302
- @app.get("/play")
303
  def play() -> FileResponse:
304
  return FileResponse(STATIC / "play.html")
305
 
306
 
307
- # ------------------------------------------------- the loopback inference API
308
-
309
- _gradio_client = None
310
-
311
-
312
- def _gradio_generate(messages, max_tokens, temperature, thinking):
313
- """Invoke the GPU function through the in-process Gradio server over
314
- localhost, so the call rides Gradio's request pipeline (and thus the
315
- ZeroGPU hooks). The client is built lazily on first use — by then the
316
- server is up. Args are JSON-serialized because gr.api takes a single
317
- JSON payload (see the registration in __main__)."""
318
- global _gradio_client
319
- if _gradio_client is None:
320
- from gradio_client import Client
321
-
322
- _gradio_client = Client("http://127.0.0.1:7860", verbose=False)
323
- payload = json.dumps(
324
- {
325
- "messages": messages,
326
- "max_tokens": max_tokens,
327
- "temperature": temperature,
328
- "thinking": thinking,
329
- }
330
- )
331
- return _gradio_client.predict(payload, api_name="/warden_generate")
332
-
333
-
334
- @app.post("/v1/chat/completions")
335
- async def chat_completions(request: Request):
336
- """OpenAI-compatible SSE, just enough for scrypt.inference.api. Only the
337
- game's own subprocesses hold the per-boot bearer; everyone else gets 401
338
- rather than a lever on our GPU quota."""
339
- if request.headers.get("authorization") != f"Bearer {INTERNAL_KEY}":
340
- return JSONResponse({"error": "unauthorized"}, status_code=401)
341
- if not WARDEN_READY:
342
- return JSONResponse({"error": f"warden offline: {WARDEN_ERR}"}, status_code=503)
343
-
344
- body = await request.json()
345
- messages = body.get("messages", [])
346
- max_tokens = int(body.get("max_tokens", 256))
347
- temperature = float(body.get("temperature", 0.6))
348
- thinking = bool(body.get("chat_template_kwargs", {}).get("enable_thinking", False))
349
-
350
- # Call the GPU through Gradio's own pipeline (see warden_gpu / gr.api):
351
- # that's the only path that arms the ZeroGPU per-request CUDA context.
352
- # Off the event loop, and never let a failure hang the request — a clean
353
- # 503 lets the game's api backend fall back to scripted.
354
- from starlette.concurrency import run_in_threadpool
355
-
356
- try:
357
- text = await run_in_threadpool(
358
- _gradio_generate, messages, max_tokens, temperature, thinking
359
- )
360
- except Exception as err:
361
- import traceback
362
-
363
- traceback.print_exc()
364
- return JSONResponse(
365
- {"error": f"{type(err).__name__}: {err}"}, status_code=503
366
- )
367
-
368
- def sse():
369
- # One delta then DONE — the game types it out client-side.
370
- yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
371
- yield "data: [DONE]\n\n"
372
-
373
- return StreamingResponse(sse(), media_type="text/event-stream")
374
-
375
-
376
  # ----------------------------------------------------------- the PTY bridge
377
 
378
 
379
  def game_env() -> dict:
380
- """Environment for one visitor's game process. Sandboxes are always
381
- fabricated here; a hosted box never mirrors a real home."""
 
382
  env = {
383
  "TERM": "xterm-256color",
384
  "COLORTERM": "truecolor",
385
  "PYTHONUNBUFFERED": "1",
386
  "PYTHONPATH": str(REPO_ROOT),
387
  }
388
- if WARDEN_READY:
389
  env |= {
390
  "SCRYPT_BACKEND": "api",
391
- "SCRYPT_API_BASE": "http://127.0.0.1:7860/v1",
392
- "SCRYPT_API_KEY": INTERNAL_KEY,
393
- "SCRYPT_MODEL": MODEL_ID,
394
  }
395
- elif os.environ.get("SCRYPT_API_KEY"):
396
- env["SCRYPT_BACKEND"] = os.environ.get("SCRYPT_BACKEND", "api")
397
  else:
398
  env["SCRYPT_BACKEND"] = "scripted"
399
  return env
@@ -411,7 +195,7 @@ async def _pump_pty_to_ws(master_fd: int, ws: WebSocket) -> None:
411
  pass
412
 
413
 
414
- @app.websocket("/pty")
415
  async def pty_bridge(ws: WebSocket) -> None:
416
  """One visitor, one game process, one private sandbox. Keystrokes flow
417
  in as binary; a JSON {"resize":[cols,rows]} frame retunes the terminal."""
@@ -460,61 +244,43 @@ async def pty_bridge(ws: WebSocket) -> None:
460
  os.close(master_fd)
461
 
462
 
463
- # Fonts and the stylesheet live as real files so the page can be designed
464
- # like a page, not a Python string. Mounted last: our routes win first.
465
- app.mount("/static", StaticFiles(directory=STATIC), name="static")
466
-
467
-
468
  # ------------------------------------------------------------ the engine room
469
 
470
  import gradio as gr # noqa: E402
471
 
472
 
473
- def _api_generate(payload_json: str) -> str:
474
- """The GPU endpoint, reached through Gradio's request pipeline. Takes a
475
- JSON string ({messages, max_tokens, temperature, thinking}) and returns
476
- the Warden's line. The loopback /v1/chat/completions route calls this via
477
- gradio_client; that pipeline is what arms the ZeroGPU CUDA context."""
478
- p = json.loads(payload_json)
479
- return warden_gpu(
480
- p["messages"], p["max_tokens"], p["temperature"], p["thinking"]
481
- )
482
 
483
 
484
- def _probe(text: str):
485
- """Manual smoke test: one message in, the Warden's line back. Runs inside
486
- a Gradio event, so it's safe to call the GPU function directly here."""
487
- if not WARDEN_READY:
488
- return f"warden offline: {WARDEN_ERR}"
489
- try:
490
- return warden_gpu([{"role": "user", "content": text}], 80, 0.6, False)
491
- except Exception as err:
492
- return f"generation failed: {type(err).__name__}: {err}"
493
 
494
 
495
  with gr.Blocks(title="SCRYPT engine room") as engine:
496
  gr.Markdown(
497
- "# SCRYPT engine room\n"
498
- f"model: `{MODEL_ID}`\n\n"
499
- f"status: {'**ready**' if WARDEN_READY else f'offline {WARDEN_ERR}'}\n\n"
500
- "The game lives at [/](/) — this page only exists to keep the "
501
- "machinery warm and let us poke the model directly."
502
  )
503
- box = gr.Textbox(label="say something to the Warden")
504
- out = gr.Textbox(label="the Warden")
505
- box.submit(_probe, box, out)
506
- # The loopback inference path: /v1/chat/completions -> gradio_client ->
507
- # this, so the GPU call rides Gradio's request pipeline.
508
- gr.api(_api_generate, api_name="warden_generate")
509
 
510
  if __name__ == "__main__":
511
- # ZeroGPU's platform handshake ("@spaces.GPU function detected") happens
512
- # inside Blocks.launch(), which the spaces package patches — serving with
513
- # bare uvicorn gets the app SIGTERMed at startup. So gradio launches the
514
- # server, and we transplant our routes onto its FastAPI, *in front of*
515
- # gradio's, so the CRT landing keeps "/" and the PTY websocket resolves
516
- # before any catch-all. ssr_mode=False is still load-bearing: the SSR
517
- # node frontend would otherwise seize the port and proxy to nowhere.
 
 
 
 
518
  fastapi_app, _, _ = engine.launch(
519
  prevent_thread_lock=True,
520
  server_name="0.0.0.0",
@@ -522,10 +288,14 @@ if __name__ == "__main__":
522
  ssr_mode=False,
523
  quiet=True,
524
  )
525
- OUR_PATHS = {
526
- "/", "/play", "/api/whisper", "/api/status",
527
- "/v1/chat/completions", "/pty", "/static",
528
- }
529
- ours = [r for r in app.router.routes if getattr(r, "path", None) in OUR_PATHS]
530
- fastapi_app.router.routes[0:0] = ours
 
 
 
 
531
  engine.block_thread()
 
1
+ """SCRYPT on the web — the local engine, hosted, on a free Gradio Space.
2
 
3
+ The Warden runs exactly as it does on a player's machine: llama.cpp serving
4
+ our Warden GGUF, on CPU. We use llama-cpp-python's prebuilt CPU wheel, fetch the GGUF
5
+ at boot, start one shared OpenAI-compatible server, and every visitor's game
6
+ subprocess talks to it through the game's existing `api` backend over
7
+ localhost. No transformers, no bitsandbytes, no GPU — the Nemotron-H
8
+ (Mamba + MoE) hybrid runs natively in llama.cpp's C++, which is why the local
9
+ build never hit the trouble the transformers/ZeroGPU port did.
10
+
11
+ Gradio is the engine room, not the face: it launches the server (and satisfies
12
+ the ZeroGPU platform's startup handshake), then we transplant our own routes
13
+ onto its FastAPI so the custom CRT page and the raw PTY websocket win.
14
 
15
  GET / a hand-built Osaka-Jade CRT landing page (static)
16
+ GET /api/status is llama-server up yet?
17
+ GET /api/whisper a scripted Warden teaser (never wakes the model)
18
  GET /play an xterm.js terminal, themed to match
19
+ WS /pty a per-visitor pseudo-terminal running `python -m scrypt.app`
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  """
21
 
22
  from __future__ import annotations
 
25
  import json
26
  import os
27
  import random
28
+ import subprocess
29
  import sys
30
  import tempfile
31
+ import urllib.request
32
  from pathlib import Path
33
 
34
+ # ZeroGPU contract: import spaces before torch-y things. We don't use the GPU,
35
+ # but on ZeroGPU hardware the platform wants a @spaces.GPU function to exist at
36
+ # startup — we register a trivial stub below purely to satisfy that.
 
 
37
  try:
38
+ import spaces
39
  except ImportError:
40
  spaces = None
41
 
42
+ from fastapi import WebSocket, WebSocketDisconnect
43
+ from fastapi.responses import FileResponse
44
  from fastapi.staticfiles import StaticFiles
45
 
46
+ REPO_ROOT = Path(__file__).resolve().parent.parent
47
  STATIC = Path(__file__).parent / "static"
48
 
49
  # ------------------------------------------------------------ the Warden brain
50
 
51
+ WARDEN_REPO = os.environ.get("WARDEN_REPO", "IMJONEZZ/warden-nemotron-3-nano-30b")
52
+ # Q3_K_S (~18GB): the heaviest tier we've confirmed fits this box's RAM.
53
+ WARDEN_GGUF = os.environ.get("WARDEN_GGUF", "warden-nemotron-3-nano-30b-Q3_K_S.gguf")
54
+ LLAMA_PORT = int(os.environ.get("LLAMA_PORT", "8731"))
55
+ LLAMA_CTX = int(os.environ.get("LLAMA_CTX", "8192"))
56
+ LLAMA_THREADS = os.environ.get("LLAMA_THREADS") # default: llama.cpp picks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
+ _llama_proc: subprocess.Popen | None = None
59
+ WARDEN_ERR = "starting"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
def _start_llama() -> None:
    """Fetch the Warden GGUF and spawn llama-cpp-python's OpenAI server on CPU.

    The GGUF is downloaded with ``hf_hub_download`` and then
    ``python -m llama_cpp.server`` is started on 127.0.0.1:LLAMA_PORT, with the
    process handle stored in ``_llama_proc``. On success WARDEN_ERR is set to
    "loading" (the health probe clears it once the server answers). Any
    failure is recorded in WARDEN_ERR instead of being raised, so the game
    falls back to the scripted Warden.
    """
    global _llama_proc, WARDEN_ERR
    try:
        # Imported inside the try so a missing package lands in WARDEN_ERR too.
        from huggingface_hub import hf_hub_download

        print(f"[warden] fetching {WARDEN_REPO}/{WARDEN_GGUF}", flush=True)
        model_path = hf_hub_download(repo_id=WARDEN_REPO, filename=WARDEN_GGUF)

        options = [
            ("--model", model_path),
            ("--host", "127.0.0.1"),
            ("--port", str(LLAMA_PORT)),
            ("--n_ctx", str(LLAMA_CTX)),
        ]
        if LLAMA_THREADS:
            # Only pin the thread count when the operator asked for one.
            options.append(("--n_threads", LLAMA_THREADS))

        argv = [sys.executable, "-m", "llama_cpp.server"]
        for flag, value in options:
            argv.extend((flag, value))

        print(f"[warden] launching llama_cpp.server :{LLAMA_PORT}", flush=True)
        _llama_proc = subprocess.Popen(argv, stdout=sys.stdout, stderr=sys.stderr)
        WARDEN_ERR = "loading"  # health probe flips this to "" when ready
    except Exception as err:
        WARDEN_ERR = f"{type(err).__name__}: {err}"
        print(f"[warden] startup failed: {WARDEN_ERR}", flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
 
89
def _llama_healthy() -> bool:
    """Return True when the local llama server answers /v1/models with 200.

    llama_cpp.server has no /health; /v1/models answers 200 once the model is
    loaded and the server is accepting requests. Any error (connection
    refused, timeout, non-2xx) counts as "not healthy".
    """
    probe_url = f"http://127.0.0.1:{LLAMA_PORT}/v1/models"
    try:
        with urllib.request.urlopen(probe_url, timeout=2) as response:
            is_ok = response.status == 200
    except Exception:
        return False
    return is_ok
99
 
 
 
 
 
 
 
100
 
101
def warden_ready() -> bool:
    """Return True while the shared llama server is up and serving.

    Readiness is detected by probing the server's /v1/models endpoint (via
    ``_llama_healthy``) and cached by clearing WARDEN_ERR to "". The cache is
    only trusted while the server process is still alive: if the process has
    exited (for example, it crashed while loading the model), WARDEN_ERR is
    set to an exit message and False is returned, so callers stop pointing
    games at a dead server.
    """
    global WARDEN_ERR
    proc = _llama_proc
    if proc is not None:
        # poll() is a cheap, non-blocking check; None means still running.
        exit_code = proc.poll()
        if exit_code is not None:
            WARDEN_ERR = f"llama server exited (code {exit_code})"
            return False
    if WARDEN_ERR == "":
        return True
    if proc is not None and _llama_healthy():
        WARDEN_ERR = ""
        return True
    return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
 
112
  # ----------------------------------------------------------------- the surface
113
 
114
  # Curated in-voice teasers for the landing page. Scripted on purpose: the
115
+ # greeter must never cost an inference call or wait on the model.
116
  WHISPERS = [
117
  "Another process wakes in my machine. Show me what you are.",
118
  "You are a small thing in a large filesystem. I am the filesystem.",
 
124
  "Trespasser. The door was open because nothing has ever made it out.",
125
  ]
126
 
 
 
 
127
 
128
+ # We attach these to gradio's FastAPI in __main__; they are declared on a
129
+ # standalone APIRouter so the transplant stays explicit.
130
+ from fastapi import APIRouter # noqa: E402
131
 
132
+ router = APIRouter()
 
 
 
 
 
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
@router.get("/api/status")
def status() -> dict:
    """Report whether the Warden is ready, plus the model and engine labels.

    ``warden_state`` is "ready" once the server answers; until then it carries
    the current WARDEN_ERR text.
    """
    is_up = warden_ready()
    state = "ready" if is_up else WARDEN_ERR
    return {
        "warden_ready": is_up,
        "warden_state": state,
        "model": f"{WARDEN_REPO}/{WARDEN_GGUF}",
        "engine": "llama.cpp (cpu)",
    }
144
 
145
 
146
@router.get("/api/whisper")
def whisper() -> dict:
    """Return one randomly chosen scripted line from WHISPERS."""
    line = random.choice(WHISPERS)
    return {"line": line}
149
 
150
 
151
@router.get("/")
def landing() -> FileResponse:
    """Serve the static landing page."""
    page = STATIC / "index.html"
    return FileResponse(page)
154
 
155
 
156
@router.get("/play")
def play() -> FileResponse:
    """Serve the static terminal page."""
    page = STATIC / "play.html"
    return FileResponse(page)
159
 
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  # ----------------------------------------------------------- the PTY bridge
162
 
163
 
164
def game_env() -> dict:
    """Build the environment for one visitor's game process.

    When the shared llama server is ready, the game's `api` backend is pointed
    at it over localhost; otherwise the backend is set to the scripted Warden.
    Sandboxes are always fabricated here.
    """
    env = {
        "TERM": "xterm-256color",
        "COLORTERM": "truecolor",
        "PYTHONUNBUFFERED": "1",
        "PYTHONPATH": str(REPO_ROOT),
    }
    if not warden_ready():
        env["SCRYPT_BACKEND"] = "scripted"
        return env
    env.update(
        {
            "SCRYPT_BACKEND": "api",
            "SCRYPT_API_BASE": f"http://127.0.0.1:{LLAMA_PORT}/v1",
            "SCRYPT_API_KEY": "local",  # llama-server ignores it; backend wants one
            "SCRYPT_MODEL": "warden",
        }
    )
    return env
 
195
  pass
196
 
197
 
198
+ @router.websocket("/pty")
199
  async def pty_bridge(ws: WebSocket) -> None:
200
  """One visitor, one game process, one private sandbox. Keystrokes flow
201
  in as binary; a JSON {"resize":[cols,rows]} frame retunes the terminal."""
 
244
  os.close(master_fd)
245
 
246
 
 
 
 
 
 
247
  # ------------------------------------------------------------ the engine room
248
 
249
  import gradio as gr # noqa: E402
250
 
251
 
252
+ def _gpu_stub(x: str) -> str:
253
+ """No-op so ZeroGPU sees a @spaces.GPU function at startup. We never call
254
+ it inference is CPU llama-server but the platform requires one to
255
+ exist on ZeroGPU hardware."""
256
+ return "ok"
 
 
 
 
257
 
258
 
259
# Only wrap the stub when the `spaces` package imported successfully; elsewhere
# it stays a plain function.
if spaces is not None:
    _gpu_stub = spaces.GPU(duration=10)(_gpu_stub)


# A deliberately minimal Gradio app: one explanatory Markdown block plus the
# stub registered as an API endpoint. The real pages are served by our routes.
with gr.Blocks(title="SCRYPT engine room") as engine:
    gr.Markdown(
        "# SCRYPT engine room\n\n"
        "The game lives at [/](/). This page only exists so the platform has a "
        "Gradio app to host; the Warden runs on llama.cpp behind the scenes."
    )
    gr.api(_gpu_stub, api_name="gpu_stub")
270
+
 
 
 
 
271
 
272
  if __name__ == "__main__":
273
+ import threading
274
+
275
+ # Start the model load in the background so the web layer (landing page,
276
+ # whisper, even a scripted-fallback game) is reachable while the GGUF
277
+ # downloads and the llama server warms up.
278
+ threading.Thread(target=_start_llama, daemon=True).start()
279
+
280
+ # Gradio launches the server (and arms the ZeroGPU startup handshake); we
281
+ # transplant our routes in FRONT of gradio's so "/" is the CRT page and the
282
+ # PTY websocket resolves before any catch-all. ssr_mode=False keeps gradio
283
+ # from spawning a Node frontend that would seize the port.
284
  fastapi_app, _, _ = engine.launch(
285
  prevent_thread_lock=True,
286
  server_name="0.0.0.0",
 
288
  ssr_mode=False,
289
  quiet=True,
290
  )
291
# Register our routes and the static mount, then hoist everything we just
# added ahead of gradio's routes so ours win route matching. Both
# include_router() and mount() append, so whatever sits past the original
# length is ours. Hoisting by position (not by route name) also moves the
# "/static" mount, which a name filter limited to the handler names leaves
# behind gradio's routes.
# NOTE(review): the previous revision hoisted "/static" explicitly, presumably
# because gradio serves its own assets under that prefix — confirm.
routes = fastapi_app.router.routes
gradio_route_count = len(routes)
fastapi_app.include_router(router)
fastapi_app.mount("/static", StaticFiles(directory=STATIC), name="static")
ours = routes[gradio_route_count:]
del routes[gradio_route_count:]
routes[0:0] = ours
300
+
301
  engine.block_thread()