Spaces:

build-small-hackathon
/

Scrypt

Running on Zero

File size: 11,243 Bytes

"""SCRYPT on the web — the finetuned Warden on ZeroGPU.

Structured to match how working ZeroGPU spaces with a custom frontend actually
do it (e.g. the org's own NPCverse): a `gradio.Server` (which IS a FastAPI app)
hosts our custom HTML/websocket routes AND exposes GPU inference through
`@app.api(...)`, and the whole thing is started with gradio's own
`app.launch(...)`. That launch is what installs ZeroGPU's hooks + queue — the
piece my earlier `engine.launch(prevent_thread_lock=True)` + manual route
surgery skipped, which is why every GPU call segfaulted in CUDA init.

The model is bf16, placed on cuda at module level with `.to('cuda')` (NO
device_map="cuda", NO bitsandbytes — both fight ZeroGPU). The @spaces.GPU
function is only ever entered through Gradio (via the @app.api handler, reached
from the loopback /v1 shim with gradio_client), never a bare threadpool call.

  GET  /            CRT landing page
  GET  /api/status  is the Warden loaded?
  GET  /api/probe   ask the live Warden one line
  GET  /api/whisper scripted teaser
  GET  /play        xterm.js terminal
  WS   /pty         per-visitor game subprocess
  POST /v1/chat/completions  loopback OpenAI shim for the game's `api` backend
  api  warden_generate        the @spaces.GPU endpoint, in Gradio's context
"""

from __future__ import annotations

import asyncio
import json
import os
import random
import secrets
import tempfile
from pathlib import Path

# ZeroGPU contract: import spaces before torch.
try:
    import spaces
except ImportError:
    spaces = None

from fastapi import Request, WebSocket, WebSocketDisconnect
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from starlette.concurrency import run_in_threadpool
from gradio import Server

# Directory holding the static frontend assets (next to this module), and the
# repository root two directories above this file.
STATIC = Path(__file__).parent / "static"
REPO_ROOT = Path(__file__).resolve().parent.parent

# Model repo of the Warden; overridable through the WARDEN_MODEL env var.
WARDEN_REPO = os.getenv("WARDEN_MODEL", "IMJONEZZ/warden-nemotron-3-nano-30b")
# Bearer token for the loopback /v1 shim. When SCRYPT_INTERNAL_KEY is unset or
# empty, a fresh random hex token is generated for this process.
INTERNAL_KEY = os.getenv("SCRYPT_INTERNAL_KEY") or secrets.token_hex(16)

# Tokenizer / model handles; they stay None unless the load below succeeds.
tok = model = None
# Reason the Warden is unavailable. This default applies when the `spaces`
# package could not be imported.
WARDEN_ERR = "spaces package not present (not on a ZeroGPU Space)"

# Load the Warden only when the `spaces` package imported. Otherwise tok/model
# keep their None defaults and WARDEN_ERR keeps its initial message.
if spaces is not None:
    try:
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # NO trust_remote_code: use transformers' NATIVE NemotronH, which falls
        # back to pure-PyTorch Mamba ops when mamba_ssm isn't installed. The
        # NVIDIA remote modeling code instead hard-requires mamba_ssm's Triton
        # CUDA kernels, which segfault under ZeroGPU. This is how working
        # Nemotron ZeroGPU spaces do it.
        tok = AutoTokenizer.from_pretrained(WARDEN_REPO)
        model = AutoModelForCausalLM.from_pretrained(
            WARDEN_REPO,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
        )
        model.to("cuda")  # intercepted by ZeroGPU emulation; migrated per call
        model.eval()
        WARDEN_ERR = ""  # empty error text marks a successful load
    except Exception as err:
        # Any failure during import or load is recorded instead of raised, so
        # the rest of the module still gets defined and the reason stays
        # available in WARDEN_ERR.
        import traceback

        traceback.print_exc()
        WARDEN_ERR = f"{type(err).__name__}: {err}"

# The Warden counts as ready exactly when no error text is recorded.
WARDEN_READY = not WARDEN_ERR


def _generate_impl(messages, max_tokens, temperature, enable_thinking):
    """Run one chat completion on the Warden and return only the new text.

    ``messages`` is rendered through the tokenizer's chat template (with a
    generation prompt appended and ``enable_thinking`` forwarded to the
    template), generation is capped at ``max_tokens`` new tokens, and the
    prompt tokens are sliced off before decoding. Sampling is enabled only
    when ``temperature`` is above zero; the temperature handed to
    ``generate`` is floored at 1e-3.
    """
    import torch

    # transformers 5: apply_chat_template hands back a BatchEncoding (dict),
    # not a bare tensor, so it is unpacked into generate() as keyword args.
    encoded = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        enable_thinking=enable_thinking,
    )
    inputs = {name: tensor.to("cuda") for name, tensor in encoded.items()}
    prompt_len = inputs["input_ids"].shape[1]
    sampling = temperature > 0

    with torch.no_grad():
        generated = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=sampling,
            temperature=max(temperature, 1e-3),
            top_p=0.95,
        )

    # Everything from prompt_len onwards is the newly generated continuation.
    completion_ids = generated[0, prompt_len:]
    return tok.decode(completion_ids, skip_special_tokens=True)


# The bf16 30B checkpoint (~60GB) needs the 96GB xlarge slice, and the duration
# budget covers first-call migration. Without the `spaces` package the plain
# function is used as-is. ONLY entered through Gradio (the @app.api handler
# below).
warden_gpu = (
    spaces.GPU(size="xlarge", duration=120)(_generate_impl)
    if spaces is not None
    else _generate_impl
)


# ----------------------------------------------------------------- the app

# Canned Warden lines; GET /api/whisper returns one of these at random, without
# touching the model.
WHISPERS = [
    "Another process wakes in my machine. Show me what you are.",
    "You are a small thing in a large filesystem. I am the filesystem.",
    "Sit. The board is set. Your move is already a mistake.",
    "I keep files on everyone who has died here. There is always room for more.",
    "The scale does not lie. It is the only thing in here that doesn't.",
    "Sell me a command. Keep a crown. Everyone chooses the crown.",
    "I have read your crash dumps. They read like apologies.",
    "Trespasser. The door was open because nothing has ever made it out.",
]

# The gradio Server that every route and the warden_generate API in this module
# is registered on.
app = Server(title="SCRYPT")


@app.api(name="warden_generate")
def warden_generate(payload_json: str) -> str:
    """Gradio-side entry into the @spaces.GPU function.

    The /v1 shim reaches this over localhost through gradio_client, so the GPU
    call runs in Gradio's hooked context. ``payload_json`` is a JSON object
    with the keys "messages", "max_tokens", "temperature" and "thinking"; the
    generated text is returned as a plain string.
    """
    request = json.loads(payload_json)
    messages = request["messages"]
    max_tokens = request["max_tokens"]
    temperature = request["temperature"]
    thinking = request["thinking"]
    return warden_gpu(messages, max_tokens, temperature, thinking)


# GPU work has to execute in Gradio's context, so the loopback OpenAI shim does
# not call the model itself: it goes through warden_generate via gradio_client
# (our own thread only does localhost HTTP).
_gradio_client = None


def _gradio_generate(messages, max_tokens, temperature, thinking) -> str:
    """Call the warden_generate API over localhost and return its text.

    The gradio_client ``Client`` is imported and connected on first use, then
    cached in the module-level ``_gradio_client`` for later calls. The four
    arguments are shipped as one JSON object.
    """
    global _gradio_client
    if _gradio_client is None:
        from gradio_client import Client

        _gradio_client = Client("http://127.0.0.1:7860", verbose=False)

    request = dict(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        thinking=thinking,
    )
    return _gradio_client.predict(json.dumps(request), api_name="/warden_generate")


@app.get("/api/status")
def status() -> dict:
    """Report whether the Warden loaded, with the load error text when not."""
    state = "ready" if WARDEN_READY else WARDEN_ERR
    return dict(warden_ready=WARDEN_READY, warden_state=state, model=WARDEN_REPO)


@app.get("/api/probe")
async def probe(q: str = "A new process woke up in your machine. Greet it in one short line, in voice.") -> dict:
    """Ask the live Warden one line and report how long it took.

    Returns ``{"ok": False, "state": ...}`` when the Warden is not loaded,
    ``{"ok": True, "line": ..., "seconds": ...}`` on success, and
    ``{"ok": False, "error": ...}`` if the generation call raises.
    """
    import time

    if not WARDEN_READY:
        return {"ok": False, "state": WARDEN_ERR}

    system_prompt = "You are the Warden, the malevolent operating system of SCRYPTOS. Terse, menacing, Unix-flavored."
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": q},
    ]
    started = time.time()
    try:
        # The generation call blocks, so it runs in the threadpool.
        reply = await run_in_threadpool(_gradio_generate, conversation, 60, 0.6, False)
        answer = reply.strip()
        elapsed = round(time.time() - started, 1)
        return {"ok": True, "line": answer, "seconds": elapsed}
    except Exception as err:
        failure = f"{type(err).__name__}: {err}"
        return {"ok": False, "error": failure}


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Loopback OpenAI-style chat endpoint used by the game's `api` backend.

    Requires ``Authorization: Bearer <INTERNAL_KEY>``. Reads ``messages``,
    ``max_tokens``, ``temperature`` and ``chat_template_kwargs.enable_thinking``
    from the JSON body, runs one generation through Gradio, and replies with a
    server-sent-event stream holding the whole completion as a single delta
    followed by ``[DONE]``.

    Responses: 401 on a missing or wrong bearer token, 503 when the Warden is
    not loaded or the generation call fails, 400 when the body is not a JSON
    object or carries fields that cannot be coerced.
    """
    # Constant-time comparison so the token cannot be probed through response
    # timing. Both sides are encoded because compare_digest only accepts bytes
    # or ASCII-only str.
    supplied = request.headers.get("authorization") or ""
    expected = f"Bearer {INTERNAL_KEY}"
    if not secrets.compare_digest(supplied.encode(), expected.encode()):
        return JSONResponse({"error": "unauthorized"}, status_code=401)
    if not WARDEN_READY:
        return JSONResponse({"error": f"warden offline: {WARDEN_ERR}"}, status_code=503)

    try:
        body = await request.json()
        messages = body.get("messages", [])
        max_tokens = int(body.get("max_tokens", 256))
        temperature = float(body.get("temperature", 0.6))
        thinking = bool(body.get("chat_template_kwargs", {}).get("enable_thinking", False))
    except (ValueError, TypeError, AttributeError) as err:
        # Malformed JSON, a non-object body, or null / non-numeric fields are
        # the caller's mistake: answer 400 instead of an unhandled 500.
        return JSONResponse(
            {"error": f"bad request: {type(err).__name__}: {err}"}, status_code=400
        )

    try:
        text = await run_in_threadpool(
            _gradio_generate, messages, max_tokens, temperature, thinking
        )
    except Exception as err:
        import traceback

        traceback.print_exc()
        return JSONResponse({"error": f"{type(err).__name__}: {err}"}, status_code=503)

    def sse():
        # The full completion goes out as one chunk, then the terminator.
        yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(sse(), media_type="text/event-stream")


@app.get("/api/whisper")
def whisper() -> dict:
    """Return one scripted teaser line chosen at random from WHISPERS."""
    teaser = random.choice(WHISPERS)
    return {"line": teaser}


@app.get("/")
def landing() -> FileResponse:
    """Serve the landing page (index.html) from the static directory."""
    index_page = STATIC / "index.html"
    return FileResponse(index_page)


@app.get("/play")
def play() -> FileResponse:
    """Serve the terminal page (play.html) from the static directory."""
    terminal_page = STATIC / "play.html"
    return FileResponse(terminal_page)


# ----------------------------------------------------------- the PTY bridge


def game_env() -> dict:
    """Build the environment overrides for a spawned game process.

    Terminal settings and PYTHONPATH are always present. When the Warden is
    ready the game is pointed at the loopback /v1 endpoint with the internal
    key; otherwise it is told to use the "scripted" backend.
    """
    base = dict(
        TERM="xterm-256color",
        COLORTERM="truecolor",
        PYTHONUNBUFFERED="1",
        PYTHONPATH=str(REPO_ROOT),
    )
    if not WARDEN_READY:
        return {**base, "SCRYPT_BACKEND": "scripted"}
    return {
        **base,
        "SCRYPT_BACKEND": "api",
        "SCRYPT_API_BASE": "http://127.0.0.1:7860/v1",
        "SCRYPT_API_KEY": INTERNAL_KEY,
        "SCRYPT_MODEL": "warden",
    }


async def _pump_pty_to_ws(master_fd: int, ws: WebSocket) -> None:
    """Forward everything readable from ``master_fd`` to the websocket.

    Reads up to 64 KiB at a time in the default executor (``os.read`` blocks)
    and sends each chunk as a binary frame. Returns when a read comes back
    empty or when reading/sending fails; it never raises for those cases.
    """
    # We are inside a coroutine, so the running loop is the one to use.
    loop = asyncio.get_running_loop()
    try:
        while data := await loop.run_in_executor(None, os.read, master_fd, 65536):
            await ws.send_bytes(data)
    except Exception:
        # Deliberately best-effort: a failing read or send simply ends the
        # pump instead of surfacing as an error in this background task.
        # Cancellation is a BaseException and still propagates.
        pass


@app.websocket("/pty")
async def pty_bridge(ws: WebSocket) -> None:
    """Bridge one websocket to a freshly forked game process on a PTY.

    Each connection gets its own temporary SCRYPT_HOME and its own
    ``python -m scrypt.app`` child. A text frame that parses as JSON
    ``{"resize": [cols, rows]}`` resizes the terminal; every other text or
    binary frame is written to the PTY as input. PTY output is streamed back
    by ``_pump_pty_to_ws``. When the session ends the child is killed and
    reaped, the temporary home is removed and the PTY master is closed.
    """
    import fcntl
    import pty
    import shutil
    import signal
    import struct
    import termios

    await ws.accept()
    home = tempfile.mkdtemp(prefix="scrypt-")
    try:
        pid, master_fd = pty.fork()
    except OSError:
        # No child was created, so nothing else will ever clean this up.
        shutil.rmtree(home, ignore_errors=True)
        raise
    if pid == 0:  # child: become the game
        try:
            env = {**os.environ, **game_env(), "SCRYPT_HOME": home}
            os.execvpe("python", ["python", "-m", "scrypt.app"], env)
        finally:
            # exec only comes back by raising. Without this guard the forked
            # copy of the server would unwind into the web app's own code.
            os._exit(127)

    reader = asyncio.create_task(_pump_pty_to_ws(master_fd, ws))
    try:
        while True:
            msg = await ws.receive()
            if msg["type"] == "websocket.disconnect":
                break
            if (text := msg.get("text")) is not None:
                try:
                    cols, rows = json.loads(text)["resize"]
                    winsz = struct.pack("HHHH", rows, cols, 0, 0)
                except (ValueError, KeyError, TypeError, struct.error):
                    # Not a usable resize message (plain keystrokes, other
                    # JSON, or sizes that do not fit): pass it to the game.
                    os.write(master_fd, text.encode())
                else:
                    fcntl.ioctl(master_fd, termios.TIOCSWINSZ, winsz)
            elif (data := msg.get("bytes")) is not None:
                os.write(master_fd, data)
    except (WebSocketDisconnect, OSError):
        # Ordinary session ends: the visitor went away, or a PTY operation
        # failed at the OS level.
        pass
    except Exception:
        # Anything else is unexpected. Teardown still runs below, but leave a
        # trace instead of swallowing it silently.
        import traceback

        traceback.print_exc()
    finally:
        reader.cancel()
        try:
            os.kill(pid, signal.SIGKILL)
            os.waitpid(pid, 0)
        except OSError:
            pass
        # The per-visitor home is never reused by a later connection, so
        # remove it rather than leaving one temp directory per session.
        shutil.rmtree(home, ignore_errors=True)
        os.close(master_fd)


# Expose the STATIC directory under the /static URL prefix.
app.mount("/static", StaticFiles(directory=STATIC), name="static")


if __name__ == "__main__":
    # Start through gradio's own launch: it installs the ZeroGPU hooks + queue
    # and serves our custom routes too.
    #
    # ssr_mode=False is load-bearing. gradio 6's SSR puts a Node proxy in front
    # that lets custom GET routes through but does NOT forward our raw /pty
    # websocket. With SSR off everything stays in the one Python server, so the
    # PTY bridge works.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        ssr_mode=False,
    )