Scrypt / scrypt /inference /local.py
IMJONEZZ's picture
game: totem ritual now downloads the finetuned Warden GGUF
713805c
Raw
History Blame Contribute Delete
8.83 kB
"""Local inference: manage a llama-server subprocess running Nemotron.
We bundle nothing. On first run the player is walked through fetching the
llama.cpp release binary and the GGUF (the carving-the-totem ritual lives
in the UI layer; this module only knows paths, processes, and health).
"""
from __future__ import annotations
import os
import shutil
import subprocess
import time
import urllib.request
from pathlib import Path
from .api import OpenAIChatBackend
# Root directory for everything this module reads or writes (binary, models,
# server log). Overridable through the SCRYPT_HOME environment variable.
SCRYPT_HOME = Path(os.environ.get("SCRYPT_HOME", "~/.scrypt")).expanduser()

# Base URL the GGUF files are downloaded from; a file name is appended to it.
HF_REPO = "https://huggingface.co/IMJONEZZ/warden-nemotron-3-nano-30b/resolve/main"

# Smallest total RAM (GB) quoted in user-facing messages as the local-mode
# floor; it matches the min-RAM value of the lowest ladder rung below.
MIN_RAM_GB = 32

# Default TCP port the local server is asked to listen on.
PORT = 8731

# Quant ladder: pick the heaviest GGUF that leaves the OS and the game
# breathing room on this machine's TOTAL memory.
# Each rung is (quant name, file size GB, min RAM GB), ordered heaviest
# first because choose_quant() returns the first rung whose min RAM fits.
# Below 32GB total we refuse local mode and point at the API.
# NOTE(review): Q8_0 and Q6_K both list 34GB as file size — confirm the
# Q6_K figure against the published files.
QUANT_LADDER = [
    ("Q8_0", 34, 96),
    ("Q6_K", 34, 80),
    ("Q5_K_M", 27, 64),
    ("Q4_K_S", 22, 40),
    ("Q3_K_S", 19, 32),
]
def choose_quant(ram_gb: float | None = None) -> str | None:
    """Pick the quantization tier to run on this machine.

    A non-empty SCRYPT_QUANT environment variable always wins and is
    returned verbatim. Otherwise the first QUANT_LADDER rung whose RAM
    floor fits is chosen. A RAM reading of exactly 0 (what system_ram_gb()
    reports when it cannot determine the total, e.g. off Linux) falls back
    to Q4_K_S — the player can always override.

    Args:
        ram_gb: Total memory in GB; measured with system_ram_gb() when None.

    Returns:
        The quant name, or None when the machine is below every rung.
    """
    override = os.environ.get("SCRYPT_QUANT")
    if override:
        return override

    total_gb = ram_gb if ram_gb is not None else system_ram_gb()
    if total_gb == 0:
        return "Q4_K_S"

    # The ladder is ordered heaviest first, so the first fit is the best fit.
    fitting = (
        name for name, _file_gb, floor_gb in QUANT_LADDER if total_gb >= floor_gb
    )
    return next(fitting, None)
def model_file(quant: str) -> str:
    """Return the GGUF file name used for the given quant tier."""
    return "warden-nemotron-3-nano-30b-{}.gguf".format(quant)
def model_url(quant: str) -> str:
    """Return the download URL for the given quant tier's GGUF file."""
    filename = model_file(quant)
    return "/".join((HF_REPO, filename))
class LocalSetupError(Exception):
    """Raised when local inference cannot be set up or started."""
def system_ram_gb() -> float:
    """Return total memory from /proc/meminfo, or 0.0 when it is unknown.

    The MemTotal figure is divided by 1024 twice to express it in GB.
    0.0 is returned when the file cannot be opened or read (OSError) or
    contains no MemTotal line.
    """
    try:
        with open("/proc/meminfo") as meminfo:
            mem_total = next(
                (row for row in meminfo if row.startswith("MemTotal")), None
            )
            if mem_total is not None:
                amount = int(mem_total.split()[1])
                return amount / 1024 / 1024
    except OSError:
        pass
    return 0.0
def find_binary() -> Path | None:
    """Locate a llama-server executable, or return None if there is none.

    Our curated binary under SCRYPT_HOME/bin is checked first: it's the one
    we built/verified for this machine (e.g. the CUDA build), so it outranks
    whatever is on PATH.
    """
    curated = SCRYPT_HOME / "bin" / "llama-server"
    if curated.exists():
        return curated

    system_wide = shutil.which("llama-server")
    if system_wide:
        return Path(system_wide)
    return None
def llama_cpp_available() -> bool:
    """Report whether the llama_cpp package can be imported.

    Bundled runtime: pip install 'scrypt[local]' brings llama-cpp-python.
    Only the import spec is looked up; the package itself is not imported.
    """
    from importlib.util import find_spec

    spec = find_spec("llama_cpp")
    return spec is not None
def server_command(model: Path, port: int, ctx_size: int) -> list[str] | None:
    """Build the argv that launches an OpenAI-compatible server here.

    Prefers a native llama-server binary (faster, full --jinja template
    support); falls back to the bundled llama-cpp-python server so players
    never need anything on PATH.

    Args:
        model: Path of the GGUF file to serve.
        port: TCP port the server should listen on.
        ctx_size: Context window size passed to the server.

    Returns:
        The command as a list of strings, or None when neither runtime
        is available.
    """
    native = find_binary()
    if native is not None:
        argv = [str(native)]
        argv += ["--model", str(model)]
        argv += ["--port", str(port)]
        argv += ["--ctx-size", str(ctx_size)]
        # --jinja is required for Nemotron chat template kwargs.
        argv += ["--jinja", "--no-webui"]
        return argv

    if not llama_cpp_available():
        return None

    import sys

    # llama_cpp.server ignores chat_template_kwargs; enable_thinking
    # stays off by default in the GGUF template, which is what we want.
    return [
        sys.executable,
        "-m",
        "llama_cpp.server",
        "--model",
        str(model),
        "--port",
        str(port),
        "--n_ctx",
        str(ctx_size),
    ]
def model_path() -> Path:
    """Return where this machine's tier of model is expected on disk.

    When choose_quant() yields no tier, the Q3_K_S file name is used.
    """
    tier = choose_quant() or "Q3_K_S"
    return SCRYPT_HOME / "models" / model_file(tier)
def installed_model() -> Path | None:
    """Find any already-downloaded model under SCRYPT_HOME/models.

    A player who manually fetched a different tier — or named the file
    whatever they liked — still gets local mode. Ladder file names win, in
    ladder order; otherwise the largest *.gguf present is returned.

    Returns:
        Path of the chosen model file, or None when the models directory
        is missing or holds no .gguf file.
    """
    models_dir = SCRYPT_HOME / "models"
    if not models_dir.is_dir():
        return None

    for tier, _file_gb, _floor_gb in QUANT_LADDER:
        ladder_file = models_dir / model_file(tier)
        if ladder_file.exists():
            return ladder_file

    # No ladder-named file: fall back to the biggest GGUF lying around.
    return max(
        models_dir.glob("*.gguf"),
        key=lambda gguf: gguf.stat().st_size,
        default=None,
    )
def preflight() -> list[str]:
    """Collect human-readable problems blocking local inference.

    Checks, in order: that a quant tier is available for this machine, that
    some llama runtime exists (native binary or llama-cpp-python), and that
    a model file is already on disk.

    Returns:
        One message per problem found. Empty = ready.
    """
    problems: list[str] = []
    quant = choose_quant()
    if quant is None:
        problems.append(
            f"this machine has {system_ram_gb():.0f}GB RAM; local play needs "
            f"{MIN_RAM_GB}GB. use API mode instead (set SCRYPT_API_KEY)."
        )
    if find_binary() is None and not llama_cpp_available():
        problems.append(
            "no llama runtime. easiest: pip install 'scrypt[local]' (bundles "
            "llama-cpp-python). or install llama.cpp's llama-server "
            "(https://github.com/ggml-org/llama.cpp/releases) on PATH or at "
            f"{SCRYPT_HOME / 'bin' / 'llama-server'}."
        )
    if quant is not None and installed_model() is None:
        # choose_quant() returns SCRYPT_QUANT verbatim, so the tier may not be
        # on the ladder; without a default, next() would raise StopIteration
        # here instead of reporting the missing model.
        size = next((s for q, s, _ in QUANT_LADDER if q == quant), None)
        tier = quant if size is None else f"{quant} (~{size}GB)"
        problems.append(
            f"model not found. this machine's tier is {tier}:\n"
            f" {model_url(quant)}\n -> {SCRYPT_HOME / 'models' / model_file(quant)}"
        )
    return problems
def download_model(progress=None) -> Path:
    """Fetch this machine's tier of GGUF into SCRYPT_HOME/models.

    The file is downloaded to a sibling ".part" path and only moved into
    place once the transfer has finished.

    Args:
        progress: Optional callable invoked with the completed fraction
            (capped at 1.0). It is only called when the reported total
            size is positive.

    Returns:
        Path of the downloaded model file.

    Raises:
        LocalSetupError: If choose_quant() finds no tier for this machine.
        BaseException: Anything raised during the download is re-raised
            after the partial file has been removed.
    """
    quant = choose_quant()
    if quant is None:
        raise LocalSetupError(f"below the {MIN_RAM_GB}GB floor; use API mode")
    dest = SCRYPT_HOME / "models" / model_file(quant)
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(".part")

    def hook(blocks, block_size, total):
        if progress and total > 0:
            progress(min(1.0, blocks * block_size / total))

    try:
        urllib.request.urlretrieve(model_url(quant), tmp, reporthook=hook)
    except BaseException:
        # Deliberately broad so Ctrl-C also cleans up.
        tmp.unlink(missing_ok=True)  # a stale 20GB .part helps nobody
        raise
    # replace(), not rename(): rename() fails on Windows when dest already
    # exists, which would discard a finished download on a re-fetch.
    tmp.replace(dest)
    return dest
class LlamaServer:
    """Owns the llama-server subprocess for one game session."""

    def __init__(self, port: int = PORT):
        """Remember the port; no process is started until start()."""
        self.port = port
        # Handle of the launched server; None before start() and after stop().
        self.proc: subprocess.Popen | None = None

    @property
    def base_url(self) -> str:
        """Base URL of the server's /v1 API on the loopback interface."""
        return f"http://127.0.0.1:{self.port}/v1"

    def start(self, *, ctx_size: int = 8192, wait_s: float = 300.0) -> None:
        """Launch the server and block until it reports healthy.

        Args:
            ctx_size: Context window size passed to the server.
            wait_s: Maximum number of seconds to wait for a healthy probe.

        Raises:
            LocalSetupError: If preflight() reports problems, no launch
                command can be built, the process exits during startup,
                or it does not become healthy within wait_s.
        """
        problems = preflight()
        if problems:
            raise LocalSetupError("\n".join(problems))
        model = installed_model()
        cmd = None if model is None else server_command(model, self.port, ctx_size)
        if cmd is None:
            # preflight() just found both a model and a runtime, so this only
            # happens if one vanished in between. Raised explicitly because
            # an assert would be stripped under -O.
            raise LocalSetupError(
                "llama runtime or model disappeared after preflight"
            )
        log_path = SCRYPT_HOME / "llama-server.log"
        log_path.parent.mkdir(parents=True, exist_ok=True)
        # The child keeps its own handle to the log; ours can close right
        # after the spawn.
        with log_path.open("wb") as log:
            self.proc = subprocess.Popen(cmd, stdout=log, stderr=log)
        deadline = time.monotonic() + wait_s
        while time.monotonic() < deadline:
            if self.proc.poll() is not None:
                raise LocalSetupError(
                    "llama-server exited during startup: "
                    + self._log_tail(log_path)
                    + f"\n(full log: {log_path})"
                )
            if self._healthy():
                return
            time.sleep(0.5)
        self.stop()
        raise LocalSetupError("llama-server did not become healthy in time")

    @staticmethod
    def _log_tail(log_path: Path, lines: int = 3) -> str:
        """Return the last few log lines, preferring ones mentioning "error".

        Falls back to "(no log)" when the file cannot be read.
        """
        try:
            tail = log_path.read_text(errors="replace").strip().splitlines()
            interesting = [row for row in tail if "error" in row.lower()] or tail
            return " | ".join(interesting[-lines:])
        except OSError:
            return "(no log)"

    def _healthy(self) -> bool:
        """Return True as soon as either probe endpoint answers HTTP 200."""
        # llama-server answers /health; llama_cpp.server answers /v1/models.
        for probe in ("health", "v1/models"):
            try:
                with urllib.request.urlopen(
                    f"http://127.0.0.1:{self.port}/{probe}", timeout=1
                ) as r:
                    if r.status == 200:
                        return True
            except OSError:
                # Not listening yet, or a non-2xx answer: try the next probe.
                continue
        return False

    def backend(self, **kw) -> OpenAIChatBackend:
        """Create a chat backend pointed at this server's base URL."""
        return OpenAIChatBackend(self.base_url, model="local", **kw)

    def stop(self) -> None:
        """Terminate the server if it is running and forget the handle.

        Sends terminate first and escalates to kill after 5 seconds.
        """
        if self.proc and self.proc.poll() is None:
            self.proc.terminate()
            try:
                self.proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self.proc.kill()
                # Reap the killed child so it does not linger as a zombie.
                self.proc.wait()
        self.proc = None