Spaces:

build-small-hackathon
/

Scrypt

Running on Zero

App Files Files Community

Scrypt / scrypt /inference /local.py

IMJONEZZ

game: totem ritual now downloads the finetuned Warden GGUF

713805c 17 days ago

Raw

History Blame Contribute Delete

8.83 kB

	"""Local inference: manage a llama-server subprocess running Nemotron.

	We bundle nothing. On first run the player is walked through fetching the
	llama.cpp release binary and the GGUF (the carving-the-totem ritual lives
	in the UI layer; this module only knows paths, processes, and health).
	"""

	from __future__ import annotations

	import os
	import shutil
	import subprocess
	import time
	import urllib.request
	from pathlib import Path

	from .api import OpenAIChatBackend

	# Root directory for everything this module touches on disk (bin/, models/,
	# llama-server.log). Overridable through the SCRYPT_HOME environment variable.
	SCRYPT_HOME = Path(os.environ.get("SCRYPT_HOME", "~/.scrypt")).expanduser()
	# Base URL that a GGUF file name is appended to when building a download URL.
	HF_REPO = "https://huggingface.co/IMJONEZZ/warden-nemotron-3-nano-30b/resolve/main"
	# Total-RAM floor quoted in the "use API mode" messages; equals the min RAM
	# of the lowest rung of QUANT_LADDER.
	MIN_RAM_GB = 32
	# Default TCP port for the local server.
	PORT = 8731

	# Quant ladder: pick the heaviest GGUF that leaves the OS and the game
	# breathing room on this machine's TOTAL memory.
	# Each rung is (quant name, file size GB, min RAM GB), listed heaviest first
	# so the first rung whose min RAM fits is the one chosen.
	# Below 32GB total we refuse local mode and point at the API.
	QUANT_LADDER = [
	("Q8_0", 34, 96),
	("Q6_K", 34, 80),
	("Q5_K_M", 27, 64),
	("Q4_K_S", 22, 40),
	("Q3_K_S", 19, 32),
	]


	def choose_quant(ram_gb: float | None = None) -> str | None:
	    """Best quant for this machine, or None if below the floor.

	    Honors SCRYPT_QUANT: a non-empty value is returned as-is, without being
	    checked against the ladder. Unknown RAM (reported as 0, e.g. on
	    non-Linux) defaults to Q4_K_S — the player can always override.

	    Args:
	        ram_gb: Total RAM in GB to decide on. ``None`` means detect it with
	            ``system_ram_gb()``.

	    Returns:
	        The quant name, or ``None`` when the RAM is known and smaller than
	        the minimum of every rung in ``QUANT_LADDER``.
	    """
	    forced = os.environ.get("SCRYPT_QUANT")
	    if forced:
	        return forced
	    ram = system_ram_gb() if ram_gb is None else ram_gb
	    if ram == 0:
	        return "Q4_K_S"
	    # The ladder is ordered heaviest-first, so the first fit is the best fit.
	    return next(
	        (quant for quant, _size, min_ram in QUANT_LADDER if ram >= min_ram),
	        None,
	    )


	def model_file(quant: str) -> str:
	    """Return the GGUF file name used for the given quant tier."""
	    stem = f"warden-nemotron-3-nano-30b-{quant}"
	    return f"{stem}.gguf"


	def model_url(quant: str) -> str:
	    """Return the download URL for the given quant: HF_REPO plus the file name."""
	    filename = model_file(quant)
	    return f"{HF_REPO}/{filename}"


	class LocalSetupError(Exception):
	    """Raised when local inference cannot be set up or started."""


	def system_ram_gb() -> float:
	    """Return total RAM as read from /proc/meminfo, or 0.0 if unavailable.

	    The MemTotal figure is divided by 1024 twice. 0.0 is returned when the
	    file cannot be opened or read, or when it has no MemTotal line.
	    """
	    try:
	        with open("/proc/meminfo") as meminfo:
	            total_line = next(
	                (entry for entry in meminfo if entry.startswith("MemTotal")),
	                None,
	            )
	            if total_line is not None:
	                # Second whitespace-separated field is the numeric value.
	                return int(total_line.split()[1]) / 1024 / 1024
	    except OSError:
	        pass
	    return 0.0


	def find_binary() -> Path | None:
	    """Locate a native ``llama-server`` executable, if there is one.

	    Returns:
	        ``SCRYPT_HOME/bin/llama-server`` when that file exists, otherwise
	        the ``llama-server`` found on PATH, otherwise ``None``.
	    """
	    # Our curated binary first: it's the one we built/verified for this
	    # machine (e.g. the CUDA build), so it outranks whatever is on PATH.
	    local = SCRYPT_HOME / "bin" / "llama-server"
	    # is_file() rather than exists(): a directory of that name cannot be
	    # launched, so it must not shadow a working binary on PATH.
	    if local.is_file():
	        return local
	    on_path = shutil.which("llama-server")
	    return Path(on_path) if on_path else None


	def llama_cpp_available() -> bool:
	    """Report whether the ``llama_cpp`` package is importable.

	    Bundled runtime: pip install 'scrypt[local]' brings llama-cpp-python.
	    """
	    from importlib.util import find_spec

	    spec = find_spec("llama_cpp")
	    return spec is not None


	def server_command(model: Path, port: int, ctx_size: int) -> list[str] | None:
	    """How to launch an OpenAI-compatible server on this machine.

	    Prefers a native llama-server binary (faster, full --jinja template
	    support); falls back to the bundled llama-cpp-python server so players
	    never need anything on PATH.

	    Args:
	        model: Path of the GGUF file to serve.
	        port: TCP port the server should listen on.
	        ctx_size: Context window size passed to the server.

	    Returns:
	        The argv list to hand to ``subprocess``, or ``None`` when neither a
	        llama-server binary nor the ``llama_cpp`` package is available.
	    """
	    binary = find_binary()
	    if binary is not None:
	        return [
	            str(binary), "--model", str(model), "--port", str(port),
	            "--ctx-size", str(ctx_size),
	            "--jinja",  # required for Nemotron chat template kwargs
	            "--no-webui",
	        ]
	    if llama_cpp_available():
	        import sys

	        # llama_cpp.server ignores chat_template_kwargs; enable_thinking
	        # stays off by default in the GGUF template, which is what we want.
	        return [
	            sys.executable, "-m", "llama_cpp.server", "--model", str(model),
	            "--port", str(port), "--n_ctx", str(ctx_size),
	        ]
	    return None


	def model_path() -> Path:
	    """Return where this machine's GGUF lives under SCRYPT_HOME/models.

	    Uses the quant from ``choose_quant()``; when that yields nothing the
	    Q3_K_S file name is used instead.
	    """
	    quant = choose_quant()
	    if not quant:
	        quant = "Q3_K_S"
	    return SCRYPT_HOME / "models" / model_file(quant)


	def installed_model() -> Path | None:
	    """Any already-downloaded model, so a player who manually fetched a
	    different tier — or named the file whatever they liked — still gets
	    local mode. Ladder names win; otherwise the largest .gguf present.

	    Returns:
	        The first ladder-named file found in ``SCRYPT_HOME/models`` (in
	        ladder order), else the largest ``*.gguf`` there, else ``None``
	        (also when the models directory does not exist).
	    """
	    models_dir = SCRYPT_HOME / "models"
	    if not models_dir.is_dir():
	        return None
	    for quant, _size, _ram in QUANT_LADDER:
	        candidate = models_dir / model_file(quant)
	        if candidate.exists():
	            return candidate
	    # max() with a default picks the biggest file without sorting the whole
	    # listing, and yields None for an empty directory.
	    return max(
	        models_dir.glob("*.gguf"),
	        key=lambda p: p.stat().st_size,
	        default=None,
	    )


	def preflight() -> list[str]:
	    """Human-readable problems blocking local inference. Empty = ready.

	    Checks, in order: that a quant tier can be chosen for this machine,
	    that a llama runtime exists (native binary or the ``llama_cpp``
	    package), and that some model file is already installed.

	    Returns:
	        One message per problem found; an empty list when nothing blocks
	        local inference.
	    """
	    problems = []
	    quant = choose_quant()
	    if quant is None:
	        problems.append(
	            f"this machine has {system_ram_gb():.0f}GB RAM; local play needs "
	            f"{MIN_RAM_GB}GB. use API mode instead (set SCRYPT_API_KEY)."
	        )
	    if find_binary() is None and not llama_cpp_available():
	        problems.append(
	            "no llama runtime. easiest: pip install 'scrypt[local]' (bundles "
	            "llama-cpp-python). or install llama.cpp's llama-server "
	            f"(https://github.com/ggml-org/llama.cpp/releases) on PATH or at "
	            f"{SCRYPT_HOME / 'bin' / 'llama-server'}."
	        )
	    if installed_model() is None and quant is not None:
	        # SCRYPT_QUANT can force a quant that is not on the ladder; in that
	        # case there is no known size, so leave the size hint out instead of
	        # letting next() raise StopIteration.
	        size = next((s for q, s, _ in QUANT_LADDER if q == quant), None)
	        size_note = f" (~{size}GB)" if size is not None else ""
	        problems.append(
	            f"model not found. this machine's tier is {quant}{size_note}:\n"
	            f" {model_url(quant)}\n -> {SCRYPT_HOME / 'models' / model_file(quant)}"
	        )
	    return problems


	def download_model(progress=None) -> Path:
	    """Fetch this machine's tier of GGUF with a progress callback(fraction).

	    The file is downloaded to a ``.part`` sibling and moved into place only
	    after the transfer finished, so a partial download never looks like a
	    usable model.

	    Args:
	        progress: Optional callable receiving the completed fraction
	            (capped at 1.0). It is only called when the total size is known.

	    Returns:
	        Path of the downloaded model under ``SCRYPT_HOME/models``.

	    Raises:
	        LocalSetupError: No quant tier can be chosen for this machine.
	    """
	    quant = choose_quant()
	    if quant is None:
	        raise LocalSetupError(f"below the {MIN_RAM_GB}GB floor; use API mode")
	    dest = SCRYPT_HOME / "models" / model_file(quant)
	    dest.parent.mkdir(parents=True, exist_ok=True)
	    tmp = dest.with_suffix(".part")

	    def hook(blocks, block_size, total):
	        if progress and total > 0:
	            progress(min(1.0, blocks * block_size / total))

	    try:
	        urllib.request.urlretrieve(model_url(quant), tmp, reporthook=hook)
	    except BaseException:
	        tmp.unlink(missing_ok=True)  # a stale 20GB .part helps nobody
	        raise
	    # replace(), not rename(): rename fails on Windows when dest already
	    # exists (e.g. re-downloading over an old copy).
	    tmp.replace(dest)
	    return dest


	class LlamaServer:
	    """Owns the llama-server subprocess for one game session."""

	    def __init__(self, port: int = PORT):
	        """Remember the port to serve on; nothing is launched yet."""
	        self.port = port
	        # Handle of the launched server; None until start() and after stop().
	        self.proc: subprocess.Popen | None = None

	    @property
	    def base_url(self) -> str:
	        """Base URL of the server's /v1 API on the loopback interface."""
	        return f"http://127.0.0.1:{self.port}/v1"

	    def start(self, *, ctx_size: int = 8192, wait_s: float = 300.0) -> None:
	        """Launch the server and block until it answers a health probe.

	        Args:
	            ctx_size: Context window size passed to the server.
	            wait_s: Maximum number of seconds to wait for a healthy server.

	        Raises:
	            LocalSetupError: preflight() reported problems, no runtime
	                command could be built, the process exited during startup,
	                or it did not become healthy within ``wait_s``.
	        """
	        problems = preflight()
	        if problems:
	            raise LocalSetupError("\n".join(problems))
	        cmd = server_command(installed_model(), self.port, ctx_size)
	        if cmd is None:
	            # preflight checked for a runtime, but an explicit raise (unlike
	            # an assert) still guards this when Python runs with -O.
	            raise LocalSetupError("no llama runtime available")
	        log_path = SCRYPT_HOME / "llama-server.log"
	        log_path.parent.mkdir(parents=True, exist_ok=True)
	        # The child inherits the log handle, so ours can close right away.
	        with log_path.open("wb") as log:
	            self.proc = subprocess.Popen(cmd, stdout=log, stderr=log)
	        deadline = time.monotonic() + wait_s
	        while time.monotonic() < deadline:
	            if self.proc.poll() is not None:
	                raise LocalSetupError(
	                    "llama-server exited during startup: "
	                    + self._log_tail(log_path)
	                    + f"\n(full log: {log_path})"
	                )
	            if self._healthy():
	                return
	            time.sleep(0.5)
	        self.stop()
	        raise LocalSetupError("llama-server did not become healthy in time")

	    @staticmethod
	    def _log_tail(log_path: Path, lines: int = 3) -> str:
	        """Return the last few log lines, preferring ones mentioning "error".

	        Lines are joined with " | ". Returns "(no log)" if the file cannot
	        be read.
	        """
	        try:
	            tail = log_path.read_text(errors="replace").strip().splitlines()
	            interesting = [ln for ln in tail if "error" in ln.lower()] or tail
	            return " | ".join(interesting[-lines:])
	        except OSError:
	            return "(no log)"

	    def _healthy(self) -> bool:
	        """Return True once any known health endpoint answers with HTTP 200."""
	        # llama-server answers /health; llama_cpp.server answers /v1/models.
	        for probe in ("health", "v1/models"):
	            try:
	                with urllib.request.urlopen(
	                    f"http://127.0.0.1:{self.port}/{probe}", timeout=1
	                ) as r:
	                    if r.status == 200:
	                        return True
	            except OSError:
	                # Not listening yet, or an HTTP error status: try the next probe.
	                continue
	        return False

	    def backend(self, **kw) -> OpenAIChatBackend:
	        """Build a chat backend pointed at this server; ``kw`` is passed through."""
	        return OpenAIChatBackend(self.base_url, model="local", **kw)

	    def stop(self) -> None:
	        """Terminate the server if it is running and forget the handle.

	        Sends terminate, waits up to 5 seconds, then kills. Safe to call
	        when nothing was started.
	        """
	        if self.proc and self.proc.poll() is None:
	            self.proc.terminate()
	            try:
	                self.proc.wait(timeout=5)
	            except subprocess.TimeoutExpired:
	                self.proc.kill()
	                # Reap the killed child so it does not linger as a zombie.
	                self.proc.wait()
	        self.proc = None