"""Local inference: manage a llama-server subprocess running Nemotron. We bundle nothing. On first run the player is walked through fetching the llama.cpp release binary and the GGUF (the carving-the-totem ritual lives in the UI layer; this module only knows paths, processes, and health). """ from __future__ import annotations import os import shutil import subprocess import time import urllib.request from pathlib import Path from .api import OpenAIChatBackend SCRYPT_HOME = Path(os.environ.get("SCRYPT_HOME", "~/.scrypt")).expanduser() HF_REPO = "https://huggingface.co/IMJONEZZ/warden-nemotron-3-nano-30b/resolve/main" MIN_RAM_GB = 32 PORT = 8731 # Quant ladder: pick the heaviest GGUF that leaves the OS and the game # breathing room on this machine's TOTAL memory. (file size GB, min RAM GB) # Below 32GB total we refuse local mode and point at the API. QUANT_LADDER = [ ("Q8_0", 34, 96), ("Q6_K", 34, 80), ("Q5_K_M", 27, 64), ("Q4_K_S", 22, 40), ("Q3_K_S", 19, 32), ] def choose_quant(ram_gb: float | None = None) -> str | None: """Best quant for this machine, or None if below the floor. Honors SCRYPT_QUANT. Unknown RAM (non-Linux) defaults to Q4_K_S — the player can always override. """ forced = os.environ.get("SCRYPT_QUANT") if forced: return forced ram = system_ram_gb() if ram_gb is None else ram_gb if ram == 0: return "Q4_K_S" for quant, _size, min_ram in QUANT_LADDER: if ram >= min_ram: return quant return None def model_file(quant: str) -> str: return f"warden-nemotron-3-nano-30b-{quant}.gguf" def model_url(quant: str) -> str: return f"{HF_REPO}/{model_file(quant)}" class LocalSetupError(Exception): pass def system_ram_gb() -> float: try: with open("/proc/meminfo") as f: for line in f: if line.startswith("MemTotal"): return int(line.split()[1]) / 1024 / 1024 except OSError: pass return 0.0 def find_binary() -> Path | None: # Our curated binary first: it's the one we built/verified for this # machine (e.g. the CUDA build), so it outranks whatever is on PATH. local = SCRYPT_HOME / "bin" / "llama-server" if local.exists(): return local on_path = shutil.which("llama-server") return Path(on_path) if on_path else None def llama_cpp_available() -> bool: """Bundled runtime: pip install 'scrypt[local]' brings llama-cpp-python.""" import importlib.util return importlib.util.find_spec("llama_cpp") is not None def server_command(model: Path, port: int, ctx_size: int) -> list[str] | None: """How to launch an OpenAI-compatible server on this machine. Prefers a native llama-server binary (faster, full --jinja template support); falls back to the bundled llama-cpp-python server so players never need anything on PATH. """ binary = find_binary() if binary is not None: return [ str(binary), "--model", str(model), "--port", str(port), "--ctx-size", str(ctx_size), "--jinja", # required for Nemotron chat template kwargs "--no-webui", ] if llama_cpp_available(): import sys # llama_cpp.server ignores chat_template_kwargs; enable_thinking # stays off by default in the GGUF template, which is what we want. return [ sys.executable, "-m", "llama_cpp.server", "--model", str(model), "--port", str(port), "--n_ctx", str(ctx_size), ] return None def model_path() -> Path: quant = choose_quant() name = model_file(quant) if quant else model_file("Q3_K_S") return SCRYPT_HOME / "models" / name def installed_model() -> Path | None: """Any already-downloaded model, so a player who manually fetched a different tier — or named the file whatever they liked — still gets local mode. Ladder names win; otherwise the largest .gguf present.""" models_dir = SCRYPT_HOME / "models" if not models_dir.is_dir(): return None for quant, _size, _ram in QUANT_LADDER: candidate = models_dir / model_file(quant) if candidate.exists(): return candidate ggufs = sorted( models_dir.glob("*.gguf"), key=lambda p: p.stat().st_size, reverse=True ) return ggufs[0] if ggufs else None def preflight() -> list[str]: """Human-readable problems blocking local inference. Empty = ready.""" problems = [] quant = choose_quant() if quant is None: problems.append( f"this machine has {system_ram_gb():.0f}GB RAM; local play needs " f"{MIN_RAM_GB}GB. use API mode instead (set SCRYPT_API_KEY)." ) if find_binary() is None and not llama_cpp_available(): problems.append( "no llama runtime. easiest: pip install 'scrypt[local]' (bundles " "llama-cpp-python). or install llama.cpp's llama-server " f"(https://github.com/ggml-org/llama.cpp/releases) on PATH or at " f"{SCRYPT_HOME / 'bin' / 'llama-server'}." ) if installed_model() is None and quant is not None: size = next(s for q, s, _ in QUANT_LADDER if q == quant) problems.append( f"model not found. this machine's tier is {quant} (~{size}GB):\n" f" {model_url(quant)}\n -> {SCRYPT_HOME / 'models' / model_file(quant)}" ) return problems def download_model(progress=None) -> Path: """Fetch this machine's tier of GGUF with a progress callback(fraction).""" quant = choose_quant() if quant is None: raise LocalSetupError(f"below the {MIN_RAM_GB}GB floor; use API mode") dest = SCRYPT_HOME / "models" / model_file(quant) dest.parent.mkdir(parents=True, exist_ok=True) tmp = dest.with_suffix(".part") def hook(blocks, block_size, total): if progress and total > 0: progress(min(1.0, blocks * block_size / total)) try: urllib.request.urlretrieve(model_url(quant), tmp, reporthook=hook) except BaseException: tmp.unlink(missing_ok=True) # a stale 20GB .part helps nobody raise tmp.rename(dest) return dest class LlamaServer: """Owns the llama-server subprocess for one game session.""" def __init__(self, port: int = PORT): self.port = port self.proc: subprocess.Popen | None = None @property def base_url(self) -> str: return f"http://127.0.0.1:{self.port}/v1" def start(self, *, ctx_size: int = 8192, wait_s: float = 300.0) -> None: problems = preflight() if problems: raise LocalSetupError("\n".join(problems)) cmd = server_command(installed_model(), self.port, ctx_size) assert cmd is not None # preflight guaranteed a runtime log_path = SCRYPT_HOME / "llama-server.log" log_path.parent.mkdir(parents=True, exist_ok=True) with log_path.open("wb") as log: self.proc = subprocess.Popen(cmd, stdout=log, stderr=log) deadline = time.monotonic() + wait_s while time.monotonic() < deadline: if self.proc.poll() is not None: raise LocalSetupError( "llama-server exited during startup: " + self._log_tail(log_path) + f"\n(full log: {log_path})" ) if self._healthy(): return time.sleep(0.5) self.stop() raise LocalSetupError("llama-server did not become healthy in time") @staticmethod def _log_tail(log_path: Path, lines: int = 3) -> str: try: tail = log_path.read_text(errors="replace").strip().splitlines() interesting = [l for l in tail if "error" in l.lower()] or tail return " | ".join(interesting[-lines:]) except OSError: return "(no log)" def _healthy(self) -> bool: # llama-server answers /health; llama_cpp.server answers /v1/models. for probe in ("health", "v1/models"): try: with urllib.request.urlopen( f"http://127.0.0.1:{self.port}/{probe}", timeout=1 ) as r: if r.status == 200: return True except OSError: continue return False def backend(self, **kw) -> OpenAIChatBackend: return OpenAIChatBackend(self.base_url, model="local", **kw) def stop(self) -> None: if self.proc and self.proc.poll() is None: self.proc.terminate() try: self.proc.wait(timeout=5) except subprocess.TimeoutExpired: self.proc.kill() self.proc = None