Spaces:
Running on Zero
Running on Zero
| """Local inference: manage a llama-server subprocess running Nemotron. | |
| We bundle nothing. On first run the player is walked through fetching the | |
| llama.cpp release binary and the GGUF (the carving-the-totem ritual lives | |
| in the UI layer; this module only knows paths, processes, and health). | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import shutil | |
| import subprocess | |
| import time | |
| import urllib.request | |
| from pathlib import Path | |
| from .api import OpenAIChatBackend | |
# Root directory for everything this module reads or writes (curated binary,
# models, server log). Overridable through the SCRYPT_HOME environment variable.
SCRYPT_HOME = Path(os.getenv("SCRYPT_HOME", "~/.scrypt")).expanduser()

# Base URL the GGUF files are fetched from.
HF_REPO = "https://huggingface.co/IMJONEZZ/warden-nemotron-3-nano-30b/resolve/main"

# RAM floor quoted in user-facing messages when local mode is refused.
MIN_RAM_GB = 32

# Default port the local inference server listens on.
PORT = 8731

# Quant ladder: pick the heaviest GGUF that leaves the OS and the game
# breathing room on this machine's TOTAL memory.
# Each rung is (quant name, file size GB, min total RAM GB), heaviest first;
# choose_quant() returns the first rung whose RAM requirement is met.
# Below 32GB total we refuse local mode and point at the API.
QUANT_LADDER = [
    ("Q8_0", 34, 96),
    ("Q6_K", 34, 80),
    ("Q5_K_M", 27, 64),
    ("Q4_K_S", 22, 40),
    ("Q3_K_S", 19, 32),
]
def choose_quant(ram_gb: float | None = None) -> str | None:
    """Pick the quant tier for this machine.

    Resolution order:

    1. A non-empty ``SCRYPT_QUANT`` environment variable is returned verbatim.
    2. If the RAM figure is 0 (what ``system_ram_gb`` reports when it cannot
       read ``/proc/meminfo``, e.g. on non-Linux hosts) the answer is
       ``"Q4_K_S"`` -- the player can always override.
    3. Otherwise the first rung of ``QUANT_LADDER`` whose RAM requirement is
       satisfied.

    Args:
        ram_gb: Total RAM in GB; when ``None`` it is measured with
            ``system_ram_gb()``.

    Returns:
        The quant name, or ``None`` when the machine is below every rung.
    """
    override = os.environ.get("SCRYPT_QUANT")
    if override:
        return override

    total = ram_gb if ram_gb is not None else system_ram_gb()
    if total == 0:
        return "Q4_K_S"

    # The ladder is ordered heaviest first, so the first match is the best fit.
    eligible = (name for name, _file_gb, needed in QUANT_LADDER if total >= needed)
    return next(eligible, None)
def model_file(quant: str) -> str:
    """Return the GGUF file name used for the given quant tier."""
    return "warden-nemotron-3-nano-30b-{}.gguf".format(quant)
def model_url(quant: str) -> str:
    """Return the download URL of the GGUF for the given quant tier."""
    return "/".join((HF_REPO, model_file(quant)))
class LocalSetupError(Exception):
    """Raised when local inference cannot be set up or started."""
def system_ram_gb() -> float:
    """Return total system RAM in GB as read from ``/proc/meminfo``.

    Returns ``0.0`` when the file cannot be opened or read (``OSError``) or
    when it contains no ``MemTotal`` line.
    """
    try:
        with open("/proc/meminfo") as meminfo:
            total_line = next(
                (row for row in meminfo if row.startswith("MemTotal")), None
            )
    except OSError:
        return 0.0

    if total_line is None:
        return 0.0
    # The second field is divided by 1024 twice to express it in GB.
    return int(total_line.split()[1]) / 1024 / 1024
def find_binary() -> Path | None:
    """Locate a ``llama-server`` executable, or return ``None`` if there is none.

    The copy under ``SCRYPT_HOME/bin`` is checked before ``PATH``.
    """
    # Our curated binary first: it's the one we built/verified for this
    # machine (e.g. the CUDA build), so it outranks whatever is on PATH.
    curated = SCRYPT_HOME / "bin" / "llama-server"
    if curated.exists():
        return curated

    from_path = shutil.which("llama-server")
    if from_path:
        return Path(from_path)
    return None
def llama_cpp_available() -> bool:
    """Report whether the ``llama_cpp`` package can be imported.

    Bundled runtime: pip install 'scrypt[local]' brings llama-cpp-python.
    """
    from importlib.util import find_spec

    spec = find_spec("llama_cpp")
    return spec is not None
def server_command(model: Path, port: int, ctx_size: int) -> list[str] | None:
    """Build the argv that launches an OpenAI-compatible server here.

    A native llama-server binary is preferred (faster, full --jinja template
    support); the bundled llama-cpp-python server is the fallback so players
    never need anything on PATH.

    Args:
        model: Path of the GGUF file to serve.
        port: Port the server should listen on.
        ctx_size: Context window size passed to the server.

    Returns:
        The command as a list of strings, or ``None`` when neither runtime
        is available.
    """
    native = find_binary()
    if native is not None:
        argv = [str(native)]
        argv += ["--model", str(model)]
        argv += ["--port", str(port)]
        argv += ["--ctx-size", str(ctx_size)]
        argv.append("--jinja")  # required for Nemotron chat template kwargs
        argv.append("--no-webui")
        return argv

    if not llama_cpp_available():
        return None

    import sys

    # llama_cpp.server ignores chat_template_kwargs; enable_thinking
    # stays off by default in the GGUF template, which is what we want.
    argv = [sys.executable, "-m", "llama_cpp.server"]
    argv += ["--model", str(model)]
    argv += ["--port", str(port)]
    argv += ["--n_ctx", str(ctx_size)]
    return argv
def model_path() -> Path:
    """Return where this machine's tier of model is expected on disk.

    When ``choose_quant()`` yields no tier, the ``Q3_K_S`` file name is used.
    """
    tier = choose_quant() or "Q3_K_S"
    return SCRYPT_HOME / "models" / model_file(tier)
def installed_model() -> Path | None:
    """Find any model that is already downloaded.

    This lets a player who manually fetched a different tier -- or named the
    file whatever they liked -- still get local mode. Files carrying a ladder
    name win, checked in ladder order; otherwise the largest ``.gguf`` in the
    models directory is used.

    Returns:
        Path to the chosen model, or ``None`` when the models directory is
        missing or holds no ``.gguf`` file.
    """
    models_dir = SCRYPT_HOME / "models"
    if not models_dir.is_dir():
        return None

    ladder_files = (models_dir / model_file(rung[0]) for rung in QUANT_LADDER)
    known = next((path for path in ladder_files if path.exists()), None)
    if known is not None:
        return known

    # No ladder-named file: fall back to the biggest GGUF present.
    return max(
        models_dir.glob("*.gguf"),
        key=lambda path: path.stat().st_size,
        default=None,
    )
def preflight() -> list[str]:
    """Collect human-readable problems blocking local inference.

    Three things are checked: a usable quant tier (enough RAM), a llama
    runtime (native binary or llama-cpp-python), and a downloaded model.

    Returns:
        One message per problem. An empty list means local mode is ready.
    """
    problems: list[str] = []

    quant = choose_quant()
    if quant is None:
        problems.append(
            f"this machine has {system_ram_gb():.0f}GB RAM; local play needs "
            f"{MIN_RAM_GB}GB. use API mode instead (set SCRYPT_API_KEY)."
        )

    if find_binary() is None and not llama_cpp_available():
        problems.append(
            "no llama runtime. easiest: pip install 'scrypt[local]' (bundles "
            "llama-cpp-python). or install llama.cpp's llama-server "
            f"(https://github.com/ggml-org/llama.cpp/releases) on PATH or at "
            f"{SCRYPT_HOME / 'bin' / 'llama-server'}."
        )

    if quant is not None and installed_model() is None:
        # SCRYPT_QUANT may force a quant that is not on the ladder, in which
        # case its size is unknown. Pass a default to next() so this reports
        # the missing model instead of raising StopIteration.
        size = next((s for q, s, _ in QUANT_LADDER if q == quant), None)
        approx = f" (~{size}GB)" if size is not None else ""
        problems.append(
            f"model not found. this machine's tier is {quant}{approx}:\n"
            f" {model_url(quant)}\n -> {SCRYPT_HOME / 'models' / model_file(quant)}"
        )

    return problems
def download_model(progress=None) -> Path:
    """Fetch this machine's tier of GGUF into ``SCRYPT_HOME/models``.

    The file is downloaded to a ``.part`` sibling and moved into place only
    after the transfer completes, so an interrupted download never leaves a
    truncated model under the final name.

    Args:
        progress: Optional callback invoked with the completed fraction
            (0.0-1.0) whenever the server reports a total size.

    Returns:
        Path of the downloaded model file.

    Raises:
        LocalSetupError: If no quant tier fits this machine.
    """
    quant = choose_quant()
    if quant is None:
        raise LocalSetupError(f"below the {MIN_RAM_GB}GB floor; use API mode")

    dest = SCRYPT_HOME / "models" / model_file(quant)
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(".part")

    def hook(blocks, block_size, total):
        # total is not positive when the size is unknown; skip reporting then.
        if progress and total > 0:
            progress(min(1.0, blocks * block_size / total))

    try:
        urllib.request.urlretrieve(model_url(quant), tmp, reporthook=hook)
    except BaseException:
        tmp.unlink(missing_ok=True)  # a stale 20GB .part helps nobody
        raise

    # replace() overwrites an existing model on every platform; rename()
    # raises on Windows when the destination already exists, which would
    # fail a re-download only after the whole transfer had finished.
    tmp.replace(dest)
    return dest
class LlamaServer:
    """Owns the llama-server subprocess for one game session."""

    def __init__(self, port: int = PORT):
        """Remember the port; no process is started until ``start()``."""
        self.port = port
        self.proc: subprocess.Popen | None = None

    @property
    def base_url(self) -> str:
        """Base URL of the server's OpenAI-compatible API.

        A property because ``backend()`` reads ``self.base_url`` as a value.
        """
        return f"http://127.0.0.1:{self.port}/v1"

    def start(self, *, ctx_size: int = 8192, wait_s: float = 300.0) -> None:
        """Launch the server and block until it answers a health probe.

        Args:
            ctx_size: Context window size passed to the server.
            wait_s: Maximum number of seconds to wait for a healthy server.

        Raises:
            LocalSetupError: If preflight reports problems, the process exits
                during startup, or it is not healthy within ``wait_s``.
        """
        problems = preflight()
        if problems:
            raise LocalSetupError("\n".join(problems))

        cmd = server_command(installed_model(), self.port, ctx_size)
        assert cmd is not None  # preflight guaranteed a runtime

        log_path = SCRYPT_HOME / "llama-server.log"
        log_path.parent.mkdir(parents=True, exist_ok=True)
        # The child gets its own copy of the log handle, so ours can be
        # closed as soon as the process has been spawned.
        with log_path.open("wb") as log:
            self.proc = subprocess.Popen(cmd, stdout=log, stderr=log)

        deadline = time.monotonic() + wait_s
        while time.monotonic() < deadline:
            if self.proc.poll() is not None:
                raise LocalSetupError(
                    "llama-server exited during startup: "
                    + self._log_tail(log_path)
                    + f"\n(full log: {log_path})"
                )
            if self._healthy():
                return
            time.sleep(0.5)

        self.stop()
        raise LocalSetupError("llama-server did not become healthy in time")

    @staticmethod
    def _log_tail(log_path: Path, lines: int = 3) -> str:
        """Return the last few log lines, preferring those mentioning an error.

        Declared static: it takes no ``self``, yet ``start()`` calls it as
        ``self._log_tail(log_path)``.

        Args:
            log_path: Server log file to read.
            lines: Maximum number of lines to include.

        Returns:
            The selected lines joined with ``" | "``, or ``"(no log)"`` when
            the file cannot be read.
        """
        try:
            tail = log_path.read_text(errors="replace").strip().splitlines()
            interesting = [line for line in tail if "error" in line.lower()] or tail
            return " | ".join(interesting[-lines:])
        except OSError:
            return "(no log)"

    def _healthy(self) -> bool:
        """Return True once either health endpoint answers with HTTP 200."""
        # llama-server answers /health; llama_cpp.server answers /v1/models.
        for probe in ("health", "v1/models"):
            try:
                with urllib.request.urlopen(
                    f"http://127.0.0.1:{self.port}/{probe}", timeout=1
                ) as r:
                    if r.status == 200:
                        return True
            except OSError:
                # Connection refused, timeouts and HTTP error statuses all
                # land here; move on to the next probe.
                continue
        return False

    def backend(self, **kw) -> OpenAIChatBackend:
        """Build a chat backend pointed at this server; ``kw`` is forwarded."""
        return OpenAIChatBackend(self.base_url, model="local", **kw)

    def stop(self) -> None:
        """Terminate the server if it is running; safe to call repeatedly."""
        if self.proc and self.proc.poll() is None:
            self.proc.terminate()
            try:
                self.proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self.proc.kill()
                # Reap the killed child so it does not linger as a zombie.
                self.proc.wait()
        self.proc = None