"""Hugging Face Space entry point for Mindlock. One process serves everything (the Off-Brand pattern): / — the game: custom canvas front (FastAPI + static), the "open the skull" UI /about — a Gradio block with how-to-play, links and the eligibility map Backends, picked by env: MINDLOCK_FAKE=1 deterministic demo, no models (default, laptop) Ollama — minicpm-v4.6 + nemotron-3-nano:4b MINDLOCK_BACKEND=llamacpp llama.cpp runtime: llama-server subprocesses, GGUF weights pulled from the Hub on first boot python app.py # laptop (Ollama) MINDLOCK_FAKE=1 python app.py # no models MINDLOCK_BACKEND=llamacpp python app.py # llama.cpp (the Space path) """ import atexit import os import shutil import subprocess import sys import time import urllib.request sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")) # --------------------------------------------------------------- llama.cpp boot (Space) # Two single-model servers (sensors + voice), weights from the Hub. Started BEFORE the # game session imports so health checks see them. _LLAMA = [ # (env host var, port, repo, file, ) ("MINDLOCK_LLAMA_HOST", 8091, os.environ.get("MINDLOCK_GGUF_REPO", "openbmb/MiniCPM5-1B-GGUF"), os.environ.get("MINDLOCK_GGUF_FILE", "MiniCPM5-1B-Q4_K_M.gguf")), ("MINDLOCK_LLAMA_DLPFC_HOST", 8092, os.environ.get("MINDLOCK_GGUF_DLPFC_REPO", "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF"), os.environ.get("MINDLOCK_GGUF_DLPFC_FILE", "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf")), ] def _fetch_llama_server() -> str: """No system llama-server (the Space): pull the official prebuilt linux-x64 binary from the llama.cpp release — a 5MB zip beats a 30-minute source build of the wheel.""" import io import json as _json import tarfile cache = os.path.expanduser("~/.cache/mindlock/llama") marker = os.path.join(cache, ".ready") if os.path.exists(marker): with open(marker) as fh: return fh.read().strip() os.makedirs(cache, exist_ok=True) rel = _json.loads(urllib.request.urlopen( "https://api.github.com/repos/ggml-org/llama.cpp/releases/latest", timeout=30).read()) asset = next(a for a in rel["assets"] if "bin-ubuntu-x64" in a["name"] and a["name"].endswith(".tar.gz")) blob = urllib.request.urlopen(asset["browser_download_url"], timeout=300).read() tarfile.open(fileobj=io.BytesIO(blob), mode="r:gz").extractall(cache) binpath = "" for root, _, files in os.walk(cache): # binary + its .so live in build/bin for f in files: full = os.path.join(root, f) if f == "llama-server": os.chmod(full, 0o755) binpath = full elif f.endswith(".so"): os.chmod(full, 0o755) if not binpath: raise RuntimeError("llama-server not found inside the release archive") os.environ["LD_LIBRARY_PATH"] = (os.path.dirname(binpath) + ":" + os.environ.get("LD_LIBRARY_PATH", "")) with open(marker, "w") as fh: fh.write(binpath) return binpath def _llama_cmd(path: str, port: int) -> list[str]: server = shutil.which("llama-server") or _fetch_llama_server() return [server, "-m", path, "--port", str(port), "-c", "4096", "--no-warmup"] def _boot_llama() -> None: from huggingface_hub import hf_hub_download for env_var, port, repo, fname in _LLAMA: path = hf_hub_download(repo_id=repo, filename=fname) proc = subprocess.Popen( _llama_cmd(path, port), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) atexit.register(proc.terminate) os.environ[env_var] = f"http://127.0.0.1:{port}" deadline = time.time() + 300 for _, port, _, _ in _LLAMA: while time.time() < deadline: try: urllib.request.urlopen(f"http://127.0.0.1:{port}/health", timeout=2) break except Exception: # noqa: BLE001 time.sleep(1.5) # warm each server with one real templated token in the background, so the first # player turn doesn't pay the cold prompt-processing cost on top of its own import json as _json import threading def _warm(port: int) -> None: try: req = urllib.request.Request( f"http://127.0.0.1:{port}/v1/chat/completions", data=_json.dumps({"model": "default", "max_tokens": 1, "messages": [{"role": "user", "content": "hi"}]}).encode(), headers={"Content-Type": "application/json"}) urllib.request.urlopen(req, timeout=120) except Exception: # noqa: BLE001 — warmup is best-effort pass for _, port, _, _ in _LLAMA: threading.Thread(target=_warm, args=(port,), daemon=True).start() if os.environ.get("SPACE_ID") and "MINDLOCK_BACKEND" not in os.environ: os.environ["MINDLOCK_BACKEND"] = "llamacpp" # a Space has no Ollama — default to llama.cpp if os.environ.get("MINDLOCK_BACKEND") == "llamacpp" and not os.environ.get("MINDLOCK_FAKE"): _boot_llama() import gradio as gr # noqa: E402 from mindlock.game.server import app # noqa: E402 — the FastAPI game (custom front at /) ABOUT_MD = """ # 🧠 MINDLOCK **An escape room where every character is not one AI — but a hierarchy of tiny offline language models: the departments of one mind.** You don't pick a lock. You change a mind. **[▶ Play](/)** · every NPC runs a 6-region brain cascade (amygdala → hippocampus → striatum → ACC → vmPFC → dlPFC) on small local models. Cruelty burns their finite thinking tokens; empathy lets them heal. A mind that reaches zero is gone — with everything it knew. ## How to play - **WASD / arrows** — move · **E** — talk / use · **🧠 / /brain** — open the skull - Each room: someone holds the key. Someone else knows what reaches them. Listen, learn the word that matters, say it like you mean it. - **Story** — ten rooms of one man's memory. Find out who you are. *(~30 min)* - **Endless** — procedurally generated minds, forever. - **👁 Watch a mind** (menu) — a *real* recorded session replays instantly, zero model calls, so you can see the cascade, the token burn and a mind's death without waiting. ## The brain is real, not a metaphor - **[The engine →](/docs/architecture)** — the value-network + dual-system grounding, the deterministic vmPFC, every mechanic tied to a citation (not the debunked triune brain). - **[A recorded mind →](/docs/trace)** — one NPC's full six-region deliberation, two ways: reached with empathy, and burned down with cruelty until it dies. - **Conviction** — each region shows how sharply it committed, read from the model's own token logits. A hosted chat API never exposes that; only a local mind can. ## Small models, doing the carrying | Role | Model | |---|---| | Sensory regions (threat, memory, habit, cost) | MiniCPM (OpenBMB) | | Voice — the words a mind says out loud | Nemotron 3 Nano 4B (NVIDIA) | | Runtime | llama.cpp / Ollama — fully offline | *Links: demo video · social post · GitHub repo — added at submission.* --- 🤗 *Built small for the **Hugging Face × Gradio** hackathon (Thousand Token Wood track). Minds by **OpenBMB MiniCPM** and **NVIDIA Nemotron**, running on **llama.cpp** — fully offline. Original soundtrack by **DjinAscet**.* """ about = gr.Blocks(title="Mindlock — about") with about: gr.Markdown(ABOUT_MD) app = gr.mount_gradio_app(app, about, path="/about") if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", os.environ.get("MINDLOCK_PORT", 7860))))