maindlock / app.py
arbios's picture
Update: forgotten/epitaph/conviction, instant demo, VoxCPM2 voices, docs, card
9a41b58 verified
Raw
History Blame Contribute Delete
8.06 kB
"""Hugging Face Space entry point for Mindlock.
One process serves everything (the Off-Brand pattern):
/ — the game: custom canvas front (FastAPI + static), the "open the skull" UI
/about — a Gradio block with how-to-play, links and the eligibility map
Backends, picked by env:
MINDLOCK_FAKE=1 deterministic demo, no models
(default, laptop) Ollama — minicpm-v4.6 + nemotron-3-nano:4b
MINDLOCK_BACKEND=llamacpp llama.cpp runtime: llama-server subprocesses,
GGUF weights pulled from the Hub on first boot
python app.py # laptop (Ollama)
MINDLOCK_FAKE=1 python app.py # no models
MINDLOCK_BACKEND=llamacpp python app.py # llama.cpp (the Space path)
"""
import atexit
import os
import shutil
import subprocess
import sys
import time
import urllib.request
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src"))
# --------------------------------------------------------------- llama.cpp boot (Space)
# Two single-model servers (sensors + voice), weights from the Hub. Started BEFORE the
# game session imports so health checks see them.
_LLAMA = [
# (env host var, port, repo, file, )
("MINDLOCK_LLAMA_HOST", 8091, os.environ.get("MINDLOCK_GGUF_REPO", "openbmb/MiniCPM5-1B-GGUF"),
os.environ.get("MINDLOCK_GGUF_FILE", "MiniCPM5-1B-Q4_K_M.gguf")),
("MINDLOCK_LLAMA_DLPFC_HOST", 8092, os.environ.get("MINDLOCK_GGUF_DLPFC_REPO", "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF"),
os.environ.get("MINDLOCK_GGUF_DLPFC_FILE", "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf")),
]
def _fetch_llama_server() -> str:
"""No system llama-server (the Space): pull the official prebuilt linux-x64 binary
from the llama.cpp release β€” a 5MB zip beats a 30-minute source build of the wheel."""
import io
import json as _json
import tarfile
cache = os.path.expanduser("~/.cache/mindlock/llama")
marker = os.path.join(cache, ".ready")
if os.path.exists(marker):
with open(marker) as fh:
return fh.read().strip()
os.makedirs(cache, exist_ok=True)
rel = _json.loads(urllib.request.urlopen(
"https://api.github.com/repos/ggml-org/llama.cpp/releases/latest", timeout=30).read())
asset = next(a for a in rel["assets"]
if "bin-ubuntu-x64" in a["name"] and a["name"].endswith(".tar.gz"))
blob = urllib.request.urlopen(asset["browser_download_url"], timeout=300).read()
tarfile.open(fileobj=io.BytesIO(blob), mode="r:gz").extractall(cache)
binpath = ""
for root, _, files in os.walk(cache): # binary + its .so live in build/bin
for f in files:
full = os.path.join(root, f)
if f == "llama-server":
os.chmod(full, 0o755)
binpath = full
elif f.endswith(".so"):
os.chmod(full, 0o755)
if not binpath:
raise RuntimeError("llama-server not found inside the release archive")
os.environ["LD_LIBRARY_PATH"] = (os.path.dirname(binpath) + ":"
+ os.environ.get("LD_LIBRARY_PATH", ""))
with open(marker, "w") as fh:
fh.write(binpath)
return binpath
def _llama_cmd(path: str, port: int) -> list[str]:
server = shutil.which("llama-server") or _fetch_llama_server()
return [server, "-m", path, "--port", str(port), "-c", "4096", "--no-warmup"]
def _boot_llama() -> None:
    """Start one llama-server per ``_LLAMA`` entry and wait for them to come up.

    For each entry the GGUF file is fetched with ``hf_hub_download``, a
    ``llama-server`` subprocess is spawned on the entry's port, its termination is
    registered with ``atexit``, and the entry's env var is set to the server's
    ``http://127.0.0.1:<port>`` URL. All servers then share a single 300-second
    budget to answer ``/health``. A server whose process exits, or that does not
    answer in time, is reported on stderr; start-up continues either way. Finally
    every server gets a one-token chat request on a daemon thread as a best-effort
    warm-up.
    """
    import json as _json
    import threading

    from huggingface_hub import hf_hub_download

    servers = []  # (port, process) in launch order
    for env_var, port, repo, fname in _LLAMA:
        path = hf_hub_download(repo_id=repo, filename=fname)
        proc = subprocess.Popen(
            _llama_cmd(path, port),
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        atexit.register(proc.terminate)
        os.environ[env_var] = f"http://127.0.0.1:{port}"
        servers.append((port, proc))

    deadline = time.time() + 300
    for port, proc in servers:
        healthy = False
        while time.time() < deadline:
            if proc.poll() is not None:
                # The process is gone; polling its port would only burn the budget.
                break
            try:
                with urllib.request.urlopen(f"http://127.0.0.1:{port}/health", timeout=2):
                    pass
                healthy = True
                break
            except Exception:  # noqa: BLE001
                time.sleep(1.5)
        if not healthy:
            if proc.poll() is not None:
                reason = f"exited with code {proc.returncode}"
            else:
                reason = "did not become healthy in time"
            print(f"[mindlock] llama-server on port {port} {reason}", file=sys.stderr)

    # warm each server with one real templated token in the background, so the first
    # player turn doesn't pay the cold prompt-processing cost on top of its own
    def _warm(port: int) -> None:
        try:
            req = urllib.request.Request(
                f"http://127.0.0.1:{port}/v1/chat/completions",
                data=_json.dumps({"model": "default", "max_tokens": 1,
                                  "messages": [{"role": "user", "content": "hi"}]}).encode(),
                headers={"Content-Type": "application/json"})
            with urllib.request.urlopen(req, timeout=120):
                pass
        except Exception:  # noqa: BLE001 — warmup is best-effort
            pass

    for port, _ in servers:
        threading.Thread(target=_warm, args=(port,), daemon=True).start()
# a Space has no Ollama — default to llama.cpp there, unless a backend was already chosen
if os.environ.get("SPACE_ID"):
    os.environ.setdefault("MINDLOCK_BACKEND", "llamacpp")

# The llama-server subprocesses are only needed for the llama.cpp backend, and never
# when MINDLOCK_FAKE is set.
if not os.environ.get("MINDLOCK_FAKE"):
    if os.environ.get("MINDLOCK_BACKEND") == "llamacpp":
        _boot_llama()
# Imported only after the llama.cpp boot above, hence the E402 suppressions.
import gradio as gr # noqa: E402
from mindlock.game.server import app # noqa: E402 — the FastAPI game (custom front at /)
# Markdown body of the /about page. A blank line separates the table from the
# "Links" line (otherwise that line is parsed as a table row) and the "Links" line
# from the horizontal rule (otherwise "---" turns it into a heading).
ABOUT_MD = """
# 🧠 MINDLOCK
**An escape room where every character is not one AI — but a hierarchy of tiny offline
language models: the departments of one mind.** You don't pick a lock. You change a mind.
**[▶ Play](/)** · every NPC runs a 6-region brain cascade (amygdala → hippocampus →
striatum → ACC → vmPFC → dlPFC) on small local models. Cruelty burns their finite
thinking tokens; empathy lets them heal. A mind that reaches zero is gone — with
everything it knew.
## How to play
- **WASD / arrows** — move · **E** — talk / use · **🧠 / /brain** — open the skull
- Each room: someone holds the key. Someone else knows what reaches them. Listen,
learn the word that matters, say it like you mean it.
- **Story** — ten rooms of one man's memory. Find out who you are. *(~30 min)*
- **Endless** — procedurally generated minds, forever.
- **👁 Watch a mind** (menu) — a *real* recorded session replays instantly, zero model
calls, so you can see the cascade, the token burn and a mind's death without waiting.
## The brain is real, not a metaphor
- **[The engine →](/docs/architecture)** — the value-network + dual-system grounding, the
deterministic vmPFC, every mechanic tied to a citation (not the debunked triune brain).
- **[A recorded mind →](/docs/trace)** — one NPC's full six-region deliberation, two ways:
reached with empathy, and burned down with cruelty until it dies.
- **Conviction** — each region shows how sharply it committed, read from the model's own
token logits. A hosted chat API never exposes that; only a local mind can.
## Small models, doing the carrying
| Role | Model |
|---|---|
| Sensory regions (threat, memory, habit, cost) | MiniCPM (OpenBMB) |
| Voice — the words a mind says out loud | Nemotron 3 Nano 4B (NVIDIA) |
| Runtime | llama.cpp / Ollama — fully offline |

*Links: demo video · social post · GitHub repo — added at submission.*

---
🤗 *Built small for the **Hugging Face × Gradio** hackathon (Thousand Token Wood track).
Minds by **OpenBMB MiniCPM** and **NVIDIA Nemotron**, running on **llama.cpp** — fully offline.
Original soundtrack by **DjinAscet**.*
"""
# The /about page: a Gradio Blocks app that renders ABOUT_MD, mounted on the FastAPI
# game app so a single process serves both.
about = gr.Blocks(title="Mindlock — about")
with about:
    gr.Markdown(ABOUT_MD)
app = gr.mount_gradio_app(app, about, path="/about")
if __name__ == "__main__":
    import uvicorn

    # PORT wins over MINDLOCK_PORT; 7860 is used when neither is set.
    fallback_port = os.environ.get("MINDLOCK_PORT", 7860)
    listen_port = int(os.environ.get("PORT", fallback_port))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)