Spaces:

build-small-hackathon
/

maindlock

Running

App Files Files Community

maindlock / app.py

arbios

Update: forgotten/epitaph/conviction, instant demo, VoxCPM2 voices, docs, card

9a41b58 verified 16 days ago

Raw

History Blame Contribute Delete

8.06 kB

	"""Hugging Face Space entry point for Mindlock.

	One process serves everything (the Off-Brand pattern):
	/ — the game: custom canvas front (FastAPI + static), the "open the skull" UI
	/about — a Gradio block with how-to-play, links and the eligibility map

	Backends, picked by env:
	MINDLOCK_FAKE=1 deterministic demo, no models
	(default, laptop) Ollama — minicpm-v4.6 + nemotron-3-nano:4b
	MINDLOCK_BACKEND=llamacpp llama.cpp runtime: llama-server subprocesses,
	GGUF weights pulled from the Hub on first boot

	python app.py # laptop (Ollama)
	MINDLOCK_FAKE=1 python app.py # no models
	MINDLOCK_BACKEND=llamacpp python app.py # llama.cpp (the Space path)
	"""
	import atexit
	import os
	import shutil
	import subprocess
	import sys
	import time
	import urllib.request

	# Put the "src" directory that sits next to this file first on sys.path, so imports
	# resolve against it before anything installed (presumably where the `mindlock`
	# package imported further down lives — confirm against the repo layout).
	sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src"))

	# --------------------------------------------------------------- llama.cpp boot (Space)
	# Two single-model servers (sensors + voice), weights from the Hub. Started BEFORE the
	# game session imports so health checks see them.
	_LLAMA = [
	    # One row per llama-server: (env var naming the host, port, Hub repo, GGUF file).
	    # Repo and file can each be overridden through the environment.
	    (
	        "MINDLOCK_LLAMA_HOST",
	        8091,
	        os.getenv("MINDLOCK_GGUF_REPO", "openbmb/MiniCPM5-1B-GGUF"),
	        os.getenv("MINDLOCK_GGUF_FILE", "MiniCPM5-1B-Q4_K_M.gguf"),
	    ),
	    (
	        "MINDLOCK_LLAMA_DLPFC_HOST",
	        8092,
	        os.getenv("MINDLOCK_GGUF_DLPFC_REPO", "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF"),
	        os.getenv("MINDLOCK_GGUF_DLPFC_FILE", "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf"),
	    ),
	]


	def _fetch_llama_server() -> str:
	    """Return the path of a usable ``llama-server`` binary, downloading it if needed.

	    For hosts with no system llama-server (the Space): pull the official prebuilt
	    linux-x64 archive (a ``.tar.gz``) from the latest llama.cpp GitHub release — a
	    small download beats a 30-minute source build of the wheel. The resolved binary
	    path is stored in a ``.ready`` marker under ``~/.cache/mindlock/llama`` so later
	    calls skip the download.

	    Side effect: the binary's directory is put at the front of ``LD_LIBRARY_PATH``
	    (on the cached path too), so processes spawned afterwards can load the shared
	    libraries shipped next to the binary.

	    Returns:
	        Absolute path of the ``llama-server`` executable.

	    Raises:
	        RuntimeError: the release lists no matching asset, or the archive holds no
	            ``llama-server`` file.
	    """
	    import io
	    import json as _json
	    import tarfile

	    def _expose_libs(binary: str) -> None:
	        # Prepend once: no duplicate entry on repeated calls, and no empty entry
	        # (an empty LD_LIBRARY_PATH element means "current directory").
	        libdir = os.path.dirname(binary)
	        entries = [p for p in os.environ.get("LD_LIBRARY_PATH", "").split(":") if p]
	        if libdir not in entries:
	            os.environ["LD_LIBRARY_PATH"] = ":".join([libdir, *entries])

	    cache = os.path.expanduser("~/.cache/mindlock/llama")
	    marker = os.path.join(cache, ".ready")
	    if os.path.exists(marker):
	        with open(marker) as fh:
	            cached = fh.read().strip()
	        if cached and os.path.isfile(cached):
	            # A fresh process has not set the loader path yet — do it here as well.
	            _expose_libs(cached)
	            return cached
	        # Stale marker (binary gone): fall through and download again.

	    os.makedirs(cache, exist_ok=True)
	    with urllib.request.urlopen(
	            "https://api.github.com/repos/ggml-org/llama.cpp/releases/latest",
	            timeout=30) as resp:
	        rel = _json.loads(resp.read())
	    asset = next(
	        (a for a in rel.get("assets", [])
	         if "bin-ubuntu-x64" in a["name"] and a["name"].endswith(".tar.gz")),
	        None,
	    )
	    if asset is None:
	        raise RuntimeError("no bin-ubuntu-x64 .tar.gz asset in the latest llama.cpp release")
	    with urllib.request.urlopen(asset["browser_download_url"], timeout=300) as resp:
	        blob = resp.read()
	    with tarfile.open(fileobj=io.BytesIO(blob), mode="r:gz") as archive:
	        if hasattr(tarfile, "data_filter"):
	            # Downloaded archive: refuse members that would escape the cache dir.
	            archive.extractall(cache, filter="data")
	        else:  # older Python without extraction filters
	            archive.extractall(cache)

	    binpath = ""
	    for root, _, files in os.walk(cache):  # binary + its .so live in build/bin
	        for f in files:
	            full = os.path.join(root, f)
	            if f == "llama-server":
	                os.chmod(full, 0o755)
	                binpath = full
	            elif f.endswith(".so"):
	                os.chmod(full, 0o755)
	    if not binpath:
	        raise RuntimeError("llama-server not found inside the release archive")
	    _expose_libs(binpath)
	    with open(marker, "w") as fh:
	        fh.write(binpath)
	    return binpath


	def _llama_cmd(path: str, port: int) -> list[str]:
	    """Build the argv that starts one llama-server for the model at *path* on *port*.

	    A ``llama-server`` found on PATH is preferred; the prebuilt release binary is
	    only fetched when none is installed.
	    """
	    executable = shutil.which("llama-server")
	    if not executable:
	        executable = _fetch_llama_server()
	    argv = [executable]
	    argv += ["-m", path]
	    argv += ["--port", str(port)]
	    argv += ["-c", "4096"]
	    argv.append("--no-warmup")
	    return argv


	def _boot_llama() -> None:
	    """Download the GGUF weights, start one llama-server per ``_LLAMA`` row, wait for health.

	    For every row: fetch the weights from the Hub, spawn a llama-server subprocess on
	    the row's port (output discarded), register its termination at interpreter exit
	    and publish ``http://127.0.0.1:<port>`` in the row's environment variable. Then
	    poll each server's ``/health`` endpoint under one shared 300 s budget, and finally
	    send a one-token chat completion to each server from a daemon thread.

	    Boot is best-effort: a server that exits early or never answers ``/health`` is
	    reported on stderr instead of raising, and the function still returns.
	    """
	    import json as _json
	    import threading

	    from huggingface_hub import hf_hub_download

	    servers = []  # (port, process) of every llama-server spawned below
	    for env_var, port, repo, fname in _LLAMA:
	        path = hf_hub_download(repo_id=repo, filename=fname)
	        proc = subprocess.Popen(
	            _llama_cmd(path, port),
	            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
	        )
	        atexit.register(proc.terminate)
	        os.environ[env_var] = f"http://127.0.0.1:{port}"
	        servers.append((port, proc))

	    deadline = time.time() + 300  # one budget shared by all servers
	    for port, proc in servers:
	        healthy = False
	        while time.time() < deadline:
	            exit_code = proc.poll()
	            if exit_code is not None:
	                # A dead process will never answer — stop polling it right away.
	                print(f"[mindlock] llama-server on port {port} exited with code "
	                      f"{exit_code} before becoming healthy", file=sys.stderr)
	                break
	            try:
	                with urllib.request.urlopen(f"http://127.0.0.1:{port}/health", timeout=2):
	                    healthy = True
	                break
	            except Exception:  # noqa: BLE001 — any failure means "not ready yet"; retry
	                time.sleep(1.5)
	        if not healthy and proc.poll() is None:
	            print(f"[mindlock] llama-server on port {port} did not report healthy "
	                  "within the boot budget; continuing anyway", file=sys.stderr)

	    # warm each server with one real templated token in the background, so the first
	    # player turn doesn't pay the cold prompt-processing cost on top of its own
	    def _warm(port: int) -> None:
	        try:
	            req = urllib.request.Request(
	                f"http://127.0.0.1:{port}/v1/chat/completions",
	                data=_json.dumps({"model": "default", "max_tokens": 1,
	                                  "messages": [{"role": "user", "content": "hi"}]}).encode(),
	                headers={"Content-Type": "application/json"})
	            with urllib.request.urlopen(req, timeout=120):
	                pass  # only the round trip matters; the reply itself is discarded
	        except Exception:  # noqa: BLE001 — warmup is best-effort
	            pass

	    for port, _ in servers:
	        threading.Thread(target=_warm, args=(port,), daemon=True).start()


	# A Space has no Ollama, so llama.cpp becomes the default backend there; a backend
	# already chosen through MINDLOCK_BACKEND is left untouched.
	if os.environ.get("SPACE_ID"):
	    os.environ.setdefault("MINDLOCK_BACKEND", "llamacpp")

	# Start the llama-server processes only for the llama.cpp backend, and never in
	# MINDLOCK_FAKE mode.
	if not os.environ.get("MINDLOCK_FAKE") and os.environ.get("MINDLOCK_BACKEND") == "llamacpp":
	    _boot_llama()

	import gradio as gr # noqa: E402

	from mindlock.game.server import app # noqa: E402 — the FastAPI game (custom front at /)

	# Markdown body rendered by the Gradio block mounted at /about. Table pipes are plain
	# `|`: a backslash-escaped pipe is an invalid Python escape sequence and stops the
	# Markdown table from rendering as a table.
	ABOUT_MD = """
	# 🧠 MINDLOCK

	**An escape room where every character is not one AI — but a hierarchy of tiny offline
	language models: the departments of one mind.** You don't pick a lock. You change a mind.

	[▶ Play](/) · every NPC runs a 6-region brain cascade (amygdala → hippocampus →
	striatum → ACC → vmPFC → dlPFC) on small local models. Cruelty burns their finite
	thinking tokens; empathy lets them heal. A mind that reaches zero is gone — with
	everything it knew.

	## How to play
	- WASD / arrows — move · E — talk / use · 🧠 / /brain — open the skull
	- Each room: someone holds the key. Someone else knows what reaches them. Listen,
	learn the word that matters, say it like you mean it.
	- Story — ten rooms of one man's memory. Find out who you are. (~30 min)
	- Endless — procedurally generated minds, forever.
	- 👁 Watch a mind (menu) — a real recorded session replays instantly, zero model
	calls, so you can see the cascade, the token burn and a mind's death without waiting.

	## The brain is real, not a metaphor
	- [The engine →](/docs/architecture) — the value-network + dual-system grounding, the
	deterministic vmPFC, every mechanic tied to a citation (not the debunked triune brain).
	- [A recorded mind →](/docs/trace) — one NPC's full six-region deliberation, two ways:
	reached with empathy, and burned down with cruelty until it dies.
	- Conviction — each region shows how sharply it committed, read from the model's own
	token logits. A hosted chat API never exposes that; only a local mind can.

	## Small models, doing the carrying
	| Role | Model |
	|---|---|
	| Sensory regions (threat, memory, habit, cost) | MiniCPM (OpenBMB) |
	| Voice — the words a mind says out loud | Nemotron 3 Nano 4B (NVIDIA) |
	| Runtime | llama.cpp / Ollama — fully offline |

	Links: demo video · social post · GitHub repo — added at submission.

	---

	🤗 Built small for the Hugging Face × Gradio* hackathon (Thousand Token Wood track).
	Minds by OpenBMB MiniCPM and NVIDIA Nemotron, running on llama.cpp — fully offline.
	Original soundtrack by DjinAscet.*
	"""

	# The /about page: one Gradio Blocks container holding the Markdown text above.
	with gr.Blocks(title="Mindlock — about") as about:
	    gr.Markdown(ABOUT_MD)

	# Attach the Gradio page to the game's FastAPI app; `app` is rebound to the result.
	app = gr.mount_gradio_app(app, about, path="/about")

	if __name__ == "__main__":
	    import uvicorn

	    # PORT wins over MINDLOCK_PORT; 7860 is used when neither is set.
	    fallback_port = os.environ.get("MINDLOCK_PORT", 7860)
	    listen_port = int(os.environ.get("PORT", fallback_port))
	    uvicorn.run(app, host="0.0.0.0", port=listen_port)