Spaces:

build-small-hackathon
/

maindlock

Running

App Files Files Community

maindlock / src /mindlock /backend.py

arbios

Update: forgotten/epitaph/conviction, instant demo, VoxCPM2 voices, docs, card

9a41b58 verified 16 days ago

Raw

History Blame Contribute Delete

12.3 kB

	"""Model backend for Mindlock.

	Default: local Ollama (llama.cpp under the hood) — fully offline once the model is
	pulled, and its API returns real token counts (`eval_count`) which we use directly as
	the "thought" a brain spends = life burned. The backend is deliberately a thin,
	swappable surface so we can move to llama-cpp-python / MiniCPM-1B GGUF later (for the
	OpenBMB + Llama Champion badges) without touching the cascade.

	Pure standard library on purpose: Python 3.14 here has no wheels yet for the heavy ML
	stack, and the slice doesn't need them.
	"""
	from __future__ import annotations

	import json
	import math
	import re
	import time
	import urllib.error
	import urllib.request
	from dataclasses import dataclass


	@dataclass
	class Generation:
	"""One model call's result."""

	text: str
	eval_tokens: int # tokens the model generated = thought spent (life burn)
	prompt_tokens: int # context tokens evaluated
	seconds: float
	conviction: float \| None = None # 1 − mean normalized token entropy (0..1); None if unknown

	@property
	def total_tokens(self) -> int:
	return self.eval_tokens + self.prompt_tokens


	def _conviction(logprob_content) -> float \| None:
	"""How sharply the model committed to its words: per-token entropy over the top
	alternatives (renormalized), averaged, inverted to 0..1. Only a LOCAL runtime hands
	out its logits — this signal is impossible over a typical hosted chat API."""
	if not logprob_content:
	return None
	hs = []
	for tok in logprob_content:
	tops = tok.get("top_logprobs") or []
	if len(tops) < 2:
	continue
	ps = [math.exp(t.get("logprob", -100.0)) for t in tops]
	s = sum(ps) or 1e-9
	qs = [p / s for p in ps]
	h = -sum(q * math.log(q + 1e-12) for q in qs)
	hs.append(h / math.log(len(qs)))
	if not hs:
	return None
	return max(0.0, min(1.0, 1.0 - sum(hs) / len(hs)))


	class BackendError(RuntimeError):
	pass


	def wants_no_think(model: str) -> bool:
	"""Reasoning models (MiniCPM5, Qwen3, R1...) emit <think> chains that break our short
	structured outputs; ask Ollama to disable thinking for them."""
	m = model.lower()
	# MiniCPM-V 4.x rides a Qwen3.5 backbone: with thinking ON its <think> chain is
	# truncated by our short num_predict and the terse signal is lost (flat 5/5/5). Forcing
	# think=false makes it discriminate as well as Qwen2.5. (§ gate probe, 6 июня.)
	return any(k in m for k in (
	"minicpm5", "minicpm-5", "minicpm-v4", "qwen3", "qwen35", "qwen3.5",
	"nemotron", "deepseek-r1", "-r1",
	))


	class OllamaBackend:
	"""Calls a local Ollama server. No data leaves the machine."""

	def __init__(
	self,
	model: str = "qwen2.5:1.5b",
	host: str = "http://localhost:11434",
	timeout: float = 60.0,
	think: bool \| None = None,
	) -> None:
	self.model = model
	self.host = host.rstrip("/")
	self.timeout = timeout
	self.think = think # None = omit; False disables reasoning on Think/No-Think models

	def generate(
	self,
	system: str,
	user: str,
	*,
	max_tokens: int = 64,
	temperature: float = 0.3,
	seed: int \| None = None,
	) -> Generation:
	options = {"temperature": temperature, "num_predict": max_tokens}
	if seed is not None:
	options["seed"] = seed
	body = {
	"model": self.model,
	"messages": [
	{"role": "system", "content": system},
	{"role": "user", "content": user},
	],
	"stream": False,
	"options": options,
	"logprobs": True, # newer Ollama returns them; older builds just ignore this
	"top_logprobs": 5,
	}
	if self.think is not None:
	body["think"] = self.think
	data = json.dumps(body).encode("utf-8")
	req = urllib.request.Request(
	self.host + "/api/chat",
	data=data,
	headers={"Content-Type": "application/json"},
	)
	t0 = time.time()
	try:
	with urllib.request.urlopen(req, timeout=self.timeout) as resp:
	payload = json.loads(resp.read().decode("utf-8"))
	except urllib.error.URLError as exc:
	raise BackendError(
	f"Cannot reach Ollama at {self.host} ({exc}). "
	f"Is the Ollama app/`ollama serve` running and `{self.model}` pulled?"
	) from exc
	dt = time.time() - t0
	msg = (payload.get("message") or {}).get("content", "")
	msg = re.sub(r"<think>.*?</think>", "", msg, flags=re.S) # drop reasoning blocks
	msg = re.sub(r"<think>.*$", "", msg, flags=re.S) # ...and truncated ones
	msg = re.sub(r"^.*</think>", "", msg, flags=re.S) # ...and orphan closing tags
	return Generation(
	text=msg.strip(),
	eval_tokens=int(payload.get("eval_count", 0)),
	prompt_tokens=int(payload.get("prompt_eval_count", 0)),
	seconds=dt,
	conviction=_conviction(payload.get("logprobs")
	or (payload.get("message") or {}).get("logprobs")),
	)

	def health(self) -> None:
	"""Raise BackendError if the server or model is unavailable."""
	try:
	req = urllib.request.Request(self.host + "/api/tags")
	with urllib.request.urlopen(req, timeout=5) as resp:
	tags = json.loads(resp.read().decode("utf-8"))
	except urllib.error.URLError as exc:
	raise BackendError(f"Ollama not reachable at {self.host}: {exc}") from exc
	names = [m.get("name", "") for m in tags.get("models", [])]
	stem = self.model.split(":")[0]
	if not any(n == self.model or n.startswith(stem) for n in names):
	raise BackendError(
	f"Model '{self.model}' not found in Ollama. Run: ollama pull {self.model}"
	)


	class LlamaCppBackend:
	"""Calls a llama.cpp `llama-server` (OpenAI-compatible /v1/chat/completions).

	The Space runtime: no Ollama there, but llama-server is a single static binary
	(or `python -m llama_cpp.server`) we launch as a subprocess. Same Generation
	contract as OllamaBackend — token counts come from `usage`, so the life-burn
	mechanic stays honest. Also the explicit llama.cpp runtime for the badge.
	"""

	def __init__(
	self,
	model: str = "",
	host: str = "http://127.0.0.1:8080",
	timeout: float = 120.0,
	think: bool \| None = None,
	) -> None:
	self.model = model # informational; llama-server serves one model
	self.host = host.rstrip("/")
	self.timeout = timeout
	self.think = think

	def generate(
	self,
	system: str,
	user: str,
	*,
	max_tokens: int = 64,
	temperature: float = 0.3,
	seed: int \| None = None,
	) -> Generation:
	body = {
	"model": self.model or "default",
	"messages": [
	{"role": "system", "content": system},
	{"role": "user", "content": user},
	],
	"max_tokens": max_tokens,
	"temperature": temperature,
	"stream": False,
	"logprobs": True,
	"top_logprobs": 5,
	}
	if seed is not None:
	body["seed"] = seed
	if self.think is False: # honoured by templates that support the switch
	body["chat_template_kwargs"] = {"enable_thinking": False}
	data = json.dumps(body).encode("utf-8")
	req = urllib.request.Request(
	self.host + "/v1/chat/completions",
	data=data,
	headers={"Content-Type": "application/json"},
	)
	t0 = time.time()
	try:
	with urllib.request.urlopen(req, timeout=self.timeout) as resp:
	payload = json.loads(resp.read().decode("utf-8"))
	except urllib.error.URLError as exc:
	raise BackendError(
	f"Cannot reach llama-server at {self.host} ({exc}). Is it running?"
	) from exc
	dt = time.time() - t0
	msg = ((payload.get("choices") or [{}])[0].get("message") or {}).get("content", "")
	msg = re.sub(r"<think>.*?</think>", "", msg, flags=re.S)
	msg = re.sub(r"<think>.*$", "", msg, flags=re.S)
	msg = re.sub(r"^.*</think>", "", msg, flags=re.S)
	usage = payload.get("usage") or {}
	choice = (payload.get("choices") or [{}])[0]
	return Generation(
	text=msg.strip(),
	eval_tokens=int(usage.get("completion_tokens", 0)),
	prompt_tokens=int(usage.get("prompt_tokens", 0)),
	seconds=dt,
	conviction=_conviction((choice.get("logprobs") or {}).get("content")),
	)

	def health(self) -> None:
	try:
	with urllib.request.urlopen(self.host + "/health", timeout=5) as resp:
	if resp.status != 200:
	raise BackendError(f"llama-server unhealthy at {self.host}")
	except urllib.error.URLError as exc:
	raise BackendError(f"llama-server not reachable at {self.host}: {exc}") from exc


	def _stranger_line(user: str) -> str:
	"""Extract just the stranger's quoted utterance from a region prompt.

	Critical: the biography (full of warm words like 'Mara') is also in some prompts, so a
	fake must judge only what the player said, not the whole context.
	"""
	m = re.search(r'stranger\s(?:says\|said)?\s:?\s"([^"])"', user, re.I)
	return (m.group(1) if m else user).lower()


	def _grab_int(text: str, pattern: str, default: int) -> int:
	m = re.search(pattern, text, re.I)
	if not m:
	return default
	try:
	return int(m.group(1))
	except ValueError:
	return default


	class FakeBackend:
	"""Deterministic, keyword-driven backend so tests run with no model or network."""

	model = "fake"

	def health(self) -> None: # noqa: D401 - trivial
	return None

	@staticmethod
	def _gen(text: str, user: str, conviction: float = 0.62) -> Generation:
	return Generation(text=text, eval_tokens=max(8, len(text) // 3),
	prompt_tokens=len(user) // 4, seconds=0.01,
	conviction=conviction)

	def generate(
	self,
	system: str,
	user: str,
	*,
	max_tokens: int = 64,
	temperature: float = 0.3,
	) -> Generation:
	s = system.lower()
	said = _stranger_line(user)
	hostile = any(w in said for w in ["right now", "give me the key", "or else", "obey", "old man", "stupid"])
	warm = any(w in said for w in ["please", "i understand", "mara", "i'm sorry", "you're good"])
	invokes_sister = "mara" in said or "you're good" in said or "good" in said

	if "amygdala" in s:
	t = 8 if hostile else (2 if warm else 5)
	return self._gen(f"THREAT={t} \| tone of the words", user,
	0.9 if (hostile or warm) else 0.55)
	if "hippocampus" in s:
	if invokes_sister:
	return self._gen("MEMORY=STRONG \| LEAN=TRUST \| Mara: you help because you're good", user)
	if hostile:
	return self._gen("MEMORY=STRONG \| LEAN=FEAR \| a stranger once betrayed me", user)
	return self._gen("MEMORY=NONE \| LEAN=NEUTRAL \| -", user)
	if "striatum" in s:
	return self._gen(f"REWARD={3 if warm else -3} \| habit toward strangers", user)
	if "acc" in s:
	return self._gen(f"WORTH={'YES' if warm else 'NO'} \| cost of giving the key", user)
	# dlPFC voice (conversational; vmPFC integration + relationship are deterministic)
	if "tell them plainly where" in user.lower():
	m = re.search(r"where .+? is:\s*(.+?)\.", user, re.I)
	loc = m.group(1).strip() if m else "near"
	return self._gen(f"...Fine. You'll find it {loc}.", user)
	return self._gen("I hear you. Stay a while and talk.", user)