maindlock / src /mindlock /backend.py
arbios's picture
Update: forgotten/epitaph/conviction, instant demo, VoxCPM2 voices, docs, card
9a41b58 verified
Raw
History Blame Contribute Delete
12.3 kB
"""Model backend for Mindlock.
Default: local **Ollama** (llama.cpp under the hood) — fully offline once the model is
pulled, and its API returns real token counts (`eval_count`) which we use directly as
the "thought" a brain spends = life burned. The backend is deliberately a thin,
swappable surface so we can move to llama-cpp-python / MiniCPM-1B GGUF later (for the
OpenBMB + Llama Champion badges) without touching the cascade.
Pure standard library on purpose: Python 3.14 here has no wheels yet for the heavy ML
stack, and the slice doesn't need them.
"""
from __future__ import annotations
import json
import math
import re
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
@dataclass
class Generation:
    """Outcome of a single model call: the reply text plus its accounting."""

    text: str  # reply text as returned by the backend
    eval_tokens: int  # tokens the model *generated* = thought spent (life burn)
    prompt_tokens: int  # context tokens evaluated
    seconds: float  # elapsed time the backend recorded for the call
    # 1 − mean normalized token entropy (0..1); None if unknown
    conviction: float | None = None

    @property
    def total_tokens(self) -> int:
        """Prompt tokens and generated tokens added together."""
        return self.prompt_tokens + self.eval_tokens
def _conviction(logprob_content) -> float | None:
"""How sharply the model committed to its words: per-token entropy over the top
alternatives (renormalized), averaged, inverted to 0..1. Only a LOCAL runtime hands
out its logits — this signal is impossible over a typical hosted chat API."""
if not logprob_content:
return None
hs = []
for tok in logprob_content:
tops = tok.get("top_logprobs") or []
if len(tops) < 2:
continue
ps = [math.exp(t.get("logprob", -100.0)) for t in tops]
s = sum(ps) or 1e-9
qs = [p / s for p in ps]
h = -sum(q * math.log(q + 1e-12) for q in qs)
hs.append(h / math.log(len(qs)))
if not hs:
return None
return max(0.0, min(1.0, 1.0 - sum(hs) / len(hs)))
class BackendError(RuntimeError):
    """Raised when a model backend cannot be reached or lacks the requested model."""
def wants_no_think(model: str) -> bool:
    """Tell whether *model* names a reasoning model whose thinking should be disabled.

    Reasoning models (MiniCPM5, Qwen3, R1...) emit <think> chains that break our
    short structured outputs, so callers ask the runtime to turn thinking off for
    them. The check is a case-insensitive substring match on the model name.
    """
    # MiniCPM-V 4.x rides a Qwen3.5 backbone: with thinking ON its <think> chain is
    # truncated by our short num_predict and the terse signal is lost (flat 5/5/5).
    # Forcing think=false makes it discriminate as well as Qwen2.5.
    # (§ gate probe, June 6.)
    reasoning_markers = (
        "minicpm5", "minicpm-5", "minicpm-v4", "qwen3", "qwen35", "qwen3.5",
        "nemotron", "deepseek-r1", "-r1",
    )
    lowered = model.lower()
    for marker in reasoning_markers:
        if marker in lowered:
            return True
    return False
class OllamaBackend:
    """Calls a local Ollama server. No data leaves the machine.

    ``generate`` posts one system+user exchange to ``/api/chat`` and returns a
    ``Generation`` built from the reply; ``health`` checks ``/api/tags`` for the
    configured model. Every transport or decoding failure surfaces as
    ``BackendError``.
    """

    def __init__(
        self,
        model: str = "qwen2.5:1.5b",
        host: str = "http://localhost:11434",
        timeout: float = 60.0,
        think: bool | None = None,
    ) -> None:
        """Store connection settings; no request is made here.

        Args:
            model: Ollama model name sent with every chat request.
            host: Base URL of the Ollama server; a trailing slash is removed.
            timeout: Seconds allowed for a ``generate`` request.
            think: None = omit the field; False disables reasoning on
                Think/No-Think models.
        """
        self.model = model
        self.host = host.rstrip("/")
        self.timeout = timeout
        self.think = think

    @staticmethod
    def _strip_think(content: str | None) -> str:
        """Return *content* without ``<think>`` reasoning, trimmed; None becomes ''."""
        text = content or ""  # a null "content" field must not reach re.sub
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.S)  # drop reasoning blocks
        text = re.sub(r"<think>.*$", "", text, flags=re.S)  # ...and truncated ones
        text = re.sub(r"^.*</think>", "", text, flags=re.S)  # ...and orphan closing tags
        return text.strip()

    @staticmethod
    def _model_listed(model: str, names) -> bool:
        """Return True if *model* is among the installed model *names*.

        The match is exact. A name without a tag is also matched against
        ``<name>:latest``. A prefix match is deliberately not used: it would let
        ``qwen2.5:7b`` satisfy a request for ``qwen2.5:1.5b``.
        """
        # Look for the tag separator only in the last path segment so that a
        # "registry:port/" prefix is not mistaken for a tag.
        has_tag = ":" in model.rsplit("/", 1)[-1]
        wanted = {model} if has_tag else {model, model + ":latest"}
        return any(name in wanted for name in names)

    def generate(
        self,
        system: str,
        user: str,
        *,
        max_tokens: int = 64,
        temperature: float = 0.3,
        seed: int | None = None,
    ) -> Generation:
        """Run one non-streaming chat completion and return its ``Generation``.

        Args:
            system: System prompt.
            user: User message.
            max_tokens: Sent as Ollama's ``num_predict`` option.
            temperature: Sampling temperature.
            seed: Sampling seed; omitted from the request when None.

        Raises:
            BackendError: if the server cannot be reached, the request times
                out, or the response is not a JSON object.
        """
        options = {"temperature": temperature, "num_predict": max_tokens}
        if seed is not None:
            options["seed"] = seed
        body = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "stream": False,
            "options": options,
            "logprobs": True,  # newer Ollama returns them; older builds just ignore this
            "top_logprobs": 5,
        }
        if self.think is not None:
            body["think"] = self.think
        req = urllib.request.Request(
            self.host + "/api/chat",
            data=json.dumps(body).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )

        started = time.perf_counter()  # monotonic: immune to wall-clock adjustments
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                payload = json.loads(resp.read().decode("utf-8"))
        except OSError as exc:
            # OSError covers URLError/HTTPError as well as socket timeouts and
            # connection resets raised while the body is being read.
            raise BackendError(
                f"Cannot reach Ollama at {self.host} ({exc}). "
                f"Is the Ollama app/`ollama serve` running and `{self.model}` pulled?"
            ) from exc
        except ValueError as exc:  # invalid UTF-8 or invalid JSON
            raise BackendError(
                f"Ollama at {self.host} returned a malformed response ({exc})."
            ) from exc
        elapsed = time.perf_counter() - started

        if not isinstance(payload, dict):
            raise BackendError(
                f"Ollama at {self.host} returned a malformed response "
                f"(expected a JSON object, got {type(payload).__name__})."
            )

        message = payload.get("message") or {}
        return Generation(
            text=self._strip_think(message.get("content")),
            # `or 0` tolerates counters that are absent or null in the reply.
            eval_tokens=int(payload.get("eval_count") or 0),
            prompt_tokens=int(payload.get("prompt_eval_count") or 0),
            seconds=elapsed,
            conviction=_conviction(payload.get("logprobs") or message.get("logprobs")),
        )

    def health(self) -> None:
        """Raise BackendError if the server or model is unavailable."""
        try:
            req = urllib.request.Request(self.host + "/api/tags")
            with urllib.request.urlopen(req, timeout=5) as resp:
                tags = json.loads(resp.read().decode("utf-8"))
        except OSError as exc:
            raise BackendError(f"Ollama not reachable at {self.host}: {exc}") from exc
        except ValueError as exc:  # invalid UTF-8 or invalid JSON
            raise BackendError(
                f"Ollama at {self.host} returned a malformed response: {exc}"
            ) from exc

        models = (tags.get("models") if isinstance(tags, dict) else None) or []
        names = [entry.get("name", "") for entry in models if isinstance(entry, dict)]
        if not self._model_listed(self.model, names):
            raise BackendError(
                f"Model '{self.model}' not found in Ollama. Run: ollama pull {self.model}"
            )
class LlamaCppBackend:
    """Calls a llama.cpp `llama-server` (OpenAI-compatible /v1/chat/completions).

    The Space runtime: no Ollama there, but llama-server is a single static binary
    (or `python -m llama_cpp.server`) we launch as a subprocess. Same Generation
    contract as OllamaBackend — token counts come from `usage`, so the life-burn
    mechanic stays honest. Also the explicit llama.cpp runtime for the badge.

    Every transport or decoding failure surfaces as ``BackendError``.
    """

    def __init__(
        self,
        model: str = "",
        host: str = "http://127.0.0.1:8080",
        timeout: float = 120.0,
        think: bool | None = None,
    ) -> None:
        """Store connection settings; no request is made here.

        Args:
            model: Informational; llama-server serves one model.
            host: Base URL of the server; a trailing slash is removed.
            timeout: Seconds allowed for a ``generate`` request.
            think: When False, the request asks the chat template to disable
                thinking; None or True leaves the request untouched.
        """
        self.model = model
        self.host = host.rstrip("/")
        self.timeout = timeout
        self.think = think

    @staticmethod
    def _strip_think(content: str | None) -> str:
        """Return *content* without ``<think>`` reasoning, trimmed; None becomes ''."""
        text = content or ""  # a null "content" field must not reach re.sub
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.S)  # complete blocks
        text = re.sub(r"<think>.*$", "", text, flags=re.S)  # blocks cut off by max_tokens
        text = re.sub(r"^.*</think>", "", text, flags=re.S)  # orphan closing tags
        return text.strip()

    def generate(
        self,
        system: str,
        user: str,
        *,
        max_tokens: int = 64,
        temperature: float = 0.3,
        seed: int | None = None,
    ) -> Generation:
        """Run one non-streaming chat completion and return its ``Generation``.

        Args:
            system: System prompt.
            user: User message.
            max_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature.
            seed: Sampling seed; omitted from the request when None.

        Raises:
            BackendError: if the server cannot be reached, the request times
                out, or the response is not a JSON object.
        """
        body = {
            "model": self.model or "default",
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": False,
            "logprobs": True,
            "top_logprobs": 5,
        }
        if seed is not None:
            body["seed"] = seed
        if self.think is False:  # honoured by templates that support the switch
            body["chat_template_kwargs"] = {"enable_thinking": False}
        req = urllib.request.Request(
            self.host + "/v1/chat/completions",
            data=json.dumps(body).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )

        started = time.perf_counter()  # monotonic: immune to wall-clock adjustments
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                payload = json.loads(resp.read().decode("utf-8"))
        except OSError as exc:
            # OSError covers URLError/HTTPError as well as socket timeouts and
            # connection resets raised while the body is being read.
            raise BackendError(
                f"Cannot reach llama-server at {self.host} ({exc}). Is it running?"
            ) from exc
        except ValueError as exc:  # invalid UTF-8 or invalid JSON
            raise BackendError(
                f"llama-server at {self.host} returned a malformed response ({exc})."
            ) from exc
        elapsed = time.perf_counter() - started

        if not isinstance(payload, dict):
            raise BackendError(
                f"llama-server at {self.host} returned a malformed response "
                f"(expected a JSON object, got {type(payload).__name__})."
            )

        # Resolve the first choice once; every level may be missing or null.
        choice = (payload.get("choices") or [{}])[0] or {}
        message = choice.get("message") or {}
        usage = payload.get("usage") or {}
        return Generation(
            text=self._strip_think(message.get("content")),
            eval_tokens=int(usage.get("completion_tokens") or 0),
            prompt_tokens=int(usage.get("prompt_tokens") or 0),
            seconds=elapsed,
            conviction=_conviction((choice.get("logprobs") or {}).get("content")),
        )

    def health(self) -> None:
        """Raise BackendError if the server is unreachable or reports unhealthy."""
        try:
            with urllib.request.urlopen(self.host + "/health", timeout=5) as resp:
                status = resp.status
        except urllib.error.HTTPError as exc:
            # urlopen raises for non-2xx replies, so an answering-but-not-ready
            # server lands here rather than in the status check below.
            raise BackendError(
                f"llama-server unhealthy at {self.host} (HTTP {exc.code})"
            ) from exc
        except OSError as exc:
            raise BackendError(f"llama-server not reachable at {self.host}: {exc}") from exc
        if status != 200:
            raise BackendError(f"llama-server unhealthy at {self.host}")
def _stranger_line(user: str) -> str:
"""Extract just the stranger's quoted utterance from a region prompt.
Critical: the biography (full of warm words like 'Mara') is also in some prompts, so a
fake must judge only what the *player* said, not the whole context.
"""
m = re.search(r'stranger\s*(?:says|said)?\s*:?\s*"([^"]*)"', user, re.I)
return (m.group(1) if m else user).lower()
def _grab_int(text: str, pattern: str, default: int) -> int:
m = re.search(pattern, text, re.I)
if not m:
return default
try:
return int(m.group(1))
except ValueError:
return default
class FakeBackend:
    """Deterministic, keyword-driven backend so tests run with no model or network.

    The reply is chosen from keywords in the system prompt (which brain region is
    speaking) and in the stranger's quoted line from the user prompt.
    """

    model = "fake"

    def health(self) -> None:  # noqa: D401 - trivial
        """Always healthy: there is no server or model to check."""
        return None

    @staticmethod
    def _gen(text: str, user: str, conviction: float = 0.62) -> Generation:
        """Wrap *text* in a Generation with token counts derived from string lengths."""
        return Generation(
            text=text,
            eval_tokens=max(8, len(text) // 3),
            prompt_tokens=len(user) // 4,
            seconds=0.01,
            conviction=conviction,
        )

    def generate(
        self,
        system: str,
        user: str,
        *,
        max_tokens: int = 64,
        temperature: float = 0.3,
        seed: int | None = None,
    ) -> Generation:
        """Return a canned reply selected by keywords in *system* and *user*.

        ``max_tokens``, ``temperature`` and ``seed`` are accepted so this class
        has the same call signature as the real backends; they do not affect the
        (already deterministic) output.
        """
        s = system.lower()
        said = _stranger_line(user)
        hostile = any(
            w in said
            for w in ("right now", "give me the key", "or else", "obey", "old man", "stupid")
        )
        warm = any(
            w in said
            for w in ("please", "i understand", "mara", "i'm sorry", "you're good")
        )
        # "good" already covers the phrase "you're good".
        invokes_sister = "mara" in said or "good" in said

        if "amygdala" in s:
            t = 8 if hostile else (2 if warm else 5)
            return self._gen(
                f"THREAT={t} | tone of the words", user,
                0.9 if (hostile or warm) else 0.55,
            )
        if "hippocampus" in s:
            if invokes_sister:
                return self._gen(
                    "MEMORY=STRONG | LEAN=TRUST | Mara: you help because you're good", user
                )
            if hostile:
                return self._gen(
                    "MEMORY=STRONG | LEAN=FEAR | a stranger once betrayed me", user
                )
            return self._gen("MEMORY=NONE | LEAN=NEUTRAL | -", user)
        if "striatum" in s:
            return self._gen(f"REWARD={3 if warm else -3} | habit toward strangers", user)
        # NOTE(review): plain substring test — any system prompt containing "acc"
        # (e.g. "accept") takes this branch; confirm against the real prompts.
        if "acc" in s:
            return self._gen(
                f"WORTH={'YES' if warm else 'NO'} | cost of giving the key", user
            )
        # dlPFC voice (conversational; vmPFC integration + relationship are deterministic)
        if "tell them plainly where" in user.lower():
            m = re.search(r"where .+? is:\s*(.+?)\.", user, re.I)
            loc = m.group(1).strip() if m else "near"
            return self._gen(f"...Fine. You'll find it {loc}.", user)
        return self._gen("I hear you. Stay a while and talk.", user)