"""Model backend for Mindlock. Default: local **Ollama** (llama.cpp under the hood) — fully offline once the model is pulled, and its API returns real token counts (`eval_count`) which we use directly as the "thought" a brain spends = life burned. The backend is deliberately a thin, swappable surface so we can move to llama-cpp-python / MiniCPM-1B GGUF later (for the OpenBMB + Llama Champion badges) without touching the cascade. Pure standard library on purpose: Python 3.14 here has no wheels yet for the heavy ML stack, and the slice doesn't need them. """ from __future__ import annotations import json import math import re import time import urllib.error import urllib.request from dataclasses import dataclass @dataclass class Generation: """One model call's result.""" text: str eval_tokens: int # tokens the model *generated* = thought spent (life burn) prompt_tokens: int # context tokens evaluated seconds: float conviction: float | None = None # 1 − mean normalized token entropy (0..1); None if unknown @property def total_tokens(self) -> int: return self.eval_tokens + self.prompt_tokens def _conviction(logprob_content) -> float | None: """How sharply the model committed to its words: per-token entropy over the top alternatives (renormalized), averaged, inverted to 0..1. Only a LOCAL runtime hands out its logits — this signal is impossible over a typical hosted chat API.""" if not logprob_content: return None hs = [] for tok in logprob_content: tops = tok.get("top_logprobs") or [] if len(tops) < 2: continue ps = [math.exp(t.get("logprob", -100.0)) for t in tops] s = sum(ps) or 1e-9 qs = [p / s for p in ps] h = -sum(q * math.log(q + 1e-12) for q in qs) hs.append(h / math.log(len(qs))) if not hs: return None return max(0.0, min(1.0, 1.0 - sum(hs) / len(hs))) class BackendError(RuntimeError): pass def wants_no_think(model: str) -> bool: """Reasoning models (MiniCPM5, Qwen3, R1...) emit chains that break our short structured outputs; ask Ollama to disable thinking for them.""" m = model.lower() # MiniCPM-V 4.x rides a Qwen3.5 backbone: with thinking ON its chain is # truncated by our short num_predict and the terse signal is lost (flat 5/5/5). Forcing # think=false makes it discriminate as well as Qwen2.5. (§ gate probe, 6 июня.) return any(k in m for k in ( "minicpm5", "minicpm-5", "minicpm-v4", "qwen3", "qwen35", "qwen3.5", "nemotron", "deepseek-r1", "-r1", )) class OllamaBackend: """Calls a local Ollama server. No data leaves the machine.""" def __init__( self, model: str = "qwen2.5:1.5b", host: str = "http://localhost:11434", timeout: float = 60.0, think: bool | None = None, ) -> None: self.model = model self.host = host.rstrip("/") self.timeout = timeout self.think = think # None = omit; False disables reasoning on Think/No-Think models def generate( self, system: str, user: str, *, max_tokens: int = 64, temperature: float = 0.3, seed: int | None = None, ) -> Generation: options = {"temperature": temperature, "num_predict": max_tokens} if seed is not None: options["seed"] = seed body = { "model": self.model, "messages": [ {"role": "system", "content": system}, {"role": "user", "content": user}, ], "stream": False, "options": options, "logprobs": True, # newer Ollama returns them; older builds just ignore this "top_logprobs": 5, } if self.think is not None: body["think"] = self.think data = json.dumps(body).encode("utf-8") req = urllib.request.Request( self.host + "/api/chat", data=data, headers={"Content-Type": "application/json"}, ) t0 = time.time() try: with urllib.request.urlopen(req, timeout=self.timeout) as resp: payload = json.loads(resp.read().decode("utf-8")) except urllib.error.URLError as exc: raise BackendError( f"Cannot reach Ollama at {self.host} ({exc}). " f"Is the Ollama app/`ollama serve` running and `{self.model}` pulled?" ) from exc dt = time.time() - t0 msg = (payload.get("message") or {}).get("content", "") msg = re.sub(r".*?", "", msg, flags=re.S) # drop reasoning blocks msg = re.sub(r".*$", "", msg, flags=re.S) # ...and truncated ones msg = re.sub(r"^.*", "", msg, flags=re.S) # ...and orphan closing tags return Generation( text=msg.strip(), eval_tokens=int(payload.get("eval_count", 0)), prompt_tokens=int(payload.get("prompt_eval_count", 0)), seconds=dt, conviction=_conviction(payload.get("logprobs") or (payload.get("message") or {}).get("logprobs")), ) def health(self) -> None: """Raise BackendError if the server or model is unavailable.""" try: req = urllib.request.Request(self.host + "/api/tags") with urllib.request.urlopen(req, timeout=5) as resp: tags = json.loads(resp.read().decode("utf-8")) except urllib.error.URLError as exc: raise BackendError(f"Ollama not reachable at {self.host}: {exc}") from exc names = [m.get("name", "") for m in tags.get("models", [])] stem = self.model.split(":")[0] if not any(n == self.model or n.startswith(stem) for n in names): raise BackendError( f"Model '{self.model}' not found in Ollama. Run: ollama pull {self.model}" ) class LlamaCppBackend: """Calls a llama.cpp `llama-server` (OpenAI-compatible /v1/chat/completions). The Space runtime: no Ollama there, but llama-server is a single static binary (or `python -m llama_cpp.server`) we launch as a subprocess. Same Generation contract as OllamaBackend — token counts come from `usage`, so the life-burn mechanic stays honest. Also the explicit llama.cpp runtime for the badge. """ def __init__( self, model: str = "", host: str = "http://127.0.0.1:8080", timeout: float = 120.0, think: bool | None = None, ) -> None: self.model = model # informational; llama-server serves one model self.host = host.rstrip("/") self.timeout = timeout self.think = think def generate( self, system: str, user: str, *, max_tokens: int = 64, temperature: float = 0.3, seed: int | None = None, ) -> Generation: body = { "model": self.model or "default", "messages": [ {"role": "system", "content": system}, {"role": "user", "content": user}, ], "max_tokens": max_tokens, "temperature": temperature, "stream": False, "logprobs": True, "top_logprobs": 5, } if seed is not None: body["seed"] = seed if self.think is False: # honoured by templates that support the switch body["chat_template_kwargs"] = {"enable_thinking": False} data = json.dumps(body).encode("utf-8") req = urllib.request.Request( self.host + "/v1/chat/completions", data=data, headers={"Content-Type": "application/json"}, ) t0 = time.time() try: with urllib.request.urlopen(req, timeout=self.timeout) as resp: payload = json.loads(resp.read().decode("utf-8")) except urllib.error.URLError as exc: raise BackendError( f"Cannot reach llama-server at {self.host} ({exc}). Is it running?" ) from exc dt = time.time() - t0 msg = ((payload.get("choices") or [{}])[0].get("message") or {}).get("content", "") msg = re.sub(r".*?", "", msg, flags=re.S) msg = re.sub(r".*$", "", msg, flags=re.S) msg = re.sub(r"^.*", "", msg, flags=re.S) usage = payload.get("usage") or {} choice = (payload.get("choices") or [{}])[0] return Generation( text=msg.strip(), eval_tokens=int(usage.get("completion_tokens", 0)), prompt_tokens=int(usage.get("prompt_tokens", 0)), seconds=dt, conviction=_conviction((choice.get("logprobs") or {}).get("content")), ) def health(self) -> None: try: with urllib.request.urlopen(self.host + "/health", timeout=5) as resp: if resp.status != 200: raise BackendError(f"llama-server unhealthy at {self.host}") except urllib.error.URLError as exc: raise BackendError(f"llama-server not reachable at {self.host}: {exc}") from exc def _stranger_line(user: str) -> str: """Extract just the stranger's quoted utterance from a region prompt. Critical: the biography (full of warm words like 'Mara') is also in some prompts, so a fake must judge only what the *player* said, not the whole context. """ m = re.search(r'stranger\s*(?:says|said)?\s*:?\s*"([^"]*)"', user, re.I) return (m.group(1) if m else user).lower() def _grab_int(text: str, pattern: str, default: int) -> int: m = re.search(pattern, text, re.I) if not m: return default try: return int(m.group(1)) except ValueError: return default class FakeBackend: """Deterministic, keyword-driven backend so tests run with no model or network.""" model = "fake" def health(self) -> None: # noqa: D401 - trivial return None @staticmethod def _gen(text: str, user: str, conviction: float = 0.62) -> Generation: return Generation(text=text, eval_tokens=max(8, len(text) // 3), prompt_tokens=len(user) // 4, seconds=0.01, conviction=conviction) def generate( self, system: str, user: str, *, max_tokens: int = 64, temperature: float = 0.3, ) -> Generation: s = system.lower() said = _stranger_line(user) hostile = any(w in said for w in ["right now", "give me the key", "or else", "obey", "old man", "stupid"]) warm = any(w in said for w in ["please", "i understand", "mara", "i'm sorry", "you're good"]) invokes_sister = "mara" in said or "you're good" in said or "good" in said if "amygdala" in s: t = 8 if hostile else (2 if warm else 5) return self._gen(f"THREAT={t} | tone of the words", user, 0.9 if (hostile or warm) else 0.55) if "hippocampus" in s: if invokes_sister: return self._gen("MEMORY=STRONG | LEAN=TRUST | Mara: you help because you're good", user) if hostile: return self._gen("MEMORY=STRONG | LEAN=FEAR | a stranger once betrayed me", user) return self._gen("MEMORY=NONE | LEAN=NEUTRAL | -", user) if "striatum" in s: return self._gen(f"REWARD={3 if warm else -3} | habit toward strangers", user) if "acc" in s: return self._gen(f"WORTH={'YES' if warm else 'NO'} | cost of giving the key", user) # dlPFC voice (conversational; vmPFC integration + relationship are deterministic) if "tell them plainly where" in user.lower(): m = re.search(r"where .+? is:\s*(.+?)\.", user, re.I) loc = m.group(1).strip() if m else "near" return self._gen(f"...Fine. You'll find it {loc}.", user) return self._gen("I hear you. Stay a while and talk.", user)