Spaces:
Running
Running
| """Model backend for Mindlock. | |
| Default: local **Ollama** (llama.cpp under the hood) — fully offline once the model is | |
| pulled, and its API returns real token counts (`eval_count`) which we use directly as | |
| the "thought" a brain spends = life burned. The backend is deliberately a thin, | |
| swappable surface so we can move to llama-cpp-python / MiniCPM-1B GGUF later (for the | |
| OpenBMB + Llama Champion badges) without touching the cascade. | |
| Pure standard library on purpose: Python 3.14 here has no wheels yet for the heavy ML | |
| stack, and the slice doesn't need them. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import math | |
| import re | |
| import time | |
| import urllib.error | |
| import urllib.request | |
| from dataclasses import dataclass | |
@dataclass
class Generation:
    """One model call's result.

    Built with keyword arguments by every backend in this module, so it has
    to be a dataclass: without the decorator the annotated fields below would
    be bare annotations and ``Generation(text=..., ...)`` would raise
    ``TypeError``.
    """

    text: str  # reply text handed back by the backend
    eval_tokens: int  # tokens the model *generated* = thought spent (life burn)
    prompt_tokens: int  # context tokens evaluated
    seconds: float  # duration of the call as measured by the backend
    conviction: float | None = None  # 1 − mean normalized token entropy (0..1); None if unknown

    # NOTE(review): no decorator is visible on this method in the reviewed
    # text, so it is kept as a plain method — confirm callers do not expect a
    # property before changing how it is invoked.
    def total_tokens(self) -> int:
        """Return generated plus prompt tokens for this call."""
        return self.eval_tokens + self.prompt_tokens
| def _conviction(logprob_content) -> float | None: | |
| """How sharply the model committed to its words: per-token entropy over the top | |
| alternatives (renormalized), averaged, inverted to 0..1. Only a LOCAL runtime hands | |
| out its logits — this signal is impossible over a typical hosted chat API.""" | |
| if not logprob_content: | |
| return None | |
| hs = [] | |
| for tok in logprob_content: | |
| tops = tok.get("top_logprobs") or [] | |
| if len(tops) < 2: | |
| continue | |
| ps = [math.exp(t.get("logprob", -100.0)) for t in tops] | |
| s = sum(ps) or 1e-9 | |
| qs = [p / s for p in ps] | |
| h = -sum(q * math.log(q + 1e-12) for q in qs) | |
| hs.append(h / math.log(len(qs))) | |
| if not hs: | |
| return None | |
| return max(0.0, min(1.0, 1.0 - sum(hs) / len(hs))) | |
class BackendError(RuntimeError):
    """Raised by the backends when a server or model cannot be used."""
def wants_no_think(model: str) -> bool:
    """Tell whether thinking should be switched off for *model*.

    Reasoning models (MiniCPM5, Qwen3, R1...) emit <think> chains that break
    our short structured outputs; for those we ask Ollama to disable thinking.
    The check is a case-insensitive substring match on the model name.
    """
    # MiniCPM-V 4.x rides a Qwen3.5 backbone: with thinking ON its <think> chain is
    # truncated by our short num_predict and the terse signal is lost (flat 5/5/5). Forcing
    # think=false makes it discriminate as well as Qwen2.5. (§ gate probe, June 6.)
    markers = (
        "minicpm5", "minicpm-5", "minicpm-v4", "qwen3", "qwen35", "qwen3.5",
        "nemotron", "deepseek-r1", "-r1",
    )
    lowered = model.lower()
    for marker in markers:
        if marker in lowered:
            return True
    return False
class OllamaBackend:
    """Calls a local Ollama server. No data leaves the machine."""

    def __init__(
        self,
        model: str = "qwen2.5:1.5b",
        host: str = "http://localhost:11434",
        timeout: float = 60.0,
        think: bool | None = None,
    ) -> None:
        """Store connection settings; no request is made here.

        Args:
            model: Ollama model name sent with every chat request.
            host: Base URL of the server; a trailing slash is removed.
            timeout: Seconds allowed for a chat request.
            think: None = omit the flag; False disables reasoning on
                Think/No-Think models.
        """
        self.model = model
        self.host = host.rstrip("/")
        self.timeout = timeout
        self.think = think  # None = omit; False disables reasoning on Think/No-Think models

    @staticmethod
    def _visible_text(content) -> str:
        """Return *content* without ``<think>`` reasoning, stripped of whitespace."""
        text = content or ""  # a JSON null content must not reach re.sub
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.S)  # drop reasoning blocks
        text = re.sub(r"<think>.*$", "", text, flags=re.S)  # ...and truncated ones
        text = re.sub(r"^.*</think>", "", text, flags=re.S)  # ...and orphan closing tags
        return text.strip()

    @staticmethod
    def _has_model(model: str, names) -> bool:
        """Tell whether *model* appears among the installed model *names*.

        A model given with an explicit tag must be listed exactly, so
        ``qwen2.5:1.5b`` is not satisfied by ``qwen2.5:7b``. A model given
        without a tag accepts any tag of exactly that repository name, so
        ``qwen2`` no longer matches ``qwen2.5:latest``.
        """
        _, sep, tail = model.rpartition(":")
        # A colon followed by a path segment is a registry port, not a tag.
        tagged = bool(sep) and "/" not in tail
        if tagged:
            return model in names
        return any(n == model or n.rpartition(":")[0] == model for n in names)

    def generate(
        self,
        system: str,
        user: str,
        *,
        max_tokens: int = 64,
        temperature: float = 0.3,
        seed: int | None = None,
    ) -> Generation:
        """Run one non-streaming chat call against ``/api/chat``.

        Args:
            system: System prompt.
            user: User prompt.
            max_tokens: Sent as the ``num_predict`` option.
            temperature: Sent as the ``temperature`` option.
            seed: Sent as the ``seed`` option when not None.

        Returns:
            A Generation holding the reply with ``<think>`` blocks removed,
            the ``eval_count`` / ``prompt_eval_count`` token counters, the
            elapsed seconds and the conviction derived from returned logprobs.

        Raises:
            BackendError: The request failed (connection refused, HTTP error,
                timeout, dropped connection) or the reply was not a JSON
                object.
        """
        options = {"temperature": temperature, "num_predict": max_tokens}
        if seed is not None:
            options["seed"] = seed
        body = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "stream": False,
            "options": options,
            "logprobs": True,  # newer Ollama returns them; older builds just ignore this
            "top_logprobs": 5,
        }
        if self.think is not None:
            body["think"] = self.think
        req = urllib.request.Request(
            self.host + "/api/chat",
            data=json.dumps(body).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )

        started = time.perf_counter()  # monotonic: immune to wall-clock adjustments
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                payload = json.loads(resp.read().decode("utf-8"))
        except OSError as exc:
            # OSError covers URLError/HTTPError as well as timeouts and resets
            # that surface while the body is being read.
            raise BackendError(
                f"Cannot reach Ollama at {self.host} ({exc}). "
                f"Is the Ollama app/`ollama serve` running and `{self.model}` pulled?"
            ) from exc
        except ValueError as exc:
            raise BackendError(
                f"Ollama at {self.host} returned an unreadable response ({exc})."
            ) from exc
        elapsed = time.perf_counter() - started

        if not isinstance(payload, dict):
            raise BackendError(f"Ollama at {self.host} returned an unexpected response.")

        message = payload.get("message") or {}
        return Generation(
            text=self._visible_text(message.get("content")),
            # `or 0` also covers counters that are present but null.
            eval_tokens=int(payload.get("eval_count") or 0),
            prompt_tokens=int(payload.get("prompt_eval_count") or 0),
            seconds=elapsed,
            conviction=_conviction(payload.get("logprobs") or message.get("logprobs")),
        )

    def health(self) -> None:
        """Raise BackendError if the server or model is unavailable."""
        try:
            req = urllib.request.Request(self.host + "/api/tags")
            with urllib.request.urlopen(req, timeout=5) as resp:
                tags = json.loads(resp.read().decode("utf-8"))
        except OSError as exc:
            raise BackendError(f"Ollama not reachable at {self.host}: {exc}") from exc
        except ValueError as exc:
            raise BackendError(
                f"Ollama at {self.host} returned an unreadable model list: {exc}"
            ) from exc

        models = tags.get("models") if isinstance(tags, dict) else None
        names = [m.get("name", "") for m in models or []]
        if not self._has_model(self.model, names):
            raise BackendError(
                f"Model '{self.model}' not found in Ollama. Run: ollama pull {self.model}"
            )
class LlamaCppBackend:
    """Calls a llama.cpp `llama-server` (OpenAI-compatible /v1/chat/completions).

    The Space runtime: no Ollama there, but llama-server is a single static binary
    (or `python -m llama_cpp.server`) we launch as a subprocess. Same Generation
    contract as OllamaBackend — token counts come from `usage`, so the life-burn
    mechanic stays honest. Also the explicit llama.cpp runtime for the badge.
    """

    def __init__(
        self,
        model: str = "",
        host: str = "http://127.0.0.1:8080",
        timeout: float = 120.0,
        think: bool | None = None,
    ) -> None:
        """Store connection settings; no request is made here.

        Args:
            model: Informational name; sent as ``"default"`` when empty.
            host: Base URL of the server; a trailing slash is removed.
            timeout: Seconds allowed for a chat request.
            think: Only ``False`` has an effect: it asks the chat template to
                disable thinking.
        """
        self.model = model  # informational; llama-server serves one model
        self.host = host.rstrip("/")
        self.timeout = timeout
        self.think = think

    @staticmethod
    def _visible_text(content) -> str:
        """Return *content* without ``<think>`` reasoning, stripped of whitespace."""
        text = content or ""  # a JSON null content must not reach re.sub
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.S)  # complete blocks
        text = re.sub(r"<think>.*$", "", text, flags=re.S)  # block cut off mid-way
        text = re.sub(r"^.*</think>", "", text, flags=re.S)  # orphan closing tag
        return text.strip()

    def generate(
        self,
        system: str,
        user: str,
        *,
        max_tokens: int = 64,
        temperature: float = 0.3,
        seed: int | None = None,
    ) -> Generation:
        """Run one non-streaming chat completion.

        Args:
            system: System prompt.
            user: User prompt.
            max_tokens: Sent as ``max_tokens``.
            temperature: Sent as ``temperature``.
            seed: Sent as ``seed`` when not None.

        Returns:
            A Generation built from the first choice: its text with
            ``<think>`` blocks removed, the ``usage`` token counters, the
            elapsed seconds and the conviction derived from its logprobs.

        Raises:
            BackendError: The request failed (connection refused, HTTP error,
                timeout, dropped connection) or the reply was not a JSON
                object.
        """
        body = {
            "model": self.model or "default",
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": False,
            "logprobs": True,
            "top_logprobs": 5,
        }
        if seed is not None:
            body["seed"] = seed
        if self.think is False:  # honoured by templates that support the switch
            body["chat_template_kwargs"] = {"enable_thinking": False}
        req = urllib.request.Request(
            self.host + "/v1/chat/completions",
            data=json.dumps(body).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )

        started = time.perf_counter()  # monotonic: immune to wall-clock adjustments
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                payload = json.loads(resp.read().decode("utf-8"))
        except OSError as exc:
            # OSError covers URLError/HTTPError as well as timeouts and resets
            # that surface while the body is being read.
            raise BackendError(
                f"Cannot reach llama-server at {self.host} ({exc}). Is it running?"
            ) from exc
        except ValueError as exc:
            raise BackendError(
                f"llama-server at {self.host} returned an unreadable response ({exc})."
            ) from exc
        elapsed = time.perf_counter() - started

        if not isinstance(payload, dict):
            raise BackendError(f"llama-server at {self.host} returned an unexpected response.")

        # Resolve the first choice once; text and logprobs both hang off it.
        choice = (payload.get("choices") or [{}])[0] or {}
        message = choice.get("message") or {}
        usage = payload.get("usage") or {}
        return Generation(
            text=self._visible_text(message.get("content")),
            # `or 0` also covers counters that are present but null.
            eval_tokens=int(usage.get("completion_tokens") or 0),
            prompt_tokens=int(usage.get("prompt_tokens") or 0),
            seconds=elapsed,
            conviction=_conviction((choice.get("logprobs") or {}).get("content")),
        )

    def health(self) -> None:
        """Raise BackendError unless ``/health`` answers with HTTP 200."""
        try:
            with urllib.request.urlopen(self.host + "/health", timeout=5) as resp:
                status = resp.status
        except OSError as exc:
            raise BackendError(f"llama-server not reachable at {self.host}: {exc}") from exc
        if status != 200:
            raise BackendError(f"llama-server unhealthy at {self.host}")
| def _stranger_line(user: str) -> str: | |
| """Extract just the stranger's quoted utterance from a region prompt. | |
| Critical: the biography (full of warm words like 'Mara') is also in some prompts, so a | |
| fake must judge only what the *player* said, not the whole context. | |
| """ | |
| m = re.search(r'stranger\s*(?:says|said)?\s*:?\s*"([^"]*)"', user, re.I) | |
| return (m.group(1) if m else user).lower() | |
| def _grab_int(text: str, pattern: str, default: int) -> int: | |
| m = re.search(pattern, text, re.I) | |
| if not m: | |
| return default | |
| try: | |
| return int(m.group(1)) | |
| except ValueError: | |
| return default | |
class FakeBackend:
    """Deterministic, keyword-driven backend so tests run with no model or network."""

    model = "fake"

    def health(self) -> None:  # noqa: D401 - trivial
        """Always healthy: there is nothing to reach."""
        return None

    @staticmethod
    def _gen(text: str, user: str, conviction: float = 0.62) -> Generation:
        """Wrap *text* in a Generation with token counts derived from lengths.

        Declared static because it takes no ``self`` yet is called as
        ``self._gen(...)``; as a plain function the instance would be bound to
        *text* and every call would fail or misbehave.
        """
        return Generation(
            text=text,
            eval_tokens=max(8, len(text) // 3),
            prompt_tokens=len(user) // 4,
            seconds=0.01,
            conviction=conviction,
        )

    def generate(
        self,
        system: str,
        user: str,
        *,
        max_tokens: int = 64,
        temperature: float = 0.3,
        seed: int | None = None,
    ) -> Generation:
        """Answer from keywords in the prompts instead of calling a model.

        The region is picked by a substring of the lower-cased *system*
        prompt; the reply depends on keywords in the stranger's quoted line
        taken from *user*.

        Args:
            system: System prompt; selects the simulated region.
            user: User prompt containing the stranger's line.
            max_tokens: Accepted for interface parity; not used.
            temperature: Accepted for interface parity; not used.
            seed: Accepted so calls written for the other backends work here
                too; not used, the output is already deterministic.

        Returns:
            A Generation carrying the canned reply for the selected region.
        """
        s = system.lower()
        said = _stranger_line(user)
        hostile = any(
            w in said
            for w in ["right now", "give me the key", "or else", "obey", "old man", "stupid"]
        )
        warm = any(
            w in said for w in ["please", "i understand", "mara", "i'm sorry", "you're good"]
        )
        # "good" alone already covers "you're good"; both are kept as written.
        invokes_sister = "mara" in said or "you're good" in said or "good" in said

        if "amygdala" in s:
            t = 8 if hostile else (2 if warm else 5)
            return self._gen(
                f"THREAT={t} | tone of the words",
                user,
                0.9 if (hostile or warm) else 0.55,
            )
        if "hippocampus" in s:
            if invokes_sister:
                return self._gen(
                    "MEMORY=STRONG | LEAN=TRUST | Mara: you help because you're good", user
                )
            if hostile:
                return self._gen(
                    "MEMORY=STRONG | LEAN=FEAR | a stranger once betrayed me", user
                )
            return self._gen("MEMORY=NONE | LEAN=NEUTRAL | -", user)
        if "striatum" in s:
            return self._gen(f"REWARD={3 if warm else -3} | habit toward strangers", user)
        # NOTE(review): plain substring test — any system prompt containing
        # "acc" (e.g. "accept") lands here; confirm against the real prompts.
        if "acc" in s:
            return self._gen(f"WORTH={'YES' if warm else 'NO'} | cost of giving the key", user)

        # dlPFC voice (conversational; vmPFC integration + relationship are deterministic)
        if "tell them plainly where" in user.lower():
            m = re.search(r"where .+? is:\s*(.+?)\.", user, re.I)
            loc = m.group(1).strip() if m else "near"
            return self._gen(f"...Fine. You'll find it {loc}.", user)
        return self._gen("I hear you. Stay a while and talk.", user)