"""Local Ollama client (real calls, not mocks).
Fully local Gemma inference → earns Off the Grid + Llama Champion by
construction (Ollama runs on llama.cpp). The UI surfaces which model ran and
whether it was a real call or the deterministic fallback (never crash the demo).
`gemma4:e4b` is the default (= gemma4:latest, 9.6GB). `gemma4:e2b` if CPU
latency demands. NOTE: `gemma4:4b` does NOT exist — never use that tag.
"""
from __future__ import annotations
import json
import os
# Ollama model tag for local inference; CHIEF_ENGINEER_MODEL overrides the default.
MODEL = os.getenv("CHIEF_ENGINEER_MODEL", "gemma4:e4b")

# Chat-completions HTTP endpoint used when the Modal backend is selected;
# CHIEF_ENGINEER_MODAL_URL overrides the default.
MODAL_API_URL = os.getenv(
    "CHIEF_ENGINEER_MODAL_URL",
    "https://kylebrodeur--microfactory-node-inference-serve.modal.run/v1/chat/completions",
)

# Backend selection. The default, "ollama", keeps local/recording behavior IDENTICAL.
# CHIEF_ENGINEER_BACKEND only provides the *initial default* (e.g. a Space variable set
# to zerogpu); it is NOT a hard lock. The in-app model switcher changes the backend at
# runtime by setting the env var, so routing must read it dynamically via _backend() —
# otherwise a fixed Space variable would freeze the ZeroGPU<->Modal switch. BACKEND
# keeps the startup value for back-compat/logging. Unknown/import-fail → ollama.
BACKEND = os.getenv("CHIEF_ENGINEER_BACKEND", "ollama").lower()
def _backend() -> str:
    """Return the currently selected backend name, lower-cased.

    The environment is consulted on every call (falling back to the startup
    value in BACKEND) so that a runtime change to CHIEF_ENGINEER_BACKEND — the
    model switcher is described as setting it via app._apply_model_choice —
    takes effect immediately.
    """
    selected = os.environ.get("CHIEF_ENGINEER_BACKEND", BACKEND)
    return selected.lower()
# Optional dependency: `ollama` stays None when the package cannot be imported,
# and callers check for None before using it.
ollama = None  # type: ignore
try:
    import ollama  # type: ignore
except Exception:  # pragma: no cover
    pass  # a failed import leaves the None placeholder bound above
def _zerogpu():
    """Return the ZeroGPU backend module, or None.

    The module is imported lazily, and only when "zerogpu" is the active
    backend. Any import failure is swallowed and reported as None so the
    caller can fall through to another backend.
    """
    if _backend() == "zerogpu":
        try:
            from . import llm_zerogpu as zerogpu_module  # heavy deps are import-guarded inside
        except Exception:
            return None
        return zerogpu_module
    return None
def _modal_api():
    """Return True when the Modal API backend is selected, otherwise None.

    No reachability probe is made: the Modal backend is a plain HTTP endpoint
    and is treated as always available once selected.
    """
    return True if _backend() == "modal" else None
def _forced_offline() -> bool:
    """Report whether CHIEF_ENGINEER_OFFLINE requests the deterministic fallback.

    True when the variable is "1", "true" or "yes" (case-insensitive). The
    environment is read on every call rather than cached so tests can toggle
    it; the offline core suite relies on this so `make test` never touches
    Ollama, even when `ollama serve` is up.
    """
    flag = os.environ.get("CHIEF_ENGINEER_OFFLINE", "")
    return flag.lower() in {"1", "true", "yes"}
def is_available() -> bool:
    """Return True if the active backend can serve a real call.

    Checked in order: forced-offline mode (always False), the ZeroGPU backend
    (delegates to its own is_available()), the Modal backend (always True once
    selected), and finally local Ollama, which is probed with `ollama.list()`.
    """
    if _forced_offline():
        return False

    zerogpu_backend = _zerogpu()
    if zerogpu_backend is not None:
        return zerogpu_backend.is_available()

    if _modal_api():
        return True

    if ollama is None:
        # Package not importable, so there is nothing to probe.
        return False

    try:
        ollama.list()
    except Exception:
        return False
    else:
        return True
def backend_status() -> str:
    """Return a one-line, human-readable status for the active backend.

    Forced-offline mode is checked first, matching is_available() and
    chat_json(): when CHIEF_ENGINEER_OFFLINE is set every call takes the
    deterministic fallback, so the status must not claim a live backend even
    if ZeroGPU or Modal is selected.

    Otherwise the ZeroGPU backend supplies its own status string, the Modal
    backend is reported as live without a probe, and local Ollama is reported
    as live or offline depending on is_available().
    """
    offline_status = (f"● offline fallback · "
                      f"{MODEL} unreachable (deterministic)")
    if _forced_offline():
        return offline_status
    zg = _zerogpu()
    if zg is not None:
        return zg.backend_status()
    if _modal_api():
        return "● live · Modal API (remote GPU)"
    if is_available():
        return f"● live · {MODEL} (local Ollama)"
    return offline_status
def warm_up() -> str:
    """Pay the model's cold start now (off-camera), so the first real BUILD is fast.

    When the ZeroGPU backend is active this delegates to its `warm()` (which is
    expected to enter the GPU window and load the model). On Ollama, Modal, or
    the fallback it is a cheap no-op (Modal handles its own warm-up).

    In forced-offline mode no backend will ever be called, so the ZeroGPU
    warm-up is skipped as well rather than loading a model for nothing.

    Returns the (post-load) backend status. A failure inside `warm()` is
    swallowed and the current status is returned instead.
    """
    if _forced_offline():
        # Consistent with is_available()/chat_json(): offline means no backend work.
        return backend_status()
    zg = _zerogpu()
    if zg is not None:
        try:
            return zg.warm()
        except Exception:
            # Warm-up is best-effort; report the current status instead.
            return backend_status()
    return backend_status()
def _parse_json_reply(raw: str) -> dict | None:
    """Turn a model reply into a JSON object, or None if it does not contain one."""
    text = raw.strip()
    # Fence-strip safety net (GEMMA-STEERING Technique 2): small Gemmas can
    # wrap JSON in ```json fences even in JSON mode. Strip before parsing.
    if text.startswith("```"):
        text = text.strip("`").lstrip()
        if text[:4].lower() == "json":
            text = text[4:]
    try:
        parsed = json.loads(text)
    except ValueError:
        # Prose or a leftover closing fence around the payload: retry on the
        # outermost {...} span before giving up.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            return None
        try:
            parsed = json.loads(text[start:end + 1])
        except ValueError:
            return None
    # Callers are promised a dict; a bare list/string/number is not usable.
    return parsed if isinstance(parsed, dict) else None


def chat_json(system: str, user: str, temperature: float = 0.4) -> dict | None:
    """Run one JSON-mode chat turn against the active backend.

    Args:
        system: System prompt.
        user: User prompt.
        temperature: Sampling temperature forwarded to the backend.

    Returns:
        The parsed JSON object as a dict, or None to signal that the caller
        should use its deterministic fallback. None is returned in
        forced-offline mode, when the backend is unavailable, when the call
        fails for any reason, or when the reply is not a JSON object.

    Never raises: backend and parsing errors are deliberately swallowed so a
    failed call degrades to the fallback instead of crashing the app.
    """
    if _forced_offline():
        return None
    zg = _zerogpu()
    if zg is not None:
        try:
            return zg.chat_json(system, user, temperature)
        except Exception:
            return None
    if _modal_api():
        try:
            import urllib.request
            # The system prompt is folded into the single user message.
            body = json.dumps({
                "messages": [{"role": "user", "content": f"{system}\n\n{user}"}],
                "max_tokens": 512,
                "temperature": temperature,
            }).encode()
            req = urllib.request.Request(MODAL_API_URL, data=body,
                                         headers={"Content-Type": "application/json"})
            with urllib.request.urlopen(req, timeout=120) as resp:
                data = json.loads(resp.read())
            return _parse_json_reply(data["choices"][0]["message"]["content"])
        except Exception:
            return None
    if not is_available():
        return None
    try:
        resp = ollama.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            format="json",
            options={"temperature": temperature},
        )
        return _parse_json_reply(resp["message"]["content"])
    except Exception:
        return None