"""Local Ollama client (real calls, not mocks). Fully local Gemma inference → earns Off the Grid + Llama Champion by construction (Ollama runs on llama.cpp). The UI surfaces which model ran and whether it was a real call or the deterministic fallback (never crash the demo). `gemma4:e4b` is the default (= gemma4:latest, 9.6GB). `gemma4:e2b` if CPU latency demands. NOTE: `gemma4:4b` does NOT exist — never use that tag. """ from __future__ import annotations import json import os MODEL = os.environ.get("CHIEF_ENGINEER_MODEL", "gemma4:e4b") MODAL_API_URL = os.environ.get("CHIEF_ENGINEER_MODAL_URL", "https://kylebrodeur--microfactory-node-inference-serve.modal.run/v1/chat/completions") # Backend select. Default "ollama" keeps local/recording behavior IDENTICAL. # CHIEF_ENGINEER_BACKEND is the *initial default* (e.g. a Space var = zerogpu); it is # NOT a hard lock. The in-app model switcher changes the backend at runtime by setting # the env var, so routing reads it dynamically via _backend() — a fixed Space var would # otherwise freeze the ZeroGPU<->Modal switch. BACKEND keeps the startup value for # back-compat/logging. Unknown/import-fail → ollama. BACKEND = os.environ.get("CHIEF_ENGINEER_BACKEND", "ollama").lower() def _backend() -> str: """Active backend, read dynamically so the model switcher's selection takes effect at runtime (the dropdown sets CHIEF_ENGINEER_BACKEND via app._apply_model_choice).""" return os.environ.get("CHIEF_ENGINEER_BACKEND", BACKEND).lower() try: import ollama # type: ignore except Exception: # pragma: no cover ollama = None # type: ignore def _zerogpu(): """Lazily import the ZeroGPU backend; None unless selected and importable.""" if _backend() != "zerogpu": return None try: from . import llm_zerogpu # heavy deps are import-guarded inside return llm_zerogpu except Exception: return None def _modal_api(): """Lazily check if Modal API backend is selected.""" if _backend() != "modal": return None return True # Modal API is always available (HTTP endpoint) def _forced_offline() -> bool: """Force the deterministic fallback path regardless of any daemon/backend. Read dynamically (not cached) so tests can toggle it. Used by the offline core suite so `make test` never touches Ollama, even when `ollama serve` is up.""" return os.environ.get("CHIEF_ENGINEER_OFFLINE", "").lower() in ("1", "true", "yes") def is_available() -> bool: """True if the active backend can serve a real call.""" if _forced_offline(): return False zg = _zerogpu() if zg is not None: return zg.is_available() if _modal_api(): return True if ollama is None: return False try: ollama.list() return True except Exception: return False def backend_status() -> str: zg = _zerogpu() if zg is not None: return zg.backend_status() if _modal_api(): return f" live · Modal API (remote GPU)" return (f" live · {MODEL} (local Ollama)" if is_available() else f" offline fallback · " f"{MODEL} unreachable (deterministic)") def warm_up() -> str: """Pay the model's cold start now (off-camera), so the first real BUILD is fast. On ZeroGPU this enters the GPU window and loads the model; on Ollama/fallback it is a cheap no-op. On Modal API it's a no-op (Modal handles its own warm-up). Returns the (post-load) backend status. Never raises.""" zg = _zerogpu() if zg is not None: try: return zg.warm() except Exception: return backend_status() return backend_status() def chat_json(system: str, user: str, temperature: float = 0.4) -> dict | None: """One JSON-mode chat turn. Returns parsed dict, or None to signal fallback.""" if _forced_offline(): return None zg = _zerogpu() if zg is not None: try: return zg.chat_json(system, user, temperature) except Exception: return None if _modal_api(): try: import urllib.request body = json.dumps({ "messages": [{"role": "user", "content": f"{system}\n\n{user}"}], "max_tokens": 512, "temperature": temperature, }).encode() req = urllib.request.Request(MODAL_API_URL, data=body, headers={"Content-Type": "application/json"}) with urllib.request.urlopen(req, timeout=120) as resp: data = json.loads(resp.read()) text = data["choices"][0]["message"]["content"].strip() if text.startswith("```"): text = text.strip("`").lstrip() if text[:4].lower() == "json": text = text[4:] return json.loads(text) except Exception: return None if not is_available(): return None try: resp = ollama.chat( model=MODEL, messages=[ {"role": "system", "content": system}, {"role": "user", "content": user}, ], format="json", options={"temperature": temperature}, ) content = resp["message"]["content"].strip() # Fence-strip safety net (GEMMA-STEERING Technique 2): small Gemmas can # wrap JSON in ```json fences even in JSON mode. Strip before parsing. if content.startswith("```"): content = content.strip("`").lstrip() if content[:4].lower() == "json": content = content[4:] return json.loads(content) except Exception: return None