"""Backend presets for smolcode. smolcode always talks to ONE OpenAI-compatible endpoint. A "preset" just selects the base_url and the model *tiers* the router may escalate through. Everything is overridable by environment variables so the same code runs on a laptop, inside an HF Space, or against the hal-9000 "home supercomputer". Env overrides (highest priority): SMALLCODE_PRESET space | laptop | hal | hal-smol (default: hal) SMALLCODE_BASE_URL OpenAI-compatible /v1 URL SMALLCODE_API_KEY bearer token (most local servers ignore it) SMALLCODE_MODEL force a single model (disables tiering) """ from __future__ import annotations import os import re from dataclasses import dataclass, field @dataclass(frozen=True) class Tier: """One rung of the model ladder. `name` is what the router shows in the UI.""" name: str model: str @dataclass(frozen=True) class Preset: key: str base_url: str api_key: str # Ordered cheap -> expensive. The router starts at tiers[0] and escalates. tiers: list[Tier] = field(default_factory=list) @property def default_model(self) -> str: return self.tiers[0].model @dataclass(frozen=True) class SpecialistLadder: """One specialist family's size ladder (cheap -> expensive), reusing Tier.""" specialty: str tiers: list[Tier] = field(default_factory=list) @dataclass(frozen=True) class SpecialistPreset(Preset): """A Preset whose escalation space is 2D: specialty -> size ladder. Subclasses Preset so every existing reader of .base_url/.api_key/.tiers/ .default_model (bench, builder, agent) keeps working: the inherited `tiers` is the GENERIC fallback ladder, and `ladders` holds the per-specialty rungs. 
""" ladders: dict[str, SpecialistLadder] = field(default_factory=dict) def ladder_for(self, specialty: str) -> SpecialistLadder: """The specialist ladder for a key, or the generic ladder as a fallback.""" lad = self.ladders.get(specialty) if lad and lad.tiers: return lad return SpecialistLadder(specialty="general", tiers=self.tiers) # Local Ollama on the workstation exposes an OpenAI-compatible API at :11435/v1. # NOTE: the default model is a tool-TUNED 3B (granite4.1:3b), not a coder model. # Tiny coder models (qwen2.5-coder:3b) text-emit ```json instead of native # `tool_calls`, which LiteForge's agent loop can't execute. Granite-3B (also # <=4B, Tiny-Titan-eligible) emits native tool_calls. The dual-mode parser # (P1) will let qwen-coder back in for code quality. _LAPTOP = Preset( key="laptop", base_url="http://localhost:11435/v1", api_key="ollama", tiers=[Tier("3B", "granite4.1:3b")], ) # The submission Space: a single tiny model served by llama.cpp's llama-server. # Kept to one <=4B model so the Tiny Titan claim is unambiguous. # Port is configurable: 8080 inside the Space, but on the workstation 8080 is # taken by Guacamole/Tomcat so local dev uses SMALLCODE_LLAMA_PORT=8088. # llama-server ignores the model name and serves whatever GGUF was loaded. _LLAMA_PORT = os.environ.get("SMALLCODE_LLAMA_PORT", "8080") _SPACE = Preset( key="space", base_url=f"http://127.0.0.1:{_LLAMA_PORT}/v1", api_key="local", tiers=[Tier("3B", "qwen2.5-coder-3b-instruct-q4_k_m.gguf")], ) # hal-9000 (DGX Spark): full tiered router. Points straight at hal's Ollama # (:11434/v1), which serves every pulled model over one OpenAI-compatible # endpoint with native tool_calls — simpler than LiteLLM (whose :4000 exposed no # models). Tiny tier is a TOOL-TUNED model (granite4.1:3b) that reliably drives # the loop; escalate to bigger Qwen *coder* models for hard codegen. (Tiny coder # models can't native-tool-call — see engine/config laptop note.) 
_HAL = Preset(
    key="hal",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    # All-Granite ladder: every tier emits native tool_calls on Ollama (verified
    # on hal), all <=32B. NOTE: qwen2.5-coder does NOT native-tool-call on Ollama
    # at ANY size (3b/14b text-emit the call) — bringing the Qwen *coder* models
    # in (for the benchmark story) requires the dual-mode parser (see task 6).
    tiers=[
        Tier("3B", "granite4.1:3b"),
        Tier("8B", "granite4.1:8b"),
        Tier("30B", "granite4.1:30b"),
    ],
)

# hal-9000 with the fine-tuned coder as the entry tier. The finetune/ pipeline
# trains Qwen2.5-Coder-1.5B to emit native tool_calls (see finetune/README.md),
# so once it's served on hal's Ollama it can be the cheap first rung and we only
# escalate to Granite on verification failure. The served tag is configurable via
# SMALLCODE_SMOL_MODEL (default matches the published model name); import the GGUF
# into Ollama under that tag, or point SMALLCODE_BASE_URL at a llama-server.
_SMOL_MODEL = os.environ.get("SMALLCODE_SMOL_MODEL", "smolcode-coder-1.5b:tools")
if "{specialty}" in _SMOL_MODEL:
    # The same variable doubles as the hal-matrix tag *pattern*. A pattern is not
    # a servable tag, so hal-smol keeps the published default in that case.
    _SMOL_MODEL = "smolcode-coder-1.5b:tools"
_HAL_SMOL = Preset(
    key="hal-smol",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    tiers=[
        Tier("1.5B-tuned", _SMOL_MODEL),
        Tier("8B", "granite4.1:8b"),
        Tier("30B", "granite4.1:30b"),
    ],
)

# --- the 2D specialist matrix (hal-matrix preset) ----------------------------
# A model per language/function (smolcode-coder-{specialty}-{size}:tools), served
# on hal's Ollama. The router classifies the task's specialty, picks that family's
# size ladder, and escalates within it — then into the generic Granite ladder at
# the top. Tags are derived by CONVENTION + served-tag discovery, so adding a
# specialist is a serving action, not a code edit.
_SPECIALIST_SIZES = ("1.5b", "3b", "7b")  # 7b deferred but recognized if served.
_SPECIALTIES = (
    "py", "js", "bash", "git", "dotnet", "csharp", "java", "powershell",
    "rust", "docker", "bsd", "go", "sql", "cpp", "terraform",
    "orchestrate",  # task_batch / parallel fan-out specialist
)

# Pattern is overridable so one env var can repoint the whole matrix. Back-compat:
# a value WITHOUT a "{specialty}" placeholder is treated as a legacy single tag.
_SMOL_PATTERN = os.environ.get(
    "SMALLCODE_SMOL_MODEL", "smolcode-coder-{specialty}-{size}:tools"
)

# Size parsing + specialty detection — shared by the model picker (Tiny-Titan <=32B
# display filter, collapsing the 16-per-size specialty fine-tunes to one "Auto" entry
# per size). Mirrors smolcode-cli/src/router.rs parse_size_b and the size_b() regex in
# tests/test_matrix_routing.py.
_SIZE_RE = re.compile(r"(\d+(?:\.\d+)?)b\b", re.I)

# Tag prefixes of the per-specialty fine-tunes, precomputed for str.startswith.
_SPECIALTY_PREFIXES = tuple(f"smolcode-coder-{s}-" for s in _SPECIALTIES)


def parse_size_b(model: str) -> float:
    """Parameter count in billions from a model tag (last 'b' group), else 0.0.

    'granite4.1:30b' -> 30.0, 'smolcode-coder-py-1.5b:tools' -> 1.5.
    Unknown (including None or an empty tag) -> 0.0, so size-unknown models pass
    a '<=32B' filter rather than being hidden.
    """
    found = _SIZE_RE.findall(model or "")
    return float(found[-1]) if found else 0.0


def is_specialty_model(model: str) -> bool:
    """True if the tag is a per-specialty fine-tune (smolcode-coder-<specialty>-...).

    The comparison is case-insensitive and None/empty tags are never specialists.
    NOTE: only the built-in naming convention is matched; tags produced by an
    overridden SMALLCODE_SMOL_MODEL pattern are not recognized here.
    """
    return (model or "").lower().startswith(_SPECIALTY_PREFIXES)


def specialist_sizes(preset: "Preset") -> list[str]:
    """Distinct specialist sizes (<=32B) present in a matrix preset's ladders.

    Returns lowercase size labels, smallest first (e.g. ['1.5b', '3b']).
    Empty for non-matrix presets (anything without a non-empty `ladders` mapping).
    """
    sizes: dict[float, str] = {}
    for lad in (getattr(preset, "ladders", {}) or {}).values():
        for t in lad.tiers:
            if not is_specialty_model(t.model):
                continue
            found = _SIZE_RE.findall(t.model)
            if not found:
                continue  # specialist tag without a parsable size
            digits = found[-1]  # the last size-looking group wins, as in parse_size_b
            size_b = float(digits)
            if 0 < size_b <= 32:
                # First spelling seen for a given numeric size is the one kept.
                sizes.setdefault(size_b, f"{digits}b")
    return [sizes[k] for k in sorted(sizes)]


# Generic Granite ladder every specialist escalates INTO at its top rung (all <=32B).
_GENERIC_TIERS = [Tier("8B", "granite4.1:8b"), Tier("30B", "granite4.1:30b")]

# Static fallback set of served tags when /v1/models discovery is unavailable.
# Keep in sync with what's pulled on hal; discovery (below) supersedes it.
_HAL_SERVED: set[str] = (
    {f"smolcode-coder-{s}-1.5b:tools" for s in _SPECIALTIES}
    | {f"smolcode-coder-{s}-3b:tools" for s in _SPECIALTIES}
)

# base_url -> served tags. Failed lookups are cached too (as an empty set), so
# the 2 s discovery timeout is paid at most once per endpoint per process.
_DISCOVERY_CACHE: dict[str, set[str]] = {}


def _discover_served(base_url: str, api_key: str) -> set[str]:
    """GET the OpenAI-compatible /v1/models once (cached per base_url).

    Returns the set of served model tags. Any failure (network, HTTP, bad JSON,
    unexpected payload shape) yields an empty set; the caller falls back to
    _HAL_SERVED. Entries without a string "id" are skipped individually.
    """
    cached = _DISCOVERY_CACHE.get(base_url)
    if cached is not None:
        return cached
    served: set[str] = set()
    try:
        import json
        import urllib.request

        req = urllib.request.Request(
            base_url.rstrip("/") + "/models",
            headers={"Authorization": f"Bearer {api_key}"},
        )
        with urllib.request.urlopen(req, timeout=2) as r:
            data = json.loads(r.read())
        served = {
            m["id"]
            for m in data.get("data", [])
            if isinstance(m, dict) and isinstance(m.get("id"), str)
        }
    except Exception:
        # Deliberate best-effort: discovery must never stop a preset from loading.
        served = set()
    _DISCOVERY_CACHE[base_url] = served
    return served


def _build_ladder(specialty: str, served: set[str]) -> SpecialistLadder:
    """One specialist ladder: served specialist sizes (smallest first), then the
    generic Granite tiers.

    Missing sizes are skipped; a wholly-missing specialist yields just the generic
    tiers (ladder_for also guards this). A pattern without a "{specialty}"
    placeholder contributes no specialist rungs at all.
    """
    tiers: list[Tier] = []
    if "{specialty}" in _SMOL_PATTERN:
        seen: set[str] = set()
        for size in _SPECIALIST_SIZES:
            tag = _SMOL_PATTERN.format(specialty=specialty, size=size)
            # A pattern without "{size}" renders the same tag for every size;
            # add it once so escalation never retries the identical model.
            if tag in served and tag not in seen:
                seen.add(tag)
                tiers.append(Tier(f"{size}-{specialty}", tag))
    tiers.extend(_GENERIC_TIERS)
    return SpecialistLadder(specialty=specialty, tiers=tiers)


_HAL_MATRIX = SpecialistPreset(
    key="hal-matrix",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    # Generic fallback ladder (inherited Preset.tiers). Copied so nothing that
    # holds this preset can mutate the shared module-level list.
    tiers=list(_GENERIC_TIERS),
    ladders={},  # built lazily in load_preset (needs the resolved base_url)
)

_PRESETS = {p.key: p for p in (_LAPTOP, _SPACE, _HAL, _HAL_SMOL, _HAL_MATRIX)}


def default_ui_model(preset: Preset, cfg: dict) -> str:
    """Resolve the default model for the web UI from config and preset tiers.

    A truthy cfg["model"] wins; otherwise the preset's entry tier; otherwise "".
    """
    if cfg.get("model"):
        return str(cfg["model"])
    if preset.tiers:
        return preset.default_model
    return ""


def load_preset() -> Preset:
    """Resolve the active preset, applying env overrides and Rust config.toml.

    Precedence:
      * preset key: SMALLCODE_PRESET, default "hal-matrix"; an unknown key falls
        back to the laptop preset.
      * base_url: SMALLCODE_BASE_URL > config.toml `base_url` > preset.
      * api_key: SMALLCODE_API_KEY > preset.
      * model: SMALLCODE_MODEL forces a single "custom" tier for any preset; a
        config.toml `model` forces one only for non-matrix presets, and only
        when it differs from the preset's entry tier.
    """
    # Default to the 2D specialist matrix so "Auto" routes by specialty out of the box;
    # it auto-detects served specialists and falls back to the generic Granite ladder
    # (per-specialty: ladder_for(); whole matrix: _discover_served -> _HAL_SERVED).
    key = os.environ.get("SMALLCODE_PRESET", "hal-matrix").lower()
    base = _PRESETS.get(key, _LAPTOP)

    rust_cfg: dict = {}
    try:
        from .rust_session import load_rust_config

        loaded = load_rust_config()
        if isinstance(loaded, dict):
            rust_cfg = loaded
    except Exception:
        # Best-effort: a missing or unreadable Rust config simply contributes nothing.
        rust_cfg = {}

    base_url = os.environ.get("SMALLCODE_BASE_URL", rust_cfg.get("base_url", base.base_url))
    api_key = os.environ.get("SMALLCODE_API_KEY", base.api_key)

    # An explicit env SMALLCODE_MODEL is a hard single-model override and wins over
    # everything (including the matrix). A `model` in config.toml is only a *default*
    # — it must NOT silently disable the matrix when the user explicitly asked for it
    # via SMALLCODE_PRESET=hal-matrix.
    env_model = os.environ.get("SMALLCODE_MODEL")
    if env_model:
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=[Tier("custom", env_model)])

    if isinstance(base, SpecialistPreset):
        served = _discover_served(base_url, api_key) or _HAL_SERVED
        ladders = {s: _build_ladder(s, served) for s in _SPECIALTIES}
        return SpecialistPreset(key=base.key, base_url=base_url, api_key=api_key,
                                tiers=list(_GENERIC_TIERS), ladders=ladders)

    # A config.toml `model` is a DEFAULT, not a hard override (that's SMALLCODE_MODEL,
    # handled above). If it just names this preset's entry tier — the common case, e.g.
    # the CLI default == hal-smol's 1.5B entry — keep the full escalation LADDER (so the
    # router + judge still work). Only a model that ISN'T the preset entry is treated as
    # a deliberate single-model choice.
    forced = rust_cfg.get("model")
    if forced and base.tiers and forced != base.default_model:
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=[Tier("custom", forced)])
    # Copy the ladder so callers cannot mutate the module-level preset's list.
    return Preset(key=base.key, base_url=base_url, api_key=api_key,
                  tiers=list(base.tiers))