smolcode / engine /config.py
seanpoyner's picture
Upload folder using huggingface_hub
daea45b verified
Raw
History Blame Contribute Delete
12.9 kB
"""Backend presets for smolcode.
smolcode always talks to ONE OpenAI-compatible endpoint. A "preset" just
selects the base_url and the model *tiers* the router may escalate through.
Everything is overridable by environment variables so the same code runs on a
laptop, inside an HF Space, or against the hal-9000 "home supercomputer".
Env overrides (highest priority):
SMALLCODE_PRESET space | laptop | hal | hal-smol | hal-matrix (default: hal-matrix)
SMALLCODE_BASE_URL OpenAI-compatible /v1 URL
SMALLCODE_API_KEY bearer token (most local servers ignore it)
SMALLCODE_MODEL force a single model (disables tiering)
"""
from __future__ import annotations
import os
import re
from dataclasses import dataclass, field
@dataclass(frozen=True)
class Tier:
    """A single step on the model ladder (immutable).

    `name` is the label the router shows in the UI; `model` is the model tag
    that belongs to this step.
    """

    name: str   # UI label, e.g. "3B"
    model: str  # model tag, e.g. "granite4.1:3b"
@dataclass(frozen=True)
class Preset:
    """One backend: where to connect and which model tiers are available.

    Holds the preset key, the endpoint base URL, the API key and the ordered
    list of tiers.
    """

    key: str
    base_url: str
    api_key: str
    # Sorted from cheapest to most expensive: the router begins at the first
    # entry and escalates from there.
    tiers: list[Tier] = field(default_factory=list)

    @property
    def default_model(self) -> str:
        """Model tag of the first (cheapest) tier.

        Raises IndexError when `tiers` is empty.
        """
        entry_tier = self.tiers[0]
        return entry_tier.model
@dataclass(frozen=True)
class SpecialistLadder:
    """The size ladder of a single specialist family.

    Built from the same Tier entries as a Preset, ordered cheap -> expensive.
    """

    specialty: str
    # Rungs for this specialty; starts empty unless given.
    tiers: list[Tier] = field(default_factory=list)
@dataclass(frozen=True)
class SpecialistPreset(Preset):
    """A Preset with a two-dimensional escalation space: specialty -> size ladder.

    It extends Preset so that existing readers of .base_url/.api_key/.tiers/
    .default_model (bench, builder, agent) keep working unchanged. The inherited
    `tiers` is the GENERIC fallback ladder; `ladders` maps each specialty key to
    its own rungs.
    """

    ladders: dict[str, SpecialistLadder] = field(default_factory=dict)

    def ladder_for(self, specialty: str) -> SpecialistLadder:
        """Return the ladder registered for `specialty`.

        When no ladder is registered under that key, or the registered ladder
        has no tiers, a ladder labelled "general" that wraps the generic
        `tiers` is returned instead.
        """
        candidate = self.ladders.get(specialty)
        if not candidate or not candidate.tiers:
            return SpecialistLadder(specialty="general", tiers=self.tiers)
        return candidate
# Workstation preset: a local Ollama exposing an OpenAI-compatible API at :11435/v1.
# NOTE: the entry model is a tool-TUNED 3B (granite4.1:3b), not a coder model.
# Tiny coder models (qwen2.5-coder:3b) write the call out as ```json text rather
# than emitting native `tool_calls`, which LiteForge's agent loop can't execute.
# Granite-3B (also <=4B, so Tiny-Titan-eligible) emits native tool_calls. The
# dual-mode parser (P1) will let qwen-coder back in for code quality.
_LAPTOP = Preset(
    key="laptop",
    base_url="http://localhost:11435/v1",
    api_key="ollama",
    tiers=[Tier(name="3B", model="granite4.1:3b")],
)
# Submission Space preset: one tiny model served by llama.cpp's llama-server.
# It is deliberately a single <=4B model so the Tiny Titan claim is unambiguous.
# The port comes from SMALLCODE_LLAMA_PORT: 8080 inside the Space, while local
# dev on the workstation uses 8088 because Guacamole/Tomcat already owns 8080.
# llama-server ignores the model name and serves whatever GGUF was loaded.
_LLAMA_PORT = os.getenv("SMALLCODE_LLAMA_PORT", "8080")
_SPACE = Preset(
    key="space",
    base_url=f"http://127.0.0.1:{_LLAMA_PORT}/v1",
    api_key="local",
    tiers=[Tier(name="3B", model="qwen2.5-coder-3b-instruct-q4_k_m.gguf")],
)
# hal-9000 (DGX Spark): the full tiered router. It points straight at hal's
# Ollama (:11434/v1), which serves every pulled model over one OpenAI-compatible
# endpoint with native tool_calls — simpler than LiteLLM (whose :4000 exposed no
# models). The entry tier is a TOOL-TUNED model (granite4.1:3b) that reliably
# drives the loop; harder work escalates to the larger Granite tiers listed below.
# (Tiny coder models can't native-tool-call — see the laptop preset note.)
_HAL = Preset(
    key="hal",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.getenv("SMALLCODE_API_KEY", "ollama"),
    # All-Granite ladder: every tier emits native tool_calls on Ollama (verified
    # on hal) and every tier is <=32B. NOTE: qwen2.5-coder does NOT
    # native-tool-call on Ollama at ANY size (3b/14b text-emit the call), so
    # bringing the Qwen *coder* models in (for the benchmark story) requires the
    # dual-mode parser (see task 6).
    tiers=[
        Tier(name="3B", model="granite4.1:3b"),
        Tier(name="8B", model="granite4.1:8b"),
        Tier(name="30B", model="granite4.1:30b"),
    ],
)
# hal-9000 with the fine-tuned coder as the entry tier. The finetune/ pipeline
# trains Qwen2.5-Coder-1.5B to emit native <tool_call> (see finetune/README.md),
# so once it is served on hal's Ollama it can be the cheap first rung, and we
# escalate to Granite only on verification failure. The served tag is read from
# SMALLCODE_SMOL_MODEL (the default matches the published model name): import
# the GGUF into Ollama under that tag, or point SMALLCODE_BASE_URL at a
# llama-server.
_SMOL_MODEL = os.getenv("SMALLCODE_SMOL_MODEL", "smolcode-coder-1.5b:tools")
_HAL_SMOL = Preset(
    key="hal-smol",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.getenv("SMALLCODE_API_KEY", "ollama"),
    tiers=[
        Tier(name="1.5B-tuned", model=_SMOL_MODEL),
        Tier(name="8B", model="granite4.1:8b"),
        Tier(name="30B", model="granite4.1:30b"),
    ],
)
# --- the 2D specialist matrix (hal-matrix preset) ----------------------------
# One model per language/function (smolcode-coder-{specialty}-{size}:tools),
# served on hal's Ollama. The router classifies the task's specialty, picks that
# family's size ladder and escalates within it — then into the generic Granite
# ladder at the top. Tags are derived by CONVENTION + served-tag discovery, so
# adding a specialist is a serving action, not a code edit.
_SPECIALIST_SIZES = (
    "1.5b",
    "3b",
    "7b",  # deferred, but recognized if served
)
_SPECIALTIES = (
    "py",
    "js",
    "bash",
    "git",
    "dotnet",
    "csharp",
    "java",
    "powershell",
    "rust",
    "docker",
    "bsd",
    "go",
    "sql",
    "cpp",
    "terraform",
    "orchestrate",  # task_batch / parallel fan-out specialist
)
# The tag pattern can be overridden, so one env var repoints the whole matrix.
# Back-compat: a value WITHOUT a "{specialty}" placeholder is treated as a
# legacy single tag.
_SMOL_PATTERN = os.getenv(
    "SMALLCODE_SMOL_MODEL",
    "smolcode-coder-{specialty}-{size}:tools",
)
# Size parsing + specialty detection, shared by the model picker (the Tiny-Titan
# <=32B display filter, which collapses the 16-per-size specialty fine-tunes into
# one "Auto" entry per size). Mirrors smolcode-cli/src/router.rs parse_size_b and
# the size_b() regex in tests/test_matrix_routing.py.
_SIZE_RE = re.compile(r"(\d+(?:\.\d+)?)b\b", re.IGNORECASE)


def parse_size_b(model: str) -> float:
    """Extract the parameter count, in billions, from a model tag.

    The LAST '<n>b' group in the tag wins: 'granite4.1:30b' -> 30.0 and
    'smolcode-coder-py-1.5b:tools' -> 1.5. A tag with no such group (or an
    empty/None tag) yields 0.0, so models of unknown size pass a '<=32B'
    filter instead of being hidden.
    """
    size_groups = _SIZE_RE.findall(model or "")
    if not size_groups:
        return 0.0
    return float(size_groups[-1])
def is_specialty_model(model: str) -> bool:
    """Report whether a tag names a per-specialty fine-tune.

    A tag qualifies when, compared case-insensitively, it begins with
    'smolcode-coder-<specialty>-' for one of the known specialties. An empty or
    None tag never qualifies.
    """
    tag = (model or "").lower()
    known_prefixes = tuple(f"smolcode-coder-{s}-" for s in _SPECIALTIES)
    return tag.startswith(known_prefixes)
def specialist_sizes(preset: "Preset") -> list[str]:
    """List the distinct specialist sizes (<=32B) found in a preset's ladders.

    Only tiers whose model is a per-specialty fine-tune count. The result is
    ordered smallest first (e.g. ['1.5b', '3b']). A preset without a `ladders`
    attribute, or with an empty one, yields an empty list.
    """
    ladders = getattr(preset, "ladders", {}) or {}
    labels: dict[float, str] = {}
    for ladder in ladders.values():
        for tier in ladder.tiers:
            tag = tier.model
            if not is_specialty_model(tag):
                continue
            billions = parse_size_b(tag)
            if billions <= 0 or billions > 32:
                continue
            # The first tag seen for a given size supplies the display label.
            if billions not in labels:
                labels[billions] = f"{_SIZE_RE.findall(tag)[-1]}b"
    return [labels[size] for size in sorted(labels)]
# The generic Granite ladder that every specialist escalates INTO at its top
# rung (all <=32B).
_GENERIC_TIERS = [
    Tier(name="8B", model="granite4.1:8b"),
    Tier(name="30B", model="granite4.1:30b"),
]
# Static fallback set of served tags, used when /v1/models discovery is
# unavailable. Keep it in sync with what's pulled on hal; a successful
# discovery supersedes it.
_HAL_SERVED: set[str] = set().union(
    (f"smolcode-coder-{s}-1.5b:tools" for s in _SPECIALTIES),
    (f"smolcode-coder-{s}-3b:tools" for s in _SPECIALTIES),
)
# Discovery results, keyed by base_url.
_DISCOVERY_CACHE: dict[str, set[str]] = {}
def _discover_served(base_url: str, api_key: str) -> set[str]:
    """Return the model tags served at `base_url`, asking the endpoint at most once.

    Issues a GET to the OpenAI-compatible `<base_url>/models` with a bearer
    token and collects the `id` of every entry under `data`. The outcome is
    cached per base_url, including failures: any exception produces (and
    caches) an empty set, which callers treat as "fall back to _HAL_SERVED".
    """
    try:
        return _DISCOVERY_CACHE[base_url]
    except KeyError:
        pass

    tags: set[str] = set()
    try:
        import json
        import urllib.request

        endpoint = base_url.rstrip("/") + "/models"
        auth_header = {"Authorization": f"Bearer {api_key}"}
        request = urllib.request.Request(endpoint, headers=auth_header)
        # Short timeout: discovery is best effort.
        with urllib.request.urlopen(request, timeout=2) as response:
            payload = json.loads(response.read())
        tags = {entry["id"] for entry in payload.get("data", []) if "id" in entry}
    except Exception:
        tags = set()

    _DISCOVERY_CACHE[base_url] = tags
    return tags
def _build_ladder(specialty: str, served: set[str]) -> SpecialistLadder:
    """Assemble the ladder for one specialty.

    The rungs are the specialist sizes actually present in `served` (in
    _SPECIALIST_SIZES order, smallest first) followed by the generic Granite
    tiers. Sizes that are not served are skipped, so a specialist with nothing
    served gets just the generic tiers. When the tag pattern has no
    "{specialty}" placeholder, no specialist rungs are generated at all.
    """
    rungs: list[Tier] = []
    if "{specialty}" in _SMOL_PATTERN:
        candidates = (
            (size, _SMOL_PATTERN.format(specialty=specialty, size=size))
            for size in _SPECIALIST_SIZES
        )
        rungs = [
            Tier(f"{size}-{specialty}", tag)
            for size, tag in candidates
            if tag in served
        ]
    rungs += _GENERIC_TIERS
    return SpecialistLadder(specialty=specialty, tiers=rungs)
# Template for the specialist matrix. `tiers` is the generic fallback ladder
# (the inherited Preset.tiers); `ladders` starts empty and is filled in by
# load_preset, which needs the resolved base_url to build them.
_HAL_MATRIX = SpecialistPreset(
    key="hal-matrix",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.getenv("SMALLCODE_API_KEY", "ollama"),
    tiers=_GENERIC_TIERS,
    ladders={},
)
# Registry of the built-in presets, keyed by each preset's own `key`.
_PRESETS = {
    preset.key: preset
    for preset in (_LAPTOP, _SPACE, _HAL, _HAL_SMOL, _HAL_MATRIX)
}
def default_ui_model(preset: Preset, cfg: dict) -> str:
    """Pick the model the web UI should preselect.

    A truthy `model` entry in `cfg` wins and is returned as a string. Otherwise
    the preset's default model is used when the preset has tiers, and an empty
    string when it has none.
    """
    configured = cfg.get("model")
    if configured:
        return str(configured)
    return preset.default_model if preset.tiers else ""
def load_preset() -> Preset:
    """Build the active preset from the environment, config.toml and the built-ins.

    Resolution order:
      1. SMALLCODE_PRESET (lower-cased) selects a built-in preset. It defaults
         to "hal-matrix", so "Auto" routes by specialty out of the box; a key
         that is not registered selects the laptop preset.
      2. base_url is SMALLCODE_BASE_URL, else config.toml `base_url`, else the
         preset's own. api_key is SMALLCODE_API_KEY, else the preset's own.
      3. SMALLCODE_MODEL is a hard single-model override and beats everything,
         the matrix included.
      4. A matrix preset gets one ladder per specialty, built from the served
         tags (discovery, or the static _HAL_SERVED set when discovery returns
         nothing).
      5. A config.toml `model` is only a DEFAULT. If it names the preset's entry
         tier the full ladder is kept; only a different model is treated as a
         deliberate single-model choice.
    """
    preset_key = os.environ.get("SMALLCODE_PRESET", "hal-matrix").lower()
    base = _PRESETS.get(preset_key, _LAPTOP)

    # Best effort: any failure loading the Rust-side config leaves it empty.
    rust_cfg: dict = {}
    try:
        from .rust_session import load_rust_config
        rust_cfg = load_rust_config()
    except Exception:
        pass

    config_url = rust_cfg.get("base_url", base.base_url)
    base_url = os.environ.get("SMALLCODE_BASE_URL", config_url)
    api_key = os.environ.get("SMALLCODE_API_KEY", base.api_key)

    def single_model(model: str) -> Preset:
        # A plain Preset whose only tier is the given model, labelled "custom".
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=[Tier("custom", model)])

    # The env override is checked first so that it wins over the matrix as well.
    env_model = os.environ.get("SMALLCODE_MODEL")
    if env_model:
        return single_model(env_model)

    # Matrix presets ignore config.toml `model`: it must NOT silently disable
    # the matrix when the user explicitly asked for it via SMALLCODE_PRESET.
    if isinstance(base, SpecialistPreset):
        served = _discover_served(base_url, api_key)
        if not served:
            served = _HAL_SERVED
        ladders = {}
        for specialty in _SPECIALTIES:
            ladders[specialty] = _build_ladder(specialty, served)
        return SpecialistPreset(key=base.key, base_url=base_url, api_key=api_key,
                                tiers=_GENERIC_TIERS, ladders=ladders)

    # config.toml `model` that merely repeats the entry tier (the common case,
    # e.g. the CLI default == hal-smol's 1.5B entry) keeps the escalation
    # LADDER, so the router + judge still work.
    config_model = rust_cfg.get("model")
    if not config_model or not base.tiers or config_model == base.default_model:
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=base.tiers)
    return single_model(config_model)