Spaces:
Paused
Paused
File size: 12,871 Bytes
"""Backend presets for smolcode.

smolcode always talks to ONE OpenAI-compatible endpoint. A "preset" just
selects the base_url and the model *tiers* the router may escalate through.
Everything is overridable by environment variables so the same code runs on a
laptop, inside an HF Space, or against the hal-9000 "home supercomputer".

Env overrides (highest priority):
  SMALLCODE_PRESET    space | laptop | hal | hal-smol | hal-matrix (default: hal-matrix)
  SMALLCODE_BASE_URL  OpenAI-compatible /v1 URL
  SMALLCODE_API_KEY   bearer token (most local servers ignore it)
  SMALLCODE_MODEL     force a single model (disables tiering)
"""
from __future__ import annotations
import os
import re
from dataclasses import dataclass, field
@dataclass(frozen=True)
class Tier:
    """A single step on the model ladder.

    `name` is the label the router shows in the UI; `model` is the model tag
    (for example ``granite4.1:3b``).
    """

    name: str
    model: str
@dataclass(frozen=True)
class Preset:
    """Endpoint settings plus the model ladder for one backend."""

    key: str
    base_url: str
    api_key: str
    # Rungs run from cheapest to most expensive: the router begins on the first
    # entry and escalates from there.
    tiers: list[Tier] = field(default_factory=list)

    @property
    def default_model(self) -> str:
        """Model tag of the entry (first) tier; raises IndexError if there are no tiers."""
        entry = self.tiers[0]
        return entry.model
@dataclass(frozen=True)
class SpecialistLadder:
    """Size ladder (cheap -> expensive) for a single specialist family.

    Built from the same Tier records the plain presets use.
    """

    specialty: str
    tiers: list[Tier] = field(default_factory=list)
@dataclass(frozen=True)
class SpecialistPreset(Preset):
    """A Preset with a two-dimensional escalation space: specialty -> size ladder.

    It extends Preset so that code reading .base_url/.api_key/.tiers/
    .default_model (bench, builder, agent) keeps working unchanged: the inherited
    `tiers` acts as the GENERIC fallback ladder, while `ladders` maps each
    specialty key to its own rungs.
    """

    ladders: dict[str, SpecialistLadder] = field(default_factory=dict)

    def ladder_for(self, specialty: str) -> SpecialistLadder:
        """Return the ladder registered for `specialty`, else the generic one.

        The generic ladder is also used when the registered ladder has no tiers.
        """
        found = self.ladders.get(specialty)
        if not found or not found.tiers:
            return SpecialistLadder(specialty="general", tiers=self.tiers)
        return found
# The workstation's local Ollama exposes an OpenAI-compatible API at :11435/v1.
# NOTE: the default model is a tool-TUNED 3B (granite4.1:3b), not a coder model.
# Tiny coder models (qwen2.5-coder:3b) emit the call as ```json text instead of
# native `tool_calls`, which LiteForge's agent loop can't execute. Granite-3B
# (also <=4B, so Tiny-Titan-eligible) emits native tool_calls. The dual-mode
# parser (P1) will let qwen-coder back in for code quality.
_LAPTOP = Preset(
    key="laptop",
    base_url="http://localhost:11435/v1",
    api_key="ollama",
    tiers=[Tier(name="3B", model="granite4.1:3b")],
)
# The submission Space: one tiny model served by llama.cpp's llama-server.
# Limited to a single <=4B model so the Tiny Titan claim is unambiguous.
# The port is configurable: it is 8080 inside the Space, but on the workstation
# 8080 belongs to Guacamole/Tomcat, so local dev sets SMALLCODE_LLAMA_PORT=8088.
# llama-server ignores the model name and serves whatever GGUF was loaded.
_LLAMA_PORT = os.environ.get("SMALLCODE_LLAMA_PORT", "8080")

_SPACE = Preset(
    key="space",
    base_url=f"http://127.0.0.1:{_LLAMA_PORT}/v1",
    api_key="local",
    tiers=[Tier(name="3B", model="qwen2.5-coder-3b-instruct-q4_k_m.gguf")],
)
# hal-9000 (DGX Spark): full tiered router. Points straight at hal's Ollama
# (:11434/v1), which serves every pulled model over one OpenAI-compatible
# endpoint with native tool_calls — simpler than LiteLLM (whose :4000 exposed no
# models). The entry tier is a TOOL-TUNED model (granite4.1:3b) that reliably
# drives the loop; escalation currently stays inside the Granite family, and the
# bigger Qwen *coder* models are not in this ladder yet. (Tiny coder models
# can't native-tool-call — see the laptop note in engine/config.)
_HAL = Preset(
    key="hal",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    # All-Granite ladder: every tier emits native tool_calls on Ollama (verified
    # on hal), all <=32B. NOTE: qwen2.5-coder does NOT native-tool-call on Ollama
    # at ANY size (3b/14b text-emit the call) — bringing the Qwen *coder* models
    # in (for the benchmark story) requires the dual-mode parser (see task 6).
    tiers=[
        Tier(name="3B", model="granite4.1:3b"),
        Tier(name="8B", model="granite4.1:8b"),
        Tier(name="30B", model="granite4.1:30b"),
    ],
)
# hal-9000 with the fine-tuned coder as the entry tier. The finetune/ pipeline
# trains Qwen2.5-Coder-1.5B to emit native <tool_call> (see finetune/README.md),
# so once it is served on hal's Ollama it can be the cheap first rung, with
# escalation to Granite only on verification failure. The served tag comes from
# SMALLCODE_SMOL_MODEL (the default matches the published model name); import
# the GGUF into Ollama under that tag, or point SMALLCODE_BASE_URL at a
# llama-server.
# NOTE(review): the same env var is also read as the specialist-matrix pattern
# elsewhere in this module; a value containing "{specialty}" would be used here
# verbatim as a model tag — confirm whether that combination needs guarding.
_SMOL_MODEL = os.environ.get("SMALLCODE_SMOL_MODEL", "smolcode-coder-1.5b:tools")

_HAL_SMOL = Preset(
    key="hal-smol",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    tiers=[
        Tier(name="1.5B-tuned", model=_SMOL_MODEL),
        Tier(name="8B", model="granite4.1:8b"),
        Tier(name="30B", model="granite4.1:30b"),
    ],
)
# --- the 2D specialist matrix (hal-matrix preset) ----------------------------
# One model per language/function (smolcode-coder-{specialty}-{size}:tools),
# served on hal's Ollama. The router classifies the task's specialty, picks that
# family's size ladder, and escalates within it — then into the generic Granite
# ladder at the top. Tags are derived by CONVENTION + served-tag discovery, so
# adding a specialist is a serving action, not a code edit.

# Sizes probed for each specialist; 7b is deferred but recognized if served.
_SPECIALIST_SIZES = ("1.5b", "3b", "7b")

_SPECIALTIES = (
    "py",
    "js",
    "bash",
    "git",
    "dotnet",
    "csharp",
    "java",
    "powershell",
    "rust",
    "docker",
    "bsd",
    "go",
    "sql",
    "cpp",
    "terraform",
    "orchestrate",  # task_batch / parallel fan-out specialist
)

# The pattern is overridable so one env var can repoint the whole matrix.
# Back-compat: a value WITHOUT a "{specialty}" placeholder is treated as a
# legacy single tag rather than a pattern.
_SMOL_PATTERN = os.environ.get(
    "SMALLCODE_SMOL_MODEL",
    "smolcode-coder-{specialty}-{size}:tools",
)
# Size parsing + specialty detection — shared by the model picker (Tiny-Titan
# <=32B display filter, collapsing the 16-per-size specialty fine-tunes to one
# "Auto" entry per size). Mirrors smolcode-cli/src/router.rs parse_size_b and the
# size_b() regex in tests/test_matrix_routing.py.
_SIZE_RE = re.compile(r"(\d+(?:\.\d+)?)b\b", re.I)


def parse_size_b(model: str) -> float:
    """Return the parameter count, in billions, encoded in a model tag.

    The LAST '<n>b' group in the tag wins: 'granite4.1:30b' -> 30.0 and
    'smolcode-coder-py-1.5b:tools' -> 1.5. A tag with no such group (or an empty
    / None tag) gives 0.0, so size-unknown models pass a '<=32B' filter instead
    of being hidden.
    """
    matches = _SIZE_RE.findall(model or "")
    if not matches:
        return 0.0
    return float(matches[-1])
def is_specialty_model(model: str) -> bool:
    """Tell whether a tag names a per-specialty fine-tune.

    A tag qualifies when, compared case-insensitively, it begins with
    'smolcode-coder-<specialty>-' for one of the known specialties. An empty or
    None tag never qualifies.
    """
    tag = (model or "").lower()
    prefixes = tuple(f"smolcode-coder-{name}-" for name in _SPECIALTIES)
    return tag.startswith(prefixes)
def specialist_sizes(preset: "Preset") -> list[str]:
    """List the distinct specialist sizes (<=32B) found in a preset's ladders.

    Labels come back smallest first (e.g. ['1.5b', '3b']). A preset with no
    `ladders` attribute — i.e. a non-matrix preset — gives an empty list.
    """
    labels: dict[float, str] = {}
    ladders = getattr(preset, "ladders", {}) or {}
    for ladder in ladders.values():
        for tier in ladder.tiers:
            if not is_specialty_model(tier.model):
                continue
            billions = parse_size_b(tier.model)
            if billions <= 0 or billions > 32:
                continue
            # The first tag seen for a given size decides its label.
            if billions not in labels:
                labels[billions] = f"{_SIZE_RE.findall(tier.model)[-1]}b"
    return [labels[size] for size in sorted(labels)]
# The generic Granite ladder that every specialist escalates INTO at its top
# rung (all <=32B).
_GENERIC_TIERS = [
    Tier(name="8B", model="granite4.1:8b"),
    Tier(name="30B", model="granite4.1:30b"),
]

# Static fallback set of served tags, used when /v1/models discovery is
# unavailable. Keep in sync with what's pulled on hal; discovery supersedes it.
_HAL_SERVED: set[str] = {
    f"smolcode-coder-{s}-{size}:tools"
    for s in _SPECIALTIES
    for size in ("1.5b", "3b")
}
# Per-process memo of discovery results, keyed by base_url. Failed lookups are
# stored too (as an empty set), so an unreachable server is probed only once.
_DISCOVERY_CACHE: dict[str, set[str]] = {}


def _discover_served(base_url: str, api_key: str) -> set[str]:
    """Return the model tags served at an OpenAI-compatible endpoint.

    Issues one GET to ``<base_url>/models`` (2 s timeout) and memoizes the
    answer per `base_url`, so later calls with the same URL do no I/O.

    Args:
        base_url: The endpoint's /v1 URL; a trailing slash is tolerated.
        api_key: Sent as a ``Bearer`` token in the Authorization header.

    Returns:
        The set of string ``id`` values found under the response's ``data``
        list. Entries that are not objects, or whose ``id`` is missing or not a
        string, are skipped individually instead of invalidating the whole
        listing. Any request/parse failure yields an empty set (the caller then
        falls back to _HAL_SERVED); that empty result is cached as well.
    """
    cached = _DISCOVERY_CACHE.get(base_url)
    if cached is not None:
        return cached

    served: set[str] = set()
    try:
        import json
        import urllib.request

        req = urllib.request.Request(
            base_url.rstrip("/") + "/models",
            headers={"Authorization": f"Bearer {api_key}"},
        )
        with urllib.request.urlopen(req, timeout=2) as r:
            data = json.loads(r.read())
        # `"data": null` is treated like an empty listing.
        entries = data.get("data") or []
        served = {
            m["id"]
            for m in entries
            if isinstance(m, dict) and isinstance(m.get("id"), str)
        }
    except Exception:
        # Deliberate best effort: discovery is optional, never fatal.
        served = set()

    _DISCOVERY_CACHE[base_url] = served
    return served
def _build_ladder(specialty: str, served: set[str]) -> SpecialistLadder:
    """Build one specialist's escalation ladder.

    The ladder holds the served specialist sizes (in _SPECIALIST_SIZES order,
    smallest first) followed by the generic Granite tiers. Sizes whose tag is
    not in `served` are skipped, so a wholly-missing specialist yields just the
    generic tiers (ladder_for also guards this).

    Args:
        specialty: Specialty key substituted into the tag pattern.
        served: Model tags known to be available on the endpoint.

    Returns:
        A SpecialistLadder for `specialty`; it always ends with _GENERIC_TIERS.
    """
    tiers: list[Tier] = []
    # A pattern without "{specialty}" is a legacy single tag, not a matrix
    # pattern, so it contributes no specialist rungs.
    if "{specialty}" in _SMOL_PATTERN:
        seen: set[str] = set()
        for size in _SPECIALIST_SIZES:
            try:
                tag = _SMOL_PATTERN.format(specialty=specialty, size=size)
            except (KeyError, IndexError, ValueError, AttributeError):
                # The env-supplied pattern has an unknown field or a stray
                # brace. Degrade to the generic ladder instead of crashing
                # preset loading.
                break
            # A pattern lacking "{size}" renders the same tag for every size;
            # list that model once, not once per size.
            if tag in served and tag not in seen:
                seen.add(tag)
                tiers.append(Tier(f"{size}-{specialty}", tag))
    tiers.extend(_GENERIC_TIERS)
    return SpecialistLadder(specialty=specialty, tiers=tiers)
_HAL_MATRIX = SpecialistPreset(
    key="hal-matrix",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    # Generic fallback ladder (the inherited Preset.tiers).
    tiers=_GENERIC_TIERS,
    # Filled in lazily by load_preset, which needs the resolved base_url.
    ladders={},
)

# Registry of every built-in preset, keyed by its `key`.
_PRESETS = {
    preset.key: preset
    for preset in (_LAPTOP, _SPACE, _HAL, _HAL_SMOL, _HAL_MATRIX)
}
def default_ui_model(preset: Preset, cfg: dict) -> str:
    """Pick the model the web UI should preselect.

    A truthy ``cfg["model"]`` wins and is returned as a string. Otherwise the
    preset's entry-tier model is used, or "" when the preset has no tiers.
    """
    configured = cfg.get("model")
    if configured:
        return str(configured)
    return preset.default_model if preset.tiers else ""
def load_preset() -> Preset:
    """Resolve the active preset, applying env overrides and Rust config.toml.

    Resolution order:
      * preset: SMALLCODE_PRESET, else "hal-matrix"; an unrecognized key falls
        back to the laptop preset.
      * base_url: SMALLCODE_BASE_URL, else config.toml `base_url`, else the
        preset's own URL.
      * api_key: SMALLCODE_API_KEY, else the preset's key.
      * model: SMALLCODE_MODEL forces a single "custom" tier; otherwise a matrix
        preset gets its specialist ladders built, and a plain preset keeps its
        ladder unless config.toml names a model other than its entry tier.

    Blank (empty or whitespace-only) SMALLCODE_PRESET, and empty
    SMALLCODE_BASE_URL / config `base_url`, are treated as unset so that a
    declared-but-empty variable cannot select the wrong preset or produce an
    empty endpoint URL.

    Returns:
        A new Preset (or SpecialistPreset for the matrix) with overrides applied.
    """
    # Default to the 2D specialist matrix so "Auto" routes by specialty out of the
    # box; it auto-detects served specialists and falls back to the generic
    # Granite ladder (per-specialty: ladder_for(); whole matrix:
    # _discover_served -> _HAL_SERVED).
    key = os.environ.get("SMALLCODE_PRESET", "").strip().lower() or "hal-matrix"
    base = _PRESETS.get(key, _LAPTOP)

    rust_cfg: dict = {}
    try:
        from .rust_session import load_rust_config

        # Guard against a loader that returns None when no config file exists.
        rust_cfg = load_rust_config() or {}
    except Exception:
        # Deliberate best effort: the Rust config is optional, so a missing
        # module or unreadable file just means "no config overrides".
        rust_cfg = {}

    base_url = (
        os.environ.get("SMALLCODE_BASE_URL")
        or rust_cfg.get("base_url")
        or base.base_url
    )
    api_key = os.environ.get("SMALLCODE_API_KEY", base.api_key)

    # An explicit env SMALLCODE_MODEL is a hard single-model override and wins
    # over everything (including the matrix). A `model` in config.toml is only a
    # *default* — it must NOT silently disable the matrix when the user
    # explicitly asked for it via SMALLCODE_PRESET=hal-matrix.
    env_model = os.environ.get("SMALLCODE_MODEL")
    if env_model:
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=[Tier("custom", env_model)])

    if isinstance(base, SpecialistPreset):
        # An empty discovery result (failure or nothing served) uses the static set.
        served = _discover_served(base_url, api_key) or _HAL_SERVED
        ladders = {s: _build_ladder(s, served) for s in _SPECIALTIES}
        return SpecialistPreset(key=base.key, base_url=base_url, api_key=api_key,
                                tiers=_GENERIC_TIERS, ladders=ladders)

    # A config.toml `model` is a DEFAULT, not a hard override (that's
    # SMALLCODE_MODEL, handled above). If it just names this preset's entry tier —
    # the common case, e.g. the CLI default == hal-smol's 1.5B entry — keep the
    # full escalation LADDER (so the router + judge still work). Only a model that
    # ISN'T the preset entry is treated as a deliberate single-model choice.
    forced = rust_cfg.get("model")
    if forced and base.tiers and forced != base.default_model:
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=[Tier("custom", forced)])
    return Preset(key=base.key, base_url=base_url, api_key=api_key, tiers=base.tiers)
|