Spaces:
Paused
Paused
| """Backend presets for smolcode. | |
| smolcode always talks to ONE OpenAI-compatible endpoint. A "preset" just | |
| selects the base_url and the model *tiers* the router may escalate through. | |
| Everything is overridable by environment variables so the same code runs on a | |
| laptop, inside an HF Space, or against the hal-9000 "home supercomputer". | |
| Env overrides (highest priority): | |
| SMALLCODE_PRESET space | laptop | hal | hal-smol | hal-matrix (default: hal-matrix) | |
| SMALLCODE_BASE_URL OpenAI-compatible /v1 URL | |
| SMALLCODE_API_KEY bearer token (most local servers ignore it) | |
| SMALLCODE_MODEL force a single model (disables tiering) | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import re | |
| from dataclasses import dataclass, field | |
@dataclass
class Tier:
    """One rung of the model ladder.

    `name` is what the router shows in the UI; `model` is the served model tag
    (for example "granite4.1:3b").
    """

    name: str
    model: str
@dataclass
class Preset:
    """A backend selection: one OpenAI-compatible endpoint plus its model tiers."""

    key: str
    base_url: str
    api_key: str
    # Ordered cheap -> expensive. The router starts at tiers[0] and escalates.
    tiers: list[Tier] = field(default_factory=list)

    @property
    def default_model(self) -> str:
        """Model tag of the entry (first, cheapest) tier.

        Exposed as a property because readers access it as an attribute
        (`preset.default_model`). Raises IndexError when `tiers` is empty.
        """
        return self.tiers[0].model
@dataclass
class SpecialistLadder:
    """One specialist family's size ladder (cheap -> expensive), reusing Tier."""

    specialty: str
    # Each instance gets its own list so ladders never share rungs by accident.
    tiers: list[Tier] = field(default_factory=list)
@dataclass
class SpecialistPreset(Preset):
    """A Preset whose escalation space is 2D: specialty -> size ladder.

    Subclasses Preset so every existing reader of .base_url/.api_key/.tiers/
    .default_model (bench, builder, agent) keeps working: the inherited `tiers` is
    the GENERIC fallback ladder, and `ladders` holds the per-specialty rungs.
    """

    ladders: dict[str, SpecialistLadder] = field(default_factory=dict)

    def ladder_for(self, specialty: str) -> SpecialistLadder:
        """Return the specialist ladder for a key, or the generic ladder as a fallback.

        The fallback (labelled "general", built from the inherited `tiers`) is
        used when the specialty is unknown or its ladder has no tiers.
        """
        lad = self.ladders.get(specialty)
        if lad and lad.tiers:
            return lad
        return SpecialistLadder(specialty="general", tiers=self.tiers)
# Laptop preset: the workstation's local Ollama, which exposes an
# OpenAI-compatible API at :11435/v1.
# NOTE: the default model is a tool-TUNED 3B (granite4.1:3b), not a coder model.
# Tiny coder models (qwen2.5-coder:3b) text-emit ```json instead of native
# `tool_calls`, which LiteForge's agent loop can't execute. Granite-3B (also
# <=4B, Tiny-Titan-eligible) emits native tool_calls. The dual-mode parser
# (P1) will let qwen-coder back in for code quality.
_LAPTOP = Preset(
    key="laptop",
    base_url="http://localhost:11435/v1",
    api_key="ollama",
    tiers=[Tier(name="3B", model="granite4.1:3b")],
)
# Space preset (the submission Space): a single tiny model served by llama.cpp's
# llama-server. Kept to one <=4B model so the Tiny Titan claim is unambiguous.
# The port is configurable: 8080 inside the Space, but on the workstation 8080 is
# taken by Guacamole/Tomcat, so local dev uses SMALLCODE_LLAMA_PORT=8088.
# llama-server ignores the model name and serves whatever GGUF was loaded.
_LLAMA_PORT = os.environ.get("SMALLCODE_LLAMA_PORT", "8080")
_SPACE = Preset(
    key="space",
    base_url=f"http://127.0.0.1:{_LLAMA_PORT}/v1",
    api_key="local",
    tiers=[Tier(name="3B", model="qwen2.5-coder-3b-instruct-q4_k_m.gguf")],
)
# hal preset — hal-9000 (DGX Spark) with the full tiered router. It points
# straight at hal's Ollama (:11434/v1), which serves every pulled model over one
# OpenAI-compatible endpoint with native tool_calls — simpler than LiteLLM (whose
# :4000 exposed no models). The tiny tier is a TOOL-TUNED model (granite4.1:3b)
# that reliably drives the loop; escalate to bigger Qwen *coder* models for hard
# codegen. (Tiny coder models can't native-tool-call — see engine/config laptop
# note.)
_HAL = Preset(
    key="hal",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    # All-Granite ladder: every tier emits native tool_calls on Ollama (verified
    # on hal), all <=32B. NOTE: qwen2.5-coder does NOT native-tool-call on Ollama
    # at ANY size (3b/14b text-emit the call) — bringing the Qwen *coder* models
    # in (for the benchmark story) requires the dual-mode parser (see task 6).
    tiers=[
        Tier(name="3B", model="granite4.1:3b"),
        Tier(name="8B", model="granite4.1:8b"),
        Tier(name="30B", model="granite4.1:30b"),
    ],
)
# hal-smol preset — hal-9000 with the fine-tuned coder as the entry tier. The
# finetune/ pipeline trains Qwen2.5-Coder-1.5B to emit native <tool_call> (see
# finetune/README.md), so once it's served on hal's Ollama it can be the cheap
# first rung and we only escalate to Granite on verification failure. The served
# tag is configurable via SMALLCODE_SMOL_MODEL (default matches the published
# model name); import the GGUF into Ollama under that tag, or point
# SMALLCODE_BASE_URL at a llama-server.
_SMOL_MODEL = os.environ.get("SMALLCODE_SMOL_MODEL", "smolcode-coder-1.5b:tools")
_HAL_SMOL = Preset(
    key="hal-smol",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    tiers=[
        Tier(name="1.5B-tuned", model=_SMOL_MODEL),
        Tier(name="8B", model="granite4.1:8b"),
        Tier(name="30B", model="granite4.1:30b"),
    ],
)
# --- the 2D specialist matrix (hal-matrix preset) ----------------------------
# One model per language/function (smolcode-coder-{specialty}-{size}:tools),
# served on hal's Ollama. The router classifies the task's specialty, picks that
# family's size ladder, and escalates within it — then into the generic Granite
# ladder at the top. Tags are derived by CONVENTION + served-tag discovery, so
# adding a specialist is a serving action, not a code edit.
_SPECIALIST_SIZES = ("1.5b", "3b", "7b")  # 7b deferred but recognized if served.
_SPECIALTIES = (
    "py", "js", "bash", "git", "dotnet", "csharp", "java", "powershell",
    "rust", "docker", "bsd", "go", "sql", "cpp", "terraform",
    "orchestrate",  # task_batch / parallel fan-out specialist
)
# The pattern is overridable so one env var can repoint the whole matrix.
# Back-compat: a value WITHOUT a "{specialty}" placeholder is treated as a legacy
# single tag.
_SMOL_PATTERN = os.environ.get(
    "SMALLCODE_SMOL_MODEL", "smolcode-coder-{specialty}-{size}:tools"
)
# Size parsing + specialty detection — shared by the model picker (Tiny-Titan
# <=32B display filter, collapsing the 16-per-size specialty fine-tunes to one
# "Auto" entry per size). Mirrors smolcode-cli/src/router.rs parse_size_b and the
# size_b() regex in tests/test_matrix_routing.py.
_SIZE_RE = re.compile(r"(\d+(?:\.\d+)?)b\b", re.I)


def parse_size_b(model: str) -> float:
    """Return the parameter count in billions encoded in a model tag.

    The LAST '<n>b' group in the tag wins: 'granite4.1:30b' -> 30.0,
    'smolcode-coder-py-1.5b:tools' -> 1.5. A tag with no such group (or an
    empty/None tag) yields 0.0, so size-unknown models pass a '<=32B' filter
    rather than being hidden.
    """
    matches = _SIZE_RE.findall(model or "")
    if not matches:
        return 0.0
    return float(matches[-1])
def is_specialty_model(model: str) -> bool:
    """Report whether a tag names a per-specialty fine-tune.

    A tag qualifies when, case-insensitively, it starts with
    "smolcode-coder-<specialty>-" for any entry of _SPECIALTIES. Empty or None
    tags never qualify.
    """
    tag = (model or "").lower()
    prefixes = tuple(f"smolcode-coder-{name}-" for name in _SPECIALTIES)
    return tag.startswith(prefixes)
def specialist_sizes(preset: "Preset") -> list[str]:
    """List the distinct specialist sizes (<=32B) found in a preset's ladders.

    Sizes come back smallest first as lowercase labels (e.g. ['1.5b', '3b']).
    Only tiers whose model is a specialty fine-tune count. A preset without a
    `ladders` attribute (i.e. a non-matrix preset) yields an empty list.
    """
    labels: dict[float, str] = {}
    ladders = getattr(preset, "ladders", {}) or {}
    for ladder in ladders.values():
        for tier in ladder.tiers:
            if not is_specialty_model(tier.model):
                continue
            billions = parse_size_b(tier.model)
            if not 0 < billions <= 32:
                continue
            # First tag seen for a given size decides its label.
            if billions not in labels:
                labels[billions] = f"{_SIZE_RE.findall(tier.model)[-1]}b"
    return [labels[size] for size in sorted(labels)]
# Generic Granite ladder that every specialist escalates INTO at its top rung
# (all <=32B).
_GENERIC_TIERS = [
    Tier(name="8B", model="granite4.1:8b"),
    Tier(name="30B", model="granite4.1:30b"),
]
# Static fallback set of served tags, used when /v1/models discovery is
# unavailable. Keep in sync with what's pulled on hal; discovery (below)
# supersedes it.
_HAL_SERVED: set[str] = {
    f"smolcode-coder-{s}-{size}:tools"
    for s in _SPECIALTIES
    for size in ("1.5b", "3b")
}
_DISCOVERY_CACHE: dict[str, set[str]] = {}


def _discover_served(base_url: str, api_key: str) -> set[str]:
    """Return the model tags served at an OpenAI-compatible endpoint.

    Issues one GET to `<base_url>/models` (2 s timeout, bearer auth) and caches
    the result per base_url, so later calls for the same URL do no I/O. Any
    failure — network, HTTP, or an unexpected payload — yields an empty set,
    which is cached too; the caller falls back to _HAL_SERVED.
    """
    try:
        return _DISCOVERY_CACHE[base_url]
    except KeyError:
        pass

    try:
        # Imported lazily so module import stays cheap when discovery never runs.
        import json
        import urllib.request

        request = urllib.request.Request(
            base_url.rstrip("/") + "/models",
            headers={"Authorization": f"Bearer {api_key}"},
        )
        with urllib.request.urlopen(request, timeout=2) as response:
            payload = json.loads(response.read())
        tags = {entry["id"] for entry in payload.get("data", []) if "id" in entry}
    except Exception:
        # Deliberately best-effort: discovery must never break preset loading.
        tags = set()

    _DISCOVERY_CACHE[base_url] = tags
    return tags
def _build_ladder(specialty: str, served: set[str]) -> SpecialistLadder:
    """Build one specialist ladder: served specialist sizes, then generic tiers.

    Specialist rungs are added smallest first for every size in
    _SPECIALIST_SIZES whose tag (from _SMOL_PATTERN) is in `served`. Missing
    sizes are skipped; a wholly-missing specialist yields just the generic
    Granite tiers (ladder_for also guards this). A pattern without a
    "{specialty}" placeholder contributes no specialist rungs at all.

    Args:
        specialty: Specialty key substituted into the tag pattern.
        served: Model tags known to be served by the backend.

    Returns:
        A SpecialistLadder for `specialty`, always ending with _GENERIC_TIERS.
    """
    tiers: list[Tier] = []
    if "{specialty}" in _SMOL_PATTERN:
        seen: set[str] = set()
        for size in _SPECIALIST_SIZES:
            tag = _SMOL_PATTERN.format(specialty=specialty, size=size)
            # A pattern override without "{size}" formats to the same tag for
            # every size; emit it once so escalation never re-runs the same model.
            if tag in served and tag not in seen:
                seen.add(tag)
                tiers.append(Tier(f"{size}-{specialty}", tag))
    tiers.extend(_GENERIC_TIERS)
    return SpecialistLadder(specialty=specialty, tiers=tiers)
# hal-matrix preset: the specialist matrix template. Its ladders stay empty here
# and are built lazily in load_preset, which needs the resolved base_url.
_HAL_MATRIX = SpecialistPreset(
    key="hal-matrix",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    tiers=_GENERIC_TIERS,  # generic fallback ladder (inherited Preset.tiers)
    ladders={},
)
# Registry of every built-in preset, keyed by its `key`.
_PRESETS = {
    preset.key: preset
    for preset in (_LAPTOP, _SPACE, _HAL, _HAL_SMOL, _HAL_MATRIX)
}
def default_ui_model(preset: Preset, cfg: dict) -> str:
    """Pick the web UI's default model from config, then from the preset tiers.

    A truthy `cfg["model"]` wins (converted to str). Otherwise the preset's
    entry-tier model is used, or "" when the preset has no tiers.
    """
    configured = cfg.get("model")
    if configured:
        return str(configured)
    return preset.default_model if preset.tiers else ""
def load_preset() -> Preset:
    """Resolve the active preset, applying env overrides and Rust config.toml.

    Resolution order:
      1. SMALLCODE_PRESET picks the base preset (default "hal-matrix"; an
         unrecognized key falls back to the laptop preset).
      2. base_url comes from SMALLCODE_BASE_URL, then config.toml `base_url`,
         then the preset; api_key from SMALLCODE_API_KEY, then the preset.
      3. SMALLCODE_MODEL forces a single "custom" tier and wins over everything.
      4. A matrix preset gets its per-specialty ladders built from discovery.
      5. Otherwise a config.toml `model` that differs from the preset's entry
         tier selects a single "custom" tier; else the full ladder is kept.

    Empty environment variables are treated as unset. The returned preset owns
    its own `tiers` list, so mutating it cannot corrupt the module-level presets.

    Returns:
        A freshly built Preset (a SpecialistPreset for matrix presets).
    """
    # Default to the 2D specialist matrix so "Auto" routes by specialty out of the
    # box; it auto-detects served specialists and falls back to the generic Granite
    # ladder (per-specialty: ladder_for(); whole matrix: _discover_served ->
    # _HAL_SERVED). `or` (not a .get default) so an empty variable means "unset".
    key = (os.environ.get("SMALLCODE_PRESET") or "hal-matrix").strip().lower()
    base = _PRESETS.get(key, _LAPTOP)

    rust_cfg: dict = {}
    try:
        from .rust_session import load_rust_config

        loaded = load_rust_config()
        # Guard against a None/non-mapping result so the .get() calls below are safe.
        if isinstance(loaded, dict):
            rust_cfg = loaded
    except Exception:
        # Best-effort: a missing or broken Rust config must not block startup.
        pass

    base_url = (
        os.environ.get("SMALLCODE_BASE_URL")
        or rust_cfg.get("base_url")
        or base.base_url
    )
    api_key = os.environ.get("SMALLCODE_API_KEY") or base.api_key

    # An explicit env SMALLCODE_MODEL is a hard single-model override and wins over
    # everything (including the matrix). A `model` in config.toml is only a
    # *default* — it must NOT silently disable the matrix when the user explicitly
    # asked for it via SMALLCODE_PRESET=hal-matrix.
    env_model = os.environ.get("SMALLCODE_MODEL")
    if env_model:
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=[Tier("custom", env_model)])

    if isinstance(base, SpecialistPreset):
        served = _discover_served(base_url, api_key) or _HAL_SERVED
        ladders = {s: _build_ladder(s, served) for s in _SPECIALTIES}
        return SpecialistPreset(key=base.key, base_url=base_url, api_key=api_key,
                                tiers=list(_GENERIC_TIERS), ladders=ladders)

    # A config.toml `model` is a DEFAULT, not a hard override (that's
    # SMALLCODE_MODEL, handled above). If it just names this preset's entry tier —
    # the common case, e.g. the CLI default == hal-smol's 1.5B entry — keep the full
    # escalation LADDER (so the router + judge still work). Only a model that ISN'T
    # the preset entry is treated as a deliberate single-model choice.
    forced = rust_cfg.get("model")
    if forced and base.tiers and forced != base.default_model:
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=[Tier("custom", forced)])
    # Copy the ladder so callers never alias the module-level preset's list.
    return Preset(key=base.key, base_url=base_url, api_key=api_key,
                  tiers=list(base.tiers))