Spaces:

seanpoyner
/

smolcode

Paused

App Files Files Community

smolcode / engine /config.py

seanpoyner

Upload folder using huggingface_hub

daea45b verified 13 days ago

Raw

History Blame Contribute Delete

12.9 kB

	"""Backend presets for smolcode.

	smolcode always talks to ONE OpenAI-compatible endpoint. A "preset" just
	selects the base_url and the model tiers the router may escalate through.
	Everything is overridable by environment variables so the same code runs on a
	laptop, inside an HF Space, or against the hal-9000 "home supercomputer".

	Env overrides (highest priority):
	SMALLCODE_PRESET space | laptop | hal | hal-smol | hal-matrix (default: hal-matrix; unknown values fall back to laptop)
	SMALLCODE_BASE_URL OpenAI-compatible /v1 URL
	SMALLCODE_API_KEY bearer token (most local servers ignore it)
	SMALLCODE_MODEL force a single model (disables tiering)
	"""
	from __future__ import annotations

	import os
	import re
	from dataclasses import dataclass, field


	@dataclass(frozen=True)
	class Tier:
	    """A single rung on the model ladder."""

	    # Label for this rung; per the original note, this is what the router
	    # shows in the UI.
	    name: str
	    # Model identifier associated with this rung.
	    model: str


	@dataclass(frozen=True)
	class Preset:
	    """Connection settings plus the ordered model ladder for one backend."""

	    key: str
	    base_url: str
	    api_key: str
	    # Cheapest rung first, most expensive last; the router begins at
	    # tiers[0] and escalates from there.
	    tiers: list[Tier] = field(default_factory=list)

	    @property
	    def default_model(self) -> str:
	        """Model of the entry (first) tier; raises IndexError if `tiers` is empty."""
	        entry = self.tiers[0]
	        return entry.model


	@dataclass(frozen=True)
	class SpecialistLadder:
	    """Size ladder for one specialist family, ordered cheap -> expensive."""

	    # Key of the specialty this ladder serves.
	    specialty: str
	    # Rungs of the ladder; reuses the same Tier type as Preset.
	    tiers: list[Tier] = field(default_factory=list)


	@dataclass(frozen=True)
	class SpecialistPreset(Preset):
	    """Preset with a two-dimensional escalation space: specialty, then size.

	    It subclasses Preset so code that reads .base_url / .api_key / .tiers /
	    .default_model keeps working: the inherited `tiers` is the GENERIC
	    fallback ladder, while `ladders` maps a specialty key to its own rungs.
	    """

	    ladders: dict[str, SpecialistLadder] = field(default_factory=dict)

	    def ladder_for(self, specialty: str) -> SpecialistLadder:
	        """Return the ladder registered for `specialty`, else a generic one."""
	        candidate = self.ladders.get(specialty)
	        if not candidate or not candidate.tiers:
	            # Unknown specialty or an empty ladder: wrap the generic tiers.
	            return SpecialistLadder(specialty="general", tiers=self.tiers)
	        return candidate


	# Local Ollama on the workstation exposes an OpenAI-compatible API at :11435/v1.
	# NOTE: the default model is a tool-TUNED 3B (granite4.1:3b), not a coder model.
	# Tiny coder models (qwen2.5-coder:3b) text-emit ```json instead of native
	# `tool_calls`, which LiteForge's agent loop can't execute. Granite-3B (also
	# <=4B, Tiny-Titan-eligible) emits native tool_calls. The dual-mode parser
	# (P1) will let qwen-coder back in for code quality.
	# Single-rung preset for the local Ollama endpoint on the workstation.
	_LAPTOP = Preset(
	    key="laptop",
	    base_url="http://localhost:11435/v1",
	    api_key="ollama",
	    tiers=[
	        Tier(name="3B", model="granite4.1:3b"),
	    ],
	)

	# The submission Space: a single tiny model served by llama.cpp's llama-server.
	# Kept to one <=4B model so the Tiny Titan claim is unambiguous.
	# Port is configurable: 8080 inside the Space, but on the workstation 8080 is
	# taken by Guacamole/Tomcat so local dev uses SMALLCODE_LLAMA_PORT=8088.
	# llama-server ignores the model name and serves whatever GGUF was loaded.
	# Port of the local llama-server; SMALLCODE_LLAMA_PORT overrides the 8080 default.
	_LLAMA_PORT = os.getenv("SMALLCODE_LLAMA_PORT", "8080")
	_SPACE = Preset(
	    key="space",
	    base_url=f"http://127.0.0.1:{_LLAMA_PORT}/v1",
	    api_key="local",
	    tiers=[
	        Tier(name="3B", model="qwen2.5-coder-3b-instruct-q4_k_m.gguf"),
	    ],
	)

	# hal-9000 (DGX Spark): full tiered router. Points straight at hal's Ollama
	# (:11434/v1), which serves every pulled model over one OpenAI-compatible
	# endpoint with native tool_calls — simpler than LiteLLM (whose :4000 exposed no
	# models). Tiny tier is a TOOL-TUNED model (granite4.1:3b) that reliably drives
	# the loop; escalate to bigger Qwen coder models for hard codegen. (Tiny coder
	# models can't native-tool-call — see engine/config laptop note.)
	_HAL = Preset(
	    key="hal",
	    base_url="http://10.8.0.6:11434/v1",
	    # The bearer token is read once, at import time.
	    api_key=os.getenv("SMALLCODE_API_KEY", "ollama"),
	    # All-Granite ladder, all <=32B: per the original note, every tier emits
	    # native tool_calls on Ollama (verified on hal). qwen2.5-coder does NOT
	    # native-tool-call on Ollama at any size, so bringing the Qwen coder
	    # models in requires the dual-mode parser (see task 6).
	    tiers=[
	        Tier(name="3B", model="granite4.1:3b"),
	        Tier(name="8B", model="granite4.1:8b"),
	        Tier(name="30B", model="granite4.1:30b"),
	    ],
	)

	# hal-9000 with the fine-tuned coder as the entry tier. The finetune/ pipeline
	# trains Qwen2.5-Coder-1.5B to emit native <tool_call> (see finetune/README.md),
	# so once it's served on hal's Ollama it can be the cheap first rung and we only
	# escalate to Granite on verification failure. The served tag is configurable via
	# SMALLCODE_SMOL_MODEL (default matches the published model name); import the GGUF
	# into Ollama under that tag, or point SMALLCODE_BASE_URL at a llama-server.
	# Entry-tier tag for hal-smol. SMALLCODE_SMOL_MODEL is ALSO read as the matrix
	# tag pattern ("...{specialty}-{size}..."); a pattern is not a servable model
	# tag, so a value carrying those placeholders (or an empty value) falls back
	# to the published default instead of being sent to the server verbatim.
	_SMOL_MODEL = os.environ.get("SMALLCODE_SMOL_MODEL") or "smolcode-coder-1.5b:tools"
	if "{specialty}" in _SMOL_MODEL or "{size}" in _SMOL_MODEL:
	    _SMOL_MODEL = "smolcode-coder-1.5b:tools"

	# Same endpoint as the plain hal preset, with the fine-tuned coder as the
	# cheap first rung ahead of the larger Granite tiers.
	_HAL_SMOL = Preset(
	    key="hal-smol",
	    base_url="http://10.8.0.6:11434/v1",
	    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
	    tiers=[
	        Tier("1.5B-tuned", _SMOL_MODEL),
	        Tier("8B", "granite4.1:8b"),
	        Tier("30B", "granite4.1:30b"),
	    ],
	)

	# --- the 2D specialist matrix (hal-matrix preset) ----------------------------
	# A model per language/function (smolcode-coder-{specialty}-{size}:tools), served
	# on hal's Ollama. The router classifies the task's specialty, picks that family's
	# size ladder, and escalates within it — then into the generic Granite ladder at
	# the top. Tags are derived by CONVENTION + served-tag discovery, so adding a
	# specialist is a serving action, not a code edit.

	# Sizes probed for each specialist, smallest first; 7b is deferred but
	# recognized if served.
	_SPECIALIST_SIZES = (
	    "1.5b",
	    "3b",
	    "7b",
	)
	_SPECIALTIES = (
	    "py",
	    "js",
	    "bash",
	    "git",
	    "dotnet",
	    "csharp",
	    "java",
	    "powershell",
	    "rust",
	    "docker",
	    "bsd",
	    "go",
	    "sql",
	    "cpp",
	    "terraform",
	    "orchestrate",  # task_batch / parallel fan-out specialist
	)

	# One env var can repoint the whole matrix. Back-compat: a value WITHOUT a
	# "{specialty}" placeholder is treated as a legacy single tag.
	_SMOL_PATTERN = os.getenv(
	    "SMALLCODE_SMOL_MODEL",
	    "smolcode-coder-{specialty}-{size}:tools",
	)

	# Size parsing + specialty detection — shared by the model picker (Tiny-Titan <=32B
	# display filter, collapsing the 16-per-size specialty fine-tunes to one "Auto" entry
	# per size). Mirrors smolcode-cli/src/router.rs parse_size_b and the size_b() regex in
	# tests/test_matrix_routing.py.
	_SIZE_RE = re.compile(r"(\d+(?:\.\d+)?)b\b", re.I)


	def parse_size_b(model: str) -> float:
	    """Extract the parameter count (in billions) from a model tag.

	    The LAST '<n>b' group in the tag wins, matched case-insensitively:
	    'granite4.1:30b' -> 30.0, 'smolcode-coder-py-1.5b:tools' -> 1.5.
	    A tag with no such group (or an empty/None tag) gives 0.0, so models of
	    unknown size pass a '<=32B' filter instead of being hidden.
	    """
	    last_match = None
	    # Walk every match and keep only the final one.
	    for last_match in _SIZE_RE.finditer(model or ""):
	        pass
	    if last_match is None:
	        return 0.0
	    return float(last_match.group(1))


	def is_specialty_model(model: str) -> bool:
	    """Report whether a tag names a per-specialty fine-tune.

	    A tag qualifies when, lower-cased, it begins with
	    'smolcode-coder-<specialty>-' for one of the known specialties.
	    An empty or None tag never qualifies.
	    """
	    tag = (model or "").lower()
	    for specialty in _SPECIALTIES:
	        if tag.startswith(f"smolcode-coder-{specialty}-"):
	            return True
	    return False


	def specialist_sizes(preset: "Preset") -> list[str]:
	    """List the distinct specialist sizes (<=32B) found in a preset's ladders.

	    Sizes are returned smallest first as lower-case labels such as
	    ['1.5b', '3b']. A preset with no `ladders` attribute (or an empty or
	    None one) yields an empty list.
	    """
	    ladders = getattr(preset, "ladders", {}) or {}
	    label_by_size: dict[float, str] = {}
	    for ladder in ladders.values():
	        for tier in ladder.tiers:
	            if not is_specialty_model(tier.model):
	                continue
	            billions = parse_size_b(tier.model)
	            if not (0 < billions <= 32):
	                continue
	            # The first label seen for a given numeric size is kept.
	            label_by_size.setdefault(
	                billions, f"{_SIZE_RE.findall(tier.model)[-1]}b"
	            )
	    return [label_by_size[size] for size in sorted(label_by_size)]

	# Generic Granite ladder every specialist escalates INTO at its top rung (all <=32B).
	_GENERIC_TIERS = [
	    Tier(name="8B", model="granite4.1:8b"),
	    Tier(name="30B", model="granite4.1:30b"),
	]

	# Static fallback set of served tags when /v1/models discovery is unavailable.
	# Keep in sync with what's pulled on hal; discovery (below) supersedes it.
	# Every specialty at the 1.5b and 3b sizes. Built as one comprehension rather
	# than a union of two sets joined by a backslash line continuation.
	_HAL_SERVED: set[str] = {
	    f"smolcode-coder-{s}-{size}:tools"
	    for s in _SPECIALTIES
	    for size in ("1.5b", "3b")
	}

	# base_url -> set of tags reported by that endpoint's /models listing.
	_DISCOVERY_CACHE: dict[str, set[str]] = {}


	def _discover_served(base_url: str, api_key: str) -> set[str]:
	    """Return the model tags listed by the endpoint's `<base_url>/models`.

	    The result is cached per base_url, so the request is made at most once
	    per URL; a failed lookup is cached as an empty set as well. Any failure
	    yields an empty set, and the caller then falls back to _HAL_SERVED.
	    """
	    try:
	        return _DISCOVERY_CACHE[base_url]
	    except KeyError:
	        pass  # not looked up yet

	    try:
	        import json
	        import urllib.request

	        endpoint = base_url.rstrip("/") + "/models"
	        request = urllib.request.Request(
	            endpoint,
	            headers={"Authorization": f"Bearer {api_key}"},
	        )
	        with urllib.request.urlopen(request, timeout=2) as response:
	            payload = json.loads(response.read())
	        tags: set[str] = {
	            entry["id"] for entry in payload.get("data", []) if "id" in entry
	        }
	    except Exception:
	        # Best-effort discovery: network, HTTP and parse errors all mean
	        # "nothing discovered".
	        tags = set()

	    _DISCOVERY_CACHE[base_url] = tags
	    return tags


	def _build_ladder(specialty: str, served: set[str]) -> SpecialistLadder:
	    """Build one specialist ladder from the tags that are actually served.

	    Rungs are the served specialist sizes (smallest first) followed by the
	    generic Granite tiers. Sizes that are not served are skipped, so a
	    specialist with nothing served yields just the generic tiers.

	    Args:
	        specialty: Specialty key substituted into the tag pattern.
	        served: Model tags known to be available on the endpoint.

	    Returns:
	        A SpecialistLadder for `specialty`, never with an empty tier list.
	    """
	    tiers: list[Tier] = []
	    if "{specialty}" in _SMOL_PATTERN:
	        seen: set[str] = set()
	        for size in _SPECIALIST_SIZES:
	            try:
	                tag = _SMOL_PATTERN.format(specialty=specialty, size=size)
	            except (KeyError, IndexError, ValueError):
	                # The pattern comes from an env override; stray or unknown
	                # braces must not crash preset loading. Use generic tiers only.
	                break
	            # A pattern without "{size}" formats to the same tag for every
	            # size; add each distinct tag only once.
	            if tag in served and tag not in seen:
	                seen.add(tag)
	                tiers.append(Tier(f"{size}-{specialty}", tag))
	    tiers.extend(_GENERIC_TIERS)
	    return SpecialistLadder(specialty=specialty, tiers=tiers)


	_HAL_MATRIX = SpecialistPreset(
	    key="hal-matrix",
	    base_url="http://10.8.0.6:11434/v1",
	    api_key=os.getenv("SMALLCODE_API_KEY", "ollama"),
	    # Generic fallback ladder (the inherited Preset.tiers).
	    tiers=_GENERIC_TIERS,
	    # Left empty here; load_preset fills the ladders in once the base_url
	    # has been resolved.
	    ladders={},
	)

	# Lookup table of every built-in preset, keyed by its `key`.
	_PRESETS = {
	    preset.key: preset
	    for preset in (
	        _LAPTOP,
	        _SPACE,
	        _HAL,
	        _HAL_SMOL,
	        _HAL_MATRIX,
	    )
	}


	def default_ui_model(preset: Preset, cfg: dict) -> str:
	    """Pick the model the web UI should preselect.

	    A truthy `model` entry in `cfg` wins (converted to str); otherwise the
	    preset's entry-tier model is used; with no tiers the result is "".
	    """
	    configured = cfg.get("model")
	    if configured:
	        return str(configured)
	    return preset.default_model if preset.tiers else ""


	def load_preset() -> Preset:
	    """Resolve the active preset, applying env overrides and Rust config.toml.

	    Resolution order:
	      1. SMALLCODE_PRESET selects the base preset (default "hal-matrix";
	         an unrecognized key falls back to the laptop preset).
	      2. base_url: SMALLCODE_BASE_URL, then config.toml `base_url`, then the
	         preset's own. Empty values are treated as unset.
	      3. SMALLCODE_MODEL, when set and non-empty, forces a single-model
	         preset and wins over everything, including the matrix.
	      4. A matrix preset gets its per-specialty ladders built from the
	         discovered (or statically assumed) served tags.
	      5. A config.toml `model` that differs from the preset's entry tier is
	         treated as a deliberate single-model choice.

	    Returns:
	        A new Preset (or SpecialistPreset) with the resolved settings.
	    """
	    # Default to the 2D specialist matrix so "Auto" routes by specialty out of
	    # the box. An empty SMALLCODE_PRESET counts as unset, like SMALLCODE_MODEL.
	    key = (os.environ.get("SMALLCODE_PRESET") or "hal-matrix").lower()
	    base = _PRESETS.get(key, _LAPTOP)

	    rust_cfg: dict = {}
	    try:
	        from .rust_session import load_rust_config
	        # Guard against a loader that returns None for "no config".
	        rust_cfg = load_rust_config() or {}
	    except Exception:
	        # Best-effort: a missing or broken Rust config must not block startup.
	        rust_cfg = {}

	    # `or` chaining so an empty env var / config entry cannot produce an
	    # empty endpoint URL.
	    base_url = (
	        os.environ.get("SMALLCODE_BASE_URL")
	        or rust_cfg.get("base_url")
	        or base.base_url
	    )
	    # The API key is left as-is: an explicitly empty token stays empty.
	    api_key = os.environ.get("SMALLCODE_API_KEY", base.api_key)

	    # An explicit env SMALLCODE_MODEL is a hard single-model override and wins
	    # over everything (including the matrix). A `model` in config.toml is only
	    # a default; it must NOT silently disable the matrix when the user
	    # explicitly asked for it via SMALLCODE_PRESET=hal-matrix.
	    env_model = os.environ.get("SMALLCODE_MODEL")
	    if env_model:
	        return Preset(key=base.key, base_url=base_url, api_key=api_key,
	                      tiers=[Tier("custom", env_model)])

	    if isinstance(base, SpecialistPreset):
	        # Empty discovery (failure or nothing served) falls back to the
	        # static tag set.
	        served = _discover_served(base_url, api_key) or _HAL_SERVED
	        ladders = {s: _build_ladder(s, served) for s in _SPECIALTIES}
	        return SpecialistPreset(key=base.key, base_url=base_url, api_key=api_key,
	                                tiers=_GENERIC_TIERS, ladders=ladders)

	    # A config.toml `model` is a DEFAULT, not a hard override. If it just names
	    # this preset's entry tier, keep the full escalation LADDER so the router
	    # and judge still work. Only a model that ISN'T the preset entry is
	    # treated as a deliberate single-model choice.
	    forced = rust_cfg.get("model")
	    if forced and base.tiers and forced != base.default_model:
	        return Preset(key=base.key, base_url=base_url, api_key=api_key,
	                      tiers=[Tier("custom", forced)])

	    return Preset(key=base.key, base_url=base_url, api_key=api_key, tiers=base.tiers)