# Scraped page header (not part of the module): Hugging Face Space
# seanpoyner/smolcode, status "Paused", file size 12,871 bytes, revision daea45b.

"""Backend presets for smolcode.

smolcode always talks to ONE OpenAI-compatible endpoint. A "preset" just
selects the base_url and the model *tiers* the router may escalate through.
Everything is overridable by environment variables so the same code runs on a
laptop, inside an HF Space, or against the hal-9000 "home supercomputer".

Env overrides (highest priority):
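  SMALLCODE_PRESET     space | laptop | hal | hal-smol | hal-matrix
                       (default: hal-matrix; unknown values fall back to laptop)
  SMALLCODE_BASE_URL   OpenAI-compatible /v1 URL
  SMALLCODE_API_KEY    bearer token (most local servers ignore it)
  SMALLCODE_MODEL      force a single model (disables tiering)
  SMALLCODE_LLAMA_PORT llama-server port used by the space preset (default: 8080)
  SMALLCODE_SMOL_MODEL hal-smol entry-tier tag, and the hal-matrix tag pattern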
"""
from __future__ import annotations

import os
import re
from dataclasses import dataclass, field


@dataclass(frozen=True)
class Tier:
    """One rung of the model ladder. `name` is what the router shows in the UI.

    Declared frozen, so fields cannot be reassigned after construction.
    """
    name: str   # short display label, e.g. "3B" or "1.5B-tuned"
    model: str  # model tag/identifier for this rung, e.g. "granite4.1:3b"


@dataclass(frozen=True)
class Preset:
    """Connection settings plus the model ladder for one backend.

    `tiers` is ordered cheap -> expensive: the router starts at the first
    entry and escalates from there.
    """
    key: str
    base_url: str
    api_key: str
    tiers: list[Tier] = field(default_factory=list)

    @property
    def default_model(self) -> str:
        """Model tag of the entry (first) tier.

        Raises IndexError when `tiers` is empty.
        """
        entry_tier = self.tiers[0]
        return entry_tier.model


@dataclass(frozen=True)
class SpecialistLadder:
    """One specialist family's size ladder (cheap -> expensive), reusing Tier."""
    # Specialty key, e.g. "py"; SpecialistPreset.ladder_for uses "general" for
    # the generic fallback ladder.
    specialty: str
    # Rungs in escalation order; defaults to an empty list.
    tiers: list[Tier] = field(default_factory=list)


@dataclass(frozen=True)
class SpecialistPreset(Preset):
    """Preset with a two-dimensional escalation space: specialty -> size ladder.

    Because it subclasses Preset, existing readers of .base_url/.api_key/
    .tiers/.default_model (bench, builder, agent) keep working. The inherited
    `tiers` acts as the GENERIC fallback ladder, while `ladders` maps each
    specialty key to its own rungs.
    """
    ladders: dict[str, SpecialistLadder] = field(default_factory=dict)

    def ladder_for(self, specialty: str) -> SpecialistLadder:
        """Return the ladder registered for `specialty`, else the generic one.

        A missing entry, or an entry whose `tiers` is empty, yields a new
        SpecialistLadder labelled "general" that wraps the inherited `tiers`.
        """
        candidate = self.ladders.get(specialty)
        if not candidate or not candidate.tiers:
            return SpecialistLadder(specialty="general", tiers=self.tiers)
        return candidate


# Local Ollama on the workstation exposes an OpenAI-compatible API at :11435/v1.
# NOTE: the default model is a tool-TUNED 3B (granite4.1:3b), not a coder model.
# Tiny coder models (qwen2.5-coder:3b) emit the tool call as ```json text instead
# of native `tool_calls`, which LiteForge's agent loop can't execute. Granite-3B
# (also <=4B, so Tiny-Titan-eligible) emits native tool_calls. The dual-mode
# parser (P1) will let qwen-coder back in for code quality.
# This preset is also what load_preset() falls back to when SMALLCODE_PRESET
# names no known preset.
_LAPTOP = Preset(
    key="laptop",
    base_url="http://localhost:11435/v1",
    api_key="ollama",  # fixed default; load_preset() lets SMALLCODE_API_KEY replace it
    tiers=[Tier("3B", "granite4.1:3b")],  # single rung: nothing to escalate to
)

# The submission Space: a single tiny model served by llama.cpp's llama-server.
# Kept to one <=4B model so the Tiny Titan claim is unambiguous.
# Port is configurable: 8080 inside the Space, but on the workstation 8080 is
# taken by Guacamole/Tomcat so local dev uses SMALLCODE_LLAMA_PORT=8088.
# llama-server ignores the model name and serves whatever GGUF was loaded.
# The port is read once, when this module is imported, and baked into
# _SPACE.base_url below.
_LLAMA_PORT = os.environ.get("SMALLCODE_LLAMA_PORT", "8080")
_SPACE = Preset(
    key="space",
    base_url=f"http://127.0.0.1:{_LLAMA_PORT}/v1",
    api_key="local",
    tiers=[Tier("3B", "qwen2.5-coder-3b-instruct-q4_k_m.gguf")],  # single rung
)

# hal-9000 (DGX Spark): full tiered router. Points straight at hal's Ollama
# (:11434/v1), which serves every pulled model over one OpenAI-compatible
# endpoint with native tool_calls — simpler than LiteLLM (whose :4000 exposed no
# models). Tiny tier is a TOOL-TUNED model (granite4.1:3b) that reliably drives
# the loop; harder work escalates to the larger Granite tiers (8B, then 30B).
# Qwen *coder* models are NOT on this ladder because they can't native-tool-call
# on Ollama — see engine/config laptop note.
_HAL = Preset(
    key="hal",
    base_url="http://10.8.0.6:11434/v1",
    # Evaluated once at import time; load_preset() reads SMALLCODE_API_KEY again
    # on each call.
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    # All-Granite ladder: every tier emits native tool_calls on Ollama (verified
    # on hal), all <=32B. NOTE: qwen2.5-coder does NOT native-tool-call on Ollama
    # at ANY size (3b/14b text-emit the call) — bringing the Qwen *coder* models
    # in (for the benchmark story) requires the dual-mode parser (see task 6).
    tiers=[
        Tier("3B", "granite4.1:3b"),
        Tier("8B", "granite4.1:8b"),
        Tier("30B", "granite4.1:30b"),
    ],
)

# hal-9000 with the fine-tuned coder as the entry tier. The finetune/ pipeline
# trains Qwen2.5-Coder-1.5B to emit native <tool_call> (see finetune/README.md),
# so once it's served on hal's Ollama it can be the cheap first rung and we only
# escalate to Granite on verification failure. The served tag is configurable via
# SMALLCODE_SMOL_MODEL (default matches the published model name); import the GGUF
# into Ollama under that tag, or point SMALLCODE_BASE_URL at a llama-server.
# NOTE(review): _SMOL_PATTERN reads the same SMALLCODE_SMOL_MODEL variable. A
# value containing "{specialty}" is used here verbatim (unformatted) as the
# entry-tier model tag — confirm that combination is never used with hal-smol.
_SMOL_MODEL = os.environ.get("SMALLCODE_SMOL_MODEL", "smolcode-coder-1.5b:tools")
_HAL_SMOL = Preset(
    key="hal-smol",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),  # read once at import time
    tiers=[
        Tier("1.5B-tuned", _SMOL_MODEL),   # fine-tuned entry rung
        Tier("8B", "granite4.1:8b"),
        Tier("30B", "granite4.1:30b"),
    ],
)

# --- the 2D specialist matrix (hal-matrix preset) ----------------------------
# A model per language/function (smolcode-coder-{specialty}-{size}:tools), served
# on hal's Ollama. The router classifies the task's specialty, picks that family's
# size ladder, and escalates within it — then into the generic Granite ladder at
# the top. Tags are derived by CONVENTION + served-tag discovery, so adding a
# specialist is a serving action, not a code edit.

# Sizes probed per specialist, smallest first; _build_ladder keeps this order.
_SPECIALIST_SIZES = ("1.5b", "3b", "7b")   # 7b deferred but recognized if served.
# Specialty keys substituted for "{specialty}" in the tag pattern.
_SPECIALTIES = ("py", "js", "bash", "git", "dotnet", "csharp", "java",
                "powershell", "rust", "docker", "bsd", "go", "sql", "cpp", "terraform",
                "orchestrate")   # task_batch / parallel fan-out specialist

# Pattern is overridable so one env var can repoint the whole matrix. Back-compat:
# a value WITHOUT a "{specialty}" placeholder is treated as a legacy single tag;
# _build_ladder then adds no specialist rungs and the ladder is just the generic
# tiers. Read once at import time.
_SMOL_PATTERN = os.environ.get("SMALLCODE_SMOL_MODEL",
                               "smolcode-coder-{specialty}-{size}:tools")

# Size parsing + specialty detection — shared by the model picker (Tiny-Titan <=32B
# display filter, collapsing the 16-per-size specialty fine-tunes to one "Auto" entry
# per size). Mirrors smolcode-cli/src/router.rs parse_size_b and the size_b() regex in
# tests/test_matrix_routing.py.
_SIZE_RE = re.compile(r"(\d+(?:\.\d+)?)b\b", re.IGNORECASE)


def parse_size_b(model: str) -> float:
    """Extract the parameter count, in billions, from a model tag.

    The LAST '<number>b' group in the tag wins, matched case-insensitively:
    'granite4.1:30b' -> 30.0 and 'smolcode-coder-py-1.5b:tools' -> 1.5.
    A tag with no such group (or an empty/None tag) gives 0.0, so models of
    unknown size pass a '<=32B' filter instead of being hidden.
    """
    sizes = _SIZE_RE.findall(model or "")
    if not sizes:
        return 0.0
    return float(sizes[-1])


def is_specialty_model(model: str) -> bool:
    """Report whether a tag names a per-specialty fine-tune.

    The tag is lowercased and must start with 'smolcode-coder-<specialty>-'
    for one of the keys in _SPECIALTIES. Empty or None tags give False.
    """
    tag = (model or "").lower()
    prefixes = tuple(f"smolcode-coder-{s}-" for s in _SPECIALTIES)
    return tag.startswith(prefixes)


def specialist_sizes(preset: "Preset") -> list[str]:
    """List the distinct specialist sizes found in a matrix preset's ladders.

    Only tiers whose model is a specialty fine-tune with a parsed size in
    (0, 32] billions count. Labels look like '1.5b' / '3b' and are returned
    smallest first. A preset without a `ladders` attribute (or with an empty
    one) gives an empty list.
    """
    ladders = getattr(preset, "ladders", {}) or {}
    labels: dict[float, str] = {}
    for ladder in ladders.values():
        for tier in ladder.tiers:
            tag = tier.model
            if not is_specialty_model(tag):
                continue
            billions = parse_size_b(tag)
            if billions <= 0 or billions > 32:
                continue
            # First tag seen for a given size decides that size's label text.
            if billions not in labels:
                labels[billions] = f"{_SIZE_RE.findall(tag)[-1]}b"
    return [label for _, label in sorted(labels.items())]

# Generic Granite ladder every specialist escalates INTO at its top rung (all <=32B).
# This one list object is referenced by _HAL_MATRIX.tiers and by load_preset(), and
# _build_ladder copies its items onto each specialist ladder.
_GENERIC_TIERS = [Tier("8B", "granite4.1:8b"), Tier("30B", "granite4.1:30b")]

# Static fallback set of served tags when /v1/models discovery is unavailable.
# Keep in sync with what's pulled on hal; discovery (below) supersedes it.
# The tags here are spelled out with the default naming convention for the 1.5b
# and 3b sizes; they do not go through _SMOL_PATTERN.
_HAL_SERVED: set[str] = {f"smolcode-coder-{s}-1.5b:tools" for s in _SPECIALTIES} | \
                        {f"smolcode-coder-{s}-3b:tools" for s in _SPECIALTIES}

# base_url -> served tags, filled by _discover_served. A failed probe is stored as
# an empty set, so it is not retried for that URL.
_DISCOVERY_CACHE: dict[str, set[str]] = {}


def _discover_served(base_url: str, api_key: str) -> set[str]:
    """Return the model tags that `base_url` reports via GET <base_url>/models.

    The answer is memoized in _DISCOVERY_CACHE per base_url, so the request is
    attempted at most once per URL. Any failure (network, HTTP status,
    malformed payload) is recorded as an empty set; the caller then falls back
    to _HAL_SERVED.
    """
    try:
        return _DISCOVERY_CACHE[base_url]
    except KeyError:
        pass  # first lookup for this URL: probe the server below

    try:
        import json
        import urllib.request

        endpoint = base_url.rstrip("/") + "/models"
        auth = {"Authorization": f"Bearer {api_key}"}
        request = urllib.request.Request(endpoint, headers=auth)
        # Short timeout: this runs while resolving the preset.
        with urllib.request.urlopen(request, timeout=2) as response:
            payload = json.loads(response.read())
        tags: set[str] = set()
        for entry in payload.get("data", []):
            if "id" in entry:
                tags.add(entry["id"])
    except Exception:
        # Best-effort probe: any error means "nothing discovered".
        tags = set()

    _DISCOVERY_CACHE[base_url] = tags
    return tags


def _build_ladder(specialty: str, served: set[str]) -> SpecialistLadder:
    """Build one specialist's ladder: served specialist sizes, then generic tiers.

    Each size in _SPECIALIST_SIZES (smallest first) is formatted into
    _SMOL_PATTERN; a tag is added as a rung only if it is in `served`. Missing
    sizes are skipped, and a wholly-missing specialist yields just the generic
    Granite tiers (ladder_for also guards this). A legacy pattern without a
    "{specialty}" placeholder adds no specialist rungs at all.

    Args:
        specialty: specialty key substituted for "{specialty}" in the pattern.
        served: model tags known to be served by the endpoint.

    Returns:
        A SpecialistLadder for `specialty` whose tiers end with _GENERIC_TIERS.

    Raises:
        Whatever str.format raises for a malformed _SMOL_PATTERN (e.g. KeyError
        for an unknown placeholder); the pattern comes from the environment.
    """
    tiers: list[Tier] = []
    if "{specialty}" in _SMOL_PATTERN:
        # A pattern with "{specialty}" but no "{size}" formats to the SAME tag
        # for every size. Without this guard that one model would be added as
        # several rungs and escalation would just retry the identical model.
        seen: set[str] = set()
        for size in _SPECIALIST_SIZES:
            tag = _SMOL_PATTERN.format(specialty=specialty, size=size)
            if tag in served and tag not in seen:
                seen.add(tag)
                tiers.append(Tier(f"{size}-{specialty}", tag))
    tiers.extend(_GENERIC_TIERS)
    return SpecialistLadder(specialty=specialty, tiers=tiers)


# Template for the 2D specialist matrix. `ladders` stays empty here; load_preset()
# returns a new SpecialistPreset with the ladders filled in.
_HAL_MATRIX = SpecialistPreset(
    key="hal-matrix",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    tiers=_GENERIC_TIERS,    # generic fallback ladder (inherited Preset.tiers)
    ladders={},              # built lazily in load_preset (needs the resolved base_url)
)

# Registry of the built-in presets keyed by Preset.key; load_preset() looks the
# requested preset name up here.
_PRESETS = {p.key: p for p in (_LAPTOP, _SPACE, _HAL, _HAL_SMOL, _HAL_MATRIX)}


def default_ui_model(preset: Preset, cfg: dict) -> str:
    """Pick the model the web UI should preselect.

    A truthy `cfg["model"]` wins and is returned as a string. Otherwise the
    preset's entry-tier model is used; a preset with no tiers gives "".
    """
    configured = cfg.get("model")
    if configured:
        return str(configured)
    return preset.default_model if preset.tiers else ""


def load_preset() -> Preset:
    """Resolve the active preset, applying env overrides and Rust config.toml.

    Resolution order:
      * preset:   SMALLCODE_PRESET, else "hal-matrix"; an unknown name falls
                  back to the laptop preset.
      * base_url: SMALLCODE_BASE_URL, else config.toml `base_url`, else the
                  preset's own URL.
      * api_key:  SMALLCODE_API_KEY, else the preset's own key.
      * model:    SMALLCODE_MODEL forces a single "custom" tier for any preset.
                  For non-matrix presets a config.toml `model` that differs
                  from the preset's entry tier does the same.

    Blank environment values (and a blank config `base_url`) are treated as
    unset, the same way SMALLCODE_MODEL already is.

    Returns:
        A new Preset, or a SpecialistPreset with per-specialty ladders when the
        selected preset is the specialist matrix.
    """
    # Default to the 2D specialist matrix so "Auto" routes by specialty out of the box;
    # it auto-detects served specialists and falls back to the generic Granite ladder
    # (per-specialty: ladder_for(); whole matrix: _discover_served -> _HAL_SERVED).
    # A blank SMALLCODE_PRESET means "use the default", not "unknown preset".
    key = os.environ.get("SMALLCODE_PRESET", "").strip().lower() or "hal-matrix"
    base = _PRESETS.get(key, _LAPTOP)

    rust_cfg: dict = {}
    try:
        from .rust_session import load_rust_config
        # Guard against a loader that returns None for "no config file".
        rust_cfg = load_rust_config() or {}
    except Exception:
        # Best-effort: the Rust config is optional, so any failure to import or
        # read it simply means "no config.toml settings".
        pass

    # An empty URL can never be a valid endpoint, so a blank env var or config
    # entry falls through to the next source instead of being used verbatim.
    base_url = (os.environ.get("SMALLCODE_BASE_URL", "").strip()
                or rust_cfg.get("base_url")
                or base.base_url)
    api_key = os.environ.get("SMALLCODE_API_KEY", base.api_key)

    # An explicit env SMALLCODE_MODEL is a hard single-model override and wins over
    # everything (including the matrix). A `model` in config.toml is only a *default*
    # — it must NOT silently disable the matrix when the user explicitly asked for it
    # via SMALLCODE_PRESET=hal-matrix.
    env_model = os.environ.get("SMALLCODE_MODEL")
    if env_model:
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=[Tier("custom", env_model)])

    if isinstance(base, SpecialistPreset):
        # An empty discovery result (failure OR nothing served) uses the static set.
        served = _discover_served(base_url, api_key) or _HAL_SERVED
        ladders = {s: _build_ladder(s, served) for s in _SPECIALTIES}
        return SpecialistPreset(key=base.key, base_url=base_url, api_key=api_key,
                                tiers=_GENERIC_TIERS, ladders=ladders)

    # A config.toml `model` is a DEFAULT, not a hard override (that's SMALLCODE_MODEL,
    # handled above). If it just names this preset's entry tier — the common case, e.g.
    # the CLI default == hal-smol's 1.5B entry — keep the full escalation LADDER (so the
    # router + judge still work). Only a model that ISN'T the preset entry is treated as
    # a deliberate single-model choice.
    forced = rust_cfg.get("model")
    if forced and base.tiers and forced != base.default_model:
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=[Tier("custom", forced)])

    return Preset(key=base.key, base_url=base_url, api_key=api_key, tiers=base.tiers)