File size: 12,871 Bytes
daea45b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
"""Backend presets for smolcode.

smolcode always talks to ONE OpenAI-compatible endpoint. A "preset" just
selects the base_url and the model *tiers* the router may escalate through.
Everything is overridable by environment variables so the same code runs on a
laptop, inside an HF Space, or against the hal-9000 "home supercomputer".

Env overrides (highest priority):
  SMALLCODE_PRESET     space | laptop | hal | hal-smol | hal-matrix   (default: hal-matrix)
  SMALLCODE_BASE_URL   OpenAI-compatible /v1 URL
  SMALLCODE_API_KEY    bearer token (most local servers ignore it)
  SMALLCODE_MODEL      force a single model (disables tiering)
"""
from __future__ import annotations

import os
import re
from dataclasses import dataclass, field


@dataclass(frozen=True)
class Tier:
    """One rung of the model ladder. `name` is what the router shows in the UI.

    Frozen dataclass: fields cannot be reassigned after construction.
    """
    name: str   # display label, e.g. "3B" or "1.5B-tuned"
    model: str  # model tag for this rung, e.g. "granite4.1:3b"


@dataclass(frozen=True)
class Preset:
    """Connection settings plus the model ladder for one backend.

    Attributes:
        key: preset identifier (e.g. "laptop", "hal").
        base_url: OpenAI-compatible ``/v1`` base URL.
        api_key: bearer token for the endpoint.
        tiers: ladder rungs, ordered cheap -> expensive.
    """
    key: str
    base_url: str
    api_key: str
    # Ordered cheap -> expensive. The router starts at tiers[0] and escalates.
    tiers: list[Tier] = field(default_factory=list)

    @property
    def default_model(self) -> str:
        """Model tag of the entry (cheapest) tier, or "" when the ladder is empty.

        `tiers` defaults to an empty list, so an empty ladder is a reachable
        state; report it as "" instead of raising IndexError.
        """
        return self.tiers[0].model if self.tiers else ""


@dataclass(frozen=True)
class SpecialistLadder:
    """One specialist family's size ladder (cheap -> expensive), reusing Tier.

    Frozen dataclass: fields cannot be reassigned after construction.
    """
    specialty: str  # specialty key, e.g. "py"; "general" marks the generic fallback
    tiers: list[Tier] = field(default_factory=list)  # rungs, cheap -> expensive


@dataclass(frozen=True)
class SpecialistPreset(Preset):
    """Preset with a two-dimensional escalation space: specialty -> size ladder.

    It extends Preset so that code reading .base_url/.api_key/.tiers/
    .default_model (bench, builder, agent) is unaffected: the inherited `tiers`
    serves as the GENERIC fallback ladder, while `ladders` maps each specialty
    key to its own rungs.
    """
    ladders: dict[str, SpecialistLadder] = field(default_factory=dict)

    def ladder_for(self, specialty: str) -> SpecialistLadder:
        """Return the ladder registered for `specialty`.

        When no ladder is registered, or the registered one has no tiers, a
        "general" ladder wrapping the inherited generic `tiers` is returned.
        """
        candidate = self.ladders.get(specialty)
        if not candidate or not candidate.tiers:
            return SpecialistLadder(specialty="general", tiers=self.tiers)
        return candidate


# Workstation-local Ollama, reached through its OpenAI-compatible API at :11435/v1.
# The single tier is deliberately a tool-TUNED 3B (granite4.1:3b), not a coder
# model: tiny coder models (qwen2.5-coder:3b) text-emit ```json instead of native
# `tool_calls`, which LiteForge's agent loop can't execute, whereas Granite-3B
# (also <=4B, Tiny-Titan-eligible) emits native tool_calls. The dual-mode parser
# (P1) will let qwen-coder back in for code quality.
_LAPTOP = Preset(
    key="laptop",
    base_url="http://localhost:11435/v1",
    api_key="ollama",
    tiers=[Tier(name="3B", model="granite4.1:3b")],
)

# The submission Space runs exactly one tiny model behind llama.cpp's
# llama-server; staying at a single <=4B model keeps the Tiny Titan claim
# unambiguous. llama-server ignores the model name and serves whatever GGUF was
# loaded. The port is configurable: 8080 inside the Space, but on the
# workstation 8080 is taken by Guacamole/Tomcat, so local dev sets
# SMALLCODE_LLAMA_PORT=8088.
_LLAMA_PORT = os.environ.get("SMALLCODE_LLAMA_PORT", "8080")
_SPACE = Preset(
    key="space",
    base_url="http://127.0.0.1:" + _LLAMA_PORT + "/v1",
    api_key="local",
    tiers=[Tier(name="3B", model="qwen2.5-coder-3b-instruct-q4_k_m.gguf")],
)

# hal-9000 (DGX Spark): full tiered router. Points straight at hal's Ollama
# (:11434/v1), which serves every pulled model over one OpenAI-compatible
# endpoint with native tool_calls — simpler than LiteLLM (whose :4000 exposed no
# models). Tiny tier is a TOOL-TUNED model (granite4.1:3b) that reliably drives
# the loop; escalation goes to the larger Granite models (8B, 30B). The Qwen
# *coder* models are not in this ladder (see the note on `tiers` below; tiny
# coder models can't native-tool-call — see the laptop preset note).
_HAL = Preset(
    key="hal",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    # All-Granite ladder: every tier emits native tool_calls on Ollama (verified
    # on hal), all <=32B. NOTE: qwen2.5-coder does NOT native-tool-call on Ollama
    # at ANY size (3b/14b text-emit the call) — bringing the Qwen *coder* models
    # in (for the benchmark story) requires the dual-mode parser (see task 6).
    tiers=[
        Tier("3B", "granite4.1:3b"),
        Tier("8B", "granite4.1:8b"),
        Tier("30B", "granite4.1:30b"),
    ],
)

# hal-9000 with the fine-tuned coder as the entry tier. The finetune/ pipeline
# trains Qwen2.5-Coder-1.5B to emit native <tool_call> (see finetune/README.md),
# so once it's served on hal's Ollama it can be the cheap first rung and we only
# escalate to Granite on verification failure. The served tag is configurable via
# SMALLCODE_SMOL_MODEL (default matches the published model name); import the GGUF
# into Ollama under that tag, or point SMALLCODE_BASE_URL at a llama-server.
_SMOL_MODEL_DEFAULT = "smolcode-coder-1.5b:tools"
# A set-but-empty variable counts as unset so the entry tier never becomes "".
_SMOL_MODEL = os.environ.get("SMALLCODE_SMOL_MODEL") or _SMOL_MODEL_DEFAULT
if "{specialty}" in _SMOL_MODEL:
    # This module also reads SMALLCODE_SMOL_MODEL as the specialist-matrix tag
    # PATTERN. A value carrying the "{specialty}" placeholder is a template, not
    # a servable tag, so this single-model preset keeps the default tag instead.
    _SMOL_MODEL = _SMOL_MODEL_DEFAULT
_HAL_SMOL = Preset(
    key="hal-smol",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    tiers=[
        Tier("1.5B-tuned", _SMOL_MODEL),
        Tier("8B", "granite4.1:8b"),
        Tier("30B", "granite4.1:30b"),
    ],
)

# --- the 2D specialist matrix (hal-matrix preset) ----------------------------
# A model per language/function (smolcode-coder-{specialty}-{size}:tools), served
# on hal's Ollama. The router classifies the task's specialty, picks that family's
# size ladder, and escalates within it — then into the generic Granite ladder at
# the top. Tags are derived by CONVENTION + served-tag discovery, so adding a
# specialist is a serving action, not a code edit.

# Specialist sizes, smallest first. 7b is deferred but recognized if served.
_SPECIALIST_SIZES = ("1.5b", "3b", "7b")
_SPECIALTIES = (
    "py", "js", "bash", "git",
    "dotnet", "csharp", "java", "powershell",
    "rust", "docker", "bsd", "go",
    "sql", "cpp", "terraform",
    "orchestrate",  # task_batch / parallel fan-out specialist
)

# One env var repoints the whole matrix by overriding the tag pattern. For
# back-compat, a value WITHOUT a "{specialty}" placeholder is treated as a
# legacy single tag.
_SMOL_PATTERN = os.environ.get(
    "SMALLCODE_SMOL_MODEL",
    "smolcode-coder-{specialty}-{size}:tools",
)

# Size parsing + specialty detection — shared by the model picker (Tiny-Titan <=32B
# display filter, collapsing the 16-per-size specialty fine-tunes to one "Auto" entry
# per size). Mirrors smolcode-cli/src/router.rs parse_size_b and the size_b() regex in
# tests/test_matrix_routing.py.
_SIZE_RE = re.compile(r"(\d+(?:\.\d+)?)b\b", re.I)


def parse_size_b(model: str) -> float:
    """Extract the parameter count, in billions, from a model tag.

    The LAST '<n>b' group in the tag wins: 'granite4.1:30b' -> 30.0 and
    'smolcode-coder-py-1.5b:tools' -> 1.5. A tag with no such group (or an
    empty/None tag) yields 0.0, so size-unknown models pass a '<=32B' filter
    rather than being hidden.
    """
    hits = _SIZE_RE.findall(model or "")
    if not hits:
        return 0.0
    *_, last = hits
    return float(last)


def is_specialty_model(model: str) -> bool:
    """Report whether a tag names a per-specialty fine-tune.

    A tag qualifies when, compared case-insensitively, it begins with
    'smolcode-coder-<specialty>-' for one of the known specialties. An empty or
    None tag never qualifies.
    """
    tag = (model or "").lower()
    prefixes = tuple(f"smolcode-coder-{name}-" for name in _SPECIALTIES)
    return tag.startswith(prefixes)


def specialist_sizes(preset: "Preset") -> list[str]:
    """List the distinct specialist sizes found in a matrix preset's ladders.

    Only specialty fine-tune tiers whose parsed size is in (0, 32] billion are
    counted. Labels come back smallest first (e.g. ['1.5b', '3b']), and the
    first tag seen for a given size supplies its label. A preset without
    `ladders` (a non-matrix preset) yields an empty list.
    """
    ladders = getattr(preset, "ladders", None) or {}
    label_by_size: dict[float, str] = {}
    for ladder in ladders.values():
        for tier in ladder.tiers:
            tag = tier.model
            if not is_specialty_model(tag):
                continue
            billions = parse_size_b(tag)
            if billions <= 0 or billions > 32:
                continue
            if billions not in label_by_size:
                label_by_size[billions] = f"{_SIZE_RE.findall(tag)[-1]}b"
    return [label_by_size[size] for size in sorted(label_by_size)]

# The shared Granite rungs (all <=32B) that every specialist ladder escalates
# INTO once its own sizes are exhausted.
_GENERIC_TIERS = [
    Tier(name="8B", model="granite4.1:8b"),
    Tier(name="30B", model="granite4.1:30b"),
]

# Served tags assumed when /v1/models discovery is unavailable: every specialty
# at 1.5b and 3b. Keep in sync with what's pulled on hal; a successful discovery
# supersedes this set.
_HAL_SERVED: set[str] = {
    f"smolcode-coder-{specialty}-{size}:tools"
    for size in ("1.5b", "3b")
    for specialty in _SPECIALTIES
}

_DISCOVERY_CACHE: dict[str, set[str]] = {}


def _discover_served(base_url: str, api_key: str) -> set[str]:
    """Return the model tags served at `base_url`, asking the endpoint at most once.

    Issues a GET to the OpenAI-compatible `<base_url>/models` with a bearer
    token and a 2-second timeout, collecting every entry's "id". The outcome is
    memoized per base_url, including the empty set produced by ANY failure, so
    a failing endpoint is not probed again. Callers treat an empty set as "use
    the static fallback".
    """
    try:
        return _DISCOVERY_CACHE[base_url]
    except KeyError:
        pass

    try:
        import json
        import urllib.request
        request = urllib.request.Request(
            base_url.rstrip("/") + "/models",
            headers={"Authorization": f"Bearer {api_key}"},
        )
        with urllib.request.urlopen(request, timeout=2) as response:
            payload = json.loads(response.read())
        tags: set[str] = {
            entry["id"] for entry in payload.get("data", []) if "id" in entry
        }
    except Exception:
        # Best-effort probe: unreachable host, bad JSON, unexpected shape, etc.
        tags = set()

    _DISCOVERY_CACHE[base_url] = tags
    return tags


def _build_ladder(specialty: str, served: set[str]) -> SpecialistLadder:
    """Assemble the ladder for one specialty.

    The rungs are the specialist sizes whose tag appears in `served` (smallest
    first), followed by the generic Granite tiers. Sizes that are not served
    are skipped. If nothing is served, or the tag pattern has no
    "{specialty}" placeholder, the ladder holds only the generic tiers.
    """
    rungs: list[Tier] = []
    if "{specialty}" in _SMOL_PATTERN:
        candidates = (
            (size, _SMOL_PATTERN.format(specialty=specialty, size=size))
            for size in _SPECIALIST_SIZES
        )
        rungs = [
            Tier(f"{size}-{specialty}", tag)
            for size, tag in candidates
            if tag in served
        ]
    return SpecialistLadder(specialty=specialty, tiers=rungs + _GENERIC_TIERS)


# Template for the matrix preset. Its `ladders` start empty; load_preset()
# returns a fresh SpecialistPreset with the per-specialty ladders filled in.
_HAL_MATRIX = SpecialistPreset(
    key="hal-matrix",
    base_url="http://10.8.0.6:11434/v1",
    api_key=os.environ.get("SMALLCODE_API_KEY", "ollama"),
    tiers=_GENERIC_TIERS,    # generic fallback ladder (inherited Preset.tiers)
    ladders={},              # built lazily in load_preset (needs the resolved base_url)
)

# Preset lookup by key; load_preset() selects from it using SMALLCODE_PRESET.
_PRESETS = {p.key: p for p in (_LAPTOP, _SPACE, _HAL, _HAL_SMOL, _HAL_MATRIX)}


def default_ui_model(preset: Preset, cfg: dict) -> str:
    """Pick the model the web UI should preselect.

    A truthy `cfg["model"]` wins and is returned as a string. Otherwise the
    preset's entry-tier model is used, or "" when the preset has no tiers.
    """
    configured = cfg.get("model")
    if configured:
        return str(configured)
    return preset.default_model if preset.tiers else ""


def load_preset() -> Preset:
    """Resolve the active preset, applying env overrides and Rust config.toml.

    Resolution:
      * preset   - SMALLCODE_PRESET (default "hal-matrix"); an unknown key falls
                   back to the laptop preset.
      * base_url - SMALLCODE_BASE_URL, else config.toml `base_url`, else the preset's.
      * api_key  - SMALLCODE_API_KEY, else the preset's.
      * model    - SMALLCODE_MODEL forces a single "custom" tier; a config.toml
                   `model` only does so for non-matrix presets, and only when it
                   differs from the preset's entry tier.

    An override that is set but empty (e.g. `SMALLCODE_BASE_URL=`) is treated as
    unset, the same way SMALLCODE_MODEL is.

    Returns:
        A new Preset; a SpecialistPreset with per-specialty ladders when the
        matrix preset is active and no SMALLCODE_MODEL override is set.
    """
    # Default to the 2D specialist matrix so "Auto" routes by specialty out of the box;
    # it auto-detects served specialists and falls back to the generic Granite ladder
    # (per-specialty: ladder_for(); whole matrix: _discover_served -> _HAL_SERVED).
    # An empty/blank SMALLCODE_PRESET means "use the default", not "unknown key".
    key = (os.environ.get("SMALLCODE_PRESET") or "").strip().lower() or "hal-matrix"
    base = _PRESETS.get(key, _LAPTOP)

    rust_cfg: dict = {}
    try:
        from .rust_session import load_rust_config
        rust_cfg = load_rust_config() or {}
    except Exception:
        # Best-effort: the Rust config is optional, so any import or load
        # failure just means "no config.toml overrides".
        rust_cfg = {}

    # `or` chains (not .get defaults) so empty strings fall through to the next source.
    base_url = (os.environ.get("SMALLCODE_BASE_URL")
                or rust_cfg.get("base_url")
                or base.base_url)
    api_key = os.environ.get("SMALLCODE_API_KEY") or base.api_key

    # An explicit env SMALLCODE_MODEL is a hard single-model override and wins over
    # everything (including the matrix). A `model` in config.toml is only a *default*
    # — it must NOT silently disable the matrix when the user explicitly asked for it
    # via SMALLCODE_PRESET=hal-matrix.
    env_model = os.environ.get("SMALLCODE_MODEL")
    if env_model:
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=[Tier("custom", env_model)])

    if isinstance(base, SpecialistPreset):
        # Discovery needs the RESOLVED base_url; an empty result (failure or nothing
        # served) falls back to the static tag set.
        served = _discover_served(base_url, api_key) or _HAL_SERVED
        ladders = {s: _build_ladder(s, served) for s in _SPECIALTIES}
        return SpecialistPreset(key=base.key, base_url=base_url, api_key=api_key,
                                tiers=_GENERIC_TIERS, ladders=ladders)

    # A config.toml `model` is a DEFAULT, not a hard override (that's SMALLCODE_MODEL,
    # handled above). If it just names this preset's entry tier — the common case, e.g.
    # the CLI default == hal-smol's 1.5B entry — keep the full escalation LADDER (so the
    # router + judge still work). Only a model that ISN'T the preset entry is treated as
    # a deliberate single-model choice.
    forced = rust_cfg.get("model")
    if forced and base.tiers and forced != base.default_model:
        return Preset(key=base.key, base_url=base_url, api_key=api_key,
                      tiers=[Tier("custom", forced)])

    return Preset(key=base.key, base_url=base_url, api_key=api_key, tiers=base.tiers)