File size: 10,918 Bytes
ee8ca43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca2bb8e
 
 
 
 
 
 
 
ee8ca43
 
 
 
 
 
 
 
51b6fd5
 
 
ee8ca43
 
51b6fd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee8ca43
 
 
 
0935028
 
 
 
 
ee8ca43
 
 
 
 
 
 
 
 
 
 
 
0935028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee8ca43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0935028
ee8ca43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0935028
 
 
 
 
ee8ca43
 
 
0935028
ee8ca43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
"""
Speed estimation: how fast will it actually feel?

Two-tier design, with provenance the UI always shows:

  1. TRAINED MODEL (when present): an XGBoost regressor trained on real
     community measurements (LocalScore, ~33k data points), following the
     methodology of LLM-Pilot (IBM, SC'24, arXiv:2410.02425 — gradient
     boosting over hardware+model features, validated leave-one-accelerator-
     out). Loaded from model/speed_model.json (XGBoost native format, tried
     first) or model/speed_model.skops if scripts/train_speed_model.py has
     been run. method = "measured-model".
  2. ROOFLINE BASELINE (always available, fully offline): decode is memory-
     bandwidth-bound — tok/s ~ bandwidth / bytes-read-per-token (weights +
     KV), times an empirical efficiency factor. See kipply's "Transformer
     Inference Arithmetic" and the JAX scaling book inference chapter.
     method = "roofline".

The anti-gimmick rule lives in the training script: the trained model ships
only if it beats this baseline on held-out hardware; otherwise the baseline
IS the product and the UI says so.

Scope note (honest): this predicts LLM/VLM decode speed. Vision (YOLO) and
diffusion models are COMPUTE-bound, not bandwidth-bound — FPS scales with
TFLOPS / model GFLOPs, a different axis with different data (Ultralytics
publishes per-size GFLOPs and official T4 latencies; dbgpu has per-GPU
TFLOPS). That path is designed in SPEED-BRICK-RESEARCH.md §8 but not built;
non-LLM families keep their provenance-labelled memory verdicts only, rather
than getting fake speed numbers.
"""

import json
import re
from functools import lru_cache
from pathlib import Path

# Parent of the directory that contains this file; every data/model path
# below is resolved relative to it.
_ROOT = Path(__file__).resolve().parent.parent
# JSON hardware table read by _specs().
_SPECS_PATH = _ROOT / "data" / "gpu_specs.json"
# skops-serialised speed model, loaded by _trained_model().
_MODEL_PATH = _ROOT / "model" / "speed_model.skops"

# Decode efficiency vs theoretical bandwidth roofline. Real stacks land well
# under the ceiling; 0.55-0.70 is the typical consumer-GPU range in community
# measurements. We centre conservatively and report a band, never a point.
# The three factors scale the raw roofline figure into the reported
# "tps" / "lo" / "hi" values respectively.
_EFF_MID, _EFF_LO, _EFF_HI = 0.60, 0.42, 0.78
# Conservative system-RAM bandwidth for offload modelling (dual-channel DDR4/5).
# Same unit as the GPU bandwidth it is blended with in predict_decode_tps().
_RAM_BW_GBS = 48.0
# Reading speed reference: ~4.5 words/s, ~0.75 words per token -> ~6 tok/s.
# feel_text() compares predictions against multiples of this value.
_READING_TPS = 6.0


@lru_cache(maxsize=1)
def _specs() -> dict:
    """Load the hardware spec table from ``_SPECS_PATH`` (cached after first call).

    Returns:
        The parsed JSON object, guaranteed to contain the ``"gpus"``,
        ``"apple"`` and ``"sbc"`` sections as dicts. A missing or unreadable
        file, a file that does not parse, a top level that is not a JSON
        object, or a section that is absent / not an object all degrade to
        empty sections instead of raising, so callers that index
        ``_specs()["gpus"]`` or ``_specs()["apple"]`` never hit ``KeyError``.
    """
    try:
        loaded = json.loads(_SPECS_PATH.read_text(encoding="utf-8"))
    except OSError:
        # No spec file shipped: lookups simply find nothing.
        loaded = None
    except ValueError as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses. The file exists but is broken — say so rather than
        # crash every lookup or fail silently.
        import sys
        print(f"[FitCheck] WARNING: {_SPECS_PATH.name} exists but could not "
              f"be parsed ({err!r}) — hardware lookups will find nothing",
              file=sys.stderr, flush=True)
        loaded = None

    if not isinstance(loaded, dict):
        loaded = {}
    for section in ("gpus", "apple", "sbc"):
        if not isinstance(loaded.get(section), dict):
            loaded[section] = {}
    return loaded


def _norm(s: str) -> str:
    """Canonicalise a hardware label for substring matching.

    Lower-cases the text, turns every character outside ``a-z``, ``0-9`` and
    space into a space, then collapses runs of spaces and trims the ends.
    A falsy input (``None``, ``""``) normalises to the empty string.
    """
    lowered = (s or "").lower()
    # After this substitution only letters, digits and plain spaces remain,
    # so splitting on whitespace and re-joining collapses/trims in one go.
    spaced = re.sub(r"[^a-z0-9 ]", " ", lowered)
    return " ".join(spaced.split())


@lru_cache(maxsize=1)
def _bw_index() -> tuple:
    """Build the (cached) lookup table used to match GPU labels.

    Each row is ``(normalised name, bandwidth, vram)`` taken from the
    ``"gpus"`` section of the spec table; ``vram`` defaults to 0 when the
    entry does not record it. Rows are ordered by descending name length so
    the most specific name is tried first.
    """
    rows = [
        (_norm(gpu_name), float(entry["bw"]), float(entry.get("vram", 0)))
        for gpu_name, entry in _specs()["gpus"].items()
    ]
    # longest first: '4080 super' beats '4080' (stable for equal lengths)
    return tuple(sorted(rows, key=lambda row: len(row[0]), reverse=True))


# Apple chips: the caller only tells us base/Pro/Max/Ultra, never the chip
# generation, so M2-generation figures stand in as the conservative
# representative (older = slower). Filled lazily on first use.
_APPLE_TIER_BW = None


def _apple_bw(tier_hint: str) -> float:
    """Return the representative memory bandwidth for an Apple chip tier.

    The tier is picked by case-insensitive substring search of *tier_hint*
    for "ultra", then "max", then "pro"; anything else (including an empty
    or ``None`` hint) is treated as the base chip. Bandwidths come from the
    ``"apple"`` section of the spec table, with hard-coded fallbacks when an
    entry is missing or falsy.
    """
    global _APPLE_TIER_BW
    if _APPLE_TIER_BW is None:
        bw_of = {chip: entry["bw"] for chip, entry in _specs()["apple"].items()}
        _APPLE_TIER_BW = {
            "ultra": bw_of.get("m2 ultra") or bw_of.get("m1 ultra") or 800.0,
            "max": bw_of.get("m2 max") or 400.0,
            "pro": bw_of.get("m2 pro") or 200.0,
            "base": bw_of.get("m2") or 100.0,
        }

    hint = (tier_hint or "").lower()
    # Highest tier wins when several tier words appear in the hint.
    tier = next((name for name in ("ultra", "max", "pro") if name in hint), "base")
    return _APPLE_TIER_BW[tier]


def bandwidth_for_spec(spec, gpu_label: str = "") -> tuple[float | None, str]:
    """(memory bandwidth GB/s on the fast path, source-note) for a machine."""
    if spec.is_apple_silicon:
        return _apple_bw(gpu_label or spec.gpu_label), "Apple unified memory (conservative M2-gen figure)"
    if spec.gpu_vendor in ("nvidia", "amd", "intel") and spec.vram_gb > 0:
        n = _norm(gpu_label or spec.gpu_label)
        # pass 1: name + VRAM proximity (disambiguates 8 vs 16 GB variants);
        # pass 2: name only — a custom VRAM override must not hide the chart.
        for check_vram in (True, False):
            for key, bw, vram in _bw_index():
                if key and key in n:
                    if check_vram and vram and spec.vram_gb and abs(vram - spec.vram_gb) > 4:
                        continue
                    return bw, "vendor spec sheet"
        return None, ""
    return None, ""


# --------------------------------------------------------------------------
# Trained model (optional, loaded if scripts/train_speed_model.py produced it)
# --------------------------------------------------------------------------

# XGBoost native-format model file; _trained_model() tries it before the
# skops artifact at _MODEL_PATH.
_MODEL_JSON_PATH = _ROOT / "model" / "speed_model.json"


@lru_cache(maxsize=1)
def _trained_model():
    """Return the trained speed regressor, or ``None`` if none can be loaded.

    Tries, in order:
      1. ``_MODEL_JSON_PATH`` via ``XGBRegressor.load_model``;
      2. ``_MODEL_PATH`` via ``skops.io.load`` with an explicit trusted-type
         list.
    A successful load is announced on stdout; an artifact that exists but
    fails to load is reported on stderr before moving on. The result
    (including ``None``) is cached for the life of the process.
    """
    # Prefer XGBoost's NATIVE format: zero extra deps at runtime (the skops
    # artifact exists for the Hub, but its loading chain dragged in unrelated
    # imports on the Space).
    if _MODEL_JSON_PATH.exists():
        try:
            from xgboost import XGBRegressor
            regressor = XGBRegressor()
            regressor.load_model(_MODEL_JSON_PATH)
            print(f"[FitCheck] speed predictor loaded from {_MODEL_JSON_PATH.name}",
                  flush=True)
            return regressor
        except Exception as err:  # noqa: BLE001
            import sys
            print(f"[FitCheck] WARNING: {_MODEL_JSON_PATH.name} exists but "
                  f"failed to load ({err!r}) — trying the skops artifact",
                  file=sys.stderr, flush=True)

    if _MODEL_PATH.exists():
        try:
            from skops.io import load as skops_load
            # skops only loads explicitly-trusted types — exactly these two,
            # which scripts/train_speed_model.py produces. Anything else is
            # refused.
            trusted_types = ["xgboost.core.Booster", "xgboost.sklearn.XGBRegressor"]
            regressor = skops_load(_MODEL_PATH, trusted=trusted_types)
            print(f"[FitCheck] speed predictor loaded from {_MODEL_PATH.name}",
                  flush=True)
            return regressor
        except Exception as err:  # noqa: BLE001
            # The file exists but won't load — say so loudly (a silent
            # fallback here would hide a broken deploy behind plausible
            # roofline numbers).
            import sys
            print(f"[FitCheck] WARNING: {_MODEL_PATH.name} exists but failed "
                  f"to load ({err!r}) — falling back to the labelled roofline "
                  f"estimate",
                  file=sys.stderr, flush=True)

    # No artifact on disk, or none of them loaded.
    return None


# Metrics JSON file; _envelope() reads its optional "envelope" record.
_METRICS_PATH = _ROOT / "model" / "metrics.json"


@lru_cache(maxsize=1)
def _envelope() -> dict:
    """The region of feature space the training data actually covered.

    Decision trees cannot extrapolate: outside what they saw, they clamp to
    the nearest seen value and quietly give wrong answers (e.g. a 32B model
    gets a 14B's speed). The roofline DOES extrapolate — it's physics. So the
    trained model only answers inside its measured envelope; outside it, the
    labelled analytical estimate takes over. Bounds come from metrics.json
    when the training script recorded them, else conservative defaults
    matching the LocalScore grid (<=14B Q4 models, consumer hardware).

    Returns:
        A dict mapping feature name to a ``(low, high)`` tuple; it always
        contains ``"bytes_gb"`` and ``"eff_bw"``. The result is cached.

    A missing, unreadable or malformed metrics file, and any recorded entry
    that is not a pair of numbers, are ignored in favour of the defaults —
    a bad metrics file must not take every prediction down with it.
    """
    env = {"bytes_gb": (0.8, 10.0), "eff_bw": (30.0, 1900.0)}
    try:
        metrics = json.loads(_METRICS_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return env

    recorded = metrics.get("envelope") if isinstance(metrics, dict) else None
    if not isinstance(recorded, dict):
        return env

    for feature, bounds in recorded.items():
        is_pair = isinstance(bounds, (list, tuple)) and len(bounds) == 2
        # bool is an int subclass; a JSON true/false is not a usable bound.
        if is_pair and all(
            isinstance(b, (int, float)) and not isinstance(b, bool) for b in bounds
        ):
            env[feature] = tuple(bounds)
    return env


def _in_envelope(eff_bw: float, bytes_gb: float) -> bool:
    """Tell whether a configuration lies inside the measured envelope.

    True only when *bytes_gb* and *eff_bw* both fall within the inclusive
    ``"bytes_gb"`` and ``"eff_bw"`` bounds returned by ``_envelope()``.
    """
    limits = _envelope()

    bytes_range = limits["bytes_gb"]
    if not (bytes_range[0] <= bytes_gb <= bytes_range[1]):
        return False

    bw_range = limits["eff_bw"]
    return bw_range[0] <= eff_bw <= bw_range[1]


# --------------------------------------------------------------------------
# Prediction
# --------------------------------------------------------------------------

def predict_decode_tps(
    *,
    bandwidth_gbs: float,
    weights_gb: float,
    kv_gb: float = 0.0,
    active_fraction: float = 1.0,
    offload_fraction: float = 0.0,
) -> dict:
    """Predict decode tokens/sec.

    Args:
        bandwidth_gbs: memory bandwidth of the fast path.
        weights_gb: size of the model weights.
        kv_gb: size of the KV cache read alongside the weights.
        active_fraction: MoE models only read their active experts per token.
        offload_fraction: share of the model living in system RAM (0 = all
            on GPU). Clamped to [0, 1] for the bandwidth blend.

    Returns:
        A dict with:
          * ``"tps"``, ``"lo"``, ``"hi"`` — central estimate and band,
            rounded to one decimal;
          * ``"bytes_gb"`` — bytes read per token used for the estimate;
          * ``"eff_bw"`` — effective bandwidth after the offload blend;
          * ``"method"`` — ``"measured-model"`` or ``"roofline"``;
          * ``"note"`` — provenance text explaining which path answered.
    """
    import math

    # Bytes read per generated token: the (active) weights + the KV cache.
    bytes_gb = max(weights_gb * active_fraction + kv_gb, 0.05)
    if active_fraction < 0.9:
        # MoE conservatism: expert routing scatters reads across the full
        # weight file, so real MoE decode lands well under the active-bytes
        # ideal. 1.5x is a deliberate under-promise until measured data
        # corrects it (community MoE numbers run ~50-70% of ideal).
        bytes_gb *= 1.5

    eff_bw = bandwidth_gbs
    if offload_fraction > 0:
        f = min(max(offload_fraction, 0.0), 1.0)
        if bandwidth_gbs > 0:
            # Time-weighted (harmonic) blend of GPU and system-RAM bandwidth.
            eff_bw = 1.0 / ((1.0 - f) / bandwidth_gbs + f / _RAM_BW_GBS)
        else:
            # No usable GPU bandwidth: take the blend's limit instead of
            # dividing by zero — RAM speed if everything is offloaded,
            # otherwise the GPU share can never finish.
            eff_bw = _RAM_BW_GBS if f >= 1.0 else 0.0

    model = _trained_model()
    model_failed = False
    if model is not None and _in_envelope(eff_bw, bytes_gb):
        try:
            import numpy as np
            x = np.array([[eff_bw, bytes_gb, weights_gb, kv_gb,
                           active_fraction, offload_fraction,
                           eff_bw / bytes_gb]])
            tps = float(model.predict(x)[0])
            # A regressor can emit NaN/inf or a non-positive value; that is
            # not a speed, so treat it like any other prediction failure.
            if not (math.isfinite(tps) and tps > 0):
                raise ValueError(f"non-physical prediction {tps!r}")
        except Exception as err:  # noqa: BLE001 — fall through to roofline
            import sys
            print(f"[FitCheck] WARNING: speed predictor failed on an in-range "
                  f"configuration ({err!r}) — falling back to the labelled "
                  f"roofline estimate",
                  file=sys.stderr, flush=True)
            model_failed = True
        else:
            return {"tps": round(tps, 1),
                    "lo": round(tps * 0.8, 1), "hi": round(tps * 1.2, 1),
                    "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
                    "method": "measured-model",
                    "note": ("predicted by a model trained on real community "
                             "measurements (LocalScore), LLM-Pilot methodology")}

    base = eff_bw / bytes_gb
    note = ("analytical estimate: decode speed is memory-bandwidth-bound "
            "(bandwidth divided by bytes read per token)")
    if model_failed:
        # Keep the provenance honest: the configuration WAS in range, the
        # model just could not answer.
        note += (" — the trained model could not produce a usable prediction "
                 "for this configuration, so the physics formula answers "
                 "instead")
    elif model is not None:
        note += (" — this configuration is outside the measured data's range, "
                 "so the physics formula answers instead of the trained model")
    return {"tps": round(base * _EFF_MID, 1),
            "lo": round(base * _EFF_LO, 1), "hi": round(base * _EFF_HI, 1),
            "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
            "method": "roofline", "note": note}


def feel_text(pred: dict) -> str:
    """Render a prediction as one honest, plain-English line.

    Reads ``"tps"``, ``"lo"`` and ``"hi"`` from *pred* and describes the
    central figure relative to the reading-speed reference ``_READING_TPS``.
    """
    tps = pred["tps"]
    lo, hi = pred["lo"], pred["hi"]

    # (minimum tok/s, wording), fastest band first; the first floor that the
    # prediction reaches decides the wording.
    bands = (
        (_READING_TPS * 4, "much faster than you read"),
        (_READING_TPS * 1.5, "faster than you read"),
        (_READING_TPS * 0.7, "about reading speed"),
    )
    speed_word = next(
        (wording for floor, wording in bands if tps >= floor),
        "slower than reading — fine for short tasks",
    )
    return f"~{tps:g} tok/s (likely {lo:g}-{hi:g}) — {speed_word}"