FitCheck / engine /speed.py
cn0303's picture
GPU spam guard (serialize + button lock), live progress tickers, describe-first layout, real-machine bindings, chart fix for custom VRAM
ca2bb8e verified
"""
Speed estimation: how fast will it actually feel?
Two-tier design, with provenance the UI always shows:
1. TRAINED MODEL (when present): an XGBoost regressor trained on real
community measurements (LocalScore, ~33k data points), following the
methodology of LLM-Pilot (IBM, SC'24, arXiv:2410.02425 — gradient
boosting over hardware+model features, validated leave-one-accelerator-
out). Loaded from model/speed_model.skops if scripts/train_speed_model.py
has been run. method = "measured-model".
2. ROOFLINE BASELINE (always available, fully offline): decode is memory-
bandwidth-bound — tok/s ~ bandwidth / bytes-read-per-token (weights +
KV), times an empirical efficiency factor. See kipply's "Transformer
Inference Arithmetic" and the JAX scaling book inference chapter.
method = "roofline".
The anti-gimmick rule lives in the training script: the trained model ships
only if it beats this baseline on held-out hardware; otherwise the baseline
IS the product and the UI says so.
Scope note (honest): this predicts LLM/VLM decode speed. Vision (YOLO) and
diffusion models are COMPUTE-bound, not bandwidth-bound — FPS scales with
TFLOPS / model GFLOPs, a different axis with different data (Ultralytics
publishes per-size GFLOPs and official T4 latencies; dbgpu has per-GPU
TFLOPS). That path is designed in SPEED-BRICK-RESEARCH.md §8 but not built;
non-LLM families keep their provenance-labelled memory verdicts only, rather
than getting fake speed numbers.
"""
import json
import re
from functools import lru_cache
from pathlib import Path
# Directory two levels above this file; every data/model path hangs off it.
_ROOT = Path(__file__).resolve().parent.parent
_SPECS_PATH = _ROOT.joinpath("data", "gpu_specs.json")
_MODEL_PATH = _ROOT.joinpath("model", "speed_model.skops")

# Fraction of the theoretical bandwidth roofline that decode actually reaches.
# Community measurements put typical consumer GPUs at roughly 0.55-0.70, well
# below the ceiling, so the midpoint is deliberately conservative and the
# low/high pair lets callers report a band instead of a single point.
_EFF_MID = 0.60
_EFF_LO = 0.42
_EFF_HI = 0.78

# System-RAM bandwidth (GB/s) assumed when modelling offload: a conservative
# figure for dual-channel DDR4/5.
_RAM_BW_GBS = 48.0

# Reference reading speed in tokens/s: ~4.5 words/s at ~0.75 words per token.
_READING_TPS = 6.0
@lru_cache(maxsize=1)
def _specs() -> dict:
    """Load the hardware spec table from ``_SPECS_PATH`` (cached after first use).

    Returns:
        The parsed JSON object, guaranteed to contain the ``"gpus"``,
        ``"apple"`` and ``"sbc"`` sections as dicts so callers can index them
        without guarding. When the file cannot be read or parsed, all three
        sections are empty.

    A missing or unreadable file is treated as "no spec data" and stays quiet.
    A file that exists but is not valid UTF-8 JSON, or whose top level is not
    an object, is reported on stderr before falling back, so a corrupted
    deploy does not masquerade as an empty database.
    """
    import sys

    sections = ("gpus", "apple", "sbc")
    try:
        loaded = json.loads(_SPECS_PATH.read_text(encoding="utf-8"))
    except OSError:
        return {name: {} for name in sections}
    except ValueError as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[FitCheck] WARNING: {_SPECS_PATH.name} exists but could not be "
              f"parsed ({e!r}) — continuing without hardware specs",
              file=sys.stderr, flush=True)
        return {name: {} for name in sections}

    if not isinstance(loaded, dict):
        print(f"[FitCheck] WARNING: {_SPECS_PATH.name} does not contain a JSON "
              f"object — continuing without hardware specs",
              file=sys.stderr, flush=True)
        return {name: {} for name in sections}

    # A partial file must not turn into a KeyError/AttributeError downstream.
    for name in sections:
        if not isinstance(loaded.get(name), dict):
            loaded[name] = {}
    return loaded
def _norm(s: str) -> str:
    """Normalise a hardware label for substring matching.

    Lower-cases *s* (``None``/empty becomes ``""``), replaces every character
    other than ``a-z``, ``0-9`` and space with a space, collapses whitespace
    runs to a single space and trims both ends.
    """
    lowered = (s or "").lower()
    spaced = re.sub(r"[^a-z0-9 ]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", spaced)
    return collapsed.strip()
@lru_cache(maxsize=1)
def _bw_index() -> tuple:
    """Build the GPU lookup table used for label matching (cached).

    Returns:
        A tuple of ``(normalised_name, bandwidth, vram)`` triples taken from
        the ``"gpus"`` section of the spec table; ``vram`` is ``0.0`` when an
        entry does not record it.
    """
    rows = [
        (_norm(gpu_name), float(entry["bw"]), float(entry.get("vram", 0)))
        for gpu_name, entry in _specs()["gpus"].items()
    ]
    # Longest names first, so a specific name such as '4080 super' is tried
    # before the shorter '4080' it contains.
    return tuple(sorted(rows, key=lambda row: -len(row[0])))
# Apple chips: the UI can only tell base/Pro/Max/Ultra apart, not the chip
# generation, so each tier is represented by its M2-generation bandwidth as a
# conservative stand-in (older = slower).
_APPLE_TIER_BW = None


def _apple_bw(tier_hint: str) -> float:
    """Return the representative memory bandwidth for an Apple chip tier.

    Args:
        tier_hint: Free-form label; matched case-insensitively for the
            substrings "ultra", "max" and "pro", in that order. Anything else
            (including ``None``/empty) counts as the base tier.

    Returns:
        The bandwidth recorded in the spec table's ``"apple"`` section for the
        tier's representative chip, or a built-in default when the table has
        no (or a falsy) value for it.
    """
    global _APPLE_TIER_BW
    table = _APPLE_TIER_BW
    if table is None:
        # Built lazily on first use, then kept in the module-level global.
        by_chip = {chip: entry["bw"] for chip, entry in _specs()["apple"].items()}
        table = {
            "ultra": by_chip.get("m2 ultra") or by_chip.get("m1 ultra") or 800.0,
            "max": by_chip.get("m2 max") or 400.0,
            "pro": by_chip.get("m2 pro") or 200.0,
            "base": by_chip.get("m2") or 100.0,
        }
        _APPLE_TIER_BW = table

    hint = (tier_hint or "").lower()
    tier = next((name for name in ("ultra", "max", "pro") if name in hint), "base")
    return table[tier]
def bandwidth_for_spec(spec, gpu_label: str = "") -> tuple[float | None, str]:
    """Look up the fast-path memory bandwidth (GB/s) for a machine.

    Args:
        spec: Machine description; this function reads its
            ``is_apple_silicon``, ``gpu_label``, ``gpu_vendor`` and ``vram_gb``
            attributes.
        gpu_label: Optional label that takes precedence over
            ``spec.gpu_label`` when non-empty.

    Returns:
        ``(bandwidth, source_note)``. ``(None, "")`` when the machine is not
        Apple silicon and either has no supported discrete GPU with VRAM or
        its label matches no known GPU name.
    """
    if spec.is_apple_silicon:
        tier_bw = _apple_bw(gpu_label or spec.gpu_label)
        return tier_bw, "Apple unified memory (conservative M2-gen figure)"

    if spec.gpu_vendor not in ("nvidia", "amd", "intel") or not (spec.vram_gb > 0):
        return None, ""

    label = _norm(gpu_label or spec.gpu_label)
    # First pass: the name must match AND the listed VRAM must be within 4 GB
    # of the machine's (tells 8 GB and 16 GB variants apart). Second pass:
    # name only, so a custom VRAM override still finds its GPU.
    for require_vram_match in (True, False):
        for known_name, known_bw, known_vram in _bw_index():
            if not known_name or known_name not in label:
                continue
            vram_mismatch = (
                require_vram_match
                and known_vram
                and spec.vram_gb
                and abs(known_vram - spec.vram_gb) > 4
            )
            if not vram_mismatch:
                return known_bw, "vendor spec sheet"
    return None, ""
# --------------------------------------------------------------------------
# Trained model (optional, loaded if scripts/train_speed_model.py produced it)
# --------------------------------------------------------------------------
_MODEL_JSON_PATH = _ROOT / "model" / "speed_model.json"


@lru_cache(maxsize=1)
def _trained_model():
    """Load the optional trained speed predictor (cached after the first call).

    The XGBoost native JSON artifact is tried first because it needs no extra
    runtime dependencies; the skops artifact is the second choice. Successful
    loads are announced on stdout, failed loads of an existing file on stderr.

    Returns:
        The loaded model, or ``None`` when no artifact exists or none loads.
    """
    import sys

    if _MODEL_JSON_PATH.exists():
        try:
            from xgboost import XGBRegressor

            regressor = XGBRegressor()
            regressor.load_model(_MODEL_JSON_PATH)
            loaded_msg = f"[FitCheck] speed predictor loaded from {_MODEL_JSON_PATH.name}"
            print(loaded_msg, flush=True)
            return regressor
        except Exception as e:  # noqa: BLE001
            json_warning = (
                f"[FitCheck] WARNING: {_MODEL_JSON_PATH.name} exists but failed "
                f"to load ({e!r}) — trying the skops artifact"
            )
            print(json_warning, file=sys.stderr, flush=True)

    if _MODEL_PATH.exists():
        try:
            from skops.io import load as skops_load

            # skops refuses any type that is not explicitly trusted; these two
            # are the ones scripts/train_speed_model.py produces.
            trusted_types = ["xgboost.core.Booster", "xgboost.sklearn.XGBRegressor"]
            regressor = skops_load(_MODEL_PATH, trusted=trusted_types)
            loaded_msg = f"[FitCheck] speed predictor loaded from {_MODEL_PATH.name}"
            print(loaded_msg, flush=True)
            return regressor
        except Exception as e:  # noqa: BLE001
            # The artifact is present but unusable: report it loudly, because a
            # silent fallback would hide a broken deploy behind plausible
            # roofline numbers.
            skops_warning = (
                f"[FitCheck] WARNING: {_MODEL_PATH.name} exists but failed to "
                f"load ({e!r}) — falling back to the labelled roofline estimate"
            )
            print(skops_warning, file=sys.stderr, flush=True)
    return None


_METRICS_PATH = _ROOT / "model" / "metrics.json"
@lru_cache(maxsize=1)
def _envelope() -> dict:
    """Return the region of feature space the training data actually covered.

    Decision trees cannot extrapolate: outside what they saw, they clamp to
    the nearest seen value and quietly give wrong answers (e.g. a 32B model
    gets a 14B's speed). The roofline DOES extrapolate — it's physics. So the
    trained model only answers inside its measured envelope; outside it, the
    labelled analytical estimate takes over.

    Returns:
        A dict mapping feature name to a ``(low, high)`` tuple of floats. It
        always contains ``"bytes_gb"`` and ``"eff_bw"``. Bounds come from the
        ``"envelope"`` object in metrics.json when the training script
        recorded them, else conservative defaults matching the LocalScore grid
        (<=14B Q4 models, consumer hardware).

    A missing file, unparsable JSON, or a wrongly shaped record never raises:
    defaults are kept for anything that cannot be read as two numbers, so a
    damaged metrics file cannot crash prediction or poison later comparisons.
    """
    import sys

    env = {"bytes_gb": (0.8, 10.0), "eff_bw": (30.0, 1900.0)}
    try:
        metrics = json.loads(_METRICS_PATH.read_text(encoding="utf-8"))
    except OSError:
        return env
    except ValueError as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[FitCheck] WARNING: {_METRICS_PATH.name} exists but could not "
              f"be parsed ({e!r}) — using the default model envelope",
              file=sys.stderr, flush=True)
        return env

    recorded = metrics.get("envelope") if isinstance(metrics, dict) else None
    if not isinstance(recorded, dict):
        return env

    for feature, bounds in recorded.items():
        try:
            low, high = (float(b) for b in bounds)
        except (TypeError, ValueError):
            # Not exactly two numeric bounds: keep whatever we already have.
            continue
        env[feature] = (low, high)
    return env
def _in_envelope(eff_bw: float, bytes_gb: float) -> bool:
    """Tell whether a configuration lies inside the trained model's envelope.

    Both ``bytes_gb`` and ``eff_bw`` must fall within their recorded
    ``(low, high)`` bounds, endpoints included.
    """
    bounds = _envelope()
    bytes_range = bounds["bytes_gb"]
    if not (bytes_range[0] <= bytes_gb <= bytes_range[1]):
        return False
    bw_range = bounds["eff_bw"]
    return bw_range[0] <= eff_bw <= bw_range[1]
# --------------------------------------------------------------------------
# Prediction
# --------------------------------------------------------------------------
def predict_decode_tps(
*,
bandwidth_gbs: float,
weights_gb: float,
kv_gb: float = 0.0,
active_fraction: float = 1.0,
offload_fraction: float = 0.0,
) -> dict:
"""Predict decode tokens/sec.
active_fraction: MoE models only read their active experts per token.
offload_fraction: share of the model living in system RAM (0 = all on GPU).
"""
# Bytes read per generated token: the (active) weights + the KV cache.
bytes_gb = max(weights_gb * active_fraction + kv_gb, 0.05)
if active_fraction < 0.9:
# MoE conservatism: expert routing scatters reads across the full
# weight file, so real MoE decode lands well under the active-bytes
# ideal. 1.5x is a deliberate under-promise until measured data
# corrects it (community MoE numbers run ~50-70% of ideal).
bytes_gb *= 1.5
eff_bw = bandwidth_gbs
if offload_fraction > 0:
f = min(max(offload_fraction, 0.0), 1.0)
eff_bw = 1.0 / ((1.0 - f) / bandwidth_gbs + f / _RAM_BW_GBS)
model = _trained_model()
if model is not None and _in_envelope(eff_bw, bytes_gb):
try:
import numpy as np
x = np.array([[eff_bw, bytes_gb, weights_gb, kv_gb,
active_fraction, offload_fraction,
eff_bw / bytes_gb]])
tps = float(model.predict(x)[0])
return {"tps": round(tps, 1),
"lo": round(tps * 0.8, 1), "hi": round(tps * 1.2, 1),
"bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
"method": "measured-model",
"note": ("predicted by a model trained on real community "
"measurements (LocalScore), LLM-Pilot methodology")}
except Exception: # noqa: BLE001 — fall through to roofline
pass
base = eff_bw / bytes_gb
note = ("analytical estimate: decode speed is memory-bandwidth-bound "
"(bandwidth divided by bytes read per token)")
if model is not None:
note += (" — this configuration is outside the measured data's range, "
"so the physics formula answers instead of the trained model")
return {"tps": round(base * _EFF_MID, 1),
"lo": round(base * _EFF_LO, 1), "hi": round(base * _EFF_HI, 1),
"bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
"method": "roofline", "note": note}
def feel_text(pred: dict) -> str:
    """Turn a prediction into one plain-English line.

    Args:
        pred: A prediction dict; its ``"tps"``, ``"lo"`` and ``"hi"`` entries
            are read.

    Returns:
        The central speed, the likely band, and a phrase comparing the central
        speed with the reference reading speed.
    """
    rate = pred["tps"]
    low, high = pred["lo"], pred["hi"]

    # (multiple of reading speed, phrase), checked from fastest to slowest.
    ladder = (
        (4, "much faster than you read"),
        (1.5, "faster than you read"),
        (0.7, "about reading speed"),
    )
    speed_word = "slower than reading — fine for short tasks"
    for multiple, phrase in ladder:
        if rate >= _READING_TPS * multiple:
            speed_word = phrase
            break
    return f"~{rate:g} tok/s (likely {low:g}-{high:g}) — {speed_word}"