"""
Speed estimation: how fast will it actually feel?
Two-tier design, with provenance the UI always shows:
1. TRAINED MODEL (when present): an XGBoost regressor trained on real
community measurements (LocalScore, ~33k data points), following the
methodology of LLM-Pilot (IBM, SC'24, arXiv:2410.02425 — gradient
boosting over hardware+model features, validated leave-one-accelerator-
out). Loaded from model/speed_model.skops if scripts/train_speed_model.py
has been run. method = "measured-model".
2. ROOFLINE BASELINE (always available, fully offline): decode is memory-
bandwidth-bound — tok/s ~ bandwidth / bytes-read-per-token (weights +
KV), times an empirical efficiency factor. See kipply's "Transformer
Inference Arithmetic" and the JAX scaling book inference chapter.
method = "roofline".
The anti-gimmick rule lives in the training script: the trained model ships
only if it beats this baseline on held-out hardware; otherwise the baseline
IS the product and the UI says so.
Scope note (honest): this predicts LLM/VLM decode speed. Vision (YOLO) and
diffusion models are COMPUTE-bound, not bandwidth-bound — FPS scales with
TFLOPS / model GFLOPs, a different axis with different data (Ultralytics
publishes per-size GFLOPs and official T4 latencies; dbgpu has per-GPU
TFLOPS). That path is designed in SPEED-BRICK-RESEARCH.md §8 but not built;
non-LLM families keep their provenance-labelled memory verdicts only, rather
than getting fake speed numbers.
"""
import json
import re
from functools import lru_cache
from pathlib import Path
# Project root: two directory levels above this file.
_ROOT = Path(__file__).resolve().parent.parent
_SPECS_PATH = _ROOT.joinpath("data", "gpu_specs.json")
_MODEL_PATH = _ROOT.joinpath("model", "speed_model.skops")

# Fraction of the theoretical bandwidth roofline that decode actually reaches.
# Real stacks land well under the ceiling; 0.55-0.70 is the typical
# consumer-GPU range in community measurements. The centre is deliberately
# conservative and the estimate is always reported as a band, never a point.
_EFF_LO = 0.42
_EFF_MID = 0.60
_EFF_HI = 0.78

# Conservative system-RAM bandwidth (GB/s) used when modelling offload
# (dual-channel DDR4/5).
_RAM_BW_GBS = 48.0

# Reading-speed reference: ~4.5 words/s at ~0.75 words per token -> ~6 tok/s.
_READING_TPS = 6.0
@lru_cache(maxsize=1)
def _specs() -> dict:
    """Load the hardware spec table from ``_SPECS_PATH`` (cached after first call).

    Returns:
        A dict that always contains the ``"gpus"``, ``"apple"`` and ``"sbc"``
        sections, each a dict. A missing file yields empty sections; so does a
        file that is not valid JSON or whose top level is not an object (with
        a warning on stderr). Any section that is absent or not a dict is
        replaced by an empty one, so callers can index the sections directly.
    """
    try:
        loaded = json.loads(_SPECS_PATH.read_text(encoding="utf-8"))
    except OSError:
        # No spec file shipped: run with an empty table.
        loaded = {}
    except ValueError as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        # Report it loudly instead of crashing every bandwidth lookup.
        import sys
        print(f"[FitCheck] WARNING: {_SPECS_PATH.name} is not valid JSON "
              f"({err!r}) — continuing with an empty hardware table",
              file=sys.stderr, flush=True)
        loaded = {}

    if not isinstance(loaded, dict):
        loaded = {}
    # Callers index these sections without checking, so guarantee them.
    for section in ("gpus", "apple", "sbc"):
        if not isinstance(loaded.get(section), dict):
            loaded[section] = {}
    return loaded
def _norm(s: str) -> str:
    """Normalise a hardware label for substring matching.

    Lower-cases *s* (``None``/empty becomes ``""``), turns every character
    other than a-z, 0-9 and space into a space, collapses whitespace runs to
    a single space and strips both ends.
    """
    lowered = (s or "").lower()
    alnum_only = re.sub(r"[^a-z0-9 ]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.strip()
@lru_cache(maxsize=1)
def _bw_index() -> tuple:
    """Build the cached GPU lookup table used for label matching.

    Each row is ``(normalised name, bandwidth, vram)`` taken from the
    ``"gpus"`` section of the spec table; ``vram`` defaults to 0 when the
    entry has none. Rows are ordered longest name first so that a more
    specific name ('4080 super') is tried before a shorter one ('4080').
    """
    rows = [
        (_norm(gpu_name), float(entry["bw"]), float(entry.get("vram", 0)))
        for gpu_name, entry in _specs()["gpus"].items()
    ]
    # Stable sort: rows with equally long names keep their spec-file order.
    return tuple(sorted(rows, key=lambda row: -len(row[0])))
# Apple chips: the UI only knows base/Pro/Max/Ultra, not the generation. We use
# M2-generation numbers as the conservative representative (older = slower).
# Lazily-built map of Apple tier -> bandwidth; filled on first _apple_bw() call.
_APPLE_TIER_BW = None


def _apple_bw(tier_hint: str) -> float:
    """Return the memory bandwidth used for an Apple chip tier.

    The tier is picked by substring from *tier_hint* (case-insensitive),
    checking "ultra", then "max", then "pro"; anything else counts as the
    base chip. Per-tier figures come from the ``"apple"`` section of the spec
    table, with hard-coded fallbacks when an entry is missing or falsy.
    """
    global _APPLE_TIER_BW
    if _APPLE_TIER_BW is None:
        chip_bw = {chip: entry["bw"] for chip, entry in _specs()["apple"].items()}
        _APPLE_TIER_BW = {
            "ultra": chip_bw.get("m2 ultra") or chip_bw.get("m1 ultra") or 800.0,
            "max": chip_bw.get("m2 max") or 400.0,
            "pro": chip_bw.get("m2 pro") or 200.0,
            "base": chip_bw.get("m2") or 100.0,
        }

    hint = (tier_hint or "").lower()
    # First matching tier wins, so "ultra" outranks "max" outranks "pro".
    tier = next((name for name in ("ultra", "max", "pro") if name in hint), "base")
    return _APPLE_TIER_BW[tier]
def bandwidth_for_spec(spec, gpu_label: str = "") -> tuple[float | None, str]:
"""(memory bandwidth GB/s on the fast path, source-note) for a machine."""
if spec.is_apple_silicon:
return _apple_bw(gpu_label or spec.gpu_label), "Apple unified memory (conservative M2-gen figure)"
if spec.gpu_vendor in ("nvidia", "amd", "intel") and spec.vram_gb > 0:
n = _norm(gpu_label or spec.gpu_label)
# pass 1: name + VRAM proximity (disambiguates 8 vs 16 GB variants);
# pass 2: name only — a custom VRAM override must not hide the chart.
for check_vram in (True, False):
for key, bw, vram in _bw_index():
if key and key in n:
if check_vram and vram and spec.vram_gb and abs(vram - spec.vram_gb) > 4:
continue
return bw, "vendor spec sheet"
return None, ""
return None, ""
# --------------------------------------------------------------------------
# Trained model (optional, loaded if scripts/train_speed_model.py produced it)
# --------------------------------------------------------------------------
_MODEL_JSON_PATH = _ROOT.joinpath("model", "speed_model.json")


@lru_cache(maxsize=1)
def _trained_model():
    """Return the trained speed regressor, or ``None`` if none can be loaded.

    Tries the XGBoost native JSON artifact first, then the skops artifact.
    The result (including ``None``) is cached, so loading is attempted once
    per process. Load failures are reported on stderr rather than raised.
    """
    import sys

    # Prefer XGBoost's NATIVE format: zero extra deps at runtime (the skops
    # artifact exists for the Hub, but its loading chain dragged in unrelated
    # imports on the Space).
    if _MODEL_JSON_PATH.exists():
        try:
            from xgboost import XGBRegressor
            regressor = XGBRegressor()
            regressor.load_model(_MODEL_JSON_PATH)
            print(f"[FitCheck] speed predictor loaded from {_MODEL_JSON_PATH.name}", flush=True)
            return regressor
        except Exception as err:  # noqa: BLE001
            print(f"[FitCheck] WARNING: {_MODEL_JSON_PATH.name} exists but failed "
                  f"to load ({err!r}) — trying the skops artifact",
                  file=sys.stderr, flush=True)

    if _MODEL_PATH.exists():
        try:
            from skops.io import load as skops_load
            # skops only loads explicitly-trusted types — exactly these two,
            # which scripts/train_speed_model.py produces. Anything else is
            # refused.
            regressor = skops_load(_MODEL_PATH, trusted=[
                "xgboost.core.Booster", "xgboost.sklearn.XGBRegressor",
            ])
            print(f"[FitCheck] speed predictor loaded from {_MODEL_PATH.name}", flush=True)
            return regressor
        except Exception as err:  # noqa: BLE001
            # The file exists but won't load — say so loudly (a silent
            # fallback here would hide a broken deploy behind plausible
            # roofline numbers).
            print(f"[FitCheck] WARNING: {_MODEL_PATH.name} exists but failed to "
                  f"load ({err!r}) — falling back to the labelled roofline estimate",
                  file=sys.stderr, flush=True)

    # No artifact present, or nothing loadable: callers use the roofline.
    return None
_METRICS_PATH = _ROOT / "model" / "metrics.json"


@lru_cache(maxsize=1)
def _envelope() -> dict:
    """The region of feature space the training data actually covered.

    Decision trees cannot extrapolate: outside what they saw, they clamp to
    the nearest seen value and quietly give wrong answers (e.g. a 32B model
    gets a 14B's speed). The roofline DOES extrapolate — it's physics. So the
    trained model only answers inside its measured envelope; outside it, the
    labelled analytical estimate takes over. Bounds come from metrics.json
    when the training script recorded them, else conservative defaults
    matching the LocalScore grid (<=14B Q4 models, consumer hardware).

    Returns:
        A dict mapping a feature name to a ``(low, high)`` tuple of floats.
        ``"bytes_gb"`` and ``"eff_bw"`` are always present. A missing,
        malformed or oddly-shaped metrics file never raises: unusable entries
        are skipped and the defaults stay in place, so the roofline path
        remains available.
    """
    env = {"bytes_gb": (0.8, 10.0), "eff_bw": (30.0, 1900.0)}
    try:
        metrics = json.loads(_METRICS_PATH.read_text(encoding="utf-8"))
    except OSError:
        # No metrics file: keep the defaults.
        return env
    except ValueError as err:
        # json.JSONDecodeError / UnicodeDecodeError. Report it, keep defaults.
        import sys
        print(f"[FitCheck] WARNING: {_METRICS_PATH.name} is not valid JSON "
              f"({err!r}) — using the default measured envelope",
              file=sys.stderr, flush=True)
        return env

    recorded = metrics.get("envelope") if isinstance(metrics, dict) else None
    if not isinstance(recorded, dict):
        return env

    for feature, bounds in recorded.items():
        try:
            low, high = bounds
            env[feature] = (float(low), float(high))
        except (TypeError, ValueError):
            # Not a (low, high) pair of numbers: keep whatever was there.
            continue
    return env
def _in_envelope(eff_bw: float, bytes_gb: float) -> bool:
    """Return whether both features lie inside the measured envelope.

    Bounds are inclusive at both ends and come from ``_envelope()``.
    """
    bounds = _envelope()
    size_range = bounds["bytes_gb"]
    if not (size_range[0] <= bytes_gb <= size_range[1]):
        return False
    bw_range = bounds["eff_bw"]
    return bw_range[0] <= eff_bw <= bw_range[1]
# --------------------------------------------------------------------------
# Prediction
# --------------------------------------------------------------------------
def predict_decode_tps(
    *,
    bandwidth_gbs: float,
    weights_gb: float,
    kv_gb: float = 0.0,
    active_fraction: float = 1.0,
    offload_fraction: float = 0.0,
) -> dict:
    """Predict decode tokens/sec.

    Args:
        bandwidth_gbs: memory bandwidth of the fast path, in GB/s.
        weights_gb: size of the model weights, in GB.
        kv_gb: size of the KV cache read per token, in GB.
        active_fraction: MoE models only read their active experts per token.
        offload_fraction: share of the model living in system RAM
            (0 = all on GPU). Clamped to [0, 1] for the bandwidth blend.

    Returns:
        A dict with ``tps`` (central estimate), ``lo``/``hi`` (band),
        ``bytes_gb``, ``eff_bw``, ``method`` (``"measured-model"`` or
        ``"roofline"``) and a human-readable ``note``.

        The trained model answers only when it is loaded, the configuration
        is inside the measured envelope, and its prediction is a finite
        positive number. In every other case the roofline estimate is
        returned, with a note that says why.
    """
    import math
    import sys

    # Bytes read per generated token: the (active) weights + the KV cache,
    # floored at 0.05 GB so the divisions below never see a zero divisor.
    bytes_gb = max(weights_gb * active_fraction + kv_gb, 0.05)
    if active_fraction < 0.9:
        # MoE conservatism: expert routing scatters reads across the full
        # weight file, so real MoE decode lands well under the active-bytes
        # ideal. 1.5x is a deliberate under-promise until measured data
        # corrects it (community MoE numbers run ~50-70% of ideal).
        bytes_gb *= 1.5

    eff_bw = bandwidth_gbs
    if offload_fraction > 0:
        f = min(max(offload_fraction, 0.0), 1.0)
        # Time per byte is the share-weighted sum of the GPU and system-RAM
        # costs; the effective bandwidth is its reciprocal.
        eff_bw = 1.0 / ((1.0 - f) / bandwidth_gbs + f / _RAM_BW_GBS)

    model = _trained_model()
    fallback_reason = ""
    if model is not None:
        if _in_envelope(eff_bw, bytes_gb):
            try:
                import numpy as np
                x = np.array([[eff_bw, bytes_gb, weights_gb, kv_gb,
                               active_fraction, offload_fraction,
                               eff_bw / bytes_gb]])
                tps = float(model.predict(x)[0])
                # A regressor can emit NaN or a non-positive value; neither
                # is a speed, so treat it like any other predictor failure.
                if not math.isfinite(tps) or tps <= 0:
                    raise ValueError(f"unusable prediction {tps!r}")
            except Exception as e:  # noqa: BLE001 — fall through to roofline
                # Say so loudly: a silent fallback would hide a broken
                # predictor behind plausible roofline numbers.
                print(f"[FitCheck] WARNING: speed predictor failed for this "
                      f"request ({e!r}) — falling back to the labelled "
                      f"roofline estimate",
                      file=sys.stderr, flush=True)
                fallback_reason = (" — the trained model could not produce a "
                                   "usable prediction for this configuration, "
                                   "so the physics formula answers instead")
            else:
                # Measured-model band is a fixed +/-20% around the prediction.
                return {"tps": round(tps, 1),
                        "lo": round(tps * 0.8, 1), "hi": round(tps * 1.2, 1),
                        "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
                        "method": "measured-model",
                        "note": ("predicted by a model trained on real community "
                                 "measurements (LocalScore), LLM-Pilot methodology")}
        else:
            fallback_reason = (" — this configuration is outside the measured "
                               "data's range, so the physics formula answers "
                               "instead of the trained model")

    # Roofline: bandwidth / bytes-per-token, scaled by the efficiency band.
    base = eff_bw / bytes_gb
    note = ("analytical estimate: decode speed is memory-bandwidth-bound "
            "(bandwidth divided by bytes read per token)") + fallback_reason
    return {"tps": round(base * _EFF_MID, 1),
            "lo": round(base * _EFF_LO, 1), "hi": round(base * _EFF_HI, 1),
            "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
            "method": "roofline", "note": note}
def feel_text(pred: dict) -> str:
    """Turn a prediction dict into one plain-English line.

    Reads ``pred["tps"]``, ``pred["lo"]`` and ``pred["hi"]`` and compares the
    central estimate against multiples of the reading-speed reference.
    """
    rate = pred["tps"]
    low, high = pred["lo"], pred["hi"]

    # Highest threshold first; the first one the rate reaches decides.
    ladder = (
        (_READING_TPS * 4, "much faster than you read"),
        (_READING_TPS * 1.5, "faster than you read"),
        (_READING_TPS * 0.7, "about reading speed"),
    )
    verdict = "slower than reading — fine for short tasks"
    for floor, wording in ladder:
        if rate >= floor:
            verdict = wording
            break
    return f"~{rate:g} tok/s (likely {low:g}-{high:g}) — {verdict}"
|