Spaces:
Running on Zero
Running on Zero
| """ | |
| Speed estimation: how fast will it actually feel? | |
| Two-tier design, with provenance the UI always shows: | |
| 1. TRAINED MODEL (when present): an XGBoost regressor trained on real | |
| community measurements (LocalScore, ~33k data points), following the | |
| methodology of LLM-Pilot (IBM, SC'24, arXiv:2410.02425 — gradient | |
| boosting over hardware+model features, validated leave-one-accelerator- | |
| out). Loaded from model/speed_model.json (XGBoost native format, tried | |
| first) or model/speed_model.skops if scripts/train_speed_model.py has | |
| been run. method = "measured-model". | |
| 2. ROOFLINE BASELINE (always available, fully offline): decode is memory- | |
| bandwidth-bound — tok/s ~ bandwidth / bytes-read-per-token (weights + | |
| KV), times an empirical efficiency factor. See kipply's "Transformer | |
| Inference Arithmetic" and the JAX scaling book inference chapter. | |
| method = "roofline". | |
| The anti-gimmick rule lives in the training script: the trained model ships | |
| only if it beats this baseline on held-out hardware; otherwise the baseline | |
| IS the product and the UI says so. | |
| Scope note (honest): this predicts LLM/VLM decode speed. Vision (YOLO) and | |
| diffusion models are COMPUTE-bound, not bandwidth-bound — FPS scales with | |
| TFLOPS / model GFLOPs, a different axis with different data (Ultralytics | |
| publishes per-size GFLOPs and official T4 latencies; dbgpu has per-GPU | |
| TFLOPS). That path is designed in SPEED-BRICK-RESEARCH.md §8 but not built; | |
| non-LLM families keep their provenance-labelled memory verdicts only, rather | |
| than getting fake speed numbers. | |
| """ | |
| import json | |
| import re | |
| from functools import lru_cache | |
| from pathlib import Path | |
# Project root: the directory two levels above this file.
_ROOT = Path(__file__).resolve().parent.parent
_SPECS_PATH = _ROOT.joinpath("data", "gpu_specs.json")
_MODEL_PATH = _ROOT.joinpath("model", "speed_model.skops")

# Decode efficiency vs theoretical bandwidth roofline. Real stacks land well
# under the ceiling; 0.55-0.70 is the typical consumer-GPU range in community
# measurements. We centre conservatively and report a band, never a point.
_EFF_LO = 0.42
_EFF_MID = 0.60
_EFF_HI = 0.78

# Conservative system-RAM bandwidth for offload modelling (dual-channel DDR4/5).
_RAM_BW_GBS = 48.0

# Reading speed reference: ~4.5 words/s, ~0.75 words per token -> ~6 tok/s.
_READING_TPS = 6.0
@lru_cache(maxsize=1)
def _specs() -> dict:
    """Return the hardware spec tables parsed from ``_SPECS_PATH``.

    The returned dict always contains the ``"gpus"``, ``"apple"`` and
    ``"sbc"`` sections; a section missing from the file is an empty dict, so
    callers can index them unconditionally.

    The file is read once per process and the result is memoised. Treat the
    returned dict as read-only, and call ``_specs.cache_clear()`` if the file
    is replaced at runtime.

    Fallbacks (all yield three empty sections):
      * the file cannot be read (``OSError``) — silent, matching the
        fully-offline default;
      * the file exists but is not valid JSON, or its top level is not an
        object — reported on stderr, because that indicates a broken deploy
        rather than a deliberately absent file.
    """
    import sys

    empty = {"gpus": {}, "apple": {}, "sbc": {}}
    try:
        loaded = json.loads(_SPECS_PATH.read_text(encoding="utf-8"))
    except OSError:
        return empty
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print(f"[FitCheck] WARNING: {_SPECS_PATH.name} exists but is not "
              f"valid JSON ({e!r}) — hardware bandwidth lookups are disabled",
              file=sys.stderr, flush=True)
        return empty
    if not isinstance(loaded, dict):
        print(f"[FitCheck] WARNING: {_SPECS_PATH.name} does not contain a "
              f"JSON object — hardware bandwidth lookups are disabled",
              file=sys.stderr, flush=True)
        return empty
    # Sections present in the file win; absent ones default to empty.
    return {**empty, **loaded}
def _norm(s: str) -> str:
    """Normalise a hardware label for substring matching.

    Lower-cases the text, turns every character other than ``a-z``, ``0-9``
    and space into a space, collapses whitespace runs to a single space and
    trims both ends. ``None`` or an empty string yields ``""``.
    """
    lowered = (s or "").lower()
    alnum_only = re.sub(r"[^a-z0-9 ]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", alnum_only)
    return collapsed.strip()
@lru_cache(maxsize=1)
def _bw_index() -> tuple:
    """Return the GPU lookup index as ``(normalised name, bandwidth, vram)`` rows.

    Rows come from the ``"gpus"`` section of ``_specs()``. Each entry must
    have a ``"bw"`` value; ``"vram"`` defaults to 0 when absent. Rows are
    ordered longest name first so a more specific name is tried before a
    shorter one it contains ('4080 super' beats '4080').

    The index is an immutable tuple and is built once per process; call
    ``_bw_index.cache_clear()`` if the underlying specs change at runtime.
    """
    rows = [
        (_norm(name), float(d["bw"]), float(d.get("vram", 0)))
        for name, d in _specs()["gpus"].items()
    ]
    rows.sort(key=lambda t: -len(t[0]))  # longest first: '4080 super' beats '4080'
    return tuple(rows)
# Apple chips: the UI only knows base/Pro/Max/Ultra, not the generation. We use
# M2-generation numbers as the conservative representative (older = slower).
_APPLE_TIER_BW = None


def _apple_bw(tier_hint: str) -> float:
    """Return the representative memory bandwidth for an Apple chip tier.

    The tier table is built lazily on the first call from the ``"apple"``
    section of ``_specs()`` and kept in the module-level ``_APPLE_TIER_BW``;
    hard-coded figures stand in for chips the spec file does not list.

    ``tier_hint`` is matched case-insensitively by substring, trying
    "ultra", then "max", then "pro"; anything else (including ``None`` or an
    empty string) is treated as the base chip.
    """
    global _APPLE_TIER_BW
    if _APPLE_TIER_BW is None:
        known = {chip: entry["bw"] for chip, entry in _specs()["apple"].items()}
        _APPLE_TIER_BW = {
            "ultra": known.get("m2 ultra") or known.get("m1 ultra") or 800.0,
            "max": known.get("m2 max") or 400.0,
            "pro": known.get("m2 pro") or 200.0,
            "base": known.get("m2") or 100.0,
        }
    hint = (tier_hint or "").lower()
    # First tier name found in the hint wins; the order matters.
    tier = next((name for name in ("ultra", "max", "pro") if name in hint), "base")
    return _APPLE_TIER_BW[tier]
def bandwidth_for_spec(spec, gpu_label: str = "") -> tuple[float | None, str]:
    """Look up the fast-path memory bandwidth (GB/s) for a machine.

    Args:
        spec: machine description; this function reads ``is_apple_silicon``,
            ``gpu_vendor``, ``vram_gb`` and ``gpu_label`` from it.
        gpu_label: optional label that overrides ``spec.gpu_label``.

    Returns:
        ``(bandwidth, source_note)``. ``(None, "")`` when the machine is
        neither Apple silicon nor an nvidia/amd/intel GPU with VRAM, or when
        no known GPU name occurs in the label.
    """
    if spec.is_apple_silicon:
        tier_bw = _apple_bw(gpu_label or spec.gpu_label)
        return tier_bw, "Apple unified memory (conservative M2-gen figure)"

    has_discrete_gpu = spec.gpu_vendor in ("nvidia", "amd", "intel") and spec.vram_gb > 0
    if not has_discrete_gpu:
        return None, ""

    wanted = _norm(gpu_label or spec.gpu_label)
    index = _bw_index()
    # Strict pass: the name must match AND the VRAM must be within 4 GB
    # (disambiguates 8 vs 16 GB variants). Relaxed pass: name only — a custom
    # VRAM override must not hide the chart.
    for strict in (True, False):
        for key, bw, vram in index:
            if not key or key not in wanted:
                continue
            if strict and vram and spec.vram_gb and abs(vram - spec.vram_gb) > 4:
                continue
            return bw, "vendor spec sheet"
    return None, ""
# --------------------------------------------------------------------------
# Trained model (optional, loaded if scripts/train_speed_model.py produced it)
# --------------------------------------------------------------------------
_MODEL_JSON_PATH = _ROOT / "model" / "speed_model.json"


@lru_cache(maxsize=1)
def _trained_model():
    """Load the trained speed regressor, or return ``None`` if unavailable.

    Load order:
      1. ``_MODEL_JSON_PATH`` — XGBoost's native format, loaded through
         ``XGBRegressor.load_model``.
      2. ``_MODEL_PATH`` — the skops artifact, loaded with an explicit
         trusted-type list.

    Any failure is reported on stderr and the next option is tried; when
    nothing loads the caller gets ``None`` and uses the roofline estimate.

    The outcome is memoised for the life of the process, so the artifact is
    read from disk once and the status line / warning is printed once, not on
    every prediction. A failed load is memoised too; call
    ``_trained_model.cache_clear()`` to retry after fixing the files.
    """
    import sys

    # Prefer XGBoost's NATIVE format: zero extra deps at runtime (the skops
    # artifact exists for the Hub, but its loading chain dragged in unrelated
    # imports on the Space).
    if _MODEL_JSON_PATH.exists():
        try:
            from xgboost import XGBRegressor
            model = XGBRegressor()
            model.load_model(_MODEL_JSON_PATH)
            print(f"[FitCheck] speed predictor loaded from {_MODEL_JSON_PATH.name}", flush=True)
            return model
        except Exception as e:  # noqa: BLE001
            print(f"[FitCheck] WARNING: {_MODEL_JSON_PATH.name} exists but failed "
                  f"to load ({e!r}) — trying the skops artifact",
                  file=sys.stderr, flush=True)

    if not _MODEL_PATH.exists():
        return None
    try:
        from skops.io import load as skops_load
        # skops only loads explicitly-trusted types — exactly these two, which
        # scripts/train_speed_model.py produces. Anything else is refused.
        model = skops_load(_MODEL_PATH, trusted=[
            "xgboost.core.Booster", "xgboost.sklearn.XGBRegressor",
        ])
        print(f"[FitCheck] speed predictor loaded from {_MODEL_PATH.name}", flush=True)
        return model
    except Exception as e:  # noqa: BLE001
        # The file exists but won't load — say so loudly (a silent fallback
        # here would hide a broken deploy behind plausible roofline numbers).
        print(f"[FitCheck] WARNING: {_MODEL_PATH.name} exists but failed to "
              f"load ({e!r}) — falling back to the labelled roofline estimate",
              file=sys.stderr, flush=True)
        return None
# Metrics file stored next to the model artifacts; _envelope() reads its
# optional "envelope" entry.
_METRICS_PATH = _ROOT / "model" / "metrics.json"
@lru_cache(maxsize=1)
def _envelope() -> dict:
    """The region of feature space the training data actually covered.

    Decision trees cannot extrapolate: outside what they saw, they clamp to
    the nearest seen value and quietly give wrong answers (e.g. a 32B model
    gets a 14B's speed). The roofline DOES extrapolate — it's physics. So the
    trained model only answers inside its measured envelope; outside it, the
    labelled analytical estimate takes over. Bounds come from metrics.json
    when the training script recorded them, else conservative defaults
    matching the LocalScore grid (<=14B Q4 models, consumer hardware).

    Returns:
        A dict mapping feature name to a ``(low, high)`` tuple; it always has
        ``"bytes_gb"`` and ``"eff_bw"``.

    A missing or unreadable metrics file silently yields the defaults. A file
    that is not valid JSON, or a recorded bound that is not a pair of
    numbers, is reported on stderr and ignored rather than raised, so a bad
    metrics file cannot break prediction. The result is memoised per process;
    treat it as read-only and call ``_envelope.cache_clear()`` if the file
    changes at runtime.
    """
    import sys

    def warn(problem: str) -> None:
        print(f"[FitCheck] WARNING: {_METRICS_PATH.name} {problem} — "
              f"using default envelope bounds where needed",
              file=sys.stderr, flush=True)

    env = {"bytes_gb": (0.8, 10.0), "eff_bw": (30.0, 1900.0)}
    try:
        raw = json.loads(_METRICS_PATH.read_text(encoding="utf-8"))
    except OSError:
        return env
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        warn(f"is not valid JSON ({e!r})")
        return env

    rec = raw.get("envelope") if isinstance(raw, dict) else None
    if not rec:
        return env
    if not isinstance(rec, dict):
        warn("has an 'envelope' entry that is not an object")
        return env

    for key, bounds in rec.items():
        try:
            lo, hi = bounds
            env[key] = (float(lo), float(hi))
        except (TypeError, ValueError):
            # Keep whatever bound (default or none) was there before.
            warn(f"has malformed bounds for {key!r}")
    return env
def _in_envelope(eff_bw: float, bytes_gb: float) -> bool:
    """Tell whether a configuration lies inside the trained model's envelope.

    Both ``bytes_gb`` and ``eff_bw`` must fall within the inclusive
    ``(low, high)`` bounds returned by ``_envelope()``.
    """
    bounds = _envelope()
    size_range = bounds["bytes_gb"]
    if not (size_range[0] <= bytes_gb <= size_range[1]):
        return False
    bw_range = bounds["eff_bw"]
    return bw_range[0] <= eff_bw <= bw_range[1]
| # -------------------------------------------------------------------------- | |
| # Prediction | |
| # -------------------------------------------------------------------------- | |
def predict_decode_tps(
    *,
    bandwidth_gbs: float,
    weights_gb: float,
    kv_gb: float = 0.0,
    active_fraction: float = 1.0,
    offload_fraction: float = 0.0,
) -> dict:
    """Predict decode tokens/sec.

    Args:
        bandwidth_gbs: memory bandwidth of the fast path in GB/s. It is used
            as a divisor when offloading, so it is assumed to be positive.
        weights_gb: size of the model weights in GB.
        kv_gb: size of the KV cache read per token, in GB.
        active_fraction: MoE models only read their active experts per token.
        offload_fraction: share of the model living in system RAM (0 = all on
            GPU); clamped to [0, 1] for the bandwidth blend.

    Returns:
        A dict with ``tps`` (central estimate), ``lo`` / ``hi`` (band),
        ``bytes_gb`` (bytes read per token), ``eff_bw`` (effective
        bandwidth), ``method`` (``"measured-model"`` or ``"roofline"``) and a
        human-readable ``note`` stating where the number came from.

    The trained model answers only when it is loaded, the configuration is
    inside its measured envelope, and it returns a finite, positive speed.
    In every other case the roofline formula answers, and the note says which
    of those reasons applied. A model failure is also reported on stderr
    instead of being swallowed.
    """
    import math
    import sys

    # Bytes read per generated token: the (active) weights + the KV cache.
    bytes_gb = max(weights_gb * active_fraction + kv_gb, 0.05)
    if active_fraction < 0.9:
        # MoE conservatism: expert routing scatters reads across the full
        # weight file, so real MoE decode lands well under the active-bytes
        # ideal. 1.5x is a deliberate under-promise until measured data
        # corrects it (community MoE numbers run ~50-70% of ideal).
        bytes_gb *= 1.5

    eff_bw = bandwidth_gbs
    if offload_fraction > 0:
        # Harmonic blend: per-token time is the sum of the time spent reading
        # the GPU-resident share and the RAM-resident share.
        f = min(max(offload_fraction, 0.0), 1.0)
        eff_bw = 1.0 / ((1.0 - f) / bandwidth_gbs + f / _RAM_BW_GBS)

    model = _trained_model()
    model_failed = False
    if model is not None and _in_envelope(eff_bw, bytes_gb):
        try:
            import numpy as np
            x = np.array([[eff_bw, bytes_gb, weights_gb, kv_gb,
                           active_fraction, offload_fraction,
                           eff_bw / bytes_gb]])
            tps = float(model.predict(x)[0])
            if not math.isfinite(tps) or tps <= 0:
                # A regressor can emit nonsense; never show it as a speed.
                raise ValueError(f"implausible predicted speed {tps!r}")
        except Exception as e:  # noqa: BLE001 — fall through to roofline
            model_failed = True
            print(f"[FitCheck] WARNING: trained speed model failed to predict "
                  f"({e!r}) — using the labelled roofline estimate",
                  file=sys.stderr, flush=True)
        else:
            return {"tps": round(tps, 1),
                    "lo": round(tps * 0.8, 1), "hi": round(tps * 1.2, 1),
                    "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
                    "method": "measured-model",
                    "note": ("predicted by a model trained on real community "
                             "measurements (LocalScore), LLM-Pilot methodology")}

    base = eff_bw / bytes_gb
    note = ("analytical estimate: decode speed is memory-bandwidth-bound "
            "(bandwidth divided by bytes read per token)")
    if model_failed:
        # Keep the provenance honest: the model was in range but broke.
        note += (" — the trained model could not produce a usable prediction "
                 "for this configuration, so the physics formula answers instead")
    elif model is not None:
        note += (" — this configuration is outside the measured data's range, "
                 "so the physics formula answers instead of the trained model")
    return {"tps": round(base * _EFF_MID, 1),
            "lo": round(base * _EFF_LO, 1), "hi": round(base * _EFF_HI, 1),
            "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
            "method": "roofline", "note": note}
def feel_text(pred: dict) -> str:
    """Render a prediction as one plain-English line.

    Reads ``tps``, ``lo`` and ``hi`` from ``pred`` and compares ``tps`` with
    multiples of the reading-speed reference ``_READING_TPS`` to choose the
    wording.
    """
    tps = pred["tps"]
    lo, hi = pred["lo"], pred["hi"]
    # (multiple of reading speed, wording), checked from fastest to slowest.
    tiers = (
        (4, "much faster than you read"),
        (1.5, "faster than you read"),
        (0.7, "about reading speed"),
    )
    speed_word = next(
        (wording for factor, wording in tiers if tps >= _READING_TPS * factor),
        "slower than reading — fine for short tasks",
    )
    return f"~{tps:g} tok/s (likely {lo:g}-{hi:g}) — {speed_word}"