Spaces:

build-small-hackathon
/

FitCheck

Running on Zero

File size: 10,918 Bytes

"""
Speed estimation: how fast will it actually feel?

Two-tier design, with provenance the UI always shows:

  1. TRAINED MODEL (when present): an XGBoost regressor trained on real
     community measurements (LocalScore, ~33k data points), following the
     methodology of LLM-Pilot (IBM, SC'24, arXiv:2410.02425 — gradient
     boosting over hardware+model features, validated leave-one-accelerator-
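     out). Loaded, if scripts/train_speed_model.py has been run, from
     model/speed_model.json (XGBoost's native format) or, failing that, from
     model/speed_model.skops. It only answers inside the feature range its
     training data covered. method = "measured-model".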
  2. ROOFLINE BASELINE (always available, fully offline): decode is memory-
     bandwidth-bound — tok/s ~ bandwidth / bytes-read-per-token (weights +
     KV), times an empirical efficiency factor. See kipply's "Transformer
     Inference Arithmetic" and the JAX scaling book inference chapter.
     method = "roofline".

The anti-gimmick rule lives in the training script: the trained model ships
only if it beats this baseline on held-out hardware; otherwise the baseline
IS the product and the UI says so.

Scope note (honest): this predicts LLM/VLM decode speed. Vision (YOLO) and
diffusion models are COMPUTE-bound, not bandwidth-bound — FPS scales with
TFLOPS / model GFLOPs, a different axis with different data (Ultralytics
publishes per-size GFLOPs and official T4 latencies; dbgpu has per-GPU
TFLOPS). That path is designed in SPEED-BRICK-RESEARCH.md §8 but not built;
non-LLM families keep their provenance-labelled memory verdicts only, rather
than getting fake speed numbers.
"""

import json
import re
from functools import lru_cache
from pathlib import Path

# Two directory levels above this file; data/ and model/ are resolved from it.
_ROOT = Path(__file__).resolve().parent.parent
# JSON table of hardware specs, read by _specs().
_SPECS_PATH = _ROOT / "data" / "gpu_specs.json"
# skops-serialised trained speed model; _trained_model() tries it after the
# XGBoost-native JSON artifact.
_MODEL_PATH = _ROOT / "model" / "speed_model.skops"

# Decode efficiency vs theoretical bandwidth roofline. Real stacks land well
# under the ceiling; 0.55-0.70 is the typical consumer-GPU range in community
# measurements. We centre conservatively and report a band, never a point.
# In the roofline path these multiply bandwidth / bytes-per-token: MID gives
# the headline "tps", LO and HI give the reported "lo"/"hi" band.
_EFF_MID, _EFF_LO, _EFF_HI = 0.60, 0.42, 0.78
# Conservative system-RAM bandwidth for offload modelling (dual-channel DDR4/5).
_RAM_BW_GBS = 48.0
# Reading speed reference: ~4.5 words/s, ~0.75 words per token -> ~6 tok/s.
_READING_TPS = 6.0


@lru_cache(maxsize=1)
def _specs() -> dict:
    """Load the hardware spec table from ``_SPECS_PATH`` (cached after the first call).

    Returns:
        A dict that always contains the ``"gpus"``, ``"apple"`` and ``"sbc"``
        sections (each a dict, possibly empty), plus whatever else the file
        holds. Callers can therefore index those sections unconditionally.

    A missing or unreadable file silently yields empty sections. A file that
    exists but is not a usable JSON object also yields empty sections, but
    says so on stderr: a broken data file should not take every lookup down
    with it, nor should it pass unnoticed.
    """
    specs = {"gpus": {}, "apple": {}, "sbc": {}}
    try:
        loaded = json.loads(_SPECS_PATH.read_text(encoding="utf-8"))
    except OSError:
        return specs
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        import sys
        print(f"[FitCheck] WARNING: {_SPECS_PATH.name} is not valid JSON "
              f"({exc!r}) — hardware lookups will find nothing",
              file=sys.stderr, flush=True)
        return specs

    if not isinstance(loaded, dict):
        import sys
        print(f"[FitCheck] WARNING: {_SPECS_PATH.name} does not hold a JSON "
              f"object — hardware lookups will find nothing",
              file=sys.stderr, flush=True)
        return specs

    specs.update(loaded)
    # A section present but not a mapping (e.g. null) would break `.items()`
    # in the callers; treat it as empty.
    for section in ("gpus", "apple", "sbc"):
        if not isinstance(specs[section], dict):
            specs[section] = {}
    return specs


def _norm(s: str) -> str:
    """Canonicalise a hardware name for substring matching.

    Lower-cases the text, turns every character other than ``a-z``, ``0-9``
    and space into a space, collapses whitespace runs to a single space and
    trims the ends. A falsy input (``None``, ``""``) yields ``""``.
    """
    lowered = (s or "").lower()
    alnum_only = re.sub(r"[^a-z0-9 ]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.strip()


@lru_cache(maxsize=1)
def _bw_index() -> tuple:
    """Build the GPU lookup index from the ``"gpus"`` section of the spec table.

    Returns:
        A tuple of ``(normalised name, bandwidth, vram)`` triples, with the
        numbers coerced to ``float`` (``vram`` defaults to 0 when absent).
        Entries are ordered longest name first so that a substring scan hits
        the most specific name before a shorter one it contains.
    """
    rows = [
        (_norm(gpu_name), float(entry["bw"]), float(entry.get("vram", 0)))
        for gpu_name, entry in _specs()["gpus"].items()
    ]
    # longest first: '4080 super' beats '4080' (stable for equal lengths)
    return tuple(sorted(rows, key=lambda row: -len(row[0])))


# Apple chips: the UI can only tell base/Pro/Max/Ultra apart, not the chip
# generation, so one generation has to stand in for all of them. M2-generation
# numbers are used as the conservative representative (older = slower).
# Filled lazily on the first _apple_bw() call.
_APPLE_TIER_BW = None


def _apple_bw(tier_hint: str) -> float:
    """Return the representative memory bandwidth for an Apple chip tier.

    Args:
        tier_hint: free-form label; matched case-insensitively for the
            substrings "ultra", "max", "pro" (checked in that order). Anything
            else, including ``None`` or ``""``, counts as the base tier.

    Returns:
        The ``"bw"`` value the spec table lists for the M2-generation chip of
        that tier (M1 Ultra as second choice for "ultra"), or a hard-coded
        figure when the table has no usable entry.
    """
    global _APPLE_TIER_BW
    if _APPLE_TIER_BW is None:
        by_chip = {chip: entry["bw"] for chip, entry in _specs()["apple"].items()}
        _APPLE_TIER_BW = {
            "ultra": by_chip.get("m2 ultra") or by_chip.get("m1 ultra") or 800.0,
            "max": by_chip.get("m2 max") or 400.0,
            "pro": by_chip.get("m2 pro") or 200.0,
            "base": by_chip.get("m2") or 100.0,
        }
    hint = (tier_hint or "").lower()
    tier = next((name for name in ("ultra", "max", "pro") if name in hint), "base")
    return _APPLE_TIER_BW[tier]


def bandwidth_for_spec(spec, gpu_label: str = "") -> tuple[float | None, str]:
    """(memory bandwidth GB/s on the fast path, source-note) for a machine."""
    if spec.is_apple_silicon:
        return _apple_bw(gpu_label or spec.gpu_label), "Apple unified memory (conservative M2-gen figure)"
    if spec.gpu_vendor in ("nvidia", "amd", "intel") and spec.vram_gb > 0:
        n = _norm(gpu_label or spec.gpu_label)
        # pass 1: name + VRAM proximity (disambiguates 8 vs 16 GB variants);
        # pass 2: name only — a custom VRAM override must not hide the chart.
        for check_vram in (True, False):
            for key, bw, vram in _bw_index():
                if key and key in n:
                    if check_vram and vram and spec.vram_gb and abs(vram - spec.vram_gb) > 4:
                        continue
                    return bw, "vendor spec sheet"
        return None, ""
    return None, ""


# --------------------------------------------------------------------------
# Trained model (optional, loaded if scripts/train_speed_model.py produced it)
# --------------------------------------------------------------------------

# XGBoost native-format model file; _trained_model() tries it before the
# skops artifact at _MODEL_PATH.
_MODEL_JSON_PATH = _ROOT / "model" / "speed_model.json"


@lru_cache(maxsize=1)
def _trained_model():
    """Load the optional trained speed predictor (result cached after the first call).

    Two artifacts are tried in order: the XGBoost-native JSON file at
    ``_MODEL_JSON_PATH``, then the skops file at ``_MODEL_PATH``. A successful
    load is announced on stdout. An artifact that exists but fails to load is
    reported on stderr and the next option is tried.

    Returns:
        The loaded model, or ``None`` when neither artifact could be loaded.
    """
    import sys

    # First choice is XGBoost's NATIVE format: zero extra deps at runtime.
    # The skops artifact exists for the Hub, but its loading chain dragged in
    # unrelated imports on the Space.
    if _MODEL_JSON_PATH.exists():
        try:
            from xgboost import XGBRegressor
            regressor = XGBRegressor()
            regressor.load_model(_MODEL_JSON_PATH)
            print(f"[FitCheck] speed predictor loaded from {_MODEL_JSON_PATH.name}", flush=True)
            return regressor
        except Exception as exc:  # noqa: BLE001
            print(f"[FitCheck] WARNING: {_MODEL_JSON_PATH.name} exists but failed "
                  f"to load ({exc!r}) — trying the skops artifact",
                  file=sys.stderr, flush=True)

    if _MODEL_PATH.exists():
        try:
            from skops.io import load as skops_load
            # skops refuses every type that is not explicitly trusted; the
            # two named here are exactly what scripts/train_speed_model.py
            # produces.
            trusted_types = [
                "xgboost.core.Booster", "xgboost.sklearn.XGBRegressor",
            ]
            regressor = skops_load(_MODEL_PATH, trusted=trusted_types)
            print(f"[FitCheck] speed predictor loaded from {_MODEL_PATH.name}", flush=True)
            return regressor
        except Exception as exc:  # noqa: BLE001
            # The file is there but unusable. Be loud about it: a quiet
            # fallback would hide a broken deploy behind plausible roofline
            # numbers.
            print(f"[FitCheck] WARNING: {_MODEL_PATH.name} exists but failed to "
                  f"load ({exc!r}) — falling back to the labelled roofline estimate",
                  file=sys.stderr, flush=True)
    return None


# Training metrics file; _envelope() reads its optional "envelope" entry.
_METRICS_PATH = _ROOT / "model" / "metrics.json"


@lru_cache(maxsize=1)
def _envelope() -> dict:
    """The region of feature space the training data actually covered.

    Decision trees cannot extrapolate: outside what they saw, they clamp to
    the nearest seen value and quietly give wrong answers (e.g. a 32B model
    gets a 14B's speed). The roofline DOES extrapolate — it's physics. So the
    trained model only answers inside its measured envelope; outside it, the
    labelled analytical estimate takes over. Bounds come from metrics.json
    when the training script recorded them, else conservative defaults
    matching the LocalScore grid (<=14B Q4 models, consumer hardware).

    Returns:
        A dict mapping feature name to a ``(low, high)`` pair of floats; it
        always has the keys ``"bytes_gb"`` and ``"eff_bw"``.

    A missing metrics file is normal and silently yields the defaults. A
    file that is present but malformed (invalid JSON, a non-object top level,
    bounds that are not a pair of numbers) also yields the defaults, with a
    warning on stderr, so a bad metrics file cannot make predictions raise.
    """
    env = {"bytes_gb": (0.8, 10.0), "eff_bw": (30.0, 1900.0)}
    try:
        rec = json.loads(_METRICS_PATH.read_text(encoding="utf-8")).get("envelope")
        if rec:
            recorded = {}
            for name, bounds in rec.items():
                low, high = bounds          # exactly two entries, or ValueError
                recorded[name] = (float(low), float(high))
            # Applied only once every entry has validated: all or nothing.
            env.update(recorded)
    except OSError:
        pass
    except (ValueError, TypeError, AttributeError) as exc:
        # ValueError covers JSONDecodeError, UnicodeDecodeError, a wrong
        # number of bounds and non-numeric strings; TypeError/AttributeError
        # cover wrongly-shaped JSON (e.g. a list at the top level).
        import sys
        print(f"[FitCheck] WARNING: {_METRICS_PATH.name} has an unusable "
              f"envelope ({exc!r}) — using the default envelope",
              file=sys.stderr, flush=True)
    return env


def _in_envelope(eff_bw: float, bytes_gb: float) -> bool:
    """Tell whether a configuration lies inside the trained model's envelope.

    Both ``bytes_gb`` and ``eff_bw`` must fall within their inclusive
    ``(low, high)`` bounds as reported by ``_envelope()``.
    """
    bounds = _envelope()
    size_range = bounds["bytes_gb"]
    if not size_range[0] <= bytes_gb <= size_range[1]:
        return False
    bw_range = bounds["eff_bw"]
    return bw_range[0] <= eff_bw <= bw_range[1]


# --------------------------------------------------------------------------
# Prediction
# --------------------------------------------------------------------------

def predict_decode_tps(
    *,
    bandwidth_gbs: float,
    weights_gb: float,
    kv_gb: float = 0.0,
    active_fraction: float = 1.0,
    offload_fraction: float = 0.0,
) -> dict:
    """Predict decode tokens/sec.

    Args:
        bandwidth_gbs: memory bandwidth of the fast path, in GB/s.
        weights_gb: size of the model weights, in GB.
        kv_gb: KV-cache size read per generated token, in GB.
        active_fraction: MoE models only read their active experts per token.
        offload_fraction: share of the model living in system RAM (0 = all on
            GPU). Clamped to [0, 1] before use.

    Returns:
        A dict with ``tps`` (headline estimate), ``lo``/``hi`` (band),
        ``bytes_gb`` and ``eff_bw`` (the derived inputs), ``method``
        (``"measured-model"`` or ``"roofline"``) and ``note`` (provenance
        text for the UI).

    The trained model answers only when it is loaded, the configuration is
    inside its measured envelope, and it returns a finite positive speed.
    Otherwise the roofline formula answers, and the note gives the real
    reason: either the configuration is out of range or the model could not
    answer.
    """
    import math
    import sys

    # Bytes read per generated token: the (active) weights + the KV cache.
    bytes_gb = max(weights_gb * active_fraction + kv_gb, 0.05)
    if active_fraction < 0.9:
        # MoE conservatism: expert routing scatters reads across the full
        # weight file, so real MoE decode lands well under the active-bytes
        # ideal. 1.5x is a deliberate under-promise until measured data
        # corrects it (community MoE numbers run ~50-70% of ideal).
        bytes_gb *= 1.5

    # Clamp once so the bandwidth blend and the model's feature vector see
    # the same value.
    offload = min(max(offload_fraction, 0.0), 1.0)
    eff_bw = bandwidth_gbs
    if offload > 0:
        if bandwidth_gbs > 0:
            # Time-weighted (harmonic) blend of fast-path and system-RAM bandwidth.
            eff_bw = 1.0 / ((1.0 - offload) / bandwidth_gbs + offload / _RAM_BW_GBS)
        else:
            # No usable fast-path bandwidth: the blend would divide by zero.
            # A fully offloaded model runs at RAM speed; any share left on
            # the fast path stalls it.
            eff_bw = _RAM_BW_GBS if offload >= 1.0 else 0.0

    model = _trained_model()
    in_envelope = model is not None and _in_envelope(eff_bw, bytes_gb)
    if in_envelope:
        tps = None
        try:
            import numpy as np
            x = np.array([[eff_bw, bytes_gb, weights_gb, kv_gb,
                           active_fraction, offload,
                           eff_bw / bytes_gb]])
            tps = float(model.predict(x)[0])
        except Exception as exc:  # noqa: BLE001 — fall through to roofline
            # Same policy as model loading: a failing predictor must not hide
            # silently behind plausible roofline numbers.
            print(f"[FitCheck] WARNING: trained speed model failed to predict "
                  f"({exc!r}) — using the roofline estimate",
                  file=sys.stderr, flush=True)
        else:
            if not (math.isfinite(tps) and tps > 0):
                # A regressor can emit NaN or non-positive values; those are
                # not speeds.
                print(f"[FitCheck] WARNING: trained speed model returned an "
                      f"implausible speed ({tps!r}) — using the roofline estimate",
                      file=sys.stderr, flush=True)
                tps = None
        if tps is not None:
            return {"tps": round(tps, 1),
                    "lo": round(tps * 0.8, 1), "hi": round(tps * 1.2, 1),
                    "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
                    "method": "measured-model",
                    "note": ("predicted by a model trained on real community "
                             "measurements (LocalScore), LLM-Pilot methodology")}

    base = eff_bw / bytes_gb
    note = ("analytical estimate: decode speed is memory-bandwidth-bound "
            "(bandwidth divided by bytes read per token)")
    if model is not None:
        if in_envelope:
            # In range, but the model failed or gave an unusable number.
            note += (" — the trained model could not answer for this "
                     "configuration, so the physics formula answers instead")
        else:
            note += (" — this configuration is outside the measured data's range, "
                     "so the physics formula answers instead of the trained model")
    return {"tps": round(base * _EFF_MID, 1),
            "lo": round(base * _EFF_LO, 1), "hi": round(base * _EFF_HI, 1),
            "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
            "method": "roofline", "note": note}


def feel_text(pred: dict) -> str:
    """Render a prediction as one honest, plain-English line.

    Args:
        pred: a prediction dict; the ``"tps"``, ``"lo"`` and ``"hi"`` entries
            are read.

    Returns:
        The speed, its likely band, and how it compares with reading speed
        (``_READING_TPS``).
    """
    rate = pred["tps"]
    low, high = pred["lo"], pred["hi"]
    # (multiple of reading speed, wording), fastest tier first; the first
    # tier the rate reaches wins.
    tiers = (
        (4, "much faster than you read"),
        (1.5, "faster than you read"),
        (0.7, "about reading speed"),
    )
    speed_word = next(
        (wording for factor, wording in tiers if rate >= _READING_TPS * factor),
        "slower than reading — fine for short tasks",
    )
    return f"~{rate:g} tok/s (likely {low:g}-{high:g}) — {speed_word}"