Spaces:

build-small-hackathon
/

FitCheck

Running on Zero

GPU spam guard (serialize + button lock), live progress tickers, describe-first layout, real-machine bindings, chart fix for custom VRAM

ca2bb8e verified about 14 hours ago

raw

history blame contribute delete

10.9 kB

	"""
	Speed estimation: how fast will it actually feel?

	Two-tier design, with provenance the UI always shows:

	1. TRAINED MODEL (when present): an XGBoost regressor trained on real
	community measurements (LocalScore, ~33k data points), following the
	methodology of LLM-Pilot (IBM, SC'24, arXiv:2410.02425 — gradient
	boosting over hardware+model features, validated leave-one-accelerator-
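	out). Loaded from model/speed_model.json (XGBoost's native format) when
	present, otherwise from model/speed_model.skops, once
	scripts/train_speed_model.py has been run. It only answers inside the
	range its training data covered. method = "measured-model".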
	2. ROOFLINE BASELINE (always available, fully offline): decode is memory-
	bandwidth-bound — tok/s ~ bandwidth / bytes-read-per-token (weights +
	KV), times an empirical efficiency factor. See kipply's "Transformer
	Inference Arithmetic" and the JAX scaling book inference chapter.
	method = "roofline".

	The anti-gimmick rule lives in the training script: the trained model ships
	only if it beats this baseline on held-out hardware; otherwise the baseline
	IS the product and the UI says so.

	Scope note (honest): this predicts LLM/VLM decode speed. Vision (YOLO) and
	diffusion models are COMPUTE-bound, not bandwidth-bound — FPS scales with
	TFLOPS / model GFLOPs, a different axis with different data (Ultralytics
	publishes per-size GFLOPs and official T4 latencies; dbgpu has per-GPU
	TFLOPS). That path is designed in SPEED-BRICK-RESEARCH.md §8 but not built;
	non-LLM families keep their provenance-labelled memory verdicts only, rather
	than getting fake speed numbers.
	"""

	import json
	import re
	from functools import lru_cache
	from pathlib import Path

	# Project root: two levels up from this file (its directory's parent).
	_ROOT = Path(__file__).resolve().parent.parent
	_SPECS_PATH = _ROOT.joinpath("data", "gpu_specs.json")
	_MODEL_PATH = _ROOT.joinpath("model", "speed_model.skops")

	# Fraction of the theoretical bandwidth roofline that decode actually
	# achieves. Community measurements put consumer GPUs at roughly 0.55-0.70,
	# so the midpoint is deliberately conservative and is always reported
	# together with a low/high band rather than as a single number.
	_EFF_MID = 0.60
	_EFF_LO = 0.42
	_EFF_HI = 0.78
	# System-RAM bandwidth (GB/s) assumed when part of the model is offloaded;
	# a conservative dual-channel DDR4/5 figure.
	_RAM_BW_GBS = 48.0
	# Human reading speed expressed in tokens: ~4.5 words/s at ~0.75 words per
	# token comes to ~6 tok/s.
	_READING_TPS = 6.0


	@lru_cache(maxsize=1)
	def _specs() -> dict:
	    """Load the hardware spec table from ``_SPECS_PATH`` (cached after first call).

	    Returns:
	        A dict that always contains the ``"gpus"``, ``"apple"`` and ``"sbc"``
	        sections. Sections present in the file override the empty defaults,
	        so callers can index ``_specs()["gpus"]`` without guarding.

	    A missing or unreadable file silently yields the empty table. A file that
	    exists but cannot be parsed also yields the empty table, with a warning
	    on stderr so a broken deploy is visible rather than crashing every lookup.
	    """
	    specs = {"gpus": {}, "apple": {}, "sbc": {}}
	    try:
	        loaded = json.loads(_SPECS_PATH.read_text(encoding="utf-8"))
	    except OSError:
	        return specs
	    except ValueError as e:
	        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
	        import sys
	        print(f"[FitCheck] WARNING: {_SPECS_PATH.name} exists but could not be "
	              f"parsed ({e!r}) — continuing without the hardware spec table",
	              file=sys.stderr, flush=True)
	        return specs
	    if isinstance(loaded, dict):
	        # Keep the default sections for any the file leaves out.
	        specs.update(loaded)
	    return specs


	def _norm(s: str) -> str:
	    """Normalise a hardware name for substring matching.

	    Lower-cases the text, turns every character other than ``a-z``, ``0-9``
	    and space into a space, collapses whitespace runs to single spaces and
	    trims the ends. ``None`` or an empty string gives ``""``.
	    """
	    lowered = (s or "").lower()
	    alnum_only = re.sub(r"[^a-z0-9 ]", " ", lowered)
	    collapsed = re.sub(r"\s+", " ", alnum_only)
	    return collapsed.strip()


	@lru_cache(maxsize=1)
	def _bw_index() -> tuple:
	    """Build the cached GPU lookup table from the ``"gpus"`` spec section.

	    Returns:
	        A tuple of ``(normalised name, bandwidth, vram)`` triples, with
	        bandwidth and VRAM as floats (VRAM is ``0.0`` when the entry has
	        none). Entries are ordered longest name first so that a more
	        specific name such as '4080 super' is tried before '4080'.
	    """
	    entries = [
	        (_norm(gpu_name), float(record["bw"]), float(record.get("vram", 0)))
	        for gpu_name, record in _specs()["gpus"].items()
	    ]
	    # Stable sort: names of equal length keep the order they had in the file.
	    return tuple(sorted(entries, key=lambda entry: -len(entry[0])))


	# The UI distinguishes Apple chips only by tier (base/Pro/Max/Ultra), never by
	# generation, so each tier is represented by its M2-generation bandwidth as a
	# conservative stand-in (older = slower). Filled lazily on first use.
	_APPLE_TIER_BW = None


	def _apple_bw(tier_hint: str) -> float:
	    """Return the representative memory bandwidth for an Apple chip tier.

	    Args:
	        tier_hint: free-text label; matched case-insensitively for the
	            substrings "ultra", "max" and "pro", in that order. Anything
	            else (including ``None`` or "") is treated as the base tier.

	    Returns:
	        The bandwidth recorded in the spec table's ``"apple"`` section for
	        that tier, or a built-in fallback (800/400/200/100) when the table
	        has no usable figure.
	    """
	    global _APPLE_TIER_BW
	    if _APPLE_TIER_BW is None:
	        measured = {chip: entry["bw"] for chip, entry in _specs()["apple"].items()}
	        _APPLE_TIER_BW = {
	            "ultra": measured.get("m2 ultra") or measured.get("m1 ultra") or 800.0,
	            "max": measured.get("m2 max") or 400.0,
	            "pro": measured.get("m2 pro") or 200.0,
	            "base": measured.get("m2") or 100.0,
	        }
	    hint = (tier_hint or "").lower()
	    # Highest tier is checked first; no match means the base chip.
	    tier = next((name for name in ("ultra", "max", "pro") if name in hint), "base")
	    return _APPLE_TIER_BW[tier]


	def bandwidth_for_spec(spec, gpu_label: str = "") -> tuple[float \| None, str]:
	"""(memory bandwidth GB/s on the fast path, source-note) for a machine."""
	if spec.is_apple_silicon:
	return _apple_bw(gpu_label or spec.gpu_label), "Apple unified memory (conservative M2-gen figure)"
	if spec.gpu_vendor in ("nvidia", "amd", "intel") and spec.vram_gb > 0:
	n = _norm(gpu_label or spec.gpu_label)
	# pass 1: name + VRAM proximity (disambiguates 8 vs 16 GB variants);
	# pass 2: name only — a custom VRAM override must not hide the chart.
	for check_vram in (True, False):
	for key, bw, vram in _bw_index():
	if key and key in n:
	if check_vram and vram and spec.vram_gb and abs(vram - spec.vram_gb) > 4:
	continue
	return bw, "vendor spec sheet"
	return None, ""
	return None, ""


	# --------------------------------------------------------------------------
	# Trained model (optional, loaded if scripts/train_speed_model.py produced it)
	# --------------------------------------------------------------------------

	# XGBoost native-format copy of the trained model, kept under model/.
	_MODEL_JSON_PATH = _ROOT.joinpath("model", "speed_model.json")


	@lru_cache(maxsize=1)
	def _trained_model():
	    """Return the trained speed regressor, or ``None`` if none can be loaded.

	    Tries the XGBoost native-format file first and the skops artifact second.
	    A file that exists but fails to load is reported on stderr, because a
	    silent fallback would hide a broken deploy behind plausible roofline
	    numbers. The result (including ``None``) is cached.
	    """
	    import sys

	    def _warn(text: str) -> None:
	        print(text, file=sys.stderr, flush=True)

	    # Native XGBoost format comes first: it needs no extra runtime deps,
	    # whereas the skops loading chain dragged in unrelated imports on the
	    # Space (the skops artifact exists for the Hub).
	    if _MODEL_JSON_PATH.exists():
	        try:
	            from xgboost import XGBRegressor
	            regressor = XGBRegressor()
	            regressor.load_model(_MODEL_JSON_PATH)
	            print(f"[FitCheck] speed predictor loaded from {_MODEL_JSON_PATH.name}", flush=True)
	            return regressor
	        except Exception as e:  # noqa: BLE001
	            _warn(f"[FitCheck] WARNING: {_MODEL_JSON_PATH.name} exists but failed "
	                  f"to load ({e!r}) — trying the skops artifact")

	    if _MODEL_PATH.exists():
	        try:
	            from skops.io import load as skops_load
	            # skops refuses any type not explicitly trusted; these two are
	            # exactly what scripts/train_speed_model.py produces.
	            regressor = skops_load(_MODEL_PATH, trusted=[
	                "xgboost.core.Booster", "xgboost.sklearn.XGBRegressor",
	            ])
	            print(f"[FitCheck] speed predictor loaded from {_MODEL_PATH.name}", flush=True)
	            return regressor
	        except Exception as e:  # noqa: BLE001
	            _warn(f"[FitCheck] WARNING: {_MODEL_PATH.name} exists but failed to "
	                  f"load ({e!r}) — falling back to the labelled roofline estimate")
	    return None


	# Metrics file kept under model/; read for its optional "envelope" entry.
	_METRICS_PATH = _ROOT.joinpath("model", "metrics.json")


	@lru_cache(maxsize=1)
	def _envelope() -> dict:
	    """The region of feature space the training data actually covered.

	    Decision trees cannot extrapolate: outside what they saw, they clamp to
	    the nearest seen value and quietly give wrong answers (e.g. a 32B model
	    gets a 14B's speed). The roofline DOES extrapolate — it's physics. So the
	    trained model only answers inside its measured envelope; outside it, the
	    labelled analytical estimate takes over.

	    Returns:
	        A dict mapping a feature name to a ``(low, high)`` tuple of floats.
	        ``"bytes_gb"`` and ``"eff_bw"`` are always present. Bounds come from
	        the ``"envelope"`` entry of metrics.json when it is recorded and
	        well-formed, else from conservative defaults matching the LocalScore
	        grid (<=14B Q4 models, consumer hardware).

	    A missing file is normal and silent. An unparseable file or a malformed
	    bound never raises: the file is reported on stderr, the bad bound is
	    skipped, and the default for that feature stays in place.
	    """
	    env = {"bytes_gb": (0.8, 10.0), "eff_bw": (30.0, 1900.0)}
	    try:
	        metrics = json.loads(_METRICS_PATH.read_text(encoding="utf-8"))
	    except OSError:
	        return env
	    except ValueError as e:
	        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
	        import sys
	        print(f"[FitCheck] WARNING: {_METRICS_PATH.name} exists but could not be "
	              f"parsed ({e!r}) — using the default envelope",
	              file=sys.stderr, flush=True)
	        return env
	    rec = metrics.get("envelope") if isinstance(metrics, dict) else None
	    if not isinstance(rec, dict):
	        return env
	    for key, bounds in rec.items():
	        try:
	            lo, hi = bounds
	            env[key] = (float(lo), float(hi))
	        except (TypeError, ValueError):
	            # Not a pair of numbers: keep whatever bound is already in place.
	            continue
	    return env


	def _in_envelope(eff_bw: float, bytes_gb: float) -> bool:
	    """Tell whether a configuration lies inside the trained model's envelope.

	    Both the bytes read per token and the effective bandwidth must fall
	    within their (inclusive) bounds from ``_envelope()``.
	    """
	    limits = _envelope()
	    size_ok = limits["bytes_gb"][0] <= bytes_gb <= limits["bytes_gb"][1]
	    return size_ok and (limits["eff_bw"][0] <= eff_bw <= limits["eff_bw"][1])


	# --------------------------------------------------------------------------
	# Prediction
	# --------------------------------------------------------------------------

	def predict_decode_tps(
	    *,
	    bandwidth_gbs: float,
	    weights_gb: float,
	    kv_gb: float = 0.0,
	    active_fraction: float = 1.0,
	    offload_fraction: float = 0.0,
	) -> dict:
	    """Predict decode tokens/sec.

	    Args:
	        bandwidth_gbs: memory bandwidth of the fast path, in GB/s.
	        weights_gb: size of the model weights, in GB.
	        kv_gb: size of the KV cache read per token, in GB.
	        active_fraction: MoE models only read their active experts per token.
	        offload_fraction: share of the model living in system RAM
	            (0 = all on GPU); clamped to the range 0..1.

	    Returns:
	        A dict with ``tps`` (central estimate), ``lo``/``hi`` (band),
	        ``bytes_gb`` and ``eff_bw`` (the inputs to the estimate), ``method``
	        ("measured-model" or "roofline") and a plain-English ``note`` saying
	        where the number came from.

	    The trained model answers only when it is loaded, the configuration is
	    inside its envelope, and its output is a finite positive number. In
	    every other case the roofline formula answers and the note says why.
	    """
	    # Bytes read per generated token: the (active) weights + the KV cache.
	    bytes_gb = max(weights_gb * active_fraction + kv_gb, 0.05)
	    if active_fraction < 0.9:
	        # MoE conservatism: expert routing scatters reads across the full
	        # weight file, so real MoE decode lands well under the active-bytes
	        # ideal. 1.5x is a deliberate under-promise until measured data
	        # corrects it (community MoE numbers run ~50-70% of ideal).
	        bytes_gb *= 1.5

	    # Clamp once so the bandwidth blend and the model feature always agree.
	    offload = min(max(offload_fraction, 0.0), 1.0)
	    eff_bw = bandwidth_gbs
	    if offload > 0:
	        if bandwidth_gbs > 0:
	            # Harmonic blend: time per byte is the weighted sum of both paths.
	            eff_bw = 1.0 / ((1.0 - offload) / bandwidth_gbs + offload / _RAM_BW_GBS)
	        else:
	            # No usable fast path: a fully offloaded model runs at RAM speed;
	            # a partly offloaded one has nothing to serve the on-GPU share.
	            eff_bw = _RAM_BW_GBS if offload >= 1.0 else 0.0

	    model = _trained_model()
	    model_unusable = False
	    if model is not None and _in_envelope(eff_bw, bytes_gb):
	        tps = None
	        try:
	            import numpy as np
	            x = np.array([[eff_bw, bytes_gb, weights_gb, kv_gb,
	                           active_fraction, offload,
	                           eff_bw / bytes_gb]])
	            tps = float(model.predict(x)[0])
	        except Exception as e:  # noqa: BLE001 — fall through to roofline
	            import sys
	            print(f"[FitCheck] WARNING: trained speed model failed to predict "
	                  f"({e!r}) — falling back to the labelled roofline estimate",
	                  file=sys.stderr, flush=True)
	        # A regressor can emit negative or non-finite values; never show those
	        # as a speed. The chained comparison is also False for NaN.
	        if tps is not None and 0.0 < tps < float("inf"):
	            return {"tps": round(tps, 1),
	                    "lo": round(tps * 0.8, 1), "hi": round(tps * 1.2, 1),
	                    "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
	                    "method": "measured-model",
	                    "note": ("predicted by a model trained on real community "
	                             "measurements (LocalScore), LLM-Pilot methodology")}
	        model_unusable = True

	    base = eff_bw / bytes_gb
	    note = ("analytical estimate: decode speed is memory-bandwidth-bound "
	            "(bandwidth divided by bytes read per token)")
	    if model_unusable:
	        note += (" — the trained model could not produce a usable prediction "
	                 "for this configuration, so the physics formula answers instead")
	    elif model is not None:
	        note += (" — this configuration is outside the measured data's range, "
	                 "so the physics formula answers instead of the trained model")
	    return {"tps": round(base * _EFF_MID, 1),
	            "lo": round(base * _EFF_LO, 1), "hi": round(base * _EFF_HI, 1),
	            "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1),
	            "method": "roofline", "note": note}


	def feel_text(pred: dict) -> str:
	    """Turn a prediction dict into one plain-English line.

	    Reads ``pred["tps"]``, ``pred["lo"]`` and ``pred["hi"]`` and describes the
	    central speed relative to human reading speed (``_READING_TPS``).
	    """
	    rate, low, high = pred["tps"], pred["lo"], pred["hi"]
	    # (multiple of reading speed, phrase), checked from fastest to slowest.
	    ladder = (
	        (4, "much faster than you read"),
	        (1.5, "faster than you read"),
	        (0.7, "about reading speed"),
	    )
	    verdict = "slower than reading — fine for short tasks"
	    for multiple, phrase in ladder:
	        if rate >= _READING_TPS * multiple:
	            verdict = phrase
	            break
	    return f"~{rate:g} tok/s (likely {low:g}-{high:g}) — {verdict}"