Spaces:

abhijitramesh
/

webgpu-bench

Running

App Files Files Community

webgpu-bench / js /run /device.js

GitHub Actions

sync from abhijitramesh/webgpu-bench@f59fbdb982

a474e4b 18 days ago

history blame contribute delete

19.3 kB

	// Device-fit helpers for the interactive bench page.
	//
	// Two budget probes drive the per-variant fit decision:
	//
	// probeHeapBudgetMB() — empirical WASM heap probe. Grows a
	// WebAssembly.Memory page-by-page in a worker until it fails. Caps
	// the working set (KV cache + compute scratch + JS heap headroom)
	// llama.cpp consumes during inference.
	//
	// probeGpuBudgetMB() — empirical WebGPU memory probe. Allocates real
	// buffers with mappedAtCreation=true on the actual adapter until OOM.
	// Caps the size of model weights llama.cpp can hold in GPU buffers,
	// since OPFS-streaming keeps model bytes off the WASM heap.
	//
	// variantFits() then checks both: model size + GPU overhead ≤ GPU budget,
	// AND heap working-set floor ≤ heap budget. wllama doesn't probe at all
	// — they let load attempts fail naturally — but our auto-select buttons
	// ("All fit", "Run study") need a fit predicate, so we err on the side
	// of measuring rather than guessing.
	//
	// On wasm32 the linear memory caps at 4 GiB no matter how much physical
	// RAM the device has, so heap probe results above 4096 MB cannot exist.

	// Last-resort heap budget (MB). _computeBudget() falls back to it when the
	// heap probe fails and neither navigator.deviceMemory nor a storage quota
	// is available.
	const DEFAULT_BUDGET_MB = 2 * 1024;
	// Share of the navigator.storage.estimate() quota used as the heap budget
	// when the quota is the only signal left.
	const HOSTED_QUOTA_FRACTION = 0.4;
	// Upper bound (MB) applied to the quota-derived heap budget.
	const HOSTED_QUOTA_CAP_MB = 8 * 1024;

	// Mobile per-device budgets. Two independent caps, mirroring the desktop
	// path — model weights stream from OPFS into WebGPU buffers (see
	// bench-worker.js:patchMEMFS / opfsAlloc), so the model size constrains
	// `gpuBudgetMB`, not `heapBudgetMB`. The WASM heap only has to hold the
	// working set (KV cache + ggml compute scratch + JS heap headroom).
	//
	// Earlier we collapsed both into a single tab budget on the theory that
	// iOS Jetsam treats the whole tab process as one pool, so any allocation
	// counts the same. That's true for Jetsam — but it conflates where the
	// memory lives with how much the platform can hand out: the WASM heap
	// has a much tighter practical ceiling than the GPU side, and counting
	// model bytes against the heap ceiling rejected models that load fine
	// via OPFS streaming.
	//
	// Numbers come from public reports / Apple docs:
	//
	// - iPhone WASM practical limit: 300–450 MB → heap budget
	// lapcatsoftware.com/articles/2026/1/7.html
	// news.ycombinator.com/item?id=39039593
	// github.com/emscripten-core/emscripten/issues/19374
	// github.com/godotengine/godot/issues/70621
	//
	// - iOS Safari WebGPU maxBufferSize: 256 MB on iPhone 6 / older,
	// 993 MB on iPad Pro M-class. Per-buffer cap, not total.
	// Apple WWDC 2025 "Unlock GPU computing with WebGPU"
	//
	// - iPhone 12 Pro reports tab OOM around 1.5–3 GB; Jetsam intervenes
	// earlier under pressure. We undershoot the lower bound for headroom.
	// developer.apple.com/forums/thread/761666
	//
	// Heap budgets = WASM heap practical limits.
	// All values are in MB and are handed out as the `heap` field of
	// getMobileBudgetMB(). The iPhone value is also the fallback for an
	// unrecognised family.
	const IPHONE_HEAP_BUDGET_MB = 450;
	// NOTE(review): no source for the iPad / Android heap numbers is cited in
	// this file — confirm where they come from.
	const IPAD_HEAP_BUDGET_MB = 1500;
	const ANDROID_HEAP_BUDGET_MB = 800;

	// GPU budgets = available GPU-buffer capacity for model weights + KV
	// mirror, sized below the Jetsam tab ceiling minus working-set headroom.
	// Static per-family numbers — we don't probe on mobile because the
	// probe's allocation pulse itself triggers Jetsam on lower-RAM devices,
	// and WebKit doesn't expose a signal that distinguishes (e.g.) iPhone 13
	// from iPhone 17 Pro Max (same maxBufferSize 1024 MB on both, same
	// deviceMemory clamp). See "mobile probe" history in git: bounded
	// probe shipped, then disabled because the iPhone 13 / mid-RAM iPad
	// classes still Jetsamed during or right after the probe pulse.
	//
	// iPhone: empirical — 1200 MB caused tab reloads on first variant of a
	// Run study (Llama-3.2-1B Q2_K, 554 MB) on iPhone 17 Pro Max. 700 MB
	// keeps Llama-1B variants out of variantFits while still allowing the
	// 250–500 MB tier (gemma-3-270m Q8, Qwen3-0.6B Q4, etc.).
	//
	// iPad: empirical — 2500 MB Jetsamed on Llama-3.2-1B (likely Q4_K_M
	// = 770 MB or Q8_0 = 1259 MB). 1500 MB excludes Llama-1B Q8_0 (1459 MB
	// after overhead) but allows Q4_K_M (970 MB), keeping the standard
	// study quant runnable. High-end iPad Pro M-class probably tolerates
	// more, but we have no way to detect device class.
	// All values are in MB and are handed out as the `gpu` field of
	// getMobileBudgetMB(). The iPhone value is also the fallback for an
	// unrecognised family.
	const IPHONE_GPU_BUDGET_MB = 700;
	const IPAD_GPU_BUDGET_MB = 1500;
	// NOTE(review): no empirical note for the Android value appears in this
	// file — confirm it was measured rather than copied from the iPad number.
	const ANDROID_GPU_BUDGET_MB = 1500;

	/**
	 * Classify the current browser into a mobile device family.
	 *
	 * Checks run in priority order and the first match wins:
	 *   1. "iPad" in the user agent                          → 'ipad'
	 *   2. touch-capable device whose platform contains Mac  → 'ipad'
	 *   3. "iPhone" or "iPod" in the user agent              → 'iphone'
	 *   4. "Android … Mobile" in the user agent              → 'android'
	 *   5. UA Client Hints report `mobile === true`          → 'android'
	 *
	 * @returns {'ipad' | 'iphone' | 'android' | null} The detected family, or
	 *   null when nothing matched or `navigator` does not exist.
	 */
	function detectMobileFamily() {
	  if (typeof navigator === 'undefined') return null;
	  const ua = navigator.userAgent || '';
	  if (/iPad/.test(ua)) return 'ipad';
	  // iPadOS 13+ reports "Macintosh" UA but exposes touch; that's the
	  // standard iPad-detection workaround.
	  if (navigator.maxTouchPoints > 1 && /Mac/.test(navigator.platform || '')) return 'ipad';
	  // Unescaped alternation: an escaped pipe would only match the literal
	  // text "iPhone|iPod" and never a real device.
	  if (/iPhone|iPod/.test(ua)) return 'iphone';
	  if (/Android.*Mobile/.test(ua)) return 'android';
	  // Any other browser that self-reports as mobile is bucketed as Android.
	  if (navigator.userAgentData?.mobile === true) return 'android';
	  return null;
	}

	/**
	 * Look up the static memory budgets for a mobile device family.
	 *
	 * @param {string | null} family - Value produced by detectMobileFamily().
	 * @returns {{ heap: number, gpu: number }} Heap and GPU budgets in MB. Any
	 *   family other than 'ipad' or 'android' gets the iPhone numbers, which
	 *   are the smallest and therefore the safest default.
	 */
	function getMobileBudgetMB(family) {
	  switch (family) {
	    case 'ipad':
	      return { heap: IPAD_HEAP_BUDGET_MB, gpu: IPAD_GPU_BUDGET_MB };
	    case 'android':
	      return { heap: ANDROID_HEAP_BUDGET_MB, gpu: ANDROID_GPU_BUDGET_MB };
	    case 'iphone':
	    default:
	      return { heap: IPHONE_HEAP_BUDGET_MB, gpu: IPHONE_GPU_BUDGET_MB };
	  }
	}

	// Default timeout (ms) for the WASM heap probe in probeHeapBudgetMB().
	const PROBE_TIMEOUT_MS = 15_000;
	// Defaults for probeGpuBudgetMB(): size of each buffer it allocates (MB),
	// the total at which it stops allocating (MB), and its time limit (ms).
	const GPU_PROBE_STEP_MB = 256;
	const GPU_PROBE_MAX_MB = 8 * 1024;
	const GPU_PROBE_TIMEOUT_MS = 8_000;

	// Working-set floor in the WASM heap. KV cache + compute buffers + JS
	// heap headroom for a typical 1B model at n_ctx=2048 add up to a few
	// hundred MB. Floor at 256 so an absurdly-tiny heap (or a probe failure
	// that returned 0) doesn't pass variantFits.
	const HEAP_WORKING_SET_FLOOR_MB = 256;

	// Per-variant overhead added on top of the model file size when checking
	// GPU fit. Covers compute buffers, alignment padding, and the KV cache
	// mirror that the WebGPU backend keeps. A flat 200 MB is a conservative
	// approximation; in practice it scales somewhat with model + context size.
	const GPU_VARIANT_OVERHEAD_MB = 200;

	/**
	 * Report whether the page is running on a device that
	 * detectMobileFamily() recognises as mobile.
	 *
	 * @returns {boolean} true when a mobile family was detected.
	 */
	export function isMobileDevice() {
	  const family = detectMobileFamily();
	  return family !== null;
	}

	// ──────────────── WASM heap probe ────────────────

	/**
	 * Run the WASM heap probe in a dedicated worker and report what it
	 * managed to commit.
	 *
	 * Spawns `./memory-probe.js`, forwards `{ stepPages, maxPages }` to it and
	 * waits for a single reply carrying `committedMB`. The worker is always
	 * terminated before the promise settles.
	 *
	 * The returned promise never rejects. Every failure mode — worker
	 * construction error, postMessage error, timeout, worker `onerror`
	 * (typically the probe itself ran the engine out of memory) — resolves to
	 * `{ probedMB: 0, error }`.
	 *
	 * @param {object} [options]
	 * @param {number} [options.stepPages] - Forwarded to the worker unchanged.
	 * @param {number} [options.maxPages] - Forwarded to the worker unchanged.
	 * @param {number} [options.timeoutMs=PROBE_TIMEOUT_MS] - Give up after this long.
	 * @returns {Promise<{ probedMB: number, error?: string }>}
	 */
	export function probeHeapBudgetMB({ stepPages, maxPages, timeoutMs = PROBE_TIMEOUT_MS } = {}) {
	  return new Promise((resolve) => {
	    let worker;
	    try {
	      worker = new Worker(new URL('./memory-probe.js', import.meta.url));
	    } catch (err) {
	      resolve({ probedMB: 0, error: `worker construct failed: ${err.message}` });
	      return;
	    }

	    // Single exit path: stop the timer, kill the worker, settle. Later
	    // calls are harmless because resolve() ignores everything after the
	    // first settlement.
	    let timer = null;
	    const settle = (result) => {
	      clearTimeout(timer);
	      try { worker.terminate(); } catch { /* noop */ }
	      resolve(result);
	    };

	    timer = setTimeout(() => settle({ probedMB: 0, error: 'probe timeout' }), timeoutMs);

	    worker.onmessage = (e) => {
	      const { committedMB = 0 } = e.data || {};
	      settle({ probedMB: committedMB });
	    };
	    worker.onerror = (err) => {
	      settle({ probedMB: 0, error: err.message || 'worker error' });
	    };

	    try {
	      worker.postMessage({ stepPages, maxPages });
	    } catch (err) {
	      // Keep the "never rejects" contract even if the message can't be sent.
	      settle({ probedMB: 0, error: `worker postMessage failed: ${err.message}` });
	    }
	  });
	}

	// ──────────────── GPU memory probe ────────────────

	/**
	 * Empirically measure how much WebGPU buffer memory can be allocated.
	 *
	 * Creates STORAGE buffers of `stepMB` each until an allocation fails, the
	 * running total would exceed `maxMB`, the device is lost, or `timeoutMs`
	 * elapses. Every buffer is created with mappedAtCreation=true and written
	 * to once, to force a real commit (some drivers lazy-allocate until first
	 * use otherwise). OOM is captured through an 'out-of-memory' error scope,
	 * with device.lost as a backstop. All buffers and the device are destroyed
	 * before returning.
	 *
	 * Caveats:
	 * - The GPU process is shared with other tabs. If they are holding GPU
	 *   memory the probe undercounts; this is the best we can do without a
	 *   richer browser API.
	 * - Some drivers (notably iOS Metal under WebKit) lazy-fail at dispatch
	 *   time rather than at createBuffer, so the number is an upper bound,
	 *   not a guarantee.
	 *
	 * @param {object} [options]
	 * @param {number} [options.stepMB=GPU_PROBE_STEP_MB] - Size of each buffer, in MB.
	 * @param {number} [options.maxMB=GPU_PROBE_MAX_MB] - Stop once this total is reached.
	 * @param {number} [options.timeoutMs=GPU_PROBE_TIMEOUT_MS] - Wall-clock limit for the loop.
	 * @param {number} [options.yieldMs=0] - Pause between allocations.
	 * @returns {Promise<{ probedMB: number, error?: string }>} Total MB
	 *   successfully allocated; `error` is set only when WebGPU, the adapter or
	 *   the device could not be obtained.
	 */
	export async function probeGpuBudgetMB({
	  stepMB = GPU_PROBE_STEP_MB,
	  maxMB = GPU_PROBE_MAX_MB,
	  timeoutMs = GPU_PROBE_TIMEOUT_MS,
	  yieldMs = 0,
	} = {}) {
	  if (!navigator.gpu) {
	    return { probedMB: 0, error: 'WebGPU not available' };
	  }

	  let gpuAdapter;
	  let gpuDevice;
	  try {
	    gpuAdapter = await navigator.gpu.requestAdapter();
	    if (!gpuAdapter) return { probedMB: 0, error: 'no WebGPU adapter' };

	    // Ask for the adapter's own maxima; the default device limits are
	    // often smaller than what the hardware supports.
	    const requiredLimits = {};
	    for (const limitName of ['maxBufferSize', 'maxStorageBufferBindingSize']) {
	      const limitValue = gpuAdapter.limits?.[limitName];
	      if (typeof limitValue === 'number') requiredLimits[limitName] = limitValue;
	    }
	    gpuDevice = await gpuAdapter.requestDevice({ requiredLimits });
	  } catch (err) {
	    return { probedMB: 0, error: `adapter/device init failed: ${err.message}` };
	  }

	  let lost = false;
	  gpuDevice.lost.then(() => { lost = true; }).catch(() => {});

	  const BYTES_PER_MB = 1024 * 1024;
	  const chunkBytes = stepMB * BYTES_PER_MB;
	  const ceilingBytes = maxMB * BYTES_PER_MB;
	  const held = [];
	  let committedBytes = 0;
	  const startedAt = performance.now();

	  // Close the current error scope; a failure to pop counts as "no error".
	  const closeScope = () => gpuDevice.popErrorScope().catch(() => null);

	  try {
	    while (committedBytes + chunkBytes <= ceilingBytes) {
	      if (lost || performance.now() - startedAt > timeoutMs) break;

	      gpuDevice.pushErrorScope('out-of-memory');
	      let chunk;
	      try {
	        chunk = gpuDevice.createBuffer({
	          size: chunkBytes,
	          usage: GPUBufferUsage.STORAGE,
	          mappedAtCreation: true,
	        });
	        // Write one byte into the mapped range. Drivers can lazy-back the
	        // allocation until first write, which would make the probe
	        // over-report the available headroom.
	        const touchBytes = Math.min(chunkBytes, 64 * 1024);
	        const mapped = new Uint8Array(chunk.getMappedRange(0, touchBytes));
	        mapped[0] = 1;
	        chunk.unmap();
	      } catch {
	        // Synchronous failure: balance the scope stack, then stop.
	        await closeScope();
	        break;
	      }

	      const scopeError = await closeScope();
	      if (scopeError) {
	        try { chunk.destroy(); } catch { /* noop */ }
	        break;
	      }

	      held.push(chunk);
	      committedBytes += chunkBytes;

	      // Yield so the main thread / GC is not starved. On mobile a longer
	      // yield also lets the OS update its memory accounting between steps,
	      // so a fast burst doesn't look like a spike to Jetsam.
	      await new Promise((resume) => setTimeout(resume, yieldMs));
	    }
	  } finally {
	    held.forEach((chunk) => {
	      try { chunk.destroy(); } catch { /* noop */ }
	    });
	    try { gpuDevice.destroy(); } catch { /* noop */ }
	  }

	  return { probedMB: Math.floor(committedBytes / BYTES_PER_MB) };
	}

	// ──────────────── public budget API ────────────────

	// Cache the full budget for the lifetime of the page load. Both probes
	// take 1–8 s; we don't want to pay that twice for the same surface.
	let _budgetPromise = null;

	/**
	 * Return the device budget record, computing it at most once per page load.
	 *
	 * Concurrent callers share the same in-flight computation. A computation
	 * that rejects is dropped from the cache, so the next call retries instead
	 * of replaying the same rejection for the rest of the page lifetime.
	 *
	 * @returns {Promise<object>} The record produced by _computeBudget().
	 */
	export async function getDeviceBudgetMB() {
	  if (_budgetPromise === null) {
	    const pending = _computeBudget();
	    _budgetPromise = pending;
	    // Un-cache on failure. The identity check keeps this from clobbering a
	    // newer attempt that may have replaced the cache in the meantime.
	    pending.catch(() => {
	      if (_budgetPromise === pending) _budgetPromise = null;
	    });
	  }
	  return _budgetPromise;
	}

	/**
	 * Build the budget record that getDeviceBudgetMB() caches.
	 *
	 * Mobile (detectMobileFamily() is non-null): returns the static per-family
	 * budgets from getMobileBudgetMB() without running any probe, and only
	 * reads adapter.limits.maxBufferSize for diagnostics.
	 *
	 * Desktop: runs the heap and GPU probes in parallel. If the heap probe
	 * yields nothing, the heap budget falls back to navigator.deviceMemory,
	 * then to the storage quota, then to DEFAULT_BUDGET_MB. A failed GPU probe
	 * gives a GPU budget of 0.
	 *
	 * @returns {Promise<{
	 *   budgetMB: number, gpuBudgetMB: number, heapBudgetMB: number,
	 *   memGB: number | null, quotaMB: number | null,
	 *   probedMB: number, gpuProbedMB: number,
	 *   probeError: string | null, gpuProbeError: string | null,
	 *   isMobile: boolean, mobileFamily: string | null,
	 *   source: string, heapSource: string
	 * }>} `budgetMB` mirrors `gpuBudgetMB`; `source` / `heapSource` describe
	 *   where each number came from.
	 */
	async function _computeBudget() {
	  const memGB = typeof navigator.deviceMemory === 'number' ? navigator.deviceMemory : null;
	  let quotaMB = null;
	  try {
	    const est = await navigator.storage?.estimate?.();
	    if (est?.quota) quotaMB = est.quota / (1024 * 1024);
	  } catch {
	    // some browsers throw on storage.estimate in non-secure contexts
	  }

	  const mobileFamily = detectMobileFamily();
	  const isMobile = mobileFamily !== null;

	  // ── Mobile path: pure static budgets ──
	  //
	  // No probes on mobile. Both the heap probe and the GPU probe have been
	  // shown to themselves trigger Jetsam:
	  // - Heap probe: commit 6f33b5d (terminated worker).
	  // - GPU probe (unbounded): commit 4f567a5.
	  // - GPU probe (bounded): the 1000 MB peak allocation pulse on
	  //   iPhone 13 / mid-RAM iPad classes still pushed the WebContent
	  //   process over the Jetsam threshold during or right after the
	  //   probe — even though the probe itself completed cleanly,
	  //   subsequent OPFS writes hit "unknown transient" errors and the
	  //   next inference allocation tipped the tab over.
	  //
	  // We can't distinguish iPhone 13 (6 GB) from iPhone 17 Pro Max (12 GB)
	  // via WebGPU adapter info or navigator.deviceMemory, so we err on the
	  // side of the lower-RAM device. The budgets are tuned to admit the
	  // 250–500 MB tier (gemma-3-270m, Qwen3-0.6B, LFM2.5-350M) and to
	  // exclude variants that empirically caused crashes on the smallest
	  // device in each family. We still surface adapter.limits.maxBufferSize
	  // in the source string for diagnostics.
	  if (isMobile) {
	    const { heap: heapBudgetMB, gpu: gpuBudgetMB } = getMobileBudgetMB(mobileFamily);

	    // Read adapter limits without allocating a device buffer — purely
	    // informational for the device card / log line.
	    let maxBufferSizeMB = 0;
	    let adapterReadError = null;
	    try {
	      if (navigator.gpu) {
	        const adapter = await navigator.gpu.requestAdapter();
	        const lim = adapter?.limits?.maxBufferSize;
	        if (typeof lim === 'number') {
	          maxBufferSizeMB = Math.floor(lim / (1024 * 1024));
	        }
	      } else {
	        adapterReadError = 'WebGPU not available';
	      }
	    } catch (err) {
	      adapterReadError = err.message;
	    }

	    let adapterDetail = '';
	    if (adapterReadError) {
	      adapterDetail = ` (adapter read failed: ${adapterReadError})`;
	    } else if (maxBufferSizeMB > 0) {
	      adapterDetail = ` (maxBufferSize ${maxBufferSizeMB} MB)`;
	    }

	    return {
	      budgetMB: gpuBudgetMB,
	      gpuBudgetMB,
	      heapBudgetMB,
	      memGB,
	      quotaMB,
	      probedMB: 0,
	      gpuProbedMB: 0,
	      probeError: 'skipped on mobile (probes themselves trigger Jetsam)',
	      gpuProbeError: 'skipped on mobile (probes themselves trigger Jetsam)',
	      isMobile: true,
	      mobileFamily,
	      source: `mobile static budget — ${mobileFamily} (GPU ${gpuBudgetMB} MB for OPFS-streamed weights)${adapterDetail}`,
	      heapSource: `mobile static budget — ${mobileFamily} (WASM heap ${heapBudgetMB} MB for KV + compute scratch)`,
	    };
	  }

	  // ── Desktop path: real probes ──
	  const [heapProbe, gpuProbe] = await Promise.all([
	    probeHeapBudgetMB(),
	    probeGpuBudgetMB(),
	  ]);

	  // Heap budget: prefer the measured value, then progressively weaker
	  // signals.
	  let heapBudgetMB;
	  let heapSource;
	  if (heapProbe.probedMB > 0) {
	    heapBudgetMB = heapProbe.probedMB;
	    heapSource = `probe (WASM heap, ${heapProbe.probedMB} MB committed)`;
	  } else if (memGB !== null) {
	    heapBudgetMB = memGB * 1024 * 0.6;
	    heapSource = 'navigator.deviceMemory (heap probe failed)';
	  } else if (quotaMB !== null) {
	    heapBudgetMB = Math.min(quotaMB * HOSTED_QUOTA_FRACTION, HOSTED_QUOTA_CAP_MB);
	    heapSource = 'navigator.storage.estimate().quota (heap probe failed)';
	  } else {
	    heapBudgetMB = DEFAULT_BUDGET_MB;
	    heapSource = 'default (heap probe failed)';
	  }

	  // GPU budget: there is no fallback signal, so a failed probe means 0.
	  let gpuBudgetMB;
	  let gpuSource;
	  if (gpuProbe.probedMB > 0) {
	    gpuBudgetMB = gpuProbe.probedMB;
	    gpuSource = `probe (WebGPU buffers, ${gpuProbe.probedMB} MB allocated)`;
	  } else {
	    gpuBudgetMB = 0;
	    gpuSource = `probe failed: ${gpuProbe.error || 'unknown'}`;
	  }

	  return {
	    budgetMB: gpuBudgetMB,
	    gpuBudgetMB,
	    heapBudgetMB,
	    memGB,
	    quotaMB,
	    probedMB: heapProbe.probedMB,
	    gpuProbedMB: gpuProbe.probedMB,
	    probeError: heapProbe.error || null,
	    gpuProbeError: gpuProbe.error || null,
	    isMobile: false,
	    mobileFamily: null,
	    source: gpuSource,
	    heapSource,
	  };
	}

	/**
	 * Decide whether a model file of `sizeMB` megabytes can be loaded and run
	 * on this device. Two checks must pass:
	 *
	 * 1. sizeMB + GPU_VARIANT_OVERHEAD_MB ≤ gpuBudgetMB
	 *    Model weights live in WebGPU buffers (since OPFS streaming keeps
	 *    them off the WASM heap). The overhead covers compute scratch +
	 *    alignment + KV cache mirror.
	 *
	 * 2. heapBudgetMB ≥ HEAP_WORKING_SET_FLOOR_MB
	 *    The WASM heap still has to fit the working set: KV cache, ggml
	 *    compute buffers, and JS heap headroom. Roughly constant per
	 *    inference regardless of model size at fixed n_ctx.
	 *
	 * Backwards-compat: if `budget` is a plain number, it is treated as the
	 * legacy heap-only budget and the prior 1.5× sizeMB overhead applies.
	 * New callers should pass { gpuBudgetMB, heapBudgetMB }.
	 *
	 * NaN never fits: a NaN size or NaN budget fails every comparison, so
	 * without an explicit check it would slip through as "fits".
	 *
	 * @param {number} sizeMB - Model file size in MB; must be finite and > 0.
	 * @param {number | { gpuBudgetMB: number, heapBudgetMB: number }} budget
	 * @returns {boolean} true only when every applicable check passes.
	 */
	export function variantFits(sizeMB, budget) {
	  if (typeof sizeMB !== 'number' || !Number.isFinite(sizeMB) || sizeMB <= 0) return false;

	  if (typeof budget === 'number') {
	    return budget > 0 && sizeMB * 1.5 <= budget;
	  }
	  if (!budget || typeof budget !== 'object') return false;

	  const { gpuBudgetMB, heapBudgetMB } = budget;
	  if (typeof gpuBudgetMB !== 'number' || Number.isNaN(gpuBudgetMB)) return false;
	  if (typeof heapBudgetMB !== 'number' || Number.isNaN(heapBudgetMB)) return false;

	  const gpuOk = sizeMB + GPU_VARIANT_OVERHEAD_MB <= gpuBudgetMB;
	  const heapOk = heapBudgetMB >= HEAP_WORKING_SET_FLOOR_MB;
	  return gpuOk && heapOk;
	}

	/**
	 * Collect a diagnostic snapshot of the device: the memory budget plus GPU
	 * adapter info and browser / platform identification.
	 *
	 * Every lookup is best-effort. A missing API or a thrown error leaves the
	 * corresponding field null (or an empty list for `uaBrands`).
	 *
	 * @returns {Promise<object>} All fields of getDeviceBudgetMB() plus:
	 *   - webgpu: boolean — whether navigator.gpu exists
	 *   - gpu: adapter.info, `{ vendor: 'unknown' }` if the adapter has none,
	 *     or null when no adapter could be obtained
	 *   - userAgent, platform: raw navigator values
	 *   - uaArch, uaPlatform, uaPlatformVersion: UA Client Hints values or null
	 *   - uaBrands: brand/version entries with the "Not A Brand" decoy removed
	 */
	export async function describeDevice() {
	  const budget = await getDeviceBudgetMB();

	  let gpu = null;
	  if (navigator.gpu) {
	    try {
	      const adapter = await navigator.gpu.requestAdapter();
	      if (adapter) gpu = adapter.info || { vendor: 'unknown' };
	    } catch {
	      gpu = null;
	    }
	  }

	  // UA Client Hints: high-entropy values give us the real architecture
	  // and OS, neither of which `navigator.platform` reports correctly on
	  // Apple Silicon Macs (it returns "MacIntel" forever for back-compat).
	  let uaArch = null;
	  let uaPlatform = null;
	  let uaPlatformVersion = null;
	  // `fullVersionList` is the high-entropy version of `brands` and gives
	  // us the full dotted version (e.g. "147.0.7390.107") instead of just
	  // the major. The default `brands` is major-only.
	  let fullVersionList = null;
	  try {
	    const uad = navigator.userAgentData;
	    if (uad?.getHighEntropyValues) {
	      const hev = await uad.getHighEntropyValues([
	        'architecture', 'platform', 'platformVersion', 'fullVersionList',
	      ]);
	      uaArch = hev.architecture || null;
	      uaPlatform = hev.platform || null;
	      uaPlatformVersion = hev.platformVersion || null;
	      fullVersionList = hev.fullVersionList || null;
	    }
	  } catch { /* not Chromium or denied */ }

	  // UA-CH brands give us a clean { brand, version } pair without parsing
	  // the userAgent string. Filter out the "Not(A:Brand)" decoy entries.
	  // Prefer `fullVersionList` (full dotted version) over the major-only
	  // default `brands` list.
	  const brandSource = fullVersionList || navigator.userAgentData?.brands || [];
	  const brands = brandSource
	    .filter((b) => b && !/not[^\w]a[^\w]brand/i.test(b.brand));

	  return {
	    ...budget,
	    webgpu: !!navigator.gpu,
	    gpu,
	    userAgent: navigator.userAgent,
	    platform: navigator.platform ?? null,
	    uaArch,
	    uaPlatform,
	    uaPlatformVersion,
	    uaBrands: brands,
	  };
	}