// Device-fit helpers for the interactive bench page.
//
// Two budget probes drive the per-variant fit decision (getDeviceBudgetMB()
// runs both and caches the combined result):
//
//   probeHeapBudgetMB() — empirical WASM heap probe. Grows a
//     WebAssembly.Memory page-by-page in a worker until it fails. Caps
//     the working set (KV cache + compute scratch + JS heap headroom)
//     llama.cpp consumes during inference.
//
//   probeGpuBudgetMB() — empirical WebGPU memory probe. Allocates real
//     buffers with mappedAtCreation=true on the actual adapter until OOM.
//     Caps the size of model weights llama.cpp can hold in GPU buffers,
//     since OPFS-streaming keeps model bytes off the WASM heap.
//
// variantFits() then checks both: model size + GPU overhead ≤ GPU budget,
// AND heap working-set floor ≤ heap budget. wllama doesn't probe at all
// — they let load attempts fail naturally — but our auto-select buttons
// ("All fit", "Run study") need a fit predicate, so we err on the side
// of measuring rather than guessing.
//
// On wasm32 the linear memory caps at 4 GiB no matter how much physical
// RAM the device has, so heap probe results above 4096 MB cannot exist.

const DEFAULT_BUDGET_MB = 2 * 1024;
const HOSTED_QUOTA_FRACTION = 0.4;
const HOSTED_QUOTA_CAP_MB = 8 * 1024;

// Mobile per-device budgets. Two independent caps, mirroring the desktop
// path — model weights stream from OPFS into WebGPU buffers (see
// bench-worker.js:patchMEMFS / opfsAlloc), so the model size constrains
// `gpuBudgetMB`, not `heapBudgetMB`. The WASM heap only has to hold the
// working set (KV cache + ggml compute scratch + JS heap headroom).
//
// Earlier we collapsed both into a single tab budget on the theory that
// iOS Jetsam treats the whole tab process as one pool, so any allocation
// counts the same. That's true for Jetsam — but it conflates *where* the
// memory lives with *how much* the platform can hand out: the WASM heap
// has a much tighter practical ceiling than the GPU side, and counting
// model bytes against the heap ceiling rejected models that load fine
// via OPFS streaming.
//
// Numbers come from public reports / Apple docs:
//
//   - iPhone WASM practical limit: 300–450 MB → heap budget
//       lapcatsoftware.com/articles/2026/1/7.html
//       news.ycombinator.com/item?id=39039593
//       github.com/emscripten-core/emscripten/issues/19374
//       github.com/godotengine/godot/issues/70621
//
//   - iOS Safari WebGPU maxBufferSize: 256 MB on iPhone 6 / older,
//     993 MB on iPad Pro M-class. Per-buffer cap, not total.
//       Apple WWDC 2025 "Unlock GPU computing with WebGPU"
//
//   - iPhone 12 Pro reports tab OOM around 1.5–3 GB; Jetsam intervenes
//     earlier under pressure. We undershoot the lower bound for headroom.
//       developer.apple.com/forums/thread/761666
//
// Heap budgets = WASM heap practical limits.
const IPHONE_HEAP_BUDGET_MB = 450;
const IPAD_HEAP_BUDGET_MB = 1500;
const ANDROID_HEAP_BUDGET_MB = 800;

// GPU budgets = available GPU-buffer capacity for model weights + KV
// mirror, sized below the Jetsam tab ceiling minus working-set headroom.
// Static per-family numbers — we don't probe on mobile because the
// probe's allocation pulse itself triggers Jetsam on lower-RAM devices,
// and WebKit doesn't expose a signal that distinguishes (e.g.) iPhone 13
// from iPhone 17 Pro Max (same maxBufferSize 1024 MB on both, same
// deviceMemory clamp). See "mobile probe" history in git: bounded
// probe shipped, then disabled because the iPhone 13 / mid-RAM iPad
// classes still Jetsamed during or right after the probe pulse.
//
// iPhone: empirical — 1200 MB caused tab reloads on first variant of a
// Run study (Llama-3.2-1B Q2_K, 554 MB) on iPhone 17 Pro Max. 700 MB
// keeps Llama-1B variants out of variantFits while still allowing the
// 250–500 MB tier (gemma-3-270m Q8, Qwen3-0.6B Q4, etc.).
//
// iPad: empirical — 2500 MB Jetsamed on Llama-3.2-1B (likely Q4_K_M
// = 770 MB or Q8_0 = 1259 MB). 1500 MB excludes Llama-1B Q8_0 (1459 MB
// after overhead) but allows Q4_K_M (970 MB), keeping the standard
// study quant runnable. High-end iPad Pro M-class probably tolerates
// more, but we have no way to detect device class.
const IPHONE_GPU_BUDGET_MB = 700;
const IPAD_GPU_BUDGET_MB = 1500;
const ANDROID_GPU_BUDGET_MB = 1500;

// Classify the current browser as 'ipad', 'iphone', 'android', or null
// (not recognised as mobile / no `navigator` global). The order of the
// checks matters: the iPad tests run before the iPhone test.
function detectMobileFamily() {
  if (typeof navigator === 'undefined') return null;
  const ua = navigator.userAgent || '';
  const platform = navigator.platform || '';

  if (/iPad/.test(ua)) return 'ipad';
  // iPadOS 13+ reports "Macintosh" UA but exposes touch; that's the
  // standard iPad-detection workaround.
  if (navigator.maxTouchPoints > 1 && /Mac/.test(platform)) return 'ipad';
  if (/iPhone|iPod/.test(ua)) return 'iphone';
  if (/Android.*Mobile/.test(ua)) return 'android';
  // UA-CH says "mobile" but the UA string matched nothing above: bucket
  // it with Android.
  if (navigator.userAgentData?.mobile === true) return 'android';
  return null;
}

// Static { heap, gpu } budget pair (MB) for a mobile family. Unknown
// families get the iPhone numbers, the smallest pair — safest default.
// A fresh object is returned on every call so callers may mutate it.
function getMobileBudgetMB(family) {
  switch (family) {
    case 'ipad':
      return { heap: IPAD_HEAP_BUDGET_MB, gpu: IPAD_GPU_BUDGET_MB };
    case 'android':
      return { heap: ANDROID_HEAP_BUDGET_MB, gpu: ANDROID_GPU_BUDGET_MB };
    case 'iphone':
    default:
      return { heap: IPHONE_HEAP_BUDGET_MB, gpu: IPHONE_GPU_BUDGET_MB };
  }
}

const PROBE_TIMEOUT_MS = 15_000;
const GPU_PROBE_STEP_MB = 256;
const GPU_PROBE_MAX_MB = 8 * 1024;
const GPU_PROBE_TIMEOUT_MS = 8_000;

// Working-set floor in the WASM heap. KV cache + compute buffers + JS
// heap headroom for a typical 1B model at n_ctx=2048 add up to a few
// hundred MB. Floor at 256 so an absurdly-tiny heap (or a probe failure
// that returned 0) doesn't pass variantFits.
const HEAP_WORKING_SET_FLOOR_MB = 256;

// Per-variant overhead added on top of the model file size when checking
// GPU fit. Covers compute buffers, alignment padding, and the KV cache
// mirror that the WebGPU backend keeps. A flat 200 MB is a conservative
// approximation; in practice it scales somewhat with model + context size.
const GPU_VARIANT_OVERHEAD_MB = 200;

// True when detectMobileFamily() recognises the browser as mobile.
export function isMobileDevice() {
  return detectMobileFamily() !== null;
}

// ──────────────── WASM heap probe ────────────────

// Spawn the probe worker, wait for a result, clean up.
//
// Params (all optional):
//   stepPages, maxPages — forwarded verbatim to ./memory-probe.js.
//   timeoutMs           — give up after this long (default PROBE_TIMEOUT_MS).
//
// Always resolves, never rejects: { probedMB } on success, or
// { probedMB: 0, error } on any failure mode (timeout, worker construct
// error, postMessage error, worker onerror — typically the probe itself
// ran the engine out of memory).
export function probeHeapBudgetMB({ stepPages, maxPages, timeoutMs = PROBE_TIMEOUT_MS } = {}) {
  return new Promise((resolve) => {
    let worker = null;
    let timer = null;
    let settled = false;

    // Single exit path: the first outcome wins, later events (e.g. a
    // message that arrives after the timeout) are ignored, and the timer
    // and worker are released exactly once.
    const finish = (result) => {
      if (settled) return;
      settled = true;
      if (timer !== null) clearTimeout(timer);
      if (worker) {
        try { worker.terminate(); } catch { /* noop */ }
      }
      resolve(result);
    };
    const fail = (error) => finish({ probedMB: 0, error });
    const messageOf = (err) => err?.message ?? String(err);

    try {
      worker = new Worker(new URL('./memory-probe.js', import.meta.url));
    } catch (err) {
      fail(`worker construct failed: ${messageOf(err)}`);
      return;
    }

    timer = setTimeout(() => fail('probe timeout'), timeoutMs);

    worker.onmessage = (e) => {
      const { committedMB = 0 } = e.data || {};
      finish({ probedMB: committedMB });
    };
    worker.onerror = (err) => fail(err.message || 'worker error');

    // postMessage can throw synchronously (e.g. DataCloneError for
    // non-cloneable arguments). Inside a Promise executor that would
    // reject the promise and leak the worker + timer, so report it
    // through the normal failure shape instead.
    try {
      worker.postMessage({ stepPages, maxPages });
    } catch (err) {
      fail(`worker postMessage failed: ${messageOf(err)}`);
    }
  });
}

// ──────────────── GPU memory probe ────────────────

// Allocate WebGPU buffers in stepMB increments until OOM, return the
// total committed bytes as the GPU memory budget. Uses
// mappedAtCreation=true to force real memory commit (some drivers lazy-
// allocate until first use otherwise) and captures OOM via the
// 'out-of-memory' error scope, with device.lost as a backstop.
//
// Caveats:
//   - The GPU process is shared with other tabs. If they're holding GPU
//     memory the probe undercounts. (Same limitation as the WASM heap
//     probe — best we can do without a richer browser API.)
//   - Some drivers (notably iOS Metal under WebKit) lazy-fail at dispatch
//     time rather than at createBuffer; this probe's number is therefore
//     an upper bound, not a guarantee. Mobile cap below mitigates.
//
// Params (all optional):
//   stepMB    — size of each probe buffer in MB; must be > 0.
//   maxMB     — stop once the next step would exceed this total.
//   timeoutMs — stop starting new steps after this much wall time.
//   yieldMs   — setTimeout delay between steps.
//
// Resolves { probedMB } (floor of the bytes successfully allocated), or
// { probedMB: 0, error } when WebGPU / the adapter / the device is
// unavailable or stepMB is invalid. Every buffer and the device are
// destroyed before returning.
const BYTES_PER_MB = 1024 * 1024;

export async function probeGpuBudgetMB({
  stepMB = GPU_PROBE_STEP_MB,
  maxMB = GPU_PROBE_MAX_MB,
  timeoutMs = GPU_PROBE_TIMEOUT_MS,
  yieldMs = 0,
} = {}) {
  if (typeof navigator === 'undefined' || !navigator.gpu) {
    return { probedMB: 0, error: 'WebGPU not available' };
  }
  // A zero/negative/NaN step can never make progress: the loop would spin
  // on empty buffers until the timeout and then report 0 with no error.
  if (!(stepMB > 0)) {
    return { probedMB: 0, error: `invalid stepMB: ${stepMB}` };
  }

  let device;
  try {
    const adapter = await navigator.gpu.requestAdapter();
    if (!adapter) return { probedMB: 0, error: 'no WebGPU adapter' };
    // Request the maximum the adapter can give us; defaults are often
    // smaller than what the hardware supports.
    const requiredLimits = {};
    for (const key of ['maxBufferSize', 'maxStorageBufferBindingSize']) {
      const value = adapter.limits?.[key];
      if (typeof value === 'number') requiredLimits[key] = value;
    }
    device = await adapter.requestDevice({ requiredLimits });
  } catch (err) {
    return { probedMB: 0, error: `adapter/device init failed: ${err?.message ?? String(err)}` };
  }

  let deviceLost = false;
  device.lost.then(() => { deviceLost = true; }).catch(() => {});

  const stepBytes = stepMB * BYTES_PER_MB;
  const maxBytes = maxMB * BYTES_PER_MB;
  const touchBytes = Math.min(stepBytes, 64 * 1024);
  const held = [];
  let totalBytes = 0;
  const startedAt = performance.now();

  try {
    while (totalBytes + stepBytes <= maxBytes) {
      if (deviceLost) break;
      if (performance.now() - startedAt > timeoutMs) break;

      device.pushErrorScope('out-of-memory');
      let buffer;
      try {
        buffer = device.createBuffer({
          size: stepBytes,
          usage: GPUBufferUsage.STORAGE,
          mappedAtCreation: true,
        });
        // Touch the start of the mapped range to force a real commit.
        // Drivers can lazy-back the allocation until first write, which
        // would fool the probe into thinking it has more headroom than it
        // really does.
        new Uint8Array(buffer.getMappedRange(0, touchBytes))[0] = 1;
        buffer.unmap();
      } catch {
        // Keep the error-scope stack balanced, and release the buffer if
        // it was created before the mapping step threw.
        await device.popErrorScope().catch(() => null);
        try { buffer?.destroy(); } catch { /* noop */ }
        break;
      }

      const oomError = await device.popErrorScope().catch(() => null);
      if (oomError) {
        try { buffer.destroy(); } catch { /* noop */ }
        break;
      }

      held.push(buffer);
      totalBytes += stepBytes;
      // Yield so we don't starve the main thread / GC. On mobile a
      // longer yield also gives the OS a chance to update its memory
      // accounting between steps so a fast burst doesn't look like a
      // spike to Jetsam.
      await new Promise((r) => setTimeout(r, yieldMs));
    }
  } finally {
    for (const b of held) {
      try { b.destroy(); } catch { /* noop */ }
    }
    try { device.destroy(); } catch { /* noop */ }
  }

  return { probedMB: Math.floor(totalBytes / BYTES_PER_MB) };
}

// ──────────────── public budget API ────────────────

// wasm32 linear memory tops out at 4 GiB, so no heap budget — probed or
// estimated — can meaningfully exceed this.
const WASM32_HEAP_CEILING_MB = 4096;

// Cache the full budget for the lifetime of the page load. Both probes
// take 1–8 s; we don't want to pay that twice for the same surface.
let _budgetPromise = null;

// Resolve the device budget object (see _computeBudget for its fields).
// Concurrent and repeated callers share one computation. A computation
// that rejects is dropped from the cache so the next call retries instead
// of replaying the same failure for the rest of the page load.
export async function getDeviceBudgetMB() {
  if (!_budgetPromise) {
    const pending = _computeBudget();
    _budgetPromise = pending;
    pending.catch(() => {
      if (_budgetPromise === pending) _budgetPromise = null;
    });
  }
  return _budgetPromise;
}

// Build the budget object:
//   { budgetMB, gpuBudgetMB, heapBudgetMB, memGB, quotaMB, probedMB,
//     gpuProbedMB, probeError, gpuProbeError, isMobile, mobileFamily,
//     source, heapSource }
// `budgetMB` mirrors `gpuBudgetMB`. Mobile families get static numbers;
// everything else runs both probes in parallel.
async function _computeBudget() {
  const nav = typeof navigator === 'undefined' ? undefined : navigator;
  const memGB = typeof nav?.deviceMemory === 'number' ? nav.deviceMemory : null;

  let quotaMB = null;
  try {
    const est = await nav?.storage?.estimate?.();
    if (est?.quota) quotaMB = est.quota / BYTES_PER_MB;
  } catch {
    // some browsers throw on storage.estimate in non-secure contexts
  }

  const mobileFamily = detectMobileFamily();
  if (mobileFamily !== null) {
    return _mobileBudget(mobileFamily, memGB, quotaMB);
  }

  // ── Desktop path: real probes ──
  const [heapProbe, gpuProbe] = await Promise.all([
    probeHeapBudgetMB(),
    probeGpuBudgetMB(),
  ]);

  const { heapBudgetMB, heapSource } = _desktopHeapBudget(heapProbe, memGB, quotaMB);

  const gpuProbed = gpuProbe.probedMB > 0;
  const gpuBudgetMB = gpuProbed ? gpuProbe.probedMB : 0;
  const gpuSource = gpuProbed
    ? `probe (WebGPU buffers, ${gpuProbe.probedMB} MB allocated)`
    : `probe failed: ${gpuProbe.error || 'unknown'}`;

  return {
    budgetMB: gpuBudgetMB,
    gpuBudgetMB,
    heapBudgetMB,
    memGB,
    quotaMB,
    probedMB: heapProbe.probedMB,
    gpuProbedMB: gpuProbe.probedMB,
    probeError: heapProbe.error || null,
    gpuProbeError: gpuProbe.error || null,
    isMobile: false,
    mobileFamily: null,
    source: gpuSource,
    heapSource,
  };
}

// Pick the desktop heap budget: the probe result when it succeeded,
// otherwise an estimate from deviceMemory, then storage quota, then a
// fixed default. Estimates are floored to whole MB and clamped to the
// wasm32 ceiling (deviceMemory = 8 would otherwise yield 4915.2 MB).
function _desktopHeapBudget(heapProbe, memGB, quotaMB) {
  if (heapProbe.probedMB > 0) {
    return {
      heapBudgetMB: heapProbe.probedMB,
      heapSource: `probe (WASM heap, ${heapProbe.probedMB} MB committed)`,
    };
  }
  const estimate = (mb) => Math.min(Math.floor(mb), WASM32_HEAP_CEILING_MB);
  if (memGB !== null) {
    return {
      heapBudgetMB: estimate(memGB * 1024 * 0.6),
      heapSource: 'navigator.deviceMemory (heap probe failed)',
    };
  }
  if (quotaMB !== null) {
    return {
      heapBudgetMB: estimate(Math.min(quotaMB * HOSTED_QUOTA_FRACTION, HOSTED_QUOTA_CAP_MB)),
      heapSource: 'navigator.storage.estimate().quota (heap probe failed)',
    };
  }
  return { heapBudgetMB: DEFAULT_BUDGET_MB, heapSource: 'default (heap probe failed)' };
}

// ── Mobile path: pure static budgets ──
//
// No probes on mobile. Both the heap probe and the GPU probe have been
// shown to themselves trigger Jetsam:
//   - Heap probe: commit 6f33b5d (terminated worker).
//   - GPU probe (unbounded): commit 4f567a5.
//   - GPU probe (bounded): the 1000 MB peak allocation pulse on
//     iPhone 13 / mid-RAM iPad classes still pushed the WebContent
//     process over the Jetsam threshold during or right after the
//     probe — even though the probe itself completed cleanly,
//     subsequent OPFS writes hit "unknown transient" errors and the
//     next inference allocation tipped the tab over.
//
// We can't distinguish iPhone 13 (6 GB) from iPhone 17 Pro Max (12 GB)
// via WebGPU adapter info or navigator.deviceMemory, so we err on the
// side of the lower-RAM device. The budgets are tuned to admit the
// 250–500 MB tier (gemma-3-270m, Qwen3-0.6B, LFM2.5-350M) and to
// exclude variants that empirically caused crashes on the smallest
// device in each family. We still surface adapter.limits.maxBufferSize
// in the source string for diagnostics.
async function _mobileBudget(mobileFamily, memGB, quotaMB) {
  const { heap: heapBudgetMB, gpu: gpuBudgetMB } = getMobileBudgetMB(mobileFamily);

  // Read adapter limits without allocating a device buffer — purely
  // informational for the device card / log line.
  let maxBufferSizeMB = 0;
  let adapterReadError = null;
  try {
    if (typeof navigator !== 'undefined' && navigator.gpu) {
      const adapter = await navigator.gpu.requestAdapter();
      const lim = adapter?.limits?.maxBufferSize;
      if (typeof lim === 'number') {
        maxBufferSizeMB = Math.floor(lim / BYTES_PER_MB);
      }
    } else {
      adapterReadError = 'WebGPU not available';
    }
  } catch (err) {
    adapterReadError = err?.message ?? String(err);
  }

  let adapterDetail = '';
  if (adapterReadError) {
    adapterDetail = ` (adapter read failed: ${adapterReadError})`;
  } else if (maxBufferSizeMB > 0) {
    adapterDetail = ` (maxBufferSize ${maxBufferSizeMB} MB)`;
  }

  return {
    budgetMB: gpuBudgetMB,
    gpuBudgetMB,
    heapBudgetMB,
    memGB,
    quotaMB,
    probedMB: 0,
    gpuProbedMB: 0,
    probeError: 'skipped on mobile (probes themselves trigger Jetsam)',
    gpuProbeError: 'skipped on mobile (probes themselves trigger Jetsam)',
    isMobile: true,
    mobileFamily,
    source: `mobile static budget — ${mobileFamily} (GPU ${gpuBudgetMB} MB for OPFS-streamed weights)${adapterDetail}`,
    heapSource: `mobile static budget — ${mobileFamily} (WASM heap ${heapBudgetMB} MB for KV + compute scratch)`,
  };
}

// variantFits decides whether a model file of `sizeMB` megabytes can be
// loaded and run on this device. Two checks must pass:
//
//   1.
//      sizeMB + GPU_VARIANT_OVERHEAD_MB ≤ gpuBudgetMB
//        Model weights live in WebGPU buffers (since OPFS streaming
//        keeps them off the WASM heap). The overhead covers compute
//        scratch + alignment + KV cache mirror.
//
//   2. heapBudgetMB ≥ HEAP_WORKING_SET_FLOOR_MB
//        The WASM heap still has to fit the working set: KV cache,
//        ggml compute buffers, and JS heap headroom. Roughly constant
//        per inference regardless of model size at fixed n_ctx.
//
// Backwards-compat: if the second arg is a plain number, treat it as
// the legacy heap-only budget and apply the prior 1.5× sizeMB overhead.
// New callers should pass { gpuBudgetMB, heapBudgetMB }.
//
// Returns false for anything that isn't a usable measurement: a missing,
// non-numeric, non-finite or non-positive sizeMB, a missing budget, or a
// budget field that is missing / non-numeric / NaN. (NaN needs an explicit
// check: `typeof NaN === 'number'` and every comparison against NaN is
// false, so it would otherwise slip through the rejection tests below.)
export function variantFits(sizeMB, budget) {
  if (!Number.isFinite(sizeMB) || sizeMB <= 0) return false;

  if (typeof budget === 'number') {
    return budget > 0 && sizeMB * 1.5 <= budget;
  }
  if (!budget || typeof budget !== 'object') return false;

  const isUsableNumber = (v) => typeof v === 'number' && !Number.isNaN(v);
  const { gpuBudgetMB, heapBudgetMB } = budget;
  if (!isUsableNumber(gpuBudgetMB) || !isUsableNumber(heapBudgetMB)) return false;

  const gpuFits = sizeMB + GPU_VARIANT_OVERHEAD_MB <= gpuBudgetMB;
  const heapFits = heapBudgetMB >= HEAP_WORKING_SET_FLOOR_MB;
  return gpuFits && heapFits;
}

// True for the UA-CH "GREASE" decoy brand ("Not(A:Brand", "Not_A Brand",
// "Not?A_Brand", …). The separators between the words are arbitrary
// punctuation that includes "_", so match "anything that isn't a letter
// or digit" rather than \W (which treats "_" as a word character).
function _isGreaseBrand(brand) {
  return /not[^a-z0-9]*a[^a-z0-9]*brand/i.test(brand);
}

// Collect everything the device card shows: the cached budget from
// getDeviceBudgetMB() spread into the result, plus
//   webgpu            — whether navigator.gpu exists
//   gpu               — adapter.info (or { vendor: 'unknown' }), null if
//                       there is no adapter or the request threw
//   userAgent, platform
//   uaArch, uaPlatform, uaPlatformVersion — UA-CH high-entropy values,
//                       null when unavailable
//   uaBrands          — [{ brand, version }] with decoy entries removed
export async function describeDevice() {
  const budget = await getDeviceBudgetMB();

  let gpu = null;
  if (navigator.gpu) {
    try {
      const adapter = await navigator.gpu.requestAdapter();
      if (adapter) gpu = adapter.info || { vendor: 'unknown' };
    } catch {
      gpu = null;
    }
  }

  // UA Client Hints: high-entropy values give us the real architecture
  // and OS, neither of which `navigator.platform` reports correctly on
  // Apple Silicon Macs (it returns "MacIntel" forever for back-compat).
  let uaArch = null;
  let uaPlatform = null;
  let uaPlatformVersion = null;
  // `fullVersionList` is the high-entropy version of `brands` and gives
  // us the full dotted version (e.g. "147.0.7390.107") instead of just
  // the major. The default `brands` is major-only.
  let fullVersionList = null;
  try {
    const uad = navigator.userAgentData;
    if (uad?.getHighEntropyValues) {
      const hev = await uad.getHighEntropyValues([
        'architecture',
        'platform',
        'platformVersion',
        'fullVersionList',
      ]);
      uaArch = hev.architecture || null;
      uaPlatform = hev.platform || null;
      uaPlatformVersion = hev.platformVersion || null;
      fullVersionList = hev.fullVersionList || null;
    }
  } catch {
    /* not Chromium or denied */
  }

  // UA-CH brands give us a clean { brand, version } pair without parsing
  // the userAgent string. Filter out the decoy entries. Prefer
  // `fullVersionList` (full dotted version) over the major-only default
  // `brands` list.
  const brandSource = fullVersionList || navigator.userAgentData?.brands || [];
  const uaBrands = brandSource.filter((b) => b && !_isGreaseBrand(b.brand));

  return {
    ...budget,
    webgpu: !!navigator.gpu,
    gpu,
    userAgent: navigator.userAgent,
    platform: navigator.platform ?? null,
    uaArch,
    uaPlatform,
    uaPlatformVersion,
    uaBrands,
  };
}