Spaces:
Running
Running
| // Device-fit helpers for the interactive bench page. | |
| // | |
| // Two budget probes drive the per-variant fit decision: | |
| // | |
| // getDeviceBudgetMB() — empirical WASM heap probe. Grows a | |
| // WebAssembly.Memory page-by-page in a worker until it fails. Caps | |
| // the working set (KV cache + compute scratch + JS heap headroom) | |
| // llama.cpp consumes during inference. | |
| // | |
| // probeGpuBudgetMB() — empirical WebGPU memory probe. Allocates real | |
| // buffers with mappedAtCreation=true on the actual adapter until OOM. | |
| // Caps the size of model weights llama.cpp can hold in GPU buffers, | |
| // since OPFS-streaming keeps model bytes off the WASM heap. | |
| // | |
| // variantFits() then checks both: model size + GPU overhead ≤ GPU budget, | |
| // AND heap working-set floor ≤ heap budget. wllama doesn't probe at all | |
| // — they let load attempts fail naturally — but our auto-select buttons | |
| // ("All fit", "Run study") need a fit predicate, so we err on the side | |
| // of measuring rather than guessing. | |
| // | |
| // On wasm32 the linear memory caps at 4 GiB no matter how much physical | |
| // RAM the device has, so heap probe results above 4096 MB cannot exist. | |
// Last-resort heap budget (MB), used by _computeBudget when the heap probe
// reports nothing and neither deviceMemory nor a storage quota is available.
const DEFAULT_BUDGET_MB = 2048;
// When falling back to the storage quota, this share of it is taken as
// the heap budget...
const HOSTED_QUOTA_FRACTION = 0.4;
// ...and the result is capped at this many MB.
const HOSTED_QUOTA_CAP_MB = 8192;
| // Mobile per-device budgets. Two independent caps, mirroring the desktop | |
| // path — model weights stream from OPFS into WebGPU buffers (see | |
| // bench-worker.js:patchMEMFS / opfsAlloc), so the model size constrains | |
| // `gpuBudgetMB`, not `heapBudgetMB`. The WASM heap only has to hold the | |
| // working set (KV cache + ggml compute scratch + JS heap headroom). | |
| // | |
| // Earlier we collapsed both into a single tab budget on the theory that | |
| // iOS Jetsam treats the whole tab process as one pool, so any allocation | |
| // counts the same. That's true for Jetsam — but it conflates *where* the | |
| // memory lives with *how much* the platform can hand out: the WASM heap | |
| // has a much tighter practical ceiling than the GPU side, and counting | |
| // model bytes against the heap ceiling rejected models that load fine | |
| // via OPFS streaming. | |
| // | |
| // Numbers come from public reports / Apple docs: | |
| // | |
| // - iPhone WASM practical limit: 300–450 MB → heap budget | |
| // lapcatsoftware.com/articles/2026/1/7.html | |
| // news.ycombinator.com/item?id=39039593 | |
| // github.com/emscripten-core/emscripten/issues/19374 | |
| // github.com/godotengine/godot/issues/70621 | |
| // | |
| // - iOS Safari WebGPU maxBufferSize: 256 MB on iPhone 6 / older, | |
| // 993 MB on iPad Pro M-class. Per-buffer cap, not total. | |
| // Apple WWDC 2025 "Unlock GPU computing with WebGPU" | |
| // | |
| // - iPhone 12 Pro reports tab OOM around 1.5–3 GB; Jetsam intervenes | |
| // earlier under pressure. We undershoot the lower bound for headroom. | |
| // developer.apple.com/forums/thread/761666 | |
| // | |
// Heap budgets = WASM heap practical limits, in MB, one per mobile family.
// getMobileBudgetMB() hands these out as the `heap` half of its result.
const IPHONE_HEAP_BUDGET_MB = 450;
const IPAD_HEAP_BUDGET_MB = 1_500;
const ANDROID_HEAP_BUDGET_MB = 800;
| // GPU budgets = available GPU-buffer capacity for model weights + KV | |
| // mirror, sized below the Jetsam tab ceiling minus working-set headroom. | |
| // Static per-family numbers — we don't probe on mobile because the | |
| // probe's allocation pulse itself triggers Jetsam on lower-RAM devices, | |
| // and WebKit doesn't expose a signal that distinguishes (e.g.) iPhone 13 | |
| // from iPhone 17 Pro Max (same maxBufferSize 1024 MB on both, same | |
| // deviceMemory clamp). See "mobile probe" history in git: bounded | |
| // probe shipped, then disabled because the iPhone 13 / mid-RAM iPad | |
| // classes still Jetsamed during or right after the probe pulse. | |
| // | |
| // iPhone: empirical — 1200 MB caused tab reloads on first variant of a | |
| // Run study (Llama-3.2-1B Q2_K, 554 MB) on iPhone 17 Pro Max. 700 MB | |
| // keeps Llama-1B variants out of variantFits while still allowing the | |
| // 250–500 MB tier (gemma-3-270m Q8, Qwen3-0.6B Q4, etc.). | |
| // | |
| // iPad: empirical — 2500 MB Jetsamed on Llama-3.2-1B (likely Q4_K_M | |
| // = 770 MB or Q8_0 = 1259 MB). 1500 MB allows Q4_K_M (970 MB after | |
| // overhead), keeping the standard study quant runnable. NOTE: Q8_0 is | |
| // 1459 MB after overhead, which is still ≤ 1500 MB, so variantFits | |
| // admits it as well; the budget has to drop below 1459 MB if Q8_0 is | |
| // meant to be excluded. High-end iPad Pro M-class probably tolerates | |
| // more, but we have no way to detect device class. | |
// Static GPU budgets in MB, one per mobile family. getMobileBudgetMB()
// hands these out as the `gpu` half of its result.
const IPHONE_GPU_BUDGET_MB = 700;
const IPAD_GPU_BUDGET_MB = 1_500;
const ANDROID_GPU_BUDGET_MB = 1_500;
// Classifies the current browser as 'ipad', 'iphone', 'android', or null
// when no mobile signal is found (or when `navigator` does not exist).
//
// Checks run in a fixed order, first match wins:
//   1. "iPad" in the UA, or a touch-capable device whose platform string
//      contains "Mac" (iPadOS 13+ reports a "Macintosh" UA but exposes
//      touch; that's the standard iPad-detection workaround)  → 'ipad'
//   2. "iPhone" / "iPod" in the UA                            → 'iphone'
//   3. "Android…Mobile" in the UA, or UA-CH `mobile === true` → 'android'
function detectMobileFamily() {
  if (typeof navigator === 'undefined') return null;

  const { userAgent, platform, maxTouchPoints, userAgentData } = navigator;
  const ua = userAgent || '';

  const looksLikeIpad =
    /iPad/.test(ua) || (maxTouchPoints > 1 && /Mac/.test(platform || ''));
  if (looksLikeIpad) return 'ipad';

  if (/iPhone|iPod/.test(ua)) return 'iphone';

  const looksLikeAndroid =
    /Android.*Mobile/.test(ua) || userAgentData?.mobile === true;
  return looksLikeAndroid ? 'android' : null;
}
// Maps a mobile family name to its static { heap, gpu } budget pair (MB).
// Any unrecognised family gets the iPhone pair, the smallest of the three
// and therefore the safest default.
function getMobileBudgetMB(family) {
  switch (family) {
    case 'ipad':
      return { heap: IPAD_HEAP_BUDGET_MB, gpu: IPAD_GPU_BUDGET_MB };
    case 'android':
      return { heap: ANDROID_HEAP_BUDGET_MB, gpu: ANDROID_GPU_BUDGET_MB };
    case 'iphone':
    default:
      return { heap: IPHONE_HEAP_BUDGET_MB, gpu: IPHONE_GPU_BUDGET_MB };
  }
}
// Default wait before probeHeapBudgetMB() abandons its worker.
const PROBE_TIMEOUT_MS = 15 * 1000;
// probeGpuBudgetMB() defaults: size of each trial buffer, ceiling on the
// total it will try to allocate, and its wall-clock limit.
const GPU_PROBE_STEP_MB = 256;
const GPU_PROBE_MAX_MB = 8192;
const GPU_PROBE_TIMEOUT_MS = 8 * 1000;
// Working-set floor in the WASM heap. For a typical 1B model at
// n_ctx=2048, KV cache + compute buffers + JS heap headroom come to a few
// hundred MB. The floor sits at 256 so that an absurdly tiny heap (or a
// probe failure that returned 0) can't pass variantFits.
const HEAP_WORKING_SET_FLOOR_MB = 256;
// Flat per-variant overhead added to the model file size for the GPU fit
// check. It stands in for compute buffers, alignment padding and the KV
// cache mirror kept by the WebGPU backend. 200 MB is a conservative
// approximation; the real figure scales somewhat with model and context
// size.
const GPU_VARIANT_OVERHEAD_MB = 200;
// True when detectMobileFamily() reports any family at all (non-null).
export function isMobileDevice() {
  const family = detectMobileFamily();
  return family !== null;
}
// ──────────────── WASM heap probe ────────────────
// Spawn the probe worker (./memory-probe.js), wait for a result, clean up.
//
// Options:
//   stepPages, maxPages — forwarded untouched to the worker.
//   timeoutMs           — how long to wait before giving up
//                         (defaults to PROBE_TIMEOUT_MS).
//
// Always resolves, never rejects: { probedMB } on success, or
// { probedMB: 0, error } on any failure mode (timeout, worker construct
// error, postMessage failure, worker onerror — typically the probe itself
// ran the engine out of memory).
export function probeHeapBudgetMB({ stepPages, maxPages, timeoutMs = PROBE_TIMEOUT_MS } = {}) {
  return new Promise((resolve) => {
    let worker;
    try {
      worker = new Worker(new URL('./memory-probe.js', import.meta.url));
    } catch (err) {
      resolve({ probedMB: 0, error: `worker construct failed: ${err.message}` });
      return;
    }

    // Every outcome funnels through here so the timer is always cleared
    // and the worker always terminated, exactly once, whichever of
    // timeout / message / error / postMessage failure happens first.
    let settled = false;
    let timer = null;
    const finish = (result) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      try { worker.terminate(); } catch { /* noop */ }
      resolve(result);
    };

    timer = setTimeout(() => finish({ probedMB: 0, error: 'probe timeout' }), timeoutMs);

    worker.onmessage = (e) => {
      const { committedMB = 0 } = e.data || {};
      finish({ probedMB: committedMB });
    };
    worker.onerror = (err) => {
      finish({ probedMB: 0, error: err.message || 'worker error' });
    };

    // postMessage can throw synchronously (e.g. a non-cloneable argument).
    // Without this guard the throw would reject the promise and leave the
    // timer and worker alive, breaking the "always resolves" contract.
    try {
      worker.postMessage({ stepPages, maxPages });
    } catch (err) {
      finish({ probedMB: 0, error: `worker postMessage failed: ${err.message}` });
    }
  });
}
// ──────────────── GPU memory probe ────────────────
// Allocate WebGPU buffers in stepMB increments until OOM, return the
// total committed size as the GPU memory budget. Uses
// mappedAtCreation=true to force real memory commit (some drivers lazy-
// allocate until first use otherwise) and captures OOM via the
// 'out-of-memory' error scope, with device.lost as a backstop.
//
// Options (all optional):
//   stepMB    — size of each trial buffer (default GPU_PROBE_STEP_MB).
//   maxMB     — stop once this much has been allocated (GPU_PROBE_MAX_MB).
//   timeoutMs — wall-clock limit for the allocation loop
//               (GPU_PROBE_TIMEOUT_MS).
//   yieldMs   — pause between steps (default 0).
//
// Resolves to { probedMB }, or { probedMB: 0, error } when WebGPU, the
// adapter or the device is unavailable, or stepMB is not a positive number.
// Every buffer and the device itself are destroyed before returning.
//
// Caveats:
//   - The GPU process is shared with other tabs. If they're holding GPU
//     memory the probe undercounts. (Same as wllama's heap probe — best
//     we can do without a richer browser API.)
//   - Some drivers (notably iOS Metal under WebKit) lazy-fail at dispatch
//     time rather than at createBuffer; this probe's number is therefore
//     an upper bound, not a guarantee. Mobile cap below mitigates.
export async function probeGpuBudgetMB({
  stepMB = GPU_PROBE_STEP_MB,
  maxMB = GPU_PROBE_MAX_MB,
  timeoutMs = GPU_PROBE_TIMEOUT_MS,
  yieldMs = 0,
} = {}) {
  if (!navigator.gpu) {
    return { probedMB: 0, error: 'WebGPU not available' };
  }

  const BYTES_PER_MB = 1024 * 1024;
  const stepBytes = stepMB * BYTES_PER_MB;
  const maxBytes = maxMB * BYTES_PER_MB;
  // A zero, negative or NaN step can never make progress; without this
  // guard a zero step would spin on empty buffers until the timeout.
  if (!(stepBytes > 0)) {
    return { probedMB: 0, error: `invalid stepMB: ${stepMB}` };
  }

  let adapter;
  let device;
  try {
    adapter = await navigator.gpu.requestAdapter();
    if (!adapter) return { probedMB: 0, error: 'no WebGPU adapter' };
    // Request the maximum the adapter can give us; defaults are often
    // smaller than what the hardware supports.
    const requiredLimits = {};
    for (const key of ['maxBufferSize', 'maxStorageBufferBindingSize']) {
      const value = adapter.limits?.[key];
      if (typeof value === 'number') requiredLimits[key] = value;
    }
    device = await adapter.requestDevice({ requiredLimits });
  } catch (err) {
    return { probedMB: 0, error: `adapter/device init failed: ${err.message}` };
  }

  let deviceLost = false;
  device.lost.then(() => { deviceLost = true; }).catch(() => {});

  const buffers = [];
  let totalBytes = 0;
  const start = performance.now();
  try {
    while (totalBytes + stepBytes <= maxBytes) {
      if (deviceLost) break;
      if (performance.now() - start > timeoutMs) break;

      device.pushErrorScope('out-of-memory');
      let buffer;
      try {
        buffer = device.createBuffer({
          size: stepBytes,
          usage: GPUBufferUsage.STORAGE,
          mappedAtCreation: true,
        });
        // Touch the start of the mapped range to force a real commit.
        // Drivers can lazy-back the allocation until first write, which
        // would fool the probe into thinking it has more headroom than it
        // really does.
        const touchBytes = Math.min(stepBytes, 64 * 1024);
        new Uint8Array(buffer.getMappedRange(0, touchBytes))[0] = 1;
        buffer.unmap();
      } catch {
        // Keep the error-scope stack balanced before bailing out.
        await device.popErrorScope().catch(() => null);
        try { buffer?.destroy(); } catch { /* noop */ }
        break;
      }

      const error = await device.popErrorScope().catch(() => null);
      // If the device was lost during this step, the error scope can come
      // back empty even though the allocation did not survive. Treat that
      // like an OOM instead of counting the step as committed memory.
      if (error || deviceLost) {
        try { buffer.destroy(); } catch { /* noop */ }
        break;
      }

      buffers.push(buffer);
      totalBytes += stepBytes;
      // Yield so we don't starve the main thread / GC. On mobile a
      // longer yield also gives the OS a chance to update its memory
      // accounting between steps so a fast burst doesn't look like a
      // spike to Jetsam.
      await new Promise((r) => setTimeout(r, yieldMs));
    }
  } finally {
    for (const b of buffers) {
      try { b.destroy(); } catch { /* noop */ }
    }
    try { device.destroy(); } catch { /* noop */ }
  }

  return { probedMB: Math.floor(totalBytes / BYTES_PER_MB) };
}
// ──────────────── public budget API ────────────────
// Cache the full budget for the lifetime of the page load. The probes are
// slow (seconds); we don't want to pay that twice for the same surface.
let _budgetPromise = null;

// Returns the device budget record produced by _computeBudget(). Concurrent
// and later callers share one in-flight / completed computation.
//
// Only successful results stay cached: if the computation rejects, the
// cache is cleared so the next call retries instead of replaying the same
// failure for the rest of the page load.
export async function getDeviceBudgetMB() {
  if (_budgetPromise === null) {
    const pending = _computeBudget();
    _budgetPromise = pending;
    pending.catch(() => {
      // Only clear our own entry; a newer attempt may already be cached.
      if (_budgetPromise === pending) _budgetPromise = null;
    });
  }
  return _budgetPromise;
}
// Builds the budget record that getDeviceBudgetMB() caches.
//
// Resolves to { budgetMB, gpuBudgetMB, heapBudgetMB, memGB, quotaMB,
// probedMB, gpuProbedMB, probeError, gpuProbeError, isMobile,
// mobileFamily, source, heapSource }. `budgetMB` mirrors `gpuBudgetMB`.
async function _computeBudget() {
  const BYTES_PER_MB = 1024 * 1024;

  // Coarse browser hints, reported as-is in the result and used as heap
  // fallbacks on desktop.
  const memGB = typeof navigator.deviceMemory === 'number' ? navigator.deviceMemory : null;

  let quotaMB = null;
  try {
    const estimate = await navigator.storage?.estimate?.();
    if (estimate?.quota) quotaMB = estimate.quota / BYTES_PER_MB;
  } catch {
    // some browsers throw on storage.estimate in non-secure contexts
  }

  const mobileFamily = detectMobileFamily();

  // ── Mobile path: pure static budgets ──
  //
  // No probes on mobile. Both the heap probe and the GPU probe have been
  // shown to themselves trigger Jetsam:
  //   - Heap probe: commit 6f33b5d (terminated worker).
  //   - GPU probe (unbounded): commit 4f567a5.
  //   - GPU probe (bounded): the 1000 MB peak allocation pulse on
  //     iPhone 13 / mid-RAM iPad classes still pushed the WebContent
  //     process over the Jetsam threshold during or right after the
  //     probe — even though the probe itself completed cleanly,
  //     subsequent OPFS writes hit "unknown transient" errors and the
  //     next inference allocation tipped the tab over.
  //
  // We can't distinguish iPhone 13 (6 GB) from iPhone 17 Pro Max (12 GB)
  // via WebGPU adapter info or navigator.deviceMemory, so we err on the
  // side of the lower-RAM device. The budgets are tuned to admit the
  // 250–500 MB tier (gemma-3-270m, Qwen3-0.6B, LFM2.5-350M) and to
  // exclude variants that empirically caused crashes on the smallest
  // device in each family. We still surface adapter.limits.maxBufferSize
  // in the source string for diagnostics.
  if (mobileFamily !== null) {
    const { heap: heapBudgetMB, gpu: gpuBudgetMB } = getMobileBudgetMB(mobileFamily);

    // Read adapter limits without allocating a device buffer — purely
    // informational for the device card / log line.
    let maxBufferSizeMB = 0;
    let adapterReadError = null;
    try {
      if (!navigator.gpu) {
        adapterReadError = 'WebGPU not available';
      } else {
        const adapter = await navigator.gpu.requestAdapter();
        const limitBytes = adapter?.limits?.maxBufferSize;
        if (typeof limitBytes === 'number') {
          maxBufferSizeMB = Math.floor(limitBytes / BYTES_PER_MB);
        }
      }
    } catch (err) {
      adapterReadError = err.message;
    }

    let adapterDetail = '';
    if (adapterReadError) {
      adapterDetail = ` (adapter read failed: ${adapterReadError})`;
    } else if (maxBufferSizeMB > 0) {
      adapterDetail = ` (maxBufferSize ${maxBufferSizeMB} MB)`;
    }

    const skipReason = 'skipped on mobile (probes themselves trigger Jetsam)';
    return {
      budgetMB: gpuBudgetMB,
      gpuBudgetMB,
      heapBudgetMB,
      memGB,
      quotaMB,
      probedMB: 0,
      gpuProbedMB: 0,
      probeError: skipReason,
      gpuProbeError: skipReason,
      isMobile: true,
      mobileFamily,
      source: `mobile static budget — ${mobileFamily} (GPU ${gpuBudgetMB} MB for OPFS-streamed weights)${adapterDetail}`,
      heapSource: `mobile static budget — ${mobileFamily} (WASM heap ${heapBudgetMB} MB for KV + compute scratch)`,
    };
  }

  // ── Desktop path: real probes ──
  const [heapProbe, gpuProbe] = await Promise.all([
    probeHeapBudgetMB(),
    probeGpuBudgetMB(),
  ]);

  // Heap budget: probe result first, then deviceMemory, then the storage
  // quota, then the hard-coded default.
  const pickHeap = () => {
    if (heapProbe.probedMB > 0) {
      return [heapProbe.probedMB, `probe (WASM heap, ${heapProbe.probedMB} MB committed)`];
    }
    if (memGB !== null) {
      return [memGB * 1024 * 0.6, 'navigator.deviceMemory (heap probe failed)'];
    }
    if (quotaMB !== null) {
      return [
        Math.min(quotaMB * HOSTED_QUOTA_FRACTION, HOSTED_QUOTA_CAP_MB),
        'navigator.storage.estimate().quota (heap probe failed)',
      ];
    }
    return [DEFAULT_BUDGET_MB, 'default (heap probe failed)'];
  };
  const [heapBudgetMB, heapSource] = pickHeap();

  // GPU budget: the probe result or nothing — there is no fallback.
  const gpuProbeSucceeded = gpuProbe.probedMB > 0;
  const gpuBudgetMB = gpuProbeSucceeded ? gpuProbe.probedMB : 0;
  const gpuSource = gpuProbeSucceeded
    ? `probe (WebGPU buffers, ${gpuProbe.probedMB} MB allocated)`
    : `probe failed: ${gpuProbe.error || 'unknown'}`;

  return {
    budgetMB: gpuBudgetMB,
    gpuBudgetMB,
    heapBudgetMB,
    memGB,
    quotaMB,
    probedMB: heapProbe.probedMB,
    gpuProbedMB: gpuProbe.probedMB,
    probeError: heapProbe.error || null,
    gpuProbeError: gpuProbe.error || null,
    isMobile: false,
    mobileFamily: null,
    source: gpuSource,
    heapSource,
  };
}
// variantFits decides whether a model file of size `sizeMB` (in MB) can be
// loaded and run on this device. Two checks must pass:
//
//   1. sizeMB + GPU_VARIANT_OVERHEAD_MB ≤ gpuBudgetMB
//        Model weights live in WebGPU buffers (since OPFS streaming
//        keeps them off the WASM heap). The overhead covers compute
//        scratch + alignment + KV cache mirror.
//
//   2. heapBudgetMB ≥ HEAP_WORKING_SET_FLOOR_MB
//        The WASM heap still has to fit the working set: KV cache,
//        ggml compute buffers, and JS heap headroom. Roughly constant
//        per inference regardless of model size at fixed n_ctx.
//
// Backwards-compat: if the second arg is a plain number, treat it as
// the legacy heap-only budget and apply the prior 1.5× sizeMB overhead.
// New callers should pass { gpuBudgetMB, heapBudgetMB }.
//
// Returns false for anything that is not a positive finite size, and for
// missing, non-numeric or NaN budgets.
export function variantFits(sizeMB, budget) {
  // Number.isFinite rejects non-numbers, NaN and ±Infinity in one go.
  if (!Number.isFinite(sizeMB) || sizeMB <= 0) return false;

  if (typeof budget === 'number') {
    return budget > 0 && sizeMB * 1.5 <= budget;
  }
  if (!budget || typeof budget !== 'object') return false;

  const { gpuBudgetMB, heapBudgetMB } = budget;
  // Both checks are phrased as "fits" (≤ / ≥) rather than "does not fit"
  // (> / <): any comparison involving NaN is false, so a NaN budget now
  // fails the check instead of slipping past it.
  const gpuFits =
    typeof gpuBudgetMB === 'number' && sizeMB + GPU_VARIANT_OVERHEAD_MB <= gpuBudgetMB;
  const heapFits =
    typeof heapBudgetMB === 'number' && heapBudgetMB >= HEAP_WORKING_SET_FLOOR_MB;
  return gpuFits && heapFits;
}
// Collects everything the page shows about the current device: the cached
// budget record from getDeviceBudgetMB() plus WebGPU adapter info, the raw
// UA / platform strings and UA Client Hints where available.
//
// Resolves to { ...budget, webgpu, gpu, userAgent, platform, uaArch,
// uaPlatform, uaPlatformVersion, uaBrands }. Lookups that fail or are
// unsupported are reported as null (or [] for uaBrands).
export async function describeDevice() {
  const budget = await getDeviceBudgetMB();

  // Adapter info; stays null without WebGPU, without an adapter, or when
  // the request throws.
  let gpu = null;
  if (navigator.gpu) {
    try {
      const adapter = await navigator.gpu.requestAdapter();
      gpu = adapter ? (adapter.info || { vendor: 'unknown' }) : null;
    } catch {
      gpu = null;
    }
  }

  // UA Client Hints: high-entropy values give us the real architecture
  // and OS, neither of which `navigator.platform` reports correctly on
  // Apple Silicon Macs (it returns "MacIntel" forever for back-compat).
  // `fullVersionList` is the high-entropy version of `brands` and gives
  // us the full dotted version (e.g. "147.0.7390.107") instead of just
  // the major.
  const hintNames = ['architecture', 'platform', 'platformVersion', 'fullVersionList'];
  const hints = {
    architecture: null,
    platform: null,
    platformVersion: null,
    fullVersionList: null,
  };
  try {
    const uaData = navigator.userAgentData;
    if (uaData?.getHighEntropyValues) {
      const reported = await uaData.getHighEntropyValues(hintNames);
      for (const name of hintNames) {
        hints[name] = reported[name] || null;
      }
    }
  } catch { /* not Chromium or denied */ }

  // UA-CH brands give us a clean { brand, version } pair without parsing
  // the userAgent string. Prefer the full dotted versions over the
  // major-only default list, and drop the "Not(A:Brand)" decoy entries.
  const decoyBrand = /not[^\w]*a[^\w]*brand/i;
  const brandSource = hints.fullVersionList || navigator.userAgentData?.brands || [];
  const uaBrands = brandSource.filter((entry) => entry && !decoyBrand.test(entry.brand));

  return {
    ...budget,
    webgpu: !!navigator.gpu,
    gpu,
    userAgent: navigator.userAgent,
    platform: navigator.platform ?? null,
    uaArch: hints.architecture,
    uaPlatform: hints.platform,
    uaPlatformVersion: hints.platformVersion,
    uaBrands,
  };
}