GitHub Actions committed on
Commit
43a358a
·
1 Parent(s): 9f7edbf

sync from abhijitramesh/webgpu-bench@bcae90cf03

Browse files
Files changed (4) hide show
  1. js/run/bench-worker.js +26 -5
  2. js/run/controller.js +60 -16
  3. js/run/device.js +229 -45
  4. js/run/source.js +11 -4
js/run/bench-worker.js CHANGED
@@ -102,6 +102,23 @@ function patchMEMFS(Module) {
102
  m.MEMFS.ops_table.file.stream.mmap = m.MEMFS.stream_ops.mmap;
103
  }
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  async function opfsAlloc(Module, name, fileHandle) {
106
  // createSyncAccessHandle is worker-only and exclusive — only one writer
107
  // per OPFS file at a time. Caller must ensure no createWritable session
@@ -189,7 +206,7 @@ self.onmessage = async (e) => {
189
  }
190
  };
191
 
192
- async function runOne({ params, stream, buffer, fileHandle }) {
193
  const {
194
  buildType,
195
  contentLength,
@@ -206,13 +223,16 @@ async function runOne({ params, stream, buffer, fileHandle }) {
206
  noWarmup,
207
  } = params;
208
  // Three input modes are supported:
209
- // fileHandle β†’ wllama-style OPFS-streaming load (preferred for >2GB)
 
 
 
210
  // stream β†’ heap-stream mode (zero-copy WASM-heap, transferable)
211
  // buffer β†’ buffered fallback for browsers without transferable streams
212
  // Exactly one must be provided.
213
- const inputCount = (fileHandle ? 1 : 0) + (stream ? 1 : 0) + (buffer ? 1 : 0);
214
  if (inputCount !== 1) {
215
- throw new Error('runOne: exactly one of `fileHandle`, `stream`, or `buffer` must be provided');
216
  }
217
 
218
  const result = {
@@ -282,10 +302,11 @@ async function runOne({ params, stream, buffer, fileHandle }) {
282
  // in, register a heap-backed MEMFS file. Faster (mmap'd
283
  // zero-copy at load time) but caps at ~2GB.
284
  let modelPtr = 0; // tracks heap-path allocation for cleanup
285
- const useOpfsPath = !!fileHandle;
286
 
287
  if (useOpfsPath) {
288
  status('opfs', 'Linking OPFS-backed model into MEMFS...');
 
289
  patchMEMFS(Module);
290
  const size = await opfsAlloc(Module, 'model.gguf', fileHandle);
291
  log(`OPFS-backed model.gguf registered (${(size / (1024 * 1024)).toFixed(1)} MB)`);
 
102
  m.MEMFS.ops_table.file.stream.mmap = m.MEMFS.stream_ops.mmap;
103
  }
104
 
105
+ // Resolve an OPFS path (rootDir + repo segments + filename) to a
106
+ // FileSystemFileHandle inside this worker. Works around the iOS Safari
107
+ // limitation that FileSystemFileHandle isn't structured-cloneable across
108
+ // postMessage β€” main thread sends the layout key, worker opens the
109
+ // handle locally.
110
+ async function resolveOpfsHandle({ rootDir, repo, filename }) {
111
+ if (!self.navigator?.storage?.getDirectory) {
112
+ throw new Error('OPFS not available in this worker');
113
+ }
114
+ let dir = await self.navigator.storage.getDirectory();
115
+ dir = await dir.getDirectoryHandle(rootDir, { create: false });
116
+ for (const seg of String(repo).split('/').filter(Boolean)) {
117
+ dir = await dir.getDirectoryHandle(seg, { create: false });
118
+ }
119
+ return dir.getFileHandle(filename, { create: false });
120
+ }
121
+
122
  async function opfsAlloc(Module, name, fileHandle) {
123
  // createSyncAccessHandle is worker-only and exclusive — only one writer
124
  // per OPFS file at a time. Caller must ensure no createWritable session
 
206
  }
207
  };
208
 
209
+ async function runOne({ params, stream, buffer, opfsPath }) {
210
  const {
211
  buildType,
212
  contentLength,
 
223
  noWarmup,
224
  } = params;
225
  // Three input modes are supported:
226
+ // opfsPath β†’ wllama-style OPFS-streaming load (preferred for >2GB).
227
+ // Resolved to a FileSystemFileHandle inside the worker
228
+ // via navigator.storage.getDirectory() β€” FileHandles
229
+ // themselves don't structured-clone reliably (iOS Safari).
230
  // stream β†’ heap-stream mode (zero-copy WASM-heap, transferable)
231
  // buffer β†’ buffered fallback for browsers without transferable streams
232
  // Exactly one must be provided.
233
+ const inputCount = (opfsPath ? 1 : 0) + (stream ? 1 : 0) + (buffer ? 1 : 0);
234
  if (inputCount !== 1) {
235
+ throw new Error('runOne: exactly one of `opfsPath`, `stream`, or `buffer` must be provided');
236
  }
237
 
238
  const result = {
 
302
  // in, register a heap-backed MEMFS file. Faster (mmap'd
303
  // zero-copy at load time) but caps at ~2GB.
304
  let modelPtr = 0; // tracks heap-path allocation for cleanup
305
+ const useOpfsPath = !!opfsPath;
306
 
307
  if (useOpfsPath) {
308
  status('opfs', 'Linking OPFS-backed model into MEMFS...');
309
+ const fileHandle = await resolveOpfsHandle(opfsPath);
310
  patchMEMFS(Module);
311
  const size = await opfsAlloc(Module, 'model.gguf', fileHandle);
312
  log(`OPFS-backed model.gguf registered (${(size / (1024 * 1024)).toFixed(1)} MB)`);
js/run/controller.js CHANGED
@@ -3,8 +3,8 @@
3
  // classes. Detects `surface` (localhost / space / pages) to gate the
4
  // server save checkbox and the HF hub sign-in/submit row.
5
 
6
- import { localSource, hostedSource, inventoryOpfs, purgeOpfs } from './source.js';
7
- import { getDeviceBudgetMB, variantFits, describeDevice } from './device.js';
8
  import {
9
  resumeHFSession, beginHFSignIn, signOutHF, submitResultsToDataset,
10
  HF_OAUTH_PENDING_KEY,
@@ -14,7 +14,6 @@ import { isHubConfigured, HF_DATASET_REPO } from './config.js';
14
  const RUN_INTENT_STORAGE_KEY = 'webgpu-bench:runIntent';
15
  const CRASH_STALE_MS = 10_000;
16
 
17
- const OVERHEAD = 1.5;
18
  const DEFAULT_PROMPT =
19
  'Explain quantum computing to a software engineer in four concise paragraphs. ' +
20
  'Cover superposition, entanglement, quantum gates, and one practical use case.';
@@ -176,7 +175,15 @@ function computeWarnings(modelName, quant) {
176
  }
177
 
178
  function cacheKey(v) { return `${v.repo}/${v.filename}`; }
179
- function variantFitsDevice(v) { return variantFits(v.sizeMB, state.budget.budgetMB, OVERHEAD); }
 
 
 
 
 
 
 
 
180
  function isCached(v) {
181
  const entry = state.cacheStatus[cacheKey(v)];
182
  return !!entry && entry.cachedBytes > 0;
@@ -272,9 +279,14 @@ function renderHeader() {
272
  const memStr = b.memGB !== null ? `${b.memGB} GB` : 'β€”';
273
  $('device-memory').textContent = memStr;
274
 
 
 
 
 
275
  const budgetGB = (b.budgetMB / 1024).toFixed(1);
 
276
  $('device-budget').textContent = `${budgetGB} GB`;
277
- $('device-budget-source').textContent = `source: ${b.source}`;
278
 
279
  const webgpuCell = $('device-webgpu');
280
  if (webgpuCell) {
@@ -1047,14 +1059,28 @@ async function onRunClick() {
1047
  state.sessionDownloads = new Set();
1048
  updateButtons();
1049
 
 
 
 
 
 
 
 
1050
  const machine = await machineInfo();
1051
  const browser = browserInfo();
1052
  const evictAfter = !!$('evict-after-run')?.checked;
1053
 
1054
  // One-ahead prefetch: while variant i runs, we may have variant i+1
1055
  // downloading. Only one prefetch in flight at a time.
 
 
 
 
 
 
1056
  const prefetchFor = async (v) => {
1057
  if (!v || isCached(v)) return;
 
1058
  const row = progressRowFor(v);
1059
  row.setStatus('prefetching', '');
1060
  try {
@@ -1090,7 +1116,10 @@ async function onRunClick() {
1090
  // Wait for variant i to be cached (either via prefetch or pre-existing).
1091
  await prefetchPromise;
1092
  if (state.aborted) break;
1093
- if (!isCached(v)) {
 
 
 
1094
  row.setStatus('error', 'not cached after prefetch');
1095
  prefetchPromise = prefetchFor(variants[i + 1]);
1096
  continue;
@@ -1164,7 +1193,7 @@ async function onRunClick() {
1164
  function runInWorker({
1165
  params,
1166
  stream,
1167
- fileHandle,
1168
  onStatus,
1169
  onProgress,
1170
  onLog,
@@ -1206,14 +1235,16 @@ function runInWorker({
1206
  };
1207
 
1208
  // Three transport modes β€” see bench-worker.js runOne() for matching shape.
1209
- if (fileHandle) {
1210
- // OPFS path: FileSystemFileHandle is structured-cloneable, not
1211
- // transferable. The worker creates its own sync access handle on the
1212
- // cloned reference (still bound to the same underlying OPFS file).
 
 
1213
  try {
1214
- worker.postMessage({ type: 'run', params, fileHandle });
1215
  } catch (err) {
1216
- finish({ status: 'error', error: `postMessage(fileHandle) failed: ${err.message}` });
1217
  }
1218
  return;
1219
  }
@@ -1333,21 +1364,34 @@ async function runBenchmarkInWorker(v, params, callbacks) {
1333
  };
1334
 
1335
  if (useOpfs) {
1336
- let fileHandle, contentLength;
1337
  try {
1338
  callbacks.onStatus?.('downloading', 'Downloading model to OPFS...');
1339
  const r = await state.source.opfsHandleForModel(
1340
  v.repo, v.filename,
1341
  callbacks.onProgress,
1342
  );
1343
- fileHandle = r.handle;
1344
  contentLength = r.size;
 
 
 
 
 
 
 
 
 
1345
  } catch (err) {
1346
  return { status: 'error', error: `opfsHandleForModel failed: ${err.message}` };
1347
  }
 
 
 
 
 
1348
  return runInWorker({
1349
  params: { ...baseParams, contentLength },
1350
- fileHandle,
1351
  onStatus: callbacks.onStatus,
1352
  onProgress: callbacks.onProgress,
1353
  onLog: callbacks.onLog,
 
3
  // classes. Detects `surface` (localhost / space / pages) to gate the
4
  // server save checkbox and the HF hub sign-in/submit row.
5
 
6
+ import { localSource, hostedSource, inventoryOpfs, purgeOpfs, OPFS_ROOT_NAME } from './source.js';
7
+ import { getDeviceBudgetMB, variantFits, describeDevice, isMobileDevice } from './device.js';
8
  import {
9
  resumeHFSession, beginHFSignIn, signOutHF, submitResultsToDataset,
10
  HF_OAUTH_PENDING_KEY,
 
14
  const RUN_INTENT_STORAGE_KEY = 'webgpu-bench:runIntent';
15
  const CRASH_STALE_MS = 10_000;
16
 
 
17
  const DEFAULT_PROMPT =
18
  'Explain quantum computing to a software engineer in four concise paragraphs. ' +
19
  'Cover superposition, entanglement, quantum gates, and one practical use case.';
 
175
  }
176
 
177
  function cacheKey(v) { return `${v.repo}/${v.filename}`; }
178
+ function variantFitsDevice(v) {
179
+ // New variantFits signature: pass both budgets so the predicate can
180
+ // check (a) model fits in GPU memory + small overhead, and (b) WASM
181
+ // heap can hold the working set. See device.js for the rationale.
182
+ return variantFits(v.sizeMB, {
183
+ gpuBudgetMB: state.budget.gpuBudgetMB,
184
+ heapBudgetMB: state.budget.heapBudgetMB,
185
+ });
186
+ }
187
  function isCached(v) {
188
  const entry = state.cacheStatus[cacheKey(v)];
189
  return !!entry && entry.cachedBytes > 0;
 
279
  const memStr = b.memGB !== null ? `${b.memGB} GB` : 'β€”';
280
  $('device-memory').textContent = memStr;
281
 
282
+ // budgetMB is now the GPU-memory budget (per device.js _computeBudget),
283
+ // since with OPFS streaming the model lives in WebGPU buffers, not the
284
+ // WASM heap. We surface the heap budget separately in the source line so
285
+ // a curious reader can see both probes' results.
286
  const budgetGB = (b.budgetMB / 1024).toFixed(1);
287
+ const heapGB = (b.heapBudgetMB / 1024).toFixed(1);
288
  $('device-budget').textContent = `${budgetGB} GB`;
289
+ $('device-budget-source').textContent = `GPU memory · WASM heap: ${heapGB} GB`;
290
 
291
  const webgpuCell = $('device-webgpu');
292
  if (webgpuCell) {
 
1059
  state.sessionDownloads = new Set();
1060
  updateButtons();
1061
 
1062
+ if (isMobileDevice()) {
1063
+ logLine(
1064
+ 'Mobile device β€” running with sequential downloads (no parallel prefetch). ' +
1065
+ 'Each variant downloads, runs, evicts, then the next begins.',
1066
+ );
1067
+ }
1068
+
1069
  const machine = await machineInfo();
1070
  const browser = browserInfo();
1071
  const evictAfter = !!$('evict-after-run')?.checked;
1072
 
1073
  // One-ahead prefetch: while variant i runs, we may have variant i+1
1074
  // downloading. Only one prefetch in flight at a time.
1075
+ // On mobile, the overlap is a measurement hazard β€” concurrent download
1076
+ // contends with inference for SoC power, memory bandwidth, and OPFS
1077
+ // write queues. Skip the prefetch entirely; runBenchmarkInWorker's
1078
+ // opfsHandleForModel does the download inline (with the same progress
1079
+ // events the prefetch row would have shown).
1080
+ const skipPrefetch = isMobileDevice();
1081
  const prefetchFor = async (v) => {
1082
  if (!v || isCached(v)) return;
1083
+ if (skipPrefetch) return;
1084
  const row = progressRowFor(v);
1085
  row.setStatus('prefetching', '');
1086
  try {
 
1116
  // Wait for variant i to be cached (either via prefetch or pre-existing).
1117
  await prefetchPromise;
1118
  if (state.aborted) break;
1119
+ // When skipPrefetch is on (mobile), variants arrive uncached and
1120
+ // runBenchmarkInWorker β†’ opfsHandleForModel handles the inline
1121
+ // download. Skip the cache-check error path in that case.
1122
+ if (!skipPrefetch && !isCached(v)) {
1123
  row.setStatus('error', 'not cached after prefetch');
1124
  prefetchPromise = prefetchFor(variants[i + 1]);
1125
  continue;
 
1193
  function runInWorker({
1194
  params,
1195
  stream,
1196
+ opfsPath,
1197
  onStatus,
1198
  onProgress,
1199
  onLog,
 
1235
  };
1236
 
1237
  // Three transport modes β€” see bench-worker.js runOne() for matching shape.
1238
+ if (opfsPath) {
1239
+ // OPFS path: send the layout key only (rootDir + repo + filename).
1240
+ // The worker re-resolves to a FileSystemFileHandle via
1241
+ // navigator.storage.getDirectory() itself. Plain JSON-serializable β€”
1242
+ // works on iOS Safari, where FileSystemFileHandle structured-clone
1243
+ // is not implemented.
1244
  try {
1245
+ worker.postMessage({ type: 'run', params, opfsPath });
1246
  } catch (err) {
1247
+ finish({ status: 'error', error: `postMessage(opfsPath) failed: ${err.message}` });
1248
  }
1249
  return;
1250
  }
 
1364
  };
1365
 
1366
  if (useOpfs) {
1367
+ let contentLength;
1368
  try {
1369
  callbacks.onStatus?.('downloading', 'Downloading model to OPFS...');
1370
  const r = await state.source.opfsHandleForModel(
1371
  v.repo, v.filename,
1372
  callbacks.onProgress,
1373
  );
 
1374
  contentLength = r.size;
1375
+ // When the prefetch is skipped (mobile path), the inline download
1376
+ // above is the variant's first arrival in OPFS. Mark it as
1377
+ // session-downloaded so the post-run eviction logic frees it before
1378
+ // the next variant starts β€” keeping disk usage flat.
1379
+ if (r.wasDownloaded) {
1380
+ state.sessionDownloads.add(cacheKey(v));
1381
+ state.cacheStatus[cacheKey(v)] = { cachedBytes: r.size };
1382
+ refreshCacheBadge(v);
1383
+ }
1384
  } catch (err) {
1385
  return { status: 'error', error: `opfsHandleForModel failed: ${err.message}` };
1386
  }
1387
+ // Pass the OPFS path components, not the FileHandle. iOS Safari
1388
+ // (and some older Chromium/Firefox versions) can't structured-clone
1389
+ // FileSystemFileHandle across postMessage. The worker re-resolves the
1390
+ // handle via navigator.storage.getDirectory() itself, which works
1391
+ // everywhere OPFS is supported.
1392
  return runInWorker({
1393
  params: { ...baseParams, contentLength },
1394
+ opfsPath: { rootDir: OPFS_ROOT_NAME, repo: v.repo, filename: v.filename },
1395
  onStatus: callbacks.onStatus,
1396
  onProgress: callbacks.onProgress,
1397
  onLog: callbacks.onLog,
js/run/device.js CHANGED
@@ -1,29 +1,66 @@
1
  // Device-fit helpers for the interactive bench page.
2
  //
3
- // getDeviceBudgetMB() empirically probes the WASM heap's actual maximum
4
- // growth on this device, mirroring how llama.cpp itself allocates (a single
5
- // WebAssembly.Memory grown in pages). The probe runs in a worker so an
6
- // allocation failure dies harmlessly. We fall back to deviceMemory /
7
- // storage.estimate heuristics if the probe can't run.
8
  //
9
- // The probed value is the budget β€” no extra safety factor. variantFits()
10
- // already multiplies the GGUF size by 1.5× to cover the WASM heap +
11
- // activations + KV cache + WebGPU staging beyond the file size.
 
 
 
 
 
 
 
 
 
 
 
 
12
  //
13
  // On wasm32 the linear memory caps at 4 GiB no matter how much physical
14
- // RAM the device has, so probe results above 4096 MB cannot exist.
15
 
16
  const DEFAULT_BUDGET_MB = 2 * 1024;
17
  const HOSTED_QUOTA_FRACTION = 0.4;
18
  const HOSTED_QUOTA_CAP_MB = 8 * 1024;
19
 
20
- // Hard ceiling on mobile regardless of probe result. iOS/Android can reap
21
- // the tab under system memory pressure without raising a JS error the
22
- // probe could observe, so an "ok at 4 GiB" result is not safe to trust on
23
- // a phone β€” the OS gets the last word.
24
- const MOBILE_BUDGET_CEILING_MB = 600;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  const PROBE_TIMEOUT_MS = 15_000;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  export function isMobileDevice() {
29
  if (typeof navigator === 'undefined') return false;
@@ -32,6 +69,8 @@ export function isMobileDevice() {
32
  return /iPhone|iPad|iPod|Android.*Mobile/.test(ua);
33
  }
34
 
 
 
35
  // Spawn the probe worker, wait for a result, clean up. Returns
36
  // { probedMB } on success, or { probedMB: 0, error } on any failure mode
37
  // (timeout, worker construct error, worker onerror β€” typically the probe
@@ -67,9 +106,107 @@ export function probeHeapBudgetMB({ stepPages, maxPages, timeoutMs = PROBE_TIMEO
67
  });
68
  }
69
 
70
- // Cache the budget for the lifetime of the page load. controller.js calls
71
- // both getDeviceBudgetMB() and describeDevice() at mount, and we don't want
72
- // to run the 1–2 s probe twice.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  let _budgetPromise = null;
74
 
75
  export async function getDeviceBudgetMB() {
@@ -90,49 +227,96 @@ async function _computeBudget() {
90
 
91
  const isMobile = isMobileDevice();
92
 
93
- // Primary: empirical probe. The worker grows a WebAssembly.Memory page
94
- // by page and reports how far it got β€” that's literally the WASM heap
95
- // ceiling on this device, capped at wasm32's 4 GiB. We trust it directly;
96
- // variantFits() applies the per-variant 1.5Γ— overhead.
97
- const probe = await probeHeapBudgetMB();
98
- const probedMB = probe.probedMB;
99
-
100
- let budgetMB;
101
- let source;
102
- if (probedMB > 0) {
103
- budgetMB = probedMB;
104
- source = `probe (WASM heap, ${probedMB} MB committed)`;
105
  } else if (memGB !== null) {
106
- budgetMB = memGB * 1024 * 0.6;
107
- source = 'navigator.deviceMemory (probe failed)';
108
  } else if (quotaMB !== null) {
109
- budgetMB = Math.min(quotaMB * HOSTED_QUOTA_FRACTION, HOSTED_QUOTA_CAP_MB);
110
- source = 'navigator.storage.estimate().quota (probe failed)';
111
  } else {
112
- budgetMB = DEFAULT_BUDGET_MB;
113
- source = 'default (probe failed)';
 
 
 
 
114
  }
115
 
116
- if (isMobile && budgetMB > MOBILE_BUDGET_CEILING_MB) {
117
- budgetMB = MOBILE_BUDGET_CEILING_MB;
118
- source += ' β†’ mobile-capped';
 
 
 
 
 
119
  }
120
 
121
  return {
122
- budgetMB,
 
 
 
 
 
123
  memGB,
124
  quotaMB,
125
- probedMB,
126
- probeError: probe.error || null,
 
 
127
  isMobile,
128
- source,
 
 
 
129
  };
130
  }
131
 
132
- export function variantFits(sizeMB, budgetMB, overhead = 1.5) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  if (typeof sizeMB !== 'number' || sizeMB <= 0) return false;
134
- if (typeof budgetMB !== 'number' || budgetMB <= 0) return false;
135
- return sizeMB * overhead <= budgetMB;
 
 
 
 
 
 
 
 
 
 
 
 
136
  }
137
 
138
  export async function describeDevice() {
 
1
  // Device-fit helpers for the interactive bench page.
2
  //
3
+ // Two budget probes drive the per-variant fit decision:
 
 
 
 
4
  //
5
+ // getDeviceBudgetMB() β€” empirical WASM heap probe. Grows a
6
+ // WebAssembly.Memory page-by-page in a worker until it fails. Caps
7
+ // the working set (KV cache + compute scratch + JS heap headroom)
8
+ // llama.cpp consumes during inference.
9
+ //
10
+ // probeGpuBudgetMB() β€” empirical WebGPU memory probe. Allocates real
11
+ // buffers with mappedAtCreation=true on the actual adapter until OOM.
12
+ // Caps the size of model weights llama.cpp can hold in GPU buffers,
13
+ // since OPFS-streaming keeps model bytes off the WASM heap.
14
+ //
15
+ // variantFits() then checks both: model size + GPU overhead ≤ GPU budget,
16
+ // AND heap working-set floor ≀ heap budget. wllama doesn't probe at all
17
+ // β€” they let load attempts fail naturally β€” but our auto-select buttons
18
+ // ("All fit", "Run study") need a fit predicate, so we err on the side
19
+ // of measuring rather than guessing.
20
  //
21
  // On wasm32 the linear memory caps at 4 GiB no matter how much physical
22
+ // RAM the device has, so heap probe results above 4096 MB cannot exist.
23
 
24
  const DEFAULT_BUDGET_MB = 2 * 1024;
25
  const HOSTED_QUOTA_FRACTION = 0.4;
26
  const HOSTED_QUOTA_CAP_MB = 8 * 1024;
27
 
28
+ // Hard ceiling on mobile WASM heap regardless of probe result. iOS/Android
29
+ // can reap the tab under system memory pressure without raising a JS error
30
+ // the probe could observe, so an "ok at 4 GiB" result is not safe to trust
31
+ // on a phone.
32
+ //
33
+ // Empirically iOS Safari tabs get reaped well below the WebAssembly.Memory
34
+ // engine cap (~1 GiB on iPhone), and Android Chrome on mid-range devices
35
+ // behaves similarly. Below 500 MB heap usage tends to be safe across
36
+ // modern phones; above that we start seeing tab kills mid-run. The OPFS-
37
+ // streaming model load means model bytes no longer live on the WASM heap,
38
+ // so this budget caps the per-step working set, not the model file.
39
+ const MOBILE_HEAP_CEILING_MB = 500;
40
+
41
+ // Hard ceiling on mobile GPU memory probe result. Even when the probe
42
+ // succeeds at higher numbers, the OS may evict the GPU process or the tab
43
+ // before we can actually use it. iPhone WebGPU (Metal-3 under the hood)
44
+ // typically gives a tab 1.5–3 GB usable depending on device class; cap at
45
+ // 3 GB as a conservative ceiling that won't reject anything reasonable.
46
+ const MOBILE_GPU_CEILING_MB = 3 * 1024;
47
 
48
  const PROBE_TIMEOUT_MS = 15_000;
49
+ const GPU_PROBE_STEP_MB = 256;
50
+ const GPU_PROBE_MAX_MB = 8 * 1024;
51
+ const GPU_PROBE_TIMEOUT_MS = 8_000;
52
+
53
+ // Working-set floor in the WASM heap. KV cache + compute buffers + JS heap
54
+ // headroom for a typical 1B model at n_ctx=2048 add up to ~400 MB; we
55
+ // require 500 to leave a margin. Bigger contexts scale this up β€” not
56
+ // modeled yet (worth revisiting if we benchmark at n_ctx >> 2048).
57
+ const HEAP_WORKING_SET_FLOOR_MB = 500;
58
+
59
+ // Per-variant overhead added on top of the model file size when checking
60
+ // GPU fit. Covers compute buffers, alignment padding, and the KV cache
61
+ // mirror that the WebGPU backend keeps. A flat 200 MB is a conservative
62
+ // approximation; in practice it scales somewhat with model + context size.
63
+ const GPU_VARIANT_OVERHEAD_MB = 200;
64
 
65
  export function isMobileDevice() {
66
  if (typeof navigator === 'undefined') return false;
 
69
  return /iPhone|iPad|iPod|Android.*Mobile/.test(ua);
70
  }
71
 
72
+ // ──────────────── WASM heap probe ────────────────
73
+
74
  // Spawn the probe worker, wait for a result, clean up. Returns
75
  // { probedMB } on success, or { probedMB: 0, error } on any failure mode
76
  // (timeout, worker construct error, worker onerror β€” typically the probe
 
106
  });
107
  }
108
 
109
+ // ──────────────── GPU memory probe ────────────────
110
+
111
+ // Allocate WebGPU buffers in stepMB increments until OOM, return the
112
+ // total committed bytes as the GPU memory budget. Uses
113
+ // mappedAtCreation=true to force real memory commit (some drivers lazy-
114
+ // allocate until first use otherwise) and captures OOM via the
115
+ // 'out-of-memory' error scope, with device.lost as a backstop.
116
+ //
117
+ // Caveats:
118
+ // - The GPU process is shared with other tabs. If they're holding GPU
119
+ // memory the probe undercounts. (Same as wllama's heap probe β€” best
120
+ // we can do without a richer browser API.)
121
+ // - Some drivers (notably iOS Metal under WebKit) lazy-fail at dispatch
122
+ // time rather than at createBuffer; this probe's number is therefore
123
+ // an upper bound, not a guarantee. Mobile cap below mitigates.
124
+ export async function probeGpuBudgetMB({
125
+ stepMB = GPU_PROBE_STEP_MB,
126
+ maxMB = GPU_PROBE_MAX_MB,
127
+ timeoutMs = GPU_PROBE_TIMEOUT_MS,
128
+ } = {}) {
129
+ if (!navigator.gpu) {
130
+ return { probedMB: 0, error: 'WebGPU not available' };
131
+ }
132
+
133
+ let adapter, device;
134
+ try {
135
+ adapter = await navigator.gpu.requestAdapter();
136
+ if (!adapter) return { probedMB: 0, error: 'no WebGPU adapter' };
137
+ // Request the maximum the adapter can give us; defaults are often
138
+ // smaller than what the hardware supports.
139
+ const requiredLimits = {};
140
+ const cap = (k) => {
141
+ const v = adapter.limits?.[k];
142
+ if (typeof v === 'number') requiredLimits[k] = v;
143
+ };
144
+ cap('maxBufferSize');
145
+ cap('maxStorageBufferBindingSize');
146
+ device = await adapter.requestDevice({ requiredLimits });
147
+ } catch (err) {
148
+ return { probedMB: 0, error: `adapter/device init failed: ${err.message}` };
149
+ }
150
+
151
+ let deviceLost = false;
152
+ device.lost.then(() => { deviceLost = true; }).catch(() => {});
153
+
154
+ const buffers = [];
155
+ const stepBytes = stepMB * 1024 * 1024;
156
+ let totalBytes = 0;
157
+ const start = performance.now();
158
+
159
+ try {
160
+ while (totalBytes + stepBytes <= maxMB * 1024 * 1024) {
161
+ if (deviceLost) break;
162
+ if (performance.now() - start > timeoutMs) break;
163
+
164
+ device.pushErrorScope('out-of-memory');
165
+ let buffer;
166
+ try {
167
+ buffer = device.createBuffer({
168
+ size: stepBytes,
169
+ usage: GPUBufferUsage.STORAGE,
170
+ mappedAtCreation: true,
171
+ });
172
+ // Touch the start of the mapped range to force a real commit.
173
+ // Drivers can lazy-back the allocation until first write, which
174
+ // would fool the probe into thinking it has more headroom than it
175
+ // really does.
176
+ const touchBytes = Math.min(stepBytes, 64 * 1024);
177
+ new Uint8Array(buffer.getMappedRange(0, touchBytes))[0] = 1;
178
+ buffer.unmap();
179
+ } catch (err) {
180
+ await device.popErrorScope().catch(() => null);
181
+ break;
182
+ }
183
+
184
+ const error = await device.popErrorScope().catch(() => null);
185
+ if (error) {
186
+ try { buffer.destroy(); } catch { /* noop */ }
187
+ break;
188
+ }
189
+
190
+ buffers.push(buffer);
191
+ totalBytes += stepBytes;
192
+
193
+ // Yield so we don't starve the main thread / GC.
194
+ await new Promise((r) => setTimeout(r, 0));
195
+ }
196
+ } finally {
197
+ for (const b of buffers) {
198
+ try { b.destroy(); } catch { /* noop */ }
199
+ }
200
+ try { device.destroy(); } catch { /* noop */ }
201
+ }
202
+
203
+ return { probedMB: Math.floor(totalBytes / (1024 * 1024)) };
204
+ }
205
+
206
+ // ──────────────── public budget API ────────────────
207
+
208
+ // Cache the full budget for the lifetime of the page load. Both probes
209
+ // take 1–8 s; we don't want to pay that twice for the same surface.
210
  let _budgetPromise = null;
211
 
212
  export async function getDeviceBudgetMB() {
 
227
 
228
  const isMobile = isMobileDevice();
229
 
230
+ // Run both probes in parallel.
231
+ const [heapProbe, gpuProbe] = await Promise.all([
232
+ probeHeapBudgetMB(),
233
+ probeGpuBudgetMB(),
234
+ ]);
235
+
236
+ // ── Heap budget ──
237
+ let heapBudgetMB;
238
+ let heapSource;
239
+ if (heapProbe.probedMB > 0) {
240
+ heapBudgetMB = heapProbe.probedMB;
241
+ heapSource = `probe (WASM heap, ${heapProbe.probedMB} MB committed)`;
242
  } else if (memGB !== null) {
243
+ heapBudgetMB = memGB * 1024 * 0.6;
244
+ heapSource = 'navigator.deviceMemory (heap probe failed)';
245
  } else if (quotaMB !== null) {
246
+ heapBudgetMB = Math.min(quotaMB * HOSTED_QUOTA_FRACTION, HOSTED_QUOTA_CAP_MB);
247
+ heapSource = 'navigator.storage.estimate().quota (heap probe failed)';
248
  } else {
249
+ heapBudgetMB = DEFAULT_BUDGET_MB;
250
+ heapSource = 'default (heap probe failed)';
251
+ }
252
+ if (isMobile && heapBudgetMB > MOBILE_HEAP_CEILING_MB) {
253
+ heapBudgetMB = MOBILE_HEAP_CEILING_MB;
254
+ heapSource += ' β†’ mobile-capped';
255
  }
256
 
257
+ // ── GPU budget ──
258
+ let gpuBudgetMB = gpuProbe.probedMB;
259
+ let gpuSource = gpuProbe.probedMB > 0
260
+ ? `probe (WebGPU buffers, ${gpuProbe.probedMB} MB allocated)`
261
+ : `probe failed: ${gpuProbe.error || 'unknown'}`;
262
+ if (isMobile && gpuBudgetMB > MOBILE_GPU_CEILING_MB) {
263
+ gpuBudgetMB = MOBILE_GPU_CEILING_MB;
264
+ gpuSource += ' β†’ mobile-capped';
265
  }
266
 
267
  return {
268
+ // Combined headline budget β€” what the UI shows as "Max model size".
269
+ // GPU memory is now the constraint that varies per device; heap
270
+ // budget is a separate floor check.
271
+ budgetMB: gpuBudgetMB,
272
+ gpuBudgetMB,
273
+ heapBudgetMB,
274
  memGB,
275
  quotaMB,
276
+ probedMB: heapProbe.probedMB,
277
+ gpuProbedMB: gpuProbe.probedMB,
278
+ probeError: heapProbe.error || null,
279
+ gpuProbeError: gpuProbe.error || null,
280
  isMobile,
281
+ // Two-line source string so the UI stays compact while still
282
+ // surfacing both probes in the device card tooltip.
283
+ source: gpuSource,
284
+ heapSource,
285
  };
286
  }
287
 
288
+ // variantFits decides whether a model file of `sizeMB` bytes can be
289
+ // loaded and run on this device. Two checks must pass:
290
+ //
291
+ // 1. sizeMB + GPU_VARIANT_OVERHEAD_MB ≤ gpuBudgetMB
292
+ // Model weights live in WebGPU buffers (since OPFS streaming
293
+ // keeps them off the WASM heap). The overhead covers compute
294
+ // scratch + alignment + KV cache mirror.
295
+ //
296
+ // 2. heapBudgetMB ≥ HEAP_WORKING_SET_FLOOR_MB
297
+ // The WASM heap still has to fit the working set: KV cache,
298
+ // ggml compute buffers, and JS heap headroom. Roughly constant
299
+ // per inference regardless of model size at fixed n_ctx.
300
+ //
301
+ // Backwards-compat: if the second arg is a plain number, treat it as
302
+ // the legacy heap-only budget and apply the prior 1.5× sizeMB overhead.
303
+ // New callers should pass { gpuBudgetMB, heapBudgetMB }.
304
+ export function variantFits(sizeMB, budget) {
305
  if (typeof sizeMB !== 'number' || sizeMB <= 0) return false;
306
+
307
+ if (typeof budget === 'number') {
308
+ return budget > 0 && sizeMB * 1.5 <= budget;
309
+ }
310
+ if (!budget || typeof budget !== 'object') return false;
311
+
312
+ const { gpuBudgetMB, heapBudgetMB } = budget;
313
+ if (typeof gpuBudgetMB !== 'number' || sizeMB + GPU_VARIANT_OVERHEAD_MB > gpuBudgetMB) {
314
+ return false;
315
+ }
316
+ if (typeof heapBudgetMB !== 'number' || heapBudgetMB < HEAP_WORKING_SET_FLOOR_MB) {
317
+ return false;
318
+ }
319
+ return true;
320
  }
321
 
322
  export async function describeDevice() {
js/run/source.js CHANGED
@@ -56,7 +56,12 @@ export function localSource() {
56
 
57
  // ──────────────── hosted / OPFS ────────────────
58
 
59
- const OPFS_ROOT_NAME = 'models';
 
 
 
 
 
60
 
61
  async function getOpfsRoot() {
62
  if (!navigator.storage?.getDirectory) {
@@ -100,14 +105,16 @@ export function hostedSource() {
100
  // path: the worker opens a sync access handle on this FileHandle and
101
  // routes MEMFS reads through it, never copying the model into the
102
  // WASM heap. onProgress is called during the download leg with
103
- // (fraction, downloaded, total).
 
 
104
  async opfsHandleForModel(repo, file, onProgress) {
105
  const cached = await getOpfsFileHandle(repo, file, { create: false }).catch(() => null);
106
  if (cached) {
107
  const f = await cached.getFile();
108
  if (f.size > 0) {
109
  onProgress?.(1, f.size, f.size);
110
- return { handle: cached, size: f.size };
111
  }
112
  }
113
 
@@ -136,7 +143,7 @@ export function hostedSource() {
136
  if (contentLength > 0) onProgress?.(downloaded / contentLength, downloaded, contentLength);
137
  }
138
  await writable.close();
139
- return { handle, size: downloaded };
140
  } catch (err) {
141
  try { await writable.abort(err); } catch { /* ignore */ }
142
  throw err;
 
56
 
57
  // ──────────────── hosted / OPFS ────────────────
58
 
59
+ // Exported so bench-worker.js can re-resolve the OPFS file handle inside
60
+ // the worker. We can't transfer FileSystemFileHandle directly across
61
+ // postMessage on every browser (iOS Safari structured-clone is missing
62
+ // the implementation), so instead we send the layout key (rootDir +
63
+ // repo segments + filename) and let the worker open it itself.
64
+ export const OPFS_ROOT_NAME = 'models';
65
 
66
  async function getOpfsRoot() {
67
  if (!navigator.storage?.getDirectory) {
 
105
  // path: the worker opens a sync access handle on this FileHandle and
106
  // routes MEMFS reads through it, never copying the model into the
107
  // WASM heap. onProgress is called during the download leg with
108
+ // (fraction, downloaded, total). The returned `wasDownloaded` flag
109
+ // distinguishes a fresh download from a cache hit so the caller can
110
+ // decide whether to mark the variant for post-run eviction.
111
  async opfsHandleForModel(repo, file, onProgress) {
112
  const cached = await getOpfsFileHandle(repo, file, { create: false }).catch(() => null);
113
  if (cached) {
114
  const f = await cached.getFile();
115
  if (f.size > 0) {
116
  onProgress?.(1, f.size, f.size);
117
+ return { handle: cached, size: f.size, wasDownloaded: false };
118
  }
119
  }
120
 
 
143
  if (contentLength > 0) onProgress?.(downloaded / contentLength, downloaded, contentLength);
144
  }
145
  await writable.close();
146
+ return { handle, size: downloaded, wasDownloaded: true };
147
  } catch (err) {
148
  try { await writable.abort(err); } catch { /* ignore */ }
149
  throw err;