GitHub Actions committed on
Commit
43a358a
·
1 Parent(s): 9f7edbf

sync from abhijitramesh/webgpu-bench@bcae90cf03

Browse files
Files changed (4) hide show
  1. js/run/bench-worker.js +26 -5
  2. js/run/controller.js +60 -16
  3. js/run/device.js +229 -45
  4. js/run/source.js +11 -4
js/run/bench-worker.js CHANGED
@@ -102,6 +102,23 @@ function patchMEMFS(Module) {
102
  m.MEMFS.ops_table.file.stream.mmap = m.MEMFS.stream_ops.mmap;
103
  }
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  async function opfsAlloc(Module, name, fileHandle) {
106
  // createSyncAccessHandle is worker-only and exclusive — only one writer
107
  // per OPFS file at a time. Caller must ensure no createWritable session
@@ -189,7 +206,7 @@ self.onmessage = async (e) => {
189
  }
190
  };
191
 
192
- async function runOne({ params, stream, buffer, fileHandle }) {
193
  const {
194
  buildType,
195
  contentLength,
@@ -206,13 +223,16 @@ async function runOne({ params, stream, buffer, fileHandle }) {
206
  noWarmup,
207
  } = params;
208
  // Three input modes are supported:
209
- // fileHandle β†’ wllama-style OPFS-streaming load (preferred for >2GB)
 
 
 
210
  // stream β†’ heap-stream mode (zero-copy WASM-heap, transferable)
211
  // buffer β†’ buffered fallback for browsers without transferable streams
212
  // Exactly one must be provided.
213
- const inputCount = (fileHandle ? 1 : 0) + (stream ? 1 : 0) + (buffer ? 1 : 0);
214
  if (inputCount !== 1) {
215
- throw new Error('runOne: exactly one of `fileHandle`, `stream`, or `buffer` must be provided');
216
  }
217
 
218
  const result = {
@@ -282,10 +302,11 @@ async function runOne({ params, stream, buffer, fileHandle }) {
282
  // in, register a heap-backed MEMFS file. Faster (mmap'd
283
  // zero-copy at load time) but caps at ~2GB.
284
  let modelPtr = 0; // tracks heap-path allocation for cleanup
285
- const useOpfsPath = !!fileHandle;
286
 
287
  if (useOpfsPath) {
288
  status('opfs', 'Linking OPFS-backed model into MEMFS...');
 
289
  patchMEMFS(Module);
290
  const size = await opfsAlloc(Module, 'model.gguf', fileHandle);
291
  log(`OPFS-backed model.gguf registered (${(size / (1024 * 1024)).toFixed(1)} MB)`);
 
102
  m.MEMFS.ops_table.file.stream.mmap = m.MEMFS.stream_ops.mmap;
103
  }
104
 
105
+ // Resolve an OPFS path (rootDir + repo segments + filename) to a
106
+ // FileSystemFileHandle inside this worker. Works around the iOS Safari
107
+ // limitation that FileSystemFileHandle isn't structured-cloneable across
108
+ // postMessage β€” main thread sends the layout key, worker opens the
109
+ // handle locally.
110
+ async function resolveOpfsHandle({ rootDir, repo, filename }) {
111
+ if (!self.navigator?.storage?.getDirectory) {
112
+ throw new Error('OPFS not available in this worker');
113
+ }
114
+ let dir = await self.navigator.storage.getDirectory();
115
+ dir = await dir.getDirectoryHandle(rootDir, { create: false });
116
+ for (const seg of String(repo).split('/').filter(Boolean)) {
117
+ dir = await dir.getDirectoryHandle(seg, { create: false });
118
+ }
119
+ return dir.getFileHandle(filename, { create: false });
120
+ }
121
+
122
  async function opfsAlloc(Module, name, fileHandle) {
123
  // createSyncAccessHandle is worker-only and exclusive — only one writer
124
  // per OPFS file at a time. Caller must ensure no createWritable session
 
206
  }
207
  };
208
 
209
+ async function runOne({ params, stream, buffer, opfsPath }) {
210
  const {
211
  buildType,
212
  contentLength,
 
223
  noWarmup,
224
  } = params;
225
  // Three input modes are supported:
226
+ // opfsPath β†’ wllama-style OPFS-streaming load (preferred for >2GB).
227
+ // Resolved to a FileSystemFileHandle inside the worker
228
+ // via navigator.storage.getDirectory() β€” FileHandles
229
+ // themselves don't structured-clone reliably (iOS Safari).
230
  // stream β†’ heap-stream mode (zero-copy WASM-heap, transferable)
231
  // buffer β†’ buffered fallback for browsers without transferable streams
232
  // Exactly one must be provided.
233
+ const inputCount = (opfsPath ? 1 : 0) + (stream ? 1 : 0) + (buffer ? 1 : 0);
234
  if (inputCount !== 1) {
235
+ throw new Error('runOne: exactly one of `opfsPath`, `stream`, or `buffer` must be provided');
236
  }
237
 
238
  const result = {
 
302
  // in, register a heap-backed MEMFS file. Faster (mmap'd
303
  // zero-copy at load time) but caps at ~2GB.
304
  let modelPtr = 0; // tracks heap-path allocation for cleanup
305
+ const useOpfsPath = !!opfsPath;
306
 
307
  if (useOpfsPath) {
308
  status('opfs', 'Linking OPFS-backed model into MEMFS...');
309
+ const fileHandle = await resolveOpfsHandle(opfsPath);
310
  patchMEMFS(Module);
311
  const size = await opfsAlloc(Module, 'model.gguf', fileHandle);
312
  log(`OPFS-backed model.gguf registered (${(size / (1024 * 1024)).toFixed(1)} MB)`);
js/run/controller.js CHANGED
@@ -3,8 +3,8 @@
3
  // classes. Detects `surface` (localhost / space / pages) to gate the
4
  // server save checkbox and the HF hub sign-in/submit row.
5
 
6
- import { localSource, hostedSource, inventoryOpfs, purgeOpfs } from './source.js';
7
- import { getDeviceBudgetMB, variantFits, describeDevice } from './device.js';
8
  import {
9
  resumeHFSession, beginHFSignIn, signOutHF, submitResultsToDataset,
10
  HF_OAUTH_PENDING_KEY,
@@ -14,7 +14,6 @@ import { isHubConfigured, HF_DATASET_REPO } from './config.js';
14
  const RUN_INTENT_STORAGE_KEY = 'webgpu-bench:runIntent';
15
  const CRASH_STALE_MS = 10_000;
16
 
17
- const OVERHEAD = 1.5;
18
  const DEFAULT_PROMPT =
19
  'Explain quantum computing to a software engineer in four concise paragraphs. ' +
20
  'Cover superposition, entanglement, quantum gates, and one practical use case.';
@@ -176,7 +175,15 @@ function computeWarnings(modelName, quant) {
176
  }
177
 
178
  function cacheKey(v) { return `${v.repo}/${v.filename}`; }
179
- function variantFitsDevice(v) { return variantFits(v.sizeMB, state.budget.budgetMB, OVERHEAD); }
 
 
 
 
 
 
 
 
180
  function isCached(v) {
181
  const entry = state.cacheStatus[cacheKey(v)];
182
  return !!entry && entry.cachedBytes > 0;
@@ -272,9 +279,14 @@ function renderHeader() {
272
  const memStr = b.memGB !== null ? `${b.memGB} GB` : 'β€”';
273
  $('device-memory').textContent = memStr;
274
 
 
 
 
 
275
  const budgetGB = (b.budgetMB / 1024).toFixed(1);
 
276
  $('device-budget').textContent = `${budgetGB} GB`;
277
- $('device-budget-source').textContent = `source: ${b.source}`;
278
 
279
  const webgpuCell = $('device-webgpu');
280
  if (webgpuCell) {
@@ -1047,14 +1059,28 @@ async function onRunClick() {
1047
  state.sessionDownloads = new Set();
1048
  updateButtons();
1049
 
 
 
 
 
 
 
 
1050
  const machine = await machineInfo();
1051
  const browser = browserInfo();
1052
  const evictAfter = !!$('evict-after-run')?.checked;
1053
 
1054
  // One-ahead prefetch: while variant i runs, we may have variant i+1
1055
  // downloading. Only one prefetch in flight at a time.
 
 
 
 
 
 
1056
  const prefetchFor = async (v) => {
1057
  if (!v || isCached(v)) return;
 
1058
  const row = progressRowFor(v);
1059
  row.setStatus('prefetching', '');
1060
  try {
@@ -1090,7 +1116,10 @@ async function onRunClick() {
1090
  // Wait for variant i to be cached (either via prefetch or pre-existing).
1091
  await prefetchPromise;
1092
  if (state.aborted) break;
1093
- if (!isCached(v)) {
 
 
 
1094
  row.setStatus('error', 'not cached after prefetch');
1095
  prefetchPromise = prefetchFor(variants[i + 1]);
1096
  continue;
@@ -1164,7 +1193,7 @@ async function onRunClick() {
1164
  function runInWorker({
1165
  params,
1166
  stream,
1167
- fileHandle,
1168
  onStatus,
1169
  onProgress,
1170
  onLog,
@@ -1206,14 +1235,16 @@ function runInWorker({
1206
  };
1207
 
1208
  // Three transport modes β€” see bench-worker.js runOne() for matching shape.
1209
- if (fileHandle) {
1210
- // OPFS path: FileSystemFileHandle is structured-cloneable, not
1211
- // transferable. The worker creates its own sync access handle on the
1212
- // cloned reference (still bound to the same underlying OPFS file).
 
 
1213
  try {
1214
- worker.postMessage({ type: 'run', params, fileHandle });
1215
  } catch (err) {
1216
- finish({ status: 'error', error: `postMessage(fileHandle) failed: ${err.message}` });
1217
  }
1218
  return;
1219
  }
@@ -1333,21 +1364,34 @@ async function runBenchmarkInWorker(v, params, callbacks) {
1333
  };
1334
 
1335
  if (useOpfs) {
1336
- let fileHandle, contentLength;
1337
  try {
1338
  callbacks.onStatus?.('downloading', 'Downloading model to OPFS...');
1339
  const r = await state.source.opfsHandleForModel(
1340
  v.repo, v.filename,
1341
  callbacks.onProgress,
1342
  );
1343
- fileHandle = r.handle;
1344
  contentLength = r.size;
 
 
 
 
 
 
 
 
 
1345
  } catch (err) {
1346
  return { status: 'error', error: `opfsHandleForModel failed: ${err.message}` };
1347
  }
 
 
 
 
 
1348
  return runInWorker({
1349
  params: { ...baseParams, contentLength },
1350
- fileHandle,
1351
  onStatus: callbacks.onStatus,
1352
  onProgress: callbacks.onProgress,
1353
  onLog: callbacks.onLog,
 
3
  // classes. Detects `surface` (localhost / space / pages) to gate the
4
  // server save checkbox and the HF hub sign-in/submit row.
5
 
6
+ import { localSource, hostedSource, inventoryOpfs, purgeOpfs, OPFS_ROOT_NAME } from './source.js';
7
+ import { getDeviceBudgetMB, variantFits, describeDevice, isMobileDevice } from './device.js';
8
  import {
9
  resumeHFSession, beginHFSignIn, signOutHF, submitResultsToDataset,
10
  HF_OAUTH_PENDING_KEY,
 
14
  const RUN_INTENT_STORAGE_KEY = 'webgpu-bench:runIntent';
15
  const CRASH_STALE_MS = 10_000;
16
 
 
17
  const DEFAULT_PROMPT =
18
  'Explain quantum computing to a software engineer in four concise paragraphs. ' +
19
  'Cover superposition, entanglement, quantum gates, and one practical use case.';
 
175
  }
176
 
177
  function cacheKey(v) { return `${v.repo}/${v.filename}`; }
178
+ function variantFitsDevice(v) {
179
+ // New variantFits signature: pass both budgets so the predicate can
180
+ // check (a) model fits in GPU memory + small overhead, and (b) WASM
181
+ // heap can hold the working set. See device.js for the rationale.
182
+ return variantFits(v.sizeMB, {
183
+ gpuBudgetMB: state.budget.gpuBudgetMB,
184
+ heapBudgetMB: state.budget.heapBudgetMB,
185
+ });
186
+ }
187
  function isCached(v) {
188
  const entry = state.cacheStatus[cacheKey(v)];
189
  return !!entry && entry.cachedBytes > 0;
 
279
  const memStr = b.memGB !== null ? `${b.memGB} GB` : 'β€”';
280
  $('device-memory').textContent = memStr;
281
 
282
+ // budgetMB is now the GPU-memory budget (per device.js _computeBudget),
283
+ // since with OPFS streaming the model lives in WebGPU buffers, not the
284
+ // WASM heap. We surface the heap budget separately in the source line so
285
+ // a curious reader can see both probes' results.
286
  const budgetGB = (b.budgetMB / 1024).toFixed(1);
287
+ const heapGB = (b.heapBudgetMB / 1024).toFixed(1);
288
  $('device-budget').textContent = `${budgetGB} GB`;
289
+ $('device-budget-source').textContent = `GPU memory · WASM heap: ${heapGB} GB`;
290
 
291
  const webgpuCell = $('device-webgpu');
292
  if (webgpuCell) {
 
1059
  state.sessionDownloads = new Set();
1060
  updateButtons();
1061
 
1062
+ if (isMobileDevice()) {
1063
+ logLine(
1064
+ 'Mobile device β€” running with sequential downloads (no parallel prefetch). ' +
1065
+ 'Each variant downloads, runs, evicts, then the next begins.',
1066
+ );
1067
+ }
1068
+
1069
  const machine = await machineInfo();
1070
  const browser = browserInfo();
1071
  const evictAfter = !!$('evict-after-run')?.checked;
1072
 
1073
  // One-ahead prefetch: while variant i runs, we may have variant i+1
1074
  // downloading. Only one prefetch in flight at a time.
1075
+ // On mobile, the overlap is a measurement hazard β€” concurrent download
1076
+ // contends with inference for SoC power, memory bandwidth, and OPFS
1077
+ // write queues. Skip the prefetch entirely; runBenchmarkInWorker's
1078
+ // opfsHandleForModel does the download inline (with the same progress
1079
+ // events the prefetch row would have shown).
1080
+ const skipPrefetch = isMobileDevice();
1081
  const prefetchFor = async (v) => {
1082
  if (!v || isCached(v)) return;
1083
+ if (skipPrefetch) return;
1084
  const row = progressRowFor(v);
1085
  row.setStatus('prefetching', '');
1086
  try {
 
1116
  // Wait for variant i to be cached (either via prefetch or pre-existing).
1117
  await prefetchPromise;
1118
  if (state.aborted) break;
1119
+ // When skipPrefetch is on (mobile), variants arrive uncached and
1120
+ // runBenchmarkInWorker β†’ opfsHandleForModel handles the inline
1121
+ // download. Skip the cache-check error path in that case.
1122
+ if (!skipPrefetch && !isCached(v)) {
1123
  row.setStatus('error', 'not cached after prefetch');
1124
  prefetchPromise = prefetchFor(variants[i + 1]);
1125
  continue;
 
1193
  function runInWorker({
1194
  params,
1195
  stream,
1196
+ opfsPath,
1197
  onStatus,
1198
  onProgress,
1199
  onLog,
 
1235
  };
1236
 
1237
  // Three transport modes β€” see bench-worker.js runOne() for matching shape.
1238
+ if (opfsPath) {
1239
+ // OPFS path: send the layout key only (rootDir + repo + filename).
1240
+ // The worker re-resolves to a FileSystemFileHandle via
1241
+ // navigator.storage.getDirectory() itself. Plain JSON-serializable β€”
1242
+ // works on iOS Safari, where FileSystemFileHandle structured-clone
1243
+ // is not implemented.
1244
  try {
1245
+ worker.postMessage({ type: 'run', params, opfsPath });
1246
  } catch (err) {
1247
+ finish({ status: 'error', error: `postMessage(opfsPath) failed: ${err.message}` });
1248
  }
1249
  return;
1250
  }
 
1364
  };
1365
 
1366
  if (useOpfs) {
1367
+ let contentLength;
1368
  try {
1369
  callbacks.onStatus?.('downloading', 'Downloading model to OPFS...');
1370
  const r = await state.source.opfsHandleForModel(
1371
  v.repo, v.filename,
1372
  callbacks.onProgress,
1373
  );
 
1374
  contentLength = r.size;
1375
+ // When the prefetch is skipped (mobile path), the inline download
1376
+ // above is the variant's first arrival in OPFS. Mark it as
1377
+ // session-downloaded so the post-run eviction logic frees it before
1378
+ // the next variant starts β€” keeping disk usage flat.
1379
+ if (r.wasDownloaded) {
1380
+ state.sessionDownloads.add(cacheKey(v));
1381
+ state.cacheStatus[cacheKey(v)] = { cachedBytes: r.size };
1382
+ refreshCacheBadge(v);
1383
+ }
1384
  } catch (err) {
1385
  return { status: 'error', error: `opfsHandleForModel failed: ${err.message}` };
1386
  }
1387
+ // Pass the OPFS path components, not the FileHandle. iOS Safari
1388
+ // (and some older Chromium/Firefox versions) can't structured-clone
1389
+ // FileSystemFileHandle across postMessage. The worker re-resolves the
1390
+ // handle via navigator.storage.getDirectory() itself, which works
1391
+ // everywhere OPFS is supported.
1392
  return runInWorker({
1393
  params: { ...baseParams, contentLength },
1394
+ opfsPath: { rootDir: OPFS_ROOT_NAME, repo: v.repo, filename: v.filename },
1395
  onStatus: callbacks.onStatus,
1396
  onProgress: callbacks.onProgress,
1397
  onLog: callbacks.onLog,
js/run/device.js CHANGED
@@ -1,29 +1,66 @@
1
  // Device-fit helpers for the interactive bench page.
2
  //
3
- // getDeviceBudgetMB() empirically probes the WASM heap's actual maximum
4
- // growth on this device, mirroring how llama.cpp itself allocates (a single
5
- // WebAssembly.Memory grown in pages). The probe runs in a worker so an
6
- // allocation failure dies harmlessly. We fall back to deviceMemory /
7
- // storage.estimate heuristics if the probe can't run.
8
  //
9
- // The probed value is the budget β€” no extra safety factor. variantFits()
10
- // already multiplies the GGUF size by 1.5× to cover the WASM heap +
11
- // activations + KV cache + WebGPU staging beyond the file size.
 
 
 
 
 
 
 
 
 
 
 
 
12
  //
13
  // On wasm32 the linear memory caps at 4 GiB no matter how much physical
14
- // RAM the device has, so probe results above 4096 MB cannot exist.
15
 
16
  const DEFAULT_BUDGET_MB = 2 * 1024;
17
  const HOSTED_QUOTA_FRACTION = 0.4;
18
  const HOSTED_QUOTA_CAP_MB = 8 * 1024;
19
 
20
- // Hard ceiling on mobile regardless of probe result. iOS/Android can reap
21
- // the tab under system memory pressure without raising a JS error the
22
- // probe could observe, so an "ok at 4 GiB" result is not safe to trust on
23
- // a phone β€” the OS gets the last word.
24
- const MOBILE_BUDGET_CEILING_MB = 600;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  const PROBE_TIMEOUT_MS = 15_000;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  export function isMobileDevice() {
29
  if (typeof navigator === 'undefined') return false;
@@ -32,6 +69,8 @@ export function isMobileDevice() {
32
  return /iPhone|iPad|iPod|Android.*Mobile/.test(ua);
33
  }
34
 
 
 
35
  // Spawn the probe worker, wait for a result, clean up. Returns
36
  // { probedMB } on success, or { probedMB: 0, error } on any failure mode
37
  // (timeout, worker construct error, worker onerror β€” typically the probe
@@ -67,9 +106,107 @@ export function probeHeapBudgetMB({ stepPages, maxPages, timeoutMs = PROBE_TIMEO
67
  });
68
  }
69
 
70
- // Cache the budget for the lifetime of the page load. controller.js calls
71
- // both getDeviceBudgetMB() and describeDevice() at mount, and we don't want
72
- // to run the 1–2 s probe twice.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  let _budgetPromise = null;
74
 
75
  export async function getDeviceBudgetMB() {
@@ -90,49 +227,96 @@ async function _computeBudget() {
90
 
91
  const isMobile = isMobileDevice();
92
 
93
- // Primary: empirical probe. The worker grows a WebAssembly.Memory page
94
- // by page and reports how far it got β€” that's literally the WASM heap
95
- // ceiling on this device, capped at wasm32's 4 GiB. We trust it directly;
96
- // variantFits() applies the per-variant 1.5Γ— overhead.
97
- const probe = await probeHeapBudgetMB();
98
- const probedMB = probe.probedMB;
99
-
100
- let budgetMB;
101
- let source;
102
- if (probedMB > 0) {
103
- budgetMB = probedMB;
104
- source = `probe (WASM heap, ${probedMB} MB committed)`;
105
  } else if (memGB !== null) {
106
- budgetMB = memGB * 1024 * 0.6;
107
- source = 'navigator.deviceMemory (probe failed)';
108
  } else if (quotaMB !== null) {
109
- budgetMB = Math.min(quotaMB * HOSTED_QUOTA_FRACTION, HOSTED_QUOTA_CAP_MB);
110
- source = 'navigator.storage.estimate().quota (probe failed)';
111
  } else {
112
- budgetMB = DEFAULT_BUDGET_MB;
113
- source = 'default (probe failed)';
 
 
 
 
114
  }
115
 
116
- if (isMobile && budgetMB > MOBILE_BUDGET_CEILING_MB) {
117
- budgetMB = MOBILE_BUDGET_CEILING_MB;
118
- source += ' β†’ mobile-capped';
 
 
 
 
 
119
  }
120
 
121
  return {
122
- budgetMB,
 
 
 
 
 
123
  memGB,
124
  quotaMB,
125
- probedMB,
126
- probeError: probe.error || null,
 
 
127
  isMobile,
128
- source,
 
 
 
129
  };
130
  }
131
 
132
- export function variantFits(sizeMB, budgetMB, overhead = 1.5) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  if (typeof sizeMB !== 'number' || sizeMB <= 0) return false;
134
- if (typeof budgetMB !== 'number' || budgetMB <= 0) return false;
135
- return sizeMB * overhead <= budgetMB;
 
 
 
 
 
 
 
 
 
 
 
 
136
  }
137
 
138
  export async function describeDevice() {
 
1
  // Device-fit helpers for the interactive bench page.
2
  //
3
+ // Two budget probes drive the per-variant fit decision:
 
 
 
 
4
  //
5
+ // getDeviceBudgetMB() β€” empirical WASM heap probe. Grows a
6
+ // WebAssembly.Memory page-by-page in a worker until it fails. Caps
7
+ // the working set (KV cache + compute scratch + JS heap headroom)
8
+ // llama.cpp consumes during inference.
9
+ //
10
+ // probeGpuBudgetMB() β€” empirical WebGPU memory probe. Allocates real
11
+ // buffers with mappedAtCreation=true on the actual adapter until OOM.
12
+ // Caps the size of model weights llama.cpp can hold in GPU buffers,
13
+ // since OPFS-streaming keeps model bytes off the WASM heap.
14
+ //
15
+ // variantFits() then checks both: model size + GPU overhead ≤ GPU budget,
16
+ // AND heap working-set floor ≀ heap budget. wllama doesn't probe at all
17
+ // β€” they let load attempts fail naturally β€” but our auto-select buttons
18
+ // ("All fit", "Run study") need a fit predicate, so we err on the side
19
+ // of measuring rather than guessing.
20
  //
21
  // On wasm32 the linear memory caps at 4 GiB no matter how much physical
22
+ // RAM the device has, so heap probe results above 4096 MB cannot exist.
23
 
24
  const DEFAULT_BUDGET_MB = 2 * 1024;
25
  const HOSTED_QUOTA_FRACTION = 0.4;
26
  const HOSTED_QUOTA_CAP_MB = 8 * 1024;
27
 
28
+ // Hard ceiling on mobile WASM heap regardless of probe result. iOS/Android
29
+ // can reap the tab under system memory pressure without raising a JS error
30
+ // the probe could observe, so an "ok at 4 GiB" result is not safe to trust
31
+ // on a phone.
32
+ //
33
+ // Empirically iOS Safari tabs get reaped well below the WebAssembly.Memory
34
+ // engine cap (~1 GiB on iPhone), and Android Chrome on mid-range devices
35
+ // behaves similarly. Below 500 MB heap usage tends to be safe across
36
+ // modern phones; above that we start seeing tab kills mid-run. The OPFS-
37
+ // streaming model load means model bytes no longer live on the WASM heap,
38
+ // so this budget caps the per-step working set, not the model file.
39
+ const MOBILE_HEAP_CEILING_MB = 500;
40
+
41
+ // Hard ceiling on mobile GPU memory probe result. Even when the probe
42
+ // succeeds at higher numbers, the OS may evict the GPU process or the tab
43
+ // before we can actually use it. iPhone WebGPU (Metal-3 under the hood)
44
+ // typically gives a tab 1.5–3 GB usable depending on device class; cap at
45
+ // 3 GB as a conservative ceiling that won't reject anything reasonable.
46
+ const MOBILE_GPU_CEILING_MB = 3 * 1024;
47
 
48
  const PROBE_TIMEOUT_MS = 15_000;
49
+ const GPU_PROBE_STEP_MB = 256;
50
+ const GPU_PROBE_MAX_MB = 8 * 1024;
51
+ const GPU_PROBE_TIMEOUT_MS = 8_000;
52
+
53
+ // Working-set floor in the WASM heap. KV cache + compute buffers + JS heap
54
+ // headroom for a typical 1B model at n_ctx=2048 add up to ~400 MB; we
55
+ // require 500 to leave a margin. Bigger contexts scale this up β€” not
56
+ // modeled yet (worth revisiting if we benchmark at n_ctx >> 2048).
57
+ const HEAP_WORKING_SET_FLOOR_MB = 500;
58
+
59
+ // Per-variant overhead added on top of the model file size when checking
60
+ // GPU fit. Covers compute buffers, alignment padding, and the KV cache
61
+ // mirror that the WebGPU backend keeps. A flat 200 MB is a conservative
62
+ // approximation; in practice it scales somewhat with model + context size.
63
+ const GPU_VARIANT_OVERHEAD_MB = 200;
64
 
65
  export function isMobileDevice() {
66
  if (typeof navigator === 'undefined') return false;
 
69
  return /iPhone|iPad|iPod|Android.*Mobile/.test(ua);
70
  }
71
 
72
+ // ──────────────── WASM heap probe ────────────────
73
+
74
  // Spawn the probe worker, wait for a result, clean up. Returns
75
  // { probedMB } on success, or { probedMB: 0, error } on any failure mode
76
  // (timeout, worker construct error, worker onerror β€” typically the probe
 
106
  });
107
  }
108
 
109
+ // ──────────────── GPU memory probe ────────────────
110
+
111
+ // Allocate WebGPU buffers in stepMB increments until OOM, return the
112
+ // total committed bytes as the GPU memory budget. Uses
113
+ // mappedAtCreation=true to force real memory commit (some drivers lazy-
114
+ // allocate until first use otherwise) and captures OOM via the
115
+ // 'out-of-memory' error scope, with device.lost as a backstop.
116
+ //
117
+ // Caveats:
118
+ // - The GPU process is shared with other tabs. If they're holding GPU
119
+ // memory the probe undercounts. (Same as wllama's heap probe β€” best
120
+ // we can do without a richer browser API.)
121
+ // - Some drivers (notably iOS Metal under WebKit) lazy-fail at dispatch
122
+ // time rather than at createBuffer; this probe's number is therefore
123
+ // an upper bound, not a guarantee. Mobile cap below mitigates.
124
+ export async function probeGpuBudgetMB({
125
+ stepMB = GPU_PROBE_STEP_MB,
126
+ maxMB = GPU_PROBE_MAX_MB,
127
+ timeoutMs = GPU_PROBE_TIMEOUT_MS,
128
+ } = {}) {
129
+ if (!navigator.gpu) {
130
+ return { probedMB: 0, error: 'WebGPU not available' };
131
+ }
132
+
133
+ let adapter, device;
134
+ try {
135
+ adapter = await navigator.gpu.requestAdapter();
136
+ if (!adapter) return { probedMB: 0, error: 'no WebGPU adapter' };
137
+ // Request the maximum the adapter can give us; defaults are often
138
+ // smaller than what the hardware supports.
139
+ const requiredLimits = {};
140
+ const cap = (k) => {
141
+ const v = adapter.limits?.[k];
142
+ if (typeof v === 'number') requiredLimits[k] = v;
143
+ };
144
+ cap('maxBufferSize');
145
+ cap('maxStorageBufferBindingSize');
146
+ device = await adapter.requestDevice({ requiredLimits });
147
+ } catch (err) {
148
+ return { probedMB: 0, error: `adapter/device init failed: ${err.message}` };
149
+ }
150
+
151
+ let deviceLost = false;
152
+ device.lost.then(() => { deviceLost = true; }).catch(() => {});
153
+
154
+ const buffers = [];
155
+ const stepBytes = stepMB * 1024 * 1024;
156
+ let totalBytes = 0;
157
+ const start = performance.now();
158
+
159
+ try {
160
+ while (totalBytes + stepBytes <= maxMB * 1024 * 1024) {
161
+ if (deviceLost) break;
162
+ if (performance.now() - start > timeoutMs) break;
163
+
164
+ device.pushErrorScope('out-of-memory');
165
+ let buffer;
166
+ try {
167
+ buffer = device.createBuffer({
168
+ size: stepBytes,
169
+ usage: GPUBufferUsage.STORAGE,
170
+ mappedAtCreation: true,
171
+ });
172
+ // Touch the start of the mapped range to force a real commit.
173
+ // Drivers can lazy-back the allocation until first write, which
174
+ // would fool the probe into thinking it has more headroom than it
175
+ // really does.
176
+ const touchBytes = Math.min(stepBytes, 64 * 1024);
177
+ new Uint8Array(buffer.getMappedRange(0, touchBytes))[0] = 1;
178
+ buffer.unmap();
179
+ } catch (err) {
180
+ await device.popErrorScope().catch(() => null);
181
+ break;
182
+ }
183
+
184
+ const error = await device.popErrorScope().catch(() => null);
185
+ if (error) {
186
+ try { buffer.destroy(); } catch { /* noop */ }
187
+ break;
188
+ }
189
+
190
+ buffers.push(buffer);
191
+ totalBytes += stepBytes;
192
+
193
+ // Yield so we don't starve the main thread / GC.
194
+ await new Promise((r) => setTimeout(r, 0));
195
+ }
196
+ } finally {
197
+ for (const b of buffers) {
198
+ try { b.destroy(); } catch { /* noop */ }
199
+ }
200
+ try { device.destroy(); } catch { /* noop */ }
201
+ }
202
+
203
+ return { probedMB: Math.floor(totalBytes / (1024 * 1024)) };
204
+ }
205
+
206
+ // ──────────────── public budget API ────────────────
207
+
208
+ // Cache the full budget for the lifetime of the page load. Both probes
209
+ // take 1–8 s; we don't want to pay that twice for the same surface.
210
  let _budgetPromise = null;
211
 
212
  export async function getDeviceBudgetMB() {
 
227
 
228
  const isMobile = isMobileDevice();
229
 
230
+ // Run both probes in parallel.
231
+ const [heapProbe, gpuProbe] = await Promise.all([
232
+ probeHeapBudgetMB(),
233
+ probeGpuBudgetMB(),
234
+ ]);
235
+
236
+ // ── Heap budget ──
237
+ let heapBudgetMB;
238
+ let heapSource;
239
+ if (heapProbe.probedMB > 0) {
240
+ heapBudgetMB = heapProbe.probedMB;
241
+ heapSource = `probe (WASM heap, ${heapProbe.probedMB} MB committed)`;
242
  } else if (memGB !== null) {
243
+ heapBudgetMB = memGB * 1024 * 0.6;
244
+ heapSource = 'navigator.deviceMemory (heap probe failed)';
245
  } else if (quotaMB !== null) {
246
+ heapBudgetMB = Math.min(quotaMB * HOSTED_QUOTA_FRACTION, HOSTED_QUOTA_CAP_MB);
247
+ heapSource = 'navigator.storage.estimate().quota (heap probe failed)';
248
  } else {
249
+ heapBudgetMB = DEFAULT_BUDGET_MB;
250
+ heapSource = 'default (heap probe failed)';
251
+ }
252
+ if (isMobile && heapBudgetMB > MOBILE_HEAP_CEILING_MB) {
253
+ heapBudgetMB = MOBILE_HEAP_CEILING_MB;
254
+ heapSource += ' β†’ mobile-capped';
255
  }
256
 
257
+ // ── GPU budget ──
258
+ let gpuBudgetMB = gpuProbe.probedMB;
259
+ let gpuSource = gpuProbe.probedMB > 0
260
+ ? `probe (WebGPU buffers, ${gpuProbe.probedMB} MB allocated)`
261
+ : `probe failed: ${gpuProbe.error || 'unknown'}`;
262
+ if (isMobile && gpuBudgetMB > MOBILE_GPU_CEILING_MB) {
263
+ gpuBudgetMB = MOBILE_GPU_CEILING_MB;
264
+ gpuSource += ' β†’ mobile-capped';
265
  }
266
 
267
  return {
268
+ // Combined headline budget β€” what the UI shows as "Max model size".
269
+ // GPU memory is now the constraint that varies per device; heap
270
+ // budget is a separate floor check.
271
+ budgetMB: gpuBudgetMB,
272
+ gpuBudgetMB,
273
+ heapBudgetMB,
274
  memGB,
275
  quotaMB,
276
+ probedMB: heapProbe.probedMB,
277
+ gpuProbedMB: gpuProbe.probedMB,
278
+ probeError: heapProbe.error || null,
279
+ gpuProbeError: gpuProbe.error || null,
280
  isMobile,
281
+ // Two-line source string so the UI stays compact while still
282
+ // surfacing both probes in the device card tooltip.
283
+ source: gpuSource,
284
+ heapSource,
285
  };
286
  }
287
 
288
+ // variantFits decides whether a model file of `sizeMB` bytes can be
289
+ // loaded and run on this device. Two checks must pass:
290
+ //
291
+ // 1. sizeMB + GPU_VARIANT_OVERHEAD_MB ≤ gpuBudgetMB
292
+ // Model weights live in WebGPU buffers (since OPFS streaming
293
+ // keeps them off the WASM heap). The overhead covers compute
294
+ // scratch + alignment + KV cache mirror.
295
+ //
296
+ // 2. heapBudgetMB ≥ HEAP_WORKING_SET_FLOOR_MB
297
+ // The WASM heap still has to fit the working set: KV cache,
298
+ // ggml compute buffers, and JS heap headroom. Roughly constant
299
+ // per inference regardless of model size at fixed n_ctx.
300
+ //
301
+ // Backwards-compat: if the second arg is a plain number, treat it as
302
+ // the legacy heap-only budget and apply the prior 1.5× sizeMB overhead.
303
+ // New callers should pass { gpuBudgetMB, heapBudgetMB }.
304
+ export function variantFits(sizeMB, budget) {
305
  if (typeof sizeMB !== 'number' || sizeMB <= 0) return false;
306
+
307
+ if (typeof budget === 'number') {
308
+ return budget > 0 && sizeMB * 1.5 <= budget;
309
+ }
310
+ if (!budget || typeof budget !== 'object') return false;
311
+
312
+ const { gpuBudgetMB, heapBudgetMB } = budget;
313
+ if (typeof gpuBudgetMB !== 'number' || sizeMB + GPU_VARIANT_OVERHEAD_MB > gpuBudgetMB) {
314
+ return false;
315
+ }
316
+ if (typeof heapBudgetMB !== 'number' || heapBudgetMB < HEAP_WORKING_SET_FLOOR_MB) {
317
+ return false;
318
+ }
319
+ return true;
320
  }
321
 
322
  export async function describeDevice() {
js/run/source.js CHANGED
@@ -56,7 +56,12 @@ export function localSource() {
56
 
57
  // ──────────────── hosted / OPFS ────────────────
58
 
59
- const OPFS_ROOT_NAME = 'models';
 
 
 
 
 
60
 
61
  async function getOpfsRoot() {
62
  if (!navigator.storage?.getDirectory) {
@@ -100,14 +105,16 @@ export function hostedSource() {
100
  // path: the worker opens a sync access handle on this FileHandle and
101
  // routes MEMFS reads through it, never copying the model into the
102
  // WASM heap. onProgress is called during the download leg with
103
- // (fraction, downloaded, total).
 
 
104
  async opfsHandleForModel(repo, file, onProgress) {
105
  const cached = await getOpfsFileHandle(repo, file, { create: false }).catch(() => null);
106
  if (cached) {
107
  const f = await cached.getFile();
108
  if (f.size > 0) {
109
  onProgress?.(1, f.size, f.size);
110
- return { handle: cached, size: f.size };
111
  }
112
  }
113
 
@@ -136,7 +143,7 @@ export function hostedSource() {
136
  if (contentLength > 0) onProgress?.(downloaded / contentLength, downloaded, contentLength);
137
  }
138
  await writable.close();
139
- return { handle, size: downloaded };
140
  } catch (err) {
141
  try { await writable.abort(err); } catch { /* ignore */ }
142
  throw err;
 
56
 
57
  // ──────────────── hosted / OPFS ────────────────
58
 
59
+ // Exported so bench-worker.js can re-resolve the OPFS file handle inside
60
+ // the worker. We can't transfer FileSystemFileHandle directly across
61
+ // postMessage on every browser (iOS Safari structured-clone is missing
62
+ // the implementation), so instead we send the layout key (rootDir +
63
+ // repo segments + filename) and let the worker open it itself.
64
+ export const OPFS_ROOT_NAME = 'models';
65
 
66
  async function getOpfsRoot() {
67
  if (!navigator.storage?.getDirectory) {
 
105
  // path: the worker opens a sync access handle on this FileHandle and
106
  // routes MEMFS reads through it, never copying the model into the
107
  // WASM heap. onProgress is called during the download leg with
108
+ // (fraction, downloaded, total). The returned `wasDownloaded` flag
109
+ // distinguishes a fresh download from a cache hit so the caller can
110
+ // decide whether to mark the variant for post-run eviction.
111
  async opfsHandleForModel(repo, file, onProgress) {
112
  const cached = await getOpfsFileHandle(repo, file, { create: false }).catch(() => null);
113
  if (cached) {
114
  const f = await cached.getFile();
115
  if (f.size > 0) {
116
  onProgress?.(1, f.size, f.size);
117
+ return { handle: cached, size: f.size, wasDownloaded: false };
118
  }
119
  }
120
 
 
143
  if (contentLength > 0) onProgress?.(downloaded / contentLength, downloaded, contentLength);
144
  }
145
  await writable.close();
146
+ return { handle, size: downloaded, wasDownloaded: true };
147
  } catch (err) {
148
  try { await writable.abort(err); } catch { /* ignore */ }
149
  throw err;