GitHub Actions committed on
Commit
299e359
·
1 Parent(s): 0fc83ad

sync from abhijitramesh/webgpu-bench@dab7e7757e

Browse files
build/asyncify/bench.wasm CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:424ebf14301cc29905a08d5a225bdc98b58dbbd6a949b22adcc8717bcda151c8
3
- size 5233169
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50895b262f9b0da117509d04075ca06f3b30d3482c130d22c827e53e20d8a650
3
+ size 5233188
build/asyncify/build-info.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
3
- "llamaCppDescribe": "b8979-5-gf22c8021d",
4
  "dawnTag": "v20260317.182325",
5
- "builtAt": "2026-04-29T22:38:32Z"
6
  }
 
1
  {
2
  "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
3
+ "llamaCppDescribe": "b8981-3-gf22c8021d",
4
  "dawnTag": "v20260317.182325",
5
+ "builtAt": "2026-04-29T23:41:53Z"
6
  }
build/jspi/bench.wasm CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b549092fe3ac53a3cca8efdd6e9a3e4ebf64443b7d76a5946d59dd4052378fd3
3
- size 3612110
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92ef71c59da832ad869cbc002665fd3bb3505c7e515a7cefc5d7f7901224ea40
3
+ size 3612135
build/jspi/build-info.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
3
- "llamaCppDescribe": "b8979-5-gf22c8021d",
4
  "dawnTag": "v20260317.182325",
5
- "builtAt": "2026-04-29T22:34:33Z"
6
  }
 
1
  {
2
  "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
3
+ "llamaCppDescribe": "b8981-3-gf22c8021d",
4
  "dawnTag": "v20260317.182325",
5
+ "builtAt": "2026-04-29T23:37:54Z"
6
  }
js/run/bench-worker.js CHANGED
@@ -37,6 +37,95 @@ const post = (msg) => self.postMessage(msg);
37
  const log = (line) => post({ type: 'log', line });
38
  const status = (s, msg) => post({ type: 'status', status: s, msg });
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  // Aggregate raw nanosecond samples into the llama-bench result shape.
41
  // Mirrors core.js buildTest — keep them identical.
42
  function buildTest(name, n_prompt, n_gen, samples_ns) {
@@ -100,7 +189,7 @@ self.onmessage = async (e) => {
100
  }
101
  };
102
 
103
- async function runOne({ params, stream, buffer }) {
104
  const {
105
  buildType,
106
  contentLength,
@@ -116,8 +205,14 @@ async function runOne({ params, stream, buffer }) {
116
  nReps,
117
  noWarmup,
118
  } = params;
119
- if (!stream && !buffer) {
120
- throw new Error('runOne: exactly one of `stream` or `buffer` must be provided');
 
 
 
 
 
 
121
  }
122
 
123
  const result = {
@@ -178,55 +273,75 @@ async function runOne({ params, stream, buffer }) {
178
  });
179
  log('WASM module loaded');
180
 
181
- // ─── Stream the model into the WASM heap (HeapFS-style) ───
182
- if (!(contentLength > 0)) {
183
- throw new Error('content-length is required for streaming into WASM heap');
184
- }
185
- status('downloading', 'Streaming model into WASM heap...');
 
 
 
 
 
186
 
187
- let modelPtr = Module._malloc(contentLength);
188
- if (!modelPtr) {
189
- throw new Error(
190
- `_malloc(${(contentLength / (1024 * 1024)).toFixed(0)} MB) failed — wasm heap exhausted`
191
- );
192
- }
 
 
 
 
 
 
 
193
 
194
- try {
195
- let downloaded = 0;
196
- if (stream) {
197
- const reader = stream.getReader();
198
- while (true) {
199
- const { done, value } = await reader.read();
200
- if (done) break;
201
- Module.HEAPU8.set(value, modelPtr + downloaded);
202
- downloaded += value.length;
203
- post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
204
- }
205
- } else {
206
- const view = new Uint8Array(buffer);
207
- if (view.byteLength !== contentLength) {
208
- log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
209
- }
210
- Module.HEAPU8.set(view, modelPtr);
211
- downloaded = view.byteLength;
212
- post({ type: 'progress', fraction: 1, downloaded, total: contentLength });
213
  }
214
- log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
215
 
216
- const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
217
- Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
- const node = Module.FS.lookupPath('/model.gguf').node;
220
- Object.defineProperty(node, 'contents', {
221
- get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
222
- set: () => { /* read-only file */ },
223
- configurable: true,
224
- });
225
- node.usedBytes = contentLength;
226
- } catch (err) {
227
- Module._free(modelPtr);
228
- modelPtr = 0;
229
- throw err;
 
 
 
 
230
  }
231
 
232
  // ─── Init backend ───
@@ -236,21 +351,30 @@ async function runOne({ params, stream, buffer }) {
236
  log('Backends initialized');
237
 
238
  // ─── Load model ───
239
- status('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers})...`);
 
 
 
 
240
  const loadResult = await Module.ccall(
241
  'bench_load',
242
  'number',
243
- ['string', 'number', 'number'],
244
- ['/model.gguf', nCtx, nGpuLayers],
245
  { async: true },
246
  );
247
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
248
  log('Model loaded');
249
 
250
- try {
251
- Module.FS.unlink('/model.gguf');
252
- } catch (err) {
253
- log(`Warning: could not remove model FS node: ${err.message}`);
 
 
 
 
 
254
  }
255
 
256
  // ─── Consistency phase ───
@@ -362,7 +486,11 @@ async function runOne({ params, stream, buffer }) {
362
 
363
  await Module.ccall('bench_exit', null, [], [], { async: true });
364
 
365
- if (modelPtr) {
 
 
 
 
366
  Module._free(modelPtr);
367
  modelPtr = 0;
368
  }
 
37
  const log = (line) => post({ type: 'log', line });
38
  const status = (s, msg) => post({ type: 'status', status: s, msg });
39
 
40
+ // ─── OPFS-backed model loading (wllama-style) ───
41
+ // For >2GB GGUFs we can't put the whole file on the WASM heap (TypedArray
42
+ // length limits, and it eats the heap budget that KV cache + working memory
43
+ // need). Instead, we open a FileSystemSyncAccessHandle on the OPFS file in
44
+ // this worker, register a zero-byte stub in MEMFS, and patch MEMFS's
45
+ // stream_ops so reads delegate to syncHandle.read(). llama.cpp then loads
46
+ // the model via fread (use_mmap=false), which calls the patched stream_ops
47
+ // — never copying the bytes through the WASM heap.
48
+ //
49
+ // Mirrors wllama's src/workers-code/llama-cpp.js (patchMEMFS / opfsAlloc /
50
+ // opfsFreeAll). Worker-only: sync access handles aren't available on the
51
+ // main thread.
52
+
53
+ const opfsHandles = {}; // map MEMFS-name → { syncHandle, size }
54
+
55
+ function patchMEMFS(Module) {
56
+ const m = Module;
57
+ // Idempotent — only install the patches once per Module.
58
+ if (m.MEMFS.stream_ops._read) return;
59
+ m.MEMFS.stream_ops._read = m.MEMFS.stream_ops.read;
60
+ m.MEMFS.stream_ops._llseek = m.MEMFS.stream_ops.llseek;
61
+ m.MEMFS.stream_ops._mmap = m.MEMFS.stream_ops.mmap;
62
+
63
+ m.MEMFS.stream_ops.read = function (stream, buffer, offset, length, position) {
64
+ const name = stream.node.name;
65
+ if (opfsHandles[name]) {
66
+ const { syncHandle, size } = opfsHandles[name];
67
+ const toRead = Math.min(length, size - position);
68
+ if (toRead <= 0) return 0;
69
+ const view = new Uint8Array(buffer.buffer, buffer.byteOffset + offset, toRead);
70
+ return syncHandle.read(view, { at: position });
71
+ }
72
+ return m.MEMFS.stream_ops._read(stream, buffer, offset, length, position);
73
+ };
74
+ m.MEMFS.ops_table.file.stream.read = m.MEMFS.stream_ops.read;
75
+
76
+ m.MEMFS.stream_ops.llseek = function (stream, offset, whence) {
77
+ const name = stream.node.name;
78
+ if (opfsHandles[name]) {
79
+ const { size } = opfsHandles[name];
80
+ let newPos = offset;
81
+ if (whence === 1) newPos += stream.position; // SEEK_CUR
82
+ if (whence === 2) newPos += size; // SEEK_END
83
+ if (newPos < 0) throw new Error('SEEK before start of file');
84
+ stream.position = newPos;
85
+ return newPos;
86
+ }
87
+ return m.MEMFS.stream_ops._llseek(stream, offset, whence);
88
+ };
89
+ m.MEMFS.ops_table.file.stream.llseek = m.MEMFS.stream_ops.llseek;
90
+
91
+ m.MEMFS.stream_ops.mmap = function (stream, length, position, prot, flags) {
92
+ const name = stream.node.name;
93
+ if (opfsHandles[name]) {
94
+ // OPFS-backed files must never be mmap'd — that would force MEMFS to
95
+ // copy the file into the WASM heap, defeating the OPFS path. The C++
96
+ // side passes use_mmap=0 to avoid this. If we ever land here, the
97
+ // caller forgot to disable mmap.
98
+ throw new Error(`[OPFS] mmap called on "${name}" — bench_load was not invoked with use_mmap=0`);
99
+ }
100
+ return m.MEMFS.stream_ops._mmap(stream, length, position, prot, flags);
101
+ };
102
+ m.MEMFS.ops_table.file.stream.mmap = m.MEMFS.stream_ops.mmap;
103
+ }
104
+
105
+ async function opfsAlloc(Module, name, fileHandle) {
106
+ // createSyncAccessHandle is worker-only and exclusive — only one writer
107
+ // per OPFS file at a time. Caller must ensure no createWritable session
108
+ // is open when we land here.
109
+ const syncHandle = await fileHandle.createSyncAccessHandle();
110
+ const size = syncHandle.getSize();
111
+ opfsHandles[name] = { syncHandle, size };
112
+ // Zero-byte placeholder so llama.cpp's fopen() finds the path.
113
+ Module.FS.createDataFile('/', name, new Uint8Array(0), true, false, true);
114
+ // Set usedBytes so fstat()/seek-end report the real file size — our
115
+ // patched llseek consults size, but other code (e.g. llama.cpp's GGUF
116
+ // reader sanity-checking the file length) goes through stat first.
117
+ Module.FS.lookupPath('/' + name).node.usedBytes = size;
118
+ return size;
119
+ }
120
+
121
+ function opfsFreeAll(Module) {
122
+ for (const [name, { syncHandle }] of Object.entries(opfsHandles)) {
123
+ try { syncHandle.close(); } catch { /* already closed */ }
124
+ try { Module.FS.unlink('/' + name); } catch { /* already gone */ }
125
+ delete opfsHandles[name];
126
+ }
127
+ }
128
+
129
  // Aggregate raw nanosecond samples into the llama-bench result shape.
130
  // Mirrors core.js buildTest — keep them identical.
131
  function buildTest(name, n_prompt, n_gen, samples_ns) {
 
189
  }
190
  };
191
 
192
+ async function runOne({ params, stream, buffer, fileHandle }) {
193
  const {
194
  buildType,
195
  contentLength,
 
205
  nReps,
206
  noWarmup,
207
  } = params;
208
+ // Three input modes are supported:
209
+ // fileHandle → wllama-style OPFS-streaming load (preferred for >2GB)
210
+ // stream → heap-stream mode (zero-copy WASM-heap, transferable)
211
+ // buffer → buffered fallback for browsers without transferable streams
212
+ // Exactly one must be provided.
213
+ const inputCount = (fileHandle ? 1 : 0) + (stream ? 1 : 0) + (buffer ? 1 : 0);
214
+ if (inputCount !== 1) {
215
+ throw new Error('runOne: exactly one of `fileHandle`, `stream`, or `buffer` must be provided');
216
  }
217
 
218
  const result = {
 
273
  });
274
  log('WASM module loaded');
275
 
276
+ // ─── Make the model visible to the WASM filesystem ───
277
+ // Two paths:
278
+ // useOpfsPath: leave the bytes on disk (OPFS) and route reads through
279
+ // a sync access handle via patched MEMFS stream_ops. No
280
+ // heap copy, supports >2GB.
281
+ // else: _malloc the full file on the WASM heap, write the stream
282
+ // in, register a heap-backed MEMFS file. Faster (mmap'd
283
+ // zero-copy at load time) but caps at ~2GB.
284
+ let modelPtr = 0; // tracks heap-path allocation for cleanup
285
+ const useOpfsPath = !!fileHandle;
286
 
287
+ if (useOpfsPath) {
288
+ status('opfs', 'Linking OPFS-backed model into MEMFS...');
289
+ patchMEMFS(Module);
290
+ const size = await opfsAlloc(Module, 'model.gguf', fileHandle);
291
+ log(`OPFS-backed model.gguf registered (${(size / (1024 * 1024)).toFixed(1)} MB)`);
292
+ // Report 100% to keep the existing progress UI happy — the actual
293
+ // download to OPFS happened before the worker spawn.
294
+ post({ type: 'progress', fraction: 1, downloaded: size, total: size });
295
+ } else {
296
+ if (!(contentLength > 0)) {
297
+ throw new Error('content-length is required for streaming into WASM heap');
298
+ }
299
+ status('downloading', 'Streaming model into WASM heap...');
300
 
301
+ modelPtr = Module._malloc(contentLength);
302
+ if (!modelPtr) {
303
+ throw new Error(
304
+ `_malloc(${(contentLength / (1024 * 1024)).toFixed(0)} MB) failed — wasm heap exhausted`
305
+ );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  }
 
307
 
308
+ try {
309
+ let downloaded = 0;
310
+ if (stream) {
311
+ const reader = stream.getReader();
312
+ while (true) {
313
+ const { done, value } = await reader.read();
314
+ if (done) break;
315
+ Module.HEAPU8.set(value, modelPtr + downloaded);
316
+ downloaded += value.length;
317
+ post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
318
+ }
319
+ } else {
320
+ const view = new Uint8Array(buffer);
321
+ if (view.byteLength !== contentLength) {
322
+ log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
323
+ }
324
+ Module.HEAPU8.set(view, modelPtr);
325
+ downloaded = view.byteLength;
326
+ post({ type: 'progress', fraction: 1, downloaded, total: contentLength });
327
+ }
328
+ log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
329
 
330
+ const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
331
+ Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
332
+
333
+ const node = Module.FS.lookupPath('/model.gguf').node;
334
+ Object.defineProperty(node, 'contents', {
335
+ get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
336
+ set: () => { /* read-only file */ },
337
+ configurable: true,
338
+ });
339
+ node.usedBytes = contentLength;
340
+ } catch (err) {
341
+ Module._free(modelPtr);
342
+ modelPtr = 0;
343
+ throw err;
344
+ }
345
  }
346
 
347
  // ─── Init backend ───
 
351
  log('Backends initialized');
352
 
353
  // ─── Load model ───
354
+ // OPFS path requires use_mmap=0 — the patched mmap throws to surface bugs
355
+ // if it's accidentally invoked. Heap path uses mmap=1 to take MEMFS's
356
+ // zero-copy mmap fast path against our HEAPU8-backed file.
357
+ const useMmap = useOpfsPath ? 0 : 1;
358
+ status('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers}, mmap=${useMmap})...`);
359
  const loadResult = await Module.ccall(
360
  'bench_load',
361
  'number',
362
+ ['string', 'number', 'number', 'number'],
363
+ ['/model.gguf', nCtx, nGpuLayers, useMmap],
364
  { async: true },
365
  );
366
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
367
  log('Model loaded');
368
 
369
+ if (!useOpfsPath) {
370
+ // Heap path: drop the MEMFS node now that llama.cpp's mmap captured a
371
+ // pointer into our _malloc'd region. Bytes stay alive in the heap until
372
+ // bench_exit + _free.
373
+ try {
374
+ Module.FS.unlink('/model.gguf');
375
+ } catch (err) {
376
+ log(`Warning: could not remove model FS node: ${err.message}`);
377
+ }
378
  }
379
 
380
  // ─── Consistency phase ───
 
486
 
487
  await Module.ccall('bench_exit', null, [], [], { async: true });
488
 
489
+ if (useOpfsPath) {
490
+ // Close the sync handle so OPFS can release its lock on the file (and
491
+ // so a subsequent run can open a fresh handle without colliding).
492
+ opfsFreeAll(Module);
493
+ } else if (modelPtr) {
494
  Module._free(modelPtr);
495
  modelPtr = 0;
496
  }
js/run/controller.js CHANGED
@@ -1126,6 +1126,7 @@ async function onRunClick() {
1126
  function runInWorker({
1127
  params,
1128
  stream,
 
1129
  onStatus,
1130
  onProgress,
1131
  onLog,
@@ -1166,6 +1167,19 @@ function runInWorker({
1166
  finish({ status: 'error', error: 'worker message deserialization failed' });
1167
  };
1168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1169
  // Mobile browsers (esp. iOS Safari) advertise transferable streams but
1170
  // can't actually transfer ReadableStreams across postMessage — the call
1171
  // throws "The object can not be cloned." We probe once with a tiny
@@ -1249,9 +1263,59 @@ async function readStreamToBuffer(stream, contentLength, onProgress) {
1249
  return out.buffer;
1250
  }
1251
 
1252
- // Fetch the model through the source adapter and hand the stream to a
1253
- // freshly-spawned worker. Returns a record shaped like runBenchmarkCore().
 
 
 
 
 
 
 
 
 
 
1254
  async function runBenchmarkInWorker(v, params, callbacks) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1255
  let fetched;
1256
  try {
1257
  fetched = await state.source.fetchModel(v.repo, v.filename);
@@ -1259,30 +1323,13 @@ async function runBenchmarkInWorker(v, params, callbacks) {
1259
  return { status: 'error', error: `fetchModel failed: ${err.message}` };
1260
  }
1261
 
1262
- const record = await runInWorker({
1263
- params: {
1264
- buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
1265
- contentLength: fetched.contentLength,
1266
- // Model load
1267
- nCtx: params.nCtx,
1268
- nGpuLayers: params.nGpuLayers,
1269
- // Consistency phase — empty consistencyPrompt skips it
1270
- consistencyPrompt: params.consistencyPrompt || '',
1271
- consistencyNPredict: params.consistencyNPredict || DEFAULT_N_PREDICT,
1272
- refTokenIds: params.refTokenIds || null,
1273
- // Perf phase — set both to 0 to skip
1274
- nPrompt: params.nPrompt ?? 0,
1275
- nGen: params.nGen ?? 0,
1276
- nReps: params.nReps ?? DEFAULT_ITERATIONS,
1277
- noWarmup: !!params.noWarmup,
1278
- },
1279
  stream: fetched.stream,
1280
  onStatus: callbacks.onStatus,
1281
  onProgress: callbacks.onProgress,
1282
  onLog: callbacks.onLog,
1283
  });
1284
-
1285
- return record;
1286
  }
1287
 
1288
  // Runs one variant: CPU consistency baseline (one model load, generates
 
1126
  function runInWorker({
1127
  params,
1128
  stream,
1129
+ fileHandle,
1130
  onStatus,
1131
  onProgress,
1132
  onLog,
 
1167
  finish({ status: 'error', error: 'worker message deserialization failed' });
1168
  };
1169
 
1170
+ // Three transport modes — see bench-worker.js runOne() for matching shape.
1171
+ if (fileHandle) {
1172
+ // OPFS path: FileSystemFileHandle is structured-cloneable, not
1173
+ // transferable. The worker creates its own sync access handle on the
1174
+ // cloned reference (still bound to the same underlying OPFS file).
1175
+ try {
1176
+ worker.postMessage({ type: 'run', params, fileHandle });
1177
+ } catch (err) {
1178
+ finish({ status: 'error', error: `postMessage(fileHandle) failed: ${err.message}` });
1179
+ }
1180
+ return;
1181
+ }
1182
+
1183
  // Mobile browsers (esp. iOS Safari) advertise transferable streams but
1184
  // can't actually transfer ReadableStreams across postMessage — the call
1185
  // throws "The object can not be cloned." We probe once with a tiny
 
1263
  return out.buffer;
1264
  }
1265
 
1266
+ // Fetch the model and hand it to a freshly-spawned worker. Returns a record
1267
+ // shaped like runBenchmarkCore(). Two paths:
1268
+ //
1269
+ // wllama-style OPFS streaming (preferred): if the source provides
1270
+ // opfsHandleForModel (currently hostedSource), download to OPFS on the
1271
+ // main thread, then post the FileSystemFileHandle (structured-cloned, not transferred) to the worker.
1272
+ // The worker opens a sync access handle and routes MEMFS reads through
1273
+ // it, never copying the model into the WASM heap. Supports >2GB.
1274
+ //
1275
+ // Heap-stream (fallback for localSource): keep the prior behavior —
1276
+ // stream the GGUF into a single _malloc'd buffer in the WASM heap.
1277
+ // Faster for small models (zero-copy mmap on load), capped at ~2GB.
1278
  async function runBenchmarkInWorker(v, params, callbacks) {
1279
+ const useOpfs = typeof state.source.opfsHandleForModel === 'function';
1280
+
1281
+ const baseParams = {
1282
+ buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
1283
+ // Model load
1284
+ nCtx: params.nCtx,
1285
+ nGpuLayers: params.nGpuLayers,
1286
+ // Consistency phase — empty consistencyPrompt skips it
1287
+ consistencyPrompt: params.consistencyPrompt || '',
1288
+ consistencyNPredict: params.consistencyNPredict || DEFAULT_N_PREDICT,
1289
+ refTokenIds: params.refTokenIds || null,
1290
+ // Perf phase — set both to 0 to skip
1291
+ nPrompt: params.nPrompt ?? 0,
1292
+ nGen: params.nGen ?? 0,
1293
+ nReps: params.nReps ?? DEFAULT_ITERATIONS,
1294
+ noWarmup: !!params.noWarmup,
1295
+ };
1296
+
1297
+ if (useOpfs) {
1298
+ let fileHandle, contentLength;
1299
+ try {
1300
+ callbacks.onStatus?.('downloading', 'Downloading model to OPFS...');
1301
+ const r = await state.source.opfsHandleForModel(
1302
+ v.repo, v.filename,
1303
+ callbacks.onProgress,
1304
+ );
1305
+ fileHandle = r.handle;
1306
+ contentLength = r.size;
1307
+ } catch (err) {
1308
+ return { status: 'error', error: `opfsHandleForModel failed: ${err.message}` };
1309
+ }
1310
+ return runInWorker({
1311
+ params: { ...baseParams, contentLength },
1312
+ fileHandle,
1313
+ onStatus: callbacks.onStatus,
1314
+ onProgress: callbacks.onProgress,
1315
+ onLog: callbacks.onLog,
1316
+ });
1317
+ }
1318
+
1319
  let fetched;
1320
  try {
1321
  fetched = await state.source.fetchModel(v.repo, v.filename);
 
1323
  return { status: 'error', error: `fetchModel failed: ${err.message}` };
1324
  }
1325
 
1326
+ return runInWorker({
1327
+ params: { ...baseParams, contentLength: fetched.contentLength },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1328
  stream: fetched.stream,
1329
  onStatus: callbacks.onStatus,
1330
  onProgress: callbacks.onProgress,
1331
  onLog: callbacks.onLog,
1332
  });
 
 
1333
  }
1334
 
1335
  // Runs one variant: CPU consistency baseline (one model load, generates
js/run/core.js CHANGED
@@ -329,12 +329,17 @@ export async function runBenchmarkCore({
329
  if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
330
  onLog('Backends initialized');
331
 
332
- onStatus('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers})...`);
 
 
 
 
 
333
  const loadResult = await Module.ccall(
334
  'bench_load',
335
  'number',
336
- ['string', 'number', 'number'],
337
- ['/model.gguf', nCtx, nGpuLayers],
338
  { async: true },
339
  );
340
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
 
329
  if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
330
  onLog('Backends initialized');
331
 
332
+ // core.js is the main-thread/heap-stream path (used by harness.js +
333
+ // runner.js Playwright harness). Sync access handles aren't available
334
+ // on the main thread, so we always pass use_mmap=1 here — llama.cpp
335
+ // mmaps the HEAPU8-backed MEMFS file zero-copy. Capped at ~2GB.
336
+ // For >2GB models, run via the dashboard Run page (worker path).
337
+ onStatus('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers}, mmap=1)...`);
338
  const loadResult = await Module.ccall(
339
  'bench_load',
340
  'number',
341
+ ['string', 'number', 'number', 'number'],
342
+ ['/model.gguf', nCtx, nGpuLayers, 1],
343
  { async: true },
344
  );
345
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
js/run/source.js CHANGED
@@ -95,6 +95,54 @@ export function hostedSource() {
95
  }
96
  },
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  async fetchModel(repo, file) {
99
  // Cache hit → stream the OPFS file straight out.
100
  try {
 
95
  }
96
  },
97
 
98
+ // Ensure the model is fully downloaded to OPFS, then return its
99
+ // FileSystemFileHandle. Used by the wllama-style OPFS-streaming load
100
+ // path: the worker opens a sync access handle on this FileHandle and
101
+ // routes MEMFS reads through it, never copying the model into the
102
+ // WASM heap. onProgress is called during the download leg with
103
+ // (fraction, downloaded, total).
104
+ async opfsHandleForModel(repo, file, onProgress) {
105
+ const cached = await getOpfsFileHandle(repo, file, { create: false }).catch(() => null);
106
+ if (cached) {
107
+ const f = await cached.getFile();
108
+ if (f.size > 0) {
109
+ onProgress?.(1, f.size, f.size);
110
+ return { handle: cached, size: f.size };
111
+ }
112
+ }
113
+
114
+ // Cache miss — download from HF straight into a writable OPFS stream.
115
+ const url = `https://huggingface.co/${repo}/resolve/main/${file}`;
116
+ const resp = await fetch(url);
117
+ if (!resp.ok) {
118
+ throw new Error(`Download failed: ${resp.status} ${resp.statusText}`);
119
+ }
120
+ const contentLength = parseInt(resp.headers.get('content-length') || '0', 10);
121
+
122
+ const handle = await getOpfsFileHandle(repo, file, { create: true });
123
+ const writable = await handle.createWritable({ keepExistingData: false });
124
+
125
+ // Same persistent-storage hint as fetchModel — best-effort.
126
+ navigator.storage?.persist?.().catch(() => {});
127
+
128
+ try {
129
+ const reader = resp.body.getReader();
130
+ let downloaded = 0;
131
+ while (true) {
132
+ const { done, value } = await reader.read();
133
+ if (done) break;
134
+ await writable.write(value);
135
+ downloaded += value.byteLength;
136
+ if (contentLength > 0) onProgress?.(downloaded / contentLength, downloaded, contentLength);
137
+ }
138
+ await writable.close();
139
+ return { handle, size: downloaded };
140
+ } catch (err) {
141
+ try { await writable.abort(err); } catch { /* ignore */ }
142
+ throw err;
143
+ }
144
+ },
145
+
146
  async fetchModel(repo, file) {
147
  // Cache hit → stream the OPFS file straight out.
148
  try {