Spaces:
Running
Running
GitHub Actions committed on
Commit ·
299e359
1
Parent(s): 0fc83ad
sync from abhijitramesh/webgpu-bench@dab7e7757e
Browse files- build/asyncify/bench.wasm +2 -2
- build/asyncify/build-info.json +2 -2
- build/jspi/bench.wasm +2 -2
- build/jspi/build-info.json +2 -2
- js/run/bench-worker.js +183 -55
- js/run/controller.js +68 -21
- js/run/core.js +8 -3
- js/run/source.js +48 -0
build/asyncify/bench.wasm
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50895b262f9b0da117509d04075ca06f3b30d3482c130d22c827e53e20d8a650
|
| 3 |
+
size 5233188
|
build/asyncify/build-info.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
|
| 3 |
-
"llamaCppDescribe": "
|
| 4 |
"dawnTag": "v20260317.182325",
|
| 5 |
-
"builtAt": "2026-04-
|
| 6 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
|
| 3 |
+
"llamaCppDescribe": "b8981-3-gf22c8021d",
|
| 4 |
"dawnTag": "v20260317.182325",
|
| 5 |
+
"builtAt": "2026-04-29T23:41:53Z"
|
| 6 |
}
|
build/jspi/bench.wasm
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92ef71c59da832ad869cbc002665fd3bb3505c7e515a7cefc5d7f7901224ea40
|
| 3 |
+
size 3612135
|
build/jspi/build-info.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
|
| 3 |
-
"llamaCppDescribe": "
|
| 4 |
"dawnTag": "v20260317.182325",
|
| 5 |
-
"builtAt": "2026-04-
|
| 6 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
|
| 3 |
+
"llamaCppDescribe": "b8981-3-gf22c8021d",
|
| 4 |
"dawnTag": "v20260317.182325",
|
| 5 |
+
"builtAt": "2026-04-29T23:37:54Z"
|
| 6 |
}
|
js/run/bench-worker.js
CHANGED
|
@@ -37,6 +37,95 @@ const post = (msg) => self.postMessage(msg);
|
|
| 37 |
const log = (line) => post({ type: 'log', line });
|
| 38 |
const status = (s, msg) => post({ type: 'status', status: s, msg });
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
// Aggregate raw nanosecond samples into the llama-bench result shape.
|
| 41 |
// Mirrors core.js buildTest — keep them identical.
|
| 42 |
function buildTest(name, n_prompt, n_gen, samples_ns) {
|
|
@@ -100,7 +189,7 @@ self.onmessage = async (e) => {
|
|
| 100 |
}
|
| 101 |
};
|
| 102 |
|
| 103 |
-
async function runOne({ params, stream, buffer }) {
|
| 104 |
const {
|
| 105 |
buildType,
|
| 106 |
contentLength,
|
|
@@ -116,8 +205,14 @@ async function runOne({ params, stream, buffer }) {
|
|
| 116 |
nReps,
|
| 117 |
noWarmup,
|
| 118 |
} = params;
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
}
|
| 122 |
|
| 123 |
const result = {
|
|
@@ -178,55 +273,75 @@ async function runOne({ params, stream, buffer }) {
|
|
| 178 |
});
|
| 179 |
log('WASM module loaded');
|
| 180 |
|
| 181 |
-
// ───
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
);
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
const { done, value } = await reader.read();
|
| 200 |
-
if (done) break;
|
| 201 |
-
Module.HEAPU8.set(value, modelPtr + downloaded);
|
| 202 |
-
downloaded += value.length;
|
| 203 |
-
post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
|
| 204 |
-
}
|
| 205 |
-
} else {
|
| 206 |
-
const view = new Uint8Array(buffer);
|
| 207 |
-
if (view.byteLength !== contentLength) {
|
| 208 |
-
log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
|
| 209 |
-
}
|
| 210 |
-
Module.HEAPU8.set(view, modelPtr);
|
| 211 |
-
downloaded = view.byteLength;
|
| 212 |
-
post({ type: 'progress', fraction: 1, downloaded, total: contentLength });
|
| 213 |
}
|
| 214 |
-
log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
|
| 215 |
|
| 216 |
-
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
}
|
| 231 |
|
| 232 |
// ─── Init backend ───
|
|
@@ -236,21 +351,30 @@ async function runOne({ params, stream, buffer }) {
|
|
| 236 |
log('Backends initialized');
|
| 237 |
|
| 238 |
// ─── Load model ───
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
const loadResult = await Module.ccall(
|
| 241 |
'bench_load',
|
| 242 |
'number',
|
| 243 |
-
['string', 'number', 'number'],
|
| 244 |
-
['/model.gguf', nCtx, nGpuLayers],
|
| 245 |
{ async: true },
|
| 246 |
);
|
| 247 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
| 248 |
log('Model loaded');
|
| 249 |
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
}
|
| 255 |
|
| 256 |
// ─── Consistency phase ───
|
|
@@ -362,7 +486,11 @@ async function runOne({ params, stream, buffer }) {
|
|
| 362 |
|
| 363 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
| 364 |
|
| 365 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
Module._free(modelPtr);
|
| 367 |
modelPtr = 0;
|
| 368 |
}
|
|
|
|
| 37 |
const log = (line) => post({ type: 'log', line });
|
| 38 |
const status = (s, msg) => post({ type: 'status', status: s, msg });
|
| 39 |
|
| 40 |
+
// ─── OPFS-backed model loading (wllama-style) ───
|
| 41 |
+
// For >2GB GGUFs we can't put the whole file on the WASM heap (TypedArray
|
| 42 |
+
// length limits, and it eats the heap budget that KV cache + working memory
|
| 43 |
+
// need). Instead, we open a FileSystemSyncAccessHandle on the OPFS file in
|
| 44 |
+
// this worker, register a zero-byte stub in MEMFS, and patch MEMFS's
|
| 45 |
+
// stream_ops so reads delegate to syncHandle.read(). llama.cpp then loads
|
| 46 |
+
// the model via fread (use_mmap=false), which calls the patched stream_ops
|
| 47 |
+
// — never copying the bytes through the WASM heap.
|
| 48 |
+
//
|
| 49 |
+
// Mirrors wllama's src/workers-code/llama-cpp.js (patchMEMFS / opfsAlloc /
|
| 50 |
+
// opfsFreeAll). Worker-only: sync access handles aren't available on the
|
| 51 |
+
// main thread.
|
| 52 |
+
|
| 53 |
+
const opfsHandles = {}; // map MEMFS-name → { syncHandle, size }
|
| 54 |
+
|
| 55 |
+
/**
 * Patch Emscripten's MEMFS stream ops so that files registered in
 * `opfsHandles` are served from their OPFS sync access handle instead of
 * MEMFS's in-heap contents. Files that are not registered fall through to
 * the original MEMFS implementations, which are kept as `_read`, `_llseek`
 * and `_mmap` on `stream_ops`.
 *
 * Idempotent: a second call on the same Module is a no-op.
 *
 * @param {object} Module - Emscripten module exposing `MEMFS` (and, when
 *   available, `FS` for `FS.ErrnoError`).
 */
function patchMEMFS(Module) {
  const ops = Module.MEMFS.stream_ops;
  const fileStreamOps = Module.MEMFS.ops_table.file.stream;

  // Idempotent — only install the patches once per Module.
  if (ops._read) return;
  ops._read = ops.read;
  ops._llseek = ops.llseek;
  ops._mmap = ops.mmap;

  // Files are matched by node name only (not full path), same as the map key
  // used by opfsAlloc.
  const lookup = (stream) => opfsHandles[stream.node.name];

  // Stock MEMFS reports an invalid seek as FS.ErrnoError(28) (EINVAL). Use the
  // same error type when the FS object is exported so the failure travels
  // through Emscripten's normal errno path; otherwise keep the plain Error.
  const EINVAL = 28;
  const invalidSeek = () => {
    const ErrnoError = Module.FS?.ErrnoError;
    return typeof ErrnoError === 'function'
      ? new ErrnoError(EINVAL)
      : new Error('SEEK before start of file');
  };

  ops.read = function (stream, buffer, offset, length, position) {
    const entry = lookup(stream);
    if (!entry) return ops._read(stream, buffer, offset, length, position);

    // Clamp to the real file size; at or past EOF report 0 bytes read.
    const toRead = Math.min(length, entry.size - position);
    if (toRead <= 0) return 0;
    // Read straight into the caller's buffer at the requested offset.
    const target = new Uint8Array(buffer.buffer, buffer.byteOffset + offset, toRead);
    return entry.syncHandle.read(target, { at: position });
  };
  fileStreamOps.read = ops.read;

  ops.llseek = function (stream, offset, whence) {
    const entry = lookup(stream);
    if (!entry) return ops._llseek(stream, offset, whence);

    let newPos = offset;
    if (whence === 1) newPos += stream.position; // SEEK_CUR
    if (whence === 2) newPos += entry.size; // SEEK_END
    if (newPos < 0) throw invalidSeek();
    stream.position = newPos;
    return newPos;
  };
  fileStreamOps.llseek = ops.llseek;

  ops.mmap = function (stream, length, position, prot, flags) {
    const name = stream.node.name;
    if (opfsHandles[name]) {
      // OPFS-backed files must never be mmap'd — the MEMFS stub holds no
      // bytes, so the loader has to go through read(). Landing here means the
      // caller did not disable mmap; fail loudly rather than map an empty file.
      throw new Error(`[OPFS] mmap called on "${name}" — bench_load was not invoked with use_mmap=0`);
    }
    return ops._mmap(stream, length, position, prot, flags);
  };
  fileStreamOps.mmap = ops.mmap;
}
|
| 104 |
+
|
| 105 |
+
/**
 * Register an OPFS file as `/<name>` in the Emscripten filesystem, backed by
 * a sync access handle recorded in `opfsHandles`.
 *
 * @param {object} Module - Emscripten module exposing `FS`.
 * @param {string} name - File name to create under `/`; also the map key.
 * @param {FileSystemFileHandle} fileHandle - OPFS handle of the model file.
 * @returns {Promise<number>} Size in bytes reported by the sync access handle.
 * @throws Rethrows any error from opening the handle or creating the stub.
 *   On failure the sync access handle is closed again and nothing is
 *   registered, so a retry is not blocked by a stale lock.
 */
async function opfsAlloc(Module, name, fileHandle) {
  // createSyncAccessHandle is worker-only and takes an exclusive lock on the
  // file. The caller must ensure no createWritable session is still open.
  const syncHandle = await fileHandle.createSyncAccessHandle();
  let stubCreated = false;
  try {
    const size = syncHandle.getSize();
    // Zero-byte placeholder so fopen() on the path succeeds.
    Module.FS.createDataFile('/', name, new Uint8Array(0), true, false, true);
    stubCreated = true;
    // Make the node report the real file size; the bytes themselves are
    // served by the patched read().
    Module.FS.lookupPath('/' + name).node.usedBytes = size;
    // Register only after the stub is fully set up so a failure above never
    // leaves (or clobbers) an entry in the map.
    opfsHandles[name] = { syncHandle, size };
    return size;
  } catch (err) {
    // Release the exclusive lock and drop the half-made stub before rethrowing.
    try { syncHandle.close(); } catch { /* already closed */ }
    if (stubCreated) {
      try { Module.FS.unlink('/' + name); } catch { /* already gone */ }
    }
    throw err;
  }
}
|
| 120 |
+
|
| 121 |
+
/**
 * Tear down every OPFS-backed file: close its sync access handle, unlink its
 * MEMFS stub under `/`, and drop it from `opfsHandles`. Each step is
 * best-effort, so a failure on one file does not stop the others.
 *
 * @param {object} Module - Emscripten module exposing `FS`.
 */
function opfsFreeAll(Module) {
  // Object.keys() snapshots the names, so deleting while iterating is safe.
  Object.keys(opfsHandles).forEach((fileName) => {
    const { syncHandle } = opfsHandles[fileName];
    try {
      syncHandle.close();
    } catch {
      /* already closed */
    }
    try {
      Module.FS.unlink('/' + fileName);
    } catch {
      /* already gone */
    }
    delete opfsHandles[fileName];
  });
}
|
| 128 |
+
|
| 129 |
// Aggregate raw nanosecond samples into the llama-bench result shape.
|
| 130 |
// Mirrors core.js buildTest — keep them identical.
|
| 131 |
function buildTest(name, n_prompt, n_gen, samples_ns) {
|
|
|
|
| 189 |
}
|
| 190 |
};
|
| 191 |
|
| 192 |
+
async function runOne({ params, stream, buffer, fileHandle }) {
|
| 193 |
const {
|
| 194 |
buildType,
|
| 195 |
contentLength,
|
|
|
|
| 205 |
nReps,
|
| 206 |
noWarmup,
|
| 207 |
} = params;
|
| 208 |
+
// Three input modes are supported:
|
| 209 |
+
// fileHandle → wllama-style OPFS-streaming load (preferred for >2GB)
|
| 210 |
+
// stream → heap-stream mode (zero-copy WASM-heap, transferable)
|
| 211 |
+
// buffer → buffered fallback for browsers without transferable streams
|
| 212 |
+
// Exactly one must be provided.
|
| 213 |
+
const inputCount = (fileHandle ? 1 : 0) + (stream ? 1 : 0) + (buffer ? 1 : 0);
|
| 214 |
+
if (inputCount !== 1) {
|
| 215 |
+
throw new Error('runOne: exactly one of `fileHandle`, `stream`, or `buffer` must be provided');
|
| 216 |
}
|
| 217 |
|
| 218 |
const result = {
|
|
|
|
| 273 |
});
|
| 274 |
log('WASM module loaded');
|
| 275 |
|
| 276 |
+
// ─── Make the model visible to the WASM filesystem ───
|
| 277 |
+
// Two paths:
|
| 278 |
+
// useOpfsPath: leave the bytes on disk (OPFS) and route reads through
|
| 279 |
+
// a sync access handle via patched MEMFS stream_ops. No
|
| 280 |
+
// heap copy, supports >2GB.
|
| 281 |
+
// else: _malloc the full file on the WASM heap, write the stream
|
| 282 |
+
// in, register a heap-backed MEMFS file. Faster (mmap'd
|
| 283 |
+
// zero-copy at load time) but caps at ~2GB.
|
| 284 |
+
let modelPtr = 0; // tracks heap-path allocation for cleanup
|
| 285 |
+
const useOpfsPath = !!fileHandle;
|
| 286 |
|
| 287 |
+
if (useOpfsPath) {
|
| 288 |
+
status('opfs', 'Linking OPFS-backed model into MEMFS...');
|
| 289 |
+
patchMEMFS(Module);
|
| 290 |
+
const size = await opfsAlloc(Module, 'model.gguf', fileHandle);
|
| 291 |
+
log(`OPFS-backed model.gguf registered (${(size / (1024 * 1024)).toFixed(1)} MB)`);
|
| 292 |
+
// Report 100% to keep the existing progress UI happy — the actual
|
| 293 |
+
// download to OPFS happened before the worker spawn.
|
| 294 |
+
post({ type: 'progress', fraction: 1, downloaded: size, total: size });
|
| 295 |
+
} else {
|
| 296 |
+
if (!(contentLength > 0)) {
|
| 297 |
+
throw new Error('content-length is required for streaming into WASM heap');
|
| 298 |
+
}
|
| 299 |
+
status('downloading', 'Streaming model into WASM heap...');
|
| 300 |
|
| 301 |
+
modelPtr = Module._malloc(contentLength);
|
| 302 |
+
if (!modelPtr) {
|
| 303 |
+
throw new Error(
|
| 304 |
+
`_malloc(${(contentLength / (1024 * 1024)).toFixed(0)} MB) failed — wasm heap exhausted`
|
| 305 |
+
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
}
|
|
|
|
| 307 |
|
| 308 |
+
try {
|
| 309 |
+
let downloaded = 0;
|
| 310 |
+
if (stream) {
|
| 311 |
+
const reader = stream.getReader();
|
| 312 |
+
while (true) {
|
| 313 |
+
const { done, value } = await reader.read();
|
| 314 |
+
if (done) break;
|
| 315 |
+
Module.HEAPU8.set(value, modelPtr + downloaded);
|
| 316 |
+
downloaded += value.length;
|
| 317 |
+
post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
|
| 318 |
+
}
|
| 319 |
+
} else {
|
| 320 |
+
const view = new Uint8Array(buffer);
|
| 321 |
+
if (view.byteLength !== contentLength) {
|
| 322 |
+
log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
|
| 323 |
+
}
|
| 324 |
+
Module.HEAPU8.set(view, modelPtr);
|
| 325 |
+
downloaded = view.byteLength;
|
| 326 |
+
post({ type: 'progress', fraction: 1, downloaded, total: contentLength });
|
| 327 |
+
}
|
| 328 |
+
log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
|
| 329 |
|
| 330 |
+
const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
|
| 331 |
+
Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
|
| 332 |
+
|
| 333 |
+
const node = Module.FS.lookupPath('/model.gguf').node;
|
| 334 |
+
Object.defineProperty(node, 'contents', {
|
| 335 |
+
get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
|
| 336 |
+
set: () => { /* read-only file */ },
|
| 337 |
+
configurable: true,
|
| 338 |
+
});
|
| 339 |
+
node.usedBytes = contentLength;
|
| 340 |
+
} catch (err) {
|
| 341 |
+
Module._free(modelPtr);
|
| 342 |
+
modelPtr = 0;
|
| 343 |
+
throw err;
|
| 344 |
+
}
|
| 345 |
}
|
| 346 |
|
| 347 |
// ─── Init backend ───
|
|
|
|
| 351 |
log('Backends initialized');
|
| 352 |
|
| 353 |
// ─── Load model ───
|
| 354 |
+
// OPFS path requires use_mmap=0 — the patched mmap throws to surface bugs
|
| 355 |
+
// if it's accidentally invoked. Heap path uses mmap=1 to take MEMFS's
|
| 356 |
+
// zero-copy mmap fast path against our HEAPU8-backed file.
|
| 357 |
+
const useMmap = useOpfsPath ? 0 : 1;
|
| 358 |
+
status('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers}, mmap=${useMmap})...`);
|
| 359 |
const loadResult = await Module.ccall(
|
| 360 |
'bench_load',
|
| 361 |
'number',
|
| 362 |
+
['string', 'number', 'number', 'number'],
|
| 363 |
+
['/model.gguf', nCtx, nGpuLayers, useMmap],
|
| 364 |
{ async: true },
|
| 365 |
);
|
| 366 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
| 367 |
log('Model loaded');
|
| 368 |
|
| 369 |
+
if (!useOpfsPath) {
|
| 370 |
+
// Heap path: drop the MEMFS node now that llama.cpp's mmap captured a
|
| 371 |
+
// pointer into our _malloc'd region. Bytes stay alive in the heap until
|
| 372 |
+
// bench_exit + _free.
|
| 373 |
+
try {
|
| 374 |
+
Module.FS.unlink('/model.gguf');
|
| 375 |
+
} catch (err) {
|
| 376 |
+
log(`Warning: could not remove model FS node: ${err.message}`);
|
| 377 |
+
}
|
| 378 |
}
|
| 379 |
|
| 380 |
// ─── Consistency phase ───
|
|
|
|
| 486 |
|
| 487 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
| 488 |
|
| 489 |
+
if (useOpfsPath) {
|
| 490 |
+
// Close the sync handle so OPFS can release its lock on the file (and
|
| 491 |
+
// so a subsequent run can open a fresh handle without colliding).
|
| 492 |
+
opfsFreeAll(Module);
|
| 493 |
+
} else if (modelPtr) {
|
| 494 |
Module._free(modelPtr);
|
| 495 |
modelPtr = 0;
|
| 496 |
}
|
js/run/controller.js
CHANGED
|
@@ -1126,6 +1126,7 @@ async function onRunClick() {
|
|
| 1126 |
function runInWorker({
|
| 1127 |
params,
|
| 1128 |
stream,
|
|
|
|
| 1129 |
onStatus,
|
| 1130 |
onProgress,
|
| 1131 |
onLog,
|
|
@@ -1166,6 +1167,19 @@ function runInWorker({
|
|
| 1166 |
finish({ status: 'error', error: 'worker message deserialization failed' });
|
| 1167 |
};
|
| 1168 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1169 |
// Mobile browsers (esp. iOS Safari) advertise transferable streams but
|
| 1170 |
// can't actually transfer ReadableStreams across postMessage — the call
|
| 1171 |
// throws "The object can not be cloned." We probe once with a tiny
|
|
@@ -1249,9 +1263,59 @@ async function readStreamToBuffer(stream, contentLength, onProgress) {
|
|
| 1249 |
return out.buffer;
|
| 1250 |
}
|
| 1251 |
|
| 1252 |
-
// Fetch the model
|
| 1253 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1254 |
async function runBenchmarkInWorker(v, params, callbacks) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1255 |
let fetched;
|
| 1256 |
try {
|
| 1257 |
fetched = await state.source.fetchModel(v.repo, v.filename);
|
|
@@ -1259,30 +1323,13 @@ async function runBenchmarkInWorker(v, params, callbacks) {
|
|
| 1259 |
return { status: 'error', error: `fetchModel failed: ${err.message}` };
|
| 1260 |
}
|
| 1261 |
|
| 1262 |
-
|
| 1263 |
-
params: {
|
| 1264 |
-
buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
|
| 1265 |
-
contentLength: fetched.contentLength,
|
| 1266 |
-
// Model load
|
| 1267 |
-
nCtx: params.nCtx,
|
| 1268 |
-
nGpuLayers: params.nGpuLayers,
|
| 1269 |
-
// Consistency phase — empty consistencyPrompt skips it
|
| 1270 |
-
consistencyPrompt: params.consistencyPrompt || '',
|
| 1271 |
-
consistencyNPredict: params.consistencyNPredict || DEFAULT_N_PREDICT,
|
| 1272 |
-
refTokenIds: params.refTokenIds || null,
|
| 1273 |
-
// Perf phase — set both to 0 to skip
|
| 1274 |
-
nPrompt: params.nPrompt ?? 0,
|
| 1275 |
-
nGen: params.nGen ?? 0,
|
| 1276 |
-
nReps: params.nReps ?? DEFAULT_ITERATIONS,
|
| 1277 |
-
noWarmup: !!params.noWarmup,
|
| 1278 |
-
},
|
| 1279 |
stream: fetched.stream,
|
| 1280 |
onStatus: callbacks.onStatus,
|
| 1281 |
onProgress: callbacks.onProgress,
|
| 1282 |
onLog: callbacks.onLog,
|
| 1283 |
});
|
| 1284 |
-
|
| 1285 |
-
return record;
|
| 1286 |
}
|
| 1287 |
|
| 1288 |
// Runs one variant: CPU consistency baseline (one model load, generates
|
|
|
|
| 1126 |
function runInWorker({
|
| 1127 |
params,
|
| 1128 |
stream,
|
| 1129 |
+
fileHandle,
|
| 1130 |
onStatus,
|
| 1131 |
onProgress,
|
| 1132 |
onLog,
|
|
|
|
| 1167 |
finish({ status: 'error', error: 'worker message deserialization failed' });
|
| 1168 |
};
|
| 1169 |
|
| 1170 |
+
// Three transport modes — see bench-worker.js runOne() for matching shape.
|
| 1171 |
+
if (fileHandle) {
|
| 1172 |
+
// OPFS path: FileSystemFileHandle is structured-cloneable, not
|
| 1173 |
+
// transferable. The worker creates its own sync access handle on the
|
| 1174 |
+
// cloned reference (still bound to the same underlying OPFS file).
|
| 1175 |
+
try {
|
| 1176 |
+
worker.postMessage({ type: 'run', params, fileHandle });
|
| 1177 |
+
} catch (err) {
|
| 1178 |
+
finish({ status: 'error', error: `postMessage(fileHandle) failed: ${err.message}` });
|
| 1179 |
+
}
|
| 1180 |
+
return;
|
| 1181 |
+
}
|
| 1182 |
+
|
| 1183 |
// Mobile browsers (esp. iOS Safari) advertise transferable streams but
|
| 1184 |
// can't actually transfer ReadableStreams across postMessage — the call
|
| 1185 |
// throws "The object can not be cloned." We probe once with a tiny
|
|
|
|
| 1263 |
return out.buffer;
|
| 1264 |
}
|
| 1265 |
|
| 1266 |
+
// Fetch the model and hand it to a freshly-spawned worker. Returns a record
|
| 1267 |
+
// shaped like runBenchmarkCore(). Two paths:
|
| 1268 |
+
//
|
| 1269 |
+
// wllama-style OPFS streaming (preferred): if the source provides
|
| 1270 |
+
// opfsHandleForModel (currently hostedSource), download to OPFS on the
|
| 1271 |
+
// main thread, then post the FileSystemFileHandle to the worker (it is
// structured-cloned, not transferred).
|
| 1272 |
+
// The worker opens a sync access handle and routes MEMFS reads through
|
| 1273 |
+
// it, never copying the model into the WASM heap. Supports >2GB.
|
| 1274 |
+
//
|
| 1275 |
+
// Heap-stream (fallback for localSource): keep the prior behavior —
|
| 1276 |
+
// stream the GGUF into a single _malloc'd buffer in the WASM heap.
|
| 1277 |
+
// Faster for small models (zero-copy mmap on load), capped at ~2GB.
|
| 1278 |
async function runBenchmarkInWorker(v, params, callbacks) {
|
| 1279 |
+
const useOpfs = typeof state.source.opfsHandleForModel === 'function';
|
| 1280 |
+
|
| 1281 |
+
const baseParams = {
|
| 1282 |
+
buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
|
| 1283 |
+
// Model load
|
| 1284 |
+
nCtx: params.nCtx,
|
| 1285 |
+
nGpuLayers: params.nGpuLayers,
|
| 1286 |
+
// Consistency phase — empty consistencyPrompt skips it
|
| 1287 |
+
consistencyPrompt: params.consistencyPrompt || '',
|
| 1288 |
+
consistencyNPredict: params.consistencyNPredict || DEFAULT_N_PREDICT,
|
| 1289 |
+
refTokenIds: params.refTokenIds || null,
|
| 1290 |
+
// Perf phase — set both to 0 to skip
|
| 1291 |
+
nPrompt: params.nPrompt ?? 0,
|
| 1292 |
+
nGen: params.nGen ?? 0,
|
| 1293 |
+
nReps: params.nReps ?? DEFAULT_ITERATIONS,
|
| 1294 |
+
noWarmup: !!params.noWarmup,
|
| 1295 |
+
};
|
| 1296 |
+
|
| 1297 |
+
if (useOpfs) {
|
| 1298 |
+
let fileHandle, contentLength;
|
| 1299 |
+
try {
|
| 1300 |
+
callbacks.onStatus?.('downloading', 'Downloading model to OPFS...');
|
| 1301 |
+
const r = await state.source.opfsHandleForModel(
|
| 1302 |
+
v.repo, v.filename,
|
| 1303 |
+
callbacks.onProgress,
|
| 1304 |
+
);
|
| 1305 |
+
fileHandle = r.handle;
|
| 1306 |
+
contentLength = r.size;
|
| 1307 |
+
} catch (err) {
|
| 1308 |
+
return { status: 'error', error: `opfsHandleForModel failed: ${err.message}` };
|
| 1309 |
+
}
|
| 1310 |
+
return runInWorker({
|
| 1311 |
+
params: { ...baseParams, contentLength },
|
| 1312 |
+
fileHandle,
|
| 1313 |
+
onStatus: callbacks.onStatus,
|
| 1314 |
+
onProgress: callbacks.onProgress,
|
| 1315 |
+
onLog: callbacks.onLog,
|
| 1316 |
+
});
|
| 1317 |
+
}
|
| 1318 |
+
|
| 1319 |
let fetched;
|
| 1320 |
try {
|
| 1321 |
fetched = await state.source.fetchModel(v.repo, v.filename);
|
|
|
|
| 1323 |
return { status: 'error', error: `fetchModel failed: ${err.message}` };
|
| 1324 |
}
|
| 1325 |
|
| 1326 |
+
return runInWorker({
|
| 1327 |
+
params: { ...baseParams, contentLength: fetched.contentLength },
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1328 |
stream: fetched.stream,
|
| 1329 |
onStatus: callbacks.onStatus,
|
| 1330 |
onProgress: callbacks.onProgress,
|
| 1331 |
onLog: callbacks.onLog,
|
| 1332 |
});
|
|
|
|
|
|
|
| 1333 |
}
|
| 1334 |
|
| 1335 |
// Runs one variant: CPU consistency baseline (one model load, generates
|
js/run/core.js
CHANGED
|
@@ -329,12 +329,17 @@ export async function runBenchmarkCore({
|
|
| 329 |
if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
|
| 330 |
onLog('Backends initialized');
|
| 331 |
|
| 332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
const loadResult = await Module.ccall(
|
| 334 |
'bench_load',
|
| 335 |
'number',
|
| 336 |
-
['string', 'number', 'number'],
|
| 337 |
-
['/model.gguf', nCtx, nGpuLayers],
|
| 338 |
{ async: true },
|
| 339 |
);
|
| 340 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
|
|
|
| 329 |
if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
|
| 330 |
onLog('Backends initialized');
|
| 331 |
|
| 332 |
+
// core.js is the main-thread/heap-stream path (used by harness.js +
|
| 333 |
+
// runner.js Playwright harness). Sync access handles aren't available
|
| 334 |
+
// on the main thread, so we always pass use_mmap=1 here — llama.cpp
|
| 335 |
+
// mmap's the HEAPU8-backed MEMFS file zero-copy. Capped at ~2GB.
|
| 336 |
+
// For >2GB models, run via the dashboard Run page (worker path).
|
| 337 |
+
onStatus('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers}, mmap=1)...`);
|
| 338 |
const loadResult = await Module.ccall(
|
| 339 |
'bench_load',
|
| 340 |
'number',
|
| 341 |
+
['string', 'number', 'number', 'number'],
|
| 342 |
+
['/model.gguf', nCtx, nGpuLayers, 1],
|
| 343 |
{ async: true },
|
| 344 |
);
|
| 345 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
js/run/source.js
CHANGED
|
@@ -95,6 +95,54 @@ export function hostedSource() {
|
|
| 95 |
}
|
| 96 |
},
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
async fetchModel(repo, file) {
|
| 99 |
// Cache hit → stream the OPFS file straight out.
|
| 100 |
try {
|
|
|
|
| 95 |
}
|
| 96 |
},
|
| 97 |
|
| 98 |
+
// Ensure the model is fully downloaded to OPFS, then return its
|
| 99 |
+
// FileSystemFileHandle. Used by the wllama-style OPFS-streaming load
|
| 100 |
+
// path: the worker opens a sync access handle on this FileHandle and
|
| 101 |
+
// routes MEMFS reads through it, never copying the model into the
|
| 102 |
+
// WASM heap. onProgress is called during the download leg with
|
| 103 |
+
// (fraction, downloaded, total).
|
| 104 |
+
async opfsHandleForModel(repo, file, onProgress) {
|
| 105 |
+
const cached = await getOpfsFileHandle(repo, file, { create: false }).catch(() => null);
|
| 106 |
+
if (cached) {
|
| 107 |
+
const f = await cached.getFile();
|
| 108 |
+
if (f.size > 0) {
|
| 109 |
+
onProgress?.(1, f.size, f.size);
|
| 110 |
+
return { handle: cached, size: f.size };
|
| 111 |
+
}
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
// Cache miss — download from HF straight into a writable OPFS stream.
|
| 115 |
+
const url = `https://huggingface.co/${repo}/resolve/main/${file}`;
|
| 116 |
+
const resp = await fetch(url);
|
| 117 |
+
if (!resp.ok) {
|
| 118 |
+
throw new Error(`Download failed: ${resp.status} ${resp.statusText}`);
|
| 119 |
+
}
|
| 120 |
+
const contentLength = parseInt(resp.headers.get('content-length') || '0', 10);
|
| 121 |
+
|
| 122 |
+
const handle = await getOpfsFileHandle(repo, file, { create: true });
|
| 123 |
+
const writable = await handle.createWritable({ keepExistingData: false });
|
| 124 |
+
|
| 125 |
+
// Same persistent-storage hint as fetchModel — best-effort.
|
| 126 |
+
navigator.storage?.persist?.().catch(() => {});
|
| 127 |
+
|
| 128 |
+
try {
|
| 129 |
+
const reader = resp.body.getReader();
|
| 130 |
+
let downloaded = 0;
|
| 131 |
+
while (true) {
|
| 132 |
+
const { done, value } = await reader.read();
|
| 133 |
+
if (done) break;
|
| 134 |
+
await writable.write(value);
|
| 135 |
+
downloaded += value.byteLength;
|
| 136 |
+
if (contentLength > 0) onProgress?.(downloaded / contentLength, downloaded, contentLength);
|
| 137 |
+
}
|
| 138 |
+
await writable.close();
|
| 139 |
+
return { handle, size: downloaded };
|
| 140 |
+
} catch (err) {
|
| 141 |
+
try { await writable.abort(err); } catch { /* ignore */ }
|
| 142 |
+
throw err;
|
| 143 |
+
}
|
| 144 |
+
},
|
| 145 |
+
|
| 146 |
async fetchModel(repo, file) {
|
| 147 |
// Cache hit → stream the OPFS file straight out.
|
| 148 |
try {
|