GitHub Actions committed on
Commit
ba6f9e5
·
1 Parent(s): 2ee9bac

sync from abhijitramesh/webgpu-bench@55ab2c71db

Browse files
Files changed (3) hide show
  1. js/run/bench-worker.js +63 -20
  2. js/run/controller.js +11 -11
  3. js/run/core.js +60 -21
js/run/bench-worker.js CHANGED
@@ -126,25 +126,61 @@ async function runOne({ params, stream }) {
126
  });
127
  log('WASM module loaded');
128
 
129
- // ─── Stream the model into MEMFS ───
130
- status('downloading', 'Streaming model into WASM FS...');
131
- if (contentLength > 0) {
132
- Module.FS.writeFile('/model.gguf', new Uint8Array(0));
133
- Module.FS.truncate('/model.gguf', contentLength);
 
 
 
 
 
 
 
134
  }
135
- const memfsHandle = Module.FS.open('/model.gguf', 'w');
136
- const reader = stream.getReader();
137
- let downloaded = 0;
138
- while (true) {
139
- const { done, value } = await reader.read();
140
- if (done) break;
141
- Module.FS.write(memfsHandle, value, 0, value.length, downloaded);
142
- downloaded += value.length;
143
- const fraction = contentLength ? downloaded / contentLength : 0;
144
- post({ type: 'progress', fraction, downloaded, total: contentLength });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  }
146
- Module.FS.close(memfsHandle);
147
- log(`Model written to /model.gguf (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
148
 
149
  // ─── Init backend ───
150
  status('initializing', 'Initializing llama.cpp backends...');
@@ -164,12 +200,13 @@ async function runOne({ params, stream }) {
164
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
165
  log('Model loaded');
166
 
167
- // Free MEMFS copy — llama.cpp has mapped weights into its own heap by now.
 
 
168
  try {
169
  Module.FS.unlink('/model.gguf');
170
- log('Freed model file from virtual FS');
171
  } catch (err) {
172
- log(`Warning: could not remove model file from FS: ${err.message}`);
173
  }
174
 
175
  // ─── Inference ───
@@ -227,6 +264,12 @@ async function runOne({ params, stream }) {
227
 
228
  await Module.ccall('bench_exit', null, [], [], { async: true });
229
 
 
 
 
 
 
 
230
  result.status = 'done';
231
  status('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
232
  log(
 
126
  });
127
  log('WASM module loaded');
128
 
129
+ // ─── Stream the model into the WASM heap (HeapFS-style) ───
130
+ // Avoid the JS-side MEMFS staging buffer by allocating space inside the
131
+ // WASM heap with _malloc and writing chunks directly via HEAPU8.set. Then
132
+ // register the file with MEMFS using a Uint8Array view backed by the heap
133
+ // region, so llama.cpp's mmap can take the zero-copy branch in MEMFS.mmap
134
+ // (which fires when contents.buffer === HEAP8.buffer).
135
+ //
136
+ // Heap growth during bench_init/bench_load detaches old views, so we
137
+ // override node.contents with a getter that always rebuilds the view
138
+ // from the saved pointer + length against the current Module.HEAPU8.
139
+ if (!(contentLength > 0)) {
140
+ throw new Error('content-length is required for streaming into WASM heap');
141
  }
142
+ status('downloading', 'Streaming model into WASM heap...');
143
+
144
+ let modelPtr = Module._malloc(contentLength);
145
+ if (!modelPtr) {
146
+ throw new Error(
147
+ `_malloc(${(contentLength / (1024 * 1024)).toFixed(0)} MB) failed — wasm heap exhausted`
148
+ );
149
+ }
150
+
151
+ try {
152
+ const reader = stream.getReader();
153
+ let downloaded = 0;
154
+ while (true) {
155
+ const { done, value } = await reader.read();
156
+ if (done) break;
157
+ Module.HEAPU8.set(value, modelPtr + downloaded);
158
+ downloaded += value.length;
159
+ post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
160
+ }
161
+ log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
162
+
163
+ // Register as a MEMFS file with a heap-backed view. canOwn=true so MEMFS
164
+ // doesn't make its own copy.
165
+ const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
166
+ Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
167
+
168
+ // Replace contents with a getter — heap growth (e.g. when llama.cpp
169
+ // allocates KV cache) replaces Module.HEAPU8.buffer, which would
170
+ // detach our static view. The getter rebuilds against the live buffer.
171
+ const node = Module.FS.lookupPath('/model.gguf').node;
172
+ Object.defineProperty(node, 'contents', {
173
+ get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
174
+ set: () => { /* read-only file */ },
175
+ configurable: true,
176
+ });
177
+ // usedBytes is read by MEMFS for stat() — keep it accurate.
178
+ node.usedBytes = contentLength;
179
+ } catch (err) {
180
+ Module._free(modelPtr);
181
+ modelPtr = 0;
182
+ throw err;
183
  }
 
 
184
 
185
  // ─── Init backend ───
186
  status('initializing', 'Initializing llama.cpp backends...');
 
200
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
201
  log('Model loaded');
202
 
203
+ // Drop the MEMFS node — the bytes themselves stay alive in the WASM heap
204
+ // because llama.cpp's mmap captured a pointer into our _malloc'd region.
205
+ // We free that region after bench_exit.
206
  try {
207
  Module.FS.unlink('/model.gguf');
 
208
  } catch (err) {
209
+ log(`Warning: could not remove model FS node: ${err.message}`);
210
  }
211
 
212
  // ─── Inference ───
 
264
 
265
  await Module.ccall('bench_exit', null, [], [], { async: true });
266
 
267
+ // Free the heap-resident model bytes now that llama.cpp has unmapped.
268
+ if (modelPtr) {
269
+ Module._free(modelPtr);
270
+ modelPtr = 0;
271
+ }
272
+
273
  result.status = 'done';
274
  status('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
275
  log(
js/run/controller.js CHANGED
@@ -991,19 +991,19 @@ async function runVariantWithIterations(v, row) {
991
  cpuResult = { status: 'error', error: err.message || String(err) };
992
  }
993
 
994
- if (cpuResult.status !== 'done') {
995
- return {
996
- status: 'error',
997
- error: `CPU baseline failed: ${cpuResult.error || 'unknown'}`,
998
- iterations: 0,
999
- cpu: cpuResult,
1000
- gpuSamples: [],
1001
- consistency: null,
1002
- gpuCore: null,
1003
- };
1004
  }
1005
 
1006
- const refTokenIds = (cpuResult.metrics?.token_ids || []).join(',');
1007
 
1008
  // ─── GPU iterations ───
1009
  const gpuSamples = [];
 
991
  cpuResult = { status: 'error', error: err.message || String(err) };
992
  }
993
 
994
+ // CPU baseline is "best effort": if it fails (typically OOM on a tight
995
+ // tab), keep going with GPU runs but skip the consistency check, since
996
+ // we'd have no reference token IDs to compare against. The user still
997
+ // gets prefill/decode metrics — just no agreement-rate number.
998
+ const cpuOk = cpuResult.status === 'done';
999
+ if (!cpuOk) {
1000
+ logLine(
1001
+ `CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU runs, skipping consistency check.`
1002
+ );
1003
+ row.setStatus('cpu-skipped', 'continuing with GPU only');
1004
  }
1005
 
1006
+ const refTokenIds = cpuOk ? (cpuResult.metrics?.token_ids || []).join(',') : '';
1007
 
1008
  // ─── GPU iterations ───
1009
  const gpuSamples = [];
js/run/core.js CHANGED
@@ -49,6 +49,9 @@ export async function runBenchmarkCore({
49
  output: '',
50
  };
51
 
 
 
 
52
  try {
53
  // WebGPU adapter probe — informational only.
54
  if (navigator.gpu) {
@@ -72,7 +75,7 @@ export async function runBenchmarkCore({
72
  onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
73
  await loadBenchScriptOnce(buildType);
74
 
75
- const Module = await globalThis.createBenchModule({
76
  print: (text) => onLog(`[wasm] ${text}`),
77
  printErr: (text) => onLog(`[wasm:err] ${text}`),
78
  // Catch Emscripten abort() — Firefox can abort during Asyncify init.
@@ -93,25 +96,49 @@ export async function runBenchmarkCore({
93
  contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
94
  }`);
95
 
96
- // Stream directly into MEMFS to avoid holding the full model in JS memory.
97
- // Pre-allocate so MEMFS doesn't realloc on every chunk.
98
- if (contentLength > 0) {
99
- Module.FS.writeFile('/model.gguf', new Uint8Array(0));
100
- Module.FS.truncate('/model.gguf', contentLength);
 
 
 
 
101
  }
102
- const memfsHandle = Module.FS.open('/model.gguf', 'w');
103
- const reader = stream.getReader();
104
- let downloaded = 0;
105
- while (true) {
106
- const { done, value } = await reader.read();
107
- if (done) break;
108
- Module.FS.write(memfsHandle, value, 0, value.length, downloaded);
109
- downloaded += value.length;
110
- const fraction = contentLength ? downloaded / contentLength : 0;
111
- onProgress(fraction, downloaded, contentLength);
112
  }
113
- Module.FS.close(memfsHandle);
114
- onLog(`Model written to /model.gguf (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  // Init backend.
117
  onStatus('initializing', 'Initializing llama.cpp backends...');
@@ -133,12 +160,13 @@ export async function runBenchmarkCore({
133
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
134
  onLog('Model loaded');
135
 
136
- // llama.cpp has copied the model into WASM heap — free the MEMFS copy.
 
 
137
  try {
138
  Module.FS.unlink('/model.gguf');
139
- onLog('Freed model file from virtual FS');
140
  } catch (e) {
141
- onLog(`Warning: could not remove model file from FS: ${e.message}`);
142
  }
143
 
144
  // Run inference.
@@ -198,6 +226,12 @@ export async function runBenchmarkCore({
198
  onLog('Calling bench_exit()...');
199
  await Module.ccall('bench_exit', null, [], [], { async: true });
200
 
 
 
 
 
 
 
201
  result.status = 'done';
202
  onStatus('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
203
  onLog(
@@ -216,6 +250,11 @@ export async function runBenchmarkCore({
216
  onStatus('error', `Error: ${err.message}`);
217
  onLog(`ERROR: ${err.message}`);
218
  if (err.stack) onLog(err.stack);
 
 
 
 
 
219
  return result;
220
  }
221
  }
 
49
  output: '',
50
  };
51
 
52
+ // Declared outside the try so the catch can free our heap allocation.
53
+ let Module;
54
+
55
  try {
56
  // WebGPU adapter probe — informational only.
57
  if (navigator.gpu) {
 
75
  onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
76
  await loadBenchScriptOnce(buildType);
77
 
78
+ Module = await globalThis.createBenchModule({
79
  print: (text) => onLog(`[wasm] ${text}`),
80
  printErr: (text) => onLog(`[wasm:err] ${text}`),
81
  // Catch Emscripten abort() — Firefox can abort during Asyncify init.
 
96
  contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
97
  }`);
98
 
99
+ // Stream the GGUF directly into the WASM heap (HeapFS-style) to avoid a
100
+ // duplicate JS-side MEMFS staging buffer. _malloc reserves a region in
101
+ // the linear memory; HEAPU8.set writes chunks in place. We then expose
102
+ // the region as a MEMFS file with `canOwn=true` so MEMFS does not copy,
103
+ // and override node.contents with a getter that always rebuilds the
104
+ // view from the saved pointer — this survives the heap growth that
105
+ // llama.cpp triggers during bench_init/bench_load.
106
+ if (!(contentLength > 0)) {
107
+ throw new Error('content-length is required for streaming into WASM heap');
108
  }
109
+ let modelPtr = Module._malloc(contentLength);
110
+ if (!modelPtr) {
111
+ throw new Error(
112
+ `_malloc(${(contentLength / (1024 * 1024)).toFixed(0)} MB) failed — wasm heap exhausted`
113
+ );
 
 
 
 
 
114
  }
115
+ try {
116
+ const reader = stream.getReader();
117
+ let downloaded = 0;
118
+ while (true) {
119
+ const { done, value } = await reader.read();
120
+ if (done) break;
121
+ Module.HEAPU8.set(value, modelPtr + downloaded);
122
+ downloaded += value.length;
123
+ onProgress(downloaded / contentLength, downloaded, contentLength);
124
+ }
125
+ onLog(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
126
+
127
+ const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
128
+ Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
129
+ const node = Module.FS.lookupPath('/model.gguf').node;
130
+ Object.defineProperty(node, 'contents', {
131
+ get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
132
+ set: () => { /* read-only */ },
133
+ configurable: true,
134
+ });
135
+ node.usedBytes = contentLength;
136
+ } catch (err) {
137
+ Module._free(modelPtr);
138
+ throw err;
139
+ }
140
+ // Track on the result object so we can free in the success/exit paths.
141
+ result._modelPtr = modelPtr;
142
 
143
  // Init backend.
144
  onStatus('initializing', 'Initializing llama.cpp backends...');
 
160
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
161
  onLog('Model loaded');
162
 
163
+ // Drop the MEMFS node — llama.cpp's mmap captured a pointer into the
164
+ // _malloc'd region in the WASM heap, so the bytes themselves stay alive
165
+ // until we _free below after bench_exit.
166
  try {
167
  Module.FS.unlink('/model.gguf');
 
168
  } catch (e) {
169
+ onLog(`Warning: could not remove model FS node: ${e.message}`);
170
  }
171
 
172
  // Run inference.
 
226
  onLog('Calling bench_exit()...');
227
  await Module.ccall('bench_exit', null, [], [], { async: true });
228
 
229
+ // Free the heap-resident model bytes now that llama.cpp has unmapped.
230
+ if (result._modelPtr) {
231
+ Module._free(result._modelPtr);
232
+ delete result._modelPtr;
233
+ }
234
+
235
  result.status = 'done';
236
  onStatus('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
237
  onLog(
 
250
  onStatus('error', `Error: ${err.message}`);
251
  onLog(`ERROR: ${err.message}`);
252
  if (err.stack) onLog(err.stack);
253
+ // Best-effort: release the model heap region so a re-run can reuse it.
254
+ if (result._modelPtr && Module?._free) {
255
+ try { Module._free(result._modelPtr); } catch { /* ignore */ }
256
+ delete result._modelPtr;
257
+ }
258
  return result;
259
  }
260
  }