Spaces:
Running
Running
GitHub Actions committed on
Commit ·
ba6f9e5
1
Parent(s): 2ee9bac
sync from abhijitramesh/webgpu-bench@55ab2c71db
Browse files- js/run/bench-worker.js +63 -20
- js/run/controller.js +11 -11
- js/run/core.js +60 -21
js/run/bench-worker.js
CHANGED
|
@@ -126,25 +126,61 @@ async function runOne({ params, stream }) {
|
|
| 126 |
});
|
| 127 |
log('WASM module loaded');
|
| 128 |
|
| 129 |
-
// ─── Stream the model into
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
}
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
let
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
}
|
| 146 |
-
Module.FS.close(memfsHandle);
|
| 147 |
-
log(`Model written to /model.gguf (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
|
| 148 |
|
| 149 |
// ─── Init backend ───
|
| 150 |
status('initializing', 'Initializing llama.cpp backends...');
|
|
@@ -164,12 +200,13 @@ async function runOne({ params, stream }) {
|
|
| 164 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
| 165 |
log('Model loaded');
|
| 166 |
|
| 167 |
-
//
|
|
|
|
|
|
|
| 168 |
try {
|
| 169 |
Module.FS.unlink('/model.gguf');
|
| 170 |
-
log('Freed model file from virtual FS');
|
| 171 |
} catch (err) {
|
| 172 |
-
log(`Warning: could not remove model
|
| 173 |
}
|
| 174 |
|
| 175 |
// ─── Inference ───
|
|
@@ -227,6 +264,12 @@ async function runOne({ params, stream }) {
|
|
| 227 |
|
| 228 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
result.status = 'done';
|
| 231 |
status('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
|
| 232 |
log(
|
|
|
|
| 126 |
});
|
| 127 |
log('WASM module loaded');
|
| 128 |
|
| 129 |
+
// ─── Stream the model into the WASM heap (HeapFS-style) ───
|
| 130 |
+
// Avoid the JS-side MEMFS staging buffer by allocating space inside the
|
| 131 |
+
// WASM heap with _malloc and writing chunks directly via HEAPU8.set. Then
|
| 132 |
+
// register the file with MEMFS using a Uint8Array view backed by the heap
|
| 133 |
+
// region, so llama.cpp's mmap can take the zero-copy branch in MEMFS.mmap
|
| 134 |
+
// (which fires when contents.buffer === HEAP8.buffer).
|
| 135 |
+
//
|
| 136 |
+
// Heap growth during bench_init/bench_load detaches old views, so we
|
| 137 |
+
// override node.contents with a getter that always rebuilds the view
|
| 138 |
+
// from the saved pointer + length against the current Module.HEAPU8.
|
| 139 |
+
if (!(contentLength > 0)) {
|
| 140 |
+
throw new Error('content-length is required for streaming into WASM heap');
|
| 141 |
}
|
| 142 |
+
status('downloading', 'Streaming model into WASM heap...');
|
| 143 |
+
|
| 144 |
+
let modelPtr = Module._malloc(contentLength);
|
| 145 |
+
if (!modelPtr) {
|
| 146 |
+
throw new Error(
|
| 147 |
+
`_malloc(${(contentLength / (1024 * 1024)).toFixed(0)} MB) failed — wasm heap exhausted`
|
| 148 |
+
);
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
try {
|
| 152 |
+
const reader = stream.getReader();
|
| 153 |
+
let downloaded = 0;
|
| 154 |
+
while (true) {
|
| 155 |
+
const { done, value } = await reader.read();
|
| 156 |
+
if (done) break;
|
| 157 |
+
Module.HEAPU8.set(value, modelPtr + downloaded);
|
| 158 |
+
downloaded += value.length;
|
| 159 |
+
post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
|
| 160 |
+
}
|
| 161 |
+
log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
|
| 162 |
+
|
| 163 |
+
// Register as a MEMFS file with a heap-backed view. canOwn=true so MEMFS
|
| 164 |
+
// doesn't make its own copy.
|
| 165 |
+
const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
|
| 166 |
+
Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
|
| 167 |
+
|
| 168 |
+
// Replace contents with a getter — heap growth (e.g. when llama.cpp
|
| 169 |
+
// allocates KV cache) replaces Module.HEAPU8.buffer, which would
|
| 170 |
+
// detach our static view. The getter rebuilds against the live buffer.
|
| 171 |
+
const node = Module.FS.lookupPath('/model.gguf').node;
|
| 172 |
+
Object.defineProperty(node, 'contents', {
|
| 173 |
+
get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
|
| 174 |
+
set: () => { /* read-only file */ },
|
| 175 |
+
configurable: true,
|
| 176 |
+
});
|
| 177 |
+
// usedBytes is read by MEMFS for stat() — keep it accurate.
|
| 178 |
+
node.usedBytes = contentLength;
|
| 179 |
+
} catch (err) {
|
| 180 |
+
Module._free(modelPtr);
|
| 181 |
+
modelPtr = 0;
|
| 182 |
+
throw err;
|
| 183 |
}
|
|
|
|
|
|
|
| 184 |
|
| 185 |
// ─── Init backend ───
|
| 186 |
status('initializing', 'Initializing llama.cpp backends...');
|
|
|
|
| 200 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
| 201 |
log('Model loaded');
|
| 202 |
|
| 203 |
+
// Drop the MEMFS node — the bytes themselves stay alive in the WASM heap
|
| 204 |
+
// because llama.cpp's mmap captured a pointer into our _malloc'd region.
|
| 205 |
+
// We free that region after bench_exit.
|
| 206 |
try {
|
| 207 |
Module.FS.unlink('/model.gguf');
|
|
|
|
| 208 |
} catch (err) {
|
| 209 |
+
log(`Warning: could not remove model FS node: ${err.message}`);
|
| 210 |
}
|
| 211 |
|
| 212 |
// ─── Inference ───
|
|
|
|
| 264 |
|
| 265 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
| 266 |
|
| 267 |
+
// Free the heap-resident model bytes now that llama.cpp has unmapped.
|
| 268 |
+
if (modelPtr) {
|
| 269 |
+
Module._free(modelPtr);
|
| 270 |
+
modelPtr = 0;
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
result.status = 'done';
|
| 274 |
status('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
|
| 275 |
log(
|
js/run/controller.js
CHANGED
|
@@ -991,19 +991,19 @@ async function runVariantWithIterations(v, row) {
|
|
| 991 |
cpuResult = { status: 'error', error: err.message || String(err) };
|
| 992 |
}
|
| 993 |
|
| 994 |
-
if (
|
| 995 |
-
|
| 996 |
-
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
}
|
| 1005 |
|
| 1006 |
-
const refTokenIds = (cpuResult.metrics?.token_ids || []).join(',');
|
| 1007 |
|
| 1008 |
// ─── GPU iterations ───
|
| 1009 |
const gpuSamples = [];
|
|
|
|
| 991 |
cpuResult = { status: 'error', error: err.message || String(err) };
|
| 992 |
}
|
| 993 |
|
| 994 |
+
// CPU baseline is "best effort": if it fails (typically OOM on a tight
|
| 995 |
+
// tab), keep going with GPU runs but skip the consistency check, since
|
| 996 |
+
// we'd have no reference token IDs to compare against. The user still
|
| 997 |
+
// gets prefill/decode metrics — just no agreement-rate number.
|
| 998 |
+
const cpuOk = cpuResult.status === 'done';
|
| 999 |
+
if (!cpuOk) {
|
| 1000 |
+
logLine(
|
| 1001 |
+
`CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU runs, skipping consistency check.`
|
| 1002 |
+
);
|
| 1003 |
+
row.setStatus('cpu-skipped', 'continuing with GPU only');
|
| 1004 |
}
|
| 1005 |
|
| 1006 |
+
const refTokenIds = cpuOk ? (cpuResult.metrics?.token_ids || []).join(',') : '';
|
| 1007 |
|
| 1008 |
// ─── GPU iterations ───
|
| 1009 |
const gpuSamples = [];
|
js/run/core.js
CHANGED
|
@@ -49,6 +49,9 @@ export async function runBenchmarkCore({
|
|
| 49 |
output: '',
|
| 50 |
};
|
| 51 |
|
|
|
|
|
|
|
|
|
|
| 52 |
try {
|
| 53 |
// WebGPU adapter probe — informational only.
|
| 54 |
if (navigator.gpu) {
|
|
@@ -72,7 +75,7 @@ export async function runBenchmarkCore({
|
|
| 72 |
onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
|
| 73 |
await loadBenchScriptOnce(buildType);
|
| 74 |
|
| 75 |
-
|
| 76 |
print: (text) => onLog(`[wasm] ${text}`),
|
| 77 |
printErr: (text) => onLog(`[wasm:err] ${text}`),
|
| 78 |
// Catch Emscripten abort() — Firefox can abort during Asyncify init.
|
|
@@ -93,25 +96,49 @@ export async function runBenchmarkCore({
|
|
| 93 |
contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
|
| 94 |
}`);
|
| 95 |
|
| 96 |
-
// Stream directly into
|
| 97 |
-
//
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
}
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
if (done) break;
|
| 108 |
-
Module.FS.write(memfsHandle, value, 0, value.length, downloaded);
|
| 109 |
-
downloaded += value.length;
|
| 110 |
-
const fraction = contentLength ? downloaded / contentLength : 0;
|
| 111 |
-
onProgress(fraction, downloaded, contentLength);
|
| 112 |
}
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
// Init backend.
|
| 117 |
onStatus('initializing', 'Initializing llama.cpp backends...');
|
|
@@ -133,12 +160,13 @@ export async function runBenchmarkCore({
|
|
| 133 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
| 134 |
onLog('Model loaded');
|
| 135 |
|
| 136 |
-
//
|
|
|
|
|
|
|
| 137 |
try {
|
| 138 |
Module.FS.unlink('/model.gguf');
|
| 139 |
-
onLog('Freed model file from virtual FS');
|
| 140 |
} catch (e) {
|
| 141 |
-
onLog(`Warning: could not remove model
|
| 142 |
}
|
| 143 |
|
| 144 |
// Run inference.
|
|
@@ -198,6 +226,12 @@ export async function runBenchmarkCore({
|
|
| 198 |
onLog('Calling bench_exit()...');
|
| 199 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
result.status = 'done';
|
| 202 |
onStatus('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
|
| 203 |
onLog(
|
|
@@ -216,6 +250,11 @@ export async function runBenchmarkCore({
|
|
| 216 |
onStatus('error', `Error: ${err.message}`);
|
| 217 |
onLog(`ERROR: ${err.message}`);
|
| 218 |
if (err.stack) onLog(err.stack);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
return result;
|
| 220 |
}
|
| 221 |
}
|
|
|
|
| 49 |
output: '',
|
| 50 |
};
|
| 51 |
|
| 52 |
+
// Declared outside the try so the catch can free our heap allocation.
|
| 53 |
+
let Module;
|
| 54 |
+
|
| 55 |
try {
|
| 56 |
// WebGPU adapter probe — informational only.
|
| 57 |
if (navigator.gpu) {
|
|
|
|
| 75 |
onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
|
| 76 |
await loadBenchScriptOnce(buildType);
|
| 77 |
|
| 78 |
+
Module = await globalThis.createBenchModule({
|
| 79 |
print: (text) => onLog(`[wasm] ${text}`),
|
| 80 |
printErr: (text) => onLog(`[wasm:err] ${text}`),
|
| 81 |
// Catch Emscripten abort() — Firefox can abort during Asyncify init.
|
|
|
|
| 96 |
contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
|
| 97 |
}`);
|
| 98 |
|
| 99 |
+
// Stream the GGUF directly into the WASM heap (HeapFS-style) to avoid a
|
| 100 |
+
// duplicate JS-side MEMFS staging buffer. _malloc reserves a region in
|
| 101 |
+
// the linear memory; HEAPU8.set writes chunks in place. We then expose
|
| 102 |
+
// the region as a MEMFS file with `canOwn=true` so MEMFS does not copy,
|
| 103 |
+
// and override node.contents with a getter that always rebuilds the
|
| 104 |
+
// view from the saved pointer — this survives the heap growth that
|
| 105 |
+
// llama.cpp triggers during bench_init/bench_load.
|
| 106 |
+
if (!(contentLength > 0)) {
|
| 107 |
+
throw new Error('content-length is required for streaming into WASM heap');
|
| 108 |
}
|
| 109 |
+
let modelPtr = Module._malloc(contentLength);
|
| 110 |
+
if (!modelPtr) {
|
| 111 |
+
throw new Error(
|
| 112 |
+
`_malloc(${(contentLength / (1024 * 1024)).toFixed(0)} MB) failed — wasm heap exhausted`
|
| 113 |
+
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
}
|
| 115 |
+
try {
|
| 116 |
+
const reader = stream.getReader();
|
| 117 |
+
let downloaded = 0;
|
| 118 |
+
while (true) {
|
| 119 |
+
const { done, value } = await reader.read();
|
| 120 |
+
if (done) break;
|
| 121 |
+
Module.HEAPU8.set(value, modelPtr + downloaded);
|
| 122 |
+
downloaded += value.length;
|
| 123 |
+
onProgress(downloaded / contentLength, downloaded, contentLength);
|
| 124 |
+
}
|
| 125 |
+
onLog(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
|
| 126 |
+
|
| 127 |
+
const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
|
| 128 |
+
Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
|
| 129 |
+
const node = Module.FS.lookupPath('/model.gguf').node;
|
| 130 |
+
Object.defineProperty(node, 'contents', {
|
| 131 |
+
get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
|
| 132 |
+
set: () => { /* read-only */ },
|
| 133 |
+
configurable: true,
|
| 134 |
+
});
|
| 135 |
+
node.usedBytes = contentLength;
|
| 136 |
+
} catch (err) {
|
| 137 |
+
Module._free(modelPtr);
|
| 138 |
+
throw err;
|
| 139 |
+
}
|
| 140 |
+
// Track on the result object so we can free in the success/exit paths.
|
| 141 |
+
result._modelPtr = modelPtr;
|
| 142 |
|
| 143 |
// Init backend.
|
| 144 |
onStatus('initializing', 'Initializing llama.cpp backends...');
|
|
|
|
| 160 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
| 161 |
onLog('Model loaded');
|
| 162 |
|
| 163 |
+
// Drop the MEMFS node — llama.cpp's mmap captured a pointer into the
|
| 164 |
+
// _malloc'd region in the WASM heap, so the bytes themselves stay alive
|
| 165 |
+
// until we _free below after bench_exit.
|
| 166 |
try {
|
| 167 |
Module.FS.unlink('/model.gguf');
|
|
|
|
| 168 |
} catch (e) {
|
| 169 |
+
onLog(`Warning: could not remove model FS node: ${e.message}`);
|
| 170 |
}
|
| 171 |
|
| 172 |
// Run inference.
|
|
|
|
| 226 |
onLog('Calling bench_exit()...');
|
| 227 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
| 228 |
|
| 229 |
+
// Free the heap-resident model bytes now that llama.cpp has unmapped.
|
| 230 |
+
if (result._modelPtr) {
|
| 231 |
+
Module._free(result._modelPtr);
|
| 232 |
+
delete result._modelPtr;
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
result.status = 'done';
|
| 236 |
onStatus('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
|
| 237 |
onLog(
|
|
|
|
| 250 |
onStatus('error', `Error: ${err.message}`);
|
| 251 |
onLog(`ERROR: ${err.message}`);
|
| 252 |
if (err.stack) onLog(err.stack);
|
| 253 |
+
// Best-effort: release the model heap region so a re-run can reuse it.
|
| 254 |
+
if (result._modelPtr && Module?._free) {
|
| 255 |
+
try { Module._free(result._modelPtr); } catch { /* ignore */ }
|
| 256 |
+
delete result._modelPtr;
|
| 257 |
+
}
|
| 258 |
return result;
|
| 259 |
}
|
| 260 |
}
|