Spaces:
Running
Running
GitHub Actions committed on
Commit ·
e72601b
1
Parent(s): e6a49d5
sync from abhijitramesh/webgpu-bench@1be8b82935
Browse files- harness.js +23 -8
- js/dataset.js +14 -0
- js/run/bench-worker.js +146 -98
- js/run/controller.js +154 -118
- js/run/core.js +201 -90
- js/tables.js +26 -6
- run.html +9 -1
harness.js
CHANGED
|
@@ -21,13 +21,22 @@ window.addEventListener('unhandledrejection', (e) => {
|
|
| 21 |
|
| 22 |
(async function () {
|
| 23 |
const params = new URLSearchParams(window.location.search);
|
| 24 |
-
const modelFile
|
| 25 |
-
const hfRepo
|
| 26 |
-
const
|
| 27 |
-
const
|
| 28 |
-
const
|
| 29 |
-
const
|
| 30 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
const hasJspi = 'Suspending' in WebAssembly;
|
| 33 |
|
|
@@ -73,7 +82,13 @@ window.addEventListener('unhandledrejection', (e) => {
|
|
| 73 |
|
| 74 |
const result = await runBenchmarkCore({
|
| 75 |
source: localSource(),
|
| 76 |
-
modelFile, hfRepo,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
onStatus, onProgress, onLog,
|
| 78 |
});
|
| 79 |
|
|
|
|
| 21 |
|
| 22 |
(async function () {
|
| 23 |
const params = new URLSearchParams(window.location.search);
|
| 24 |
+
const modelFile = params.get('model') || '';
|
| 25 |
+
const hfRepo = params.get('hfRepo') || 'unsloth/Llama-3.2-1B-Instruct-GGUF';
|
| 26 |
+
const consistencyPrompt = params.get('prompt') || 'Hello, how are you?';
|
| 27 |
+
const consistencyNPredict = parseInt(params.get('nPredict') || '128', 10);
|
| 28 |
+
const nPrompt = parseInt(params.get('nPrompt') || '512', 10);
|
| 29 |
+
const nGen = parseInt(params.get('nGen') || '128', 10);
|
| 30 |
+
const nReps = parseInt(params.get('nReps') || '5', 10);
|
| 31 |
+
const nCtx = parseInt(params.get('nCtx') || '2048', 10);
|
| 32 |
+
const nGpuLayers = parseInt(params.get('nGpuLayers') || '999', 10);
|
| 33 |
+
const refTokenIds = params.get('refTokenIds') || null;
|
| 34 |
+
// mode=perf → skip consistency entirely (e.g. for the GPU perf-only pass).
|
| 35 |
+
// mode=consistency → skip perf (e.g. CPU baseline pass that just needs token_ids).
|
| 36 |
+
// default 'both' runs both phases in one model load.
|
| 37 |
+
const mode = params.get('mode') || 'both';
|
| 38 |
+
const runConsistency = mode !== 'perf';
|
| 39 |
+
const runPerf = mode !== 'consistency';
|
| 40 |
|
| 41 |
const hasJspi = 'Suspending' in WebAssembly;
|
| 42 |
|
|
|
|
| 82 |
|
| 83 |
const result = await runBenchmarkCore({
|
| 84 |
source: localSource(),
|
| 85 |
+
modelFile, hfRepo,
|
| 86 |
+
consistencyPrompt, consistencyNPredict, refTokenIds,
|
| 87 |
+
runConsistency,
|
| 88 |
+
nPrompt: runPerf ? nPrompt : 0,
|
| 89 |
+
nGen: runPerf ? nGen : 0,
|
| 90 |
+
nReps,
|
| 91 |
+
nCtx, nGpuLayers,
|
| 92 |
onStatus, onProgress, onLog,
|
| 93 |
});
|
| 94 |
|
js/dataset.js
CHANGED
|
@@ -120,6 +120,13 @@ async function fetchRunsBatch(datasetRepo, files) {
|
|
| 120 |
produces. Keep field-for-field aligned with build-site.js so the merged
|
| 121 |
results are indistinguishable from the baseline. */
|
| 122 |
function flattenForDashboard(r, slug) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
return {
|
| 124 |
machineSlug: slug,
|
| 125 |
timestamp: r.timestamp,
|
|
@@ -137,6 +144,13 @@ function flattenForDashboard(r, slug) {
|
|
| 137 |
wallTimeMs: r.wallTimeMs,
|
| 138 |
prefill_tok_s: r.metrics?.prefill_tok_s ?? null,
|
| 139 |
decode_tok_s: r.metrics?.decode_tok_s ?? null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
n_p_eval: r.metrics?.n_p_eval ?? null,
|
| 141 |
t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
|
| 142 |
n_eval: r.metrics?.n_eval ?? null,
|
|
|
|
| 120 |
produces. Keep field-for-field aligned with build-site.js so the merged
|
| 121 |
results are indistinguishable from the baseline. */
|
| 122 |
function flattenForDashboard(r, slug) {
|
| 123 |
+
// New-format records have metrics.tests = [{name:'pp512',...},{name:'tg128',...}].
|
| 124 |
+
// Old-format records have flat metrics.prefill_tok_s / decode_tok_s only.
|
| 125 |
+
// Surface both shapes so the table can render llama-bench-style "avg \u00b1 stddev"
|
| 126 |
+
// when stddev is available without breaking on older rows.
|
| 127 |
+
const tests = Array.isArray(r.metrics?.tests) ? r.metrics.tests : null;
|
| 128 |
+
const pp = tests?.find(t => t.name?.startsWith('pp')) || null;
|
| 129 |
+
const tg = tests?.find(t => t.name?.startsWith('tg')) || null;
|
| 130 |
return {
|
| 131 |
machineSlug: slug,
|
| 132 |
timestamp: r.timestamp,
|
|
|
|
| 144 |
wallTimeMs: r.wallTimeMs,
|
| 145 |
prefill_tok_s: r.metrics?.prefill_tok_s ?? null,
|
| 146 |
decode_tok_s: r.metrics?.decode_tok_s ?? null,
|
| 147 |
+
// llama-bench shape: per-test stddev + the test labels (pp{N} / tg{N})
|
| 148 |
+
prefill_stddev_ts: pp?.stddev_ts ?? r.metrics?.prefill_tok_s_stdev ?? null,
|
| 149 |
+
decode_stddev_ts: tg?.stddev_ts ?? r.metrics?.decode_tok_s_stdev ?? null,
|
| 150 |
+
pp_test_name: pp?.name ?? null,
|
| 151 |
+
tg_test_name: tg?.name ?? null,
|
| 152 |
+
pp_n_prompt: pp?.n_prompt ?? r.nPrompt ?? null,
|
| 153 |
+
tg_n_gen: tg?.n_gen ?? r.nGen ?? null,
|
| 154 |
n_p_eval: r.metrics?.n_p_eval ?? null,
|
| 155 |
t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
|
| 156 |
n_eval: r.metrics?.n_eval ?? null,
|
js/run/bench-worker.js
CHANGED
|
@@ -6,12 +6,15 @@
|
|
| 6 |
//
|
| 7 |
// main → worker: {
|
| 8 |
// type: 'run',
|
| 9 |
-
// params: {
|
| 10 |
-
//
|
| 11 |
-
//
|
| 12 |
-
//
|
| 13 |
-
//
|
| 14 |
-
//
|
|
|
|
|
|
|
|
|
|
| 15 |
// stream?: ReadableStream<Uint8Array>, // TRANSFERRED
|
| 16 |
// buffer?: ArrayBuffer // TRANSFERRED (mobile fallback)
|
| 17 |
// }
|
|
@@ -25,15 +28,57 @@
|
|
| 25 |
// decode loops ignore signals, and termination is the only reliable way to
|
| 26 |
// stop an in-flight WASM call.
|
| 27 |
//
|
| 28 |
-
// NOTE ON DUPLICATION: the orchestration below mirrors runBenchmarkCore()
|
| 29 |
-
// in site/js/run/core.js. core.js stays the authoritative
|
| 30 |
-
// (used by harness.js + runner.js Playwright harness). When
|
| 31 |
-
// change the other.
|
| 32 |
|
| 33 |
const post = (msg) => self.postMessage(msg);
|
| 34 |
const log = (line) => post({ type: 'log', line });
|
| 35 |
const status = (s, msg) => post({ type: 'status', status: s, msg });
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
self.onmessage = async (e) => {
|
| 38 |
const { type } = e.data || {};
|
| 39 |
if (type !== 'run') {
|
|
@@ -58,12 +103,18 @@ self.onmessage = async (e) => {
|
|
| 58 |
async function runOne({ params, stream, buffer }) {
|
| 59 |
const {
|
| 60 |
buildType,
|
| 61 |
-
|
| 62 |
-
nPredict,
|
| 63 |
nCtx,
|
| 64 |
nGpuLayers,
|
|
|
|
|
|
|
|
|
|
| 65 |
refTokenIds,
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
} = params;
|
| 68 |
if (!stream && !buffer) {
|
| 69 |
throw new Error('runOne: exactly one of `stream` or `buffer` must be provided');
|
|
@@ -85,9 +136,6 @@ async function runOne({ params, stream, buffer }) {
|
|
| 85 |
try {
|
| 86 |
const adapter = await self.navigator.gpu.requestAdapter();
|
| 87 |
if (adapter) {
|
| 88 |
-
// GPUAdapterInfo is a host object — structured-clone can't serialize
|
| 89 |
-
// it across postMessage. Copy the fields we care about into a plain
|
| 90 |
-
// object before storing on result.
|
| 91 |
const info = adapter.info;
|
| 92 |
result.gpuAdapterInfo = info ? {
|
| 93 |
vendor: info.vendor || '',
|
|
@@ -118,10 +166,6 @@ async function runOne({ params, stream, buffer }) {
|
|
| 118 |
}
|
| 119 |
|
| 120 |
const Module = await self.createBenchModule({
|
| 121 |
-
// In a worker loaded via importScripts(), Emscripten can't infer the
|
| 122 |
-
// script's directory and falls back to self.location (this worker's
|
| 123 |
-
// own URL), which makes it look for bench.wasm next to bench-worker.js.
|
| 124 |
-
// Pin the lookup to the build directory so it grabs the right file.
|
| 125 |
locateFile: (filename) => `/build/${buildType}/${filename}`,
|
| 126 |
print: (text) => log(`[wasm] ${text}`),
|
| 127 |
printErr: (text) => log(`[wasm:err] ${text}`),
|
|
@@ -135,15 +179,6 @@ async function runOne({ params, stream, buffer }) {
|
|
| 135 |
log('WASM module loaded');
|
| 136 |
|
| 137 |
// ─── Stream the model into the WASM heap (HeapFS-style) ───
|
| 138 |
-
// Avoid the JS-side MEMFS staging buffer by allocating space inside the
|
| 139 |
-
// WASM heap with _malloc and writing chunks directly via HEAPU8.set. Then
|
| 140 |
-
// register the file with MEMFS using a Uint8Array view backed by the heap
|
| 141 |
-
// region, so llama.cpp's mmap can take the zero-copy branch in MEMFS.mmap
|
| 142 |
-
// (which fires when contents.buffer === HEAP8.buffer).
|
| 143 |
-
//
|
| 144 |
-
// Heap growth during bench_init/bench_load detaches old views, so we
|
| 145 |
-
// override node.contents with a getter that always rebuilds the view
|
| 146 |
-
// from the saved pointer + length against the current Module.HEAPU8.
|
| 147 |
if (!(contentLength > 0)) {
|
| 148 |
throw new Error('content-length is required for streaming into WASM heap');
|
| 149 |
}
|
|
@@ -168,10 +203,6 @@ async function runOne({ params, stream, buffer }) {
|
|
| 168 |
post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
|
| 169 |
}
|
| 170 |
} else {
|
| 171 |
-
// Buffered path (mobile fallback): the whole file is already in
|
| 172 |
-
// memory. Copy it into the WASM heap in one shot. Progress was
|
| 173 |
-
// emitted on the main thread while buffering, so we just report 100%
|
| 174 |
-
// here for the loading phase.
|
| 175 |
const view = new Uint8Array(buffer);
|
| 176 |
if (view.byteLength !== contentLength) {
|
| 177 |
log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
|
|
@@ -182,21 +213,15 @@ async function runOne({ params, stream, buffer }) {
|
|
| 182 |
}
|
| 183 |
log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
|
| 184 |
|
| 185 |
-
// Register as a MEMFS file with a heap-backed view. canOwn=true so MEMFS
|
| 186 |
-
// doesn't make its own copy.
|
| 187 |
const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
|
| 188 |
Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
|
| 189 |
|
| 190 |
-
// Replace contents with a getter — heap growth (e.g. when llama.cpp
|
| 191 |
-
// allocates KV cache) replaces Module.HEAPU8.buffer, which would
|
| 192 |
-
// detach our static view. The getter rebuilds against the live buffer.
|
| 193 |
const node = Module.FS.lookupPath('/model.gguf').node;
|
| 194 |
Object.defineProperty(node, 'contents', {
|
| 195 |
get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
|
| 196 |
set: () => { /* read-only file */ },
|
| 197 |
configurable: true,
|
| 198 |
});
|
| 199 |
-
// usedBytes is read by MEMFS for stat() — keep it accurate.
|
| 200 |
node.usedBytes = contentLength;
|
| 201 |
} catch (err) {
|
| 202 |
Module._free(modelPtr);
|
|
@@ -222,86 +247,109 @@ async function runOne({ params, stream, buffer }) {
|
|
| 222 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
| 223 |
log('Model loaded');
|
| 224 |
|
| 225 |
-
// Drop the MEMFS node — the bytes themselves stay alive in the WASM heap
|
| 226 |
-
// because llama.cpp's mmap captured a pointer into our _malloc'd region.
|
| 227 |
-
// We free that region after bench_exit.
|
| 228 |
try {
|
| 229 |
Module.FS.unlink('/model.gguf');
|
| 230 |
} catch (err) {
|
| 231 |
log(`Warning: could not remove model FS node: ${err.message}`);
|
| 232 |
}
|
| 233 |
|
| 234 |
-
// ───
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
);
|
| 243 |
-
log(`bench_run returned: ${String(resultJson).substring(0, 200)}`);
|
| 244 |
-
|
| 245 |
-
const inferResult = JSON.parse(resultJson);
|
| 246 |
-
if (inferResult.error) throw new Error(`Inference error: ${inferResult.error}`);
|
| 247 |
-
|
| 248 |
-
const prefillTokS = inferResult.t_p_eval_ms > 0
|
| 249 |
-
? (inferResult.n_p_eval / (inferResult.t_p_eval_ms / 1000)).toFixed(2)
|
| 250 |
-
: 'N/A';
|
| 251 |
-
const decodeTokS = inferResult.t_eval_ms > 0
|
| 252 |
-
? (inferResult.n_eval / (inferResult.t_eval_ms / 1000)).toFixed(2)
|
| 253 |
-
: 'N/A';
|
| 254 |
-
|
| 255 |
-
result.metrics = {
|
| 256 |
-
...inferResult,
|
| 257 |
-
prefill_tok_s: parseFloat(prefillTokS) || 0,
|
| 258 |
-
decode_tok_s: parseFloat(decodeTokS) || 0,
|
| 259 |
-
};
|
| 260 |
-
result.output = inferResult.output || '';
|
| 261 |
-
|
| 262 |
-
// ─── Consistency check ───
|
| 263 |
-
if (refTokenIds && nGpuLayers > 0 && inferResult.token_ids?.length > 0) {
|
| 264 |
-
log('Running forced-decoding consistency check...');
|
| 265 |
-
const evalJson = await Module.ccall(
|
| 266 |
-
'bench_eval_tokens',
|
| 267 |
-
'string',
|
| 268 |
-
['string', 'string'],
|
| 269 |
-
[prompt, refTokenIds],
|
| 270 |
{ async: true },
|
| 271 |
);
|
| 272 |
-
const
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
log(
|
| 278 |
-
`Consistency: ${(
|
| 279 |
-
`${
|
|
|
|
| 280 |
);
|
| 281 |
-
|
| 282 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
}
|
|
|
|
| 284 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
}
|
| 286 |
|
| 287 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
| 288 |
|
| 289 |
-
// Free the heap-resident model bytes now that llama.cpp has unmapped.
|
| 290 |
if (modelPtr) {
|
| 291 |
Module._free(modelPtr);
|
| 292 |
modelPtr = 0;
|
| 293 |
}
|
| 294 |
|
| 295 |
result.status = 'done';
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
);
|
| 301 |
-
log(
|
| 302 |
-
`Decode: ${decodeTokS} tok/s (${inferResult.n_eval} tokens in ` +
|
| 303 |
-
`${inferResult.t_eval_ms.toFixed(0)} ms)`,
|
| 304 |
-
);
|
| 305 |
-
log(`Output: ${(inferResult.output || '').substring(0, 200)}`);
|
| 306 |
return result;
|
| 307 |
}
|
|
|
|
| 6 |
//
|
| 7 |
// main → worker: {
|
| 8 |
// type: 'run',
|
| 9 |
+
// params: {
|
| 10 |
+
// buildType, contentLength,
|
| 11 |
+
// // model load
|
| 12 |
+
// nCtx, nGpuLayers,
|
| 13 |
+
// // consistency phase (set consistencyPrompt to '' to skip)
|
| 14 |
+
// consistencyPrompt, consistencyNPredict, refTokenIds,
|
| 15 |
+
// // perf phase
|
| 16 |
+
// nPrompt, nGen, nReps, noWarmup,
|
| 17 |
+
// },
|
| 18 |
// stream?: ReadableStream<Uint8Array>, // TRANSFERRED
|
| 19 |
// buffer?: ArrayBuffer // TRANSFERRED (mobile fallback)
|
| 20 |
// }
|
|
|
|
| 28 |
// decode loops ignore signals, and termination is the only reliable way to
|
| 29 |
// stop an in-flight WASM call.
|
| 30 |
//
|
| 31 |
+
// NOTE ON DUPLICATION: the orchestration below mirrors runBenchmarkCore() +
|
| 32 |
+
// runBenchActions() in site/js/run/core.js. core.js stays the authoritative
|
| 33 |
+
// main-thread path (used by harness.js + runner.js Playwright harness). When
|
| 34 |
+
// changing one, change the other.
|
| 35 |
|
| 36 |
const post = (msg) => self.postMessage(msg);
|
| 37 |
const log = (line) => post({ type: 'log', line });
|
| 38 |
const status = (s, msg) => post({ type: 'status', status: s, msg });
|
| 39 |
|
| 40 |
+
// Collapse raw per-repetition timings (nanoseconds) into the llama-bench
// result shape: mean / sample-stddev of the durations, plus the same
// statistics for the derived tokens-per-second figures.
// Mirrors core.js buildTest — keep them identical.
//
//   name        test label, e.g. "pp512" or "tg128"
//   n_prompt    prompt tokens processed per repetition (0 for a tg test)
//   n_gen       tokens generated per repetition (0 for a pp test)
//   samples_ns  one wall-clock duration per repetition, in nanoseconds
//
// Returns { name, n_prompt, n_gen, avg_ns, stddev_ns, avg_ts, stddev_ts,
// samples_ns, samples_ts }. Nanosecond values are rounded to integers and
// tokens/second values to two decimals. An empty sample list yields an
// all-zero record.
function buildTest(name, n_prompt, n_gen, samples_ns) {
  const count = samples_ns.length;
  if (count === 0) {
    return { name, n_prompt, n_gen, avg_ns: 0, stddev_ns: 0, avg_ts: 0, stddev_ts: 0, samples_ns: [], samples_ts: [] };
  }

  // Arithmetic mean, accumulated left-to-right.
  const meanOf = (values) => {
    let total = 0;
    for (const value of values) total += value;
    return total / count;
  };

  // Sample standard deviation (n - 1 denominator); 0 for a single sample.
  const spreadOf = (values, centre) => {
    if (count <= 1) return 0;
    let squares = 0;
    for (const value of values) squares += (value - centre) * (value - centre);
    return Math.sqrt(squares / (count - 1));
  };

  const toTwoDecimals = (value) => Math.round(value * 100) / 100;

  const tokensPerRep = n_prompt + n_gen;
  // A non-positive duration cannot yield a rate; report 0 t/s for it.
  const ratePerRep = samples_ns.map((elapsed) => {
    if (elapsed > 0) return (1e9 * tokensPerRep) / elapsed;
    return 0;
  });

  const meanNs = meanOf(samples_ns);
  const meanTs = meanOf(ratePerRep);

  return {
    name,
    n_prompt,
    n_gen,
    avg_ns: Math.round(meanNs),
    stddev_ns: Math.round(spreadOf(samples_ns, meanNs)),
    avg_ts: toTwoDecimals(meanTs),
    stddev_ts: toTwoDecimals(spreadOf(ratePerRep, meanTs)),
    samples_ns: samples_ns.map((elapsed) => Math.round(elapsed)),
    samples_ts: ratePerRep.map(toTwoDecimals),
  };
}
|
| 72 |
+
|
| 73 |
+
// Decode the JSON string returned by a bench_* C entry point.
//
//   label  call-site name used to prefix any error message
//   raw    JSON text returned by Module.ccall
//
// Returns the parsed result object. Throws an Error prefixed with `label`
// when the text is not valid JSON, when it decodes to something other than
// an object (callers read fields off the result, so `null` or a primitive
// would otherwise fail later with an unlabeled TypeError or silently
// yield undefined fields), or when the payload carries an `error` field.
function parseBenchResult(label, raw) {
  let r;
  try {
    r = JSON.parse(raw);
  } catch (e) {
    // Keep the original SyntaxError reachable for debugging.
    throw new Error(`${label}: invalid JSON from C (${e.message})`, { cause: e });
  }
  if (r === null || typeof r !== 'object') {
    throw new Error(`${label}: expected a JSON object from C, got ${r === null ? 'null' : typeof r}`);
  }
  if (r.error) throw new Error(`${label}: ${r.error}`);
  return r;
}
|
| 81 |
+
|
| 82 |
self.onmessage = async (e) => {
|
| 83 |
const { type } = e.data || {};
|
| 84 |
if (type !== 'run') {
|
|
|
|
| 103 |
async function runOne({ params, stream, buffer }) {
|
| 104 |
const {
|
| 105 |
buildType,
|
| 106 |
+
contentLength,
|
|
|
|
| 107 |
nCtx,
|
| 108 |
nGpuLayers,
|
| 109 |
+
// consistency
|
| 110 |
+
consistencyPrompt,
|
| 111 |
+
consistencyNPredict,
|
| 112 |
refTokenIds,
|
| 113 |
+
// perf
|
| 114 |
+
nPrompt,
|
| 115 |
+
nGen,
|
| 116 |
+
nReps,
|
| 117 |
+
noWarmup,
|
| 118 |
} = params;
|
| 119 |
if (!stream && !buffer) {
|
| 120 |
throw new Error('runOne: exactly one of `stream` or `buffer` must be provided');
|
|
|
|
| 136 |
try {
|
| 137 |
const adapter = await self.navigator.gpu.requestAdapter();
|
| 138 |
if (adapter) {
|
|
|
|
|
|
|
|
|
|
| 139 |
const info = adapter.info;
|
| 140 |
result.gpuAdapterInfo = info ? {
|
| 141 |
vendor: info.vendor || '',
|
|
|
|
| 166 |
}
|
| 167 |
|
| 168 |
const Module = await self.createBenchModule({
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
locateFile: (filename) => `/build/${buildType}/${filename}`,
|
| 170 |
print: (text) => log(`[wasm] ${text}`),
|
| 171 |
printErr: (text) => log(`[wasm:err] ${text}`),
|
|
|
|
| 179 |
log('WASM module loaded');
|
| 180 |
|
| 181 |
// ─── Stream the model into the WASM heap (HeapFS-style) ───
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
if (!(contentLength > 0)) {
|
| 183 |
throw new Error('content-length is required for streaming into WASM heap');
|
| 184 |
}
|
|
|
|
| 203 |
post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
|
| 204 |
}
|
| 205 |
} else {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
const view = new Uint8Array(buffer);
|
| 207 |
if (view.byteLength !== contentLength) {
|
| 208 |
log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
|
|
|
|
| 213 |
}
|
| 214 |
log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
|
| 215 |
|
|
|
|
|
|
|
| 216 |
const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
|
| 217 |
Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
|
| 218 |
|
|
|
|
|
|
|
|
|
|
| 219 |
const node = Module.FS.lookupPath('/model.gguf').node;
|
| 220 |
Object.defineProperty(node, 'contents', {
|
| 221 |
get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
|
| 222 |
set: () => { /* read-only file */ },
|
| 223 |
configurable: true,
|
| 224 |
});
|
|
|
|
| 225 |
node.usedBytes = contentLength;
|
| 226 |
} catch (err) {
|
| 227 |
Module._free(modelPtr);
|
|
|
|
| 247 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
| 248 |
log('Model loaded');
|
| 249 |
|
|
|
|
|
|
|
|
|
|
| 250 |
try {
|
| 251 |
Module.FS.unlink('/model.gguf');
|
| 252 |
} catch (err) {
|
| 253 |
log(`Warning: could not remove model FS node: ${err.message}`);
|
| 254 |
}
|
| 255 |
|
| 256 |
+
// ─── Consistency phase ───
|
| 257 |
+
if (consistencyPrompt) {
|
| 258 |
+
status('consistency', 'Running consistency check...');
|
| 259 |
+
log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
|
| 260 |
+
const raw = await Module.ccall(
|
| 261 |
+
'bench_run', 'string',
|
| 262 |
+
['string', 'number'],
|
| 263 |
+
[consistencyPrompt, consistencyNPredict],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
{ async: true },
|
| 265 |
);
|
| 266 |
+
const r = parseBenchResult('bench_run', raw);
|
| 267 |
+
result.output = r.output || '';
|
| 268 |
+
result.consistency = { token_ids: r.token_ids || [] };
|
| 269 |
+
|
| 270 |
+
if (refTokenIds) {
|
| 271 |
+
log('bench_eval_tokens — forced-decode vs CPU baseline');
|
| 272 |
+
const evalRaw = await Module.ccall(
|
| 273 |
+
'bench_eval_tokens', 'string',
|
| 274 |
+
['string', 'string'],
|
| 275 |
+
[consistencyPrompt, refTokenIds],
|
| 276 |
+
{ async: true },
|
| 277 |
+
);
|
| 278 |
+
const ev = parseBenchResult('bench_eval_tokens', evalRaw);
|
| 279 |
+
result.consistency = { ...result.consistency, ...ev };
|
| 280 |
log(
|
| 281 |
+
`Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
|
| 282 |
+
`${ev.n_agree}/${ev.n_tokens})` +
|
| 283 |
+
(ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
|
| 284 |
);
|
| 285 |
+
}
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
// ─── Perf phase (llama-bench style) ───
|
| 289 |
+
const wantPp = nPrompt > 0;
|
| 290 |
+
const wantTg = nGen > 0;
|
| 291 |
+
if (wantPp || wantTg) {
|
| 292 |
+
const tests = [];
|
| 293 |
+
|
| 294 |
+
if (wantPp) {
|
| 295 |
+
if (!noWarmup) {
|
| 296 |
+
status('perf', `warmup pp${nPrompt}`);
|
| 297 |
+
log(`bench_pp(${nPrompt}) — warmup`);
|
| 298 |
+
const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
|
| 299 |
+
parseBenchResult('bench_pp warmup', raw);
|
| 300 |
+
}
|
| 301 |
+
const samples_ns = [];
|
| 302 |
+
for (let i = 0; i < nReps; i++) {
|
| 303 |
+
status('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
|
| 304 |
+
const t0 = performance.now();
|
| 305 |
+
const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
|
| 306 |
+
const t_ns = (performance.now() - t0) * 1e6;
|
| 307 |
+
parseBenchResult('bench_pp', raw);
|
| 308 |
+
samples_ns.push(t_ns);
|
| 309 |
+
log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
|
| 310 |
}
|
| 311 |
+
tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
|
| 312 |
}
|
| 313 |
+
|
| 314 |
+
if (wantTg) {
|
| 315 |
+
if (!noWarmup) {
|
| 316 |
+
status('perf', `warmup tg`);
|
| 317 |
+
log('bench_tg(1) — warmup');
|
| 318 |
+
const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
|
| 319 |
+
parseBenchResult('bench_tg warmup', raw);
|
| 320 |
+
}
|
| 321 |
+
const samples_ns = [];
|
| 322 |
+
for (let i = 0; i < nReps; i++) {
|
| 323 |
+
status('perf', `tg${nGen} ${i + 1}/${nReps}`);
|
| 324 |
+
const t0 = performance.now();
|
| 325 |
+
const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
|
| 326 |
+
const t_ns = (performance.now() - t0) * 1e6;
|
| 327 |
+
parseBenchResult('bench_tg', raw);
|
| 328 |
+
samples_ns.push(t_ns);
|
| 329 |
+
log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
|
| 330 |
+
}
|
| 331 |
+
tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
result.metrics = {
|
| 335 |
+
tests,
|
| 336 |
+
n_prompt: wantPp ? nPrompt : 0,
|
| 337 |
+
n_gen: wantTg ? nGen : 0,
|
| 338 |
+
n_reps: nReps,
|
| 339 |
+
};
|
| 340 |
}
|
| 341 |
|
| 342 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
| 343 |
|
|
|
|
| 344 |
if (modelPtr) {
|
| 345 |
Module._free(modelPtr);
|
| 346 |
modelPtr = 0;
|
| 347 |
}
|
| 348 |
|
| 349 |
result.status = 'done';
|
| 350 |
+
const summary = result.metrics?.tests
|
| 351 |
+
?.map(t => `${t.name}: ${t.avg_ts.toFixed(2)} ± ${t.stddev_ts.toFixed(2)} t/s`)
|
| 352 |
+
.join(' | ') || 'no perf';
|
| 353 |
+
status('done', `Done! ${summary}`);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
return result;
|
| 355 |
}
|
js/run/controller.js
CHANGED
|
@@ -22,7 +22,9 @@ const DEFAULT_N_PREDICT = 128;
|
|
| 22 |
const DEFAULT_N_CTX = 2048;
|
| 23 |
const DEFAULT_N_GPU_LAYERS = 999;
|
| 24 |
const YIELD_BETWEEN_RUNS_MS = 500;
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
const DEFAULT_ITERATIONS = 5;
|
| 27 |
const MIN_ITERATIONS_FOR_SUBMIT = 5;
|
| 28 |
|
|
@@ -39,6 +41,8 @@ const state = {
|
|
| 39 |
results: [], // result records from the current session
|
| 40 |
hfSession: null, // { accessToken, expiresAt, userName } when signed in
|
| 41 |
iterations: DEFAULT_ITERATIONS,
|
|
|
|
|
|
|
| 42 |
mounted: false,
|
| 43 |
// Tracks variants the Run pipeline downloaded this session (as opposed to
|
| 44 |
// the standalone Download button or pre-existing cache). Only these are
|
|
@@ -628,15 +632,34 @@ function wireBatchSelect() {
|
|
| 628 |
});
|
| 629 |
}
|
| 630 |
|
| 631 |
-
function
|
| 632 |
-
const
|
| 633 |
-
if (
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
}
|
| 641 |
|
| 642 |
function submittableResults() {
|
|
@@ -767,8 +790,8 @@ function ensureProgressTable() {
|
|
| 767 |
<th>Model</th>
|
| 768 |
<th>Variant</th>
|
| 769 |
<th>Status</th>
|
| 770 |
-
<th class="num">
|
| 771 |
-
<th class="num">
|
| 772 |
<th class="num">Wall s</th>
|
| 773 |
<th>Error</th>
|
| 774 |
</tr>
|
|
@@ -815,11 +838,21 @@ function progressRowFor(v) {
|
|
| 815 |
fillFromRecord(record) {
|
| 816 |
tr.className = `run-row-${record.status === 'done' ? 'ok' : 'error'}`;
|
| 817 |
tr.querySelector('.status').textContent = record.status;
|
| 818 |
-
|
| 819 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 820 |
tr.querySelector('.wall').textContent = record.wallTimeMs
|
| 821 |
? (record.wallTimeMs / 1000).toFixed(1)
|
| 822 |
-
: '
|
| 823 |
tr.querySelector('.err').textContent = record.error || '';
|
| 824 |
},
|
| 825 |
};
|
|
@@ -1209,12 +1242,19 @@ async function runBenchmarkInWorker(v, params, callbacks) {
|
|
| 1209 |
const record = await runInWorker({
|
| 1210 |
params: {
|
| 1211 |
buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
|
| 1212 |
-
|
| 1213 |
-
|
| 1214 |
nCtx: params.nCtx,
|
| 1215 |
nGpuLayers: params.nGpuLayers,
|
| 1216 |
-
|
| 1217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1218 |
},
|
| 1219 |
stream: fetched.stream,
|
| 1220 |
onStatus: callbacks.onStatus,
|
|
@@ -1225,22 +1265,29 @@ async function runBenchmarkInWorker(v, params, callbacks) {
|
|
| 1225 |
return record;
|
| 1226 |
}
|
| 1227 |
|
| 1228 |
-
// Runs one variant: CPU baseline (
|
| 1229 |
-
//
|
|
|
|
|
|
|
| 1230 |
// Returns an aggregate that makeRecord consumes.
|
| 1231 |
async function runVariantWithIterations(v, row) {
|
| 1232 |
-
const
|
|
|
|
|
|
|
| 1233 |
|
| 1234 |
// ─── CPU baseline ───
|
|
|
|
| 1235 |
row.setStatus('cpu-baseline', 'generating reference tokens');
|
| 1236 |
let cpuResult;
|
| 1237 |
try {
|
| 1238 |
cpuResult = await runBenchmarkInWorker(v, {
|
| 1239 |
-
|
| 1240 |
-
|
|
|
|
|
|
|
|
|
|
| 1241 |
nCtx: DEFAULT_N_CTX,
|
| 1242 |
nGpuLayers: 0,
|
| 1243 |
-
refTokenIds: null,
|
| 1244 |
}, {
|
| 1245 |
onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
|
| 1246 |
onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
|
|
@@ -1251,113 +1298,94 @@ async function runVariantWithIterations(v, row) {
|
|
| 1251 |
}
|
| 1252 |
|
| 1253 |
// CPU baseline is "best effort": if it fails (typically OOM on a tight
|
| 1254 |
-
// tab), keep going with GPU
|
| 1255 |
-
//
|
| 1256 |
-
// gets prefill/decode metrics — just no agreement-rate number.
|
| 1257 |
const cpuOk = cpuResult.status === 'done';
|
| 1258 |
if (!cpuOk) {
|
| 1259 |
logLine(
|
| 1260 |
-
`CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU
|
| 1261 |
);
|
| 1262 |
row.setStatus('cpu-skipped', 'continuing with GPU only');
|
| 1263 |
}
|
| 1264 |
|
| 1265 |
-
const refTokenIds = cpuOk ? (cpuResult.
|
| 1266 |
|
| 1267 |
-
|
| 1268 |
-
|
| 1269 |
-
|
| 1270 |
-
let gpuCore = null;
|
| 1271 |
-
|
| 1272 |
-
for (let i = 0; i < iterations; i++) {
|
| 1273 |
-
if (state.aborted) break;
|
| 1274 |
-
row.setStatus('gpu-run', `iteration ${i + 1}/${iterations}`);
|
| 1275 |
-
let gpuResult;
|
| 1276 |
-
try {
|
| 1277 |
-
gpuResult = await runBenchmarkInWorker(v, {
|
| 1278 |
-
prompt: DEFAULT_PROMPT,
|
| 1279 |
-
nPredict: DEFAULT_N_PREDICT,
|
| 1280 |
-
nCtx: DEFAULT_N_CTX,
|
| 1281 |
-
nGpuLayers: DEFAULT_N_GPU_LAYERS,
|
| 1282 |
-
refTokenIds: i === 0 ? (refTokenIds || null) : null,
|
| 1283 |
-
}, {
|
| 1284 |
-
onStatus: (s, m) => row.setStatus(`gpu${i + 1}/${s}`, m),
|
| 1285 |
-
onProgress: (fr, d, t) => row.setProgress(fr, d, t),
|
| 1286 |
-
onLog: logLine,
|
| 1287 |
-
});
|
| 1288 |
-
} catch (err) {
|
| 1289 |
-
gpuResult = { status: 'error', error: err.message || String(err) };
|
| 1290 |
-
}
|
| 1291 |
-
|
| 1292 |
-
if (gpuResult.status !== 'done') {
|
| 1293 |
-
return {
|
| 1294 |
-
status: 'error',
|
| 1295 |
-
error: `GPU iteration ${i + 1} failed: ${gpuResult.error || 'unknown'}`,
|
| 1296 |
-
iterations: gpuSamples.length,
|
| 1297 |
-
cpu: cpuResult,
|
| 1298 |
-
gpuSamples,
|
| 1299 |
-
consistency,
|
| 1300 |
-
gpuCore: gpuCore || gpuResult,
|
| 1301 |
-
};
|
| 1302 |
-
}
|
| 1303 |
|
| 1304 |
-
|
| 1305 |
-
|
| 1306 |
-
|
| 1307 |
-
|
| 1308 |
-
|
| 1309 |
-
|
| 1310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1311 |
});
|
| 1312 |
-
|
| 1313 |
-
|
| 1314 |
-
gpuCore = gpuResult;
|
| 1315 |
-
}
|
| 1316 |
-
|
| 1317 |
-
await sleep(YIELD_BETWEEN_ITERATIONS_MS);
|
| 1318 |
}
|
| 1319 |
|
| 1320 |
return {
|
| 1321 |
-
status:
|
| 1322 |
-
error:
|
| 1323 |
-
iterations: gpuSamples.length,
|
| 1324 |
cpu: cpuResult,
|
| 1325 |
-
|
| 1326 |
-
consistency,
|
| 1327 |
-
gpuCore,
|
| 1328 |
};
|
| 1329 |
}
|
| 1330 |
|
| 1331 |
-
function mean(arr, key) {
|
| 1332 |
-
if (arr.length === 0) return 0;
|
| 1333 |
-
return arr.reduce((a, x) => a + (x[key] || 0), 0) / arr.length;
|
| 1334 |
-
}
|
| 1335 |
-
function stdev(arr, key) {
|
| 1336 |
-
if (arr.length < 2) return 0;
|
| 1337 |
-
const m = mean(arr, key);
|
| 1338 |
-
return Math.sqrt(arr.reduce((a, x) => a + ((x[key] || 0) - m) ** 2, 0) / arr.length);
|
| 1339 |
-
}
|
| 1340 |
// Round a number to two decimal places; anything that is not a finite
// number (NaN, ±Infinity, non-number values) collapses to 0.
function round2(n) {
  if (!Number.isFinite(n)) {
    return 0;
  }
  const fixed = n.toFixed(2);
  return parseFloat(fixed);
}
|
| 1341 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1342 |
function makeRecord(v, vr, machine, browser, wallTimeMs) {
|
| 1343 |
-
const
|
| 1344 |
-
const
|
| 1345 |
-
|
| 1346 |
-
|
| 1347 |
-
|
| 1348 |
-
|
| 1349 |
-
|
| 1350 |
-
|
| 1351 |
-
|
| 1352 |
-
|
| 1353 |
-
|
| 1354 |
-
|
| 1355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1356 |
} : null;
|
| 1357 |
|
| 1358 |
-
const cpuBaseline = vr.cpu?.status === 'done' && vr.cpu.
|
| 1359 |
-
|
| 1360 |
-
|
|
|
|
|
|
|
|
|
|
| 1361 |
} : null;
|
| 1362 |
|
| 1363 |
return {
|
|
@@ -1371,21 +1399,24 @@ function makeRecord(v, vr, machine, browser, wallTimeMs) {
|
|
| 1371 |
browser,
|
| 1372 |
nCtx: DEFAULT_N_CTX,
|
| 1373 |
nPredict: DEFAULT_N_PREDICT,
|
|
|
|
|
|
|
|
|
|
| 1374 |
nGpuLayers: DEFAULT_N_GPU_LAYERS,
|
| 1375 |
timestamp: new Date().toISOString(),
|
| 1376 |
wallTimeMs,
|
| 1377 |
-
webgpuAvailable:
|
| 1378 |
-
gpuAdapterInfo:
|
| 1379 |
-
buildType:
|
| 1380 |
// llama.cpp version stamped from build-info.json. Lets us correlate
|
| 1381 |
// result drift with llama.cpp upgrades over time.
|
| 1382 |
llamaCppCommit: state.buildInfo?.llamaCppCommit ?? null,
|
| 1383 |
llamaCppDescribe: state.buildInfo?.llamaCppDescribe ?? null,
|
| 1384 |
dawnTag: state.buildInfo?.dawnTag ?? null,
|
| 1385 |
metrics,
|
| 1386 |
-
consistency:
|
| 1387 |
cpu_baseline: cpuBaseline,
|
| 1388 |
-
output:
|
| 1389 |
machine,
|
| 1390 |
source: `webgpu-bench/site (${state.surface})`,
|
| 1391 |
};
|
|
@@ -1501,11 +1532,16 @@ function generateMarkdown(results) {
|
|
| 1501 |
let body = '';
|
| 1502 |
if (passed.length) {
|
| 1503 |
body += `## Passed (${passed.length})\n\n`;
|
| 1504 |
-
|
|
|
|
| 1505 |
body += `|---|---|---:|---:|---:|---:|\n`;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1506 |
for (const r of passed) {
|
| 1507 |
body += `| ${r.model} | ${r.variant} | ${formatSize(r.sizeMB)} | ${
|
| 1508 |
-
r.metrics?.
|
| 1509 |
(r.wallTimeMs / 1000).toFixed(1)} |\n`;
|
| 1510 |
}
|
| 1511 |
body += `\n`;
|
|
@@ -1707,7 +1743,7 @@ export async function mountRunSection() {
|
|
| 1707 |
wireFilters();
|
| 1708 |
wireFamilySearch();
|
| 1709 |
wireBatchSelect();
|
| 1710 |
-
|
| 1711 |
wireRunHandlers();
|
| 1712 |
wireAbortHandler();
|
| 1713 |
wirePurgeHandler();
|
|
|
|
| 22 |
const DEFAULT_N_CTX = 2048;
|
| 23 |
const DEFAULT_N_GPU_LAYERS = 999;
|
| 24 |
const YIELD_BETWEEN_RUNS_MS = 500;
|
| 25 |
+
// llama-bench defaults: -p 512 -n 128 -r 5
|
| 26 |
+
const DEFAULT_N_PROMPT = 512;
|
| 27 |
+
const DEFAULT_N_GEN = 128;
|
| 28 |
const DEFAULT_ITERATIONS = 5;
|
| 29 |
const MIN_ITERATIONS_FOR_SUBMIT = 5;
|
| 30 |
|
|
|
|
| 41 |
results: [], // result records from the current session
|
| 42 |
hfSession: null, // { accessToken, expiresAt, userName } when signed in
|
| 43 |
iterations: DEFAULT_ITERATIONS,
|
| 44 |
+
nPrompt: DEFAULT_N_PROMPT,
|
| 45 |
+
nGen: DEFAULT_N_GEN,
|
| 46 |
mounted: false,
|
| 47 |
// Tracks variants the Run pipeline downloaded this session (as opposed to
|
| 48 |
// the standalone Download button or pre-existing cache). Only these are
|
|
|
|
| 632 |
});
|
| 633 |
}
|
| 634 |
|
| 635 |
+
// Binds the three perf inputs (reps, prompt tokens, generated tokens) to
// `state`. Each input is seeded from the current state value; on every
// `change` event the typed value is parsed, clamped, stored, and written back
// so the field always shows the value that will actually be used. Inputs that
// are not present in the DOM are skipped.
function wirePerfInputs() {
  const bind = (id, read, write, normalize) => {
    const input = $(id);
    if (!input) return;
    input.value = String(read());
    input.addEventListener('change', () => {
      write(normalize(input.value));
      input.value = String(read());
    });
  };

  // Token counts: clamp to [0, 4096]; unparsable text (NaN) falls back to the default.
  const tokenCount = (fallback) => (text) => {
    const clamped = Math.max(0, Math.min(4096, parseInt(text, 10)));
    return Number.isFinite(clamped) ? clamped : fallback;
  };

  // Repetitions: unparsable text or 0 falls back to the default, then clamp to [1, 50].
  const repCount = (text) => {
    const parsed = parseInt(text, 10) || DEFAULT_ITERATIONS;
    return Math.max(1, Math.min(50, parsed));
  };

  bind(
    'iterations-input',
    () => state.iterations,
    (value) => { state.iterations = value; },
    repCount,
  );
  bind(
    'n-prompt-input',
    () => state.nPrompt,
    (value) => { state.nPrompt = value; },
    tokenCount(DEFAULT_N_PROMPT),
  );
  bind(
    'n-gen-input',
    () => state.nGen,
    (value) => { state.nGen = value; },
    tokenCount(DEFAULT_N_GEN),
  );
}
|
| 664 |
|
| 665 |
function submittableResults() {
|
|
|
|
| 790 |
<th>Model</th>
|
| 791 |
<th>Variant</th>
|
| 792 |
<th>Status</th>
|
| 793 |
+
<th class="num" title="Prompt processing throughput (avg \u00b1 stddev t/s)">pp tok/s</th>
|
| 794 |
+
<th class="num" title="Text generation throughput (avg \u00b1 stddev t/s)">tg tok/s</th>
|
| 795 |
<th class="num">Wall s</th>
|
| 796 |
<th>Error</th>
|
| 797 |
</tr>
|
|
|
|
| 838 |
fillFromRecord(record) {
|
| 839 |
tr.className = `run-row-${record.status === 'done' ? 'ok' : 'error'}`;
|
| 840 |
tr.querySelector('.status').textContent = record.status;
|
| 841 |
+
// Format llama-bench style: "avg \u00b1 stddev" with the test name as
|
| 842 |
+
// the cell tooltip so users see the exact pp/tg N that was measured.
|
| 843 |
+
const tests = record.metrics?.tests || [];
|
| 844 |
+
const pp = tests.find(t => t.name?.startsWith('pp'));
|
| 845 |
+
const tg = tests.find(t => t.name?.startsWith('tg'));
|
| 846 |
+
const fmt = (t) => t ? `${t.avg_ts.toFixed(2)} \u00b1 ${t.stddev_ts.toFixed(2)}` : '\u2014';
|
| 847 |
+
const ppCell = tr.querySelector('.prefill');
|
| 848 |
+
ppCell.textContent = fmt(pp);
|
| 849 |
+
if (pp) ppCell.title = pp.name;
|
| 850 |
+
const tgCell = tr.querySelector('.decode');
|
| 851 |
+
tgCell.textContent = fmt(tg);
|
| 852 |
+
if (tg) tgCell.title = tg.name;
|
| 853 |
tr.querySelector('.wall').textContent = record.wallTimeMs
|
| 854 |
? (record.wallTimeMs / 1000).toFixed(1)
|
| 855 |
+
: '\u2014';
|
| 856 |
tr.querySelector('.err').textContent = record.error || '';
|
| 857 |
},
|
| 858 |
};
|
|
|
|
| 1242 |
const record = await runInWorker({
|
| 1243 |
params: {
|
| 1244 |
buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
|
| 1245 |
+
contentLength: fetched.contentLength,
|
| 1246 |
+
// Model load
|
| 1247 |
nCtx: params.nCtx,
|
| 1248 |
nGpuLayers: params.nGpuLayers,
|
| 1249 |
+
// Consistency phase — empty consistencyPrompt skips it
|
| 1250 |
+
consistencyPrompt: params.consistencyPrompt || '',
|
| 1251 |
+
consistencyNPredict: params.consistencyNPredict || DEFAULT_N_PREDICT,
|
| 1252 |
+
refTokenIds: params.refTokenIds || null,
|
| 1253 |
+
// Perf phase — set both to 0 to skip
|
| 1254 |
+
nPrompt: params.nPrompt ?? 0,
|
| 1255 |
+
nGen: params.nGen ?? 0,
|
| 1256 |
+
nReps: params.nReps ?? DEFAULT_ITERATIONS,
|
| 1257 |
+
noWarmup: !!params.noWarmup,
|
| 1258 |
},
|
| 1259 |
stream: fetched.stream,
|
| 1260 |
onStatus: callbacks.onStatus,
|
|
|
|
| 1265 |
return record;
|
| 1266 |
}
|
| 1267 |
|
| 1268 |
+
// Runs one variant: CPU consistency baseline (one model load, generates
|
| 1269 |
+
// reference token IDs via bench_run), then GPU pass (one model load that
|
| 1270 |
+
// does both consistency forced-decoding and the llama-bench-style perf
|
| 1271 |
+
// sweep — pp + tg with warmup + nReps timed reps each).
|
| 1272 |
// Returns an aggregate that makeRecord consumes.
|
| 1273 |
async function runVariantWithIterations(v, row) {
|
| 1274 |
+
const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
|
| 1275 |
+
const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
|
| 1276 |
+
const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
|
| 1277 |
|
| 1278 |
// ─── CPU baseline ───
|
| 1279 |
+
// Pure consistency pass — capture token_ids; no perf metrics on CPU.
|
| 1280 |
row.setStatus('cpu-baseline', 'generating reference tokens');
|
| 1281 |
let cpuResult;
|
| 1282 |
try {
|
| 1283 |
cpuResult = await runBenchmarkInWorker(v, {
|
| 1284 |
+
consistencyPrompt: DEFAULT_PROMPT,
|
| 1285 |
+
consistencyNPredict: DEFAULT_N_PREDICT,
|
| 1286 |
+
refTokenIds: null,
|
| 1287 |
+
nPrompt: 0,
|
| 1288 |
+
nGen: 0,
|
| 1289 |
nCtx: DEFAULT_N_CTX,
|
| 1290 |
nGpuLayers: 0,
|
|
|
|
| 1291 |
}, {
|
| 1292 |
onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
|
| 1293 |
onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
|
|
|
|
| 1298 |
}
|
| 1299 |
|
| 1300 |
// CPU baseline is "best effort": if it fails (typically OOM on a tight
|
| 1301 |
+
// tab), keep going with the GPU pass but skip consistency. Perf metrics
|
| 1302 |
+
// are independent of consistency so they're still reported.
|
|
|
|
| 1303 |
const cpuOk = cpuResult.status === 'done';
|
| 1304 |
if (!cpuOk) {
|
| 1305 |
logLine(
|
| 1306 |
+
`CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU run, skipping consistency check.`
|
| 1307 |
);
|
| 1308 |
row.setStatus('cpu-skipped', 'continuing with GPU only');
|
| 1309 |
}
|
| 1310 |
|
| 1311 |
+
const refTokenIds = cpuOk ? (cpuResult.consistency?.token_ids || []).join(',') : '';
|
| 1312 |
|
| 1313 |
+
if (state.aborted) {
|
| 1314 |
+
return { status: 'error', error: 'aborted', cpu: cpuResult, gpu: null };
|
| 1315 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1316 |
|
| 1317 |
+
// ─── GPU pass: consistency + perf in one model load ───
|
| 1318 |
+
row.setStatus('gpu-run', 'loading model');
|
| 1319 |
+
let gpuResult;
|
| 1320 |
+
try {
|
| 1321 |
+
gpuResult = await runBenchmarkInWorker(v, {
|
| 1322 |
+
consistencyPrompt: DEFAULT_PROMPT,
|
| 1323 |
+
consistencyNPredict: DEFAULT_N_PREDICT,
|
| 1324 |
+
refTokenIds: refTokenIds || null,
|
| 1325 |
+
nPrompt,
|
| 1326 |
+
nGen,
|
| 1327 |
+
nReps,
|
| 1328 |
+
nCtx: DEFAULT_N_CTX,
|
| 1329 |
+
nGpuLayers: DEFAULT_N_GPU_LAYERS,
|
| 1330 |
+
}, {
|
| 1331 |
+
onStatus: (s, m) => row.setStatus(`gpu/${s}`, m),
|
| 1332 |
+
onProgress: (fr, d, t) => row.setProgress(fr, d, t),
|
| 1333 |
+
onLog: logLine,
|
| 1334 |
});
|
| 1335 |
+
} catch (err) {
|
| 1336 |
+
gpuResult = { status: 'error', error: err.message || String(err) };
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1337 |
}
|
| 1338 |
|
| 1339 |
return {
|
| 1340 |
+
status: gpuResult.status === 'done' ? 'done' : 'error',
|
| 1341 |
+
error: gpuResult.status === 'done' ? null : (gpuResult.error || 'GPU run failed'),
|
|
|
|
| 1342 |
cpu: cpuResult,
|
| 1343 |
+
gpu: gpuResult,
|
|
|
|
|
|
|
| 1344 |
};
|
| 1345 |
}
|
| 1346 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1347 |
// Rounds a finite number to two decimal places; anything that is not a finite
// number (NaN, ±Infinity, non-numbers) becomes 0.
function round2(n) {
  if (!Number.isFinite(n)) {
    return 0;
  }
  return parseFloat(n.toFixed(2));
}
|
| 1348 |
|
| 1349 |
+
// Looks up the first entry of a metrics.tests array whose `name` is a string
// starting with `prefix` ('pp' or 'tg'). Yields null when `tests` is not an
// array or no entry matches (e.g. nPrompt=0 means no pp test was run).
function findTest(tests, prefix) {
  if (!Array.isArray(tests)) {
    return null;
  }
  for (const candidate of tests) {
    if (typeof candidate.name === 'string' && candidate.name.startsWith(prefix)) {
      return candidate;
    }
  }
  return null;
}
|
| 1355 |
+
|
| 1356 |
function makeRecord(v, vr, machine, browser, wallTimeMs) {
|
| 1357 |
+
const gpu = vr.gpu;
|
| 1358 |
+
const tests = gpu?.metrics?.tests || null;
|
| 1359 |
+
const pp = findTest(tests, 'pp');
|
| 1360 |
+
const tg = findTest(tests, 'tg');
|
| 1361 |
+
|
| 1362 |
+
// Llama-bench shape lives under metrics.tests; flat prefill_tok_s /
|
| 1363 |
+
// decode_tok_s are kept for backward compat with the existing dashboard
|
| 1364 |
+
// table cells until those are migrated to read from tests directly.
|
| 1365 |
+
const metrics = tests ? {
|
| 1366 |
+
tests,
|
| 1367 |
+
n_prompt: gpu.metrics.n_prompt,
|
| 1368 |
+
n_gen: gpu.metrics.n_gen,
|
| 1369 |
+
n_reps: gpu.metrics.n_reps,
|
| 1370 |
+
iterations: gpu.metrics.n_reps,
|
| 1371 |
+
prefill_tok_s: pp ? round2(pp.avg_ts) : 0,
|
| 1372 |
+
decode_tok_s: tg ? round2(tg.avg_ts) : 0,
|
| 1373 |
+
prefill_tok_s_stdev: pp ? round2(pp.stddev_ts) : 0,
|
| 1374 |
+
decode_tok_s_stdev: tg ? round2(tg.stddev_ts) : 0,
|
| 1375 |
+
prefill_samples: pp ? pp.samples_ts : [],
|
| 1376 |
+
decode_samples: tg ? tg.samples_ts : [],
|
| 1377 |
+
n_p_eval: pp ? pp.n_prompt : 0,
|
| 1378 |
+
n_eval: tg ? tg.n_gen : 0,
|
| 1379 |
+
t_p_eval_ms: pp ? round2(pp.avg_ns / 1e6) : 0,
|
| 1380 |
+
t_eval_ms: tg ? round2(tg.avg_ns / 1e6) : 0,
|
| 1381 |
} : null;
|
| 1382 |
|
| 1383 |
+
const cpuBaseline = vr.cpu?.status === 'done' && vr.cpu.consistency?.token_ids?.length ? {
|
| 1384 |
+
// CPU pass no longer measures perf — only token_ids for consistency.
|
| 1385 |
+
// Keep the field present but null-valued so dashboards that look it up
|
| 1386 |
+
// don't crash; downstream code can treat null as "not measured".
|
| 1387 |
+
prefill_tok_s: null,
|
| 1388 |
+
decode_tok_s: null,
|
| 1389 |
} : null;
|
| 1390 |
|
| 1391 |
return {
|
|
|
|
| 1399 |
browser,
|
| 1400 |
nCtx: DEFAULT_N_CTX,
|
| 1401 |
nPredict: DEFAULT_N_PREDICT,
|
| 1402 |
+
nPrompt: gpu?.metrics?.n_prompt ?? 0,
|
| 1403 |
+
nGen: gpu?.metrics?.n_gen ?? 0,
|
| 1404 |
+
nReps: gpu?.metrics?.n_reps ?? 0,
|
| 1405 |
nGpuLayers: DEFAULT_N_GPU_LAYERS,
|
| 1406 |
timestamp: new Date().toISOString(),
|
| 1407 |
wallTimeMs,
|
| 1408 |
+
webgpuAvailable: gpu?.webgpuAvailable ?? !!navigator.gpu,
|
| 1409 |
+
gpuAdapterInfo: gpu?.gpuAdapterInfo ?? null,
|
| 1410 |
+
buildType: gpu?.buildType ?? null,
|
| 1411 |
// llama.cpp version stamped from build-info.json. Lets us correlate
|
| 1412 |
// result drift with llama.cpp upgrades over time.
|
| 1413 |
llamaCppCommit: state.buildInfo?.llamaCppCommit ?? null,
|
| 1414 |
llamaCppDescribe: state.buildInfo?.llamaCppDescribe ?? null,
|
| 1415 |
dawnTag: state.buildInfo?.dawnTag ?? null,
|
| 1416 |
metrics,
|
| 1417 |
+
consistency: gpu?.consistency ?? null,
|
| 1418 |
cpu_baseline: cpuBaseline,
|
| 1419 |
+
output: gpu?.output || '',
|
| 1420 |
machine,
|
| 1421 |
source: `webgpu-bench/site (${state.surface})`,
|
| 1422 |
};
|
|
|
|
| 1532 |
let body = '';
|
| 1533 |
if (passed.length) {
|
| 1534 |
body += `## Passed (${passed.length})\n\n`;
|
| 1535 |
+
// llama-bench-style markdown: separate pp / tg columns with avg \u00b1 stddev.
|
| 1536 |
+
body += `| Model | Variant | Size | pp tok/s | tg tok/s | Wall s |\n`;
|
| 1537 |
body += `|---|---|---:|---:|---:|---:|\n`;
|
| 1538 |
+
const fmtTest = (tests, prefix) => {
|
| 1539 |
+
const t = tests?.find(x => x.name?.startsWith(prefix));
|
| 1540 |
+
return t ? `${t.avg_ts.toFixed(2)} \u00b1 ${t.stddev_ts.toFixed(2)} (${t.name})` : '\u2014';
|
| 1541 |
+
};
|
| 1542 |
for (const r of passed) {
|
| 1543 |
body += `| ${r.model} | ${r.variant} | ${formatSize(r.sizeMB)} | ${
|
| 1544 |
+
fmtTest(r.metrics?.tests, 'pp')} | ${fmtTest(r.metrics?.tests, 'tg')} | ${
|
| 1545 |
(r.wallTimeMs / 1000).toFixed(1)} |\n`;
|
| 1546 |
}
|
| 1547 |
body += `\n`;
|
|
|
|
| 1743 |
wireFilters();
|
| 1744 |
wireFamilySearch();
|
| 1745 |
wireBatchSelect();
|
| 1746 |
+
wirePerfInputs();
|
| 1747 |
wireRunHandlers();
|
| 1748 |
wireAbortHandler();
|
| 1749 |
wirePurgeHandler();
|
js/run/core.js
CHANGED
|
@@ -1,8 +1,14 @@
|
|
| 1 |
// Benchmark core: load GGUF via a source adapter, init llama.cpp WASM,
|
| 2 |
-
// run
|
| 3 |
-
//
|
|
|
|
|
|
|
| 4 |
|
| 5 |
const DEFAULT_PROMPT = 'Hello, how are you?';
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
async function loadBenchScriptOnce(buildType) {
|
| 8 |
if (typeof globalThis.createBenchModule === 'function') return;
|
|
@@ -18,15 +24,186 @@ async function loadBenchScriptOnce(buildType) {
|
|
| 18 |
}
|
| 19 |
}
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
export async function runBenchmarkCore({
|
| 22 |
source,
|
| 23 |
modelFile,
|
| 24 |
hfRepo,
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
nCtx = 2048,
|
| 28 |
nGpuLayers = 999,
|
| 29 |
-
|
| 30 |
onStatus = () => {},
|
| 31 |
onProgress = () => {},
|
| 32 |
onLog = () => {},
|
|
@@ -46,14 +223,13 @@ export async function runBenchmarkCore({
|
|
| 46 |
webgpuAvailable: !!navigator.gpu,
|
| 47 |
gpuAdapterInfo: null,
|
| 48 |
metrics: null,
|
|
|
|
| 49 |
output: '',
|
| 50 |
};
|
| 51 |
|
| 52 |
-
// Declared outside the try so the catch can free our heap allocation.
|
| 53 |
let Module;
|
| 54 |
|
| 55 |
try {
|
| 56 |
-
// WebGPU adapter probe — informational only.
|
| 57 |
if (navigator.gpu) {
|
| 58 |
try {
|
| 59 |
const adapter = await navigator.gpu.requestAdapter();
|
|
@@ -70,7 +246,6 @@ export async function runBenchmarkCore({
|
|
| 70 |
onLog('WebGPU: not available in this browser');
|
| 71 |
}
|
| 72 |
|
| 73 |
-
// Load the Emscripten glue script once per page.
|
| 74 |
onStatus('loading_wasm', `Loading WASM module (${buildType})...`);
|
| 75 |
onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
|
| 76 |
await loadBenchScriptOnce(buildType);
|
|
@@ -78,7 +253,6 @@ export async function runBenchmarkCore({
|
|
| 78 |
Module = await globalThis.createBenchModule({
|
| 79 |
print: (text) => onLog(`[wasm] ${text}`),
|
| 80 |
printErr: (text) => onLog(`[wasm:err] ${text}`),
|
| 81 |
-
// Catch Emscripten abort() — Firefox can abort during Asyncify init.
|
| 82 |
onAbort: (reason) => {
|
| 83 |
const msg = `WASM aborted: ${reason}`;
|
| 84 |
result.error = msg;
|
|
@@ -88,7 +262,6 @@ export async function runBenchmarkCore({
|
|
| 88 |
});
|
| 89 |
onLog('WASM module loaded');
|
| 90 |
|
| 91 |
-
// Download model via the injected source adapter.
|
| 92 |
onStatus('downloading', `Downloading ${modelFile}...`);
|
| 93 |
onLog(`Fetching model via source: ${hfRepo}/${modelFile}`);
|
| 94 |
const { stream, contentLength } = await source.fetchModel(hfRepo, modelFile);
|
|
@@ -96,13 +269,8 @@ export async function runBenchmarkCore({
|
|
| 96 |
contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
|
| 97 |
}`);
|
| 98 |
|
| 99 |
-
// Stream the GGUF directly into the WASM heap (HeapFS-style)
|
| 100 |
-
//
|
| 101 |
-
// the linear memory; HEAPU8.set writes chunks in place. We then expose
|
| 102 |
-
// the region as a MEMFS file with `canOwn=true` so MEMFS does not copy,
|
| 103 |
-
// and override node.contents with a getter that always rebuilds the
|
| 104 |
-
// view from the saved pointer — this survives the heap growth that
|
| 105 |
-
// llama.cpp triggers during bench_init/bench_load.
|
| 106 |
if (!(contentLength > 0)) {
|
| 107 |
throw new Error('content-length is required for streaming into WASM heap');
|
| 108 |
}
|
|
@@ -137,19 +305,14 @@ export async function runBenchmarkCore({
|
|
| 137 |
Module._free(modelPtr);
|
| 138 |
throw err;
|
| 139 |
}
|
| 140 |
-
// Track on the result object so we can free in the success/exit paths.
|
| 141 |
result._modelPtr = modelPtr;
|
| 142 |
|
| 143 |
-
// Init backend.
|
| 144 |
onStatus('initializing', 'Initializing llama.cpp backends...');
|
| 145 |
-
onLog('Calling bench_init()...');
|
| 146 |
const initResult = await Module.ccall('bench_init', 'number', [], [], { async: true });
|
| 147 |
if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
|
| 148 |
onLog('Backends initialized');
|
| 149 |
|
| 150 |
-
// Load model.
|
| 151 |
onStatus('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers})...`);
|
| 152 |
-
onLog(`Calling bench_load("/model.gguf", ${nCtx}, ${nGpuLayers})...`);
|
| 153 |
const loadResult = await Module.ccall(
|
| 154 |
'bench_load',
|
| 155 |
'number',
|
|
@@ -160,89 +323,38 @@ export async function runBenchmarkCore({
|
|
| 160 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
| 161 |
onLog('Model loaded');
|
| 162 |
|
| 163 |
-
// Drop the MEMFS node — llama.cpp's mmap captured a pointer into the
|
| 164 |
-
// _malloc'd region in the WASM heap, so the bytes themselves stay alive
|
| 165 |
-
// until we _free below after bench_exit.
|
| 166 |
try {
|
| 167 |
Module.FS.unlink('/model.gguf');
|
| 168 |
} catch (e) {
|
| 169 |
onLog(`Warning: could not remove model FS node: ${e.message}`);
|
| 170 |
}
|
| 171 |
|
| 172 |
-
//
|
| 173 |
-
onStatus('running', 'Running
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
const inferResult = JSON.parse(resultJson);
|
| 185 |
-
if (inferResult.error) throw new Error(`Inference error: ${inferResult.error}`);
|
| 186 |
-
|
| 187 |
-
const prefillTokS = inferResult.t_p_eval_ms > 0
|
| 188 |
-
? (inferResult.n_p_eval / (inferResult.t_p_eval_ms / 1000)).toFixed(2)
|
| 189 |
-
: 'N/A';
|
| 190 |
-
const decodeTokS = inferResult.t_eval_ms > 0
|
| 191 |
-
? (inferResult.n_eval / (inferResult.t_eval_ms / 1000)).toFixed(2)
|
| 192 |
-
: 'N/A';
|
| 193 |
-
|
| 194 |
-
result.metrics = {
|
| 195 |
-
...inferResult,
|
| 196 |
-
prefill_tok_s: parseFloat(prefillTokS) || 0,
|
| 197 |
-
decode_tok_s: parseFloat(decodeTokS) || 0,
|
| 198 |
-
};
|
| 199 |
-
result.output = inferResult.output || '';
|
| 200 |
-
|
| 201 |
-
// Forced-decoding consistency check against a CPU reference token sequence.
|
| 202 |
-
if (refTokenIds && nGpuLayers > 0 && inferResult.token_ids?.length > 0) {
|
| 203 |
-
onLog('Running forced-decoding consistency check...');
|
| 204 |
-
const evalJson = await Module.ccall(
|
| 205 |
-
'bench_eval_tokens',
|
| 206 |
-
'string',
|
| 207 |
-
['string', 'string'],
|
| 208 |
-
[prompt, refTokenIds],
|
| 209 |
-
{ async: true },
|
| 210 |
-
);
|
| 211 |
-
const evalResult = JSON.parse(evalJson);
|
| 212 |
-
if (evalResult.error) {
|
| 213 |
-
onLog(`Consistency check error: ${evalResult.error}`);
|
| 214 |
-
} else {
|
| 215 |
-
result.consistency = evalResult;
|
| 216 |
-
onLog(
|
| 217 |
-
`Consistency: ${(evalResult.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
|
| 218 |
-
`${evalResult.n_agree}/${evalResult.n_tokens} tokens)`,
|
| 219 |
-
);
|
| 220 |
-
if (evalResult.first_disagreement >= 0) {
|
| 221 |
-
onLog(`First disagreement at token position ${evalResult.first_disagreement}`);
|
| 222 |
-
}
|
| 223 |
-
}
|
| 224 |
-
}
|
| 225 |
|
| 226 |
onLog('Calling bench_exit()...');
|
| 227 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
| 228 |
|
| 229 |
-
// Free the heap-resident model bytes now that llama.cpp has unmapped.
|
| 230 |
if (result._modelPtr) {
|
| 231 |
Module._free(result._modelPtr);
|
| 232 |
delete result._modelPtr;
|
| 233 |
}
|
| 234 |
|
| 235 |
result.status = 'done';
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
);
|
| 241 |
-
onLog(
|
| 242 |
-
`Decode: ${decodeTokS} tok/s (${inferResult.n_eval} tokens in ` +
|
| 243 |
-
`${inferResult.t_eval_ms.toFixed(0)} ms)`,
|
| 244 |
-
);
|
| 245 |
-
onLog(`Output: ${(inferResult.output || '').substring(0, 200)}`);
|
| 246 |
return result;
|
| 247 |
} catch (err) {
|
| 248 |
result.error = err.message || String(err);
|
|
@@ -250,7 +362,6 @@ export async function runBenchmarkCore({
|
|
| 250 |
onStatus('error', `Error: ${err.message}`);
|
| 251 |
onLog(`ERROR: ${err.message}`);
|
| 252 |
if (err.stack) onLog(err.stack);
|
| 253 |
-
// Best-effort: release the model heap region so a re-run can reuse it.
|
| 254 |
if (result._modelPtr && Module?._free) {
|
| 255 |
try { Module._free(result._modelPtr); } catch { /* ignore */ }
|
| 256 |
delete result._modelPtr;
|
|
|
|
| 1 |
// Benchmark core: load GGUF via a source adapter, init llama.cpp WASM,
|
| 2 |
+
// then run a consistency phase (forced-decoding against a CPU baseline) and
|
| 3 |
+
// a perf phase (llama-bench-style pp/tg with warmup + n_reps timed reps).
|
| 4 |
+
// Used by harness.js (URL-param driven, for runner.js) and by the Run-tab
|
| 5 |
+
// controller (which runs the same logic in a Web Worker — see bench-worker.js).
|
| 6 |
|
| 7 |
const DEFAULT_PROMPT = 'Hello, how are you?';
|
| 8 |
+
const DEFAULT_N_PREDICT = 128;
|
| 9 |
+
const DEFAULT_N_PROMPT = 512;
|
| 10 |
+
const DEFAULT_N_GEN = 128;
|
| 11 |
+
const DEFAULT_N_REPS = 5;
|
| 12 |
|
| 13 |
async function loadBenchScriptOnce(buildType) {
|
| 14 |
if (typeof globalThis.createBenchModule === 'function') return;
|
|
|
|
| 24 |
}
|
| 25 |
}
|
| 26 |
|
| 27 |
+
// Aggregates raw nanosecond samples into the llama-bench result shape.
// Throughput is derived per sample as (n_prompt + n_gen) * 1e9 / sample_ns;
// avg_ts is the mean of those per-sample rates and stddev_ts their standard
// deviation. Neither is back-computed from avg_ns / stddev_ns, because the
// ns -> t/s mapping isn't linear. Both deviations use Bessel's correction
// (divide by n - 1) and are 0 for a single sample. An empty sample list
// produces an all-zero record.
function buildTest(name, n_prompt, n_gen, samples_ns) {
  const count = samples_ns.length;
  if (count === 0) {
    return { name, n_prompt, n_gen, avg_ns: 0, stddev_ns: 0, avg_ts: 0, stddev_ts: 0, samples_ns: [], samples_ts: [] };
  }

  const meanOf = (values) => values.reduce((acc, v) => acc + v, 0) / count;
  const sampleStddev = (values, center) => {
    if (count <= 1) return 0;
    const sumSquares = values.reduce((acc, v) => acc + (v - center) * (v - center), 0);
    return Math.sqrt(sumSquares / (count - 1));
  };
  const toHundredths = (x) => Math.round(x * 100) / 100;

  const totalTokens = n_prompt + n_gen;
  // A zero-duration sample maps to 0 t/s instead of dividing by zero.
  const rates = samples_ns.map((ns) => (ns > 0 ? (1e9 * totalTokens) / ns : 0));

  const meanNs = meanOf(samples_ns);
  const meanRate = meanOf(rates);

  return {
    name,
    n_prompt,
    n_gen,
    avg_ns: Math.round(meanNs),
    stddev_ns: Math.round(sampleStddev(samples_ns, meanNs)),
    avg_ts: toHundredths(meanRate),
    stddev_ts: toHundredths(sampleStddev(rates, meanRate)),
    samples_ns: samples_ns.map((ns) => Math.round(ns)),
    samples_ts: rates.map((rate) => toHundredths(rate)),
  };
}
|
| 62 |
+
|
| 63 |
+
// Parses the JSON string returned by a bench_* C function.
//
// `label` prefixes every error message so failures can be traced to the call
// that produced them; `raw` is the string handed back by the C side.
// Returns the parsed value. Throws an Error when:
//   - `raw` is not valid JSON (the original SyntaxError is kept as `cause`),
//   - the payload is JSON `null` (previously this surfaced as an unlabeled
//     TypeError from reading `.error` on null),
//   - the payload carries a truthy `error` field.
function parseBenchResult(label, raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (e) {
    throw new Error(`${label}: invalid JSON from C (${e.message})`, { cause: e });
  }
  if (parsed === null) {
    throw new Error(`${label}: null result from C`);
  }
  if (parsed.error) throw new Error(`${label}: ${parsed.error}`);
  return parsed;
}
|
| 73 |
+
|
| 74 |
+
// Executes the consistency and perf phases against a WASM Module whose model
// is already loaded. Resolves to
//   { metrics: { tests, n_prompt, n_gen, n_reps } | null, consistency, output }.
//
// The worker path (bench-worker.js) runs the same logic off the main thread;
// keep the two implementations in sync.
async function runBenchActions(Module, {
  // Consistency phase
  consistencyPrompt,     // non-empty string ⇒ run consistency
  consistencyNPredict,   // tokens generated by bench_run during consistency
  refTokenIds,           // CSV of CPU-side token IDs ⇒ forced-decode against them
  // Perf phase
  nPrompt, nGen, nReps, noWarmup,
  // Reporting
  onStatus, onLog,
}) {
  const out = { metrics: null, consistency: null, output: '' };

  // Every bench_* export returns a JSON string and is invoked asynchronously.
  const callBench = (fn, argTypes, args) =>
    Module.ccall(fn, 'string', argTypes, args, { async: true });

  // ─── Consistency phase ───
  // Without refTokenIds this is the CPU baseline: bench_run produces the
  // token_ids a later GPU pass verifies against. With refTokenIds it is the
  // verification pass: bench_run, then bench_eval_tokens for the agreement
  // rate against the reference sequence.
  if (consistencyPrompt) {
    onStatus?.('consistency', 'Running consistency check...');
    onLog?.(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
    const runRaw = await callBench(
      'bench_run',
      ['string', 'number'],
      [consistencyPrompt, consistencyNPredict],
    );
    const generated = parseBenchResult('bench_run', runRaw);
    out.output = generated.output || '';
    out.consistency = { token_ids: generated.token_ids || [] };

    if (refTokenIds) {
      onLog?.('bench_eval_tokens — forced-decode vs CPU baseline');
      const evalRaw = await callBench(
        'bench_eval_tokens',
        ['string', 'string'],
        [consistencyPrompt, refTokenIds],
      );
      const ev = parseBenchResult('bench_eval_tokens', evalRaw);
      out.consistency = { ...out.consistency, ...ev };
      onLog?.(
        `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
        `${ev.n_agree}/${ev.n_tokens})` +
        (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
      );
    }
  }

  // ─── Perf phase (llama-bench style) ───
  // Unless noWarmup is set, one untimed pp call and one untimed tg(1) call
  // precede the timed reps of their respective tests.
  const wantPp = nPrompt > 0;
  const wantTg = nGen > 0;
  if (!wantPp && !wantTg) {
    return out;
  }

  // One untimed call; its result is parsed only so a C-side error aborts the run.
  const warmUp = async (fn, nTokens, statusMsg, logMsg) => {
    onStatus?.('perf', statusMsg);
    onLog?.(logMsg);
    const raw = await callBench(fn, ['number'], [nTokens]);
    parseBenchResult(`${fn} warmup`, raw);
  };

  // nReps timed calls of `fn(nTokens)`; wall time is measured around the
  // ccall only and returned as nanoseconds per rep.
  const timeReps = async (fn, nTokens, tag) => {
    const samples = [];
    for (let rep = 1; rep <= nReps; rep++) {
      onStatus?.('perf', `${tag} ${rep}/${nReps}`);
      const started = performance.now();
      const raw = await callBench(fn, ['number'], [nTokens]);
      const elapsedNs = (performance.now() - started) * 1e6;
      parseBenchResult(fn, raw);
      samples.push(elapsedNs);
      onLog?.(`${tag} run ${rep}/${nReps}: ${(elapsedNs / 1e6).toFixed(1)} ms (${(1e9 * nTokens / elapsedNs).toFixed(1)} t/s)`);
    }
    return samples;
  };

  const tests = [];

  if (wantPp) {
    const tag = `pp${nPrompt}`;
    if (!noWarmup) {
      await warmUp('bench_pp', nPrompt, `warmup ${tag}`, `bench_pp(${nPrompt}) — warmup`);
    }
    const ppSamples = await timeReps('bench_pp', nPrompt, tag);
    tests.push(buildTest(tag, nPrompt, 0, ppSamples));
  }

  if (wantTg) {
    const tag = `tg${nGen}`;
    if (!noWarmup) {
      await warmUp('bench_tg', 1, `warmup tg`, 'bench_tg(1) — warmup');
    }
    const tgSamples = await timeReps('bench_tg', nGen, tag);
    tests.push(buildTest(tag, 0, nGen, tgSamples));
  }

  out.metrics = {
    tests,
    n_prompt: wantPp ? nPrompt : 0,
    n_gen: wantTg ? nGen : 0,
    n_reps: nReps,
  };

  return out;
}
|
| 185 |
+
|
| 186 |
+
// Public entry. Loads the WASM module + model, then dispatches to
|
| 187 |
+
// runBenchActions for the actual workload. Returns a flat record shape
|
| 188 |
+
// consumed by harness.js (window.__BENCH) and by controller.makeRecord.
|
| 189 |
export async function runBenchmarkCore({
|
| 190 |
source,
|
| 191 |
modelFile,
|
| 192 |
hfRepo,
|
| 193 |
+
// consistency phase
|
| 194 |
+
consistencyPrompt = DEFAULT_PROMPT,
|
| 195 |
+
consistencyNPredict = DEFAULT_N_PREDICT,
|
| 196 |
+
refTokenIds = null,
|
| 197 |
+
runConsistency = true, // false ⇒ skip consistency phase entirely
|
| 198 |
+
// perf phase
|
| 199 |
+
nPrompt = DEFAULT_N_PROMPT,
|
| 200 |
+
nGen = DEFAULT_N_GEN,
|
| 201 |
+
nReps = DEFAULT_N_REPS,
|
| 202 |
+
noWarmup = false,
|
| 203 |
+
// model load
|
| 204 |
nCtx = 2048,
|
| 205 |
nGpuLayers = 999,
|
| 206 |
+
// reporting
|
| 207 |
onStatus = () => {},
|
| 208 |
onProgress = () => {},
|
| 209 |
onLog = () => {},
|
|
|
|
| 223 |
webgpuAvailable: !!navigator.gpu,
|
| 224 |
gpuAdapterInfo: null,
|
| 225 |
metrics: null,
|
| 226 |
+
consistency: null,
|
| 227 |
output: '',
|
| 228 |
};
|
| 229 |
|
|
|
|
| 230 |
let Module;
|
| 231 |
|
| 232 |
try {
|
|
|
|
| 233 |
if (navigator.gpu) {
|
| 234 |
try {
|
| 235 |
const adapter = await navigator.gpu.requestAdapter();
|
|
|
|
| 246 |
onLog('WebGPU: not available in this browser');
|
| 247 |
}
|
| 248 |
|
|
|
|
| 249 |
onStatus('loading_wasm', `Loading WASM module (${buildType})...`);
|
| 250 |
onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
|
| 251 |
await loadBenchScriptOnce(buildType);
|
|
|
|
| 253 |
Module = await globalThis.createBenchModule({
|
| 254 |
print: (text) => onLog(`[wasm] ${text}`),
|
| 255 |
printErr: (text) => onLog(`[wasm:err] ${text}`),
|
|
|
|
| 256 |
onAbort: (reason) => {
|
| 257 |
const msg = `WASM aborted: ${reason}`;
|
| 258 |
result.error = msg;
|
|
|
|
| 262 |
});
|
| 263 |
onLog('WASM module loaded');
|
| 264 |
|
|
|
|
| 265 |
onStatus('downloading', `Downloading ${modelFile}...`);
|
| 266 |
onLog(`Fetching model via source: ${hfRepo}/${modelFile}`);
|
| 267 |
const { stream, contentLength } = await source.fetchModel(hfRepo, modelFile);
|
|
|
|
| 269 |
contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
|
| 270 |
}`);
|
| 271 |
|
| 272 |
+
// Stream the GGUF directly into the WASM heap (HeapFS-style) — see worker
|
| 273 |
+
// for the full explanation of why we override node.contents with a getter.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
if (!(contentLength > 0)) {
|
| 275 |
throw new Error('content-length is required for streaming into WASM heap');
|
| 276 |
}
|
|
|
|
| 305 |
Module._free(modelPtr);
|
| 306 |
throw err;
|
| 307 |
}
|
|
|
|
| 308 |
result._modelPtr = modelPtr;
|
| 309 |
|
|
|
|
| 310 |
onStatus('initializing', 'Initializing llama.cpp backends...');
|
|
|
|
| 311 |
const initResult = await Module.ccall('bench_init', 'number', [], [], { async: true });
|
| 312 |
if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
|
| 313 |
onLog('Backends initialized');
|
| 314 |
|
|
|
|
| 315 |
onStatus('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers})...`);
|
|
|
|
| 316 |
const loadResult = await Module.ccall(
|
| 317 |
'bench_load',
|
| 318 |
'number',
|
|
|
|
| 323 |
if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
|
| 324 |
onLog('Model loaded');
|
| 325 |
|
|
|
|
|
|
|
|
|
|
| 326 |
try {
|
| 327 |
Module.FS.unlink('/model.gguf');
|
| 328 |
} catch (e) {
|
| 329 |
onLog(`Warning: could not remove model FS node: ${e.message}`);
|
| 330 |
}
|
| 331 |
|
| 332 |
+
// ─── Consistency + perf phases ───
|
| 333 |
+
onStatus('running', 'Running benchmark...');
|
| 334 |
+
const actions = await runBenchActions(Module, {
|
| 335 |
+
consistencyPrompt: runConsistency ? consistencyPrompt : null,
|
| 336 |
+
consistencyNPredict,
|
| 337 |
+
refTokenIds,
|
| 338 |
+
nPrompt, nGen, nReps, noWarmup,
|
| 339 |
+
onStatus, onLog,
|
| 340 |
+
});
|
| 341 |
+
result.metrics = actions.metrics;
|
| 342 |
+
result.consistency = actions.consistency;
|
| 343 |
+
result.output = actions.output;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
|
| 345 |
onLog('Calling bench_exit()...');
|
| 346 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
| 347 |
|
|
|
|
| 348 |
if (result._modelPtr) {
|
| 349 |
Module._free(result._modelPtr);
|
| 350 |
delete result._modelPtr;
|
| 351 |
}
|
| 352 |
|
| 353 |
result.status = 'done';
|
| 354 |
+
const summary = result.metrics?.tests
|
| 355 |
+
?.map(t => `${t.name}: ${t.avg_ts.toFixed(2)} ± ${t.stddev_ts.toFixed(2)} t/s`)
|
| 356 |
+
.join(' | ') || 'no perf';
|
| 357 |
+
onStatus('done', `Done! ${summary}`);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
return result;
|
| 359 |
} catch (err) {
|
| 360 |
result.error = err.message || String(err);
|
|
|
|
| 362 |
onStatus('error', `Error: ${err.message}`);
|
| 363 |
onLog(`ERROR: ${err.message}`);
|
| 364 |
if (err.stack) onLog(err.stack);
|
|
|
|
| 365 |
if (result._modelPtr && Module?._free) {
|
| 366 |
try { Module._free(result._modelPtr); } catch { /* ignore */ }
|
| 367 |
delete result._modelPtr;
|
js/tables.js
CHANGED
|
@@ -76,10 +76,10 @@ export function renderResultsTable(results) {
|
|
| 76 |
{ key: 'status', label: 'Status', priority: 1 },
|
| 77 |
{ key: 'buildType', label: 'Build', priority: 3 },
|
| 78 |
{ key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
|
| 79 |
-
{ key: 'decode_tok_s', label: '
|
| 80 |
-
{ key: 'prefill_tok_s', label: '
|
| 81 |
-
{ key: 'cpu_baseline_decode_tok_s', label: 'CPU
|
| 82 |
-
{ key: 'cpu_baseline_prefill_tok_s', label: 'CPU
|
| 83 |
{ key: 'n_eval', label: 'n_eval', priority: 3 },
|
| 84 |
{ key: 't_eval_ms', label: 't_eval (ms)', priority: 3 },
|
| 85 |
{ key: 'n_p_eval', label: 'n_p_eval', priority: 3 },
|
|
@@ -133,9 +133,29 @@ export function renderResultsTable(results) {
|
|
| 133 |
case 'decode_tok_s':
|
| 134 |
case 'prefill_tok_s':
|
| 135 |
case 'cpu_baseline_decode_tok_s':
|
| 136 |
-
case 'cpu_baseline_prefill_tok_s':
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
break;
|
|
|
|
| 139 |
case 't_eval_ms':
|
| 140 |
case 't_p_eval_ms':
|
| 141 |
html += `<span class="mono">${formatMs(r[col.key])}</span>`;
|
|
|
|
| 76 |
{ key: 'status', label: 'Status', priority: 1 },
|
| 77 |
{ key: 'buildType', label: 'Build', priority: 3 },
|
| 78 |
{ key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
|
| 79 |
+
{ key: 'decode_tok_s', label: 'tg tok/s', priority: 1 },
|
| 80 |
+
{ key: 'prefill_tok_s', label: 'pp tok/s', priority: 3 },
|
| 81 |
+
{ key: 'cpu_baseline_decode_tok_s', label: 'CPU tg tok/s', priority: 2 },
|
| 82 |
+
{ key: 'cpu_baseline_prefill_tok_s', label: 'CPU pp tok/s', priority: 3 },
|
| 83 |
{ key: 'n_eval', label: 'n_eval', priority: 3 },
|
| 84 |
{ key: 't_eval_ms', label: 't_eval (ms)', priority: 3 },
|
| 85 |
{ key: 'n_p_eval', label: 'n_p_eval', priority: 3 },
|
|
|
|
| 133 |
case 'decode_tok_s':
|
| 134 |
case 'prefill_tok_s':
|
| 135 |
case 'cpu_baseline_decode_tok_s':
|
| 136 |
+
case 'cpu_baseline_prefill_tok_s': {
|
| 137 |
+
// llama-bench style "avg ± stddev" with the pp{N} / tg{N} test
|
| 138 |
+
// label as a tooltip when the new schema is present. Older records
|
| 139 |
+
// without stddev fall back to the bare avg from formatTokS.
|
| 140 |
+
const isDecode = col.key === 'decode_tok_s';
|
| 141 |
+
const isPrefill = col.key === 'prefill_tok_s';
|
| 142 |
+
const stddev = isDecode ? r.decode_stddev_ts
|
| 143 |
+
: isPrefill ? r.prefill_stddev_ts
|
| 144 |
+
: null;
|
| 145 |
+
const testName = isDecode ? r.tg_test_name
|
| 146 |
+
: isPrefill ? r.pp_test_name
|
| 147 |
+
: null;
|
| 148 |
+
const avg = r[col.key];
|
| 149 |
+
let cell;
|
| 150 |
+
if (avg != null && stddev != null) {
|
| 151 |
+
cell = `${formatTokS(avg)} \u00b1 ${formatTokS(stddev)}`;
|
| 152 |
+
} else {
|
| 153 |
+
cell = formatTokS(avg);
|
| 154 |
+
}
|
| 155 |
+
const titleAttr = testName ? ` title="${escapeHtml(testName)}"` : '';
|
| 156 |
+
html += `<span class="mono"${titleAttr}>${cell}</span>`;
|
| 157 |
break;
|
| 158 |
+
}
|
| 159 |
case 't_eval_ms':
|
| 160 |
case 't_p_eval_ms':
|
| 161 |
html += `<span class="mono">${formatMs(r[col.key])}</span>`;
|
run.html
CHANGED
|
@@ -125,7 +125,15 @@
|
|
| 125 |
</div>
|
| 126 |
</div>
|
| 127 |
<div class="filter-group">
|
| 128 |
-
<label class="filter-label" for="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
<input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
|
| 130 |
</div>
|
| 131 |
</div>
|
|
|
|
| 125 |
</div>
|
| 126 |
</div>
|
| 127 |
<div class="filter-group">
|
| 128 |
+
<label class="filter-label" for="n-prompt-input">Prompt tokens (-p)</label>
|
| 129 |
+
<input type="number" id="n-prompt-input" class="filter-select run-iter-input" value="512" min="0" max="4096" step="1">
|
| 130 |
+
</div>
|
| 131 |
+
<div class="filter-group">
|
| 132 |
+
<label class="filter-label" for="n-gen-input">Gen tokens (-n)</label>
|
| 133 |
+
<input type="number" id="n-gen-input" class="filter-select run-iter-input" value="128" min="0" max="4096" step="1">
|
| 134 |
+
</div>
|
| 135 |
+
<div class="filter-group">
|
| 136 |
+
<label class="filter-label" for="iterations-input">Reps (-r)</label>
|
| 137 |
<input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
|
| 138 |
</div>
|
| 139 |
</div>
|