// webgpu-bench/js/run/bench-worker.js
// GitHub Actions
// sync from abhijitramesh/webgpu-bench@124659d1dc
// adbad62
// Dedicated Worker that runs a single llama.cpp inference pass. Loaded by
// controller.js and harness.js so we can importScripts() the
// Emscripten-emitted bench.js (which is a classic, non-module script).
//
// Protocol (all messages use { type, ... } tag):
//
// main → worker: {
// type: 'run',
// params: {
// buildType,
// // model load
// nCtx, nGpuLayers,
// // consistency phase (set consistencyPrompt to '' to skip)
// consistencyPrompt, consistencyNPredict, refTokenIds,
// // perf phase
// nPrompt, nGen, nReps, nDepth, noWarmup,
// },
// opfsPath: { rootDir, repo, filename }
// }
//
// worker → main: { type: 'status', status, msg, sinceMs }  // sinceMs is optional
// worker → main: { type: 'progress', fraction, downloaded, total }
// worker → main: { type: 'log', line }
// worker → main: { type: 'result', record } // terminal
//
// Abort: main thread calls worker.terminate(). No cooperative abort — JSPI
// decode loops ignore signals, and termination is the only reliable way to
// stop an in-flight WASM call.
// Thin helpers for the worker -> main side of the message protocol.

// Send an arbitrary protocol message to the main thread.
const post = (message) => {
  return self.postMessage(message);
};

// Forward one log line to the main thread.
const log = (line) => {
  return post({ type: 'log', line });
};

// Report a status change. `sinceMs` is an optional epoch-ms timestamp that is
// forwarded to the controller so the row can tick an elapsed counter while a
// long-running ccall (warmup, big-model rep) is in flight — JSPI doesn't
// yield often enough on CPU paths to drive ticks from here.
const status = (state, msg, sinceMs) => {
  return post({ type: 'status', status: state, msg, sinceMs });
};
// Capture the GPUDevice that llama.cpp's WebGPU backend creates so we can
// destroy() it before the worker terminates. Without this, iOS Safari holds
// Metal allocations from prior runs long enough for the next model load in a
// study sweep to push the tab over its memory limit and trigger Jetsam.
// Installed at module scope so the capture is in place before the bench.js
// glue is importScripts'd and before any C-side requestAdapter/requestDevice
// calls run. The wrapper is one-shot per device: if the backend ever
// re-requests, the latest reference wins.
// Most recent GPUDevice handed out through a wrapped adapter; null until a
// device has been requested.
let capturedGpuDevice = null;
if (typeof self.navigator?.gpu?.requestAdapter === 'function') {
  const gpu = self.navigator.gpu;
  const nativeRequestAdapter = gpu.requestAdapter.bind(gpu);
  // Replace requestAdapter so every adapter it returns gets its
  // requestDevice wrapped (once) to record the resulting device.
  gpu.requestAdapter = async (...adapterArgs) => {
    const adapter = await nativeRequestAdapter(...adapterArgs);
    if (!adapter || typeof adapter.requestDevice !== 'function' || adapter.__deviceWrapped) {
      return adapter;
    }
    const nativeRequestDevice = adapter.requestDevice.bind(adapter);
    adapter.requestDevice = async (...deviceArgs) => {
      const device = await nativeRequestDevice(...deviceArgs);
      // Latest reference wins if the backend ever re-requests a device.
      capturedGpuDevice = device;
      return device;
    };
    // Marker so the same adapter object is never wrapped twice.
    adapter.__deviceWrapped = true;
    return adapter;
  };
}
// Minimum number of compared tokens for the consistency agreement rate to be
// reported. Below this it is statistical noise (e.g. early-EOS models that
// produce 1 token always report 100%).
const CONSISTENCY_MIN_TOKENS = 8;
// Pause between perf reps so the GPU clock state can recover. Without this,
// sustained tg decode reps showed monotonic decay (rep 1 fastest, rep N
// slowest) — looks like Apple's GPU power-state cooldown.
const REP_COOLDOWN_MS = 1000;
// Promise-based delay: resolves (with no value) after `ms` milliseconds.
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
// Caps for the rolling buffers of recent WASM stderr output.
const MAX_WASM_ERROR_LINES = 12;
const MAX_WASM_STDERR_LINES = 20;
// llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
// actually-bad lines as :err so real failures stand out.
// Pick the log tag for one stderr line: '[wasm:err]' when the text contains a
// failure keyword (whole word, case-insensitive) or GGML_ASSERT, otherwise
// '[wasm]'.
function classifyWasmStderr(text) {
  const looksLikeFailure =
    /\b(error|abort(ed)?|failed|fatal|panic|assert)\b|GGML_ASSERT/i.test(text);
  if (looksLikeFailure) {
    return '[wasm:err]';
  }
  return '[wasm]';
}
// Rolling buffers of recent WASM stderr output: `wasmErrLines` holds only the
// lines tagged as errors, `wasmStderrLines` holds every stderr line.
const wasmErrLines = [];
const wasmStderrLines = [];

// Append a stderr line, dropping the oldest once the cap is exceeded.
function recordWasmStderrLine(line) {
  const count = wasmStderrLines.push(line);
  if (count > MAX_WASM_STDERR_LINES) {
    wasmStderrLines.shift();
  }
}

// Append an error-tagged line, dropping the oldest once the cap is exceeded.
function recordWasmErrLine(line) {
  const count = wasmErrLines.push(line);
  if (count > MAX_WASM_ERROR_LINES) {
    wasmErrLines.shift();
  }
}

// Buffered error lines joined with ' || ', or '' when there are none.
function recentWasmErrDetail() {
  return wasmErrLines.length > 0 ? wasmErrLines.join(' || ') : '';
}

// Buffered stderr lines joined with ' || ', or '' when there are none.
function recentWasmStderrDetail() {
  return wasmStderrLines.length > 0 ? wasmStderrLines.join(' || ') : '';
}
// ─── OPFS-backed model loading (wllama-style) ───
// For >2GB GGUFs we can't put the whole file on the WASM heap (TypedArray
// length limits, and it eats the heap budget that KV cache + working memory
// need). Instead, we open a FileSystemSyncAccessHandle on the OPFS file in
// this worker, register a zero-byte stub in MEMFS, and patch MEMFS's
// stream_ops so reads delegate to syncHandle.read(). llama.cpp then loads
// the model via fread (use_mmap=false), which calls the patched stream_ops
// — never copying the bytes through the WASM heap.
//
// Mirrors wllama's src/workers-code/llama-cpp.js (patchMEMFS / opfsAlloc /
// opfsFreeAll). Worker-only: sync access handles aren't available on the
// main thread.
// Map of MEMFS file name -> { syncHandle, size } for OPFS-backed files.
// Created without a prototype so that ordinary MEMFS files whose names match
// Object.prototype members (e.g. "constructor", "toString") are not mistaken
// for OPFS-backed entries by the patched stream ops below.
const opfsHandles = Object.create(null);

// Install read/llseek/mmap overrides on Module.MEMFS.stream_ops so that files
// registered in `opfsHandles` are served from their sync access handle. Files
// that are not registered fall through to the original MEMFS implementation,
// which is kept under the `_read` / `_llseek` / `_mmap` keys.
function patchMEMFS(Module) {
  const m = Module;
  // Idempotent — only install the patches once per Module.
  if (m.MEMFS.stream_ops._read) return;
  m.MEMFS.stream_ops._read = m.MEMFS.stream_ops.read;
  m.MEMFS.stream_ops._llseek = m.MEMFS.stream_ops.llseek;
  m.MEMFS.stream_ops._mmap = m.MEMFS.stream_ops.mmap;

  m.MEMFS.stream_ops.read = function (stream, buffer, offset, length, position) {
    const name = stream.node.name;
    if (opfsHandles[name]) {
      const { syncHandle, size } = opfsHandles[name];
      // Clamp to the bytes remaining in the file; at or past EOF read 0.
      const toRead = Math.min(length, size - position);
      if (toRead <= 0) return 0;
      // View over the destination range so the handle writes in place.
      const view = new Uint8Array(buffer.buffer, buffer.byteOffset + offset, toRead);
      return syncHandle.read(view, { at: position });
    }
    return m.MEMFS.stream_ops._read(stream, buffer, offset, length, position);
  };
  // The per-node-type ops table holds its own reference; keep it in sync.
  m.MEMFS.ops_table.file.stream.read = m.MEMFS.stream_ops.read;

  m.MEMFS.stream_ops.llseek = function (stream, offset, whence) {
    const name = stream.node.name;
    if (opfsHandles[name]) {
      const { size } = opfsHandles[name];
      let newPos = offset;
      if (whence === 1) newPos += stream.position; // SEEK_CUR
      if (whence === 2) newPos += size; // SEEK_END
      if (newPos < 0) throw new Error('SEEK before start of file');
      stream.position = newPos;
      return newPos;
    }
    return m.MEMFS.stream_ops._llseek(stream, offset, whence);
  };
  m.MEMFS.ops_table.file.stream.llseek = m.MEMFS.stream_ops.llseek;

  m.MEMFS.stream_ops.mmap = function (stream, length, position, prot, flags) {
    const name = stream.node.name;
    if (opfsHandles[name]) {
      // OPFS-backed files must never be mmap'd — that would force MEMFS to
      // copy the file into the WASM heap, defeating the OPFS path. The C++
      // side passes use_mmap=0 to avoid this. If we ever land here, the
      // caller forgot to disable mmap.
      throw new Error(`[OPFS] mmap called on "${name}" — bench_load was not invoked with use_mmap=0`);
    }
    return m.MEMFS.stream_ops._mmap(stream, length, position, prot, flags);
  };
  m.MEMFS.ops_table.file.stream.mmap = m.MEMFS.stream_ops.mmap;
}
// Resolve an OPFS path (rootDir + repo segments + filename) to a
// FileSystemFileHandle inside this worker. Works around the iOS Safari
// limitation that FileSystemFileHandle isn't structured-cloneable across
// postMessage — the main thread sends the layout key, and the worker opens
// the handle locally.
// Walk OPFS from the origin root through `rootDir` and each non-empty
// '/'-separated segment of `repo`, then return the file handle for
// `filename`. Nothing is created along the way; a missing directory or file
// rejects. Throws when the storage API is not present on `self.navigator`.
async function resolveOpfsHandle({ rootDir, repo, filename }) {
  const storage = self.navigator?.storage;
  if (!storage?.getDirectory) {
    throw new Error('OPFS not available in this worker');
  }
  const opfsRoot = await storage.getDirectory();
  let cursor = await opfsRoot.getDirectoryHandle(rootDir, { create: false });
  const repoSegments = String(repo).split('/').filter((part) => Boolean(part));
  for (let i = 0; i < repoSegments.length; i += 1) {
    cursor = await cursor.getDirectoryHandle(repoSegments[i], { create: false });
  }
  return cursor.getFileHandle(filename, { create: false });
}
// Open a sync access handle on `fileHandle`, register it in `opfsHandles`
// under `name`, and create a matching placeholder file at '/<name>' in the
// module's FS. Returns the file size reported by the handle.
async function opfsAlloc(Module, name, fileHandle) {
  // createSyncAccessHandle is worker-only and exclusive — only one writer
  // per OPFS file at a time. The caller must ensure no createWritable
  // session is open when we get here.
  const syncHandle = await fileHandle.createSyncAccessHandle();
  const size = syncHandle.getSize();
  opfsHandles[name] = { syncHandle, size };

  const { FS } = Module;
  // Zero-byte placeholder so llama.cpp's fopen() finds the path.
  const emptyContents = new Uint8Array(0);
  FS.createDataFile('/', name, emptyContents, true, false, true);
  // Set usedBytes so fstat()/seek-end report the real file size — the
  // patched llseek consults `size`, but other code (e.g. llama.cpp's GGUF
  // reader sanity-checking the file length) goes through stat first.
  const placeholder = FS.lookupPath('/' + name).node;
  placeholder.usedBytes = size;
  return size;
}
// Close every registered sync handle, remove its placeholder file from the
// module's FS, and drop it from `opfsHandles`. Close/unlink failures are
// deliberately ignored so one bad entry cannot block the rest.
function opfsFreeAll(Module) {
  Object.entries(opfsHandles).forEach(([name, entry]) => {
    const { syncHandle } = entry;
    try {
      syncHandle.close();
    } catch {
      /* already closed */
    }
    try {
      Module.FS.unlink('/' + name);
    } catch {
      /* already gone */
    }
    delete opfsHandles[name];
  });
}
// Aggregate raw nanosecond samples into the llama-bench result shape.
// llama-bench reports avg_ts = (n_tokens * 1e9) / avg_ns and stddev_ts as
// the std of per-sample t/s, computed independently rather than propagated
// from stddev_ns (the mapping isn't linear).
//
// `n_depth` carries through unchanged so downstream consumers can label
// e.g. "pp512 @ d2048" the way llama-bench does (line 1984 of
// llama.cpp/tools/llama-bench/llama-bench.cpp).
// Summarize raw per-rep durations (nanoseconds) as one test record.
//
// Returns { name, n_prompt, n_gen, n_depth, avg_ns, stddev_ns, avg_ts,
// stddev_ts, samples_ns, samples_ts }. Tokens/second for each sample is
// (n_prompt + n_gen) * 1e9 / ns (0 for a non-positive duration); its mean and
// standard deviation are computed from those per-sample rates. Both standard
// deviations use the n-1 denominator and are 0 for a single sample. ns values
// are rounded to integers, t/s values to two decimals. An empty sample list
// yields an all-zero record with empty sample arrays.
function buildTest(name, n_prompt, n_gen, n_depth, samples_ns) {
  const count = samples_ns.length;
  if (count === 0) {
    return {
      name,
      n_prompt,
      n_gen,
      n_depth,
      avg_ns: 0,
      stddev_ns: 0,
      avg_ts: 0,
      stddev_ts: 0,
      samples_ns: [],
      samples_ts: [],
    };
  }

  const meanOf = (values) => values.reduce((acc, v) => acc + v, 0) / count;
  const stddevOf = (values, center) => {
    if (count <= 1) return Math.sqrt(0);
    const sumSquares = values.reduce((acc, v) => acc + (v - center) * (v - center), 0);
    return Math.sqrt(sumSquares / (count - 1));
  };
  const toHundredths = (x) => Math.round(x * 100) / 100;

  const avg_ns = meanOf(samples_ns);
  const stddev_ns = stddevOf(samples_ns, avg_ns);

  const tokensPerRep = n_prompt + n_gen;
  const samples_ts = samples_ns.map((ns) => (ns > 0 ? (1e9 * tokensPerRep) / ns : 0));
  const avg_ts = meanOf(samples_ts);
  const stddev_ts = stddevOf(samples_ts, avg_ts);

  return {
    name,
    n_prompt,
    n_gen,
    n_depth,
    avg_ns: Math.round(avg_ns),
    stddev_ns: Math.round(stddev_ns),
    avg_ts: toHundredths(avg_ts),
    stddev_ts: toHundredths(stddev_ts),
    samples_ns: samples_ns.map((ns) => Math.round(ns)),
    samples_ts: samples_ts.map((ts) => toHundredths(ts)),
  };
}
// Parse the JSON string returned by a C-side bench_* export.
//
// label: prefix used in error messages (normally the export name).
// raw:   the string returned by the ccall.
// Returns the parsed value. Throws an Error prefixed with `label` when the
// text is not valid JSON, when the payload is JSON `null`, or when the
// payload carries a truthy `error` field.
function parseBenchResult(label, raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (e) {
    // Keep the SyntaxError reachable for debugging via `cause`.
    throw new Error(`${label}: invalid JSON from C (${e.message})`, { cause: e });
  }
  // JSON.parse('null') (or a null `raw`) yields null; reading `.error` from
  // it would raise an unlabeled TypeError, so fail with a labeled error.
  if (parsed === null) {
    throw new Error(`${label}: null result from C`);
  }
  if (parsed.error) throw new Error(`${label}: ${parsed.error}`);
  return parsed;
}
// Turn any thrown value into a printable one-line string.
//
// - null/undefined -> ''
// - string -> itself; number/boolean -> String(value)
// - Error -> its message (or String(err) when the message is empty)
// - other objects -> the populated fields among name, type, message, reason,
//   filename, lineno, colno and error, joined with ' | '; failing that, a
//   JSON dump of the own properties; failing that, the Object.prototype
//   toString tag; failing that, 'unknown structured error'
// - anything else -> String(value)
function describeError(err) {
  if (err == null) return '';
  const kind = typeof err;
  if (kind === 'string') return err;
  if (kind === 'number' || kind === 'boolean') return String(err);
  if (err instanceof Error) return err.message || String(err);
  if (kind !== 'object') return String(err);

  const parts = [];
  const addText = (value, prefix) => {
    if (typeof value === 'string' && value) parts.push(`${prefix}${value}`);
  };
  const addPositive = (value, prefix) => {
    if (typeof value === 'number' && value > 0) parts.push(`${prefix}${value}`);
  };

  addText(err.name, '');
  addText(err.type, 'type=');
  addText(err.message, '');
  addText(err.reason, 'reason=');
  addText(err.filename, 'file=');
  addPositive(err.lineno, 'line=');
  addPositive(err.colno, 'col=');

  const nested = err.error;
  if (typeof nested === 'string' && nested) {
    parts.push(`error=${nested}`);
  } else if (nested instanceof Error && nested.message) {
    parts.push(`error=${nested.message}`);
  }

  if (parts.length > 0) return parts.join(' | ');

  try {
    // Copy own properties (including non-enumerable ones) so they serialize.
    const own = {};
    Object.getOwnPropertyNames(err).forEach((key) => {
      own[key] = err[key];
    });
    const json = JSON.stringify(own);
    if (json && json !== '{}') return json;
  } catch {
    // Unserializable (e.g. circular) — fall back to the type tag below.
  }

  const tag = Object.prototype.toString.call(err);
  return tag && tag !== '[object Object]' ? tag : 'unknown structured error';
}
// Build the message for an exception thrown while running `phase`.
// Starts with "<phase> threw WASM exception", adds "(detail)" when the error
// can be described, then appends the buffered error-tagged stderr lines — or,
// when there are none, the general stderr tail — if either is non-empty.
function formatPhaseError(phase, err) {
  const detail = describeError(err);
  const errorLines = recentWasmErrDetail();
  const stderrTail = recentWasmStderrDetail();

  const headline = detail
    ? `${phase} threw WASM exception (${detail})`
    : `${phase} threw WASM exception`;

  if (errorLines) {
    return `${headline} | recent stderr: ${errorLines}`;
  }
  if (stderrTail) {
    return `${headline} | recent stderr tail: ${stderrTail}`;
  }
  return headline;
}
// Call the WASM export named `phase` through Module.ccall in async mode and
// return its result.
//
// On failure, throws a new Error whose message comes from formatPhaseError
// (phase name, error detail, recent stderr). The original exception is kept
// on `cause` so its stack is not lost.
async function ccallPhase(Module, phase, returnType, argTypes, args) {
  try {
    return await Module.ccall(phase, returnType, argTypes, args, { async: true });
  } catch (err) {
    throw new Error(formatPhaseError(phase, err), { cause: err });
  }
}
// Like ccallPhase, but the name used in error messages (`phaseLabel`) is
// separate from the WASM export actually invoked (`exportName`), so callers
// can include context such as rep number or depth.
//
// On failure, throws a new Error whose message comes from formatPhaseError.
// The original exception is kept on `cause` so its stack is not lost.
async function ccallPhaseLabel(Module, phaseLabel, exportName, returnType, argTypes, args) {
  try {
    return await Module.ccall(exportName, returnType, argTypes, args, { async: true });
  } catch (err) {
    throw new Error(formatPhaseError(phaseLabel, err), { cause: err });
  }
}
// Worker entry point. Only 'run' messages are acted on; anything else is
// logged and dropped. Exactly one terminal 'result' message is posted per
// run: the record from runOne on success, or an error record if anything
// inside the try throws.
self.onmessage = async (event) => {
  const request = event.data || {};
  if (request.type !== 'run') {
    log(`worker: ignoring unknown message type "${request.type}"`);
    return;
  }

  try {
    const record = await runOne(request);
    post({ type: 'result', record });
  } catch (err) {
    const failure = {
      status: 'error',
      error: describeError(err),
      metrics: null,
    };
    post({ type: 'result', record: failure });
  }
};
// Run one complete benchmark pass and return its result record.
//
// params:   run configuration (see the protocol comment at the top of the
//           file): buildType, nCtx, nGpuLayers, the consistency* fields,
//           refTokenIds, nPrompt, nGen, nReps, nDepth (default 0), noWarmup.
// opfsPath: { rootDir, repo, filename } locating the model file in OPFS.
//
// Steps: probe the WebGPU adapter (informational), load the Emscripten glue,
// link the OPFS model into MEMFS, init backends, load the model, snapshot
// memory info, optionally run the consistency phase, run the pp/tg perf
// tests, then call bench_exit.
//
// Memory info, the consistency phase and each perf test are soft-fail: they
// log and continue. Everything else rejects. Once the module exists, the
// OPFS sync handle and the captured GPU device are released in a `finally`,
// so they are freed on failure paths as well as on success.
async function runOne({ params, opfsPath }) {
  // Start each run with empty stderr buffers so error messages only quote
  // output from this run.
  wasmErrLines.length = 0;
  wasmStderrLines.length = 0;

  const {
    buildType,
    nCtx,
    nGpuLayers,
    // consistency
    consistencyPrompt,
    consistencyNPredict,
    refTokenIds,
    // perf
    nPrompt,
    nGen,
    nReps,
    nDepth = 0,
    noWarmup,
  } = params;

  // The worker only loads via OPFS now: the main thread downloads to OPFS,
  // we open a FileSystemSyncAccessHandle here, and the patched MEMFS
  // stream_ops route llama.cpp's fread through the sync handle.
  if (!opfsPath) {
    throw new Error('runOne: opfsPath is required (heap/buffer paths removed)');
  }

  const result = {
    status: 'init',
    error: null,
    buildType,
    webgpuAvailable: !!self.navigator?.gpu,
    gpuAdapterInfo: null,
    metrics: null,
    consistency: null,
    output: '',
  };

  // --- WebGPU adapter probe (informational) ---
  if (self.navigator?.gpu) {
    try {
      const adapter = await self.navigator.gpu.requestAdapter();
      if (adapter) {
        const info = adapter.info;
        result.gpuAdapterInfo = info ? {
          vendor: info.vendor || '',
          architecture: info.architecture || '',
          device: info.device || '',
          description: info.description || '',
        } : null;
        log(`WebGPU adapter: ${JSON.stringify(result.gpuAdapterInfo || 'no info')}`);
      } else {
        log('WebGPU: adapter request returned null');
      }
    } catch (err) {
      log(`WebGPU adapter error: ${err.message}`);
    }
  } else {
    log('WebGPU: not available in this worker');
  }

  // --- Load the Emscripten glue once per worker ---
  status('loading_wasm', `Loading WASM module (${buildType})...`);
  try {
    self.importScripts(`/build/${buildType}/bench.js`);
  } catch (err) {
    throw new Error(`importScripts /build/${buildType}/bench.js failed: ${err.message}`);
  }
  if (typeof self.createBenchModule !== 'function') {
    throw new Error('createBenchModule not defined after importScripts');
  }
  const Module = await self.createBenchModule({
    locateFile: (filename) => `/build/${buildType}/${filename}`,
    print: (text) => log(`[wasm] ${text}`),
    printErr: (text) => {
      const tag = classifyWasmStderr(text);
      const line = `${tag} ${text}`;
      recordWasmStderrLine(line);
      if (tag === '[wasm:err]') recordWasmErrLine(line);
      log(line);
    },
    onAbort: (reason) => {
      const msg = `WASM aborted: ${reason}`;
      result.error = msg;
      result.status = 'error';
      log(`ERROR: ${msg}`);
    },
  });
  log('WASM module loaded');

  const toMiB = (bytes) => bytes / (1024 * 1024);

  try {
    // --- Make the model visible to the WASM filesystem ---
    // Open a FileSystemSyncAccessHandle on the OPFS file, register a
    // zero-byte placeholder in MEMFS, and patch MEMFS stream_ops so
    // llama.cpp's fread is routed to the sync handle.
    status('opfs', 'Linking OPFS-backed model into MEMFS...');
    const fileHandle = await resolveOpfsHandle(opfsPath);
    patchMEMFS(Module);
    const size = await opfsAlloc(Module, 'model.gguf', fileHandle);
    log(`OPFS-backed model.gguf registered (${toMiB(size).toFixed(1)} MB)`);
    // Report 100% to keep the existing progress UI happy — the actual
    // download to OPFS happened before the worker spawn.
    post({ type: 'progress', fraction: 1, downloaded: size, total: size });

    // --- Init backend ---
    status('initializing', 'Initializing llama.cpp backends...');
    const initResult = await ccallPhase(Module, 'bench_init', 'number', [], []);
    if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
    log('Backends initialized');

    // --- Load model ---
    // use_mmap=0 — the patched MEMFS mmap throws explicitly, so any
    // accidental mmap attempt surfaces as a clear error rather than a
    // silent heap copy.
    status('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers}, mmap=0)...`);
    const loadResult = await ccallPhase(
      Module,
      'bench_load',
      'number',
      ['string', 'number', 'number', 'number'],
      ['/model.gguf', nCtx, nGpuLayers, 0],
    );
    if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
    log('Model loaded');

    // --- Memory snapshot from llama.cpp ---
    // Captured immediately after bench_load. Soft-fail: if the C side or a
    // backend errors, the run can still produce perf numbers, just without
    // memoryInfo on the record.
    try {
      const raw = await ccallPhase(Module, 'bench_memory_info', 'string', [], []);
      result.memoryInfo = parseBenchResult('bench_memory_info', raw);
      const dev = (result.memoryInfo.devices || [])
        .map(d => `${d.name}(${d.type}) free=${toMiB(d.free).toFixed(0)}MB total=${toMiB(d.total).toFixed(0)}MB`)
        .join(' | ') || 'none';
      log(`Memory: model=${toMiB(result.memoryInfo.model_size).toFixed(0)}MB state=${toMiB(result.memoryInfo.state_size).toFixed(0)}MB | ${dev}`);
    } catch (err) {
      log(`bench_memory_info failed: ${err.message} — continuing without memoryInfo`);
    }

    // --- Consistency phase ---
    // Soft-fail: a failure here logs and falls through to the perf phase
    // rather than aborting the whole run. An empty consistencyPrompt skips
    // the phase entirely.
    if (consistencyPrompt) {
      try {
        status('consistency', 'Running consistency check...', Date.now());
        log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
        const raw = await ccallPhase(Module, 'bench_run', 'string',
          ['string', 'number'],
          [consistencyPrompt, consistencyNPredict]);
        const r = parseBenchResult('bench_run', raw);
        result.output = r.output || '';
        result.consistency = { token_ids: r.token_ids || [] };
        if (refTokenIds) {
          log('bench_eval_tokens — forced-decode vs CPU baseline');
          const evalRaw = await ccallPhase(Module, 'bench_eval_tokens', 'string',
            ['string', 'string'],
            [consistencyPrompt, refTokenIds]);
          const ev = parseBenchResult('bench_eval_tokens', evalRaw);
          result.consistency = { ...result.consistency, ...ev };
          if (ev.n_tokens < CONSISTENCY_MIN_TOKENS) {
            log(
              `Consistency: insufficient samples (${ev.n_tokens} token` +
              `${ev.n_tokens === 1 ? '' : 's'} before EOS) — agreement rate not meaningful`
            );
          } else {
            log(
              `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
              `${ev.n_agree}/${ev.n_tokens})` +
              (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
            );
          }
        }
      } catch (err) {
        log(`Consistency phase failed: ${err.message} — continuing to perf phase`);
      }
    }

    // --- Perf phase (llama-bench style) ---
    // Each test (pp, tg) is wrapped independently so a failure in one doesn't
    // skip the other. A failed test is simply absent from `tests`.
    const wantPp = nPrompt > 0;
    const wantTg = nGen > 0;
    // Test name suffix mirroring llama-bench (e.g. "pp512 @ d2048").
    const depthSuffix = nDepth > 0 ? ` @ d${nDepth}` : '';

    // Untimed bench_set_depth call made before every warmup and timed rep so
    // the KV cache is in a known state. The C side reports via `cached`
    // whether it restored a snapshot or re-ran the prefill.
    const setDepth = async (label) => {
      const phaseLabel = `bench_set_depth(${nDepth}) ${label}`;
      const raw = await ccallPhaseLabel(Module, phaseLabel, 'bench_set_depth', 'string', ['number'], [nDepth]);
      const r = parseBenchResult(phaseLabel, raw);
      if (nDepth > 0) {
        log(`${phaseLabel}: ${r.cached ? 'restored snapshot' : 'prefilled'}`);
      }
    };

    // Run one perf test. `kind` is 'pp' or 'tg' and selects the bench_<kind>
    // export; `nTokens` is the prompt or generation length. Returns the
    // aggregated test record; throws if any ccall or result parse fails.
    const runPerfTest = async (kind, nTokens) => {
      const exportName = `bench_${kind}`;
      const testName = `${kind}${nTokens}${depthSuffix}`;
      if (!noWarmup) {
        // The warmup runs the full-size call (for tg, the whole nGen-token
        // decode loop) so the first timed rep doesn't absorb pipeline-cache /
        // shader-specialization cost.
        status('perf', `warmup ${testName}`, Date.now());
        await setDepth(`${kind} warmup`);
        log(`${exportName}(${nTokens})${depthSuffix} — warmup`);
        const raw = await ccallPhaseLabel(Module, `${exportName} warmup (${nTokens}${depthSuffix})`, exportName, 'string', ['number'], [nTokens]);
        parseBenchResult(`${exportName} warmup`, raw);
      }
      const samples_ns = [];
      for (let i = 0; i < nReps; i++) {
        const rep = `${i + 1}/${nReps}`;
        status('perf', `${testName} ${rep}`, Date.now());
        await setDepth(`${kind} rep ${rep}`);
        const t0 = performance.now();
        const raw = await ccallPhaseLabel(Module, `${exportName} rep ${rep} (${nTokens}${depthSuffix})`, exportName, 'string', ['number'], [nTokens]);
        // performance.now() is in milliseconds; convert the delta to ns.
        const t_ns = (performance.now() - t0) * 1e6;
        parseBenchResult(exportName, raw);
        samples_ns.push(t_ns);
        log(`${testName} run ${rep}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nTokens / t_ns).toFixed(1)} t/s)`);
        if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
      }
      return kind === 'pp'
        ? buildTest(testName, nTokens, 0, nDepth, samples_ns)
        : buildTest(testName, 0, nTokens, nDepth, samples_ns);
    };

    const tests = [];
    const plan = [
      ['pp', nPrompt, wantPp],
      ['tg', nGen, wantTg],
    ];
    for (const [kind, nTokens, wanted] of plan) {
      if (!wanted) continue;
      try {
        tests.push(await runPerfTest(kind, nTokens));
      } catch (err) {
        log(`${kind} test failed: ${err.message}`);
      }
    }
    if (tests.length > 0) {
      result.metrics = {
        tests,
        n_prompt: wantPp ? nPrompt : 0,
        n_gen: wantTg ? nGen : 0,
        n_depth: nDepth,
        n_reps: nReps,
      };
    }

    await ccallPhase(Module, 'bench_exit', null, [], []);
  } finally {
    // Close the sync handle so OPFS can release its lock on the file (and
    // so a subsequent run can open a fresh handle without colliding).
    opfsFreeAll(Module);
    // Eagerly drop GPU buffers. worker.terminate() alone leaves Metal
    // allocations alive on iOS Safari long enough for the next study run to
    // hit Jetsam. Done in `finally` so failed runs release them too.
    if (capturedGpuDevice) {
      try {
        capturedGpuDevice.destroy();
      } catch (err) {
        log(`device.destroy() failed: ${err.message}`);
      }
      capturedGpuDevice = null;
    }
  }

  // NOTE(review): this overwrites an 'error' status set by onAbort if the
  // run still reaches this point — confirm that is intended.
  result.status = 'done';
  const summary = result.metrics?.tests
    ?.map(t => `${t.name}: ${t.avg_ts.toFixed(2)} ± ${t.stddev_ts.toFixed(2)} t/s`)
    .join(' | ') || 'no perf';
  status('done', `Done! ${summary}`);
  return result;
}