// Dedicated Worker that runs a single llama.cpp inference pass. Loaded by
// controller.js and harness.js so we can importScripts() the
// Emscripten-emitted bench.js (which is a classic, non-module script).
//
// Protocol (all messages use { type, ... } tag):
//
//   main → worker: {
//     type: 'run',
//     params: {
//       buildType,
//       // model load
//       nCtx, nGpuLayers,
//       // consistency phase (set consistencyPrompt to '' to skip)
//       consistencyPrompt, consistencyNPredict, refTokenIds,
//       // perf phase
//       nPrompt, nGen, nReps, nDepth, noWarmup,
//     },
//     opfsPath: { rootDir, repo, filename }
//   }
//
//   worker → main: { type: 'status',   status, msg, sinceMs }  // sinceMs optional
//   worker → main: { type: 'progress', fraction, downloaded, total }
//   worker → main: { type: 'log',      line }
//   worker → main: { type: 'result',   record }   // terminal
//
// Abort: main thread calls worker.terminate(). No cooperative abort — JSPI
// decode loops ignore signals, and termination is the only reliable way to
// stop an in-flight WASM call.

// Thin wrappers over postMessage, one per outbound message type.
const post = (msg) => self.postMessage(msg);
const log = (line) => post({ type: 'log', line });
// sinceMs: optional epoch ms. Forwarded to controller so the row ticks an
// elapsed counter while a long-running ccall (warmup, big-model rep) is in
// flight — JSPI doesn't yield often enough on CPU paths to drive ticks here.
const status = (s, msg, sinceMs) => post({ type: 'status', status: s, msg, sinceMs });

// Capture the GPUDevice that llama.cpp's WebGPU backend creates so we can
// destroy() it before the worker terminates. Without this, iOS Safari holds
// Metal allocations from prior runs long enough for the next model load in a
// study sweep to push the tab over its memory limit and trigger Jetsam.
// Installed at module scope so the capture is in place before the bench.js
// glue is importScripts'd and before any C-side requestAdapter/requestDevice
// calls run. The wrapper is one-shot per device: if the backend ever
// re-requests, the latest reference wins.
//
// `globalThis` is the worker global scope (the same object as `self`); using
// it for the module-scope lookup keeps this script loadable outside a worker
// (e.g. under a unit-test runner), where the hook is simply skipped.
let capturedGpuDevice = null;
if (globalThis.navigator?.gpu && typeof globalThis.navigator.gpu.requestAdapter === 'function') {
  const gpu = globalThis.navigator.gpu;
  const origRequestAdapter = gpu.requestAdapter.bind(gpu);
  gpu.requestAdapter = async (...args) => {
    const adapter = await origRequestAdapter(...args);
    // Wrap each adapter object at most once; __deviceWrapped is our marker.
    if (adapter && typeof adapter.requestDevice === 'function' && !adapter.__deviceWrapped) {
      const origRequestDevice = adapter.requestDevice.bind(adapter);
      adapter.requestDevice = async (...devArgs) => {
        const device = await origRequestDevice(...devArgs);
        capturedGpuDevice = device;
        return device;
      };
      adapter.__deviceWrapped = true;
    }
    return adapter;
  };
}

// Below this many compared tokens, the consistency agreement rate is
// statistical noise (e.g. early-EOS models that produce 1 token always
// report 100%).
const CONSISTENCY_MIN_TOKENS = 8;

// Sleep between perf reps so the GPU clock state can recover. Without
// this, sustained tg decode reps showed monotonic decay (rep 1 fastest,
// rep N slowest) — looks like Apple's GPU power-state cooldown.
const REP_COOLDOWN_MS = 1000;
const sleep = (ms) => new Promise(r => setTimeout(r, ms));

// Caps for the two stderr ring buffers below.
const MAX_WASM_ERROR_LINES = 12;
const MAX_WASM_STDERR_LINES = 20;

// llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
// actually-bad lines as :err so real failures stand out.
// Returns '[wasm:err]' for lines matching an error keyword, else '[wasm]'.
function classifyWasmStderr(text) {
  return /\b(error|abort(ed)?|failed|fatal|panic|assert)\b|GGML_ASSERT/i.test(text)
    ? '[wasm:err]'
    : '[wasm]';
}

// Ring buffers of the most recent stderr lines: one holding only the lines
// classified as errors, one holding everything. Both are bounded so a chatty
// module can't grow them without limit.
const wasmErrLines = [];
const wasmStderrLines = [];

// Append `line` to the all-stderr buffer, dropping the oldest entry when full.
function recordWasmStderrLine(line) {
  wasmStderrLines.push(line);
  if (wasmStderrLines.length > MAX_WASM_STDERR_LINES) wasmStderrLines.shift();
}

// Append `line` to the error-only buffer, dropping the oldest entry when full.
function recordWasmErrLine(line) {
  wasmErrLines.push(line);
  if (wasmErrLines.length > MAX_WASM_ERROR_LINES) wasmErrLines.shift();
}

// Buffered error lines joined with ' || ' ('' when the buffer is empty).
function recentWasmErrDetail() {
  return wasmErrLines.join(' || ');
}

// Buffered stderr lines joined with ' || ' ('' when the buffer is empty).
function recentWasmStderrDetail() {
  return wasmStderrLines.join(' || ');
}

// ─── OPFS-backed model loading (wllama-style) ───
// For >2GB GGUFs we can't put the whole file on the WASM heap (TypedArray
// length limits, and it eats the heap budget that KV cache + working memory
// need). Instead, we open a FileSystemSyncAccessHandle on the OPFS file in
// this worker, register a zero-byte stub in MEMFS, and patch MEMFS's
// stream_ops so reads delegate to syncHandle.read(). llama.cpp then loads
// the model via fread (use_mmap=false), which calls the patched stream_ops
// — never copying the bytes through the WASM heap.
//
// Mirrors wllama's src/workers-code/llama-cpp.js (patchMEMFS / opfsAlloc /
// opfsFreeAll). Worker-only: sync access handles aren't available on the
// main thread.

// map MEMFS-name → { syncHandle, size }
// Null-prototype so a MEMFS node that happens to be called "constructor",
// "toString", … is never mistaken for an OPFS-backed entry.
const opfsHandles = Object.create(null);

// Install read/llseek/mmap overrides on Module.MEMFS so that any stream whose
// node name is registered in `opfsHandles` is served from its sync access
// handle; every other stream falls through to the saved original ops.
function patchMEMFS(Module) {
  const m = Module;
  const ops = m.MEMFS.stream_ops;
  const fileStreamOps = m.MEMFS.ops_table.file.stream;

  // Idempotent — only install the patches once per Module.
  if (ops._read) return;
  ops._read = ops.read;
  ops._llseek = ops.llseek;
  ops._mmap = ops.mmap;

  ops.read = function (stream, buffer, offset, length, position) {
    const entry = opfsHandles[stream.node.name];
    if (!entry) return ops._read(stream, buffer, offset, length, position);
    const { syncHandle, size } = entry;
    // Clamp to the bytes remaining in the file; at/after EOF report 0.
    const toRead = Math.min(length, size - position);
    if (toRead <= 0) return 0;
    // View directly onto the destination range so the handle writes in place.
    const view = new Uint8Array(buffer.buffer, buffer.byteOffset + offset, toRead);
    return syncHandle.read(view, { at: position });
  };
  fileStreamOps.read = ops.read;

  ops.llseek = function (stream, offset, whence) {
    const entry = opfsHandles[stream.node.name];
    if (!entry) return ops._llseek(stream, offset, whence);
    let newPos = offset;
    if (whence === 1) newPos += stream.position; // SEEK_CUR
    if (whence === 2) newPos += entry.size; // SEEK_END
    if (newPos < 0) {
      // Emscripten's syscall layer only converts FS.ErrnoError into an errno
      // return; any other exception escapes as a JS throw through the WASM
      // call. Report EINVAL (28) the way stock MEMFS does when FS is exported,
      // and fall back to the plain Error otherwise.
      const ErrnoError = m.FS?.ErrnoError;
      if (typeof ErrnoError === 'function') throw new ErrnoError(28);
      throw new Error('SEEK before start of file');
    }
    stream.position = newPos;
    return newPos;
  };
  fileStreamOps.llseek = ops.llseek;

  ops.mmap = function (stream, length, position, prot, flags) {
    const name = stream.node.name;
    if (opfsHandles[name]) {
      // OPFS-backed files must never be mmap'd — that would force MEMFS to
      // copy the file into the WASM heap, defeating the OPFS path. The C++
      // side passes use_mmap=0 to avoid this. If we ever land here, the
      // caller forgot to disable mmap.
      throw new Error(`[OPFS] mmap called on "${name}" — bench_load was not invoked with use_mmap=0`);
    }
    return ops._mmap(stream, length, position, prot, flags);
  };
  fileStreamOps.mmap = ops.mmap;
}

// Resolve an OPFS path (rootDir + repo segments + filename) to a
// FileSystemFileHandle inside this worker.
// Works around the iOS Safari
// limitation that FileSystemFileHandle isn't structured-cloneable across
// postMessage — main thread sends the layout key, worker opens the
// handle locally.
//
// `repo` is split on '/' and each non-empty segment is opened as a nested
// directory under `rootDir`. Nothing is created: every lookup passes
// { create: false }, so a missing directory or file rejects.
// Throws if the worker has no navigator.storage.getDirectory.
async function resolveOpfsHandle({ rootDir, repo, filename }) {
  if (!self.navigator?.storage?.getDirectory) {
    throw new Error('OPFS not available in this worker');
  }
  const lookup = { create: false };
  let dir = await self.navigator.storage.getDirectory();
  dir = await dir.getDirectoryHandle(rootDir, lookup);
  for (const seg of String(repo).split('/').filter(Boolean)) {
    dir = await dir.getDirectoryHandle(seg, lookup);
  }
  return dir.getFileHandle(filename, lookup);
}

// Open a sync access handle on `fileHandle`, register it in `opfsHandles`
// under `name`, and create the matching MEMFS stub at '/<name>'.
// Returns the file size in bytes.
//
// If anything after the handle is opened throws, the handle is closed and the
// registration rolled back before the error is rethrown — the handle holds an
// exclusive lock on the OPFS file, so leaking it would block later opens.
async function opfsAlloc(Module, name, fileHandle) {
  // createSyncAccessHandle is worker-only and exclusive — only one writer
  // per OPFS file at a time. Caller must ensure no createWritable session
  // is open when we land here.
  const syncHandle = await fileHandle.createSyncAccessHandle();
  let stubCreated = false;
  try {
    const size = syncHandle.getSize();
    opfsHandles[name] = { syncHandle, size };
    // Zero-byte placeholder so llama.cpp's fopen() finds the path.
    Module.FS.createDataFile('/', name, new Uint8Array(0), true, false, true);
    stubCreated = true;
    // Set usedBytes so fstat()/seek-end report the real file size — our
    // patched llseek consults size, but other code (e.g. llama.cpp's GGUF
    // reader sanity-checking the file length) goes through stat first.
    Module.FS.lookupPath('/' + name).node.usedBytes = size;
    return size;
  } catch (err) {
    // Only undo what this call did: drop our own registration, close our
    // handle, and remove the stub only if we were the ones who created it.
    if (opfsHandles[name]?.syncHandle === syncHandle) delete opfsHandles[name];
    try { syncHandle.close(); } catch { /* already closed */ }
    if (stubCreated) {
      try { Module.FS.unlink('/' + name); } catch { /* already gone */ }
    }
    throw err;
  }
}

// Close every registered sync handle, unlink its MEMFS stub, and clear the
// registry. Best-effort: close/unlink failures are ignored so one bad entry
// can't prevent the rest from being released.
function opfsFreeAll(Module) {
  for (const [name, { syncHandle }] of Object.entries(opfsHandles)) {
    try { syncHandle.close(); } catch { /* already closed */ }
    try { Module.FS.unlink('/' + name); } catch { /* already gone */ }
    delete opfsHandles[name];
  }
}

// Aggregate raw nanosecond samples into the llama-bench result shape.
// llama-bench reports avg_ts = (n_tokens * 1e9) / avg_ns and stddev_ts as
// the std of per-sample t/s, computed independently rather than propagated
// from stddev_ns (the mapping isn't linear).
//
// `n_depth` carries through unchanged so downstream consumers can label
// e.g. "pp512 @ d2048" the way llama-bench does (line 1984 of
// llama.cpp/tools/llama-bench/llama-bench.cpp).
//
// samples_ns: per-rep wall times in nanoseconds. Standard deviations use the
// sample (n - 1) form and are 0 for a single sample. An empty samples_ns
// yields an all-zero record.
function buildTest(name, n_prompt, n_gen, n_depth, samples_ns) {
  const n = samples_ns.length;
  if (n === 0) {
    return {
      name, n_prompt, n_gen, n_depth,
      avg_ns: 0, stddev_ns: 0, avg_ts: 0, stddev_ts: 0,
      samples_ns: [], samples_ts: [],
    };
  }

  const mean = (xs) => xs.reduce((a, b) => a + b, 0) / n;
  const sampleStddev = (xs, avg) => {
    if (n <= 1) return 0;
    const sumSq = xs.reduce((a, b) => a + (b - avg) * (b - avg), 0);
    return Math.sqrt(sumSq / (n - 1));
  };
  const round2 = (x) => Math.round(x * 100) / 100;

  const avg_ns = mean(samples_ns);
  const stddev_ns = sampleStddev(samples_ns, avg_ns);

  const n_tokens = n_prompt + n_gen;
  // A non-positive sample would divide by zero; report 0 t/s for it instead.
  const samples_ts = samples_ns.map((t) => (t > 0 ? (1e9 * n_tokens) / t : 0));
  const avg_ts = mean(samples_ts);
  const stddev_ts = sampleStddev(samples_ts, avg_ts);

  return {
    name, n_prompt, n_gen, n_depth,
    avg_ns: Math.round(avg_ns),
    stddev_ns: Math.round(stddev_ns),
    avg_ts: round2(avg_ts),
    stddev_ts: round2(stddev_ts),
    samples_ns: samples_ns.map((t) => Math.round(t)),
    samples_ts: samples_ts.map(round2),
  };
}

// Parse a JSON string returned by a C export. Throws (prefixed with `label`)
// when the text isn't valid JSON or when the payload carries an `error` field.
function parseBenchResult(label, raw) {
  let r;
  try {
    r = JSON.parse(raw);
  } catch (e) {
    throw new Error(`${label}: invalid JSON from C (${e.message})`, { cause: e });
  }
  if (r.error) throw new Error(`${label}: ${r.error}`);
  return r;
}

// Turn any thrown value into a human-readable string. Handles primitives,
// Error instances, and event-like plain objects (name/type/message/reason/
// filename/lineno/colno/error); falls back to a JSON dump of own properties,
// then to the object's toString tag.
function describeError(err) {
  if (err == null) return '';
  if (typeof err === 'string') return err;
  if (typeof err === 'number' || typeof err === 'boolean') return String(err);
  if (err instanceof Error) return err.message || String(err);
  if (typeof err !== 'object') return String(err);

  const parts = [];
  if (typeof err.name === 'string' && err.name) parts.push(err.name);
  if (typeof err.type === 'string' && err.type) parts.push(`type=${err.type}`);
  if (typeof err.message === 'string' && err.message) parts.push(err.message);
  if (typeof err.reason === 'string' && err.reason) parts.push(`reason=${err.reason}`);
  if (typeof err.filename === 'string' && err.filename) parts.push(`file=${err.filename}`);
  if (typeof err.lineno === 'number' && err.lineno > 0) parts.push(`line=${err.lineno}`);
  if (typeof err.colno === 'number' && err.colno > 0) parts.push(`col=${err.colno}`);
  if (typeof err.error === 'string' && err.error) parts.push(`error=${err.error}`);
  else if (err.error instanceof Error && err.error.message) parts.push(`error=${err.error.message}`);
  if (parts.length > 0) return parts.join(' | ');

  try {
    // getOwnPropertyNames also picks up non-enumerable own properties that
    // a bare JSON.stringify(err) would skip.
    const own = {};
    for (const key of Object.getOwnPropertyNames(err)) {
      own[key] = err[key];
    }
    const json = JSON.stringify(own);
    if (json && json !== '{}') return json;
  } catch {
    // fall through
  }
  const tag = Object.prototype.toString.call(err);
  if (tag && tag !== '[object Object]') return tag;
  return 'unknown structured error';
}

// Build the message for an exception thrown out of a WASM call: the phase
// name, the described exception (if any), and the most useful stderr context
// available — buffered error lines first, otherwise the plain stderr tail.
function formatPhaseError(phase, err) {
  const detail = describeError(err);
  const stderr = recentWasmErrDetail();
  const stderrTail = recentWasmStderrDetail();
  const head = detail
    ? `${phase} threw WASM exception (${detail})`
    : `${phase} threw WASM exception`;
  if (stderr) return `${head} | recent stderr: ${stderr}`;
  if (stderrTail) return `${head} | recent stderr tail: ${stderrTail}`;
  return head;
}

// Async ccall of export `phase`, using the export name as the error label.
async function ccallPhase(Module, phase, returnType, argTypes, args) {
  return ccallPhaseLabel(Module, phase, phase, returnType, argTypes, args);
}

// Async ccall of `exportName`. Any exception is rethrown as an Error whose
// message comes from formatPhaseError(phaseLabel, …); the original exception
// is kept as `cause`.
async function ccallPhaseLabel(Module, phaseLabel, exportName, returnType, argTypes, args) {
  try {
    return await Module.ccall(exportName, returnType, argTypes, args, { async: true });
  } catch (err) {
    throw new Error(formatPhaseError(phaseLabel, err), { cause: err });
  }
}

// Message entry point. Only { type: 'run' } is handled; anything else is
// logged and ignored. Always answers a run with exactly one 'result' message —
// the record from runOne, or an error record if runOne throws.
// (`globalThis` is the worker global, i.e. the same object as `self`; using it
// here keeps the script loadable outside a worker.)
globalThis.onmessage = async (e) => {
  const { type } = e.data || {};
  if (type !== 'run') {
    log(`worker: ignoring unknown message type "${type}"`);
    return;
  }
  try {
    const record = await runOne(e.data);
    post({ type: 'result', record });
  } catch (err) {
    post({
      type: 'result',
      record: {
        status: 'error',
        error: describeError(err),
        metrics: null,
      },
    });
  }
};

// Informational WebGPU adapter probe. Logs what it finds and returns
// { vendor, architecture, device, description } or null. Never throws.
async function probeGpuAdapter() {
  if (!self.navigator?.gpu) {
    log('WebGPU: not available in this worker');
    return null;
  }
  try {
    const adapter = await self.navigator.gpu.requestAdapter();
    if (!adapter) {
      log('WebGPU: adapter request returned null');
      return null;
    }
    const info = adapter.info;
    const summary = info
      ? {
          vendor: info.vendor || '',
          architecture: info.architecture || '',
          device: info.device || '',
          description: info.description || '',
        }
      : null;
    log(`WebGPU adapter: ${JSON.stringify(summary || 'no info')}`);
    return summary;
  } catch (err) {
    log(`WebGPU adapter error: ${describeError(err)}`);
    return null;
  }
}

// Run bench_run and, when reference token ids are supplied, bench_eval_tokens,
// storing the outcome on `result.output` / `result.consistency`. Throws on any
// failure; the caller decides whether that is fatal.
async function runConsistencyPhase(Module, { consistencyPrompt, consistencyNPredict, refTokenIds }, result) {
  status('consistency', 'Running consistency check...', Date.now());
  log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
  const raw = await ccallPhase(Module, 'bench_run', 'string',
    ['string', 'number'], [consistencyPrompt, consistencyNPredict]);
  const r = parseBenchResult('bench_run', raw);
  result.output = r.output || '';
  result.consistency = { token_ids: r.token_ids || [] };
  if (!refTokenIds) return;

  log('bench_eval_tokens — forced-decode vs CPU baseline');
  const evalRaw = await ccallPhase(Module, 'bench_eval_tokens', 'string',
    ['string', 'string'], [consistencyPrompt, refTokenIds]);
  const ev = parseBenchResult('bench_eval_tokens', evalRaw);
  result.consistency = { ...result.consistency, ...ev };
  if (ev.n_tokens < CONSISTENCY_MIN_TOKENS) {
    log(
      `Consistency: insufficient samples (${ev.n_tokens} token` +
      `${ev.n_tokens === 1 ? '' : 's'} before EOS) — agreement rate not meaningful`
    );
    return;
  }
  log(
    `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
    `${ev.n_agree}/${ev.n_tokens})` +
    (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
  );
}

// Each timed rep is preceded by an untimed bench_set_depth call so the KV
// cache is in a known state. The C side caches the post-prefill snapshot,
// so reps 2..N at the same depth restore from snapshot instead of
// re-running the prefill (mirroring llama-bench's `cstate` reuse).
async function setBenchDepth(Module, nDepth, label) {
  const phaseLabel = `bench_set_depth(${nDepth}) ${label}`;
  const raw = await ccallPhaseLabel(Module, phaseLabel, 'bench_set_depth', 'string', ['number'], [nDepth]);
  const r = parseBenchResult(phaseLabel, raw);
  if (nDepth > 0) {
    log(`${phaseLabel}: ${r.cached ? 'restored snapshot' : 'prefilled'}`);
  }
}

// Run one llama-bench style test: optional warmup, then nReps timed calls of
// bench_<kind>(nTokens). `kind` is 'pp' (prompt processing) or 'tg' (token
// generation). Returns the buildTest() record; throws if any call fails, in
// which case samples collected so far are discarded by the caller.
async function runPerfTest(Module, kind, nTokens, { nReps, nDepth, noWarmup }) {
  // Test name suffix mirroring llama-bench (e.g. "pp512 @ d2048").
  const depthSuffix = nDepth > 0 ? ` @ d${nDepth}` : '';
  const exportName = `bench_${kind}`;
  const testName = `${kind}${nTokens}${depthSuffix}`;

  if (!noWarmup) {
    // The warmup runs the full nTokens workload, not a single token. For tg,
    // a 1-token warmup exercises the decode kernel once, which leaves the
    // first timed rep absorbing pipeline-cache / shader-specialize cost on
    // every subsequent step.
    status('perf', `warmup ${testName}`, Date.now());
    await setBenchDepth(Module, nDepth, `${kind} warmup`);
    log(`${exportName}(${nTokens})${depthSuffix} — warmup`);
    const raw = await ccallPhaseLabel(Module, `${exportName} warmup (${nTokens}${depthSuffix})`,
      exportName, 'string', ['number'], [nTokens]);
    parseBenchResult(`${exportName} warmup`, raw);
  }

  const samples_ns = [];
  for (let i = 0; i < nReps; i++) {
    const rep = `${i + 1}/${nReps}`;
    status('perf', `${testName} ${rep}`, Date.now());
    await setBenchDepth(Module, nDepth, `${kind} rep ${rep}`);
    // Only the bench call itself is timed; the depth reset above is not.
    const t0 = performance.now();
    const raw = await ccallPhaseLabel(Module, `${exportName} rep ${rep} (${nTokens}${depthSuffix})`,
      exportName, 'string', ['number'], [nTokens]);
    const t_ns = (performance.now() - t0) * 1e6;
    parseBenchResult(exportName, raw);
    samples_ns.push(t_ns);
    log(`${testName} run ${rep}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nTokens / t_ns).toFixed(1)} t/s)`);
    if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
  }

  return kind === 'pp'
    ? buildTest(testName, nTokens, 0, nDepth, samples_ns)
    : buildTest(testName, 0, nTokens, nDepth, samples_ns);
}

// Release everything a run holds outside the WASM heap.
function releaseRunResources(Module) {
  // Close the sync handle so OPFS can release its lock on the file (and
  // so a subsequent run can open a fresh handle without colliding).
  opfsFreeAll(Module);

  // Eagerly drop GPU buffers. worker.terminate() alone leaves Metal
  // allocations alive on iOS Safari long enough for the next study run to
  // hit Jetsam — destroy() returns the memory synchronously.
  if (capturedGpuDevice) {
    try {
      capturedGpuDevice.destroy();
    } catch (err) {
      log(`device.destroy() failed: ${describeError(err)}`);
    }
    capturedGpuDevice = null;
  }
}

// Execute one full benchmark run: probe WebGPU, load the WASM module, link
// the OPFS-backed model, init + load, then the optional consistency phase and
// the pp/tg perf tests. Resolves with the result record (status 'done').
//
// Throws on hard failures (missing opfsPath, glue/module load, bench_init /
// bench_load / bench_exit). The memory snapshot, consistency phase and each
// perf test are soft-fail: they log and the run continues.
//
// Once the module exists, the OPFS sync handle and the captured GPU device
// are released in a `finally`, so a failed run frees them just like a
// successful one.
async function runOne({ params, opfsPath }) {
  // Start each run with empty stderr context.
  wasmErrLines.length = 0;
  wasmStderrLines.length = 0;

  const {
    buildType,
    nCtx, nGpuLayers,
    // consistency
    consistencyPrompt, consistencyNPredict, refTokenIds,
    // perf
    nPrompt, nGen, nReps, nDepth = 0, noWarmup,
  } = params;

  // The worker only loads via OPFS now: main thread downloads to OPFS,
  // we open a FileSystemSyncAccessHandle here, and patched MEMFS
  // stream_ops route llama.cpp's fread through the sync handle. That
  // path scales past the WASM heap budget and shares one numerical
  // implementation across every surface.
  if (!opfsPath) {
    throw new Error('runOne: opfsPath is required (heap/buffer paths removed)');
  }

  const result = {
    status: 'init',
    error: null,
    buildType,
    webgpuAvailable: !!self.navigator?.gpu,
    gpuAdapterInfo: null,
    metrics: null,
    consistency: null,
    output: '',
  };

  // ─── WebGPU adapter probe (informational) ───
  result.gpuAdapterInfo = await probeGpuAdapter();

  // ─── Load the Emscripten glue once per worker ───
  status('loading_wasm', `Loading WASM module (${buildType})...`);
  try {
    self.importScripts(`/build/${buildType}/bench.js`);
  } catch (err) {
    throw new Error(`importScripts /build/${buildType}/bench.js failed: ${describeError(err)}`, { cause: err });
  }
  if (typeof self.createBenchModule !== 'function') {
    throw new Error('createBenchModule not defined after importScripts');
  }

  const Module = await self.createBenchModule({
    locateFile: (filename) => `/build/${buildType}/${filename}`,
    print: (text) => log(`[wasm] ${text}`),
    printErr: (text) => {
      const tag = classifyWasmStderr(text);
      const line = `${tag} ${text}`;
      recordWasmStderrLine(line);
      if (tag === '[wasm:err]') recordWasmErrLine(line);
      log(line);
    },
    onAbort: (reason) => {
      const msg = `WASM aborted: ${reason}`;
      result.error = msg;
      result.status = 'error';
      log(`ERROR: ${msg}`);
    },
  });
  log('WASM module loaded');

  try {
    // ─── Make the model visible to the WASM filesystem ───
    // Open a FileSystemSyncAccessHandle on the OPFS file, register a
    // zero-byte placeholder in MEMFS, and patch MEMFS stream_ops so
    // llama.cpp's fread is routed to the sync handle. Bytes never touch
    // the WASM heap, so the model size is bounded by OPFS quota, not heap.
    status('opfs', 'Linking OPFS-backed model into MEMFS...');
    const fileHandle = await resolveOpfsHandle(opfsPath);
    patchMEMFS(Module);
    const size = await opfsAlloc(Module, 'model.gguf', fileHandle);
    log(`OPFS-backed model.gguf registered (${(size / (1024 * 1024)).toFixed(1)} MB)`);
    // Report 100% to keep the existing progress UI happy — the actual
    // download to OPFS happened before the worker spawn.
    post({ type: 'progress', fraction: 1, downloaded: size, total: size });

    // ─── Init backend ───
    status('initializing', 'Initializing llama.cpp backends...');
    const initResult = await ccallPhase(Module, 'bench_init', 'number', [], []);
    if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
    log('Backends initialized');

    // ─── Load model ───
    // use_mmap=0 — the patched MEMFS mmap throws explicitly, so any
    // accidental mmap attempt surfaces as a clear error rather than a
    // silent heap copy.
    status('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers}, mmap=0)...`);
    const loadResult = await ccallPhase(
      Module, 'bench_load', 'number',
      ['string', 'number', 'number', 'number'],
      ['/model.gguf', nCtx, nGpuLayers, 0],
    );
    if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
    log('Model loaded');

    // ─── Memory snapshot from llama.cpp ───
    // Captured immediately after bench_load so model_size reflects the loaded
    // model and per-device free counters reflect post-allocation state. Wrapped
    // in try/catch — if the C side or a backend errors, the run can still
    // produce perf numbers, just without memoryInfo on the record.
    try {
      const raw = await ccallPhase(Module, 'bench_memory_info', 'string', [], []);
      result.memoryInfo = parseBenchResult('bench_memory_info', raw);
      const toMb = (bytes) => (bytes / (1024 * 1024)).toFixed(0);
      const dev = (result.memoryInfo.devices || [])
        .map(d => `${d.name}(${d.type}) free=${toMb(d.free)}MB total=${toMb(d.total)}MB`)
        .join(' | ') || 'none';
      log(`Memory: model=${toMb(result.memoryInfo.model_size)}MB state=${toMb(result.memoryInfo.state_size)}MB | ${dev}`);
    } catch (err) {
      log(`bench_memory_info failed: ${describeError(err)} — continuing without memoryInfo`);
    }

    // ─── Consistency phase ───
    // Soft-fail: a failure here logs and falls through to the perf phase
    // rather than aborting the whole run. Some devices/models can't survive
    // bench_run (e.g. unsupported op, OOM mid-decode) but can still produce
    // useful pp/tg numbers via synthetic-token paths.
    if (consistencyPrompt) {
      try {
        await runConsistencyPhase(Module, { consistencyPrompt, consistencyNPredict, refTokenIds }, result);
      } catch (err) {
        log(`Consistency phase failed: ${describeError(err)} — continuing to perf phase`);
      }
    }

    // ─── Perf phase (llama-bench style) ───
    // Each test (pp, tg) is wrapped independently so a failure in one doesn't
    // skip the other. Empty samples_ns produces a buildTest with avg_ts=0,
    // which the dashboard renders as a dash.
    const wantPp = nPrompt > 0;
    const wantTg = nGen > 0;
    const tests = [];
    const perfOpts = { nReps, nDepth, noWarmup };
    for (const [kind, wanted, nTokens] of [['pp', wantPp, nPrompt], ['tg', wantTg, nGen]]) {
      if (!wanted) continue;
      try {
        tests.push(await runPerfTest(Module, kind, nTokens, perfOpts));
      } catch (err) {
        log(`${kind} test failed: ${describeError(err)}`);
      }
    }
    if (tests.length > 0) {
      result.metrics = {
        tests,
        n_prompt: wantPp ? nPrompt : 0,
        n_gen: wantTg ? nGen : 0,
        n_depth: nDepth,
        n_reps: nReps,
      };
    }

    await ccallPhase(Module, 'bench_exit', null, [], []);
  } finally {
    // Runs on success and on every failure path above.
    releaseRunResources(Module);
  }

  result.status = 'done';
  const summary = result.metrics?.tests
    ?.map(t => `${t.name}: ${t.avg_ts.toFixed(2)} ± ${t.stddev_ts.toFixed(2)} t/s`)
    .join(' | ') || 'no perf';
  status('done', `Done! ${summary}`);
  return result;
}