Spaces:
Running
Running
// Dedicated Worker that runs a single llama.cpp inference pass. Loaded by
// controller.js and harness.js so we can importScripts() the
// Emscripten-emitted bench.js (which is a classic, non-module script).
//
// Protocol (all messages use a { type, ... } tag):
//
// main → worker: {
//   type: 'run',
//   params: {
//     buildType,
//     // model load
//     nCtx, nGpuLayers,
//     // consistency phase (set consistencyPrompt to '' to skip)
//     consistencyPrompt, consistencyNPredict, refTokenIds,
//     // perf phase
//     nPrompt, nGen, nReps, nDepth, noWarmup,
//   },
//   opfsPath: { rootDir, repo, filename }
// }
//
// worker → main: { type: 'status', status, msg, sinceMs }  // sinceMs is optional
// worker → main: { type: 'progress', fraction, downloaded, total }
// worker → main: { type: 'log', line }
// worker → main: { type: 'result', record }  // terminal
//
// Abort: the main thread calls worker.terminate(). There is no cooperative
// abort — JSPI decode loops ignore signals, and termination is the only
// reliable way to stop an in-flight WASM call.
// Sends one protocol message to the main thread.
const post = (message) => {
  self.postMessage(message);
};

// Forwards a single log line to the main thread as a `log` message.
const log = (text) => {
  post({ type: 'log', line: text });
};

// Reports a phase change as a `status` message.
// sinceMs: optional epoch ms. Forwarded to the controller so the row ticks an
// elapsed counter while a long-running ccall (warmup, big-model rep) is in
// flight — JSPI doesn't yield often enough on CPU paths to drive ticks here.
const status = (phase, text, sinceMs) => {
  post({ type: 'status', status: phase, msg: text, sinceMs });
};
// Capture the GPUDevice that llama.cpp's WebGPU backend creates so we can
// destroy() it before the worker terminates. Without this, iOS Safari holds
// Metal allocations from prior runs long enough for the next model load in a
// study sweep to push the tab over its memory limit and trigger Jetsam.
// Installed at module scope so the capture is in place before the bench.js
// glue is importScripts'd and before any C-side requestAdapter/requestDevice
// calls run. Each adapter is wrapped at most once (guarded by the
// __deviceWrapped flag); if requestDevice is called again, the most recent
// device replaces the stored reference.
let capturedGpuDevice = null;
{
  const gpu = self.navigator?.gpu;
  if (gpu && typeof gpu.requestAdapter === 'function') {
    const nativeRequestAdapter = gpu.requestAdapter.bind(gpu);

    // Patches adapter.requestDevice so every device it hands out is recorded.
    const wrapAdapter = (adapter) => {
      const wrappable = adapter
        && typeof adapter.requestDevice === 'function'
        && !adapter.__deviceWrapped;
      if (!wrappable) return adapter;

      const nativeRequestDevice = adapter.requestDevice.bind(adapter);
      adapter.requestDevice = async (...deviceArgs) => {
        const device = await nativeRequestDevice(...deviceArgs);
        capturedGpuDevice = device;
        return device;
      };
      adapter.__deviceWrapped = true;
      return adapter;
    };

    gpu.requestAdapter = async (...adapterArgs) =>
      wrapAdapter(await nativeRequestAdapter(...adapterArgs));
  }
}
// Minimum number of compared tokens for the consistency agreement rate to be
// reported. Below this the rate is statistical noise (e.g. early-EOS models
// that produce 1 token always report 100%).
const CONSISTENCY_MIN_TOKENS = 8;

// Pause between perf reps so the GPU clock state can recover. Without this,
// sustained tg decode reps showed monotonic decay (rep 1 fastest, rep N
// slowest) — looks like Apple's GPU power-state cooldown.
const REP_COOLDOWN_MS = 1000;

// Returns a promise that resolves (with no value) after `ms` milliseconds.
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
// Capacities of the two rolling stderr buffers below.
const MAX_WASM_ERROR_LINES = 12;
const MAX_WASM_STDERR_LINES = 20;

// llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
// actually-bad lines as :err so real failures stand out.
const WASM_ERR_PATTERN = /\b(error|abort(ed)?|failed|fatal|panic|assert)\b|GGML_ASSERT/i;

// Returns '[wasm:err]' for lines that look like failures, '[wasm]' otherwise.
function classifyWasmStderr(text) {
  if (WASM_ERR_PATTERN.test(text)) return '[wasm:err]';
  return '[wasm]';
}

// Most recent error-tagged stderr lines (oldest first).
const wasmErrLines = [];
// Most recent stderr lines of any kind (oldest first).
const wasmStderrLines = [];

// Appends `line` to `buffer`, dropping the oldest entry once the buffer
// holds more than `capacity` lines.
function pushBounded(buffer, line, capacity) {
  buffer.push(line);
  if (buffer.length > capacity) buffer.shift();
}

// Remembers a stderr line in the general tail buffer.
function recordWasmStderrLine(line) {
  pushBounded(wasmStderrLines, line, MAX_WASM_STDERR_LINES);
}

// Remembers a stderr line in the error-only buffer.
function recordWasmErrLine(line) {
  pushBounded(wasmErrLines, line, MAX_WASM_ERROR_LINES);
}

// Joins the buffered error lines with ' || '; '' when nothing is buffered.
function recentWasmErrDetail() {
  return wasmErrLines.join(' || ');
}

// Joins the buffered stderr tail with ' || '; '' when nothing is buffered.
function recentWasmStderrDetail() {
  return wasmStderrLines.join(' || ');
}
// ─── OPFS-backed model loading (wllama-style) ───
// For >2GB GGUFs we can't put the whole file on the WASM heap (TypedArray
// length limits, and it eats the heap budget that KV cache + working memory
// need). Instead, we open a FileSystemSyncAccessHandle on the OPFS file in
// this worker, register a zero-byte stub in MEMFS, and patch MEMFS's
// stream_ops so reads delegate to syncHandle.read(). llama.cpp then loads
// the model via fread (use_mmap=false), which calls the patched stream_ops
// — the file is never staged on the WASM heap as a whole.
//
// Mirrors wllama's src/workers-code/llama-cpp.js (patchMEMFS / opfsAlloc /
// opfsFreeAll). Worker-only: sync access handles aren't available on the
// main thread.
const opfsHandles = {}; // map MEMFS-name → { syncHandle, size }

/**
 * Patches MEMFS's read / llseek / mmap stream ops so that files registered in
 * `opfsHandles` are served from their OPFS sync handle. Files not in the map
 * fall through to the stock MEMFS implementation, which is preserved as
 * `_read` / `_llseek` / `_mmap`.
 *
 * Idempotent: a Module whose stream_ops already carry `_read` is left alone.
 *
 * @param {object} Module - Emscripten module exposing `MEMFS` (and, when
 *   available, `FS`, used only for `FS.ErrnoError`).
 */
function patchMEMFS(Module) {
  const { MEMFS, FS } = Module;
  const streamOps = MEMFS.stream_ops;
  if (streamOps._read) return;

  streamOps._read = streamOps.read;
  streamOps._llseek = streamOps.llseek;
  streamOps._mmap = streamOps.mmap;

  streamOps.read = function (stream, buffer, offset, length, position) {
    const backing = opfsHandles[stream.node.name];
    if (!backing) return streamOps._read(stream, buffer, offset, length, position);
    // Clamp to the bytes remaining in the file; at/after EOF report 0.
    const toRead = Math.min(length, backing.size - position);
    if (toRead <= 0) return 0;
    // Read straight into the caller's destination window.
    const view = new Uint8Array(buffer.buffer, buffer.byteOffset + offset, toRead);
    return backing.syncHandle.read(view, { at: position });
  };

  streamOps.llseek = function (stream, offset, whence) {
    const backing = opfsHandles[stream.node.name];
    if (!backing) return streamOps._llseek(stream, offset, whence);
    let newPos = offset;
    if (whence === 1) newPos += stream.position; // SEEK_CUR
    if (whence === 2) newPos += backing.size; // SEEK_END
    if (newPos < 0) {
      // Raise the filesystem's own error type so the syscall layer can turn
      // it into an errno for the C caller instead of an uncaught JS
      // exception. 28 is EINVAL in Emscripten's errno numbering, matching
      // what the stock MEMFS llseek raises for a negative position. Falls
      // back to a plain Error when FS.ErrnoError is not exposed.
      throw FS?.ErrnoError
        ? new FS.ErrnoError(28)
        : new Error('SEEK before start of file');
    }
    stream.position = newPos;
    return newPos;
  };

  streamOps.mmap = function (stream, length, position, prot, flags) {
    const { name } = stream.node;
    if (opfsHandles[name]) {
      // OPFS-backed files must never be mmap'd — that would force MEMFS to
      // copy the file into the WASM heap, defeating the OPFS path. The C++
      // side passes use_mmap=0 to avoid this. If we ever land here, the
      // caller forgot to disable mmap, so fail loudly.
      throw new Error(`[OPFS] mmap called on "${name}" — bench_load was not invoked with use_mmap=0`);
    }
    return streamOps._mmap(stream, length, position, prot, flags);
  };

  // MEMFS keeps a second table of per-node-type ops; point its file entries
  // at the patched functions as well.
  const fileStreamOps = MEMFS.ops_table.file.stream;
  fileStreamOps.read = streamOps.read;
  fileStreamOps.llseek = streamOps.llseek;
  fileStreamOps.mmap = streamOps.mmap;
}
/**
 * Resolves an OPFS layout key to a FileSystemFileHandle inside this worker.
 *
 * Works around the iOS Safari limitation that FileSystemFileHandle isn't
 * structured-cloneable across postMessage — the main thread sends the layout
 * key and the worker opens the handle locally.
 *
 * Nothing is created along the way: every lookup passes `create: false`, so
 * a missing directory or file rejects.
 *
 * @param {{rootDir: string, repo: string, filename: string}} opfsPath -
 *   `repo` is split on '/' (empty segments dropped) into nested directories
 *   under `rootDir`.
 * @returns {Promise<FileSystemFileHandle>}
 * @throws {Error} when `navigator.storage.getDirectory` is unavailable.
 */
async function resolveOpfsHandle({ rootDir, repo, filename }) {
  const storage = self.navigator?.storage;
  if (!storage?.getDirectory) {
    throw new Error('OPFS not available in this worker');
  }

  const lookup = { create: false };
  let current = await storage.getDirectory();
  const directoryChain = [rootDir, ...String(repo).split('/').filter(Boolean)];
  for (const segment of directoryChain) {
    current = await current.getDirectoryHandle(segment, lookup);
  }
  return current.getFileHandle(filename, lookup);
}
/**
 * Opens a sync access handle on `fileHandle`, records it in `opfsHandles`
 * under `name`, and creates a matching placeholder file at `/<name>` in the
 * module's filesystem.
 *
 * createSyncAccessHandle is worker-only and exclusive — only one writer per
 * OPFS file at a time. The caller must ensure no createWritable session is
 * open when this runs.
 *
 * @param {object} Module - Emscripten module exposing `FS`.
 * @param {string} name - File name to register at the filesystem root.
 * @param {FileSystemFileHandle} fileHandle - OPFS file to back the stub.
 * @returns {Promise<number>} size of the OPFS file in bytes.
 */
async function opfsAlloc(Module, name, fileHandle) {
  const handle = await fileHandle.createSyncAccessHandle();
  const byteLength = handle.getSize();
  opfsHandles[name] = { syncHandle: handle, size: byteLength };

  // Zero-byte placeholder so llama.cpp's fopen() finds the path.
  const placeholder = new Uint8Array(0);
  Module.FS.createDataFile('/', name, placeholder, true, false, true);

  // Set usedBytes so fstat()/seek-end report the real file size — the
  // patched llseek consults the recorded size, but other code (e.g.
  // llama.cpp's GGUF reader sanity-checking the file length) goes through
  // stat first.
  const { node } = Module.FS.lookupPath('/' + name);
  node.usedBytes = byteLength;

  return byteLength;
}
/**
 * Releases every OPFS-backed file registered in `opfsHandles`: closes the
 * sync handle, unlinks the placeholder at `/<name>`, and forgets the entry.
 * Close/unlink failures are ignored on purpose so one bad entry cannot block
 * the rest of the cleanup.
 *
 * @param {object} Module - Emscripten module exposing `FS`.
 */
function opfsFreeAll(Module) {
  for (const name of Object.keys(opfsHandles)) {
    const { syncHandle } = opfsHandles[name];
    try {
      syncHandle.close();
    } catch {
      /* already closed */
    }
    try {
      Module.FS.unlink('/' + name);
    } catch {
      /* already gone */
    }
    delete opfsHandles[name];
  }
}
/**
 * Aggregates raw nanosecond samples into a llama-bench-style test record.
 *
 * Each sample is converted to tokens/second as (1e9 * n_tokens) / ns, with
 * n_tokens = n_prompt + n_gen (a non-positive sample maps to 0 t/s). avg_ts
 * is the mean of those per-sample rates and stddev_ts their sample standard
 * deviation (n - 1 denominator); both are computed from the rates directly
 * rather than propagated from the nanosecond statistics, because the mapping
 * isn't linear.
 *
 * `n_depth` carries through unchanged so downstream consumers can label
 * e.g. "pp512 @ d2048" the way llama-bench does.
 *
 * @param {string} name - Test label.
 * @param {number} n_prompt - Prompt tokens per sample.
 * @param {number} n_gen - Generated tokens per sample.
 * @param {number} n_depth - Context depth the samples were taken at.
 * @param {number[]} samples_ns - Wall time of each rep, in nanoseconds.
 * @returns {object} record with rounded avg/stddev in ns and t/s plus the
 *   rounded per-sample arrays; all zeros/empty when there are no samples.
 */
function buildTest(name, n_prompt, n_gen, n_depth, samples_ns) {
  const count = samples_ns.length;
  if (count === 0) {
    return {
      name,
      n_prompt,
      n_gen,
      n_depth,
      avg_ns: 0,
      stddev_ns: 0,
      avg_ts: 0,
      stddev_ts: 0,
      samples_ns: [],
      samples_ts: [],
    };
  }

  const mean = (values) => {
    let total = 0;
    for (const value of values) total += value;
    return total / count;
  };
  // Sample standard deviation; a single sample has no spread.
  const sampleStddev = (values, center) => {
    if (count <= 1) return 0;
    let squared = 0;
    for (const value of values) squared += (value - center) * (value - center);
    return Math.sqrt(squared / (count - 1));
  };
  const toHundredths = (x) => Math.round(x * 100) / 100;

  const tokensPerSample = n_prompt + n_gen;
  const samples_ts = samples_ns.map((ns) => (ns > 0 ? (1e9 * tokensPerSample) / ns : 0));

  const avg_ns = mean(samples_ns);
  const stddev_ns = sampleStddev(samples_ns, avg_ns);
  const avg_ts = mean(samples_ts);
  const stddev_ts = sampleStddev(samples_ts, avg_ts);

  return {
    name,
    n_prompt,
    n_gen,
    n_depth,
    avg_ns: Math.round(avg_ns),
    stddev_ns: Math.round(stddev_ns),
    avg_ts: toHundredths(avg_ts),
    stddev_ts: toHundredths(stddev_ts),
    samples_ns: samples_ns.map((ns) => Math.round(ns)),
    samples_ts: samples_ts.map(toHundredths),
  };
}
/**
 * Parses the JSON string returned by a C-side bench export.
 *
 * @param {string} label - Name of the call, used as the error-message prefix.
 * @param {string} raw - JSON text returned by the export.
 * @returns {*} the parsed value (callers expect an object).
 * @throws {Error} when `raw` is not valid JSON (original parse error attached
 *   as `cause`), when it parses to `null`, or when the parsed value carries a
 *   truthy `error` field (its text becomes the message).
 */
function parseBenchResult(label, raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (e) {
    throw new Error(`${label}: invalid JSON from C (${e.message})`, { cause: e });
  }
  // A literal `null` payload parses fine but has no fields; report it with the
  // call label instead of letting the `.error` access below raise a TypeError.
  if (parsed === null) {
    throw new Error(`${label}: invalid JSON from C (got null)`);
  }
  if (parsed.error) throw new Error(`${label}: ${parsed.error}`);
  return parsed;
}
/**
 * Turns an arbitrary thrown value into a one-line description.
 *
 * - null/undefined → ''
 * - strings as-is; numbers/booleans via String()
 * - Error instances → their message (or String(err) when it is empty)
 * - other objects → the recognised string/number fields (name, type, message,
 *   reason, filename, lineno, colno, error) joined with ' | '; failing that,
 *   a JSON dump of the own properties; failing that, the
 *   Object.prototype.toString tag; failing that, 'unknown structured error'
 * - anything else via String()
 *
 * @param {*} err - The caught value.
 * @returns {string}
 */
function describeError(err) {
  if (err == null) return '';

  const kind = typeof err;
  if (kind === 'string') return err;
  if (kind === 'number' || kind === 'boolean') return String(err);
  if (err instanceof Error) return err.message || String(err);
  if (kind !== 'object') return String(err);

  // Field readers: yield a falsy value when the field is absent or unusable.
  const text = (key) => (typeof err[key] === 'string' && err[key] ? err[key] : '');
  const positive = (key) => (typeof err[key] === 'number' && err[key] > 0 ? err[key] : 0);

  const parts = [];
  const add = (value, prefix = '') => {
    if (value) parts.push(`${prefix}${value}`);
  };
  add(text('name'));
  add(text('type'), 'type=');
  add(text('message'));
  add(text('reason'), 'reason=');
  add(text('filename'), 'file=');
  add(positive('lineno'), 'line=');
  add(positive('colno'), 'col=');
  if (text('error')) {
    add(err.error, 'error=');
  } else if (err.error instanceof Error) {
    add(err.error.message, 'error=');
  }
  if (parts.length > 0) return parts.join(' | ');

  // No recognised fields: try dumping the own properties, enumerable or not.
  try {
    const snapshot = {};
    for (const prop of Object.getOwnPropertyNames(err)) {
      snapshot[prop] = err[prop];
    }
    const dumped = JSON.stringify(snapshot);
    if (dumped && dumped !== '{}') return dumped;
  } catch {
    // Not serialisable (e.g. cyclic); fall through to the type tag.
  }

  const typeTag = Object.prototype.toString.call(err);
  return typeTag && typeTag !== '[object Object]' ? typeTag : 'unknown structured error';
}
/**
 * Builds the message for a failed WASM phase.
 *
 * Always starts with "<phase> threw WASM exception". The described error, if
 * any, follows in parentheses. Then comes the buffered error-tagged stderr,
 * or — only when there is none — the general stderr tail.
 *
 * @param {string} phase - Label of the call that threw.
 * @param {*} err - The caught value.
 * @returns {string}
 */
function formatPhaseError(phase, err) {
  const detail = describeError(err);
  const flagged = recentWasmErrDetail();
  const tail = recentWasmStderrDetail();

  let message = `${phase} threw WASM exception`;
  if (detail) message += ` (${detail})`;
  if (flagged) {
    message += ` | recent stderr: ${flagged}`;
  } else if (tail) {
    message += ` | recent stderr tail: ${tail}`;
  }
  return message;
}
/**
 * Calls a WASM export through `Module.ccall` in async mode and reports
 * failures under a caller-chosen label.
 *
 * @param {object} Module - Emscripten module exposing `ccall`.
 * @param {string} phaseLabel - Label used in the error message on failure.
 * @param {string} exportName - Name of the export to call.
 * @param {?string} returnType - ccall return type.
 * @param {string[]} argTypes - ccall argument types.
 * @param {Array} args - Argument values.
 * @returns {Promise<*>} whatever the export returns.
 * @throws {Error} message built by formatPhaseError(phaseLabel, err); the
 *   original thrown value is kept as `cause` so its stack isn't lost.
 */
async function ccallPhaseLabel(Module, phaseLabel, exportName, returnType, argTypes, args) {
  try {
    return await Module.ccall(exportName, returnType, argTypes, args, { async: true });
  } catch (err) {
    throw new Error(formatPhaseError(phaseLabel, err), { cause: err });
  }
}

/**
 * Same as ccallPhaseLabel, with the export name doubling as the error label.
 *
 * @param {object} Module - Emscripten module exposing `ccall`.
 * @param {string} phase - Export name, also used as the error label.
 * @param {?string} returnType - ccall return type.
 * @param {string[]} argTypes - ccall argument types.
 * @param {Array} args - Argument values.
 * @returns {Promise<*>} whatever the export returns.
 */
function ccallPhase(Module, phase, returnType, argTypes, args) {
  return ccallPhaseLabel(Module, phase, phase, returnType, argTypes, args);
}
// Entry point for messages from the main thread. Only `run` messages are
// acted on; anything else is logged and dropped. Every `run` ends with
// exactly one `result` message: the record returned by runOne, or — if
// runOne (or posting its record) throws — a minimal error record.
self.onmessage = async (event) => {
  const message = event.data || {};
  if (message.type !== 'run') {
    log(`worker: ignoring unknown message type "${message.type}"`);
    return;
  }

  const failureRecord = (err) => ({
    status: 'error',
    error: describeError(err),
    metrics: null,
  });

  try {
    post({ type: 'result', record: await runOne(event.data) });
  } catch (err) {
    post({ type: 'result', record: failureRecord(err) });
  }
};
// Informational WebGPU adapter probe. Logs what it finds and returns the
// adapter's vendor/architecture/device/description, or null when WebGPU is
// missing, no adapter is returned, the adapter exposes no info, or the
// request throws. Never throws itself.
async function probeGpuAdapter() {
  if (!self.navigator?.gpu) {
    log('WebGPU: not available in this worker');
    return null;
  }
  try {
    const adapter = await self.navigator.gpu.requestAdapter();
    if (!adapter) {
      log('WebGPU: adapter request returned null');
      return null;
    }
    const info = adapter.info;
    const adapterInfo = info ? {
      vendor: info.vendor || '',
      architecture: info.architecture || '',
      device: info.device || '',
      description: info.description || '',
    } : null;
    log(`WebGPU adapter: ${JSON.stringify(adapterInfo || 'no info')}`);
    return adapterInfo;
  } catch (err) {
    log(`WebGPU adapter error: ${err.message}`);
    return null;
  }
}

// Untimed bench_set_depth call that puts the KV cache in a known state
// before a perf rep. The C side reports via `cached` whether it restored a
// snapshot or ran the prefill; that is logged only for non-zero depths.
async function setBenchDepth(Module, nDepth, label) {
  const phaseLabel = `bench_set_depth(${nDepth}) ${label}`;
  const raw = await ccallPhaseLabel(Module, phaseLabel, 'bench_set_depth', 'string', ['number'], [nDepth]);
  const r = parseBenchResult(phaseLabel, raw);
  if (nDepth > 0) {
    log(`${phaseLabel}: ${r.cached ? 'restored snapshot' : 'prefilled'}`);
  }
}

// Runs one perf test — kind 'pp' calls bench_pp, kind 'tg' calls bench_tg —
// as an optional untimed warmup followed by nReps timed reps, and returns the
// aggregated buildTest record. Throws on the first failing call; the caller
// decides whether that is fatal.
async function runPerfTest(Module, kind, nTokens, { nReps, nDepth, noWarmup }) {
  const exportName = `bench_${kind}`;
  // Test name suffix mirroring llama-bench (e.g. "pp512 @ d2048").
  const depthSuffix = nDepth > 0 ? ` @ d${nDepth}` : '';
  const testName = `${kind}${nTokens}${depthSuffix}`;
  const callBench = (phaseLabel) =>
    ccallPhaseLabel(Module, phaseLabel, exportName, 'string', ['number'], [nTokens]);

  if (!noWarmup) {
    // The warmup runs the full nTokens workload rather than a single token,
    // so the first timed rep doesn't absorb pipeline-cache /
    // shader-specialization cost.
    status('perf', `warmup ${testName}`, Date.now());
    await setBenchDepth(Module, nDepth, `${kind} warmup`);
    log(`${exportName}(${nTokens})${depthSuffix} — warmup`);
    const raw = await callBench(`${exportName} warmup (${nTokens}${depthSuffix})`);
    parseBenchResult(`${exportName} warmup`, raw);
  }

  const samples_ns = [];
  for (let i = 0; i < nReps; i++) {
    const rep = `${i + 1}/${nReps}`;
    status('perf', `${testName} ${rep}`, Date.now());
    await setBenchDepth(Module, nDepth, `${kind} rep ${rep}`);
    const t0 = performance.now();
    const raw = await callBench(`${exportName} rep ${rep} (${nTokens}${depthSuffix})`);
    const t_ns = (performance.now() - t0) * 1e6;
    parseBenchResult(exportName, raw);
    samples_ns.push(t_ns);
    log(`${testName} run ${rep}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nTokens / t_ns).toFixed(1)} t/s)`);
    if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
  }

  return kind === 'pp'
    ? buildTest(testName, nTokens, 0, nDepth, samples_ns)
    : buildTest(testName, 0, nTokens, nDepth, samples_ns);
}

// Consistency phase: bench_run on the prompt and, when reference token ids
// are supplied, bench_eval_tokens against them. Writes into `result` as it
// goes so partial output survives a later failure. Soft-fail: any error is
// logged and swallowed so the perf phase still runs.
async function runConsistencyPhase(Module, { consistencyPrompt, consistencyNPredict, refTokenIds }, result) {
  try {
    status('consistency', 'Running consistency check...', Date.now());
    log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
    const raw = await ccallPhase(Module, 'bench_run', 'string',
      ['string', 'number'],
      [consistencyPrompt, consistencyNPredict]);
    const r = parseBenchResult('bench_run', raw);
    result.output = r.output || '';
    result.consistency = { token_ids: r.token_ids || [] };
    if (!refTokenIds) return;

    log('bench_eval_tokens — forced-decode vs CPU baseline');
    const evalRaw = await ccallPhase(Module, 'bench_eval_tokens', 'string',
      ['string', 'string'],
      [consistencyPrompt, refTokenIds]);
    const ev = parseBenchResult('bench_eval_tokens', evalRaw);
    result.consistency = { ...result.consistency, ...ev };
    if (ev.n_tokens < CONSISTENCY_MIN_TOKENS) {
      log(
        `Consistency: insufficient samples (${ev.n_tokens} token` +
        `${ev.n_tokens === 1 ? '' : 's'} before EOS) — agreement rate not meaningful`
      );
    } else {
      log(
        `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
        `${ev.n_agree}/${ev.n_tokens})` +
        (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
      );
    }
  } catch (err) {
    log(`Consistency phase failed: ${err.message} — continuing to perf phase`);
  }
}

// Releases what a run holds outside the WASM heap: the OPFS sync handle(s)
// and MEMFS placeholders, and the captured GPUDevice if there is one.
function releaseRunResources(Module) {
  // Close the sync handle so OPFS can release its lock on the file (and so
  // a subsequent run can open a fresh handle without colliding).
  opfsFreeAll(Module);
  // Eagerly drop GPU buffers. worker.terminate() alone leaves Metal
  // allocations alive on iOS Safari long enough for the next study run to
  // hit Jetsam, so destroy the device explicitly.
  if (capturedGpuDevice) {
    try {
      capturedGpuDevice.destroy();
    } catch (err) {
      log(`device.destroy() failed: ${err.message}`);
    }
    capturedGpuDevice = null;
  }
}

/**
 * Runs one complete benchmark pass: probe WebGPU, load the WASM build, link
 * the OPFS model into MEMFS, init the backends, load the model, then the
 * optional consistency phase and the pp/tg perf phase.
 *
 * Memory info, the consistency phase, and each perf test are soft-fail (they
 * log and the run continues). Everything else throws. Once the module has
 * been created, the OPFS handle and any captured GPU device are released on
 * every exit path, including failures.
 *
 * @param {object} message
 * @param {object} message.params - buildType, nCtx, nGpuLayers,
 *   consistencyPrompt, consistencyNPredict, refTokenIds, nPrompt, nGen,
 *   nReps, nDepth (default 0), noWarmup.
 * @param {{rootDir: string, repo: string, filename: string}} message.opfsPath
 *   - Location of the model inside OPFS; required.
 * @returns {Promise<object>} the result record. `status` is 'done', or stays
 *   'error' (with `error` set) if the module's onAbort hook fired during the
 *   run.
 * @throws {Error} when opfsPath is missing or any hard phase fails.
 */
async function runOne({ params, opfsPath }) {
  wasmErrLines.length = 0;
  wasmStderrLines.length = 0;
  const {
    buildType,
    nCtx,
    nGpuLayers,
    // consistency
    consistencyPrompt,
    consistencyNPredict,
    refTokenIds,
    // perf
    nPrompt,
    nGen,
    nReps,
    nDepth = 0,
    noWarmup,
  } = params;

  // The worker only loads via OPFS: the main thread downloads to OPFS, we
  // open a FileSystemSyncAccessHandle here, and patched MEMFS stream_ops
  // route llama.cpp's fread through the sync handle.
  if (!opfsPath) {
    throw new Error('runOne: opfsPath is required (heap/buffer paths removed)');
  }

  const result = {
    status: 'init',
    error: null,
    buildType,
    webgpuAvailable: !!self.navigator?.gpu,
    gpuAdapterInfo: null,
    metrics: null,
    consistency: null,
    output: '',
  };

  // ─── WebGPU adapter probe (informational) ───
  result.gpuAdapterInfo = await probeGpuAdapter();

  // ─── Load the Emscripten glue ───
  status('loading_wasm', `Loading WASM module (${buildType})...`);
  try {
    self.importScripts(`/build/${buildType}/bench.js`);
  } catch (err) {
    throw new Error(`importScripts /build/${buildType}/bench.js failed: ${err.message}`);
  }
  if (typeof self.createBenchModule !== 'function') {
    throw new Error('createBenchModule not defined after importScripts');
  }
  const Module = await self.createBenchModule({
    locateFile: (filename) => `/build/${buildType}/${filename}`,
    print: (text) => log(`[wasm] ${text}`),
    printErr: (text) => {
      const tag = classifyWasmStderr(text);
      const line = `${tag} ${text}`;
      recordWasmStderrLine(line);
      if (tag === '[wasm:err]') recordWasmErrLine(line);
      log(line);
    },
    onAbort: (reason) => {
      const msg = `WASM aborted: ${reason}`;
      result.error = msg;
      result.status = 'error';
      log(`ERROR: ${msg}`);
    },
  });
  log('WASM module loaded');

  try {
    // ─── Make the model visible to the WASM filesystem ───
    status('opfs', 'Linking OPFS-backed model into MEMFS...');
    const fileHandle = await resolveOpfsHandle(opfsPath);
    patchMEMFS(Module);
    const size = await opfsAlloc(Module, 'model.gguf', fileHandle);
    log(`OPFS-backed model.gguf registered (${(size / (1024 * 1024)).toFixed(1)} MB)`);
    // Report 100% to keep the existing progress UI happy — the actual
    // download to OPFS happened before the worker spawn.
    post({ type: 'progress', fraction: 1, downloaded: size, total: size });

    // ─── Init backend ───
    status('initializing', 'Initializing llama.cpp backends...');
    const initResult = await ccallPhase(Module, 'bench_init', 'number', [], []);
    if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
    log('Backends initialized');

    // ─── Load model ───
    // use_mmap=0 — the patched MEMFS mmap throws explicitly, so any
    // accidental mmap attempt surfaces as a clear error rather than a
    // silent heap copy.
    status('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers}, mmap=0)...`);
    const loadResult = await ccallPhase(
      Module,
      'bench_load',
      'number',
      ['string', 'number', 'number', 'number'],
      ['/model.gguf', nCtx, nGpuLayers, 0],
    );
    if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
    log('Model loaded');

    // ─── Memory snapshot from llama.cpp ───
    // Taken right after bench_load. Soft-fail: if the C side or a backend
    // errors, the run still produces perf numbers, just without memoryInfo.
    try {
      const raw = await ccallPhase(Module, 'bench_memory_info', 'string', [], []);
      result.memoryInfo = parseBenchResult('bench_memory_info', raw);
      const toMB = (bytes) => (bytes / (1024 * 1024)).toFixed(0);
      const dev = (result.memoryInfo.devices || [])
        .map((d) => `${d.name}(${d.type}) free=${toMB(d.free)}MB total=${toMB(d.total)}MB`)
        .join(' | ') || 'none';
      log(`Memory: model=${toMB(result.memoryInfo.model_size)}MB state=${toMB(result.memoryInfo.state_size)}MB | ${dev}`);
    } catch (err) {
      log(`bench_memory_info failed: ${err.message} — continuing without memoryInfo`);
    }

    // ─── Consistency phase (skipped when consistencyPrompt is empty) ───
    if (consistencyPrompt) {
      await runConsistencyPhase(Module, { consistencyPrompt, consistencyNPredict, refTokenIds }, result);
    }

    // ─── Perf phase (llama-bench style) ───
    // Each test (pp, tg) is wrapped independently so a failure in one
    // doesn't skip the other.
    const wantPp = nPrompt > 0;
    const wantTg = nGen > 0;
    const perfOptions = { nReps, nDepth, noWarmup };
    const tests = [];
    if (wantPp) {
      try {
        tests.push(await runPerfTest(Module, 'pp', nPrompt, perfOptions));
      } catch (err) {
        log(`pp test failed: ${err.message}`);
      }
    }
    if (wantTg) {
      try {
        tests.push(await runPerfTest(Module, 'tg', nGen, perfOptions));
      } catch (err) {
        log(`tg test failed: ${err.message}`);
      }
    }
    if (tests.length > 0) {
      result.metrics = {
        tests,
        n_prompt: wantPp ? nPrompt : 0,
        n_gen: wantTg ? nGen : 0,
        n_depth: nDepth,
        n_reps: nReps,
      };
    }

    await ccallPhase(Module, 'bench_exit', null, [], []);
  } finally {
    // Runs on failures too: without this, a throw anywhere above would leave
    // the exclusive OPFS handle open and the GPU device alive.
    releaseRunResources(Module);
  }

  const summary = result.metrics?.tests
    ?.map((t) => `${t.name}: ${t.avg_ts.toFixed(2)} ± ${t.stddev_ts.toFixed(2)} t/s`)
    .join(' | ') || 'no perf';

  // onAbort may have fired inside a soft-fail phase; keep the error status it
  // recorded instead of reporting the run as done.
  if (result.status === 'error') {
    log(`Run completed after a WASM abort; keeping error status (${summary})`);
    return result;
  }

  result.status = 'done';
  status('done', `Done! ${summary}`);
  return result;
}