Spaces:
Running
Running
File size: 12,004 Bytes
4721a6e 2dc46fb 4721a6e 2dc46fb 4721a6e df975ba 4721a6e 2dc46fb 4721a6e 67d4061 df975ba 2dc46fb 4721a6e 2dc46fb 4721a6e 2dc46fb 4721a6e 2dc46fb 4721a6e 2dc46fb 4721a6e 2dc46fb 4721a6e 2dc46fb 4721a6e 2dc46fb 4721a6e 2dc46fb 4721a6e 2dc46fb 4721a6e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 | <!DOCTYPE html>
<html lang="en" data-theme="light">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="color-scheme" content="light dark">
<!-- Apply the saved (or system-preferred) theme before first paint to avoid a flash of the wrong theme.
     Kept inline and synchronous on purpose: it must run before the body renders. -->
<script>(function(){var s=null;try{s=localStorage.getItem('theme');}catch(e){/* storage access can throw (private mode / blocked cookies) — fall through to system preference */}if(!s){s=(window.matchMedia&&matchMedia('(prefers-color-scheme: dark)').matches)?'dark':'light';}document.documentElement.setAttribute('data-theme',s);})();</script>
<title>Methodology — WebGPU Bench</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Bricolage+Grotesque:opsz,wght@12..96,400;12..96,500;12..96,600;12..96,700;12..96,800&family=Geist+Mono:wght@400;500;600&family=Instrument+Serif:ital@0;1&display=swap" rel="stylesheet">
<link rel="stylesheet" href="css/style.css">
</head>
<body>
<header class="header">
<div class="header-inner">
<a href="index.html" class="header-brand">
<svg class="header-logo" width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect x="4" y="4" width="16" height="16" rx="2"/><rect x="9" y="9" width="6" height="6"/><line x1="9" y1="1" x2="9" y2="4"/><line x1="15" y1="1" x2="15" y2="4"/><line x1="9" y1="20" x2="9" y2="23"/><line x1="15" y1="20" x2="15" y2="23"/><line x1="20" y1="9" x2="23" y2="9"/><line x1="20" y1="14" x2="23" y2="14"/><line x1="1" y1="9" x2="4" y2="9"/><line x1="1" y1="14" x2="4" y2="14"/></svg>
<span class="header-title">WebGPU Bench</span>
</a>
<nav class="header-nav" aria-label="Primary">
<a href="index.html" class="header-link">Dashboard</a>
<a href="run.html" class="header-link">Run</a>
<button id="theme-toggle" class="header-link theme-toggle-btn" type="button" title="Toggle theme" aria-label="Toggle dark mode">
<svg class="icon-sun" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg>
<svg class="icon-moon" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z"/></svg>
</button>
<a href="https://github.com/abhijitramesh/webgpu-bench" target="_blank" rel="noopener" class="header-link">
<svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor"><path d="M12 0C5.37 0 0 5.37 0 12c0 5.3 3.44 9.8 8.2 11.39.6.11.82-.26.82-.58v-2.03c-3.34.73-4.04-1.61-4.04-1.61-.55-1.39-1.34-1.76-1.34-1.76-1.09-.74.08-.73.08-.73 1.2.09 1.84 1.24 1.84 1.24 1.07 1.83 2.81 1.3 3.5 1 .11-.78.42-1.3.76-1.6-2.67-.3-5.47-1.33-5.47-5.93 0-1.31.47-2.38 1.24-3.22-.13-.3-.54-1.52.12-3.18 0 0 1.01-.32 3.3 1.23a11.5 11.5 0 0 1 6.02 0c2.28-1.55 3.29-1.23 3.29-1.23.66 1.66.25 2.88.12 3.18.77.84 1.24 1.91 1.24 3.22 0 4.61-2.81 5.63-5.48 5.92.43.37.81 1.1.81 2.22v3.29c0 .32.22.7.82.58C20.57 21.8 24 17.3 24 12c0-6.63-5.37-12-12-12z"/></svg>
GitHub
</a>
</nav>
</div>
</header>
<main class="methodology-layout">
<nav class="methodology-toc" aria-label="Table of contents">
<p class="methodology-toc-title">On this page</p>
<ol>
<li><a href="#how-benchmarks-work">How Benchmarks Work</a></li>
<li><a href="#dashboard-columns">Dashboard Columns</a></li>
<li><a href="#error-categories">Error Categories</a></li>
<li>
<a href="#consistency-measurement">Consistency Measurement</a>
<ol>
<li><a href="#how-it-works">How it works</a></li>
<li><a href="#why-forced-decoding">Why forced decoding</a></li>
<li><a href="#interpreting-cpu-match">Interpreting CPU Match</a></li>
</ol>
</li>
</ol>
</nav>
<div class="methodology-content">
<a href="index.html" class="back-link">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><line x1="19" y1="12" x2="5" y2="12"/><polyline points="12 19 5 12 12 5"/></svg>
Back to Dashboard
</a>
<h1>Methodology</h1>
<h2 id="how-benchmarks-work">How Benchmarks Work</h2>
<ol>
<li><code>build.sh</code> compiles llama.cpp to WebAssembly with WebGPU support via Emscripten + emdawnwebgpu, producing two WASM variants: JSPI (Chrome) and Asyncify (Firefox, Safari).</li>
<li><code>runner.js</code> launches Playwright browsers and navigates to <code>harness.html</code>.</li>
<li><code>harness.js</code> detects JSPI support and loads the correct WASM variant.</li>
<li>The GGUF model is downloaded from HuggingFace directly in the browser.</li>
<li>Inference runs via WebGPU (or CPU fallback) using llama.cpp's C API with greedy sampling for deterministic output.</li>
<li>Performance metrics are collected via <code>llama_perf_context()</code> and returned to Playwright.</li>
<li>A fresh browser instance is launched for each variant to prevent WASM memory accumulation (OOM fix).</li>
</ol>
<h2 id="dashboard-columns">Dashboard Columns</h2>
<table>
<thead>
<tr><th>Column</th><th>Description</th></tr>
</thead>
<tbody>
<tr><td>Machine</td><td>Machine slug identifying the hardware (e.g. <code>apple-m3-16gb-darwin</code>)</td></tr>
<tr><td>Model</td><td>Model name (e.g. Llama-3.2-1B-Instruct)</td></tr>
<tr><td>Quant</td><td>Quantization variant (e.g. Q4_K_M, Q8_0)</td></tr>
<tr><td>Size (MB)</td><td>Model file size in megabytes</td></tr>
<tr><td>Browser</td><td>Browser used for the benchmark (chromium, firefox, webkit)</td></tr>
<tr><td>Status</td><td>PASS if inference completed successfully, FAIL otherwise</td></tr>
<tr><td>Build</td><td><code>jspi</code> or <code>asyncify</code> — which WASM variant was used. Chrome supports JSPI; Firefox and Safari use Asyncify.</td></tr>
<tr><td>WebGPU</td><td>Whether WebGPU was available in the browser. If not, inference falls back to CPU.</td></tr>
<tr><td>Decode tok/s</td><td>Token generation speed (tokens/sec) — main performance metric</td></tr>
<tr><td>Prefill tok/s</td><td>Prompt processing speed (tokens/sec)</td></tr>
<tr><td>n_eval</td><td>Number of tokens generated during decode</td></tr>
<tr><td>t_eval (ms)</td><td>Total decode time in milliseconds</td></tr>
<tr><td>n_p_eval</td><td>Number of prompt tokens processed during prefill</td></tr>
<tr><td>t_p_eval (ms)</td><td>Total prefill time in milliseconds</td></tr>
<tr><td>Wall (s)</td><td>Total wall-clock time for the benchmark run in seconds (includes model download, load, and inference)</td></tr>
<tr><td>CPU Match</td><td>Consistency with CPU baseline — percentage of token positions where WebGPU and CPU agree on the top-1 token. Only present when benchmarks are run with <code>--consistency</code>. See Consistency Measurement below.</td></tr>
<tr><td>Error</td><td>Error message and category (OOM, WASM Abort, Timeout, etc.) when status is FAIL</td></tr>
</tbody>
</table>
<h2 id="error-categories">Error Categories</h2>
<table>
<thead>
<tr><th>Category</th><th>Pattern</th><th>Typical Cause</th></tr>
</thead>
<tbody>
<tr><td>OOM</td><td>out of memory, memory allocation</td><td>Model too large for available WASM memory</td></tr>
<tr><td>WASM Abort</td><td>wasm, abort, unreachable</td><td>WASM execution error, often from unsupported operations</td></tr>
<tr><td>Timeout</td><td>timeout, timed out</td><td>Benchmark exceeded time limit (model download or inference)</td></tr>
<tr><td>Download Failed</td><td>download, fetch, 404, network</td><td>Model file not found or network error</td></tr>
<tr><td>Other</td><td>everything else</td><td>Uncategorized errors</td></tr>
</tbody>
</table>
<h2 id="consistency-measurement">Consistency Measurement</h2>
<p>The <code>--consistency</code> flag measures how faithfully the WebGPU backend reproduces the CPU computation for each quantization type.</p>
<h3 id="how-it-works">How it works</h3>
<p>For each variant, two runs are performed:</p>
<ol>
<li><strong>CPU baseline</strong> (<code>n_gpu_layers=0</code>): greedy-decodes 128 tokens and records the token ID sequence. Cached to <code>results/cpu_baselines.json</code>. When testing multiple browsers, the baseline is collected once on the first browser and shared across all browsers (CPU output is identical regardless of JSPI vs Asyncify). When testing a single browser, the baseline runs in that same browser.</li>
<li><strong>WebGPU run</strong> (<code>n_gpu_layers=999</code>): performs a forced-decoding pass — feeds the CPU's token sequence one token at a time and checks whether the WebGPU backend independently predicts the same top-1 token at each position.</li>
</ol>
<h3 id="why-forced-decoding">Why forced decoding</h3>
<p>Naively comparing generated text suffers from cascading divergence: a single token difference changes the KV cache context for all subsequent tokens. Forced decoding evaluates each position independently, giving a clean per-token accuracy signal.</p>
<h3 id="interpreting-cpu-match">Interpreting CPU Match</h3>
<table>
<thead>
<tr><th>CPU Match</th><th>Interpretation</th></tr>
</thead>
<tbody>
<tr><td><code>100.0%</code></td><td>Numerically identical to CPU — no precision issues</td></tr>
<tr><td><code>95–99%</code></td><td>A few tokens differ due to near-equal logits — expected for lower-precision quants</td></tr>
<tr><td><code>&lt; 90%</code></td><td>Systematic precision issues — GPU kernel may need investigation</td></tr>
<tr><td><code>0.0%</code></td><td>First token wrong — quantization kernel likely broken</td></tr>
<tr><td><code>—</code></td><td>No consistency data — benchmarks were run without <code>--consistency</code></td></tr>
</tbody>
</table>
</div>
</main>
<script>
// Theme switcher: flips data-theme between light and dark and persists the choice.
const toggleBtn = document.getElementById('theme-toggle');
if (toggleBtn) {
  toggleBtn.addEventListener('click', function () {
    const root = document.documentElement;
    const next = root.getAttribute('data-theme') === 'dark' ? 'light' : 'dark';
    root.setAttribute('data-theme', next);
    localStorage.setItem('theme', next);
  });
}
// ToC scroll-spy: highlight the link whose heading was most recently scrolled
// past the reading line, throttled to one update per animation frame.
(function () {
  const tocLinks = Array.from(document.querySelectorAll('.methodology-toc a[href^="#"]'));
  const entries = [];
  for (const link of tocLinks) {
    const heading = document.getElementById(link.getAttribute('href').slice(1));
    if (heading) entries.push({ link, heading });
  }
  if (!entries.length) return;
  const OFFSET = 120; // px below the viewport top treated as the "reading line"
  let frameQueued = false;
  function highlight() {
    let current = entries[0].link;
    for (const { link, heading } of entries) {
      if (heading.getBoundingClientRect().top - OFFSET > 0) break;
      current = link;
    }
    for (const l of tocLinks) l.classList.toggle('active', l === current);
  }
  window.addEventListener('scroll', function () {
    if (frameQueued) return;
    frameQueued = true;
    requestAnimationFrame(function () { highlight(); frameQueued = false; });
  }, { passive: true });
  highlight();
})();
</script>
</body>
</html>
|