| <!doctype html> | |
| <html lang="en" data-theme="light"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <meta name="color-scheme" content="light dark"> | |
| <!-- Resolve the theme synchronously, before first paint, to avoid a flash of the wrong color scheme. --> | |
| <script> | |
| (function () { | |
|   // Saved preference wins; otherwise fall back to the OS-level color-scheme preference. | |
|   var theme = localStorage.getItem('theme'); | |
|   if (!theme) { | |
|     var prefersDark = window.matchMedia && matchMedia('(prefers-color-scheme: dark)').matches; | |
|     theme = prefersDark ? 'dark' : 'light'; | |
|   } | |
|   document.documentElement.setAttribute('data-theme', theme); | |
| })(); | |
| </script> | |
| <title>Methodology — WebGPU Bench</title> | |
| <link rel="preconnect" href="https://fonts.googleapis.com"> | |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> | |
| <link href="https://fonts.googleapis.com/css2?family=Bricolage+Grotesque:opsz,wght@12..96,400;12..96,500;12..96,600;12..96,700;12..96,800&family=Geist+Mono:wght@400;500;600&family=Instrument+Serif:ital@0;1&display=swap" rel="stylesheet"> | |
| <link rel="stylesheet" href="css/style.css"> | |
| </head> | |
| <body> | |
| <header class="header"> | |
| <div class="header-inner"> | |
| <a href="index.html" class="header-brand"> | |
| <svg class="header-logo" width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect x="4" y="4" width="16" height="16" rx="2"/><rect x="9" y="9" width="6" height="6"/><line x1="9" y1="1" x2="9" y2="4"/><line x1="15" y1="1" x2="15" y2="4"/><line x1="9" y1="20" x2="9" y2="23"/><line x1="15" y1="20" x2="15" y2="23"/><line x1="20" y1="9" x2="23" y2="9"/><line x1="20" y1="14" x2="23" y2="14"/><line x1="1" y1="9" x2="4" y2="9"/><line x1="1" y1="14" x2="4" y2="14"/></svg> | |
| <span class="header-title">WebGPU Bench</span> | |
| </a> | |
| <nav class="header-nav" aria-label="Primary"> | |
| <a href="index.html" class="header-link">Dashboard</a> | |
| <a href="run.html" class="header-link">Run</a> | |
| <button id="theme-toggle" class="header-link theme-toggle-btn" type="button" title="Toggle theme" aria-label="Toggle dark mode"> | |
| <svg class="icon-sun" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg> | |
| <svg class="icon-moon" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z"/></svg> | |
| </button> | |
| <a href="https://github.com/abhijitramesh/webgpu-bench" target="_blank" rel="noopener" class="header-link"> | |
| <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor"><path d="M12 0C5.37 0 0 5.37 0 12c0 5.3 3.44 9.8 8.2 11.39.6.11.82-.26.82-.58v-2.03c-3.34.73-4.04-1.61-4.04-1.61-.55-1.39-1.34-1.76-1.34-1.76-1.09-.74.08-.73.08-.73 1.2.09 1.84 1.24 1.84 1.24 1.07 1.83 2.81 1.3 3.5 1 .11-.78.42-1.3.76-1.6-2.67-.3-5.47-1.33-5.47-5.93 0-1.31.47-2.38 1.24-3.22-.13-.3-.54-1.52.12-3.18 0 0 1.01-.32 3.3 1.23a11.5 11.5 0 0 1 6.02 0c2.28-1.55 3.29-1.23 3.29-1.23.66 1.66.25 2.88.12 3.18.77.84 1.24 1.91 1.24 3.22 0 4.61-2.81 5.63-5.48 5.92.43.37.81 1.1.81 2.22v3.29c0 .32.22.7.82.58C20.57 21.8 24 17.3 24 12c0-6.63-5.37-12-12-12z"/></svg> | |
| GitHub | |
| </a> | |
| </nav> | |
| </div> | |
| </header> | |
| <main class="methodology-layout"> | |
| <nav class="methodology-toc" aria-label="Table of contents"> | |
| <p class="methodology-toc-title">On this page</p> | |
| <ol> | |
| <li><a href="#how-benchmarks-work">How Benchmarks Work</a></li> | |
| <li><a href="#dashboard-columns">Dashboard Columns</a></li> | |
| <li><a href="#error-categories">Error Categories</a></li> | |
| <li> | |
| <a href="#consistency-measurement">Consistency Measurement</a> | |
| <ol> | |
| <li><a href="#how-it-works">How it works</a></li> | |
| <li><a href="#why-forced-decoding">Why forced decoding</a></li> | |
| <li><a href="#interpreting-cpu-match">Interpreting CPU Match</a></li> | |
| </ol> | |
| </li> | |
| </ol> | |
| </nav> | |
| <div class="methodology-content"> | |
| <a href="index.html" class="back-link"> | |
| <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><line x1="19" y1="12" x2="5" y2="12"/><polyline points="12 19 5 12 12 5"/></svg> | |
| Back to Dashboard | |
| </a> | |
| <h1>Methodology</h1> | |
| <h2 id="how-benchmarks-work">How Benchmarks Work</h2> | |
| <ol> | |
| <li><code>build.sh</code> compiles llama.cpp to WebAssembly with WebGPU support via Emscripten + emdawnwebgpu, producing two WASM variants: JSPI (Chrome) and Asyncify (Firefox, Safari).</li> | |
| <li><code>runner.js</code> launches Playwright browsers and navigates to <code>harness.html</code>.</li> | |
| <li><code>harness.js</code> detects JSPI support and loads the correct WASM variant.</li> | |
| <li>The GGUF model is downloaded from HuggingFace directly in the browser.</li> | |
| <li>Inference runs via WebGPU (or CPU fallback) using llama.cpp's C API with greedy sampling for deterministic output.</li> | |
| <li>Performance metrics are collected via <code>llama_perf_context()</code> and returned to Playwright.</li> | |
| <li>A fresh browser instance is launched for each variant to prevent WASM memory accumulation (OOM fix).</li> | |
| </ol> | |
| <h2 id="dashboard-columns">Dashboard Columns</h2> | |
| <table> | |
| <thead> | |
| <tr><th>Column</th><th>Description</th></tr> | |
| </thead> | |
| <tbody> | |
| <tr><td>Machine</td><td>Machine slug identifying the hardware (e.g. <code>apple-m3-16gb-darwin</code>)</td></tr> | |
| <tr><td>Model</td><td>Model name (e.g. Llama-3.2-1B-Instruct)</td></tr> | |
| <tr><td>Quant</td><td>Quantization variant (e.g. Q4_K_M, Q8_0)</td></tr> | |
| <tr><td>Size (MB)</td><td>Model file size in megabytes</td></tr> | |
| <tr><td>Browser</td><td>Browser used for the benchmark (chromium, firefox, webkit)</td></tr> | |
| <tr><td>Status</td><td>PASS if inference completed successfully, FAIL otherwise</td></tr> | |
| <tr><td>Build</td><td><code>jspi</code> or <code>asyncify</code> — which WASM variant was used. Chrome supports JSPI; Firefox and Safari use Asyncify.</td></tr> | |
| <tr><td>WebGPU</td><td>Whether WebGPU was available in the browser. If not, inference falls back to CPU.</td></tr> | |
| <tr><td>Decode tok/s</td><td>Token generation speed (tokens/sec) — main performance metric</td></tr> | |
| <tr><td>Prefill tok/s</td><td>Prompt processing speed (tokens/sec)</td></tr> | |
| <tr><td>n_eval</td><td>Number of tokens generated during decode</td></tr> | |
| <tr><td>t_eval (ms)</td><td>Total decode time in milliseconds</td></tr> | |
| <tr><td>n_p_eval</td><td>Number of prompt tokens processed during prefill</td></tr> | |
| <tr><td>t_p_eval (ms)</td><td>Total prefill time in milliseconds</td></tr> | |
| <tr><td>Wall (s)</td><td>Total wall-clock time for the benchmark run in seconds (includes model download, load, and inference)</td></tr> | |
| <tr><td>CPU Match</td><td>Consistency with CPU baseline — percentage of token positions where WebGPU and CPU agree on the top-1 token. Only present when benchmarks are run with <code>--consistency</code>. See Consistency Measurement below.</td></tr> | |
| <tr><td>Error</td><td>Error message and category (OOM, WASM Abort, Timeout, etc.) when status is FAIL</td></tr> | |
| </tbody> | |
| </table> | |
| <h2 id="error-categories">Error Categories</h2> | |
| <table> | |
| <thead> | |
| <tr><th>Category</th><th>Pattern</th><th>Typical Cause</th></tr> | |
| </thead> | |
| <tbody> | |
| <tr><td>OOM</td><td>out of memory, memory allocation</td><td>Model too large for available WASM memory</td></tr> | |
| <tr><td>WASM Abort</td><td>wasm, abort, unreachable</td><td>WASM execution error, often from unsupported operations</td></tr> | |
| <tr><td>Timeout</td><td>timeout, timed out</td><td>Benchmark exceeded time limit (model download or inference)</td></tr> | |
| <tr><td>Download Failed</td><td>download, fetch, 404, network</td><td>Model file not found or network error</td></tr> | |
| <tr><td>Other</td><td>everything else</td><td>Uncategorized errors</td></tr> | |
| </tbody> | |
| </table> | |
| <h2 id="consistency-measurement">Consistency Measurement</h2> | |
| <p>The <code>--consistency</code> flag measures how faithfully the WebGPU backend reproduces the CPU computation for each quantization type.</p> | |
| <h3 id="how-it-works">How it works</h3> | |
| <p>For each variant, two runs are performed:</p> | |
| <ol> | |
| <li><strong>CPU baseline</strong> (<code>n_gpu_layers=0</code>): greedy-decodes 128 tokens and records the token ID sequence. Cached to <code>results/cpu_baselines.json</code>. When testing multiple browsers, the baseline is collected once on the first browser and shared across all browsers (CPU output is identical regardless of JSPI vs Asyncify). When testing a single browser, the baseline runs in that same browser.</li> | |
| <li><strong>WebGPU run</strong> (<code>n_gpu_layers=999</code>): performs a forced-decoding pass — feeds the CPU's token sequence one token at a time and checks whether the WebGPU backend independently predicts the same top-1 token at each position.</li> | |
| </ol> | |
| <h3 id="why-forced-decoding">Why forced decoding</h3> | |
| <p>Naively comparing generated text suffers from cascading divergence: a single token difference changes the KV cache context for all subsequent tokens. Forced decoding evaluates each position independently, giving a clean per-token accuracy signal.</p> | |
| <h3 id="interpreting-cpu-match">Interpreting CPU Match</h3> | |
| <table> | |
| <thead> | |
| <tr><th>CPU Match</th><th>Interpretation</th></tr> | |
| </thead> | |
| <tbody> | |
| <tr><td><code>100.0%</code></td><td>Numerically identical to CPU — no precision issues</td></tr> | |
| <tr><td><code>95–99%</code></td><td>A few tokens differ due to near-equal logits — expected for lower-precision quants</td></tr> | |
| <tr><td><code>&lt; 90%</code></td><td>Systematic precision issues — GPU kernel may need investigation</td></tr> | |
| <tr><td><code>0.0%</code></td><td>First token wrong — quantization kernel likely broken</td></tr> | |
| <tr><td><code>—</code></td><td>No consistency data — benchmarks were run without <code>--consistency</code></td></tr> | |
| </tbody> | |
| </table> | |
| </div> | |
| </main> | |
| <script> | |
| // Theme toggle: flip data-theme on <html> and persist the choice. | |
| (function () { | |
|   var toggle = document.getElementById('theme-toggle'); | |
|   if (!toggle) return; | |
|   toggle.addEventListener('click', function () { | |
|     var root = document.documentElement; | |
|     var next = root.getAttribute('data-theme') === 'dark' ? 'light' : 'dark'; | |
|     root.setAttribute('data-theme', next); | |
|     localStorage.setItem('theme', next); | |
|   }); | |
| })(); | |
| // ToC scroll-spy: highlight the link whose heading is the topmost one at or above the anchor line. | |
| (function () { | |
|   var tocLinks = Array.prototype.slice.call( | |
|     document.querySelectorAll('.methodology-toc a[href^="#"]')); | |
|   // Pair each ToC link with its target heading; skip links whose target id is missing. | |
|   var sections = []; | |
|   tocLinks.forEach(function (link) { | |
|     var heading = document.getElementById(link.getAttribute('href').slice(1)); | |
|     if (heading) sections.push({ link: link, heading: heading }); | |
|   }); | |
|   if (sections.length === 0) return; | |
|   var pending = false; // throttles scroll handling to one update per animation frame | |
|   function highlight() { | |
|     var anchorOffset = 120; // px below the viewport top used as the "current section" line | |
|     var current = sections[0].link; | |
|     // Headings appear in document order, so the last one above the anchor is the active section. | |
|     for (var i = 0; i < sections.length; i++) { | |
|       if (sections[i].heading.getBoundingClientRect().top - anchorOffset <= 0) { | |
|         current = sections[i].link; | |
|       } else { | |
|         break; | |
|       } | |
|     } | |
|     tocLinks.forEach(function (l) { l.classList.toggle('active', l === current); }); | |
|   } | |
|   window.addEventListener('scroll', function () { | |
|     if (pending) return; | |
|     pending = true; | |
|     requestAnimationFrame(function () { highlight(); pending = false; }); | |
|   }, { passive: true }); | |
|   highlight(); | |
| })(); | |
| </script> | |
| </body> | |
| </html> | |