webgpu-bench / run.html
GitHub Actions
sync from abhijitramesh/webgpu-bench@ad8f5dbb6b
20b03c5
<!DOCTYPE html>
<html lang="en" data-theme="light">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<!-- Declare both palettes so UA-rendered widgets (form controls, scrollbars)
match the active theme; the inline script that follows resolves the actual
data-theme before first paint. -->
<meta name="color-scheme" content="light dark">
<!-- Resolve the theme before first paint to avoid a flash of the wrong
palette: saved choice first, then OS preference, then light. Reading
localStorage can throw (e.g. when cookies/site data are blocked), so the
read is wrapped in try/catch and a failure is treated as "no saved choice"
instead of aborting theme init. -->
<script>(function(){var s=null;try{s=localStorage.getItem('theme');}catch(e){}if(!s){s=(window.matchMedia&&matchMedia('(prefers-color-scheme: dark)').matches)?'dark':'light';}document.documentElement.setAttribute('data-theme',s);})();</script>
<title>Run — WebGPU Bench</title>
<!-- Warm up both Google Fonts origins before the stylesheet below requests
them; gstatic serves the font files cross-origin, hence `crossorigin` so the
preconnect matches the eventual CORS fetch. -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Bricolage+Grotesque:opsz,wght@12..96,400;12..96,500;12..96,600;12..96,700;12..96,800&family=Geist+Mono:wght@400;500;600&display=swap" rel="stylesheet">
<link rel="stylesheet" href="css/style.css">
<!-- Import map so `@huggingface/hub` resolves in the browser via esm.sh.
Must appear before any <script type="module">. -->
<script type="importmap">
{
"imports": {
"@huggingface/hub": "https://esm.sh/@huggingface/hub"
}
}
</script>
</head>
<body>
<!-- Site chrome: brand link + primary nav, shared with the other pages. -->
<header class="header">
<div class="header-inner">
<a href="index.html" class="header-brand">
<svg class="header-logo" width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect x="4" y="4" width="16" height="16" rx="2"/><rect x="9" y="9" width="6" height="6"/><line x1="9" y1="1" x2="9" y2="4"/><line x1="15" y1="1" x2="15" y2="4"/><line x1="9" y1="20" x2="9" y2="23"/><line x1="15" y1="20" x2="15" y2="23"/><line x1="20" y1="9" x2="23" y2="9"/><line x1="20" y1="14" x2="23" y2="14"/><line x1="1" y1="9" x2="4" y2="9"/><line x1="1" y1="14" x2="4" y2="14"/></svg>
<span class="header-title">WebGPU Bench</span>
</a>
<nav class="header-nav" aria-label="Primary">
<a href="index.html" class="header-link">Dashboard</a>
<a href="methodology.html" class="header-link">Methodology</a>
<!-- Icon-only control: accessible name comes from aria-label. Both icons
are always in the DOM — presumably the stylesheet shows one per
[data-theme]; verify in css/style.css. Click handler is wired by the
module script at the bottom of the page. -->
<button id="theme-toggle" class="header-link theme-toggle-btn" type="button" title="Toggle theme" aria-label="Toggle dark mode">
<svg class="icon-sun" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg>
<svg class="icon-moon" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z"/></svg>
</button>
<!-- External link: opens in a new tab; rel="noopener" severs the opener
reference. -->
<a href="https://github.com/abhijitramesh/webgpu-bench" target="_blank" rel="noopener" class="header-link">
<svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor"><path d="M12 0C5.37 0 0 5.37 0 12c0 5.3 3.44 9.8 8.2 11.39.6.11.82-.26.82-.58v-2.03c-3.34.73-4.04-1.61-4.04-1.61-.55-1.39-1.34-1.76-1.34-1.76-1.09-.74.08-.73.08-.73 1.2.09 1.84 1.24 1.84 1.24 1.07 1.83 2.81 1.3 3.5 1 .11-.78.42-1.3.76-1.6-2.67-.3-5.47-1.33-5.47-5.93 0-1.31.47-2.38 1.24-3.22-.13-.3-.54-1.52.12-3.18 0 0 1.01-.32 3.3 1.23a11.5 11.5 0 0 1 6.02 0c2.28-1.55 3.29-1.23 3.29-1.23.66 1.66.25 2.88.12 3.18.77.84 1.24 1.91 1.24 3.22 0 4.61-2.81 5.63-5.48 5.92.43.37.81 1.1.81 2.22v3.29c0 .32.22.7.82.58C20.57 21.8 24 17.3 24 12c0-6.63-5.37-12-12-12z"/></svg>
GitHub
</a>
</nav>
</div>
</header>
<main>
<!-- Page layout, top to bottom: hero, how-to guide, status banners, device
summary cards, machine-identity form, filter/controls bar, model picker,
action bar, progress table, sign-in row, output, logs. The run-mode badge
is empty here — presumably filled by the controller at mount; verify in
js/run/controller.js. -->
<section id="run-section" class="dash-section">
<div class="container">
<div class="run-hero">
<h1 class="run-hero-title">Run a benchmark</h1>
<span id="run-mode-badge" class="badge run-mode-badge"></span>
</div>
<!-- How-to-use guide — collapsible so it doesn't dominate the page
for repeat users, but open by default so first-timers see it. -->
<details class="card run-instructions" open>
<summary><strong>How to use this page</strong> — quick start, run modes, and configuration knobs</summary>
<div class="run-instructions-body">
<section>
<h3>Quick start</h3>
<ol>
<li>The three cards above show your <strong>device</strong>, its <strong>capabilities</strong>, and the safe <strong>model budget</strong> (largest variant that should fit without exhausting GPU/WASM memory).</li>
<li>Pick variants in the <strong>Models</strong> panel below — or use the <em>Quick set</em> / <em>All fit</em> selection buttons.</li>
<li>Click <strong>Run study</strong> for a curated comparison set, <strong>Run benchmarks</strong> for whatever you've checked, or <strong>Download selected</strong> to just cache files for later.</li>
</ol>
</section>
<section>
<h3>Run modes</h3>
<dl class="run-instructions-dl">
<dt><strong>Run study</strong></dt>
<dd>Curated, opinionated set: <code>Llama-3.2-1B-Instruct</code> at <code>Q2_K</code> / <code>Q4_K_M</code> / <code>Q8_0</code> / <code>F16</code>, plus every other model at <code>Q4_K_M</code>, filtered to what fits this device. Use this for apples-to-apples comparison across machines.</dd>
<dt><strong>Run benchmarks</strong></dt>
<dd>Runs whatever variants you've explicitly checked in the Models panel. Use this when you want to drill into a specific model or quant family.</dd>
<dt><strong>Download selected</strong></dt>
<dd>Caches the checked variants to OPFS without running them. Useful for pre-warming before a flight or running offline later.</dd>
<dt><strong>Abort</strong></dt>
<dd>Cancels the in-flight worker AND any concurrent download. Aborted variants get an "aborted" status; subsequent variants do not run.</dd>
<dt><strong>Purge OPFS cache</strong></dt>
<dd>Wipes every cached GGUF from browser storage. Only shown on hosted surfaces; helpful when the device is low on disk.</dd>
</dl>
</section>
<section>
<h3>What each run does</h3>
<p>By default each variant runs the GPU perf pass only. The CPU baseline is opt-in via the <em>Run</em> toggles below — flip them on if you want consistency or CPU-vs-GPU comparison numbers.</p>
<ol>
<li><strong>GPU pass (always)</strong> — runs a llama-bench-style perf sweep: one warmup + <code>reps</code> timed reps for both prompt processing (<code>pp</code>) and token generation (<code>tg</code>). 1-second cooldown between reps lets the GPU clock recover, so reps stay comparable instead of decaying across the sweep.</li>
<li><strong>CPU baseline (opt-in)</strong> — when <em>Run: Consistency</em> or <em>Run: CPU perf</em> is on, the CPU pass runs first to capture reference token IDs (for the GPU forced-decode agreement check) and/or a 1-rep CPU perf sample (for CPU vs GPU dashboard cells).</li>
</ol>
</section>
<section>
<h3>Configuration</h3>
<dl class="run-instructions-dl">
<dt><strong>Search / Hide</strong></dt>
<dd>Filter the Models panel. <em>Hide</em> toggles drop UD (Unsloth dynamic), IQ (i-quants), and BF16/F16 high-precision variants — useful when you only care about the "standard" K-quant lineup.</dd>
<dt><strong>Select: Quick set / All fit / None</strong></dt>
<dd><em>Quick set</em> = the same variants Run study uses. <em>All fit</em> = every variant under your device budget. <em>None</em> = clear the selection.</dd>
<dt><strong>Prompt tokens (-p)</strong></dt>
<dd>How many synthetic tokens go through prompt processing for the <code>pp</code> test. Default <code>512</code>. Larger = more compute-bound.</dd>
<dt><strong>Gen tokens (-n)</strong></dt>
<dd>How many tokens are generated for the <code>tg</code> test. Default <code>128</code>. Larger reps stress sustained decode bandwidth.</dd>
<dt><strong>Reps (-r)</strong></dt>
<dd>Timed repetitions of each test (after warmup). Default <code>5</code>. The reported figure is mean ± stddev across the reps.</dd>
<dt><strong>Run: Consistency</strong></dt>
<dd>Off by default. Turns on the CPU-baseline + GPU-forced-decode agreement check. Opt in when you want to verify CPU and GPU produce matching tokens for a given variant.</dd>
<dt><strong>Run: CPU perf</strong></dt>
<dd>Off by default. Turns on a 1-rep CPU perf baseline alongside the GPU pass so the dashboard's CPU vs GPU cells have data. Opt in when you want apples-to-apples CPU↔GPU numbers; leave off to keep runs short.</dd>
<dt><strong>Evict cached GGUFs after each run</strong></dt>
<dd>Frees OPFS storage as the queue moves through variants. Only evicts files this session downloaded — files you cached before the run are preserved.</dd>
</dl>
</section>
<section>
<h3>Reading the output</h3>
<p>The <strong>Progress</strong> table updates per-variant with live status (download → cpu → gpu → done). Final per-variant numbers fill the <code>pp tok/s</code> and <code>tg tok/s</code> columns as <code>mean ± stddev</code>. The <strong>Output</strong> textarea below contains the full JSON record — copy it, download it, or (on the HF Space) sign in to submit to the leaderboard.</p>
</section>
<section>
<h3>Known issues</h3>
<dl class="run-instructions-dl">
<dt><strong>Safari Private Browsing</strong></dt>
<dd>Don't use it. Private mode caps OPFS storage at a few hundred MB per tab, denies <code>navigator.storage.persist()</code>, and routes the file system through an ephemeral backend with stricter sync-handle limits. Downloads fail mid-stream with <em>"operation failed for an unknown transient reason"</em> and retries don't help — the cause isn't transient, it's the browsing mode. Switch to a regular Safari window.</dd>
<dt><strong>Mobile tabs (iOS/Android)</strong></dt>
<dd>iOS Jetsam and Android's low-memory killer reap tabs that approach the per-process memory ceiling, often silently reloading the page. The Run page applies tighter budgets and forced-eviction on mobile to mitigate, but variants near the budget edge can still crash the tab. Run from a desktop for stable numbers.</dd>
</dl>
</section>
</div>
</details>
<!-- Read-only banner (shown on any hosted surface that isn't the HF
Space — e.g. a mirror or preview deploy where OAuth isn't set up).
All three banners below ship hidden; the controller unhides the ones
that apply via the `hidden` attribute. -->
<div id="run-pages-banner" class="run-pages-banner" hidden>
<span>Read-only mode — to submit benchmarks, open the canonical <a href="https://abhijitramesh-webgpu-bench.static.hf.space/run.html" target="_blank" rel="noopener">HF Space</a>.</span>
</div>
<!-- Mobile-device warning: tabs on iOS/Android get reaped under
memory pressure, so benchmarks often crash and the tab silently
reloads. Shown by the controller when isMobileDevice() is true. -->
<div id="run-mobile-banner" class="run-pages-banner" hidden>
<span>Mobile device detected — the per-tab memory budget is tight, and larger quants will likely crash this tab. <strong>If you're on Safari, do not use Private Browsing</strong>: it caps OPFS storage at a few hundred MB and disables persistent storage, so model downloads fail with "operation failed for an unknown transient reason." For representative numbers, run from a laptop or desktop.</span>
</div>
<!-- Crash-recovery banner: set by the controller when a previous
Run started but never posted a success, suggesting the tab got
reaped mid-run. Appears on the next page load. -->
<div id="run-crash-banner" class="run-pages-banner" hidden>
<span id="run-crash-banner-text"></span>
<button id="run-crash-banner-dismiss" class="btn btn-secondary btn-xs" type="button">Dismiss</button>
</div>
<!-- Device & budget summary cards. The value spans are empty in markup —
presumably populated by the controller from UA/WebGPU probes; verify in
js/run/controller.js. -->
<div class="summary-grid run-device-grid">
<div class="stat-card run-device-card">
<span class="stat-card-label">Device</span>
<div class="run-device-rows">
<div class="run-device-row"><span class="run-device-row-label">Browser</span><span class="run-device-row-value" id="device-browser"></span></div>
<div class="run-device-row"><span class="run-device-row-label">Platform</span><span class="run-device-row-value" id="device-platform"></span></div>
<div class="run-device-row"><span class="run-device-row-label">GPU</span><span class="run-device-row-value" id="device-gpu"></span></div>
</div>
</div>
<div class="stat-card run-device-card">
<span class="stat-card-label">Capability</span>
<div class="run-device-rows">
<div class="run-device-row"><span class="run-device-row-label">deviceMemory</span><span class="run-device-row-value" id="device-memory"></span></div>
<div class="run-device-row"><span class="run-device-row-label">WebGPU</span><span class="run-device-row-value" id="device-webgpu"></span></div>
<div class="run-device-row"><span class="run-device-row-label">llama.cpp</span><span class="run-device-row-value" id="device-llamacpp"></span></div>
</div>
</div>
<div class="stat-card run-device-card">
<span class="stat-card-label">Model budget</span>
<div class="run-device-rows">
<div class="run-device-row"><span class="run-device-row-label">Max size</span><span class="run-device-row-value" id="device-budget"></span></div>
<div class="run-device-note" id="device-budget-source"></div>
</div>
</div>
</div>
<!-- User-reported machine identity. The auto-detected values in the
cards above are unreliable (UA strings lie, deviceMemory is
coarse, GPU adapter info is often empty). We ship these
user-typed fields alongside the auto-detected ones so the
leaderboard can attribute submissions correctly. Persisted to
localStorage between visits. -->
<details class="card run-machine-card" id="user-reported-card" open>
<summary>
<strong>Your machine</strong> — labels the auto-detected device data on submission. Saved between visits.
</summary>
<div class="run-machine-grid">
<label class="run-machine-field">
<span class="run-machine-label">Machine name <span class="run-machine-req" aria-hidden="true">*</span></span>
<input type="text" id="ur-machine-name" class="run-machine-input" placeholder="e.g. MacBook Pro M3 16GB" autocomplete="off" spellcheck="false">
</label>
<label class="run-machine-field">
<span class="run-machine-label">GPU name <span class="run-machine-opt">(optional)</span></span>
<input type="text" id="ur-gpu-name" class="run-machine-input" placeholder="e.g. Apple M3 Pro" autocomplete="off" spellcheck="false">
</label>
<label class="run-machine-field">
<span class="run-machine-label">Browser <span class="run-machine-req" aria-hidden="true">*</span></span>
<input type="text" id="ur-browser" class="run-machine-input" placeholder="e.g. Chrome 138 dev" autocomplete="off" spellcheck="false">
</label>
<label class="run-machine-field">
<span class="run-machine-label">Operating system <span class="run-machine-req" aria-hidden="true">*</span></span>
<input type="text" id="ur-os" class="run-machine-input" placeholder="e.g. macOS 15.4" autocomplete="off" spellcheck="false">
</label>
</div>
<p class="run-machine-hint" id="ur-hint">Required fields marked <span class="run-machine-req">*</span>. Defaults are filled in from your browser; edit anything that's wrong before running.</p>
</details>
<!-- Hide filters, iterations, actions. The number inputs mirror the
llama-bench CLI flags named in their labels (-p, -n, -d, -r). -->
<div class="filter-bar run-controls">
<div class="filter-bar-inner run-filters">
<div class="filter-group filter-group--search">
<label class="filter-label" for="family-search">Search</label>
<div class="run-search-wrapper">
<svg class="run-search-icon" width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"><circle cx="11" cy="11" r="8"/><line x1="21" y1="21" x2="16.65" y2="16.65"/></svg>
<input type="search" id="family-search" class="run-search-input" placeholder="Filter models…" autocomplete="off" spellcheck="false">
</div>
</div>
<div class="filter-group">
<span class="filter-label">Hide</span>
<div class="run-filters-checks">
<label class="run-hide-label"><input type="checkbox" id="hide-ud"> UD</label>
<label class="run-hide-label"><input type="checkbox" id="hide-iq"> IQ</label>
<label class="run-hide-label"><input type="checkbox" id="hide-hifp"> BF16/F16</label>
</div>
</div>
<div class="filter-group">
<span class="filter-label">Select</span>
<div class="run-filters-checks">
<button class="btn btn-secondary btn-xs" id="btn-select-quick" type="button">Quick set</button>
<button class="btn btn-secondary btn-xs" id="btn-select-fit" type="button">All fit</button>
<button class="btn btn-secondary btn-xs" id="btn-select-none" type="button">None</button>
</div>
</div>
<div class="filter-group">
<label class="filter-label" for="n-prompt-input">Prompt tokens (-p)</label>
<input type="number" id="n-prompt-input" class="filter-select run-iter-input" value="512" min="0" max="4096" step="1">
</div>
<div class="filter-group">
<label class="filter-label" for="n-gen-input">Gen tokens (-n)</label>
<input type="number" id="n-gen-input" class="filter-select run-iter-input" value="128" min="0" max="4096" step="1">
</div>
<!-- NOTE(review): this "KV depth (-d)" knob has no entry in the
Configuration section of the how-to guide above — consider documenting
it there alongside -p/-n/-r. -->
<div class="filter-group">
<label class="filter-label" for="n-depth-input">KV depth (-d)</label>
<input type="number" id="n-depth-input" class="filter-select run-iter-input" value="2048" min="0" max="32768" step="1">
</div>
<div class="filter-group">
<label class="filter-label" for="iterations-input">Reps (-r)</label>
<input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
</div>
<div class="filter-group">
<span class="filter-label">Run</span>
<div class="run-filters-checks">
<label class="run-hide-label" title="Run the consistency check (CPU baseline + GPU forced-decode agreement). Off by default — opt in to verify CPU and GPU produce matching tokens."><input type="checkbox" id="run-consistency"> Consistency</label>
<label class="run-hide-label" title="Run a single-rep CPU perf baseline alongside the GPU pass. Off by default — opt in for CPU vs GPU comparison numbers."><input type="checkbox" id="run-cpu-perf"> CPU perf</label>
</div>
</div>
</div>
</div>
<!-- Family / variant list — replaced by the controller once the model
manifest loads; the empty-state div is the pre-load placeholder. -->
<div id="run-models" class="run-models-stack">
<div class="empty-state">Loading models…</div>
</div>
<!-- Action bar: lives just above Progress so the Run/Abort/Download
controls are co-located with the live status they affect. The budget
readout and queue status start hidden; the controller unhides them. -->
<div class="run-action-bar">
<div class="run-action-bar-inner">
<div class="run-budget" id="run-budget" hidden>
<div class="run-budget-row">
<span class="run-budget-label">Selected</span>
<span class="run-budget-text" id="run-budget-text"></span>
</div>
<!-- Explicit 0–100 range for the progressbar role. NOTE(review): the
role also requires aria-valuenow (or aria-valuetext) to be kept in
sync when #run-budget-fill is resized — confirm the controller sets
it in js/run/controller.js. -->
<div class="run-budget-bar" role="progressbar" aria-labelledby="run-budget-text" aria-valuemin="0" aria-valuemax="100">
<div class="run-budget-bar-fill" id="run-budget-fill"></div>
</div>
<div class="run-budget-meta" id="run-budget-meta"></div>
</div>
<span id="queue-status" class="run-queue-status" hidden></span>
<!-- Abort/Purge are hidden until relevant; Download/Run enable once a
selection exists. Run study is always enabled (it picks its own set). -->
<div class="run-actions">
<button class="btn btn-secondary" id="btn-download" type="button" disabled>Download selected</button>
<button class="btn btn-primary" id="btn-run-study" type="button" title="Llama-3.2-1B-Instruct at Q2_K / Q4_K_M / Q8_0 / F16, plus every other model at Q4_K_M, filtered to what fits this device.">Run study</button>
<button class="btn btn-primary" id="btn-run" type="button" disabled>Run benchmarks</button>
<button class="btn btn-danger" id="btn-abort" type="button" hidden>Abort</button>
<button class="btn btn-secondary" id="btn-purge" type="button" hidden>Purge OPFS cache</button>
</div>
</div>
</div>
<!-- Progress: per-variant status table, rendered into the wrapper by the
controller as the queue advances. -->
<div class="section-header">
<h2 class="subsection-title">Progress</h2>
</div>
<div class="table-card">
<div id="run-progress-wrapper" class="results-wrapper"></div>
</div>
<!-- HF sign-in + submit (space surface only) — sits between Progress
and Output so users land on it once they have results to push. -->
<div id="hub-row" class="card hub-row" hidden>
<div class="hub-row-inner">
<div class="hub-row-info">
<span id="hf-user"></span>
</div>
<div class="hub-row-actions">
<button id="btn-signin" class="btn btn-secondary" type="button">Sign in with Hugging Face</button>
<button id="btn-submit" class="btn btn-primary" type="button" disabled hidden>Submit to leaderboard</button>
</div>
</div>
</div>
<!-- Output: full JSON record of the run, plus copy/download actions.
NOTE(review): the inline style="margin-top" here and on the log
<details> below belongs in css/style.css. -->
<div class="section-header" style="margin-top: 32px;">
<h2 class="subsection-title">Output</h2>
</div>
<div class="card run-output">
<label id="save-local-row" class="run-output-toggle" hidden>
<input type="checkbox" id="save-local" checked>
Save to <code>results/results.json</code> on this server
</label>
<label id="evict-after-row" class="run-output-toggle">
<input type="checkbox" id="evict-after-run">
Evict cached GGUFs after each run (frees disk/OPFS as the queue moves)
</label>
<textarea id="output-textarea" class="run-output-textarea" readonly spellcheck="false" aria-label="Benchmark results output (JSON)" placeholder="Run benchmarks to generate output here…"></textarea>
<div class="run-output-buttons">
<button class="btn btn-secondary" id="btn-copy" type="button">Copy</button>
<button class="btn btn-secondary" id="btn-download-json" type="button">Download JSON</button>
</div>
</div>
<!-- Log: collapsed diagnostic panes filled by the controller. -->
<details id="run-log" class="card run-log" style="margin-top: 16px;">
<summary>Run log</summary>
<pre id="log-output" class="run-log-pre"></pre>
</details>
<details id="run-wasm-errors" class="card run-log" style="margin-top: 12px;">
<summary>WASM errors</summary>
<pre id="wasm-error-output" class="run-log-pre" aria-label="WASM error log"></pre>
</details>
</div>
</section>
</main>
<script type="module">
import { mountRunSection } from './js/run/controller.js';
// Theme toggle wiring (kept here since app.js no longer runs on this page).
document.getElementById('theme-toggle')?.addEventListener('click', () => {
const next = document.documentElement.getAttribute('data-theme') === 'dark' ? 'light' : 'dark';
document.documentElement.setAttribute('data-theme', next);
localStorage.setItem('theme', next);
});
mountRunSection();
</script>
</body>
</html>