q / roofline.html
Humuhumu33's picture
roofline: batched passes + onSubmittedWorkDone for real bw
fa278c0 verified
Raw
History Blame Contribute Delete
8.91 kB
<!doctype html><html><head><meta charset=utf8><meta name=viewport content="width=device-width,initial-scale=1">
<title>Decode roofline — your GPU</title>
<style>
:root{--bg:#0a0d13;--panel:#111825;--ink:#e8ebf1;--dim:#8b95a7;--ac:#7c5cff;--ok:#48c26c;--no:#f0616d;--warn:#e0a94a;--line:#1e2836}
*{box-sizing:border-box}body{margin:0;background:var(--bg);color:var(--ink);font:15px/1.6 -apple-system,Segoe UI,Roboto,monospace;padding:24px;max-width:760px;margin:0 auto}
h1{font-size:20px;margin:0 0 4px}.sub{color:var(--dim);font-size:13px;margin:0 0 18px}
.card{font-family:ui-monospace,monospace;padding:14px 16px;border:1px solid var(--line);border-radius:10px;background:var(--panel);margin-bottom:12px;word-break:break-word}
.big{font-size:22px;font-weight:700}.k{color:var(--dim)}.v{font-weight:600}
.ok{color:var(--ok)}.no{color:var(--no)}.warn{color:var(--warn)}
table{width:100%;border-collapse:collapse;font-family:ui-monospace,monospace;font-size:13px}
td{padding:5px 8px;border-bottom:1px solid var(--line)}td.n{text-align:right;font-variant-numeric:tabular-nums}
.verdict{font-size:17px;font-weight:600;margin-top:14px}
</style></head><body>
<h1>BitNet decode <span style="color:var(--ac)">roofline</span> on your GPU</h1>
<p class="sub">Measures your GPU's achievable VRAM read bandwidth, then computes the batch-1 decode ceiling for BitNet-2B (0.69&nbsp;GB read per token) and how far the current kernel is from it.</p>
<div id="status" class="card">starting…</div>
<div id="out"></div>
<script type="module">
// Shorthand for document.querySelector.
const $ = (selector) => document.querySelector(selector);
// Status card (progress / error text) and the container the results are rendered into.
const st = $("#status");
const out = $("#out");
// Weights read per generated token, in GB (the 0.69 GB figure quoted in the page intro).
const MODEL_GB = 0.69;
// Per-token latency of the current kernel in milliseconds; converted to tok/s and GB/s
// when the results table is built.
const CUR_MS = 18.4;
/**
 * Replace the status card text and (optionally) tint it.
 * @param {string} t   Message to display.
 * @param {string} [cls] Extra CSS class ("ok", "no", "warn") appended after "card".
 */
const say = (t, cls) => {
  st.textContent = t;
  st.className = cls ? "card " + cls : "card";
};
// Surface promise rejections that escape every try/catch in the status card.
window.addEventListener("unhandledrejection", (e) => {
  const reason = e.reason;
  const detail = reason && (reason.message || reason);
  say("✗ unhandled: " + detail, "no");
});
// FILL: WGSL compute shader that overwrites the whole benchmark buffer with a
// non-zero, position-dependent pattern before any read is timed.
// Author's rationale: an uninitialised STORAGE buffer is zero/DCC-compressed, so the
// memory controller never fetches it and the measured bandwidth would be fictional.
// Bindings: 0 = the buffer (read_write, viewed as vec4<u32>), 1 = uniform P where
// P.x is the number of vec4 elements and P.y the total number of dispatched threads.
// Each invocation starts at its global id and advances by P.y until it passes P.x,
// so the buffer is covered regardless of how many workgroups are dispatched.
// Every element gets four different functions of its index (three multiply-adds and
// one XOR with a constant), so neighbouring elements do not repeat.
const FILL = `
@group(0) @binding(0) var<storage, read_write> d : array<vec4<u32>>;
@group(0) @binding(1) var<uniform> P : vec4<u32>; // x=#vec4 elems, y=total threads
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid:vec3<u32>){
let n=P.x; let stride=P.y; var i=gid.x;
loop { if(i>=n){break;}
d[i]=vec4<u32>(i*2654435761u+1u, i*40503u+7u, i^0x9e3779b9u, i*2246822519u+3u);
i=i+stride; }
}`;
// WGSL: the timed read kernel.
// Bindings: 0 = benchmark buffer (read-only, viewed as vec4<u32>), 1 = sink with one
// u32 per invocation, 2 = uniform P (x = #vec4 elements, y = total threads, z = repeat
// count).
// Each invocation XOR-accumulates data[i] for i = gid.x, gid.x + P.y, ... while i < P.x,
// repeats that sweep P.z times, then folds the four lanes into one word and stores it
// in sink[gid.x]. The store is presumably what keeps the compiler from discarding the
// loads as dead code.
// NOTE(review): every repeat XORs the same elements into the same accumulator, so an
// even P.z cancels to zero; the page currently uploads a repeat count of 1.
const WGSL = `
@group(0) @binding(0) var<storage, read> data : array<vec4<u32>>;
@group(0) @binding(1) var<storage, read_write> sink : array<u32>;
@group(0) @binding(2) var<uniform> P : vec4<u32>; // x=#vec4 elems, y=total threads, z=repeat
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid:vec3<u32>){
let n=P.x; let stride=P.y; let R=P.z; var acc=vec4<u32>(0u);
for(var rep=0u; rep<R; rep=rep+1u){ var i=gid.x; loop { if(i>=n){break;} acc=acc ^ data[i]; i=i+stride; } }
sink[gid.x]=acc.x ^ acc.y ^ acc.z ^ acc.w;
}`;
(async()=>{
  // Benchmark driver. Steps:
  //   1. acquire a WebGPU adapter/device with the adapter's maximum buffer limits,
  //   2. allocate the largest test buffer the device allows (capped at 1 GiB),
  //   3. fill it with a non-zero pattern, then time batched read passes over it,
  //   4. turn the best time into a bandwidth figure and render the roofline table.
  // Every failure path reports through say() and returns without rendering results.
  try{
    if(!navigator.gpu){ say("✗ No WebGPU. Open in Chrome/Edge or a recent browser.","no"); return; }
    say("requesting adapter…");
    let ad = await navigator.gpu.requestAdapter({powerPreference:"high-performance"});
    if(!ad) ad = await navigator.gpu.requestAdapter();
    if(!ad){ say("✗ no GPU adapter available.","no"); return; }
    const L=ad.limits;
    say("requesting device…");
    const dev=await ad.requestDevice({requiredLimits:{
      maxStorageBufferBindingSize:L.maxStorageBufferBindingSize, maxBufferSize:L.maxBufferSize,
      maxComputeWorkgroupsPerDimension:L.maxComputeWorkgroupsPerDimension }});
    // Remember device loss: once the device is gone, queued work never ran, so any
    // elapsed time measured afterwards must not be turned into a bandwidth number.
    let deviceLost=false;
    dev.lost.then(i => {
      deviceLost=true;
      say("✗ device lost: "+((i && (i.message||i.reason))||""),"no");
    });
    const info=ad.info||{};

    // Buffer must exceed the GPU's last-level cache (RDNA-3 Infinity Cache is up to 96 MB)
    // so re-reads actually miss to VRAM. Aim for 1 GiB, but never ask for more than the
    // device limits permit: a larger request can only fail validation. When the limits
    // force the buffer under 512 MB the run still proceeds and the result is flagged.
    const MIB=1024*1024;
    const TARGET_BYTES=1024*MIB;
    const CACHE_SAFE_BYTES=512*MIB;
    const limitBytes=Math.min(L.maxStorageBufferBindingSize, L.maxBufferSize);
    const bytes=Math.floor(Math.min(limitBytes, TARGET_BYTES)/16)*16; // whole vec4<u32> elements
    if(!(bytes>=16)){ say("✗ device storage-buffer limits are too small to run the benchmark.","no"); return; }
    const belowCacheSafe=bytes<CACHE_SAFE_BYTES;
    const nVec=bytes/16;

    // Capture allocation failures separately from validation errors so a failed
    // allocation is reported as such rather than as a later "invalid buffer" error.
    // Scopes are a stack: the validation scope is pushed last and therefore popped first.
    dev.pushErrorScope("out-of-memory");
    dev.pushErrorScope("validation");
    say("allocating "+(bytes/1048576).toFixed(0)+" MB…");
    const buf=dev.createBuffer({size:bytes, usage:GPUBufferUsage.STORAGE});
    // One dispatch dimension only; each workgroup has 256 invocations (see the shaders).
    const wg=Math.min(L.maxComputeWorkgroupsPerDimension,65535), TOTAL=wg*256, R=1;
    const sink=dev.createBuffer({size:TOTAL*4, usage:GPUBufferUsage.STORAGE});
    const P=dev.createBuffer({size:16, usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST});
    dev.queue.writeBuffer(P,0,new Uint32Array([nVec, TOTAL, R, 0]));
    // fill pipeline (non-zero pattern) — its own bind group, same buffer
    const fmod=dev.createShaderModule({code:FILL});
    const fci=await fmod.getCompilationInfo();
    const fer=fci.messages.filter(m=>m.type==="error");
    if(fer.length){ say("✗ FILL WGSL: "+fer[0].message,"no"); return; }
    const fpipe=dev.createComputePipeline({layout:"auto",compute:{module:fmod,entryPoint:"main"}});
    const fbg=dev.createBindGroup({layout:fpipe.getBindGroupLayout(0),entries:[
      {binding:0,resource:{buffer:buf}},
      {binding:1,resource:{buffer:P}}]});
    // read pipeline
    const mod=dev.createShaderModule({code:WGSL});
    const ci=await mod.getCompilationInfo();
    const er=ci.messages.filter(m=>m.type==="error");
    if(er.length){ say("✗ WGSL: "+er[0].message,"no"); return; }
    const pipe=dev.createComputePipeline({layout:"auto",compute:{module:mod,entryPoint:"main"}});
    const bg=dev.createBindGroup({layout:pipe.getBindGroupLayout(0),entries:[
      {binding:0,resource:{buffer:buf}},
      {binding:1,resource:{buffer:sink}},
      {binding:2,resource:{buffer:P}}]});
    const validationErr=await dev.popErrorScope();
    const oomErr=await dev.popErrorScope();
    // Report the allocation failure first: it is the root cause of any follow-on
    // validation error about an invalid buffer.
    if(oomErr){ say("✗ GPU out of memory allocating "+(bytes/1048576).toFixed(0)+" MB: "+oomErr.message,"no"); return; }
    if(validationErr){ say("✗ GPU validation: "+validationErr.message,"no"); return; }

    // write the non-zero pattern into VRAM before we time reads
    say("filling "+(bytes/1048576).toFixed(0)+" MB with non-zero pattern…");
    {
      const e=dev.createCommandEncoder();
      const p=e.beginComputePass();
      p.setPipeline(fpipe); p.setBindGroup(0,fbg); p.dispatchWorkgroups(wg); p.end();
      dev.queue.submit([e.finish()]);
      await dev.queue.onSubmittedWorkDone();
    }
    if(deviceLost) return; // status card already shows the loss message

    // Batch many read passes into ONE command buffer, submit once, sync with
    // onSubmittedWorkDone. Moving tens of GB makes GPU time >> fixed submit/map
    // latency, so wall-clock reflects real VRAM bandwidth (not the ~0.1 ms overhead floor).
    const PASSES=64; // 64 full sweeps of the buffer per timed submit
    // Encode `passes` read sweeps, submit them as one command buffer and return the
    // wall-clock milliseconds from submit until the queue reports the work done.
    async function timeReads(passes){
      const e=dev.createCommandEncoder();
      for(let k=0;k<passes;k++){
        const p=e.beginComputePass();
        p.setPipeline(pipe); p.setBindGroup(0,bg); p.dispatchWorkgroups(wg); p.end();
      }
      const t0=performance.now();
      dev.queue.submit([e.finish()]);
      await dev.queue.onSubmittedWorkDone();
      return performance.now()-t0;
    }
    say("timing "+(bytes/1048576*PASSES/1024).toFixed(1)+" GB of reads…");
    await timeReads(4); // warm up (compile, clocks up)
    if(deviceLost) return;
    // Keep the fastest of five runs; slower runs are assumed to include scheduling noise.
    let best=Infinity;
    for(let k=0;k<5;k++){
      best=Math.min(best, await timeReads(PASSES));
      if(deviceLost) return; // a lost device completes "instantly": not a measurement
    }
    if(!(best>0) || !Number.isFinite(best)){ say("✗ could not measure a non-zero read time.","no"); return; }

    // NOTE(review): bytes are divided by 2^30 here while the result is labelled GB/s and
    // compared against MODEL_GB — confirm both use the same unit convention.
    const gbps = (bytes*PASSES/1073741824)/(best/1000);
    const roofTok=gbps/MODEL_GB, curTok=1000/CUR_MS, curGBps=MODEL_GB/(CUR_MS/1000);
    const pct=100*curGBps/gbps, gap=gbps/curGBps, reachable=roofTok>=1000, specNeeded=Math.max(1,1000/roofTok);
    // Shown only when device limits forced a buffer smaller than the 512 MB target.
    const cacheNote = belowCacheSafe
      ? '<div class="card warn">⚠ Device limits cap the test buffer at '+(bytes/1048576).toFixed(0)+' MB (below the 512 MB target), so part of it may be served from the GPU last-level cache and the bandwidth above may be overstated.</div>'
      : '';
    say("done · adapter: "+((info.vendor||"?")+" "+(info.architecture||"")+" "+(info.device||"")).trim(),"ok");
    out.innerHTML=`
<div class="card"><span class="k">Achievable VRAM read bandwidth</span><br><span class="big">${gbps.toFixed(0)} GB/s</span></div>
${cacheNote}
<div class="card"><table>
<tr><td>Weights read / token (BitNet-2B t2)</td><td class="n">${MODEL_GB} GB</td></tr>
<tr><td><b>Bandwidth roofline (batch-1 ceiling)</b></td><td class="n"><b>${roofTok.toFixed(0)} tok/s</b></td></tr>
<tr><td>Current kernel</td><td class="n">${curTok.toFixed(0)} tok/s · ${curGBps.toFixed(0)} GB/s</td></tr>
<tr><td>Current kernel reaches</td><td class="n ${pct<20?'no':pct<60?'warn':'ok'}">${pct.toFixed(1)}% of roofline</td></tr>
<tr><td>Kernel headroom to roofline</td><td class="n">${gap.toFixed(1)}×</td></tr>
</table></div>
<div class="verdict ${reachable?'ok':'warn'}">${reachable
? '✓ >1000 tok/s is within your GPU roofline ('+roofTok.toFixed(0)+') — a bandwidth-optimal ternary kernel ('+gap.toFixed(1)+'× headroom) gets there; spec-decode is margin.'
: '⚠ Your GPU batch-1 roofline is '+roofTok.toFixed(0)+' tok/s. A perfect kernel reaches ~'+roofTok.toFixed(0)+'; >1000 needs speculative decode accepting ≥'+specNeeded.toFixed(1)+' tokens/pass, or a card with more bandwidth.'}
</div>`;
  }catch(e){ say("✗ "+(e && (e.message||e)), "no"); }
})();
</script></body></html>