<!-- Spaces: Running -->
<!doctype html>
<html lang="en"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1">
<title>Decode roofline — your GPU</title>
<style>
/* Colour palette shared by every rule below (dark theme). */
:root{--bg:#0a0d13;--panel:#111825;--ink:#e8ebf1;--dim:#8b95a7;--ac:#7c5cff;--ok:#48c26c;--no:#f0616d;--warn:#e0a94a;--line:#1e2836}
/* Page frame: a single centred column, at most 760px wide. */
*{box-sizing:border-box}body{background:var(--bg);color:var(--ink);font:15px/1.6 -apple-system,Segoe UI,Roboto,monospace;padding:24px;max-width:760px;margin:0 auto}
h1{font-size:20px;margin:0 0 4px}.sub{color:var(--dim);font-size:13px;margin:0 0 18px}
/* .card is used for the status line and for each result panel. */
.card{font-family:ui-monospace,monospace;padding:14px 16px;border:1px solid var(--line);border-radius:10px;background:var(--panel);margin-bottom:12px;word-break:break-word}
.big{font-size:22px;font-weight:700}.k{color:var(--dim)}.v{font-weight:600}
/* State colours: the script appends one of these class names to a card or table cell. */
.ok{color:var(--ok)}.no{color:var(--no)}.warn{color:var(--warn)}
table{width:100%;border-collapse:collapse;font-family:ui-monospace,monospace;font-size:13px}
td{padding:5px 8px;border-bottom:1px solid var(--line)}td.n{text-align:right;font-variant-numeric:tabular-nums}
.verdict{font-size:17px;font-weight:600;margin-top:14px}
</style></head><body>
<h1>BitNet decode <span style="color:var(--ac)">roofline</span> on your GPU</h1>
<p class="sub">Measures your GPU's achievable VRAM read bandwidth, then computes the batch-1 decode ceiling for BitNet-2B (0.69 GB read per token) and how far the current kernel is from it.</p>
<!-- #status: one-line progress/error card rewritten by the script; #out: receives the results markup. -->
<div id="status" class="card">starting…</div>
<div id="out"></div>
<script type="module">
// `$` is a querySelector shorthand; `st` is the status card and `out` the results container.
const $ = (s) => document.querySelector(s);
const st = $("#status");
const out = $("#out");

// MODEL_GB: weights read per generated token (the page text quotes 0.69 GB for BitNet-2B).
// CUR_MS: per-token latency of the current kernel in milliseconds (tok/s is derived as 1000 / CUR_MS).
const MODEL_GB = 0.69;
const CUR_MS = 18.4;

/**
 * Replace the status card's text and state colour.
 * @param {string} t - Message to show.
 * @param {string} [cls] - Optional extra class ("ok", "no" or "warn") appended after "card".
 */
const say = (t, cls) => {
  st.textContent = t;
  st.className = "card" + (cls ? " " + cls : "");
};

// Surface promise rejections that escape every try/catch instead of failing silently.
window.addEventListener("unhandledrejection", (e) => {
  // Fall back to a fixed label so a reason-less rejection does not print "undefined".
  const detail = e.reason?.message || e.reason || "unknown error";
  say("✗ unhandled: " + detail, "no");
});
// FILL: write a non-zero, non-compressible pattern so the read pass hits real VRAM
// (an uninitialised STORAGE buffer is zero/DCC-compressed → the memory controller
// never fetches it → measured bandwidth is fictional). Must run before timing.
//
// Each invocation starts at its global id and advances by P.y (the total thread
// count) until it passes P.x (the number of vec4 elements), storing a value
// derived from the element index.
const FILL = `
@group(0) @binding(0) var<storage, read_write> d : array<vec4<u32>>;
@group(0) @binding(1) var<uniform> P : vec4<u32>;   // x=#vec4 elems, y=total threads
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid:vec3<u32>){
  let n=P.x; let stride=P.y; var i=gid.x;
  loop { if(i>=n){break;}
    d[i]=vec4<u32>(i*2654435761u+1u, i*40503u+7u, i^0x9e3779b9u, i*2246822519u+3u);
    i=i+stride; }
}`;
// WGSL: the timed read kernel. Each invocation XOR-accumulates the vec4 elements
// it visits (start at its global id, step by P.y, stop at P.x), repeating the whole
// sweep P.z times, then stores the folded accumulator into `sink` at its global id.
// The store gives the loads an observable result so they cannot be optimised away.
const WGSL = `
@group(0) @binding(0) var<storage, read> data : array<vec4<u32>>;
@group(0) @binding(1) var<storage, read_write> sink : array<u32>;
@group(0) @binding(2) var<uniform> P : vec4<u32>;   // x=#vec4 elems, y=total threads, z=repeat
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid:vec3<u32>){
  let n=P.x; let stride=P.y; let R=P.z; var acc=vec4<u32>(0u);
  for(var rep=0u; rep<R; rep=rep+1u){ var i=gid.x; loop { if(i>=n){break;} acc=acc ^ data[i]; i=i+stride; } }
  sink[gid.x]=acc.x ^ acc.y ^ acc.z ^ acc.w;
}`;
/**
 * Benchmark driver (runs once on load).
 *
 * Steps: acquire a WebGPU adapter/device, allocate a large storage buffer, fill it
 * with the FILL shader, time batched sweeps of the WGSL read shader, then render the
 * measured bandwidth and the derived decode roofline into `#out`. Progress and every
 * failure are reported through `say`; nothing is thrown to the caller.
 */
(async () => {
  // Every GPUBuffer allocated below is recorded here so `finally` can release it
  // however the run ends (the main buffer alone can be up to 1 GiB).
  const gpuBuffers = [];
  const track = (buffer) => { gpuBuffers.push(buffer); return buffer; };
  try {
    if (!navigator.gpu) { say("✗ No WebGPU. Open in Chrome/Edge or a recent browser.", "no"); return; }
    say("requesting adapter…");
    let ad = await navigator.gpu.requestAdapter({ powerPreference: "high-performance" });
    if (!ad) ad = await navigator.gpu.requestAdapter();
    if (!ad) { say("✗ no GPU adapter available.", "no"); return; }
    const L = ad.limits;
    say("requesting device…");
    // Ask for the adapter's own maxima so the large buffer and dispatch are permitted.
    const dev = await ad.requestDevice({ requiredLimits: {
      maxStorageBufferBindingSize: L.maxStorageBufferBindingSize,
      maxBufferSize: L.maxBufferSize,
      maxComputeWorkgroupsPerDimension: L.maxComputeWorkgroupsPerDimension,
    } });
    dev.lost.then((lost) => say("✗ device lost: " + (lost?.message || lost?.reason || ""), "no"));
    // Scopes pop in reverse order of pushing. The out-of-memory scope catches a failed
    // allocation, which a validation scope alone would not report.
    dev.pushErrorScope("out-of-memory");
    dev.pushErrorScope("validation");
    const info = ad.info || {};

    // Buffer must exceed the GPU's last-level cache (RDNA-3 Infinity Cache is up to 96 MB)
    // so re-reads actually miss to VRAM. Aim for 1 GiB but never ask for more than the
    // device allows: exceeding the limits only produces a validation failure.
    const TARGET_BYTES = 1024 * 1024 * 1024;
    const CACHE_FLOOR_BYTES = 512 * 1024 * 1024;
    const limitBytes = Math.min(L.maxStorageBufferBindingSize, L.maxBufferSize, TARGET_BYTES);
    const bytes = Math.floor(limitBytes / 16) * 16;   // whole vec4<u32> elements only
    const nVec = bytes / 16;
    const isBelowFloor = bytes < CACHE_FLOOR_BYTES;
    const megabytes = (bytes / 1048576).toFixed(0);

    say("allocating " + megabytes + " MB…");
    const buf = track(dev.createBuffer({ size: bytes, usage: GPUBufferUsage.STORAGE }));
    const wg = Math.min(L.maxComputeWorkgroupsPerDimension, 65535);
    const TOTAL = wg * 256;   // 256 = @workgroup_size in both shaders
    const R = 1;              // in-shader repeat count (P.z)
    const sink = track(dev.createBuffer({ size: TOTAL * 4, usage: GPUBufferUsage.STORAGE }));
    const P = track(dev.createBuffer({ size: 16, usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST }));
    dev.queue.writeBuffer(P, 0, new Uint32Array([nVec, TOTAL, R, 0]));

    // fill pipeline (non-zero pattern) — its own bind group, same buffer
    const fmod = dev.createShaderModule({ code: FILL });
    const fillErrors = (await fmod.getCompilationInfo()).messages.filter((m) => m.type === "error");
    if (fillErrors.length) { say("✗ FILL WGSL: " + fillErrors[0].message, "no"); return; }
    const fpipe = dev.createComputePipeline({ layout: "auto", compute: { module: fmod, entryPoint: "main" } });
    const fbg = dev.createBindGroup({ layout: fpipe.getBindGroupLayout(0), entries: [
      { binding: 0, resource: { buffer: buf } },
      { binding: 1, resource: { buffer: P } },
    ] });

    // read pipeline (the one that gets timed)
    const mod = dev.createShaderModule({ code: WGSL });
    const readErrors = (await mod.getCompilationInfo()).messages.filter((m) => m.type === "error");
    if (readErrors.length) { say("✗ WGSL: " + readErrors[0].message, "no"); return; }
    const pipe = dev.createComputePipeline({ layout: "auto", compute: { module: mod, entryPoint: "main" } });
    const bg = dev.createBindGroup({ layout: pipe.getBindGroupLayout(0), entries: [
      { binding: 0, resource: { buffer: buf } },
      { binding: 1, resource: { buffer: sink } },
      { binding: 2, resource: { buffer: P } },
    ] });

    const validationErr = await dev.popErrorScope();
    const memoryErr = await dev.popErrorScope();
    if (validationErr) { say("✗ GPU validation: " + validationErr.message, "no"); return; }
    if (memoryErr) { say("✗ GPU out of memory: " + memoryErr.message, "no"); return; }

    // write the non-zero pattern into VRAM before we time reads
    say("filling " + megabytes + " MB with non-zero pattern…");
    {
      const enc = dev.createCommandEncoder();
      const pass = enc.beginComputePass();
      pass.setPipeline(fpipe);
      pass.setBindGroup(0, fbg);
      pass.dispatchWorkgroups(wg);
      pass.end();
      dev.queue.submit([enc.finish()]);
      await dev.queue.onSubmittedWorkDone();
    }

    // Batch many read passes into ONE command buffer, submit once, sync with
    // onSubmittedWorkDone. Moving tens of GB makes GPU time >> fixed submit/map
    // latency, so wall-clock reflects real VRAM bandwidth (not the ~0.1 ms overhead floor).
    const PASSES = 64;   // 64 × buffer ≈ 32–64 GB of reads at the 512 MB–1 GB sizes
    /**
     * Submit `passes` back-to-back sweeps of the read shader in one command buffer.
     * @param {number} passes - Number of compute passes to encode.
     * @returns {Promise<number>} Wall-clock milliseconds from submit to completion.
     */
    const timeReads = async (passes) => {
      const enc = dev.createCommandEncoder();
      for (let k = 0; k < passes; k++) {
        const pass = enc.beginComputePass();
        pass.setPipeline(pipe);
        pass.setBindGroup(0, bg);
        pass.dispatchWorkgroups(wg);
        pass.end();
      }
      const commands = enc.finish();   // finish before starting the clock: encoding is CPU work
      const t0 = performance.now();
      dev.queue.submit([commands]);
      await dev.queue.onSubmittedWorkDone();
      return performance.now() - t0;
    };

    // Bytes one timed run reads: every pass sweeps the buffer R times.
    const readGiB = bytes * R * PASSES / 1073741824;
    say("timing " + readGiB.toFixed(1) + " GB of reads…");
    await timeReads(4);   // warm up (compile, clocks up)
    let best = 1e9;
    for (let k = 0; k < 5; k++) { best = Math.min(best, await timeReads(PASSES)); }

    // NOTE(review): this divides by 2^30, so the "GB/s" figure is GiB/s — confirm that
    // MODEL_GB uses the same unit.
    const gbps = readGiB / (best / 1000);
    const roofTok = gbps / MODEL_GB;
    const curTok = 1000 / CUR_MS;
    const curGBps = MODEL_GB / (CUR_MS / 1000);
    const pct = 100 * curGBps / gbps;
    const gap = gbps / curGBps;
    const reachable = roofTok >= 1000;
    const specNeeded = Math.max(1, 1000 / roofTok);
    // Shown only when device limits forced a buffer under the cache-busting floor.
    const smallBufferNote = isBelowFloor
      ? '<div class="card warn">⚠ Device limits cap the test buffer at ' + megabytes + ' MB (target ≥ 512 MB); reads may be served from cache, so the bandwidth below may be overstated.</div>'
      : '';

    say("done · adapter: " + ((info.vendor || "?") + " " + (info.architecture || "") + " " + (info.device || "")).trim(), "ok");
    out.innerHTML = `
${smallBufferNote}
<div class="card"><span class="k">Achievable VRAM read bandwidth</span><br><span class="big">${gbps.toFixed(0)} GB/s</span></div>
<div class="card"><table>
<tr><td>Weights read / token (BitNet-2B t2)</td><td class="n">${MODEL_GB} GB</td></tr>
<tr><td><b>Bandwidth roofline (batch-1 ceiling)</b></td><td class="n"><b>${roofTok.toFixed(0)} tok/s</b></td></tr>
<tr><td>Current kernel</td><td class="n">${curTok.toFixed(0)} tok/s · ${curGBps.toFixed(0)} GB/s</td></tr>
<tr><td>Current kernel reaches</td><td class="n ${pct<20?'no':pct<60?'warn':'ok'}">${pct.toFixed(1)}% of roofline</td></tr>
<tr><td>Kernel headroom to roofline</td><td class="n">${gap.toFixed(1)}×</td></tr>
</table></div>
<div class="verdict ${reachable?'ok':'warn'}">${reachable
  ? '✓ >1000 tok/s is within your GPU roofline ('+roofTok.toFixed(0)+') — a bandwidth-optimal ternary kernel ('+gap.toFixed(1)+'× headroom) gets there; spec-decode is margin.'
  : '⚠ Your GPU batch-1 roofline is '+roofTok.toFixed(0)+' tok/s. A perfect kernel reaches ~'+roofTok.toFixed(0)+'; >1000 needs speculative decode accepting ≥'+specNeeded.toFixed(1)+' tokens/pass, or a card with more bandwidth.'}
</div>`;
  } catch (e) {
    say("✗ " + (e && (e.message || e)), "no");
  } finally {
    // Release VRAM explicitly rather than waiting for garbage collection or page close.
    for (const buffer of gpuBuffers) buffer.destroy();
  }
})();
</script></body></html>