Spaces:

HOLOGRAMTECH
/

q

Running

App Files Files Community

q / roofline.html

Humuhumu33

roofline: batched passes + onSubmittedWorkDone for real bw

fa278c0 verified about 8 hours ago

Raw

History Blame Contribute Delete

8.91 kB

	<!doctype html><html><head><meta charset=utf8><meta name=viewport content="width=device-width,initial-scale=1">
	<title>Decode roofline — your GPU</title>
	<style>
	/* Colour palette exposed as custom properties: page background, card background,
	   foreground text, dimmed text, accent, ok/no/warn status colours and border lines. */
	:root{--bg:#0a0d13;--panel:#111825;--ink:#e8ebf1;--dim:#8b95a7;--ac:#7c5cff;--ok:#48c26c;--no:#f0616d;--warn:#e0a94a;--line:#1e2836}
	/* Border-box sizing everywhere; the body is a padded column capped at 760px.
	   The second `margin` declaration (0 auto) overrides the first and centres the column. */
	*{box-sizing:border-box}body{margin:0;background:var(--bg);color:var(--ink);font:15px/1.6 -apple-system,Segoe UI,Roboto,monospace;padding:24px;max-width:760px;margin:0 auto}
	/* Page heading and the dimmed sub-heading paragraph. */
	h1{font-size:20px;margin:0 0 4px}.sub{color:var(--dim);font-size:13px;margin:0 0 18px}
	/* .card: bordered monospace panel used for the status line and the result boxes. */
	.card{font-family:ui-monospace,monospace;padding:14px 16px;border:1px solid var(--line);border-radius:10px;background:var(--panel);margin-bottom:12px;word-break:break-word}
	/* .big: headline number; .k: dimmed label; .v: emphasised value. */
	.big{font-size:22px;font-weight:700}.k{color:var(--dim)}.v{font-weight:600}
	/* Status colour classes; the script appends one of these to a card / cell / verdict class list. */
	.ok{color:var(--ok)}.no{color:var(--no)}.warn{color:var(--warn)}
	/* Results table: full width, collapsed borders, small monospace text. */
	table{width:100%;border-collapse:collapse;font-family:ui-monospace,monospace;font-size:13px}
	/* td.n: right-aligned numeric cell with tabular (fixed-width) digits. */
	td{padding:5px 8px;border-bottom:1px solid var(--line)}td.n{text-align:right;font-variant-numeric:tabular-nums}
	/* .verdict: the one-line conclusion rendered below the table. */
	.verdict{font-size:17px;font-weight:600;margin-top:14px}
	</style></head><body>
	<h1>BitNet decode <span style="color:var(--ac)">roofline</span> on your GPU</h1>
	<p class="sub">Measures your GPU's achievable VRAM read bandwidth, then computes the batch-1 decode ceiling for BitNet-2B (0.69 GB read per token) and how far the current kernel is from it.</p>
	<div id="status" class="card">starting…</div>
	<div id="out"></div>

	<script type="module">
	// Shorthand for document.querySelector, plus the two elements this script writes to:
	// the status card (#status) and the results container (#out).
	const $ = s => document.querySelector(s);
	const st = $("#status");
	const out = $("#out");

	// MODEL_GB: data read per generated token for BitNet-2B, in GB (the 0.69 GB quoted in the page intro).
	// CUR_MS: per-token latency of the current kernel in milliseconds; the results table
	// converts it to tok/s (1000 / CUR_MS) and to an effective GB/s figure.
	const MODEL_GB = 0.69;
	const CUR_MS = 18.4;

	/**
	 * Replace the status card's text and colour class.
	 * @param {string} t    Message to display (assigned via textContent, so never parsed as HTML).
	 * @param {string} [cls] Optional extra class ("ok", "no" or "warn") appended after "card".
	 */
	const say = (t, cls) => {
	  st.textContent = t;
	  st.className = "card" + (cls ? " " + cls : "");
	};

	// Last-resort reporting: surface any promise rejection nobody handled in the status card.
	// Shows the rejection reason's message when it has one, otherwise the reason itself.
	window.addEventListener("unhandledrejection", e => say("✗ unhandled: " + (e.reason && (e.reason.message || e.reason)), "no"));

	// FILL: compute shader that overwrites the whole benchmark buffer with a non-zero,
	// non-compressible pattern so the read pass hits real VRAM
	// (an uninitialised STORAGE buffer is zero/DCC-compressed → the memory controller
	// never fetches it → measured bandwidth is fictional). Must run before timing.
	//
	// Bindings: 0 = the data buffer (read_write array of vec4<u32>),
	//           1 = uniform P with x = number of vec4 elements, y = total thread count.
	// Each invocation starts at its global id and advances by P.y until it passes P.x,
	// so the buffer is covered whatever the dispatch size. Every lane of the vec4 is a
	// different cheap integer hash of the element index (u32 multiplication wraps).
	const FILL = `
	@group(0) @binding(0) var<storage, read_write> d : array<vec4<u32>>;
	@group(0) @binding(1) var<uniform> P : vec4<u32>; // x=#vec4 elems, y=total threads
	@compute @workgroup_size(256)
	fn main(@builtin(global_invocation_id) gid:vec3<u32>){
	let n=P.x; let stride=P.y; var i=gid.x;
	loop { if(i>=n){break;}
	d[i]=vec4<u32>(i*2654435761u+1u, i*40503u+7u, i^0x9e3779b9u, i*2246822519u+3u);
	i=i+stride; }
	}`;
	// WGSL: the timed read kernel.
	// Bindings: 0 = data buffer (read-only array of vec4<u32>),
	//           1 = sink (one u32 per invocation),
	//           2 = uniform P with x = number of vec4 elements, y = total thread count, z = repeat count.
	// Each invocation XOR-accumulates data[gid.x], data[gid.x + stride], ... up to n, repeats that
	// sweep P.z times, then stores the XOR of the four accumulator lanes in sink[gid.x].
	// NOTE(review): the sink write is presumably there so the loads have an observable result and
	// cannot be dropped by the shader compiler — confirm with the author.
	const WGSL = `
	@group(0) @binding(0) var<storage, read> data : array<vec4<u32>>;
	@group(0) @binding(1) var<storage, read_write> sink : array<u32>;
	@group(0) @binding(2) var<uniform> P : vec4<u32>; // x=#vec4 elems, y=total threads, z=repeat
	@compute @workgroup_size(256)
	fn main(@builtin(global_invocation_id) gid:vec3<u32>){
	let n=P.x; let stride=P.y; let R=P.z; var acc=vec4<u32>(0u);
	for(var rep=0u; rep<R; rep=rep+1u){ var i=gid.x; loop { if(i>=n){break;} acc=acc ^ data[i]; i=i+stride; } }
	sink[gid.x]=acc.x ^ acc.y ^ acc.z ^ acc.w;
	}`;

	/**
	 * Benchmark driver (runs once on page load).
	 *
	 * 1. Acquire a WebGPU adapter/device with the adapter's maximum buffer limits.
	 * 2. Allocate a large storage buffer (512 MB – 1 GB) and fill it with a non-zero pattern.
	 * 3. Time PASSES read sweeps recorded into a single command buffer; keep the best of 5 runs.
	 * 4. Derive bandwidth and the decode roofline figures and render them into #out.
	 *
	 * Progress and every failure are reported through say(); nothing is thrown to the caller.
	 * All GPU buffers created here are destroyed when the function exits.
	 */
	(async () => {
	  // Buffers allocated below are registered here so `finally` can release their
	  // memory on every exit path (success, early return, exception).
	  const owned = [];
	  const track = b => { owned.push(b); return b; };
	  try {
	    if (!navigator.gpu) { say("✗ No WebGPU. Open in Chrome/Edge or a recent browser.", "no"); return; }
	    say("requesting adapter…");
	    // Prefer the discrete GPU; fall back to whatever adapter the browser offers.
	    let ad = await navigator.gpu.requestAdapter({ powerPreference: "high-performance" });
	    if (!ad) ad = await navigator.gpu.requestAdapter();
	    if (!ad) { say("✗ no GPU adapter available.", "no"); return; }
	    const L = ad.limits;

	    // Buffer must exceed the GPU's last-level cache (RDNA-3 Infinity Cache is up to 96 MB)
	    // so re-reads actually miss to VRAM. Aim as large as the device allows (capped at 1 GB),
	    // floor 512 MB. If the adapter cannot bind a buffer that large, stop with a clear
	    // message instead of letting buffer/bind-group creation fail validation later.
	    const MIN_BYTES = 512 * 1024 * 1024;
	    const MAX_BYTES = 1024 * 1024 * 1024;
	    const limitBytes = Math.min(L.maxStorageBufferBindingSize, L.maxBufferSize);
	    if (limitBytes < MIN_BYTES) {
	      say("✗ this GPU only allows " + (limitBytes / 1048576).toFixed(0) + " MB storage buffers; the benchmark needs at least 512 MB to read past the cache.", "no");
	      return;
	    }
	    // Round down to a whole number of vec4<u32> elements (16 bytes each).
	    const bytes = Math.floor(Math.min(limitBytes, MAX_BYTES) / 16) * 16;
	    const nVec = bytes / 16;

	    say("requesting device…");
	    const dev = await ad.requestDevice({ requiredLimits: {
	      maxStorageBufferBindingSize: L.maxStorageBufferBindingSize,
	      maxBufferSize: L.maxBufferSize,
	      maxComputeWorkgroupsPerDimension: L.maxComputeWorkgroupsPerDimension,
	    } });
	    dev.lost.then(i => say("✗ device lost: " + ((i && (i.message || i.reason)) || ""), "no"));
	    // Collect validation errors from all resource/pipeline creation; checked once below.
	    dev.pushErrorScope("validation");
	    const info = ad.info || {};

	    say("allocating " + (bytes / 1048576).toFixed(0) + " MB…");
	    const buf = track(dev.createBuffer({ size: bytes, usage: GPUBufferUsage.STORAGE }));
	    // One dispatch of `wg` workgroups × 256 threads; each thread strides over the buffer by TOTAL.
	    const wg = Math.min(L.maxComputeWorkgroupsPerDimension, 65535);
	    const TOTAL = wg * 256;
	    const R = 1;
	    const sink = track(dev.createBuffer({ size: TOTAL * 4, usage: GPUBufferUsage.STORAGE }));
	    const P = track(dev.createBuffer({ size: 16, usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST }));
	    dev.queue.writeBuffer(P, 0, new Uint32Array([nVec, TOTAL, R, 0]));

	    // fill pipeline (non-zero pattern) — its own bind group, same buffer
	    const fmod = dev.createShaderModule({ code: FILL });
	    const fci = await fmod.getCompilationInfo();
	    const fer = fci.messages.filter(m => m.type === "error");
	    if (fer.length) { say("✗ FILL WGSL: " + fer[0].message, "no"); return; }
	    const fpipe = dev.createComputePipeline({ layout: "auto", compute: { module: fmod, entryPoint: "main" } });
	    const fbg = dev.createBindGroup({ layout: fpipe.getBindGroupLayout(0), entries: [
	      { binding: 0, resource: { buffer: buf } },
	      { binding: 1, resource: { buffer: P } },
	    ] });

	    // read pipeline (the kernel that gets timed)
	    const mod = dev.createShaderModule({ code: WGSL });
	    const ci = await mod.getCompilationInfo();
	    const er = ci.messages.filter(m => m.type === "error");
	    if (er.length) { say("✗ WGSL: " + er[0].message, "no"); return; }
	    const pipe = dev.createComputePipeline({ layout: "auto", compute: { module: mod, entryPoint: "main" } });
	    const bg = dev.createBindGroup({ layout: pipe.getBindGroupLayout(0), entries: [
	      { binding: 0, resource: { buffer: buf } },
	      { binding: 1, resource: { buffer: sink } },
	      { binding: 2, resource: { buffer: P } },
	    ] });

	    const scopeErr = await dev.popErrorScope();
	    if (scopeErr) { say("✗ GPU validation: " + scopeErr.message, "no"); return; }

	    // write the non-zero pattern into VRAM before we time reads
	    say("filling " + (bytes / 1048576).toFixed(0) + " MB with non-zero pattern…");
	    {
	      const e = dev.createCommandEncoder();
	      const p = e.beginComputePass();
	      p.setPipeline(fpipe);
	      p.setBindGroup(0, fbg);
	      p.dispatchWorkgroups(wg);
	      p.end();
	      dev.queue.submit([e.finish()]);
	      await dev.queue.onSubmittedWorkDone();
	    }

	    // Batch many read passes into ONE command buffer, submit once, sync with
	    // onSubmittedWorkDone. Moving tens of GB makes GPU time >> fixed submit/map
	    // latency, so wall-clock reflects real VRAM bandwidth (not the ~0.1 ms overhead floor).
	    const PASSES = 64; // 64 × buffer ≈ 32–64 GB of reads
	    // Records `passes` read sweeps, submits them, and resolves to the elapsed
	    // wall-clock milliseconds from submit until the queue reports completion.
	    const timeReads = async passes => {
	      const e = dev.createCommandEncoder();
	      for (let k = 0; k < passes; k++) {
	        const p = e.beginComputePass();
	        p.setPipeline(pipe);
	        p.setBindGroup(0, bg);
	        p.dispatchWorkgroups(wg);
	        p.end();
	      }
	      const t0 = performance.now();
	      dev.queue.submit([e.finish()]);
	      await dev.queue.onSubmittedWorkDone();
	      return performance.now() - t0;
	    };

	    say("timing " + (bytes / 1048576 * PASSES / 1024).toFixed(1) + " GB of reads…");
	    await timeReads(4); // warm up (compile, clocks up)
	    // Best (minimum) of five runs.
	    let best = Infinity;
	    for (let k = 0; k < 5; k++) { best = Math.min(best, await timeReads(PASSES)); }

	    // Bandwidth in units of 2^30 bytes per second; the figures below are derived from it.
	    const gbps = (bytes * PASSES / 1073741824) / (best / 1000);
	    const roofTok = gbps / MODEL_GB;            // tokens/s if every byte moved at full bandwidth
	    const curTok = 1000 / CUR_MS;               // tokens/s of the current kernel
	    const curGBps = MODEL_GB / (CUR_MS / 1000); // effective bandwidth of the current kernel
	    const pct = 100 * curGBps / gbps;
	    const gap = gbps / curGBps;
	    const reachable = roofTok >= 1000;
	    const specNeeded = Math.max(1, 1000 / roofTok);

	    say("done · adapter: " + ((info.vendor || "?") + " " + (info.architecture || "") + " " + (info.device || "")).trim(), "ok");
	    // Only numbers computed above are interpolated into this markup.
	    out.innerHTML=`
	<div class="card"><span class="k">Achievable VRAM read bandwidth</span><br><span class="big">${gbps.toFixed(0)} GB/s</span></div>
	<div class="card"><table>
	<tr><td>Weights read / token (BitNet-2B t2)</td><td class="n">${MODEL_GB} GB</td></tr>
	<tr><td><b>Bandwidth roofline (batch-1 ceiling)</b></td><td class="n"><b>${roofTok.toFixed(0)} tok/s</b></td></tr>
	<tr><td>Current kernel</td><td class="n">${curTok.toFixed(0)} tok/s · ${curGBps.toFixed(0)} GB/s</td></tr>
	<tr><td>Current kernel reaches</td><td class="n ${pct<20?'no':pct<60?'warn':'ok'}">${pct.toFixed(1)}% of roofline</td></tr>
	<tr><td>Kernel headroom to roofline</td><td class="n">${gap.toFixed(1)}×</td></tr>
	</table></div>
	<div class="verdict ${reachable?'ok':'warn'}">${reachable
	? '✓ >1000 tok/s is within your GPU roofline ('+roofTok.toFixed(0)+') — a bandwidth-optimal ternary kernel ('+gap.toFixed(1)+'× headroom) gets there; spec-decode is margin.'
	: '⚠ Your GPU batch-1 roofline is '+roofTok.toFixed(0)+' tok/s. A perfect kernel reaches ~'+roofTok.toFixed(0)+'; >1000 needs speculative decode accepting ≥'+specNeeded.toFixed(1)+' tokens/pass, or a card with more bandwidth.'}
	</div>`;
	  } catch (e) {
	    say("✗ " + (e && (e.message || e)), "no");
	  } finally {
	    // Release the benchmark allocations (up to ~1 GB) instead of holding them
	    // until the tab is closed.
	    for (const b of owned) b.destroy();
	  }
	})();
	</script></body></html>