Spaces:

HOLOGRAMTECH
/

q

Running

App Files Files Community

q / blake3-gpu-test.html

Humuhumu33

Upload blake3-gpu-test.html with huggingface_hub

a349783 verified about 13 hours ago

Raw

History Blame Contribute Delete

15.3 kB

	<!doctype html><html><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1">
	<title>BLAKE3 on your GPU — parity test</title>
	<style>
	:root{--bg:#0a0d13;--panel:#111825;--ink:#e8ebf1;--dim:#8b95a7;--ac:#7c5cff;--ok:#48c26c;--no:#f0616d;--line:#1e2836}
	*{box-sizing:border-box}body{margin:0;background:var(--bg);color:var(--ink);font:15px/1.55 -apple-system,Segoe UI,Roboto,monospace;padding:24px;max-width:820px;margin:0 auto}
	h1{font-size:20px;margin:0 0 4px}.sub{color:var(--dim);font-size:13px;margin:0 0 18px}
	#status{font-family:ui-monospace,monospace;padding:10px 14px;border:1px solid var(--line);border-radius:10px;background:var(--panel);margin-bottom:16px}
	table{border-collapse:collapse;width:100%;font-family:ui-monospace,monospace;font-size:12.5px}
	th,td{text-align:left;padding:7px 10px;border-bottom:1px solid var(--line)}
	th{color:var(--dim);text-transform:uppercase;font-size:10.5px;letter-spacing:.08em}
	td.n{text-align:right;font-variant-numeric:tabular-nums}
	.ok{color:var(--ok)}.no{color:var(--no)}
	.big{font-size:17px;font-weight:600;margin:16px 0 4px}
	.hx{color:var(--dim)}
	</style></head><body>
	<h1>BLAKE3 — <span style="color:var(--ac)">on your GPU</span> vs the CPU oracle</h1>
	<p class="sub">WGSL kernel hashes each input entirely on the GPU; compared byte-for-byte to <code>holo-blake3.mjs</code> and the official empty-input vector. Boundary sizes exercise block (64), chunk (1024), and tree merges.</p>
	<div id="status">initializing WebGPU…</div>
	<div id="result"></div>
	<table id="tbl"><thead><tr><th>input</th><th class="n">bytes</th><th>GPU digest (first 32 hex)</th><th>match</th></tr></thead><tbody></tbody></table>

	<script type="module">
	// ─────────────────── CPU ORACLE (holo-blake3.mjs, inlined verbatim) ───────────────────
	// Eight 32-bit words: the starting chaining value for chunks and parents; the first
	// four also fill state words 8..11 inside compress().
	const IV = [
	  0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	  0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
	];
	// Message-word permutation applied between rounds: next[i] = current[MSG[i]].
	const MSG = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];
	// Flag bits OR-ed together and passed to compress() as `flags`.
	const CHUNK_START = 1;
	const CHUNK_END = 2;
	const PARENT = 4;
	const ROOT = 8;
	// Sizes in bytes: one compression block, and one chunk (16 blocks).
	const BLOCK = 64;
	const CHUNK = 1024;
	/**
	 * Rotate the 32-bit word `x` right by `n` bits.
	 * The trailing `>>> 0` makes the result an unsigned 32-bit number.
	 * (Fix: the OR operator had been mangled to `\|`, which does not parse.)
	 */
	const rotr = (x, n) => ((x >>> n) | (x << (32 - n))) >>> 0;
	/**
	 * Mixing step on four words of the state array `v`, updated in place.
	 * @param {number[]} v  state array
	 * @param {number} a,b,c,d  indices into `v` of the four words to mix
	 * @param {number} mx  message word added in the first half
	 * @param {number} my  message word added in the second half
	 */
	function g(v, a, b, c, d, mx, my) {
	  // Both halves share the same add / xor-rotate shape; only the message word
	  // and the two rotation amounts differ.
	  const halves = [
	    [mx, 16, 12],
	    [my, 8, 7],
	  ];
	  for (const [word, rotD, rotB] of halves) {
	    v[a] = (v[a] + v[b] + word) >>> 0;
	    v[d] = rotr(v[d] ^ v[a], rotD);
	    v[c] = (v[c] + v[d]) >>> 0;
	    v[b] = rotr(v[b] ^ v[c], rotB);
	  }
	}
	/**
	 * One full round: eight g() applications over the 16-word state `v`, first on the
	 * four columns and then on the four diagonals, consuming message words m[0..15]
	 * two at a time in order.
	 */
	function roundFn(v, m) {
	  const lanes = [
	    // columns
	    [0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15],
	    // diagonals
	    [0, 5, 10, 15], [1, 6, 11, 12], [2, 7, 8, 13], [3, 4, 9, 14],
	  ];
	  lanes.forEach(([a, b, c, d], k) => {
	    g(v, a, b, c, d, m[2 * k], m[2 * k + 1]);
	  });
	}
	/**
	 * Compression function.
	 * @param {number[]} cv        8-word input chaining value
	 * @param {number[]} m0        16 message words (not modified)
	 * @param {number}   counter   counter; split into low and high 32-bit halves
	 * @param {number}   blockLen  number of meaningful bytes in the block
	 * @param {number}   flags     OR of the flag bits
	 * @returns {number[]} 16 unsigned 32-bit words; callers take the first 8 as the new CV
	 */
	function compress(cv, m0, counter, blockLen, flags) {
	  const counterLo = counter >>> 0;
	  // Division rather than a shift: JS shifts only see the low 32 bits.
	  const counterHi = Math.floor(counter / 4294967296) >>> 0;

	  // State: cv[0..7], IV[0..3], counter lo/hi, block length, flags.
	  const v = [];
	  for (let i = 0; i < 8; i++) v.push(cv[i]);
	  v.push(IV[0], IV[1], IV[2], IV[3], counterLo, counterHi, blockLen >>> 0, flags >>> 0);

	  let m = m0.slice();
	  for (let round = 0; round < 7; round++) {
	    roundFn(v, m);
	    if (round === 6) break; // no permutation after the last round
	    const current = m;
	    m = MSG.map((src) => current[src]);
	  }

	  const out = new Array(16);
	  for (let i = 0; i < 8; i++) {
	    out[i] = (v[i] ^ v[i + 8]) >>> 0;
	    out[i + 8] = (v[i + 8] ^ cv[i]) >>> 0;
	  }
	  return out;
	}
	/**
	 * Pack `len` bytes of `bytes`, starting at `off`, into 16 little-endian 32-bit words.
	 * Words past the data stay 0, so a short block comes out zero-padded.
	 * (Fix: the `|=` operator had been mangled to `\|=`, which does not parse.)
	 * @param {ArrayLike<number>} bytes  source bytes
	 * @param {number} off  index of the first byte to read
	 * @param {number} len  number of bytes to pack; callers pass at most 64
	 * @returns {number[]} 16 unsigned 32-bit words
	 */
	function words(bytes, off, len) {
	  const m = new Array(16).fill(0);
	  for (let i = 0; i < len; i++) {
	    // Byte i goes to word i>>2, at bit offset (i&3)*8 — little-endian.
	    m[i >> 2] |= bytes[off + i] << ((i & 3) * 8);
	  }
	  // `<<` and `|` yield signed int32; convert each word back to unsigned.
	  for (let i = 0; i < 16; i++) m[i] >>>= 0;
	  return m;
	}
	/**
	 * Chaining value of a pending node: runs compress() on the node's fields and
	 * keeps the first 8 of the 16 output words.
	 * @param {{cv:number[], m:number[], counter:number, blockLen:number, flags:number}} o
	 * @returns {number[]} 8-word chaining value
	 */
	function nodeCV(o) {
	  const { cv, m, counter, blockLen, flags } = o;
	  const full = compress(cv, m, counter, blockLen, flags);
	  return full.slice(0, 8);
	}
	/**
	 * Finalize a pending node as the tree root and return the 32-byte digest.
	 * Compresses with the ROOT flag added and counter 0, then serializes the first
	 * 8 output words as little-endian bytes.
	 * (Fix: `i*4` had lost its `*`, leaving an undefined `i4`; `|` had been mangled to `\|`.)
	 * @param {{cv:number[], m:number[], blockLen:number, flags:number}} o  pending node
	 * @returns {Uint8Array} 32-byte digest
	 */
	function nodeRoot(o) {
	  const out = compress(o.cv, o.m, 0, o.blockLen, o.flags | ROOT);
	  const b = new Uint8Array(32);
	  for (let i = 0; i < 8; i++) {
	    const w = out[i];
	    b[i * 4] = w & 255;
	    b[i * 4 + 1] = (w >>> 8) & 255;
	    b[i * 4 + 2] = (w >>> 16) & 255;
	    b[i * 4 + 3] = (w >>> 24) & 255;
	  }
	  return b;
	}
	/**
	 * Process one chunk (`len` bytes of `bytes` from `start`) up to, but not including,
	 * its last block, and return that last block as a pending node. The caller decides
	 * whether to finish it as a chaining value or as the root.
	 * (Fix: `i*BLOCK` and `(nB-1)*BLOCK` had lost their `*`; `|` had been mangled to `\|`.)
	 * @param {ArrayLike<number>} bytes  whole input
	 * @param {number} start    byte offset of the chunk
	 * @param {number} len      chunk length in bytes (0 is allowed: one empty block)
	 * @param {number} counter  value passed to compress() as the counter for every block
	 * @returns {{cv:number[], m:number[], counter:number, blockLen:number, flags:number}}
	 */
	function chunkNode(bytes, start, len, counter) {
	  let cv = IV.slice();
	  // An empty chunk still has one (empty) block.
	  const nB = Math.max(1, Math.ceil(len / BLOCK));
	  // Only the first block of the chunk carries CHUNK_START.
	  let flags = CHUNK_START;
	  for (let i = 0; i < nB - 1; i++) {
	    cv = compress(cv, words(bytes, start + i * BLOCK, BLOCK), counter, BLOCK, flags).slice(0, 8);
	    flags = 0;
	  }
	  const lo = start + (nB - 1) * BLOCK; // offset of the last block
	  const ll = len - (nB - 1) * BLOCK; // bytes in the last block
	  return { cv, m: words(bytes, lo, ll), counter, blockLen: ll, flags: flags | CHUNK_END };
	}
	/**
	 * Build the pending parent node for two child chaining values: the message block is
	 * the left CV followed by the right CV, compressed from a fresh copy of IV with
	 * counter 0, a full block length and the PARENT flag.
	 * @param {number[]} l  left child chaining value
	 * @param {number[]} r  right child chaining value
	 */
	function parentNode(l, r) {
	  const cv = [...IV];
	  const m = [...l, ...r];
	  return { cv, m, counter: 0, blockLen: BLOCK, flags: PARENT };
	}
	/**
	 * Recursively build the pending node for `len` bytes of `bytes` starting at `start`.
	 * Up to one chunk is a chunk node; anything longer is split so the left side is the
	 * largest power-of-two number of whole chunks that is strictly shorter than `len`,
	 * and the two child chaining values are joined with a parent node.
	 * (Fix: `left*2<len` / `left*=2` had lost their `*`, leaving an undefined `left2`
	 * and a plain assignment `left=2`.)
	 * @param {ArrayLike<number>} bytes  whole input
	 * @param {number} start    byte offset of this subtree
	 * @param {number} len      byte length of this subtree
	 * @param {number} counter  index of the subtree's first chunk
	 */
	function subtree(bytes, start, len, counter) {
	  if (len <= CHUNK) return chunkNode(bytes, start, len, counter);
	  let left = CHUNK;
	  while (left * 2 < len) left *= 2;
	  const lcv = nodeCV(subtree(bytes, start, left, counter));
	  // The right side's first chunk index is advanced by the chunks consumed on the left.
	  const rcv = nodeCV(subtree(bytes, start + left, len - left, counter + left / CHUNK));
	  return parentNode(lcv, rcv);
	}
	/**
	 * CPU reference hash: digest of `bytes` as a 64-character lowercase hex string.
	 * @param {ArrayLike<number>} bytes  input
	 * @returns {string} hex encoding of the 32-byte digest
	 */
	function blake3hex(bytes) {
	  const digest = nodeRoot(subtree(bytes, 0, bytes.length, 0));
	  return Array.from({ length: 32 }, (_, i) => digest[i].toString(16).padStart(2, "0")).join("");
	}

	// ─────────────────── WGSL BLAKE3 — runs ENTIRELY on the GPU ───────────────────
	// Single-invocation port of the incremental/stack tree (no recursion). Input is zero-padded to a
	// multiple of 64 bytes as an array<u32> (LE) so each 64-byte block = 16 consecutive u32. blockLen in
	// the compression state carries the true length. Stack of subtree CVs collapses on even chunk counts,
	// exactly like createBlake3(); the final chunk merges down the stack with ROOT — byte-identical to blake3().
	const WGSL = `
	// Bindings: inp = input bytes packed as little-endian u32 words (zero-padded by the host
	// to whole 64-byte blocks), outp = 8 result words, P.x = true input length in bytes.
	@group(0) @binding(0) var<storage, read> inp : array<u32>;
	@group(0) @binding(1) var<storage, read_write> outp : array<u32>;
	@group(0) @binding(2) var<uniform> P : vec4<u32>; // P.x = byte length

	const IV = array<u32,8>(0x6a09e667u,0xbb67ae85u,0x3c6ef372u,0xa54ff53au,0x510e527fu,0x9b05688cu,0x1f83d9abu,0x5be0cd19u);

	// Rotate x right by n bits.
	fn rotr(x:u32,n:u32)->u32 { return (x >> n) | (x << (32u - n)); }

	// compress → 16 words. cv[8], m[16], counter lo/hi, blockLen, flags.
	fn compress(cv:array<u32,8>, m0:array<u32,16>, cl:u32, ch:u32, blockLen:u32, flags:u32) -> array<u32,16> {
	  var v:array<u32,16>;
	  v[0]=cv[0]; v[1]=cv[1]; v[2]=cv[2]; v[3]=cv[3]; v[4]=cv[4]; v[5]=cv[5]; v[6]=cv[6]; v[7]=cv[7];
	  v[8]=IV[0]; v[9]=IV[1]; v[10]=IV[2]; v[11]=IV[3]; v[12]=cl; v[13]=ch; v[14]=blockLen; v[15]=flags;
	  var m:array<u32,16> = m0;
	  for (var r:u32=0u; r<7u; r=r+1u) {
	    // roundFn — 8 g() applications, indices are literals
	    // g(a,b,c,d,mx,my): a=a+b+mx; d=rotr(d^a,16); c=c+d; b=rotr(b^c,12); a=a+b+my; d=rotr(d^a,8); c=c+d; b=rotr(b^c,7)
	    // columns
	    v[0]=v[0]+v[4]+m[0]; v[12]=rotr(v[12]^v[0],16u); v[8]=v[8]+v[12]; v[4]=rotr(v[4]^v[8],12u); v[0]=v[0]+v[4]+m[1]; v[12]=rotr(v[12]^v[0],8u); v[8]=v[8]+v[12]; v[4]=rotr(v[4]^v[8],7u);
	    v[1]=v[1]+v[5]+m[2]; v[13]=rotr(v[13]^v[1],16u); v[9]=v[9]+v[13]; v[5]=rotr(v[5]^v[9],12u); v[1]=v[1]+v[5]+m[3]; v[13]=rotr(v[13]^v[1],8u); v[9]=v[9]+v[13]; v[5]=rotr(v[5]^v[9],7u);
	    v[2]=v[2]+v[6]+m[4]; v[14]=rotr(v[14]^v[2],16u); v[10]=v[10]+v[14]; v[6]=rotr(v[6]^v[10],12u); v[2]=v[2]+v[6]+m[5]; v[14]=rotr(v[14]^v[2],8u); v[10]=v[10]+v[14]; v[6]=rotr(v[6]^v[10],7u);
	    v[3]=v[3]+v[7]+m[6]; v[15]=rotr(v[15]^v[3],16u); v[11]=v[11]+v[15]; v[7]=rotr(v[7]^v[11],12u); v[3]=v[3]+v[7]+m[7]; v[15]=rotr(v[15]^v[3],8u); v[11]=v[11]+v[15]; v[7]=rotr(v[7]^v[11],7u);
	    // diagonals
	    v[0]=v[0]+v[5]+m[8]; v[15]=rotr(v[15]^v[0],16u); v[10]=v[10]+v[15]; v[5]=rotr(v[5]^v[10],12u); v[0]=v[0]+v[5]+m[9]; v[15]=rotr(v[15]^v[0],8u); v[10]=v[10]+v[15]; v[5]=rotr(v[5]^v[10],7u);
	    v[1]=v[1]+v[6]+m[10]; v[12]=rotr(v[12]^v[1],16u); v[11]=v[11]+v[12]; v[6]=rotr(v[6]^v[11],12u); v[1]=v[1]+v[6]+m[11]; v[12]=rotr(v[12]^v[1],8u); v[11]=v[11]+v[12]; v[6]=rotr(v[6]^v[11],7u);
	    v[2]=v[2]+v[7]+m[12]; v[13]=rotr(v[13]^v[2],16u); v[8]=v[8]+v[13]; v[7]=rotr(v[7]^v[8],12u); v[2]=v[2]+v[7]+m[13]; v[13]=rotr(v[13]^v[2],8u); v[8]=v[8]+v[13]; v[7]=rotr(v[7]^v[8],7u);
	    v[3]=v[3]+v[4]+m[14]; v[14]=rotr(v[14]^v[3],16u); v[9]=v[9]+v[14]; v[4]=rotr(v[4]^v[9],12u); v[3]=v[3]+v[4]+m[15]; v[14]=rotr(v[14]^v[3],8u); v[9]=v[9]+v[14]; v[4]=rotr(v[4]^v[9],7u);
	    // permute the message words between rounds (not after the last one)
	    if (r < 6u) { var perm=array<u32,16>(2u,6u,3u,10u,7u,0u,4u,13u,1u,11u,12u,5u,9u,14u,15u,8u); var p:array<u32,16>; for (var i:u32=0u;i<16u;i=i+1u){ p[i]=m[perm[i]]; } m=p; }
	  }
	  var o:array<u32,16>;
	  for (var i:u32=0u;i<8u;i=i+1u){ o[i]=v[i]^v[i+8u]; o[i+8u]=v[i+8u]^cv[i]; }
	  return o;
	}
	// Read the 16 words of the 64-byte block that starts at byte offset byteOff.
	fn loadBlock(byteOff:u32) -> array<u32,16> {
	  let base = byteOff >> 2u; var m:array<u32,16>;
	  for (var i:u32=0u;i<16u;i=i+1u){ m[i] = inp[base+i]; } // input is zero-padded → direct read
	  return m;
	}
	// chunk [start,start+len) at index ctr → chaining value (8 words); non-root path
	fn chunkCV(start:u32, len:u32, ctr:u32) -> array<u32,8> {
	  var cv:array<u32,8> = IV;
	  let nB = max(1u, (len + 63u) / 64u);
	  var flags:u32 = 1u; // CHUNK_START
	  for (var i:u32=0u; i<nB-1u; i=i+1u) {
	    let o = compress(cv, loadBlock(start + i*64u), ctr, 0u, 64u, flags);
	    for (var j:u32=0u;j<8u;j=j+1u){ cv[j]=o[j]; }
	    flags = 0u;
	  }
	  let lastOff = start + (nB-1u)*64u;
	  let lastLen = len - (nB-1u)*64u;
	  let o2 = compress(cv, loadBlock(lastOff), ctr, 0u, lastLen, flags | 2u); // CHUNK_END
	  var out:array<u32,8>; for (var j:u32=0u;j<8u;j=j+1u){ out[j]=o2[j]; } return out;
	}
	// parent(left8,right8) → cv8, or the 8 root words when isRoot is true
	fn parentCV(l:array<u32,8>, r:array<u32,8>, isRoot:bool) -> array<u32,8> {
	  var m:array<u32,16>; for (var i:u32=0u;i<8u;i=i+1u){ m[i]=l[i]; m[i+8u]=r[i]; }
	  var flags:u32 = 4u; if (isRoot) { flags = 4u | 8u; } // PARENT, optionally ROOT
	  let o = compress(IV, m, 0u, 0u, 64u, flags);
	  var out:array<u32,8>; for (var i:u32=0u;i<8u;i=i+1u){ out[i]=o[i]; } return out;
	}
	// final chunk root (when there are no parents): compress final block with ROOT, counter 0
	fn chunkRoot(start:u32, len:u32, ctr:u32) -> array<u32,8> {
	  var cv:array<u32,8> = IV;
	  let nB = max(1u, (len + 63u) / 64u);
	  var flags:u32 = 1u; // CHUNK_START
	  for (var i:u32=0u; i<nB-1u; i=i+1u){ let o=compress(cv,loadBlock(start+i*64u),ctr,0u,64u,flags); for(var j:u32=0u;j<8u;j=j+1u){cv[j]=o[j];} flags=0u; }
	  // Fix: both products had lost their multiplication sign, which is a WGSL compile error.
	  let lastOff=start+(nB-1u)*64u; let lastLen=len-(nB-1u)*64u;
	  let o=compress(cv, loadBlock(lastOff), 0u,0u, lastLen, flags | 2u | 8u); // CHUNK_END|ROOT, counter 0
	  var out:array<u32,8>; for(var j:u32=0u;j<8u;j=j+1u){out[j]=o[j];} return out;
	}

	// Single invocation hashes the whole input: every chunk but the last is pushed onto a
	// stack of subtree CVs (merging completed pairs), then the last chunk is folded down
	// the stack and the final merge carries ROOT.
	@compute @workgroup_size(1)
	fn main() {
	  let len = P.x;
	  var stack: array<array<u32,8>, 54>;
	  var depth:u32 = 0u;
	  let totalChunks = select((len + 1023u) / 1024u, 1u, len == 0u); // empty input = one empty chunk
	  let flushed = totalChunks - 1u; // chunks 0..flushed-1 go to the stack; last is final
	  // build the stack from the flushed chunks
	  for (var c:u32=0u; c<flushed; c=c+1u) {
	    var cv = chunkCV(c*1024u, 1024u, c);
	    var t = c + 1u; // chunks seen so far, including this one
	    // one merge per trailing zero bit of t: each marks a completed pair of equal-size subtrees
	    loop { if ((t & 1u) != 0u) { break; } depth=depth-1u; cv = parentCV(stack[depth], cv, false); t = t >> 1u; }
	    stack[depth] = cv; depth = depth + 1u;
	  }
	  let finalOff = flushed * 1024u;
	  let finalLen = len - finalOff; // 0..1024
	  var root: array<u32,8>;
	  if (depth == 0u) {
	    root = chunkRoot(finalOff, finalLen, flushed); // single chunk → its own root
	  } else {
	    // final chunk's CV, then merge down the stack; the LAST merge is ROOT
	    var node = chunkCV(finalOff, finalLen, flushed);
	    for (var i:u32=0u; i<depth; i=i+1u) {
	      let li = depth - 1u - i;
	      root = parentCV(stack[li], node, i == depth-1u); // top merge = ROOT
	      node = root;
	    }
	  }
	  for (var i:u32=0u;i<8u;i=i+1u){ outp[i] = root[i]; }
	}
	`;

	// ─────────────────── WebGPU harness ───────────────────
	// `$` is shorthand for document.querySelector; the three handles below are the page's
	// output slots: the status line, the results table body and the summary box.
	const $ = (selector) => document.querySelector(selector);
	const st = $("#status");
	const tb = $("#tbl tbody");
	const res = $("#result");
	/**
	 * Hex-encode the first 8 entries of `words` as 32 bytes, each word emitted
	 * least-significant byte first.
	 * @param {ArrayLike<number>} words  at least 8 32-bit words
	 * @returns {string} 64 lowercase hex characters
	 */
	function hexLE(words) {
	  const byteHex = (value) => (value & 255).toString(16).padStart(2, "0");
	  const pieces = [];
	  for (let idx = 0; idx < 8; idx++) {
	    const word = words[idx] >>> 0;
	    for (let shift = 0; shift < 32; shift += 8) pieces.push(byteHex(word >>> shift));
	  }
	  return pieces.join("");
	}

	// Harness entry point: set up WebGPU, compile the shader, then hash every boundary size
	// on both the CPU oracle and the GPU, rendering one table row per case and a summary.
	//
	// Fixes over the previous version:
	//  - `|` / `||` had been mangled to `\|` / `\|\|`, which does not parse;
	//  - a null adapter (requestAdapter resolves to null, it does not reject) is reported explicitly;
	//  - every fatal path now applies the "no" styling, not just two of the four;
	//  - GPU buffers are destroyed even when a run fails;
	//  - a failing GPU run is reported instead of leaving the status stuck on "running…".
	(async () => {
	  // Show a fatal error in the status line with failure styling.
	  const fail = (message) => {
	    st.textContent = message;
	    st.classList.add("no");
	  };

	  if (!navigator.gpu) {
	    fail("✗ No WebGPU in this browser. Open in Chrome/Edge (or enable WebGPU).");
	    return;
	  }

	  let adapter;
	  let device;
	  try {
	    adapter = await navigator.gpu.requestAdapter();
	    if (!adapter) {
	      fail("✗ WebGPU device error: no suitable GPU adapter available");
	      return;
	    }
	    device = await adapter.requestDevice();
	  } catch (e) {
	    fail("✗ WebGPU device error: " + e.message);
	    return;
	  }

	  let module;
	  try {
	    module = device.createShaderModule({ code: WGSL });
	    const info = await module.getCompilationInfo();
	    const errs = info.messages.filter((m) => m.type === "error");
	    if (errs.length) {
	      fail("✗ WGSL compile error: " + errs[0].message + " (line " + errs[0].lineNum + ")");
	      return;
	    }
	  } catch (e) {
	    fail("✗ shader error: " + e.message);
	    return;
	  }
	  const pipeline = device.createComputePipeline({ layout: "auto", compute: { module, entryPoint: "main" } });

	  // Hash `bytes` with the compute shader and return the digest as 64 hex characters.
	  async function gpuHash(bytes) {
	    const len = bytes.length;
	    // Input buffer size in u32 words: whole 64-byte blocks, at least one.
	    const padded = Math.max(16, (Math.ceil(len / 64) || 1) * 16);
	    const u = new Uint32Array(padded);
	    // Copy through a byte view so the tail stays zero.
	    // NOTE(review): the shader reads these words as little-endian, which matches
	    // the byte view only on a little-endian host — confirm if that ever matters.
	    new Uint8Array(u.buffer).set(bytes);

	    const inBuf = device.createBuffer({ size: u.byteLength, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST });
	    const outBuf = device.createBuffer({ size: 32, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC });
	    const parBuf = device.createBuffer({ size: 16, usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST });
	    // Storage buffers cannot be mapped for reading, so the digest is copied into a staging buffer.
	    const stg = device.createBuffer({ size: 32, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ });
	    try {
	      device.queue.writeBuffer(inBuf, 0, u);
	      device.queue.writeBuffer(parBuf, 0, new Uint32Array([len, 0, 0, 0]));
	      const bg = device.createBindGroup({
	        layout: pipeline.getBindGroupLayout(0),
	        entries: [
	          { binding: 0, resource: { buffer: inBuf } },
	          { binding: 1, resource: { buffer: outBuf } },
	          { binding: 2, resource: { buffer: parBuf } },
	        ],
	      });
	      const enc = device.createCommandEncoder();
	      const p = enc.beginComputePass();
	      p.setPipeline(pipeline);
	      p.setBindGroup(0, bg);
	      p.dispatchWorkgroups(1);
	      p.end();
	      enc.copyBufferToBuffer(outBuf, 0, stg, 0, 32);
	      device.queue.submit([enc.finish()]);

	      await stg.mapAsync(GPUMapMode.READ);
	      // slice(0) copies the data out before the mapping is released.
	      const w = new Uint32Array(stg.getMappedRange().slice(0));
	      stg.unmap();
	      return hexLE(w);
	    } finally {
	      inBuf.destroy();
	      outBuf.destroy();
	      parBuf.destroy();
	      stg.destroy();
	    }
	  }

	  const sizes = [0, 1, 63, 64, 65, 127, 128, 129, 1023, 1024, 1025, 2047, 2048, 2049, 4096, 10000, 65536, 100000];
	  st.textContent = "running " + sizes.length + " boundary cases on your GPU…";
	  let pass = 0;
	  // official empty-input vector, an extra ground-truth anchor
	  const OFFICIAL_EMPTY = "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262";
	  try {
	    for (const n of sizes) {
	      // Deterministic test pattern: byte i holds i mod 256.
	      const bytes = new Uint8Array(n);
	      for (let i = 0; i < n; i++) bytes[i] = i & 255;
	      const cpu = blake3hex(bytes);
	      const gpu = await gpuHash(bytes);
	      // The empty case must additionally match the official vector.
	      const ok = gpu === cpu && (n !== 0 || cpu === OFFICIAL_EMPTY);
	      if (ok) pass++;
	      const tr = document.createElement("tr");
	      // Only locally generated numbers and hex digits are interpolated into the markup.
	      tr.innerHTML = `<td>${n===0?"empty":"pattern"}${n===0?" (official vector)":""}</td><td class="n">${n.toLocaleString()}</td><td class="hx">${gpu.slice(0,32)}…</td><td class="${ok?'ok':'no'}">${ok?'✓ match':'✗ MISMATCH'}</td>`;
	      tb.appendChild(tr);
	      if (!ok) {
	        res.innerHTML = `<div class="big no">✗ MISMATCH at ${n} bytes</div><div class="hx">gpu ${gpu}<br>cpu ${cpu}</div>`;
	      }
	    }
	  } catch (e) {
	    fail("✗ GPU run failed: " + e.message);
	    return;
	  }
	  st.textContent = `done · ${pass}/${sizes.length} cases`;
	  st.classList.add(pass === sizes.length ? "ok" : "no");
	  if (pass === sizes.length) {
	    res.innerHTML = `<div class="big ok">✓ BLAKE3 runs entirely on your GPU — byte-identical to the CPU oracle across every block, chunk, and tree boundary.</div><div class="hx" style="margin-top:6px">Adapter: ${adapter.info?adapter.info.vendor+" "+adapter.info.architecture:"(info withheld)"}</div>`;
	  }
	})();
	</script></body></html>