Spaces:
Running
Running
| <html><head><meta charset=utf8><meta name=viewport content="width=device-width,initial-scale=1"> | |
<title>BLAKE3 on your GPU — parity test</title>
| <style> | |
| :root{--bg:#0a0d13;--panel:#111825;--ink:#e8ebf1;--dim:#8b95a7;--ac:#7c5cff;--ok:#48c26c;--no:#f0616d;--line:#1e2836} | |
| *{box-sizing:border-box}body{margin:0;background:var(--bg);color:var(--ink);font:15px/1.55 -apple-system,Segoe UI,Roboto,monospace;padding:24px;max-width:820px;margin:0 auto} | |
| h1{font-size:20px;margin:0 0 4px}.sub{color:var(--dim);font-size:13px;margin:0 0 18px} | |
| #status{font-family:ui-monospace,monospace;padding:10px 14px;border:1px solid var(--line);border-radius:10px;background:var(--panel);margin-bottom:16px} | |
| table{border-collapse:collapse;width:100%;font-family:ui-monospace,monospace;font-size:12.5px} | |
| th,td{text-align:left;padding:7px 10px;border-bottom:1px solid var(--line)} | |
| th{color:var(--dim);text-transform:uppercase;font-size:10.5px;letter-spacing:.08em} | |
| td.n{text-align:right;font-variant-numeric:tabular-nums} | |
| .ok{color:var(--ok)}.no{color:var(--no)} | |
| .big{font-size:17px;font-weight:600;margin:16px 0 4px} | |
| .hx{color:var(--dim)} | |
| </style></head><body> | |
<h1>BLAKE3 — <span style="color:var(--ac)">on your GPU</span> vs the CPU oracle</h1>
| <p class="sub">WGSL kernel hashes each input entirely on the GPU; compared byte-for-byte to <code>holo-blake3.mjs</code> and the official empty-input vector. Boundary sizes exercise block (64), chunk (1024), and tree merges.</p> | |
<div id="status">initializing WebGPU…</div>
| <div id="result"></div> | |
| <table id="tbl"><thead><tr><th>input</th><th class="n">bytes</th><th>GPU digest (first 32 hex)</th><th>match</th></tr></thead><tbody></tbody></table> | |
| <script type="module"> | |
// ═══════════════════ CPU ORACLE (holo-blake3.mjs, inlined verbatim) ═══════════════════
// Initial chaining value: eight 32-bit words. Chunks and parent nodes start from a copy of it,
// and compress() also places its first four words in v[8..11] of every state.
const IV = [
  0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
  0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
];
// Message-word permutation applied between rounds: next[i] = current[MSG[i]].
const MSG = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];
// Flag bits OR-ed into the `flags` word of the compression state.
const CHUNK_START = 1;
const CHUNK_END = 2;
const PARENT = 4;
const ROOT = 8;
// Block and chunk sizes, in bytes.
const BLOCK = 64;
const CHUNK = 1024;
/** Rotate the 32-bit value `x` right by `n` bits and return it as an unsigned 32-bit integer. */
const rotr = (x, n) => {
  const shiftedDown = x >>> n;
  const wrappedAround = x << (32 - n);
  return (shiftedDown | wrappedAround) >>> 0;
};
/**
 * Quarter-round mixing step: updates v[a], v[b], v[c], v[d] in place.
 * The same add / xor-rotate sequence runs twice, first with message word `mx`
 * and rotations (16, 12), then with `my` and rotations (8, 7).
 */
function g(v, a, b, c, d, mx, my) {
  const halves = [
    [mx, 16, 12],
    [my, 8, 7],
  ];
  for (const [word, rotD, rotB] of halves) {
    v[a] = (v[a] + v[b] + word) >>> 0;
    v[d] = rotr(v[d] ^ v[a], rotD);
    v[c] = (v[c] + v[d]) >>> 0;
    v[b] = rotr(v[b] ^ v[c], rotB);
  }
}
/**
 * One full round over the 16-word state `v`: g() on the four columns, then on the four
 * diagonals. Lane k consumes message words m[2k] and m[2k+1].
 */
function roundFn(v, m) {
  const lanes = [
    [0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15], // columns
    [0, 5, 10, 15], [1, 6, 11, 12], [2, 7, 8, 13], [3, 4, 9, 14], // diagonals
  ];
  for (let k = 0; k < lanes.length; k++) {
    const [a, b, c, d] = lanes[k];
    g(v, a, b, c, d, m[2 * k], m[2 * k + 1]);
  }
}
/**
 * Compression function.
 * @param {number[]} cv       eight-word input chaining value
 * @param {number[]} m0       sixteen message words (not modified)
 * @param {number}   counter  counter; split into low and high 32-bit halves
 * @param {number}   blockLen block length stored in the state
 * @param {number}   flags    flag bits stored in the state
 * @returns {number[]} sixteen unsigned 32-bit output words
 */
function compress(cv, m0, counter, blockLen, flags) {
  const counterLo = counter >>> 0;
  const counterHi = Math.floor(counter / 4294967296) >>> 0;

  // State layout: cv[0..7], IV[0..3], counter low/high, block length, flags.
  const v = new Array(16);
  for (let i = 0; i < 8; i++) v[i] = cv[i];
  for (let i = 0; i < 4; i++) v[8 + i] = IV[i];
  v[12] = counterLo;
  v[13] = counterHi;
  v[14] = blockLen >>> 0;
  v[15] = flags >>> 0;

  // Seven rounds; the message words are permuted between rounds (not after the last one).
  let schedule = m0.slice();
  for (let round = 0; round < 7; round++) {
    roundFn(v, schedule);
    if (round === 6) break;
    const current = schedule;
    schedule = MSG.map((src) => current[src]);
  }

  // Output: low half is v[i]^v[i+8], high half is v[i+8]^cv[i].
  const out = [];
  for (let i = 0; i < 8; i++) out.push((v[i] ^ v[i + 8]) >>> 0);
  for (let i = 0; i < 8; i++) out.push((v[i + 8] ^ cv[i]) >>> 0);
  return out;
}
/**
 * Pack `len` bytes starting at `bytes[off]` into sixteen little-endian 32-bit words;
 * words not reached by the input stay zero.
 */
function words(bytes, off, len) {
  const m = Array.from({ length: 16 }, () => 0);
  let i = 0;
  while (i < len) {
    const wordIndex = i >> 2;
    const bitOffset = (i & 3) * 8;
    m[wordIndex] |= bytes[off + i] << bitOffset;
    i += 1;
  }
  // `|=` yields signed 32-bit values; normalise the sixteen words to unsigned.
  for (let w = 0; w < 16; w++) {
    m[w] = m[w] >>> 0;
  }
  return m;
}
/** Compress a pending node and return the first eight output words (its chaining value). */
function nodeCV(o) {
  const { cv, m, counter, blockLen, flags } = o;
  const output = compress(cv, m, counter, blockLen, flags);
  return output.slice(0, 8);
}
/**
 * Finalise a pending node as the root: compress it with counter 0 and the ROOT flag added,
 * then serialise the first eight output words as 32 little-endian bytes.
 */
function nodeRoot(o) {
  const output = compress(o.cv, o.m, 0, o.blockLen, o.flags | ROOT);
  const digest = new Uint8Array(32);
  const view = new DataView(digest.buffer);
  for (let i = 0; i < 8; i++) {
    view.setUint32(i * 4, output[i], true);
  }
  return digest;
}
/**
 * Process one chunk (`len` bytes of `bytes` starting at `start`, chunk index `counter`).
 * Every block except the last is compressed into the running chaining value; the last
 * block (possibly empty or partial) is returned uncompressed as a pending node so the
 * caller can finish it either as a chaining value or as the root.
 */
function chunkNode(bytes, start, len, counter) {
  let cv = IV.slice();
  let flags = CHUNK_START; // only the first block of the chunk carries CHUNK_START
  let offset = start;
  let remaining = len;
  while (remaining > BLOCK) {
    const block = words(bytes, offset, BLOCK);
    cv = compress(cv, block, counter, BLOCK, flags).slice(0, 8);
    flags = 0;
    offset += BLOCK;
    remaining -= BLOCK;
  }
  return {
    cv,
    m: words(bytes, offset, remaining),
    counter,
    blockLen: remaining,
    flags: flags | CHUNK_END,
  };
}
/**
 * Build the pending parent node of two children: the message block is the left chaining
 * value followed by the right one, starting from a fresh copy of IV with the PARENT flag.
 */
function parentNode(l, r) {
  const startingCv = IV.slice();
  const message = l.concat(r);
  return {
    cv: startingCv,
    m: message,
    counter: 0,
    blockLen: BLOCK,
    flags: PARENT,
  };
}
/**
 * Recursively hash `len` bytes starting at `start`, whose first chunk has index `counter`,
 * and return the pending (not yet compressed) top node of that subtree.
 * Inputs of at most one chunk are a single chunk node; otherwise the left side takes the
 * largest power-of-two multiple of CHUNK that is strictly smaller than `len`.
 */
function subtree(bytes, start, len, counter) {
  if (len <= CHUNK) {
    return chunkNode(bytes, start, len, counter);
  }
  let leftLen = CHUNK;
  for (; leftLen * 2 < len; leftLen *= 2) {
    // keep doubling while the right side would still be non-empty
  }
  const leftNode = subtree(bytes, start, leftLen, counter);
  const leftCv = nodeCV(leftNode);
  const rightNode = subtree(bytes, start + leftLen, len - leftLen, counter + leftLen / CHUNK);
  const rightCv = nodeCV(rightNode);
  return parentNode(leftCv, rightCv);
}
/** Hash `bytes` with the CPU oracle and return the 32-byte digest as 64 lowercase hex characters. */
function blake3hex(bytes) {
  const top = subtree(bytes, 0, bytes.length, 0);
  const digest = nodeRoot(top);
  const pairs = Array.from({ length: 32 }, (_, i) => digest[i].toString(16).padStart(2, "0"));
  return pairs.join("");
}
// ═══════════════════ WGSL BLAKE3 — runs ENTIRELY on the GPU ═══════════════════
// Single-invocation port of the incremental/stack tree (no recursion). Input is zero-padded to a
// multiple of 64 bytes as an array<u32> (LE) so each 64-byte block = 16 consecutive u32. blockLen in
// the compression state carries the true length. Stack of subtree CVs collapses on even chunk counts,
// exactly like createBlake3(); the final chunk merges down the stack with ROOT — byte-identical to blake3().
| const WGSL = ` | |
// Message as little-endian u32 words; the host zero-pads it to whole 64-byte blocks.
@group(0) @binding(0) var<storage, read> inp : array<u32>;
// Result: main() writes the eight digest words to outp[0..7].
@group(0) @binding(1) var<storage, read_write> outp : array<u32>;
// Parameters: P.x is the message length in bytes; the shader reads no other component.
@group(0) @binding(2) var<uniform> P : vec4<u32>;

// Initial chaining value (eight words).
const IV = array<u32,8>(
  0x6a09e667u, 0xbb67ae85u, 0x3c6ef372u, 0xa54ff53au,
  0x510e527fu, 0x9b05688cu, 0x1f83d9abu, 0x5be0cd19u
);
// Rotate x right by n bits. Callers pass n in {7, 8, 12, 16}, so both shift amounts stay below 32.
fn rotr(x: u32, n: u32) -> u32 {
  let shiftedDown = x >> n;
  let wrappedAround = x << (32u - n);
  return shiftedDown | wrappedAround;
}
// Compression function: chaining value cv[8], message block m0[16], counter as low/high words,
// block length and flag bits in; the full 16-word output out (callers keep words 0..7).
fn compress(cv:array<u32,8>, m0:array<u32,16>, cl:u32, ch:u32, blockLen:u32, flags:u32) -> array<u32,16> {
  // Message permutation applied between rounds: next[i] = m[perm[i]].
  var perm = array<u32,16>(2u,6u,3u,10u,7u,0u,4u,13u,1u,11u,12u,5u,9u,14u,15u,8u);

  // State: cv[0..7], IV[0..3], counter low/high, block length, flags.
  var v: array<u32,16>;
  for (var i = 0u; i < 8u; i++) { v[i] = cv[i]; }
  v[8] = IV[0];  v[9] = IV[1];  v[10] = IV[2];  v[11] = IV[3];
  v[12] = cl;    v[13] = ch;    v[14] = blockLen;  v[15] = flags;

  var m = m0;
  for (var r = 0u; r < 7u; r++) {
    // Each pair of lines is one quarter-round on lanes (a,b,c,d) with two message words:
    //   a += b + mx; d = rotr(d^a,16); c += d; b = rotr(b^c,12);
    //   a += b + my; d = rotr(d^a,8);  c += d; b = rotr(b^c,7);
    // Lane indices are written as literals, unrolled by hand.

    // columns
    v[0] += v[4] + m[0];   v[12] = rotr(v[12] ^ v[0], 16u);  v[8] += v[12];   v[4] = rotr(v[4] ^ v[8], 12u);
    v[0] += v[4] + m[1];   v[12] = rotr(v[12] ^ v[0], 8u);   v[8] += v[12];   v[4] = rotr(v[4] ^ v[8], 7u);

    v[1] += v[5] + m[2];   v[13] = rotr(v[13] ^ v[1], 16u);  v[9] += v[13];   v[5] = rotr(v[5] ^ v[9], 12u);
    v[1] += v[5] + m[3];   v[13] = rotr(v[13] ^ v[1], 8u);   v[9] += v[13];   v[5] = rotr(v[5] ^ v[9], 7u);

    v[2] += v[6] + m[4];   v[14] = rotr(v[14] ^ v[2], 16u);  v[10] += v[14];  v[6] = rotr(v[6] ^ v[10], 12u);
    v[2] += v[6] + m[5];   v[14] = rotr(v[14] ^ v[2], 8u);   v[10] += v[14];  v[6] = rotr(v[6] ^ v[10], 7u);

    v[3] += v[7] + m[6];   v[15] = rotr(v[15] ^ v[3], 16u);  v[11] += v[15];  v[7] = rotr(v[7] ^ v[11], 12u);
    v[3] += v[7] + m[7];   v[15] = rotr(v[15] ^ v[3], 8u);   v[11] += v[15];  v[7] = rotr(v[7] ^ v[11], 7u);

    // diagonals
    v[0] += v[5] + m[8];   v[15] = rotr(v[15] ^ v[0], 16u);  v[10] += v[15];  v[5] = rotr(v[5] ^ v[10], 12u);
    v[0] += v[5] + m[9];   v[15] = rotr(v[15] ^ v[0], 8u);   v[10] += v[15];  v[5] = rotr(v[5] ^ v[10], 7u);

    v[1] += v[6] + m[10];  v[12] = rotr(v[12] ^ v[1], 16u);  v[11] += v[12];  v[6] = rotr(v[6] ^ v[11], 12u);
    v[1] += v[6] + m[11];  v[12] = rotr(v[12] ^ v[1], 8u);   v[11] += v[12];  v[6] = rotr(v[6] ^ v[11], 7u);

    v[2] += v[7] + m[12];  v[13] = rotr(v[13] ^ v[2], 16u);  v[8] += v[13];   v[7] = rotr(v[7] ^ v[8], 12u);
    v[2] += v[7] + m[13];  v[13] = rotr(v[13] ^ v[2], 8u);   v[8] += v[13];   v[7] = rotr(v[7] ^ v[8], 7u);

    v[3] += v[4] + m[14];  v[14] = rotr(v[14] ^ v[3], 16u);  v[9] += v[14];   v[4] = rotr(v[4] ^ v[9], 12u);
    v[3] += v[4] + m[15];  v[14] = rotr(v[14] ^ v[3], 8u);   v[9] += v[14];   v[4] = rotr(v[4] ^ v[9], 7u);

    // Permute the message words for the next round (skipped after the last round).
    if (r < 6u) {
      var next: array<u32,16>;
      for (var i = 0u; i < 16u; i++) { next[i] = m[perm[i]]; }
      m = next;
    }
  }

  // Output: low half is v[i]^v[i+8], high half is v[i+8]^cv[i].
  var o: array<u32,16>;
  for (var i = 0u; i < 8u; i++) {
    let upper = v[i + 8u];
    o[i] = v[i] ^ upper;
    o[i + 8u] = upper ^ cv[i];
  }
  return o;
}
// Read the 16 message words of the block that starts at byte offset byteOff.
// The host pads the input with zeros to whole blocks, so a partial block needs no masking here.
fn loadBlock(byteOff: u32) -> array<u32,16> {
  let firstWord = byteOff / 4u;
  var block: array<u32,16>;
  var w: u32 = 0u;
  while (w < 16u) {
    block[w] = inp[firstWord + w];
    w++;
  }
  return block;
}
// Chaining value (8 words) of the chunk covering bytes [start, start+len) with chunk index ctr.
// Used for every chunk that is not finalised as the root.
fn chunkCV(start:u32, len:u32, ctr:u32) -> array<u32,8> {
  var cv: array<u32,8> = IV;
  var flags: u32 = 1u;            // CHUNK_START, carried by the first block only
  var off: u32 = start;
  var remaining: u32 = len;
  // Every block except the last one is a full 64-byte block.
  while (remaining > 64u) {
    let o = compress(cv, loadBlock(off), ctr, 0u, 64u, flags);
    for (var j = 0u; j < 8u; j++) { cv[j] = o[j]; }
    flags = 0u;
    off += 64u;
    remaining -= 64u;
  }
  // Last block (0..64 bytes) carries CHUNK_END.
  let last = compress(cv, loadBlock(off), ctr, 0u, remaining, flags | 2u);
  return array<u32,8>(last[0], last[1], last[2], last[3], last[4], last[5], last[6], last[7]);
}
// Combine two child chaining values: the message block is left followed by right, compressed
// from IV with the PARENT flag (4). When isRoot is true the ROOT flag (8) is added as well, so
// the returned words are the digest rather than an intermediate chaining value.
fn parentCV(l:array<u32,8>, r:array<u32,8>, isRoot:bool) -> array<u32,8> {
  let m = array<u32,16>(
    l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
    r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]
  );
  let flags = select(4u, 12u, isRoot);   // PARENT, or PARENT|ROOT
  let o = compress(IV, m, 0u, 0u, 64u, flags);
  return array<u32,8>(o[0], o[1], o[2], o[3], o[4], o[5], o[6], o[7]);
}
// Digest of an input that fits in a single chunk (no parent nodes): same block walk as chunkCV,
// but the last block is compressed with counter 0 and CHUNK_END|ROOT.
fn chunkRoot(start:u32, len:u32, ctr:u32) -> array<u32,8> {
  var cv: array<u32,8> = IV;
  var flags: u32 = 1u;            // CHUNK_START, carried by the first block only
  var off: u32 = start;
  var remaining: u32 = len;
  while (remaining > 64u) {
    let o = compress(cv, loadBlock(off), ctr, 0u, 64u, flags);
    for (var j = 0u; j < 8u; j++) { cv[j] = o[j]; }
    flags = 0u;
    off += 64u;
    remaining -= 64u;
  }
  let last = compress(cv, loadBlock(off), 0u, 0u, remaining, flags | 2u | 8u);
  return array<u32,8>(last[0], last[1], last[2], last[3], last[4], last[5], last[6], last[7]);
}
// Entry point (one invocation): hashes the P.x bytes in inp and writes the 8 digest words to outp.
@compute @workgroup_size(1)
fn main() {
  let len = P.x;
  var stack: array<array<u32,8>, 54>;   // chaining values of completed subtrees
  var depth: u32 = 0u;

  // An empty input still counts as one (empty) chunk.
  var totalChunks: u32 = 1u;
  if (len != 0u) { totalChunks = (len + 1023u) / 1024u; }
  // Chunks 0..flushed-1 are full and go through the stack; the last chunk is finalised below.
  let flushed = totalChunks - 1u;

  for (var c = 0u; c < flushed; c++) {
    var cv = chunkCV(c * 1024u, 1024u, c);
    // After adding chunk c there are c+1 chunks; merge once per trailing zero bit of that count.
    var t = c + 1u;
    while ((t & 1u) == 0u) {
      depth -= 1u;
      cv = parentCV(stack[depth], cv, false);
      t = t >> 1u;
    }
    stack[depth] = cv;
    depth += 1u;
  }

  let finalOff = flushed * 1024u;
  let finalLen = len - finalOff;        // 0..1024
  var root: array<u32,8>;
  if (depth == 0u) {
    // Single chunk: it is its own root.
    root = chunkRoot(finalOff, finalLen, flushed);
  } else {
    // Fold the final chunk's chaining value down the stack; the merge with stack[0] is the root.
    root = chunkCV(finalOff, finalLen, flushed);
    var remaining = depth;
    while (remaining > 0u) {
      remaining -= 1u;
      root = parentCV(stack[remaining], root, remaining == 0u);
    }
  }

  for (var i = 0u; i < 8u; i++) { outp[i] = root[i]; }
}
| `; | |
// ═══════════════════ WebGPU harness ═══════════════════
// DOM handles used by the harness: a querySelector shorthand, the status box, the results
// table body, and the summary container.
const $ = (s) => document.querySelector(s);
const st = $("#status");
const tb = $("#tbl tbody");
const res = $("#result");
/**
 * Format the first eight 32-bit words of `words` as 64 lowercase hex characters, emitting
 * each word's bytes least-significant first.
 */
function hexLE(words) {
  const parts = [];
  for (let i = 0; i < 8; i++) {
    const word = words[i] >>> 0;
    for (let shift = 0; shift < 32; shift += 8) {
      const byte = (word >>> shift) & 255;
      parts.push(byte.toString(16).padStart(2, "0"));
    }
  }
  return parts.join("");
}
// Harness entry point: acquire a WebGPU device, compile the WGSL kernel, then hash each
// boundary-size input on the GPU and compare it with the CPU oracle. One table row is added per
// case and the outcome is summarised in the status box and the result container.
(async () => {
  // Show a fatal condition in the status box and colour it as a failure, so every error path
  // is styled the same way.
  const fail = (message) => {
    st.textContent = message;
    st.classList.add("no");
  };

  if (!navigator.gpu) {
    fail("✗ No WebGPU in this browser. Open in Chrome/Edge (or enable WebGPU).");
    return;
  }

  let adapter;
  let device;
  try {
    adapter = await navigator.gpu.requestAdapter();
    // requestAdapter() resolves to null (it does not reject) when no adapter is available;
    // report that explicitly instead of failing on a null dereference.
    if (!adapter) {
      fail("✗ WebGPU device error: no suitable GPU adapter available");
      return;
    }
    device = await adapter.requestDevice();
  } catch (e) {
    fail(`✗ WebGPU device error: ${e.message}`);
    return;
  }

  let module;
  try {
    module = device.createShaderModule({ code: WGSL });
    const info = await module.getCompilationInfo();
    const firstError = info.messages.find((m) => m.type === "error");
    if (firstError) {
      fail(`✗ WGSL compile error: ${firstError.message} (line ${firstError.lineNum})`);
      return;
    }
  } catch (e) {
    fail(`✗ shader error: ${e.message}`);
    return;
  }
  const pipeline = device.createComputePipeline({ layout: "auto", compute: { module, entryPoint: "main" } });

  /**
   * Hash `bytes` on the GPU and return the digest as 64 hex characters.
   * The input is uploaded zero-padded to whole 64-byte blocks (at least one block); the true
   * byte length travels in the uniform buffer. All buffers created here are destroyed even
   * when submission or read-back fails.
   */
  async function gpuHash(bytes) {
    const len = bytes.length;
    const wordCount = Math.max(1, Math.ceil(len / 64)) * 16; // u32 count, ≥1 block
    const padded = new Uint32Array(wordCount);
    new Uint8Array(padded.buffer).set(bytes);

    const owned = [];
    const makeBuffer = (size, usage) => {
      const buffer = device.createBuffer({ size, usage });
      owned.push(buffer);
      return buffer;
    };

    try {
      const inBuf = makeBuffer(padded.byteLength, GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST);
      device.queue.writeBuffer(inBuf, 0, padded);
      const outBuf = makeBuffer(32, GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC);
      const parBuf = makeBuffer(16, GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST);
      device.queue.writeBuffer(parBuf, 0, new Uint32Array([len, 0, 0, 0]));
      const stg = makeBuffer(32, GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ);

      const bg = device.createBindGroup({
        layout: pipeline.getBindGroupLayout(0),
        entries: [
          { binding: 0, resource: { buffer: inBuf } },
          { binding: 1, resource: { buffer: outBuf } },
          { binding: 2, resource: { buffer: parBuf } },
        ],
      });

      const enc = device.createCommandEncoder();
      const p = enc.beginComputePass();
      p.setPipeline(pipeline);
      p.setBindGroup(0, bg);
      p.dispatchWorkgroups(1);
      p.end();
      enc.copyBufferToBuffer(outBuf, 0, stg, 0, 32);
      device.queue.submit([enc.finish()]);

      await stg.mapAsync(GPUMapMode.READ);
      // Copy the words out before unmapping; the mapped range is detached by unmap().
      const w = new Uint32Array(stg.getMappedRange().slice(0));
      stg.unmap();
      return hexLE(w);
    } finally {
      for (const buffer of owned) buffer.destroy();
    }
  }

  const sizes = [0, 1, 63, 64, 65, 127, 128, 129, 1023, 1024, 1025, 2047, 2048, 2049, 4096, 10000, 65536, 100000];
  st.textContent = `running ${sizes.length} boundary cases on your GPU…`;
  let pass = 0;
  // official empty-input vector, an extra ground-truth anchor
  const OFFICIAL_EMPTY = "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262";

  try {
    for (const n of sizes) {
      const bytes = new Uint8Array(n);
      for (let i = 0; i < n; i++) bytes[i] = i & 255;
      const cpu = blake3hex(bytes);
      const gpu = await gpuHash(bytes);
      const ok = gpu === cpu && (n !== 0 || cpu === OFFICIAL_EMPTY);
      if (ok) pass++;

      const tr = document.createElement("tr");
      tr.innerHTML = `<td>${n === 0 ? "empty (official vector)" : "pattern"}</td><td class="n">${n.toLocaleString()}</td><td class="hx">${gpu.slice(0, 32)}…</td><td class="${ok ? "ok" : "no"}">${ok ? "✓ match" : "✗ MISMATCH"}</td>`;
      tb.appendChild(tr);
      if (!ok) {
        res.innerHTML = `<div class="big no">✗ MISMATCH at ${n} bytes</div><div class="hx">gpu ${gpu}<br>cpu ${cpu}</div>`;
      }
    }
  } catch (e) {
    // A failure during a run (for example a rejected mapAsync) must not leave the page
    // stuck on "running…".
    fail(`✗ GPU run failed after ${pass} passing cases: ${e.message}`);
    return;
  }

  st.textContent = `done · ${pass}/${sizes.length} cases`;
  st.classList.add(pass === sizes.length ? "ok" : "no");
  if (pass === sizes.length) {
    res.innerHTML = `<div class="big ok">✓ BLAKE3 runs entirely on your GPU — byte-identical to the CPU oracle across every block, chunk, and tree boundary.</div><div class="hx" style="margin-top:6px">Adapter: ${adapter.info ? adapter.info.vendor + " " + adapter.info.architecture : "(info withheld)"}</div>`;
  }
})();
| </script></body></html> | |