q / blake3-gpu-test.html
Humuhumu33's picture
Upload blake3-gpu-test.html with huggingface_hub
a349783 verified
Raw
History Blame Contribute Delete
15.3 kB
<!doctype html><html><head><meta charset=utf8><meta name=viewport content="width=device-width,initial-scale=1">
<title>BLAKE3 on your GPU — parity test</title>
<style>
:root{--bg:#0a0d13;--panel:#111825;--ink:#e8ebf1;--dim:#8b95a7;--ac:#7c5cff;--ok:#48c26c;--no:#f0616d;--line:#1e2836}
*{box-sizing:border-box}body{margin:0;background:var(--bg);color:var(--ink);font:15px/1.55 -apple-system,Segoe UI,Roboto,monospace;padding:24px;max-width:820px;margin:0 auto}
h1{font-size:20px;margin:0 0 4px}.sub{color:var(--dim);font-size:13px;margin:0 0 18px}
#status{font-family:ui-monospace,monospace;padding:10px 14px;border:1px solid var(--line);border-radius:10px;background:var(--panel);margin-bottom:16px}
table{border-collapse:collapse;width:100%;font-family:ui-monospace,monospace;font-size:12.5px}
th,td{text-align:left;padding:7px 10px;border-bottom:1px solid var(--line)}
th{color:var(--dim);text-transform:uppercase;font-size:10.5px;letter-spacing:.08em}
td.n{text-align:right;font-variant-numeric:tabular-nums}
.ok{color:var(--ok)}.no{color:var(--no)}
.big{font-size:17px;font-weight:600;margin:16px 0 4px}
.hx{color:var(--dim)}
</style></head><body>
<h1>BLAKE3 — <span style="color:var(--ac)">on your GPU</span> vs the CPU oracle</h1>
<p class="sub">WGSL kernel hashes each input entirely on the GPU; compared byte-for-byte to <code>holo-blake3.mjs</code> and the official empty-input vector. Boundary sizes exercise block (64), chunk (1024), and tree merges.</p>
<div id="status">initializing WebGPU…</div>
<div id="result"></div>
<table id="tbl"><thead><tr><th>input</th><th class="n">bytes</th><th>GPU digest (first 32 hex)</th><th>match</th></tr></thead><tbody></tbody></table>
<script type="module">
// ─────────────────── CPU ORACLE (holo-blake3.mjs, inlined verbatim) ───────────────────
// Initial chaining value: chunks and parent nodes start from these eight words, and the first
// four also fill state words 8..11 of every compression.
const IV = [
  0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
  0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
];
// Message-word permutation applied between rounds: next[i] = current[MSG[i]].
const MSG = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];
// Flag bits OR-ed into the last state word of a compression.
const CHUNK_START = 1;
const CHUNK_END = 2;
const PARENT = 4;
const ROOT = 8;
// Sizes in bytes of one message block and of one chunk.
const BLOCK = 64;
const CHUNK = 1024;
/** Rotate the 32-bit value `x` right by `n` bits; the result is an unsigned 32-bit number. */
const rotr = (x, n) => {
  const shiftedDown = x >>> n;
  const wrappedAround = x << (32 - n);
  return (shiftedDown | wrappedAround) >>> 0;
};
/**
 * Quarter-round mixing step: folds the message words `mx` and `my` into the four
 * state entries v[a], v[b], v[c], v[d]. Mutates `v` in place and returns nothing.
 * All results are kept as unsigned 32-bit numbers.
 */
function g(v, a, b, c, d, mx, my) {
  // The two halves are the same add/xor/rotate sequence with a different
  // message word and different rotation amounts.
  const halfStep = (word, rotD, rotB) => {
    v[a] = (v[a] + v[b] + word) >>> 0;
    v[d] = rotr(v[d] ^ v[a], rotD);
    v[c] = (v[c] + v[d]) >>> 0;
    v[b] = rotr(v[b] ^ v[c], rotB);
  };
  halfStep(mx, 16, 12);
  halfStep(my, 8, 7);
}
/**
 * One full round: eight g() applications on state `v` (mutated in place).
 * The first four work on the columns of the 4x4 state, the last four on its diagonals;
 * the k-th application consumes message words m[2k] and m[2k+1].
 */
function roundFn(v, m) {
  const lanes = [
    [0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15], // columns
    [0, 5, 10, 15], [1, 6, 11, 12], [2, 7, 8, 13], [3, 4, 9, 14], // diagonals
  ];
  lanes.forEach(([a, b, c, d], k) => {
    g(v, a, b, c, d, m[2 * k], m[2 * k + 1]);
  });
}
/**
 * Compression function.
 * @param {number[]} cv - 8-word input chaining value (not modified).
 * @param {number[]} m0 - 16 message words (copied, not modified).
 * @param {number} counter - counter value; split into low and high 32-bit halves.
 * @param {number} blockLen - number of meaningful bytes in the block.
 * @param {number} flags - flag bits for this compression.
 * @returns {number[]} 16 unsigned 32-bit words; the first 8 are the new chaining value.
 */
function compress(cv, m0, counter, blockLen, flags) {
  const counterLo = counter >>> 0;
  const counterHi = Math.floor(counter / 4294967296) >>> 0;
  const v = [
    ...Array.from({ length: 8 }, (_, i) => cv[i]),
    IV[0], IV[1], IV[2], IV[3],
    counterLo, counterHi, blockLen >>> 0, flags >>> 0,
  ];

  let m = m0.slice();
  for (let round = 0; round < 7; round += 1) {
    roundFn(v, m);
    // The message schedule is permuted between rounds, not after the last one.
    if (round < 6) {
      const current = m;
      m = MSG.map((source) => current[source]);
    }
  }

  const lowHalf = [];
  const highHalf = [];
  for (let i = 0; i < 8; i += 1) {
    lowHalf.push((v[i] ^ v[i + 8]) >>> 0);
    highHalf.push((v[i + 8] ^ cv[i]) >>> 0);
  }
  return lowHalf.concat(highHalf);
}
/**
 * Pack `len` bytes of `bytes`, starting at `off`, into 16 little-endian 32-bit words.
 * Words not reached by `len` stay 0.
 * @returns {number[]} 16 unsigned 32-bit numbers.
 */
function words(bytes, off, len) {
  const m = Array.from({ length: 16 }, () => 0);
  for (let pos = 0; pos < len; pos += 1) {
    const wordIndex = pos >> 2;
    const shift = (pos & 3) << 3; // byte lane within the word: 0, 8, 16 or 24
    m[wordIndex] |= bytes[off + pos] << shift;
  }
  // `|=` and `<<` produce signed 32-bit values; reinterpret the 16 words as unsigned.
  for (let w = 0; w < 16; w += 1) {
    m[w] = m[w] >>> 0;
  }
  return m;
}
/** Chaining value of a pending node: the first 8 words of compressing it with its own counter and flags. */
function nodeCV(o) {
  const { cv, m, counter, blockLen, flags } = o;
  const output = compress(cv, m, counter, blockLen, flags);
  return output.slice(0, 8);
}
/**
 * Finalize a pending node as the tree root: compress it with counter 0 and the ROOT flag
 * added, then serialize the first 8 output words little-endian.
 * @returns {Uint8Array} the 32-byte digest.
 */
function nodeRoot(o) {
  const rootFlags = o.flags | ROOT;
  const out = compress(o.cv, o.m, 0, o.blockLen, rootFlags);
  const digest = new Uint8Array(32);
  const view = new DataView(digest.buffer);
  for (let i = 0; i < 8; i += 1) {
    view.setUint32(i * 4, out[i], true); // true = little-endian byte order
  }
  return digest;
}
/**
 * Process one chunk (`len` bytes of `bytes` from `start`) up to, but not including, its last
 * block. Every earlier block is compressed into the running chaining value; the last block is
 * returned as a pending node so the caller can finish it as a plain CV or as the root.
 * CHUNK_START is set on the first block only, CHUNK_END on the pending last block.
 * @returns {{cv:number[], m:number[], counter:number, blockLen:number, flags:number}}
 */
function chunkNode(bytes, start, len, counter) {
  const blockCount = Math.max(1, Math.ceil(len / BLOCK)); // an empty chunk still has one block
  const lastIndex = blockCount - 1;

  let chaining = IV.slice();
  for (let k = 0; k < lastIndex; k += 1) {
    const blockFlags = k === 0 ? CHUNK_START : 0;
    const message = words(bytes, start + k * BLOCK, BLOCK);
    chaining = compress(chaining, message, counter, BLOCK, blockFlags).slice(0, 8);
  }

  const tailOffset = start + lastIndex * BLOCK;
  const tailLength = len - lastIndex * BLOCK;
  const carriedFlags = lastIndex === 0 ? CHUNK_START : 0; // single-block chunk: start and end coincide
  return {
    cv: chaining,
    m: words(bytes, tailOffset, tailLength),
    counter,
    blockLen: tailLength,
    flags: carriedFlags | CHUNK_END,
  };
}
/**
 * Pending parent node over two child chaining values: the message block is the left CV
 * followed by the right CV, starting from a fresh copy of IV, with counter 0 and the PARENT flag.
 */
function parentNode(l, r) {
  const message = [...l, ...r];
  return {
    cv: [...IV],
    m: message,
    counter: 0,
    blockLen: BLOCK,
    flags: PARENT,
  };
}
/**
 * Build the pending top node for `len` bytes starting at `start`, whose first chunk has
 * index `counter`. Inputs of at most one chunk become a chunk node. Longer inputs are split
 * so the left side is the largest power-of-two number of chunks strictly shorter than `len`;
 * both sides are reduced to chaining values and joined under a parent node.
 */
function subtree(bytes, start, len, counter) {
  if (len <= CHUNK) {
    return chunkNode(bytes, start, len, counter);
  }

  let leftLen = CHUNK;
  for (; leftLen * 2 < len; leftLen *= 2);

  const leftNode = subtree(bytes, start, leftLen, counter);
  const leftCV = nodeCV(leftNode);
  const rightNode = subtree(bytes, start + leftLen, len - leftLen, counter + leftLen / CHUNK);
  const rightCV = nodeCV(rightNode);
  return parentNode(leftCV, rightCV);
}
/**
 * CPU oracle entry point: hash `bytes` and return the 32-byte digest as 64 lowercase hex characters.
 */
function blake3hex(bytes) {
  const topNode = subtree(bytes, 0, bytes.length, 0);
  const digest = nodeRoot(topNode);
  const hexPairs = Array.from({ length: 32 }, (_, i) => digest[i].toString(16).padStart(2, "0"));
  return hexPairs.join("");
}
// ─────────────────── WGSL BLAKE3 — runs ENTIRELY on the GPU ───────────────────
// Single-invocation port of the incremental/stack tree (no recursion). Input is zero-padded to a
// multiple of 64 bytes as an array<u32> (LE) so each 64-byte block = 16 consecutive u32. blockLen in
// the compression state carries the true length. Stack of subtree CVs collapses on even chunk counts,
// exactly like createBlake3(); the final chunk merges down the stack with ROOT — byte-identical to blake3().
const WGSL = `
// Message words. Each 64-byte block is read as 16 consecutive u32; the host pads the buffer
// with zeros to a whole number of blocks.
@group(0) @binding(0) var<storage, read> inp : array<u32>;
// Result buffer: main writes the 8 digest words to outp[0..7].
@group(0) @binding(1) var<storage, read_write> outp : array<u32>;
// Parameters: only P.x is read by this shader.
@group(0) @binding(2) var<uniform> P : vec4<u32>; // P.x = byte length
// Initial chaining value; its first four words also fill state words 8..11 in compress.
const IV = array<u32,8>(0x6a09e667u,0xbb67ae85u,0x3c6ef372u,0xa54ff53au,0x510e527fu,0x9b05688cu,0x1f83d9abu,0x5be0cd19u);
// Rotate the 32-bit word x right by n bits. compress only calls this with n = 16, 12, 8 or 7.
fn rotr(x: u32, n: u32) -> u32 {
  let shiftedDown = x >> n;
  let wrappedAround = x << (32u - n);
  return shiftedDown | wrappedAround;
}
// compress -> 16 output words.
//   cv       8-word input chaining value
//   m0       16 message words
//   cl, ch   low and high 32 bits of the counter
//   blockLen number of meaningful bytes in the block
//   flags    flag bits for this compression
// Output words 0..7 are v[i] ^ v[i+8]; words 8..15 are v[i+8] ^ cv[i].
fn compress(cv:array<u32,8>, m0:array<u32,16>, cl:u32, ch:u32, blockLen:u32, flags:u32) -> array<u32,16> {
var v:array<u32,16>;
// State layout: chaining value, first four IV words, counter lo/hi, block length, flags.
v[0]=cv[0]; v[1]=cv[1]; v[2]=cv[2]; v[3]=cv[3]; v[4]=cv[4]; v[5]=cv[5]; v[6]=cv[6]; v[7]=cv[7];
v[8]=IV[0]; v[9]=IV[1]; v[10]=IV[2]; v[11]=IV[3]; v[12]=cl; v[13]=ch; v[14]=blockLen; v[15]=flags;
var m:array<u32,16> = m0;
for (var r:u32=0u; r<7u; r=r+1u) {
// roundFn: 8 g() applications, written out with literal indices.
// g(a,b,c,d,mx,my): a=a+b+mx; d=rotr(d^a,16); c=c+d; b=rotr(b^c,12); a=a+b+my; d=rotr(d^a,8); c=c+d; b=rotr(b^c,7)
// columns: (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15), message words m[0..7]
v[0]=v[0]+v[4]+m[0]; v[12]=rotr(v[12]^v[0],16u); v[8]=v[8]+v[12]; v[4]=rotr(v[4]^v[8],12u); v[0]=v[0]+v[4]+m[1]; v[12]=rotr(v[12]^v[0],8u); v[8]=v[8]+v[12]; v[4]=rotr(v[4]^v[8],7u);
v[1]=v[1]+v[5]+m[2]; v[13]=rotr(v[13]^v[1],16u); v[9]=v[9]+v[13]; v[5]=rotr(v[5]^v[9],12u); v[1]=v[1]+v[5]+m[3]; v[13]=rotr(v[13]^v[1],8u); v[9]=v[9]+v[13]; v[5]=rotr(v[5]^v[9],7u);
v[2]=v[2]+v[6]+m[4]; v[14]=rotr(v[14]^v[2],16u); v[10]=v[10]+v[14]; v[6]=rotr(v[6]^v[10],12u); v[2]=v[2]+v[6]+m[5]; v[14]=rotr(v[14]^v[2],8u); v[10]=v[10]+v[14]; v[6]=rotr(v[6]^v[10],7u);
v[3]=v[3]+v[7]+m[6]; v[15]=rotr(v[15]^v[3],16u); v[11]=v[11]+v[15]; v[7]=rotr(v[7]^v[11],12u); v[3]=v[3]+v[7]+m[7]; v[15]=rotr(v[15]^v[3],8u); v[11]=v[11]+v[15]; v[7]=rotr(v[7]^v[11],7u);
// diagonals: (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14), message words m[8..15]
v[0]=v[0]+v[5]+m[8]; v[15]=rotr(v[15]^v[0],16u); v[10]=v[10]+v[15]; v[5]=rotr(v[5]^v[10],12u); v[0]=v[0]+v[5]+m[9]; v[15]=rotr(v[15]^v[0],8u); v[10]=v[10]+v[15]; v[5]=rotr(v[5]^v[10],7u);
v[1]=v[1]+v[6]+m[10]; v[12]=rotr(v[12]^v[1],16u); v[11]=v[11]+v[12]; v[6]=rotr(v[6]^v[11],12u); v[1]=v[1]+v[6]+m[11]; v[12]=rotr(v[12]^v[1],8u); v[11]=v[11]+v[12]; v[6]=rotr(v[6]^v[11],7u);
v[2]=v[2]+v[7]+m[12]; v[13]=rotr(v[13]^v[2],16u); v[8]=v[8]+v[13]; v[7]=rotr(v[7]^v[8],12u); v[2]=v[2]+v[7]+m[13]; v[13]=rotr(v[13]^v[2],8u); v[8]=v[8]+v[13]; v[7]=rotr(v[7]^v[8],7u);
v[3]=v[3]+v[4]+m[14]; v[14]=rotr(v[14]^v[3],16u); v[9]=v[9]+v[14]; v[4]=rotr(v[4]^v[9],12u); v[3]=v[3]+v[4]+m[15]; v[14]=rotr(v[14]^v[3],8u); v[9]=v[9]+v[14]; v[4]=rotr(v[4]^v[9],7u);
// Permute the message words between rounds (skipped after the 7th round): next[i] = m[perm[i]].
if (r < 6u) { var perm=array<u32,16>(2u,6u,3u,10u,7u,0u,4u,13u,1u,11u,12u,5u,9u,14u,15u,8u); var p:array<u32,16>; for (var i:u32=0u;i<16u;i=i+1u){ p[i]=m[perm[i]]; } m=p; }
}
var o:array<u32,16>;
// Feed-forward: fold the two state halves together and mix the input cv into the upper half.
for (var i:u32=0u;i<8u;i=i+1u){ o[i]=v[i]^v[i+8u]; o[i+8u]=v[i+8u]^cv[i]; }
return o;
}
// Read the 64-byte block that starts at byte offset byteOff as 16 u32 words.
// No tail handling is needed here: the host zero-pads inp to a whole number of blocks.
fn loadBlock(byteOff: u32) -> array<u32,16> {
  let firstWord = byteOff / 4u;
  var block: array<u32,16>;
  for (var k: u32 = 0u; k < 16u; k = k + 1u) {
    block[k] = inp[firstWord + k];
  }
  return block;
}
// Chaining value (8 words) of the chunk covering bytes [start, start+len) with chunk index ctr.
// This is the non-root path: CHUNK_START (1) goes on the first block, CHUNK_END (2) on the
// last block, and every compression uses ctr as the low counter word.
fn chunkCV(start:u32, len:u32, ctr:u32) -> array<u32,8> {
  let blockCount = max(1u, (len + 63u) / 64u); // an empty chunk still has one block
  let lastBlock = blockCount - 1u;
  var cv: array<u32,8> = IV;
  for (var b: u32 = 0u; b < lastBlock; b = b + 1u) {
    let blockFlags = select(0u, 1u, b == 0u);
    let full = compress(cv, loadBlock(start + b * 64u), ctr, 0u, 64u, blockFlags);
    for (var w: u32 = 0u; w < 8u; w = w + 1u) {
      cv[w] = full[w];
    }
  }
  // A single-block chunk carries CHUNK_START and CHUNK_END on the same block.
  let startFlag = select(0u, 1u, lastBlock == 0u);
  let tailOff = start + lastBlock * 64u;
  let tailLen = len - lastBlock * 64u;
  let tail = compress(cv, loadBlock(tailOff), ctr, 0u, tailLen, startFlag | 2u);
  return array<u32,8>(tail[0], tail[1], tail[2], tail[3], tail[4], tail[5], tail[6], tail[7]);
}
// parent(left8, right8) -> 8 words: compresses the two child chaining values as one 64-byte
// block, starting from IV with counter 0 and the PARENT flag (4). When isRoot is true the
// ROOT flag (8) is added, so the result is the first 8 words of the root output.
// Deciding which merge is the root merge is done by the caller (main).
fn parentCV(l:array<u32,8>, r:array<u32,8>, isRoot:bool) -> array<u32,8> {
  let block = array<u32,16>(
    l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
    r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);
  let nodeFlags = select(4u, 12u, isRoot);
  let full = compress(IV, block, 0u, 0u, 64u, nodeFlags);
  return array<u32,8>(full[0], full[1], full[2], full[3], full[4], full[5], full[6], full[7]);
}
// Root output for a message with no parent nodes (main calls this only when its CV stack is empty).
// Same block walk as chunkCV, except the last compression uses counter 0 and carries
// CHUNK_END (2) together with ROOT (8).
fn chunkRoot(start:u32, len:u32, ctr:u32) -> array<u32,8> {
  let blockCount = max(1u, (len + 63u) / 64u);
  let lastBlock = blockCount - 1u;
  var cv: array<u32,8> = IV;
  for (var b: u32 = 0u; b < lastBlock; b = b + 1u) {
    let blockFlags = select(0u, 1u, b == 0u);
    let full = compress(cv, loadBlock(start + b * 64u), ctr, 0u, 64u, blockFlags);
    for (var w: u32 = 0u; w < 8u; w = w + 1u) {
      cv[w] = full[w];
    }
  }
  let startFlag = select(0u, 1u, lastBlock == 0u);
  let tailOff = start + lastBlock * 64u;
  let tailLen = len - lastBlock * 64u;
  let rootOut = compress(cv, loadBlock(tailOff), 0u, 0u, tailLen, startFlag | 2u | 8u);
  return array<u32,8>(rootOut[0], rootOut[1], rootOut[2], rootOut[3], rootOut[4], rootOut[5], rootOut[6], rootOut[7]);
}
// Entry point. A single invocation hashes the whole message and writes 8 digest words to outp.
// Every chunk except the last is reduced to a chaining value and pushed on a stack, merging
// completed subtrees as it goes; the last chunk is then merged down the stack, and only the
// final merge (or the chunk itself, if it is the only one) is compressed with ROOT.
@compute @workgroup_size(1)
fn main() {
let len = P.x;
// Stack of subtree chaining values. One entry remains per set bit of the number of chunks
// pushed so far, so 54 slots is far more than a u32 byte length can require.
var stack: array<array<u32,8>, 54>;
var depth:u32 = 0u;
// An empty message still counts as one (empty) chunk.
let totalChunks = select((len + 1023u) / 1024u, 1u, len == 0u);
let flushed = totalChunks - 1u; // chunks 0..flushed-1 go to the stack; last is final
// build the stack from the flushed chunks
for (var c:u32=0u; c<flushed; c=c+1u) {
var cv = chunkCV(c*1024u, 1024u, c);
var t = c + 1u; // add(cv, totalChunks=c+1)
// Each trailing zero bit of the new chunk count is a completed subtree: pop its left
// sibling and merge. The loop stops at the first set bit.
loop { if ((t & 1u) != 0u) { break; } depth=depth-1u; cv = parentCV(stack[depth], cv, false); t = t >> 1u; }
stack[depth] = cv; depth = depth + 1u;
}
let finalOff = flushed * 1024u;
let finalLen = len - finalOff; // 0..1024
var root: array<u32,8>;
if (depth == 0u) {
root = chunkRoot(finalOff, finalLen, flushed); // single chunk -> its own root
} else {
// final chunk's CV, then merge down the stack; the LAST merge is ROOT
var node = chunkCV(finalOff, finalLen, flushed);
for (var i:u32=0u; i<depth; i=i+1u) {
let li = depth - 1u - i;
root = parentCV(stack[li], node, i == depth-1u); // top merge = ROOT
node = root;
}
}
// Publish the digest words; the host serializes each one little-endian.
for (var i:u32=0u;i<8u;i=i+1u){ outp[i] = root[i]; }
}
`;
// ─────────────────── WebGPU harness ───────────────────
// Tiny query helper plus the three page elements the harness writes into:
// the status line, the results table body, and the summary area.
const $ = (selector) => document.querySelector(selector);
const st = $("#status");
const tb = $("#tbl tbody");
const res = $("#result");
/**
 * Serialize the first 8 entries of `words` as hex, each word emitted least-significant
 * byte first, two lowercase hex digits per byte (64 characters in total).
 */
function hexLE(words) {
  const pairs = [];
  for (let i = 0; i < 8; i += 1) {
    const word = words[i] >>> 0;
    for (let shift = 0; shift < 32; shift += 8) {
      pairs.push(((word >>> shift) & 255).toString(16).padStart(2, "0"));
    }
  }
  return pairs.join("");
}
// Harness entry point: sets up WebGPU, compiles the WGSL kernel, hashes a series of
// boundary-sized inputs on the GPU, and compares each digest with the CPU oracle.
(async () => {
  /** Show a failure message in the status line and colour it as an error. */
  const fail = (message) => {
    st.textContent = message;
    st.classList.add("no");
  };

  if (!navigator.gpu) {
    fail("✗ No WebGPU in this browser. Open in Chrome/Edge (or enable WebGPU).");
    return;
  }

  let adapter;
  let device;
  try {
    adapter = await navigator.gpu.requestAdapter();
    // requestAdapter() resolves with null (it does not reject) when no adapter is available.
    if (!adapter) {
      fail("✗ WebGPU device error: no suitable GPU adapter found");
      return;
    }
    device = await adapter.requestDevice();
  } catch (e) {
    fail(`✗ WebGPU device error: ${e.message}`);
    return;
  }

  let module;
  try {
    module = device.createShaderModule({ code: WGSL });
    const info = await module.getCompilationInfo();
    const firstError = info.messages.find((m) => m.type === "error");
    if (firstError) {
      fail(`✗ WGSL compile error: ${firstError.message} (line ${firstError.lineNum})`);
      return;
    }
  } catch (e) {
    fail(`✗ shader error: ${e.message}`);
    return;
  }

  const pipeline = device.createComputePipeline({ layout: "auto", compute: { module, entryPoint: "main" } });

  /**
   * Hash `bytes` with the WGSL kernel.
   * @param {Uint8Array} bytes - message to hash.
   * @returns {Promise<string>} the digest as 64 hex characters.
   */
  async function gpuHash(bytes) {
    const len = bytes.length;
    // u32 count: whole 64-byte blocks, ≥ 1 block, so the shader can read full blocks directly.
    const wordCount = Math.max(1, Math.ceil(len / 64)) * 16;
    const padded = new Uint32Array(wordCount);
    new Uint8Array(padded.buffer).set(bytes);

    // Every buffer is tracked so it is released even if submission or mapping fails.
    const owned = [];
    const makeBuffer = (size, usage) => {
      const buffer = device.createBuffer({ size, usage });
      owned.push(buffer);
      return buffer;
    };

    try {
      const inBuf = makeBuffer(padded.byteLength, GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST);
      const outBuf = makeBuffer(32, GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC);
      const parBuf = makeBuffer(16, GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST);
      const stg = makeBuffer(32, GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ);
      device.queue.writeBuffer(inBuf, 0, padded);
      device.queue.writeBuffer(parBuf, 0, new Uint32Array([len, 0, 0, 0]));

      const bg = device.createBindGroup({
        layout: pipeline.getBindGroupLayout(0),
        entries: [
          { binding: 0, resource: { buffer: inBuf } },
          { binding: 1, resource: { buffer: outBuf } },
          { binding: 2, resource: { buffer: parBuf } },
        ],
      });

      const enc = device.createCommandEncoder();
      const p = enc.beginComputePass();
      p.setPipeline(pipeline);
      p.setBindGroup(0, bg);
      p.dispatchWorkgroups(1);
      p.end();
      // Storage buffers cannot be mapped for reading, so copy the digest to a staging buffer.
      enc.copyBufferToBuffer(outBuf, 0, stg, 0, 32);
      device.queue.submit([enc.finish()]);

      await stg.mapAsync(GPUMapMode.READ);
      // slice(0) copies the bytes out before the mapping is released.
      const digestWords = new Uint32Array(stg.getMappedRange().slice(0));
      stg.unmap();
      return hexLE(digestWords);
    } finally {
      for (const buffer of owned) {
        buffer.destroy();
      }
    }
  }

  const sizes = [0, 1, 63, 64, 65, 127, 128, 129, 1023, 1024, 1025, 2047, 2048, 2049, 4096, 10000, 65536, 100000];
  // official empty-input vector, an extra ground-truth anchor
  const OFFICIAL_EMPTY = "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262";
  st.textContent = `running ${sizes.length} boundary cases on your GPU…`;

  let pass = 0;
  try {
    for (const n of sizes) {
      const bytes = new Uint8Array(n);
      for (let i = 0; i < n; i += 1) {
        bytes[i] = i & 255;
      }
      const cpu = blake3hex(bytes);
      const gpu = await gpuHash(bytes);
      const ok = gpu === cpu && (n !== 0 || cpu === OFFICIAL_EMPTY);
      if (ok) {
        pass += 1;
      }

      const tr = document.createElement("tr");
      tr.innerHTML = `<td>${n === 0 ? "empty (official vector)" : "pattern"}</td><td class="n">${n.toLocaleString()}</td><td class="hx">${gpu.slice(0, 32)}…</td><td class="${ok ? "ok" : "no"}">${ok ? "✓ match" : "✗ MISMATCH"}</td>`;
      tb.appendChild(tr);

      if (!ok) {
        // Append rather than overwrite, so every failing size stays visible.
        res.insertAdjacentHTML(
          "beforeend",
          `<div class="big no">✗ MISMATCH at ${n} bytes</div><div class="hx">gpu ${gpu}<br>cpu ${cpu}</div>`,
        );
      }
    }
  } catch (e) {
    // Without this the status line would stay at "running…" forever.
    fail(`✗ GPU run failed after ${pass} passing case(s): ${e.message}`);
    return;
  }

  const allPassed = pass === sizes.length;
  st.textContent = `done · ${pass}/${sizes.length} cases`;
  st.classList.add(allPassed ? "ok" : "no");
  if (allPassed) {
    res.innerHTML = `<div class="big ok">✓ BLAKE3 runs entirely on your GPU — byte-identical to the CPU oracle across every block, chunk, and tree boundary.</div><div class="hx" style="margin-top:6px"></div>`;
    // Adapter strings come from the browser; insert them as text, not markup.
    const adapterLabel = adapter.info ? `${adapter.info.vendor} ${adapter.info.architecture}` : "(info withheld)";
    res.lastElementChild.textContent = `Adapter: ${adapterLabel}`;
  }
})();
</script></body></html>