(function(){const e=document.createElement("link").relList;if(e&&e.supports&&e.supports("modulepreload"))return;for(const s of document.querySelectorAll('link[rel="modulepreload"]'))n(s);new MutationObserver(s=>{for(const r of s)if(r.type==="childList")for(const a of r.addedNodes)a.tagName==="LINK"&&a.rel==="modulepreload"&&n(a)}).observe(document,{childList:!0,subtree:!0});function t(s){const r={};return s.integrity&&(r.integrity=s.integrity),s.referrerPolicy&&(r.referrerPolicy=s.referrerPolicy),s.crossOrigin==="use-credentials"?r.credentials="include":s.crossOrigin==="anonymous"?r.credentials="omit":r.credentials="same-origin",r}function n(s){if(s.ep)return;s.ep=!0;const r=t(s);fetch(s.href,r)}})();var N=1179993927,oe=0,Q=1,ue=16,de=17,ce=18,fe=27,le=28,he=30,pe=34,me=35,R=36;function ge(i){switch(i){case oe:return 4;case Q:return 2;case ue:return 1;case de:return 2;case ce:return 4;case fe:return 8;case le:return 8;case he:return 2;case pe:return 54/256;case me:return 66/256;case R:return .25;default:throw new Error(`Unsupported GGML type: ${i}`)}}var _e=class{view;offset;textDecoder=new TextDecoder("utf-8");constructor(i){this.view=new DataView(i),this.offset=0}parse(){const i=this.readHeader(),e=this.readMetadata(Number(i.metadataKVCount)),t=this.readTensorInfos(Number(i.tensorCount)),n=e["general.alignment"]||32,s=Math.ceil(this.offset/n)*n;return{header:i,metadata:e,tensors:t,tensorDataOffset:s}}readHeader(){const i=this.readU32();if(i!==N)throw new Error(`Invalid GGUF magic: 0x${i.toString(16)} (expected 0x${N.toString(16)})`);const e=this.readU32();if(e<2||e>3)throw new Error(`Unsupported GGUF version: ${e}`);const t=this.readU64(),n=this.readU64();return{magic:i,version:e,tensorCount:t,metadataKVCount:n}}readMetadata(i){const e={};for(let t=0;t=t)return a.inUse=!0,a.buffer}const s=this.device.createBuffer({size:t,usage:e}),r={buffer:s,size:t,inUse:!0};return this.bufferToEntry.set(s,r),n?n.push(r):this.pools.set(e,[r]),s}release(i){const e=this.bufferToEntry.get(i);e&&(e.inUse=!1)}destroy(){for(const i of this.pools.values())for(const e of i)e.buffer.destroy();this.pools.clear(),this.bufferToEntry.clear()}};function G(){return new Map}function B(i){i.clear()}function v(i,e,t,n,s){const r=s.map(d=>d.resource.buffer),a=i.get(t);if(a&&a.bufs.length===r.length){let d=!0;for(let u=0;u input: array; @group(0) @binding(1) var weight: array; @group(0) @binding(2) var output: array; @group(0) @binding(3) var params: Params; const WORKGROUP_SIZE: u32 = 256u; var shared_sum: array; @compute @workgroup_size(256) fn main( @builtin(workgroup_id) wg_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { let row = wg_id.x; if (row >= params.N) { return; } let tid = local_id.x; let row_offset = row * params.D; // Pass 1: Sum of squares var local_sum: f32 = 0.0; for (var col = tid; col < params.D; col += WORKGROUP_SIZE) { let val = input[row_offset + col]; local_sum += val * val; } shared_sum[tid] = local_sum; workgroupBarrier(); for (var stride = WORKGROUP_SIZE / 2u; stride > 0u; stride >>= 1u) { if (tid < stride) { shared_sum[tid] += shared_sum[tid + stride]; } workgroupBarrier(); } let rms = inverseSqrt(shared_sum[0] / f32(params.D) + params.eps); // Pass 2: Normalize for (var col = tid; col < params.D; col += WORKGROUP_SIZE) { output[row_offset + col] = input[row_offset + col] * rms * weight[col]; } } `,J=`// Elementwise operations: add, multiply // // Used for residual connections and gating. // // Layout: // a: [N] f32 // b: [N] f32 // output: [N] f32 struct Params { N: u32, op: u32, // 0 = add, 1 = multiply } @group(0) @binding(0) var a: array; @group(0) @binding(1) var b: array; @group(0) @binding(2) var output: array; @group(0) @binding(3) var params: Params; @compute @workgroup_size(256) fn main( @builtin(global_invocation_id) gid: vec3, ) { let idx = gid.x; if (idx >= params.N) { return; } if (params.op == 0u) { output[idx] = a[idx] + b[idx]; } else { output[idx] = a[idx] * b[idx]; } } `,ke=class{device;pipelines;pool;config;inputLayerNorm;postAttnLayerNorm;attention;ffn;decodeNormUniform;decodeAddUniform;bgCache=G();constructor(i,e,t,n,s,r,a,o){this.device=i,this.pipelines=e,this.pool=t,this.config=n,this.inputLayerNorm=s,this.postAttnLayerNorm=r,this.attention=a,this.ffn=o}initDecodeUniforms(i){{const e=new ArrayBuffer(12),t=new DataView(e);t.setUint32(0,1,!0),t.setUint32(4,this.config.hiddenSize,!0),t.setFloat32(8,this.config.rmsNormEps,!0),this.decodeNormUniform=this.createUniform(e)}{const e=new ArrayBuffer(8),t=new DataView(e);t.setUint32(0,this.config.hiddenSize,!0),t.setUint32(4,0,!0),this.decodeAddUniform=this.createUniform(e)}this.attention.initDecodeUniforms(i),this.ffn.initDecodeUniforms()}forward(i,e,t,n){const s=this.config.hiddenSize,r=this.dispatchRMSNorm(n,i,this.inputLayerNorm,e,"attnNorm"),a=this.attention.forward(r,e,t,n);this.pool.release(r);const o=this.dispatchAdd(n,i,a,e*s,e,"attnAdd");this.pool.release(a);const d=this.dispatchRMSNorm(n,o,this.postAttnLayerNorm,e,"ffnNorm"),u=this.ffn.forward(d,e,n);this.pool.release(d);const c=this.dispatchAdd(n,o,u,e*s,e,"ffnAdd");return this.pool.release(o),this.pool.release(u),c}dispatchRMSNorm(i,e,t,n,s){const{pipeline:r,bindGroupLayout:a}=this.pipelines.getOrCreate("rmsnorm",O),o=this.config.hiddenSize,d=this.pool.acquire(n*o*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC);let u;if(n===1&&this.decodeNormUniform)u=this.decodeNormUniform;else{const p=new ArrayBuffer(12),l=new DataView(p);l.setUint32(0,n,!0),l.setUint32(4,o,!0),l.setFloat32(8,this.config.rmsNormEps,!0),u=this.createUniform(p)}const c=[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:t}},{binding:2,resource:{buffer:d}},{binding:3,resource:{buffer:u}}],f=n===1&&s?v(this.bgCache,this.device,s,a,c):this.device.createBindGroup({layout:a,entries:c}),h=i.beginComputePass();return h.setPipeline(r),h.setBindGroup(0,f),h.dispatchWorkgroups(n),h.end(),d}dispatchAdd(i,e,t,n,s,r){const{pipeline:a,bindGroupLayout:o}=this.pipelines.getOrCreate("elementwise_0",J),d=this.pool.acquire(n*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC);let u;if(s===1&&this.decodeAddUniform)u=this.decodeAddUniform;else{const p=new ArrayBuffer(8),l=new DataView(p);l.setUint32(0,n,!0),l.setUint32(4,0,!0),u=this.createUniform(p)}const c=[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:t}},{binding:2,resource:{buffer:d}},{binding:3,resource:{buffer:u}}],f=s===1&&r?v(this.bgCache,this.device,r,o,c):this.device.createBindGroup({layout:o,entries:c}),h=i.beginComputePass();return h.setPipeline(a),h.setBindGroup(0,f),h.dispatchWorkgroups(Math.ceil(n/256)),h.end(),d}clearBGCache(){B(this.bgCache),this.attention.clearBGCache(),this.ffn.clearBGCache()}destroyPreAllocated(){this.attention.destroyPreAllocated()}createUniform(i){const e=Math.max(Math.ceil(i.byteLength/4)*4,4),t=this.device.createBuffer({size:e,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,mappedAtCreation:!0});return new Uint8Array(t.getMappedRange()).set(new Uint8Array(i)),t.unmap(),t}},Be=`// Per-token absmax activation quantization: f32 → int8 // // Two-pass approach: // Pass 1: Compute absmax per row (token) // Pass 2: Scale and round to [-127, 127] // // This shader combines both passes using workgroup reduction. // // Layout: // input: [N, D] f32 // output: [N, D] i32 (int8 stored as i32 for compute compatibility) // scales: [N] f32 (per-token absmax / 127) struct Params { N: u32, // number of tokens D: u32, // hidden dimension } @group(0) @binding(0) var input: array; @group(0) @binding(1) var output: array; @group(0) @binding(2) var scales: array; @group(0) @binding(3) var params: Params; const WORKGROUP_SIZE: u32 = 256u; var shared_max: array; @compute @workgroup_size(256) fn main( @builtin(workgroup_id) wg_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { let row = wg_id.x; if (row >= params.N) { return; } let tid = local_id.x; let row_offset = row * params.D; // Pass 1: Find absmax var local_max: f32 = 0.0; for (var col = tid; col < params.D; col += WORKGROUP_SIZE) { local_max = max(local_max, abs(input[row_offset + col])); } shared_max[tid] = local_max; workgroupBarrier(); // Reduction for max for (var stride = WORKGROUP_SIZE / 2u; stride > 0u; stride >>= 1u) { if (tid < stride) { shared_max[tid] = max(shared_max[tid], shared_max[tid + stride]); } workgroupBarrier(); } let absmax = shared_max[0]; let scale = select(absmax / 127.0, 1.0, absmax == 0.0); if (tid == 0u) { scales[row] = scale; } workgroupBarrier(); // Pass 2: Quantize let inv_scale = select(127.0 / absmax, 0.0, absmax == 0.0); for (var col = tid; col < params.D; col += WORKGROUP_SIZE) { let val = input[row_offset + col]; let quantized = clamp(i32(round(val * inv_scale)), -127, 127); output[row_offset + col] = quantized; } } `,Pe=`// Ternary GEMV: packed ternary weights × int8 activations → i32 accumulator // // Weight packing (I2_S / Eddie-Wang1120 llama.cpp fork): // 128-element block interleaving for SIMD. Each 32-byte block stores 128 elements // in 4 groups of 32. Byte[gp] within a block stores: // bits[7:6] = element at group0 (offset 0*32 + gp) // bits[5:4] = element at group1 (offset 1*32 + gp) // bits[3:2] = element at group2 (offset 2*32 + gp) // bits[1:0] = element at group3 (offset 3*32 + gp) // code mapping: {0=-1, 1=0, 2=+1} // // Layout: // weights: [M, K/16] u32 (packed ternary) // input: [K] i32 (int8 stored as i32) // scales: [M] f32 (per-row weight scale) // input_scale: f32 (activation absmax scale) // output: [M] f32 // // Each workgroup processes one output row. // Threads cooperatively reduce over the K dimension. struct Params { M: u32, // output rows K: u32, // input dimension (unpacked) K_packed: u32, // K / 16 } @group(0) @binding(0) var weights: array; @group(0) @binding(1) var input: array; @group(0) @binding(2) var scales: array; @group(0) @binding(3) var params: Params; @group(0) @binding(4) var input_scale: f32; @group(0) @binding(5) var output: array; const WORKGROUP_SIZE: u32 = 256u; var shared_sums: array; @compute @workgroup_size(256) fn main( @builtin(workgroup_id) wg_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { let row = wg_id.x; if (row >= params.M) { return; } let tid = local_id.x; let row_offset = row * params.K_packed; var acc: i32 = 0; // Each thread processes a strided slice of packed u32 columns for (var col = tid; col < params.K_packed; col += WORKGROUP_SIZE) { let packed = weights[row_offset + col]; // I2_S block interleaving: 128 elements per 32-byte (8 u32) block let block = col / 8u; let base_gp = (col % 8u) * 4u; // Unpack 16 ternary weights from this u32 and dot with input for (var i = 0u; i < 16u; i++) { let byte_in_u32 = i / 4u; let group = i % 4u; let gp = base_gp + byte_in_u32; let k_idx = block * 128u + group * 32u + gp; if (k_idx < params.K) { let shift = byte_in_u32 * 8u + (6u - 2u * group); let code = (packed >> shift) & 3u; let w = i32(code) - 1; acc += w * input[k_idx]; } } } // Workgroup reduction shared_sums[tid] = acc; workgroupBarrier(); // Tree reduction for (var stride = WORKGROUP_SIZE / 2u; stride > 0u; stride >>= 1u) { if (tid < stride) { shared_sums[tid] += shared_sums[tid + stride]; } workgroupBarrier(); } // Thread 0 writes the dequantized result if (tid == 0u) { let sum = f32(shared_sums[0]); output[row] = sum * scales[row] * input_scale; } } `,Se=`// Ternary GEMM: batched matrix multiply for prompt processing // Output[N,M] = Input[N,K] × TernaryWeights[M,K]^T // // Weight packing (I2_S / Eddie-Wang1120 llama.cpp fork): // 128-element block interleaving. Each 32-byte block stores 128 elements // in 4 groups of 32. Byte[gp] within a block stores: // bits[7:6] = group0 (offset 0*32+gp), bits[5:4] = group1 (offset 1*32+gp) // bits[3:2] = group2 (offset 2*32+gp), bits[1:0] = group3 (offset 3*32+gp) // code mapping: {0=-1, 1=0, 2=+1} // Input: int8 activations stored as i32 // Output: f32 (dequantized) // // 2D tiling: 16×16 workgroup, 4×4 per-thread output tile struct Params { M: u32, // output rows (weight rows) N: u32, // output cols (batch / seq_len) K: u32, // inner dimension (unpacked) K_packed: u32, // K / 16 } @group(0) @binding(0) var weights: array; @group(0) @binding(1) var input: array; @group(0) @binding(2) var scales: array; @group(0) @binding(3) var params: Params; @group(0) @binding(4) var input_scales: array; @group(0) @binding(5) var output: array; const TILE_M: u32 = 64u; // rows per workgroup const TILE_N: u32 = 64u; // cols per workgroup const TILE_K: u32 = 32u; // K-tile for shared memory (unpacked units) const THREADS_M: u32 = 16u; const THREADS_N: u32 = 16u; const THREAD_TILE_M: u32 = 4u; // TILE_M / THREADS_M const THREAD_TILE_N: u32 = 4u; // TILE_N / THREADS_N var shared_w: array; // TILE_M × TILE_K var shared_x: array; // TILE_K × TILE_N @compute @workgroup_size(16, 16) fn main( @builtin(workgroup_id) wg_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { let wg_row = wg_id.x * TILE_M; let wg_col = wg_id.y * TILE_N; let tid_m = local_id.x; let tid_n = local_id.y; // Per-thread accumulators (4×4 tile) var acc: array; // THREAD_TILE_M × THREAD_TILE_N for (var i = 0u; i < 16u; i++) { acc[i] = 0; } // Loop over K in tiles let k_tiles = (params.K + TILE_K - 1u) / TILE_K; for (var kt = 0u; kt < k_tiles; kt++) { let k_base = kt * TILE_K; // Cooperatively load weight tile into shared memory let linear_id = tid_m * THREADS_N + tid_n; let load_count = (TILE_M * TILE_K) / (THREADS_M * THREADS_N); for (var ld = 0u; ld < load_count; ld++) { let idx = linear_id + ld * (THREADS_M * THREADS_N); let local_row = idx / TILE_K; let local_col = idx % TILE_K; let global_row = wg_row + local_row; let global_k = k_base + local_col; var w_val: i32 = 0; if (global_row < params.M && global_k < params.K) { // I2_S 128-element block interleaving let block = global_k / 128u; let pos = global_k % 128u; let group = pos / 32u; let gp = pos % 32u; let u32_idx = block * 8u + gp / 4u; let byte_in_u32 = gp % 4u; let shift = byte_in_u32 * 8u + (6u - 2u * group); let packed = weights[global_row * params.K_packed + u32_idx]; let code = (packed >> shift) & 3u; w_val = i32(code) - 1; } shared_w[local_row * TILE_K + local_col] = w_val; } // Cooperatively load input tile into shared memory let load_count_x = (TILE_K * TILE_N) / (THREADS_M * THREADS_N); for (var ld = 0u; ld < load_count_x; ld++) { let idx = linear_id + ld * (THREADS_M * THREADS_N); let local_k = idx / TILE_N; let local_col = idx % TILE_N; let global_k = k_base + local_k; let global_col = wg_col + local_col; var x_val: i32 = 0; if (global_k < params.K && global_col < params.N) { x_val = input[global_col * params.K + global_k]; } shared_x[local_k * TILE_N + local_col] = x_val; } workgroupBarrier(); // Compute per-thread 4×4 accumulation for (var k = 0u; k < TILE_K; k++) { for (var tm = 0u; tm < THREAD_TILE_M; tm++) { let w = shared_w[(tid_m * THREAD_TILE_M + tm) * TILE_K + k]; for (var tn = 0u; tn < THREAD_TILE_N; tn++) { let x = shared_x[k * TILE_N + tid_n * THREAD_TILE_N + tn]; acc[tm * THREAD_TILE_N + tn] += w * x; } } } workgroupBarrier(); } // Write results with dequantization for (var tm = 0u; tm < THREAD_TILE_M; tm++) { let global_row = wg_row + tid_m * THREAD_TILE_M + tm; if (global_row >= params.M) { continue; } let w_scale = scales[global_row]; for (var tn = 0u; tn < THREAD_TILE_N; tn++) { let global_col = wg_col + tid_n * THREAD_TILE_N + tn; if (global_col >= params.N) { continue; } let scale = w_scale * input_scales[global_col]; output[global_col * params.M + global_row] = f32(acc[tm * THREAD_TILE_N + tn]) * scale; } } } `,w=class{device;pipelines;pool;packedWeights;weightScales;normWeight;inDim;outDim;kPacked;decodeNormUniform;decodeQuantUniform;decodeGemvParamsUniform;decodeGemvScaleUniform;bgCache=G();constructor(i,e,t,n,s,r,a,o){this.device=i,this.pipelines=e,this.pool=t,this.packedWeights=n,this.weightScales=s,this.normWeight=r,this.inDim=a,this.outDim=o,this.kPacked=Math.ceil(a/16)}initDecodeUniforms(){if(this.normWeight){const i=new ArrayBuffer(12),e=new DataView(i);e.setUint32(0,1,!0),e.setUint32(4,this.inDim,!0),e.setFloat32(8,1e-5,!0),this.decodeNormUniform=this.createUniformBuffer(i)}{const i=new ArrayBuffer(8),e=new DataView(i);e.setUint32(0,1,!0),e.setUint32(4,this.inDim,!0),this.decodeQuantUniform=this.createUniformBuffer(i)}{const i=new ArrayBuffer(12),e=new DataView(i);e.setUint32(0,this.outDim,!0),e.setUint32(4,this.inDim,!0),e.setUint32(8,this.kPacked,!0),this.decodeGemvParamsUniform=this.createUniformBuffer(i)}this.decodeGemvScaleUniform=this.device.createBuffer({size:4,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST})}forward(i,e,t){let n;this.normWeight?(n=this.pool.acquire(e*this.inDim*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this.dispatchRMSNorm(t,i,n,e)):n=i;const s=this.pool.acquire(e*this.inDim*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),r=this.pool.acquire(e*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.UNIFORM);this.dispatchQuantize(t,n,s,r,e);const a=this.pool.acquire(e*this.outDim*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC);return e===1?this.dispatchGEMV(t,s,r,a):this.dispatchGEMM(t,s,r,a,e),this.normWeight&&this.pool.release(n),this.pool.release(s),this.pool.release(r),a}dispatchRMSNorm(i,e,t,n){const{pipeline:s,bindGroupLayout:r}=this.pipelines.getOrCreate("rmsnorm",O);let a;if(n===1&&this.decodeNormUniform)a=this.decodeNormUniform;else{const c=new ArrayBuffer(12),f=new DataView(c);f.setUint32(0,n,!0),f.setUint32(4,this.inDim,!0),f.setFloat32(8,1e-5,!0),a=this.createUniformBuffer(c)}const o=[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:this.normWeight}},{binding:2,resource:{buffer:t}},{binding:3,resource:{buffer:a}}],d=n===1?v(this.bgCache,this.device,"rmsnorm",r,o):this.device.createBindGroup({layout:r,entries:o}),u=i.beginComputePass();u.setPipeline(s),u.setBindGroup(0,d),u.dispatchWorkgroups(n),u.end()}dispatchQuantize(i,e,t,n,s){const{pipeline:r,bindGroupLayout:a}=this.pipelines.getOrCreate("quantize",Be);let o;if(s===1&&this.decodeQuantUniform)o=this.decodeQuantUniform;else{const f=new ArrayBuffer(8),h=new DataView(f);h.setUint32(0,s,!0),h.setUint32(4,this.inDim,!0),o=this.createUniformBuffer(f)}const d=[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:t}},{binding:2,resource:{buffer:n}},{binding:3,resource:{buffer:o}}],u=s===1?v(this.bgCache,this.device,"quantize",a,d):this.device.createBindGroup({layout:a,entries:d}),c=i.beginComputePass();c.setPipeline(r),c.setBindGroup(0,u),c.dispatchWorkgroups(s),c.end()}dispatchGEMV(i,e,t,n){const{pipeline:s,bindGroupLayout:r}=this.pipelines.getOrCreate("ternary_gemv",Pe);let a,o;if(this.decodeGemvParamsUniform&&this.decodeGemvScaleUniform)a=this.decodeGemvParamsUniform,o=this.decodeGemvScaleUniform;else{const f=new ArrayBuffer(12),h=new DataView(f);h.setUint32(0,this.outDim,!0),h.setUint32(4,this.inDim,!0),h.setUint32(8,this.kPacked,!0),a=this.createUniformBuffer(f),o=this.createUniformBuffer(new ArrayBuffer(4))}i.copyBufferToBuffer(t,0,o,0,4);const d=[{binding:0,resource:{buffer:this.packedWeights}},{binding:1,resource:{buffer:e}},{binding:2,resource:{buffer:this.weightScales}},{binding:3,resource:{buffer:a}},{binding:4,resource:{buffer:o}},{binding:5,resource:{buffer:n}}],u=v(this.bgCache,this.device,"gemv",r,d),c=i.beginComputePass();c.setPipeline(s),c.setBindGroup(0,u),c.dispatchWorkgroups(this.outDim),c.end()}dispatchGEMM(i,e,t,n,s){const{pipeline:r,bindGroupLayout:a}=this.pipelines.getOrCreate("ternary_gemm",Se),o=new ArrayBuffer(16),d=new DataView(o);d.setUint32(0,this.outDim,!0),d.setUint32(4,s,!0),d.setUint32(8,this.inDim,!0),d.setUint32(12,this.kPacked,!0);const u=this.createUniformBuffer(o),c=this.device.createBindGroup({layout:a,entries:[{binding:0,resource:{buffer:this.packedWeights}},{binding:1,resource:{buffer:e}},{binding:2,resource:{buffer:this.weightScales}},{binding:3,resource:{buffer:u}},{binding:4,resource:{buffer:t}},{binding:5,resource:{buffer:n}}]}),f=Math.ceil(this.outDim/64),h=Math.ceil(s/64),p=i.beginComputePass();p.setPipeline(r),p.setBindGroup(0,c),p.dispatchWorkgroups(f,h),p.end()}clearBGCache(){B(this.bgCache)}createUniformBuffer(i){const e=Math.max(Math.ceil(i.byteLength/4)*4,4),t=this.device.createBuffer({size:e,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,mappedAtCreation:!0});return new Uint8Array(t.getMappedRange()).set(new Uint8Array(i)),t.unmap(),t}},Ge=`// Rotary Position Embeddings (RoPE) // // For each pair (x[2i], x[2i+1]) at position \`pos\`: // theta = pos * base^(-2i/D) // out[2i] = x[2i] * cos(theta) - x[2i+1] * sin(theta) // out[2i+1] = x[2i] * sin(theta) + x[2i+1] * cos(theta) // // Layout: // input: [N, num_heads, head_dim] f32 // output: [N, num_heads, head_dim] f32 // Dispatched per (token, head, pair) struct Params { N: u32, // sequence length num_heads: u32, head_dim: u32, pos_offset: u32, // starting position (for KV-cache continuation) theta_base: f32, // default 10000.0 or 500000.0 } @group(0) @binding(0) var input: array; @group(0) @binding(1) var output: array; @group(0) @binding(2) var params: Params; @compute @workgroup_size(256) fn main( @builtin(global_invocation_id) gid: vec3, ) { let half_dim = params.head_dim / 2u; let total_pairs = params.N * params.num_heads * half_dim; let pair_idx = gid.x; if (pair_idx >= total_pairs) { return; } // Decompose linear index into (token, head, dim_pair) let dim_pair = pair_idx % half_dim; let remainder = pair_idx / half_dim; let head = remainder % params.num_heads; let token = remainder / params.num_heads; let pos = f32(token + params.pos_offset); let freq_exp = -2.0 * f32(dim_pair) / f32(params.head_dim); let theta = pos * pow(params.theta_base, freq_exp); let cos_theta = cos(theta); let sin_theta = sin(theta); let base_idx = (token * params.num_heads + head) * params.head_dim + dim_pair * 2u; let x0 = input[base_idx]; let x1 = input[base_idx + 1u]; output[base_idx] = x0 * cos_theta - x1 * sin_theta; output[base_idx + 1u] = x0 * sin_theta + x1 * cos_theta; } `,xe=`// Numerically stable softmax // // For each row: // 1. Find max value (for numerical stability) // 2. Compute sum of exp(x - max) // 3. Normalize: out[i] = exp(x[i] - max) / sum // // Layout: // input: [N, D] f32 // output: [N, D] f32 struct Params { N: u32, D: u32, } @group(0) @binding(0) var input: array; @group(0) @binding(1) var output: array; @group(0) @binding(2) var params: Params; const WORKGROUP_SIZE: u32 = 256u; var shared_val: array; @compute @workgroup_size(256) fn main( @builtin(workgroup_id) wg_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { let row = wg_id.x; if (row >= params.N) { return; } let tid = local_id.x; let row_offset = row * params.D; // Pass 1: Find max var local_max: f32 = -3.402823e+38; // -FLT_MAX for (var col = tid; col < params.D; col += WORKGROUP_SIZE) { local_max = max(local_max, input[row_offset + col]); } shared_val[tid] = local_max; workgroupBarrier(); for (var stride = WORKGROUP_SIZE / 2u; stride > 0u; stride >>= 1u) { if (tid < stride) { shared_val[tid] = max(shared_val[tid], shared_val[tid + stride]); } workgroupBarrier(); } let row_max = shared_val[0]; workgroupBarrier(); // Pass 2: Sum of exp(x - max) var local_sum: f32 = 0.0; for (var col = tid; col < params.D; col += WORKGROUP_SIZE) { local_sum += exp(input[row_offset + col] - row_max); } shared_val[tid] = local_sum; workgroupBarrier(); for (var stride = WORKGROUP_SIZE / 2u; stride > 0u; stride >>= 1u) { if (tid < stride) { shared_val[tid] += shared_val[tid + stride]; } workgroupBarrier(); } let inv_sum = 1.0 / shared_val[0]; workgroupBarrier(); // Pass 3: Normalize for (var col = tid; col < params.D; col += WORKGROUP_SIZE) { output[row_offset + col] = exp(input[row_offset + col] - row_max) * inv_sum; } } `,$=`// Standard f32 attention matmul kernels // // Two operations: // 1. scores = Q @ K^T * scale (score computation) // 2. output = attn_weights @ V (value aggregation) // // These use standard f32 matmul (not ternary) because Q,K,V are // already projected through BitLinear and are f32 activations. // ─── Kernel 1: Q @ K^T (score computation) ─── // Q: [N, num_heads, head_dim] // K: [S, num_kv_heads, head_dim] (S = total seq including cache) // scores: [num_heads, N, S] struct ScoreParams { N: u32, // query seq length S: u32, // key seq length (including cache) num_heads: u32, num_kv_heads: u32, head_dim: u32, scale: f32, // 1/sqrt(head_dim) } @group(0) @binding(0) var Q: array; @group(0) @binding(1) var K: array; @group(0) @binding(2) var scores: array; @group(0) @binding(3) var params: ScoreParams; @compute @workgroup_size(16, 16) fn compute_scores( @builtin(global_invocation_id) gid: vec3, ) { // gid.x = query position, gid.y = key position, gid.z = head let q_pos = gid.x; let k_pos = gid.y; let head = gid.z; if (q_pos >= params.N || k_pos >= params.S || head >= params.num_heads) { return; } // GQA: map attention head to KV head let kv_head = head / (params.num_heads / params.num_kv_heads); let q_offset = (q_pos * params.num_heads + head) * params.head_dim; let k_offset = (k_pos * params.num_kv_heads + kv_head) * params.head_dim; var dot: f32 = 0.0; for (var d = 0u; d < params.head_dim; d++) { dot += Q[q_offset + d] * K[k_offset + d]; } // Causal mask: positions after query are -inf let is_causal = k_pos > q_pos + (params.S - params.N); let masked_score = select(dot * params.scale, -3.402823e+38, is_causal); let score_idx = (head * params.N + q_pos) * params.S + k_pos; scores[score_idx] = masked_score; } // ─── Kernel 2: Attention weights @ V ─── // attn: [num_heads, N, S] // V: [S, num_kv_heads, head_dim] // output: [N, num_heads, head_dim] struct AttnVParams { N: u32, S: u32, num_heads: u32, num_kv_heads: u32, head_dim: u32, } @group(0) @binding(0) var attn: array; @group(0) @binding(1) var V: array; @group(0) @binding(2) var attn_output: array; @group(0) @binding(3) var attn_v_params: AttnVParams; @compute @workgroup_size(256) fn attn_v( @builtin(global_invocation_id) gid: vec3, ) { let total = attn_v_params.N * attn_v_params.num_heads * attn_v_params.head_dim; let idx = gid.x; if (idx >= total) { return; } let d = idx % attn_v_params.head_dim; let remainder = idx / attn_v_params.head_dim; let head = remainder % attn_v_params.num_heads; let q_pos = remainder / attn_v_params.num_heads; let kv_head = head / (attn_v_params.num_heads / attn_v_params.num_kv_heads); var sum: f32 = 0.0; for (var s = 0u; s < attn_v_params.S; s++) { let a = attn[(head * attn_v_params.N + q_pos) * attn_v_params.S + s]; let v = V[(s * attn_v_params.num_kv_heads + kv_head) * attn_v_params.head_dim + d]; sum += a * v; } let out_idx = (q_pos * attn_v_params.num_heads + head) * attn_v_params.head_dim + d; attn_output[out_idx] = sum; } `,De=class{device;pipelines;pool;config;hDim;qProj;kProj;vProj;oProj;decodeRopeQUniform;decodeRopeKUniform;decodeScoresUniform;decodeSoftmaxUniform;decodeAttnVUniform;decodeScoresBuf;decodeAttnWeightsBuf;bgCache=G();constructor(i,e,t,n,s,r,a,o){this.device=i,this.pipelines=e,this.pool=t,this.config=n,this.hDim=U(n),this.qProj=s,this.kProj=r,this.vProj=a,this.oProj=o}initDecodeUniforms(i){const e=n=>this.device.createBuffer({size:n,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST});this.decodeRopeQUniform=e(20),this.decodeRopeKUniform=e(20),this.decodeScoresUniform=e(24),this.decodeSoftmaxUniform=e(8),this.decodeAttnVUniform=e(20);const t=this.config.numAttentionHeads*i*4;this.decodeScoresBuf=this.device.createBuffer({size:t,usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC}),this.decodeAttnWeightsBuf=this.device.createBuffer({size:t,usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC}),this.qProj.initDecodeUniforms(),this.kProj.initDecodeUniforms(),this.vProj.initDecodeUniforms(),this.oProj.initDecodeUniforms()}forward(i,e,t,n){const{numAttentionHeads:s,numKeyValueHeads:r,hiddenSize:a}=this.config,o=this.qProj.forward(i,e,n),d=this.kProj.forward(i,e,n),u=this.vProj.forward(i,e,n),c=this.applyRoPE(n,o,e,s,t.seqLen,e===1?this.decodeRopeQUniform:void 0,"ropeQ"),f=this.applyRoPE(n,d,e,r,t.seqLen,e===1?this.decodeRopeKUniform:void 0,"ropeK");this.pool.release(o),this.pool.release(d),this.appendToCache(n,f,u,t,e),this.pool.release(f),this.pool.release(u);const h=t.seqLen+e,p=this.computeScores(n,c,t.key,e,h,e===1?this.decodeScoresUniform:void 0,e===1?this.decodeScoresBuf:void 0);this.pool.release(c);const l=this.applySoftmax(n,p,s*e,h,e===1?this.decodeSoftmaxUniform:void 0,e===1?this.decodeAttnWeightsBuf:void 0);e!==1&&this.pool.release(p);const m=this.computeAttnV(n,l,t.value,e,h,e===1?this.decodeAttnVUniform:void 0);e!==1&&this.pool.release(l);const g=this.oProj.forward(m,e,n);return this.pool.release(m),g}applyRoPE(i,e,t,n,s,r,a){const{pipeline:o,bindGroupLayout:d}=this.pipelines.getOrCreate("rope",Ge),u=t*n*this.hDim*4,c=this.pool.acquire(u,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),f=new ArrayBuffer(20),h=new DataView(f);h.setUint32(0,t,!0),h.setUint32(4,n,!0),h.setUint32(8,this.hDim,!0),h.setUint32(12,s,!0),h.setFloat32(16,this.config.ropeTheta,!0);let p;r?(this.device.queue.writeBuffer(r,0,new Uint8Array(f)),p=r):p=this.createUniform(f);const l=[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:c}},{binding:2,resource:{buffer:p}}],m=t===1&&a?v(this.bgCache,this.device,a,d,l):this.device.createBindGroup({layout:d,entries:l}),g=t*n*(this.hDim/2),_=i.beginComputePass();return _.setPipeline(o),_.setBindGroup(0,m),_.dispatchWorkgroups(Math.ceil(g/256)),_.end(),c}appendToCache(i,e,t,n,s){const r=s*this.config.numKeyValueHeads*this.hDim*4,a=n.seqLen*this.config.numKeyValueHeads*this.hDim*4;i.copyBufferToBuffer(e,0,n.key,a,r),i.copyBufferToBuffer(t,0,n.value,a,r)}computeScores(i,e,t,n,s,r,a){const{pipeline:o,bindGroupLayout:d}=this.pipelines.getOrCreate("attention_scores",$,"compute_scores"),{numAttentionHeads:u,numKeyValueHeads:c}=this.config,f=a??this.pool.acquire(u*n*s*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),h=new ArrayBuffer(24),p=new DataView(h);p.setUint32(0,n,!0),p.setUint32(4,s,!0),p.setUint32(8,u,!0),p.setUint32(12,c,!0),p.setUint32(16,this.hDim,!0),p.setFloat32(20,1/Math.sqrt(this.hDim),!0);let l;r?(this.device.queue.writeBuffer(r,0,new Uint8Array(h)),l=r):l=this.createUniform(h);const m=[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:t}},{binding:2,resource:{buffer:f}},{binding:3,resource:{buffer:l}}],g=n===1?v(this.bgCache,this.device,"scores",d,m):this.device.createBindGroup({layout:d,entries:m}),_=i.beginComputePass();return _.setPipeline(o),_.setBindGroup(0,g),_.dispatchWorkgroups(Math.ceil(n/16),Math.ceil(s/16),u),_.end(),f}applySoftmax(i,e,t,n,s,r){const{pipeline:a,bindGroupLayout:o}=this.pipelines.getOrCreate("softmax",xe),d=r??this.pool.acquire(t*n*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),u=new ArrayBuffer(8),c=new DataView(u);c.setUint32(0,t,!0),c.setUint32(4,n,!0);let f;s?(this.device.queue.writeBuffer(s,0,new Uint8Array(u)),f=s):f=this.createUniform(u);const h=[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:d}},{binding:2,resource:{buffer:f}}],p=t===1?v(this.bgCache,this.device,"softmax",o,h):this.device.createBindGroup({layout:o,entries:h}),l=i.beginComputePass();return l.setPipeline(a),l.setBindGroup(0,p),l.dispatchWorkgroups(t),l.end(),d}computeAttnV(i,e,t,n,s,r){const{pipeline:a,bindGroupLayout:o}=this.pipelines.getOrCreate("attn_v",$,"attn_v"),{numAttentionHeads:d,numKeyValueHeads:u}=this.config,c=n*d*this.hDim*4,f=this.pool.acquire(c,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),h=new ArrayBuffer(20),p=new DataView(h);p.setUint32(0,n,!0),p.setUint32(4,s,!0),p.setUint32(8,d,!0),p.setUint32(12,u,!0),p.setUint32(16,this.hDim,!0);let l;r?(this.device.queue.writeBuffer(r,0,new Uint8Array(h)),l=r):l=this.createUniform(h);const m=[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:t}},{binding:2,resource:{buffer:f}},{binding:3,resource:{buffer:l}}],g=n===1?v(this.bgCache,this.device,"attnV",o,m):this.device.createBindGroup({layout:o,entries:m}),_=n*d*this.hDim,b=i.beginComputePass();return b.setPipeline(a),b.setBindGroup(0,g),b.dispatchWorkgroups(Math.ceil(_/256)),b.end(),f}clearBGCache(){B(this.bgCache),this.qProj.clearBGCache(),this.kProj.clearBGCache(),this.vProj.clearBGCache(),this.oProj.clearBGCache()}destroyPreAllocated(){this.decodeScoresBuf?.destroy(),this.decodeAttnWeightsBuf?.destroy(),this.decodeScoresBuf=void 0,this.decodeAttnWeightsBuf=void 0}createUniform(i){const e=Math.max(Math.ceil(i.byteLength/4)*4,4),t=this.device.createBuffer({size:e,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,mappedAtCreation:!0});return new Uint8Array(t.getMappedRange()).set(new Uint8Array(i)),t.unmap(),t}};function Te(i,e,t){const n=t*e.numKeyValueHeads*U(e)*4,s=i.createBuffer({size:n,usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC}),r=i.createBuffer({size:n,usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC});return{key:s,value:r,seqLen:0,maxSeqLen:t}}var Ee=`// Activation functions for BitNet FFN // // ReLU²: relu(x)² — used in official 2B-4T model // SiLU: x * sigmoid(x) — used in community models // // Layout: // input: [N] f32 // output: [N] f32 struct Params { N: u32, activation_type: u32, // 0 = ReLU², 1 = SiLU } @group(0) @binding(0) var input: array; @group(0) @binding(1) var output: array; @group(0) @binding(2) var params: Params; @compute @workgroup_size(256) fn main( @builtin(global_invocation_id) gid: vec3, ) { let idx = gid.x; if (idx >= params.N) { return; } let x = input[idx]; if (params.activation_type == 0u) { // ReLU²: max(0, x)² let relu_x = max(0.0, x); output[idx] = relu_x * relu_x; } else { // SiLU: x * sigmoid(x) output[idx] = x / (1.0 + exp(-x)); } } `,Ce=class{device;pipelines;pool;config;upProj;downProj;gateProj;decodeActivationUniform;decodeElementwiseUniform;bgCache=G();constructor(i,e,t,n,s,r,a){this.device=i,this.pipelines=e,this.pool=t,this.config=n,this.upProj=s,this.downProj=r,this.gateProj=a}initDecodeUniforms(){const i=this.config.activation==="relu2"?0:1;{const e=new ArrayBuffer(8),t=new DataView(e);t.setUint32(0,this.config.intermediateSize,!0),t.setUint32(4,i,!0),this.decodeActivationUniform=this.createUniform(e)}{const e=new ArrayBuffer(8),t=new DataView(e);t.setUint32(0,this.config.intermediateSize,!0),t.setUint32(4,1,!0),this.decodeElementwiseUniform=this.createUniform(e)}this.upProj.initDecodeUniforms(),this.downProj.initDecodeUniforms(),this.gateProj?.initDecodeUniforms()}forward(i,e,t){return this.gateProj?this.forwardGated(i,e,t):this.forwardSimple(i,e,t)}forwardGated(i,e,t){const n=this.config.activation==="relu2"?0:1,s=this.gateProj.forward(i,e,t),r=this.upProj.forward(i,e,t),a=this.applyActivation(t,s,e*this.config.intermediateSize,n,e);this.pool.release(s);const o=this.applyElementwise(t,a,r,e*this.config.intermediateSize,1,e);this.pool.release(a),this.pool.release(r);const d=this.downProj.forward(o,e,t);return this.pool.release(o),d}forwardSimple(i,e,t){const n=this.config.activation==="relu2"?0:1,s=this.upProj.forward(i,e,t),r=this.applyActivation(t,s,e*this.config.intermediateSize,n,e);this.pool.release(s);const a=this.downProj.forward(r,e,t);return this.pool.release(r),a}applyActivation(i,e,t,n,s){const{pipeline:r,bindGroupLayout:a}=this.pipelines.getOrCreate(`activation_${n}`,Ee),o=this.pool.acquire(t*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC);let d;if(s===1&&this.decodeActivationUniform)d=this.decodeActivationUniform;else{const h=new ArrayBuffer(8),p=new DataView(h);p.setUint32(0,t,!0),p.setUint32(4,n,!0),d=this.createUniform(h)}const u=[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:o}},{binding:2,resource:{buffer:d}}],c=s===1?v(this.bgCache,this.device,"activation",a,u):this.device.createBindGroup({layout:a,entries:u}),f=i.beginComputePass();return f.setPipeline(r),f.setBindGroup(0,c),f.dispatchWorkgroups(Math.ceil(t/256)),f.end(),o}applyElementwise(i,e,t,n,s,r){const{pipeline:a,bindGroupLayout:o}=this.pipelines.getOrCreate(`elementwise_${s}`,J),d=this.pool.acquire(n*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC);let u;if(r===1&&this.decodeElementwiseUniform)u=this.decodeElementwiseUniform;else{const p=new ArrayBuffer(8),l=new DataView(p);l.setUint32(0,n,!0),l.setUint32(4,s,!0),u=this.createUniform(p)}const c=[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:t}},{binding:2,resource:{buffer:d}},{binding:3,resource:{buffer:u}}],f=r===1?v(this.bgCache,this.device,"elementwise",o,c):this.device.createBindGroup({layout:o,entries:c}),h=i.beginComputePass();return h.setPipeline(a),h.setBindGroup(0,f),h.dispatchWorkgroups(Math.ceil(n/256)),h.end(),d}clearBGCache(){B(this.bgCache),this.upProj.clearBGCache(),this.downProj.clearBGCache(),this.gateProj?.clearBGCache()}createUniform(i){const e=Math.max(Math.ceil(i.byteLength/4)*4,4),t=this.device.createBuffer({size:e,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,mappedAtCreation:!0});return new Uint8Array(t.getMappedRange()).set(new Uint8Array(i)),t.unmap(),t}},H=class extends Error{constructor(i){super(i),this.name="GPUDeviceError"}};async function W(i){if(i)return{device:i,adapter:null,limits:i.limits};if(typeof navigator>"u"||!navigator.gpu)throw new H("WebGPU is not supported in this environment. Please use a browser with WebGPU support (Chrome 113+, Edge 113+, Firefox Nightly).");const e=await navigator.gpu.requestAdapter({powerPreference:"high-performance"});if(!e)throw new H("Failed to obtain WebGPU adapter. Check that your GPU drivers are up to date.");const t={};t.maxBufferSize=e.limits.maxBufferSize,t.maxStorageBufferBindingSize=e.limits.maxStorageBufferBindingSize;const n=e.limits.maxStorageBuffersPerShaderStage;t.maxStorageBuffersPerShaderStage=n,t.maxComputeWorkgroupSizeX=e.limits.maxComputeWorkgroupSizeX,t.maxComputeWorkgroupSizeY=e.limits.maxComputeWorkgroupSizeY,t.maxComputeWorkgroupSizeZ=e.limits.maxComputeWorkgroupSizeZ,t.maxComputeInvocationsPerWorkgroup=e.limits.maxComputeInvocationsPerWorkgroup,t.maxComputeWorkgroupStorageSize=e.limits.maxComputeWorkgroupStorageSize;const s=await e.requestDevice({requiredLimits:t});return s.lost.then(r=>{console.error(`WebGPU device lost: ${r.message} (reason: ${r.reason})`)}),{device:s,adapter:e,limits:s.limits}}var X=class{buffers=new Map;device;constructor(i){this.device=i}upload(i,e){const t=this.device.createBuffer({size:Math.max(e.byteLength,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST,mappedAtCreation:!0});return new Uint8Array(t.getMappedRange()).set(new Uint8Array(e)),t.unmap(),this.buffers.set(i,t),t}uploadSharded(i,e,t){if(e.byteLength<=t)return[this.upload(i,e)];const n=[];let s=0,r=0;for(;s0&&!this.buffers.has(i)&&this.buffers.set(i,n[0]),n}get(i){return this.buffers.get(i)}has(i){return this.buffers.has(i)}destroy(){for(const i of this.buffers.values())i.destroy();this.buffers.clear()}};async function Ae(i,e,t,n){const s=typeof i=="string"?i:i.href,r=ze(s);t?.({phase:"download",loaded:0,total:0,fraction:0});const a=await We(s,(o,d)=>{t?.({phase:"download",loaded:o,total:d,fraction:d>0?o/d:0})},n);return t?.({phase:"parse",loaded:0,total:1,fraction:0}),r==="gguf"?Re(a,e,t):Ie(a,e,t)}function ze(i){return i.endsWith(".gguf")?"gguf":i.endsWith(".safetensors")?"safetensors":"gguf"}async function Re(i,e,t){const s=new _e(i).parse(),r=qe(s.metadata),a=s.tensors.some(c=>c.name==="output.weight");r.tieWordEmbeddings=!a,console.debug(`[0xBitNet] config: arch=${s.metadata["general.architecture"]}, heads=${r.numAttentionHeads}, kv_heads=${r.numKeyValueHeads}, head_dim=${r.hiddenSize/r.numAttentionHeads}, hidden=${r.hiddenSize}, intermediate=${r.intermediateSize}, layers=${r.numHiddenLayers}, tied=${r.tieWordEmbeddings}`);const o=new X(e),d=e.limits.maxStorageBufferBindingSize,u=s.tensors.length;for(let c=0;c_*Number(b),1);let l;if(f.type===R)l=Math.ceil(p/4)+32;else{const _=ge(f.type);l=Math.ceil(p*_)}const m=i.slice(h,h+l),g=Le(f.name);if(console.debug(`[0xBitNet] tensor: ${f.name} → ${g} (type=${f.type}, ${l} bytes)`),f.type===R){const _=Math.ceil(p/4),b=m.slice(0,_);o.uploadSharded(g,b,d);const T=new DataView(m,_,32).getFloat32(0,!0),E=Number(f.shape[1]),C=g.replace(".weight",".weight_scale"),A=new Float32Array(E).fill(T);o.upload(C,A.buffer)}else if(f.type===Q)if(g==="model.embed_tokens.weight")o.uploadSharded(g,m,d);else{const _=Me(new Uint16Array(m),p);o.uploadSharded(g,_.buffer,d)}else o.uploadSharded(g,m,d);t?.({phase:"upload",loaded:c+1,total:u,fraction:(c+1)/u})}return console.debug(`[0xBitNet] ${u} tensors loaded, tieWordEmbeddings=${r.tieWordEmbeddings}`),Oe(o,r),{config:r,weights:o,metadata:s.metadata}}function Me(i,e){const t=new Float32Array(e);for(let n=0;n>15&1,a=s>>10&31,o=s&1023;let d;a===0?d=o/1024*Math.pow(2,-14):a===31?d=o===0?1/0:NaN:d=(1+o/1024)*Math.pow(2,a-15),t[n]=r?-d:d}return t}function Le(i){if(i==="token_embd.weight")return"model.embed_tokens.weight";if(i==="output_norm.weight")return"model.norm.weight";if(i==="output.weight")return"lm_head.weight";const e=i.match(/^blk\.(\d+)\.(.+)$/);if(!e)return i;const[,t,n]=e,s=`model.layers.${t}`,a={"attn_q.weight":"self_attn.q_proj.weight","attn_k.weight":"self_attn.k_proj.weight","attn_v.weight":"self_attn.v_proj.weight","attn_output.weight":"self_attn.o_proj.weight","attn_norm.weight":"input_layernorm.weight","ffn_norm.weight":"post_attention_layernorm.weight","attn_sub_norm.weight":"self_attn.sub_norm.weight","ffn_sub_norm.weight":"mlp.sub_norm.weight","ffn_up.weight":"mlp.up_proj.weight","ffn_down.weight":"mlp.down_proj.weight","ffn_gate.weight":"mlp.gate_proj.weight"}[n];return a?`${s}.${a}`:`${s}.${n}`}function Oe(i,e,t){const n=[];for(let s=0;s1e5||e.includes("bitnet");return{modelType:"bitnet",vocabSize:o,hiddenSize:n,intermediateSize:d,numHiddenLayers:s,numAttentionHeads:r,numKeyValueHeads:a,maxPositionEmbeddings:t("context_length")??4096,rmsNormEps:t("attention.layer_norm_rms_epsilon")??1e-5,ropeTheta:t("rope.freq_base")??(u?5e5:1e4),tieWordEmbeddings:!1,activation:u?"relu2":"silu"}}function Ne(i){const e=i.find(p=>p.name==="model.embed_tokens.weight"||p.name==="transformer.wte.weight"),t=e?.shape[0]??128256,n=e?.shape[1]??2560,s=i.map(p=>{const l=p.name.match(/layers\.(\d+)\./);return l?parseInt(l[1],10):-1}).filter(p=>p>=0),r=s.length>0?Math.max(...s)+1:30,a=i.find(p=>p.name.includes("q_proj.weight")),o=a?a.shape[0]/(n/32):32,u=i.find(p=>p.name.includes("k_proj.weight"))?.shape[0]??n,c=n/o,f=u/c,h=t>1e5;return{modelType:"bitnet",vocabSize:t,hiddenSize:n,intermediateSize:0,numHiddenLayers:r,numAttentionHeads:o,numKeyValueHeads:f,maxPositionEmbeddings:4096,rmsNormEps:1e-5,ropeTheta:h?5e5:1e4,tieWordEmbeddings:!1,activation:h?"relu2":"silu"}}var Ve="0xbitnet",S="models";function j(){return new Promise((i,e)=>{const t=indexedDB.open(Ve,1);t.onupgradeneeded=()=>t.result.createObjectStore(S),t.onsuccess=()=>i(t.result),t.onerror=()=>e(t.error)})}function $e(i,e){return new Promise((t,n)=>{const r=i.transaction(S,"readonly").objectStore(S).get(e);r.onsuccess=()=>t(r.result),r.onerror=()=>n(r.error)})}function He(i,e,t){return new Promise((n,s)=>{const r=i.transaction(S,"readwrite");r.objectStore(S).put(t,e),r.oncomplete=()=>n(),r.onerror=()=>s(r.error)})}async function We(i,e,t){if(typeof indexedDB<"u")try{const c=await j(),f=await $e(c,i);if(c.close(),f)return e(f.byteLength,f.byteLength),f}catch{}const n=await fetch(i,{signal:t});if(!n.ok)throw new Error(`Failed to fetch model: ${n.status} ${n.statusText}`);const s=parseInt(n.headers.get("content-length")??"0",10),r=n.body?.getReader();if(!r){const c=await n.arrayBuffer();return e(c.byteLength,c.byteLength),c}const a=[];let o=0;for(;;){const{done:c,value:f}=await r.read();if(c)break;a.push(f),o+=f.byteLength,e(o,s)}const d=new Uint8Array(o);let u=0;for(const c of a)d.set(c,u),u+=c.byteLength;if(typeof indexedDB<"u")try{const c=await j();await He(c,i,d.buffer),c.close()}catch{}return d.buffer}var je=`// Token embedding lookup (F16 on GPU) // // For each token ID, copy the corresponding row from the embedding table. // Embedding table is stored as packed F16 pairs (two f16 values per u32) // to avoid exceeding maxStorageBufferBindingSize on most GPUs. // // Layout: // token_ids: [N] u32 // embed_table: [V * D / 2] u32 (packed f16 pairs) // output: [N, D] f32 struct Params { N: u32, // number of tokens D: u32, // embedding dimension V: u32, // vocab size } @group(0) @binding(0) var token_ids: array; @group(0) @binding(1) var embed_table: array; @group(0) @binding(2) var output: array; @group(0) @binding(3) var params: Params; @compute @workgroup_size(256) fn main( @builtin(global_invocation_id) gid: vec3, ) { let idx = gid.x; let total = params.N * params.D; if (idx >= total) { return; } let token = idx / params.D; let dim = idx % params.D; let token_id = token_ids[token]; // Bounds check: treat out-of-vocab as zero if (token_id < params.V) { let flat = token_id * params.D + dim; let packed = embed_table[flat / 2u]; let pair = unpack2x16float(packed); output[idx] = select(pair.x, pair.y, (flat & 1u) == 1u); } else { output[idx] = 0.0; } } `,Ke=`// F32 GEMV for tied-embedding LM head (F16 embedding on GPU) // logits[n, v] = sum_d( hidden[n, d] * embed[v, d] ) // // hidden: [N, D] f32 — final hidden states // embed: [V * D / 2] u32 — embedding table stored as packed f16 pairs // output: [N, V] f32 — logits // // Each workgroup computes one (n, v) element. // 256 threads cooperatively reduce over D. // 2D dispatch: v = wg_id.x + wg_id.y * 65535 (V can exceed 65535) struct Params { N: u32, V: u32, D: u32, } @group(0) @binding(0) var hidden: array; @group(0) @binding(1) var embed: array; @group(0) @binding(2) var output: array; @group(0) @binding(3) var params: Params; const WG_SIZE: u32 = 256u; var shared_sums: array; @compute @workgroup_size(256) fn main( @builtin(workgroup_id) wg_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { // Decode (n, v) from 2D dispatch let flat_id = wg_id.x + wg_id.y * 65535u; let n = flat_id / params.V; let v = flat_id % params.V; if (n >= params.N || v >= params.V) { return; } let tid = local_id.x; // Each thread accumulates a strided slice of D // Process pairs of dimensions for efficiency var acc: f32 = 0.0; let hidden_base = n * params.D; let embed_base = v * params.D; // Process two dimensions at a time using packed f16 pairs let D_half = params.D / 2u; for (var dh = tid; dh < D_half; dh += WG_SIZE) { let d = dh * 2u; let packed = embed[embed_base / 2u + dh]; let pair = unpack2x16float(packed); acc += hidden[hidden_base + d] * pair.x; acc += hidden[hidden_base + d + 1u] * pair.y; } // Workgroup reduction shared_sums[tid] = acc; workgroupBarrier(); for (var stride = WG_SIZE / 2u; stride > 0u; stride >>= 1u) { if (tid < stride) { shared_sums[tid] += shared_sums[tid + stride]; } workgroupBarrier(); } // Thread 0 writes the result if (tid == 0u) { output[n * params.V + v] = shared_sums[0]; } } `,Fe=class ee{device;pipelines;pool;config;embedTokens;layers;finalNorm;lmHead;kvCaches;decodeTokenBuffer;decodeEmbeddingUniform;decodeFinalNormUniform;decodeLMHeadUniform;bgCache=G();constructor(e,t,n,s,r,a,o,d,u){this.device=e,this.pipelines=t,this.pool=n,this.config=s,this.embedTokens=r,this.layers=a,this.finalNorm=o,this.lmHead=d,this.kvCaches=u}static build(e,t,n,s=4096){const r=new ye(e),a=new Ue(e);function o(l){const m=n.get(l);if(!m)throw new Error(`Missing weight tensor: "${l}"`);return m}const d=o("model.embed_tokens.weight"),u=o("model.norm.weight"),c=[],f=[];for(let l=0;l1?(o=this.pool.acquire(this.config.hiddenSize*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST),n.copyBufferToBuffer(a,(t-1)*this.config.hiddenSize*4,o,0,this.config.hiddenSize*4),this.pool.release(a)):o=a;let d;return this.lmHead instanceof w?d=this.lmHead.forward(o,1,n):d=this.dispatchLMHead(n,o,1),t>1?this.pool.release(o):this.pool.release(a),this.device.queue.submit([n.finish()]),d}releaseBuffer(e){this.pool.release(e)}resetKVCache(){for(const e of this.kvCaches)e.seqLen=0;B(this.bgCache);for(const e of this.layers)e.clearBGCache();this.lmHead instanceof w&&this.lmHead.clearBGCache()}dispose(){B(this.bgCache);for(const e of this.layers)e.clearBGCache(),e.destroyPreAllocated();this.lmHead instanceof w&&this.lmHead.clearBGCache();for(const e of this.kvCaches)e.key.destroy(),e.value.destroy();this.pool.destroy(),this.pipelines.clear()}async diagnose(e){const t=e.length,n=[];this.resetKVCache();const s=this.device.createBuffer({size:e.byteLength,usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST,mappedAtCreation:!0});new Uint32Array(s.getMappedRange()).set(e),s.unmap();let r=this.device.createCommandEncoder();const a=this.dispatchEmbedding(r,s,t);this.device.queue.submit([r.finish()]),n.push(await this.readDiag("embedding",a,t*this.config.hiddenSize)),r=this.device.createCommandEncoder();const o=this.layers[0].forward(a,t,this.kvCaches[0],r);this.device.queue.submit([r.finish()]),this.kvCaches[0].seqLen+=t,n.push(await this.readDiag("layer_0",o,t*this.config.hiddenSize)),this.pool.release(a),r=this.device.createCommandEncoder();const d=this.layers[1].forward(o,t,this.kvCaches[1],r);this.device.queue.submit([r.finish()]),this.kvCaches[1].seqLen+=t,n.push(await this.readDiag("layer_1",d,t*this.config.hiddenSize)),this.pool.release(o);let u=d;for(let p=2;p1?(f=this.pool.acquire(this.config.hiddenSize*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST),r=this.device.createCommandEncoder(),r.copyBufferToBuffer(c,(t-1)*this.config.hiddenSize*4,f,0,this.config.hiddenSize*4),this.device.queue.submit([r.finish()]),this.pool.release(c)):f=c,n.push(await this.readDiag("lm_input",f,this.config.hiddenSize)),r=this.device.createCommandEncoder();let h;return this.lmHead instanceof w?h=this.lmHead.forward(f,1,r):h=this.dispatchLMHead(r,f,1),this.device.queue.submit([r.finish()]),n.push(await this.readDiag("logits_first100",h,100)),this.pool.release(f===c?c:f),this.pool.release(h),n}async readDiag(e,t,n){const s=n*4,r=this.device.createBuffer({size:s,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST}),a=this.device.createCommandEncoder();a.copyBufferToBuffer(t,0,r,0,s),this.device.queue.submit([a.finish()]),await r.mapAsync(GPUMapMode.READ);const o=new Float32Array(r.getMappedRange().slice(0));r.unmap(),r.destroy();let d=1/0,u=-1/0,c=0,f=0,h=0,p=0,l=0;for(let _=0;_u&&(u=b),c+=b,f+=b*b}const m=c/o.length,g=Math.sqrt(f/o.length);return{name:e,length:o.length,min:d,max:u,mean:m,rms:g,nanCount:h,infCount:p,zeroCount:l,first8:Array.from(o.slice(0,8))}}dispatchEmbedding(e,t,n){const{pipeline:s,bindGroupLayout:r}=this.pipelines.getOrCreate("embedding",je),a=n*this.config.hiddenSize*4,o=this.pool.acquire(a,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC);let d;if(n===1&&this.decodeEmbeddingUniform)d=this.decodeEmbeddingUniform;else{const p=new ArrayBuffer(12),l=new DataView(p);l.setUint32(0,n,!0),l.setUint32(4,this.config.hiddenSize,!0),l.setUint32(8,this.config.vocabSize,!0),d=this.createUniform(p)}const u=[{binding:0,resource:{buffer:t}},{binding:1,resource:{buffer:this.embedTokens}},{binding:2,resource:{buffer:o}},{binding:3,resource:{buffer:d}}],c=n===1?v(this.bgCache,this.device,"embedding",r,u):this.device.createBindGroup({layout:r,entries:u}),f=n*this.config.hiddenSize,h=e.beginComputePass();return h.setPipeline(s),h.setBindGroup(0,c),h.dispatchWorkgroups(Math.ceil(f/256)),h.end(),o}dispatchFinalNorm(e,t,n){const{pipeline:s,bindGroupLayout:r}=this.pipelines.getOrCreate("rmsnorm",O),a=this.pool.acquire(n*this.config.hiddenSize*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC);let o;if(n===1&&this.decodeFinalNormUniform)o=this.decodeFinalNormUniform;else{const f=new ArrayBuffer(12),h=new DataView(f);h.setUint32(0,n,!0),h.setUint32(4,this.config.hiddenSize,!0),h.setFloat32(8,this.config.rmsNormEps,!0),o=this.createUniform(f)}const d=[{binding:0,resource:{buffer:t}},{binding:1,resource:{buffer:this.finalNorm}},{binding:2,resource:{buffer:a}},{binding:3,resource:{buffer:o}}],u=n===1?v(this.bgCache,this.device,"finalNorm",r,d):this.device.createBindGroup({layout:r,entries:d}),c=e.beginComputePass();return c.setPipeline(s),c.setBindGroup(0,u),c.dispatchWorkgroups(n),c.end(),a}dispatchLMHead(e,t,n){const s=this.config.vocabSize,r=this.config.hiddenSize,{pipeline:a,bindGroupLayout:o}=this.pipelines.getOrCreate("f32_matmul",Ke),d=this.pool.acquire(n*s*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC);let u;if(n===1&&this.decodeLMHeadUniform)u=this.decodeLMHeadUniform;else{const g=new ArrayBuffer(12),_=new DataView(g);_.setUint32(0,n,!0),_.setUint32(4,s,!0),_.setUint32(8,r,!0),u=this.createUniform(g)}const c=[{binding:0,resource:{buffer:t}},{binding:1,resource:{buffer:this.embedTokens}},{binding:2,resource:{buffer:d}},{binding:3,resource:{buffer:u}}],f=n===1?v(this.bgCache,this.device,"lmHead",o,c):this.device.createBindGroup({layout:o,entries:c}),h=n*s,p=Math.min(h,65535),l=Math.ceil(h/65535),m=e.beginComputePass();return m.setPipeline(a),m.setBindGroup(0,f),m.dispatchWorkgroups(p,l),m.end(),d}createUniform(e){const t=Math.max(Math.ceil(e.byteLength/4)*4,4),n=this.device.createBuffer({size:t,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,mappedAtCreation:!0});return new Uint8Array(n.getMappedRange()).set(new Uint8Array(e)),n.unmap(),n}},K=class y{config;vocab;reverseVocab;merges;mergeRanks;bosId;eosId;textEncoder=new TextEncoder;textDecoder=new TextDecoder("utf-8",{fatal:!1});constructor(e,t,n){this.config=e,this.vocab=t,this.merges=n,this.bosId=e.bosToken??1,this.eosId=e.eosToken??2,this.reverseVocab=new Map;for(const[s,r]of t)this.reverseVocab.set(r,s);this.mergeRanks=new Map;for(let s=0;s{const a=r.split(" ");return[a[0],a[1]]}),s={type:e.config?.type??"bpe",vocabSize:t.size,bosToken:e.config?.bosToken??1,eosToken:e.config?.eosToken??2};return new y(s,t,n)}encode(e,t=!0){const n=[];t&&n.push(this.bosId),this.config.type==="sentencepiece"&&(e=" "+e);const s=new RegExp("(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+","gu"),r=e.match(s)??[e];for(const a of r){const o=this.bpeEncode(a);n.push(...o)}return new Uint32Array(n)}decode(e){const t=[];for(const n of e){if(n===this.bosId||n===this.eosId)continue;const s=this.reverseVocab.get(n);s!==void 0&&t.push(this.decodeToken(s))}return t.join("")}decodeToken(e){if(e.startsWith("<0x")&&e.endsWith(">")){const t=parseInt(e.slice(3,-1),16);return String.fromCharCode(t)}return this.config.type==="sentencepiece"?e.replace(/▁/g," "):this.bytesToString(e)}get eosTokenId(){return this.eosId}get bosTokenId(){return this.bosId}get eotTokenId(){return this.vocab.get("<|eot_id|>")}applyChatTemplate(e){const t=this.vocab.get("<|start_header_id|>"),n=this.vocab.get("<|end_header_id|>"),s=this.vocab.get("<|eot_id|>");if(t===void 0||n===void 0||s===void 0){console.warn(`[0xBitNet] Chat template fallback: special tokens missing (start_header=${t}, end_header=${n}, eot=${s})`);const a=e.map(o=>o.content).join(` `);return this.encode(a)}console.debug(`[0xBitNet] Chat template: start_header=${t}, end_header=${n}, eot=${s}`);const r=[this.bosId];for(const a of e)r.push(t),r.push(...this.encode(a.role,!1)),r.push(n),r.push(...this.encode(` `+a.content,!1)),r.push(s);return r.push(t),r.push(...this.encode("assistant",!1)),r.push(n),r.push(...this.encode(` `,!1)),new Uint32Array(r)}bpeEncode(e){if(e.length===0)return[];let t;for(this.config.type==="sentencepiece"?t=[...e].map(s=>s.replace(" ","▁")):t=this.stringToBytes(e);t.length>1;){let s=1/0,r=-1;for(let o=0;o`,d=this.vocab.get(o);d!==void 0&&n.push(d)}}return n}static byteToUnicode=null;static getByteToUnicode(){if(y.byteToUnicode)return y.byteToUnicode;const e=new Map,t=[[33,126],[161,172],[174,255]],n=[];for(const[a,o]of t)for(let d=a;d<=o;d++)n.push(d);const s=[...n];let r=0;for(let a=0;a<256;a++)n.includes(a)||(n.push(a),s.push(256+r),r++);for(let a=0;a0?f.slice(-o):f,m=await this.sampleToken(h,s,r,a,l);if(this.model.releaseBuffer(h),m===this.tokenizer.eosTokenId||m===c)break;f.push(m);const g=this.tokenizer.decode([m]);t.onToken?.(g),yield g,h=this.model.forward(new Uint32Array([m]))}}dispose(){this.readbackBuffer.destroy(),this.model.dispose()}async diagnose(e="Hello"){const t=this.tokenizer.encode(e);return this.model.diagnose(t)}async sampleToken(e,t,n,s,r){const a=this.model.config.vocabSize,o=this.device.createCommandEncoder();o.copyBufferToBuffer(e,0,this.readbackBuffer,0,a*4),this.device.queue.submit([o.finish()]),await this.readbackBuffer.mapAsync(GPUMapMode.READ);const d=new Float32Array(this.readbackBuffer.getMappedRange()),u=this.logitsArray;if(u.set(d),this.readbackBuffer.unmap(),s!==1&&r.length>0)for(const l of r)u[l]>0?u[l]/=s:u[l]*=s;if(t!==1){const l=1/t;for(let m=0;m0&&n>1)-1;g>=0;g--)F(l,g,n,u);for(let g=n;gu[l[0]]&&(l[0]=g,F(l,0,n,u));const m=u[l[0]];for(let g=0;gc&&(c=u[l]);let f=0;for(let l=0;l=h)return l;return a-1}};function F(i,e,t,n){for(;;){let s=e;const r=2*e+1,a=2*e+2;if(r{x.disabled=!0,Qe.style.display="none",Y.style.display="block";try{M=await Ye.load(Ze,{onProgress(i){const e=(i.fraction*100).toFixed(1);Z.textContent=`${i.phase}: ${e}%`,Je.style.width=`${e}%`}}),Y.style.display="none",k.style.display="flex",Xe.style.display="flex",D.disabled=!1,P.focus(),L("assistant","Model loaded! Ask me anything.")}catch(i){Z.textContent=`Error: ${i.message}`}});D.addEventListener("click",ne);P.addEventListener("keydown",i=>{i.key==="Enter"&&!i.shiftKey&&(i.preventDefault(),ne())});async function ne(){if(!M||z)return;const i=P.value.trim();if(!i)return;P.value="",L("user",i),z=!0,D.disabled=!0;const e=L("assistant",""),t=[{role:"system",content:"You are a helpful assistant."},{role:"user",content:i}];try{for await(const n of M.generate(t,{maxTokens:512,temperature:.7,topK:40,repeatPenalty:1.1}))e.textContent+=n,k.scrollTop=k.scrollHeight}catch(n){e.textContent+=` [Error: ${n.message}]`}z=!1,D.disabled=!1,P.focus()}function L(i,e){const t=document.createElement("div");return t.className=`msg ${i}`,t.textContent=e,k.appendChild(t),k.scrollTop=k.scrollHeight,t}navigator.gpu||(x.disabled=!0,x.textContent="WebGPU not supported — use Chrome 113+");