Spaces:

Ex0bit
/

tensorbend

Running

App Files Community

Ex0bit Claude Opus 4.6 committed 4 days ago

Commit

db79cca

1 Parent(s): 0c2960a

Sync latest local build with updated model & GPU ops

Browse files

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (7) hide show

assets/{gpu-ops-Bq_PFJSE.js → gpu-ops-PQDFq1iI.js} +328 -44
assets/{main-DTfqdn80.js → main-D60Okk_s.js} +0 -0
assets/qwen35-model-CmeFfImT.js +1 -0
assets/qwen35-model-D0qiY8Dx.js +0 -1
assets/{test-BQCz-9iM.js → test-BEFPr_G8.js} +3 -3
index.html +3 -3
test.html +2 -2

assets/{gpu-ops-Bq_PFJSE.js → gpu-ops-PQDFq1iI.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-(function(){const e=document.createElement("link").relList;if(e&&e.supports&&e.supports("modulepreload"))return;for(const r of document.querySelectorAll('link[rel="modulepreload"]'))u(r);new MutationObserver(r=>{for(const a of r)if(a.type==="childList")for(const i of a.addedNodes)i.tagName==="LINK"&&i.rel==="modulepreload"&&u(i)}).observe(document,{childList:!0,subtree:!0});function t(r){const a={};return r.integrity&&(a.integrity=r.integrity),r.referrerPolicy&&(a.referrerPolicy=r.referrerPolicy),r.crossOrigin==="use-credentials"?a.credentials="include":r.crossOrigin==="anonymous"?a.credentials="omit":a.credentials="same-origin",a}function u(r){if(r.ep)return;r.ep=!0;const a=t(r);fetch(r.href,a)}})();class v{constructor(){this.device=null,this.adapter=null,this.adapterInfo=null,this.pipelineCache=new Map,this.bufferCache=new Map,this.bindGroupCache=new Map}async init(){if(!navigator.gpu)throw new Error("WebGPU not supported");if(this.adapter=await navigator.gpu.requestAdapter({powerPreference:"high-performance"}),!this.adapter)throw new Error("No WebGPU adapter found");this.adapterInfo=await this.adapter.requestAdapterInfo?.()??{};const e={},t={maxBufferSize:4*1024*1024*1024,maxStorageBufferBindingSize:4*1024*1024*1024,maxComputeWorkgroupStorageSize:32768,maxComputeInvocationsPerWorkgroup:256,maxComputeWorkgroupSizeX:256,maxStorageBuffersPerShaderStage:10};for(const[r,a]of Object.entries(t))this.adapter.limits[r]!==void 0&&(e[r]=Math.min(a,this.adapter.limits[r]));const u=[];return this.adapter.features.has("shader-f16")&&u.push("shader-f16"),this.adapter.features.has("subgroups")&&u.push("subgroups"),this.device=await this.adapter.requestDevice({requiredLimits:e,requiredFeatures:u}),this.hasF16=this.device.features.has("shader-f16"),this.hasSubgroups=this.device.features.has("subgroups"),this.device.lost.then(r=>console.error("WebGPU device lost:",r)),this}createBuffer(e,t,u=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST){const 
r=Math.ceil(t/4)*4,a=this.device.createBuffer({size:r,usage:u,label:e});return this.bufferCache.set(e,a),a}createBufferFromData(e,t,u=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC){const r=this.createBuffer(e,t.byteLength,u);return this.device.queue.writeBuffer(r,0,t),r}createReadbackBuffer(e,t){const u=Math.ceil(t/4)*4;return this.device.createBuffer({size:u,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:e+"_readback"})}getOrCreatePipeline(e,t,u="main"){if(this.pipelineCache.has(e))return this.pipelineCache.get(e);const r=this.device.createShaderModule({code:t,label:e}),a=this.device.createComputePipeline({layout:"auto",compute:{module:r,entryPoint:u},label:e});return this.pipelineCache.set(e,a),a}initTimestamps(){this.device.features.has("timestamp-query")&&(this._tsQuerySet=this.device.createQuerySet({type:"timestamp",count:2}),this._tsResolveBuf=this.device.createBuffer({size:16,usage:GPUBufferUsage.QUERY_RESOLVE|GPUBufferUsage.COPY_SRC}),this._tsReadBuf=this.device.createBuffer({size:16,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST}),this._tsEnabled=!0,this._tsResults=[])}beginBatch(){this._encoder=this.device.createCommandEncoder(),this._passCount=0,this.singlePassMode&&(this._singlePass=this._encoder.beginComputePass()),this._tsEnabled&&this._encoder.beginComputePass({timestampWrites:{querySet:this._tsQuerySet,beginningOfPassWriteIndex:0}}).end()}endBatch(){this._singlePass&&(this._singlePass.end(),this._singlePass=null),this._tsEnabled&&this._encoder&&(this._encoder.beginComputePass({timestampWrites:{querySet:this._tsQuerySet,beginningOfPassWriteIndex:1}}).end(),this._encoder.resolveQuerySet(this._tsQuerySet,0,2,this._tsResolveBuf,0),this._encoder.copyBufferToBuffer(this._tsResolveBuf,0,this._tsReadBuf,0,16)),this._encoder&&(this.device.queue.submit([this._encoder.finish()]),this._encoder=null)}async readTimestamp(){if(!this._tsEnabled)return null;await this._tsReadBuf.mapAsync(GPUMapMode.READ);const e=new 
BigInt64Array(this._tsReadBuf.getMappedRange().slice(0));this._tsReadBuf.unmap();const u=Number(e[1]-e[0])/1e6;return this._tsResults.push(u),u}copyBuffer(e,t,u,r=0,a=0){if(this._singlePass){this._singlePass.end(),this._encoder.copyBufferToBuffer(e,r,t,a,u),this._singlePass=this._encoder.beginComputePass();return}const i=this._encoder||this.device.createCommandEncoder();i.copyBufferToBuffer(e,r,t,a,u),this._encoder||this.device.queue.submit([i.finish()])}startRecording(){this._recording=[]}stopRecording(){const e=this._recording;return this._recording=null,e}replay(e,t){if(t)for(const r of t)this.device.queue.writeBuffer(r.buffer,r.offset,r.data,r.dataOffset,r.size);const u=this._encoder;for(let r=0;r<e.length;r++){const a=e[r];if(a.multi){const i=u.beginComputePass(),s=a.ops;for(let o=0;o<s.length;o++){const g=s[o];i.setPipeline(g.pipeline),i.setBindGroup(0,g.bindGroup),i.dispatchWorkgroups(g.wgX,g.wgY)}i.end()}else{const i=u.beginComputePass();i.setPipeline(a.pipeline),i.setBindGroup(0,a.bindGroup),i.dispatchWorkgroups(a.wgX,a.wgY),i.end()}}}dispatch(e,t,u,r=1,a=1){if(this._recording&&this._recording.push({pipeline:e,bindGroup:t[0],wgX:u,wgY:r}),this._singlePass){const o=this._singlePass;this._passCount!==void 0&&this._passCount++,o.setPipeline(e);for(let g=0;g<t.length;g++)o.setBindGroup(g,t[g]);o.dispatchWorkgroups(u,r,a);return}const i=this._encoder||this.device.createCommandEncoder(),s=i.beginComputePass();this._passCount!==void 0&&this._passCount++,s.setPipeline(e);for(let o=0;o<t.length;o++)s.setBindGroup(o,t[o]);s.dispatchWorkgroups(u,r,a),s.end(),this._encoder||this.device.queue.submit([i.finish()])}dispatchMulti(e){if(this._recording&&this._recording.push({multi:!0,ops:e.map(r=>({pipeline:r.pipeline,bindGroup:r.bindGroups[0],wgX:r.workgroupsX,wgY:r.workgroupsY||1}))}),this._singlePass){this._passCount!==void 0&&this._passCount++;for(const r of e){this._singlePass.setPipeline(r.pipeline);for(let 
a=0;a<r.bindGroups.length;a++)this._singlePass.setBindGroup(a,r.bindGroups[a]);this._singlePass.dispatchWorkgroups(r.workgroupsX,r.workgroupsY||1,r.workgroupsZ||1)}return}const t=this._encoder||this.device.createCommandEncoder(),u=t.beginComputePass();this._passCount!==void 0&&this._passCount++;for(const r of e){u.setPipeline(r.pipeline);for(let a=0;a<r.bindGroups.length;a++)u.setBindGroup(a,r.bindGroups[a]);u.dispatchWorkgroups(r.workgroupsX,r.workgroupsY||1,r.workgroupsZ||1)}u.end(),this._encoder||this.device.queue.submit([t.finish()])}async readBuffer(e,t){const u=this.createReadbackBuffer("_readback",t),r=this.device.createCommandEncoder();r.copyBufferToBuffer(e,0,u,0,t),this.device.queue.submit([r.finish()]),await u.mapAsync(GPUMapMode.READ);const a=new Float32Array(u.getMappedRange().slice(0));return u.unmap(),u.destroy(),a}createBindGroup(e,t,u){return this.device.createBindGroup({layout:e.getBindGroupLayout(t),entries:u.map((r,a)=>({binding:a,resource:{buffer:r}}))})}createBindGroupWithOffsets(e,t,u){return this.device.createBindGroup({layout:e.getBindGroupLayout(t),entries:u.map((r,a)=>({binding:a,resource:r.buffer?{buffer:r.buffer,offset:r.offset||0,size:r.size}:{buffer:r}}))})}getCachedBindGroup(e,t,u,r){let a=this.bindGroupCache.get(e);return a||(a=this.createBindGroup(t,u,r),this.bindGroupCache.set(e,a)),a}destroy(){for(const e of this.bufferCache.values())e.destroy();this.bufferCache.clear(),this.bindGroupCache.clear(),this.device?.destroy()}}const Kr=Object.freeze(Object.defineProperty({__proto__:null,GPUContext:v},Symbol.toStringTag,{value:"Module"})),m="modulepreload",k=function(p,e){return new URL(p,e).href},w={},Er=function(e,t,u){let r=Promise.resolve();if(t&&t.length>0){let g=function(n){return Promise.all(n.map(f=>Promise.resolve(f).then(d=>({status:"fulfilled",value:d}),d=>({status:"rejected",reason:d}))))};const 
i=document.getElementsByTagName("link"),s=document.querySelector("meta[property=csp-nonce]"),o=s?.nonce||s?.getAttribute("nonce");r=g(t.map(n=>{if(n=k(n,u),n in w)return;w[n]=!0;const f=n.endsWith(".css"),d=f?'[rel="stylesheet"]':"";if(u)for(let _=i.length-1;_>=0;_--){const c=i[_];if(c.href===n&&(!f||c.rel==="stylesheet"))return}else if(document.querySelector(`link[href="${n}"]${d}`))return;const l=document.createElement("link");if(l.rel=f?"stylesheet":m,f||(l.as="script"),l.crossOrigin="",l.href=n,o&&l.setAttribute("nonce",o),document.head.appendChild(l),f)return new Promise((_,c)=>{l.addEventListener("load",_),l.addEventListener("error",()=>c(new Error(`Unable to preload CSS for ${n}`)))})}))}function a(i){const s=new Event("vite:preloadError",{cancelable:!0});if(s.payload=i,window.dispatchEvent(s),!s.defaultPrevented)throw i}return r.then(i=>{for(const s of i||[])s.status==="rejected"&&a(s.reason);return e().catch(a)})},h=`
 struct Params { K: u32, N: u32, group_size: u32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
@@ -176,7 +176,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
     sum += partials[s * params.N + col];
   }
   output[col] = sum;
-}`,N=`
 struct Params { K: u32, N: u32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
@@ -203,7 +203,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
     sum += input[k] * w0 + input[k + 1u] * w1 + input[k + 2u] * w2 + input[k + 3u] * w3;
   }
   output[col] = sum;
-}`,F=`
 struct Params { N: u32, eps: f32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
@@ -270,7 +270,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
   let i = gid.x;
   if (i >= params.N) { return; }
   a[i] = a[i] + b[i];
-}`,K=`
 struct Params { N: u32, num_heads: u32, head_dim: u32, }
 @group(0) @binding(0) var<storage, read> src: array<f32>;
@@ -287,7 +287,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
   let d = i % hd;
   dst_a[i] = src[head * hd * 2u + d];
   dst_b[i] = src[head * hd * 2u + hd + d];
-}`,E=`
 struct Params { N: u32, }
 @group(0) @binding(0) var<storage, read> x: array<f32>;
@@ -300,7 +300,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
   let i = gid.x;
   if (i >= params.N) { return; }
   output[i] = x[i] / (1.0 + exp(-gate[i]));
-}`,B=`
 struct Params { token_id: u32, dim: u32, }
 @group(0) @binding(0) var<storage, read> embeddings: array<u32>;
@@ -318,7 +318,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
   if (i >= params.dim) { return; }
   let flat = params.token_id * params.dim + i;
   output[i] = unpack_bf16(embeddings[flat / 2u], flat % 2u);
-}`,R=`
 struct ArgmaxResult { idx: u32, val: f32, }
 struct Params { dim: u32, }
@@ -338,7 +338,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
   if (i >= params.dim) { return; }
   let flat = argmax_result.idx * params.dim + i;
   output[i] = unpack_bf16(embeddings[flat / 2u], flat % 2u);
-}`,z=`
 struct Params { N: u32, }
 struct Result { idx: u32, val: f32, }
@@ -381,7 +381,7 @@ fn main(@builtin(local_invocation_id) lid: vec3u) {
     result.idx = s_idx[0];
     result.val = s_val[0];
   }
-}`,A=`
 struct Params { N: u32, }
 @group(0) @binding(0) var<storage, read> logits: array<f32>;
@@ -406,7 +406,7 @@ fn main(@builtin(local_invocation_id) lid: vec3u) {
   // Output (idx, val) pair — 256 candidates total
   result[tid * 2u] = best_idx;
   result[tid * 2u + 1u] = bitcast<u32>(best_val);
-}`,D=`
 struct ArgmaxResult { idx: u32, val: f32, }
 struct Params { recent_count: u32, history_slot: u32, }
@@ -437,7 +437,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
   let offset = params.position * total + i;
   k_cache[offset] = k_proj[i];
   v_cache[offset] = v_proj[i];
-}`,G=`
 struct Params {
   seq_len: u32,
   head_dim: u32,
@@ -583,7 +583,7 @@ fn main(@builtin(workgroup_id) wg: vec3u, @builtin(local_invocation_id) lid: vec
   }
   output[h * hd + tid] = acc / ws;
-}`,H=`
 struct Params {
   num_heads: u32,
   key_dim: u32,
@@ -669,7 +669,7 @@ fn main(@builtin(workgroup_id) wg: vec3u, @builtin(local_invocation_id) lid: vec
     }
     output[h * vd + vi] = o_val;
   }
-}`,U=`
 struct Params { num_heads: u32, head_dim: u32, eps: f32, }
 @group(0) @binding(0) var<storage, read_write> x: array<f32>;
@@ -710,7 +710,7 @@ fn main(@builtin(workgroup_id) wg: vec3u, @builtin(local_invocation_id) lid: vec
     let w = unpack_bf16(weight[i / 2u], i % 2u);
     x[off + i] = x[off + i] * rms * (1.0 + w);
   }
-}`,C=`
 struct Params { num_heads: u32, head_dim: u32, eps: f32, }
 @group(0) @binding(0) var<storage, read_write> x: array<f32>;
@@ -749,7 +749,7 @@ fn main(@builtin(workgroup_id) wg: vec3u, @builtin(local_invocation_id) lid: vec
     let w = unpack_bf16(weight[i / 2u], i % 2u);
     x[off + i] = x[off + i] * rms * w;
   }
-}`,O=`
 struct Params { channels: u32, }
 @group(0) @binding(0) var<storage, read_write> x: array<f32>;
@@ -785,7 +785,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
   hist[c] = h1;
   hist[ch + c] = h2;
   hist[2u * ch + c] = cur;
-}`,L=`
 struct Params {
   num_heads: u32,
   key_dim: u32,
@@ -1002,7 +1002,7 @@ fn main(@builtin(workgroup_id) wg: vec3u, @builtin(local_invocation_id) lid: vec
     }
     workgroupBarrier();
   }
-}`,Q=`
 struct Params { K: u32, N: u32, group_size: u32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
@@ -1056,7 +1056,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
   }
   output[col] = (gate_sum / (1.0 + exp(-gate_sum))) * up_sum;
-}`,I=`
 enable f16;
 struct Params { K: u32, N: u32, group_size: u32, }
@@ -1185,7 +1185,7 @@ fn main(@builtin(local_invocation_id) lid: vec3u,
     let u = partial[32u+tid] + partial[32u+tid+1u] + partial[32u+tid+2u] + partial[32u+tid+3u];
     output[col] = (g / (1.0 + exp(-g))) * u;
   }
-}`,W=`
 enable f16;
 struct Params { K: u32, N: u32, group_size: u32, }
@@ -1260,7 +1260,7 @@ fn main(@builtin(local_invocation_id) lid: vec3u,
     let u = partial[32u+tid] + partial[32u+tid+1u] + partial[32u+tid+2u] + partial[32u+tid+3u];
     output[col] = (g / (1.0 + exp(-g))) * u;
   }
-}`,V=`
 struct Params { N: u32, eps: f32, }
 @group(0) @binding(0) var<storage, read_write> hidden: array<f32>;
@@ -2087,7 +2087,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u,
   }
   output[col] = sum;
-}`,gr=`
 struct Params {
   K: u32, N: u32, group_size: u32, eps: f32,
   norm_weight: array<vec4<u32>, 640>,
@@ -2169,7 +2169,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u,
     }
   }
   output[col] = sum;
-}`,nr=`
 struct Params {
   K: u32, N: u32, group_size: u32, eps: f32,
   norm_weight: array<vec4<u32>, 640>,
@@ -2244,7 +2244,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u,
     }
   }
   output[col] = sum;
-}`,lr=`
 struct Params { num_tokens: u32, penalty: f32, presence: f32, _pad: u32, }
 @group(0) @binding(0) var<storage, read_write> logits: array<f32>;
@@ -2268,7 +2268,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
   // Presence penalty (additive)
   val -= params.presence;
   logits[tok] = val;
-}`,fr=`
 struct Params {
   temperature: f32,
   top_k: u32,
@@ -2404,16 +2404,21 @@ fn main(@builtin(local_invocation_id) lid: vec3u) {
     result.idx = wg_idx[selected];
     result.val = wg_val[selected];
   }
-}`;function b(p=320){return`
 struct Params {
   num_heads: u32,       // Q heads
   num_kv_heads: u32,    // KV heads
   head_dim: u32,
   eps: f32,
-  position: u32,        // KV cache write position (updated per token)
-  partial_dim: u32,     // RoPE partial rotation dimension
-  rope_theta: f32,      // RoPE base frequency
-  _pad1: u32,
   // Packed BF16 norm weights: [Q norm weights | K norm weights]
   // Q: numHeads*headDim BF16 values, K: numKVHeads*headDim BF16 values
   qk_norm_weight: array<vec4<u32>, ${p}>,
@@ -2476,14 +2481,19 @@ fn main(@builtin(workgroup_id) wg: vec3u, @builtin(local_invocation_id) lid: vec
     }
     workgroupBarrier();
-    // Phase 3: Apply RoPE to Q (rotate_half: dim i pairs with dim i+half_dim)
-    let q_half = params.partial_dim / 2u;
     for (var i = tid; i < hd; i += 256u) {
       var q_out = wg_data[i];
-      if (i < params.partial_dim) {
         let freq_idx = i % q_half;
-        let freq = 1.0 / pow(params.rope_theta, f32(2u * freq_idx) / f32(params.partial_dim));
-        let angle = f32(params.position) * freq;
         let cos_a = cos(angle);
         let sin_a = sin(angle);
         if (i < q_half) {
@@ -2525,16 +2535,20 @@ fn main(@builtin(workgroup_id) wg: vec3u, @builtin(local_invocation_id) lid: vec
     }
     workgroupBarrier();
-    // Phase 3: Apply RoPE to K, write to kProj + k_cache, copy V to v_cache
     let total = nkv * hd;
-    let cache_off = params.position * total + off;
-    let k_half = params.partial_dim / 2u;
     for (var i = tid; i < hd; i += 256u) {
       var k_out = wg_data[i];
-      if (i < params.partial_dim) {
         let freq_idx = i % k_half;
-        let freq = 1.0 / pow(params.rope_theta, f32(2u * freq_idx) / f32(params.partial_dim));
-        let angle = f32(params.position) * freq;
         let cos_a = cos(angle);
         let sin_a = sin(angle);
         if (i < k_half) {
@@ -2548,7 +2562,7 @@ fn main(@builtin(workgroup_id) wg: vec3u, @builtin(local_invocation_id) lid: vec
       v_cache[cache_off + i] = v_proj[off + i];
     }
   }
-}`}const pr=b(320),dr=`
 struct Params { K: u32, N: u32, group_size: u32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
@@ -3152,7 +3166,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
   output[col] = sum0;
   output[N + col] = sum1;
-}`,Nr=`
 struct Params { K: u32, N: u32, group_size: u32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
@@ -3207,7 +3221,7 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
   output[col] = sum0;
   output[N + col] = sum1;
-}`,Fr=`
 enable f16;
 struct Params { K: u32, N: u32, group_size: u32, }
@@ -3340,4 +3354,274 @@ fn main(@builtin(local_invocation_id) lid: vec3u, @builtin(workgroup_id) wid: ve
     output[col] = partial0[tid] + partial0[tid+1u] + partial0[tid+2u] + partial0[tid+3u];
     output[N + col] = partial1[tid] + partial1[tid+1u] + partial1[tid+2u] + partial1[tid+3u];
   }
-}`,Sr={gptq_matvec:h,gptq_matvec_f16:x,gptq_matvec_4t:dr,gptq_matvec_4t_f16:_r,gptq_splitk:q,reduce_splitk:y,bf16_matvec:N,rmsnorm:F,silu_mul:P,add:S,embedding:B,embed_from_argmax:R,argmax:z,topk_extract:A,kv_cache_store:T,gqa_attention_head:G,gqa_reduce:M,deltanet_recurrent:H,head_rmsnorm:U,head_rmsnorm_nogated:C,causal_conv1d:O,split:K,sigmoid_mul:E,fused_gate_up_silu:Q,fused_gate_up_silu_f16:I,fused_gate_up_silu_4t:j,fused_gate_up_silu_4t_f16:W,add_rmsnorm:V,add_rmsnorm_ro:Y,three_way_add_rmsnorm:X,norm_gptq_lite:gr,norm_gptq_lite_noadd:nr,fused_sigmoid_gptq:$,fused_sigmoid_gptq_f16:Z,fused_sigmoid_gptq_4t:ur,fused_sigmoid_gptq_4t_f16:tr,fused_silu_gptq:J,fused_silu_gptq_f16:rr,fused_silu_gptq_4t:ar,fused_silu_gptq_4t_f16:er,fused_addnorm_gate_up_silu:ir,rep_penalty:lr,gpu_sample:fr,append_token:D,fused_norm_gptq:sr,fused_norm_gptq_noadd:or,fused_conv_deltanet_norm:L,fused_split_qknorm_kvstore:pr,gptq_matvec_4t_f16_sk:cr,gather_rows_bf16:wr,quantize_bf16_to_int4:br,pack_f32_to_f16_pairs:vr,gptq_matmul_b2:Nr,gptq_matmul_b2_f16:Fr,gptq_matmul_b2_4t_f16:Pr,add_rmsnorm_b2:mr,add_rmsnorm_ro_b2:kr,three_way_add_rmsnorm_b2:hr,fused_gate_up_silu_b2_f16:xr,fused_silu_gptq_b2_f16:qr,fused_sigmoid_gptq_b2_f16:yr},Br=Object.freeze(Object.defineProperty({__proto__:null,SHADERS:Sr,SHADER_FUSED_SPLIT_QKNORM_KVSTORE_FN:b},Symbol.toStringTag,{value:"Module"}));export{v as G,Sr as S,Er as _,b as a,Br as b,Kr as g};

+(function(){const a=document.createElement("link").relList;if(a&&a.supports&&a.supports("modulepreload"))return;for(const r of document.querySelectorAll('link[rel="modulepreload"]'))u(r);new MutationObserver(r=>{for(const e of r)if(e.type==="childList")for(const i of e.addedNodes)i.tagName==="LINK"&&i.rel==="modulepreload"&&u(i)}).observe(document,{childList:!0,subtree:!0});function t(r){const e={};return r.integrity&&(e.integrity=r.integrity),r.referrerPolicy&&(e.referrerPolicy=r.referrerPolicy),r.crossOrigin==="use-credentials"?e.credentials="include":r.crossOrigin==="anonymous"?e.credentials="omit":e.credentials="same-origin",e}function u(r){if(r.ep)return;r.ep=!0;const e=t(r);fetch(r.href,e)}})();class v{constructor(){this.device=null,this.adapter=null,this.adapterInfo=null,this.pipelineCache=new Map,this.bufferCache=new Map,this.bindGroupCache=new Map}async init(){if(!navigator.gpu)throw new Error("WebGPU not supported");if(this.adapter=await navigator.gpu.requestAdapter({powerPreference:"high-performance"}),!this.adapter)throw new Error("No WebGPU adapter found");this.adapterInfo=await this.adapter.requestAdapterInfo?.()??{};const a={},t={maxBufferSize:4*1024*1024*1024,maxStorageBufferBindingSize:4*1024*1024*1024,maxComputeWorkgroupStorageSize:32768,maxComputeInvocationsPerWorkgroup:256,maxComputeWorkgroupSizeX:256,maxStorageBuffersPerShaderStage:10};for(const[r,e]of Object.entries(t))this.adapter.limits[r]!==void 0&&(a[r]=Math.min(e,this.adapter.limits[r]));const u=[];return this.adapter.features.has("shader-f16")&&u.push("shader-f16"),this.adapter.features.has("subgroups")&&u.push("subgroups"),this.device=await this.adapter.requestDevice({requiredLimits:a,requiredFeatures:u}),this.hasF16=this.device.features.has("shader-f16"),this.hasSubgroups=this.device.features.has("subgroups"),this.device.lost.then(r=>console.error("WebGPU device lost:",r)),this}createBuffer(a,t,u=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST){const 
r=Math.ceil(t/4)*4,e=this.device.createBuffer({size:r,usage:u,label:a});return this.bufferCache.set(a,e),e}createBufferFromData(a,t,u=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC){const r=this.createBuffer(a,t.byteLength,u);return this.device.queue.writeBuffer(r,0,t),r}createReadbackBuffer(a,t){const u=Math.ceil(t/4)*4;return this.device.createBuffer({size:u,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:a+"_readback"})}getOrCreatePipeline(a,t,u="main"){if(this.pipelineCache.has(a))return this.pipelineCache.get(a);const r=this.device.createShaderModule({code:t,label:a}),e=this.device.createComputePipeline({layout:"auto",compute:{module:r,entryPoint:u},label:a});return this.pipelineCache.set(a,e),e}initTimestamps(){this.device.features.has("timestamp-query")&&(this._tsQuerySet=this.device.createQuerySet({type:"timestamp",count:2}),this._tsResolveBuf=this.device.createBuffer({size:16,usage:GPUBufferUsage.QUERY_RESOLVE|GPUBufferUsage.COPY_SRC}),this._tsReadBuf=this.device.createBuffer({size:16,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST}),this._tsEnabled=!0,this._tsResults=[])}beginBatch(){this._encoder=this.device.createCommandEncoder(),this._passCount=0,this.singlePassMode&&(this._singlePass=this._encoder.beginComputePass()),this._tsEnabled&&this._encoder.beginComputePass({timestampWrites:{querySet:this._tsQuerySet,beginningOfPassWriteIndex:0}}).end()}endBatch(){this._singlePass&&(this._singlePass.end(),this._singlePass=null),this._tsEnabled&&this._encoder&&(this._encoder.beginComputePass({timestampWrites:{querySet:this._tsQuerySet,beginningOfPassWriteIndex:1}}).end(),this._encoder.resolveQuerySet(this._tsQuerySet,0,2,this._tsResolveBuf,0),this._encoder.copyBufferToBuffer(this._tsResolveBuf,0,this._tsReadBuf,0,16)),this._encoder&&(this.device.queue.submit([this._encoder.finish()]),this._encoder=null)}async readTimestamp(){if(!this._tsEnabled)return null;await this._tsReadBuf.mapAsync(GPUMapMode.READ);const a=new 
BigInt64Array(this._tsReadBuf.getMappedRange().slice(0));this._tsReadBuf.unmap();const u=Number(a[1]-a[0])/1e6;return this._tsResults.push(u),u}copyBuffer(a,t,u,r=0,e=0){if(this._singlePass){this._singlePass.end(),this._encoder.copyBufferToBuffer(a,r,t,e,u),this._singlePass=this._encoder.beginComputePass();return}const i=this._encoder||this.device.createCommandEncoder();i.copyBufferToBuffer(a,r,t,e,u),this._encoder||this.device.queue.submit([i.finish()])}startRecording(){this._recording=[]}stopRecording(){const a=this._recording;return this._recording=null,a}replay(a,t){if(t)for(const r of t)this.device.queue.writeBuffer(r.buffer,r.offset,r.data,r.dataOffset,r.size);const u=this._encoder;for(let r=0;r<a.length;r++){const e=a[r];if(e.multi){const i=u.beginComputePass(),s=e.ops;for(let o=0;o<s.length;o++){const n=s[o];i.setPipeline(n.pipeline),i.setBindGroup(0,n.bindGroup),i.dispatchWorkgroups(n.wgX,n.wgY)}i.end()}else{const i=u.beginComputePass();i.setPipeline(e.pipeline),i.setBindGroup(0,e.bindGroup),i.dispatchWorkgroups(e.wgX,e.wgY),i.end()}}}dispatch(a,t,u,r=1,e=1){if(this._recording&&this._recording.push({pipeline:a,bindGroup:t[0],wgX:u,wgY:r}),this._singlePass){const o=this._singlePass;this._passCount!==void 0&&this._passCount++,o.setPipeline(a);for(let n=0;n<t.length;n++)o.setBindGroup(n,t[n]);o.dispatchWorkgroups(u,r,e);return}const i=this._encoder||this.device.createCommandEncoder(),s=i.beginComputePass();this._passCount!==void 0&&this._passCount++,s.setPipeline(a);for(let o=0;o<t.length;o++)s.setBindGroup(o,t[o]);s.dispatchWorkgroups(u,r,e),s.end(),this._encoder||this.device.queue.submit([i.finish()])}dispatchMulti(a){if(this._recording&&this._recording.push({multi:!0,ops:a.map(r=>({pipeline:r.pipeline,bindGroup:r.bindGroups[0],wgX:r.workgroupsX,wgY:r.workgroupsY||1}))}),this._singlePass){this._passCount!==void 0&&this._passCount++;for(const r of a){this._singlePass.setPipeline(r.pipeline);for(let 
e=0;e<r.bindGroups.length;e++)this._singlePass.setBindGroup(e,r.bindGroups[e]);this._singlePass.dispatchWorkgroups(r.workgroupsX,r.workgroupsY||1,r.workgroupsZ||1)}return}const t=this._encoder||this.device.createCommandEncoder(),u=t.beginComputePass();this._passCount!==void 0&&this._passCount++;for(const r of a){u.setPipeline(r.pipeline);for(let e=0;e<r.bindGroups.length;e++)u.setBindGroup(e,r.bindGroups[e]);u.dispatchWorkgroups(r.workgroupsX,r.workgroupsY||1,r.workgroupsZ||1)}u.end(),this._encoder||this.device.queue.submit([t.finish()])}async readBuffer(a,t){const u=this.createReadbackBuffer("_readback",t),r=this.device.createCommandEncoder();r.copyBufferToBuffer(a,0,u,0,t),this.device.queue.submit([r.finish()]),await u.mapAsync(GPUMapMode.READ);const e=new Float32Array(u.getMappedRange().slice(0));return u.unmap(),u.destroy(),e}createBindGroup(a,t,u){return this.device.createBindGroup({layout:a.getBindGroupLayout(t),entries:u.map((r,e)=>({binding:e,resource:{buffer:r}}))})}createBindGroupWithOffsets(a,t,u){return this.device.createBindGroup({layout:a.getBindGroupLayout(t),entries:u.map((r,e)=>({binding:e,resource:r.buffer?{buffer:r.buffer,offset:r.offset||0,size:r.size}:{buffer:r}}))})}getCachedBindGroup(a,t,u,r){let e=this.bindGroupCache.get(a);return e||(e=this.createBindGroup(t,u,r),this.bindGroupCache.set(a,e)),e}destroy(){for(const a of this.bufferCache.values())a.destroy();this.bufferCache.clear(),this.bindGroupCache.clear(),this.device?.destroy()}}const Hr=Object.freeze(Object.defineProperty({__proto__:null,GPUContext:v},Symbol.toStringTag,{value:"Module"})),m="modulepreload",k=function(p,a){return new URL(p,a).href},w={},Mr=function(a,t,u){let r=Promise.resolve();if(t&&t.length>0){let n=function(g){return Promise.all(g.map(l=>Promise.resolve(l).then(f=>({status:"fulfilled",value:f}),f=>({status:"rejected",reason:f}))))};const 
i=document.getElementsByTagName("link"),s=document.querySelector("meta[property=csp-nonce]"),o=s?.nonce||s?.getAttribute("nonce");r=n(t.map(g=>{if(g=k(g,u),g in w)return;w[g]=!0;const l=g.endsWith(".css"),f=l?'[rel="stylesheet"]':"";if(u)for(let _=i.length-1;_>=0;_--){const c=i[_];if(c.href===g&&(!l||c.rel==="stylesheet"))return}else if(document.querySelector(`link[href="${g}"]${f}`))return;const d=document.createElement("link");if(d.rel=l?"stylesheet":m,l||(d.as="script"),d.crossOrigin="",d.href=g,o&&d.setAttribute("nonce",o),document.head.appendChild(d),l)return new Promise((_,c)=>{d.addEventListener("load",_),d.addEventListener("error",()=>c(new Error(`Unable to preload CSS for ${g}`)))})}))}function e(i){const s=new Event("vite:preloadError",{cancelable:!0});if(s.payload=i,window.dispatchEvent(s),!s.defaultPrevented)throw i}return r.then(i=>{for(const s of i||[])s.status==="rejected"&&e(s.reason);return a().catch(e)})},h=`
 struct Params { K: u32, N: u32, group_size: u32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
     sum += partials[s * params.N + col];
   }
   output[col] = sum;
+}`,F=`
 struct Params { K: u32, N: u32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
     sum += input[k] * w0 + input[k + 1u] * w1 + input[k + 2u] * w2 + input[k + 3u] * w3;
   }
   output[col] = sum;
+}`,N=`
 struct Params { N: u32, eps: f32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
   let i = gid.x;
   if (i >= params.N) { return; }
   a[i] = a[i] + b[i];
+}`,E=`
 struct Params { N: u32, num_heads: u32, head_dim: u32, }
 @group(0) @binding(0) var<storage, read> src: array<f32>;
   let d = i % hd;
   dst_a[i] = src[head * hd * 2u + d];
   dst_b[i] = src[head * hd * 2u + hd + d];
+}`,R=`
 struct Params { N: u32, }
 @group(0) @binding(0) var<storage, read> x: array<f32>;
   let i = gid.x;
   if (i >= params.N) { return; }
   output[i] = x[i] / (1.0 + exp(-gate[i]));
+}`,K=`
 struct Params { token_id: u32, dim: u32, }
 @group(0) @binding(0) var<storage, read> embeddings: array<u32>;
   if (i >= params.dim) { return; }
   let flat = params.token_id * params.dim + i;
   output[i] = unpack_bf16(embeddings[flat / 2u], flat % 2u);
+}`,A=`
 struct ArgmaxResult { idx: u32, val: f32, }
 struct Params { dim: u32, }
   if (i >= params.dim) { return; }
   let flat = argmax_result.idx * params.dim + i;
   output[i] = unpack_bf16(embeddings[flat / 2u], flat % 2u);
+}`,B=`
 struct Params { N: u32, }
 struct Result { idx: u32, val: f32, }
     result.idx = s_idx[0];
     result.val = s_val[0];
   }
+}`,D=`
 struct Params { N: u32, }
 @group(0) @binding(0) var<storage, read> logits: array<f32>;
   // Output (idx, val) pair — 256 candidates total
   result[tid * 2u] = best_idx;
   result[tid * 2u + 1u] = bitcast<u32>(best_val);
+}`,z=`
 struct ArgmaxResult { idx: u32, val: f32, }
 struct Params { recent_count: u32, history_slot: u32, }
   let offset = params.position * total + i;
   k_cache[offset] = k_proj[i];
   v_cache[offset] = v_proj[i];
+}`,H=`
 struct Params {
   seq_len: u32,
   head_dim: u32,
   }
   output[h * hd + tid] = acc / ws;
+}`,G=`
 struct Params {
   num_heads: u32,
   key_dim: u32,
     }
     output[h * vd + vi] = o_val;
   }
+}`,O=`
 struct Params { num_heads: u32, head_dim: u32, eps: f32, }
 @group(0) @binding(0) var<storage, read_write> x: array<f32>;
     let w = unpack_bf16(weight[i / 2u], i % 2u);
     x[off + i] = x[off + i] * rms * (1.0 + w);
   }
+}`,U=`
 struct Params { num_heads: u32, head_dim: u32, eps: f32, }
 @group(0) @binding(0) var<storage, read_write> x: array<f32>;
     let w = unpack_bf16(weight[i / 2u], i % 2u);
     x[off + i] = x[off + i] * rms * w;
   }
+}`,I=`
 struct Params { channels: u32, }
 @group(0) @binding(0) var<storage, read_write> x: array<f32>;
   hist[c] = h1;
   hist[ch + c] = h2;
   hist[2u * ch + c] = cur;
+}`,C=`
 struct Params {
   num_heads: u32,
   key_dim: u32,
     }
     workgroupBarrier();
   }
+}`,L=`
 struct Params { K: u32, N: u32, group_size: u32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
   }
   output[col] = (gate_sum / (1.0 + exp(-gate_sum))) * up_sum;
+}`,Q=`
 enable f16;
 struct Params { K: u32, N: u32, group_size: u32, }
     let u = partial[32u+tid] + partial[32u+tid+1u] + partial[32u+tid+2u] + partial[32u+tid+3u];
     output[col] = (g / (1.0 + exp(-g))) * u;
   }
+}`,V=`
 enable f16;
 struct Params { K: u32, N: u32, group_size: u32, }
     let u = partial[32u+tid] + partial[32u+tid+1u] + partial[32u+tid+2u] + partial[32u+tid+3u];
     output[col] = (g / (1.0 + exp(-g))) * u;
   }
+}`,W=`
 struct Params { N: u32, eps: f32, }
 @group(0) @binding(0) var<storage, read_write> hidden: array<f32>;
   }
   output[col] = sum;
+}`,nr=`
 struct Params {
   K: u32, N: u32, group_size: u32, eps: f32,
   norm_weight: array<vec4<u32>, 640>,
     }
   }
   output[col] = sum;
+}`,gr=`
 struct Params {
   K: u32, N: u32, group_size: u32, eps: f32,
   norm_weight: array<vec4<u32>, 640>,
     }
   }
   output[col] = sum;
+}`,dr=`
 struct Params { num_tokens: u32, penalty: f32, presence: f32, _pad: u32, }
 @group(0) @binding(0) var<storage, read_write> logits: array<f32>;
   // Presence penalty (additive)
   val -= params.presence;
   logits[tok] = val;
+}`,lr=`
 struct Params {
   temperature: f32,
   top_k: u32,
     result.idx = wg_idx[selected];
     result.val = wg_val[selected];
   }
+}`;function b(p=320,a=1e7,t=33,u=30,r=128){return`
+const ROPE_THETA: f32 = ${a};
+const MROPE_S1_LIMIT: u32 = ${t}u;
+const MROPE_S2_LIMIT: u32 = ${u}u;
+const PARTIAL_DIM: u32 = ${r}u;
 struct Params {
   num_heads: u32,       // Q heads
   num_kv_heads: u32,    // KV heads
   head_dim: u32,
   eps: f32,
+  cache_position: u32,  // KV cache write index (sequential, updated per token)
+  position: u32,        // RoPE temporal/text position (updated per token)
+  position_h: u32,      // mRoPE height dimension (= position for text tokens)
+  position_w: u32,      // mRoPE width dimension (= position for text tokens)
   // Packed BF16 norm weights: [Q norm weights | K norm weights]
   // Q: numHeads*headDim BF16 values, K: numKVHeads*headDim BF16 values
   qk_norm_weight: array<vec4<u32>, ${p}>,
     }
     workgroupBarrier();
+    // Phase 3: Apply mRoPE to Q (rotate_half: dim i pairs with dim i+half_dim)
+    // mRoPE: interleaved position selection based on freq_idx
+    let q_half = PARTIAL_DIM / 2u;
     for (var i = tid; i < hd; i += 256u) {
       var q_out = wg_data[i];
+      if (i < PARTIAL_DIM) {
         let freq_idx = i % q_half;
+        let freq = 1.0 / pow(ROPE_THETA, f32(2u * freq_idx) / f32(PARTIAL_DIM));
+        // mRoPE: select position based on interleaved section
+        var pos = params.position;
+        if (freq_idx % 3u == 1u && freq_idx < MROPE_S1_LIMIT) { pos = params.position_h; }
+        else if (freq_idx % 3u == 2u && freq_idx < MROPE_S2_LIMIT) { pos = params.position_w; }
+        let angle = f32(pos) * freq;
         let cos_a = cos(angle);
         let sin_a = sin(angle);
         if (i < q_half) {
     }
     workgroupBarrier();
+    // Phase 3: Apply mRoPE to K, write to kProj + k_cache, copy V to v_cache
     let total = nkv * hd;
+    let cache_off = params.cache_position * total + off;
+    let k_half = PARTIAL_DIM / 2u;
     for (var i = tid; i < hd; i += 256u) {
       var k_out = wg_data[i];
+      if (i < PARTIAL_DIM) {
         let freq_idx = i % k_half;
+        let freq = 1.0 / pow(ROPE_THETA, f32(2u * freq_idx) / f32(PARTIAL_DIM));
+        // mRoPE: select position based on interleaved section
+        var pos = params.position;
+        if (freq_idx % 3u == 1u && freq_idx < MROPE_S1_LIMIT) { pos = params.position_h; }
+        else if (freq_idx % 3u == 2u && freq_idx < MROPE_S2_LIMIT) { pos = params.position_w; }
+        let angle = f32(pos) * freq;
         let cos_a = cos(angle);
         let sin_a = sin(angle);
         if (i < k_half) {
       v_cache[cache_off + i] = v_proj[off + i];
     }
   }
+}`}const pr=b(320),fr=`
 struct Params { K: u32, N: u32, group_size: u32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
   output[col] = sum0;
   output[N + col] = sum1;
+}`,Fr=`
 struct Params { K: u32, N: u32, group_size: u32, }
 @group(0) @binding(0) var<storage, read> input: array<f32>;
   output[col] = sum0;
   output[N + col] = sum1;
+}`,Nr=`
 enable f16;
 struct Params { K: u32, N: u32, group_size: u32, }
     output[col] = partial0[tid] + partial0[tid+1u] + partial0[tid+2u] + partial0[tid+3u];
     output[N + col] = partial1[tid] + partial1[tid+1u] + partial1[tid+2u] + partial1[tid+3u];
   }
+}`,Sr=`
+// LayerNorm shader (registered as vit_layernorm): normalizes each row of N
+// f32 values and applies a per-element scale and bias.
+// N is the row length; eps is added to the variance before the square root.
+struct Params { N: u32, eps: f32, }
+// Rows are stored back to back: element i of a row lives at row * N + i.
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+// Scale and bias hold two BF16 values per u32 word (element i is read from word i >> 1).
+@group(0) @binding(1) var<storage, read> weight: array<u32>;  // BF16 packed
+@group(0) @binding(2) var<storage, read> bias: array<u32>;    // BF16 packed
+// Same layout as input.
+@group(0) @binding(3) var<storage, read_write> output: array<f32>;
+@group(0) @binding(4) var<uniform> params: Params;
+// Decode one BF16 value from a u32 word that packs two of them.
+//   packed: word holding element 2j in bits 0..15 and element 2j+1 in bits 16..31
+//   idx:    element index; only its lowest bit (which half of the word) is used
+// Returns the value widened to f32 (BF16 is the upper 16 bits of an f32).
+fn unpack_bf16(packed: u32, idx: u32) -> f32 {
+  let shift = (idx & 1u) * 16u;
+  // Isolate the selected half first, then move it into the high 16 bits.
+  // The previous form masked with 0xFFFF0000 after shifting the upper half
+  // down, which zeroed every odd-indexed value, and it mixed shift and '&'
+  // without the parentheses WGSL requires.
+  return bitcast<f32>(((packed >> shift) & 0xFFFFu) << 16u);
+}
+// Per-invocation partial sums, reduced to element 0 by the tree reduction below.
+var<workgroup> shared_sum: array<f32, 256>;
+var<workgroup> shared_sq: array<f32, 256>;
+// One workgroup normalizes one row: workgroup_id.x selects the row and the
+// 256 invocations stride across its N elements.
+@compute @workgroup_size(256)
+fn main(@builtin(local_invocation_id) lid: vec3u, @builtin(workgroup_id) wid: vec3u) {
+  let N = params.N;
+  let token = wid.x;
+  let base = token * N;
+  let tid = lid.x;
+  // Phase 1: each invocation accumulates sum and sum-of-squares over its stride.
+  var s: f32 = 0.0;
+  var sq: f32 = 0.0;
+  for (var i = tid; i < N; i += 256u) {
+    let v = input[base + i];
+    s += v;
+    sq += v * v;
+  }
+  shared_sum[tid] = s;
+  shared_sq[tid] = sq;
+  workgroupBarrier();
+  // Tree reduction: halve the active range each step until element 0 holds the totals.
+  for (var stride: u32 = 128u; stride > 0u; stride >>= 1u) {
+    if (tid < stride) {
+      shared_sum[tid] += shared_sum[tid + stride];
+      shared_sq[tid] += shared_sq[tid + stride];
+    }
+    workgroupBarrier();
+  }
+  let mean = shared_sum[0] / f32(N);
+  // E[x^2] - mean^2 can round to a small negative number in f32 when the row
+  // is nearly constant; clamp so sqrt never sees a negative argument (NaN).
+  let variance = max(shared_sq[0] / f32(N) - mean * mean, 0.0);
+  let inv_std = 1.0 / sqrt(variance + params.eps);
+  // Phase 2: normalize and apply weight + bias.
+  for (var i = tid; i < N; i += 256u) {
+    let v = input[base + i];
+    let normalized = (v - mean) * inv_std;
+    let w = unpack_bf16(weight[i >> 1u], i);
+    let b = unpack_bf16(bias[i >> 1u], i);
+    output[base + i] = normalized * w + b;
+  }
+}`,Er=`
+// Dense BF16 linear layer with bias (registered as vit_bf16_matvec_bias).
+// Invocation (x, y) produces output feature x of input row y.
+// K is the input width and N the output width; the inner loop consumes four
+// inputs per step, so K is expected to be a multiple of 4.
+struct Params { K: u32, N: u32, }
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read> weight: array<u32>;  // BF16 packed [N, K/2]
+@group(0) @binding(2) var<storage, read> bias: array<u32>;    // BF16 packed [N/2]
+@group(0) @binding(3) var<storage, read_write> output: array<f32>;
+@group(0) @binding(4) var<uniform> params: Params;
+@compute @workgroup_size(32)
+fn main(@builtin(global_invocation_id) gid: vec3u) {
+  let out_feature = gid.x;
+  let row = gid.y;
+  if (out_feature >= params.N) { return; }
+  let in_dim = params.K;
+  let x_off = row * in_dim;
+  let w_off = out_feature * in_dim / 2u;
+  var acc: f32 = 0.0;
+  var k: u32 = 0u;
+  while (k < in_dim) {
+    // Two packed words carry the four BF16 weights for inputs k .. k+3.
+    let word_a = weight[w_off + k / 2u];
+    let word_b = weight[w_off + k / 2u + 1u];
+    let wa_lo = bitcast<f32>((word_a & 0xFFFFu) << 16u);
+    let wa_hi = bitcast<f32>(word_a & 0xFFFF0000u);
+    let wb_lo = bitcast<f32>((word_b & 0xFFFFu) << 16u);
+    let wb_hi = bitcast<f32>(word_b & 0xFFFF0000u);
+    acc += input[x_off + k] * wa_lo + input[x_off + k + 1u] * wa_hi
+         + input[x_off + k + 2u] * wb_lo + input[x_off + k + 3u] * wb_hi;
+    k += 4u;
+  }
+  // Pick the low or high half of the bias word by the parity of the feature index.
+  let bias_word = bias[out_feature >> 1u];
+  let bias_bits = ((bias_word >> ((out_feature & 1u) * 16u)) & 0xFFFFu) << 16u;
+  output[row * params.N + out_feature] = acc + bitcast<f32>(bias_bits);
+}`,Rr=`
+// Element-wise GELU, tanh approximation (registered as vit_gelu_tanh):
+//   gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+// len is the number of elements to process; extra invocations exit early.
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+@group(0) @binding(2) var<uniform> len: u32;
+const SQRT_2_OVER_PI: f32 = 0.7978845608;
+const COEFF: f32 = 0.044715;
+// tanh(10) already rounds to exactly 1.0 in f32, so clamping the argument to
+// this bound does not change any result.
+const TANH_ARG_LIMIT: f32 = 10.0;
+@compute @workgroup_size(256)
+fn main(@builtin(global_invocation_id) gid: vec3u) {
+  let i = gid.x;
+  if (i >= len) { return; }
+  let x = input[i];
+  // The cubic term makes the argument grow quickly (|x| around 13 already
+  // exceeds 88), and tanh is allowed to be evaluated as sinh/cosh, which
+  // overflows to inf/inf = NaN there. Clamp before calling tanh.
+  let inner = clamp(SQRT_2_OVER_PI * (x + COEFF * x * x * x), -TANH_ARG_LIMIT, TANH_ARG_LIMIT);
+  output[i] = 0.5 * x * (1.0 + tanh(inner));
+}`,Kr=`
+// Element-wise GELU (registered as vit_gelu), computed with the tanh approximation:
+//   gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+// NOTE(review): the body matches the vit_gelu_tanh shader exactly; confirm the
+// approximation (rather than the erf form) is intended for this entry.
+// len is the number of elements to process; extra invocations exit early.
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+@group(0) @binding(2) var<uniform> len: u32;
+const SQRT_2_OVER_PI: f32 = 0.7978845608;
+const COEFF: f32 = 0.044715;
+// tanh(10) already rounds to exactly 1.0 in f32, so clamping the argument to
+// this bound does not change any result.
+const TANH_ARG_LIMIT: f32 = 10.0;
+@compute @workgroup_size(256)
+fn main(@builtin(global_invocation_id) gid: vec3u) {
+  let i = gid.x;
+  if (i >= len) { return; }
+  let x = input[i];
+  // The cubic term makes the argument grow quickly (|x| around 13 already
+  // exceeds 88), and tanh is allowed to be evaluated as sinh/cosh, which
+  // overflows to inf/inf = NaN there. Clamp before calling tanh.
+  let inner = clamp(SQRT_2_OVER_PI * (x + COEFF * x * x * x), -TANH_ARG_LIMIT, TANH_ARG_LIMIT);
+  output[i] = 0.5 * x * (1.0 + tanh(inner));
+}`,Ar=`
+// In-place rotate-half rotary embedding for Q and K (registered as vit_rope).
+// q and k are indexed as ((seq * num_heads + head) * head_dim + d).
+// head_dim is expected to be even: dimension d pairs with d + head_dim / 2.
+struct Params { seq_len: u32, num_heads: u32, head_dim: u32, }
+@group(0) @binding(0) var<storage, read_write> q: array<f32>;
+@group(0) @binding(1) var<storage, read_write> k: array<f32>;
+@group(0) @binding(2) var<storage, read> cos_buf: array<f32>;
+@group(0) @binding(3) var<storage, read> sin_buf: array<f32>;
+@group(0) @binding(4) var<uniform> params: Params;
+@compute @workgroup_size(256)
+fn main(@builtin(global_invocation_id) gid: vec3u) {
+  let idx = gid.x;
+  let total = params.seq_len * params.num_heads * params.head_dim;
+  if (idx >= total) { return; }
+  let half_dim = params.head_dim / 2u;
+  let d = idx % params.head_dim;
+  // One invocation owns a whole (d, d + half_dim) pair. Previously every
+  // element had its own invocation that read its partner while the partner's
+  // invocation was overwriting it in the same buffer, which is a data race.
+  // The dispatch size is unchanged; upper-half invocations simply exit.
+  if (d >= half_dim) { return; }
+  let head_and_seq = idx / params.head_dim;
+  let seq = head_and_seq / params.num_heads;
+  let lo = idx;
+  let hi = idx + half_dim;
+  // cos/sin are [seq_len, head_dim] and shared by all heads.
+  let cs_lo = seq * params.head_dim + d;
+  let cs_hi = cs_lo + half_dim;
+  let c_lo = cos_buf[cs_lo];
+  let s_lo = sin_buf[cs_lo];
+  let c_hi = cos_buf[cs_hi];
+  let s_hi = sin_buf[cs_hi];
+  // Read both halves before writing either one.
+  let q_lo = q[lo];
+  let q_hi = q[hi];
+  q[lo] = q_lo * c_lo - q_hi * s_lo;
+  q[hi] = q_hi * c_hi + q_lo * s_hi;
+  let k_lo = k[lo];
+  let k_hi = k[hi];
+  k[lo] = k_lo * c_lo - k_hi * s_lo;
+  k[hi] = k_hi * c_hi + k_lo * s_hi;
+}`,Br=`
+// Scaled dot-product attention (registered as vit_attention).
+// One workgroup handles one (query position, head): workgroup_id.x is the
+// query position and workgroup_id.y the head. Every query attends to all
+// seq_len keys; no mask is applied.
+// q, k, v and output are indexed as ((pos * num_heads + head) * head_dim + d).
+// head_dim must not exceed 128, the size of the accumulator array.
+struct Params { seq_len: u32, num_heads: u32, head_dim: u32, scale: f32, }
+@group(0) @binding(0) var<storage, read> q: array<f32>;
+@group(0) @binding(1) var<storage, read> k: array<f32>;
+@group(0) @binding(2) var<storage, read> v: array<f32>;
+@group(0) @binding(3) var<storage, read_write> output: array<f32>;
+@group(0) @binding(4) var<uniform> params: Params;
+@compute @workgroup_size(256)
+fn main(@builtin(local_invocation_id) lid: vec3u, @builtin(workgroup_id) wid: vec3u) {
+  // The computation is sequential per query and only invocation 0 stores the
+  // result, so the other invocations exit instead of repeating the same work.
+  // No barrier follows, so the early return is safe. The earlier per-thread
+  // score/max pre-pass was never read and has been removed.
+  if (lid.x != 0u) { return; }
+  let q_pos = wid.x;
+  let head = wid.y;
+  let S = params.seq_len;
+  let H = params.num_heads;
+  let D = params.head_dim;
+  let scale = params.scale;
+  // Offset of this head's query vector; the output row uses the same offset.
+  let q_base = (q_pos * H + head) * D;
+  // Online softmax: keep a running max and running sum, rescaling the
+  // accumulated weighted V whenever the max increases.
+  var running_max: f32 = -1e30;
+  var running_sum: f32 = 0.0;
+  var acc = array<f32, 128>(); // max head_dim = 128
+  for (var kp: u32 = 0u; kp < S; kp++) {
+    let kv_base = (kp * H + head) * D;
+    var dot: f32 = 0.0;
+    for (var d: u32 = 0u; d < D; d++) {
+      dot += q[q_base + d] * k[kv_base + d];
+    }
+    let s = dot * scale;
+    let old_max = running_max;
+    running_max = max(running_max, s);
+    let correction = exp(old_max - running_max);
+    // Rescale what has been accumulated so far to the new max.
+    running_sum = running_sum * correction;
+    for (var d: u32 = 0u; d < D; d++) {
+      acc[d] = acc[d] * correction;
+    }
+    let w = exp(s - running_max);
+    running_sum += w;
+    for (var d: u32 = 0u; d < D; d++) {
+      acc[d] += w * v[kv_base + d];
+    }
+  }
+  // Normalize by the softmax denominator and store.
+  let inv_sum = 1.0 / running_sum;
+  for (var d: u32 = 0u; d < D; d++) {
+    output[q_base + d] = acc[d] * inv_sum;
+  }
+}`,Dr=`
+// In-place element-wise add (registered as vit_add): a[i] = a[i] + b[i] for i < len.
+@group(0) @binding(0) var<storage, read_write> a: array<f32>;
+@group(0) @binding(1) var<storage, read> b: array<f32>;
+@group(0) @binding(2) var<uniform> len: u32;
+@compute @workgroup_size(256)
+fn main(@builtin(global_invocation_id) gid: vec3u) {
+  let idx = gid.x;
+  // Invocations past the end of the data do nothing.
+  if (idx < len) {
+    a[idx] = a[idx] + b[idx];
+  }
+}`,zr=`
+// Row scatter (registered as vit_scatter_embed): copies row r of vision into
+// row scatter_indices[r] of embeds. Rows are H floats wide; one workgroup
+// copies one row, with its 256 invocations striding across the columns.
+struct Params { H: u32, }
+@group(0) @binding(0) var<storage, read> vision: array<f32>;
+@group(0) @binding(1) var<storage, read> scatter_indices: array<u32>;
+@group(0) @binding(2) var<storage, read_write> embeds: array<f32>;
+@group(0) @binding(3) var<uniform> params: Params;
+@compute @workgroup_size(256)
+fn main(@builtin(local_invocation_id) lid: vec3u, @builtin(workgroup_id) wid: vec3u) {
+  let width = params.H;
+  let src_row = wid.x;
+  let src_off = src_row * width;
+  let dst_off = scatter_indices[src_row] * width;
+  var col = lid.x;
+  while (col < width) {
+    embeds[dst_off + col] = vision[src_off + col];
+    col += 256u;
+  }
+}`,/* Tr is the SHADERS registry: it maps each pipeline name to the WGSL source string defined above. Gr is a frozen, module-namespace-style object exposing the registry as SHADERS and the shader factory b as SHADER_FUSED_SPLIT_QKNORM_KVSTORE_FN. The export list publishes both under the short names other bundles import. */Tr={gptq_matvec:h,gptq_matvec_f16:x,gptq_matvec_4t:fr,gptq_matvec_4t_f16:_r,gptq_splitk:q,reduce_splitk:y,bf16_matvec:F,rmsnorm:N,silu_mul:P,add:S,embedding:K,embed_from_argmax:A,argmax:B,topk_extract:D,kv_cache_store:T,gqa_attention_head:H,gqa_reduce:M,deltanet_recurrent:G,head_rmsnorm:O,head_rmsnorm_nogated:U,causal_conv1d:I,split:E,sigmoid_mul:R,fused_gate_up_silu:L,fused_gate_up_silu_f16:Q,fused_gate_up_silu_4t:j,fused_gate_up_silu_4t_f16:V,add_rmsnorm:W,add_rmsnorm_ro:Y,three_way_add_rmsnorm:X,norm_gptq_lite:nr,norm_gptq_lite_noadd:gr,fused_sigmoid_gptq:$,fused_sigmoid_gptq_f16:Z,fused_sigmoid_gptq_4t:ur,fused_sigmoid_gptq_4t_f16:tr,fused_silu_gptq:J,fused_silu_gptq_f16:rr,fused_silu_gptq_4t:ar,fused_silu_gptq_4t_f16:er,fused_addnorm_gate_up_silu:ir,rep_penalty:dr,gpu_sample:lr,append_token:z,fused_norm_gptq:sr,fused_norm_gptq_noadd:or,fused_conv_deltanet_norm:C,fused_split_qknorm_kvstore:pr,gptq_matvec_4t_f16_sk:cr,gather_rows_bf16:wr,quantize_bf16_to_int4:br,pack_f32_to_f16_pairs:vr,gptq_matmul_b2:Fr,gptq_matmul_b2_f16:Nr,gptq_matmul_b2_4t_f16:Pr,add_rmsnorm_b2:mr,add_rmsnorm_ro_b2:kr,three_way_add_rmsnorm_b2:hr,fused_gate_up_silu_b2_f16:xr,fused_silu_gptq_b2_f16:qr,fused_sigmoid_gptq_b2_f16:yr,vit_layernorm:Sr,vit_bf16_matvec_bias:Er,vit_gelu_tanh:Rr,vit_gelu:Kr,vit_rope:Ar,vit_attention:Br,vit_add:Dr,vit_scatter_embed:zr},Gr=Object.freeze(Object.defineProperty({__proto__:null,SHADERS:Tr,SHADER_FUSED_SPLIT_QKNORM_KVSTORE_FN:b},Symbol.toStringTag,{value:"Module"}));export{v as G,Tr as S,Mr as _,b as a,Gr as b,Hr as g};

assets/{main-DTfqdn80.js → main-D60Okk_s.js} RENAMED Viewed

The diff for this file is too large to render. See raw diff

assets/qwen35-model-CmeFfImT.js ADDED Viewed

	@@ -0,0 +1 @@

+ import{S as H,a as W,_ as A}from"./gpu-ops-PQDFq1iI.js";function K(E){const a=E<<16,e=new ArrayBuffer(4);new Uint32Array(e)[0]=a,new Float32Array(e)[0];const t=a>>>31&1,s=a>>>23&255,i=a&8388607;if(s===0)return t<<15;if(s===255)return t<<15|31744|(i?512:0);const n=s-127+15;return n>=31?t<<15|31744:n<=0?t<<15:t<<15|n<<10|i>>>13}class N{constructor(a,e,t){if(this.gpu=a,this.config=e,this.textCfg=e.text_config,this.quantConfig=t,this.hiddenSize=this.textCfg.hidden_size,this.intermediateSize=this.textCfg.intermediate_size,this.numLayers=this.textCfg.num_hidden_layers,this.numHeads=this.textCfg.num_attention_heads,this.numKVHeads=this.textCfg.num_key_value_heads,this.headDim=this.textCfg.head_dim,this.vocabSize=this.textCfg.vocab_size,this.rmsEps=this.textCfg.rms_norm_eps,this.ropeTheta=this.textCfg.rope_parameters?.rope_theta||1e7,this.partialRotary=this.textCfg.partial_rotary_factor,this.partialDim=Math.floor(this.headDim*this.partialRotary),this.mropeSection=this.textCfg.rope_parameters?.mrope_section||[11,11,10],this.layerTypes=this.textCfg.layer_types,this.groupSize=t?.group_size||t?.config_groups?.group_0?.weights?.group_size||128,this.fp16Layers=new Set,t?.extra_config)for(const u of Object.keys(t.extra_config))this.fp16Layers.add(u);this.weights={},this.kvCache={},this.seqLen=0,this.pipelines={},this.singlePassMode=!0,this._replayFlat=null,this._forceMinSplits=0,this._ropeDelta=0;const s=this.numHeads*this.headDim,i=this.numKVHeads*this.headDim,n=(s+i)/2,r=Math.ceil(n/4);this._splitQKNormShaderKey=`fused_split_qknorm_kvstore_${r}`,H[this._splitQKNormShaderKey]||(H[this._splitQKNormShaderKey]=W(r))}compilePipelines(){const a=this.gpu;for(const[e,t]of Object.entries(H))this.pipelines[e]=a.getOrCreatePipeline(e,t)}uploadTensors(a){this.quantConfig?.quant_method==="compressed-tensors"&&(a=this._convertCompressedTensors(a));for(const[t,s]of Object.entries(a)){let 
i=s.data;if(s.dtype==="F32"&&!t.endsWith(".qweight")&&!t.endsWith(".qzeros")&&!t.endsWith(".scales")){const n=new Float32Array(i.buffer,i.byteOffset,i.byteLength/4),r=new Uint16Array(n.length);for(let u=0;u<n.length;u++){const o=new Uint32Array(n.buffer,n.byteOffset+u*4,1)[0];r[u]=o>>>16}i=new Uint8Array(r.buffer)}if(s._partial){let{offset:n,totalSize:r}=s._partial;if(s.dtype==="F32"&&!t.endsWith(".qweight")&&!t.endsWith(".qzeros")&&!t.endsWith(".scales")&&(n/=2,r/=2),n===0){const o=this.gpu.createBuffer(t,r,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC);this.gpu.device.queue.writeBuffer(o,0,i),this.weights[t]=o}else{const o=this.weights[t];o&&this.gpu.device.queue.writeBuffer(o,n,i)}}else this.weights[t]=this.gpu.createBufferFromData(t,i);(t.includes("post_attention_layernorm.weight")||t.includes("input_layernorm.weight")||t.includes("q_norm.weight")||t.includes("k_norm.weight")||t==="model.language_model.norm.weight")&&(this._normWeightRaw||(this._normWeightRaw={}),this._normWeightRaw[t]=new Uint32Array(i.buffer.slice(i.byteOffset,i.byteOffset+i.byteLength))),this._uploadedCount=(this._uploadedCount||0)+1}}_convertCompressedTensors(a){const e={};for(const[t,s]of Object.entries(a))if(!t.endsWith(".weight_shape")){if(t.endsWith(".weight_packed")){const i=t.slice(0,-14),n=s.shape,r=n[0],u=n[1],o=new Int32Array(s.data.buffer,s.data.byteOffset,s.data.byteLength/4),h=new Int32Array(r*u);for(let m=0;m<r;m++)for(let c=0;c<u;c++)h[c*r+m]=o[m*u+c];e[`${i}.qweight`]={dtype:"I32",shape:[u,r],data:new Uint8Array(h.buffer)};continue}if(t.endsWith(".weight_scale")){const i=t.slice(0,-13),n=s.shape,r=n[0],u=n[1],o=new Uint16Array(s.data.buffer,s.data.byteOffset,s.data.byteLength/2),h=Math.ceil(r/2),m=new Uint32Array(u*h);for(let c=0;c<u;c++)for(let f=0;f<r;f+=2){const p=o[f*u+c],w=f+1<r?o[(f+1)*u+c]:0,B=K(p),q=K(w);m[c*h+(f>>1)]=B|q<<16}e[`${i}.scales`]={dtype:"I32",shape:[u,h],data:new Uint8Array(m.buffer)};continue}e[t]=s}return e}async 
postProcessWeights(){const a=this.textCfg.linear_num_key_heads||0,e=this.hiddenSize,t=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,s=this.layerTypes.indexOf("linear_attention"),i=s>=0?`model.language_model.layers.${s}.linear_attn`:"";if(this.abQuantized=i&&!!this.weights[`${i}.in_proj_a.qweight`],this.abQuantized){const g=this.textCfg.linear_num_value_heads??a;this.linAlpha=this.gpu.createBuffer("lin_alpha",g*4,t),this.linBeta=this.gpu.createBuffer("lin_beta",g*4,t)}else{this.linABWeight={};const g=this.textCfg.linear_num_value_heads??a,P=e*2,S=g*P,b=2*S;for(let v=0;v<this.numLayers;v++)if(this.layerTypes[v]==="linear_attention"){const C=`model.language_model.layers.${v}.linear_attn`,D=this.weights[`${C}.in_proj_a.weight`],k=this.weights[`${C}.in_proj_b.weight`];if(D&&k){const z=this.gpu.createBuffer(`ab_merged_${v}`,b,t),M=this.gpu.device.createCommandEncoder();M.copyBufferToBuffer(D,0,z,0,S),M.copyBufferToBuffer(k,0,z,S,S),this.gpu.device.queue.submit([M.finish()]),this.linABWeight[v]=z}}}{const g=[];for(let b=0;b<this.numLayers;b++){if(this.layerTypes[b]==="linear_attention"){const v=`model.language_model.layers.${b}.linear_attn`,C=this.textCfg.linear_num_key_heads||0,D=this.textCfg.linear_key_head_dim||128,k=this.textCfg.linear_value_head_dim||128,z=this.textCfg.linear_num_value_heads??C,M=z*k,$=z/C*k,O=C*(D+D+$);g.push({prefix:`${v}.in_proj_qkv`,K:e,N:O}),g.push({prefix:`${v}.in_proj_z`,K:e,N:M}),g.push({prefix:`${v}.out_proj`,K:M,N:e})}else{const 
v=`model.language_model.layers.${b}.self_attn`,C=this.numHeads*this.headDim*2,D=this.numKVHeads*this.headDim;g.push({prefix:`${v}.q_proj`,K:e,N:C}),g.push({prefix:`${v}.k_proj`,K:e,N:D}),g.push({prefix:`${v}.v_proj`,K:e,N:D}),g.push({prefix:`${v}.o_proj`,K:this.numHeads*this.headDim,N:e})}g.push({prefix:`model.language_model.layers.${b}.mlp.gate_proj`,K:e,N:this.intermediateSize}),g.push({prefix:`model.language_model.layers.${b}.mlp.up_proj`,K:e,N:this.intermediateSize}),g.push({prefix:`model.language_model.layers.${b}.mlp.down_proj`,K:this.intermediateSize,N:e})}let P=0;const S=performance.now();for(const{prefix:b,K:v,N:C}of g)if(!this.weights[`${b}.qweight`]&&this.weights[`${b}.weight`]){const{qweight:D,scales:k}=await this._quantizeBF16ToINT4(this.weights[`${b}.weight`],v,C,this.groupSize,b.replace(/\./g,"_"));this.weights[`${b}.qweight`]=D,this.weights[`${b}.scales`]=k,P++}P>0&&console.log(`[QUANT] GPU-quantized ${P} BF16 projections to INT4 in ${(performance.now()-S).toFixed(0)}ms`)}this._mergedGateUp={};const n=this.intermediateSize,r=e/8,u=r*n*4,h=r/(this.groupSize/8)*n*2;for(let g=0;g<this.numLayers;g++){const P=`model.language_model.layers.${g}.mlp`,S=this.getQWeight(`${P}.gate_proj`),b=this.getQWeight(`${P}.up_proj`);if(S.qweight&&b.qweight){const v=this.gpu.createBuffer(`merged_qw_${g}`,u*2,t),C=this.gpu.createBuffer(`merged_sc_${g}`,h*2,t),D=this.gpu.device.createCommandEncoder();D.copyBufferToBuffer(S.qweight,0,v,0,u),D.copyBufferToBuffer(b.qweight,0,v,u,u),D.copyBufferToBuffer(S.scales,0,C,0,h),D.copyBufferToBuffer(b.scales,0,C,h,h),this.gpu.device.queue.submit([D.finish()]),this._mergedGateUp[g]={qweight:v,scales:C}}}this._fusedMLPParams={};const m=16+512*16;for(let g=0;g<this.numLayers;g++){const P=`model.language_model.layers.${g}.post_attention_layernorm.weight`,S=this._normWeightRaw?.[P];if(!S||!this._mergedGateUp[g])continue;const b=new ArrayBuffer(m),v=new Uint32Array(b),C=new 
Float32Array(b);v[0]=e,v[1]=n,v[2]=this.groupSize,C[3]=this.rmsEps;for(let D=0;D<S.length;D++)v[4+D]=S[D];this._fusedMLPParams[g]=this.gpu.createBufferFromData(`fused_mlp_params_${g}`,new Uint32Array(b),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}this._fusedSQKParams={};const c=this.headDim,f=this.numHeads,p=this.numKVHeads,w=f*c,B=p*c,q=(w+B)/2,_=Math.ceil(q/4),l=32+_*16,d=this.mropeSection[1]*3,y=this.mropeSection[2]*3,U=`fused_split_qknorm_kvstore_${_}`;H[U]||(H[U]=W(_,this.ropeTheta,d,y,this.partialDim)),this.pipelines[U]||(this.pipelines[U]=this.gpu.getOrCreatePipeline(U,H[U])),this._splitQKNormShaderKey=U;for(let g=0;g<this.numLayers;g++){if(this.layerTypes[g]!=="full_attention")continue;const P=`model.language_model.layers.${g}.self_attn`,S=`${P}.q_norm.weight`,b=`${P}.k_norm.weight`,v=this._normWeightRaw?.[S],C=this._normWeightRaw?.[b],D=new ArrayBuffer(l),k=new DataView(D);if(k.setUint32(0,f,!0),k.setUint32(4,p,!0),k.setUint32(8,c,!0),k.setFloat32(12,this.rmsEps,!0),k.setUint32(16,0,!0),k.setUint32(20,0,!0),k.setUint32(24,0,!0),k.setUint32(28,0,!0),v)for(let M=0;M<w/2;M++){const $=Math.floor(M/4),O=M%4;k.setUint32(32+$*16+O*4,v[M],!0)}if(C){const M=w/2;for(let $=0;$<B/2;$++){const O=M+$,G=Math.floor(O/4),R=O%4;k.setUint32(32+G*16+R*4,C[$],!0)}}const z=this.gpu.device.createBuffer({size:l,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:`fused_sqk_${g}`});this.gpu.device.queue.writeBuffer(z,0,new Uint8Array(D)),this._fusedSQKParams[g]=z}await this._quantizeLmHead()}async _quantizeLmHead(){const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight";if(!this.weights[e])return;const t=this.hiddenSize,s=this.vocabSize,i=this.groupSize,n=t/8,r=t/i,u=performance.now(),o=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,h=this.weights[e];this._lmHeadQWeight=this.gpu.createBuffer("lmhead_qweight",n*s*4,o);const 
m=this.gpu.createBuffer("lmhead_scales_f32",r*s*4,o),c=Math.ceil(r*s/2)*4;this._lmHeadScales=this.gpu.createBuffer("lmhead_scales",c,o);const f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await A(async()=>{const{SHADERS:D}=await import("./gpu-ops-PQDFq1iI.js").then(k=>k.b);return{SHADERS:D}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),p=this.gpu.createBufferFromData("quant_params",new Uint32Array([t,s,i]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),w=this.gpu.createBindGroup(f,0,[h,this._lmHeadQWeight,m,p]),B=65535,q=Math.min(s,B),_=Math.ceil(s/B),l=this.gpu.device.createCommandEncoder(),d=l.beginComputePass();d.setPipeline(f),d.setBindGroup(0,w),d.dispatchWorkgroups(q,_),d.end(),this.gpu.device.queue.submit([l.finish()]);const y=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await A(async()=>{const{SHADERS:D}=await import("./gpu-ops-PQDFq1iI.js").then(k=>k.b);return{SHADERS:D}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),U=Math.ceil(r*s/2),g=this.gpu.createBufferFromData("pack_params",new Uint32Array([U]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),P=this.gpu.createBindGroup(y,0,[m,this._lmHeadScales,g]),S=this.gpu.device.createCommandEncoder(),b=S.beginComputePass();b.setPipeline(y),b.setBindGroup(0,P),b.dispatchWorkgroups(Math.ceil(U/256)),b.end(),this.gpu.device.queue.submit([S.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),m.destroy(),p.destroy(),g.destroy();const v=(n*s*4/1e6).toFixed(0),C=(c/1e6).toFixed(0);console.log(`[QUANT] lm_head INT4 (GPU): ${(performance.now()-u).toFixed(0)}ms, ${v}MB qw + ${C}MB sc`)}async _quantizeBF16ToINT4(a,e,t,s,i){const n=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,r=e/8,u=e/s,o=this.gpu.createBuffer(`${i}_qweight`,r*t*4,n),h=this.gpu.createBuffer(`${i}_scales_f32`,u*t*4,n),m=Math.ceil(u*t/2)*4,c=this.gpu.createBuffer(`${i}_scales`,m,n),f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await A(async()=>{const{SHADERS:v}=await 
import("./gpu-ops-PQDFq1iI.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),p=this.gpu.createBufferFromData(`${i}_qp`,new Uint32Array([e,t,s]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),w=this.gpu.createBindGroup(f,0,[a,o,h,p]),B=65535,q=Math.min(t,B),_=Math.ceil(t/B),l=this.gpu.device.createCommandEncoder(),d=l.beginComputePass();d.setPipeline(f),d.setBindGroup(0,w),d.dispatchWorkgroups(q,_),d.end(),this.gpu.device.queue.submit([l.finish()]);const y=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await A(async()=>{const{SHADERS:v}=await import("./gpu-ops-PQDFq1iI.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),U=Math.ceil(u*t/2),g=this.gpu.createBufferFromData(`${i}_pp`,new Uint32Array([U]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),P=this.gpu.createBindGroup(y,0,[h,c,g]),S=this.gpu.device.createCommandEncoder(),b=S.beginComputePass();return b.setPipeline(y),b.setBindGroup(0,P),b.dispatchWorkgroups(Math.ceil(U/256)),b.end(),this.gpu.device.queue.submit([S.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),h.destroy(),p.destroy(),g.destroy(),{qweight:o,scales:c}}initBuffers(a){a||(a=this.textCfg.max_position_embeddings||4096),this.maxSeqLen=a;const 
e=this.gpu,t=this.hiddenSize,s=this.intermediateSize,i=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.hidden=e.createBuffer("hidden_a",t*4,i),this.hiddenB=e.createBuffer("hidden_b",t*4,i),this.normed=e.createBuffer("normed",t*4,i),this.normedB=e.createBuffer("normed_b",t*4,i),this.mlpIntermediate=e.createBuffer("mlp_inter",s*4,i),this.mlpOut=e.createBuffer("mlp_out",t*4,i),this.logits=e.createBuffer("logits",this.vocabSize*4,i),this.argmaxResult=e.createBuffer("argmax_result",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this._argmaxReadback=e.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"argmax_readback"}),this._topkResult=e.createBuffer("topk_result",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this._repMaxTokens=64,this._recentTokens=new Uint32Array(this._repMaxTokens),this._recentTokenCount=0,this._gpuRecentTokens=e.createBuffer("recent_tokens",this._repMaxTokens*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST);for(let q=0;q<this.numLayers;q++)this.layerTypes[q]==="full_attention"&&(this.kvCache[q]={keys:e.createBuffer(`kv_k_${q}`,a*this.numKVHeads*this.headDim*4),values:e.createBuffer(`kv_v_${q}`,a*this.numKVHeads*this.headDim*4)});this.qProjFull=e.createBuffer("q_proj_full",this.numHeads*this.headDim*2*4,i),this.qProj=e.createBuffer("q_proj_out",this.numHeads*this.headDim*4,i),this.qGate=e.createBuffer("q_gate",this.numHeads*this.headDim*4,i),this.kProj=e.createBuffer("k_proj_out",this.numKVHeads*this.headDim*4,i),this.vProj=e.createBuffer("v_proj_out",this.numKVHeads*this.headDim*4,i);const n=this.textCfg.linear_num_key_heads,r=this.textCfg.linear_key_head_dim,u=this.textCfg.linear_value_head_dim,o=this.textCfg.linear_num_value_heads??n,m=o/n*u,c=n*(r+r+m),f=o*u;this.linValueDim=f,this.linValueHeads=o,this.linQKV=e.createBuffer("lin_qkv",c*4,i),this.linZ=e.createBuffer("lin_z",f*4,i),this.linOut=e.createBuffer("lin_out",f*4,i);const 
p=Math.max(t,this.numHeads*this.headDim,f)*4;this.attnOut=e.createBuffer("attn_out",p,i),this._maxGqaSplits=64;const w=this.numHeads*this._maxGqaSplits*(this.headDim+2)*4;this._gqaPartials=e.createBuffer("gqa_partials",w,i),this.linState={},this.linConvHist={};for(let q=0;q<this.numLayers;q++)this.layerTypes[q]==="linear_attention"&&(this.linState[q]=e.createBuffer(`lin_state_${q}`,n*r*m*4,i),this.linConvHist[q]=e.createBuffer(`lin_conv_hist_${q}`,3*c*4,i));this.zeroBuf=e.createBuffer("zero_buf",t*4,i),this.useSplitK=!1,this.splitKSplits=1;const B=Math.max(t,s);this.splitKPartials=e.createBuffer("splitk_partials",B*this.splitKSplits*4,i),this.paramBufs={},this._gqaParamBuf=e.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"gqa_params"}),this._kvStoreParamBuf=e.device.createBuffer({size:16,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"kv_store_params"}),this._gqaData=new Uint8Array(32),this._gqaDv=new DataView(this._gqaData.buffer),this._kvData=new Uint8Array(16),this._kvDv=new DataView(this._kvData.buffer)}initVision(){const a=this.config.vision_config;if(!a){this.vision=null;return}const 
e=this.gpu,t=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,s=a.hidden_size,i=a.intermediate_size,n=a.out_hidden_size,r=a.depth,u=a.num_heads,o=s/u,h=a.patch_size,m=a.temporal_patch_size,c=a.spatial_merge_size,f=3*m*h*h,p=4096,w=s*c*c;this.vision={V:s,Vi:i,Vo:n,depth:r,heads:u,headDim:o,patchSize:h,temporalPatchSize:m,mergeSize:c,patchInputDim:f,maxVitTokens:p,mergedHidden:w,numPosEmbeddings:a.num_position_embeddings,numGridPerSide:Math.round(Math.sqrt(a.num_position_embeddings)),hidden:e.createBuffer("vit_hidden",p*s*4,t),hiddenB:e.createBuffer("vit_hidden_b",p*s*4,t),normed:e.createBuffer("vit_normed",p*s*4,t),q:e.createBuffer("vit_q",p*s*4,t),k:e.createBuffer("vit_k",p*s*4,t),v:e.createBuffer("vit_v",p*s*4,t),attnOut:e.createBuffer("vit_attn_out",p*s*4,t),mlpInter:e.createBuffer("vit_mlp_inter",p*i*4,t),mlpOut:e.createBuffer("vit_mlp_out",p*s*4,t),cos:e.createBuffer("vit_cos",p*o*4,t),sin:e.createBuffer("vit_sin",p*o*4,t),posEmbed:e.createBuffer("vit_pos_embed",p*s*4,t),merged:e.createBuffer("vit_merged",p/4*n*4,t),mergerNormed:e.createBuffer("vit_merger_normed",p*s*4,t),mergerInter:e.createBuffer("vit_merger_inter",p/4*w*4,t),patchInput:e.createBuffer("vit_patch_input",p*f*4,t),qkv:e.createBuffer("vit_qkv",p*3*s*4,t)},this.imageTokenId=this.config.image_token_id,this.visionStartTokenId=this.config.vision_start_token_id,this.visionEndTokenId=this.config.vision_end_token_id,console.log(`[VISION] Initialized: depth=${r}, hidden=${s}, heads=${u}, headDim=${o}, out=${n}`),console.log(`[VISION] Buffer allocation: ~${((p*s*4*12+p*i*4)/1024/1024).toFixed(0)} MB transient`)}async _readVisionPosEmbed(){const a=this.vision,e=this.weights["model.visual.pos_embed.weight"];if(!e)throw new Error("Vision pos_embed weight not found");const t=a.numPosEmbeddings,s=a.V,i=t*s*2,n=this.gpu.createReadbackBuffer("vit_pos_embed_readback",i),r=this.gpu.device.createCommandEncoder();r.copyBufferToBuffer(e,0,n,0,i),this.gpu.device.queue.submit([r.finish()]),await 
n.mapAsync(GPUMapMode.READ);const u=new Uint16Array(n.getMappedRange().slice(0));n.unmap(),n.destroy();const o=new Float32Array(t*s);for(let h=0;h<u.length;h++){const m=u[h]<<16,c=new ArrayBuffer(4);new Uint32Array(c)[0]=m,o[h]=new Float32Array(c)[0]}this._vitPosEmbedF32=o,console.log(`[VISION] Read pos_embed: ${t}×${s} (${(i/1024).toFixed(0)} KB)`)}_interpolatePosEmbed(a,e){const t=this.vision,s=t.V,i=t.numGridPerSide,n=this._vitPosEmbedF32,r=t.mergeSize,u=a*e,o=new Float32Array(u*s);for(let p=0;p<a;p++){const w=a===1?0:p*(i-1)/(a-1),B=Math.min(Math.floor(w),i-1),q=Math.min(B+1,i-1),_=w-B;for(let l=0;l<e;l++){const d=e===1?0:l*(i-1)/(e-1),y=Math.min(Math.floor(d),i-1),U=Math.min(y+1,i-1),g=d-y,P=B*i+y,S=B*i+U,b=q*i+y,v=q*i+U,C=(1-_)*(1-g),D=(1-_)*g,k=_*(1-g),z=_*g,M=p*e+l;for(let $=0;$<s;$++)o[M*s+$]=C*n[P*s+$]+D*n[S*s+$]+k*n[b*s+$]+z*n[v*s+$]}}const h=a/r,m=e/r,c=new Float32Array(u*s);let f=0;for(let p=0;p<h;p++)for(let w=0;w<m;w++)for(let B=0;B<r;B++)for(let q=0;q<r;q++){const _=p*r+B,l=w*r+q,d=_*e+l;c.set(o.subarray(d*s,d*s+s),f*s),f++}return c}_computeVisionRoPE(a,e){const t=this.vision,s=t.headDim/2,i=t.mergeSize,n=a/i,r=e/i,u=a*e,o=Math.max(a,e),h=new Float32Array(o*s);for(let p=0;p<o;p++)for(let w=0;w<s;w++){const B=1/Math.pow(1e4,2*w/t.headDim);h[p*s+w]=p*B}const m=new Float32Array(u*t.headDim),c=new Float32Array(u*t.headDim);let f=0;for(let p=0;p<n;p++)for(let w=0;w<r;w++)for(let B=0;B<i;B++)for(let q=0;q<i;q++){const _=p*i+B,l=w*i+q;for(let U=0;U<s;U++){const g=h[_*s+U];h[l*s+U];const P=f*t.headDim;U<s/2&&(m[P+U]=Math.cos(g),c[P+U]=Math.sin(g),m[P+s+U]=Math.cos(g),c[P+s+U]=Math.sin(g))}const d=f*t.headDim,y=s/2;for(let U=0;U<y;U++){const g=h[_*s+U],P=h[l*s+U];m[d+U]=Math.cos(g),m[d+y+U]=Math.cos(P),m[d+s+U]=Math.cos(g),m[d+s+y+U]=Math.cos(P),c[d+U]=Math.sin(g),c[d+y+U]=Math.sin(P),c[d+s+U]=Math.sin(g),c[d+s+y+U]=Math.sin(P)}f++}return{cos:m,sin:c}}async preprocessImage(a){const e=this.vision,t=e.patchSize,s=e.mergeSize,i=t*s,n=256*256,r=1280*1280,u=new 
Image;await new Promise((v,C)=>{u.onload=v,u.onerror=C,u.src=a});let{width:o,height:h}=u,m=1;o*h>r?m=Math.sqrt(r/(o*h)):o*h<n&&(m=Math.sqrt(n/(o*h)));let c=Math.round(o*m/i)*i,f=Math.round(h*m/i)*i;c=Math.max(i,c),f=Math.max(i,f);const w=new OffscreenCanvas(c,f).getContext("2d");w.drawImage(u,0,0,c,f);const q=w.getImageData(0,0,c,f).data,_=f/t,l=c/t,d=_*l,y=_/s,U=l/s,g=e.temporalPatchSize,P=3*g*t*t,S=new Float32Array(d*P);let b=0;for(let v=0;v<y;v++)for(let C=0;C<U;C++)for(let D=0;D<s;D++)for(let k=0;k<s;k++){const z=v*s+D,M=C*s+k,$=z*t,O=M*t,G=b*P;for(let R=0;R<g;R++)for(let F=0;F<3;F++)for(let T=0;T<t;T++)for(let x=0;x<t;x++){const V=(($+T)*c+(O+x))*4+F,j=q[V]/127.5-1,L=((F*g+R)*t+T)*t+x;S[G+L]=j}b++}return console.log(`[VISION] Preprocessed: ${o}x${h} → ${c}x${f}, ${d} patches (${_}x${l}), merge→${d/4} tokens`),{pixels:S,gridH:_,gridW:l,numPatches:d,imgW:c,imgH:f}}async visionForward(a){const e=this.vision,t=this.gpu,{pixels:s,gridH:i,gridW:n,numPatches:r}=a;this._vitPosEmbedF32||await this._readVisionPosEmbed(),t.device.queue.writeBuffer(e.patchInput,0,s);const u=this._interpolatePosEmbed(i,n);t.device.queue.writeBuffer(e.posEmbed,0,u);const{cos:o,sin:h}=this._computeVisionRoPE(i,n);t.device.queue.writeBuffer(e.cos,0,o),t.device.queue.writeBuffer(e.sin,0,h),t.beginBatch();const m=this.weights["model.visual.patch_embed.proj.weight"],c=this.weights["model.visual.patch_embed.proj.bias"],f=this.makeUniform("vit_patch_params",[e.patchInputDim,e.V]);t.dispatch(this.pipelines.vit_bf16_matvec_bias,[t.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[e.patchInput,m,c,e.hidden,f])],Math.ceil(e.V/32),r);const p=this.makeUniform("vit_add_len",[r*e.V]);t.dispatch(this.pipelines.vit_add,[t.createBindGroup(this.pipelines.vit_add,0,[e.hidden,e.posEmbed,p])],Math.ceil(r*e.V/256));for(let b=0;b<e.depth;b++)this._vitBlock(b,r);const 
w=this.weights["model.visual.merger.norm.weight"],B=this.weights["model.visual.merger.norm.bias"],q=this.makeUniform("vit_merger_ln_params",[e.V,1e-6]);t.dispatch(this.pipelines.vit_layernorm,[t.createBindGroup(this.pipelines.vit_layernorm,0,[e.hidden,w,B,e.mergerNormed,q])],r);const _=r/4,l=this.weights["model.visual.merger.linear_fc1.weight"],d=this.weights["model.visual.merger.linear_fc1.bias"],y=this.makeUniform("vit_merger_fc1_params",[e.mergedHidden,e.mergedHidden]);t.dispatch(this.pipelines.vit_bf16_matvec_bias,[t.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[e.mergerNormed,l,d,e.mergerInter,y])],Math.ceil(e.mergedHidden/32),_);const U=this.makeUniform("vit_gelu_len",[_*e.mergedHidden]);t.dispatch(this.pipelines.vit_gelu,[t.createBindGroup(this.pipelines.vit_gelu,0,[e.mergerInter,e.mergerInter,U])],Math.ceil(_*e.mergedHidden/256));const g=this.weights["model.visual.merger.linear_fc2.weight"],P=this.weights["model.visual.merger.linear_fc2.bias"],S=this.makeUniform("vit_merger_fc2_params",[e.mergedHidden,e.Vo]);return t.dispatch(this.pipelines.vit_bf16_matvec_bias,[t.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[e.mergerInter,g,P,e.merged,S])],Math.ceil(e.Vo/32),_),t.endBatch(),await t.device.queue.onSubmittedWorkDone(),console.log(`[VISION] Forward done: ${r} patches → ${_} merged tokens (dim=${e.Vo})`),{numMergedTokens:_,embedBuffer:e.merged}}_vitBlock(a,e){const t=this.vision,s=this.gpu,i=`model.visual.blocks.${a}`,n=this.weights[`${i}.norm1.weight`],r=this.weights[`${i}.norm1.bias`],u=this.makeUniform(`vit_ln1_${a}`,[t.V,1e-6]);s.dispatch(this.pipelines.vit_layernorm,[s.createBindGroup(this.pipelines.vit_layernorm,0,[t.hidden,n,r,t.normed,u])],e);const o=this.weights[`${i}.attn.qkv.weight`],h=this.weights[`${i}.attn.qkv.bias`],m=this.makeUniform(`vit_qkv_${a}`,[t.V,3*t.V]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.normed,o,h,t.qkv,m])],Math.ceil(3*t.V/32),e);const 
c=t.V,f=c*c*2,p=c*2,w=this.makeUniform(`vit_qkv_mv_${a}`,[c,c]);for(let $=0;$<3;$++){const O=[t.q,t.k,t.v][$],G=s.createBindGroupWithOffsets(this.pipelines.vit_bf16_matvec_bias,0,[t.normed,{buffer:o,offset:$*f,size:f},{buffer:h,offset:$*p,size:p},O,w]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[G],Math.ceil(c/32),e)}const B=this.makeUniform(`vit_rope_${a}`,[e,t.heads,t.headDim]);s.dispatch(this.pipelines.vit_rope,[s.createBindGroup(this.pipelines.vit_rope,0,[t.q,t.k,t.cos,t.sin,B])],Math.ceil(e*t.heads*t.headDim/256));const q=1/Math.sqrt(t.headDim),_=this.makeUniform(`vit_attn_${a}`,[e,t.heads,t.headDim,q]);s.dispatch(this.pipelines.vit_attention,[s.createBindGroup(this.pipelines.vit_attention,0,[t.q,t.k,t.v,t.attnOut,_])],e,t.heads);const l=this.weights[`${i}.attn.proj.weight`],d=this.weights[`${i}.attn.proj.bias`],y=this.makeUniform(`vit_proj_${a}`,[c,c]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.attnOut,l,d,t.mlpOut,y])],Math.ceil(c/32),e);const U=this.makeUniform(`vit_res1_${a}`,[e*c]);s.dispatch(this.pipelines.vit_add,[s.createBindGroup(this.pipelines.vit_add,0,[t.hidden,t.mlpOut,U])],Math.ceil(e*c/256));const g=this.weights[`${i}.norm2.weight`],P=this.weights[`${i}.norm2.bias`];s.dispatch(this.pipelines.vit_layernorm,[s.createBindGroup(this.pipelines.vit_layernorm,0,[t.hidden,g,P,t.normed,u])],e);const S=this.weights[`${i}.mlp.fc1.weight`],b=this.weights[`${i}.mlp.fc1.bias`],v=this.makeUniform(`vit_mlp_fc1_${a}`,[c,t.Vi]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.normed,S,b,t.mlpInter,v])],Math.ceil(t.Vi/32),e);const C=this.makeUniform(`vit_gelu_${a}`,[e*t.Vi]);s.dispatch(this.pipelines.vit_gelu_tanh,[s.createBindGroup(this.pipelines.vit_gelu_tanh,0,[t.mlpInter,t.mlpInter,C])],Math.ceil(e*t.Vi/256));const 
D=this.weights[`${i}.mlp.fc2.weight`],k=this.weights[`${i}.mlp.fc2.bias`],z=this.makeUniform(`vit_mlp_fc2_${a}`,[t.Vi,c]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.mlpInter,D,k,t.mlpOut,z])],Math.ceil(c/32),e);const M=this.makeUniform(`vit_res2_${a}`,[e*c]);s.dispatch(this.pipelines.vit_add,[s.createBindGroup(this.pipelines.vit_add,0,[t.hidden,t.mlpOut,M])],Math.ceil(e*c/256))}computeMultimodalPositions(a,e,t,s){const i=this.vision.mergeSize,n=t/i,r=s/i,u=a.length,o=new Array(3);for(let f=0;f<3;f++)o[f]=new Int32Array(u);let h=0,m=0;for(let f=0;f<u;f++)if(a[f]===this.imageTokenId){const w=m,B=Math.floor(w/r),q=w%r;o[0][f]=h,o[1][f]=h+B,o[2][f]=h+q,m++,m===e&&(h+=Math.max(n,r))}else o[0][f]=h,o[1][f]=h,o[2][f]=h,h++;const c=h-u;return{positionIds3D:o,ropeDelta:c}}initB2Buffers(){const a=this.gpu,e=this.hiddenSize,t=this.intermediateSize,s=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.b2={},this.b2.hidden=a.createBuffer("b2_hidden_a",2*e*4,s),this.b2.hiddenB=a.createBuffer("b2_hidden_b",2*e*4,s),this.b2.normed=a.createBuffer("b2_normed",2*e*4,s),this.b2.mlpIntermediate=a.createBuffer("b2_mlp_inter",2*t*4,s),this.b2.mlpOut=a.createBuffer("b2_mlp_out",2*e*4,s),this.b2.logits=a.createBuffer("b2_logits",2*this.vocabSize*4,s),this.b2.zeroBuf=a.createBuffer("b2_zero_buf",2*e*4,s),this.b2.qProjFull=a.createBuffer("b2_q_proj_full",2*this.numHeads*this.headDim*2*4,s),this.b2.qProj=a.createBuffer("b2_q_proj_out",2*this.numHeads*this.headDim*4,s),this.b2.qGate=a.createBuffer("b2_q_gate",2*this.numHeads*this.headDim*4,s),this.b2.kProj=a.createBuffer("b2_k_proj_out",2*this.numKVHeads*this.headDim*4,s),this.b2.vProj=a.createBuffer("b2_v_proj_out",2*this.numKVHeads*this.headDim*4,s);const 
i=this.textCfg.linear_num_key_heads,n=this.textCfg.linear_key_head_dim,r=this.textCfg.linear_value_head_dim,h=this.linValueHeads/i*r,m=i*(n+n+h),c=this.linValueDim;this.b2.linQKV=a.createBuffer("b2_lin_qkv",2*m*4,s),this.b2.linZ=a.createBuffer("b2_lin_z",2*c*4,s),this.b2.linOut=a.createBuffer("b2_lin_out",2*c*4,s);const f=Math.max(e,this.numHeads*this.headDim,c)*4;if(this.b2.attnOut=a.createBuffer("b2_attn_out",2*f,s),this.abQuantized){const p=this.textCfg.linear_num_value_heads??i;this.b2.linAlpha=a.createBuffer("b2_lin_alpha",2*p*4,s),this.b2.linBeta=a.createBuffer("b2_lin_beta",2*p*4,s)}this.b2.topkResult0=a.createBuffer("b2_topk0",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this.b2.topkResult1=a.createBuffer("b2_topk1",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this.b2.topkReadback0=a.device.createBuffer({size:2048,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_topk_rb0"}),this.b2.topkReadback1=a.device.createBuffer({size:2048,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_topk_rb1"}),this.b2.argmaxResult0=a.createBuffer("b2_argmax0",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this.b2.argmaxResult1=a.createBuffer("b2_argmax1",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this.b2.argmaxReadback0=a.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_argmax_rb0"}),this.b2.argmaxReadback1=a.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_argmax_rb1"}),this.b2._gqaParamBuf0=a.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"b2_gqa_params0"}),this.b2._gqaParamBuf1=a.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"b2_gqa_params1"}),this.b2._dims={H:e,I:t,linQKVDim:m,valueDim:c,linHeads:i,linKeyDim:n,linValDim:r,linEVD:h,qProjSize:this.numHeads*this.headDim,qProjFullSize:this.numHeads*this.headDim*2
,kProjSize:this.numKVHeads*this.headDim,vProjSize:this.numKVHeads*this.headDim,attnOutElems:f/4},console.log(`[B2] Allocated batch=2 buffers (${((2*(e+e+e+t+e+this.vocabSize)*4+2*f)/1e6).toFixed(1)} MB activations)`)}runWithOffsets(a,e,t,s=1){const i=this.pipelines[a],n=this.gpu.createBindGroupWithOffsets(i,0,e);this.gpu.dispatch(i,[n],t,s)}makeUniform(a,e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),s=new DataView(t);for(let r=0;r<e.length;r++)Number.isInteger(e[r])?s.setUint32(r*4,e[r],!0):s.setFloat32(r*4,e[r],!0);const i=a+"_"+e.join("_");if(this.paramBufs[i])return this.paramBufs[i];const n=this.gpu.device.createBuffer({size:t.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a});return this.gpu.device.queue.writeBuffer(n,0,new Uint8Array(t)),this.paramBufs[i]=n,n}_makeMixedUniform(a,e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),s=new DataView(t);for(let n=0;n<e.length;n++){const r=e[n];r.u!==void 0?s.setUint32(n*4,r.u,!0):s.setFloat32(n*4,r.f,!0)}this._mixedUniformBufs||(this._mixedUniformBufs={});let i=this._mixedUniformBufs[a];return i||(i=this.gpu.device.createBuffer({size:t.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a}),this._mixedUniformBufs[a]=i),this.gpu.device.queue.writeBuffer(i,0,new Uint8Array(t)),i}makeUniformTyped(a,e,t){const s=e.length+t.length,i=new ArrayBuffer(Math.max(16,Math.ceil(s*4/16)*16)),n=new DataView(i);let r=0;for(const h of e)n.setUint32(r,h,!0),r+=4;for(const h of t)n.setFloat32(r,h,!0),r+=4;const u=a+"_"+[...e,...t].join("_");if(this.paramBufs[u])return this.paramBufs[u];const o=this.gpu.device.createBuffer({size:i.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a});return this.gpu.device.queue.writeBuffer(o,0,new Uint8Array(i)),this.paramBufs[u]=o,o}makeFusedNormMLPUniform(a){const e=`fused_anorm_mlp_${a}`;if(this.paramBufs[e])return this.paramBufs[e];const t=this.hiddenSize,s=this.intermediateSize,i=16+640*16,n=new 
ArrayBuffer(i),r=new DataView(n);r.setUint32(0,t,!0),r.setUint32(4,s,!0),r.setUint32(8,this.groupSize,!0),r.setFloat32(12,this.rmsEps,!0);const u=`model.language_model.layers.${a}.post_attention_layernorm.weight`,o=this._normWeightRaw?.[u];if(!o)throw new Error(`Norm weight not cached for layer ${a}`);for(let m=0;m<t/2;m++){const c=Math.floor(m/4),f=m%4;r.setUint32(16+c*16+f*4,o[m],!0)}const h=this.gpu.device.createBuffer({size:i,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:e});return this.gpu.device.queue.writeBuffer(h,0,new Uint8Array(n)),this.paramBufs[e]=h,h}fusedNormGptqOp(a,e,t,s,i,n,r,u){const o=this.getQWeight(s);if(!o.qweight)return null;const h=this.makeFusedNormGPTQUniform(i,r),m=u?"fused_norm_gptq":"fused_norm_gptq_noadd",c=u?"fnorm_gptq_":"fnorm_gptqna_",f=u?[a,e,o.qweight,o.scales,t,h]:[a,o.qweight,o.scales,t,h];return this.prepOpCached(`${c}${s}`,m,f,this.wg4(r))}makeFusedNormGPTQUniform(a,e){const t=`fused_norm_gptq_${a}_${e}`;if(this.paramBufs[t])return this.paramBufs[t];const s=this.hiddenSize,i=16+512*16,n=new ArrayBuffer(i),r=new DataView(n);r.setUint32(0,s,!0),r.setUint32(4,e,!0),r.setUint32(8,this.groupSize,!0),r.setFloat32(12,this.rmsEps,!0);const u=this._normWeightRaw?.[a];if(!u)throw new Error(`Norm weight not cached: ${a}`);for(let h=0;h<s/2;h++){const m=Math.floor(h/4),c=h%4;r.setUint32(16+m*16+c*4,u[h],!0)}const o=this.gpu.device.createBuffer({size:i,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:t});return this.gpu.device.queue.writeBuffer(o,0,new Uint8Array(n)),this.paramBufs[t]=o,o}run(a,e,t,s=1){const i=this.pipelines[a],n=this.gpu.createBindGroup(i,0,e);this.gpu.dispatch(i,[n],t,s)}runCached(a,e,t,s,i=1){const n=this.pipelines[e],r=this.gpu.getCachedBindGroup(a,n,0,t);this.gpu.dispatch(n,[r],s,i)}prepOp(a,e,t,s=1){const i=this.pipelines[a],n=this.gpu.createBindGroup(i,0,e);return{pipeline:i,bindGroups:[n],workgroupsX:t,workgroupsY:s}}prepOpCached(a,e,t,s,i=1){const 
n=this.pipelines[e],r=this.gpu.getCachedBindGroup(a,n,0,t);return{pipeline:n,bindGroups:[r],workgroupsX:s,workgroupsY:i}}wg(a){return Math.ceil(a/256)}wg4(a){return Math.ceil(a/32)}wg8(a){return Math.ceil(a/8)}getQWeight(a){return{qweight:this.weights[a+".qweight"],scales:this.weights[a+".scales"],qzeros:this.weights[a+".qzeros"]}}embedding(a){const e=this.weights["model.language_model.embed_tokens.weight"],t=this.makeUniform("emb_params",[a,this.hiddenSize]);this.run("embedding",[e,this.hidden,t],this.wg(this.hiddenSize))}embeddingFromVisionBuffer(a,e){const t=this.hiddenSize;this.gpu.copyBuffer(a,this.hidden,t*4,e*t*4,0)}embeddingFromArgmax(){const a=this.weights["model.language_model.embed_tokens.weight"];this._embedArgmaxParams||(this._embedArgmaxParams=this.makeUniform("emb_argmax_params",[this.hiddenSize])),this.runCached("embed_from_argmax","embed_from_argmax",[a,this.hidden,this.argmaxResult,this._embedArgmaxParams],this.wg(this.hiddenSize))}_dispatchLmHead(){if(this._lmHeadQWeight){const a=this.hiddenSize,e=this.vocabSize,s=a/this.groupSize%4===0,i=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]);if(s){const n=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached("lm_head",n,[this.normed,this._lmHeadQWeight,this._lmHeadScales,this.logits,i],this.wg8(e))}else{const n=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached("lm_head",n,[this.normed,this._lmHeadQWeight,this._lmHeadScales,this.logits,i],this.wg4(e))}}else{const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",t=this.weights[e],s=this.makeUniform("lmhead_params",[this.hiddenSize,this.vocabSize]);this.runCached("lm_head","bf16_matvec",[this.normed,t,this.logits,s],this.wg(this.vocabSize))}}rmsNorm(a,e,t,s){const i=this.makeUniform("rmsnorm_params",[this.hiddenSize,this.rmsEps]);s?this.runCached(s,"rmsnorm",[a,t,e,i],1):this.run("rmsnorm",[a,t,e,i],1)}gptqMatvec(a,e,t,s,i){const 
n=this.getQWeight(t);if(!n.qweight)return;const r=s/this.groupSize,u=r%4===0;if(this.useSplitK&&u){let o=this.splitKSplits;for(;o>1&&r%(o*4)!==0;)o>>=1;if(o>1){const h=this.makeUniform(`mv_sk_${s}_${i}_${o}`,[s,i,this.groupSize,o]);this.runCached(`sk4tf16_${t}`,"gptq_matvec_4t_f16_sk",[a,n.qweight,n.scales,this.splitKPartials,h],this.wg8(i),o);const m=this.makeUniform(`rsk_${i}_${o}`,[i,o]);this.runCached(`rsk_${t}`,"reduce_splitk",[this.splitKPartials,e,m],this.wg(i));return}}if(u){const o=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t",h=this.gpu.hasF16?"gptq4tf16_":"gptq4t_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);this.runCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg8(i))}else{const o=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec",h=this.gpu.hasF16?"gptqf16_":"gptq_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);this.runCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg4(i))}}gptqMatvecOp(a,e,t,s,i){const n=this.getQWeight(t);if(!n.qweight)return null;if(s/this.groupSize%4===0){const o=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t",h=this.gpu.hasF16?"gptq4tf16_":"gptq4t_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);return this.prepOpCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg8(i))}else{const o=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec",h=this.gpu.hasF16?"gptqf16_":"gptq_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);return this.prepOpCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg4(i))}}bf16Matvec(a,e,t,s,i,n){const r=this.makeUniform(`bf16mv_${s}_${i}`,[s,i]);n?this.runCached(n,"bf16_matvec",[a,t,e,r],this.wg(i)):this.run("bf16_matvec",[a,t,e,r],this.wg(i))}siluMul(a,e,t,s,i){const n=this.makeUniform(`silu_${s}`,[s]);i?this.runCached(i,"silu_mul",[a,e,t,n],this.wg(s)):this.run("silu_mul",[a,e,t,n],this.wg(s))}addVectors(a,e,t,s){const 
i=this.makeUniform(`add_${t}`,[t]);s?this.runCached(s,"add",[a,e,i],this.wg(t)):this.run("add",[a,e,i],this.wg(t))}addAndRmsNorm(a,e,t,s,i){const n=this.makeUniform("add_rmsnorm_params",[this.hiddenSize,this.rmsEps]);i?this.runCached(i,"add_rmsnorm",[a,e,s,t,n],1):this.run("add_rmsnorm",[a,e,s,t,n],1)}mlp(a,e){const t=`model.language_model.layers.${e}.mlp`,s=this.hiddenSize,i=this.intermediateSize,n=this.getQWeight(`${t}.gate_proj`),r=this.getQWeight(`${t}.up_proj`),u=this.makeUniform(`fused_mv_${s}_${i}`,[s,i,this.groupSize]);if(s/this.groupSize%4===0){const m=this.gpu.hasF16?"fused_gate_up_silu_4t_f16":"fused_gate_up_silu_4t",c=this.gpu.hasF16?"fused_gus4t16_":"fused_gus4t_";this.runCached(`${c}${e}`,m,[a,n.qweight,n.scales,r.qweight,r.scales,this.mlpIntermediate,u],this.wg8(i))}else{const m=this.gpu.hasF16?"fused_gate_up_silu_f16":"fused_gate_up_silu",c=this.gpu.hasF16?"fused_gus16_":"fused_gus_";this.runCached(`${c}${e}`,m,[a,n.qweight,n.scales,r.qweight,r.scales,this.mlpIntermediate,u],this.wg4(i))}this.gptqMatvec(this.mlpIntermediate,this.mlpOut,`${t}.down_proj`,i,s)}fullAttentionFused(a,e,t,s,i,n,r,u){i=i||this.normed;const o=`model.language_model.layers.${t}.self_attn`,h=this.hiddenSize,m=this.headDim,c=this.numHeads,f=this.numKVHeads,p=c/f,w=t===0,B=`model.language_model.layers.${t}.input_layernorm.weight`,q=this.weights[B];if(w){const $=this.makeUniform("add_rmsnorm_params",[h,this.rmsEps]);this.runCached(`fa_norm_${t}`,"rmsnorm",[a,q,i,$],1)}else{const $=this.makeUniform("add_rmsnorm_params",[h,this.rmsEps]);this.runCached(`fa_norm_${t}`,"add_rmsnorm_ro",[a,e,q,i,$],1)}const _=this.gptqMatvecOp(i,this.qProjFull,`${o}.q_proj`,h,c*m*2),l=this.gptqMatvecOp(i,this.kProj,`${o}.k_proj`,h,f*m),d=this.gptqMatvecOp(i,this.vProj,`${o}.v_proj`,h,f*m);this.gpu.dispatchMulti([_,l,d].filter(Boolean));const 
y=this.kvCache[t],U=this._fusedSQKParams[t],g=u??s;this._gqaDv.setUint32(0,g,!0),this._gqaDv.setUint32(4,s,!0),this._gqaDv.setUint32(8,n??s,!0),this._gqaDv.setUint32(12,r??s,!0),this.gpu.device.queue.writeBuffer(U,16,this._gqaData,0,16),this.runCached(`fa_sqk_${t}`,this._splitQKNormShaderKey,[this.qProjFull,this.kProj,this.vProj,this.qProj,this.qGate,y.keys,y.values,U],c+f);const P=(u??s)+1,S=this._forceMinSplits||1,b=Math.max(S,Math.min(Math.max(1,Math.ceil(P/32)),this._maxGqaSplits)),v=b>1?this._gqaPartials:this.attnOut;if(this._gqaDv.setUint32(0,P,!0),this._gqaDv.setUint32(4,m,!0),this._gqaDv.setUint32(8,f,!0),this._gqaDv.setUint32(12,c,!0),this._gqaDv.setUint32(16,p,!0),this._gqaDv.setUint32(20,b,!0),this.gpu.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData),this.run("gqa_attention_head",[this.qProj,y.keys,y.values,v,this._gqaParamBuf],c,b),b>1){this._gqaReduceParams||(this._gqaReduceParams=this.gpu.device.createBuffer({size:16,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"gqa_reduce_params"}));const $=new Uint8Array(16),O=new DataView($.buffer);O.setUint32(0,m,!0),O.setUint32(4,b,!0),O.setUint32(8,c,!0),this.gpu.device.queue.writeBuffer(this._gqaReduceParams,0,$),this.run("gqa_reduce",[this._gqaPartials,this.attnOut,this._gqaReduceParams],c)}const C=this.getQWeight(`${o}.o_proj`),D=c*m,z=D/this.groupSize%4===0,M=this.makeUniform(`fused_sig_mv_${D}_${h}`,[D,h,this.groupSize]);if(z){const $=this.gpu.hasF16?"fused_sigmoid_gptq_4t_f16":"fused_sigmoid_gptq_4t",O=this.gpu.hasF16?"fused_sig4t16_oproj_":"fused_sig4t_oproj_";this.runCached(`${O}${t}`,$,[this.attnOut,this.qGate,C.qweight,C.scales,this.qProj,M],this.wg8(h))}else{const $=this.gpu.hasF16?"fused_sigmoid_gptq_f16":"fused_sigmoid_gptq",O=this.gpu.hasF16?"fused_sig16_oproj_":"fused_sig_oproj_";this.runCached(`${O}${t}`,$,[this.attnOut,this.qGate,C.qweight,C.scales,this.qProj,M],this.wg4(h))}}linearAttentionFused(a,e,t,s){s=s||this.normed;const 
i=`model.language_model.layers.${t}.linear_attn`,n=this.hiddenSize,r=this.textCfg.linear_num_key_heads,u=this.textCfg.linear_key_head_dim,o=this.textCfg.linear_value_head_dim,h=this.linValueHeads,c=h/r*o,f=r*(u+u+c),p=this.linValueDim,w=t===0,B=`model.language_model.layers.${t}.input_layernorm.weight`,q=this.weights[B];if(w){const b=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.runCached(`la_norm_${t}`,"rmsnorm",[a,q,s,b],1)}else{const b=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.runCached(`la_norm_${t}`,"add_rmsnorm_ro",[a,e,q,s,b],1)}{const b=[this.gptqMatvecOp(s,this.linQKV,`${i}.in_proj_qkv`,n,f),this.gptqMatvecOp(s,this.linZ,`${i}.in_proj_z`,n,p)];this.abQuantized&&(b.push(this.gptqMatvecOp(s,this.linAlpha,`${i}.in_proj_a`,n,h)),b.push(this.gptqMatvecOp(s,this.linBeta,`${i}.in_proj_b`,n,h))),this.gpu.dispatchMulti(b.filter(Boolean))}const _=this.weights[`${i}.conv1d.weight`],l=this.weights[`${i}.A_log`],d=this.weights[`${i}.dt_bias`],y=this.weights[`${i}.norm.weight`];if(this.abQuantized){const b=`fused_cdn_q_${r}_${u}_${o}_${f}_${h}`;let v=this.paramBufs[b];if(!v){const C=new ArrayBuffer(32),D=new DataView(C);D.setUint32(0,r,!0),D.setUint32(4,u,!0),D.setUint32(8,o,!0),D.setUint32(12,f,!0),D.setFloat32(16,this.rmsEps,!0),D.setUint32(20,0,!0),D.setUint32(24,h,!0),v=this.gpu.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"fused_cdn_q_params"}),this.gpu.device.queue.writeBuffer(v,0,new Uint8Array(C)),this.paramBufs[b]=v}this.runCached(`fused_cdn_${t}`,"fused_conv_deltanet_norm",[this.linQKV,this.linConvHist[t],_,this.linState[t],this.linOut,this.linAlpha,this.linBeta,l,d,y,v],r)}else{const b=`fused_cdn_ext_${r}_${u}_${o}_${f}_${n}_${h}`;let v=this.paramBufs[b];if(!v){const D=new ArrayBuffer(32),k=new 
DataView(D);k.setUint32(0,r,!0),k.setUint32(4,u,!0),k.setUint32(8,o,!0),k.setUint32(12,f,!0),k.setFloat32(16,this.rmsEps,!0),k.setUint32(20,n,!0),k.setUint32(24,h,!0),v=this.gpu.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"fused_cdn_ext_params"}),this.gpu.device.queue.writeBuffer(v,0,new Uint8Array(D)),this.paramBufs[b]=v}const C=this.linABWeight[t];this.runCached(`fused_cdn_${t}`,"fused_conv_deltanet_norm",[this.linQKV,this.linConvHist[t],_,this.linState[t],this.linOut,s,C,l,d,y,v],r)}const U=this.getQWeight(`${i}.out_proj`),P=p/this.groupSize%4===0,S=this.makeUniform(`fused_silu_mv_${p}_${n}`,[p,n,this.groupSize]);if(P){const b=this.gpu.hasF16?"fused_silu_gptq_4t_f16":"fused_silu_gptq_4t",v=this.gpu.hasF16?"fused_silu4t16_oproj_":"fused_silu4t_oproj_";this.runCached(`${v}${t}`,b,[this.linZ,this.linOut,U.qweight,U.scales,this.attnOut,S],this.wg8(n))}else{const b=this.gpu.hasF16?"fused_silu_gptq_f16":"fused_silu_gptq",v=this.gpu.hasF16?"fused_silu16_oproj_":"fused_silu_oproj_";this.runCached(`${v}${t}`,b,[this.linZ,this.linOut,U.qweight,U.scales,this.attnOut,S],this.wg4(n))}}fusedNormMLP(a,e,t,s,i,n){n=n||this.normed;const r=this.hiddenSize,u=`model.language_model.layers.${i}.post_attention_layernorm.weight`,o=this.weights[u],h=this.makeUniform("add_rmsnorm_params",[r,this.rmsEps]),m=`mlp_norm_${i}_${a===this.hidden?"a":"b"}`;this.runCached(m,"three_way_add_rmsnorm",[a,t,s,o,e,n,h],1),this.mlp(n,i)}decoderLayer(a,e,t,s,i,n,r,u){let o;this.layerTypes[a]==="full_attention"?(this.fullAttentionFused(t,i,a,e,void 0,n,r,u),o=this.qProj):(this.linearAttentionFused(t,i,a),o=this.attnOut),this.fusedNormMLP(t,s,i,o,a)}async forward(a,e){this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);let t=this.hidden,s=this.hiddenB,i=this.zeroBuf;for(let o=0;o<this.numLayers;o++){this.decoderLayer(o,e,t,s,i),i=this.mlpOut;const h=t;t=s,s=h}this.mtp&&this._mtpSaveHidden(t,this.mlpOut);const 
n=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(t,this.mlpOut,this.normed,n,"add_final_norm"),this._dispatchLmHead();const r=this.temperature??.7;if(r>0){const o=this.repetitionPenalty??1,h=this.presencePenalty??0;if(this._recentTokenCount>0&&(o>1||h>0)){this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const p=this._makeMixedUniform("penalty_params",[{u:this._recentTokenCount},{f:o},{f:h},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,p],Math.ceil(this._recentTokenCount/256))}const m=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,m],1);const c=Math.random()*4294967295>>>0,f=this._makeMixedUniform("sample_params",[{f:r},{u:this.topK??20},{f:this.topP??.8},{u:c}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,f],1)}else{const o=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,o],1)}return this.gpu._passCount!==void 0&&!this._passCountLogged&&(console.log(`[PERF] ${this.gpu._passCount} compute passes per token`),this._passCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch(),this._readAndSample()}forwardSubmit(a,e){this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);let t=this.hidden,s=this.hiddenB,i=this.zeroBuf;for(let o=0;o<this.numLayers;o++){this.decoderLayer(o,e,t,s,i),i=this.mlpOut;const h=t;t=s,s=h}this.mtp&&this._mtpSaveHidden(t,this.mlpOut);const n=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(t,this.mlpOut,this.normed,n,"add_final_norm"),this._dispatchLmHead();const r=this.temperature??.7;if(r>0){const 
o=this.repetitionPenalty??1,h=this.presencePenalty??0;if(this._recentTokenCount>0&&(o>1||h>0)){this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const p=this._makeMixedUniform("penalty_params",[{u:this._recentTokenCount},{f:o},{f:h},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,p],Math.ceil(this._recentTokenCount/256))}const m=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,m],1);const c=Math.random()*4294967295>>>0,f=this._makeMixedUniform("sample_params",[{f:r},{u:this.topK??20},{f:this.topP??.8},{u:c}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,f],1)}else{const o=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,o],1)}this.gpu._passCount!==void 0&&!this._passCountLogged&&(console.log(`[PERF] ${this.gpu._passCount} compute passes per token`),this._passCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch()}_buildReplayFromRecording(){const a=this.gpu.stopRecording(),e=this.pipelines.gqa_attention_head,t=[];for(const s of a)if(s.multi)for(const i of s.ops)t.push({p:i.pipeline,bg:i.bindGroup,x:i.wgX,y:i.wgY,gqa:i.pipeline===e?1:0});else t.push({p:s.pipeline,bg:s.bindGroup,x:s.wgX,y:s.wgY,gqa:s.pipeline===e?1:0});this._replayFlat=t,console.log(`[PERF] Recorded ${t.length} flat replay ops for core forward`)}_replayCoreForward(a){const e=this.gpu,t=a-this._ropeDelta,s=t+1,i=Math.min(Math.max(1,Math.ceil(s/32)),this._maxGqaSplits);this._gqaDv.setUint32(0,t,!0),this._gqaDv.setUint32(4,a,!0),this._gqaDv.setUint32(8,a,!0),this._gqaDv.setUint32(12,a,!0);for(let 
o=0;o<this.numLayers;o++)this.layerTypes[o]==="full_attention"&&e.device.queue.writeBuffer(this._fusedSQKParams[o],16,this._gqaData,0,16);this._gqaDv.setUint32(0,s,!0),this._gqaDv.setUint32(4,this.headDim,!0),this._gqaDv.setUint32(8,this.numKVHeads,!0),this._gqaDv.setUint32(12,this.numHeads,!0),this._gqaDv.setUint32(16,this.numHeads/this.numKVHeads,!0),this._gqaDv.setUint32(20,i,!0),e.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData,0,24),this._gqaReduceParams&&(this._gqaDv.setUint32(0,this.headDim,!0),this._gqaDv.setUint32(4,i,!0),this._gqaDv.setUint32(8,this.numHeads,!0),this._gqaDv.setUint32(12,0,!0),e.device.queue.writeBuffer(this._gqaReduceParams,0,this._gqaData,0,16));const n=e._singlePass,r=this._replayFlat,u=r.length;for(let o=0;o<u;o++){const h=r[o];n.setPipeline(h.p),n.setBindGroup(0,h.bg),n.dispatchWorkgroups(h.x,h.gqa?i:h.y)}}embeddingB2(a,e){const t=this.hiddenSize,s=this.weights["model.language_model.embed_tokens.weight"],i=this.makeUniform("emb_params",[a,t]);this.runWithOffsets("embedding",[s,{buffer:this.b2.hidden,offset:0,size:t*4},i],this.wg(t));const n=this.makeUniform(`emb_params_${e}`,[e,t]);this.runWithOffsets("embedding",[s,{buffer:this.b2.hidden,offset:t*4,size:t*4},n],this.wg(t))}gptqMatvecB2(a,e,t,s,i){const n=this.getQWeight(t);if(!n.qweight)return;const r=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]),u=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2",o=this.gpu.hasF16?"b2gptqf16_":"b2gptq_";this.runCached(`${o}${t}`,u,[a,n.qweight,n.scales,e,r],this.wg4(i))}gptqMatvecB2Op(a,e,t,s,i){const n=this.getQWeight(t);if(!n.qweight)return null;const r=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]),u=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2",o=this.gpu.hasF16?"b2gptqf16_":"b2gptq_";return this.prepOpCached(`${o}${t}`,u,[a,n.qweight,n.scales,e,r],this.wg4(i))}fullAttentionB2(a,e,t,s){const 
i=`model.language_model.layers.${t}.self_attn`,n=this.hiddenSize,r=this.b2._dims,u=this.headDim,o=this.numHeads,h=this.numKVHeads,m=o/h,c=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,p=this.weights[f],w=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);/* Layer 0 normalises each token slot with plain "rmsnorm"; later layers use one "add_rmsnorm_ro_b2" dispatch over (a, e). */c?(this.runWithOffsets("rmsnorm",[{buffer:a,offset:0,size:n*4},p,{buffer:this.b2.normed,offset:0,size:n*4},w],1),this.runWithOffsets("rmsnorm",[{buffer:a,offset:n*4,size:n*4},p,{buffer:this.b2.normed,offset:n*4,size:n*4},w],1)):this.run("add_rmsnorm_ro_b2",[a,e,p,this.b2.normed,w],2);/* q/k/v projections are prepared as ops and issued together; q_proj's output width is numHeads*headDim*2. */const B=this.gptqMatvecB2Op(this.b2.normed,this.b2.qProjFull,`${i}.q_proj`,n,o*u*2),q=this.gptqMatvecB2Op(this.b2.normed,this.b2.kProj,`${i}.k_proj`,n,h*u),_=this.gptqMatvecB2Op(this.b2.normed,this.b2.vProj,`${i}.v_proj`,n,h*u);this.gpu.dispatchMulti([B,q,_].filter(Boolean));const l=this.kvCache[t],d=this._fusedSQKParams[t],y=r.qProjFullSize*4,U=r.kProjSize*4,g=r.vProjSize*4,P=r.qProjSize*4;/*
 * Split / QK-norm / KV-store, once per token slot. Bytes 16..31 of the layer's
 * fused params buffer `d` are rewritten with [s,s,s,s] before the first
 * dispatch and with [s+1,...] before the second.
 * NOTE(review): both rewrites go through queue.writeBuffer on the SAME buffer
 * while the two dispatches appear to be encoded into one batch. If that batch
 * is only submitted at endBatch(), both dispatches would observe the second
 * value (s+1). The attention step below uses two separate param buffers
 * (_gqaParamBuf0/_gqaParamBuf1) for its two dispatches. Confirm how
 * runWithOffsets/beginBatch order their work relative to writeBuffer.
 */this._gqaDv.setUint32(0,s,!0),this._gqaDv.setUint32(4,s,!0),this._gqaDv.setUint32(8,s,!0),this._gqaDv.setUint32(12,s,!0),this.gpu.device.queue.writeBuffer(d,16,this._gqaData,0,16),this.runWithOffsets(this._splitQKNormShaderKey,[{buffer:this.b2.qProjFull,offset:0,size:y},{buffer:this.b2.kProj,offset:0,size:U},{buffer:this.b2.vProj,offset:0,size:g},{buffer:this.b2.qProj,offset:0,size:P},{buffer:this.b2.qGate,offset:0,size:P},l.keys,l.values,d],o+h),this._gqaDv.setUint32(0,s+1,!0),this._gqaDv.setUint32(4,s+1,!0),this._gqaDv.setUint32(8,s+1,!0),this._gqaDv.setUint32(12,s+1,!0),this.gpu.device.queue.writeBuffer(d,16,this._gqaData,0,16),this.runWithOffsets(this._splitQKNormShaderKey,[{buffer:this.b2.qProjFull,offset:y,size:y},{buffer:this.b2.kProj,offset:U,size:U},{buffer:this.b2.vProj,offset:g,size:g},{buffer:this.b2.qProj,offset:P,size:P},{buffer:this.b2.qGate,offset:P,size:P},l.keys,l.values,d],o+h);const 
S=s+1,b=s+2;/* Attention per token slot: the first dispatch gets field0 = s+1 and field24 = s via _gqaParamBuf0; the second gets field0 = s+2 and field24 = s+1 via _gqaParamBuf1. The other fields (headDim, numKVHeads, numHeads, heads per KV head, partialDim, ropeTheta) are shared. */this._gqaDv.setUint32(0,S,!0),this._gqaDv.setUint32(4,u,!0),this._gqaDv.setUint32(8,h,!0),this._gqaDv.setUint32(12,o,!0),this._gqaDv.setUint32(16,m,!0),this._gqaDv.setUint32(20,this.partialDim,!0),this._gqaDv.setUint32(24,s,!0),this._gqaDv.setFloat32(28,this.ropeTheta,!0),this.gpu.device.queue.writeBuffer(this.b2._gqaParamBuf0,0,this._gqaData),this.runWithOffsets("gqa_attention_head",[{buffer:this.b2.qProj,offset:0,size:P},l.keys,l.values,{buffer:this.b2.attnOut,offset:0,size:r.qProjSize*4},this.b2._gqaParamBuf0],o),this._gqaDv.setUint32(0,b,!0),this._gqaDv.setUint32(24,s+1,!0),this.gpu.device.queue.writeBuffer(this.b2._gqaParamBuf1,0,this._gqaData),this.runWithOffsets("gqa_attention_head",[{buffer:this.b2.qProj,offset:P,size:P},l.keys,l.values,{buffer:this.b2.attnOut,offset:r.qProjSize*4,size:r.qProjSize*4},this.b2._gqaParamBuf1],o);/* Output projection fused with the sigmoid gate; the result is written to b2.qProj. NOTE(review): the "_f16" shader is used unconditionally here, whereas gptqMatvecB2 branches on gpu.hasF16. Confirm a non-f16 device cannot reach this path. */const v=this.getQWeight(`${i}.o_proj`),C=o*u,D=this.makeUniform(`fused_sig_mv_${C}_${n}`,[C,n,this.groupSize]);this.runCached(`b2_fused_sig_oproj_${t}`,"fused_sigmoid_gptq_b2_f16",[this.b2.attnOut,this.b2.qGate,v.qweight,v.scales,this.b2.qProj,D],this.wg4(n))}/**
 * Two-token ("B2") version of a linear-attention layer. The layer output is
 * written to `this.b2.attnOut` by the final fused out_proj dispatch.
 *
 * @param a hidden-state buffer holding two `hiddenSize*4`-byte slots
 * @param e second input of "add_rmsnorm_ro_b2" (unused for layer 0)
 * @param t layer index
 */linearAttentionB2(a,e,t){const s=`model.language_model.layers.${t}.linear_attn`,i=this.hiddenSize,n=this.b2._dims,r=n.linHeads,u=n.linKeyDim,o=n.linValDim;n.linEVD;const h=n.linQKVDim,m=n.valueDim,c=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,p=this.weights[f],w=this.makeUniform("add_rmsnorm_params",[i,this.rmsEps]);c?(this.runWithOffsets("rmsnorm",[{buffer:a,offset:0,size:i*4},p,{buffer:this.b2.normed,offset:0,size:i*4},w],1),this.runWithOffsets("rmsnorm",[{buffer:a,offset:i*4,size:i*4},p,{buffer:this.b2.normed,offset:i*4,size:i*4},w],1)):this.run("add_rmsnorm_ro_b2",[a,e,p,this.b2.normed,w],2);{const 
P=[this.gptqMatvecB2Op(this.b2.normed,this.b2.linQKV,`${s}.in_proj_qkv`,i,h),this.gptqMatvecB2Op(this.b2.normed,this.b2.linZ,`${s}.in_proj_z`,i,m)];/* The alpha/beta projections are only issued as separate GPTQ ops when this.abQuantized is set. */this.abQuantized&&(P.push(this.gptqMatvecB2Op(this.b2.normed,this.b2.linAlpha,`${s}.in_proj_a`,i,this.linValueHeads)),P.push(this.gptqMatvecB2Op(this.b2.normed,this.b2.linBeta,`${s}.in_proj_b`,i,this.linValueHeads))),this.gpu.dispatchMulti(P.filter(Boolean))}const B=this.weights[`${s}.conv1d.weight`],q=this.weights[`${s}.A_log`],_=this.weights[`${s}.dt_bias`],l=this.weights[`${s}.norm.weight`],d=h*4,y=m*4;/*
 * "fused_conv_deltanet_norm" is dispatched twice, once per token slot, in slot
 * order. Both dispatches bind the same linConvHist[t] and linState[t] buffers.
 * The uniform is looked up in this.paramBufs by a key built from the layer
 * dimensions; this method does not create it.
 * Quantized branch: binds the per-slot linAlpha/linBeta outputs.
 * Otherwise: binds the per-slot normed input plus this.linABWeight[t].
 */if(this.abQuantized){const P=this.linValueHeads,S=P*4,b=`fused_cdn_q_${r}_${u}_${o}_${h}_${P}`,v=this.paramBufs[b];this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:0,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:0,size:y},{buffer:this.b2.linAlpha,offset:0,size:S},{buffer:this.b2.linBeta,offset:0,size:S},q,_,l,v],r),this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:d,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:y,size:y},{buffer:this.b2.linAlpha,offset:S,size:S},{buffer:this.b2.linBeta,offset:S,size:S},q,_,l,v],r)}else{const P=`fused_cdn_ext_${r}_${u}_${o}_${h}_${i}_${this.linValueHeads}`,S=this.paramBufs[P],b=this.linABWeight[t];this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:0,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:0,size:y},{buffer:this.b2.normed,offset:0,size:i*4},b,q,_,l,S],r),this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:d,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:y,size:y},{buffer:this.b2.normed,offset:i*4,size:i*4},b,q,_,l,S],r)}const 
U=this.getQWeight(`${s}.out_proj`),g=this.makeUniform(`fused_silu_mv_${m}_${i}`,[m,i,this.groupSize]);/* out_proj fused with the SiLU gate on linZ; always the "_f16" shader. */this.runCached(`b2_fused_silu_oproj_${t}`,"fused_silu_gptq_b2_f16",[this.b2.linZ,this.b2.linOut,U.qweight,U.scales,this.b2.attnOut,g],this.wg4(i))}/**
 * Two-token post-attention block for layer `i`: a "three_way_add_rmsnorm_b2"
 * dispatch over (a, t, s) with the post_attention_layernorm weight, writing to
 * `e` and `this.b2.normed`; then the fused gate/up/SiLU projection into
 * `this.b2.mlpIntermediate`; then down_proj into `this.b2.mlpOut`.
 * NOTE(review): the gate/up shader is always the "_f16" variant, unlike
 * gptqMatvecB2 which branches on gpu.hasF16.
 *
 * @param a first add input (the current hidden buffer in decoderLayerB2)
 * @param e buffer written by the three-way add (the other hidden buffer)
 * @param t second add input
 * @param s third add input (the attention output buffer)
 * @param i layer index
 */fusedNormMLPB2(a,e,t,s,i){const n=this.hiddenSize,r=this.intermediateSize,u=`model.language_model.layers.${i}.mlp`,o=`model.language_model.layers.${i}.post_attention_layernorm.weight`,h=this.weights[o],m=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.run("three_way_add_rmsnorm_b2",[a,t,s,h,e,this.b2.normed,m],2);const c=this.getQWeight(`${u}.gate_proj`),f=this.getQWeight(`${u}.up_proj`),p=this.makeUniform(`fused_mv_${n}_${r}`,[n,r,this.groupSize]);this.runCached(`b2_fused_gus_${i}`,"fused_gate_up_silu_b2_f16",[this.b2.normed,c.qweight,c.scales,f.qweight,f.scales,this.b2.mlpIntermediate,p],this.wg4(r)),this.gptqMatvecB2(this.b2.mlpIntermediate,this.b2.mlpOut,`${u}.down_proj`,r,n)}/**
 * Runs one two-token decoder layer: the attention variant selected by
 * `this.layerTypes[a]`, followed by fusedNormMLPB2. The attention output is
 * read from `b2.qProj` for full attention and from `b2.attnOut` otherwise.
 *
 * @param a layer index
 * @param e position, forwarded to fullAttentionB2 only
 * @param t current hidden buffer
 * @param s the other hidden buffer (receives the three-way add result)
 * @param i buffer passed as the residual/second input
 */decoderLayerB2(a,e,t,s,i){let n;this.layerTypes[a]==="full_attention"?(this.fullAttentionB2(t,i,a,e),n=this.b2.qProj):(this.linearAttentionB2(t,i,a),n=this.b2.attnOut),this.fusedNormMLPB2(t,s,i,n,a)}/**
 * Two-token LM head. With a quantized head (`this._lmHeadQWeight`) one GPTQ
 * matmul writes `this.b2.logits`. Otherwise "bf16_matvec" is run once per token
 * slot against the tied embedding matrix, or "lm_head.weight" when
 * tie_word_embeddings is explicitly false.
 */_dispatchLmHeadB2(){if(this._lmHeadQWeight){const a=this.hiddenSize,e=this.vocabSize,t=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]),s=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2";this.runCached("b2_lm_head",s,[this.b2.normed,this._lmHeadQWeight,this._lmHeadScales,this.b2.logits,t],this.wg4(e))}else{const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",t=this.weights[e],s=this.hiddenSize,i=this.vocabSize,n=this.makeUniform("lmhead_params",[s,i]);this.runWithOffsets("bf16_matvec",[{buffer:this.b2.normed,offset:0,size:s*4},t,{buffer:this.b2.logits,offset:0,size:i*4},n],this.wg(i)),this.runWithOffsets("bf16_matvec",[{buffer:this.b2.normed,offset:s*4,size:s*4},t,{buffer:this.b2.logits,offset:i*4,size:i*4},n],this.wg(i))}}async 
forwardB2(a,e,t){/*
 * Runs one batched forward step over two tokens (a, then e) starting at
 * position t, and resolves to the array returned by _readAndSampleB2.
 * Steps: lazily create the B2 buffers, embed both tokens, run every decoder
 * layer while ping-ponging hidden/hiddenB, optionally save the second slot of
 * the final hidden state and of mlpOut for MTP, apply the final norm and the LM
 * head. When temperature > 0 the per-slot top-k results are copied to the
 * readback buffers (2048 bytes each); otherwise the per-slot argmax results
 * (8 bytes each).
 */this.b2||this.initB2Buffers(),this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embeddingB2(a,e);let s=this.b2.hidden,i=this.b2.hiddenB,n=this.b2.zeroBuf;for(let f=0;f<this.numLayers;f++){this.decoderLayerB2(f,t,s,i,n),n=this.b2.mlpOut;const p=s;s=i,i=p}/* MTP capture: copies hiddenSize*4 bytes starting at source offset hiddenSize*4, i.e. the second token slot. */if(this.mtp){const f=this.hiddenSize;this.gpu.copyBuffer(s,this.mtp.savedHidden,f*4,f*4,0),this.gpu.copyBuffer(this.b2.mlpOut,this.mtp.savedMlpOut,f*4,f*4,0)}const r=this.weights["model.language_model.norm.weight"],u=this.makeUniform("add_rmsnorm_params",[this.hiddenSize,this.rmsEps]);this.run("add_rmsnorm_b2",[s,this.b2.mlpOut,r,this.b2.normed,u],2),this._dispatchLmHeadB2();const h=(this.temperature??.7)>0,m=this.vocabSize,c=this.makeUniform("argmax_params",[m]);return h?(this.runWithOffsets("topk_extract",[{buffer:this.b2.logits,offset:0,size:m*4},this.b2.topkResult0,c],1),this.runWithOffsets("topk_extract",[{buffer:this.b2.logits,offset:m*4,size:m*4},this.b2.topkResult1,c],1)):(this.runWithOffsets("argmax",[{buffer:this.b2.logits,offset:0,size:m*4},this.b2.argmaxResult0,c],1),this.runWithOffsets("argmax",[{buffer:this.b2.logits,offset:m*4,size:m*4},this.b2.argmaxResult1,c],1)),this.gpu._passCount!==void 0&&!this._b2PassCountLogged&&(console.log(`[PERF] B2: ${this.gpu._passCount} compute passes per 2-token step`),this._b2PassCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),h?(this.gpu.copyBuffer(this.b2.topkResult0,this.b2.topkReadback0,2048),this.gpu.copyBuffer(this.b2.topkResult1,this.b2.topkReadback1,2048)):(this.gpu.copyBuffer(this.b2.argmaxResult0,this.b2.argmaxReadback0,8),this.gpu.copyBuffer(this.b2.argmaxResult1,this.b2.argmaxReadback1,8)),this.gpu.endBatch(),this._readAndSampleB2(h)}/**
 * Maps the two readback buffers written by forwardB2 and returns the two
 * resulting token ids as a 2-element array, after appending both to
 * `_recentTokens`.
 *
 * @param a truthy to sample from the top-k readbacks on the CPU, falsy to read
 *          the two argmax results directly.
 */async _readAndSampleB2(a){const e=[0,0];a?(await 
Promise.all([this.b2.topkReadback0.mapAsync(GPUMapMode.READ),this.b2.topkReadback1.mapAsync(GPUMapMode.READ)]),e[0]=this._sampleFromTopkReadback(this.b2.topkReadback0),e[1]=this._sampleFromTopkReadback(this.b2.topkReadback1)):(await Promise.all([this.b2.argmaxReadback0.mapAsync(GPUMapMode.READ),this.b2.argmaxReadback1.mapAsync(GPUMapMode.READ)]),e[0]=new Uint32Array(this.b2.argmaxReadback0.getMappedRange().slice(0))[0],e[1]=new Uint32Array(this.b2.argmaxReadback1.getMappedRange().slice(0))[0],this.b2.argmaxReadback0.unmap(),this.b2.argmaxReadback1.unmap());for(const t of e)this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=t:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=t);return e}_sampleFromTopkReadback(a){const e=a.getMappedRange(),t=new Uint32Array(e),s=new Float32Array(e),i=new Uint32Array(256),n=new Float32Array(256);for(let o=0;o<256;o++)i[o]=t[o*2],n[o]=s[o*2+1];a.unmap();const r=this.presencePenalty??0,u=this.repetitionPenalty??1;if((r>0||u>1)&&this._recentTokenCount>0){const o=new Set;for(let h=0;h<this._recentTokenCount;h++)o.add(this._recentTokens[h]);for(let h=0;h<256;h++)o.has(i[h])&&(r>0&&(n[h]-=r),u>1&&(n[h]=n[h]>0?n[h]/u:n[h]*u))}return this._sampleFromArrays(i,n,256)}async _readAndSample(){await this._argmaxReadback.mapAsync(GPUMapMode.READ);const a=new Uint32Array(this._argmaxReadback.getMappedRange().slice(0));this._argmaxReadback.unmap();const e=a[0];return this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=e:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=e),e}_sampleFromArrays(a,e,t){const s=this.temperature??.7,i=this.topP??.8,n=this.topK??20,r=Math.min(n,t),u=this._selBuf||(this._selBuf=new Uint32Array(64)),o=this._selValBuf||(this._selValBuf=new Float32Array(64)),h=this._usedBuf||(this._usedBuf=new Uint8Array(256));h.fill(0);for(let l=0;l<r;l++){let d=-1,y=-1/0;for(let 
U=0;U<t;U++)!h[U]&&e[U]>y&&(y=e[U],d=U);if(d<0)break;u[l]=a[d],o[l]=y,h[d]=1}const m=o[0],c=this._probBuf||(this._probBuf=new Float32Array(64));let f=0;for(let l=0;l<r;l++)c[l]=Math.exp((o[l]-m)/s),f+=c[l];for(let l=0;l<r;l++)c[l]/=f;let p=0,w=r;for(let l=0;l<r;l++)if(p+=c[l],p>=i){w=l+1;break}let B=0;for(let l=0;l<w;l++)B+=c[l];const q=Math.random()*B;let _=0;for(let l=0;l<w;l++)if(_+=c[l],_>=q)return u[l];return u[0]}_sample(a){const e=this.temperature??.7,t=this.topP??.8,s=this.topK??20,i=a.length;if(e<=0){let _=0,l=a[0];for(let d=1;d<i;d++)a[d]>l&&(l=a[d],_=d);return _}const n=Math.max(s,64),r=new Int32Array(n).fill(-1),u=new Float32Array(n).fill(-1/0);let o=-1/0;for(let _=0;_<i;_++){const l=a[_];if(l>o&&(o=l),l>u[n-1]){let d=n-1;for(;d>0&&l>u[d-1];)u[d]=u[d-1],r[d]=r[d-1],d--;u[d]=l,r[d]=_}}const h=Math.min(s,n),m=new Float32Array(h);let c=0;for(let _=0;_<h&&!(r[_]<0);_++)m[_]=Math.exp((u[_]-o)/e),c+=m[_];for(let _=0;_<h;_++)m[_]/=c;let f=0,p=h;for(let _=0;_<h;_++){if(r[_]<0){p=_;break}if(f+=m[_],f>=t){p=_+1;break}}let w=0;for(let _=0;_<p;_++)w+=m[_];const B=Math.random()*w;let q=0;for(let _=0;_<p;_++)if(q+=m[_],q>=B)return r[_];return r[0]}async generate(a,e=512,t,s){this.seqLen=0,this._recentTokenCount=0,this._replayFlat=null,this._ropeDelta=s?.ropeDelta??0;const i=this.maxSeqLen||4096;if(e=Math.min(e,i-a.length),e<=0)throw new Error(`Prompt (${a.length} tokens) exceeds context size (${i})`);const n=[...a],r=this.textCfg.linear_num_key_heads,u=this.textCfg.linear_key_head_dim,o=this.textCfg.linear_value_head_dim,c=this.linValueHeads/r*o,f=r*(u+u+c);for(let P=0;P<this.numLayers;P++)if(this.layerTypes[P]==="linear_attention"){const S=r*u*c*4,b=3*f*4;this.gpu.device.queue.writeBuffer(this.linState[P],0,new Uint8Array(S)),this.gpu.device.queue.writeBuffer(this.linConvHist[P],0,new Uint8Array(b))}let p=null;if(s){let P=0;const S=s.imageTokenId,b=s.positionIds3D;for(let v=0;v<a.length;v++){const 
C=b[0][v],D=b[1][v],k=b[2][v];/* Vision prefill: one batch per prompt token. Image-token positions take their embedding from s.embedBuffer (consumed in order via P); all other positions use the normal embedding. The three position ids are passed to decoderLayer together with v. */this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),a[v]===S?this.embeddingFromVisionBuffer(s.embedBuffer,P++):this.embedding(a[v]);let z=this.hidden,M=this.hiddenB,$=this.zeroBuf;for(let O=0;O<this.numLayers;O++){this.decoderLayer(O,C,z,M,$,D,k,v),$=this.mlpOut;const G=z;z=M,M=G}/* Only the last prompt position runs the final norm, LM head and sampling. */if(v===a.length-1){const O=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(z,this.mlpOut,this.normed,O,"add_final_norm"),this._dispatchLmHead();const G=this.temperature??.7;if(G>0){const R=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,R],1);const F=Math.random()*4294967295>>>0,T=this._makeMixedUniform("sample_params",[{f:G},{u:this.topK??20},{f:this.topP??.8},{u:F}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,T],1)}else{const R=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,R],1)}this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8)}this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.endBatch(),this.seqLen=v+1}p=await this._readAndSample()}else for(let P=0;P<a.length;P++)p=await this.forward(a[P],P),this.seqLen=P+1;n.push(p);/* EOS ids come from the config; [248044, 248046] is the fallback when none is configured. */const w=this.config.eos_token_id??this.textCfg.eos_token_id,B=Array.isArray(w)?w:w!=null?[w]:[248044,248046];if(t?.(p,0)||B.includes(p))return n;/* NOTE(review): in the vision branch _readAndSample() has already appended p to _recentTokens, and it is appended again here. Confirm whether the duplicate entry is intended (forward() is not visible in this excerpt). */this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=p:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=p),this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const 
_=8;/* Decode loop: tokens are produced in groups of up to 8 GPU steps per CPU readback. Within a group, step 0 embeds the last known token and later steps embed straight from the previous GPU result (embeddingFromArgmax). */this._tokenHistoryBuf||(this._tokenHistoryBuf=this.gpu.createBuffer("token_history",_*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this._tokenHistoryReadback=this.gpu.device.createBuffer({size:_*4,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"token_history_readback"}));let l=0,d=0,y=1,U=p,g=!1;for(;y<e;){const P=performance.now(),S=Math.min(_,e-y);for(let k=0;k<S;k++){const z=this.seqLen+k+this._ropeDelta;if(this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),k===0?this.embedding(U):this.embeddingFromArgmax(),g)this._replayCoreForward(z);else{let G=this.hidden,R=this.hiddenB,F=this.zeroBuf;/* The first step of the first group is recorded (with _forceMinSplits = 2) so later steps can replay it. */k===0&&!this._replayFlat&&(this._forceMinSplits=2,this.gpu.startRecording());for(let x=0;x<this.numLayers;x++){this.decoderLayer(x,z,G,R,F),F=this.mlpOut;const V=G;G=R,R=V}const T=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(G,this.mlpOut,this.normed,T,"add_final_norm"),this._dispatchLmHead(),k===0&&this.gpu._recording&&(this._forceMinSplits=0,this._buildReplayFromRecording())}const M=this.temperature??.7;if(M>0){const G=this.repetitionPenalty??1,R=this.presencePenalty??0,F=this._recentTokenCount+k;if(F>0&&(G>1||R>0)){const j=this._makeMixedUniform("penalty_params",[{u:Math.min(F,this._repMaxTokens)},{f:G},{f:R},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,j],Math.ceil(Math.min(F,this._repMaxTokens)/256))}const T=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,T],1);const x=Math.random()*4294967295>>>0,V=this._makeMixedUniform("sample_params",[{f:M},{u:this.topK??20},{f:this.topP??.8},{u:x}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,V],1)}else{const G=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,G],1)}const 
$=(this._recentTokenCount+k)%this._repMaxTokens,O=this.makeUniform(`append_${k}`,[$,k]);/* "append_token" receives the GPU recent-token buffer and the token-history buffer, with uniform [slot, k] where slot = (recentTokenCount + k) mod repMaxTokens. */this.run("append_token",[this.argmaxResult,this._gpuRecentTokens,this._tokenHistoryBuf,O],1),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.endBatch()}!g&&this._replayFlat&&(g=!0);/* One readback per group: copy S tokens of history and map them. */const b=this.gpu.device.createCommandEncoder();b.copyBufferToBuffer(this._tokenHistoryBuf,0,this._tokenHistoryReadback,0,S*4),this.gpu.device.queue.submit([b.finish()]),await this._tokenHistoryReadback.mapAsync(GPUMapMode.READ);const v=new Uint32Array(this._tokenHistoryReadback.getMappedRange().slice(0,S*4));this._tokenHistoryReadback.unmap();const C=performance.now();l+=C-P,d+=S;let D=!1;for(let k=0;k<S;k++){const z=v[k];n.push(z),this.seqLen++,this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=z:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=z);const M=t?.(z,y);if(y++,M||B.includes(z)){D=!0;break}}if(d%50<_&&console.log(`[T @${d}] ${(l/d).toFixed(1)}ms/tok (batch=${_})`),D)break;/* Re-sync the GPU copy of the recent-token window from the CPU copy before the next group. */U=v[S-1],this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4)}return d>0&&console.log(`[T final @${d}] ${(l/d).toFixed(1)}ms/tok (batch=${_})`),n}/**
 * Quantizes the GPU buffer `a` on the GPU in two submissions: the
 * "quantize_bf16_to_int4" shader fills a packed weight buffer of (e/8)*t u32
 * and an f32 scale buffer of (e/groupSize)*t entries; then
 * "pack_f32_to_f16_pairs" packs the scales two per u32. Waits for the queue to
 * drain, destroys the temporary f32 scale buffer and both uniform buffers, and
 * resolves to { qweight, scales }. The caller keeps ownership of `a`.
 *
 * The first dispatch is (min(t, 65535), ceil(t / 65535)) workgroups.
 *
 * @param a source weight buffer (bf16 data, per the shader name)
 * @param e first dimension written to the uniform [e, t, groupSize]
 * @param t second dimension
 * @param s label prefix for the created buffers
 */async _quantizeBF16Weight(a,e,t,s){const i=this.groupSize,n=e/8,r=e/i,u=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,o=this.gpu.createBuffer(`${s}_qweight`,n*t*4,u),h=this.gpu.createBuffer(`${s}_scales_f32`,r*t*4,u),m=Math.ceil(r*t/2)*4,c=this.gpu.createBuffer(`${s}_scales`,m,u),f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await A(async()=>{const{SHADERS:v}=await import("./gpu-ops-PQDFq1iI.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),p=this.gpu.createBufferFromData(`${s}_qparams`,new 
Uint32Array([e,t,i]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),w=this.gpu.createBindGroup(f,0,[a,o,h,p]),B=65535,q=Math.min(t,B),_=Math.ceil(t/B),l=this.gpu.device.createCommandEncoder(),d=l.beginComputePass();d.setPipeline(f),d.setBindGroup(0,w),d.dispatchWorkgroups(q,_),d.end(),this.gpu.device.queue.submit([l.finish()]);const y=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await A(async()=>{const{SHADERS:v}=await import("./gpu-ops-PQDFq1iI.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),U=Math.ceil(r*t/2),g=this.gpu.createBufferFromData(`${s}_pparams`,new Uint32Array([U]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),P=this.gpu.createBindGroup(y,0,[h,c,g]),S=this.gpu.device.createCommandEncoder(),b=S.beginComputePass();return b.setPipeline(y),b.setBindGroup(0,P),b.dispatchWorkgroups(Math.ceil(U/256)),b.end(),this.gpu.device.queue.submit([S.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),h.destroy(),p.destroy(),g.destroy(),{qweight:o,scales:c}}/**
 * One-time setup of the MTP (multi-token prediction) head; a no-op when
 * `this.mtp` is already set. Downloads the MTP tensors for "Qwen/Qwen3.5-2B",
 * uploads them, quantizes the eight projection matrices to INT4 (destroying
 * their source buffers), keeps raw copies of the norm weights, allocates a
 * 4096-position KV cache plus scratch buffers, and builds the trimmed LM head.
 *
 * NOTE(review): `this.mtp` is assigned before the fallible awaits below. If
 * quantization or the trimmed-head build throws, `this.mtp` stays set in a
 * half-initialised state and a later initMTP() call returns early.
 *
 * @param a forwarded as the second argument to loadMTPWeights (presumably a
 *          progress callback; confirm against the loader).
 */async initMTP(a){if(this.mtp)return;const e=this.hiddenSize,t=this.intermediateSize,s=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,i=performance.now(),n="Qwen/Qwen3.5-2B";console.log(`[MTP] Downloading MTP weights from ${n}...`);const{loadMTPWeights:r}=await A(async()=>{const{loadMTPWeights:B}=await import("./safetensors-loader-CwGm5mJX.js");return{loadMTPWeights:B}},[],import.meta.url),u=await r(n,a);console.log(`[MTP] Downloaded ${Object.keys(u).length} MTP tensors`),this.mtp={weights:{},qweights:{}};const o={};for(const[B,q]of Object.entries(u)){const _=this.gpu.createBufferFromData(`mtp_${B}`,q.data);o[B]=_,this.mtp.weights[B]=_}const 
h=[{name:"mtp.fc.weight",K:e*2,N:e},{name:"mtp.layers.0.self_attn.q_proj.weight",K:e,N:this.numHeads*this.headDim*2},{name:"mtp.layers.0.self_attn.k_proj.weight",K:e,N:this.numKVHeads*this.headDim},{name:"mtp.layers.0.self_attn.v_proj.weight",K:e,N:this.numKVHeads*this.headDim},{name:"mtp.layers.0.self_attn.o_proj.weight",K:e,N:e},{name:"mtp.layers.0.mlp.gate_proj.weight",K:e,N:t},{name:"mtp.layers.0.mlp.up_proj.weight",K:e,N:t},{name:"mtp.layers.0.mlp.down_proj.weight",K:t,N:e}];/* Quantize each listed matrix, then destroy its uploaded source buffer and drop it from mtp.weights. NOTE(review): a tensor missing from the download leaves o[B] undefined here; there is no explicit check. */for(const{name:B,K:q,N:_}of h){const{qweight:l,scales:d}=await this._quantizeBF16Weight(o[B],q,_,`mtp_${B}`);this.mtp.qweights[B]={qweight:l,scales:d},o[B].destroy(),delete this.mtp.weights[B]}this.mtp.normRaw={};/* CPU-side copies of the norm weights, reinterpreted as u32 words. */const m=["mtp.pre_fc_norm_hidden.weight","mtp.pre_fc_norm_embedding.weight","mtp.layers.0.input_layernorm.weight","mtp.layers.0.post_attention_layernorm.weight","mtp.norm.weight","mtp.layers.0.self_attn.q_norm.weight","mtp.layers.0.self_attn.k_norm.weight"];for(const B of m){const q=u[B];q&&(this.mtp.normRaw[B]=new Uint32Array(q.data.buffer.slice(q.data.byteOffset,q.data.byteOffset+q.data.byteLength)))}/* KV cache sized for a hard-coded 4096 positions, independent of this.maxSeqLen. */const f=4096*this.numKVHeads*this.headDim*4;this.mtp.kvCache={keys:this.gpu.createBuffer("mtp_kv_keys",f,s),values:this.gpu.createBuffer("mtp_kv_values",f,s)},this.mtp.concatBuf=this.gpu.createBuffer("mtp_concat",e*2*4,s),this.mtp.fcOut=this.gpu.createBuffer("mtp_fc_out",e*4,s),this.mtp.savedHidden=this.gpu.createBuffer("mtp_saved_hidden",e*4,s),this.mtp.savedMlpOut=this.gpu.createBuffer("mtp_saved_mlp_out",e*4,s),this.mtp.seqLen=0,this._buildMTPSplitQKNormUniform(),this._buildMTPFusedMLPUniform(),await this._buildTrimmedLmHead(8e3);const p=((performance.now()-i)/1e3).toFixed(1),w=h.length;console.log(`[MTP] Initialized in ${p}s: ${w} INT4 weights, 1 KV cache layer`)}/**
 * Builds the uniform buffer stored as `this.mtp.fusedSQKParams`.
 * Layout: a 32-byte header [numHeads, numKVHeads, headDim, rmsEps(f32), 0, 0,
 * 0, 0], followed by the q_norm then k_norm weights as u32 words, four words
 * per 16-byte row. Bytes 16..31 of the header are overwritten per step by
 * mtpForwardSubmit. Norm weights that were not downloaded leave their region
 * zeroed.
 */_buildMTPSplitQKNormUniform(){const a=this.numHeads,e=this.numKVHeads,t=this.headDim,s=a*t,i=e*t,n=(s+i)/2,u=32+Math.ceil(n/4)*16,o=new ArrayBuffer(u),h=new 
DataView(o);h.setUint32(0,a,!0),h.setUint32(4,e,!0),h.setUint32(8,t,!0),h.setFloat32(12,this.rmsEps,!0),h.setUint32(16,0,!0),h.setUint32(20,0,!0),h.setUint32(24,0,!0),h.setUint32(28,0,!0);const m=this.mtp.normRaw["mtp.layers.0.self_attn.q_norm.weight"],c=this.mtp.normRaw["mtp.layers.0.self_attn.k_norm.weight"];if(m)for(let p=0;p<s/2;p++){const w=Math.floor(p/4),B=p%4;h.setUint32(32+w*16+B*4,m[p],!0)}if(c){const p=s/2;for(let w=0;w<i/2;w++){const B=p+w,q=Math.floor(B/4),_=B%4;h.setUint32(32+q*16+_*4,c[w],!0)}}const f=this.gpu.device.createBuffer({size:u,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"mtp_fused_sqk"});this.gpu.device.queue.writeBuffer(f,0,new Uint8Array(o)),this.mtp.fusedSQKParams=f}/** Intentionally empty; initMTP calls it, but it currently does nothing. */_buildMTPFusedMLPUniform(){}/**
 * Builds a reduced LM head covering token ids 0..a-1 (the index table is the
 * identity mapping): gathers those rows from the embedding / lm_head matrix
 * with "gather_rows_bf16", quantizes the gathered matrix to INT4, and
 * allocates an `a`-entry logits buffer. Results are stored on `this.mtp` as
 * trimmedToFull, trimmedVocabSize, trimmedLmHead and trimmedLogits.
 *
 * @param a number of leading vocabulary rows to keep (default 8000)
 */async _buildTrimmedLmHead(a=8e3){const e=this.hiddenSize,t=this.gpu,s=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,i=new Uint32Array(a);for(let y=0;y<a;y++)i[y]=y;this.mtp.trimmedToFull=i,this.mtp.trimmedVocabSize=a;const n=t.createBufferFromData("mtp_trim_indices",i),u=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",o=this.weights[u],h=e/2,m=a*h*4,c=t.createBuffer("mtp_trim_gathered",m,s),f=(await A(async()=>{const{SHADERS:y}=await import("./gpu-ops-PQDFq1iI.js").then(U=>U.b);return{SHADERS:y}},[],import.meta.url)).SHADERS.gather_rows_bf16,p=t.getOrCreatePipeline("gather_rows_bf16",f),w=t.createBufferFromData("mtp_trim_gparams",new Uint32Array([h,a]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),B=t.createBindGroup(p,0,[o,n,c,w]),q=t.device.createCommandEncoder(),_=q.beginComputePass();_.setPipeline(p),_.setBindGroup(0,B),_.dispatchWorkgroups(Math.ceil(h/256),a),_.end(),t.device.queue.submit([q.finish()]);const{qweight:l,scales:d}=await 
this._quantizeBF16Weight(c,e,a,"mtp_trim_lmhead");this.mtp.trimmedLmHead={qweight:l,scales:d},this.mtp.trimmedLogits=t.createBuffer("mtp_trimmed_logits",a*4,s),c.destroy(),n.destroy(),w.destroy(),console.log(`[MTP] Trimmed lm_head: ${a} tokens, ${(l.size/1024/1024).toFixed(1)}MB qw + ${(d.size/1024/1024).toFixed(1)}MB sc`)}/**
 * Runs the trimmed LM head on `this.normed`, writing `this.mtp.trimmedLogits`.
 * Uses the "4t" GPTQ matvec shaders when hiddenSize/groupSize is a multiple of
 * 4, otherwise the plain ones; the "_f16" variant is chosen when gpu.hasF16.
 */_dispatchTrimmedLmHead(){const a=this.hiddenSize,e=this.mtp.trimmedVocabSize,t=this.mtp.trimmedLmHead,i=a/this.groupSize%4===0,n=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]);if(i){const r=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached("mtp_trim_lmhead",r,[this.normed,t.qweight,t.scales,this.mtp.trimmedLogits,n],this.wg8(e))}else{const r=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached("mtp_trim_lmhead",r,[this.normed,t.qweight,t.scales,this.mtp.trimmedLogits,n],this.wg4(e))}}/** Looks up a quantized MTP weight ({ qweight, scales }) by tensor name; undefined when absent. */_mtpGetQWeight(a){return this.mtp.qweights[a]}/**
 * Dispatches a GPTQ matvec with an MTP weight. Throws when the weight is
 * missing. Shader selection follows the same 4t / f16 rules as
 * _dispatchTrimmedLmHead, keyed on s/groupSize.
 *
 * @param a input buffer
 * @param e output buffer
 * @param t MTP tensor name
 * @param s first dimension written to the uniform [s, i, groupSize]
 * @param i second dimension (determines the dispatch size)
 */_mtpGptqMatvec(a,e,t,s,i){const n=this._mtpGetQWeight(t);if(!n)throw new Error(`MTP weight not found: ${t}`);const u=s/this.groupSize%4===0,o=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);if(u){const h=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg8(i))}else{const h=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg4(i))}}/** When MTP is initialised, copies hiddenSize*4 bytes of `a` into mtp.savedHidden and of `e` into mtp.savedMlpOut. */_mtpSaveHidden(a,e){this.mtp&&(this.gpu.copyBuffer(a,this.mtp.savedHidden,this.hiddenSize*4),this.gpu.copyBuffer(e,this.mtp.savedMlpOut,this.hiddenSize*4))}/**
 * Runs one MTP draft step for token `a` and resolves to the token id read back
 * by _readAndSample. Throws when initMTP() has not been called.
 */async mtpForward(a){if(!this.mtp)throw new Error("MTP not initialized. 
Call initMTP() first.");return this.mtpForwardSubmit(a),await this._readAndSample()}/**
 * Encodes and submits one MTP step for token `a` without waiting for the
 * result. It embeds the token and normalises it into the first half of
 * mtp.concatBuf; normalises savedHidden + savedMlpOut into the second half;
 * runs the fc projection; then a single attention + MLP layer using the MTP
 * weights and the MTP KV cache at position mtp.seqLen. It reuses the main
 * model's scratch buffers (hidden, normed, qProjFull, kProj, vProj, qProj,
 * qGate, attnOut, mlpIntermediate, mlpOut, logits, argmaxResult) and
 * increments mtp.seqLen at the end.
 */mtpForwardSubmit(a){const e=this.hiddenSize,t=this.intermediateSize,s=this.mtp.seqLen;this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);const i=this.makeUniform("add_rmsnorm_params",[e,this.rmsEps]);this.runCached("mtp_emb_norm","rmsnorm",[this.hidden,this.mtp.weights["mtp.pre_fc_norm_embedding.weight"],this.normed,i],1),this.gpu.copyBuffer(this.normed,this.mtp.concatBuf,e*4),this.runCached("mtp_hid_norm","add_rmsnorm_ro",[this.mtp.savedHidden,this.mtp.savedMlpOut,this.mtp.weights["mtp.pre_fc_norm_hidden.weight"],this.normed,i],1),this.gpu.copyBuffer(this.normed,this.mtp.concatBuf,e*4,0,e*4),this._mtpGptqMatvec(this.mtp.concatBuf,this.mtp.fcOut,"mtp.fc.weight",e*2,e),this.runCached("mtp_input_norm","rmsnorm",[this.mtp.fcOut,this.mtp.weights["mtp.layers.0.input_layernorm.weight"],this.normed,i],1);const n="mtp.layers.0.self_attn",r=this._mtpGptqMatvecOp(this.normed,this.qProjFull,`${n}.q_proj.weight`,e,this.numHeads*this.headDim*2),u=this._mtpGptqMatvecOp(this.normed,this.kProj,`${n}.k_proj.weight`,e,this.numKVHeads*this.headDim),o=this._mtpGptqMatvecOp(this.normed,this.vProj,`${n}.v_proj.weight`,e,this.numKVHeads*this.headDim);this.gpu.dispatchMulti([r,u,o].filter(Boolean)),this._gqaDv.setUint32(0,s,!0),this._gqaDv.setUint32(4,s,!0),this._gqaDv.setUint32(8,s,!0),this._gqaDv.setUint32(12,s,!0),this.gpu.device.queue.writeBuffer(this.mtp.fusedSQKParams,16,this._gqaData,0,16),this.runCached("mtp_sqk",this._splitQKNormShaderKey,[this.qProjFull,this.kProj,this.vProj,this.qProj,this.qGate,this.mtp.kvCache.keys,this.mtp.kvCache.values,this.mtp.fusedSQKParams],this.numHeads+this.numKVHeads);const 
h=s+1;/* Attention over the MTP KV cache: field0 = s + 1 and field24 = s, written to the shared this._gqaParamBuf. */this._gqaDv.setUint32(0,h,!0),this._gqaDv.setUint32(4,this.headDim,!0),this._gqaDv.setUint32(8,this.numKVHeads,!0),this._gqaDv.setUint32(12,this.numHeads,!0),this._gqaDv.setUint32(16,this.numHeads/this.numKVHeads,!0),this._gqaDv.setUint32(20,this.partialDim,!0),this._gqaDv.setUint32(24,s,!0),this._gqaDv.setFloat32(28,this.ropeTheta,!0),this.gpu.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData),this.runCached("mtp_gqa","gqa_attention_head",[this.qProj,this.mtp.kvCache.keys,this.mtp.kvCache.values,this.attnOut,this._gqaParamBuf],this.numHeads);/* o_proj fused with the sigmoid gate. Three shader variants: 4t+f16 when (numHeads*headDim)/groupSize is a multiple of 4 and f16 is available, plain f16, or the non-f16 fallback. The output goes to this.qProj. */const m=this._mtpGetQWeight(`${n}.o_proj.weight`),c=this.numHeads*this.headDim,f=this.makeUniform(`fused_sig_mv_${c}_${e}`,[c,e,this.groupSize]);c/this.groupSize%4===0&&this.gpu.hasF16?this.runCached("mtp_sig_oproj","fused_sigmoid_gptq_4t_f16",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg8(e)):this.gpu.hasF16?this.runCached("mtp_sig_oproj","fused_sigmoid_gptq_f16",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg4(e)):this.runCached("mtp_sig_oproj","fused_sigmoid_gptq",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg4(e)),this.runCached("mtp_post_norm","add_rmsnorm_ro",[this.mtp.fcOut,this.qProj,this.mtp.weights["mtp.layers.0.post_attention_layernorm.weight"],this.normed,i],1);/* MLP: fused gate/up/SiLU, then down_proj into this.mlpOut. */const B=this.gpu.hasF16?"fused_gate_up_silu_f16":"fused_gate_up_silu",q=this.makeUniform(`fused_mv_${e}_${t}`,[e,t,this.groupSize]),_=this.mtp.qweights["mtp.layers.0.mlp.gate_proj.weight"],l=this.mtp.qweights["mtp.layers.0.mlp.up_proj.weight"];this.runCached("mtp_gate_up_silu",B,[this.normed,_.qweight,_.scales,l.qweight,l.scales,this.mlpIntermediate,q],this.wg4(t)),this._mtpGptqMatvec(this.mlpIntermediate,this.mlpOut,"mtp.layers.0.mlp.down_proj.weight",t,e),/* Final norm over fcOut + attention output + mlpOut. NOTE(review): the full-vocabulary _dispatchLmHead() is used here, not _dispatchTrimmedLmHead(); confirm whether the trimmed head built by initMTP is meant to be used. */this.runCached("mtp_final_norm","three_way_add_rmsnorm",[this.mtp.fcOut,this.qProj,this.mlpOut,this.mtp.weights["mtp.norm.weight"],this.hidden,this.normed,i],1),this._dispatchLmHead();const 
d=this.makeUniform("mtp_argmax_params",[this.vocabSize]);this.runCached("mtp_argmax","argmax",[this.logits,this.argmaxResult,d],1),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch(),this.mtp.seqLen++}_mtpSnapshotDeltaNet(){if(!this.mtp._deltaNetSnapshot){const e=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.mtp._deltaNetSnapshot={states:{},convs:{}};for(let t=0;t<this.numLayers;t++)this.layerTypes[t]==="linear_attention"&&(this.mtp._deltaNetSnapshot.states[t]=this.gpu.createBuffer(`mtp_snap_state_${t}`,this.linState[t].size,e),this.mtp._deltaNetSnapshot.convs[t]=this.gpu.createBuffer(`mtp_snap_conv_${t}`,this.linConvHist[t].size,e))}const a=this.mtp._deltaNetSnapshot;a.mtpSeqLen=this.mtp.seqLen;for(const e of Object.keys(a.states))this.gpu.copyBuffer(this.linState[e],a.states[e],this.linState[e].size),this.gpu.copyBuffer(this.linConvHist[e],a.convs[e],this.linConvHist[e].size)}_mtpRestoreDeltaNet(){const a=this.mtp._deltaNetSnapshot;if(a){a.mtpSeqLen!==void 0&&(this.mtp.seqLen=a.mtpSeqLen);for(const e of Object.keys(a.states))this.gpu.copyBuffer(a.states[e],this.linState[e],this.linState[e].size),this.gpu.copyBuffer(a.convs[e],this.linConvHist[e],this.linConvHist[e].size)}}async generateWithMTP(a,e=512,t){if(!this.mtp)throw new Error("MTP not initialized. 
Call initMTP() first.");this.seqLen=0,this.mtp.seqLen=0,this._recentTokenCount=0;const s=[...a],i=this.textCfg.linear_num_key_heads,n=this.textCfg.linear_key_head_dim,r=this.textCfg.linear_value_head_dim,o=this.linValueHeads/i*r,h=i*(n+n+o);for(let g=0;g<this.numLayers;g++)if(this.layerTypes[g]==="linear_attention"){const P=i*n*o*4,S=3*h*4;this.gpu.device.queue.writeBuffer(this.linState[g],0,new Uint8Array(P)),this.gpu.device.queue.writeBuffer(this.linConvHist[g],0,new Uint8Array(S))}let m=null;for(let g=0;g<a.length;g++)m=await this.forward(a[g],g),this.seqLen=g+1;s.push(m);const c=this.config.eos_token_id??this.textCfg.eos_token_id,f=Array.isArray(c)?c:c!=null?[c]:[248044,248046];if(t?.(m,0)||f.includes(m))return s;let w=1,B=0,q=0,_=m,l=0,d=0;for(;w<e;){const g=performance.now(),P=await this.mtpForward(_);this._mtpSnapshotDeltaNet();const S=this.seqLen,b=await this.forwardB2(_,P,this.seqLen);this.seqLen+=2;const v=b[0],C=b[1];if(v===P){B++,s.push(P),w++;let k=t?.(P,w-1);if(k||f.includes(P)||(s.push(C),w++,k=t?.(C,w-1),k||f.includes(C)))break;_=C}else{q++,this._mtpRestoreDeltaNet(),this.seqLen=S;const k=await this.forward(_,this.seqLen);if(this.seqLen++,s.push(k),w++,t?.(k,w-1)||f.includes(k))break;_=k}const D=performance.now();if(l+=D-g,d++,d%25===0){const k=B/(B+q)*100,z=w/d;console.log(`[MTP @${d}] ${(l/d).toFixed(1)}ms/step, ${(w/(l/1e3)).toFixed(0)} tok/s, accept=${k.toFixed(0)}%, ${z.toFixed(1)} tok/step`)}}const y=B/Math.max(1,B+q)*100;return console.log(`[MTP final] ${(w/((l||1)/1e3)).toFixed(0)} tok/s, accept=${y.toFixed(0)}% (${B}/${B+q}), ${w} tokens`),s}_mtpGptqMatvecOp(a,e,t,s,i){const n=this._mtpGetQWeight(t);if(!n)return null;const u=s/this.groupSize%4===0,o=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);if(u){const h=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";return this.prepOpCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg8(i))}else{const h=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";return 
this.prepOpCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg4(i))}}}export{N as Qwen35Model};

assets/qwen35-model-D0qiY8Dx.js DELETED Viewed

@@ -1 +0,0 @@

- import{S as T,a as E,_ as R}from"./gpu-ops-Bq_PFJSE.js";function V(j){const a=j<<16,e=new ArrayBuffer(4);new Uint32Array(e)[0]=a,new Float32Array(e)[0];const t=a>>>31&1,i=a>>>23&255,s=a&8388607;if(i===0)return t<<15;if(i===255)return t<<15|31744|(s?512:0);const n=i-127+15;return n>=31?t<<15|31744:n<=0?t<<15:t<<15|n<<10|s>>>13}class L{constructor(a,e,t){if(this.gpu=a,this.config=e,this.textCfg=e.text_config,this.quantConfig=t,this.hiddenSize=this.textCfg.hidden_size,this.intermediateSize=this.textCfg.intermediate_size,this.numLayers=this.textCfg.num_hidden_layers,this.numHeads=this.textCfg.num_attention_heads,this.numKVHeads=this.textCfg.num_key_value_heads,this.headDim=this.textCfg.head_dim,this.vocabSize=this.textCfg.vocab_size,this.rmsEps=this.textCfg.rms_norm_eps,this.ropeTheta=this.textCfg.rope_parameters?.rope_theta||1e7,this.partialRotary=this.textCfg.partial_rotary_factor,this.partialDim=Math.floor(this.headDim*this.partialRotary),this.layerTypes=this.textCfg.layer_types,this.groupSize=t?.group_size||t?.config_groups?.group_0?.weights?.group_size||128,this.fp16Layers=new Set,t?.extra_config)for(const u of Object.keys(t.extra_config))this.fp16Layers.add(u);this.weights={},this.kvCache={},this.seqLen=0,this.pipelines={},this.singlePassMode=!0,this._replayFlat=null,this._forceMinSplits=0;const i=this.numHeads*this.headDim,s=this.numKVHeads*this.headDim,n=(i+s)/2,r=Math.ceil(n/4);this._splitQKNormShaderKey=`fused_split_qknorm_kvstore_${r}`,T[this._splitQKNormShaderKey]||(T[this._splitQKNormShaderKey]=E(r))}compilePipelines(){const a=this.gpu;for(const[e,t]of Object.entries(T))this.pipelines[e]=a.getOrCreatePipeline(e,t)}uploadTensors(a){this.quantConfig?.quant_method==="compressed-tensors"&&(a=this._convertCompressedTensors(a));for(const[t,i]of Object.entries(a)){let s=i.data;if(i.dtype==="F32"&&!t.endsWith(".qweight")&&!t.endsWith(".qzeros")&&!t.endsWith(".scales")){const n=new Float32Array(s.buffer,s.byteOffset,s.byteLength/4),r=new 
Uint16Array(n.length);for(let u=0;u<n.length;u++){const h=new Uint32Array(n.buffer,n.byteOffset+u*4,1)[0];r[u]=h>>>16}s=new Uint8Array(r.buffer)}if(i._partial){let{offset:n,totalSize:r}=i._partial;if(i.dtype==="F32"&&!t.endsWith(".qweight")&&!t.endsWith(".qzeros")&&!t.endsWith(".scales")&&(n/=2,r/=2),n===0){const h=this.gpu.createBuffer(t,r,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC);this.gpu.device.queue.writeBuffer(h,0,s),this.weights[t]=h}else{const h=this.weights[t];h&&this.gpu.device.queue.writeBuffer(h,n,s)}}else this.weights[t]=this.gpu.createBufferFromData(t,s);(t.includes("post_attention_layernorm.weight")||t.includes("input_layernorm.weight")||t.includes("q_norm.weight")||t.includes("k_norm.weight")||t==="model.language_model.norm.weight")&&(this._normWeightRaw||(this._normWeightRaw={}),this._normWeightRaw[t]=new Uint32Array(s.buffer.slice(s.byteOffset,s.byteOffset+s.byteLength))),this._uploadedCount=(this._uploadedCount||0)+1}}_convertCompressedTensors(a){const e={};for(const[t,i]of Object.entries(a))if(!t.endsWith(".weight_shape")){if(t.endsWith(".weight_packed")){const s=t.slice(0,-14),n=i.shape,r=n[0],u=n[1],h=new Int32Array(i.data.buffer,i.data.byteOffset,i.data.byteLength/4),o=new Int32Array(r*u);for(let m=0;m<r;m++)for(let c=0;c<u;c++)o[c*r+m]=h[m*u+c];e[`${s}.qweight`]={dtype:"I32",shape:[u,r],data:new Uint8Array(o.buffer)};continue}if(t.endsWith(".weight_scale")){const s=t.slice(0,-13),n=i.shape,r=n[0],u=n[1],h=new Uint16Array(i.data.buffer,i.data.byteOffset,i.data.byteLength/2),o=Math.ceil(r/2),m=new Uint32Array(u*o);for(let c=0;c<u;c++)for(let f=0;f<r;f+=2){const b=h[f*u+c],v=f+1<r?h[(f+1)*u+c]:0,U=V(b),w=V(v);m[c*o+(f>>1)]=U|w<<16}e[`${s}.scales`]={dtype:"I32",shape:[u,o],data:new Uint8Array(m.buffer)};continue}e[t]=i}return e}async postProcessWeights(){const 
a=this.textCfg.linear_num_key_heads||0,e=this.hiddenSize,t=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,i=this.layerTypes.indexOf("linear_attention"),s=i>=0?`model.language_model.layers.${i}.linear_attn`:"";if(this.abQuantized=s&&!!this.weights[`${s}.in_proj_a.qweight`],this.abQuantized){const g=this.textCfg.linear_num_value_heads??a;this.linAlpha=this.gpu.createBuffer("lin_alpha",g*4,t),this.linBeta=this.gpu.createBuffer("lin_beta",g*4,t)}else{this.linABWeight={};const g=this.textCfg.linear_num_value_heads??a,S=e*2,B=g*S,d=2*B;for(let P=0;P<this.numLayers;P++)if(this.layerTypes[P]==="linear_attention"){const q=`model.language_model.layers.${P}.linear_attn`,k=this.weights[`${q}.in_proj_a.weight`],$=this.weights[`${q}.in_proj_b.weight`];if(k&&$){const y=this.gpu.createBuffer(`ab_merged_${P}`,d,t),C=this.gpu.device.createCommandEncoder();C.copyBufferToBuffer(k,0,y,0,B),C.copyBufferToBuffer($,0,y,B,B),this.gpu.device.queue.submit([C.finish()]),this.linABWeight[P]=y}}}{const g=[];for(let d=0;d<this.numLayers;d++){if(this.layerTypes[d]==="linear_attention"){const P=`model.language_model.layers.${d}.linear_attn`,q=this.textCfg.linear_num_key_heads||0,k=this.textCfg.linear_key_head_dim||128,$=this.textCfg.linear_value_head_dim||128,y=this.textCfg.linear_num_value_heads??q,C=y*$,D=y/q*$,z=q*(k+k+D);g.push({prefix:`${P}.in_proj_qkv`,K:e,N:z}),g.push({prefix:`${P}.in_proj_z`,K:e,N:C}),g.push({prefix:`${P}.out_proj`,K:C,N:e})}else{const 
P=`model.language_model.layers.${d}.self_attn`,q=this.numHeads*this.headDim*2,k=this.numKVHeads*this.headDim;g.push({prefix:`${P}.q_proj`,K:e,N:q}),g.push({prefix:`${P}.k_proj`,K:e,N:k}),g.push({prefix:`${P}.v_proj`,K:e,N:k}),g.push({prefix:`${P}.o_proj`,K:this.numHeads*this.headDim,N:e})}g.push({prefix:`model.language_model.layers.${d}.mlp.gate_proj`,K:e,N:this.intermediateSize}),g.push({prefix:`model.language_model.layers.${d}.mlp.up_proj`,K:e,N:this.intermediateSize}),g.push({prefix:`model.language_model.layers.${d}.mlp.down_proj`,K:this.intermediateSize,N:e})}let S=0;const B=performance.now();for(const{prefix:d,K:P,N:q}of g)if(!this.weights[`${d}.qweight`]&&this.weights[`${d}.weight`]){const{qweight:k,scales:$}=await this._quantizeBF16ToINT4(this.weights[`${d}.weight`],P,q,this.groupSize,d.replace(/\./g,"_"));this.weights[`${d}.qweight`]=k,this.weights[`${d}.scales`]=$,S++}S>0&&console.log(`[QUANT] GPU-quantized ${S} BF16 projections to INT4 in ${(performance.now()-B).toFixed(0)}ms`)}this._mergedGateUp={};const n=this.intermediateSize,r=e/8,u=r*n*4,o=r/(this.groupSize/8)*n*2;for(let g=0;g<this.numLayers;g++){const S=`model.language_model.layers.${g}.mlp`,B=this.getQWeight(`${S}.gate_proj`),d=this.getQWeight(`${S}.up_proj`);if(B.qweight&&d.qweight){const P=this.gpu.createBuffer(`merged_qw_${g}`,u*2,t),q=this.gpu.createBuffer(`merged_sc_${g}`,o*2,t),k=this.gpu.device.createCommandEncoder();k.copyBufferToBuffer(B.qweight,0,P,0,u),k.copyBufferToBuffer(d.qweight,0,P,u,u),k.copyBufferToBuffer(B.scales,0,q,0,o),k.copyBufferToBuffer(d.scales,0,q,o,o),this.gpu.device.queue.submit([k.finish()]),this._mergedGateUp[g]={qweight:P,scales:q}}}this._fusedMLPParams={};const m=16+512*16;for(let g=0;g<this.numLayers;g++){const S=`model.language_model.layers.${g}.post_attention_layernorm.weight`,B=this._normWeightRaw?.[S];if(!B||!this._mergedGateUp[g])continue;const d=new ArrayBuffer(m),P=new Uint32Array(d),q=new 
Float32Array(d);P[0]=e,P[1]=n,P[2]=this.groupSize,q[3]=this.rmsEps;for(let k=0;k<B.length;k++)P[4+k]=B[k];this._fusedMLPParams[g]=this.gpu.createBufferFromData(`fused_mlp_params_${g}`,new Uint32Array(d),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}this._fusedSQKParams={};const c=this.headDim,f=this.numHeads,b=this.numKVHeads,v=f*c,U=b*c,w=(v+U)/2,_=Math.ceil(w/4),p=32+_*16,l=`fused_split_qknorm_kvstore_${_}`;T[l]||(T[l]=E(_)),this.pipelines[l]||(this.pipelines[l]=this.gpu.getOrCreatePipeline(l,T[l])),this._splitQKNormShaderKey=l;for(let g=0;g<this.numLayers;g++){if(this.layerTypes[g]!=="full_attention")continue;const S=`model.language_model.layers.${g}.self_attn`,B=`${S}.q_norm.weight`,d=`${S}.k_norm.weight`,P=this._normWeightRaw?.[B],q=this._normWeightRaw?.[d],k=new ArrayBuffer(p),$=new DataView(k);if($.setUint32(0,f,!0),$.setUint32(4,b,!0),$.setUint32(8,c,!0),$.setFloat32(12,this.rmsEps,!0),$.setUint32(16,0,!0),$.setUint32(20,this.partialDim,!0),$.setFloat32(24,this.ropeTheta,!0),P)for(let C=0;C<v/2;C++){const D=Math.floor(C/4),z=C%4;$.setUint32(32+D*16+z*4,P[C],!0)}if(q){const C=v/2;for(let D=0;D<U/2;D++){const z=C+D,x=Math.floor(z/4),O=z%4;$.setUint32(32+x*16+O*4,q[D],!0)}}const y=this.gpu.device.createBuffer({size:p,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:`fused_sqk_${g}`});this.gpu.device.queue.writeBuffer(y,0,new Uint8Array(k)),this._fusedSQKParams[g]=y}await this._quantizeLmHead()}async _quantizeLmHead(){const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight";if(!this.weights[e])return;const t=this.hiddenSize,i=this.vocabSize,s=this.groupSize,n=t/8,r=t/s,u=performance.now(),h=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,o=this.weights[e];this._lmHeadQWeight=this.gpu.createBuffer("lmhead_qweight",n*i*4,h);const 
m=this.gpu.createBuffer("lmhead_scales_f32",r*i*4,h),c=Math.ceil(r*i/2)*4;this._lmHeadScales=this.gpu.createBuffer("lmhead_scales",c,h);const f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await R(async()=>{const{SHADERS:y}=await import("./gpu-ops-Bq_PFJSE.js").then(C=>C.b);return{SHADERS:y}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),b=this.gpu.createBufferFromData("quant_params",new Uint32Array([t,i,s]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),v=this.gpu.createBindGroup(f,0,[o,this._lmHeadQWeight,m,b]),U=65535,w=Math.min(i,U),_=Math.ceil(i/U),p=this.gpu.device.createCommandEncoder(),l=p.beginComputePass();l.setPipeline(f),l.setBindGroup(0,v),l.dispatchWorkgroups(w,_),l.end(),this.gpu.device.queue.submit([p.finish()]);const g=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await R(async()=>{const{SHADERS:y}=await import("./gpu-ops-Bq_PFJSE.js").then(C=>C.b);return{SHADERS:y}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),S=Math.ceil(r*i/2),B=this.gpu.createBufferFromData("pack_params",new Uint32Array([S]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),d=this.gpu.createBindGroup(g,0,[m,this._lmHeadScales,B]),P=this.gpu.device.createCommandEncoder(),q=P.beginComputePass();q.setPipeline(g),q.setBindGroup(0,d),q.dispatchWorkgroups(Math.ceil(S/256)),q.end(),this.gpu.device.queue.submit([P.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),m.destroy(),b.destroy(),B.destroy();const k=(n*i*4/1e6).toFixed(0),$=(c/1e6).toFixed(0);console.log(`[QUANT] lm_head INT4 (GPU): ${(performance.now()-u).toFixed(0)}ms, ${k}MB qw + ${$}MB sc`)}async _quantizeBF16ToINT4(a,e,t,i,s){const n=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,r=e/8,u=e/i,h=this.gpu.createBuffer(`${s}_qweight`,r*t*4,n),o=this.gpu.createBuffer(`${s}_scales_f32`,u*t*4,n),m=Math.ceil(u*t/2)*4,c=this.gpu.createBuffer(`${s}_scales`,m,n),f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await R(async()=>{const{SHADERS:k}=await 
import("./gpu-ops-Bq_PFJSE.js").then($=>$.b);return{SHADERS:k}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),b=this.gpu.createBufferFromData(`${s}_qp`,new Uint32Array([e,t,i]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),v=this.gpu.createBindGroup(f,0,[a,h,o,b]),U=65535,w=Math.min(t,U),_=Math.ceil(t/U),p=this.gpu.device.createCommandEncoder(),l=p.beginComputePass();l.setPipeline(f),l.setBindGroup(0,v),l.dispatchWorkgroups(w,_),l.end(),this.gpu.device.queue.submit([p.finish()]);const g=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await R(async()=>{const{SHADERS:k}=await import("./gpu-ops-Bq_PFJSE.js").then($=>$.b);return{SHADERS:k}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),S=Math.ceil(u*t/2),B=this.gpu.createBufferFromData(`${s}_pp`,new Uint32Array([S]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),d=this.gpu.createBindGroup(g,0,[o,c,B]),P=this.gpu.device.createCommandEncoder(),q=P.beginComputePass();return q.setPipeline(g),q.setBindGroup(0,d),q.dispatchWorkgroups(Math.ceil(S/256)),q.end(),this.gpu.device.queue.submit([P.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),o.destroy(),b.destroy(),B.destroy(),{qweight:h,scales:c}}initBuffers(a){a||(a=this.textCfg.max_position_embeddings||4096),this.maxSeqLen=a;const 
e=this.gpu,t=this.hiddenSize,i=this.intermediateSize,s=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.hidden=e.createBuffer("hidden_a",t*4,s),this.hiddenB=e.createBuffer("hidden_b",t*4,s),this.normed=e.createBuffer("normed",t*4,s),this.normedB=e.createBuffer("normed_b",t*4,s),this.mlpIntermediate=e.createBuffer("mlp_inter",i*4,s),this.mlpOut=e.createBuffer("mlp_out",t*4,s),this.logits=e.createBuffer("logits",this.vocabSize*4,s),this.argmaxResult=e.createBuffer("argmax_result",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this._argmaxReadback=e.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"argmax_readback"}),this._topkResult=e.createBuffer("topk_result",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this._repMaxTokens=64,this._recentTokens=new Uint32Array(this._repMaxTokens),this._recentTokenCount=0,this._gpuRecentTokens=e.createBuffer("recent_tokens",this._repMaxTokens*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST);for(let w=0;w<this.numLayers;w++)this.layerTypes[w]==="full_attention"&&(this.kvCache[w]={keys:e.createBuffer(`kv_k_${w}`,a*this.numKVHeads*this.headDim*4),values:e.createBuffer(`kv_v_${w}`,a*this.numKVHeads*this.headDim*4)});this.qProjFull=e.createBuffer("q_proj_full",this.numHeads*this.headDim*2*4,s),this.qProj=e.createBuffer("q_proj_out",this.numHeads*this.headDim*4,s),this.qGate=e.createBuffer("q_gate",this.numHeads*this.headDim*4,s),this.kProj=e.createBuffer("k_proj_out",this.numKVHeads*this.headDim*4,s),this.vProj=e.createBuffer("v_proj_out",this.numKVHeads*this.headDim*4,s);const n=this.textCfg.linear_num_key_heads,r=this.textCfg.linear_key_head_dim,u=this.textCfg.linear_value_head_dim,h=this.textCfg.linear_num_value_heads??n,m=h/n*u,c=n*(r+r+m),f=h*u;this.linValueDim=f,this.linValueHeads=h,this.linQKV=e.createBuffer("lin_qkv",c*4,s),this.linZ=e.createBuffer("lin_z",f*4,s),this.linOut=e.createBuffer("lin_out",f*4,s);const 
b=Math.max(t,this.numHeads*this.headDim,f)*4;this.attnOut=e.createBuffer("attn_out",b,s),this._maxGqaSplits=64;const v=this.numHeads*this._maxGqaSplits*(this.headDim+2)*4;this._gqaPartials=e.createBuffer("gqa_partials",v,s),this.linState={},this.linConvHist={};for(let w=0;w<this.numLayers;w++)this.layerTypes[w]==="linear_attention"&&(this.linState[w]=e.createBuffer(`lin_state_${w}`,n*r*m*4,s),this.linConvHist[w]=e.createBuffer(`lin_conv_hist_${w}`,3*c*4,s));this.zeroBuf=e.createBuffer("zero_buf",t*4,s),this.useSplitK=!1,this.splitKSplits=1;const U=Math.max(t,i);this.splitKPartials=e.createBuffer("splitk_partials",U*this.splitKSplits*4,s),this.paramBufs={},this._gqaParamBuf=e.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"gqa_params"}),this._kvStoreParamBuf=e.device.createBuffer({size:16,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"kv_store_params"}),this._gqaData=new Uint8Array(32),this._gqaDv=new DataView(this._gqaData.buffer),this._kvData=new Uint8Array(16),this._kvDv=new DataView(this._kvData.buffer)}initB2Buffers(){const 
a=this.gpu,e=this.hiddenSize,t=this.intermediateSize,i=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.b2={},this.b2.hidden=a.createBuffer("b2_hidden_a",2*e*4,i),this.b2.hiddenB=a.createBuffer("b2_hidden_b",2*e*4,i),this.b2.normed=a.createBuffer("b2_normed",2*e*4,i),this.b2.mlpIntermediate=a.createBuffer("b2_mlp_inter",2*t*4,i),this.b2.mlpOut=a.createBuffer("b2_mlp_out",2*e*4,i),this.b2.logits=a.createBuffer("b2_logits",2*this.vocabSize*4,i),this.b2.zeroBuf=a.createBuffer("b2_zero_buf",2*e*4,i),this.b2.qProjFull=a.createBuffer("b2_q_proj_full",2*this.numHeads*this.headDim*2*4,i),this.b2.qProj=a.createBuffer("b2_q_proj_out",2*this.numHeads*this.headDim*4,i),this.b2.qGate=a.createBuffer("b2_q_gate",2*this.numHeads*this.headDim*4,i),this.b2.kProj=a.createBuffer("b2_k_proj_out",2*this.numKVHeads*this.headDim*4,i),this.b2.vProj=a.createBuffer("b2_v_proj_out",2*this.numKVHeads*this.headDim*4,i);const s=this.textCfg.linear_num_key_heads,n=this.textCfg.linear_key_head_dim,r=this.textCfg.linear_value_head_dim,o=this.linValueHeads/s*r,m=s*(n+n+o),c=this.linValueDim;this.b2.linQKV=a.createBuffer("b2_lin_qkv",2*m*4,i),this.b2.linZ=a.createBuffer("b2_lin_z",2*c*4,i),this.b2.linOut=a.createBuffer("b2_lin_out",2*c*4,i);const f=Math.max(e,this.numHeads*this.headDim,c)*4;if(this.b2.attnOut=a.createBuffer("b2_attn_out",2*f,i),this.abQuantized){const 
b=this.textCfg.linear_num_value_heads??s;this.b2.linAlpha=a.createBuffer("b2_lin_alpha",2*b*4,i),this.b2.linBeta=a.createBuffer("b2_lin_beta",2*b*4,i)}this.b2.topkResult0=a.createBuffer("b2_topk0",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this.b2.topkResult1=a.createBuffer("b2_topk1",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this.b2.topkReadback0=a.device.createBuffer({size:2048,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_topk_rb0"}),this.b2.topkReadback1=a.device.createBuffer({size:2048,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_topk_rb1"}),this.b2.argmaxResult0=a.createBuffer("b2_argmax0",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this.b2.argmaxResult1=a.createBuffer("b2_argmax1",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this.b2.argmaxReadback0=a.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_argmax_rb0"}),this.b2.argmaxReadback1=a.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_argmax_rb1"}),this.b2._gqaParamBuf0=a.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"b2_gqa_params0"}),this.b2._gqaParamBuf1=a.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"b2_gqa_params1"}),this.b2._dims={H:e,I:t,linQKVDim:m,valueDim:c,linHeads:s,linKeyDim:n,linValDim:r,linEVD:o,qProjSize:this.numHeads*this.headDim,qProjFullSize:this.numHeads*this.headDim*2,kProjSize:this.numKVHeads*this.headDim,vProjSize:this.numKVHeads*this.headDim,attnOutElems:f/4},console.log(`[B2] Allocated batch=2 buffers (${((2*(e+e+e+t+e+this.vocabSize)*4+2*f)/1e6).toFixed(1)} MB activations)`)}runWithOffsets(a,e,t,i=1){const s=this.pipelines[a],n=this.gpu.createBindGroupWithOffsets(s,0,e);this.gpu.dispatch(s,[n],t,i)}makeUniform(a,e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),i=new 
DataView(t);for(let r=0;r<e.length;r++)Number.isInteger(e[r])?i.setUint32(r*4,e[r],!0):i.setFloat32(r*4,e[r],!0);const s=a+"_"+e.join("_");if(this.paramBufs[s])return this.paramBufs[s];const n=this.gpu.device.createBuffer({size:t.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a});return this.gpu.device.queue.writeBuffer(n,0,new Uint8Array(t)),this.paramBufs[s]=n,n}_makeMixedUniform(a,e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),i=new DataView(t);for(let n=0;n<e.length;n++){const r=e[n];r.u!==void 0?i.setUint32(n*4,r.u,!0):i.setFloat32(n*4,r.f,!0)}this._mixedUniformBufs||(this._mixedUniformBufs={});let s=this._mixedUniformBufs[a];return s||(s=this.gpu.device.createBuffer({size:t.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a}),this._mixedUniformBufs[a]=s),this.gpu.device.queue.writeBuffer(s,0,new Uint8Array(t)),s}makeUniformTyped(a,e,t){const i=e.length+t.length,s=new ArrayBuffer(Math.max(16,Math.ceil(i*4/16)*16)),n=new DataView(s);let r=0;for(const o of e)n.setUint32(r,o,!0),r+=4;for(const o of t)n.setFloat32(r,o,!0),r+=4;const u=a+"_"+[...e,...t].join("_");if(this.paramBufs[u])return this.paramBufs[u];const h=this.gpu.device.createBuffer({size:s.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a});return this.gpu.device.queue.writeBuffer(h,0,new Uint8Array(s)),this.paramBufs[u]=h,h}makeFusedNormMLPUniform(a){const e=`fused_anorm_mlp_${a}`;if(this.paramBufs[e])return this.paramBufs[e];const t=this.hiddenSize,i=this.intermediateSize,s=16+640*16,n=new ArrayBuffer(s),r=new DataView(n);r.setUint32(0,t,!0),r.setUint32(4,i,!0),r.setUint32(8,this.groupSize,!0),r.setFloat32(12,this.rmsEps,!0);const u=`model.language_model.layers.${a}.post_attention_layernorm.weight`,h=this._normWeightRaw?.[u];if(!h)throw new Error(`Norm weight not cached for layer ${a}`);for(let m=0;m<t/2;m++){const c=Math.floor(m/4),f=m%4;r.setUint32(16+c*16+f*4,h[m],!0)}const 
o=this.gpu.device.createBuffer({size:s,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:e});return this.gpu.device.queue.writeBuffer(o,0,new Uint8Array(n)),this.paramBufs[e]=o,o}fusedNormGptqOp(a,e,t,i,s,n,r,u){const h=this.getQWeight(i);if(!h.qweight)return null;const o=this.makeFusedNormGPTQUniform(s,r),m=u?"fused_norm_gptq":"fused_norm_gptq_noadd",c=u?"fnorm_gptq_":"fnorm_gptqna_",f=u?[a,e,h.qweight,h.scales,t,o]:[a,h.qweight,h.scales,t,o];return this.prepOpCached(`${c}${i}`,m,f,this.wg4(r))}makeFusedNormGPTQUniform(a,e){const t=`fused_norm_gptq_${a}_${e}`;if(this.paramBufs[t])return this.paramBufs[t];const i=this.hiddenSize,s=16+512*16,n=new ArrayBuffer(s),r=new DataView(n);r.setUint32(0,i,!0),r.setUint32(4,e,!0),r.setUint32(8,this.groupSize,!0),r.setFloat32(12,this.rmsEps,!0);const u=this._normWeightRaw?.[a];if(!u)throw new Error(`Norm weight not cached: ${a}`);for(let o=0;o<i/2;o++){const m=Math.floor(o/4),c=o%4;r.setUint32(16+m*16+c*4,u[o],!0)}const h=this.gpu.device.createBuffer({size:s,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:t});return this.gpu.device.queue.writeBuffer(h,0,new Uint8Array(n)),this.paramBufs[t]=h,h}run(a,e,t,i=1){const s=this.pipelines[a],n=this.gpu.createBindGroup(s,0,e);this.gpu.dispatch(s,[n],t,i)}runCached(a,e,t,i,s=1){const n=this.pipelines[e],r=this.gpu.getCachedBindGroup(a,n,0,t);this.gpu.dispatch(n,[r],i,s)}prepOp(a,e,t,i=1){const s=this.pipelines[a],n=this.gpu.createBindGroup(s,0,e);return{pipeline:s,bindGroups:[n],workgroupsX:t,workgroupsY:i}}prepOpCached(a,e,t,i,s=1){const n=this.pipelines[e],r=this.gpu.getCachedBindGroup(a,n,0,t);return{pipeline:n,bindGroups:[r],workgroupsX:i,workgroupsY:s}}wg(a){return Math.ceil(a/256)}wg4(a){return Math.ceil(a/32)}wg8(a){return Math.ceil(a/8)}getQWeight(a){return{qweight:this.weights[a+".qweight"],scales:this.weights[a+".scales"],qzeros:this.weights[a+".qzeros"]}}embedding(a){const 
e=this.weights["model.language_model.embed_tokens.weight"],t=this.makeUniform("emb_params",[a,this.hiddenSize]);this.run("embedding",[e,this.hidden,t],this.wg(this.hiddenSize))}embeddingFromArgmax(){const a=this.weights["model.language_model.embed_tokens.weight"];this._embedArgmaxParams||(this._embedArgmaxParams=this.makeUniform("emb_argmax_params",[this.hiddenSize])),this.runCached("embed_from_argmax","embed_from_argmax",[a,this.hidden,this.argmaxResult,this._embedArgmaxParams],this.wg(this.hiddenSize))}_dispatchLmHead(){if(this._lmHeadQWeight){const a=this.hiddenSize,e=this.vocabSize,i=a/this.groupSize%4===0,s=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]);if(i){const n=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached("lm_head",n,[this.normed,this._lmHeadQWeight,this._lmHeadScales,this.logits,s],this.wg8(e))}else{const n=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached("lm_head",n,[this.normed,this._lmHeadQWeight,this._lmHeadScales,this.logits,s],this.wg4(e))}}else{const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",t=this.weights[e],i=this.makeUniform("lmhead_params",[this.hiddenSize,this.vocabSize]);this.runCached("lm_head","bf16_matvec",[this.normed,t,this.logits,i],this.wg(this.vocabSize))}}rmsNorm(a,e,t,i){const s=this.makeUniform("rmsnorm_params",[this.hiddenSize,this.rmsEps]);i?this.runCached(i,"rmsnorm",[a,t,e,s],1):this.run("rmsnorm",[a,t,e,s],1)}gptqMatvec(a,e,t,i,s){const n=this.getQWeight(t);if(!n.qweight)return;const r=i/this.groupSize,u=r%4===0;if(this.useSplitK&&u){let h=this.splitKSplits;for(;h>1&&r%(h*4)!==0;)h>>=1;if(h>1){const o=this.makeUniform(`mv_sk_${i}_${s}_${h}`,[i,s,this.groupSize,h]);this.runCached(`sk4tf16_${t}`,"gptq_matvec_4t_f16_sk",[a,n.qweight,n.scales,this.splitKPartials,o],this.wg8(s),h);const 
m=this.makeUniform(`rsk_${s}_${h}`,[s,h]);this.runCached(`rsk_${t}`,"reduce_splitk",[this.splitKPartials,e,m],this.wg(s));return}}if(u){const h=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t",o=this.gpu.hasF16?"gptq4tf16_":"gptq4t_",m=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);this.runCached(`${o}${t}`,h,[a,n.qweight,n.scales,e,m],this.wg8(s))}else{const h=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec",o=this.gpu.hasF16?"gptqf16_":"gptq_",m=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);this.runCached(`${o}${t}`,h,[a,n.qweight,n.scales,e,m],this.wg4(s))}}gptqMatvecOp(a,e,t,i,s){const n=this.getQWeight(t);if(!n.qweight)return null;if(i/this.groupSize%4===0){const h=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t",o=this.gpu.hasF16?"gptq4tf16_":"gptq4t_",m=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);return this.prepOpCached(`${o}${t}`,h,[a,n.qweight,n.scales,e,m],this.wg8(s))}else{const h=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec",o=this.gpu.hasF16?"gptqf16_":"gptq_",m=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);return this.prepOpCached(`${o}${t}`,h,[a,n.qweight,n.scales,e,m],this.wg4(s))}}bf16Matvec(a,e,t,i,s,n){const r=this.makeUniform(`bf16mv_${i}_${s}`,[i,s]);n?this.runCached(n,"bf16_matvec",[a,t,e,r],this.wg(s)):this.run("bf16_matvec",[a,t,e,r],this.wg(s))}siluMul(a,e,t,i,s){const n=this.makeUniform(`silu_${i}`,[i]);s?this.runCached(s,"silu_mul",[a,e,t,n],this.wg(i)):this.run("silu_mul",[a,e,t,n],this.wg(i))}addVectors(a,e,t,i){const s=this.makeUniform(`add_${t}`,[t]);i?this.runCached(i,"add",[a,e,s],this.wg(t)):this.run("add",[a,e,s],this.wg(t))}addAndRmsNorm(a,e,t,i,s){const n=this.makeUniform("add_rmsnorm_params",[this.hiddenSize,this.rmsEps]);s?this.runCached(s,"add_rmsnorm",[a,e,i,t,n],1):this.run("add_rmsnorm",[a,e,i,t,n],1)}mlp(a,e){const 
t=`model.language_model.layers.${e}.mlp`,i=this.hiddenSize,s=this.intermediateSize,n=this.getQWeight(`${t}.gate_proj`),r=this.getQWeight(`${t}.up_proj`),u=this.makeUniform(`fused_mv_${i}_${s}`,[i,s,this.groupSize]);if(i/this.groupSize%4===0){const m=this.gpu.hasF16?"fused_gate_up_silu_4t_f16":"fused_gate_up_silu_4t",c=this.gpu.hasF16?"fused_gus4t16_":"fused_gus4t_";this.runCached(`${c}${e}`,m,[a,n.qweight,n.scales,r.qweight,r.scales,this.mlpIntermediate,u],this.wg8(s))}else{const m=this.gpu.hasF16?"fused_gate_up_silu_f16":"fused_gate_up_silu",c=this.gpu.hasF16?"fused_gus16_":"fused_gus_";this.runCached(`${c}${e}`,m,[a,n.qweight,n.scales,r.qweight,r.scales,this.mlpIntermediate,u],this.wg4(s))}this.gptqMatvec(this.mlpIntermediate,this.mlpOut,`${t}.down_proj`,s,i)}fullAttentionFused(a,e,t,i,s){s=s||this.normed;const n=`model.language_model.layers.${t}.self_attn`,r=this.hiddenSize,u=this.headDim,h=this.numHeads,o=this.numKVHeads,m=h/o,c=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,b=this.weights[f];if(c){const y=this.makeUniform("add_rmsnorm_params",[r,this.rmsEps]);this.runCached(`fa_norm_${t}`,"rmsnorm",[a,b,s,y],1)}else{const y=this.makeUniform("add_rmsnorm_params",[r,this.rmsEps]);this.runCached(`fa_norm_${t}`,"add_rmsnorm_ro",[a,e,b,s,y],1)}const v=this.gptqMatvecOp(s,this.qProjFull,`${n}.q_proj`,r,h*u*2),U=this.gptqMatvecOp(s,this.kProj,`${n}.k_proj`,r,o*u),w=this.gptqMatvecOp(s,this.vProj,`${n}.v_proj`,r,o*u);this.gpu.dispatchMulti([v,U,w].filter(Boolean));const _=this.kvCache[t],p=this._fusedSQKParams[t];this._gqaDv.setUint32(0,i,!0),this.gpu.device.queue.writeBuffer(p,16,this._gqaData,0,4),this.runCached(`fa_sqk_${t}`,this._splitQKNormShaderKey,[this.qProjFull,this.kProj,this.vProj,this.qProj,this.qGate,_.keys,_.values,p],h+o);const 
l=i+1,g=this._forceMinSplits||1,S=Math.max(g,Math.min(Math.max(1,Math.ceil(l/32)),this._maxGqaSplits)),B=S>1?this._gqaPartials:this.attnOut;if(this._gqaDv.setUint32(0,l,!0),this._gqaDv.setUint32(4,u,!0),this._gqaDv.setUint32(8,o,!0),this._gqaDv.setUint32(12,h,!0),this._gqaDv.setUint32(16,m,!0),this._gqaDv.setUint32(20,S,!0),this.gpu.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData),this.run("gqa_attention_head",[this.qProj,_.keys,_.values,B,this._gqaParamBuf],h,S),S>1){this._gqaReduceParams||(this._gqaReduceParams=this.gpu.device.createBuffer({size:16,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"gqa_reduce_params"}));const y=new Uint8Array(16),C=new DataView(y.buffer);C.setUint32(0,u,!0),C.setUint32(4,S,!0),C.setUint32(8,h,!0),this.gpu.device.queue.writeBuffer(this._gqaReduceParams,0,y),this.run("gqa_reduce",[this._gqaPartials,this.attnOut,this._gqaReduceParams],h)}const d=this.getQWeight(`${n}.o_proj`),P=h*u,k=P/this.groupSize%4===0,$=this.makeUniform(`fused_sig_mv_${P}_${r}`,[P,r,this.groupSize]);if(k){const y=this.gpu.hasF16?"fused_sigmoid_gptq_4t_f16":"fused_sigmoid_gptq_4t",C=this.gpu.hasF16?"fused_sig4t16_oproj_":"fused_sig4t_oproj_";this.runCached(`${C}${t}`,y,[this.attnOut,this.qGate,d.qweight,d.scales,this.qProj,$],this.wg8(r))}else{const y=this.gpu.hasF16?"fused_sigmoid_gptq_f16":"fused_sigmoid_gptq",C=this.gpu.hasF16?"fused_sig16_oproj_":"fused_sig_oproj_";this.runCached(`${C}${t}`,y,[this.attnOut,this.qGate,d.qweight,d.scales,this.qProj,$],this.wg4(r))}}linearAttentionFused(a,e,t,i){i=i||this.normed;const s=`model.language_model.layers.${t}.linear_attn`,n=this.hiddenSize,r=this.textCfg.linear_num_key_heads,u=this.textCfg.linear_key_head_dim,h=this.textCfg.linear_value_head_dim,o=this.linValueHeads,c=o/r*h,f=r*(u+u+c),b=this.linValueDim,v=t===0,U=`model.language_model.layers.${t}.input_layernorm.weight`,w=this.weights[U];if(v){const 
q=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.runCached(`la_norm_${t}`,"rmsnorm",[a,w,i,q],1)}else{const q=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.runCached(`la_norm_${t}`,"add_rmsnorm_ro",[a,e,w,i,q],1)}{const q=[this.gptqMatvecOp(i,this.linQKV,`${s}.in_proj_qkv`,n,f),this.gptqMatvecOp(i,this.linZ,`${s}.in_proj_z`,n,b)];this.abQuantized&&(q.push(this.gptqMatvecOp(i,this.linAlpha,`${s}.in_proj_a`,n,o)),q.push(this.gptqMatvecOp(i,this.linBeta,`${s}.in_proj_b`,n,o))),this.gpu.dispatchMulti(q.filter(Boolean))}const _=this.weights[`${s}.conv1d.weight`],p=this.weights[`${s}.A_log`],l=this.weights[`${s}.dt_bias`],g=this.weights[`${s}.norm.weight`];if(this.abQuantized){const q=`fused_cdn_q_${r}_${u}_${h}_${f}_${o}`;let k=this.paramBufs[q];if(!k){const $=new ArrayBuffer(32),y=new DataView($);y.setUint32(0,r,!0),y.setUint32(4,u,!0),y.setUint32(8,h,!0),y.setUint32(12,f,!0),y.setFloat32(16,this.rmsEps,!0),y.setUint32(20,0,!0),y.setUint32(24,o,!0),k=this.gpu.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"fused_cdn_q_params"}),this.gpu.device.queue.writeBuffer(k,0,new Uint8Array($)),this.paramBufs[q]=k}this.runCached(`fused_cdn_${t}`,"fused_conv_deltanet_norm",[this.linQKV,this.linConvHist[t],_,this.linState[t],this.linOut,this.linAlpha,this.linBeta,p,l,g,k],r)}else{const q=`fused_cdn_ext_${r}_${u}_${h}_${f}_${n}_${o}`;let k=this.paramBufs[q];if(!k){const y=new ArrayBuffer(32),C=new DataView(y);C.setUint32(0,r,!0),C.setUint32(4,u,!0),C.setUint32(8,h,!0),C.setUint32(12,f,!0),C.setFloat32(16,this.rmsEps,!0),C.setUint32(20,n,!0),C.setUint32(24,o,!0),k=this.gpu.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"fused_cdn_ext_params"}),this.gpu.device.queue.writeBuffer(k,0,new Uint8Array(y)),this.paramBufs[q]=k}const 
$=this.linABWeight[t];this.runCached(`fused_cdn_${t}`,"fused_conv_deltanet_norm",[this.linQKV,this.linConvHist[t],_,this.linState[t],this.linOut,i,$,p,l,g,k],r)}const S=this.getQWeight(`${s}.out_proj`),d=b/this.groupSize%4===0,P=this.makeUniform(`fused_silu_mv_${b}_${n}`,[b,n,this.groupSize]);if(d){const q=this.gpu.hasF16?"fused_silu_gptq_4t_f16":"fused_silu_gptq_4t",k=this.gpu.hasF16?"fused_silu4t16_oproj_":"fused_silu4t_oproj_";this.runCached(`${k}${t}`,q,[this.linZ,this.linOut,S.qweight,S.scales,this.attnOut,P],this.wg8(n))}else{const q=this.gpu.hasF16?"fused_silu_gptq_f16":"fused_silu_gptq",k=this.gpu.hasF16?"fused_silu16_oproj_":"fused_silu_oproj_";this.runCached(`${k}${t}`,q,[this.linZ,this.linOut,S.qweight,S.scales,this.attnOut,P],this.wg4(n))}}fusedNormMLP(a,e,t,i,s,n){n=n||this.normed;const r=this.hiddenSize,u=`model.language_model.layers.${s}.post_attention_layernorm.weight`,h=this.weights[u],o=this.makeUniform("add_rmsnorm_params",[r,this.rmsEps]),m=`mlp_norm_${s}_${a===this.hidden?"a":"b"}`;this.runCached(m,"three_way_add_rmsnorm",[a,t,i,h,e,n,o],1),this.mlp(n,s)}decoderLayer(a,e,t,i,s){let n;this.layerTypes[a]==="full_attention"?(this.fullAttentionFused(t,s,a,e),n=this.qProj):(this.linearAttentionFused(t,s,a),n=this.attnOut),this.fusedNormMLP(t,i,s,n,a)}async forward(a,e){this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);let t=this.hidden,i=this.hiddenB,s=this.zeroBuf;for(let h=0;h<this.numLayers;h++){this.decoderLayer(h,e,t,i,s),s=this.mlpOut;const o=t;t=i,i=o}this.mtp&&this._mtpSaveHidden(t,this.mlpOut);const n=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(t,this.mlpOut,this.normed,n,"add_final_norm"),this._dispatchLmHead();const r=this.temperature??.7;if(r>0){const h=this.repetitionPenalty??1,o=this.presencePenalty??0;if(this._recentTokenCount>0&&(h>1||o>0)){this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const 
b=this._makeMixedUniform("penalty_params",[{u:this._recentTokenCount},{f:h},{f:o},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,b],Math.ceil(this._recentTokenCount/256))}const m=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,m],1);const c=Math.random()*4294967295>>>0,f=this._makeMixedUniform("sample_params",[{f:r},{u:this.topK??20},{f:this.topP??.8},{u:c}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,f],1)}else{const h=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,h],1)}return this.gpu._passCount!==void 0&&!this._passCountLogged&&(console.log(`[PERF] ${this.gpu._passCount} compute passes per token`),this._passCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch(),this._readAndSample()}forwardSubmit(a,e){this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);let t=this.hidden,i=this.hiddenB,s=this.zeroBuf;for(let h=0;h<this.numLayers;h++){this.decoderLayer(h,e,t,i,s),s=this.mlpOut;const o=t;t=i,i=o}this.mtp&&this._mtpSaveHidden(t,this.mlpOut);const n=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(t,this.mlpOut,this.normed,n,"add_final_norm"),this._dispatchLmHead();const r=this.temperature??.7;if(r>0){const h=this.repetitionPenalty??1,o=this.presencePenalty??0;if(this._recentTokenCount>0&&(h>1||o>0)){this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const b=this._makeMixedUniform("penalty_params",[{u:this._recentTokenCount},{f:h},{f:o},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,b],Math.ceil(this._recentTokenCount/256))}const 
m=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,m],1);const c=Math.random()*4294967295>>>0,f=this._makeMixedUniform("sample_params",[{f:r},{u:this.topK??20},{f:this.topP??.8},{u:c}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,f],1)}else{const h=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,h],1)}this.gpu._passCount!==void 0&&!this._passCountLogged&&(console.log(`[PERF] ${this.gpu._passCount} compute passes per token`),this._passCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch()}_buildReplayFromRecording(){const a=this.gpu.stopRecording(),e=this.pipelines.gqa_attention_head,t=[];for(const i of a)if(i.multi)for(const s of i.ops)t.push({p:s.pipeline,bg:s.bindGroup,x:s.wgX,y:s.wgY,gqa:s.pipeline===e?1:0});else t.push({p:i.pipeline,bg:i.bindGroup,x:i.wgX,y:i.wgY,gqa:i.pipeline===e?1:0});this._replayFlat=t,console.log(`[PERF] Recorded ${t.length} flat replay ops for core forward`)}_replayCoreForward(a){const e=this.gpu,t=a+1,i=Math.min(Math.max(1,Math.ceil(t/32)),this._maxGqaSplits);this._gqaDv.setUint32(0,a,!0);for(let 
u=0;u<this.numLayers;u++)this.layerTypes[u]==="full_attention"&&e.device.queue.writeBuffer(this._fusedSQKParams[u],16,this._gqaData,0,4);this._gqaDv.setUint32(0,t,!0),this._gqaDv.setUint32(4,this.headDim,!0),this._gqaDv.setUint32(8,this.numKVHeads,!0),this._gqaDv.setUint32(12,this.numHeads,!0),this._gqaDv.setUint32(16,this.numHeads/this.numKVHeads,!0),this._gqaDv.setUint32(20,i,!0),e.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData,0,24),this._gqaReduceParams&&(this._gqaDv.setUint32(0,this.headDim,!0),this._gqaDv.setUint32(4,i,!0),this._gqaDv.setUint32(8,this.numHeads,!0),this._gqaDv.setUint32(12,0,!0),e.device.queue.writeBuffer(this._gqaReduceParams,0,this._gqaData,0,16));const s=e._singlePass,n=this._replayFlat,r=n.length;for(let u=0;u<r;u++){const h=n[u];s.setPipeline(h.p),s.setBindGroup(0,h.bg),s.dispatchWorkgroups(h.x,h.gqa?i:h.y)}}embeddingB2(a,e){const t=this.hiddenSize,i=this.weights["model.language_model.embed_tokens.weight"],s=this.makeUniform("emb_params",[a,t]);this.runWithOffsets("embedding",[i,{buffer:this.b2.hidden,offset:0,size:t*4},s],this.wg(t));const n=this.makeUniform(`emb_params_${e}`,[e,t]);this.runWithOffsets("embedding",[i,{buffer:this.b2.hidden,offset:t*4,size:t*4},n],this.wg(t))}gptqMatvecB2(a,e,t,i,s){const n=this.getQWeight(t);if(!n.qweight)return;const r=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]),u=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2",h=this.gpu.hasF16?"b2gptqf16_":"b2gptq_";this.runCached(`${h}${t}`,u,[a,n.qweight,n.scales,e,r],this.wg4(s))}gptqMatvecB2Op(a,e,t,i,s){const n=this.getQWeight(t);if(!n.qweight)return null;const r=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]),u=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2",h=this.gpu.hasF16?"b2gptqf16_":"b2gptq_";return this.prepOpCached(`${h}${t}`,u,[a,n.qweight,n.scales,e,r],this.wg4(s))}fullAttentionB2(a,e,t,i){const 
s=`model.language_model.layers.${t}.self_attn`,n=this.hiddenSize,r=this.b2._dims,u=this.headDim,h=this.numHeads,o=this.numKVHeads,m=h/o,c=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,b=this.weights[f],v=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);c?(this.runWithOffsets("rmsnorm",[{buffer:a,offset:0,size:n*4},b,{buffer:this.b2.normed,offset:0,size:n*4},v],1),this.runWithOffsets("rmsnorm",[{buffer:a,offset:n*4,size:n*4},b,{buffer:this.b2.normed,offset:n*4,size:n*4},v],1)):this.run("add_rmsnorm_ro_b2",[a,e,b,this.b2.normed,v],2);const U=this.gptqMatvecB2Op(this.b2.normed,this.b2.qProjFull,`${s}.q_proj`,n,h*u*2),w=this.gptqMatvecB2Op(this.b2.normed,this.b2.kProj,`${s}.k_proj`,n,o*u),_=this.gptqMatvecB2Op(this.b2.normed,this.b2.vProj,`${s}.v_proj`,n,o*u);this.gpu.dispatchMulti([U,w,_].filter(Boolean));const p=this.kvCache[t],l=this._fusedSQKParams[t],g=r.qProjFullSize*4,S=r.kProjSize*4,B=r.vProjSize*4,d=r.qProjSize*4;this._gqaDv.setUint32(0,i,!0),this.gpu.device.queue.writeBuffer(l,16,this._gqaData,0,4),this.runWithOffsets(this._splitQKNormShaderKey,[{buffer:this.b2.qProjFull,offset:0,size:g},{buffer:this.b2.kProj,offset:0,size:S},{buffer:this.b2.vProj,offset:0,size:B},{buffer:this.b2.qProj,offset:0,size:d},{buffer:this.b2.qGate,offset:0,size:d},p.keys,p.values,l],h+o),this._gqaDv.setUint32(0,i+1,!0),this.gpu.device.queue.writeBuffer(l,16,this._gqaData,0,4),this.runWithOffsets(this._splitQKNormShaderKey,[{buffer:this.b2.qProjFull,offset:g,size:g},{buffer:this.b2.kProj,offset:S,size:S},{buffer:this.b2.vProj,offset:B,size:B},{buffer:this.b2.qProj,offset:d,size:d},{buffer:this.b2.qGate,offset:d,size:d},p.keys,p.values,l],h+o);const 
P=i+1,q=i+2;this._gqaDv.setUint32(0,P,!0),this._gqaDv.setUint32(4,u,!0),this._gqaDv.setUint32(8,o,!0),this._gqaDv.setUint32(12,h,!0),this._gqaDv.setUint32(16,m,!0),this._gqaDv.setUint32(20,this.partialDim,!0),this._gqaDv.setUint32(24,i,!0),this._gqaDv.setFloat32(28,this.ropeTheta,!0),this.gpu.device.queue.writeBuffer(this.b2._gqaParamBuf0,0,this._gqaData),this.runWithOffsets("gqa_attention_head",[{buffer:this.b2.qProj,offset:0,size:d},p.keys,p.values,{buffer:this.b2.attnOut,offset:0,size:r.qProjSize*4},this.b2._gqaParamBuf0],h),this._gqaDv.setUint32(0,q,!0),this._gqaDv.setUint32(24,i+1,!0),this.gpu.device.queue.writeBuffer(this.b2._gqaParamBuf1,0,this._gqaData),this.runWithOffsets("gqa_attention_head",[{buffer:this.b2.qProj,offset:d,size:d},p.keys,p.values,{buffer:this.b2.attnOut,offset:r.qProjSize*4,size:r.qProjSize*4},this.b2._gqaParamBuf1],h);const k=this.getQWeight(`${s}.o_proj`),$=h*u,y=this.makeUniform(`fused_sig_mv_${$}_${n}`,[$,n,this.groupSize]);this.runCached(`b2_fused_sig_oproj_${t}`,"fused_sigmoid_gptq_b2_f16",[this.b2.attnOut,this.b2.qGate,k.qweight,k.scales,this.b2.qProj,y],this.wg4(n))}linearAttentionB2(a,e,t){const i=`model.language_model.layers.${t}.linear_attn`,s=this.hiddenSize,n=this.b2._dims,r=n.linHeads,u=n.linKeyDim,h=n.linValDim;n.linEVD;const o=n.linQKVDim,m=n.valueDim,c=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,b=this.weights[f],v=this.makeUniform("add_rmsnorm_params",[s,this.rmsEps]);c?(this.runWithOffsets("rmsnorm",[{buffer:a,offset:0,size:s*4},b,{buffer:this.b2.normed,offset:0,size:s*4},v],1),this.runWithOffsets("rmsnorm",[{buffer:a,offset:s*4,size:s*4},b,{buffer:this.b2.normed,offset:s*4,size:s*4},v],1)):this.run("add_rmsnorm_ro_b2",[a,e,b,this.b2.normed,v],2);{const 
d=[this.gptqMatvecB2Op(this.b2.normed,this.b2.linQKV,`${i}.in_proj_qkv`,s,o),this.gptqMatvecB2Op(this.b2.normed,this.b2.linZ,`${i}.in_proj_z`,s,m)];this.abQuantized&&(d.push(this.gptqMatvecB2Op(this.b2.normed,this.b2.linAlpha,`${i}.in_proj_a`,s,this.linValueHeads)),d.push(this.gptqMatvecB2Op(this.b2.normed,this.b2.linBeta,`${i}.in_proj_b`,s,this.linValueHeads))),this.gpu.dispatchMulti(d.filter(Boolean))}const U=this.weights[`${i}.conv1d.weight`],w=this.weights[`${i}.A_log`],_=this.weights[`${i}.dt_bias`],p=this.weights[`${i}.norm.weight`],l=o*4,g=m*4;if(this.abQuantized){const d=this.linValueHeads,P=d*4,q=`fused_cdn_q_${r}_${u}_${h}_${o}_${d}`,k=this.paramBufs[q];this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:0,size:l},this.linConvHist[t],U,this.linState[t],{buffer:this.b2.linOut,offset:0,size:g},{buffer:this.b2.linAlpha,offset:0,size:P},{buffer:this.b2.linBeta,offset:0,size:P},w,_,p,k],r),this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:l,size:l},this.linConvHist[t],U,this.linState[t],{buffer:this.b2.linOut,offset:g,size:g},{buffer:this.b2.linAlpha,offset:P,size:P},{buffer:this.b2.linBeta,offset:P,size:P},w,_,p,k],r)}else{const d=`fused_cdn_ext_${r}_${u}_${h}_${o}_${s}_${this.linValueHeads}`,P=this.paramBufs[d],q=this.linABWeight[t];this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:0,size:l},this.linConvHist[t],U,this.linState[t],{buffer:this.b2.linOut,offset:0,size:g},{buffer:this.b2.normed,offset:0,size:s*4},q,w,_,p,P],r),this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:l,size:l},this.linConvHist[t],U,this.linState[t],{buffer:this.b2.linOut,offset:g,size:g},{buffer:this.b2.normed,offset:s*4,size:s*4},q,w,_,p,P],r)}const 
S=this.getQWeight(`${i}.out_proj`),B=this.makeUniform(`fused_silu_mv_${m}_${s}`,[m,s,this.groupSize]);this.runCached(`b2_fused_silu_oproj_${t}`,"fused_silu_gptq_b2_f16",[this.b2.linZ,this.b2.linOut,S.qweight,S.scales,this.b2.attnOut,B],this.wg4(s))}fusedNormMLPB2(a,e,t,i,s){const n=this.hiddenSize,r=this.intermediateSize,u=`model.language_model.layers.${s}.mlp`,h=`model.language_model.layers.${s}.post_attention_layernorm.weight`,o=this.weights[h],m=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.run("three_way_add_rmsnorm_b2",[a,t,i,o,e,this.b2.normed,m],2);const c=this.getQWeight(`${u}.gate_proj`),f=this.getQWeight(`${u}.up_proj`),b=this.makeUniform(`fused_mv_${n}_${r}`,[n,r,this.groupSize]);this.runCached(`b2_fused_gus_${s}`,"fused_gate_up_silu_b2_f16",[this.b2.normed,c.qweight,c.scales,f.qweight,f.scales,this.b2.mlpIntermediate,b],this.wg4(r)),this.gptqMatvecB2(this.b2.mlpIntermediate,this.b2.mlpOut,`${u}.down_proj`,r,n)}decoderLayerB2(a,e,t,i,s){let n;this.layerTypes[a]==="full_attention"?(this.fullAttentionB2(t,s,a,e),n=this.b2.qProj):(this.linearAttentionB2(t,s,a),n=this.b2.attnOut),this.fusedNormMLPB2(t,i,s,n,a)}_dispatchLmHeadB2(){if(this._lmHeadQWeight){const a=this.hiddenSize,e=this.vocabSize,t=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]),i=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2";this.runCached("b2_lm_head",i,[this.b2.normed,this._lmHeadQWeight,this._lmHeadScales,this.b2.logits,t],this.wg4(e))}else{const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",t=this.weights[e],i=this.hiddenSize,s=this.vocabSize,n=this.makeUniform("lmhead_params",[i,s]);this.runWithOffsets("bf16_matvec",[{buffer:this.b2.normed,offset:0,size:i*4},t,{buffer:this.b2.logits,offset:0,size:s*4},n],this.wg(s)),this.runWithOffsets("bf16_matvec",[{buffer:this.b2.normed,offset:i*4,size:i*4},t,{buffer:this.b2.logits,offset:s*4,size:s*4},n],this.wg(s))}}async 
forwardB2(a,e,t){this.b2||this.initB2Buffers(),this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embeddingB2(a,e);let i=this.b2.hidden,s=this.b2.hiddenB,n=this.b2.zeroBuf;for(let f=0;f<this.numLayers;f++){this.decoderLayerB2(f,t,i,s,n),n=this.b2.mlpOut;const b=i;i=s,s=b}if(this.mtp){const f=this.hiddenSize;this.gpu.copyBuffer(i,this.mtp.savedHidden,f*4,f*4,0),this.gpu.copyBuffer(this.b2.mlpOut,this.mtp.savedMlpOut,f*4,f*4,0)}const r=this.weights["model.language_model.norm.weight"],u=this.makeUniform("add_rmsnorm_params",[this.hiddenSize,this.rmsEps]);this.run("add_rmsnorm_b2",[i,this.b2.mlpOut,r,this.b2.normed,u],2),this._dispatchLmHeadB2();const o=(this.temperature??.7)>0,m=this.vocabSize,c=this.makeUniform("argmax_params",[m]);return o?(this.runWithOffsets("topk_extract",[{buffer:this.b2.logits,offset:0,size:m*4},this.b2.topkResult0,c],1),this.runWithOffsets("topk_extract",[{buffer:this.b2.logits,offset:m*4,size:m*4},this.b2.topkResult1,c],1)):(this.runWithOffsets("argmax",[{buffer:this.b2.logits,offset:0,size:m*4},this.b2.argmaxResult0,c],1),this.runWithOffsets("argmax",[{buffer:this.b2.logits,offset:m*4,size:m*4},this.b2.argmaxResult1,c],1)),this.gpu._passCount!==void 0&&!this._b2PassCountLogged&&(console.log(`[PERF] B2: ${this.gpu._passCount} compute passes per 2-token step`),this._b2PassCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),o?(this.gpu.copyBuffer(this.b2.topkResult0,this.b2.topkReadback0,2048),this.gpu.copyBuffer(this.b2.topkResult1,this.b2.topkReadback1,2048)):(this.gpu.copyBuffer(this.b2.argmaxResult0,this.b2.argmaxReadback0,8),this.gpu.copyBuffer(this.b2.argmaxResult1,this.b2.argmaxReadback1,8)),this.gpu.endBatch(),this._readAndSampleB2(o)}async _readAndSampleB2(a){const e=[0,0];a?(await 
Promise.all([this.b2.topkReadback0.mapAsync(GPUMapMode.READ),this.b2.topkReadback1.mapAsync(GPUMapMode.READ)]),e[0]=this._sampleFromTopkReadback(this.b2.topkReadback0),e[1]=this._sampleFromTopkReadback(this.b2.topkReadback1)):(await Promise.all([this.b2.argmaxReadback0.mapAsync(GPUMapMode.READ),this.b2.argmaxReadback1.mapAsync(GPUMapMode.READ)]),e[0]=new Uint32Array(this.b2.argmaxReadback0.getMappedRange().slice(0))[0],e[1]=new Uint32Array(this.b2.argmaxReadback1.getMappedRange().slice(0))[0],this.b2.argmaxReadback0.unmap(),this.b2.argmaxReadback1.unmap());for(const t of e)this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=t:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=t);return e}_sampleFromTopkReadback(a){const e=a.getMappedRange(),t=new Uint32Array(e),i=new Float32Array(e),s=new Uint32Array(256),n=new Float32Array(256);for(let h=0;h<256;h++)s[h]=t[h*2],n[h]=i[h*2+1];a.unmap();const r=this.presencePenalty??0,u=this.repetitionPenalty??1;if((r>0||u>1)&&this._recentTokenCount>0){const h=new Set;for(let o=0;o<this._recentTokenCount;o++)h.add(this._recentTokens[o]);for(let o=0;o<256;o++)h.has(s[o])&&(r>0&&(n[o]-=r),u>1&&(n[o]=n[o]>0?n[o]/u:n[o]*u))}return this._sampleFromArrays(s,n,256)}async _readAndSample(){await this._argmaxReadback.mapAsync(GPUMapMode.READ);const a=new Uint32Array(this._argmaxReadback.getMappedRange().slice(0));this._argmaxReadback.unmap();const e=a[0];return this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=e:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=e),e}_sampleFromArrays(a,e,t){const i=this.temperature??.7,s=this.topP??.8,n=this.topK??20,r=Math.min(n,t),u=this._selBuf||(this._selBuf=new Uint32Array(64)),h=this._selValBuf||(this._selValBuf=new Float32Array(64)),o=this._usedBuf||(this._usedBuf=new Uint8Array(256));o.fill(0);for(let p=0;p<r;p++){let l=-1,g=-1/0;for(let 
S=0;S<t;S++)!o[S]&&e[S]>g&&(g=e[S],l=S);if(l<0)break;u[p]=a[l],h[p]=g,o[l]=1}const m=h[0],c=this._probBuf||(this._probBuf=new Float32Array(64));let f=0;for(let p=0;p<r;p++)c[p]=Math.exp((h[p]-m)/i),f+=c[p];for(let p=0;p<r;p++)c[p]/=f;let b=0,v=r;for(let p=0;p<r;p++)if(b+=c[p],b>=s){v=p+1;break}let U=0;for(let p=0;p<v;p++)U+=c[p];const w=Math.random()*U;let _=0;for(let p=0;p<v;p++)if(_+=c[p],_>=w)return u[p];return u[0]}_sample(a){const e=this.temperature??.7,t=this.topP??.8,i=this.topK??20,s=a.length;if(e<=0){let _=0,p=a[0];for(let l=1;l<s;l++)a[l]>p&&(p=a[l],_=l);return _}const n=Math.max(i,64),r=new Int32Array(n).fill(-1),u=new Float32Array(n).fill(-1/0);let h=-1/0;for(let _=0;_<s;_++){const p=a[_];if(p>h&&(h=p),p>u[n-1]){let l=n-1;for(;l>0&&p>u[l-1];)u[l]=u[l-1],r[l]=r[l-1],l--;u[l]=p,r[l]=_}}const o=Math.min(i,n),m=new Float32Array(o);let c=0;for(let _=0;_<o&&!(r[_]<0);_++)m[_]=Math.exp((u[_]-h)/e),c+=m[_];for(let _=0;_<o;_++)m[_]/=c;let f=0,b=o;for(let _=0;_<o;_++){if(r[_]<0){b=_;break}if(f+=m[_],f>=t){b=_+1;break}}let v=0;for(let _=0;_<b;_++)v+=m[_];const U=Math.random()*v;let w=0;for(let _=0;_<b;_++)if(w+=m[_],w>=U)return r[_];return r[0]}async generate(a,e=512,t){this.seqLen=0,this._recentTokenCount=0,this._replayFlat=null;const i=this.maxSeqLen||4096;if(e=Math.min(e,i-a.length),e<=0)throw new Error(`Prompt (${a.length} tokens) exceeds context size (${i})`);const s=[...a],n=this.textCfg.linear_num_key_heads,r=this.textCfg.linear_key_head_dim,u=this.textCfg.linear_value_head_dim,m=this.linValueHeads/n*u,c=n*(r+r+m);for(let B=0;B<this.numLayers;B++)if(this.layerTypes[B]==="linear_attention"){const d=n*r*m*4,P=3*c*4;this.gpu.device.queue.writeBuffer(this.linState[B],0,new Uint8Array(d)),this.gpu.device.queue.writeBuffer(this.linConvHist[B],0,new Uint8Array(P))}let f=null;for(let B=0;B<a.length;B++)f=await this.forward(a[B],B),this.seqLen=B+1;s.push(f);const 
b=this.config.eos_token_id??this.textCfg.eos_token_id,v=Array.isArray(b)?b:b!=null?[b]:[248044,248046];if(t?.(f,0)||v.includes(f))return s;this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=f:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=f),this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const w=8;this._tokenHistoryBuf||(this._tokenHistoryBuf=this.gpu.createBuffer("token_history",w*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this._tokenHistoryReadback=this.gpu.device.createBuffer({size:w*4,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"token_history_readback"}));let _=0,p=0,l=1,g=f,S=!1;for(;l<e;){const B=performance.now(),d=Math.min(w,e-l);for(let y=0;y<d;y++){const C=this.seqLen+y;if(this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),y===0?this.embedding(g):this.embeddingFromArgmax(),S)this._replayCoreForward(C);else{let O=this.hidden,G=this.hiddenB,M=this.zeroBuf;y===0&&!this._replayFlat&&(this._forceMinSplits=2,this.gpu.startRecording());for(let F=0;F<this.numLayers;F++){this.decoderLayer(F,C,O,G,M),M=this.mlpOut;const H=O;O=G,G=H}const A=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(O,this.mlpOut,this.normed,A,"add_final_norm"),this._dispatchLmHead(),y===0&&this.gpu._recording&&(this._forceMinSplits=0,this._buildReplayFromRecording())}const D=this.temperature??.7;if(D>0){const O=this.repetitionPenalty??1,G=this.presencePenalty??0,M=this._recentTokenCount+y;if(M>0&&(O>1||G>0)){const W=this._makeMixedUniform("penalty_params",[{u:Math.min(M,this._repMaxTokens)},{f:O},{f:G},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,W],Math.ceil(Math.min(M,this._repMaxTokens)/256))}const A=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,A],1);const 
F=Math.random()*4294967295>>>0,H=this._makeMixedUniform("sample_params",[{f:D},{u:this.topK??20},{f:this.topP??.8},{u:F}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,H],1)}else{const O=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,O],1)}const z=(this._recentTokenCount+y)%this._repMaxTokens,x=this.makeUniform(`append_${y}`,[z,y]);this.run("append_token",[this.argmaxResult,this._gpuRecentTokens,this._tokenHistoryBuf,x],1),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.endBatch()}!S&&this._replayFlat&&(S=!0);const P=this.gpu.device.createCommandEncoder();P.copyBufferToBuffer(this._tokenHistoryBuf,0,this._tokenHistoryReadback,0,d*4),this.gpu.device.queue.submit([P.finish()]),await this._tokenHistoryReadback.mapAsync(GPUMapMode.READ);const q=new Uint32Array(this._tokenHistoryReadback.getMappedRange().slice(0,d*4));this._tokenHistoryReadback.unmap();const k=performance.now();_+=k-B,p+=d;let $=!1;for(let y=0;y<d;y++){const C=q[y];s.push(C),this.seqLen++,this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=C:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=C);const D=t?.(C,l);if(l++,D||v.includes(C)){$=!0;break}}if(p%50<w&&console.log(`[T @${p}] ${(_/p).toFixed(1)}ms/tok (batch=${w})`),$)break;g=q[d-1],this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4)}return p>0&&console.log(`[T final @${p}] ${(_/p).toFixed(1)}ms/tok (batch=${w})`),s}async _quantizeBF16Weight(a,e,t,i){const s=this.groupSize,n=e/8,r=e/s,u=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,h=this.gpu.createBuffer(`${i}_qweight`,n*t*4,u),o=this.gpu.createBuffer(`${i}_scales_f32`,r*t*4,u),m=Math.ceil(r*t/2)*4,c=this.gpu.createBuffer(`${i}_scales`,m,u),f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await R(async()=>{const{SHADERS:k}=await 
import("./gpu-ops-Bq_PFJSE.js").then($=>$.b);return{SHADERS:k}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),b=this.gpu.createBufferFromData(`${i}_qparams`,new Uint32Array([e,t,s]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),v=this.gpu.createBindGroup(f,0,[a,h,o,b]),U=65535,w=Math.min(t,U),_=Math.ceil(t/U),p=this.gpu.device.createCommandEncoder(),l=p.beginComputePass();l.setPipeline(f),l.setBindGroup(0,v),l.dispatchWorkgroups(w,_),l.end(),this.gpu.device.queue.submit([p.finish()]);const g=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await R(async()=>{const{SHADERS:k}=await import("./gpu-ops-Bq_PFJSE.js").then($=>$.b);return{SHADERS:k}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),S=Math.ceil(r*t/2),B=this.gpu.createBufferFromData(`${i}_pparams`,new Uint32Array([S]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),d=this.gpu.createBindGroup(g,0,[o,c,B]),P=this.gpu.device.createCommandEncoder(),q=P.beginComputePass();return q.setPipeline(g),q.setBindGroup(0,d),q.dispatchWorkgroups(Math.ceil(S/256)),q.end(),this.gpu.device.queue.submit([P.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),o.destroy(),b.destroy(),B.destroy(),{qweight:h,scales:c}}async initMTP(a){if(this.mtp)return;const e=this.hiddenSize,t=this.intermediateSize,i=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,s=performance.now(),n="Qwen/Qwen3.5-2B";console.log(`[MTP] Downloading MTP weights from ${n}...`);const{loadMTPWeights:r}=await R(async()=>{const{loadMTPWeights:U}=await import("./safetensors-loader-CwGm5mJX.js");return{loadMTPWeights:U}},[],import.meta.url),u=await r(n,a);console.log(`[MTP] Downloaded ${Object.keys(u).length} MTP tensors`),this.mtp={weights:{},qweights:{}};const h={};for(const[U,w]of Object.entries(u)){const _=this.gpu.createBufferFromData(`mtp_${U}`,w.data);h[U]=_,this.mtp.weights[U]=_}const 
o=[{name:"mtp.fc.weight",K:e*2,N:e},{name:"mtp.layers.0.self_attn.q_proj.weight",K:e,N:this.numHeads*this.headDim*2},{name:"mtp.layers.0.self_attn.k_proj.weight",K:e,N:this.numKVHeads*this.headDim},{name:"mtp.layers.0.self_attn.v_proj.weight",K:e,N:this.numKVHeads*this.headDim},{name:"mtp.layers.0.self_attn.o_proj.weight",K:e,N:e},{name:"mtp.layers.0.mlp.gate_proj.weight",K:e,N:t},{name:"mtp.layers.0.mlp.up_proj.weight",K:e,N:t},{name:"mtp.layers.0.mlp.down_proj.weight",K:t,N:e}];for(const{name:U,K:w,N:_}of o){const{qweight:p,scales:l}=await this._quantizeBF16Weight(h[U],w,_,`mtp_${U}`);this.mtp.qweights[U]={qweight:p,scales:l},h[U].destroy(),delete this.mtp.weights[U]}this.mtp.normRaw={};const m=["mtp.pre_fc_norm_hidden.weight","mtp.pre_fc_norm_embedding.weight","mtp.layers.0.input_layernorm.weight","mtp.layers.0.post_attention_layernorm.weight","mtp.norm.weight","mtp.layers.0.self_attn.q_norm.weight","mtp.layers.0.self_attn.k_norm.weight"];for(const U of m){const w=u[U];w&&(this.mtp.normRaw[U]=new Uint32Array(w.data.buffer.slice(w.data.byteOffset,w.data.byteOffset+w.data.byteLength)))}const f=4096*this.numKVHeads*this.headDim*4;this.mtp.kvCache={keys:this.gpu.createBuffer("mtp_kv_keys",f,i),values:this.gpu.createBuffer("mtp_kv_values",f,i)},this.mtp.concatBuf=this.gpu.createBuffer("mtp_concat",e*2*4,i),this.mtp.fcOut=this.gpu.createBuffer("mtp_fc_out",e*4,i),this.mtp.savedHidden=this.gpu.createBuffer("mtp_saved_hidden",e*4,i),this.mtp.savedMlpOut=this.gpu.createBuffer("mtp_saved_mlp_out",e*4,i),this.mtp.seqLen=0,this._buildMTPSplitQKNormUniform(),this._buildMTPFusedMLPUniform(),await this._buildTrimmedLmHead(8e3);const b=((performance.now()-s)/1e3).toFixed(1),v=o.length;console.log(`[MTP] Initialized in ${b}s: ${v} INT4 weights, 1 KV cache layer`)}_buildMTPSplitQKNormUniform(){const a=this.numHeads,e=this.numKVHeads,t=this.headDim,i=a*t,s=e*t,n=(i+s)/2,u=32+Math.ceil(n/4)*16,h=new ArrayBuffer(u),o=new 
DataView(h);o.setUint32(0,a,!0),o.setUint32(4,e,!0),o.setUint32(8,t,!0),o.setFloat32(12,this.rmsEps,!0),o.setUint32(16,0,!0);const m=this.mtp.normRaw["mtp.layers.0.self_attn.q_norm.weight"],c=this.mtp.normRaw["mtp.layers.0.self_attn.k_norm.weight"];if(m)for(let b=0;b<i/2;b++){const v=Math.floor(b/4),U=b%4;o.setUint32(32+v*16+U*4,m[b],!0)}if(c){const b=i/2;for(let v=0;v<s/2;v++){const U=b+v,w=Math.floor(U/4),_=U%4;o.setUint32(32+w*16+_*4,c[v],!0)}}const f=this.gpu.device.createBuffer({size:u,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"mtp_fused_sqk"});this.gpu.device.queue.writeBuffer(f,0,new Uint8Array(h)),this.mtp.fusedSQKParams=f}_buildMTPFusedMLPUniform(){}async _buildTrimmedLmHead(a=8e3){const e=this.hiddenSize,t=this.gpu,i=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,s=new Uint32Array(a);for(let g=0;g<a;g++)s[g]=g;this.mtp.trimmedToFull=s,this.mtp.trimmedVocabSize=a;const n=t.createBufferFromData("mtp_trim_indices",s),u=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",h=this.weights[u],o=e/2,m=a*o*4,c=t.createBuffer("mtp_trim_gathered",m,i),f=(await R(async()=>{const{SHADERS:g}=await import("./gpu-ops-Bq_PFJSE.js").then(S=>S.b);return{SHADERS:g}},[],import.meta.url)).SHADERS.gather_rows_bf16,b=t.getOrCreatePipeline("gather_rows_bf16",f),v=t.createBufferFromData("mtp_trim_gparams",new Uint32Array([o,a]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),U=t.createBindGroup(b,0,[h,n,c,v]),w=t.device.createCommandEncoder(),_=w.beginComputePass();_.setPipeline(b),_.setBindGroup(0,U),_.dispatchWorkgroups(Math.ceil(o/256),a),_.end(),t.device.queue.submit([w.finish()]);const{qweight:p,scales:l}=await this._quantizeBF16Weight(c,e,a,"mtp_trim_lmhead");this.mtp.trimmedLmHead={qweight:p,scales:l},this.mtp.trimmedLogits=t.createBuffer("mtp_trimmed_logits",a*4,i),c.destroy(),n.destroy(),v.destroy(),console.log(`[MTP] Trimmed lm_head: ${a} tokens, 
${(p.size/1024/1024).toFixed(1)}MB qw + ${(l.size/1024/1024).toFixed(1)}MB sc`)}_dispatchTrimmedLmHead(){const a=this.hiddenSize,e=this.mtp.trimmedVocabSize,t=this.mtp.trimmedLmHead,s=a/this.groupSize%4===0,n=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]);if(s){const r=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached("mtp_trim_lmhead",r,[this.normed,t.qweight,t.scales,this.mtp.trimmedLogits,n],this.wg8(e))}else{const r=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached("mtp_trim_lmhead",r,[this.normed,t.qweight,t.scales,this.mtp.trimmedLogits,n],this.wg4(e))}}_mtpGetQWeight(a){return this.mtp.qweights[a]}_mtpGptqMatvec(a,e,t,i,s){const n=this._mtpGetQWeight(t);if(!n)throw new Error(`MTP weight not found: ${t}`);const u=i/this.groupSize%4===0,h=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);if(u){const o=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached(`mtp_${t}`,o,[a,n.qweight,n.scales,e,h],this.wg8(s))}else{const o=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached(`mtp_${t}`,o,[a,n.qweight,n.scales,e,h],this.wg4(s))}}_mtpSaveHidden(a,e){this.mtp&&(this.gpu.copyBuffer(a,this.mtp.savedHidden,this.hiddenSize*4),this.gpu.copyBuffer(e,this.mtp.savedMlpOut,this.hiddenSize*4))}async mtpForward(a){if(!this.mtp)throw new Error("MTP not initialized. 
Call initMTP() first.");return this.mtpForwardSubmit(a),await this._readAndSample()}mtpForwardSubmit(a){const e=this.hiddenSize,t=this.intermediateSize,i=this.mtp.seqLen;this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);const s=this.makeUniform("add_rmsnorm_params",[e,this.rmsEps]);this.runCached("mtp_emb_norm","rmsnorm",[this.hidden,this.mtp.weights["mtp.pre_fc_norm_embedding.weight"],this.normed,s],1),this.gpu.copyBuffer(this.normed,this.mtp.concatBuf,e*4),this.runCached("mtp_hid_norm","add_rmsnorm_ro",[this.mtp.savedHidden,this.mtp.savedMlpOut,this.mtp.weights["mtp.pre_fc_norm_hidden.weight"],this.normed,s],1),this.gpu.copyBuffer(this.normed,this.mtp.concatBuf,e*4,0,e*4),this._mtpGptqMatvec(this.mtp.concatBuf,this.mtp.fcOut,"mtp.fc.weight",e*2,e),this.runCached("mtp_input_norm","rmsnorm",[this.mtp.fcOut,this.mtp.weights["mtp.layers.0.input_layernorm.weight"],this.normed,s],1);const n="mtp.layers.0.self_attn",r=this._mtpGptqMatvecOp(this.normed,this.qProjFull,`${n}.q_proj.weight`,e,this.numHeads*this.headDim*2),u=this._mtpGptqMatvecOp(this.normed,this.kProj,`${n}.k_proj.weight`,e,this.numKVHeads*this.headDim),h=this._mtpGptqMatvecOp(this.normed,this.vProj,`${n}.v_proj.weight`,e,this.numKVHeads*this.headDim);this.gpu.dispatchMulti([r,u,h].filter(Boolean)),this._gqaDv.setUint32(0,i,!0),this.gpu.device.queue.writeBuffer(this.mtp.fusedSQKParams,16,this._gqaData,0,4),this.runCached("mtp_sqk",this._splitQKNormShaderKey,[this.qProjFull,this.kProj,this.vProj,this.qProj,this.qGate,this.mtp.kvCache.keys,this.mtp.kvCache.values,this.mtp.fusedSQKParams],this.numHeads+this.numKVHeads);const 
o=i+1;this._gqaDv.setUint32(0,o,!0),this._gqaDv.setUint32(4,this.headDim,!0),this._gqaDv.setUint32(8,this.numKVHeads,!0),this._gqaDv.setUint32(12,this.numHeads,!0),this._gqaDv.setUint32(16,this.numHeads/this.numKVHeads,!0),this._gqaDv.setUint32(20,this.partialDim,!0),this._gqaDv.setUint32(24,i,!0),this._gqaDv.setFloat32(28,this.ropeTheta,!0),this.gpu.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData),this.runCached("mtp_gqa","gqa_attention_head",[this.qProj,this.mtp.kvCache.keys,this.mtp.kvCache.values,this.attnOut,this._gqaParamBuf],this.numHeads);const m=this._mtpGetQWeight(`${n}.o_proj.weight`),c=this.numHeads*this.headDim,f=this.makeUniform(`fused_sig_mv_${c}_${e}`,[c,e,this.groupSize]);c/this.groupSize%4===0&&this.gpu.hasF16?this.runCached("mtp_sig_oproj","fused_sigmoid_gptq_4t_f16",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg8(e)):this.gpu.hasF16?this.runCached("mtp_sig_oproj","fused_sigmoid_gptq_f16",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg4(e)):this.runCached("mtp_sig_oproj","fused_sigmoid_gptq",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg4(e)),this.runCached("mtp_post_norm","add_rmsnorm_ro",[this.mtp.fcOut,this.qProj,this.mtp.weights["mtp.layers.0.post_attention_layernorm.weight"],this.normed,s],1);const U=this.gpu.hasF16?"fused_gate_up_silu_f16":"fused_gate_up_silu",w=this.makeUniform(`fused_mv_${e}_${t}`,[e,t,this.groupSize]),_=this.mtp.qweights["mtp.layers.0.mlp.gate_proj.weight"],p=this.mtp.qweights["mtp.layers.0.mlp.up_proj.weight"];this.runCached("mtp_gate_up_silu",U,[this.normed,_.qweight,_.scales,p.qweight,p.scales,this.mlpIntermediate,w],this.wg4(t)),this._mtpGptqMatvec(this.mlpIntermediate,this.mlpOut,"mtp.layers.0.mlp.down_proj.weight",t,e),this.runCached("mtp_final_norm","three_way_add_rmsnorm",[this.mtp.fcOut,this.qProj,this.mlpOut,this.mtp.weights["mtp.norm.weight"],this.hidden,this.normed,s],1),this._dispatchLmHead();const 
l=this.makeUniform("mtp_argmax_params",[this.vocabSize]);this.runCached("mtp_argmax","argmax",[this.logits,this.argmaxResult,l],1),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch(),this.mtp.seqLen++}_mtpSnapshotDeltaNet(){if(!this.mtp._deltaNetSnapshot){const e=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.mtp._deltaNetSnapshot={states:{},convs:{}};for(let t=0;t<this.numLayers;t++)this.layerTypes[t]==="linear_attention"&&(this.mtp._deltaNetSnapshot.states[t]=this.gpu.createBuffer(`mtp_snap_state_${t}`,this.linState[t].size,e),this.mtp._deltaNetSnapshot.convs[t]=this.gpu.createBuffer(`mtp_snap_conv_${t}`,this.linConvHist[t].size,e))}const a=this.mtp._deltaNetSnapshot;a.mtpSeqLen=this.mtp.seqLen;for(const e of Object.keys(a.states))this.gpu.copyBuffer(this.linState[e],a.states[e],this.linState[e].size),this.gpu.copyBuffer(this.linConvHist[e],a.convs[e],this.linConvHist[e].size)}_mtpRestoreDeltaNet(){const a=this.mtp._deltaNetSnapshot;if(a){a.mtpSeqLen!==void 0&&(this.mtp.seqLen=a.mtpSeqLen);for(const e of Object.keys(a.states))this.gpu.copyBuffer(a.states[e],this.linState[e],this.linState[e].size),this.gpu.copyBuffer(a.convs[e],this.linConvHist[e],this.linConvHist[e].size)}}async generateWithMTP(a,e=512,t){if(!this.mtp)throw new Error("MTP not initialized. 
Call initMTP() first.");this.seqLen=0,this.mtp.seqLen=0,this._recentTokenCount=0;const i=[...a],s=this.textCfg.linear_num_key_heads,n=this.textCfg.linear_key_head_dim,r=this.textCfg.linear_value_head_dim,h=this.linValueHeads/s*r,o=s*(n+n+h);for(let B=0;B<this.numLayers;B++)if(this.layerTypes[B]==="linear_attention"){const d=s*n*h*4,P=3*o*4;this.gpu.device.queue.writeBuffer(this.linState[B],0,new Uint8Array(d)),this.gpu.device.queue.writeBuffer(this.linConvHist[B],0,new Uint8Array(P))}let m=null;for(let B=0;B<a.length;B++)m=await this.forward(a[B],B),this.seqLen=B+1;i.push(m);const c=this.config.eos_token_id??this.textCfg.eos_token_id,f=Array.isArray(c)?c:c!=null?[c]:[248044,248046];if(t?.(m,0)||f.includes(m))return i;let v=1,U=0,w=0,_=m,p=0,l=0;for(;v<e;){const B=performance.now(),d=await this.mtpForward(_);this._mtpSnapshotDeltaNet();const P=this.seqLen,q=await this.forwardB2(_,d,this.seqLen);this.seqLen+=2;const k=q[0],$=q[1];if(k===d){U++,i.push(d),v++;let C=t?.(d,v-1);if(C||f.includes(d)||(i.push($),v++,C=t?.($,v-1),C||f.includes($)))break;_=$}else{w++,this._mtpRestoreDeltaNet(),this.seqLen=P;const C=await this.forward(_,this.seqLen);if(this.seqLen++,i.push(C),v++,t?.(C,v-1)||f.includes(C))break;_=C}const y=performance.now();if(p+=y-B,l++,l%25===0){const C=U/(U+w)*100,D=v/l;console.log(`[MTP @${l}] ${(p/l).toFixed(1)}ms/step, ${(v/(p/1e3)).toFixed(0)} tok/s, accept=${C.toFixed(0)}%, ${D.toFixed(1)} tok/step`)}}const g=U/Math.max(1,U+w)*100;return console.log(`[MTP final] ${(v/((p||1)/1e3)).toFixed(0)} tok/s, accept=${g.toFixed(0)}% (${U}/${U+w}), ${v} tokens`),i}_mtpGptqMatvecOp(a,e,t,i,s){const n=this._mtpGetQWeight(t);if(!n)return null;const u=i/this.groupSize%4===0,h=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);if(u){const o=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";return this.prepOpCached(`mtp_${t}`,o,[a,n.qweight,n.scales,e,h],this.wg8(s))}else{const o=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";return 
this.prepOpCached(`mtp_${t}`,o,[a,n.qweight,n.scales,e,h],this.wg4(s))}}}export{L as Qwen35Model};

assets/{test-BQCz-9iM.js → test-BEFPr_G8.js} RENAMED Viewed

@@ -1,5 +1,5 @@
-const __vite__mapDeps=(i,m=__vite__mapDeps,d=(m.f||(m.f=["./qwen35-model-D0qiY8Dx.js","./gpu-ops-Bq_PFJSE.js"])))=>i.map(i=>d[i]);
-import{G as nt,S as ct,_ as et}from"./gpu-ops-Bq_PFJSE.js";class ot{constructor(){this.gpu=null,this.results=[],this.pipelines={}}async init(){this.gpu=new nt,await this.gpu.init();for(const[e,t]of Object.entries(ct))this.pipelines[e]=this.gpu.getOrCreatePipeline(e,t);return this}async runAll(e){const t=[["GPTQ MatVec",()=>this.testGPTQMatvec()],["GPTQ MatVec (group boundary)",()=>this.testGPTQMatvecGroups()],["GPTQ MatVec (gs=32)",()=>this.testGPTQMatvecGS32()],["GPTQ Split-K",()=>this.testGPTQSplitK()],["GPTQ F16",()=>this.testGPTQF16()],["BF16 MatVec",()=>this.testBF16Matvec()],["RMSNorm",()=>this.testRMSNorm()],["RMSNorm (1+w formula)",()=>this.testRMSNorm1PlusW()],["SiLU * Mul",()=>this.testSiLUMul()],["Vector Add",()=>this.testAdd()],["Embedding (BF16)",()=>this.testEmbedding()],["Argmax",()=>this.testArgmax()],["Argmax (large)",()=>this.testArgmaxLarge()],["Split interleaved",()=>this.testSplit()],["Sigmoid Mul",()=>this.testSigmoidMul()],["Head RMSNorm (1+w)",()=>this.testHeadRMSNorm()],["Head RMSNorm (nogated, w)",()=>this.testHeadRMSNormNogated()],["Causal Conv1d",()=>this.testCausalConv1d()],["KV Cache Store",()=>this.testKVCacheStore()],["GQA Attention (single pos)",()=>this.testGQAAttention()],["DeltaNet Recurrent",()=>this.testDeltaNetRecurrent()],["Add + RMSNorm (fused)",()=>this.testAddRMSNorm()],["Three-way Add + RMSNorm",()=>this.testThreeWayAddRMSNorm()],["GQA RoPE rotate_half",()=>this.testGQARoPERotateHalf()],["Fused sigmoid + GPTQ",()=>this.testFusedSigmoidGPTQ()],["Fused SiLU + GPTQ",()=>this.testFusedSiluGPTQ()],["Fused AddNorm+GateUp+SiLU",()=>this.testFusedAddNormGateUpSiLU()],["Fused Split+QKNorm+KVStore",()=>this.testFusedSplitQKNormKVStore()],["Add+RMSNorm B=2",()=>this.testAddRMSNormB2()],["Add+RMSNorm RO B=2",()=>this.testAddRMSNormROB2()],["Three-way Add+RMSNorm B=2",()=>this.testThreeWayAddRMSNormB2()],["Fused SiLU+GPTQ B=2",()=>this.testFusedSiluGPTQB2()],["Fused Sigmoid+GPTQ B=2",()=>this.testFusedSigmoidGPTQB2()],["Fused 
GateUp+SiLU B=2",()=>this.testFusedGateUpSiLUB2()],["GPTQ Matmul B=2",()=>this.testGPTQMatmulB2()],["GPTQ Matmul B=2 F16",()=>this.testGPTQMatmulB2F16()],["GPTQ Matmul B=2 4T F16",()=>this.testGPTQMatmulB2_4T_F16()]];for(const[o,s]of t)try{const r=await s();this.results.push({name:o,...r}),e?.({name:o,...r})}catch(r){this.results.push({name:o,pass:!1,error:r.message,maxErr:NaN}),e?.({name:o,pass:!1,error:r.message,maxErr:NaN})}return this.results}destroy(){this.gpu?.destroy()}makeBuffer(e,t,o=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC){return this.gpu.createBufferFromData(e,new Uint8Array(t.buffer,t.byteOffset,t.byteLength),o)}makeF32Buffer(e,t){return this.makeBuffer(e,new Float32Array(t))}makeU32Buffer(e,t){return this.makeBuffer(e,new Uint32Array(t))}makeOutputBuffer(e,t){return this.gpu.createBuffer(e,t*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC)}makeUniform(e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),o=new DataView(t);for(let s=0;s<e.length;s++)typeof e[s]=="number"&&!Number.isInteger(e[s])?o.setFloat32(s*4,e[s],!0):o.setUint32(s*4,e[s],!0);return this.gpu.createBufferFromData("uniform",new Uint8Array(t),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}makeUniformTyped(e,t){const o=e.length+t.length,s=new ArrayBuffer(Math.max(16,Math.ceil(o*4/16)*16)),r=new DataView(s);let a=0;for(const d of e)r.setUint32(a,d,!0),a+=4;for(const d of t)r.setFloat32(a,d,!0),a+=4;return this.gpu.createBufferFromData("uniform_typed",new Uint8Array(s),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}dispatch(e,t,o,s=1){const r=this.pipelines[e],a=this.gpu.createBindGroup(r,0,t);this.gpu.dispatch(r,[a],o,s)}async readback(e,t){return this.gpu.readBuffer(e,t*4)}async readbackU32(e,t){const o=this.gpu.createReadbackBuffer("_rb",t*4),s=this.gpu.device.createCommandEncoder();s.copyBufferToBuffer(e,0,o,0,t*4),this.gpu.device.queue.submit([s.finish()]),await o.mapAsync(GPUMapMode.READ);const r=new 
Uint32Array(o.getMappedRange().slice(0));return o.unmap(),o.destroy(),r}compare(e,t,o=1e-4){let s=0;const r=[];for(let a=0;a<t.length;a++){const d=Math.abs(e[a]-t[a]);s=Math.max(s,d),d>o&&r.push({idx:a,got:e[a],expected:t[a],err:d})}return{pass:r.length===0,maxErr:s,errors:r.slice(0,5)}}cpuF32ToBF16Packed(e){const t=Math.ceil(e.length/2),o=new Uint32Array(t);for(let s=0;s<e.length;s++){const a=new Uint32Array(new Float32Array([e[s]]).buffer)[0]>>>16,d=Math.floor(s/2);s%2===0?o[d]=a:o[d]|=a<<16}return o}cpuF32ToF16Packed(e){const t=Math.ceil(e.length/2),o=new Uint32Array(t);for(let s=0;s<e.length;s++){const r=this.f32ToF16(e[s]),a=Math.floor(s/2);s%2===0?o[a]=r:o[a]|=r<<16}return o}f32ToF16(e){const t=new ArrayBuffer(4);new Float32Array(t)[0]=e;const o=new Uint32Array(t)[0],s=o>>31&1;let r=o>>23&255,a=o&8388607;return r===0?s<<15:r===255?s<<15|31744|(a?512:0):(r=r-127+15,r>=31?s<<15|31744:r<=0?s<<15:s<<15|r<<10|a>>13)}f16ToF32(e){const t=e>>15&1,o=e>>10&31,s=e&1023;return o===0?s===0?t?-0:0:(t?-1:1)*Math.pow(2,-14)*(s/1024):o===31?s?NaN:t?-1/0:1/0:(t?-1:1)*Math.pow(2,o-15)*(1+s/1024)}bf16ToF32(e){const t=e<<16,o=new ArrayBuffer(4);return new Uint32Array(o)[0]=t,new Float32Array(o)[0]}async testGPTQMatvec(){const r=new Float32Array(64);for(let k=0;k<64;k++)r[k]=(k+1)*.1;const a=new Uint32Array(32),d=new Float32Array(256);for(let k=0;k<8;k++)for(let u=0;u<4;u++){let m=0;for(let g=0;g<8;g++){const i=k*8+g,c=(i+u)%15;d[i*4+u]=c-8,m|=c<<g*4}a[k*4+u]=m}const l=new Float32Array(4);for(let k=0;k<4;k++)l[k]=.5+k*.1;const w=this.cpuF32ToF16Packed(l),h=new Float32Array(4);for(let k=0;k<4;k++){let u=0;for(let m=0;m<64;m++){const i=Math.floor(m/64)*4+k,c=w[Math.floor(i/2)]>>i%2*16&65535,_=this.f16ToF32(c);u+=r[m]*d[m*4+k]*_}h[k]=u}const F=this.makeF32Buffer("gptq_input",r),n=this.makeU32Buffer("gptq_qw",a),B=this.makeU32Buffer("gptq_scales",w),f=this.makeOutputBuffer("gptq_out",4),p=this.makeUniform([64,4,64]);this.dispatch("gptq_matvec",[F,n,B,f,p],Math.ceil(4/32));const 
y=await this.readback(f,4);return this.compare(y,h,.01)}async testGPTQMatvecGroups(){const r=new Float32Array(128);for(let u=0;u<128;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(64),d=new Float32Array(512);for(let u=0;u<16;u++)for(let m=0;m<4;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c*3+m*7)%15;d[c*4+m]=_-8,g|=_<<i*4}a[u*4+m]=g}const l=128/64,w=new Float32Array(l*4);for(let u=0;u<l*4;u++)w[u]=.3+u*.15;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let u=0;u<4;u++){let m=0;for(let g=0;g<128;g++){const c=Math.floor(g/64)*4+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*4+u]*b}F[u]=m}const n=this.makeF32Buffer("gptq2_in",r),B=this.makeU32Buffer("gptq2_qw",a),f=this.makeU32Buffer("gptq2_sc",h),p=this.makeOutputBuffer("gptq2_out",4),y=this.makeUniform([128,4,64]);this.dispatch("gptq_matvec",[n,B,f,p,y],Math.ceil(4/32));const k=await this.readback(p,4);return this.compare(k,F,.01)}async testGPTQMatvecGS32(){const r=new Float32Array(128);for(let u=0;u<128;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(64),d=new Float32Array(512);for(let u=0;u<16;u++)for(let m=0;m<4;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c*3+m*7)%15;d[c*4+m]=_-8,g|=_<<i*4}a[u*4+m]=g}const l=128/32,w=new Float32Array(l*4);for(let u=0;u<l*4;u++)w[u]=.3+u*.15;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let u=0;u<4;u++){let m=0;for(let g=0;g<128;g++){const c=Math.floor(g/32)*4+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*4+u]*b}F[u]=m}const n=this.makeF32Buffer("gptq32_in",r),B=this.makeU32Buffer("gptq32_qw",a),f=this.makeU32Buffer("gptq32_sc",h),p=this.makeOutputBuffer("gptq32_out",4),y=this.makeUniform([128,4,32]);this.dispatch("gptq_matvec",[n,B,f,p,y],Math.ceil(4/32));const k=await this.readback(p,4);return this.compare(k,F,.01)}async testGPTQSplitK(){const a=new Float32Array(256);for(let i=0;i<256;i++)a[i]=Math.sin(i*.3)*.5;const d=new Uint32Array(1024),l=new Float32Array(256*32);for(let 
i=0;i<32;i++)for(let c=0;c<32;c++){let _=0;for(let b=0;b<8;b++){const N=i*8+b,x=(N*3+c*7)%15;l[N*32+c]=x-8,_|=x<<b*4}d[i*32+c]=_}const w=256/64,h=new Float32Array(w*32);for(let i=0;i<w;i++)for(let c=0;c<32;c++)h[i*32+c]=.3+(i*32+c)*.02;const F=this.cpuF32ToF16Packed(h),n=new Float32Array(32);for(let i=0;i<32;i++){let c=0;for(let _=0;_<256;_++){const N=Math.floor(_/64)*32+i,x=F[Math.floor(N/2)]>>N%2*16&65535,q=this.f16ToF32(x);c+=a[_]*l[_*32+i]*q}n[i]=c}const B=this.makeF32Buffer("sk_in",a),f=this.makeU32Buffer("sk_qw",d),p=this.makeU32Buffer("sk_sc",F),y=this.makeOutputBuffer("sk_partials",128),k=this.makeOutputBuffer("sk_out",32),u=this.makeUniform([256,32,64,4]),m=this.makeUniform([32,4]);this.dispatch("gptq_splitk",[B,f,p,y,u],Math.ceil(32/32),4),this.dispatch("reduce_splitk",[y,k,m],Math.ceil(32/256));const g=await this.readback(k,32);return this.compare(g,n,.01)}async testGPTQF16(){if(!this.gpu.hasF16||!this.pipelines.gptq_matvec_f16)return{pass:!0,maxErr:0,note:"shader-f16 not available, skipped"};const e=128,t=4,o=64,s=e/8,r=new Float32Array(e);for(let u=0;u<e;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(s*t),d=new Float32Array(e*t);for(let u=0;u<s;u++)for(let m=0;m<t;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c+m*3)%15;d[c*t+m]=_-8,g|=_<<i*4}a[u*t+m]=g}const l=e/o,w=new Float32Array(l*t);for(let u=0;u<l;u++)for(let m=0;m<t;m++)w[u*t+m]=.5+u*.1;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(t);for(let u=0;u<t;u++){let m=0;for(let g=0;g<e;g++){const c=Math.floor(g/o)*t+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*t+u]*b}F[u]=m}const n=this.makeF32Buffer("f16_in",r),B=this.makeU32Buffer("f16_qw",a),f=this.makeU32Buffer("f16_sc",h),p=this.makeOutputBuffer("f16_out",t),y=this.makeUniform([e,t,o]);this.dispatch("gptq_matvec_f16",[n,B,f,p,y],Math.ceil(t/32));const k=await this.readback(p,t);return this.compare(k,F,.5)}async testBF16Matvec(){const o=new Float32Array(8);for(let n=0;n<8;n++)o[n]=(n+1)*.2;const s=new 
Float32Array(32);for(let n=0;n<4;n++)for(let B=0;B<8;B++)s[n*8+B]=Math.cos(n*.5+B*.3);const r=this.cpuF32ToBF16Packed(s),a=new Float32Array(4);for(let n=0;n<4;n++){let B=0;for(let f=0;f<8;f++){const p=n*8+f,y=r[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);B+=o[f]*u}a[n]=B}const d=this.makeF32Buffer("bf16mv_in",o),l=this.makeU32Buffer("bf16mv_w",r),w=this.makeOutputBuffer("bf16mv_out",4),h=this.makeUniform([8,4]);this.dispatch("bf16_matvec",[d,l,w,h],Math.ceil(4/32));const F=await this.readback(w,4);return this.compare(F,a,.001)}async testRMSNorm(){const o=new Float32Array(16);for(let f=0;f<16;f++)o[f]=(f-8)*.3;const s=new Float32Array(16);for(let f=0;f<16;f++)s[f]=Math.sin(f*.4)*.1;const r=this.cpuF32ToBF16Packed(s);let a=0;for(let f=0;f<16;f++)a+=o[f]*o[f];const d=1/Math.sqrt(a/16+1e-6),l=new Float32Array(16);for(let f=0;f<16;f++){const p=f,y=r[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);l[f]=o[f]*d*(1+u)}const w=this.makeF32Buffer("rmsn_in",o),h=this.makeU32Buffer("rmsn_w",r),F=this.makeOutputBuffer("rmsn_out",16),n=this.makeUniformTyped([16],[1e-6]);this.dispatch("rmsnorm",[w,h,F,n],1);const B=await this.readback(F,16);return this.compare(B,l,1e-4)}async testRMSNorm1PlusW(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array(8).fill(0),r=this.cpuF32ToBF16Packed(s);let a=0;for(let f=0;f<8;f++)a+=o[f]*o[f];const d=1/Math.sqrt(a/8+1e-6),l=new Float32Array(8);for(let f=0;f<8;f++)l[f]=o[f]*d;const w=this.makeF32Buffer("rmsn1_in",o),h=this.makeU32Buffer("rmsn1_w",r),F=this.makeOutputBuffer("rmsn1_out",8),n=this.makeUniformTyped([8],[1e-6]);this.dispatch("rmsnorm",[w,h,F,n],1);const B=await this.readback(F,8);return this.compare(B,l,1e-4)}async testSiLUMul(){const t=new Float32Array([0,1,-1,2,-2,.5,-.5,3]),o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array(8);for(let h=0;h<8;h++){const F=t[h]/(1+Math.exp(-t[h]));s[h]=F*o[h]}const 
r=this.makeF32Buffer("silu_gate",t),a=this.makeF32Buffer("silu_up",o),d=this.makeOutputBuffer("silu_out",8),l=this.makeUniform([8]);this.dispatch("silu_mul",[r,a,d,l],Math.ceil(8/32));const w=await this.readback(d,8);return this.compare(w,s,1e-5)}async testAdd(){const t=new Float32Array([1,2,3,4,5,6,7,8]),o=new Float32Array([.1,.2,.3,.4,.5,.6,.7,.8]),s=new Float32Array(8);for(let w=0;w<8;w++)s[w]=t[w]+o[w];const r=this.makeF32Buffer("add_a",t),a=this.makeF32Buffer("add_b",o),d=this.makeUniform([8]);this.dispatch("add",[r,a,d],Math.ceil(8/32));const l=await this.readback(r,8);return this.compare(l,s,1e-6)}async testEmbedding(){const o=new Float32Array(32);for(let F=0;F<o.length;F++)o[F]=(F+1)*.01;const s=this.cpuF32ToBF16Packed(o),r=2,a=new Float32Array(8);for(let F=0;F<8;F++){const n=r*8+F,B=s[Math.floor(n/2)],f=n%2===0?B&65535:B>>16&65535;a[F]=this.bf16ToF32(f)}const d=this.makeU32Buffer("emb_w",s),l=this.makeOutputBuffer("emb_out",8),w=this.makeUniform([r,8]);this.dispatch("embedding",[d,l,w],Math.ceil(8/256));const h=await this.readback(l,8);return this.compare(h,a,.001)}async testArgmax(){const t=new Float32Array(16);for(let w=0;w<16;w++)t[w]=Math.sin(w*.7)*3;const o=t.indexOf(Math.max(...t)),s=this.makeF32Buffer("argmax_in",t),r=this.makeOutputBuffer("argmax_res",2),a=this.makeUniform([16]);this.dispatch("argmax",[s,r,a],1);const l=(await this.readbackU32(r,2))[0];return{pass:l===o,maxErr:Math.abs(l-o),errors:l!==o?[{idx:0,got:l,expected:o}]:[]}}async testArgmaxLarge(){const t=new Float32Array(1024);for(let d=0;d<1024;d++)t[d]=Math.sin(d*.01)-2;t[773]=99;const o=this.makeF32Buffer("argmax_lg_in",t),s=this.makeOutputBuffer("argmax_lg_res",2),r=this.makeUniform([1024]);this.dispatch("argmax",[o,s,r],1);const a=await this.readbackU32(s,2);return{pass:a[0]===773,maxErr:Math.abs(a[0]-773),errors:a[0]!==773?[{got:a[0],expected:773}]:[]}}async testSplit(){const s=new Float32Array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]),r=new Float32Array([1,2,3,4,9,10,11,12]),a=new 
Float32Array([5,6,7,8,13,14,15,16]),d=this.makeF32Buffer("split_src",s),l=this.makeOutputBuffer("split_a",8),w=this.makeOutputBuffer("split_b",8),h=this.makeUniform([8,2,4]);this.dispatch("split",[d,l,w,h],Math.ceil(8/32));const F=await this.readback(l,8),n=await this.readback(w,8),B=this.compare(F,r,1e-6),f=this.compare(n,a,1e-6);return{pass:B.pass&&f.pass,maxErr:Math.max(B.maxErr,f.maxErr),errors:[...B.errors,...f.errors]}}async testSigmoidMul(){const t=new Float32Array([1,2,3,4,5,6,7,8]),o=new Float32Array([0,1,-1,2,-2,.5,-.5,3]),s=new Float32Array(8);for(let h=0;h<8;h++)s[h]=t[h]/(1+Math.exp(-o[h]));const r=this.makeF32Buffer("sigmul_x",t),a=this.makeF32Buffer("sigmul_g",o),d=this.makeOutputBuffer("sigmul_out",8),l=this.makeUniform([8]);this.dispatch("sigmoid_mul",[r,a,d,l],Math.ceil(8/32));const w=await this.readback(d,8);return this.compare(w,s,1e-5)}async testHeadRMSNorm(){const s=new Float32Array([1,2,3,4,5,6,7,8]),r=new Float32Array(4);for(let n=0;n<4;n++)r[n]=.1*(n+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(8);for(let n=0;n<2;n++){let B=0;for(let p=0;p<4;p++)B+=s[n*4+p]**2;const f=1/Math.sqrt(B/4+1e-6);for(let p=0;p<4;p++){const y=a[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);d[n*4+p]=s[n*4+p]*f*(1+u)}}const l=this.makeF32Buffer("hrmsn_x",s),w=this.makeU32Buffer("hrmsn_w",a),h=this.makeUniformTyped([2,4],[1e-6]);this.dispatch("head_rmsnorm",[l,w,h],2);const F=await this.readback(l,8);return this.compare(F,d,.001)}async testHeadRMSNormNogated(){const s=new Float32Array([1,2,3,4,5,6,7,8]),r=new Float32Array(4);for(let n=0;n<4;n++)r[n]=.5+.1*n;const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(8);for(let n=0;n<2;n++){let B=0;for(let p=0;p<4;p++)B+=s[n*4+p]**2;const f=1/Math.sqrt(B/4+1e-6);for(let p=0;p<4;p++){const y=a[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);d[n*4+p]=s[n*4+p]*f*u}}const 
l=this.makeF32Buffer("hrmsng_x",s),w=this.makeU32Buffer("hrmsng_w",a),h=this.makeUniformTyped([2,4],[1e-6]);this.dispatch("head_rmsnorm_nogated",[l,w,h],2);const F=await this.readback(l,8);return this.compare(F,d,.001)}async testCausalConv1d(){const t=new Float32Array([1,2,3,4]),o=new Float32Array(12);for(let y=0;y<o.length;y++)o[y]=(y+1)*.1;const s=new Float32Array(16);for(let y=0;y<s.length;y++)s[y]=.25;const r=this.cpuF32ToBF16Packed(s),a=new Float32Array(4),d=new Float32Array(12);for(let y=0;y<4;y++){const k=o[y],u=o[4+y],m=o[8+y],g=t[y],i=[];for(let _=0;_<4;_++){const b=y*4+_,N=r[Math.floor(b/2)],x=b%2===0?N&65535:N>>16&65535;i.push(this.bf16ToF32(x))}const c=i[0]*k+i[1]*u+i[2]*m+i[3]*g;a[y]=c/(1+Math.exp(-c)),d[y]=u,d[4+y]=m,d[8+y]=g}const l=this.makeF32Buffer("conv_x",t),w=this.makeF32Buffer("conv_hist",o),h=this.makeU32Buffer("conv_w",r),F=this.makeUniform([4]);this.dispatch("causal_conv1d",[l,w,h,F],Math.ceil(4/256));const n=await this.readback(l,4),B=await this.readback(w,12),f=this.compare(n,a,1e-4),p=this.compare(B,d,1e-4);return{pass:f.pass&&p.pass,maxErr:Math.max(f.maxErr,p.maxErr),errors:[...f.errors.map(y=>({...y,note:"conv output"})),...p.errors.map(y=>({...y,note:"history"}))]}}async testKVCacheStore(){const r=new Float32Array(8),a=new Float32Array(8);for(let c=0;c<8;c++)r[c]=c+1,a[c]=(c+1)*10;const l=8*8,w=new Float32Array(l),h=new Float32Array(l),F=new Float32Array(l),n=new Float32Array(l);for(let c=0;c<8;c++)F[24+c]=r[c],n[24+c]=a[c];const B=this.makeF32Buffer("kvs_k",r),f=this.makeF32Buffer("kvs_v",a),p=this.makeF32Buffer("kvs_kc",w),y=this.makeF32Buffer("kvs_vc",h),k=this.makeUniform([3,2,4]);this.dispatch("kv_cache_store",[B,f,p,y,k],Math.ceil(8/256));const u=await this.readback(p,l),m=await this.readback(y,l),g=this.compare(u,F,1e-6),i=this.compare(m,n,1e-6);return{pass:g.pass&&i.pass,maxErr:Math.max(g.maxErr,i.maxErr),errors:[...g.errors,...i.errors]}}async testGQAAttention(){const a=new Float32Array([1,0,0,0]),d=new 
Float32Array(4);d.set([1,0,0,0]);const l=new Float32Array(4);l.set([.5,.6,.7,.8]);const w=new Float32Array([.5,.6,.7,.8]),h=this.makeF32Buffer("gqa_q",a),F=this.makeF32Buffer("gqa_kc",d),n=this.makeF32Buffer("gqa_vc",l),B=this.makeOutputBuffer("gqa_out",4),f=new ArrayBuffer(32),p=new DataView(f);p.setUint32(0,1,!0),p.setUint32(4,4,!0),p.setUint32(8,1,!0),p.setUint32(12,1,!0),p.setUint32(16,1/1,!0),p.setUint32(20,1,!0);const y=this.gpu.createBufferFromData("gqa_params",new Uint8Array(f),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("gqa_attention_head",[h,F,n,B,y],1);const k=await this.readback(B,4);return this.compare(k,w,1e-4)}async testDeltaNetRecurrent(){const s=[.5,.5],r=[1,0],a=[.8,.2],d=new Float32Array([...s,...r,...a]),l=new Float32Array(4),w=new Float32Array([0]),h=new Float32Array([0]),F=new Float32Array([0]),n=new Float32Array([0]),B=this.cpuF32ToBF16Packed(F),f=this.cpuF32ToBF16Packed(n),p=1/(1+Math.exp(-0)),y=Math.sqrt(s[0]**2+s[1]**2),k=Math.sqrt(r[0]**2+r[1]**2),u=s.map(U=>U/y),m=r.map(U=>U/k),g=[0,0,0,0],i=[0,0],c=a.map((U,S)=>(U-i[S])*p);for(let U=0;U<2;U++)for(let S=0;S<2;S++)g[U*2+S]+=m[U]*c[S];const _=1/Math.sqrt(2),b=new Float32Array(2);for(let U=0;U<2;U++){let S=0;for(let O=0;O<2;O++)S+=g[O*2+U]*u[O]*_;b[U]=S}const N=this.makeF32Buffer("dn_qkv",d),x=this.makeF32Buffer("dn_state",l),q=this.makeOutputBuffer("dn_out",2),K=this.makeF32Buffer("dn_a",w),v=this.makeF32Buffer("dn_b",h),T=this.makeU32Buffer("dn_alog",B),V=this.makeU32Buffer("dn_dtb",f),G=this.makeUniform([1,2,2]);this.dispatch("deltanet_recurrent",[N,x,q,K,v,T,V,G],1);const C=await this.readback(q,2);return this.compare(C,b,.001)}async testAddRMSNorm(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array([.1,-.2,.3,-.4,.5,-.6,.7,-.8]),r=new Float32Array(8);for(let _=0;_<8;_++)r[_]=.1*(_+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(o);for(let _=0;_<8;_++)d[_]+=s[_];let l=0;for(let _=0;_<8;_++)l+=d[_]*d[_];const w=1/Math.sqrt(l/8+1e-6),h=new 
Float32Array(8);for(let _=0;_<8;_++){const b=a[Math.floor(_/2)]>>_%2*16&65535,N=this.bf16ToF32(b);h[_]=d[_]*w*(1+N)}const F=this.makeF32Buffer("addnorm_h",o),n=this.makeF32Buffer("addnorm_a",s),B=this.makeU32Buffer("addnorm_w",a),f=this.makeOutputBuffer("addnorm_out",8);this.makeUniform([8]);const p=new ArrayBuffer(16),y=new DataView(p);y.setUint32(0,8,!0),y.setFloat32(4,1e-6,!0);const k=this.gpu.createBufferFromData("addnorm_params",new Uint8Array(p),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("add_rmsnorm",[F,n,B,f,k],1);const u=await this.readback(f,8),m=await this.readback(F,8),g=d,i=this.compare(u,h,.001),c=this.compare(m,g,1e-6);return{pass:i.pass&&c.pass,maxErr:Math.max(i.maxErr,c.maxErr),errors:[...i.errors,...c.errors]}}async testThreeWayAddRMSNorm(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array([.1,-.2,.3,-.4,.5,-.6,.7,-.8]),r=new Float32Array([.5,.5,-.5,-.5,1,1,-1,-1]),a=new Float32Array(8);for(let N=0;N<8;N++)a[N]=.1*(N+1);const d=this.cpuF32ToBF16Packed(a),l=new Float32Array(8);for(let N=0;N<8;N++)l[N]=o[N]+s[N]+r[N];let w=0;for(let N=0;N<8;N++)w+=l[N]*l[N];const h=1/Math.sqrt(w/8+1e-6),F=new Float32Array(8);for(let N=0;N<8;N++){const x=d[Math.floor(N/2)]>>N%2*16&65535,q=this.bf16ToF32(x);F[N]=l[N]*h*(1+q)}const n=this.makeF32Buffer("twa_a",o),B=this.makeF32Buffer("twa_b",s),f=this.makeF32Buffer("twa_c",r),p=this.makeU32Buffer("twa_w",d),y=this.makeOutputBuffer("twa_hout",8),k=this.makeOutputBuffer("twa_normed",8),u=new ArrayBuffer(16),m=new DataView(u);m.setUint32(0,8,!0),m.setFloat32(4,1e-6,!0);const g=this.gpu.createBufferFromData("twa_params",new Uint8Array(u),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("three_way_add_rmsnorm",[n,B,f,p,y,k,g],1);const i=await this.readback(y,8),c=await this.readback(k,8),_=this.compare(i,l,1e-6),b=this.compare(c,F,.001);return{pass:_.pass&&b.pass,maxErr:Math.max(_.maxErr,b.maxErr),errors:[..._.errors,...b.errors]}}async testGQARoPERotateHalf(){const 
h=1/Math.sqrt(256),F=[];for(let D=0;D<32;D++)F.push(1/Math.pow(100,2*D/64));function n(D,M){const H=new Float32Array(D.length);for(let P=0;P<D.length;P++)H[P]=D[P];for(let P=0;P<32;P++){const Q=M*F[P],E=Math.cos(Q),J=Math.sin(Q);H[P]=D[P]*E-D[P+32]*J,H[P+32]=D[P+32]*E+D[P]*J}return H}const B=new Float32Array(256);B[0]=1,B[1]=.5,B[2]=.3,B[32]=.7,B[33]=.2;const f=new Float32Array(256);f[0]=1,f[32]=.5;const p=new Float32Array(256);p[1]=1,p[33]=.3;const y=new Float32Array(256);y[2]=1;const k=new Float32Array(256);k[0]=1;const u=new Float32Array(256);u[0]=0,u[1]=1;const m=new Float32Array(256);m[0]=0,m[2]=1;const g=n(B,2),i=n(f,0),c=n(p,1),_=n(y,2);let b=0,N=0,x=0;for(let D=0;D<256;D++)b+=g[D]*i[D],N+=g[D]*c[D],x+=g[D]*_[D];b*=h,N*=h,x*=h;const q=Math.max(b,N,x),K=Math.exp(b-q),v=Math.exp(N-q),T=Math.exp(x-q),V=K+v+T,G=K/V,C=v/V,U=T/V,S=new Float32Array(256);for(let D=0;D<256;D++)S[D]=G*k[D]+C*u[D]+U*m[D];const O=1*256,Y=new Float32Array(3*O),z=new Float32Array(3*O);Y.set(i,0),Y.set(c,O),Y.set(_,2*O),z.set(k,0),z.set(u,O),z.set(m,2*O);const X=this.makeF32Buffer("rope2_q",g),tt=this.makeF32Buffer("rope2_kc",Y),A=this.makeF32Buffer("rope2_vc",z),R=this.makeOutputBuffer("rope2_out",1*256),L=new ArrayBuffer(32),W=new DataView(L);W.setUint32(0,3,!0),W.setUint32(4,256,!0),W.setUint32(8,1,!0),W.setUint32(12,1,!0),W.setUint32(16,1/1,!0),W.setUint32(20,1,!0);const Z=this.gpu.createBufferFromData("rope2_params",new Uint8Array(L),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("gqa_attention_head",[X,tt,A,R,Z],1);const j=await this.readback(R,1*256);return this.compare(j,S,.001)}async testFusedSigmoidGPTQ(){const r=new Float32Array(64),a=new Float32Array(64);for(let m=0;m<64;m++)r[m]=(m+1)*.1,a[m]=(m-8)*.3;const d=new Uint32Array(32),l=new Float32Array(256);for(let m=0;m<8;m++)for(let g=0;g<4;g++){let i=0;for(let c=0;c<8;c++){const _=m*8+c,b=(_+g)%15;l[_*4+g]=b-8,i|=b<<c*4}d[m*4+g]=i}const w=new Float32Array(4);for(let m=0;m<4;m++)w[m]=.5+m*.1;const 
h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let m=0;m<4;m++){let g=0;for(let i=0;i<64;i++){const c=r[i]/(1+Math.exp(-a[i])),b=Math.floor(i/64)*4+m,N=h[Math.floor(b/2)]>>b%2*16&65535,x=this.f16ToF32(N);g+=c*l[i*4+m]*x}F[m]=g}const n=this.makeF32Buffer("fsg_x",r),B=this.makeF32Buffer("fsg_g",a),f=this.makeU32Buffer("fsg_qw",d),p=this.makeU32Buffer("fsg_sc",h),y=this.makeOutputBuffer("fsg_out",4),k=this.makeUniform([64,4,64]);this.dispatch("fused_sigmoid_gptq",[n,B,f,p,y,k],Math.ceil(4/32));const u=await this.readback(y,4);return this.compare(u,F,.01)}async testFusedSiluGPTQ(){const r=new Float32Array(64),a=new Float32Array(64);for(let m=0;m<64;m++)r[m]=(m-8)*.2,a[m]=(m+1)*.15;const d=new Uint32Array(32),l=new Float32Array(256);for(let m=0;m<8;m++)for(let g=0;g<4;g++){let i=0;for(let c=0;c<8;c++){const _=m*8+c,b=(_*2+g)%15;l[_*4+g]=b-8,i|=b<<c*4}d[m*4+g]=i}const w=new Float32Array(4);for(let m=0;m<4;m++)w[m]=.4+m*.2;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let m=0;m<4;m++){let g=0;for(let i=0;i<64;i++){const _=r[i]/(1+Math.exp(-r[i]))*a[i],N=Math.floor(i/64)*4+m,x=h[Math.floor(N/2)]>>N%2*16&65535,q=this.f16ToF32(x);g+=_*l[i*4+m]*q}F[m]=g}const n=this.makeF32Buffer("fslg_a",r),B=this.makeF32Buffer("fslg_b",a),f=this.makeU32Buffer("fslg_qw",d),p=this.makeU32Buffer("fslg_sc",h),y=this.makeOutputBuffer("fslg_out",4),k=this.makeUniform([64,4,64]);this.dispatch("fused_silu_gptq",[n,B,f,p,y,k],Math.ceil(4/32));const u=await this.readback(y,4);return this.compare(u,F,.01)}async testFusedAddNormGateUpSiLU(){const a=new Float32Array(64),d=new Float32Array(64),l=new Float32Array(64),w=new Float32Array(64);for(let A=0;A<64;A++)a[A]=(A+1)*.1,d[A]=(A-5)*.05,l[A]=(A%7-3)*.03,w[A]=(A%3-1)*.1;const h=new Uint32Array(32),F=new Uint32Array(32),n=new Float32Array(256),B=new Float32Array(256);for(let A=0;A<64;A++)for(let R=0;R<4;R++)n[A*4+R]=Math.sin(A*.7+R*1.3)*.5,B[A*4+R]=Math.cos(A*.3+R*.9)*.5;const f=new Uint32Array(4/2),p=new Uint32Array(4/2);for(let 
A=0;A<4;A++){let R=0,L=0;for(let P=0;P<64;P++)R=Math.max(R,Math.abs(n[P*4+A])),L=Math.max(L,Math.abs(B[P*4+A]));const W=R/7,Z=L/7,j=this.f32ToF16(W),D=this.f32ToF16(Z),M=this.f16ToF32(j),H=this.f16ToF32(D);A%2===0?(f[A/2]=j,p[A/2]=D):(f[(A-1)/2]|=j<<16,p[(A-1)/2]|=D<<16);for(let P=0;P<64;P++){const Q=Math.round(n[P*4+A]/M)+8,E=Math.round(B[P*4+A]/H)+8,J=Math.max(0,Math.min(15,Q)),rt=Math.max(0,Math.min(15,E)),st=Math.floor(P/8),at=P%8*4;h[st*4+A]|=J<<at,F[st*4+A]|=rt<<at}}const y=new Uint32Array(64/2);for(let A=0;A<64;A+=2){const R=this.f32ToBF16(w[A]),L=this.f32ToBF16(w[A+1]);y[A/2]=R|L<<16}const k=new Float32Array(64);let u=0;for(let A=0;A<64;A++)k[A]=a[A]+d[A]+l[A],u+=k[A]*k[A];const m=1/Math.sqrt(u/64+1e-6),g=new Float32Array(64);for(let A=0;A<64;A++){const R=this.unpackBF16(y[Math.floor(A/2)],A%2);g[A]=k[A]*m*(1+R)}const i=new Float32Array(4);for(let A=0;A<4;A++){let R=0,L=0;const W=this.f16ToF32(A%2===0?f[Math.floor(A/2)]&65535:f[Math.floor(A/2)]>>>16),Z=this.f16ToF32(A%2===0?p[Math.floor(A/2)]&65535:p[Math.floor(A/2)]>>>16);for(let j=0;j<64;j++){const D=Math.floor(j/8),M=j%8*4,H=h[D*4+A]>>>M&15,P=F[D*4+A]>>>M&15;R+=g[j]*W*(H-8),L+=g[j]*Z*(P-8)}i[A]=R/(1+Math.exp(-R))*L}const c=k,_=16+640*16,b=new ArrayBuffer(_),N=new DataView(b);N.setUint32(0,64,!0),N.setUint32(4,4,!0),N.setUint32(8,64,!0),N.setFloat32(12,1e-6,!0);for(let A=0;A<64/2;A++){const R=Math.floor(A/4),L=A%4;N.setUint32(16+R*16+L*4,y[A],!0)}const x=this.gpu.createBufferFromData("fang_params",new Uint8Array(b),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),q=new Uint32Array(64);q.set(h,0),q.set(F,32);const K=8/(64/8),v=new Uint32Array(K*4/2*2);v.set(f,0),v.set(p,K*4/2);const 
T=this.makeF32Buffer("fang_hin",a),V=this.makeF32Buffer("fang_mlpres",d),G=this.makeF32Buffer("fang_attnres",l),C=this.makeU32Buffer("fang_mqw",q),U=this.makeU32Buffer("fang_msc",v),S=this.makeOutputBuffer("fang_hout",64),O=this.makeOutputBuffer("fang_out",4);this.dispatch("fused_addnorm_gate_up_silu",[T,V,G,C,U,S,O,x],Math.max(1,Math.ceil(4/32)));const Y=await this.readback(O,4),z=await this.readback(S,64),X=this.compare(Y,i,.02),tt=this.compare(z,c,1e-6);return{pass:X.pass&&tt.pass,maxErr:Math.max(X.maxErr,tt.maxErr),errors:[...X.errors||[],...tt.errors||[]]}}async testFusedSplitQKNormKVStore(){const l=new Float32Array(32);for(let M=0;M<l.length;M++)l[M]=Math.sin(M*.5+1)*.3;const w=new Float32Array(8),h=new Float32Array(8);for(let M=0;M<8;M++)w[M]=Math.cos(M*.7+2)*.4,h[M]=Math.sin(M*1.1+3)*.2;const F=new Float32Array(16),n=new Float32Array(8);for(let M=0;M<16;M++)F[M]=(M%3-1)*.05;for(let M=0;M<8;M++)n[M]=(M%4-2)*.03;const B=new Uint32Array(16/2);for(let M=0;M<16;M+=2)B[M/2]=this.f32ToBF16(F[M])|this.f32ToBF16(F[M+1])<<16;const f=new Uint32Array(8/2);for(let M=0;M<8;M+=2)f[M/2]=this.f32ToBF16(n[M])|this.f32ToBF16(n[M+1])<<16;const p=new Float32Array(16),y=new Float32Array(16);for(let M=0;M<2;M++){const H=new Float32Array(8);for(let E=0;E<8;E++)H[E]=l[M*8*2+E],y[M*8+E]=l[M*8*2+8+E];let P=0;for(let E=0;E<8;E++)P+=H[E]*H[E];const Q=1/Math.sqrt(P/8+1e-6);for(let E=0;E<8;E++){const J=this.unpackBF16(B[Math.floor((M*8+E)/2)],(M*8+E)%2);p[M*8+E]=H[E]*Q*(1+J)}}const k=new Float32Array(8),u=new Float32Array(32),m=new Float32Array(32);for(let M=0;M<1;M++){let H=0;for(let Q=0;Q<8;Q++)H+=w[M*8+Q]*w[M*8+Q];const P=1/Math.sqrt(H/8+1e-6);for(let Q=0;Q<8;Q++){const E=this.unpackBF16(f[Math.floor((M*8+Q)/2)],(M*8+Q)%2),J=w[M*8+Q]*P*(1+E);k[M*8+Q]=J,u[24+M*8+Q]=J,m[24+M*8+Q]=h[M*8+Q]}}const g=32+320*16,i=new ArrayBuffer(g),c=new DataView(i);c.setUint32(0,2,!0),c.setUint32(4,1,!0),c.setUint32(8,8,!0),c.setFloat32(12,1e-6,!0),c.setUint32(16,3,!0);for(let M=0;M<16/2;M++){const 
H=Math.floor(M/4),P=M%4;c.setUint32(32+H*16+P*4,B[M],!0)}/* the second packed weight vector follows the first one, starting at u32 index 8 of the region that begins at byte 32 */const _=16/2;for(let M=0;M<8/2;M++){const H=_+M,P=Math.floor(H/4),Q=H%4;c.setUint32(32+P*16+Q*4,f[M],!0)}const b=this.gpu.createBufferFromData("fsqk_params",new Uint8Array(i),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),N=this.makeF32Buffer("fsqk_qpf",l),x=this.makeF32Buffer("fsqk_kp",w),q=this.makeF32Buffer("fsqk_vp",h),K=this.makeOutputBuffer("fsqk_qp",16),v=this.makeOutputBuffer("fsqk_qg",16),T=this.makeOutputBuffer("fsqk_kc",32),V=this.makeOutputBuffer("fsqk_vc",32);this.dispatch("fused_split_qknorm_kvstore",[N,x,q,K,v,T,V,b],3);const G=await this.readback(K,16),C=await this.readback(v,16),/* x is the "fsqk_kp" input buffer: the test expects the kernel to have overwritten it with the normalised values */U=await this.readback(x,8),S=await this.readback(T,32),O=await this.readback(V,32),Y=this.compare(G,p,1e-5),z=this.compare(C,y,1e-6),X=this.compare(U,k,1e-5),tt=S.slice(24,32),A=O.slice(24,32),R=u.slice(24,32),L=m.slice(24,32),W=this.compare(tt,R,1e-5),Z=this.compare(A,L,1e-6),j=Math.max(Y.maxErr,z.maxErr,X.maxErr,W.maxErr,Z.maxErr);return{pass:Y.pass&&z.pass&&X.pass&&W.pass&&Z.pass,maxErr:j,errors:[...Y.errors||[],...z.errors||[],...X.errors||[],...W.errors||[],...Z.errors||[]]}}/**
 * Times the GPTQ mat-vec pipelines on three fixed (K, N, group size) shapes
 * using random inputs, random packed weights and random scales.
 *
 * For every shape the split counts 1, 2, 4 and 8 are measured: a count of 1
 * dispatches "gptq_matvec"; larger counts dispatch "gptq_splitk" followed by
 * "reduce_splitk" and are skipped when the number of scale groups (K / gs)
 * is not divisible by the count. When the GPU wrapper reports f16 support
 * and a "gptq_matvec_f16" pipeline exists, that pipeline is measured too and
 * reported with ns set to the string "f16".
 *
 * Each measurement runs 3 untimed warm-up batches followed by 10 timed
 * batches of 50 iterations. The fastest and slowest batch are discarded and
 * the mean of the remaining ones is divided by 50 to get milliseconds per
 * iteration. The bandwidth figure counts only the bytes of the packed
 * weights (4 per u32) and of the scales (2 per value).
 *
 * NOTE(review): no buffer created here is released in this method;
 * presumably the make*Buffer helpers track them for a later destroy() -
 * confirm.
 *
 * @param {Function} [e] Optional callback invoked with each result row as
 *   soon as it is produced.
 * @returns {Promise<Array<{label: string, ns: (number|string), wgs: number,
 *   avgMs: string, bwGBs: string}>>} All result rows in measurement order.
 */async benchmarkGPTQ(e){const t=[{K:2048,N:2048,gs:128,label:"2048→2048 (Z/out_proj)"},{K:2048,N:6144,gs:128,label:"2048→6144 (QKV/gate+up)"},{K:6144,N:2048,gs:128,label:"6144→2048 (down_proj)"}],/* o: split counts, s: iterations per batch, r: warm-up batches, a: timed batches, d: collected rows */o=[1,2,4,8],s=50,r=3,a=10,d=[];for(const l of t){const{K:w,N:h,gs:F,label:n}=l,/* B: packed rows (8 values per u32), f: scale groups along K */B=w/8,f=B/(F/8),p=new Float32Array(w);for(let b=0;b<w;b++)p[b]=Math.random()*2-1;const y=new Uint32Array(B*h);for(let b=0;b<y.length;b++)y[b]=Math.random()*4294967295>>>0;const k=new Float32Array(f*h);for(let b=0;b<k.length;b++)k[b]=Math.random()*.5;const u=this.cpuF32ToF16Packed(k),m=this.makeF32Buffer("bm_in",p),g=this.makeU32Buffer("bm_qw",y),i=this.makeU32Buffer("bm_sc",u),c=[];/* one output buffer per iteration of a batch */for(let b=0;b<s;b++)c.push(this.makeOutputBuffer(`bm_out_${b}`,h));/* bytes used for the bandwidth figure */const _=B*h*4+f*h*2;for(const b of o){if(b>1&&f%b!==0)continue;let N;if(b===1){const v=this.makeUniform([w,h,F]);for(let 
G=0;G<r;G++){this.gpu.beginBatch();for(let C=0;C<s;C++)this.dispatch("gptq_matvec",[m,g,i,c[C],v],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}/* timed batches: wait for previously submitted work, then time one batch from submission to completion */const T=[];for(let G=0;G<a;G++){await this.gpu.device.queue.onSubmittedWorkDone();const C=performance.now();this.gpu.beginBatch();for(let U=0;U<s;U++)this.dispatch("gptq_matvec",[m,g,i,c[U],v],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),T.push(performance.now()-C)}T.sort((G,C)=>G-C),/* drop the fastest and slowest batch, average the rest, divide by iterations per batch */N=T.slice(1,-1).reduce((G,C)=>G+C,0)/(T.length-2)/s}else{/* split path: each iteration writes N*b partial values, then reduces them into the regular output buffer */const v=[];for(let U=0;U<s;U++)v.push(this.makeOutputBuffer(`bm_part_${U}`,h*b));const T=this.makeUniform([w,h,F,b]),V=this.makeUniform([h,b]);for(let U=0;U<r;U++){this.gpu.beginBatch();for(let S=0;S<s;S++)this.dispatch("gptq_splitk",[m,g,i,v[S],T],Math.ceil(h/32),b),this.dispatch("reduce_splitk",[v[S],c[S],V],Math.ceil(h/256));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const G=[];for(let U=0;U<a;U++){await this.gpu.device.queue.onSubmittedWorkDone();const S=performance.now();this.gpu.beginBatch();for(let O=0;O<s;O++)this.dispatch("gptq_splitk",[m,g,i,v[O],T],Math.ceil(h/32),b),this.dispatch("reduce_splitk",[v[O],c[O],V],Math.ceil(h/256));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),G.push(performance.now()-S)}G.sort((U,S)=>U-S),N=G.slice(1,-1).reduce((U,S)=>U+S,0)/(G.length-2)/s}/* x: bytes / 1e9 divided by seconds, i.e. GB/s; q: workgroup count of the main kernel */const x=_/1e9/(N/1e3),q=Math.ceil(h/32)*b,K={label:n,ns:b,wgs:q,avgMs:N.toFixed(4),bwGBs:x.toFixed(1)};d.push(K),e?.(K)}/* optional f16 pipeline, measured with the same warm-up and timing scheme */if(this.gpu.hasF16&&this.pipelines.gptq_matvec_f16){const b=this.makeUniform([w,h,F]);for(let T=0;T<r;T++){this.gpu.beginBatch();for(let V=0;V<s;V++)this.dispatch("gptq_matvec_f16",[m,g,i,c[V],b],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const N=[];for(let T=0;T<a;T++){await this.gpu.device.queue.onSubmittedWorkDone();const V=performance.now();this.gpu.beginBatch();for(let 
G=0;G<s;G++)this.dispatch("gptq_matvec_f16",[m,g,i,c[G],b],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),N.push(performance.now()-V)}N.sort((T,V)=>T-V);/* same trimmed mean as the other variants: drop min and max, average, divide by iterations per batch */const q=N.slice(1,-1).reduce((T,V)=>T+V,0)/(N.length-2)/s,K=_/1e9/(q/1e3),v={label:n,ns:"f16",wgs:Math.ceil(h/32),avgMs:q.toFixed(4),bwGBs:K.toFixed(1)};d.push(v),e?.(v)}}return d}/**
 * Builds the 16-byte uniform block used by the RMS-norm tests: a
 * little-endian u32 at byte 0 and a little-endian f32 at byte 4. The
 * remaining 8 bytes are left at their initial zero value.
 *
 * @param {number} e Value stored as the u32 (the tests in this class pass
 *   the row length, 16).
 * @param {number} [t=1e-6] Value stored as the f32 (the tests pass the
 *   epsilon).
 * @returns The buffer returned by gpu.createBufferFromData, created with
 *   UNIFORM | COPY_DST usage and the label "norm_params".
 */_makeNormParams(e,t=1e-6){const o=new ArrayBuffer(16),s=new DataView(o);return s.setUint32(0,e,!0),s.setFloat32(4,t,!0),this.gpu.createBufferFromData("norm_params",new Uint8Array(o),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}/**
 * CPU reference RMS norm with a (1 + weight) gain:
 *   out[i] = e[i] / sqrt(mean(e^2) + s) * (1 + w[i])
 * where w[i] is decoded with bf16ToF32 from the packed weight words: even i
 * is taken from the low 16 bits of t[floor(i / 2)], odd i from the high 16
 * bits.
 *
 * @param e Input vector; only the first o elements are read.
 * @param t Packed weights, two 16-bit values per 32-bit word.
 * @param {number} o Number of elements to normalise.
 * @param {number} [s=1e-6] Epsilon added to the mean square.
 * @returns {Float32Array} Normalised vector of length o.
 */_cpuRMSNorm1PlusW(e,t,o,s=1e-6){let r=0;for(let l=0;l<o;l++)r+=e[l]*e[l];const a=1/Math.sqrt(r/o+s),d=new Float32Array(o);for(let l=0;l<o;l++){const w=t[Math.floor(l/2)]>>l%2*16&65535,h=this.bf16ToF32(w);d[l]=e[l]*a*(1+h)}return d}/**
 * GPU-vs-CPU check for the "add_rmsnorm_b2" pipeline on 2 rows of 16 values.
 *
 * The expected output is the (1 + w) RMS norm of the element-wise sum of the
 * two inputs (tolerance 1e-3). The test also reads the first input buffer
 * ("b2an_h") back and expects it to hold that sum (tolerance 1e-6).
 *
 * NOTE(review): the returned errors list spreads compare().errors without
 * the "|| []" fallback that testFusedSplitQKNormKVStore uses - presumably
 * compare() always returns an array; confirm.
 *
 * @returns {Promise<{pass: boolean, maxErr: number, errors: Array}>}
 */async testAddRMSNormB2(){const o=new Float32Array(32),s=new Float32Array(32);for(let u=0;u<32;u++)o[u]=Math.sin(u*.7)*2,s[u]=Math.cos(u*.3)*.5;const r=new Float32Array(16);for(let u=0;u<16;u++)r[u]=.1*(u+1);/* d: expected normalised output, l: expected element-wise sum */const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(32),l=new Float32Array(32);for(let u=0;u<2;u++){const m=new Float32Array(16);for(let i=0;i<16;i++)m[i]=o[u*16+i]+s[u*16+i];for(let i=0;i<16;i++)l[u*16+i]=m[i];const g=this._cpuRMSNorm1PlusW(m,a,16,1e-6);d.set(g,u*16)}const w=this.makeF32Buffer("b2an_h",o),h=this.makeF32Buffer("b2an_a",s),F=this.makeU32Buffer("b2an_w",a),n=this.makeOutputBuffer("b2an_out",32),B=this._makeNormParams(16,1e-6);this.dispatch("add_rmsnorm_b2",[w,h,F,n,B],2);const f=await this.readback(n,32),p=await this.readback(w,32),y=this.compare(f,d,.001),k=this.compare(p,l,1e-6);return{pass:y.pass&&k.pass,maxErr:Math.max(y.maxErr,k.maxErr),errors:[...y.errors,...k.errors]}}/**
 * GPU-vs-CPU check for the "add_rmsnorm_ro_b2" pipeline on 2 rows of 16
 * values. Only the normalised output is compared (tolerance 1e-3); unlike
 * testAddRMSNormB2, no input buffer is read back.
 *
 * @returns The result of this.compare for the output comparison.
 */async testAddRMSNormROB2(){const o=new Float32Array(32),s=new Float32Array(32);for(let f=0;f<32;f++)o[f]=Math.sin(f*.5)*3,s[f]=Math.cos(f*.8)*.3;const r=new Float32Array(16);for(let f=0;f<16;f++)r[f]=.05*(f+1);const a=this.cpuF32ToBF16Packed(r),d=new 
Float32Array(32);for(let f=0;f<2;f++){const p=new Float32Array(16);for(let k=0;k<16;k++)p[k]=o[f*16+k]+s[f*16+k];const y=this._cpuRMSNorm1PlusW(p,a,16,1e-6);d.set(y,f*16)}const l=this.makeF32Buffer("b2anro_h",o),w=this.makeF32Buffer("b2anro_a",s),h=this.makeU32Buffer("b2anro_w",a),F=this.makeOutputBuffer("b2anro_out",32),n=this._makeNormParams(16,1e-6);this.dispatch("add_rmsnorm_ro_b2",[l,w,h,F,n],2);const B=await this.readback(F,32);return this.compare(B,d,.001)}/**
 * GPU-vs-CPU check for the "three_way_add_rmsnorm_b2" pipeline on 2 rows of
 * 16 values.
 *
 * The element-wise sum of the three inputs is expected in the "b2twa_hout"
 * output (tolerance 1e-6) and its (1 + w) RMS norm in the "b2twa_normed"
 * output (tolerance 1e-3).
 *
 * NOTE(review): the returned errors list spreads compare().errors without an
 * "|| []" fallback - presumably compare() always returns an array; confirm.
 *
 * @returns {Promise<{pass: boolean, maxErr: number, errors: Array}>}
 */async testThreeWayAddRMSNormB2(){const o=new Float32Array(32),s=new Float32Array(32),r=new Float32Array(32);for(let i=0;i<32;i++)o[i]=Math.sin(i*.4)*2,s[i]=Math.cos(i*.6)*.5,r[i]=Math.sin(i*1.1)*.3;const a=new Float32Array(16);for(let i=0;i<16;i++)a[i]=.1*(i+1);/* l: expected three-way sum, w: expected normalised output */const d=this.cpuF32ToBF16Packed(a),l=new Float32Array(32),w=new Float32Array(32);for(let i=0;i<2;i++){const c=new Float32Array(16);for(let b=0;b<16;b++)c[b]=o[i*16+b]+s[i*16+b]+r[i*16+b];l.set(c,i*16);const _=this._cpuRMSNorm1PlusW(c,d,16,1e-6);w.set(_,i*16)}const h=this.makeF32Buffer("b2twa_a",o),F=this.makeF32Buffer("b2twa_b",s),n=this.makeF32Buffer("b2twa_c",r),B=this.makeU32Buffer("b2twa_w",d),f=this.makeOutputBuffer("b2twa_hout",32),p=this.makeOutputBuffer("b2twa_normed",32),y=this._makeNormParams(16,1e-6);this.dispatch("three_way_add_rmsnorm_b2",[h,F,n,B,f,p,y],2);const k=await this.readback(p,32),u=await this.readback(f,32),m=this.compare(k,w,.001),g=this.compare(u,l,1e-6);return{pass:m.pass&&g.pass,maxErr:Math.max(m.maxErr,g.maxErr),errors:[...m.errors,...g.errors]}}/**
 * Generates deterministic 4-bit quantised test weights plus per-group
 * scales.
 *
 * Each u32 of qweight packs 8 consecutive input rows of one output column,
 * lowest nibble first. Nibble values follow (row * 3 + col * 7) % 15, and
 * rawWeights stores the same value minus 8 at index row * t + col. Scales
 * are .3 + index * .15, one per (group, column), packed with
 * cpuF32ToF16Packed.
 *
 * @param {number} e Number of input rows (presumably a multiple of 8 and of
 *   o - the sizes e / 8 and e / o are used as array lengths).
 * @param {number} t Number of output columns.
 * @param {number} o Number of rows that share one scale per column.
 * @returns {{qweight: Uint32Array, rawWeights: Float32Array,
 *   scalesPacked: *}} scalesPacked is whatever cpuF32ToF16Packed returns.
 */_makeGPTQData(e,t,o){const s=e/8,r=new Uint32Array(s*t),a=new Float32Array(e*t);for(let h=0;h<s;h++)for(let F=0;F<t;F++){let n=0;for(let B=0;B<8;B++){const f=h*8+B,p=(f*3+F*7)%15;a[f*t+F]=p-8,n|=p<<B*4}r[h*t+F]=n}const d=e/o,l=new Float32Array(d*t);for(let h=0;h<d*t;h++)l[h]=.3+h*.15;const w=this.cpuF32ToF16Packed(l);return{qweight:r,rawWeights:a,scalesPacked:w}}/**
 * CPU reference for a group-scaled matrix-vector product:
 *   out[n] = sum over k in [0, s) of e[d + k] * t[k * r + n] * scale(k, n)
 * where scale(k, n) is the 16-bit value at index floor(k / a) * r + n of the
 * packed words in o (two values per word, low half first), decoded with
 * f16ToF32.
 *
 * @param e Input activations.
 * @param t Weight values, indexed as t[k * r + n].
 * @param o Packed 16-bit scales, two per 32-bit word.
 * @param {number} s Reduction length.
 * @param {number} r Output length.
 * @param {number} a Number of consecutive k values that share one scale.
 * @param {number} [d=0] Index of the first activation inside e.
 * @returns {Float32Array} Vector of length r.
 */_cpuGPTQMatvec(e,t,o,s,r,a,d=0){const l=new Float32Array(r);for(let w=0;w<r;w++){let h=0;for(let F=0;F<s;F++){const 
B=Math.floor(F/a)*r+w,f=o[Math.floor(B/2)]>>B%2*16&65535,p=this.f16ToF32(f);h+=e[d+F]*t[F*r+w]*p}l[w]=h}return l}/**
 * GPU-vs-CPU check for the "fused_silu_gptq_b2_f16" pipeline (128 inputs,
 * 8 outputs, scale groups of 64, 2 rows).
 *
 * For each row the CPU reference first forms act[k] = silu(a[k]) * b[k] with
 * silu(x) = x / (1 + exp(-x)), then applies _cpuGPTQMatvec. Outputs are
 * compared with tolerance .05. When the GPU wrapper reports no f16 support
 * the test is skipped and reported as passing.
 *
 * @returns {Promise<Object>} The result of this.compare, or the skip record
 *   {pass: true, maxErr: 0, note: "skipped (no f16)"}.
 */async testFusedSiluGPTQB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,{qweight:s,rawWeights:r,scalesPacked:a}=this._makeGPTQData(e,t,o),d=new Float32Array(2*e),l=new Float32Array(2*e);for(let k=0;k<2*e;k++)d[k]=Math.sin(k*.3)*2,l[k]=Math.cos(k*.5)*1.5;/* w: expected outputs, one block of t values per row */const w=new Float32Array(2*t);for(let k=0;k<2;k++){const u=new Float32Array(e);for(let g=0;g<e;g++){const i=d[k*e+g];u[g]=i/(1+Math.exp(-i))*l[k*e+g]}const m=this._cpuGPTQMatvec(u,r,a,e,t,o);w.set(m,k*t)}const h=this.makeF32Buffer("b2sl_a",d),F=this.makeF32Buffer("b2sl_b",l),n=this.makeU32Buffer("b2sl_qw",s),B=this.makeU32Buffer("b2sl_sc",a),f=this.makeOutputBuffer("b2sl_out",2*t),p=this.makeUniform([e,t,o]);this.dispatch("fused_silu_gptq_b2_f16",[h,F,n,B,f,p],Math.ceil(t/32));const y=await this.readback(f,2*t);return this.compare(y,w,.05)}/**
 * GPU-vs-CPU check for the "fused_sigmoid_gptq_b2_f16" pipeline (128
 * inputs, 8 outputs, scale groups of 64, 2 rows).
 *
 * For each row the CPU reference first forms act[k] = sigmoid(g[k]) * x[k]
 * with sigmoid(v) = 1 / (1 + exp(-v)), then applies _cpuGPTQMatvec. Outputs
 * are compared with tolerance .05. When the GPU wrapper reports no f16
 * support the test is skipped and reported as passing.
 *
 * @returns {Promise<Object>} The result of this.compare, or the skip record
 *   {pass: true, maxErr: 0, note: "skipped (no f16)"}.
 */async testFusedSigmoidGPTQB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,{qweight:s,rawWeights:r,scalesPacked:a}=this._makeGPTQData(e,t,o),d=new Float32Array(2*e),l=new Float32Array(2*e);for(let k=0;k<2*e;k++)d[k]=Math.sin(k*.4)*1.5,l[k]=Math.cos(k*.7)*2;const w=new Float32Array(2*t);for(let k=0;k<2;k++){const u=new Float32Array(e);for(let g=0;g<e;g++){const i=1/(1+Math.exp(-l[k*e+g]));u[g]=i*d[k*e+g]}const m=this._cpuGPTQMatvec(u,r,a,e,t,o);w.set(m,k*t)}const h=this.makeF32Buffer("b2sg_x",d),F=this.makeF32Buffer("b2sg_g",l),n=this.makeU32Buffer("b2sg_qw",s),B=this.makeU32Buffer("b2sg_sc",a),f=this.makeOutputBuffer("b2sg_out",2*t),p=this.makeUniform([e,t,o]);this.dispatch("fused_sigmoid_gptq_b2_f16",[h,F,n,B,f,p],Math.ceil(t/32));const y=await this.readback(f,2*t);return this.compare(y,w,.05)}/**
 * GPU-vs-CPU check for the "fused_gate_up_silu_b2_f16" pipeline (128
 * inputs, 8 outputs, scale groups of 64, 2 rows).
 *
 * Two independent quantised weight sets are generated (nibble patterns
 * (row * 3 + col * 7) % 15 and (row * 5 + col * 3) % 15). For each row the
 * expected output is silu(first product) * (second product), where both
 * products are computed by _cpuGPTQMatvec on the same input row. Outputs are
 * compared with tolerance .05. When the GPU wrapper reports no f16 support
 * the test is skipped and reported as passing.
 *
 * @returns {Promise<Object>} The result of this.compare, or the skip record
 *   {pass: true, maxErr: 0, note: "skipped (no f16)"}.
 */async testFusedGateUpSiLUB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,s=e/8,r=new Uint32Array(s*t),a=new Float32Array(e*t);for(let 
N=0;N<s;N++)for(let x=0;x<t;x++){let q=0;for(let K=0;K<8;K++){const v=N*8+K,T=(v*3+x*7)%15;a[v*t+x]=T-8,q|=T<<K*4}r[N*t+x]=q}const d=e/o,l=new Float32Array(d*t);for(let N=0;N<d*t;N++)l[N]=.01+N%8*.005;/* second weight set: same packing, different nibble pattern */const w=this.cpuF32ToF16Packed(l),h=new Uint32Array(s*t),F=new Float32Array(e*t);for(let N=0;N<s;N++)for(let x=0;x<t;x++){let q=0;for(let K=0;K<8;K++){const v=N*8+K,T=(v*5+x*3)%15;F[v*t+x]=T-8,q|=T<<K*4}h[N*t+x]=q}const n=e/o,B=new Float32Array(n*t);for(let N=0;N<n*t;N++)B[N]=.01+N%8*.005;const f=this.cpuF32ToF16Packed(B),p=new Float32Array(2*e);for(let N=0;N<2*e;N++)p[N]=Math.sin(N*.2)*.5;/* expected output: silu of the first product times the second product, per row (row offset N*e into the shared input) */const y=new Float32Array(2*t);for(let N=0;N<2;N++){const x=this._cpuGPTQMatvec(p,a,w,e,t,o,N*e),q=this._cpuGPTQMatvec(p,F,f,e,t,o,N*e);for(let K=0;K<t;K++){const v=x[K];y[N*t+K]=v/(1+Math.exp(-v))*q[K]}}const k=this.makeF32Buffer("b2gus_in",p),u=this.makeU32Buffer("b2gus_gqw",r),m=this.makeU32Buffer("b2gus_gsc",w),g=this.makeU32Buffer("b2gus_uqw",h),i=this.makeU32Buffer("b2gus_usc",f),c=this.makeOutputBuffer("b2gus_out",2*t),_=this.makeUniform([e,t,o]);this.dispatch("fused_gate_up_silu_b2_f16",[k,u,m,g,i,c,_],Math.ceil(t/32));const b=await this.readback(c,2*t);return this.compare(b,y,.05)}/**
 * Builds the inputs and the CPU-computed expected output shared by the
 * two-row GPTQ matmul tests.
 *
 * The input holds two rows of e values (a sin-based row followed by a
 * cos-based row). Weights use the nibble pattern (row * 3 + col * 7) % 15,
 * packed 8 per u32 with the lowest nibble first; the reference weight is the
 * nibble value minus 8. Scales are .01 + (index % 8) * .005, packed with
 * cpuF32ToF16Packed. expected[b * t + n] is the scaled dot product of input
 * row b with weight column n.
 *
 * @param {number} e Reduction length (returned as K).
 * @param {number} t Output width (returned as N).
 * @param {number} o Number of rows that share one scale per column
 *   (returned as gs).
 * @returns {{input: Float32Array, qweight: Uint32Array, scalesPacked: *,
 *   expected: Float32Array, K: number, N: number, gs: number}}
 */_makeGPTQMatmulB2Data(e,t,o){const s=e/8,r=new Float32Array(2*e);for(let n=0;n<e;n++)r[n]=Math.sin(n*.5)*.5;for(let n=0;n<e;n++)r[e+n]=Math.cos(n*.3)*.5;const a=new Uint32Array(s*t),d=new Float32Array(e*t);for(let n=0;n<s;n++)for(let B=0;B<t;B++){let f=0;for(let p=0;p<8;p++){const y=n*8+p,k=(y*3+B*7)%15;d[y*t+B]=k-8,f|=k<<p*4}a[n*t+B]=f}const l=e/o,w=new Float32Array(l*t);for(let n=0;n<l*t;n++)w[n]=.01+n%8*.005;/* F: expected outputs, computed from the packed (f16-rounded) scales rather than the original floats */const h=this.cpuF32ToF16Packed(w),F=new Float32Array(2*t);for(let n=0;n<2;n++)for(let B=0;B<t;B++){let f=0;for(let p=0;p<e;p++){const k=Math.floor(p/o)*t+B,u=h[Math.floor(k/2)]>>k%2*16&65535,m=this.f16ToF32(u);f+=r[n*e+p]*d[p*t+B]*m}F[n*t+B]=f}return{input:r,qweight:a,scalesPacked:h,expected:F,K:e,N:t,gs:o}}/**
 * GPU-vs-CPU check for the "gptq_matmul_b2" pipeline using the data from
 * _makeGPTQMatmulB2Data(128, 8, 64); the 2 * N outputs are compared with
 * tolerance .02.
 *
 * @returns The result of this.compare.
 */async 
testGPTQMatmulB2(){const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(128,8,64),l=this.makeF32Buffer("b2_input",e),w=this.makeU32Buffer("b2_qw",t),h=this.makeU32Buffer("b2_sc",o),F=this.makeOutputBuffer("b2_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2",[l,w,h,F,n],Math.ceil(a/32));const B=await this.readback(F,2*a);return this.compare(B,s,.02)}/**
 * GPU-vs-CPU check for the "gptq_matmul_b2_f16" pipeline using the data
 * from _makeGPTQMatmulB2Data(128, 8, 64); the 2 * N outputs are compared
 * with tolerance .05. Skipped (reported as passing) when the GPU wrapper
 * reports no f16 support.
 *
 * @returns {Promise<Object>} The result of this.compare, or the skip record
 *   {pass: true, maxErr: 0, note: "skipped (no f16)"}.
 */async testGPTQMatmulB2F16(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(128,8,64),l=this.makeF32Buffer("b2f16_input",e),w=this.makeU32Buffer("b2f16_qw",t),h=this.makeU32Buffer("b2f16_sc",o),F=this.makeOutputBuffer("b2f16_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2_f16",[l,w,h,F,n],Math.ceil(a/32));const B=await this.readback(F,2*a);return this.compare(B,s,.05)}/**
 * GPU-vs-CPU check for the "gptq_matmul_b2_4t_f16" pipeline using the data
 * from _makeGPTQMatmulB2Data(256, 16, 64). This variant is dispatched with
 * ceil(N / 8) workgroups instead of ceil(N / 32); the 2 * N outputs are
 * compared with tolerance .05. Skipped (reported as passing) when the GPU
 * wrapper reports no f16 support.
 *
 * @returns {Promise<Object>} The result of this.compare, or the skip record
 *   {pass: true, maxErr: 0, note: "skipped (no f16)"}.
 */async testGPTQMatmulB2_4T_F16(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(256,16,64),l=this.makeF32Buffer("b2_4t_input",e),w=this.makeU32Buffer("b2_4t_qw",t),h=this.makeU32Buffer("b2_4t_sc",o),F=this.makeOutputBuffer("b2_4t_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2_4t_f16",[l,w,h,F,n],Math.ceil(a/8));const B=await this.readback(F,2*a);return this.compare(B,s,.05)}/**
 * Converts a number to a 16-bit bfloat16 bit pattern by taking the upper 16
 * bits of its float32 representation. The lower 16 bits are discarded, so
 * the conversion truncates rather than rounds.
 *
 * @param {number} e Value to convert.
 * @returns {number} Integer in the range 0..65535.
 */f32ToBF16(e){const t=new ArrayBuffer(4);return new Uint32Array(t)[0]=new Uint32Array(new Float32Array([e]).buffer)[0],new Uint32Array(t)[0]>>>16}/**
 * Extracts one 16-bit half of a packed 32-bit word and decodes it with
 * bf16ToF32.
 *
 * @param {number} e Packed word holding two 16-bit values.
 * @param {number} t Half selector: 0 reads bits 0..15, 1 reads bits 16..31.
 * @returns The value returned by this.bf16ToF32 for the selected half.
 */unpackBF16(e,t){const o=e>>>t*16&65535;return this.bf16ToF32(o)}}/**
 * Appends a message, followed by a line break, to the text of the element
 * with id "log" and scrolls that element to its bottom.
 *
 * @param {string} I Message to append.
 */function $(I){const e=document.getElementById("log");e.textContent+=I+`
 `,e.scrollTop=e.scrollHeight}/**
 * Tab switching: clicking an element with class "tab" removes the "active"
 * class from every ".tab" and ".panel" element, then adds it to the clicked
 * tab and to the element whose id equals the tab's data-panel attribute.
 */document.querySelectorAll(".tab").forEach(I=>{I.addEventListener("click",()=>{document.querySelectorAll(".tab").forEach(e=>e.classList.remove("active")),document.querySelectorAll(".panel").forEach(e=>e.classList.remove("active")),I.classList.add("active"),document.getElementById(I.dataset.panel).classList.add("active")})});/**
 * Click handler of the "runTests" button: disables the button, clears the
 * "testResults" and "testSummary" elements, logs a start message and then
 * creates and initialises the test class before running the tests. The
 * remainder of the handler continues on the following lines.
 */document.getElementById("runTests").addEventListener("click",async()=>{const I=document.getElementById("runTests"),e=document.getElementById("testResults"),t=document.getElementById("testSummary");I.disabled=!0,e.innerHTML="",t.innerHTML="",$("Initializing WebGPU for tests...");try{const o=new ot;await o.init(),$(`WebGPU initialized. Running tests...
 `);let s=0,r=0;await o.runAll(a=>{const d=a.pass?"✓":"✗",l=a.pass?"pass":"fail";a.pass?s++:r++;const w=document.createElement("div");if(w.className="test-row",w.innerHTML=`
             <span class="test-icon ${l}">${d}</span>
@@ -14,7 +14,7 @@ import{G as nt,S as ct,_ as et}from"./gpu-ops-Bq_PFJSE.js";class ot{constructor(
           </div>
         `,$(`
 Done: ${s} passed, ${r} failed`),o.destroy()}catch(o){$(`Error: ${o.message}
-${o.stack}`)}finally{I.disabled=!1}});document.getElementById("loadAndProfile").addEventListener("click",async()=>{const I=document.getElementById("loadAndProfile"),e=document.getElementById("profStatus"),t=document.getElementById("profCategories"),o=document.getElementById("profTopOps");I.disabled=!0,t.innerHTML="",o.innerHTML="";try{const s=document.getElementById("profRepo").value.trim();e.textContent="Loading model...",$("Profiler: loading model from "+s);const{GPUContext:r}=await et(async()=>{const{GPUContext:c}=await import("./gpu-ops-Bq_PFJSE.js").then(_=>_.g);return{GPUContext:c}},[],import.meta.url),{Qwen35Model:a}=await et(async()=>{const{Qwen35Model:c}=await import("./qwen35-model-D0qiY8Dx.js");return{Qwen35Model:c}},__vite__mapDeps([0,1]),import.meta.url),{loadModelWeights:d,loadConfig:l,loadQuantConfig:w}=await et(async()=>{const{loadModelWeights:c,loadConfig:_,loadQuantConfig:b}=await import("./safetensors-loader-CwGm5mJX.js");return{loadModelWeights:c,loadConfig:_,loadQuantConfig:b}},[],import.meta.url),{Profiler:h}=await et(async()=>{const{Profiler:c}=await import("./profiler-DYUyiq-B.js");return{Profiler:c}},[],import.meta.url),F=new r;await F.init(),e.textContent="Fetching config...";const n=await l(s),B=await w(s);e.textContent="Downloading weights (this may take a while)...";const f=await d(s,c=>{if(c.phase==="downloading"){const _=c.total?(c.loaded/c.total*100).toFixed(0):"?";e.textContent=`Downloading: ${c.filesLoaded}/${c.filesTotal} shards (${_}%)`}else c.phase==="parsing"&&(e.textContent=`Parsing ${c.file}...`)});e.textContent="Uploading to GPU...";const p=new a(F,n,B);p.compilePipelines(),p.loadWeights(f),p.initBuffers(),e.textContent="Warming up...",await p.forward(1,0);const y=p.textCfg.linear_num_key_heads,k=p.textCfg.linear_key_head_dim,u=p.textCfg.linear_value_head_dim,m=y*(k+k+u);for(let c=0;c<p.numLayers;c++)p.layerTypes[c]==="linear_attention"&&(F.device.queue.writeBuffer(p.linState[c],0,new 
ArrayBuffer(y*k*u*4)),F.device.queue.writeBuffer(p.linConvHist[c],0,new ArrayBuffer(3*m*4)));p.seqLen=0,e.textContent="Profiling forward pass...",$("Profiling single forward pass...");const i=await new h(p).profileForward(1,0);e.textContent=`Done! Estimated: ${i.estimatedTokPerSec} tok/s (sync overhead inflates times)`,$(`
 Profile complete: ${i.total.toFixed(1)}ms total (~${i.estimatedTokPerSec} tok/s with sync overhead)`),$(`
 Category breakdown:`);for(const c of i.categories){const _=document.createElement("div");_.className="prof-row";const b=c.name.includes("matvec")||c.name==="lm_head"?"var(--orange)":c.name.includes("attention")||c.name.includes("deltanet")?"var(--blue)":"var(--green)";_.innerHTML=`
             <span class="prof-name">${c.name}</span>

+/* Dependency-path lookup (named like a bundler-emitted preload helper). Given an array of indices `i`, returns the matching entries of the path table `d`. The table is built on first use and memoised on the function object itself as `m.f` (`m` defaults to this function), so later calls reuse the same array. */const __vite__mapDeps=(i,m=__vite__mapDeps,d=(m.f||(m.f=["./qwen35-model-CmeFfImT.js","./gpu-ops-PQDFq1iI.js"])))=>i.map(i=>d[i]);
+import{G as nt,S as ct,_ as et}from"./gpu-ops-PQDFq1iI.js";/** Kernel test harness. Owns a GPU context (`nt`, imported as `G`; presumably the WebGPU wrapper class, confirm against gpu-ops), one pipeline per entry of `ct`, and the list of accumulated test results. */class ot{/** Start with no GPU context, no results and an empty pipeline table; `init()` must run before any test. */constructor(){this.gpu=null,this.results=[],this.pipelines={}}/** Create and initialise the GPU context, then build one pipeline per `[name, source]` entry of `ct` via `getOrCreatePipeline`, keyed by name in `this.pipelines`. Resolves to `this`. */async init(){this.gpu=new nt,await this.gpu.init();for(const[e,t]of Object.entries(ct))this.pipelines[e]=this.gpu.getOrCreatePipeline(e,t);return this}/** Run every `[label, thunk]` test case in order, awaiting each before starting the next. Each outcome is pushed to `this.results` as `{name, ...result}` and also passed to the optional callback `e`. A test that throws is recorded as `{pass:false, error: message, maxErr: NaN}` instead of aborting the run. Resolves to `this.results`. */async runAll(e){const t=[["GPTQ MatVec",()=>this.testGPTQMatvec()],["GPTQ MatVec (group boundary)",()=>this.testGPTQMatvecGroups()],["GPTQ MatVec (gs=32)",()=>this.testGPTQMatvecGS32()],["GPTQ Split-K",()=>this.testGPTQSplitK()],["GPTQ F16",()=>this.testGPTQF16()],["BF16 MatVec",()=>this.testBF16Matvec()],["RMSNorm",()=>this.testRMSNorm()],["RMSNorm (1+w formula)",()=>this.testRMSNorm1PlusW()],["SiLU * Mul",()=>this.testSiLUMul()],["Vector Add",()=>this.testAdd()],["Embedding (BF16)",()=>this.testEmbedding()],["Argmax",()=>this.testArgmax()],["Argmax (large)",()=>this.testArgmaxLarge()],["Split interleaved",()=>this.testSplit()],["Sigmoid Mul",()=>this.testSigmoidMul()],["Head RMSNorm (1+w)",()=>this.testHeadRMSNorm()],["Head RMSNorm (nogated, w)",()=>this.testHeadRMSNormNogated()],["Causal Conv1d",()=>this.testCausalConv1d()],["KV Cache Store",()=>this.testKVCacheStore()],["GQA Attention (single pos)",()=>this.testGQAAttention()],["DeltaNet Recurrent",()=>this.testDeltaNetRecurrent()],["Add + RMSNorm (fused)",()=>this.testAddRMSNorm()],["Three-way Add + RMSNorm",()=>this.testThreeWayAddRMSNorm()],["GQA RoPE rotate_half",()=>this.testGQARoPERotateHalf()],["Fused sigmoid + GPTQ",()=>this.testFusedSigmoidGPTQ()],["Fused SiLU + GPTQ",()=>this.testFusedSiluGPTQ()],["Fused AddNorm+GateUp+SiLU",()=>this.testFusedAddNormGateUpSiLU()],["Fused Split+QKNorm+KVStore",()=>this.testFusedSplitQKNormKVStore()],["Add+RMSNorm B=2",()=>this.testAddRMSNormB2()],["Add+RMSNorm RO B=2",()=>this.testAddRMSNormROB2()],["Three-way Add+RMSNorm B=2",()=>this.testThreeWayAddRMSNormB2()],["Fused SiLU+GPTQ B=2",()=>this.testFusedSiluGPTQB2()],["Fused Sigmoid+GPTQ B=2",()=>this.testFusedSigmoidGPTQB2()],["Fused 
GateUp+SiLU B=2",()=>this.testFusedGateUpSiLUB2()],["GPTQ Matmul B=2",()=>this.testGPTQMatmulB2()],["GPTQ Matmul B=2 F16",()=>this.testGPTQMatmulB2F16()],["GPTQ Matmul B=2 4T F16",()=>this.testGPTQMatmulB2_4T_F16()]];for(const[o,s]of t)try{const r=await s();this.results.push({name:o,...r}),e?.({name:o,...r})}catch(r){this.results.push({name:o,pass:!1,error:r.message,maxErr:NaN}),e?.({name:o,pass:!1,error:r.message,maxErr:NaN})}return this.results}destroy(){this.gpu?.destroy()}makeBuffer(e,t,o=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC){return this.gpu.createBufferFromData(e,new Uint8Array(t.buffer,t.byteOffset,t.byteLength),o)}makeF32Buffer(e,t){return this.makeBuffer(e,new Float32Array(t))}makeU32Buffer(e,t){return this.makeBuffer(e,new Uint32Array(t))}makeOutputBuffer(e,t){return this.gpu.createBuffer(e,t*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC)}makeUniform(e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),o=new DataView(t);for(let s=0;s<e.length;s++)typeof e[s]=="number"&&!Number.isInteger(e[s])?o.setFloat32(s*4,e[s],!0):o.setUint32(s*4,e[s],!0);return this.gpu.createBufferFromData("uniform",new Uint8Array(t),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}makeUniformTyped(e,t){const o=e.length+t.length,s=new ArrayBuffer(Math.max(16,Math.ceil(o*4/16)*16)),r=new DataView(s);let a=0;for(const d of e)r.setUint32(a,d,!0),a+=4;for(const d of t)r.setFloat32(a,d,!0),a+=4;return this.gpu.createBufferFromData("uniform_typed",new Uint8Array(s),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}dispatch(e,t,o,s=1){const r=this.pipelines[e],a=this.gpu.createBindGroup(r,0,t);this.gpu.dispatch(r,[a],o,s)}async readback(e,t){return this.gpu.readBuffer(e,t*4)}async readbackU32(e,t){const o=this.gpu.createReadbackBuffer("_rb",t*4),s=this.gpu.device.createCommandEncoder();s.copyBufferToBuffer(e,0,o,0,t*4),this.gpu.device.queue.submit([s.finish()]),await o.mapAsync(GPUMapMode.READ);const r=new 
Uint32Array(o.getMappedRange().slice(0));return o.unmap(),o.destroy(),r}compare(e,t,o=1e-4){let s=0;const r=[];for(let a=0;a<t.length;a++){const d=Math.abs(e[a]-t[a]);s=Math.max(s,d),d>o&&r.push({idx:a,got:e[a],expected:t[a],err:d})}return{pass:r.length===0,maxErr:s,errors:r.slice(0,5)}}cpuF32ToBF16Packed(e){const t=Math.ceil(e.length/2),o=new Uint32Array(t);for(let s=0;s<e.length;s++){const a=new Uint32Array(new Float32Array([e[s]]).buffer)[0]>>>16,d=Math.floor(s/2);s%2===0?o[d]=a:o[d]|=a<<16}return o}cpuF32ToF16Packed(e){const t=Math.ceil(e.length/2),o=new Uint32Array(t);for(let s=0;s<e.length;s++){const r=this.f32ToF16(e[s]),a=Math.floor(s/2);s%2===0?o[a]=r:o[a]|=r<<16}return o}f32ToF16(e){const t=new ArrayBuffer(4);new Float32Array(t)[0]=e;const o=new Uint32Array(t)[0],s=o>>31&1;let r=o>>23&255,a=o&8388607;return r===0?s<<15:r===255?s<<15|31744|(a?512:0):(r=r-127+15,r>=31?s<<15|31744:r<=0?s<<15:s<<15|r<<10|a>>13)}f16ToF32(e){const t=e>>15&1,o=e>>10&31,s=e&1023;return o===0?s===0?t?-0:0:(t?-1:1)*Math.pow(2,-14)*(s/1024):o===31?s?NaN:t?-1/0:1/0:(t?-1:1)*Math.pow(2,o-15)*(1+s/1024)}bf16ToF32(e){const t=e<<16,o=new ArrayBuffer(4);return new Uint32Array(o)[0]=t,new Float32Array(o)[0]}async testGPTQMatvec(){const r=new Float32Array(64);for(let k=0;k<64;k++)r[k]=(k+1)*.1;const a=new Uint32Array(32),d=new Float32Array(256);for(let k=0;k<8;k++)for(let u=0;u<4;u++){let m=0;for(let g=0;g<8;g++){const i=k*8+g,c=(i+u)%15;d[i*4+u]=c-8,m|=c<<g*4}a[k*4+u]=m}const l=new Float32Array(4);for(let k=0;k<4;k++)l[k]=.5+k*.1;const w=this.cpuF32ToF16Packed(l),h=new Float32Array(4);for(let k=0;k<4;k++){let u=0;for(let m=0;m<64;m++){const i=Math.floor(m/64)*4+k,c=w[Math.floor(i/2)]>>i%2*16&65535,_=this.f16ToF32(c);u+=r[m]*d[m*4+k]*_}h[k]=u}const F=this.makeF32Buffer("gptq_input",r),n=this.makeU32Buffer("gptq_qw",a),B=this.makeU32Buffer("gptq_scales",w),f=this.makeOutputBuffer("gptq_out",4),p=this.makeUniform([64,4,64]);this.dispatch("gptq_matvec",[F,n,B,f,p],Math.ceil(4/32));const 
y=await this.readback(f,4);return this.compare(y,h,.01)}async testGPTQMatvecGroups(){const r=new Float32Array(128);for(let u=0;u<128;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(64),d=new Float32Array(512);for(let u=0;u<16;u++)for(let m=0;m<4;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c*3+m*7)%15;d[c*4+m]=_-8,g|=_<<i*4}a[u*4+m]=g}const l=128/64,w=new Float32Array(l*4);for(let u=0;u<l*4;u++)w[u]=.3+u*.15;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let u=0;u<4;u++){let m=0;for(let g=0;g<128;g++){const c=Math.floor(g/64)*4+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*4+u]*b}F[u]=m}const n=this.makeF32Buffer("gptq2_in",r),B=this.makeU32Buffer("gptq2_qw",a),f=this.makeU32Buffer("gptq2_sc",h),p=this.makeOutputBuffer("gptq2_out",4),y=this.makeUniform([128,4,64]);this.dispatch("gptq_matvec",[n,B,f,p,y],Math.ceil(4/32));const k=await this.readback(p,4);return this.compare(k,F,.01)}async testGPTQMatvecGS32(){const r=new Float32Array(128);for(let u=0;u<128;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(64),d=new Float32Array(512);for(let u=0;u<16;u++)for(let m=0;m<4;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c*3+m*7)%15;d[c*4+m]=_-8,g|=_<<i*4}a[u*4+m]=g}const l=128/32,w=new Float32Array(l*4);for(let u=0;u<l*4;u++)w[u]=.3+u*.15;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let u=0;u<4;u++){let m=0;for(let g=0;g<128;g++){const c=Math.floor(g/32)*4+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*4+u]*b}F[u]=m}const n=this.makeF32Buffer("gptq32_in",r),B=this.makeU32Buffer("gptq32_qw",a),f=this.makeU32Buffer("gptq32_sc",h),p=this.makeOutputBuffer("gptq32_out",4),y=this.makeUniform([128,4,32]);this.dispatch("gptq_matvec",[n,B,f,p,y],Math.ceil(4/32));const k=await this.readback(p,4);return this.compare(k,F,.01)}async testGPTQSplitK(){const a=new Float32Array(256);for(let i=0;i<256;i++)a[i]=Math.sin(i*.3)*.5;const d=new Uint32Array(1024),l=new Float32Array(256*32);for(let 
i=0;i<32;i++)for(let c=0;c<32;c++){let _=0;for(let b=0;b<8;b++){const N=i*8+b,x=(N*3+c*7)%15;l[N*32+c]=x-8,_|=x<<b*4}d[i*32+c]=_}const w=256/64,h=new Float32Array(w*32);for(let i=0;i<w;i++)for(let c=0;c<32;c++)h[i*32+c]=.3+(i*32+c)*.02;const F=this.cpuF32ToF16Packed(h),n=new Float32Array(32);for(let i=0;i<32;i++){let c=0;for(let _=0;_<256;_++){const N=Math.floor(_/64)*32+i,x=F[Math.floor(N/2)]>>N%2*16&65535,q=this.f16ToF32(x);c+=a[_]*l[_*32+i]*q}n[i]=c}const B=this.makeF32Buffer("sk_in",a),f=this.makeU32Buffer("sk_qw",d),p=this.makeU32Buffer("sk_sc",F),y=this.makeOutputBuffer("sk_partials",128),k=this.makeOutputBuffer("sk_out",32),u=this.makeUniform([256,32,64,4]),m=this.makeUniform([32,4]);this.dispatch("gptq_splitk",[B,f,p,y,u],Math.ceil(32/32),4),this.dispatch("reduce_splitk",[y,k,m],Math.ceil(32/256));const g=await this.readback(k,32);return this.compare(g,n,.01)}async testGPTQF16(){if(!this.gpu.hasF16||!this.pipelines.gptq_matvec_f16)return{pass:!0,maxErr:0,note:"shader-f16 not available, skipped"};const e=128,t=4,o=64,s=e/8,r=new Float32Array(e);for(let u=0;u<e;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(s*t),d=new Float32Array(e*t);for(let u=0;u<s;u++)for(let m=0;m<t;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c+m*3)%15;d[c*t+m]=_-8,g|=_<<i*4}a[u*t+m]=g}const l=e/o,w=new Float32Array(l*t);for(let u=0;u<l;u++)for(let m=0;m<t;m++)w[u*t+m]=.5+u*.1;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(t);for(let u=0;u<t;u++){let m=0;for(let g=0;g<e;g++){const c=Math.floor(g/o)*t+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*t+u]*b}F[u]=m}const n=this.makeF32Buffer("f16_in",r),B=this.makeU32Buffer("f16_qw",a),f=this.makeU32Buffer("f16_sc",h),p=this.makeOutputBuffer("f16_out",t),y=this.makeUniform([e,t,o]);this.dispatch("gptq_matvec_f16",[n,B,f,p,y],Math.ceil(t/32));const k=await this.readback(p,t);return this.compare(k,F,.5)}async testBF16Matvec(){const o=new Float32Array(8);for(let n=0;n<8;n++)o[n]=(n+1)*.2;const s=new 
Float32Array(32);/* Remainder of testBF16Matvec: fill the 4x8 weight matrix, pack it as BF16, compute the reference dot products from the unpacked BF16 values, then dispatch "bf16_matvec" with uniform [8,4] and compare with tolerance .001. */for(let n=0;n<4;n++)for(let B=0;B<8;B++)s[n*8+B]=Math.cos(n*.5+B*.3);const r=this.cpuF32ToBF16Packed(s),a=new Float32Array(4);for(let n=0;n<4;n++){let B=0;for(let f=0;f<8;f++){const p=n*8+f,y=r[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);B+=o[f]*u}a[n]=B}const d=this.makeF32Buffer("bf16mv_in",o),l=this.makeU32Buffer("bf16mv_w",r),w=this.makeOutputBuffer("bf16mv_out",4),h=this.makeUniform([8,4]);this.dispatch("bf16_matvec",[d,l,w,h],Math.ceil(4/32));const F=await this.readback(w,4);return this.compare(F,a,.001)}/** Test the "rmsnorm" pipeline on 16 elements with BF16-packed weights. Reference: x * 1/sqrt(mean(x^2)+1e-6) * (1+w), with w decoded from the packed BF16 so both sides see the same weights. Params are [16] as u32 followed by 1e-6 as f32; one workgroup; tolerance 1e-4. */async testRMSNorm(){const o=new Float32Array(16);for(let f=0;f<16;f++)o[f]=(f-8)*.3;const s=new Float32Array(16);for(let f=0;f<16;f++)s[f]=Math.sin(f*.4)*.1;const r=this.cpuF32ToBF16Packed(s);let a=0;for(let f=0;f<16;f++)a+=o[f]*o[f];const d=1/Math.sqrt(a/16+1e-6),l=new Float32Array(16);for(let f=0;f<16;f++){const p=f,y=r[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);l[f]=o[f]*d*(1+u)}const w=this.makeF32Buffer("rmsn_in",o),h=this.makeU32Buffer("rmsn_w",r),F=this.makeOutputBuffer("rmsn_out",16),n=this.makeUniformTyped([16],[1e-6]);this.dispatch("rmsnorm",[w,h,F,n],1);const B=await this.readback(F,16);return this.compare(B,l,1e-4)}/** Test "rmsnorm" with all-zero weights on 8 elements. The expected output is the normalised input alone (x * 1/sqrt(mean(x^2)+1e-6)), so the test only passes if the kernel scales by (1+w) rather than by w. Tolerance 1e-4. */async testRMSNorm1PlusW(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array(8).fill(0),r=this.cpuF32ToBF16Packed(s);let a=0;for(let f=0;f<8;f++)a+=o[f]*o[f];const d=1/Math.sqrt(a/8+1e-6),l=new Float32Array(8);for(let f=0;f<8;f++)l[f]=o[f]*d;const w=this.makeF32Buffer("rmsn1_in",o),h=this.makeU32Buffer("rmsn1_w",r),F=this.makeOutputBuffer("rmsn1_out",8),n=this.makeUniformTyped([8],[1e-6]);this.dispatch("rmsnorm",[w,h,F,n],1);const B=await this.readback(F,8);return this.compare(B,l,1e-4)}/** Test the "silu_mul" pipeline on 8 elements. Reference: gate/(1+exp(-gate)) * up, using `t` as gate and `o` as up. */async testSiLUMul(){const t=new Float32Array([0,1,-1,2,-2,.5,-.5,3]),o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array(8);for(let h=0;h<8;h++){const F=t[h]/(1+Math.exp(-t[h]));s[h]=F*o[h]}const 
r=this.makeF32Buffer("silu_gate",t),a=this.makeF32Buffer("silu_up",o),d=this.makeOutputBuffer("silu_out",8),l=this.makeUniform([8]);/* Remainder of testSiLUMul: dispatch "silu_mul" over [gate, up, out, params] and compare the output buffer with tolerance 1e-5. */this.dispatch("silu_mul",[r,a,d,l],Math.ceil(8/32));const w=await this.readback(d,8);return this.compare(w,s,1e-5)}/** Test the "add" pipeline on 8 elements. The kernel is bound to [a, b, params] with no separate output; the result is read back from the first buffer, so the test expects a to hold a+b after the dispatch. Tolerance 1e-6. */async testAdd(){const t=new Float32Array([1,2,3,4,5,6,7,8]),o=new Float32Array([.1,.2,.3,.4,.5,.6,.7,.8]),s=new Float32Array(8);for(let w=0;w<8;w++)s[w]=t[w]+o[w];const r=this.makeF32Buffer("add_a",t),a=this.makeF32Buffer("add_b",o),d=this.makeUniform([8]);this.dispatch("add",[r,a,d],Math.ceil(8/32));const l=await this.readback(r,8);return this.compare(l,s,1e-6)}/** Test the "embedding" pipeline. A 32-value table is packed as BF16 and the uniform is [2, 8]; the reference is elements 16..23 of the table (index 2 * 8 + column) decoded from the packed BF16. Tolerance .001. */async testEmbedding(){const o=new Float32Array(32);for(let F=0;F<o.length;F++)o[F]=(F+1)*.01;const s=this.cpuF32ToBF16Packed(o),r=2,a=new Float32Array(8);for(let F=0;F<8;F++){const n=r*8+F,B=s[Math.floor(n/2)],f=n%2===0?B&65535:B>>16&65535;a[F]=this.bf16ToF32(f)}const d=this.makeU32Buffer("emb_w",s),l=this.makeOutputBuffer("emb_out",8),w=this.makeUniform([r,8]);this.dispatch("embedding",[d,l,w],Math.ceil(8/256));const h=await this.readback(l,8);return this.compare(h,a,.001)}/** Test the "argmax" pipeline on 16 values. Reads two u32 words back and checks that the first equals the CPU argmax (index of the maximum). Returns a hand-built result object rather than going through compare(). */async testArgmax(){const t=new Float32Array(16);for(let w=0;w<16;w++)t[w]=Math.sin(w*.7)*3;const o=t.indexOf(Math.max(...t)),s=this.makeF32Buffer("argmax_in",t),r=this.makeOutputBuffer("argmax_res",2),a=this.makeUniform([16]);this.dispatch("argmax",[s,r,a],1);const l=(await this.readbackU32(r,2))[0];return{pass:l===o,maxErr:Math.abs(l-o),errors:l!==o?[{idx:0,got:l,expected:o}]:[]}}/** Test "argmax" on 1024 values that all lie below zero except a single spike of 99 planted at index 773; passes when the first returned word is 773. Still dispatched as one workgroup. */async testArgmaxLarge(){const t=new Float32Array(1024);for(let d=0;d<1024;d++)t[d]=Math.sin(d*.01)-2;t[773]=99;const o=this.makeF32Buffer("argmax_lg_in",t),s=this.makeOutputBuffer("argmax_lg_res",2),r=this.makeUniform([1024]);this.dispatch("argmax",[o,s,r],1);const a=await this.readbackU32(s,2);return{pass:a[0]===773,maxErr:Math.abs(a[0]-773),errors:a[0]!==773?[{got:a[0],expected:773}]:[]}}/** Test the "split" pipeline: a 16-element source is expected to de-interleave into two 8-element outputs, with alternating runs of 4 going to each (1..4 and 9..12 to the first, 5..8 and 13..16 to the second). */async testSplit(){const s=new Float32Array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]),r=new Float32Array([1,2,3,4,9,10,11,12]),a=new 
Float32Array([5,6,7,8,13,14,15,16]),d=this.makeF32Buffer("split_src",s),l=this.makeOutputBuffer("split_a",8),w=this.makeOutputBuffer("split_b",8),h=this.makeUniform([8,2,4]);/* Remainder of testSplit: dispatch "split" with uniform [8,2,4], read both outputs back and merge the two comparisons (tolerance 1e-6 each) into one result. */this.dispatch("split",[d,l,w,h],Math.ceil(8/32));const F=await this.readback(l,8),n=await this.readback(w,8),B=this.compare(F,r,1e-6),f=this.compare(n,a,1e-6);return{pass:B.pass&&f.pass,maxErr:Math.max(B.maxErr,f.maxErr),errors:[...B.errors,...f.errors]}}/** Test the "sigmoid_mul" pipeline on 8 elements. Reference: x/(1+exp(-g)), i.e. x multiplied by sigmoid(g). Tolerance 1e-5. */async testSigmoidMul(){const t=new Float32Array([1,2,3,4,5,6,7,8]),o=new Float32Array([0,1,-1,2,-2,.5,-.5,3]),s=new Float32Array(8);for(let h=0;h<8;h++)s[h]=t[h]/(1+Math.exp(-o[h]));const r=this.makeF32Buffer("sigmul_x",t),a=this.makeF32Buffer("sigmul_g",o),d=this.makeOutputBuffer("sigmul_out",8),l=this.makeUniform([8]);this.dispatch("sigmoid_mul",[r,a,d,l],Math.ceil(8/32));const w=await this.readback(d,8);return this.compare(w,s,1e-5)}/** Test the "head_rmsnorm" pipeline: 8 values treated as 2 groups of 4, each normalised by its own RMS (eps 1e-6) and scaled by (1+w), with the 4 BF16 weights shared by both groups. The kernel is bound to [x, w, params] and the result is read back from x, so it is expected to work in place. Two workgroups; tolerance .001. */async testHeadRMSNorm(){const s=new Float32Array([1,2,3,4,5,6,7,8]),r=new Float32Array(4);for(let n=0;n<4;n++)r[n]=.1*(n+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(8);for(let n=0;n<2;n++){let B=0;for(let p=0;p<4;p++)B+=s[n*4+p]**2;const f=1/Math.sqrt(B/4+1e-6);for(let p=0;p<4;p++){const y=a[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);d[n*4+p]=s[n*4+p]*f*(1+u)}}const l=this.makeF32Buffer("hrmsn_x",s),w=this.makeU32Buffer("hrmsn_w",a),h=this.makeUniformTyped([2,4],[1e-6]);this.dispatch("head_rmsnorm",[l,w,h],2);const F=await this.readback(l,8);return this.compare(F,d,.001)}/** Same setup as testHeadRMSNorm, but the reference scales by w alone instead of (1+w); it targets the "head_rmsnorm_nogated" pipeline (dispatched after this reference is built). */async testHeadRMSNormNogated(){const s=new Float32Array([1,2,3,4,5,6,7,8]),r=new Float32Array(4);for(let n=0;n<4;n++)r[n]=.5+.1*n;const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(8);for(let n=0;n<2;n++){let B=0;for(let p=0;p<4;p++)B+=s[n*4+p]**2;const f=1/Math.sqrt(B/4+1e-6);for(let p=0;p<4;p++){const y=a[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);d[n*4+p]=s[n*4+p]*f*u}}const 
l=this.makeF32Buffer("hrmsng_x",s),w=this.makeU32Buffer("hrmsng_w",a),h=this.makeUniformTyped([2,4],[1e-6]);this.dispatch("head_rmsnorm_nogated",[l,w,h],2);const F=await this.readback(l,8);return this.compare(F,d,.001)}async testCausalConv1d(){const t=new Float32Array([1,2,3,4]),o=new Float32Array(12);for(let y=0;y<o.length;y++)o[y]=(y+1)*.1;const s=new Float32Array(16);for(let y=0;y<s.length;y++)s[y]=.25;const r=this.cpuF32ToBF16Packed(s),a=new Float32Array(4),d=new Float32Array(12);for(let y=0;y<4;y++){const k=o[y],u=o[4+y],m=o[8+y],g=t[y],i=[];for(let _=0;_<4;_++){const b=y*4+_,N=r[Math.floor(b/2)],x=b%2===0?N&65535:N>>16&65535;i.push(this.bf16ToF32(x))}const c=i[0]*k+i[1]*u+i[2]*m+i[3]*g;a[y]=c/(1+Math.exp(-c)),d[y]=u,d[4+y]=m,d[8+y]=g}const l=this.makeF32Buffer("conv_x",t),w=this.makeF32Buffer("conv_hist",o),h=this.makeU32Buffer("conv_w",r),F=this.makeUniform([4]);this.dispatch("causal_conv1d",[l,w,h,F],Math.ceil(4/256));const n=await this.readback(l,4),B=await this.readback(w,12),f=this.compare(n,a,1e-4),p=this.compare(B,d,1e-4);return{pass:f.pass&&p.pass,maxErr:Math.max(f.maxErr,p.maxErr),errors:[...f.errors.map(y=>({...y,note:"conv output"})),...p.errors.map(y=>({...y,note:"history"}))]}}async testKVCacheStore(){const r=new Float32Array(8),a=new Float32Array(8);for(let c=0;c<8;c++)r[c]=c+1,a[c]=(c+1)*10;const l=8*8,w=new Float32Array(l),h=new Float32Array(l),F=new Float32Array(l),n=new Float32Array(l);for(let c=0;c<8;c++)F[24+c]=r[c],n[24+c]=a[c];const B=this.makeF32Buffer("kvs_k",r),f=this.makeF32Buffer("kvs_v",a),p=this.makeF32Buffer("kvs_kc",w),y=this.makeF32Buffer("kvs_vc",h),k=this.makeUniform([3,2,4]);this.dispatch("kv_cache_store",[B,f,p,y,k],Math.ceil(8/256));const u=await this.readback(p,l),m=await this.readback(y,l),g=this.compare(u,F,1e-6),i=this.compare(m,n,1e-6);return{pass:g.pass&&i.pass,maxErr:Math.max(g.maxErr,i.maxErr),errors:[...g.errors,...i.errors]}}async testGQAAttention(){const a=new Float32Array([1,0,0,0]),d=new 
Float32Array(4);d.set([1,0,0,0]);const l=new Float32Array(4);l.set([.5,.6,.7,.8]);const w=new Float32Array([.5,.6,.7,.8]),h=this.makeF32Buffer("gqa_q",a),F=this.makeF32Buffer("gqa_kc",d),n=this.makeF32Buffer("gqa_vc",l),B=this.makeOutputBuffer("gqa_out",4),f=new ArrayBuffer(32),p=new DataView(f);p.setUint32(0,1,!0),p.setUint32(4,4,!0),p.setUint32(8,1,!0),p.setUint32(12,1,!0),p.setUint32(16,1/1,!0),p.setUint32(20,1,!0);const y=this.gpu.createBufferFromData("gqa_params",new Uint8Array(f),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("gqa_attention_head",[h,F,n,B,y],1);const k=await this.readback(B,4);return this.compare(k,w,1e-4)}async testDeltaNetRecurrent(){const s=[.5,.5],r=[1,0],a=[.8,.2],d=new Float32Array([...s,...r,...a]),l=new Float32Array(4),w=new Float32Array([0]),h=new Float32Array([0]),F=new Float32Array([0]),n=new Float32Array([0]),B=this.cpuF32ToBF16Packed(F),f=this.cpuF32ToBF16Packed(n),p=1/(1+Math.exp(-0)),y=Math.sqrt(s[0]**2+s[1]**2),k=Math.sqrt(r[0]**2+r[1]**2),u=s.map(U=>U/y),m=r.map(U=>U/k),g=[0,0,0,0],i=[0,0],c=a.map((U,S)=>(U-i[S])*p);for(let U=0;U<2;U++)for(let S=0;S<2;S++)g[U*2+S]+=m[U]*c[S];const _=1/Math.sqrt(2),b=new Float32Array(2);for(let U=0;U<2;U++){let S=0;for(let O=0;O<2;O++)S+=g[O*2+U]*u[O]*_;b[U]=S}const N=this.makeF32Buffer("dn_qkv",d),x=this.makeF32Buffer("dn_state",l),q=this.makeOutputBuffer("dn_out",2),K=this.makeF32Buffer("dn_a",w),v=this.makeF32Buffer("dn_b",h),T=this.makeU32Buffer("dn_alog",B),V=this.makeU32Buffer("dn_dtb",f),G=this.makeUniform([1,2,2]);this.dispatch("deltanet_recurrent",[N,x,q,K,v,T,V,G],1);const C=await this.readback(q,2);return this.compare(C,b,.001)}async testAddRMSNorm(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array([.1,-.2,.3,-.4,.5,-.6,.7,-.8]),r=new Float32Array(8);for(let _=0;_<8;_++)r[_]=.1*(_+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(o);for(let _=0;_<8;_++)d[_]+=s[_];let l=0;for(let _=0;_<8;_++)l+=d[_]*d[_];const w=1/Math.sqrt(l/8+1e-6),h=new 
Float32Array(8);for(let _=0;_<8;_++){const b=a[Math.floor(_/2)]>>_%2*16&65535,N=this.bf16ToF32(b);h[_]=d[_]*w*(1+N)}const F=this.makeF32Buffer("addnorm_h",o),n=this.makeF32Buffer("addnorm_a",s),B=this.makeU32Buffer("addnorm_w",a),f=this.makeOutputBuffer("addnorm_out",8);this.makeUniform([8]);const p=new ArrayBuffer(16),y=new DataView(p);y.setUint32(0,8,!0),y.setFloat32(4,1e-6,!0);const k=this.gpu.createBufferFromData("addnorm_params",new Uint8Array(p),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("add_rmsnorm",[F,n,B,f,k],1);const u=await this.readback(f,8),m=await this.readback(F,8),g=d,i=this.compare(u,h,.001),c=this.compare(m,g,1e-6);return{pass:i.pass&&c.pass,maxErr:Math.max(i.maxErr,c.maxErr),errors:[...i.errors,...c.errors]}}async testThreeWayAddRMSNorm(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array([.1,-.2,.3,-.4,.5,-.6,.7,-.8]),r=new Float32Array([.5,.5,-.5,-.5,1,1,-1,-1]),a=new Float32Array(8);for(let N=0;N<8;N++)a[N]=.1*(N+1);const d=this.cpuF32ToBF16Packed(a),l=new Float32Array(8);for(let N=0;N<8;N++)l[N]=o[N]+s[N]+r[N];let w=0;for(let N=0;N<8;N++)w+=l[N]*l[N];const h=1/Math.sqrt(w/8+1e-6),F=new Float32Array(8);for(let N=0;N<8;N++){const x=d[Math.floor(N/2)]>>N%2*16&65535,q=this.bf16ToF32(x);F[N]=l[N]*h*(1+q)}const n=this.makeF32Buffer("twa_a",o),B=this.makeF32Buffer("twa_b",s),f=this.makeF32Buffer("twa_c",r),p=this.makeU32Buffer("twa_w",d),y=this.makeOutputBuffer("twa_hout",8),k=this.makeOutputBuffer("twa_normed",8),u=new ArrayBuffer(16),m=new DataView(u);m.setUint32(0,8,!0),m.setFloat32(4,1e-6,!0);const g=this.gpu.createBufferFromData("twa_params",new Uint8Array(u),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("three_way_add_rmsnorm",[n,B,f,p,y,k,g],1);const i=await this.readback(y,8),c=await this.readback(k,8),_=this.compare(i,l,1e-6),b=this.compare(c,F,.001);return{pass:_.pass&&b.pass,maxErr:Math.max(_.maxErr,b.maxErr),errors:[..._.errors,...b.errors]}}async testGQARoPERotateHalf(){const 
h=1/Math.sqrt(256),F=[];for(let D=0;D<32;D++)F.push(1/Math.pow(100,2*D/64));function n(D,M){const H=new Float32Array(D.length);for(let P=0;P<D.length;P++)H[P]=D[P];for(let P=0;P<32;P++){const Q=M*F[P],E=Math.cos(Q),J=Math.sin(Q);H[P]=D[P]*E-D[P+32]*J,H[P+32]=D[P+32]*E+D[P]*J}return H}const B=new Float32Array(256);B[0]=1,B[1]=.5,B[2]=.3,B[32]=.7,B[33]=.2;const f=new Float32Array(256);f[0]=1,f[32]=.5;const p=new Float32Array(256);p[1]=1,p[33]=.3;const y=new Float32Array(256);y[2]=1;const k=new Float32Array(256);k[0]=1;const u=new Float32Array(256);u[0]=0,u[1]=1;const m=new Float32Array(256);m[0]=0,m[2]=1;const g=n(B,2),i=n(f,0),c=n(p,1),_=n(y,2);let b=0,N=0,x=0;for(let D=0;D<256;D++)b+=g[D]*i[D],N+=g[D]*c[D],x+=g[D]*_[D];b*=h,N*=h,x*=h;const q=Math.max(b,N,x),K=Math.exp(b-q),v=Math.exp(N-q),T=Math.exp(x-q),V=K+v+T,G=K/V,C=v/V,U=T/V,S=new Float32Array(256);for(let D=0;D<256;D++)S[D]=G*k[D]+C*u[D]+U*m[D];const O=1*256,Y=new Float32Array(3*O),z=new Float32Array(3*O);Y.set(i,0),Y.set(c,O),Y.set(_,2*O),z.set(k,0),z.set(u,O),z.set(m,2*O);const X=this.makeF32Buffer("rope2_q",g),tt=this.makeF32Buffer("rope2_kc",Y),A=this.makeF32Buffer("rope2_vc",z),R=this.makeOutputBuffer("rope2_out",1*256),L=new ArrayBuffer(32),W=new DataView(L);W.setUint32(0,3,!0),W.setUint32(4,256,!0),W.setUint32(8,1,!0),W.setUint32(12,1,!0),W.setUint32(16,1/1,!0),W.setUint32(20,1,!0);const Z=this.gpu.createBufferFromData("rope2_params",new Uint8Array(L),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("gqa_attention_head",[X,tt,A,R,Z],1);const j=await this.readback(R,1*256);return this.compare(j,S,.001)}async testFusedSigmoidGPTQ(){const r=new Float32Array(64),a=new Float32Array(64);for(let m=0;m<64;m++)r[m]=(m+1)*.1,a[m]=(m-8)*.3;const d=new Uint32Array(32),l=new Float32Array(256);for(let m=0;m<8;m++)for(let g=0;g<4;g++){let i=0;for(let c=0;c<8;c++){const _=m*8+c,b=(_+g)%15;l[_*4+g]=b-8,i|=b<<c*4}d[m*4+g]=i}const w=new Float32Array(4);for(let m=0;m<4;m++)w[m]=.5+m*.1;const 
h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let m=0;m<4;m++){let g=0;for(let i=0;i<64;i++){const c=r[i]/(1+Math.exp(-a[i])),b=Math.floor(i/64)*4+m,N=h[Math.floor(b/2)]>>b%2*16&65535,x=this.f16ToF32(N);g+=c*l[i*4+m]*x}F[m]=g}const n=this.makeF32Buffer("fsg_x",r),B=this.makeF32Buffer("fsg_g",a),f=this.makeU32Buffer("fsg_qw",d),p=this.makeU32Buffer("fsg_sc",h),y=this.makeOutputBuffer("fsg_out",4),k=this.makeUniform([64,4,64]);this.dispatch("fused_sigmoid_gptq",[n,B,f,p,y,k],Math.ceil(4/32));const u=await this.readback(y,4);return this.compare(u,F,.01)}async testFusedSiluGPTQ(){const r=new Float32Array(64),a=new Float32Array(64);for(let m=0;m<64;m++)r[m]=(m-8)*.2,a[m]=(m+1)*.15;const d=new Uint32Array(32),l=new Float32Array(256);for(let m=0;m<8;m++)for(let g=0;g<4;g++){let i=0;for(let c=0;c<8;c++){const _=m*8+c,b=(_*2+g)%15;l[_*4+g]=b-8,i|=b<<c*4}d[m*4+g]=i}const w=new Float32Array(4);for(let m=0;m<4;m++)w[m]=.4+m*.2;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let m=0;m<4;m++){let g=0;for(let i=0;i<64;i++){const _=r[i]/(1+Math.exp(-r[i]))*a[i],N=Math.floor(i/64)*4+m,x=h[Math.floor(N/2)]>>N%2*16&65535,q=this.f16ToF32(x);g+=_*l[i*4+m]*q}F[m]=g}const n=this.makeF32Buffer("fslg_a",r),B=this.makeF32Buffer("fslg_b",a),f=this.makeU32Buffer("fslg_qw",d),p=this.makeU32Buffer("fslg_sc",h),y=this.makeOutputBuffer("fslg_out",4),k=this.makeUniform([64,4,64]);this.dispatch("fused_silu_gptq",[n,B,f,p,y,k],Math.ceil(4/32));const u=await this.readback(y,4);return this.compare(u,F,.01)}async testFusedAddNormGateUpSiLU(){const a=new Float32Array(64),d=new Float32Array(64),l=new Float32Array(64),w=new Float32Array(64);for(let A=0;A<64;A++)a[A]=(A+1)*.1,d[A]=(A-5)*.05,l[A]=(A%7-3)*.03,w[A]=(A%3-1)*.1;const h=new Uint32Array(32),F=new Uint32Array(32),n=new Float32Array(256),B=new Float32Array(256);for(let A=0;A<64;A++)for(let R=0;R<4;R++)n[A*4+R]=Math.sin(A*.7+R*1.3)*.5,B[A*4+R]=Math.cos(A*.3+R*.9)*.5;const f=new Uint32Array(4/2),p=new Uint32Array(4/2);for(let 
A=0;A<4;A++){let R=0,L=0;for(let P=0;P<64;P++)R=Math.max(R,Math.abs(n[P*4+A])),L=Math.max(L,Math.abs(B[P*4+A]));const W=R/7,Z=L/7,j=this.f32ToF16(W),D=this.f32ToF16(Z),M=this.f16ToF32(j),H=this.f16ToF32(D);A%2===0?(f[A/2]=j,p[A/2]=D):(f[(A-1)/2]|=j<<16,p[(A-1)/2]|=D<<16);for(let P=0;P<64;P++){const Q=Math.round(n[P*4+A]/M)+8,E=Math.round(B[P*4+A]/H)+8,J=Math.max(0,Math.min(15,Q)),rt=Math.max(0,Math.min(15,E)),st=Math.floor(P/8),at=P%8*4;h[st*4+A]|=J<<at,F[st*4+A]|=rt<<at}}const y=new Uint32Array(64/2);for(let A=0;A<64;A+=2){const R=this.f32ToBF16(w[A]),L=this.f32ToBF16(w[A+1]);y[A/2]=R|L<<16}const k=new Float32Array(64);let u=0;for(let A=0;A<64;A++)k[A]=a[A]+d[A]+l[A],u+=k[A]*k[A];const m=1/Math.sqrt(u/64+1e-6),g=new Float32Array(64);for(let A=0;A<64;A++){const R=this.unpackBF16(y[Math.floor(A/2)],A%2);g[A]=k[A]*m*(1+R)}const i=new Float32Array(4);for(let A=0;A<4;A++){let R=0,L=0;const W=this.f16ToF32(A%2===0?f[Math.floor(A/2)]&65535:f[Math.floor(A/2)]>>>16),Z=this.f16ToF32(A%2===0?p[Math.floor(A/2)]&65535:p[Math.floor(A/2)]>>>16);for(let j=0;j<64;j++){const D=Math.floor(j/8),M=j%8*4,H=h[D*4+A]>>>M&15,P=F[D*4+A]>>>M&15;R+=g[j]*W*(H-8),L+=g[j]*Z*(P-8)}i[A]=R/(1+Math.exp(-R))*L}const c=k,_=16+640*16,b=new ArrayBuffer(_),N=new DataView(b);N.setUint32(0,64,!0),N.setUint32(4,4,!0),N.setUint32(8,64,!0),N.setFloat32(12,1e-6,!0);for(let A=0;A<64/2;A++){const R=Math.floor(A/4),L=A%4;N.setUint32(16+R*16+L*4,y[A],!0)}const x=this.gpu.createBufferFromData("fang_params",new Uint8Array(b),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),q=new Uint32Array(64);q.set(h,0),q.set(F,32);const K=8/(64/8),v=new Uint32Array(K*4/2*2);v.set(f,0),v.set(p,K*4/2);const 
T=this.makeF32Buffer("fang_hin",a),V=this.makeF32Buffer("fang_mlpres",d),G=this.makeF32Buffer("fang_attnres",l),C=this.makeU32Buffer("fang_mqw",q),U=this.makeU32Buffer("fang_msc",v),S=this.makeOutputBuffer("fang_hout",64),O=this.makeOutputBuffer("fang_out",4);this.dispatch("fused_addnorm_gate_up_silu",[T,V,G,C,U,S,O,x],Math.max(1,Math.ceil(4/32)));const Y=await this.readback(O,4),z=await this.readback(S,64),X=this.compare(Y,i,.02),tt=this.compare(z,c,1e-6);return{pass:X.pass&&tt.pass,maxErr:Math.max(X.maxErr,tt.maxErr),errors:[...X.errors||[],...tt.errors||[]]}}async testFusedSplitQKNormKVStore(){const l=new Float32Array(32);for(let M=0;M<l.length;M++)l[M]=Math.sin(M*.5+1)*.3;const w=new Float32Array(8),h=new Float32Array(8);for(let M=0;M<8;M++)w[M]=Math.cos(M*.7+2)*.4,h[M]=Math.sin(M*1.1+3)*.2;const F=new Float32Array(16),n=new Float32Array(8);for(let M=0;M<16;M++)F[M]=(M%3-1)*.05;for(let M=0;M<8;M++)n[M]=(M%4-2)*.03;const B=new Uint32Array(16/2);for(let M=0;M<16;M+=2)B[M/2]=this.f32ToBF16(F[M])|this.f32ToBF16(F[M+1])<<16;const f=new Uint32Array(8/2);for(let M=0;M<8;M+=2)f[M/2]=this.f32ToBF16(n[M])|this.f32ToBF16(n[M+1])<<16;const p=new Float32Array(16),y=new Float32Array(16);for(let M=0;M<2;M++){const H=new Float32Array(8);for(let E=0;E<8;E++)H[E]=l[M*8*2+E],y[M*8+E]=l[M*8*2+8+E];let P=0;for(let E=0;E<8;E++)P+=H[E]*H[E];const Q=1/Math.sqrt(P/8+1e-6);for(let E=0;E<8;E++){const J=this.unpackBF16(B[Math.floor((M*8+E)/2)],(M*8+E)%2);p[M*8+E]=H[E]*Q*(1+J)}}const k=new Float32Array(8),u=new Float32Array(32),m=new Float32Array(32);for(let M=0;M<1;M++){let H=0;for(let Q=0;Q<8;Q++)H+=w[M*8+Q]*w[M*8+Q];const P=1/Math.sqrt(H/8+1e-6);for(let Q=0;Q<8;Q++){const E=this.unpackBF16(f[Math.floor((M*8+Q)/2)],(M*8+Q)%2),J=w[M*8+Q]*P*(1+E);k[M*8+Q]=J,u[24+M*8+Q]=J,m[24+M*8+Q]=h[M*8+Q]}}const g=32+320*16,i=new ArrayBuffer(g),c=new DataView(i);c.setUint32(0,2,!0),c.setUint32(4,1,!0),c.setUint32(8,8,!0),c.setFloat32(12,1e-6,!0),c.setUint32(16,3,!0);for(let M=0;M<16/2;M++){const 
H=Math.floor(M/4),P=M%4;c.setUint32(32+H*16+P*4,B[M],!0)}const _=16/2;for(let M=0;M<8/2;M++){const H=_+M,P=Math.floor(H/4),Q=H%4;c.setUint32(32+P*16+Q*4,f[M],!0)}const b=this.gpu.createBufferFromData("fsqk_params",new Uint8Array(i),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),N=this.makeF32Buffer("fsqk_qpf",l),x=this.makeF32Buffer("fsqk_kp",w),q=this.makeF32Buffer("fsqk_vp",h),K=this.makeOutputBuffer("fsqk_qp",16),v=this.makeOutputBuffer("fsqk_qg",16),T=this.makeOutputBuffer("fsqk_kc",32),V=this.makeOutputBuffer("fsqk_vc",32);this.dispatch("fused_split_qknorm_kvstore",[N,x,q,K,v,T,V,b],3);const G=await this.readback(K,16),C=await this.readback(v,16),U=await this.readback(x,8),S=await this.readback(T,32),O=await this.readback(V,32),Y=this.compare(G,p,1e-5),z=this.compare(C,y,1e-6),X=this.compare(U,k,1e-5),tt=S.slice(24,32),A=O.slice(24,32),R=u.slice(24,32),L=m.slice(24,32),W=this.compare(tt,R,1e-5),Z=this.compare(A,L,1e-6),j=Math.max(Y.maxErr,z.maxErr,X.maxErr,W.maxErr,Z.maxErr);return{pass:Y.pass&&z.pass&&X.pass&&W.pass&&Z.pass,maxErr:j,errors:[...Y.errors||[],...z.errors||[],...X.errors||[],...W.errors||[],...Z.errors||[]]}}async benchmarkGPTQ(e){const t=[{K:2048,N:2048,gs:128,label:"2048→2048 (Z/out_proj)"},{K:2048,N:6144,gs:128,label:"2048→6144 (QKV/gate+up)"},{K:6144,N:2048,gs:128,label:"6144→2048 (down_proj)"}],o=[1,2,4,8],s=50,r=3,a=10,d=[];for(const l of t){const{K:w,N:h,gs:F,label:n}=l,B=w/8,f=B/(F/8),p=new Float32Array(w);for(let b=0;b<w;b++)p[b]=Math.random()*2-1;const y=new Uint32Array(B*h);for(let b=0;b<y.length;b++)y[b]=Math.random()*4294967295>>>0;const k=new Float32Array(f*h);for(let b=0;b<k.length;b++)k[b]=Math.random()*.5;const u=this.cpuF32ToF16Packed(k),m=this.makeF32Buffer("bm_in",p),g=this.makeU32Buffer("bm_qw",y),i=this.makeU32Buffer("bm_sc",u),c=[];for(let b=0;b<s;b++)c.push(this.makeOutputBuffer(`bm_out_${b}`,h));const _=B*h*4+f*h*2;for(const b of o){if(b>1&&f%b!==0)continue;let N;if(b===1){const v=this.makeUniform([w,h,F]);for(let 
G=0;G<r;G++){this.gpu.beginBatch();for(let C=0;C<s;C++)this.dispatch("gptq_matvec",[m,g,i,c[C],v],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const T=[];for(let G=0;G<a;G++){await this.gpu.device.queue.onSubmittedWorkDone();const C=performance.now();this.gpu.beginBatch();for(let U=0;U<s;U++)this.dispatch("gptq_matvec",[m,g,i,c[U],v],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),T.push(performance.now()-C)}T.sort((G,C)=>G-C),N=T.slice(1,-1).reduce((G,C)=>G+C,0)/(T.length-2)/s}else{const v=[];for(let U=0;U<s;U++)v.push(this.makeOutputBuffer(`bm_part_${U}`,h*b));const T=this.makeUniform([w,h,F,b]),V=this.makeUniform([h,b]);for(let U=0;U<r;U++){this.gpu.beginBatch();for(let S=0;S<s;S++)this.dispatch("gptq_splitk",[m,g,i,v[S],T],Math.ceil(h/32),b),this.dispatch("reduce_splitk",[v[S],c[S],V],Math.ceil(h/256));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const G=[];for(let U=0;U<a;U++){await this.gpu.device.queue.onSubmittedWorkDone();const S=performance.now();this.gpu.beginBatch();for(let O=0;O<s;O++)this.dispatch("gptq_splitk",[m,g,i,v[O],T],Math.ceil(h/32),b),this.dispatch("reduce_splitk",[v[O],c[O],V],Math.ceil(h/256));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),G.push(performance.now()-S)}G.sort((U,S)=>U-S),N=G.slice(1,-1).reduce((U,S)=>U+S,0)/(G.length-2)/s}const x=_/1e9/(N/1e3),q=Math.ceil(h/32)*b,K={label:n,ns:b,wgs:q,avgMs:N.toFixed(4),bwGBs:x.toFixed(1)};d.push(K),e?.(K)}if(this.gpu.hasF16&&this.pipelines.gptq_matvec_f16){const b=this.makeUniform([w,h,F]);for(let T=0;T<r;T++){this.gpu.beginBatch();for(let V=0;V<s;V++)this.dispatch("gptq_matvec_f16",[m,g,i,c[V],b],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const N=[];for(let T=0;T<a;T++){await this.gpu.device.queue.onSubmittedWorkDone();const V=performance.now();this.gpu.beginBatch();for(let 
G=0;G<s;G++)this.dispatch("gptq_matvec_f16",[m,g,i,c[G],b],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),N.push(performance.now()-V)}N.sort((T,V)=>T-V);const q=N.slice(1,-1).reduce((T,V)=>T+V,0)/(N.length-2)/s,K=_/1e9/(q/1e3),v={label:n,ns:"f16",wgs:Math.ceil(h/32),avgMs:q.toFixed(4),bwGBs:K.toFixed(1)};d.push(v),e?.(v)}}return d}_makeNormParams(e,t=1e-6){const o=new ArrayBuffer(16),s=new DataView(o);return s.setUint32(0,e,!0),s.setFloat32(4,t,!0),this.gpu.createBufferFromData("norm_params",new Uint8Array(o),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}_cpuRMSNorm1PlusW(e,t,o,s=1e-6){let r=0;for(let l=0;l<o;l++)r+=e[l]*e[l];const a=1/Math.sqrt(r/o+s),d=new Float32Array(o);for(let l=0;l<o;l++){const w=t[Math.floor(l/2)]>>l%2*16&65535,h=this.bf16ToF32(w);d[l]=e[l]*a*(1+h)}return d}async testAddRMSNormB2(){const o=new Float32Array(32),s=new Float32Array(32);for(let u=0;u<32;u++)o[u]=Math.sin(u*.7)*2,s[u]=Math.cos(u*.3)*.5;const r=new Float32Array(16);for(let u=0;u<16;u++)r[u]=.1*(u+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(32),l=new Float32Array(32);for(let u=0;u<2;u++){const m=new Float32Array(16);for(let i=0;i<16;i++)m[i]=o[u*16+i]+s[u*16+i];for(let i=0;i<16;i++)l[u*16+i]=m[i];const g=this._cpuRMSNorm1PlusW(m,a,16,1e-6);d.set(g,u*16)}const w=this.makeF32Buffer("b2an_h",o),h=this.makeF32Buffer("b2an_a",s),F=this.makeU32Buffer("b2an_w",a),n=this.makeOutputBuffer("b2an_out",32),B=this._makeNormParams(16,1e-6);this.dispatch("add_rmsnorm_b2",[w,h,F,n,B],2);const f=await this.readback(n,32),p=await this.readback(w,32),y=this.compare(f,d,.001),k=this.compare(p,l,1e-6);return{pass:y.pass&&k.pass,maxErr:Math.max(y.maxErr,k.maxErr),errors:[...y.errors,...k.errors]}}async testAddRMSNormROB2(){const o=new Float32Array(32),s=new Float32Array(32);for(let f=0;f<32;f++)o[f]=Math.sin(f*.5)*3,s[f]=Math.cos(f*.8)*.3;const r=new Float32Array(16);for(let f=0;f<16;f++)r[f]=.05*(f+1);const a=this.cpuF32ToBF16Packed(r),d=new 
Float32Array(32);for(let f=0;f<2;f++){const p=new Float32Array(16);for(let k=0;k<16;k++)p[k]=o[f*16+k]+s[f*16+k];const y=this._cpuRMSNorm1PlusW(p,a,16,1e-6);d.set(y,f*16)}const l=this.makeF32Buffer("b2anro_h",o),w=this.makeF32Buffer("b2anro_a",s),h=this.makeU32Buffer("b2anro_w",a),F=this.makeOutputBuffer("b2anro_out",32),n=this._makeNormParams(16,1e-6);this.dispatch("add_rmsnorm_ro_b2",[l,w,h,F,n],2);const B=await this.readback(F,32);return this.compare(B,d,.001)}async testThreeWayAddRMSNormB2(){const o=new Float32Array(32),s=new Float32Array(32),r=new Float32Array(32);for(let i=0;i<32;i++)o[i]=Math.sin(i*.4)*2,s[i]=Math.cos(i*.6)*.5,r[i]=Math.sin(i*1.1)*.3;const a=new Float32Array(16);for(let i=0;i<16;i++)a[i]=.1*(i+1);const d=this.cpuF32ToBF16Packed(a),l=new Float32Array(32),w=new Float32Array(32);for(let i=0;i<2;i++){const c=new Float32Array(16);for(let b=0;b<16;b++)c[b]=o[i*16+b]+s[i*16+b]+r[i*16+b];l.set(c,i*16);const _=this._cpuRMSNorm1PlusW(c,d,16,1e-6);w.set(_,i*16)}const h=this.makeF32Buffer("b2twa_a",o),F=this.makeF32Buffer("b2twa_b",s),n=this.makeF32Buffer("b2twa_c",r),B=this.makeU32Buffer("b2twa_w",d),f=this.makeOutputBuffer("b2twa_hout",32),p=this.makeOutputBuffer("b2twa_normed",32),y=this._makeNormParams(16,1e-6);this.dispatch("three_way_add_rmsnorm_b2",[h,F,n,B,f,p,y],2);const k=await this.readback(p,32),u=await this.readback(f,32),m=this.compare(k,w,.001),g=this.compare(u,l,1e-6);return{pass:m.pass&&g.pass,maxErr:Math.max(m.maxErr,g.maxErr),errors:[...m.errors,...g.errors]}}_makeGPTQData(e,t,o){const s=e/8,r=new Uint32Array(s*t),a=new Float32Array(e*t);for(let h=0;h<s;h++)for(let F=0;F<t;F++){let n=0;for(let B=0;B<8;B++){const f=h*8+B,p=(f*3+F*7)%15;a[f*t+F]=p-8,n|=p<<B*4}r[h*t+F]=n}const d=e/o,l=new Float32Array(d*t);for(let h=0;h<d*t;h++)l[h]=.3+h*.15;const w=this.cpuF32ToF16Packed(l);return{qweight:r,rawWeights:a,scalesPacked:w}}_cpuGPTQMatvec(e,t,o,s,r,a,d=0){const l=new Float32Array(r);for(let w=0;w<r;w++){let h=0;for(let F=0;F<s;F++){const 
B=Math.floor(F/a)*r+w,f=o[Math.floor(B/2)]>>B%2*16&65535,p=this.f16ToF32(f);h+=e[d+F]*t[F*r+w]*p}l[w]=h}return l}async testFusedSiluGPTQB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,{qweight:s,rawWeights:r,scalesPacked:a}=this._makeGPTQData(e,t,o),d=new Float32Array(2*e),l=new Float32Array(2*e);for(let k=0;k<2*e;k++)d[k]=Math.sin(k*.3)*2,l[k]=Math.cos(k*.5)*1.5;const w=new Float32Array(2*t);for(let k=0;k<2;k++){const u=new Float32Array(e);for(let g=0;g<e;g++){const i=d[k*e+g];u[g]=i/(1+Math.exp(-i))*l[k*e+g]}const m=this._cpuGPTQMatvec(u,r,a,e,t,o);w.set(m,k*t)}const h=this.makeF32Buffer("b2sl_a",d),F=this.makeF32Buffer("b2sl_b",l),n=this.makeU32Buffer("b2sl_qw",s),B=this.makeU32Buffer("b2sl_sc",a),f=this.makeOutputBuffer("b2sl_out",2*t),p=this.makeUniform([e,t,o]);this.dispatch("fused_silu_gptq_b2_f16",[h,F,n,B,f,p],Math.ceil(t/32));const y=await this.readback(f,2*t);return this.compare(y,w,.05)}async testFusedSigmoidGPTQB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,{qweight:s,rawWeights:r,scalesPacked:a}=this._makeGPTQData(e,t,o),d=new Float32Array(2*e),l=new Float32Array(2*e);for(let k=0;k<2*e;k++)d[k]=Math.sin(k*.4)*1.5,l[k]=Math.cos(k*.7)*2;const w=new Float32Array(2*t);for(let k=0;k<2;k++){const u=new Float32Array(e);for(let g=0;g<e;g++){const i=1/(1+Math.exp(-l[k*e+g]));u[g]=i*d[k*e+g]}const m=this._cpuGPTQMatvec(u,r,a,e,t,o);w.set(m,k*t)}const h=this.makeF32Buffer("b2sg_x",d),F=this.makeF32Buffer("b2sg_g",l),n=this.makeU32Buffer("b2sg_qw",s),B=this.makeU32Buffer("b2sg_sc",a),f=this.makeOutputBuffer("b2sg_out",2*t),p=this.makeUniform([e,t,o]);this.dispatch("fused_sigmoid_gptq_b2_f16",[h,F,n,B,f,p],Math.ceil(t/32));const y=await this.readback(f,2*t);return this.compare(y,w,.05)}async testFusedGateUpSiLUB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,s=e/8,r=new Uint32Array(s*t),a=new Float32Array(e*t);for(let 
N=0;N<s;N++)for(let x=0;x<t;x++){let q=0;for(let K=0;K<8;K++){const v=N*8+K,T=(v*3+x*7)%15;a[v*t+x]=T-8,q|=T<<K*4}r[N*t+x]=q}const d=e/o,l=new Float32Array(d*t);for(let N=0;N<d*t;N++)l[N]=.01+N%8*.005;const w=this.cpuF32ToF16Packed(l),h=new Uint32Array(s*t),F=new Float32Array(e*t);for(let N=0;N<s;N++)for(let x=0;x<t;x++){let q=0;for(let K=0;K<8;K++){const v=N*8+K,T=(v*5+x*3)%15;F[v*t+x]=T-8,q|=T<<K*4}h[N*t+x]=q}const n=e/o,B=new Float32Array(n*t);for(let N=0;N<n*t;N++)B[N]=.01+N%8*.005;const f=this.cpuF32ToF16Packed(B),p=new Float32Array(2*e);for(let N=0;N<2*e;N++)p[N]=Math.sin(N*.2)*.5;const y=new Float32Array(2*t);for(let N=0;N<2;N++){const x=this._cpuGPTQMatvec(p,a,w,e,t,o,N*e),q=this._cpuGPTQMatvec(p,F,f,e,t,o,N*e);for(let K=0;K<t;K++){const v=x[K];y[N*t+K]=v/(1+Math.exp(-v))*q[K]}}const k=this.makeF32Buffer("b2gus_in",p),u=this.makeU32Buffer("b2gus_gqw",r),m=this.makeU32Buffer("b2gus_gsc",w),g=this.makeU32Buffer("b2gus_uqw",h),i=this.makeU32Buffer("b2gus_usc",f),c=this.makeOutputBuffer("b2gus_out",2*t),_=this.makeUniform([e,t,o]);this.dispatch("fused_gate_up_silu_b2_f16",[k,u,m,g,i,c,_],Math.ceil(t/32));const b=await this.readback(c,2*t);return this.compare(b,y,.05)}_makeGPTQMatmulB2Data(e,t,o){const s=e/8,r=new Float32Array(2*e);for(let n=0;n<e;n++)r[n]=Math.sin(n*.5)*.5;for(let n=0;n<e;n++)r[e+n]=Math.cos(n*.3)*.5;const a=new Uint32Array(s*t),d=new Float32Array(e*t);for(let n=0;n<s;n++)for(let B=0;B<t;B++){let f=0;for(let p=0;p<8;p++){const y=n*8+p,k=(y*3+B*7)%15;d[y*t+B]=k-8,f|=k<<p*4}a[n*t+B]=f}const l=e/o,w=new Float32Array(l*t);for(let n=0;n<l*t;n++)w[n]=.01+n%8*.005;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(2*t);for(let n=0;n<2;n++)for(let B=0;B<t;B++){let f=0;for(let p=0;p<e;p++){const k=Math.floor(p/o)*t+B,u=h[Math.floor(k/2)]>>k%2*16&65535,m=this.f16ToF32(u);f+=r[n*e+p]*d[p*t+B]*m}F[n*t+B]=f}return{input:r,qweight:a,scalesPacked:h,expected:F,K:e,N:t,gs:o}}async 
testGPTQMatmulB2(){const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(128,8,64),l=this.makeF32Buffer("b2_input",e),w=this.makeU32Buffer("b2_qw",t),h=this.makeU32Buffer("b2_sc",o),F=this.makeOutputBuffer("b2_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2",[l,w,h,F,n],Math.ceil(a/32));const B=await this.readback(F,2*a);return this.compare(B,s,.02)}async testGPTQMatmulB2F16(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(128,8,64),l=this.makeF32Buffer("b2f16_input",e),w=this.makeU32Buffer("b2f16_qw",t),h=this.makeU32Buffer("b2f16_sc",o),F=this.makeOutputBuffer("b2f16_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2_f16",[l,w,h,F,n],Math.ceil(a/32));const B=await this.readback(F,2*a);return this.compare(B,s,.05)}async testGPTQMatmulB2_4T_F16(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(256,16,64),l=this.makeF32Buffer("b2_4t_input",e),w=this.makeU32Buffer("b2_4t_qw",t),h=this.makeU32Buffer("b2_4t_sc",o),F=this.makeOutputBuffer("b2_4t_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2_4t_f16",[l,w,h,F,n],Math.ceil(a/8));const B=await this.readback(F,2*a);return this.compare(B,s,.05)}f32ToBF16(e){const t=new ArrayBuffer(4);return new Uint32Array(t)[0]=new Uint32Array(new Float32Array([e]).buffer)[0],new Uint32Array(t)[0]>>>16}unpackBF16(e,t){const o=e>>>t*16&65535;return this.bf16ToF32(o)}}function $(I){const e=document.getElementById("log");e.textContent+=I+`
 `,e.scrollTop=e.scrollHeight}document.querySelectorAll(".tab").forEach(I=>{I.addEventListener("click",()=>{document.querySelectorAll(".tab").forEach(e=>e.classList.remove("active")),document.querySelectorAll(".panel").forEach(e=>e.classList.remove("active")),I.classList.add("active"),document.getElementById(I.dataset.panel).classList.add("active")})});document.getElementById("runTests").addEventListener("click",async()=>{const I=document.getElementById("runTests"),e=document.getElementById("testResults"),t=document.getElementById("testSummary");I.disabled=!0,e.innerHTML="",t.innerHTML="",$("Initializing WebGPU for tests...");try{const o=new ot;await o.init(),$(`WebGPU initialized. Running tests...
 `);let s=0,r=0;await o.runAll(a=>{const d=a.pass?"✓":"✗",l=a.pass?"pass":"fail";a.pass?s++:r++;const w=document.createElement("div");if(w.className="test-row",w.innerHTML=`
             <span class="test-icon ${l}">${d}</span>
           </div>
         `,$(`
 Done: ${s} passed, ${r} failed`),o.destroy()}catch(o){$(`Error: ${o.message}
+${o.stack}`)}finally{I.disabled=!1}});document.getElementById("loadAndProfile").addEventListener("click",async()=>{const I=document.getElementById("loadAndProfile"),e=document.getElementById("profStatus"),t=document.getElementById("profCategories"),o=document.getElementById("profTopOps");I.disabled=!0,t.innerHTML="",o.innerHTML="";try{const s=document.getElementById("profRepo").value.trim();e.textContent="Loading model...",$("Profiler: loading model from "+s);const{GPUContext:r}=await et(async()=>{const{GPUContext:c}=await import("./gpu-ops-PQDFq1iI.js").then(_=>_.g);return{GPUContext:c}},[],import.meta.url),{Qwen35Model:a}=await et(async()=>{const{Qwen35Model:c}=await import("./qwen35-model-CmeFfImT.js");return{Qwen35Model:c}},__vite__mapDeps([0,1]),import.meta.url),{loadModelWeights:d,loadConfig:l,loadQuantConfig:w}=await et(async()=>{const{loadModelWeights:c,loadConfig:_,loadQuantConfig:b}=await import("./safetensors-loader-CwGm5mJX.js");return{loadModelWeights:c,loadConfig:_,loadQuantConfig:b}},[],import.meta.url),{Profiler:h}=await et(async()=>{const{Profiler:c}=await import("./profiler-DYUyiq-B.js");return{Profiler:c}},[],import.meta.url),F=new r;await F.init(),e.textContent="Fetching config...";const n=await l(s),B=await w(s);e.textContent="Downloading weights (this may take a while)...";const f=await d(s,c=>{if(c.phase==="downloading"){const _=c.total?(c.loaded/c.total*100).toFixed(0):"?";e.textContent=`Downloading: ${c.filesLoaded}/${c.filesTotal} shards (${_}%)`}else c.phase==="parsing"&&(e.textContent=`Parsing ${c.file}...`)});e.textContent="Uploading to GPU...";const p=new a(F,n,B);p.compilePipelines(),p.loadWeights(f),p.initBuffers(),e.textContent="Warming up...",await p.forward(1,0);const y=p.textCfg.linear_num_key_heads,k=p.textCfg.linear_key_head_dim,u=p.textCfg.linear_value_head_dim,m=y*(k+k+u);for(let c=0;c<p.numLayers;c++)p.layerTypes[c]==="linear_attention"&&(F.device.queue.writeBuffer(p.linState[c],0,new 
ArrayBuffer(y*k*u*4)),F.device.queue.writeBuffer(p.linConvHist[c],0,new ArrayBuffer(3*m*4)));p.seqLen=0,e.textContent="Profiling forward pass...",$("Profiling single forward pass...");const i=await new h(p).profileForward(1,0);e.textContent=`Done! Estimated: ${i.estimatedTokPerSec} tok/s (sync overhead inflates times)`,$(`
 Profile complete: ${i.total.toFixed(1)}ms total (~${i.estimatedTokPerSec} tok/s with sync overhead)`),$(`
 Category breakdown:`);for(const c of i.categories){const _=document.createElement("div");_.className="prof-row";const b=c.name.includes("matvec")||c.name==="lm_head"?"var(--orange)":c.name.includes("attention")||c.name.includes("deltanet")?"var(--blue)":"var(--green)";_.innerHTML=`
             <span class="prof-name">${c.name}</span>

index.html CHANGED Viewed

@@ -135,9 +135,9 @@
     .toast-error { border-color: #ef4444; color: #ef4444; }
     .toast-success { border-color: var(--accent); color: var(--accent); }
   </style>
-  <script type="module" crossorigin src="./assets/main-DTfqdn80.js"></script>
-  <link rel="modulepreload" crossorigin href="./assets/gpu-ops-Bq_PFJSE.js">
-  <link rel="modulepreload" crossorigin href="./assets/qwen35-model-D0qiY8Dx.js">
   <link rel="modulepreload" crossorigin href="./assets/safetensors-loader-CwGm5mJX.js">
 </head>
 <body>

     .toast-error { border-color: #ef4444; color: #ef4444; }
     .toast-success { border-color: var(--accent); color: var(--accent); }
   </style>
+  <script type="module" crossorigin src="./assets/main-D60Okk_s.js"></script>
+  <link rel="modulepreload" crossorigin href="./assets/gpu-ops-PQDFq1iI.js">
+  <link rel="modulepreload" crossorigin href="./assets/qwen35-model-CmeFfImT.js">
   <link rel="modulepreload" crossorigin href="./assets/safetensors-loader-CwGm5mJX.js">
 </head>
 <body>

test.html CHANGED Viewed

@@ -41,8 +41,8 @@
     button.run-btn:hover:not(:disabled) { filter: brightness(1.15); }
     #log { margin-top: 12px; padding: 12px; background: var(--bg2); border: 1px solid var(--border); border-radius: 8px; max-height: 300px; overflow-y: auto; white-space: pre-wrap; font-size: 0.7rem; color: var(--dim); }
   </style>
-  <script type="module" crossorigin src="./assets/test-BQCz-9iM.js"></script>
-  <link rel="modulepreload" crossorigin href="./assets/gpu-ops-Bq_PFJSE.js">
 </head>
 <body>
   <h1>TensorBend Shader Tests & Profiler</h1>

     button.run-btn:hover:not(:disabled) { filter: brightness(1.15); }
     #log { margin-top: 12px; padding: 12px; background: var(--bg2); border: 1px solid var(--border); border-radius: 8px; max-height: 300px; overflow-y: auto; white-space: pre-wrap; font-size: 0.7rem; color: var(--dim); }
   </style>
+  <script type="module" crossorigin src="./assets/test-BEFPr_G8.js"></script>
+  <link rel="modulepreload" crossorigin href="./assets/gpu-ops-PQDFq1iI.js">
 </head>
 <body>
   <h1>TensorBend Shader Tests & Profiler</h1>