Spaces:
Running
Running
Fix boot regression: render synchronously, probe GPU in background
Browse files
- App now renders immediately without waiting for GPU adapter probe
- WebGPU check is non-blocking: shows warning banner instead of blocking UI
- Hardware-adaptive model selection updates input after probe completes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
assets/{gpu-ops-PQDFq1iI.js → gpu-ops-DKsrMEcC.js}
RENAMED
|
@@ -3364,8 +3364,8 @@ struct Params { N: u32, eps: f32, }
|
|
| 3364 |
@group(0) @binding(4) var<uniform> params: Params;
|
| 3365 |
|
| 3366 |
fn unpack_bf16(packed: u32, idx: u32) -> f32 {
|
| 3367 |
-
let
|
| 3368 |
-
return bitcast<f32>(
|
| 3369 |
}
|
| 3370 |
|
| 3371 |
var<workgroup> shared_sum: array<f32, 256>;
|
|
@@ -3448,9 +3448,8 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
|
|
| 3448 |
let b = bitcast<f32>(select((bp & 0xFFFFu) << 16u, bp & 0xFFFF0000u, (col & 1u) == 1u));
|
| 3449 |
output[token * N + col] = sum + b;
|
| 3450 |
}`,Rr=`
|
| 3451 |
-
@group(0) @binding(0) var<storage,
|
| 3452 |
-
@group(0) @binding(1) var<
|
| 3453 |
-
@group(0) @binding(2) var<uniform> len: u32;
|
| 3454 |
|
| 3455 |
const SQRT_2_OVER_PI: f32 = 0.7978845608;
|
| 3456 |
const COEFF: f32 = 0.044715;
|
|
@@ -3459,13 +3458,12 @@ const COEFF: f32 = 0.044715;
|
|
| 3459 |
fn main(@builtin(global_invocation_id) gid: vec3u) {
|
| 3460 |
let i = gid.x;
|
| 3461 |
if (i >= len) { return; }
|
| 3462 |
-
let x =
|
| 3463 |
let inner = SQRT_2_OVER_PI * (x + COEFF * x * x * x);
|
| 3464 |
-
|
| 3465 |
}`,Kr=`
|
| 3466 |
-
@group(0) @binding(0) var<storage,
|
| 3467 |
-
@group(0) @binding(1) var<
|
| 3468 |
-
@group(0) @binding(2) var<uniform> len: u32;
|
| 3469 |
|
| 3470 |
const SQRT_2_OVER_PI: f32 = 0.7978845608;
|
| 3471 |
const COEFF: f32 = 0.044715;
|
|
@@ -3474,17 +3472,19 @@ const COEFF: f32 = 0.044715;
|
|
| 3474 |
fn main(@builtin(global_invocation_id) gid: vec3u) {
|
| 3475 |
let i = gid.x;
|
| 3476 |
if (i >= len) { return; }
|
| 3477 |
-
let x =
|
| 3478 |
let inner = SQRT_2_OVER_PI * (x + COEFF * x * x * x);
|
| 3479 |
-
|
| 3480 |
}`,Ar=`
|
| 3481 |
struct Params { seq_len: u32, num_heads: u32, head_dim: u32, }
|
| 3482 |
|
| 3483 |
-
@group(0) @binding(0) var<storage,
|
| 3484 |
-
@group(0) @binding(1) var<storage,
|
| 3485 |
-
@group(0) @binding(2) var<storage,
|
| 3486 |
-
@group(0) @binding(3) var<storage,
|
| 3487 |
-
@group(0) @binding(4) var<
|
|
|
|
|
|
|
| 3488 |
|
| 3489 |
@compute @workgroup_size(256)
|
| 3490 |
fn main(@builtin(global_invocation_id) gid: vec3u) {
|
|
@@ -3507,13 +3507,8 @@ fn main(@builtin(global_invocation_id) gid: vec3u) {
|
|
| 3507 |
let partner_idx = head_and_seq * params.head_dim + partner_d;
|
| 3508 |
let sign = select(-1.0, 1.0, d >= half_dim);
|
| 3509 |
|
| 3510 |
-
|
| 3511 |
-
|
| 3512 |
-
q[idx] = q_val * c + sign * q_partner * s;
|
| 3513 |
-
|
| 3514 |
-
let k_val = k[idx];
|
| 3515 |
-
let k_partner = k[partner_idx];
|
| 3516 |
-
k[idx] = k_val * c + sign * k_partner * s;
|
| 3517 |
}`,Br=`
|
| 3518 |
struct Params { seq_len: u32, num_heads: u32, head_dim: u32, scale: f32, }
|
| 3519 |
|
|
|
|
| 3364 |
@group(0) @binding(4) var<uniform> params: Params;
|
| 3365 |
|
| 3366 |
fn unpack_bf16(packed: u32, idx: u32) -> f32 {
|
| 3367 |
+
let bits = (packed >> (idx * 16u)) & 0xFFFFu;
|
| 3368 |
+
return bitcast<f32>(bits << 16u);
|
| 3369 |
}
|
| 3370 |
|
| 3371 |
var<workgroup> shared_sum: array<f32, 256>;
|
|
|
|
| 3448 |
let b = bitcast<f32>(select((bp & 0xFFFFu) << 16u, bp & 0xFFFF0000u, (col & 1u) == 1u));
|
| 3449 |
output[token * N + col] = sum + b;
|
| 3450 |
}`,Rr=`
|
| 3451 |
+
@group(0) @binding(0) var<storage, read_write> data: array<f32>;
|
| 3452 |
+
@group(0) @binding(1) var<uniform> len: u32;
|
|
|
|
| 3453 |
|
| 3454 |
const SQRT_2_OVER_PI: f32 = 0.7978845608;
|
| 3455 |
const COEFF: f32 = 0.044715;
|
|
|
|
| 3458 |
fn main(@builtin(global_invocation_id) gid: vec3u) {
|
| 3459 |
let i = gid.x;
|
| 3460 |
if (i >= len) { return; }
|
| 3461 |
+
let x = data[i];
|
| 3462 |
let inner = SQRT_2_OVER_PI * (x + COEFF * x * x * x);
|
| 3463 |
+
data[i] = 0.5 * x * (1.0 + tanh(inner));
|
| 3464 |
}`,Kr=`
|
| 3465 |
+
@group(0) @binding(0) var<storage, read_write> data: array<f32>;
|
| 3466 |
+
@group(0) @binding(1) var<uniform> len: u32;
|
|
|
|
| 3467 |
|
| 3468 |
const SQRT_2_OVER_PI: f32 = 0.7978845608;
|
| 3469 |
const COEFF: f32 = 0.044715;
|
|
|
|
| 3472 |
fn main(@builtin(global_invocation_id) gid: vec3u) {
|
| 3473 |
let i = gid.x;
|
| 3474 |
if (i >= len) { return; }
|
| 3475 |
+
let x = data[i];
|
| 3476 |
let inner = SQRT_2_OVER_PI * (x + COEFF * x * x * x);
|
| 3477 |
+
data[i] = 0.5 * x * (1.0 + tanh(inner));
|
| 3478 |
}`,Ar=`
|
| 3479 |
struct Params { seq_len: u32, num_heads: u32, head_dim: u32, }
|
| 3480 |
|
| 3481 |
+
@group(0) @binding(0) var<storage, read> q_in: array<f32>;
|
| 3482 |
+
@group(0) @binding(1) var<storage, read> k_in: array<f32>;
|
| 3483 |
+
@group(0) @binding(2) var<storage, read_write> q_out: array<f32>;
|
| 3484 |
+
@group(0) @binding(3) var<storage, read_write> k_out: array<f32>;
|
| 3485 |
+
@group(0) @binding(4) var<storage, read> cos_buf: array<f32>;
|
| 3486 |
+
@group(0) @binding(5) var<storage, read> sin_buf: array<f32>;
|
| 3487 |
+
@group(0) @binding(6) var<uniform> params: Params;
|
| 3488 |
|
| 3489 |
@compute @workgroup_size(256)
|
| 3490 |
fn main(@builtin(global_invocation_id) gid: vec3u) {
|
|
|
|
| 3507 |
let partner_idx = head_and_seq * params.head_dim + partner_d;
|
| 3508 |
let sign = select(-1.0, 1.0, d >= half_dim);
|
| 3509 |
|
| 3510 |
+
q_out[idx] = q_in[idx] * c + sign * q_in[partner_idx] * s;
|
| 3511 |
+
k_out[idx] = k_in[idx] * c + sign * k_in[partner_idx] * s;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3512 |
}`,Br=`
|
| 3513 |
struct Params { seq_len: u32, num_heads: u32, head_dim: u32, scale: f32, }
|
| 3514 |
|
assets/{main-BKlPaJuO.js → main-DkVqMqJQ.js}
RENAMED
|
The diff for this file is too large to render.
See raw diff
|
|
|
assets/qwen35-model-CmeFfImT.js
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
import{S as H,a as W,_ as A}from"./gpu-ops-PQDFq1iI.js";function K(E){const a=E<<16,e=new ArrayBuffer(4);new Uint32Array(e)[0]=a,new Float32Array(e)[0];const t=a>>>31&1,s=a>>>23&255,i=a&8388607;if(s===0)return t<<15;if(s===255)return t<<15|31744|(i?512:0);const n=s-127+15;return n>=31?t<<15|31744:n<=0?t<<15:t<<15|n<<10|i>>>13}class N{constructor(a,e,t){if(this.gpu=a,this.config=e,this.textCfg=e.text_config,this.quantConfig=t,this.hiddenSize=this.textCfg.hidden_size,this.intermediateSize=this.textCfg.intermediate_size,this.numLayers=this.textCfg.num_hidden_layers,this.numHeads=this.textCfg.num_attention_heads,this.numKVHeads=this.textCfg.num_key_value_heads,this.headDim=this.textCfg.head_dim,this.vocabSize=this.textCfg.vocab_size,this.rmsEps=this.textCfg.rms_norm_eps,this.ropeTheta=this.textCfg.rope_parameters?.rope_theta||1e7,this.partialRotary=this.textCfg.partial_rotary_factor,this.partialDim=Math.floor(this.headDim*this.partialRotary),this.mropeSection=this.textCfg.rope_parameters?.mrope_section||[11,11,10],this.layerTypes=this.textCfg.layer_types,this.groupSize=t?.group_size||t?.config_groups?.group_0?.weights?.group_size||128,this.fp16Layers=new Set,t?.extra_config)for(const u of Object.keys(t.extra_config))this.fp16Layers.add(u);this.weights={},this.kvCache={},this.seqLen=0,this.pipelines={},this.singlePassMode=!0,this._replayFlat=null,this._forceMinSplits=0,this._ropeDelta=0;const s=this.numHeads*this.headDim,i=this.numKVHeads*this.headDim,n=(s+i)/2,r=Math.ceil(n/4);this._splitQKNormShaderKey=`fused_split_qknorm_kvstore_${r}`,H[this._splitQKNormShaderKey]||(H[this._splitQKNormShaderKey]=W(r))}compilePipelines(){const a=this.gpu;for(const[e,t]of Object.entries(H))this.pipelines[e]=a.getOrCreatePipeline(e,t)}uploadTensors(a){this.quantConfig?.quant_method==="compressed-tensors"&&(a=this._convertCompressedTensors(a));for(const[t,s]of Object.entries(a)){let 
i=s.data;if(s.dtype==="F32"&&!t.endsWith(".qweight")&&!t.endsWith(".qzeros")&&!t.endsWith(".scales")){const n=new Float32Array(i.buffer,i.byteOffset,i.byteLength/4),r=new Uint16Array(n.length);for(let u=0;u<n.length;u++){const o=new Uint32Array(n.buffer,n.byteOffset+u*4,1)[0];r[u]=o>>>16}i=new Uint8Array(r.buffer)}if(s._partial){let{offset:n,totalSize:r}=s._partial;if(s.dtype==="F32"&&!t.endsWith(".qweight")&&!t.endsWith(".qzeros")&&!t.endsWith(".scales")&&(n/=2,r/=2),n===0){const o=this.gpu.createBuffer(t,r,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC);this.gpu.device.queue.writeBuffer(o,0,i),this.weights[t]=o}else{const o=this.weights[t];o&&this.gpu.device.queue.writeBuffer(o,n,i)}}else this.weights[t]=this.gpu.createBufferFromData(t,i);(t.includes("post_attention_layernorm.weight")||t.includes("input_layernorm.weight")||t.includes("q_norm.weight")||t.includes("k_norm.weight")||t==="model.language_model.norm.weight")&&(this._normWeightRaw||(this._normWeightRaw={}),this._normWeightRaw[t]=new Uint32Array(i.buffer.slice(i.byteOffset,i.byteOffset+i.byteLength))),this._uploadedCount=(this._uploadedCount||0)+1}}_convertCompressedTensors(a){const e={};for(const[t,s]of Object.entries(a))if(!t.endsWith(".weight_shape")){if(t.endsWith(".weight_packed")){const i=t.slice(0,-14),n=s.shape,r=n[0],u=n[1],o=new Int32Array(s.data.buffer,s.data.byteOffset,s.data.byteLength/4),h=new Int32Array(r*u);for(let m=0;m<r;m++)for(let c=0;c<u;c++)h[c*r+m]=o[m*u+c];e[`${i}.qweight`]={dtype:"I32",shape:[u,r],data:new Uint8Array(h.buffer)};continue}if(t.endsWith(".weight_scale")){const i=t.slice(0,-13),n=s.shape,r=n[0],u=n[1],o=new Uint16Array(s.data.buffer,s.data.byteOffset,s.data.byteLength/2),h=Math.ceil(r/2),m=new Uint32Array(u*h);for(let c=0;c<u;c++)for(let f=0;f<r;f+=2){const p=o[f*u+c],w=f+1<r?o[(f+1)*u+c]:0,B=K(p),q=K(w);m[c*h+(f>>1)]=B|q<<16}e[`${i}.scales`]={dtype:"I32",shape:[u,h],data:new Uint8Array(m.buffer)};continue}e[t]=s}return e}async 
postProcessWeights(){const a=this.textCfg.linear_num_key_heads||0,e=this.hiddenSize,t=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,s=this.layerTypes.indexOf("linear_attention"),i=s>=0?`model.language_model.layers.${s}.linear_attn`:"";if(this.abQuantized=i&&!!this.weights[`${i}.in_proj_a.qweight`],this.abQuantized){const g=this.textCfg.linear_num_value_heads??a;this.linAlpha=this.gpu.createBuffer("lin_alpha",g*4,t),this.linBeta=this.gpu.createBuffer("lin_beta",g*4,t)}else{this.linABWeight={};const g=this.textCfg.linear_num_value_heads??a,P=e*2,S=g*P,b=2*S;for(let v=0;v<this.numLayers;v++)if(this.layerTypes[v]==="linear_attention"){const C=`model.language_model.layers.${v}.linear_attn`,D=this.weights[`${C}.in_proj_a.weight`],k=this.weights[`${C}.in_proj_b.weight`];if(D&&k){const z=this.gpu.createBuffer(`ab_merged_${v}`,b,t),M=this.gpu.device.createCommandEncoder();M.copyBufferToBuffer(D,0,z,0,S),M.copyBufferToBuffer(k,0,z,S,S),this.gpu.device.queue.submit([M.finish()]),this.linABWeight[v]=z}}}{const g=[];for(let b=0;b<this.numLayers;b++){if(this.layerTypes[b]==="linear_attention"){const v=`model.language_model.layers.${b}.linear_attn`,C=this.textCfg.linear_num_key_heads||0,D=this.textCfg.linear_key_head_dim||128,k=this.textCfg.linear_value_head_dim||128,z=this.textCfg.linear_num_value_heads??C,M=z*k,$=z/C*k,O=C*(D+D+$);g.push({prefix:`${v}.in_proj_qkv`,K:e,N:O}),g.push({prefix:`${v}.in_proj_z`,K:e,N:M}),g.push({prefix:`${v}.out_proj`,K:M,N:e})}else{const 
v=`model.language_model.layers.${b}.self_attn`,C=this.numHeads*this.headDim*2,D=this.numKVHeads*this.headDim;g.push({prefix:`${v}.q_proj`,K:e,N:C}),g.push({prefix:`${v}.k_proj`,K:e,N:D}),g.push({prefix:`${v}.v_proj`,K:e,N:D}),g.push({prefix:`${v}.o_proj`,K:this.numHeads*this.headDim,N:e})}g.push({prefix:`model.language_model.layers.${b}.mlp.gate_proj`,K:e,N:this.intermediateSize}),g.push({prefix:`model.language_model.layers.${b}.mlp.up_proj`,K:e,N:this.intermediateSize}),g.push({prefix:`model.language_model.layers.${b}.mlp.down_proj`,K:this.intermediateSize,N:e})}let P=0;const S=performance.now();for(const{prefix:b,K:v,N:C}of g)if(!this.weights[`${b}.qweight`]&&this.weights[`${b}.weight`]){const{qweight:D,scales:k}=await this._quantizeBF16ToINT4(this.weights[`${b}.weight`],v,C,this.groupSize,b.replace(/\./g,"_"));this.weights[`${b}.qweight`]=D,this.weights[`${b}.scales`]=k,P++}P>0&&console.log(`[QUANT] GPU-quantized ${P} BF16 projections to INT4 in ${(performance.now()-S).toFixed(0)}ms`)}this._mergedGateUp={};const n=this.intermediateSize,r=e/8,u=r*n*4,h=r/(this.groupSize/8)*n*2;for(let g=0;g<this.numLayers;g++){const P=`model.language_model.layers.${g}.mlp`,S=this.getQWeight(`${P}.gate_proj`),b=this.getQWeight(`${P}.up_proj`);if(S.qweight&&b.qweight){const v=this.gpu.createBuffer(`merged_qw_${g}`,u*2,t),C=this.gpu.createBuffer(`merged_sc_${g}`,h*2,t),D=this.gpu.device.createCommandEncoder();D.copyBufferToBuffer(S.qweight,0,v,0,u),D.copyBufferToBuffer(b.qweight,0,v,u,u),D.copyBufferToBuffer(S.scales,0,C,0,h),D.copyBufferToBuffer(b.scales,0,C,h,h),this.gpu.device.queue.submit([D.finish()]),this._mergedGateUp[g]={qweight:v,scales:C}}}this._fusedMLPParams={};const m=16+512*16;for(let g=0;g<this.numLayers;g++){const P=`model.language_model.layers.${g}.post_attention_layernorm.weight`,S=this._normWeightRaw?.[P];if(!S||!this._mergedGateUp[g])continue;const b=new ArrayBuffer(m),v=new Uint32Array(b),C=new 
Float32Array(b);v[0]=e,v[1]=n,v[2]=this.groupSize,C[3]=this.rmsEps;for(let D=0;D<S.length;D++)v[4+D]=S[D];this._fusedMLPParams[g]=this.gpu.createBufferFromData(`fused_mlp_params_${g}`,new Uint32Array(b),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}this._fusedSQKParams={};const c=this.headDim,f=this.numHeads,p=this.numKVHeads,w=f*c,B=p*c,q=(w+B)/2,_=Math.ceil(q/4),l=32+_*16,d=this.mropeSection[1]*3,y=this.mropeSection[2]*3,U=`fused_split_qknorm_kvstore_${_}`;H[U]||(H[U]=W(_,this.ropeTheta,d,y,this.partialDim)),this.pipelines[U]||(this.pipelines[U]=this.gpu.getOrCreatePipeline(U,H[U])),this._splitQKNormShaderKey=U;for(let g=0;g<this.numLayers;g++){if(this.layerTypes[g]!=="full_attention")continue;const P=`model.language_model.layers.${g}.self_attn`,S=`${P}.q_norm.weight`,b=`${P}.k_norm.weight`,v=this._normWeightRaw?.[S],C=this._normWeightRaw?.[b],D=new ArrayBuffer(l),k=new DataView(D);if(k.setUint32(0,f,!0),k.setUint32(4,p,!0),k.setUint32(8,c,!0),k.setFloat32(12,this.rmsEps,!0),k.setUint32(16,0,!0),k.setUint32(20,0,!0),k.setUint32(24,0,!0),k.setUint32(28,0,!0),v)for(let M=0;M<w/2;M++){const $=Math.floor(M/4),O=M%4;k.setUint32(32+$*16+O*4,v[M],!0)}if(C){const M=w/2;for(let $=0;$<B/2;$++){const O=M+$,G=Math.floor(O/4),R=O%4;k.setUint32(32+G*16+R*4,C[$],!0)}}const z=this.gpu.device.createBuffer({size:l,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:`fused_sqk_${g}`});this.gpu.device.queue.writeBuffer(z,0,new Uint8Array(D)),this._fusedSQKParams[g]=z}await this._quantizeLmHead()}async _quantizeLmHead(){const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight";if(!this.weights[e])return;const t=this.hiddenSize,s=this.vocabSize,i=this.groupSize,n=t/8,r=t/i,u=performance.now(),o=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,h=this.weights[e];this._lmHeadQWeight=this.gpu.createBuffer("lmhead_qweight",n*s*4,o);const 
m=this.gpu.createBuffer("lmhead_scales_f32",r*s*4,o),c=Math.ceil(r*s/2)*4;this._lmHeadScales=this.gpu.createBuffer("lmhead_scales",c,o);const f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await A(async()=>{const{SHADERS:D}=await import("./gpu-ops-PQDFq1iI.js").then(k=>k.b);return{SHADERS:D}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),p=this.gpu.createBufferFromData("quant_params",new Uint32Array([t,s,i]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),w=this.gpu.createBindGroup(f,0,[h,this._lmHeadQWeight,m,p]),B=65535,q=Math.min(s,B),_=Math.ceil(s/B),l=this.gpu.device.createCommandEncoder(),d=l.beginComputePass();d.setPipeline(f),d.setBindGroup(0,w),d.dispatchWorkgroups(q,_),d.end(),this.gpu.device.queue.submit([l.finish()]);const y=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await A(async()=>{const{SHADERS:D}=await import("./gpu-ops-PQDFq1iI.js").then(k=>k.b);return{SHADERS:D}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),U=Math.ceil(r*s/2),g=this.gpu.createBufferFromData("pack_params",new Uint32Array([U]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),P=this.gpu.createBindGroup(y,0,[m,this._lmHeadScales,g]),S=this.gpu.device.createCommandEncoder(),b=S.beginComputePass();b.setPipeline(y),b.setBindGroup(0,P),b.dispatchWorkgroups(Math.ceil(U/256)),b.end(),this.gpu.device.queue.submit([S.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),m.destroy(),p.destroy(),g.destroy();const v=(n*s*4/1e6).toFixed(0),C=(c/1e6).toFixed(0);console.log(`[QUANT] lm_head INT4 (GPU): ${(performance.now()-u).toFixed(0)}ms, ${v}MB qw + ${C}MB sc`)}async _quantizeBF16ToINT4(a,e,t,s,i){const n=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,r=e/8,u=e/s,o=this.gpu.createBuffer(`${i}_qweight`,r*t*4,n),h=this.gpu.createBuffer(`${i}_scales_f32`,u*t*4,n),m=Math.ceil(u*t/2)*4,c=this.gpu.createBuffer(`${i}_scales`,m,n),f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await A(async()=>{const{SHADERS:v}=await 
import("./gpu-ops-PQDFq1iI.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),p=this.gpu.createBufferFromData(`${i}_qp`,new Uint32Array([e,t,s]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),w=this.gpu.createBindGroup(f,0,[a,o,h,p]),B=65535,q=Math.min(t,B),_=Math.ceil(t/B),l=this.gpu.device.createCommandEncoder(),d=l.beginComputePass();d.setPipeline(f),d.setBindGroup(0,w),d.dispatchWorkgroups(q,_),d.end(),this.gpu.device.queue.submit([l.finish()]);const y=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await A(async()=>{const{SHADERS:v}=await import("./gpu-ops-PQDFq1iI.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),U=Math.ceil(u*t/2),g=this.gpu.createBufferFromData(`${i}_pp`,new Uint32Array([U]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),P=this.gpu.createBindGroup(y,0,[h,c,g]),S=this.gpu.device.createCommandEncoder(),b=S.beginComputePass();return b.setPipeline(y),b.setBindGroup(0,P),b.dispatchWorkgroups(Math.ceil(U/256)),b.end(),this.gpu.device.queue.submit([S.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),h.destroy(),p.destroy(),g.destroy(),{qweight:o,scales:c}}initBuffers(a){a||(a=this.textCfg.max_position_embeddings||4096),this.maxSeqLen=a;const 
e=this.gpu,t=this.hiddenSize,s=this.intermediateSize,i=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.hidden=e.createBuffer("hidden_a",t*4,i),this.hiddenB=e.createBuffer("hidden_b",t*4,i),this.normed=e.createBuffer("normed",t*4,i),this.normedB=e.createBuffer("normed_b",t*4,i),this.mlpIntermediate=e.createBuffer("mlp_inter",s*4,i),this.mlpOut=e.createBuffer("mlp_out",t*4,i),this.logits=e.createBuffer("logits",this.vocabSize*4,i),this.argmaxResult=e.createBuffer("argmax_result",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this._argmaxReadback=e.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"argmax_readback"}),this._topkResult=e.createBuffer("topk_result",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this._repMaxTokens=64,this._recentTokens=new Uint32Array(this._repMaxTokens),this._recentTokenCount=0,this._gpuRecentTokens=e.createBuffer("recent_tokens",this._repMaxTokens*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST);for(let q=0;q<this.numLayers;q++)this.layerTypes[q]==="full_attention"&&(this.kvCache[q]={keys:e.createBuffer(`kv_k_${q}`,a*this.numKVHeads*this.headDim*4),values:e.createBuffer(`kv_v_${q}`,a*this.numKVHeads*this.headDim*4)});this.qProjFull=e.createBuffer("q_proj_full",this.numHeads*this.headDim*2*4,i),this.qProj=e.createBuffer("q_proj_out",this.numHeads*this.headDim*4,i),this.qGate=e.createBuffer("q_gate",this.numHeads*this.headDim*4,i),this.kProj=e.createBuffer("k_proj_out",this.numKVHeads*this.headDim*4,i),this.vProj=e.createBuffer("v_proj_out",this.numKVHeads*this.headDim*4,i);const n=this.textCfg.linear_num_key_heads,r=this.textCfg.linear_key_head_dim,u=this.textCfg.linear_value_head_dim,o=this.textCfg.linear_num_value_heads??n,m=o/n*u,c=n*(r+r+m),f=o*u;this.linValueDim=f,this.linValueHeads=o,this.linQKV=e.createBuffer("lin_qkv",c*4,i),this.linZ=e.createBuffer("lin_z",f*4,i),this.linOut=e.createBuffer("lin_out",f*4,i);const 
p=Math.max(t,this.numHeads*this.headDim,f)*4;this.attnOut=e.createBuffer("attn_out",p,i),this._maxGqaSplits=64;const w=this.numHeads*this._maxGqaSplits*(this.headDim+2)*4;this._gqaPartials=e.createBuffer("gqa_partials",w,i),this.linState={},this.linConvHist={};for(let q=0;q<this.numLayers;q++)this.layerTypes[q]==="linear_attention"&&(this.linState[q]=e.createBuffer(`lin_state_${q}`,n*r*m*4,i),this.linConvHist[q]=e.createBuffer(`lin_conv_hist_${q}`,3*c*4,i));this.zeroBuf=e.createBuffer("zero_buf",t*4,i),this.useSplitK=!1,this.splitKSplits=1;const B=Math.max(t,s);this.splitKPartials=e.createBuffer("splitk_partials",B*this.splitKSplits*4,i),this.paramBufs={},this._gqaParamBuf=e.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"gqa_params"}),this._kvStoreParamBuf=e.device.createBuffer({size:16,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"kv_store_params"}),this._gqaData=new Uint8Array(32),this._gqaDv=new DataView(this._gqaData.buffer),this._kvData=new Uint8Array(16),this._kvDv=new DataView(this._kvData.buffer)}initVision(){const a=this.config.vision_config;if(!a){this.vision=null;return}const 
e=this.gpu,t=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,s=a.hidden_size,i=a.intermediate_size,n=a.out_hidden_size,r=a.depth,u=a.num_heads,o=s/u,h=a.patch_size,m=a.temporal_patch_size,c=a.spatial_merge_size,f=3*m*h*h,p=4096,w=s*c*c;this.vision={V:s,Vi:i,Vo:n,depth:r,heads:u,headDim:o,patchSize:h,temporalPatchSize:m,mergeSize:c,patchInputDim:f,maxVitTokens:p,mergedHidden:w,numPosEmbeddings:a.num_position_embeddings,numGridPerSide:Math.round(Math.sqrt(a.num_position_embeddings)),hidden:e.createBuffer("vit_hidden",p*s*4,t),hiddenB:e.createBuffer("vit_hidden_b",p*s*4,t),normed:e.createBuffer("vit_normed",p*s*4,t),q:e.createBuffer("vit_q",p*s*4,t),k:e.createBuffer("vit_k",p*s*4,t),v:e.createBuffer("vit_v",p*s*4,t),attnOut:e.createBuffer("vit_attn_out",p*s*4,t),mlpInter:e.createBuffer("vit_mlp_inter",p*i*4,t),mlpOut:e.createBuffer("vit_mlp_out",p*s*4,t),cos:e.createBuffer("vit_cos",p*o*4,t),sin:e.createBuffer("vit_sin",p*o*4,t),posEmbed:e.createBuffer("vit_pos_embed",p*s*4,t),merged:e.createBuffer("vit_merged",p/4*n*4,t),mergerNormed:e.createBuffer("vit_merger_normed",p*s*4,t),mergerInter:e.createBuffer("vit_merger_inter",p/4*w*4,t),patchInput:e.createBuffer("vit_patch_input",p*f*4,t),qkv:e.createBuffer("vit_qkv",p*3*s*4,t)},this.imageTokenId=this.config.image_token_id,this.visionStartTokenId=this.config.vision_start_token_id,this.visionEndTokenId=this.config.vision_end_token_id,console.log(`[VISION] Initialized: depth=${r}, hidden=${s}, heads=${u}, headDim=${o}, out=${n}`),console.log(`[VISION] Buffer allocation: ~${((p*s*4*12+p*i*4)/1024/1024).toFixed(0)} MB transient`)}async _readVisionPosEmbed(){const a=this.vision,e=this.weights["model.visual.pos_embed.weight"];if(!e)throw new Error("Vision pos_embed weight not found");const t=a.numPosEmbeddings,s=a.V,i=t*s*2,n=this.gpu.createReadbackBuffer("vit_pos_embed_readback",i),r=this.gpu.device.createCommandEncoder();r.copyBufferToBuffer(e,0,n,0,i),this.gpu.device.queue.submit([r.finish()]),await 
n.mapAsync(GPUMapMode.READ);const u=new Uint16Array(n.getMappedRange().slice(0));n.unmap(),n.destroy();const o=new Float32Array(t*s);for(let h=0;h<u.length;h++){const m=u[h]<<16,c=new ArrayBuffer(4);new Uint32Array(c)[0]=m,o[h]=new Float32Array(c)[0]}this._vitPosEmbedF32=o,console.log(`[VISION] Read pos_embed: ${t}×${s} (${(i/1024).toFixed(0)} KB)`)}_interpolatePosEmbed(a,e){const t=this.vision,s=t.V,i=t.numGridPerSide,n=this._vitPosEmbedF32,r=t.mergeSize,u=a*e,o=new Float32Array(u*s);for(let p=0;p<a;p++){const w=a===1?0:p*(i-1)/(a-1),B=Math.min(Math.floor(w),i-1),q=Math.min(B+1,i-1),_=w-B;for(let l=0;l<e;l++){const d=e===1?0:l*(i-1)/(e-1),y=Math.min(Math.floor(d),i-1),U=Math.min(y+1,i-1),g=d-y,P=B*i+y,S=B*i+U,b=q*i+y,v=q*i+U,C=(1-_)*(1-g),D=(1-_)*g,k=_*(1-g),z=_*g,M=p*e+l;for(let $=0;$<s;$++)o[M*s+$]=C*n[P*s+$]+D*n[S*s+$]+k*n[b*s+$]+z*n[v*s+$]}}const h=a/r,m=e/r,c=new Float32Array(u*s);let f=0;for(let p=0;p<h;p++)for(let w=0;w<m;w++)for(let B=0;B<r;B++)for(let q=0;q<r;q++){const _=p*r+B,l=w*r+q,d=_*e+l;c.set(o.subarray(d*s,d*s+s),f*s),f++}return c}_computeVisionRoPE(a,e){const t=this.vision,s=t.headDim/2,i=t.mergeSize,n=a/i,r=e/i,u=a*e,o=Math.max(a,e),h=new Float32Array(o*s);for(let p=0;p<o;p++)for(let w=0;w<s;w++){const B=1/Math.pow(1e4,2*w/t.headDim);h[p*s+w]=p*B}const m=new Float32Array(u*t.headDim),c=new Float32Array(u*t.headDim);let f=0;for(let p=0;p<n;p++)for(let w=0;w<r;w++)for(let B=0;B<i;B++)for(let q=0;q<i;q++){const _=p*i+B,l=w*i+q;for(let U=0;U<s;U++){const g=h[_*s+U];h[l*s+U];const P=f*t.headDim;U<s/2&&(m[P+U]=Math.cos(g),c[P+U]=Math.sin(g),m[P+s+U]=Math.cos(g),c[P+s+U]=Math.sin(g))}const d=f*t.headDim,y=s/2;for(let U=0;U<y;U++){const g=h[_*s+U],P=h[l*s+U];m[d+U]=Math.cos(g),m[d+y+U]=Math.cos(P),m[d+s+U]=Math.cos(g),m[d+s+y+U]=Math.cos(P),c[d+U]=Math.sin(g),c[d+y+U]=Math.sin(P),c[d+s+U]=Math.sin(g),c[d+s+y+U]=Math.sin(P)}f++}return{cos:m,sin:c}}async preprocessImage(a){const e=this.vision,t=e.patchSize,s=e.mergeSize,i=t*s,n=256*256,r=1280*1280,u=new 
Image;await new Promise((v,C)=>{u.onload=v,u.onerror=C,u.src=a});let{width:o,height:h}=u,m=1;o*h>r?m=Math.sqrt(r/(o*h)):o*h<n&&(m=Math.sqrt(n/(o*h)));let c=Math.round(o*m/i)*i,f=Math.round(h*m/i)*i;c=Math.max(i,c),f=Math.max(i,f);const w=new OffscreenCanvas(c,f).getContext("2d");w.drawImage(u,0,0,c,f);const q=w.getImageData(0,0,c,f).data,_=f/t,l=c/t,d=_*l,y=_/s,U=l/s,g=e.temporalPatchSize,P=3*g*t*t,S=new Float32Array(d*P);let b=0;for(let v=0;v<y;v++)for(let C=0;C<U;C++)for(let D=0;D<s;D++)for(let k=0;k<s;k++){const z=v*s+D,M=C*s+k,$=z*t,O=M*t,G=b*P;for(let R=0;R<g;R++)for(let F=0;F<3;F++)for(let T=0;T<t;T++)for(let x=0;x<t;x++){const V=(($+T)*c+(O+x))*4+F,j=q[V]/127.5-1,L=((F*g+R)*t+T)*t+x;S[G+L]=j}b++}return console.log(`[VISION] Preprocessed: ${o}x${h} → ${c}x${f}, ${d} patches (${_}x${l}), merge→${d/4} tokens`),{pixels:S,gridH:_,gridW:l,numPatches:d,imgW:c,imgH:f}}async visionForward(a){const e=this.vision,t=this.gpu,{pixels:s,gridH:i,gridW:n,numPatches:r}=a;this._vitPosEmbedF32||await this._readVisionPosEmbed(),t.device.queue.writeBuffer(e.patchInput,0,s);const u=this._interpolatePosEmbed(i,n);t.device.queue.writeBuffer(e.posEmbed,0,u);const{cos:o,sin:h}=this._computeVisionRoPE(i,n);t.device.queue.writeBuffer(e.cos,0,o),t.device.queue.writeBuffer(e.sin,0,h),t.beginBatch();const m=this.weights["model.visual.patch_embed.proj.weight"],c=this.weights["model.visual.patch_embed.proj.bias"],f=this.makeUniform("vit_patch_params",[e.patchInputDim,e.V]);t.dispatch(this.pipelines.vit_bf16_matvec_bias,[t.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[e.patchInput,m,c,e.hidden,f])],Math.ceil(e.V/32),r);const p=this.makeUniform("vit_add_len",[r*e.V]);t.dispatch(this.pipelines.vit_add,[t.createBindGroup(this.pipelines.vit_add,0,[e.hidden,e.posEmbed,p])],Math.ceil(r*e.V/256));for(let b=0;b<e.depth;b++)this._vitBlock(b,r);const 
w=this.weights["model.visual.merger.norm.weight"],B=this.weights["model.visual.merger.norm.bias"],q=this.makeUniform("vit_merger_ln_params",[e.V,1e-6]);t.dispatch(this.pipelines.vit_layernorm,[t.createBindGroup(this.pipelines.vit_layernorm,0,[e.hidden,w,B,e.mergerNormed,q])],r);const _=r/4,l=this.weights["model.visual.merger.linear_fc1.weight"],d=this.weights["model.visual.merger.linear_fc1.bias"],y=this.makeUniform("vit_merger_fc1_params",[e.mergedHidden,e.mergedHidden]);t.dispatch(this.pipelines.vit_bf16_matvec_bias,[t.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[e.mergerNormed,l,d,e.mergerInter,y])],Math.ceil(e.mergedHidden/32),_);const U=this.makeUniform("vit_gelu_len",[_*e.mergedHidden]);t.dispatch(this.pipelines.vit_gelu,[t.createBindGroup(this.pipelines.vit_gelu,0,[e.mergerInter,e.mergerInter,U])],Math.ceil(_*e.mergedHidden/256));const g=this.weights["model.visual.merger.linear_fc2.weight"],P=this.weights["model.visual.merger.linear_fc2.bias"],S=this.makeUniform("vit_merger_fc2_params",[e.mergedHidden,e.Vo]);return t.dispatch(this.pipelines.vit_bf16_matvec_bias,[t.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[e.mergerInter,g,P,e.merged,S])],Math.ceil(e.Vo/32),_),t.endBatch(),await t.device.queue.onSubmittedWorkDone(),console.log(`[VISION] Forward done: ${r} patches → ${_} merged tokens (dim=${e.Vo})`),{numMergedTokens:_,embedBuffer:e.merged}}_vitBlock(a,e){const t=this.vision,s=this.gpu,i=`model.visual.blocks.${a}`,n=this.weights[`${i}.norm1.weight`],r=this.weights[`${i}.norm1.bias`],u=this.makeUniform(`vit_ln1_${a}`,[t.V,1e-6]);s.dispatch(this.pipelines.vit_layernorm,[s.createBindGroup(this.pipelines.vit_layernorm,0,[t.hidden,n,r,t.normed,u])],e);const o=this.weights[`${i}.attn.qkv.weight`],h=this.weights[`${i}.attn.qkv.bias`],m=this.makeUniform(`vit_qkv_${a}`,[t.V,3*t.V]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.normed,o,h,t.qkv,m])],Math.ceil(3*t.V/32),e);const 
c=t.V,f=c*c*2,p=c*2,w=this.makeUniform(`vit_qkv_mv_${a}`,[c,c]);for(let $=0;$<3;$++){const O=[t.q,t.k,t.v][$],G=s.createBindGroupWithOffsets(this.pipelines.vit_bf16_matvec_bias,0,[t.normed,{buffer:o,offset:$*f,size:f},{buffer:h,offset:$*p,size:p},O,w]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[G],Math.ceil(c/32),e)}const B=this.makeUniform(`vit_rope_${a}`,[e,t.heads,t.headDim]);s.dispatch(this.pipelines.vit_rope,[s.createBindGroup(this.pipelines.vit_rope,0,[t.q,t.k,t.cos,t.sin,B])],Math.ceil(e*t.heads*t.headDim/256));const q=1/Math.sqrt(t.headDim),_=this.makeUniform(`vit_attn_${a}`,[e,t.heads,t.headDim,q]);s.dispatch(this.pipelines.vit_attention,[s.createBindGroup(this.pipelines.vit_attention,0,[t.q,t.k,t.v,t.attnOut,_])],e,t.heads);const l=this.weights[`${i}.attn.proj.weight`],d=this.weights[`${i}.attn.proj.bias`],y=this.makeUniform(`vit_proj_${a}`,[c,c]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.attnOut,l,d,t.mlpOut,y])],Math.ceil(c/32),e);const U=this.makeUniform(`vit_res1_${a}`,[e*c]);s.dispatch(this.pipelines.vit_add,[s.createBindGroup(this.pipelines.vit_add,0,[t.hidden,t.mlpOut,U])],Math.ceil(e*c/256));const g=this.weights[`${i}.norm2.weight`],P=this.weights[`${i}.norm2.bias`];s.dispatch(this.pipelines.vit_layernorm,[s.createBindGroup(this.pipelines.vit_layernorm,0,[t.hidden,g,P,t.normed,u])],e);const S=this.weights[`${i}.mlp.fc1.weight`],b=this.weights[`${i}.mlp.fc1.bias`],v=this.makeUniform(`vit_mlp_fc1_${a}`,[c,t.Vi]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.normed,S,b,t.mlpInter,v])],Math.ceil(t.Vi/32),e);const C=this.makeUniform(`vit_gelu_${a}`,[e*t.Vi]);s.dispatch(this.pipelines.vit_gelu_tanh,[s.createBindGroup(this.pipelines.vit_gelu_tanh,0,[t.mlpInter,t.mlpInter,C])],Math.ceil(e*t.Vi/256));const 
D=this.weights[`${i}.mlp.fc2.weight`],k=this.weights[`${i}.mlp.fc2.bias`],z=this.makeUniform(`vit_mlp_fc2_${a}`,[t.Vi,c]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.mlpInter,D,k,t.mlpOut,z])],Math.ceil(c/32),e);const M=this.makeUniform(`vit_res2_${a}`,[e*c]);s.dispatch(this.pipelines.vit_add,[s.createBindGroup(this.pipelines.vit_add,0,[t.hidden,t.mlpOut,M])],Math.ceil(e*c/256))}computeMultimodalPositions(a,e,t,s){const i=this.vision.mergeSize,n=t/i,r=s/i,u=a.length,o=new Array(3);for(let f=0;f<3;f++)o[f]=new Int32Array(u);let h=0,m=0;for(let f=0;f<u;f++)if(a[f]===this.imageTokenId){const w=m,B=Math.floor(w/r),q=w%r;o[0][f]=h,o[1][f]=h+B,o[2][f]=h+q,m++,m===e&&(h+=Math.max(n,r))}else o[0][f]=h,o[1][f]=h,o[2][f]=h,h++;const c=h-u;return{positionIds3D:o,ropeDelta:c}}initB2Buffers(){const a=this.gpu,e=this.hiddenSize,t=this.intermediateSize,s=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.b2={},this.b2.hidden=a.createBuffer("b2_hidden_a",2*e*4,s),this.b2.hiddenB=a.createBuffer("b2_hidden_b",2*e*4,s),this.b2.normed=a.createBuffer("b2_normed",2*e*4,s),this.b2.mlpIntermediate=a.createBuffer("b2_mlp_inter",2*t*4,s),this.b2.mlpOut=a.createBuffer("b2_mlp_out",2*e*4,s),this.b2.logits=a.createBuffer("b2_logits",2*this.vocabSize*4,s),this.b2.zeroBuf=a.createBuffer("b2_zero_buf",2*e*4,s),this.b2.qProjFull=a.createBuffer("b2_q_proj_full",2*this.numHeads*this.headDim*2*4,s),this.b2.qProj=a.createBuffer("b2_q_proj_out",2*this.numHeads*this.headDim*4,s),this.b2.qGate=a.createBuffer("b2_q_gate",2*this.numHeads*this.headDim*4,s),this.b2.kProj=a.createBuffer("b2_k_proj_out",2*this.numKVHeads*this.headDim*4,s),this.b2.vProj=a.createBuffer("b2_v_proj_out",2*this.numKVHeads*this.headDim*4,s);const 
i=this.textCfg.linear_num_key_heads,n=this.textCfg.linear_key_head_dim,r=this.textCfg.linear_value_head_dim,h=this.linValueHeads/i*r,m=i*(n+n+h),c=this.linValueDim;this.b2.linQKV=a.createBuffer("b2_lin_qkv",2*m*4,s),this.b2.linZ=a.createBuffer("b2_lin_z",2*c*4,s),this.b2.linOut=a.createBuffer("b2_lin_out",2*c*4,s);const f=Math.max(e,this.numHeads*this.headDim,c)*4;if(this.b2.attnOut=a.createBuffer("b2_attn_out",2*f,s),this.abQuantized){const p=this.textCfg.linear_num_value_heads??i;this.b2.linAlpha=a.createBuffer("b2_lin_alpha",2*p*4,s),this.b2.linBeta=a.createBuffer("b2_lin_beta",2*p*4,s)}this.b2.topkResult0=a.createBuffer("b2_topk0",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this.b2.topkResult1=a.createBuffer("b2_topk1",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this.b2.topkReadback0=a.device.createBuffer({size:2048,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_topk_rb0"}),this.b2.topkReadback1=a.device.createBuffer({size:2048,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_topk_rb1"}),this.b2.argmaxResult0=a.createBuffer("b2_argmax0",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this.b2.argmaxResult1=a.createBuffer("b2_argmax1",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this.b2.argmaxReadback0=a.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_argmax_rb0"}),this.b2.argmaxReadback1=a.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_argmax_rb1"}),this.b2._gqaParamBuf0=a.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"b2_gqa_params0"}),this.b2._gqaParamBuf1=a.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"b2_gqa_params1"}),this.b2._dims={H:e,I:t,linQKVDim:m,valueDim:c,linHeads:i,linKeyDim:n,linValDim:r,linEVD:h,qProjSize:this.numHeads*this.headDim,qProjFullSize:this.numHeads*this.headDim*2
,kProjSize:this.numKVHeads*this.headDim,vProjSize:this.numKVHeads*this.headDim,attnOutElems:f/4},console.log(`[B2] Allocated batch=2 buffers (${((2*(e+e+e+t+e+this.vocabSize)*4+2*f)/1e6).toFixed(1)} MB activations)`)}runWithOffsets(a,e,t,s=1){const i=this.pipelines[a],n=this.gpu.createBindGroupWithOffsets(i,0,e);this.gpu.dispatch(i,[n],t,s)}makeUniform(a,e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),s=new DataView(t);for(let r=0;r<e.length;r++)Number.isInteger(e[r])?s.setUint32(r*4,e[r],!0):s.setFloat32(r*4,e[r],!0);const i=a+"_"+e.join("_");if(this.paramBufs[i])return this.paramBufs[i];const n=this.gpu.device.createBuffer({size:t.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a});return this.gpu.device.queue.writeBuffer(n,0,new Uint8Array(t)),this.paramBufs[i]=n,n}_makeMixedUniform(a,e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),s=new DataView(t);for(let n=0;n<e.length;n++){const r=e[n];r.u!==void 0?s.setUint32(n*4,r.u,!0):s.setFloat32(n*4,r.f,!0)}this._mixedUniformBufs||(this._mixedUniformBufs={});let i=this._mixedUniformBufs[a];return i||(i=this.gpu.device.createBuffer({size:t.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a}),this._mixedUniformBufs[a]=i),this.gpu.device.queue.writeBuffer(i,0,new Uint8Array(t)),i}makeUniformTyped(a,e,t){const s=e.length+t.length,i=new ArrayBuffer(Math.max(16,Math.ceil(s*4/16)*16)),n=new DataView(i);let r=0;for(const h of e)n.setUint32(r,h,!0),r+=4;for(const h of t)n.setFloat32(r,h,!0),r+=4;const u=a+"_"+[...e,...t].join("_");if(this.paramBufs[u])return this.paramBufs[u];const o=this.gpu.device.createBuffer({size:i.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a});return this.gpu.device.queue.writeBuffer(o,0,new Uint8Array(i)),this.paramBufs[u]=o,o}makeFusedNormMLPUniform(a){const e=`fused_anorm_mlp_${a}`;if(this.paramBufs[e])return this.paramBufs[e];const t=this.hiddenSize,s=this.intermediateSize,i=16+640*16,n=new 
ArrayBuffer(i),r=new DataView(n);r.setUint32(0,t,!0),r.setUint32(4,s,!0),r.setUint32(8,this.groupSize,!0),r.setFloat32(12,this.rmsEps,!0);const u=`model.language_model.layers.${a}.post_attention_layernorm.weight`,o=this._normWeightRaw?.[u];if(!o)throw new Error(`Norm weight not cached for layer ${a}`);for(let m=0;m<t/2;m++){const c=Math.floor(m/4),f=m%4;r.setUint32(16+c*16+f*4,o[m],!0)}const h=this.gpu.device.createBuffer({size:i,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:e});return this.gpu.device.queue.writeBuffer(h,0,new Uint8Array(n)),this.paramBufs[e]=h,h}fusedNormGptqOp(a,e,t,s,i,n,r,u){const o=this.getQWeight(s);if(!o.qweight)return null;const h=this.makeFusedNormGPTQUniform(i,r),m=u?"fused_norm_gptq":"fused_norm_gptq_noadd",c=u?"fnorm_gptq_":"fnorm_gptqna_",f=u?[a,e,o.qweight,o.scales,t,h]:[a,o.qweight,o.scales,t,h];return this.prepOpCached(`${c}${s}`,m,f,this.wg4(r))}makeFusedNormGPTQUniform(a,e){const t=`fused_norm_gptq_${a}_${e}`;if(this.paramBufs[t])return this.paramBufs[t];const s=this.hiddenSize,i=16+512*16,n=new ArrayBuffer(i),r=new DataView(n);r.setUint32(0,s,!0),r.setUint32(4,e,!0),r.setUint32(8,this.groupSize,!0),r.setFloat32(12,this.rmsEps,!0);const u=this._normWeightRaw?.[a];if(!u)throw new Error(`Norm weight not cached: ${a}`);for(let h=0;h<s/2;h++){const m=Math.floor(h/4),c=h%4;r.setUint32(16+m*16+c*4,u[h],!0)}const o=this.gpu.device.createBuffer({size:i,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:t});return this.gpu.device.queue.writeBuffer(o,0,new Uint8Array(n)),this.paramBufs[t]=o,o}run(a,e,t,s=1){const i=this.pipelines[a],n=this.gpu.createBindGroup(i,0,e);this.gpu.dispatch(i,[n],t,s)}runCached(a,e,t,s,i=1){const n=this.pipelines[e],r=this.gpu.getCachedBindGroup(a,n,0,t);this.gpu.dispatch(n,[r],s,i)}prepOp(a,e,t,s=1){const i=this.pipelines[a],n=this.gpu.createBindGroup(i,0,e);return{pipeline:i,bindGroups:[n],workgroupsX:t,workgroupsY:s}}prepOpCached(a,e,t,s,i=1){const 
n=this.pipelines[e],r=this.gpu.getCachedBindGroup(a,n,0,t);return{pipeline:n,bindGroups:[r],workgroupsX:s,workgroupsY:i}}wg(a){return Math.ceil(a/256)}wg4(a){return Math.ceil(a/32)}wg8(a){return Math.ceil(a/8)}getQWeight(a){return{qweight:this.weights[a+".qweight"],scales:this.weights[a+".scales"],qzeros:this.weights[a+".qzeros"]}}embedding(a){const e=this.weights["model.language_model.embed_tokens.weight"],t=this.makeUniform("emb_params",[a,this.hiddenSize]);this.run("embedding",[e,this.hidden,t],this.wg(this.hiddenSize))}embeddingFromVisionBuffer(a,e){const t=this.hiddenSize;this.gpu.copyBuffer(a,this.hidden,t*4,e*t*4,0)}embeddingFromArgmax(){const a=this.weights["model.language_model.embed_tokens.weight"];this._embedArgmaxParams||(this._embedArgmaxParams=this.makeUniform("emb_argmax_params",[this.hiddenSize])),this.runCached("embed_from_argmax","embed_from_argmax",[a,this.hidden,this.argmaxResult,this._embedArgmaxParams],this.wg(this.hiddenSize))}_dispatchLmHead(){if(this._lmHeadQWeight){const a=this.hiddenSize,e=this.vocabSize,s=a/this.groupSize%4===0,i=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]);if(s){const n=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached("lm_head",n,[this.normed,this._lmHeadQWeight,this._lmHeadScales,this.logits,i],this.wg8(e))}else{const n=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached("lm_head",n,[this.normed,this._lmHeadQWeight,this._lmHeadScales,this.logits,i],this.wg4(e))}}else{const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",t=this.weights[e],s=this.makeUniform("lmhead_params",[this.hiddenSize,this.vocabSize]);this.runCached("lm_head","bf16_matvec",[this.normed,t,this.logits,s],this.wg(this.vocabSize))}}rmsNorm(a,e,t,s){const i=this.makeUniform("rmsnorm_params",[this.hiddenSize,this.rmsEps]);s?this.runCached(s,"rmsnorm",[a,t,e,i],1):this.run("rmsnorm",[a,t,e,i],1)}gptqMatvec(a,e,t,s,i){const 
n=this.getQWeight(t);if(!n.qweight)return;const r=s/this.groupSize,u=r%4===0;if(this.useSplitK&&u){let o=this.splitKSplits;for(;o>1&&r%(o*4)!==0;)o>>=1;if(o>1){const h=this.makeUniform(`mv_sk_${s}_${i}_${o}`,[s,i,this.groupSize,o]);this.runCached(`sk4tf16_${t}`,"gptq_matvec_4t_f16_sk",[a,n.qweight,n.scales,this.splitKPartials,h],this.wg8(i),o);const m=this.makeUniform(`rsk_${i}_${o}`,[i,o]);this.runCached(`rsk_${t}`,"reduce_splitk",[this.splitKPartials,e,m],this.wg(i));return}}if(u){const o=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t",h=this.gpu.hasF16?"gptq4tf16_":"gptq4t_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);this.runCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg8(i))}else{const o=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec",h=this.gpu.hasF16?"gptqf16_":"gptq_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);this.runCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg4(i))}}gptqMatvecOp(a,e,t,s,i){const n=this.getQWeight(t);if(!n.qweight)return null;if(s/this.groupSize%4===0){const o=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t",h=this.gpu.hasF16?"gptq4tf16_":"gptq4t_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);return this.prepOpCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg8(i))}else{const o=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec",h=this.gpu.hasF16?"gptqf16_":"gptq_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);return this.prepOpCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg4(i))}}bf16Matvec(a,e,t,s,i,n){const r=this.makeUniform(`bf16mv_${s}_${i}`,[s,i]);n?this.runCached(n,"bf16_matvec",[a,t,e,r],this.wg(i)):this.run("bf16_matvec",[a,t,e,r],this.wg(i))}siluMul(a,e,t,s,i){const n=this.makeUniform(`silu_${s}`,[s]);i?this.runCached(i,"silu_mul",[a,e,t,n],this.wg(s)):this.run("silu_mul",[a,e,t,n],this.wg(s))}addVectors(a,e,t,s){const 
i=this.makeUniform(`add_${t}`,[t]);s?this.runCached(s,"add",[a,e,i],this.wg(t)):this.run("add",[a,e,i],this.wg(t))}addAndRmsNorm(a,e,t,s,i){const n=this.makeUniform("add_rmsnorm_params",[this.hiddenSize,this.rmsEps]);i?this.runCached(i,"add_rmsnorm",[a,e,s,t,n],1):this.run("add_rmsnorm",[a,e,s,t,n],1)}mlp(a,e){const t=`model.language_model.layers.${e}.mlp`,s=this.hiddenSize,i=this.intermediateSize,n=this.getQWeight(`${t}.gate_proj`),r=this.getQWeight(`${t}.up_proj`),u=this.makeUniform(`fused_mv_${s}_${i}`,[s,i,this.groupSize]);if(s/this.groupSize%4===0){const m=this.gpu.hasF16?"fused_gate_up_silu_4t_f16":"fused_gate_up_silu_4t",c=this.gpu.hasF16?"fused_gus4t16_":"fused_gus4t_";this.runCached(`${c}${e}`,m,[a,n.qweight,n.scales,r.qweight,r.scales,this.mlpIntermediate,u],this.wg8(i))}else{const m=this.gpu.hasF16?"fused_gate_up_silu_f16":"fused_gate_up_silu",c=this.gpu.hasF16?"fused_gus16_":"fused_gus_";this.runCached(`${c}${e}`,m,[a,n.qweight,n.scales,r.qweight,r.scales,this.mlpIntermediate,u],this.wg4(i))}this.gptqMatvec(this.mlpIntermediate,this.mlpOut,`${t}.down_proj`,i,s)}fullAttentionFused(a,e,t,s,i,n,r,u){i=i||this.normed;const o=`model.language_model.layers.${t}.self_attn`,h=this.hiddenSize,m=this.headDim,c=this.numHeads,f=this.numKVHeads,p=c/f,w=t===0,B=`model.language_model.layers.${t}.input_layernorm.weight`,q=this.weights[B];if(w){const $=this.makeUniform("add_rmsnorm_params",[h,this.rmsEps]);this.runCached(`fa_norm_${t}`,"rmsnorm",[a,q,i,$],1)}else{const $=this.makeUniform("add_rmsnorm_params",[h,this.rmsEps]);this.runCached(`fa_norm_${t}`,"add_rmsnorm_ro",[a,e,q,i,$],1)}const _=this.gptqMatvecOp(i,this.qProjFull,`${o}.q_proj`,h,c*m*2),l=this.gptqMatvecOp(i,this.kProj,`${o}.k_proj`,h,f*m),d=this.gptqMatvecOp(i,this.vProj,`${o}.v_proj`,h,f*m);this.gpu.dispatchMulti([_,l,d].filter(Boolean));const 
y=this.kvCache[t],U=this._fusedSQKParams[t],g=u??s;this._gqaDv.setUint32(0,g,!0),this._gqaDv.setUint32(4,s,!0),this._gqaDv.setUint32(8,n??s,!0),this._gqaDv.setUint32(12,r??s,!0),this.gpu.device.queue.writeBuffer(U,16,this._gqaData,0,16),this.runCached(`fa_sqk_${t}`,this._splitQKNormShaderKey,[this.qProjFull,this.kProj,this.vProj,this.qProj,this.qGate,y.keys,y.values,U],c+f);const P=(u??s)+1,S=this._forceMinSplits||1,b=Math.max(S,Math.min(Math.max(1,Math.ceil(P/32)),this._maxGqaSplits)),v=b>1?this._gqaPartials:this.attnOut;if(this._gqaDv.setUint32(0,P,!0),this._gqaDv.setUint32(4,m,!0),this._gqaDv.setUint32(8,f,!0),this._gqaDv.setUint32(12,c,!0),this._gqaDv.setUint32(16,p,!0),this._gqaDv.setUint32(20,b,!0),this.gpu.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData),this.run("gqa_attention_head",[this.qProj,y.keys,y.values,v,this._gqaParamBuf],c,b),b>1){this._gqaReduceParams||(this._gqaReduceParams=this.gpu.device.createBuffer({size:16,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"gqa_reduce_params"}));const $=new Uint8Array(16),O=new DataView($.buffer);O.setUint32(0,m,!0),O.setUint32(4,b,!0),O.setUint32(8,c,!0),this.gpu.device.queue.writeBuffer(this._gqaReduceParams,0,$),this.run("gqa_reduce",[this._gqaPartials,this.attnOut,this._gqaReduceParams],c)}const C=this.getQWeight(`${o}.o_proj`),D=c*m,z=D/this.groupSize%4===0,M=this.makeUniform(`fused_sig_mv_${D}_${h}`,[D,h,this.groupSize]);if(z){const $=this.gpu.hasF16?"fused_sigmoid_gptq_4t_f16":"fused_sigmoid_gptq_4t",O=this.gpu.hasF16?"fused_sig4t16_oproj_":"fused_sig4t_oproj_";this.runCached(`${O}${t}`,$,[this.attnOut,this.qGate,C.qweight,C.scales,this.qProj,M],this.wg8(h))}else{const $=this.gpu.hasF16?"fused_sigmoid_gptq_f16":"fused_sigmoid_gptq",O=this.gpu.hasF16?"fused_sig16_oproj_":"fused_sig_oproj_";this.runCached(`${O}${t}`,$,[this.attnOut,this.qGate,C.qweight,C.scales,this.qProj,M],this.wg4(h))}}linearAttentionFused(a,e,t,s){s=s||this.normed;const 
i=`model.language_model.layers.${t}.linear_attn`,n=this.hiddenSize,r=this.textCfg.linear_num_key_heads,u=this.textCfg.linear_key_head_dim,o=this.textCfg.linear_value_head_dim,h=this.linValueHeads,c=h/r*o,f=r*(u+u+c),p=this.linValueDim,w=t===0,B=`model.language_model.layers.${t}.input_layernorm.weight`,q=this.weights[B];if(w){const b=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.runCached(`la_norm_${t}`,"rmsnorm",[a,q,s,b],1)}else{const b=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.runCached(`la_norm_${t}`,"add_rmsnorm_ro",[a,e,q,s,b],1)}{const b=[this.gptqMatvecOp(s,this.linQKV,`${i}.in_proj_qkv`,n,f),this.gptqMatvecOp(s,this.linZ,`${i}.in_proj_z`,n,p)];this.abQuantized&&(b.push(this.gptqMatvecOp(s,this.linAlpha,`${i}.in_proj_a`,n,h)),b.push(this.gptqMatvecOp(s,this.linBeta,`${i}.in_proj_b`,n,h))),this.gpu.dispatchMulti(b.filter(Boolean))}const _=this.weights[`${i}.conv1d.weight`],l=this.weights[`${i}.A_log`],d=this.weights[`${i}.dt_bias`],y=this.weights[`${i}.norm.weight`];if(this.abQuantized){const b=`fused_cdn_q_${r}_${u}_${o}_${f}_${h}`;let v=this.paramBufs[b];if(!v){const C=new ArrayBuffer(32),D=new DataView(C);D.setUint32(0,r,!0),D.setUint32(4,u,!0),D.setUint32(8,o,!0),D.setUint32(12,f,!0),D.setFloat32(16,this.rmsEps,!0),D.setUint32(20,0,!0),D.setUint32(24,h,!0),v=this.gpu.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"fused_cdn_q_params"}),this.gpu.device.queue.writeBuffer(v,0,new Uint8Array(C)),this.paramBufs[b]=v}this.runCached(`fused_cdn_${t}`,"fused_conv_deltanet_norm",[this.linQKV,this.linConvHist[t],_,this.linState[t],this.linOut,this.linAlpha,this.linBeta,l,d,y,v],r)}else{const b=`fused_cdn_ext_${r}_${u}_${o}_${f}_${n}_${h}`;let v=this.paramBufs[b];if(!v){const D=new ArrayBuffer(32),k=new 
DataView(D);k.setUint32(0,r,!0),k.setUint32(4,u,!0),k.setUint32(8,o,!0),k.setUint32(12,f,!0),k.setFloat32(16,this.rmsEps,!0),k.setUint32(20,n,!0),k.setUint32(24,h,!0),v=this.gpu.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"fused_cdn_ext_params"}),this.gpu.device.queue.writeBuffer(v,0,new Uint8Array(D)),this.paramBufs[b]=v}const C=this.linABWeight[t];this.runCached(`fused_cdn_${t}`,"fused_conv_deltanet_norm",[this.linQKV,this.linConvHist[t],_,this.linState[t],this.linOut,s,C,l,d,y,v],r)}const U=this.getQWeight(`${i}.out_proj`),P=p/this.groupSize%4===0,S=this.makeUniform(`fused_silu_mv_${p}_${n}`,[p,n,this.groupSize]);if(P){const b=this.gpu.hasF16?"fused_silu_gptq_4t_f16":"fused_silu_gptq_4t",v=this.gpu.hasF16?"fused_silu4t16_oproj_":"fused_silu4t_oproj_";this.runCached(`${v}${t}`,b,[this.linZ,this.linOut,U.qweight,U.scales,this.attnOut,S],this.wg8(n))}else{const b=this.gpu.hasF16?"fused_silu_gptq_f16":"fused_silu_gptq",v=this.gpu.hasF16?"fused_silu16_oproj_":"fused_silu_oproj_";this.runCached(`${v}${t}`,b,[this.linZ,this.linOut,U.qweight,U.scales,this.attnOut,S],this.wg4(n))}}fusedNormMLP(a,e,t,s,i,n){n=n||this.normed;const r=this.hiddenSize,u=`model.language_model.layers.${i}.post_attention_layernorm.weight`,o=this.weights[u],h=this.makeUniform("add_rmsnorm_params",[r,this.rmsEps]),m=`mlp_norm_${i}_${a===this.hidden?"a":"b"}`;this.runCached(m,"three_way_add_rmsnorm",[a,t,s,o,e,n,h],1),this.mlp(n,i)}decoderLayer(a,e,t,s,i,n,r,u){let o;this.layerTypes[a]==="full_attention"?(this.fullAttentionFused(t,i,a,e,void 0,n,r,u),o=this.qProj):(this.linearAttentionFused(t,i,a),o=this.attnOut),this.fusedNormMLP(t,s,i,o,a)}async forward(a,e){this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);let t=this.hidden,s=this.hiddenB,i=this.zeroBuf;for(let o=0;o<this.numLayers;o++){this.decoderLayer(o,e,t,s,i),i=this.mlpOut;const h=t;t=s,s=h}this.mtp&&this._mtpSaveHidden(t,this.mlpOut);const 
n=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(t,this.mlpOut,this.normed,n,"add_final_norm"),this._dispatchLmHead();const r=this.temperature??.7;if(r>0){const o=this.repetitionPenalty??1,h=this.presencePenalty??0;if(this._recentTokenCount>0&&(o>1||h>0)){this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const p=this._makeMixedUniform("penalty_params",[{u:this._recentTokenCount},{f:o},{f:h},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,p],Math.ceil(this._recentTokenCount/256))}const m=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,m],1);const c=Math.random()*4294967295>>>0,f=this._makeMixedUniform("sample_params",[{f:r},{u:this.topK??20},{f:this.topP??.8},{u:c}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,f],1)}else{const o=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,o],1)}return this.gpu._passCount!==void 0&&!this._passCountLogged&&(console.log(`[PERF] ${this.gpu._passCount} compute passes per token`),this._passCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch(),this._readAndSample()}forwardSubmit(a,e){this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);let t=this.hidden,s=this.hiddenB,i=this.zeroBuf;for(let o=0;o<this.numLayers;o++){this.decoderLayer(o,e,t,s,i),i=this.mlpOut;const h=t;t=s,s=h}this.mtp&&this._mtpSaveHidden(t,this.mlpOut);const n=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(t,this.mlpOut,this.normed,n,"add_final_norm"),this._dispatchLmHead();const r=this.temperature??.7;if(r>0){const 
o=this.repetitionPenalty??1,h=this.presencePenalty??0;if(this._recentTokenCount>0&&(o>1||h>0)){this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const p=this._makeMixedUniform("penalty_params",[{u:this._recentTokenCount},{f:o},{f:h},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,p],Math.ceil(this._recentTokenCount/256))}const m=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,m],1);const c=Math.random()*4294967295>>>0,f=this._makeMixedUniform("sample_params",[{f:r},{u:this.topK??20},{f:this.topP??.8},{u:c}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,f],1)}else{const o=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,o],1)}this.gpu._passCount!==void 0&&!this._passCountLogged&&(console.log(`[PERF] ${this.gpu._passCount} compute passes per token`),this._passCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch()}_buildReplayFromRecording(){const a=this.gpu.stopRecording(),e=this.pipelines.gqa_attention_head,t=[];for(const s of a)if(s.multi)for(const i of s.ops)t.push({p:i.pipeline,bg:i.bindGroup,x:i.wgX,y:i.wgY,gqa:i.pipeline===e?1:0});else t.push({p:s.pipeline,bg:s.bindGroup,x:s.wgX,y:s.wgY,gqa:s.pipeline===e?1:0});this._replayFlat=t,console.log(`[PERF] Recorded ${t.length} flat replay ops for core forward`)}_replayCoreForward(a){const e=this.gpu,t=a-this._ropeDelta,s=t+1,i=Math.min(Math.max(1,Math.ceil(s/32)),this._maxGqaSplits);this._gqaDv.setUint32(0,t,!0),this._gqaDv.setUint32(4,a,!0),this._gqaDv.setUint32(8,a,!0),this._gqaDv.setUint32(12,a,!0);for(let 
o=0;o<this.numLayers;o++)this.layerTypes[o]==="full_attention"&&e.device.queue.writeBuffer(this._fusedSQKParams[o],16,this._gqaData,0,16);this._gqaDv.setUint32(0,s,!0),this._gqaDv.setUint32(4,this.headDim,!0),this._gqaDv.setUint32(8,this.numKVHeads,!0),this._gqaDv.setUint32(12,this.numHeads,!0),this._gqaDv.setUint32(16,this.numHeads/this.numKVHeads,!0),this._gqaDv.setUint32(20,i,!0),e.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData,0,24),this._gqaReduceParams&&(this._gqaDv.setUint32(0,this.headDim,!0),this._gqaDv.setUint32(4,i,!0),this._gqaDv.setUint32(8,this.numHeads,!0),this._gqaDv.setUint32(12,0,!0),e.device.queue.writeBuffer(this._gqaReduceParams,0,this._gqaData,0,16));const n=e._singlePass,r=this._replayFlat,u=r.length;for(let o=0;o<u;o++){const h=r[o];n.setPipeline(h.p),n.setBindGroup(0,h.bg),n.dispatchWorkgroups(h.x,h.gqa?i:h.y)}}embeddingB2(a,e){const t=this.hiddenSize,s=this.weights["model.language_model.embed_tokens.weight"],i=this.makeUniform("emb_params",[a,t]);this.runWithOffsets("embedding",[s,{buffer:this.b2.hidden,offset:0,size:t*4},i],this.wg(t));const n=this.makeUniform(`emb_params_${e}`,[e,t]);this.runWithOffsets("embedding",[s,{buffer:this.b2.hidden,offset:t*4,size:t*4},n],this.wg(t))}gptqMatvecB2(a,e,t,s,i){const n=this.getQWeight(t);if(!n.qweight)return;const r=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]),u=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2",o=this.gpu.hasF16?"b2gptqf16_":"b2gptq_";this.runCached(`${o}${t}`,u,[a,n.qweight,n.scales,e,r],this.wg4(i))}gptqMatvecB2Op(a,e,t,s,i){const n=this.getQWeight(t);if(!n.qweight)return null;const r=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]),u=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2",o=this.gpu.hasF16?"b2gptqf16_":"b2gptq_";return this.prepOpCached(`${o}${t}`,u,[a,n.qweight,n.scales,e,r],this.wg4(i))}fullAttentionB2(a,e,t,s){const 
i=`model.language_model.layers.${t}.self_attn`,n=this.hiddenSize,r=this.b2._dims,u=this.headDim,o=this.numHeads,h=this.numKVHeads,m=o/h,c=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,p=this.weights[f],w=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);c?(this.runWithOffsets("rmsnorm",[{buffer:a,offset:0,size:n*4},p,{buffer:this.b2.normed,offset:0,size:n*4},w],1),this.runWithOffsets("rmsnorm",[{buffer:a,offset:n*4,size:n*4},p,{buffer:this.b2.normed,offset:n*4,size:n*4},w],1)):this.run("add_rmsnorm_ro_b2",[a,e,p,this.b2.normed,w],2);const B=this.gptqMatvecB2Op(this.b2.normed,this.b2.qProjFull,`${i}.q_proj`,n,o*u*2),q=this.gptqMatvecB2Op(this.b2.normed,this.b2.kProj,`${i}.k_proj`,n,h*u),_=this.gptqMatvecB2Op(this.b2.normed,this.b2.vProj,`${i}.v_proj`,n,h*u);this.gpu.dispatchMulti([B,q,_].filter(Boolean));const l=this.kvCache[t],d=this._fusedSQKParams[t],y=r.qProjFullSize*4,U=r.kProjSize*4,g=r.vProjSize*4,P=r.qProjSize*4;this._gqaDv.setUint32(0,s,!0),this._gqaDv.setUint32(4,s,!0),this._gqaDv.setUint32(8,s,!0),this._gqaDv.setUint32(12,s,!0),this.gpu.device.queue.writeBuffer(d,16,this._gqaData,0,16),this.runWithOffsets(this._splitQKNormShaderKey,[{buffer:this.b2.qProjFull,offset:0,size:y},{buffer:this.b2.kProj,offset:0,size:U},{buffer:this.b2.vProj,offset:0,size:g},{buffer:this.b2.qProj,offset:0,size:P},{buffer:this.b2.qGate,offset:0,size:P},l.keys,l.values,d],o+h),this._gqaDv.setUint32(0,s+1,!0),this._gqaDv.setUint32(4,s+1,!0),this._gqaDv.setUint32(8,s+1,!0),this._gqaDv.setUint32(12,s+1,!0),this.gpu.device.queue.writeBuffer(d,16,this._gqaData,0,16),this.runWithOffsets(this._splitQKNormShaderKey,[{buffer:this.b2.qProjFull,offset:y,size:y},{buffer:this.b2.kProj,offset:U,size:U},{buffer:this.b2.vProj,offset:g,size:g},{buffer:this.b2.qProj,offset:P,size:P},{buffer:this.b2.qGate,offset:P,size:P},l.keys,l.values,d],o+h);const 
S=s+1,b=s+2;this._gqaDv.setUint32(0,S,!0),this._gqaDv.setUint32(4,u,!0),this._gqaDv.setUint32(8,h,!0),this._gqaDv.setUint32(12,o,!0),this._gqaDv.setUint32(16,m,!0),this._gqaDv.setUint32(20,this.partialDim,!0),this._gqaDv.setUint32(24,s,!0),this._gqaDv.setFloat32(28,this.ropeTheta,!0),this.gpu.device.queue.writeBuffer(this.b2._gqaParamBuf0,0,this._gqaData),this.runWithOffsets("gqa_attention_head",[{buffer:this.b2.qProj,offset:0,size:P},l.keys,l.values,{buffer:this.b2.attnOut,offset:0,size:r.qProjSize*4},this.b2._gqaParamBuf0],o),this._gqaDv.setUint32(0,b,!0),this._gqaDv.setUint32(24,s+1,!0),this.gpu.device.queue.writeBuffer(this.b2._gqaParamBuf1,0,this._gqaData),this.runWithOffsets("gqa_attention_head",[{buffer:this.b2.qProj,offset:P,size:P},l.keys,l.values,{buffer:this.b2.attnOut,offset:r.qProjSize*4,size:r.qProjSize*4},this.b2._gqaParamBuf1],o);const v=this.getQWeight(`${i}.o_proj`),C=o*u,D=this.makeUniform(`fused_sig_mv_${C}_${n}`,[C,n,this.groupSize]);this.runCached(`b2_fused_sig_oproj_${t}`,"fused_sigmoid_gptq_b2_f16",[this.b2.attnOut,this.b2.qGate,v.qweight,v.scales,this.b2.qProj,D],this.wg4(n))}linearAttentionB2(a,e,t){const s=`model.language_model.layers.${t}.linear_attn`,i=this.hiddenSize,n=this.b2._dims,r=n.linHeads,u=n.linKeyDim,o=n.linValDim;n.linEVD;const h=n.linQKVDim,m=n.valueDim,c=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,p=this.weights[f],w=this.makeUniform("add_rmsnorm_params",[i,this.rmsEps]);c?(this.runWithOffsets("rmsnorm",[{buffer:a,offset:0,size:i*4},p,{buffer:this.b2.normed,offset:0,size:i*4},w],1),this.runWithOffsets("rmsnorm",[{buffer:a,offset:i*4,size:i*4},p,{buffer:this.b2.normed,offset:i*4,size:i*4},w],1)):this.run("add_rmsnorm_ro_b2",[a,e,p,this.b2.normed,w],2);{const 
P=[this.gptqMatvecB2Op(this.b2.normed,this.b2.linQKV,`${s}.in_proj_qkv`,i,h),this.gptqMatvecB2Op(this.b2.normed,this.b2.linZ,`${s}.in_proj_z`,i,m)];this.abQuantized&&(P.push(this.gptqMatvecB2Op(this.b2.normed,this.b2.linAlpha,`${s}.in_proj_a`,i,this.linValueHeads)),P.push(this.gptqMatvecB2Op(this.b2.normed,this.b2.linBeta,`${s}.in_proj_b`,i,this.linValueHeads))),this.gpu.dispatchMulti(P.filter(Boolean))}const B=this.weights[`${s}.conv1d.weight`],q=this.weights[`${s}.A_log`],_=this.weights[`${s}.dt_bias`],l=this.weights[`${s}.norm.weight`],d=h*4,y=m*4;if(this.abQuantized){const P=this.linValueHeads,S=P*4,b=`fused_cdn_q_${r}_${u}_${o}_${h}_${P}`,v=this.paramBufs[b];this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:0,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:0,size:y},{buffer:this.b2.linAlpha,offset:0,size:S},{buffer:this.b2.linBeta,offset:0,size:S},q,_,l,v],r),this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:d,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:y,size:y},{buffer:this.b2.linAlpha,offset:S,size:S},{buffer:this.b2.linBeta,offset:S,size:S},q,_,l,v],r)}else{const P=`fused_cdn_ext_${r}_${u}_${o}_${h}_${i}_${this.linValueHeads}`,S=this.paramBufs[P],b=this.linABWeight[t];this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:0,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:0,size:y},{buffer:this.b2.normed,offset:0,size:i*4},b,q,_,l,S],r),this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:d,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:y,size:y},{buffer:this.b2.normed,offset:i*4,size:i*4},b,q,_,l,S],r)}const 
U=this.getQWeight(`${s}.out_proj`),g=this.makeUniform(`fused_silu_mv_${m}_${i}`,[m,i,this.groupSize]);this.runCached(`b2_fused_silu_oproj_${t}`,"fused_silu_gptq_b2_f16",[this.b2.linZ,this.b2.linOut,U.qweight,U.scales,this.b2.attnOut,g],this.wg4(i))}fusedNormMLPB2(a,e,t,s,i){const n=this.hiddenSize,r=this.intermediateSize,u=`model.language_model.layers.${i}.mlp`,o=`model.language_model.layers.${i}.post_attention_layernorm.weight`,h=this.weights[o],m=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.run("three_way_add_rmsnorm_b2",[a,t,s,h,e,this.b2.normed,m],2);const c=this.getQWeight(`${u}.gate_proj`),f=this.getQWeight(`${u}.up_proj`),p=this.makeUniform(`fused_mv_${n}_${r}`,[n,r,this.groupSize]);this.runCached(`b2_fused_gus_${i}`,"fused_gate_up_silu_b2_f16",[this.b2.normed,c.qweight,c.scales,f.qweight,f.scales,this.b2.mlpIntermediate,p],this.wg4(r)),this.gptqMatvecB2(this.b2.mlpIntermediate,this.b2.mlpOut,`${u}.down_proj`,r,n)}decoderLayerB2(a,e,t,s,i){let n;this.layerTypes[a]==="full_attention"?(this.fullAttentionB2(t,i,a,e),n=this.b2.qProj):(this.linearAttentionB2(t,i,a),n=this.b2.attnOut),this.fusedNormMLPB2(t,s,i,n,a)}_dispatchLmHeadB2(){if(this._lmHeadQWeight){const a=this.hiddenSize,e=this.vocabSize,t=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]),s=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2";this.runCached("b2_lm_head",s,[this.b2.normed,this._lmHeadQWeight,this._lmHeadScales,this.b2.logits,t],this.wg4(e))}else{const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",t=this.weights[e],s=this.hiddenSize,i=this.vocabSize,n=this.makeUniform("lmhead_params",[s,i]);this.runWithOffsets("bf16_matvec",[{buffer:this.b2.normed,offset:0,size:s*4},t,{buffer:this.b2.logits,offset:0,size:i*4},n],this.wg(i)),this.runWithOffsets("bf16_matvec",[{buffer:this.b2.normed,offset:s*4,size:s*4},t,{buffer:this.b2.logits,offset:i*4,size:i*4},n],this.wg(i))}}async 
forwardB2(a,e,t){this.b2||this.initB2Buffers(),this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embeddingB2(a,e);let s=this.b2.hidden,i=this.b2.hiddenB,n=this.b2.zeroBuf;for(let f=0;f<this.numLayers;f++){this.decoderLayerB2(f,t,s,i,n),n=this.b2.mlpOut;const p=s;s=i,i=p}if(this.mtp){const f=this.hiddenSize;this.gpu.copyBuffer(s,this.mtp.savedHidden,f*4,f*4,0),this.gpu.copyBuffer(this.b2.mlpOut,this.mtp.savedMlpOut,f*4,f*4,0)}const r=this.weights["model.language_model.norm.weight"],u=this.makeUniform("add_rmsnorm_params",[this.hiddenSize,this.rmsEps]);this.run("add_rmsnorm_b2",[s,this.b2.mlpOut,r,this.b2.normed,u],2),this._dispatchLmHeadB2();const h=(this.temperature??.7)>0,m=this.vocabSize,c=this.makeUniform("argmax_params",[m]);return h?(this.runWithOffsets("topk_extract",[{buffer:this.b2.logits,offset:0,size:m*4},this.b2.topkResult0,c],1),this.runWithOffsets("topk_extract",[{buffer:this.b2.logits,offset:m*4,size:m*4},this.b2.topkResult1,c],1)):(this.runWithOffsets("argmax",[{buffer:this.b2.logits,offset:0,size:m*4},this.b2.argmaxResult0,c],1),this.runWithOffsets("argmax",[{buffer:this.b2.logits,offset:m*4,size:m*4},this.b2.argmaxResult1,c],1)),this.gpu._passCount!==void 0&&!this._b2PassCountLogged&&(console.log(`[PERF] B2: ${this.gpu._passCount} compute passes per 2-token step`),this._b2PassCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),h?(this.gpu.copyBuffer(this.b2.topkResult0,this.b2.topkReadback0,2048),this.gpu.copyBuffer(this.b2.topkResult1,this.b2.topkReadback1,2048)):(this.gpu.copyBuffer(this.b2.argmaxResult0,this.b2.argmaxReadback0,8),this.gpu.copyBuffer(this.b2.argmaxResult1,this.b2.argmaxReadback1,8)),this.gpu.endBatch(),this._readAndSampleB2(h)}async _readAndSampleB2(a){const e=[0,0];a?(await 
Promise.all([this.b2.topkReadback0.mapAsync(GPUMapMode.READ),this.b2.topkReadback1.mapAsync(GPUMapMode.READ)]),e[0]=this._sampleFromTopkReadback(this.b2.topkReadback0),e[1]=this._sampleFromTopkReadback(this.b2.topkReadback1)):(await Promise.all([this.b2.argmaxReadback0.mapAsync(GPUMapMode.READ),this.b2.argmaxReadback1.mapAsync(GPUMapMode.READ)]),e[0]=new Uint32Array(this.b2.argmaxReadback0.getMappedRange().slice(0))[0],e[1]=new Uint32Array(this.b2.argmaxReadback1.getMappedRange().slice(0))[0],this.b2.argmaxReadback0.unmap(),this.b2.argmaxReadback1.unmap());for(const t of e)this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=t:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=t);return e}_sampleFromTopkReadback(a){const e=a.getMappedRange(),t=new Uint32Array(e),s=new Float32Array(e),i=new Uint32Array(256),n=new Float32Array(256);for(let o=0;o<256;o++)i[o]=t[o*2],n[o]=s[o*2+1];a.unmap();const r=this.presencePenalty??0,u=this.repetitionPenalty??1;if((r>0||u>1)&&this._recentTokenCount>0){const o=new Set;for(let h=0;h<this._recentTokenCount;h++)o.add(this._recentTokens[h]);for(let h=0;h<256;h++)o.has(i[h])&&(r>0&&(n[h]-=r),u>1&&(n[h]=n[h]>0?n[h]/u:n[h]*u))}return this._sampleFromArrays(i,n,256)}async _readAndSample(){await this._argmaxReadback.mapAsync(GPUMapMode.READ);const a=new Uint32Array(this._argmaxReadback.getMappedRange().slice(0));this._argmaxReadback.unmap();const e=a[0];return this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=e:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=e),e}_sampleFromArrays(a,e,t){const s=this.temperature??.7,i=this.topP??.8,n=this.topK??20,r=Math.min(n,t),u=this._selBuf||(this._selBuf=new Uint32Array(64)),o=this._selValBuf||(this._selValBuf=new Float32Array(64)),h=this._usedBuf||(this._usedBuf=new Uint8Array(256));h.fill(0);for(let l=0;l<r;l++){let d=-1,y=-1/0;for(let 
U=0;U<t;U++)!h[U]&&e[U]>y&&(y=e[U],d=U);if(d<0)break;u[l]=a[d],o[l]=y,h[d]=1}const m=o[0],c=this._probBuf||(this._probBuf=new Float32Array(64));let f=0;for(let l=0;l<r;l++)c[l]=Math.exp((o[l]-m)/s),f+=c[l];for(let l=0;l<r;l++)c[l]/=f;let p=0,w=r;for(let l=0;l<r;l++)if(p+=c[l],p>=i){w=l+1;break}let B=0;for(let l=0;l<w;l++)B+=c[l];const q=Math.random()*B;let _=0;for(let l=0;l<w;l++)if(_+=c[l],_>=q)return u[l];return u[0]}_sample(a){const e=this.temperature??.7,t=this.topP??.8,s=this.topK??20,i=a.length;if(e<=0){let _=0,l=a[0];for(let d=1;d<i;d++)a[d]>l&&(l=a[d],_=d);return _}const n=Math.max(s,64),r=new Int32Array(n).fill(-1),u=new Float32Array(n).fill(-1/0);let o=-1/0;for(let _=0;_<i;_++){const l=a[_];if(l>o&&(o=l),l>u[n-1]){let d=n-1;for(;d>0&&l>u[d-1];)u[d]=u[d-1],r[d]=r[d-1],d--;u[d]=l,r[d]=_}}const h=Math.min(s,n),m=new Float32Array(h);let c=0;for(let _=0;_<h&&!(r[_]<0);_++)m[_]=Math.exp((u[_]-o)/e),c+=m[_];for(let _=0;_<h;_++)m[_]/=c;let f=0,p=h;for(let _=0;_<h;_++){if(r[_]<0){p=_;break}if(f+=m[_],f>=t){p=_+1;break}}let w=0;for(let _=0;_<p;_++)w+=m[_];const B=Math.random()*w;let q=0;for(let _=0;_<p;_++)if(q+=m[_],q>=B)return r[_];return r[0]}async generate(a,e=512,t,s){this.seqLen=0,this._recentTokenCount=0,this._replayFlat=null,this._ropeDelta=s?.ropeDelta??0;const i=this.maxSeqLen||4096;if(e=Math.min(e,i-a.length),e<=0)throw new Error(`Prompt (${a.length} tokens) exceeds context size (${i})`);const n=[...a],r=this.textCfg.linear_num_key_heads,u=this.textCfg.linear_key_head_dim,o=this.textCfg.linear_value_head_dim,c=this.linValueHeads/r*o,f=r*(u+u+c);for(let P=0;P<this.numLayers;P++)if(this.layerTypes[P]==="linear_attention"){const S=r*u*c*4,b=3*f*4;this.gpu.device.queue.writeBuffer(this.linState[P],0,new Uint8Array(S)),this.gpu.device.queue.writeBuffer(this.linConvHist[P],0,new Uint8Array(b))}let p=null;if(s){let P=0;const S=s.imageTokenId,b=s.positionIds3D;for(let v=0;v<a.length;v++){const 
C=b[0][v],D=b[1][v],k=b[2][v];this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),a[v]===S?this.embeddingFromVisionBuffer(s.embedBuffer,P++):this.embedding(a[v]);let z=this.hidden,M=this.hiddenB,$=this.zeroBuf;for(let O=0;O<this.numLayers;O++){this.decoderLayer(O,C,z,M,$,D,k,v),$=this.mlpOut;const G=z;z=M,M=G}if(v===a.length-1){const O=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(z,this.mlpOut,this.normed,O,"add_final_norm"),this._dispatchLmHead();const G=this.temperature??.7;if(G>0){const R=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,R],1);const F=Math.random()*4294967295>>>0,T=this._makeMixedUniform("sample_params",[{f:G},{u:this.topK??20},{f:this.topP??.8},{u:F}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,T],1)}else{const R=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,R],1)}this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8)}this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.endBatch(),this.seqLen=v+1}p=await this._readAndSample()}else for(let P=0;P<a.length;P++)p=await this.forward(a[P],P),this.seqLen=P+1;n.push(p);const w=this.config.eos_token_id??this.textCfg.eos_token_id,B=Array.isArray(w)?w:w!=null?[w]:[248044,248046];if(t?.(p,0)||B.includes(p))return n;this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=p:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=p),this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const 
_=8;this._tokenHistoryBuf||(this._tokenHistoryBuf=this.gpu.createBuffer("token_history",_*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this._tokenHistoryReadback=this.gpu.device.createBuffer({size:_*4,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"token_history_readback"}));let l=0,d=0,y=1,U=p,g=!1;for(;y<e;){const P=performance.now(),S=Math.min(_,e-y);for(let k=0;k<S;k++){const z=this.seqLen+k+this._ropeDelta;if(this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),k===0?this.embedding(U):this.embeddingFromArgmax(),g)this._replayCoreForward(z);else{let G=this.hidden,R=this.hiddenB,F=this.zeroBuf;k===0&&!this._replayFlat&&(this._forceMinSplits=2,this.gpu.startRecording());for(let x=0;x<this.numLayers;x++){this.decoderLayer(x,z,G,R,F),F=this.mlpOut;const V=G;G=R,R=V}const T=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(G,this.mlpOut,this.normed,T,"add_final_norm"),this._dispatchLmHead(),k===0&&this.gpu._recording&&(this._forceMinSplits=0,this._buildReplayFromRecording())}const M=this.temperature??.7;if(M>0){const G=this.repetitionPenalty??1,R=this.presencePenalty??0,F=this._recentTokenCount+k;if(F>0&&(G>1||R>0)){const j=this._makeMixedUniform("penalty_params",[{u:Math.min(F,this._repMaxTokens)},{f:G},{f:R},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,j],Math.ceil(Math.min(F,this._repMaxTokens)/256))}const T=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,T],1);const x=Math.random()*4294967295>>>0,V=this._makeMixedUniform("sample_params",[{f:M},{u:this.topK??20},{f:this.topP??.8},{u:x}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,V],1)}else{const G=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,G],1)}const 
$=(this._recentTokenCount+k)%this._repMaxTokens,O=this.makeUniform(`append_${k}`,[$,k]);this.run("append_token",[this.argmaxResult,this._gpuRecentTokens,this._tokenHistoryBuf,O],1),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.endBatch()}!g&&this._replayFlat&&(g=!0);const b=this.gpu.device.createCommandEncoder();b.copyBufferToBuffer(this._tokenHistoryBuf,0,this._tokenHistoryReadback,0,S*4),this.gpu.device.queue.submit([b.finish()]),await this._tokenHistoryReadback.mapAsync(GPUMapMode.READ);const v=new Uint32Array(this._tokenHistoryReadback.getMappedRange().slice(0,S*4));this._tokenHistoryReadback.unmap();const C=performance.now();l+=C-P,d+=S;let D=!1;for(let k=0;k<S;k++){const z=v[k];n.push(z),this.seqLen++,this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=z:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=z);const M=t?.(z,y);if(y++,M||B.includes(z)){D=!0;break}}if(d%50<_&&console.log(`[T @${d}] ${(l/d).toFixed(1)}ms/tok (batch=${_})`),D)break;U=v[S-1],this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4)}return d>0&&console.log(`[T final @${d}] ${(l/d).toFixed(1)}ms/tok (batch=${_})`),n}async _quantizeBF16Weight(a,e,t,s){const i=this.groupSize,n=e/8,r=e/i,u=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,o=this.gpu.createBuffer(`${s}_qweight`,n*t*4,u),h=this.gpu.createBuffer(`${s}_scales_f32`,r*t*4,u),m=Math.ceil(r*t/2)*4,c=this.gpu.createBuffer(`${s}_scales`,m,u),f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await A(async()=>{const{SHADERS:v}=await import("./gpu-ops-PQDFq1iI.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),p=this.gpu.createBufferFromData(`${s}_qparams`,new 
Uint32Array([e,t,i]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),w=this.gpu.createBindGroup(f,0,[a,o,h,p]),B=65535,q=Math.min(t,B),_=Math.ceil(t/B),l=this.gpu.device.createCommandEncoder(),d=l.beginComputePass();d.setPipeline(f),d.setBindGroup(0,w),d.dispatchWorkgroups(q,_),d.end(),this.gpu.device.queue.submit([l.finish()]);const y=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await A(async()=>{const{SHADERS:v}=await import("./gpu-ops-PQDFq1iI.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),U=Math.ceil(r*t/2),g=this.gpu.createBufferFromData(`${s}_pparams`,new Uint32Array([U]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),P=this.gpu.createBindGroup(y,0,[h,c,g]),S=this.gpu.device.createCommandEncoder(),b=S.beginComputePass();return b.setPipeline(y),b.setBindGroup(0,P),b.dispatchWorkgroups(Math.ceil(U/256)),b.end(),this.gpu.device.queue.submit([S.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),h.destroy(),p.destroy(),g.destroy(),{qweight:o,scales:c}}async initMTP(a){if(this.mtp)return;const e=this.hiddenSize,t=this.intermediateSize,s=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,i=performance.now(),n="Qwen/Qwen3.5-2B";console.log(`[MTP] Downloading MTP weights from ${n}...`);const{loadMTPWeights:r}=await A(async()=>{const{loadMTPWeights:B}=await import("./safetensors-loader-CwGm5mJX.js");return{loadMTPWeights:B}},[],import.meta.url),u=await r(n,a);console.log(`[MTP] Downloaded ${Object.keys(u).length} MTP tensors`),this.mtp={weights:{},qweights:{}};const o={};for(const[B,q]of Object.entries(u)){const _=this.gpu.createBufferFromData(`mtp_${B}`,q.data);o[B]=_,this.mtp.weights[B]=_}const 
h=[{name:"mtp.fc.weight",K:e*2,N:e},{name:"mtp.layers.0.self_attn.q_proj.weight",K:e,N:this.numHeads*this.headDim*2},{name:"mtp.layers.0.self_attn.k_proj.weight",K:e,N:this.numKVHeads*this.headDim},{name:"mtp.layers.0.self_attn.v_proj.weight",K:e,N:this.numKVHeads*this.headDim},{name:"mtp.layers.0.self_attn.o_proj.weight",K:e,N:e},{name:"mtp.layers.0.mlp.gate_proj.weight",K:e,N:t},{name:"mtp.layers.0.mlp.up_proj.weight",K:e,N:t},{name:"mtp.layers.0.mlp.down_proj.weight",K:t,N:e}];for(const{name:B,K:q,N:_}of h){const{qweight:l,scales:d}=await this._quantizeBF16Weight(o[B],q,_,`mtp_${B}`);this.mtp.qweights[B]={qweight:l,scales:d},o[B].destroy(),delete this.mtp.weights[B]}this.mtp.normRaw={};const m=["mtp.pre_fc_norm_hidden.weight","mtp.pre_fc_norm_embedding.weight","mtp.layers.0.input_layernorm.weight","mtp.layers.0.post_attention_layernorm.weight","mtp.norm.weight","mtp.layers.0.self_attn.q_norm.weight","mtp.layers.0.self_attn.k_norm.weight"];for(const B of m){const q=u[B];q&&(this.mtp.normRaw[B]=new Uint32Array(q.data.buffer.slice(q.data.byteOffset,q.data.byteOffset+q.data.byteLength)))}const f=4096*this.numKVHeads*this.headDim*4;this.mtp.kvCache={keys:this.gpu.createBuffer("mtp_kv_keys",f,s),values:this.gpu.createBuffer("mtp_kv_values",f,s)},this.mtp.concatBuf=this.gpu.createBuffer("mtp_concat",e*2*4,s),this.mtp.fcOut=this.gpu.createBuffer("mtp_fc_out",e*4,s),this.mtp.savedHidden=this.gpu.createBuffer("mtp_saved_hidden",e*4,s),this.mtp.savedMlpOut=this.gpu.createBuffer("mtp_saved_mlp_out",e*4,s),this.mtp.seqLen=0,this._buildMTPSplitQKNormUniform(),this._buildMTPFusedMLPUniform(),await this._buildTrimmedLmHead(8e3);const p=((performance.now()-i)/1e3).toFixed(1),w=h.length;console.log(`[MTP] Initialized in ${p}s: ${w} INT4 weights, 1 KV cache layer`)}_buildMTPSplitQKNormUniform(){const a=this.numHeads,e=this.numKVHeads,t=this.headDim,s=a*t,i=e*t,n=(s+i)/2,u=32+Math.ceil(n/4)*16,o=new ArrayBuffer(u),h=new 
DataView(o);h.setUint32(0,a,!0),h.setUint32(4,e,!0),h.setUint32(8,t,!0),h.setFloat32(12,this.rmsEps,!0),h.setUint32(16,0,!0),h.setUint32(20,0,!0),h.setUint32(24,0,!0),h.setUint32(28,0,!0);const m=this.mtp.normRaw["mtp.layers.0.self_attn.q_norm.weight"],c=this.mtp.normRaw["mtp.layers.0.self_attn.k_norm.weight"];if(m)for(let p=0;p<s/2;p++){const w=Math.floor(p/4),B=p%4;h.setUint32(32+w*16+B*4,m[p],!0)}if(c){const p=s/2;for(let w=0;w<i/2;w++){const B=p+w,q=Math.floor(B/4),_=B%4;h.setUint32(32+q*16+_*4,c[w],!0)}}const f=this.gpu.device.createBuffer({size:u,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"mtp_fused_sqk"});this.gpu.device.queue.writeBuffer(f,0,new Uint8Array(o)),this.mtp.fusedSQKParams=f}_buildMTPFusedMLPUniform(){}async _buildTrimmedLmHead(a=8e3){const e=this.hiddenSize,t=this.gpu,s=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,i=new Uint32Array(a);for(let y=0;y<a;y++)i[y]=y;this.mtp.trimmedToFull=i,this.mtp.trimmedVocabSize=a;const n=t.createBufferFromData("mtp_trim_indices",i),u=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",o=this.weights[u],h=e/2,m=a*h*4,c=t.createBuffer("mtp_trim_gathered",m,s),f=(await A(async()=>{const{SHADERS:y}=await import("./gpu-ops-PQDFq1iI.js").then(U=>U.b);return{SHADERS:y}},[],import.meta.url)).SHADERS.gather_rows_bf16,p=t.getOrCreatePipeline("gather_rows_bf16",f),w=t.createBufferFromData("mtp_trim_gparams",new Uint32Array([h,a]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),B=t.createBindGroup(p,0,[o,n,c,w]),q=t.device.createCommandEncoder(),_=q.beginComputePass();_.setPipeline(p),_.setBindGroup(0,B),_.dispatchWorkgroups(Math.ceil(h/256),a),_.end(),t.device.queue.submit([q.finish()]);const{qweight:l,scales:d}=await 
this._quantizeBF16Weight(c,e,a,"mtp_trim_lmhead");this.mtp.trimmedLmHead={qweight:l,scales:d},this.mtp.trimmedLogits=t.createBuffer("mtp_trimmed_logits",a*4,s),c.destroy(),n.destroy(),w.destroy(),console.log(`[MTP] Trimmed lm_head: ${a} tokens, ${(l.size/1024/1024).toFixed(1)}MB qw + ${(d.size/1024/1024).toFixed(1)}MB sc`)}_dispatchTrimmedLmHead(){const a=this.hiddenSize,e=this.mtp.trimmedVocabSize,t=this.mtp.trimmedLmHead,i=a/this.groupSize%4===0,n=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]);if(i){const r=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached("mtp_trim_lmhead",r,[this.normed,t.qweight,t.scales,this.mtp.trimmedLogits,n],this.wg8(e))}else{const r=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached("mtp_trim_lmhead",r,[this.normed,t.qweight,t.scales,this.mtp.trimmedLogits,n],this.wg4(e))}}_mtpGetQWeight(a){return this.mtp.qweights[a]}_mtpGptqMatvec(a,e,t,s,i){const n=this._mtpGetQWeight(t);if(!n)throw new Error(`MTP weight not found: ${t}`);const u=s/this.groupSize%4===0,o=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);if(u){const h=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg8(i))}else{const h=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg4(i))}}_mtpSaveHidden(a,e){this.mtp&&(this.gpu.copyBuffer(a,this.mtp.savedHidden,this.hiddenSize*4),this.gpu.copyBuffer(e,this.mtp.savedMlpOut,this.hiddenSize*4))}async mtpForward(a){if(!this.mtp)throw new Error("MTP not initialized. 
Call initMTP() first.");return this.mtpForwardSubmit(a),await this._readAndSample()}mtpForwardSubmit(a){const e=this.hiddenSize,t=this.intermediateSize,s=this.mtp.seqLen;this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);const i=this.makeUniform("add_rmsnorm_params",[e,this.rmsEps]);this.runCached("mtp_emb_norm","rmsnorm",[this.hidden,this.mtp.weights["mtp.pre_fc_norm_embedding.weight"],this.normed,i],1),this.gpu.copyBuffer(this.normed,this.mtp.concatBuf,e*4),this.runCached("mtp_hid_norm","add_rmsnorm_ro",[this.mtp.savedHidden,this.mtp.savedMlpOut,this.mtp.weights["mtp.pre_fc_norm_hidden.weight"],this.normed,i],1),this.gpu.copyBuffer(this.normed,this.mtp.concatBuf,e*4,0,e*4),this._mtpGptqMatvec(this.mtp.concatBuf,this.mtp.fcOut,"mtp.fc.weight",e*2,e),this.runCached("mtp_input_norm","rmsnorm",[this.mtp.fcOut,this.mtp.weights["mtp.layers.0.input_layernorm.weight"],this.normed,i],1);const n="mtp.layers.0.self_attn",r=this._mtpGptqMatvecOp(this.normed,this.qProjFull,`${n}.q_proj.weight`,e,this.numHeads*this.headDim*2),u=this._mtpGptqMatvecOp(this.normed,this.kProj,`${n}.k_proj.weight`,e,this.numKVHeads*this.headDim),o=this._mtpGptqMatvecOp(this.normed,this.vProj,`${n}.v_proj.weight`,e,this.numKVHeads*this.headDim);this.gpu.dispatchMulti([r,u,o].filter(Boolean)),this._gqaDv.setUint32(0,s,!0),this._gqaDv.setUint32(4,s,!0),this._gqaDv.setUint32(8,s,!0),this._gqaDv.setUint32(12,s,!0),this.gpu.device.queue.writeBuffer(this.mtp.fusedSQKParams,16,this._gqaData,0,16),this.runCached("mtp_sqk",this._splitQKNormShaderKey,[this.qProjFull,this.kProj,this.vProj,this.qProj,this.qGate,this.mtp.kvCache.keys,this.mtp.kvCache.values,this.mtp.fusedSQKParams],this.numHeads+this.numKVHeads);const 
h=s+1;this._gqaDv.setUint32(0,h,!0),this._gqaDv.setUint32(4,this.headDim,!0),this._gqaDv.setUint32(8,this.numKVHeads,!0),this._gqaDv.setUint32(12,this.numHeads,!0),this._gqaDv.setUint32(16,this.numHeads/this.numKVHeads,!0),this._gqaDv.setUint32(20,this.partialDim,!0),this._gqaDv.setUint32(24,s,!0),this._gqaDv.setFloat32(28,this.ropeTheta,!0),this.gpu.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData),this.runCached("mtp_gqa","gqa_attention_head",[this.qProj,this.mtp.kvCache.keys,this.mtp.kvCache.values,this.attnOut,this._gqaParamBuf],this.numHeads);const m=this._mtpGetQWeight(`${n}.o_proj.weight`),c=this.numHeads*this.headDim,f=this.makeUniform(`fused_sig_mv_${c}_${e}`,[c,e,this.groupSize]);c/this.groupSize%4===0&&this.gpu.hasF16?this.runCached("mtp_sig_oproj","fused_sigmoid_gptq_4t_f16",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg8(e)):this.gpu.hasF16?this.runCached("mtp_sig_oproj","fused_sigmoid_gptq_f16",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg4(e)):this.runCached("mtp_sig_oproj","fused_sigmoid_gptq",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg4(e)),this.runCached("mtp_post_norm","add_rmsnorm_ro",[this.mtp.fcOut,this.qProj,this.mtp.weights["mtp.layers.0.post_attention_layernorm.weight"],this.normed,i],1);const B=this.gpu.hasF16?"fused_gate_up_silu_f16":"fused_gate_up_silu",q=this.makeUniform(`fused_mv_${e}_${t}`,[e,t,this.groupSize]),_=this.mtp.qweights["mtp.layers.0.mlp.gate_proj.weight"],l=this.mtp.qweights["mtp.layers.0.mlp.up_proj.weight"];this.runCached("mtp_gate_up_silu",B,[this.normed,_.qweight,_.scales,l.qweight,l.scales,this.mlpIntermediate,q],this.wg4(t)),this._mtpGptqMatvec(this.mlpIntermediate,this.mlpOut,"mtp.layers.0.mlp.down_proj.weight",t,e),this.runCached("mtp_final_norm","three_way_add_rmsnorm",[this.mtp.fcOut,this.qProj,this.mlpOut,this.mtp.weights["mtp.norm.weight"],this.hidden,this.normed,i],1),this._dispatchLmHead();const 
d=this.makeUniform("mtp_argmax_params",[this.vocabSize]);this.runCached("mtp_argmax","argmax",[this.logits,this.argmaxResult,d],1),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch(),this.mtp.seqLen++}_mtpSnapshotDeltaNet(){if(!this.mtp._deltaNetSnapshot){const e=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.mtp._deltaNetSnapshot={states:{},convs:{}};for(let t=0;t<this.numLayers;t++)this.layerTypes[t]==="linear_attention"&&(this.mtp._deltaNetSnapshot.states[t]=this.gpu.createBuffer(`mtp_snap_state_${t}`,this.linState[t].size,e),this.mtp._deltaNetSnapshot.convs[t]=this.gpu.createBuffer(`mtp_snap_conv_${t}`,this.linConvHist[t].size,e))}const a=this.mtp._deltaNetSnapshot;a.mtpSeqLen=this.mtp.seqLen;for(const e of Object.keys(a.states))this.gpu.copyBuffer(this.linState[e],a.states[e],this.linState[e].size),this.gpu.copyBuffer(this.linConvHist[e],a.convs[e],this.linConvHist[e].size)}_mtpRestoreDeltaNet(){const a=this.mtp._deltaNetSnapshot;if(a){a.mtpSeqLen!==void 0&&(this.mtp.seqLen=a.mtpSeqLen);for(const e of Object.keys(a.states))this.gpu.copyBuffer(a.states[e],this.linState[e],this.linState[e].size),this.gpu.copyBuffer(a.convs[e],this.linConvHist[e],this.linConvHist[e].size)}}async generateWithMTP(a,e=512,t){if(!this.mtp)throw new Error("MTP not initialized. 
Call initMTP() first.");this.seqLen=0,this.mtp.seqLen=0,this._recentTokenCount=0;const s=[...a],i=this.textCfg.linear_num_key_heads,n=this.textCfg.linear_key_head_dim,r=this.textCfg.linear_value_head_dim,o=this.linValueHeads/i*r,h=i*(n+n+o);for(let g=0;g<this.numLayers;g++)if(this.layerTypes[g]==="linear_attention"){const P=i*n*o*4,S=3*h*4;this.gpu.device.queue.writeBuffer(this.linState[g],0,new Uint8Array(P)),this.gpu.device.queue.writeBuffer(this.linConvHist[g],0,new Uint8Array(S))}let m=null;for(let g=0;g<a.length;g++)m=await this.forward(a[g],g),this.seqLen=g+1;s.push(m);const c=this.config.eos_token_id??this.textCfg.eos_token_id,f=Array.isArray(c)?c:c!=null?[c]:[248044,248046];if(t?.(m,0)||f.includes(m))return s;let w=1,B=0,q=0,_=m,l=0,d=0;for(;w<e;){const g=performance.now(),P=await this.mtpForward(_);this._mtpSnapshotDeltaNet();const S=this.seqLen,b=await this.forwardB2(_,P,this.seqLen);this.seqLen+=2;const v=b[0],C=b[1];if(v===P){B++,s.push(P),w++;let k=t?.(P,w-1);if(k||f.includes(P)||(s.push(C),w++,k=t?.(C,w-1),k||f.includes(C)))break;_=C}else{q++,this._mtpRestoreDeltaNet(),this.seqLen=S;const k=await this.forward(_,this.seqLen);if(this.seqLen++,s.push(k),w++,t?.(k,w-1)||f.includes(k))break;_=k}const D=performance.now();if(l+=D-g,d++,d%25===0){const k=B/(B+q)*100,z=w/d;console.log(`[MTP @${d}] ${(l/d).toFixed(1)}ms/step, ${(w/(l/1e3)).toFixed(0)} tok/s, accept=${k.toFixed(0)}%, ${z.toFixed(1)} tok/step`)}}const y=B/Math.max(1,B+q)*100;return console.log(`[MTP final] ${(w/((l||1)/1e3)).toFixed(0)} tok/s, accept=${y.toFixed(0)}% (${B}/${B+q}), ${w} tokens`),s}_mtpGptqMatvecOp(a,e,t,s,i){const n=this._mtpGetQWeight(t);if(!n)return null;const u=s/this.groupSize%4===0,o=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);if(u){const h=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";return this.prepOpCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg8(i))}else{const h=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";return 
this.prepOpCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg4(i))}}}export{N as Qwen35Model};
|
|
|
|
|
|
assets/qwen35-model-CmivuQnY.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
import{S as H,a as W,_ as A}from"./gpu-ops-DKsrMEcC.js";function K(E){const a=E<<16,e=new ArrayBuffer(4);new Uint32Array(e)[0]=a,new Float32Array(e)[0];const t=a>>>31&1,s=a>>>23&255,i=a&8388607;if(s===0)return t<<15;if(s===255)return t<<15|31744|(i?512:0);const n=s-127+15;return n>=31?t<<15|31744:n<=0?t<<15:t<<15|n<<10|i>>>13}class N{constructor(a,e,t){if(this.gpu=a,this.config=e,this.textCfg=e.text_config,this.quantConfig=t,this.hiddenSize=this.textCfg.hidden_size,this.intermediateSize=this.textCfg.intermediate_size,this.numLayers=this.textCfg.num_hidden_layers,this.numHeads=this.textCfg.num_attention_heads,this.numKVHeads=this.textCfg.num_key_value_heads,this.headDim=this.textCfg.head_dim,this.vocabSize=this.textCfg.vocab_size,this.rmsEps=this.textCfg.rms_norm_eps,this.ropeTheta=this.textCfg.rope_parameters?.rope_theta||1e7,this.partialRotary=this.textCfg.partial_rotary_factor,this.partialDim=Math.floor(this.headDim*this.partialRotary),this.mropeSection=this.textCfg.rope_parameters?.mrope_section||[11,11,10],this.layerTypes=this.textCfg.layer_types,this.groupSize=t?.group_size||t?.config_groups?.group_0?.weights?.group_size||128,this.fp16Layers=new Set,t?.extra_config)for(const u of Object.keys(t.extra_config))this.fp16Layers.add(u);this.weights={},this.kvCache={},this.seqLen=0,this.pipelines={},this.singlePassMode=!0,this._replayFlat=null,this._forceMinSplits=0,this._ropeDelta=0;const s=this.numHeads*this.headDim,i=this.numKVHeads*this.headDim,n=(s+i)/2,r=Math.ceil(n/4);this._splitQKNormShaderKey=`fused_split_qknorm_kvstore_${r}`,H[this._splitQKNormShaderKey]||(H[this._splitQKNormShaderKey]=W(r))}compilePipelines(){const a=this.gpu;for(const[e,t]of Object.entries(H))this.pipelines[e]=a.getOrCreatePipeline(e,t)}uploadTensors(a){this.quantConfig?.quant_method==="compressed-tensors"&&(a=this._convertCompressedTensors(a));for(const[t,s]of Object.entries(a)){let 
i=s.data;if(s.dtype==="F32"&&!t.endsWith(".qweight")&&!t.endsWith(".qzeros")&&!t.endsWith(".scales")){const n=new Float32Array(i.buffer,i.byteOffset,i.byteLength/4),r=new Uint16Array(n.length);for(let u=0;u<n.length;u++){const o=new Uint32Array(n.buffer,n.byteOffset+u*4,1)[0];r[u]=o>>>16}i=new Uint8Array(r.buffer)}if(s._partial){let{offset:n,totalSize:r}=s._partial;if(s.dtype==="F32"&&!t.endsWith(".qweight")&&!t.endsWith(".qzeros")&&!t.endsWith(".scales")&&(n/=2,r/=2),n===0){const o=this.gpu.createBuffer(t,r,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC);this.gpu.device.queue.writeBuffer(o,0,i),this.weights[t]=o}else{const o=this.weights[t];o&&this.gpu.device.queue.writeBuffer(o,n,i)}}else this.weights[t]=this.gpu.createBufferFromData(t,i);(t.includes("post_attention_layernorm.weight")||t.includes("input_layernorm.weight")||t.includes("q_norm.weight")||t.includes("k_norm.weight")||t==="model.language_model.norm.weight")&&(this._normWeightRaw||(this._normWeightRaw={}),this._normWeightRaw[t]=new Uint32Array(i.buffer.slice(i.byteOffset,i.byteOffset+i.byteLength))),this._uploadedCount=(this._uploadedCount||0)+1}}_convertCompressedTensors(a){const e={};for(const[t,s]of Object.entries(a))if(!t.endsWith(".weight_shape")){if(t.endsWith(".weight_packed")){const i=t.slice(0,-14),n=s.shape,r=n[0],u=n[1],o=new Int32Array(s.data.buffer,s.data.byteOffset,s.data.byteLength/4),h=new Int32Array(r*u);for(let m=0;m<r;m++)for(let c=0;c<u;c++)h[c*r+m]=o[m*u+c];e[`${i}.qweight`]={dtype:"I32",shape:[u,r],data:new Uint8Array(h.buffer)};continue}if(t.endsWith(".weight_scale")){const i=t.slice(0,-13),n=s.shape,r=n[0],u=n[1],o=new Uint16Array(s.data.buffer,s.data.byteOffset,s.data.byteLength/2),h=Math.ceil(r/2),m=new Uint32Array(u*h);for(let c=0;c<u;c++)for(let f=0;f<r;f+=2){const p=o[f*u+c],w=f+1<r?o[(f+1)*u+c]:0,B=K(p),q=K(w);m[c*h+(f>>1)]=B|q<<16}e[`${i}.scales`]={dtype:"I32",shape:[u,h],data:new Uint8Array(m.buffer)};continue}e[t]=s}return e}async 
postProcessWeights(){const a=this.textCfg.linear_num_key_heads||0,e=this.hiddenSize,t=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,s=this.layerTypes.indexOf("linear_attention"),i=s>=0?`model.language_model.layers.${s}.linear_attn`:"";if(this.abQuantized=i&&!!this.weights[`${i}.in_proj_a.qweight`],this.abQuantized){const g=this.textCfg.linear_num_value_heads??a;this.linAlpha=this.gpu.createBuffer("lin_alpha",g*4,t),this.linBeta=this.gpu.createBuffer("lin_beta",g*4,t)}else{this.linABWeight={};const g=this.textCfg.linear_num_value_heads??a,P=e*2,S=g*P,b=2*S;for(let v=0;v<this.numLayers;v++)if(this.layerTypes[v]==="linear_attention"){const C=`model.language_model.layers.${v}.linear_attn`,D=this.weights[`${C}.in_proj_a.weight`],k=this.weights[`${C}.in_proj_b.weight`];if(D&&k){const z=this.gpu.createBuffer(`ab_merged_${v}`,b,t),M=this.gpu.device.createCommandEncoder();M.copyBufferToBuffer(D,0,z,0,S),M.copyBufferToBuffer(k,0,z,S,S),this.gpu.device.queue.submit([M.finish()]),this.linABWeight[v]=z}}}{const g=[];for(let b=0;b<this.numLayers;b++){if(this.layerTypes[b]==="linear_attention"){const v=`model.language_model.layers.${b}.linear_attn`,C=this.textCfg.linear_num_key_heads||0,D=this.textCfg.linear_key_head_dim||128,k=this.textCfg.linear_value_head_dim||128,z=this.textCfg.linear_num_value_heads??C,M=z*k,$=z/C*k,O=C*(D+D+$);g.push({prefix:`${v}.in_proj_qkv`,K:e,N:O}),g.push({prefix:`${v}.in_proj_z`,K:e,N:M}),g.push({prefix:`${v}.out_proj`,K:M,N:e})}else{const 
v=`model.language_model.layers.${b}.self_attn`,C=this.numHeads*this.headDim*2,D=this.numKVHeads*this.headDim;g.push({prefix:`${v}.q_proj`,K:e,N:C}),g.push({prefix:`${v}.k_proj`,K:e,N:D}),g.push({prefix:`${v}.v_proj`,K:e,N:D}),g.push({prefix:`${v}.o_proj`,K:this.numHeads*this.headDim,N:e})}g.push({prefix:`model.language_model.layers.${b}.mlp.gate_proj`,K:e,N:this.intermediateSize}),g.push({prefix:`model.language_model.layers.${b}.mlp.up_proj`,K:e,N:this.intermediateSize}),g.push({prefix:`model.language_model.layers.${b}.mlp.down_proj`,K:this.intermediateSize,N:e})}let P=0;const S=performance.now();for(const{prefix:b,K:v,N:C}of g)if(!this.weights[`${b}.qweight`]&&this.weights[`${b}.weight`]){const{qweight:D,scales:k}=await this._quantizeBF16ToINT4(this.weights[`${b}.weight`],v,C,this.groupSize,b.replace(/\./g,"_"));this.weights[`${b}.qweight`]=D,this.weights[`${b}.scales`]=k,P++}P>0&&console.log(`[QUANT] GPU-quantized ${P} BF16 projections to INT4 in ${(performance.now()-S).toFixed(0)}ms`)}this._mergedGateUp={};const n=this.intermediateSize,r=e/8,u=r*n*4,h=r/(this.groupSize/8)*n*2;for(let g=0;g<this.numLayers;g++){const P=`model.language_model.layers.${g}.mlp`,S=this.getQWeight(`${P}.gate_proj`),b=this.getQWeight(`${P}.up_proj`);if(S.qweight&&b.qweight){const v=this.gpu.createBuffer(`merged_qw_${g}`,u*2,t),C=this.gpu.createBuffer(`merged_sc_${g}`,h*2,t),D=this.gpu.device.createCommandEncoder();D.copyBufferToBuffer(S.qweight,0,v,0,u),D.copyBufferToBuffer(b.qweight,0,v,u,u),D.copyBufferToBuffer(S.scales,0,C,0,h),D.copyBufferToBuffer(b.scales,0,C,h,h),this.gpu.device.queue.submit([D.finish()]),this._mergedGateUp[g]={qweight:v,scales:C}}}this._fusedMLPParams={};const m=16+512*16;for(let g=0;g<this.numLayers;g++){const P=`model.language_model.layers.${g}.post_attention_layernorm.weight`,S=this._normWeightRaw?.[P];if(!S||!this._mergedGateUp[g])continue;const b=new ArrayBuffer(m),v=new Uint32Array(b),C=new 
Float32Array(b);v[0]=e,v[1]=n,v[2]=this.groupSize,C[3]=this.rmsEps;for(let D=0;D<S.length;D++)v[4+D]=S[D];this._fusedMLPParams[g]=this.gpu.createBufferFromData(`fused_mlp_params_${g}`,new Uint32Array(b),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}this._fusedSQKParams={};const c=this.headDim,f=this.numHeads,p=this.numKVHeads,w=f*c,B=p*c,q=(w+B)/2,_=Math.ceil(q/4),l=32+_*16,d=this.mropeSection[1]*3,y=this.mropeSection[2]*3,U=`fused_split_qknorm_kvstore_${_}`;H[U]||(H[U]=W(_,this.ropeTheta,d,y,this.partialDim)),this.pipelines[U]||(this.pipelines[U]=this.gpu.getOrCreatePipeline(U,H[U])),this._splitQKNormShaderKey=U;for(let g=0;g<this.numLayers;g++){if(this.layerTypes[g]!=="full_attention")continue;const P=`model.language_model.layers.${g}.self_attn`,S=`${P}.q_norm.weight`,b=`${P}.k_norm.weight`,v=this._normWeightRaw?.[S],C=this._normWeightRaw?.[b],D=new ArrayBuffer(l),k=new DataView(D);if(k.setUint32(0,f,!0),k.setUint32(4,p,!0),k.setUint32(8,c,!0),k.setFloat32(12,this.rmsEps,!0),k.setUint32(16,0,!0),k.setUint32(20,0,!0),k.setUint32(24,0,!0),k.setUint32(28,0,!0),v)for(let M=0;M<w/2;M++){const $=Math.floor(M/4),O=M%4;k.setUint32(32+$*16+O*4,v[M],!0)}if(C){const M=w/2;for(let $=0;$<B/2;$++){const O=M+$,G=Math.floor(O/4),R=O%4;k.setUint32(32+G*16+R*4,C[$],!0)}}const z=this.gpu.device.createBuffer({size:l,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:`fused_sqk_${g}`});this.gpu.device.queue.writeBuffer(z,0,new Uint8Array(D)),this._fusedSQKParams[g]=z}await this._quantizeLmHead()}async _quantizeLmHead(){const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight";if(!this.weights[e])return;const t=this.hiddenSize,s=this.vocabSize,i=this.groupSize,n=t/8,r=t/i,u=performance.now(),o=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,h=this.weights[e];this._lmHeadQWeight=this.gpu.createBuffer("lmhead_qweight",n*s*4,o);const 
m=this.gpu.createBuffer("lmhead_scales_f32",r*s*4,o),c=Math.ceil(r*s/2)*4;this._lmHeadScales=this.gpu.createBuffer("lmhead_scales",c,o);const f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await A(async()=>{const{SHADERS:D}=await import("./gpu-ops-DKsrMEcC.js").then(k=>k.b);return{SHADERS:D}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),p=this.gpu.createBufferFromData("quant_params",new Uint32Array([t,s,i]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),w=this.gpu.createBindGroup(f,0,[h,this._lmHeadQWeight,m,p]),B=65535,q=Math.min(s,B),_=Math.ceil(s/B),l=this.gpu.device.createCommandEncoder(),d=l.beginComputePass();d.setPipeline(f),d.setBindGroup(0,w),d.dispatchWorkgroups(q,_),d.end(),this.gpu.device.queue.submit([l.finish()]);const y=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await A(async()=>{const{SHADERS:D}=await import("./gpu-ops-DKsrMEcC.js").then(k=>k.b);return{SHADERS:D}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),U=Math.ceil(r*s/2),g=this.gpu.createBufferFromData("pack_params",new Uint32Array([U]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),P=this.gpu.createBindGroup(y,0,[m,this._lmHeadScales,g]),S=this.gpu.device.createCommandEncoder(),b=S.beginComputePass();b.setPipeline(y),b.setBindGroup(0,P),b.dispatchWorkgroups(Math.ceil(U/256)),b.end(),this.gpu.device.queue.submit([S.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),m.destroy(),p.destroy(),g.destroy();const v=(n*s*4/1e6).toFixed(0),C=(c/1e6).toFixed(0);console.log(`[QUANT] lm_head INT4 (GPU): ${(performance.now()-u).toFixed(0)}ms, ${v}MB qw + ${C}MB sc`)}async _quantizeBF16ToINT4(a,e,t,s,i){const n=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,r=e/8,u=e/s,o=this.gpu.createBuffer(`${i}_qweight`,r*t*4,n),h=this.gpu.createBuffer(`${i}_scales_f32`,u*t*4,n),m=Math.ceil(u*t/2)*4,c=this.gpu.createBuffer(`${i}_scales`,m,n),f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await A(async()=>{const{SHADERS:v}=await 
import("./gpu-ops-DKsrMEcC.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),p=this.gpu.createBufferFromData(`${i}_qp`,new Uint32Array([e,t,s]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),w=this.gpu.createBindGroup(f,0,[a,o,h,p]),B=65535,q=Math.min(t,B),_=Math.ceil(t/B),l=this.gpu.device.createCommandEncoder(),d=l.beginComputePass();d.setPipeline(f),d.setBindGroup(0,w),d.dispatchWorkgroups(q,_),d.end(),this.gpu.device.queue.submit([l.finish()]);const y=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await A(async()=>{const{SHADERS:v}=await import("./gpu-ops-DKsrMEcC.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),U=Math.ceil(u*t/2),g=this.gpu.createBufferFromData(`${i}_pp`,new Uint32Array([U]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),P=this.gpu.createBindGroup(y,0,[h,c,g]),S=this.gpu.device.createCommandEncoder(),b=S.beginComputePass();return b.setPipeline(y),b.setBindGroup(0,P),b.dispatchWorkgroups(Math.ceil(U/256)),b.end(),this.gpu.device.queue.submit([S.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),h.destroy(),p.destroy(),g.destroy(),{qweight:o,scales:c}}initBuffers(a){a||(a=this.textCfg.max_position_embeddings||4096),this.maxSeqLen=a;const 
e=this.gpu,t=this.hiddenSize,s=this.intermediateSize,i=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.hidden=e.createBuffer("hidden_a",t*4,i),this.hiddenB=e.createBuffer("hidden_b",t*4,i),this.normed=e.createBuffer("normed",t*4,i),this.normedB=e.createBuffer("normed_b",t*4,i),this.mlpIntermediate=e.createBuffer("mlp_inter",s*4,i),this.mlpOut=e.createBuffer("mlp_out",t*4,i),this.logits=e.createBuffer("logits",this.vocabSize*4,i),this.argmaxResult=e.createBuffer("argmax_result",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this._argmaxReadback=e.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"argmax_readback"}),this._topkResult=e.createBuffer("topk_result",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this._repMaxTokens=64,this._recentTokens=new Uint32Array(this._repMaxTokens),this._recentTokenCount=0,this._gpuRecentTokens=e.createBuffer("recent_tokens",this._repMaxTokens*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST);for(let q=0;q<this.numLayers;q++)this.layerTypes[q]==="full_attention"&&(this.kvCache[q]={keys:e.createBuffer(`kv_k_${q}`,a*this.numKVHeads*this.headDim*4),values:e.createBuffer(`kv_v_${q}`,a*this.numKVHeads*this.headDim*4)});this.qProjFull=e.createBuffer("q_proj_full",this.numHeads*this.headDim*2*4,i),this.qProj=e.createBuffer("q_proj_out",this.numHeads*this.headDim*4,i),this.qGate=e.createBuffer("q_gate",this.numHeads*this.headDim*4,i),this.kProj=e.createBuffer("k_proj_out",this.numKVHeads*this.headDim*4,i),this.vProj=e.createBuffer("v_proj_out",this.numKVHeads*this.headDim*4,i);const n=this.textCfg.linear_num_key_heads,r=this.textCfg.linear_key_head_dim,u=this.textCfg.linear_value_head_dim,o=this.textCfg.linear_num_value_heads??n,m=o/n*u,c=n*(r+r+m),f=o*u;this.linValueDim=f,this.linValueHeads=o,this.linQKV=e.createBuffer("lin_qkv",c*4,i),this.linZ=e.createBuffer("lin_z",f*4,i),this.linOut=e.createBuffer("lin_out",f*4,i);const 
p=Math.max(t,this.numHeads*this.headDim,f)*4;this.attnOut=e.createBuffer("attn_out",p,i),this._maxGqaSplits=64;const w=this.numHeads*this._maxGqaSplits*(this.headDim+2)*4;this._gqaPartials=e.createBuffer("gqa_partials",w,i),this.linState={},this.linConvHist={};for(let q=0;q<this.numLayers;q++)this.layerTypes[q]==="linear_attention"&&(this.linState[q]=e.createBuffer(`lin_state_${q}`,n*r*m*4,i),this.linConvHist[q]=e.createBuffer(`lin_conv_hist_${q}`,3*c*4,i));this.zeroBuf=e.createBuffer("zero_buf",t*4,i),this.useSplitK=!1,this.splitKSplits=1;const B=Math.max(t,s);this.splitKPartials=e.createBuffer("splitk_partials",B*this.splitKSplits*4,i),this.paramBufs={},this._gqaParamBuf=e.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"gqa_params"}),this._kvStoreParamBuf=e.device.createBuffer({size:16,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"kv_store_params"}),this._gqaData=new Uint8Array(32),this._gqaDv=new DataView(this._gqaData.buffer),this._kvData=new Uint8Array(16),this._kvDv=new DataView(this._kvData.buffer)}initVision(){const a=this.config.vision_config;if(!a){this.vision=null;return}const 
e=this.gpu,t=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,s=a.hidden_size,i=a.intermediate_size,n=a.out_hidden_size,r=a.depth,u=a.num_heads,o=s/u,h=a.patch_size,m=a.temporal_patch_size,c=a.spatial_merge_size,f=3*m*h*h,p=4096,w=s*c*c;this.vision={V:s,Vi:i,Vo:n,depth:r,heads:u,headDim:o,patchSize:h,temporalPatchSize:m,mergeSize:c,patchInputDim:f,maxVitTokens:p,mergedHidden:w,numPosEmbeddings:a.num_position_embeddings,numGridPerSide:Math.round(Math.sqrt(a.num_position_embeddings)),hidden:e.createBuffer("vit_hidden",p*s*4,t),hiddenB:e.createBuffer("vit_hidden_b",p*s*4,t),normed:e.createBuffer("vit_normed",p*s*4,t),q:e.createBuffer("vit_q",p*s*4,t),k:e.createBuffer("vit_k",p*s*4,t),v:e.createBuffer("vit_v",p*s*4,t),attnOut:e.createBuffer("vit_attn_out",p*s*4,t),mlpInter:e.createBuffer("vit_mlp_inter",p*i*4,t),mlpOut:e.createBuffer("vit_mlp_out",p*s*4,t),cos:e.createBuffer("vit_cos",p*o*4,t),sin:e.createBuffer("vit_sin",p*o*4,t),posEmbed:e.createBuffer("vit_pos_embed",p*s*4,t),merged:e.createBuffer("vit_merged",p/4*n*4,t),mergerNormed:e.createBuffer("vit_merger_normed",p*s*4,t),mergerInter:e.createBuffer("vit_merger_inter",p/4*w*4,t),patchInput:e.createBuffer("vit_patch_input",p*f*4,t),qkv:e.createBuffer("vit_qkv",p*3*s*4,t)},this.imageTokenId=this.config.image_token_id,this.visionStartTokenId=this.config.vision_start_token_id,this.visionEndTokenId=this.config.vision_end_token_id,console.log(`[VISION] Initialized: depth=${r}, hidden=${s}, heads=${u}, headDim=${o}, out=${n}`),console.log(`[VISION] Buffer allocation: ~${((p*s*4*12+p*i*4)/1024/1024).toFixed(0)} MB transient`)}async _readVisionPosEmbed(){const a=this.vision,e=this.weights["model.visual.pos_embed.weight"];if(!e)throw new Error("Vision pos_embed weight not found");const t=a.numPosEmbeddings,s=a.V,i=t*s*2,n=this.gpu.createReadbackBuffer("vit_pos_embed_readback",i),r=this.gpu.device.createCommandEncoder();r.copyBufferToBuffer(e,0,n,0,i),this.gpu.device.queue.submit([r.finish()]),await 
n.mapAsync(GPUMapMode.READ);const u=new Uint16Array(n.getMappedRange().slice(0));n.unmap(),n.destroy();const o=new Float32Array(t*s);for(let h=0;h<u.length;h++){const m=u[h]<<16,c=new ArrayBuffer(4);new Uint32Array(c)[0]=m,o[h]=new Float32Array(c)[0]}this._vitPosEmbedF32=o,console.log(`[VISION] Read pos_embed: ${t}×${s} (${(i/1024).toFixed(0)} KB)`)}_interpolatePosEmbed(a,e){const t=this.vision,s=t.V,i=t.numGridPerSide,n=this._vitPosEmbedF32,r=t.mergeSize,u=a*e,o=new Float32Array(u*s);for(let p=0;p<a;p++){const w=a===1?0:p*(i-1)/(a-1),B=Math.min(Math.floor(w),i-1),q=Math.min(B+1,i-1),_=w-B;for(let l=0;l<e;l++){const d=e===1?0:l*(i-1)/(e-1),y=Math.min(Math.floor(d),i-1),U=Math.min(y+1,i-1),g=d-y,P=B*i+y,S=B*i+U,b=q*i+y,v=q*i+U,C=(1-_)*(1-g),D=(1-_)*g,k=_*(1-g),z=_*g,M=p*e+l;for(let $=0;$<s;$++)o[M*s+$]=C*n[P*s+$]+D*n[S*s+$]+k*n[b*s+$]+z*n[v*s+$]}}const h=a/r,m=e/r,c=new Float32Array(u*s);let f=0;for(let p=0;p<h;p++)for(let w=0;w<m;w++)for(let B=0;B<r;B++)for(let q=0;q<r;q++){const _=p*r+B,l=w*r+q,d=_*e+l;c.set(o.subarray(d*s,d*s+s),f*s),f++}return c}_computeVisionRoPE(a,e){const t=this.vision,s=t.headDim/2,i=t.mergeSize,n=a/i,r=e/i,u=a*e,o=Math.max(a,e),h=new Float32Array(o*s);for(let p=0;p<o;p++)for(let w=0;w<s;w++){const B=1/Math.pow(1e4,2*w/t.headDim);h[p*s+w]=p*B}const m=new Float32Array(u*t.headDim),c=new Float32Array(u*t.headDim);let f=0;for(let p=0;p<n;p++)for(let w=0;w<r;w++)for(let B=0;B<i;B++)for(let q=0;q<i;q++){const _=p*i+B,l=w*i+q;for(let U=0;U<s;U++){const g=h[_*s+U];h[l*s+U];const P=f*t.headDim;U<s/2&&(m[P+U]=Math.cos(g),c[P+U]=Math.sin(g),m[P+s+U]=Math.cos(g),c[P+s+U]=Math.sin(g))}const d=f*t.headDim,y=s/2;for(let U=0;U<y;U++){const g=h[_*s+U],P=h[l*s+U];m[d+U]=Math.cos(g),m[d+y+U]=Math.cos(P),m[d+s+U]=Math.cos(g),m[d+s+y+U]=Math.cos(P),c[d+U]=Math.sin(g),c[d+y+U]=Math.sin(P),c[d+s+U]=Math.sin(g),c[d+s+y+U]=Math.sin(P)}f++}return{cos:m,sin:c}}async preprocessImage(a){const e=this.vision,t=e.patchSize,s=e.mergeSize,i=t*s,n=256*256,r=1280*1280,u=new 
Image;await new Promise((v,C)=>{u.onload=v,u.onerror=C,u.src=a});let{width:o,height:h}=u,m=1;o*h>r?m=Math.sqrt(r/(o*h)):o*h<n&&(m=Math.sqrt(n/(o*h)));let c=Math.round(o*m/i)*i,f=Math.round(h*m/i)*i;c=Math.max(i,c),f=Math.max(i,f);const w=new OffscreenCanvas(c,f).getContext("2d");w.drawImage(u,0,0,c,f);const q=w.getImageData(0,0,c,f).data,_=f/t,l=c/t,d=_*l,y=_/s,U=l/s,g=e.temporalPatchSize,P=3*g*t*t,S=new Float32Array(d*P);let b=0;for(let v=0;v<y;v++)for(let C=0;C<U;C++)for(let D=0;D<s;D++)for(let k=0;k<s;k++){const z=v*s+D,M=C*s+k,$=z*t,O=M*t,G=b*P;for(let R=0;R<g;R++)for(let F=0;F<3;F++)for(let T=0;T<t;T++)for(let x=0;x<t;x++){const V=(($+T)*c+(O+x))*4+F,j=q[V]/127.5-1,L=((F*g+R)*t+T)*t+x;S[G+L]=j}b++}return console.log(`[VISION] Preprocessed: ${o}x${h} → ${c}x${f}, ${d} patches (${_}x${l}), merge→${d/4} tokens`),{pixels:S,gridH:_,gridW:l,numPatches:d,imgW:c,imgH:f}}async visionForward(a){const e=this.vision,t=this.gpu,{pixels:s,gridH:i,gridW:n,numPatches:r}=a;this._vitPosEmbedF32||await this._readVisionPosEmbed(),t.device.queue.writeBuffer(e.patchInput,0,s);const u=this._interpolatePosEmbed(i,n);t.device.queue.writeBuffer(e.posEmbed,0,u);const{cos:o,sin:h}=this._computeVisionRoPE(i,n);t.device.queue.writeBuffer(e.cos,0,o),t.device.queue.writeBuffer(e.sin,0,h),t.beginBatch();const m=this.weights["model.visual.patch_embed.proj.weight"],c=this.weights["model.visual.patch_embed.proj.bias"],f=this.makeUniform("vit_patch_params",[e.patchInputDim,e.V]);t.dispatch(this.pipelines.vit_bf16_matvec_bias,[t.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[e.patchInput,m,c,e.hidden,f])],Math.ceil(e.V/32),r);const p=this.makeUniform("vit_add_len",[r*e.V]);t.dispatch(this.pipelines.vit_add,[t.createBindGroup(this.pipelines.vit_add,0,[e.hidden,e.posEmbed,p])],Math.ceil(r*e.V/256));for(let b=0;b<e.depth;b++)this._vitBlock(b,r);const 
w=this.weights["model.visual.merger.norm.weight"],B=this.weights["model.visual.merger.norm.bias"],q=this.makeUniform("vit_merger_ln_params",[e.V,1e-6]);t.dispatch(this.pipelines.vit_layernorm,[t.createBindGroup(this.pipelines.vit_layernorm,0,[e.hidden,w,B,e.mergerNormed,q])],r);const _=r/4,l=this.weights["model.visual.merger.linear_fc1.weight"],d=this.weights["model.visual.merger.linear_fc1.bias"],y=this.makeUniform("vit_merger_fc1_params",[e.mergedHidden,e.mergedHidden]);t.dispatch(this.pipelines.vit_bf16_matvec_bias,[t.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[e.mergerNormed,l,d,e.mergerInter,y])],Math.ceil(e.mergedHidden/32),_);const U=this.makeUniform("vit_gelu_len",[_*e.mergedHidden]);t.dispatch(this.pipelines.vit_gelu,[t.createBindGroup(this.pipelines.vit_gelu,0,[e.mergerInter,U])],Math.ceil(_*e.mergedHidden/256));const g=this.weights["model.visual.merger.linear_fc2.weight"],P=this.weights["model.visual.merger.linear_fc2.bias"],S=this.makeUniform("vit_merger_fc2_params",[e.mergedHidden,e.Vo]);return t.dispatch(this.pipelines.vit_bf16_matvec_bias,[t.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[e.mergerInter,g,P,e.merged,S])],Math.ceil(e.Vo/32),_),t.endBatch(),await t.device.queue.onSubmittedWorkDone(),console.log(`[VISION] Forward done: ${r} patches → ${_} merged tokens (dim=${e.Vo})`),{numMergedTokens:_,embedBuffer:e.merged}}_vitBlock(a,e){const t=this.vision,s=this.gpu,i=`model.visual.blocks.${a}`,n=this.weights[`${i}.norm1.weight`],r=this.weights[`${i}.norm1.bias`],u=this.makeUniform(`vit_ln1_${a}`,[t.V,1e-6]);s.dispatch(this.pipelines.vit_layernorm,[s.createBindGroup(this.pipelines.vit_layernorm,0,[t.hidden,n,r,t.normed,u])],e);const o=this.weights[`${i}.attn.qkv.weight`],h=this.weights[`${i}.attn.qkv.bias`],m=this.makeUniform(`vit_qkv_${a}`,[t.V,3*t.V]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.normed,o,h,t.qkv,m])],Math.ceil(3*t.V/32),e);const 
c=t.V,f=c*c*2,p=c*2,w=this.makeUniform(`vit_qkv_mv_${a}`,[c,c]);for(let $=0;$<3;$++){const O=[t.q,t.k,t.v][$],G=s.createBindGroupWithOffsets(this.pipelines.vit_bf16_matvec_bias,0,[t.normed,{buffer:o,offset:$*f,size:f},{buffer:h,offset:$*p,size:p},O,w]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[G],Math.ceil(c/32),e)}const B=this.makeUniform(`vit_rope_${a}`,[e,t.heads,t.headDim]);s.dispatch(this.pipelines.vit_rope,[s.createBindGroup(this.pipelines.vit_rope,0,[t.q,t.k,t.attnOut,t.mlpOut,t.cos,t.sin,B])],Math.ceil(e*t.heads*t.headDim/256));const q=1/Math.sqrt(t.headDim),_=this.makeUniform(`vit_attn_${a}`,[e,t.heads,t.headDim,q]);s.dispatch(this.pipelines.vit_attention,[s.createBindGroup(this.pipelines.vit_attention,0,[t.attnOut,t.mlpOut,t.v,t.q,_])],e,t.heads);const l=this.weights[`${i}.attn.proj.weight`],d=this.weights[`${i}.attn.proj.bias`],y=this.makeUniform(`vit_proj_${a}`,[c,c]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.q,l,d,t.mlpOut,y])],Math.ceil(c/32),e);const U=this.makeUniform(`vit_res1_${a}`,[e*c]);s.dispatch(this.pipelines.vit_add,[s.createBindGroup(this.pipelines.vit_add,0,[t.hidden,t.mlpOut,U])],Math.ceil(e*c/256));const g=this.weights[`${i}.norm2.weight`],P=this.weights[`${i}.norm2.bias`];s.dispatch(this.pipelines.vit_layernorm,[s.createBindGroup(this.pipelines.vit_layernorm,0,[t.hidden,g,P,t.normed,u])],e);const S=this.weights[`${i}.mlp.linear_fc1.weight`],b=this.weights[`${i}.mlp.linear_fc1.bias`],v=this.makeUniform(`vit_mlp_fc1_${a}`,[c,t.Vi]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.normed,S,b,t.mlpInter,v])],Math.ceil(t.Vi/32),e);const C=this.makeUniform(`vit_gelu_${a}`,[e*t.Vi]);s.dispatch(this.pipelines.vit_gelu_tanh,[s.createBindGroup(this.pipelines.vit_gelu_tanh,0,[t.mlpInter,C])],Math.ceil(e*t.Vi/256));const 
D=this.weights[`${i}.mlp.linear_fc2.weight`],k=this.weights[`${i}.mlp.linear_fc2.bias`],z=this.makeUniform(`vit_mlp_fc2_${a}`,[t.Vi,c]);s.dispatch(this.pipelines.vit_bf16_matvec_bias,[s.createBindGroup(this.pipelines.vit_bf16_matvec_bias,0,[t.mlpInter,D,k,t.mlpOut,z])],Math.ceil(c/32),e);const M=this.makeUniform(`vit_res2_${a}`,[e*c]);s.dispatch(this.pipelines.vit_add,[s.createBindGroup(this.pipelines.vit_add,0,[t.hidden,t.mlpOut,M])],Math.ceil(e*c/256))}computeMultimodalPositions(a,e,t,s){const i=this.vision.mergeSize,n=t/i,r=s/i,u=a.length,o=new Array(3);for(let f=0;f<3;f++)o[f]=new Int32Array(u);let h=0,m=0;for(let f=0;f<u;f++)if(a[f]===this.imageTokenId){const w=m,B=Math.floor(w/r),q=w%r;o[0][f]=h,o[1][f]=h+B,o[2][f]=h+q,m++,m===e&&(h+=Math.max(n,r))}else o[0][f]=h,o[1][f]=h,o[2][f]=h,h++;const c=h-u;return{positionIds3D:o,ropeDelta:c}}initB2Buffers(){const a=this.gpu,e=this.hiddenSize,t=this.intermediateSize,s=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.b2={},this.b2.hidden=a.createBuffer("b2_hidden_a",2*e*4,s),this.b2.hiddenB=a.createBuffer("b2_hidden_b",2*e*4,s),this.b2.normed=a.createBuffer("b2_normed",2*e*4,s),this.b2.mlpIntermediate=a.createBuffer("b2_mlp_inter",2*t*4,s),this.b2.mlpOut=a.createBuffer("b2_mlp_out",2*e*4,s),this.b2.logits=a.createBuffer("b2_logits",2*this.vocabSize*4,s),this.b2.zeroBuf=a.createBuffer("b2_zero_buf",2*e*4,s),this.b2.qProjFull=a.createBuffer("b2_q_proj_full",2*this.numHeads*this.headDim*2*4,s),this.b2.qProj=a.createBuffer("b2_q_proj_out",2*this.numHeads*this.headDim*4,s),this.b2.qGate=a.createBuffer("b2_q_gate",2*this.numHeads*this.headDim*4,s),this.b2.kProj=a.createBuffer("b2_k_proj_out",2*this.numKVHeads*this.headDim*4,s),this.b2.vProj=a.createBuffer("b2_v_proj_out",2*this.numKVHeads*this.headDim*4,s);const 
i=this.textCfg.linear_num_key_heads,n=this.textCfg.linear_key_head_dim,r=this.textCfg.linear_value_head_dim,h=this.linValueHeads/i*r,m=i*(n+n+h),c=this.linValueDim;this.b2.linQKV=a.createBuffer("b2_lin_qkv",2*m*4,s),this.b2.linZ=a.createBuffer("b2_lin_z",2*c*4,s),this.b2.linOut=a.createBuffer("b2_lin_out",2*c*4,s);const f=Math.max(e,this.numHeads*this.headDim,c)*4;if(this.b2.attnOut=a.createBuffer("b2_attn_out",2*f,s),this.abQuantized){const p=this.textCfg.linear_num_value_heads??i;this.b2.linAlpha=a.createBuffer("b2_lin_alpha",2*p*4,s),this.b2.linBeta=a.createBuffer("b2_lin_beta",2*p*4,s)}this.b2.topkResult0=a.createBuffer("b2_topk0",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this.b2.topkResult1=a.createBuffer("b2_topk1",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this.b2.topkReadback0=a.device.createBuffer({size:2048,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_topk_rb0"}),this.b2.topkReadback1=a.device.createBuffer({size:2048,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_topk_rb1"}),this.b2.argmaxResult0=a.createBuffer("b2_argmax0",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this.b2.argmaxResult1=a.createBuffer("b2_argmax1",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this.b2.argmaxReadback0=a.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_argmax_rb0"}),this.b2.argmaxReadback1=a.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_argmax_rb1"}),this.b2._gqaParamBuf0=a.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"b2_gqa_params0"}),this.b2._gqaParamBuf1=a.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"b2_gqa_params1"}),this.b2._dims={H:e,I:t,linQKVDim:m,valueDim:c,linHeads:i,linKeyDim:n,linValDim:r,linEVD:h,qProjSize:this.numHeads*this.headDim,qProjFullSize:this.numHeads*this.headDim*2
,kProjSize:this.numKVHeads*this.headDim,vProjSize:this.numKVHeads*this.headDim,attnOutElems:f/4},console.log(`[B2] Allocated batch=2 buffers (${((2*(e+e+e+t+e+this.vocabSize)*4+2*f)/1e6).toFixed(1)} MB activations)`)}runWithOffsets(a,e,t,s=1){const i=this.pipelines[a],n=this.gpu.createBindGroupWithOffsets(i,0,e);this.gpu.dispatch(i,[n],t,s)}makeUniform(a,e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),s=new DataView(t);for(let r=0;r<e.length;r++)Number.isInteger(e[r])?s.setUint32(r*4,e[r],!0):s.setFloat32(r*4,e[r],!0);const i=a+"_"+e.join("_");if(this.paramBufs[i])return this.paramBufs[i];const n=this.gpu.device.createBuffer({size:t.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a});return this.gpu.device.queue.writeBuffer(n,0,new Uint8Array(t)),this.paramBufs[i]=n,n}_makeMixedUniform(a,e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),s=new DataView(t);for(let n=0;n<e.length;n++){const r=e[n];r.u!==void 0?s.setUint32(n*4,r.u,!0):s.setFloat32(n*4,r.f,!0)}this._mixedUniformBufs||(this._mixedUniformBufs={});let i=this._mixedUniformBufs[a];return i||(i=this.gpu.device.createBuffer({size:t.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a}),this._mixedUniformBufs[a]=i),this.gpu.device.queue.writeBuffer(i,0,new Uint8Array(t)),i}makeUniformTyped(a,e,t){const s=e.length+t.length,i=new ArrayBuffer(Math.max(16,Math.ceil(s*4/16)*16)),n=new DataView(i);let r=0;for(const h of e)n.setUint32(r,h,!0),r+=4;for(const h of t)n.setFloat32(r,h,!0),r+=4;const u=a+"_"+[...e,...t].join("_");if(this.paramBufs[u])return this.paramBufs[u];const o=this.gpu.device.createBuffer({size:i.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a});return this.gpu.device.queue.writeBuffer(o,0,new Uint8Array(i)),this.paramBufs[u]=o,o}makeFusedNormMLPUniform(a){const e=`fused_anorm_mlp_${a}`;if(this.paramBufs[e])return this.paramBufs[e];const t=this.hiddenSize,s=this.intermediateSize,i=16+640*16,n=new 
ArrayBuffer(i),r=new DataView(n);r.setUint32(0,t,!0),r.setUint32(4,s,!0),r.setUint32(8,this.groupSize,!0),r.setFloat32(12,this.rmsEps,!0);const u=`model.language_model.layers.${a}.post_attention_layernorm.weight`,o=this._normWeightRaw?.[u];if(!o)throw new Error(`Norm weight not cached for layer ${a}`);for(let m=0;m<t/2;m++){const c=Math.floor(m/4),f=m%4;r.setUint32(16+c*16+f*4,o[m],!0)}const h=this.gpu.device.createBuffer({size:i,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:e});return this.gpu.device.queue.writeBuffer(h,0,new Uint8Array(n)),this.paramBufs[e]=h,h}fusedNormGptqOp(a,e,t,s,i,n,r,u){const o=this.getQWeight(s);if(!o.qweight)return null;const h=this.makeFusedNormGPTQUniform(i,r),m=u?"fused_norm_gptq":"fused_norm_gptq_noadd",c=u?"fnorm_gptq_":"fnorm_gptqna_",f=u?[a,e,o.qweight,o.scales,t,h]:[a,o.qweight,o.scales,t,h];return this.prepOpCached(`${c}${s}`,m,f,this.wg4(r))}makeFusedNormGPTQUniform(a,e){const t=`fused_norm_gptq_${a}_${e}`;if(this.paramBufs[t])return this.paramBufs[t];const s=this.hiddenSize,i=16+512*16,n=new ArrayBuffer(i),r=new DataView(n);r.setUint32(0,s,!0),r.setUint32(4,e,!0),r.setUint32(8,this.groupSize,!0),r.setFloat32(12,this.rmsEps,!0);const u=this._normWeightRaw?.[a];if(!u)throw new Error(`Norm weight not cached: ${a}`);for(let h=0;h<s/2;h++){const m=Math.floor(h/4),c=h%4;r.setUint32(16+m*16+c*4,u[h],!0)}const o=this.gpu.device.createBuffer({size:i,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:t});return this.gpu.device.queue.writeBuffer(o,0,new Uint8Array(n)),this.paramBufs[t]=o,o}run(a,e,t,s=1){const i=this.pipelines[a],n=this.gpu.createBindGroup(i,0,e);this.gpu.dispatch(i,[n],t,s)}runCached(a,e,t,s,i=1){const n=this.pipelines[e],r=this.gpu.getCachedBindGroup(a,n,0,t);this.gpu.dispatch(n,[r],s,i)}prepOp(a,e,t,s=1){const i=this.pipelines[a],n=this.gpu.createBindGroup(i,0,e);return{pipeline:i,bindGroups:[n],workgroupsX:t,workgroupsY:s}}prepOpCached(a,e,t,s,i=1){const 
n=this.pipelines[e],r=this.gpu.getCachedBindGroup(a,n,0,t);return{pipeline:n,bindGroups:[r],workgroupsX:s,workgroupsY:i}}wg(a){return Math.ceil(a/256)}wg4(a){return Math.ceil(a/32)}wg8(a){return Math.ceil(a/8)}getQWeight(a){return{qweight:this.weights[a+".qweight"],scales:this.weights[a+".scales"],qzeros:this.weights[a+".qzeros"]}}embedding(a){const e=this.weights["model.language_model.embed_tokens.weight"],t=this.makeUniform("emb_params",[a,this.hiddenSize]);this.run("embedding",[e,this.hidden,t],this.wg(this.hiddenSize))}embeddingFromVisionBuffer(a,e){const t=this.hiddenSize;this.gpu.copyBuffer(a,this.hidden,t*4,e*t*4,0)}embeddingFromArgmax(){const a=this.weights["model.language_model.embed_tokens.weight"];this._embedArgmaxParams||(this._embedArgmaxParams=this.makeUniform("emb_argmax_params",[this.hiddenSize])),this.runCached("embed_from_argmax","embed_from_argmax",[a,this.hidden,this.argmaxResult,this._embedArgmaxParams],this.wg(this.hiddenSize))}_dispatchLmHead(){if(this._lmHeadQWeight){const a=this.hiddenSize,e=this.vocabSize,s=a/this.groupSize%4===0,i=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]);if(s){const n=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached("lm_head",n,[this.normed,this._lmHeadQWeight,this._lmHeadScales,this.logits,i],this.wg8(e))}else{const n=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached("lm_head",n,[this.normed,this._lmHeadQWeight,this._lmHeadScales,this.logits,i],this.wg4(e))}}else{const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",t=this.weights[e],s=this.makeUniform("lmhead_params",[this.hiddenSize,this.vocabSize]);this.runCached("lm_head","bf16_matvec",[this.normed,t,this.logits,s],this.wg(this.vocabSize))}}rmsNorm(a,e,t,s){const i=this.makeUniform("rmsnorm_params",[this.hiddenSize,this.rmsEps]);s?this.runCached(s,"rmsnorm",[a,t,e,i],1):this.run("rmsnorm",[a,t,e,i],1)}gptqMatvec(a,e,t,s,i){const 
n=this.getQWeight(t);if(!n.qweight)return;const r=s/this.groupSize,u=r%4===0;if(this.useSplitK&&u){let o=this.splitKSplits;for(;o>1&&r%(o*4)!==0;)o>>=1;if(o>1){const h=this.makeUniform(`mv_sk_${s}_${i}_${o}`,[s,i,this.groupSize,o]);this.runCached(`sk4tf16_${t}`,"gptq_matvec_4t_f16_sk",[a,n.qweight,n.scales,this.splitKPartials,h],this.wg8(i),o);const m=this.makeUniform(`rsk_${i}_${o}`,[i,o]);this.runCached(`rsk_${t}`,"reduce_splitk",[this.splitKPartials,e,m],this.wg(i));return}}if(u){const o=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t",h=this.gpu.hasF16?"gptq4tf16_":"gptq4t_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);this.runCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg8(i))}else{const o=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec",h=this.gpu.hasF16?"gptqf16_":"gptq_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);this.runCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg4(i))}}gptqMatvecOp(a,e,t,s,i){const n=this.getQWeight(t);if(!n.qweight)return null;if(s/this.groupSize%4===0){const o=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t",h=this.gpu.hasF16?"gptq4tf16_":"gptq4t_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);return this.prepOpCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg8(i))}else{const o=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec",h=this.gpu.hasF16?"gptqf16_":"gptq_",m=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);return this.prepOpCached(`${h}${t}`,o,[a,n.qweight,n.scales,e,m],this.wg4(i))}}bf16Matvec(a,e,t,s,i,n){const r=this.makeUniform(`bf16mv_${s}_${i}`,[s,i]);n?this.runCached(n,"bf16_matvec",[a,t,e,r],this.wg(i)):this.run("bf16_matvec",[a,t,e,r],this.wg(i))}siluMul(a,e,t,s,i){const n=this.makeUniform(`silu_${s}`,[s]);i?this.runCached(i,"silu_mul",[a,e,t,n],this.wg(s)):this.run("silu_mul",[a,e,t,n],this.wg(s))}addVectors(a,e,t,s){const 
i=this.makeUniform(`add_${t}`,[t]);s?this.runCached(s,"add",[a,e,i],this.wg(t)):this.run("add",[a,e,i],this.wg(t))}addAndRmsNorm(a,e,t,s,i){const n=this.makeUniform("add_rmsnorm_params",[this.hiddenSize,this.rmsEps]);i?this.runCached(i,"add_rmsnorm",[a,e,s,t,n],1):this.run("add_rmsnorm",[a,e,s,t,n],1)}mlp(a,e){const t=`model.language_model.layers.${e}.mlp`,s=this.hiddenSize,i=this.intermediateSize,n=this.getQWeight(`${t}.gate_proj`),r=this.getQWeight(`${t}.up_proj`),u=this.makeUniform(`fused_mv_${s}_${i}`,[s,i,this.groupSize]);if(s/this.groupSize%4===0){const m=this.gpu.hasF16?"fused_gate_up_silu_4t_f16":"fused_gate_up_silu_4t",c=this.gpu.hasF16?"fused_gus4t16_":"fused_gus4t_";this.runCached(`${c}${e}`,m,[a,n.qweight,n.scales,r.qweight,r.scales,this.mlpIntermediate,u],this.wg8(i))}else{const m=this.gpu.hasF16?"fused_gate_up_silu_f16":"fused_gate_up_silu",c=this.gpu.hasF16?"fused_gus16_":"fused_gus_";this.runCached(`${c}${e}`,m,[a,n.qweight,n.scales,r.qweight,r.scales,this.mlpIntermediate,u],this.wg4(i))}this.gptqMatvec(this.mlpIntermediate,this.mlpOut,`${t}.down_proj`,i,s)}fullAttentionFused(a,e,t,s,i,n,r,u){i=i||this.normed;const o=`model.language_model.layers.${t}.self_attn`,h=this.hiddenSize,m=this.headDim,c=this.numHeads,f=this.numKVHeads,p=c/f,w=t===0,B=`model.language_model.layers.${t}.input_layernorm.weight`,q=this.weights[B];if(w){const $=this.makeUniform("add_rmsnorm_params",[h,this.rmsEps]);this.runCached(`fa_norm_${t}`,"rmsnorm",[a,q,i,$],1)}else{const $=this.makeUniform("add_rmsnorm_params",[h,this.rmsEps]);this.runCached(`fa_norm_${t}`,"add_rmsnorm_ro",[a,e,q,i,$],1)}const _=this.gptqMatvecOp(i,this.qProjFull,`${o}.q_proj`,h,c*m*2),l=this.gptqMatvecOp(i,this.kProj,`${o}.k_proj`,h,f*m),d=this.gptqMatvecOp(i,this.vProj,`${o}.v_proj`,h,f*m);this.gpu.dispatchMulti([_,l,d].filter(Boolean));const 
y=this.kvCache[t],U=this._fusedSQKParams[t],g=u??s;this._gqaDv.setUint32(0,g,!0),this._gqaDv.setUint32(4,s,!0),this._gqaDv.setUint32(8,n??s,!0),this._gqaDv.setUint32(12,r??s,!0),this.gpu.device.queue.writeBuffer(U,16,this._gqaData,0,16),this.runCached(`fa_sqk_${t}`,this._splitQKNormShaderKey,[this.qProjFull,this.kProj,this.vProj,this.qProj,this.qGate,y.keys,y.values,U],c+f);const P=(u??s)+1,S=this._forceMinSplits||1,b=Math.max(S,Math.min(Math.max(1,Math.ceil(P/32)),this._maxGqaSplits)),v=b>1?this._gqaPartials:this.attnOut;if(this._gqaDv.setUint32(0,P,!0),this._gqaDv.setUint32(4,m,!0),this._gqaDv.setUint32(8,f,!0),this._gqaDv.setUint32(12,c,!0),this._gqaDv.setUint32(16,p,!0),this._gqaDv.setUint32(20,b,!0),this.gpu.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData),this.run("gqa_attention_head",[this.qProj,y.keys,y.values,v,this._gqaParamBuf],c,b),b>1){this._gqaReduceParams||(this._gqaReduceParams=this.gpu.device.createBuffer({size:16,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"gqa_reduce_params"}));const $=new Uint8Array(16),O=new DataView($.buffer);O.setUint32(0,m,!0),O.setUint32(4,b,!0),O.setUint32(8,c,!0),this.gpu.device.queue.writeBuffer(this._gqaReduceParams,0,$),this.run("gqa_reduce",[this._gqaPartials,this.attnOut,this._gqaReduceParams],c)}const C=this.getQWeight(`${o}.o_proj`),D=c*m,z=D/this.groupSize%4===0,M=this.makeUniform(`fused_sig_mv_${D}_${h}`,[D,h,this.groupSize]);if(z){const $=this.gpu.hasF16?"fused_sigmoid_gptq_4t_f16":"fused_sigmoid_gptq_4t",O=this.gpu.hasF16?"fused_sig4t16_oproj_":"fused_sig4t_oproj_";this.runCached(`${O}${t}`,$,[this.attnOut,this.qGate,C.qweight,C.scales,this.qProj,M],this.wg8(h))}else{const $=this.gpu.hasF16?"fused_sigmoid_gptq_f16":"fused_sigmoid_gptq",O=this.gpu.hasF16?"fused_sig16_oproj_":"fused_sig_oproj_";this.runCached(`${O}${t}`,$,[this.attnOut,this.qGate,C.qweight,C.scales,this.qProj,M],this.wg4(h))}}linearAttentionFused(a,e,t,s){s=s||this.normed;const 
i=`model.language_model.layers.${t}.linear_attn`,n=this.hiddenSize,r=this.textCfg.linear_num_key_heads,u=this.textCfg.linear_key_head_dim,o=this.textCfg.linear_value_head_dim,h=this.linValueHeads,c=h/r*o,f=r*(u+u+c),p=this.linValueDim,w=t===0,B=`model.language_model.layers.${t}.input_layernorm.weight`,q=this.weights[B];if(w){const b=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.runCached(`la_norm_${t}`,"rmsnorm",[a,q,s,b],1)}else{const b=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.runCached(`la_norm_${t}`,"add_rmsnorm_ro",[a,e,q,s,b],1)}{const b=[this.gptqMatvecOp(s,this.linQKV,`${i}.in_proj_qkv`,n,f),this.gptqMatvecOp(s,this.linZ,`${i}.in_proj_z`,n,p)];this.abQuantized&&(b.push(this.gptqMatvecOp(s,this.linAlpha,`${i}.in_proj_a`,n,h)),b.push(this.gptqMatvecOp(s,this.linBeta,`${i}.in_proj_b`,n,h))),this.gpu.dispatchMulti(b.filter(Boolean))}const _=this.weights[`${i}.conv1d.weight`],l=this.weights[`${i}.A_log`],d=this.weights[`${i}.dt_bias`],y=this.weights[`${i}.norm.weight`];if(this.abQuantized){const b=`fused_cdn_q_${r}_${u}_${o}_${f}_${h}`;let v=this.paramBufs[b];if(!v){const C=new ArrayBuffer(32),D=new DataView(C);D.setUint32(0,r,!0),D.setUint32(4,u,!0),D.setUint32(8,o,!0),D.setUint32(12,f,!0),D.setFloat32(16,this.rmsEps,!0),D.setUint32(20,0,!0),D.setUint32(24,h,!0),v=this.gpu.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"fused_cdn_q_params"}),this.gpu.device.queue.writeBuffer(v,0,new Uint8Array(C)),this.paramBufs[b]=v}this.runCached(`fused_cdn_${t}`,"fused_conv_deltanet_norm",[this.linQKV,this.linConvHist[t],_,this.linState[t],this.linOut,this.linAlpha,this.linBeta,l,d,y,v],r)}else{const b=`fused_cdn_ext_${r}_${u}_${o}_${f}_${n}_${h}`;let v=this.paramBufs[b];if(!v){const D=new ArrayBuffer(32),k=new 
DataView(D);k.setUint32(0,r,!0),k.setUint32(4,u,!0),k.setUint32(8,o,!0),k.setUint32(12,f,!0),k.setFloat32(16,this.rmsEps,!0),k.setUint32(20,n,!0),k.setUint32(24,h,!0),v=this.gpu.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"fused_cdn_ext_params"}),this.gpu.device.queue.writeBuffer(v,0,new Uint8Array(D)),this.paramBufs[b]=v}const C=this.linABWeight[t];this.runCached(`fused_cdn_${t}`,"fused_conv_deltanet_norm",[this.linQKV,this.linConvHist[t],_,this.linState[t],this.linOut,s,C,l,d,y,v],r)}const U=this.getQWeight(`${i}.out_proj`),P=p/this.groupSize%4===0,S=this.makeUniform(`fused_silu_mv_${p}_${n}`,[p,n,this.groupSize]);if(P){const b=this.gpu.hasF16?"fused_silu_gptq_4t_f16":"fused_silu_gptq_4t",v=this.gpu.hasF16?"fused_silu4t16_oproj_":"fused_silu4t_oproj_";this.runCached(`${v}${t}`,b,[this.linZ,this.linOut,U.qweight,U.scales,this.attnOut,S],this.wg8(n))}else{const b=this.gpu.hasF16?"fused_silu_gptq_f16":"fused_silu_gptq",v=this.gpu.hasF16?"fused_silu16_oproj_":"fused_silu_oproj_";this.runCached(`${v}${t}`,b,[this.linZ,this.linOut,U.qweight,U.scales,this.attnOut,S],this.wg4(n))}}fusedNormMLP(a,e,t,s,i,n){n=n||this.normed;const r=this.hiddenSize,u=`model.language_model.layers.${i}.post_attention_layernorm.weight`,o=this.weights[u],h=this.makeUniform("add_rmsnorm_params",[r,this.rmsEps]),m=`mlp_norm_${i}_${a===this.hidden?"a":"b"}`;this.runCached(m,"three_way_add_rmsnorm",[a,t,s,o,e,n,h],1),this.mlp(n,i)}decoderLayer(a,e,t,s,i,n,r,u){let o;this.layerTypes[a]==="full_attention"?(this.fullAttentionFused(t,i,a,e,void 0,n,r,u),o=this.qProj):(this.linearAttentionFused(t,i,a),o=this.attnOut),this.fusedNormMLP(t,s,i,o,a)}async forward(a,e){this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);let t=this.hidden,s=this.hiddenB,i=this.zeroBuf;for(let o=0;o<this.numLayers;o++){this.decoderLayer(o,e,t,s,i),i=this.mlpOut;const h=t;t=s,s=h}this.mtp&&this._mtpSaveHidden(t,this.mlpOut);const 
n=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(t,this.mlpOut,this.normed,n,"add_final_norm"),this._dispatchLmHead();const r=this.temperature??.7;if(r>0){const o=this.repetitionPenalty??1,h=this.presencePenalty??0;if(this._recentTokenCount>0&&(o>1||h>0)){this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const p=this._makeMixedUniform("penalty_params",[{u:this._recentTokenCount},{f:o},{f:h},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,p],Math.ceil(this._recentTokenCount/256))}const m=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,m],1);const c=Math.random()*4294967295>>>0,f=this._makeMixedUniform("sample_params",[{f:r},{u:this.topK??20},{f:this.topP??.8},{u:c}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,f],1)}else{const o=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,o],1)}return this.gpu._passCount!==void 0&&!this._passCountLogged&&(console.log(`[PERF] ${this.gpu._passCount} compute passes per token`),this._passCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch(),this._readAndSample()}forwardSubmit(a,e){this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);let t=this.hidden,s=this.hiddenB,i=this.zeroBuf;for(let o=0;o<this.numLayers;o++){this.decoderLayer(o,e,t,s,i),i=this.mlpOut;const h=t;t=s,s=h}this.mtp&&this._mtpSaveHidden(t,this.mlpOut);const n=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(t,this.mlpOut,this.normed,n,"add_final_norm"),this._dispatchLmHead();const r=this.temperature??.7;if(r>0){const 
o=this.repetitionPenalty??1,h=this.presencePenalty??0;if(this._recentTokenCount>0&&(o>1||h>0)){this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const p=this._makeMixedUniform("penalty_params",[{u:this._recentTokenCount},{f:o},{f:h},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,p],Math.ceil(this._recentTokenCount/256))}const m=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,m],1);const c=Math.random()*4294967295>>>0,f=this._makeMixedUniform("sample_params",[{f:r},{u:this.topK??20},{f:this.topP??.8},{u:c}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,f],1)}else{const o=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,o],1)}this.gpu._passCount!==void 0&&!this._passCountLogged&&(console.log(`[PERF] ${this.gpu._passCount} compute passes per token`),this._passCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch()}_buildReplayFromRecording(){const a=this.gpu.stopRecording(),e=this.pipelines.gqa_attention_head,t=[];for(const s of a)if(s.multi)for(const i of s.ops)t.push({p:i.pipeline,bg:i.bindGroup,x:i.wgX,y:i.wgY,gqa:i.pipeline===e?1:0});else t.push({p:s.pipeline,bg:s.bindGroup,x:s.wgX,y:s.wgY,gqa:s.pipeline===e?1:0});this._replayFlat=t,console.log(`[PERF] Recorded ${t.length} flat replay ops for core forward`)}_replayCoreForward(a){const e=this.gpu,t=a-this._ropeDelta,s=t+1,i=Math.min(Math.max(1,Math.ceil(s/32)),this._maxGqaSplits);this._gqaDv.setUint32(0,t,!0),this._gqaDv.setUint32(4,a,!0),this._gqaDv.setUint32(8,a,!0),this._gqaDv.setUint32(12,a,!0);for(let 
o=0;o<this.numLayers;o++)this.layerTypes[o]==="full_attention"&&e.device.queue.writeBuffer(this._fusedSQKParams[o],16,this._gqaData,0,16);this._gqaDv.setUint32(0,s,!0),this._gqaDv.setUint32(4,this.headDim,!0),this._gqaDv.setUint32(8,this.numKVHeads,!0),this._gqaDv.setUint32(12,this.numHeads,!0),this._gqaDv.setUint32(16,this.numHeads/this.numKVHeads,!0),this._gqaDv.setUint32(20,i,!0),e.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData,0,24),this._gqaReduceParams&&(this._gqaDv.setUint32(0,this.headDim,!0),this._gqaDv.setUint32(4,i,!0),this._gqaDv.setUint32(8,this.numHeads,!0),this._gqaDv.setUint32(12,0,!0),e.device.queue.writeBuffer(this._gqaReduceParams,0,this._gqaData,0,16));const n=e._singlePass,r=this._replayFlat,u=r.length;for(let o=0;o<u;o++){const h=r[o];n.setPipeline(h.p),n.setBindGroup(0,h.bg),n.dispatchWorkgroups(h.x,h.gqa?i:h.y)}}embeddingB2(a,e){const t=this.hiddenSize,s=this.weights["model.language_model.embed_tokens.weight"],i=this.makeUniform("emb_params",[a,t]);this.runWithOffsets("embedding",[s,{buffer:this.b2.hidden,offset:0,size:t*4},i],this.wg(t));const n=this.makeUniform(`emb_params_${e}`,[e,t]);this.runWithOffsets("embedding",[s,{buffer:this.b2.hidden,offset:t*4,size:t*4},n],this.wg(t))}gptqMatvecB2(a,e,t,s,i){const n=this.getQWeight(t);if(!n.qweight)return;const r=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]),u=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2",o=this.gpu.hasF16?"b2gptqf16_":"b2gptq_";this.runCached(`${o}${t}`,u,[a,n.qweight,n.scales,e,r],this.wg4(i))}gptqMatvecB2Op(a,e,t,s,i){const n=this.getQWeight(t);if(!n.qweight)return null;const r=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]),u=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2",o=this.gpu.hasF16?"b2gptqf16_":"b2gptq_";return this.prepOpCached(`${o}${t}`,u,[a,n.qweight,n.scales,e,r],this.wg4(i))}fullAttentionB2(a,e,t,s){const 
/* fullAttentionB2, continued: resolve this layer's weight-name prefix, the model dimensions and the input layernorm weight. */i=`model.language_model.layers.${t}.self_attn`,n=this.hiddenSize,r=this.b2._dims,u=this.headDim,o=this.numHeads,h=this.numKVHeads,m=o/h,c=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,p=this.weights[f],w=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);/* Layer 0 (c) runs plain "rmsnorm" once per token on the two hiddenSize*4-byte halves of `a`; later layers use one "add_rmsnorm_ro_b2" dispatch with 2 workgroups that also takes the residual `e`. */c?(this.runWithOffsets("rmsnorm",[{buffer:a,offset:0,size:n*4},p,{buffer:this.b2.normed,offset:0,size:n*4},w],1),this.runWithOffsets("rmsnorm",[{buffer:a,offset:n*4,size:n*4},p,{buffer:this.b2.normed,offset:n*4,size:n*4},w],1)):this.run("add_rmsnorm_ro_b2",[a,e,p,this.b2.normed,w],2);/* Q, K and V projections from b2.normed, prepared as ops and submitted together; ops for missing weights are null and filtered out. */const B=this.gptqMatvecB2Op(this.b2.normed,this.b2.qProjFull,`${i}.q_proj`,n,o*u*2),q=this.gptqMatvecB2Op(this.b2.normed,this.b2.kProj,`${i}.k_proj`,n,h*u),_=this.gptqMatvecB2Op(this.b2.normed,this.b2.vProj,`${i}.v_proj`,n,h*u);this.gpu.dispatchMulti([B,q,_].filter(Boolean));/* y, U, g, P are the per-token byte sizes of the b2 projection buffers; they double as the byte offset of the second token's slice. */const l=this.kvCache[t],d=this._fusedSQKParams[t],y=r.qProjFullSize*4,U=r.kProjSize*4,g=r.vProjSize*4,P=r.qProjSize*4;/* Token 0: write position `s` into bytes 16..31 of this layer's split-QK-norm params buffer `d`, then dispatch on the first slices. */this._gqaDv.setUint32(0,s,!0),this._gqaDv.setUint32(4,s,!0),this._gqaDv.setUint32(8,s,!0),this._gqaDv.setUint32(12,s,!0),this.gpu.device.queue.writeBuffer(d,16,this._gqaData,0,16),this.runWithOffsets(this._splitQKNormShaderKey,[{buffer:this.b2.qProjFull,offset:0,size:y},{buffer:this.b2.kProj,offset:0,size:U},{buffer:this.b2.vProj,offset:0,size:g},{buffer:this.b2.qProj,offset:0,size:P},{buffer:this.b2.qGate,offset:0,size:P},l.keys,l.values,d],o+h),/* Token 1: the same buffer `d` is rewritten with s+1 before the second dispatch. NOTE(review): queue.writeBuffer takes effect in queue order at call time; if runWithOffsets only encodes into the batch opened by beginBatch() (gpu wrapper not visible here), both dispatches would read the s+1 values. The gqa_attention_head stage that follows uses separate b2._gqaParamBuf0 / b2._gqaParamBuf1 buffers per token; confirm whether this stage needs the same treatment. */this._gqaDv.setUint32(0,s+1,!0),this._gqaDv.setUint32(4,s+1,!0),this._gqaDv.setUint32(8,s+1,!0),this._gqaDv.setUint32(12,s+1,!0),this.gpu.device.queue.writeBuffer(d,16,this._gqaData,0,16),this.runWithOffsets(this._splitQKNormShaderKey,[{buffer:this.b2.qProjFull,offset:y,size:y},{buffer:this.b2.kProj,offset:U,size:U},{buffer:this.b2.vProj,offset:g,size:g},{buffer:this.b2.qProj,offset:P,size:P},{buffer:this.b2.qGate,offset:P,size:P},l.keys,l.values,d],o+h);const 
S=s+1,b=s+2;this._gqaDv.setUint32(0,S,!0),this._gqaDv.setUint32(4,u,!0),this._gqaDv.setUint32(8,h,!0),this._gqaDv.setUint32(12,o,!0),this._gqaDv.setUint32(16,m,!0),this._gqaDv.setUint32(20,this.partialDim,!0),this._gqaDv.setUint32(24,s,!0),this._gqaDv.setFloat32(28,this.ropeTheta,!0),this.gpu.device.queue.writeBuffer(this.b2._gqaParamBuf0,0,this._gqaData),this.runWithOffsets("gqa_attention_head",[{buffer:this.b2.qProj,offset:0,size:P},l.keys,l.values,{buffer:this.b2.attnOut,offset:0,size:r.qProjSize*4},this.b2._gqaParamBuf0],o),this._gqaDv.setUint32(0,b,!0),this._gqaDv.setUint32(24,s+1,!0),this.gpu.device.queue.writeBuffer(this.b2._gqaParamBuf1,0,this._gqaData),this.runWithOffsets("gqa_attention_head",[{buffer:this.b2.qProj,offset:P,size:P},l.keys,l.values,{buffer:this.b2.attnOut,offset:r.qProjSize*4,size:r.qProjSize*4},this.b2._gqaParamBuf1],o);const v=this.getQWeight(`${i}.o_proj`),C=o*u,D=this.makeUniform(`fused_sig_mv_${C}_${n}`,[C,n,this.groupSize]);this.runCached(`b2_fused_sig_oproj_${t}`,"fused_sigmoid_gptq_b2_f16",[this.b2.attnOut,this.b2.qGate,v.qweight,v.scales,this.b2.qProj,D],this.wg4(n))}linearAttentionB2(a,e,t){const s=`model.language_model.layers.${t}.linear_attn`,i=this.hiddenSize,n=this.b2._dims,r=n.linHeads,u=n.linKeyDim,o=n.linValDim;n.linEVD;const h=n.linQKVDim,m=n.valueDim,c=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,p=this.weights[f],w=this.makeUniform("add_rmsnorm_params",[i,this.rmsEps]);c?(this.runWithOffsets("rmsnorm",[{buffer:a,offset:0,size:i*4},p,{buffer:this.b2.normed,offset:0,size:i*4},w],1),this.runWithOffsets("rmsnorm",[{buffer:a,offset:i*4,size:i*4},p,{buffer:this.b2.normed,offset:i*4,size:i*4},w],1)):this.run("add_rmsnorm_ro_b2",[a,e,p,this.b2.normed,w],2);{const 
P=[this.gptqMatvecB2Op(this.b2.normed,this.b2.linQKV,`${s}.in_proj_qkv`,i,h),this.gptqMatvecB2Op(this.b2.normed,this.b2.linZ,`${s}.in_proj_z`,i,m)];this.abQuantized&&(P.push(this.gptqMatvecB2Op(this.b2.normed,this.b2.linAlpha,`${s}.in_proj_a`,i,this.linValueHeads)),P.push(this.gptqMatvecB2Op(this.b2.normed,this.b2.linBeta,`${s}.in_proj_b`,i,this.linValueHeads))),this.gpu.dispatchMulti(P.filter(Boolean))}const B=this.weights[`${s}.conv1d.weight`],q=this.weights[`${s}.A_log`],_=this.weights[`${s}.dt_bias`],l=this.weights[`${s}.norm.weight`],d=h*4,y=m*4;if(this.abQuantized){const P=this.linValueHeads,S=P*4,b=`fused_cdn_q_${r}_${u}_${o}_${h}_${P}`,v=this.paramBufs[b];this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:0,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:0,size:y},{buffer:this.b2.linAlpha,offset:0,size:S},{buffer:this.b2.linBeta,offset:0,size:S},q,_,l,v],r),this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:d,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:y,size:y},{buffer:this.b2.linAlpha,offset:S,size:S},{buffer:this.b2.linBeta,offset:S,size:S},q,_,l,v],r)}else{const P=`fused_cdn_ext_${r}_${u}_${o}_${h}_${i}_${this.linValueHeads}`,S=this.paramBufs[P],b=this.linABWeight[t];this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:0,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:0,size:y},{buffer:this.b2.normed,offset:0,size:i*4},b,q,_,l,S],r),this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:d,size:d},this.linConvHist[t],B,this.linState[t],{buffer:this.b2.linOut,offset:y,size:y},{buffer:this.b2.normed,offset:i*4,size:i*4},b,q,_,l,S],r)}const 
U=this.getQWeight(`${s}.out_proj`),g=this.makeUniform(`fused_silu_mv_${m}_${i}`,[m,i,this.groupSize]);this.runCached(`b2_fused_silu_oproj_${t}`,"fused_silu_gptq_b2_f16",[this.b2.linZ,this.b2.linOut,U.qweight,U.scales,this.b2.attnOut,g],this.wg4(i))}fusedNormMLPB2(a,e,t,s,i){const n=this.hiddenSize,r=this.intermediateSize,u=`model.language_model.layers.${i}.mlp`,o=`model.language_model.layers.${i}.post_attention_layernorm.weight`,h=this.weights[o],m=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.run("three_way_add_rmsnorm_b2",[a,t,s,h,e,this.b2.normed,m],2);const c=this.getQWeight(`${u}.gate_proj`),f=this.getQWeight(`${u}.up_proj`),p=this.makeUniform(`fused_mv_${n}_${r}`,[n,r,this.groupSize]);this.runCached(`b2_fused_gus_${i}`,"fused_gate_up_silu_b2_f16",[this.b2.normed,c.qweight,c.scales,f.qweight,f.scales,this.b2.mlpIntermediate,p],this.wg4(r)),this.gptqMatvecB2(this.b2.mlpIntermediate,this.b2.mlpOut,`${u}.down_proj`,r,n)}decoderLayerB2(a,e,t,s,i){let n;this.layerTypes[a]==="full_attention"?(this.fullAttentionB2(t,i,a,e),n=this.b2.qProj):(this.linearAttentionB2(t,i,a),n=this.b2.attnOut),this.fusedNormMLPB2(t,s,i,n,a)}_dispatchLmHeadB2(){if(this._lmHeadQWeight){const a=this.hiddenSize,e=this.vocabSize,t=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]),s=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2";this.runCached("b2_lm_head",s,[this.b2.normed,this._lmHeadQWeight,this._lmHeadScales,this.b2.logits,t],this.wg4(e))}else{const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",t=this.weights[e],s=this.hiddenSize,i=this.vocabSize,n=this.makeUniform("lmhead_params",[s,i]);this.runWithOffsets("bf16_matvec",[{buffer:this.b2.normed,offset:0,size:s*4},t,{buffer:this.b2.logits,offset:0,size:i*4},n],this.wg(i)),this.runWithOffsets("bf16_matvec",[{buffer:this.b2.normed,offset:s*4,size:s*4},t,{buffer:this.b2.logits,offset:i*4,size:i*4},n],this.wg(i))}}async 
forwardB2(a,e,t){this.b2||this.initB2Buffers(),this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embeddingB2(a,e);let s=this.b2.hidden,i=this.b2.hiddenB,n=this.b2.zeroBuf;for(let f=0;f<this.numLayers;f++){this.decoderLayerB2(f,t,s,i,n),n=this.b2.mlpOut;const p=s;s=i,i=p}if(this.mtp){const f=this.hiddenSize;this.gpu.copyBuffer(s,this.mtp.savedHidden,f*4,f*4,0),this.gpu.copyBuffer(this.b2.mlpOut,this.mtp.savedMlpOut,f*4,f*4,0)}const r=this.weights["model.language_model.norm.weight"],u=this.makeUniform("add_rmsnorm_params",[this.hiddenSize,this.rmsEps]);this.run("add_rmsnorm_b2",[s,this.b2.mlpOut,r,this.b2.normed,u],2),this._dispatchLmHeadB2();const h=(this.temperature??.7)>0,m=this.vocabSize,c=this.makeUniform("argmax_params",[m]);return h?(this.runWithOffsets("topk_extract",[{buffer:this.b2.logits,offset:0,size:m*4},this.b2.topkResult0,c],1),this.runWithOffsets("topk_extract",[{buffer:this.b2.logits,offset:m*4,size:m*4},this.b2.topkResult1,c],1)):(this.runWithOffsets("argmax",[{buffer:this.b2.logits,offset:0,size:m*4},this.b2.argmaxResult0,c],1),this.runWithOffsets("argmax",[{buffer:this.b2.logits,offset:m*4,size:m*4},this.b2.argmaxResult1,c],1)),this.gpu._passCount!==void 0&&!this._b2PassCountLogged&&(console.log(`[PERF] B2: ${this.gpu._passCount} compute passes per 2-token step`),this._b2PassCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),h?(this.gpu.copyBuffer(this.b2.topkResult0,this.b2.topkReadback0,2048),this.gpu.copyBuffer(this.b2.topkResult1,this.b2.topkReadback1,2048)):(this.gpu.copyBuffer(this.b2.argmaxResult0,this.b2.argmaxReadback0,8),this.gpu.copyBuffer(this.b2.argmaxResult1,this.b2.argmaxReadback1,8)),this.gpu.endBatch(),this._readAndSampleB2(h)}async _readAndSampleB2(a){const e=[0,0];a?(await 
Promise.all([this.b2.topkReadback0.mapAsync(GPUMapMode.READ),this.b2.topkReadback1.mapAsync(GPUMapMode.READ)]),e[0]=this._sampleFromTopkReadback(this.b2.topkReadback0),e[1]=this._sampleFromTopkReadback(this.b2.topkReadback1)):(await Promise.all([this.b2.argmaxReadback0.mapAsync(GPUMapMode.READ),this.b2.argmaxReadback1.mapAsync(GPUMapMode.READ)]),e[0]=new Uint32Array(this.b2.argmaxReadback0.getMappedRange().slice(0))[0],e[1]=new Uint32Array(this.b2.argmaxReadback1.getMappedRange().slice(0))[0],this.b2.argmaxReadback0.unmap(),this.b2.argmaxReadback1.unmap());for(const t of e)this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=t:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=t);return e}_sampleFromTopkReadback(a){const e=a.getMappedRange(),t=new Uint32Array(e),s=new Float32Array(e),i=new Uint32Array(256),n=new Float32Array(256);for(let o=0;o<256;o++)i[o]=t[o*2],n[o]=s[o*2+1];a.unmap();const r=this.presencePenalty??0,u=this.repetitionPenalty??1;if((r>0||u>1)&&this._recentTokenCount>0){const o=new Set;for(let h=0;h<this._recentTokenCount;h++)o.add(this._recentTokens[h]);for(let h=0;h<256;h++)o.has(i[h])&&(r>0&&(n[h]-=r),u>1&&(n[h]=n[h]>0?n[h]/u:n[h]*u))}return this._sampleFromArrays(i,n,256)}async _readAndSample(){await this._argmaxReadback.mapAsync(GPUMapMode.READ);const a=new Uint32Array(this._argmaxReadback.getMappedRange().slice(0));this._argmaxReadback.unmap();const e=a[0];return this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=e:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=e),e}_sampleFromArrays(a,e,t){const s=this.temperature??.7,i=this.topP??.8,n=this.topK??20,r=Math.min(n,t),u=this._selBuf||(this._selBuf=new Uint32Array(64)),o=this._selValBuf||(this._selValBuf=new Float32Array(64)),h=this._usedBuf||(this._usedBuf=new Uint8Array(256));h.fill(0);for(let l=0;l<r;l++){let d=-1,y=-1/0;for(let 
U=0;U<t;U++)!h[U]&&e[U]>y&&(y=e[U],d=U);if(d<0)break;u[l]=a[d],o[l]=y,h[d]=1}const m=o[0],c=this._probBuf||(this._probBuf=new Float32Array(64));let f=0;for(let l=0;l<r;l++)c[l]=Math.exp((o[l]-m)/s),f+=c[l];for(let l=0;l<r;l++)c[l]/=f;let p=0,w=r;for(let l=0;l<r;l++)if(p+=c[l],p>=i){w=l+1;break}let B=0;for(let l=0;l<w;l++)B+=c[l];const q=Math.random()*B;let _=0;for(let l=0;l<w;l++)if(_+=c[l],_>=q)return u[l];return u[0]}_sample(a){const e=this.temperature??.7,t=this.topP??.8,s=this.topK??20,i=a.length;if(e<=0){let _=0,l=a[0];for(let d=1;d<i;d++)a[d]>l&&(l=a[d],_=d);return _}const n=Math.max(s,64),r=new Int32Array(n).fill(-1),u=new Float32Array(n).fill(-1/0);let o=-1/0;for(let _=0;_<i;_++){const l=a[_];if(l>o&&(o=l),l>u[n-1]){let d=n-1;for(;d>0&&l>u[d-1];)u[d]=u[d-1],r[d]=r[d-1],d--;u[d]=l,r[d]=_}}const h=Math.min(s,n),m=new Float32Array(h);let c=0;for(let _=0;_<h&&!(r[_]<0);_++)m[_]=Math.exp((u[_]-o)/e),c+=m[_];for(let _=0;_<h;_++)m[_]/=c;let f=0,p=h;for(let _=0;_<h;_++){if(r[_]<0){p=_;break}if(f+=m[_],f>=t){p=_+1;break}}let w=0;for(let _=0;_<p;_++)w+=m[_];const B=Math.random()*w;let q=0;for(let _=0;_<p;_++)if(q+=m[_],q>=B)return r[_];return r[0]}async generate(a,e=512,t,s){this.seqLen=0,this._recentTokenCount=0,this._replayFlat=null,this._ropeDelta=s?.ropeDelta??0;const i=this.maxSeqLen||4096;if(e=Math.min(e,i-a.length),e<=0)throw new Error(`Prompt (${a.length} tokens) exceeds context size (${i})`);const n=[...a],r=this.textCfg.linear_num_key_heads,u=this.textCfg.linear_key_head_dim,o=this.textCfg.linear_value_head_dim,c=this.linValueHeads/r*o,f=r*(u+u+c);for(let P=0;P<this.numLayers;P++)if(this.layerTypes[P]==="linear_attention"){const S=r*u*c*4,b=3*f*4;this.gpu.device.queue.writeBuffer(this.linState[P],0,new Uint8Array(S)),this.gpu.device.queue.writeBuffer(this.linConvHist[P],0,new Uint8Array(b))}let p=null;if(s){let P=0;const S=s.imageTokenId,b=s.positionIds3D;for(let v=0;v<a.length;v++){const 
C=b[0][v],D=b[1][v],k=b[2][v];this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),a[v]===S?this.embeddingFromVisionBuffer(s.embedBuffer,P++):this.embedding(a[v]);let z=this.hidden,M=this.hiddenB,$=this.zeroBuf;for(let O=0;O<this.numLayers;O++){this.decoderLayer(O,C,z,M,$,D,k,v),$=this.mlpOut;const G=z;z=M,M=G}if(v===a.length-1){const O=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(z,this.mlpOut,this.normed,O,"add_final_norm"),this._dispatchLmHead();const G=this.temperature??.7;if(G>0){const R=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,R],1);const F=Math.random()*4294967295>>>0,T=this._makeMixedUniform("sample_params",[{f:G},{u:this.topK??20},{f:this.topP??.8},{u:F}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,T],1)}else{const R=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,R],1)}this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8)}this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.endBatch(),this.seqLen=v+1}p=await this._readAndSample()}else for(let P=0;P<a.length;P++)p=await this.forward(a[P],P),this.seqLen=P+1;n.push(p);const w=this.config.eos_token_id??this.textCfg.eos_token_id,B=Array.isArray(w)?w:w!=null?[w]:[248044,248046];if(t?.(p,0)||B.includes(p))return n;this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=p:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=p),this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const 
_=8;this._tokenHistoryBuf||(this._tokenHistoryBuf=this.gpu.createBuffer("token_history",_*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this._tokenHistoryReadback=this.gpu.device.createBuffer({size:_*4,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"token_history_readback"}));let l=0,d=0,y=1,U=p,g=!1;for(;y<e;){const P=performance.now(),S=Math.min(_,e-y);for(let k=0;k<S;k++){const z=this.seqLen+k+this._ropeDelta;if(this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),k===0?this.embedding(U):this.embeddingFromArgmax(),g)this._replayCoreForward(z);else{let G=this.hidden,R=this.hiddenB,F=this.zeroBuf;k===0&&!this._replayFlat&&(this._forceMinSplits=2,this.gpu.startRecording());for(let x=0;x<this.numLayers;x++){this.decoderLayer(x,z,G,R,F),F=this.mlpOut;const V=G;G=R,R=V}const T=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(G,this.mlpOut,this.normed,T,"add_final_norm"),this._dispatchLmHead(),k===0&&this.gpu._recording&&(this._forceMinSplits=0,this._buildReplayFromRecording())}const M=this.temperature??.7;if(M>0){const G=this.repetitionPenalty??1,R=this.presencePenalty??0,F=this._recentTokenCount+k;if(F>0&&(G>1||R>0)){const j=this._makeMixedUniform("penalty_params",[{u:Math.min(F,this._repMaxTokens)},{f:G},{f:R},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,j],Math.ceil(Math.min(F,this._repMaxTokens)/256))}const T=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,T],1);const x=Math.random()*4294967295>>>0,V=this._makeMixedUniform("sample_params",[{f:M},{u:this.topK??20},{f:this.topP??.8},{u:x}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,V],1)}else{const G=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,G],1)}const 
$=(this._recentTokenCount+k)%this._repMaxTokens,O=this.makeUniform(`append_${k}`,[$,k]);this.run("append_token",[this.argmaxResult,this._gpuRecentTokens,this._tokenHistoryBuf,O],1),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.endBatch()}!g&&this._replayFlat&&(g=!0);const b=this.gpu.device.createCommandEncoder();b.copyBufferToBuffer(this._tokenHistoryBuf,0,this._tokenHistoryReadback,0,S*4),this.gpu.device.queue.submit([b.finish()]),await this._tokenHistoryReadback.mapAsync(GPUMapMode.READ);const v=new Uint32Array(this._tokenHistoryReadback.getMappedRange().slice(0,S*4));this._tokenHistoryReadback.unmap();const C=performance.now();l+=C-P,d+=S;let D=!1;for(let k=0;k<S;k++){const z=v[k];n.push(z),this.seqLen++,this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=z:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=z);const M=t?.(z,y);if(y++,M||B.includes(z)){D=!0;break}}if(d%50<_&&console.log(`[T @${d}] ${(l/d).toFixed(1)}ms/tok (batch=${_})`),D)break;U=v[S-1],this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4)}return d>0&&console.log(`[T final @${d}] ${(l/d).toFixed(1)}ms/tok (batch=${_})`),n}async _quantizeBF16Weight(a,e,t,s){const i=this.groupSize,n=e/8,r=e/i,u=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,o=this.gpu.createBuffer(`${s}_qweight`,n*t*4,u),h=this.gpu.createBuffer(`${s}_scales_f32`,r*t*4,u),m=Math.ceil(r*t/2)*4,c=this.gpu.createBuffer(`${s}_scales`,m,u),f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await A(async()=>{const{SHADERS:v}=await import("./gpu-ops-DKsrMEcC.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.quantize_bf16_to_int4),p=this.gpu.createBufferFromData(`${s}_qparams`,new 
Uint32Array([e,t,i]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),w=this.gpu.createBindGroup(f,0,[a,o,h,p]),B=65535,q=Math.min(t,B),_=Math.ceil(t/B),l=this.gpu.device.createCommandEncoder(),d=l.beginComputePass();d.setPipeline(f),d.setBindGroup(0,w),d.dispatchWorkgroups(q,_),d.end(),this.gpu.device.queue.submit([l.finish()]);const y=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await A(async()=>{const{SHADERS:v}=await import("./gpu-ops-DKsrMEcC.js").then(C=>C.b);return{SHADERS:v}},[],import.meta.url)).SHADERS.pack_f32_to_f16_pairs),U=Math.ceil(r*t/2),g=this.gpu.createBufferFromData(`${s}_pparams`,new Uint32Array([U]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),P=this.gpu.createBindGroup(y,0,[h,c,g]),S=this.gpu.device.createCommandEncoder(),b=S.beginComputePass();return b.setPipeline(y),b.setBindGroup(0,P),b.dispatchWorkgroups(Math.ceil(U/256)),b.end(),this.gpu.device.queue.submit([S.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),h.destroy(),p.destroy(),g.destroy(),{qweight:o,scales:c}}async initMTP(a){if(this.mtp)return;const e=this.hiddenSize,t=this.intermediateSize,s=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,i=performance.now(),n="Qwen/Qwen3.5-2B";console.log(`[MTP] Downloading MTP weights from ${n}...`);const{loadMTPWeights:r}=await A(async()=>{const{loadMTPWeights:B}=await import("./safetensors-loader-CwGm5mJX.js");return{loadMTPWeights:B}},[],import.meta.url),u=await r(n,a);console.log(`[MTP] Downloaded ${Object.keys(u).length} MTP tensors`),this.mtp={weights:{},qweights:{}};const o={};for(const[B,q]of Object.entries(u)){const _=this.gpu.createBufferFromData(`mtp_${B}`,q.data);o[B]=_,this.mtp.weights[B]=_}const 
h=[{name:"mtp.fc.weight",K:e*2,N:e},{name:"mtp.layers.0.self_attn.q_proj.weight",K:e,N:this.numHeads*this.headDim*2},{name:"mtp.layers.0.self_attn.k_proj.weight",K:e,N:this.numKVHeads*this.headDim},{name:"mtp.layers.0.self_attn.v_proj.weight",K:e,N:this.numKVHeads*this.headDim},{name:"mtp.layers.0.self_attn.o_proj.weight",K:e,N:e},{name:"mtp.layers.0.mlp.gate_proj.weight",K:e,N:t},{name:"mtp.layers.0.mlp.up_proj.weight",K:e,N:t},{name:"mtp.layers.0.mlp.down_proj.weight",K:t,N:e}];for(const{name:B,K:q,N:_}of h){const{qweight:l,scales:d}=await this._quantizeBF16Weight(o[B],q,_,`mtp_${B}`);this.mtp.qweights[B]={qweight:l,scales:d},o[B].destroy(),delete this.mtp.weights[B]}this.mtp.normRaw={};const m=["mtp.pre_fc_norm_hidden.weight","mtp.pre_fc_norm_embedding.weight","mtp.layers.0.input_layernorm.weight","mtp.layers.0.post_attention_layernorm.weight","mtp.norm.weight","mtp.layers.0.self_attn.q_norm.weight","mtp.layers.0.self_attn.k_norm.weight"];for(const B of m){const q=u[B];q&&(this.mtp.normRaw[B]=new Uint32Array(q.data.buffer.slice(q.data.byteOffset,q.data.byteOffset+q.data.byteLength)))}const f=4096*this.numKVHeads*this.headDim*4;this.mtp.kvCache={keys:this.gpu.createBuffer("mtp_kv_keys",f,s),values:this.gpu.createBuffer("mtp_kv_values",f,s)},this.mtp.concatBuf=this.gpu.createBuffer("mtp_concat",e*2*4,s),this.mtp.fcOut=this.gpu.createBuffer("mtp_fc_out",e*4,s),this.mtp.savedHidden=this.gpu.createBuffer("mtp_saved_hidden",e*4,s),this.mtp.savedMlpOut=this.gpu.createBuffer("mtp_saved_mlp_out",e*4,s),this.mtp.seqLen=0,this._buildMTPSplitQKNormUniform(),this._buildMTPFusedMLPUniform(),await this._buildTrimmedLmHead(8e3);const p=((performance.now()-i)/1e3).toFixed(1),w=h.length;console.log(`[MTP] Initialized in ${p}s: ${w} INT4 weights, 1 KV cache layer`)}_buildMTPSplitQKNormUniform(){const a=this.numHeads,e=this.numKVHeads,t=this.headDim,s=a*t,i=e*t,n=(s+i)/2,u=32+Math.ceil(n/4)*16,o=new ArrayBuffer(u),h=new 
DataView(o);h.setUint32(0,a,!0),h.setUint32(4,e,!0),h.setUint32(8,t,!0),h.setFloat32(12,this.rmsEps,!0),h.setUint32(16,0,!0),h.setUint32(20,0,!0),h.setUint32(24,0,!0),h.setUint32(28,0,!0);const m=this.mtp.normRaw["mtp.layers.0.self_attn.q_norm.weight"],c=this.mtp.normRaw["mtp.layers.0.self_attn.k_norm.weight"];if(m)for(let p=0;p<s/2;p++){const w=Math.floor(p/4),B=p%4;h.setUint32(32+w*16+B*4,m[p],!0)}if(c){const p=s/2;for(let w=0;w<i/2;w++){const B=p+w,q=Math.floor(B/4),_=B%4;h.setUint32(32+q*16+_*4,c[w],!0)}}const f=this.gpu.device.createBuffer({size:u,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"mtp_fused_sqk"});this.gpu.device.queue.writeBuffer(f,0,new Uint8Array(o)),this.mtp.fusedSQKParams=f}_buildMTPFusedMLPUniform(){}async _buildTrimmedLmHead(a=8e3){const e=this.hiddenSize,t=this.gpu,s=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,i=new Uint32Array(a);for(let y=0;y<a;y++)i[y]=y;this.mtp.trimmedToFull=i,this.mtp.trimmedVocabSize=a;const n=t.createBufferFromData("mtp_trim_indices",i),u=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",o=this.weights[u],h=e/2,m=a*h*4,c=t.createBuffer("mtp_trim_gathered",m,s),f=(await A(async()=>{const{SHADERS:y}=await import("./gpu-ops-DKsrMEcC.js").then(U=>U.b);return{SHADERS:y}},[],import.meta.url)).SHADERS.gather_rows_bf16,p=t.getOrCreatePipeline("gather_rows_bf16",f),w=t.createBufferFromData("mtp_trim_gparams",new Uint32Array([h,a]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),B=t.createBindGroup(p,0,[o,n,c,w]),q=t.device.createCommandEncoder(),_=q.beginComputePass();_.setPipeline(p),_.setBindGroup(0,B),_.dispatchWorkgroups(Math.ceil(h/256),a),_.end(),t.device.queue.submit([q.finish()]);const{qweight:l,scales:d}=await 
this._quantizeBF16Weight(c,e,a,"mtp_trim_lmhead");this.mtp.trimmedLmHead={qweight:l,scales:d},this.mtp.trimmedLogits=t.createBuffer("mtp_trimmed_logits",a*4,s),c.destroy(),n.destroy(),w.destroy(),console.log(`[MTP] Trimmed lm_head: ${a} tokens, ${(l.size/1024/1024).toFixed(1)}MB qw + ${(d.size/1024/1024).toFixed(1)}MB sc`)}_dispatchTrimmedLmHead(){const a=this.hiddenSize,e=this.mtp.trimmedVocabSize,t=this.mtp.trimmedLmHead,i=a/this.groupSize%4===0,n=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]);if(i){const r=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached("mtp_trim_lmhead",r,[this.normed,t.qweight,t.scales,this.mtp.trimmedLogits,n],this.wg8(e))}else{const r=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached("mtp_trim_lmhead",r,[this.normed,t.qweight,t.scales,this.mtp.trimmedLogits,n],this.wg4(e))}}_mtpGetQWeight(a){return this.mtp.qweights[a]}_mtpGptqMatvec(a,e,t,s,i){const n=this._mtpGetQWeight(t);if(!n)throw new Error(`MTP weight not found: ${t}`);const u=s/this.groupSize%4===0,o=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);if(u){const h=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg8(i))}else{const h=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg4(i))}}_mtpSaveHidden(a,e){this.mtp&&(this.gpu.copyBuffer(a,this.mtp.savedHidden,this.hiddenSize*4),this.gpu.copyBuffer(e,this.mtp.savedMlpOut,this.hiddenSize*4))}async mtpForward(a){if(!this.mtp)throw new Error("MTP not initialized. 
Call initMTP() first.");return this.mtpForwardSubmit(a),await this._readAndSample()}mtpForwardSubmit(a){const e=this.hiddenSize,t=this.intermediateSize,s=this.mtp.seqLen;this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);const i=this.makeUniform("add_rmsnorm_params",[e,this.rmsEps]);this.runCached("mtp_emb_norm","rmsnorm",[this.hidden,this.mtp.weights["mtp.pre_fc_norm_embedding.weight"],this.normed,i],1),this.gpu.copyBuffer(this.normed,this.mtp.concatBuf,e*4),this.runCached("mtp_hid_norm","add_rmsnorm_ro",[this.mtp.savedHidden,this.mtp.savedMlpOut,this.mtp.weights["mtp.pre_fc_norm_hidden.weight"],this.normed,i],1),this.gpu.copyBuffer(this.normed,this.mtp.concatBuf,e*4,0,e*4),this._mtpGptqMatvec(this.mtp.concatBuf,this.mtp.fcOut,"mtp.fc.weight",e*2,e),this.runCached("mtp_input_norm","rmsnorm",[this.mtp.fcOut,this.mtp.weights["mtp.layers.0.input_layernorm.weight"],this.normed,i],1);const n="mtp.layers.0.self_attn",r=this._mtpGptqMatvecOp(this.normed,this.qProjFull,`${n}.q_proj.weight`,e,this.numHeads*this.headDim*2),u=this._mtpGptqMatvecOp(this.normed,this.kProj,`${n}.k_proj.weight`,e,this.numKVHeads*this.headDim),o=this._mtpGptqMatvecOp(this.normed,this.vProj,`${n}.v_proj.weight`,e,this.numKVHeads*this.headDim);this.gpu.dispatchMulti([r,u,o].filter(Boolean)),this._gqaDv.setUint32(0,s,!0),this._gqaDv.setUint32(4,s,!0),this._gqaDv.setUint32(8,s,!0),this._gqaDv.setUint32(12,s,!0),this.gpu.device.queue.writeBuffer(this.mtp.fusedSQKParams,16,this._gqaData,0,16),this.runCached("mtp_sqk",this._splitQKNormShaderKey,[this.qProjFull,this.kProj,this.vProj,this.qProj,this.qGate,this.mtp.kvCache.keys,this.mtp.kvCache.values,this.mtp.fusedSQKParams],this.numHeads+this.numKVHeads);const 
h=s+1;this._gqaDv.setUint32(0,h,!0),this._gqaDv.setUint32(4,this.headDim,!0),this._gqaDv.setUint32(8,this.numKVHeads,!0),this._gqaDv.setUint32(12,this.numHeads,!0),this._gqaDv.setUint32(16,this.numHeads/this.numKVHeads,!0),this._gqaDv.setUint32(20,this.partialDim,!0),this._gqaDv.setUint32(24,s,!0),this._gqaDv.setFloat32(28,this.ropeTheta,!0),this.gpu.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData),this.runCached("mtp_gqa","gqa_attention_head",[this.qProj,this.mtp.kvCache.keys,this.mtp.kvCache.values,this.attnOut,this._gqaParamBuf],this.numHeads);const m=this._mtpGetQWeight(`${n}.o_proj.weight`),c=this.numHeads*this.headDim,f=this.makeUniform(`fused_sig_mv_${c}_${e}`,[c,e,this.groupSize]);c/this.groupSize%4===0&&this.gpu.hasF16?this.runCached("mtp_sig_oproj","fused_sigmoid_gptq_4t_f16",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg8(e)):this.gpu.hasF16?this.runCached("mtp_sig_oproj","fused_sigmoid_gptq_f16",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg4(e)):this.runCached("mtp_sig_oproj","fused_sigmoid_gptq",[this.attnOut,this.qGate,m.qweight,m.scales,this.qProj,f],this.wg4(e)),this.runCached("mtp_post_norm","add_rmsnorm_ro",[this.mtp.fcOut,this.qProj,this.mtp.weights["mtp.layers.0.post_attention_layernorm.weight"],this.normed,i],1);const B=this.gpu.hasF16?"fused_gate_up_silu_f16":"fused_gate_up_silu",q=this.makeUniform(`fused_mv_${e}_${t}`,[e,t,this.groupSize]),_=this.mtp.qweights["mtp.layers.0.mlp.gate_proj.weight"],l=this.mtp.qweights["mtp.layers.0.mlp.up_proj.weight"];this.runCached("mtp_gate_up_silu",B,[this.normed,_.qweight,_.scales,l.qweight,l.scales,this.mlpIntermediate,q],this.wg4(t)),this._mtpGptqMatvec(this.mlpIntermediate,this.mlpOut,"mtp.layers.0.mlp.down_proj.weight",t,e),this.runCached("mtp_final_norm","three_way_add_rmsnorm",[this.mtp.fcOut,this.qProj,this.mlpOut,this.mtp.weights["mtp.norm.weight"],this.hidden,this.normed,i],1),this._dispatchLmHead();const 
d=this.makeUniform("mtp_argmax_params",[this.vocabSize]);this.runCached("mtp_argmax","argmax",[this.logits,this.argmaxResult,d],1),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch(),this.mtp.seqLen++}_mtpSnapshotDeltaNet(){if(!this.mtp._deltaNetSnapshot){const e=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.mtp._deltaNetSnapshot={states:{},convs:{}};for(let t=0;t<this.numLayers;t++)this.layerTypes[t]==="linear_attention"&&(this.mtp._deltaNetSnapshot.states[t]=this.gpu.createBuffer(`mtp_snap_state_${t}`,this.linState[t].size,e),this.mtp._deltaNetSnapshot.convs[t]=this.gpu.createBuffer(`mtp_snap_conv_${t}`,this.linConvHist[t].size,e))}const a=this.mtp._deltaNetSnapshot;a.mtpSeqLen=this.mtp.seqLen;for(const e of Object.keys(a.states))this.gpu.copyBuffer(this.linState[e],a.states[e],this.linState[e].size),this.gpu.copyBuffer(this.linConvHist[e],a.convs[e],this.linConvHist[e].size)}_mtpRestoreDeltaNet(){const a=this.mtp._deltaNetSnapshot;if(a){a.mtpSeqLen!==void 0&&(this.mtp.seqLen=a.mtpSeqLen);for(const e of Object.keys(a.states))this.gpu.copyBuffer(a.states[e],this.linState[e],this.linState[e].size),this.gpu.copyBuffer(a.convs[e],this.linConvHist[e],this.linConvHist[e].size)}}async generateWithMTP(a,e=512,t){if(!this.mtp)throw new Error("MTP not initialized. 
Call initMTP() first.");this.seqLen=0,this.mtp.seqLen=0,this._recentTokenCount=0;const s=[...a],i=this.textCfg.linear_num_key_heads,n=this.textCfg.linear_key_head_dim,r=this.textCfg.linear_value_head_dim,o=this.linValueHeads/i*r,h=i*(n+n+o);for(let g=0;g<this.numLayers;g++)if(this.layerTypes[g]==="linear_attention"){const P=i*n*o*4,S=3*h*4;this.gpu.device.queue.writeBuffer(this.linState[g],0,new Uint8Array(P)),this.gpu.device.queue.writeBuffer(this.linConvHist[g],0,new Uint8Array(S))}let m=null;for(let g=0;g<a.length;g++)m=await this.forward(a[g],g),this.seqLen=g+1;s.push(m);const c=this.config.eos_token_id??this.textCfg.eos_token_id,f=Array.isArray(c)?c:c!=null?[c]:[248044,248046];if(t?.(m,0)||f.includes(m))return s;let w=1,B=0,q=0,_=m,l=0,d=0;for(;w<e;){const g=performance.now(),P=await this.mtpForward(_);this._mtpSnapshotDeltaNet();const S=this.seqLen,b=await this.forwardB2(_,P,this.seqLen);this.seqLen+=2;const v=b[0],C=b[1];if(v===P){B++,s.push(P),w++;let k=t?.(P,w-1);if(k||f.includes(P)||(s.push(C),w++,k=t?.(C,w-1),k||f.includes(C)))break;_=C}else{q++,this._mtpRestoreDeltaNet(),this.seqLen=S;const k=await this.forward(_,this.seqLen);if(this.seqLen++,s.push(k),w++,t?.(k,w-1)||f.includes(k))break;_=k}const D=performance.now();if(l+=D-g,d++,d%25===0){const k=B/(B+q)*100,z=w/d;console.log(`[MTP @${d}] ${(l/d).toFixed(1)}ms/step, ${(w/(l/1e3)).toFixed(0)} tok/s, accept=${k.toFixed(0)}%, ${z.toFixed(1)} tok/step`)}}const y=B/Math.max(1,B+q)*100;return console.log(`[MTP final] ${(w/((l||1)/1e3)).toFixed(0)} tok/s, accept=${y.toFixed(0)}% (${B}/${B+q}), ${w} tokens`),s}_mtpGptqMatvecOp(a,e,t,s,i){const n=this._mtpGetQWeight(t);if(!n)return null;const u=s/this.groupSize%4===0,o=this.makeUniform(`mv_${s}_${i}`,[s,i,this.groupSize]);if(u){const h=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";return this.prepOpCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg8(i))}else{const h=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";return 
this.prepOpCached(`mtp_${t}`,h,[a,n.qweight,n.scales,e,o],this.wg4(i))}}}export{N as Qwen35Model};
|
assets/{test-BEFPr_G8.js → test-DMd_5haN.js}
RENAMED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
const __vite__mapDeps=(i,m=__vite__mapDeps,d=(m.f||(m.f=["./qwen35-model-
|
| 2 |
-
import{G as nt,S as ct,_ as et}from"./gpu-ops-PQDFq1iI.js";class ot{constructor(){this.gpu=null,this.results=[],this.pipelines={}}async init(){this.gpu=new nt,await this.gpu.init();for(const[e,t]of Object.entries(ct))this.pipelines[e]=this.gpu.getOrCreatePipeline(e,t);return this}async runAll(e){const t=[["GPTQ MatVec",()=>this.testGPTQMatvec()],["GPTQ MatVec (group boundary)",()=>this.testGPTQMatvecGroups()],["GPTQ MatVec (gs=32)",()=>this.testGPTQMatvecGS32()],["GPTQ Split-K",()=>this.testGPTQSplitK()],["GPTQ F16",()=>this.testGPTQF16()],["BF16 MatVec",()=>this.testBF16Matvec()],["RMSNorm",()=>this.testRMSNorm()],["RMSNorm (1+w formula)",()=>this.testRMSNorm1PlusW()],["SiLU * Mul",()=>this.testSiLUMul()],["Vector Add",()=>this.testAdd()],["Embedding (BF16)",()=>this.testEmbedding()],["Argmax",()=>this.testArgmax()],["Argmax (large)",()=>this.testArgmaxLarge()],["Split interleaved",()=>this.testSplit()],["Sigmoid Mul",()=>this.testSigmoidMul()],["Head RMSNorm (1+w)",()=>this.testHeadRMSNorm()],["Head RMSNorm (nogated, w)",()=>this.testHeadRMSNormNogated()],["Causal Conv1d",()=>this.testCausalConv1d()],["KV Cache Store",()=>this.testKVCacheStore()],["GQA Attention (single pos)",()=>this.testGQAAttention()],["DeltaNet Recurrent",()=>this.testDeltaNetRecurrent()],["Add + RMSNorm (fused)",()=>this.testAddRMSNorm()],["Three-way Add + RMSNorm",()=>this.testThreeWayAddRMSNorm()],["GQA RoPE rotate_half",()=>this.testGQARoPERotateHalf()],["Fused sigmoid + GPTQ",()=>this.testFusedSigmoidGPTQ()],["Fused SiLU + GPTQ",()=>this.testFusedSiluGPTQ()],["Fused AddNorm+GateUp+SiLU",()=>this.testFusedAddNormGateUpSiLU()],["Fused Split+QKNorm+KVStore",()=>this.testFusedSplitQKNormKVStore()],["Add+RMSNorm B=2",()=>this.testAddRMSNormB2()],["Add+RMSNorm RO B=2",()=>this.testAddRMSNormROB2()],["Three-way Add+RMSNorm B=2",()=>this.testThreeWayAddRMSNormB2()],["Fused SiLU+GPTQ B=2",()=>this.testFusedSiluGPTQB2()],["Fused Sigmoid+GPTQ B=2",()=>this.testFusedSigmoidGPTQB2()],["Fused 
GateUp+SiLU B=2",()=>this.testFusedGateUpSiLUB2()],["GPTQ Matmul B=2",()=>this.testGPTQMatmulB2()],["GPTQ Matmul B=2 F16",()=>this.testGPTQMatmulB2F16()],["GPTQ Matmul B=2 4T F16",()=>this.testGPTQMatmulB2_4T_F16()]];for(const[o,s]of t)try{const r=await s();this.results.push({name:o,...r}),e?.({name:o,...r})}catch(r){this.results.push({name:o,pass:!1,error:r.message,maxErr:NaN}),e?.({name:o,pass:!1,error:r.message,maxErr:NaN})}return this.results}destroy(){this.gpu?.destroy()}makeBuffer(e,t,o=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC){return this.gpu.createBufferFromData(e,new Uint8Array(t.buffer,t.byteOffset,t.byteLength),o)}makeF32Buffer(e,t){return this.makeBuffer(e,new Float32Array(t))}makeU32Buffer(e,t){return this.makeBuffer(e,new Uint32Array(t))}makeOutputBuffer(e,t){return this.gpu.createBuffer(e,t*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC)}makeUniform(e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),o=new DataView(t);for(let s=0;s<e.length;s++)typeof e[s]=="number"&&!Number.isInteger(e[s])?o.setFloat32(s*4,e[s],!0):o.setUint32(s*4,e[s],!0);return this.gpu.createBufferFromData("uniform",new Uint8Array(t),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}makeUniformTyped(e,t){const o=e.length+t.length,s=new ArrayBuffer(Math.max(16,Math.ceil(o*4/16)*16)),r=new DataView(s);let a=0;for(const d of e)r.setUint32(a,d,!0),a+=4;for(const d of t)r.setFloat32(a,d,!0),a+=4;return this.gpu.createBufferFromData("uniform_typed",new Uint8Array(s),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}dispatch(e,t,o,s=1){const r=this.pipelines[e],a=this.gpu.createBindGroup(r,0,t);this.gpu.dispatch(r,[a],o,s)}async readback(e,t){return this.gpu.readBuffer(e,t*4)}async readbackU32(e,t){const o=this.gpu.createReadbackBuffer("_rb",t*4),s=this.gpu.device.createCommandEncoder();s.copyBufferToBuffer(e,0,o,0,t*4),this.gpu.device.queue.submit([s.finish()]),await o.mapAsync(GPUMapMode.READ);const r=new 
Uint32Array(o.getMappedRange().slice(0));return o.unmap(),o.destroy(),r}compare(e,t,o=1e-4){let s=0;const r=[];for(let a=0;a<t.length;a++){const d=Math.abs(e[a]-t[a]);s=Math.max(s,d),d>o&&r.push({idx:a,got:e[a],expected:t[a],err:d})}return{pass:r.length===0,maxErr:s,errors:r.slice(0,5)}}cpuF32ToBF16Packed(e){const t=Math.ceil(e.length/2),o=new Uint32Array(t);for(let s=0;s<e.length;s++){const a=new Uint32Array(new Float32Array([e[s]]).buffer)[0]>>>16,d=Math.floor(s/2);s%2===0?o[d]=a:o[d]|=a<<16}return o}cpuF32ToF16Packed(e){const t=Math.ceil(e.length/2),o=new Uint32Array(t);for(let s=0;s<e.length;s++){const r=this.f32ToF16(e[s]),a=Math.floor(s/2);s%2===0?o[a]=r:o[a]|=r<<16}return o}f32ToF16(e){const t=new ArrayBuffer(4);new Float32Array(t)[0]=e;const o=new Uint32Array(t)[0],s=o>>31&1;let r=o>>23&255,a=o&8388607;return r===0?s<<15:r===255?s<<15|31744|(a?512:0):(r=r-127+15,r>=31?s<<15|31744:r<=0?s<<15:s<<15|r<<10|a>>13)}f16ToF32(e){const t=e>>15&1,o=e>>10&31,s=e&1023;return o===0?s===0?t?-0:0:(t?-1:1)*Math.pow(2,-14)*(s/1024):o===31?s?NaN:t?-1/0:1/0:(t?-1:1)*Math.pow(2,o-15)*(1+s/1024)}bf16ToF32(e){const t=e<<16,o=new ArrayBuffer(4);return new Uint32Array(o)[0]=t,new Float32Array(o)[0]}async testGPTQMatvec(){const r=new Float32Array(64);for(let k=0;k<64;k++)r[k]=(k+1)*.1;const a=new Uint32Array(32),d=new Float32Array(256);for(let k=0;k<8;k++)for(let u=0;u<4;u++){let m=0;for(let g=0;g<8;g++){const i=k*8+g,c=(i+u)%15;d[i*4+u]=c-8,m|=c<<g*4}a[k*4+u]=m}const l=new Float32Array(4);for(let k=0;k<4;k++)l[k]=.5+k*.1;const w=this.cpuF32ToF16Packed(l),h=new Float32Array(4);for(let k=0;k<4;k++){let u=0;for(let m=0;m<64;m++){const i=Math.floor(m/64)*4+k,c=w[Math.floor(i/2)]>>i%2*16&65535,_=this.f16ToF32(c);u+=r[m]*d[m*4+k]*_}h[k]=u}const F=this.makeF32Buffer("gptq_input",r),n=this.makeU32Buffer("gptq_qw",a),B=this.makeU32Buffer("gptq_scales",w),f=this.makeOutputBuffer("gptq_out",4),p=this.makeUniform([64,4,64]);this.dispatch("gptq_matvec",[F,n,B,f,p],Math.ceil(4/32));const 
y=await this.readback(f,4);return this.compare(y,h,.01)}async testGPTQMatvecGroups(){const r=new Float32Array(128);for(let u=0;u<128;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(64),d=new Float32Array(512);for(let u=0;u<16;u++)for(let m=0;m<4;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c*3+m*7)%15;d[c*4+m]=_-8,g|=_<<i*4}a[u*4+m]=g}const l=128/64,w=new Float32Array(l*4);for(let u=0;u<l*4;u++)w[u]=.3+u*.15;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let u=0;u<4;u++){let m=0;for(let g=0;g<128;g++){const c=Math.floor(g/64)*4+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*4+u]*b}F[u]=m}const n=this.makeF32Buffer("gptq2_in",r),B=this.makeU32Buffer("gptq2_qw",a),f=this.makeU32Buffer("gptq2_sc",h),p=this.makeOutputBuffer("gptq2_out",4),y=this.makeUniform([128,4,64]);this.dispatch("gptq_matvec",[n,B,f,p,y],Math.ceil(4/32));const k=await this.readback(p,4);return this.compare(k,F,.01)}async testGPTQMatvecGS32(){const r=new Float32Array(128);for(let u=0;u<128;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(64),d=new Float32Array(512);for(let u=0;u<16;u++)for(let m=0;m<4;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c*3+m*7)%15;d[c*4+m]=_-8,g|=_<<i*4}a[u*4+m]=g}const l=128/32,w=new Float32Array(l*4);for(let u=0;u<l*4;u++)w[u]=.3+u*.15;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let u=0;u<4;u++){let m=0;for(let g=0;g<128;g++){const c=Math.floor(g/32)*4+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*4+u]*b}F[u]=m}const n=this.makeF32Buffer("gptq32_in",r),B=this.makeU32Buffer("gptq32_qw",a),f=this.makeU32Buffer("gptq32_sc",h),p=this.makeOutputBuffer("gptq32_out",4),y=this.makeUniform([128,4,32]);this.dispatch("gptq_matvec",[n,B,f,p,y],Math.ceil(4/32));const k=await this.readback(p,4);return this.compare(k,F,.01)}async testGPTQSplitK(){const a=new Float32Array(256);for(let i=0;i<256;i++)a[i]=Math.sin(i*.3)*.5;const d=new Uint32Array(1024),l=new Float32Array(256*32);for(let 
i=0;i<32;i++)for(let c=0;c<32;c++){let _=0;for(let b=0;b<8;b++){const N=i*8+b,x=(N*3+c*7)%15;l[N*32+c]=x-8,_|=x<<b*4}d[i*32+c]=_}const w=256/64,h=new Float32Array(w*32);for(let i=0;i<w;i++)for(let c=0;c<32;c++)h[i*32+c]=.3+(i*32+c)*.02;const F=this.cpuF32ToF16Packed(h),n=new Float32Array(32);for(let i=0;i<32;i++){let c=0;for(let _=0;_<256;_++){const N=Math.floor(_/64)*32+i,x=F[Math.floor(N/2)]>>N%2*16&65535,q=this.f16ToF32(x);c+=a[_]*l[_*32+i]*q}n[i]=c}const B=this.makeF32Buffer("sk_in",a),f=this.makeU32Buffer("sk_qw",d),p=this.makeU32Buffer("sk_sc",F),y=this.makeOutputBuffer("sk_partials",128),k=this.makeOutputBuffer("sk_out",32),u=this.makeUniform([256,32,64,4]),m=this.makeUniform([32,4]);this.dispatch("gptq_splitk",[B,f,p,y,u],Math.ceil(32/32),4),this.dispatch("reduce_splitk",[y,k,m],Math.ceil(32/256));const g=await this.readback(k,32);return this.compare(g,n,.01)}async testGPTQF16(){if(!this.gpu.hasF16||!this.pipelines.gptq_matvec_f16)return{pass:!0,maxErr:0,note:"shader-f16 not available, skipped"};const e=128,t=4,o=64,s=e/8,r=new Float32Array(e);for(let u=0;u<e;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(s*t),d=new Float32Array(e*t);for(let u=0;u<s;u++)for(let m=0;m<t;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c+m*3)%15;d[c*t+m]=_-8,g|=_<<i*4}a[u*t+m]=g}const l=e/o,w=new Float32Array(l*t);for(let u=0;u<l;u++)for(let m=0;m<t;m++)w[u*t+m]=.5+u*.1;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(t);for(let u=0;u<t;u++){let m=0;for(let g=0;g<e;g++){const c=Math.floor(g/o)*t+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*t+u]*b}F[u]=m}const n=this.makeF32Buffer("f16_in",r),B=this.makeU32Buffer("f16_qw",a),f=this.makeU32Buffer("f16_sc",h),p=this.makeOutputBuffer("f16_out",t),y=this.makeUniform([e,t,o]);this.dispatch("gptq_matvec_f16",[n,B,f,p,y],Math.ceil(t/32));const k=await this.readback(p,t);return this.compare(k,F,.5)}async testBF16Matvec(){const o=new Float32Array(8);for(let n=0;n<8;n++)o[n]=(n+1)*.2;const s=new 
Float32Array(32);for(let n=0;n<4;n++)for(let B=0;B<8;B++)s[n*8+B]=Math.cos(n*.5+B*.3);const r=this.cpuF32ToBF16Packed(s),a=new Float32Array(4);for(let n=0;n<4;n++){let B=0;for(let f=0;f<8;f++){const p=n*8+f,y=r[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);B+=o[f]*u}a[n]=B}const d=this.makeF32Buffer("bf16mv_in",o),l=this.makeU32Buffer("bf16mv_w",r),w=this.makeOutputBuffer("bf16mv_out",4),h=this.makeUniform([8,4]);this.dispatch("bf16_matvec",[d,l,w,h],Math.ceil(4/32));const F=await this.readback(w,4);return this.compare(F,a,.001)}async testRMSNorm(){const o=new Float32Array(16);for(let f=0;f<16;f++)o[f]=(f-8)*.3;const s=new Float32Array(16);for(let f=0;f<16;f++)s[f]=Math.sin(f*.4)*.1;const r=this.cpuF32ToBF16Packed(s);let a=0;for(let f=0;f<16;f++)a+=o[f]*o[f];const d=1/Math.sqrt(a/16+1e-6),l=new Float32Array(16);for(let f=0;f<16;f++){const p=f,y=r[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);l[f]=o[f]*d*(1+u)}const w=this.makeF32Buffer("rmsn_in",o),h=this.makeU32Buffer("rmsn_w",r),F=this.makeOutputBuffer("rmsn_out",16),n=this.makeUniformTyped([16],[1e-6]);this.dispatch("rmsnorm",[w,h,F,n],1);const B=await this.readback(F,16);return this.compare(B,l,1e-4)}async testRMSNorm1PlusW(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array(8).fill(0),r=this.cpuF32ToBF16Packed(s);let a=0;for(let f=0;f<8;f++)a+=o[f]*o[f];const d=1/Math.sqrt(a/8+1e-6),l=new Float32Array(8);for(let f=0;f<8;f++)l[f]=o[f]*d;const w=this.makeF32Buffer("rmsn1_in",o),h=this.makeU32Buffer("rmsn1_w",r),F=this.makeOutputBuffer("rmsn1_out",8),n=this.makeUniformTyped([8],[1e-6]);this.dispatch("rmsnorm",[w,h,F,n],1);const B=await this.readback(F,8);return this.compare(B,l,1e-4)}async testSiLUMul(){const t=new Float32Array([0,1,-1,2,-2,.5,-.5,3]),o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array(8);for(let h=0;h<8;h++){const F=t[h]/(1+Math.exp(-t[h]));s[h]=F*o[h]}const 
r=this.makeF32Buffer("silu_gate",t),a=this.makeF32Buffer("silu_up",o),d=this.makeOutputBuffer("silu_out",8),l=this.makeUniform([8]);this.dispatch("silu_mul",[r,a,d,l],Math.ceil(8/32));const w=await this.readback(d,8);return this.compare(w,s,1e-5)}async testAdd(){const t=new Float32Array([1,2,3,4,5,6,7,8]),o=new Float32Array([.1,.2,.3,.4,.5,.6,.7,.8]),s=new Float32Array(8);for(let w=0;w<8;w++)s[w]=t[w]+o[w];const r=this.makeF32Buffer("add_a",t),a=this.makeF32Buffer("add_b",o),d=this.makeUniform([8]);this.dispatch("add",[r,a,d],Math.ceil(8/32));const l=await this.readback(r,8);return this.compare(l,s,1e-6)}async testEmbedding(){const o=new Float32Array(32);for(let F=0;F<o.length;F++)o[F]=(F+1)*.01;const s=this.cpuF32ToBF16Packed(o),r=2,a=new Float32Array(8);for(let F=0;F<8;F++){const n=r*8+F,B=s[Math.floor(n/2)],f=n%2===0?B&65535:B>>16&65535;a[F]=this.bf16ToF32(f)}const d=this.makeU32Buffer("emb_w",s),l=this.makeOutputBuffer("emb_out",8),w=this.makeUniform([r,8]);this.dispatch("embedding",[d,l,w],Math.ceil(8/256));const h=await this.readback(l,8);return this.compare(h,a,.001)}async testArgmax(){const t=new Float32Array(16);for(let w=0;w<16;w++)t[w]=Math.sin(w*.7)*3;const o=t.indexOf(Math.max(...t)),s=this.makeF32Buffer("argmax_in",t),r=this.makeOutputBuffer("argmax_res",2),a=this.makeUniform([16]);this.dispatch("argmax",[s,r,a],1);const l=(await this.readbackU32(r,2))[0];return{pass:l===o,maxErr:Math.abs(l-o),errors:l!==o?[{idx:0,got:l,expected:o}]:[]}}async testArgmaxLarge(){const t=new Float32Array(1024);for(let d=0;d<1024;d++)t[d]=Math.sin(d*.01)-2;t[773]=99;const o=this.makeF32Buffer("argmax_lg_in",t),s=this.makeOutputBuffer("argmax_lg_res",2),r=this.makeUniform([1024]);this.dispatch("argmax",[o,s,r],1);const a=await this.readbackU32(s,2);return{pass:a[0]===773,maxErr:Math.abs(a[0]-773),errors:a[0]!==773?[{got:a[0],expected:773}]:[]}}async testSplit(){const s=new Float32Array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]),r=new Float32Array([1,2,3,4,9,10,11,12]),a=new 
Float32Array([5,6,7,8,13,14,15,16]),d=this.makeF32Buffer("split_src",s),l=this.makeOutputBuffer("split_a",8),w=this.makeOutputBuffer("split_b",8),h=this.makeUniform([8,2,4]);this.dispatch("split",[d,l,w,h],Math.ceil(8/32));const F=await this.readback(l,8),n=await this.readback(w,8),B=this.compare(F,r,1e-6),f=this.compare(n,a,1e-6);return{pass:B.pass&&f.pass,maxErr:Math.max(B.maxErr,f.maxErr),errors:[...B.errors,...f.errors]}}async testSigmoidMul(){const t=new Float32Array([1,2,3,4,5,6,7,8]),o=new Float32Array([0,1,-1,2,-2,.5,-.5,3]),s=new Float32Array(8);for(let h=0;h<8;h++)s[h]=t[h]/(1+Math.exp(-o[h]));const r=this.makeF32Buffer("sigmul_x",t),a=this.makeF32Buffer("sigmul_g",o),d=this.makeOutputBuffer("sigmul_out",8),l=this.makeUniform([8]);this.dispatch("sigmoid_mul",[r,a,d,l],Math.ceil(8/32));const w=await this.readback(d,8);return this.compare(w,s,1e-5)}async testHeadRMSNorm(){const s=new Float32Array([1,2,3,4,5,6,7,8]),r=new Float32Array(4);for(let n=0;n<4;n++)r[n]=.1*(n+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(8);for(let n=0;n<2;n++){let B=0;for(let p=0;p<4;p++)B+=s[n*4+p]**2;const f=1/Math.sqrt(B/4+1e-6);for(let p=0;p<4;p++){const y=a[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);d[n*4+p]=s[n*4+p]*f*(1+u)}}const l=this.makeF32Buffer("hrmsn_x",s),w=this.makeU32Buffer("hrmsn_w",a),h=this.makeUniformTyped([2,4],[1e-6]);this.dispatch("head_rmsnorm",[l,w,h],2);const F=await this.readback(l,8);return this.compare(F,d,.001)}async testHeadRMSNormNogated(){const s=new Float32Array([1,2,3,4,5,6,7,8]),r=new Float32Array(4);for(let n=0;n<4;n++)r[n]=.5+.1*n;const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(8);for(let n=0;n<2;n++){let B=0;for(let p=0;p<4;p++)B+=s[n*4+p]**2;const f=1/Math.sqrt(B/4+1e-6);for(let p=0;p<4;p++){const y=a[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);d[n*4+p]=s[n*4+p]*f*u}}const 
l=this.makeF32Buffer("hrmsng_x",s),w=this.makeU32Buffer("hrmsng_w",a),h=this.makeUniformTyped([2,4],[1e-6]);this.dispatch("head_rmsnorm_nogated",[l,w,h],2);const F=await this.readback(l,8);return this.compare(F,d,.001)}async testCausalConv1d(){const t=new Float32Array([1,2,3,4]),o=new Float32Array(12);for(let y=0;y<o.length;y++)o[y]=(y+1)*.1;const s=new Float32Array(16);for(let y=0;y<s.length;y++)s[y]=.25;const r=this.cpuF32ToBF16Packed(s),a=new Float32Array(4),d=new Float32Array(12);for(let y=0;y<4;y++){const k=o[y],u=o[4+y],m=o[8+y],g=t[y],i=[];for(let _=0;_<4;_++){const b=y*4+_,N=r[Math.floor(b/2)],x=b%2===0?N&65535:N>>16&65535;i.push(this.bf16ToF32(x))}const c=i[0]*k+i[1]*u+i[2]*m+i[3]*g;a[y]=c/(1+Math.exp(-c)),d[y]=u,d[4+y]=m,d[8+y]=g}const l=this.makeF32Buffer("conv_x",t),w=this.makeF32Buffer("conv_hist",o),h=this.makeU32Buffer("conv_w",r),F=this.makeUniform([4]);this.dispatch("causal_conv1d",[l,w,h,F],Math.ceil(4/256));const n=await this.readback(l,4),B=await this.readback(w,12),f=this.compare(n,a,1e-4),p=this.compare(B,d,1e-4);return{pass:f.pass&&p.pass,maxErr:Math.max(f.maxErr,p.maxErr),errors:[...f.errors.map(y=>({...y,note:"conv output"})),...p.errors.map(y=>({...y,note:"history"}))]}}async testKVCacheStore(){const r=new Float32Array(8),a=new Float32Array(8);for(let c=0;c<8;c++)r[c]=c+1,a[c]=(c+1)*10;const l=8*8,w=new Float32Array(l),h=new Float32Array(l),F=new Float32Array(l),n=new Float32Array(l);for(let c=0;c<8;c++)F[24+c]=r[c],n[24+c]=a[c];const B=this.makeF32Buffer("kvs_k",r),f=this.makeF32Buffer("kvs_v",a),p=this.makeF32Buffer("kvs_kc",w),y=this.makeF32Buffer("kvs_vc",h),k=this.makeUniform([3,2,4]);this.dispatch("kv_cache_store",[B,f,p,y,k],Math.ceil(8/256));const u=await this.readback(p,l),m=await this.readback(y,l),g=this.compare(u,F,1e-6),i=this.compare(m,n,1e-6);return{pass:g.pass&&i.pass,maxErr:Math.max(g.maxErr,i.maxErr),errors:[...g.errors,...i.errors]}}async testGQAAttention(){const a=new Float32Array([1,0,0,0]),d=new 
Float32Array(4);d.set([1,0,0,0]);const l=new Float32Array(4);l.set([.5,.6,.7,.8]);const w=new Float32Array([.5,.6,.7,.8]),h=this.makeF32Buffer("gqa_q",a),F=this.makeF32Buffer("gqa_kc",d),n=this.makeF32Buffer("gqa_vc",l),B=this.makeOutputBuffer("gqa_out",4),f=new ArrayBuffer(32),p=new DataView(f);p.setUint32(0,1,!0),p.setUint32(4,4,!0),p.setUint32(8,1,!0),p.setUint32(12,1,!0),p.setUint32(16,1/1,!0),p.setUint32(20,1,!0);const y=this.gpu.createBufferFromData("gqa_params",new Uint8Array(f),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("gqa_attention_head",[h,F,n,B,y],1);const k=await this.readback(B,4);return this.compare(k,w,1e-4)}async testDeltaNetRecurrent(){const s=[.5,.5],r=[1,0],a=[.8,.2],d=new Float32Array([...s,...r,...a]),l=new Float32Array(4),w=new Float32Array([0]),h=new Float32Array([0]),F=new Float32Array([0]),n=new Float32Array([0]),B=this.cpuF32ToBF16Packed(F),f=this.cpuF32ToBF16Packed(n),p=1/(1+Math.exp(-0)),y=Math.sqrt(s[0]**2+s[1]**2),k=Math.sqrt(r[0]**2+r[1]**2),u=s.map(U=>U/y),m=r.map(U=>U/k),g=[0,0,0,0],i=[0,0],c=a.map((U,S)=>(U-i[S])*p);for(let U=0;U<2;U++)for(let S=0;S<2;S++)g[U*2+S]+=m[U]*c[S];const _=1/Math.sqrt(2),b=new Float32Array(2);for(let U=0;U<2;U++){let S=0;for(let O=0;O<2;O++)S+=g[O*2+U]*u[O]*_;b[U]=S}const N=this.makeF32Buffer("dn_qkv",d),x=this.makeF32Buffer("dn_state",l),q=this.makeOutputBuffer("dn_out",2),K=this.makeF32Buffer("dn_a",w),v=this.makeF32Buffer("dn_b",h),T=this.makeU32Buffer("dn_alog",B),V=this.makeU32Buffer("dn_dtb",f),G=this.makeUniform([1,2,2]);this.dispatch("deltanet_recurrent",[N,x,q,K,v,T,V,G],1);const C=await this.readback(q,2);return this.compare(C,b,.001)}async testAddRMSNorm(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array([.1,-.2,.3,-.4,.5,-.6,.7,-.8]),r=new Float32Array(8);for(let _=0;_<8;_++)r[_]=.1*(_+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(o);for(let _=0;_<8;_++)d[_]+=s[_];let l=0;for(let _=0;_<8;_++)l+=d[_]*d[_];const w=1/Math.sqrt(l/8+1e-6),h=new 
Float32Array(8);for(let _=0;_<8;_++){const b=a[Math.floor(_/2)]>>_%2*16&65535,N=this.bf16ToF32(b);h[_]=d[_]*w*(1+N)}const F=this.makeF32Buffer("addnorm_h",o),n=this.makeF32Buffer("addnorm_a",s),B=this.makeU32Buffer("addnorm_w",a),f=this.makeOutputBuffer("addnorm_out",8);this.makeUniform([8]);const p=new ArrayBuffer(16),y=new DataView(p);y.setUint32(0,8,!0),y.setFloat32(4,1e-6,!0);const k=this.gpu.createBufferFromData("addnorm_params",new Uint8Array(p),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("add_rmsnorm",[F,n,B,f,k],1);const u=await this.readback(f,8),m=await this.readback(F,8),g=d,i=this.compare(u,h,.001),c=this.compare(m,g,1e-6);return{pass:i.pass&&c.pass,maxErr:Math.max(i.maxErr,c.maxErr),errors:[...i.errors,...c.errors]}}async testThreeWayAddRMSNorm(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array([.1,-.2,.3,-.4,.5,-.6,.7,-.8]),r=new Float32Array([.5,.5,-.5,-.5,1,1,-1,-1]),a=new Float32Array(8);for(let N=0;N<8;N++)a[N]=.1*(N+1);const d=this.cpuF32ToBF16Packed(a),l=new Float32Array(8);for(let N=0;N<8;N++)l[N]=o[N]+s[N]+r[N];let w=0;for(let N=0;N<8;N++)w+=l[N]*l[N];const h=1/Math.sqrt(w/8+1e-6),F=new Float32Array(8);for(let N=0;N<8;N++){const x=d[Math.floor(N/2)]>>N%2*16&65535,q=this.bf16ToF32(x);F[N]=l[N]*h*(1+q)}const n=this.makeF32Buffer("twa_a",o),B=this.makeF32Buffer("twa_b",s),f=this.makeF32Buffer("twa_c",r),p=this.makeU32Buffer("twa_w",d),y=this.makeOutputBuffer("twa_hout",8),k=this.makeOutputBuffer("twa_normed",8),u=new ArrayBuffer(16),m=new DataView(u);m.setUint32(0,8,!0),m.setFloat32(4,1e-6,!0);const g=this.gpu.createBufferFromData("twa_params",new Uint8Array(u),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("three_way_add_rmsnorm",[n,B,f,p,y,k,g],1);const i=await this.readback(y,8),c=await this.readback(k,8),_=this.compare(i,l,1e-6),b=this.compare(c,F,.001);return{pass:_.pass&&b.pass,maxErr:Math.max(_.maxErr,b.maxErr),errors:[..._.errors,...b.errors]}}async testGQARoPERotateHalf(){const 
h=1/Math.sqrt(256),F=[];for(let D=0;D<32;D++)F.push(1/Math.pow(100,2*D/64));function n(D,M){const H=new Float32Array(D.length);for(let P=0;P<D.length;P++)H[P]=D[P];for(let P=0;P<32;P++){const Q=M*F[P],E=Math.cos(Q),J=Math.sin(Q);H[P]=D[P]*E-D[P+32]*J,H[P+32]=D[P+32]*E+D[P]*J}return H}const B=new Float32Array(256);B[0]=1,B[1]=.5,B[2]=.3,B[32]=.7,B[33]=.2;const f=new Float32Array(256);f[0]=1,f[32]=.5;const p=new Float32Array(256);p[1]=1,p[33]=.3;const y=new Float32Array(256);y[2]=1;const k=new Float32Array(256);k[0]=1;const u=new Float32Array(256);u[0]=0,u[1]=1;const m=new Float32Array(256);m[0]=0,m[2]=1;const g=n(B,2),i=n(f,0),c=n(p,1),_=n(y,2);let b=0,N=0,x=0;for(let D=0;D<256;D++)b+=g[D]*i[D],N+=g[D]*c[D],x+=g[D]*_[D];b*=h,N*=h,x*=h;const q=Math.max(b,N,x),K=Math.exp(b-q),v=Math.exp(N-q),T=Math.exp(x-q),V=K+v+T,G=K/V,C=v/V,U=T/V,S=new Float32Array(256);for(let D=0;D<256;D++)S[D]=G*k[D]+C*u[D]+U*m[D];const O=1*256,Y=new Float32Array(3*O),z=new Float32Array(3*O);Y.set(i,0),Y.set(c,O),Y.set(_,2*O),z.set(k,0),z.set(u,O),z.set(m,2*O);const X=this.makeF32Buffer("rope2_q",g),tt=this.makeF32Buffer("rope2_kc",Y),A=this.makeF32Buffer("rope2_vc",z),R=this.makeOutputBuffer("rope2_out",1*256),L=new ArrayBuffer(32),W=new DataView(L);W.setUint32(0,3,!0),W.setUint32(4,256,!0),W.setUint32(8,1,!0),W.setUint32(12,1,!0),W.setUint32(16,1/1,!0),W.setUint32(20,1,!0);const Z=this.gpu.createBufferFromData("rope2_params",new Uint8Array(L),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("gqa_attention_head",[X,tt,A,R,Z],1);const j=await this.readback(R,1*256);return this.compare(j,S,.001)}async testFusedSigmoidGPTQ(){const r=new Float32Array(64),a=new Float32Array(64);for(let m=0;m<64;m++)r[m]=(m+1)*.1,a[m]=(m-8)*.3;const d=new Uint32Array(32),l=new Float32Array(256);for(let m=0;m<8;m++)for(let g=0;g<4;g++){let i=0;for(let c=0;c<8;c++){const _=m*8+c,b=(_+g)%15;l[_*4+g]=b-8,i|=b<<c*4}d[m*4+g]=i}const w=new Float32Array(4);for(let m=0;m<4;m++)w[m]=.5+m*.1;const 
h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let m=0;m<4;m++){let g=0;for(let i=0;i<64;i++){const c=r[i]/(1+Math.exp(-a[i])),b=Math.floor(i/64)*4+m,N=h[Math.floor(b/2)]>>b%2*16&65535,x=this.f16ToF32(N);g+=c*l[i*4+m]*x}F[m]=g}const n=this.makeF32Buffer("fsg_x",r),B=this.makeF32Buffer("fsg_g",a),f=this.makeU32Buffer("fsg_qw",d),p=this.makeU32Buffer("fsg_sc",h),y=this.makeOutputBuffer("fsg_out",4),k=this.makeUniform([64,4,64]);this.dispatch("fused_sigmoid_gptq",[n,B,f,p,y,k],Math.ceil(4/32));const u=await this.readback(y,4);return this.compare(u,F,.01)}async testFusedSiluGPTQ(){const r=new Float32Array(64),a=new Float32Array(64);for(let m=0;m<64;m++)r[m]=(m-8)*.2,a[m]=(m+1)*.15;const d=new Uint32Array(32),l=new Float32Array(256);for(let m=0;m<8;m++)for(let g=0;g<4;g++){let i=0;for(let c=0;c<8;c++){const _=m*8+c,b=(_*2+g)%15;l[_*4+g]=b-8,i|=b<<c*4}d[m*4+g]=i}const w=new Float32Array(4);for(let m=0;m<4;m++)w[m]=.4+m*.2;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let m=0;m<4;m++){let g=0;for(let i=0;i<64;i++){const _=r[i]/(1+Math.exp(-r[i]))*a[i],N=Math.floor(i/64)*4+m,x=h[Math.floor(N/2)]>>N%2*16&65535,q=this.f16ToF32(x);g+=_*l[i*4+m]*q}F[m]=g}const n=this.makeF32Buffer("fslg_a",r),B=this.makeF32Buffer("fslg_b",a),f=this.makeU32Buffer("fslg_qw",d),p=this.makeU32Buffer("fslg_sc",h),y=this.makeOutputBuffer("fslg_out",4),k=this.makeUniform([64,4,64]);this.dispatch("fused_silu_gptq",[n,B,f,p,y,k],Math.ceil(4/32));const u=await this.readback(y,4);return this.compare(u,F,.01)}async testFusedAddNormGateUpSiLU(){const a=new Float32Array(64),d=new Float32Array(64),l=new Float32Array(64),w=new Float32Array(64);for(let A=0;A<64;A++)a[A]=(A+1)*.1,d[A]=(A-5)*.05,l[A]=(A%7-3)*.03,w[A]=(A%3-1)*.1;const h=new Uint32Array(32),F=new Uint32Array(32),n=new Float32Array(256),B=new Float32Array(256);for(let A=0;A<64;A++)for(let R=0;R<4;R++)n[A*4+R]=Math.sin(A*.7+R*1.3)*.5,B[A*4+R]=Math.cos(A*.3+R*.9)*.5;const f=new Uint32Array(4/2),p=new Uint32Array(4/2);for(let 
A=0;A<4;A++){let R=0,L=0;for(let P=0;P<64;P++)R=Math.max(R,Math.abs(n[P*4+A])),L=Math.max(L,Math.abs(B[P*4+A]));const W=R/7,Z=L/7,j=this.f32ToF16(W),D=this.f32ToF16(Z),M=this.f16ToF32(j),H=this.f16ToF32(D);A%2===0?(f[A/2]=j,p[A/2]=D):(f[(A-1)/2]|=j<<16,p[(A-1)/2]|=D<<16);for(let P=0;P<64;P++){const Q=Math.round(n[P*4+A]/M)+8,E=Math.round(B[P*4+A]/H)+8,J=Math.max(0,Math.min(15,Q)),rt=Math.max(0,Math.min(15,E)),st=Math.floor(P/8),at=P%8*4;h[st*4+A]|=J<<at,F[st*4+A]|=rt<<at}}const y=new Uint32Array(64/2);for(let A=0;A<64;A+=2){const R=this.f32ToBF16(w[A]),L=this.f32ToBF16(w[A+1]);y[A/2]=R|L<<16}const k=new Float32Array(64);let u=0;for(let A=0;A<64;A++)k[A]=a[A]+d[A]+l[A],u+=k[A]*k[A];const m=1/Math.sqrt(u/64+1e-6),g=new Float32Array(64);for(let A=0;A<64;A++){const R=this.unpackBF16(y[Math.floor(A/2)],A%2);g[A]=k[A]*m*(1+R)}const i=new Float32Array(4);for(let A=0;A<4;A++){let R=0,L=0;const W=this.f16ToF32(A%2===0?f[Math.floor(A/2)]&65535:f[Math.floor(A/2)]>>>16),Z=this.f16ToF32(A%2===0?p[Math.floor(A/2)]&65535:p[Math.floor(A/2)]>>>16);for(let j=0;j<64;j++){const D=Math.floor(j/8),M=j%8*4,H=h[D*4+A]>>>M&15,P=F[D*4+A]>>>M&15;R+=g[j]*W*(H-8),L+=g[j]*Z*(P-8)}i[A]=R/(1+Math.exp(-R))*L}const c=k,_=16+640*16,b=new ArrayBuffer(_),N=new DataView(b);N.setUint32(0,64,!0),N.setUint32(4,4,!0),N.setUint32(8,64,!0),N.setFloat32(12,1e-6,!0);for(let A=0;A<64/2;A++){const R=Math.floor(A/4),L=A%4;N.setUint32(16+R*16+L*4,y[A],!0)}const x=this.gpu.createBufferFromData("fang_params",new Uint8Array(b),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),q=new Uint32Array(64);q.set(h,0),q.set(F,32);const K=8/(64/8),v=new Uint32Array(K*4/2*2);v.set(f,0),v.set(p,K*4/2);const 
T=this.makeF32Buffer("fang_hin",a),V=this.makeF32Buffer("fang_mlpres",d),G=this.makeF32Buffer("fang_attnres",l),C=this.makeU32Buffer("fang_mqw",q),U=this.makeU32Buffer("fang_msc",v),S=this.makeOutputBuffer("fang_hout",64),O=this.makeOutputBuffer("fang_out",4);this.dispatch("fused_addnorm_gate_up_silu",[T,V,G,C,U,S,O,x],Math.max(1,Math.ceil(4/32)));const Y=await this.readback(O,4),z=await this.readback(S,64),X=this.compare(Y,i,.02),tt=this.compare(z,c,1e-6);return{pass:X.pass&&tt.pass,maxErr:Math.max(X.maxErr,tt.maxErr),errors:[...X.errors||[],...tt.errors||[]]}}async testFusedSplitQKNormKVStore(){const l=new Float32Array(32);for(let M=0;M<l.length;M++)l[M]=Math.sin(M*.5+1)*.3;const w=new Float32Array(8),h=new Float32Array(8);for(let M=0;M<8;M++)w[M]=Math.cos(M*.7+2)*.4,h[M]=Math.sin(M*1.1+3)*.2;const F=new Float32Array(16),n=new Float32Array(8);for(let M=0;M<16;M++)F[M]=(M%3-1)*.05;for(let M=0;M<8;M++)n[M]=(M%4-2)*.03;const B=new Uint32Array(16/2);for(let M=0;M<16;M+=2)B[M/2]=this.f32ToBF16(F[M])|this.f32ToBF16(F[M+1])<<16;const f=new Uint32Array(8/2);for(let M=0;M<8;M+=2)f[M/2]=this.f32ToBF16(n[M])|this.f32ToBF16(n[M+1])<<16;const p=new Float32Array(16),y=new Float32Array(16);for(let M=0;M<2;M++){const H=new Float32Array(8);for(let E=0;E<8;E++)H[E]=l[M*8*2+E],y[M*8+E]=l[M*8*2+8+E];let P=0;for(let E=0;E<8;E++)P+=H[E]*H[E];const Q=1/Math.sqrt(P/8+1e-6);for(let E=0;E<8;E++){const J=this.unpackBF16(B[Math.floor((M*8+E)/2)],(M*8+E)%2);p[M*8+E]=H[E]*Q*(1+J)}}const k=new Float32Array(8),u=new Float32Array(32),m=new Float32Array(32);for(let M=0;M<1;M++){let H=0;for(let Q=0;Q<8;Q++)H+=w[M*8+Q]*w[M*8+Q];const P=1/Math.sqrt(H/8+1e-6);for(let Q=0;Q<8;Q++){const E=this.unpackBF16(f[Math.floor((M*8+Q)/2)],(M*8+Q)%2),J=w[M*8+Q]*P*(1+E);k[M*8+Q]=J,u[24+M*8+Q]=J,m[24+M*8+Q]=h[M*8+Q]}}const g=32+320*16,i=new ArrayBuffer(g),c=new DataView(i);c.setUint32(0,2,!0),c.setUint32(4,1,!0),c.setUint32(8,8,!0),c.setFloat32(12,1e-6,!0),c.setUint32(16,3,!0);for(let M=0;M<16/2;M++){const 
H=Math.floor(M/4),P=M%4;c.setUint32(32+H*16+P*4,B[M],!0)}const _=16/2;for(let M=0;M<8/2;M++){const H=_+M,P=Math.floor(H/4),Q=H%4;c.setUint32(32+P*16+Q*4,f[M],!0)}const b=this.gpu.createBufferFromData("fsqk_params",new Uint8Array(i),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),N=this.makeF32Buffer("fsqk_qpf",l),x=this.makeF32Buffer("fsqk_kp",w),q=this.makeF32Buffer("fsqk_vp",h),K=this.makeOutputBuffer("fsqk_qp",16),v=this.makeOutputBuffer("fsqk_qg",16),T=this.makeOutputBuffer("fsqk_kc",32),V=this.makeOutputBuffer("fsqk_vc",32);this.dispatch("fused_split_qknorm_kvstore",[N,x,q,K,v,T,V,b],3);const G=await this.readback(K,16),C=await this.readback(v,16),U=await this.readback(x,8),S=await this.readback(T,32),O=await this.readback(V,32),Y=this.compare(G,p,1e-5),z=this.compare(C,y,1e-6),X=this.compare(U,k,1e-5),tt=S.slice(24,32),A=O.slice(24,32),R=u.slice(24,32),L=m.slice(24,32),W=this.compare(tt,R,1e-5),Z=this.compare(A,L,1e-6),j=Math.max(Y.maxErr,z.maxErr,X.maxErr,W.maxErr,Z.maxErr);return{pass:Y.pass&&z.pass&&X.pass&&W.pass&&Z.pass,maxErr:j,errors:[...Y.errors||[],...z.errors||[],...X.errors||[],...W.errors||[],...Z.errors||[]]}}async benchmarkGPTQ(e){const t=[{K:2048,N:2048,gs:128,label:"2048→2048 (Z/out_proj)"},{K:2048,N:6144,gs:128,label:"2048→6144 (QKV/gate+up)"},{K:6144,N:2048,gs:128,label:"6144→2048 (down_proj)"}],o=[1,2,4,8],s=50,r=3,a=10,d=[];for(const l of t){const{K:w,N:h,gs:F,label:n}=l,B=w/8,f=B/(F/8),p=new Float32Array(w);for(let b=0;b<w;b++)p[b]=Math.random()*2-1;const y=new Uint32Array(B*h);for(let b=0;b<y.length;b++)y[b]=Math.random()*4294967295>>>0;const k=new Float32Array(f*h);for(let b=0;b<k.length;b++)k[b]=Math.random()*.5;const u=this.cpuF32ToF16Packed(k),m=this.makeF32Buffer("bm_in",p),g=this.makeU32Buffer("bm_qw",y),i=this.makeU32Buffer("bm_sc",u),c=[];for(let b=0;b<s;b++)c.push(this.makeOutputBuffer(`bm_out_${b}`,h));const _=B*h*4+f*h*2;for(const b of o){if(b>1&&f%b!==0)continue;let N;if(b===1){const v=this.makeUniform([w,h,F]);for(let 
G=0;G<r;G++){this.gpu.beginBatch();for(let C=0;C<s;C++)this.dispatch("gptq_matvec",[m,g,i,c[C],v],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const T=[];for(let G=0;G<a;G++){await this.gpu.device.queue.onSubmittedWorkDone();const C=performance.now();this.gpu.beginBatch();for(let U=0;U<s;U++)this.dispatch("gptq_matvec",[m,g,i,c[U],v],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),T.push(performance.now()-C)}T.sort((G,C)=>G-C),N=T.slice(1,-1).reduce((G,C)=>G+C,0)/(T.length-2)/s}else{const v=[];for(let U=0;U<s;U++)v.push(this.makeOutputBuffer(`bm_part_${U}`,h*b));const T=this.makeUniform([w,h,F,b]),V=this.makeUniform([h,b]);for(let U=0;U<r;U++){this.gpu.beginBatch();for(let S=0;S<s;S++)this.dispatch("gptq_splitk",[m,g,i,v[S],T],Math.ceil(h/32),b),this.dispatch("reduce_splitk",[v[S],c[S],V],Math.ceil(h/256));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const G=[];for(let U=0;U<a;U++){await this.gpu.device.queue.onSubmittedWorkDone();const S=performance.now();this.gpu.beginBatch();for(let O=0;O<s;O++)this.dispatch("gptq_splitk",[m,g,i,v[O],T],Math.ceil(h/32),b),this.dispatch("reduce_splitk",[v[O],c[O],V],Math.ceil(h/256));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),G.push(performance.now()-S)}G.sort((U,S)=>U-S),N=G.slice(1,-1).reduce((U,S)=>U+S,0)/(G.length-2)/s}const x=_/1e9/(N/1e3),q=Math.ceil(h/32)*b,K={label:n,ns:b,wgs:q,avgMs:N.toFixed(4),bwGBs:x.toFixed(1)};d.push(K),e?.(K)}if(this.gpu.hasF16&&this.pipelines.gptq_matvec_f16){const b=this.makeUniform([w,h,F]);for(let T=0;T<r;T++){this.gpu.beginBatch();for(let V=0;V<s;V++)this.dispatch("gptq_matvec_f16",[m,g,i,c[V],b],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const N=[];for(let T=0;T<a;T++){await this.gpu.device.queue.onSubmittedWorkDone();const V=performance.now();this.gpu.beginBatch();for(let 
G=0;G<s;G++)this.dispatch("gptq_matvec_f16",[m,g,i,c[G],b],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),N.push(performance.now()-V)}N.sort((T,V)=>T-V);const q=N.slice(1,-1).reduce((T,V)=>T+V,0)/(N.length-2)/s,K=_/1e9/(q/1e3),v={label:n,ns:"f16",wgs:Math.ceil(h/32),avgMs:q.toFixed(4),bwGBs:K.toFixed(1)};d.push(v),e?.(v)}}return d}_makeNormParams(e,t=1e-6){const o=new ArrayBuffer(16),s=new DataView(o);return s.setUint32(0,e,!0),s.setFloat32(4,t,!0),this.gpu.createBufferFromData("norm_params",new Uint8Array(o),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}_cpuRMSNorm1PlusW(e,t,o,s=1e-6){let r=0;for(let l=0;l<o;l++)r+=e[l]*e[l];const a=1/Math.sqrt(r/o+s),d=new Float32Array(o);for(let l=0;l<o;l++){const w=t[Math.floor(l/2)]>>l%2*16&65535,h=this.bf16ToF32(w);d[l]=e[l]*a*(1+h)}return d}async testAddRMSNormB2(){const o=new Float32Array(32),s=new Float32Array(32);for(let u=0;u<32;u++)o[u]=Math.sin(u*.7)*2,s[u]=Math.cos(u*.3)*.5;const r=new Float32Array(16);for(let u=0;u<16;u++)r[u]=.1*(u+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(32),l=new Float32Array(32);for(let u=0;u<2;u++){const m=new Float32Array(16);for(let i=0;i<16;i++)m[i]=o[u*16+i]+s[u*16+i];for(let i=0;i<16;i++)l[u*16+i]=m[i];const g=this._cpuRMSNorm1PlusW(m,a,16,1e-6);d.set(g,u*16)}const w=this.makeF32Buffer("b2an_h",o),h=this.makeF32Buffer("b2an_a",s),F=this.makeU32Buffer("b2an_w",a),n=this.makeOutputBuffer("b2an_out",32),B=this._makeNormParams(16,1e-6);this.dispatch("add_rmsnorm_b2",[w,h,F,n,B],2);const f=await this.readback(n,32),p=await this.readback(w,32),y=this.compare(f,d,.001),k=this.compare(p,l,1e-6);return{pass:y.pass&&k.pass,maxErr:Math.max(y.maxErr,k.maxErr),errors:[...y.errors,...k.errors]}}async testAddRMSNormROB2(){const o=new Float32Array(32),s=new Float32Array(32);for(let f=0;f<32;f++)o[f]=Math.sin(f*.5)*3,s[f]=Math.cos(f*.8)*.3;const r=new Float32Array(16);for(let f=0;f<16;f++)r[f]=.05*(f+1);const a=this.cpuF32ToBF16Packed(r),d=new 
Float32Array(32);for(let f=0;f<2;f++){const p=new Float32Array(16);for(let k=0;k<16;k++)p[k]=o[f*16+k]+s[f*16+k];const y=this._cpuRMSNorm1PlusW(p,a,16,1e-6);d.set(y,f*16)}const l=this.makeF32Buffer("b2anro_h",o),w=this.makeF32Buffer("b2anro_a",s),h=this.makeU32Buffer("b2anro_w",a),F=this.makeOutputBuffer("b2anro_out",32),n=this._makeNormParams(16,1e-6);this.dispatch("add_rmsnorm_ro_b2",[l,w,h,F,n],2);const B=await this.readback(F,32);return this.compare(B,d,.001)}async testThreeWayAddRMSNormB2(){const o=new Float32Array(32),s=new Float32Array(32),r=new Float32Array(32);for(let i=0;i<32;i++)o[i]=Math.sin(i*.4)*2,s[i]=Math.cos(i*.6)*.5,r[i]=Math.sin(i*1.1)*.3;const a=new Float32Array(16);for(let i=0;i<16;i++)a[i]=.1*(i+1);const d=this.cpuF32ToBF16Packed(a),l=new Float32Array(32),w=new Float32Array(32);for(let i=0;i<2;i++){const c=new Float32Array(16);for(let b=0;b<16;b++)c[b]=o[i*16+b]+s[i*16+b]+r[i*16+b];l.set(c,i*16);const _=this._cpuRMSNorm1PlusW(c,d,16,1e-6);w.set(_,i*16)}const h=this.makeF32Buffer("b2twa_a",o),F=this.makeF32Buffer("b2twa_b",s),n=this.makeF32Buffer("b2twa_c",r),B=this.makeU32Buffer("b2twa_w",d),f=this.makeOutputBuffer("b2twa_hout",32),p=this.makeOutputBuffer("b2twa_normed",32),y=this._makeNormParams(16,1e-6);this.dispatch("three_way_add_rmsnorm_b2",[h,F,n,B,f,p,y],2);const k=await this.readback(p,32),u=await this.readback(f,32),m=this.compare(k,w,.001),g=this.compare(u,l,1e-6);return{pass:m.pass&&g.pass,maxErr:Math.max(m.maxErr,g.maxErr),errors:[...m.errors,...g.errors]}}_makeGPTQData(e,t,o){const s=e/8,r=new Uint32Array(s*t),a=new Float32Array(e*t);for(let h=0;h<s;h++)for(let F=0;F<t;F++){let n=0;for(let B=0;B<8;B++){const f=h*8+B,p=(f*3+F*7)%15;a[f*t+F]=p-8,n|=p<<B*4}r[h*t+F]=n}const d=e/o,l=new Float32Array(d*t);for(let h=0;h<d*t;h++)l[h]=.3+h*.15;const w=this.cpuF32ToF16Packed(l);return{qweight:r,rawWeights:a,scalesPacked:w}}_cpuGPTQMatvec(e,t,o,s,r,a,d=0){const l=new Float32Array(r);for(let w=0;w<r;w++){let h=0;for(let F=0;F<s;F++){const 
B=Math.floor(F/a)*r+w,f=o[Math.floor(B/2)]>>B%2*16&65535,p=this.f16ToF32(f);h+=e[d+F]*t[F*r+w]*p}l[w]=h}return l}async testFusedSiluGPTQB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,{qweight:s,rawWeights:r,scalesPacked:a}=this._makeGPTQData(e,t,o),d=new Float32Array(2*e),l=new Float32Array(2*e);for(let k=0;k<2*e;k++)d[k]=Math.sin(k*.3)*2,l[k]=Math.cos(k*.5)*1.5;const w=new Float32Array(2*t);for(let k=0;k<2;k++){const u=new Float32Array(e);for(let g=0;g<e;g++){const i=d[k*e+g];u[g]=i/(1+Math.exp(-i))*l[k*e+g]}const m=this._cpuGPTQMatvec(u,r,a,e,t,o);w.set(m,k*t)}const h=this.makeF32Buffer("b2sl_a",d),F=this.makeF32Buffer("b2sl_b",l),n=this.makeU32Buffer("b2sl_qw",s),B=this.makeU32Buffer("b2sl_sc",a),f=this.makeOutputBuffer("b2sl_out",2*t),p=this.makeUniform([e,t,o]);this.dispatch("fused_silu_gptq_b2_f16",[h,F,n,B,f,p],Math.ceil(t/32));const y=await this.readback(f,2*t);return this.compare(y,w,.05)}async testFusedSigmoidGPTQB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,{qweight:s,rawWeights:r,scalesPacked:a}=this._makeGPTQData(e,t,o),d=new Float32Array(2*e),l=new Float32Array(2*e);for(let k=0;k<2*e;k++)d[k]=Math.sin(k*.4)*1.5,l[k]=Math.cos(k*.7)*2;const w=new Float32Array(2*t);for(let k=0;k<2;k++){const u=new Float32Array(e);for(let g=0;g<e;g++){const i=1/(1+Math.exp(-l[k*e+g]));u[g]=i*d[k*e+g]}const m=this._cpuGPTQMatvec(u,r,a,e,t,o);w.set(m,k*t)}const h=this.makeF32Buffer("b2sg_x",d),F=this.makeF32Buffer("b2sg_g",l),n=this.makeU32Buffer("b2sg_qw",s),B=this.makeU32Buffer("b2sg_sc",a),f=this.makeOutputBuffer("b2sg_out",2*t),p=this.makeUniform([e,t,o]);this.dispatch("fused_sigmoid_gptq_b2_f16",[h,F,n,B,f,p],Math.ceil(t/32));const y=await this.readback(f,2*t);return this.compare(y,w,.05)}async testFusedGateUpSiLUB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,s=e/8,r=new Uint32Array(s*t),a=new Float32Array(e*t);for(let 
N=0;N<s;N++)for(let x=0;x<t;x++){let q=0;for(let K=0;K<8;K++){const v=N*8+K,T=(v*3+x*7)%15;a[v*t+x]=T-8,q|=T<<K*4}r[N*t+x]=q}const d=e/o,l=new Float32Array(d*t);for(let N=0;N<d*t;N++)l[N]=.01+N%8*.005;const w=this.cpuF32ToF16Packed(l),h=new Uint32Array(s*t),F=new Float32Array(e*t);for(let N=0;N<s;N++)for(let x=0;x<t;x++){let q=0;for(let K=0;K<8;K++){const v=N*8+K,T=(v*5+x*3)%15;F[v*t+x]=T-8,q|=T<<K*4}h[N*t+x]=q}const n=e/o,B=new Float32Array(n*t);for(let N=0;N<n*t;N++)B[N]=.01+N%8*.005;const f=this.cpuF32ToF16Packed(B),p=new Float32Array(2*e);for(let N=0;N<2*e;N++)p[N]=Math.sin(N*.2)*.5;const y=new Float32Array(2*t);for(let N=0;N<2;N++){const x=this._cpuGPTQMatvec(p,a,w,e,t,o,N*e),q=this._cpuGPTQMatvec(p,F,f,e,t,o,N*e);for(let K=0;K<t;K++){const v=x[K];y[N*t+K]=v/(1+Math.exp(-v))*q[K]}}const k=this.makeF32Buffer("b2gus_in",p),u=this.makeU32Buffer("b2gus_gqw",r),m=this.makeU32Buffer("b2gus_gsc",w),g=this.makeU32Buffer("b2gus_uqw",h),i=this.makeU32Buffer("b2gus_usc",f),c=this.makeOutputBuffer("b2gus_out",2*t),_=this.makeUniform([e,t,o]);this.dispatch("fused_gate_up_silu_b2_f16",[k,u,m,g,i,c,_],Math.ceil(t/32));const b=await this.readback(c,2*t);return this.compare(b,y,.05)}_makeGPTQMatmulB2Data(e,t,o){const s=e/8,r=new Float32Array(2*e);for(let n=0;n<e;n++)r[n]=Math.sin(n*.5)*.5;for(let n=0;n<e;n++)r[e+n]=Math.cos(n*.3)*.5;const a=new Uint32Array(s*t),d=new Float32Array(e*t);for(let n=0;n<s;n++)for(let B=0;B<t;B++){let f=0;for(let p=0;p<8;p++){const y=n*8+p,k=(y*3+B*7)%15;d[y*t+B]=k-8,f|=k<<p*4}a[n*t+B]=f}const l=e/o,w=new Float32Array(l*t);for(let n=0;n<l*t;n++)w[n]=.01+n%8*.005;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(2*t);for(let n=0;n<2;n++)for(let B=0;B<t;B++){let f=0;for(let p=0;p<e;p++){const k=Math.floor(p/o)*t+B,u=h[Math.floor(k/2)]>>k%2*16&65535,m=this.f16ToF32(u);f+=r[n*e+p]*d[p*t+B]*m}F[n*t+B]=f}return{input:r,qweight:a,scalesPacked:h,expected:F,K:e,N:t,gs:o}}async 
testGPTQMatmulB2(){const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(128,8,64),l=this.makeF32Buffer("b2_input",e),w=this.makeU32Buffer("b2_qw",t),h=this.makeU32Buffer("b2_sc",o),F=this.makeOutputBuffer("b2_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2",[l,w,h,F,n],Math.ceil(a/32));const B=await this.readback(F,2*a);return this.compare(B,s,.02)}async testGPTQMatmulB2F16(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(128,8,64),l=this.makeF32Buffer("b2f16_input",e),w=this.makeU32Buffer("b2f16_qw",t),h=this.makeU32Buffer("b2f16_sc",o),F=this.makeOutputBuffer("b2f16_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2_f16",[l,w,h,F,n],Math.ceil(a/32));const B=await this.readback(F,2*a);return this.compare(B,s,.05)}async testGPTQMatmulB2_4T_F16(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(256,16,64),l=this.makeF32Buffer("b2_4t_input",e),w=this.makeU32Buffer("b2_4t_qw",t),h=this.makeU32Buffer("b2_4t_sc",o),F=this.makeOutputBuffer("b2_4t_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2_4t_f16",[l,w,h,F,n],Math.ceil(a/8));const B=await this.readback(F,2*a);return this.compare(B,s,.05)}f32ToBF16(e){const t=new ArrayBuffer(4);return new Uint32Array(t)[0]=new Uint32Array(new Float32Array([e]).buffer)[0],new Uint32Array(t)[0]>>>16}unpackBF16(e,t){const o=e>>>t*16&65535;return this.bf16ToF32(o)}}function $(I){const e=document.getElementById("log");e.textContent+=I+`
|
| 3 |
`,e.scrollTop=e.scrollHeight}document.querySelectorAll(".tab").forEach(I=>{I.addEventListener("click",()=>{document.querySelectorAll(".tab").forEach(e=>e.classList.remove("active")),document.querySelectorAll(".panel").forEach(e=>e.classList.remove("active")),I.classList.add("active"),document.getElementById(I.dataset.panel).classList.add("active")})});document.getElementById("runTests").addEventListener("click",async()=>{const I=document.getElementById("runTests"),e=document.getElementById("testResults"),t=document.getElementById("testSummary");I.disabled=!0,e.innerHTML="",t.innerHTML="",$("Initializing WebGPU for tests...");try{const o=new ot;await o.init(),$(`WebGPU initialized. Running tests...
|
| 4 |
`);let s=0,r=0;await o.runAll(a=>{const d=a.pass?"✓":"✗",l=a.pass?"pass":"fail";a.pass?s++:r++;const w=document.createElement("div");if(w.className="test-row",w.innerHTML=`
|
| 5 |
<span class="test-icon ${l}">${d}</span>
|
|
@@ -14,7 +14,7 @@ import{G as nt,S as ct,_ as et}from"./gpu-ops-PQDFq1iI.js";class ot{constructor(
|
|
| 14 |
</div>
|
| 15 |
`,$(`
|
| 16 |
Done: ${s} passed, ${r} failed`),o.destroy()}catch(o){$(`Error: ${o.message}
|
| 17 |
-
${o.stack}`)}finally{I.disabled=!1}});document.getElementById("loadAndProfile").addEventListener("click",async()=>{const I=document.getElementById("loadAndProfile"),e=document.getElementById("profStatus"),t=document.getElementById("profCategories"),o=document.getElementById("profTopOps");I.disabled=!0,t.innerHTML="",o.innerHTML="";try{const s=document.getElementById("profRepo").value.trim();e.textContent="Loading model...",$("Profiler: loading model from "+s);const{GPUContext:r}=await et(async()=>{const{GPUContext:c}=await import("./gpu-ops-
|
| 18 |
Profile complete: ${i.total.toFixed(1)}ms total (~${i.estimatedTokPerSec} tok/s with sync overhead)`),$(`
|
| 19 |
Category breakdown:`);for(const c of i.categories){const _=document.createElement("div");_.className="prof-row";const b=c.name.includes("matvec")||c.name==="lm_head"?"var(--orange)":c.name.includes("attention")||c.name.includes("deltanet")?"var(--blue)":"var(--green)";_.innerHTML=`
|
| 20 |
<span class="prof-name">${c.name}</span>
|
|
|
|
| 1 |
+
const __vite__mapDeps=(i,m=__vite__mapDeps,d=(m.f||(m.f=["./qwen35-model-CmivuQnY.js","./gpu-ops-DKsrMEcC.js"])))=>i.map(i=>d[i]);
|
| 2 |
+
import{G as nt,S as ct,_ as et}from"./gpu-ops-DKsrMEcC.js";class ot{constructor(){this.gpu=null,this.results=[],this.pipelines={}}async init(){this.gpu=new nt,await this.gpu.init();for(const[e,t]of Object.entries(ct))this.pipelines[e]=this.gpu.getOrCreatePipeline(e,t);return this}async runAll(e){const t=[["GPTQ MatVec",()=>this.testGPTQMatvec()],["GPTQ MatVec (group boundary)",()=>this.testGPTQMatvecGroups()],["GPTQ MatVec (gs=32)",()=>this.testGPTQMatvecGS32()],["GPTQ Split-K",()=>this.testGPTQSplitK()],["GPTQ F16",()=>this.testGPTQF16()],["BF16 MatVec",()=>this.testBF16Matvec()],["RMSNorm",()=>this.testRMSNorm()],["RMSNorm (1+w formula)",()=>this.testRMSNorm1PlusW()],["SiLU * Mul",()=>this.testSiLUMul()],["Vector Add",()=>this.testAdd()],["Embedding (BF16)",()=>this.testEmbedding()],["Argmax",()=>this.testArgmax()],["Argmax (large)",()=>this.testArgmaxLarge()],["Split interleaved",()=>this.testSplit()],["Sigmoid Mul",()=>this.testSigmoidMul()],["Head RMSNorm (1+w)",()=>this.testHeadRMSNorm()],["Head RMSNorm (nogated, w)",()=>this.testHeadRMSNormNogated()],["Causal Conv1d",()=>this.testCausalConv1d()],["KV Cache Store",()=>this.testKVCacheStore()],["GQA Attention (single pos)",()=>this.testGQAAttention()],["DeltaNet Recurrent",()=>this.testDeltaNetRecurrent()],["Add + RMSNorm (fused)",()=>this.testAddRMSNorm()],["Three-way Add + RMSNorm",()=>this.testThreeWayAddRMSNorm()],["GQA RoPE rotate_half",()=>this.testGQARoPERotateHalf()],["Fused sigmoid + GPTQ",()=>this.testFusedSigmoidGPTQ()],["Fused SiLU + GPTQ",()=>this.testFusedSiluGPTQ()],["Fused AddNorm+GateUp+SiLU",()=>this.testFusedAddNormGateUpSiLU()],["Fused Split+QKNorm+KVStore",()=>this.testFusedSplitQKNormKVStore()],["Add+RMSNorm B=2",()=>this.testAddRMSNormB2()],["Add+RMSNorm RO B=2",()=>this.testAddRMSNormROB2()],["Three-way Add+RMSNorm B=2",()=>this.testThreeWayAddRMSNormB2()],["Fused SiLU+GPTQ B=2",()=>this.testFusedSiluGPTQB2()],["Fused Sigmoid+GPTQ B=2",()=>this.testFusedSigmoidGPTQB2()],["Fused 
GateUp+SiLU B=2",()=>this.testFusedGateUpSiLUB2()],["GPTQ Matmul B=2",()=>this.testGPTQMatmulB2()],["GPTQ Matmul B=2 F16",()=>this.testGPTQMatmulB2F16()],["GPTQ Matmul B=2 4T F16",()=>this.testGPTQMatmulB2_4T_F16()]];for(const[o,s]of t)try{const r=await s();this.results.push({name:o,...r}),e?.({name:o,...r})}catch(r){this.results.push({name:o,pass:!1,error:r.message,maxErr:NaN}),e?.({name:o,pass:!1,error:r.message,maxErr:NaN})}return this.results}destroy(){this.gpu?.destroy()}makeBuffer(e,t,o=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC){return this.gpu.createBufferFromData(e,new Uint8Array(t.buffer,t.byteOffset,t.byteLength),o)}makeF32Buffer(e,t){return this.makeBuffer(e,new Float32Array(t))}makeU32Buffer(e,t){return this.makeBuffer(e,new Uint32Array(t))}makeOutputBuffer(e,t){return this.gpu.createBuffer(e,t*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC)}makeUniform(e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),o=new DataView(t);for(let s=0;s<e.length;s++)typeof e[s]=="number"&&!Number.isInteger(e[s])?o.setFloat32(s*4,e[s],!0):o.setUint32(s*4,e[s],!0);return this.gpu.createBufferFromData("uniform",new Uint8Array(t),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}makeUniformTyped(e,t){const o=e.length+t.length,s=new ArrayBuffer(Math.max(16,Math.ceil(o*4/16)*16)),r=new DataView(s);let a=0;for(const d of e)r.setUint32(a,d,!0),a+=4;for(const d of t)r.setFloat32(a,d,!0),a+=4;return this.gpu.createBufferFromData("uniform_typed",new Uint8Array(s),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}dispatch(e,t,o,s=1){const r=this.pipelines[e],a=this.gpu.createBindGroup(r,0,t);this.gpu.dispatch(r,[a],o,s)}async readback(e,t){return this.gpu.readBuffer(e,t*4)}async readbackU32(e,t){const o=this.gpu.createReadbackBuffer("_rb",t*4),s=this.gpu.device.createCommandEncoder();s.copyBufferToBuffer(e,0,o,0,t*4),this.gpu.device.queue.submit([s.finish()]),await o.mapAsync(GPUMapMode.READ);const r=new 
Uint32Array(o.getMappedRange().slice(0));return o.unmap(),o.destroy(),r}compare(e,t,o=1e-4){let s=0;const r=[];for(let a=0;a<t.length;a++){const d=Math.abs(e[a]-t[a]);s=Math.max(s,d),d>o&&r.push({idx:a,got:e[a],expected:t[a],err:d})}return{pass:r.length===0,maxErr:s,errors:r.slice(0,5)}}cpuF32ToBF16Packed(e){const t=Math.ceil(e.length/2),o=new Uint32Array(t);for(let s=0;s<e.length;s++){const a=new Uint32Array(new Float32Array([e[s]]).buffer)[0]>>>16,d=Math.floor(s/2);s%2===0?o[d]=a:o[d]|=a<<16}return o}cpuF32ToF16Packed(e){const t=Math.ceil(e.length/2),o=new Uint32Array(t);for(let s=0;s<e.length;s++){const r=this.f32ToF16(e[s]),a=Math.floor(s/2);s%2===0?o[a]=r:o[a]|=r<<16}return o}f32ToF16(e){const t=new ArrayBuffer(4);new Float32Array(t)[0]=e;const o=new Uint32Array(t)[0],s=o>>31&1;let r=o>>23&255,a=o&8388607;return r===0?s<<15:r===255?s<<15|31744|(a?512:0):(r=r-127+15,r>=31?s<<15|31744:r<=0?s<<15:s<<15|r<<10|a>>13)}f16ToF32(e){const t=e>>15&1,o=e>>10&31,s=e&1023;return o===0?s===0?t?-0:0:(t?-1:1)*Math.pow(2,-14)*(s/1024):o===31?s?NaN:t?-1/0:1/0:(t?-1:1)*Math.pow(2,o-15)*(1+s/1024)}bf16ToF32(e){const t=e<<16,o=new ArrayBuffer(4);return new Uint32Array(o)[0]=t,new Float32Array(o)[0]}async testGPTQMatvec(){const r=new Float32Array(64);for(let k=0;k<64;k++)r[k]=(k+1)*.1;const a=new Uint32Array(32),d=new Float32Array(256);for(let k=0;k<8;k++)for(let u=0;u<4;u++){let m=0;for(let g=0;g<8;g++){const i=k*8+g,c=(i+u)%15;d[i*4+u]=c-8,m|=c<<g*4}a[k*4+u]=m}const l=new Float32Array(4);for(let k=0;k<4;k++)l[k]=.5+k*.1;const w=this.cpuF32ToF16Packed(l),h=new Float32Array(4);for(let k=0;k<4;k++){let u=0;for(let m=0;m<64;m++){const i=Math.floor(m/64)*4+k,c=w[Math.floor(i/2)]>>i%2*16&65535,_=this.f16ToF32(c);u+=r[m]*d[m*4+k]*_}h[k]=u}const F=this.makeF32Buffer("gptq_input",r),n=this.makeU32Buffer("gptq_qw",a),B=this.makeU32Buffer("gptq_scales",w),f=this.makeOutputBuffer("gptq_out",4),p=this.makeUniform([64,4,64]);this.dispatch("gptq_matvec",[F,n,B,f,p],Math.ceil(4/32));const 
y=await this.readback(f,4);return this.compare(y,h,.01)}async testGPTQMatvecGroups(){const r=new Float32Array(128);for(let u=0;u<128;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(64),d=new Float32Array(512);for(let u=0;u<16;u++)for(let m=0;m<4;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c*3+m*7)%15;d[c*4+m]=_-8,g|=_<<i*4}a[u*4+m]=g}const l=128/64,w=new Float32Array(l*4);for(let u=0;u<l*4;u++)w[u]=.3+u*.15;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let u=0;u<4;u++){let m=0;for(let g=0;g<128;g++){const c=Math.floor(g/64)*4+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*4+u]*b}F[u]=m}const n=this.makeF32Buffer("gptq2_in",r),B=this.makeU32Buffer("gptq2_qw",a),f=this.makeU32Buffer("gptq2_sc",h),p=this.makeOutputBuffer("gptq2_out",4),y=this.makeUniform([128,4,64]);this.dispatch("gptq_matvec",[n,B,f,p,y],Math.ceil(4/32));const k=await this.readback(p,4);return this.compare(k,F,.01)}async testGPTQMatvecGS32(){const r=new Float32Array(128);for(let u=0;u<128;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(64),d=new Float32Array(512);for(let u=0;u<16;u++)for(let m=0;m<4;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c*3+m*7)%15;d[c*4+m]=_-8,g|=_<<i*4}a[u*4+m]=g}const l=128/32,w=new Float32Array(l*4);for(let u=0;u<l*4;u++)w[u]=.3+u*.15;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let u=0;u<4;u++){let m=0;for(let g=0;g<128;g++){const c=Math.floor(g/32)*4+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*4+u]*b}F[u]=m}const n=this.makeF32Buffer("gptq32_in",r),B=this.makeU32Buffer("gptq32_qw",a),f=this.makeU32Buffer("gptq32_sc",h),p=this.makeOutputBuffer("gptq32_out",4),y=this.makeUniform([128,4,32]);this.dispatch("gptq_matvec",[n,B,f,p,y],Math.ceil(4/32));const k=await this.readback(p,4);return this.compare(k,F,.01)}async testGPTQSplitK(){const a=new Float32Array(256);for(let i=0;i<256;i++)a[i]=Math.sin(i*.3)*.5;const d=new Uint32Array(1024),l=new Float32Array(256*32);for(let 
i=0;i<32;i++)for(let c=0;c<32;c++){let _=0;for(let b=0;b<8;b++){const N=i*8+b,x=(N*3+c*7)%15;l[N*32+c]=x-8,_|=x<<b*4}d[i*32+c]=_}const w=256/64,h=new Float32Array(w*32);for(let i=0;i<w;i++)for(let c=0;c<32;c++)h[i*32+c]=.3+(i*32+c)*.02;const F=this.cpuF32ToF16Packed(h),n=new Float32Array(32);for(let i=0;i<32;i++){let c=0;for(let _=0;_<256;_++){const N=Math.floor(_/64)*32+i,x=F[Math.floor(N/2)]>>N%2*16&65535,q=this.f16ToF32(x);c+=a[_]*l[_*32+i]*q}n[i]=c}const B=this.makeF32Buffer("sk_in",a),f=this.makeU32Buffer("sk_qw",d),p=this.makeU32Buffer("sk_sc",F),y=this.makeOutputBuffer("sk_partials",128),k=this.makeOutputBuffer("sk_out",32),u=this.makeUniform([256,32,64,4]),m=this.makeUniform([32,4]);this.dispatch("gptq_splitk",[B,f,p,y,u],Math.ceil(32/32),4),this.dispatch("reduce_splitk",[y,k,m],Math.ceil(32/256));const g=await this.readback(k,32);return this.compare(g,n,.01)}async testGPTQF16(){if(!this.gpu.hasF16||!this.pipelines.gptq_matvec_f16)return{pass:!0,maxErr:0,note:"shader-f16 not available, skipped"};const e=128,t=4,o=64,s=e/8,r=new Float32Array(e);for(let u=0;u<e;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(s*t),d=new Float32Array(e*t);for(let u=0;u<s;u++)for(let m=0;m<t;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c+m*3)%15;d[c*t+m]=_-8,g|=_<<i*4}a[u*t+m]=g}const l=e/o,w=new Float32Array(l*t);for(let u=0;u<l;u++)for(let m=0;m<t;m++)w[u*t+m]=.5+u*.1;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(t);for(let u=0;u<t;u++){let m=0;for(let g=0;g<e;g++){const c=Math.floor(g/o)*t+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*t+u]*b}F[u]=m}const n=this.makeF32Buffer("f16_in",r),B=this.makeU32Buffer("f16_qw",a),f=this.makeU32Buffer("f16_sc",h),p=this.makeOutputBuffer("f16_out",t),y=this.makeUniform([e,t,o]);this.dispatch("gptq_matvec_f16",[n,B,f,p,y],Math.ceil(t/32));const k=await this.readback(p,t);return this.compare(k,F,.5)}async testBF16Matvec(){const o=new Float32Array(8);for(let n=0;n<8;n++)o[n]=(n+1)*.2;const s=new 
Float32Array(32);for(let n=0;n<4;n++)for(let B=0;B<8;B++)s[n*8+B]=Math.cos(n*.5+B*.3);const r=this.cpuF32ToBF16Packed(s),a=new Float32Array(4);for(let n=0;n<4;n++){let B=0;for(let f=0;f<8;f++){const p=n*8+f,y=r[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);B+=o[f]*u}a[n]=B}const d=this.makeF32Buffer("bf16mv_in",o),l=this.makeU32Buffer("bf16mv_w",r),w=this.makeOutputBuffer("bf16mv_out",4),h=this.makeUniform([8,4]);this.dispatch("bf16_matvec",[d,l,w,h],Math.ceil(4/32));const F=await this.readback(w,4);return this.compare(F,a,.001)}async testRMSNorm(){const o=new Float32Array(16);for(let f=0;f<16;f++)o[f]=(f-8)*.3;const s=new Float32Array(16);for(let f=0;f<16;f++)s[f]=Math.sin(f*.4)*.1;const r=this.cpuF32ToBF16Packed(s);let a=0;for(let f=0;f<16;f++)a+=o[f]*o[f];const d=1/Math.sqrt(a/16+1e-6),l=new Float32Array(16);for(let f=0;f<16;f++){const p=f,y=r[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);l[f]=o[f]*d*(1+u)}const w=this.makeF32Buffer("rmsn_in",o),h=this.makeU32Buffer("rmsn_w",r),F=this.makeOutputBuffer("rmsn_out",16),n=this.makeUniformTyped([16],[1e-6]);this.dispatch("rmsnorm",[w,h,F,n],1);const B=await this.readback(F,16);return this.compare(B,l,1e-4)}async testRMSNorm1PlusW(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array(8).fill(0),r=this.cpuF32ToBF16Packed(s);let a=0;for(let f=0;f<8;f++)a+=o[f]*o[f];const d=1/Math.sqrt(a/8+1e-6),l=new Float32Array(8);for(let f=0;f<8;f++)l[f]=o[f]*d;const w=this.makeF32Buffer("rmsn1_in",o),h=this.makeU32Buffer("rmsn1_w",r),F=this.makeOutputBuffer("rmsn1_out",8),n=this.makeUniformTyped([8],[1e-6]);this.dispatch("rmsnorm",[w,h,F,n],1);const B=await this.readback(F,8);return this.compare(B,l,1e-4)}async testSiLUMul(){const t=new Float32Array([0,1,-1,2,-2,.5,-.5,3]),o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array(8);for(let h=0;h<8;h++){const F=t[h]/(1+Math.exp(-t[h]));s[h]=F*o[h]}const 
r=this.makeF32Buffer("silu_gate",t),a=this.makeF32Buffer("silu_up",o),d=this.makeOutputBuffer("silu_out",8),l=this.makeUniform([8]);this.dispatch("silu_mul",[r,a,d,l],Math.ceil(8/32));const w=await this.readback(d,8);return this.compare(w,s,1e-5)}async testAdd(){const t=new Float32Array([1,2,3,4,5,6,7,8]),o=new Float32Array([.1,.2,.3,.4,.5,.6,.7,.8]),s=new Float32Array(8);for(let w=0;w<8;w++)s[w]=t[w]+o[w];const r=this.makeF32Buffer("add_a",t),a=this.makeF32Buffer("add_b",o),d=this.makeUniform([8]);this.dispatch("add",[r,a,d],Math.ceil(8/32));const l=await this.readback(r,8);return this.compare(l,s,1e-6)}async testEmbedding(){const o=new Float32Array(32);for(let F=0;F<o.length;F++)o[F]=(F+1)*.01;const s=this.cpuF32ToBF16Packed(o),r=2,a=new Float32Array(8);for(let F=0;F<8;F++){const n=r*8+F,B=s[Math.floor(n/2)],f=n%2===0?B&65535:B>>16&65535;a[F]=this.bf16ToF32(f)}const d=this.makeU32Buffer("emb_w",s),l=this.makeOutputBuffer("emb_out",8),w=this.makeUniform([r,8]);this.dispatch("embedding",[d,l,w],Math.ceil(8/256));const h=await this.readback(l,8);return this.compare(h,a,.001)}async testArgmax(){const t=new Float32Array(16);for(let w=0;w<16;w++)t[w]=Math.sin(w*.7)*3;const o=t.indexOf(Math.max(...t)),s=this.makeF32Buffer("argmax_in",t),r=this.makeOutputBuffer("argmax_res",2),a=this.makeUniform([16]);this.dispatch("argmax",[s,r,a],1);const l=(await this.readbackU32(r,2))[0];return{pass:l===o,maxErr:Math.abs(l-o),errors:l!==o?[{idx:0,got:l,expected:o}]:[]}}async testArgmaxLarge(){const t=new Float32Array(1024);for(let d=0;d<1024;d++)t[d]=Math.sin(d*.01)-2;t[773]=99;const o=this.makeF32Buffer("argmax_lg_in",t),s=this.makeOutputBuffer("argmax_lg_res",2),r=this.makeUniform([1024]);this.dispatch("argmax",[o,s,r],1);const a=await this.readbackU32(s,2);return{pass:a[0]===773,maxErr:Math.abs(a[0]-773),errors:a[0]!==773?[{got:a[0],expected:773}]:[]}}async testSplit(){const s=new Float32Array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]),r=new Float32Array([1,2,3,4,9,10,11,12]),a=new 
Float32Array([5,6,7,8,13,14,15,16]),d=this.makeF32Buffer("split_src",s),l=this.makeOutputBuffer("split_a",8),w=this.makeOutputBuffer("split_b",8),h=this.makeUniform([8,2,4]);this.dispatch("split",[d,l,w,h],Math.ceil(8/32));const F=await this.readback(l,8),n=await this.readback(w,8),B=this.compare(F,r,1e-6),f=this.compare(n,a,1e-6);return{pass:B.pass&&f.pass,maxErr:Math.max(B.maxErr,f.maxErr),errors:[...B.errors,...f.errors]}}async testSigmoidMul(){const t=new Float32Array([1,2,3,4,5,6,7,8]),o=new Float32Array([0,1,-1,2,-2,.5,-.5,3]),s=new Float32Array(8);for(let h=0;h<8;h++)s[h]=t[h]/(1+Math.exp(-o[h]));const r=this.makeF32Buffer("sigmul_x",t),a=this.makeF32Buffer("sigmul_g",o),d=this.makeOutputBuffer("sigmul_out",8),l=this.makeUniform([8]);this.dispatch("sigmoid_mul",[r,a,d,l],Math.ceil(8/32));const w=await this.readback(d,8);return this.compare(w,s,1e-5)}async testHeadRMSNorm(){const s=new Float32Array([1,2,3,4,5,6,7,8]),r=new Float32Array(4);for(let n=0;n<4;n++)r[n]=.1*(n+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(8);for(let n=0;n<2;n++){let B=0;for(let p=0;p<4;p++)B+=s[n*4+p]**2;const f=1/Math.sqrt(B/4+1e-6);for(let p=0;p<4;p++){const y=a[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);d[n*4+p]=s[n*4+p]*f*(1+u)}}const l=this.makeF32Buffer("hrmsn_x",s),w=this.makeU32Buffer("hrmsn_w",a),h=this.makeUniformTyped([2,4],[1e-6]);this.dispatch("head_rmsnorm",[l,w,h],2);const F=await this.readback(l,8);return this.compare(F,d,.001)}async testHeadRMSNormNogated(){const s=new Float32Array([1,2,3,4,5,6,7,8]),r=new Float32Array(4);for(let n=0;n<4;n++)r[n]=.5+.1*n;const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(8);for(let n=0;n<2;n++){let B=0;for(let p=0;p<4;p++)B+=s[n*4+p]**2;const f=1/Math.sqrt(B/4+1e-6);for(let p=0;p<4;p++){const y=a[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);d[n*4+p]=s[n*4+p]*f*u}}const 
l=this.makeF32Buffer("hrmsng_x",s),w=this.makeU32Buffer("hrmsng_w",a),h=this.makeUniformTyped([2,4],[1e-6]);this.dispatch("head_rmsnorm_nogated",[l,w,h],2);const F=await this.readback(l,8);return this.compare(F,d,.001)}async testCausalConv1d(){const t=new Float32Array([1,2,3,4]),o=new Float32Array(12);for(let y=0;y<o.length;y++)o[y]=(y+1)*.1;const s=new Float32Array(16);for(let y=0;y<s.length;y++)s[y]=.25;const r=this.cpuF32ToBF16Packed(s),a=new Float32Array(4),d=new Float32Array(12);for(let y=0;y<4;y++){const k=o[y],u=o[4+y],m=o[8+y],g=t[y],i=[];for(let _=0;_<4;_++){const b=y*4+_,N=r[Math.floor(b/2)],x=b%2===0?N&65535:N>>16&65535;i.push(this.bf16ToF32(x))}const c=i[0]*k+i[1]*u+i[2]*m+i[3]*g;a[y]=c/(1+Math.exp(-c)),d[y]=u,d[4+y]=m,d[8+y]=g}const l=this.makeF32Buffer("conv_x",t),w=this.makeF32Buffer("conv_hist",o),h=this.makeU32Buffer("conv_w",r),F=this.makeUniform([4]);this.dispatch("causal_conv1d",[l,w,h,F],Math.ceil(4/256));const n=await this.readback(l,4),B=await this.readback(w,12),f=this.compare(n,a,1e-4),p=this.compare(B,d,1e-4);return{pass:f.pass&&p.pass,maxErr:Math.max(f.maxErr,p.maxErr),errors:[...f.errors.map(y=>({...y,note:"conv output"})),...p.errors.map(y=>({...y,note:"history"}))]}}async testKVCacheStore(){const r=new Float32Array(8),a=new Float32Array(8);for(let c=0;c<8;c++)r[c]=c+1,a[c]=(c+1)*10;const l=8*8,w=new Float32Array(l),h=new Float32Array(l),F=new Float32Array(l),n=new Float32Array(l);for(let c=0;c<8;c++)F[24+c]=r[c],n[24+c]=a[c];const B=this.makeF32Buffer("kvs_k",r),f=this.makeF32Buffer("kvs_v",a),p=this.makeF32Buffer("kvs_kc",w),y=this.makeF32Buffer("kvs_vc",h),k=this.makeUniform([3,2,4]);this.dispatch("kv_cache_store",[B,f,p,y,k],Math.ceil(8/256));const u=await this.readback(p,l),m=await this.readback(y,l),g=this.compare(u,F,1e-6),i=this.compare(m,n,1e-6);return{pass:g.pass&&i.pass,maxErr:Math.max(g.maxErr,i.maxErr),errors:[...g.errors,...i.errors]}}async testGQAAttention(){const a=new Float32Array([1,0,0,0]),d=new 
Float32Array(4);d.set([1,0,0,0]);const l=new Float32Array(4);l.set([.5,.6,.7,.8]);const w=new Float32Array([.5,.6,.7,.8]),h=this.makeF32Buffer("gqa_q",a),F=this.makeF32Buffer("gqa_kc",d),n=this.makeF32Buffer("gqa_vc",l),B=this.makeOutputBuffer("gqa_out",4),f=new ArrayBuffer(32),p=new DataView(f);p.setUint32(0,1,!0),p.setUint32(4,4,!0),p.setUint32(8,1,!0),p.setUint32(12,1,!0),p.setUint32(16,1/1,!0),p.setUint32(20,1,!0);const y=this.gpu.createBufferFromData("gqa_params",new Uint8Array(f),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("gqa_attention_head",[h,F,n,B,y],1);const k=await this.readback(B,4);return this.compare(k,w,1e-4)}async testDeltaNetRecurrent(){const s=[.5,.5],r=[1,0],a=[.8,.2],d=new Float32Array([...s,...r,...a]),l=new Float32Array(4),w=new Float32Array([0]),h=new Float32Array([0]),F=new Float32Array([0]),n=new Float32Array([0]),B=this.cpuF32ToBF16Packed(F),f=this.cpuF32ToBF16Packed(n),p=1/(1+Math.exp(-0)),y=Math.sqrt(s[0]**2+s[1]**2),k=Math.sqrt(r[0]**2+r[1]**2),u=s.map(U=>U/y),m=r.map(U=>U/k),g=[0,0,0,0],i=[0,0],c=a.map((U,S)=>(U-i[S])*p);for(let U=0;U<2;U++)for(let S=0;S<2;S++)g[U*2+S]+=m[U]*c[S];const _=1/Math.sqrt(2),b=new Float32Array(2);for(let U=0;U<2;U++){let S=0;for(let O=0;O<2;O++)S+=g[O*2+U]*u[O]*_;b[U]=S}const N=this.makeF32Buffer("dn_qkv",d),x=this.makeF32Buffer("dn_state",l),q=this.makeOutputBuffer("dn_out",2),K=this.makeF32Buffer("dn_a",w),v=this.makeF32Buffer("dn_b",h),T=this.makeU32Buffer("dn_alog",B),V=this.makeU32Buffer("dn_dtb",f),G=this.makeUniform([1,2,2]);this.dispatch("deltanet_recurrent",[N,x,q,K,v,T,V,G],1);const C=await this.readback(q,2);return this.compare(C,b,.001)}async testAddRMSNorm(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array([.1,-.2,.3,-.4,.5,-.6,.7,-.8]),r=new Float32Array(8);for(let _=0;_<8;_++)r[_]=.1*(_+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(o);for(let _=0;_<8;_++)d[_]+=s[_];let l=0;for(let _=0;_<8;_++)l+=d[_]*d[_];const w=1/Math.sqrt(l/8+1e-6),h=new 
Float32Array(8);for(let _=0;_<8;_++){const b=a[Math.floor(_/2)]>>_%2*16&65535,N=this.bf16ToF32(b);h[_]=d[_]*w*(1+N)}const F=this.makeF32Buffer("addnorm_h",o),n=this.makeF32Buffer("addnorm_a",s),B=this.makeU32Buffer("addnorm_w",a),f=this.makeOutputBuffer("addnorm_out",8);this.makeUniform([8]);const p=new ArrayBuffer(16),y=new DataView(p);y.setUint32(0,8,!0),y.setFloat32(4,1e-6,!0);const k=this.gpu.createBufferFromData("addnorm_params",new Uint8Array(p),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("add_rmsnorm",[F,n,B,f,k],1);const u=await this.readback(f,8),m=await this.readback(F,8),g=d,i=this.compare(u,h,.001),c=this.compare(m,g,1e-6);return{pass:i.pass&&c.pass,maxErr:Math.max(i.maxErr,c.maxErr),errors:[...i.errors,...c.errors]}}async testThreeWayAddRMSNorm(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array([.1,-.2,.3,-.4,.5,-.6,.7,-.8]),r=new Float32Array([.5,.5,-.5,-.5,1,1,-1,-1]),a=new Float32Array(8);for(let N=0;N<8;N++)a[N]=.1*(N+1);const d=this.cpuF32ToBF16Packed(a),l=new Float32Array(8);for(let N=0;N<8;N++)l[N]=o[N]+s[N]+r[N];let w=0;for(let N=0;N<8;N++)w+=l[N]*l[N];const h=1/Math.sqrt(w/8+1e-6),F=new Float32Array(8);for(let N=0;N<8;N++){const x=d[Math.floor(N/2)]>>N%2*16&65535,q=this.bf16ToF32(x);F[N]=l[N]*h*(1+q)}const n=this.makeF32Buffer("twa_a",o),B=this.makeF32Buffer("twa_b",s),f=this.makeF32Buffer("twa_c",r),p=this.makeU32Buffer("twa_w",d),y=this.makeOutputBuffer("twa_hout",8),k=this.makeOutputBuffer("twa_normed",8),u=new ArrayBuffer(16),m=new DataView(u);m.setUint32(0,8,!0),m.setFloat32(4,1e-6,!0);const g=this.gpu.createBufferFromData("twa_params",new Uint8Array(u),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("three_way_add_rmsnorm",[n,B,f,p,y,k,g],1);const i=await this.readback(y,8),c=await this.readback(k,8),_=this.compare(i,l,1e-6),b=this.compare(c,F,.001);return{pass:_.pass&&b.pass,maxErr:Math.max(_.maxErr,b.maxErr),errors:[..._.errors,...b.errors]}}async testGQARoPERotateHalf(){const 
h=1/Math.sqrt(256),F=[];for(let D=0;D<32;D++)F.push(1/Math.pow(100,2*D/64));function n(D,M){const H=new Float32Array(D.length);for(let P=0;P<D.length;P++)H[P]=D[P];for(let P=0;P<32;P++){const Q=M*F[P],E=Math.cos(Q),J=Math.sin(Q);H[P]=D[P]*E-D[P+32]*J,H[P+32]=D[P+32]*E+D[P]*J}return H}const B=new Float32Array(256);B[0]=1,B[1]=.5,B[2]=.3,B[32]=.7,B[33]=.2;const f=new Float32Array(256);f[0]=1,f[32]=.5;const p=new Float32Array(256);p[1]=1,p[33]=.3;const y=new Float32Array(256);y[2]=1;const k=new Float32Array(256);k[0]=1;const u=new Float32Array(256);u[0]=0,u[1]=1;const m=new Float32Array(256);m[0]=0,m[2]=1;const g=n(B,2),i=n(f,0),c=n(p,1),_=n(y,2);let b=0,N=0,x=0;for(let D=0;D<256;D++)b+=g[D]*i[D],N+=g[D]*c[D],x+=g[D]*_[D];b*=h,N*=h,x*=h;const q=Math.max(b,N,x),K=Math.exp(b-q),v=Math.exp(N-q),T=Math.exp(x-q),V=K+v+T,G=K/V,C=v/V,U=T/V,S=new Float32Array(256);for(let D=0;D<256;D++)S[D]=G*k[D]+C*u[D]+U*m[D];const O=1*256,Y=new Float32Array(3*O),z=new Float32Array(3*O);Y.set(i,0),Y.set(c,O),Y.set(_,2*O),z.set(k,0),z.set(u,O),z.set(m,2*O);const X=this.makeF32Buffer("rope2_q",g),tt=this.makeF32Buffer("rope2_kc",Y),A=this.makeF32Buffer("rope2_vc",z),R=this.makeOutputBuffer("rope2_out",1*256),L=new ArrayBuffer(32),W=new DataView(L);W.setUint32(0,3,!0),W.setUint32(4,256,!0),W.setUint32(8,1,!0),W.setUint32(12,1,!0),W.setUint32(16,1/1,!0),W.setUint32(20,1,!0);const Z=this.gpu.createBufferFromData("rope2_params",new Uint8Array(L),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("gqa_attention_head",[X,tt,A,R,Z],1);const j=await this.readback(R,1*256);return this.compare(j,S,.001)}async testFusedSigmoidGPTQ(){const r=new Float32Array(64),a=new Float32Array(64);for(let m=0;m<64;m++)r[m]=(m+1)*.1,a[m]=(m-8)*.3;const d=new Uint32Array(32),l=new Float32Array(256);for(let m=0;m<8;m++)for(let g=0;g<4;g++){let i=0;for(let c=0;c<8;c++){const _=m*8+c,b=(_+g)%15;l[_*4+g]=b-8,i|=b<<c*4}d[m*4+g]=i}const w=new Float32Array(4);for(let m=0;m<4;m++)w[m]=.5+m*.1;const 
h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let m=0;m<4;m++){let g=0;for(let i=0;i<64;i++){const c=r[i]/(1+Math.exp(-a[i])),b=Math.floor(i/64)*4+m,N=h[Math.floor(b/2)]>>b%2*16&65535,x=this.f16ToF32(N);g+=c*l[i*4+m]*x}F[m]=g}const n=this.makeF32Buffer("fsg_x",r),B=this.makeF32Buffer("fsg_g",a),f=this.makeU32Buffer("fsg_qw",d),p=this.makeU32Buffer("fsg_sc",h),y=this.makeOutputBuffer("fsg_out",4),k=this.makeUniform([64,4,64]);this.dispatch("fused_sigmoid_gptq",[n,B,f,p,y,k],Math.ceil(4/32));const u=await this.readback(y,4);return this.compare(u,F,.01)}async testFusedSiluGPTQ(){const r=new Float32Array(64),a=new Float32Array(64);for(let m=0;m<64;m++)r[m]=(m-8)*.2,a[m]=(m+1)*.15;const d=new Uint32Array(32),l=new Float32Array(256);for(let m=0;m<8;m++)for(let g=0;g<4;g++){let i=0;for(let c=0;c<8;c++){const _=m*8+c,b=(_*2+g)%15;l[_*4+g]=b-8,i|=b<<c*4}d[m*4+g]=i}const w=new Float32Array(4);for(let m=0;m<4;m++)w[m]=.4+m*.2;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let m=0;m<4;m++){let g=0;for(let i=0;i<64;i++){const _=r[i]/(1+Math.exp(-r[i]))*a[i],N=Math.floor(i/64)*4+m,x=h[Math.floor(N/2)]>>N%2*16&65535,q=this.f16ToF32(x);g+=_*l[i*4+m]*q}F[m]=g}const n=this.makeF32Buffer("fslg_a",r),B=this.makeF32Buffer("fslg_b",a),f=this.makeU32Buffer("fslg_qw",d),p=this.makeU32Buffer("fslg_sc",h),y=this.makeOutputBuffer("fslg_out",4),k=this.makeUniform([64,4,64]);this.dispatch("fused_silu_gptq",[n,B,f,p,y,k],Math.ceil(4/32));const u=await this.readback(y,4);return this.compare(u,F,.01)}async testFusedAddNormGateUpSiLU(){const a=new Float32Array(64),d=new Float32Array(64),l=new Float32Array(64),w=new Float32Array(64);for(let A=0;A<64;A++)a[A]=(A+1)*.1,d[A]=(A-5)*.05,l[A]=(A%7-3)*.03,w[A]=(A%3-1)*.1;const h=new Uint32Array(32),F=new Uint32Array(32),n=new Float32Array(256),B=new Float32Array(256);for(let A=0;A<64;A++)for(let R=0;R<4;R++)n[A*4+R]=Math.sin(A*.7+R*1.3)*.5,B[A*4+R]=Math.cos(A*.3+R*.9)*.5;const f=new Uint32Array(4/2),p=new Uint32Array(4/2);for(let 
A=0;A<4;A++){let R=0,L=0;for(let P=0;P<64;P++)R=Math.max(R,Math.abs(n[P*4+A])),L=Math.max(L,Math.abs(B[P*4+A]));const W=R/7,Z=L/7,j=this.f32ToF16(W),D=this.f32ToF16(Z),M=this.f16ToF32(j),H=this.f16ToF32(D);A%2===0?(f[A/2]=j,p[A/2]=D):(f[(A-1)/2]|=j<<16,p[(A-1)/2]|=D<<16);for(let P=0;P<64;P++){const Q=Math.round(n[P*4+A]/M)+8,E=Math.round(B[P*4+A]/H)+8,J=Math.max(0,Math.min(15,Q)),rt=Math.max(0,Math.min(15,E)),st=Math.floor(P/8),at=P%8*4;h[st*4+A]|=J<<at,F[st*4+A]|=rt<<at}}const y=new Uint32Array(64/2);for(let A=0;A<64;A+=2){const R=this.f32ToBF16(w[A]),L=this.f32ToBF16(w[A+1]);y[A/2]=R|L<<16}const k=new Float32Array(64);let u=0;for(let A=0;A<64;A++)k[A]=a[A]+d[A]+l[A],u+=k[A]*k[A];const m=1/Math.sqrt(u/64+1e-6),g=new Float32Array(64);for(let A=0;A<64;A++){const R=this.unpackBF16(y[Math.floor(A/2)],A%2);g[A]=k[A]*m*(1+R)}const i=new Float32Array(4);for(let A=0;A<4;A++){let R=0,L=0;const W=this.f16ToF32(A%2===0?f[Math.floor(A/2)]&65535:f[Math.floor(A/2)]>>>16),Z=this.f16ToF32(A%2===0?p[Math.floor(A/2)]&65535:p[Math.floor(A/2)]>>>16);for(let j=0;j<64;j++){const D=Math.floor(j/8),M=j%8*4,H=h[D*4+A]>>>M&15,P=F[D*4+A]>>>M&15;R+=g[j]*W*(H-8),L+=g[j]*Z*(P-8)}i[A]=R/(1+Math.exp(-R))*L}const c=k,_=16+640*16,b=new ArrayBuffer(_),N=new DataView(b);N.setUint32(0,64,!0),N.setUint32(4,4,!0),N.setUint32(8,64,!0),N.setFloat32(12,1e-6,!0);for(let A=0;A<64/2;A++){const R=Math.floor(A/4),L=A%4;N.setUint32(16+R*16+L*4,y[A],!0)}const x=this.gpu.createBufferFromData("fang_params",new Uint8Array(b),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),q=new Uint32Array(64);q.set(h,0),q.set(F,32);const K=8/(64/8),v=new Uint32Array(K*4/2*2);v.set(f,0),v.set(p,K*4/2);const 
T=this.makeF32Buffer("fang_hin",a),V=this.makeF32Buffer("fang_mlpres",d),G=this.makeF32Buffer("fang_attnres",l),C=this.makeU32Buffer("fang_mqw",q),U=this.makeU32Buffer("fang_msc",v),S=this.makeOutputBuffer("fang_hout",64),O=this.makeOutputBuffer("fang_out",4);this.dispatch("fused_addnorm_gate_up_silu",[T,V,G,C,U,S,O,x],Math.max(1,Math.ceil(4/32)));const Y=await this.readback(O,4),z=await this.readback(S,64),X=this.compare(Y,i,.02),tt=this.compare(z,c,1e-6);return{pass:X.pass&&tt.pass,maxErr:Math.max(X.maxErr,tt.maxErr),errors:[...X.errors||[],...tt.errors||[]]}}async testFusedSplitQKNormKVStore(){const l=new Float32Array(32);for(let M=0;M<l.length;M++)l[M]=Math.sin(M*.5+1)*.3;const w=new Float32Array(8),h=new Float32Array(8);for(let M=0;M<8;M++)w[M]=Math.cos(M*.7+2)*.4,h[M]=Math.sin(M*1.1+3)*.2;const F=new Float32Array(16),n=new Float32Array(8);for(let M=0;M<16;M++)F[M]=(M%3-1)*.05;for(let M=0;M<8;M++)n[M]=(M%4-2)*.03;const B=new Uint32Array(16/2);for(let M=0;M<16;M+=2)B[M/2]=this.f32ToBF16(F[M])|this.f32ToBF16(F[M+1])<<16;const f=new Uint32Array(8/2);for(let M=0;M<8;M+=2)f[M/2]=this.f32ToBF16(n[M])|this.f32ToBF16(n[M+1])<<16;const p=new Float32Array(16),y=new Float32Array(16);for(let M=0;M<2;M++){const H=new Float32Array(8);for(let E=0;E<8;E++)H[E]=l[M*8*2+E],y[M*8+E]=l[M*8*2+8+E];let P=0;for(let E=0;E<8;E++)P+=H[E]*H[E];const Q=1/Math.sqrt(P/8+1e-6);for(let E=0;E<8;E++){const J=this.unpackBF16(B[Math.floor((M*8+E)/2)],(M*8+E)%2);p[M*8+E]=H[E]*Q*(1+J)}}const k=new Float32Array(8),u=new Float32Array(32),m=new Float32Array(32);for(let M=0;M<1;M++){let H=0;for(let Q=0;Q<8;Q++)H+=w[M*8+Q]*w[M*8+Q];const P=1/Math.sqrt(H/8+1e-6);for(let Q=0;Q<8;Q++){const E=this.unpackBF16(f[Math.floor((M*8+Q)/2)],(M*8+Q)%2),J=w[M*8+Q]*P*(1+E);k[M*8+Q]=J,u[24+M*8+Q]=J,m[24+M*8+Q]=h[M*8+Q]}}const g=32+320*16,i=new ArrayBuffer(g),c=new DataView(i);c.setUint32(0,2,!0),c.setUint32(4,1,!0),c.setUint32(8,8,!0),c.setFloat32(12,1e-6,!0),c.setUint32(16,3,!0);for(let M=0;M<16/2;M++){const 
H=Math.floor(M/4),P=M%4;c.setUint32(32+H*16+P*4,B[M],!0)}const _=16/2;for(let M=0;M<8/2;M++){const H=_+M,P=Math.floor(H/4),Q=H%4;c.setUint32(32+P*16+Q*4,f[M],!0)}const b=this.gpu.createBufferFromData("fsqk_params",new Uint8Array(i),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),N=this.makeF32Buffer("fsqk_qpf",l),x=this.makeF32Buffer("fsqk_kp",w),q=this.makeF32Buffer("fsqk_vp",h),K=this.makeOutputBuffer("fsqk_qp",16),v=this.makeOutputBuffer("fsqk_qg",16),T=this.makeOutputBuffer("fsqk_kc",32),V=this.makeOutputBuffer("fsqk_vc",32);this.dispatch("fused_split_qknorm_kvstore",[N,x,q,K,v,T,V,b],3);const G=await this.readback(K,16),C=await this.readback(v,16),U=await this.readback(x,8),S=await this.readback(T,32),O=await this.readback(V,32),Y=this.compare(G,p,1e-5),z=this.compare(C,y,1e-6),X=this.compare(U,k,1e-5),tt=S.slice(24,32),A=O.slice(24,32),R=u.slice(24,32),L=m.slice(24,32),W=this.compare(tt,R,1e-5),Z=this.compare(A,L,1e-6),j=Math.max(Y.maxErr,z.maxErr,X.maxErr,W.maxErr,Z.maxErr);return{pass:Y.pass&&z.pass&&X.pass&&W.pass&&Z.pass,maxErr:j,errors:[...Y.errors||[],...z.errors||[],...X.errors||[],...W.errors||[],...Z.errors||[]]}}async benchmarkGPTQ(e){const t=[{K:2048,N:2048,gs:128,label:"2048→2048 (Z/out_proj)"},{K:2048,N:6144,gs:128,label:"2048→6144 (QKV/gate+up)"},{K:6144,N:2048,gs:128,label:"6144→2048 (down_proj)"}],o=[1,2,4,8],s=50,r=3,a=10,d=[];for(const l of t){const{K:w,N:h,gs:F,label:n}=l,B=w/8,f=B/(F/8),p=new Float32Array(w);for(let b=0;b<w;b++)p[b]=Math.random()*2-1;const y=new Uint32Array(B*h);for(let b=0;b<y.length;b++)y[b]=Math.random()*4294967295>>>0;const k=new Float32Array(f*h);for(let b=0;b<k.length;b++)k[b]=Math.random()*.5;const u=this.cpuF32ToF16Packed(k),m=this.makeF32Buffer("bm_in",p),g=this.makeU32Buffer("bm_qw",y),i=this.makeU32Buffer("bm_sc",u),c=[];for(let b=0;b<s;b++)c.push(this.makeOutputBuffer(`bm_out_${b}`,h));const _=B*h*4+f*h*2;for(const b of o){if(b>1&&f%b!==0)continue;let N;if(b===1){const v=this.makeUniform([w,h,F]);for(let 
G=0;G<r;G++){this.gpu.beginBatch();for(let C=0;C<s;C++)this.dispatch("gptq_matvec",[m,g,i,c[C],v],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const T=[];for(let G=0;G<a;G++){await this.gpu.device.queue.onSubmittedWorkDone();const C=performance.now();this.gpu.beginBatch();for(let U=0;U<s;U++)this.dispatch("gptq_matvec",[m,g,i,c[U],v],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),T.push(performance.now()-C)}T.sort((G,C)=>G-C),N=T.slice(1,-1).reduce((G,C)=>G+C,0)/(T.length-2)/s}else{const v=[];for(let U=0;U<s;U++)v.push(this.makeOutputBuffer(`bm_part_${U}`,h*b));const T=this.makeUniform([w,h,F,b]),V=this.makeUniform([h,b]);for(let U=0;U<r;U++){this.gpu.beginBatch();for(let S=0;S<s;S++)this.dispatch("gptq_splitk",[m,g,i,v[S],T],Math.ceil(h/32),b),this.dispatch("reduce_splitk",[v[S],c[S],V],Math.ceil(h/256));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const G=[];for(let U=0;U<a;U++){await this.gpu.device.queue.onSubmittedWorkDone();const S=performance.now();this.gpu.beginBatch();for(let O=0;O<s;O++)this.dispatch("gptq_splitk",[m,g,i,v[O],T],Math.ceil(h/32),b),this.dispatch("reduce_splitk",[v[O],c[O],V],Math.ceil(h/256));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),G.push(performance.now()-S)}G.sort((U,S)=>U-S),N=G.slice(1,-1).reduce((U,S)=>U+S,0)/(G.length-2)/s}const x=_/1e9/(N/1e3),q=Math.ceil(h/32)*b,K={label:n,ns:b,wgs:q,avgMs:N.toFixed(4),bwGBs:x.toFixed(1)};d.push(K),e?.(K)}if(this.gpu.hasF16&&this.pipelines.gptq_matvec_f16){const b=this.makeUniform([w,h,F]);for(let T=0;T<r;T++){this.gpu.beginBatch();for(let V=0;V<s;V++)this.dispatch("gptq_matvec_f16",[m,g,i,c[V],b],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const N=[];for(let T=0;T<a;T++){await this.gpu.device.queue.onSubmittedWorkDone();const V=performance.now();this.gpu.beginBatch();for(let 
G=0;G<s;G++)this.dispatch("gptq_matvec_f16",[m,g,i,c[G],b],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),N.push(performance.now()-V)}N.sort((T,V)=>T-V);const q=N.slice(1,-1).reduce((T,V)=>T+V,0)/(N.length-2)/s,K=_/1e9/(q/1e3),v={label:n,ns:"f16",wgs:Math.ceil(h/32),avgMs:q.toFixed(4),bwGBs:K.toFixed(1)};d.push(v),e?.(v)}}return d}_makeNormParams(e,t=1e-6){const o=new ArrayBuffer(16),s=new DataView(o);return s.setUint32(0,e,!0),s.setFloat32(4,t,!0),this.gpu.createBufferFromData("norm_params",new Uint8Array(o),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}_cpuRMSNorm1PlusW(e,t,o,s=1e-6){let r=0;for(let l=0;l<o;l++)r+=e[l]*e[l];const a=1/Math.sqrt(r/o+s),d=new Float32Array(o);for(let l=0;l<o;l++){const w=t[Math.floor(l/2)]>>l%2*16&65535,h=this.bf16ToF32(w);d[l]=e[l]*a*(1+h)}return d}async testAddRMSNormB2(){const o=new Float32Array(32),s=new Float32Array(32);for(let u=0;u<32;u++)o[u]=Math.sin(u*.7)*2,s[u]=Math.cos(u*.3)*.5;const r=new Float32Array(16);for(let u=0;u<16;u++)r[u]=.1*(u+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(32),l=new Float32Array(32);for(let u=0;u<2;u++){const m=new Float32Array(16);for(let i=0;i<16;i++)m[i]=o[u*16+i]+s[u*16+i];for(let i=0;i<16;i++)l[u*16+i]=m[i];const g=this._cpuRMSNorm1PlusW(m,a,16,1e-6);d.set(g,u*16)}const w=this.makeF32Buffer("b2an_h",o),h=this.makeF32Buffer("b2an_a",s),F=this.makeU32Buffer("b2an_w",a),n=this.makeOutputBuffer("b2an_out",32),B=this._makeNormParams(16,1e-6);this.dispatch("add_rmsnorm_b2",[w,h,F,n,B],2);const f=await this.readback(n,32),p=await this.readback(w,32),y=this.compare(f,d,.001),k=this.compare(p,l,1e-6);return{pass:y.pass&&k.pass,maxErr:Math.max(y.maxErr,k.maxErr),errors:[...y.errors,...k.errors]}}async testAddRMSNormROB2(){const o=new Float32Array(32),s=new Float32Array(32);for(let f=0;f<32;f++)o[f]=Math.sin(f*.5)*3,s[f]=Math.cos(f*.8)*.3;const r=new Float32Array(16);for(let f=0;f<16;f++)r[f]=.05*(f+1);const a=this.cpuF32ToBF16Packed(r),d=new 
Float32Array(32);for(let f=0;f<2;f++){const p=new Float32Array(16);for(let k=0;k<16;k++)p[k]=o[f*16+k]+s[f*16+k];const y=this._cpuRMSNorm1PlusW(p,a,16,1e-6);d.set(y,f*16)}const l=this.makeF32Buffer("b2anro_h",o),w=this.makeF32Buffer("b2anro_a",s),h=this.makeU32Buffer("b2anro_w",a),F=this.makeOutputBuffer("b2anro_out",32),n=this._makeNormParams(16,1e-6);this.dispatch("add_rmsnorm_ro_b2",[l,w,h,F,n],2);const B=await this.readback(F,32);return this.compare(B,d,.001)}async testThreeWayAddRMSNormB2(){const o=new Float32Array(32),s=new Float32Array(32),r=new Float32Array(32);for(let i=0;i<32;i++)o[i]=Math.sin(i*.4)*2,s[i]=Math.cos(i*.6)*.5,r[i]=Math.sin(i*1.1)*.3;const a=new Float32Array(16);for(let i=0;i<16;i++)a[i]=.1*(i+1);const d=this.cpuF32ToBF16Packed(a),l=new Float32Array(32),w=new Float32Array(32);for(let i=0;i<2;i++){const c=new Float32Array(16);for(let b=0;b<16;b++)c[b]=o[i*16+b]+s[i*16+b]+r[i*16+b];l.set(c,i*16);const _=this._cpuRMSNorm1PlusW(c,d,16,1e-6);w.set(_,i*16)}const h=this.makeF32Buffer("b2twa_a",o),F=this.makeF32Buffer("b2twa_b",s),n=this.makeF32Buffer("b2twa_c",r),B=this.makeU32Buffer("b2twa_w",d),f=this.makeOutputBuffer("b2twa_hout",32),p=this.makeOutputBuffer("b2twa_normed",32),y=this._makeNormParams(16,1e-6);this.dispatch("three_way_add_rmsnorm_b2",[h,F,n,B,f,p,y],2);const k=await this.readback(p,32),u=await this.readback(f,32),m=this.compare(k,w,.001),g=this.compare(u,l,1e-6);return{pass:m.pass&&g.pass,maxErr:Math.max(m.maxErr,g.maxErr),errors:[...m.errors,...g.errors]}}_makeGPTQData(e,t,o){const s=e/8,r=new Uint32Array(s*t),a=new Float32Array(e*t);for(let h=0;h<s;h++)for(let F=0;F<t;F++){let n=0;for(let B=0;B<8;B++){const f=h*8+B,p=(f*3+F*7)%15;a[f*t+F]=p-8,n|=p<<B*4}r[h*t+F]=n}const d=e/o,l=new Float32Array(d*t);for(let h=0;h<d*t;h++)l[h]=.3+h*.15;const w=this.cpuF32ToF16Packed(l);return{qweight:r,rawWeights:a,scalesPacked:w}}_cpuGPTQMatvec(e,t,o,s,r,a,d=0){const l=new Float32Array(r);for(let w=0;w<r;w++){let h=0;for(let F=0;F<s;F++){const 
B=Math.floor(F/a)*r+w,f=o[Math.floor(B/2)]>>B%2*16&65535,p=this.f16ToF32(f);h+=e[d+F]*t[F*r+w]*p}l[w]=h}return l}async testFusedSiluGPTQB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,{qweight:s,rawWeights:r,scalesPacked:a}=this._makeGPTQData(e,t,o),d=new Float32Array(2*e),l=new Float32Array(2*e);for(let k=0;k<2*e;k++)d[k]=Math.sin(k*.3)*2,l[k]=Math.cos(k*.5)*1.5;const w=new Float32Array(2*t);for(let k=0;k<2;k++){const u=new Float32Array(e);for(let g=0;g<e;g++){const i=d[k*e+g];u[g]=i/(1+Math.exp(-i))*l[k*e+g]}const m=this._cpuGPTQMatvec(u,r,a,e,t,o);w.set(m,k*t)}const h=this.makeF32Buffer("b2sl_a",d),F=this.makeF32Buffer("b2sl_b",l),n=this.makeU32Buffer("b2sl_qw",s),B=this.makeU32Buffer("b2sl_sc",a),f=this.makeOutputBuffer("b2sl_out",2*t),p=this.makeUniform([e,t,o]);this.dispatch("fused_silu_gptq_b2_f16",[h,F,n,B,f,p],Math.ceil(t/32));const y=await this.readback(f,2*t);return this.compare(y,w,.05)}async testFusedSigmoidGPTQB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,{qweight:s,rawWeights:r,scalesPacked:a}=this._makeGPTQData(e,t,o),d=new Float32Array(2*e),l=new Float32Array(2*e);for(let k=0;k<2*e;k++)d[k]=Math.sin(k*.4)*1.5,l[k]=Math.cos(k*.7)*2;const w=new Float32Array(2*t);for(let k=0;k<2;k++){const u=new Float32Array(e);for(let g=0;g<e;g++){const i=1/(1+Math.exp(-l[k*e+g]));u[g]=i*d[k*e+g]}const m=this._cpuGPTQMatvec(u,r,a,e,t,o);w.set(m,k*t)}const h=this.makeF32Buffer("b2sg_x",d),F=this.makeF32Buffer("b2sg_g",l),n=this.makeU32Buffer("b2sg_qw",s),B=this.makeU32Buffer("b2sg_sc",a),f=this.makeOutputBuffer("b2sg_out",2*t),p=this.makeUniform([e,t,o]);this.dispatch("fused_sigmoid_gptq_b2_f16",[h,F,n,B,f,p],Math.ceil(t/32));const y=await this.readback(f,2*t);return this.compare(y,w,.05)}async testFusedGateUpSiLUB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,s=e/8,r=new Uint32Array(s*t),a=new Float32Array(e*t);for(let 
N=0;N<s;N++)for(let x=0;x<t;x++){let q=0;for(let K=0;K<8;K++){const v=N*8+K,T=(v*3+x*7)%15;a[v*t+x]=T-8,q|=T<<K*4}r[N*t+x]=q}const d=e/o,l=new Float32Array(d*t);for(let N=0;N<d*t;N++)l[N]=.01+N%8*.005;const w=this.cpuF32ToF16Packed(l),h=new Uint32Array(s*t),F=new Float32Array(e*t);for(let N=0;N<s;N++)for(let x=0;x<t;x++){let q=0;for(let K=0;K<8;K++){const v=N*8+K,T=(v*5+x*3)%15;F[v*t+x]=T-8,q|=T<<K*4}h[N*t+x]=q}const n=e/o,B=new Float32Array(n*t);for(let N=0;N<n*t;N++)B[N]=.01+N%8*.005;const f=this.cpuF32ToF16Packed(B),p=new Float32Array(2*e);for(let N=0;N<2*e;N++)p[N]=Math.sin(N*.2)*.5;const y=new Float32Array(2*t);for(let N=0;N<2;N++){const x=this._cpuGPTQMatvec(p,a,w,e,t,o,N*e),q=this._cpuGPTQMatvec(p,F,f,e,t,o,N*e);for(let K=0;K<t;K++){const v=x[K];y[N*t+K]=v/(1+Math.exp(-v))*q[K]}}const k=this.makeF32Buffer("b2gus_in",p),u=this.makeU32Buffer("b2gus_gqw",r),m=this.makeU32Buffer("b2gus_gsc",w),g=this.makeU32Buffer("b2gus_uqw",h),i=this.makeU32Buffer("b2gus_usc",f),c=this.makeOutputBuffer("b2gus_out",2*t),_=this.makeUniform([e,t,o]);this.dispatch("fused_gate_up_silu_b2_f16",[k,u,m,g,i,c,_],Math.ceil(t/32));const b=await this.readback(c,2*t);return this.compare(b,y,.05)}_makeGPTQMatmulB2Data(e,t,o){const s=e/8,r=new Float32Array(2*e);for(let n=0;n<e;n++)r[n]=Math.sin(n*.5)*.5;for(let n=0;n<e;n++)r[e+n]=Math.cos(n*.3)*.5;const a=new Uint32Array(s*t),d=new Float32Array(e*t);for(let n=0;n<s;n++)for(let B=0;B<t;B++){let f=0;for(let p=0;p<8;p++){const y=n*8+p,k=(y*3+B*7)%15;d[y*t+B]=k-8,f|=k<<p*4}a[n*t+B]=f}const l=e/o,w=new Float32Array(l*t);for(let n=0;n<l*t;n++)w[n]=.01+n%8*.005;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(2*t);for(let n=0;n<2;n++)for(let B=0;B<t;B++){let f=0;for(let p=0;p<e;p++){const k=Math.floor(p/o)*t+B,u=h[Math.floor(k/2)]>>k%2*16&65535,m=this.f16ToF32(u);f+=r[n*e+p]*d[p*t+B]*m}F[n*t+B]=f}return{input:r,qweight:a,scalesPacked:h,expected:F,K:e,N:t,gs:o}}async 
testGPTQMatmulB2(){const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(128,8,64),l=this.makeF32Buffer("b2_input",e),w=this.makeU32Buffer("b2_qw",t),h=this.makeU32Buffer("b2_sc",o),F=this.makeOutputBuffer("b2_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2",[l,w,h,F,n],Math.ceil(a/32));const B=await this.readback(F,2*a);return this.compare(B,s,.02)}async testGPTQMatmulB2F16(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(128,8,64),l=this.makeF32Buffer("b2f16_input",e),w=this.makeU32Buffer("b2f16_qw",t),h=this.makeU32Buffer("b2f16_sc",o),F=this.makeOutputBuffer("b2f16_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2_f16",[l,w,h,F,n],Math.ceil(a/32));const B=await this.readback(F,2*a);return this.compare(B,s,.05)}async testGPTQMatmulB2_4T_F16(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(256,16,64),l=this.makeF32Buffer("b2_4t_input",e),w=this.makeU32Buffer("b2_4t_qw",t),h=this.makeU32Buffer("b2_4t_sc",o),F=this.makeOutputBuffer("b2_4t_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2_4t_f16",[l,w,h,F,n],Math.ceil(a/8));const B=await this.readback(F,2*a);return this.compare(B,s,.05)}f32ToBF16(e){const t=new ArrayBuffer(4);return new Uint32Array(t)[0]=new Uint32Array(new Float32Array([e]).buffer)[0],new Uint32Array(t)[0]>>>16}unpackBF16(e,t){const o=e>>>t*16&65535;return this.bf16ToF32(o)}}function $(I){const e=document.getElementById("log");e.textContent+=I+`
|
| 3 |
`,e.scrollTop=e.scrollHeight}document.querySelectorAll(".tab").forEach(I=>{I.addEventListener("click",()=>{document.querySelectorAll(".tab").forEach(e=>e.classList.remove("active")),document.querySelectorAll(".panel").forEach(e=>e.classList.remove("active")),I.classList.add("active"),document.getElementById(I.dataset.panel).classList.add("active")})});document.getElementById("runTests").addEventListener("click",async()=>{const I=document.getElementById("runTests"),e=document.getElementById("testResults"),t=document.getElementById("testSummary");I.disabled=!0,e.innerHTML="",t.innerHTML="",$("Initializing WebGPU for tests...");try{const o=new ot;await o.init(),$(`WebGPU initialized. Running tests...
|
| 4 |
`);let s=0,r=0;await o.runAll(a=>{const d=a.pass?"✓":"✗",l=a.pass?"pass":"fail";a.pass?s++:r++;const w=document.createElement("div");if(w.className="test-row",w.innerHTML=`
|
| 5 |
<span class="test-icon ${l}">${d}</span>
|
|
|
|
| 14 |
</div>
|
| 15 |
`,$(`
|
| 16 |
Done: ${s} passed, ${r} failed`),o.destroy()}catch(o){$(`Error: ${o.message}
|
| 17 |
+
${o.stack}`)}finally{I.disabled=!1}});document.getElementById("loadAndProfile").addEventListener("click",async()=>{const I=document.getElementById("loadAndProfile"),e=document.getElementById("profStatus"),t=document.getElementById("profCategories"),o=document.getElementById("profTopOps");I.disabled=!0,t.innerHTML="",o.innerHTML="";try{const s=document.getElementById("profRepo").value.trim();e.textContent="Loading model...",$("Profiler: loading model from "+s);const{GPUContext:r}=await et(async()=>{const{GPUContext:c}=await import("./gpu-ops-DKsrMEcC.js").then(_=>_.g);return{GPUContext:c}},[],import.meta.url),{Qwen35Model:a}=await et(async()=>{const{Qwen35Model:c}=await import("./qwen35-model-CmivuQnY.js");return{Qwen35Model:c}},__vite__mapDeps([0,1]),import.meta.url),{loadModelWeights:d,loadConfig:l,loadQuantConfig:w}=await et(async()=>{const{loadModelWeights:c,loadConfig:_,loadQuantConfig:b}=await import("./safetensors-loader-CwGm5mJX.js");return{loadModelWeights:c,loadConfig:_,loadQuantConfig:b}},[],import.meta.url),{Profiler:h}=await et(async()=>{const{Profiler:c}=await import("./profiler-DYUyiq-B.js");return{Profiler:c}},[],import.meta.url),F=new r;await F.init(),e.textContent="Fetching config...";const n=await l(s),B=await w(s);e.textContent="Downloading weights (this may take a while)...";const f=await d(s,c=>{if(c.phase==="downloading"){const _=c.total?(c.loaded/c.total*100).toFixed(0):"?";e.textContent=`Downloading: ${c.filesLoaded}/${c.filesTotal} shards (${_}%)`}else c.phase==="parsing"&&(e.textContent=`Parsing ${c.file}...`)});e.textContent="Uploading to GPU...";const p=new a(F,n,B);p.compilePipelines(),p.loadWeights(f),p.initBuffers(),e.textContent="Warming up...",await p.forward(1,0);const y=p.textCfg.linear_num_key_heads,k=p.textCfg.linear_key_head_dim,u=p.textCfg.linear_value_head_dim,m=y*(k+k+u);for(let c=0;c<p.numLayers;c++)p.layerTypes[c]==="linear_attention"&&(F.device.queue.writeBuffer(p.linState[c],0,new 
ArrayBuffer(y*k*u*4)),F.device.queue.writeBuffer(p.linConvHist[c],0,new ArrayBuffer(3*m*4)));p.seqLen=0,e.textContent="Profiling forward pass...",$("Profiling single forward pass...");const i=await new h(p).profileForward(1,0);e.textContent=`Done! Estimated: ${i.estimatedTokPerSec} tok/s (sync overhead inflates times)`,$(`
|
| 18 |
Profile complete: ${i.total.toFixed(1)}ms total (~${i.estimatedTokPerSec} tok/s with sync overhead)`),$(`
|
| 19 |
Category breakdown:`);for(const c of i.categories){const _=document.createElement("div");_.className="prof-row";const b=c.name.includes("matvec")||c.name==="lm_head"?"var(--orange)":c.name.includes("attention")||c.name.includes("deltanet")?"var(--blue)":"var(--green)";_.innerHTML=`
|
| 20 |
<span class="prof-name">${c.name}</span>
|
index.html
CHANGED
|
@@ -129,12 +129,8 @@
|
|
| 129 |
.toggle-btn.active { background: var(--accent); color: #fff; border-color: var(--accent); }
|
| 130 |
.toggle-btn:hover { border-color: var(--accent); }
|
| 131 |
|
| 132 |
-
/*
|
| 133 |
-
.
|
| 134 |
-
.unsupported-icon { color: #f59e0b; margin-bottom: 8px; }
|
| 135 |
-
.unsupported h1 { font-size: 24px; font-weight: 700; color: var(--accent); }
|
| 136 |
-
.unsupported p { color: var(--text-2); font-size: 14px; line-height: 1.7; max-width: 480px; }
|
| 137 |
-
.unsupported-meta { display: flex; flex-direction: column; gap: 4px; margin-top: 12px; font-size: 10px; color: var(--text-3); font-family: var(--mono); max-width: 480px; word-break: break-all; }
|
| 138 |
|
| 139 |
/* Toast */
|
| 140 |
.toast { position: fixed; bottom: 24px; left: 50%; transform: translateX(-50%) translateY(20px); padding: 10px 20px; border-radius: 6px; font-size: 13px; font-weight: 500; background: var(--bg-2); color: var(--text-1); border: 1px solid var(--border); opacity: 0; transition: all 0.3s; z-index: 1000; max-width: 480px; }
|
|
@@ -142,9 +138,9 @@
|
|
| 142 |
.toast-error { border-color: #ef4444; color: #ef4444; }
|
| 143 |
.toast-success { border-color: var(--accent); color: var(--accent); }
|
| 144 |
</style>
|
| 145 |
-
<script type="module" crossorigin src="./assets/main-
|
| 146 |
-
<link rel="modulepreload" crossorigin href="./assets/gpu-ops-
|
| 147 |
-
<link rel="modulepreload" crossorigin href="./assets/qwen35-model-
|
| 148 |
<link rel="modulepreload" crossorigin href="./assets/safetensors-loader-CwGm5mJX.js">
|
| 149 |
</head>
|
| 150 |
<body>
|
|
|
|
| 129 |
.toggle-btn.active { background: var(--accent); color: #fff; border-color: var(--accent); }
|
| 130 |
.toggle-btn:hover { border-color: var(--accent); }
|
| 131 |
|
| 132 |
+
/* WebGPU warning banner */
|
| 133 |
+
.webgpu-warning { padding: 10px 20px; background: rgba(245,158,11,0.12); border-bottom: 1px solid #f59e0b; color: #f59e0b; font-size: 12px; text-align: center; flex-shrink: 0; }
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
/* Toast */
|
| 136 |
.toast { position: fixed; bottom: 24px; left: 50%; transform: translateX(-50%) translateY(20px); padding: 10px 20px; border-radius: 6px; font-size: 13px; font-weight: 500; background: var(--bg-2); color: var(--text-1); border: 1px solid var(--border); opacity: 0; transition: all 0.3s; z-index: 1000; max-width: 480px; }
|
|
|
|
| 138 |
.toast-error { border-color: #ef4444; color: #ef4444; }
|
| 139 |
.toast-success { border-color: var(--accent); color: var(--accent); }
|
| 140 |
</style>
|
| 141 |
+
<script type="module" crossorigin src="./assets/main-DkVqMqJQ.js"></script>
|
| 142 |
+
<link rel="modulepreload" crossorigin href="./assets/gpu-ops-DKsrMEcC.js">
|
| 143 |
+
<link rel="modulepreload" crossorigin href="./assets/qwen35-model-CmivuQnY.js">
|
| 144 |
<link rel="modulepreload" crossorigin href="./assets/safetensors-loader-CwGm5mJX.js">
|
| 145 |
</head>
|
| 146 |
<body>
|
test.html
CHANGED
|
@@ -41,8 +41,8 @@
|
|
| 41 |
button.run-btn:hover:not(:disabled) { filter: brightness(1.15); }
|
| 42 |
#log { margin-top: 12px; padding: 12px; background: var(--bg2); border: 1px solid var(--border); border-radius: 8px; max-height: 300px; overflow-y: auto; white-space: pre-wrap; font-size: 0.7rem; color: var(--dim); }
|
| 43 |
</style>
|
| 44 |
-
<script type="module" crossorigin src="./assets/test-
|
| 45 |
-
<link rel="modulepreload" crossorigin href="./assets/gpu-ops-
|
| 46 |
</head>
|
| 47 |
<body>
|
| 48 |
<h1>TensorBend Shader Tests & Profiler</h1>
|
|
|
|
| 41 |
button.run-btn:hover:not(:disabled) { filter: brightness(1.15); }
|
| 42 |
#log { margin-top: 12px; padding: 12px; background: var(--bg2); border: 1px solid var(--border); border-radius: 8px; max-height: 300px; overflow-y: auto; white-space: pre-wrap; font-size: 0.7rem; color: var(--dim); }
|
| 43 |
</style>
|
| 44 |
+
<script type="module" crossorigin src="./assets/test-DMd_5haN.js"></script>
|
| 45 |
+
<link rel="modulepreload" crossorigin href="./assets/gpu-ops-DKsrMEcC.js">
|
| 46 |
</head>
|
| 47 |
<body>
|
| 48 |
<h1>TensorBend Shader Tests & Profiler</h1>
|