Spaces:

Ex0bit
/

tensorbend

Running

App Files Files Community

Ex0bit committed 3 days ago

Commit

21a8eeb

1 Parent(s): 05acc9f

Deploy TensorBend — browser-based LLM inference via WebGPU

Browse files

Files changed (11) hide show

README.md +10 -4
assets/gpu-ops-Da8QZLNh.js +0 -0
assets/main-D8aBnLGG.js +0 -0
assets/profiler-DYUyiq-B.js +1 -0
assets/qwen35-model-BU7SY47A.js +1 -0
assets/safetensors-loader-CwGm5mJX.js +1 -0
assets/test-s5fGKojE.js +46 -0
index.html +145 -18
style.css +0 -28
test.html +91 -0
vite.svg +1 -0

README.md CHANGED Viewed

@@ -1,10 +1,16 @@
 ---
-title: Tensorbend
-emoji: 🔥
 colorFrom: gray
-colorTo: blue
 sdk: static
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: TensorBend
+emoji: ⚡
 colorFrom: gray
+colorTo: green
 sdk: static
 pinned: false
 ---
+# TensorBend
+Run LLMs entirely in your browser. Weights are fetched as raw SafeTensors from HuggingFace and loaded directly into WebGPU compute buffers. No ONNX, no server.
+**Requires:** Chrome/Edge with WebGPU support (macOS, Windows, ChromeOS). Apple Silicon recommended for best performance.
+**Supported models:** Qwen3.5 family (0.8B, 2B, 4B, 9B) — INT4 quantized via AutoRound.

assets/gpu-ops-Da8QZLNh.js ADDED Viewed

The diff for this file is too large to render. See raw diff

assets/main-D8aBnLGG.js ADDED Viewed

The diff for this file is too large to render. See raw diff

assets/profiler-DYUyiq-B.js ADDED Viewed

	@@ -0,0 +1 @@

/**
 * Ordered substring → category table used by `Profiler#summarize`.
 * The first entry whose needle occurs in a timing name wins, so the order
 * here is significant; names that match nothing are reported as "other".
 */
const CATEGORY_RULES = Object.freeze([
  ["fused_norm_mlp", "fused_norm_mlp"],
  ["fused_norm_qkv", "full_attention"],
  ["la.fused_all", "linear_attention"],
  ["final_norm", "norm"],
  ["lm_head", "lm_head"],
  ["embedding", "embedding"],
  ["argmax", "argmax"],
]);

/**
 * Coarse per-operation profiler for one forward pass of the model.
 *
 * Every operation is wrapped in `gpu.beginBatch()` / `gpu.endBatch()` and
 * bracketed by `queue.onSubmittedWorkDone()`, so the recorded time is
 * wall-clock time (from `performance.now()`) that includes the wait for the
 * GPU queue to drain after the operation was submitted.
 */
class Profiler {
  /**
   * @param {object} model - Model instance. Must expose `gpu` plus the
   *   buffers and methods used by `profileForward`.
   */
  constructor(model) {
    this.model = model;
    this.gpu = model.gpu;
    this.timings = [];
  }

  /**
   * Time one synchronous recording callback and append it to `this.timings`.
   *
   * @param {string} name - Label stored with the measurement.
   * @param {() => void} record - Callback that issues the GPU work.
   * @returns {Promise<number>} Elapsed time as measured by `performance.now()`.
   * @throws Re-throws whatever `record` throws; no timing is stored then.
   */
  async timeOp(name, record) {
    const { queue } = this.gpu.device;

    // Drain previously submitted work so it is not billed to this op.
    await queue.onSubmittedWorkDone();
    const startedAt = performance.now();

    this.gpu.beginBatch();
    try {
      record();
    } finally {
      // Always close the batch: without this, a throwing callback would
      // leave the GPU helper in "batching" state for every later caller.
      this.gpu.endBatch();
    }

    await queue.onSubmittedWorkDone();
    const elapsed = performance.now() - startedAt;
    this.timings.push({ name, elapsed });
    return elapsed;
  }

  /**
   * Run one forward pass op by op, timing each step, and return the summary.
   * Resets `this.timings` first.
   *
   * @param {number} tokenId - Passed to `model.embedding`.
   * @param {number} position - Forwarded to `model.fullAttentionFused`.
   * @returns {Promise<object>} The result of `summarize()`.
   */
  async profileForward(tokenId, position) {
    this.timings = [];
    const model = this.model;

    await this.timeOp("embedding", () => model.embedding(tokenId));

    // `current` / `next` ping-pong between the two hidden buffers; `residual`
    // is the zero buffer for layer 0 and `mlpOut` for every later layer.
    let current = model.hidden;
    let next = model.hiddenB;
    let residual = model.zeroBuf;

    for (let layer = 0; layer < model.numLayers; layer++) {
      const isFullAttention = model.layerTypes[layer] === "full_attention";
      if (isFullAttention) {
        await this.profileFullAttention(layer, current, residual, position);
      } else {
        await this.profileLinearAttention(layer, current, residual);
      }

      const attnResult = isFullAttention ? model.qProj : model.attnOut;
      await this.timeOp(`L${layer}.fused_norm_mlp`, () => {
        model.fusedNormMLP(current, next, residual, attnResult, layer);
      });

      residual = model.mlpOut;
      [current, next] = [next, current];
    }

    await this.timeOp("add_final_norm", () => {
      const normWeight = model.weights["model.language_model.norm.weight"];
      model.addAndRmsNorm(current, model.mlpOut, model.normed, normWeight);
    });

    // NOTE(review): this always times the BF16 matvec over the embedding
    // matrix. The model's own `_dispatchLmHead` prefers an INT4 path when
    // `_lmHeadQWeight` exists — confirm which variant should be profiled.
    await this.timeOp("lm_head", () => {
      const embedWeight = model.weights["model.language_model.embed_tokens.weight"];
      const params = model.makeUniform("lmhead_params", [model.hiddenSize, model.vocabSize]);
      model.run(
        "bf16_matvec",
        [model.normed, embedWeight, model.logits, params],
        model.wg(model.vocabSize),
      );
    });

    await this.timeOp("argmax", () => {
      const params = model.makeUniform("argmax_params", [model.vocabSize]);
      model.run("argmax", [model.logits, model.argmaxResult, params], 1);
    });

    return this.summarize();
  }

  /**
   * Time the fused full-attention step of one layer.
   *
   * @param {number} layer - Layer index.
   * @param {*} hidden - Hidden-state buffer passed to the model.
   * @param {*} residual - Residual buffer passed to the model.
   * @param {number} position - Position forwarded to the model.
   */
  async profileFullAttention(layer, hidden, residual, position) {
    const model = this.model;
    await this.timeOp(`L${layer}.fa.fused_norm_qkv`, () => {
      model.fullAttentionFused(hidden, residual, layer, position);
    });
  }

  /**
   * Time the fused linear-attention step of one layer.
   *
   * @param {number} layer - Layer index.
   * @param {*} hidden - Hidden-state buffer passed to the model.
   * @param {*} residual - Residual buffer passed to the model.
   */
  async profileLinearAttention(layer, hidden, residual) {
    const model = this.model;
    await this.timeOp(`L${layer}.la.fused_all`, () => {
      model.linearAttentionFused(hidden, residual, layer);
    });
  }

  /**
   * Aggregate `this.timings` into per-category totals and a top-20 list.
   *
   * Percentages and the tokens-per-second estimate are reported as "0.0"
   * when the total elapsed time is zero (for example, when no ops were
   * recorded) instead of "NaN" / "Infinity".
   *
   * @returns {{
   *   total: number,
   *   categories: Array<{name: string, time: string, pct: string}>,
   *   topOps: Array<{name: string, time: string, pct: string}>,
   *   allTimings: Array<{name: string, elapsed: number}>,
   *   dispatchCount: number,
   *   estimatedTokPerSec: string,
   * }}
   */
  summarize() {
    const total = this.timings.reduce((sum, entry) => sum + entry.elapsed, 0);
    const hasTotal = total > 0;
    const percentOf = (elapsed) => (hasTotal ? (elapsed / total) * 100 : 0).toFixed(1);
    const describe = (name, elapsed) => ({
      name,
      time: elapsed.toFixed(2),
      pct: percentOf(elapsed),
    });

    const perCategory = {};
    for (const { name, elapsed } of this.timings) {
      const rule = CATEGORY_RULES.find(([needle]) => name.includes(needle));
      const category = rule ? rule[1] : "other";
      perCategory[category] = (perCategory[category] || 0) + elapsed;
    }

    const categories = Object.entries(perCategory)
      .sort((a, b) => b[1] - a[1])
      .map(([name, elapsed]) => describe(name, elapsed));

    const topOps = [...this.timings]
      .sort((a, b) => b.elapsed - a.elapsed)
      .slice(0, 20)
      .map((entry) => describe(entry.name, entry.elapsed));

    return {
      total,
      categories,
      topOps,
      allTimings: this.timings,
      dispatchCount: this.timings.length,
      // Treats one profiled forward pass as one generated token.
      estimatedTokPerSec: hasTotal ? (1e3 / total).toFixed(1) : "0.0",
    };
  }
}

export { Profiler };

assets/qwen35-model-BU7SY47A.js ADDED Viewed

	@@ -0,0 +1 @@

+ import{S as T,a as E,_ as R}from"./gpu-ops-Da8QZLNh.js";function V(j){const a=j<<16,e=new ArrayBuffer(4);new Uint32Array(e)[0]=a,new Float32Array(e)[0];const t=a>>>31&1,i=a>>>23&255,s=a&8388607;if(i===0)return t<<15;if(i===255)return t<<15|31744|(s?512:0);const n=i-127+15;return n>=31?t<<15|31744:n<=0?t<<15:t<<15|n<<10|s>>>13}class L{constructor(a,e,t){if(this.gpu=a,this.config=e,this.textCfg=e.text_config,this.quantConfig=t,this.hiddenSize=this.textCfg.hidden_size,this.intermediateSize=this.textCfg.intermediate_size,this.numLayers=this.textCfg.num_hidden_layers,this.numHeads=this.textCfg.num_attention_heads,this.numKVHeads=this.textCfg.num_key_value_heads,this.headDim=this.textCfg.head_dim,this.vocabSize=this.textCfg.vocab_size,this.rmsEps=this.textCfg.rms_norm_eps,this.ropeTheta=this.textCfg.rope_parameters?.rope_theta||1e7,this.partialRotary=this.textCfg.partial_rotary_factor,this.partialDim=Math.floor(this.headDim*this.partialRotary),this.layerTypes=this.textCfg.layer_types,this.groupSize=t?.group_size||t?.config_groups?.group_0?.weights?.group_size||128,this.fp16Layers=new Set,t?.extra_config)for(const u of Object.keys(t.extra_config))this.fp16Layers.add(u);this.weights={},this.kvCache={},this.seqLen=0,this.pipelines={},this.singlePassMode=!0,this._replayFlat=null,this._forceMinSplits=0;const i=this.numHeads*this.headDim,s=this.numKVHeads*this.headDim,n=(i+s)/2,r=Math.ceil(n/4);this._splitQKNormShaderKey=`fused_split_qknorm_kvstore_${r}`,T[this._splitQKNormShaderKey]||(T[this._splitQKNormShaderKey]=E(r))}compilePipelines(){const a=this.gpu;for(const[e,t]of Object.entries(T))this.pipelines[e]=a.getOrCreatePipeline(e,t)}uploadTensors(a){this.quantConfig?.quant_method==="compressed-tensors"&&(a=this._convertCompressedTensors(a));for(const[t,i]of Object.entries(a)){let s=i.data;if(i.dtype==="F32"&&!t.endsWith(".qweight")&&!t.endsWith(".qzeros")&&!t.endsWith(".scales")){const n=new Float32Array(s.buffer,s.byteOffset,s.byteLength/4),r=new 
Uint16Array(n.length);for(let u=0;u<n.length;u++){const h=new Uint32Array(n.buffer,n.byteOffset+u*4,1)[0];r[u]=h>>>16}s=new Uint8Array(r.buffer)}if(i._partial){let{offset:n,totalSize:r}=i._partial;if(i.dtype==="F32"&&!t.endsWith(".qweight")&&!t.endsWith(".qzeros")&&!t.endsWith(".scales")&&(n/=2,r/=2),n===0){const h=this.gpu.createBuffer(t,r,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC);this.gpu.device.queue.writeBuffer(h,0,s),this.weights[t]=h}else{const h=this.weights[t];h&&this.gpu.device.queue.writeBuffer(h,n,s)}}else this.weights[t]=this.gpu.createBufferFromData(t,s);(t.includes("post_attention_layernorm.weight")||t.includes("input_layernorm.weight")||t.includes("q_norm.weight")||t.includes("k_norm.weight")||t==="model.language_model.norm.weight")&&(this._normWeightRaw||(this._normWeightRaw={}),this._normWeightRaw[t]=new Uint32Array(s.buffer.slice(s.byteOffset,s.byteOffset+s.byteLength))),this._uploadedCount=(this._uploadedCount||0)+1}}_convertCompressedTensors(a){const e={};for(const[t,i]of Object.entries(a))if(!t.endsWith(".weight_shape")){if(t.endsWith(".weight_packed")){const s=t.slice(0,-14),n=i.shape,r=n[0],u=n[1],h=new Int32Array(i.data.buffer,i.data.byteOffset,i.data.byteLength/4),o=new Int32Array(r*u);for(let c=0;c<r;c++)for(let m=0;m<u;m++)o[m*r+c]=h[c*u+m];e[`${s}.qweight`]={dtype:"I32",shape:[u,r],data:new Uint8Array(o.buffer)};continue}if(t.endsWith(".weight_scale")){const s=t.slice(0,-13),n=i.shape,r=n[0],u=n[1],h=new Uint16Array(i.data.buffer,i.data.byteOffset,i.data.byteLength/2),o=Math.ceil(r/2),c=new Uint32Array(u*o);for(let m=0;m<u;m++)for(let f=0;f<r;f+=2){const b=h[f*u+m],v=f+1<r?h[(f+1)*u+m]:0,U=V(b),w=V(v);c[m*o+(f>>1)]=U|w<<16}e[`${s}.scales`]={dtype:"I32",shape:[u,o],data:new Uint8Array(c.buffer)};continue}e[t]=i}return e}async postProcessWeights(){const 
a=this.textCfg.linear_num_key_heads||0,e=this.hiddenSize,t=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,i=this.layerTypes.indexOf("linear_attention"),s=i>=0?`model.language_model.layers.${i}.linear_attn`:"";if(this.abQuantized=s&&!!this.weights[`${s}.in_proj_a.qweight`],this.abQuantized){const l=this.textCfg.linear_num_value_heads??a;this.linAlpha=this.gpu.createBuffer("lin_alpha",l*4,t),this.linBeta=this.gpu.createBuffer("lin_beta",l*4,t)}else{this.linABWeight={};const l=this.textCfg.linear_num_value_heads??a,S=e*2,B=l*S,d=2*B;for(let P=0;P<this.numLayers;P++)if(this.layerTypes[P]==="linear_attention"){const q=`model.language_model.layers.${P}.linear_attn`,k=this.weights[`${q}.in_proj_a.weight`],$=this.weights[`${q}.in_proj_b.weight`];if(k&&$){const y=this.gpu.createBuffer(`ab_merged_${P}`,d,t),C=this.gpu.device.createCommandEncoder();C.copyBufferToBuffer(k,0,y,0,B),C.copyBufferToBuffer($,0,y,B,B),this.gpu.device.queue.submit([C.finish()]),this.linABWeight[P]=y}}}{const l=[];for(let d=0;d<this.numLayers;d++){if(this.layerTypes[d]==="linear_attention"){const P=`model.language_model.layers.${d}.linear_attn`,q=this.textCfg.linear_num_key_heads||0,k=this.textCfg.linear_key_head_dim||128,$=this.textCfg.linear_value_head_dim||128,y=this.textCfg.linear_num_value_heads??q,C=y*$,D=y/q*$,z=q*(k+k+D);l.push({prefix:`${P}.in_proj_qkv`,K:e,N:z}),l.push({prefix:`${P}.in_proj_z`,K:e,N:C}),l.push({prefix:`${P}.out_proj`,K:C,N:e})}else{const 
P=`model.language_model.layers.${d}.self_attn`,q=this.numHeads*this.headDim*2,k=this.numKVHeads*this.headDim;l.push({prefix:`${P}.q_proj`,K:e,N:q}),l.push({prefix:`${P}.k_proj`,K:e,N:k}),l.push({prefix:`${P}.v_proj`,K:e,N:k}),l.push({prefix:`${P}.o_proj`,K:this.numHeads*this.headDim,N:e})}l.push({prefix:`model.language_model.layers.${d}.mlp.gate_proj`,K:e,N:this.intermediateSize}),l.push({prefix:`model.language_model.layers.${d}.mlp.up_proj`,K:e,N:this.intermediateSize}),l.push({prefix:`model.language_model.layers.${d}.mlp.down_proj`,K:this.intermediateSize,N:e})}let S=0;const B=performance.now();for(const{prefix:d,K:P,N:q}of l)if(!this.weights[`${d}.qweight`]&&this.weights[`${d}.weight`]){const{qweight:k,scales:$}=await this._quantizeBF16ToINT4(this.weights[`${d}.weight`],P,q,this.groupSize,d.replace(/\./g,"_"));this.weights[`${d}.qweight`]=k,this.weights[`${d}.scales`]=$,S++}S>0&&console.log(`[QUANT] GPU-quantized ${S} BF16 projections to INT4 in ${(performance.now()-B).toFixed(0)}ms`)}this._mergedGateUp={};const n=this.intermediateSize,r=e/8,u=r*n*4,o=r/(this.groupSize/8)*n*2;for(let l=0;l<this.numLayers;l++){const S=`model.language_model.layers.${l}.mlp`,B=this.getQWeight(`${S}.gate_proj`),d=this.getQWeight(`${S}.up_proj`);if(B.qweight&&d.qweight){const P=this.gpu.createBuffer(`merged_qw_${l}`,u*2,t),q=this.gpu.createBuffer(`merged_sc_${l}`,o*2,t),k=this.gpu.device.createCommandEncoder();k.copyBufferToBuffer(B.qweight,0,P,0,u),k.copyBufferToBuffer(d.qweight,0,P,u,u),k.copyBufferToBuffer(B.scales,0,q,0,o),k.copyBufferToBuffer(d.scales,0,q,o,o),this.gpu.device.queue.submit([k.finish()]),this._mergedGateUp[l]={qweight:P,scales:q}}}this._fusedMLPParams={};const c=16+512*16;for(let l=0;l<this.numLayers;l++){const S=`model.language_model.layers.${l}.post_attention_layernorm.weight`,B=this._normWeightRaw?.[S];if(!B||!this._mergedGateUp[l])continue;const d=new ArrayBuffer(c),P=new Uint32Array(d),q=new 
Float32Array(d);P[0]=e,P[1]=n,P[2]=this.groupSize,q[3]=this.rmsEps;for(let k=0;k<B.length;k++)P[4+k]=B[k];this._fusedMLPParams[l]=this.gpu.createBufferFromData(`fused_mlp_params_${l}`,new Uint32Array(d),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}this._fusedSQKParams={};const m=this.headDim,f=this.numHeads,b=this.numKVHeads,v=f*m,U=b*m,w=(v+U)/2,p=Math.ceil(w/4),_=32+p*16,g=`fused_split_qknorm_kvstore_${p}`;T[g]||(T[g]=E(p)),this.pipelines[g]||(this.pipelines[g]=this.gpu.getOrCreatePipeline(g,T[g])),this._splitQKNormShaderKey=g;for(let l=0;l<this.numLayers;l++){if(this.layerTypes[l]!=="full_attention")continue;const S=`model.language_model.layers.${l}.self_attn`,B=`${S}.q_norm.weight`,d=`${S}.k_norm.weight`,P=this._normWeightRaw?.[B],q=this._normWeightRaw?.[d],k=new ArrayBuffer(_),$=new DataView(k);if($.setUint32(0,f,!0),$.setUint32(4,b,!0),$.setUint32(8,m,!0),$.setFloat32(12,this.rmsEps,!0),$.setUint32(16,0,!0),$.setUint32(20,this.partialDim,!0),$.setFloat32(24,this.ropeTheta,!0),P)for(let C=0;C<v/2;C++){const D=Math.floor(C/4),z=C%4;$.setUint32(32+D*16+z*4,P[C],!0)}if(q){const C=v/2;for(let D=0;D<U/2;D++){const z=C+D,x=Math.floor(z/4),O=z%4;$.setUint32(32+x*16+O*4,q[D],!0)}}const y=this.gpu.device.createBuffer({size:_,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:`fused_sqk_${l}`});this.gpu.device.queue.writeBuffer(y,0,new Uint8Array(k)),this._fusedSQKParams[l]=y}await this._quantizeLmHead()}async _quantizeLmHead(){const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight";if(!this.weights[e])return;const t=this.hiddenSize,i=this.vocabSize,s=this.groupSize,n=t/8,r=t/s,u=performance.now(),h=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,o=this.weights[e];this._lmHeadQWeight=this.gpu.createBuffer("lmhead_qweight",n*i*4,h);const 
c=this.gpu.createBuffer("lmhead_scales_f32",r*i*4,h),m=Math.ceil(r*i/2)*4;this._lmHeadScales=this.gpu.createBuffer("lmhead_scales",m,h);const f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await R(async()=>{const{SHADERS:y}=await import("./gpu-ops-Da8QZLNh.js").then(C=>C.b);return{SHADERS:y}},[])).SHADERS.quantize_bf16_to_int4),b=this.gpu.createBufferFromData("quant_params",new Uint32Array([t,i,s]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),v=this.gpu.createBindGroup(f,0,[o,this._lmHeadQWeight,c,b]),U=65535,w=Math.min(i,U),p=Math.ceil(i/U),_=this.gpu.device.createCommandEncoder(),g=_.beginComputePass();g.setPipeline(f),g.setBindGroup(0,v),g.dispatchWorkgroups(w,p),g.end(),this.gpu.device.queue.submit([_.finish()]);const l=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await R(async()=>{const{SHADERS:y}=await import("./gpu-ops-Da8QZLNh.js").then(C=>C.b);return{SHADERS:y}},[])).SHADERS.pack_f32_to_f16_pairs),S=Math.ceil(r*i/2),B=this.gpu.createBufferFromData("pack_params",new Uint32Array([S]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),d=this.gpu.createBindGroup(l,0,[c,this._lmHeadScales,B]),P=this.gpu.device.createCommandEncoder(),q=P.beginComputePass();q.setPipeline(l),q.setBindGroup(0,d),q.dispatchWorkgroups(Math.ceil(S/256)),q.end(),this.gpu.device.queue.submit([P.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),c.destroy(),b.destroy(),B.destroy();const k=(n*i*4/1e6).toFixed(0),$=(m/1e6).toFixed(0);console.log(`[QUANT] lm_head INT4 (GPU): ${(performance.now()-u).toFixed(0)}ms, ${k}MB qw + ${$}MB sc`)}async _quantizeBF16ToINT4(a,e,t,i,s){const n=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,r=e/8,u=e/i,h=this.gpu.createBuffer(`${s}_qweight`,r*t*4,n),o=this.gpu.createBuffer(`${s}_scales_f32`,u*t*4,n),c=Math.ceil(u*t/2)*4,m=this.gpu.createBuffer(`${s}_scales`,c,n),f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await R(async()=>{const{SHADERS:k}=await 
import("./gpu-ops-Da8QZLNh.js").then($=>$.b);return{SHADERS:k}},[])).SHADERS.quantize_bf16_to_int4),b=this.gpu.createBufferFromData(`${s}_qp`,new Uint32Array([e,t,i]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),v=this.gpu.createBindGroup(f,0,[a,h,o,b]),U=65535,w=Math.min(t,U),p=Math.ceil(t/U),_=this.gpu.device.createCommandEncoder(),g=_.beginComputePass();g.setPipeline(f),g.setBindGroup(0,v),g.dispatchWorkgroups(w,p),g.end(),this.gpu.device.queue.submit([_.finish()]);const l=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await R(async()=>{const{SHADERS:k}=await import("./gpu-ops-Da8QZLNh.js").then($=>$.b);return{SHADERS:k}},[])).SHADERS.pack_f32_to_f16_pairs),S=Math.ceil(u*t/2),B=this.gpu.createBufferFromData(`${s}_pp`,new Uint32Array([S]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),d=this.gpu.createBindGroup(l,0,[o,m,B]),P=this.gpu.device.createCommandEncoder(),q=P.beginComputePass();return q.setPipeline(l),q.setBindGroup(0,d),q.dispatchWorkgroups(Math.ceil(S/256)),q.end(),this.gpu.device.queue.submit([P.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),o.destroy(),b.destroy(),B.destroy(),{qweight:h,scales:m}}initBuffers(a){a||(a=this.textCfg.max_position_embeddings||4096),this.maxSeqLen=a;const 
e=this.gpu,t=this.hiddenSize,i=this.intermediateSize,s=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.hidden=e.createBuffer("hidden_a",t*4,s),this.hiddenB=e.createBuffer("hidden_b",t*4,s),this.normed=e.createBuffer("normed",t*4,s),this.normedB=e.createBuffer("normed_b",t*4,s),this.mlpIntermediate=e.createBuffer("mlp_inter",i*4,s),this.mlpOut=e.createBuffer("mlp_out",t*4,s),this.logits=e.createBuffer("logits",this.vocabSize*4,s),this.argmaxResult=e.createBuffer("argmax_result",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this._argmaxReadback=e.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"argmax_readback"}),this._topkResult=e.createBuffer("topk_result",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this._repMaxTokens=64,this._recentTokens=new Uint32Array(this._repMaxTokens),this._recentTokenCount=0,this._gpuRecentTokens=e.createBuffer("recent_tokens",this._repMaxTokens*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST);for(let w=0;w<this.numLayers;w++)this.layerTypes[w]==="full_attention"&&(this.kvCache[w]={keys:e.createBuffer(`kv_k_${w}`,a*this.numKVHeads*this.headDim*4),values:e.createBuffer(`kv_v_${w}`,a*this.numKVHeads*this.headDim*4)});this.qProjFull=e.createBuffer("q_proj_full",this.numHeads*this.headDim*2*4,s),this.qProj=e.createBuffer("q_proj_out",this.numHeads*this.headDim*4,s),this.qGate=e.createBuffer("q_gate",this.numHeads*this.headDim*4,s),this.kProj=e.createBuffer("k_proj_out",this.numKVHeads*this.headDim*4,s),this.vProj=e.createBuffer("v_proj_out",this.numKVHeads*this.headDim*4,s);const n=this.textCfg.linear_num_key_heads,r=this.textCfg.linear_key_head_dim,u=this.textCfg.linear_value_head_dim,h=this.textCfg.linear_num_value_heads??n,c=h/n*u,m=n*(r+r+c),f=h*u;this.linValueDim=f,this.linValueHeads=h,this.linQKV=e.createBuffer("lin_qkv",m*4,s),this.linZ=e.createBuffer("lin_z",f*4,s),this.linOut=e.createBuffer("lin_out",f*4,s);const 
b=Math.max(t,this.numHeads*this.headDim,f)*4;this.attnOut=e.createBuffer("attn_out",b,s),this._maxGqaSplits=64;const v=this.numHeads*this._maxGqaSplits*(this.headDim+2)*4;this._gqaPartials=e.createBuffer("gqa_partials",v,s),this.linState={},this.linConvHist={};for(let w=0;w<this.numLayers;w++)this.layerTypes[w]==="linear_attention"&&(this.linState[w]=e.createBuffer(`lin_state_${w}`,n*r*c*4,s),this.linConvHist[w]=e.createBuffer(`lin_conv_hist_${w}`,3*m*4,s));this.zeroBuf=e.createBuffer("zero_buf",t*4,s),this.useSplitK=!1,this.splitKSplits=1;const U=Math.max(t,i);this.splitKPartials=e.createBuffer("splitk_partials",U*this.splitKSplits*4,s),this.paramBufs={},this._gqaParamBuf=e.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"gqa_params"}),this._kvStoreParamBuf=e.device.createBuffer({size:16,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"kv_store_params"}),this._gqaData=new Uint8Array(32),this._gqaDv=new DataView(this._gqaData.buffer),this._kvData=new Uint8Array(16),this._kvDv=new DataView(this._kvData.buffer)}initB2Buffers(){const 
a=this.gpu,e=this.hiddenSize,t=this.intermediateSize,i=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.b2={},this.b2.hidden=a.createBuffer("b2_hidden_a",2*e*4,i),this.b2.hiddenB=a.createBuffer("b2_hidden_b",2*e*4,i),this.b2.normed=a.createBuffer("b2_normed",2*e*4,i),this.b2.mlpIntermediate=a.createBuffer("b2_mlp_inter",2*t*4,i),this.b2.mlpOut=a.createBuffer("b2_mlp_out",2*e*4,i),this.b2.logits=a.createBuffer("b2_logits",2*this.vocabSize*4,i),this.b2.zeroBuf=a.createBuffer("b2_zero_buf",2*e*4,i),this.b2.qProjFull=a.createBuffer("b2_q_proj_full",2*this.numHeads*this.headDim*2*4,i),this.b2.qProj=a.createBuffer("b2_q_proj_out",2*this.numHeads*this.headDim*4,i),this.b2.qGate=a.createBuffer("b2_q_gate",2*this.numHeads*this.headDim*4,i),this.b2.kProj=a.createBuffer("b2_k_proj_out",2*this.numKVHeads*this.headDim*4,i),this.b2.vProj=a.createBuffer("b2_v_proj_out",2*this.numKVHeads*this.headDim*4,i);const s=this.textCfg.linear_num_key_heads,n=this.textCfg.linear_key_head_dim,r=this.textCfg.linear_value_head_dim,o=this.linValueHeads/s*r,c=s*(n+n+o),m=this.linValueDim;this.b2.linQKV=a.createBuffer("b2_lin_qkv",2*c*4,i),this.b2.linZ=a.createBuffer("b2_lin_z",2*m*4,i),this.b2.linOut=a.createBuffer("b2_lin_out",2*m*4,i);const f=Math.max(e,this.numHeads*this.headDim,m)*4;if(this.b2.attnOut=a.createBuffer("b2_attn_out",2*f,i),this.abQuantized){const 
b=this.textCfg.linear_num_value_heads??s;this.b2.linAlpha=a.createBuffer("b2_lin_alpha",2*b*4,i),this.b2.linBeta=a.createBuffer("b2_lin_beta",2*b*4,i)}this.b2.topkResult0=a.createBuffer("b2_topk0",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this.b2.topkResult1=a.createBuffer("b2_topk1",2048,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this.b2.topkReadback0=a.device.createBuffer({size:2048,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_topk_rb0"}),this.b2.topkReadback1=a.device.createBuffer({size:2048,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_topk_rb1"}),this.b2.argmaxResult0=a.createBuffer("b2_argmax0",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this.b2.argmaxResult1=a.createBuffer("b2_argmax1",8,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC),this.b2.argmaxReadback0=a.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_argmax_rb0"}),this.b2.argmaxReadback1=a.device.createBuffer({size:8,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"b2_argmax_rb1"}),this.b2._gqaParamBuf0=a.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"b2_gqa_params0"}),this.b2._gqaParamBuf1=a.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"b2_gqa_params1"}),this.b2._dims={H:e,I:t,linQKVDim:c,valueDim:m,linHeads:s,linKeyDim:n,linValDim:r,linEVD:o,qProjSize:this.numHeads*this.headDim,qProjFullSize:this.numHeads*this.headDim*2,kProjSize:this.numKVHeads*this.headDim,vProjSize:this.numKVHeads*this.headDim,attnOutElems:f/4},console.log(`[B2] Allocated batch=2 buffers (${((2*(e+e+e+t+e+this.vocabSize)*4+2*f)/1e6).toFixed(1)} MB activations)`)}runWithOffsets(a,e,t,i=1){const s=this.pipelines[a],n=this.gpu.createBindGroupWithOffsets(s,0,e);this.gpu.dispatch(s,[n],t,i)}makeUniform(a,e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),i=new 
DataView(t);for(let r=0;r<e.length;r++)Number.isInteger(e[r])?i.setUint32(r*4,e[r],!0):i.setFloat32(r*4,e[r],!0);const s=a+"_"+e.join("_");if(this.paramBufs[s])return this.paramBufs[s];const n=this.gpu.device.createBuffer({size:t.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a});return this.gpu.device.queue.writeBuffer(n,0,new Uint8Array(t)),this.paramBufs[s]=n,n}_makeMixedUniform(a,e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),i=new DataView(t);for(let n=0;n<e.length;n++){const r=e[n];r.u!==void 0?i.setUint32(n*4,r.u,!0):i.setFloat32(n*4,r.f,!0)}this._mixedUniformBufs||(this._mixedUniformBufs={});let s=this._mixedUniformBufs[a];return s||(s=this.gpu.device.createBuffer({size:t.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a}),this._mixedUniformBufs[a]=s),this.gpu.device.queue.writeBuffer(s,0,new Uint8Array(t)),s}makeUniformTyped(a,e,t){const i=e.length+t.length,s=new ArrayBuffer(Math.max(16,Math.ceil(i*4/16)*16)),n=new DataView(s);let r=0;for(const o of e)n.setUint32(r,o,!0),r+=4;for(const o of t)n.setFloat32(r,o,!0),r+=4;const u=a+"_"+[...e,...t].join("_");if(this.paramBufs[u])return this.paramBufs[u];const h=this.gpu.device.createBuffer({size:s.byteLength,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:a});return this.gpu.device.queue.writeBuffer(h,0,new Uint8Array(s)),this.paramBufs[u]=h,h}makeFusedNormMLPUniform(a){const e=`fused_anorm_mlp_${a}`;if(this.paramBufs[e])return this.paramBufs[e];const t=this.hiddenSize,i=this.intermediateSize,s=16+640*16,n=new ArrayBuffer(s),r=new DataView(n);r.setUint32(0,t,!0),r.setUint32(4,i,!0),r.setUint32(8,this.groupSize,!0),r.setFloat32(12,this.rmsEps,!0);const u=`model.language_model.layers.${a}.post_attention_layernorm.weight`,h=this._normWeightRaw?.[u];if(!h)throw new Error(`Norm weight not cached for layer ${a}`);for(let c=0;c<t/2;c++){const m=Math.floor(c/4),f=c%4;r.setUint32(16+m*16+f*4,h[c],!0)}const 
o=this.gpu.device.createBuffer({size:s,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:e});return this.gpu.device.queue.writeBuffer(o,0,new Uint8Array(n)),this.paramBufs[e]=o,o}fusedNormGptqOp(a,e,t,i,s,n,r,u){const h=this.getQWeight(i);if(!h.qweight)return null;const o=this.makeFusedNormGPTQUniform(s,r),c=u?"fused_norm_gptq":"fused_norm_gptq_noadd",m=u?"fnorm_gptq_":"fnorm_gptqna_",f=u?[a,e,h.qweight,h.scales,t,o]:[a,h.qweight,h.scales,t,o];return this.prepOpCached(`${m}${i}`,c,f,this.wg4(r))}makeFusedNormGPTQUniform(a,e){const t=`fused_norm_gptq_${a}_${e}`;if(this.paramBufs[t])return this.paramBufs[t];const i=this.hiddenSize,s=16+512*16,n=new ArrayBuffer(s),r=new DataView(n);r.setUint32(0,i,!0),r.setUint32(4,e,!0),r.setUint32(8,this.groupSize,!0),r.setFloat32(12,this.rmsEps,!0);const u=this._normWeightRaw?.[a];if(!u)throw new Error(`Norm weight not cached: ${a}`);for(let o=0;o<i/2;o++){const c=Math.floor(o/4),m=o%4;r.setUint32(16+c*16+m*4,u[o],!0)}const h=this.gpu.device.createBuffer({size:s,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:t});return this.gpu.device.queue.writeBuffer(h,0,new Uint8Array(n)),this.paramBufs[t]=h,h}run(a,e,t,i=1){const s=this.pipelines[a],n=this.gpu.createBindGroup(s,0,e);this.gpu.dispatch(s,[n],t,i)}runCached(a,e,t,i,s=1){const n=this.pipelines[e],r=this.gpu.getCachedBindGroup(a,n,0,t);this.gpu.dispatch(n,[r],i,s)}prepOp(a,e,t,i=1){const s=this.pipelines[a],n=this.gpu.createBindGroup(s,0,e);return{pipeline:s,bindGroups:[n],workgroupsX:t,workgroupsY:i}}prepOpCached(a,e,t,i,s=1){const n=this.pipelines[e],r=this.gpu.getCachedBindGroup(a,n,0,t);return{pipeline:n,bindGroups:[r],workgroupsX:i,workgroupsY:s}}wg(a){return Math.ceil(a/256)}wg4(a){return Math.ceil(a/32)}wg8(a){return Math.ceil(a/8)}getQWeight(a){return{qweight:this.weights[a+".qweight"],scales:this.weights[a+".scales"],qzeros:this.weights[a+".qzeros"]}}embedding(a){const 
e=this.weights["model.language_model.embed_tokens.weight"],t=this.makeUniform("emb_params",[a,this.hiddenSize]);this.run("embedding",[e,this.hidden,t],this.wg(this.hiddenSize))}embeddingFromArgmax(){const a=this.weights["model.language_model.embed_tokens.weight"];this._embedArgmaxParams||(this._embedArgmaxParams=this.makeUniform("emb_argmax_params",[this.hiddenSize])),this.runCached("embed_from_argmax","embed_from_argmax",[a,this.hidden,this.argmaxResult,this._embedArgmaxParams],this.wg(this.hiddenSize))}_dispatchLmHead(){if(this._lmHeadQWeight){const a=this.hiddenSize,e=this.vocabSize,i=a/this.groupSize%4===0,s=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]);if(i){const n=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached("lm_head",n,[this.normed,this._lmHeadQWeight,this._lmHeadScales,this.logits,s],this.wg8(e))}else{const n=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached("lm_head",n,[this.normed,this._lmHeadQWeight,this._lmHeadScales,this.logits,s],this.wg4(e))}}else{const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",t=this.weights[e],i=this.makeUniform("lmhead_params",[this.hiddenSize,this.vocabSize]);this.runCached("lm_head","bf16_matvec",[this.normed,t,this.logits,i],this.wg(this.vocabSize))}}rmsNorm(a,e,t,i){const s=this.makeUniform("rmsnorm_params",[this.hiddenSize,this.rmsEps]);i?this.runCached(i,"rmsnorm",[a,t,e,s],1):this.run("rmsnorm",[a,t,e,s],1)}gptqMatvec(a,e,t,i,s){const n=this.getQWeight(t);if(!n.qweight)return;const r=i/this.groupSize,u=r%4===0;if(this.useSplitK&&u){let h=this.splitKSplits;for(;h>1&&r%(h*4)!==0;)h>>=1;if(h>1){const o=this.makeUniform(`mv_sk_${i}_${s}_${h}`,[i,s,this.groupSize,h]);this.runCached(`sk4tf16_${t}`,"gptq_matvec_4t_f16_sk",[a,n.qweight,n.scales,this.splitKPartials,o],this.wg8(s),h);const 
c=this.makeUniform(`rsk_${s}_${h}`,[s,h]);this.runCached(`rsk_${t}`,"reduce_splitk",[this.splitKPartials,e,c],this.wg(s));return}}if(u){const h=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t",o=this.gpu.hasF16?"gptq4tf16_":"gptq4t_",c=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);this.runCached(`${o}${t}`,h,[a,n.qweight,n.scales,e,c],this.wg8(s))}else{const h=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec",o=this.gpu.hasF16?"gptqf16_":"gptq_",c=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);this.runCached(`${o}${t}`,h,[a,n.qweight,n.scales,e,c],this.wg4(s))}}gptqMatvecOp(a,e,t,i,s){const n=this.getQWeight(t);if(!n.qweight)return null;if(i/this.groupSize%4===0){const h=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t",o=this.gpu.hasF16?"gptq4tf16_":"gptq4t_",c=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);return this.prepOpCached(`${o}${t}`,h,[a,n.qweight,n.scales,e,c],this.wg8(s))}else{const h=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec",o=this.gpu.hasF16?"gptqf16_":"gptq_",c=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);return this.prepOpCached(`${o}${t}`,h,[a,n.qweight,n.scales,e,c],this.wg4(s))}}bf16Matvec(a,e,t,i,s,n){const r=this.makeUniform(`bf16mv_${i}_${s}`,[i,s]);n?this.runCached(n,"bf16_matvec",[a,t,e,r],this.wg(s)):this.run("bf16_matvec",[a,t,e,r],this.wg(s))}siluMul(a,e,t,i,s){const n=this.makeUniform(`silu_${i}`,[i]);s?this.runCached(s,"silu_mul",[a,e,t,n],this.wg(i)):this.run("silu_mul",[a,e,t,n],this.wg(i))}addVectors(a,e,t,i){const s=this.makeUniform(`add_${t}`,[t]);i?this.runCached(i,"add",[a,e,s],this.wg(t)):this.run("add",[a,e,s],this.wg(t))}addAndRmsNorm(a,e,t,i,s){const n=this.makeUniform("add_rmsnorm_params",[this.hiddenSize,this.rmsEps]);s?this.runCached(s,"add_rmsnorm",[a,e,i,t,n],1):this.run("add_rmsnorm",[a,e,i,t,n],1)}mlp(a,e){const 
t=`model.language_model.layers.${e}.mlp`,i=this.hiddenSize,s=this.intermediateSize,n=this.getQWeight(`${t}.gate_proj`),r=this.getQWeight(`${t}.up_proj`),u=this.makeUniform(`fused_mv_${i}_${s}`,[i,s,this.groupSize]);if(i/this.groupSize%4===0){const c=this.gpu.hasF16?"fused_gate_up_silu_4t_f16":"fused_gate_up_silu_4t",m=this.gpu.hasF16?"fused_gus4t16_":"fused_gus4t_";this.runCached(`${m}${e}`,c,[a,n.qweight,n.scales,r.qweight,r.scales,this.mlpIntermediate,u],this.wg8(s))}else{const c=this.gpu.hasF16?"fused_gate_up_silu_f16":"fused_gate_up_silu",m=this.gpu.hasF16?"fused_gus16_":"fused_gus_";this.runCached(`${m}${e}`,c,[a,n.qweight,n.scales,r.qweight,r.scales,this.mlpIntermediate,u],this.wg4(s))}this.gptqMatvec(this.mlpIntermediate,this.mlpOut,`${t}.down_proj`,s,i)}fullAttentionFused(a,e,t,i,s){s=s||this.normed;const n=`model.language_model.layers.${t}.self_attn`,r=this.hiddenSize,u=this.headDim,h=this.numHeads,o=this.numKVHeads,c=h/o,m=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,b=this.weights[f];if(m){const y=this.makeUniform("add_rmsnorm_params",[r,this.rmsEps]);this.runCached(`fa_norm_${t}`,"rmsnorm",[a,b,s,y],1)}else{const y=this.makeUniform("add_rmsnorm_params",[r,this.rmsEps]);this.runCached(`fa_norm_${t}`,"add_rmsnorm_ro",[a,e,b,s,y],1)}const v=this.gptqMatvecOp(s,this.qProjFull,`${n}.q_proj`,r,h*u*2),U=this.gptqMatvecOp(s,this.kProj,`${n}.k_proj`,r,o*u),w=this.gptqMatvecOp(s,this.vProj,`${n}.v_proj`,r,o*u);this.gpu.dispatchMulti([v,U,w].filter(Boolean));const p=this.kvCache[t],_=this._fusedSQKParams[t];this._gqaDv.setUint32(0,i,!0),this.gpu.device.queue.writeBuffer(_,16,this._gqaData,0,4),this.runCached(`fa_sqk_${t}`,this._splitQKNormShaderKey,[this.qProjFull,this.kProj,this.vProj,this.qProj,this.qGate,p.keys,p.values,_],h+o);const 
g=i+1,l=this._forceMinSplits||1,S=Math.max(l,Math.min(Math.max(1,Math.ceil(g/32)),this._maxGqaSplits)),B=S>1?this._gqaPartials:this.attnOut;/* S is the GQA split count: at least _forceMinSplits (or 1), otherwise ceil((i + 1) / 32) clamped to the range 1.._maxGqaSplits. With S > 1 the attention kernel targets _gqaPartials and gqa_reduce is then dispatched with _gqaPartials and attnOut. */if(this._gqaDv.setUint32(0,g,!0),this._gqaDv.setUint32(4,u,!0),this._gqaDv.setUint32(8,o,!0),this._gqaDv.setUint32(12,h,!0),this._gqaDv.setUint32(16,c,!0),this._gqaDv.setUint32(20,S,!0),this.gpu.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData),this.run("gqa_attention_head",[this.qProj,p.keys,p.values,B,this._gqaParamBuf],h,S),S>1){this._gqaReduceParams||(this._gqaReduceParams=this.gpu.device.createBuffer({size:16,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"gqa_reduce_params"}));const y=new Uint8Array(16),C=new DataView(y.buffer);C.setUint32(0,u,!0),C.setUint32(4,S,!0),C.setUint32(8,h,!0),this.gpu.device.queue.writeBuffer(this._gqaReduceParams,0,y),this.run("gqa_reduce",[this._gqaPartials,this.attnOut,this._gqaReduceParams],h)}/* Output projection over numHeads * headDim inputs; the kernel is bound to attnOut, qGate, the o_proj weights and this.qProj. */const d=this.getQWeight(`${n}.o_proj`),P=h*u,k=P/this.groupSize%4===0,$=this.makeUniform(`fused_sig_mv_${P}_${r}`,[P,r,this.groupSize]);if(k){const y=this.gpu.hasF16?"fused_sigmoid_gptq_4t_f16":"fused_sigmoid_gptq_4t",C=this.gpu.hasF16?"fused_sig4t16_oproj_":"fused_sig4t_oproj_";this.runCached(`${C}${t}`,y,[this.attnOut,this.qGate,d.qweight,d.scales,this.qProj,$],this.wg8(r))}else{const y=this.gpu.hasF16?"fused_sigmoid_gptq_f16":"fused_sigmoid_gptq",C=this.gpu.hasF16?"fused_sig16_oproj_":"fused_sig_oproj_";this.runCached(`${C}${t}`,y,[this.attnOut,this.qGate,d.qweight,d.scales,this.qProj,$],this.wg4(r))}}/** linearAttentionFused(a, e, t, i): linear-attention sub-block of layer t. a and e feed the input norm the same way as in fullAttentionFused (rmsnorm on layer 0, add_rmsnorm_ro otherwise); i is the buffer bound as the norm result and defaults to this.normed. It then issues in_proj_qkv and in_proj_z (plus in_proj_a and in_proj_b when this.abQuantized) through dispatchMulti, runs fused_conv_deltanet_norm against the per-layer linConvHist and linState buffers, and finishes with the fused SiLU out_proj kernel bound to this.attnOut. */linearAttentionFused(a,e,t,i){i=i||this.normed;const s=`model.language_model.layers.${t}.linear_attn`,n=this.hiddenSize,r=this.textCfg.linear_num_key_heads,u=this.textCfg.linear_key_head_dim,h=this.textCfg.linear_value_head_dim,o=this.linValueHeads,m=o/r*h,f=r*(u+u+m),b=this.linValueDim,v=t===0,U=`model.language_model.layers.${t}.input_layernorm.weight`,w=this.weights[U];if(v){const 
q=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.runCached(`la_norm_${t}`,"rmsnorm",[a,w,i,q],1)}else{const q=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.runCached(`la_norm_${t}`,"add_rmsnorm_ro",[a,e,w,i,q],1)}/* Input projections are prepared as ops and issued in one dispatchMulti call; null ops (weights without a qweight) are filtered out first. */{const q=[this.gptqMatvecOp(i,this.linQKV,`${s}.in_proj_qkv`,n,f),this.gptqMatvecOp(i,this.linZ,`${s}.in_proj_z`,n,b)];this.abQuantized&&(q.push(this.gptqMatvecOp(i,this.linAlpha,`${s}.in_proj_a`,n,o)),q.push(this.gptqMatvecOp(i,this.linBeta,`${s}.in_proj_b`,n,o))),this.gpu.dispatchMulti(q.filter(Boolean))}const p=this.weights[`${s}.conv1d.weight`],_=this.weights[`${s}.A_log`],g=this.weights[`${s}.dt_bias`],l=this.weights[`${s}.norm.weight`];/* The 32-byte params buffer is built once per shape key and memoised in this.paramBufs. Layout: u32 linear_num_key_heads, u32 linear_key_head_dim, u32 linear_value_head_dim, u32 qkv width, f32 rmsEps, u32 at byte 20 (0 in the abQuantized branch, hiddenSize in the other), u32 linValueHeads. The abQuantized branch binds linAlpha and linBeta; the other branch binds the normed input and linABWeight of the layer in those two slots. */if(this.abQuantized){const q=`fused_cdn_q_${r}_${u}_${h}_${f}_${o}`;let k=this.paramBufs[q];if(!k){const $=new ArrayBuffer(32),y=new DataView($);y.setUint32(0,r,!0),y.setUint32(4,u,!0),y.setUint32(8,h,!0),y.setUint32(12,f,!0),y.setFloat32(16,this.rmsEps,!0),y.setUint32(20,0,!0),y.setUint32(24,o,!0),k=this.gpu.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"fused_cdn_q_params"}),this.gpu.device.queue.writeBuffer(k,0,new Uint8Array($)),this.paramBufs[q]=k}this.runCached(`fused_cdn_${t}`,"fused_conv_deltanet_norm",[this.linQKV,this.linConvHist[t],p,this.linState[t],this.linOut,this.linAlpha,this.linBeta,_,g,l,k],r)}else{const q=`fused_cdn_ext_${r}_${u}_${h}_${f}_${n}_${o}`;let k=this.paramBufs[q];if(!k){const y=new ArrayBuffer(32),C=new DataView(y);C.setUint32(0,r,!0),C.setUint32(4,u,!0),C.setUint32(8,h,!0),C.setUint32(12,f,!0),C.setFloat32(16,this.rmsEps,!0),C.setUint32(20,n,!0),C.setUint32(24,o,!0),k=this.gpu.device.createBuffer({size:32,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"fused_cdn_ext_params"}),this.gpu.device.queue.writeBuffer(k,0,new Uint8Array(y)),this.paramBufs[q]=k}const 
$=this.linABWeight[t];this.runCached(`fused_cdn_${t}`,"fused_conv_deltanet_norm",[this.linQKV,this.linConvHist[t],p,this.linState[t],this.linOut,i,$,_,g,l,k],r)}const S=this.getQWeight(`${s}.out_proj`),d=b/this.groupSize%4===0,P=this.makeUniform(`fused_silu_mv_${b}_${n}`,[b,n,this.groupSize]);if(d){const q=this.gpu.hasF16?"fused_silu_gptq_4t_f16":"fused_silu_gptq_4t",k=this.gpu.hasF16?"fused_silu4t16_oproj_":"fused_silu4t_oproj_";this.runCached(`${k}${t}`,q,[this.linZ,this.linOut,S.qweight,S.scales,this.attnOut,P],this.wg8(n))}else{const q=this.gpu.hasF16?"fused_silu_gptq_f16":"fused_silu_gptq",k=this.gpu.hasF16?"fused_silu16_oproj_":"fused_silu_oproj_";this.runCached(`${k}${t}`,q,[this.linZ,this.linOut,S.qweight,S.scales,this.attnOut,P],this.wg4(n))}}/** fusedNormMLP(a, e, t, i, s, n): post-attention norm plus MLP for layer s. Runs three_way_add_rmsnorm in one workgroup with bindings [a, t, i, normWeight, e, n, params] and then this.mlp(n, s); n defaults to this.normed. The runCached key carries an a or b suffix depending on whether a is this.hidden, since callers alternate the two hidden buffers between layers. */fusedNormMLP(a,e,t,i,s,n){n=n||this.normed;const r=this.hiddenSize,u=`model.language_model.layers.${s}.post_attention_layernorm.weight`,h=this.weights[u],o=this.makeUniform("add_rmsnorm_params",[r,this.rmsEps]),c=`mlp_norm_${s}_${a===this.hidden?"a":"b"}`;this.runCached(c,"three_way_add_rmsnorm",[a,t,i,h,e,n,o],1),this.mlp(n,s)}/** decoderLayer(a, e, t, i, s): one decoder layer. a is the layer index, e the sequence position (only forwarded to full attention), t and i the two hidden buffers passed on to fusedNormMLP, and s the buffer passed as second norm input (forward() passes zeroBuf for the first layer and mlpOut afterwards). Picks fullAttentionFused or linearAttentionFused from this.layerTypes[a] and hands the matching attention result buffer (this.qProj or this.attnOut) to fusedNormMLP. */decoderLayer(a,e,t,i,s){let n;this.layerTypes[a]==="full_attention"?(this.fullAttentionFused(t,s,a,e),n=this.qProj):(this.linearAttentionFused(t,s,a),n=this.attnOut),this.fusedNormMLP(t,i,s,n,a)}/** forward(a, e): encode one single-token step for token a at position e (embedding, every decoder layer with the hidden buffers swapped after each, final add plus norm, lm head, then the GPU sampling kernels when temperature > 0 or argmax otherwise), copy 8 bytes of argmaxResult to the readback buffer, submit, and resolve to the token returned by _readAndSample(). Defaults when unset: temperature 0.7, topK 20, topP 0.8, repetitionPenalty 1, presencePenalty 0. NOTE(review): the encoding part appears to duplicate forwardSubmit statement for statement; forward could delegate to forwardSubmit and then return _readAndSample(). */async forward(a,e){this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);let t=this.hidden,i=this.hiddenB,s=this.zeroBuf;for(let h=0;h<this.numLayers;h++){this.decoderLayer(h,e,t,i,s),s=this.mlpOut;const o=t;t=i,i=o}this.mtp&&this._mtpSaveHidden(t,this.mlpOut);const n=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(t,this.mlpOut,this.normed,n,"add_final_norm"),this._dispatchLmHead();const r=this.temperature??.7;if(r>0){const h=this.repetitionPenalty??1,o=this.presencePenalty??0;if(this._recentTokenCount>0&&(h>1||o>0)){this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const 
b=this._makeMixedUniform("penalty_params",[{u:this._recentTokenCount},{f:h},{f:o},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,b],Math.ceil(this._recentTokenCount/256))}const c=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,c],1);/* A fresh unsigned 32-bit random value is passed to gpu_sample on every step as the last sample_params field. */const m=Math.random()*4294967295>>>0,f=this._makeMixedUniform("sample_params",[{f:r},{u:this.topK??20},{f:this.topP??.8},{u:m}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,f],1)}else{const h=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,h],1)}return this.gpu._passCount!==void 0&&!this._passCountLogged&&(console.log(`[PERF] ${this.gpu._passCount} compute passes per token`),this._passCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch(),this._readAndSample()}/** forwardSubmit(a, e): encode and submit the same single-token step as forward() for token a at position e, but stop after endBatch() and return nothing. The chosen token stays in this._argmaxReadback; presumably a caller maps it later through _readAndSample() - confirm against callers. */forwardSubmit(a,e){this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);let t=this.hidden,i=this.hiddenB,s=this.zeroBuf;for(let h=0;h<this.numLayers;h++){this.decoderLayer(h,e,t,i,s),s=this.mlpOut;const o=t;t=i,i=o}this.mtp&&this._mtpSaveHidden(t,this.mlpOut);const n=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(t,this.mlpOut,this.normed,n,"add_final_norm"),this._dispatchLmHead();const r=this.temperature??.7;if(r>0){const h=this.repetitionPenalty??1,o=this.presencePenalty??0;if(this._recentTokenCount>0&&(h>1||o>0)){this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);const b=this._makeMixedUniform("penalty_params",[{u:this._recentTokenCount},{f:h},{f:o},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,b],Math.ceil(this._recentTokenCount/256))}const 
c=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,c],1);const m=Math.random()*4294967295>>>0,f=this._makeMixedUniform("sample_params",[{f:r},{u:this.topK??20},{f:this.topP??.8},{u:m}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,f],1)}else{const h=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,h],1)}this.gpu._passCount!==void 0&&!this._passCountLogged&&(console.log(`[PERF] ${this.gpu._passCount} compute passes per token`),this._passCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch()}/** _buildReplayFromRecording(): stop the GPU op recording and flatten it into this._replayFlat, one {p, bg, x, y, gqa} entry per dispatch (entries flagged multi contribute each of their ops). gqa is 1 for dispatches that use the gqa_attention_head pipeline, which lets the replay substitute the per-step split count for y. Logs the number of recorded ops. */_buildReplayFromRecording(){const a=this.gpu.stopRecording(),e=this.pipelines.gqa_attention_head,t=[];for(const i of a)if(i.multi)for(const s of i.ops)t.push({p:s.pipeline,bg:s.bindGroup,x:s.wgX,y:s.wgY,gqa:s.pipeline===e?1:0});else t.push({p:i.pipeline,bg:i.bindGroup,x:i.wgX,y:i.wgY,gqa:i.pipeline===e?1:0});this._replayFlat=t,console.log(`[PERF] Recorded ${t.length} flat replay ops for core forward`)}/** _replayCoreForward(a): re-issue the recorded dispatches for position a on the currently open single pass. Before replaying it rewrites the position-dependent uniforms: position a at byte 16 of the split/QK-norm params buffer of every full-attention layer, the first 24 bytes of the GQA params (key count a + 1, headDim, numKVHeads, numHeads, heads per KV head, split count) and, when it exists, the gqa_reduce params. NOTE(review): gpu._singlePass is dereferenced without a null check, so this relies on beginBatch() having opened a pass - confirm for the case where singlePassMode is off. */_replayCoreForward(a){const e=this.gpu,t=a+1,i=Math.min(Math.max(1,Math.ceil(t/32)),this._maxGqaSplits);this._gqaDv.setUint32(0,a,!0);for(let 
u=0;u<this.numLayers;u++)this.layerTypes[u]==="full_attention"&&e.device.queue.writeBuffer(this._fusedSQKParams[u],16,this._gqaData,0,4);this._gqaDv.setUint32(0,t,!0),this._gqaDv.setUint32(4,this.headDim,!0),this._gqaDv.setUint32(8,this.numKVHeads,!0),this._gqaDv.setUint32(12,this.numHeads,!0),this._gqaDv.setUint32(16,this.numHeads/this.numKVHeads,!0),this._gqaDv.setUint32(20,i,!0),e.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData,0,24),this._gqaReduceParams&&(this._gqaDv.setUint32(0,this.headDim,!0),this._gqaDv.setUint32(4,i,!0),this._gqaDv.setUint32(8,this.numHeads,!0),this._gqaDv.setUint32(12,0,!0),e.device.queue.writeBuffer(this._gqaReduceParams,0,this._gqaData,0,16));/* Replay loop: dispatches flagged gqa use the current split count i as their Y dimension; every other dispatch reuses its recorded Y. */const s=e._singlePass,n=this._replayFlat,r=n.length;for(let u=0;u<r;u++){const h=n[u];s.setPipeline(h.p),s.setBindGroup(0,h.bg),s.dispatchWorkgroups(h.x,h.gqa?i:h.y)}}/** embeddingB2(a, e): embed two tokens for the batch-of-2 path: token a is bound to bytes 0 .. hiddenSize*4 of this.b2.hidden and token e to the following hiddenSize*4 bytes. NOTE(review): the second uniform key embeds the token id e; if makeUniform memoises by key this may create one uniform per distinct token - confirm. */embeddingB2(a,e){const t=this.hiddenSize,i=this.weights["model.language_model.embed_tokens.weight"],s=this.makeUniform("emb_params",[a,t]);this.runWithOffsets("embedding",[i,{buffer:this.b2.hidden,offset:0,size:t*4},s],this.wg(t));const n=this.makeUniform(`emb_params_${e}`,[e,t]);this.runWithOffsets("embedding",[i,{buffer:this.b2.hidden,offset:t*4,size:t*4},n],this.wg(t))}/** gptqMatvecB2(a, e, t, i, s): batch-of-2 counterpart of the quantized mat-vec. Dispatches gptq_matmul_b2 (the _f16 variant when gpu.hasF16) over wg4(s) workgroups with bindings [a, qweight, scales, e, params], params = [i, s, groupSize]; returns without dispatching when weight t has no qweight. */gptqMatvecB2(a,e,t,i,s){const n=this.getQWeight(t);if(!n.qweight)return;const r=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]),u=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2",h=this.gpu.hasF16?"b2gptqf16_":"b2gptq_";this.runCached(`${h}${t}`,u,[a,n.qweight,n.scales,e,r],this.wg4(s))}/** gptqMatvecB2Op(a, e, t, i, s): same op as gptqMatvecB2 but only prepared through prepOpCached and returned (null when weight t has no qweight), so callers can issue several at once with dispatchMulti. */gptqMatvecB2Op(a,e,t,i,s){const n=this.getQWeight(t);if(!n.qweight)return null;const r=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]),u=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2",h=this.gpu.hasF16?"b2gptqf16_":"b2gptq_";return this.prepOpCached(`${h}${t}`,u,[a,n.qweight,n.scales,e,r],this.wg4(s))}/** fullAttentionB2(a, e, t, i): full-attention sub-block of layer t for two consecutive tokens at positions i and i + 1, using the this.b2 buffers in which the data of the second token follows that of the first. a and e are the norm inputs as in fullAttentionFused. The o_proj result is bound to this.b2.qProj. */fullAttentionB2(a,e,t,i){const 
s=`model.language_model.layers.${t}.self_attn`,n=this.hiddenSize,r=this.b2._dims,u=this.headDim,h=this.numHeads,o=this.numKVHeads,c=h/o,m=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,b=this.weights[f],v=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);/* Layer 0 normalises each token slice with its own rmsnorm dispatch; later layers use the add_rmsnorm_ro_b2 kernel with two workgroups over both tokens. */m?(this.runWithOffsets("rmsnorm",[{buffer:a,offset:0,size:n*4},b,{buffer:this.b2.normed,offset:0,size:n*4},v],1),this.runWithOffsets("rmsnorm",[{buffer:a,offset:n*4,size:n*4},b,{buffer:this.b2.normed,offset:n*4,size:n*4},v],1)):this.run("add_rmsnorm_ro_b2",[a,e,b,this.b2.normed,v],2);const U=this.gptqMatvecB2Op(this.b2.normed,this.b2.qProjFull,`${s}.q_proj`,n,h*u*2),w=this.gptqMatvecB2Op(this.b2.normed,this.b2.kProj,`${s}.k_proj`,n,o*u),p=this.gptqMatvecB2Op(this.b2.normed,this.b2.vProj,`${s}.v_proj`,n,o*u);this.gpu.dispatchMulti([U,w,p].filter(Boolean));const _=this.kvCache[t],g=this._fusedSQKParams[t],l=r.qProjFullSize*4,S=r.kProjSize*4,B=r.vProjSize*4,d=r.qProjSize*4;/* Split/QK-norm runs once per token on the matching byte slices. NOTE(review): both dispatches share the params buffer g; position i and then i + 1 are written with queue.writeBuffer around two dispatches of the same batch. queue.writeBuffer takes effect on the queue timeline ahead of command buffers submitted later, so unless runWithOffsets submits immediately both dispatches may observe i + 1 - confirm (the GQA step that follows uses two separate param buffers). */this._gqaDv.setUint32(0,i,!0),this.gpu.device.queue.writeBuffer(g,16,this._gqaData,0,4),this.runWithOffsets(this._splitQKNormShaderKey,[{buffer:this.b2.qProjFull,offset:0,size:l},{buffer:this.b2.kProj,offset:0,size:S},{buffer:this.b2.vProj,offset:0,size:B},{buffer:this.b2.qProj,offset:0,size:d},{buffer:this.b2.qGate,offset:0,size:d},_.keys,_.values,g],h+o),this._gqaDv.setUint32(0,i+1,!0),this.gpu.device.queue.writeBuffer(g,16,this._gqaData,0,4),this.runWithOffsets(this._splitQKNormShaderKey,[{buffer:this.b2.qProjFull,offset:l,size:l},{buffer:this.b2.kProj,offset:S,size:S},{buffer:this.b2.vProj,offset:B,size:B},{buffer:this.b2.qProj,offset:d,size:d},{buffer:this.b2.qGate,offset:d,size:d},_.keys,_.values,g],h+o);const 
P=i+1,q=i+2;/* Per-token GQA params: key count (i + 1 for the first token, i + 2 for the second) at byte 0 and the token position at byte 24; each token has its own uniform buffer. NOTE(review): byte 20 carries this.partialDim here but the split count in fullAttentionFused, and no Y dimension is passed to the dispatch - confirm the kernel reads both layouts consistently. */this._gqaDv.setUint32(0,P,!0),this._gqaDv.setUint32(4,u,!0),this._gqaDv.setUint32(8,o,!0),this._gqaDv.setUint32(12,h,!0),this._gqaDv.setUint32(16,c,!0),this._gqaDv.setUint32(20,this.partialDim,!0),this._gqaDv.setUint32(24,i,!0),this._gqaDv.setFloat32(28,this.ropeTheta,!0),this.gpu.device.queue.writeBuffer(this.b2._gqaParamBuf0,0,this._gqaData),this.runWithOffsets("gqa_attention_head",[{buffer:this.b2.qProj,offset:0,size:d},_.keys,_.values,{buffer:this.b2.attnOut,offset:0,size:r.qProjSize*4},this.b2._gqaParamBuf0],h),this._gqaDv.setUint32(0,q,!0),this._gqaDv.setUint32(24,i+1,!0),this.gpu.device.queue.writeBuffer(this.b2._gqaParamBuf1,0,this._gqaData),this.runWithOffsets("gqa_attention_head",[{buffer:this.b2.qProj,offset:d,size:d},_.keys,_.values,{buffer:this.b2.attnOut,offset:r.qProjSize*4,size:r.qProjSize*4},this.b2._gqaParamBuf1],h);/* NOTE(review): the o_proj kernel name is the _f16 variant unconditionally, unlike the hasF16 switch in gptqMatvecB2 - presumably the B2 path requires f16 support; confirm. */const k=this.getQWeight(`${s}.o_proj`),$=h*u,y=this.makeUniform(`fused_sig_mv_${$}_${n}`,[$,n,this.groupSize]);this.runCached(`b2_fused_sig_oproj_${t}`,"fused_sigmoid_gptq_b2_f16",[this.b2.attnOut,this.b2.qGate,k.qweight,k.scales,this.b2.qProj,y],this.wg4(n))}/** linearAttentionB2(a, e, t): linear-attention sub-block of layer t for two consecutive tokens held back to back in the this.b2 buffers. a and e are the norm inputs. The fused_conv_deltanet_norm kernel is dispatched once per token, first slice then second, against the shared per-layer linConvHist and linState buffers; the out_proj kernel is bound to this.b2.attnOut. */linearAttentionB2(a,e,t){const i=`model.language_model.layers.${t}.linear_attn`,s=this.hiddenSize,n=this.b2._dims,r=n.linHeads,u=n.linKeyDim,h=n.linValDim;n.linEVD;const o=n.linQKVDim,c=n.valueDim,m=t===0,f=`model.language_model.layers.${t}.input_layernorm.weight`,b=this.weights[f],v=this.makeUniform("add_rmsnorm_params",[s,this.rmsEps]);m?(this.runWithOffsets("rmsnorm",[{buffer:a,offset:0,size:s*4},b,{buffer:this.b2.normed,offset:0,size:s*4},v],1),this.runWithOffsets("rmsnorm",[{buffer:a,offset:s*4,size:s*4},b,{buffer:this.b2.normed,offset:s*4,size:s*4},v],1)):this.run("add_rmsnorm_ro_b2",[a,e,b,this.b2.normed,v],2);{const 
d=[this.gptqMatvecB2Op(this.b2.normed,this.b2.linQKV,`${i}.in_proj_qkv`,s,o),this.gptqMatvecB2Op(this.b2.normed,this.b2.linZ,`${i}.in_proj_z`,s,c)];this.abQuantized&&(d.push(this.gptqMatvecB2Op(this.b2.normed,this.b2.linAlpha,`${i}.in_proj_a`,s,this.linValueHeads)),d.push(this.gptqMatvecB2Op(this.b2.normed,this.b2.linBeta,`${i}.in_proj_b`,s,this.linValueHeads))),this.gpu.dispatchMulti(d.filter(Boolean))}const U=this.weights[`${i}.conv1d.weight`],w=this.weights[`${i}.A_log`],p=this.weights[`${i}.dt_bias`],_=this.weights[`${i}.norm.weight`],g=o*4,l=c*4;/* g and l are the per-token byte sizes of the qkv and value slices. NOTE(review): the params buffer is only looked up in this.paramBufs here and never created; linearAttentionFused builds an entry lazily under the same key format, so this lookup yields undefined unless that path has already run with matching dimensions - confirm. */if(this.abQuantized){const d=this.linValueHeads,P=d*4,q=`fused_cdn_q_${r}_${u}_${h}_${o}_${d}`,k=this.paramBufs[q];this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:0,size:g},this.linConvHist[t],U,this.linState[t],{buffer:this.b2.linOut,offset:0,size:l},{buffer:this.b2.linAlpha,offset:0,size:P},{buffer:this.b2.linBeta,offset:0,size:P},w,p,_,k],r),this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:g,size:g},this.linConvHist[t],U,this.linState[t],{buffer:this.b2.linOut,offset:l,size:l},{buffer:this.b2.linAlpha,offset:P,size:P},{buffer:this.b2.linBeta,offset:P,size:P},w,p,_,k],r)}else{const d=`fused_cdn_ext_${r}_${u}_${h}_${o}_${s}_${this.linValueHeads}`,P=this.paramBufs[d],q=this.linABWeight[t];this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:0,size:g},this.linConvHist[t],U,this.linState[t],{buffer:this.b2.linOut,offset:0,size:l},{buffer:this.b2.normed,offset:0,size:s*4},q,w,p,_,P],r),this.runWithOffsets("fused_conv_deltanet_norm",[{buffer:this.b2.linQKV,offset:g,size:g},this.linConvHist[t],U,this.linState[t],{buffer:this.b2.linOut,offset:l,size:l},{buffer:this.b2.normed,offset:s*4,size:s*4},q,w,p,_,P],r)}const 
S=this.getQWeight(`${i}.out_proj`),B=this.makeUniform(`fused_silu_mv_${c}_${s}`,[c,s,this.groupSize]);this.runCached(`b2_fused_silu_oproj_${t}`,"fused_silu_gptq_b2_f16",[this.b2.linZ,this.b2.linOut,S.qweight,S.scales,this.b2.attnOut,B],this.wg4(s))}/** fusedNormMLPB2(a, e, t, i, s): two-token version of fusedNormMLP for layer s: three_way_add_rmsnorm_b2 over two workgroups bound to this.b2.normed, the fused gate/up/SiLU b2 kernel bound to this.b2.mlpIntermediate, then down_proj through gptqMatvecB2 into this.b2.mlpOut. Unlike fusedNormMLP the norm goes through run rather than runCached, and the fused kernel name is the _f16 variant unconditionally. */fusedNormMLPB2(a,e,t,i,s){const n=this.hiddenSize,r=this.intermediateSize,u=`model.language_model.layers.${s}.mlp`,h=`model.language_model.layers.${s}.post_attention_layernorm.weight`,o=this.weights[h],c=this.makeUniform("add_rmsnorm_params",[n,this.rmsEps]);this.run("three_way_add_rmsnorm_b2",[a,t,i,o,e,this.b2.normed,c],2);const m=this.getQWeight(`${u}.gate_proj`),f=this.getQWeight(`${u}.up_proj`),b=this.makeUniform(`fused_mv_${n}_${r}`,[n,r,this.groupSize]);this.runCached(`b2_fused_gus_${s}`,"fused_gate_up_silu_b2_f16",[this.b2.normed,m.qweight,m.scales,f.qweight,f.scales,this.b2.mlpIntermediate,b],this.wg4(r)),this.gptqMatvecB2(this.b2.mlpIntermediate,this.b2.mlpOut,`${u}.down_proj`,r,n)}/** decoderLayerB2(a, e, t, i, s): two-token decoder layer with the same argument order as decoderLayer (layer index, position of the first token, the two hidden buffers, second norm input), routed through the B2 attention and MLP helpers and the this.b2 result buffers. */decoderLayerB2(a,e,t,i,s){let n;this.layerTypes[a]==="full_attention"?(this.fullAttentionB2(t,s,a,e),n=this.b2.qProj):(this.linearAttentionB2(t,s,a),n=this.b2.attnOut),this.fusedNormMLPB2(t,i,s,n,a)}/** _dispatchLmHeadB2(): project this.b2.normed to this.b2.logits for both tokens. With a quantized head (this._lmHeadQWeight) one gptq_matmul_b2 dispatch is used; otherwise bf16_matvec runs once per token slice against the embedding matrix, or against lm_head.weight when tie_word_embeddings is explicitly false. */_dispatchLmHeadB2(){if(this._lmHeadQWeight){const a=this.hiddenSize,e=this.vocabSize,t=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]),i=this.gpu.hasF16?"gptq_matmul_b2_f16":"gptq_matmul_b2";this.runCached("b2_lm_head",i,[this.b2.normed,this._lmHeadQWeight,this._lmHeadScales,this.b2.logits,t],this.wg4(e))}else{const e=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",t=this.weights[e],i=this.hiddenSize,s=this.vocabSize,n=this.makeUniform("lmhead_params",[i,s]);this.runWithOffsets("bf16_matvec",[{buffer:this.b2.normed,offset:0,size:i*4},t,{buffer:this.b2.logits,offset:0,size:s*4},n],this.wg(s)),this.runWithOffsets("bf16_matvec",[{buffer:this.b2.normed,offset:i*4,size:i*4},t,{buffer:this.b2.logits,offset:s*4,size:s*4},n],this.wg(s))}}/** forwardB2(a, e, t): run one two-token step for tokens a and e, with t passed to every decoder layer as the position of the first token, and resolve to the token pair returned by _readAndSampleB2. Allocates the B2 buffers on first use. */async 
forwardB2(a,e,t){this.b2||this.initB2Buffers(),this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embeddingB2(a,e);let i=this.b2.hidden,s=this.b2.hiddenB,n=this.b2.zeroBuf;for(let f=0;f<this.numLayers;f++){this.decoderLayerB2(f,t,i,s,n),n=this.b2.mlpOut;const b=i;i=s,s=b}/* With MTP enabled, hiddenSize*4 bytes are copied using hiddenSize*4 as the fourth copyBuffer argument and 0 as the fifth; presumably these are source and destination offsets, which would select the slice of the second token - confirm against copyBuffer. */if(this.mtp){const f=this.hiddenSize;this.gpu.copyBuffer(i,this.mtp.savedHidden,f*4,f*4,0),this.gpu.copyBuffer(this.b2.mlpOut,this.mtp.savedMlpOut,f*4,f*4,0)}const r=this.weights["model.language_model.norm.weight"],u=this.makeUniform("add_rmsnorm_params",[this.hiddenSize,this.rmsEps]);this.run("add_rmsnorm_b2",[i,this.b2.mlpOut,r,this.b2.normed,u],2),this._dispatchLmHeadB2();/* When temperature > 0 the 2048-byte top-k table of each token is copied back for CPU sampling; otherwise the 8-byte argmax results are copied. Unlike forward(), no rep_penalty or gpu_sample kernel runs here; penalties are applied on the CPU in _sampleFromTopkReadback. */const o=(this.temperature??.7)>0,c=this.vocabSize,m=this.makeUniform("argmax_params",[c]);return o?(this.runWithOffsets("topk_extract",[{buffer:this.b2.logits,offset:0,size:c*4},this.b2.topkResult0,m],1),this.runWithOffsets("topk_extract",[{buffer:this.b2.logits,offset:c*4,size:c*4},this.b2.topkResult1,m],1)):(this.runWithOffsets("argmax",[{buffer:this.b2.logits,offset:0,size:c*4},this.b2.argmaxResult0,m],1),this.runWithOffsets("argmax",[{buffer:this.b2.logits,offset:c*4,size:c*4},this.b2.argmaxResult1,m],1)),this.gpu._passCount!==void 0&&!this._b2PassCountLogged&&(console.log(`[PERF] B2: ${this.gpu._passCount} compute passes per 2-token step`),this._b2PassCountLogged=!0),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),o?(this.gpu.copyBuffer(this.b2.topkResult0,this.b2.topkReadback0,2048),this.gpu.copyBuffer(this.b2.topkResult1,this.b2.topkReadback1,2048)):(this.gpu.copyBuffer(this.b2.argmaxResult0,this.b2.argmaxReadback0,8),this.gpu.copyBuffer(this.b2.argmaxResult1,this.b2.argmaxReadback1,8)),this.gpu.endBatch(),this._readAndSampleB2(o)}/** _readAndSampleB2(a): map the two readback buffers (top-k tables when a is truthy, argmax results otherwise), derive one token per slot, append both to the recent-token window (shifting it left by one when full) and resolve to the two-element token array. */async _readAndSampleB2(a){const e=[0,0];a?(await 
Promise.all([this.b2.topkReadback0.mapAsync(GPUMapMode.READ),this.b2.topkReadback1.mapAsync(GPUMapMode.READ)]),e[0]=this._sampleFromTopkReadback(this.b2.topkReadback0),e[1]=this._sampleFromTopkReadback(this.b2.topkReadback1)):(await Promise.all([this.b2.argmaxReadback0.mapAsync(GPUMapMode.READ),this.b2.argmaxReadback1.mapAsync(GPUMapMode.READ)]),e[0]=new Uint32Array(this.b2.argmaxReadback0.getMappedRange().slice(0))[0],e[1]=new Uint32Array(this.b2.argmaxReadback1.getMappedRange().slice(0))[0],this.b2.argmaxReadback0.unmap(),this.b2.argmaxReadback1.unmap());for(const t of e)this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=t:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=t);return e}/** _sampleFromTopkReadback(a): decode the mapped readback buffer a, read as 256 interleaved pairs (u32 token id at even word, f32 value at odd word), into fresh arrays and unmap it. For candidates present in the recent-token window the presence penalty is subtracted and the repetition penalty divides positive values and multiplies the others. The result is sampled through _sampleFromArrays. */_sampleFromTopkReadback(a){const e=a.getMappedRange(),t=new Uint32Array(e),i=new Float32Array(e),s=new Uint32Array(256),n=new Float32Array(256);for(let h=0;h<256;h++)s[h]=t[h*2],n[h]=i[h*2+1];a.unmap();const r=this.presencePenalty??0,u=this.repetitionPenalty??1;if((r>0||u>1)&&this._recentTokenCount>0){const h=new Set;for(let o=0;o<this._recentTokenCount;o++)h.add(this._recentTokens[o]);for(let o=0;o<256;o++)h.has(s[o])&&(r>0&&(n[o]-=r),u>1&&(n[o]=n[o]>0?n[o]/u:n[o]*u))}return this._sampleFromArrays(s,n,256)}/** _readAndSample(): map this._argmaxReadback, take its first u32 as the chosen token, unmap, record the token in the recent-token window (shifting it left by one when full) and resolve to the token. */async _readAndSample(){await this._argmaxReadback.mapAsync(GPUMapMode.READ);const a=new Uint32Array(this._argmaxReadback.getMappedRange().slice(0));this._argmaxReadback.unmap();const e=a[0];return this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=e:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=e),e}/** _sampleFromArrays(a, e, t): CPU top-k / top-p sampling over t candidates with ids a and values e. Selects the min(topK, t) largest values by repeated linear scans, applies a temperature softmax relative to the largest, cuts the list at cumulative probability topP and draws one id; returns the top id as a fallback. Scratch buffers are cached on the instance. NOTE(review): the scratch buffers hold 64 entries (256 for the used flags) and the selection count is not clamped to them; with topK above 64 the extra writes fall outside the typed arrays and are dropped, which turns the probability sum into NaN - consider clamping. */_sampleFromArrays(a,e,t){const i=this.temperature??.7,s=this.topP??.8,n=this.topK??20,r=Math.min(n,t),u=this._selBuf||(this._selBuf=new Uint32Array(64)),h=this._selValBuf||(this._selValBuf=new Float32Array(64)),o=this._usedBuf||(this._usedBuf=new Uint8Array(256));o.fill(0);for(let _=0;_<r;_++){let g=-1,l=-1/0;for(let 
S=0;S<t;S++)!o[S]&&e[S]>l&&(l=e[S],g=S);if(g<0)break;u[_]=a[g],h[_]=l,o[g]=1}const c=h[0],m=this._probBuf||(this._probBuf=new Float32Array(64));let f=0;for(let _=0;_<r;_++)m[_]=Math.exp((h[_]-c)/i),f+=m[_];for(let _=0;_<r;_++)m[_]/=f;let b=0,v=r;for(let _=0;_<r;_++)if(b+=m[_],b>=s){v=_+1;break}let U=0;for(let _=0;_<v;_++)U+=m[_];const w=Math.random()*U;let p=0;for(let _=0;_<v;_++)if(p+=m[_],p>=w)return u[_];return u[0]}/** _sample(a): CPU sampler over a full logits array a, returning an index into a. A non-positive temperature returns the argmax index. Otherwise it keeps the max(topK, 64) largest logits in a descending insertion-sorted list, softmaxes the first topK of them against the global maximum using the temperature, cuts the list at cumulative probability topP and draws an index, falling back to the top candidate. */_sample(a){const e=this.temperature??.7,t=this.topP??.8,i=this.topK??20,s=a.length;if(e<=0){let p=0,_=a[0];for(let g=1;g<s;g++)a[g]>_&&(_=a[g],p=g);return p}const n=Math.max(i,64),r=new Int32Array(n).fill(-1),u=new Float32Array(n).fill(-1/0);let h=-1/0;for(let p=0;p<s;p++){const _=a[p];if(_>h&&(h=_),_>u[n-1]){let g=n-1;for(;g>0&&_>u[g-1];)u[g]=u[g-1],r[g]=r[g-1],g--;u[g]=_,r[g]=p}}const o=Math.min(i,n),c=new Float32Array(o);let m=0;for(let p=0;p<o&&!(r[p]<0);p++)c[p]=Math.exp((u[p]-h)/e),m+=c[p];for(let p=0;p<o;p++)c[p]/=m;let f=0,b=o;for(let p=0;p<o;p++){if(r[p]<0){b=p;break}if(f+=c[p],f>=t){b=p+1;break}}let v=0;for(let p=0;p<b;p++)v+=c[p];const U=Math.random()*v;let w=0;for(let p=0;p<b;p++)if(w+=c[p],w>=U)return r[p];return r[0]}/** generate(a, e = 512, t): generate up to e tokens after the prompt a (array of token ids) and resolve to the prompt followed by the generated ids. e is capped to the context that remains (maxSeqLen, or 4096 when unset) and an Error is thrown when nothing remains. t, when given, is called as t(token, index); a truthy return stops generation, as does an EOS id. Flow: reset counters, zero the linear-attention state and conv history, prefill with one forward() call per prompt token, then decode in batches of up to 8 steps whose tokens are appended on the GPU and read back together. NOTE(review): an empty prompt skips prefill and pushes null as the first token - consider guarding. NOTE(review): every prefill forward() also records its sampled output in the recent-token window through _readAndSample, so predictions made at prompt positions feed the repetition penalty - confirm this is intended. */async generate(a,e=512,t){this.seqLen=0,this._recentTokenCount=0,this._replayFlat=null;const i=this.maxSeqLen||4096;if(e=Math.min(e,i-a.length),e<=0)throw new Error(`Prompt (${a.length} tokens) exceeds context size (${i})`);const s=[...a],n=this.textCfg.linear_num_key_heads,r=this.textCfg.linear_key_head_dim,u=this.textCfg.linear_value_head_dim,c=this.linValueHeads/n*u,m=n*(r+r+c);for(let B=0;B<this.numLayers;B++)if(this.layerTypes[B]==="linear_attention"){const d=n*r*c*4,P=3*m*4;this.gpu.device.queue.writeBuffer(this.linState[B],0,new Uint8Array(d)),this.gpu.device.queue.writeBuffer(this.linConvHist[B],0,new Uint8Array(P))}let f=null;for(let B=0;B<a.length;B++)f=await this.forward(a[B],B),this.seqLen=B+1;s.push(f);const 
b=this.config.eos_token_id??this.textCfg.eos_token_id,v=Array.isArray(b)?b:b!=null?[b]:[248044,248046];if(t?.(f,0)||v.includes(f))return s;/* NOTE(review): the last prefill forward() has already recorded f in the recent-token window through _readAndSample, so f is appended a second time here - confirm the duplicate is harmless for rep_penalty. */this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=f:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=f),this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4);/* Decode batch size: up to 8 steps are submitted before one readback of the token history buffer. */const w=8;this._tokenHistoryBuf||(this._tokenHistoryBuf=this.gpu.createBuffer("token_history",w*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC),this._tokenHistoryReadback=this.gpu.device.createBuffer({size:w*4,usage:GPUBufferUsage.MAP_READ|GPUBufferUsage.COPY_DST,label:"token_history_readback"}));/* p accumulates batch wall time in ms, _ counts decoded tokens, g is the index of the next generated token (the prefill result was index 0), l is the token embedded by the first step of the next batch, S switches to replay once a replay list exists. */let p=0,_=0,g=1,l=f,S=!1;for(;g<e;){const B=performance.now(),d=Math.min(w,e-g);for(let y=0;y<d;y++){const C=this.seqLen+y;if(this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),y===0?this.embedding(l):this.embeddingFromArgmax(),S)this._replayCoreForward(C);else{let O=this.hidden,G=this.hiddenB,M=this.zeroBuf;/* First step while no replay list exists: record the core forward with _forceMinSplits = 2, which makes fullAttentionFused take the partials plus gqa_reduce path. */y===0&&!this._replayFlat&&(this._forceMinSplits=2,this.gpu.startRecording());for(let F=0;F<this.numLayers;F++){this.decoderLayer(F,C,O,G,M),M=this.mlpOut;const H=O;O=G,G=H}const A=this.weights["model.language_model.norm.weight"];this.addAndRmsNorm(O,this.mlpOut,this.normed,A,"add_final_norm"),this._dispatchLmHead(),y===0&&this.gpu._recording&&(this._forceMinSplits=0,this._buildReplayFromRecording())}const D=this.temperature??.7;if(D>0){const O=this.repetitionPenalty??1,G=this.presencePenalty??0,M=this._recentTokenCount+y;if(M>0&&(O>1||G>0)){const W=this._makeMixedUniform("penalty_params",[{u:Math.min(M,this._repMaxTokens)},{f:O},{f:G},{u:0}]);this.run("rep_penalty",[this.logits,this._gpuRecentTokens,W],Math.ceil(Math.min(M,this._repMaxTokens)/256))}const A=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("topk","topk_extract",[this.logits,this._topkResult,A],1);const 
F=Math.random()*4294967295>>>0,H=this._makeMixedUniform("sample_params",[{f:D},{u:this.topK??20},{f:this.topP??.8},{u:F}]);this.run("gpu_sample",[this._topkResult,this.argmaxResult,H],1)}else{const O=this.makeUniform("argmax_params",[this.vocabSize]);this.runCached("argmax","argmax",[this.logits,this.argmaxResult,O],1)}/* append_token is bound to argmaxResult, the GPU recent-token buffer and the token history buffer, with params [z, y]; presumably it stores the chosen token at window slot z and history index y so the next step can embed it without a CPU round trip (embeddingFromArgmax) - confirm against the shader. */const z=(this._recentTokenCount+y)%this._repMaxTokens,x=this.makeUniform(`append_${y}`,[z,y]);this.run("append_token",[this.argmaxResult,this._gpuRecentTokens,this._tokenHistoryBuf,x],1),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.endBatch()}!S&&this._replayFlat&&(S=!0);/* One readback per batch: copy d tokens of history into the mappable buffer and wait for the mapping. */const P=this.gpu.device.createCommandEncoder();P.copyBufferToBuffer(this._tokenHistoryBuf,0,this._tokenHistoryReadback,0,d*4),this.gpu.device.queue.submit([P.finish()]),await this._tokenHistoryReadback.mapAsync(GPUMapMode.READ);const q=new Uint32Array(this._tokenHistoryReadback.getMappedRange().slice(0,d*4));this._tokenHistoryReadback.unmap();const k=performance.now();p+=k-B,_+=d;let $=!1;for(let y=0;y<d;y++){const C=q[y];s.push(C),this.seqLen++,this._recentTokenCount<this._repMaxTokens?this._recentTokens[this._recentTokenCount++]=C:(this._recentTokens.copyWithin(0,1),this._recentTokens[this._repMaxTokens-1]=C);const D=t?.(C,g);if(g++,D||v.includes(C)){$=!0;break}}if(_%50<w&&console.log(`[T @${_}] ${(p/_).toFixed(1)}ms/tok (batch=${w})`),$)break;l=q[d-1],this.gpu.device.queue.writeBuffer(this._gpuRecentTokens,0,this._recentTokens.buffer,0,this._recentTokenCount*4)}return _>0&&console.log(`[T final @${_}] ${(p/_).toFixed(1)}ms/tok (batch=${w})`),s}/** _quantizeBF16Weight(a, e, t, i): quantize GPU buffer a on the GPU and resolve to {qweight, scales}. e and t are the two matrix dimensions (initMTP passes them as K and N) and i is a label prefix for the created buffers. Allocates e/8 * t u32 words of packed weights and e/groupSize * t scales, runs quantize_bf16_to_int4 with a two-dimensional dispatch of (min(t, 65535), ceil(t / 65535)) workgroups, then pack_f32_to_f16_pairs into the final scales buffer, waits for the queue to drain and destroys the temporary buffers. R wraps the dynamic import and is defined outside this chunk. */async _quantizeBF16Weight(a,e,t,i){const s=this.groupSize,n=e/8,r=e/s,u=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,h=this.gpu.createBuffer(`${i}_qweight`,n*t*4,u),o=this.gpu.createBuffer(`${i}_scales_f32`,r*t*4,u),c=Math.ceil(r*t/2)*4,m=this.gpu.createBuffer(`${i}_scales`,c,u),f=this.gpu.getOrCreatePipeline("quantize_bf16_to_int4",(await R(async()=>{const{SHADERS:k}=await 
import("./gpu-ops-Da8QZLNh.js").then($=>$.b);return{SHADERS:k}},[])).SHADERS.quantize_bf16_to_int4),b=this.gpu.createBufferFromData(`${i}_qparams`,new Uint32Array([e,t,s]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),v=this.gpu.createBindGroup(f,0,[a,h,o,b]),U=65535,w=Math.min(t,U),p=Math.ceil(t/U),_=this.gpu.device.createCommandEncoder(),g=_.beginComputePass();g.setPipeline(f),g.setBindGroup(0,v),g.dispatchWorkgroups(w,p),g.end(),this.gpu.device.queue.submit([_.finish()]);/* Second pass: S is half the scale count rounded up, matching the size of the final scales buffer in u32 words; judging by the kernel name two f32 scales are packed into each word. */const l=this.gpu.getOrCreatePipeline("pack_f32_to_f16_pairs",(await R(async()=>{const{SHADERS:k}=await import("./gpu-ops-Da8QZLNh.js").then($=>$.b);return{SHADERS:k}},[])).SHADERS.pack_f32_to_f16_pairs),S=Math.ceil(r*t/2),B=this.gpu.createBufferFromData(`${i}_pparams`,new Uint32Array([S]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),d=this.gpu.createBindGroup(l,0,[o,m,B]),P=this.gpu.device.createCommandEncoder(),q=P.beginComputePass();return q.setPipeline(l),q.setBindGroup(0,d),q.dispatchWorkgroups(Math.ceil(S/256)),q.end(),this.gpu.device.queue.submit([P.finish()]),await this.gpu.device.queue.onSubmittedWorkDone(),o.destroy(),b.destroy(),B.destroy(),{qweight:h,scales:m}}/** initMTP(a): one-time setup of the MTP head; returns immediately when this.mtp already exists. Downloads the MTP tensors through loadMTPWeights(repo, a), uploads each one, quantizes eight projection matrices with _quantizeBF16Weight (destroying their uploaded buffers), keeps raw u32 copies of the norm weights, allocates one KV cache and scratch buffers, and builds a trimmed lm head of 8000 entries. a is passed straight to loadMTPWeights; its meaning is defined there. NOTE(review): the repo id is hard-coded to Qwen/Qwen3.5-2B regardless of the loaded model, and the KV cache is sized for 4096 positions rather than this.maxSeqLen - confirm. NOTE(review): this.mtp is assigned before the awaited quantization steps finish, so a failure midway leaves a partial object that makes later calls return early. */async initMTP(a){if(this.mtp)return;const e=this.hiddenSize,t=this.intermediateSize,i=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,s=performance.now(),n="Qwen/Qwen3.5-2B";console.log(`[MTP] Downloading MTP weights from ${n}...`);const{loadMTPWeights:r}=await R(async()=>{const{loadMTPWeights:U}=await import("./safetensors-loader-CwGm5mJX.js");return{loadMTPWeights:U}},[]),u=await r(n,a);console.log(`[MTP] Downloaded ${Object.keys(u).length} MTP tensors`),this.mtp={weights:{},qweights:{}};const h={};for(const[U,w]of Object.entries(u)){const p=this.gpu.createBufferFromData(`mtp_${U}`,w.data);h[U]=p,this.mtp.weights[U]=p}const 
o=[{name:"mtp.fc.weight",K:e*2,N:e},{name:"mtp.layers.0.self_attn.q_proj.weight",K:e,N:this.numHeads*this.headDim*2},{name:"mtp.layers.0.self_attn.k_proj.weight",K:e,N:this.numKVHeads*this.headDim},{name:"mtp.layers.0.self_attn.v_proj.weight",K:e,N:this.numKVHeads*this.headDim},{name:"mtp.layers.0.self_attn.o_proj.weight",K:e,N:e},{name:"mtp.layers.0.mlp.gate_proj.weight",K:e,N:t},{name:"mtp.layers.0.mlp.up_proj.weight",K:e,N:t},{name:"mtp.layers.0.mlp.down_proj.weight",K:t,N:e}];/* Quantize each listed matrix, then destroy its uploaded buffer and drop it from this.mtp.weights. NOTE(review): o_proj is listed with K = hiddenSize, whereas fullAttentionFused uses numHeads * headDim as the o_proj input width; the two agree only when those values are equal - confirm. */for(const{name:U,K:w,N:p}of o){const{qweight:_,scales:g}=await this._quantizeBF16Weight(h[U],w,p,`mtp_${U}`);this.mtp.qweights[U]={qweight:_,scales:g},h[U].destroy(),delete this.mtp.weights[U]}this.mtp.normRaw={};const c=["mtp.pre_fc_norm_hidden.weight","mtp.pre_fc_norm_embedding.weight","mtp.layers.0.input_layernorm.weight","mtp.layers.0.post_attention_layernorm.weight","mtp.norm.weight","mtp.layers.0.self_attn.q_norm.weight","mtp.layers.0.self_attn.k_norm.weight"];for(const U of c){const w=u[U];w&&(this.mtp.normRaw[U]=new Uint32Array(w.data.buffer.slice(w.data.byteOffset,w.data.byteOffset+w.data.byteLength)))}/* KV cache for the MTP layer: 4096 * numKVHeads * headDim * 4 bytes each for keys and for values. */const f=4096*this.numKVHeads*this.headDim*4;this.mtp.kvCache={keys:this.gpu.createBuffer("mtp_kv_keys",f,i),values:this.gpu.createBuffer("mtp_kv_values",f,i)},this.mtp.concatBuf=this.gpu.createBuffer("mtp_concat",e*2*4,i),this.mtp.fcOut=this.gpu.createBuffer("mtp_fc_out",e*4,i),this.mtp.savedHidden=this.gpu.createBuffer("mtp_saved_hidden",e*4,i),this.mtp.savedMlpOut=this.gpu.createBuffer("mtp_saved_mlp_out",e*4,i),this.mtp.seqLen=0,this._buildMTPSplitQKNormUniform(),this._buildMTPFusedMLPUniform(),await this._buildTrimmedLmHead(8e3);const b=((performance.now()-s)/1e3).toFixed(1),v=o.length;console.log(`[MTP] Initialized in ${b}s: ${v} INT4 weights, 1 KV cache layer`)}/** _buildMTPSplitQKNormUniform(): build the uniform buffer for the MTP split/QK-norm kernel and store it in this.mtp.fusedSQKParams. Header: u32 numHeads, u32 numKVHeads, u32 headDim, f32 rmsEps, then a u32 at byte 16 initialised to 0 (mtpForwardSubmit later writes this.mtp.seqLen there). The raw q_norm words follow from byte 32, then the k_norm words, 4 bytes each. */_buildMTPSplitQKNormUniform(){const a=this.numHeads,e=this.numKVHeads,t=this.headDim,i=a*t,s=e*t,n=(i+s)/2,u=32+Math.ceil(n/4)*16,h=new ArrayBuffer(u),o=new 
DataView(h);o.setUint32(0,a,!0),o.setUint32(4,e,!0),o.setUint32(8,t,!0),o.setFloat32(12,this.rmsEps,!0),o.setUint32(16,0,!0);/* Norm words: the offset 32 + floor(k / 4) * 16 + (k % 4) * 4 equals 32 + 4 * k, so the words are stored contiguously, q_norm first and k_norm starting at word numHeads * headDim / 2. NOTE(review): the q loop reads numHeads * headDim / 2 words from the q_norm array; reads past its length would be written as 0 - confirm the tensor length matches. */const c=this.mtp.normRaw["mtp.layers.0.self_attn.q_norm.weight"],m=this.mtp.normRaw["mtp.layers.0.self_attn.k_norm.weight"];if(c)for(let b=0;b<i/2;b++){const v=Math.floor(b/4),U=b%4;o.setUint32(32+v*16+U*4,c[b],!0)}if(m){const b=i/2;for(let v=0;v<s/2;v++){const U=b+v,w=Math.floor(U/4),p=U%4;o.setUint32(32+w*16+p*4,m[v],!0)}}const f=this.gpu.device.createBuffer({size:u,usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST,label:"mtp_fused_sqk"});this.gpu.device.queue.writeBuffer(f,0,new Uint8Array(h)),this.mtp.fusedSQKParams=f}/** _buildMTPFusedMLPUniform(): currently an empty stub; initMTP still calls it. */_buildMTPFusedMLPUniform(){}/** _buildTrimmedLmHead(a = 8000): build a quantized lm head restricted to the first a token ids. Stores the identity index table in this.mtp.trimmedToFull, gathers those rows from the embedding matrix (or lm_head.weight when tie_word_embeddings is explicitly false) with gather_rows_bf16 using hiddenSize / 2 words per row, quantizes the gathered rows with _quantizeBF16Weight, allocates this.mtp.trimmedLogits, destroys the temporaries and logs the resulting buffer sizes. */async _buildTrimmedLmHead(a=8e3){const e=this.hiddenSize,t=this.gpu,i=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC,s=new Uint32Array(a);for(let l=0;l<a;l++)s[l]=l;this.mtp.trimmedToFull=s,this.mtp.trimmedVocabSize=a;const n=t.createBufferFromData("mtp_trim_indices",s),u=(this.config.tie_word_embeddings??this.textCfg.tie_word_embeddings)!==!1?"model.language_model.embed_tokens.weight":"lm_head.weight",h=this.weights[u],o=e/2,c=a*o*4,m=t.createBuffer("mtp_trim_gathered",c,i),f=(await R(async()=>{const{SHADERS:l}=await import("./gpu-ops-Da8QZLNh.js").then(S=>S.b);return{SHADERS:l}},[])).SHADERS.gather_rows_bf16,b=t.getOrCreatePipeline("gather_rows_bf16",f),v=t.createBufferFromData("mtp_trim_gparams",new Uint32Array([o,a]),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),U=t.createBindGroup(b,0,[h,n,m,v]),w=t.device.createCommandEncoder(),p=w.beginComputePass();p.setPipeline(b),p.setBindGroup(0,U),p.dispatchWorkgroups(Math.ceil(o/256),a),p.end(),t.device.queue.submit([w.finish()]);const{qweight:_,scales:g}=await this._quantizeBF16Weight(m,e,a,"mtp_trim_lmhead");this.mtp.trimmedLmHead={qweight:_,scales:g},this.mtp.trimmedLogits=t.createBuffer("mtp_trimmed_logits",a*4,i),m.destroy(),n.destroy(),v.destroy(),console.log(`[MTP] Trimmed lm_head: ${a} tokens, 
${(_.size/1024/1024).toFixed(1)}MB qw + ${(g.size/1024/1024).toFixed(1)}MB sc`)}/** _dispatchTrimmedLmHead(): quantized mat-vec bound to this.normed, the trimmed lm head weights and this.mtp.trimmedLogits, using the same 4t and _f16 kernel selection as gptqMatvecOp under the fixed cache key mtp_trim_lmhead. */_dispatchTrimmedLmHead(){const a=this.hiddenSize,e=this.mtp.trimmedVocabSize,t=this.mtp.trimmedLmHead,s=a/this.groupSize%4===0,n=this.makeUniform(`mv_${a}_${e}`,[a,e,this.groupSize]);if(s){const r=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached("mtp_trim_lmhead",r,[this.normed,t.qweight,t.scales,this.mtp.trimmedLogits,n],this.wg8(e))}else{const r=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached("mtp_trim_lmhead",r,[this.normed,t.qweight,t.scales,this.mtp.trimmedLogits,n],this.wg4(e))}}/** _mtpGetQWeight(a): return the quantized MTP weight entry stored under name a, or undefined when absent. */_mtpGetQWeight(a){return this.mtp.qweights[a]}/** _mtpGptqMatvec(a, e, t, i, s): quantized mat-vec for MTP weight t with bindings [a, qweight, scales, e, params], params = [i, s, groupSize]; throws an Error when the weight is missing. Kernel selection matches gptqMatvecOp and the cache key is the weight name prefixed with mtp_. */_mtpGptqMatvec(a,e,t,i,s){const n=this._mtpGetQWeight(t);if(!n)throw new Error(`MTP weight not found: ${t}`);const u=i/this.groupSize%4===0,h=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);if(u){const o=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";this.runCached(`mtp_${t}`,o,[a,n.qweight,n.scales,e,h],this.wg8(s))}else{const o=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";this.runCached(`mtp_${t}`,o,[a,n.qweight,n.scales,e,h],this.wg4(s))}}/** _mtpSaveHidden(a, e): when MTP is initialised, copy hiddenSize * 4 bytes from a into this.mtp.savedHidden and from e into this.mtp.savedMlpOut; otherwise do nothing. */_mtpSaveHidden(a,e){this.mtp&&(this.gpu.copyBuffer(a,this.mtp.savedHidden,this.hiddenSize*4),this.gpu.copyBuffer(e,this.mtp.savedMlpOut,this.hiddenSize*4))}/** mtpForward(a): run the MTP head for token a and resolve to the token read back by _readAndSample(); throws an Error when initMTP has not completed. */async mtpForward(a){if(!this.mtp)throw new Error("MTP not initialized. 
Call initMTP() first.");return this.mtpForwardSubmit(a),await this._readAndSample()}mtpForwardSubmit(a){const e=this.hiddenSize,t=this.intermediateSize,i=this.mtp.seqLen;this.gpu.singlePassMode=this.singlePassMode,this.gpu.beginBatch(),this.embedding(a);const s=this.makeUniform("add_rmsnorm_params",[e,this.rmsEps]);this.runCached("mtp_emb_norm","rmsnorm",[this.hidden,this.mtp.weights["mtp.pre_fc_norm_embedding.weight"],this.normed,s],1),this.gpu.copyBuffer(this.normed,this.mtp.concatBuf,e*4),this.runCached("mtp_hid_norm","add_rmsnorm_ro",[this.mtp.savedHidden,this.mtp.savedMlpOut,this.mtp.weights["mtp.pre_fc_norm_hidden.weight"],this.normed,s],1),this.gpu.copyBuffer(this.normed,this.mtp.concatBuf,e*4,0,e*4),this._mtpGptqMatvec(this.mtp.concatBuf,this.mtp.fcOut,"mtp.fc.weight",e*2,e),this.runCached("mtp_input_norm","rmsnorm",[this.mtp.fcOut,this.mtp.weights["mtp.layers.0.input_layernorm.weight"],this.normed,s],1);const n="mtp.layers.0.self_attn",r=this._mtpGptqMatvecOp(this.normed,this.qProjFull,`${n}.q_proj.weight`,e,this.numHeads*this.headDim*2),u=this._mtpGptqMatvecOp(this.normed,this.kProj,`${n}.k_proj.weight`,e,this.numKVHeads*this.headDim),h=this._mtpGptqMatvecOp(this.normed,this.vProj,`${n}.v_proj.weight`,e,this.numKVHeads*this.headDim);this.gpu.dispatchMulti([r,u,h].filter(Boolean)),this._gqaDv.setUint32(0,i,!0),this.gpu.device.queue.writeBuffer(this.mtp.fusedSQKParams,16,this._gqaData,0,4),this.runCached("mtp_sqk",this._splitQKNormShaderKey,[this.qProjFull,this.kProj,this.vProj,this.qProj,this.qGate,this.mtp.kvCache.keys,this.mtp.kvCache.values,this.mtp.fusedSQKParams],this.numHeads+this.numKVHeads);const 
o=i+1;this._gqaDv.setUint32(0,o,!0),this._gqaDv.setUint32(4,this.headDim,!0),this._gqaDv.setUint32(8,this.numKVHeads,!0),this._gqaDv.setUint32(12,this.numHeads,!0),this._gqaDv.setUint32(16,this.numHeads/this.numKVHeads,!0),this._gqaDv.setUint32(20,this.partialDim,!0),this._gqaDv.setUint32(24,i,!0),this._gqaDv.setFloat32(28,this.ropeTheta,!0),this.gpu.device.queue.writeBuffer(this._gqaParamBuf,0,this._gqaData),this.runCached("mtp_gqa","gqa_attention_head",[this.qProj,this.mtp.kvCache.keys,this.mtp.kvCache.values,this.attnOut,this._gqaParamBuf],this.numHeads);const c=this._mtpGetQWeight(`${n}.o_proj.weight`),m=this.numHeads*this.headDim,f=this.makeUniform(`fused_sig_mv_${m}_${e}`,[m,e,this.groupSize]);m/this.groupSize%4===0&&this.gpu.hasF16?this.runCached("mtp_sig_oproj","fused_sigmoid_gptq_4t_f16",[this.attnOut,this.qGate,c.qweight,c.scales,this.qProj,f],this.wg8(e)):this.gpu.hasF16?this.runCached("mtp_sig_oproj","fused_sigmoid_gptq_f16",[this.attnOut,this.qGate,c.qweight,c.scales,this.qProj,f],this.wg4(e)):this.runCached("mtp_sig_oproj","fused_sigmoid_gptq",[this.attnOut,this.qGate,c.qweight,c.scales,this.qProj,f],this.wg4(e)),this.runCached("mtp_post_norm","add_rmsnorm_ro",[this.mtp.fcOut,this.qProj,this.mtp.weights["mtp.layers.0.post_attention_layernorm.weight"],this.normed,s],1);const U=this.gpu.hasF16?"fused_gate_up_silu_f16":"fused_gate_up_silu",w=this.makeUniform(`fused_mv_${e}_${t}`,[e,t,this.groupSize]),p=this.mtp.qweights["mtp.layers.0.mlp.gate_proj.weight"],_=this.mtp.qweights["mtp.layers.0.mlp.up_proj.weight"];this.runCached("mtp_gate_up_silu",U,[this.normed,p.qweight,p.scales,_.qweight,_.scales,this.mlpIntermediate,w],this.wg4(t)),this._mtpGptqMatvec(this.mlpIntermediate,this.mlpOut,"mtp.layers.0.mlp.down_proj.weight",t,e),this.runCached("mtp_final_norm","three_way_add_rmsnorm",[this.mtp.fcOut,this.qProj,this.mlpOut,this.mtp.weights["mtp.norm.weight"],this.hidden,this.normed,s],1),this._dispatchLmHead();const 
g=this.makeUniform("mtp_argmax_params",[this.vocabSize]);this.runCached("mtp_argmax","argmax",[this.logits,this.argmaxResult,g],1),this.gpu._singlePass&&(this.gpu._singlePass.end(),this.gpu._singlePass=null),this.gpu.copyBuffer(this.argmaxResult,this._argmaxReadback,8),this.gpu.endBatch(),this.mtp.seqLen++}_mtpSnapshotDeltaNet(){if(!this.mtp._deltaNetSnapshot){const e=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC;this.mtp._deltaNetSnapshot={states:{},convs:{}};for(let t=0;t<this.numLayers;t++)this.layerTypes[t]==="linear_attention"&&(this.mtp._deltaNetSnapshot.states[t]=this.gpu.createBuffer(`mtp_snap_state_${t}`,this.linState[t].size,e),this.mtp._deltaNetSnapshot.convs[t]=this.gpu.createBuffer(`mtp_snap_conv_${t}`,this.linConvHist[t].size,e))}const a=this.mtp._deltaNetSnapshot;a.mtpSeqLen=this.mtp.seqLen;for(const e of Object.keys(a.states))this.gpu.copyBuffer(this.linState[e],a.states[e],this.linState[e].size),this.gpu.copyBuffer(this.linConvHist[e],a.convs[e],this.linConvHist[e].size)}_mtpRestoreDeltaNet(){const a=this.mtp._deltaNetSnapshot;if(a){a.mtpSeqLen!==void 0&&(this.mtp.seqLen=a.mtpSeqLen);for(const e of Object.keys(a.states))this.gpu.copyBuffer(a.states[e],this.linState[e],this.linState[e].size),this.gpu.copyBuffer(a.convs[e],this.linConvHist[e],this.linConvHist[e].size)}}async generateWithMTP(a,e=512,t){if(!this.mtp)throw new Error("MTP not initialized. 
Call initMTP() first.");this.seqLen=0,this.mtp.seqLen=0,this._recentTokenCount=0;const i=[...a],s=this.textCfg.linear_num_key_heads,n=this.textCfg.linear_key_head_dim,r=this.textCfg.linear_value_head_dim,h=this.linValueHeads/s*r,o=s*(n+n+h);for(let B=0;B<this.numLayers;B++)if(this.layerTypes[B]==="linear_attention"){const d=s*n*h*4,P=3*o*4;this.gpu.device.queue.writeBuffer(this.linState[B],0,new Uint8Array(d)),this.gpu.device.queue.writeBuffer(this.linConvHist[B],0,new Uint8Array(P))}let c=null;for(let B=0;B<a.length;B++)c=await this.forward(a[B],B),this.seqLen=B+1;i.push(c);const m=this.config.eos_token_id??this.textCfg.eos_token_id,f=Array.isArray(m)?m:m!=null?[m]:[248044,248046];if(t?.(c,0)||f.includes(c))return i;let v=1,U=0,w=0,p=c,_=0,g=0;for(;v<e;){const B=performance.now(),d=await this.mtpForward(p);this._mtpSnapshotDeltaNet();const P=this.seqLen,q=await this.forwardB2(p,d,this.seqLen);this.seqLen+=2;const k=q[0],$=q[1];if(k===d){U++,i.push(d),v++;let C=t?.(d,v-1);if(C||f.includes(d)||(i.push($),v++,C=t?.($,v-1),C||f.includes($)))break;p=$}else{w++,this._mtpRestoreDeltaNet(),this.seqLen=P;const C=await this.forward(p,this.seqLen);if(this.seqLen++,i.push(C),v++,t?.(C,v-1)||f.includes(C))break;p=C}const y=performance.now();if(_+=y-B,g++,g%25===0){const C=U/(U+w)*100,D=v/g;console.log(`[MTP @${g}] ${(_/g).toFixed(1)}ms/step, ${(v/(_/1e3)).toFixed(0)} tok/s, accept=${C.toFixed(0)}%, ${D.toFixed(1)} tok/step`)}}const l=U/Math.max(1,U+w)*100;return console.log(`[MTP final] ${(v/((_||1)/1e3)).toFixed(0)} tok/s, accept=${l.toFixed(0)}% (${U}/${U+w}), ${v} tokens`),i}_mtpGptqMatvecOp(a,e,t,i,s){const n=this._mtpGetQWeight(t);if(!n)return null;const u=i/this.groupSize%4===0,h=this.makeUniform(`mv_${i}_${s}`,[i,s,this.groupSize]);if(u){const o=this.gpu.hasF16?"gptq_matvec_4t_f16":"gptq_matvec_4t";return this.prepOpCached(`mtp_${t}`,o,[a,n.qweight,n.scales,e,h],this.wg8(s))}else{const o=this.gpu.hasF16?"gptq_matvec_f16":"gptq_matvec";return 
this.prepOpCached(`mtp_${t}`,o,[a,n.qweight,n.scales,e,h],this.wg4(s))}}}export{L as Qwen35Model};

assets/safetensors-loader-CwGm5mJX.js ADDED Viewed

	@@ -0,0 +1 @@

/**
 * SafeTensors loader: downloads model shards from the Hugging Face hub,
 * parses the SafeTensors container, and hands tensors to the caller either
 * all at once or batch-by-batch for shards too large for one ArrayBuffer.
 *
 * SafeTensors layout: 8-byte little-endian header length, a UTF-8 JSON
 * header mapping tensor name -> { dtype, shape, data_offsets }, then the raw
 * tensor bytes. `data_offsets` are relative to the end of the JSON header.
 */
const HF_BASE = "https://huggingface.co";
const CACHE_NAME = "tensorbend-safetensors-v1";

/** Largest shard fetched into a single buffer (2 GiB minus 4 MiB). */
const MAX_SINGLE_BUFFER = 2 * 1024 * 1024 * 1024 - 4 * 1024 * 1024;
/** Target size of one streamed range request (512 MiB). */
const STREAM_BATCH_BYTES = 512 * 1024 * 1024;

/**
 * Fetch and JSON-decode a file from a hub repository.
 * @param {string} repoId - Repository id, e.g. "org/model".
 * @param {string} fileName - Path of the file inside the repository.
 * @param {string} [revision="main"] - Branch, tag or commit.
 * @returns {Promise<any>} The parsed JSON document.
 * @throws {Error} When the HTTP response is not OK.
 */
async function fetchJSON(repoId, fileName, revision = "main") {
  const url = `${HF_BASE}/${repoId}/resolve/${revision}/${fileName}`;
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${fileName}: ${response.status}`);
  }
  return response.json();
}

/**
 * Drain a response body into an exactly-sized ArrayBuffer.
 *
 * `Content-Length` is only used as a pre-allocation hint: when the decoded
 * body turns out larger (e.g. a content-encoded response) the reader falls
 * back to chunk accumulation instead of overflowing the buffer, and when it
 * is shorter the result is trimmed rather than zero-padded.
 * @param {Response} response - Response whose body has not been consumed.
 * @param {(p: {loaded: number, total: number}) => void} [onProgress]
 * @returns {Promise<ArrayBuffer>}
 */
async function readResponseBody(response, onProgress) {
  const declared = +response.headers.get("content-length") || 0;
  const reader = response.body.getReader();
  let preallocated = declared > 0 ? new Uint8Array(declared) : null;
  const chunks = [];
  let loaded = 0;

  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    if (preallocated && loaded + value.byteLength > preallocated.byteLength) {
      // The declared length was too small: keep what was read so far and
      // switch to chunk accumulation for the remainder.
      chunks.push(preallocated.subarray(0, loaded));
      preallocated = null;
    }
    if (preallocated) {
      preallocated.set(value, loaded);
    } else {
      chunks.push(value);
    }
    loaded += value.byteLength;
    onProgress?.({
      loaded,
      total: declared > 0 ? Math.max(declared, loaded) : loaded,
    });
  }

  if (preallocated) {
    return loaded === preallocated.byteLength
      ? preallocated.buffer
      : preallocated.buffer.slice(0, loaded);
  }
  const joined = new Uint8Array(loaded);
  let offset = 0;
  for (const chunk of chunks) {
    joined.set(chunk, offset);
    offset += chunk.byteLength;
  }
  return joined.buffer;
}

/**
 * Download a URL as an ArrayBuffer, using the Cache Storage API when it is
 * available. Cache failures (API missing, quota exceeded, ...) are
 * deliberately ignored: the cache is a best-effort optimisation.
 * @param {string} url
 * @param {(p: {loaded: number, total: number, cached?: boolean}) => void} [onProgress]
 * @returns {Promise<ArrayBuffer>}
 * @throws {Error} When the network response is not OK.
 */
async function fetchWithCache(url, onProgress) {
  try {
    const cache = await caches.open(CACHE_NAME);
    const hit = await cache.match(url);
    if (hit) {
      const cachedBuffer = await hit.arrayBuffer();
      onProgress?.({
        loaded: cachedBuffer.byteLength,
        total: cachedBuffer.byteLength,
        cached: true,
      });
      return cachedBuffer;
    }
  } catch {
    // Best effort: fall through to the network.
  }

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: ${response.status}`);
  }

  // Start the cache write on a clone so it proceeds while the body is read.
  let cacheWrite = null;
  try {
    cacheWrite = (await caches.open(CACHE_NAME)).put(url, response.clone());
  } catch {
    // Best effort: continue without caching.
  }

  const buffer = await readResponseBody(response, onProgress);

  if (cacheWrite) {
    try {
      await cacheWrite;
    } catch {
      // Best effort: a failed cache write must not fail the download.
    }
  }
  return buffer;
}

/**
 * Parse a complete SafeTensors file held in memory.
 * @param {ArrayBuffer} buffer - The whole file.
 * @returns {Object<string, {dtype: string, shape: number[], data: Uint8Array}>}
 *   Tensor records keyed by name; `data` is a view into `buffer` (no copy).
 */
function parseSafetensors(buffer) {
  const headerLength = Number(new DataView(buffer).getBigUint64(0, true));
  const headerText = new TextDecoder().decode(new Uint8Array(buffer, 8, headerLength));
  const header = JSON.parse(headerText);
  const dataOffset = 8 + headerLength;

  const tensors = {};
  for (const [name, meta] of Object.entries(header)) {
    if (name === "__metadata__") continue;
    const [begin, end] = meta.data_offsets;
    tensors[name] = {
      dtype: meta.dtype,
      shape: meta.shape,
      data: new Uint8Array(buffer, dataOffset + begin, end - begin),
    };
  }
  return tensors;
}

/**
 * Issue an HTTP range request for bytes [start, end) and insist on a
 * `206 Partial Content` reply. A plain `200` means the server ignored the
 * Range header and is sending the file from byte 0, which would place the
 * wrong bytes into the caller's buffer.
 * @param {string} url
 * @param {number} start - First byte (inclusive).
 * @param {number} end - Last byte (exclusive).
 * @returns {Promise<Response>}
 * @throws {Error} When the response is not a 206.
 */
async function requestRange(url, start, end) {
  const response = await fetch(url, {
    headers: { Range: `bytes=${start}-${end - 1}` },
  });
  if (!response.ok) {
    throw new Error(`Range request failed: ${response.status}`);
  }
  if (response.status !== 206) {
    throw new Error(
      `Range request failed: ${response.status} (server ignored the Range header for ${url})`,
    );
  }
  return response;
}

/**
 * Download bytes [start, end) of a URL into a new Uint8Array, verifying that
 * exactly the requested number of bytes arrives.
 * @param {string} url
 * @param {number} start - First byte (inclusive).
 * @param {number} end - Last byte (exclusive).
 * @param {(chunkBytes: number) => void} [onChunk] - Called per received chunk.
 * @returns {Promise<Uint8Array>}
 * @throws {Error} On a non-206 reply or a body of unexpected length.
 */
async function downloadRange(url, start, end, onChunk) {
  const size = end - start;
  const response = await requestRange(url, start, end);
  const reader = response.body.getReader();
  const bytes = new Uint8Array(size);
  let received = 0;

  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    if (received + value.byteLength > size) {
      try {
        await reader.cancel();
      } catch {
        // Best effort: the length error below is what matters.
      }
      throw new Error(`Range response for ${url} exceeded the requested ${size} bytes`);
    }
    bytes.set(value, received);
    received += value.byteLength;
    onChunk?.(value.byteLength);
  }
  if (received !== size) {
    throw new Error(`Range response for ${url} was truncated: ${received} of ${size} bytes`);
  }
  return bytes;
}

/**
 * Fetch only the JSON header of a remote SafeTensors file.
 * @param {string} url
 * @returns {Promise<{header: Object, dataOffset: number}>} The parsed header
 *   and the absolute file offset at which tensor data begins.
 */
async function fetchShardHeader(url) {
  const lengthBytes = await (await requestRange(url, 0, 8)).arrayBuffer();
  const headerLength = Number(new DataView(lengthBytes).getBigUint64(0, true));
  const headerBytes = await (await requestRange(url, 8, 8 + headerLength)).arrayBuffer();
  const header = JSON.parse(new TextDecoder().decode(new Uint8Array(headerBytes)));
  return { header, dataOffset: 8 + headerLength };
}

/** Yield to the event loop so the page stays responsive between batches. */
function yieldToEventLoop() {
  return new Promise((resolve) => setTimeout(resolve, 0));
}

/**
 * Stream a shard that is too large for a single buffer.
 *
 * Tensors are sorted by file offset and grouped into contiguous batches of
 * roughly STREAM_BATCH_BYTES, each fetched with one range request. A tensor
 * larger than MAX_SINGLE_BUFFER is delivered afterwards in
 * STREAM_BATCH_BYTES pieces, each tagged with `_partial: {offset, totalSize}`.
 * @param {string} url - Shard URL.
 * @param {(p: {loaded: number, total: number}) => void} [onProgress]
 * @param {(tensors: Object) => (void|Promise<void>)} [onBatch] - Receives
 *   each batch; awaited before the next request is issued.
 * @returns {Promise<void>}
 */
async function streamShardTensors(url, onProgress, onBatch) {
  const { header, dataOffset } = await fetchShardHeader(url);

  const entries = Object.entries(header)
    .filter(([name]) => name !== "__metadata__")
    .map(([name, meta]) => ({
      name,
      dtype: meta.dtype,
      shape: meta.shape,
      absStart: dataOffset + meta.data_offsets[0],
      absEnd: dataOffset + meta.data_offsets[1],
      size: meta.data_offsets[1] - meta.data_offsets[0],
    }))
    .sort((left, right) => left.absStart - right.absStart);

  const batches = [];
  const oversized = [];
  let pending = [];
  const flushPending = () => {
    if (pending.length === 0) return;
    batches.push({
      tensors: pending,
      start: pending[0].absStart,
      end: pending[pending.length - 1].absEnd,
    });
    pending = [];
  };

  for (const entry of entries) {
    if (entry.size > MAX_SINGLE_BUFFER) {
      flushPending();
      oversized.push(entry);
      continue;
    }
    if (pending.length > 0 && entry.absEnd - pending[0].absStart > STREAM_BATCH_BYTES) {
      flushPending();
    }
    pending.push(entry);
  }
  flushPending();

  let loaded = 0;
  const total =
    entries.length > 0 ? entries[entries.length - 1].absEnd - entries[0].absStart : 0;
  const reportChunk = (chunkBytes) => {
    loaded += chunkBytes;
    onProgress?.({ loaded, total });
  };

  for (const batch of batches) {
    const bytes = await downloadRange(url, batch.start, batch.end, reportChunk);
    const tensors = {};
    for (const entry of batch.tensors) {
      tensors[entry.name] = {
        dtype: entry.dtype,
        shape: entry.shape,
        data: new Uint8Array(bytes.buffer, entry.absStart - batch.start, entry.size),
      };
    }
    if (onBatch) await onBatch(tensors);
    await yieldToEventLoop();
  }

  for (const entry of oversized) {
    for (let offset = 0; offset < entry.size; ) {
      const pieceSize = Math.min(STREAM_BATCH_BYTES, entry.size - offset);
      const pieceStart = entry.absStart + offset;
      const bytes = await downloadRange(url, pieceStart, pieceStart + pieceSize, reportChunk);
      if (onBatch) {
        await onBatch({
          [entry.name]: {
            dtype: entry.dtype,
            shape: entry.shape,
            data: bytes,
            _partial: { offset, totalSize: entry.size },
          },
        });
      }
      offset += pieceSize;
      await yieldToEventLoop();
    }
  }
}

/**
 * Load every weight shard of a repository.
 *
 * The shard list comes from `model.safetensors.index.json`; when that file
 * cannot be read the repository is assumed to hold a single
 * `model.safetensors`. Shards larger than MAX_SINGLE_BUFFER are streamed
 * when an `onShard` callback is supplied; everything else is downloaded
 * whole (through the cache) and parsed in memory.
 * @param {string} repoId
 * @param {(p: Object) => void} [onProgress] - Receives
 *   `downloading` / `parsing` / `done` phase updates.
 * @param {{revision?: string, onShard?: ((tensors: Object) => (void|Promise<void>))|null}} [options]
 * @returns {Promise<Object>} All tensors keyed by name when `onShard` is not
 *   given; otherwise an empty object (tensors were delivered to `onShard`).
 */
async function loadModelWeights(repoId, onProgress, { revision = "main", onShard = null } = {}) {
  let shardNames;
  try {
    const index = await fetchJSON(repoId, "model.safetensors.index.json", revision);
    shardNames = [...new Set(Object.values(index.weight_map))];
  } catch {
    // No usable index: treat the repository as a single-file model.
    shardNames = ["model.safetensors"];
  }

  const allTensors = {};
  const shardTotal = shardNames.length;
  let shardsDone = 0;
  let cachedCount = 0;
  const progressByShard = new Map(shardNames.map((name) => [name, { loaded: 0, total: 0 }]));

  const report = () => {
    let loaded = 0;
    let total = 0;
    for (const entry of progressByShard.values()) {
      loaded += entry.loaded;
      total += entry.total;
    }
    onProgress?.({
      phase: "downloading",
      file: `${shardsDone}/${shardTotal} shards`,
      loaded,
      total,
      filesLoaded: shardsDone,
      filesTotal: shardTotal,
      cachedCount,
    });
  };

  for (const shardName of shardNames) {
    const url = `${HF_BASE}/${repoId}/resolve/${revision}/${shardName}`;

    // Size probe only decides between streaming and whole-file download, so
    // a failed HEAD request simply selects the whole-file path.
    let shardBytes = 0;
    try {
      const head = await fetch(url, { method: "HEAD" });
      shardBytes = +head.headers.get("content-length") || 0;
    } catch {
      shardBytes = 0;
    }

    if (shardBytes > MAX_SINGLE_BUFFER && onShard) {
      progressByShard.set(shardName, { loaded: 0, total: shardBytes });
      report();
      await streamShardTensors(
        url,
        (progress) => {
          progressByShard.set(shardName, {
            loaded: progress.loaded,
            total: progress.total || shardBytes,
          });
          report();
        },
        onShard,
      );
      shardsDone++;
      report();
      continue;
    }

    let countedAsCached = false;
    let fileBuffer = await fetchWithCache(url, (progress) => {
      if (progress.cached && !countedAsCached) {
        cachedCount++;
        countedAsCached = true;
      }
      progressByShard.set(shardName, { loaded: progress.loaded, total: progress.total });
      report();
    });
    onProgress?.({ phase: "parsing", file: shardName });
    let tensors = parseSafetensors(fileBuffer);
    shardsDone++;
    report();

    if (onShard) {
      await onShard(tensors);
      // Drop references so the shard can be collected before the next one.
      tensors = null;
      fileBuffer = null;
      await yieldToEventLoop();
    } else {
      Object.assign(allTensors, tensors);
    }
  }

  onProgress?.({ phase: "done", filesLoaded: shardsDone, filesTotal: shardTotal });
  return allTensors;
}

/**
 * Fetch a repository's `config.json`.
 * @param {string} repoId
 * @param {string} [revision="main"]
 * @returns {Promise<Object>}
 */
async function loadConfig(repoId, revision = "main") {
  return fetchJSON(repoId, "config.json", revision);
}

/**
 * Fetch the quantization config: `quantization_config.json` when present,
 * otherwise the `quantization_config` field of `config.json`.
 * @param {string} repoId
 * @param {string} [revision="main"]
 * @returns {Promise<Object|null>} `null` when neither source is available.
 */
async function loadQuantConfig(repoId, revision = "main") {
  try {
    return await fetchJSON(repoId, "quantization_config.json", revision);
  } catch {
    try {
      const config = await fetchJSON(repoId, "config.json", revision);
      return config.quantization_config || null;
    } catch {
      return null;
    }
  }
}

/**
 * Download only the tensors whose names start with `mtp.`.
 *
 * For each shard that holds such tensors, one range request covers the span
 * from the first to the last of them.
 * @param {string} repoId
 * @param {(p: Object) => void} [onProgress] - Receives `mtp_init`,
 *   `mtp_downloading` and `mtp_done` phase updates.
 * @param {string} [revision="main"]
 * @returns {Promise<Object<string, {dtype: string, shape: number[], data: Uint8Array}>>}
 * @throws {Error} When the index lists no `mtp.` tensors, when a listed
 *   tensor is absent from its shard header, or when a download fails.
 */
async function loadMTPWeights(repoId, onProgress, revision = "main") {
  const index = await fetchJSON(repoId, "model.safetensors.index.json", revision);

  const shardByTensor = new Map();
  for (const [tensorName, shardName] of Object.entries(index.weight_map)) {
    if (tensorName.startsWith("mtp.")) shardByTensor.set(tensorName, shardName);
  }
  if (shardByTensor.size === 0) {
    throw new Error(`No MTP tensors found in ${repoId}`);
  }
  onProgress?.({ phase: "mtp_init", count: shardByTensor.size });

  const tensors = {};
  for (const shardName of new Set(shardByTensor.values())) {
    const url = `${HF_BASE}/${repoId}/resolve/${revision}/${shardName}`;
    const { header, dataOffset } = await fetchShardHeader(url);

    const entries = [...shardByTensor.keys()]
      .filter((tensorName) => shardByTensor.get(tensorName) === shardName)
      .map((tensorName) => {
        const meta = header[tensorName];
        if (!meta) {
          throw new Error(`MTP tensor ${tensorName} is missing from shard ${shardName}`);
        }
        return {
          name: tensorName,
          dtype: meta.dtype,
          shape: meta.shape,
          absStart: dataOffset + meta.data_offsets[0],
          absEnd: dataOffset + meta.data_offsets[1],
          size: meta.data_offsets[1] - meta.data_offsets[0],
        };
      })
      .sort((left, right) => left.absStart - right.absStart);

    const spanStart = entries[0].absStart;
    const spanEnd = entries[entries.length - 1].absEnd;
    const spanBytes = spanEnd - spanStart;

    onProgress?.({ phase: "mtp_downloading", loaded: 0, total: spanBytes });
    let loaded = 0;
    const bytes = await downloadRange(url, spanStart, spanEnd, (chunkBytes) => {
      loaded += chunkBytes;
      onProgress?.({ phase: "mtp_downloading", loaded, total: spanBytes });
    });

    for (const entry of entries) {
      tensors[entry.name] = {
        dtype: entry.dtype,
        shape: entry.shape,
        data: new Uint8Array(bytes.buffer, entry.absStart - spanStart, entry.size),
      };
    }
  }

  onProgress?.({ phase: "mtp_done", count: Object.keys(tensors).length });
  return tensors;
}

/** Size in bytes of one element of each SafeTensors dtype. */
const DTYPE_SIZES = { F32: 4, F16: 2, BF16: 2, I32: 4, I16: 2, I8: 1, U8: 1, BOOL: 1, F64: 8, I64: 8 };

/**
 * Map a SafeTensors dtype to the typed-array constructor whose element width
 * matches DTYPE_SIZES. F16 and BF16 have no native typed array and are
 * exposed as raw 16-bit words; unknown dtypes fall back to bytes.
 * @param {string} dtype
 * @returns {Function} A typed-array constructor.
 */
function dtypeToTypedArray(dtype) {
  switch (dtype) {
    case "F64":
      return Float64Array;
    case "F32":
      return Float32Array;
    case "F16":
    case "BF16":
      return Uint16Array;
    case "I64":
      return BigInt64Array;
    case "I32":
      return Int32Array;
    case "I16":
      return Int16Array;
    case "I8":
      return Int8Array;
    case "U8":
    case "BOOL":
    default:
      return Uint8Array;
  }
}

export {
  DTYPE_SIZES,
  dtypeToTypedArray,
  fetchJSON,
  loadConfig,
  loadMTPWeights,
  loadModelWeights,
  loadQuantConfig,
};

assets/test-s5fGKojE.js ADDED Viewed

	@@ -0,0 +1,46 @@

// Preload-dependency lookup (presumably emitted by the Vite bundler — the name suggests so):
// maps each index in `i` to an asset path. The path list is created on the first call and
// memoized on the function's own `f` property, so later calls reuse the same array.
+const __vite__mapDeps=(i,m=__vite__mapDeps,d=(m.f||(m.f=["assets/qwen35-model-BU7SY47A.js","assets/gpu-ops-Da8QZLNh.js"])))=>i.map(i=>d[i]);
+import{G as nt,S as ct,_ as et}from"./gpu-ops-Da8QZLNh.js";class ot{constructor(){this.gpu=null,this.results=[],this.pipelines={}}async init(){this.gpu=new nt,await this.gpu.init();for(const[e,t]of Object.entries(ct))this.pipelines[e]=this.gpu.getOrCreatePipeline(e,t);return this}async runAll(e){const t=[["GPTQ MatVec",()=>this.testGPTQMatvec()],["GPTQ MatVec (group boundary)",()=>this.testGPTQMatvecGroups()],["GPTQ MatVec (gs=32)",()=>this.testGPTQMatvecGS32()],["GPTQ Split-K",()=>this.testGPTQSplitK()],["GPTQ F16",()=>this.testGPTQF16()],["BF16 MatVec",()=>this.testBF16Matvec()],["RMSNorm",()=>this.testRMSNorm()],["RMSNorm (1+w formula)",()=>this.testRMSNorm1PlusW()],["SiLU * Mul",()=>this.testSiLUMul()],["Vector Add",()=>this.testAdd()],["Embedding (BF16)",()=>this.testEmbedding()],["Argmax",()=>this.testArgmax()],["Argmax (large)",()=>this.testArgmaxLarge()],["Split interleaved",()=>this.testSplit()],["Sigmoid Mul",()=>this.testSigmoidMul()],["Head RMSNorm (1+w)",()=>this.testHeadRMSNorm()],["Head RMSNorm (nogated, w)",()=>this.testHeadRMSNormNogated()],["Causal Conv1d",()=>this.testCausalConv1d()],["KV Cache Store",()=>this.testKVCacheStore()],["GQA Attention (single pos)",()=>this.testGQAAttention()],["DeltaNet Recurrent",()=>this.testDeltaNetRecurrent()],["Add + RMSNorm (fused)",()=>this.testAddRMSNorm()],["Three-way Add + RMSNorm",()=>this.testThreeWayAddRMSNorm()],["GQA RoPE rotate_half",()=>this.testGQARoPERotateHalf()],["Fused sigmoid + GPTQ",()=>this.testFusedSigmoidGPTQ()],["Fused SiLU + GPTQ",()=>this.testFusedSiluGPTQ()],["Fused AddNorm+GateUp+SiLU",()=>this.testFusedAddNormGateUpSiLU()],["Fused Split+QKNorm+KVStore",()=>this.testFusedSplitQKNormKVStore()],["Add+RMSNorm B=2",()=>this.testAddRMSNormB2()],["Add+RMSNorm RO B=2",()=>this.testAddRMSNormROB2()],["Three-way Add+RMSNorm B=2",()=>this.testThreeWayAddRMSNormB2()],["Fused SiLU+GPTQ B=2",()=>this.testFusedSiluGPTQB2()],["Fused Sigmoid+GPTQ B=2",()=>this.testFusedSigmoidGPTQB2()],["Fused 
GateUp+SiLU B=2",()=>this.testFusedGateUpSiLUB2()],["GPTQ Matmul B=2",()=>this.testGPTQMatmulB2()],["GPTQ Matmul B=2 F16",()=>this.testGPTQMatmulB2F16()],["GPTQ Matmul B=2 4T F16",()=>this.testGPTQMatmulB2_4T_F16()]];for(const[o,s]of t)try{const r=await s();this.results.push({name:o,...r}),e?.({name:o,...r})}catch(r){this.results.push({name:o,pass:!1,error:r.message,maxErr:NaN}),e?.({name:o,pass:!1,error:r.message,maxErr:NaN})}return this.results}destroy(){this.gpu?.destroy()}makeBuffer(e,t,o=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC){return this.gpu.createBufferFromData(e,new Uint8Array(t.buffer,t.byteOffset,t.byteLength),o)}makeF32Buffer(e,t){return this.makeBuffer(e,new Float32Array(t))}makeU32Buffer(e,t){return this.makeBuffer(e,new Uint32Array(t))}makeOutputBuffer(e,t){return this.gpu.createBuffer(e,t*4,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC)}makeUniform(e){const t=new ArrayBuffer(Math.max(16,Math.ceil(e.length*4/16)*16)),o=new DataView(t);for(let s=0;s<e.length;s++)typeof e[s]=="number"&&!Number.isInteger(e[s])?o.setFloat32(s*4,e[s],!0):o.setUint32(s*4,e[s],!0);return this.gpu.createBufferFromData("uniform",new Uint8Array(t),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}makeUniformTyped(e,t){const o=e.length+t.length,s=new ArrayBuffer(Math.max(16,Math.ceil(o*4/16)*16)),r=new DataView(s);let a=0;for(const d of e)r.setUint32(a,d,!0),a+=4;for(const d of t)r.setFloat32(a,d,!0),a+=4;return this.gpu.createBufferFromData("uniform_typed",new Uint8Array(s),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}dispatch(e,t,o,s=1){const r=this.pipelines[e],a=this.gpu.createBindGroup(r,0,t);this.gpu.dispatch(r,[a],o,s)}async readback(e,t){return this.gpu.readBuffer(e,t*4)}async readbackU32(e,t){const o=this.gpu.createReadbackBuffer("_rb",t*4),s=this.gpu.device.createCommandEncoder();s.copyBufferToBuffer(e,0,o,0,t*4),this.gpu.device.queue.submit([s.finish()]),await o.mapAsync(GPUMapMode.READ);const r=new 
Uint32Array(o.getMappedRange().slice(0));return o.unmap(),o.destroy(),r}compare(e,t,o=1e-4){let s=0;const r=[];for(let a=0;a<t.length;a++){const d=Math.abs(e[a]-t[a]);s=Math.max(s,d),d>o&&r.push({idx:a,got:e[a],expected:t[a],err:d})}return{pass:r.length===0,maxErr:s,errors:r.slice(0,5)}}cpuF32ToBF16Packed(e){const t=Math.ceil(e.length/2),o=new Uint32Array(t);for(let s=0;s<e.length;s++){const a=new Uint32Array(new Float32Array([e[s]]).buffer)[0]>>>16,d=Math.floor(s/2);s%2===0?o[d]=a:o[d]|=a<<16}return o}cpuF32ToF16Packed(e){const t=Math.ceil(e.length/2),o=new Uint32Array(t);for(let s=0;s<e.length;s++){const r=this.f32ToF16(e[s]),a=Math.floor(s/2);s%2===0?o[a]=r:o[a]|=r<<16}return o}f32ToF16(e){const t=new ArrayBuffer(4);new Float32Array(t)[0]=e;const o=new Uint32Array(t)[0],s=o>>31&1;let r=o>>23&255,a=o&8388607;return r===0?s<<15:r===255?s<<15|31744|(a?512:0):(r=r-127+15,r>=31?s<<15|31744:r<=0?s<<15:s<<15|r<<10|a>>13)}f16ToF32(e){const t=e>>15&1,o=e>>10&31,s=e&1023;return o===0?s===0?t?-0:0:(t?-1:1)*Math.pow(2,-14)*(s/1024):o===31?s?NaN:t?-1/0:1/0:(t?-1:1)*Math.pow(2,o-15)*(1+s/1024)}bf16ToF32(e){const t=e<<16,o=new ArrayBuffer(4);return new Uint32Array(o)[0]=t,new Float32Array(o)[0]}async testGPTQMatvec(){const r=new Float32Array(64);for(let k=0;k<64;k++)r[k]=(k+1)*.1;const a=new Uint32Array(32),d=new Float32Array(256);for(let k=0;k<8;k++)for(let u=0;u<4;u++){let m=0;for(let g=0;g<8;g++){const i=k*8+g,c=(i+u)%15;d[i*4+u]=c-8,m|=c<<g*4}a[k*4+u]=m}const f=new Float32Array(4);for(let k=0;k<4;k++)f[k]=.5+k*.1;const w=this.cpuF32ToF16Packed(f),h=new Float32Array(4);for(let k=0;k<4;k++){let u=0;for(let m=0;m<64;m++){const i=Math.floor(m/64)*4+k,c=w[Math.floor(i/2)]>>i%2*16&65535,_=this.f16ToF32(c);u+=r[m]*d[m*4+k]*_}h[k]=u}const F=this.makeF32Buffer("gptq_input",r),n=this.makeU32Buffer("gptq_qw",a),B=this.makeU32Buffer("gptq_scales",w),l=this.makeOutputBuffer("gptq_out",4),p=this.makeUniform([64,4,64]);this.dispatch("gptq_matvec",[F,n,B,l,p],Math.ceil(4/32));const 
y=await this.readback(l,4);return this.compare(y,h,.01)}async testGPTQMatvecGroups(){const r=new Float32Array(128);for(let u=0;u<128;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(64),d=new Float32Array(512);for(let u=0;u<16;u++)for(let m=0;m<4;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c*3+m*7)%15;d[c*4+m]=_-8,g|=_<<i*4}a[u*4+m]=g}const f=128/64,w=new Float32Array(f*4);for(let u=0;u<f*4;u++)w[u]=.3+u*.15;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let u=0;u<4;u++){let m=0;for(let g=0;g<128;g++){const c=Math.floor(g/64)*4+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*4+u]*b}F[u]=m}const n=this.makeF32Buffer("gptq2_in",r),B=this.makeU32Buffer("gptq2_qw",a),l=this.makeU32Buffer("gptq2_sc",h),p=this.makeOutputBuffer("gptq2_out",4),y=this.makeUniform([128,4,64]);this.dispatch("gptq_matvec",[n,B,l,p,y],Math.ceil(4/32));const k=await this.readback(p,4);return this.compare(k,F,.01)}async testGPTQMatvecGS32(){const r=new Float32Array(128);for(let u=0;u<128;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(64),d=new Float32Array(512);for(let u=0;u<16;u++)for(let m=0;m<4;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c*3+m*7)%15;d[c*4+m]=_-8,g|=_<<i*4}a[u*4+m]=g}const f=128/32,w=new Float32Array(f*4);for(let u=0;u<f*4;u++)w[u]=.3+u*.15;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let u=0;u<4;u++){let m=0;for(let g=0;g<128;g++){const c=Math.floor(g/32)*4+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*4+u]*b}F[u]=m}const n=this.makeF32Buffer("gptq32_in",r),B=this.makeU32Buffer("gptq32_qw",a),l=this.makeU32Buffer("gptq32_sc",h),p=this.makeOutputBuffer("gptq32_out",4),y=this.makeUniform([128,4,32]);this.dispatch("gptq_matvec",[n,B,l,p,y],Math.ceil(4/32));const k=await this.readback(p,4);return this.compare(k,F,.01)}async testGPTQSplitK(){const a=new Float32Array(256);for(let i=0;i<256;i++)a[i]=Math.sin(i*.3)*.5;const d=new Uint32Array(1024),f=new Float32Array(256*32);for(let 
i=0;i<32;i++)for(let c=0;c<32;c++){let _=0;for(let b=0;b<8;b++){const N=i*8+b,x=(N*3+c*7)%15;f[N*32+c]=x-8,_|=x<<b*4}d[i*32+c]=_}const w=256/64,h=new Float32Array(w*32);for(let i=0;i<w;i++)for(let c=0;c<32;c++)h[i*32+c]=.3+(i*32+c)*.02;const F=this.cpuF32ToF16Packed(h),n=new Float32Array(32);for(let i=0;i<32;i++){let c=0;for(let _=0;_<256;_++){const N=Math.floor(_/64)*32+i,x=F[Math.floor(N/2)]>>N%2*16&65535,q=this.f16ToF32(x);c+=a[_]*f[_*32+i]*q}n[i]=c}const B=this.makeF32Buffer("sk_in",a),l=this.makeU32Buffer("sk_qw",d),p=this.makeU32Buffer("sk_sc",F),y=this.makeOutputBuffer("sk_partials",128),k=this.makeOutputBuffer("sk_out",32),u=this.makeUniform([256,32,64,4]),m=this.makeUniform([32,4]);this.dispatch("gptq_splitk",[B,l,p,y,u],Math.ceil(32/32),4),this.dispatch("reduce_splitk",[y,k,m],Math.ceil(32/256));const g=await this.readback(k,32);return this.compare(g,n,.01)}async testGPTQF16(){if(!this.gpu.hasF16||!this.pipelines.gptq_matvec_f16)return{pass:!0,maxErr:0,note:"shader-f16 not available, skipped"};const e=128,t=4,o=64,s=e/8,r=new Float32Array(e);for(let u=0;u<e;u++)r[u]=Math.sin(u*.5)*.5;const a=new Uint32Array(s*t),d=new Float32Array(e*t);for(let u=0;u<s;u++)for(let m=0;m<t;m++){let g=0;for(let i=0;i<8;i++){const c=u*8+i,_=(c+m*3)%15;d[c*t+m]=_-8,g|=_<<i*4}a[u*t+m]=g}const f=e/o,w=new Float32Array(f*t);for(let u=0;u<f;u++)for(let m=0;m<t;m++)w[u*t+m]=.5+u*.1;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(t);for(let u=0;u<t;u++){let m=0;for(let g=0;g<e;g++){const c=Math.floor(g/o)*t+u,_=h[Math.floor(c/2)]>>c%2*16&65535,b=this.f16ToF32(_);m+=r[g]*d[g*t+u]*b}F[u]=m}const n=this.makeF32Buffer("f16_in",r),B=this.makeU32Buffer("f16_qw",a),l=this.makeU32Buffer("f16_sc",h),p=this.makeOutputBuffer("f16_out",t),y=this.makeUniform([e,t,o]);this.dispatch("gptq_matvec_f16",[n,B,l,p,y],Math.ceil(t/32));const k=await this.readback(p,t);return this.compare(k,F,.5)}async testBF16Matvec(){const o=new Float32Array(8);for(let n=0;n<8;n++)o[n]=(n+1)*.2;const s=new 
Float32Array(32);for(let n=0;n<4;n++)for(let B=0;B<8;B++)s[n*8+B]=Math.cos(n*.5+B*.3);const r=this.cpuF32ToBF16Packed(s),a=new Float32Array(4);for(let n=0;n<4;n++){let B=0;for(let l=0;l<8;l++){const p=n*8+l,y=r[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);B+=o[l]*u}a[n]=B}const d=this.makeF32Buffer("bf16mv_in",o),f=this.makeU32Buffer("bf16mv_w",r),w=this.makeOutputBuffer("bf16mv_out",4),h=this.makeUniform([8,4]);this.dispatch("bf16_matvec",[d,f,w,h],Math.ceil(4/32));const F=await this.readback(w,4);return this.compare(F,a,.001)}async testRMSNorm(){const o=new Float32Array(16);for(let l=0;l<16;l++)o[l]=(l-8)*.3;const s=new Float32Array(16);for(let l=0;l<16;l++)s[l]=Math.sin(l*.4)*.1;const r=this.cpuF32ToBF16Packed(s);let a=0;for(let l=0;l<16;l++)a+=o[l]*o[l];const d=1/Math.sqrt(a/16+1e-6),f=new Float32Array(16);for(let l=0;l<16;l++){const p=l,y=r[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);f[l]=o[l]*d*(1+u)}const w=this.makeF32Buffer("rmsn_in",o),h=this.makeU32Buffer("rmsn_w",r),F=this.makeOutputBuffer("rmsn_out",16),n=this.makeUniformTyped([16],[1e-6]);this.dispatch("rmsnorm",[w,h,F,n],1);const B=await this.readback(F,16);return this.compare(B,f,1e-4)}async testRMSNorm1PlusW(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array(8).fill(0),r=this.cpuF32ToBF16Packed(s);let a=0;for(let l=0;l<8;l++)a+=o[l]*o[l];const d=1/Math.sqrt(a/8+1e-6),f=new Float32Array(8);for(let l=0;l<8;l++)f[l]=o[l]*d;const w=this.makeF32Buffer("rmsn1_in",o),h=this.makeU32Buffer("rmsn1_w",r),F=this.makeOutputBuffer("rmsn1_out",8),n=this.makeUniformTyped([8],[1e-6]);this.dispatch("rmsnorm",[w,h,F,n],1);const B=await this.readback(F,8);return this.compare(B,f,1e-4)}async testSiLUMul(){const t=new Float32Array([0,1,-1,2,-2,.5,-.5,3]),o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array(8);for(let h=0;h<8;h++){const F=t[h]/(1+Math.exp(-t[h]));s[h]=F*o[h]}const 
r=this.makeF32Buffer("silu_gate",t),a=this.makeF32Buffer("silu_up",o),d=this.makeOutputBuffer("silu_out",8),f=this.makeUniform([8]);this.dispatch("silu_mul",[r,a,d,f],Math.ceil(8/32));const w=await this.readback(d,8);return this.compare(w,s,1e-5)}async testAdd(){const t=new Float32Array([1,2,3,4,5,6,7,8]),o=new Float32Array([.1,.2,.3,.4,.5,.6,.7,.8]),s=new Float32Array(8);for(let w=0;w<8;w++)s[w]=t[w]+o[w];const r=this.makeF32Buffer("add_a",t),a=this.makeF32Buffer("add_b",o),d=this.makeUniform([8]);this.dispatch("add",[r,a,d],Math.ceil(8/32));const f=await this.readback(r,8);return this.compare(f,s,1e-6)}async testEmbedding(){const o=new Float32Array(32);for(let F=0;F<o.length;F++)o[F]=(F+1)*.01;const s=this.cpuF32ToBF16Packed(o),r=2,a=new Float32Array(8);for(let F=0;F<8;F++){const n=r*8+F,B=s[Math.floor(n/2)],l=n%2===0?B&65535:B>>16&65535;a[F]=this.bf16ToF32(l)}const d=this.makeU32Buffer("emb_w",s),f=this.makeOutputBuffer("emb_out",8),w=this.makeUniform([r,8]);this.dispatch("embedding",[d,f,w],Math.ceil(8/256));const h=await this.readback(f,8);return this.compare(h,a,.001)}async testArgmax(){const t=new Float32Array(16);for(let w=0;w<16;w++)t[w]=Math.sin(w*.7)*3;const o=t.indexOf(Math.max(...t)),s=this.makeF32Buffer("argmax_in",t),r=this.makeOutputBuffer("argmax_res",2),a=this.makeUniform([16]);this.dispatch("argmax",[s,r,a],1);const f=(await this.readbackU32(r,2))[0];return{pass:f===o,maxErr:Math.abs(f-o),errors:f!==o?[{idx:0,got:f,expected:o}]:[]}}async testArgmaxLarge(){const t=new Float32Array(1024);for(let d=0;d<1024;d++)t[d]=Math.sin(d*.01)-2;t[773]=99;const o=this.makeF32Buffer("argmax_lg_in",t),s=this.makeOutputBuffer("argmax_lg_res",2),r=this.makeUniform([1024]);this.dispatch("argmax",[o,s,r],1);const a=await this.readbackU32(s,2);return{pass:a[0]===773,maxErr:Math.abs(a[0]-773),errors:a[0]!==773?[{got:a[0],expected:773}]:[]}}async testSplit(){const s=new Float32Array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]),r=new Float32Array([1,2,3,4,9,10,11,12]),a=new 
Float32Array([5,6,7,8,13,14,15,16]),d=this.makeF32Buffer("split_src",s),f=this.makeOutputBuffer("split_a",8),w=this.makeOutputBuffer("split_b",8),h=this.makeUniform([8,2,4]);this.dispatch("split",[d,f,w,h],Math.ceil(8/32));const F=await this.readback(f,8),n=await this.readback(w,8),B=this.compare(F,r,1e-6),l=this.compare(n,a,1e-6);return{pass:B.pass&&l.pass,maxErr:Math.max(B.maxErr,l.maxErr),errors:[...B.errors,...l.errors]}}async testSigmoidMul(){const t=new Float32Array([1,2,3,4,5,6,7,8]),o=new Float32Array([0,1,-1,2,-2,.5,-.5,3]),s=new Float32Array(8);for(let h=0;h<8;h++)s[h]=t[h]/(1+Math.exp(-o[h]));const r=this.makeF32Buffer("sigmul_x",t),a=this.makeF32Buffer("sigmul_g",o),d=this.makeOutputBuffer("sigmul_out",8),f=this.makeUniform([8]);this.dispatch("sigmoid_mul",[r,a,d,f],Math.ceil(8/32));const w=await this.readback(d,8);return this.compare(w,s,1e-5)}async testHeadRMSNorm(){const s=new Float32Array([1,2,3,4,5,6,7,8]),r=new Float32Array(4);for(let n=0;n<4;n++)r[n]=.1*(n+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(8);for(let n=0;n<2;n++){let B=0;for(let p=0;p<4;p++)B+=s[n*4+p]**2;const l=1/Math.sqrt(B/4+1e-6);for(let p=0;p<4;p++){const y=a[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);d[n*4+p]=s[n*4+p]*l*(1+u)}}const f=this.makeF32Buffer("hrmsn_x",s),w=this.makeU32Buffer("hrmsn_w",a),h=this.makeUniformTyped([2,4],[1e-6]);this.dispatch("head_rmsnorm",[f,w,h],2);const F=await this.readback(f,8);return this.compare(F,d,.001)}async testHeadRMSNormNogated(){const s=new Float32Array([1,2,3,4,5,6,7,8]),r=new Float32Array(4);for(let n=0;n<4;n++)r[n]=.5+.1*n;const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(8);for(let n=0;n<2;n++){let B=0;for(let p=0;p<4;p++)B+=s[n*4+p]**2;const l=1/Math.sqrt(B/4+1e-6);for(let p=0;p<4;p++){const y=a[Math.floor(p/2)],k=p%2===0?y&65535:y>>16&65535,u=this.bf16ToF32(k);d[n*4+p]=s[n*4+p]*l*u}}const 
f=this.makeF32Buffer("hrmsng_x",s),w=this.makeU32Buffer("hrmsng_w",a),h=this.makeUniformTyped([2,4],[1e-6]);this.dispatch("head_rmsnorm_nogated",[f,w,h],2);const F=await this.readback(f,8);return this.compare(F,d,.001)}async testCausalConv1d(){const t=new Float32Array([1,2,3,4]),o=new Float32Array(12);for(let y=0;y<o.length;y++)o[y]=(y+1)*.1;const s=new Float32Array(16);for(let y=0;y<s.length;y++)s[y]=.25;const r=this.cpuF32ToBF16Packed(s),a=new Float32Array(4),d=new Float32Array(12);for(let y=0;y<4;y++){const k=o[y],u=o[4+y],m=o[8+y],g=t[y],i=[];for(let _=0;_<4;_++){const b=y*4+_,N=r[Math.floor(b/2)],x=b%2===0?N&65535:N>>16&65535;i.push(this.bf16ToF32(x))}const c=i[0]*k+i[1]*u+i[2]*m+i[3]*g;a[y]=c/(1+Math.exp(-c)),d[y]=u,d[4+y]=m,d[8+y]=g}const f=this.makeF32Buffer("conv_x",t),w=this.makeF32Buffer("conv_hist",o),h=this.makeU32Buffer("conv_w",r),F=this.makeUniform([4]);this.dispatch("causal_conv1d",[f,w,h,F],Math.ceil(4/256));const n=await this.readback(f,4),B=await this.readback(w,12),l=this.compare(n,a,1e-4),p=this.compare(B,d,1e-4);return{pass:l.pass&&p.pass,maxErr:Math.max(l.maxErr,p.maxErr),errors:[...l.errors.map(y=>({...y,note:"conv output"})),...p.errors.map(y=>({...y,note:"history"}))]}}async testKVCacheStore(){const r=new Float32Array(8),a=new Float32Array(8);for(let c=0;c<8;c++)r[c]=c+1,a[c]=(c+1)*10;const f=8*8,w=new Float32Array(f),h=new Float32Array(f),F=new Float32Array(f),n=new Float32Array(f);for(let c=0;c<8;c++)F[24+c]=r[c],n[24+c]=a[c];const B=this.makeF32Buffer("kvs_k",r),l=this.makeF32Buffer("kvs_v",a),p=this.makeF32Buffer("kvs_kc",w),y=this.makeF32Buffer("kvs_vc",h),k=this.makeUniform([3,2,4]);this.dispatch("kv_cache_store",[B,l,p,y,k],Math.ceil(8/256));const u=await this.readback(p,f),m=await this.readback(y,f),g=this.compare(u,F,1e-6),i=this.compare(m,n,1e-6);return{pass:g.pass&&i.pass,maxErr:Math.max(g.maxErr,i.maxErr),errors:[...g.errors,...i.errors]}}async testGQAAttention(){const a=new Float32Array([1,0,0,0]),d=new 
Float32Array(4);d.set([1,0,0,0]);const f=new Float32Array(4);f.set([.5,.6,.7,.8]);const w=new Float32Array([.5,.6,.7,.8]),h=this.makeF32Buffer("gqa_q",a),F=this.makeF32Buffer("gqa_kc",d),n=this.makeF32Buffer("gqa_vc",f),B=this.makeOutputBuffer("gqa_out",4),l=new ArrayBuffer(32),p=new DataView(l);p.setUint32(0,1,!0),p.setUint32(4,4,!0),p.setUint32(8,1,!0),p.setUint32(12,1,!0),p.setUint32(16,1/1,!0),p.setUint32(20,1,!0);const y=this.gpu.createBufferFromData("gqa_params",new Uint8Array(l),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("gqa_attention_head",[h,F,n,B,y],1);const k=await this.readback(B,4);return this.compare(k,w,1e-4)}async testDeltaNetRecurrent(){const s=[.5,.5],r=[1,0],a=[.8,.2],d=new Float32Array([...s,...r,...a]),f=new Float32Array(4),w=new Float32Array([0]),h=new Float32Array([0]),F=new Float32Array([0]),n=new Float32Array([0]),B=this.cpuF32ToBF16Packed(F),l=this.cpuF32ToBF16Packed(n),p=1/(1+Math.exp(-0)),y=Math.sqrt(s[0]**2+s[1]**2),k=Math.sqrt(r[0]**2+r[1]**2),u=s.map(U=>U/y),m=r.map(U=>U/k),g=[0,0,0,0],i=[0,0],c=a.map((U,S)=>(U-i[S])*p);for(let U=0;U<2;U++)for(let S=0;S<2;S++)g[U*2+S]+=m[U]*c[S];const _=1/Math.sqrt(2),b=new Float32Array(2);for(let U=0;U<2;U++){let S=0;for(let O=0;O<2;O++)S+=g[O*2+U]*u[O]*_;b[U]=S}const N=this.makeF32Buffer("dn_qkv",d),x=this.makeF32Buffer("dn_state",f),q=this.makeOutputBuffer("dn_out",2),K=this.makeF32Buffer("dn_a",w),v=this.makeF32Buffer("dn_b",h),T=this.makeU32Buffer("dn_alog",B),V=this.makeU32Buffer("dn_dtb",l),G=this.makeUniform([1,2,2]);this.dispatch("deltanet_recurrent",[N,x,q,K,v,T,V,G],1);const C=await this.readback(q,2);return this.compare(C,b,.001)}async testAddRMSNorm(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array([.1,-.2,.3,-.4,.5,-.6,.7,-.8]),r=new Float32Array(8);for(let _=0;_<8;_++)r[_]=.1*(_+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(o);for(let _=0;_<8;_++)d[_]+=s[_];let f=0;for(let _=0;_<8;_++)f+=d[_]*d[_];const w=1/Math.sqrt(f/8+1e-6),h=new 
Float32Array(8);for(let _=0;_<8;_++){const b=a[Math.floor(_/2)]>>_%2*16&65535,N=this.bf16ToF32(b);h[_]=d[_]*w*(1+N)}const F=this.makeF32Buffer("addnorm_h",o),n=this.makeF32Buffer("addnorm_a",s),B=this.makeU32Buffer("addnorm_w",a),l=this.makeOutputBuffer("addnorm_out",8);this.makeUniform([8]);const p=new ArrayBuffer(16),y=new DataView(p);y.setUint32(0,8,!0),y.setFloat32(4,1e-6,!0);const k=this.gpu.createBufferFromData("addnorm_params",new Uint8Array(p),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("add_rmsnorm",[F,n,B,l,k],1);const u=await this.readback(l,8),m=await this.readback(F,8),g=d,i=this.compare(u,h,.001),c=this.compare(m,g,1e-6);return{pass:i.pass&&c.pass,maxErr:Math.max(i.maxErr,c.maxErr),errors:[...i.errors,...c.errors]}}async testThreeWayAddRMSNorm(){const o=new Float32Array([1,2,3,4,5,6,7,8]),s=new Float32Array([.1,-.2,.3,-.4,.5,-.6,.7,-.8]),r=new Float32Array([.5,.5,-.5,-.5,1,1,-1,-1]),a=new Float32Array(8);for(let N=0;N<8;N++)a[N]=.1*(N+1);const d=this.cpuF32ToBF16Packed(a),f=new Float32Array(8);for(let N=0;N<8;N++)f[N]=o[N]+s[N]+r[N];let w=0;for(let N=0;N<8;N++)w+=f[N]*f[N];const h=1/Math.sqrt(w/8+1e-6),F=new Float32Array(8);for(let N=0;N<8;N++){const x=d[Math.floor(N/2)]>>N%2*16&65535,q=this.bf16ToF32(x);F[N]=f[N]*h*(1+q)}const n=this.makeF32Buffer("twa_a",o),B=this.makeF32Buffer("twa_b",s),l=this.makeF32Buffer("twa_c",r),p=this.makeU32Buffer("twa_w",d),y=this.makeOutputBuffer("twa_hout",8),k=this.makeOutputBuffer("twa_normed",8),u=new ArrayBuffer(16),m=new DataView(u);m.setUint32(0,8,!0),m.setFloat32(4,1e-6,!0);const g=this.gpu.createBufferFromData("twa_params",new Uint8Array(u),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("three_way_add_rmsnorm",[n,B,l,p,y,k,g],1);const i=await this.readback(y,8),c=await this.readback(k,8),_=this.compare(i,f,1e-6),b=this.compare(c,F,.001);return{pass:_.pass&&b.pass,maxErr:Math.max(_.maxErr,b.maxErr),errors:[..._.errors,...b.errors]}}async testGQARoPERotateHalf(){const 
h=1/Math.sqrt(256),F=[];for(let D=0;D<32;D++)F.push(1/Math.pow(100,2*D/64));function n(D,M){const H=new Float32Array(D.length);for(let P=0;P<D.length;P++)H[P]=D[P];for(let P=0;P<32;P++){const Q=M*F[P],E=Math.cos(Q),J=Math.sin(Q);H[P]=D[P]*E-D[P+32]*J,H[P+32]=D[P+32]*E+D[P]*J}return H}const B=new Float32Array(256);B[0]=1,B[1]=.5,B[2]=.3,B[32]=.7,B[33]=.2;const l=new Float32Array(256);l[0]=1,l[32]=.5;const p=new Float32Array(256);p[1]=1,p[33]=.3;const y=new Float32Array(256);y[2]=1;const k=new Float32Array(256);k[0]=1;const u=new Float32Array(256);u[0]=0,u[1]=1;const m=new Float32Array(256);m[0]=0,m[2]=1;const g=n(B,2),i=n(l,0),c=n(p,1),_=n(y,2);let b=0,N=0,x=0;for(let D=0;D<256;D++)b+=g[D]*i[D],N+=g[D]*c[D],x+=g[D]*_[D];b*=h,N*=h,x*=h;const q=Math.max(b,N,x),K=Math.exp(b-q),v=Math.exp(N-q),T=Math.exp(x-q),V=K+v+T,G=K/V,C=v/V,U=T/V,S=new Float32Array(256);for(let D=0;D<256;D++)S[D]=G*k[D]+C*u[D]+U*m[D];const O=1*256,Y=new Float32Array(3*O),z=new Float32Array(3*O);Y.set(i,0),Y.set(c,O),Y.set(_,2*O),z.set(k,0),z.set(u,O),z.set(m,2*O);const X=this.makeF32Buffer("rope2_q",g),tt=this.makeF32Buffer("rope2_kc",Y),A=this.makeF32Buffer("rope2_vc",z),R=this.makeOutputBuffer("rope2_out",1*256),L=new ArrayBuffer(32),W=new DataView(L);W.setUint32(0,3,!0),W.setUint32(4,256,!0),W.setUint32(8,1,!0),W.setUint32(12,1,!0),W.setUint32(16,1/1,!0),W.setUint32(20,1,!0);const Z=this.gpu.createBufferFromData("rope2_params",new Uint8Array(L),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST);this.dispatch("gqa_attention_head",[X,tt,A,R,Z],1);const j=await this.readback(R,1*256);return this.compare(j,S,.001)}async testFusedSigmoidGPTQ(){const r=new Float32Array(64),a=new Float32Array(64);for(let m=0;m<64;m++)r[m]=(m+1)*.1,a[m]=(m-8)*.3;const d=new Uint32Array(32),f=new Float32Array(256);for(let m=0;m<8;m++)for(let g=0;g<4;g++){let i=0;for(let c=0;c<8;c++){const _=m*8+c,b=(_+g)%15;f[_*4+g]=b-8,i|=b<<c*4}d[m*4+g]=i}const w=new Float32Array(4);for(let m=0;m<4;m++)w[m]=.5+m*.1;const 
h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let m=0;m<4;m++){let g=0;for(let i=0;i<64;i++){const c=r[i]/(1+Math.exp(-a[i])),b=Math.floor(i/64)*4+m,N=h[Math.floor(b/2)]>>b%2*16&65535,x=this.f16ToF32(N);g+=c*f[i*4+m]*x}F[m]=g}const n=this.makeF32Buffer("fsg_x",r),B=this.makeF32Buffer("fsg_g",a),l=this.makeU32Buffer("fsg_qw",d),p=this.makeU32Buffer("fsg_sc",h),y=this.makeOutputBuffer("fsg_out",4),k=this.makeUniform([64,4,64]);this.dispatch("fused_sigmoid_gptq",[n,B,l,p,y,k],Math.ceil(4/32));const u=await this.readback(y,4);return this.compare(u,F,.01)}async testFusedSiluGPTQ(){const r=new Float32Array(64),a=new Float32Array(64);for(let m=0;m<64;m++)r[m]=(m-8)*.2,a[m]=(m+1)*.15;const d=new Uint32Array(32),f=new Float32Array(256);for(let m=0;m<8;m++)for(let g=0;g<4;g++){let i=0;for(let c=0;c<8;c++){const _=m*8+c,b=(_*2+g)%15;f[_*4+g]=b-8,i|=b<<c*4}d[m*4+g]=i}const w=new Float32Array(4);for(let m=0;m<4;m++)w[m]=.4+m*.2;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(4);for(let m=0;m<4;m++){let g=0;for(let i=0;i<64;i++){const _=r[i]/(1+Math.exp(-r[i]))*a[i],N=Math.floor(i/64)*4+m,x=h[Math.floor(N/2)]>>N%2*16&65535,q=this.f16ToF32(x);g+=_*f[i*4+m]*q}F[m]=g}const n=this.makeF32Buffer("fslg_a",r),B=this.makeF32Buffer("fslg_b",a),l=this.makeU32Buffer("fslg_qw",d),p=this.makeU32Buffer("fslg_sc",h),y=this.makeOutputBuffer("fslg_out",4),k=this.makeUniform([64,4,64]);this.dispatch("fused_silu_gptq",[n,B,l,p,y,k],Math.ceil(4/32));const u=await this.readback(y,4);return this.compare(u,F,.01)}async testFusedAddNormGateUpSiLU(){const a=new Float32Array(64),d=new Float32Array(64),f=new Float32Array(64),w=new Float32Array(64);for(let A=0;A<64;A++)a[A]=(A+1)*.1,d[A]=(A-5)*.05,f[A]=(A%7-3)*.03,w[A]=(A%3-1)*.1;const h=new Uint32Array(32),F=new Uint32Array(32),n=new Float32Array(256),B=new Float32Array(256);for(let A=0;A<64;A++)for(let R=0;R<4;R++)n[A*4+R]=Math.sin(A*.7+R*1.3)*.5,B[A*4+R]=Math.cos(A*.3+R*.9)*.5;const l=new Uint32Array(4/2),p=new Uint32Array(4/2);for(let 
A=0;A<4;A++){let R=0,L=0;for(let P=0;P<64;P++)R=Math.max(R,Math.abs(n[P*4+A])),L=Math.max(L,Math.abs(B[P*4+A]));const W=R/7,Z=L/7,j=this.f32ToF16(W),D=this.f32ToF16(Z),M=this.f16ToF32(j),H=this.f16ToF32(D);A%2===0?(l[A/2]=j,p[A/2]=D):(l[(A-1)/2]|=j<<16,p[(A-1)/2]|=D<<16);for(let P=0;P<64;P++){const Q=Math.round(n[P*4+A]/M)+8,E=Math.round(B[P*4+A]/H)+8,J=Math.max(0,Math.min(15,Q)),rt=Math.max(0,Math.min(15,E)),st=Math.floor(P/8),at=P%8*4;h[st*4+A]|=J<<at,F[st*4+A]|=rt<<at}}const y=new Uint32Array(64/2);for(let A=0;A<64;A+=2){const R=this.f32ToBF16(w[A]),L=this.f32ToBF16(w[A+1]);y[A/2]=R|L<<16}const k=new Float32Array(64);let u=0;for(let A=0;A<64;A++)k[A]=a[A]+d[A]+f[A],u+=k[A]*k[A];const m=1/Math.sqrt(u/64+1e-6),g=new Float32Array(64);for(let A=0;A<64;A++){const R=this.unpackBF16(y[Math.floor(A/2)],A%2);g[A]=k[A]*m*(1+R)}const i=new Float32Array(4);for(let A=0;A<4;A++){let R=0,L=0;const W=this.f16ToF32(A%2===0?l[Math.floor(A/2)]&65535:l[Math.floor(A/2)]>>>16),Z=this.f16ToF32(A%2===0?p[Math.floor(A/2)]&65535:p[Math.floor(A/2)]>>>16);for(let j=0;j<64;j++){const D=Math.floor(j/8),M=j%8*4,H=h[D*4+A]>>>M&15,P=F[D*4+A]>>>M&15;R+=g[j]*W*(H-8),L+=g[j]*Z*(P-8)}i[A]=R/(1+Math.exp(-R))*L}const c=k,_=16+640*16,b=new ArrayBuffer(_),N=new DataView(b);N.setUint32(0,64,!0),N.setUint32(4,4,!0),N.setUint32(8,64,!0),N.setFloat32(12,1e-6,!0);for(let A=0;A<64/2;A++){const R=Math.floor(A/4),L=A%4;N.setUint32(16+R*16+L*4,y[A],!0)}const x=this.gpu.createBufferFromData("fang_params",new Uint8Array(b),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),q=new Uint32Array(64);q.set(h,0),q.set(F,32);const K=8/(64/8),v=new Uint32Array(K*4/2*2);v.set(l,0),v.set(p,K*4/2);const 
T=this.makeF32Buffer("fang_hin",a),V=this.makeF32Buffer("fang_mlpres",d),G=this.makeF32Buffer("fang_attnres",f),C=this.makeU32Buffer("fang_mqw",q),U=this.makeU32Buffer("fang_msc",v),S=this.makeOutputBuffer("fang_hout",64),O=this.makeOutputBuffer("fang_out",4);this.dispatch("fused_addnorm_gate_up_silu",[T,V,G,C,U,S,O,x],Math.max(1,Math.ceil(4/32)));const Y=await this.readback(O,4),z=await this.readback(S,64),X=this.compare(Y,i,.02),tt=this.compare(z,c,1e-6);return{pass:X.pass&&tt.pass,maxErr:Math.max(X.maxErr,tt.maxErr),errors:[...X.errors||[],...tt.errors||[]]}}async testFusedSplitQKNormKVStore(){const f=new Float32Array(32);for(let M=0;M<f.length;M++)f[M]=Math.sin(M*.5+1)*.3;const w=new Float32Array(8),h=new Float32Array(8);for(let M=0;M<8;M++)w[M]=Math.cos(M*.7+2)*.4,h[M]=Math.sin(M*1.1+3)*.2;const F=new Float32Array(16),n=new Float32Array(8);for(let M=0;M<16;M++)F[M]=(M%3-1)*.05;for(let M=0;M<8;M++)n[M]=(M%4-2)*.03;const B=new Uint32Array(16/2);for(let M=0;M<16;M+=2)B[M/2]=this.f32ToBF16(F[M])|this.f32ToBF16(F[M+1])<<16;const l=new Uint32Array(8/2);for(let M=0;M<8;M+=2)l[M/2]=this.f32ToBF16(n[M])|this.f32ToBF16(n[M+1])<<16;const p=new Float32Array(16),y=new Float32Array(16);for(let M=0;M<2;M++){const H=new Float32Array(8);for(let E=0;E<8;E++)H[E]=f[M*8*2+E],y[M*8+E]=f[M*8*2+8+E];let P=0;for(let E=0;E<8;E++)P+=H[E]*H[E];const Q=1/Math.sqrt(P/8+1e-6);for(let E=0;E<8;E++){const J=this.unpackBF16(B[Math.floor((M*8+E)/2)],(M*8+E)%2);p[M*8+E]=H[E]*Q*(1+J)}}const k=new Float32Array(8),u=new Float32Array(32),m=new Float32Array(32);for(let M=0;M<1;M++){let H=0;for(let Q=0;Q<8;Q++)H+=w[M*8+Q]*w[M*8+Q];const P=1/Math.sqrt(H/8+1e-6);for(let Q=0;Q<8;Q++){const E=this.unpackBF16(l[Math.floor((M*8+Q)/2)],(M*8+Q)%2),J=w[M*8+Q]*P*(1+E);k[M*8+Q]=J,u[24+M*8+Q]=J,m[24+M*8+Q]=h[M*8+Q]}}const g=32+320*16,i=new ArrayBuffer(g),c=new DataView(i);c.setUint32(0,2,!0),c.setUint32(4,1,!0),c.setUint32(8,8,!0),c.setFloat32(12,1e-6,!0),c.setUint32(16,3,!0);for(let M=0;M<16/2;M++){const 
H=Math.floor(M/4),P=M%4;c.setUint32(32+H*16+P*4,B[M],!0)}const _=16/2;for(let M=0;M<8/2;M++){const H=_+M,P=Math.floor(H/4),Q=H%4;c.setUint32(32+P*16+Q*4,l[M],!0)}const b=this.gpu.createBufferFromData("fsqk_params",new Uint8Array(i),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST),N=this.makeF32Buffer("fsqk_qpf",f),x=this.makeF32Buffer("fsqk_kp",w),q=this.makeF32Buffer("fsqk_vp",h),K=this.makeOutputBuffer("fsqk_qp",16),v=this.makeOutputBuffer("fsqk_qg",16),T=this.makeOutputBuffer("fsqk_kc",32),V=this.makeOutputBuffer("fsqk_vc",32);this.dispatch("fused_split_qknorm_kvstore",[N,x,q,K,v,T,V,b],3);const G=await this.readback(K,16),C=await this.readback(v,16),U=await this.readback(x,8),S=await this.readback(T,32),O=await this.readback(V,32),Y=this.compare(G,p,1e-5),z=this.compare(C,y,1e-6),X=this.compare(U,k,1e-5),tt=S.slice(24,32),A=O.slice(24,32),R=u.slice(24,32),L=m.slice(24,32),W=this.compare(tt,R,1e-5),Z=this.compare(A,L,1e-6),j=Math.max(Y.maxErr,z.maxErr,X.maxErr,W.maxErr,Z.maxErr);return{pass:Y.pass&&z.pass&&X.pass&&W.pass&&Z.pass,maxErr:j,errors:[...Y.errors||[],...z.errors||[],...X.errors||[],...W.errors||[],...Z.errors||[]]}}async benchmarkGPTQ(e){const t=[{K:2048,N:2048,gs:128,label:"2048→2048 (Z/out_proj)"},{K:2048,N:6144,gs:128,label:"2048→6144 (QKV/gate+up)"},{K:6144,N:2048,gs:128,label:"6144→2048 (down_proj)"}],o=[1,2,4,8],s=50,r=3,a=10,d=[];for(const f of t){const{K:w,N:h,gs:F,label:n}=f,B=w/8,l=B/(F/8),p=new Float32Array(w);for(let b=0;b<w;b++)p[b]=Math.random()*2-1;const y=new Uint32Array(B*h);for(let b=0;b<y.length;b++)y[b]=Math.random()*4294967295>>>0;const k=new Float32Array(l*h);for(let b=0;b<k.length;b++)k[b]=Math.random()*.5;const u=this.cpuF32ToF16Packed(k),m=this.makeF32Buffer("bm_in",p),g=this.makeU32Buffer("bm_qw",y),i=this.makeU32Buffer("bm_sc",u),c=[];for(let b=0;b<s;b++)c.push(this.makeOutputBuffer(`bm_out_${b}`,h));const _=B*h*4+l*h*2;for(const b of o){if(b>1&&l%b!==0)continue;let N;if(b===1){const v=this.makeUniform([w,h,F]);for(let 
G=0;G<r;G++){this.gpu.beginBatch();for(let C=0;C<s;C++)this.dispatch("gptq_matvec",[m,g,i,c[C],v],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const T=[];for(let G=0;G<a;G++){await this.gpu.device.queue.onSubmittedWorkDone();const C=performance.now();this.gpu.beginBatch();for(let U=0;U<s;U++)this.dispatch("gptq_matvec",[m,g,i,c[U],v],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),T.push(performance.now()-C)}T.sort((G,C)=>G-C),N=T.slice(1,-1).reduce((G,C)=>G+C,0)/(T.length-2)/s}else{const v=[];for(let U=0;U<s;U++)v.push(this.makeOutputBuffer(`bm_part_${U}`,h*b));const T=this.makeUniform([w,h,F,b]),V=this.makeUniform([h,b]);for(let U=0;U<r;U++){this.gpu.beginBatch();for(let S=0;S<s;S++)this.dispatch("gptq_splitk",[m,g,i,v[S],T],Math.ceil(h/32),b),this.dispatch("reduce_splitk",[v[S],c[S],V],Math.ceil(h/256));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const G=[];for(let U=0;U<a;U++){await this.gpu.device.queue.onSubmittedWorkDone();const S=performance.now();this.gpu.beginBatch();for(let O=0;O<s;O++)this.dispatch("gptq_splitk",[m,g,i,v[O],T],Math.ceil(h/32),b),this.dispatch("reduce_splitk",[v[O],c[O],V],Math.ceil(h/256));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),G.push(performance.now()-S)}G.sort((U,S)=>U-S),N=G.slice(1,-1).reduce((U,S)=>U+S,0)/(G.length-2)/s}const x=_/1e9/(N/1e3),q=Math.ceil(h/32)*b,K={label:n,ns:b,wgs:q,avgMs:N.toFixed(4),bwGBs:x.toFixed(1)};d.push(K),e?.(K)}if(this.gpu.hasF16&&this.pipelines.gptq_matvec_f16){const b=this.makeUniform([w,h,F]);for(let T=0;T<r;T++){this.gpu.beginBatch();for(let V=0;V<s;V++)this.dispatch("gptq_matvec_f16",[m,g,i,c[V],b],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone()}const N=[];for(let T=0;T<a;T++){await this.gpu.device.queue.onSubmittedWorkDone();const V=performance.now();this.gpu.beginBatch();for(let 
G=0;G<s;G++)this.dispatch("gptq_matvec_f16",[m,g,i,c[G],b],Math.ceil(h/32));this.gpu.endBatch(),await this.gpu.device.queue.onSubmittedWorkDone(),N.push(performance.now()-V)}N.sort((T,V)=>T-V);const q=N.slice(1,-1).reduce((T,V)=>T+V,0)/(N.length-2)/s,K=_/1e9/(q/1e3),v={label:n,ns:"f16",wgs:Math.ceil(h/32),avgMs:q.toFixed(4),bwGBs:K.toFixed(1)};d.push(v),e?.(v)}}return d}_makeNormParams(e,t=1e-6){const o=new ArrayBuffer(16),s=new DataView(o);return s.setUint32(0,e,!0),s.setFloat32(4,t,!0),this.gpu.createBufferFromData("norm_params",new Uint8Array(o),GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST)}_cpuRMSNorm1PlusW(e,t,o,s=1e-6){let r=0;for(let f=0;f<o;f++)r+=e[f]*e[f];const a=1/Math.sqrt(r/o+s),d=new Float32Array(o);for(let f=0;f<o;f++){const w=t[Math.floor(f/2)]>>f%2*16&65535,h=this.bf16ToF32(w);d[f]=e[f]*a*(1+h)}return d}async testAddRMSNormB2(){const o=new Float32Array(32),s=new Float32Array(32);for(let u=0;u<32;u++)o[u]=Math.sin(u*.7)*2,s[u]=Math.cos(u*.3)*.5;const r=new Float32Array(16);for(let u=0;u<16;u++)r[u]=.1*(u+1);const a=this.cpuF32ToBF16Packed(r),d=new Float32Array(32),f=new Float32Array(32);for(let u=0;u<2;u++){const m=new Float32Array(16);for(let i=0;i<16;i++)m[i]=o[u*16+i]+s[u*16+i];for(let i=0;i<16;i++)f[u*16+i]=m[i];const g=this._cpuRMSNorm1PlusW(m,a,16,1e-6);d.set(g,u*16)}const w=this.makeF32Buffer("b2an_h",o),h=this.makeF32Buffer("b2an_a",s),F=this.makeU32Buffer("b2an_w",a),n=this.makeOutputBuffer("b2an_out",32),B=this._makeNormParams(16,1e-6);this.dispatch("add_rmsnorm_b2",[w,h,F,n,B],2);const l=await this.readback(n,32),p=await this.readback(w,32),y=this.compare(l,d,.001),k=this.compare(p,f,1e-6);return{pass:y.pass&&k.pass,maxErr:Math.max(y.maxErr,k.maxErr),errors:[...y.errors,...k.errors]}}async testAddRMSNormROB2(){const o=new Float32Array(32),s=new Float32Array(32);for(let l=0;l<32;l++)o[l]=Math.sin(l*.5)*3,s[l]=Math.cos(l*.8)*.3;const r=new Float32Array(16);for(let l=0;l<16;l++)r[l]=.05*(l+1);const a=this.cpuF32ToBF16Packed(r),d=new 
Float32Array(32);for(let l=0;l<2;l++){const p=new Float32Array(16);for(let k=0;k<16;k++)p[k]=o[l*16+k]+s[l*16+k];const y=this._cpuRMSNorm1PlusW(p,a,16,1e-6);d.set(y,l*16)}const f=this.makeF32Buffer("b2anro_h",o),w=this.makeF32Buffer("b2anro_a",s),h=this.makeU32Buffer("b2anro_w",a),F=this.makeOutputBuffer("b2anro_out",32),n=this._makeNormParams(16,1e-6);this.dispatch("add_rmsnorm_ro_b2",[f,w,h,F,n],2);const B=await this.readback(F,32);return this.compare(B,d,.001)}async testThreeWayAddRMSNormB2(){const o=new Float32Array(32),s=new Float32Array(32),r=new Float32Array(32);for(let i=0;i<32;i++)o[i]=Math.sin(i*.4)*2,s[i]=Math.cos(i*.6)*.5,r[i]=Math.sin(i*1.1)*.3;const a=new Float32Array(16);for(let i=0;i<16;i++)a[i]=.1*(i+1);const d=this.cpuF32ToBF16Packed(a),f=new Float32Array(32),w=new Float32Array(32);for(let i=0;i<2;i++){const c=new Float32Array(16);for(let b=0;b<16;b++)c[b]=o[i*16+b]+s[i*16+b]+r[i*16+b];f.set(c,i*16);const _=this._cpuRMSNorm1PlusW(c,d,16,1e-6);w.set(_,i*16)}const h=this.makeF32Buffer("b2twa_a",o),F=this.makeF32Buffer("b2twa_b",s),n=this.makeF32Buffer("b2twa_c",r),B=this.makeU32Buffer("b2twa_w",d),l=this.makeOutputBuffer("b2twa_hout",32),p=this.makeOutputBuffer("b2twa_normed",32),y=this._makeNormParams(16,1e-6);this.dispatch("three_way_add_rmsnorm_b2",[h,F,n,B,l,p,y],2);const k=await this.readback(p,32),u=await this.readback(l,32),m=this.compare(k,w,.001),g=this.compare(u,f,1e-6);return{pass:m.pass&&g.pass,maxErr:Math.max(m.maxErr,g.maxErr),errors:[...m.errors,...g.errors]}}_makeGPTQData(e,t,o){const s=e/8,r=new Uint32Array(s*t),a=new Float32Array(e*t);for(let h=0;h<s;h++)for(let F=0;F<t;F++){let n=0;for(let B=0;B<8;B++){const l=h*8+B,p=(l*3+F*7)%15;a[l*t+F]=p-8,n|=p<<B*4}r[h*t+F]=n}const d=e/o,f=new Float32Array(d*t);for(let h=0;h<d*t;h++)f[h]=.3+h*.15;const w=this.cpuF32ToF16Packed(f);return{qweight:r,rawWeights:a,scalesPacked:w}}_cpuGPTQMatvec(e,t,o,s,r,a,d=0){const f=new Float32Array(r);for(let w=0;w<r;w++){let h=0;for(let F=0;F<s;F++){const 
B=Math.floor(F/a)*r+w,l=o[Math.floor(B/2)]>>B%2*16&65535,p=this.f16ToF32(l);h+=e[d+F]*t[F*r+w]*p}f[w]=h}return f}async testFusedSiluGPTQB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,{qweight:s,rawWeights:r,scalesPacked:a}=this._makeGPTQData(e,t,o),d=new Float32Array(2*e),f=new Float32Array(2*e);for(let k=0;k<2*e;k++)d[k]=Math.sin(k*.3)*2,f[k]=Math.cos(k*.5)*1.5;const w=new Float32Array(2*t);for(let k=0;k<2;k++){const u=new Float32Array(e);for(let g=0;g<e;g++){const i=d[k*e+g];u[g]=i/(1+Math.exp(-i))*f[k*e+g]}const m=this._cpuGPTQMatvec(u,r,a,e,t,o);w.set(m,k*t)}const h=this.makeF32Buffer("b2sl_a",d),F=this.makeF32Buffer("b2sl_b",f),n=this.makeU32Buffer("b2sl_qw",s),B=this.makeU32Buffer("b2sl_sc",a),l=this.makeOutputBuffer("b2sl_out",2*t),p=this.makeUniform([e,t,o]);this.dispatch("fused_silu_gptq_b2_f16",[h,F,n,B,l,p],Math.ceil(t/32));const y=await this.readback(l,2*t);return this.compare(y,w,.05)}async testFusedSigmoidGPTQB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,{qweight:s,rawWeights:r,scalesPacked:a}=this._makeGPTQData(e,t,o),d=new Float32Array(2*e),f=new Float32Array(2*e);for(let k=0;k<2*e;k++)d[k]=Math.sin(k*.4)*1.5,f[k]=Math.cos(k*.7)*2;const w=new Float32Array(2*t);for(let k=0;k<2;k++){const u=new Float32Array(e);for(let g=0;g<e;g++){const i=1/(1+Math.exp(-f[k*e+g]));u[g]=i*d[k*e+g]}const m=this._cpuGPTQMatvec(u,r,a,e,t,o);w.set(m,k*t)}const h=this.makeF32Buffer("b2sg_x",d),F=this.makeF32Buffer("b2sg_g",f),n=this.makeU32Buffer("b2sg_qw",s),B=this.makeU32Buffer("b2sg_sc",a),l=this.makeOutputBuffer("b2sg_out",2*t),p=this.makeUniform([e,t,o]);this.dispatch("fused_sigmoid_gptq_b2_f16",[h,F,n,B,l,p],Math.ceil(t/32));const y=await this.readback(l,2*t);return this.compare(y,w,.05)}async testFusedGateUpSiLUB2(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const e=128,t=8,o=64,s=e/8,r=new Uint32Array(s*t),a=new Float32Array(e*t);for(let 
N=0;N<s;N++)for(let x=0;x<t;x++){let q=0;for(let K=0;K<8;K++){const v=N*8+K,T=(v*3+x*7)%15;a[v*t+x]=T-8,q|=T<<K*4}r[N*t+x]=q}const d=e/o,f=new Float32Array(d*t);for(let N=0;N<d*t;N++)f[N]=.01+N%8*.005;const w=this.cpuF32ToF16Packed(f),h=new Uint32Array(s*t),F=new Float32Array(e*t);for(let N=0;N<s;N++)for(let x=0;x<t;x++){let q=0;for(let K=0;K<8;K++){const v=N*8+K,T=(v*5+x*3)%15;F[v*t+x]=T-8,q|=T<<K*4}h[N*t+x]=q}const n=e/o,B=new Float32Array(n*t);for(let N=0;N<n*t;N++)B[N]=.01+N%8*.005;const l=this.cpuF32ToF16Packed(B),p=new Float32Array(2*e);for(let N=0;N<2*e;N++)p[N]=Math.sin(N*.2)*.5;const y=new Float32Array(2*t);for(let N=0;N<2;N++){const x=this._cpuGPTQMatvec(p,a,w,e,t,o,N*e),q=this._cpuGPTQMatvec(p,F,l,e,t,o,N*e);for(let K=0;K<t;K++){const v=x[K];y[N*t+K]=v/(1+Math.exp(-v))*q[K]}}const k=this.makeF32Buffer("b2gus_in",p),u=this.makeU32Buffer("b2gus_gqw",r),m=this.makeU32Buffer("b2gus_gsc",w),g=this.makeU32Buffer("b2gus_uqw",h),i=this.makeU32Buffer("b2gus_usc",l),c=this.makeOutputBuffer("b2gus_out",2*t),_=this.makeUniform([e,t,o]);this.dispatch("fused_gate_up_silu_b2_f16",[k,u,m,g,i,c,_],Math.ceil(t/32));const b=await this.readback(c,2*t);return this.compare(b,y,.05)}_makeGPTQMatmulB2Data(e,t,o){const s=e/8,r=new Float32Array(2*e);for(let n=0;n<e;n++)r[n]=Math.sin(n*.5)*.5;for(let n=0;n<e;n++)r[e+n]=Math.cos(n*.3)*.5;const a=new Uint32Array(s*t),d=new Float32Array(e*t);for(let n=0;n<s;n++)for(let B=0;B<t;B++){let l=0;for(let p=0;p<8;p++){const y=n*8+p,k=(y*3+B*7)%15;d[y*t+B]=k-8,l|=k<<p*4}a[n*t+B]=l}const f=e/o,w=new Float32Array(f*t);for(let n=0;n<f*t;n++)w[n]=.01+n%8*.005;const h=this.cpuF32ToF16Packed(w),F=new Float32Array(2*t);for(let n=0;n<2;n++)for(let B=0;B<t;B++){let l=0;for(let p=0;p<e;p++){const k=Math.floor(p/o)*t+B,u=h[Math.floor(k/2)]>>k%2*16&65535,m=this.f16ToF32(u);l+=r[n*e+p]*d[p*t+B]*m}F[n*t+B]=l}return{input:r,qweight:a,scalesPacked:h,expected:F,K:e,N:t,gs:o}}async 
testGPTQMatmulB2(){const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(128,8,64),f=this.makeF32Buffer("b2_input",e),w=this.makeU32Buffer("b2_qw",t),h=this.makeU32Buffer("b2_sc",o),F=this.makeOutputBuffer("b2_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2",[f,w,h,F,n],Math.ceil(a/32));const B=await this.readback(F,2*a);return this.compare(B,s,.02)}async testGPTQMatmulB2F16(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(128,8,64),f=this.makeF32Buffer("b2f16_input",e),w=this.makeU32Buffer("b2f16_qw",t),h=this.makeU32Buffer("b2f16_sc",o),F=this.makeOutputBuffer("b2f16_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2_f16",[f,w,h,F,n],Math.ceil(a/32));const B=await this.readback(F,2*a);return this.compare(B,s,.05)}async testGPTQMatmulB2_4T_F16(){if(!this.gpu.hasF16)return{pass:!0,maxErr:0,note:"skipped (no f16)"};const{input:e,qweight:t,scalesPacked:o,expected:s,K:r,N:a,gs:d}=this._makeGPTQMatmulB2Data(256,16,64),f=this.makeF32Buffer("b2_4t_input",e),w=this.makeU32Buffer("b2_4t_qw",t),h=this.makeU32Buffer("b2_4t_sc",o),F=this.makeOutputBuffer("b2_4t_out",2*a),n=this.makeUniform([r,a,d]);this.dispatch("gptq_matmul_b2_4t_f16",[f,w,h,F,n],Math.ceil(a/8));const B=await this.readback(F,2*a);return this.compare(B,s,.05)}f32ToBF16(e){const t=new ArrayBuffer(4);return new Uint32Array(t)[0]=new Uint32Array(new Float32Array([e]).buffer)[0],new Uint32Array(t)[0]>>>16}unpackBF16(e,t){const o=e>>>t*16&65535;return this.bf16ToF32(o)}}function $(I){const e=document.getElementById("log");e.textContent+=I+`
+`,e.scrollTop=e.scrollHeight}document.querySelectorAll(".tab").forEach(I=>{I.addEventListener("click",()=>{document.querySelectorAll(".tab").forEach(e=>e.classList.remove("active")),document.querySelectorAll(".panel").forEach(e=>e.classList.remove("active")),I.classList.add("active"),document.getElementById(I.dataset.panel).classList.add("active")})});document.getElementById("runTests").addEventListener("click",async()=>{const I=document.getElementById("runTests"),e=document.getElementById("testResults"),t=document.getElementById("testSummary");I.disabled=!0,e.innerHTML="",t.innerHTML="",$("Initializing WebGPU for tests...");try{const o=new ot;await o.init(),$(`WebGPU initialized. Running tests...
+`);let s=0,r=0;await o.runAll(a=>{const d=a.pass?"✓":"✗",f=a.pass?"pass":"fail";a.pass?s++:r++;const w=document.createElement("div");if(w.className="test-row",w.innerHTML=`
+            <span class="test-icon ${f}">${d}</span>
+            <span class="test-name">${a.name}</span>
+            <span class="test-err ${f}">maxErr: ${typeof a.maxErr=="number"?a.maxErr.toExponential(2):"N/A"}</span>
+          `,!a.pass&&a.errors?.length){const n=document.createElement("div");n.className="test-detail",n.textContent=a.errors.slice(0,3).map(B=>`  idx=${B.idx??"?"}: got=${B.got?.toFixed?.(6)??B.got} expected=${B.expected?.toFixed?.(6)??B.expected}${B.note?" ("+B.note+")":""}`).join(`
+`),w.appendChild(n)}if(a.error){const n=document.createElement("div");n.className="test-detail",n.textContent=`  Error: ${a.error}`,w.appendChild(n)}e.appendChild(w);const h=a.pass?"PASS":"FAIL",F=typeof a.maxErr=="number"?` (maxErr=${a.maxErr.toExponential(2)})`:"";$(`[${h}] ${a.name}${F}${a.error?" — "+a.error:""}`)}),t.innerHTML=`
+          <div class="summary">
+            <div class="summary-line"><span>Total</span><span>${s+r}</span></div>
+            <div class="summary-line"><span class="pass">Passed</span><span class="pass">${s}</span></div>
+            <div class="summary-line"><span class="fail">Failed</span><span class="fail">${r}</span></div>
+          </div>
+        `,$(`
+Done: ${s} passed, ${r} failed`),o.destroy()}catch(o){$(`Error: ${o.message}
+${o.stack}`)}finally{I.disabled=!1}});document.getElementById("loadAndProfile").addEventListener("click",async()=>{const I=document.getElementById("loadAndProfile"),e=document.getElementById("profStatus"),t=document.getElementById("profCategories"),o=document.getElementById("profTopOps");I.disabled=!0,t.innerHTML="",o.innerHTML="";try{const s=document.getElementById("profRepo").value.trim();e.textContent="Loading model...",$("Profiler: loading model from "+s);const{GPUContext:r}=await et(async()=>{const{GPUContext:c}=await import("./gpu-ops-Da8QZLNh.js").then(_=>_.g);return{GPUContext:c}},[]),{Qwen35Model:a}=await et(async()=>{const{Qwen35Model:c}=await import("./qwen35-model-BU7SY47A.js");return{Qwen35Model:c}},__vite__mapDeps([0,1])),{loadModelWeights:d,loadConfig:f,loadQuantConfig:w}=await et(async()=>{const{loadModelWeights:c,loadConfig:_,loadQuantConfig:b}=await import("./safetensors-loader-CwGm5mJX.js");return{loadModelWeights:c,loadConfig:_,loadQuantConfig:b}},[]),{Profiler:h}=await et(async()=>{const{Profiler:c}=await import("./profiler-DYUyiq-B.js");return{Profiler:c}},[]),F=new r;await F.init(),e.textContent="Fetching config...";const n=await f(s),B=await w(s);e.textContent="Downloading weights (this may take a while)...";const l=await d(s,c=>{if(c.phase==="downloading"){const _=c.total?(c.loaded/c.total*100).toFixed(0):"?";e.textContent=`Downloading: ${c.filesLoaded}/${c.filesTotal} shards (${_}%)`}else c.phase==="parsing"&&(e.textContent=`Parsing ${c.file}...`)});e.textContent="Uploading to GPU...";const p=new a(F,n,B);p.compilePipelines(),p.loadWeights(l),p.initBuffers(),e.textContent="Warming up...",await p.forward(1,0);const y=p.textCfg.linear_num_key_heads,k=p.textCfg.linear_key_head_dim,u=p.textCfg.linear_value_head_dim,m=y*(k+k+u);for(let c=0;c<p.numLayers;c++)p.layerTypes[c]==="linear_attention"&&(F.device.queue.writeBuffer(p.linState[c],0,new ArrayBuffer(y*k*u*4)),F.device.queue.writeBuffer(p.linConvHist[c],0,new 
ArrayBuffer(3*m*4)));p.seqLen=0,e.textContent="Profiling forward pass...",$("Profiling single forward pass...");const i=await new h(p).profileForward(1,0);e.textContent=`Done! Estimated: ${i.estimatedTokPerSec} tok/s (sync overhead inflates times)`,$(`
+Profile complete: ${i.total.toFixed(1)}ms total (~${i.estimatedTokPerSec} tok/s with sync overhead)`),$(`
+Category breakdown:`);for(const c of i.categories){const _=document.createElement("div");_.className="prof-row";const b=c.name.includes("matvec")||c.name==="lm_head"?"var(--orange)":c.name.includes("attention")||c.name.includes("deltanet")?"var(--blue)":"var(--green)";_.innerHTML=`
+            <span class="prof-name">${c.name}</span>
+            <span class="prof-time">${c.time}ms</span>
+            <span class="prof-pct">${c.pct}%</span>
+            <div class="prof-bar">
+              <div class="bar-track">
+                <div class="bar-fill" style="width:${c.pct}%; background:${b};"></div>
+                <span class="bar-label">${c.pct}%</span>
+              </div>
+            </div>
+          `,t.appendChild(_),$(`  ${c.name.padEnd(25)} ${c.time.padStart(8)}ms  ${c.pct.padStart(5)}%`)}document.getElementById("topOpsTitle").style.display="",$(`
+Top 20 individual operations:`);for(const c of i.topOps){const _=document.createElement("div");_.className="prof-row",_.innerHTML=`
+            <span class="prof-name">${c.name}</span>
+            <span class="prof-time">${c.time}ms</span>
+            <span class="prof-pct">${c.pct}%</span>
+            <div class="prof-bar">
+              <div class="bar-track">
+                <div class="bar-fill" style="width:${c.pct}%; background:var(--orange);"></div>
+              </div>
+            </div>
+          `,o.appendChild(_),$(`  ${c.name.padEnd(35)} ${c.time.padStart(8)}ms  ${c.pct.padStart(5)}%`)}F.destroy()}catch(s){e.textContent=`Error: ${s.message}`,$(`Profiler error: ${s.message}
+${s.stack}`)}finally{I.disabled=!1}});document.getElementById("runBenchmark").addEventListener("click",async()=>{const I=document.getElementById("runBenchmark"),e=document.getElementById("bmResults");I.disabled=!0,e.innerHTML="<p style='color:var(--dim)'>Initializing WebGPU...</p>",$("Starting GPTQ benchmark...");try{const t=new ot;await t.init(),e.innerHTML="";let o="";const s=await t.benchmarkGPTQ(r=>{if(r.label!==o){o=r.label;const f=document.createElement("h2");f.textContent=r.label,e.appendChild(f)}const a=document.createElement("div");a.className="prof-row";const d=(r.ns===1,"");a.innerHTML=`
+            <span class="prof-name">splits=${r.ns} (${r.wgs} WGs)</span>
+            <span class="prof-time">${r.avgMs}ms</span>
+            <span class="prof-pct">${r.bwGBs} GB/s</span>
+          `,e.appendChild(a),$(`  ${r.label} splits=${r.ns} WGs=${r.wgs}: ${r.avgMs}ms, ${r.bwGBs} GB/s`)});$(`
+Benchmark complete.`),t.destroy()}catch(t){e.innerHTML=`<p style="color:var(--red)">${t.message}</p>`,$(`Benchmark error: ${t.message}
+${t.stack}`)}finally{I.disabled=!1}});

index.html CHANGED Viewed

@@ -1,19 +1,146 @@
-<!doctype html>
-<html>
-	<head>
-		<meta charset="utf-8" />
-		<meta name="viewport" content="width=device-width" />
-		<title>My static Space</title>
-		<link rel="stylesheet" href="style.css" />
-	</head>
-	<body>
-		<div class="card">
-			<h1>Welcome to your static Space!</h1>
-			<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-			<p>
-				Also don't forget to check the
-				<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-			</p>
-		</div>
-	</body>
 </html>

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>TensorBend</title>
+  <link rel="preconnect" href="https://fonts.googleapis.com" />
+  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
+  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet" />
+  <style>
+    *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
+    :root {
+      --bg-0: #0a0a0a; --bg-1: #111111; --bg-2: #1a1a1a; --bg-3: #222222;
+      --accent: #2dd4bf; --accent-hover: #14b8a6; --accent-dim: rgba(45,212,191,0.10);
+      --text-1: #ffffff; --text-2: #999999; --text-3: #555555;
+      --border: #2a2a2a; --border-light: #333333;
+      --font: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
+      --mono: 'SF Mono', 'Fira Code', 'Cascadia Code', monospace;
+      --chat-max: 720px;
+    }
+    html, body { height: 100%; font-family: var(--font); background: var(--bg-0); color: var(--text-1); font-size: 15px; -webkit-font-smoothing: antialiased; }
+    input, textarea, button, select { font-family: inherit; font-size: inherit; }
+    ::-webkit-scrollbar { width: 5px; } ::-webkit-scrollbar-track { background: transparent; } ::-webkit-scrollbar-thumb { background: var(--border-light); border-radius: 3px; }
+    /* 100vh is the fallback; 100dvh follows the visible viewport so collapsing mobile browser UI cannot push the input row off-screen. */
+    .app { display: flex; flex-direction: column; height: 100vh; height: 100dvh; }
+    /* Top bar */
+    .topbar { display: flex; align-items: center; justify-content: space-between; height: 48px; padding: 0 20px; border-bottom: 1px solid var(--border); background: var(--bg-0); flex-shrink: 0; }
+    .topbar-left { display: flex; align-items: center; gap: 12px; }
+    .topbar-title { font-size: 14px; font-weight: 700; letter-spacing: 0.04em; text-transform: uppercase; color: var(--accent); }
+    .topbar-model { font-size: 13px; color: var(--text-2); font-weight: 500; }
+    .topbar-right { display: flex; align-items: center; gap: 10px; }
+    .status-dot { width: 7px; height: 7px; border-radius: 50%; background: var(--text-3); flex-shrink: 0; }
+    .status-dot.ready { background: #4ade80; }
+    .status-dot.loading, .status-dot.generating { background: var(--accent); animation: pulse 1.2s infinite; }
+    .status-dot.error { background: #ef4444; }
+    @keyframes pulse { 0%, 100% { opacity: 1; } 50% { opacity: 0.3; } }
+    .status-text { font-size: 11px; color: var(--text-3); text-transform: uppercase; letter-spacing: 0.06em; font-weight: 600; }
+    /* Stats bar */
+    .stats-bar { display: flex; align-items: center; justify-content: center; gap: 28px; padding: 7px 20px; border-bottom: 1px solid var(--border); background: var(--bg-1); font-size: 11px; flex-shrink: 0; }
+    .stats-bar:empty { display: none; }
+    .stat { display: flex; align-items: center; gap: 6px; }
+    .stat-label { color: var(--text-3); text-transform: uppercase; letter-spacing: 0.08em; font-weight: 600; font-size: 10px; }
+    .stat-value { color: var(--text-2); font-weight: 600; font-variant-numeric: tabular-nums; font-family: var(--mono); font-size: 12px; }
+    .stat-value.highlight { color: var(--accent); font-size: 13px; }
+    /* Chat area */
+    .chat-container { flex: 1; overflow-y: auto; }
+    .chat-scroll { max-width: var(--chat-max); margin: 0 auto; padding: 32px 20px 20px; display: flex; flex-direction: column; gap: 28px; min-height: 100%; }
+    /* Welcome */
+    .welcome { flex: 1; display: flex; flex-direction: column; align-items: center; justify-content: center; text-align: center; gap: 20px; }
+    .welcome h1 { font-size: 32px; font-weight: 700; color: var(--accent); }
+    .welcome p { color: var(--text-2); font-size: 14px; line-height: 1.7; max-width: 460px; }
+    .load-section { margin-top: 12px; display: flex; flex-direction: column; align-items: center; gap: 12px; }
+    .repo-input { width: 340px; padding: 11px 14px; border-radius: 8px; border: 1px solid var(--border); background: var(--bg-2); color: var(--text-1); outline: none; font-family: var(--mono); font-size: 13px; text-align: center; }
+    .repo-input:focus { border-color: var(--accent); }
+    .btn-load { padding: 11px 28px; border-radius: 8px; border: none; background: var(--accent); color: #fff; font-weight: 600; cursor: pointer; font-size: 14px; transition: background 0.15s; letter-spacing: 0.02em; }
+    .btn-load:hover { background: var(--accent-hover); }
+    .btn-load:disabled { opacity: 0.4; cursor: not-allowed; }
+    /* Download progress */
+    .download-bar { padding: 10px 20px; background: var(--accent-dim); border-bottom: 1px solid var(--border); text-align: center; flex-shrink: 0; }
+    .download-text { font-size: 12px; color: var(--text-2); margin-bottom: 6px; }
+    .download-track { max-width: 360px; margin: 0 auto; height: 3px; background: var(--border); border-radius: 2px; overflow: hidden; }
+    .download-fill { height: 100%; width: 0%; background: var(--accent); border-radius: 2px; transition: width 0.3s; }
+    /* Messages */
+    .msg-group { display: flex; flex-direction: column; gap: 4px; }
+    .msg-role { font-size: 11px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.08em; color: var(--text-3); margin-bottom: 4px; }
+    .msg-content { line-height: 1.7; font-size: 15px; white-space: pre-wrap; word-wrap: break-word; color: var(--text-1); }
+    .msg-user .msg-role { color: var(--text-2); }
+    /* Thinking block */
+    .think-block { margin-bottom: 10px; border-left: 2px solid var(--accent); padding: 10px 14px; background: var(--accent-dim); border-radius: 0 6px 6px 0; }
+    .think-toggle { display: flex; align-items: center; gap: 7px; cursor: pointer; user-select: none; font-size: 11px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.06em; color: var(--accent); }
+    .think-toggle:hover { opacity: 0.8; }
+    .think-arrow { font-size: 9px; transition: transform 0.2s; display: inline-block; }
+    .think-arrow.open { transform: rotate(90deg); }
+    .think-content { font-size: 13px; color: var(--text-2); line-height: 1.65; white-space: pre-wrap; word-wrap: break-word; margin-top: 8px; overflow: hidden; max-height: 600px; transition: max-height 0.3s, margin 0.3s, padding 0.3s; }
+    .think-content.collapsed { max-height: 0; margin-top: 0; padding: 0; }
+    /* Message stats */
+    .msg-stats { font-size: 10px; color: var(--text-3); margin-top: 8px; font-family: var(--mono); display: flex; gap: 8px; letter-spacing: 0.02em; }
+    .msg-stats .sep { opacity: 0.3; }
+    /* Images */
+    .msg-images { display: flex; gap: 6px; margin-bottom: 8px; flex-wrap: wrap; }
+    .msg-img { max-width: 200px; max-height: 200px; border-radius: 6px; object-fit: cover; border: 1px solid var(--border); }
+    /* Image preview */
+    .image-preview { display: flex; gap: 6px; padding: 0 20px; flex-wrap: wrap; max-width: var(--chat-max); margin: 0 auto; width: 100%; }
+    .image-preview:empty { display: none; }
+    .img-thumb { position: relative; width: 52px; height: 52px; border-radius: 6px; overflow: hidden; border: 1px solid var(--border); }
+    .img-thumb img { width: 100%; height: 100%; object-fit: cover; }
+    .img-remove { position: absolute; top: 2px; right: 2px; width: 16px; height: 16px; border-radius: 50%; border: none; background: rgba(0,0,0,0.8); color: #fff; font-size: 9px; cursor: pointer; display: flex; align-items: center; justify-content: center; }
+    /* Input area */
+    .input-wrapper { border-top: 1px solid var(--border); background: var(--bg-0); padding: 14px 20px 22px; flex-shrink: 0; }
+    .input-row { max-width: var(--chat-max); margin: 0 auto; display: flex; gap: 8px; align-items: flex-end; position: relative; }
+    .input-row textarea { flex: 1; padding: 12px 48px 12px 14px; border-radius: 10px; border: 1px solid var(--border); background: var(--bg-2); color: var(--text-1); outline: none; resize: none; height: 48px; max-height: 160px; line-height: 1.5; font-size: 15px; }
+    .input-row textarea:focus { border-color: var(--accent); }
+    .input-row textarea:disabled { opacity: 0.4; }
+    .send-btn { position: absolute; right: 8px; bottom: 8px; width: 32px; height: 32px; border-radius: 6px; border: none; background: var(--accent); color: #fff; cursor: pointer; display: flex; align-items: center; justify-content: center; transition: background 0.15s; }
+    .send-btn:hover:not(:disabled) { background: var(--accent-hover); }
+    .send-btn:disabled { opacity: 0.2; cursor: not-allowed; }
+    .send-btn svg { width: 15px; height: 15px; }
+    .img-upload-btn { width: 32px; height: 32px; border-radius: 6px; border: 1px solid var(--border); background: transparent; color: var(--text-3); cursor: pointer; display: flex; align-items: center; justify-content: center; flex-shrink: 0; font-size: 16px; }
+    .img-upload-btn:hover { background: var(--bg-2); color: var(--text-2); }
+    .input-hint { text-align: center; font-size: 10px; color: var(--text-3); margin-top: 8px; max-width: var(--chat-max); margin-left: auto; margin-right: auto; text-transform: uppercase; letter-spacing: 0.06em; }
+    /* Clear / Settings buttons */
+    .btn-clear, .btn-settings { padding: 4px 10px; border-radius: 5px; border: 1px solid var(--border); background: transparent; color: var(--text-3); font-size: 11px; cursor: pointer; text-transform: uppercase; letter-spacing: 0.04em; font-weight: 600; display: flex; align-items: center; justify-content: center; }
+    .btn-clear:hover, .btn-settings:hover { background: var(--bg-2); color: var(--text-2); }
+    .btn-settings { width: 30px; height: 26px; padding: 0; }
+    .btn-settings svg { pointer-events: none; }
+    /* Settings panel */
+    .settings-panel { padding: 12px 20px; background: var(--bg-1); border-bottom: 1px solid var(--border); display: flex; flex-wrap: wrap; gap: 10px 24px; align-items: center; justify-content: center; }
+    .settings-row { display: flex; align-items: center; gap: 8px; }
+    .settings-label { font-size: 10px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.06em; color: var(--text-3); min-width: 56px; white-space: nowrap; }
+    .settings-select { padding: 4px 8px; border-radius: 5px; border: 1px solid var(--border); background: var(--bg-2); color: var(--text-1); font-size: 12px; outline: none; cursor: pointer; }
+    .settings-select:focus { border-color: var(--accent); }
+    .settings-slider { width: 80px; accent-color: var(--accent); height: 4px; cursor: pointer; }
+    .settings-val { font-size: 11px; color: var(--text-2); font-family: var(--mono); font-variant-numeric: tabular-nums; min-width: 32px; text-align: right; }
+    .settings-divider { width: 1px; height: 20px; background: var(--border); }
+    .toggle-btn { padding: 3px 12px; border-radius: 4px; border: 1px solid var(--border); background: var(--bg-3); color: var(--text-3); font-size: 10px; font-weight: 700; letter-spacing: 0.06em; cursor: pointer; transition: all 0.15s; }
+    .toggle-btn.active { background: var(--accent); color: #fff; border-color: var(--accent); }
+    .toggle-btn:hover { border-color: var(--accent); }
+    /* Toast */
+    .toast { position: fixed; bottom: 24px; left: 50%; transform: translateX(-50%) translateY(20px); padding: 10px 20px; border-radius: 6px; font-size: 13px; font-weight: 500; background: var(--bg-2); color: var(--text-1); border: 1px solid var(--border); opacity: 0; transition: all 0.3s; z-index: 1000; max-width: 480px; }
+    .toast.show { transform: translateX(-50%) translateY(0); opacity: 1; }
+    .toast-error { border-color: #ef4444; color: #ef4444; }
+    .toast-success { border-color: var(--accent); color: var(--accent); }
+  </style>
+  <script type="module" crossorigin src="/assets/main-D8aBnLGG.js"></script>
+  <link rel="modulepreload" crossorigin href="/assets/gpu-ops-Da8QZLNh.js">
+  <link rel="modulepreload" crossorigin href="/assets/qwen35-model-BU7SY47A.js">
+  <link rel="modulepreload" crossorigin href="/assets/safetensors-loader-CwGm5mJX.js">
+</head>
+<body>
+  <div id="app"></div>
+  <!-- The page content comes from the module script; this fallback keeps the page from being blank when scripts are disabled. -->
+  <noscript><p style="padding: 24px; text-align: center;">TensorBend needs JavaScript and a WebGPU-capable browser (Chrome or Edge).</p></noscript>
+</body>
 </html>

style.css DELETED Viewed

@@ -1,28 +0,0 @@
-body {
-	padding: 2rem;
-	font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
-}
-h1 {
-	font-size: 16px;
-	margin-top: 0;
-}
-p {
-	color: rgb(107, 114, 128);
-	font-size: 15px;
-	margin-bottom: 10px;
-	margin-top: 5px;
-}
-.card {
-	max-width: 620px;
-	margin: 0 auto;
-	padding: 16px;
-	border: 1px solid lightgray;
-	border-radius: 16px;
-}
-.card p:last-child {
-	margin-bottom: 0;
-}

test.html ADDED Viewed

	@@ -0,0 +1,91 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>TensorBend — Shader Tests & Profiler</title>
+  <style>
+    *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
+    :root { --bg: #0a0a10; --bg2: #12121e; --border: #222238; --text: #e0e0ee; --dim: #666680; --green: #4ade80; --red: #ef4444; --blue: #4a9eff; --orange: #f59e0b; }
+    body { background: var(--bg); color: var(--text); font-family: 'SF Mono', 'Fira Code', monospace; font-size: 13px; padding: 24px; }
+    h1 { font-size: 1.2rem; margin-bottom: 4px; }
+    h2 { font-size: 0.9rem; color: var(--blue); margin: 20px 0 8px; text-transform: uppercase; letter-spacing: 0.05em; }
+    .subtitle { color: var(--dim); font-size: 0.75rem; margin-bottom: 16px; }
+    .tabs { display: flex; gap: 2px; margin-bottom: 16px; }
+    .tab { padding: 8px 20px; background: var(--bg2); border: 1px solid var(--border); color: var(--dim); cursor: pointer; font-size: 0.8rem; font-weight: 600; }
+    .tab:first-child { border-radius: 6px 0 0 6px; }
+    .tab:last-child { border-radius: 0 6px 6px 0; }
+    .tab.active { background: var(--blue); color: #fff; border-color: var(--blue); }
+    .panel { display: none; }
+    .panel.active { display: block; }
+    .test-row { display: flex; align-items: center; gap: 12px; padding: 6px 12px; background: var(--bg2); border: 1px solid var(--border); border-radius: 6px; margin-bottom: 4px; }
+    .test-icon { font-size: 1rem; width: 20px; text-align: center; }
+    .test-name { flex: 1; }
+    .test-err { color: var(--dim); font-size: 0.75rem; }
+    .test-detail { color: var(--red); font-size: 0.7rem; margin-top: 2px; }
+    .pass { color: var(--green); }
+    .fail { color: var(--red); }
+    .pending { color: var(--dim); }
+    .summary { padding: 12px; background: var(--bg2); border: 1px solid var(--border); border-radius: 8px; margin-top: 12px; }
+    .summary-line { display: flex; justify-content: space-between; padding: 2px 0; }
+    .bar-track { height: 14px; background: var(--bg); border-radius: 3px; overflow: hidden; position: relative; margin: 2px 0; }
+    .bar-fill { height: 100%; border-radius: 3px; }
+    .bar-label { position: absolute; right: 4px; top: 0; font-size: 0.65rem; line-height: 14px; color: var(--text); }
+    .prof-row { display: flex; align-items: center; gap: 8px; padding: 4px 12px; background: var(--bg2); border: 1px solid var(--border); border-radius: 4px; margin-bottom: 2px; font-size: 0.75rem; }
+    .prof-name { flex: 1; min-width: 200px; }
+    .prof-time { width: 80px; text-align: right; color: var(--orange); }
+    .prof-pct { width: 50px; text-align: right; color: var(--dim); }
+    .prof-bar { flex: 1; max-width: 300px; }
+    button.run-btn { padding: 10px 24px; background: var(--blue); color: #fff; border: none; border-radius: 6px; font-weight: 600; cursor: pointer; font-family: inherit; font-size: 0.85rem; }
+    button.run-btn:disabled { opacity: 0.4; cursor: not-allowed; }
+    button.run-btn:hover:not(:disabled) { filter: brightness(1.15); }
+    #log { margin-top: 12px; padding: 12px; background: var(--bg2); border: 1px solid var(--border); border-radius: 8px; max-height: 300px; overflow-y: auto; white-space: pre-wrap; font-size: 0.7rem; color: var(--dim); }
+  </style>
+  <script type="module" crossorigin src="/assets/test-s5fGKojE.js"></script>
+  <link rel="modulepreload" crossorigin href="/assets/gpu-ops-Da8QZLNh.js">
+</head>
+<body>
+  <h1>TensorBend Shader Tests & Profiler</h1>
+  <p class="subtitle">Validates every WGSL compute shader against CPU reference implementations</p>
+  <div class="tabs">
+    <div class="tab active" data-panel="tests">Shader Tests</div>
+    <div class="tab" data-panel="profiler">Profiler</div>
+    <div class="tab" data-panel="benchmark">Benchmark</div>
+  </div>
+  <div id="tests" class="panel active">
+    <button class="run-btn" id="runTests">Run All Shader Tests</button>
+    <div id="testResults" style="margin-top: 12px;"></div>
+    <div id="testSummary"></div>
+  </div>
+  <div id="profiler" class="panel">
+    <p style="color: var(--dim); margin-bottom: 12px; font-size: 0.8rem;">
+      Profiles a single forward pass with per-operation timing.
+      Requires a loaded model — enter repo below and load first.
+    </p>
+    <!-- Repo field plus trigger button; the input has no visible label, so aria-label provides its accessible name. -->
+    <div style="display: flex; gap: 8px; margin-bottom: 12px; align-items: center;">
+      <input type="text" id="profRepo" value="Intel/Qwen3.5-2B-int4-AutoRound"
+        aria-label="Hugging Face model repository" spellcheck="false" autocomplete="off"
+        style="flex:1; padding: 8px 12px; background: var(--bg2); border: 1px solid var(--border); border-radius: 6px; color: var(--text); font-family: inherit;" />
+      <button class="run-btn" id="loadAndProfile">Load & Profile</button>
+    </div>
+    <div id="profStatus" style="color: var(--dim); margin-bottom: 8px;"></div>
+    <div id="profCategories"></div>
+    <h2 id="topOpsTitle" style="display:none;">Top Operations</h2>
+    <div id="profTopOps"></div>
+  </div>
+  <div id="benchmark" class="panel">
+    <p style="color: var(--dim); margin-bottom: 12px; font-size: 0.8rem;">
+      Measures GPTQ matvec memory bandwidth at model-realistic sizes.
+      Compares regular vs split-K dispatch strategies.
+    </p>
+    <button class="run-btn" id="runBenchmark">Run GPTQ Benchmark</button>
+    <div id="bmResults" style="margin-top: 12px;"></div>
+  </div>
+  <div id="log"></div>
+</body>
+</html>

vite.svg ADDED Viewed