Update AgentKernel Lite WASM decoder kernel

Browse files

Files changed (2) hide show

runtime/encdec_runtime.js +193 -27
runtime/model_stack_bitnet_wasm_bg.wasm +2 -2

runtime/encdec_runtime.js CHANGED Viewed

@@ -77,6 +77,11 @@ function zeros(length) {
   return new Float32Array(length);
 }
 function addInPlace(dst, src) {
   for (let i = 0; i < dst.length; i += 1) {
     dst[i] += src[i];
@@ -412,6 +417,7 @@ export class BitNetEncoderDecoderWebGPU {
     this.linears = linears;
     this.denseLinears = {};
     this.graph = manifest.graph;
     this.decoderRotary = decoderUsesRotary(manifest, this.graph);
     this.decoderRotaryBase = rotaryBase(manifest, this.graph);
   }
@@ -495,6 +501,70 @@ export class BitNetEncoderDecoderWebGPU {
     return denseLayer;
   }
   tensor(name) {
     const tensor = this.dense[name];
     if (!tensor) throw new Error(`missing dense tensor: ${name}`);
@@ -504,6 +574,9 @@ export class BitNetEncoderDecoderWebGPU {
   norm(prefix, x, rows) {
     const weight = this.tensor(`${prefix}.weight`);
     const bias = this.dense[`${prefix}.bias`]?.data || null;
     if (bias) {
       return layerNorm(x, rows, this.graph.d_model, weight, bias);
     }
@@ -516,53 +589,84 @@ export class BitNetEncoderDecoderWebGPU {
     );
   }
   async attentionBlock(prefix, x, seqLen, kv, kvLen, causal) {
     const dModel = this.graph.d_model;
     const nHeads = this.graph.n_heads;
     const headDim = this.graph.head_dim;
-    const q = await this.linear(`${prefix}.w_q`).run(x, seqLen);
     const kInput = kv || x;
     const kRows = kvLen || seqLen;
-    const k = await this.linear(`${prefix}.w_k`).run(kInput, kRows);
-    const v = await this.linear(`${prefix}.w_v`).run(kInput, kRows);
     if (causal && this.decoderRotary) {
       applyRotaryMergedInPlace(q, k, seqLen, nHeads, headDim, this.decoderRotaryBase, 0);
     }
-    const merged = attention(q, k, v, seqLen, kRows, nHeads, headDim, causal);
     return this.linear(`${prefix}.w_o`).run(merged, seqLen);
   }
   async selfAttentionIncremental(prefix, x, layerCache) {
     const nHeads = this.graph.n_heads;
     const headDim = this.graph.head_dim;
-    const q = await this.linear(`${prefix}.w_q`).run(x, 1);
-    const kNew = await this.linear(`${prefix}.w_k`).run(x, 1);
-    const vNew = await this.linear(`${prefix}.w_v`).run(x, 1);
     const position = Number(layerCache.selfLen || 0);
     if (this.decoderRotary) {
       applyRotaryMergedInPlace(q, kNew, 1, nHeads, headDim, this.decoderRotaryBase, position);
     }
     layerCache.selfK = appendCachedRows(layerCache, "selfK", kNew);
     layerCache.selfV = appendCachedRows(layerCache, "selfV", vNew);
     layerCache.selfLen = Number(layerCache.selfLen || 0) + 1;
-    const merged = attention(q, layerCache.selfK, layerCache.selfV, 1, layerCache.selfLen, nHeads, headDim, false);
     return this.linear(`${prefix}.w_o`).run(merged, 1);
   }
   async selfAttentionIncrementalSpan(prefix, x, seqLen, layerCache) {
     const nHeads = this.graph.n_heads;
     const headDim = this.graph.head_dim;
-    const q = await this.linear(`${prefix}.w_q`).run(x, seqLen);
-    const kNew = await this.linear(`${prefix}.w_k`).run(x, seqLen);
-    const vNew = await this.linear(`${prefix}.w_v`).run(x, seqLen);
     const position = Number(layerCache.selfLen || 0);
     if (this.decoderRotary) {
       applyRotaryMergedInPlace(q, kNew, seqLen, nHeads, headDim, this.decoderRotaryBase, position);
     }
     layerCache.selfK = appendCachedRows(layerCache, "selfK", kNew);
     layerCache.selfV = appendCachedRows(layerCache, "selfV", vNew);
     layerCache.selfLen = Number(layerCache.selfLen || 0) + seqLen;
-    const merged = attention(q, layerCache.selfK, layerCache.selfV, seqLen, layerCache.selfLen, nHeads, headDim, true, position);
     return this.linear(`${prefix}.w_o`).run(merged, seqLen);
   }
@@ -570,11 +674,20 @@ export class BitNetEncoderDecoderWebGPU {
     const nHeads = this.graph.n_heads;
     const headDim = this.graph.head_dim;
     const q = await this.linear(`${prefix}.w_q`).run(x, 1);
     if (!layerCache.crossK || !layerCache.crossV) {
-      layerCache.crossK = await this.linear(`${prefix}.w_k`).run(memory, memoryLen);
-      layerCache.crossV = await this.linear(`${prefix}.w_v`).run(memory, memoryLen);
     }
-    const merged = attention(q, layerCache.crossK, layerCache.crossV, 1, memoryLen, nHeads, headDim, false);
     return this.linear(`${prefix}.w_o`).run(merged, 1);
   }
@@ -582,25 +695,39 @@ export class BitNetEncoderDecoderWebGPU {
     const nHeads = this.graph.n_heads;
     const headDim = this.graph.head_dim;
     const q = await this.linear(`${prefix}.w_q`).run(x, seqLen);
     if (!layerCache.crossK || !layerCache.crossV) {
-      layerCache.crossK = await this.linear(`${prefix}.w_k`).run(memory, memoryLen);
-      layerCache.crossV = await this.linear(`${prefix}.w_v`).run(memory, memoryLen);
     }
-    const merged = attention(q, layerCache.crossK, layerCache.crossV, seqLen, memoryLen, nHeads, headDim, false);
     return this.linear(`${prefix}.w_o`).run(merged, seqLen);
   }
   async mlp(prefix, x, seqLen) {
     const wIn = this.linear(`${prefix}.w_in`);
     const wOut = this.linear(`${prefix}.w_out`);
     const hidden = await wIn.run(x, seqLen);
     const activation = String(this.graph.activation || "silu").toLowerCase();
     const isGated =
       wIn.layout.logicalOut === wOut.layout.logicalIn * 2 ||
       hidden.length === seqLen * wOut.layout.logicalIn * 2;
     const activated = isGated || ["swiglu", "gated-silu", "geglu", "reglu"].includes(activation)
-      ? gatedActivation(hidden, seqLen, wOut.layout.logicalIn, activation)
-      : activate(hidden, activation);
     return wOut.run(activated, seqLen);
   }
@@ -624,6 +751,14 @@ export class BitNetEncoderDecoderWebGPU {
   }
   async decoderLayerIncremental(index, x, memory, memoryLen, layerCache) {
     let n = this.norm(`decoder.${index}.self_attn_block.n1`, x, 1);
     x = addInPlace(
       x.slice(),
@@ -791,24 +926,54 @@ export class BitNetEncoderDecoderGenerationSession {
   }
   async next(tokenId) {
     await this.prepare();
     let x = embed([Number(tokenId)], this.runtime.tensor("dec_embed.weight"), this.runtime.graph.d_model);
     for (let i = 0; i < this.runtime.graph.n_layers; i += 1) {
       x = await this.runtime.decoderLayerIncremental(i, x, this.memory, this.memoryLen, this.layerCaches[i]);
     }
-    const hidden = layerNorm(
-      x,
-      1,
-      this.runtime.graph.d_model,
-      this.runtime.tensor("dec_norm.weight"),
-      this.runtime.dense["dec_norm.bias"]?.data,
     );
-    return this.runtime.linear("lm_head").run(hidden, 1);
   }
   cloneState() {
     return this.layerCaches.map((cache) => {
       const cloned = { ...cache };
       if (cache.selfK) {
         cloned.selfK = cache.selfK.slice();
         cloned.selfKLength = cloned.selfK.length;
@@ -849,6 +1014,7 @@ export class BitNetEncoderDecoderGenerationSession {
 export class BitNetEncoderDecoderWASM extends BitNetEncoderDecoderWebGPU {
   constructor(manifest, manifestUrl, denseTensors, linears) {
     super(null, manifest, manifestUrl, denseTensors, linears);
   }
   static async fromManifestUrl(manifestUrl, options = {}) {

   return new Float32Array(length);
 }
+function toUint32IdArray(ids) {
+  if (ids instanceof Uint32Array) return ids;
+  return Uint32Array.from(Array.from(ids || [], Number).filter((id) => Number.isFinite(id)));
+}
 function addInPlace(dst, src) {
   for (let i = 0; i < dst.length; i += 1) {
     dst[i] += src[i];
     this.linears = linears;
     this.denseLinears = {};
     this.graph = manifest.graph;
+    this.wasmOps = null;
     this.decoderRotary = decoderUsesRotary(manifest, this.graph);
     this.decoderRotaryBase = rotaryBase(manifest, this.graph);
   }
     return denseLayer;
   }
+  linear3(firstName, secondName, thirdName, input, rows) {
+    const first = this.linear(firstName);
+    const second = this.linear(secondName);
+    const third = this.linear(thirdName);
+    if (this.wasmOps?.bitnet_linear3_f32 && first.handle && second.handle && third.handle) {
+      const merged = this.wasmOps.bitnet_linear3_f32(first.handle, second.handle, third.handle, input, rows);
+      const firstLen = rows * first.layout.logicalOut;
+      const secondLen = rows * second.layout.logicalOut;
+      return [
+        merged.slice(0, firstLen),
+        merged.slice(firstLen, firstLen + secondLen),
+        merged.slice(firstLen + secondLen),
+      ];
+    }
+    return [first.run(input, rows), second.run(input, rows), third.run(input, rows)];
+  }
+  linear2(firstName, secondName, input, rows) {
+    const first = this.linear(firstName);
+    const second = this.linear(secondName);
+    if (this.wasmOps?.bitnet_linear2_f32 && first.handle && second.handle) {
+      const merged = this.wasmOps.bitnet_linear2_f32(first.handle, second.handle, input, rows);
+      const firstLen = rows * first.layout.logicalOut;
+      return [merged.slice(0, firstLen), merged.slice(firstLen)];
+    }
+    return [first.run(input, rows), second.run(input, rows)];
+  }
+  decoderLayerHandle(index) {
+    if (!this.wasmOps?.DecoderLayerHandle) return null;
+    const names = [
+      `decoder.${index}.self_attn_block.attn.w_q`,
+      `decoder.${index}.self_attn_block.attn.w_k`,
+      `decoder.${index}.self_attn_block.attn.w_v`,
+      `decoder.${index}.self_attn_block.attn.w_o`,
+      `decoder.${index}.self_attn_block.mlp.w_in`,
+      `decoder.${index}.self_attn_block.mlp.w_out`,
+      `decoder.${index}.cross_block.cross.w_q`,
+      `decoder.${index}.cross_block.cross.w_k`,
+      `decoder.${index}.cross_block.cross.w_v`,
+      `decoder.${index}.cross_block.cross.w_o`,
+      `decoder.${index}.cross_block.mlp.w_in`,
+      `decoder.${index}.cross_block.mlp.w_out`,
+    ];
+    const layers = names.map((name) => this.linear(name));
+    if (!layers.every((layer) => layer?.handle)) return null;
+    return new this.wasmOps.DecoderLayerHandle(
+      ...layers.map((layer) => layer.handle),
+      this.tensor(`decoder.${index}.self_attn_block.n1.weight`),
+      this.dense[`decoder.${index}.self_attn_block.n1.bias`]?.data || new Float32Array(0),
+      this.tensor(`decoder.${index}.self_attn_block.n2.weight`),
+      this.dense[`decoder.${index}.self_attn_block.n2.bias`]?.data || new Float32Array(0),
+      this.tensor(`decoder.${index}.cross_block.n1.weight`),
+      this.dense[`decoder.${index}.cross_block.n1.bias`]?.data || new Float32Array(0),
+      this.tensor(`decoder.${index}.cross_block.n2.weight`),
+      this.dense[`decoder.${index}.cross_block.n2.bias`]?.data || new Float32Array(0),
+      String(this.graph.activation || "silu"),
+      this.graph.d_model,
+      this.graph.n_heads,
+      this.graph.head_dim,
+      this.decoderRotary ? this.decoderRotaryBase : 0,
+    );
+  }
   tensor(name) {
     const tensor = this.dense[name];
     if (!tensor) throw new Error(`missing dense tensor: ${name}`);
   norm(prefix, x, rows) {
     const weight = this.tensor(`${prefix}.weight`);
     const bias = this.dense[`${prefix}.bias`]?.data || null;
+    if (this.wasmOps?.layer_norm_f32 && bias) {
+      return this.wasmOps.layer_norm_f32(x, weight, bias, rows, this.graph.d_model, 1e-5);
+    }
     if (bias) {
       return layerNorm(x, rows, this.graph.d_model, weight, bias);
     }
     );
   }
+  attention(q, k, v, qLen, kvLen, causal, pastLen = 0) {
+    if (this.wasmOps?.attention_f32) {
+      return this.wasmOps.attention_f32(
+        q,
+        k,
+        v,
+        qLen,
+        kvLen,
+        this.graph.n_heads,
+        this.graph.head_dim,
+        Boolean(causal),
+        Number(pastLen || 0),
+      );
+    }
+    return attention(q, k, v, qLen, kvLen, this.graph.n_heads, this.graph.head_dim, causal, pastLen);
+  }
   async attentionBlock(prefix, x, seqLen, kv, kvLen, causal) {
     const dModel = this.graph.d_model;
     const nHeads = this.graph.n_heads;
     const headDim = this.graph.head_dim;
+    let q;
+    let k;
+    let v;
     const kInput = kv || x;
     const kRows = kvLen || seqLen;
+    if (!kv) {
+      [q, k, v] = this.linear3(`${prefix}.w_q`, `${prefix}.w_k`, `${prefix}.w_v`, x, seqLen);
+    } else {
+      q = await this.linear(`${prefix}.w_q`).run(x, seqLen);
+      [k, v] = this.linear2(`${prefix}.w_k`, `${prefix}.w_v`, kInput, kRows);
+    }
     if (causal && this.decoderRotary) {
       applyRotaryMergedInPlace(q, k, seqLen, nHeads, headDim, this.decoderRotaryBase, 0);
     }
+    const merged = this.attention(q, k, v, seqLen, kRows, causal);
     return this.linear(`${prefix}.w_o`).run(merged, seqLen);
   }
   async selfAttentionIncremental(prefix, x, layerCache) {
     const nHeads = this.graph.n_heads;
     const headDim = this.graph.head_dim;
+    const [q, kNew, vNew] = this.linear3(`${prefix}.w_q`, `${prefix}.w_k`, `${prefix}.w_v`, x, 1);
     const position = Number(layerCache.selfLen || 0);
     if (this.decoderRotary) {
       applyRotaryMergedInPlace(q, kNew, 1, nHeads, headDim, this.decoderRotaryBase, position);
     }
+    if (this.wasmOps?.AttentionKvCache) {
+      layerCache.selfAttention ??= new this.wasmOps.AttentionKvCache(nHeads, headDim);
+      const merged = layerCache.selfAttention.append_self_attention(q, kNew, vNew, 1, false);
+      layerCache.selfLen = layerCache.selfAttention.len();
+      return this.linear(`${prefix}.w_o`).run(merged, 1);
+    }
     layerCache.selfK = appendCachedRows(layerCache, "selfK", kNew);
     layerCache.selfV = appendCachedRows(layerCache, "selfV", vNew);
     layerCache.selfLen = Number(layerCache.selfLen || 0) + 1;
+    const merged = this.attention(q, layerCache.selfK, layerCache.selfV, 1, layerCache.selfLen, false);
     return this.linear(`${prefix}.w_o`).run(merged, 1);
   }
   async selfAttentionIncrementalSpan(prefix, x, seqLen, layerCache) {
     const nHeads = this.graph.n_heads;
     const headDim = this.graph.head_dim;
+    const [q, kNew, vNew] = this.linear3(`${prefix}.w_q`, `${prefix}.w_k`, `${prefix}.w_v`, x, seqLen);
     const position = Number(layerCache.selfLen || 0);
     if (this.decoderRotary) {
       applyRotaryMergedInPlace(q, kNew, seqLen, nHeads, headDim, this.decoderRotaryBase, position);
     }
+    if (this.wasmOps?.AttentionKvCache) {
+      layerCache.selfAttention ??= new this.wasmOps.AttentionKvCache(nHeads, headDim);
+      const merged = layerCache.selfAttention.append_self_attention(q, kNew, vNew, seqLen, true);
+      layerCache.selfLen = layerCache.selfAttention.len();
+      return this.linear(`${prefix}.w_o`).run(merged, seqLen);
+    }
     layerCache.selfK = appendCachedRows(layerCache, "selfK", kNew);
     layerCache.selfV = appendCachedRows(layerCache, "selfV", vNew);
     layerCache.selfLen = Number(layerCache.selfLen || 0) + seqLen;
+    const merged = this.attention(q, layerCache.selfK, layerCache.selfV, seqLen, layerCache.selfLen, true, position);
     return this.linear(`${prefix}.w_o`).run(merged, seqLen);
   }
     const nHeads = this.graph.n_heads;
     const headDim = this.graph.head_dim;
     const q = await this.linear(`${prefix}.w_q`).run(x, 1);
+    if (this.wasmOps?.AttentionKvCache) {
+      layerCache.crossAttention ??= new this.wasmOps.AttentionKvCache(nHeads, headDim);
+      if (!layerCache.crossReady) {
+        const [crossK, crossV] = this.linear2(`${prefix}.w_k`, `${prefix}.w_v`, memory, memoryLen);
+        layerCache.crossAttention.set_cross(crossK, crossV, memoryLen);
+        layerCache.crossReady = true;
+      }
+      const merged = layerCache.crossAttention.attention(q, 1, false, 0);
+      return this.linear(`${prefix}.w_o`).run(merged, 1);
+    }
     if (!layerCache.crossK || !layerCache.crossV) {
+      [layerCache.crossK, layerCache.crossV] = this.linear2(`${prefix}.w_k`, `${prefix}.w_v`, memory, memoryLen);
     }
+    const merged = this.attention(q, layerCache.crossK, layerCache.crossV, 1, memoryLen, false);
     return this.linear(`${prefix}.w_o`).run(merged, 1);
   }
     const nHeads = this.graph.n_heads;
     const headDim = this.graph.head_dim;
     const q = await this.linear(`${prefix}.w_q`).run(x, seqLen);
+    if (this.wasmOps?.AttentionKvCache) {
+      layerCache.crossAttention ??= new this.wasmOps.AttentionKvCache(nHeads, headDim);
+      if (!layerCache.crossReady) {
+        const [crossK, crossV] = this.linear2(`${prefix}.w_k`, `${prefix}.w_v`, memory, memoryLen);
+        layerCache.crossAttention.set_cross(crossK, crossV, memoryLen);
+        layerCache.crossReady = true;
+      }
+      const merged = layerCache.crossAttention.attention(q, seqLen, false, 0);
+      return this.linear(`${prefix}.w_o`).run(merged, seqLen);
+    }
     if (!layerCache.crossK || !layerCache.crossV) {
+      [layerCache.crossK, layerCache.crossV] = this.linear2(`${prefix}.w_k`, `${prefix}.w_v`, memory, memoryLen);
     }
+    const merged = this.attention(q, layerCache.crossK, layerCache.crossV, seqLen, memoryLen, false);
     return this.linear(`${prefix}.w_o`).run(merged, seqLen);
   }
   async mlp(prefix, x, seqLen) {
     const wIn = this.linear(`${prefix}.w_in`);
     const wOut = this.linear(`${prefix}.w_out`);
+    if (this.wasmOps?.bitnet_mlp_f32 && wIn.handle && wOut.handle) {
+      return this.wasmOps.bitnet_mlp_f32(wIn.handle, wOut.handle, x, seqLen, String(this.graph.activation || "silu"));
+    }
     const hidden = await wIn.run(x, seqLen);
     const activation = String(this.graph.activation || "silu").toLowerCase();
     const isGated =
       wIn.layout.logicalOut === wOut.layout.logicalIn * 2 ||
       hidden.length === seqLen * wOut.layout.logicalIn * 2;
     const activated = isGated || ["swiglu", "gated-silu", "geglu", "reglu"].includes(activation)
+      ? (this.wasmOps?.gated_activation_f32
+          ? this.wasmOps.gated_activation_f32(hidden, seqLen, wOut.layout.logicalIn, activation)
+          : gatedActivation(hidden, seqLen, wOut.layout.logicalIn, activation))
+      : (this.wasmOps?.activate_f32 ? this.wasmOps.activate_f32(hidden, activation) : activate(hidden, activation));
     return wOut.run(activated, seqLen);
   }
   }
   async decoderLayerIncremental(index, x, memory, memoryLen, layerCache) {
+    if (this.wasmOps?.DecoderLayerHandle) {
+      layerCache.decoderLayer ??= this.decoderLayerHandle(index);
+      if (layerCache.decoderLayer?.next) {
+        const out = layerCache.decoderLayer.next(x, memory, memoryLen);
+        layerCache.selfLen = layerCache.decoderLayer.self_len();
+        return out;
+      }
+    }
     let n = this.norm(`decoder.${index}.self_attn_block.n1`, x, 1);
     x = addInPlace(
       x.slice(),
   }
   async next(tokenId) {
+    const hidden = await this.nextHidden(tokenId);
+    return this.runtime.linear("lm_head").run(hidden, 1);
+  }
+  async nextHidden(tokenId) {
     await this.prepare();
     let x = embed([Number(tokenId)], this.runtime.tensor("dec_embed.weight"), this.runtime.graph.d_model);
     for (let i = 0; i < this.runtime.graph.n_layers; i += 1) {
       x = await this.runtime.decoderLayerIncremental(i, x, this.memory, this.memoryLen, this.layerCaches[i]);
     }
+    return this.runtime.norm("dec_norm", x, 1);
+  }
+  async sampleNext(tokenId, generatedIds, options = {}) {
+    if (!this.runtime.wasmOps?.bitnet_sample_token_f32) return null;
+    const lmHead = this.runtime.linear("lm_head");
+    if (!lmHead.handle) return null;
+    const hidden = await this.nextHidden(tokenId);
+    const sample = this.runtime.wasmOps.bitnet_sample_token_f32(
+      lmHead.handle,
+      hidden,
+      toUint32IdArray(generatedIds),
+      toUint32IdArray(options.blockedIds),
+      Number(options.temperature ?? 0.35),
+      Number(options.topP ?? 0.9),
+      Number(options.repetitionPenalty ?? 1.16),
+      Number(options.randomValue ?? Math.random()),
     );
+    return {
+      tokenId: Number(sample.token_id),
+      probability: Number(sample.probability),
+      topProbability: Number(sample.top_probability),
+      rank: Number(sample.rank),
+    };
   }
   cloneState() {
     return this.layerCaches.map((cache) => {
       const cloned = { ...cache };
+      if (cache.selfAttention?.clone_cache) {
+        cloned.selfAttention = cache.selfAttention.clone_cache();
+      }
+      if (cache.crossAttention?.clone_cache) {
+        cloned.crossAttention = cache.crossAttention.clone_cache();
+      }
+      if (cache.decoderLayer?.clone_cache) {
+        cloned.decoderLayer = cache.decoderLayer.clone_cache();
+      }
       if (cache.selfK) {
         cloned.selfK = cache.selfK.slice();
         cloned.selfKLength = cloned.selfK.length;
 export class BitNetEncoderDecoderWASM extends BitNetEncoderDecoderWebGPU {
   constructor(manifest, manifestUrl, denseTensors, linears) {
     super(null, manifest, manifestUrl, denseTensors, linears);
+    this.wasmOps = Object.values(linears || {}).find((layer) => layer?.wasm)?.wasm || null;
   }
   static async fromManifestUrl(manifestUrl, options = {}) {

runtime/model_stack_bitnet_wasm_bg.wasm CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6dd3b40517258c5ef044cdde24c0a7b7a812fc3b13b55cad0b89794f9a0559c5
-size 69044

 version https://git-lfs.github.com/spec/v1
+oid sha256:3016e4b7ef05f1b94bb19b7017280407281d219341a0d593460d8ba57268cee4
+size 75132