PeytonT
/

agentkernel-lite-100m-bitnet

@@ -1,4 +1,5 @@
 import { BitNetLinearWebGPU } from "./bitnet_webgpu.js";
 function resolveUrl(path, baseUrl) {
   return new URL(path, baseUrl).toString();
@@ -31,6 +32,14 @@ function addInPlace(dst, src) {
   return dst;
 }
 function layerNorm(x, rows, cols, weight, bias, eps = 1e-5) {
   const out = new Float32Array(x.length);
   for (let r = 0; r < rows; r += 1) {
@@ -262,6 +271,31 @@ export class BitNetEncoderDecoderWebGPU {
     return this.linear(`${prefix}.w_o`).run(merged, seqLen);
   }
   async mlp(prefix, x, seqLen) {
     const wIn = this.linear(`${prefix}.w_in`);
     const wOut = this.linear(`${prefix}.w_out`);
@@ -333,6 +367,48 @@ export class BitNetEncoderDecoderWebGPU {
     return addInPlace(x, await this.mlp(`decoder.${index}.cross_block.mlp`, n, seqLen));
   }
   async encode(encInputIds) {
     let x = embed(encInputIds, this.tensor("enc_embed.weight"), this.graph.d_model);
     for (let i = 0; i < this.graph.n_layers; i += 1) {
@@ -366,4 +442,115 @@ export class BitNetEncoderDecoderWebGPU {
     const hidden = await this.decode(decInputIds, memory, encInputIds.length);
     return this.linear("lm_head").run(hidden, decInputIds.length);
   }
 }

 import { BitNetLinearWebGPU } from "./bitnet_webgpu.js";
+import { BitNetLinearWASM } from "./bitnet_wasm_runtime.js";
 function resolveUrl(path, baseUrl) {
   return new URL(path, baseUrl).toString();
   return dst;
 }
+/**
+ * Concatenate two flat numeric buffers into a newly allocated Float32Array.
+ *
+ * @param {Float32Array|null|undefined} existing - Previously accumulated values; may be missing or empty.
+ * @param {Float32Array} next - Values to place after `existing`.
+ * @returns {Float32Array} A copy of `next` when there is nothing accumulated yet,
+ *   otherwise a fresh array holding `existing` followed by `next`.
+ */
+function appendRows(existing, next) {
+  const hasExisting = Boolean(existing) && existing.length !== 0;
+  if (!hasExisting) {
+    // Copy so the caller's buffer is never aliased by the accumulated result.
+    return next.slice();
+  }
+  const combined = new Float32Array(existing.length + next.length);
+  combined.set(existing);
+  combined.set(next, existing.length);
+  return combined;
+}
 function layerNorm(x, rows, cols, weight, bias, eps = 1e-5) {
   const out = new Float32Array(x.length);
   for (let r = 0; r < rows; r += 1) {
     return this.linear(`${prefix}.w_o`).run(merged, seqLen);
   }
+  /**
+   * Self-attention for one new decoder position, using a per-layer key/value cache.
+   *
+   * Projects `x` through the `w_q`, `w_k` and `w_v` linears (one row each), appends the
+   * new key/value rows to `layerCache.selfK` / `layerCache.selfV`, increments
+   * `layerCache.selfLen`, then runs `attention` with the single query row against the
+   * whole cache and feeds the result through the `w_o` linear.
+   *
+   * @param {string} prefix - Name prefix of the attention block's linear layers.
+   * @param {Float32Array} x - Input for the current position.
+   * @param {object} layerCache - Mutable cache object; `selfK`, `selfV`, `selfLen` are updated in place.
+   * @returns {Promise<*>} Whatever the `w_o` linear's `run` resolves to.
+   */
+  async selfAttentionIncremental(prefix, x, layerCache) {
+    const { n_heads: headCount, head_dim: headWidth } = this.graph;
+    const queryRow = await this.linear(`${prefix}.w_q`).run(x, 1);
+    const keyRow = await this.linear(`${prefix}.w_k`).run(x, 1);
+    const valueRow = await this.linear(`${prefix}.w_v`).run(x, 1);
+    const cache = layerCache;
+    cache.selfK = appendRows(cache.selfK, keyRow);
+    cache.selfV = appendRows(cache.selfV, valueRow);
+    // A fresh cache has no selfLen yet, so the count starts from zero.
+    const cachedLen = Number(cache.selfLen || 0) + 1;
+    cache.selfLen = cachedLen;
+    // NOTE(review): the trailing `false` is presumably the causal-mask flag — confirm against `attention`.
+    const context = attention(queryRow, cache.selfK, cache.selfV, 1, cachedLen, headCount, headWidth, false);
+    const outProj = this.linear(`${prefix}.w_o`);
+    return outProj.run(context, 1);
+  }
+  /**
+   * Cross-attention for one decoder position with memoized key/value projections.
+   *
+   * The `w_k` / `w_v` projections of `memory` are computed only when `layerCache.crossK`
+   * or `layerCache.crossV` is missing, and are then stored on `layerCache` for reuse.
+   *
+   * @param {string} prefix - Name prefix of the cross-attention block's linear layers.
+   * @param {Float32Array} x - Input for the current decoder position.
+   * @param {Float32Array} memory - Encoder output passed to the key/value projections.
+   * @param {number} memoryLen - Row count passed along with `memory`.
+   * @param {object} layerCache - Mutable cache object; `crossK` / `crossV` are filled on first use.
+   * @returns {Promise<*>} Whatever the `w_o` linear's `run` resolves to.
+   */
+  async crossAttentionCached(prefix, x, memory, memoryLen, layerCache) {
+    const { n_heads: headCount, head_dim: headWidth } = this.graph;
+    const queryRow = await this.linear(`${prefix}.w_q`).run(x, 1);
+    const isCached = Boolean(layerCache.crossK) && Boolean(layerCache.crossV);
+    if (!isCached) {
+      layerCache.crossK = await this.linear(`${prefix}.w_k`).run(memory, memoryLen);
+      layerCache.crossV = await this.linear(`${prefix}.w_v`).run(memory, memoryLen);
+    }
+    const { crossK, crossV } = layerCache;
+    const context = attention(queryRow, crossK, crossV, 1, memoryLen, headCount, headWidth, false);
+    return this.linear(`${prefix}.w_o`).run(context, 1);
+  }
   async mlp(prefix, x, seqLen) {
     const wIn = this.linear(`${prefix}.w_in`);
     const wOut = this.linear(`${prefix}.w_out`);
     return addInPlace(x, await this.mlp(`decoder.${index}.cross_block.mlp`, n, seqLen));
   }
+  /**
+   * Run decoder layer `index` for a single position (every layerNorm / mlp call here uses 1 row).
+   *
+   * Sequence: layerNorm -> cached self-attention -> residual add, layerNorm -> mlp -> residual add,
+   * layerNorm -> cached cross-attention -> residual add, layerNorm -> mlp -> residual add.
+   *
+   * @param {number} index - Decoder layer index used to build the tensor / linear names.
+   * @param {Float32Array} x - Input for the current position.
+   * @param {Float32Array} memory - Encoder output forwarded to cross-attention.
+   * @param {number} memoryLen - Row count forwarded with `memory`.
+   * @param {object} layerCache - Per-layer cache forwarded to both attention helpers.
+   * @returns {Promise<Float32Array>} The value returned by the last `addInPlace` call.
+   */
+  async decoderLayerIncremental(index, x, memory, memoryLen, layerCache) {
+    const dModel = this.graph.d_model;
+    // Bias tensors are optional: a missing dense entry yields `undefined` via `?.`.
+    let n = layerNorm(
+      x,
+      1,
+      dModel,
+      this.tensor(`decoder.${index}.self_attn_block.n1.weight`),
+      this.dense[`decoder.${index}.self_attn_block.n1.bias`]?.data,
+    );
+    // Work on a copy of `x` so the caller's buffer is not the one handed to addInPlace
+    // (presumably addInPlace mutates its first argument — TODO confirm).
+    x = addInPlace(
+      x.slice(),
+      await this.selfAttentionIncremental(`decoder.${index}.self_attn_block.attn`, n, layerCache),
+    );
+    n = layerNorm(
+      x,
+      1,
+      dModel,
+      this.tensor(`decoder.${index}.self_attn_block.n2.weight`),
+      this.dense[`decoder.${index}.self_attn_block.n2.bias`]?.data,
+    );
+    x = addInPlace(x, await this.mlp(`decoder.${index}.self_attn_block.mlp`, n, 1));
+    n = layerNorm(
+      x,
+      1,
+      dModel,
+      this.tensor(`decoder.${index}.cross_block.n1.weight`),
+      this.dense[`decoder.${index}.cross_block.n1.bias`]?.data,
+    );
+    // `x` is already a private copy at this point; the extra slice() just allocates another one.
+    x = addInPlace(
+      x.slice(),
+      await this.crossAttentionCached(`decoder.${index}.cross_block.cross`, n, memory, memoryLen, layerCache),
+    );
+    n = layerNorm(
+      x,
+      1,
+      dModel,
+      this.tensor(`decoder.${index}.cross_block.n2.weight`),
+      this.dense[`decoder.${index}.cross_block.n2.bias`]?.data,
+    );
+    return addInPlace(x, await this.mlp(`decoder.${index}.cross_block.mlp`, n, 1));
+  }
   async encode(encInputIds) {
     let x = embed(encInputIds, this.tensor("enc_embed.weight"), this.graph.d_model);
     for (let i = 0; i < this.graph.n_layers; i += 1) {
     const hidden = await this.decode(decInputIds, memory, encInputIds.length);
     return this.linear("lm_head").run(hidden, decInputIds.length);
   }
+  /**
+   * Build a token-by-token generation session bound to this runtime.
+   *
+   * @param {Iterable<number>|ArrayLike<number>} encInputIds - Encoder input token ids.
+   * @returns {BitNetEncoderDecoderGenerationSession} A new session; nothing is encoded yet.
+   */
+  createGenerationSession(encInputIds) {
+    const session = new BitNetEncoderDecoderGenerationSession(this, encInputIds);
+    return session;
+  }
+}
+/**
+ * Incremental decoding session: encodes the encoder input once, then produces the
+ * `lm_head` output for one decoder token per `next()` call while per-layer caches
+ * carry state between calls.
+ */
+export class BitNetEncoderDecoderGenerationSession {
+  /**
+   * @param {object} runtime - Runtime providing `graph`, `encode`, `tensor`, `dense`,
+   *   `linear` and `decoderLayerIncremental`.
+   * @param {Iterable<number>|ArrayLike<number>|null|undefined} encInputIds - Encoder token ids;
+   *   each entry is coerced with `Number`. A missing value is treated as an empty list.
+   */
+  constructor(runtime, encInputIds) {
+    this.runtime = runtime;
+    this.encInputIds = Array.from(encInputIds || [], Number);
+    this.memory = null;
+    // In-flight encode, shared by overlapping prepare() calls.
+    this.memoryPromise = null;
+    this.memoryLen = this.encInputIds.length;
+    // One independent cache object per layer (count taken from graph.n_layers).
+    this.layerCaches = Array.from({ length: runtime.graph.n_layers }, () => ({}));
+  }
+  /**
+   * Ensure the encoder output is available in `this.memory`.
+   *
+   * Overlapping calls share a single `runtime.encode` invocation instead of each
+   * starting their own. If encoding fails, the in-flight promise is cleared so a
+   * later call can retry.
+   *
+   * @returns {Promise<BitNetEncoderDecoderGenerationSession>} This session.
+   */
+  async prepare() {
+    if (!this.memory) {
+      if (!this.memoryPromise) {
+        this.memoryPromise = this.runtime.encode(this.encInputIds);
+      }
+      try {
+        this.memory = await this.memoryPromise;
+      } finally {
+        this.memoryPromise = null;
+      }
+    }
+    return this;
+  }
+  /**
+   * Feed one decoder token and return the `lm_head` output for that position.
+   *
+   * Each call passes the token through every decoder layer via
+   * `runtime.decoderLayerIncremental`, which receives that layer's cache object.
+   *
+   * @param {number} tokenId - Decoder token id; coerced with `Number`.
+   * @returns {Promise<*>} Whatever the `lm_head` linear's `run` resolves to (presumably logits).
+   */
+  async next(tokenId) {
+    await this.prepare();
+    let x = embed([Number(tokenId)], this.runtime.tensor("dec_embed.weight"), this.runtime.graph.d_model);
+    for (let i = 0; i < this.runtime.graph.n_layers; i += 1) {
+      x = await this.runtime.decoderLayerIncremental(i, x, this.memory, this.memoryLen, this.layerCaches[i]);
+    }
+    const hidden = layerNorm(
+      x,
+      1,
+      this.runtime.graph.d_model,
+      this.runtime.tensor("dec_norm.weight"),
+      this.runtime.dense["dec_norm.bias"]?.data,
+    );
+    return this.runtime.linear("lm_head").run(hidden, 1);
+  }
+}
+/**
+ * Encoder-decoder runtime whose linear layers are `BitNetLinearWASM` instances.
+ * Inherits all model logic from `BitNetEncoderDecoderWebGPU`.
+ */
+export class BitNetEncoderDecoderWASM extends BitNetEncoderDecoderWebGPU {
+  /**
+   * @param {object} manifest - Parsed model manifest.
+   * @param {string} manifestUrl - URL the manifest was loaded from.
+   * @param {object} denseTensors - Dense tensors keyed by name.
+   * @param {object} linears - Linear layers keyed by layer name.
+   */
+  constructor(manifest, manifestUrl, denseTensors, linears) {
+    // NOTE(review): the parent's first parameter is not visible here; presumably the GPU device — confirm.
+    super(null, manifest, manifestUrl, denseTensors, linears);
+  }
+  /**
+   * Load dense tensors and BitNet WASM layers described by a manifest and build a runtime.
+   *
+   * Dense tensors are fetched one at a time; layers are loaded by a small pool of
+   * workers that pull indices from a shared counter.
+   *
+   * @param {string} manifestUrl - URL of the manifest; also the base for relative tensor paths.
+   * @param {object} [options]
+   * @param {function(object): void} [options.progress] - Receives `{ phase, message, ... }` events.
+   * @param {object} [options.manifest] - Already-parsed manifest; skips the fetch when given.
+   * @param {number} [options.layerConcurrency=4] - Maximum parallel layer loads. Clamped to
+   *   `[1, layers.length]`; fractional values are floored and non-numeric values use the default.
+   * @returns {Promise<BitNetEncoderDecoderWASM>} The ready runtime.
+   */
+  static async fromManifestUrl(manifestUrl, options = {}) {
+    const progress = typeof options.progress === "function" ? options.progress : () => {};
+    progress({ phase: "manifest", message: "Loading model manifest" });
+    const manifest = options.manifest || await fetchJson(manifestUrl);
+    const baseUrl = new URL(".", manifestUrl).toString();
+    const dense = {};
+    const denseEntries = Object.entries(manifest.dense_tensors || {});
+    for (const [index, [name, entry]] of denseEntries.entries()) {
+      progress({
+        phase: "dense",
+        index: index + 1,
+        total: denseEntries.length,
+        name,
+        message: `Loading dense tensor ${index + 1}/${denseEntries.length}: ${name}`,
+      });
+      dense[name] = await fetchFloatTensor(entry, baseUrl);
+    }
+    progress({
+      phase: "dense_ready",
+      index: denseEntries.length,
+      total: denseEntries.length,
+      message: "Dense tensors ready",
+    });
+    const linears = {};
+    const layers = manifest.layers || [];
+    // A non-numeric option would become NaN, survive Math.min/Math.max, and make
+    // Array.from({ length: NaN }) start zero workers — silently loading no layers.
+    const requestedConcurrency = Number(options.layerConcurrency || 4);
+    const layerConcurrency = Math.max(
+      1,
+      Math.min(Number.isNaN(requestedConcurrency) ? 4 : Math.floor(requestedConcurrency), layers.length || 1),
+    );
+    progress({
+      phase: "prepare_layers",
+      index: 0,
+      total: layers.length,
+      message: `Preparing ${layers.length} BitNet WASM layers (${layerConcurrency} parallel)`,
+    });
+    let nextLayer = 0;
+    let completedLayers = 0;
+    // Each worker claims the next unclaimed layer index before awaiting, so no
+    // layer is loaded twice even though workers interleave at await points.
+    async function loadLayerWorker() {
+      while (nextLayer < layers.length) {
+        const index = nextLayer;
+        nextLayer += 1;
+        const layer = layers[index];
+        progress({
+          phase: "layer",
+          index: index + 1,
+          total: layers.length,
+          name: layer.name,
+          message: `Loading BitNet WASM layer ${index + 1}/${layers.length}: ${layer.name}`,
+        });
+        linears[layer.name] = await BitNetLinearWASM.fromManifestLayer(manifest, layer, manifestUrl, {
+          progress,
+          index: index + 1,
+          total: layers.length,
+          name: layer.name,
+        });
+        completedLayers += 1;
+        progress({
+          phase: "layer_ready",
+          index: completedLayers,
+          total: layers.length,
+          name: layer.name,
+          message: `BitNet WASM layer ${completedLayers}/${layers.length} ready: ${layer.name}`,
+        });
+      }
+    }
+    // With an empty layer list this starts no workers at all.
+    await Promise.all(Array.from({ length: Math.min(layerConcurrency, layers.length) }, () => loadLayerWorker()));
+    progress({ phase: "wasm_ready", message: "BitNet WASM runtime ready" });
+    return new BitNetEncoderDecoderWASM(manifest, manifestUrl, dense, linears);
+  }
 }