LJTSG
/

mamba-webgpu

+// embedding.wgsl — token embedding lookup.
+// For each token id, copy table[ids[i]] row into output.
+// One thread per output element (seq_len * hidden_dim threads total).
+struct Params {  // Uniform sizes read by main() to bound and decompose the flat output index.
+    seq_len: u32,  // Number of token positions; main() writes seq_len * hidden_dim output elements.
+    hidden_dim: u32,  // Elements per row; used as the row stride for both table and out_buf.
+    vocab_size: u32,  // Exclusive upper bound on valid token ids; ids >= vocab_size produce 0.0.
+}
+@group(0) @binding(0) var<storage, read>       ids: array<u32>;  // Token ids, indexed by token position.
+@group(0) @binding(1) var<storage, read>       table: array<f32>;  // Flat table; element (id, d) is at id * hidden_dim + d.
+@group(0) @binding(2) var<storage, read_write> out_buf: array<f32>;  // Flat output; element (t, d) is at t * hidden_dim + d.
+@group(1) @binding(0) var<uniform> params: Params;  // Sizes; note this lives in bind group 1, not group 0.
+@compute @workgroup_size(64)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    // Each invocation owns exactly one element of the flattened output,
+    // addressed by the x component of its global invocation id.
+    let width = params.hidden_dim;
+    let flat_idx = gid.x;
+    // Invocations past the last output element have nothing to write.
+    // This guard also runs before the division below, so width == 0 is safe.
+    if (flat_idx >= params.seq_len * width) {
+        return;
+    }
+    // Decompose the flat index into (token position, column within the row).
+    let row = flat_idx / width;
+    let col = flat_idx % width;
+    let tok = ids[row];
+    // Ids at or beyond vocab_size produce 0.0 and never index the table.
+    var value: f32 = 0.0;
+    if (tok < params.vocab_size) {
+        value = table[tok * width + col];
+    }
+    out_buf[flat_idx] = value;
+}