LJTSG
/

mamba-webgpu

+// ssu.wgsl — selective_state_update (SSM scan, single decode step, batch=1).
+// Ported from gfx1151_inference/shaders/ssu.comp (Vulkan GLSL → WebGPU WGSL).
+//
+// Math (per (h, s)), where A[h, s] = -exp(A_buf[h, s]) as computed in main():
+//   delta_h = softplus(dt[h] + dt_bias[h])
+//   state[h, s] = state[h, s] * exp(delta_h * A[h, s]) + (delta_h * B[s]) * x[h]
+// Then per h:
+//   y[h] = sum_s( state[h, s] * C[s] ) + D[h] * x[h]
+//
+// Falcon-Mamba 7B: hidden_dim=4096, ssm_state_size=16.
+// Dispatch: one workgroup per h. Workgroup size = ssm_state_size = 16 threads.
+//
+// State buffer is updated in-place (read+write).
+const WG_SIZE: u32 = 16u;  // invocations per workgroup (used by @workgroup_size); partial_y must hold this many elements
+// Problem dimensions, read from the uniform binding `params`.
+struct Params {
+    // hidden_dim: number of hidden channels; main() maps one workgroup to one channel.
+    H: u32,
+    // ssm_state_size: number of state entries per hidden channel.
+    S: u32,
+}
+@group(0) @binding(0) var<storage, read_write> state_buf: array<f32>;  // [H, S], indexed h * S + s; SSM state, read and overwritten in place
+@group(0) @binding(1) var<storage, read>       x_buf: array<f32>;      // [H] input x, one value per hidden channel
+@group(0) @binding(2) var<storage, read>       dt_buf: array<f32>;     // [H] raw dt, before bias and softplus
+@group(0) @binding(3) var<storage, read>       A_buf: array<f32>;      // [H, S], indexed h * S + s; main() uses -exp(A_buf[...]) as A
+@group(0) @binding(4) var<storage, read>       B_buf: array<f32>;      // [S] input projection, indexed by state entry s
+@group(0) @binding(5) var<storage, read>       C_buf: array<f32>;      // [S] output projection, indexed by state entry s
+@group(0) @binding(6) var<storage, read>       D_buf: array<f32>;      // [H] skip weight multiplied with x[h] in the output
+@group(0) @binding(7) var<storage, read>       dt_bias_buf: array<f32>; // [H] bias added to dt before softplus
+@group(0) @binding(8) var<storage, read_write> y_buf: array<f32>;      // [H] output y, written by invocation 0 of each workgroup
+@group(1) @binding(0) var<uniform> params: Params;  // H and S dimensions; lives in bind group 1, separate from the buffers
+// Per-workgroup scratch for the y reduction: one partial sum per invocation.
+// Sized from WG_SIZE so it always matches @workgroup_size.
+var<workgroup> partial_y: array<f32, WG_SIZE>;
+// Numerically stable softplus(x) = log(1 + exp(x)).
+// Rewritten as max(x, 0) + log(1 + exp(-|x|)) so exp() only ever receives a
+// non-positive argument.
+fn stable_softplus(x: f32) -> f32 {
+    let positive_part = max(x, 0.0);
+    let decay = exp(-abs(x));
+    return positive_part + log(1.0 + decay);
+}
+// Entry point: one selective-state-update decode step for hidden channel h.
+//
+// Dispatch one workgroup per hidden channel (workgroup_id.x = h). The WG_SIZE
+// invocations of a workgroup stride over the params.S state entries of that
+// channel (s = lane, lane + WG_SIZE, ...), so params.S may exceed WG_SIZE.
+// When params.S <= WG_SIZE each invocation handles at most one entry.
+//
+// Per state entry:
+//   A      = -exp(A_buf[h, s])
+//   state' = state * exp(delta_h * A) + (delta_h * B[s]) * x[h]
+// Per channel:
+//   y[h]   = sum_s(state'[h, s] * C[s]) + D[h] * x[h]
+// state_buf is overwritten in place with state'.
+@compute @workgroup_size(WG_SIZE)
+fn main(
+    @builtin(workgroup_id) wg_id: vec3<u32>,
+    @builtin(local_invocation_id) lid: vec3<u32>
+) {
+    let h = wg_id.x;
+    let lane = lid.x;
+
+    // Every invocation must reach the barriers below, so out-of-range work is
+    // skipped by guarding the computation instead of returning early.
+    var my_partial: f32 = 0.0;
+    var D_h: f32 = 0.0;
+    var x_h: f32 = 0.0;
+    if (h < params.H) {
+        // Per-channel values do not depend on s: load and compute them once.
+        let delta_h = stable_softplus(dt_buf[h] + dt_bias_buf[h]);
+        x_h = x_buf[h];
+        D_h = D_buf[h];
+        let row_base = h * params.S;
+
+        // Workgroup-strided loop so state entries beyond WG_SIZE are not dropped.
+        for (var s: u32 = lane; s < params.S; s = s + WG_SIZE) {
+            let state_idx = row_base + s;
+            let A_hs = -exp(A_buf[state_idx]);
+            let delta_A = exp(delta_h * A_hs);
+            let delta_B = delta_h * B_buf[s];
+            let new_state = state_buf[state_idx] * delta_A + delta_B * x_h;
+            state_buf[state_idx] = new_state;
+            my_partial = my_partial + new_state * C_buf[s];
+        }
+    }
+    partial_y[lane] = my_partial;
+    workgroupBarrier();
+
+    // Tree reduction of partial_y into partial_y[0]. The loop bounds depend only
+    // on WG_SIZE, so every invocation executes each barrier.
+    for (var off: u32 = WG_SIZE / 2u; off > 0u; off = off >> 1u) {
+        if (lane < off) {
+            partial_y[lane] = partial_y[lane] + partial_y[lane + off];
+        }
+        workgroupBarrier();
+    }
+
+    // Invocation 0 adds the skip term and writes the channel output.
+    if (lane == 0u && h < params.H) {
+        y_buf[h] = partial_y[0u] + D_h * x_h;
+    }
+}