LJTSG
/

mamba-webgpu

Text Generation

browser-inference

state-space-model

first-of-its-kind

Model card Files Files and versions

mamba-webgpu / mamba_runtime.js

LJTSG's picture

Upload mamba_runtime.js with huggingface_hub

1a8f20a verified 3 days ago

history blame contribute delete

42 kB

	/**
	* mamba_runtime.js — Browser-native Falcon-Mamba inference via WebGPU.
	*
	* The first browser-native Mamba/SSM inference engine.
	* No MLC, no TVM — pure WebGPU compute shaders ported from gfx1151_runtime.
	*
	* Architecture: Falcon-Mamba 7B
	* 64 layers, each: RMSNorm → in_proj → conv1d → SSU → out_proj
	* Final: RMSNorm → lm_head → sample
	*
	* Weight format: safetensors (HF standard), loaded directly into WebGPU buffers.
	* Shaders: WGSL compute shaders in ./shaders/ (ported from Vulkan GLSL).
	*
	* Usage:
	* const mamba = new MambaRuntime();
	* await mamba.init();
	* await mamba.loadWeights('./weights/');
	* const text = await mamba.generate("Hello Grandma", 100);
	*/

	// Falcon-Mamba 7B constants
	const CONFIG = {
	hidden_size: 4096,
	intermediate_size: 8192, // 2 * hidden
	num_layers: 64,
	vocab_size: 65024,
	state_size: 16, // SSM d_state
	conv_kernel: 4,
	dt_rank: 256,
	rms_eps: 1e-5,
	};

	class MambaRuntime {
	constructor() {
	this.device = null;
	this.pipelines = {}; // shader name → GPUComputePipeline
	this.bindLayouts = {}; // shader name → GPUBindGroupLayout
	this.weights = {}; // parameter name → GPUBuffer
	this.state = {}; // per-layer SSM state + conv1d state buffers
	this.ready = false;
	}

	// ── Init: get WebGPU device + compile all shaders ──────────────────────
	async init() {
	if (!navigator.gpu) throw new Error('WebGPU not supported in this browser');
	const adapter = await navigator.gpu.requestAdapter();
	if (!adapter) throw new Error('No WebGPU adapter found');

	// Request max buffer size the device supports
	const limits = adapter.limits;
	console.log('[mamba] maxBufferSize:', limits.maxBufferSize,
	'=', (limits.maxBufferSize / 1024 / 1024 / 1024).toFixed(2), 'GB');

	this.device = await adapter.requestDevice({
	requiredLimits: {
	maxBufferSize: limits.maxBufferSize,
	maxStorageBufferBindingSize: limits.maxStorageBufferBindingSize,
	maxComputeWorkgroupStorageSize: limits.maxComputeWorkgroupStorageSize,
	maxStorageBuffersPerShaderStage: Math.min(limits.maxStorageBuffersPerShaderStage, 16),
	}
	});

	this.device.lost.then((info) => {
	console.error('[mamba] DEVICE LOST:', info.reason, info.message);
	});
	this.device.addEventListener('uncapturederror', (e) => {
	console.error('[mamba] GPU ERROR:', e.error.message);
	});

	console.log('[mamba] device ready, compiling shaders...');
	await this._compileShaders();
	console.log('[mamba] shaders compiled');
	return this;
	}

	// ── Compile all WGSL shaders into compute pipelines ────────────────────
	async _compileShaders() {
	const shaderNames = [
	'conv1d_step', 'ssu', 'matmul_gemv', 'rmsnorm', 'rmsnorm_noweight',
	'silu', 'softplus', 'embedding', 'elementwise_mul', 'sample',
	'bf16_to_f32', 'add_residual'
	];

	for (const name of shaderNames) {
	const resp = await fetch(`./shaders/${name}.wgsl`);
	if (!resp.ok) throw new Error(`Failed to load shader: ${name}.wgsl`);
	const code = await resp.text();

	const shaderModule = this.device.createShaderModule({ code, label: name });

	// Create bind group layouts based on shader requirements
	// Group 0 = storage buffers (data), Group 1 = uniforms (params)
	const pipeline = this.device.createComputePipeline({
	layout: 'auto',
	compute: { module: shaderModule, entryPoint: 'main' },
	label: name,
	});

	this.pipelines[name] = pipeline;
	}
	}

	// ── Create a GPU buffer ────────────────────────────────────────────────
	_createBuffer(size, usage, label) {
	return this.device.createBuffer({
	size: Math.max(size, 4), // WebGPU requires min 4 bytes
	usage,
	label,
	mappedAtCreation: false,
	});
	}

	// ── Upload data to a GPU buffer ────────────────────────────────────────
	_upload(buffer, data) {
	this.device.queue.writeBuffer(buffer, 0, data);
	}

	// ── Read data back from GPU buffer ─────────────────────────────────────
	async _readback(buffer, size) {
	const staging = this.device.createBuffer({
	size,
	usage: GPUBufferUsage.MAP_READ \| GPUBufferUsage.COPY_DST,
	});
	const encoder = this.device.createCommandEncoder();
	encoder.copyBufferToBuffer(buffer, 0, staging, 0, size);
	this.device.queue.submit([encoder.finish()]);
	await staging.mapAsync(GPUMapMode.READ);
	const result = new Float32Array(staging.getMappedRange().slice(0));
	staging.unmap();
	staging.destroy();
	return result;
	}

	// ── Dispatch a compute shader ──────────────────────────────────────────
	_dispatch(shaderName, bindGroup, uniformBindGroup, workgroupsX, workgroupsY = 1, workgroupsZ = 1) {
	const encoder = this.device.createCommandEncoder();
	const pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines[shaderName]);
	pass.setBindGroup(0, bindGroup);
	if (uniformBindGroup) pass.setBindGroup(1, uniformBindGroup);
	pass.dispatchWorkgroups(workgroupsX, workgroupsY, workgroupsZ);
	pass.end();
	this.device.queue.submit([encoder.finish()]);
	}

	// ── Load safetensors weights into GPU buffers ──────────────────────────
	async loadWeights(basePath) {
	console.log('[mamba] loading weights from', basePath);

	// Get the shard index
	const indexResp = await fetch(`${basePath}/model.safetensors.index.json`);
	let fileMap; // tensor_name → filename
	let files;
	if (indexResp.ok) {
	const index = await indexResp.json();
	fileMap = index.weight_map;
	files = [...new Set(Object.values(fileMap))];
	console.log(`[mamba] multi-shard: ${files.length} files, ${Object.keys(fileMap).length} tensors`);
	} else {
	files = ['model.safetensors'];
	fileMap = null;
	}

	// For each shard, fetch ONLY the header first (small), then load tensors by byte-range
	for (const file of files) {
	console.log(`[mamba] parsing ${file} header...`);

	// Fetch first 8 bytes to get header length
	const headResp = await fetch(`${basePath}/${file}`, {
	headers: { 'Range': 'bytes=0-7' }
	});
	let headerLen;
	if (headResp.status === 206) {
	// Range request supported
	const headBuf = await headResp.arrayBuffer();
	headerLen = new DataView(headBuf).getUint32(0, true);
	} else {
	// Range not supported — fall back to full fetch but only read header
	const fullBuf = await headResp.arrayBuffer();
	headerLen = new DataView(fullBuf).getUint32(0, true);
	}
	console.log(`[mamba] header: ${headerLen} bytes`);

	// Fetch header JSON
	const hdrResp = await fetch(`${basePath}/${file}`, {
	headers: { 'Range': `bytes=8-${8 + headerLen - 1}` }
	});
	let headerStr;
	if (hdrResp.status === 206) {
	headerStr = await hdrResp.text();
	} else {
	const fullBuf = await hdrResp.arrayBuffer();
	headerStr = new TextDecoder().decode(new Uint8Array(fullBuf, 8, headerLen));
	}
	const header = JSON.parse(headerStr);
	const dataOffset = 8 + headerLen;

	// Load each tensor individually
	const tensorNames = Object.keys(header).filter(n => n !== '__metadata__');
	console.log(`[mamba] ${tensorNames.length} tensors in this shard`);

	let loaded = 0;
	for (const name of tensorNames) {
	const meta = header[name];
	const dtype = meta.dtype;
	const shape = meta.shape;
	const [start, end] = meta.data_offsets;
	const byteLen = end - start;

	if (byteLen > 2_000_000_000) {
	console.log(`[mamba] SKIP ${name} (${(byteLen/1e9).toFixed(2)} GB — exceeds buffer limit)`);
	continue;
	}

	// Fetch this tensor's bytes via Range request
	const absStart = dataOffset + start;
	const absEnd = dataOffset + end - 1;
	const tResp = await fetch(`${basePath}/${file}`, {
	headers: { 'Range': `bytes=${absStart}-${absEnd}` }
	});

	let tensorBuf;
	if (tResp.status === 206) {
	tensorBuf = await tResp.arrayBuffer();
	} else {
	// No range support — need full file (expensive)
	console.log(`[mamba] WARN: no range support, loading full file for ${name}`);
	const fullBuf = await tResp.arrayBuffer();
	tensorBuf = fullBuf.slice(absStart, absStart + byteLen);
	}

	// For BF16 weights: convert to F32 during upload (no double-buffering)
	let gpuBuf;
	let finalDtype = dtype;
	let finalByteLen = byteLen;

	if (dtype === 'BF16') {
	// Convert CPU-side: BF16 → F32 before uploading
	const bf16 = new Uint16Array(tensorBuf);
	const f32 = new Float32Array(bf16.length);
	const tmpU32 = new Uint32Array(f32.buffer);
	for (let j = 0; j < bf16.length; j++) {
	tmpU32[j] = bf16[j] << 16; // BF16 is top 16 bits of F32
	}
	finalByteLen = f32.byteLength;
	gpuBuf = this._createBuffer(
	finalByteLen,
	GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST,
	name
	);
	this._upload(gpuBuf, f32);
	finalDtype = 'F32';
	} else {
	gpuBuf = this._createBuffer(
	byteLen,
	GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST,
	name
	);
	this._upload(gpuBuf, new Uint8Array(tensorBuf));
	}
	this.weights[name] = { buffer: gpuBuf, shape, dtype: finalDtype, byteLen: finalByteLen };

	loaded++;
	if (loaded % 20 === 0) {
	console.log(`[mamba] loaded ${loaded}/${tensorNames.length} tensors`);
	}
	}
	console.log(`[mamba] shard done: ${loaded} tensors loaded`);
	}

	console.log(`[mamba] TOTAL: ${Object.keys(this.weights).length} tensors loaded`);

	// Allocate per-layer state buffers
	this._allocateState();
	this.ready = true;
	}

	// ── Allocate persistent SSM state + conv1d cache per layer ─────────────
	_allocateState() {
	const H = CONFIG.intermediate_size; // 8192
	const S = CONFIG.state_size; // 16
	const K = CONFIG.conv_kernel; // 4

	for (let l = 0; l < CONFIG.num_layers; l++) {
	// SSM state: [H, S] = 8192 * 16 = 131072 floats = 512 KB per layer
	this.state[`layer.${l}.ssm`] = this._createBuffer(
	H * S * 4,
	GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST,
	`ssm_state_${l}`
	);

	// Conv1d cache: [H, K-1] = 8192 * 3 = 24576 floats = 96 KB per layer
	this.state[`layer.${l}.conv`] = this._createBuffer(
	H * (K - 1) * 4,
	GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST,
	`conv_state_${l}`
	);
	}

	// Total state: 64 layers × (512 + 96) KB = ~38 MB
	console.log(`[mamba] allocated ${CONFIG.num_layers} layers of SSM + conv1d state (~38 MB)`);
	}

	// ── Save/restore SSM state (the entity's persistent soul) ──────────────
	async saveState() {
	const state = {};
	for (const [key, buf] of Object.entries(this.state)) {
	state[key] = await this._readback(buf, buf.size);
	}
	return state;
	}

	async restoreState(state) {
	for (const [key, data] of Object.entries(state)) {
	if (this.state[key]) {
	this._upload(this.state[key], data);
	}
	}
	}

	// ── Allocate intermediate scratch buffers for forward pass ──────────────
	_allocateScratch() {
	if (this.scratch) return; // already allocated
	const H = CONFIG.hidden_size; // 4096
	const I = CONFIG.intermediate_size; // 8192
	const DR = CONFIG.dt_rank; // 256
	const S = CONFIG.state_size; // 16
	const F = 4; // sizeof(float32)

	this.scratch = {
	norm_out: this._createBuffer(H * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'norm_out'),
	projected: this._createBuffer(2 * I * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'projected'),
	hidden: this._createBuffer(I * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'hidden'),
	gate: this._createBuffer(I * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'gate'),
	hidden_c: this._createBuffer(I * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'hidden_c'),
	sxBC: this._createBuffer((DR + 2S)F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'sxBC'),
	B_proj: this._createBuffer(S * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'B_proj'),
	C_proj: this._createBuffer(S * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'C_proj'),
	dt_pre: this._createBuffer(DR * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'dt_pre'),
	dt: this._createBuffer(I * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'dt'),
	hidden_y: this._createBuffer(I * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'hidden_y'),
	gate_silu: this._createBuffer(I * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'gate_silu'),
	out_proj_o: this._createBuffer(H * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'out_proj_o'),
	logits: this._createBuffer(CONFIG.vocab_size * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'logits'),
	token_out: this._createBuffer(4, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'token_out'),
	hidden_state: this._createBuffer(H * F, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'hidden_state'),
	token_id: this._createBuffer(4, GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST, 'token_id'),
	};
	this._tokenCount = 0;
	console.log('[mamba] scratch buffers allocated');
	}

	// ── Single-token forward pass through all 64 layers ─────────────────────
	async _forwardOneToken(tokenId) {
	const H = CONFIG.hidden_size; // 4096
	const I = CONFIG.intermediate_size; // 8192
	const DR = CONFIG.dt_rank; // 256
	const S = CONFIG.state_size; // 16
	const V = CONFIG.vocab_size; // 65024

	// Step 1: Embedding lookup — copy one row from embedding table to hidden_state
	this._upload(this.scratch.token_id, new Uint32Array([tokenId]));
	const embBuf = await this._getF32Weight('backbone.embeddings.weight');
	const encoder1 = this.device.createCommandEncoder();
	encoder1.copyBufferToBuffer(embBuf, tokenId * H * 4, this.scratch.hidden_state, 0, H * 4);
	this.device.queue.submit([encoder1.finish()]);

	// Step 2: For each layer (0..63)
	for (let l = 0; l < CONFIG.num_layers; l++) {
	const prefix = `backbone.layers.${l}`;

	// rmsnorm(hidden_state, norm.weight) → norm_out
	const normW = await this._getF32Weight(`${prefix}.norm.weight`);
	let encoder = this.device.createCommandEncoder();
	let pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['rmsnorm']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['rmsnorm'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.scratch.hidden_state } },
	{ binding: 1, resource: { buffer: normW } },
	{ binding: 2, resource: { buffer: this.scratch.norm_out } },
	],
	}));
	const rmsnormParams = new ArrayBuffer(12);
	new DataView(rmsnormParams).setUint32(0, 1, true);
	new DataView(rmsnormParams).setUint32(4, H, true);
	new DataView(rmsnormParams).setFloat32(8, CONFIG.rms_eps, true);
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['rmsnorm'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(rmsnormParams)) }}],
	}));
	pass.dispatchWorkgroups(1); // one workgroup per row, 1 row
	pass.end();



	// matmul_gemv(norm_out, in_proj.weight) → projected [I*2 = 16384]
	const inProjW = await this._getF32Weight(`${prefix}.mixer.in_proj.weight`);
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['matmul_gemv']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['matmul_gemv'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.scratch.norm_out } },
	{ binding: 1, resource: { buffer: inProjW } },
	{ binding: 2, resource: { buffer: this.scratch.projected } },
	],
	}));
	const gemvParams1 = new ArrayBuffer(8);
	new DataView(gemvParams1).setUint32(0, I * 2, true); // N
	new DataView(gemvParams1).setUint32(4, H, true); // K
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['matmul_gemv'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(gemvParams1)) }}],
	}));
	pass.dispatchWorkgroups(I * 2); // one workgroup per output element
	pass.end();

	// Split projected → hidden[0:I], gate[I:2I] via buffer copies
	encoder.copyBufferToBuffer(this.scratch.projected, 0, this.scratch.hidden, 0, I * 4);
	encoder.copyBufferToBuffer(this.scratch.projected, I * 4, this.scratch.gate, 0, I * 4);



	// conv1d_step(conv_state, hidden, conv1d.weight, conv1d.bias) → hidden_c
	const conv1dW = await this._getF32Weight(`${prefix}.mixer.conv1d.weight`);
	const conv1dB = await this._getF32Weight(`${prefix}.mixer.conv1d.bias`);
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['conv1d_step']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['conv1d_step'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.state[`layer.${l}.conv`] } },
	{ binding: 1, resource: { buffer: this.scratch.hidden } },
	{ binding: 2, resource: { buffer: conv1dW } },
	{ binding: 3, resource: { buffer: conv1dB } },
	{ binding: 4, resource: { buffer: this.scratch.hidden_c } },
	],
	}));
	const conv1dParams = new ArrayBuffer(4);
	new DataView(conv1dParams).setUint32(0, I, true);
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['conv1d_step'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(conv1dParams)) }}],
	}));
	pass.dispatchWorkgroups(Math.ceil(I / 64));
	pass.end();

	// silu(hidden_c) in-place → hidden_a
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['silu']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['silu'].getBindGroupLayout(0),
	entries: [{ binding: 0, resource: { buffer: this.scratch.hidden_c } }],
	}));
	const siluParams = new ArrayBuffer(4);
	new DataView(siluParams).setUint32(0, I, true);
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['silu'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(siluParams)) }}],
	}));
	pass.dispatchWorkgroups(Math.ceil(I / 64));
	pass.end();
	// hidden_c is now silu'd (= hidden_a)



	// matmul_gemv(hidden_c, x_proj.weight) → sxBC [DR+2*S = 288]
	const xProjW = await this._getF32Weight(`${prefix}.mixer.x_proj.weight`);
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['matmul_gemv']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['matmul_gemv'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.scratch.hidden_c } },
	{ binding: 1, resource: { buffer: xProjW } },
	{ binding: 2, resource: { buffer: this.scratch.sxBC } },
	],
	}));
	const gemvParams2 = new ArrayBuffer(8);
	new DataView(gemvParams2).setUint32(0, DR + 2 * S, true);
	new DataView(gemvParams2).setUint32(4, I, true);
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['matmul_gemv'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(gemvParams2)) }}],
	}));
	pass.dispatchWorkgroups(DR + 2 * S);
	pass.end();

	// Copy dt_pre, B, C from sxBC into separate buffers
	encoder.copyBufferToBuffer(this.scratch.sxBC, 0, this.scratch.dt_pre, 0, DR * 4);
	encoder.copyBufferToBuffer(this.scratch.sxBC, DR * 4, this.scratch.B_proj, 0, S * 4);
	encoder.copyBufferToBuffer(this.scratch.sxBC, (DR + S) * 4, this.scratch.C_proj, 0, S * 4);

	// Falcon-Mamba: RMSNorm(dt_pre), RMSNorm(B), RMSNorm(C) before use
	const rmsNwParams_dt = new ArrayBuffer(8);
	new DataView(rmsNwParams_dt).setUint32(0, DR, true);
	new DataView(rmsNwParams_dt).setFloat32(4, CONFIG.rms_eps, true);
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['rmsnorm_noweight']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['rmsnorm_noweight'].getBindGroupLayout(0),
	entries: [{ binding: 0, resource: { buffer: this.scratch.dt_pre } }],
	}));
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['rmsnorm_noweight'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(rmsNwParams_dt)) }}],
	}));
	pass.dispatchWorkgroups(1);
	pass.end();

	const rmsNwParams_s = new ArrayBuffer(8);
	new DataView(rmsNwParams_s).setUint32(0, S, true);
	new DataView(rmsNwParams_s).setFloat32(4, CONFIG.rms_eps, true);
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['rmsnorm_noweight']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['rmsnorm_noweight'].getBindGroupLayout(0),
	entries: [{ binding: 0, resource: { buffer: this.scratch.B_proj } }],
	}));
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['rmsnorm_noweight'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(rmsNwParams_s)) }}],
	}));
	pass.dispatchWorkgroups(1);
	pass.end();

	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['rmsnorm_noweight']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['rmsnorm_noweight'].getBindGroupLayout(0),
	entries: [{ binding: 0, resource: { buffer: this.scratch.C_proj } }],
	}));
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['rmsnorm_noweight'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(rmsNwParams_s)) }}],
	}));
	pass.dispatchWorkgroups(1);
	pass.end();

	// matmul_gemv(dt_pre_normalized, dt_proj.weight) → dt [I]
	const dtProjW = await this._getF32Weight(`${prefix}.mixer.dt_proj.weight`);
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['matmul_gemv']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['matmul_gemv'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.scratch.dt_pre } },
	{ binding: 1, resource: { buffer: dtProjW } },
	{ binding: 2, resource: { buffer: this.scratch.dt } },
	],
	}));
	const gemvParams3 = new ArrayBuffer(8);
	new DataView(gemvParams3).setUint32(0, I, true);
	new DataView(gemvParams3).setUint32(4, DR, true);
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['matmul_gemv'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(gemvParams3)) }}],
	}));
	pass.dispatchWorkgroups(I);
	pass.end();

	// SSU: selective_state_update
	// ssu(state, hidden_c, dt, A, B, C, D, dt_bias) → hidden_y
	const aLog = await this._getF32Weight(`${prefix}.mixer.A_log`);
	const dWeight = await this._getF32Weight(`${prefix}.mixer.D`);
	const dtBias = await this._getF32Weight(`${prefix}.mixer.dt_proj.bias`);
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['ssu']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['ssu'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.state[`layer.${l}.ssm`] } },
	{ binding: 1, resource: { buffer: this.scratch.hidden_c } }, // x (silu'd)
	{ binding: 2, resource: { buffer: this.scratch.dt } },
	{ binding: 3, resource: { buffer: aLog } }, // A (needs -exp transform)
	{ binding: 4, resource: { buffer: this.scratch.B_proj } }, // B
	{ binding: 5, resource: { buffer: this.scratch.C_proj } }, // C
	{ binding: 6, resource: { buffer: dWeight } },
	{ binding: 7, resource: { buffer: dtBias } },
	{ binding: 8, resource: { buffer: this.scratch.hidden_y } },
	],
	}));
	const ssuParams = new ArrayBuffer(8);
	new DataView(ssuParams).setUint32(0, I, true); // H
	new DataView(ssuParams).setUint32(4, S, true); // S
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['ssu'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(ssuParams)) }}],
	}));
	pass.dispatchWorkgroups(I); // one workgroup per h
	pass.end();



	// silu(gate) in-place
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['silu']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['silu'].getBindGroupLayout(0),
	entries: [{ binding: 0, resource: { buffer: this.scratch.gate } }],
	}));
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['silu'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(siluParams)) }}],
	}));
	pass.dispatchWorkgroups(Math.ceil(I / 64));
	pass.end();

	// elementwise_mul: hidden_y *= gate (in-place into hidden_y)
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['elementwise_mul']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['elementwise_mul'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.scratch.hidden_y } },
	{ binding: 1, resource: { buffer: this.scratch.gate } },
	],
	}));
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['elementwise_mul'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(siluParams)) }}],
	}));
	pass.dispatchWorkgroups(Math.ceil(I / 64));
	pass.end();

	// matmul_gemv(hidden_y, out_proj.weight) → out_proj_o [H]
	const outProjW = await this._getF32Weight(`${prefix}.mixer.out_proj.weight`);
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['matmul_gemv']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['matmul_gemv'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.scratch.hidden_y } },
	{ binding: 1, resource: { buffer: outProjW } },
	{ binding: 2, resource: { buffer: this.scratch.out_proj_o } },
	],
	}));
	const gemvParams4 = new ArrayBuffer(8);
	new DataView(gemvParams4).setUint32(0, H, true);
	new DataView(gemvParams4).setUint32(4, I, true);
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['matmul_gemv'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(gemvParams4)) }}],
	}));
	pass.dispatchWorkgroups(H);
	pass.end();

	// Submit this layer's command buffer
	this.device.queue.submit([encoder.finish()]);

	// Debug: readback hidden_state after residual for select layers
	// Residual add: hidden_state += out_proj_o
	{
	const enc2 = this.device.createCommandEncoder();
	const addPass = enc2.beginComputePass();
	addPass.setPipeline(this.pipelines['add_residual']);
	addPass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['add_residual'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.scratch.hidden_state } },
	{ binding: 1, resource: { buffer: this.scratch.out_proj_o } },
	],
	}));
	const addParams = new ArrayBuffer(4);
	new DataView(addParams).setUint32(0, H, true);
	addPass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['add_residual'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(addParams)) }}],
	}));
	addPass.dispatchWorkgroups(Math.ceil(H / 64));
	addPass.end();
	this.device.queue.submit([enc2.finish()]);
	}


	}

	// Final: rmsnorm + lm_head + sample
	await this.device.queue.onSubmittedWorkDone();

	// rmsnorm(hidden_state, backbone.norm_f.weight) → norm_out
	const normFW = await this._getF32Weight('backbone.norm_f.weight');
	let encoder = this.device.createCommandEncoder();
	let pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['rmsnorm']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['rmsnorm'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.scratch.hidden_state } },
	{ binding: 1, resource: { buffer: normFW } },
	{ binding: 2, resource: { buffer: this.scratch.norm_out } },
	],
	}));
	const finalNormParams = new ArrayBuffer(12);
	new DataView(finalNormParams).setUint32(0, 1, true);
	new DataView(finalNormParams).setUint32(4, H, true);
	new DataView(finalNormParams).setFloat32(8, CONFIG.rms_eps, true);
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['rmsnorm'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(finalNormParams)) }}],
	}));
	pass.dispatchWorkgroups(1);
	pass.end();

	// matmul_gemv(norm_out, lm_head.weight) → logits [V]
	const lmHeadW = await this._getF32Weight('lm_head.weight');
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['matmul_gemv']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['matmul_gemv'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.scratch.norm_out } },
	{ binding: 1, resource: { buffer: lmHeadW } },
	{ binding: 2, resource: { buffer: this.scratch.logits } },
	],
	}));
	const gemvFinal = new ArrayBuffer(8);
	new DataView(gemvFinal).setUint32(0, V, true);
	new DataView(gemvFinal).setUint32(4, H, true);
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['matmul_gemv'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(gemvFinal)) }}],
	}));
	pass.dispatchWorkgroups(V);
	pass.end();

	// sample(logits, temperature) → token_out
	pass = encoder.beginComputePass();
	pass.setPipeline(this.pipelines['sample']);
	pass.setBindGroup(0, this.device.createBindGroup({
	layout: this.pipelines['sample'].getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: this.scratch.logits } },
	{ binding: 1, resource: { buffer: this.scratch.token_out } },
	],
	}));
	const sampleParams = new ArrayBuffer(12);
	new DataView(sampleParams).setUint32(0, V, true);
	new DataView(sampleParams).setFloat32(4, 1.0 / 0.75, true); // inv_temperature
	new DataView(sampleParams).setUint32(8, Math.floor(Math.random() * 0xFFFFFFFF), true); // rng_seed
	pass.setBindGroup(1, this.device.createBindGroup({
	layout: this.pipelines['sample'].getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: this._createUniform(new Uint8Array(sampleParams)) }}],
	}));
	pass.dispatchWorkgroups(1);
	pass.end();

	this.device.queue.submit([encoder.finish()]);
	await this.device.queue.onSubmittedWorkDone();

	// Read back the sampled token
	const tokenResult = await this._readback(this.scratch.token_out, 4);
	this._tokenCount++;
	return new Uint32Array(tokenResult.buffer)[0];
	}

	// ── Tokenize/detokenize via server ──────────────────────────────────────
	async tokenize(text, system = '') {
	const resp = await fetch('/tokenize', {
	method: 'POST',
	headers: { 'Content-Type': 'application/json' },
	body: JSON.stringify({ text, system }),
	});
	const data = await resp.json();
	return data.result;
	}

	async detokenize(tokens) {
	const resp = await fetch('/detokenize', {
	method: 'POST',
	headers: { 'Content-Type': 'application/json' },
	body: JSON.stringify({ tokens }),
	});
	const data = await resp.json();
	return data.result;
	}

	// ── Generate text ──────────────────────────────────────────────────────
	async generate(prompt, maxTokens = 100, temperature = 0.75, onToken = null, system = '') {
	if (!this.ready) throw new Error('Call loadWeights() first');
	this._allocateScratch();

	console.log('[mamba] generate:', prompt, 'max_tokens:', maxTokens);

	// Tokenize the prompt (with optional system prompt)
	const promptTokens = await this.tokenize(prompt, system);
	console.log(`[mamba] prompt tokens (${promptTokens.length}):`, promptTokens);

	// Process prompt tokens through forward pass to build SSM state
	console.log('[mamba] encoding prompt...');
	for (let i = 0; i < promptTokens.length; i++) {
	const t0 = performance.now();
	await this._forwardOneToken(promptTokens[i]);
	const elapsed = performance.now() - t0;
	if (i === 0 \|\| i === promptTokens.length - 1) {
	console.log(`[mamba] prompt token ${i}/${promptTokens.length}: ${promptTokens[i]} (${elapsed.toFixed(0)}ms)`);
	}
	}
	console.log('[mamba] prompt encoded, generating...');

	// Get the last prompt token's output as first generation input
	const generated = [];
	// The last _forwardOneToken already produced the next-token prediction
	// We need to read it back
	const firstResult = await this._readback(this.scratch.token_out, 4);
	let inputToken = new Uint32Array(firstResult.buffer)[0];
	generated.push(inputToken);
	console.log(`[mamba] first generated token: ${inputToken}`);
	if (onToken) onToken(inputToken, 0);

	for (let step = 1; step < maxTokens; step++) {
	const t0 = performance.now();
	try {
	const nextToken = await this._forwardOneToken(inputToken);
	const elapsed = performance.now() - t0;
	if (step < 5 \|\| step % 20 === 0) {
	console.log(`[mamba] step ${step}: token=${nextToken} (${elapsed.toFixed(0)}ms)`);
	}
	generated.push(nextToken);
	inputToken = nextToken;
	if (onToken) onToken(nextToken, step);
	if (nextToken === 11 \|\| nextToken === 10 \|\| nextToken === 0) break; // EOS=11, im_end=10, PAD=0
	} catch (e) {
	console.error(`[mamba] step ${step} failed:`, e.message);
	break;
	}
	}

	// Decode the generated tokens
	const text = await this.detokenize(generated);
	console.log(`[mamba] generated ${generated.length} tokens`);
	return text;
	}

	// ── Helper: get weight buffer by name ──────────────────────────────────
	_getWeight(name) {
	return this.weights[name] \|\| null;
	}

	// ── Create a uniform buffer with typed data ────────────────────────────
	_createUniform(data) {
	const buf = this.device.createBuffer({
	size: data.byteLength,
	usage: GPUBufferUsage.UNIFORM \| GPUBufferUsage.COPY_DST,
	mappedAtCreation: true,
	});
	new Uint8Array(buf.getMappedRange()).set(new Uint8Array(data.buffer));
	buf.unmap();
	return buf;
	}

	// ── BF16 → F32 conversion for a weight tensor ─────────────────────────
	async _convertBF16toF32(weightInfo) {
	if (weightInfo.dtype !== 'BF16' \|\| weightInfo.f32buffer) return weightInfo;

	const numBF16 = weightInfo.byteLen / 2; // each bf16 is 2 bytes
	const numPairs = weightInfo.byteLen / 4; // each u32 holds 2 bf16
	const f32Bytes = numBF16 * 4;

	// Create output F32 buffer
	const f32Buf = this._createBuffer(
	f32Bytes,
	GPUBufferUsage.STORAGE \| GPUBufferUsage.COPY_SRC \| GPUBufferUsage.COPY_DST,
	weightInfo.buffer.label + '_f32'
	);

	// Create uniform for params
	const paramBuf = this._createUniform(new Uint32Array([numPairs]));

	// Create bind groups
	const pipeline = this.pipelines['bf16_to_f32'];
	const bg0 = this.device.createBindGroup({
	layout: pipeline.getBindGroupLayout(0),
	entries: [
	{ binding: 0, resource: { buffer: weightInfo.buffer } },
	{ binding: 1, resource: { buffer: f32Buf } },
	],
	});
	const bg1 = this.device.createBindGroup({
	layout: pipeline.getBindGroupLayout(1),
	entries: [
	{ binding: 0, resource: { buffer: paramBuf } },
	],
	});

	// Dispatch
	const encoder = this.device.createCommandEncoder();
	const pass = encoder.beginComputePass();
	pass.setPipeline(pipeline);
	pass.setBindGroup(0, bg0);
	pass.setBindGroup(1, bg1);
	pass.dispatchWorkgroups(Math.ceil(numPairs / 64));
	pass.end();
	this.device.queue.submit([encoder.finish()]);
	await this.device.queue.onSubmittedWorkDone();

	// Cache the F32 buffer
	weightInfo.f32buffer = f32Buf;
	weightInfo.f32size = f32Bytes;
	paramBuf.destroy();
	return weightInfo;
	}

	// ── Get F32 weight buffer (already converted during load) ───────────────
	async _getF32Weight(name) {
	const w = this.weights[name];
	if (!w) throw new Error(`Missing weight: ${name}`);
	return w.buffer;
	}

	// ── Dispatch a shader with auto bind group creation ─────────────────────
	_dispatchShader(encoder, shaderName, storageBuffers, uniformData) {
	const pipeline = this.pipelines[shaderName];
	const pass = encoder.beginComputePass();
	pass.setPipeline(pipeline);

	// Bind group 0: storage buffers
	const entries0 = storageBuffers.map((buf, i) => ({
	binding: i, resource: { buffer: buf }
	}));
	const bg0 = this.device.createBindGroup({
	layout: pipeline.getBindGroupLayout(0),
	entries: entries0,
	});
	pass.setBindGroup(0, bg0);

	// Bind group 1: uniforms (if provided)
	if (uniformData) {
	const ubuf = this._createUniform(uniformData);
	const bg1 = this.device.createBindGroup({
	layout: pipeline.getBindGroupLayout(1),
	entries: [{ binding: 0, resource: { buffer: ubuf } }],
	});
	pass.setBindGroup(1, bg1);
	// Note: ubuf leaks — for production, cache these. Fine for proof-of-concept.
	}

	return pass; // caller sets dispatch count and calls pass.end()
	}

	// ── Cleanup ────────────────────────────────────────────────────────────
	destroy() {
	for (const w of Object.values(this.weights)) w.buffer.destroy();
	for (const s of Object.values(this.state)) s.destroy();
	this.weights = {};
	this.state = {};
	this.ready = false;
	}
	}

	// ES module export for browser <script type="module">
	export { MambaRuntime, CONFIG };