Spaces:

HOLOGRAMTECH
/

q

Running

App Files Files Community

q / qvac-2bit.mjs

Humuhumu33's picture

Upload folder using huggingface_hub

3365e13 verified about 16 hours ago

History Blame Contribute Delete

20 kB

	// qvac-2bit.mjs — NATIVE 2-bit WebGPU matmul: read 2-bit weights DIRECTLY on the GPU (no decode to Q8),
	// undo incoherence with a runtime Hadamard, accumulate in f32. This is the on-GPU realization of the E₈/
	// QuIP# 2-bit win: the per-token weight sweep drops ~4× vs Q8, so a 7B model (≈1.75 GB at 2-bit) fits
	// resident in consumer VRAM and runs at VRAM bandwidth instead of paging from disk.
	//
	// Math (one-sided incoherence, the efficient form): let R = FWHT∘diag(sign) — orthogonal AND self-inverse
	// (FWHT normalized 1/√K, sign ∈ ±1). Store Ŵ′ = quantize₂(R·Wₙ) per row (R isotropizes the row so a 2-bit
	// grid quantizes it well). At inference rotate the INPUT once, x′ = R·x, then yₙ = Σ Ŵ′[n,k]·x′[k] ≈
	// (R·Wₙ)·(R·x) = Wₙ·x — the rotation cancels for free, no output Hadamard. Cost: one length-K Hadamard
	// per matmul (O(K log K), negligible beside the O(N·K) matmul). Pure WebGPU; the codebook is a uniform
	// 4-level grid {−3,−1,1,3}·scale with a per-32 block scale (same scale layout the engine's Q8 path uses).

	const FWHT_WG = 256; // threads in the single FWHT workgroup; each thread strides over K, and the shader's 4096-float shared array caps K at 4096
	const MM_WG = 64; // threads per matmul workgroup: one workgroup per output row, partial sums reduced through a 64-entry shared array

	// ── CPU: deterministic ±1 signs (re-derivable from K; the file notes this matches e8-quant's xorshift) ──
	/**
	 * Builds the ±1 sign vector used by the incoherence rotation.
	 * The generator is a 32-bit xorshift (shifts 13 / 17 / 5) seeded with 0x9e3779b9 ^ K,
	 * so the same K always yields the same vector.
	 * @param {number} K - vector length (also mixed into the seed).
	 * @returns {Float32Array} K entries, each exactly 1 or -1.
	 */
	export function signsFor(K) {
	  const signs = new Float32Array(K);
	  let state = (0x9e3779b9 ^ K) >>> 0;
	  for (let pos = 0; pos < K; pos += 1) {
	    state ^= state << 13;
	    state ^= state >>> 17;
	    state ^= state << 5;
	    state >>>= 0; // keep the state an unsigned 32-bit value
	    signs[pos] = (state & 1) === 1 ? 1 : -1; // low bit picks the sign
	  }
	  return signs;
	}
	/**
	 * In-place fast Walsh–Hadamard transform, scaled by 1/√n so that applying it twice
	 * restores the input (the transform is its own inverse).
	 * @param {Float32Array|Float64Array|number[]} a - buffer to transform; its length must be
	 *   0, 1 or a power of two.
	 * @throws {RangeError} if the length is not a power of two. Without this check the
	 *   butterflies read past the end of the buffer and silently fill it with NaN.
	 */
	function fwht(a) {
	  const n = a.length;
	  if ((n & (n - 1)) !== 0) {
	    throw new RangeError(`fwht: length must be a power of two, got ${n}`);
	  }
	  for (let len = 1; len < n; len <<= 1) {
	    const span = len << 1;
	    for (let i = 0; i < n; i += span) {
	      for (let j = i; j < i + len; j++) {
	        const u = a[j];
	        const v = a[j + len];
	        a[j] = u + v;
	        a[j + len] = u - v;
	      }
	    }
	  }
	  const s = 1 / Math.sqrt(n);
	  for (let i = 0; i < n; i++) a[i] *= s;
	}

	// ── CPU: pack a weight matrix W [N,K] (row-major) to incoherent 2-bit + per-32 scales ──
	/**
	 * Quantizes each row of W to the 4-level grid {−3,−1,1,3}·scale with one scale per block of 32.
	 * With `incoherent` set, each row is first multiplied by the ±1 signs and Hadamard-transformed
	 * (row ← R·Wₙ); with it cleared the row is quantized as-is (naive 2-bit, for quality contrast).
	 * Codes are packed 16 per u32, code i of a word in bits [2i, 2i+1].
	 * @param {ArrayLike<number>} W - N·K weights, row-major.
	 * @param {number} N - number of rows.
	 * @param {number} K - row length; a non-negative multiple of 32, and a power of two when
	 *   `incoherent` is true (the Hadamard transform needs it).
	 * @param {{incoherent?: boolean}} [options]
	 * @returns {{qw: Uint32Array, sc: Float32Array, sign: Float32Array}} packed codes (N·K/16 words),
	 *   scales (N·K/32) and the sign vector (length K, returned in both modes).
	 * @throws {RangeError} if K violates the constraints above.
	 */
	export function pack2bit(W, N, K, { incoherent = true } = {}) {
	  if (!Number.isInteger(K) || K < 0 || K % 32 !== 0) {
	    throw new RangeError(`pack2bit: K must be a non-negative multiple of 32, got ${K}`);
	  }
	  if (incoherent && (K & (K - 1)) !== 0) {
	    throw new RangeError(`pack2bit: incoherent packing needs a power-of-two K, got ${K}`);
	  }
	  const nblk = K / 32;
	  const qw = new Uint32Array((N * K) / 16);
	  const sc = new Float32Array(N * nblk);
	  const sign = signsFor(K);
	  const row = new Float64Array(K);
	  for (let n = 0; n < N; n++) {
	    for (let k = 0; k < K; k++) row[k] = incoherent ? W[n * K + k] * sign[k] : W[n * K + k];
	    if (incoherent) fwht(row); // row ← R·Wₙ
	    // Per-block scale: the step for a 4-level uniform grid is taken as 0.5·RMS of the block
	    // (grid spans ±1.5·RMS); values beyond that clip to the outer levels.
	    for (let b = 0; b < nblk; b++) {
	      let ss = 0;
	      for (let i = 0; i < 32; i++) { const a = row[b * 32 + i]; ss += a * a; }
	      // Round to f32 BEFORE the zero guard: a step that is nonzero as a double but underflows
	      // in the Float32Array store would otherwise become 0 and divide by zero below.
	      sc[n * nblk + b] = Math.fround(0.5 * Math.sqrt(ss / 32)) || 1e-12;
	    }
	    for (let k = 0; k < K; k++) {
	      const t = row[k] / sc[n * nblk + (k >> 5)];
	      let q = Math.round((t + 3) / 2); // grid {−3,−1,1,3} ↔ codes {0,1,2,3}
	      if (q < 0) q = 0; else if (q > 3) q = 3;
	      const idx = n * K + k;
	      qw[idx >> 4] |= q << ((idx & 15) * 2);
	    }
	  }
	  return { qw, sc, sign };
	}
	// CPU reference decoder for the stored 2-bit weights: rebuilds W in the ORIGINAL basis by undoing R per row.
	/**
	 * @param {Uint32Array} qw - packed 2-bit codes, 16 per word (code i in bits [2i, 2i+1]).
	 * @param {Float32Array} sc - one scale per block of 32 weights, K/32 per row.
	 * @param {ArrayLike<number>} sign - ±1 vector of length K; only read when `incoherent` is true.
	 * @param {number} N - number of rows.
	 * @param {number} K - row length.
	 * @param {{incoherent?: boolean}} [options] - when true, each decoded row is passed through
	 *   fwht and then multiplied by `sign` (R is self-inverse).
	 * @returns {Float32Array} N·K reconstructed weights, row-major.
	 */
	export function unpack2bit(qw, sc, sign, N, K, { incoherent = true } = {}) {
	  const blocksPerRow = K / 32;
	  const decoded = new Float32Array(N * K);
	  const scratch = new Float64Array(K);
	  for (let r = 0; r < N; r += 1) {
	    const base = r * K;
	    for (let c = 0; c < K; c += 1) {
	      const flat = base + c;
	      const shift = (flat & 15) * 2;
	      const code = (qw[flat >> 4] >>> shift) & 3;
	      const scale = sc[r * blocksPerRow + (c >> 5)];
	      scratch[c] = (code * 2 - 3) * scale; // codes {0,1,2,3} ↔ levels {−3,−1,1,3}
	    }
	    if (incoherent) {
	      fwht(scratch);
	      for (let c = 0; c < K; c += 1) {
	        scratch[c] *= sign[c];
	      }
	    }
	    for (let c = 0; c < K; c += 1) {
	      decoded[base + c] = scratch[c];
	    }
	  }
	  return decoded;
	}

	// Scratch views sharing one 4-byte buffer, used to read the raw bits of a float32.
	const _f32 = new Float32Array(1), _u32 = new Uint32Array(_f32.buffer);
	/**
	 * Converts a number to IEEE-754 binary16 bits (returned as an integer in 0..0xffff).
	 * The value is first rounded to f32; the f16 mantissa is then truncated (round toward zero),
	 * which the file considers acceptable because the inputs are small positive scales.
	 * Magnitudes above the f16 range become ±Infinity, magnitudes below the smallest subnormal
	 * become ±0, and NaN stays NaN (it used to be encoded as Infinity).
	 * @param {number} val - value to convert.
	 * @returns {number} the 16 half-precision bits.
	 */
	export function f32ToF16(val) {
	  _f32[0] = val;
	  const x = _u32[0];
	  const sign = (x >>> 16) & 0x8000;
	  const rawExp = (x >>> 23) & 0xff;
	  const mant = x & 0x7fffff;
	  if (rawExp === 0xff) {
	    // f32 Inf/NaN: keep the class — a nonzero mantissa must stay nonzero in f16.
	    return sign | (mant !== 0 ? 0x7e00 : 0x7c00);
	  }
	  const exp = rawExp - 112; // re-bias 127 → 15
	  if (exp <= 0) {
	    if (exp < -10) return sign; // below the smallest f16 subnormal → signed zero
	    const m = (mant | 0x800000) >> (1 - exp); // restore the implicit 1, shift into subnormal position
	    return sign | (m >> 13);
	  }
	  if (exp >= 31) return sign | 0x7c00; // overflow → ±Infinity
	  return sign | (exp << 10) | (mant >> 13);
	}

	// CODEBOOK-AWARE LDLQ to the 2-bit SCALAR grid {−3,−1,1,3}·sc, with NO incoherence (so no
	// power-of-2 padding and no runtime Hadamard). Columns are rounded from the highest index down;
	// the rounding error of every already-rounded column j > k is fed into column k through L[j·K + k]
	// (per the file, L is the LDL factor of the input Hessian). L = null ⇒ plain scalar 2-bit rounding.
	/**
	 * @param {ArrayLike<number>} W - N·K weights, row-major.
	 * @param {number} N - number of rows.
	 * @param {number} K - row length. Codes are packed 16 per u32 and scales are indexed with a row
	 *   stride of K/32, so K should be a multiple of 32 (not checked here).
	 * @param {ArrayLike<number>|null} L - K·K feedback matrix, read at L[j·K + k] for j > k; null disables feedback.
	 * @param {ArrayLike<number>} sc - per-32 scales, K/32 per row.
	 * @param {number} [band=0] - feed back only the nearest `band` later columns (0 = all of them),
	 *   reducing the cost from O(N·K²) to O(N·K·band).
	 * @param {number} [chunk=4096] - rows processed per pass; bounds the error buffer at chunk·K floats.
	 *   Rows are independent, so the result does not depend on it. Values below 1 or non-integers are
	 *   normalized (previously 0 looped forever and a fraction produced fractional row indices).
	 * @returns {Uint32Array} packed 2-bit codes, N·K/16 words.
	 */
	export function ldlqRound2bit(W, N, K, L, sc, band = 0, chunk = 4096) {
	  const qw = new Uint32Array((N * K) / 16);
	  const nb = K / 32;
	  const CH = Math.max(1, Math.min(Math.floor(chunk) || 1, N));
	  // The error buffer is only ever read through L, so skip it entirely when there is no feedback.
	  const E = L ? new Float32Array(CH * K) : null;
	  for (let r0 = 0; r0 < N; r0 += CH) {
	    const rN = Math.min(CH, N - r0);
	    if (E) E.fill(0, 0, rN * K);
	    for (let k = K - 1; k >= 0; k--) {
	      const jm = band ? Math.min(K, k + 1 + band) : K;
	      for (let ii = 0; ii < rN; ii++) {
	        const i = r0 + ii;
	        let corr = W[i * K + k];
	        if (E) {
	          for (let j = k + 1; j < jm; j++) corr += E[ii * K + j] * L[j * K + k];
	        }
	        const s = sc[i * nb + (k >> 5)];
	        let q = Math.round(corr / s / 2 + 1.5);
	        if (q < 0) q = 0; else if (q > 3) q = 3;
	        // The stored error is original minus quantized (not corrected minus quantized).
	        if (E) E[ii * K + k] = W[i * K + k] - (q * 2 - 3) * s;
	        const idx = i * K + k;
	        qw[idx >> 4] |= q << ((idx & 15) * 2);
	      }
	    }
	  }
	  return qw;
	}

	// Smallest power of two ≥ n (1 for n ≤ 1). Doubles with `*=` rather than `<<=`: a 32-bit shift
	// wraps to a negative number and then to 0 once n exceeds 2^30, which made the loop never end.
	export const nextPow2 = (n) => { let p = 1; while (p < n) p *= 2; return p; };
	// Re-quantize an engine Q8 tensor (int8 quants + per-32 f32 scales) → incoherent 2-bit, padding the
	// input dim K to Kp = next power of 2 (the FWHT needs a pow2 length; padded weights are zeros).
	// The runtime must rotate the input by the SAME R_Kp (signsFor(Kp) + FWHT) so that
	// Ŵ′·x′ = (R·W)(R·x) = W·x.
	/**
	 * @param {ArrayBufferView} q8 - N·K int8 quants, row-major (reinterpreted as Int8 over its buffer).
	 * @param {ArrayLike<number>} s - per-32 dequantization scales, K/32 per row.
	 * @param {number} N - number of rows.
	 * @param {number} K - row length; must be a positive multiple of 32.
	 * @returns {{q: Uint8Array, s: Float32Array, Kp: number}} packed 2-bit codes as bytes (N·Kp/4),
	 *   new per-32 scales (N·Kp/32) and the padded row length.
	 * @throws {RangeError} if K is not a positive multiple of 32 (the scale stride K/32 would be
	 *   fractional and every scale lookup would miss).
	 */
	export function requant2bit(q8, s, N, K) {
	  if (!Number.isInteger(K) || K <= 0 || K % 32 !== 0) {
	    throw new RangeError(`requant2bit: K must be a positive multiple of 32, got ${K}`);
	  }
	  const Kp = nextPow2(K);
	  const nb = Kp / 32;
	  const sb = K / 32;
	  const q = new Int8Array(q8.buffer, q8.byteOffset, N * K);
	  const sign = signsFor(Kp);
	  const row = new Float64Array(Kp);
	  const qw = new Uint32Array((N * Kp) / 16);
	  const sc = new Float32Array(N * nb);
	  for (let n = 0; n < N; n++) {
	    // Dequantize the Q8 row, zero-pad to Kp, apply the signs, then rotate.
	    for (let k = 0; k < Kp; k++) row[k] = (k < K ? q[n * K + k] * s[n * sb + (k >> 5)] : 0) * sign[k];
	    fwht(row);
	    for (let b = 0; b < nb; b++) {
	      let ss = 0;
	      for (let i = 0; i < 32; i++) { const a = row[b * 32 + i]; ss += a * a; }
	      // Step = 0.5·RMS of the block. Round to f32 before the zero guard so a value that
	      // underflows in the Float32Array store cannot become a zero divisor.
	      sc[n * nb + b] = Math.fround(0.5 * Math.sqrt(ss / 32)) || 1e-12;
	    }
	    for (let k = 0; k < Kp; k++) {
	      const t = row[k] / sc[n * nb + (k >> 5)];
	      let qq = Math.round((t + 3) / 2);
	      if (qq < 0) qq = 0; else if (qq > 3) qq = 3;
	      const idx = n * Kp + k;
	      qw[idx >> 4] |= qq << ((idx & 15) * 2);
	    }
	  }
	  return { q: new Uint8Array(qw.buffer), s: sc, Kp };
	}

	// ── WGSL: input rotation x′ = FWHT(sign ⊙ x), single workgroup, shared memory ──
	// Bindings: 0 = x (input), 1 = sgn (per-element multipliers), 2 = xr (output), 3 = P (only P.x = K is read).
	// Each thread strides over the K elements to load x·sgn into the workgroup array `sh`; the workgroup
	// then runs butterfly stages for len = 1, 2, 4, … while len < K (K/2 butterflies per stage, one barrier
	// after each stage) and finally writes sh·(1/√K) to xr. `sh` holds 4096 floats, so K must not exceed 4096.
	// NOTE(review): the butterfly indexing presumably requires K to be a power of two, like the CPU fwht —
	// the shader itself does not check; confirm callers guarantee it.
	const FWHT_WGSL = `
	@group(0) @binding(0) var<storage,read> x: array<f32>;
	@group(0) @binding(1) var<storage,read> sgn: array<f32>;
	@group(0) @binding(2) var<storage,read_write> xr: array<f32>;
	@group(0) @binding(3) var<uniform> P: vec4<u32>; // K, _, _, _
	var<workgroup> sh: array<f32, 4096>; // 16 KB = the WebGPU min workgroup-storage limit; K ≤ 4096 (covers d up to 7B-class)
	@compute @workgroup_size(${FWHT_WG})
	fn main(@builtin(local_invocation_id) lid: vec3<u32>) {
	let K = P.x; let t = lid.x;
	for (var i = t; i < K; i += ${FWHT_WG}u) { sh[i] = x[i] * sgn[i]; }
	workgroupBarrier();
	var len = 1u;
	loop {
	if (len >= K) { break; }
	let half = K >> 1u;
	for (var i = t; i < half; i += ${FWHT_WG}u) {
	let blk = i / len; let j = i % len;
	let a = blk * (len << 1u) + j; let b = a + len;
	let u = sh[a]; let v = sh[b]; sh[a] = u + v; sh[b] = u - v;
	}
	workgroupBarrier();
	len = len << 1u;
	}
	let nrm = 1.0 / sqrt(f32(K));
	for (var i = t; i < K; i += ${FWHT_WG}u) { xr[i] = sh[i] * nrm; }
	}`;

	// ── WGSL: 2-bit GEMV — WORD-ORIENTED: each thread loads one u32 (16 weights), unpacks it in a loop,
	// and reads one per-32 block scale per word (16 weights at a 16-aligned base never cross a 32
	// boundary). Threads step through a row's words with a stride of the workgroup size. f32 accumulate. ──
	// Bindings: 0 = qw (packed codes), 1 = sc (scales), 2 = x (input vector), 3 = o (output), 4 = P.
	// The shader reads P.y as K and P.z as the number of scales per row; the output row index is
	// workgroup_id.x, so the host dispatches one workgroup per row (P.x is not read). Each code q is
	// dequantized as (2q − 3)·s. Per-thread partial sums are combined by a halving tree reduction in `red`,
	// and thread 0 writes o[n].
	const MM2_WGSL = `
	@group(0) @binding(0) var<storage,read> qw: array<u32>;
	@group(0) @binding(1) var<storage,read> sc: array<f32>;
	@group(0) @binding(2) var<storage,read> x: array<f32>;
	@group(0) @binding(3) var<storage,read_write> o: array<f32>;
	@group(0) @binding(4) var<uniform> P: vec4<u32>; // N, K, nblk, _
	var<workgroup> red: array<f32, ${MM_WG}>;
	@compute @workgroup_size(${MM_WG})
	fn main(@builtin(workgroup_id) wid: vec3<u32>, @builtin(local_invocation_id) lid: vec3<u32>) {
	let n = wid.x; let K = P.y; let words = K >> 4u; let rowW = n * words; let rowS = n * P.z;
	var acc = 0.0; var w = lid.x;
	loop {
	if (w >= words) { break; }
	let packed = qw[rowW + w];
	let kb = w << 4u; let s = sc[rowS + (kb >> 5u)];
	for (var j = 0u; j < 16u; j = j + 1u) { acc = acc + x[kb + j] * f32(i32((packed >> (j * 2u)) & 3u) * 2 - 3) * s; }
	w = w + ${MM_WG}u;
	}
	red[lid.x] = acc; workgroupBarrier();
	var r = ${MM_WG >> 1}u;
	loop { if (r == 0u) { break; } if (lid.x < r) { red[lid.x] = red[lid.x] + red[lid.x + r]; } workgroupBarrier(); r = r >> 1u; }
	if (lid.x == 0u) { o[n] = red[0]; }
	}`;

	// ── WGSL: Q8 GEMV (the engine's current format) — WORD-ORIENTED too, so the comparison is meant to
	// isolate the bytes-read difference rather than kernel quality. Each thread loads one u32 (4 int8). ──
	// Bindings and uniform layout mirror the 2-bit kernel: 0 = qw (int8 quants viewed as u32 words),
	// 1 = sc (per-32 scales), 2 = x, 3 = o, 4 = P with P.y = K and P.z = scales per row (P.x is not read).
	// Each byte is sign-extended by shifting it to the top of an i32 and arithmetic-shifting back by 24,
	// then multiplied by the block scale. The reduction is the same halving tree over `red`; thread 0
	// writes o[n] for the row n = workgroup_id.x.
	const MM8_WGSL = `
	@group(0) @binding(0) var<storage,read> qw: array<u32>;
	@group(0) @binding(1) var<storage,read> sc: array<f32>;
	@group(0) @binding(2) var<storage,read> x: array<f32>;
	@group(0) @binding(3) var<storage,read_write> o: array<f32>;
	@group(0) @binding(4) var<uniform> P: vec4<u32>;
	var<workgroup> red: array<f32, ${MM_WG}>;
	@compute @workgroup_size(${MM_WG})
	fn main(@builtin(workgroup_id) wid: vec3<u32>, @builtin(local_invocation_id) lid: vec3<u32>) {
	let n = wid.x; let K = P.y; let words = K >> 2u; let rowW = n * words; let rowS = n * P.z;
	var acc = 0.0; var w = lid.x;
	loop {
	if (w >= words) { break; }
	let packed = qw[rowW + w];
	let kb = w << 2u; let s = sc[rowS + (kb >> 5u)];
	for (var j = 0u; j < 4u; j = j + 1u) { let b = (packed >> (j * 8u)) & 0xffu; acc = acc + x[kb + j] * f32(i32(b << 24u) >> 24u) * s; }
	w = w + ${MM_WG}u;
	}
	red[lid.x] = acc; workgroupBarrier();
	var r = ${MM_WG >> 1}u;
	loop { if (r == 0u) { break; } if (lid.x < r) { red[lid.x] = red[lid.x] + red[lid.x + r]; } workgroupBarrier(); r = r >> 1u; }
	if (lid.x == 0u) { o[n] = red[0]; }
	}`;

	// ── GPU helpers ──
	// Buffer-usage flag table: the WebGPU global when the runtime defines it, otherwise an empty
	// object so that merely importing this module does not throw where WebGPU is unavailable.
	const U = typeof GPUBufferUsage === "undefined" ? {} : GPUBufferUsage;
	/**
	 * Compiles WGSL source and wraps it in a compute pipeline with an automatically derived layout.
	 * @param {GPUDevice} dev - device used to create the shader module and the pipeline.
	 * @param {string} code - WGSL source exposing a compute entry point named "main".
	 * @returns {GPUComputePipeline} whatever `dev.createComputePipeline` returns.
	 */
	function pipe(dev, code) {
	  const shaderModule = dev.createShaderModule({ code });
	  const descriptor = {
	    layout: "auto",
	    compute: { module: shaderModule, entryPoint: "main" },
	  };
	  return dev.createComputePipeline(descriptor);
	}
	/**
	 * Creates a storage buffer (also usable as copy source and destination) and uploads `src` into it.
	 * The buffer is at least 16 bytes even when `src` is smaller.
	 * @param {GPUDevice} dev
	 * @param {ArrayBufferView} src - data to upload at offset 0.
	 * @returns {GPUBuffer} the new buffer.
	 */
	function sbuf(dev, src) {
	  const b = dev.createBuffer({ size: Math.max(16, src.byteLength), usage: U.STORAGE | U.COPY_DST | U.COPY_SRC });
	  dev.queue.writeBuffer(b, 0, src);
	  return b;
	}
	/**
	 * Creates an uninitialized storage buffer that can also be used as a copy source (for readback).
	 * @param {GPUDevice} dev
	 * @param {number} bytes - requested size; the buffer is at least 16 bytes.
	 * @returns {GPUBuffer} the new buffer.
	 */
	function obuf(dev, bytes) {
	  return dev.createBuffer({ size: Math.max(16, bytes), usage: U.STORAGE | U.COPY_SRC });
	}
	/**
	 * Creates a 16-byte uniform buffer and uploads `arr` into it (the shaders in this file read
	 * their parameters as one vec4<u32>).
	 * @param {GPUDevice} dev
	 * @param {ArrayBufferView} arr - data written at offset 0.
	 * @returns {GPUBuffer} the new buffer.
	 */
	function ubuf(dev, arr) {
	  const b = dev.createBuffer({ size: 16, usage: U.UNIFORM | U.COPY_DST });
	  dev.queue.writeBuffer(b, 0, arr);
	  return b;
	}
	/**
	 * Reads the first `n` floats of a GPU buffer back to the CPU through a temporary staging buffer.
	 * @param {GPUDevice} dev
	 * @param {GPUBuffer} buf - source buffer; it is used as the source of a buffer-to-buffer copy.
	 * @param {number} n - number of f32 values to read.
	 * @returns {Promise<Float32Array>} a CPU-side copy of the data (not a view of the mapped range).
	 */
	async function readf32(dev, buf, n) {
	  const byteLength = n * 4;
	  const st = dev.createBuffer({ size: byteLength, usage: U.MAP_READ | U.COPY_DST });
	  try {
	    const e = dev.createCommandEncoder();
	    e.copyBufferToBuffer(buf, 0, st, 0, byteLength);
	    dev.queue.submit([e.finish()]);
	    await st.mapAsync(GPUMapMode.READ);
	    // slice(0) copies the bytes out before the mapping is released.
	    const out = new Float32Array(st.getMappedRange().slice(0));
	    st.unmap();
	    return out;
	  } finally {
	    // Release the staging buffer even when the copy or the mapping fails.
	    st.destroy();
	  }
	}

	// ── the bench: correctness (2-bit incoherent vs f32 ref vs naive-2-bit vs Q8) + perf + memory ──
	/**
	 * Builds a synthetic heavy-tailed weight matrix, runs it through the 2-bit (incoherent and naive)
	 * and Q8 GPU kernels, and reports relative error against an f32 CPU reference, per-matmul
	 * timings and memory footprints.
	 * @param {GPUDevice} dev - WebGPU device.
	 * @param {{N?: number, K?: number, iters?: number}} [options] - rows, columns (a positive multiple
	 *   of 32) and timed iterations per kernel.
	 * @returns {Promise<{N: number, K: number, iters: number, err: object, perf: object, mem: object}>}
	 *   `err.incoherent2bit`, `err.gpu_vs_cpu_2bit` and `perf.ms_fwht` are null when the incoherent
	 *   GPU path is skipped (K above 4096, or K not a power of two).
	 * @throws {RangeError} if K is not a positive multiple of 32.
	 */
	export async function runBench(dev, { N = 2048, K = 2048, iters = 200 } = {}) {
	  if (!Number.isInteger(K) || K <= 0 || K % 32 !== 0) {
	    throw new RangeError(`runBench: K must be a positive multiple of 32, got ${K}`);
	  }
	  const nblk = K / 32;
	  // The Hadamard rotation needs a power-of-two K; the single-workgroup FWHT shader covers K ≤ 4096.
	  const isPow2 = (K & (K - 1)) === 0;
	  const doInc = isPow2 && K <= 4096;
	  // Deterministic LCG + Box–Muller so every run benchmarks the same data.
	  let s = 1234567;
	  const rnd = () => (s = (s * 1664525 + 1013904223) >>> 0) / 4294967296;
	  const gauss = () => { const u = Math.max(1e-12, rnd()); return Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * rnd()); };
	  // Heavy-tailed weights: a Gaussian bulk plus sparse ×6 spikes (about 2% of entries) — the
	  // outliers that hurt naive low-bit quantization and that the Hadamard rotation spreads out.
	  const W = new Float32Array(N * K);
	  for (let i = 0; i < N * K; i++) { let w = gauss() * 0.05; if (rnd() < 0.02) w *= 6; W[i] = w; }
	  const x = new Float32Array(K);
	  for (let i = 0; i < K; i++) x[i] = gauss();
	  // f32 reference y = W·x
	  const yref = new Float32Array(N);
	  for (let n = 0; n < N; n++) { let a = 0; for (let k = 0; k < K; k++) a += W[n * K + k] * x[k]; yref[n] = a; }
	  const relErr = (y) => { let e = 0, r = 0; for (let n = 0; n < N; n++) { const d = y[n] - yref[n]; e += d * d; r += yref[n] * yref[n]; } return Math.sqrt(e / r); };
	  // Pack naive 2-bit, incoherent 2-bit (only when K allows the rotation; otherwise the naive
	  // pack stands in for the timing and memory numbers, which depend on size only) and Q8.
	  const nai = pack2bit(W, N, K, { incoherent: false });
	  const inc = isPow2 ? pack2bit(W, N, K, { incoherent: true }) : nai;
	  const q8 = new Int8Array(N * K), q8s = new Float32Array(N * nblk);
	  for (let n = 0; n < N; n++) {
	    for (let b = 0; b < nblk; b++) {
	      const base = n * K + b * 32;
	      let mx = 0;
	      for (let i = 0; i < 32; i++) { const a = Math.abs(W[base + i]); if (a > mx) mx = a; }
	      const sca = (mx / 127) || 1e-12;
	      q8s[n * nblk + b] = sca;
	      for (let i = 0; i < 32; i++) {
	        let q = Math.round(W[base + i] / sca);
	        if (q > 127) q = 127; else if (q < -127) q = -127;
	        q8[base + i] = q;
	      }
	    }
	  }
	  // GPU pipelines
	  const pF = pipe(dev, FWHT_WGSL), p2 = pipe(dev, MM2_WGSL), p8 = pipe(dev, MM8_WGSL);
	  // Every buffer is tracked so it can be destroyed when the bench finishes or fails.
	  const owned = [];
	  const keep = (b) => { owned.push(b); return b; };
	  try {
	    const xB = keep(sbuf(dev, x)), xrB = keep(obuf(dev, K * 4)), sgnB = keep(sbuf(dev, inc.sign)), Pf = keep(ubuf(dev, new Uint32Array([K, 0, 0, 0])));
	    const qwB = keep(sbuf(dev, inc.qw)), scB = keep(sbuf(dev, inc.sc)), oB = keep(obuf(dev, N * 4)), P2 = keep(ubuf(dev, new Uint32Array([N, K, nblk, 0])));
	    const naiqwB = keep(sbuf(dev, nai.qw)), naiscB = keep(sbuf(dev, nai.sc)), oNB = keep(obuf(dev, N * 4));
	    const q8B = keep(sbuf(dev, new Uint8Array(q8.buffer))), q8sB = keep(sbuf(dev, q8s)), o8B = keep(obuf(dev, N * 4));
	    // Bind groups are built once, so the timed loops for 2-bit and Q8 do identical CPU work.
	    const bind = (pl, bufs) => dev.createBindGroup({ layout: pl.getBindGroupLayout(0), entries: bufs.map((buffer, binding) => ({ binding, resource: { buffer } })) });
	    const bgRot = doInc ? bind(pF, [xB, sgnB, xrB, Pf]) : null;
	    const bgInc = bind(p2, [qwB, scB, doInc ? xrB : xB, oB, P2]);
	    const bgNai = bind(p2, [naiqwB, naiscB, xB, oNB, P2]);
	    const bg8 = bind(p8, [q8B, q8sB, xB, o8B, P2]);
	    // Encodes the given [pipeline, bindGroup, workgroupCount] steps into one compute pass and submits it.
	    const submit = (...steps) => {
	      const e = dev.createCommandEncoder();
	      const p = e.beginComputePass();
	      for (const [pl, bg, count] of steps) { p.setPipeline(pl); p.setBindGroup(0, bg); p.dispatchWorkgroups(count); }
	      p.end();
	      dev.queue.submit([e.finish()]);
	    };
	    // ── correctness ──
	    let yInc = null, gpuCpu = null;
	    if (doInc) {
	      // incoherent path: GPU rotate x→x′, then 2-bit matmul with x′
	      submit([pF, bgRot, 1], [p2, bgInc, N]);
	      yInc = await readf32(dev, oB, N);
	      // CPU re-derivation of the SAME stored 2-bit weights — independent check the GPU kernel agrees
	      const Wrec = unpack2bit(inc.qw, inc.sc, inc.sign, N, K, { incoherent: true });
	      const yCpu = new Float32Array(N);
	      for (let n = 0; n < N; n++) { let a = 0; for (let k = 0; k < K; k++) a += Wrec[n * K + k] * x[k]; yCpu[n] = a; }
	      let e2 = 0, rr = 0;
	      for (let n = 0; n < N; n++) { const d = yInc[n] - yCpu[n]; e2 += d * d; rr += yCpu[n] * yCpu[n]; }
	      gpuCpu = Math.sqrt(e2 / rr);
	    }
	    // naive path (no rotation) + Q8 path
	    submit([p2, bgNai, N]);
	    const yNai = await readf32(dev, oNB, N);
	    submit([p8, bg8, N]);
	    const yQ8 = await readf32(dev, o8B, N);
	    // ── perf — time each MATMUL alone (the rotation is measured separately) ──
	    // pre-rotate x once so the 2-bit matmul reads x′ without re-running the FWHT in the timed loop.
	    if (doInc) submit([pF, bgRot, 1]);
	    // One untimed warm-up call, then `iters` submissions bracketed by queue drains.
	    const time = async (fn) => { fn(); await dev.queue.onSubmittedWorkDone(); const t0 = performance.now(); for (let i = 0; i < iters; i++) fn(); await dev.queue.onSubmittedWorkDone(); return (performance.now() - t0) / iters; };
	    const run2mm = () => submit([p2, bgInc, N]);
	    const run8mm = () => submit([p8, bg8, N]);
	    const runF = () => submit([pF, bgRot, 1]);
	    const ms2 = await time(run2mm), ms8 = await time(run8mm), msF = doInc ? await time(runF) : null;
	    const bytes2 = inc.qw.byteLength + inc.sc.byteLength, bytes8 = q8.byteLength + q8s.byteLength;
	    const gbps = (b, ms) => (b / 1e9) / (ms / 1e3);
	    return {
	      N, K, iters,
	      err: { incoherent2bit: doInc ? relErr(yInc) : null, naive2bit: relErr(yNai), q8: relErr(yQ8), gpu_vs_cpu_2bit: gpuCpu },
	      perf: { ms_2bit_mm: +ms2.toFixed(4), ms_q8_mm: +ms8.toFixed(4), ms_fwht: msF == null ? null : +msF.toFixed(4), matmul_speedup: +(ms8 / ms2).toFixed(2), gbps_2bit: +gbps(bytes2, ms2).toFixed(0), gbps_q8: +gbps(bytes8, ms8).toFixed(0) },
	      mem: { MB_2bit: +(bytes2 / 1e6).toFixed(2), MB_q8: +(bytes8 / 1e6).toFixed(2), ratio: +(bytes8 / bytes2).toFixed(2), bits_per_weight: +(bytes2 * 8 / (N * K)).toFixed(2) },
	    };
	  } finally {
	    for (const b of owned) b.destroy();
	  }
	}