Spaces:
Running
Running
| // qvac-2bit.mjs β NATIVE 2-bit WebGPU matmul: read 2-bit weights DIRECTLY on the GPU (no decode to Q8), | |
| // undo incoherence with a runtime Hadamard, accumulate in f32. This is the on-GPU realization of the Eβ/ | |
| // QuIP# 2-bit win: the per-token weight sweep drops ~4Γ vs Q8, so a 7B model (β1.75 GB at 2-bit) fits | |
| // resident in consumer VRAM and runs at VRAM bandwidth instead of paging from disk. | |
| // | |
| // Math (one-sided incoherence, the efficient form): let R = FWHTβdiag(sign) β orthogonal AND self-inverse | |
| // (FWHT normalized 1/βK, sign β Β±1). Store Ε΄β² = quantizeβ(RΒ·Wβ) per row (R isotropizes the row so a 2-bit | |
| // grid quantizes it well). At inference rotate the INPUT once, xβ² = RΒ·x, then yβ = Ξ£ Ε΄β²[n,k]Β·xβ²[k] β | |
| // (RΒ·Wβ)Β·(RΒ·x) = WβΒ·x β the rotation cancels for free, no output Hadamard. Cost: one length-K Hadamard | |
| // per matmul (O(K log K), negligible beside the O(NΒ·K) matmul). Pure WebGPU; the codebook is a uniform | |
| // 4-level grid {β3,β1,1,3}Β·scale with a per-32 block scale (same scale layout the engine's Q8 path uses). | |
| const FWHT_WG = 256; // single-workgroup input rotation (K β€ 2048) | |
| const MM_WG = 64; // one workgroup per output row, 64-thread reduce | |
| // ββ CPU: deterministic Β±1 signs (re-derivable from K, matches e8-quant's xorshift) ββ | |
| export function signsFor(K) { | |
| const s = new Float32Array(K); let x = (0x9e3779b9 ^ K) >>> 0; | |
| for (let i = 0; i < K; i++) { x ^= x << 13; x ^= x >>> 17; x ^= x << 5; x >>>= 0; s[i] = (x & 1) ? 1 : -1; } | |
| return s; | |
| } | |
| function fwht(a) { // in place, normalized (self-inverse) | |
| const n = a.length; | |
| for (let len = 1; len < n; len <<= 1) for (let i = 0; i < n; i += len << 1) for (let j = i; j < i + len; j++) { const u = a[j], v = a[j + len]; a[j] = u + v; a[j + len] = u - v; } | |
| const s = 1 / Math.sqrt(n); for (let i = 0; i < n; i++) a[i] *= s; | |
| } | |
| // ββ CPU: pack a weight matrix W [N,K] (row-major) to incoherent 2-bit + per-32 scales ββ | |
| // Returns { qw:Uint32Array(N*K/16), sc:Float32Array(N*K/32), sign:Float32Array(K) }. | |
| // incoherent=false skips the rotation (naive 2-bit) β for the quality contrast only. | |
| export function pack2bit(W, N, K, { incoherent = true } = {}) { | |
| const nblk = K / 32, qw = new Uint32Array((N * K) / 16), sc = new Float32Array(N * nblk), sign = signsFor(K); | |
| const row = new Float64Array(K); | |
| for (let n = 0; n < N; n++) { | |
| for (let k = 0; k < K; k++) row[k] = incoherent ? W[n * K + k] * sign[k] : W[n * K + k]; | |
| if (incoherent) fwht(row); // row β RΒ·Wβ | |
| // per-block scale: MSE-optimal step for a 4-level uniform grid {β3,β1,1,3}Β·s on ~Gaussian data is | |
| // s β 0.5Β·Ο (grid spans Β±1.5Ο). Outliers beyond Β±1.5Ο clip β which is exactly what incoherence | |
| // removes (the Hadamard Gaussianizes the row), so naive-2-bit clips heavy tails and incoherent does not. | |
| for (let b = 0; b < nblk; b++) { let ss = 0; for (let i = 0; i < 32; i++) { const a = row[b * 32 + i]; ss += a * a; } sc[n * nblk + b] = (0.5 * Math.sqrt(ss / 32)) || 1e-12; } | |
| for (let k = 0; k < K; k++) { | |
| const t = row[k] / sc[n * nblk + (k >> 5)]; | |
| let q = Math.round((t + 3) / 2); if (q < 0) q = 0; else if (q > 3) q = 3; // grid {β3,β1,1,3} | |
| const idx = n * K + k; qw[idx >> 4] |= q << ((idx & 15) * 2); | |
| } | |
| } | |
| return { qw, sc, sign }; | |
| } | |
| // reconstruct (CPU reference for the stored 2-bit weights, in the ORIGINAL basis): undo R on each row | |
| export function unpack2bit(qw, sc, sign, N, K, { incoherent = true } = {}) { | |
| const nblk = K / 32, W = new Float32Array(N * K), row = new Float64Array(K); | |
| for (let n = 0; n < N; n++) { | |
| for (let k = 0; k < K; k++) { const idx = n * K + k; const q = (qw[idx >> 4] >>> ((idx & 15) * 2)) & 3; row[k] = (q * 2 - 3) * sc[n * nblk + (k >> 5)]; } | |
| if (incoherent) { fwht(row); for (let k = 0; k < K; k++) row[k] *= sign[k]; } // R is self-inverse | |
| for (let k = 0; k < K; k++) W[n * K + k] = row[k]; | |
| } | |
| return W; | |
| } | |
| // f32 β f16 bits (Uint16). Scales are small positives; round-toward-zero of the mantissa is fine. | |
| export function f32ToF16(val) { | |
| _f32[0] = val; const x = _u32[0]; | |
| const sign = (x >>> 16) & 0x8000; let exp = ((x >>> 23) & 0xff) - 112; const mant = x & 0x7fffff; | |
| if (exp <= 0) { if (exp < -10) return sign; const m = (mant | 0x800000) >> (1 - exp); return sign | (m >> 13); } | |
| if (exp >= 31) return sign | 0x7c00; | |
| return sign | (exp << 10) | (mant >> 13); | |
| } | |
| const _f32 = new Float32Array(1), _u32 = new Uint32Array(_f32.buffer); | |
| // CODEBOOK-AWARE LDLQ to the 2-bit SCALAR grid {β3,β1,1,3}Β·sc β the engine's native 2-bit codebook, with | |
| // NO incoherence (so no power-of-2 padding, no runtime Hadamard). Rounds input columns highβlow feeding | |
| // each future column's error back through L (the LDL factor of the input Hessian); L=null β plain scalar | |
| // 2-bit (the fallback when no calibration Hessian of the right dim exists, e.g. the FFN down-proj). Returns | |
| // the packed 2-bit indices (16 weights/u32, no padding β K must be a multiple of 16). sc = per-32 scales. | |
| // `band` caps the feedback to the nearest `band` future columns (0 = full). The LDL factor's off-diagonal | |
| // mass concentrates near the diagonal, so a band recovers most of the gain at O(NΒ·KΒ·band) instead of | |
| // O(NΒ·KΒ²) β the difference between a feasible and an infeasible 7B compile in single-thread JS. | |
| export function ldlqRound2bit(W, N, K, L, sc, band = 0, chunk = 4096) { | |
| const qw = new Uint32Array((N * K) / 16), nb = K / 32, CH = Math.min(chunk, N), E = new Float32Array(CH * K); | |
| for (let r0 = 0; r0 < N; r0 += CH) { // rows are independent in LDLQ β chunk them (E is CHΒ·K, not NΒ·K) | |
| const rN = Math.min(CH, N - r0); E.fill(0, 0, rN * K); | |
| for (let k = K - 1; k >= 0; k--) for (let ii = 0; ii < rN; ii++) { | |
| const i = r0 + ii; | |
| let corr = W[i * K + k]; if (L) { const jm = band ? Math.min(K, k + 1 + band) : K; for (let j = k + 1; j < jm; j++) corr += E[ii * K + j] * L[j * K + k]; } | |
| const s = sc[i * nb + (k >> 5)]; let q = Math.round(corr / s / 2 + 1.5); if (q < 0) q = 0; else if (q > 3) q = 3; | |
| E[ii * K + k] = W[i * K + k] - (q * 2 - 3) * s; | |
| const idx = i * K + k; qw[idx >> 4] |= q << ((idx & 15) * 2); | |
| } | |
| } | |
| return qw; | |
| } | |
| export const nextPow2 = (n) => { let p = 1; while (p < n) p <<= 1; return p; }; | |
| // re-quantize an engine Q8 tensor (int8 quants + per-32 f32 scales) β incoherent 2-bit, padding the input | |
| // dim K to Kp = next power of 2 (the FWHT needs a pow2 length; padded weights/inputs are zeros β exact). | |
| // Returns { q: packed 2-bit bytes [N*Kp/4], s: f32 scales [N*Kp/32], Kp }. The runtime rotates the input by | |
| // the SAME R_Kp (signsFor(Kp)+FWHT), so Ε΄β²Β·xβ² = (RΒ·W)(RΒ·x) = WΒ·x. | |
| export function requant2bit(q8, s, N, K) { | |
| const Kp = nextPow2(K), nb = Kp / 32, sb = K / 32; | |
| const q = new Int8Array(q8.buffer, q8.byteOffset, N * K); | |
| const sign = signsFor(Kp), row = new Float64Array(Kp); | |
| const qw = new Uint32Array((N * Kp) / 16), sc = new Float32Array(N * nb); | |
| for (let n = 0; n < N; n++) { | |
| for (let k = 0; k < Kp; k++) row[k] = (k < K ? q[n * K + k] * s[n * sb + (k >> 5)] : 0) * sign[k]; | |
| fwht(row); | |
| for (let b = 0; b < nb; b++) { let ss = 0; for (let i = 0; i < 32; i++) { const a = row[b * 32 + i]; ss += a * a; } sc[n * nb + b] = (0.5 * Math.sqrt(ss / 32)) || 1e-12; } // MSE-optimal step for the {β3,β1,1,3} grid on Gaussianised (incoherent) weights | |
| for (let k = 0; k < Kp; k++) { const t = row[k] / sc[n * nb + (k >> 5)]; let qq = Math.round((t + 3) / 2); if (qq < 0) qq = 0; else if (qq > 3) qq = 3; const idx = n * Kp + k; qw[idx >> 4] |= qq << ((idx & 15) * 2); } | |
| } | |
| return { q: new Uint8Array(qw.buffer), s: sc, Kp }; | |
| } | |
| // ββ WGSL: input rotation xβ² = FWHT(sign β x), single workgroup, shared memory ββ | |
| const FWHT_WGSL = ` | |
| @group(0) @binding(0) var<storage,read> x: array<f32>; | |
| @group(0) @binding(1) var<storage,read> sgn: array<f32>; | |
| @group(0) @binding(2) var<storage,read_write> xr: array<f32>; | |
| @group(0) @binding(3) var<uniform> P: vec4<u32>; // K, _, _, _ | |
| var<workgroup> sh: array<f32, 4096>; // 16 KB = the WebGPU min workgroup-storage limit; K β€ 4096 (covers d up to 7B-class) | |
| @compute @workgroup_size(${FWHT_WG}) | |
| fn main(@builtin(local_invocation_id) lid: vec3<u32>) { | |
| let K = P.x; let t = lid.x; | |
| for (var i = t; i < K; i += ${FWHT_WG}u) { sh[i] = x[i] * sgn[i]; } | |
| workgroupBarrier(); | |
| var len = 1u; | |
| loop { | |
| if (len >= K) { break; } | |
| let half = K >> 1u; | |
| for (var i = t; i < half; i += ${FWHT_WG}u) { | |
| let blk = i / len; let j = i % len; | |
| let a = blk * (len << 1u) + j; let b = a + len; | |
| let u = sh[a]; let v = sh[b]; sh[a] = u + v; sh[b] = u - v; | |
| } | |
| workgroupBarrier(); | |
| len = len << 1u; | |
| } | |
| let nrm = 1.0 / sqrt(f32(K)); | |
| for (var i = t; i < K; i += ${FWHT_WG}u) { xr[i] = sh[i] * nrm; } | |
| }`; | |
| // ββ WGSL: 2-bit GEMV β WORD-ORIENTED: each thread loads one u32 (16 weights), unpacks in registers, | |
| // hoists the per-32 block scale (16 weights at a 16-aligned base never cross a 32 boundary β one scale | |
| // read per word). Threads stride by workgroup over words β coalesced loads. f32 accumulate. ββ | |
| const MM2_WGSL = ` | |
| @group(0) @binding(0) var<storage,read> qw: array<u32>; | |
| @group(0) @binding(1) var<storage,read> sc: array<f32>; | |
| @group(0) @binding(2) var<storage,read> x: array<f32>; | |
| @group(0) @binding(3) var<storage,read_write> o: array<f32>; | |
| @group(0) @binding(4) var<uniform> P: vec4<u32>; // N, K, nblk, _ | |
| var<workgroup> red: array<f32, ${MM_WG}>; | |
| @compute @workgroup_size(${MM_WG}) | |
| fn main(@builtin(workgroup_id) wid: vec3<u32>, @builtin(local_invocation_id) lid: vec3<u32>) { | |
| let n = wid.x; let K = P.y; let words = K >> 4u; let rowW = n * words; let rowS = n * P.z; | |
| var acc = 0.0; var w = lid.x; | |
| loop { | |
| if (w >= words) { break; } | |
| let packed = qw[rowW + w]; | |
| let kb = w << 4u; let s = sc[rowS + (kb >> 5u)]; | |
| for (var j = 0u; j < 16u; j = j + 1u) { acc = acc + x[kb + j] * f32(i32((packed >> (j * 2u)) & 3u) * 2 - 3) * s; } | |
| w = w + ${MM_WG}u; | |
| } | |
| red[lid.x] = acc; workgroupBarrier(); | |
| var r = ${MM_WG >> 1}u; | |
| loop { if (r == 0u) { break; } if (lid.x < r) { red[lid.x] = red[lid.x] + red[lid.x + r]; } workgroupBarrier(); r = r >> 1u; } | |
| if (lid.x == 0u) { o[n] = red[0]; } | |
| }`; | |
| // ββ WGSL: Q8 GEMV (the engine's current format) β WORD-ORIENTED too, so the comparison is purely the | |
| // bytes-read difference, not kernel quality. Each thread loads one u32 (4 int8), unpacks 4. ββ | |
| const MM8_WGSL = ` | |
| @group(0) @binding(0) var<storage,read> qw: array<u32>; | |
| @group(0) @binding(1) var<storage,read> sc: array<f32>; | |
| @group(0) @binding(2) var<storage,read> x: array<f32>; | |
| @group(0) @binding(3) var<storage,read_write> o: array<f32>; | |
| @group(0) @binding(4) var<uniform> P: vec4<u32>; | |
| var<workgroup> red: array<f32, ${MM_WG}>; | |
| @compute @workgroup_size(${MM_WG}) | |
| fn main(@builtin(workgroup_id) wid: vec3<u32>, @builtin(local_invocation_id) lid: vec3<u32>) { | |
| let n = wid.x; let K = P.y; let words = K >> 2u; let rowW = n * words; let rowS = n * P.z; | |
| var acc = 0.0; var w = lid.x; | |
| loop { | |
| if (w >= words) { break; } | |
| let packed = qw[rowW + w]; | |
| let kb = w << 2u; let s = sc[rowS + (kb >> 5u)]; | |
| for (var j = 0u; j < 4u; j = j + 1u) { let b = (packed >> (j * 8u)) & 0xffu; acc = acc + x[kb + j] * f32(i32(b << 24u) >> 24u) * s; } | |
| w = w + ${MM_WG}u; | |
| } | |
| red[lid.x] = acc; workgroupBarrier(); | |
| var r = ${MM_WG >> 1}u; | |
| loop { if (r == 0u) { break; } if (lid.x < r) { red[lid.x] = red[lid.x] + red[lid.x + r]; } workgroupBarrier(); r = r >> 1u; } | |
| if (lid.x == 0u) { o[n] = red[0]; } | |
| }`; | |
| // ββ GPU helpers ββ | |
| const U = (typeof GPUBufferUsage !== "undefined") ? GPUBufferUsage : {}; | |
| function pipe(dev, code) { const m = dev.createShaderModule({ code }); return dev.createComputePipeline({ layout: "auto", compute: { module: m, entryPoint: "main" } }); } | |
| function sbuf(dev, src) { const b = dev.createBuffer({ size: Math.max(16, src.byteLength), usage: U.STORAGE | U.COPY_DST | U.COPY_SRC }); dev.queue.writeBuffer(b, 0, src); return b; } | |
| function obuf(dev, bytes) { return dev.createBuffer({ size: Math.max(16, bytes), usage: U.STORAGE | U.COPY_SRC }); } | |
| function ubuf(dev, arr) { const b = dev.createBuffer({ size: 16, usage: U.UNIFORM | U.COPY_DST }); dev.queue.writeBuffer(b, 0, arr); return b; } | |
| async function readf32(dev, buf, n) { const st = dev.createBuffer({ size: n * 4, usage: U.MAP_READ | U.COPY_DST }); const e = dev.createCommandEncoder(); e.copyBufferToBuffer(buf, 0, st, 0, n * 4); dev.queue.submit([e.finish()]); await st.mapAsync(GPUMapMode.READ); const out = new Float32Array(st.getMappedRange().slice(0)); st.unmap(); st.destroy(); return out; } | |
| // ββ the bench: correctness (2-bit incoherent vs f32 ref vs naive-2-bit vs Q8) + perf + memory ββ | |
| export async function runBench(dev, { N = 2048, K = 2048, iters = 200 } = {}) { | |
| const nblk = K / 32; | |
| // random Gaussian weights + input (a realistic single layer matmul) | |
| let s = 1234567; const rnd = () => (s = (s * 1664525 + 1013904223) >>> 0) / 4294967296; | |
| const gauss = () => { const u = Math.max(1e-12, rnd()); return Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * rnd()); }; | |
| // heavy-tailed weights like real LLM layers (kurtosis β« 3): a Gaussian bulk plus sparse large spikes β | |
| // the outliers that wreck naive low-bit quantization and that incoherence (the Hadamard) spreads out. | |
| const W = new Float32Array(N * K); for (let i = 0; i < N * K; i++) { let w = gauss() * 0.05; if (rnd() < 0.02) w *= 6; W[i] = w; } | |
| const x = new Float32Array(K); for (let i = 0; i < K; i++) x[i] = gauss(); | |
| // f32 reference y = WΒ·x | |
| const yref = new Float32Array(N); for (let n = 0; n < N; n++) { let a = 0; for (let k = 0; k < K; k++) a += W[n * K + k] * x[k]; yref[n] = a; } | |
| const relErr = (y) => { let e = 0, r = 0; for (let n = 0; n < N; n++) { const d = y[n] - yref[n]; e += d * d; r += yref[n] * yref[n]; } return Math.sqrt(e / r); }; | |
| // pack incoherent 2-bit + naive 2-bit + Q8 (per-32 scale, the engine format) | |
| const inc = pack2bit(W, N, K, { incoherent: true }); | |
| const nai = pack2bit(W, N, K, { incoherent: false }); | |
| const q8 = new Int8Array(N * K), q8s = new Float32Array(N * nblk); | |
| for (let n = 0; n < N; n++) for (let b = 0; b < nblk; b++) { let mx = 0; for (let i = 0; i < 32; i++) { const a = Math.abs(W[n * K + b * 32 + i]); if (a > mx) mx = a; } const sca = (mx / 127) || 1e-12; q8s[n * nblk + b] = sca; for (let i = 0; i < 32; i++) { let q = Math.round(W[n * K + b * 32 + i] / sca); if (q > 127) q = 127; else if (q < -127) q = -127; q8[n * K + b * 32 + i] = q; } } | |
| // GPU pipelines | |
| const pF = pipe(dev, FWHT_WGSL), p2 = pipe(dev, MM2_WGSL), p8 = pipe(dev, MM8_WGSL); | |
| // buffers | |
| const xB = sbuf(dev, x), xrB = obuf(dev, K * 4), sgnB = sbuf(dev, inc.sign), Pf = ubuf(dev, new Uint32Array([K, 0, 0, 0])); | |
| const qwB = sbuf(dev, inc.qw), scB = sbuf(dev, inc.sc), oB = obuf(dev, N * 4), P2 = ubuf(dev, new Uint32Array([N, K, nblk, 0])); | |
| const naiqwB = sbuf(dev, nai.qw), naiscB = sbuf(dev, nai.sc), oNB = obuf(dev, N * 4); | |
| const q8B = sbuf(dev, new Uint8Array(q8.buffer)), q8sB = sbuf(dev, q8s), o8B = obuf(dev, N * 4); | |
| const bgF = (xin) => dev.createBindGroup({ layout: pF.getBindGroupLayout(0), entries: [{ binding: 0, resource: { buffer: xin } }, { binding: 1, resource: { buffer: sgnB } }, { binding: 2, resource: { buffer: xrB } }, { binding: 3, resource: { buffer: Pf } }] }); | |
| const bg2 = (qw, sc, xin, o) => dev.createBindGroup({ layout: p2.getBindGroupLayout(0), entries: [{ binding: 0, resource: { buffer: qw } }, { binding: 1, resource: { buffer: sc } }, { binding: 2, resource: { buffer: xin } }, { binding: 3, resource: { buffer: o } }, { binding: 4, resource: { buffer: P2 } }] }); | |
| const bg8 = dev.createBindGroup({ layout: p8.getBindGroupLayout(0), entries: [{ binding: 0, resource: { buffer: q8B } }, { binding: 1, resource: { buffer: q8sB } }, { binding: 2, resource: { buffer: xB } }, { binding: 3, resource: { buffer: o8B } }, { binding: 4, resource: { buffer: P2 } }] }); | |
| // ββ correctness ββ | |
| const doInc = K <= 4096; // single-workgroup FWHT covers K β€ 4096 (16 KB shared) | |
| let yInc = null, gpuCpu = null; | |
| if (doInc) { | |
| // incoherent path: GPU rotate xβxβ², then 2-bit matmul with xβ² | |
| { const e = dev.createCommandEncoder(); const p = e.beginComputePass(); p.setPipeline(pF); p.setBindGroup(0, bgF(xB)); p.dispatchWorkgroups(1); p.setPipeline(p2); p.setBindGroup(0, bg2(qwB, scB, xrB, oB)); p.dispatchWorkgroups(N); p.end(); dev.queue.submit([e.finish()]); } | |
| yInc = await readf32(dev, oB, N); | |
| // CPU re-derivation of the SAME stored 2-bit weights β independent check the GPU kernel agrees | |
| const Wrec = unpack2bit(inc.qw, inc.sc, inc.sign, N, K, { incoherent: true }); | |
| const yCpu = new Float32Array(N); for (let n = 0; n < N; n++) { let a = 0; for (let k = 0; k < K; k++) a += Wrec[n * K + k] * x[k]; yCpu[n] = a; } | |
| let e2 = 0, rr = 0; for (let n = 0; n < N; n++) { const d = yInc[n] - yCpu[n]; e2 += d * d; rr += yCpu[n] * yCpu[n]; } gpuCpu = Math.sqrt(e2 / rr); | |
| } | |
| // naive path (no rotation) + Q8 path | |
| { const e = dev.createCommandEncoder(); const p = e.beginComputePass(); p.setPipeline(p2); p.setBindGroup(0, bg2(naiqwB, naiscB, xB, oNB)); p.dispatchWorkgroups(N); p.end(); dev.queue.submit([e.finish()]); } | |
| const yNai = await readf32(dev, oNB, N); | |
| { const e = dev.createCommandEncoder(); const p = e.beginComputePass(); p.setPipeline(p8); p.setBindGroup(0, bg8); p.dispatchWorkgroups(N); p.end(); dev.queue.submit([e.finish()]); } | |
| const yQ8 = await readf32(dev, o8B, N); | |
| // ββ perf β apples-to-apples: time each MATMUL alone (incoherence rotation measured separately) ββ | |
| // pre-rotate x once so the 2-bit matmul reads xβ² without re-running the FWHT in the timed loop. | |
| if (doInc) { const e = dev.createCommandEncoder(); const p = e.beginComputePass(); p.setPipeline(pF); p.setBindGroup(0, bgF(xB)); p.dispatchWorkgroups(1); p.end(); dev.queue.submit([e.finish()]); } | |
| const time = async (fn) => { fn(); await dev.queue.onSubmittedWorkDone(); const t0 = performance.now(); for (let i = 0; i < iters; i++) fn(); await dev.queue.onSubmittedWorkDone(); return (performance.now() - t0) / iters; }; | |
| const run2mm = () => { const e = dev.createCommandEncoder(); const p = e.beginComputePass(); p.setPipeline(p2); p.setBindGroup(0, bg2(qwB, scB, doInc ? xrB : xB, oB)); p.dispatchWorkgroups(N); p.end(); dev.queue.submit([e.finish()]); }; | |
| const run8mm = () => { const e = dev.createCommandEncoder(); const p = e.beginComputePass(); p.setPipeline(p8); p.setBindGroup(0, bg8); p.dispatchWorkgroups(N); p.end(); dev.queue.submit([e.finish()]); }; | |
| const runF = () => { const e = dev.createCommandEncoder(); const p = e.beginComputePass(); p.setPipeline(pF); p.setBindGroup(0, bgF(xB)); p.dispatchWorkgroups(1); p.end(); dev.queue.submit([e.finish()]); }; | |
| const ms2 = await time(run2mm), ms8 = await time(run8mm), msF = doInc ? await time(runF) : null; | |
| const bytes2 = inc.qw.byteLength + inc.sc.byteLength, bytes8 = q8.byteLength + q8s.byteLength; | |
| const gbps = (b, ms) => (b / 1e9) / (ms / 1e3); | |
| return { | |
| N, K, iters, | |
| err: { incoherent2bit: doInc ? relErr(yInc) : null, naive2bit: relErr(yNai), q8: relErr(yQ8), gpu_vs_cpu_2bit: gpuCpu }, | |
| perf: { ms_2bit_mm: +ms2.toFixed(4), ms_q8_mm: +ms8.toFixed(4), ms_fwht: msF == null ? null : +msF.toFixed(4), matmul_speedup: +(ms8 / ms2).toFixed(2), gbps_2bit: +gbps(bytes2, ms2).toFixed(0), gbps_q8: +gbps(bytes8, ms8).toFixed(0) }, | |
| mem: { MB_2bit: +(bytes2 / 1e6).toFixed(2), MB_q8: +(bytes8 / 1e6).toFixed(2), ratio: +(bytes8 / bytes2).toFixed(2), bits_per_weight: +(bytes2 * 8 / (N * K)).toFixed(2) }, | |
| }; | |
| } | |