Spaces:

abhijitramesh
/

webgpu-bench

Running

App Files Files Community

GitHub Actions committed 27 days ago

Commit

ba6f9e5

1 Parent(s): 2ee9bac

sync from abhijitramesh/webgpu-bench@55ab2c71db

Browse files

Files changed (3) hide show

js/run/bench-worker.js +63 -20
js/run/controller.js +11 -11
js/run/core.js +60 -21

js/run/bench-worker.js CHANGED Viewed

@@ -126,25 +126,61 @@ async function runOne({ params, stream }) {
   });
   log('WASM module loaded');
-  // ─── Stream the model into MEMFS ───
-  status('downloading', 'Streaming model into WASM FS...');
-  if (contentLength > 0) {
-    Module.FS.writeFile('/model.gguf', new Uint8Array(0));
-    Module.FS.truncate('/model.gguf', contentLength);
   }
-  const memfsHandle = Module.FS.open('/model.gguf', 'w');
-  const reader = stream.getReader();
-  let downloaded = 0;
-  while (true) {
-    const { done, value } = await reader.read();
-    if (done) break;
-    Module.FS.write(memfsHandle, value, 0, value.length, downloaded);
-    downloaded += value.length;
-    const fraction = contentLength ? downloaded / contentLength : 0;
-    post({ type: 'progress', fraction, downloaded, total: contentLength });
   }
-  Module.FS.close(memfsHandle);
-  log(`Model written to /model.gguf (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
   // ─── Init backend ───
   status('initializing', 'Initializing llama.cpp backends...');
@@ -164,12 +200,13 @@ async function runOne({ params, stream }) {
   if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
   log('Model loaded');
-  // Free MEMFS copy — llama.cpp has mapped weights into its own heap by now.
   try {
     Module.FS.unlink('/model.gguf');
-    log('Freed model file from virtual FS');
   } catch (err) {
-    log(`Warning: could not remove model file from FS: ${err.message}`);
   }
   // ─── Inference ───
@@ -227,6 +264,12 @@ async function runOne({ params, stream }) {
   await Module.ccall('bench_exit', null, [], [], { async: true });
   result.status = 'done';
   status('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
   log(

   });
   log('WASM module loaded');
+  // ─── Stream the model into the WASM heap (HeapFS-style) ───
+  // Avoid the JS-side MEMFS staging buffer by allocating space inside the
+  // WASM heap with _malloc and writing chunks directly via HEAPU8.set. Then
+  // register the file with MEMFS using a Uint8Array view backed by the heap
+  // region, so llama.cpp's mmap can take the zero-copy branch in MEMFS.mmap
+  // (which fires when contents.buffer === HEAP8.buffer).
+  //
+  // Heap growth during bench_init/bench_load detaches old views, so we
+  // override node.contents with a getter that always rebuilds the view
+  // from the saved pointer + length against the current Module.HEAPU8.
+  if (!(contentLength > 0)) {
+    throw new Error('content-length is required for streaming into WASM heap');
   }
+  status('downloading', 'Streaming model into WASM heap...');
+  let modelPtr = Module._malloc(contentLength);
+  if (!modelPtr) {
+    throw new Error(
+      `_malloc(${(contentLength / (1024 * 1024)).toFixed(0)} MB) failed — wasm heap exhausted`
+    );
+  }
+  try {
+    const reader = stream.getReader();
+    let downloaded = 0;
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      Module.HEAPU8.set(value, modelPtr + downloaded);
+      downloaded += value.length;
+      post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
+    }
+    log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
+    // Register as a MEMFS file with a heap-backed view. canOwn=true so MEMFS
+    // doesn't make its own copy.
+    const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
+    Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
+    // Replace contents with a getter — heap growth (e.g. when llama.cpp
+    // allocates KV cache) replaces Module.HEAPU8.buffer, which would
+    // detach our static view. The getter rebuilds against the live buffer.
+    const node = Module.FS.lookupPath('/model.gguf').node;
+    Object.defineProperty(node, 'contents', {
+      get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
+      set: () => { /* read-only file */ },
+      configurable: true,
+    });
+    // usedBytes is read by MEMFS for stat() — keep it accurate.
+    node.usedBytes = contentLength;
+  } catch (err) {
+    Module._free(modelPtr);
+    modelPtr = 0;
+    throw err;
   }
   // ─── Init backend ───
   status('initializing', 'Initializing llama.cpp backends...');
   if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
   log('Model loaded');
+  // Drop the MEMFS node — the bytes themselves stay alive in the WASM heap
+  // because llama.cpp's mmap captured a pointer into our _malloc'd region.
+  // We free that region after bench_exit.
   try {
     Module.FS.unlink('/model.gguf');
   } catch (err) {
+    log(`Warning: could not remove model FS node: ${err.message}`);
   }
   // ─── Inference ───
   await Module.ccall('bench_exit', null, [], [], { async: true });
+  // Free the heap-resident model bytes now that llama.cpp has unmapped.
+  if (modelPtr) {
+    Module._free(modelPtr);
+    modelPtr = 0;
+  }
   result.status = 'done';
   status('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
   log(

js/run/controller.js CHANGED Viewed

@@ -991,19 +991,19 @@ async function runVariantWithIterations(v, row) {
     cpuResult = { status: 'error', error: err.message || String(err) };
   }
-  if (cpuResult.status !== 'done') {
-    return {
-      status: 'error',
-      error: `CPU baseline failed: ${cpuResult.error || 'unknown'}`,
-      iterations: 0,
-      cpu: cpuResult,
-      gpuSamples: [],
-      consistency: null,
-      gpuCore: null,
-    };
   }
-  const refTokenIds = (cpuResult.metrics?.token_ids || []).join(',');
   // ─── GPU iterations ───
   const gpuSamples = [];

     cpuResult = { status: 'error', error: err.message || String(err) };
   }
+  // CPU baseline is "best effort": if it fails (typically OOM on a tight
+  // tab), keep going with GPU runs but skip the consistency check, since
+  // we'd have no reference token IDs to compare against. The user still
+  // gets prefill/decode metrics — just no agreement-rate number.
+  const cpuOk = cpuResult.status === 'done';
+  if (!cpuOk) {
+    logLine(
+      `CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU runs, skipping consistency check.`
+    );
+    row.setStatus('cpu-skipped', 'continuing with GPU only');
   }
+  const refTokenIds = cpuOk ? (cpuResult.metrics?.token_ids || []).join(',') : '';
   // ─── GPU iterations ───
   const gpuSamples = [];

js/run/core.js CHANGED Viewed

@@ -49,6 +49,9 @@ export async function runBenchmarkCore({
     output: '',
   };
   try {
     // WebGPU adapter probe — informational only.
     if (navigator.gpu) {
@@ -72,7 +75,7 @@ export async function runBenchmarkCore({
     onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
     await loadBenchScriptOnce(buildType);
-    const Module = await globalThis.createBenchModule({
       print: (text) => onLog(`[wasm] ${text}`),
       printErr: (text) => onLog(`[wasm:err] ${text}`),
       // Catch Emscripten abort() — Firefox can abort during Asyncify init.
@@ -93,25 +96,49 @@ export async function runBenchmarkCore({
       contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
     }`);
-    // Stream directly into MEMFS to avoid holding the full model in JS memory.
-    // Pre-allocate so MEMFS doesn't realloc on every chunk.
-    if (contentLength > 0) {
-      Module.FS.writeFile('/model.gguf', new Uint8Array(0));
-      Module.FS.truncate('/model.gguf', contentLength);
     }
-    const memfsHandle = Module.FS.open('/model.gguf', 'w');
-    const reader = stream.getReader();
-    let downloaded = 0;
-    while (true) {
-      const { done, value } = await reader.read();
-      if (done) break;
-      Module.FS.write(memfsHandle, value, 0, value.length, downloaded);
-      downloaded += value.length;
-      const fraction = contentLength ? downloaded / contentLength : 0;
-      onProgress(fraction, downloaded, contentLength);
     }
-    Module.FS.close(memfsHandle);
-    onLog(`Model written to /model.gguf (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
     // Init backend.
     onStatus('initializing', 'Initializing llama.cpp backends...');
@@ -133,12 +160,13 @@ export async function runBenchmarkCore({
     if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
     onLog('Model loaded');
-    // llama.cpp has copied the model into WASM heap — free the MEMFS copy.
     try {
       Module.FS.unlink('/model.gguf');
-      onLog('Freed model file from virtual FS');
     } catch (e) {
-      onLog(`Warning: could not remove model file from FS: ${e.message}`);
     }
     // Run inference.
@@ -198,6 +226,12 @@ export async function runBenchmarkCore({
     onLog('Calling bench_exit()...');
     await Module.ccall('bench_exit', null, [], [], { async: true });
     result.status = 'done';
     onStatus('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
     onLog(
@@ -216,6 +250,11 @@ export async function runBenchmarkCore({
     onStatus('error', `Error: ${err.message}`);
     onLog(`ERROR: ${err.message}`);
     if (err.stack) onLog(err.stack);
     return result;
   }
 }

     output: '',
   };
+  // Declared outside the try so the catch can free our heap allocation.
+  let Module;
   try {
     // WebGPU adapter probe — informational only.
     if (navigator.gpu) {
     onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
     await loadBenchScriptOnce(buildType);
+    Module = await globalThis.createBenchModule({
       print: (text) => onLog(`[wasm] ${text}`),
       printErr: (text) => onLog(`[wasm:err] ${text}`),
       // Catch Emscripten abort() — Firefox can abort during Asyncify init.
       contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
     }`);
+    // Stream the GGUF directly into the WASM heap (HeapFS-style) to avoid a
+    // duplicate JS-side MEMFS staging buffer. _malloc reserves a region in
+    // the linear memory; HEAPU8.set writes chunks in place. We then expose
+    // the region as a MEMFS file with `canOwn=true` so MEMFS does not copy,
+    // and override node.contents with a getter that always rebuilds the
+    // view from the saved pointer — this survives the heap growth that
+    // llama.cpp triggers during bench_init/bench_load.
+    if (!(contentLength > 0)) {
+      throw new Error('content-length is required for streaming into WASM heap');
     }
+    let modelPtr = Module._malloc(contentLength);
+    if (!modelPtr) {
+      throw new Error(
+        `_malloc(${(contentLength / (1024 * 1024)).toFixed(0)} MB) failed — wasm heap exhausted`
+      );
     }
+    try {
+      const reader = stream.getReader();
+      let downloaded = 0;
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        Module.HEAPU8.set(value, modelPtr + downloaded);
+        downloaded += value.length;
+        onProgress(downloaded / contentLength, downloaded, contentLength);
+      }
+      onLog(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
+      const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
+      Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
+      const node = Module.FS.lookupPath('/model.gguf').node;
+      Object.defineProperty(node, 'contents', {
+        get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
+        set: () => { /* read-only */ },
+        configurable: true,
+      });
+      node.usedBytes = contentLength;
+    } catch (err) {
+      Module._free(modelPtr);
+      throw err;
+    }
+    // Track on the result object so we can free in the success/exit paths.
+    result._modelPtr = modelPtr;
     // Init backend.
     onStatus('initializing', 'Initializing llama.cpp backends...');
     if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
     onLog('Model loaded');
+    // Drop the MEMFS node — llama.cpp's mmap captured a pointer into the
+    // _malloc'd region in the WASM heap, so the bytes themselves stay alive
+    // until we _free below after bench_exit.
     try {
       Module.FS.unlink('/model.gguf');
     } catch (e) {
+      onLog(`Warning: could not remove model FS node: ${e.message}`);
     }
     // Run inference.
     onLog('Calling bench_exit()...');
     await Module.ccall('bench_exit', null, [], [], { async: true });
+    // Free the heap-resident model bytes now that llama.cpp has unmapped.
+    if (result._modelPtr) {
+      Module._free(result._modelPtr);
+      delete result._modelPtr;
+    }
     result.status = 'done';
     onStatus('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
     onLog(
     onStatus('error', `Error: ${err.message}`);
     onLog(`ERROR: ${err.message}`);
     if (err.stack) onLog(err.stack);
+    // Best-effort: release the model heap region so a re-run can reuse it.
+    if (result._modelPtr && Module?._free) {
+      try { Module._free(result._modelPtr); } catch { /* ignore */ }
+      delete result._modelPtr;
+    }
     return result;
   }
 }