Spaces:

abhijitramesh
/

webgpu-bench

Running

App Files Files Community

GitHub Actions committed 19 days ago

Commit

299e359

1 Parent(s): 0fc83ad

sync from abhijitramesh/webgpu-bench@dab7e7757e

Browse files

Files changed (8) hide show

build/asyncify/bench.wasm +2 -2
build/asyncify/build-info.json +2 -2
build/jspi/bench.wasm +2 -2
build/jspi/build-info.json +2 -2
js/run/bench-worker.js +183 -55
js/run/controller.js +68 -21
js/run/core.js +8 -3
js/run/source.js +48 -0

build/asyncify/bench.wasm CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:424ebf14301cc29905a08d5a225bdc98b58dbbd6a949b22adcc8717bcda151c8
-size 5233169

 version https://git-lfs.github.com/spec/v1
+oid sha256:50895b262f9b0da117509d04075ca06f3b30d3482c130d22c827e53e20d8a650
+size 5233188

build/asyncify/build-info.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
-  "llamaCppDescribe": "b8979-5-gf22c8021d",
   "dawnTag": "v20260317.182325",
-  "builtAt": "2026-04-29T22:38:32Z"
 }

 {
   "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
+  "llamaCppDescribe": "b8981-3-gf22c8021d",
   "dawnTag": "v20260317.182325",
+  "builtAt": "2026-04-29T23:41:53Z"
 }

build/jspi/bench.wasm CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b549092fe3ac53a3cca8efdd6e9a3e4ebf64443b7d76a5946d59dd4052378fd3
-size 3612110

 version https://git-lfs.github.com/spec/v1
+oid sha256:92ef71c59da832ad869cbc002665fd3bb3505c7e515a7cefc5d7f7901224ea40
+size 3612135

build/jspi/build-info.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
-  "llamaCppDescribe": "b8979-5-gf22c8021d",
   "dawnTag": "v20260317.182325",
-  "builtAt": "2026-04-29T22:34:33Z"
 }

 {
   "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
+  "llamaCppDescribe": "b8981-3-gf22c8021d",
   "dawnTag": "v20260317.182325",
+  "builtAt": "2026-04-29T23:37:54Z"
 }

js/run/bench-worker.js CHANGED Viewed

@@ -37,6 +37,95 @@ const post = (msg) => self.postMessage(msg);
 const log = (line) => post({ type: 'log', line });
 const status = (s, msg) => post({ type: 'status', status: s, msg });
 // Aggregate raw nanosecond samples into the llama-bench result shape.
 // Mirrors core.js buildTest — keep them identical.
 function buildTest(name, n_prompt, n_gen, samples_ns) {
@@ -100,7 +189,7 @@ self.onmessage = async (e) => {
   }
 };
-async function runOne({ params, stream, buffer }) {
   const {
     buildType,
     contentLength,
@@ -116,8 +205,14 @@ async function runOne({ params, stream, buffer }) {
     nReps,
     noWarmup,
   } = params;
-  if (!stream && !buffer) {
-    throw new Error('runOne: exactly one of `stream` or `buffer` must be provided');
   }
   const result = {
@@ -178,55 +273,75 @@ async function runOne({ params, stream, buffer }) {
   });
   log('WASM module loaded');
-  // ─── Stream the model into the WASM heap (HeapFS-style) ───
-  if (!(contentLength > 0)) {
-    throw new Error('content-length is required for streaming into WASM heap');
-  }
-  status('downloading', 'Streaming model into WASM heap...');
-  let modelPtr = Module._malloc(contentLength);
-  if (!modelPtr) {
-    throw new Error(
-      `_malloc(${(contentLength / (1024 * 1024)).toFixed(0)} MB) failed — wasm heap exhausted`
-    );
-  }
-  try {
-    let downloaded = 0;
-    if (stream) {
-      const reader = stream.getReader();
-      while (true) {
-        const { done, value } = await reader.read();
-        if (done) break;
-        Module.HEAPU8.set(value, modelPtr + downloaded);
-        downloaded += value.length;
-        post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
-      }
-    } else {
-      const view = new Uint8Array(buffer);
-      if (view.byteLength !== contentLength) {
-        log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
-      }
-      Module.HEAPU8.set(view, modelPtr);
-      downloaded = view.byteLength;
-      post({ type: 'progress', fraction: 1, downloaded, total: contentLength });
     }
-    log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
-    const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
-    Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
-    const node = Module.FS.lookupPath('/model.gguf').node;
-    Object.defineProperty(node, 'contents', {
-      get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
-      set: () => { /* read-only file */ },
-      configurable: true,
-    });
-    node.usedBytes = contentLength;
-  } catch (err) {
-    Module._free(modelPtr);
-    modelPtr = 0;
-    throw err;
   }
   // ─── Init backend ───
@@ -236,21 +351,30 @@ async function runOne({ params, stream, buffer }) {
   log('Backends initialized');
   // ─── Load model ───
-  status('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers})...`);
   const loadResult = await Module.ccall(
     'bench_load',
     'number',
-    ['string', 'number', 'number'],
-    ['/model.gguf', nCtx, nGpuLayers],
     { async: true },
   );
   if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
   log('Model loaded');
-  try {
-    Module.FS.unlink('/model.gguf');
-  } catch (err) {
-    log(`Warning: could not remove model FS node: ${err.message}`);
   }
   // ─── Consistency phase ───
@@ -362,7 +486,11 @@ async function runOne({ params, stream, buffer }) {
   await Module.ccall('bench_exit', null, [], [], { async: true });
-  if (modelPtr) {
     Module._free(modelPtr);
     modelPtr = 0;
   }

 const log = (line) => post({ type: 'log', line });
 const status = (s, msg) => post({ type: 'status', status: s, msg });
+// ─── OPFS-backed model loading (wllama-style) ───
+// For >2GB GGUFs we can't put the whole file on the WASM heap (TypedArray
+// length limits, and it eats the heap budget that KV cache + working memory
+// need). Instead, we open a FileSystemSyncAccessHandle on the OPFS file in
+// this worker, register a zero-byte stub in MEMFS, and patch MEMFS's
+// stream_ops so reads delegate to syncHandle.read(). llama.cpp then loads
+// the model via fread (use_mmap=false), which calls the patched stream_ops
+// — never copying the bytes through the WASM heap.
+//
+// Mirrors wllama's src/workers-code/llama-cpp.js (patchMEMFS / opfsAlloc /
+// opfsFreeAll). Worker-only: sync access handles aren't available on the
+// main thread.
+const opfsHandles = {}; // map MEMFS-name → { syncHandle, size }
+function patchMEMFS(Module) {
+  const m = Module;
+  // Idempotent — only install the patches once per Module.
+  if (m.MEMFS.stream_ops._read) return;
+  m.MEMFS.stream_ops._read = m.MEMFS.stream_ops.read;
+  m.MEMFS.stream_ops._llseek = m.MEMFS.stream_ops.llseek;
+  m.MEMFS.stream_ops._mmap = m.MEMFS.stream_ops.mmap;
+  m.MEMFS.stream_ops.read = function (stream, buffer, offset, length, position) {
+    const name = stream.node.name;
+    if (opfsHandles[name]) {
+      const { syncHandle, size } = opfsHandles[name];
+      const toRead = Math.min(length, size - position);
+      if (toRead <= 0) return 0;
+      const view = new Uint8Array(buffer.buffer, buffer.byteOffset + offset, toRead);
+      return syncHandle.read(view, { at: position });
+    }
+    return m.MEMFS.stream_ops._read(stream, buffer, offset, length, position);
+  };
+  m.MEMFS.ops_table.file.stream.read = m.MEMFS.stream_ops.read;
+  m.MEMFS.stream_ops.llseek = function (stream, offset, whence) {
+    const name = stream.node.name;
+    if (opfsHandles[name]) {
+      const { size } = opfsHandles[name];
+      let newPos = offset;
+      if (whence === 1) newPos += stream.position;  // SEEK_CUR
+      if (whence === 2) newPos += size;             // SEEK_END
+      if (newPos < 0) throw new Error('SEEK before start of file');
+      stream.position = newPos;
+      return newPos;
+    }
+    return m.MEMFS.stream_ops._llseek(stream, offset, whence);
+  };
+  m.MEMFS.ops_table.file.stream.llseek = m.MEMFS.stream_ops.llseek;
+  m.MEMFS.stream_ops.mmap = function (stream, length, position, prot, flags) {
+    const name = stream.node.name;
+    if (opfsHandles[name]) {
+      // OPFS-backed files must never be mmap'd — that would force MEMFS to
+      // copy the file into the WASM heap, defeating the OPFS path. The C++
+      // side passes use_mmap=0 to avoid this. If we ever land here, the
+      // caller forgot to disable mmap.
+      throw new Error(`[OPFS] mmap called on "${name}" — bench_load was not invoked with use_mmap=0`);
+    }
+    return m.MEMFS.stream_ops._mmap(stream, length, position, prot, flags);
+  };
+  m.MEMFS.ops_table.file.stream.mmap = m.MEMFS.stream_ops.mmap;
+}
+async function opfsAlloc(Module, name, fileHandle) {
+  // createSyncAccessHandle is worker-only and exclusive — only one writer
+  // per OPFS file at a time. Caller must ensure no createWritable session
+  // is open when we land here.
+  const syncHandle = await fileHandle.createSyncAccessHandle();
+  const size = syncHandle.getSize();
+  opfsHandles[name] = { syncHandle, size };
+  // Zero-byte placeholder so llama.cpp's fopen() finds the path.
+  Module.FS.createDataFile('/', name, new Uint8Array(0), true, false, true);
+  // Set usedBytes so fstat()/seek-end report the real file size — our
+  // patched llseek consults size, but other code (e.g. llama.cpp's GGUF
+  // reader sanity-checking the file length) goes through stat first.
+  Module.FS.lookupPath('/' + name).node.usedBytes = size;
+  return size;
+}
+function opfsFreeAll(Module) {
+  for (const [name, { syncHandle }] of Object.entries(opfsHandles)) {
+    try { syncHandle.close(); } catch { /* already closed */ }
+    try { Module.FS.unlink('/' + name); } catch { /* already gone */ }
+    delete opfsHandles[name];
+  }
+}
 // Aggregate raw nanosecond samples into the llama-bench result shape.
 // Mirrors core.js buildTest — keep them identical.
 function buildTest(name, n_prompt, n_gen, samples_ns) {
   }
 };
+async function runOne({ params, stream, buffer, fileHandle }) {
   const {
     buildType,
     contentLength,
     nReps,
     noWarmup,
   } = params;
+  // Three input modes are supported:
+  //   fileHandle  → wllama-style OPFS-streaming load (preferred for >2GB)
+  //   stream      → heap-stream mode (zero-copy WASM-heap, transferable)
+  //   buffer      → buffered fallback for browsers without transferable streams
+  // Exactly one must be provided.
+  const inputCount = (fileHandle ? 1 : 0) + (stream ? 1 : 0) + (buffer ? 1 : 0);
+  if (inputCount !== 1) {
+    throw new Error('runOne: exactly one of `fileHandle`, `stream`, or `buffer` must be provided');
   }
   const result = {
   });
   log('WASM module loaded');
+  // ─── Make the model visible to the WASM filesystem ───
+  // Two paths:
+  //   useOpfsPath: leave the bytes on disk (OPFS) and route reads through
+  //               a sync access handle via patched MEMFS stream_ops. No
+  //               heap copy, supports >2GB.
+  //   else:       _malloc the full file on the WASM heap, write the stream
+  //               in, register a heap-backed MEMFS file. Faster (mmap'd
+  //               zero-copy at load time) but caps at ~2GB.
+  let modelPtr = 0;  // tracks heap-path allocation for cleanup
+  const useOpfsPath = !!fileHandle;
+  if (useOpfsPath) {
+    status('opfs', 'Linking OPFS-backed model into MEMFS...');
+    patchMEMFS(Module);
+    const size = await opfsAlloc(Module, 'model.gguf', fileHandle);
+    log(`OPFS-backed model.gguf registered (${(size / (1024 * 1024)).toFixed(1)} MB)`);
+    // Report 100% to keep the existing progress UI happy — the actual
+    // download to OPFS happened before the worker spawn.
+    post({ type: 'progress', fraction: 1, downloaded: size, total: size });
+  } else {
+    if (!(contentLength > 0)) {
+      throw new Error('content-length is required for streaming into WASM heap');
+    }
+    status('downloading', 'Streaming model into WASM heap...');
+    modelPtr = Module._malloc(contentLength);
+    if (!modelPtr) {
+      throw new Error(
+        `_malloc(${(contentLength / (1024 * 1024)).toFixed(0)} MB) failed — wasm heap exhausted`
+      );
     }
+    try {
+      let downloaded = 0;
+      if (stream) {
+        const reader = stream.getReader();
+        while (true) {
+          const { done, value } = await reader.read();
+          if (done) break;
+          Module.HEAPU8.set(value, modelPtr + downloaded);
+          downloaded += value.length;
+          post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
+        }
+      } else {
+        const view = new Uint8Array(buffer);
+        if (view.byteLength !== contentLength) {
+          log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
+        }
+        Module.HEAPU8.set(view, modelPtr);
+        downloaded = view.byteLength;
+        post({ type: 'progress', fraction: 1, downloaded, total: contentLength });
+      }
+      log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
+      const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
+      Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
+      const node = Module.FS.lookupPath('/model.gguf').node;
+      Object.defineProperty(node, 'contents', {
+        get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
+        set: () => { /* read-only file */ },
+        configurable: true,
+      });
+      node.usedBytes = contentLength;
+    } catch (err) {
+      Module._free(modelPtr);
+      modelPtr = 0;
+      throw err;
+    }
   }
   // ─── Init backend ───
   log('Backends initialized');
   // ─── Load model ───
+  // OPFS path requires use_mmap=0 — the patched mmap throws to surface bugs
+  // if it's accidentally invoked. Heap path uses mmap=1 to take MEMFS's
+  // zero-copy mmap fast path against our HEAPU8-backed file.
+  const useMmap = useOpfsPath ? 0 : 1;
+  status('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers}, mmap=${useMmap})...`);
   const loadResult = await Module.ccall(
     'bench_load',
     'number',
+    ['string', 'number', 'number', 'number'],
+    ['/model.gguf', nCtx, nGpuLayers, useMmap],
     { async: true },
   );
   if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
   log('Model loaded');
+  if (!useOpfsPath) {
+    // Heap path: drop the MEMFS node now that llama.cpp's mmap captured a
+    // pointer into our _malloc'd region. Bytes stay alive in the heap until
+    // bench_exit + _free.
+    try {
+      Module.FS.unlink('/model.gguf');
+    } catch (err) {
+      log(`Warning: could not remove model FS node: ${err.message}`);
+    }
   }
   // ─── Consistency phase ───
   await Module.ccall('bench_exit', null, [], [], { async: true });
+  if (useOpfsPath) {
+    // Close the sync handle so OPFS can release its lock on the file (and
+    // so a subsequent run can open a fresh handle without colliding).
+    opfsFreeAll(Module);
+  } else if (modelPtr) {
     Module._free(modelPtr);
     modelPtr = 0;
   }

js/run/controller.js CHANGED Viewed

@@ -1126,6 +1126,7 @@ async function onRunClick() {
 function runInWorker({
   params,
   stream,
   onStatus,
   onProgress,
   onLog,
@@ -1166,6 +1167,19 @@ function runInWorker({
       finish({ status: 'error', error: 'worker message deserialization failed' });
     };
     // Mobile browsers (esp. iOS Safari) advertise transferable streams but
     // can't actually transfer ReadableStreams across postMessage — the call
     // throws "The object can not be cloned." We probe once with a tiny
@@ -1249,9 +1263,59 @@ async function readStreamToBuffer(stream, contentLength, onProgress) {
   return out.buffer;
 }
-// Fetch the model through the source adapter and hand the stream to a
-// freshly-spawned worker. Returns a record shaped like runBenchmarkCore().
 async function runBenchmarkInWorker(v, params, callbacks) {
   let fetched;
   try {
     fetched = await state.source.fetchModel(v.repo, v.filename);
@@ -1259,30 +1323,13 @@ async function runBenchmarkInWorker(v, params, callbacks) {
     return { status: 'error', error: `fetchModel failed: ${err.message}` };
   }
-  const record = await runInWorker({
-    params: {
-      buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
-      contentLength: fetched.contentLength,
-      // Model load
-      nCtx: params.nCtx,
-      nGpuLayers: params.nGpuLayers,
-      // Consistency phase — empty consistencyPrompt skips it
-      consistencyPrompt: params.consistencyPrompt || '',
-      consistencyNPredict: params.consistencyNPredict || DEFAULT_N_PREDICT,
-      refTokenIds: params.refTokenIds || null,
-      // Perf phase — set both to 0 to skip
-      nPrompt: params.nPrompt ?? 0,
-      nGen:    params.nGen    ?? 0,
-      nReps:   params.nReps   ?? DEFAULT_ITERATIONS,
-      noWarmup: !!params.noWarmup,
-    },
     stream: fetched.stream,
     onStatus: callbacks.onStatus,
     onProgress: callbacks.onProgress,
     onLog: callbacks.onLog,
   });
-  return record;
 }
 // Runs one variant: CPU consistency baseline (one model load, generates

 function runInWorker({
   params,
   stream,
+  fileHandle,
   onStatus,
   onProgress,
   onLog,
       finish({ status: 'error', error: 'worker message deserialization failed' });
     };
+    // Three transport modes — see bench-worker.js runOne() for matching shape.
+    if (fileHandle) {
+      // OPFS path: FileSystemFileHandle is structured-cloneable, not
+      // transferable. The worker creates its own sync access handle on the
+      // cloned reference (still bound to the same underlying OPFS file).
+      try {
+        worker.postMessage({ type: 'run', params, fileHandle });
+      } catch (err) {
+        finish({ status: 'error', error: `postMessage(fileHandle) failed: ${err.message}` });
+      }
+      return;
+    }
     // Mobile browsers (esp. iOS Safari) advertise transferable streams but
     // can't actually transfer ReadableStreams across postMessage — the call
     // throws "The object can not be cloned." We probe once with a tiny
   return out.buffer;
 }
+// Fetch the model and hand it to a freshly-spawned worker. Returns a record
+// shaped like runBenchmarkCore(). Two paths:
+//
+//   wllama-style OPFS streaming (preferred): if the source provides
+//   opfsHandleForModel (currently hostedSource), download to OPFS on the
+//   main thread, then post the FileSystemFileHandle to the worker (it is
+//   structured-cloned, not transferred). The worker opens a sync access
+//   handle and routes MEMFS reads through it, never copying the model into
+//   the WASM heap. Supports >2GB.
+//
+//   Heap-stream (fallback for localSource): keep the prior behavior —
+//   stream the GGUF into a single _malloc'd buffer in the WASM heap.
+//   Faster for small models (zero-copy mmap on load), capped at ~2GB.
 async function runBenchmarkInWorker(v, params, callbacks) {
+  const useOpfs = typeof state.source.opfsHandleForModel === 'function';
+  const baseParams = {
+    buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
+    // Model load
+    nCtx: params.nCtx,
+    nGpuLayers: params.nGpuLayers,
+    // Consistency phase — empty consistencyPrompt skips it
+    consistencyPrompt: params.consistencyPrompt || '',
+    consistencyNPredict: params.consistencyNPredict || DEFAULT_N_PREDICT,
+    refTokenIds: params.refTokenIds || null,
+    // Perf phase — set both to 0 to skip
+    nPrompt: params.nPrompt ?? 0,
+    nGen:    params.nGen    ?? 0,
+    nReps:   params.nReps   ?? DEFAULT_ITERATIONS,
+    noWarmup: !!params.noWarmup,
+  };
+  if (useOpfs) {
+    let fileHandle, contentLength;
+    try {
+      callbacks.onStatus?.('downloading', 'Downloading model to OPFS...');
+      const r = await state.source.opfsHandleForModel(
+        v.repo, v.filename,
+        callbacks.onProgress,
+      );
+      fileHandle = r.handle;
+      contentLength = r.size;
+    } catch (err) {
+      return { status: 'error', error: `opfsHandleForModel failed: ${err.message}` };
+    }
+    return runInWorker({
+      params: { ...baseParams, contentLength },
+      fileHandle,
+      onStatus: callbacks.onStatus,
+      onProgress: callbacks.onProgress,
+      onLog: callbacks.onLog,
+    });
+  }
   let fetched;
   try {
     fetched = await state.source.fetchModel(v.repo, v.filename);
     return { status: 'error', error: `fetchModel failed: ${err.message}` };
   }
+  return runInWorker({
+    params: { ...baseParams, contentLength: fetched.contentLength },
     stream: fetched.stream,
     onStatus: callbacks.onStatus,
     onProgress: callbacks.onProgress,
     onLog: callbacks.onLog,
   });
 }
 // Runs one variant: CPU consistency baseline (one model load, generates

js/run/core.js CHANGED Viewed

@@ -329,12 +329,17 @@ export async function runBenchmarkCore({
     if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
     onLog('Backends initialized');
-    onStatus('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers})...`);
     const loadResult = await Module.ccall(
       'bench_load',
       'number',
-      ['string', 'number', 'number'],
-      ['/model.gguf', nCtx, nGpuLayers],
       { async: true },
     );
     if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);

     if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
     onLog('Backends initialized');
+    // core.js is the main-thread/heap-stream path (used by harness.js +
+    // runner.js Playwright harness). Sync access handles aren't available
+    // on the main thread, so we always pass use_mmap=1 here — llama.cpp
+    // mmap's the HEAPU8-backed MEMFS file zero-copy. Capped at ~2GB.
+    // For >2GB models, run via the dashboard Run page (worker path).
+    onStatus('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers}, mmap=1)...`);
     const loadResult = await Module.ccall(
       'bench_load',
       'number',
+      ['string', 'number', 'number', 'number'],
+      ['/model.gguf', nCtx, nGpuLayers, 1],
       { async: true },
     );
     if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);

js/run/source.js CHANGED Viewed

@@ -95,6 +95,54 @@ export function hostedSource() {
       }
     },
     async fetchModel(repo, file) {
       // Cache hit → stream the OPFS file straight out.
       try {

       }
     },
+    // Ensure the model is fully downloaded to OPFS, then return its
+    // FileSystemFileHandle. Used by the wllama-style OPFS-streaming load
+    // path: the worker opens a sync access handle on this FileHandle and
+    // routes MEMFS reads through it, never copying the model into the
+    // WASM heap. onProgress is called during the download leg with
+    // (fraction, downloaded, total).
+    async opfsHandleForModel(repo, file, onProgress) {
+      const cached = await getOpfsFileHandle(repo, file, { create: false }).catch(() => null);
+      if (cached) {
+        const f = await cached.getFile();
+        if (f.size > 0) {
+          onProgress?.(1, f.size, f.size);
+          return { handle: cached, size: f.size };
+        }
+      }
+      // Cache miss — download from HF straight into a writable OPFS stream.
+      const url = `https://huggingface.co/${repo}/resolve/main/${file}`;
+      const resp = await fetch(url);
+      if (!resp.ok) {
+        throw new Error(`Download failed: ${resp.status} ${resp.statusText}`);
+      }
+      const contentLength = parseInt(resp.headers.get('content-length') || '0', 10);
+      const handle = await getOpfsFileHandle(repo, file, { create: true });
+      const writable = await handle.createWritable({ keepExistingData: false });
+      // Same persistent-storage hint as fetchModel — best-effort.
+      navigator.storage?.persist?.().catch(() => {});
+      try {
+        const reader = resp.body.getReader();
+        let downloaded = 0;
+        while (true) {
+          const { done, value } = await reader.read();
+          if (done) break;
+          await writable.write(value);
+          downloaded += value.byteLength;
+          if (contentLength > 0) onProgress?.(downloaded / contentLength, downloaded, contentLength);
+        }
+        await writable.close();
+        return { handle, size: downloaded };
+      } catch (err) {
+        try { await writable.abort(err); } catch { /* ignore */ }
+        throw err;
+      }
+    },
     async fetchModel(repo, file) {
       // Cache hit → stream the OPFS file straight out.
       try {