Spaces:

abhijitramesh
/

webgpu-bench

Running

File size: 19,308 Bytes

// Device-fit helpers for the interactive bench page.
//
// Two budget probes drive the per-variant fit decision:
//
//   probeHeapBudgetMB() — empirical WASM heap probe. Grows a
//     WebAssembly.Memory page-by-page in a worker until it fails. Caps
//     the working set (KV cache + compute scratch + JS heap headroom)
//     llama.cpp consumes during inference.
//
//   probeGpuBudgetMB() — empirical WebGPU memory probe. Allocates real
//     buffers with mappedAtCreation=true on the actual adapter until OOM.
//     Caps the size of model weights llama.cpp can hold in GPU buffers,
//     since OPFS-streaming keeps model bytes off the WASM heap.
//
// variantFits() then checks both: model size + GPU overhead ≤ GPU budget,
// AND heap working-set floor ≤ heap budget. wllama doesn't probe at all
// — they let load attempts fail naturally — but our auto-select buttons
// ("All fit", "Run study") need a fit predicate, so we err on the side
// of measuring rather than guessing.
//
// On wasm32 the linear memory caps at 4 GiB no matter how much physical
// RAM the device has, so heap probe results above 4096 MB cannot exist.

// Last-resort heap budget (2 GiB, expressed in MB). _computeBudget() uses
// it when the heap probe fails and neither navigator.deviceMemory nor a
// storage quota is available.
const DEFAULT_BUDGET_MB = 2 * 1024;
// Fraction of navigator.storage.estimate().quota taken as the heap
// budget when the heap probe fails and deviceMemory is not exposed.
const HOSTED_QUOTA_FRACTION = 0.4;
// Upper bound (8 GiB, expressed in MB) on that quota-derived estimate.
const HOSTED_QUOTA_CAP_MB = 8 * 1024;

// Mobile per-device budgets. Two independent caps, mirroring the desktop
// path — model weights stream from OPFS into WebGPU buffers (see
// bench-worker.js:patchMEMFS / opfsAlloc), so the model size constrains
// `gpuBudgetMB`, not `heapBudgetMB`. The WASM heap only has to hold the
// working set (KV cache + ggml compute scratch + JS heap headroom).
//
// Earlier we collapsed both into a single tab budget on the theory that
// iOS Jetsam treats the whole tab process as one pool, so any allocation
// counts the same. That's true for Jetsam — but it conflates *where* the
// memory lives with *how much* the platform can hand out: the WASM heap
// has a much tighter practical ceiling than the GPU side, and counting
// model bytes against the heap ceiling rejected models that load fine
// via OPFS streaming.
//
// Numbers come from public reports / Apple docs:
//
//  - iPhone WASM practical limit: 300–450 MB → heap budget
//      lapcatsoftware.com/articles/2026/1/7.html
//      news.ycombinator.com/item?id=39039593
//      github.com/emscripten-core/emscripten/issues/19374
//      github.com/godotengine/godot/issues/70621
//
//  - iOS Safari WebGPU maxBufferSize: 256 MB on iPhone 6 / older,
//    993 MB on iPad Pro M-class. Per-buffer cap, not total.
//      Apple WWDC 2025 "Unlock GPU computing with WebGPU"
//
//  - iPhone 12 Pro reports tab OOM around 1.5–3 GB; Jetsam intervenes
//    earlier under pressure. We undershoot the lower bound for headroom.
//      developer.apple.com/forums/thread/761666
//
// Heap budgets = WASM heap practical limits, in MB, one per mobile
// family. getMobileBudgetMB() returns these as the `heap` field; the
// iPhone value is also its fallback for an unrecognised family.
const IPHONE_HEAP_BUDGET_MB  = 450;
const IPAD_HEAP_BUDGET_MB    = 1500;
const ANDROID_HEAP_BUDGET_MB = 800;

// GPU budgets = available GPU-buffer capacity for model weights + KV
// mirror, sized below the Jetsam tab ceiling minus working-set headroom.
// Static per-family numbers — we don't probe on mobile because the
// probe's allocation pulse itself triggers Jetsam on lower-RAM devices,
// and WebKit doesn't expose a signal that distinguishes (e.g.) iPhone 13
// from iPhone 17 Pro Max (same maxBufferSize 1024 MB on both, same
// deviceMemory clamp). See "mobile probe" history in git: bounded
// probe shipped, then disabled because the iPhone 13 / mid-RAM iPad
// classes still Jetsamed during or right after the probe pulse.
//
// iPhone: empirical — 1200 MB caused tab reloads on first variant of a
// Run study (Llama-3.2-1B Q2_K, 554 MB) on iPhone 17 Pro Max. 700 MB
// keeps Llama-1B variants out of variantFits while still allowing the
// 250–500 MB tier (gemma-3-270m Q8, Qwen3-0.6B Q4, etc.).
//
// iPad: empirical — 2500 MB Jetsamed on Llama-3.2-1B (likely Q4_K_M
// = 770 MB or Q8_0 = 1259 MB). 1500 MB allows Q4_K_M (970 MB after
// overhead), keeping the standard study quant runnable. NOTE: Q8_0 is
// 1459 MB after the 200 MB overhead, which is still ≤ 1500, so
// variantFits() admits it as well; the budget would have to drop below
// 1459 MB to actually exclude it. High-end iPad Pro M-class probably
// tolerates more, but we have no way to detect device class.
// Static per-family GPU budgets, in MB. getMobileBudgetMB() returns
// these as the `gpu` field; the iPhone value is also its fallback for an
// unrecognised family.
const IPHONE_GPU_BUDGET_MB  = 700;
const IPAD_GPU_BUDGET_MB    = 1500;
const ANDROID_GPU_BUDGET_MB = 1500;

// Classify the current browser as 'ipad', 'iphone', 'android', or null
// (not a recognised mobile family, or no `navigator` global at all).
function detectMobileFamily() {
  if (typeof navigator === 'undefined') {
    return null;
  }

  const agent = navigator.userAgent || '';

  // iPadOS 13+ reports a "Macintosh" UA but still exposes multi-touch, so
  // a touch-capable Mac platform is treated as an iPad.
  const looksLikeTouchMac = () =>
    navigator.maxTouchPoints > 1 && /Mac/.test(navigator.platform || '');

  if (/iPad/.test(agent) || looksLikeTouchMac()) {
    return 'ipad';
  }
  if (/iPhone|iPod/.test(agent)) {
    return 'iphone';
  }

  // Android is recognised either from the UA string or from the UA
  // Client Hints `mobile` flag.
  const androidByUa = /Android.*Mobile/.test(agent);
  const mobileByHint = () => navigator.userAgentData?.mobile === true;
  return androidByUa || mobileByHint() ? 'android' : null;
}

// Look up the static { heap, gpu } budget pair (both in MB) for a mobile
// family string. Anything other than 'ipad' or 'android' — including an
// unrecognised value — gets the iPhone numbers, the safest default.
function getMobileBudgetMB(family) {
  switch (family) {
    case 'ipad':
      return { heap: IPAD_HEAP_BUDGET_MB, gpu: IPAD_GPU_BUDGET_MB };
    case 'android':
      return { heap: ANDROID_HEAP_BUDGET_MB, gpu: ANDROID_GPU_BUDGET_MB };
    case 'iphone':
    default:
      return { heap: IPHONE_HEAP_BUDGET_MB, gpu: IPHONE_GPU_BUDGET_MB };
  }
}

// Default time probeHeapBudgetMB() waits for the probe worker before
// terminating it and reporting 'probe timeout'.
const PROBE_TIMEOUT_MS = 15_000;
// Default size, in MB, of each buffer probeGpuBudgetMB() allocates.
const GPU_PROBE_STEP_MB = 256;
// Default ceiling (8 GiB, expressed in MB) on the total the GPU probe
// will try to allocate.
const GPU_PROBE_MAX_MB = 8 * 1024;
// Default wall-clock limit for the GPU probe loop; the loop checks it
// before each allocation.
const GPU_PROBE_TIMEOUT_MS = 8_000;

// Working-set floor in the WASM heap. KV cache + compute buffers + JS
// heap headroom for a typical 1B model at n_ctx=2048 add up to a few
// hundred MB. Floor at 256 so an absurdly-tiny heap (or a probe failure
// that returned 0) doesn't pass variantFits. variantFits() rejects any
// budget whose heapBudgetMB is below this value.
const HEAP_WORKING_SET_FLOOR_MB = 256;

// Per-variant overhead added on top of the model file size when checking
// GPU fit. Covers compute buffers, alignment padding, and the KV cache
// mirror that the WebGPU backend keeps. A flat 200 MB is a conservative
// approximation; in practice it scales somewhat with model + context size.
// variantFits() requires sizeMB + this value to be ≤ gpuBudgetMB.
const GPU_VARIANT_OVERHEAD_MB = 200;

// True when detectMobileFamily() recognises the current browser as one
// of the mobile families, false otherwise.
export function isMobileDevice() {
  const family = detectMobileFamily();
  return family !== null;
}

// ──────────────── WASM heap probe ────────────────

// Run the WASM heap probe in a dedicated worker (./memory-probe.js) and
// report how much it committed.
//
// Options: `stepPages` and `maxPages` are forwarded to the worker
// untouched; `timeoutMs` bounds how long we wait for its reply.
//
// The returned promise always resolves, never rejects:
//   { probedMB }            — the worker replied; probedMB is the
//                             `committedMB` it sent (0 if absent).
//   { probedMB: 0, error }  — timeout, worker construction failure, or a
//                             worker error event (typically the probe
//                             itself ran the engine out of memory).
export function probeHeapBudgetMB({ stepPages, maxPages, timeoutMs = PROBE_TIMEOUT_MS } = {}) {
  return new Promise((resolve) => {
    let probeWorker;
    try {
      probeWorker = new Worker(new URL('./memory-probe.js', import.meta.url));
    } catch (err) {
      resolve({ probedMB: 0, error: `worker construct failed: ${err.message}` });
      return;
    }

    // Best-effort teardown: a throwing terminate() must not mask the
    // result we are about to report.
    const shutDown = () => {
      try {
        probeWorker.terminate();
      } catch {
        /* noop */
      }
    };

    const timeoutHandle = setTimeout(() => {
      shutDown();
      resolve({ probedMB: 0, error: 'probe timeout' });
    }, timeoutMs);

    probeWorker.onmessage = (event) => {
      clearTimeout(timeoutHandle);
      const payload = event.data || {};
      const probedMB = payload.committedMB === undefined ? 0 : payload.committedMB;
      shutDown();
      resolve({ probedMB });
    };

    probeWorker.onerror = (event) => {
      clearTimeout(timeoutHandle);
      shutDown();
      resolve({ probedMB: 0, error: event.message || 'worker error' });
    };

    probeWorker.postMessage({ stepPages, maxPages });
  });
}

// ──────────────── GPU memory probe ────────────────

// Allocate WebGPU buffers in stepMB increments until the device reports
// a failure, and return the total committed size as the GPU memory
// budget. Uses mappedAtCreation=true to force real memory commit (some
// drivers lazy-allocate until first use otherwise) and captures failures
// via nested 'validation' + 'out-of-memory' error scopes, with
// device.lost as a backstop.
//
// Options (all optional):
//   stepMB    — size of each probe buffer in MB. Must be a positive,
//               finite number. The effective step is clamped to the
//               device's maxBufferSize and rounded down to a multiple of
//               4 bytes, so a step larger than the per-buffer limit can
//               no longer produce invalid buffers that get counted.
//   maxMB     — stop once the next step would push the total past this.
//   timeoutMs — wall-clock limit, checked before each allocation.
//   yieldMs   — delay handed to setTimeout between allocations.
//
// Resolves to { probedMB } after a probe run, or { probedMB: 0, error }
// when WebGPU is unavailable, stepMB is unusable, or adapter/device
// initialisation fails (those failures are reported, not thrown).
//
// Caveats:
//  - The GPU process is shared with other tabs. If they're holding GPU
//    memory the probe undercounts. (Best we can do without a richer
//    browser API.)
//  - Some drivers (notably iOS Metal under WebKit) lazy-fail at dispatch
//    time rather than at createBuffer; this probe's number is therefore
//    an upper bound, not a guarantee. _computeBudget() skips this probe
//    on mobile and uses static budgets instead.
export async function probeGpuBudgetMB({
  stepMB = GPU_PROBE_STEP_MB,
  maxMB = GPU_PROBE_MAX_MB,
  timeoutMs = GPU_PROBE_TIMEOUT_MS,
  yieldMs = 0,
} = {}) {
  if (typeof navigator === 'undefined' || !navigator.gpu) {
    return { probedMB: 0, error: 'WebGPU not available' };
  }
  // A zero or negative step would never advance the total and would spin
  // on empty allocations until the timeout; reject it up front.
  if (!Number.isFinite(stepMB) || stepMB <= 0) {
    return { probedMB: 0, error: `invalid stepMB: ${stepMB}` };
  }

  let adapter;
  let device;
  try {
    adapter = await navigator.gpu.requestAdapter();
    if (!adapter) return { probedMB: 0, error: 'no WebGPU adapter' };
    // Request the maximum the adapter can give us; defaults are often
    // smaller than what the hardware supports.
    const requiredLimits = {};
    const cap = (k) => {
      const v = adapter.limits?.[k];
      if (typeof v === 'number') requiredLimits[k] = v;
    };
    cap('maxBufferSize');
    cap('maxStorageBufferBindingSize');
    device = await adapter.requestDevice({ requiredLimits });
  } catch (err) {
    return { probedMB: 0, error: `adapter/device init failed: ${err.message}` };
  }

  let deviceLost = false;
  device.lost.then(() => { deviceLost = true; }).catch(() => {});

  const BYTES_PER_MB = 1024 * 1024;
  const maxBytes = maxMB * BYTES_PER_MB;

  // Keep every probe buffer within the per-buffer limit. An oversized
  // createBuffer is a validation error, not an OOM, and would otherwise
  // hand back an invalid buffer that looks like a successful allocation.
  let stepBytes = Math.floor(stepMB * BYTES_PER_MB);
  const perBufferLimit = device.limits?.maxBufferSize;
  if (typeof perBufferLimit === 'number' && perBufferLimit > 0) {
    stepBytes = Math.min(stepBytes, perBufferLimit);
  }
  // mappedAtCreation requires the size to be a multiple of 4.
  stepBytes -= stepBytes % 4;
  if (stepBytes <= 0) {
    try { device.destroy(); } catch { /* noop */ }
    return { probedMB: 0, error: `invalid stepMB: ${stepMB}` };
  }

  const buffers = [];
  let totalBytes = 0;
  const start = performance.now();

  try {
    while (totalBytes + stepBytes <= maxBytes) {
      if (deviceLost) break;
      if (performance.now() - start > timeoutMs) break;

      // Scopes pop in LIFO order: out-of-memory first, then validation.
      device.pushErrorScope('validation');
      device.pushErrorScope('out-of-memory');
      let buffer;
      let threw = false;
      try {
        buffer = device.createBuffer({
          size: stepBytes,
          usage: GPUBufferUsage.STORAGE,
          mappedAtCreation: true,
        });
        // Touch the start of the mapped range to force a real commit.
        // Drivers can lazy-back the allocation until first write, which
        // would fool the probe into thinking it has more headroom than it
        // really does.
        const touchBytes = Math.min(stepBytes, 64 * 1024);
        new Uint8Array(buffer.getMappedRange(0, touchBytes))[0] = 1;
        buffer.unmap();
      } catch {
        threw = true;
      }

      // Always pop both scopes so the device's scope stack stays balanced,
      // even when the allocation threw synchronously.
      const oomError = await device.popErrorScope().catch(() => null);
      const validationError = await device.popErrorScope().catch(() => null);
      if (threw || oomError || validationError) {
        try { buffer?.destroy(); } catch { /* noop */ }
        break;
      }

      buffers.push(buffer);
      totalBytes += stepBytes;

      // Yield so we don't starve the main thread / GC. On mobile a
      // longer yield also gives the OS a chance to update its memory
      // accounting between steps so a fast burst doesn't look like a
      // spike to Jetsam.
      await new Promise((r) => setTimeout(r, yieldMs));
    }
  } finally {
    for (const b of buffers) {
      try { b.destroy(); } catch { /* noop */ }
    }
    try { device.destroy(); } catch { /* noop */ }
  }

  return { probedMB: Math.floor(totalBytes / BYTES_PER_MB) };
}

// ──────────────── public budget API ────────────────

// Memoised budget promise, kept for the lifetime of the page load. Both
// probes take 1–8 s, so every caller shares the first computation
// instead of paying for it again.
let _budgetPromise = null;

// Resolve to the device budget record produced by _computeBudget(),
// starting that computation on the first call only.
export async function getDeviceBudgetMB() {
  _budgetPromise ??= _computeBudget();
  return _budgetPromise;
}

// Compute the device budget record that getDeviceBudgetMB() caches.
//
// Mobile families (per detectMobileFamily()) get the static budgets from
// getMobileBudgetMB() and run no probes. Everything else runs the heap
// and GPU probes in parallel; a failed heap probe falls back to
// navigator.deviceMemory, then the storage quota, then
// DEFAULT_BUDGET_MB, while a failed GPU probe yields gpuBudgetMB = 0.
//
// Resolves to { budgetMB, gpuBudgetMB, heapBudgetMB, memGB, quotaMB,
// probedMB, gpuProbedMB, probeError, gpuProbeError, isMobile,
// mobileFamily, source, heapSource }. budgetMB mirrors gpuBudgetMB.
async function _computeBudget() {
  // The deviceMemory / quota fallbacks estimate the whole machine, not
  // the WASM heap. The file header notes that wasm32 linear memory cannot
  // exceed 4 GiB, so cap those estimates there and round down to whole
  // MB. (Assumes the wasm32 build that the header describes.)
  const WASM32_HEAP_CEILING_MB = 4096;
  const toHeapEstimateMB = (rawMB) => Math.floor(Math.min(rawMB, WASM32_HEAP_CEILING_MB));

  const memGB = typeof navigator.deviceMemory === 'number' ? navigator.deviceMemory : null;
  let quotaMB = null;
  try {
    const est = await navigator.storage?.estimate?.();
    if (est?.quota) quotaMB = est.quota / (1024 * 1024);
  } catch {
    // some browsers throw on storage.estimate in non-secure contexts
  }

  const mobileFamily = detectMobileFamily();
  const isMobile = mobileFamily !== null;

  // ── Mobile path: pure static budgets ──
  //
  // No probes on mobile. Both the heap probe and the GPU probe have been
  // shown to themselves trigger Jetsam:
  //   - Heap probe: commit 6f33b5d (terminated worker).
  //   - GPU probe (unbounded): commit 4f567a5.
  //   - GPU probe (bounded): the 1000 MB peak allocation pulse on
  //     iPhone 13 / mid-RAM iPad classes still pushed the WebContent
  //     process over the Jetsam threshold during or right after the
  //     probe — even though the probe itself completed cleanly,
  //     subsequent OPFS writes hit "unknown transient" errors and the
  //     next inference allocation tipped the tab over.
  //
  // We can't distinguish iPhone 13 (6 GB) from iPhone 17 Pro Max (12 GB)
  // via WebGPU adapter info or navigator.deviceMemory, so we err on the
  // side of the lower-RAM device. The budgets are tuned to admit the
  // 250–500 MB tier (gemma-3-270m, Qwen3-0.6B, LFM2.5-350M) and to
  // exclude variants that empirically caused crashes on the smallest
  // device in each family. We still surface adapter.limits.maxBufferSize
  // in the source string for diagnostics.
  if (isMobile) {
    const { heap: heapBudgetMB, gpu: gpuBudgetMB } = getMobileBudgetMB(mobileFamily);

    // Read adapter limits without allocating a device buffer — purely
    // informational for the device card / log line.
    let maxBufferSizeMB = 0;
    let adapterReadError = null;
    try {
      if (navigator.gpu) {
        const adapter = await navigator.gpu.requestAdapter();
        const lim = adapter?.limits?.maxBufferSize;
        if (typeof lim === 'number') {
          maxBufferSizeMB = Math.floor(lim / (1024 * 1024));
        }
      } else {
        adapterReadError = 'WebGPU not available';
      }
    } catch (err) {
      adapterReadError = err.message;
    }

    const adapterDetail = adapterReadError
      ? ` (adapter read failed: ${adapterReadError})`
      : maxBufferSizeMB > 0
        ? ` (maxBufferSize ${maxBufferSizeMB} MB)`
        : '';

    return {
      budgetMB: gpuBudgetMB,
      gpuBudgetMB,
      heapBudgetMB,
      memGB,
      quotaMB,
      probedMB: 0,
      gpuProbedMB: 0,
      probeError: 'skipped on mobile (probes themselves trigger Jetsam)',
      gpuProbeError: 'skipped on mobile (probes themselves trigger Jetsam)',
      isMobile: true,
      mobileFamily,
      source: `mobile static budget — ${mobileFamily} (GPU ${gpuBudgetMB} MB for OPFS-streamed weights)${adapterDetail}`,
      heapSource: `mobile static budget — ${mobileFamily} (WASM heap ${heapBudgetMB} MB for KV + compute scratch)`,
    };
  }

  // ── Desktop path: real probes ──
  const [heapProbe, gpuProbe] = await Promise.all([
    probeHeapBudgetMB(),
    probeGpuBudgetMB(),
  ]);

  // Heap: trust a successful probe as-is; otherwise fall back through
  // progressively weaker signals, each capped at the wasm32 ceiling.
  let heapBudgetMB;
  let heapSource;
  if (heapProbe.probedMB > 0) {
    heapBudgetMB = heapProbe.probedMB;
    heapSource = `probe (WASM heap, ${heapProbe.probedMB} MB committed)`;
  } else if (memGB !== null) {
    heapBudgetMB = toHeapEstimateMB(memGB * 1024 * 0.6);
    heapSource = 'navigator.deviceMemory (heap probe failed)';
  } else if (quotaMB !== null) {
    heapBudgetMB = toHeapEstimateMB(
      Math.min(quotaMB * HOSTED_QUOTA_FRACTION, HOSTED_QUOTA_CAP_MB),
    );
    heapSource = 'navigator.storage.estimate().quota (heap probe failed)';
  } else {
    heapBudgetMB = DEFAULT_BUDGET_MB;
    heapSource = 'default (heap probe failed)';
  }

  // GPU: there is no fallback. A failed probe means a zero budget, so
  // variantFits() rejects every variant.
  let gpuBudgetMB;
  let gpuSource;
  if (gpuProbe.probedMB > 0) {
    gpuBudgetMB = gpuProbe.probedMB;
    gpuSource = `probe (WebGPU buffers, ${gpuProbe.probedMB} MB allocated)`;
  } else {
    gpuBudgetMB = 0;
    gpuSource = `probe failed: ${gpuProbe.error || 'unknown'}`;
  }

  return {
    budgetMB: gpuBudgetMB,
    gpuBudgetMB,
    heapBudgetMB,
    memGB,
    quotaMB,
    probedMB: heapProbe.probedMB,
    gpuProbedMB: gpuProbe.probedMB,
    probeError: heapProbe.error || null,
    gpuProbeError: gpuProbe.error || null,
    isMobile: false,
    mobileFamily: null,
    source: gpuSource,
    heapSource,
  };
}

// variantFits decides whether a model file of `sizeMB` megabytes can be
// loaded and run on this device. Two checks must pass:
//
//   1. sizeMB + GPU_VARIANT_OVERHEAD_MB ≤ gpuBudgetMB
//        Model weights live in WebGPU buffers (since OPFS streaming
//        keeps them off the WASM heap). The overhead covers compute
//        scratch + alignment + KV cache mirror.
//
//   2. heapBudgetMB ≥ HEAP_WORKING_SET_FLOOR_MB
//        The WASM heap still has to fit the working set: KV cache,
//        ggml compute buffers, and JS heap headroom. Roughly constant
//        per inference regardless of model size at fixed n_ctx.
//
// Backwards-compat: if the second arg is a plain number, treat it as
// the legacy heap-only budget and apply the prior 1.5× sizeMB overhead.
// New callers should pass { gpuBudgetMB, heapBudgetMB }.
//
// Returns false for anything that cannot be evaluated: a size that is
// not a positive finite number, a missing or non-object budget, or a
// budget field that is not a number (NaN included).
export function variantFits(sizeMB, budget) {
  // Number.isFinite also rejects non-numbers and NaN. NaN matters here:
  // every comparison against it is false, so a "reject if too big" test
  // would silently let it through.
  if (!Number.isFinite(sizeMB) || sizeMB <= 0) return false;

  if (typeof budget === 'number') {
    return budget > 0 && sizeMB * 1.5 <= budget;
  }
  if (!budget || typeof budget !== 'object') return false;

  const { gpuBudgetMB, heapBudgetMB } = budget;

  // Both checks are phrased as "accept only if it fits" so a NaN budget
  // fails the comparison instead of passing it.
  const gpuFits =
    typeof gpuBudgetMB === 'number' && sizeMB + GPU_VARIANT_OVERHEAD_MB <= gpuBudgetMB;
  const heapFits =
    typeof heapBudgetMB === 'number' && heapBudgetMB >= HEAP_WORKING_SET_FLOOR_MB;

  return gpuFits && heapFits;
}

// Build the device description record: the cached budget from
// getDeviceBudgetMB() spread together with WebGPU adapter info, the raw
// UA / platform strings, and UA Client Hints values where the browser
// exposes them. Every lookup is best-effort; anything unavailable is
// reported as null (or an empty brand list).
export async function describeDevice() {
  const budget = await getDeviceBudgetMB();

  // Adapter info, or null when WebGPU is missing, no adapter is returned,
  // or the request throws.
  const readAdapterInfo = async () => {
    if (!navigator.gpu) return null;
    try {
      const adapter = await navigator.gpu.requestAdapter();
      return adapter ? adapter.info || { vendor: 'unknown' } : null;
    } catch {
      return null;
    }
  };
  const gpu = await readAdapterInfo();

  // UA Client Hints: high-entropy values give us the real architecture
  // and OS, neither of which `navigator.platform` reports correctly on
  // Apple Silicon Macs (it returns "MacIntel" forever for back-compat).
  // `fullVersionList` is the high-entropy counterpart of `brands` and
  // carries the full dotted version (e.g. "147.0.7390.107") rather than
  // just the major.
  const hints = {
    architecture: null,
    platform: null,
    platformVersion: null,
    fullVersionList: null,
  };
  try {
    const uaData = navigator.userAgentData;
    if (uaData?.getHighEntropyValues) {
      const values = await uaData.getHighEntropyValues([
        'architecture', 'platform', 'platformVersion', 'fullVersionList',
      ]);
      hints.architecture = values.architecture || null;
      hints.platform = values.platform || null;
      hints.platformVersion = values.platformVersion || null;
      hints.fullVersionList = values.fullVersionList || null;
    }
  } catch {
    /* not Chromium or denied */
  }

  // UA-CH brands give us clean { brand, version } pairs without parsing
  // the userAgent string. Prefer the full-version list over the
  // major-only default, and drop the "Not(A:Brand)" decoy entries.
  const isDecoyBrand = (entry) => /not[^\w]*a[^\w]*brand/i.test(entry.brand);
  const candidates = hints.fullVersionList || navigator.userAgentData?.brands || [];
  const uaBrands = candidates.filter((entry) => entry && !isDecoyBrand(entry));

  return {
    ...budget,
    webgpu: !!navigator.gpu,
    gpu,
    userAgent: navigator.userAgent,
    platform: navigator.platform ?? null,
    uaArch: hints.architecture,
    uaPlatform: hints.platform,
    uaPlatformVersion: hints.platformVersion,
    uaBrands,
  };
}