File size: 6,189 Bytes
f221926
 
 
 
 
149fe2b
f221926
 
149fe2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e72601b
 
f221926
e72601b
 
 
 
ee944ff
e72601b
 
6df9ed0
e72601b
 
 
 
 
 
 
149fe2b
 
f221926
149fe2b
 
 
 
 
f221926
149fe2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f221926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee944ff
6df9ed0
f221926
 
 
149fe2b
 
f221926
 
149fe2b
f221926
149fe2b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
// Thin adapter for runner.js (Playwright). Reads URL params, downloads the
// model into OPFS, hands it to bench-worker.js, and forwards the worker's
// progress/result onto window.__BENCH so the runner can poll. Inference
// orchestration lives in site/js/run/bench-worker.js — same worker the
// interactive Run page uses.

import { ggufSource, OPFS_ROOT_NAME } from './js/run/source.js';
import { CONSISTENCY_PROMPT } from './js/run/config.js';

// Global safety net: Emscripten's abort() may not throw, so uncaught errors
// are recorded on window.__BENCH here. Nothing is recorded before
// window.__BENCH exists or after the run has reached 'done', and an error
// message that is already set is kept.
window.addEventListener('error', (event) => {
  const bench = window.__BENCH;
  if (!bench || bench.status === 'done') return;

  bench.error = bench.error || event.message || 'Uncaught error';
  bench.status = 'error';
});
// Records promise rejections that nothing handled. Nothing is recorded before
// window.__BENCH exists or after the run has reached 'done', and an error
// message that is already set is kept.
window.addEventListener('unhandledrejection', (e) => {
  const bench = window.__BENCH;
  if (!bench || bench.status === 'done') return;

  // String(undefined) / String(null) are the truthy texts 'undefined' and
  // 'null', so a missing reason has to be tested explicitly or the generic
  // fallback message could never be used.
  const reasonText = e.reason == null ? '' : String(e.reason);
  bench.error = bench.error || reasonText || 'Unhandled rejection';
  bench.status = 'error';
});

(async function () {
  // Benchmark configuration is read from the page URL's query string.
  const params = new URLSearchParams(window.location.search);

  // Reads an integer query parameter. A missing, empty, or non-numeric value
  // yields `fallback`, so a malformed URL cannot leak NaN into the worker
  // parameters.
  const intParam = (name, fallback) => {
    const parsed = Number.parseInt(params.get(name) ?? '', 10);
    return Number.isNaN(parsed) ? fallback : parsed;
  };

  const modelFile           = params.get('model')  || '';
  const hfRepo              = params.get('hfRepo') || 'unsloth/Llama-3.2-1B-Instruct-GGUF';
  const consistencyPrompt   = CONSISTENCY_PROMPT;
  // Note: the URL parameter for the consistency token budget is `nPredict`.
  const consistencyNPredict = intParam('nPredict', 128);
  const nPrompt             = intParam('nPrompt', 512);
  const nGen                = intParam('nGen', 128);
  const nReps               = intParam('nReps', 5);
  const nDepth              = intParam('nDepth', 0);
  const nCtx                = intParam('nCtx', 2048);
  const nGpuLayers          = intParam('nGpuLayers', 999);
  const noWarmup            = params.get('noWarmup') === '1';
  const refTokenIds         = params.get('refTokenIds') || null;
  // mode=perf → skip consistency entirely (e.g. for the GPU perf-only pass).
  // mode=consistency → skip perf (e.g. CPU baseline pass that just needs token_ids).
  // default 'both' runs both phases in one model load. Any other value also
  // runs both phases, because only 'perf' and 'consistency' are tested for.
  const mode                = params.get('mode') || 'both';
  const runConsistency      = mode !== 'perf';
  const runPerf             = mode !== 'consistency';

  // Build flavour is chosen by feature-detecting WebAssembly.Suspending.
  const hasJspi = 'Suspending' in WebAssembly;
  const buildType = hasJspi ? 'jspi' : 'asyncify';

  // Initial snapshot published on window.__BENCH. The fields below are
  // updated in place as the run progresses.
  const initialBenchState = {
    status: 'init',
    error: null,
    modelFile,
    buildType,
    webgpuAvailable: Boolean(navigator.gpu),
    gpuAdapterInfo: null,
    downloadProgress: 0,
    metrics: null,
    output: '',
  };
  window.__BENCH = initialBenchState;

  // Page elements used for human-readable feedback. Each lookup yields null
  // when the element is missing; the helpers that use them check for that.
  const [statusEl, progressEl, logEl] = ['status', 'progress', 'log']
    .map((id) => document.getElementById(id));

  // Stores `status` on window.__BENCH and, when the status element exists,
  // shows `msg` (or the status itself if `msg` is empty) with a CSS class of
  // 'err' for 'error', 'ok' for 'done', and no class otherwise.
  function onStatus(status, msg) {
    window.__BENCH.status = status;
    if (!statusEl) return;

    let cssClass = '';
    if (status === 'error') {
      cssClass = 'err';
    } else if (status === 'done') {
      cssClass = 'ok';
    }

    statusEl.textContent = msg || status;
    statusEl.className = cssClass;
  }

  // Prefixes `msg` with the current UTC time of day (HH:MM:SS.mmm), writes it
  // to the console, and appends it as a new line to the log element if present.
  function onLog(msg) {
    const stamp = new Date().toISOString().substring(11, 23);
    const entry = `[${stamp}] ${msg}`;
    console.log(entry);
    if (!logEl) return;
    logEl.textContent += `${entry}\n`;
  }

  // Stores the download fraction on window.__BENCH and, when the progress
  // element exists and `total` is positive, renders the downloaded and total
  // byte counts in MB together with the percentage.
  function onProgress(fraction, downloaded, total) {
    window.__BENCH.downloadProgress = fraction;
    if (!progressEl || !(total > 0)) return;

    const BYTES_PER_MB = 1024 * 1024;
    const toMb = (bytes) => (bytes / BYTES_PER_MB).toFixed(1);
    const percent = (fraction * 100).toFixed(1);

    progressEl.textContent = `Downloaded: ${toMb(downloaded)} MB / ${toMb(total)} MB (${percent}%)`;
  }

  // Stage 1: download into OPFS on the main thread (sync access handles
  // are worker-only, but the downloading half runs fine here).
  let size;
  try {
    onStatus('downloading', `Downloading ${modelFile}...`);
    onLog(`Fetching ${hfRepo}/${modelFile} into OPFS`);
    const r = await ggufSource().opfsHandleForModel(hfRepo, modelFile, onProgress);
    size = r.size;
  } catch (err) {
    window.__BENCH.error = `opfsHandleForModel failed: ${err.message}`;
    window.__BENCH.status = 'error';
    onStatus('error', window.__BENCH.error);
    onLog(`ERROR: ${window.__BENCH.error}`);
    return;
  }

  // Stage 2: hand the OPFS layout key to the worker. The worker re-resolves
  // the FileHandle locally (FileHandles don't structured-clone reliably on
  // iOS Safari) and opens a sync access handle inside its own thread.
  //
  // The promise below never rejects: every failure path resolves to a
  // { status: 'error', error } record, so `result` is always an object that
  // carries a status.
  const result = await new Promise((resolve) => {
    let worker;
    try {
      worker = new Worker(new URL('./js/run/bench-worker.js', import.meta.url));
    } catch (err) {
      resolve({ status: 'error', error: `worker construct failed: ${err.message}` });
      return;
    }

    // Resolve at most once, and stop the worker as soon as an outcome exists.
    let settled = false;
    const finish = (record) => {
      if (settled) return;
      settled = true;
      try { worker.terminate(); } catch { /* noop */ }
      resolve(record);
    };

    worker.onmessage = (e) => {
      const msg = e.data || {};
      switch (msg.type) {
        case 'status':
          onStatus(msg.status, msg.msg);
          break;
        case 'progress':
          onProgress(msg.fraction, msg.downloaded, msg.total);
          break;
        case 'log':
          onLog(msg.line);
          break;
        case 'result':
          // A result without a record would resolve `undefined`, leaving
          // window.__BENCH without a terminal status; report it as an error.
          finish(
            msg.record && typeof msg.record === 'object'
              ? msg.record
              : { status: 'error', error: 'worker returned no result record' },
          );
          break;
        default:
          break;
      }
    };
    worker.onerror = (err) => {
      finish({ status: 'error', error: err?.message || 'worker error' });
    };
    worker.onmessageerror = () => {
      finish({ status: 'error', error: 'worker message deserialization failed' });
    };

    // Phases that are switched off are expressed as an empty prompt or zero
    // counts rather than a separate flag.
    const runRequest = {
      type: 'run',
      params: {
        buildType,
        nCtx,
        nGpuLayers,
        consistencyPrompt: runConsistency ? consistencyPrompt : '',
        consistencyNPredict,
        refTokenIds,
        nPrompt: runPerf ? nPrompt : 0,
        nGen:    runPerf ? nGen    : 0,
        nReps,
        nDepth:  runPerf ? nDepth  : 0,
        noWarmup,
      },
      opfsPath: { rootDir: OPFS_ROOT_NAME, repo: hfRepo, filename: modelFile },
    };

    try {
      worker.postMessage(runRequest);
    } catch (err) {
      // A synchronous postMessage failure would otherwise reject this promise
      // and leave the worker running.
      finish({
        status: 'error',
        error: `worker postMessage failed: ${err?.message ?? String(err)}`,
      });
    }
  });

  // Fold the worker's record into window.__BENCH in one step, then attach the
  // size reported by the stage-1 download as _opfsSize. Fields the record does
  // not contain (such as downloadProgress from stage 1) keep their values.
  Object.assign(window.__BENCH, result, { _opfsSize: size });
})();