File size: 6,053 Bytes
6a41fd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Gemma 4 26B A4B — Browser WebGPU via wllama</title>
<style>
body { font-family: monospace; background: #0d1117; color: #c9d1d9; padding: 24px; max-width: 900px; margin: 0 auto; }
h1 { color: #58a6ff; font-size: 20px; }
.card { background: #161b22; border: 1px solid #30363d; border-radius: 8px; padding: 16px; margin: 12px 0; }
.label { color: #8b949e; font-size: 12px; text-transform: uppercase; letter-spacing: 1px; }
.value { color: #c9d1d9; font-size: 14px; margin-top: 4px; }
.green { color: #3fb950; } .red { color: #f85149; } .amber { color: #d29922; }
#log { font-size: 12px; background: #010409; border: 1px solid #30363d; border-radius: 6px; padding: 10px; max-height: 400px; overflow-y: auto; white-space: pre-wrap; }
button { background: #238636; color: white; border: none; border-radius: 6px; padding: 8px 16px; cursor: pointer; font-weight: bold; margin: 4px; }
button:disabled { opacity: 0.5; cursor: wait; }
input { background: #161b22; border: 1px solid #30363d; color: #c9d1d9; border-radius: 6px; padding: 8px 12px; width: 60%; }
#output { background: #161b22; border: 1px solid #30363d; border-radius: 6px; padding: 12px; min-height: 60px; white-space: pre-wrap; font-size: 14px; margin-top: 8px; }
</style>
</head>
<body>
<h1>Gemma 4 26B A4B — Browser WebGPU</h1>
<p>Gemma-4-26B-A4B-it (MoE, 3.8B active) running in browser via wllama + WebGPU. GGUF loaded from local server.</p>

<div class="card">
  <div class="label">Status</div>
  <div class="value" id="status"><span class="amber">*</span> not initialized</div>
</div>

<div class="card">
  <button id="btn-load" onclick="doLoad()">1. Load Model (WebGPU)</button>
  <button id="btn-gen" onclick="doGenerate()" disabled>2. Generate</button>
</div>

<div class="card">
  <div class="label">Prompt</div>
  <input id="prompt" value="Hello, I am a helpful assistant and" />
</div>

<div class="card">
  <div class="label">Output</div>
  <div id="output"></div>
</div>

<div class="card">
  <div class="label">Log</div>
  <div id="log"></div>
</div>

<script type="module">
import { Wllama } from './node_modules/@wllama/wllama/esm/index.js';

// Handles to the three page elements the script writes to: the log panel,
// the status line and the generation output box (looked up once by id).
const [log, status, output] = ['log', 'status', 'output'].map(
  (elementId) => document.getElementById(elementId),
);

// The Wllama instance; remains null until doLoad() constructs it.
let wllama = null;

/**
 * Append one timestamped line to the on-page log panel and scroll the panel
 * so the newest entry is visible.
 *
 * @param {string} msg - Text to record.
 */
function l(msg) {
  // Characters 11..18 of the ISO-8601 string are the HH:MM:SS part (UTC).
  const clock = new Date().toISOString().substring(11, 19);
  const stamp = `[${clock}]`;
  const entry = `${stamp} ${msg}\n`;
  log.textContent = log.textContent + entry;
  // Pin the view to the bottom of the panel.
  log.scrollTop = log.scrollHeight;
}

/**
 * Click handler for the "Load Model" button (exposed on `window` because the
 * markup uses an inline `onclick`).
 *
 * Creates the Wllama instance, loads the split GGUF model from this page's
 * origin, and reports progress in the log panel and status line. On success
 * the "Generate" button is enabled; on failure the error is shown and the
 * "Load Model" button is re-enabled so the user can retry.
 *
 * @returns {Promise<void>} Resolves once loading has succeeded or failed;
 *   errors are reported in the UI rather than rethrown.
 */
window.doLoad = async function() {
  const loadButton = document.getElementById('btn-load');
  try {
    loadButton.disabled = true;
    l('Initializing wllama...');
    status.innerHTML = '<span class="amber">*</span> initializing...';

    const CONFIG_PATHS = {
      default: './node_modules/@wllama/wllama/esm/wasm/wllama.wasm',
    };

    wllama = new Wllama(CONFIG_PATHS, {
      parallelDownloads: 5,
      // Mirror library messages to the console; everything except debug
      // output also goes to the on-page log.
      logger: {
        debug: (msg) => console.log('[wllama]', msg),
        log: (msg) => { console.log('[wllama]', msg); l(msg); },
        warn: (msg) => { console.warn('[wllama]', msg); l('WARN: ' + msg); },
        error: (msg) => { console.error('[wllama]', msg); l('ERROR: ' + msg); },
      },
    });

    l('Loading Gemma 26B A4B (Q5_K_XL, ~20GB in 512MB splits)...');
    l('This will take several minutes on first load.');
    status.innerHTML = '<span class="amber">*</span> loading model...';

    // Load from local server (split GGUF files)
    // wllama auto-detects split pattern from the first file name
    const firstSplit = window.location.origin + '/model/gemma-26b-00001-of-00062.gguf';

    // Index of the last 5% step written to the log. Progress events fire far
    // more often than once per percent, so without this every event whose
    // rounded percentage is a multiple of 5 would add a duplicate log line.
    let lastLoggedStep = -1;

    await wllama.loadModelFromUrl(firstSplit, {
      n_gpu_layers: 99, // GPU — patched GLU shader fixes aliasing
      n_ctx: 512, // minimal context to reduce CPU memory
      n_batch: 64,
      useCache: false, // don't cache 20GB in browser storage
      progressCallback: ({ loaded, total }) => {
        // Guard against a zero total, which would otherwise render as "NaN%".
        const pct = total > 0 ? Math.round((loaded / total) * 100) : 0;
        const step = Math.floor(pct / 5);
        if (step > lastLoggedStep) {
          lastLoggedStep = step;
          l(`Downloading... ${pct}% (${(loaded/1024/1024/1024).toFixed(1)}/${(total/1024/1024/1024).toFixed(1)} GB)`);
        }
        status.innerHTML = `<span class="amber">*</span> downloading ${pct}%...`;
      },
    });

    l('Model loaded!');
    status.innerHTML = '<span class="green">*</span> model ready';
    document.getElementById('btn-gen').disabled = false;
  } catch (e) {
    // Rejections are not guaranteed to be Error objects; fall back to String().
    const message = e instanceof Error ? e.message : String(e);
    l('ERROR: ' + message);
    console.error(e);
    status.innerHTML = '<span class="red">*</span> ';
    // Append as a text node so any markup inside the message is not parsed.
    status.append(message);
    loadButton.disabled = false;
  }
};

/**
 * Click handler for the "Generate" button (exposed on `window` because the
 * markup uses an inline `onclick`).
 *
 * Sends the prompt field's text as a single user message to
 * `wllama.createChatCompletion`, then shows the reply (or, when the reply is
 * empty, the reasoning text) in the output box and logs throughput and
 * elapsed time. The button is disabled for the duration of the request.
 *
 * @returns {Promise<void>} Resolves once generation has succeeded or failed;
 *   errors are reported in the UI rather than rethrown.
 */
window.doGenerate = async function() {
  const genButton = document.getElementById('btn-gen');
  const prompt = document.getElementById('prompt').value;
  genButton.disabled = true;
  output.textContent = '';
  l('Generating: "' + prompt + '"');
  status.innerHTML = '<span class="amber">*</span> generating...';

  const t0 = performance.now();
  try {
    // The button starts disabled, but this function is also reachable as a
    // global; fail with a clear message instead of a null dereference.
    if (!wllama) {
      throw new Error('Model is not loaded yet');
    }

    const result = await wllama.createChatCompletion({
      messages: [{ role: 'user', content: prompt }],
      max_tokens: 500,
      temperature: 0.7,
      top_k: 40,
      top_p: 0.9,
    });

    const elapsed = ((performance.now() - t0) / 1000).toFixed(1);
    console.log('[gemma] raw result:', JSON.stringify(result, null, 2));
    const msg = result?.choices?.[0]?.message;
    const text = msg?.content || '';
    const thinking = msg?.reasoning_content || '';
    // Show '?' when the result carries no throughput figure.
    const tps = result?.timings?.predicted_per_second?.toFixed(1) ?? '?';
    if (thinking && !text) {
      // Only reasoning came back: show it in place of the answer.
      output.textContent = thinking;
      l(`[thinking only, ${tps} tok/s] ` + thinking.slice(0, 200));
    } else {
      output.textContent = text || '(empty)';
      if (thinking) l('[thinking] ' + thinking.slice(0, 100));
      l(`[${tps} tok/s] ` + (text || '(empty)').slice(0, 200));
    }
    l(`Done in ${elapsed}s`);
    status.innerHTML = `<span class="green">*</span> done (${elapsed}s)`;
  } catch (e) {
    // Rejections are not guaranteed to be Error objects; fall back to String().
    const message = e instanceof Error ? e.message : String(e);
    l('ERROR: ' + message);
    console.error(e);
    status.innerHTML = '<span class="red">*</span> error';
  } finally {
    // Always give the button back, even if the error reporting above throws.
    genButton.disabled = false;
  }
};
</script>
</body>
</html>