| <!DOCTYPE html> |
| <html lang="en"> |
| <head> |
| <meta charset="UTF-8"> |
| <title>Gemma 4 26B A4B — Browser WebGPU via wllama</title> |
| <style> |
| body { font-family: monospace; background: #0d1117; color: #c9d1d9; padding: 24px; max-width: 900px; margin: 0 auto; } |
| h1 { color: #58a6ff; font-size: 20px; } |
| .card { background: #161b22; border: 1px solid #30363d; border-radius: 8px; padding: 16px; margin: 12px 0; } |
| .label { color: #8b949e; font-size: 12px; text-transform: uppercase; letter-spacing: 1px; } |
| .value { color: #c9d1d9; font-size: 14px; margin-top: 4px; } |
| .green { color: #3fb950; } .red { color: #f85149; } .amber { color: #d29922; } |
| #log { font-size: 12px; background: #010409; border: 1px solid #30363d; border-radius: 6px; padding: 10px; max-height: 400px; overflow-y: auto; white-space: pre-wrap; } |
| button { background: #238636; color: white; border: none; border-radius: 6px; padding: 8px 16px; cursor: pointer; font-weight: bold; margin: 4px; } |
| button:disabled { opacity: 0.5; cursor: wait; } |
| input { background: #161b22; border: 1px solid #30363d; color: #c9d1d9; border-radius: 6px; padding: 8px 12px; width: 60%; } |
| #output { background: #161b22; border: 1px solid #30363d; border-radius: 6px; padding: 12px; min-height: 60px; white-space: pre-wrap; font-size: 14px; margin-top: 8px; } |
| </style> |
| </head> |
| <body> |
| <h1>Gemma 4 26B A4B — Browser WebGPU</h1> |
| <p>Gemma-4-26B-A4B-it (MoE, 3.8B active) running in browser via wllama + WebGPU. GGUF loaded from local server.</p> |
|
|
| <div class="card"> |
| <div class="label">Status</div> |
| <div class="value" id="status"><span class="amber">*</span> not initialized</div> |
| </div> |
|
|
| <div class="card"> |
| <button id="btn-load" onclick="doLoad()">1. Load Model (WebGPU)</button> |
| <button id="btn-gen" onclick="doGenerate()" disabled>2. Generate</button> |
| </div> |
|
|
| <div class="card"> |
| <div class="label">Prompt</div> |
| <input id="prompt" value="Hello, I am a helpful assistant and" /> |
| </div> |
|
|
| <div class="card"> |
| <div class="label">Output</div> |
| <div id="output"></div> |
| </div> |
|
|
| <div class="card"> |
| <div class="label">Log</div> |
| <div id="log"></div> |
| </div> |
|
|
| <script type="module"> |
| import { Wllama } from './node_modules/@wllama/wllama/esm/index.js'; |
| |
// Page elements that the load/generate handlers write to, looked up once by id.
const log = document.querySelector('#log');
const status = document.querySelector('#status');
const output = document.querySelector('#output');

// Shared wllama instance; stays null until doLoad() constructs one.
let wllama = null;
| |
/**
 * Appends one timestamped line to the on-page log panel and scrolls the
 * panel to its bottom.
 *
 * @param {*} msg - Message to record; converted to a string by template
 *   interpolation.
 */
function l(msg) {
  // Characters 11..18 of the ISO-8601 string are HH:MM:SS, so the clock is UTC.
  const ts = new Date().toISOString().slice(11, 19);
  // append() adds a new text node instead of reading back and rewriting the
  // whole log text (as `textContent +=` does), so the cost of a call does not
  // grow with the amount already logged.
  log.append(`[${ts}] ${msg}\n`);
  log.scrollTop = log.scrollHeight;
}
| |
/**
 * Click handler for the "1. Load Model (WebGPU)" button: constructs the
 * wllama instance and loads the split GGUF model from this page's origin.
 *
 * Assigned to `window` because the button uses an inline `onclick`
 * attribute, which cannot see bindings scoped to this module.
 *
 * Side effects: assigns the module-level `wllama`, writes to the log panel
 * and the status line, and enables the Generate button on success. On
 * failure the error is logged and shown in the status line, and the Load
 * button is re-enabled so the user can retry; the error is not rethrown.
 *
 * @returns {Promise<void>}
 */
window.doLoad = async function() {
  const loadButton = document.getElementById('btn-load');
  try {
    loadButton.disabled = true;
    l('Initializing wllama...');
    status.innerHTML = '<span class="amber">*</span> initializing...';

    const CONFIG_PATHS = {
      default: './node_modules/@wllama/wllama/esm/wasm/wllama.wasm',
    };

    wllama = new Wllama(CONFIG_PATHS, {
      parallelDownloads: 5,
      // Mirror library output to the console; everything except debug also
      // goes to the on-page log.
      logger: {
        debug: (msg) => console.log('[wllama]', msg),
        log: (msg) => { console.log('[wllama]', msg); l(msg); },
        warn: (msg) => { console.warn('[wllama]', msg); l('WARN: ' + msg); },
        error: (msg) => { console.error('[wllama]', msg); l('ERROR: ' + msg); },
      },
    });

    l('Loading Gemma 26B A4B (Q5_K_XL, ~20GB in 512MB splits)...');
    l('This will take several minutes on first load.');
    status.innerHTML = '<span class="amber">*</span> loading model...';

    // Only the first split is named here. NOTE(review): presumably wllama
    // derives the remaining split URLs from the "-00001-of-00062" suffix —
    // confirm against the wllama docs.
    const firstSplit = window.location.origin + '/model/gemma-26b-00001-of-00062.gguf';

    // Last percentage already reported. The callback can fire many times per
    // percent, so without this the log fills with duplicate lines.
    let lastPct = -1;

    await wllama.loadModelFromUrl(firstSplit, {
      n_gpu_layers: 99,
      n_ctx: 512,
      n_batch: 64,
      useCache: false,
      progressCallback: ({ loaded, total }) => {
        // Guard against an unknown/zero total, which would otherwise show NaN%.
        const pct = total > 0 ? Math.round((loaded / total) * 100) : 0;
        if (pct === lastPct) return;
        lastPct = pct;
        if (pct % 5 === 0) {
          l(`Downloading... ${pct}% (${(loaded/1024/1024/1024).toFixed(1)}/${(total/1024/1024/1024).toFixed(1)} GB)`);
        }
        status.innerHTML = `<span class="amber">*</span> downloading ${pct}%...`;
      },
    });

    l('Model loaded!');
    status.innerHTML = '<span class="green">*</span> model ready';
    document.getElementById('btn-gen').disabled = false;
  } catch (e) {
    // Thrown values are not guaranteed to be Error objects.
    const message = e?.message ?? String(e);
    l('ERROR: ' + message);
    console.error(e);
    // The message is appended as a text node rather than concatenated into
    // innerHTML, so markup inside an error message is never parsed as HTML.
    status.innerHTML = '<span class="red">*</span> ';
    status.append(message);
    loadButton.disabled = false;
  }
};
| |
/**
 * Click handler for the "2. Generate" button: sends the prompt field's text
 * to the loaded model as a single user chat message and shows the reply.
 *
 * Assigned to `window` because the button uses an inline `onclick`
 * attribute, which cannot see bindings scoped to this module.
 *
 * Side effects: writes the reply (or, when the reply has no content, the
 * model's reasoning text) to the output panel, logs a truncated copy with
 * throughput and elapsed time, and updates the status line. Errors are
 * logged and reflected in the status line; they are not rethrown.
 *
 * @returns {Promise<void>}
 */
window.doGenerate = async function() {
  // The function is reachable from the console even while the button is
  // disabled, so refuse clearly instead of failing on a null dereference
  // (and leave the button state alone).
  if (!wllama) {
    l('ERROR: model is not loaded yet');
    return;
  }

  const genButton = document.getElementById('btn-gen');
  genButton.disabled = true;
  try {
    const prompt = document.getElementById('prompt').value;
    output.textContent = '';
    l('Generating: "' + prompt + '"');
    status.innerHTML = '<span class="amber">*</span> generating...';

    const t0 = performance.now();
    // NOTE(review): doLoad requests n_ctx: 512, so the prompt plus up to 500
    // generated tokens may not fit in the context — confirm how wllama
    // behaves when the context fills.
    const result = await wllama.createChatCompletion({
      messages: [{ role: 'user', content: prompt }],
      max_tokens: 500,
      temperature: 0.7,
      top_k: 40,
      top_p: 0.9,
    });

    const elapsed = ((performance.now() - t0) / 1000).toFixed(1);
    console.log('[gemma] raw result:', JSON.stringify(result, null, 2));
    const msg = result?.choices?.[0]?.message;
    const text = msg?.content ?? '';
    const thinking = msg?.reasoning_content ?? '';
    const tps = result?.timings?.predicted_per_second?.toFixed(1) ?? '?';
    if (thinking && !text) {
      // Only reasoning came back: show it rather than an empty panel.
      output.textContent = thinking;
      l(`[thinking only, ${tps} tok/s] ` + thinking.slice(0, 200));
    } else {
      output.textContent = text || '(empty)';
      if (thinking) l('[thinking] ' + thinking.slice(0, 100));
      l(`[${tps} tok/s] ` + (text || '(empty)').slice(0, 200));
    }
    l(`Done in ${elapsed}s`);
    status.innerHTML = `<span class="green">*</span> done (${elapsed}s)`;
  } catch (e) {
    // Thrown values are not guaranteed to be Error objects.
    l('ERROR: ' + (e?.message ?? String(e)));
    console.error(e);
    status.innerHTML = '<span class="red">*</span> error';
  } finally {
    // Always hand the button back, whichever path was taken above.
    genButton.disabled = false;
  }
};
| </script> |
| </body> |
| </html> |
|
|