Upload index.html with huggingface_hub

6a41fd5 verified about 2 months ago

6.05 kB

	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<!-- Without a viewport declaration mobile browsers lay the page out at desktop width and shrink it. -->
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<!-- Model name kept in step with the <h1> and description in the body ("Gemma 4 26B A4B"). -->
	<title>Gemma 4 26B A4B — Browser WebGPU via wllama</title>
	<style>
	/* Page shell: dark theme, monospace, centred column. */
	body { font-family: monospace; background: #0d1117; color: #c9d1d9; padding: 24px; max-width: 900px; margin: 0 auto; }
	h1 { color: #58a6ff; font-size: 20px; }
	/* Each UI section (status, actions, prompt, output, log) sits in a .card with a small .label caption. */
	.card { background: #161b22; border: 1px solid #30363d; border-radius: 8px; padding: 16px; margin: 12px 0; }
	.label { color: #8b949e; font-size: 12px; text-transform: uppercase; letter-spacing: 1px; }
	.value { color: #c9d1d9; font-size: 14px; margin-top: 4px; }
	/* Colour classes used by the script for the status marker. */
	.green { color: #3fb950; } .red { color: #f85149; } .amber { color: #d29922; }
	/* Scrollable log pane; pre-wrap preserves the newline the script appends after every entry. */
	#log { font-size: 12px; background: #010409; border: 1px solid #30363d; border-radius: 6px; padding: 10px; max-height: 400px; overflow-y: auto; white-space: pre-wrap; }
	button { background: #238636; color: white; border: none; border-radius: 6px; padding: 8px 16px; cursor: pointer; font-weight: bold; margin: 4px; }
	button:disabled { opacity: 0.5; cursor: wait; }
	input { background: #161b22; border: 1px solid #30363d; color: #c9d1d9; border-radius: 6px; padding: 8px 12px; width: 60%; }
	#output { background: #161b22; border: 1px solid #30363d; border-radius: 6px; padding: 12px; min-height: 60px; white-space: pre-wrap; font-size: 14px; margin-top: 8px; }
	</style>
	</head>
	<body>
	<!-- Primary page content; the ids below (status, btn-load, btn-gen, prompt, output, log) are looked up by the module script. -->
	<main>
	<h1>Gemma 4 26B A4B — Browser WebGPU</h1>
	<p>Gemma-4-26B-A4B-it (MoE, 3.8B active) running in browser via wllama + WebGPU. GGUF loaded from local server.</p>

	<!-- Status line: the script rewrites the contents of #status as loading and generation progress. -->
	<div class="card">
	<div class="label">Status</div>
	<div class="value" id="status"><span class="amber">*</span> not initialized</div>
	</div>

	<!-- Actions: the handlers are the functions the module script attaches to window. Generate stays disabled until the model has loaded. -->
	<div class="card">
	<button type="button" id="btn-load" onclick="doLoad()">1. Load Model (WebGPU)</button>
	<button type="button" id="btn-gen" onclick="doGenerate()" disabled>2. Generate</button>
	</div>

	<!-- Prompt: a real <label> gives the text field an accessible name; the wrapping div keeps the original caption styling and layout. -->
	<div class="card">
	<div class="label"><label for="prompt">Prompt</label></div>
	<input id="prompt" value="Hello, I am a helpful assistant and">
	</div>

	<!-- Output: filled once per generation, so a polite live region lets assistive technology announce the reply. -->
	<div class="card">
	<div class="label">Output</div>
	<div id="output" aria-live="polite"></div>
	</div>

	<!-- Log: timestamped lines appended by the script's l() helper. -->
	<div class="card">
	<div class="label">Log</div>
	<div id="log"></div>
	</div>
	</main>

	<script type="module">
	import { Wllama } from './node_modules/@wllama/wllama/esm/index.js';

	// Page elements this script writes to, looked up once at module load:
	// the log pane, the status line and the generation output area.
	const log = document.getElementById('log');
	const status = document.getElementById('status');
	const output = document.getElementById('output');
	// Wllama instance shared by doLoad() and doGenerate(); stays null until doLoad() constructs it.
	let wllama = null;

	/**
	 * Appends one timestamped entry to the on-page log pane and scrolls the
	 * pane so the newest entry is visible.
	 *
	 * @param {*} msg Value to log; it is interpolated into the entry as a string.
	 */
	function l(msg) {
	  // Characters 11..18 of an ISO-8601 timestamp are the HH:MM:SS part (UTC).
	  const stamp = new Date().toISOString().substring(11, 19);
	  const entry = `[${stamp}] ${msg}\n`;
	  log.textContent += entry;
	  log.scrollTop = log.scrollHeight;
	}

	/**
	 * Click handler for "Load Model": constructs the Wllama instance, loads the
	 * split GGUF model from this page's origin, and reports progress in the
	 * status line and the log pane.
	 *
	 * On success the Generate button is enabled. On failure the error is logged
	 * and shown in the status line, and the Load button is re-enabled so the
	 * user can retry. The function never rejects; all errors are handled here.
	 *
	 * @returns {Promise<void>}
	 */
	window.doLoad = async function() {
	  const loadButton = document.getElementById('btn-load');
	  try {
	    loadButton.disabled = true;
	    l('Initializing wllama...');
	    status.innerHTML = '<span class="amber">*</span> initializing...';

	    const CONFIG_PATHS = {
	      default: './node_modules/@wllama/wllama/esm/wasm/wllama.wasm',
	    };

	    wllama = new Wllama(CONFIG_PATHS, {
	      parallelDownloads: 5,
	      // Mirror library messages to the console; everything except debug also goes to the on-page log.
	      logger: {
	        debug: (msg) => console.log('[wllama]', msg),
	        log: (msg) => { console.log('[wllama]', msg); l(msg); },
	        warn: (msg) => { console.warn('[wllama]', msg); l('WARN: ' + msg); },
	        error: (msg) => { console.error('[wllama]', msg); l('ERROR: ' + msg); },
	      },
	    });

	    l('Loading Gemma 26B A4B (Q5_K_XL, ~20GB in 512MB splits)...');
	    l('This will take several minutes on first load.');
	    status.innerHTML = '<span class="amber">*</span> loading model...';

	    // Load from local server (split GGUF files)
	    // wllama auto-detects split pattern from the first file name
	    const firstSplit = window.location.origin + '/model/gemma-26b-00001-of-00062.gguf';

	    // Last percentage written to the log. The callback fires many times per
	    // percentage point, so without this each 5% step would be logged repeatedly.
	    let lastLoggedPct = -1;

	    await wllama.loadModelFromUrl(firstSplit, {
	      n_gpu_layers: 99, // GPU — patched GLU shader fixes aliasing
	      n_ctx: 512, // minimal context to reduce CPU memory
	      n_batch: 64,
	      useCache: false, // don't cache 20GB in browser storage
	      progressCallback: ({ loaded, total }) => {
	        // A zero or missing total would otherwise render as NaN% / Infinity%.
	        const pct = total > 0 ? Math.round((loaded / total) * 100) : 0;
	        if (pct % 5 === 0 && pct !== lastLoggedPct) {
	          lastLoggedPct = pct;
	          l(`Downloading... ${pct}% (${(loaded/1024/1024/1024).toFixed(1)}/${(total/1024/1024/1024).toFixed(1)} GB)`);
	        }
	        status.innerHTML = `<span class="amber">*</span> downloading ${pct}%...`;
	      },
	    });

	    l('Model loaded!');
	    status.innerHTML = '<span class="green">*</span> model ready';
	    document.getElementById('btn-gen').disabled = false;
	  } catch (e) {
	    // Rejections are not guaranteed to be Error objects.
	    const message = e?.message ?? String(e);
	    l('ERROR: ' + message);
	    console.error(e);
	    // Append the message as a text node so markup characters in it are not parsed as HTML.
	    status.innerHTML = '<span class="red">*</span> ';
	    status.append(message);
	    loadButton.disabled = false;
	  }
	};

	/**
	 * Click handler for "Generate": sends the prompt field's text to the loaded
	 * model as a single user message and renders the reply.
	 *
	 * The reply text goes to the output area. When the result carries only
	 * `reasoning_content` and no `content`, the reasoning text is shown instead.
	 * Throughput is read from `result.timings.predicted_per_second` when that
	 * field is present, otherwise it is shown as "?". Errors are logged and
	 * flagged in the status line; the Generate button is always re-enabled.
	 *
	 * Expects doLoad() to have completed, so that `wllama` is set. The button
	 * that calls this handler stays disabled until then.
	 *
	 * @returns {Promise<void>}
	 */
	window.doGenerate = async function() {
	  const prompt = document.getElementById('prompt').value;
	  const genButton = document.getElementById('btn-gen');
	  genButton.disabled = true;
	  output.textContent = '';
	  l('Generating: "' + prompt + '"');
	  status.innerHTML = '<span class="amber">*</span> generating...';

	  const t0 = performance.now();
	  try {
	    const result = await wllama.createChatCompletion({
	      messages: [{ role: 'user', content: prompt }],
	      max_tokens: 500,
	      temperature: 0.7,
	      top_k: 40,
	      top_p: 0.9,
	    });

	    const elapsed = ((performance.now() - t0) / 1000).toFixed(1);
	    console.log('[gemma] raw result:', JSON.stringify(result, null, 2));
	    const msg = result?.choices?.[0]?.message;
	    const text = msg?.content || '';
	    const thinking = msg?.reasoning_content || '';
	    const tps = result?.timings?.predicted_per_second?.toFixed(1) || '?';
	    if (thinking && !text) {
	      // Only reasoning came back: show it rather than an empty output box.
	      output.textContent = thinking;
	      l(`[thinking only, ${tps} tok/s] ` + thinking.slice(0, 200));
	    } else {
	      output.textContent = text || '(empty)';
	      if (thinking) l('[thinking] ' + thinking.slice(0, 100));
	      l(`[${tps} tok/s] ` + (text || '(empty)').slice(0, 200));
	    }
	    l(`Done in ${elapsed}s`);
	    status.innerHTML = `<span class="green">*</span> done (${elapsed}s)`;
	  } catch (e) {
	    // Rejections are not guaranteed to be Error objects.
	    l('ERROR: ' + (e?.message ?? String(e)));
	    console.error(e);
	    status.innerHTML = '<span class="red">*</span> error';
	  } finally {
	    // Re-enable on every exit path so the user can retry.
	    genButton.disabled = false;
	  }
	};
	</script>
	</body>
	</html>