Spaces:

build-small-hackathon
/

tiny-army

Running

App Files Files Community

tiny-army / web /engineWebllm.js

polats's picture

Settings: model section to top; default WebLLM + Qwen3 0.6B; copyable debug

898540a 5 days ago

history blame contribute delete

5.23 kB

	// Engine: WebLLM — MLC's WebGPU LLM engine. Fastest of the three, but WebGPU is
	// REQUIRED (no WASM fallback), so it only shows when the browser exposes WebGPU.
	// NOT llama.cpp (doesn't earn 🦙) — here for benchmarking.
	import { statsTracker } from '/web/genStats.js'

	// Catalogue of selectable models. `mlcBase` holds the MLC model name minus its
	// quantization suffix; the suffix is picked at runtime — q4f16_1 when the GPU
	// exposes shader-f16, q4f32_1 otherwise. q4f16 builds compile a WGSL kernel that
	// depends on the WebGPU `shader-f16` feature — without it MLC throws
	// "Invalid ShaderModule … index_kernel". q4f32 runs everywhere (a bit slower).
	// Rows are [id, label, params, mlcBase].
	const MODELS = [
	  ['qwen2.5-0.5b', 'Qwen2.5 0.5B', '0.5B', 'Qwen2.5-0.5B-Instruct'],
	  ['qwen3-0.6b', 'Qwen3 0.6B', '0.6B', 'Qwen3-0.6B'],
	  ['smollm2-360m', 'SmolLM2 360M', '360M', 'SmolLM2-360M-Instruct'],
	  ['llama3.2-1b', 'Llama 3.2 1B', '1B', 'Llama-3.2-1B-Instruct'],
	].map(([id, label, params, mlcBase]) => ({ id, label, params, mlcBase }))
	// Look up a model entry by its `id`. Unknown or missing ids fall back to the first
	// entry in MODELS, so callers always receive a usable model record.
	const get = (id) => MODELS.find((m) => m.id === id) ?? MODELS[0]
	// True when the environment exposes a truthy `navigator.gpu`. Any exception raised
	// while probing is treated as "no WebGPU".
	const hasGPU = () => {
	  try {
	    if (typeof navigator === 'undefined') return false
	    return Boolean(navigator.gpu)
	  } catch {
	    return false
	  }
	}

	// Memoized answer to "does the GPU adapter support shader-f16?" — null until probed.
	let _f16 = null
	// Ask WebGPU for an adapter once and remember whether its feature set includes
	// 'shader-f16'. A missing adapter, or any error while requesting one, is recorded
	// as false; later calls return the remembered value without probing again.
	async function hasF16() {
	  if (_f16 === null) {
	    try {
	      const adapter = await navigator.gpu.requestAdapter()
	      _f16 = Boolean(adapter?.features?.has('shader-f16'))
	    } catch {
	      _f16 = false
	    }
	  }
	  return _f16
	}
	// Build the full MLC model id for an entry: `<mlcBase>-<quant>-MLC`, where the
	// quantization is q4f16_1 when hasF16() resolves truthy and q4f32_1 otherwise.
	const mlcId = async (m) => {
	  const { mlcBase } = m
	  const quant = (await hasF16()) ? 'q4f16_1' : 'q4f32_1'
	  return `${mlcBase}-${quant}-MLC`
	}

	// Module-level state shared by the functions in this file.
	let _lib = null // imported WebLLM module namespace, filled in by lib()
	let _engine = null // engine instance produced by ensure()
	let _loadedId = null // id of the model the engine finished loading
	let _loadingId = null // id of the model whose load was started most recently
	let _loadPromise = null // promise for that most recent load
	let _chain = Promise.resolve() // tail of the queue that serializes stream() calls
	// Import the WebLLM module from the CDN on first use; afterwards hand back the
	// namespace that was stored the first time.
	async function lib() {
	  if (_lib) return _lib
	  _lib = await import('https://esm.run/@mlc-ai/web-llm')
	  return _lib
	}

	// Make sure the model identified by `id` is loaded, and resolve to the shared engine.
	//
	//   id          model id; resolved through get(), so unknown ids use its fallback.
	//   onProgress  optional (progress, text) callback fed from MLC's init-progress
	//               reports; `progress` is 0 whenever MLC does not supply a number.
	//
	// Resolves to the engine. Rejects with whatever the import or the load threw; after
	// a rejection the next call starts a fresh load.
	async function ensure(id, onProgress) {
	  const m = get(id)
	  if (_engine && _loadedId === m.id) return _engine
	  // Reuse the in-flight load for the SAME model (guard on _loadingId, not _loadedId,
	  // which isn't set until the load finishes — otherwise a re-entrant ensure() during
	  // a slow download starts a SECOND download).
	  if (_loadPromise && _loadingId === m.id) return _loadPromise
	  // A load for a DIFFERENT model may still be running. Queue behind it instead of
	  // racing it: two concurrent loads would each create an engine and leave one orphaned.
	  const previous = _loadPromise
	  _loadingId = m.id
	  const load = (async () => {
	    if (previous) {
	      try { await previous } catch { /* that failure belongs to its own caller */ }
	    }
	    const { CreateMLCEngine } = await lib()
	    const target = await mlcId(m)
	    // MLC reports two phases through this one callback: "Fetching param cache…"
	    // (network) then "Loading model from cache…" (into GPU). Pass the text so the UI
	    // can show which is happening — the 2nd 0→100% is a cache-load, not a re-download.
	    const cb = (p) => { if (onProgress) onProgress(typeof p.progress === 'number' ? p.progress : 0, p.text) }
	    if (_engine && _engine.reload) {
	      // The engine stops holding the old model as soon as reload starts; forget it
	      // now so the fast path above cannot hand out a half-reloaded engine, nor keep
	      // claiming the old model if the reload fails.
	      _loadedId = null
	      // Route progress to THIS caller — the engine otherwise keeps the callback it
	      // was created with and the reload would report nothing.
	      _engine.setInitProgressCallback?.(cb)
	      await _engine.reload(target)
	    } else {
	      _engine = await CreateMLCEngine(target, { initProgressCallback: cb })
	    }
	    _loadedId = m.id
	    return _engine
	  })().catch((e) => {
	    // Only reset the bookkeeping if no newer load has replaced this one meanwhile.
	    if (_loadPromise === load) { _loadPromise = null; _loadingId = null }
	    throw e
	  })
	  _loadPromise = load
	  return load
	}

	// Stream one chat completion from the engine and resolve to the full result.
	//
	//   id            model id, handed to ensure()
	//   system, user  contents of the system and user messages
	//   maxTokens     sent as `max_tokens` (default 200)
	//   temperature   sampling temperature (default 0.8)
	//   onToken       optional; called with every emitted piece, including the
	//                 synthetic <think> / </think> tags
	//   onStats       optional; handed to statsTracker()
	//
	// Resolves to { text, stats } where `text` is everything emitted and `stats` is
	// whatever the tracker's finish() returns. Calls are queued on `_chain`: a run
	// starts only after the previous one settled, whether it succeeded or failed.
	function stream(id, system, user, { maxTokens = 200, temperature = 0.8, onToken, onStats } = {}) {
	  const run = async () => {
	    const e = await ensure(id)
	    const st = statsTracker(onStats)
	    let full = ''
	    const chunks = await e.chat.completions.create({
	      messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
	      stream: true,
	      stream_options: { include_usage: true },
	      temperature,
	      max_tokens: maxTokens,
	    })
	    // MLC routes Qwen3's reasoning into a separate `reasoning_content` field. Re-wrap
	    // it as <think>…</think> and prepend, so the rest of the app (stripThink + the raw
	    // "thinking" view) treats every engine's output the same.
	    let thinkOpen = false
	    const emit = (s) => {
	      if (!s) return
	      full += s
	      if (onToken) onToken(s)
	      st.tick()
	    }
	    for await (const ch of chunks) {
	      // Chunks without a first choice or delta contribute nothing.
	      const d = ch.choices?.[0]?.delta || {}
	      const r = d.reasoning_content || ''
	      const c = d.content || ''
	      if (r) {
	        if (!thinkOpen) { emit('<think>'); thinkOpen = true }
	        emit(r)
	      }
	      if (c) {
	        if (thinkOpen) { emit('</think>'); thinkOpen = false }
	        emit(c)
	      }
	    }
	    // Reasoning with no answer after it still gets a closing tag.
	    if (thinkOpen) emit('</think>')
	    return { text: full, stats: st.finish() }
	  }
	  // Start after the previous run regardless of its outcome; keep the queue tail
	  // non-rejecting so one failure does not poison later calls.
	  const p = _chain.then(run, run)
	  _chain = p.catch(() => {})
	  return p
	}

	// Descriptor object exported for this engine: identity, capability flags, the
	// model list, load/stream entry points and cache management helpers.
	export const engine = {
	  id: 'webllm',
	  label: 'WebLLM · MLC (WebGPU only)',
	  requiresWebGPU: true,
	  available: () => hasGPU(),
	  models: MODELS,
	  defaultModel: 'qwen3-0.6b',
	  ensure,
	  stream,
	  backendLabel: () => (hasGPU() ? '⚡ WebGPU' : 'needs WebGPU'),
	  // Cache list/delete via MLC's own helpers (Cache API or IndexedDB, per appConfig).
	  //
	  // Resolves to a Set of model ids whose weights MLC reports as cached. Best effort:
	  // never rejects. If the library cannot be imported the set is empty; if the probe
	  // for one model throws, only that model is left out rather than the whole answer.
	  async cachedSet() {
	    const ids = new Set()
	    let wl
	    try { wl = await lib() } catch { return ids }
	    const cfg = wl.prebuiltAppConfig
	    for (const m of MODELS) {
	      try {
	        if (await wl.hasModelInCache(await mlcId(m), cfg)) ids.add(m.id)
	      } catch { /* probe failed — treat this model as not cached */ }
	    }
	    return ids
	  },
	  // Remove a model's cached data. If the engine currently holds that model it is
	  // unloaded first and the load bookkeeping is reset. Unload and delete failures are
	  // deliberately ignored; a failure to import the library still rejects.
	  async deleteCached(id) {
	    const wl = await lib()
	    const m = get(id)
	    const target = await mlcId(m)
	    // Compare against the RESOLVED id: get() may have mapped `id` to its fallback
	    // entry, and that entry is the one whose cache is deleted below.
	    if (_loadedId === m.id && _engine) {
	      try { await _engine.unload?.() } catch { /* ignore */ }
	      _engine = null
	      _loadedId = null
	      _loadPromise = null
	      _loadingId = null
	    }
	    try { await wl.deleteModelAllInfoInCache(target, wl.prebuiltAppConfig) } catch { /* ignore */ }
	  },
	}