// tiny-army/web/modelCatalog.js
// polats · a4ca9e9 · Add Qwen3-0.6B; surface WebGPU backend; strip <think> from answers
// Curated small instruct models for the IN-BROWSER (wllama / llama.cpp WASM) path.
// Constraints that shaped this list (verified June 2026 via the HF API):
// • Ungated — wllama fetches the GGUF anonymously; gated repos (official Llama/
// Gemma) won't load, so we use ungated mirrors (bartowski, unsloth, Qwen, SmolLM).
// • ≤ ~2 GB single file — the browser's ArrayBuffer cap is 2 GB (bigger needs split
// GGUFs). That's why Gemma 4 E2B (3.1 GB) and Qwen3-4B (2.5 GB) are server-only,
// and big ones like Qwen3.5-9B / MiniMax don't fit in-browser at all.
// • Q4_K_M quant (good size/quality), CPU-WASM friendly — except SmolLM2 360M, which is listed as Q8_0.
// Sizes are the real download bytes. The hackathon's "≤32B" is the *runtime* cap; the
// browser is far smaller, so this list tops out at 3B.
/**
 * Catalog of models offered for the in-browser path, ordered by ascending download size.
 *
 * Entry fields:
 *   id     — key matched by getModel
 *   label  — display name
 *   params — parameter count as display text
 *   bytes  — download size of `file`, in bytes
 *   thinks — optional; true for models that have a thinking mode (the Qwen3 entries).
 *            NOTE(review): presumably read by the caller that strips <think> output — confirm.
 *   repo   — Hugging Face repository that hosts the GGUF
 *   file   — GGUF filename inside `repo`
 *   note   — short human-readable blurb
 */
export const MODELS = [
  {
    id: 'smollm2-360m',
    label: 'SmolLM2 360M',
    params: '360M',
    bytes: 386e6,
    repo: 'HuggingFaceTB/SmolLM2-360M-Instruct-GGUF',
    // The only entry whose file is a Q8_0 quant; every other entry is Q4_K_M.
    file: 'smollm2-360m-instruct-q8_0.gguf',
    note: 'tiniest — fastest, roughest',
  },
  {
    id: 'qwen3-0.6b',
    label: 'Qwen3 0.6B',
    params: '0.6B',
    bytes: 397e6,
    thinks: true,
    repo: 'unsloth/Qwen3-0.6B-GGUF',
    file: 'Qwen3-0.6B-Q4_K_M.gguf',
    note: 'newest tiny — strong, has a thinking mode',
  },
  {
    id: 'qwen2.5-0.5b',
    label: 'Qwen2.5 0.5B',
    params: '0.5B',
    bytes: 491e6,
    repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF',
    file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf',
    note: 'default — fast, clean JSON, no thinking overhead',
  },
  {
    id: 'llama3.2-1b',
    label: 'Llama 3.2 1B',
    params: '1B',
    bytes: 808e6,
    repo: 'bartowski/Llama-3.2-1B-Instruct-GGUF',
    file: 'Llama-3.2-1B-Instruct-Q4_K_M.gguf',
    note: 'solid 1B all-rounder',
  },
  {
    id: 'smollm2-1.7b',
    label: 'SmolLM2 1.7B',
    params: '1.7B',
    bytes: 1056e6,
    repo: 'HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF',
    file: 'smollm2-1.7b-instruct-q4_k_m.gguf',
    note: 'strong tiny model',
  },
  {
    id: 'qwen3-1.7b',
    label: 'Qwen3 1.7B',
    params: '1.7B',
    bytes: 1107e6,
    // Flagged like qwen3-0.6b: the note below says this model has a thinking mode too.
    thinks: true,
    repo: 'unsloth/Qwen3-1.7B-GGUF',
    file: 'Qwen3-1.7B-Q4_K_M.gguf',
    note: 'newer Qwen3 — has a thinking mode',
  },
  {
    id: 'qwen2.5-1.5b',
    label: 'Qwen2.5 1.5B',
    params: '1.5B',
    bytes: 1117e6,
    repo: 'Qwen/Qwen2.5-1.5B-Instruct-GGUF',
    file: 'qwen2.5-1.5b-instruct-q4_k_m.gguf',
    note: 'reliable, clean JSON',
  },
  {
    id: 'llama3.2-3b',
    label: 'Llama 3.2 3B',
    params: '3B',
    bytes: 2019e6,
    repo: 'bartowski/Llama-3.2-3B-Instruct-GGUF',
    file: 'Llama-3.2-3B-Instruct-Q4_K_M.gguf',
    note: 'bigger/better, slower in-browser',
  },
  {
    id: 'qwen2.5-3b',
    label: 'Qwen2.5 3B',
    params: '3B',
    bytes: 2105e6,
    repo: 'Qwen/Qwen2.5-3B-Instruct-GGUF',
    file: 'qwen2.5-3b-instruct-q4_k_m.gguf',
    note: 'best quality here; near the 2 GB browser limit',
  },
]
// Id of the MODELS entry that getModel falls back to when asked for an unknown id.
export const DEFAULT_MODEL = 'qwen2.5-0.5b'
/**
 * Resolve a model id to its catalog entry.
 *
 * @param {string} id - Model id to look up in MODELS.
 * @returns The entry whose `id` matches; otherwise the entry for DEFAULT_MODEL
 *   (undefined only if that id is missing from MODELS as well).
 */
export const getModel = (id) => {
  const lookup = (wanted) => MODELS.find((entry) => entry.id === wanted)
  // Unknown or missing ids quietly resolve to the default entry instead of failing.
  return lookup(id) || lookup(DEFAULT_MODEL)
}
/**
 * Format a byte count as a short size label using decimal units.
 *
 * @param {number} b - Size in bytes.
 * @returns {string} Whole megabytes ("491 MB") below one gigabyte, otherwise
 *   gigabytes with one decimal place ("1.1 GB").
 */
export const fmtBytes = (b) => {
  const wholeMb = Math.round(b / 1e6)
  // Pick the unit from the *rounded* MB figure: values just under 1e9 would
  // otherwise round up and print as "1000 MB" instead of "1.0 GB".
  return wholeMb >= 1000 ? `${(b / 1e9).toFixed(1)} GB` : `${wholeMb} MB`
}