Spaces:

build-small-hackathon
/

tiny-army

Running

polats Claude Opus 4.8 (1M context) committed on Jun 4

Commit

8eac3eb

1 Parent(s): 03708ca

In-browser models: catalog + picker + cache delete, live tok/s stats, raw 'thinking' stream

- modelCatalog.js: 8 verified ungated GGUFs ≤~2GB (SmolLM2 360M/1.7B, Qwen2.5
0.5B/1.5B/3B, Qwen3 1.7B, Llama 3.2 1B/3B) with real sizes; bigger (Gemma 4 E2B
3GB, Qwen3-4B, MiniMax) excluded — exceed the browser 2GB single-file limit.
- wllamaLlm.js: setModel/ensureModel per-model, cache list/delete via wllama
ModelManager (exit() + remove()), tok/s + first-token stats during streaming.
- modelBar.js: model dropdown (label · size · ✓downloaded) + 🗑 delete-from-cache.
- persona/diary panels: model picker, live tok/s, and a raw 'thinking' stream view
so you watch tokens arrive.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (6) hide show

web/diaryPanel.js +18 -11
web/modelBar.js +51 -0
web/modelCatalog.js +42 -0
web/personaPanel.js +29 -16
web/shell/persona.css +38 -0
web/wllamaLlm.js +71 -21

web/diaryPanel.js CHANGED Viewed

@@ -1,7 +1,8 @@
-// War-diary panel — vanilla DOM, mounted by tiny.js into #diary-stage. Streams a
-// first-person diary entry generated ON THE USER'S DEVICE via wllama (llama.cpp
-// WASM). Shares the persona panel's styling (.persona-*) and the same local model.
-import { streamChat, ensureModel, modelLabel } from '/web/wllamaLlm.js'
 import { DIARY_SYSTEM, diaryUserPrompt } from '/web/personaPrompts.js'
 function el(tag, props = {}, kids = []) {
@@ -16,37 +17,43 @@ function el(tag, props = {}, kids = []) {
 }
 export function mountDiaryPanel(host) {
   const unit = el('input', { class: 'persona-input', type: 'text', value: 'Bram the Warrior' })
   const traits = el('input', { class: 'persona-input', type: 'text', value: 'Cautious, Veteran, Vengeful' })
   const status = el('div', { class: 'persona-status' }, 'Runs on your device — no cloud.')
   const btn = el('button', { class: 'persona-go', type: 'button' }, '✒ Write war diary')
   const out = el('div', { class: 'persona-about' }, 'A first-person diary entry, written by a small llama.cpp model in your browser.')
   const controls = el('aside', { class: 'persona-controls' }, [
-    el('h2', { class: 'persona-title' }, 'War Diary'),
     el('label', { class: 'persona-label' }, 'Unit'), unit,
     el('label', { class: 'persona-label' }, 'Traits'), traits,
-    btn, status,
   ])
   const result = el('div', { class: 'persona-result' }, [out])
   host.appendChild(el('div', { class: 'persona-view' }, [controls, result]))
   let busy = false
   async function write() {
     if (busy) return
-    busy = true; btn.disabled = true
     const header = `— Diary of ${(unit.value || 'a nameless soldier').trim()} —\n\n`
     out.textContent = header
     try {
-      status.textContent = 'loading the model into your browser…'
-      await ensureModel((frac) => { status.textContent = `downloading model… ${Math.round(frac * 100)}% (one-time, then cached)` })
-      status.textContent = `writing with ${modelLabel()} — on your device…`
       let first = true
       await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
-        maxTokens: 200, temperature: 0.9,
         onToken: (piece) => { if (first) { out.textContent = header; first = false } out.textContent += piece },
       })
       status.textContent = 'written ✓ (generated locally)'
     } catch (e) {
       status.textContent = `couldn't run the local model: ${e.message || e}`
     } finally {

+// War-diary panel — vanilla DOM, mounted into #diary-stage. Streams a first-person
+// diary entry generated ON THE USER'S DEVICE via wllama (llama.cpp WASM). Shares the
+// persona styling (.persona-*), the model picker (modelBar), and tok/s stats.
+import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
+import { mountModelBar } from '/web/modelBar.js'
 import { DIARY_SYSTEM, diaryUserPrompt } from '/web/personaPrompts.js'
 function el(tag, props = {}, kids = []) {
 }
 export function mountDiaryPanel(host) {
+  const modelHost = el('div')
   const unit = el('input', { class: 'persona-input', type: 'text', value: 'Bram the Warrior' })
   const traits = el('input', { class: 'persona-input', type: 'text', value: 'Cautious, Veteran, Vengeful' })
+  const stats = el('div', { class: 'persona-stats' })
   const status = el('div', { class: 'persona-status' }, 'Runs on your device — no cloud.')
   const btn = el('button', { class: 'persona-go', type: 'button' }, '✒ Write war diary')
   const out = el('div', { class: 'persona-about' }, 'A first-person diary entry, written by a small llama.cpp model in your browser.')
   const controls = el('aside', { class: 'persona-controls' }, [
+    modelHost,
     el('label', { class: 'persona-label' }, 'Unit'), unit,
     el('label', { class: 'persona-label' }, 'Traits'), traits,
+    btn, stats, status,
   ])
   const result = el('div', { class: 'persona-result' }, [out])
   host.appendChild(el('div', { class: 'persona-view' }, [controls, result]))
+  const bar = mountModelBar(modelHost)
   let busy = false
   async function write() {
     if (busy) return
+    busy = true; btn.disabled = true; stats.textContent = ''
     const header = `— Diary of ${(unit.value || 'a nameless soldier').trim()} —\n\n`
     out.textContent = header
     try {
+      status.textContent = `loading ${getCurrentModel().label} into your browser…`
+      await ensureModel((frac) => { status.textContent = `downloading ${getCurrentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
+      status.textContent = `writing on your device with ${getCurrentModel().label}…`
       let first = true
       await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
+        maxTokens: 220, temperature: 0.9,
         onToken: (piece) => { if (first) { out.textContent = header; first = false } out.textContent += piece },
+        onStats: (s) => { stats.textContent = `● ${s.tokPerSec} tok/s · ${s.tokens} tok${s.ttftSeconds != null ? ` · first ${s.ttftSeconds}s` : ''}` },
       })
       status.textContent = 'written ✓ (generated locally)'
+      bar.refresh()
     } catch (e) {
       status.textContent = `couldn't run the local model: ${e.message || e}`
     } finally {

web/modelBar.js ADDED Viewed

	@@ -0,0 +1,51 @@

+// Shared model picker + cache controls for the in-browser panels. Lets you choose a
+// model from the catalog (showing size + whether it's already downloaded) and delete
+// a downloaded model from the browser cache — like the wllama demo space.
+import { listModels, getCurrentModel, setModel, cachedSet, deleteCached } from '/web/wllamaLlm.js'
+import { fmtBytes } from '/web/modelCatalog.js'
+function el(tag, props = {}, kids = []) { // Small DOM builder: creates <tag>, applies props, appends kids, returns the node.
+  const n = document.createElement(tag)
+  for (const [k, v] of Object.entries(props)) {
+    if (k === 'class') n.className = v // 'class' maps to the className property
+    else if (k.startsWith('on') && typeof v === 'function') n.addEventListener(k.slice(2), v) // e.g. onclick → 'click' listener
+    else if (v != null) n.setAttribute(k, v) // null/undefined props are skipped
+  }
+  for (const kid of [].concat(kids)) if (kid != null) n.append(kid) // [].concat accepts a single kid or an array
+  return n
+}
+export function mountModelBar(host, { onChange } = {}) {
+  const models = listModels()
+  const sel = el('select', { class: 'model-select' })
+  const del = el('button', { class: 'model-del', type: 'button', title: 'Delete this model from your browser cache' }, '🗑 delete')
+  const info = el('div', { class: 'model-info' })
+  host.append(el('div', { class: 'model-bar' }, [
+    el('label', { class: 'persona-label' }, 'Model (runs in your browser)'),
+    sel, el('div', { class: 'model-row' }, [info, del]),
+  ]))
+  let cached = new Set()
+  function render() {
+    const cur = getCurrentModel().id
+    sel.replaceChildren(...models.map((m) =>
+      el('option', { value: m.id }, `${m.label} · ${fmtBytes(m.bytes)}${cached.has(m.id) ? ' · ✓ downloaded' : ''}`)))
+    sel.value = cur
+    const m = getCurrentModel()
+    info.textContent = `${m.params} · ${fmtBytes(m.bytes)} · ${cached.has(m.id) ? 'cached on device' : 'downloads on first use'}`
+    del.style.display = cached.has(m.id) ? '' : 'none'
+  }
+  async function refresh() { cached = await cachedSet(); render() }
+  sel.addEventListener('change', async () => { await setModel(sel.value); render(); onChange && onChange(sel.value) })
+  del.addEventListener('click', async () => {
+    del.disabled = true; const prev = info.textContent; info.textContent = 'deleting from cache…'
+    try { await deleteCached(sel.value) } catch (e) { info.textContent = 'delete failed: ' + (e.message || e) }
+    await refresh(); del.disabled = false
+    if (info.textContent.startsWith('delete failed')) setTimeout(() => { info.textContent = prev }, 2500)
+  })
+  render()
+  refresh()
+  return { refresh }
+}

web/modelCatalog.js ADDED Viewed

	@@ -0,0 +1,42 @@

+// Curated small instruct models for the IN-BROWSER (wllama / llama.cpp WASM) path.
+// Constraints that shaped this list (verified June 2026 via the HF API):
+//   • Ungated — wllama fetches the GGUF anonymously; gated repos (official Llama/
+//     Gemma) won't load, so we use ungated mirrors (bartowski, unsloth, Qwen, SmolLM).
+//   • ≤ ~2 GB single file — the browser's ArrayBuffer cap is 2 GB (bigger needs split
+//     GGUFs). That's why Gemma 4 E2B (3.1 GB) and Qwen3-4B (2.5 GB) are server-only,
+//     and big ones like Qwen3.5-9B / MiniMax don't fit in-browser at all.
+//   • Q4_K_M quant (good size/quality), CPU-WASM friendly — except SmolLM2 360M, which
+//     uses the Q8_0 file.
+// Sizes are the real download bytes. The hackathon's "≤32B" is the *runtime* cap; the
+// browser is far smaller, so this list tops out at 3B.
+export const MODELS = [
+  { id: 'smollm2-360m', label: 'SmolLM2 360M', params: '360M', bytes: 386e6,
+    repo: 'HuggingFaceTB/SmolLM2-360M-Instruct-GGUF', file: 'smollm2-360m-instruct-q8_0.gguf',
+    note: 'tiniest — fastest, roughest' },
+  { id: 'qwen2.5-0.5b', label: 'Qwen2.5 0.5B', params: '0.5B', bytes: 491e6,
+    repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF', file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf',
+    note: 'default — good speed/quality balance' },
+  { id: 'llama3.2-1b', label: 'Llama 3.2 1B', params: '1B', bytes: 808e6,
+    repo: 'bartowski/Llama-3.2-1B-Instruct-GGUF', file: 'Llama-3.2-1B-Instruct-Q4_K_M.gguf',
+    note: 'solid 1B all-rounder' },
+  { id: 'smollm2-1.7b', label: 'SmolLM2 1.7B', params: '1.7B', bytes: 1056e6,
+    repo: 'HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF', file: 'smollm2-1.7b-instruct-q4_k_m.gguf',
+    note: 'strong tiny model' },
+  { id: 'qwen3-1.7b', label: 'Qwen3 1.7B', params: '1.7B', bytes: 1107e6,
+    repo: 'unsloth/Qwen3-1.7B-GGUF', file: 'Qwen3-1.7B-Q4_K_M.gguf',
+    note: 'newer Qwen3 — has a thinking mode' },
+  { id: 'qwen2.5-1.5b', label: 'Qwen2.5 1.5B', params: '1.5B', bytes: 1117e6,
+    repo: 'Qwen/Qwen2.5-1.5B-Instruct-GGUF', file: 'qwen2.5-1.5b-instruct-q4_k_m.gguf',
+    note: 'reliable, clean JSON' },
+  { id: 'llama3.2-3b', label: 'Llama 3.2 3B', params: '3B', bytes: 2019e6,
+    repo: 'bartowski/Llama-3.2-3B-Instruct-GGUF', file: 'Llama-3.2-3B-Instruct-Q4_K_M.gguf',
+    note: 'bigger/better, slower in-browser' },
+  { id: 'qwen2.5-3b', label: 'Qwen2.5 3B', params: '3B', bytes: 2105e6,
+    repo: 'Qwen/Qwen2.5-3B-Instruct-GGUF', file: 'qwen2.5-3b-instruct-q4_k_m.gguf',
+    note: 'best quality here; near the 2 GB browser limit' },
+]
+export const DEFAULT_MODEL = 'qwen2.5-0.5b'
+export const getModel = (id) => MODELS.find((m) => m.id === id) || MODELS.find((m) => m.id === DEFAULT_MODEL)
+export const fmtBytes = (b) => (b >= 1e9 ? (b / 1e9).toFixed(1) + ' GB' : Math.round(b / 1e6) + ' MB')

web/personaPanel.js CHANGED Viewed

@@ -1,8 +1,9 @@
 // Tiny Army persona panel — vanilla DOM, mounted by tiny.js into #persona-stage.
-// Generation runs ON THE USER'S DEVICE via wllama (llama.cpp WASM) — no server, no
-// cloud (🔌 Off the Grid + 🦙 Llama Champion). Reuses woid's persona JSON parser
-// (personaParse.js) + live-extraction (extractLivePersona) verbatim.
-import { streamChat, ensureModel, modelLabel } from '/web/wllamaLlm.js'
 import { extractLivePersona } from '/web/personaStream.js'
 import { parsePersonaJson } from '/web/personaParse.js'
 import { PERSONA_SYSTEM, personaUserPrompt } from '/web/personaPrompts.js'
@@ -13,7 +14,6 @@ function el(tag, props = {}, kids = []) {
   const n = document.createElement(tag)
   for (const [k, v] of Object.entries(props)) {
     if (k === 'class') n.className = v
-    else if (k === 'html') n.innerHTML = v
     else if (k.startsWith('on') && typeof v === 'function') n.addEventListener(k.slice(2), v)
     else if (v != null) n.setAttribute(k, v)
   }
@@ -21,50 +21,61 @@ function el(tag, props = {}, kids = []) {
   return n
 }
-export function mountPersonaPanel(host, opts = {}) {
-  const classes = opts.classes || CLASSES
-  const sel = el('select', { class: 'persona-input' }, classes.map((c) => el('option', { value: c }, c)))
   const seed = el('input', { class: 'persona-input', type: 'text', placeholder: 'a word, a vibe… (optional)' })
   const status = el('div', { class: 'persona-status' }, 'Runs on your device — no cloud.')
   const btn = el('button', { class: 'persona-go', type: 'button' }, '⚔ Recruit a soldier')
   const nameEl = el('div', { class: 'persona-name' }, 'Your soldier')
   const tagsEl = el('div', { class: 'persona-tags' })
   const aboutEl = el('div', { class: 'persona-about' }, 'Pick a class and recruit — a small llama.cpp model in your browser writes their legend.')
   const controls = el('aside', { class: 'persona-controls' }, [
-    el('h2', { class: 'persona-title' }, 'Recruit'),
     el('label', { class: 'persona-label' }, 'Class'), sel,
     el('label', { class: 'persona-label' }, 'Seed'), seed,
-    btn, status,
   ])
-  const result = el('div', { class: 'persona-result' }, [nameEl, tagsEl, aboutEl])
   host.appendChild(el('div', { class: 'persona-view' }, [controls, result]))
   function setTags(p) {
     tagsEl.replaceChildren(...[p.specialty, p.personality, p.vibe].filter(Boolean)
       .map((t) => el('span', { class: 'persona-tag' }, t)))
   }
   let busy = false
   async function generate() {
     if (busy) return
     busy = true; btn.disabled = true
     nameEl.textContent = '…'; aboutEl.textContent = ''; tagsEl.replaceChildren()
     try {
-      status.textContent = 'loading the model into your browser…'
-      await ensureModel((frac) => { status.textContent = `downloading model… ${Math.round(frac * 100)}% (one-time, then cached)` })
-      status.textContent = `writing with ${modelLabel()} — on your device…`
       let acc = ''
       await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value), {
-        maxTokens: 200,
         onToken: (piece) => {
           acc += piece
           const live = extractLivePersona(acc)
           if (live.name) nameEl.textContent = live.name
           if (live.about) aboutEl.textContent = live.about
         },
       })
       try {
         const p = parsePersonaJson(acc)
@@ -72,9 +83,11 @@ export function mountPersonaPanel(host, opts = {}) {
         aboutEl.textContent = p.about
         setTags(p)
         status.textContent = 'enlisted ✓ (generated locally)'
       } catch (e) {
         status.textContent = `the model rambled — couldn't parse a clean persona (${e.message || e})`
       }
     } catch (e) {
       status.textContent = `couldn't run the local model: ${e.message || e}`
     } finally {

 // Tiny Army persona panel — vanilla DOM, mounted by tiny.js into #persona-stage.
+// Generation runs ON THE USER'S DEVICE via wllama (llama.cpp WASM). Model is pickable
+// (modelBar), generation streams into a live "thinking" view + parsed result, and we
+// show tok/s. Reuses woid's persona parser + extractLivePersona verbatim.
+import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
+import { mountModelBar } from '/web/modelBar.js'
 import { extractLivePersona } from '/web/personaStream.js'
 import { parsePersonaJson } from '/web/personaParse.js'
 import { PERSONA_SYSTEM, personaUserPrompt } from '/web/personaPrompts.js'
   const n = document.createElement(tag)
   for (const [k, v] of Object.entries(props)) {
     if (k === 'class') n.className = v
     else if (k.startsWith('on') && typeof v === 'function') n.addEventListener(k.slice(2), v)
     else if (v != null) n.setAttribute(k, v)
   }
   return n
 }
+export function mountPersonaPanel(host) {
+  const modelHost = el('div')
+  const sel = el('select', { class: 'persona-input' }, CLASSES.map((c) => el('option', { value: c }, c)))
   const seed = el('input', { class: 'persona-input', type: 'text', placeholder: 'a word, a vibe… (optional)' })
+  const stats = el('div', { class: 'persona-stats' })
   const status = el('div', { class: 'persona-status' }, 'Runs on your device — no cloud.')
   const btn = el('button', { class: 'persona-go', type: 'button' }, '⚔ Recruit a soldier')
   const nameEl = el('div', { class: 'persona-name' }, 'Your soldier')
   const tagsEl = el('div', { class: 'persona-tags' })
   const aboutEl = el('div', { class: 'persona-about' }, 'Pick a class and recruit — a small llama.cpp model in your browser writes their legend.')
+  const thinkEl = el('pre', { class: 'persona-think' })
+  const thinkWrap = el('details', { class: 'persona-think-wrap' }, [el('summary', {}, 'model output (raw)'), thinkEl])
   const controls = el('aside', { class: 'persona-controls' }, [
+    modelHost,
     el('label', { class: 'persona-label' }, 'Class'), sel,
     el('label', { class: 'persona-label' }, 'Seed'), seed,
+    btn, stats, status,
   ])
+  const result = el('div', { class: 'persona-result' }, [nameEl, tagsEl, aboutEl, thinkWrap])
   host.appendChild(el('div', { class: 'persona-view' }, [controls, result]))
+  const bar = mountModelBar(modelHost)
   function setTags(p) {
     tagsEl.replaceChildren(...[p.specialty, p.personality, p.vibe].filter(Boolean)
       .map((t) => el('span', { class: 'persona-tag' }, t)))
   }
+  function showStats(s) {
+    stats.textContent = `● ${s.tokPerSec} tok/s · ${s.tokens} tok${s.ttftSeconds != null ? ` · first ${s.ttftSeconds}s` : ''}`
+  }
   let busy = false
   async function generate() {
     if (busy) return
     busy = true; btn.disabled = true
     nameEl.textContent = '…'; aboutEl.textContent = ''; tagsEl.replaceChildren()
+    thinkEl.textContent = ''; thinkWrap.open = true; stats.textContent = ''
     try {
+      status.textContent = `loading ${getCurrentModel().label} into your browser…`
+      await ensureModel((frac) => { status.textContent = `downloading ${getCurrentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
+      status.textContent = `writing on your device with ${getCurrentModel().label}…`
       let acc = ''
       await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value), {
+        maxTokens: 220,
         onToken: (piece) => {
           acc += piece
+          thinkEl.textContent = acc
+          thinkEl.scrollTop = thinkEl.scrollHeight
           const live = extractLivePersona(acc)
           if (live.name) nameEl.textContent = live.name
           if (live.about) aboutEl.textContent = live.about
         },
+        onStats: showStats,
       })
       try {
         const p = parsePersonaJson(acc)
         aboutEl.textContent = p.about
         setTags(p)
         status.textContent = 'enlisted ✓ (generated locally)'
+        thinkWrap.open = false
       } catch (e) {
         status.textContent = `the model rambled — couldn't parse a clean persona (${e.message || e})`
       }
+      bar.refresh() // it's now cached
     } catch (e) {
       status.textContent = `couldn't run the local model: ${e.message || e}`
     } finally {

web/shell/persona.css CHANGED Viewed

@@ -63,6 +63,44 @@
   white-space: pre-wrap;
 }
 @media (max-width: 768px) {
   .persona-view { flex-direction: column; }
   .persona-controls { width: 100%; border-right: 0; border-bottom: 2px solid var(--p-ink); }

   white-space: pre-wrap;
 }
+/* ── Model picker + cache controls ─────────────────────────────────────────── */
+.model-bar { display: flex; flex-direction: column; gap: 4px; padding-bottom: 10px; margin-bottom: 6px; border-bottom: 1px dashed var(--p-ink); }
+.model-select {
+  font-family: var(--p-sans) !important; font-size: 13px !important; color: var(--p-ink) !important;
+  background: var(--p-card) !important; border: 1.5px solid var(--p-ink) !important;
+  border-radius: 0 !important; padding: 6px 8px !important; width: 100%;
+}
+.model-row { display: flex; align-items: center; justify-content: space-between; gap: 8px; }
+.model-info { font-family: var(--p-mono); font-size: 9px; letter-spacing: .04em; color: var(--p-muted); line-height: 1.4; flex: 1; }
+.model-del {
+  font-family: var(--p-mono) !important; font-size: 9px !important; letter-spacing: .04em; text-transform: uppercase;
+  color: var(--p-transmit) !important; background: var(--p-card) !important; border: 1.5px solid var(--p-transmit) !important;
+  border-radius: 0 !important; padding: 3px 6px !important; cursor: pointer; flex-shrink: 0;
+}
+.model-del:hover { background: var(--p-transmit) !important; color: var(--p-card) !important; }
+.model-del:disabled { opacity: .5; cursor: default; }
+/* ── Live stats (tok/s) ────────────────────────────────────────────────────── */
+.persona-stats {
+  font-family: var(--p-mono); font-size: 11px; letter-spacing: .04em; color: var(--p-transmit);
+  min-height: 14px; margin-top: 6px;
+}
+/* ── "Thinking" raw stream (see progress as tokens arrive) ──────────────────── */
+.persona-think-wrap { margin-top: 22px; }
+.persona-think-wrap > summary {
+  cursor: pointer; font-family: var(--p-mono); font-size: 10px; letter-spacing: .12em; text-transform: uppercase;
+  color: var(--p-muted); list-style: none;
+}
+.persona-think-wrap > summary::-webkit-details-marker { display: none; }
+.persona-think-wrap > summary::before { content: '▸ '; }
+.persona-think-wrap[open] > summary::before { content: '▾ '; }
+.persona-think {
+  margin: 8px 0 0; max-height: 240px; overflow-y: auto; white-space: pre-wrap; word-break: break-word;
+  font-family: var(--p-mono); font-size: 11px; line-height: 1.5; color: var(--p-muted);
+  background: var(--p-paper-2); border: 1px solid var(--p-ink); padding: 8px 10px;
+}
 @media (max-width: 768px) {
   .persona-view { flex-direction: column; }
   .persona-controls { width: 100%; border-right: 0; border-bottom: 2px solid var(--p-ink); }

web/wllamaLlm.js CHANGED Viewed

@@ -1,55 +1,105 @@
-// In-browser llama.cpp via wllama (WASM) — runs a GGUF from Hugging Face on the
-// USER's device. This is the local-first path: zero cloud/server inference, so it
-// earns 🔌 Off the Grid AND 🦙 Llama Champion (wllama IS llama.cpp, compiled to
-// WebAssembly). It also dodges the free Space's ~0.6 tok/s CPU — generation runs on
-// the visitor's hardware (typically 10–50× faster). Streams tokens like the server
-// path did, so the persona/diary panels barely change.
-import { Wllama } from 'https://cdn.jsdelivr.net/npm/@wllama/wllama@3.4.1/esm/index.js'
 const WLLAMA_VER = '3.4.1'
 const WASM = { default: `https://cdn.jsdelivr.net/npm/@wllama/wllama@${WLLAMA_VER}/esm/wasm/wllama.wasm` }
-// Small instruct GGUF: ~380 MB, downloaded once then cached by the browser.
-const MODEL = { repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF', file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf' }
 let _wllama = null
 let _loadPromise = null
 let _chain = Promise.resolve() // serialize completions (one model, no parallel decode)
-export function modelLabel() { return MODEL.repo.split('/').pop() }
-// Lazy-load wllama + the GGUF (cached after first download). onProgress(fraction 0..1).
 export function ensureModel(onProgress) {
-  if (_wllama) return Promise.resolve(_wllama)
   if (_loadPromise) return _loadPromise
   _loadPromise = (async () => {
     const w = new Wllama(WASM)
-    await w.loadModelFromHF(MODEL, {
       n_ctx: 2048,
       progressCallback: ({ loaded, total }) => onProgress && onProgress(total ? loaded / total : 0),
     })
-    _wllama = w
     return w
   })().catch((e) => { _loadPromise = null; throw e })
   return _loadPromise
 }
-// Stream a chat completion in-browser. Calls onToken(piece) per chunk; returns full text.
-// Serialized so two panels can't decode at once.
-export function streamChat(system, user, { maxTokens = 200, temperature = 0.8, onToken } = {}) {
   const run = async () => {
     const w = await ensureModel()
-    let full = ''
     const stream = await w.createChatCompletion({
       messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
       max_tokens: maxTokens, temperature, top_k: 40, top_p: 0.9, stream: true,
     })
     for await (const chunk of stream) {
       const piece = chunk?.choices?.[0]?.delta?.content || ''
-      if (piece) { full += piece; if (onToken) onToken(piece) }
     }
-    return full
   }
   const p = _chain.then(run, run)
-  _chain = p.catch(() => {})  // keep the chain alive after errors
   return p
 }

+// In-browser llama.cpp via wllama (WASM). Local-first (🔌 Off the Grid) + llama.cpp
+// (🦙 Llama Champion). Adds model selection from a catalog, cache management
+// (download/delete via wllama's ModelManager), and live generation stats (tok/s).
+import { Wllama, ModelManager } from 'https://cdn.jsdelivr.net/npm/@wllama/wllama@3.4.1/esm/index.js'
+import { MODELS, getModel, DEFAULT_MODEL } from '/web/modelCatalog.js'
 const WLLAMA_VER = '3.4.1'
 const WASM = { default: `https://cdn.jsdelivr.net/npm/@wllama/wllama@${WLLAMA_VER}/esm/wasm/wllama.wasm` }
+let currentId = DEFAULT_MODEL
 let _wllama = null
+let _loadedId = null
 let _loadPromise = null
 let _chain = Promise.resolve() // serialize completions (one model, no parallel decode)
+const mm = new ModelManager()
+export function listModels() { return MODELS }
+export function getCurrentModel() { return getModel(currentId) }
+// Switch the active model. No-op when `id` is already current; otherwise records the new id, shuts down the loaded wllama instance (best-effort) and clears the load state so the next ensureModel loads fresh.
+export async function setModel(id) {
+  if (id === currentId) return // already selected — keep whatever is loaded
+  currentId = id
+  if (_wllama) { try { await _wllama.exit() } catch { /* ignore */ } } // errors from exit() are deliberately swallowed
+  _wllama = null; _loadedId = null; _loadPromise = null // NOTE(review): a load still in flight is not awaited or cancelled here and will assign _wllama when it finishes — confirm this is acceptable
+}
+// Load (download + init) the current model. onProgress(fraction 0..1) during download. Returns a promise for the Wllama instance, reusing the loaded instance or the pending load promise when one exists.
 export function ensureModel(onProgress) {
+  const m = getModel(currentId) // catalog entry for the selected id
+  if (_wllama && _loadedId === m.id) return Promise.resolve(_wllama) // already loaded for this model
   if (_loadPromise) return _loadPromise // share the existing load; this call's onProgress is not attached to it
   _loadPromise = (async () => {
     const w = new Wllama(WASM)
+    await w.loadModelFromHF({ repo: m.repo, file: m.file }, {
       n_ctx: 2048,
       progressCallback: ({ loaded, total }) => onProgress && onProgress(total ? loaded / total : 0), // reports 0 while total is unknown/zero
     })
+    _wllama = w; _loadedId = m.id // publish the instance only after the load succeeded
     return w
   })().catch((e) => { _loadPromise = null; throw e }) // clear on failure so a later call can retry
   return _loadPromise
 }
+// Stream a chat completion in-browser. onToken(piece); onStats({tokens,tokPerSec,seconds}).
+// Serialized so two panels can't decode at once. Returns { text, stats }.
+export function streamChat(system, user, { maxTokens = 200, temperature = 0.8, onToken, onStats } = {}) {
   const run = async () => {
     const w = await ensureModel()
+    let full = ''; let n = 0
+    const t0 = performance.now(); let tFirst = null
+    const emitStats = (final) => {
+      if (!onStats) return
+      const secs = (performance.now() - t0) / 1000
+      const gen = tFirst ? (performance.now() - tFirst) / 1000 : 0
+      onStats({ tokens: n, seconds: +secs.toFixed(1),
+        tokPerSec: gen > 0 ? +(n / gen).toFixed(1) : 0,
+        ttftSeconds: tFirst ? +((tFirst - t0) / 1000).toFixed(1) : null, final: !!final })
+    }
     const stream = await w.createChatCompletion({
       messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
       max_tokens: maxTokens, temperature, top_k: 40, top_p: 0.9, stream: true,
     })
     for await (const chunk of stream) {
       const piece = chunk?.choices?.[0]?.delta?.content || ''
+      if (!piece) continue
+      if (tFirst === null) tFirst = performance.now()
+      full += piece; n++
+      if (onToken) onToken(piece)
+      emitStats(false)
     }
+    emitStats(true)
+    const gen = tFirst ? (performance.now() - tFirst) / 1000 : (performance.now() - t0) / 1000
+    return { text: full, stats: { tokens: n, tokPerSec: gen > 0 ? +(n / gen).toFixed(1) : 0 } }
   }
   const p = _chain.then(run, run)
+  _chain = p.catch(() => {})
   return p
 }
+// ── Cache management (wllama ModelManager) ────────────────────────────────────
+// Match cached files to catalog entries by GGUF filename (cache names embed it).
+async function _cachedModels() {
+  try { return await mm.getModels() } catch { return [] }
+}
+function _matches(model, entry) { // True when a file name on the cached `model` contains the catalog entry's GGUF filename.
+  const names = (model.files || []).map((f) => f.name || '').join('|') // tolerates a missing files list or missing names
+  return names.includes(entry.file) // substring match against the joined names
+}
+// Set of catalog model ids currently downloaded in the browser. Resolves to a Set of ids; it is empty when _cachedModels() yields nothing (which includes the case where listing the cache threw).
+export async function cachedSet() {
+  const models = await _cachedModels()
+  const ids = new Set()
+  for (const m of models) for (const c of MODELS) if (_matches(m, c)) ids.add(c.id) // every cached model is checked against every catalog entry
+  return ids
+}
+// Delete a model from the browser cache (unloading it first if it's the active one).
+export async function deleteCached(id) {
+  const c = getModel(id)
+  if (_loadedId === id && _wllama) { try { await _wllama.exit() } catch { /* ignore */ } _wllama = null; _loadedId = null; _loadPromise = null }
+  const models = await _cachedModels()
+  for (const m of models) if (_matches(m, c) && m.remove) await m.remove()
+}