polats Claude Opus 4.8 (1M context) committed on
Commit
8eac3eb
·
1 Parent(s): 03708ca

In-browser models: catalog + picker + cache delete, live tok/s stats, raw 'thinking' stream

Browse files

- modelCatalog.js: 8 verified ungated GGUFs ≤~2GB (SmolLM2 360M/1.7B, Qwen2.5
0.5B/1.5B/3B, Qwen3 1.7B, Llama 3.2 1B/3B) with real sizes; bigger (Gemma 4 E2B
3GB, Qwen3-4B, MiniMax) excluded — exceed the browser 2GB single-file limit.
- wllamaLlm.js: setModel/ensureModel per-model, cache list/delete via wllama
ModelManager (exit() + remove()), tok/s + first-token stats during streaming.
- modelBar.js: model dropdown (label · size · ✓downloaded) + 🗑 delete-from-cache.
- persona/diary panels: model picker, live tok/s, and a raw 'thinking' stream view
so you watch tokens arrive.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

web/diaryPanel.js CHANGED
@@ -1,7 +1,8 @@
1
- // War-diary panel — vanilla DOM, mounted by tiny.js into #diary-stage. Streams a
2
- // first-person diary entry generated ON THE USER'S DEVICE via wllama (llama.cpp
3
- // WASM). Shares the persona panel's styling (.persona-*) and the same local model.
4
- import { streamChat, ensureModel, modelLabel } from '/web/wllamaLlm.js'
 
5
  import { DIARY_SYSTEM, diaryUserPrompt } from '/web/personaPrompts.js'
6
 
7
  function el(tag, props = {}, kids = []) {
@@ -16,37 +17,43 @@ function el(tag, props = {}, kids = []) {
16
  }
17
 
18
  export function mountDiaryPanel(host) {
 
19
  const unit = el('input', { class: 'persona-input', type: 'text', value: 'Bram the Warrior' })
20
  const traits = el('input', { class: 'persona-input', type: 'text', value: 'Cautious, Veteran, Vengeful' })
 
21
  const status = el('div', { class: 'persona-status' }, 'Runs on your device — no cloud.')
22
  const btn = el('button', { class: 'persona-go', type: 'button' }, '✒ Write war diary')
23
  const out = el('div', { class: 'persona-about' }, 'A first-person diary entry, written by a small llama.cpp model in your browser.')
24
 
25
  const controls = el('aside', { class: 'persona-controls' }, [
26
- el('h2', { class: 'persona-title' }, 'War Diary'),
27
  el('label', { class: 'persona-label' }, 'Unit'), unit,
28
  el('label', { class: 'persona-label' }, 'Traits'), traits,
29
- btn, status,
30
  ])
31
  const result = el('div', { class: 'persona-result' }, [out])
32
  host.appendChild(el('div', { class: 'persona-view' }, [controls, result]))
33
 
 
 
34
  let busy = false
35
  async function write() {
36
  if (busy) return
37
- busy = true; btn.disabled = true
38
  const header = `— Diary of ${(unit.value || 'a nameless soldier').trim()} —\n\n`
39
  out.textContent = header
40
  try {
41
- status.textContent = 'loading the model into your browser…'
42
- await ensureModel((frac) => { status.textContent = `downloading model… ${Math.round(frac * 100)}% (one-time, then cached)` })
43
- status.textContent = `writing with ${modelLabel()} — on your device…`
44
  let first = true
45
  await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
46
- maxTokens: 200, temperature: 0.9,
47
  onToken: (piece) => { if (first) { out.textContent = header; first = false } out.textContent += piece },
 
48
  })
49
  status.textContent = 'written ✓ (generated locally)'
 
50
  } catch (e) {
51
  status.textContent = `couldn't run the local model: ${e.message || e}`
52
  } finally {
 
1
+ // War-diary panel — vanilla DOM, mounted into #diary-stage. Streams a first-person
2
+ // diary entry generated ON THE USER'S DEVICE via wllama (llama.cpp WASM). Shares the
3
+ // persona styling (.persona-*), the model picker (modelBar), and tok/s stats.
4
+ import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
5
+ import { mountModelBar } from '/web/modelBar.js'
6
  import { DIARY_SYSTEM, diaryUserPrompt } from '/web/personaPrompts.js'
7
 
8
  function el(tag, props = {}, kids = []) {
 
17
  }
18
 
19
  export function mountDiaryPanel(host) {
20
+ const modelHost = el('div')
21
  const unit = el('input', { class: 'persona-input', type: 'text', value: 'Bram the Warrior' })
22
  const traits = el('input', { class: 'persona-input', type: 'text', value: 'Cautious, Veteran, Vengeful' })
23
+ const stats = el('div', { class: 'persona-stats' })
24
  const status = el('div', { class: 'persona-status' }, 'Runs on your device — no cloud.')
25
  const btn = el('button', { class: 'persona-go', type: 'button' }, '✒ Write war diary')
26
  const out = el('div', { class: 'persona-about' }, 'A first-person diary entry, written by a small llama.cpp model in your browser.')
27
 
28
  const controls = el('aside', { class: 'persona-controls' }, [
29
+ modelHost,
30
  el('label', { class: 'persona-label' }, 'Unit'), unit,
31
  el('label', { class: 'persona-label' }, 'Traits'), traits,
32
+ btn, stats, status,
33
  ])
34
  const result = el('div', { class: 'persona-result' }, [out])
35
  host.appendChild(el('div', { class: 'persona-view' }, [controls, result]))
36
 
37
+ const bar = mountModelBar(modelHost)
38
+
39
  let busy = false
40
  async function write() {
41
  if (busy) return
42
+ busy = true; btn.disabled = true; stats.textContent = ''
43
  const header = `— Diary of ${(unit.value || 'a nameless soldier').trim()} —\n\n`
44
  out.textContent = header
45
  try {
46
+ status.textContent = `loading ${getCurrentModel().label} into your browser…`
47
+ await ensureModel((frac) => { status.textContent = `downloading ${getCurrentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
48
+ status.textContent = `writing on your device with ${getCurrentModel().label}…`
49
  let first = true
50
  await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
51
+ maxTokens: 220, temperature: 0.9,
52
  onToken: (piece) => { if (first) { out.textContent = header; first = false } out.textContent += piece },
53
+ onStats: (s) => { stats.textContent = `● ${s.tokPerSec} tok/s · ${s.tokens} tok${s.ttftSeconds != null ? ` · first ${s.ttftSeconds}s` : ''}` },
54
  })
55
  status.textContent = 'written ✓ (generated locally)'
56
+ bar.refresh()
57
  } catch (e) {
58
  status.textContent = `couldn't run the local model: ${e.message || e}`
59
  } finally {
web/modelBar.js ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Shared model picker + cache controls for the in-browser panels. Lets you choose a
2
+ // model from the catalog (showing size + whether it's already downloaded) and delete
3
+ // a downloaded model from the browser cache — like the wllama demo space.
4
+ import { listModels, getCurrentModel, setModel, cachedSet, deleteCached } from '/web/wllamaLlm.js'
5
+ import { fmtBytes } from '/web/modelCatalog.js'
6
+
7
+ function el(tag, props = {}, kids = []) {
8
+ const n = document.createElement(tag)
9
+ for (const [k, v] of Object.entries(props)) {
10
+ if (k === 'class') n.className = v
11
+ else if (k.startsWith('on') && typeof v === 'function') n.addEventListener(k.slice(2), v)
12
+ else if (v != null) n.setAttribute(k, v)
13
+ }
14
+ for (const kid of [].concat(kids)) if (kid != null) n.append(kid)
15
+ return n
16
+ }
17
+
18
+ export function mountModelBar(host, { onChange } = {}) {
19
+ const models = listModels()
20
+ const sel = el('select', { class: 'model-select' })
21
+ const del = el('button', { class: 'model-del', type: 'button', title: 'Delete this model from your browser cache' }, '🗑 delete')
22
+ const info = el('div', { class: 'model-info' })
23
+ host.append(el('div', { class: 'model-bar' }, [
24
+ el('label', { class: 'persona-label' }, 'Model (runs in your browser)'),
25
+ sel, el('div', { class: 'model-row' }, [info, del]),
26
+ ]))
27
+
28
+ let cached = new Set()
29
+ function render() {
30
+ const cur = getCurrentModel().id
31
+ sel.replaceChildren(...models.map((m) =>
32
+ el('option', { value: m.id }, `${m.label} · ${fmtBytes(m.bytes)}${cached.has(m.id) ? ' · ✓ downloaded' : ''}`)))
33
+ sel.value = cur
34
+ const m = getCurrentModel()
35
+ info.textContent = `${m.params} · ${fmtBytes(m.bytes)} · ${cached.has(m.id) ? 'cached on device' : 'downloads on first use'}`
36
+ del.style.display = cached.has(m.id) ? '' : 'none'
37
+ }
38
+ async function refresh() { cached = await cachedSet(); render() }
39
+
40
+ sel.addEventListener('change', async () => { await setModel(sel.value); render(); onChange && onChange(sel.value) })
41
+ del.addEventListener('click', async () => {
42
+ del.disabled = true; const prev = info.textContent; info.textContent = 'deleting from cache…'
43
+ try { await deleteCached(sel.value) } catch (e) { info.textContent = 'delete failed: ' + (e.message || e) }
44
+ await refresh(); del.disabled = false
45
+ if (info.textContent.startsWith('delete failed')) setTimeout(() => { info.textContent = prev }, 2500)
46
+ })
47
+
48
+ render()
49
+ refresh()
50
+ return { refresh }
51
+ }
web/modelCatalog.js ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Curated small instruct models for the IN-BROWSER (wllama / llama.cpp WASM) path.
2
+ // Constraints that shaped this list (verified June 2026 via the HF API):
3
+ // • Ungated — wllama fetches the GGUF anonymously; gated repos (official Llama/
4
+ // Gemma) won't load, so we use ungated mirrors (bartowski, unsloth, Qwen, SmolLM).
5
+ // • ≤ ~2 GB single file — the browser's ArrayBuffer cap is 2 GB (bigger needs split
6
+ // GGUFs). That's why Gemma 4 E2B (3.1 GB) and Qwen3-4B (2.5 GB) are server-only,
7
+ // and big ones like Qwen3.5-9B / MiniMax don't fit in-browser at all.
8
+ // • Q4_K_M quant (good size/quality) where available — the SmolLM2 360M entry is Q8_0 — CPU-WASM friendly.
9
+ // Sizes are the real download bytes. The hackathon's "≤32B" is the *runtime* cap; the
10
+ // browser limit is far smaller, so this list tops out at 3B.
11
+ export const MODELS = [
12
+ { id: 'smollm2-360m', label: 'SmolLM2 360M', params: '360M', bytes: 386e6,
13
+ repo: 'HuggingFaceTB/SmolLM2-360M-Instruct-GGUF', file: 'smollm2-360m-instruct-q8_0.gguf',
14
+ note: 'tiniest — fastest, roughest' },
15
+ { id: 'qwen2.5-0.5b', label: 'Qwen2.5 0.5B', params: '0.5B', bytes: 491e6,
16
+ repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF', file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf',
17
+ note: 'default — good speed/quality balance' },
18
+ { id: 'llama3.2-1b', label: 'Llama 3.2 1B', params: '1B', bytes: 808e6,
19
+ repo: 'bartowski/Llama-3.2-1B-Instruct-GGUF', file: 'Llama-3.2-1B-Instruct-Q4_K_M.gguf',
20
+ note: 'solid 1B all-rounder' },
21
+ { id: 'smollm2-1.7b', label: 'SmolLM2 1.7B', params: '1.7B', bytes: 1056e6,
22
+ repo: 'HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF', file: 'smollm2-1.7b-instruct-q4_k_m.gguf',
23
+ note: 'strong tiny model' },
24
+ { id: 'qwen3-1.7b', label: 'Qwen3 1.7B', params: '1.7B', bytes: 1107e6,
25
+ repo: 'unsloth/Qwen3-1.7B-GGUF', file: 'Qwen3-1.7B-Q4_K_M.gguf',
26
+ note: 'newer Qwen3 — has a thinking mode' },
27
+ { id: 'qwen2.5-1.5b', label: 'Qwen2.5 1.5B', params: '1.5B', bytes: 1117e6,
28
+ repo: 'Qwen/Qwen2.5-1.5B-Instruct-GGUF', file: 'qwen2.5-1.5b-instruct-q4_k_m.gguf',
29
+ note: 'reliable, clean JSON' },
30
+ { id: 'llama3.2-3b', label: 'Llama 3.2 3B', params: '3B', bytes: 2019e6,
31
+ repo: 'bartowski/Llama-3.2-3B-Instruct-GGUF', file: 'Llama-3.2-3B-Instruct-Q4_K_M.gguf',
32
+ note: 'bigger/better, slower in-browser' },
33
+ { id: 'qwen2.5-3b', label: 'Qwen2.5 3B', params: '3B', bytes: 2105e6,
34
+ repo: 'Qwen/Qwen2.5-3B-Instruct-GGUF', file: 'qwen2.5-3b-instruct-q4_k_m.gguf',
35
+ note: 'best quality here; near the 2 GB browser limit' },
36
+ ]
37
+
38
+ export const DEFAULT_MODEL = 'qwen2.5-0.5b'
39
+
40
+ export const getModel = (id) => MODELS.find((m) => m.id === id) || MODELS.find((m) => m.id === DEFAULT_MODEL)
41
+
42
+ export const fmtBytes = (b) => (b >= 1e9 ? (b / 1e9).toFixed(1) + ' GB' : Math.round(b / 1e6) + ' MB')
web/personaPanel.js CHANGED
@@ -1,8 +1,9 @@
1
  // Tiny Army persona panel — vanilla DOM, mounted by tiny.js into #persona-stage.
2
- // Generation runs ON THE USER'S DEVICE via wllama (llama.cpp WASM) no server, no
3
- // cloud (🔌 Off the Grid + 🦙 Llama Champion). Reuses woid's persona JSON parser
4
- // (personaParse.js) + live-extraction (extractLivePersona) verbatim.
5
- import { streamChat, ensureModel, modelLabel } from '/web/wllamaLlm.js'
 
6
  import { extractLivePersona } from '/web/personaStream.js'
7
  import { parsePersonaJson } from '/web/personaParse.js'
8
  import { PERSONA_SYSTEM, personaUserPrompt } from '/web/personaPrompts.js'
@@ -13,7 +14,6 @@ function el(tag, props = {}, kids = []) {
13
  const n = document.createElement(tag)
14
  for (const [k, v] of Object.entries(props)) {
15
  if (k === 'class') n.className = v
16
- else if (k === 'html') n.innerHTML = v
17
  else if (k.startsWith('on') && typeof v === 'function') n.addEventListener(k.slice(2), v)
18
  else if (v != null) n.setAttribute(k, v)
19
  }
@@ -21,50 +21,61 @@ function el(tag, props = {}, kids = []) {
21
  return n
22
  }
23
 
24
- export function mountPersonaPanel(host, opts = {}) {
25
- const classes = opts.classes || CLASSES
26
-
27
- const sel = el('select', { class: 'persona-input' }, classes.map((c) => el('option', { value: c }, c)))
28
  const seed = el('input', { class: 'persona-input', type: 'text', placeholder: 'a word, a vibe… (optional)' })
 
29
  const status = el('div', { class: 'persona-status' }, 'Runs on your device — no cloud.')
30
  const btn = el('button', { class: 'persona-go', type: 'button' }, '⚔ Recruit a soldier')
31
 
32
  const nameEl = el('div', { class: 'persona-name' }, 'Your soldier')
33
  const tagsEl = el('div', { class: 'persona-tags' })
34
  const aboutEl = el('div', { class: 'persona-about' }, 'Pick a class and recruit — a small llama.cpp model in your browser writes their legend.')
 
 
35
 
36
  const controls = el('aside', { class: 'persona-controls' }, [
37
- el('h2', { class: 'persona-title' }, 'Recruit'),
38
  el('label', { class: 'persona-label' }, 'Class'), sel,
39
  el('label', { class: 'persona-label' }, 'Seed'), seed,
40
- btn, status,
41
  ])
42
- const result = el('div', { class: 'persona-result' }, [nameEl, tagsEl, aboutEl])
43
  host.appendChild(el('div', { class: 'persona-view' }, [controls, result]))
44
 
 
 
45
  function setTags(p) {
46
  tagsEl.replaceChildren(...[p.specialty, p.personality, p.vibe].filter(Boolean)
47
  .map((t) => el('span', { class: 'persona-tag' }, t)))
48
  }
 
 
 
49
 
50
  let busy = false
51
  async function generate() {
52
  if (busy) return
53
  busy = true; btn.disabled = true
54
  nameEl.textContent = '…'; aboutEl.textContent = ''; tagsEl.replaceChildren()
 
55
  try {
56
- status.textContent = 'loading the model into your browser…'
57
- await ensureModel((frac) => { status.textContent = `downloading model… ${Math.round(frac * 100)}% (one-time, then cached)` })
58
- status.textContent = `writing with ${modelLabel()} — on your device…`
59
  let acc = ''
60
  await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value), {
61
- maxTokens: 200,
62
  onToken: (piece) => {
63
  acc += piece
 
 
64
  const live = extractLivePersona(acc)
65
  if (live.name) nameEl.textContent = live.name
66
  if (live.about) aboutEl.textContent = live.about
67
  },
 
68
  })
69
  try {
70
  const p = parsePersonaJson(acc)
@@ -72,9 +83,11 @@ export function mountPersonaPanel(host, opts = {}) {
72
  aboutEl.textContent = p.about
73
  setTags(p)
74
  status.textContent = 'enlisted ✓ (generated locally)'
 
75
  } catch (e) {
76
  status.textContent = `the model rambled — couldn't parse a clean persona (${e.message || e})`
77
  }
 
78
  } catch (e) {
79
  status.textContent = `couldn't run the local model: ${e.message || e}`
80
  } finally {
 
1
  // Tiny Army persona panel — vanilla DOM, mounted by tiny.js into #persona-stage.
2
+ // Generation runs ON THE USER'S DEVICE via wllama (llama.cpp WASM). Model is pickable
3
+ // (modelBar), generation streams into a live "thinking" view + parsed result, and we
4
+ // show tok/s. Reuses woid's persona parser + extractLivePersona verbatim.
5
+ import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
6
+ import { mountModelBar } from '/web/modelBar.js'
7
  import { extractLivePersona } from '/web/personaStream.js'
8
  import { parsePersonaJson } from '/web/personaParse.js'
9
  import { PERSONA_SYSTEM, personaUserPrompt } from '/web/personaPrompts.js'
 
14
  const n = document.createElement(tag)
15
  for (const [k, v] of Object.entries(props)) {
16
  if (k === 'class') n.className = v
 
17
  else if (k.startsWith('on') && typeof v === 'function') n.addEventListener(k.slice(2), v)
18
  else if (v != null) n.setAttribute(k, v)
19
  }
 
21
  return n
22
  }
23
 
24
+ export function mountPersonaPanel(host) {
25
+ const modelHost = el('div')
26
+ const sel = el('select', { class: 'persona-input' }, CLASSES.map((c) => el('option', { value: c }, c)))
 
27
  const seed = el('input', { class: 'persona-input', type: 'text', placeholder: 'a word, a vibe… (optional)' })
28
+ const stats = el('div', { class: 'persona-stats' })
29
  const status = el('div', { class: 'persona-status' }, 'Runs on your device — no cloud.')
30
  const btn = el('button', { class: 'persona-go', type: 'button' }, '⚔ Recruit a soldier')
31
 
32
  const nameEl = el('div', { class: 'persona-name' }, 'Your soldier')
33
  const tagsEl = el('div', { class: 'persona-tags' })
34
  const aboutEl = el('div', { class: 'persona-about' }, 'Pick a class and recruit — a small llama.cpp model in your browser writes their legend.')
35
+ const thinkEl = el('pre', { class: 'persona-think' })
36
+ const thinkWrap = el('details', { class: 'persona-think-wrap' }, [el('summary', {}, 'model output (raw)'), thinkEl])
37
 
38
  const controls = el('aside', { class: 'persona-controls' }, [
39
+ modelHost,
40
  el('label', { class: 'persona-label' }, 'Class'), sel,
41
  el('label', { class: 'persona-label' }, 'Seed'), seed,
42
+ btn, stats, status,
43
  ])
44
+ const result = el('div', { class: 'persona-result' }, [nameEl, tagsEl, aboutEl, thinkWrap])
45
  host.appendChild(el('div', { class: 'persona-view' }, [controls, result]))
46
 
47
+ const bar = mountModelBar(modelHost)
48
+
49
  function setTags(p) {
50
  tagsEl.replaceChildren(...[p.specialty, p.personality, p.vibe].filter(Boolean)
51
  .map((t) => el('span', { class: 'persona-tag' }, t)))
52
  }
53
+ function showStats(s) {
54
+ stats.textContent = `● ${s.tokPerSec} tok/s · ${s.tokens} tok${s.ttftSeconds != null ? ` · first ${s.ttftSeconds}s` : ''}`
55
+ }
56
 
57
  let busy = false
58
  async function generate() {
59
  if (busy) return
60
  busy = true; btn.disabled = true
61
  nameEl.textContent = '…'; aboutEl.textContent = ''; tagsEl.replaceChildren()
62
+ thinkEl.textContent = ''; thinkWrap.open = true; stats.textContent = ''
63
  try {
64
+ status.textContent = `loading ${getCurrentModel().label} into your browser…`
65
+ await ensureModel((frac) => { status.textContent = `downloading ${getCurrentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
66
+ status.textContent = `writing on your device with ${getCurrentModel().label}…`
67
  let acc = ''
68
  await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value), {
69
+ maxTokens: 220,
70
  onToken: (piece) => {
71
  acc += piece
72
+ thinkEl.textContent = acc
73
+ thinkEl.scrollTop = thinkEl.scrollHeight
74
  const live = extractLivePersona(acc)
75
  if (live.name) nameEl.textContent = live.name
76
  if (live.about) aboutEl.textContent = live.about
77
  },
78
+ onStats: showStats,
79
  })
80
  try {
81
  const p = parsePersonaJson(acc)
 
83
  aboutEl.textContent = p.about
84
  setTags(p)
85
  status.textContent = 'enlisted ✓ (generated locally)'
86
+ thinkWrap.open = false
87
  } catch (e) {
88
  status.textContent = `the model rambled — couldn't parse a clean persona (${e.message || e})`
89
  }
90
+ bar.refresh() // it's now cached
91
  } catch (e) {
92
  status.textContent = `couldn't run the local model: ${e.message || e}`
93
  } finally {
web/shell/persona.css CHANGED
@@ -63,6 +63,44 @@
63
  white-space: pre-wrap;
64
  }
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  @media (max-width: 768px) {
67
  .persona-view { flex-direction: column; }
68
  .persona-controls { width: 100%; border-right: 0; border-bottom: 2px solid var(--p-ink); }
 
63
  white-space: pre-wrap;
64
  }
65
 
66
+ /* ── Model picker + cache controls ─────────────────────────────────────────── */
67
+ .model-bar { display: flex; flex-direction: column; gap: 4px; padding-bottom: 10px; margin-bottom: 6px; border-bottom: 1px dashed var(--p-ink); }
68
+ .model-select {
69
+ font-family: var(--p-sans) !important; font-size: 13px !important; color: var(--p-ink) !important;
70
+ background: var(--p-card) !important; border: 1.5px solid var(--p-ink) !important;
71
+ border-radius: 0 !important; padding: 6px 8px !important; width: 100%;
72
+ }
73
+ .model-row { display: flex; align-items: center; justify-content: space-between; gap: 8px; }
74
+ .model-info { font-family: var(--p-mono); font-size: 9px; letter-spacing: .04em; color: var(--p-muted); line-height: 1.4; flex: 1; }
75
+ .model-del {
76
+ font-family: var(--p-mono) !important; font-size: 9px !important; letter-spacing: .04em; text-transform: uppercase;
77
+ color: var(--p-transmit) !important; background: var(--p-card) !important; border: 1.5px solid var(--p-transmit) !important;
78
+ border-radius: 0 !important; padding: 3px 6px !important; cursor: pointer; flex-shrink: 0;
79
+ }
80
+ .model-del:hover { background: var(--p-transmit) !important; color: var(--p-card) !important; }
81
+ .model-del:disabled { opacity: .5; cursor: default; }
82
+
83
+ /* ── Live stats (tok/s) ────────────────────────────────────────────────────── */
84
+ .persona-stats {
85
+ font-family: var(--p-mono); font-size: 11px; letter-spacing: .04em; color: var(--p-transmit);
86
+ min-height: 14px; margin-top: 6px;
87
+ }
88
+
89
+ /* ── "Thinking" raw stream (see progress as tokens arrive) ──────────────────── */
90
+ .persona-think-wrap { margin-top: 22px; }
91
+ .persona-think-wrap > summary {
92
+ cursor: pointer; font-family: var(--p-mono); font-size: 10px; letter-spacing: .12em; text-transform: uppercase;
93
+ color: var(--p-muted); list-style: none;
94
+ }
95
+ .persona-think-wrap > summary::-webkit-details-marker { display: none; }
96
+ .persona-think-wrap > summary::before { content: '▸ '; }
97
+ .persona-think-wrap[open] > summary::before { content: '▾ '; }
98
+ .persona-think {
99
+ margin: 8px 0 0; max-height: 240px; overflow-y: auto; white-space: pre-wrap; word-break: break-word;
100
+ font-family: var(--p-mono); font-size: 11px; line-height: 1.5; color: var(--p-muted);
101
+ background: var(--p-paper-2); border: 1px solid var(--p-ink); padding: 8px 10px;
102
+ }
103
+
104
  @media (max-width: 768px) {
105
  .persona-view { flex-direction: column; }
106
  .persona-controls { width: 100%; border-right: 0; border-bottom: 2px solid var(--p-ink); }
web/wllamaLlm.js CHANGED
@@ -1,55 +1,105 @@
1
- // In-browser llama.cpp via wllama (WASM) runs a GGUF from Hugging Face on the
2
- // USER's device. This is the local-first path: zero cloud/server inference, so it
3
- // earns 🔌 Off the Grid AND 🦙 Llama Champion (wllama IS llama.cpp, compiled to
4
- // WebAssembly). It also dodges the free Space's ~0.6 tok/s CPU — generation runs on
5
- // the visitor's hardware (typically 10–50× faster). Streams tokens like the server
6
- // path did, so the persona/diary panels barely change.
7
- import { Wllama } from 'https://cdn.jsdelivr.net/npm/@wllama/wllama@3.4.1/esm/index.js'
8
 
9
  const WLLAMA_VER = '3.4.1'
10
  const WASM = { default: `https://cdn.jsdelivr.net/npm/@wllama/wllama@${WLLAMA_VER}/esm/wasm/wllama.wasm` }
11
- // Small instruct GGUF: ~380 MB, downloaded once then cached by the browser.
12
- const MODEL = { repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF', file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf' }
13
 
 
14
  let _wllama = null
 
15
  let _loadPromise = null
16
  let _chain = Promise.resolve() // serialize completions (one model, no parallel decode)
 
17
 
18
- export function modelLabel() { return MODEL.repo.split('/').pop() }
 
19
 
20
- // Lazy-load wllama + the GGUF (cached after first download). onProgress(fraction 0..1).
 
 
 
 
 
 
 
 
21
  export function ensureModel(onProgress) {
22
- if (_wllama) return Promise.resolve(_wllama)
 
23
  if (_loadPromise) return _loadPromise
24
  _loadPromise = (async () => {
25
  const w = new Wllama(WASM)
26
- await w.loadModelFromHF(MODEL, {
27
  n_ctx: 2048,
28
  progressCallback: ({ loaded, total }) => onProgress && onProgress(total ? loaded / total : 0),
29
  })
30
- _wllama = w
31
  return w
32
  })().catch((e) => { _loadPromise = null; throw e })
33
  return _loadPromise
34
  }
35
 
36
- // Stream a chat completion in-browser. Calls onToken(piece) per chunk; returns full text.
37
- // Serialized so two panels can't decode at once.
38
- export function streamChat(system, user, { maxTokens = 200, temperature = 0.8, onToken } = {}) {
39
  const run = async () => {
40
  const w = await ensureModel()
41
- let full = ''
 
 
 
 
 
 
 
 
 
42
  const stream = await w.createChatCompletion({
43
  messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
44
  max_tokens: maxTokens, temperature, top_k: 40, top_p: 0.9, stream: true,
45
  })
46
  for await (const chunk of stream) {
47
  const piece = chunk?.choices?.[0]?.delta?.content || ''
48
- if (piece) { full += piece; if (onToken) onToken(piece) }
 
 
 
 
49
  }
50
- return full
 
 
51
  }
52
  const p = _chain.then(run, run)
53
- _chain = p.catch(() => {}) // keep the chain alive after errors
54
  return p
55
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // In-browser llama.cpp via wllama (WASM). Local-first (🔌 Off the Grid) + llama.cpp
2
+ // (🦙 Llama Champion). Adds model selection from a catalog, cache management
3
+ // (download/delete via wllama's ModelManager), and live generation stats (tok/s).
4
+ import { Wllama, ModelManager } from 'https://cdn.jsdelivr.net/npm/@wllama/wllama@3.4.1/esm/index.js'
5
+ import { MODELS, getModel, DEFAULT_MODEL } from '/web/modelCatalog.js'
 
 
6
 
7
  const WLLAMA_VER = '3.4.1'
8
  const WASM = { default: `https://cdn.jsdelivr.net/npm/@wllama/wllama@${WLLAMA_VER}/esm/wasm/wllama.wasm` }
 
 
9
 
10
+ let currentId = DEFAULT_MODEL
11
  let _wllama = null
12
+ let _loadedId = null
13
  let _loadPromise = null
14
  let _chain = Promise.resolve() // serialize completions (one model, no parallel decode)
15
+ const mm = new ModelManager()
16
 
17
+ export function listModels() { return MODELS }
18
+ export function getCurrentModel() { return getModel(currentId) }
19
 
20
+ // Switch the active model. Unloads the loaded one so the next ensureModel loads fresh.
21
+ export async function setModel(id) {
22
+ if (id === currentId) return
23
+ currentId = id
24
+ if (_wllama) { try { await _wllama.exit() } catch { /* ignore */ } }
25
+ _wllama = null; _loadedId = null; _loadPromise = null
26
+ }
27
+
28
+ // Load (download + init) the current model. onProgress(fraction 0..1) during download.
29
  export function ensureModel(onProgress) {
30
+ const m = getModel(currentId)
31
+ if (_wllama && _loadedId === m.id) return Promise.resolve(_wllama)
32
  if (_loadPromise) return _loadPromise
33
  _loadPromise = (async () => {
34
  const w = new Wllama(WASM)
35
+ await w.loadModelFromHF({ repo: m.repo, file: m.file }, {
36
  n_ctx: 2048,
37
  progressCallback: ({ loaded, total }) => onProgress && onProgress(total ? loaded / total : 0),
38
  })
39
+ _wllama = w; _loadedId = m.id
40
  return w
41
  })().catch((e) => { _loadPromise = null; throw e })
42
  return _loadPromise
43
  }
44
 
45
+ // Stream a chat completion in-browser. onToken(piece); onStats({tokens,seconds,tokPerSec,ttftSeconds,final}).
46
+ // Serialized so two panels can't decode at once. Returns { text, stats }.
47
+ export function streamChat(system, user, { maxTokens = 200, temperature = 0.8, onToken, onStats } = {}) {
48
  const run = async () => {
49
  const w = await ensureModel()
50
+ let full = ''; let n = 0
51
+ const t0 = performance.now(); let tFirst = null
52
+ const emitStats = (final) => {
53
+ if (!onStats) return
54
+ const secs = (performance.now() - t0) / 1000
55
+ const gen = tFirst ? (performance.now() - tFirst) / 1000 : 0
56
+ onStats({ tokens: n, seconds: +secs.toFixed(1),
57
+ tokPerSec: gen > 0 ? +(n / gen).toFixed(1) : 0,
58
+ ttftSeconds: tFirst ? +((tFirst - t0) / 1000).toFixed(1) : null, final: !!final })
59
+ }
60
  const stream = await w.createChatCompletion({
61
  messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
62
  max_tokens: maxTokens, temperature, top_k: 40, top_p: 0.9, stream: true,
63
  })
64
  for await (const chunk of stream) {
65
  const piece = chunk?.choices?.[0]?.delta?.content || ''
66
+ if (!piece) continue
67
+ if (tFirst === null) tFirst = performance.now()
68
+ full += piece; n++
69
+ if (onToken) onToken(piece)
70
+ emitStats(false)
71
  }
72
+ emitStats(true)
73
+ const gen = tFirst ? (performance.now() - tFirst) / 1000 : (performance.now() - t0) / 1000
74
+ return { text: full, stats: { tokens: n, tokPerSec: gen > 0 ? +(n / gen).toFixed(1) : 0 } }
75
  }
76
  const p = _chain.then(run, run)
77
+ _chain = p.catch(() => {})
78
  return p
79
  }
80
+
81
+ // ── Cache management (wllama ModelManager) ────────────────────────────────────
82
+ // Match cached files to catalog entries by GGUF filename (cache names embed it).
83
+ async function _cachedModels() {
84
+ try { return await mm.getModels() } catch { return [] }
85
+ }
86
+ function _matches(model, entry) {
87
+ const names = (model.files || []).map((f) => f.name || '').join('|')
88
+ return names.includes(entry.file)
89
+ }
90
+
91
+ // Set of catalog model ids currently downloaded in the browser.
92
+ export async function cachedSet() {
93
+ const models = await _cachedModels()
94
+ const ids = new Set()
95
+ for (const m of models) for (const c of MODELS) if (_matches(m, c)) ids.add(c.id)
96
+ return ids
97
+ }
98
+
99
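+ // Delete a model from the browser cache (unloading it first if it's the active one). NOTE: getModel() falls back to the default catalog entry for an unknown id, so an unrecognized id targets the default model's cached files.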
100
+ export async function deleteCached(id) {
101
+ const c = getModel(id)
102
+ if (_loadedId === id && _wllama) { try { await _wllama.exit() } catch { /* ignore */ } _wllama = null; _loadedId = null; _loadPromise = null }
103
+ const models = await _cachedModels()
104
+ for (const m of models) if (_matches(m, c) && m.remove) await m.remove()
105
+ }