polats Claude Opus 4.8 (1M context) committed on
Commit
a4ca9e9
·
1 Parent(s): 8eac3eb

Add Qwen3-0.6B; surface WebGPU backend; strip <think> from answers

Browse files

- Catalog: add Qwen3-0.6B (unsloth GGUF, 397MB, ungated) — newest tiny model.
- wllama V3 (3.4.1) already ships the llama.cpp WebGPU backend (auto-enabled, 10–15× faster
than WASM on capable browsers); show ⚡WebGPU vs CPU(WASM) in the model bar.
- stripThink(): Qwen3's <think>…</think> reasoning shows in the raw 'thinking' view
but is removed from the parsed persona / diary prose.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

web/diaryPanel.js CHANGED
@@ -3,7 +3,7 @@
3
  // persona styling (.persona-*), the model picker (modelBar), and tok/s stats.
4
  import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
5
  import { mountModelBar } from '/web/modelBar.js'
6
- import { DIARY_SYSTEM, diaryUserPrompt } from '/web/personaPrompts.js'
7
 
8
  function el(tag, props = {}, kids = []) {
9
  const n = document.createElement(tag)
@@ -46,10 +46,10 @@ export function mountDiaryPanel(host) {
46
  status.textContent = `loading ${getCurrentModel().label} into your browser…`
47
  await ensureModel((frac) => { status.textContent = `downloading ${getCurrentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
48
  status.textContent = `writing on your device with ${getCurrentModel().label}…`
49
- let first = true
50
  await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
51
  maxTokens: 220, temperature: 0.9,
52
- onToken: (piece) => { if (first) { out.textContent = header; first = false } out.textContent += piece },
53
  onStats: (s) => { stats.textContent = `● ${s.tokPerSec} tok/s · ${s.tokens} tok${s.ttftSeconds != null ? ` · first ${s.ttftSeconds}s` : ''}` },
54
  })
55
  status.textContent = 'written ✓ (generated locally)'
 
3
  // persona styling (.persona-*), the model picker (modelBar), and tok/s stats.
4
  import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
5
  import { mountModelBar } from '/web/modelBar.js'
6
+ import { DIARY_SYSTEM, diaryUserPrompt, stripThink } from '/web/personaPrompts.js'
7
 
8
  function el(tag, props = {}, kids = []) {
9
  const n = document.createElement(tag)
 
46
  status.textContent = `loading ${getCurrentModel().label} into your browser…`
47
  await ensureModel((frac) => { status.textContent = `downloading ${getCurrentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
48
  status.textContent = `writing on your device with ${getCurrentModel().label}…`
49
+ let raw = ''
50
  await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
51
  maxTokens: 220, temperature: 0.9,
52
+ onToken: (piece) => { raw += piece; out.textContent = header + stripThink(raw) }, // hide <think>
53
  onStats: (s) => { stats.textContent = `● ${s.tokPerSec} tok/s · ${s.tokens} tok${s.ttftSeconds != null ? ` · first ${s.ttftSeconds}s` : ''}` },
54
  })
55
  status.textContent = 'written ✓ (generated locally)'
web/modelBar.js CHANGED
@@ -1,7 +1,7 @@
1
  // Shared model picker + cache controls for the in-browser panels. Lets you choose a
2
  // model from the catalog (showing size + whether it's already downloaded) and delete
3
  // a downloaded model from the browser cache — like the wllama demo space.
4
- import { listModels, getCurrentModel, setModel, cachedSet, deleteCached } from '/web/wllamaLlm.js'
5
  import { fmtBytes } from '/web/modelCatalog.js'
6
 
7
  function el(tag, props = {}, kids = []) {
@@ -32,7 +32,7 @@ export function mountModelBar(host, { onChange } = {}) {
32
  el('option', { value: m.id }, `${m.label} · ${fmtBytes(m.bytes)}${cached.has(m.id) ? ' · ✓ downloaded' : ''}`)))
33
  sel.value = cur
34
  const m = getCurrentModel()
35
- info.textContent = `${m.params} · ${fmtBytes(m.bytes)} · ${cached.has(m.id) ? 'cached on device' : 'downloads on first use'}`
36
  del.style.display = cached.has(m.id) ? '' : 'none'
37
  }
38
  async function refresh() { cached = await cachedSet(); render() }
 
1
  // Shared model picker + cache controls for the in-browser panels. Lets you choose a
2
  // model from the catalog (showing size + whether it's already downloaded) and delete
3
  // a downloaded model from the browser cache — like the wllama demo space.
4
+ import { listModels, getCurrentModel, setModel, cachedSet, deleteCached, backendLabel } from '/web/wllamaLlm.js'
5
  import { fmtBytes } from '/web/modelCatalog.js'
6
 
7
  function el(tag, props = {}, kids = []) {
 
32
  el('option', { value: m.id }, `${m.label} · ${fmtBytes(m.bytes)}${cached.has(m.id) ? ' · ✓ downloaded' : ''}`)))
33
  sel.value = cur
34
  const m = getCurrentModel()
35
+ info.textContent = `${m.params} · ${fmtBytes(m.bytes)} · ${backendLabel()} · ${cached.has(m.id) ? 'cached' : 'downloads on first use'}`
36
  del.style.display = cached.has(m.id) ? '' : 'none'
37
  }
38
  async function refresh() { cached = await cachedSet(); render() }
web/modelCatalog.js CHANGED
@@ -12,9 +12,12 @@ export const MODELS = [
12
  { id: 'smollm2-360m', label: 'SmolLM2 360M', params: '360M', bytes: 386e6,
13
  repo: 'HuggingFaceTB/SmolLM2-360M-Instruct-GGUF', file: 'smollm2-360m-instruct-q8_0.gguf',
14
  note: 'tiniest — fastest, roughest' },
 
 
 
15
  { id: 'qwen2.5-0.5b', label: 'Qwen2.5 0.5B', params: '0.5B', bytes: 491e6,
16
  repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF', file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf',
17
- note: 'default — good speed/quality balance' },
18
  { id: 'llama3.2-1b', label: 'Llama 3.2 1B', params: '1B', bytes: 808e6,
19
  repo: 'bartowski/Llama-3.2-1B-Instruct-GGUF', file: 'Llama-3.2-1B-Instruct-Q4_K_M.gguf',
20
  note: 'solid 1B all-rounder' },
 
12
  { id: 'smollm2-360m', label: 'SmolLM2 360M', params: '360M', bytes: 386e6,
13
  repo: 'HuggingFaceTB/SmolLM2-360M-Instruct-GGUF', file: 'smollm2-360m-instruct-q8_0.gguf',
14
  note: 'tiniest — fastest, roughest' },
15
+ { id: 'qwen3-0.6b', label: 'Qwen3 0.6B', params: '0.6B', bytes: 397e6, thinks: true,
16
+ repo: 'unsloth/Qwen3-0.6B-GGUF', file: 'Qwen3-0.6B-Q4_K_M.gguf',
17
+ note: 'newest tiny — strong, has a thinking mode' },
18
  { id: 'qwen2.5-0.5b', label: 'Qwen2.5 0.5B', params: '0.5B', bytes: 491e6,
19
  repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF', file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf',
20
+ note: 'default — fast, clean JSON, no thinking overhead' },
21
  { id: 'llama3.2-1b', label: 'Llama 3.2 1B', params: '1B', bytes: 808e6,
22
  repo: 'bartowski/Llama-3.2-1B-Instruct-GGUF', file: 'Llama-3.2-1B-Instruct-Q4_K_M.gguf',
23
  note: 'solid 1B all-rounder' },
web/personaPanel.js CHANGED
@@ -6,7 +6,7 @@ import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
6
  import { mountModelBar } from '/web/modelBar.js'
7
  import { extractLivePersona } from '/web/personaStream.js'
8
  import { parsePersonaJson } from '/web/personaParse.js'
9
- import { PERSONA_SYSTEM, personaUserPrompt } from '/web/personaPrompts.js'
10
 
11
  const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
12
 
@@ -69,16 +69,16 @@ export function mountPersonaPanel(host) {
69
  maxTokens: 220,
70
  onToken: (piece) => {
71
  acc += piece
72
- thinkEl.textContent = acc
73
  thinkEl.scrollTop = thinkEl.scrollHeight
74
- const live = extractLivePersona(acc)
75
  if (live.name) nameEl.textContent = live.name
76
  if (live.about) aboutEl.textContent = live.about
77
  },
78
  onStats: showStats,
79
  })
80
  try {
81
- const p = parsePersonaJson(acc)
82
  if (p.name) nameEl.textContent = p.name
83
  aboutEl.textContent = p.about
84
  setTags(p)
 
6
  import { mountModelBar } from '/web/modelBar.js'
7
  import { extractLivePersona } from '/web/personaStream.js'
8
  import { parsePersonaJson } from '/web/personaParse.js'
9
+ import { PERSONA_SYSTEM, personaUserPrompt, stripThink } from '/web/personaPrompts.js'
10
 
11
  const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
12
 
 
69
  maxTokens: 220,
70
  onToken: (piece) => {
71
  acc += piece
72
+ thinkEl.textContent = acc // raw view shows the model's <think> reasoning too
73
  thinkEl.scrollTop = thinkEl.scrollHeight
74
+ const live = extractLivePersona(stripThink(acc))
75
  if (live.name) nameEl.textContent = live.name
76
  if (live.about) aboutEl.textContent = live.about
77
  },
78
  onStats: showStats,
79
  })
80
  try {
81
+ const p = parsePersonaJson(stripThink(acc))
82
  if (p.name) nameEl.textContent = p.name
83
  aboutEl.textContent = p.about
84
  setTags(p)
web/personaPrompts.js CHANGED
@@ -28,3 +28,12 @@ export function diaryUserPrompt(unit = '', traits = '') {
28
  const t = (traits || 'untested').trim()
29
  return `Name: ${u}. Traits: ${t}. Write the diary entry.`
30
  }
 
 
 
 
 
 
 
 
 
 
28
  const t = (traits || 'untested').trim()
29
  return `Name: ${u}. Traits: ${t}. Write the diary entry.`
30
  }
31
+
32
// Remove a model's <think>…</think> reasoning (Qwen3 etc.) from the visible answer —
// including a still-open, unterminated block while it's mid-thought, and an opening
// tag that has only partially streamed in so far (e.g. a trailing "<thi").
/**
 * Strip model reasoning from text that may still be streaming.
 *
 * Steps, in order:
 *  1. drop every complete <think>…</think> block (case-insensitive, spans newlines);
 *  2. drop an unterminated <think>… block that runs to the end of the text;
 *  3. drop a trailing prefix of the opening tag ("<", "<t", … "<think"), so a tag
 *     split across streamed pieces never shows up in the visible answer;
 *  4. trim leading whitespace left behind by the removed reasoning.
 *
 * @param {unknown} text Raw model output so far; falsy values are treated as ''.
 * @returns {string} The text with reasoning removed and leading whitespace trimmed.
 */
export function stripThink(text) {
  return String(text || '')
    .replace(/<think>[\s\S]*?<\/think>/gi, '')
    .replace(/<think>[\s\S]*$/i, '')
    // A half-received opening tag at the very end of the buffer: hide it until the
    // next piece shows whether it really is <think> (a "<" mid-text is untouched).
    .replace(/<(?:t(?:h(?:i(?:n(?:k)?)?)?)?)?$/i, '')
    .trimStart();
}
web/wllamaLlm.js CHANGED
@@ -17,6 +17,13 @@ const mm = new ModelManager()
17
  export function listModels() { return MODELS }
18
  export function getCurrentModel() { return getModel(currentId) }
19
 
 
 
 
 
 
 
 
20
  // Switch the active model. Unloads the loaded one so the next ensureModel loads fresh.
21
  export async function setModel(id) {
22
  if (id === currentId) return
 
17
  export function listModels() { return MODELS }
18
  export function getCurrentModel() { return getModel(currentId) }
19
 
20
// wllama V3 auto-uses the llama.cpp WebGPU backend when the browser exposes one
// (10–15× faster than the WASM CPU fallback). Best-effort label for the UI.
/**
 * Synchronous, best-effort name of the inference backend for display.
 *
 * Only checks whether a global `navigator` exists and has a truthy `gpu`
 * property; it does not request an adapter. Any error raised while probing
 * falls back to the CPU label.
 *
 * @returns {string} '⚡ WebGPU' when `navigator.gpu` is truthy, else 'CPU (WASM)'.
 */
export function backendLabel() {
  const cpuLabel = 'CPU (WASM)';
  try {
    // Non-browser environments may have no `navigator` global at all.
    if (typeof navigator === 'undefined') return cpuLabel;
    if (navigator.gpu) return '⚡ WebGPU';
    return cpuLabel;
  } catch {
    return cpuLabel;
  }
}
26
+
27
  // Switch the active model. Unloads the loaded one so the next ensureModel loads fresh.
28
  export async function setModel(id) {
29
  if (id === currentId) return