Spaces:
Running
Running
Add Qwen3-0.6B; surface WebGPU backend; strip <think> from answers
Browse files
- Catalog: add Qwen3-0.6B (unsloth GGUF, 397MB, ungated) — newest tiny model.
- wllama V3 (3.4.1) already ships the llama.cpp WebGPU backend (auto-enabled, 10-15x
over WASM on capable browsers); show ⚡WebGPU vs CPU(WASM) in the model bar.
- stripThink(): Qwen3's <think>…</think> reasoning shows in the raw 'thinking' view
but is removed from the parsed persona / diary prose.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
- web/diaryPanel.js +3 -3
- web/modelBar.js +2 -2
- web/modelCatalog.js +4 -1
- web/personaPanel.js +4 -4
- web/personaPrompts.js +9 -0
- web/wllamaLlm.js +7 -0
web/diaryPanel.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
// persona styling (.persona-*), the model picker (modelBar), and tok/s stats.
|
| 4 |
import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
|
| 5 |
import { mountModelBar } from '/web/modelBar.js'
|
| 6 |
-
import { DIARY_SYSTEM, diaryUserPrompt } from '/web/personaPrompts.js'
|
| 7 |
|
| 8 |
function el(tag, props = {}, kids = []) {
|
| 9 |
const n = document.createElement(tag)
|
|
@@ -46,10 +46,10 @@ export function mountDiaryPanel(host) {
|
|
| 46 |
status.textContent = `loading ${getCurrentModel().label} into your browser…`
|
| 47 |
await ensureModel((frac) => { status.textContent = `downloading ${getCurrentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
|
| 48 |
status.textContent = `writing on your device with ${getCurrentModel().label}…`
|
| 49 |
-
let
|
| 50 |
await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
|
| 51 |
maxTokens: 220, temperature: 0.9,
|
| 52 |
-
onToken: (piece) => {
|
| 53 |
onStats: (s) => { stats.textContent = `● ${s.tokPerSec} tok/s · ${s.tokens} tok${s.ttftSeconds != null ? ` · first ${s.ttftSeconds}s` : ''}` },
|
| 54 |
})
|
| 55 |
status.textContent = 'written ✓ (generated locally)'
|
|
|
|
| 3 |
// persona styling (.persona-*), the model picker (modelBar), and tok/s stats.
|
| 4 |
import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
|
| 5 |
import { mountModelBar } from '/web/modelBar.js'
|
| 6 |
+
import { DIARY_SYSTEM, diaryUserPrompt, stripThink } from '/web/personaPrompts.js'
|
| 7 |
|
| 8 |
function el(tag, props = {}, kids = []) {
|
| 9 |
const n = document.createElement(tag)
|
|
|
|
| 46 |
status.textContent = `loading ${getCurrentModel().label} into your browser…`
|
| 47 |
await ensureModel((frac) => { status.textContent = `downloading ${getCurrentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
|
| 48 |
status.textContent = `writing on your device with ${getCurrentModel().label}…`
|
| 49 |
+
let raw = ''
|
| 50 |
await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
|
| 51 |
maxTokens: 220, temperature: 0.9,
|
| 52 |
+
onToken: (piece) => { raw += piece; out.textContent = header + stripThink(raw) }, // hide <think>
|
| 53 |
onStats: (s) => { stats.textContent = `● ${s.tokPerSec} tok/s · ${s.tokens} tok${s.ttftSeconds != null ? ` · first ${s.ttftSeconds}s` : ''}` },
|
| 54 |
})
|
| 55 |
status.textContent = 'written ✓ (generated locally)'
|
web/modelBar.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
// Shared model picker + cache controls for the in-browser panels. Lets you choose a
|
| 2 |
// model from the catalog (showing size + whether it's already downloaded) and delete
|
| 3 |
// a downloaded model from the browser cache — like the wllama demo space.
|
| 4 |
-
import { listModels, getCurrentModel, setModel, cachedSet, deleteCached } from '/web/wllamaLlm.js'
|
| 5 |
import { fmtBytes } from '/web/modelCatalog.js'
|
| 6 |
|
| 7 |
function el(tag, props = {}, kids = []) {
|
|
@@ -32,7 +32,7 @@ export function mountModelBar(host, { onChange } = {}) {
|
|
| 32 |
el('option', { value: m.id }, `${m.label} · ${fmtBytes(m.bytes)}${cached.has(m.id) ? ' · ✓ downloaded' : ''}`)))
|
| 33 |
sel.value = cur
|
| 34 |
const m = getCurrentModel()
|
| 35 |
-
info.textContent = `${m.params} · ${fmtBytes(m.bytes)} · ${cached.has(m.id) ? 'cached
|
| 36 |
del.style.display = cached.has(m.id) ? '' : 'none'
|
| 37 |
}
|
| 38 |
async function refresh() { cached = await cachedSet(); render() }
|
|
|
|
| 1 |
// Shared model picker + cache controls for the in-browser panels. Lets you choose a
|
| 2 |
// model from the catalog (showing size + whether it's already downloaded) and delete
|
| 3 |
// a downloaded model from the browser cache — like the wllama demo space.
|
| 4 |
+
import { listModels, getCurrentModel, setModel, cachedSet, deleteCached, backendLabel } from '/web/wllamaLlm.js'
|
| 5 |
import { fmtBytes } from '/web/modelCatalog.js'
|
| 6 |
|
| 7 |
function el(tag, props = {}, kids = []) {
|
|
|
|
| 32 |
el('option', { value: m.id }, `${m.label} · ${fmtBytes(m.bytes)}${cached.has(m.id) ? ' · ✓ downloaded' : ''}`)))
|
| 33 |
sel.value = cur
|
| 34 |
const m = getCurrentModel()
|
| 35 |
+
info.textContent = `${m.params} · ${fmtBytes(m.bytes)} · ${backendLabel()} · ${cached.has(m.id) ? 'cached' : 'downloads on first use'}`
|
| 36 |
del.style.display = cached.has(m.id) ? '' : 'none'
|
| 37 |
}
|
| 38 |
async function refresh() { cached = await cachedSet(); render() }
|
web/modelCatalog.js
CHANGED
|
@@ -12,9 +12,12 @@ export const MODELS = [
|
|
| 12 |
{ id: 'smollm2-360m', label: 'SmolLM2 360M', params: '360M', bytes: 386e6,
|
| 13 |
repo: 'HuggingFaceTB/SmolLM2-360M-Instruct-GGUF', file: 'smollm2-360m-instruct-q8_0.gguf',
|
| 14 |
note: 'tiniest — fastest, roughest' },
|
|
|
|
|
|
|
|
|
|
| 15 |
{ id: 'qwen2.5-0.5b', label: 'Qwen2.5 0.5B', params: '0.5B', bytes: 491e6,
|
| 16 |
repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF', file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf',
|
| 17 |
-
note: 'default —
|
| 18 |
{ id: 'llama3.2-1b', label: 'Llama 3.2 1B', params: '1B', bytes: 808e6,
|
| 19 |
repo: 'bartowski/Llama-3.2-1B-Instruct-GGUF', file: 'Llama-3.2-1B-Instruct-Q4_K_M.gguf',
|
| 20 |
note: 'solid 1B all-rounder' },
|
|
|
|
| 12 |
{ id: 'smollm2-360m', label: 'SmolLM2 360M', params: '360M', bytes: 386e6,
|
| 13 |
repo: 'HuggingFaceTB/SmolLM2-360M-Instruct-GGUF', file: 'smollm2-360m-instruct-q8_0.gguf',
|
| 14 |
note: 'tiniest — fastest, roughest' },
|
| 15 |
+
{ id: 'qwen3-0.6b', label: 'Qwen3 0.6B', params: '0.6B', bytes: 397e6, thinks: true,
|
| 16 |
+
repo: 'unsloth/Qwen3-0.6B-GGUF', file: 'Qwen3-0.6B-Q4_K_M.gguf',
|
| 17 |
+
note: 'newest tiny — strong, has a thinking mode' },
|
| 18 |
{ id: 'qwen2.5-0.5b', label: 'Qwen2.5 0.5B', params: '0.5B', bytes: 491e6,
|
| 19 |
repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF', file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf',
|
| 20 |
+
note: 'default — fast, clean JSON, no thinking overhead' },
|
| 21 |
{ id: 'llama3.2-1b', label: 'Llama 3.2 1B', params: '1B', bytes: 808e6,
|
| 22 |
repo: 'bartowski/Llama-3.2-1B-Instruct-GGUF', file: 'Llama-3.2-1B-Instruct-Q4_K_M.gguf',
|
| 23 |
note: 'solid 1B all-rounder' },
|
web/personaPanel.js
CHANGED
|
@@ -6,7 +6,7 @@ import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
|
|
| 6 |
import { mountModelBar } from '/web/modelBar.js'
|
| 7 |
import { extractLivePersona } from '/web/personaStream.js'
|
| 8 |
import { parsePersonaJson } from '/web/personaParse.js'
|
| 9 |
-
import { PERSONA_SYSTEM, personaUserPrompt } from '/web/personaPrompts.js'
|
| 10 |
|
| 11 |
const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
|
| 12 |
|
|
@@ -69,16 +69,16 @@ export function mountPersonaPanel(host) {
|
|
| 69 |
maxTokens: 220,
|
| 70 |
onToken: (piece) => {
|
| 71 |
acc += piece
|
| 72 |
-
thinkEl.textContent = acc
|
| 73 |
thinkEl.scrollTop = thinkEl.scrollHeight
|
| 74 |
-
const live = extractLivePersona(acc)
|
| 75 |
if (live.name) nameEl.textContent = live.name
|
| 76 |
if (live.about) aboutEl.textContent = live.about
|
| 77 |
},
|
| 78 |
onStats: showStats,
|
| 79 |
})
|
| 80 |
try {
|
| 81 |
-
const p = parsePersonaJson(acc)
|
| 82 |
if (p.name) nameEl.textContent = p.name
|
| 83 |
aboutEl.textContent = p.about
|
| 84 |
setTags(p)
|
|
|
|
| 6 |
import { mountModelBar } from '/web/modelBar.js'
|
| 7 |
import { extractLivePersona } from '/web/personaStream.js'
|
| 8 |
import { parsePersonaJson } from '/web/personaParse.js'
|
| 9 |
+
import { PERSONA_SYSTEM, personaUserPrompt, stripThink } from '/web/personaPrompts.js'
|
| 10 |
|
| 11 |
const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
|
| 12 |
|
|
|
|
| 69 |
maxTokens: 220,
|
| 70 |
onToken: (piece) => {
|
| 71 |
acc += piece
|
| 72 |
+
thinkEl.textContent = acc // raw view shows the model's <think> reasoning too
|
| 73 |
thinkEl.scrollTop = thinkEl.scrollHeight
|
| 74 |
+
const live = extractLivePersona(stripThink(acc))
|
| 75 |
if (live.name) nameEl.textContent = live.name
|
| 76 |
if (live.about) aboutEl.textContent = live.about
|
| 77 |
},
|
| 78 |
onStats: showStats,
|
| 79 |
})
|
| 80 |
try {
|
| 81 |
+
const p = parsePersonaJson(stripThink(acc))
|
| 82 |
if (p.name) nameEl.textContent = p.name
|
| 83 |
aboutEl.textContent = p.about
|
| 84 |
setTags(p)
|
web/personaPrompts.js
CHANGED
|
@@ -28,3 +28,12 @@ export function diaryUserPrompt(unit = '', traits = '') {
|
|
| 28 |
const t = (traits || 'untested').trim()
|
| 29 |
return `Name: ${u}. Traits: ${t}. Write the diary entry.`
|
| 30 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
const t = (traits || 'untested').trim()
|
| 29 |
return `Name: ${u}. Traits: ${t}. Write the diary entry.`
|
| 30 |
}
|
| 31 |
+
|
| 32 |
+
/**
 * Remove a model's <think>…</think> reasoning (Qwen3 etc.) from the visible answer.
 *
 * Safe to call on every streamed chunk. It strips, in order:
 *   1. every complete <think>…</think> block (case-insensitive, may span lines),
 *   2. a still-open, unterminated <think> block running to the end of the text,
 *   3. a trailing fragment of the opening tag ("<", "<t", … "<think") so a tag that
 *      is split across two streamed pieces never flashes in the UI,
 *   4. leading whitespace left behind by the removed reasoning.
 *
 * @param {*} text - Raw model output so far; falsy values are treated as ''.
 * @returns {string} The text with reasoning removed and leading whitespace trimmed.
 */
export function stripThink(text) {
  return String(text || '')
    .replace(/<think>[\s\S]*?<\/think>/gi, '')
    .replace(/<think>[\s\S]*$/i, '')
    // Only matches when the fragment is the very end of the text, so a "<" that is
    // followed by anything else (e.g. "a < b") is left untouched.
    .replace(/<(?:t(?:h(?:i(?:n(?:k)?)?)?)?)?$/i, '')
    .replace(/^\s+/, '')
}
|
web/wllamaLlm.js
CHANGED
|
@@ -17,6 +17,13 @@ const mm = new ModelManager()
|
|
| 17 |
export function listModels() { return MODELS }
|
| 18 |
export function getCurrentModel() { return getModel(currentId) }
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
// Switch the active model. Unloads the loaded one so the next ensureModel loads fresh.
|
| 21 |
export async function setModel(id) {
|
| 22 |
if (id === currentId) return
|
|
|
|
| 17 |
export function listModels() { return MODELS }
|
| 18 |
export function getCurrentModel() { return getModel(currentId) }
|
| 19 |
|
| 20 |
+
/**
 * Best-effort label for the UI describing which compute backend is likely in use.
 *
 * wllama V3 auto-uses the llama.cpp WebGPU backend when the browser exposes one
 * (10–15× faster than the WASM CPU fallback). This only checks whether
 * `navigator.gpu` is present and truthy.
 *
 * @returns {string} '⚡ WebGPU' when `navigator.gpu` is truthy, otherwise 'CPU (WASM)'
 *   (also returned if probing `navigator` throws).
 */
export function backendLabel() {
  const cpuLabel = 'CPU (WASM)'
  try {
    const hasNavigator = typeof navigator !== 'undefined'
    if (hasNavigator && navigator.gpu) return '⚡ WebGPU'
    return cpuLabel
  } catch {
    return cpuLabel
  }
}
|
| 26 |
+
|
| 27 |
// Switch the active model. Unloads the loaded one so the next ensureModel loads fresh.
|
| 28 |
export async function setModel(id) {
|
| 29 |
if (id === currentId) return
|