Spaces:

build-small-hackathon
/

tiny-army

Running

polats Claude Opus 4.8 (1M context) committed on Jun 4

Commit

a4ca9e9

1 Parent(s): 8eac3eb

Add Qwen3-0.6B; surface WebGPU backend; strip <think> from answers

- Catalog: add Qwen3-0.6B (unsloth GGUF, 397MB, ungated) — newest tiny model.
- wllama V3 (3.4.1) already ships the llama.cpp WebGPU backend (auto-enabled, 10-15x
over WASM on capable browsers); show ⚡WebGPU vs CPU(WASM) in the model bar.
- stripThink(): Qwen3's <think>…</think> reasoning shows in the raw 'thinking' view
but is removed from the parsed persona / diary prose.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (6) hide show

web/diaryPanel.js +3 -3
web/modelBar.js +2 -2
web/modelCatalog.js +4 -1
web/personaPanel.js +4 -4
web/personaPrompts.js +9 -0
web/wllamaLlm.js +7 -0

web/diaryPanel.js CHANGED Viewed

@@ -3,7 +3,7 @@
 // persona styling (.persona-*), the model picker (modelBar), and tok/s stats.
 import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
 import { mountModelBar } from '/web/modelBar.js'
-import { DIARY_SYSTEM, diaryUserPrompt } from '/web/personaPrompts.js'
 function el(tag, props = {}, kids = []) {
   const n = document.createElement(tag)
@@ -46,10 +46,10 @@ export function mountDiaryPanel(host) {
       status.textContent = `loading ${getCurrentModel().label} into your browser…`
       await ensureModel((frac) => { status.textContent = `downloading ${getCurrentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
       status.textContent = `writing on your device with ${getCurrentModel().label}…`
-      let first = true
       await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
         maxTokens: 220, temperature: 0.9,
-        onToken: (piece) => { if (first) { out.textContent = header; first = false } out.textContent += piece },
         onStats: (s) => { stats.textContent = `● ${s.tokPerSec} tok/s · ${s.tokens} tok${s.ttftSeconds != null ? ` · first ${s.ttftSeconds}s` : ''}` },
       })
       status.textContent = 'written ✓ (generated locally)'

 // persona styling (.persona-*), the model picker (modelBar), and tok/s stats.
 import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
 import { mountModelBar } from '/web/modelBar.js'
+import { DIARY_SYSTEM, diaryUserPrompt, stripThink } from '/web/personaPrompts.js'
 function el(tag, props = {}, kids = []) {
   const n = document.createElement(tag)
       status.textContent = `loading ${getCurrentModel().label} into your browser…`
       await ensureModel((frac) => { status.textContent = `downloading ${getCurrentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
       status.textContent = `writing on your device with ${getCurrentModel().label}…`
+      let raw = ''
       await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
         maxTokens: 220, temperature: 0.9,
+        onToken: (piece) => { raw += piece; out.textContent = header + stripThink(raw) },  // hide <think>
         onStats: (s) => { stats.textContent = `● ${s.tokPerSec} tok/s · ${s.tokens} tok${s.ttftSeconds != null ? ` · first ${s.ttftSeconds}s` : ''}` },
       })
       status.textContent = 'written ✓ (generated locally)'

web/modelBar.js CHANGED Viewed

@@ -1,7 +1,7 @@
 // Shared model picker + cache controls for the in-browser panels. Lets you choose a
 // model from the catalog (showing size + whether it's already downloaded) and delete
 // a downloaded model from the browser cache — like the wllama demo space.
-import { listModels, getCurrentModel, setModel, cachedSet, deleteCached } from '/web/wllamaLlm.js'
 import { fmtBytes } from '/web/modelCatalog.js'
 function el(tag, props = {}, kids = []) {
@@ -32,7 +32,7 @@ export function mountModelBar(host, { onChange } = {}) {
       el('option', { value: m.id }, `${m.label} · ${fmtBytes(m.bytes)}${cached.has(m.id) ? ' · ✓ downloaded' : ''}`)))
     sel.value = cur
     const m = getCurrentModel()
-    info.textContent = `${m.params} · ${fmtBytes(m.bytes)} · ${cached.has(m.id) ? 'cached on device' : 'downloads on first use'}`
     del.style.display = cached.has(m.id) ? '' : 'none'
   }
   async function refresh() { cached = await cachedSet(); render() }

 // Shared model picker + cache controls for the in-browser panels. Lets you choose a
 // model from the catalog (showing size + whether it's already downloaded) and delete
 // a downloaded model from the browser cache — like the wllama demo space.
+import { listModels, getCurrentModel, setModel, cachedSet, deleteCached, backendLabel } from '/web/wllamaLlm.js'
 import { fmtBytes } from '/web/modelCatalog.js'
 function el(tag, props = {}, kids = []) {
       el('option', { value: m.id }, `${m.label} · ${fmtBytes(m.bytes)}${cached.has(m.id) ? ' · ✓ downloaded' : ''}`)))
     sel.value = cur
     const m = getCurrentModel()
+    info.textContent = `${m.params} · ${fmtBytes(m.bytes)} · ${backendLabel()} · ${cached.has(m.id) ? 'cached' : 'downloads on first use'}`
     del.style.display = cached.has(m.id) ? '' : 'none'
   }
   async function refresh() { cached = await cachedSet(); render() }

web/modelCatalog.js CHANGED Viewed

@@ -12,9 +12,12 @@ export const MODELS = [
   { id: 'smollm2-360m', label: 'SmolLM2 360M', params: '360M', bytes: 386e6,
     repo: 'HuggingFaceTB/SmolLM2-360M-Instruct-GGUF', file: 'smollm2-360m-instruct-q8_0.gguf',
     note: 'tiniest — fastest, roughest' },
   { id: 'qwen2.5-0.5b', label: 'Qwen2.5 0.5B', params: '0.5B', bytes: 491e6,
     repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF', file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf',
-    note: 'default — good speed/quality balance' },
   { id: 'llama3.2-1b', label: 'Llama 3.2 1B', params: '1B', bytes: 808e6,
     repo: 'bartowski/Llama-3.2-1B-Instruct-GGUF', file: 'Llama-3.2-1B-Instruct-Q4_K_M.gguf',
     note: 'solid 1B all-rounder' },

   { id: 'smollm2-360m', label: 'SmolLM2 360M', params: '360M', bytes: 386e6,
     repo: 'HuggingFaceTB/SmolLM2-360M-Instruct-GGUF', file: 'smollm2-360m-instruct-q8_0.gguf',
     note: 'tiniest — fastest, roughest' },
+  { id: 'qwen3-0.6b', label: 'Qwen3 0.6B', params: '0.6B', bytes: 397e6, thinks: true,
+    repo: 'unsloth/Qwen3-0.6B-GGUF', file: 'Qwen3-0.6B-Q4_K_M.gguf',
+    note: 'newest tiny — strong, has a thinking mode' },
   { id: 'qwen2.5-0.5b', label: 'Qwen2.5 0.5B', params: '0.5B', bytes: 491e6,
     repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF', file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf',
+    note: 'default — fast, clean JSON, no thinking overhead' },
   { id: 'llama3.2-1b', label: 'Llama 3.2 1B', params: '1B', bytes: 808e6,
     repo: 'bartowski/Llama-3.2-1B-Instruct-GGUF', file: 'Llama-3.2-1B-Instruct-Q4_K_M.gguf',
     note: 'solid 1B all-rounder' },

web/personaPanel.js CHANGED Viewed

@@ -6,7 +6,7 @@ import { streamChat, ensureModel, getCurrentModel } from '/web/wllamaLlm.js'
 import { mountModelBar } from '/web/modelBar.js'
 import { extractLivePersona } from '/web/personaStream.js'
 import { parsePersonaJson } from '/web/personaParse.js'
-import { PERSONA_SYSTEM, personaUserPrompt } from '/web/personaPrompts.js'
 const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
@@ -69,16 +69,16 @@ export function mountPersonaPanel(host) {
         maxTokens: 220,
         onToken: (piece) => {
           acc += piece
-          thinkEl.textContent = acc
           thinkEl.scrollTop = thinkEl.scrollHeight
-          const live = extractLivePersona(acc)
           if (live.name) nameEl.textContent = live.name
           if (live.about) aboutEl.textContent = live.about
         },
         onStats: showStats,
       })
       try {
-        const p = parsePersonaJson(acc)
         if (p.name) nameEl.textContent = p.name
         aboutEl.textContent = p.about
         setTags(p)

 import { mountModelBar } from '/web/modelBar.js'
 import { extractLivePersona } from '/web/personaStream.js'
 import { parsePersonaJson } from '/web/personaParse.js'
+import { PERSONA_SYSTEM, personaUserPrompt, stripThink } from '/web/personaPrompts.js'
 const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
         maxTokens: 220,
         onToken: (piece) => {
           acc += piece
+          thinkEl.textContent = acc  // raw view shows the model's <think> reasoning too
           thinkEl.scrollTop = thinkEl.scrollHeight
+          const live = extractLivePersona(stripThink(acc))
           if (live.name) nameEl.textContent = live.name
           if (live.about) aboutEl.textContent = live.about
         },
         onStats: showStats,
       })
       try {
+        const p = parsePersonaJson(stripThink(acc))
         if (p.name) nameEl.textContent = p.name
         aboutEl.textContent = p.about
         setTags(p)

web/personaPrompts.js CHANGED Viewed

@@ -28,3 +28,12 @@ export function diaryUserPrompt(unit = '', traits = '') {
   const t = (traits || 'untested').trim()
   return `Name: ${u}. Traits: ${t}. Write the diary entry.`
 }

   const t = (traits || 'untested').trim()
   return `Name: ${u}. Traits: ${t}. Write the diary entry.`
 }
// Remove a model's <think>…</think> reasoning (Qwen3 etc.) from the visible answer —
// including a still-open, unterminated block while it's mid-thought.
/**
 * Strip reasoning markup from (possibly still-streaming) model output.
 *
 * Safe to call on every partial buffer: besides complete and unterminated
 * <think> blocks, a half-received opening tag at the very end of the text
 * ("<", "<th", "<think" …) is held back, so it never flashes in the UI
 * before the rest of the tag arrives. If the next piece shows it was not a
 * think tag after all, the held-back characters reappear on the next call.
 *
 * @param {*} text - raw model output; falsy values are treated as ''.
 * @returns {string} the text without reasoning blocks and without leading whitespace.
 */
export function stripThink(text) {
  return String(text || '')
    // 1. Completed reasoning blocks (non-greedy, so several blocks are each removed).
    .replace(/<think>[\s\S]*?<\/think>/gi, '')
    // 2. A block that has been opened but not closed yet: drop everything after it.
    .replace(/<think>[\s\S]*$/i, '')
    // 3. A trailing prefix of "<think" whose ">" has not streamed in yet.
    .replace(/<(?:t(?:h(?:i(?:n(?:k)?)?)?)?)?$/i, '')
    // 4. Whitespace left behind in front of the actual answer.
    .replace(/^\s+/, '')
}

web/wllamaLlm.js CHANGED Viewed

@@ -17,6 +17,13 @@ const mm = new ModelManager()
 export function listModels() { return MODELS }
 export function getCurrentModel() { return getModel(currentId) }
 // Switch the active model. Unloads the loaded one so the next ensureModel loads fresh.
 export async function setModel(id) {
   if (id === currentId) return

 export function listModels() { return MODELS }
 export function getCurrentModel() { return getModel(currentId) }
// wllama V3 auto-uses the llama.cpp WebGPU backend when the browser exposes one
// (10–15× faster than the WASM CPU fallback). Best-effort label for the UI.
/**
 * Describe which inference backend the UI should advertise.
 *
 * Only checks for a truthy `navigator.gpu`; it does not request an adapter,
 * so the result is a hint rather than a guarantee.
 *
 * @returns {string} '⚡ WebGPU' when `navigator.gpu` is truthy, otherwise 'CPU (WASM)'.
 */
export function backendLabel() {
  const cpuFallback = 'CPU (WASM)'
  try {
    // No `navigator` global at all (e.g. non-browser runtime): report the CPU path.
    if (typeof navigator === 'undefined') return cpuFallback
    return navigator.gpu ? '⚡ WebGPU' : cpuFallback
  } catch {
    // Any failure while probing also falls back to the CPU label.
    return cpuFallback
  }
}
 // Switch the active model. Unloads the loaded one so the next ensureModel loads fresh.
 export async function setModel(id) {
   if (id === currentId) return