Spaces:

build-small-hackathon
/

tiny-army

Running

polats Claude Opus 4.8 (1M context) committed 5 days ago

Commit

eba5aae

1 Parent(s): ab87288

Fix Qwen3 on WebLLM: budget for thinking + capture reasoning_content

Symptom: WebLLM Qwen3 "rambled" but the output was just unfinished. MLC doesn't
honour the `/no_think` text switch (wllama does), so Qwen3 keeps reasoning, and
max_tokens:220 caps the TOTAL (reasoning + answer) — the JSON gets truncated before
it closes, so parsing fails.

- thinkMaxTokens(): give thinking models (qwen3) ≥768 tokens so they can reason AND
still finish the answer. When /no_think IS honoured (wllama/transformers) the model
stops early at EOS, so this costs nothing there.
- WebLLM stream now captures MLC's separate `reasoning_content`, re-wrapped as
<think>…</think>, so stripThink + the raw "thinking" view treat every engine the
same (and the budget math is visible).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (4) hide show

web/diaryPanel.js +2 -2
web/engineWebllm.js +11 -3
web/personaPanel.js +2 -2
web/personaPrompts.js +12 -4

web/diaryPanel.js CHANGED Viewed

@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
 import { mountModelBar } from '/web/modelBar.js'
 import { mountTtsBar } from '/web/ttsBar.js'
 import { makeNarrator, ensureTts } from '/web/tts.js'
-import { DIARY_SYSTEM, diaryUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
 function el(tag, props = {}, kids = []) {
   const n = document.createElement(tag)
@@ -116,7 +116,7 @@ export function mountDiaryPanel(host) {
       status.textContent = `writing on your device with ${currentModel().label}…`
       let raw = ''
       await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value) + noThink(currentModel().id), {
-        maxTokens: 220, temperature: 0.9,
         onToken: (piece) => {
           raw += piece
           lastBody = stripThink(raw)

 import { mountModelBar } from '/web/modelBar.js'
 import { mountTtsBar } from '/web/ttsBar.js'
 import { makeNarrator, ensureTts } from '/web/tts.js'
+import { DIARY_SYSTEM, diaryUserPrompt, stripThink, noThink, thinkMaxTokens } from '/web/personaPrompts.js'
 function el(tag, props = {}, kids = []) {
   const n = document.createElement(tag)
       status.textContent = `writing on your device with ${currentModel().label}…`
       let raw = ''
       await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value) + noThink(currentModel().id), {
+        maxTokens: thinkMaxTokens(currentModel().id, 220), temperature: 0.9,
         onToken: (piece) => {
           raw += piece
           lastBody = stripThink(raw)

web/engineWebllm.js CHANGED Viewed

@@ -58,11 +58,19 @@ function stream(id, system, user, { maxTokens = 200, temperature = 0.8, onToken,
       messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
       stream: true, stream_options: { include_usage: true }, temperature, max_tokens: maxTokens,
     })
     for await (const ch of chunks) {
-      const piece = ch.choices?.[0]?.delta?.content || ''
-      if (!piece) continue
-      full += piece; if (onToken) onToken(piece); st.tick()
     }
     return { text: full, stats: st.finish() }
   }
   const p = _chain.then(run, run); _chain = p.catch(() => {}); return p

       messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
       stream: true, stream_options: { include_usage: true }, temperature, max_tokens: maxTokens,
     })
+    // MLC routes Qwen3's reasoning into a separate `reasoning_content` field. Re-wrap
+    // it as <think>…</think> and prepend, so the rest of the app (stripThink + the raw
+    // "thinking" view) treats every engine's output the same.
+    let thinkOpen = false
+    const emit = (s) => { if (!s) return; full += s; if (onToken) onToken(s); st.tick() }
     for await (const ch of chunks) {
+      const d = ch.choices?.[0]?.delta || {}
+      const r = d.reasoning_content || ''
+      const c = d.content || ''
+      if (r) { if (!thinkOpen) { emit('<think>'); thinkOpen = true } emit(r) }
+      if (c) { if (thinkOpen) { emit('</think>'); thinkOpen = false } emit(c) }
     }
+    if (thinkOpen) emit('</think>')
     return { text: full, stats: st.finish() }
   }
   const p = _chain.then(run, run); _chain = p.catch(() => {}); return p

web/personaPanel.js CHANGED Viewed

@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
 import { mountModelBar } from '/web/modelBar.js'
 import { extractLivePersona } from '/web/personaStream.js'
 import { parsePersonaJson } from '/web/personaParse.js'
-import { PERSONA_SYSTEM, personaUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
 const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
@@ -72,7 +72,7 @@ export function mountPersonaPanel(host) {
       status.textContent = `writing on your device with ${currentModel().label}…`
       let acc = ''
       await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value) + noThink(currentModel().id), {
-        maxTokens: 220,
         onToken: (piece) => {
           acc += piece
           thinkEl.textContent = acc  // raw view shows the model's <think> reasoning too

 import { mountModelBar } from '/web/modelBar.js'
 import { extractLivePersona } from '/web/personaStream.js'
 import { parsePersonaJson } from '/web/personaParse.js'
+import { PERSONA_SYSTEM, personaUserPrompt, stripThink, noThink, thinkMaxTokens } from '/web/personaPrompts.js'
 const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
       status.textContent = `writing on your device with ${currentModel().label}…`
       let acc = ''
       await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value) + noThink(currentModel().id), {
+        maxTokens: thinkMaxTokens(currentModel().id, 220),
         onToken: (piece) => {
           acc += piece
           thinkEl.textContent = acc  // raw view shows the model's <think> reasoning too

web/personaPrompts.js CHANGED Viewed

@@ -39,7 +39,15 @@ export function stripThink(text) {
 }
 // Qwen3 is a thinking model: left alone it burns the whole token budget on a
-// <think> block and never reaches the JSON/answer. For these structured, short
-// tasks we don't want reasoning — Qwen3 honours the `/no_think` soft switch in the
-// prompt (recognised across llama.cpp / WebLLM / Transformers.js). No-op otherwise.
-export const noThink = (modelId) => (/qwen3/i.test(String(modelId || '')) ? ' /no_think' : '')

 }
 // Qwen3 is a thinking model: left alone it burns the whole token budget on a
+// <think> block and never reaches the JSON/answer.
+export const isThinking = (modelId) => /qwen3/i.test(String(modelId || ''))
+// `/no_think` soft-switch tells Qwen3 to skip reasoning. llama.cpp/Transformers.js
+// honour it (the model finishes in a few tokens); WebLLM's MLC template does NOT do
+// so reliably, so thinking models ALSO get extra token budget (see thinkMaxTokens) —
+// enough to reason AND still finish the answer. No-op for non-thinking models.
+export const noThink = (modelId) => (isThinking(modelId) ? ' /no_think' : '')
+// Token budget: thinking models may spend the budget reasoning, so give them headroom
+// to still complete the answer. (When /no_think works, they stop early anyway.)
+export const thinkMaxTokens = (modelId, base) => (isThinking(modelId) ? Math.max(base, 768) : base)