Spaces:
Running
Fix Qwen3 on WebLLM: budget for thinking + capture reasoning_content
Browse files
Symptom: WebLLM Qwen3 "rambled" but the output was just unfinished. MLC doesn't
honour the `/no_think` text switch (wllama does), so Qwen3 keeps reasoning, and
max_tokens:220 caps the TOTAL (reasoning + answer) — the JSON gets truncated before
it closes, so parsing fails.
- thinkMaxTokens(): give thinking models (qwen3) ≥768 tokens so they can reason AND
still finish the answer. When /no_think IS honoured (wllama/transformers) the model
stops early at EOS, so this costs nothing there.
- WebLLM stream now captures MLC's separate `reasoning_content`, re-wrapped as
<think>…</think>, so stripThink + the raw "thinking" view treat every engine the
same (and the budget math is visible).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
- web/diaryPanel.js +2 -2
- web/engineWebllm.js +11 -3
- web/personaPanel.js +2 -2
- web/personaPrompts.js +12 -4
|
@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
|
|
| 6 |
import { mountModelBar } from '/web/modelBar.js'
|
| 7 |
import { mountTtsBar } from '/web/ttsBar.js'
|
| 8 |
import { makeNarrator, ensureTts } from '/web/tts.js'
|
| 9 |
-
import { DIARY_SYSTEM, diaryUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
|
| 10 |
|
| 11 |
function el(tag, props = {}, kids = []) {
|
| 12 |
const n = document.createElement(tag)
|
|
@@ -116,7 +116,7 @@ export function mountDiaryPanel(host) {
|
|
| 116 |
status.textContent = `writing on your device with ${currentModel().label}…`
|
| 117 |
let raw = ''
|
| 118 |
await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value) + noThink(currentModel().id), {
|
| 119 |
-
maxTokens: 220, temperature: 0.9,
|
| 120 |
onToken: (piece) => {
|
| 121 |
raw += piece
|
| 122 |
lastBody = stripThink(raw)
|
|
|
|
| 6 |
import { mountModelBar } from '/web/modelBar.js'
|
| 7 |
import { mountTtsBar } from '/web/ttsBar.js'
|
| 8 |
import { makeNarrator, ensureTts } from '/web/tts.js'
|
| 9 |
+
import { DIARY_SYSTEM, diaryUserPrompt, stripThink, noThink, thinkMaxTokens } from '/web/personaPrompts.js'
|
| 10 |
|
| 11 |
function el(tag, props = {}, kids = []) {
|
| 12 |
const n = document.createElement(tag)
|
|
|
|
| 116 |
status.textContent = `writing on your device with ${currentModel().label}…`
|
| 117 |
let raw = ''
|
| 118 |
await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value) + noThink(currentModel().id), {
|
| 119 |
+
maxTokens: thinkMaxTokens(currentModel().id, 220), temperature: 0.9,
|
| 120 |
onToken: (piece) => {
|
| 121 |
raw += piece
|
| 122 |
lastBody = stripThink(raw)
|
|
@@ -58,11 +58,19 @@ function stream(id, system, user, { maxTokens = 200, temperature = 0.8, onToken,
|
|
| 58 |
messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
|
| 59 |
stream: true, stream_options: { include_usage: true }, temperature, max_tokens: maxTokens,
|
| 60 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
for await (const ch of chunks) {
|
| 62 |
-
const
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
| 65 |
}
|
|
|
|
| 66 |
return { text: full, stats: st.finish() }
|
| 67 |
}
|
| 68 |
const p = _chain.then(run, run); _chain = p.catch(() => {}); return p
|
|
|
|
| 58 |
messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
|
| 59 |
stream: true, stream_options: { include_usage: true }, temperature, max_tokens: maxTokens,
|
| 60 |
})
|
| 61 |
+
// MLC routes Qwen3's reasoning into a separate `reasoning_content` field. Re-wrap
|
| 62 |
+
// it as <think>…</think> and prepend, so the rest of the app (stripThink + the raw
|
| 63 |
+
// "thinking" view) treats every engine's output the same.
|
| 64 |
+
let thinkOpen = false
|
| 65 |
+
const emit = (s) => { if (!s) return; full += s; if (onToken) onToken(s); st.tick() }
|
| 66 |
for await (const ch of chunks) {
|
| 67 |
+
const d = ch.choices?.[0]?.delta || {}
|
| 68 |
+
const r = d.reasoning_content || ''
|
| 69 |
+
const c = d.content || ''
|
| 70 |
+
if (r) { if (!thinkOpen) { emit('<think>'); thinkOpen = true } emit(r) }
|
| 71 |
+
if (c) { if (thinkOpen) { emit('</think>'); thinkOpen = false } emit(c) }
|
| 72 |
}
|
| 73 |
+
if (thinkOpen) emit('</think>')
|
| 74 |
return { text: full, stats: st.finish() }
|
| 75 |
}
|
| 76 |
const p = _chain.then(run, run); _chain = p.catch(() => {}); return p
|
|
@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
|
|
| 6 |
import { mountModelBar } from '/web/modelBar.js'
|
| 7 |
import { extractLivePersona } from '/web/personaStream.js'
|
| 8 |
import { parsePersonaJson } from '/web/personaParse.js'
|
| 9 |
-
import { PERSONA_SYSTEM, personaUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
|
| 10 |
|
| 11 |
const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
|
| 12 |
|
|
@@ -72,7 +72,7 @@ export function mountPersonaPanel(host) {
|
|
| 72 |
status.textContent = `writing on your device with ${currentModel().label}…`
|
| 73 |
let acc = ''
|
| 74 |
await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value) + noThink(currentModel().id), {
|
| 75 |
-
maxTokens: 220,
|
| 76 |
onToken: (piece) => {
|
| 77 |
acc += piece
|
| 78 |
thinkEl.textContent = acc // raw view shows the model's <think> reasoning too
|
|
|
|
| 6 |
import { mountModelBar } from '/web/modelBar.js'
|
| 7 |
import { extractLivePersona } from '/web/personaStream.js'
|
| 8 |
import { parsePersonaJson } from '/web/personaParse.js'
|
| 9 |
+
import { PERSONA_SYSTEM, personaUserPrompt, stripThink, noThink, thinkMaxTokens } from '/web/personaPrompts.js'
|
| 10 |
|
| 11 |
const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
|
| 12 |
|
|
|
|
| 72 |
status.textContent = `writing on your device with ${currentModel().label}…`
|
| 73 |
let acc = ''
|
| 74 |
await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value) + noThink(currentModel().id), {
|
| 75 |
+
maxTokens: thinkMaxTokens(currentModel().id, 220),
|
| 76 |
onToken: (piece) => {
|
| 77 |
acc += piece
|
| 78 |
thinkEl.textContent = acc // raw view shows the model's <think> reasoning too
|
|
@@ -39,7 +39,15 @@ export function stripThink(text) {
|
|
| 39 |
}
|
| 40 |
|
| 41 |
// Qwen3 is a thinking model: left alone it burns the whole token budget on a
|
| 42 |
-
// <think> block and never reaches the JSON/answer.
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
}
|
| 40 |
|
| 41 |
// Qwen3 is a thinking model: left alone it burns the whole token budget on a
|
| 42 |
+
// <think> block and never reaches the JSON/answer.
|
| 43 |
+
/**
 * Report whether a model id names a Qwen3 ("thinking") model.
 *
 * The check is a case-insensitive substring match on "qwen3", so the marker may
 * appear anywhere inside the id.
 *
 * @param {*} modelId - Model identifier; any value is accepted and stringified.
 * @returns {boolean} true when the stringified id contains "qwen3".
 */
export const isThinking = (modelId) => {
  // Falsy ids (null, undefined, '', 0) collapse to the empty string and never match.
  const id = String(modelId || '')
  return /qwen3/i.test(id)
}
|
| 44 |
+
|
| 45 |
+
// `/no_think` soft-switch tells Qwen3 to skip reasoning. llama.cpp/Transformers.js
|
| 46 |
+
// honour it (the model finishes in a few tokens); WebLLM's MLC template does NOT do so
|
| 47 |
+
// reliably, so we ALSO budget extra tokens (see thinkMaxTokens) for those — enough to
|
| 48 |
+
// reason AND still finish the answer. No-op for non-thinking models.
|
| 49 |
+
/**
 * Build the `/no_think` suffix that callers append to the user prompt.
 *
 * @param {*} modelId - Model identifier, passed through to isThinking().
 * @returns {string} ' /no_think' (with a leading space) for thinking models,
 *   otherwise the empty string so the prompt is left untouched.
 */
export const noThink = (modelId) => {
  if (!isThinking(modelId)) return ''
  return ' /no_think'
}
|
| 50 |
+
|
| 51 |
+
// Token budget: thinking models may spend the budget reasoning, so give them headroom
|
| 52 |
+
// to still complete the answer. (When /no_think works, they stop early anyway.)
|
| 53 |
+
/**
 * Pick the max-token budget for a generation request.
 *
 * @param {*} modelId - Model identifier, passed through to isThinking().
 * @param {number} base - Budget the caller would use for a non-thinking model.
 * @returns {number} `base` unchanged for non-thinking models; for thinking models
 *   the larger of `base` and 768, leaving room for reasoning plus the answer.
 */
export const thinkMaxTokens = (modelId, base) => {
  // Non-thinking models keep exactly the budget the caller asked for.
  if (!isThinking(modelId)) return base
  // Thinking models get a floor of 768 tokens; a larger base is left alone.
  return Math.max(base, 768)
}
|