polats Claude Opus 4.8 (1M context) committed on
Commit
eba5aae
·
1 Parent(s): ab87288

Fix Qwen3 on WebLLM: budget for thinking + capture reasoning_content

Browse files

Symptom: WebLLM Qwen3 "rambled" but the output was just unfinished. MLC doesn't
honour the `/no_think` text switch (wllama does), so Qwen3 keeps reasoning, and
max_tokens:220 caps the TOTAL (reasoning + answer) — the JSON gets truncated before
it closes, so parsing fails.

- thinkMaxTokens(): give thinking models (qwen3) ≥768 tokens so they can reason AND
still finish the answer. When /no_think IS honoured (wllama/transformers) the model
stops early at EOS, so this costs nothing there.
- WebLLM stream now captures MLC's separate `reasoning_content`, re-wrapped as
<think>…</think>, so stripThink + the raw "thinking" view treat every engine the
same (and the budget math is visible).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

web/diaryPanel.js CHANGED
@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
6
  import { mountModelBar } from '/web/modelBar.js'
7
  import { mountTtsBar } from '/web/ttsBar.js'
8
  import { makeNarrator, ensureTts } from '/web/tts.js'
9
- import { DIARY_SYSTEM, diaryUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
10
 
11
  function el(tag, props = {}, kids = []) {
12
  const n = document.createElement(tag)
@@ -116,7 +116,7 @@ export function mountDiaryPanel(host) {
116
  status.textContent = `writing on your device with ${currentModel().label}…`
117
  let raw = ''
118
  await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value) + noThink(currentModel().id), {
119
- maxTokens: 220, temperature: 0.9,
120
  onToken: (piece) => {
121
  raw += piece
122
  lastBody = stripThink(raw)
 
6
  import { mountModelBar } from '/web/modelBar.js'
7
  import { mountTtsBar } from '/web/ttsBar.js'
8
  import { makeNarrator, ensureTts } from '/web/tts.js'
9
+ import { DIARY_SYSTEM, diaryUserPrompt, stripThink, noThink, thinkMaxTokens } from '/web/personaPrompts.js'
10
 
11
  function el(tag, props = {}, kids = []) {
12
  const n = document.createElement(tag)
 
116
  status.textContent = `writing on your device with ${currentModel().label}…`
117
  let raw = ''
118
  await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value) + noThink(currentModel().id), {
119
+ maxTokens: thinkMaxTokens(currentModel().id, 220), temperature: 0.9,
120
  onToken: (piece) => {
121
  raw += piece
122
  lastBody = stripThink(raw)
web/engineWebllm.js CHANGED
@@ -58,11 +58,19 @@ function stream(id, system, user, { maxTokens = 200, temperature = 0.8, onToken,
58
  messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
59
  stream: true, stream_options: { include_usage: true }, temperature, max_tokens: maxTokens,
60
  })
 
 
 
 
 
61
  for await (const ch of chunks) {
62
- const piece = ch.choices?.[0]?.delta?.content || ''
63
- if (!piece) continue
64
- full += piece; if (onToken) onToken(piece); st.tick()
 
 
65
  }
 
66
  return { text: full, stats: st.finish() }
67
  }
68
  const p = _chain.then(run, run); _chain = p.catch(() => {}); return p
 
58
  messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
59
  stream: true, stream_options: { include_usage: true }, temperature, max_tokens: maxTokens,
60
  })
61
+ // MLC routes Qwen3's reasoning into a separate `reasoning_content` field. Re-wrap
62
+ // it as <think>…</think> and prepend, so the rest of the app (stripThink + the raw
63
+ // "thinking" view) treats every engine's output the same.
64
+ let thinkOpen = false
65
+ const emit = (s) => { if (!s) return; full += s; if (onToken) onToken(s); st.tick() }
66
  for await (const ch of chunks) {
67
+ const d = ch.choices?.[0]?.delta || {}
68
+ const r = d.reasoning_content || ''
69
+ const c = d.content || ''
70
+ if (r) { if (!thinkOpen) { emit('<think>'); thinkOpen = true } emit(r) }
71
+ if (c) { if (thinkOpen) { emit('</think>'); thinkOpen = false } emit(c) }
72
  }
73
+ if (thinkOpen) emit('</think>')
74
  return { text: full, stats: st.finish() }
75
  }
76
  const p = _chain.then(run, run); _chain = p.catch(() => {}); return p
web/personaPanel.js CHANGED
@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
6
  import { mountModelBar } from '/web/modelBar.js'
7
  import { extractLivePersona } from '/web/personaStream.js'
8
  import { parsePersonaJson } from '/web/personaParse.js'
9
- import { PERSONA_SYSTEM, personaUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
10
 
11
  const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
12
 
@@ -72,7 +72,7 @@ export function mountPersonaPanel(host) {
72
  status.textContent = `writing on your device with ${currentModel().label}…`
73
  let acc = ''
74
  await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value) + noThink(currentModel().id), {
75
- maxTokens: 220,
76
  onToken: (piece) => {
77
  acc += piece
78
  thinkEl.textContent = acc // raw view shows the model's <think> reasoning too
 
6
  import { mountModelBar } from '/web/modelBar.js'
7
  import { extractLivePersona } from '/web/personaStream.js'
8
  import { parsePersonaJson } from '/web/personaParse.js'
9
+ import { PERSONA_SYSTEM, personaUserPrompt, stripThink, noThink, thinkMaxTokens } from '/web/personaPrompts.js'
10
 
11
  const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
12
 
 
72
  status.textContent = `writing on your device with ${currentModel().label}…`
73
  let acc = ''
74
  await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value) + noThink(currentModel().id), {
75
+ maxTokens: thinkMaxTokens(currentModel().id, 220),
76
  onToken: (piece) => {
77
  acc += piece
78
  thinkEl.textContent = acc // raw view shows the model's <think> reasoning too
web/personaPrompts.js CHANGED
@@ -39,7 +39,15 @@ export function stripThink(text) {
39
  }
40
 
41
  // Qwen3 is a thinking model: left alone it burns the whole token budget on a
42
- // <think> block and never reaches the JSON/answer. For these structured, short
43
- // tasks we don't want reasoning — Qwen3 honours the `/no_think` soft switch in the
44
- // prompt (recognised across llama.cpp / WebLLM / Transformers.js). No-op otherwise.
45
- export const noThink = (modelId) => (/qwen3/i.test(String(modelId || '')) ? ' /no_think' : '')
 
 
 
 
 
 
 
 
 
39
  }
40
 
41
  // Qwen3 is a thinking model: left alone it burns the whole token budget on a
42
+ // <think> block and never reaches the JSON/answer.
43
+ export const isThinking = (modelId) => /qwen3/i.test(String(modelId || ''))
44
+
45
+ // `/no_think` soft-switch tells Qwen3 to skip reasoning. llama.cpp/Transformers.js
46
+ // honour it (the model finishes in a few tokens); WebLLM's MLC template does NOT
47
+ // honour it reliably, so we ALSO budget extra tokens (see thinkMaxTokens) — enough to
48
+ // reason AND still finish the answer. No-op for non-thinking models.
49
+ export const noThink = (modelId) => (isThinking(modelId) ? ' /no_think' : '')
50
+
51
+ // Token budget: thinking models may spend the budget reasoning, so give them headroom
52
+ // to still complete the answer. (When /no_think works, they stop early anyway.)
53
+ export const thinkMaxTokens = (modelId, base) => (isThinking(modelId) ? Math.max(base, 768) : base)