polats Claude Opus 4.8 (1M context) committed on
Commit
ab87288
·
1 Parent(s): 0218dca

Fix Qwen3 parsing (/no_think) + WebLLM double-load guard & progress clarity

Browse files

Qwen3 is a thinking model: with a 220-token budget it spent the entire budget on a
<think> block and never emitted the JSON/answer, so persona parsing failed because
no answer followed the thinking. Append Qwen3's `/no_think` soft switch for these
structured tasks (persona + diary) so it goes straight to the answer. The helper
noThink(modelId) is keyed on the model id and is a no-op for non-Qwen3 models.

WebLLM "downloads twice / no cache":
- ensure() guarded on _loadedId (only set AFTER load), so a re-entrant ensure during
a slow download could start a SECOND load. Guard on _loadingId (set up front)
instead — same fix applied to the Transformers.js engine.
- MLC runs two phases through one progress callback (Fetching from network, then
Loading from cache into GPU) — both 0→100%, which reads as "downloading twice".
Pass MLC's progress text through so the status line shows which phase it is; the
second pass is a cache-load, not a re-download. (Caching = MLC Cache API + our
storage.persist(); evictable under quota pressure.)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

web/diaryPanel.js CHANGED
@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
6
  import { mountModelBar } from '/web/modelBar.js'
7
  import { mountTtsBar } from '/web/ttsBar.js'
8
  import { makeNarrator, ensureTts } from '/web/tts.js'
9
- import { DIARY_SYSTEM, diaryUserPrompt, stripThink } from '/web/personaPrompts.js'
10
 
11
  function el(tag, props = {}, kids = []) {
12
  const n = document.createElement(tag)
@@ -112,10 +112,10 @@ export function mountDiaryPanel(host) {
112
 
113
  try {
114
  status.textContent = `loading ${currentModel().label} into your browser…`
115
- await ensureModel((frac) => { status.textContent = `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
116
  status.textContent = `writing on your device with ${currentModel().label}…`
117
  let raw = ''
118
- await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
119
  maxTokens: 220, temperature: 0.9,
120
  onToken: (piece) => {
121
  raw += piece
 
6
  import { mountModelBar } from '/web/modelBar.js'
7
  import { mountTtsBar } from '/web/ttsBar.js'
8
  import { makeNarrator, ensureTts } from '/web/tts.js'
9
+ import { DIARY_SYSTEM, diaryUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
10
 
11
  function el(tag, props = {}, kids = []) {
12
  const n = document.createElement(tag)
 
112
 
113
  try {
114
  status.textContent = `loading ${currentModel().label} into your browser…`
115
+ await ensureModel((frac, label) => { status.textContent = label || `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
116
  status.textContent = `writing on your device with ${currentModel().label}…`
117
  let raw = ''
118
+ await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value) + noThink(currentModel().id), {
119
  maxTokens: 220, temperature: 0.9,
120
  onToken: (piece) => {
121
  raw += piece
web/engineTransformers.js CHANGED
@@ -10,14 +10,17 @@ const MODELS = [
10
  const get = (id) => MODELS.find((m) => m.id === id) || MODELS[0]
11
  const device = () => { try { return navigator.gpu ? 'webgpu' : 'wasm' } catch { return 'wasm' } }
12
 
13
- let _lib = null, _pipe = null, _loadedId = null, _loadPromise = null, _chain = Promise.resolve()
14
  async function lib() { if (!_lib) _lib = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3'); return _lib }
15
 
16
  async function ensure(id, onProgress) {
17
  const m = get(id)
18
  if (_pipe && _loadedId === m.id) return _pipe
19
- if (_loadPromise && _loadedId === m.id) return _loadPromise
 
 
20
  if (_pipe && _loadedId !== m.id) { try { await _pipe.dispose?.() } catch { /* ignore */ } _pipe = null; _loadedId = null }
 
21
  _loadPromise = (async () => {
22
  const { pipeline } = await lib()
23
  const pipe = await pipeline('text-generation', m.repo, {
@@ -25,7 +28,7 @@ async function ensure(id, onProgress) {
25
  progress_callback: (p) => { if (onProgress && p.status === 'progress' && p.total) onProgress(p.loaded / p.total) },
26
  })
27
  _pipe = pipe; _loadedId = m.id; return pipe
28
- })().catch((e) => { _loadPromise = null; throw e })
29
  return _loadPromise
30
  }
31
 
 
10
  const get = (id) => MODELS.find((m) => m.id === id) || MODELS[0]
11
  const device = () => { try { return navigator.gpu ? 'webgpu' : 'wasm' } catch { return 'wasm' } }
12
 
13
+ let _lib = null, _pipe = null, _loadedId = null, _loadingId = null, _loadPromise = null, _chain = Promise.resolve()
14
  async function lib() { if (!_lib) _lib = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3'); return _lib }
15
 
16
  async function ensure(id, onProgress) {
17
  const m = get(id)
18
  if (_pipe && _loadedId === m.id) return _pipe
19
+ // Guard on _loadingId (set when the load starts), not _loadedId (set only after the
20
+ // load finishes); otherwise a re-entrant ensure() mid-download starts a second download.
21
+ if (_loadPromise && _loadingId === m.id) return _loadPromise
22
  if (_pipe && _loadedId !== m.id) { try { await _pipe.dispose?.() } catch { /* ignore */ } _pipe = null; _loadedId = null }
23
+ _loadingId = m.id
24
  _loadPromise = (async () => {
25
  const { pipeline } = await lib()
26
  const pipe = await pipeline('text-generation', m.repo, {
 
28
  progress_callback: (p) => { if (onProgress && p.status === 'progress' && p.total) onProgress(p.loaded / p.total) },
29
  })
30
  _pipe = pipe; _loadedId = m.id; return pipe
31
+ })().catch((e) => { _loadPromise = null; _loadingId = null; throw e })
32
  return _loadPromise
33
  }
34
 
web/engineWebllm.js CHANGED
@@ -24,21 +24,28 @@ async function hasF16() {
24
  }
25
  const mlcId = async (m) => `${m.mlcBase}-${(await hasF16()) ? 'q4f16_1' : 'q4f32_1'}-MLC`
26
 
27
- let _lib = null, _engine = null, _loadedId = null, _loadPromise = null, _chain = Promise.resolve()
28
  async function lib() { if (!_lib) _lib = await import('https://esm.run/@mlc-ai/web-llm'); return _lib }
29
 
30
  async function ensure(id, onProgress) {
31
  const m = get(id)
32
  if (_engine && _loadedId === m.id) return _engine
33
- if (_loadPromise && _loadedId === m.id) return _loadPromise
 
 
 
 
34
  _loadPromise = (async () => {
35
  const { CreateMLCEngine } = await lib()
36
  const target = await mlcId(m)
37
- const cb = (p) => { if (onProgress && typeof p.progress === 'number') onProgress(p.progress) }
 
 
 
38
  if (_engine && _engine.reload) { await _engine.reload(target); _loadedId = m.id; return _engine }
39
  _engine = await CreateMLCEngine(target, { initProgressCallback: cb })
40
  _loadedId = m.id; return _engine
41
- })().catch((e) => { _loadPromise = null; throw e })
42
  return _loadPromise
43
  }
44
 
 
24
  }
25
  const mlcId = async (m) => `${m.mlcBase}-${(await hasF16()) ? 'q4f16_1' : 'q4f32_1'}-MLC`
26
 
27
+ let _lib = null, _engine = null, _loadedId = null, _loadingId = null, _loadPromise = null, _chain = Promise.resolve()
28
  async function lib() { if (!_lib) _lib = await import('https://esm.run/@mlc-ai/web-llm'); return _lib }
29
 
30
  async function ensure(id, onProgress) {
31
  const m = get(id)
32
  if (_engine && _loadedId === m.id) return _engine
33
+ // Reuse the in-flight load for the SAME model (guard on _loadingId, not _loadedId,
34
+ // which isn't set until the load finishes — otherwise a re-entrant ensure() during
35
+ // a slow download starts a SECOND download).
36
+ if (_loadPromise && _loadingId === m.id) return _loadPromise
37
+ _loadingId = m.id
38
  _loadPromise = (async () => {
39
  const { CreateMLCEngine } = await lib()
40
  const target = await mlcId(m)
41
+ // MLC reports two phases through this one callback: "Fetching param cache…"
42
+ // (network) then "Loading model from cache…" (into GPU). Pass the text so the UI
43
+ // can show which is happening — the 2nd 0→100% is a cache-load, not a re-download.
44
+ const cb = (p) => { if (onProgress) onProgress(typeof p.progress === 'number' ? p.progress : 0, p.text) }
45
  if (_engine && _engine.reload) { await _engine.reload(target); _loadedId = m.id; return _engine }
46
  _engine = await CreateMLCEngine(target, { initProgressCallback: cb })
47
  _loadedId = m.id; return _engine
48
+ })().catch((e) => { _loadPromise = null; _loadingId = null; throw e })
49
  return _loadPromise
50
  }
51
 
web/personaPanel.js CHANGED
@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
6
  import { mountModelBar } from '/web/modelBar.js'
7
  import { extractLivePersona } from '/web/personaStream.js'
8
  import { parsePersonaJson } from '/web/personaParse.js'
9
- import { PERSONA_SYSTEM, personaUserPrompt, stripThink } from '/web/personaPrompts.js'
10
 
11
  const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
12
 
@@ -68,10 +68,10 @@ export function mountPersonaPanel(host) {
68
  thinkEl.textContent = ''; thinkWrap.open = true; stats.textContent = ''
69
  try {
70
  status.textContent = `loading ${currentModel().label} into your browser…`
71
- await ensureModel((frac) => { status.textContent = `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
72
  status.textContent = `writing on your device with ${currentModel().label}…`
73
  let acc = ''
74
- await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value), {
75
  maxTokens: 220,
76
  onToken: (piece) => {
77
  acc += piece
 
6
  import { mountModelBar } from '/web/modelBar.js'
7
  import { extractLivePersona } from '/web/personaStream.js'
8
  import { parsePersonaJson } from '/web/personaParse.js'
9
+ import { PERSONA_SYSTEM, personaUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
10
 
11
  const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
12
 
 
68
  thinkEl.textContent = ''; thinkWrap.open = true; stats.textContent = ''
69
  try {
70
  status.textContent = `loading ${currentModel().label} into your browser…`
71
+ await ensureModel((frac, label) => { status.textContent = label || `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
72
  status.textContent = `writing on your device with ${currentModel().label}…`
73
  let acc = ''
74
+ await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value) + noThink(currentModel().id), {
75
  maxTokens: 220,
76
  onToken: (piece) => {
77
  acc += piece
web/personaPrompts.js CHANGED
@@ -37,3 +37,9 @@ export function stripThink(text) {
37
  .replace(/<think>[\s\S]*$/i, '')
38
  .replace(/^\s+/, '')
39
  }
 
 
 
 
 
 
 
37
  .replace(/<think>[\s\S]*$/i, '')
38
  .replace(/^\s+/, '')
39
  }
40
+
41
+ // Qwen3 is a thinking model: left alone it burns the whole token budget on a
42
+ // <think> block and never reaches the JSON/answer. For these structured, short
43
+ // tasks we don't want reasoning — Qwen3 honours the `/no_think` soft switch in the
44
+ // prompt; it is a model-level switch, so it applies whichever runtime hosts the model (llama.cpp / WebLLM / Transformers.js). Returns '' for any other model id.
45
+ export const noThink = (modelId) => (/qwen3/i.test(String(modelId || '')) ? ' /no_think' : '')