Spaces:
Running
Fix Qwen3 parsing (/no_think) + WebLLM double-load guard & progress clarity
Browse files
Qwen3 is a thinking model: with a 220-token budget it spent everything on a
<think> block and never emitted the JSON/answer, so persona parsing failed once
the thinking ended. Append Qwen3's `/no_think` soft switch for these structured tasks
(persona + diary) so it goes straight to the answer. The helper noThink(modelId) is keyed
on the model id and is a no-op for non-Qwen3 models.
WebLLM "downloads twice / no cache":
- ensure() was guarded on _loadedId (only set AFTER the load finishes), so a re-entrant
ensure() during a slow download could start a SECOND load. It now guards on _loadingId
(set up front) instead — the same fix is applied to the Transformers.js engine.
- MLC runs two phases through one progress callback (Fetching from network, then
Loading from cache into GPU) — both 0→100%, which reads as "downloading twice".
Pass MLC's progress text through so the status line shows which phase it is; the
second pass is a cache-load, not a re-download. (Caching = MLC Cache API + our
storage.persist(); evictable under quota pressure.)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
- web/diaryPanel.js +3 -3
- web/engineTransformers.js +6 -3
- web/engineWebllm.js +11 -4
- web/personaPanel.js +3 -3
- web/personaPrompts.js +6 -0
|
@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
|
|
| 6 |
import { mountModelBar } from '/web/modelBar.js'
|
| 7 |
import { mountTtsBar } from '/web/ttsBar.js'
|
| 8 |
import { makeNarrator, ensureTts } from '/web/tts.js'
|
| 9 |
-
import { DIARY_SYSTEM, diaryUserPrompt, stripThink } from '/web/personaPrompts.js'
|
| 10 |
|
| 11 |
function el(tag, props = {}, kids = []) {
|
| 12 |
const n = document.createElement(tag)
|
|
@@ -112,10 +112,10 @@ export function mountDiaryPanel(host) {
|
|
| 112 |
|
| 113 |
try {
|
| 114 |
status.textContent = `loading ${currentModel().label} into your browser…`
|
| 115 |
-
await ensureModel((frac) => { status.textContent = `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
|
| 116 |
status.textContent = `writing on your device with ${currentModel().label}…`
|
| 117 |
let raw = ''
|
| 118 |
-
await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
|
| 119 |
maxTokens: 220, temperature: 0.9,
|
| 120 |
onToken: (piece) => {
|
| 121 |
raw += piece
|
|
|
|
| 6 |
import { mountModelBar } from '/web/modelBar.js'
|
| 7 |
import { mountTtsBar } from '/web/ttsBar.js'
|
| 8 |
import { makeNarrator, ensureTts } from '/web/tts.js'
|
| 9 |
+
import { DIARY_SYSTEM, diaryUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
|
| 10 |
|
| 11 |
function el(tag, props = {}, kids = []) {
|
| 12 |
const n = document.createElement(tag)
|
|
|
|
| 112 |
|
| 113 |
try {
|
| 114 |
status.textContent = `loading ${currentModel().label} into your browser…`
|
| 115 |
+
await ensureModel((frac, label) => { status.textContent = label || `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
|
| 116 |
status.textContent = `writing on your device with ${currentModel().label}…`
|
| 117 |
let raw = ''
|
| 118 |
+
await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value) + noThink(currentModel().id), {
|
| 119 |
maxTokens: 220, temperature: 0.9,
|
| 120 |
onToken: (piece) => {
|
| 121 |
raw += piece
|
|
@@ -10,14 +10,17 @@ const MODELS = [
|
|
| 10 |
const get = (id) => MODELS.find((m) => m.id === id) || MODELS[0]
|
| 11 |
const device = () => { try { return navigator.gpu ? 'webgpu' : 'wasm' } catch { return 'wasm' } }
|
| 12 |
|
| 13 |
-
let _lib = null, _pipe = null, _loadedId = null, _loadPromise = null, _chain = Promise.resolve()
|
| 14 |
async function lib() { if (!_lib) _lib = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3'); return _lib }
|
| 15 |
|
| 16 |
async function ensure(id, onProgress) {
|
| 17 |
const m = get(id)
|
| 18 |
if (_pipe && _loadedId === m.id) return _pipe
|
| 19 |
-
|
|
|
|
|
|
|
| 20 |
if (_pipe && _loadedId !== m.id) { try { await _pipe.dispose?.() } catch { /* ignore */ } _pipe = null; _loadedId = null }
|
|
|
|
| 21 |
_loadPromise = (async () => {
|
| 22 |
const { pipeline } = await lib()
|
| 23 |
const pipe = await pipeline('text-generation', m.repo, {
|
|
@@ -25,7 +28,7 @@ async function ensure(id, onProgress) {
|
|
| 25 |
progress_callback: (p) => { if (onProgress && p.status === 'progress' && p.total) onProgress(p.loaded / p.total) },
|
| 26 |
})
|
| 27 |
_pipe = pipe; _loadedId = m.id; return pipe
|
| 28 |
-
})().catch((e) => { _loadPromise = null; throw e })
|
| 29 |
return _loadPromise
|
| 30 |
}
|
| 31 |
|
|
|
|
| 10 |
const get = (id) => MODELS.find((m) => m.id === id) || MODELS[0]
|
| 11 |
const device = () => { try { return navigator.gpu ? 'webgpu' : 'wasm' } catch { return 'wasm' } }
|
| 12 |
|
| 13 |
+
let _lib = null, _pipe = null, _loadedId = null, _loadingId = null, _loadPromise = null, _chain = Promise.resolve()
|
| 14 |
async function lib() { if (!_lib) _lib = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3'); return _lib }
|
| 15 |
|
| 16 |
async function ensure(id, onProgress) {
|
| 17 |
const m = get(id)
|
| 18 |
if (_pipe && _loadedId === m.id) return _pipe
|
| 19 |
+
// Guard on _loadingId (set now), not _loadedId (set after load) — else a re-entrant
|
| 20 |
+
// ensure() during a slow download starts a second download.
|
| 21 |
+
if (_loadPromise && _loadingId === m.id) return _loadPromise
|
| 22 |
if (_pipe && _loadedId !== m.id) { try { await _pipe.dispose?.() } catch { /* ignore */ } _pipe = null; _loadedId = null }
|
| 23 |
+
_loadingId = m.id
|
| 24 |
_loadPromise = (async () => {
|
| 25 |
const { pipeline } = await lib()
|
| 26 |
const pipe = await pipeline('text-generation', m.repo, {
|
|
|
|
| 28 |
progress_callback: (p) => { if (onProgress && p.status === 'progress' && p.total) onProgress(p.loaded / p.total) },
|
| 29 |
})
|
| 30 |
_pipe = pipe; _loadedId = m.id; return pipe
|
| 31 |
+
})().catch((e) => { _loadPromise = null; _loadingId = null; throw e })
|
| 32 |
return _loadPromise
|
| 33 |
}
|
| 34 |
|
|
@@ -24,21 +24,28 @@ async function hasF16() {
|
|
| 24 |
}
|
| 25 |
const mlcId = async (m) => `${m.mlcBase}-${(await hasF16()) ? 'q4f16_1' : 'q4f32_1'}-MLC`
|
| 26 |
|
| 27 |
-
let _lib = null, _engine = null, _loadedId = null, _loadPromise = null, _chain = Promise.resolve()
|
| 28 |
async function lib() { if (!_lib) _lib = await import('https://esm.run/@mlc-ai/web-llm'); return _lib }
|
| 29 |
|
| 30 |
async function ensure(id, onProgress) {
|
| 31 |
const m = get(id)
|
| 32 |
if (_engine && _loadedId === m.id) return _engine
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
_loadPromise = (async () => {
|
| 35 |
const { CreateMLCEngine } = await lib()
|
| 36 |
const target = await mlcId(m)
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
| 38 |
if (_engine && _engine.reload) { await _engine.reload(target); _loadedId = m.id; return _engine }
|
| 39 |
_engine = await CreateMLCEngine(target, { initProgressCallback: cb })
|
| 40 |
_loadedId = m.id; return _engine
|
| 41 |
-
})().catch((e) => { _loadPromise = null; throw e })
|
| 42 |
return _loadPromise
|
| 43 |
}
|
| 44 |
|
|
|
|
| 24 |
}
|
| 25 |
const mlcId = async (m) => `${m.mlcBase}-${(await hasF16()) ? 'q4f16_1' : 'q4f32_1'}-MLC`
|
| 26 |
|
| 27 |
+
let _lib = null, _engine = null, _loadedId = null, _loadingId = null, _loadPromise = null, _chain = Promise.resolve()
|
| 28 |
async function lib() { if (!_lib) _lib = await import('https://esm.run/@mlc-ai/web-llm'); return _lib }
|
| 29 |
|
| 30 |
async function ensure(id, onProgress) {
|
| 31 |
const m = get(id)
|
| 32 |
if (_engine && _loadedId === m.id) return _engine
|
| 33 |
+
// Reuse the in-flight load for the SAME model (guard on _loadingId, not _loadedId,
|
| 34 |
+
// which isn't set until the load finishes — otherwise a re-entrant ensure() during
|
| 35 |
+
// a slow download starts a SECOND download).
|
| 36 |
+
if (_loadPromise && _loadingId === m.id) return _loadPromise
|
| 37 |
+
_loadingId = m.id
|
| 38 |
_loadPromise = (async () => {
|
| 39 |
const { CreateMLCEngine } = await lib()
|
| 40 |
const target = await mlcId(m)
|
| 41 |
+
// MLC reports two phases through this one callback: "Fetching param cache…"
|
| 42 |
+
// (network) then "Loading model from cache…" (into GPU). Pass the text so the UI
|
| 43 |
+
// can show which is happening — the 2nd 0→100% is a cache-load, not a re-download.
|
| 44 |
+
const cb = (p) => { if (onProgress) onProgress(typeof p.progress === 'number' ? p.progress : 0, p.text) }
|
| 45 |
if (_engine && _engine.reload) { await _engine.reload(target); _loadedId = m.id; return _engine }
|
| 46 |
_engine = await CreateMLCEngine(target, { initProgressCallback: cb })
|
| 47 |
_loadedId = m.id; return _engine
|
| 48 |
+
})().catch((e) => { _loadPromise = null; _loadingId = null; throw e })
|
| 49 |
return _loadPromise
|
| 50 |
}
|
| 51 |
|
|
@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
|
|
| 6 |
import { mountModelBar } from '/web/modelBar.js'
|
| 7 |
import { extractLivePersona } from '/web/personaStream.js'
|
| 8 |
import { parsePersonaJson } from '/web/personaParse.js'
|
| 9 |
-
import { PERSONA_SYSTEM, personaUserPrompt, stripThink } from '/web/personaPrompts.js'
|
| 10 |
|
| 11 |
const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
|
| 12 |
|
|
@@ -68,10 +68,10 @@ export function mountPersonaPanel(host) {
|
|
| 68 |
thinkEl.textContent = ''; thinkWrap.open = true; stats.textContent = ''
|
| 69 |
try {
|
| 70 |
status.textContent = `loading ${currentModel().label} into your browser…`
|
| 71 |
-
await ensureModel((frac) => { status.textContent = `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
|
| 72 |
status.textContent = `writing on your device with ${currentModel().label}…`
|
| 73 |
let acc = ''
|
| 74 |
-
await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value), {
|
| 75 |
maxTokens: 220,
|
| 76 |
onToken: (piece) => {
|
| 77 |
acc += piece
|
|
|
|
| 6 |
import { mountModelBar } from '/web/modelBar.js'
|
| 7 |
import { extractLivePersona } from '/web/personaStream.js'
|
| 8 |
import { parsePersonaJson } from '/web/personaParse.js'
|
| 9 |
+
import { PERSONA_SYSTEM, personaUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
|
| 10 |
|
| 11 |
const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
|
| 12 |
|
|
|
|
| 68 |
thinkEl.textContent = ''; thinkWrap.open = true; stats.textContent = ''
|
| 69 |
try {
|
| 70 |
status.textContent = `loading ${currentModel().label} into your browser…`
|
| 71 |
+
await ensureModel((frac, label) => { status.textContent = label || `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
|
| 72 |
status.textContent = `writing on your device with ${currentModel().label}…`
|
| 73 |
let acc = ''
|
| 74 |
+
await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value) + noThink(currentModel().id), {
|
| 75 |
maxTokens: 220,
|
| 76 |
onToken: (piece) => {
|
| 77 |
acc += piece
|
|
@@ -37,3 +37,9 @@ export function stripThink(text) {
|
|
| 37 |
.replace(/<think>[\s\S]*$/i, '')
|
| 38 |
.replace(/^\s+/, '')
|
| 39 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
.replace(/<think>[\s\S]*$/i, '')
|
| 38 |
.replace(/^\s+/, '')
|
| 39 |
}
|
| 40 |
+
|
| 41 |
+
// Qwen3 is a thinking model: left alone it burns the whole token budget on a
|
| 42 |
+
// <think> block and never reaches the JSON/answer. For these structured, short
|
| 43 |
+
// tasks we don't want reasoning — Qwen3 honours the `/no_think` soft switch in the
|
| 44 |
+
// prompt (recognised across llama.cpp / WebLLM / Transformers.js). No-op otherwise.
|
| 45 |
+
/**
 * Build the prompt suffix that switches off Qwen3's reasoning phase.
 *
 * @param {*} modelId - Model identifier; falsy values are treated as an empty id.
 * @returns {string} ' /no_think' (with a leading space) when the id contains
 *   "qwen3" in any letter case, otherwise the empty string.
 */
export const noThink = (modelId) => {
  const id = String(modelId || '')
  if (/qwen3/i.test(id)) return ' /no_think'
  return ''
}
|