Spaces:

build-small-hackathon
/

tiny-army

Running

polats Claude Opus 4.8 (1M context) committed 5 days ago

Commit

ab87288

1 Parent(s): 0218dca

Fix Qwen3 parsing (/no_think) + WebLLM double-load guard & progress clarity

Qwen3 is a thinking model: with a 220-token budget it spent everything on a
<think> block and never emitted the JSON/answer, so persona parsing failed because
nothing followed the thinking. Append Qwen3's `/no_think` soft switch to the user
prompt for these structured tasks (persona + diary) so it goes straight to the
answer. The helper noThink(modelId) is keyed on the model id and is a no-op for
non-Qwen3 models.

WebLLM "downloads twice / no cache":
- ensure() only reused the in-flight load when _loadedId matched, but _loadedId is
set AFTER the load finishes, so a re-entrant ensure() during a slow download could
start a SECOND load. Guard on _loadingId (set up front) instead — the same fix is
applied to the Transformers.js engine.
- MLC runs two phases through one progress callback (Fetching from network, then
Loading from cache into GPU) — both 0→100%, which reads as "downloading twice".
Pass MLC's progress text through so the status line shows which phase it is; the
second pass is a cache-load, not a re-download. (Caching = MLC Cache API + our
storage.persist(); evictable under quota pressure.)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (5) hide show

web/diaryPanel.js +3 -3
web/engineTransformers.js +6 -3
web/engineWebllm.js +11 -4
web/personaPanel.js +3 -3
web/personaPrompts.js +6 -0

web/diaryPanel.js CHANGED Viewed

@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
 import { mountModelBar } from '/web/modelBar.js'
 import { mountTtsBar } from '/web/ttsBar.js'
 import { makeNarrator, ensureTts } from '/web/tts.js'
-import { DIARY_SYSTEM, diaryUserPrompt, stripThink } from '/web/personaPrompts.js'
 function el(tag, props = {}, kids = []) {
   const n = document.createElement(tag)
@@ -112,10 +112,10 @@ export function mountDiaryPanel(host) {
     try {
       status.textContent = `loading ${currentModel().label} into your browser…`
-      await ensureModel((frac) => { status.textContent = `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
       status.textContent = `writing on your device with ${currentModel().label}…`
       let raw = ''
-      await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
         maxTokens: 220, temperature: 0.9,
         onToken: (piece) => {
           raw += piece

 import { mountModelBar } from '/web/modelBar.js'
 import { mountTtsBar } from '/web/ttsBar.js'
 import { makeNarrator, ensureTts } from '/web/tts.js'
+import { DIARY_SYSTEM, diaryUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
 function el(tag, props = {}, kids = []) {
   const n = document.createElement(tag)
     try {
       status.textContent = `loading ${currentModel().label} into your browser…`
+      await ensureModel((frac, label) => { status.textContent = label || `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
       status.textContent = `writing on your device with ${currentModel().label}…`
       let raw = ''
+      await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value) + noThink(currentModel().id), {
         maxTokens: 220, temperature: 0.9,
         onToken: (piece) => {
           raw += piece

web/engineTransformers.js CHANGED Viewed

@@ -10,14 +10,17 @@ const MODELS = [
 const get = (id) => MODELS.find((m) => m.id === id) || MODELS[0]
 const device = () => { try { return navigator.gpu ? 'webgpu' : 'wasm' } catch { return 'wasm' } }
-let _lib = null, _pipe = null, _loadedId = null, _loadPromise = null, _chain = Promise.resolve()
 async function lib() { if (!_lib) _lib = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3'); return _lib }
 async function ensure(id, onProgress) {
   const m = get(id)
   if (_pipe && _loadedId === m.id) return _pipe
-  if (_loadPromise && _loadedId === m.id) return _loadPromise
   if (_pipe && _loadedId !== m.id) { try { await _pipe.dispose?.() } catch { /* ignore */ } _pipe = null; _loadedId = null }
   _loadPromise = (async () => {
     const { pipeline } = await lib()
     const pipe = await pipeline('text-generation', m.repo, {
@@ -25,7 +28,7 @@ async function ensure(id, onProgress) {
       progress_callback: (p) => { if (onProgress && p.status === 'progress' && p.total) onProgress(p.loaded / p.total) },
     })
     _pipe = pipe; _loadedId = m.id; return pipe
-  })().catch((e) => { _loadPromise = null; throw e })
   return _loadPromise
 }

 const get = (id) => MODELS.find((m) => m.id === id) || MODELS[0]
 const device = () => { try { return navigator.gpu ? 'webgpu' : 'wasm' } catch { return 'wasm' } }
+let _lib = null, _pipe = null, _loadedId = null, _loadingId = null, _loadPromise = null, _chain = Promise.resolve()
 async function lib() { if (!_lib) _lib = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3'); return _lib }
 async function ensure(id, onProgress) {
   const m = get(id)
   if (_pipe && _loadedId === m.id) return _pipe
+  // Guard on _loadingId (set now), not _loadedId (set after load) — else a re-entrant
+  // ensure() during a slow download starts a second download.
+  if (_loadPromise && _loadingId === m.id) return _loadPromise
   if (_pipe && _loadedId !== m.id) { try { await _pipe.dispose?.() } catch { /* ignore */ } _pipe = null; _loadedId = null }
+  _loadingId = m.id
   _loadPromise = (async () => {
     const { pipeline } = await lib()
     const pipe = await pipeline('text-generation', m.repo, {
       progress_callback: (p) => { if (onProgress && p.status === 'progress' && p.total) onProgress(p.loaded / p.total) },
     })
     _pipe = pipe; _loadedId = m.id; return pipe
+  })().catch((e) => { _loadPromise = null; _loadingId = null; throw e })
   return _loadPromise
 }

web/engineWebllm.js CHANGED Viewed

@@ -24,21 +24,28 @@ async function hasF16() {
 }
 const mlcId = async (m) => `${m.mlcBase}-${(await hasF16()) ? 'q4f16_1' : 'q4f32_1'}-MLC`
-let _lib = null, _engine = null, _loadedId = null, _loadPromise = null, _chain = Promise.resolve()
 async function lib() { if (!_lib) _lib = await import('https://esm.run/@mlc-ai/web-llm'); return _lib }
 async function ensure(id, onProgress) {
   const m = get(id)
   if (_engine && _loadedId === m.id) return _engine
-  if (_loadPromise && _loadedId === m.id) return _loadPromise
   _loadPromise = (async () => {
     const { CreateMLCEngine } = await lib()
     const target = await mlcId(m)
-    const cb = (p) => { if (onProgress && typeof p.progress === 'number') onProgress(p.progress) }
     if (_engine && _engine.reload) { await _engine.reload(target); _loadedId = m.id; return _engine }
     _engine = await CreateMLCEngine(target, { initProgressCallback: cb })
     _loadedId = m.id; return _engine
-  })().catch((e) => { _loadPromise = null; throw e })
   return _loadPromise
 }

 }
 const mlcId = async (m) => `${m.mlcBase}-${(await hasF16()) ? 'q4f16_1' : 'q4f32_1'}-MLC`
+let _lib = null, _engine = null, _loadedId = null, _loadingId = null, _loadPromise = null, _chain = Promise.resolve()
 async function lib() { if (!_lib) _lib = await import('https://esm.run/@mlc-ai/web-llm'); return _lib }
 async function ensure(id, onProgress) {
   const m = get(id)
   if (_engine && _loadedId === m.id) return _engine
+  // Reuse the in-flight load for the SAME model (guard on _loadingId, not _loadedId,
+  // which isn't set until the load finishes — otherwise a re-entrant ensure() during
+  // a slow download starts a SECOND download).
+  if (_loadPromise && _loadingId === m.id) return _loadPromise
+  _loadingId = m.id
   _loadPromise = (async () => {
     const { CreateMLCEngine } = await lib()
     const target = await mlcId(m)
+    // MLC reports two phases through this one callback: "Fetching param cache…"
+    // (network) then "Loading model from cache…" (into GPU). Pass the text so the UI
+    // can show which is happening — the 2nd 0→100% is a cache-load, not a re-download.
+    const cb = (p) => { if (onProgress) onProgress(typeof p.progress === 'number' ? p.progress : 0, p.text) }
     if (_engine && _engine.reload) { await _engine.reload(target); _loadedId = m.id; return _engine }
     _engine = await CreateMLCEngine(target, { initProgressCallback: cb })
     _loadedId = m.id; return _engine
+  })().catch((e) => { _loadPromise = null; _loadingId = null; throw e })
   return _loadPromise
 }

web/personaPanel.js CHANGED Viewed

@@ -6,7 +6,7 @@ import { streamChat, ensureModel, currentModel } from '/web/runtime.js'
 import { mountModelBar } from '/web/modelBar.js'
 import { extractLivePersona } from '/web/personaStream.js'
 import { parsePersonaJson } from '/web/personaParse.js'
-import { PERSONA_SYSTEM, personaUserPrompt, stripThink } from '/web/personaPrompts.js'
 const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
@@ -68,10 +68,10 @@ export function mountPersonaPanel(host) {
     thinkEl.textContent = ''; thinkWrap.open = true; stats.textContent = ''
     try {
       status.textContent = `loading ${currentModel().label} into your browser…`
-      await ensureModel((frac) => { status.textContent = `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
       status.textContent = `writing on your device with ${currentModel().label}…`
       let acc = ''
-      await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value), {
         maxTokens: 220,
         onToken: (piece) => {
           acc += piece

 import { mountModelBar } from '/web/modelBar.js'
 import { extractLivePersona } from '/web/personaStream.js'
 import { parsePersonaJson } from '/web/personaParse.js'
+import { PERSONA_SYSTEM, personaUserPrompt, stripThink, noThink } from '/web/personaPrompts.js'
 const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
     thinkEl.textContent = ''; thinkWrap.open = true; stats.textContent = ''
     try {
       status.textContent = `loading ${currentModel().label} into your browser…`
+      await ensureModel((frac, label) => { status.textContent = label || `downloading ${currentModel().label}… ${Math.round(frac * 100)}% (one-time)` })
       status.textContent = `writing on your device with ${currentModel().label}…`
       let acc = ''
+      await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value) + noThink(currentModel().id), {
         maxTokens: 220,
         onToken: (piece) => {
           acc += piece

web/personaPrompts.js CHANGED Viewed

@@ -37,3 +37,9 @@ export function stripThink(text) {
     .replace(/<think>[\s\S]*$/i, '')
     .replace(/^\s+/, '')
 }

     .replace(/<think>[\s\S]*$/i, '')
     .replace(/^\s+/, '')
 }
+// Qwen3 is a thinking model: left alone it burns the whole token budget on a
+// <think> block and never reaches the JSON/answer. For these structured, short
+// tasks we don't want reasoning — Qwen3 honours the `/no_think` soft switch in the
+// prompt (recognised across llama.cpp / WebLLM / Transformers.js). No-op otherwise.
+export const noThink = (modelId) => (/qwen3/i.test(String(modelId || '')) ? ' /no_think' : '')