Spaces:

build-small-hackathon
/

tiny-army

Running

polats Claude Opus 4.8 (1M context) committed on Jun 4

Commit

03708ca

1 Parent(s): bd4a81a

Personas + War Diary now run llama.cpp IN THE BROWSER via wllama (local-first)

Generation moved off the slow Space CPU (0.58 tok/s) onto the visitor's device:
wllama (llama.cpp compiled to WASM) loads a small GGUF from HF and streams tokens
client-side. Earns 🔌 Off the Grid (no cloud/server inference) + 🦙 Llama Champion
(it IS llama.cpp). Reuses woid's persona parser (vendored personaParse.js) +
extractLivePersona. New web/wllamaLlm.js + personaPrompts.js + diaryPanel.js; the
Barracks + Personas tabs are now gr.HTML divs filled by the in-browser panels. The
Python llama-cpp-python path stays as a lazy fallback (no startup prewarm).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (7) hide show

app.py +12 -12
web/diaryPanel.js +57 -0
web/personaPanel.js +30 -28
web/personaParse.js +100 -0
web/personaPrompts.js +30 -0
web/tiny.js +3 -1
web/wllamaLlm.js +55 -0

app.py CHANGED Viewed

@@ -91,10 +91,11 @@ THEME = ('<style>'
          # Gradio still hides it (display:none on the inactive tab's ancestor).
          '.gradio-container .tabitem{padding:0 !important;}'
          '.gradio-container .tabs{border:0 !important;}'
-         '#sprite-stage,#persona-stage{position:fixed !important;top:0;bottom:0;right:0;'
-         'left:var(--tac-w,240px);height:auto !important;z-index:1;}'
-         'body.tac-collapsed #sprite-stage,body.tac-collapsed #persona-stage{left:0;}'
-         '@media (max-width:768px){#sprite-stage,#persona-stage{left:0;}}'
          '</style>')
 HEAD = ('<meta http-equiv="Content-Security-Policy" content="upgrade-insecure-requests">'
         + HIDE_TABS + FONTS + THEME +
@@ -187,14 +188,11 @@ with gr.Blocks(title="Tiny Army") as demo:
         battle_tab.select(None, None, None, js="()=>window.tinyResize&&window.tinyResize()")
         sprite_tab.select(None, None, None, js="()=>window.tinyResize&&window.tinyResize()")
         with gr.Tab("Barracks"):
-            with gr.Row():
-                unit = gr.Textbox("Bram the Warrior", label="Unit")
-                traits = gr.Textbox("Cautious, Veteran, Vengeful", label="Traits")
-            out = gr.Textbox(label="War diary", lines=6)
-            gr.Button("Write diary", variant="primary").click(diary, [unit, traits], out)
         with gr.Tab("Personas"):
-            # The vanilla persona panel (web/personaPanel.js) builds the whole page
-            # into this div and streams from /persona/generate/stream.
             gr.HTML('<div id="persona-stage" style="overflow:hidden"></div>')
 # Mount Gradio on FastAPI so we can also serve the JS module + the sprite assets.
@@ -312,7 +310,9 @@ app = gr.mount_gradio_app(fastapi_app, demo, path="/", head=HEAD, theme=gr.theme
 if __name__ == "__main__":
-    llm.prewarm()  # load the GGUF in the background so the first request is warm
     # proxy_headers + trusting forwarded IPs lets Gradio honour X-Forwarded-Proto
     # from HF's edge, so it generates https (not http) asset URLs behind the proxy.
     uvicorn.run(app, host="0.0.0.0", port=7860,

          # Gradio still hides it (display:none on the inactive tab's ancestor).
          '.gradio-container .tabitem{padding:0 !important;}'
          '.gradio-container .tabs{border:0 !important;}'
+         '#sprite-stage,#persona-stage,#diary-stage{position:fixed !important;top:0;bottom:0;'
+         'right:0;left:var(--tac-w,240px);height:auto !important;z-index:1;}'
+         'body.tac-collapsed #sprite-stage,body.tac-collapsed #persona-stage,'
+         'body.tac-collapsed #diary-stage{left:0;}'
+         '@media (max-width:768px){#sprite-stage,#persona-stage,#diary-stage{left:0;}}'
          '</style>')
 HEAD = ('<meta http-equiv="Content-Security-Policy" content="upgrade-insecure-requests">'
         + HIDE_TABS + FONTS + THEME +
         battle_tab.select(None, None, None, js="()=>window.tinyResize&&window.tinyResize()")
         sprite_tab.select(None, None, None, js="()=>window.tinyResize&&window.tinyResize()")
         with gr.Tab("Barracks"):
+            # In-browser war-diary (web/diaryPanel.js → wllama, llama.cpp WASM). Runs
+            # entirely on the visitor's device — no server inference.
+            gr.HTML('<div id="diary-stage" style="overflow:hidden"></div>')
         with gr.Tab("Personas"):
+            # In-browser persona generator (web/personaPanel.js → wllama).
             gr.HTML('<div id="persona-stage" style="overflow:hidden"></div>')
 # Mount Gradio on FastAPI so we can also serve the JS module + the sprite assets.
 if __name__ == "__main__":
+    # The default UI runs the model IN THE BROWSER (wllama). The Python llama.cpp path
+    # stays as a lazy fallback (only loads if /persona/generate/stream is hit), so we
+    # don't pre-download it here.
     # proxy_headers + trusting forwarded IPs lets Gradio honour X-Forwarded-Proto
     # from HF's edge, so it generates https (not http) asset URLs behind the proxy.
     uvicorn.run(app, host="0.0.0.0", port=7860,

web/diaryPanel.js ADDED Viewed

	@@ -0,0 +1,57 @@

+// War-diary panel — vanilla DOM, mounted by tiny.js into #diary-stage. Streams a
+// first-person diary entry generated ON THE USER'S DEVICE via wllama (llama.cpp
+// WASM). Shares the persona panel's styling (.persona-*) and the same local model.
+import { streamChat, ensureModel, modelLabel } from '/web/wllamaLlm.js'
+import { DIARY_SYSTEM, diaryUserPrompt } from '/web/personaPrompts.js'
+function el(tag, props = {}, kids = []) {
+  const n = document.createElement(tag)
+  for (const [k, v] of Object.entries(props)) {
+    if (k === 'class') n.className = v
+    else if (k.startsWith('on') && typeof v === 'function') n.addEventListener(k.slice(2), v)
+    else if (v != null) n.setAttribute(k, v)
+  }
+  for (const kid of [].concat(kids)) if (kid != null) n.append(kid)
+  return n
+}
+export function mountDiaryPanel(host) {
+  const unit = el('input', { class: 'persona-input', type: 'text', value: 'Bram the Warrior' })
+  const traits = el('input', { class: 'persona-input', type: 'text', value: 'Cautious, Veteran, Vengeful' })
+  const status = el('div', { class: 'persona-status' }, 'Runs on your device — no cloud.')
+  const btn = el('button', { class: 'persona-go', type: 'button' }, '✒ Write war diary')
+  const out = el('div', { class: 'persona-about' }, 'A first-person diary entry, written by a small llama.cpp model in your browser.')
+  const controls = el('aside', { class: 'persona-controls' }, [
+    el('h2', { class: 'persona-title' }, 'War Diary'),
+    el('label', { class: 'persona-label' }, 'Unit'), unit,
+    el('label', { class: 'persona-label' }, 'Traits'), traits,
+    btn, status,
+  ])
+  const result = el('div', { class: 'persona-result' }, [out])
+  host.appendChild(el('div', { class: 'persona-view' }, [controls, result]))
+  let busy = false
+  async function write() {
+    if (busy) return
+    busy = true; btn.disabled = true
+    const header = `— Diary of ${(unit.value || 'a nameless soldier').trim()} —\n\n`
+    out.textContent = header
+    try {
+      status.textContent = 'loading the model into your browser…'
+      await ensureModel((frac) => { status.textContent = `downloading model… ${Math.round(frac * 100)}% (one-time, then cached)` })
+      status.textContent = `writing with ${modelLabel()} — on your device…`
+      let first = true
+      await streamChat(DIARY_SYSTEM, diaryUserPrompt(unit.value, traits.value), {
+        maxTokens: 200, temperature: 0.9,
+        onToken: (piece) => { if (first) { out.textContent = header; first = false } out.textContent += piece },
+      })
+      status.textContent = 'written ✓ (generated locally)'
+    } catch (e) {
+      status.textContent = `couldn't run the local model: ${e.message || e}`
+    } finally {
+      busy = false; btn.disabled = false
+    }
+  }
+  btn.addEventListener('click', write)
+}

web/personaPanel.js CHANGED Viewed

@@ -1,8 +1,11 @@
 // Tiny Army persona panel — vanilla DOM, mounted by tiny.js into #persona-stage.
-// Reuses woid's persona SSE client (/web/personaStream.js) VERBATIM against the
-// Space's own /persona/generate/stream endpoint, live-updating name/about as tokens
-// stream (the same extractLivePersona trick woid uses). No Pixi, no framework.
-import { streamGenerateProfile, extractLivePersona } from '/web/personaStream.js'
 const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
@@ -19,17 +22,16 @@ function el(tag, props = {}, kids = []) {
 }
 export function mountPersonaPanel(host, opts = {}) {
-  const path = opts.path || '/persona/generate/stream'
   const classes = opts.classes || CLASSES
   const sel = el('select', { class: 'persona-input' }, classes.map((c) => el('option', { value: c }, c)))
   const seed = el('input', { class: 'persona-input', type: 'text', placeholder: 'a word, a vibe… (optional)' })
-  const status = el('div', { class: 'persona-status' })
   const btn = el('button', { class: 'persona-go', type: 'button' }, '⚔ Recruit a soldier')
   const nameEl = el('div', { class: 'persona-name' }, 'Your soldier')
   const tagsEl = el('div', { class: 'persona-tags' })
-  const aboutEl = el('div', { class: 'persona-about' }, 'Pick a class and recruit — the model writes their legend.')
   const controls = el('aside', { class: 'persona-controls' }, [
     el('h2', { class: 'persona-title' }, 'Recruit'),
@@ -50,31 +52,31 @@ export function mountPersonaPanel(host, opts = {}) {
     if (busy) return
     busy = true; btn.disabled = true
     nameEl.textContent = '…'; aboutEl.textContent = ''; tagsEl.replaceChildren()
-    status.textContent = 'summoning the model… (first run downloads it)'
-    let acc = ''
     try {
-      await streamGenerateProfile({
-        bridgeUrl: '', path, body: { class: sel.value, seed: seed.value },
-        onEvent: (evt, parsed) => {
-          if (evt === 'model') status.textContent = `writing with ${parsed?.model || 'the model'}…`
-          else if (evt === 'delta') {
-            acc += (parsed?.content || '')
-            const live = extractLivePersona(acc)
-            if (live.name) nameEl.textContent = live.name
-            if (live.about) aboutEl.textContent = live.about
-          } else if (evt === 'persona-done') {
-            if (parsed?.name) nameEl.textContent = parsed.name
-            if (parsed?.about) aboutEl.textContent = parsed.about
-            setTags(parsed || {})
-          } else if (evt === 'done') {
-            status.textContent = 'enlisted ✓'
-          } else if (evt === 'error') {
-            status.textContent = `couldn't recruit: ${parsed?.error || 'unknown error'}`
-          }
         },
       })
     } catch (e) {
-      status.textContent = `couldn't recruit: ${e.message || e}`
     } finally {
       busy = false; btn.disabled = false
     }

 // Tiny Army persona panel — vanilla DOM, mounted by tiny.js into #persona-stage.
+// Generation runs ON THE USER'S DEVICE via wllama (llama.cpp WASM) — no server, no
+// cloud (🔌 Off the Grid + 🦙 Llama Champion). Reuses woid's persona JSON parser
+// (personaParse.js) + live-extraction (extractLivePersona) verbatim.
+import { streamChat, ensureModel, modelLabel } from '/web/wllamaLlm.js'
+import { extractLivePersona } from '/web/personaStream.js'
+import { parsePersonaJson } from '/web/personaParse.js'
+import { PERSONA_SYSTEM, personaUserPrompt } from '/web/personaPrompts.js'
 const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
 }
 export function mountPersonaPanel(host, opts = {}) {
   const classes = opts.classes || CLASSES
   const sel = el('select', { class: 'persona-input' }, classes.map((c) => el('option', { value: c }, c)))
   const seed = el('input', { class: 'persona-input', type: 'text', placeholder: 'a word, a vibe… (optional)' })
+  const status = el('div', { class: 'persona-status' }, 'Runs on your device — no cloud.')
   const btn = el('button', { class: 'persona-go', type: 'button' }, '⚔ Recruit a soldier')
   const nameEl = el('div', { class: 'persona-name' }, 'Your soldier')
   const tagsEl = el('div', { class: 'persona-tags' })
+  const aboutEl = el('div', { class: 'persona-about' }, 'Pick a class and recruit — a small llama.cpp model in your browser writes their legend.')
   const controls = el('aside', { class: 'persona-controls' }, [
     el('h2', { class: 'persona-title' }, 'Recruit'),
     if (busy) return
     busy = true; btn.disabled = true
     nameEl.textContent = '…'; aboutEl.textContent = ''; tagsEl.replaceChildren()
     try {
+      status.textContent = 'loading the model into your browser…'
+      await ensureModel((frac) => { status.textContent = `downloading model… ${Math.round(frac * 100)}% (one-time, then cached)` })
+      status.textContent = `writing with ${modelLabel()} — on your device…`
+      let acc = ''
+      await streamChat(PERSONA_SYSTEM, personaUserPrompt(sel.value, seed.value), {
+        maxTokens: 200,
+        onToken: (piece) => {
+          acc += piece
+          const live = extractLivePersona(acc)
+          if (live.name) nameEl.textContent = live.name
+          if (live.about) aboutEl.textContent = live.about
         },
       })
+      try {
+        const p = parsePersonaJson(acc)
+        if (p.name) nameEl.textContent = p.name
+        aboutEl.textContent = p.about
+        setTags(p)
+        status.textContent = 'enlisted ✓ (generated locally)'
+      } catch (e) {
+        status.textContent = `the model rambled — couldn't parse a clean persona (${e.message || e})`
+      }
     } catch (e) {
+      status.textContent = `couldn't run the local model: ${e.message || e}`
     } finally {
       busy = false; btn.disabled = false
     }

web/personaParse.js ADDED Viewed

	@@ -0,0 +1,100 @@

+/**
+ * Persona JSON parsing helpers.
+ *
+ * Canonical home: agent-sandbox/woid-core/persona/parse.js.
+ *
+ * LLMs return persona JSON wrapped in noise: code fences, preambles,
+ * trailing commentary, occasionally multi-object emissions. These helpers
+ * defensively extract the first valid JSON object and sanitize the
+ * standard fields (name, about, specialty, personality).
+ *
+ * Pi-bridge has its own generatePersona() that uses these as building
+ * blocks. Brain-server's lib/persona.js uses these via a thin wrapper.
+ */
+/**
+ * Tighten a model-returned name: strip wrapping punctuation, collapse
+ * whitespace, reject obvious "name: foo" LLM leakage. Returns "" if
+ * the name fails sanity (too short/long, or looks like a key-value pair).
+ */
+export function sanitizeName(raw) {
+  const s = String(raw ?? "")
+    .replace(/^[\s"'“”‘’`]+|[\s"'“”‘’`]+$/gu, "")
+    .replace(/\s+/g, " ")
+    .trim();
+  if (s.length < 2 || s.length > 40) return "";
+  if (/^(name|character|persona)\s*[:=]/i.test(s)) return "";
+  return s;
+}
+/**
+ * Trim a short tag (specialty / personality). Returns null for empty
+ * input, ellipsizes anything over 48 chars to 46+ellipsis.
+ */
+export function trimTag(raw) {
+  if (typeof raw !== "string") return null;
+  const s = raw.trim().replace(/\.\s*$/, "");
+  if (!s) return null;
+  return s.length > 48 ? s.slice(0, 46).trim() + "…" : s;
+}
+/**
+ * Walk forward from each `{` until we find a bracket-balanced, string-aware
+ * matching `}`. First successful parse wins. Handles trailing prose, multi-
+ * object emissions, and embedded `}` characters inside string literals.
+ * Returns the parsed object or null.
+ */
+export function extractFirstJsonObject(raw) {
+  for (let i = 0; i < raw.length; i++) {
+    if (raw[i] !== "{") continue;
+    let depth = 0, inStr = false, esc = false;
+    for (let j = i; j < raw.length; j++) {
+      const ch = raw[j];
+      if (inStr) {
+        if (esc) esc = false;
+        else if (ch === "\\") esc = true;
+        else if (ch === '"') inStr = false;
+        continue;
+      }
+      if (ch === '"') inStr = true;
+      else if (ch === "{") depth++;
+      else if (ch === "}") {
+        depth--;
+        if (depth === 0) {
+          const slice = raw.slice(i, j + 1);
+          try { return JSON.parse(slice); } catch { break; }
+        }
+      }
+    }
+  }
+  return null;
+}
+/**
+ * Parse a persona JSON response from an LLM. Strips ```json fences,
+ * uses bracket-balanced extraction, sanitizes name and trims tags.
+ *
+ * Throws if no parseable JSON or no `about` field — these are the two
+ * load-bearing fields. Optional fields: avatar_hint, vibe, specialty,
+ * personality. avatar_hint / vibe are brain-server-style; specialty /
+ * personality are pi-bridge-style. Both are surfaced if present.
+ *
+ * @returns {{name:string|null, about:string, avatar_hint:string, vibe:string, specialty:string|null, personality:string|null}}
+ */
+export function parsePersonaJson(raw) {
+  const fenced = String(raw ?? "").match(/```(?:json)?\s*([\s\S]*?)```/i);
+  const candidate = (fenced?.[1] ?? String(raw ?? "")).trim();
+  const parsed = extractFirstJsonObject(candidate);
+  if (!parsed) throw new Error("model did not return a parseable JSON object");
+  const name = sanitizeName(parsed.name ?? parsed.callSign ?? "");
+  const about = (typeof parsed.about === "string" ? parsed.about.trim() : "").slice(0, 1000);
+  if (!about) throw new Error("model did not return an about");
+  const avatar_hint = String(parsed.avatar_hint ?? parsed.avatarHint ?? "").slice(0, 200);
+  const vibe = String(parsed.vibe ?? "").slice(0, 40);
+  const specialty = trimTag(parsed.specialty ?? parsed.role ?? parsed.job ?? null);
+  const personality = trimTag(parsed.personality ?? parsed.personalityTag ?? null);
+  return { name: name || null, about, avatar_hint, vibe, specialty, personality };
+}

web/personaPrompts.js ADDED Viewed

	@@ -0,0 +1,30 @@

+// Tiny-Army persona + war-diary prompts (mirrors the Python prompts.py for the
+// in-browser path). War-legend tone, not woid's.
+export const PERSONA_SYSTEM =
+  'You invent tiny soldiers for a fantasy auto-battler called Tiny Army, where every ' +
+  'fighter writes its own legend. Given a class and an optional seed, return ONE JSON ' +
+  'object and NOTHING else, with exactly these keys:\n' +
+  '  "name": a short evocative soldier name (2-4 words),\n' +
+  '  "about": 1-3 sentences of backstory in a heroic, slightly wry war-legend tone,\n' +
+  '  "specialty": a 1-3 word combat specialty,\n' +
+  '  "personality": a 1-3 word personality tag,\n' +
+  '  "vibe": a 1-3 word vibe.\n' +
+  'Output strictly valid JSON. No preamble, no code fences, no commentary.'
+export const DIARY_SYSTEM =
+  'You are a tiny soldier in the auto-battler Tiny Army, writing a short first-person ' +
+  'war-diary entry. Given your name and traits, write 2-4 vivid sentences in first ' +
+  'person about a day on the battlefield — heroic, grounded, a touch of dark humor. ' +
+  'Prose only: no headings, no lists, no preamble.'
+export function personaUserPrompt(unitClass = '', seed = '') {
+  const s = seed && seed.trim() ? ` Seed inspiration: "${seed.trim()}".` : ''
+  return `Class: ${(unitClass || 'soldier').trim()}.${s} Return the JSON object now.`
+}
+export function diaryUserPrompt(unit = '', traits = '') {
+  const u = (unit || 'a nameless soldier').trim()
+  const t = (traits || 'untested').trim()
+  return `Name: ${u}. Traits: ${t}. Write the diary entry.`
+}

web/tiny.js CHANGED Viewed

@@ -8,6 +8,7 @@ import { makeTeamBattle, step, FIELD } from '/web/engine.js'
 import { sliceGridWith, cellOf, rowFor, facingFor, ANIM } from '/web/sheet.js'
 import { mountSpritePlayground } from '/web/playground.js'
 import { mountPersonaPanel } from '/web/personaPanel.js'
 function whenEl(id, cb) {
   const found = document.getElementById(id)
@@ -53,8 +54,9 @@ whenEl('sprite-stage', async (el) => {
   playground = mountSpritePlayground(PIXI, el, { packs: man.packs || [], urlFor: spriteUrl })
 })
-// ── Personas tab — vanilla persona panel streaming from /persona/generate/stream ──
 whenEl('persona-stage', (el) => { mountPersonaPanel(el) })
 // ── Battle tab (real sprites, reusing the engine + shared renderer) ──────────
 const PLAYERS = [

 import { sliceGridWith, cellOf, rowFor, facingFor, ANIM } from '/web/sheet.js'
 import { mountSpritePlayground } from '/web/playground.js'
 import { mountPersonaPanel } from '/web/personaPanel.js'
+import { mountDiaryPanel } from '/web/diaryPanel.js'
 function whenEl(id, cb) {
   const found = document.getElementById(id)
   playground = mountSpritePlayground(PIXI, el, { packs: man.packs || [], urlFor: spriteUrl })
 })
+// ── Personas + War Diary tabs — in-browser llama.cpp (wllama), runs on the device ──
 whenEl('persona-stage', (el) => { mountPersonaPanel(el) })
+whenEl('diary-stage', (el) => { mountDiaryPanel(el) })
 // ── Battle tab (real sprites, reusing the engine + shared renderer) ──────────
 const PLAYERS = [

web/wllamaLlm.js ADDED Viewed

	@@ -0,0 +1,55 @@

+// In-browser llama.cpp via wllama (WASM) — runs a GGUF from Hugging Face on the
+// USER's device. This is the local-first path: zero cloud/server inference, so it
+// earns 🔌 Off the Grid AND 🦙 Llama Champion (wllama IS llama.cpp, compiled to
+// WebAssembly). It also dodges the free Space's ~0.6 tok/s CPU — generation runs on
+// the visitor's hardware (typically 10–50× faster). Streams tokens like the server
+// path did, so the persona/diary panels barely change.
+import { Wllama } from 'https://cdn.jsdelivr.net/npm/@wllama/wllama@3.4.1/esm/index.js'
+const WLLAMA_VER = '3.4.1'
+const WASM = { default: `https://cdn.jsdelivr.net/npm/@wllama/wllama@${WLLAMA_VER}/esm/wasm/wllama.wasm` }
+// Small instruct GGUF: ~380 MB, downloaded once then cached by the browser.
+const MODEL = { repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF', file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf' }
+let _wllama = null
+let _loadPromise = null
+let _chain = Promise.resolve() // serialize completions (one model, no parallel decode)
+export function modelLabel() { return MODEL.repo.split('/').pop() }
+// Lazy-load wllama + the GGUF (cached after first download). onProgress(fraction 0..1).
+export function ensureModel(onProgress) {
+  if (_wllama) return Promise.resolve(_wllama)
+  if (_loadPromise) return _loadPromise
+  _loadPromise = (async () => {
+    const w = new Wllama(WASM)
+    await w.loadModelFromHF(MODEL, {
+      n_ctx: 2048,
+      progressCallback: ({ loaded, total }) => onProgress && onProgress(total ? loaded / total : 0),
+    })
+    _wllama = w
+    return w
+  })().catch((e) => { _loadPromise = null; throw e })
+  return _loadPromise
+}
+// Stream a chat completion in-browser. Calls onToken(piece) per chunk; returns full text.
+// Serialized so two panels can't decode at once.
+export function streamChat(system, user, { maxTokens = 200, temperature = 0.8, onToken } = {}) {
+  const run = async () => {
+    const w = await ensureModel()
+    let full = ''
+    const stream = await w.createChatCompletion({
+      messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
+      max_tokens: maxTokens, temperature, top_k: 40, top_p: 0.9, stream: true,
+    })
+    for await (const chunk of stream) {
+      const piece = chunk?.choices?.[0]?.delta?.content || ''
+      if (piece) { full += piece; if (onToken) onToken(piece) }
+    }
+    return full
+  }
+  const p = _chain.then(run, run)
+  _chain = p.catch(() => {})  // keep the chain alive after errors
+  return p
+}