Spaces:

build-small-hackathon
/

tiny-army

Running

polats Claude Opus 4.8 (1M context) committed on Jun 7

Commit

6e155d8

1 Parent(s): 37982be

Skill Forge: optional "show thinking" for coding models

Add a "show model thinking" toggle (off by default) that reveals the reasoning
models' trace in a collapsible debug panel, mirroring the persona debug div and
reusing stripThink/stripThinkFinal.

A `think` flag flows from the checkbox -> streamCoding -> /text/generate/stream
-> the model. Reasoning comes back inline as <think>…</think>; the clean answer
shows in the output, the raw trace in the panel.

- Nemotron (_nim_text_stream): when think=true, drop reasoning_budget=0 and
surface reasoning_content wrapped as <think>…</think>.
- BLS sidecar: new 5th `think` arg streams the reasoning wrapped in <think>
instead of discarding it; threaded through _space_text_stream's *extra
(defaults keep existing 4-arg callers on the clean path).
- Mellum2 has no reasoning, so the flag is a no-op there.

Verified end-to-end through /text/generate/stream for both models (think on/off)
plus the backward-compat 4-arg sidecar call.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (4) hide show

app.py +41 -19
spaces/bls-code-zerogpu/app.py +53 -30
web/codingModel.js +4 -1
web/skillForgePanel.js +29 -2

app.py CHANGED Viewed

@@ -478,7 +478,7 @@ def _tiny_aya_generate(system, user, max_tokens, temperature):
     return str(result or "")
-def _space_text_generate(space, system, user, max_tokens, temperature):
     from gradio_client import Client
     client = Client(space, token=HF_TOKEN or None)
     result = client.predict(
@@ -486,12 +486,13 @@ def _space_text_generate(space, system, user, max_tokens, temperature):
         user or "",
         int(max_tokens or 400),
         float(temperature if temperature is not None else 0.8),
         api_name="/generate",
     )
     return str(result or "")
-def _space_text_stream(space, system, user, max_tokens, temperature):
     from gradio_client import Client
     client = Client(space, token=HF_TOKEN or None)
     try:
@@ -500,6 +501,7 @@ def _space_text_stream(space, system, user, max_tokens, temperature):
             user or "",
             int(max_tokens or 400),
             float(temperature if temperature is not None else 0.8),
             api_name="/generate_stream",
         )
         prev = ""
@@ -510,7 +512,7 @@ def _space_text_stream(space, system, user, max_tokens, temperature):
                 yield text[len(prev):]
             prev = text
     except Exception:
-        text = _space_text_generate(space, system, user, max_tokens, temperature)
         if text:
             yield text
@@ -527,27 +529,32 @@ def _mellum_stream(system, user, max_tokens, temperature):
     yield from _space_text_stream(MELLUM_SPACE, system, user, max_tokens, temperature)
-def _nim_text_stream(system, user, max_tokens, temperature, model=None):
     """Stream from NVIDIA NIM's OpenAI-compatible chat endpoint (hosted Nemotron). Same
-    nvapi-… key as the portrait NIM. reasoning_budget=0 keeps the coding output clean
-    (Nemotron defaults thinking ON, which would otherwise emit a <think> trace)."""
     model = model or _NIM_NEMOTRON_MODEL  # defined later in the file; resolve at call time
     messages = []
     if system and system.strip():
         messages.append({"role": "system", "content": system.strip()})
     messages.append({"role": "user", "content": (user or "").strip()})
-    body = _json.dumps({
         "model": model,
         "messages": messages,
         "max_tokens": int(max_tokens or 512),
         "temperature": float(temperature if temperature is not None else 0.6),
         "top_p": 0.95,
         "stream": True,
-        "reasoning_budget": 0,
-    }).encode()
     req = urllib.request.Request(_NIM_TEXT_URL, data=body, method="POST", headers={
         "Authorization": f"Bearer {NIM_KEY}", "Content-Type": "application/json", "Accept": "text/event-stream",
     })
     with urllib.request.urlopen(req, timeout=120) as resp:
         for raw in resp:
             line = raw.decode("utf-8").strip()
@@ -557,11 +564,23 @@ def _nim_text_stream(system, user, max_tokens, temperature, model=None):
             if data == "[DONE]":
                 break
             try:
-                delta = _json.loads(data)["choices"][0]["delta"].get("content")
             except Exception:  # noqa: BLE001
                 continue
-            if delta:
-                yield delta
 def _mellum_stream_with_fallback(system, user, max_tokens, temperature):
@@ -581,24 +600,25 @@ def _mellum_stream_with_fallback(system, user, max_tokens, temperature):
         yield from _nim_text_stream(system, user, max_tokens, temperature)
-def _bls_code_stream(system, user, max_tokens, temperature):
-    yield from _space_text_stream(BLS_CODE_SPACE, system, user, max_tokens, temperature)
-def _bls_code_stream_with_fallback(system, user, max_tokens, temperature):
     """BLS Mini-Code ZeroGPU sidecar, falling back to Nemotron (NVIDIA NIM) if the sidecar is
     unavailable BEFORE any token streams (same constraint as Mellum2: can't switch mid-stream)."""
     emitted = False
     try:
         if not BLS_CODE_SPACE:
             raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
-        for chunk in _bls_code_stream(system, user, max_tokens, temperature):
             emitted = True
             yield chunk
     except Exception:  # noqa: BLE001
         if emitted or not NIM_KEY:
             raise
-        yield from _nim_text_stream(system, user, max_tokens, temperature)
 @fastapi_app.post("/voxcpm-tts")
@@ -887,6 +907,8 @@ async def text_generate_stream(request: Request):
     user = body.get("user") or ""
     max_tokens = int(body.get("max_tokens") or body.get("maxTokens") or 400)
     temperature = float(body.get("temperature") if body.get("temperature") is not None else 0.8)
     stop = threading.Event()
     async def gen():
@@ -923,14 +945,14 @@ async def text_generate_stream(request: Request):
                     # BLS Mini-Code sidecar, with Nemotron NIM as fallback if it's unavailable.
                     if not BLS_CODE_SPACE and not NIM_KEY:
                         raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
-                    for chunk in _bls_code_stream_with_fallback(system, user, max_tokens, temperature):
                         if stop.is_set():
                             break
                         loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
                 elif model == "nemotron-3-nano-30b-nim":
                     if not NIM_KEY:
                         raise llm.LlmUnavailable("NVIDIA_NIM_API_KEY not set")
-                    for chunk in _nim_text_stream(system, user, max_tokens, temperature):
                         if stop.is_set():
                             break
                         loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))

     return str(result or "")
+def _space_text_generate(space, system, user, max_tokens, temperature, *extra):
     from gradio_client import Client
     client = Client(space, token=HF_TOKEN or None)
     result = client.predict(
         user or "",
         int(max_tokens or 400),
         float(temperature if temperature is not None else 0.8),
+        *extra,  # optional trailing inputs (e.g. BLS sidecar's `think` flag)
         api_name="/generate",
     )
     return str(result or "")
+def _space_text_stream(space, system, user, max_tokens, temperature, *extra):
     from gradio_client import Client
     client = Client(space, token=HF_TOKEN or None)
     try:
             user or "",
             int(max_tokens or 400),
             float(temperature if temperature is not None else 0.8),
+            *extra,  # optional trailing inputs (e.g. BLS sidecar's `think` flag)
             api_name="/generate_stream",
         )
         prev = ""
                 yield text[len(prev):]
             prev = text
     except Exception:
+        text = _space_text_generate(space, system, user, max_tokens, temperature, *extra)
         if text:
             yield text
     yield from _space_text_stream(MELLUM_SPACE, system, user, max_tokens, temperature)
+def _nim_text_stream(system, user, max_tokens, temperature, model=None, think=False):
     """Stream from NVIDIA NIM's OpenAI-compatible chat endpoint (hosted Nemotron). Same
+    nvapi-… key as the portrait NIM. think=False sets reasoning_budget=0 to keep the coding
+    output clean (Nemotron defaults thinking ON); think=True lets it reason and surfaces the
+    reasoning_content wrapped in <think>…</think> ahead of the answer, so the caller can show
+    it in a debug panel (same convention as the persona models)."""
     model = model or _NIM_NEMOTRON_MODEL  # defined later in the file; resolve at call time
     messages = []
     if system and system.strip():
         messages.append({"role": "system", "content": system.strip()})
     messages.append({"role": "user", "content": (user or "").strip()})
+    payload = {
         "model": model,
         "messages": messages,
         "max_tokens": int(max_tokens or 512),
         "temperature": float(temperature if temperature is not None else 0.6),
         "top_p": 0.95,
         "stream": True,
+    }
+    if not think:
+        payload["reasoning_budget"] = 0  # omit entirely to let Nemotron reason
+    body = _json.dumps(payload).encode()
     req = urllib.request.Request(_NIM_TEXT_URL, data=body, method="POST", headers={
         "Authorization": f"Bearer {NIM_KEY}", "Content-Type": "application/json", "Accept": "text/event-stream",
     })
+    think_open = False
     with urllib.request.urlopen(req, timeout=120) as resp:
         for raw in resp:
             line = raw.decode("utf-8").strip()
             if data == "[DONE]":
                 break
             try:
+                delta = _json.loads(data)["choices"][0]["delta"]
             except Exception:  # noqa: BLE001
                 continue
+            reasoning = delta.get("reasoning_content") if think else None
+            content = delta.get("content")
+            if reasoning:
+                if not think_open:
+                    yield "<think>"
+                    think_open = True
+                yield reasoning
+            if content:
+                if think_open:
+                    yield "</think>\n"
+                    think_open = False
+                yield content
+    if think_open:
+        yield "</think>\n"
 def _mellum_stream_with_fallback(system, user, max_tokens, temperature):
         yield from _nim_text_stream(system, user, max_tokens, temperature)
+def _bls_code_stream(system, user, max_tokens, temperature, think=False):
+    # `think` is the BLS sidecar's optional 5th input; passed through _space_text_stream's *extra.
+    yield from _space_text_stream(BLS_CODE_SPACE, system, user, max_tokens, temperature, bool(think))
+def _bls_code_stream_with_fallback(system, user, max_tokens, temperature, think=False):
     """BLS Mini-Code ZeroGPU sidecar, falling back to Nemotron (NVIDIA NIM) if the sidecar is
     unavailable BEFORE any token streams (same constraint as Mellum2: can't switch mid-stream)."""
     emitted = False
     try:
         if not BLS_CODE_SPACE:
             raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
+        for chunk in _bls_code_stream(system, user, max_tokens, temperature, think):
             emitted = True
             yield chunk
     except Exception:  # noqa: BLE001
         if emitted or not NIM_KEY:
             raise
+        yield from _nim_text_stream(system, user, max_tokens, temperature, think=think)
 @fastapi_app.post("/voxcpm-tts")
     user = body.get("user") or ""
     max_tokens = int(body.get("max_tokens") or body.get("maxTokens") or 400)
     temperature = float(body.get("temperature") if body.get("temperature") is not None else 0.8)
+    # When set, reasoning models (Nemotron, BLS) surface their <think> trace instead of hiding it.
+    think = bool(body.get("think"))
     stop = threading.Event()
     async def gen():
                     # BLS Mini-Code sidecar, with Nemotron NIM as fallback if it's unavailable.
                     if not BLS_CODE_SPACE and not NIM_KEY:
                         raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
+                    for chunk in _bls_code_stream_with_fallback(system, user, max_tokens, temperature, think):
                         if stop.is_set():
                             break
                         loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
                 elif model == "nemotron-3-nano-30b-nim":
                     if not NIM_KEY:
                         raise llm.LlmUnavailable("NVIDIA_NIM_API_KEY not set")
+                    for chunk in _nim_text_stream(system, user, max_tokens, temperature, think=think):
                         if stop.is_set():
                             break
                         loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))

spaces/bls-code-zerogpu/app.py CHANGED Viewed

@@ -76,21 +76,42 @@ def _build_inputs(system, user):
     return {k: v.to(_model.device) for k, v in enc.items()}
-def _extract_response(raw):
-    """Pull just the answer out of a (possibly partial) raw decode: content after
-    <|START_RESPONSE|> (or after <|END_THINKING|> as a fallback), up to <|END_RESPONSE|>."""
-    i = raw.find(START_RESP)
-    if i != -1:
-        body = raw[i + len(START_RESP):]
     else:
-        j = raw.find(END_THINK)
-        body = raw[j + len(END_THINK):] if j != -1 else ""
-    k = body.find(END_RESP)
     if k != -1:
-        body = body[:k]
-    for mark in _STRIP:
-        body = body.replace(mark, "")
-    return body.strip()
 def _gen_kwargs(inputs, max_tokens, temperature):
@@ -108,9 +129,10 @@ def _gen_kwargs(inputs, max_tokens, temperature):
 @spaces.GPU(duration=GPU_DURATION)
-def generate_stream(system, user, max_tokens, temperature):
-    """Stream CUMULATIVE response text (thinking suppressed). The main app diffs successive
-    yields into deltas. On failure, yield the traceback so it isn't a silent empty stream."""
     try:
         inputs = _build_inputs(system, user)
         # skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
@@ -129,32 +151,31 @@ def generate_stream(system, user, max_tokens, temperature):
         thread = threading.Thread(target=_run)
         thread.start()
-        acc, started = "", False
         for piece in streamer:
             acc += piece
-            if not started:
-                if START_RESP not in acc:
-                    continue  # still in the thinking block — emit nothing yet
-                started = True
-            yield _extract_response(acc)
         thread.join()
         if err:
-            yield (_extract_response(acc) + "\n[GENERATE ERROR]\n" + err["tb"])
-        elif not started:
-            # Model never opened a response block — fall back to whatever's after thinking.
-            yield _extract_response(acc) or "[EMPTY OUTPUT — no response block produced]"
     except Exception:  # noqa: BLE001
         import traceback
         yield "[SETUP ERROR]\n" + traceback.format_exc()
 @spaces.GPU(duration=GPU_DURATION)
-def generate(system, user, max_tokens, temperature):
     try:
         inputs = _build_inputs(system, user)
         out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
         raw = _tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
-        return _extract_response(raw) or "[EMPTY OUTPUT]"
     except Exception:  # noqa: BLE001
         import traceback
         return "[ERROR]\n" + traceback.format_exc()
@@ -167,14 +188,16 @@ with gr.Blocks(title="BLS Mini-Code 1.0 — Tiny Army sidecar") as demo:
     usr_in = gr.Textbox(label="user", lines=6)
     mt_in = gr.Slider(16, 2048, value=512, step=16, label="max_tokens")
     temp_in = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="temperature")
     out = gr.Textbox(label="output", lines=12)
     with gr.Row():
         stream_btn = gr.Button("Stream", variant="primary")
         once_btn = gr.Button("Generate")
     stream_btn.click(
-        generate_stream, [sys_in, usr_in, mt_in, temp_in], out, api_name="generate_stream"
     )
-    once_btn.click(generate, [sys_in, usr_in, mt_in, temp_in], out, api_name="generate")
 if __name__ == "__main__":
     demo.queue().launch()

     return {k: v.to(_model.device) for k, v in enc.items()}
+def _clean(s):
+    for mark in _STRIP:
+        s = s.replace(mark, "")
+    return s
+def _split(raw):
+    """Split a (possibly partial) raw decode into (thinking, response, response_started):
+    everything before <|START_RESPONSE|> (or <|END_THINKING|>) is reasoning; the rest, up to
+    <|END_RESPONSE|>, is the answer."""
+    resp_i = raw.find(START_RESP)
+    if resp_i != -1:
+        think_part, resp, started = raw[:resp_i], raw[resp_i + len(START_RESP):], True
     else:
+        end_t = raw.find(END_THINK)
+        if end_t != -1:
+            think_part, resp, started = raw[:end_t], raw[end_t + len(END_THINK):], True
+        else:
+            think_part, resp, started = raw, "", False
+    k = resp.find(END_RESP)
     if k != -1:
+        resp = resp[:k]
+    return _clean(think_part).strip(), _clean(resp).strip(), started
+def _render(raw, think):
+    """Cumulative output string. think=False → clean answer only (reasoning discarded).
+    think=True → reasoning wrapped in <think>…</think> ahead of the answer; the main app
+    strips it for the clean view but shows it in a debug panel (same convention the persona
+    models use), so the user can watch the model reason."""
+    thinking, resp, started = _split(raw)
+    if not think:
+        return resp
+    if started:
+        return f"<think>\n{thinking}\n</think>\n{resp}".strip()
+    return f"<think>\n{thinking}".strip()
 def _gen_kwargs(inputs, max_tokens, temperature):
 @spaces.GPU(duration=GPU_DURATION)
+def generate_stream(system, user, max_tokens, temperature, think=False):
+    """Stream CUMULATIVE output. think=False suppresses reasoning (clean code only); think=True
+    streams the reasoning live wrapped in <think>…</think>. The main app diffs successive yields
+    into deltas. On failure, yield the traceback so it isn't a silent empty stream."""
     try:
         inputs = _build_inputs(system, user)
         # skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
         thread = threading.Thread(target=_run)
         thread.start()
+        acc, emitted = "", False
         for piece in streamer:
             acc += piece
+            # When hiding thinking, emit nothing until the response block opens.
+            if not think and not _split(acc)[2]:
+                continue
+            emitted = True
+            yield _render(acc, think)
         thread.join()
         if err:
+            yield (_render(acc, think) + "\n[GENERATE ERROR]\n" + err["tb"])
+        elif not emitted:
+            yield _render(acc, think) or "[EMPTY OUTPUT — no response block produced]"
     except Exception:  # noqa: BLE001
         import traceback
         yield "[SETUP ERROR]\n" + traceback.format_exc()
 @spaces.GPU(duration=GPU_DURATION)
+def generate(system, user, max_tokens, temperature, think=False):
     try:
         inputs = _build_inputs(system, user)
         out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
         raw = _tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
+        return _render(raw, think) or "[EMPTY OUTPUT]"
     except Exception:  # noqa: BLE001
         import traceback
         return "[ERROR]\n" + traceback.format_exc()
     usr_in = gr.Textbox(label="user", lines=6)
     mt_in = gr.Slider(16, 2048, value=512, step=16, label="max_tokens")
     temp_in = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="temperature")
+    # 5th input — defaults False so existing 4-arg API callers keep getting clean code.
+    think_in = gr.Checkbox(value=False, label="show thinking (wrap reasoning in <think>…</think>)")
     out = gr.Textbox(label="output", lines=12)
     with gr.Row():
         stream_btn = gr.Button("Stream", variant="primary")
         once_btn = gr.Button("Generate")
     stream_btn.click(
+        generate_stream, [sys_in, usr_in, mt_in, temp_in, think_in], out, api_name="generate_stream"
     )
+    once_btn.click(generate, [sys_in, usr_in, mt_in, temp_in, think_in], out, api_name="generate")
 if __name__ == "__main__":
     demo.queue().launch()

web/codingModel.js CHANGED Viewed

@@ -36,7 +36,9 @@ export function setCodingModel(id) {
 }
 // Stream a coding-model completion. Same delta protocol as engineServer.stream.
-export async function streamCoding(system, user, { maxTokens = 512, temperature = 0.6, onToken, onStats, signal } = {}) {
   const st = statsTracker(onStats)
   let full = ''
   await streamSse('/text/generate/stream', {
@@ -45,6 +47,7 @@ export async function streamCoding(system, user, { maxTokens = 512, temperature
     user,
     max_tokens: maxTokens,
     temperature,
   }, {
     signal,
     onEvent(evt, parsed) {

 }
 // Stream a coding-model completion. Same delta protocol as engineServer.stream.
+// think=true asks reasoning models (Nemotron, BLS) to surface their <think>…</think> trace
+// instead of hiding it, so the caller can show it in a debug panel.
+export async function streamCoding(system, user, { maxTokens = 512, temperature = 0.6, think = false, onToken, onStats, signal } = {}) {
   const st = statsTracker(onStats)
   let full = ''
   await streamSse('/text/generate/stream', {
     user,
     max_tokens: maxTokens,
     temperature,
+    think,
   }, {
     signal,
     onEvent(evt, parsed) {

web/skillForgePanel.js CHANGED Viewed

@@ -5,6 +5,7 @@
 // just generates and shows the skill; wiring it into the battle engine comes later.
 import { streamCoding, currentCodingModel, onCodingModelChange } from '/web/codingModel.js'
 import { listPersonas, getPersona, onRosterChange } from '/web/personaStore.js'
 function el(tag, props = {}, kids = []) {
   const n = document.createElement(tag)
@@ -46,17 +47,33 @@ export function mountSkillForgePanel(host) {
   const empty = el('div', { class: 'persona-roster-empty' },
     'No heroes yet — recruit one in the Personas tab, then come back to forge its skills.')
   const controls = el('aside', { class: 'persona-controls skillforge' }, [
     el('div', { class: 'persona-sec' }, [el('div', { class: 'persona-sec-title' }, 'Skill Forge'), el('span')]),
     el('label', { class: 'persona-label' }, 'Hero'), sel,
     empty,
     el('label', { class: 'persona-label' }, 'Skill request'), req,
     el('div', { class: 'persona-prompt-actions' }, [btn]),
     status,
     el('label', { class: 'persona-label' }, 'Forged skill'), out,
   ])
   host.append(controls)
   function refreshHeroes() {
     const people = listPersonas()
     const prev = sel.value
@@ -80,15 +97,25 @@ export function mountSkillForgePanel(host) {
     const ask = req.value.trim()
     if (!ask) { status.textContent = 'Describe the skill you want.'; return }
     running = true; status.dataset.busy = '1'; btn.disabled = true
-    out.textContent = ''
     status.textContent = `Forging with ${currentCodingModel().label}…`
     const user = `${personaBlock(p)}\n\nSkill to create: ${ask}`
     try {
       const { stats } = await streamCoding(SYSTEM, user, {
         maxTokens: 512,
         temperature: 0.6,
-        onToken: (t) => { out.textContent += t },
       })
       const tps = stats && stats.tokPerSec ? ` · ${stats.tokPerSec} tok/s` : ''
       status.textContent = `Done${tps}.`
     } catch (e) {

 // just generates and shows the skill; wiring it into the battle engine comes later.
 import { streamCoding, currentCodingModel, onCodingModelChange } from '/web/codingModel.js'
 import { listPersonas, getPersona, onRosterChange } from '/web/personaStore.js'
+import { stripThink, stripThinkFinal } from '/web/personaPrompts.js'
 function el(tag, props = {}, kids = []) {
   const n = document.createElement(tag)
   const empty = el('div', { class: 'persona-roster-empty' },
     'No heroes yet — recruit one in the Personas tab, then come back to forge its skills.')
+  // "Show thinking" reveals the reasoning models' <think> trace (Nemotron, BLS) in the debug
+  // panel below; off by default so forging stays fast/clean. (Mellum2 has no reasoning.)
+  const thinkChk = el('input', { type: 'checkbox', class: 'skillforge-think' })
+  const thinkLabel = el('label', { class: 'persona-label skillforge-think-label' },
+    [thinkChk, ' show model thinking'])
+  const dbgEl = el('pre', { class: 'persona-think' })
+  const copyBtn = el('button', { class: 'persona-copy', type: 'button',
+    onclick: () => navigator.clipboard?.writeText(dbgEl.textContent || '') }, 'copy')
+  const dbgWrap = el('details', { class: 'persona-think-wrap' },
+    [el('summary', {}, 'model thinking / raw'), copyBtn, dbgEl])
+  dbgWrap.style.display = thinkChk.checked ? '' : 'none'
   const controls = el('aside', { class: 'persona-controls skillforge' }, [
     el('div', { class: 'persona-sec' }, [el('div', { class: 'persona-sec-title' }, 'Skill Forge'), el('span')]),
     el('label', { class: 'persona-label' }, 'Hero'), sel,
     empty,
     el('label', { class: 'persona-label' }, 'Skill request'), req,
+    thinkLabel,
     el('div', { class: 'persona-prompt-actions' }, [btn]),
     status,
     el('label', { class: 'persona-label' }, 'Forged skill'), out,
+    dbgWrap,
   ])
   host.append(controls)
+  thinkChk.addEventListener('change', () => { dbgWrap.style.display = thinkChk.checked ? '' : 'none' })
   function refreshHeroes() {
     const people = listPersonas()
     const prev = sel.value
     const ask = req.value.trim()
     if (!ask) { status.textContent = 'Describe the skill you want.'; return }
     running = true; status.dataset.busy = '1'; btn.disabled = true
+    out.textContent = ''; dbgEl.textContent = ''
     status.textContent = `Forging with ${currentCodingModel().label}…`
     const user = `${personaBlock(p)}\n\nSkill to create: ${ask}`
+    const showThink = thinkChk.checked
+    let raw = ''
     try {
       const { stats } = await streamCoding(SYSTEM, user, {
         maxTokens: 512,
         temperature: 0.6,
+        think: showThink,
+        // Reasoning streams inside <think>…</think>; show the raw trace in the debug panel and
+        // the stripped answer in the output (same split the persona panel uses).
+        onToken: (t) => {
+          raw += t
+          out.textContent = stripThink(raw)
+          if (showThink) { dbgEl.textContent = raw; dbgWrap.open = true; dbgEl.scrollTop = dbgEl.scrollHeight }
+        },
       })
+      out.textContent = stripThinkFinal(raw)
       const tps = stats && stats.tokPerSec ? ` · ${stats.tokPerSec} tok/s` : ''
       status.textContent = `Done${tps}.`
     } catch (e) {