polats Claude Opus 4.8 (1M context) committed on
Commit
6e155d8
·
1 Parent(s): 37982be

Skill Forge: optional "show thinking" for coding models

Browse files

Add a "show model thinking" toggle (off by default) that reveals the reasoning
models' trace in a collapsible debug panel, mirroring the persona debug div and
reusing stripThink/stripThinkFinal.

A `think` flag flows from the checkbox -> streamCoding -> /text/generate/stream
-> the model. Reasoning comes back inline as <think>…</think>; the clean answer
shows in the output, the raw trace in the panel.

- Nemotron (_nim_text_stream): when think=true, drop reasoning_budget=0 and
surface reasoning_content wrapped as <think>…</think>.
- BLS sidecar: new 5th `think` arg streams the reasoning wrapped in <think>
instead of discarding it; threaded through _space_text_stream's *extra
(defaults keep existing 4-arg callers on the clean path).
- Mellum2 has no reasoning, so the flag is a no-op there.

Verified end-to-end through /text/generate/stream for both models (think on/off)
plus the backward-compat 4-arg sidecar call.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

app.py CHANGED
@@ -478,7 +478,7 @@ def _tiny_aya_generate(system, user, max_tokens, temperature):
478
  return str(result or "")
479
 
480
 
481
- def _space_text_generate(space, system, user, max_tokens, temperature):
482
  from gradio_client import Client
483
  client = Client(space, token=HF_TOKEN or None)
484
  result = client.predict(
@@ -486,12 +486,13 @@ def _space_text_generate(space, system, user, max_tokens, temperature):
486
  user or "",
487
  int(max_tokens or 400),
488
  float(temperature if temperature is not None else 0.8),
 
489
  api_name="/generate",
490
  )
491
  return str(result or "")
492
 
493
 
494
- def _space_text_stream(space, system, user, max_tokens, temperature):
495
  from gradio_client import Client
496
  client = Client(space, token=HF_TOKEN or None)
497
  try:
@@ -500,6 +501,7 @@ def _space_text_stream(space, system, user, max_tokens, temperature):
500
  user or "",
501
  int(max_tokens or 400),
502
  float(temperature if temperature is not None else 0.8),
 
503
  api_name="/generate_stream",
504
  )
505
  prev = ""
@@ -510,7 +512,7 @@ def _space_text_stream(space, system, user, max_tokens, temperature):
510
  yield text[len(prev):]
511
  prev = text
512
  except Exception:
513
- text = _space_text_generate(space, system, user, max_tokens, temperature)
514
  if text:
515
  yield text
516
 
@@ -527,27 +529,32 @@ def _mellum_stream(system, user, max_tokens, temperature):
527
  yield from _space_text_stream(MELLUM_SPACE, system, user, max_tokens, temperature)
528
 
529
 
530
- def _nim_text_stream(system, user, max_tokens, temperature, model=None):
531
  """Stream from NVIDIA NIM's OpenAI-compatible chat endpoint (hosted Nemotron). Same
532
- nvapi-… key as the portrait NIM. reasoning_budget=0 keeps the coding output clean
533
- (Nemotron defaults thinking ON, which would otherwise emit a <think> trace)."""
 
 
534
  model = model or _NIM_NEMOTRON_MODEL # defined later in the file; resolve at call time
535
  messages = []
536
  if system and system.strip():
537
  messages.append({"role": "system", "content": system.strip()})
538
  messages.append({"role": "user", "content": (user or "").strip()})
539
- body = _json.dumps({
540
  "model": model,
541
  "messages": messages,
542
  "max_tokens": int(max_tokens or 512),
543
  "temperature": float(temperature if temperature is not None else 0.6),
544
  "top_p": 0.95,
545
  "stream": True,
546
- "reasoning_budget": 0,
547
- }).encode()
 
 
548
  req = urllib.request.Request(_NIM_TEXT_URL, data=body, method="POST", headers={
549
  "Authorization": f"Bearer {NIM_KEY}", "Content-Type": "application/json", "Accept": "text/event-stream",
550
  })
 
551
  with urllib.request.urlopen(req, timeout=120) as resp:
552
  for raw in resp:
553
  line = raw.decode("utf-8").strip()
@@ -557,11 +564,23 @@ def _nim_text_stream(system, user, max_tokens, temperature, model=None):
557
  if data == "[DONE]":
558
  break
559
  try:
560
- delta = _json.loads(data)["choices"][0]["delta"].get("content")
561
  except Exception: # noqa: BLE001
562
  continue
563
- if delta:
564
- yield delta
 
 
 
 
 
 
 
 
 
 
 
 
565
 
566
 
567
  def _mellum_stream_with_fallback(system, user, max_tokens, temperature):
@@ -581,24 +600,25 @@ def _mellum_stream_with_fallback(system, user, max_tokens, temperature):
581
  yield from _nim_text_stream(system, user, max_tokens, temperature)
582
 
583
 
584
- def _bls_code_stream(system, user, max_tokens, temperature):
585
- yield from _space_text_stream(BLS_CODE_SPACE, system, user, max_tokens, temperature)
 
586
 
587
 
588
- def _bls_code_stream_with_fallback(system, user, max_tokens, temperature):
589
  """BLS Mini-Code ZeroGPU sidecar, falling back to Nemotron (NVIDIA NIM) if the sidecar is
590
  unavailable BEFORE any token streams (same constraint as Mellum2: can't switch mid-stream)."""
591
  emitted = False
592
  try:
593
  if not BLS_CODE_SPACE:
594
  raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
595
- for chunk in _bls_code_stream(system, user, max_tokens, temperature):
596
  emitted = True
597
  yield chunk
598
  except Exception: # noqa: BLE001
599
  if emitted or not NIM_KEY:
600
  raise
601
- yield from _nim_text_stream(system, user, max_tokens, temperature)
602
 
603
 
604
  @fastapi_app.post("/voxcpm-tts")
@@ -887,6 +907,8 @@ async def text_generate_stream(request: Request):
887
  user = body.get("user") or ""
888
  max_tokens = int(body.get("max_tokens") or body.get("maxTokens") or 400)
889
  temperature = float(body.get("temperature") if body.get("temperature") is not None else 0.8)
 
 
890
  stop = threading.Event()
891
 
892
  async def gen():
@@ -923,14 +945,14 @@ async def text_generate_stream(request: Request):
923
  # BLS Mini-Code sidecar, with Nemotron NIM as fallback if it's unavailable.
924
  if not BLS_CODE_SPACE and not NIM_KEY:
925
  raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
926
- for chunk in _bls_code_stream_with_fallback(system, user, max_tokens, temperature):
927
  if stop.is_set():
928
  break
929
  loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
930
  elif model == "nemotron-3-nano-30b-nim":
931
  if not NIM_KEY:
932
  raise llm.LlmUnavailable("NVIDIA_NIM_API_KEY not set")
933
- for chunk in _nim_text_stream(system, user, max_tokens, temperature):
934
  if stop.is_set():
935
  break
936
  loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
 
478
  return str(result or "")
479
 
480
 
481
+ def _space_text_generate(space, system, user, max_tokens, temperature, *extra):
482
  from gradio_client import Client
483
  client = Client(space, token=HF_TOKEN or None)
484
  result = client.predict(
 
486
  user or "",
487
  int(max_tokens or 400),
488
  float(temperature if temperature is not None else 0.8),
489
+ *extra, # optional trailing inputs (e.g. BLS sidecar's `think` flag)
490
  api_name="/generate",
491
  )
492
  return str(result or "")
493
 
494
 
495
+ def _space_text_stream(space, system, user, max_tokens, temperature, *extra):
496
  from gradio_client import Client
497
  client = Client(space, token=HF_TOKEN or None)
498
  try:
 
501
  user or "",
502
  int(max_tokens or 400),
503
  float(temperature if temperature is not None else 0.8),
504
+ *extra, # optional trailing inputs (e.g. BLS sidecar's `think` flag)
505
  api_name="/generate_stream",
506
  )
507
  prev = ""
 
512
  yield text[len(prev):]
513
  prev = text
514
  except Exception:
515
+ text = _space_text_generate(space, system, user, max_tokens, temperature, *extra)
516
  if text:
517
  yield text
518
 
 
529
  yield from _space_text_stream(MELLUM_SPACE, system, user, max_tokens, temperature)
530
 
531
 
532
+ def _nim_text_stream(system, user, max_tokens, temperature, model=None, think=False):
533
  """Stream from NVIDIA NIM's OpenAI-compatible chat endpoint (hosted Nemotron). Same
534
+ nvapi-… key as the portrait NIM. think=False sets reasoning_budget=0 to keep the coding
535
+ output clean (Nemotron defaults thinking ON); think=True lets it reason and surfaces the
536
+ reasoning_content wrapped in <think>…</think> ahead of the answer, so the caller can show
537
+ it in a debug panel (same convention as the persona models)."""
538
  model = model or _NIM_NEMOTRON_MODEL # defined later in the file; resolve at call time
539
  messages = []
540
  if system and system.strip():
541
  messages.append({"role": "system", "content": system.strip()})
542
  messages.append({"role": "user", "content": (user or "").strip()})
543
+ payload = {
544
  "model": model,
545
  "messages": messages,
546
  "max_tokens": int(max_tokens or 512),
547
  "temperature": float(temperature if temperature is not None else 0.6),
548
  "top_p": 0.95,
549
  "stream": True,
550
+ }
551
+ if not think:
552
+ payload["reasoning_budget"] = 0 # omit entirely to let Nemotron reason
553
+ body = _json.dumps(payload).encode()
554
  req = urllib.request.Request(_NIM_TEXT_URL, data=body, method="POST", headers={
555
  "Authorization": f"Bearer {NIM_KEY}", "Content-Type": "application/json", "Accept": "text/event-stream",
556
  })
557
+ think_open = False
558
  with urllib.request.urlopen(req, timeout=120) as resp:
559
  for raw in resp:
560
  line = raw.decode("utf-8").strip()
 
564
  if data == "[DONE]":
565
  break
566
  try:
567
+ delta = _json.loads(data)["choices"][0]["delta"]
568
  except Exception: # noqa: BLE001
569
  continue
570
+ reasoning = delta.get("reasoning_content") if think else None
571
+ content = delta.get("content")
572
+ if reasoning:
573
+ if not think_open:
574
+ yield "<think>"
575
+ think_open = True
576
+ yield reasoning
577
+ if content:
578
+ if think_open:
579
+ yield "</think>\n"
580
+ think_open = False
581
+ yield content
582
+ if think_open:
583
+ yield "</think>\n"
584
 
585
 
586
  def _mellum_stream_with_fallback(system, user, max_tokens, temperature):
 
600
  yield from _nim_text_stream(system, user, max_tokens, temperature)
601
 
602
 
603
def _bls_code_stream(system, user, max_tokens, temperature, think=False):
    """Stream text from the BLS code Space, forwarding ``think`` as a trailing input.

    The flag is coerced to a real ``bool`` and handed to ``_space_text_stream`` as an
    extra positional argument after ``temperature`` (the sidecar's optional 5th input).
    """
    show_reasoning = bool(think)
    upstream = _space_text_stream(
        BLS_CODE_SPACE, system, user, max_tokens, temperature, show_reasoning
    )
    yield from upstream
606
 
607
 
608
def _bls_code_stream_with_fallback(system, user, max_tokens, temperature, think=False):
    """Stream from the BLS sidecar, switching to ``_nim_text_stream`` on early failure.

    The fallback is only taken when the sidecar fails BEFORE any chunk has been yielded
    and ``NIM_KEY`` is set; once output has started (or no key is configured) the original
    exception is re-raised, because a stream cannot change source midway.
    ``think`` is forwarded to whichever backend ends up serving the request.
    """
    streamed_any = False
    try:
        if not BLS_CODE_SPACE:
            raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
        for piece in _bls_code_stream(system, user, max_tokens, temperature, think):
            streamed_any = True
            yield piece
    except Exception:  # noqa: BLE001
        # Falling back is only safe if nothing reached the caller yet and NIM is usable.
        can_fall_back = bool(NIM_KEY) and not streamed_any
        if not can_fall_back:
            raise
        yield from _nim_text_stream(system, user, max_tokens, temperature, think=think)
622
 
623
 
624
  @fastapi_app.post("/voxcpm-tts")
 
907
  user = body.get("user") or ""
908
  max_tokens = int(body.get("max_tokens") or body.get("maxTokens") or 400)
909
  temperature = float(body.get("temperature") if body.get("temperature") is not None else 0.8)
910
+ # When set, reasoning models (Nemotron, BLS) surface their <think> trace instead of hiding it.
911
+ think = bool(body.get("think"))
912
  stop = threading.Event()
913
 
914
  async def gen():
 
945
  # BLS Mini-Code sidecar, with Nemotron NIM as fallback if it's unavailable.
946
  if not BLS_CODE_SPACE and not NIM_KEY:
947
  raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
948
+ for chunk in _bls_code_stream_with_fallback(system, user, max_tokens, temperature, think):
949
  if stop.is_set():
950
  break
951
  loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
952
  elif model == "nemotron-3-nano-30b-nim":
953
  if not NIM_KEY:
954
  raise llm.LlmUnavailable("NVIDIA_NIM_API_KEY not set")
955
+ for chunk in _nim_text_stream(system, user, max_tokens, temperature, think=think):
956
  if stop.is_set():
957
  break
958
  loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
spaces/bls-code-zerogpu/app.py CHANGED
@@ -76,21 +76,42 @@ def _build_inputs(system, user):
76
  return {k: v.to(_model.device) for k, v in enc.items()}
77
 
78
 
79
- def _extract_response(raw):
80
- """Pull just the answer out of a (possibly partial) raw decode: content after
81
- <|START_RESPONSE|> (or after <|END_THINKING|> as a fallback), up to <|END_RESPONSE|>."""
82
- i = raw.find(START_RESP)
83
- if i != -1:
84
- body = raw[i + len(START_RESP):]
 
 
 
 
 
 
 
85
  else:
86
- j = raw.find(END_THINK)
87
- body = raw[j + len(END_THINK):] if j != -1 else ""
88
- k = body.find(END_RESP)
 
 
 
89
  if k != -1:
90
- body = body[:k]
91
- for mark in _STRIP:
92
- body = body.replace(mark, "")
93
- return body.strip()
 
 
 
 
 
 
 
 
 
 
 
94
 
95
 
96
  def _gen_kwargs(inputs, max_tokens, temperature):
@@ -108,9 +129,10 @@ def _gen_kwargs(inputs, max_tokens, temperature):
108
 
109
 
110
  @spaces.GPU(duration=GPU_DURATION)
111
- def generate_stream(system, user, max_tokens, temperature):
112
- """Stream CUMULATIVE response text (thinking suppressed). The main app diffs successive
113
- yields into deltas. On failure, yield the traceback so it isn't a silent empty stream."""
 
114
  try:
115
  inputs = _build_inputs(system, user)
116
  # skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
@@ -129,32 +151,31 @@ def generate_stream(system, user, max_tokens, temperature):
129
 
130
  thread = threading.Thread(target=_run)
131
  thread.start()
132
- acc, started = "", False
133
  for piece in streamer:
134
  acc += piece
135
- if not started:
136
- if START_RESP not in acc:
137
- continue # still in the thinking block — emit nothing yet
138
- started = True
139
- yield _extract_response(acc)
140
  thread.join()
141
  if err:
142
- yield (_extract_response(acc) + "\n[GENERATE ERROR]\n" + err["tb"])
143
- elif not started:
144
- # Model never opened a response block — fall back to whatever's after thinking.
145
- yield _extract_response(acc) or "[EMPTY OUTPUT — no response block produced]"
146
  except Exception: # noqa: BLE001
147
  import traceback
148
  yield "[SETUP ERROR]\n" + traceback.format_exc()
149
 
150
 
151
  @spaces.GPU(duration=GPU_DURATION)
152
- def generate(system, user, max_tokens, temperature):
153
  try:
154
  inputs = _build_inputs(system, user)
155
  out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
156
  raw = _tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
157
- return _extract_response(raw) or "[EMPTY OUTPUT]"
158
  except Exception: # noqa: BLE001
159
  import traceback
160
  return "[ERROR]\n" + traceback.format_exc()
@@ -167,14 +188,16 @@ with gr.Blocks(title="BLS Mini-Code 1.0 — Tiny Army sidecar") as demo:
167
  usr_in = gr.Textbox(label="user", lines=6)
168
  mt_in = gr.Slider(16, 2048, value=512, step=16, label="max_tokens")
169
  temp_in = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="temperature")
 
 
170
  out = gr.Textbox(label="output", lines=12)
171
  with gr.Row():
172
  stream_btn = gr.Button("Stream", variant="primary")
173
  once_btn = gr.Button("Generate")
174
  stream_btn.click(
175
- generate_stream, [sys_in, usr_in, mt_in, temp_in], out, api_name="generate_stream"
176
  )
177
- once_btn.click(generate, [sys_in, usr_in, mt_in, temp_in], out, api_name="generate")
178
 
179
  if __name__ == "__main__":
180
  demo.queue().launch()
 
76
  return {k: v.to(_model.device) for k, v in enc.items()}
77
 
78
 
79
def _clean(s):
    """Return ``s`` with every marker string listed in ``_STRIP`` removed."""
    cleaned = s
    for marker in _STRIP:
        cleaned = cleaned.replace(marker, "")
    return cleaned
83
+
84
+
85
def _split(raw):
    """Break a (possibly partial) raw decode into ``(thinking, response, response_started)``.

    The text is cut at the first ``START_RESP`` marker, or, when that is absent, at the
    first ``END_THINK`` marker. Everything before the cut is reasoning; everything after
    it, up to ``END_RESP`` if present, is the answer. Both parts are passed through
    ``_clean`` and stripped. With neither marker present the whole input counts as
    reasoning and ``response_started`` is ``False``.
    """
    marker = START_RESP
    cut = raw.find(marker)
    if cut == -1:
        # No response opener yet: the end-of-thinking marker is the next best boundary.
        marker = END_THINK
        cut = raw.find(marker)
    if cut == -1:
        return _clean(raw).strip(), "", False

    reasoning = raw[:cut]
    answer = raw[cut + len(marker):]
    stop = answer.find(END_RESP)
    if stop != -1:
        answer = answer[:stop]
    return _clean(reasoning).strip(), _clean(answer).strip(), True
102
+
103
+
104
def _render(raw, think):
    """Build the cumulative output string for a raw decode.

    With ``think`` falsy only the answer part from ``_split`` is returned. With ``think``
    truthy the reasoning is emitted after an opening ``<think>`` tag; once the response
    has started, the closing ``</think>`` tag and the answer are appended. The wrapped
    form is stripped of surrounding whitespace before being returned.
    """
    reasoning, answer, opened = _split(raw)
    if think:
        rendered = "<think>\n" + reasoning
        if opened:
            rendered += "\n</think>\n" + answer
        return rendered.strip()
    return answer
115
 
116
 
117
  def _gen_kwargs(inputs, max_tokens, temperature):
 
129
 
130
 
131
  @spaces.GPU(duration=GPU_DURATION)
132
+ def generate_stream(system, user, max_tokens, temperature, think=False):
133
+ """Stream CUMULATIVE output. think=False suppresses reasoning (clean code only); think=True
134
+ streams the reasoning live wrapped in <think>…</think>. The main app diffs successive yields
135
+ into deltas. On failure, yield the traceback so it isn't a silent empty stream."""
136
  try:
137
  inputs = _build_inputs(system, user)
138
  # skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
 
151
 
152
  thread = threading.Thread(target=_run)
153
  thread.start()
154
+ acc, emitted = "", False
155
  for piece in streamer:
156
  acc += piece
157
+ # When hiding thinking, emit nothing until the response block opens.
158
+ if not think and not _split(acc)[2]:
159
+ continue
160
+ emitted = True
161
+ yield _render(acc, think)
162
  thread.join()
163
  if err:
164
+ yield (_render(acc, think) + "\n[GENERATE ERROR]\n" + err["tb"])
165
+ elif not emitted:
166
+ yield _render(acc, think) or "[EMPTY OUTPUTno response block produced]"
 
167
  except Exception: # noqa: BLE001
168
  import traceback
169
  yield "[SETUP ERROR]\n" + traceback.format_exc()
170
 
171
 
172
@spaces.GPU(duration=GPU_DURATION)
def generate(system, user, max_tokens, temperature, think=False):
    """Run one non-streaming generation and return the rendered text.

    Only the tokens produced after the prompt are decoded (special tokens are kept so
    ``_render`` can locate the markers). An empty render becomes ``"[EMPTY OUTPUT]"``;
    any exception is returned as a string starting with ``"[ERROR]"`` followed by the
    traceback, rather than being raised.
    """
    try:
        encoded = _build_inputs(system, user)
        sequences = _model.generate(**_gen_kwargs(encoded, max_tokens, temperature))
        # Slice off the prompt so only newly generated tokens are decoded.
        new_tokens = sequences[0][encoded["input_ids"].shape[-1]:]
        decoded = _tok.decode(new_tokens, skip_special_tokens=False)
        rendered = _render(decoded, think)
        return rendered if rendered else "[EMPTY OUTPUT]"
    except Exception:  # noqa: BLE001
        import traceback
        return "[ERROR]\n" + traceback.format_exc()
 
188
  usr_in = gr.Textbox(label="user", lines=6)
189
  mt_in = gr.Slider(16, 2048, value=512, step=16, label="max_tokens")
190
  temp_in = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="temperature")
191
+ # 5th input — defaults False so existing 4-arg API callers keep getting clean code.
192
+ think_in = gr.Checkbox(value=False, label="show thinking (wrap reasoning in <think>…</think>)")
193
  out = gr.Textbox(label="output", lines=12)
194
  with gr.Row():
195
  stream_btn = gr.Button("Stream", variant="primary")
196
  once_btn = gr.Button("Generate")
197
  stream_btn.click(
198
+ generate_stream, [sys_in, usr_in, mt_in, temp_in, think_in], out, api_name="generate_stream"
199
  )
200
+ once_btn.click(generate, [sys_in, usr_in, mt_in, temp_in, think_in], out, api_name="generate")
201
 
202
  if __name__ == "__main__":
203
  demo.queue().launch()
web/codingModel.js CHANGED
@@ -36,7 +36,9 @@ export function setCodingModel(id) {
36
  }
37
 
38
  // Stream a coding-model completion. Same delta protocol as engineServer.stream.
39
- export async function streamCoding(system, user, { maxTokens = 512, temperature = 0.6, onToken, onStats, signal } = {}) {
 
 
40
  const st = statsTracker(onStats)
41
  let full = ''
42
  await streamSse('/text/generate/stream', {
@@ -45,6 +47,7 @@ export async function streamCoding(system, user, { maxTokens = 512, temperature
45
  user,
46
  max_tokens: maxTokens,
47
  temperature,
 
48
  }, {
49
  signal,
50
  onEvent(evt, parsed) {
 
36
  }
37
 
38
  // Stream a coding-model completion. Same delta protocol as engineServer.stream.
39
+ // think=true asks reasoning models (Nemotron, BLS) to surface their <think>…</think> trace
40
+ // instead of hiding it, so the caller can show it in a debug panel.
41
+ export async function streamCoding(system, user, { maxTokens = 512, temperature = 0.6, think = false, onToken, onStats, signal } = {}) {
42
  const st = statsTracker(onStats)
43
  let full = ''
44
  await streamSse('/text/generate/stream', {
 
47
  user,
48
  max_tokens: maxTokens,
49
  temperature,
50
+ think,
51
  }, {
52
  signal,
53
  onEvent(evt, parsed) {
web/skillForgePanel.js CHANGED
@@ -5,6 +5,7 @@
5
  // just generates and shows the skill; wiring it into the battle engine comes later.
6
  import { streamCoding, currentCodingModel, onCodingModelChange } from '/web/codingModel.js'
7
  import { listPersonas, getPersona, onRosterChange } from '/web/personaStore.js'
 
8
 
9
  function el(tag, props = {}, kids = []) {
10
  const n = document.createElement(tag)
@@ -46,17 +47,33 @@ export function mountSkillForgePanel(host) {
46
  const empty = el('div', { class: 'persona-roster-empty' },
47
  'No heroes yet — recruit one in the Personas tab, then come back to forge its skills.')
48
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  const controls = el('aside', { class: 'persona-controls skillforge' }, [
50
  el('div', { class: 'persona-sec' }, [el('div', { class: 'persona-sec-title' }, 'Skill Forge'), el('span')]),
51
  el('label', { class: 'persona-label' }, 'Hero'), sel,
52
  empty,
53
  el('label', { class: 'persona-label' }, 'Skill request'), req,
 
54
  el('div', { class: 'persona-prompt-actions' }, [btn]),
55
  status,
56
  el('label', { class: 'persona-label' }, 'Forged skill'), out,
 
57
  ])
58
  host.append(controls)
59
 
 
 
60
  function refreshHeroes() {
61
  const people = listPersonas()
62
  const prev = sel.value
@@ -80,15 +97,25 @@ export function mountSkillForgePanel(host) {
80
  const ask = req.value.trim()
81
  if (!ask) { status.textContent = 'Describe the skill you want.'; return }
82
  running = true; status.dataset.busy = '1'; btn.disabled = true
83
- out.textContent = ''
84
  status.textContent = `Forging with ${currentCodingModel().label}…`
85
  const user = `${personaBlock(p)}\n\nSkill to create: ${ask}`
 
 
86
  try {
87
  const { stats } = await streamCoding(SYSTEM, user, {
88
  maxTokens: 512,
89
  temperature: 0.6,
90
- onToken: (t) => { out.textContent += t },
 
 
 
 
 
 
 
91
  })
 
92
  const tps = stats && stats.tokPerSec ? ` · ${stats.tokPerSec} tok/s` : ''
93
  status.textContent = `Done${tps}.`
94
  } catch (e) {
 
5
  // just generates and shows the skill; wiring it into the battle engine comes later.
6
  import { streamCoding, currentCodingModel, onCodingModelChange } from '/web/codingModel.js'
7
  import { listPersonas, getPersona, onRosterChange } from '/web/personaStore.js'
8
+ import { stripThink, stripThinkFinal } from '/web/personaPrompts.js'
9
 
10
  function el(tag, props = {}, kids = []) {
11
  const n = document.createElement(tag)
 
47
  const empty = el('div', { class: 'persona-roster-empty' },
48
  'No heroes yet — recruit one in the Personas tab, then come back to forge its skills.')
49
 
50
+ // "Show thinking" reveals the reasoning models' <think> trace (Nemotron, BLS) in the debug
51
+ // panel below; off by default so forging stays fast/clean. (Mellum2 has no reasoning.)
52
+ const thinkChk = el('input', { type: 'checkbox', class: 'skillforge-think' })
53
+ const thinkLabel = el('label', { class: 'persona-label skillforge-think-label' },
54
+ [thinkChk, ' show model thinking'])
55
+ const dbgEl = el('pre', { class: 'persona-think' })
56
+ const copyBtn = el('button', { class: 'persona-copy', type: 'button',
57
+ onclick: () => navigator.clipboard?.writeText(dbgEl.textContent || '') }, 'copy')
58
+ const dbgWrap = el('details', { class: 'persona-think-wrap' },
59
+ [el('summary', {}, 'model thinking / raw'), copyBtn, dbgEl])
60
+ dbgWrap.style.display = thinkChk.checked ? '' : 'none'
61
+
62
  const controls = el('aside', { class: 'persona-controls skillforge' }, [
63
  el('div', { class: 'persona-sec' }, [el('div', { class: 'persona-sec-title' }, 'Skill Forge'), el('span')]),
64
  el('label', { class: 'persona-label' }, 'Hero'), sel,
65
  empty,
66
  el('label', { class: 'persona-label' }, 'Skill request'), req,
67
+ thinkLabel,
68
  el('div', { class: 'persona-prompt-actions' }, [btn]),
69
  status,
70
  el('label', { class: 'persona-label' }, 'Forged skill'), out,
71
+ dbgWrap,
72
  ])
73
  host.append(controls)
74
 
75
+ thinkChk.addEventListener('change', () => { dbgWrap.style.display = thinkChk.checked ? '' : 'none' })
76
+
77
  function refreshHeroes() {
78
  const people = listPersonas()
79
  const prev = sel.value
 
97
  const ask = req.value.trim()
98
  if (!ask) { status.textContent = 'Describe the skill you want.'; return }
99
  running = true; status.dataset.busy = '1'; btn.disabled = true
100
+ out.textContent = ''; dbgEl.textContent = ''
101
  status.textContent = `Forging with ${currentCodingModel().label}…`
102
  const user = `${personaBlock(p)}\n\nSkill to create: ${ask}`
103
+ const showThink = thinkChk.checked
104
+ let raw = ''
105
  try {
106
  const { stats } = await streamCoding(SYSTEM, user, {
107
  maxTokens: 512,
108
  temperature: 0.6,
109
+ think: showThink,
110
+ // Reasoning streams inside <think>…</think>; show the raw trace in the debug panel and
111
+ // the stripped answer in the output (same split the persona panel uses).
112
+ onToken: (t) => {
113
+ raw += t
114
+ out.textContent = stripThink(raw)
115
+ if (showThink) { dbgEl.textContent = raw; dbgWrap.open = true; dbgEl.scrollTop = dbgEl.scrollHeight }
116
+ },
117
  })
118
+ out.textContent = stripThinkFinal(raw)
119
  const tps = stats && stats.tokPerSec ? ` · ${stats.tokPerSec} tok/s` : ''
120
  status.textContent = `Done${tps}.`
121
  } catch (e) {