Spaces:
Running
Skill Forge: optional "show thinking" for coding models
Browse files

Add a "show model thinking" toggle (off by default) that reveals the reasoning
models' trace in a collapsible debug panel, mirroring the persona debug div and
reusing stripThink/stripThinkFinal.
A `think` flag flows from the checkbox -> streamCoding -> /text/generate/stream
-> the model. Reasoning comes back inline as <think>…</think>; the clean answer
shows in the output, the raw trace in the panel.
- Nemotron (_nim_text_stream): when think=true, drop reasoning_budget=0 and
surface reasoning_content wrapped as <think>…</think>.
- BLS sidecar: new 5th `think` arg streams the reasoning wrapped in <think>
instead of discarding it; threaded through _space_text_stream's *extra
(defaults keep existing 4-arg callers on the clean path).
- Mellum2 has no reasoning, so the flag is a no-op there.
Verified end-to-end through /text/generate/stream for both models (think on/off)
plus the backward-compat 4-arg sidecar call.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
- app.py +41 -19
- spaces/bls-code-zerogpu/app.py +53 -30
- web/codingModel.js +4 -1
- web/skillForgePanel.js +29 -2
|
@@ -478,7 +478,7 @@ def _tiny_aya_generate(system, user, max_tokens, temperature):
|
|
| 478 |
return str(result or "")
|
| 479 |
|
| 480 |
|
| 481 |
-
def _space_text_generate(space, system, user, max_tokens, temperature):
|
| 482 |
from gradio_client import Client
|
| 483 |
client = Client(space, token=HF_TOKEN or None)
|
| 484 |
result = client.predict(
|
|
@@ -486,12 +486,13 @@ def _space_text_generate(space, system, user, max_tokens, temperature):
|
|
| 486 |
user or "",
|
| 487 |
int(max_tokens or 400),
|
| 488 |
float(temperature if temperature is not None else 0.8),
|
|
|
|
| 489 |
api_name="/generate",
|
| 490 |
)
|
| 491 |
return str(result or "")
|
| 492 |
|
| 493 |
|
| 494 |
-
def _space_text_stream(space, system, user, max_tokens, temperature):
|
| 495 |
from gradio_client import Client
|
| 496 |
client = Client(space, token=HF_TOKEN or None)
|
| 497 |
try:
|
|
@@ -500,6 +501,7 @@ def _space_text_stream(space, system, user, max_tokens, temperature):
|
|
| 500 |
user or "",
|
| 501 |
int(max_tokens or 400),
|
| 502 |
float(temperature if temperature is not None else 0.8),
|
|
|
|
| 503 |
api_name="/generate_stream",
|
| 504 |
)
|
| 505 |
prev = ""
|
|
@@ -510,7 +512,7 @@ def _space_text_stream(space, system, user, max_tokens, temperature):
|
|
| 510 |
yield text[len(prev):]
|
| 511 |
prev = text
|
| 512 |
except Exception:
|
| 513 |
-
text = _space_text_generate(space, system, user, max_tokens, temperature)
|
| 514 |
if text:
|
| 515 |
yield text
|
| 516 |
|
|
@@ -527,27 +529,32 @@ def _mellum_stream(system, user, max_tokens, temperature):
|
|
| 527 |
yield from _space_text_stream(MELLUM_SPACE, system, user, max_tokens, temperature)
|
| 528 |
|
| 529 |
|
| 530 |
-
def _nim_text_stream(system, user, max_tokens, temperature, model=None):
|
| 531 |
"""Stream from NVIDIA NIM's OpenAI-compatible chat endpoint (hosted Nemotron). Same
|
| 532 |
-
nvapi-… key as the portrait NIM. reasoning_budget=0
|
| 533 |
-
(Nemotron defaults thinking ON
|
|
|
|
|
|
|
| 534 |
model = model or _NIM_NEMOTRON_MODEL # defined later in the file; resolve at call time
|
| 535 |
messages = []
|
| 536 |
if system and system.strip():
|
| 537 |
messages.append({"role": "system", "content": system.strip()})
|
| 538 |
messages.append({"role": "user", "content": (user or "").strip()})
|
| 539 |
-
|
| 540 |
"model": model,
|
| 541 |
"messages": messages,
|
| 542 |
"max_tokens": int(max_tokens or 512),
|
| 543 |
"temperature": float(temperature if temperature is not None else 0.6),
|
| 544 |
"top_p": 0.95,
|
| 545 |
"stream": True,
|
| 546 |
-
|
| 547 |
-
|
|
|
|
|
|
|
| 548 |
req = urllib.request.Request(_NIM_TEXT_URL, data=body, method="POST", headers={
|
| 549 |
"Authorization": f"Bearer {NIM_KEY}", "Content-Type": "application/json", "Accept": "text/event-stream",
|
| 550 |
})
|
|
|
|
| 551 |
with urllib.request.urlopen(req, timeout=120) as resp:
|
| 552 |
for raw in resp:
|
| 553 |
line = raw.decode("utf-8").strip()
|
|
@@ -557,11 +564,23 @@ def _nim_text_stream(system, user, max_tokens, temperature, model=None):
|
|
| 557 |
if data == "[DONE]":
|
| 558 |
break
|
| 559 |
try:
|
| 560 |
-
delta = _json.loads(data)["choices"][0]["delta"]
|
| 561 |
except Exception: # noqa: BLE001
|
| 562 |
continue
|
| 563 |
-
|
| 564 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 565 |
|
| 566 |
|
| 567 |
def _mellum_stream_with_fallback(system, user, max_tokens, temperature):
|
|
@@ -581,24 +600,25 @@ def _mellum_stream_with_fallback(system, user, max_tokens, temperature):
|
|
| 581 |
yield from _nim_text_stream(system, user, max_tokens, temperature)
|
| 582 |
|
| 583 |
|
| 584 |
-
def _bls_code_stream(system, user, max_tokens, temperature):
|
| 585 |
-
|
|
|
|
| 586 |
|
| 587 |
|
| 588 |
-
def _bls_code_stream_with_fallback(system, user, max_tokens, temperature):
    """Stream from the BLS Mini-Code sidecar, switching to Nemotron (NVIDIA NIM) on early failure.

    The fallback is taken only when the sidecar fails before any chunk has been yielded and
    NIM_KEY is set. Once a chunk is out, the error is re-raised instead, because the stream
    cannot change models part-way through (same constraint as Mellum2).
    """
    streamed_any = False
    try:
        if not BLS_CODE_SPACE:
            raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
        sidecar = _bls_code_stream(system, user, max_tokens, temperature)
        for piece in sidecar:
            streamed_any = True
            yield piece
    except Exception:  # noqa: BLE001
        # Fall back only if nothing was streamed yet and a NIM key is available.
        fallback_allowed = bool(NIM_KEY) and not streamed_any
        if not fallback_allowed:
            raise
        yield from _nim_text_stream(system, user, max_tokens, temperature)
|
| 602 |
|
| 603 |
|
| 604 |
@fastapi_app.post("/voxcpm-tts")
|
|
@@ -887,6 +907,8 @@ async def text_generate_stream(request: Request):
|
|
| 887 |
user = body.get("user") or ""
|
| 888 |
max_tokens = int(body.get("max_tokens") or body.get("maxTokens") or 400)
|
| 889 |
temperature = float(body.get("temperature") if body.get("temperature") is not None else 0.8)
|
|
|
|
|
|
|
| 890 |
stop = threading.Event()
|
| 891 |
|
| 892 |
async def gen():
|
|
@@ -923,14 +945,14 @@ async def text_generate_stream(request: Request):
|
|
| 923 |
# BLS Mini-Code sidecar, with Nemotron NIM as fallback if it's unavailable.
|
| 924 |
if not BLS_CODE_SPACE and not NIM_KEY:
|
| 925 |
raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
|
| 926 |
-
for chunk in _bls_code_stream_with_fallback(system, user, max_tokens, temperature):
|
| 927 |
if stop.is_set():
|
| 928 |
break
|
| 929 |
loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
|
| 930 |
elif model == "nemotron-3-nano-30b-nim":
|
| 931 |
if not NIM_KEY:
|
| 932 |
raise llm.LlmUnavailable("NVIDIA_NIM_API_KEY not set")
|
| 933 |
-
for chunk in _nim_text_stream(system, user, max_tokens, temperature):
|
| 934 |
if stop.is_set():
|
| 935 |
break
|
| 936 |
loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
|
|
|
|
| 478 |
return str(result or "")
|
| 479 |
|
| 480 |
|
| 481 |
+
def _space_text_generate(space, system, user, max_tokens, temperature, *extra):
|
| 482 |
from gradio_client import Client
|
| 483 |
client = Client(space, token=HF_TOKEN or None)
|
| 484 |
result = client.predict(
|
|
|
|
| 486 |
user or "",
|
| 487 |
int(max_tokens or 400),
|
| 488 |
float(temperature if temperature is not None else 0.8),
|
| 489 |
+
*extra, # optional trailing inputs (e.g. BLS sidecar's `think` flag)
|
| 490 |
api_name="/generate",
|
| 491 |
)
|
| 492 |
return str(result or "")
|
| 493 |
|
| 494 |
|
| 495 |
+
def _space_text_stream(space, system, user, max_tokens, temperature, *extra):
|
| 496 |
from gradio_client import Client
|
| 497 |
client = Client(space, token=HF_TOKEN or None)
|
| 498 |
try:
|
|
|
|
| 501 |
user or "",
|
| 502 |
int(max_tokens or 400),
|
| 503 |
float(temperature if temperature is not None else 0.8),
|
| 504 |
+
*extra, # optional trailing inputs (e.g. BLS sidecar's `think` flag)
|
| 505 |
api_name="/generate_stream",
|
| 506 |
)
|
| 507 |
prev = ""
|
|
|
|
| 512 |
yield text[len(prev):]
|
| 513 |
prev = text
|
| 514 |
except Exception:
|
| 515 |
+
text = _space_text_generate(space, system, user, max_tokens, temperature, *extra)
|
| 516 |
if text:
|
| 517 |
yield text
|
| 518 |
|
|
|
|
| 529 |
yield from _space_text_stream(MELLUM_SPACE, system, user, max_tokens, temperature)
|
| 530 |
|
| 531 |
|
| 532 |
+
def _nim_text_stream(system, user, max_tokens, temperature, model=None, think=False):
|
| 533 |
"""Stream from NVIDIA NIM's OpenAI-compatible chat endpoint (hosted Nemotron). Same
|
| 534 |
+
nvapi-… key as the portrait NIM. think=False sets reasoning_budget=0 to keep the coding
|
| 535 |
+
output clean (Nemotron defaults thinking ON); think=True lets it reason and surfaces the
|
| 536 |
+
reasoning_content wrapped in <think>…</think> ahead of the answer, so the caller can show
|
| 537 |
+
it in a debug panel (same convention as the persona models)."""
|
| 538 |
model = model or _NIM_NEMOTRON_MODEL # defined later in the file; resolve at call time
|
| 539 |
messages = []
|
| 540 |
if system and system.strip():
|
| 541 |
messages.append({"role": "system", "content": system.strip()})
|
| 542 |
messages.append({"role": "user", "content": (user or "").strip()})
|
| 543 |
+
payload = {
|
| 544 |
"model": model,
|
| 545 |
"messages": messages,
|
| 546 |
"max_tokens": int(max_tokens or 512),
|
| 547 |
"temperature": float(temperature if temperature is not None else 0.6),
|
| 548 |
"top_p": 0.95,
|
| 549 |
"stream": True,
|
| 550 |
+
}
|
| 551 |
+
if not think:
|
| 552 |
+
payload["reasoning_budget"] = 0 # omit entirely to let Nemotron reason
|
| 553 |
+
body = _json.dumps(payload).encode()
|
| 554 |
req = urllib.request.Request(_NIM_TEXT_URL, data=body, method="POST", headers={
|
| 555 |
"Authorization": f"Bearer {NIM_KEY}", "Content-Type": "application/json", "Accept": "text/event-stream",
|
| 556 |
})
|
| 557 |
+
think_open = False
|
| 558 |
with urllib.request.urlopen(req, timeout=120) as resp:
|
| 559 |
for raw in resp:
|
| 560 |
line = raw.decode("utf-8").strip()
|
|
|
|
| 564 |
if data == "[DONE]":
|
| 565 |
break
|
| 566 |
try:
|
| 567 |
+
delta = _json.loads(data)["choices"][0]["delta"]
|
| 568 |
except Exception: # noqa: BLE001
|
| 569 |
continue
|
| 570 |
+
reasoning = delta.get("reasoning_content") if think else None
|
| 571 |
+
content = delta.get("content")
|
| 572 |
+
if reasoning:
|
| 573 |
+
if not think_open:
|
| 574 |
+
yield "<think>"
|
| 575 |
+
think_open = True
|
| 576 |
+
yield reasoning
|
| 577 |
+
if content:
|
| 578 |
+
if think_open:
|
| 579 |
+
yield "</think>\n"
|
| 580 |
+
think_open = False
|
| 581 |
+
yield content
|
| 582 |
+
if think_open:
|
| 583 |
+
yield "</think>\n"
|
| 584 |
|
| 585 |
|
| 586 |
def _mellum_stream_with_fallback(system, user, max_tokens, temperature):
|
|
|
|
| 600 |
yield from _nim_text_stream(system, user, max_tokens, temperature)
|
| 601 |
|
| 602 |
|
| 603 |
+
def _bls_code_stream(system, user, max_tokens, temperature, think=False):
    """Stream a completion from the BLS sidecar Space (BLS_CODE_SPACE).

    `think` is coerced to a bool and sent as the sidecar's optional 5th input; it reaches
    the Space through the trailing *extra arguments of _space_text_stream.
    """
    think_flag = bool(think)
    yield from _space_text_stream(
        BLS_CODE_SPACE, system, user, max_tokens, temperature, think_flag
    )
|
| 606 |
|
| 607 |
|
| 608 |
+
def _bls_code_stream_with_fallback(system, user, max_tokens, temperature, think=False):
    """Stream from the BLS Mini-Code sidecar, switching to Nemotron (NVIDIA NIM) on early failure.

    The fallback is taken only when the sidecar fails before any chunk has been yielded and
    NIM_KEY is set. Once a chunk is out, the error is re-raised instead, because the stream
    cannot change models part-way through (same constraint as Mellum2). `think` is forwarded
    unchanged to whichever backend ends up serving the request.
    """
    streamed_any = False
    try:
        if not BLS_CODE_SPACE:
            raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
        sidecar = _bls_code_stream(system, user, max_tokens, temperature, think)
        for piece in sidecar:
            streamed_any = True
            yield piece
    except Exception:  # noqa: BLE001
        # Fall back only if nothing was streamed yet and a NIM key is available.
        fallback_allowed = bool(NIM_KEY) and not streamed_any
        if not fallback_allowed:
            raise
        yield from _nim_text_stream(system, user, max_tokens, temperature, think=think)
|
| 622 |
|
| 623 |
|
| 624 |
@fastapi_app.post("/voxcpm-tts")
|
|
|
|
| 907 |
user = body.get("user") or ""
|
| 908 |
max_tokens = int(body.get("max_tokens") or body.get("maxTokens") or 400)
|
| 909 |
temperature = float(body.get("temperature") if body.get("temperature") is not None else 0.8)
|
| 910 |
+
# When set, reasoning models (Nemotron, BLS) surface their <think> trace instead of hiding it.
|
| 911 |
+
think = bool(body.get("think"))
|
| 912 |
stop = threading.Event()
|
| 913 |
|
| 914 |
async def gen():
|
|
|
|
| 945 |
# BLS Mini-Code sidecar, with Nemotron NIM as fallback if it's unavailable.
|
| 946 |
if not BLS_CODE_SPACE and not NIM_KEY:
|
| 947 |
raise llm.LlmUnavailable("TINY_BLS_CODE_SPACE not set")
|
| 948 |
+
for chunk in _bls_code_stream_with_fallback(system, user, max_tokens, temperature, think):
|
| 949 |
if stop.is_set():
|
| 950 |
break
|
| 951 |
loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
|
| 952 |
elif model == "nemotron-3-nano-30b-nim":
|
| 953 |
if not NIM_KEY:
|
| 954 |
raise llm.LlmUnavailable("NVIDIA_NIM_API_KEY not set")
|
| 955 |
+
for chunk in _nim_text_stream(system, user, max_tokens, temperature, think=think):
|
| 956 |
if stop.is_set():
|
| 957 |
break
|
| 958 |
loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
|
|
@@ -76,21 +76,42 @@ def _build_inputs(system, user):
|
|
| 76 |
return {k: v.to(_model.device) for k, v in enc.items()}
|
| 77 |
|
| 78 |
|
| 79 |
-
def
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
else:
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
| 89 |
if k != -1:
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
|
| 96 |
def _gen_kwargs(inputs, max_tokens, temperature):
|
|
@@ -108,9 +129,10 @@ def _gen_kwargs(inputs, max_tokens, temperature):
|
|
| 108 |
|
| 109 |
|
| 110 |
@spaces.GPU(duration=GPU_DURATION)
|
| 111 |
-
def generate_stream(system, user, max_tokens, temperature):
|
| 112 |
-
"""Stream CUMULATIVE
|
| 113 |
-
|
|
|
|
| 114 |
try:
|
| 115 |
inputs = _build_inputs(system, user)
|
| 116 |
# skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
|
|
@@ -129,32 +151,31 @@ def generate_stream(system, user, max_tokens, temperature):
|
|
| 129 |
|
| 130 |
thread = threading.Thread(target=_run)
|
| 131 |
thread.start()
|
| 132 |
-
acc,
|
| 133 |
for piece in streamer:
|
| 134 |
acc += piece
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
yield
|
| 140 |
thread.join()
|
| 141 |
if err:
|
| 142 |
-
yield (
|
| 143 |
-
elif not
|
| 144 |
-
|
| 145 |
-
yield _extract_response(acc) or "[EMPTY OUTPUT — no response block produced]"
|
| 146 |
except Exception: # noqa: BLE001
|
| 147 |
import traceback
|
| 148 |
yield "[SETUP ERROR]\n" + traceback.format_exc()
|
| 149 |
|
| 150 |
|
| 151 |
@spaces.GPU(duration=GPU_DURATION)
|
| 152 |
-
def generate(system, user, max_tokens, temperature):
|
| 153 |
try:
|
| 154 |
inputs = _build_inputs(system, user)
|
| 155 |
out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
|
| 156 |
raw = _tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
|
| 157 |
-
return
|
| 158 |
except Exception: # noqa: BLE001
|
| 159 |
import traceback
|
| 160 |
return "[ERROR]\n" + traceback.format_exc()
|
|
@@ -167,14 +188,16 @@ with gr.Blocks(title="BLS Mini-Code 1.0 — Tiny Army sidecar") as demo:
|
|
| 167 |
usr_in = gr.Textbox(label="user", lines=6)
|
| 168 |
mt_in = gr.Slider(16, 2048, value=512, step=16, label="max_tokens")
|
| 169 |
temp_in = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="temperature")
|
|
|
|
|
|
|
| 170 |
out = gr.Textbox(label="output", lines=12)
|
| 171 |
with gr.Row():
|
| 172 |
stream_btn = gr.Button("Stream", variant="primary")
|
| 173 |
once_btn = gr.Button("Generate")
|
| 174 |
stream_btn.click(
|
| 175 |
-
generate_stream, [sys_in, usr_in, mt_in, temp_in], out, api_name="generate_stream"
|
| 176 |
)
|
| 177 |
-
once_btn.click(generate, [sys_in, usr_in, mt_in, temp_in], out, api_name="generate")
|
| 178 |
|
| 179 |
if __name__ == "__main__":
|
| 180 |
demo.queue().launch()
|
|
|
|
| 76 |
return {k: v.to(_model.device) for k, v in enc.items()}
|
| 77 |
|
| 78 |
|
| 79 |
+
def _clean(s):
    """Return `s` with every marker string listed in _STRIP removed."""
    cleaned = s
    for marker in _STRIP:
        cleaned = cleaned.replace(marker, "")
    return cleaned
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _split(raw):
    """Break a (possibly partial) raw decode into (thinking, response, response_started).

    The text is cut at the first START_RESP marker, or failing that at the first END_THINK
    marker: what precedes the cut is reasoning, what follows is the answer. The answer is
    truncated at END_RESP when that marker is present. If neither cut marker is found, the
    whole text counts as reasoning and response_started is False. Both parts are passed
    through _clean and stripped of surrounding whitespace.
    """
    # START_RESP takes priority; END_THINK is only consulted when START_RESP is absent.
    for marker in (START_RESP, END_THINK):
        cut = raw.find(marker)
        if cut >= 0:
            reasoning = raw[:cut]
            answer = raw[cut + len(marker):]
            started = True
            break
    else:
        reasoning, answer, started = raw, "", False

    stop = answer.find(END_RESP)
    if stop >= 0:
        answer = answer[:stop]

    return _clean(reasoning).strip(), _clean(answer).strip(), started
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _render(raw, think):
    """Build the cumulative output string for a raw decode.

    With think falsy, only the answer part is returned and the reasoning is dropped.
    With think truthy, the reasoning is placed ahead of the answer inside <think>…</think>;
    while the response has not started yet, only the opening tag and the reasoning so far
    are returned. The think-mode result is stripped of surrounding whitespace.
    """
    reasoning, answer, answered = _split(raw)
    if think:
        if answered:
            text = f"<think>\n{reasoning}\n</think>\n{answer}"
        else:
            text = f"<think>\n{reasoning}"
        return text.strip()
    return answer
|
| 115 |
|
| 116 |
|
| 117 |
def _gen_kwargs(inputs, max_tokens, temperature):
|
|
|
|
| 129 |
|
| 130 |
|
| 131 |
@spaces.GPU(duration=GPU_DURATION)
|
| 132 |
+
def generate_stream(system, user, max_tokens, temperature, think=False):
|
| 133 |
+
"""Stream CUMULATIVE output. think=False suppresses reasoning (clean code only); think=True
|
| 134 |
+
streams the reasoning live wrapped in <think>…</think>. The main app diffs successive yields
|
| 135 |
+
into deltas. On failure, yield the traceback so it isn't a silent empty stream."""
|
| 136 |
try:
|
| 137 |
inputs = _build_inputs(system, user)
|
| 138 |
# skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
|
|
|
|
| 151 |
|
| 152 |
thread = threading.Thread(target=_run)
|
| 153 |
thread.start()
|
| 154 |
+
acc, emitted = "", False
|
| 155 |
for piece in streamer:
|
| 156 |
acc += piece
|
| 157 |
+
# When hiding thinking, emit nothing until the response block opens.
|
| 158 |
+
if not think and not _split(acc)[2]:
|
| 159 |
+
continue
|
| 160 |
+
emitted = True
|
| 161 |
+
yield _render(acc, think)
|
| 162 |
thread.join()
|
| 163 |
if err:
|
| 164 |
+
yield (_render(acc, think) + "\n[GENERATE ERROR]\n" + err["tb"])
|
| 165 |
+
elif not emitted:
|
| 166 |
+
yield _render(acc, think) or "[EMPTY OUTPUT — no response block produced]"
|
|
|
|
| 167 |
except Exception: # noqa: BLE001
|
| 168 |
import traceback
|
| 169 |
yield "[SETUP ERROR]\n" + traceback.format_exc()
|
| 170 |
|
| 171 |
|
| 172 |
@spaces.GPU(duration=GPU_DURATION)
|
| 173 |
+
def generate(system, user, max_tokens, temperature, think=False):
    """Run one non-streaming generation and return the rendered text.

    The prompt is built with _build_inputs, the model generates with the kwargs from
    _gen_kwargs, and only the tokens after the prompt are decoded. The result goes through
    _render with the `think` flag; an empty render becomes "[EMPTY OUTPUT]". Any exception
    is returned as "[ERROR]" followed by the traceback rather than raised.
    """
    try:
        inputs = _build_inputs(system, user)
        sequences = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
        prompt_len = inputs["input_ids"].shape[-1]
        completion = sequences[0][prompt_len:]
        # Special tokens are kept in the decode so _render can see the markers it splits on.
        raw = _tok.decode(completion, skip_special_tokens=False)
        rendered = _render(raw, think)
        return rendered if rendered else "[EMPTY OUTPUT]"
    except Exception:  # noqa: BLE001
        import traceback
        return "[ERROR]\n" + traceback.format_exc()
|
|
|
|
| 188 |
usr_in = gr.Textbox(label="user", lines=6)
|
| 189 |
mt_in = gr.Slider(16, 2048, value=512, step=16, label="max_tokens")
|
| 190 |
temp_in = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="temperature")
|
| 191 |
+
# 5th input — defaults False so existing 4-arg API callers keep getting clean code.
|
| 192 |
+
think_in = gr.Checkbox(value=False, label="show thinking (wrap reasoning in <think>…</think>)")
|
| 193 |
out = gr.Textbox(label="output", lines=12)
|
| 194 |
with gr.Row():
|
| 195 |
stream_btn = gr.Button("Stream", variant="primary")
|
| 196 |
once_btn = gr.Button("Generate")
|
| 197 |
stream_btn.click(
|
| 198 |
+
generate_stream, [sys_in, usr_in, mt_in, temp_in, think_in], out, api_name="generate_stream"
|
| 199 |
)
|
| 200 |
+
once_btn.click(generate, [sys_in, usr_in, mt_in, temp_in, think_in], out, api_name="generate")
|
| 201 |
|
| 202 |
if __name__ == "__main__":
|
| 203 |
demo.queue().launch()
|
|
@@ -36,7 +36,9 @@ export function setCodingModel(id) {
|
|
| 36 |
}
|
| 37 |
|
| 38 |
// Stream a coding-model completion. Same delta protocol as engineServer.stream.
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
const st = statsTracker(onStats)
|
| 41 |
let full = ''
|
| 42 |
await streamSse('/text/generate/stream', {
|
|
@@ -45,6 +47,7 @@ export async function streamCoding(system, user, { maxTokens = 512, temperature
|
|
| 45 |
user,
|
| 46 |
max_tokens: maxTokens,
|
| 47 |
temperature,
|
|
|
|
| 48 |
}, {
|
| 49 |
signal,
|
| 50 |
onEvent(evt, parsed) {
|
|
|
|
| 36 |
}
|
| 37 |
|
| 38 |
// Stream a coding-model completion. Same delta protocol as engineServer.stream.
|
| 39 |
+
// think=true asks reasoning models (Nemotron, BLS) to surface their <think>…</think> trace
|
| 40 |
+
// instead of hiding it, so the caller can show it in a debug panel.
|
| 41 |
+
export async function streamCoding(system, user, { maxTokens = 512, temperature = 0.6, think = false, onToken, onStats, signal } = {}) {
|
| 42 |
const st = statsTracker(onStats)
|
| 43 |
let full = ''
|
| 44 |
await streamSse('/text/generate/stream', {
|
|
|
|
| 47 |
user,
|
| 48 |
max_tokens: maxTokens,
|
| 49 |
temperature,
|
| 50 |
+
think,
|
| 51 |
}, {
|
| 52 |
signal,
|
| 53 |
onEvent(evt, parsed) {
|
|
@@ -5,6 +5,7 @@
|
|
| 5 |
// just generates and shows the skill; wiring it into the battle engine comes later.
|
| 6 |
import { streamCoding, currentCodingModel, onCodingModelChange } from '/web/codingModel.js'
|
| 7 |
import { listPersonas, getPersona, onRosterChange } from '/web/personaStore.js'
|
|
|
|
| 8 |
|
| 9 |
function el(tag, props = {}, kids = []) {
|
| 10 |
const n = document.createElement(tag)
|
|
@@ -46,17 +47,33 @@ export function mountSkillForgePanel(host) {
|
|
| 46 |
const empty = el('div', { class: 'persona-roster-empty' },
|
| 47 |
'No heroes yet — recruit one in the Personas tab, then come back to forge its skills.')
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
const controls = el('aside', { class: 'persona-controls skillforge' }, [
|
| 50 |
el('div', { class: 'persona-sec' }, [el('div', { class: 'persona-sec-title' }, 'Skill Forge'), el('span')]),
|
| 51 |
el('label', { class: 'persona-label' }, 'Hero'), sel,
|
| 52 |
empty,
|
| 53 |
el('label', { class: 'persona-label' }, 'Skill request'), req,
|
|
|
|
| 54 |
el('div', { class: 'persona-prompt-actions' }, [btn]),
|
| 55 |
status,
|
| 56 |
el('label', { class: 'persona-label' }, 'Forged skill'), out,
|
|
|
|
| 57 |
])
|
| 58 |
host.append(controls)
|
| 59 |
|
|
|
|
|
|
|
| 60 |
function refreshHeroes() {
|
| 61 |
const people = listPersonas()
|
| 62 |
const prev = sel.value
|
|
@@ -80,15 +97,25 @@ export function mountSkillForgePanel(host) {
|
|
| 80 |
const ask = req.value.trim()
|
| 81 |
if (!ask) { status.textContent = 'Describe the skill you want.'; return }
|
| 82 |
running = true; status.dataset.busy = '1'; btn.disabled = true
|
| 83 |
-
out.textContent = ''
|
| 84 |
status.textContent = `Forging with ${currentCodingModel().label}…`
|
| 85 |
const user = `${personaBlock(p)}\n\nSkill to create: ${ask}`
|
|
|
|
|
|
|
| 86 |
try {
|
| 87 |
const { stats } = await streamCoding(SYSTEM, user, {
|
| 88 |
maxTokens: 512,
|
| 89 |
temperature: 0.6,
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
})
|
|
|
|
| 92 |
const tps = stats && stats.tokPerSec ? ` · ${stats.tokPerSec} tok/s` : ''
|
| 93 |
status.textContent = `Done${tps}.`
|
| 94 |
} catch (e) {
|
|
|
|
| 5 |
// just generates and shows the skill; wiring it into the battle engine comes later.
|
| 6 |
import { streamCoding, currentCodingModel, onCodingModelChange } from '/web/codingModel.js'
|
| 7 |
import { listPersonas, getPersona, onRosterChange } from '/web/personaStore.js'
|
| 8 |
+
import { stripThink, stripThinkFinal } from '/web/personaPrompts.js'
|
| 9 |
|
| 10 |
function el(tag, props = {}, kids = []) {
|
| 11 |
const n = document.createElement(tag)
|
|
|
|
| 47 |
const empty = el('div', { class: 'persona-roster-empty' },
|
| 48 |
'No heroes yet — recruit one in the Personas tab, then come back to forge its skills.')
|
| 49 |
|
| 50 |
+
// "Show thinking" reveals the reasoning models' <think> trace (Nemotron, BLS) in the debug
|
| 51 |
+
// panel below; off by default so forging stays fast/clean. (Mellum2 has no reasoning.)
|
| 52 |
+
const thinkChk = el('input', { type: 'checkbox', class: 'skillforge-think' })
|
| 53 |
+
const thinkLabel = el('label', { class: 'persona-label skillforge-think-label' },
|
| 54 |
+
[thinkChk, ' show model thinking'])
|
| 55 |
+
const dbgEl = el('pre', { class: 'persona-think' })
|
| 56 |
+
const copyBtn = el('button', { class: 'persona-copy', type: 'button',
|
| 57 |
+
onclick: () => navigator.clipboard?.writeText(dbgEl.textContent || '') }, 'copy')
|
| 58 |
+
const dbgWrap = el('details', { class: 'persona-think-wrap' },
|
| 59 |
+
[el('summary', {}, 'model thinking / raw'), copyBtn, dbgEl])
|
| 60 |
+
dbgWrap.style.display = thinkChk.checked ? '' : 'none'
|
| 61 |
+
|
| 62 |
const controls = el('aside', { class: 'persona-controls skillforge' }, [
|
| 63 |
el('div', { class: 'persona-sec' }, [el('div', { class: 'persona-sec-title' }, 'Skill Forge'), el('span')]),
|
| 64 |
el('label', { class: 'persona-label' }, 'Hero'), sel,
|
| 65 |
empty,
|
| 66 |
el('label', { class: 'persona-label' }, 'Skill request'), req,
|
| 67 |
+
thinkLabel,
|
| 68 |
el('div', { class: 'persona-prompt-actions' }, [btn]),
|
| 69 |
status,
|
| 70 |
el('label', { class: 'persona-label' }, 'Forged skill'), out,
|
| 71 |
+
dbgWrap,
|
| 72 |
])
|
| 73 |
host.append(controls)
|
| 74 |
|
| 75 |
+
thinkChk.addEventListener('change', () => { dbgWrap.style.display = thinkChk.checked ? '' : 'none' })
|
| 76 |
+
|
| 77 |
function refreshHeroes() {
|
| 78 |
const people = listPersonas()
|
| 79 |
const prev = sel.value
|
|
|
|
| 97 |
const ask = req.value.trim()
|
| 98 |
if (!ask) { status.textContent = 'Describe the skill you want.'; return }
|
| 99 |
running = true; status.dataset.busy = '1'; btn.disabled = true
|
| 100 |
+
out.textContent = ''; dbgEl.textContent = ''
|
| 101 |
status.textContent = `Forging with ${currentCodingModel().label}…`
|
| 102 |
const user = `${personaBlock(p)}\n\nSkill to create: ${ask}`
|
| 103 |
+
const showThink = thinkChk.checked
|
| 104 |
+
let raw = ''
|
| 105 |
try {
|
| 106 |
const { stats } = await streamCoding(SYSTEM, user, {
|
| 107 |
maxTokens: 512,
|
| 108 |
temperature: 0.6,
|
| 109 |
+
think: showThink,
|
| 110 |
+
// Reasoning streams inside <think>…</think>; show the raw trace in the debug panel and
|
| 111 |
+
// the stripped answer in the output (same split the persona panel uses).
|
| 112 |
+
onToken: (t) => {
|
| 113 |
+
raw += t
|
| 114 |
+
out.textContent = stripThink(raw)
|
| 115 |
+
if (showThink) { dbgEl.textContent = raw; dbgWrap.open = true; dbgEl.scrollTop = dbgEl.scrollHeight }
|
| 116 |
+
},
|
| 117 |
})
|
| 118 |
+
out.textContent = stripThinkFinal(raw)
|
| 119 |
const tps = stats && stats.tokPerSec ? ` · ${stats.tokPerSec} tok/s` : ''
|
| 120 |
status.textContent = `Done${tps}.`
|
| 121 |
} catch (e) {
|