Spaces:

polats
/

tiny-army-bls-code-zerogpu

Running on Zero

App Files Community

polats committed 2 days ago

Commit

f1b8cae

verified ·

1 Parent(s): 765dead

Add think flag: optionally stream reasoning wrapped in <think>

Browse files

Files changed (1) hide show

app.py +53 -30

app.py CHANGED Viewed

@@ -76,21 +76,42 @@ def _build_inputs(system, user):
     return {k: v.to(_model.device) for k, v in enc.items()}
-def _extract_response(raw):
-    """Pull just the answer out of a (possibly partial) raw decode: content after
-    <|START_RESPONSE|> (or after <|END_THINKING|> as a fallback), up to <|END_RESPONSE|>."""
-    i = raw.find(START_RESP)
-    if i != -1:
-        body = raw[i + len(START_RESP):]
     else:
-        j = raw.find(END_THINK)
-        body = raw[j + len(END_THINK):] if j != -1 else ""
-    k = body.find(END_RESP)
     if k != -1:
-        body = body[:k]
-    for mark in _STRIP:
-        body = body.replace(mark, "")
-    return body.strip()
 def _gen_kwargs(inputs, max_tokens, temperature):
@@ -108,9 +129,10 @@ def _gen_kwargs(inputs, max_tokens, temperature):
 @spaces.GPU(duration=GPU_DURATION)
-def generate_stream(system, user, max_tokens, temperature):
-    """Stream CUMULATIVE response text (thinking suppressed). The main app diffs successive
-    yields into deltas. On failure, yield the traceback so it isn't a silent empty stream."""
     try:
         inputs = _build_inputs(system, user)
         # skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
@@ -129,32 +151,31 @@ def generate_stream(system, user, max_tokens, temperature):
         thread = threading.Thread(target=_run)
         thread.start()
-        acc, started = "", False
         for piece in streamer:
             acc += piece
-            if not started:
-                if START_RESP not in acc:
-                    continue  # still in the thinking block — emit nothing yet
-                started = True
-            yield _extract_response(acc)
         thread.join()
         if err:
-            yield (_extract_response(acc) + "\n[GENERATE ERROR]\n" + err["tb"])
-        elif not started:
-            # Model never opened a response block — fall back to whatever's after thinking.
-            yield _extract_response(acc) or "[EMPTY OUTPUT — no response block produced]"
     except Exception:  # noqa: BLE001
         import traceback
         yield "[SETUP ERROR]\n" + traceback.format_exc()
 @spaces.GPU(duration=GPU_DURATION)
-def generate(system, user, max_tokens, temperature):
     try:
         inputs = _build_inputs(system, user)
         out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
         raw = _tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
-        return _extract_response(raw) or "[EMPTY OUTPUT]"
     except Exception:  # noqa: BLE001
         import traceback
         return "[ERROR]\n" + traceback.format_exc()
@@ -167,14 +188,16 @@ with gr.Blocks(title="BLS Mini-Code 1.0 — Tiny Army sidecar") as demo:
     usr_in = gr.Textbox(label="user", lines=6)
     mt_in = gr.Slider(16, 2048, value=512, step=16, label="max_tokens")
     temp_in = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="temperature")
     out = gr.Textbox(label="output", lines=12)
     with gr.Row():
         stream_btn = gr.Button("Stream", variant="primary")
         once_btn = gr.Button("Generate")
     stream_btn.click(
-        generate_stream, [sys_in, usr_in, mt_in, temp_in], out, api_name="generate_stream"
     )
-    once_btn.click(generate, [sys_in, usr_in, mt_in, temp_in], out, api_name="generate")
 if __name__ == "__main__":
     demo.queue().launch()

     return {k: v.to(_model.device) for k, v in enc.items()}
+def _clean(s):
+    for mark in _STRIP:
+        s = s.replace(mark, "")
+    return s
+def _split(raw):
+    """Split a (possibly partial) raw decode into (thinking, response, response_started):
+    everything before <|START_RESPONSE|> (or <|END_THINKING|>) is reasoning; the rest, up to
+    <|END_RESPONSE|>, is the answer."""
+    resp_i = raw.find(START_RESP)
+    if resp_i != -1:
+        think_part, resp, started = raw[:resp_i], raw[resp_i + len(START_RESP):], True
     else:
+        end_t = raw.find(END_THINK)
+        if end_t != -1:
+            think_part, resp, started = raw[:end_t], raw[end_t + len(END_THINK):], True
+        else:
+            think_part, resp, started = raw, "", False
+    k = resp.find(END_RESP)
     if k != -1:
+        resp = resp[:k]
+    return _clean(think_part).strip(), _clean(resp).strip(), started
+def _render(raw, think):
+    """Cumulative output string. think=False → clean answer only (reasoning discarded).
+    think=True → reasoning wrapped in <think>…</think> ahead of the answer; the main app
+    strips it for the clean view but shows it in a debug panel (same convention the persona
+    models use), so the user can watch the model reason."""
+    thinking, resp, started = _split(raw)
+    if not think:
+        return resp
+    if started:
+        return f"<think>\n{thinking}\n</think>\n{resp}".strip()
+    return f"<think>\n{thinking}".strip()
 def _gen_kwargs(inputs, max_tokens, temperature):
 @spaces.GPU(duration=GPU_DURATION)
+def generate_stream(system, user, max_tokens, temperature, think=False):
+    """Stream CUMULATIVE output. think=False suppresses reasoning (clean code only); think=True
+    streams the reasoning live wrapped in <think>…</think>. The main app diffs successive yields
+    into deltas. On failure, yield the traceback so it isn't a silent empty stream."""
     try:
         inputs = _build_inputs(system, user)
         # skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
         thread = threading.Thread(target=_run)
         thread.start()
+        acc, emitted = "", False
         for piece in streamer:
             acc += piece
+            # When hiding thinking, emit nothing until the response block opens.
+            if not think and not _split(acc)[2]:
+                continue
+            emitted = True
+            yield _render(acc, think)
         thread.join()
         if err:
+            yield (_render(acc, think) + "\n[GENERATE ERROR]\n" + err["tb"])
+        elif not emitted:
+            yield _render(acc, think) or "[EMPTY OUTPUT — no response block produced]"
     except Exception:  # noqa: BLE001
         import traceback
         yield "[SETUP ERROR]\n" + traceback.format_exc()
 @spaces.GPU(duration=GPU_DURATION)
+def generate(system, user, max_tokens, temperature, think=False):
     try:
         inputs = _build_inputs(system, user)
         out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
         raw = _tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
+        return _render(raw, think) or "[EMPTY OUTPUT]"
     except Exception:  # noqa: BLE001
         import traceback
         return "[ERROR]\n" + traceback.format_exc()
     usr_in = gr.Textbox(label="user", lines=6)
     mt_in = gr.Slider(16, 2048, value=512, step=16, label="max_tokens")
     temp_in = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="temperature")
+    # 5th input — defaults False so existing 4-arg API callers keep getting clean code.
+    think_in = gr.Checkbox(value=False, label="show thinking (wrap reasoning in <think>…</think>)")
     out = gr.Textbox(label="output", lines=12)
     with gr.Row():
         stream_btn = gr.Button("Stream", variant="primary")
         once_btn = gr.Button("Generate")
     stream_btn.click(
+        generate_stream, [sys_in, usr_in, mt_in, temp_in, think_in], out, api_name="generate_stream"
     )
+    once_btn.click(generate, [sys_in, usr_in, mt_in, temp_in, think_in], out, api_name="generate")
 if __name__ == "__main__":
     demo.queue().launch()