polats committed on
Commit
f1b8cae
·
verified ·
1 Parent(s): 765dead

Add think flag: optionally stream reasoning wrapped in <think>

Browse files
Files changed (1) hide show
  1. app.py +53 -30
app.py CHANGED
@@ -76,21 +76,42 @@ def _build_inputs(system, user):
76
  return {k: v.to(_model.device) for k, v in enc.items()}
77
 
78
 
79
- def _extract_response(raw):
80
- """Pull just the answer out of a (possibly partial) raw decode: content after
81
- <|START_RESPONSE|> (or after <|END_THINKING|> as a fallback), up to <|END_RESPONSE|>."""
82
- i = raw.find(START_RESP)
83
- if i != -1:
84
- body = raw[i + len(START_RESP):]
 
 
 
 
 
 
 
85
  else:
86
- j = raw.find(END_THINK)
87
- body = raw[j + len(END_THINK):] if j != -1 else ""
88
- k = body.find(END_RESP)
 
 
 
89
  if k != -1:
90
- body = body[:k]
91
- for mark in _STRIP:
92
- body = body.replace(mark, "")
93
- return body.strip()
 
 
 
 
 
 
 
 
 
 
 
94
 
95
 
96
  def _gen_kwargs(inputs, max_tokens, temperature):
@@ -108,9 +129,10 @@ def _gen_kwargs(inputs, max_tokens, temperature):
108
 
109
 
110
  @spaces.GPU(duration=GPU_DURATION)
111
- def generate_stream(system, user, max_tokens, temperature):
112
- """Stream CUMULATIVE response text (thinking suppressed). The main app diffs successive
113
- yields into deltas. On failure, yield the traceback so it isn't a silent empty stream."""
 
114
  try:
115
  inputs = _build_inputs(system, user)
116
  # skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
@@ -129,32 +151,31 @@ def generate_stream(system, user, max_tokens, temperature):
129
 
130
  thread = threading.Thread(target=_run)
131
  thread.start()
132
- acc, started = "", False
133
  for piece in streamer:
134
  acc += piece
135
- if not started:
136
- if START_RESP not in acc:
137
- continue # still in the thinking block — emit nothing yet
138
- started = True
139
- yield _extract_response(acc)
140
  thread.join()
141
  if err:
142
- yield (_extract_response(acc) + "\n[GENERATE ERROR]\n" + err["tb"])
143
- elif not started:
144
- # Model never opened a response block — fall back to whatever's after thinking.
145
- yield _extract_response(acc) or "[EMPTY OUTPUT — no response block produced]"
146
  except Exception: # noqa: BLE001
147
  import traceback
148
  yield "[SETUP ERROR]\n" + traceback.format_exc()
149
 
150
 
151
  @spaces.GPU(duration=GPU_DURATION)
152
- def generate(system, user, max_tokens, temperature):
153
  try:
154
  inputs = _build_inputs(system, user)
155
  out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
156
  raw = _tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
157
- return _extract_response(raw) or "[EMPTY OUTPUT]"
158
  except Exception: # noqa: BLE001
159
  import traceback
160
  return "[ERROR]\n" + traceback.format_exc()
@@ -167,14 +188,16 @@ with gr.Blocks(title="BLS Mini-Code 1.0 — Tiny Army sidecar") as demo:
167
  usr_in = gr.Textbox(label="user", lines=6)
168
  mt_in = gr.Slider(16, 2048, value=512, step=16, label="max_tokens")
169
  temp_in = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="temperature")
 
 
170
  out = gr.Textbox(label="output", lines=12)
171
  with gr.Row():
172
  stream_btn = gr.Button("Stream", variant="primary")
173
  once_btn = gr.Button("Generate")
174
  stream_btn.click(
175
- generate_stream, [sys_in, usr_in, mt_in, temp_in], out, api_name="generate_stream"
176
  )
177
- once_btn.click(generate, [sys_in, usr_in, mt_in, temp_in], out, api_name="generate")
178
 
179
  if __name__ == "__main__":
180
  demo.queue().launch()
 
76
  return {k: v.to(_model.device) for k, v in enc.items()}
77
 
78
 
79
+ def _clean(s):
80
+ for mark in _STRIP:
81
+ s = s.replace(mark, "")
82
+ return s
83
+
84
+
85
+ def _split(raw):
86
+ """Split a (possibly partial) raw decode into (thinking, response, response_started):
87
+ everything before <|START_RESPONSE|> (or <|END_THINKING|>) is reasoning; the rest, up to
88
+ <|END_RESPONSE|>, is the answer."""
89
+ resp_i = raw.find(START_RESP)
90
+ if resp_i != -1:
91
+ think_part, resp, started = raw[:resp_i], raw[resp_i + len(START_RESP):], True
92
  else:
93
+ end_t = raw.find(END_THINK)
94
+ if end_t != -1:
95
+ think_part, resp, started = raw[:end_t], raw[end_t + len(END_THINK):], True
96
+ else:
97
+ think_part, resp, started = raw, "", False
98
+ k = resp.find(END_RESP)
99
  if k != -1:
100
+ resp = resp[:k]
101
+ return _clean(think_part).strip(), _clean(resp).strip(), started
102
+
103
+
104
+ def _render(raw, think):
105
+ """Cumulative output string. think=False → clean answer only (reasoning discarded).
106
+ think=True → reasoning wrapped in <think>…</think> ahead of the answer; the main app
107
+ strips it for the clean view but shows it in a debug panel (same convention the persona
108
+ models use), so the user can watch the model reason."""
109
+ thinking, resp, started = _split(raw)
110
+ if not think:
111
+ return resp
112
+ if started:
113
+ return f"<think>\n{thinking}\n</think>\n{resp}".strip()
114
+ return f"<think>\n{thinking}".strip()
115
 
116
 
117
  def _gen_kwargs(inputs, max_tokens, temperature):
 
129
 
130
 
131
  @spaces.GPU(duration=GPU_DURATION)
132
+ def generate_stream(system, user, max_tokens, temperature, think=False):
133
+ """Stream CUMULATIVE output. think=False suppresses reasoning (clean code only); think=True
134
+ streams the reasoning live wrapped in <think>…</think>. The main app diffs successive yields
135
+ into deltas. On failure, yield the traceback so it isn't a silent empty stream."""
136
  try:
137
  inputs = _build_inputs(system, user)
138
  # skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
 
151
 
152
  thread = threading.Thread(target=_run)
153
  thread.start()
154
+ acc, emitted = "", False
155
  for piece in streamer:
156
  acc += piece
157
+ # When hiding thinking, emit nothing until the response block opens.
158
+ if not think and not _split(acc)[2]:
159
+ continue
160
+ emitted = True
161
+ yield _render(acc, think)
162
  thread.join()
163
  if err:
164
+ yield (_render(acc, think) + "\n[GENERATE ERROR]\n" + err["tb"])
165
+ elif not emitted:
166
+ yield _render(acc, think) or "[EMPTY OUTPUT — no response block produced]"
 
167
  except Exception: # noqa: BLE001
168
  import traceback
169
  yield "[SETUP ERROR]\n" + traceback.format_exc()
170
 
171
 
172
  @spaces.GPU(duration=GPU_DURATION)
173
+ def generate(system, user, max_tokens, temperature, think=False):
174
  try:
175
  inputs = _build_inputs(system, user)
176
  out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
177
  raw = _tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
178
+ return _render(raw, think) or "[EMPTY OUTPUT]"
179
  except Exception: # noqa: BLE001
180
  import traceback
181
  return "[ERROR]\n" + traceback.format_exc()
 
188
  usr_in = gr.Textbox(label="user", lines=6)
189
  mt_in = gr.Slider(16, 2048, value=512, step=16, label="max_tokens")
190
  temp_in = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="temperature")
191
+ # 5th input — defaults False so existing 4-arg API callers keep getting clean code.
192
+ think_in = gr.Checkbox(value=False, label="show thinking (wrap reasoning in <think>…</think>)")
193
  out = gr.Textbox(label="output", lines=12)
194
  with gr.Row():
195
  stream_btn = gr.Button("Stream", variant="primary")
196
  once_btn = gr.Button("Generate")
197
  stream_btn.click(
198
+ generate_stream, [sys_in, usr_in, mt_in, temp_in, think_in], out, api_name="generate_stream"
199
  )
200
+ once_btn.click(generate, [sys_in, usr_in, mt_in, temp_in, think_in], out, api_name="generate")
201
 
202
  if __name__ == "__main__":
203
  demo.queue().launch()