polats committed on
Commit
8fce99a
·
verified ·
1 Parent(s): e41a733

Reasoning model: think in discarded block, stream only clean response code

Browse files
Files changed (1) hide show
  1. app.py +54 -23
app.py CHANGED
@@ -8,6 +8,13 @@
8
  # Model: CohereLabs/BLS-Mini-Code-1.0 — 30B MoE (cohere2_moe), BF16 only upstream (no FP8
9
  # weight published as of 2026-06), so we quantize AT LOAD via bitsandbytes to fit the ZeroGPU
10
  # H200 slice. TINY_BLS_QUANT selects 4bit (default, ~18GB) / 8bit (~32GB) / bf16 (~60GB, tight).
 
 
 
 
 
 
 
11
  import os
12
  import threading
13
 
@@ -19,6 +26,11 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
19
  MODEL_ID = os.environ.get("TINY_BLS_MODEL", "CohereLabs/BLS-Mini-Code-1.0")
20
  QUANT = os.environ.get("TINY_BLS_QUANT", "4bit").strip().lower()
21
  GPU_DURATION = int(os.environ.get("TINY_BLS_GPU_DURATION", "120"))
 
 
 
 
 
22
 
23
  print(f"[bls-code] loading {MODEL_ID} quant={QUANT}", flush=True)
24
 
@@ -53,24 +65,39 @@ def _build_inputs(system, user):
53
  if system and system.strip():
54
  messages.append({"role": "system", "content": system.strip()})
55
  messages.append({"role": "user", "content": (user or "").strip()})
56
- # return_dict=True yields {input_ids, attention_mask}; this transformers build returns a
57
- # BatchEncoding even with return_tensors="pt", so we splat it into generate() rather than
58
- # passing it as a bare input_ids tensor (which fails on .shape).
59
- enc = _tok.apply_chat_template(
60
- messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
61
- )
 
62
  return {k: v.to(_model.device) for k, v in enc.items()}
63
 
64
 
65
- def _prompt_len(inputs):
66
- return inputs["input_ids"].shape[-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
 
69
  def _gen_kwargs(inputs, max_tokens, temperature):
70
  temp = float(temperature if temperature is not None else 0.6)
71
  kw = dict(
72
  **inputs,
73
- max_new_tokens=int(max_tokens or 512),
 
74
  do_sample=temp > 0,
75
  pad_token_id=_tok.pad_token_id or _tok.eos_token_id,
76
  )
@@ -81,12 +108,12 @@ def _gen_kwargs(inputs, max_tokens, temperature):
81
 
82
  @spaces.GPU(duration=GPU_DURATION)
83
  def generate_stream(system, user, max_tokens, temperature):
84
- """Yield CUMULATIVE decoded text — the main app diffs successive yields into deltas.
85
- On failure, yield the traceback as text so the client (and us) can see what broke
86
- instead of a silent empty stream."""
87
  try:
88
  inputs = _build_inputs(system, user)
89
- streamer = TextIteratorStreamer(_tok, skip_prompt=True, skip_special_tokens=True)
 
90
  kw = _gen_kwargs(inputs, max_tokens, temperature)
91
  kw["streamer"] = streamer
92
  err = {}
@@ -94,22 +121,27 @@ def generate_stream(system, user, max_tokens, temperature):
94
  def _run():
95
  try:
96
  _model.generate(**kw)
97
- except Exception as e: # noqa: BLE001
98
  import traceback
99
  err["tb"] = traceback.format_exc()
100
- streamer.end() # unblock the consumer
101
 
102
  thread = threading.Thread(target=_run)
103
  thread.start()
104
- acc = ""
105
  for piece in streamer:
106
  acc += piece
107
- yield acc
 
 
 
 
108
  thread.join()
109
  if err:
110
- yield (acc + "\n[GENERATE ERROR]\n" + err["tb"])
111
- elif not acc:
112
- yield "[EMPTY OUTPUT — generation produced no decodable tokens]"
 
113
  except Exception: # noqa: BLE001
114
  import traceback
115
  yield "[SETUP ERROR]\n" + traceback.format_exc()
@@ -120,9 +152,8 @@ def generate(system, user, max_tokens, temperature):
120
  try:
121
  inputs = _build_inputs(system, user)
122
  out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
123
- # DEBUG: raw decode (special tokens kept) to inspect the reasoning/response structure.
124
- raw = _tok.decode(out[0][_prompt_len(inputs):], skip_special_tokens=False)
125
- return "[RAW]\n" + raw
126
  except Exception: # noqa: BLE001
127
  import traceback
128
  return "[ERROR]\n" + traceback.format_exc()
 
8
  # Model: CohereLabs/BLS-Mini-Code-1.0 — 30B MoE (cohere2_moe), BF16 only upstream (no FP8
9
  # weight published as of 2026-06), so we quantize AT LOAD via bitsandbytes to fit the ZeroGPU
10
  # H200 slice. TINY_BLS_QUANT selects 4bit (default, ~18GB) / 8bit (~32GB) / bf16 (~60GB, tight).
11
+ #
12
+ # REASONING: BLS-Mini-Code is a Cohere reasoning model. Its chat template, with
13
+ # add_generation_prompt=True, force-opens <|START_RESPONSE|> (non-reasoning mode) — which makes
14
+ # the model dump its reasoning as prose into the answer. Instead we open a <|START_THINKING|>
15
+ # block so it reasons in a dedicated section we DISCARD, and we stream only the clean code from
16
+ # <|START_RESPONSE|>…<|END_RESPONSE|>. TINY_BLS_THINK_BUDGET extra tokens are reserved for the
17
+ # (discarded) thinking so the requested max_tokens still applies to the visible code.
18
  import os
19
  import threading
20
 
 
26
  MODEL_ID = os.environ.get("TINY_BLS_MODEL", "CohereLabs/BLS-Mini-Code-1.0")
27
  QUANT = os.environ.get("TINY_BLS_QUANT", "4bit").strip().lower()
28
  GPU_DURATION = int(os.environ.get("TINY_BLS_GPU_DURATION", "120"))
29
+ THINK_BUDGET = int(os.environ.get("TINY_BLS_THINK_BUDGET", "1024"))
30
+
31
+ START_THINK, END_THINK = "<|START_THINKING|>", "<|END_THINKING|>"
32
+ START_RESP, END_RESP = "<|START_RESPONSE|>", "<|END_RESPONSE|>"
33
+ _STRIP = (START_THINK, END_THINK, START_RESP, END_RESP, "<|END_OF_TURN_TOKEN|>")
34
 
35
  print(f"[bls-code] loading {MODEL_ID} quant={QUANT}", flush=True)
36
 
 
65
  if system and system.strip():
66
  messages.append({"role": "system", "content": system.strip()})
67
  messages.append({"role": "user", "content": (user or "").strip()})
68
+ text = _tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
69
+ # The template force-opens <|START_RESPONSE|> (non-reasoning). Swap it for a thinking block
70
+ # so the model reasons where we can discard it, leaving clean code in the response section.
71
+ t = text.rstrip()
72
+ if t.endswith(START_RESP):
73
+ text = t[: -len(START_RESP)] + START_THINK
74
+ enc = _tok(text, return_tensors="pt", add_special_tokens=False)
75
  return {k: v.to(_model.device) for k, v in enc.items()}
76
 
77
 
78
+ def _extract_response(raw):
79
+ """Pull just the answer out of a (possibly partial) raw decode: content after
80
+ <|START_RESPONSE|> (or after <|END_THINKING|> as a fallback), up to <|END_RESPONSE|>."""
81
+ i = raw.find(START_RESP)
82
+ if i != -1:
83
+ body = raw[i + len(START_RESP):]
84
+ else:
85
+ j = raw.find(END_THINK)
86
+ body = raw[j + len(END_THINK):] if j != -1 else ""
87
+ k = body.find(END_RESP)
88
+ if k != -1:
89
+ body = body[:k]
90
+ for mark in _STRIP:
91
+ body = body.replace(mark, "")
92
+ return body.strip()
93
 
94
 
95
  def _gen_kwargs(inputs, max_tokens, temperature):
96
  temp = float(temperature if temperature is not None else 0.6)
97
  kw = dict(
98
  **inputs,
99
+ # Reserve THINK_BUDGET on top so the discarded reasoning doesn't eat the code budget.
100
+ max_new_tokens=int(max_tokens or 512) + THINK_BUDGET,
101
  do_sample=temp > 0,
102
  pad_token_id=_tok.pad_token_id or _tok.eos_token_id,
103
  )
 
108
 
109
  @spaces.GPU(duration=GPU_DURATION)
110
  def generate_stream(system, user, max_tokens, temperature):
111
+ """Stream CUMULATIVE response text (thinking suppressed). The main app diffs successive
112
+ yields into deltas. On failure, yield the traceback so it isn't a silent empty stream."""
 
113
  try:
114
  inputs = _build_inputs(system, user)
115
+ # skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
116
+ streamer = TextIteratorStreamer(_tok, skip_prompt=True, skip_special_tokens=False)
117
  kw = _gen_kwargs(inputs, max_tokens, temperature)
118
  kw["streamer"] = streamer
119
  err = {}
 
121
  def _run():
122
  try:
123
  _model.generate(**kw)
124
+ except Exception: # noqa: BLE001
125
  import traceback
126
  err["tb"] = traceback.format_exc()
127
+ streamer.end()
128
 
129
  thread = threading.Thread(target=_run)
130
  thread.start()
131
+ acc, started = "", False
132
  for piece in streamer:
133
  acc += piece
134
+ if not started:
135
+ if START_RESP not in acc:
136
+ continue # still in the thinking block — emit nothing yet
137
+ started = True
138
+ yield _extract_response(acc)
139
  thread.join()
140
  if err:
141
+ yield (_extract_response(acc) + "\n[GENERATE ERROR]\n" + err["tb"])
142
+ elif not started:
143
+ # Model never opened a response block β€” fall back to whatever's after thinking.
144
+ yield _extract_response(acc) or "[EMPTY OUTPUT — no response block produced]"
145
  except Exception: # noqa: BLE001
146
  import traceback
147
  yield "[SETUP ERROR]\n" + traceback.format_exc()
 
152
  try:
153
  inputs = _build_inputs(system, user)
154
  out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
155
+ raw = _tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
156
+ return _extract_response(raw) or "[EMPTY OUTPUT]"
 
157
  except Exception: # noqa: BLE001
158
  import traceback
159
  return "[ERROR]\n" + traceback.format_exc()