Spaces:

polats
/

tiny-army-bls-code-zerogpu

Running on Zero

App Files Files Community

polats committed 3 days ago

Commit

8fce99a

verified ·

1 Parent(s): e41a733

Reasoning model: think in discarded block, stream only clean response code

Browse files

Files changed (1) hide show

app.py +54 -23

app.py CHANGED Viewed

@@ -8,6 +8,13 @@
 # Model: CohereLabs/BLS-Mini-Code-1.0 — 30B MoE (cohere2_moe), BF16 only upstream (no FP8
 # weight published as of 2026-06), so we quantize AT LOAD via bitsandbytes to fit the ZeroGPU
 # H200 slice. TINY_BLS_QUANT selects 4bit (default, ~18GB) / 8bit (~32GB) / bf16 (~60GB, tight).
 import os
 import threading
@@ -19,6 +26,11 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
 MODEL_ID = os.environ.get("TINY_BLS_MODEL", "CohereLabs/BLS-Mini-Code-1.0")
 QUANT = os.environ.get("TINY_BLS_QUANT", "4bit").strip().lower()
 GPU_DURATION = int(os.environ.get("TINY_BLS_GPU_DURATION", "120"))
 print(f"[bls-code] loading {MODEL_ID} quant={QUANT}", flush=True)
@@ -53,24 +65,39 @@ def _build_inputs(system, user):
     if system and system.strip():
         messages.append({"role": "system", "content": system.strip()})
     messages.append({"role": "user", "content": (user or "").strip()})
-    # return_dict=True yields {input_ids, attention_mask}; this transformers build returns a
-    # BatchEncoding even with return_tensors="pt", so we splat it into generate() rather than
-    # passing it as a bare input_ids tensor (which fails on .shape).
-    enc = _tok.apply_chat_template(
-        messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
-    )
     return {k: v.to(_model.device) for k, v in enc.items()}
-def _prompt_len(inputs):
-    return inputs["input_ids"].shape[-1]
 def _gen_kwargs(inputs, max_tokens, temperature):
     temp = float(temperature if temperature is not None else 0.6)
     kw = dict(
         **inputs,
-        max_new_tokens=int(max_tokens or 512),
         do_sample=temp > 0,
         pad_token_id=_tok.pad_token_id or _tok.eos_token_id,
     )
@@ -81,12 +108,12 @@ def _gen_kwargs(inputs, max_tokens, temperature):
 @spaces.GPU(duration=GPU_DURATION)
 def generate_stream(system, user, max_tokens, temperature):
-    """Yield CUMULATIVE decoded text — the main app diffs successive yields into deltas.
-    On failure, yield the traceback as text so the client (and us) can see what broke
-    instead of a silent empty stream."""
     try:
         inputs = _build_inputs(system, user)
-        streamer = TextIteratorStreamer(_tok, skip_prompt=True, skip_special_tokens=True)
         kw = _gen_kwargs(inputs, max_tokens, temperature)
         kw["streamer"] = streamer
         err = {}
@@ -94,22 +121,27 @@ def generate_stream(system, user, max_tokens, temperature):
         def _run():
             try:
                 _model.generate(**kw)
-            except Exception as e:  # noqa: BLE001
                 import traceback
                 err["tb"] = traceback.format_exc()
-                streamer.end()  # unblock the consumer
         thread = threading.Thread(target=_run)
         thread.start()
-        acc = ""
         for piece in streamer:
             acc += piece
-            yield acc
         thread.join()
         if err:
-            yield (acc + "\n[GENERATE ERROR]\n" + err["tb"])
-        elif not acc:
-            yield "[EMPTY OUTPUT — generation produced no decodable tokens]"
     except Exception:  # noqa: BLE001
         import traceback
         yield "[SETUP ERROR]\n" + traceback.format_exc()
@@ -120,9 +152,8 @@ def generate(system, user, max_tokens, temperature):
     try:
         inputs = _build_inputs(system, user)
         out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
-        # DEBUG: raw decode (special tokens kept) to inspect the reasoning/response structure.
-        raw = _tok.decode(out[0][_prompt_len(inputs):], skip_special_tokens=False)
-        return "[RAW]\n" + raw
     except Exception:  # noqa: BLE001
         import traceback
         return "[ERROR]\n" + traceback.format_exc()

 # Model: CohereLabs/BLS-Mini-Code-1.0 — 30B MoE (cohere2_moe), BF16 only upstream (no FP8
 # weight published as of 2026-06), so we quantize AT LOAD via bitsandbytes to fit the ZeroGPU
 # H200 slice. TINY_BLS_QUANT selects 4bit (default, ~18GB) / 8bit (~32GB) / bf16 (~60GB, tight).
+#
+# REASONING: BLS-Mini-Code is a Cohere reasoning model. Its chat template, with
+# add_generation_prompt=True, force-opens <|START_RESPONSE|> (non-reasoning mode) — which makes
+# the model dump its reasoning as prose into the answer. Instead we open a <|START_THINKING|>
+# block so it reasons in a dedicated section we DISCARD, and we stream only the clean code from
+# <|START_RESPONSE|>…<|END_RESPONSE|>. TINY_BLS_THINK_BUDGET extra tokens are added to
+# max_new_tokens for the (discarded) thinking. The cap is combined (max_tokens + budget), so
+# thinking that overruns the budget still cuts into the tokens left for the visible code.
 import os
 import threading
 MODEL_ID = os.environ.get("TINY_BLS_MODEL", "CohereLabs/BLS-Mini-Code-1.0")
 QUANT = os.environ.get("TINY_BLS_QUANT", "4bit").strip().lower()
 GPU_DURATION = int(os.environ.get("TINY_BLS_GPU_DURATION", "120"))
+THINK_BUDGET = int(os.environ.get("TINY_BLS_THINK_BUDGET", "1024"))
+START_THINK, END_THINK = "<|START_THINKING|>", "<|END_THINKING|>"
+START_RESP, END_RESP = "<|START_RESPONSE|>", "<|END_RESPONSE|>"
+_STRIP = (START_THINK, END_THINK, START_RESP, END_RESP, "<|END_OF_TURN_TOKEN|>")
 print(f"[bls-code] loading {MODEL_ID} quant={QUANT}", flush=True)
     if system and system.strip():
         messages.append({"role": "system", "content": system.strip()})
     messages.append({"role": "user", "content": (user or "").strip()})
+    text = _tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+    # The template force-opens <|START_RESPONSE|> (non-reasoning). Swap it for a thinking block
+    # so the model reasons where we can discard it, leaving clean code in the response section.
+    t = text.rstrip()
+    if t.endswith(START_RESP):
+        text = t[: -len(START_RESP)] + START_THINK
+    enc = _tok(text, return_tensors="pt", add_special_tokens=False)
     return {k: v.to(_model.device) for k, v in enc.items()}
+def _extract_response(raw):
+    """Pull just the answer out of a (possibly partial) raw decode: content after
+    <|START_RESPONSE|> (or after <|END_THINKING|> as a fallback), up to <|END_RESPONSE|>."""
+    i = raw.find(START_RESP)
+    if i != -1:
+        body = raw[i + len(START_RESP):]
+    else:
+        j = raw.find(END_THINK)
+        body = raw[j + len(END_THINK):] if j != -1 else ""
+    k = body.find(END_RESP)
+    if k != -1:
+        body = body[:k]
+    for mark in _STRIP:
+        body = body.replace(mark, "")
+    return body.strip()
 def _gen_kwargs(inputs, max_tokens, temperature):
     temp = float(temperature if temperature is not None else 0.6)
     kw = dict(
         **inputs,
+        # Reserve THINK_BUDGET on top so the discarded reasoning doesn't eat the code budget.
+        max_new_tokens=int(max_tokens or 512) + THINK_BUDGET,
         do_sample=temp > 0,
         pad_token_id=_tok.pad_token_id or _tok.eos_token_id,
     )
 @spaces.GPU(duration=GPU_DURATION)
 def generate_stream(system, user, max_tokens, temperature):
+    """Stream CUMULATIVE response text (thinking suppressed). The main app diffs successive
+    yields into deltas. On failure, yield the traceback so it isn't a silent empty stream."""
     try:
         inputs = _build_inputs(system, user)
+        # skip_special_tokens=False so we can SEE the thinking/response markers and split on them.
+        streamer = TextIteratorStreamer(_tok, skip_prompt=True, skip_special_tokens=False)
         kw = _gen_kwargs(inputs, max_tokens, temperature)
         kw["streamer"] = streamer
         err = {}
         def _run():
             try:
                 _model.generate(**kw)
+            except Exception:  # noqa: BLE001
                 import traceback
                 err["tb"] = traceback.format_exc()
+                streamer.end()
         thread = threading.Thread(target=_run)
         thread.start()
+        acc, started = "", False
         for piece in streamer:
             acc += piece
+            if not started:
+                if START_RESP not in acc:
+                    continue  # still in the thinking block — emit nothing yet
+                started = True
+            yield _extract_response(acc)
         thread.join()
         if err:
+            yield (_extract_response(acc) + "\n[GENERATE ERROR]\n" + err["tb"])
+        elif not started:
+            # Model never opened a response block — fall back to whatever's after thinking.
+            yield _extract_response(acc) or "[EMPTY OUTPUT — no response block produced]"
     except Exception:  # noqa: BLE001
         import traceback
         yield "[SETUP ERROR]\n" + traceback.format_exc()
     try:
         inputs = _build_inputs(system, user)
         out = _model.generate(**_gen_kwargs(inputs, max_tokens, temperature))
+        raw = _tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
+        return _extract_response(raw) or "[EMPTY OUTPUT]"
     except Exception:  # noqa: BLE001
         import traceback
         return "[ERROR]\n" + traceback.format_exc()