Spaces:

TAGOOZ
/

reasoning-trajectory-demo

Sleeping

App Files Files Community

Mustafa Tag Eldeen committed 8 days ago

Commit

80dbe0c

1 Parent(s): 075304e

swap: Qwen2.5-0.5B-Instruct (faster, reliable #### format) + fix dp2 detection bug

Browse files

Files changed (3) hide show

app.py +23 -18
learning-records/0003-qwen-swap.md +21 -0
src/inference/complete_pipeline.py +2 -1

app.py CHANGED Viewed

@@ -26,10 +26,10 @@ from src.features.windows import (
     compute_step_boundaries,
 )
 from src.features.span_detection import extract_answer_after_hash
-from src.utils import format_prompt, answers_match
-MODEL_ID = os.environ.get("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "192"))
 model = None
 tokenizer = None
@@ -70,20 +70,12 @@ def load_model():
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.padding_side = "left"
-        # Find all token IDs that decode to "Step" — some tokenizers have
-        # different IDs depending on context (e.g. after newline vs. word-start)
-        step_token_ids = set()
-        for word in ["Step", "▁Step"]:
-            try:
-                tid = tokenizer.encode(word, add_special_tokens=False)[0]
-                step_token_ids.add(tid)
-            except (IndexError, ValueError):
-                pass
-        # Also check all tokens in the vocab that decode to "Step"
-        for tid in range(tokenizer.vocab_size):
-            if tokenizer.decode([tid]).strip() == "Step":
-                step_token_ids.add(tid)
-        step_token_id = min(step_token_ids)
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
@@ -104,7 +96,20 @@ def generate_reasoning(question: str, gold_answer: str, max_new_tokens: int):
         if not loaded:
             return None, "Failed to load model."
-    prompt = format_prompt(question, template="cot")
     inputs = tokenizer(prompt, return_tensors="pt", padding=False)
     input_ids = inputs["input_ids"]
     attention_mask = inputs["attention_mask"]

     compute_step_boundaries,
 )
 from src.features.span_detection import extract_answer_after_hash
+from src.utils import answers_match
+MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct")
+MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "128"))
 model = None
 tokenizer = None
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.padding_side = "left"
+        # Qwen's tokenizer has a stable "Step" token ID (8304) across contexts
+        # Use the encoded token that results from "Step" at the start of generation
+        suffix = "\n<|im_start|>assistant\n"
+        suffix_ids = tokenizer.encode(suffix, add_special_tokens=False)
+        step_token_id = tokenizer.encode(suffix + "Step", add_special_tokens=False)[len(suffix_ids)]
+        step_token_ids = {step_token_id}
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
         if not loaded:
             return None, "Failed to load model."
+    # Build chat messages with the paper's "cot" template content
+    system_msg = (
+        'You are a helpful assistant that solves problems step by step with each step signified by "Step [step_number]: ". '
+        'Always provide your final answer after #### at the end.'
+    )
+    user_msg = (
+        f'Please solve this step by step, putting each step after "Step [step_number]: " '
+        f'and always provide your final answer after ####.\n\nQuestion: {question}'
+    )
+    messages = [
+        {"role": "system", "content": system_msg},
+        {"role": "user", "content": user_msg},
+    ]
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = tokenizer(prompt, return_tensors="pt", padding=False)
     input_ids = inputs["input_ids"]
     attention_mask = inputs["attention_mask"]

learning-records/0003-qwen-swap.md ADDED Viewed

	@@ -0,0 +1,21 @@

+# 0003: Swapped to Qwen2.5-0.5B-Instruct
+## Why
+TinyLlama-1.1B-Chat couldn't follow the `####` format with the base-model-style prompt (`"Solution:\n\n"`), causing answer extraction and dp2 detection to fail. Qwen2.5-0.5B-Instruct offers:
+- **494M params** vs 1.1B → ~2x faster on CPU (est. 5-10 tok/s)
+- **Excellent instruction following** → actually outputs `####`
+- **Stable token IDs** — "Step" is always 8304 (no context-dependence)
+## Changes
+### `app.py`
+- `MODEL_ID` → `Qwen/Qwen2.5-0.5B-Instruct`
+- `MAX_NEW_TOKENS` default → 128, down from 192 (smaller model needs fewer tokens)
+- Replaced `format_prompt(question, template="cot")` with `tokenizer.apply_chat_template()` — paper's prompt content preserved verbatim, just wrapped in the model's `<|im_start|>` chat format
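+- Simplified `step_token_id` detection — encodes "Step" immediately after the assistant generation prompt (`\n<|im_start|>assistant\n`, i.e. the suffix of the chat-formatted prompt) and takes the first token that follows it, yielding a single stable ID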
+### `complete_pipeline.py`
+- Fixed dp2 detection: only runs `detect_dp2_index` when `####` is actually in the generated text — prevents fallback numbers (e.g. "2" from "$2") from creating a wrong dp2_idx
+## Key insight
+Instruct/chat models need their native chat template format, even when the prompt content comes from a paper that used a base-model-style prompt. Wrapping that content with `apply_chat_template()` keeps the paper's instructions intact while presenting them in the format the model was trained to follow.

src/inference/complete_pipeline.py CHANGED Viewed

@@ -134,7 +134,8 @@ def process_complete_generation(
     output.produced_answer = produced_answer
     # Detect dp2 (start of extracted answer in token sequence)
-    if produced_answer:
         output.dp2_idx = detect_dp2_index(
             output.full_seq_ids,
             tokenizer,

     output.produced_answer = produced_answer
     # Detect dp2 (start of extracted answer in token sequence)
+    # Only when #### is present — otherwise fallback numbers mislead dp2 detection
+    if produced_answer and "####" in (output.produced_text or ""):
         output.dp2_idx = detect_dp2_index(
             output.full_seq_ids,
             tokenizer,