Spaces:

mindchain
/

rlm-arithmetic-training

Runtime error

App Files Files Community

mindchain committed on Feb 17

Commit

306c5f0

verified ·

1 Parent(s): a80cc87

Upload train_arithmetic.py with huggingface_hub

Browse files

Files changed (1) hide show

train_arithmetic.py +214 -0

train_arithmetic.py ADDED Viewed

	@@ -0,0 +1,214 @@

+#!/usr/bin/env python3
+"""
+GRPO + RLVR Training for Simple Arithmetic
+Task: 2-digit addition and subtraction
+Base Model: Qwen/Qwen3-0.6B-Base
+"""
+import os
+import re
+import random
+import torch
+from datasets import Dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import GRPOConfig, GRPOTrainer
+# ============================================================================
+# CONFIG
+# ============================================================================
+# Checkpoint id passed to from_pretrained() for both the tokenizer and the model.
+BASE_MODEL = "Qwen/Qwen3-0.6B-Base"
+# Repo id that the trained model and the tokenizer are pushed to after training.
+OUTPUT_MODEL = "mindchain/qwen3-0.6b-arithmetic"
+# Passed to GRPOConfig as both max_steps and save_steps (single save at the end).
+MAX_STEPS = 50
+NUM_SAMPLES = 500  # Training samples
+EVAL_SAMPLES = 20   # For baseline test
+# ============================================================================
+# DATA GENERATION
+# ============================================================================
+def generate_arithmetic_samples(n_samples):
+    """
+    Build `n_samples` random two-operand addition/subtraction problems.
+    Each sample is a dict with a 'prompt' (instruction plus "a op b = ?")
+    and an 'answer' (the result as a string). Addition draws both operands
+    from 10..99; subtraction draws the first from 20..99 and the second from
+    10..first-1, so the difference is always at least 1.
+    Uses the module-level `random` state, so seeding `random` makes the
+    output reproducible.
+    """
+    def build_sample():
+        # Draw order (operator, first operand, second operand) is significant
+        # for reproducibility under a fixed seed.
+        symbol = random.choice(('+', '-'))
+        if symbol == '-':
+            lhs = random.randint(20, 99)
+            rhs = random.randint(10, lhs - 1)  # strictly smaller than lhs
+            result = lhs - rhs
+        else:
+            lhs = random.randint(10, 99)
+            rhs = random.randint(10, 99)
+            result = lhs + rhs
+        question = f"{lhs} {symbol} {rhs} = ?"
+        return {
+            'prompt': f"Solve this arithmetic problem. Give only the answer as a number.\n\n{question}",
+            'answer': str(result),
+        }
+    return [build_sample() for _ in range(n_samples)]
+# ============================================================================
+# REWARD FUNCTION
+# ============================================================================
+def reward_func(completions, prompts, **kwargs):
+    """
+    Score each completion 1.0 if its last number equals the ground truth, else 0.0.
+    Args:
+        completions: Generated outputs. Each entry is either a plain string or
+            a list of chat messages (dicts whose 'content' is the text).
+        prompts: Accepted for signature compatibility; not used for scoring.
+        **kwargs: Ground truths are read from 'answer', falling back to
+            'ground_truth'. They are paired with `completions` by position.
+    Returns:
+        A list of floats, one per (completion, truth) pair. If no ground truth
+        is supplied, a list of 0.0 with one entry per completion.
+    """
+    answers = kwargs.get('answer', kwargs.get('ground_truth', None))
+    if answers is None:
+        return [0.0] * len(completions)
+    # The dot is only consumed when digits follow it, so a sentence-ending
+    # period ("The answer is 42.") is not glued onto the extracted number.
+    number_pattern = r'-?\d+(?:\.\d+)?'
+    rewards = []
+    for completion, truth in zip(completions, answers):
+        # Handle list format (conversational)
+        if isinstance(completion, list):
+            text = " ".join(
+                (m.get('content') or '') if isinstance(m, dict) else str(m)
+                for m in completion
+            )
+        else:
+            text = str(completion)
+        # Only the last number counts: earlier ones are usually the operands.
+        numbers = re.findall(number_pattern, text)
+        predicted = numbers[-1] if numbers else ""
+        # Exact string match against the ground truth
+        rewards.append(1.0 if predicted == str(truth).strip() else 0.0)
+    return rewards
+# ============================================================================
+# BASELINE TEST
+# ============================================================================
+def test_base_model(model, tokenizer, n_samples=20):
+    """
+    Measure the model's accuracy on freshly generated arithmetic problems.
+    Puts the model in eval mode and, under torch.no_grad(), generates up to
+    20 new tokens per prompt without sampling. Only the newly generated
+    tokens are decoded; the last number found in them is compared as a
+    string with the expected answer. One line is printed per problem,
+    followed by a summary.
+    Args:
+        model: Causal LM exposing .eval(), .device and .generate().
+        tokenizer: Tokenizer matching `model`.
+        n_samples: Number of problems to generate and evaluate.
+    Returns:
+        Accuracy as a percentage (0-100); 0.0 when no problems were generated.
+    """
+    print("\n" + "="*70)
+    print("📊 TESTING BASE MODEL PERFORMANCE")
+    print("="*70)
+    test_samples = generate_arithmetic_samples(n_samples)
+    total = len(test_samples)
+    # The dot is only consumed when digits follow it, so "42." parses as "42".
+    number_pattern = r'-?\d+(?:\.\d+)?'
+    correct = 0
+    model.eval()
+    with torch.no_grad():
+        for i, sample in enumerate(test_samples):
+            inputs = tokenizer(sample['prompt'], return_tensors='pt').to(model.device)
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=20,
+                do_sample=False,
+                temperature=1.0
+            )
+            # Slice off the prompt tokens so only the continuation is decoded.
+            response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
+            # Extract answer
+            numbers = re.findall(number_pattern, response)
+            predicted = numbers[-1] if numbers else ""
+            truth = sample['answer'].strip()
+            is_correct = predicted == truth
+            if is_correct:
+                correct += 1
+            status = "✅" if is_correct else "❌"
+            # Last three tokens before "= ?" are "a op b": show the whole expression.
+            expression = " ".join(sample['prompt'].split('= ?')[0].split()[-3:])
+            print(f"[{i+1}] {status} {expression} = {truth} | Predicted: {predicted} | Response: {response[:50]}...")
+    # Guard against an empty evaluation set instead of dividing by zero.
+    accuracy = correct / total * 100 if total else 0.0
+    print(f"\n📊 Base Model Accuracy: {accuracy:.1f}% ({correct}/{total})")
+    if accuracy > 90:
+        print("⚠️  WARNING: Base model already performs well! Task may be too easy.")
+    elif accuracy < 50:
+        print("✅ Good! Base model performs poorly. Room for improvement!")
+    print("="*70 + "\n")
+    return accuracy
+# ============================================================================
+# MAIN TRAINING
+# ============================================================================
+def main():
+    """
+    Run the whole pipeline: baseline evaluation, GRPO training, Hub upload.
+    Steps, in order: print the run configuration, load the tokenizer and model
+    for BASE_MODEL, measure baseline accuracy with test_base_model, build a
+    Dataset of NUM_SAMPLES generated problems, train with GRPOTrainer for
+    MAX_STEPS steps using reward_func, then push the trained model and the
+    tokenizer to OUTPUT_MODEL.
+    """
+    print("="*70)
+    print("🔢 GRPO + RLVR Arithmetic Training")
+    print("="*70)
+    print(f"Base Model: {BASE_MODEL}")
+    print(f"Output: {OUTPUT_MODEL}")
+    print(f"Steps: {MAX_STEPS}")
+    print(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
+    print("="*70 + "\n")
+    # Load model and tokenizer
+    print("📦 Loading model and tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+    # NOTE(review): weights are loaded as float16 on CUDA while the trainer
+    # below enables bf16 when supported — confirm this mix is intended.
+    model = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto" if torch.cuda.is_available() else None
+    )
+    # Test base model first
+    baseline_accuracy = test_base_model(model, tokenizer, n_samples=EVAL_SAMPLES)
+    # Generate training data
+    print("📊 Generating training data...")
+    train_samples = generate_arithmetic_samples(NUM_SAMPLES)
+    train_dataset = Dataset.from_list(train_samples)
+    print(f"✅ {len(train_dataset)} training samples\n")
+    # GRPO Config
+    # NOTE(review): optim="adamw_8bit" presumably needs an 8-bit optimizer
+    # backend installed in the runtime — verify against the environment.
+    training_args = GRPOConfig(
+        output_dir="./outputs",
+        max_steps=MAX_STEPS,
+        per_device_train_batch_size=4,
+        num_generations=4,
+        learning_rate=2e-4,
+        beta=0.0,  # No KL penalty for this task
+        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
+        fp16=False,
+        gradient_checkpointing=True,
+        optim="adamw_8bit",
+        logging_steps=1,
+        save_steps=MAX_STEPS,  # Save at end
+        push_to_hub=False,  # We'll push manually
+        report_to="none",
+    )
+    print("🚀 Starting GRPO Training...")
+    print(f"Baseline accuracy: {baseline_accuracy:.1f}%\n")
+    # Train
+    # The constructor keyword is `reward_funcs` (plural), even for a single
+    # callable; any other spelling fails with TypeError before training starts.
+    trainer = GRPOTrainer(
+        model=model,
+        reward_funcs=reward_func,
+        args=training_args,
+        train_dataset=train_dataset,
+    )
+    trainer.train()
+    print("\n✅ Training complete!")
+    # Save to Hub
+    print(f"\n📦 Pushing to Hub: {OUTPUT_MODEL}")
+    trainer.model.push_to_hub(OUTPUT_MODEL)
+    tokenizer.push_to_hub(OUTPUT_MODEL)
+    print(f"✅ Model pushed to: https://huggingface.co/{OUTPUT_MODEL}")
+    print("="*70)
+# Run the pipeline only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()