"""
Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU)
Instructions for Colab:
1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth)
2. Run the following installation commands in your first cell:
!pip install unsloth trl  # GRPOTrainer requires a recent TRL release
!pip install openenv-core pydantic httpx
!git clone <YOUR-GITHUB-REPO-URL>
!cd AutoMathReasoner && pip install -e .
3. Run the following Python script in the next cell.
"""
import collections
import random
import unsloth # Must be imported before trl/transformers/peft for patching.
from datasets import Dataset
import torch
import numpy as np
# Unsloth & TRL
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
# AutoMathReasoner OpenEnv Client
import sys
sys.path.append("./AutoMathReasoner")
from AutoMathReasoner.client import AutomathreasonerEnv
from AutoMathReasoner.env.models import AutomathreasonerAction
# 1. Configuration
# Replace with your actual Hugging Face Space URL!
HF_SPACE_URL = "https://your-username-automathreasoner.hf.space"
env = AutomathreasonerEnv(url=HF_SPACE_URL)
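# Optional connectivity check before a long run (a minimal sketch; it assumes
# reset() returns an observation with a `problem_text` field, exactly as the
# prompt-gathering loop below does).
try:
    _probe_obs = env.reset()
    print(f"Connected to HF Space. Sample problem: {_probe_obs.problem_text[:80]}...")
except Exception as exc:
    raise SystemExit(f"Could not reach the environment at {HF_SPACE_URL}: {exc}")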
max_seq_length = 1024 # Fits well within Colab T4 16GB VRAM limit
lora_rank = 16
# T4 (and many non-Ampere GPUs) do not support bf16; pick precision dynamically.
has_cuda = torch.cuda.is_available()
use_bf16 = has_cuda and torch.cuda.is_bf16_supported()
use_fp16 = has_cuda and not use_bf16
# 2. Load Model via Unsloth (optimized for Free Colab VRAM)
print("Loading model via Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", # Pre-quantized 4bit for fast download
max_seq_length = max_seq_length,
dtype = None,
load_in_4bit = True,
)
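# Optional: report the 4-bit base model's VRAM footprint on the T4
# (torch.cuda.memory_allocated is standard PyTorch; this is purely informational).
if has_cuda:
    print(f" GPU memory after load: {torch.cuda.memory_allocated() / 1e9:.2f} GB")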
# Enable LoRA fine-tuning
model = FastLanguageModel.get_peft_model(
model,
r = lora_rank,
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = lora_rank,
use_gradient_checkpointing = "unsloth", # Crucial for fitting into T4
)
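# Show how many parameters LoRA actually trains (a quick sanity check; assumes the
# returned object is a standard PEFT wrapper exposing print_trainable_parameters).
if hasattr(model, "print_trainable_parameters"):
    model.print_trainable_parameters()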
# 3. Prepare Prompts from the Remote Environment
print("Gathering initial prompts from HF Space environment...")
initial_prompts = []
for _ in range(50): # Increased from 30 for better coverage
# This fires an HTTP request to your Hugging Face Space
obs = env.reset()
initial_prompts.append({"prompt": obs.problem_text})
# Deduplicate
seen = set()
unique_prompts = []
for p in initial_prompts:
if p["prompt"] not in seen:
seen.add(p["prompt"])
unique_prompts.append(p)
print(f" Generated {len(unique_prompts)} unique training prompts")
dataset = Dataset.from_list(unique_prompts)
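# Peek at one example to confirm the dataset shape GRPOTrainer expects:
# a single "prompt" column of plain strings.
print(f" Example prompt: {dataset[0]['prompt'][:120]}...")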
# 4. Define Reward Function for TRL
# Track stats for logging
reward_stats = {"total_calls": 0, "total_correct": 0, "total_reward": 0.0}
def compute_rewards(prompts, completions, **kwargs):
"""
Interfaces with the OpenEnv running on Hugging Face Spaces.
    Parses each generation, sends it via HTTP to the env, and returns the dense rewards.
Improvements over v1:
1. Better answer parsing with multiple delimiter support
2. Confidence-weighted self-consistency bonus
3. Format compliance awareness
4. Progress logging
"""
rewards = []
parsed_actions = []
prompt_answers = collections.defaultdict(list)
# Parse all completions
for prompt, completion in zip(prompts, completions):
try:
if "Answer:" in completion:
parts = completion.split("Answer:")
reasoning = parts[0].strip()
answer = parts[1].strip() if len(parts) > 1 else ""
elif "answer:" in completion.lower():
idx = completion.lower().index("answer:")
reasoning = completion[:idx].strip()
answer = completion[idx + 7:].strip()
else:
lines = completion.strip().split('\n')
if len(lines) > 1:
reasoning = '\n'.join(lines[:-1]).strip()
answer = lines[-1].strip()
else:
reasoning = completion
answer = ""
except Exception:
reasoning = completion
answer = ""
parsed_actions.append((prompt, completion, reasoning, answer))
prompt_answers[prompt].append(answer)
# Majority voting with confidence
majority_answers = {}
majority_confidence = {}
for p, ans_list in prompt_answers.items():
if ans_list:
counter = collections.Counter(ans_list)
most_common = counter.most_common(1)[0]
majority_answers[p] = most_common[0]
majority_confidence[p] = most_common[1] / len(ans_list)
for p, c, r, a in parsed_actions:
action = AutomathreasonerAction(reasoning=r, final_answer=a)
# Reset and step through HTTP API
obs = env.reset()
step_obs = env.step(action)
r_total = step_obs.reward
# Confidence-weighted self-consistency bonus
majority = majority_answers.get(p, "")
confidence = majority_confidence.get(p, 0.0)
if (a == majority) and len(a) > 0 and confidence > 0.3:
r_total += 0.05 + 0.10 * confidence
r_total = max(-1.0, min(1.5, r_total))
rewards.append(r_total)
# Stats
reward_stats["total_calls"] += 1
        step_metadata = getattr(step_obs, "metadata", None) or {}
        is_correct = step_metadata.get("is_correct", False)
reward_stats["total_correct"] += 1 if is_correct else 0
reward_stats["total_reward"] += r_total
    # Log roughly every 30 reward calls (whenever this batch crosses a multiple of 30)
    if reward_stats["total_calls"] % 30 < len(prompts):
        n = reward_stats["total_calls"]
        avg_r = reward_stats["total_reward"] / max(1, n)
        acc = reward_stats["total_correct"] / max(1, n)
        print(f" 📊 Colab Step {n}: AvgReward={avg_r:.3f}, Accuracy={acc:.2%}")
return rewards
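# Illustrative completion shape the parser above expects (actual outputs will vary):
#   "Step-by-step reasoning ...\nAnswer: 42"
# Everything before "Answer:" becomes `reasoning`, everything after becomes the
# `final_answer` sent to the environment. Uncomment for a one-off smoke test
# (note: this fires a couple of HTTP calls against the Space):
# _demo_rewards = compute_rewards(
#     prompts=["What is 2 + 2?"],
#     completions=["2 + 2 = 4\nAnswer: 4"],
# )
# print(f" Smoke-test reward: {_demo_rewards[0]:.3f}")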
# 5. Execute Training (T4-optimized parameters)
training_args = GRPOConfig(
output_dir="colab_outputs",
    # Learning rate: matched to the dense reward signal
learning_rate=5e-6,
    # Batch size: T4 memory-safe
per_device_train_batch_size=1,
gradient_accumulation_steps=4,
    # Sequence lengths: room for math reasoning + hints
max_prompt_length=192, # Was 128
max_completion_length=384, # Was 256
    # GRPO group size: 8 generations per prompt
    num_generations=8,  # Increased from 4, still T4-safe
# Training duration
max_steps=200, # Was 150
# Logging
logging_steps=5,
# Warmup
warmup_ratio=0.08,
# 8-bit optimizer saves VRAM
optim="adamw_8bit",
bf16=use_bf16,
fp16=use_fp16,
use_cpu=not has_cuda,
)
trainer = GRPOTrainer(
model=model,
reward_funcs=[compute_rewards],
args=training_args,
train_dataset=dataset,
)
print("πŸš€ Starting GRPO Training in Colab using Remote HF Environment...")
print(f" Config: lr={training_args.learning_rate}, "
f"generations={training_args.num_generations}, "
f"max_steps={training_args.max_steps}")
# Training logs (loss and reward) print every logging_steps; set report_to in GRPOConfig to "wandb" or "tensorboard" for a dashboard view of the reward curve.
trainer.train()
# Print final summary
n = reward_stats["total_calls"]
if n > 0:
print(f"\nπŸ“ˆ Final Colab Training Summary:")
print(f" Total reward calls: {n}")
print(f" Overall accuracy: {reward_stats['total_correct'] / n:.2%}")
print(f" Average reward: {reward_stats['total_reward'] / n:.4f}")
# 6. Push to Hugging Face
# Optional: save locally or push to Hub after it learns
# model.push_to_hub("your-name/AutoMathReasoner-Trained")
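# A minimal export sketch (paths/repo names are placeholders): save the LoRA
# adapter locally for download from Colab, and push the tokenizer alongside the
# adapter if you use the Hub (pushing requires `huggingface-cli login` or HF_TOKEN).
# model.save_pretrained("automathreasoner_lora")
# tokenizer.save_pretrained("automathreasoner_lora")
# tokenizer.push_to_hub("your-name/AutoMathReasoner-Trained")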