"""
Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU)
Instructions for Colab:
1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth)
2. Run the following installation commands in your first cell:
!pip install unsloth "trl>=0.14.0"  # GRPOTrainer landed in TRL 0.14; older pins like "trl<0.9.0" predate it
!pip install openenv-core pydantic httpx
!git clone <YOUR-GITHUB-REPO-URL>
!cd AutoMathReasoner && pip install -e .
3. Run the following Python script in the next cell.
"""
import collections
import random
import unsloth # Must be imported before trl/transformers/peft for patching.
from datasets import Dataset
import torch
import numpy as np
# Unsloth & TRL
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
# AutoMathReasoner OpenEnv Client
import sys
sys.path.append("./AutoMathReasoner")
from AutoMathReasoner.client import AutomathreasonerEnv
from AutoMathReasoner.env.models import AutomathreasonerAction
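# Optional: pin seeds for (more) reproducible runs. A convenience addition,
# not part of the original pipeline; GPU generation is still not fully
# deterministic.
random.seed(3407)
np.random.seed(3407)
torch.manual_seed(3407)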
# 1. Configuration
# Replace with your actual Hugging Face Space URL!
HF_SPACE_URL = "https://your-username-automathreasoner.hf.space"
env = AutomathreasonerEnv(url=HF_SPACE_URL)
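# Optional connectivity check, left commented out: a sleeping Space can take
# a while to wake on the first request. Uses only the reset() call the rest
# of this script already relies on.
# probe = env.reset()
# print("Connected. Sample problem:", probe.problem_text[:80])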
max_seq_length = 1024 # Fits well within Colab T4 16GB VRAM limit
lora_rank = 16
# T4 (and many non-Ampere GPUs) do not support bf16; pick precision dynamically.
has_cuda = torch.cuda.is_available()
use_bf16 = has_cuda and torch.cuda.is_bf16_supported()
use_fp16 = has_cuda and not use_bf16
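# Surface the chosen precision so a T4 run visibly falls back to fp16.
print(f"Precision: bf16={use_bf16}, fp16={use_fp16} (cuda={has_cuda})")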
# 2. Load Model via Unsloth (optimized for Free Colab VRAM)
print("Loading model via Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", # Pre-quantized 4bit for fast download
max_seq_length = max_seq_length,
    dtype = None,  # None = auto-detect (fp16 on T4, bf16 on newer GPUs)
load_in_4bit = True,
)
# Enable LoRA fine-tuning
model = FastLanguageModel.get_peft_model(
model,
r = lora_rank,
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = lora_rank,
use_gradient_checkpointing = "unsloth", # Crucial for fitting into T4
)
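# Sanity check: with rank-16 LoRA, only a small fraction of weights should be
# trainable. PEFT models expose print_trainable_parameters(); guarded here in
# case your Unsloth version wraps the model differently.
if hasattr(model, "print_trainable_parameters"):
    model.print_trainable_parameters()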
# 3. Prepare Prompts from the Remote Environment
print("Gathering initial prompts from HF Space environment...")
initial_prompts = []
for _ in range(50): # Increased from 30 for better coverage
# This fires an HTTP request to your Hugging Face Space
obs = env.reset()
initial_prompts.append({"prompt": obs.problem_text})
# Deduplicate (env.reset() may resample the same problem)
seen = set()
unique_prompts = []
for p in initial_prompts:
if p["prompt"] not in seen:
seen.add(p["prompt"])
unique_prompts.append(p)
print(f" Generated {len(unique_prompts)} unique training prompts")
dataset = Dataset.from_list(unique_prompts)
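# Peek at one example to confirm the shape GRPOTrainer expects: a dataset
# with a "prompt" column of plain strings.
print("Sample prompt:", dataset[0]["prompt"][:120])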
# 4. Define Reward Function for TRL
# Track stats for logging
reward_stats = {"total_calls": 0, "total_correct": 0, "total_reward": 0.0}
def compute_rewards(prompts, completions, **kwargs):
"""
Interfaces with the OpenEnv running on Hugging Face Spaces.
Extracts the generation, passes it via HTTP to the env, and yields the dense reward.
Improvements over v1:
1. Better answer parsing with multiple delimiter support
2. Confidence-weighted self-consistency bonus
3. Format compliance awareness
4. Progress logging
"""
rewards = []
parsed_actions = []
prompt_answers = collections.defaultdict(list)
# Parse all completions
for prompt, completion in zip(prompts, completions):
try:
if "Answer:" in completion:
parts = completion.split("Answer:")
reasoning = parts[0].strip()
answer = parts[1].strip() if len(parts) > 1 else ""
elif "answer:" in completion.lower():
idx = completion.lower().index("answer:")
reasoning = completion[:idx].strip()
answer = completion[idx + 7:].strip()
else:
lines = completion.strip().split('\n')
if len(lines) > 1:
reasoning = '\n'.join(lines[:-1]).strip()
answer = lines[-1].strip()
else:
reasoning = completion
answer = ""
except Exception:
reasoning = completion
answer = ""
parsed_actions.append((prompt, completion, reasoning, answer))
prompt_answers[prompt].append(answer)
# Majority voting with confidence
majority_answers = {}
majority_confidence = {}
for p, ans_list in prompt_answers.items():
if ans_list:
counter = collections.Counter(ans_list)
most_common = counter.most_common(1)[0]
majority_answers[p] = most_common[0]
majority_confidence[p] = most_common[1] / len(ans_list)
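    # e.g. answers ["4", "4", "5", "4"] -> majority "4" with confidence 0.75,
    # which gates and scales the self-consistency bonus below.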
for p, c, r, a in parsed_actions:
action = AutomathreasonerAction(reasoning=r, final_answer=a)
        # Reset and step through the HTTP API. Caveat: reset() samples a fresh
        # problem server-side, so this assumes the Space scores the submitted
        # answer against state it tracks itself; if it grades against the
        # newly reset problem instead, rewards will be mismatched.
        env.reset()
        step_obs = env.step(action)
r_total = step_obs.reward
# Confidence-weighted self-consistency bonus
majority = majority_answers.get(p, "")
confidence = majority_confidence.get(p, 0.0)
if (a == majority) and len(a) > 0 and confidence > 0.3:
r_total += 0.05 + 0.10 * confidence
r_total = max(-1.0, min(1.5, r_total))
rewards.append(r_total)
# Stats
reward_stats["total_calls"] += 1
is_correct = step_obs.metadata.get('is_correct', False) if hasattr(step_obs, 'metadata') else False
reward_stats["total_correct"] += 1 if is_correct else 0
reward_stats["total_reward"] += r_total
    # Log roughly every 30 reward calls: after a batch of len(prompts)
    # completions, this modulo test fires when a multiple of 30 was crossed.
    if reward_stats["total_calls"] % 30 < len(prompts):
n = reward_stats["total_calls"]
avg_r = reward_stats["total_reward"] / max(1, n)
acc = reward_stats["total_correct"] / max(1, n)
print(f" π Colab Step {n}: AvgReward={avg_r:.3f}, Accuracy={acc:.2%}")
return rewards
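# Shape contract, for reference: given B completions (num_generations per
# unique prompt), compute_rewards returns B floats, one per completion, e.g.
#   compute_rewards(prompts=["What is 2+2?"] * 2,
#                   completions=["... Answer: 4", "... Answer: 5"])
# would return two rewards. Illustrative only: calling it fires real HTTP
# requests against the Space.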
# 5. Execute Training (T4-optimized parameters)
training_args = GRPOConfig(
output_dir="colab_outputs",
    # Learning rate: matched to the dense reward signal
learning_rate=5e-6,
    # Batch: T4 memory-safe
per_device_train_batch_size=1,
gradient_accumulation_steps=4,
    # Sequence lengths: room for math reasoning + hints
max_prompt_length=192, # Was 128
max_completion_length=384, # Was 256
    # GRPO group size: K=8 (increased from 4; still T4-safe)
    num_generations=8,
# Training duration
max_steps=200, # Was 150
# Logging
logging_steps=5,
# Warmup
warmup_ratio=0.08,
# 8-bit optimizer saves VRAM
optim="adamw_8bit",
bf16=use_bf16,
fp16=use_fp16,
use_cpu=not has_cuda,
)
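# Note: some TRL releases require the effective batch
# (num_processes x per_device_train_batch_size x gradient_accumulation_steps)
# to be divisible by num_generations. If GRPOTrainer raises a ValueError on
# construction, raise gradient_accumulation_steps to 8 or drop
# num_generations to 4.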
trainer = GRPOTrainer(
model=model,
reward_funcs=[compute_rewards],
args=training_args,
train_dataset=dataset,
)
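# Optional: report GPU memory before training to confirm T4 headroom.
if has_cuda:
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")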
print("π Starting GRPO Training in Colab using Remote HF Environment...")
print(f" Config: lr={training_args.learning_rate}, "
f"generations={training_args.num_generations}, "
f"max_steps={training_args.max_steps}")
# Loss and reward logs print every logging_steps steps, so you can verify it is actually learning.
trainer.train()
# Print final summary
n = reward_stats["total_calls"]
if n > 0:
print(f"\nπ Final Colab Training Summary:")
print(f" Total reward calls: {n}")
print(f" Overall accuracy: {reward_stats['total_correct'] / n:.2%}")
print(f" Average reward: {reward_stats['total_reward'] / n:.4f}")
# 6. Push to Hugging Face
# Optional: save locally or push to Hub after it learns
# model.push_to_hub("your-name/AutoMathReasoner-Trained")
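# A minimal sketch of local saving (adapter weights only, assuming default
# Unsloth/PEFT save behavior; paths are placeholders):
# model.save_pretrained("automathreasoner_lora")
# tokenizer.save_pretrained("automathreasoner_lora")
# Pushing to the Hub (line above) additionally assumes you are logged in via
# huggingface_hub and own the target repo.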