| """ | |
| Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU) | |
| Instructions for Colab: | |
| 1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth) | |
| 2. Run the following installation commands in your first cell: | |
| !pip install unsloth "trl<0.9.0" | |
| !pip install openenv-core pydantic httpx | |
| !git clone <YOUR-GITHUB-REPO-URL> | |
| !cd AutoMathReasoner && pip install -e . | |
| 3. Run the following Python script in the next cell. | |
| """ | |
import collections
import random

import unsloth  # Must be imported before trl/transformers/peft for patching.

from datasets import Dataset
import torch
import numpy as np

# Unsloth & TRL
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer

# AutoMathReasoner OpenEnv Client
import sys

sys.path.append("./AutoMathReasoner")
from AutoMathReasoner.client import AutomathreasonerEnv
from AutoMathReasoner.env.models import AutomathreasonerAction
# 1. Configuration
# Replace with your actual Hugging Face Space URL!
HF_SPACE_URL = "https://your-username-automathreasoner.hf.space"
env = AutomathreasonerEnv(url=HF_SPACE_URL)

max_seq_length = 1024  # Fits well within the Colab T4's 16 GB VRAM limit
lora_rank = 16

# T4 (and many non-Ampere GPUs) do not support bf16; pick precision dynamically.
has_cuda = torch.cuda.is_available()
use_bf16 = has_cuda and torch.cuda.is_bf16_supported()
use_fp16 = has_cuda and not use_bf16
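# Optional sanity check (not in the original script): print the detected precision
# so you can confirm a T4 falls back to fp16 before training starts.
print(f"CUDA available: {has_cuda} | bf16: {use_bf16} | fp16: {use_fp16}")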
# 2. Load Model via Unsloth (optimized for free Colab VRAM)
print("Loading model via Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",  # Pre-quantized 4-bit for fast download
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)
# Enable LoRA fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth",  # Crucial for fitting into a T4
)
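# Optional check (assumes the returned model exposes PEFT's print_trainable_parameters,
# hence the hasattr guard): LoRA should leave only a small fraction of the 8B weights trainable.
if hasattr(model, "print_trainable_parameters"):
    model.print_trainable_parameters()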
# 3. Prepare Prompts from the Remote Environment
print("Gathering initial prompts from HF Space environment...")
initial_prompts = []
for _ in range(50):  # Increased from 30 for better coverage
    # Each reset fires an HTTP request to your Hugging Face Space
    obs = env.reset()
    initial_prompts.append({"prompt": obs.problem_text})

# Deduplicate while preserving order
seen = set()
unique_prompts = []
for p in initial_prompts:
    if p["prompt"] not in seen:
        seen.add(p["prompt"])
        unique_prompts.append(p)

print(f"  Generated {len(unique_prompts)} unique training prompts")
dataset = Dataset.from_list(unique_prompts)
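# Optional sanity check (not part of the original script): peek at one prompt to
# confirm the Space returned real problem text before spending GPU time on training.
print("Sample prompt:", dataset[0]["prompt"][:200])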
# 4. Define Reward Function for TRL
# Track stats for logging
reward_stats = {"total_calls": 0, "total_correct": 0, "total_reward": 0.0}


def compute_rewards(prompts, completions, **kwargs):
    """
    Interfaces with the OpenEnv running on Hugging Face Spaces.
    Extracts each generation, sends it via HTTP to the env, and returns the dense reward.

    Improvements over v1:
    1. Better answer parsing with multiple delimiter support
    2. Confidence-weighted self-consistency bonus
    3. Format compliance awareness
    4. Progress logging
    """
    rewards = []
    parsed_actions = []
    prompt_answers = collections.defaultdict(list)

    # Parse all completions
    for prompt, completion in zip(prompts, completions):
        try:
            if "Answer:" in completion:
                parts = completion.split("Answer:")
                reasoning = parts[0].strip()
                answer = parts[1].strip() if len(parts) > 1 else ""
            elif "answer:" in completion.lower():
                idx = completion.lower().index("answer:")
                reasoning = completion[:idx].strip()
                answer = completion[idx + 7:].strip()  # len("answer:") == 7
            else:
                # No explicit delimiter: treat the last line as the answer
                lines = completion.strip().split('\n')
                if len(lines) > 1:
                    reasoning = '\n'.join(lines[:-1]).strip()
                    answer = lines[-1].strip()
                else:
                    reasoning = completion
                    answer = ""
        except Exception:
            reasoning = completion
            answer = ""
        parsed_actions.append((prompt, completion, reasoning, answer))
        prompt_answers[prompt].append(answer)

    # Majority voting with confidence
    majority_answers = {}
    majority_confidence = {}
    for p, ans_list in prompt_answers.items():
        if ans_list:
            counter = collections.Counter(ans_list)
            most_common = counter.most_common(1)[0]
            majority_answers[p] = most_common[0]
            majority_confidence[p] = most_common[1] / len(ans_list)

    for p, c, r, a in parsed_actions:
        action = AutomathreasonerAction(reasoning=r, final_answer=a)
        # Reset and step through the HTTP API.
        # Note: reset() draws a fresh problem from the Space, so the problem being
        # scored is not guaranteed to be the one that produced this prompt.
        obs = env.reset()
        step_obs = env.step(action)
        r_total = step_obs.reward

        # Confidence-weighted self-consistency bonus
        majority = majority_answers.get(p, "")
        confidence = majority_confidence.get(p, 0.0)
        if (a == majority) and len(a) > 0 and confidence > 0.3:
            r_total += 0.05 + 0.10 * confidence

        r_total = max(-1.0, min(1.5, r_total))
        rewards.append(r_total)

        # Stats
        reward_stats["total_calls"] += 1
        is_correct = step_obs.metadata.get('is_correct', False) if hasattr(step_obs, 'metadata') else False
        reward_stats["total_correct"] += 1 if is_correct else 0
        reward_stats["total_reward"] += r_total

    # Log roughly every 30 reward calls
    if reward_stats["total_calls"] % 30 < len(prompts):
        n = reward_stats["total_calls"]
        avg_r = reward_stats["total_reward"] / max(1, n)
        acc = reward_stats["total_correct"] / max(1, n)
        print(f"  Colab Step {n}: AvgReward={avg_r:.3f}, Accuracy={acc:.2%}")

    return rewards
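# Optional smoke test (not in the original script): exercise the reward function on two
# dummy completions for the first prompt. It fires a couple of HTTP requests to the Space,
# so it is left commented out; uncomment to verify parsing and rewards before training.
# print(compute_rewards(prompts=[dataset[0]["prompt"]] * 2,
#                       completions=["Step 1: add the terms. Answer: 42", "I am not sure."]))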
# 5. Execute Training (T4-optimized parameters)
training_args = GRPOConfig(
    output_dir="colab_outputs",
    # Learning rate: matched to the dense reward signal
    learning_rate=5e-6,
    # Batch size: T4 memory-safe
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Sequence lengths: room for math reasoning + hints
    max_prompt_length=192,       # Was 128
    max_completion_length=384,   # Was 256
    # GRPO group size K=8 (increased from 4, still T4-safe)
    num_generations=8,
    # Training duration
    max_steps=200,               # Was 150
    # Logging
    logging_steps=5,
    # Warmup
    warmup_ratio=0.08,
    # 8-bit optimizer saves VRAM
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=use_fp16,
    use_cpu=not has_cuda,
)
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[compute_rewards],
    args=training_args,
    train_dataset=dataset,
)
| print("π Starting GRPO Training in Colab using Remote HF Environment...") | |
| print(f" Config: lr={training_args.learning_rate}, " | |
| f"generations={training_args.num_generations}, " | |
| f"max_steps={training_args.max_steps}") | |
| # Will show wandb/tensorboard logging so you can prove "it is actually learning" | |
| trainer.train() | |
| # Print final summary | |
| n = reward_stats["total_calls"] | |
| if n > 0: | |
| print(f"\nπ Final Colab Training Summary:") | |
| print(f" Total reward calls: {n}") | |
| print(f" Overall accuracy: {reward_stats['total_correct'] / n:.2%}") | |
| print(f" Average reward: {reward_stats['total_reward'] / n:.4f}") | |
# 6. Push to Hugging Face
# Optional: save locally or push to the Hub once training completes
# model.push_to_hub("your-name/AutoMathReasoner-Trained")
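# A minimal sketch for saving the trained LoRA adapters locally (standard PEFT/transformers
# save_pretrained calls; the directory name is just an example). Pushing to the Hub
# additionally requires huggingface_hub.login() with a write token.
model.save_pretrained("automathreasoner_lora")
tokenizer.save_pretrained("automathreasoner_lora")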