""" Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU) Instructions for Colab: 1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth) 2. Run the following installation commands in your first cell: !pip install unsloth "trl<0.9.0" !pip install openenv-core pydantic httpx !git clone !cd AutoMathReasoner && pip install -e . 3. Run the following Python script in the next cell. """ import collections import random import unsloth # Must be imported before trl/transformers/peft for patching. from datasets import Dataset import torch import numpy as np # Unsloth & TRL from unsloth import FastLanguageModel from trl import GRPOConfig, GRPOTrainer # AutoMathReasoner OpenEnv Client import sys sys.path.append("./AutoMathReasoner") from AutoMathReasoner.client import AutomathreasonerEnv from AutoMathReasoner.env.models import AutomathreasonerAction # 1. Configuration # Replace with your actual Hugging Face Space URL! HF_SPACE_URL = "https://your-username-automathreasoner.hf.space" env = AutomathreasonerEnv(url=HF_SPACE_URL) max_seq_length = 1024 # Fits well within Colab T4 16GB VRAM limit lora_rank = 16 # T4 (and many non-Ampere GPUs) do not support bf16; pick precision dynamically. has_cuda = torch.cuda.is_available() use_bf16 = has_cuda and torch.cuda.is_bf16_supported() use_fp16 = has_cuda and not use_bf16 # 2. Load Model via Unsloth (optimized for Free Colab VRAM) print("Loading model via Unsloth...") model, tokenizer = FastLanguageModel.from_pretrained( model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", # Pre-quantized 4bit for fast download max_seq_length = max_seq_length, dtype = None, load_in_4bit = True, ) # Enable LoRA fine-tuning model = FastLanguageModel.get_peft_model( model, r = lora_rank, target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_alpha = lora_rank, use_gradient_checkpointing = "unsloth", # Crucial for fitting into T4 ) # 3. Prepare Prompts from the Remote Environment print("Gathering initial prompts from HF Space environment...") initial_prompts = [] for _ in range(50): # Increased from 30 for better coverage # This fires an HTTP request to your Hugging Face Space obs = env.reset() initial_prompts.append({"prompt": obs.problem_text}) # Deduplicate seen = set() unique_prompts = [] for p in initial_prompts: if p["prompt"] not in seen: seen.add(p["prompt"]) unique_prompts.append(p) print(f" Generated {len(unique_prompts)} unique training prompts") dataset = Dataset.from_list(unique_prompts) # 4. Define Reward Function for TRL # Track stats for logging reward_stats = {"total_calls": 0, "total_correct": 0, "total_reward": 0.0} def compute_rewards(prompts, completions, **kwargs): """ Interfaces with the OpenEnv running on Hugging Face Spaces. Extracts the generation, passes it via HTTP to the env, and yields the dense reward. Improvements over v1: 1. Better answer parsing with multiple delimiter support 2. Confidence-weighted self-consistency bonus 3. Format compliance awareness 4. 
Progress logging """ rewards = [] parsed_actions = [] prompt_answers = collections.defaultdict(list) # Parse all completions for prompt, completion in zip(prompts, completions): try: if "Answer:" in completion: parts = completion.split("Answer:") reasoning = parts[0].strip() answer = parts[1].strip() if len(parts) > 1 else "" elif "answer:" in completion.lower(): idx = completion.lower().index("answer:") reasoning = completion[:idx].strip() answer = completion[idx + 7:].strip() else: lines = completion.strip().split('\n') if len(lines) > 1: reasoning = '\n'.join(lines[:-1]).strip() answer = lines[-1].strip() else: reasoning = completion answer = "" except Exception: reasoning = completion answer = "" parsed_actions.append((prompt, completion, reasoning, answer)) prompt_answers[prompt].append(answer) # Majority voting with confidence majority_answers = {} majority_confidence = {} for p, ans_list in prompt_answers.items(): if ans_list: counter = collections.Counter(ans_list) most_common = counter.most_common(1)[0] majority_answers[p] = most_common[0] majority_confidence[p] = most_common[1] / len(ans_list) for p, c, r, a in parsed_actions: action = AutomathreasonerAction(reasoning=r, final_answer=a) # Reset and step through HTTP API obs = env.reset() step_obs = env.step(action) r_total = step_obs.reward # Confidence-weighted self-consistency bonus majority = majority_answers.get(p, "") confidence = majority_confidence.get(p, 0.0) if (a == majority) and len(a) > 0 and confidence > 0.3: r_total += 0.05 + 0.10 * confidence r_total = max(-1.0, min(1.5, r_total)) rewards.append(r_total) # Stats reward_stats["total_calls"] += 1 is_correct = step_obs.metadata.get('is_correct', False) if hasattr(step_obs, 'metadata') else False reward_stats["total_correct"] += 1 if is_correct else 0 reward_stats["total_reward"] += r_total # Log every 30 calls if reward_stats["total_calls"] % 30 < len(prompts): n = reward_stats["total_calls"] avg_r = reward_stats["total_reward"] / max(1, n) acc = reward_stats["total_correct"] / max(1, n) print(f" šŸ“Š Colab Step {n}: AvgReward={avg_r:.3f}, Accuracy={acc:.2%}") return rewards # 5. Execute Training (T4-optimized parameters) training_args = GRPOConfig( output_dir="colab_outputs", # Learning rate — matched to dense reward signal learning_rate=5e-6, # Batch — T4 memory-safe per_device_train_batch_size=1, gradient_accumulation_steps=4, # Sequence lengths — room for math reasoning + hints max_prompt_length=192, # Was 128 max_completion_length=384, # Was 256 # GRPO group — K=8 (kept for T4 memory, was 4) num_generations=8, # Increased from 4, still T4-safe # Training duration max_steps=200, # Was 150 # Logging logging_steps=5, # Warmup warmup_ratio=0.08, # 8-bit optimizer saves VRAM optim="adamw_8bit", bf16=use_bf16, fp16=use_fp16, use_cpu=not has_cuda, ) trainer = GRPOTrainer( model=model, reward_funcs=[compute_rewards], args=training_args, train_dataset=dataset, ) print("šŸš€ Starting GRPO Training in Colab using Remote HF Environment...") print(f" Config: lr={training_args.learning_rate}, " f"generations={training_args.num_generations}, " f"max_steps={training_args.max_steps}") # Will show wandb/tensorboard logging so you can prove "it is actually learning" trainer.train() # Print final summary n = reward_stats["total_calls"] if n > 0: print(f"\nšŸ“ˆ Final Colab Training Summary:") print(f" Total reward calls: {n}") print(f" Overall accuracy: {reward_stats['total_correct'] / n:.2%}") print(f" Average reward: {reward_stats['total_reward'] / n:.4f}") # 6. 
# 6. Push to Hugging Face
# Optional: save the adapter locally or push it to the Hub once training finishes
# model.push_to_hub("your-name/AutoMathReasoner-Trained")
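# A minimal local-save sketch (standard save_pretrained API); this writes only
# the LoRA adapter and tokenizer, not a merged full model. The directory name
# is a placeholder.
model.save_pretrained("automathreasoner_lora")
tokenizer.save_pretrained("automathreasoner_lora")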