| """ | |
| Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU) | |
| Instructions for Colab: | |
| 1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth) | |
| 2. Run the following installation commands in your first cell: | |
| !pip install unsloth "trl<0.9.0" | |
| !pip install openenv-core pydantic httpx | |
| !git clone <YOUR-GITHUB-REPO-URL> | |
| !cd AutoMathReasoner && pip install -e . | |
| 3. Run the following Python script in the next cell. | |
| """ | |
import collections
import random

import unsloth  # Must be imported before trl/transformers/peft for patching.

from datasets import Dataset
import torch
import numpy as np

# Unsloth & TRL
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer

# AutoMathReasoner OpenEnv Client
import sys

sys.path.append("./AutoMathReasoner")
from AutoMathReasoner.client import AutomathreasonerEnv
from AutoMathReasoner.env.models import AutomathreasonerAction
# 1. Configuration
# Replace with your actual Hugging Face Space URL!
HF_SPACE_URL = "https://your-username-automathreasoner.hf.space"
env = AutomathreasonerEnv(url=HF_SPACE_URL)

max_seq_length = 1024  # Fits well within the Colab T4's 16 GB VRAM limit
lora_rank = 16

# T4 (and many non-Ampere GPUs) do not support bf16; pick precision dynamically.
has_cuda = torch.cuda.is_available()
use_bf16 = has_cuda and torch.cuda.is_bf16_supported()
use_fp16 = has_cuda and not use_bf16
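# Optional sanity check (not in the original script): print the detected precision
# so you can confirm a T4 falls back to fp16 before training starts.
print(f"CUDA available: {has_cuda} | bf16: {use_bf16} | fp16: {use_fp16}")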
# 2. Load Model via Unsloth (optimized for free Colab VRAM)
print("Loading model via Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",  # Pre-quantized 4-bit for fast download
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)
# Enable LoRA fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth",  # Crucial for fitting into a T4
)
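# Optional check (assumes the returned model exposes PEFT's print_trainable_parameters,
# hence the hasattr guard): LoRA should leave only a small fraction of the 8B weights trainable.
if hasattr(model, "print_trainable_parameters"):
    model.print_trainable_parameters()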
# 3. Prepare Prompts from the Remote Environment
print("Gathering initial prompts from HF Space environment...")
initial_prompts = []
for _ in range(50):  # Increased from 30 for better coverage
    # Each reset fires an HTTP request to your Hugging Face Space
    obs = env.reset()
    initial_prompts.append({"prompt": obs.problem_text})

# Deduplicate while preserving order
seen = set()
unique_prompts = []
for p in initial_prompts:
    if p["prompt"] not in seen:
        seen.add(p["prompt"])
        unique_prompts.append(p)

print(f"  Generated {len(unique_prompts)} unique training prompts")
dataset = Dataset.from_list(unique_prompts)
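# Optional sanity check (not part of the original script): peek at one prompt to
# confirm the Space returned real problem text before spending GPU time on training.
print("Sample prompt:", dataset[0]["prompt"][:200])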
# 4. Define Reward Function for TRL
# Track stats for logging
reward_stats = {"total_calls": 0, "total_correct": 0, "total_reward": 0.0}


def compute_rewards(prompts, completions, **kwargs):
    """
    Interfaces with the OpenEnv running on Hugging Face Spaces.
    Extracts each generation, sends it via HTTP to the env, and returns the dense reward.

    Improvements over v1:
    1. Better answer parsing with multiple delimiter support
    2. Confidence-weighted self-consistency bonus
    3. Format compliance awareness
    4. Progress logging
    """
    rewards = []
    parsed_actions = []
    prompt_answers = collections.defaultdict(list)

    # Parse all completions
    for prompt, completion in zip(prompts, completions):
        try:
            if "Answer:" in completion:
                parts = completion.split("Answer:")
                reasoning = parts[0].strip()
                answer = parts[1].strip() if len(parts) > 1 else ""
            elif "answer:" in completion.lower():
                idx = completion.lower().index("answer:")
                reasoning = completion[:idx].strip()
                answer = completion[idx + 7:].strip()  # len("answer:") == 7
            else:
                # No explicit delimiter: treat the last line as the answer
                lines = completion.strip().split('\n')
                if len(lines) > 1:
                    reasoning = '\n'.join(lines[:-1]).strip()
                    answer = lines[-1].strip()
                else:
                    reasoning = completion
                    answer = ""
        except Exception:
            reasoning = completion
            answer = ""
        parsed_actions.append((prompt, completion, reasoning, answer))
        prompt_answers[prompt].append(answer)

    # Majority voting with confidence
    majority_answers = {}
    majority_confidence = {}
    for p, ans_list in prompt_answers.items():
        if ans_list:
            counter = collections.Counter(ans_list)
            most_common = counter.most_common(1)[0]
            majority_answers[p] = most_common[0]
            majority_confidence[p] = most_common[1] / len(ans_list)

    for p, c, r, a in parsed_actions:
        action = AutomathreasonerAction(reasoning=r, final_answer=a)
        # Reset and step through the HTTP API.
        # Note: reset() draws a fresh problem from the Space, so the problem being
        # scored is not guaranteed to be the one that produced this prompt.
        obs = env.reset()
        step_obs = env.step(action)
        r_total = step_obs.reward

        # Confidence-weighted self-consistency bonus
        majority = majority_answers.get(p, "")
        confidence = majority_confidence.get(p, 0.0)
        if (a == majority) and len(a) > 0 and confidence > 0.3:
            r_total += 0.05 + 0.10 * confidence

        r_total = max(-1.0, min(1.5, r_total))
        rewards.append(r_total)

        # Stats
        reward_stats["total_calls"] += 1
        is_correct = step_obs.metadata.get('is_correct', False) if hasattr(step_obs, 'metadata') else False
        reward_stats["total_correct"] += 1 if is_correct else 0
        reward_stats["total_reward"] += r_total

    # Log roughly every 30 reward calls
    if reward_stats["total_calls"] % 30 < len(prompts):
        n = reward_stats["total_calls"]
        avg_r = reward_stats["total_reward"] / max(1, n)
        acc = reward_stats["total_correct"] / max(1, n)
        print(f"  Colab Step {n}: AvgReward={avg_r:.3f}, Accuracy={acc:.2%}")

    return rewards
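# Optional smoke test (not in the original script): exercise the reward function on two
# dummy completions for the first prompt. It fires a couple of HTTP requests to the Space,
# so it is left commented out; uncomment to verify parsing and rewards before training.
# print(compute_rewards(prompts=[dataset[0]["prompt"]] * 2,
#                       completions=["Step 1: add the terms. Answer: 42", "I am not sure."]))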
# 5. Execute Training (T4-optimized parameters)
training_args = GRPOConfig(
    output_dir="colab_outputs",
    # Learning rate: matched to the dense reward signal
    learning_rate=5e-6,
    # Batch size: T4 memory-safe
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Sequence lengths: room for math reasoning + hints
    max_prompt_length=192,       # Was 128
    max_completion_length=384,   # Was 256
    # GRPO group size K=8 (increased from 4, still T4-safe)
    num_generations=8,
    # Training duration
    max_steps=200,               # Was 150
    # Logging
    logging_steps=5,
    # Warmup
    warmup_ratio=0.08,
    # 8-bit optimizer saves VRAM
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=use_fp16,
    use_cpu=not has_cuda,
)
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[compute_rewards],
    args=training_args,
    train_dataset=dataset,
)
| print("π Starting GRPO Training in Colab using Remote HF Environment...") | |
| print(f" Config: lr={training_args.learning_rate}, " | |
| f"generations={training_args.num_generations}, " | |
| f"max_steps={training_args.max_steps}") | |
| # Will show wandb/tensorboard logging so you can prove "it is actually learning" | |
| trainer.train() | |
| # Print final summary | |
| n = reward_stats["total_calls"] | |
| if n > 0: | |
| print(f"\nπ Final Colab Training Summary:") | |
| print(f" Total reward calls: {n}") | |
| print(f" Overall accuracy: {reward_stats['total_correct'] / n:.2%}") | |
| print(f" Average reward: {reward_stats['total_reward'] / n:.4f}") | |
# 6. Push to Hugging Face
# Optional: save locally or push to the Hub once training completes
# model.push_to_hub("your-name/AutoMathReasoner-Trained")
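# A minimal sketch for saving the trained LoRA adapters locally (standard PEFT/transformers
# save_pretrained calls; the directory name is just an example). Pushing to the Hub
# additionally requires huggingface_hub.login() with a write token.
model.save_pretrained("automathreasoner_lora")
tokenizer.save_pretrained("automathreasoner_lora")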