"""
Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU)

Instructions for Colab:
1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth)
2. Run the following installation commands in your first cell:

!pip install unsloth "trl<0.9.0"
!pip install openenv-core pydantic httpx
!git clone <YOUR-GITHUB-REPO-URL>
!cd AutoMathReasoner && pip install -e .
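
(Optional) Confirm the GPU is attached (Runtime > Change runtime type > T4 GPU):

!nvidia-smi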

3. Run the following Python script in the next cell.
"""

import unsloth  # Must be imported before trl/transformers/peft for patching.

import collections

import torch
from datasets import Dataset

# Unsloth & TRL
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer

# AutoMathReasoner OpenEnv Client
import sys
sys.path.append("./AutoMathReasoner")
from AutoMathReasoner.client import AutomathreasonerEnv
from AutoMathReasoner.env.models import AutomathreasonerAction

# 1. Configuration
# Replace with your actual Hugging Face Space URL!
HF_SPACE_URL = "https://your-username-automathreasoner.hf.space"
env = AutomathreasonerEnv(url=HF_SPACE_URL)
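
# Quick connectivity check (a minimal sketch): one reset round-trip confirms
# the Space is awake before the training loop starts depending on it.
try:
    probe = env.reset()
    print(f"Space reachable. Sample problem: {probe.problem_text[:80]}")
except Exception as exc:
    raise SystemExit(f"Could not reach {HF_SPACE_URL}: {exc}")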

max_seq_length = 1024  # Fits well within Colab T4 16GB VRAM limit
lora_rank = 16

# T4 (and many non-Ampere GPUs) do not support bf16; pick precision dynamically.
has_cuda = torch.cuda.is_available()
use_bf16 = has_cuda and torch.cuda.is_bf16_supported()
use_fp16 = has_cuda and not use_bf16

# 2. Load Model via Unsloth (optimized for Free Colab VRAM)
print("Loading model via Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",  # Pre-quantized 4bit for fast download 
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

# Enable LoRA fine-tuning 
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth",  # Crucial for fitting into T4
)
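
# Sanity check via the standard PEFT helper: LoRA should leave only a small
# fraction of the 8B weights trainable.
model.print_trainable_parameters()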

# 3. Prepare Prompts from the Remote Environment
print("Gathering initial prompts from HF Space environment...")
initial_prompts = []
for _ in range(50):  # Increased from 30 for better coverage
    # This fires an HTTP request to your Hugging Face Space
    obs = env.reset()
    initial_prompts.append({"prompt": obs.problem_text})

# Deduplicate
seen = set()
unique_prompts = []
for p in initial_prompts:
    if p["prompt"] not in seen:
        seen.add(p["prompt"])
        unique_prompts.append(p)

print(f"   Generated {len(unique_prompts)} unique training prompts")
dataset = Dataset.from_list(unique_prompts)
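
# Note: GRPOTrainer reads the "prompt" column and samples num_generations
# completions per prompt, so this plain-text dataset is all it needs.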

# 4. Define Reward Function for TRL
# Track stats for logging
reward_stats = {"total_calls": 0, "total_correct": 0, "total_reward": 0.0}

def compute_rewards(prompts, completions, **kwargs):
    """
    Interfaces with the OpenEnv environment running on Hugging Face Spaces.
    Parses each generation, submits it via HTTP to the env, and returns one
    dense reward per completion.
    
    Improvements over v1:
    1. Better answer parsing with multiple delimiter support
    2. Confidence-weighted self-consistency bonus
    3. Format compliance awareness
    4. Progress logging
    """
    rewards = []
    parsed_actions = []
    prompt_answers = collections.defaultdict(list)
    
    # Parse all completions
    for prompt, completion in zip(prompts, completions):
        try:
            if "Answer:" in completion:
                parts = completion.split("Answer:")
                reasoning = parts[0].strip()
                answer = parts[1].strip() if len(parts) > 1 else ""
            elif "answer:" in completion.lower():
                idx = completion.lower().index("answer:")
                reasoning = completion[:idx].strip()
                answer = completion[idx + len("answer:"):].strip()
            else:
                lines = completion.strip().split('\n')
                if len(lines) > 1:
                    reasoning = '\n'.join(lines[:-1]).strip()
                    answer = lines[-1].strip()
                else:
                    reasoning = completion
                    answer = ""
        except Exception:
            reasoning = completion
            answer = ""
            
        parsed_actions.append((prompt, completion, reasoning, answer))
        prompt_answers[prompt].append(answer)
        
    # Majority voting with confidence
    majority_answers = {}
    majority_confidence = {}
    for p, ans_list in prompt_answers.items():
        if ans_list:
            counter = collections.Counter(ans_list)
            most_common = counter.most_common(1)[0]
            majority_answers[p] = most_common[0]
            majority_confidence[p] = most_common[1] / len(ans_list)
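
    # Worked example (illustrative): with K=8 answers for one prompt,
    # ["4","4","4","4","4","7","","4"], the majority is "4" with confidence
    # 6/8 = 0.75, so matching completions below earn 0.05 + 0.10*0.75 = 0.125
    # on top of the environment reward.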

    for p, _completion, r, a in parsed_actions:
        action = AutomathreasonerAction(reasoning=r, final_answer=a)

        # Reset and step through the HTTP API. Caveat: reset() serves a fresh
        # random problem, so this assumes the Space grades final_answer
        # independently of the problem it just served; if grading were tied to
        # that fresh problem, rewards would misalign with the training prompt.
        env.reset()
        step_obs = env.step(action)
        r_total = step_obs.reward

        # Confidence-weighted self-consistency bonus
        majority = majority_answers.get(p, "")
        confidence = majority_confidence.get(p, 0.0)
        if a and a == majority and confidence > 0.3:
            r_total += 0.05 + 0.10 * confidence

        r_total = max(-1.0, min(1.5, r_total))
        rewards.append(r_total)

        # Stats (guard against a missing or None metadata field)
        reward_stats["total_calls"] += 1
        meta = getattr(step_obs, "metadata", None) or {}
        reward_stats["total_correct"] += 1 if meta.get("is_correct", False) else 0
        reward_stats["total_reward"] += r_total
    
    # Log roughly every 30 reward calls (modulo batch-size granularity)
    if reward_stats["total_calls"] % 30 < len(prompts):
        n = reward_stats["total_calls"]
        avg_r = reward_stats["total_reward"] / max(1, n)
        acc = reward_stats["total_correct"] / max(1, n)
        print(f"  πŸ“Š Colab Step {n}: AvgReward={avg_r:.3f}, Accuracy={acc:.2%}")
            
    return rewards
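
# Parsing convention used above, illustrated (no env calls involved):
#   "2+2=4. Answer: 4"         -> reasoning="2+2=4.", answer="4"
#   "so the ANSWER: 12 apples" -> lowercase fallback, answer="12 apples"
#   "just 42"                  -> single line, no delimiter: answer=""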

# 5. Execute Training (T4-optimized parameters)
training_args = GRPOConfig(
    output_dir="colab_outputs",
    
    # Learning rate: matched to the dense reward signal
    learning_rate=5e-6,
    
    # Batch: T4 memory-safe. Recent TRL releases require the effective batch
    # (per-device batch x grad accumulation x processes) to be divisible by
    # num_generations, hence accumulation of 8 here.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    
    # Sequence lengths: room for math reasoning + hints
    max_prompt_length=192,          # Was 128
    max_completion_length=384,      # Was 256
    
    # GRPO group size: K completions per prompt
    num_generations=8,              # Increased from 4, still T4-safe
    
    # Training duration
    max_steps=200,                  # Was 150
    
    # Logging
    logging_steps=5,
    
    # Warmup
    warmup_ratio=0.08,
    
    # 8-bit optimizer saves VRAM
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=use_fp16,
    use_cpu=not has_cuda,
)
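
# Effective batch arithmetic for this config (single T4): 1 sequence/device
# x 8 accumulation steps = 8 completions per optimizer step, i.e. exactly one
# GRPO group, so each update compares K=8 samples of the same prompt against
# their group-mean reward.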

trainer = GRPOTrainer(
    model=model,
    reward_funcs=[compute_rewards],
    args=training_args,
    train_dataset=dataset,
)

print("πŸš€ Starting GRPO Training in Colab using Remote HF Environment...")
print(f"   Config: lr={training_args.learning_rate}, "
      f"generations={training_args.num_generations}, "
      f"max_steps={training_args.max_steps}")

# Loss/reward logs print every logging_steps; set report_to="wandb" in
# GRPOConfig to chart the learning curve.
trainer.train()

# Print final summary
n = reward_stats["total_calls"]
if n > 0:
    print(f"\nπŸ“ˆ Final Colab Training Summary:")
    print(f"   Total reward calls: {n}")
    print(f"   Overall accuracy: {reward_stats['total_correct'] / n:.2%}")
    print(f"   Average reward: {reward_stats['total_reward'] / n:.4f}")

# 6. Push to Hugging Face
# Optional: save locally or push to Hub after it learns
# model.push_to_hub("your-name/AutoMathReasoner-Trained")
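# A minimal saving sketch (paths are examples). save_pretrained on a PEFT
# model writes only the LoRA adapters (tens of MB), which is usually what
# you want to keep from a Colab session:
# model.save_pretrained("automath_lora")
# tokenizer.save_pretrained("automath_lora")
# Unsloth also exposes a merged-checkpoint helper, e.g.:
# model.save_pretrained_merged("automath_merged", tokenizer, save_method="merged_16bit")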