File size: 2,071 Bytes
bc20ef9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# 🏁 SQL Debug Env: Google Colab Training Starter
# 1. RUN THIS FIRST TO INSTALL
!pip install trl transformers torch datasets httpx accelerate wandb -U

# 2. THE TRAINING SCRIPT
import os
import torch
from datasets import Dataset
from trl import GRPOConfig, GRPOTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Configuration ---
MODEL_NAME = "Qwen/Qwen2.5-Coder-0.5B-Instruct"

# --- Mock Dataset (For quick test without the local server) ---
def make_simple_dataset():
    """Return a tiny 10-row Dataset repeating one SQL-fix prompt.

    Every row carries the same prompt and task_id so the trainer can run
    without the local task server being available.
    """
    prompt = "Fix the following SQL query: SELECT * FROM userss; Provide only the fixed SQL."
    rows = [{"prompt": prompt, "task_id": "easy_syntax_fix"} for _ in range(10)]
    return Dataset.from_list(rows)

# --- Mock Reward (Proves the math works on GPU) ---
def mock_reward_func(completions, **kwargs):
    """Score each completion 1.0 if it contains 'SELECT' (case-insensitive), else 0.0.

    A deliberately trivial reward: it only checks that the model produced
    something that looks like SQL, which is enough to verify the GRPO
    reward plumbing end-to-end.
    """
    return [1.0 if "SELECT" in text.upper() else 0.0 for text in completions]

# --- Training Loop ---
def run_colab_train():
    """Run a short GRPO smoke-test training loop on a Colab GPU.

    Loads the model and tokenizer, wires up the mock dataset and mock
    reward, and runs 10 optimizer steps to prove the end-to-end GRPO
    pipeline works before pointing it at the real task server.
    """
    print("🚀 Starting GRPO on Colab T4 GPU...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Qwen tokenizers ship without a pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

    # FIX: the original hard-coded bfloat16 with a comment claiming the T4
    # supports it — it does not (bf16 needs compute capability >= 8.0, the
    # T4 is 7.5). Pick bf16 only when the GPU really supports it, otherwise
    # fall back to fp16, which the T4 handles natively.
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        dtype = torch.bfloat16
    else:
        dtype = torch.float16

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=dtype,
        device_map="auto"
    )

    training_args = GRPOConfig(
        output_dir="./colab_results",
        learning_rate=1e-5,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_generations=4,            # GRPO group size: completions sampled per prompt
        max_completion_length=64,
        num_train_epochs=1,
        max_steps=10,                 # NOTE: max_steps takes precedence over num_train_epochs
        logging_steps=1,
        report_to="wandb"
    )

    trainer = GRPOTrainer(
        model=model,
        reward_funcs=[mock_reward_func],
        args=training_args,
        train_dataset=make_simple_dataset(),
        processing_class=tokenizer,
    )

    trainer.train()

# Entry point: kick off the smoke-test training run when executed directly
# (has no effect when this file is imported as a module).
if __name__ == "__main__":
    run_colab_train()