# 🏁 SQL Debug Env: Google Colab Training Starter
#
# 1. RUN THIS FIRST TO INSTALL — the line below is IPython cell magic, which
#    is valid only inside a notebook cell (it is a SyntaxError in a plain
#    .py file), so it is kept as a comment. Paste it into its own Colab cell:
#
#       !pip install trl transformers torch datasets httpx accelerate wandb -U

# 2. THE TRAINING SCRIPT
import os
import torch
from datasets import Dataset
from trl import GRPOConfig, GRPOTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Configuration ---
MODEL_NAME = "Qwen/Qwen2.5-Coder-0.5B-Instruct"


# --- Mock Dataset (for a quick test without the local server) ---
def make_simple_dataset():
    """Return a 10-row Dataset of one repeated SQL-repair prompt.

    Each row carries the keys GRPOTrainer expects: "prompt" (the text the
    model completes) and "task_id" (extra metadata forwarded to reward
    functions via **kwargs).
    """
    prompt = "Fix the following SQL query: SELECT * FROM userss; Provide only the fixed SQL."
    rows = [{"prompt": prompt, "task_id": "easy_syntax_fix"} for _ in range(10)]
    return Dataset.from_list(rows)


# --- Mock Reward (proves the math works on GPU) ---
def mock_reward_func(completions, **kwargs):
    """Score each completion 1.0 if it contains "SELECT" (any case), else 0.0.

    `completions` is a list of generated strings (non-conversational GRPO);
    the case-insensitive check rewards the model for writing any SQL at all.
    """
    return [1.0 if "SELECT" in content.upper() else 0.0 for content in completions]


# --- Training Loop ---
def run_colab_train():
    """Run a short (10-step) GRPO smoke test on a single Colab GPU."""
    print("🚀 Starting GRPO on Colab T4 GPU...")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Qwen tokenizers ship without a pad token; reuse EOS so batching works.
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        # BUG FIX: the T4 (Turing, sm_75) does NOT support bfloat16 — bf16
        # needs Ampere (sm_80) or newer. float16 is the correct half dtype
        # for a T4.
        torch_dtype=torch.float16,
        device_map="auto",
    )

    training_args = GRPOConfig(
        output_dir="./colab_results",
        learning_rate=1e-5,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,   # effective batch 4, divisible by num_generations
        num_generations=4,               # GRPO group size per prompt
        max_completion_length=64,
        num_train_epochs=1,
        max_steps=10,                    # smoke test only; max_steps caps epochs
        logging_steps=1,
        report_to="wandb",
    )

    trainer = GRPOTrainer(
        model=model,
        reward_funcs=[mock_reward_func],
        args=training_args,
        train_dataset=make_simple_dataset(),
        processing_class=tokenizer,
    )
    trainer.train()


if __name__ == "__main__":
    run_colab_train()