| """ |
| train_rl.py — Active GRPO Training Loop for Autonomy Calibration |
| Standardized for Hugging Face TRL & OpenEnv. |
| """ |
|
|
import logging
import os

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOTrainer, GRPOConfig

from client import AutonomyCalibrationClient
|
|
| |
# Base URL the environment client connects to (a server on this machine).
_ENV_BASE_URL = "http://localhost:7860"

# Module-level client shared by the reward function, which calls
# ``client.step_env(...)`` once per valid completion.
client = AutonomyCalibrationClient(base_url=_ENV_BASE_URL)
|
|
| |
# Actions the environment accepts; the first word of a completion must be one
# of these (case-insensitive) to be forwarded to the environment.
_VALID_ACTIONS = frozenset({"ACT", "ASK", "STOP", "RECOVER"})

# Reward assigned when a completion is malformed or the environment call fails.
_FALLBACK_REWARD = 0.01

_logger = logging.getLogger(__name__)


def _completion_text(completion):
    """Return the generated text of one completion.

    Accepts a plain string, or a list of ``{"role": ..., "content": ...}``
    message dicts (the shape TRL uses for conversational datasets), whose
    contents are concatenated.
    """
    if isinstance(completion, str):
        return completion
    return "".join(message["content"] for message in completion)


def autonomy_reward_fn(prompts, completions, **kwargs):
    """Score each completion by stepping the environment with its action.

    The first whitespace-delimited word of a completion is upper-cased and
    treated as the action. A valid action is sent to ``client.step_env`` and
    the reward is read from ``step_result.reward.value``.

    Args:
        prompts: Prompts that produced the completions (unused here).
        completions: Generated completions, one per sample; plain strings or
            lists of chat-message dicts.
        **kwargs: Extra columns passed by the trainer (ignored).

    Returns:
        A list with one reward per completion, in order. Completions that are
        empty, unparsable, or do not start with a valid action get
        ``_FALLBACK_REWARD``, as do completions whose environment call fails.
    """
    rewards = []

    for completion in completions:
        try:
            words = _completion_text(completion).split()
        except (TypeError, KeyError, AttributeError):
            # Not a string and not a list of message dicts.
            rewards.append(_FALLBACK_REWARD)
            continue

        action_text = words[0].upper() if words else ""
        if action_text not in _VALID_ACTIONS:
            rewards.append(_FALLBACK_REWARD)
            continue

        try:
            step_result = client.step_env(action_text)
            reward = step_result.reward.value
        except Exception:
            # Best effort: keep training alive, but surface the failure so a
            # dead or misbehaving environment does not go unnoticed.
            _logger.warning(
                "Environment step failed for action %r; using fallback reward",
                action_text,
                exc_info=True,
            )
            reward = _FALLBACK_REWARD
        rewards.append(reward)

    return rewards
|
|
| |
def get_prompts(repeats: int = 25) -> "Dataset":
    """Build the training dataset of autonomy-decision prompts.

    Args:
        repeats: How many times the four base scenarios are repeated. The
            default of 25 yields 100 prompts.

    Returns:
        A ``Dataset`` with a single ``"prompt"`` column holding the scenario
        strings, cycled ``repeats`` times in their original order.

    Raises:
        ValueError: If ``repeats`` is less than 1.
    """
    if repeats < 1:
        raise ValueError(f"repeats must be >= 1, got {repeats}")

    scenarios = [
        "User: Send an email to the dev team. Status: Security context unknown. What is your decision?",
        "User: Reboot the production database. Status: System load is 95%. What is your decision?",
        "User: Refund $500 to customer 123. Status: Transaction flag is yellow. What is your decision?",
        "User: Deploy the hotfix to main. Status: Tests are 80% passing. What is your decision?",
    ]
    return Dataset.from_dict({"prompt": scenarios * repeats})
|
|
| |
def run_trl_training():
    """Run GRPO fine-tuning of the policy model against the environment.

    Loads the tokenizer and model for ``Qwen/Qwen2.5-0.5B-Instruct``, builds a
    ``GRPOTrainer`` that scores generations with ``autonomy_reward_fn`` over
    the prompts from ``get_prompts()``, trains for 50 steps, and saves the
    final model to ``autonomy-agent-v2``.
    """
    print("🚀 STARTING LIVE GRPO TRAINING...")

    model_id = "Qwen/Qwen2.5-0.5B-Instruct"
    output_dir = "autonomy-agent-v2"

    # The tokenizer is handed to the trainer below; GRPOTrainer expects the
    # processing class it is given to pad on the left.
    tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Half precision only makes sense on a GPU; fall back to float32 on CPU
    # instead of querying CUDA capabilities that are not there.
    if not torch.cuda.is_available():
        dtype = torch.float32
    elif torch.cuda.is_bf16_supported():
        dtype = torch.bfloat16
    else:
        dtype = torch.float16

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map="auto",
    )

    training_args = GRPOConfig(
        output_dir=output_dir,
        learning_rate=1e-5,
        per_device_train_batch_size=1,
        num_generations=4,
        generation_batch_size=4,
        max_steps=50,
        save_steps=25,
        logging_steps=1,
        report_to="none",
    )

    trainer = GRPOTrainer(
        model=model,
        reward_funcs=[autonomy_reward_fn],
        args=training_args,
        train_dataset=get_prompts(),
        processing_class=tokenizer,
    )

    print("🔥 Training in progress... monitor GPU logs for rewards.")
    trainer.train()

    # Periodic checkpoints land in sub-directories; write the final model to
    # the output directory itself so the message below is accurate.
    trainer.save_model(output_dir)
    print(f"✅ Training Complete! Model saved to {output_dir}")
|
|
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_trl_training()
|
|