---
# Default training configuration for GRPO

# Learning parameters
# NOTE: written as 1.0e-5 (with a dot) — YAML 1.1 loaders such as PyYAML
# resolve bare `1e-5` as the string "1e-5", not a float.
learning_rate: 1.0e-5
weight_decay: 0.0
warmup_ratio: 0.1
lr_scheduler_type: cosine
optim: adamw_torch

# Training parameters
num_train_epochs: 1
per_device_train_batch_size: 16
# Effective batch size per device step = 16 * 16 = 256 samples.
gradient_accumulation_steps: 16
bf16: true
# Required for GRPO-style trainers that consume extra dataset columns
# (e.g. prompts) beyond the model's forward() signature.
remove_unused_columns: false

# Generation parameters
temperature: 1.0
max_completion_length: 256
num_generations: 2
max_prompt_length: 4096

# Logging and saving
report_to: tensorboard
logging_steps: 1
save_strategy: steps
save_steps: 10