# Default training configuration for GRPO

# Learning parameters
# NOTE: written as 1.0e-5 (not 1e-5) — YAML 1.1 parsers such as PyYAML
# require a dot in the mantissa to resolve a float; 1e-5 loads as a string.
learning_rate: 1.0e-5
weight_decay: 0.0
warmup_ratio: 0.1
lr_scheduler_type: cosine
optim: adamw_torch

# Training parameters
num_train_epochs: 1
per_device_train_batch_size: 16
gradient_accumulation_steps: 16
bf16: true
remove_unused_columns: false

# Generation parameters
temperature: 1.0
max_completion_length: 256
num_generations: 2
max_prompt_length: 4096

# Logging and saving
report_to: tensorboard
logging_steps: 1
save_strategy: steps
save_steps: 10