---
# Default training configuration for SFT

# Learning parameters
# NOTE: written as 2.0e-5, not 2e-5 — YAML 1.1 loaders (e.g. PyYAML) require a
# decimal point in the mantissa to resolve a float; bare "2e-5" loads as a string.
learning_rate: 2.0e-5
weight_decay: 0.01
warmup_ratio: 0.1
lr_scheduler_type: linear
optim: adamw_torch

# Training parameters
num_train_epochs: 5
per_device_train_batch_size: 8
gradient_accumulation_steps: 8
bf16: true
remove_unused_columns: false

# SFT-specific parameters
max_length: 4096

# Logging and saving
report_to: tensorboard
logging_steps: 1
save_strategy: steps
save_steps: 10