```yaml
eval_steps: 50
eval_strategy: steps
logging_steps: 10
lora:
  alpha: 128
  dropout: 0.1
  r: 64
  target_modules:
    - q_proj
    - k_proj
    - v_proj
    - o_proj
    - gate_proj
    - up_proj
    - down_proj
max_length: 1024
model_name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
save_steps: 50
save_strategy: steps
training:
  batch_size: 2
  bf16: true
  eval_batch_size: 2
  fp16: false
  gradient_accumulation_steps: 8
  gradient_checkpointing: false
  greater_is_better: true
  group_by_length: true
  learning_rate: 0.0002
  load_best_model_at_end: true
  lr_scheduler: cosine
  max_grad_norm: 1.0
  metric_for_best_model: eval_encoder_accuracy
  num_epochs: 2
  optim: adamw_torch
  save_total_limit: 2
  warmup_ratio: 0.1
wandb:
  enabled: true
  project: steg-finetune
```
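For reference, here is a minimal sketch of how these values might be wired into `peft` and `transformers`. The file name `config.yaml`, the `output_dir`, and the `task_type` are assumptions not stated in the config; the custom `eval_encoder_accuracy` metric must come from a `compute_metrics` function supplied by the training script.

```python
import os
import yaml
from peft import LoraConfig
from transformers import TrainingArguments

# Load the config above (file name "config.yaml" is an assumption).
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# Route the Trainer's Weights & Biases logging to the configured project.
if cfg["wandb"]["enabled"]:
    os.environ["WANDB_PROJECT"] = cfg["wandb"]["project"]

lora_cfg = LoraConfig(
    r=cfg["lora"]["r"],
    lora_alpha=cfg["lora"]["alpha"],
    lora_dropout=cfg["lora"]["dropout"],
    target_modules=cfg["lora"]["target_modules"],
    task_type="CAUSAL_LM",  # assumption: causal-LM fine-tuning
)

t = cfg["training"]
args = TrainingArguments(
    output_dir="outputs",  # assumption: not specified in the config
    per_device_train_batch_size=t["batch_size"],
    per_device_eval_batch_size=t["eval_batch_size"],
    gradient_accumulation_steps=t["gradient_accumulation_steps"],
    learning_rate=t["learning_rate"],
    num_train_epochs=t["num_epochs"],
    lr_scheduler_type=t["lr_scheduler"],
    warmup_ratio=t["warmup_ratio"],
    max_grad_norm=t["max_grad_norm"],
    optim=t["optim"],
    bf16=t["bf16"],
    fp16=t["fp16"],
    gradient_checkpointing=t["gradient_checkpointing"],
    group_by_length=t["group_by_length"],
    # transformers >= 4.41 accepts eval_strategy; older versions
    # call the same argument evaluation_strategy.
    eval_strategy=cfg["eval_strategy"],
    eval_steps=cfg["eval_steps"],
    save_strategy=cfg["save_strategy"],
    save_steps=cfg["save_steps"],
    save_total_limit=t["save_total_limit"],
    logging_steps=cfg["logging_steps"],
    load_best_model_at_end=t["load_best_model_at_end"],
    metric_for_best_model=t["metric_for_best_model"],
    greater_is_better=t["greater_is_better"],
    report_to="wandb" if cfg["wandb"]["enabled"] else "none",
)
```

Note the effective train batch size is `batch_size * gradient_accumulation_steps = 2 * 8 = 16` per device. `model_name` and `max_length` are not `TrainingArguments`; they would feed model/tokenizer loading and sequence truncation in the training script. Since `metric_for_best_model` is `eval_encoder_accuracy`, `compute_metrics` should return a key named `encoder_accuracy`, which the Trainer logs with the `eval_` prefix.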