```yaml
eval_steps: 50
eval_strategy: steps
logging_steps: 10
lora:
  alpha: 128
  dropout: 0.1
  r: 64
  target_modules:
    - q_proj
    - k_proj
    - v_proj
    - o_proj
    - gate_proj
    - up_proj
    - down_proj
max_length: 1024
model_name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
save_steps: 50
save_strategy: steps
training:
  batch_size: 2
  bf16: true
  eval_batch_size: 2
  fp16: false
  gradient_accumulation_steps: 8
  gradient_checkpointing: false
  greater_is_better: true
  group_by_length: true
  learning_rate: 0.0001
  load_best_model_at_end: true
  lr_scheduler: cosine
  max_grad_norm: 1.0
  metric_for_best_model: eval_decoder_accuracy
  num_epochs: 3
  optim: adamw_torch
  save_total_limit: 2
  warmup_ratio: 0.1
wandb:
  enabled: true
  project: steg-finetune
```
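
For reference, here is a minimal sketch of how a config like this could be consumed by a PEFT + Hugging Face Transformers training script. The loader path (`config.yaml`), the `output_dir`, and the use of `AutoModelForCausalLM` are illustrative assumptions, not part of the config itself; only the hyperparameter names and values come from the YAML above. The dataset, the `Trainer` construction, and the `compute_metrics` function that produces `eval_decoder_accuracy` are omitted.

```python
import os

import yaml
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

# Hypothetical filename; the config shown above saved as a YAML file.
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# Route Trainer logging to the configured wandb project.
if cfg["wandb"]["enabled"]:
    os.environ["WANDB_PROJECT"] = cfg["wandb"]["project"]

tokenizer = AutoTokenizer.from_pretrained(cfg["model_name"])
model = AutoModelForCausalLM.from_pretrained(cfg["model_name"])

# LoRA adapters on all attention and MLP projections, per `lora.target_modules`.
lora_config = LoraConfig(
    r=cfg["lora"]["r"],
    lora_alpha=cfg["lora"]["alpha"],
    lora_dropout=cfg["lora"]["dropout"],
    target_modules=cfg["lora"]["target_modules"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

train = cfg["training"]
args = TrainingArguments(
    output_dir="outputs",  # assumption; not specified in the config
    per_device_train_batch_size=train["batch_size"],
    per_device_eval_batch_size=train["eval_batch_size"],
    gradient_accumulation_steps=train["gradient_accumulation_steps"],
    learning_rate=train["learning_rate"],
    num_train_epochs=train["num_epochs"],
    lr_scheduler_type=train["lr_scheduler"],
    warmup_ratio=train["warmup_ratio"],
    max_grad_norm=train["max_grad_norm"],
    optim=train["optim"],
    bf16=train["bf16"],
    fp16=train["fp16"],
    gradient_checkpointing=train["gradient_checkpointing"],
    group_by_length=train["group_by_length"],
    logging_steps=cfg["logging_steps"],
    # Older transformers versions call this `evaluation_strategy`.
    eval_strategy=cfg["eval_strategy"],
    eval_steps=cfg["eval_steps"],
    save_strategy=cfg["save_strategy"],
    save_steps=cfg["save_steps"],
    save_total_limit=train["save_total_limit"],
    load_best_model_at_end=train["load_best_model_at_end"],
    metric_for_best_model=train["metric_for_best_model"],
    greater_is_better=train["greater_is_better"],
    report_to=["wandb"] if cfg["wandb"]["enabled"] else [],
)
# Sequences would be tokenized with truncation at cfg["max_length"] (1024)
# before being handed to a Trainer built from `model` and `args`.
```

Note that with `batch_size: 2` and `gradient_accumulation_steps: 8`, the effective training batch size is 16 per device.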