model_id: Qwen/Qwen2.5-7B-Instruct
model_family: qwen2.5
LoRA:
  r: 0
  alpha: 128
  dropout: 0.05
used_policy: true
used_strategy: true
step_token: <step>,</step>
answer_token: <answer>,</answer>
loss_type: grad_ascent
tune_vision_tower: false
tune_mm_projector: true
tune_language_model: true
data_path: ./dataset/safety_reasoning_v2_4step.json
split: with_strategy_policy_4step
batch_size: 2
gradient_accumulation_steps: 8
max_grad_norm: 1.0
num_epochs: 3
max_length: 3072
save_dir: models/final_ft_${num_epochs}_epochs_lr${lr}_${model_family}_${split}
save_steps: 165
lr: 1.0e-05
weight_decay: 0.01
seed: 233
workers: 4
lr_scheduler_type: cosine
warmup_ratio: 0.0
max_train_steps: -1
report_to: wandb
resume_from_checkpoint: ''