---
cache_dir: ./cache
ddp_find_unused_parameters: false
ddp_timeout: 30000
device_map: auto
do_eval: true
do_train: true
eval_steps: 1000
evaluation_strategy: steps
fp16: true
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: hllj/sft-mistral-v1-clean-valid
hub_strategy: every_save
learning_rate: 3.0e-05
log_level: info
logging_first_step: true
logging_steps: 10
logging_strategy: steps
lora_alpha: 128
lora_dropout: 0.05
lora_r: 256
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
lr_scheduler_type: cosine
max_seq_length: 1024
model_name_or_path: hllj/mistral-vi-math
model_type: auto
num_train_epochs: 2
output_dir: outputs-sft-mistral-v1-clean-valid
overwrite_output_dir: true
per_device_eval_batch_size: 4
per_device_train_batch_size: 4
preprocessing_num_workers: 4
push_to_hub: true
report_to: wandb
run_name: sft-mistral-v1-clean-valid
save_steps: 1000
save_strategy: steps
save_total_limit: 13
seed: 42
# SECURITY: a plaintext Hugging Face access token (hf_…) was committed here.
# It has been redacted — revoke the leaked token on huggingface.co, then
# supply a fresh one via the HF_TOKEN environment variable or
# `huggingface-cli login` instead of storing it in version control.
token: null
torch_dtype: float16
train_file_dir: datasets/finetune
use_peft: true
validation_file_dir: datasets/validation
warmup_ratio: 0.05
weight_decay: 0.05
|
|