run:
  run_dir: ./runs/cpt_run_v1
  seed: 42
model:
  repo_id: /workspace/Models/Devstral-Small-2-24B-Instruct-2512
  revision: null
  base_local_dir: base_model
  trust_remote_code: true
  tokenizer_use_fast: true
  device_map: auto
  torch_dtype: bfloat16
  use_4bit: false
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: bfloat16
  attn_implementation: null
data:
  train_jsonl: /workspace/all_data_with_descriptions.jsonl
  eval_jsonl: null
  eval_split_ratio: 0.1
  text_field: text
  block_size: 4096
  shuffle: true
  num_proc: 4
  pack_mode: pad
peft:
  enabled: true
  r: 64
  lora_alpha: 128
  lora_dropout: 0.05
  bias: none
  target_modules: auto
train:
  num_train_epochs: 2
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 16
  learning_rate: 2e-5
  weight_decay: 0.0
  warmup_ratio: 0.1
  lr_scheduler_type: cosine
  optim: paged_adamw_8bit
  max_grad_norm: 1.0
  gradient_checkpointing: true
  logging_steps: 1
  save_strategy: steps
  save_steps: 100
  save_total_limit: 4
  evaluation_strategy: steps
  eval_steps: 50
  load_best_model_at_end: true
  resume_from_checkpoint: auto
merge:
  enabled: true
  merged_dtype: float16
  max_shard_size: 2GB
  output_dir: ./merged_24b_cpt_lora