---
# Continued pre-training (CPT) run configuration: model loading, dataset,
# LoRA adapter, trainer hyper-parameters, and the final adapter merge.

# Run bookkeeping.
run:
  run_dir: ./runs/cpt_run_v1
  seed: 42

# Base model and how it is loaded.
model:
  repo_id: /workspace/Models/Devstral-Small-2-24B-Instruct-2512
  revision: null
  base_local_dir: base_model
  # NOTE(review): allows model-supplied code to run at load time; keep only
  # for checkpoints you trust.
  trust_remote_code: true
  tokenizer_use_fast: true
  device_map: auto
  torch_dtype: bfloat16
  # 4-bit quantisation is switched off; the bnb_4bit_* values below are
  # presumably only read when use_4bit is true — confirm against the loader.
  use_4bit: false
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: bfloat16
  attn_implementation: null

# Training corpus and how it is split and tokenised.
data:
  train_jsonl: /workspace/all_data_with_descriptions.jsonl
  # No separate eval file is given; eval_split_ratio is presumably used to
  # carve an eval set out of train_jsonl — verify against the data loader.
  eval_jsonl: null
  eval_split_ratio: 0.1
  text_field: text
  block_size: 4096
  shuffle: true
  num_proc: 4
  pack_mode: pad

# LoRA adapter settings.
peft:
  enabled: true
  r: 64
  lora_alpha: 128
  lora_dropout: 0.05
  # The literal string "none" (not a YAML null).
  bias: none
  target_modules: auto

# Trainer hyper-parameters.
train:
  num_train_epochs: 2
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 16
  # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
  # resolve the dotless form `2e-5` as a string rather than a float.
  learning_rate: 2.0e-5
  weight_decay: 0.0
  warmup_ratio: 0.1
  lr_scheduler_type: cosine
  optim: paged_adamw_8bit
  max_grad_norm: 1.0
  gradient_checkpointing: true
  logging_steps: 1
  save_strategy: steps
  save_steps: 100
  save_total_limit: 4
  evaluation_strategy: steps
  eval_steps: 50
  # NOTE(review): if these keys are forwarded to Hugging Face
  # TrainingArguments, save_steps must stay a round multiple of eval_steps
  # while load_best_model_at_end is true (100 / 50 satisfies this).
  load_best_model_at_end: true
  resume_from_checkpoint: auto

# Post-training merge of the adapter into the base weights.
merge:
  enabled: true
  # NOTE(review): the model is loaded in bfloat16 but merged as float16 —
  # confirm the dtype change is intentional.
  merged_dtype: float16
  max_shard_size: 2GB
  output_dir: ./merged_24b_cpt_lora