# Run-level settings.
run:
  # Relative path, so it depends on the working directory the job is launched
  # from. Presumably where run outputs (checkpoints, logs) are written —
  # TODO confirm against the consuming script.
  run_dir: ./runs/cpt_run_v1
  # Presumably the random seed for shuffling/initialisation — verify which
  # libraries the consumer actually seeds.
  seed: 42
# Base model selection and loading options.
model:
  # Despite the key name, this value is an absolute local filesystem path,
  # not a hub-style "org/name" identifier.
  repo_id: /workspace/Models/Devstral-Small-2-24B-Instruct-2512
  # No revision is pinned.
  revision: null
  # Relative directory name; presumably resolved under run.run_dir — TODO confirm.
  base_local_dir: base_model
  # NOTE(review): if this is forwarded to a `trust_remote_code` loader flag,
  # code shipped alongside the checkpoint may be executed at load time —
  # confirm the model directory comes from a trusted source.
  trust_remote_code: true
  tokenizer_use_fast: true
  device_map: auto
  torch_dtype: bfloat16
  # 4-bit loading is switched off. The bnb_4bit_* keys below presumably only
  # take effect when use_4bit is true — verify against the consumer.
  use_4bit: false
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: bfloat16
  # null: no attention implementation is requested explicitly; presumably the
  # loader's default is used — TODO confirm.
  attn_implementation: null
# Dataset locations and preprocessing options.
data:
  # Absolute path to the training data (JSON Lines, judging by the extension).
  train_jsonl: /workspace/all_data_with_descriptions.jsonl
  # No separate evaluation file is configured.
  eval_jsonl: null
  # Presumably the fraction of the training data held out for evaluation when
  # eval_jsonl is null (0.1 = 10%) — TODO confirm against the consumer.
  eval_split_ratio: 0.1
  # Presumably the name of the JSON field that holds the raw text of each record.
  text_field: text
  # Presumably the sequence length in tokens — verify the unit.
  block_size: 4096
  shuffle: true
  # Presumably the number of worker processes used for preprocessing.
  num_proc: 4
  # NOTE(review): "pad" suggests each sample is padded to block_size rather
  # than packed with others; confirm the accepted values in the consumer.
  pack_mode: pad
# Parameter-efficient fine-tuning (adapter) settings.
peft:
  enabled: true
  # lora_alpha / r = 128 / 64 = 2. If these are consumed as standard LoRA
  # hyperparameters, that ratio is the adapter scaling factor — keep the two
  # values in step when changing either one.
  r: 64
  lora_alpha: 128
  lora_dropout: 0.05
  # Plain string "none", not a YAML null.
  bias: none
  # Plain string "auto"; presumably the consumer picks the target layers
  # itself — TODO confirm how the module list is derived.
  target_modules: auto
# Optimisation, checkpointing and evaluation schedule.
train:
  num_train_epochs: 2
  # With a per-device batch of 1 and 16 accumulation steps, each optimizer
  # step covers 16 sequences per device (assuming the usual meaning of
  # gradient accumulation — confirm against the consumer).
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 16
  # Written with an explicit decimal point on purpose: YAML 1.1 loaders
  # (e.g. PyYAML) resolve a bare `2e-5` as the *string* "2e-5", not a float.
  # `2.0e-5` is a float under both YAML 1.1 and YAML 1.2.
  learning_rate: 2.0e-5
  weight_decay: 0.0
  warmup_ratio: 0.1
  lr_scheduler_type: cosine
  optim: paged_adamw_8bit
  max_grad_norm: 1.0
  gradient_checkpointing: true
  # Logs on every step.
  logging_steps: 1
  save_strategy: steps
  # save_steps (100) is an exact multiple of eval_steps (50). Keep it that way
  # if these are forwarded to a Trainer-style API together with
  # load_best_model_at_end, which typically requires saves to line up with
  # evaluations — TODO confirm for the consumer in use.
  save_steps: 100
  save_total_limit: 4
  evaluation_strategy: steps
  eval_steps: 50
  load_best_model_at_end: true
  # Plain string "auto"; presumably resumes from the latest checkpoint found
  # in the run directory — verify the consumer's handling.
  resume_from_checkpoint: auto
# Post-training step that merges the adapter into the base model.
merge:
  enabled: true
  # NOTE(review): the model section of this file uses bfloat16 for both
  # torch_dtype and bnb_4bit_compute_dtype, while the merged output is
  # float16. float16 has a narrower exponent range than bfloat16, so confirm
  # this conversion is intentional.
  merged_dtype: float16
  # Presumably the maximum size of each saved weight shard.
  max_shard_size: 2GB
  # Relative path, so it depends on the working directory the job is launched
  # from; note that it sits outside run.run_dir.
  output_dir: ./merged_24b_cpt_lora