# Run-level settings. How the consumer uses them is not visible in this file.
run:
  # Relative path; presumably the directory where run artifacts are written,
  # resolved against the launch directory — confirm in the training script.
  run_dir: "./runs/cpt_run_v1"
  # Integer seed; presumably used for RNG reproducibility — confirm.
  seed: 42

# Base model selection and loading options.
model:
  # Local model path (no download)
  repo_id: "/workspace/Models/Devstral-Small-2-24B-Instruct-2512"
  # No revision is pinned; presumably only meaningful for a hub repo — confirm.
  revision: null

  # Used only when repo_id is a HF repo (not a local path)
  base_local_dir: "base_model"

  # NOTE(review): presumably allows Python code shipped with the checkpoint to
  # run at load time; keep enabled only for trusted checkpoints — confirm.
  trust_remote_code: true
  tokenizer_use_fast: true
  device_map: "auto"

  torch_dtype: "bfloat16"  # "float16" | "bfloat16" | "float32"

  # QLoRA (4-bit quantization). use_4bit is false here, so the bnb_4bit_*
  # values below are presumably ignored by the consumer — TODO confirm.
  use_4bit: false
  bnb_4bit_quant_type: "nf4"
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: "bfloat16"

  # optional: "flash_attention_2" | "sdpa" | null
  # null presumably leaves the choice to the library default — confirm.
  attn_implementation: null

# Dataset sources and preprocessing options.
data:
  train_jsonl: "/workspace/all_data_with_descriptions.jsonl"
  # No separate eval file is configured; presumably eval_split_ratio is then
  # used to hold out that fraction of train_jsonl — confirm in the loader.
  eval_jsonl: null
  eval_split_ratio: 0.1
  # Name of the JSONL field read as training text (presumably) — confirm.
  text_field: "text"
  # Presumably the packed sequence length in tokens — confirm.
  block_size: 4096
  shuffle: true
  num_proc: 4

  # Packing behavior for the remainder that does not fill a whole block:
  # "drop" = strict CPT (drop remainder)
  # "pad"  = pad remainder to block_size + loss mask (-100) + attention_mask=0
  pack_mode: "pad"

# LoRA adapter settings.
peft:
  enabled: true
  # lora_alpha is 2x r here (128 / 64); presumably alpha / r is the adapter
  # scaling factor — confirm against the PEFT version in use.
  r: 64
  lora_alpha: 128
  lora_dropout: 0.05
  bias: "none"
  # "auto" is not a list of module names; how it is resolved is decided by
  # the consuming script and is not visible here — TODO confirm.
  target_modules: "auto"

# Training loop settings. Key names match Hugging Face TrainingArguments
# fields; presumably they are forwarded to it by the consumer — confirm.
train:
  # Uncomment to cap training by step count instead of epochs.
  # max_steps: 1000
  num_train_epochs: 2

  # 1 sample per device x 16 accumulation steps; presumably 16 sequences per
  # device per optimizer step — confirm.
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 16

  # Written with an explicit decimal point on purpose: YAML 1.1 resolvers
  # (e.g. PyYAML) load "2e-5" as a string, not a float.
  learning_rate: 2.0e-5
  weight_decay: 0.0
  warmup_ratio: 0.1
  lr_scheduler_type: "cosine"

  optim: "paged_adamw_8bit"
  max_grad_norm: 1.0
  gradient_checkpointing: true

  logging_steps: 1
  save_strategy: "steps"
  save_steps: 100
  save_total_limit: 4

  # NOTE(review): newer transformers releases name this key `eval_strategy`;
  # confirm the consumer maps it correctly for the installed version.
  evaluation_strategy: "steps"
  # save_steps (100) is a multiple of eval_steps (50), so every saved
  # checkpoint coincides with an evaluation step.
  eval_steps: 50
  load_best_model_at_end: true

  # "auto" is presumably resolved by the consumer to the latest checkpoint
  # under the run directory — TODO confirm.
  resume_from_checkpoint: "auto"

# Post-training adapter merge/export settings.
merge:
  enabled: true
  # NOTE(review): "float16" differs from model.torch_dtype ("bfloat16") in
  # this file; confirm the dtype change on export is intentional.
  merged_dtype: "float16"
  max_shard_size: "2GB"
  # Relative path; presumably resolved against the launch directory — confirm.
  output_dir: "./merged_24b_cpt_lora"