---
# Configuration for a continued-pretraining (CPT) run with an optional LoRA
# adapter and a final merge step. Each top-level key is one stage's settings.

# Run-level settings: output directory for this run and the random seed.
run:
  run_dir: "./runs/cpt_run_v1"
  seed: 42

# Base model and how it is loaded.
model:
  # Local model path (no download)
  repo_id: "/workspace/Models/Devstral-Small-2-24B-Instruct-2512"
  revision: null

  # Used only when repo_id is a HF repo (not a local path)
  base_local_dir: "base_model"

  trust_remote_code: true
  tokenizer_use_fast: true
  device_map: "auto"
  torch_dtype: "bfloat16"  # "float16" | "bfloat16" | "float32"

  # QLoRA (4-bit loading). Currently disabled.
  # NOTE(review): the bnb_4bit_* keys presumably only take effect when
  # use_4bit is true - confirm against the loader.
  use_4bit: false
  bnb_4bit_quant_type: "nf4"
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: "bfloat16"

  # optional: "flash_attention_2" | "sdpa" | null
  attn_implementation: null

# Dataset location and preprocessing.
data:
  train_jsonl: "/workspace/all_data_with_descriptions.jsonl"
  # No separate eval file is configured.
  # NOTE(review): presumably eval_split_ratio is used to carve an eval set
  # out of train_jsonl when eval_jsonl is null - confirm.
  eval_jsonl: null
  eval_split_ratio: 0.1
  text_field: "text"
  block_size: 4096
  shuffle: true
  num_proc: 4

  # Packing behavior:
  #   "drop" = strict CPT (drop remainder)
  #   "pad"  = pad remainder to block_size + loss mask (-100)
  #            + attention_mask=0
  pack_mode: "pad"

# LoRA adapter settings.
peft:
  enabled: true
  r: 64
  lora_alpha: 128
  lora_dropout: 0.05
  bias: "none"
  target_modules: "auto"

# Training loop / optimizer settings.
train:
  # max_steps: 1000
  num_train_epochs: 2

  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 16

  # Written with an explicit decimal point on purpose: YAML 1.1 loaders
  # (e.g. PyYAML) resolve a bare `2e-5` as the *string* "2e-5", not a float.
  learning_rate: 2.0e-5
  weight_decay: 0.0
  warmup_ratio: 0.1
  lr_scheduler_type: "cosine"

  optim: "paged_adamw_8bit"
  max_grad_norm: 1.0
  gradient_checkpointing: true

  logging_steps: 1

  save_strategy: "steps"
  save_steps: 100
  save_total_limit: 4

  # NOTE(review): if these keys are forwarded to Hugging Face
  # TrainingArguments, newer transformers releases spell this key
  # `eval_strategy` - confirm against the consuming script before renaming.
  evaluation_strategy: "steps"
  eval_steps: 50
  # NOTE(review): save_steps (100) is a multiple of eval_steps (50); keep it
  # that way if the trainer requires checkpoints to line up with evals.
  load_best_model_at_end: true

  resume_from_checkpoint: "auto"

# Post-training merge of the adapter into the base model.
merge:
  enabled: true
  # NOTE(review): differs from model.torch_dtype ("bfloat16"); confirm the
  # cast to float16 on merge is intended.
  merged_dtype: "float16"
  max_shard_size: "2GB"
  output_dir: "./merged_24b_cpt_lora"