# File size: 1,290 Bytes
# NOTE(review): the extracted text carried the prefix "a555835" on this
# line, presumably the revision id this file was captured from; confirm.

# Run-level settings.
run:
  # Relative path; presumably where this run's artifacts are written.
  run_dir: ./runs/cpt_run_v1
  seed: 42
# Base model selection and loading options.
model:
  # An absolute local filesystem path rather than a hub-style identifier.
  repo_id: /workspace/Models/Devstral-Small-2-24B-Instruct-2512
  # No revision is pinned.
  revision: null
  base_local_dir: base_model
  # NOTE(review): if this is forwarded to a Hugging Face loader it permits
  # executing Python code shipped with the model; confirm the model
  # directory above is trusted.
  trust_remote_code: true
  tokenizer_use_fast: true
  device_map: auto
  torch_dtype: bfloat16
  # 4-bit loading is switched off; the bnb_4bit_* keys below presumably only
  # take effect when use_4bit is true (verify against the consumer).
  use_4bit: false
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: bfloat16
  # null presumably leaves the choice to the consumer's default; confirm.
  attn_implementation: null
# Dataset location and preprocessing options.
data:
  train_jsonl: /workspace/all_data_with_descriptions.jsonl
  # No separate evaluation file is given; presumably eval_split_ratio of the
  # training data is held out instead (confirm against the consumer).
  eval_jsonl: null
  eval_split_ratio: 0.1
  # Name of the field read from each JSONL record (presumably; confirm).
  text_field: text
  # Presumably the sequence length in tokens; confirm.
  block_size: 4096
  shuffle: true
  num_proc: 4
  pack_mode: pad
# Adapter (LoRA-style) settings.
# These keys must be nested: `enabled` also appears under `merge`, so a flat
# layout would produce a duplicate top-level key.
peft:
  enabled: true
  r: 64
  # Twice the value of r.
  lora_alpha: 128
  lora_dropout: 0.05
  # The plain scalar `none` is a string, not a YAML null.
  bias: none
  # Presumably resolved to concrete module names by the consumer; confirm.
  target_modules: auto
# Optimisation, logging, checkpointing and evaluation schedule.
train:
  num_train_epochs: 2
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  # With a per-device batch of 1 this gives 1 x 16 = 16 samples per
  # accumulated step on each device.
  gradient_accumulation_steps: 16
  # Written with a decimal point on purpose: YAML 1.1 loaders (e.g. PyYAML)
  # resolve `2e-5` as the string "2e-5", not a float. `2.0e-5` is a float
  # under both YAML 1.1 and 1.2.
  learning_rate: 2.0e-5
  weight_decay: 0.0
  warmup_ratio: 0.1
  lr_scheduler_type: cosine
  optim: paged_adamw_8bit
  max_grad_norm: 1.0
  gradient_checkpointing: true
  logging_steps: 1
  save_strategy: steps
  # save_steps (100) is an exact multiple of eval_steps (50).
  save_steps: 100
  save_total_limit: 4
  evaluation_strategy: steps
  eval_steps: 50
  load_best_model_at_end: true
  # The string `auto`; presumably the consumer picks the latest checkpoint
  # in the run directory (confirm).
  resume_from_checkpoint: auto
# Export of the merged model.
# These keys must be nested: `enabled` also appears under `peft`, so a flat
# layout would produce a duplicate top-level key.
merge:
  enabled: true
  # NOTE(review): model.torch_dtype is bfloat16 while the merged output is
  # float16; confirm the narrower float16 exponent range is intended.
  merged_dtype: float16
  max_shard_size: 2GB
  output_dir: ./merged_24b_cpt_lora