|
|
run: |
|
|
run_dir: "./runs/cpt_run_v1" |
|
|
seed: 42 |
|
|
|
|
|
model:
  # Local path (or hub repo id) of the base model to continue training from.
  repo_id: "/workspace/Models/Devstral-Small-2-24B-Instruct-2512"
  revision: null                # hub revision/commit to pin; null = default branch
  base_local_dir: "base_model"  # local directory for the base-model copy/cache
  trust_remote_code: true       # allow custom modeling code shipped with the repo
  tokenizer_use_fast: true      # prefer the fast (Rust-backed) tokenizer
  device_map: "auto"            # automatic layer placement across available devices
  torch_dtype: "bfloat16"       # dtype used to load the base weights

  # bitsandbytes 4-bit quantization (QLoRA-style loading) — disabled for this run;
  # the bnb_* keys below only take effect when use_4bit is true.
  use_4bit: false
  bnb_4bit_quant_type: "nf4"
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: "bfloat16"

  # null = library default attention backend; presumably may be set to
  # "flash_attention_2" when installed — confirm in the loading script.
  attn_implementation: null
|
|
|
|
|
data:
  train_jsonl: "/workspace/all_data_with_descriptions.jsonl"
  eval_jsonl: null          # null = hold out a slice of the training file instead
  eval_split_ratio: 0.1     # fraction held out for eval when eval_jsonl is null
  text_field: "text"        # JSONL field containing the raw training text
  block_size: 4096          # tokens per training example
  shuffle: true             # shuffle examples before training
  num_proc: 4               # worker processes for dataset preprocessing
  # NOTE(review): "pad" presumably pads each document to block_size rather than
  # packing multiple documents per block — confirm against the data loader.
  pack_mode: "pad"
|
|
|
|
|
peft:
  enabled: true           # train a LoRA adapter instead of full fine-tuning
  r: 64                   # LoRA rank
  lora_alpha: 128         # LoRA scaling; effective scale = lora_alpha / r = 2.0
  lora_dropout: 0.05      # dropout applied to LoRA layers during training
  bias: "none"            # do not train bias terms
  # "auto" is presumably resolved to the model's attention/MLP projection
  # modules by the training script — confirm there.
  target_modules: "auto"
|
|
|
|
|
train:
  num_train_epochs: 2
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 16   # effective batch = 1 * 16 * num_devices

  # Written canonically as 2.0e-5: plain "2e-5" is resolved to a *string* by
  # YAML 1.1 loaders such as PyYAML (the 1.1 float regex requires a dot before
  # the exponent), which would break numeric consumers of this key.
  learning_rate: 2.0e-5
  weight_decay: 0.0
  warmup_ratio: 0.1                 # fraction of total steps used for LR warmup
  lr_scheduler_type: "cosine"

  optim: "paged_adamw_8bit"         # bitsandbytes paged 8-bit AdamW optimizer
  max_grad_norm: 1.0                # gradient clipping threshold
  gradient_checkpointing: true      # trade extra compute for activation memory

  logging_steps: 1
  save_strategy: "steps"
  save_steps: 100
  save_total_limit: 4               # keep at most 4 checkpoints on disk

  # NOTE(review): transformers >= 4.41 renamed this key to "eval_strategy";
  # confirm the installed version accepts "evaluation_strategy".
  evaluation_strategy: "steps"
  eval_steps: 50
  load_best_model_at_end: true

  # "auto" presumably means: resume from the latest checkpoint found in
  # run_dir, if any — confirm handling in the training script.
  resume_from_checkpoint: "auto"
|
|
|
|
|
merge:
  enabled: true                 # merge the LoRA adapter into the base weights after training
  merged_dtype: "float16"       # dtype of the exported merged weights
  max_shard_size: "2GB"         # maximum size per saved weight shard
  output_dir: "./merged_24b_cpt_lora"
|
|
|