First model training round
Browse files
checkpoints/data_4_train_params.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
batch_size_training: '4'
|
| 2 |
+
batching_strategy: packing
|
| 3 |
+
checkpoint_type: StateDictType.FULL_STATE_DICT
|
| 4 |
+
context_length: '4096'
|
| 5 |
+
dataset: fim_dataset
|
| 6 |
+
dist_checkpoint_folder: fine-tuned
|
| 7 |
+
dist_checkpoint_root_folder: /home/model_checkpoints
|
| 8 |
+
enable_fsdp: 'True'
|
| 9 |
+
flop_counter: 'True'
|
| 10 |
+
flop_counter_start: '3'
|
| 11 |
+
freeze_layers: 'False'
|
| 12 |
+
from_peft_checkpoint: ''
|
| 13 |
+
fsdp_activation_checkpointing: 'True'
|
| 14 |
+
fsdp_cpu_offload: 'False'
|
| 15 |
+
gamma: '0.85'
|
| 16 |
+
gradient_accumulation_steps: '1'
|
| 17 |
+
gradient_clipping: 'False'
|
| 18 |
+
gradient_clipping_threshold: '1.0'
|
| 19 |
+
hsdp: 'False'
|
| 20 |
+
low_cpu_fsdp: 'False'
|
| 21 |
+
lr: '0.0001'
|
| 22 |
+
max_eval_step: '0'
|
| 23 |
+
max_train_step: '0'
|
| 24 |
+
mixed_precision: 'True'
|
| 25 |
+
model_name: meta-llama/Llama-3.2-1B-Instruct
|
| 26 |
+
num_epochs: '1'
|
| 27 |
+
num_freeze_layers: '1'
|
| 28 |
+
num_workers_dataloader: '1'
|
| 29 |
+
one_gpu: 'False'
|
| 30 |
+
optimizer: AdamW
|
| 31 |
+
output_dir: PATH/to/save/PEFT/model
|
| 32 |
+
peft_method: lora
|
| 33 |
+
profiler_dir: PATH/to/save/profiler/results
|
| 34 |
+
pure_bf16: 'True'
|
| 35 |
+
quantization: None
|
| 36 |
+
replica_group_size: '0'
|
| 37 |
+
run_validation: 'True'
|
| 38 |
+
save_metrics: 'False'
|
| 39 |
+
save_model: 'True'
|
| 40 |
+
save_optimizer: 'False'
|
| 41 |
+
seed: '42'
|
| 42 |
+
sharding_group_size: '0'
|
| 43 |
+
sharding_strategy: ShardingStrategy.NO_SHARD
|
| 44 |
+
tokenizer_name: simmo/llama3.2-pyfim-3b
|
| 45 |
+
use_fast_kernels: 'True'
|
| 46 |
+
use_fp16: 'False'
|
| 47 |
+
use_peft: 'False'
|
| 48 |
+
use_profiler: 'False'
|
| 49 |
+
use_wandb: 'True'
|
| 50 |
+
val_batch_size: '1'
|
| 51 |
+
weight_decay: '0.0'
|