sleepyhead111 committed on
Commit
0365ba9
·
verified ·
1 Parent(s): 443858a

Upload folder using huggingface_hub

Browse files
configs/accelerate_config.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: 0,1,2,3,4,5,6,7
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'fp16'
9
+ num_machines: 1
10
+ num_processes: 8
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
configs/accelerate_config_1gpu.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: 'NO'
4
+ downcast_bf16: 'no'
5
+ gpu_ids: 0
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'fp16'
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
configs/accelerate_config_2gpu.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: 4,5
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'fp16'
9
+ num_machines: 1
10
+ num_processes: 2
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
configs/accelerate_config_3gpu.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: 0,1,2
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'fp16'
9
+ num_machines: 1
10
+ num_processes: 3
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
configs/accelerate_config_bf16.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: 0,1,2,3,4,5
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'bf16'
9
+ num_machines: 1
10
+ num_processes: 6
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
configs/accelerate_config_bf16_8gpu.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: 0,1,2,3,4,5,6,7
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'bf16'
9
+ num_machines: 1
10
+ num_processes: 8
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
configs/deepspeed_train_config.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ gradient_clipping: 1.0
4
+ offload_optimizer_device: none
5
+ offload_param_device: none
6
+ zero3_init_flag: false
7
+ zero_stage: 2
8
+ distributed_type: DEEPSPEED
9
+ downcast_bf16: 'no'
10
+ gpu_ids: 0,1,2,3,4,5
11
+ machine_rank: 0
12
+ main_training_function: main
13
+ mixed_precision: fp16
14
+ num_machines: 1
15
+ num_processes: 6
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
configs/deepspeed_train_config_bf16.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ gradient_clipping: 1.0
4
+ offload_optimizer_device: none
5
+ offload_param_device: none
6
+ zero3_init_flag: false
7
+ zero_stage: 2
8
+ distributed_type: DEEPSPEED
9
+ downcast_bf16: 'no'
10
+ gpu_ids: 0,1,2,3,4,5
11
+ machine_rank: 0
12
+ main_training_function: main
13
+ mixed_precision: bf16
14
+ num_machines: 1
15
+ num_processes: 6
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
configs/deepspeed_train_config_bf16_4gpu.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ gradient_clipping: 1.0
4
+ offload_optimizer_device: none
5
+ offload_param_device: none
6
+ zero3_init_flag: false
7
+ zero_stage: 2
8
+ distributed_type: DEEPSPEED
9
+ downcast_bf16: 'no'
10
+ gpu_ids: 0,1,2,3
11
+ machine_rank: 0
12
+ main_training_function: main
13
+ mixed_precision: bf16
14
+ num_machines: 1
15
+ num_processes: 4
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
configs/deepspeed_train_config_fp16.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ gradient_clipping: 1.0
4
+ offload_optimizer_device: none
5
+ offload_param_device: none
6
+ zero3_init_flag: false
7
+ zero_stage: 2
8
+ distributed_type: DEEPSPEED
9
+ downcast_bf16: 'no'
10
+ gpu_ids: 0,1,2,3,4,5
11
+ machine_rank: 0
12
+ main_training_function: main
13
+ mixed_precision: fp16
14
+ num_machines: 1
15
+ num_processes: 6
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
configs/deepspeed_train_config_zero3.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ deepspeed_config:
3
+ gradient_clipping: 1.0
4
+ offload_optimizer_device: none
5
+ offload_param_device: cpu
6
+ zero3_init_flag: false
7
+ zero_stage: 3
8
+ distributed_type: DEEPSPEED
9
+ downcast_bf16: 'no'
10
+ gpu_ids: 0,1,2,3,4,5,6,7
11
+ machine_rank: 0
12
+ main_training_function: main
13
+ mixed_precision: bf16
14
+ num_machines: 1
15
+ num_processes: 8
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
configs/ds_z0_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_batch_size": "auto",
3
+ "train_micro_batch_size_per_gpu": "auto",
4
+ "gradient_accumulation_steps": "auto",
5
+ "gradient_clipping": "auto",
6
+ "zero_allow_untested_optimizer": true,
7
+ "fp16": {
8
+ "enabled": false,
9
+ "loss_scale": 0,
10
+ "loss_scale_window": 1000,
11
+ "initial_scale_power": 16,
12
+ "hysteresis": 2,
13
+ "min_loss_scale": 1
14
+ },
15
+ "bf16": {
16
+ "enabled": true,
17
+ "loss_scale": 0,
18
+ "loss_scale_window": 1000,
19
+ "initial_scale_power": 16,
20
+ "hysteresis": 2,
21
+ "min_loss_scale": 1
22
+ },
23
+ "zero_optimization": {
24
+ "stage": 0,
25
+ "allgather_partitions": true,
26
+ "allgather_bucket_size": 5e8,
27
+ "overlap_comm": true,
28
+ "reduce_scatter": true,
29
+ "reduce_bucket_size": 5e8,
30
+ "contiguous_gradients": true,
31
+ "round_robin_gradients": true
32
+ }
33
+ }
configs/ds_z2_config_bf16.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_batch_size": "auto",
3
+ "train_micro_batch_size_per_gpu": "auto",
4
+ "gradient_accumulation_steps": "auto",
5
+ "gradient_clipping": "auto",
6
+ "zero_allow_untested_optimizer": true,
7
+ "fp16": {
8
+ "enabled": false,
9
+ "loss_scale": 0,
10
+ "loss_scale_window": 1000,
11
+ "initial_scale_power": 16,
12
+ "hysteresis": 2,
13
+ "min_loss_scale": 1
14
+ },
15
+ "bf16": {
16
+ "enabled": true,
17
+ "loss_scale": 0,
18
+ "loss_scale_window": 1000,
19
+ "initial_scale_power": 16,
20
+ "hysteresis": 2,
21
+ "min_loss_scale": 1
22
+ },
23
+ "zero_optimization": {
24
+ "stage": 2,
25
+ "allgather_partitions": true,
26
+ "allgather_bucket_size": 5e8,
27
+ "overlap_comm": false,
28
+ "reduce_scatter": true,
29
+ "reduce_bucket_size": 5e8,
30
+ "contiguous_gradients": true,
31
+ "round_robin_gradients": true
32
+ }
33
+ }
configs/ds_z2_config_fp16.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_batch_size": "auto",
3
+ "train_micro_batch_size_per_gpu": "auto",
4
+ "gradient_accumulation_steps": "auto",
5
+ "gradient_clipping": "auto",
6
+ "zero_allow_untested_optimizer": true,
7
+ "fp16": {
8
+ "enabled": true,
9
+ "loss_scale": 0,
10
+ "loss_scale_window": 1000,
11
+ "initial_scale_power": 16,
12
+ "hysteresis": 2,
13
+ "min_loss_scale": 1
14
+ },
15
+ "bf16": {
16
+ "enabled": false,
17
+ "loss_scale": 0,
18
+ "loss_scale_window": 1000,
19
+ "initial_scale_power": 16,
20
+ "hysteresis": 2,
21
+ "min_loss_scale": 1
22
+ },
23
+ "zero_optimization": {
24
+ "stage": 2,
25
+ "allgather_partitions": true,
26
+ "allgather_bucket_size": 5e8,
27
+ "overlap_comm": true,
28
+ "reduce_scatter": true,
29
+ "reduce_bucket_size": 5e8,
30
+ "contiguous_gradients": true,
31
+ "round_robin_gradients": true
32
+ }
33
+ }
configs/ds_z3_config_bf16.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_batch_size": "auto",
3
+ "train_micro_batch_size_per_gpu": "auto",
4
+ "gradient_accumulation_steps": "auto",
5
+ "gradient_clipping": "auto",
6
+ "zero_allow_untested_optimizer": true,
7
+ "fp16": {
8
+ "enabled": "auto",
9
+ "loss_scale": 0,
10
+ "loss_scale_window": 1000,
11
+ "initial_scale_power": 16,
12
+ "hysteresis": 2,
13
+ "min_loss_scale": 1
14
+ },
15
+ "bf16": {
16
+ "enabled": "auto"
17
+ },
18
+ "zero_optimization": {
19
+ "stage": 3,
20
+ "overlap_comm": false,
21
+ "contiguous_gradients": true,
22
+ "sub_group_size": 1e9,
23
+ "reduce_bucket_size": "auto",
24
+ "stage3_prefetch_bucket_size": "auto",
25
+ "stage3_param_persistence_threshold": "auto",
26
+ "stage3_max_live_parameters": 1e9,
27
+ "stage3_max_reuse_distance": 1e9,
28
+ "stage3_gather_16bit_weights_on_model_save": true
29
+ }
30
+ }
configs/fsdp_train_config.yaml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: FSDP
4
+ downcast_bf16: 'no'
5
+ fsdp_config:
6
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
7
+ fsdp_backward_prefetch_policy: BACKWARD_PRE
8
+ fsdp_forward_prefetch: false
9
+ fsdp_cpu_ram_efficient_loading: true
10
+ fsdp_offload_params: false
11
+ fsdp_sharding_strategy: SHARD_GRAD_OP
12
+ fsdp_state_dict_type: SHARDED_STATE_DICT
13
+ fsdp_sync_module_states: true
14
+ fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
15
+ fsdp_use_orig_params: true
16
+ gpu_ids: 0,1,2,3,4,5
17
+ machine_rank: 0
18
+ main_training_function: main
19
+ mixed_precision: fp16
20
+ num_machines: 1
21
+ num_processes: 6
22
+ rdzv_backend: static
23
+ same_network: true
24
+ tpu_env: []
25
+ tpu_use_cluster: false
26
+ tpu_use_sudo: false
27
+ use_cpu: false
configs/llama3_full_pt.yaml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### model
2
+ model_name_or_path: /mnt/luoyingfeng/model_card/Meta-Llama-3.2-1B
3
+ #trust_remote_code: true
4
+ template: llama3
5
+ ### method
6
+ stage: pt
7
+ do_train: true
8
+ finetuning_type: full
9
+
10
+ ### dataset
11
+ dataset_dir: /mnt/luoyingfeng/lora4mt/data/fine-tuning_data/cpt_data
12
+ dataset: cpt-kk-en
13
+ cutoff_len: 512
14
+ use_fast_tokenizer: true
15
+ dataloader_num_workers: 8
16
+ preprocessing_num_workers: 16
17
+
18
+ ### output
19
+ output_dir: /mnt/luoyingfeng/lora4mt/exps/Meta-Llama-3.2-1B/fft_cpt
20
+ logging_steps: 0.01
21
+ save_steps: 0.05
22
+ plot_loss: true
23
+ overwrite_output_dir: true
24
+
25
+ ### train
26
+ per_device_train_batch_size: 8
27
+ gradient_accumulation_steps: 16
28
+ learning_rate: 2.0e-5
29
+ num_train_epochs: 1.0
30
+ lr_scheduler_type: cosine
31
+ warmup_ratio: 0.1
32
+ bf16: true
33
+ ddp_timeout: 180000000
34
+ seed: 42
35
+ save_strategy: steps
36
+ logging_strategy: steps
37
+
38
+ ### eval
39
+ # val_size: 0.1
40
+ # per_device_eval_batch_size: 1
41
+ # eval_strategy: steps
42
+ # eval_steps: 500