binoy370sk committed on
Commit
93b0ec4
·
verified ·
1 Parent(s): 3413ace

Upload 2 files

Browse files
configs/70M-pythia-mqa-residual.yml ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ # "data_path": "/data/train_text_document",
3
+ "train_data_paths": ["data/binarised_10p/train_text_document"],
4
+ "valid_data_paths": ["data/binarised_10p/validation_text_document"],
5
+ "test_data_paths": ["data/binarised_10p/test_text_document"],
6
+
7
+ "vocab_file": "download_slimpajama/tokenizer_loubnabnl_slimpajama.json",
8
+ "tokenizer_type": "HFTokenizer",
9
+
10
+ "checkpoint_validation_with_forward_pass": False,
11
+
12
+
13
+ "save": "checkpoints/70M_pythia_mqa_residual_lessLR",
14
+ "load": "checkpoints/70M_pythia_mqa_residual_lessLR",
15
+ "tensorboard_dir": "tensorboard/70M_pythia_mqa_residual_lessLR",
16
+ "log_dir": "logs/70M_pythia_mqa_residual_lessLR",
17
+
18
+ #mods
19
+ "dual_residual": True,
20
+ # "precision": "bfloat16",
21
+ # "activation": "swiglu",
22
+ "num_kv_heads": 1,
23
+
24
+ "use_wandb": False,
25
+
26
+ # parallelism settings
27
+ "pipe_parallel_size": 1,
28
+ "model_parallel_size": 1,
29
+
30
+ # model settings
31
+ "num_layers": 6,
32
+ "hidden_size": 512,
33
+ "num_attention_heads": 8,
34
+ "seq_length": 2048,
35
+ "max_position_embeddings": 2048,
36
+ "pos_emb": "rotary",
37
+ "rotary_pct": 0.25,
38
+ "no_weight_tying": true,
39
+ # "gpt_j_residual": true,
40
+ "output_layer_parallelism": "column",
41
+
42
+ "attention_config": [[["flash"], 6]],
43
+
44
+ "scaled_upper_triang_masked_softmax_fusion": true,
45
+ "bias_gelu_fusion": true,
46
+
47
+ # init methods
48
+ "init_method": "small_init",
49
+ "output_layer_init_method": "wang_init",
50
+
51
+ "optimizer": {
52
+ "type": "Adam",
53
+ "params": {
54
+ "lr": 0.0001,
55
+ "betas": [0.9, 0.95],
56
+ "eps": 1.0e-8
57
+ }
58
+ },
59
+ "min_lr": 0.00001,
60
+
61
+ "zero_optimization": {
62
+ "stage": 0,
63
+ "allgather_partitions": true,
64
+ "allgather_bucket_size": 500000000,
65
+ "overlap_comm": true,
66
+ "reduce_scatter": true,
67
+ "reduce_bucket_size": 500000000,
68
+ "contiguous_gradients": true,
69
+ "cpu_offload": false
70
+ },
71
+
72
+ # batch size (trained on 32 gpus)
73
+ "train_micro_batch_size_per_gpu": 8,
74
+ "data_impl": "mmap",
75
+ "num_workers": 1,
76
+
77
+ # activation checkpointing
78
+ "checkpoint_activations": true,
79
+ "checkpoint_num_layers": 1,
80
+ "partition_activations": true,
81
+ "synchronize_each_layer": true,
82
+
83
+ # regularization
84
+ "gradient_clipping": 1.0,
85
+ "weight_decay": 0.1,
86
+ "hidden_dropout": 0,
87
+ "attention_dropout": 0,
88
+
89
+
90
+ # precision settings
91
+ "fp16": {
92
+ "fp16": true,
93
+ "enabled": true,
94
+ "loss_scale": 0,
95
+ "loss_scale_window": 1000,
96
+ "initial_scale_power": 12,
97
+ "hysteresis": 2,
98
+ "min_loss_scale": 1
99
+ },
100
+
101
+ # misc. training settings
102
+ "train_iters": 100000,
103
+ "lr_decay_iters": 100000,
104
+ "distributed_backend": "nccl",
105
+ "lr_decay_style": "cosine",
106
+ "warmup": 0.01,
107
+ "checkpoint_factor": 5000,
108
+ "eval_interval": 5000,
109
+ "eval_iters": 10,
110
+ "do_test": true,
111
+ "extra_save_iters": [10,100,500,1000],
112
+
113
+ # logging
114
+ "log_interval": 100,
115
+ "steps_per_print": 10,
116
+ "keep_last_n_checkpoints": 10,
117
+ "wall_clock_breakdown": true,
118
+
119
+
120
+ }
mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33d58cbcfa58ccc314981b4e2d92a3ea6863305416f395fdf9cdf54e390e57e7
3
+ size 931142821