leaBroe committed on
Commit
548c872
·
verified ·
1 Parent(s): c08b42b

Upload 17 files

Browse files

Added Heavy2Light model

config.json ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "adapters": {
3
+ "adapters": {
4
+ "heavy2light_adapter": "dcb52aee526a3537"
5
+ },
6
+ "config_map": {
7
+ "dcb52aee526a3537": {
8
+ "adapter_residual_before_ln": false,
9
+ "cross_adapter": false,
10
+ "dropout": 0.0,
11
+ "factorized_phm_W": true,
12
+ "factorized_phm_rule": false,
13
+ "hypercomplex_nonlinearity": "glorot-uniform",
14
+ "init_weights": "bert",
15
+ "inv_adapter": null,
16
+ "inv_adapter_reduction_factor": null,
17
+ "is_parallel": false,
18
+ "learn_phm": true,
19
+ "leave_out": [],
20
+ "ln_after": false,
21
+ "ln_before": false,
22
+ "mh_adapter": true,
23
+ "non_linearity": "relu",
24
+ "original_ln_after": true,
25
+ "original_ln_before": false,
26
+ "output_adapter": true,
27
+ "phm_bias": true,
28
+ "phm_c_init": "normal",
29
+ "phm_dim": 4,
30
+ "phm_init_range": 0.0001,
31
+ "phm_layer": false,
32
+ "phm_rank": 1,
33
+ "reduction_factor": 16,
34
+ "residual_before_ln": true,
35
+ "scaling": 1.0,
36
+ "shared_W_phm": false,
37
+ "shared_phm_rule": true,
38
+ "use_gating": false
39
+ }
40
+ },
41
+ "fusion_config_map": {},
42
+ "fusions": {}
43
+ },
44
+ "add_cross_attention": true,
45
+ "architectures": [
46
+ "EncoderDecoderModelWithAdapters"
47
+ ],
48
+ "bos_token_id": 2,
49
+ "decoder": {
50
+ "_name_or_path": "/ibmm_data2/oas_database/paired_lea_tmp/light_model/gpt_model_light_unpaired/src/gpt_light_model_unpaired/model_outputs/full_new_tokenizer_gpt2_light_seqs_unp_lr_5e-4_wd_0.1_bs_32_epochs_500_/checkpoint-6058816",
51
+ "activation_function": "gelu_new",
52
+ "add_cross_attention": true,
53
+ "architectures": [
54
+ "GPT2LMHeadModel"
55
+ ],
56
+ "attn_pdrop": 0.1,
57
+ "bad_words_ids": null,
58
+ "begin_suppress_tokens": null,
59
+ "bos_token_id": 0,
60
+ "chunk_size_feed_forward": 0,
61
+ "cross_attention_hidden_size": null,
62
+ "decoder_start_token_id": null,
63
+ "diversity_penalty": 0.0,
64
+ "do_sample": false,
65
+ "early_stopping": false,
66
+ "embd_pdrop": 0.1,
67
+ "encoder_no_repeat_ngram_size": 0,
68
+ "eos_token_id": 0,
69
+ "exponential_decay_length_penalty": null,
70
+ "finetuning_task": null,
71
+ "forced_bos_token_id": null,
72
+ "forced_eos_token_id": null,
73
+ "id2label": {
74
+ "0": "LABEL_0",
75
+ "1": "LABEL_1"
76
+ },
77
+ "initializer_range": 0.02,
78
+ "is_decoder": true,
79
+ "is_encoder_decoder": false,
80
+ "label2id": {
81
+ "LABEL_0": 0,
82
+ "LABEL_1": 1
83
+ },
84
+ "layer_norm_epsilon": 1e-05,
85
+ "length_penalty": 1.0,
86
+ "max_length": 20,
87
+ "min_length": 0,
88
+ "model_type": "gpt2",
89
+ "n_ctx": 1024,
90
+ "n_embd": 768,
91
+ "n_head": 12,
92
+ "n_inner": null,
93
+ "n_layer": 12,
94
+ "n_positions": 1024,
95
+ "no_repeat_ngram_size": 0,
96
+ "num_beam_groups": 1,
97
+ "num_beams": 1,
98
+ "num_return_sequences": 1,
99
+ "output_attentions": false,
100
+ "output_hidden_states": false,
101
+ "output_scores": false,
102
+ "pad_token_id": 1,
103
+ "prefix": null,
104
+ "problem_type": null,
105
+ "pruned_heads": {},
106
+ "remove_invalid_values": false,
107
+ "reorder_and_upcast_attn": false,
108
+ "repetition_penalty": 1.0,
109
+ "resid_pdrop": 0.1,
110
+ "return_dict": true,
111
+ "return_dict_in_generate": false,
112
+ "scale_attn_by_inverse_layer_idx": false,
113
+ "scale_attn_weights": true,
114
+ "sep_token_id": null,
115
+ "summary_activation": null,
116
+ "summary_first_dropout": 0.1,
117
+ "summary_proj_to_labels": true,
118
+ "summary_type": "cls_index",
119
+ "summary_use_proj": true,
120
+ "suppress_tokens": null,
121
+ "task_specific_params": null,
122
+ "temperature": 1.0,
123
+ "tf_legacy_loss": false,
124
+ "tie_encoder_decoder": false,
125
+ "tie_word_embeddings": true,
126
+ "tokenizer_class": null,
127
+ "top_k": 50,
128
+ "top_p": 1.0,
129
+ "torch_dtype": "float32",
130
+ "torchscript": false,
131
+ "typical_p": 1.0,
132
+ "unk_token_id": 2,
133
+ "use_bfloat16": false,
134
+ "use_cache": true,
135
+ "vocab_size": 25
136
+ },
137
+ "decoder_start_token_id": 2,
138
+ "encoder": {
139
+ "_name_or_path": "/ibmm_data2/oas_database/paired_lea_tmp/heavy_model/src/redo_ch/FULL_config_4_smaller_model_run_lr5e-5_500epochs_max_seq_length_512/checkpoint-117674391",
140
+ "add_cross_attention": false,
141
+ "architectures": [
142
+ "RobertaForMaskedLM"
143
+ ],
144
+ "attention_probs_dropout_prob": 0.1,
145
+ "bad_words_ids": null,
146
+ "begin_suppress_tokens": null,
147
+ "bos_token_id": 0,
148
+ "chunk_size_feed_forward": 0,
149
+ "classifier_dropout": null,
150
+ "cross_attention_hidden_size": null,
151
+ "decoder_start_token_id": null,
152
+ "diversity_penalty": 0.0,
153
+ "do_sample": false,
154
+ "early_stopping": false,
155
+ "encoder_no_repeat_ngram_size": 0,
156
+ "eos_token_id": 2,
157
+ "exponential_decay_length_penalty": null,
158
+ "finetuning_task": null,
159
+ "forced_bos_token_id": null,
160
+ "forced_eos_token_id": null,
161
+ "hidden_act": "gelu",
162
+ "hidden_dropout_prob": 0.1,
163
+ "hidden_size": 512,
164
+ "id2label": {
165
+ "0": "LABEL_0",
166
+ "1": "LABEL_1"
167
+ },
168
+ "initializer_range": 0.02,
169
+ "intermediate_size": 2048,
170
+ "is_decoder": false,
171
+ "is_encoder_decoder": false,
172
+ "label2id": {
173
+ "LABEL_0": 0,
174
+ "LABEL_1": 1
175
+ },
176
+ "layer_norm_eps": 1e-05,
177
+ "length_penalty": 1.0,
178
+ "max_length": 512,
179
+ "max_position_embeddings": 514,
180
+ "max_seq_length": 512,
181
+ "min_length": 0,
182
+ "model_type": "roberta",
183
+ "no_repeat_ngram_size": 0,
184
+ "num_attention_heads": 4,
185
+ "num_beam_groups": 1,
186
+ "num_beams": 1,
187
+ "num_hidden_layers": 4,
188
+ "num_return_sequences": 1,
189
+ "output_attentions": false,
190
+ "output_hidden_states": false,
191
+ "output_scores": false,
192
+ "pad_token_id": 1,
193
+ "position_embedding_type": "absolute",
194
+ "prefix": null,
195
+ "problem_type": null,
196
+ "pruned_heads": {},
197
+ "remove_invalid_values": false,
198
+ "repetition_penalty": 1.0,
199
+ "return_dict": true,
200
+ "return_dict_in_generate": false,
201
+ "sep_token_id": null,
202
+ "suppress_tokens": null,
203
+ "task_specific_params": null,
204
+ "temperature": 1.0,
205
+ "tf_legacy_loss": false,
206
+ "tie_encoder_decoder": false,
207
+ "tie_word_embeddings": true,
208
+ "tokenizer_class": null,
209
+ "top_k": 50,
210
+ "top_p": 1.0,
211
+ "torch_dtype": "float32",
212
+ "torchscript": false,
213
+ "type_vocab_size": 1,
214
+ "typical_p": 1.0,
215
+ "use_bfloat16": false,
216
+ "use_cache": true,
217
+ "vocab_size": 25
218
+ },
219
+ "eos_token_id": 3,
220
+ "is_encoder_decoder": true,
221
+ "mask_token": null,
222
+ "model_type": "encoder-decoder",
223
+ "pad_token_id": 0,
224
+ "torch_dtype": "float32",
225
+ "transformers_version": "4.40.2"
226
+ }
final_adapter/adapter_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "adapter_residual_before_ln": false,
4
+ "cross_adapter": false,
5
+ "dropout": 0.0,
6
+ "factorized_phm_W": true,
7
+ "factorized_phm_rule": false,
8
+ "hypercomplex_nonlinearity": "glorot-uniform",
9
+ "init_weights": "bert",
10
+ "inv_adapter": null,
11
+ "inv_adapter_reduction_factor": null,
12
+ "is_parallel": false,
13
+ "learn_phm": true,
14
+ "leave_out": [],
15
+ "ln_after": false,
16
+ "ln_before": false,
17
+ "mh_adapter": true,
18
+ "non_linearity": "relu",
19
+ "original_ln_after": true,
20
+ "original_ln_before": false,
21
+ "output_adapter": true,
22
+ "phm_bias": true,
23
+ "phm_c_init": "normal",
24
+ "phm_dim": 4,
25
+ "phm_init_range": 0.0001,
26
+ "phm_layer": false,
27
+ "phm_rank": 1,
28
+ "reduction_factor": 16,
29
+ "residual_before_ln": true,
30
+ "scaling": 1.0,
31
+ "shared_W_phm": false,
32
+ "shared_phm_rule": true,
33
+ "use_gating": false
34
+ },
35
+ "hidden_size": null,
36
+ "model_class": "EncoderDecoderModelWithAdapters",
37
+ "model_name": "",
38
+ "model_type": "encoder-decoder",
39
+ "name": "heavy2light_adapter",
40
+ "version": "adapters.1.0.0.dev0"
41
+ }
final_adapter/pytorch_adapter.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dc566c1a694a1e228f25f22da4fa7c8fbe39bf7c35ebe640c712e7fc32ab4e2
3
+ size 8271506
generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "decoder_start_token_id": 2,
3
+ "do_sample": true,
4
+ "eos_token_id": 3,
5
+ "max_length": 110,
6
+ "max_new_tokens": 115,
7
+ "min_length": 100,
8
+ "output_hidden_states": true,
9
+ "output_scores": true,
10
+ "pad_token_id": 0,
11
+ "return_dict_in_generate": true,
12
+ "temperature": 0.7,
13
+ "top_k": 0,
14
+ "top_p": 0.9,
15
+ "transformers_version": "4.40.2"
16
+ }
heavy2light_final_checkpoint/heavy2light_adapter/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "adapter_residual_before_ln": false,
4
+ "cross_adapter": false,
5
+ "dropout": 0.0,
6
+ "factorized_phm_W": true,
7
+ "factorized_phm_rule": false,
8
+ "hypercomplex_nonlinearity": "glorot-uniform",
9
+ "init_weights": "bert",
10
+ "inv_adapter": null,
11
+ "inv_adapter_reduction_factor": null,
12
+ "is_parallel": false,
13
+ "learn_phm": true,
14
+ "leave_out": [],
15
+ "ln_after": false,
16
+ "ln_before": false,
17
+ "mh_adapter": true,
18
+ "non_linearity": "relu",
19
+ "original_ln_after": true,
20
+ "original_ln_before": false,
21
+ "output_adapter": true,
22
+ "phm_bias": true,
23
+ "phm_c_init": "normal",
24
+ "phm_dim": 4,
25
+ "phm_init_range": 0.0001,
26
+ "phm_layer": false,
27
+ "phm_rank": 1,
28
+ "reduction_factor": 16,
29
+ "residual_before_ln": true,
30
+ "scaling": 1.0,
31
+ "shared_W_phm": false,
32
+ "shared_phm_rule": true,
33
+ "use_gating": false
34
+ },
35
+ "config_id": "dcb52aee526a3537",
36
+ "hidden_size": null,
37
+ "model_class": "EncoderDecoderModelWithAdapters",
38
+ "model_name": "",
39
+ "model_type": "encoder-decoder",
40
+ "name": "heavy2light_adapter",
41
+ "version": "adapters.1.0.0.dev0"
42
+ }
heavy2light_final_checkpoint/heavy2light_adapter/pytorch_adapter.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dc566c1a694a1e228f25f22da4fa7c8fbe39bf7c35ebe640c712e7fc32ab4e2
3
+ size 8271506
heavy2light_final_checkpoint/merges.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ #version: 0.2
heavy2light_final_checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99f85042d63524481188bf92ede252e028559c9d5b638e0de332527face652af
3
+ size 243398906
heavy2light_final_checkpoint/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7075ed0d58f01b532635a58196612dac8d613d8dc611072e9770e590d434102d
3
+ size 14244
heavy2light_final_checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7932b8355fb42581a458810e7b3ffdcbbb9698778ec631c00eb93f3bb76dcf2
3
+ size 1064
heavy2light_final_checkpoint/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|pad|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|unk|>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
heavy2light_final_checkpoint/tokenizer.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 158,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": {
11
+ "Fixed": 158
12
+ },
13
+ "direction": "Right",
14
+ "pad_to_multiple_of": null,
15
+ "pad_id": 0,
16
+ "pad_type_id": 0,
17
+ "pad_token": "<|pad|>"
18
+ },
19
+ "added_tokens": [
20
+ {
21
+ "id": 0,
22
+ "content": "<|pad|>",
23
+ "single_word": false,
24
+ "lstrip": false,
25
+ "rstrip": false,
26
+ "normalized": true,
27
+ "special": true
28
+ },
29
+ {
30
+ "id": 1,
31
+ "content": "<|unk|>",
32
+ "single_word": false,
33
+ "lstrip": false,
34
+ "rstrip": false,
35
+ "normalized": true,
36
+ "special": true
37
+ },
38
+ {
39
+ "id": 2,
40
+ "content": "<|startoftext|>",
41
+ "single_word": false,
42
+ "lstrip": false,
43
+ "rstrip": false,
44
+ "normalized": true,
45
+ "special": true
46
+ },
47
+ {
48
+ "id": 3,
49
+ "content": "<|endoftext|>",
50
+ "single_word": false,
51
+ "lstrip": false,
52
+ "rstrip": false,
53
+ "normalized": true,
54
+ "special": true
55
+ }
56
+ ],
57
+ "normalizer": null,
58
+ "pre_tokenizer": {
59
+ "type": "ByteLevel",
60
+ "add_prefix_space": false,
61
+ "trim_offsets": true,
62
+ "use_regex": true
63
+ },
64
+ "post_processor": {
65
+ "type": "ByteLevel",
66
+ "add_prefix_space": true,
67
+ "trim_offsets": false,
68
+ "use_regex": true
69
+ },
70
+ "decoder": {
71
+ "type": "ByteLevel",
72
+ "add_prefix_space": true,
73
+ "trim_offsets": true,
74
+ "use_regex": true
75
+ },
76
+ "model": {
77
+ "type": "BPE",
78
+ "dropout": null,
79
+ "unk_token": null,
80
+ "continuing_subword_prefix": "",
81
+ "end_of_word_suffix": "",
82
+ "fuse_unk": false,
83
+ "byte_fallback": false,
84
+ "ignore_merges": false,
85
+ "vocab": {
86
+ "<|pad|>": 0,
87
+ "<|unk|>": 1,
88
+ "<|startoftext|>": 2,
89
+ "<|endoftext|>": 3,
90
+ "<|mask|>": 4,
91
+ "A": 5,
92
+ "C": 6,
93
+ "D": 7,
94
+ "E": 8,
95
+ "F": 9,
96
+ "G": 10,
97
+ "H": 11,
98
+ "I": 12,
99
+ "K": 13,
100
+ "L": 14,
101
+ "M": 15,
102
+ "N": 16,
103
+ "P": 17,
104
+ "Q": 18,
105
+ "R": 19,
106
+ "S": 20,
107
+ "T": 21,
108
+ "V": 22,
109
+ "W": 23,
110
+ "Y": 24
111
+ },
112
+ "merges": []
113
+ }
114
+ }
heavy2light_final_checkpoint/tokenizer_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|pad|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<|unk|>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<|startoftext|>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<|endoftext|>",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ }
37
+ },
38
+ "bos_token": "<|startoftext|>",
39
+ "clean_up_tokenization_spaces": false,
40
+ "eos_token": "<|endoftext|>",
41
+ "errors": "replace",
42
+ "extra_special_tokens": {},
43
+ "model_max_length": 1024,
44
+ "pad_token": "<|pad|>",
45
+ "tokenizer_class": "GPT2Tokenizer",
46
+ "unk_token": "<|unk|>"
47
+ }
heavy2light_final_checkpoint/trainer_state.json ADDED
@@ -0,0 +1,771 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 50.0,
5
+ "eval_steps": 500,
6
+ "global_step": 367750,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "grad_norm": 0.11480995267629623,
14
+ "learning_rate": 9.800000000000001e-06,
15
+ "loss": 0.2501,
16
+ "step": 7355
17
+ },
18
+ {
19
+ "epoch": 1.0,
20
+ "eval_loss": 0.42542216181755066,
21
+ "eval_runtime": 128.5308,
22
+ "eval_samples_per_second": 457.774,
23
+ "eval_steps_per_second": 7.158,
24
+ "step": 7355
25
+ },
26
+ {
27
+ "epoch": 2.0,
28
+ "grad_norm": 0.158855602145195,
29
+ "learning_rate": 9.600000000000001e-06,
30
+ "loss": 0.2332,
31
+ "step": 14710
32
+ },
33
+ {
34
+ "epoch": 2.0,
35
+ "eval_loss": 0.41126397252082825,
36
+ "eval_runtime": 128.8707,
37
+ "eval_samples_per_second": 456.566,
38
+ "eval_steps_per_second": 7.139,
39
+ "step": 14710
40
+ },
41
+ {
42
+ "epoch": 3.0,
43
+ "grad_norm": 0.20708617568016052,
44
+ "learning_rate": 9.4e-06,
45
+ "loss": 0.2295,
46
+ "step": 22065
47
+ },
48
+ {
49
+ "epoch": 3.0,
50
+ "eval_loss": 0.40615084767341614,
51
+ "eval_runtime": 128.7304,
52
+ "eval_samples_per_second": 457.064,
53
+ "eval_steps_per_second": 7.147,
54
+ "step": 22065
55
+ },
56
+ {
57
+ "epoch": 4.0,
58
+ "grad_norm": 0.2054029405117035,
59
+ "learning_rate": 9.200000000000002e-06,
60
+ "loss": 0.2273,
61
+ "step": 29420
62
+ },
63
+ {
64
+ "epoch": 4.0,
65
+ "eval_loss": 0.40087950229644775,
66
+ "eval_runtime": 128.7273,
67
+ "eval_samples_per_second": 457.075,
68
+ "eval_steps_per_second": 7.147,
69
+ "step": 29420
70
+ },
71
+ {
72
+ "epoch": 5.0,
73
+ "grad_norm": 0.19840490818023682,
74
+ "learning_rate": 9e-06,
75
+ "loss": 0.2256,
76
+ "step": 36775
77
+ },
78
+ {
79
+ "epoch": 5.0,
80
+ "eval_loss": 0.3977925777435303,
81
+ "eval_runtime": 128.707,
82
+ "eval_samples_per_second": 457.147,
83
+ "eval_steps_per_second": 7.148,
84
+ "step": 36775
85
+ },
86
+ {
87
+ "epoch": 6.0,
88
+ "grad_norm": 0.25789105892181396,
89
+ "learning_rate": 8.8e-06,
90
+ "loss": 0.2243,
91
+ "step": 44130
92
+ },
93
+ {
94
+ "epoch": 6.0,
95
+ "eval_loss": 0.3958837389945984,
96
+ "eval_runtime": 128.6907,
97
+ "eval_samples_per_second": 457.205,
98
+ "eval_steps_per_second": 7.149,
99
+ "step": 44130
100
+ },
101
+ {
102
+ "epoch": 7.0,
103
+ "grad_norm": 0.21235878765583038,
104
+ "learning_rate": 8.6e-06,
105
+ "loss": 0.2231,
106
+ "step": 51485
107
+ },
108
+ {
109
+ "epoch": 7.0,
110
+ "eval_loss": 0.39352869987487793,
111
+ "eval_runtime": 128.701,
112
+ "eval_samples_per_second": 457.168,
113
+ "eval_steps_per_second": 7.148,
114
+ "step": 51485
115
+ },
116
+ {
117
+ "epoch": 8.0,
118
+ "grad_norm": 0.1889820694923401,
119
+ "learning_rate": 8.400000000000001e-06,
120
+ "loss": 0.2221,
121
+ "step": 58840
122
+ },
123
+ {
124
+ "epoch": 8.0,
125
+ "eval_loss": 0.3912597596645355,
126
+ "eval_runtime": 128.7122,
127
+ "eval_samples_per_second": 457.128,
128
+ "eval_steps_per_second": 7.148,
129
+ "step": 58840
130
+ },
131
+ {
132
+ "epoch": 9.0,
133
+ "grad_norm": 0.22390136122703552,
134
+ "learning_rate": 8.2e-06,
135
+ "loss": 0.2212,
136
+ "step": 66195
137
+ },
138
+ {
139
+ "epoch": 9.0,
140
+ "eval_loss": 0.39093491435050964,
141
+ "eval_runtime": 128.7173,
142
+ "eval_samples_per_second": 457.11,
143
+ "eval_steps_per_second": 7.147,
144
+ "step": 66195
145
+ },
146
+ {
147
+ "epoch": 10.0,
148
+ "grad_norm": 0.1813807338476181,
149
+ "learning_rate": 8.000000000000001e-06,
150
+ "loss": 0.2205,
151
+ "step": 73550
152
+ },
153
+ {
154
+ "epoch": 10.0,
155
+ "eval_loss": 0.389521986246109,
156
+ "eval_runtime": 128.684,
157
+ "eval_samples_per_second": 457.229,
158
+ "eval_steps_per_second": 7.149,
159
+ "step": 73550
160
+ },
161
+ {
162
+ "epoch": 11.0,
163
+ "grad_norm": 0.17810355126857758,
164
+ "learning_rate": 7.800000000000002e-06,
165
+ "loss": 0.2197,
166
+ "step": 80905
167
+ },
168
+ {
169
+ "epoch": 11.0,
170
+ "eval_loss": 0.3886621296405792,
171
+ "eval_runtime": 128.7099,
172
+ "eval_samples_per_second": 457.137,
173
+ "eval_steps_per_second": 7.148,
174
+ "step": 80905
175
+ },
176
+ {
177
+ "epoch": 12.0,
178
+ "grad_norm": 0.24489013850688934,
179
+ "learning_rate": 7.600000000000001e-06,
180
+ "loss": 0.219,
181
+ "step": 88260
182
+ },
183
+ {
184
+ "epoch": 12.0,
185
+ "eval_loss": 0.3879886269569397,
186
+ "eval_runtime": 128.7058,
187
+ "eval_samples_per_second": 457.151,
188
+ "eval_steps_per_second": 7.148,
189
+ "step": 88260
190
+ },
191
+ {
192
+ "epoch": 13.0,
193
+ "grad_norm": 0.1965673714876175,
194
+ "learning_rate": 7.4e-06,
195
+ "loss": 0.2184,
196
+ "step": 95615
197
+ },
198
+ {
199
+ "epoch": 13.0,
200
+ "eval_loss": 0.38754525780677795,
201
+ "eval_runtime": 128.7201,
202
+ "eval_samples_per_second": 457.1,
203
+ "eval_steps_per_second": 7.147,
204
+ "step": 95615
205
+ },
206
+ {
207
+ "epoch": 14.0,
208
+ "grad_norm": 0.22494736313819885,
209
+ "learning_rate": 7.2000000000000005e-06,
210
+ "loss": 0.2178,
211
+ "step": 102970
212
+ },
213
+ {
214
+ "epoch": 14.0,
215
+ "eval_loss": 0.3873791992664337,
216
+ "eval_runtime": 128.7144,
217
+ "eval_samples_per_second": 457.121,
218
+ "eval_steps_per_second": 7.148,
219
+ "step": 102970
220
+ },
221
+ {
222
+ "epoch": 15.0,
223
+ "grad_norm": 0.32273635268211365,
224
+ "learning_rate": 7e-06,
225
+ "loss": 0.2172,
226
+ "step": 110325
227
+ },
228
+ {
229
+ "epoch": 15.0,
230
+ "eval_loss": 0.38621675968170166,
231
+ "eval_runtime": 128.7027,
232
+ "eval_samples_per_second": 457.162,
233
+ "eval_steps_per_second": 7.148,
234
+ "step": 110325
235
+ },
236
+ {
237
+ "epoch": 16.0,
238
+ "grad_norm": 0.17209158837795258,
239
+ "learning_rate": 6.800000000000001e-06,
240
+ "loss": 0.2167,
241
+ "step": 117680
242
+ },
243
+ {
244
+ "epoch": 16.0,
245
+ "eval_loss": 0.3857288658618927,
246
+ "eval_runtime": 128.6791,
247
+ "eval_samples_per_second": 457.246,
248
+ "eval_steps_per_second": 7.15,
249
+ "step": 117680
250
+ },
251
+ {
252
+ "epoch": 17.0,
253
+ "grad_norm": 0.27914878726005554,
254
+ "learning_rate": 6.600000000000001e-06,
255
+ "loss": 0.2162,
256
+ "step": 125035
257
+ },
258
+ {
259
+ "epoch": 17.0,
260
+ "eval_loss": 0.3846561014652252,
261
+ "eval_runtime": 128.6869,
262
+ "eval_samples_per_second": 457.218,
263
+ "eval_steps_per_second": 7.149,
264
+ "step": 125035
265
+ },
266
+ {
267
+ "epoch": 18.0,
268
+ "grad_norm": 0.23364859819412231,
269
+ "learning_rate": 6.4000000000000006e-06,
270
+ "loss": 0.2157,
271
+ "step": 132390
272
+ },
273
+ {
274
+ "epoch": 18.0,
275
+ "eval_loss": 0.3847697675228119,
276
+ "eval_runtime": 128.7148,
277
+ "eval_samples_per_second": 457.119,
278
+ "eval_steps_per_second": 7.148,
279
+ "step": 132390
280
+ },
281
+ {
282
+ "epoch": 19.0,
283
+ "grad_norm": 0.172671377658844,
284
+ "learning_rate": 6.200000000000001e-06,
285
+ "loss": 0.2152,
286
+ "step": 139745
287
+ },
288
+ {
289
+ "epoch": 19.0,
290
+ "eval_loss": 0.38386115431785583,
291
+ "eval_runtime": 128.6991,
292
+ "eval_samples_per_second": 457.175,
293
+ "eval_steps_per_second": 7.148,
294
+ "step": 139745
295
+ },
296
+ {
297
+ "epoch": 20.0,
298
+ "grad_norm": 0.19780349731445312,
299
+ "learning_rate": 6e-06,
300
+ "loss": 0.2148,
301
+ "step": 147100
302
+ },
303
+ {
304
+ "epoch": 20.0,
305
+ "eval_loss": 0.3836727738380432,
306
+ "eval_runtime": 128.7294,
307
+ "eval_samples_per_second": 457.067,
308
+ "eval_steps_per_second": 7.147,
309
+ "step": 147100
310
+ },
311
+ {
312
+ "epoch": 21.0,
313
+ "grad_norm": 0.26560327410697937,
314
+ "learning_rate": 5.8e-06,
315
+ "loss": 0.2144,
316
+ "step": 154455
317
+ },
318
+ {
319
+ "epoch": 21.0,
320
+ "eval_loss": 0.3844703435897827,
321
+ "eval_runtime": 128.7099,
322
+ "eval_samples_per_second": 457.137,
323
+ "eval_steps_per_second": 7.148,
324
+ "step": 154455
325
+ },
326
+ {
327
+ "epoch": 22.0,
328
+ "grad_norm": 0.22332455217838287,
329
+ "learning_rate": 5.600000000000001e-06,
330
+ "loss": 0.2139,
331
+ "step": 161810
332
+ },
333
+ {
334
+ "epoch": 22.0,
335
+ "eval_loss": 0.3834006190299988,
336
+ "eval_runtime": 128.7123,
337
+ "eval_samples_per_second": 457.128,
338
+ "eval_steps_per_second": 7.148,
339
+ "step": 161810
340
+ },
341
+ {
342
+ "epoch": 23.0,
343
+ "grad_norm": 0.2586681842803955,
344
+ "learning_rate": 5.400000000000001e-06,
345
+ "loss": 0.2136,
346
+ "step": 169165
347
+ },
348
+ {
349
+ "epoch": 23.0,
350
+ "eval_loss": 0.38348647952079773,
351
+ "eval_runtime": 128.6691,
352
+ "eval_samples_per_second": 457.281,
353
+ "eval_steps_per_second": 7.15,
354
+ "step": 169165
355
+ },
356
+ {
357
+ "epoch": 24.0,
358
+ "grad_norm": 0.2845219075679779,
359
+ "learning_rate": 5.2e-06,
360
+ "loss": 0.2132,
361
+ "step": 176520
362
+ },
363
+ {
364
+ "epoch": 24.0,
365
+ "eval_loss": 0.3828953504562378,
366
+ "eval_runtime": 128.7332,
367
+ "eval_samples_per_second": 457.054,
368
+ "eval_steps_per_second": 7.147,
369
+ "step": 176520
370
+ },
371
+ {
372
+ "epoch": 25.0,
373
+ "grad_norm": 0.27165067195892334,
374
+ "learning_rate": 5e-06,
375
+ "loss": 0.2128,
376
+ "step": 183875
377
+ },
378
+ {
379
+ "epoch": 25.0,
380
+ "eval_loss": 0.38219162821769714,
381
+ "eval_runtime": 128.7152,
382
+ "eval_samples_per_second": 457.118,
383
+ "eval_steps_per_second": 7.148,
384
+ "step": 183875
385
+ },
386
+ {
387
+ "epoch": 26.0,
388
+ "grad_norm": 0.23254956305027008,
389
+ "learning_rate": 4.800000000000001e-06,
390
+ "loss": 0.2125,
391
+ "step": 191230
392
+ },
393
+ {
394
+ "epoch": 26.0,
395
+ "eval_loss": 0.38233497738838196,
396
+ "eval_runtime": 128.8883,
397
+ "eval_samples_per_second": 456.504,
398
+ "eval_steps_per_second": 7.138,
399
+ "step": 191230
400
+ },
401
+ {
402
+ "epoch": 27.0,
403
+ "grad_norm": 0.2750227749347687,
404
+ "learning_rate": 4.600000000000001e-06,
405
+ "loss": 0.2122,
406
+ "step": 198585
407
+ },
408
+ {
409
+ "epoch": 27.0,
410
+ "eval_loss": 0.3827952444553375,
411
+ "eval_runtime": 128.7247,
412
+ "eval_samples_per_second": 457.084,
413
+ "eval_steps_per_second": 7.147,
414
+ "step": 198585
415
+ },
416
+ {
417
+ "epoch": 28.0,
418
+ "grad_norm": 0.3043362498283386,
419
+ "learning_rate": 4.4e-06,
420
+ "loss": 0.2118,
421
+ "step": 205940
422
+ },
423
+ {
424
+ "epoch": 28.0,
425
+ "eval_loss": 0.38314878940582275,
426
+ "eval_runtime": 128.7576,
427
+ "eval_samples_per_second": 456.967,
428
+ "eval_steps_per_second": 7.145,
429
+ "step": 205940
430
+ },
431
+ {
432
+ "epoch": 29.0,
433
+ "grad_norm": 0.22233448922634125,
434
+ "learning_rate": 4.2000000000000004e-06,
435
+ "loss": 0.2115,
436
+ "step": 213295
437
+ },
438
+ {
439
+ "epoch": 29.0,
440
+ "eval_loss": 0.3818701505661011,
441
+ "eval_runtime": 128.736,
442
+ "eval_samples_per_second": 457.044,
443
+ "eval_steps_per_second": 7.146,
444
+ "step": 213295
445
+ },
446
+ {
447
+ "epoch": 30.0,
448
+ "grad_norm": 0.26145127415657043,
449
+ "learning_rate": 4.000000000000001e-06,
450
+ "loss": 0.2112,
451
+ "step": 220650
452
+ },
453
+ {
454
+ "epoch": 30.0,
455
+ "eval_loss": 0.38293564319610596,
456
+ "eval_runtime": 128.9344,
457
+ "eval_samples_per_second": 456.341,
458
+ "eval_steps_per_second": 7.135,
459
+ "step": 220650
460
+ },
461
+ {
462
+ "epoch": 31.0,
463
+ "grad_norm": 0.2705918252468109,
464
+ "learning_rate": 3.8000000000000005e-06,
465
+ "loss": 0.211,
466
+ "step": 228005
467
+ },
468
+ {
469
+ "epoch": 31.0,
470
+ "eval_loss": 0.3823812007904053,
471
+ "eval_runtime": 128.8218,
472
+ "eval_samples_per_second": 456.739,
473
+ "eval_steps_per_second": 7.142,
474
+ "step": 228005
475
+ },
476
+ {
477
+ "epoch": 32.0,
478
+ "grad_norm": 0.2663235366344452,
479
+ "learning_rate": 3.6000000000000003e-06,
480
+ "loss": 0.2107,
481
+ "step": 235360
482
+ },
483
+ {
484
+ "epoch": 32.0,
485
+ "eval_loss": 0.382473886013031,
486
+ "eval_runtime": 128.7309,
487
+ "eval_samples_per_second": 457.062,
488
+ "eval_steps_per_second": 7.147,
489
+ "step": 235360
490
+ },
491
+ {
492
+ "epoch": 33.0,
493
+ "grad_norm": 0.23493929207324982,
494
+ "learning_rate": 3.4000000000000005e-06,
495
+ "loss": 0.2104,
496
+ "step": 242715
497
+ },
498
+ {
499
+ "epoch": 33.0,
500
+ "eval_loss": 0.3834179639816284,
501
+ "eval_runtime": 128.7424,
502
+ "eval_samples_per_second": 457.021,
503
+ "eval_steps_per_second": 7.146,
504
+ "step": 242715
505
+ },
506
+ {
507
+ "epoch": 34.0,
508
+ "grad_norm": 0.2235766053199768,
509
+ "learning_rate": 3.2000000000000003e-06,
510
+ "loss": 0.2102,
511
+ "step": 250070
512
+ },
513
+ {
514
+ "epoch": 34.0,
515
+ "eval_loss": 0.3825724124908447,
516
+ "eval_runtime": 128.7687,
517
+ "eval_samples_per_second": 456.928,
518
+ "eval_steps_per_second": 7.145,
519
+ "step": 250070
520
+ },
521
+ {
522
+ "epoch": 35.0,
523
+ "grad_norm": 0.2881753742694855,
524
+ "learning_rate": 3e-06,
525
+ "loss": 0.2099,
526
+ "step": 257425
527
+ },
528
+ {
529
+ "epoch": 35.0,
530
+ "eval_loss": 0.3824039697647095,
531
+ "eval_runtime": 133.1919,
532
+ "eval_samples_per_second": 441.754,
533
+ "eval_steps_per_second": 6.907,
534
+ "step": 257425
535
+ },
536
+ {
537
+ "epoch": 36.0,
538
+ "grad_norm": 0.35670992732048035,
539
+ "learning_rate": 2.8000000000000003e-06,
540
+ "loss": 0.2097,
541
+ "step": 264780
542
+ },
543
+ {
544
+ "epoch": 36.0,
545
+ "eval_loss": 0.38277488946914673,
546
+ "eval_runtime": 128.7343,
547
+ "eval_samples_per_second": 457.05,
548
+ "eval_steps_per_second": 7.147,
549
+ "step": 264780
550
+ },
551
+ {
552
+ "epoch": 37.0,
553
+ "grad_norm": 0.29673638939857483,
554
+ "learning_rate": 2.6e-06,
555
+ "loss": 0.2095,
556
+ "step": 272135
557
+ },
558
+ {
559
+ "epoch": 37.0,
560
+ "eval_loss": 0.38287386298179626,
561
+ "eval_runtime": 129.4264,
562
+ "eval_samples_per_second": 454.606,
563
+ "eval_steps_per_second": 7.108,
564
+ "step": 272135
565
+ },
566
+ {
567
+ "epoch": 38.0,
568
+ "grad_norm": 0.25621339678764343,
569
+ "learning_rate": 2.4000000000000003e-06,
570
+ "loss": 0.2093,
571
+ "step": 279490
572
+ },
573
+ {
574
+ "epoch": 38.0,
575
+ "eval_loss": 0.3827780485153198,
576
+ "eval_runtime": 129.087,
577
+ "eval_samples_per_second": 455.801,
578
+ "eval_steps_per_second": 7.127,
579
+ "step": 279490
580
+ },
581
+ {
582
+ "epoch": 39.0,
583
+ "grad_norm": 0.31819215416908264,
584
+ "learning_rate": 2.2e-06,
585
+ "loss": 0.2091,
586
+ "step": 286845
587
+ },
588
+ {
589
+ "epoch": 39.0,
590
+ "eval_loss": 0.3822120726108551,
591
+ "eval_runtime": 128.8612,
592
+ "eval_samples_per_second": 456.6,
593
+ "eval_steps_per_second": 7.139,
594
+ "step": 286845
595
+ },
596
+ {
597
+ "epoch": 40.0,
598
+ "grad_norm": 0.2761085033416748,
599
+ "learning_rate": 2.0000000000000003e-06,
600
+ "loss": 0.2089,
601
+ "step": 294200
602
+ },
603
+ {
604
+ "epoch": 40.0,
605
+ "eval_loss": 0.3824302554130554,
606
+ "eval_runtime": 129.0551,
607
+ "eval_samples_per_second": 455.914,
608
+ "eval_steps_per_second": 7.129,
609
+ "step": 294200
610
+ },
611
+ {
612
+ "epoch": 41.0,
613
+ "grad_norm": 0.27816739678382874,
614
+ "learning_rate": 1.8000000000000001e-06,
615
+ "loss": 0.2088,
616
+ "step": 301555
617
+ },
618
+ {
619
+ "epoch": 41.0,
620
+ "eval_loss": 0.38338372111320496,
621
+ "eval_runtime": 129.0432,
622
+ "eval_samples_per_second": 455.956,
623
+ "eval_steps_per_second": 7.129,
624
+ "step": 301555
625
+ },
626
+ {
627
+ "epoch": 42.0,
628
+ "grad_norm": 0.3370245695114136,
629
+ "learning_rate": 1.6000000000000001e-06,
630
+ "loss": 0.2086,
631
+ "step": 308910
632
+ },
633
+ {
634
+ "epoch": 42.0,
635
+ "eval_loss": 0.3826825022697449,
636
+ "eval_runtime": 129.0854,
637
+ "eval_samples_per_second": 455.807,
638
+ "eval_steps_per_second": 7.127,
639
+ "step": 308910
640
+ },
641
+ {
642
+ "epoch": 43.0,
643
+ "grad_norm": 0.23392541706562042,
644
+ "learning_rate": 1.4000000000000001e-06,
645
+ "loss": 0.2085,
646
+ "step": 316265
647
+ },
648
+ {
649
+ "epoch": 43.0,
650
+ "eval_loss": 0.382882684469223,
651
+ "eval_runtime": 128.8341,
652
+ "eval_samples_per_second": 456.696,
653
+ "eval_steps_per_second": 7.141,
654
+ "step": 316265
655
+ },
656
+ {
657
+ "epoch": 44.0,
658
+ "grad_norm": 0.2567419409751892,
659
+ "learning_rate": 1.2000000000000002e-06,
660
+ "loss": 0.2083,
661
+ "step": 323620
662
+ },
663
+ {
664
+ "epoch": 44.0,
665
+ "eval_loss": 0.3828926682472229,
666
+ "eval_runtime": 129.1671,
667
+ "eval_samples_per_second": 455.518,
668
+ "eval_steps_per_second": 7.123,
669
+ "step": 323620
670
+ },
671
+ {
672
+ "epoch": 45.0,
673
+ "grad_norm": 0.22591634094715118,
674
+ "learning_rate": 1.0000000000000002e-06,
675
+ "loss": 0.2082,
676
+ "step": 330975
677
+ },
678
+ {
679
+ "epoch": 45.0,
680
+ "eval_loss": 0.3830114006996155,
681
+ "eval_runtime": 128.7432,
682
+ "eval_samples_per_second": 457.018,
683
+ "eval_steps_per_second": 7.146,
684
+ "step": 330975
685
+ },
686
+ {
687
+ "epoch": 46.0,
688
+ "grad_norm": 0.310523122549057,
689
+ "learning_rate": 8.000000000000001e-07,
690
+ "loss": 0.2081,
691
+ "step": 338330
692
+ },
693
+ {
694
+ "epoch": 46.0,
695
+ "eval_loss": 0.38255205750465393,
696
+ "eval_runtime": 129.0345,
697
+ "eval_samples_per_second": 455.987,
698
+ "eval_steps_per_second": 7.13,
699
+ "step": 338330
700
+ },
701
+ {
702
+ "epoch": 47.0,
703
+ "grad_norm": 0.278604120016098,
704
+ "learning_rate": 6.000000000000001e-07,
705
+ "loss": 0.208,
706
+ "step": 345685
707
+ },
708
+ {
709
+ "epoch": 47.0,
710
+ "eval_loss": 0.3827236294746399,
711
+ "eval_runtime": 129.0063,
712
+ "eval_samples_per_second": 456.086,
713
+ "eval_steps_per_second": 7.131,
714
+ "step": 345685
715
+ },
716
+ {
717
+ "epoch": 48.0,
718
+ "grad_norm": 0.2605680227279663,
719
+ "learning_rate": 4.0000000000000003e-07,
720
+ "loss": 0.2079,
721
+ "step": 353040
722
+ },
723
+ {
724
+ "epoch": 48.0,
725
+ "eval_loss": 0.38287004828453064,
726
+ "eval_runtime": 128.8174,
727
+ "eval_samples_per_second": 456.755,
728
+ "eval_steps_per_second": 7.142,
729
+ "step": 353040
730
+ },
731
+ {
732
+ "epoch": 49.0,
733
+ "grad_norm": 0.3245304822921753,
734
+ "learning_rate": 2.0000000000000002e-07,
735
+ "loss": 0.2078,
736
+ "step": 360395
737
+ },
738
+ {
739
+ "epoch": 49.0,
740
+ "eval_loss": 0.38298800587654114,
741
+ "eval_runtime": 128.8343,
742
+ "eval_samples_per_second": 456.695,
743
+ "eval_steps_per_second": 7.141,
744
+ "step": 360395
745
+ },
746
+ {
747
+ "epoch": 50.0,
748
+ "grad_norm": 0.3787703812122345,
749
+ "learning_rate": 0.0,
750
+ "loss": 0.2078,
751
+ "step": 367750
752
+ },
753
+ {
754
+ "epoch": 50.0,
755
+ "eval_loss": 0.3828030824661255,
756
+ "eval_runtime": 128.8773,
757
+ "eval_samples_per_second": 456.543,
758
+ "eval_steps_per_second": 7.139,
759
+ "step": 367750
760
+ }
761
+ ],
762
+ "logging_steps": 500,
763
+ "max_steps": 367750,
764
+ "num_input_tokens_seen": 0,
765
+ "num_train_epochs": 50,
766
+ "save_steps": 500,
767
+ "total_flos": 2.9088945658368e+18,
768
+ "train_batch_size": 64,
769
+ "trial_name": null,
770
+ "trial_params": null
771
+ }
heavy2light_final_checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7124d079b776b1b2e4b5f5c7349db9adba489f9d0941380d4211927328df0050
3
+ size 7224
heavy2light_final_checkpoint/vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<|pad|>":0,"<|unk|>":1,"<|startoftext|>":2,"<|endoftext|>":3,"<|mask|>":4,"A":5,"C":6,"D":7,"E":8,"F":9,"G":10,"H":11,"I":12,"K":13,"L":14,"M":15,"N":16,"P":17,"Q":18,"R":19,"S":20,"T":21,"V":22,"W":23,"Y":24}
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12b64c24056147029b08f128d4183c3ce5b39d82f62561e34ff6fb06ff697b6e
3
+ size 519367496