kurogane committed on
Commit
75dc0de
·
verified ·
1 Parent(s): 7157f70

Upload 13 files

Browse files
chat_template.jinja ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ {%- if messages[0]['role'] == 'system' %}
3
+ {%- set system_message = messages[0]['content'] | trim + ' ' %}
4
+ {%- set messages = messages[1:] %}
5
+ {%- else %}
6
+ {%- set system_message = 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions. ' %}
7
+ {%- endif %}
8
+
9
+ {{- system_message }}
10
+ {%- for message in messages %}
11
+ {%- if message['role'] == 'user' %}
12
+ {{- 'USER: ' + message['content'] | trim }}
13
+ {%- elif message['role'] == 'assistant' %}
14
+ {{- ' ASSISTANT: ' + message['content'] | trim + '</s>' }}
15
+ {%- endif %}
16
+ {%- endfor %}
17
+
18
+ {%- if add_generation_prompt %}
19
+ {{- ' ASSISTANT:' }}
20
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Phi3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "attn_implementation": "sdpa",
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_phi3.Phi3Config",
10
+ "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
11
+ },
12
+ "bos_token_id": 1,
13
+ "embd_pdrop": 0.0,
14
+ "eos_token_id": 2,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 256,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 640,
19
+ "max_position_embeddings": 256,
20
+ "model_type": "phi3",
21
+ "num_attention_heads": 8,
22
+ "num_hidden_layers": 4,
23
+ "num_key_value_heads": 2,
24
+ "original_max_position_embeddings": 256,
25
+ "pad_token_id": 2,
26
+ "partial_rotary_factor": 1.0,
27
+ "resid_pdrop": 0.0,
28
+ "rms_norm_eps": 1e-05,
29
+ "rope_scaling": null,
30
+ "rope_theta": 10000.0,
31
+ "sliding_window": 255,
32
+ "tie_word_embeddings": true,
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.55.4",
35
+ "use_cache": false,
36
+ "vocab_size": 48000
37
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.55.4",
7
+ "use_cache": false
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40308c9e312195c8aa95d76ef9b89f07c6f22d69bba34f6880aea819e8ab0459
3
+ size 59649832
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4eefc8d82e04a531a4bfa44a34f912c0cf6584bf0102c19a4cf276fc4405bdcd
3
+ size 119317089
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b526e1a6193bbef5c713d1f3b86c07cf14f47ab0393283cef43a72643daf1bc
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56ed198070f280e6b44fe3a1d7a99eccf794d35713b12135e6bcb287a7e38ed2
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<unk>",
4
+ "<s>",
5
+ "</s>"
6
+ ],
7
+ "bos_token": {
8
+ "content": "<s>",
9
+ "lstrip": false,
10
+ "normalized": true,
11
+ "rstrip": false,
12
+ "single_word": false
13
+ },
14
+ "eos_token": {
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "pad_token": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "unk_token": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ }
35
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7978987401ef447724ded0544d048831954b7517a96555ce1593149e3678b6dc
3
+ size 755169
tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [
32
+ "<unk>",
33
+ "<s>",
34
+ "</s>"
35
+ ],
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "</s>",
39
+ "extra_special_tokens": {},
40
+ "legacy": true,
41
+ "model_max_length": 1000000000000000019884624838656,
42
+ "pad_token": "</s>",
43
+ "sp_model_kwargs": {},
44
+ "spaces_between_special_tokens": false,
45
+ "tokenizer_class": "LlamaTokenizer",
46
+ "unk_token": "<unk>",
47
+ "use_default_system_prompt": true
48
+ }
trainer_state.json ADDED
@@ -0,0 +1,970 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2094240,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.009550003820001528,
14
+ "grad_norm": 1.1688205003738403,
15
+ "learning_rate": 0.00028647758200830823,
16
+ "loss": 4.7726,
17
+ "mean_token_accuracy": 0.29613289506579993,
18
+ "num_tokens": 716800000.0,
19
+ "step": 20000
20
+ },
21
+ {
22
+ "epoch": 0.019100007640003056,
23
+ "grad_norm": 1.0122355222702026,
24
+ "learning_rate": 0.00029724265264455594,
25
+ "loss": 3.5064,
26
+ "mean_token_accuracy": 0.39106567760258915,
27
+ "num_tokens": 1433600000.0,
28
+ "step": 40000
29
+ },
30
+ {
31
+ "epoch": 0.028650011460004583,
32
+ "grad_norm": 0.9957022070884705,
33
+ "learning_rate": 0.0002943487112555509,
34
+ "loss": 3.4122,
35
+ "mean_token_accuracy": 0.40039802242666483,
36
+ "num_tokens": 2150400000.0,
37
+ "step": 60000
38
+ },
39
+ {
40
+ "epoch": 0.03820001528000611,
41
+ "grad_norm": 1.2025322914123535,
42
+ "learning_rate": 0.00029145476986654586,
43
+ "loss": 3.3766,
44
+ "mean_token_accuracy": 0.404039168266952,
45
+ "num_tokens": 2867200000.0,
46
+ "step": 80000
47
+ },
48
+ {
49
+ "epoch": 0.04775001910000764,
50
+ "grad_norm": 1.0380007028579712,
51
+ "learning_rate": 0.0002885608284775408,
52
+ "loss": 3.3565,
53
+ "mean_token_accuracy": 0.40606449462026356,
54
+ "num_tokens": 3584000000.0,
55
+ "step": 100000
56
+ },
57
+ {
58
+ "epoch": 0.057300022920009165,
59
+ "grad_norm": 1.1203994750976562,
60
+ "learning_rate": 0.0002856668870885358,
61
+ "loss": 3.3421,
62
+ "mean_token_accuracy": 0.40754339938014744,
63
+ "num_tokens": 4300800000.0,
64
+ "step": 120000
65
+ },
66
+ {
67
+ "epoch": 0.0668500267400107,
68
+ "grad_norm": 1.1507455110549927,
69
+ "learning_rate": 0.0002827729456995307,
70
+ "loss": 3.3327,
71
+ "mean_token_accuracy": 0.40846395269036295,
72
+ "num_tokens": 5017600000.0,
73
+ "step": 140000
74
+ },
75
+ {
76
+ "epoch": 0.07640003056001222,
77
+ "grad_norm": 1.2616394758224487,
78
+ "learning_rate": 0.0002798790043105257,
79
+ "loss": 3.3239,
80
+ "mean_token_accuracy": 0.40932240092903377,
81
+ "num_tokens": 5734400000.0,
82
+ "step": 160000
83
+ },
84
+ {
85
+ "epoch": 0.08595003438001375,
86
+ "grad_norm": 1.2600022554397583,
87
+ "learning_rate": 0.00027698506292152063,
88
+ "loss": 3.3172,
89
+ "mean_token_accuracy": 0.41004598908573386,
90
+ "num_tokens": 6451200000.0,
91
+ "step": 180000
92
+ },
93
+ {
94
+ "epoch": 0.09550003820001528,
95
+ "grad_norm": 1.1764442920684814,
96
+ "learning_rate": 0.00027409112153251557,
97
+ "loss": 3.3119,
98
+ "mean_token_accuracy": 0.410561589974165,
99
+ "num_tokens": 7168000000.0,
100
+ "step": 200000
101
+ },
102
+ {
103
+ "epoch": 0.1050500420200168,
104
+ "grad_norm": 1.2408899068832397,
105
+ "learning_rate": 0.00027119718014351055,
106
+ "loss": 3.3061,
107
+ "mean_token_accuracy": 0.411199274918437,
108
+ "num_tokens": 7884800000.0,
109
+ "step": 220000
110
+ },
111
+ {
112
+ "epoch": 0.11460004584001833,
113
+ "grad_norm": 1.2953583002090454,
114
+ "learning_rate": 0.0002683032387545055,
115
+ "loss": 3.3012,
116
+ "mean_token_accuracy": 0.41174386738538743,
117
+ "num_tokens": 8601600000.0,
118
+ "step": 240000
119
+ },
120
+ {
121
+ "epoch": 0.12415004966001987,
122
+ "grad_norm": 1.3780796527862549,
123
+ "learning_rate": 0.0002654092973655004,
124
+ "loss": 3.2974,
125
+ "mean_token_accuracy": 0.41211788419932127,
126
+ "num_tokens": 9318400000.0,
127
+ "step": 260000
128
+ },
129
+ {
130
+ "epoch": 0.1337000534800214,
131
+ "grad_norm": 1.2816623449325562,
132
+ "learning_rate": 0.0002625153559764954,
133
+ "loss": 3.2952,
134
+ "mean_token_accuracy": 0.4122900270193815,
135
+ "num_tokens": 10035200000.0,
136
+ "step": 280000
137
+ },
138
+ {
139
+ "epoch": 0.1432500573000229,
140
+ "grad_norm": 1.462896466255188,
141
+ "learning_rate": 0.00025962141458749034,
142
+ "loss": 3.2898,
143
+ "mean_token_accuracy": 0.4128769779801369,
144
+ "num_tokens": 10752000000.0,
145
+ "step": 300000
146
+ },
147
+ {
148
+ "epoch": 0.15280006112002445,
149
+ "grad_norm": 1.4050343036651611,
150
+ "learning_rate": 0.0002567274731984853,
151
+ "loss": 3.2866,
152
+ "mean_token_accuracy": 0.4132985157236457,
153
+ "num_tokens": 11468800000.0,
154
+ "step": 320000
155
+ },
156
+ {
157
+ "epoch": 0.162350064940026,
158
+ "grad_norm": 1.3888232707977295,
159
+ "learning_rate": 0.00025383353180948026,
160
+ "loss": 3.2839,
161
+ "mean_token_accuracy": 0.41356196113973853,
162
+ "num_tokens": 12185600000.0,
163
+ "step": 340000
164
+ },
165
+ {
166
+ "epoch": 0.1719000687600275,
167
+ "grad_norm": 1.3985786437988281,
168
+ "learning_rate": 0.0002509395904204752,
169
+ "loss": 3.2824,
170
+ "mean_token_accuracy": 0.4136822006031871,
171
+ "num_tokens": 12902400000.0,
172
+ "step": 360000
173
+ },
174
+ {
175
+ "epoch": 0.18145007258002904,
176
+ "grad_norm": 1.5853444337844849,
177
+ "learning_rate": 0.00024804564903147013,
178
+ "loss": 3.2796,
179
+ "mean_token_accuracy": 0.41403463195711376,
180
+ "num_tokens": 13619200000.0,
181
+ "step": 380000
182
+ },
183
+ {
184
+ "epoch": 0.19100007640003056,
185
+ "grad_norm": 1.5639410018920898,
186
+ "learning_rate": 0.00024515170764246506,
187
+ "loss": 3.2773,
188
+ "mean_token_accuracy": 0.4142179542243481,
189
+ "num_tokens": 14336000000.0,
190
+ "step": 400000
191
+ },
192
+ {
193
+ "epoch": 0.2005500802200321,
194
+ "grad_norm": 1.4351433515548706,
195
+ "learning_rate": 0.00024225776625346005,
196
+ "loss": 3.2752,
197
+ "mean_token_accuracy": 0.41444926087111233,
198
+ "num_tokens": 15052800000.0,
199
+ "step": 420000
200
+ },
201
+ {
202
+ "epoch": 0.2101000840400336,
203
+ "grad_norm": 1.3414918184280396,
204
+ "learning_rate": 0.00023936382486445498,
205
+ "loss": 3.2725,
206
+ "mean_token_accuracy": 0.41476476649940014,
207
+ "num_tokens": 15769600000.0,
208
+ "step": 440000
209
+ },
210
+ {
211
+ "epoch": 0.21965008786003515,
212
+ "grad_norm": 1.3302001953125,
213
+ "learning_rate": 0.00023646988347544994,
214
+ "loss": 3.2696,
215
+ "mean_token_accuracy": 0.4150806698143482,
216
+ "num_tokens": 16486400000.0,
217
+ "step": 460000
218
+ },
219
+ {
220
+ "epoch": 0.22920009168003666,
221
+ "grad_norm": 1.3756541013717651,
222
+ "learning_rate": 0.00023357594208644488,
223
+ "loss": 3.2683,
224
+ "mean_token_accuracy": 0.4152164765149355,
225
+ "num_tokens": 17203200000.0,
226
+ "step": 480000
227
+ },
228
+ {
229
+ "epoch": 0.2387500955000382,
230
+ "grad_norm": 1.330269694328308,
231
+ "learning_rate": 0.00023068200069743984,
232
+ "loss": 3.2665,
233
+ "mean_token_accuracy": 0.4154003126785159,
234
+ "num_tokens": 17920000000.0,
235
+ "step": 500000
236
+ },
237
+ {
238
+ "epoch": 0.24830009932003974,
239
+ "grad_norm": 1.7756025791168213,
240
+ "learning_rate": 0.00022778805930843483,
241
+ "loss": 3.2648,
242
+ "mean_token_accuracy": 0.4155686768323183,
243
+ "num_tokens": 18636800000.0,
244
+ "step": 520000
245
+ },
246
+ {
247
+ "epoch": 0.25785010314004125,
248
+ "grad_norm": 1.6679662466049194,
249
+ "learning_rate": 0.00022489411791942976,
250
+ "loss": 3.2617,
251
+ "mean_token_accuracy": 0.41595828180462124,
252
+ "num_tokens": 19353600000.0,
253
+ "step": 540000
254
+ },
255
+ {
256
+ "epoch": 0.2674001069600428,
257
+ "grad_norm": 1.5160603523254395,
258
+ "learning_rate": 0.00022200017653042472,
259
+ "loss": 3.2601,
260
+ "mean_token_accuracy": 0.4160811348050833,
261
+ "num_tokens": 716800000.0,
262
+ "step": 560000
263
+ },
264
+ {
265
+ "epoch": 0.27695011078004433,
266
+ "grad_norm": 2.128943681716919,
267
+ "learning_rate": 0.00021910623514141968,
268
+ "loss": 3.259,
269
+ "mean_token_accuracy": 0.41622272021621465,
270
+ "num_tokens": 1433600000.0,
271
+ "step": 580000
272
+ },
273
+ {
274
+ "epoch": 0.2865001146000458,
275
+ "grad_norm": 1.550946593284607,
276
+ "learning_rate": 0.00021621229375241461,
277
+ "loss": 3.2555,
278
+ "mean_token_accuracy": 0.4166563056409359,
279
+ "num_tokens": 2150400000.0,
280
+ "step": 600000
281
+ },
282
+ {
283
+ "epoch": 0.29605011842004736,
284
+ "grad_norm": 1.5084576606750488,
285
+ "learning_rate": 0.00021331835236340957,
286
+ "loss": 3.2543,
287
+ "mean_token_accuracy": 0.4167668910384178,
288
+ "num_tokens": 2867200000.0,
289
+ "step": 620000
290
+ },
291
+ {
292
+ "epoch": 0.3056001222400489,
293
+ "grad_norm": 1.7308917045593262,
294
+ "learning_rate": 0.00021042441097440454,
295
+ "loss": 3.2531,
296
+ "mean_token_accuracy": 0.416923209066689,
297
+ "num_tokens": 3584000000.0,
298
+ "step": 640000
299
+ },
300
+ {
301
+ "epoch": 0.31515012606005044,
302
+ "grad_norm": 1.6496219635009766,
303
+ "learning_rate": 0.00020753046958539947,
304
+ "loss": 3.2503,
305
+ "mean_token_accuracy": 0.4171900423392653,
306
+ "num_tokens": 4300800000.0,
307
+ "step": 660000
308
+ },
309
+ {
310
+ "epoch": 0.324700129880052,
311
+ "grad_norm": 1.911421537399292,
312
+ "learning_rate": 0.00020463652819639443,
313
+ "loss": 3.2494,
314
+ "mean_token_accuracy": 0.4172973266944289,
315
+ "num_tokens": 5017600000.0,
316
+ "step": 680000
317
+ },
318
+ {
319
+ "epoch": 0.33425013370005346,
320
+ "grad_norm": 1.625441074371338,
321
+ "learning_rate": 0.00020174258680738936,
322
+ "loss": 3.2485,
323
+ "mean_token_accuracy": 0.4173892762258649,
324
+ "num_tokens": 5734400000.0,
325
+ "step": 700000
326
+ },
327
+ {
328
+ "epoch": 0.343800137520055,
329
+ "grad_norm": 1.570096731185913,
330
+ "learning_rate": 0.00019884864541838432,
331
+ "loss": 3.2467,
332
+ "mean_token_accuracy": 0.4176161937117577,
333
+ "num_tokens": 6451200000.0,
334
+ "step": 720000
335
+ },
336
+ {
337
+ "epoch": 0.35335014134005654,
338
+ "grad_norm": 1.5831801891326904,
339
+ "learning_rate": 0.00019595470402937928,
340
+ "loss": 3.2441,
341
+ "mean_token_accuracy": 0.41793804090470077,
342
+ "num_tokens": 7168000000.0,
343
+ "step": 740000
344
+ },
345
+ {
346
+ "epoch": 0.3629001451600581,
347
+ "grad_norm": 1.6359102725982666,
348
+ "learning_rate": 0.00019306076264037422,
349
+ "loss": 3.2439,
350
+ "mean_token_accuracy": 0.41790355779081584,
351
+ "num_tokens": 7884800000.0,
352
+ "step": 760000
353
+ },
354
+ {
355
+ "epoch": 0.37245014898005957,
356
+ "grad_norm": 1.9125442504882812,
357
+ "learning_rate": 0.00019016682125136918,
358
+ "loss": 3.2416,
359
+ "mean_token_accuracy": 0.4181554499194026,
360
+ "num_tokens": 8601600000.0,
361
+ "step": 780000
362
+ },
363
+ {
364
+ "epoch": 0.3820001528000611,
365
+ "grad_norm": 1.6356027126312256,
366
+ "learning_rate": 0.0001872728798623641,
367
+ "loss": 3.2392,
368
+ "mean_token_accuracy": 0.41842390780746935,
369
+ "num_tokens": 9318400000.0,
370
+ "step": 800000
371
+ },
372
+ {
373
+ "epoch": 0.39155015662006265,
374
+ "grad_norm": 1.69579017162323,
375
+ "learning_rate": 0.00018437893847335907,
376
+ "loss": 3.2381,
377
+ "mean_token_accuracy": 0.41857809690237047,
378
+ "num_tokens": 10035200000.0,
379
+ "step": 820000
380
+ },
381
+ {
382
+ "epoch": 0.4011001604400642,
383
+ "grad_norm": 1.7878586053848267,
384
+ "learning_rate": 0.00018148499708435403,
385
+ "loss": 3.2375,
386
+ "mean_token_accuracy": 0.41865148201435803,
387
+ "num_tokens": 10752000000.0,
388
+ "step": 840000
389
+ },
390
+ {
391
+ "epoch": 0.41065016426006573,
392
+ "grad_norm": 1.7767938375473022,
393
+ "learning_rate": 0.00017859105569534897,
394
+ "loss": 3.2353,
395
+ "mean_token_accuracy": 0.41889262205660344,
396
+ "num_tokens": 11468800000.0,
397
+ "step": 860000
398
+ },
399
+ {
400
+ "epoch": 0.4202001680800672,
401
+ "grad_norm": 1.6678400039672852,
402
+ "learning_rate": 0.00017569711430634393,
403
+ "loss": 3.2337,
404
+ "mean_token_accuracy": 0.41908056329786775,
405
+ "num_tokens": 12185600000.0,
406
+ "step": 880000
407
+ },
408
+ {
409
+ "epoch": 0.42975017190006876,
410
+ "grad_norm": 1.862349033355713,
411
+ "learning_rate": 0.00017280317291733891,
412
+ "loss": 3.2329,
413
+ "mean_token_accuracy": 0.4191736038953066,
414
+ "num_tokens": 12902400000.0,
415
+ "step": 900000
416
+ },
417
+ {
418
+ "epoch": 0.4393001757200703,
419
+ "grad_norm": 1.8526560068130493,
420
+ "learning_rate": 0.00016990923152833385,
421
+ "loss": 3.232,
422
+ "mean_token_accuracy": 0.4192189232364297,
423
+ "num_tokens": 13619200000.0,
424
+ "step": 920000
425
+ },
426
+ {
427
+ "epoch": 0.44885017954007184,
428
+ "grad_norm": 1.7189236879348755,
429
+ "learning_rate": 0.0001670152901393288,
430
+ "loss": 3.2302,
431
+ "mean_token_accuracy": 0.4194715451017022,
432
+ "num_tokens": 14336000000.0,
433
+ "step": 940000
434
+ },
435
+ {
436
+ "epoch": 0.4584001833600733,
437
+ "grad_norm": 2.0065693855285645,
438
+ "learning_rate": 0.00016412134875032377,
439
+ "loss": 3.2277,
440
+ "mean_token_accuracy": 0.419769259378314,
441
+ "num_tokens": 15052800000.0,
442
+ "step": 960000
443
+ },
444
+ {
445
+ "epoch": 0.46795018718007486,
446
+ "grad_norm": 1.647645354270935,
447
+ "learning_rate": 0.0001612274073613187,
448
+ "loss": 3.2272,
449
+ "mean_token_accuracy": 0.41980642325282097,
450
+ "num_tokens": 15769600000.0,
451
+ "step": 980000
452
+ },
453
+ {
454
+ "epoch": 0.4775001910000764,
455
+ "grad_norm": 1.8431377410888672,
456
+ "learning_rate": 0.00015833346597231366,
457
+ "loss": 3.2244,
458
+ "mean_token_accuracy": 0.42018424404114485,
459
+ "num_tokens": 16486400000.0,
460
+ "step": 1000000
461
+ },
462
+ {
463
+ "epoch": 0.48705019482007794,
464
+ "grad_norm": 1.8152481317520142,
465
+ "learning_rate": 0.0001554395245833086,
466
+ "loss": 3.2239,
467
+ "mean_token_accuracy": 0.42016951968967914,
468
+ "num_tokens": 17203200000.0,
469
+ "step": 1020000
470
+ },
471
+ {
472
+ "epoch": 0.4966001986400795,
473
+ "grad_norm": 1.979134440422058,
474
+ "learning_rate": 0.00015254558319430356,
475
+ "loss": 3.2226,
476
+ "mean_token_accuracy": 0.42036232096105813,
477
+ "num_tokens": 17920000000.0,
478
+ "step": 1040000
479
+ },
480
+ {
481
+ "epoch": 0.506150202460081,
482
+ "grad_norm": 2.2059521675109863,
483
+ "learning_rate": 0.00014965164180529852,
484
+ "loss": 3.2199,
485
+ "mean_token_accuracy": 0.4206514175161719,
486
+ "num_tokens": 18636800000.0,
487
+ "step": 1060000
488
+ },
489
+ {
490
+ "epoch": 0.5157002062800825,
491
+ "grad_norm": 2.0843093395233154,
492
+ "learning_rate": 0.00014675770041629345,
493
+ "loss": 3.2192,
494
+ "mean_token_accuracy": 0.42072975924313066,
495
+ "num_tokens": 19353600000.0,
496
+ "step": 1080000
497
+ },
498
+ {
499
+ "epoch": 0.525250210100084,
500
+ "grad_norm": 1.7924293279647827,
501
+ "learning_rate": 0.0001438637590272884,
502
+ "loss": 3.2178,
503
+ "mean_token_accuracy": 0.4209353769227862,
504
+ "num_tokens": 20070400000.0,
505
+ "step": 1100000
506
+ },
507
+ {
508
+ "epoch": 0.5348002139200856,
509
+ "grad_norm": 1.8186842203140259,
510
+ "learning_rate": 0.00014096981763828334,
511
+ "loss": 3.2166,
512
+ "mean_token_accuracy": 0.42104669906646014,
513
+ "num_tokens": 20787200000.0,
514
+ "step": 1120000
515
+ },
516
+ {
517
+ "epoch": 0.5443502177400871,
518
+ "grad_norm": 2.0781736373901367,
519
+ "learning_rate": 0.0001380758762492783,
520
+ "loss": 3.2147,
521
+ "mean_token_accuracy": 0.42130602815449236,
522
+ "num_tokens": 21504000000.0,
523
+ "step": 1140000
524
+ },
525
+ {
526
+ "epoch": 0.5539002215600887,
527
+ "grad_norm": 2.1371679306030273,
528
+ "learning_rate": 0.00013518193486027327,
529
+ "loss": 3.2135,
530
+ "mean_token_accuracy": 0.421437505723536,
531
+ "num_tokens": 22220800000.0,
532
+ "step": 1160000
533
+ },
534
+ {
535
+ "epoch": 0.5634502253800902,
536
+ "grad_norm": 1.9272387027740479,
537
+ "learning_rate": 0.00013228799347126823,
538
+ "loss": 3.2129,
539
+ "mean_token_accuracy": 0.42148565844893454,
540
+ "num_tokens": 22937600000.0,
541
+ "step": 1180000
542
+ },
543
+ {
544
+ "epoch": 0.5730002292000916,
545
+ "grad_norm": 1.975583553314209,
546
+ "learning_rate": 0.00012939405208226316,
547
+ "loss": 3.2101,
548
+ "mean_token_accuracy": 0.42179442739486694,
549
+ "num_tokens": 23654400000.0,
550
+ "step": 1200000
551
+ },
552
+ {
553
+ "epoch": 0.5825502330200932,
554
+ "grad_norm": 1.8845446109771729,
555
+ "learning_rate": 0.00012650011069325812,
556
+ "loss": 3.209,
557
+ "mean_token_accuracy": 0.42191650020480154,
558
+ "num_tokens": 24371200000.0,
559
+ "step": 1220000
560
+ },
561
+ {
562
+ "epoch": 0.5921002368400947,
563
+ "grad_norm": 1.8266512155532837,
564
+ "learning_rate": 0.00012360616930425308,
565
+ "loss": 3.2075,
566
+ "mean_token_accuracy": 0.4220881850525737,
567
+ "num_tokens": 25088000000.0,
568
+ "step": 1240000
569
+ },
570
+ {
571
+ "epoch": 0.6016502406600963,
572
+ "grad_norm": 1.9238523244857788,
573
+ "learning_rate": 0.00012071222791524803,
574
+ "loss": 3.2051,
575
+ "mean_token_accuracy": 0.42236345537304876,
576
+ "num_tokens": 25804800000.0,
577
+ "step": 1260000
578
+ },
579
+ {
580
+ "epoch": 0.6112002444800978,
581
+ "grad_norm": 1.8870298862457275,
582
+ "learning_rate": 0.00011781828652624297,
583
+ "loss": 3.2044,
584
+ "mean_token_accuracy": 0.4224274314776063,
585
+ "num_tokens": 26521600000.0,
586
+ "step": 1280000
587
+ },
588
+ {
589
+ "epoch": 0.6207502483000993,
590
+ "grad_norm": 2.10017991065979,
591
+ "learning_rate": 0.00011492434513723792,
592
+ "loss": 3.2038,
593
+ "mean_token_accuracy": 0.4225288334354758,
594
+ "num_tokens": 27238400000.0,
595
+ "step": 1300000
596
+ },
597
+ {
598
+ "epoch": 0.6303002521201009,
599
+ "grad_norm": 2.2103898525238037,
600
+ "learning_rate": 0.00011203040374823287,
601
+ "loss": 3.2006,
602
+ "mean_token_accuracy": 0.42296220604628326,
603
+ "num_tokens": 27955200000.0,
604
+ "step": 1320000
605
+ },
606
+ {
607
+ "epoch": 0.6398502559401024,
608
+ "grad_norm": 2.1420845985412598,
609
+ "learning_rate": 0.00010913646235922783,
610
+ "loss": 3.1995,
611
+ "mean_token_accuracy": 0.42305554908663034,
612
+ "num_tokens": 28672000000.0,
613
+ "step": 1340000
614
+ },
615
+ {
616
+ "epoch": 0.649400259760104,
617
+ "grad_norm": 2.468029737472534,
618
+ "learning_rate": 0.00010624252097022279,
619
+ "loss": 3.1994,
620
+ "mean_token_accuracy": 0.4230162718191743,
621
+ "num_tokens": 29388800000.0,
622
+ "step": 1360000
623
+ },
624
+ {
625
+ "epoch": 0.6589502635801054,
626
+ "grad_norm": 2.0718960762023926,
627
+ "learning_rate": 0.00010334857958121774,
628
+ "loss": 3.1964,
629
+ "mean_token_accuracy": 0.4234230771496892,
630
+ "num_tokens": 30105600000.0,
631
+ "step": 1380000
632
+ },
633
+ {
634
+ "epoch": 0.6685002674001069,
635
+ "grad_norm": 2.7991926670074463,
636
+ "learning_rate": 0.00010045463819221268,
637
+ "loss": 3.1943,
638
+ "mean_token_accuracy": 0.4237000042423606,
639
+ "num_tokens": 30822400000.0,
640
+ "step": 1400000
641
+ },
642
+ {
643
+ "epoch": 0.6780502712201085,
644
+ "grad_norm": 2.4362847805023193,
645
+ "learning_rate": 9.756069680320764e-05,
646
+ "loss": 3.1922,
647
+ "mean_token_accuracy": 0.4239600421443582,
648
+ "num_tokens": 31539200000.0,
649
+ "step": 1420000
650
+ },
651
+ {
652
+ "epoch": 0.68760027504011,
653
+ "grad_norm": 2.591127872467041,
654
+ "learning_rate": 9.466675541420259e-05,
655
+ "loss": 3.191,
656
+ "mean_token_accuracy": 0.4240850882053375,
657
+ "num_tokens": 32256000000.0,
658
+ "step": 1440000
659
+ },
660
+ {
661
+ "epoch": 0.6971502788601115,
662
+ "grad_norm": 2.234511375427246,
663
+ "learning_rate": 9.177281402519754e-05,
664
+ "loss": 3.1897,
665
+ "mean_token_accuracy": 0.4242406051442027,
666
+ "num_tokens": 32972800000.0,
667
+ "step": 1460000
668
+ },
669
+ {
670
+ "epoch": 0.7067002826801131,
671
+ "grad_norm": 2.165247917175293,
672
+ "learning_rate": 8.887887263619248e-05,
673
+ "loss": 3.1876,
674
+ "mean_token_accuracy": 0.42447070305049417,
675
+ "num_tokens": 33689600000.0,
676
+ "step": 1480000
677
+ },
678
+ {
679
+ "epoch": 0.7162502865001146,
680
+ "grad_norm": 2.373894453048706,
681
+ "learning_rate": 8.598493124718745e-05,
682
+ "loss": 3.1858,
683
+ "mean_token_accuracy": 0.4246869832932949,
684
+ "num_tokens": 34406400000.0,
685
+ "step": 1500000
686
+ },
687
+ {
688
+ "epoch": 0.7258002903201162,
689
+ "grad_norm": 2.232844352722168,
690
+ "learning_rate": 8.309098985818239e-05,
691
+ "loss": 3.1842,
692
+ "mean_token_accuracy": 0.424876312276721,
693
+ "num_tokens": 35123200000.0,
694
+ "step": 1520000
695
+ },
696
+ {
697
+ "epoch": 0.7353502941401177,
698
+ "grad_norm": 2.4994523525238037,
699
+ "learning_rate": 8.019704846917734e-05,
700
+ "loss": 3.1827,
701
+ "mean_token_accuracy": 0.42511233622431754,
702
+ "num_tokens": 35840000000.0,
703
+ "step": 1540000
704
+ },
705
+ {
706
+ "epoch": 0.7449002979601191,
707
+ "grad_norm": 2.4633235931396484,
708
+ "learning_rate": 7.73031070801723e-05,
709
+ "loss": 3.1798,
710
+ "mean_token_accuracy": 0.4254420336738229,
711
+ "num_tokens": 36556800000.0,
712
+ "step": 1560000
713
+ },
714
+ {
715
+ "epoch": 0.7544503017801207,
716
+ "grad_norm": 2.386373281478882,
717
+ "learning_rate": 7.440916569116725e-05,
718
+ "loss": 3.1786,
719
+ "mean_token_accuracy": 0.4255216441363096,
720
+ "num_tokens": 37273600000.0,
721
+ "step": 1580000
722
+ },
723
+ {
724
+ "epoch": 0.7640003056001222,
725
+ "grad_norm": 2.4472737312316895,
726
+ "learning_rate": 7.151522430216221e-05,
727
+ "loss": 3.1767,
728
+ "mean_token_accuracy": 0.4258053430899978,
729
+ "num_tokens": 37990400000.0,
730
+ "step": 1600000
731
+ },
732
+ {
733
+ "epoch": 0.7735503094201238,
734
+ "grad_norm": 2.5622825622558594,
735
+ "learning_rate": 6.862128291315715e-05,
736
+ "loss": 3.1746,
737
+ "mean_token_accuracy": 0.4260344804123044,
738
+ "num_tokens": 38707200000.0,
739
+ "step": 1620000
740
+ },
741
+ {
742
+ "epoch": 0.7831003132401253,
743
+ "grad_norm": 2.4821906089782715,
744
+ "learning_rate": 6.57273415241521e-05,
745
+ "loss": 3.1723,
746
+ "mean_token_accuracy": 0.4263139396473765,
747
+ "num_tokens": 39424000000.0,
748
+ "step": 1640000
749
+ },
750
+ {
751
+ "epoch": 0.7926503170601268,
752
+ "grad_norm": 2.5140678882598877,
753
+ "learning_rate": 6.283340013514706e-05,
754
+ "loss": 3.1701,
755
+ "mean_token_accuracy": 0.42658785569369795,
756
+ "num_tokens": 40140800000.0,
757
+ "step": 1660000
758
+ },
759
+ {
760
+ "epoch": 0.8022003208801284,
761
+ "grad_norm": 2.634791851043701,
762
+ "learning_rate": 5.9939458746142e-05,
763
+ "loss": 3.1678,
764
+ "mean_token_accuracy": 0.42689967503100634,
765
+ "num_tokens": 40857600000.0,
766
+ "step": 1680000
767
+ },
768
+ {
769
+ "epoch": 0.8117503247001299,
770
+ "grad_norm": 3.0459752082824707,
771
+ "learning_rate": 5.704551735713696e-05,
772
+ "loss": 3.1672,
773
+ "mean_token_accuracy": 0.42699285422861577,
774
+ "num_tokens": 41574400000.0,
775
+ "step": 1700000
776
+ },
777
+ {
778
+ "epoch": 0.8213003285201315,
779
+ "grad_norm": 2.686730146408081,
780
+ "learning_rate": 5.4151575968131916e-05,
781
+ "loss": 3.1642,
782
+ "mean_token_accuracy": 0.42733070581257343,
783
+ "num_tokens": 42291200000.0,
784
+ "step": 1720000
785
+ },
786
+ {
787
+ "epoch": 0.830850332340133,
788
+ "grad_norm": 2.967567205429077,
789
+ "learning_rate": 5.125763457912686e-05,
790
+ "loss": 3.1618,
791
+ "mean_token_accuracy": 0.42769208399355413,
792
+ "num_tokens": 43008000000.0,
793
+ "step": 1740000
794
+ },
795
+ {
796
+ "epoch": 0.8404003361601344,
797
+ "grad_norm": 2.7259535789489746,
798
+ "learning_rate": 4.836369319012181e-05,
799
+ "loss": 3.1596,
800
+ "mean_token_accuracy": 0.42790325200855733,
801
+ "num_tokens": 43724800000.0,
802
+ "step": 1760000
803
+ },
804
+ {
805
+ "epoch": 0.849950339980136,
806
+ "grad_norm": 2.709784746170044,
807
+ "learning_rate": 4.546975180111677e-05,
808
+ "loss": 3.1567,
809
+ "mean_token_accuracy": 0.42829815539866684,
810
+ "num_tokens": 44441600000.0,
811
+ "step": 1780000
812
+ },
813
+ {
814
+ "epoch": 0.8595003438001375,
815
+ "grad_norm": 3.054241418838501,
816
+ "learning_rate": 4.2575810412111724e-05,
817
+ "loss": 3.1551,
818
+ "mean_token_accuracy": 0.42849865667670967,
819
+ "num_tokens": 45158400000.0,
820
+ "step": 1800000
821
+ },
822
+ {
823
+ "epoch": 0.869050347620139,
824
+ "grad_norm": 2.976240873336792,
825
+ "learning_rate": 3.968186902310667e-05,
826
+ "loss": 3.1519,
827
+ "mean_token_accuracy": 0.42887963571995497,
828
+ "num_tokens": 45875200000.0,
829
+ "step": 1820000
830
+ },
831
+ {
832
+ "epoch": 0.8786003514401406,
833
+ "grad_norm": 2.723175525665283,
834
+ "learning_rate": 3.678792763410162e-05,
835
+ "loss": 3.1498,
836
+ "mean_token_accuracy": 0.42917205161750316,
837
+ "num_tokens": 46592000000.0,
838
+ "step": 1840000
839
+ },
840
+ {
841
+ "epoch": 0.8881503552601421,
842
+ "grad_norm": 2.917259693145752,
843
+ "learning_rate": 3.389398624509658e-05,
844
+ "loss": 3.146,
845
+ "mean_token_accuracy": 0.42966016797572376,
846
+ "num_tokens": 47308800000.0,
847
+ "step": 1860000
848
+ },
849
+ {
850
+ "epoch": 0.8977003590801437,
851
+ "grad_norm": 3.254523515701294,
852
+ "learning_rate": 3.1000044856091526e-05,
853
+ "loss": 3.1442,
854
+ "mean_token_accuracy": 0.42988122535943984,
855
+ "num_tokens": 48025600000.0,
856
+ "step": 1880000
857
+ },
858
+ {
859
+ "epoch": 0.9072503629001452,
860
+ "grad_norm": 2.9410157203674316,
861
+ "learning_rate": 2.810610346708648e-05,
862
+ "loss": 3.1408,
863
+ "mean_token_accuracy": 0.43033207250982525,
864
+ "num_tokens": 48742400000.0,
865
+ "step": 1900000
866
+ },
867
+ {
868
+ "epoch": 0.9168003667201466,
869
+ "grad_norm": 3.0534310340881348,
870
+ "learning_rate": 2.521216207808143e-05,
871
+ "loss": 3.1383,
872
+ "mean_token_accuracy": 0.43062722102552653,
873
+ "num_tokens": 49459200000.0,
874
+ "step": 1920000
875
+ },
876
+ {
877
+ "epoch": 0.9263503705401482,
878
+ "grad_norm": 3.0121352672576904,
879
+ "learning_rate": 2.2318220689076384e-05,
880
+ "loss": 3.1352,
881
+ "mean_token_accuracy": 0.43105802639722823,
882
+ "num_tokens": 50176000000.0,
883
+ "step": 1940000
884
+ },
885
+ {
886
+ "epoch": 0.9359003743601497,
887
+ "grad_norm": 3.4966187477111816,
888
+ "learning_rate": 1.9424279300071334e-05,
889
+ "loss": 3.1317,
890
+ "mean_token_accuracy": 0.43149784843176603,
891
+ "num_tokens": 50892800000.0,
892
+ "step": 1960000
893
+ },
894
+ {
895
+ "epoch": 0.9454503781801513,
896
+ "grad_norm": 3.3833205699920654,
897
+ "learning_rate": 1.6530337911066284e-05,
898
+ "loss": 3.1285,
899
+ "mean_token_accuracy": 0.4319396187588572,
900
+ "num_tokens": 51609600000.0,
901
+ "step": 1980000
902
+ },
903
+ {
904
+ "epoch": 0.9550003820001528,
905
+ "grad_norm": 3.30391001701355,
906
+ "learning_rate": 1.3636396522061238e-05,
907
+ "loss": 3.1248,
908
+ "mean_token_accuracy": 0.43242690176963805,
909
+ "num_tokens": 52326400000.0,
910
+ "step": 2000000
911
+ },
912
+ {
913
+ "epoch": 0.9645503858201543,
914
+ "grad_norm": 3.326011896133423,
915
+ "learning_rate": 1.074245513305619e-05,
916
+ "loss": 3.1213,
917
+ "mean_token_accuracy": 0.4328598498493433,
918
+ "num_tokens": 53043200000.0,
919
+ "step": 2020000
920
+ },
921
+ {
922
+ "epoch": 0.9741003896401559,
923
+ "grad_norm": 3.2196035385131836,
924
+ "learning_rate": 7.84851374405114e-06,
925
+ "loss": 3.1176,
926
+ "mean_token_accuracy": 0.4334187435388565,
927
+ "num_tokens": 53760000000.0,
928
+ "step": 2040000
929
+ },
930
+ {
931
+ "epoch": 0.9836503934601574,
932
+ "grad_norm": 3.748178720474243,
933
+ "learning_rate": 4.9545723550460936e-06,
934
+ "loss": 3.1139,
935
+ "mean_token_accuracy": 0.43390719720572235,
936
+ "num_tokens": 54476800000.0,
937
+ "step": 2060000
938
+ },
939
+ {
940
+ "epoch": 0.993200397280159,
941
+ "grad_norm": 3.283731460571289,
942
+ "learning_rate": 2.0606309660410444e-06,
943
+ "loss": 3.1103,
944
+ "mean_token_accuracy": 0.4344122279688716,
945
+ "num_tokens": 55193600000.0,
946
+ "step": 2080000
947
+ }
948
+ ],
949
+ "logging_steps": 20000,
950
+ "max_steps": 2094240,
951
+ "num_input_tokens_seen": 0,
952
+ "num_train_epochs": 1,
953
+ "save_steps": 20000,
954
+ "stateful_callbacks": {
955
+ "TrainerControl": {
956
+ "args": {
957
+ "should_epoch_stop": false,
958
+ "should_evaluate": false,
959
+ "should_log": false,
960
+ "should_save": true,
961
+ "should_training_stop": true
962
+ },
963
+ "attributes": {}
964
+ }
965
+ },
966
+ "total_flos": 1.1815907720024556e+18,
967
+ "train_batch_size": 140,
968
+ "trial_name": null,
969
+ "trial_params": null
970
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03002a33270612970a466cd87cc6bbd62c78f0edf6229027999213de824ba2b6
3
+ size 6161