ChiefTheLord commited on
Commit
c118d96
·
verified ·
1 Parent(s): 90d856a

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -37,3 +37,4 @@ checkpoints/checkpoint-4096/eval_state.json filter=lfs diff=lfs merge=lfs -text
37
  checkpoints-v2/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
38
  checkpoints-v2.1/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
39
  checkpoints-v2.2/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
37
  checkpoints-v2/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
38
  checkpoints-v2.1/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
39
  checkpoints-v2.2/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoints-v2.3/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v2.3/checkpoint-12288/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c4f134b68139a8d9692400b30a578d25ce628b68d1ccb63cdfe9f0629ffdd3e
3
+ size 44107313
checkpoints-v2.3/checkpoint-12288/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:200359ef2048194a9355317b1695f656f9d71cb2c1c615c78444d90674bca430
3
+ size 37402680
checkpoints-v2.3/checkpoint-12288/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:936ee6981444997ba9bbef30a4a344c16273ec6e70a0245baf75df9c7250fdd9
3
+ size 512267
checkpoints-v2.3/checkpoint-12288/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f36e6c28b77555b6da6de84681647b558ac8ebc553a1b458e45112e416a213c
3
+ size 14645
checkpoints-v2.3/checkpoint-12288/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd87ecaac101c5fe8b1b0f15753a689841cfa96e9f0b15e5cdbaf87b7829ac89
3
+ size 1383
checkpoints-v2.3/checkpoint-12288/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:220b4c233a19883d884d3d8ad9c7134f99ecd7f2b9282b526b3c0fd015c1a12c
3
+ size 1465
checkpoints-v2.3/checkpoint-12288/trainer_state.json ADDED
@@ -0,0 +1,838 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.5675488430095608,
6
+ "eval_steps": 1024,
7
+ "global_step": 12288,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.011823934229365849,
14
+ "grad_norm": 1.1381809711456299,
15
+ "learning_rate": 1.9615384615384617e-05,
16
+ "loss": 10.3904,
17
+ "step": 256
18
+ },
19
+ {
20
+ "epoch": 0.023647868458731697,
21
+ "grad_norm": 1.2398728132247925,
22
+ "learning_rate": 3.930769230769231e-05,
23
+ "loss": 7.9162,
24
+ "step": 512
25
+ },
26
+ {
27
+ "epoch": 0.03547180268809755,
28
+ "grad_norm": 1.1707866191864014,
29
+ "learning_rate": 4.999617095521894e-05,
30
+ "loss": 5.6793,
31
+ "step": 768
32
+ },
33
+ {
34
+ "epoch": 0.047295736917463395,
35
+ "grad_norm": 0.7125558853149414,
36
+ "learning_rate": 4.9961092368776736e-05,
37
+ "loss": 3.8181,
38
+ "step": 1024
39
+ },
40
+ {
41
+ "epoch": 0.047295736917463395,
42
+ "eval_acr_loss": 0.9955622624588883,
43
+ "eval_across_var": 0.0022213522599452947,
44
+ "eval_bleu": 0.5783933701616512,
45
+ "eval_ce_loss": 2.3272575776870936,
46
+ "eval_cos_loss": 0.9208345009039526,
47
+ "eval_cov": 0.0631596116714826,
48
+ "eval_cov_loss": 0.006449785340815511,
49
+ "eval_global_kurtosis": 3.5549826104891356,
50
+ "eval_global_mean": -0.0015338476934389437,
51
+ "eval_global_var": 0.2108087409032534,
52
+ "eval_loss": 2.798912787001971,
53
+ "eval_mse_loss": 1.9040727988225685,
54
+ "eval_per_var": 0.2048030487478596,
55
+ "eval_within_var": 0.2086275832633994,
56
+ "step": 1024
57
+ },
58
+ {
59
+ "epoch": 0.047295736917463395,
60
+ "eval_acr_loss": 0.9955622624588883,
61
+ "eval_across_var": 0.0022213522599452947,
62
+ "eval_bleu": 0.5783933701616512,
63
+ "eval_ce_loss": 2.3272575776870936,
64
+ "eval_cos_loss": 0.9208345009039526,
65
+ "eval_cov": 0.0631596116714826,
66
+ "eval_cov_loss": 0.006449785340815511,
67
+ "eval_global_kurtosis": 3.5549826104891356,
68
+ "eval_global_mean": -0.0015338476934389437,
69
+ "eval_global_var": 0.2108087409032534,
70
+ "eval_loss": 2.798912787001971,
71
+ "eval_mse_loss": 1.9040727988225685,
72
+ "eval_per_var": 0.2048030487478596,
73
+ "eval_runtime": 160.1579,
74
+ "eval_samples_per_second": 174.784,
75
+ "eval_steps_per_second": 2.735,
76
+ "eval_within_var": 0.2086275832633994,
77
+ "step": 1024
78
+ },
79
+ {
80
+ "epoch": 0.05911967114682925,
81
+ "grad_norm": 0.39322802424430847,
82
+ "learning_rate": 4.988941132556799e-05,
83
+ "loss": 2.7155,
84
+ "step": 1280
85
+ },
86
+ {
87
+ "epoch": 0.0709436053761951,
88
+ "grad_norm": 0.3898029327392578,
89
+ "learning_rate": 4.9781232937269974e-05,
90
+ "loss": 2.1064,
91
+ "step": 1536
92
+ },
93
+ {
94
+ "epoch": 0.08276753960556095,
95
+ "grad_norm": 0.2939172387123108,
96
+ "learning_rate": 4.963671583455164e-05,
97
+ "loss": 1.7128,
98
+ "step": 1792
99
+ },
100
+ {
101
+ "epoch": 0.09459147383492679,
102
+ "grad_norm": 0.27661314606666565,
103
+ "learning_rate": 4.945607193446079e-05,
104
+ "loss": 1.4405,
105
+ "step": 2048
106
+ },
107
+ {
108
+ "epoch": 0.09459147383492679,
109
+ "eval_acr_loss": 0.9952529187071814,
110
+ "eval_across_var": 0.0023763778212280372,
111
+ "eval_bleu": 0.8110985901711817,
112
+ "eval_ce_loss": 0.6872511814990544,
113
+ "eval_cos_loss": 0.7538281038199386,
114
+ "eval_cov": 0.06513830733625856,
115
+ "eval_cov_loss": 0.006921744725739235,
116
+ "eval_global_kurtosis": 3.977810041000854,
117
+ "eval_global_mean": -0.0010168063858328345,
118
+ "eval_global_var": 0.2243758249500571,
119
+ "eval_loss": 1.0947843177677834,
120
+ "eval_mse_loss": 1.6233446619826364,
121
+ "eval_per_var": 0.21794232930222604,
122
+ "eval_within_var": 0.22208662303887547,
123
+ "step": 2048
124
+ },
125
+ {
126
+ "epoch": 0.09459147383492679,
127
+ "eval_acr_loss": 0.9952529187071814,
128
+ "eval_across_var": 0.0023763778212280372,
129
+ "eval_bleu": 0.8110985901711817,
130
+ "eval_ce_loss": 0.6872511814990544,
131
+ "eval_cos_loss": 0.7538281038199386,
132
+ "eval_cov": 0.06513830733625856,
133
+ "eval_cov_loss": 0.006921744725739235,
134
+ "eval_global_kurtosis": 3.977810041000854,
135
+ "eval_global_mean": -0.0010168063858328345,
136
+ "eval_global_var": 0.2243758249500571,
137
+ "eval_loss": 1.0947843177677834,
138
+ "eval_mse_loss": 1.6233446619826364,
139
+ "eval_per_var": 0.21794232930222604,
140
+ "eval_runtime": 155.3139,
141
+ "eval_samples_per_second": 180.235,
142
+ "eval_steps_per_second": 2.82,
143
+ "eval_within_var": 0.22208662303887547,
144
+ "step": 2048
145
+ },
146
+ {
147
+ "epoch": 0.10641540806429264,
148
+ "grad_norm": 0.19573667645454407,
149
+ "learning_rate": 4.923956612967301e-05,
150
+ "loss": 1.2426,
151
+ "step": 2304
152
+ },
153
+ {
154
+ "epoch": 0.1182393422936585,
155
+ "grad_norm": 0.19026412069797516,
156
+ "learning_rate": 4.898751590005826e-05,
157
+ "loss": 1.0857,
158
+ "step": 2560
159
+ },
160
+ {
161
+ "epoch": 0.13006327652302435,
162
+ "grad_norm": 0.17090244591236115,
163
+ "learning_rate": 4.870029084713462e-05,
164
+ "loss": 0.9634,
165
+ "step": 2816
166
+ },
167
+ {
168
+ "epoch": 0.1418872107523902,
169
+ "grad_norm": 0.15054035186767578,
170
+ "learning_rate": 4.837831215209188e-05,
171
+ "loss": 0.866,
172
+ "step": 3072
173
+ },
174
+ {
175
+ "epoch": 0.1418872107523902,
176
+ "eval_acr_loss": 0.9946045773486568,
177
+ "eval_across_var": 0.002701379077032753,
178
+ "eval_bleu": 0.8988671192714057,
179
+ "eval_ce_loss": 0.32117312840402945,
180
+ "eval_cos_loss": 0.5974249733637457,
181
+ "eval_cov": 0.06391607136486872,
182
+ "eval_cov_loss": 0.006677505914249445,
183
+ "eval_global_kurtosis": 4.353016280692462,
184
+ "eval_global_mean": 0.0002520815150378502,
185
+ "eval_global_var": 0.2451829605450913,
186
+ "eval_loss": 0.6672432675753555,
187
+ "eval_mse_loss": 1.3342886499073952,
188
+ "eval_per_var": 0.23811486649186644,
189
+ "eval_within_var": 0.2426148920094586,
190
+ "step": 3072
191
+ },
192
+ {
193
+ "epoch": 0.1418872107523902,
194
+ "eval_acr_loss": 0.9946045773486568,
195
+ "eval_across_var": 0.002701379077032753,
196
+ "eval_bleu": 0.8988671192714057,
197
+ "eval_ce_loss": 0.32117312840402945,
198
+ "eval_cos_loss": 0.5974249733637457,
199
+ "eval_cov": 0.06391607136486872,
200
+ "eval_cov_loss": 0.006677505914249445,
201
+ "eval_global_kurtosis": 4.353016280692462,
202
+ "eval_global_mean": 0.0002520815150378502,
203
+ "eval_global_var": 0.2451829605450913,
204
+ "eval_loss": 0.6672432675753555,
205
+ "eval_mse_loss": 1.3342886499073952,
206
+ "eval_per_var": 0.23811486649186644,
207
+ "eval_runtime": 155.4012,
208
+ "eval_samples_per_second": 180.134,
209
+ "eval_steps_per_second": 2.819,
210
+ "eval_within_var": 0.2426148920094586,
211
+ "step": 3072
212
+ },
213
+ {
214
+ "epoch": 0.15371114498175603,
215
+ "grad_norm": 0.1389162540435791,
216
+ "learning_rate": 4.802205195817963e-05,
217
+ "loss": 0.7877,
218
+ "step": 3328
219
+ },
220
+ {
221
+ "epoch": 0.1655350792111219,
222
+ "grad_norm": 0.13399961590766907,
223
+ "learning_rate": 4.763203267836576e-05,
224
+ "loss": 0.7208,
225
+ "step": 3584
226
+ },
227
+ {
228
+ "epoch": 0.17735901344048774,
229
+ "grad_norm": 0.11527423560619354,
230
+ "learning_rate": 4.720882622928019e-05,
231
+ "loss": 0.6654,
232
+ "step": 3840
233
+ },
234
+ {
235
+ "epoch": 0.18918294766985358,
236
+ "grad_norm": 0.11336533725261688,
237
+ "learning_rate": 4.675305319256765e-05,
238
+ "loss": 0.6197,
239
+ "step": 4096
240
+ },
241
+ {
242
+ "epoch": 0.18918294766985358,
243
+ "eval_acr_loss": 0.9938907430052213,
244
+ "eval_across_var": 0.0030593323472410984,
245
+ "eval_bleu": 0.9366627287855721,
246
+ "eval_ce_loss": 0.1859381996333327,
247
+ "eval_cos_loss": 0.4787035355829213,
248
+ "eval_cov": 0.0628127006635274,
249
+ "eval_cov_loss": 0.006437324723186286,
250
+ "eval_global_kurtosis": 4.670551087758312,
251
+ "eval_global_mean": -0.0007838325141227408,
252
+ "eval_global_var": 0.2651386696454053,
253
+ "eval_loss": 0.48502123158544164,
254
+ "eval_mse_loss": 1.1087831166236912,
255
+ "eval_per_var": 0.257416468232734,
256
+ "eval_within_var": 0.2622626631363342,
257
+ "step": 4096
258
+ },
259
+ {
260
+ "epoch": 0.18918294766985358,
261
+ "eval_acr_loss": 0.9938907430052213,
262
+ "eval_across_var": 0.0030593323472410984,
263
+ "eval_bleu": 0.9366627287855721,
264
+ "eval_ce_loss": 0.1859381996333327,
265
+ "eval_cos_loss": 0.4787035355829213,
266
+ "eval_cov": 0.0628127006635274,
267
+ "eval_cov_loss": 0.006437324723186286,
268
+ "eval_global_kurtosis": 4.670551087758312,
269
+ "eval_global_mean": -0.0007838325141227408,
270
+ "eval_global_var": 0.2651386696454053,
271
+ "eval_loss": 0.48502123158544164,
272
+ "eval_mse_loss": 1.1087831166236912,
273
+ "eval_per_var": 0.257416468232734,
274
+ "eval_runtime": 156.5211,
275
+ "eval_samples_per_second": 178.845,
276
+ "eval_steps_per_second": 2.798,
277
+ "eval_within_var": 0.2622626631363342,
278
+ "step": 4096
279
+ },
280
+ {
281
+ "epoch": 0.20100688189921945,
282
+ "grad_norm": 0.11432790011167526,
283
+ "learning_rate": 4.6265381904878854e-05,
284
+ "loss": 0.5778,
285
+ "step": 4352
286
+ },
287
+ {
288
+ "epoch": 0.2128308161285853,
289
+ "grad_norm": 0.1015966460108757,
290
+ "learning_rate": 4.57465274778347e-05,
291
+ "loss": 0.5464,
292
+ "step": 4608
293
+ },
294
+ {
295
+ "epoch": 0.22465475035795113,
296
+ "grad_norm": 0.0999189242720604,
297
+ "learning_rate": 4.519725074940068e-05,
298
+ "loss": 0.5131,
299
+ "step": 4864
300
+ },
301
+ {
302
+ "epoch": 0.236478684587317,
303
+ "grad_norm": 0.09335104376077652,
304
+ "learning_rate": 4.461835716820895e-05,
305
+ "loss": 0.4865,
306
+ "step": 5120
307
+ },
308
+ {
309
+ "epoch": 0.236478684587317,
310
+ "eval_acr_loss": 0.993116364903646,
311
+ "eval_across_var": 0.0034477898343912745,
312
+ "eval_bleu": 0.9567762367252449,
313
+ "eval_ce_loss": 0.12156322401136024,
314
+ "eval_cos_loss": 0.39367829685069655,
315
+ "eval_cov": 0.06181878912938784,
316
+ "eval_cov_loss": 0.006211797137995629,
317
+ "eval_global_kurtosis": 4.93804465472426,
318
+ "eval_global_mean": -0.00010290959654333384,
319
+ "eval_global_var": 0.2838530518692922,
320
+ "eval_loss": 0.3869279193687657,
321
+ "eval_mse_loss": 0.946567648213748,
322
+ "eval_per_var": 0.2755494836258562,
323
+ "eval_within_var": 0.28062804267831043,
324
+ "step": 5120
325
+ },
326
+ {
327
+ "epoch": 0.236478684587317,
328
+ "eval_acr_loss": 0.993116364903646,
329
+ "eval_across_var": 0.0034477898343912745,
330
+ "eval_bleu": 0.9567762367252449,
331
+ "eval_ce_loss": 0.12156322401136024,
332
+ "eval_cos_loss": 0.39367829685069655,
333
+ "eval_cov": 0.06181878912938784,
334
+ "eval_cov_loss": 0.006211797137995629,
335
+ "eval_global_kurtosis": 4.93804465472426,
336
+ "eval_global_mean": -0.00010290959654333384,
337
+ "eval_global_var": 0.2838530518692922,
338
+ "eval_loss": 0.3869279193687657,
339
+ "eval_mse_loss": 0.946567648213748,
340
+ "eval_per_var": 0.2755494836258562,
341
+ "eval_runtime": 155.284,
342
+ "eval_samples_per_second": 180.27,
343
+ "eval_steps_per_second": 2.821,
344
+ "eval_within_var": 0.28062804267831043,
345
+ "step": 5120
346
+ },
347
+ {
348
+ "epoch": 0.24830261881668284,
349
+ "grad_norm": 0.0941869243979454,
350
+ "learning_rate": 4.401069561246422e-05,
351
+ "loss": 0.4632,
352
+ "step": 5376
353
+ },
354
+ {
355
+ "epoch": 0.2601265530460487,
356
+ "grad_norm": 0.09946911782026291,
357
+ "learning_rate": 4.337515714516545e-05,
358
+ "loss": 0.4419,
359
+ "step": 5632
360
+ },
361
+ {
362
+ "epoch": 0.27195048727541454,
363
+ "grad_norm": 0.1010931134223938,
364
+ "learning_rate": 4.2712673707468434e-05,
365
+ "loss": 0.4267,
366
+ "step": 5888
367
+ },
368
+ {
369
+ "epoch": 0.2837744215047804,
370
+ "grad_norm": 0.08404899388551712,
371
+ "learning_rate": 4.202421675210565e-05,
372
+ "loss": 0.4103,
373
+ "step": 6144
374
+ },
375
+ {
376
+ "epoch": 0.2837744215047804,
377
+ "eval_acr_loss": 0.9921849604066648,
378
+ "eval_across_var": 0.003915222487135973,
379
+ "eval_bleu": 0.968595516462779,
380
+ "eval_ce_loss": 0.08580248714445933,
381
+ "eval_cos_loss": 0.3347833108956411,
382
+ "eval_cov": 0.06079366326876427,
383
+ "eval_cov_loss": 0.006003422991130246,
384
+ "eval_global_kurtosis": 5.130746393987577,
385
+ "eval_global_mean": -5.609749659011353e-05,
386
+ "eval_global_var": 0.3024344945062785,
387
+ "eval_loss": 0.3278743654625601,
388
+ "eval_mse_loss": 0.836360767143502,
389
+ "eval_per_var": 0.2935495594320776,
390
+ "eval_within_var": 0.2987868603506045,
391
+ "step": 6144
392
+ },
393
+ {
394
+ "epoch": 0.2837744215047804,
395
+ "eval_acr_loss": 0.9921849604066648,
396
+ "eval_across_var": 0.003915222487135973,
397
+ "eval_bleu": 0.968595516462779,
398
+ "eval_ce_loss": 0.08580248714445933,
399
+ "eval_cos_loss": 0.3347833108956411,
400
+ "eval_cov": 0.06079366326876427,
401
+ "eval_cov_loss": 0.006003422991130246,
402
+ "eval_global_kurtosis": 5.130746393987577,
403
+ "eval_global_mean": -5.609749659011353e-05,
404
+ "eval_global_var": 0.3024344945062785,
405
+ "eval_loss": 0.3278743654625601,
406
+ "eval_mse_loss": 0.836360767143502,
407
+ "eval_per_var": 0.2935495594320776,
408
+ "eval_runtime": 154.5518,
409
+ "eval_samples_per_second": 181.124,
410
+ "eval_steps_per_second": 2.834,
411
+ "eval_within_var": 0.2987868603506045,
412
+ "step": 6144
413
+ },
414
+ {
415
+ "epoch": 0.2955983557341462,
416
+ "grad_norm": 0.08095081150531769,
417
+ "learning_rate": 4.131079581886694e-05,
418
+ "loss": 0.393,
419
+ "step": 6400
420
+ },
421
+ {
422
+ "epoch": 0.30742228996351206,
423
+ "grad_norm": 0.08812420070171356,
424
+ "learning_rate": 4.057345705423016e-05,
425
+ "loss": 0.3806,
426
+ "step": 6656
427
+ },
428
+ {
429
+ "epoch": 0.3192462241928779,
430
+ "grad_norm": 0.08378447592258453,
431
+ "learning_rate": 3.981328167731251e-05,
432
+ "loss": 0.3703,
433
+ "step": 6912
434
+ },
435
+ {
436
+ "epoch": 0.3310701584222438,
437
+ "grad_norm": 0.0990639477968216,
438
+ "learning_rate": 3.9031384394391954e-05,
439
+ "loss": 0.3564,
440
+ "step": 7168
441
+ },
442
+ {
443
+ "epoch": 0.3310701584222438,
444
+ "eval_acr_loss": 0.9909341404699299,
445
+ "eval_across_var": 0.004543303069233316,
446
+ "eval_bleu": 0.9765803459841506,
447
+ "eval_ce_loss": 0.06368093232551938,
448
+ "eval_cos_loss": 0.2937984631894386,
449
+ "eval_cov": 0.059937150511023114,
450
+ "eval_cov_loss": 0.005822429295255033,
451
+ "eval_global_kurtosis": 5.257898571284394,
452
+ "eval_global_mean": -0.0003388445126955912,
453
+ "eval_global_var": 0.3208498323344749,
454
+ "eval_loss": 0.28962595475046604,
455
+ "eval_mse_loss": 0.762596405532262,
456
+ "eval_per_var": 0.311440385095605,
457
+ "eval_within_var": 0.31661272981123295,
458
+ "step": 7168
459
+ },
460
+ {
461
+ "epoch": 0.3310701584222438,
462
+ "eval_acr_loss": 0.9909341404699299,
463
+ "eval_across_var": 0.004543303069233316,
464
+ "eval_bleu": 0.9765803459841506,
465
+ "eval_ce_loss": 0.06368093232551938,
466
+ "eval_cos_loss": 0.2937984631894386,
467
+ "eval_cov": 0.059937150511023114,
468
+ "eval_cov_loss": 0.005822429295255033,
469
+ "eval_global_kurtosis": 5.257898571284394,
470
+ "eval_global_mean": -0.0003388445126955912,
471
+ "eval_global_var": 0.3208498323344749,
472
+ "eval_loss": 0.28962595475046604,
473
+ "eval_mse_loss": 0.762596405532262,
474
+ "eval_per_var": 0.311440385095605,
475
+ "eval_runtime": 152.728,
476
+ "eval_samples_per_second": 183.287,
477
+ "eval_steps_per_second": 2.868,
478
+ "eval_within_var": 0.31661272981123295,
479
+ "step": 7168
480
+ },
481
+ {
482
+ "epoch": 0.34289409265160964,
483
+ "grad_norm": 0.07939422130584717,
484
+ "learning_rate": 3.822891176432382e-05,
485
+ "loss": 0.3491,
486
+ "step": 7424
487
+ },
488
+ {
489
+ "epoch": 0.3547180268809755,
490
+ "grad_norm": 0.0864938348531723,
491
+ "learning_rate": 3.7407040517249335e-05,
492
+ "loss": 0.3399,
493
+ "step": 7680
494
+ },
495
+ {
496
+ "epoch": 0.3665419611103413,
497
+ "grad_norm": 0.08456117659807205,
498
+ "learning_rate": 3.6566975829061614e-05,
499
+ "loss": 0.3307,
500
+ "step": 7936
501
+ },
502
+ {
503
+ "epoch": 0.37836589533970716,
504
+ "grad_norm": 0.06939388811588287,
505
+ "learning_rate": 3.5709949554159355e-05,
506
+ "loss": 0.3222,
507
+ "step": 8192
508
+ },
509
+ {
510
+ "epoch": 0.37836589533970716,
511
+ "eval_acr_loss": 0.9887979859358644,
512
+ "eval_across_var": 0.005616884891232943,
513
+ "eval_bleu": 0.9814701772398943,
514
+ "eval_ce_loss": 0.04945951889877178,
515
+ "eval_cos_loss": 0.2659321248395258,
516
+ "eval_cov": 0.05915525950253282,
517
+ "eval_cov_loss": 0.005664959631766699,
518
+ "eval_global_kurtosis": 5.291959479519221,
519
+ "eval_global_mean": 0.00022150909519631025,
520
+ "eval_global_var": 0.3408019183433219,
521
+ "eval_loss": 0.26449544942134046,
522
+ "eval_mse_loss": 0.7161998211248825,
523
+ "eval_per_var": 0.3309253710045662,
524
+ "eval_within_var": 0.3355388471252842,
525
+ "step": 8192
526
+ },
527
+ {
528
+ "epoch": 0.37836589533970716,
529
+ "eval_acr_loss": 0.9887979859358644,
530
+ "eval_across_var": 0.005616884891232943,
531
+ "eval_bleu": 0.9814701772398943,
532
+ "eval_ce_loss": 0.04945951889877178,
533
+ "eval_cos_loss": 0.2659321248395258,
534
+ "eval_cov": 0.05915525950253282,
535
+ "eval_cov_loss": 0.005664959631766699,
536
+ "eval_global_kurtosis": 5.291959479519221,
537
+ "eval_global_mean": 0.00022150909519631025,
538
+ "eval_global_var": 0.3408019183433219,
539
+ "eval_loss": 0.26449544942134046,
540
+ "eval_mse_loss": 0.7161998211248825,
541
+ "eval_per_var": 0.3309253710045662,
542
+ "eval_runtime": 152.3388,
543
+ "eval_samples_per_second": 183.755,
544
+ "eval_steps_per_second": 2.875,
545
+ "eval_within_var": 0.3355388471252842,
546
+ "step": 8192
547
+ },
548
+ {
549
+ "epoch": 0.390189829569073,
550
+ "grad_norm": 0.07356765121221542,
551
+ "learning_rate": 3.483721841907964e-05,
552
+ "loss": 0.3166,
553
+ "step": 8448
554
+ },
555
+ {
556
+ "epoch": 0.4020137637984389,
557
+ "grad_norm": 0.10812926292419434,
558
+ "learning_rate": 3.395006217965885e-05,
559
+ "loss": 0.3106,
560
+ "step": 8704
561
+ },
562
+ {
563
+ "epoch": 0.41383769802780473,
564
+ "grad_norm": 0.08505494147539139,
565
+ "learning_rate": 3.3049781744423665e-05,
566
+ "loss": 0.3032,
567
+ "step": 8960
568
+ },
569
+ {
570
+ "epoch": 0.4256616322571706,
571
+ "grad_norm": 0.07096228003501892,
572
+ "learning_rate": 3.213769726696439e-05,
573
+ "loss": 0.2986,
574
+ "step": 9216
575
+ },
576
+ {
577
+ "epoch": 0.4256616322571706,
578
+ "eval_acr_loss": 0.9829728993923152,
579
+ "eval_across_var": 0.008550605183832993,
580
+ "eval_bleu": 0.9851257395247456,
581
+ "eval_ce_loss": 0.0397455885402484,
582
+ "eval_cos_loss": 0.24592996208362927,
583
+ "eval_cov": 0.05898399875588613,
584
+ "eval_cov_loss": 0.005632478560929157,
585
+ "eval_global_kurtosis": 5.217721328343431,
586
+ "eval_global_mean": 0.0015772416439230584,
587
+ "eval_global_var": 0.3661223111087329,
588
+ "eval_loss": 0.24666939634982854,
589
+ "eval_mse_loss": 0.6856855016592975,
590
+ "eval_per_var": 0.355949227668379,
591
+ "eval_within_var": 0.3579468384180983,
592
+ "step": 9216
593
+ },
594
+ {
595
+ "epoch": 0.4256616322571706,
596
+ "eval_acr_loss": 0.9829728993923152,
597
+ "eval_across_var": 0.008550605183832993,
598
+ "eval_bleu": 0.9851257395247456,
599
+ "eval_ce_loss": 0.0397455885402484,
600
+ "eval_cos_loss": 0.24592996208362927,
601
+ "eval_cov": 0.05898399875588613,
602
+ "eval_cov_loss": 0.005632478560929157,
603
+ "eval_global_kurtosis": 5.217721328343431,
604
+ "eval_global_mean": 0.0015772416439230584,
605
+ "eval_global_var": 0.3661223111087329,
606
+ "eval_loss": 0.24666939634982854,
607
+ "eval_mse_loss": 0.6856855016592975,
608
+ "eval_per_var": 0.355949227668379,
609
+ "eval_runtime": 150.6898,
610
+ "eval_samples_per_second": 185.766,
611
+ "eval_steps_per_second": 2.907,
612
+ "eval_within_var": 0.3579468384180983,
613
+ "step": 9216
614
+ },
615
+ {
616
+ "epoch": 0.4374855664865364,
617
+ "grad_norm": 0.07909992337226868,
618
+ "learning_rate": 3.121514621008757e-05,
619
+ "loss": 0.294,
620
+ "step": 9472
621
+ },
622
+ {
623
+ "epoch": 0.44930950071590225,
624
+ "grad_norm": 0.10224120318889618,
625
+ "learning_rate": 3.0283481384586697e-05,
626
+ "loss": 0.2906,
627
+ "step": 9728
628
+ },
629
+ {
630
+ "epoch": 0.4611334349452681,
631
+ "grad_norm": 0.07880751043558121,
632
+ "learning_rate": 2.9344068965507027e-05,
633
+ "loss": 0.2855,
634
+ "step": 9984
635
+ },
636
+ {
637
+ "epoch": 0.472957369174634,
638
+ "grad_norm": 0.09268064051866531,
639
+ "learning_rate": 2.839828648881323e-05,
640
+ "loss": 0.2825,
641
+ "step": 10240
642
+ },
643
+ {
644
+ "epoch": 0.472957369174634,
645
+ "eval_acr_loss": 0.9428148437036227,
646
+ "eval_across_var": 0.029026934383734722,
647
+ "eval_bleu": 0.9874667625866159,
648
+ "eval_ce_loss": 0.03316931340409673,
649
+ "eval_cos_loss": 0.23319579296868687,
650
+ "eval_cov": 0.06389001297624144,
651
+ "eval_cov_loss": 0.0071894081860151325,
652
+ "eval_global_kurtosis": 4.941469875100541,
653
+ "eval_global_mean": 0.0006307871102198074,
654
+ "eval_global_var": 0.4374793762485731,
655
+ "eval_loss": 0.23165297341537258,
656
+ "eval_mse_loss": 0.6704898216680849,
657
+ "eval_per_var": 0.4275816477597032,
658
+ "eval_within_var": 0.40892007232528843,
659
+ "step": 10240
660
+ },
661
+ {
662
+ "epoch": 0.472957369174634,
663
+ "eval_acr_loss": 0.9428148437036227,
664
+ "eval_across_var": 0.029026934383734722,
665
+ "eval_bleu": 0.9874667625866159,
666
+ "eval_ce_loss": 0.03316931340409673,
667
+ "eval_cos_loss": 0.23319579296868687,
668
+ "eval_cov": 0.06389001297624144,
669
+ "eval_cov_loss": 0.0071894081860151325,
670
+ "eval_global_kurtosis": 4.941469875100541,
671
+ "eval_global_mean": 0.0006307871102198074,
672
+ "eval_global_var": 0.4374793762485731,
673
+ "eval_loss": 0.23165297341537258,
674
+ "eval_mse_loss": 0.6704898216680849,
675
+ "eval_per_var": 0.4275816477597032,
676
+ "eval_runtime": 151.8849,
677
+ "eval_samples_per_second": 184.304,
678
+ "eval_steps_per_second": 2.884,
679
+ "eval_within_var": 0.40892007232528843,
680
+ "step": 10240
681
+ },
682
+ {
683
+ "epoch": 0.48478130340399983,
684
+ "grad_norm": 0.12065292149782181,
685
+ "learning_rate": 2.7447520831397623e-05,
686
+ "loss": 0.2767,
687
+ "step": 10496
688
+ },
689
+ {
690
+ "epoch": 0.49660523763336567,
691
+ "grad_norm": 0.12913434207439423,
692
+ "learning_rate": 2.6493166177391138e-05,
693
+ "loss": 0.2652,
694
+ "step": 10752
695
+ },
696
+ {
697
+ "epoch": 0.5084291718627315,
698
+ "grad_norm": 0.14671213924884796,
699
+ "learning_rate": 2.5536621973758952e-05,
700
+ "loss": 0.2329,
701
+ "step": 11008
702
+ },
703
+ {
704
+ "epoch": 0.5202531060920974,
705
+ "grad_norm": 0.13414837419986725,
706
+ "learning_rate": 2.4579290878178904e-05,
707
+ "loss": 0.2016,
708
+ "step": 11264
709
+ },
710
+ {
711
+ "epoch": 0.5202531060920974,
712
+ "eval_acr_loss": 0.02229548650542855,
713
+ "eval_across_var": 0.9048832782871647,
714
+ "eval_bleu": 0.987608544196938,
715
+ "eval_ce_loss": 0.031427863691869666,
716
+ "eval_cos_loss": 0.2381418139490907,
717
+ "eval_cov": 0.05971070729434218,
718
+ "eval_cov_loss": 0.007483123698522715,
719
+ "eval_global_kurtosis": 23.39030278872137,
720
+ "eval_global_mean": -0.007102423879109561,
721
+ "eval_global_var": 1.6357934681792237,
722
+ "eval_loss": 0.14085906721889702,
723
+ "eval_mse_loss": 0.7002158846757184,
724
+ "eval_per_var": 1.6518420911815068,
725
+ "eval_within_var": 0.7377716272933298,
726
+ "step": 11264
727
+ },
728
+ {
729
+ "epoch": 0.5202531060920974,
730
+ "eval_acr_loss": 0.02229548650542855,
731
+ "eval_across_var": 0.9048832782871647,
732
+ "eval_bleu": 0.987608544196938,
733
+ "eval_ce_loss": 0.031427863691869666,
734
+ "eval_cos_loss": 0.2381418139490907,
735
+ "eval_cov": 0.05971070729434218,
736
+ "eval_cov_loss": 0.007483123698522715,
737
+ "eval_global_kurtosis": 23.39030278872137,
738
+ "eval_global_mean": -0.007102423879109561,
739
+ "eval_global_var": 1.6357934681792237,
740
+ "eval_loss": 0.14085906721889702,
741
+ "eval_mse_loss": 0.7002158846757184,
742
+ "eval_per_var": 1.6518420911815068,
743
+ "eval_runtime": 152.4278,
744
+ "eval_samples_per_second": 183.648,
745
+ "eval_steps_per_second": 2.873,
746
+ "eval_within_var": 0.7377716272933298,
747
+ "step": 11264
748
+ },
749
+ {
750
+ "epoch": 0.5320770403214632,
751
+ "grad_norm": 0.1328643560409546,
752
+ "learning_rate": 2.362257670221181e-05,
753
+ "loss": 0.1901,
754
+ "step": 11520
755
+ },
756
+ {
757
+ "epoch": 0.5439009745508291,
758
+ "grad_norm": 0.1011626198887825,
759
+ "learning_rate": 2.2667882352779608e-05,
760
+ "loss": 0.1844,
761
+ "step": 11776
762
+ },
763
+ {
764
+ "epoch": 0.5557249087801949,
765
+ "grad_norm": 0.11414045095443726,
766
+ "learning_rate": 2.1720315230424133e-05,
767
+ "loss": 0.18,
768
+ "step": 12032
769
+ },
770
+ {
771
+ "epoch": 0.5675488430095608,
772
+ "grad_norm": 0.1001119315624237,
773
+ "learning_rate": 2.0773833841855016e-05,
774
+ "loss": 0.1774,
775
+ "step": 12288
776
+ },
777
+ {
778
+ "epoch": 0.5675488430095608,
779
+ "eval_acr_loss": 0.016273215680705114,
780
+ "eval_across_var": 0.9544410968207877,
781
+ "eval_bleu": 0.9895506585988,
782
+ "eval_ce_loss": 0.026713626858986678,
783
+ "eval_cos_loss": 0.22355560778074612,
784
+ "eval_cov": 0.055647828263234875,
785
+ "eval_cov_loss": 0.006082957281973468,
786
+ "eval_global_kurtosis": 32.18490342353577,
787
+ "eval_global_mean": -0.013440034569126286,
788
+ "eval_global_var": 1.8210059039668949,
789
+ "eval_loss": 0.1298266947609649,
790
+ "eval_mse_loss": 0.6762152809530633,
791
+ "eval_per_var": 1.8439484339326484,
792
+ "eval_within_var": 0.8750175644545795,
793
+ "step": 12288
794
+ },
795
+ {
796
+ "epoch": 0.5675488430095608,
797
+ "eval_acr_loss": 0.016273215680705114,
798
+ "eval_across_var": 0.9544410968207877,
799
+ "eval_bleu": 0.9895506585988,
800
+ "eval_ce_loss": 0.026713626858986678,
801
+ "eval_cos_loss": 0.22355560778074612,
802
+ "eval_cov": 0.055647828263234875,
803
+ "eval_cov_loss": 0.006082957281973468,
804
+ "eval_global_kurtosis": 32.18490342353577,
805
+ "eval_global_mean": -0.013440034569126286,
806
+ "eval_global_var": 1.8210059039668949,
807
+ "eval_loss": 0.1298266947609649,
808
+ "eval_mse_loss": 0.6762152809530633,
809
+ "eval_per_var": 1.8439484339326484,
810
+ "eval_runtime": 150.5474,
811
+ "eval_samples_per_second": 185.941,
812
+ "eval_steps_per_second": 2.909,
813
+ "eval_within_var": 0.8750175644545795,
814
+ "step": 12288
815
+ }
816
+ ],
817
+ "logging_steps": 256,
818
+ "max_steps": 21651,
819
+ "num_input_tokens_seen": 0,
820
+ "num_train_epochs": 1,
821
+ "save_steps": 1024,
822
+ "stateful_callbacks": {
823
+ "TrainerControl": {
824
+ "args": {
825
+ "should_epoch_stop": false,
826
+ "should_evaluate": false,
827
+ "should_log": false,
828
+ "should_save": true,
829
+ "should_training_stop": false
830
+ },
831
+ "attributes": {}
832
+ }
833
+ },
834
+ "total_flos": 0.0,
835
+ "train_batch_size": 64,
836
+ "trial_name": null,
837
+ "trial_params": null
838
+ }
checkpoints-v2.3/checkpoint-12288/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:267efbdef838a34be981fa3b5630732fcfd9373e6d02bbe6840e02256e0daf01
3
+ size 5777