Willowclem commited on
Commit
ba3a8b4
·
verified ·
1 Parent(s): fcdef94

checkpoint complet pour reprise

Browse files
results/checkpoint-1200/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9810fb11e27e844da85f316d6e19343b9259a84286f60e41a5a47d94851eaa0a
3
  size 9108904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aadac4b039bae373fdd4721162b0781dcca6c991bae66f228b25e86938e025d4
3
  size 9108904
results/checkpoint-1200/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48e30ea83347f9bd88caa51ac7a27ed0fee2d39497d01d7ca7e1e987cc63536d
3
  size 18287162
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abbc98f1f2e0b5315aeb9f79cd7f2c04e653a8bd49b8345dba6a8d0c6b41f7ac
3
  size 18287162
results/checkpoint-1200/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3efe79c3c2463f77fe43ad580dd60c311943b1c5433e083e7c9378757397c15
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa8a50f7b976d8c8ca34d880dd26f60dd2f851bac0a0a5095719fb54f5a75773
3
  size 14244
results/checkpoint-1200/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2946f81c3523d36c686d97ab3ae7914939afcd3c46edf20cd1c0443342cca6a1
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beeef06129d3879de46a6da795139adc62396b85b4a9bd7c58a4fe337c9a9c57
3
  size 988
results/checkpoint-1200/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d30bfb1b47382b83586c7e4fe5892e12c903176ed5cc061cd9a948072e7f2a3e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cfc5baadd288335fe7d83a0d3dd2b713a9e631fc75cb337745b4efa6e9e4c91
3
  size 1064
results/checkpoint-1200/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.7623529411764705,
6
  "eval_steps": 500,
7
  "global_step": 1200,
8
  "is_hyper_param_search": false,
@@ -10,1091 +10,1091 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.03137254901960784,
14
- "grad_norm": 2.0088300704956055,
15
- "learning_rate": 8.000000000000001e-07,
16
- "loss": 3.282,
17
- "mean_token_accuracy": 0.4480127369053662,
18
- "num_tokens": 34003.0,
19
  "step": 10
20
  },
21
  {
22
- "epoch": 0.06274509803921569,
23
- "grad_norm": 0.46773308515548706,
24
- "learning_rate": 1.7000000000000002e-06,
25
- "loss": 3.3452,
26
- "mean_token_accuracy": 0.4279281569644809,
27
- "num_tokens": 66834.0,
28
  "step": 20
29
  },
30
  {
31
- "epoch": 0.09411764705882353,
32
- "grad_norm": 1.1083784103393555,
33
- "learning_rate": 2.7000000000000004e-06,
34
- "loss": 3.1795,
35
- "mean_token_accuracy": 0.4369500808417797,
36
- "num_tokens": 102094.0,
37
  "step": 30
38
  },
39
  {
40
- "epoch": 0.12549019607843137,
41
- "grad_norm": 3.110588788986206,
42
- "learning_rate": 3.7e-06,
43
- "loss": 3.1706,
44
- "mean_token_accuracy": 0.43956867372617126,
45
- "num_tokens": 136916.0,
46
  "step": 40
47
  },
48
  {
49
- "epoch": 0.1568627450980392,
50
- "grad_norm": 0.6114773750305176,
51
- "learning_rate": 4.600000000000001e-06,
52
- "loss": 3.2986,
53
- "mean_token_accuracy": 0.4236688693985343,
54
- "num_tokens": 166339.0,
55
  "step": 50
56
  },
57
  {
58
- "epoch": 0.18823529411764706,
59
- "grad_norm": 1.4991090297698975,
60
- "learning_rate": 5.600000000000001e-06,
61
- "loss": 3.3758,
62
- "mean_token_accuracy": 0.4320780340582132,
63
- "num_tokens": 193757.0,
64
  "step": 60
65
  },
66
  {
67
- "epoch": 0.2196078431372549,
68
- "grad_norm": 1.0190929174423218,
69
- "learning_rate": 6.600000000000001e-06,
70
- "loss": 3.5999,
71
- "mean_token_accuracy": 0.4074632978066802,
72
- "num_tokens": 227753.0,
73
  "step": 70
74
  },
75
  {
76
- "epoch": 0.25098039215686274,
77
- "grad_norm": 0.5823692679405212,
78
- "learning_rate": 7.600000000000001e-06,
79
- "loss": 3.242,
80
- "mean_token_accuracy": 0.4243007113225758,
81
- "num_tokens": 258774.0,
82
  "step": 80
83
  },
84
  {
85
- "epoch": 0.2823529411764706,
86
- "grad_norm": 1.197152018547058,
87
- "learning_rate": 8.6e-06,
88
- "loss": 3.7351,
89
- "mean_token_accuracy": 0.40340174464508893,
90
- "num_tokens": 289476.0,
91
  "step": 90
92
  },
93
  {
94
- "epoch": 0.3137254901960784,
95
- "grad_norm": 1.116959810256958,
96
- "learning_rate": 9.600000000000001e-06,
97
- "loss": 3.4449,
98
- "mean_token_accuracy": 0.42097287215292456,
99
- "num_tokens": 319562.0,
100
  "step": 100
101
  },
102
  {
103
- "epoch": 0.34509803921568627,
104
- "grad_norm": 2.1092543601989746,
105
- "learning_rate": 9.948805460750855e-06,
106
- "loss": 3.2034,
107
- "mean_token_accuracy": 0.42690765811130404,
108
- "num_tokens": 350950.0,
109
  "step": 110
110
  },
111
  {
112
- "epoch": 0.3764705882352941,
113
- "grad_norm": 0.726530909538269,
114
- "learning_rate": 9.863481228668942e-06,
115
- "loss": 3.1113,
116
- "mean_token_accuracy": 0.44094684603624046,
117
- "num_tokens": 379819.0,
118
  "step": 120
119
  },
120
  {
121
- "epoch": 0.40784313725490196,
122
- "grad_norm": 1.3136755228042603,
123
- "learning_rate": 9.778156996587031e-06,
124
- "loss": 3.1945,
125
- "mean_token_accuracy": 0.448084157705307,
126
- "num_tokens": 412785.0,
127
  "step": 130
128
  },
129
  {
130
- "epoch": 0.4392156862745098,
131
- "grad_norm": 0.9245865941047668,
132
- "learning_rate": 9.69283276450512e-06,
133
- "loss": 3.0248,
134
- "mean_token_accuracy": 0.4554275684058666,
135
- "num_tokens": 442964.0,
136
  "step": 140
137
  },
138
  {
139
- "epoch": 0.47058823529411764,
140
- "grad_norm": 4.568413257598877,
141
- "learning_rate": 9.607508532423209e-06,
142
- "loss": 3.0576,
143
- "mean_token_accuracy": 0.45087954150512816,
144
- "num_tokens": 473446.0,
145
  "step": 150
146
  },
147
  {
148
- "epoch": 0.5019607843137255,
149
- "grad_norm": 7.357224464416504,
150
- "learning_rate": 9.522184300341298e-06,
151
- "loss": 3.195,
152
- "mean_token_accuracy": 0.4267027805559337,
153
- "num_tokens": 503608.0,
154
  "step": 160
155
  },
156
  {
157
- "epoch": 0.5333333333333333,
158
- "grad_norm": 0.9659298658370972,
159
- "learning_rate": 9.436860068259387e-06,
160
- "loss": 3.1946,
161
- "mean_token_accuracy": 0.4488052343018353,
162
- "num_tokens": 533341.0,
163
  "step": 170
164
  },
165
  {
166
- "epoch": 0.5647058823529412,
167
- "grad_norm": 1.9798550605773926,
168
- "learning_rate": 9.351535836177476e-06,
169
- "loss": 3.25,
170
- "mean_token_accuracy": 0.4342062085866928,
171
- "num_tokens": 563710.0,
172
  "step": 180
173
  },
174
  {
175
- "epoch": 0.596078431372549,
176
- "grad_norm": 2.385053873062134,
177
- "learning_rate": 9.266211604095564e-06,
178
- "loss": 2.8966,
179
- "mean_token_accuracy": 0.4620134405791759,
180
- "num_tokens": 592080.0,
181
  "step": 190
182
  },
183
  {
184
- "epoch": 0.6274509803921569,
185
- "grad_norm": 1.955040693283081,
186
- "learning_rate": 9.180887372013653e-06,
187
- "loss": 3.2465,
188
- "mean_token_accuracy": 0.42782977214083073,
189
- "num_tokens": 621337.0,
190
  "step": 200
191
  },
192
  {
193
- "epoch": 0.6588235294117647,
194
- "grad_norm": 3.6970317363739014,
195
- "learning_rate": 9.09556313993174e-06,
196
- "loss": 3.1251,
197
- "mean_token_accuracy": 0.44717809772118927,
198
- "num_tokens": 646419.0,
199
  "step": 210
200
  },
201
  {
202
- "epoch": 0.6901960784313725,
203
- "grad_norm": 2.0861480236053467,
204
- "learning_rate": 9.01023890784983e-06,
205
- "loss": 3.1319,
206
- "mean_token_accuracy": 0.4380856929346919,
207
- "num_tokens": 678845.0,
208
  "step": 220
209
  },
210
  {
211
- "epoch": 0.7215686274509804,
212
- "grad_norm": 1.1843408346176147,
213
- "learning_rate": 8.924914675767918e-06,
214
- "loss": 3.0282,
215
- "mean_token_accuracy": 0.4654800074175,
216
- "num_tokens": 708108.0,
217
  "step": 230
218
  },
219
  {
220
- "epoch": 0.7529411764705882,
221
- "grad_norm": 2.084069013595581,
222
- "learning_rate": 8.839590443686009e-06,
223
- "loss": 3.1245,
224
- "mean_token_accuracy": 0.43198747336864474,
225
- "num_tokens": 734439.0,
226
  "step": 240
227
  },
228
  {
229
- "epoch": 0.7843137254901961,
230
- "grad_norm": 3.9663286209106445,
231
- "learning_rate": 8.754266211604096e-06,
232
- "loss": 2.8906,
233
- "mean_token_accuracy": 0.45770675158128143,
234
- "num_tokens": 763349.0,
235
  "step": 250
236
  },
237
  {
238
- "epoch": 0.8156862745098039,
239
- "grad_norm": 2.0605413913726807,
240
- "learning_rate": 8.668941979522185e-06,
241
- "loss": 2.9757,
242
- "mean_token_accuracy": 0.4534512896090746,
243
- "num_tokens": 791592.0,
244
  "step": 260
245
  },
246
  {
247
- "epoch": 0.8470588235294118,
248
- "grad_norm": 3.5317554473876953,
249
- "learning_rate": 8.583617747440274e-06,
250
- "loss": 2.8376,
251
- "mean_token_accuracy": 0.4683062855154276,
252
- "num_tokens": 825019.0,
253
  "step": 270
254
  },
255
  {
256
- "epoch": 0.8784313725490196,
257
- "grad_norm": 3.9178497791290283,
258
- "learning_rate": 8.498293515358363e-06,
259
- "loss": 2.9376,
260
- "mean_token_accuracy": 0.45492212250828745,
261
- "num_tokens": 854288.0,
262
  "step": 280
263
  },
264
  {
265
- "epoch": 0.9098039215686274,
266
- "grad_norm": 0.9526835680007935,
267
- "learning_rate": 8.412969283276451e-06,
268
- "loss": 2.8571,
269
- "mean_token_accuracy": 0.46086471611633895,
270
- "num_tokens": 884793.0,
271
  "step": 290
272
  },
273
  {
274
- "epoch": 0.9411764705882353,
275
- "grad_norm": 3.918769598007202,
276
- "learning_rate": 8.327645051194539e-06,
277
- "loss": 2.7934,
278
- "mean_token_accuracy": 0.4795181108638644,
279
- "num_tokens": 915321.0,
280
  "step": 300
281
  },
282
  {
283
- "epoch": 0.9725490196078431,
284
- "grad_norm": 3.45381760597229,
285
- "learning_rate": 8.24232081911263e-06,
286
- "loss": 2.8085,
287
- "mean_token_accuracy": 0.4741422997787595,
288
- "num_tokens": 946666.0,
289
  "step": 310
290
  },
291
  {
292
- "epoch": 1.0031372549019608,
293
- "grad_norm": 2.1785495281219482,
294
- "learning_rate": 8.156996587030718e-06,
295
- "loss": 2.8618,
296
- "mean_token_accuracy": 0.4749741800702535,
297
- "num_tokens": 974017.0,
298
  "step": 320
299
  },
300
  {
301
- "epoch": 1.0345098039215685,
302
- "grad_norm": 6.006409168243408,
303
- "learning_rate": 8.071672354948807e-06,
304
- "loss": 2.9078,
305
- "mean_token_accuracy": 0.46515854969620707,
306
- "num_tokens": 1004744.0,
307
  "step": 330
308
  },
309
  {
310
- "epoch": 1.0658823529411765,
311
- "grad_norm": 1.7984623908996582,
312
- "learning_rate": 7.986348122866894e-06,
313
- "loss": 2.9124,
314
- "mean_token_accuracy": 0.4585884911939502,
315
- "num_tokens": 1033652.0,
316
  "step": 340
317
  },
318
  {
319
- "epoch": 1.0972549019607842,
320
- "grad_norm": 2.510467052459717,
321
- "learning_rate": 7.901023890784983e-06,
322
- "loss": 2.8057,
323
- "mean_token_accuracy": 0.4740089667029679,
324
- "num_tokens": 1066035.0,
325
  "step": 350
326
  },
327
  {
328
- "epoch": 1.1286274509803922,
329
- "grad_norm": 3.545011520385742,
330
- "learning_rate": 7.815699658703072e-06,
331
- "loss": 2.8801,
332
- "mean_token_accuracy": 0.4632578143849969,
333
- "num_tokens": 1092737.0,
334
  "step": 360
335
  },
336
  {
337
- "epoch": 1.16,
338
- "grad_norm": 2.1517884731292725,
339
- "learning_rate": 7.73037542662116e-06,
340
- "loss": 2.7748,
341
- "mean_token_accuracy": 0.47425267212092875,
342
- "num_tokens": 1121228.0,
343
  "step": 370
344
  },
345
  {
346
- "epoch": 1.1913725490196079,
347
- "grad_norm": 1.727739691734314,
348
- "learning_rate": 7.64505119453925e-06,
349
- "loss": 2.7721,
350
- "mean_token_accuracy": 0.4736901242285967,
351
- "num_tokens": 1152714.0,
352
  "step": 380
353
  },
354
  {
355
- "epoch": 1.2227450980392156,
356
- "grad_norm": 2.197744131088257,
357
- "learning_rate": 7.5597269624573385e-06,
358
- "loss": 2.7644,
359
- "mean_token_accuracy": 0.47409027721732855,
360
- "num_tokens": 1184573.0,
361
  "step": 390
362
  },
363
  {
364
- "epoch": 1.2541176470588236,
365
- "grad_norm": 3.178690195083618,
366
- "learning_rate": 7.474402730375427e-06,
367
- "loss": 2.6941,
368
- "mean_token_accuracy": 0.48159148562699555,
369
- "num_tokens": 1218513.0,
370
  "step": 400
371
  },
372
  {
373
- "epoch": 1.2854901960784313,
374
- "grad_norm": 1.3430229425430298,
375
- "learning_rate": 7.389078498293516e-06,
376
- "loss": 2.5874,
377
- "mean_token_accuracy": 0.49995266608893874,
378
- "num_tokens": 1250333.0,
379
  "step": 410
380
  },
381
  {
382
- "epoch": 1.3168627450980392,
383
- "grad_norm": 3.5784506797790527,
384
- "learning_rate": 7.303754266211604e-06,
385
- "loss": 2.5586,
386
- "mean_token_accuracy": 0.5180117629468441,
387
- "num_tokens": 1286668.0,
388
  "step": 420
389
  },
390
  {
391
- "epoch": 1.348235294117647,
392
- "grad_norm": 31.7750186920166,
393
- "learning_rate": 7.218430034129693e-06,
394
- "loss": 2.6383,
395
- "mean_token_accuracy": 0.48776071686297656,
396
- "num_tokens": 1315580.0,
397
  "step": 430
398
  },
399
  {
400
- "epoch": 1.379607843137255,
401
- "grad_norm": 2.4759323596954346,
402
- "learning_rate": 7.133105802047782e-06,
403
- "loss": 2.6451,
404
- "mean_token_accuracy": 0.4944142198190093,
405
- "num_tokens": 1347539.0,
406
  "step": 440
407
  },
408
  {
409
- "epoch": 1.4109803921568629,
410
- "grad_norm": 1.7809475660324097,
411
- "learning_rate": 7.047781569965872e-06,
412
- "loss": 2.7221,
413
- "mean_token_accuracy": 0.47517210952937605,
414
- "num_tokens": 1377083.0,
415
  "step": 450
416
  },
417
  {
418
- "epoch": 1.4423529411764706,
419
- "grad_norm": 1.1610660552978516,
420
- "learning_rate": 6.96245733788396e-06,
421
- "loss": 2.5579,
422
- "mean_token_accuracy": 0.49381575733423233,
423
- "num_tokens": 1408914.0,
424
  "step": 460
425
  },
426
  {
427
- "epoch": 1.4737254901960783,
428
- "grad_norm": 4.139962673187256,
429
- "learning_rate": 6.877133105802049e-06,
430
- "loss": 2.9326,
431
- "mean_token_accuracy": 0.45861218236386775,
432
- "num_tokens": 1438118.0,
433
  "step": 470
434
  },
435
  {
436
- "epoch": 1.5050980392156863,
437
- "grad_norm": 3.0993845462799072,
438
- "learning_rate": 6.7918088737201375e-06,
439
- "loss": 2.8458,
440
- "mean_token_accuracy": 0.47443244988098743,
441
- "num_tokens": 1467640.0,
442
  "step": 480
443
  },
444
  {
445
- "epoch": 1.5364705882352943,
446
- "grad_norm": 1.291991949081421,
447
- "learning_rate": 6.7064846416382255e-06,
448
- "loss": 2.6781,
449
- "mean_token_accuracy": 0.4779525174759328,
450
- "num_tokens": 1495733.0,
451
  "step": 490
452
  },
453
  {
454
- "epoch": 1.567843137254902,
455
- "grad_norm": 4.795923709869385,
456
- "learning_rate": 6.621160409556314e-06,
457
- "loss": 2.9197,
458
- "mean_token_accuracy": 0.4680457916110754,
459
- "num_tokens": 1525251.0,
460
  "step": 500
461
  },
462
  {
463
- "epoch": 1.5992156862745097,
464
- "grad_norm": 1.3896703720092773,
465
- "learning_rate": 6.535836177474402e-06,
466
- "loss": 2.6147,
467
- "mean_token_accuracy": 0.49835432767868043,
468
- "num_tokens": 1554363.0,
469
  "step": 510
470
  },
471
  {
472
- "epoch": 1.6305882352941177,
473
- "grad_norm": 1.1814641952514648,
474
- "learning_rate": 6.450511945392492e-06,
475
- "loss": 2.6656,
476
- "mean_token_accuracy": 0.48573412485420703,
477
- "num_tokens": 1581026.0,
478
  "step": 520
479
  },
480
  {
481
- "epoch": 1.6619607843137256,
482
- "grad_norm": 1.8640310764312744,
483
- "learning_rate": 6.365187713310581e-06,
484
- "loss": 2.5826,
485
- "mean_token_accuracy": 0.4969061462208629,
486
- "num_tokens": 1611477.0,
487
  "step": 530
488
  },
489
  {
490
- "epoch": 1.6933333333333334,
491
- "grad_norm": 4.471650123596191,
492
- "learning_rate": 6.27986348122867e-06,
493
- "loss": 2.6517,
494
- "mean_token_accuracy": 0.4934783162549138,
495
- "num_tokens": 1641681.0,
496
  "step": 540
497
  },
498
  {
499
- "epoch": 1.724705882352941,
500
- "grad_norm": 3.423351526260376,
501
- "learning_rate": 6.194539249146758e-06,
502
- "loss": 2.6683,
503
- "mean_token_accuracy": 0.48104359675198793,
504
- "num_tokens": 1670996.0,
505
  "step": 550
506
  },
507
  {
508
- "epoch": 1.756078431372549,
509
- "grad_norm": 1.9675357341766357,
510
- "learning_rate": 6.109215017064847e-06,
511
- "loss": 2.5381,
512
- "mean_token_accuracy": 0.49859709180891515,
513
- "num_tokens": 1702169.0,
514
  "step": 560
515
  },
516
  {
517
- "epoch": 1.787450980392157,
518
- "grad_norm": 1.6399911642074585,
519
- "learning_rate": 6.023890784982936e-06,
520
- "loss": 2.5058,
521
- "mean_token_accuracy": 0.5064322877675295,
522
- "num_tokens": 1731408.0,
523
  "step": 570
524
  },
525
  {
526
- "epoch": 1.8188235294117647,
527
- "grad_norm": 1.8453171253204346,
528
- "learning_rate": 5.938566552901024e-06,
529
- "loss": 2.6272,
530
- "mean_token_accuracy": 0.4801918284967542,
531
- "num_tokens": 1759204.0,
532
  "step": 580
533
  },
534
  {
535
- "epoch": 1.8501960784313725,
536
- "grad_norm": 1.7112871408462524,
537
- "learning_rate": 5.853242320819113e-06,
538
- "loss": 2.4362,
539
- "mean_token_accuracy": 0.512086040340364,
540
- "num_tokens": 1789717.0,
541
  "step": 590
542
  },
543
  {
544
- "epoch": 1.8815686274509804,
545
- "grad_norm": 3.174295663833618,
546
- "learning_rate": 5.767918088737202e-06,
547
- "loss": 2.5042,
548
- "mean_token_accuracy": 0.5141274336725473,
549
- "num_tokens": 1821803.0,
550
  "step": 600
551
  },
552
  {
553
- "epoch": 1.9129411764705884,
554
- "grad_norm": 3.231480121612549,
555
- "learning_rate": 5.682593856655291e-06,
556
- "loss": 2.6359,
557
- "mean_token_accuracy": 0.49160230327397586,
558
- "num_tokens": 1853817.0,
559
  "step": 610
560
  },
561
  {
562
- "epoch": 1.944313725490196,
563
- "grad_norm": 1.1881468296051025,
564
- "learning_rate": 5.597269624573379e-06,
565
- "loss": 2.4535,
566
- "mean_token_accuracy": 0.5213793812319636,
567
- "num_tokens": 1885929.0,
568
  "step": 620
569
  },
570
  {
571
- "epoch": 1.9756862745098038,
572
- "grad_norm": 1.3049256801605225,
573
- "learning_rate": 5.511945392491468e-06,
574
- "loss": 2.5596,
575
- "mean_token_accuracy": 0.5133258309215307,
576
- "num_tokens": 1918060.0,
577
  "step": 630
578
  },
579
  {
580
- "epoch": 2.0062745098039216,
581
- "grad_norm": 2.1421661376953125,
582
- "learning_rate": 5.426621160409556e-06,
583
- "loss": 2.4831,
584
- "mean_token_accuracy": 0.5165034267000663,
585
- "num_tokens": 1948420.0,
586
  "step": 640
587
  },
588
  {
589
- "epoch": 2.0376470588235294,
590
- "grad_norm": 2.0425727367401123,
591
- "learning_rate": 5.341296928327645e-06,
592
- "loss": 2.3654,
593
- "mean_token_accuracy": 0.5259943537414074,
594
- "num_tokens": 1977715.0,
595
  "step": 650
596
  },
597
  {
598
- "epoch": 2.069019607843137,
599
- "grad_norm": 4.167781352996826,
600
- "learning_rate": 5.255972696245735e-06,
601
- "loss": 2.3315,
602
- "mean_token_accuracy": 0.5249333314597606,
603
- "num_tokens": 2008534.0,
604
  "step": 660
605
  },
606
  {
607
- "epoch": 2.1003921568627453,
608
- "grad_norm": 1.0092592239379883,
609
- "learning_rate": 5.1706484641638235e-06,
610
- "loss": 2.5238,
611
- "mean_token_accuracy": 0.5057306325063109,
612
- "num_tokens": 2039030.0,
613
  "step": 670
614
  },
615
  {
616
- "epoch": 2.131764705882353,
617
- "grad_norm": 1.6947963237762451,
618
- "learning_rate": 5.0853242320819115e-06,
619
- "loss": 2.5809,
620
- "mean_token_accuracy": 0.5050426244735717,
621
- "num_tokens": 2068912.0,
622
  "step": 680
623
  },
624
  {
625
- "epoch": 2.1631372549019607,
626
- "grad_norm": 1.5759137868881226,
627
- "learning_rate": 5e-06,
628
- "loss": 2.4439,
629
- "mean_token_accuracy": 0.5173273866996169,
630
- "num_tokens": 2101461.0,
631
  "step": 690
632
  },
633
  {
634
- "epoch": 2.1945098039215685,
635
- "grad_norm": 1.685102939605713,
636
- "learning_rate": 4.914675767918089e-06,
637
- "loss": 2.4616,
638
- "mean_token_accuracy": 0.5100228149443865,
639
- "num_tokens": 2131232.0,
640
  "step": 700
641
  },
642
  {
643
- "epoch": 2.2258823529411766,
644
- "grad_norm": 1.9910387992858887,
645
- "learning_rate": 4.829351535836178e-06,
646
- "loss": 2.3545,
647
- "mean_token_accuracy": 0.5206725034862757,
648
- "num_tokens": 2160460.0,
649
  "step": 710
650
  },
651
  {
652
- "epoch": 2.2572549019607844,
653
- "grad_norm": 1.7385118007659912,
654
- "learning_rate": 4.744027303754267e-06,
655
- "loss": 2.521,
656
- "mean_token_accuracy": 0.503148902207613,
657
- "num_tokens": 2188175.0,
658
  "step": 720
659
  },
660
  {
661
- "epoch": 2.288627450980392,
662
- "grad_norm": 5.597545623779297,
663
- "learning_rate": 4.658703071672355e-06,
664
- "loss": 2.467,
665
- "mean_token_accuracy": 0.5022781057283282,
666
- "num_tokens": 2218714.0,
667
  "step": 730
668
  },
669
  {
670
- "epoch": 2.32,
671
- "grad_norm": 1.7059907913208008,
672
- "learning_rate": 4.573378839590444e-06,
673
- "loss": 2.4086,
674
- "mean_token_accuracy": 0.504382885247469,
675
- "num_tokens": 2249170.0,
676
  "step": 740
677
  },
678
  {
679
- "epoch": 2.351372549019608,
680
- "grad_norm": 1.951714277267456,
681
- "learning_rate": 4.488054607508533e-06,
682
- "loss": 2.3236,
683
- "mean_token_accuracy": 0.5256480574607849,
684
- "num_tokens": 2280286.0,
685
  "step": 750
686
  },
687
  {
688
- "epoch": 2.3827450980392157,
689
- "grad_norm": 1.0276103019714355,
690
- "learning_rate": 4.402730375426622e-06,
691
- "loss": 2.3727,
692
- "mean_token_accuracy": 0.5266215573996306,
693
- "num_tokens": 2311312.0,
694
  "step": 760
695
  },
696
  {
697
- "epoch": 2.4141176470588235,
698
- "grad_norm": 2.829286813735962,
699
- "learning_rate": 4.31740614334471e-06,
700
- "loss": 2.5146,
701
- "mean_token_accuracy": 0.5105616014450789,
702
- "num_tokens": 2340935.0,
703
  "step": 770
704
  },
705
  {
706
- "epoch": 2.445490196078431,
707
- "grad_norm": 3.0118846893310547,
708
- "learning_rate": 4.232081911262799e-06,
709
- "loss": 2.3505,
710
- "mean_token_accuracy": 0.5210155340842902,
711
- "num_tokens": 2370291.0,
712
  "step": 780
713
  },
714
  {
715
- "epoch": 2.4768627450980394,
716
- "grad_norm": 1.9568514823913574,
717
- "learning_rate": 4.1467576791808874e-06,
718
- "loss": 2.3832,
719
- "mean_token_accuracy": 0.5071445981040597,
720
- "num_tokens": 2399843.0,
721
  "step": 790
722
  },
723
  {
724
- "epoch": 2.508235294117647,
725
- "grad_norm": 1.8932603597640991,
726
- "learning_rate": 4.061433447098976e-06,
727
- "loss": 2.3508,
728
- "mean_token_accuracy": 0.5251543965190649,
729
- "num_tokens": 2428762.0,
730
  "step": 800
731
  },
732
  {
733
- "epoch": 2.539607843137255,
734
- "grad_norm": 1.755767822265625,
735
- "learning_rate": 3.976109215017065e-06,
736
- "loss": 2.3532,
737
- "mean_token_accuracy": 0.5324380807578564,
738
- "num_tokens": 2458475.0,
739
  "step": 810
740
  },
741
  {
742
- "epoch": 2.5709803921568626,
743
- "grad_norm": 2.4889233112335205,
744
- "learning_rate": 3.890784982935154e-06,
745
- "loss": 2.6067,
746
- "mean_token_accuracy": 0.5031498618423939,
747
- "num_tokens": 2489770.0,
748
  "step": 820
749
  },
750
  {
751
- "epoch": 2.6023529411764708,
752
- "grad_norm": 4.700379371643066,
753
- "learning_rate": 3.8054607508532425e-06,
754
- "loss": 2.5566,
755
- "mean_token_accuracy": 0.502924164570868,
756
- "num_tokens": 2521156.0,
757
  "step": 830
758
  },
759
  {
760
- "epoch": 2.6337254901960785,
761
- "grad_norm": 12.594019889831543,
762
- "learning_rate": 3.7201365187713314e-06,
763
- "loss": 2.1664,
764
- "mean_token_accuracy": 0.5561403293162585,
765
- "num_tokens": 2553903.0,
766
  "step": 840
767
  },
768
  {
769
- "epoch": 2.665098039215686,
770
- "grad_norm": 5.380671977996826,
771
- "learning_rate": 3.6348122866894202e-06,
772
- "loss": 2.3804,
773
- "mean_token_accuracy": 0.5276698149740696,
774
- "num_tokens": 2583417.0,
775
  "step": 850
776
  },
777
  {
778
- "epoch": 2.696470588235294,
779
- "grad_norm": 6.616447448730469,
780
- "learning_rate": 3.5494880546075087e-06,
781
- "loss": 2.4498,
782
- "mean_token_accuracy": 0.5167227942496538,
783
- "num_tokens": 2612099.0,
784
  "step": 860
785
  },
786
  {
787
- "epoch": 2.7278431372549017,
788
- "grad_norm": 1.3597829341888428,
789
- "learning_rate": 3.4641638225255976e-06,
790
- "loss": 2.173,
791
- "mean_token_accuracy": 0.5551321767270565,
792
- "num_tokens": 2644692.0,
793
  "step": 870
794
  },
795
  {
796
- "epoch": 2.75921568627451,
797
- "grad_norm": 2.5514867305755615,
798
- "learning_rate": 3.378839590443686e-06,
799
- "loss": 2.3411,
800
- "mean_token_accuracy": 0.534308859705925,
801
- "num_tokens": 2680221.0,
802
  "step": 880
803
  },
804
  {
805
- "epoch": 2.7905882352941176,
806
- "grad_norm": 2.470513105392456,
807
- "learning_rate": 3.2935153583617753e-06,
808
- "loss": 2.3716,
809
- "mean_token_accuracy": 0.5275221727788448,
810
- "num_tokens": 2715613.0,
811
  "step": 890
812
  },
813
  {
814
- "epoch": 2.8219607843137258,
815
- "grad_norm": 1.194263219833374,
816
- "learning_rate": 3.2081911262798638e-06,
817
- "loss": 2.3571,
818
- "mean_token_accuracy": 0.5199422530829907,
819
- "num_tokens": 2745234.0,
820
  "step": 900
821
  },
822
  {
823
- "epoch": 2.8533333333333335,
824
- "grad_norm": Infinity,
825
- "learning_rate": 3.122866894197952e-06,
826
- "loss": 2.4158,
827
- "mean_token_accuracy": 0.5191751107573509,
828
- "num_tokens": 2775161.0,
829
  "step": 910
830
  },
831
  {
832
- "epoch": 2.8847058823529412,
833
- "grad_norm": 1.294569492340088,
834
- "learning_rate": 3.046075085324232e-06,
835
- "loss": 2.3558,
836
- "mean_token_accuracy": 0.5214510016143322,
837
- "num_tokens": 2805373.0,
838
  "step": 920
839
  },
840
  {
841
- "epoch": 2.916078431372549,
842
- "grad_norm": 4.139784336090088,
843
- "learning_rate": 2.9607508532423213e-06,
844
- "loss": 2.3869,
845
- "mean_token_accuracy": 0.5307831708341837,
846
- "num_tokens": 2831957.0,
847
  "step": 930
848
  },
849
  {
850
- "epoch": 2.9474509803921567,
851
- "grad_norm": 1.2397838830947876,
852
- "learning_rate": 2.8754266211604098e-06,
853
- "loss": 2.3455,
854
- "mean_token_accuracy": 0.5367285626009106,
855
- "num_tokens": 2862724.0,
856
  "step": 940
857
  },
858
  {
859
- "epoch": 2.978823529411765,
860
- "grad_norm": 1.8458396196365356,
861
- "learning_rate": 2.790102389078498e-06,
862
- "loss": 2.3212,
863
- "mean_token_accuracy": 0.540785015001893,
864
- "num_tokens": 2895266.0,
865
  "step": 950
866
  },
867
  {
868
- "epoch": 3.0094117647058822,
869
- "grad_norm": 2.0150907039642334,
870
- "learning_rate": 2.7047781569965875e-06,
871
- "loss": 2.3589,
872
- "mean_token_accuracy": 0.5204295409031403,
873
- "num_tokens": 2924126.0,
874
  "step": 960
875
  },
876
  {
877
- "epoch": 3.0407843137254904,
878
- "grad_norm": 10.822606086730957,
879
- "learning_rate": 2.619453924914676e-06,
880
- "loss": 2.1408,
881
- "mean_token_accuracy": 0.5493647336959839,
882
- "num_tokens": 2956817.0,
883
  "step": 970
884
  },
885
  {
886
- "epoch": 3.072156862745098,
887
- "grad_norm": 1.3175485134124756,
888
- "learning_rate": 2.534129692832765e-06,
889
- "loss": 2.3916,
890
- "mean_token_accuracy": 0.5206685658544302,
891
- "num_tokens": 2986467.0,
892
  "step": 980
893
  },
894
  {
895
- "epoch": 3.103529411764706,
896
- "grad_norm": 1.7138490676879883,
897
- "learning_rate": 2.4488054607508537e-06,
898
- "loss": 2.3403,
899
- "mean_token_accuracy": 0.5319944698363542,
900
- "num_tokens": 3018127.0,
901
  "step": 990
902
  },
903
  {
904
- "epoch": 3.1349019607843136,
905
- "grad_norm": 1.6033964157104492,
906
- "learning_rate": 2.363481228668942e-06,
907
- "loss": 2.2751,
908
- "mean_token_accuracy": 0.5398386877030135,
909
- "num_tokens": 3047280.0,
910
  "step": 1000
911
  },
912
  {
913
- "epoch": 3.1662745098039213,
914
- "grad_norm": 7.103280544281006,
915
- "learning_rate": 2.278156996587031e-06,
916
- "loss": 2.3816,
917
- "mean_token_accuracy": 0.5190372098237276,
918
- "num_tokens": 3077137.0,
919
  "step": 1010
920
  },
921
  {
922
- "epoch": 3.1976470588235295,
923
- "grad_norm": 2.4392924308776855,
924
- "learning_rate": 2.1928327645051195e-06,
925
- "loss": 2.3052,
926
- "mean_token_accuracy": 0.5296947434544563,
927
- "num_tokens": 3106067.0,
928
  "step": 1020
929
  },
930
  {
931
- "epoch": 3.2290196078431372,
932
- "grad_norm": 1.4106686115264893,
933
- "learning_rate": 2.1075085324232083e-06,
934
- "loss": 2.3615,
935
- "mean_token_accuracy": 0.525895349867642,
936
- "num_tokens": 3136450.0,
937
  "step": 1030
938
  },
939
  {
940
- "epoch": 3.260392156862745,
941
- "grad_norm": 3.269272565841675,
942
- "learning_rate": 2.022184300341297e-06,
943
- "loss": 2.3037,
944
- "mean_token_accuracy": 0.5490067519247532,
945
- "num_tokens": 3166808.0,
946
  "step": 1040
947
  },
948
  {
949
- "epoch": 3.291764705882353,
950
- "grad_norm": 1.5100555419921875,
951
- "learning_rate": 1.9368600682593857e-06,
952
- "loss": 2.3014,
953
- "mean_token_accuracy": 0.5390114476904273,
954
- "num_tokens": 3197483.0,
955
  "step": 1050
956
  },
957
  {
958
- "epoch": 3.323137254901961,
959
- "grad_norm": 1.4328869581222534,
960
- "learning_rate": 1.8515358361774745e-06,
961
- "loss": 2.2193,
962
- "mean_token_accuracy": 0.5445488292723895,
963
- "num_tokens": 3229662.0,
964
  "step": 1060
965
  },
966
  {
967
- "epoch": 3.3545098039215686,
968
- "grad_norm": 0.9292280077934265,
969
- "learning_rate": 1.7662116040955632e-06,
970
- "loss": 2.1304,
971
- "mean_token_accuracy": 0.5581423584371805,
972
- "num_tokens": 3262175.0,
973
  "step": 1070
974
  },
975
  {
976
- "epoch": 3.3858823529411763,
977
- "grad_norm": 2.55062198638916,
978
- "learning_rate": 1.680887372013652e-06,
979
- "loss": 2.4022,
980
- "mean_token_accuracy": 0.5283184833824635,
981
- "num_tokens": 3291239.0,
982
  "step": 1080
983
  },
984
  {
985
- "epoch": 3.417254901960784,
986
- "grad_norm": 3.2028212547302246,
987
- "learning_rate": 1.5955631399317405e-06,
988
- "loss": 2.4047,
989
- "mean_token_accuracy": 0.530560277402401,
990
- "num_tokens": 3321636.0,
991
  "step": 1090
992
  },
993
  {
994
- "epoch": 3.4486274509803923,
995
- "grad_norm": 1.1053611040115356,
996
- "learning_rate": 1.5102389078498294e-06,
997
- "loss": 2.0193,
998
- "mean_token_accuracy": 0.5678496524691582,
999
- "num_tokens": 3355839.0,
1000
  "step": 1100
1001
  },
1002
  {
1003
- "epoch": 3.48,
1004
- "grad_norm": 1.1278761625289917,
1005
- "learning_rate": 1.4249146757679183e-06,
1006
- "loss": 2.1899,
1007
- "mean_token_accuracy": 0.5349464191123843,
1008
- "num_tokens": 3390743.0,
1009
  "step": 1110
1010
  },
1011
  {
1012
- "epoch": 3.5113725490196077,
1013
- "grad_norm": 1.3680450916290283,
1014
- "learning_rate": 1.339590443686007e-06,
1015
- "loss": 2.3307,
1016
- "mean_token_accuracy": 0.5308054933324456,
1017
- "num_tokens": 3422911.0,
1018
  "step": 1120
1019
  },
1020
  {
1021
- "epoch": 3.542745098039216,
1022
- "grad_norm": 3.9734294414520264,
1023
- "learning_rate": 1.2542662116040958e-06,
1024
- "loss": 2.2857,
1025
- "mean_token_accuracy": 0.5387092420831323,
1026
- "num_tokens": 3453759.0,
1027
  "step": 1130
1028
  },
1029
  {
1030
- "epoch": 3.5741176470588236,
1031
- "grad_norm": 2.855978012084961,
1032
- "learning_rate": 1.1689419795221844e-06,
1033
- "loss": 2.2933,
1034
- "mean_token_accuracy": 0.5302057925611734,
1035
- "num_tokens": 3482976.0,
1036
  "step": 1140
1037
  },
1038
  {
1039
- "epoch": 3.6054901960784314,
1040
- "grad_norm": 2.837674617767334,
1041
- "learning_rate": 1.0836177474402731e-06,
1042
- "loss": 2.3656,
1043
- "mean_token_accuracy": 0.5338190544396639,
1044
- "num_tokens": 3512124.0,
1045
  "step": 1150
1046
  },
1047
  {
1048
- "epoch": 3.636862745098039,
1049
- "grad_norm": 1.6821599006652832,
1050
- "learning_rate": 9.982935153583618e-07,
1051
- "loss": 2.3696,
1052
- "mean_token_accuracy": 0.5232982926070691,
1053
- "num_tokens": 3539944.0,
1054
  "step": 1160
1055
  },
1056
  {
1057
- "epoch": 3.668235294117647,
1058
- "grad_norm": 8.743041038513184,
1059
- "learning_rate": 9.129692832764505e-07,
1060
- "loss": 2.3186,
1061
- "mean_token_accuracy": 0.5293452955782414,
1062
- "num_tokens": 3568686.0,
1063
  "step": 1170
1064
  },
1065
  {
1066
- "epoch": 3.699607843137255,
1067
- "grad_norm": 3.6034657955169678,
1068
- "learning_rate": 8.276450511945393e-07,
1069
- "loss": 2.474,
1070
- "mean_token_accuracy": 0.518931976519525,
1071
- "num_tokens": 3596306.0,
1072
  "step": 1180
1073
  },
1074
  {
1075
- "epoch": 3.7309803921568627,
1076
- "grad_norm": 1.2798527479171753,
1077
- "learning_rate": 7.42320819112628e-07,
1078
- "loss": 2.1739,
1079
- "mean_token_accuracy": 0.5471075214445591,
1080
- "num_tokens": 3625513.0,
1081
  "step": 1190
1082
  },
1083
  {
1084
- "epoch": 3.7623529411764705,
1085
- "grad_norm": 1.1355539560317993,
1086
- "learning_rate": 6.569965870307168e-07,
1087
- "loss": 2.2781,
1088
- "mean_token_accuracy": 0.5349656146019697,
1089
- "num_tokens": 3658136.0,
1090
  "step": 1200
1091
  }
1092
  ],
1093
  "logging_steps": 10,
1094
- "max_steps": 1272,
1095
  "num_input_tokens_seen": 0,
1096
- "num_train_epochs": 4,
1097
- "save_steps": 200,
1098
  "stateful_callbacks": {
1099
  "TrainerControl": {
1100
  "args": {
@@ -1107,7 +1107,7 @@
1107
  "attributes": {}
1108
  }
1109
  },
1110
- "total_flos": 6.324879825159782e+16,
1111
  "train_batch_size": 1,
1112
  "trial_name": null,
1113
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.4242424242424243,
6
  "eval_steps": 500,
7
  "global_step": 1200,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.020202020202020204,
14
+ "grad_norm": 0.7669611573219299,
15
+ "learning_rate": 8.000000000000001e-06,
16
+ "loss": 3.4353,
17
+ "mean_token_accuracy": 0.4085813149809837,
18
+ "num_tokens": 19518.0,
19
  "step": 10
20
  },
21
  {
22
+ "epoch": 0.04040404040404041,
23
+ "grad_norm": 1.0622327327728271,
24
+ "learning_rate": 1.7000000000000003e-05,
25
+ "loss": 2.917,
26
+ "mean_token_accuracy": 0.47086485363543035,
27
+ "num_tokens": 39828.0,
28
  "step": 20
29
  },
30
  {
31
+ "epoch": 0.06060606060606061,
32
+ "grad_norm": 0.7585554718971252,
33
+ "learning_rate": 2.6000000000000002e-05,
34
+ "loss": 3.4086,
35
+ "mean_token_accuracy": 0.4174537444487214,
36
+ "num_tokens": 60025.0,
37
  "step": 30
38
  },
39
  {
40
+ "epoch": 0.08080808080808081,
41
+ "grad_norm": 0.8688523173332214,
42
+ "learning_rate": 3.6e-05,
43
+ "loss": 3.0871,
44
+ "mean_token_accuracy": 0.44842766746878626,
45
+ "num_tokens": 79198.0,
46
  "step": 40
47
  },
48
  {
49
+ "epoch": 0.10101010101010101,
50
+ "grad_norm": 1.1718096733093262,
51
+ "learning_rate": 4.600000000000001e-05,
52
+ "loss": 2.8734,
53
+ "mean_token_accuracy": 0.47476900182664394,
54
+ "num_tokens": 99513.0,
55
  "step": 50
56
  },
57
  {
58
+ "epoch": 0.12121212121212122,
59
+ "grad_norm": 4.563867092132568,
60
+ "learning_rate": 5.500000000000001e-05,
61
+ "loss": 3.3615,
62
+ "mean_token_accuracy": 0.4317817037925124,
63
+ "num_tokens": 117370.0,
64
  "step": 60
65
  },
66
  {
67
+ "epoch": 0.1414141414141414,
68
+ "grad_norm": 1.2560386657714844,
69
+ "learning_rate": 6.500000000000001e-05,
70
+ "loss": 3.283,
71
+ "mean_token_accuracy": 0.4325120337307453,
72
+ "num_tokens": 135492.0,
73
  "step": 70
74
  },
75
  {
76
+ "epoch": 0.16161616161616163,
77
+ "grad_norm": 0.9355543255805969,
78
+ "learning_rate": 7.500000000000001e-05,
79
+ "loss": 2.6181,
80
+ "mean_token_accuracy": 0.5057813063263893,
81
+ "num_tokens": 155520.0,
82
  "step": 80
83
  },
84
  {
85
+ "epoch": 0.18181818181818182,
86
+ "grad_norm": 3.2217044830322266,
87
+ "learning_rate": 8.5e-05,
88
+ "loss": 2.7865,
89
+ "mean_token_accuracy": 0.4679586015641689,
90
+ "num_tokens": 175768.0,
91
  "step": 90
92
  },
93
  {
94
+ "epoch": 0.20202020202020202,
95
+ "grad_norm": 3.879002809524536,
96
+ "learning_rate": 9.5e-05,
97
+ "loss": 2.5889,
98
+ "mean_token_accuracy": 0.4929826859384775,
99
+ "num_tokens": 194625.0,
100
  "step": 100
101
  },
102
  {
103
+ "epoch": 0.2222222222222222,
104
+ "grad_norm": 4.434224605560303,
105
+ "learning_rate": 9.96389891696751e-05,
106
+ "loss": 2.8938,
107
+ "mean_token_accuracy": 0.4700875423848629,
108
+ "num_tokens": 213171.0,
109
  "step": 110
110
  },
111
  {
112
+ "epoch": 0.24242424242424243,
113
+ "grad_norm": 9.846081733703613,
114
+ "learning_rate": 9.891696750902527e-05,
115
+ "loss": 2.343,
116
+ "mean_token_accuracy": 0.5364672098308801,
117
+ "num_tokens": 233264.0,
118
  "step": 120
119
  },
120
  {
121
+ "epoch": 0.26262626262626265,
122
+ "grad_norm": 1.6914633512496948,
123
+ "learning_rate": 9.819494584837545e-05,
124
+ "loss": 2.0776,
125
+ "mean_token_accuracy": 0.5672583125531674,
126
+ "num_tokens": 253987.0,
127
  "step": 130
128
  },
129
  {
130
+ "epoch": 0.2828282828282828,
131
+ "grad_norm": 2.6192626953125,
132
+ "learning_rate": 9.747292418772563e-05,
133
+ "loss": 2.3453,
134
+ "mean_token_accuracy": 0.5441015616059304,
135
+ "num_tokens": 270290.0,
136
  "step": 140
137
  },
138
  {
139
+ "epoch": 0.30303030303030304,
140
+ "grad_norm": 1.5915228128433228,
141
+ "learning_rate": 9.675090252707581e-05,
142
+ "loss": 2.3179,
143
+ "mean_token_accuracy": 0.5427416026592254,
144
+ "num_tokens": 287937.0,
145
  "step": 150
146
  },
147
  {
148
+ "epoch": 0.32323232323232326,
149
+ "grad_norm": 3.6255054473876953,
150
+ "learning_rate": 9.6028880866426e-05,
151
+ "loss": 2.0695,
152
+ "mean_token_accuracy": 0.5739563502371311,
153
+ "num_tokens": 307751.0,
154
  "step": 160
155
  },
156
  {
157
+ "epoch": 0.3434343434343434,
158
+ "grad_norm": 1.644443392753601,
159
+ "learning_rate": 9.530685920577617e-05,
160
+ "loss": 2.0005,
161
+ "mean_token_accuracy": 0.5894926242530346,
162
+ "num_tokens": 329149.0,
163
  "step": 170
164
  },
165
  {
166
+ "epoch": 0.36363636363636365,
167
+ "grad_norm": 3.0595431327819824,
168
+ "learning_rate": 9.458483754512635e-05,
169
+ "loss": 2.1171,
170
+ "mean_token_accuracy": 0.563767921924591,
171
+ "num_tokens": 346950.0,
172
  "step": 180
173
  },
174
  {
175
+ "epoch": 0.3838383838383838,
176
+ "grad_norm": 4.366697311401367,
177
+ "learning_rate": 9.386281588447655e-05,
178
+ "loss": 1.8502,
179
+ "mean_token_accuracy": 0.6056702233850956,
180
+ "num_tokens": 365017.0,
181
  "step": 190
182
  },
183
  {
184
+ "epoch": 0.40404040404040403,
185
+ "grad_norm": 2.07828950881958,
186
+ "learning_rate": 9.314079422382673e-05,
187
+ "loss": 1.7173,
188
+ "mean_token_accuracy": 0.621714337170124,
189
+ "num_tokens": 385734.0,
190
  "step": 200
191
  },
192
  {
193
+ "epoch": 0.42424242424242425,
194
+ "grad_norm": 2.536418914794922,
195
+ "learning_rate": 9.24187725631769e-05,
196
+ "loss": 1.8389,
197
+ "mean_token_accuracy": 0.6277161747217178,
198
+ "num_tokens": 403192.0,
199
  "step": 210
200
  },
201
  {
202
+ "epoch": 0.4444444444444444,
203
+ "grad_norm": 1.2784960269927979,
204
+ "learning_rate": 9.169675090252709e-05,
205
+ "loss": 1.8463,
206
+ "mean_token_accuracy": 0.614106347411871,
207
+ "num_tokens": 423909.0,
208
  "step": 220
209
  },
210
  {
211
+ "epoch": 0.46464646464646464,
212
+ "grad_norm": 2.1213629245758057,
213
+ "learning_rate": 9.097472924187727e-05,
214
+ "loss": 1.9916,
215
+ "mean_token_accuracy": 0.5884236626327037,
216
+ "num_tokens": 440385.0,
217
  "step": 230
218
  },
219
  {
220
+ "epoch": 0.48484848484848486,
221
+ "grad_norm": 2.149017810821533,
222
+ "learning_rate": 9.025270758122743e-05,
223
+ "loss": 1.8883,
224
+ "mean_token_accuracy": 0.5964126840233803,
225
+ "num_tokens": 458254.0,
226
  "step": 240
227
  },
228
  {
229
+ "epoch": 0.5050505050505051,
230
+ "grad_norm": 2.0171642303466797,
231
+ "learning_rate": 8.953068592057761e-05,
232
+ "loss": 2.0051,
233
+ "mean_token_accuracy": 0.5975183926522731,
234
+ "num_tokens": 473348.0,
235
  "step": 250
236
  },
237
  {
238
+ "epoch": 0.5252525252525253,
239
+ "grad_norm": 2.7957370281219482,
240
+ "learning_rate": 8.88086642599278e-05,
241
+ "loss": 1.8217,
242
+ "mean_token_accuracy": 0.6270358674228191,
243
+ "num_tokens": 494498.0,
244
  "step": 260
245
  },
246
  {
247
+ "epoch": 0.5454545454545454,
248
+ "grad_norm": 1.990042805671692,
249
+ "learning_rate": 8.808664259927798e-05,
250
+ "loss": 1.9135,
251
+ "mean_token_accuracy": 0.6138852916657924,
252
+ "num_tokens": 513100.0,
253
  "step": 270
254
  },
255
  {
256
+ "epoch": 0.5656565656565656,
257
+ "grad_norm": 2.3455405235290527,
258
+ "learning_rate": 8.736462093862816e-05,
259
+ "loss": 1.73,
260
+ "mean_token_accuracy": 0.6234532974660396,
261
+ "num_tokens": 532747.0,
262
  "step": 280
263
  },
264
  {
265
+ "epoch": 0.5858585858585859,
266
+ "grad_norm": 6.667909145355225,
267
+ "learning_rate": 8.664259927797834e-05,
268
+ "loss": 1.7277,
269
+ "mean_token_accuracy": 0.6382385298609734,
270
+ "num_tokens": 548769.0,
271
  "step": 290
272
  },
273
  {
274
+ "epoch": 0.6060606060606061,
275
+ "grad_norm": 1.917138695716858,
276
+ "learning_rate": 8.592057761732852e-05,
277
+ "loss": 1.5142,
278
+ "mean_token_accuracy": 0.6500309258699417,
279
+ "num_tokens": 567923.0,
280
  "step": 300
281
  },
282
  {
283
+ "epoch": 0.6262626262626263,
284
+ "grad_norm": 2.0420806407928467,
285
+ "learning_rate": 8.51985559566787e-05,
286
+ "loss": 1.7889,
287
+ "mean_token_accuracy": 0.6363476559519767,
288
+ "num_tokens": 585783.0,
289
  "step": 310
290
  },
291
  {
292
+ "epoch": 0.6464646464646465,
293
+ "grad_norm": 2.097153425216675,
294
+ "learning_rate": 8.447653429602888e-05,
295
+ "loss": 1.8036,
296
+ "mean_token_accuracy": 0.6113098107278347,
297
+ "num_tokens": 603216.0,
298
  "step": 320
299
  },
300
  {
301
+ "epoch": 0.6666666666666666,
302
+ "grad_norm": 1.5260653495788574,
303
+ "learning_rate": 8.375451263537906e-05,
304
+ "loss": 1.6468,
305
+ "mean_token_accuracy": 0.6486528031527996,
306
+ "num_tokens": 624173.0,
307
  "step": 330
308
  },
309
  {
310
+ "epoch": 0.6868686868686869,
311
+ "grad_norm": 1.6897279024124146,
312
+ "learning_rate": 8.303249097472924e-05,
313
+ "loss": 1.6672,
314
+ "mean_token_accuracy": 0.6469507545232773,
315
+ "num_tokens": 644656.0,
316
  "step": 340
317
  },
318
  {
319
+ "epoch": 0.7070707070707071,
320
+ "grad_norm": 3.271334648132324,
321
+ "learning_rate": 8.231046931407944e-05,
322
+ "loss": 1.7365,
323
+ "mean_token_accuracy": 0.6231018535792827,
324
+ "num_tokens": 664866.0,
325
  "step": 350
326
  },
327
  {
328
+ "epoch": 0.7272727272727273,
329
+ "grad_norm": 2.4320480823516846,
330
+ "learning_rate": 8.158844765342962e-05,
331
+ "loss": 1.7142,
332
+ "mean_token_accuracy": 0.6582800924777985,
333
+ "num_tokens": 683588.0,
334
  "step": 360
335
  },
336
  {
337
+ "epoch": 0.7474747474747475,
338
+ "grad_norm": 1.7879201173782349,
339
+ "learning_rate": 8.086642599277978e-05,
340
+ "loss": 1.7034,
341
+ "mean_token_accuracy": 0.6335549138486385,
342
+ "num_tokens": 701111.0,
343
  "step": 370
344
  },
345
  {
346
+ "epoch": 0.7676767676767676,
347
+ "grad_norm": 2.026250123977661,
348
+ "learning_rate": 8.014440433212996e-05,
349
+ "loss": 1.7315,
350
+ "mean_token_accuracy": 0.647477601468563,
351
+ "num_tokens": 719347.0,
352
  "step": 380
353
  },
354
  {
355
+ "epoch": 0.7878787878787878,
356
+ "grad_norm": 1.7138152122497559,
357
+ "learning_rate": 7.942238267148014e-05,
358
+ "loss": 1.612,
359
+ "mean_token_accuracy": 0.6578697174787521,
360
+ "num_tokens": 736038.0,
361
  "step": 390
362
  },
363
  {
364
+ "epoch": 0.8080808080808081,
365
+ "grad_norm": 1.5255950689315796,
366
+ "learning_rate": 7.870036101083032e-05,
367
+ "loss": 1.8457,
368
+ "mean_token_accuracy": 0.6219270460307598,
369
+ "num_tokens": 754840.0,
370
  "step": 400
371
  },
372
  {
373
+ "epoch": 0.8282828282828283,
374
+ "grad_norm": 3.739635705947876,
375
+ "learning_rate": 7.79783393501805e-05,
376
+ "loss": 1.7356,
377
+ "mean_token_accuracy": 0.6468625396490097,
378
+ "num_tokens": 769781.0,
379
  "step": 410
380
  },
381
  {
382
+ "epoch": 0.8484848484848485,
383
+ "grad_norm": 1.507598638534546,
384
+ "learning_rate": 7.72563176895307e-05,
385
+ "loss": 1.692,
386
+ "mean_token_accuracy": 0.6468491986393928,
387
+ "num_tokens": 788586.0,
388
  "step": 420
389
  },
390
  {
391
+ "epoch": 0.8686868686868687,
392
+ "grad_norm": 1.7837804555892944,
393
+ "learning_rate": 7.653429602888087e-05,
394
+ "loss": 1.5843,
395
+ "mean_token_accuracy": 0.6515591643750668,
396
+ "num_tokens": 808940.0,
397
  "step": 430
398
  },
399
  {
400
+ "epoch": 0.8888888888888888,
401
+ "grad_norm": 1.6429297924041748,
402
+ "learning_rate": 7.581227436823105e-05,
403
+ "loss": 1.7314,
404
+ "mean_token_accuracy": 0.6319857247173786,
405
+ "num_tokens": 828022.0,
406
  "step": 440
407
  },
408
  {
409
+ "epoch": 0.9090909090909091,
410
+ "grad_norm": 2.7530970573425293,
411
+ "learning_rate": 7.509025270758123e-05,
412
+ "loss": 1.7059,
413
+ "mean_token_accuracy": 0.6434222847223282,
414
+ "num_tokens": 845577.0,
415
  "step": 450
416
  },
417
  {
418
+ "epoch": 0.9292929292929293,
419
+ "grad_norm": 1.5740615129470825,
420
+ "learning_rate": 7.436823104693141e-05,
421
+ "loss": 1.7016,
422
+ "mean_token_accuracy": 0.6465534403920173,
423
+ "num_tokens": 866655.0,
424
  "step": 460
425
  },
426
  {
427
+ "epoch": 0.9494949494949495,
428
+ "grad_norm": 1.735592246055603,
429
+ "learning_rate": 7.36462093862816e-05,
430
+ "loss": 1.7066,
431
+ "mean_token_accuracy": 0.6451319254934788,
432
+ "num_tokens": 884148.0,
433
  "step": 470
434
  },
435
  {
436
+ "epoch": 0.9696969696969697,
437
+ "grad_norm": 2.2288308143615723,
438
+ "learning_rate": 7.292418772563177e-05,
439
+ "loss": 1.5397,
440
+ "mean_token_accuracy": 0.657177159935236,
441
+ "num_tokens": 905387.0,
442
  "step": 480
443
  },
444
  {
445
+ "epoch": 0.98989898989899,
446
+ "grad_norm": 2.363151788711548,
447
+ "learning_rate": 7.220216606498195e-05,
448
+ "loss": 1.919,
449
+ "mean_token_accuracy": 0.632861833833158,
450
+ "num_tokens": 925073.0,
451
  "step": 490
452
  },
453
  {
454
+ "epoch": 1.0101010101010102,
455
+ "grad_norm": 2.896883487701416,
456
+ "learning_rate": 7.148014440433213e-05,
457
+ "loss": 1.7299,
458
+ "mean_token_accuracy": 0.6438414633274079,
459
+ "num_tokens": 941834.0,
460
  "step": 500
461
  },
462
  {
463
+ "epoch": 1.0303030303030303,
464
+ "grad_norm": 5.034731388092041,
465
+ "learning_rate": 7.075812274368231e-05,
466
+ "loss": 1.6831,
467
+ "mean_token_accuracy": 0.6518400736153126,
468
+ "num_tokens": 958017.0,
469
  "step": 510
470
  },
471
  {
472
+ "epoch": 1.0505050505050506,
473
+ "grad_norm": 1.8448883295059204,
474
+ "learning_rate": 7.003610108303249e-05,
475
+ "loss": 1.5903,
476
+ "mean_token_accuracy": 0.656456682831049,
477
+ "num_tokens": 974729.0,
478
  "step": 520
479
  },
480
  {
481
+ "epoch": 1.0707070707070707,
482
+ "grad_norm": 1.8980131149291992,
483
+ "learning_rate": 6.931407942238267e-05,
484
+ "loss": 1.5521,
485
+ "mean_token_accuracy": 0.6531489036977292,
486
+ "num_tokens": 995648.0,
487
  "step": 530
488
  },
489
  {
490
+ "epoch": 1.0909090909090908,
491
+ "grad_norm": 11.001644134521484,
492
+ "learning_rate": 6.859205776173285e-05,
493
+ "loss": 1.6765,
494
+ "mean_token_accuracy": 0.6484075963497162,
495
+ "num_tokens": 1013028.0,
496
  "step": 540
497
  },
498
  {
499
+ "epoch": 1.1111111111111112,
500
+ "grad_norm": 2.1369686126708984,
501
+ "learning_rate": 6.787003610108303e-05,
502
+ "loss": 1.6332,
503
+ "mean_token_accuracy": 0.6697568111121655,
504
+ "num_tokens": 1035666.0,
505
  "step": 550
506
  },
507
  {
508
+ "epoch": 1.1313131313131313,
509
+ "grad_norm": 1.4799697399139404,
510
+ "learning_rate": 6.714801444043321e-05,
511
+ "loss": 1.7022,
512
+ "mean_token_accuracy": 0.6447197504341602,
513
+ "num_tokens": 1055111.0,
514
  "step": 560
515
  },
516
  {
517
+ "epoch": 1.1515151515151516,
518
+ "grad_norm": 2.329430341720581,
519
+ "learning_rate": 6.642599277978339e-05,
520
+ "loss": 1.7747,
521
+ "mean_token_accuracy": 0.6284119591116906,
522
+ "num_tokens": 1073114.0,
523
  "step": 570
524
  },
525
  {
526
+ "epoch": 1.1717171717171717,
527
+ "grad_norm": 3.0006322860717773,
528
+ "learning_rate": 6.570397111913357e-05,
529
+ "loss": 1.6484,
530
+ "mean_token_accuracy": 0.6459825620055198,
531
+ "num_tokens": 1089325.0,
532
  "step": 580
533
  },
534
  {
535
+ "epoch": 1.1919191919191918,
536
+ "grad_norm": 8.296801567077637,
537
+ "learning_rate": 6.498194945848377e-05,
538
+ "loss": 1.6361,
539
+ "mean_token_accuracy": 0.6575549930334091,
540
+ "num_tokens": 1105923.0,
541
  "step": 590
542
  },
543
  {
544
+ "epoch": 1.2121212121212122,
545
+ "grad_norm": 2.0805375576019287,
546
+ "learning_rate": 6.425992779783394e-05,
547
+ "loss": 1.4366,
548
+ "mean_token_accuracy": 0.6729512564837933,
549
+ "num_tokens": 1127328.0,
550
  "step": 600
551
  },
552
  {
553
+ "epoch": 1.2323232323232323,
554
+ "grad_norm": 2.0608692169189453,
555
+ "learning_rate": 6.353790613718412e-05,
556
+ "loss": 1.5935,
557
+ "mean_token_accuracy": 0.6634075284004212,
558
+ "num_tokens": 1147181.0,
559
  "step": 610
560
  },
561
  {
562
+ "epoch": 1.2525252525252526,
563
+ "grad_norm": 3.865906238555908,
564
+ "learning_rate": 6.28158844765343e-05,
565
+ "loss": 1.5445,
566
+ "mean_token_accuracy": 0.6648930206894874,
567
+ "num_tokens": 1164753.0,
568
  "step": 620
569
  },
570
  {
571
+ "epoch": 1.2727272727272727,
572
+ "grad_norm": 1.8212089538574219,
573
+ "learning_rate": 6.209386281588448e-05,
574
+ "loss": 1.6492,
575
+ "mean_token_accuracy": 0.6418032497167587,
576
+ "num_tokens": 1184594.0,
577
  "step": 630
578
  },
579
  {
580
+ "epoch": 1.2929292929292928,
581
+ "grad_norm": 3.3243095874786377,
582
+ "learning_rate": 6.137184115523465e-05,
583
+ "loss": 1.5253,
584
+ "mean_token_accuracy": 0.669656652957201,
585
+ "num_tokens": 1206129.0,
586
  "step": 640
587
  },
588
  {
589
+ "epoch": 1.3131313131313131,
590
+ "grad_norm": 1.6167833805084229,
591
+ "learning_rate": 6.064981949458484e-05,
592
+ "loss": 1.5478,
593
+ "mean_token_accuracy": 0.6591526836156845,
594
+ "num_tokens": 1226012.0,
595
  "step": 650
596
  },
597
  {
598
+ "epoch": 1.3333333333333333,
599
+ "grad_norm": 3.81766676902771,
600
+ "learning_rate": 5.992779783393502e-05,
601
+ "loss": 1.788,
602
+ "mean_token_accuracy": 0.6285306230187416,
603
+ "num_tokens": 1242162.0,
604
  "step": 660
605
  },
606
  {
607
+ "epoch": 1.3535353535353536,
608
+ "grad_norm": 1.2418630123138428,
609
+ "learning_rate": 5.9205776173285197e-05,
610
+ "loss": 1.498,
611
+ "mean_token_accuracy": 0.6632598295807839,
612
+ "num_tokens": 1265769.0,
613
  "step": 670
614
  },
615
  {
616
+ "epoch": 1.3737373737373737,
617
+ "grad_norm": 5.77175235748291,
618
+ "learning_rate": 5.848375451263538e-05,
619
+ "loss": 1.5168,
620
+ "mean_token_accuracy": 0.668974144756794,
621
+ "num_tokens": 1284762.0,
622
  "step": 680
623
  },
624
  {
625
+ "epoch": 1.393939393939394,
626
+ "grad_norm": 2.184446334838867,
627
+ "learning_rate": 5.776173285198556e-05,
628
+ "loss": 1.5881,
629
+ "mean_token_accuracy": 0.6551995210349559,
630
+ "num_tokens": 1303301.0,
631
  "step": 690
632
  },
633
  {
634
+ "epoch": 1.4141414141414141,
635
+ "grad_norm": 1.2407817840576172,
636
+ "learning_rate": 5.703971119133574e-05,
637
+ "loss": 1.5,
638
+ "mean_token_accuracy": 0.6752019837498665,
639
+ "num_tokens": 1325905.0,
640
  "step": 700
641
  },
642
  {
643
+ "epoch": 1.4343434343434343,
644
+ "grad_norm": 1.709302544593811,
645
+ "learning_rate": 5.631768953068592e-05,
646
+ "loss": 1.3928,
647
+ "mean_token_accuracy": 0.6914731428027153,
648
+ "num_tokens": 1345901.0,
649
  "step": 710
650
  },
651
  {
652
+ "epoch": 1.4545454545454546,
653
+ "grad_norm": 1.451839566230774,
654
+ "learning_rate": 5.55956678700361e-05,
655
+ "loss": 1.7266,
656
+ "mean_token_accuracy": 0.6524573139846325,
657
+ "num_tokens": 1362788.0,
658
  "step": 720
659
  },
660
  {
661
+ "epoch": 1.4747474747474747,
662
+ "grad_norm": 3.0613152980804443,
663
+ "learning_rate": 5.487364620938629e-05,
664
+ "loss": 1.5518,
665
+ "mean_token_accuracy": 0.669068893790245,
666
+ "num_tokens": 1379456.0,
667
  "step": 730
668
  },
669
  {
670
+ "epoch": 1.494949494949495,
671
+ "grad_norm": 1.5313241481781006,
672
+ "learning_rate": 5.415162454873647e-05,
673
+ "loss": 1.4793,
674
+ "mean_token_accuracy": 0.6733302772045135,
675
+ "num_tokens": 1398659.0,
676
  "step": 740
677
  },
678
  {
679
+ "epoch": 1.5151515151515151,
680
+ "grad_norm": 1.9046810865402222,
681
+ "learning_rate": 5.342960288808665e-05,
682
+ "loss": 1.441,
683
+ "mean_token_accuracy": 0.681334413588047,
684
+ "num_tokens": 1416828.0,
685
  "step": 750
686
  },
687
  {
688
+ "epoch": 1.5353535353535355,
689
+ "grad_norm": 1.984887719154358,
690
+ "learning_rate": 5.270758122743683e-05,
691
+ "loss": 1.6379,
692
+ "mean_token_accuracy": 0.6509823858737945,
693
+ "num_tokens": 1431285.0,
694
  "step": 760
695
  },
696
  {
697
+ "epoch": 1.5555555555555556,
698
+ "grad_norm": 1.1224578619003296,
699
+ "learning_rate": 5.1985559566787e-05,
700
+ "loss": 1.6412,
701
+ "mean_token_accuracy": 0.6585724964737892,
702
+ "num_tokens": 1451394.0,
703
  "step": 770
704
  },
705
  {
706
+ "epoch": 1.5757575757575757,
707
+ "grad_norm": 1.988461971282959,
708
+ "learning_rate": 5.126353790613718e-05,
709
+ "loss": 1.7935,
710
+ "mean_token_accuracy": 0.6379878364503384,
711
+ "num_tokens": 1471734.0,
712
  "step": 780
713
  },
714
  {
715
+ "epoch": 1.595959595959596,
716
+ "grad_norm": 1.495737075805664,
717
+ "learning_rate": 5.054151624548736e-05,
718
+ "loss": 1.5828,
719
+ "mean_token_accuracy": 0.6762645319104195,
720
+ "num_tokens": 1489257.0,
721
  "step": 790
722
  },
723
  {
724
+ "epoch": 1.6161616161616161,
725
+ "grad_norm": 8.480497360229492,
726
+ "learning_rate": 4.981949458483755e-05,
727
+ "loss": 1.8259,
728
+ "mean_token_accuracy": 0.6398707143962383,
729
+ "num_tokens": 1506944.0,
730
  "step": 800
731
  },
732
  {
733
+ "epoch": 1.6363636363636362,
734
+ "grad_norm": 3.5872299671173096,
735
+ "learning_rate": 4.909747292418773e-05,
736
+ "loss": 1.659,
737
+ "mean_token_accuracy": 0.6536437503993511,
738
+ "num_tokens": 1522614.0,
739
  "step": 810
740
  },
741
  {
742
+ "epoch": 1.6565656565656566,
743
+ "grad_norm": 1.6361726522445679,
744
+ "learning_rate": 4.837545126353791e-05,
745
+ "loss": 1.6725,
746
+ "mean_token_accuracy": 0.6559996947646141,
747
+ "num_tokens": 1543689.0,
748
  "step": 820
749
  },
750
  {
751
+ "epoch": 1.676767676767677,
752
+ "grad_norm": 2.0231411457061768,
753
+ "learning_rate": 4.765342960288809e-05,
754
+ "loss": 1.5075,
755
+ "mean_token_accuracy": 0.6640274345874786,
756
+ "num_tokens": 1563909.0,
757
  "step": 830
758
  },
759
  {
760
+ "epoch": 1.696969696969697,
761
+ "grad_norm": 2.8920161724090576,
762
+ "learning_rate": 4.693140794223827e-05,
763
+ "loss": 1.7398,
764
+ "mean_token_accuracy": 0.6467047482728958,
765
+ "num_tokens": 1581501.0,
766
  "step": 840
767
  },
768
  {
769
+ "epoch": 1.7171717171717171,
770
+ "grad_norm": 1.7013530731201172,
771
+ "learning_rate": 4.620938628158845e-05,
772
+ "loss": 1.5343,
773
+ "mean_token_accuracy": 0.6554797604680062,
774
+ "num_tokens": 1602745.0,
775
  "step": 850
776
  },
777
  {
778
+ "epoch": 1.7373737373737375,
779
+ "grad_norm": 1.5854769945144653,
780
+ "learning_rate": 4.548736462093863e-05,
781
+ "loss": 1.5482,
782
+ "mean_token_accuracy": 0.6624557688832283,
783
+ "num_tokens": 1622681.0,
784
  "step": 860
785
  },
786
  {
787
+ "epoch": 1.7575757575757576,
788
+ "grad_norm": 1.8224149942398071,
789
+ "learning_rate": 4.4765342960288806e-05,
790
+ "loss": 1.5386,
791
+ "mean_token_accuracy": 0.6684516966342926,
792
+ "num_tokens": 1640007.0,
793
  "step": 870
794
  },
795
  {
796
+ "epoch": 1.7777777777777777,
797
+ "grad_norm": 3.453603744506836,
798
+ "learning_rate": 4.404332129963899e-05,
799
+ "loss": 1.517,
800
+ "mean_token_accuracy": 0.6810053952038289,
801
+ "num_tokens": 1662564.0,
802
  "step": 880
803
  },
804
  {
805
+ "epoch": 1.797979797979798,
806
+ "grad_norm": 1.8291434049606323,
807
+ "learning_rate": 4.332129963898917e-05,
808
+ "loss": 1.4867,
809
+ "mean_token_accuracy": 0.6807132661342621,
810
+ "num_tokens": 1682205.0,
811
  "step": 890
812
  },
813
  {
814
+ "epoch": 1.8181818181818183,
815
+ "grad_norm": 3.217017889022827,
816
+ "learning_rate": 4.259927797833935e-05,
817
+ "loss": 1.5669,
818
+ "mean_token_accuracy": 0.6671051770448685,
819
+ "num_tokens": 1697359.0,
820
  "step": 900
821
  },
822
  {
823
+ "epoch": 1.8383838383838382,
824
+ "grad_norm": 1.371291160583496,
825
+ "learning_rate": 4.187725631768953e-05,
826
+ "loss": 1.4343,
827
+ "mean_token_accuracy": 0.6958822838962078,
828
+ "num_tokens": 1720184.0,
829
  "step": 910
830
  },
831
  {
832
+ "epoch": 1.8585858585858586,
833
+ "grad_norm": 2.7192142009735107,
834
+ "learning_rate": 4.115523465703972e-05,
835
+ "loss": 1.4134,
836
+ "mean_token_accuracy": 0.6945954069495202,
837
+ "num_tokens": 1739446.0,
838
  "step": 920
839
  },
840
  {
841
+ "epoch": 1.878787878787879,
842
+ "grad_norm": 2.4172279834747314,
843
+ "learning_rate": 4.043321299638989e-05,
844
+ "loss": 1.5238,
845
+ "mean_token_accuracy": 0.6700037866830826,
846
+ "num_tokens": 1758629.0,
847
  "step": 930
848
  },
849
  {
850
+ "epoch": 1.898989898989899,
851
+ "grad_norm": 1.7151827812194824,
852
+ "learning_rate": 3.971119133574007e-05,
853
+ "loss": 1.5609,
854
+ "mean_token_accuracy": 0.665402963757515,
855
+ "num_tokens": 1777313.0,
856
  "step": 940
857
  },
858
  {
859
+ "epoch": 1.9191919191919191,
860
+ "grad_norm": 2.2101497650146484,
861
+ "learning_rate": 3.898916967509025e-05,
862
+ "loss": 1.6266,
863
+ "mean_token_accuracy": 0.6585289388895035,
864
+ "num_tokens": 1797829.0,
865
  "step": 950
866
  },
867
  {
868
+ "epoch": 1.9393939393939394,
869
+ "grad_norm": 1.5860098600387573,
870
+ "learning_rate": 3.826714801444044e-05,
871
+ "loss": 1.5842,
872
+ "mean_token_accuracy": 0.6568711154162884,
873
+ "num_tokens": 1819044.0,
874
  "step": 960
875
  },
876
  {
877
+ "epoch": 1.9595959595959596,
878
+ "grad_norm": 2.2135324478149414,
879
+ "learning_rate": 3.754512635379062e-05,
880
+ "loss": 1.5017,
881
+ "mean_token_accuracy": 0.6738567680120469,
882
+ "num_tokens": 1837829.0,
883
  "step": 970
884
  },
885
  {
886
+ "epoch": 1.9797979797979797,
887
+ "grad_norm": 1.8832942247390747,
888
+ "learning_rate": 3.68231046931408e-05,
889
+ "loss": 1.6386,
890
+ "mean_token_accuracy": 0.6536656714975834,
891
+ "num_tokens": 1854112.0,
892
  "step": 980
893
  },
894
  {
895
+ "epoch": 2.0,
896
+ "grad_norm": 1.4356534481048584,
897
+ "learning_rate": 3.610108303249098e-05,
898
+ "loss": 1.5847,
899
+ "mean_token_accuracy": 0.661115899682045,
900
+ "num_tokens": 1869584.0,
901
  "step": 990
902
  },
903
  {
904
+ "epoch": 2.0202020202020203,
905
+ "grad_norm": 3.277709484100342,
906
+ "learning_rate": 3.537906137184116e-05,
907
+ "loss": 1.5794,
908
+ "mean_token_accuracy": 0.6566751167178154,
909
+ "num_tokens": 1885930.0,
910
  "step": 1000
911
  },
912
  {
913
+ "epoch": 2.04040404040404,
914
+ "grad_norm": 1.672176718711853,
915
+ "learning_rate": 3.4657039711191336e-05,
916
+ "loss": 1.6426,
917
+ "mean_token_accuracy": 0.669060529768467,
918
+ "num_tokens": 1908386.0,
919
  "step": 1010
920
  },
921
  {
922
+ "epoch": 2.0606060606060606,
923
+ "grad_norm": 1.787185549736023,
924
+ "learning_rate": 3.3935018050541516e-05,
925
+ "loss": 1.487,
926
+ "mean_token_accuracy": 0.6758425906300545,
927
+ "num_tokens": 1928179.0,
928
  "step": 1020
929
  },
930
  {
931
+ "epoch": 2.080808080808081,
932
+ "grad_norm": 1.1577355861663818,
933
+ "learning_rate": 3.3212996389891696e-05,
934
+ "loss": 1.4625,
935
+ "mean_token_accuracy": 0.6700806766748428,
936
+ "num_tokens": 1947571.0,
937
  "step": 1030
938
  },
939
  {
940
+ "epoch": 2.101010101010101,
941
+ "grad_norm": 2.881878137588501,
942
+ "learning_rate": 3.249097472924188e-05,
943
+ "loss": 1.5191,
944
+ "mean_token_accuracy": 0.6762366116046905,
945
+ "num_tokens": 1965861.0,
946
  "step": 1040
947
  },
948
  {
949
+ "epoch": 2.121212121212121,
950
+ "grad_norm": 1.5470958948135376,
951
+ "learning_rate": 3.176895306859206e-05,
952
+ "loss": 1.5277,
953
+ "mean_token_accuracy": 0.6653557240962982,
954
+ "num_tokens": 1987291.0,
955
  "step": 1050
956
  },
957
  {
958
+ "epoch": 2.1414141414141414,
959
+ "grad_norm": 1.8662647008895874,
960
+ "learning_rate": 3.104693140794224e-05,
961
+ "loss": 1.4983,
962
+ "mean_token_accuracy": 0.6764706581830978,
963
+ "num_tokens": 2003260.0,
964
  "step": 1060
965
  },
966
  {
967
+ "epoch": 2.1616161616161618,
968
+ "grad_norm": 1.2521296739578247,
969
+ "learning_rate": 3.032490974729242e-05,
970
+ "loss": 1.4703,
971
+ "mean_token_accuracy": 0.6709941066801548,
972
+ "num_tokens": 2020415.0,
973
  "step": 1070
974
  },
975
  {
976
+ "epoch": 2.1818181818181817,
977
+ "grad_norm": 6.714540004730225,
978
+ "learning_rate": 2.9602888086642598e-05,
979
+ "loss": 1.6314,
980
+ "mean_token_accuracy": 0.6627085514366626,
981
+ "num_tokens": 2037429.0,
982
  "step": 1080
983
  },
984
  {
985
+ "epoch": 2.202020202020202,
986
+ "grad_norm": 2.123655080795288,
987
+ "learning_rate": 2.888086642599278e-05,
988
+ "loss": 1.5588,
989
+ "mean_token_accuracy": 0.6700812846422195,
990
+ "num_tokens": 2054688.0,
991
  "step": 1090
992
  },
993
  {
994
+ "epoch": 2.2222222222222223,
995
+ "grad_norm": 2.0840301513671875,
996
+ "learning_rate": 2.815884476534296e-05,
997
+ "loss": 1.7006,
998
+ "mean_token_accuracy": 0.6508332662284374,
999
+ "num_tokens": 2075865.0,
1000
  "step": 1100
1001
  },
1002
  {
1003
+ "epoch": 2.242424242424242,
1004
+ "grad_norm": 1.9797368049621582,
1005
+ "learning_rate": 2.7436823104693144e-05,
1006
+ "loss": 1.501,
1007
+ "mean_token_accuracy": 0.6622319832444191,
1008
+ "num_tokens": 2093688.0,
1009
  "step": 1110
1010
  },
1011
  {
1012
+ "epoch": 2.2626262626262625,
1013
+ "grad_norm": 2.007617950439453,
1014
+ "learning_rate": 2.6714801444043324e-05,
1015
+ "loss": 1.487,
1016
+ "mean_token_accuracy": 0.6778388306498527,
1017
+ "num_tokens": 2113155.0,
1018
  "step": 1120
1019
  },
1020
  {
1021
+ "epoch": 2.282828282828283,
1022
+ "grad_norm": 1.2606422901153564,
1023
+ "learning_rate": 2.59927797833935e-05,
1024
+ "loss": 1.4389,
1025
+ "mean_token_accuracy": 0.6810906417667866,
1026
+ "num_tokens": 2131996.0,
1027
  "step": 1130
1028
  },
1029
  {
1030
+ "epoch": 2.303030303030303,
1031
+ "grad_norm": 1.655875563621521,
1032
+ "learning_rate": 2.527075812274368e-05,
1033
+ "loss": 1.5242,
1034
+ "mean_token_accuracy": 0.6587833181023598,
1035
+ "num_tokens": 2151671.0,
1036
  "step": 1140
1037
  },
1038
  {
1039
+ "epoch": 2.323232323232323,
1040
+ "grad_norm": 1.516184687614441,
1041
+ "learning_rate": 2.4548736462093864e-05,
1042
+ "loss": 1.4613,
1043
+ "mean_token_accuracy": 0.6847339481115341,
1044
+ "num_tokens": 2173364.0,
1045
  "step": 1150
1046
  },
1047
  {
1048
+ "epoch": 2.3434343434343434,
1049
+ "grad_norm": 1.842247486114502,
1050
+ "learning_rate": 2.3826714801444043e-05,
1051
+ "loss": 1.5037,
1052
+ "mean_token_accuracy": 0.6658960357308388,
1053
+ "num_tokens": 2190588.0,
1054
  "step": 1160
1055
  },
1056
  {
1057
+ "epoch": 2.3636363636363638,
1058
+ "grad_norm": 3.459821939468384,
1059
+ "learning_rate": 2.3104693140794227e-05,
1060
+ "loss": 1.6169,
1061
+ "mean_token_accuracy": 0.6574626617133618,
1062
+ "num_tokens": 2212944.0,
1063
  "step": 1170
1064
  },
1065
  {
1066
+ "epoch": 2.3838383838383836,
1067
+ "grad_norm": 2.880796194076538,
1068
+ "learning_rate": 2.2382671480144403e-05,
1069
+ "loss": 1.4261,
1070
+ "mean_token_accuracy": 0.6781707689166069,
1071
+ "num_tokens": 2230777.0,
1072
  "step": 1180
1073
  },
1074
  {
1075
+ "epoch": 2.404040404040404,
1076
+ "grad_norm": 1.416815996170044,
1077
+ "learning_rate": 2.1660649819494586e-05,
1078
+ "loss": 1.539,
1079
+ "mean_token_accuracy": 0.6707717284560204,
1080
+ "num_tokens": 2248750.0,
1081
  "step": 1190
1082
  },
1083
  {
1084
+ "epoch": 2.4242424242424243,
1085
+ "grad_norm": 1.6914799213409424,
1086
+ "learning_rate": 2.0938628158844766e-05,
1087
+ "loss": 1.4456,
1088
+ "mean_token_accuracy": 0.6809201754629612,
1089
+ "num_tokens": 2266641.0,
1090
  "step": 1200
1091
  }
1092
  ],
1093
  "logging_steps": 10,
1094
+ "max_steps": 1485,
1095
  "num_input_tokens_seen": 0,
1096
+ "num_train_epochs": 3,
1097
+ "save_steps": 50,
1098
  "stateful_callbacks": {
1099
  "TrainerControl": {
1100
  "args": {
 
1107
  "attributes": {}
1108
  }
1109
  },
1110
+ "total_flos": 3.918999165635174e+16,
1111
  "train_batch_size": 1,
1112
  "trial_name": null,
1113
  "trial_params": null
results/checkpoint-1200/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d6e53bf1d681901c1cdb28909be54e0d63c65199d2388028634c1022d22ce03
3
  size 5560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c2e099a6969a2a35f5b0a318e89c5857fca33ddbae202ddebca99dadbbe51de
3
  size 5560