finalform commited on
Commit
a3b72cf
·
verified ·
1 Parent(s): dc9bff7

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -25,12 +25,12 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "down_proj",
29
- "gate_proj",
30
- "q_proj",
31
  "v_proj",
32
- "up_proj",
33
  "k_proj",
 
 
 
34
  "o_proj"
35
  ],
36
  "target_parameters": null,
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
 
 
 
28
  "v_proj",
29
+ "q_proj",
30
  "k_proj",
31
+ "up_proj",
32
+ "gate_proj",
33
+ "down_proj",
34
  "o_proj"
35
  ],
36
  "target_parameters": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:113914c3a9ee16733e58f144a9deeca955b0922ab00373741664c59a81d5ed15
3
  size 645975704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd16a83486b928f6cc0241a7882893e158babc6dd4a67f332ae4f534facd8c86
3
  size 645975704
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e19a83976d7b82f54bb4a0f6f7715f4c02b88623321d19bd752fd983e6806256
3
  size 1292087499
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b62b0bfaf641adbe41f088ddd9a3aa363eba72ea7e327a1da8bbce8885aabc9
3
  size 1292087499
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b8f4725c0848633ed9e183cba04e612b910caa03960fb7a327aef513ddf465e
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:919326fcb5ecb4a36f040cc547270748ad022189adac0a473721620686517f34
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13c315171d65d7a17956efce3890599a6e10f86bff109a674a492a6c135b9c49
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49bc3fab163d4d421c507c6573f14dfaf3a399471064ffb19f74b86c0e66eb8e
3
  size 1465
trainer_state.json CHANGED
@@ -11,799 +11,799 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.060350030175015085,
14
- "grad_norm": 0.2549596130847931,
15
- "learning_rate": 0.0001636363636363636,
16
- "loss": 1.7622,
17
- "mean_token_accuracy": 0.636659591794014,
18
- "num_tokens": 157807.0,
19
  "step": 25
20
  },
21
  {
22
  "epoch": 0.12070006035003017,
23
- "grad_norm": 0.26669397950172424,
24
- "learning_rate": 0.00033409090909090905,
25
- "loss": 0.7954,
26
- "mean_token_accuracy": 0.794013529419899,
27
- "num_tokens": 285036.0,
28
  "step": 50
29
  },
30
  {
31
  "epoch": 0.18105009052504525,
32
- "grad_norm": 0.19134272634983063,
33
- "learning_rate": 0.0005045454545454546,
34
- "loss": 0.5897,
35
- "mean_token_accuracy": 0.837238620519638,
36
- "num_tokens": 443279.0,
37
  "step": 75
38
  },
39
  {
40
  "epoch": 0.24140012070006034,
41
- "grad_norm": 0.24841615557670593,
42
- "learning_rate": 0.0005999774265866424,
43
- "loss": 0.5122,
44
- "mean_token_accuracy": 0.8582117992639542,
45
- "num_tokens": 569358.0,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.30175015087507545,
50
- "grad_norm": 0.208372563123703,
51
- "learning_rate": 0.0005997582513956242,
52
- "loss": 0.3663,
53
- "mean_token_accuracy": 0.8957295078039169,
54
- "num_tokens": 729085.0,
55
  "step": 125
56
  },
57
  {
58
  "epoch": 0.3621001810500905,
59
- "grad_norm": 0.2048349678516388,
60
- "learning_rate": 0.0005993060798733474,
61
- "loss": 0.2976,
62
- "mean_token_accuracy": 0.9137766647338867,
63
- "num_tokens": 857264.0,
64
  "step": 150
65
  },
66
  {
67
  "epoch": 0.4224502112251056,
68
- "grad_norm": 0.16685613989830017,
69
- "learning_rate": 0.0005986212634840513,
70
- "loss": 0.2196,
71
- "mean_token_accuracy": 0.9362494552135467,
72
- "num_tokens": 1014533.0,
73
  "step": 175
74
  },
75
  {
76
  "epoch": 0.4828002414001207,
77
- "grad_norm": 0.20695029199123383,
78
- "learning_rate": 0.0005977043345223621,
79
- "loss": 0.2028,
80
- "mean_token_accuracy": 0.9412092477083206,
81
- "num_tokens": 1140813.0,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.5431502715751357,
86
- "grad_norm": 0.15852399170398712,
87
- "learning_rate": 0.0005965560056995495,
88
- "loss": 0.1714,
89
- "mean_token_accuracy": 0.9513516998291016,
90
- "num_tokens": 1298110.0,
91
  "step": 225
92
  },
93
  {
94
  "epoch": 0.6035003017501509,
95
- "grad_norm": 0.26144832372665405,
96
- "learning_rate": 0.0005951771695895515,
97
- "loss": 0.1319,
98
- "mean_token_accuracy": 0.9622357904911041,
99
- "num_tokens": 1424382.0,
100
  "step": 250
101
  },
102
  {
103
  "epoch": 0.663850331925166,
104
- "grad_norm": 0.13929001986980438,
105
- "learning_rate": 0.0005935688979351926,
106
- "loss": 0.1362,
107
- "mean_token_accuracy": 0.9618479776382446,
108
- "num_tokens": 1581234.0,
109
  "step": 275
110
  },
111
  {
112
  "epoch": 0.724200362100181,
113
- "grad_norm": 0.20948883891105652,
114
- "learning_rate": 0.0005917324408151391,
115
- "loss": 0.1148,
116
- "mean_token_accuracy": 0.9671791672706604,
117
- "num_tokens": 1707432.0,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.7845503922751962,
122
- "grad_norm": 0.1256396621465683,
123
- "learning_rate": 0.0005896692256722372,
124
- "loss": 0.1204,
125
- "mean_token_accuracy": 0.9664241331815719,
126
- "num_tokens": 1865179.0,
127
  "step": 325
128
  },
129
  {
130
  "epoch": 0.8449004224502112,
131
- "grad_norm": 0.16760271787643433,
132
- "learning_rate": 0.0005873808562039883,
133
- "loss": 0.0993,
134
- "mean_token_accuracy": 0.9722720664739609,
135
- "num_tokens": 1991194.0,
136
  "step": 350
137
  },
138
  {
139
  "epoch": 0.9052504526252263,
140
- "grad_norm": 0.13247406482696533,
141
- "learning_rate": 0.000584869111116027,
142
- "loss": 0.1088,
143
- "mean_token_accuracy": 0.9698091906309128,
144
- "num_tokens": 2148159.0,
145
  "step": 375
146
  },
147
  {
148
  "epoch": 0.9656004828002414,
149
- "grad_norm": 0.1613062173128128,
150
- "learning_rate": 0.000582135942739566,
151
- "loss": 0.0808,
152
- "mean_token_accuracy": 0.9774637776613235,
153
- "num_tokens": 2274851.0,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 1.0,
158
- "eval_loss": 0.08808618038892746,
159
- "eval_mean_token_accuracy": 0.9753262432845863,
160
  "eval_num_tokens": 2354180.0,
161
- "eval_runtime": 15.8043,
162
- "eval_samples_per_second": 23.348,
163
- "eval_steps_per_second": 11.706,
164
  "step": 415
165
  },
166
  {
167
  "epoch": 1.024140012070006,
168
- "grad_norm": 0.15485015511512756,
169
- "learning_rate": 0.0005791834755138876,
170
- "loss": 0.0976,
171
- "mean_token_accuracy": 0.9728126489010054,
172
- "num_tokens": 2422868.0,
173
  "step": 425
174
  },
175
  {
176
  "epoch": 1.0844900422450212,
177
- "grad_norm": 0.10847453773021698,
178
- "learning_rate": 0.0005760140043350575,
179
  "loss": 0.0611,
180
- "mean_token_accuracy": 0.9824439281225205,
181
- "num_tokens": 2564989.0,
182
  "step": 450
183
  },
184
  {
185
  "epoch": 1.1448400724200363,
186
- "grad_norm": 0.09102415293455124,
187
- "learning_rate": 0.0005726299927721457,
188
- "loss": 0.0816,
189
- "mean_token_accuracy": 0.9779865646362305,
190
- "num_tokens": 2704814.0,
191
  "step": 475
192
  },
193
  {
194
  "epoch": 1.2051901025950513,
195
- "grad_norm": 0.09399569034576416,
196
- "learning_rate": 0.0005690340711523424,
197
- "loss": 0.0544,
198
- "mean_token_accuracy": 0.9843483155965805,
199
- "num_tokens": 2845574.0,
200
  "step": 500
201
  },
202
  {
203
  "epoch": 1.2655401327700664,
204
- "grad_norm": 0.0724228098988533,
205
- "learning_rate": 0.0005652290345164548,
206
- "loss": 0.0815,
207
- "mean_token_accuracy": 0.9766461044549942,
208
- "num_tokens": 2988351.0,
209
  "step": 525
210
  },
211
  {
212
  "epoch": 1.3258901629450814,
213
- "grad_norm": 0.07961975783109665,
214
- "learning_rate": 0.0005612178404463753,
215
- "loss": 0.0555,
216
- "mean_token_accuracy": 0.9839057290554046,
217
- "num_tokens": 3131450.0,
218
  "step": 550
219
  },
220
  {
221
  "epoch": 1.3862401931200965,
222
- "grad_norm": 0.1089630052447319,
223
- "learning_rate": 0.0005570036067662102,
224
- "loss": 0.0718,
225
- "mean_token_accuracy": 0.9797851747274399,
226
- "num_tokens": 3274594.0,
227
  "step": 575
228
  },
229
  {
230
  "epoch": 1.4465902232951118,
231
- "grad_norm": 0.06336581707000732,
232
- "learning_rate": 0.0005525896091188552,
233
  "loss": 0.0492,
234
- "mean_token_accuracy": 0.9854604113101959,
235
- "num_tokens": 3416329.0,
236
  "step": 600
237
  },
238
  {
239
  "epoch": 1.5069402534701268,
240
- "grad_norm": 0.06915699690580368,
241
- "learning_rate": 0.0005479792784199004,
242
- "loss": 0.0653,
243
- "mean_token_accuracy": 0.9818730032444001,
244
- "num_tokens": 3557102.0,
245
  "step": 625
246
  },
247
  {
248
  "epoch": 1.567290283645142,
249
- "grad_norm": 0.09291143715381622,
250
- "learning_rate": 0.0005431761981908461,
251
- "loss": 0.0516,
252
- "mean_token_accuracy": 0.9852055913209915,
253
- "num_tokens": 3701095.0,
254
  "step": 650
255
  },
256
  {
257
  "epoch": 1.627640313820157,
258
- "grad_norm": 0.07441911846399307,
259
- "learning_rate": 0.0005381841017737,
260
- "loss": 0.0555,
261
- "mean_token_accuracy": 0.9845552426576615,
262
- "num_tokens": 3844239.0,
263
  "step": 675
264
  },
265
  {
266
  "epoch": 1.687990343995172,
267
- "grad_norm": 0.09067196398973465,
268
- "learning_rate": 0.0005330068694291224,
269
  "loss": 0.0454,
270
- "mean_token_accuracy": 0.9871112030744552,
271
- "num_tokens": 3988455.0,
272
  "step": 700
273
  },
274
  {
275
  "epoch": 1.748340374170187,
276
- "grad_norm": 0.05332234129309654,
277
- "learning_rate": 0.000527648525320374,
278
- "loss": 0.0527,
279
- "mean_token_accuracy": 0.9853162139654159,
280
- "num_tokens": 4129554.0,
281
  "step": 725
282
  },
283
  {
284
  "epoch": 1.8086904043452021,
285
- "grad_norm": 0.07973570376634598,
286
- "learning_rate": 0.0005221132343854112,
287
- "loss": 0.0415,
288
- "mean_token_accuracy": 0.9880281090736389,
289
- "num_tokens": 4270951.0,
290
  "step": 750
291
  },
292
  {
293
  "epoch": 1.8690404345202172,
294
- "grad_norm": 0.06410035490989685,
295
- "learning_rate": 0.0005164052990995595,
296
- "loss": 0.0587,
297
- "mean_token_accuracy": 0.9833071821928024,
298
- "num_tokens": 4412170.0,
299
  "step": 775
300
  },
301
  {
302
  "epoch": 1.9293904646952322,
303
- "grad_norm": 0.10863803327083588,
304
- "learning_rate": 0.0005105291561312827,
305
- "loss": 0.0426,
306
- "mean_token_accuracy": 0.9874854302406311,
307
- "num_tokens": 4555061.0,
308
  "step": 800
309
  },
310
  {
311
  "epoch": 1.9897404948702473,
312
- "grad_norm": 0.06359567493200302,
313
- "learning_rate": 0.0005044893728936449,
314
- "loss": 0.0454,
315
- "mean_token_accuracy": 0.9876255023479462,
316
- "num_tokens": 4688503.0,
317
  "step": 825
318
  },
319
  {
320
  "epoch": 2.0,
321
- "eval_loss": 0.054030660539865494,
322
- "eval_mean_token_accuracy": 0.9852899135770025,
323
  "eval_num_tokens": 4708360.0,
324
- "eval_runtime": 15.7855,
325
- "eval_samples_per_second": 23.376,
326
- "eval_steps_per_second": 11.72,
327
  "step": 830
328
  },
329
  {
330
  "epoch": 2.048280024140012,
331
- "grad_norm": 0.08557040989398956,
332
- "learning_rate": 0.0004982906439941489,
333
- "loss": 0.051,
334
- "mean_token_accuracy": 0.9849937619622221,
335
- "num_tokens": 4838547.0,
336
  "step": 850
337
  },
338
  {
339
  "epoch": 2.1086300543150274,
340
- "grad_norm": 0.07765046507120132,
341
- "learning_rate": 0.0004919377875857071,
342
- "loss": 0.0313,
343
- "mean_token_accuracy": 0.9904298150539398,
344
- "num_tokens": 4971888.0,
345
  "step": 875
346
  },
347
  {
348
  "epoch": 2.1689800844900424,
349
- "grad_norm": 0.04834737256169319,
350
- "learning_rate": 0.00048543574162158455,
351
- "loss": 0.0484,
352
- "mean_token_accuracy": 0.9856269609928131,
353
- "num_tokens": 5123809.0,
354
  "step": 900
355
  },
356
  {
357
  "epoch": 2.2293301146650575,
358
- "grad_norm": 0.0744078978896141,
359
- "learning_rate": 0.00047878956001722235,
360
- "loss": 0.0318,
361
- "mean_token_accuracy": 0.9904006707668305,
362
- "num_tokens": 5257157.0,
363
  "step": 925
364
  },
365
  {
366
  "epoch": 2.2896801448400725,
367
- "grad_norm": 0.06557600945234299,
368
- "learning_rate": 0.00047200440872192636,
369
- "loss": 0.0443,
370
- "mean_token_accuracy": 0.9863678300380707,
371
- "num_tokens": 5407203.0,
372
  "step": 950
373
  },
374
  {
375
  "epoch": 2.3500301750150876,
376
- "grad_norm": 0.07097447663545609,
377
- "learning_rate": 0.0004650855617034737,
378
- "loss": 0.0326,
379
- "mean_token_accuracy": 0.9901631319522858,
380
- "num_tokens": 5539507.0,
381
  "step": 975
382
  },
383
  {
384
  "epoch": 2.4103802051901027,
385
- "grad_norm": 0.09130991250276566,
386
- "learning_rate": 0.00045803839684875944,
387
- "loss": 0.0423,
388
- "mean_token_accuracy": 0.9872457224130631,
389
- "num_tokens": 5690823.0,
390
  "step": 1000
391
  },
392
  {
393
  "epoch": 2.4707302353651177,
394
- "grad_norm": 0.06412132829427719,
395
- "learning_rate": 0.00045086839178366795,
396
- "loss": 0.0318,
397
- "mean_token_accuracy": 0.9907098066806793,
398
- "num_tokens": 5825317.0,
399
  "step": 1025
400
  },
401
  {
402
  "epoch": 2.5310802655401328,
403
- "grad_norm": 0.05260787159204483,
404
- "learning_rate": 0.00044358111961541986,
405
- "loss": 0.042,
406
- "mean_token_accuracy": 0.9874378234148026,
407
- "num_tokens": 5975743.0,
408
  "step": 1050
409
  },
410
  {
411
  "epoch": 2.591430295715148,
412
- "grad_norm": 0.04893992841243744,
413
- "learning_rate": 0.0004361822446007026,
414
- "loss": 0.0323,
415
- "mean_token_accuracy": 0.990597317814827,
416
- "num_tokens": 6107203.0,
417
  "step": 1075
418
  },
419
  {
420
  "epoch": 2.651780325890163,
421
- "grad_norm": 0.047428593039512634,
422
- "learning_rate": 0.00042867751774295254,
423
- "loss": 0.0458,
424
- "mean_token_accuracy": 0.9866835039854049,
425
- "num_tokens": 6258840.0,
426
  "step": 1100
427
  },
428
  {
429
  "epoch": 2.712130356065178,
430
- "grad_norm": 0.04629245027899742,
431
- "learning_rate": 0.0004210727723222105,
432
- "loss": 0.0297,
433
- "mean_token_accuracy": 0.990944333076477,
434
- "num_tokens": 6390779.0,
435
  "step": 1125
436
  },
437
  {
438
  "epoch": 2.772480386240193,
439
- "grad_norm": 0.05299977585673332,
440
- "learning_rate": 0.0004133739193610255,
441
- "loss": 0.0418,
442
- "mean_token_accuracy": 0.9874403899908066,
443
- "num_tokens": 6543210.0,
444
  "step": 1150
445
  },
446
  {
447
  "epoch": 2.832830416415208,
448
- "grad_norm": 0.044210728257894516,
449
- "learning_rate": 0.00040558694302992963,
450
- "loss": 0.0292,
451
- "mean_token_accuracy": 0.9912893986701965,
452
- "num_tokens": 6677163.0,
453
  "step": 1175
454
  },
455
  {
456
  "epoch": 2.8931804465902236,
457
- "grad_norm": 0.034908175468444824,
458
- "learning_rate": 0.00039771789599605845,
459
- "loss": 0.0404,
460
- "mean_token_accuracy": 0.9877376782894135,
461
- "num_tokens": 6827748.0,
462
  "step": 1200
463
  },
464
  {
465
  "epoch": 2.9535304767652386,
466
- "grad_norm": 0.05875537171959877,
467
- "learning_rate": 0.0003897728947185279,
468
- "loss": 0.028,
469
- "mean_token_accuracy": 0.9915457659959793,
470
- "num_tokens": 6959551.0,
471
  "step": 1225
472
  },
473
  {
474
  "epoch": 3.0,
475
- "eval_loss": 0.0452975295484066,
476
- "eval_mean_token_accuracy": 0.9880153765549531,
477
  "eval_num_tokens": 7062540.0,
478
- "eval_runtime": 15.7338,
479
- "eval_samples_per_second": 23.453,
480
- "eval_steps_per_second": 11.758,
481
  "step": 1245
482
  },
483
  {
484
  "epoch": 3.012070006035003,
485
- "grad_norm": 0.042585525661706924,
486
- "learning_rate": 0.00038175811469422905,
487
- "loss": 0.0374,
488
- "mean_token_accuracy": 0.9883644322759098,
489
- "num_tokens": 7100902.0,
490
  "step": 1250
491
  },
492
  {
493
  "epoch": 3.0724200362100182,
494
- "grad_norm": 0.06379897147417068,
495
- "learning_rate": 0.00037367978565773226,
496
- "loss": 0.028,
497
- "mean_token_accuracy": 0.99155364215374,
498
- "num_tokens": 7250783.0,
499
  "step": 1275
500
  },
501
  {
502
  "epoch": 3.1327700663850333,
503
- "grad_norm": 0.07204161584377289,
504
- "learning_rate": 0.0003655441867390346,
505
- "loss": 0.0306,
506
- "mean_token_accuracy": 0.9907914429903031,
507
- "num_tokens": 7386632.0,
508
  "step": 1300
509
  },
510
  {
511
  "epoch": 3.1931200965600484,
512
- "grad_norm": 0.06669546663761139,
513
- "learning_rate": 0.00035735764158291254,
514
- "loss": 0.0254,
515
- "mean_token_accuracy": 0.9917327278852462,
516
- "num_tokens": 7534074.0,
517
  "step": 1325
518
  },
519
  {
520
  "epoch": 3.2534701267350634,
521
- "grad_norm": 0.056468356400728226,
522
- "learning_rate": 0.0003491265134336745,
523
- "loss": 0.0286,
524
- "mean_token_accuracy": 0.9912703585624695,
525
- "num_tokens": 7670187.0,
526
  "step": 1350
527
  },
528
  {
529
  "epoch": 3.3138201569100785,
530
- "grad_norm": 0.04088124632835388,
531
- "learning_rate": 0.00034085720018913276,
532
- "loss": 0.0252,
533
- "mean_token_accuracy": 0.9922074353694916,
534
- "num_tokens": 7816677.0,
535
  "step": 1375
536
  },
537
  {
538
  "epoch": 3.3741701870850935,
539
- "grad_norm": 0.060834601521492004,
540
- "learning_rate": 0.0003325561294276413,
541
- "loss": 0.0314,
542
- "mean_token_accuracy": 0.9902624082565308,
543
- "num_tokens": 7952220.0,
544
  "step": 1400
545
  },
546
  {
547
  "epoch": 3.4345202172601086,
548
- "grad_norm": 0.047619305551052094,
549
- "learning_rate": 0.00032422975341206157,
550
- "loss": 0.0257,
551
- "mean_token_accuracy": 0.9918809252977371,
552
- "num_tokens": 8099840.0,
553
  "step": 1425
554
  },
555
  {
556
  "epoch": 3.4948702474351236,
557
- "grad_norm": 0.05335197225213051,
558
- "learning_rate": 0.000315884544074543,
559
- "loss": 0.0284,
560
- "mean_token_accuracy": 0.9908446717262268,
561
- "num_tokens": 8236126.0,
562
  "step": 1450
563
  },
564
  {
565
  "epoch": 3.5552202776101387,
566
- "grad_norm": 0.05193324014544487,
567
- "learning_rate": 0.0003075269879860149,
568
- "loss": 0.0237,
569
- "mean_token_accuracy": 0.9922806292772293,
570
- "num_tokens": 8384778.0,
571
  "step": 1475
572
  },
573
  {
574
  "epoch": 3.6155703077851538,
575
- "grad_norm": 0.050464097410440445,
576
- "learning_rate": 0.0002991635813142984,
577
- "loss": 0.0281,
578
- "mean_token_accuracy": 0.9910313940048218,
579
- "num_tokens": 8519003.0,
580
  "step": 1500
581
  },
582
  {
583
  "epoch": 3.675920337960169,
584
- "grad_norm": 0.0418088473379612,
585
- "learning_rate": 0.0002908008247747611,
586
- "loss": 0.0248,
587
- "mean_token_accuracy": 0.9920767372846604,
588
- "num_tokens": 8666448.0,
589
  "step": 1525
590
  },
591
  {
592
  "epoch": 3.736270368135184,
593
- "grad_norm": 0.0733715295791626,
594
- "learning_rate": 0.00028244521857743467,
595
- "loss": 0.0261,
596
- "mean_token_accuracy": 0.9917522144317626,
597
- "num_tokens": 8802995.0,
598
  "step": 1550
599
  },
600
  {
601
  "epoch": 3.796620398310199,
602
- "grad_norm": 0.029181618243455887,
603
- "learning_rate": 0.00027410325737452793,
604
- "loss": 0.0253,
605
- "mean_token_accuracy": 0.9918905264139175,
606
- "num_tokens": 8952485.0,
607
  "step": 1575
608
  },
609
  {
610
  "epoch": 3.856970428485214,
611
- "grad_norm": 0.022401634603738785,
612
- "learning_rate": 0.0002657814252122571,
613
- "loss": 0.0255,
614
- "mean_token_accuracy": 0.9920559304952622,
615
- "num_tokens": 9088126.0,
616
  "step": 1600
617
  },
618
  {
619
  "epoch": 3.9173204586602295,
620
- "grad_norm": 0.035883549600839615,
621
- "learning_rate": 0.00025748619049092167,
622
- "loss": 0.0247,
623
- "mean_token_accuracy": 0.9920018029212951,
624
- "num_tokens": 9238043.0,
625
  "step": 1625
626
  },
627
  {
628
  "epoch": 3.9776704888352445,
629
- "grad_norm": 0.031110195443034172,
630
- "learning_rate": 0.0002492240009371417,
631
- "loss": 0.0248,
632
- "mean_token_accuracy": 0.9918604761362075,
633
- "num_tokens": 9369467.0,
634
  "step": 1650
635
  },
636
  {
637
  "epoch": 4.0,
638
- "eval_loss": 0.04084077104926109,
639
- "eval_mean_token_accuracy": 0.9893259013021315,
640
  "eval_num_tokens": 9416720.0,
641
- "eval_runtime": 15.7205,
642
- "eval_samples_per_second": 23.473,
643
- "eval_steps_per_second": 11.768,
644
  "step": 1660
645
  },
646
  {
647
  "epoch": 4.036210018105009,
648
- "grad_norm": 0.042655326426029205,
649
- "learning_rate": 0.00024100127859216246,
650
- "loss": 0.0245,
651
- "mean_token_accuracy": 0.9926608100379866,
652
- "num_tokens": 9517025.0,
653
  "step": 1675
654
  },
655
  {
656
  "epoch": 4.096560048280024,
657
- "grad_norm": 0.0497356578707695,
658
- "learning_rate": 0.0002328244148201266,
659
- "loss": 0.0179,
660
- "mean_token_accuracy": 0.9941043162345886,
661
- "num_tokens": 9655560.0,
662
  "step": 1700
663
  },
664
  {
665
  "epoch": 4.15691007845504,
666
- "grad_norm": 0.028950069099664688,
667
- "learning_rate": 0.0002246997653401883,
668
- "loss": 0.0236,
669
- "mean_token_accuracy": 0.9927883541584015,
670
- "num_tokens": 9802510.0,
671
  "step": 1725
672
  },
673
  {
674
  "epoch": 4.217260108630055,
675
- "grad_norm": 0.02388886548578739,
676
- "learning_rate": 0.00021663364528633574,
677
- "loss": 0.0183,
678
- "mean_token_accuracy": 0.9939106184244156,
679
- "num_tokens": 9938159.0,
680
  "step": 1750
681
  },
682
  {
683
  "epoch": 4.27761013880507,
684
- "grad_norm": 0.05049145594239235,
685
- "learning_rate": 0.00020863232429875822,
686
  "loss": 0.0233,
687
- "mean_token_accuracy": 0.9924809694290161,
688
- "num_tokens": 10084527.0,
689
  "step": 1775
690
  },
691
  {
692
  "epoch": 4.337960168980085,
693
- "grad_norm": 0.02479761652648449,
694
- "learning_rate": 0.00020070202165057554,
695
- "loss": 0.0189,
696
- "mean_token_accuracy": 0.9934798705577851,
697
- "num_tokens": 10223699.0,
698
  "step": 1800
699
  },
700
  {
701
  "epoch": 4.3983101991551,
702
- "grad_norm": 0.02417929284274578,
703
- "learning_rate": 0.00019284890141371618,
704
- "loss": 0.0211,
705
- "mean_token_accuracy": 0.9930866587162018,
706
- "num_tokens": 10371425.0,
707
  "step": 1825
708
  },
709
  {
710
  "epoch": 4.458660229330115,
711
- "grad_norm": 0.029235906898975372,
712
- "learning_rate": 0.00018507906766770314,
713
- "loss": 0.0188,
714
- "mean_token_accuracy": 0.9934960836172104,
715
- "num_tokens": 10510828.0,
716
  "step": 1850
717
  },
718
  {
719
  "epoch": 4.51901025950513,
720
- "grad_norm": 0.02183988131582737,
721
- "learning_rate": 0.00017739855975506917,
722
- "loss": 0.0204,
723
- "mean_token_accuracy": 0.9933663594722748,
724
- "num_tokens": 10657379.0,
725
  "step": 1875
726
  },
727
  {
728
  "epoch": 4.579360289680145,
729
- "grad_norm": 0.027698421850800514,
730
- "learning_rate": 0.00016981334758709322,
731
- "loss": 0.0183,
732
- "mean_token_accuracy": 0.9937446874380111,
733
- "num_tokens": 10794758.0,
734
  "step": 1900
735
  },
736
  {
737
  "epoch": 4.63971031985516,
738
- "grad_norm": 0.030431602150201797,
739
- "learning_rate": 0.00016232932700350157,
740
- "loss": 0.0199,
741
- "mean_token_accuracy": 0.9932859373092652,
742
- "num_tokens": 10942187.0,
743
  "step": 1925
744
  },
745
  {
746
  "epoch": 4.700060350030175,
747
- "grad_norm": 0.026663949713110924,
748
- "learning_rate": 0.00015495231518974608,
749
- "loss": 0.0188,
750
- "mean_token_accuracy": 0.9938339275121689,
751
- "num_tokens": 11079614.0,
752
  "step": 1950
753
  },
754
  {
755
  "epoch": 4.76041038020519,
756
- "grad_norm": 0.025490237399935722,
757
- "learning_rate": 0.000147688046155417,
758
- "loss": 0.0209,
759
- "mean_token_accuracy": 0.9929323321580887,
760
- "num_tokens": 11226358.0,
761
  "step": 1975
762
  },
763
  {
764
  "epoch": 4.820760410380205,
765
- "grad_norm": 0.0648968443274498,
766
- "learning_rate": 0.00014054216627730755,
767
- "loss": 0.0177,
768
- "mean_token_accuracy": 0.9939511501789093,
769
- "num_tokens": 11363683.0,
770
  "step": 2000
771
  },
772
  {
773
  "epoch": 4.88111044055522,
774
- "grad_norm": 0.040247052907943726,
775
- "learning_rate": 0.00013352022991059375,
776
- "loss": 0.021,
777
- "mean_token_accuracy": 0.9929184436798095,
778
- "num_tokens": 11508599.0,
779
  "step": 2025
780
  },
781
  {
782
  "epoch": 4.941460470730235,
783
- "grad_norm": 0.01861538551747799,
784
- "learning_rate": 0.00012662769507154113,
785
- "loss": 0.0173,
786
- "mean_token_accuracy": 0.9944949728250504,
787
- "num_tokens": 11644898.0,
788
  "step": 2050
789
  },
790
  {
791
  "epoch": 5.0,
792
- "grad_norm": 0.07224704325199127,
793
- "learning_rate": 0.00011986991919509261,
794
- "loss": 0.0207,
795
- "mean_token_accuracy": 0.9934880923979061,
796
  "num_tokens": 11770900.0,
797
  "step": 2075
798
  },
799
  {
800
  "epoch": 5.0,
801
- "eval_loss": 0.03971695154905319,
802
- "eval_mean_token_accuracy": 0.9899426402272404,
803
  "eval_num_tokens": 11770900.0,
804
- "eval_runtime": 15.7222,
805
- "eval_samples_per_second": 23.47,
806
- "eval_steps_per_second": 11.767,
807
  "step": 2075
808
  }
809
  ],
@@ -824,7 +824,7 @@
824
  "attributes": {}
825
  }
826
  },
827
- "total_flos": 5.113947106809139e+17,
828
  "train_batch_size": 2,
829
  "trial_name": null,
830
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.060350030175015085,
14
+ "grad_norm": 0.2555118501186371,
15
+ "learning_rate": 0.00014209090909090907,
16
+ "loss": 1.7961,
17
+ "mean_token_accuracy": 0.6338900655508042,
18
+ "num_tokens": 157654.0,
19
  "step": 25
20
  },
21
  {
22
  "epoch": 0.12070006035003017,
23
+ "grad_norm": 0.2974064350128174,
24
+ "learning_rate": 0.0002901022727272727,
25
+ "loss": 0.8651,
26
+ "mean_token_accuracy": 0.7819488084316254,
27
+ "num_tokens": 283594.0,
28
  "step": 50
29
  },
30
  {
31
  "epoch": 0.18105009052504525,
32
+ "grad_norm": 0.1821448802947998,
33
+ "learning_rate": 0.0004381136363636364,
34
+ "loss": 0.6201,
35
+ "mean_token_accuracy": 0.829513993859291,
36
+ "num_tokens": 441384.0,
37
  "step": 75
38
  },
39
  {
40
  "epoch": 0.24140012070006034,
41
+ "grad_norm": 0.219114288687706,
42
+ "learning_rate": 0.0005209803987527346,
43
+ "loss": 0.4804,
44
+ "mean_token_accuracy": 0.8656451117992401,
45
+ "num_tokens": 570025.0,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.30175015087507545,
50
+ "grad_norm": 0.27953630685806274,
51
+ "learning_rate": 0.0005207900816285337,
52
+ "loss": 0.3653,
53
+ "mean_token_accuracy": 0.8948821222782135,
54
+ "num_tokens": 725198.0,
55
  "step": 125
56
  },
57
  {
58
  "epoch": 0.3621001810500905,
59
+ "grad_norm": 0.23318831622600555,
60
+ "learning_rate": 0.0005203974460233567,
61
+ "loss": 0.3394,
62
+ "mean_token_accuracy": 0.9027728015184402,
63
+ "num_tokens": 850794.0,
64
  "step": 150
65
  },
66
  {
67
  "epoch": 0.4224502112251056,
68
+ "grad_norm": 0.16859084367752075,
69
+ "learning_rate": 0.000519802797125318,
70
+ "loss": 0.2425,
71
+ "mean_token_accuracy": 0.9307077479362488,
72
+ "num_tokens": 1008079.0,
73
  "step": 175
74
  },
75
  {
76
  "epoch": 0.4828002414001207,
77
+ "grad_norm": 0.21337567269802094,
78
+ "learning_rate": 0.0005190065971435844,
79
+ "loss": 0.2287,
80
+ "mean_token_accuracy": 0.9341706246137619,
81
+ "num_tokens": 1133934.0,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.5431502715751357,
86
+ "grad_norm": 0.1900060623884201,
87
+ "learning_rate": 0.0005180094649491089,
88
+ "loss": 0.1665,
89
+ "mean_token_accuracy": 0.951556549668312,
90
+ "num_tokens": 1290998.0,
91
  "step": 225
92
  },
93
  {
94
  "epoch": 0.6035003017501509,
95
+ "grad_norm": 0.2943938374519348,
96
+ "learning_rate": 0.000516812175593594,
97
+ "loss": 0.1434,
98
+ "mean_token_accuracy": 0.9590511202812195,
99
+ "num_tokens": 1417806.0,
100
  "step": 250
101
  },
102
  {
103
  "epoch": 0.663850331925166,
104
+ "grad_norm": 0.11302364617586136,
105
+ "learning_rate": 0.0005154156597070589,
106
+ "loss": 0.1444,
107
+ "mean_token_accuracy": 0.9595862078666687,
108
+ "num_tokens": 1575293.0,
109
  "step": 275
110
  },
111
  {
112
  "epoch": 0.724200362100181,
113
+ "grad_norm": 0.27972692251205444,
114
+ "learning_rate": 0.0005138210027744792,
115
+ "loss": 0.1209,
116
+ "mean_token_accuracy": 0.9656497663259507,
117
+ "num_tokens": 1701625.0,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.7845503922751962,
122
+ "grad_norm": 0.1272473782300949,
123
+ "learning_rate": 0.0005120294442920594,
124
+ "loss": 0.1096,
125
+ "mean_token_accuracy": 0.9686538934707641,
126
+ "num_tokens": 1860100.0,
127
  "step": 325
128
  },
129
  {
130
  "epoch": 0.8449004224502112,
131
+ "grad_norm": 0.22622144222259521,
132
+ "learning_rate": 0.0005100423768037967,
133
+ "loss": 0.1035,
134
+ "mean_token_accuracy": 0.970460234284401,
135
+ "num_tokens": 1987374.0,
136
  "step": 350
137
  },
138
  {
139
  "epoch": 0.9052504526252263,
140
+ "grad_norm": 0.12625542283058167,
141
+ "learning_rate": 0.0005078613448190834,
142
+ "loss": 0.1132,
143
+ "mean_token_accuracy": 0.9690200018882752,
144
+ "num_tokens": 2146461.0,
145
  "step": 375
146
  },
147
  {
148
  "epoch": 0.9656004828002414,
149
+ "grad_norm": 0.177975594997406,
150
+ "learning_rate": 0.0005054880436121898,
151
+ "loss": 0.0866,
152
+ "mean_token_accuracy": 0.9762758094072342,
153
+ "num_tokens": 2272344.0,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 1.0,
158
+ "eval_loss": 0.0893438458442688,
159
+ "eval_mean_token_accuracy": 0.9755367253277752,
160
  "eval_num_tokens": 2354180.0,
161
+ "eval_runtime": 15.9986,
162
+ "eval_samples_per_second": 23.065,
163
+ "eval_steps_per_second": 11.564,
164
  "step": 415
165
  },
166
  {
167
  "epoch": 1.024140012070006,
168
+ "grad_norm": 0.10577582567930222,
169
+ "learning_rate": 0.0005029243179045591,
170
+ "loss": 0.0871,
171
+ "mean_token_accuracy": 0.9752628661922573,
172
+ "num_tokens": 2422544.0,
173
  "step": 425
174
  },
175
  {
176
  "epoch": 1.0844900422450212,
177
+ "grad_norm": 0.15040436387062073,
178
+ "learning_rate": 0.0005001721604309415,
179
  "loss": 0.0611,
180
+ "mean_token_accuracy": 0.9824812412261963,
181
+ "num_tokens": 2565633.0,
182
  "step": 450
183
  },
184
  {
185
  "epoch": 1.1448400724200363,
186
+ "grad_norm": 0.09043437987565994,
187
+ "learning_rate": 0.0004972337103904799,
188
+ "loss": 0.0851,
189
+ "mean_token_accuracy": 0.9762555521726608,
190
+ "num_tokens": 2708192.0,
191
  "step": 475
192
  },
193
  {
194
  "epoch": 1.2051901025950513,
195
+ "grad_norm": 0.08423709124326706,
196
+ "learning_rate": 0.0004941112517839508,
197
+ "loss": 0.0551,
198
+ "mean_token_accuracy": 0.9843029165267945,
199
+ "num_tokens": 2852072.0,
200
  "step": 500
201
  },
202
  {
203
  "epoch": 1.2655401327700664,
204
+ "grad_norm": 0.10297609120607376,
205
+ "learning_rate": 0.0004908072116384549,
206
+ "loss": 0.0622,
207
+ "mean_token_accuracy": 0.982346653342247,
208
+ "num_tokens": 2992794.0,
209
  "step": 525
210
  },
211
  {
212
  "epoch": 1.3258901629450814,
213
+ "grad_norm": 0.08755902945995331,
214
+ "learning_rate": 0.00048732415812093586,
215
+ "loss": 0.056,
216
+ "mean_token_accuracy": 0.984083269238472,
217
+ "num_tokens": 3134652.0,
218
  "step": 550
219
  },
220
  {
221
  "epoch": 1.3862401931200965,
222
+ "grad_norm": 0.09771203249692917,
223
+ "learning_rate": 0.0004836647985419926,
224
+ "loss": 0.0704,
225
+ "mean_token_accuracy": 0.9805797964334488,
226
+ "num_tokens": 3276784.0,
227
  "step": 575
228
  },
229
  {
230
  "epoch": 1.4465902232951118,
231
+ "grad_norm": 0.11501556634902954,
232
+ "learning_rate": 0.00047983197725153936,
233
  "loss": 0.0492,
234
+ "mean_token_accuracy": 0.9858285415172577,
235
+ "num_tokens": 3419527.0,
236
  "step": 600
237
  },
238
  {
239
  "epoch": 1.5069402534701268,
240
+ "grad_norm": 0.06587153673171997,
241
+ "learning_rate": 0.0004758286734279469,
242
+ "loss": 0.0679,
243
+ "mean_token_accuracy": 0.9807516372203827,
244
+ "num_tokens": 3560910.0,
245
  "step": 625
246
  },
247
  {
248
  "epoch": 1.567290283645142,
249
+ "grad_norm": 0.11413555592298508,
250
+ "learning_rate": 0.00047165799876238475,
251
+ "loss": 0.0485,
252
+ "mean_token_accuracy": 0.9861949247121811,
253
+ "num_tokens": 3704409.0,
254
  "step": 650
255
  },
256
  {
257
  "epoch": 1.627640313820157,
258
+ "grad_norm": 0.07050240784883499,
259
+ "learning_rate": 0.00046732319504016285,
260
+ "loss": 0.0618,
261
+ "mean_token_accuracy": 0.9829549384117127,
262
+ "num_tokens": 3847083.0,
263
  "step": 675
264
  },
265
  {
266
  "epoch": 1.687990343995172,
267
+ "grad_norm": 0.11633298546075821,
268
+ "learning_rate": 0.00046282763162095467,
269
  "loss": 0.0454,
270
+ "mean_token_accuracy": 0.9869883376359939,
271
+ "num_tokens": 3989418.0,
272
  "step": 700
273
  },
274
  {
275
  "epoch": 1.748340374170187,
276
+ "grad_norm": 0.08928703516721725,
277
+ "learning_rate": 0.00045817480281985817,
278
+ "loss": 0.0593,
279
+ "mean_token_accuracy": 0.9837275487184525,
280
+ "num_tokens": 4128788.0,
281
  "step": 725
282
  },
283
  {
284
  "epoch": 1.8086904043452021,
285
+ "grad_norm": 0.05468004569411278,
286
+ "learning_rate": 0.00045336832519133203,
287
+ "loss": 0.044,
288
+ "mean_token_accuracy": 0.9875295370817184,
289
+ "num_tokens": 4269534.0,
290
  "step": 750
291
  },
292
  {
293
  "epoch": 1.8690404345202172,
294
+ "grad_norm": 0.08439141511917114,
295
+ "learning_rate": 0.00044841193471811754,
296
+ "loss": 0.0612,
297
+ "mean_token_accuracy": 0.9824546372890473,
298
+ "num_tokens": 4412269.0,
299
  "step": 775
300
  },
301
  {
302
  "epoch": 1.9293904646952322,
303
+ "grad_norm": 0.07864434272050858,
304
+ "learning_rate": 0.0004433094839073305,
305
+ "loss": 0.0437,
306
+ "mean_token_accuracy": 0.9871676206588745,
307
+ "num_tokens": 4554996.0,
308
  "step": 800
309
  },
310
  {
311
  "epoch": 1.9897404948702473,
312
+ "grad_norm": 0.07595550268888474,
313
+ "learning_rate": 0.00043806493879598165,
314
+ "loss": 0.0462,
315
+ "mean_token_accuracy": 0.9875918942689895,
316
+ "num_tokens": 4688327.0,
317
  "step": 825
318
  },
319
  {
320
  "epoch": 2.0,
321
+ "eval_loss": 0.05492718890309334,
322
+ "eval_mean_token_accuracy": 0.9854504933228364,
323
  "eval_num_tokens": 4708360.0,
324
+ "eval_runtime": 16.012,
325
+ "eval_samples_per_second": 23.045,
326
+ "eval_steps_per_second": 11.554,
327
  "step": 830
328
  },
329
  {
330
  "epoch": 2.048280024140012,
331
+ "grad_norm": 0.0821777731180191,
332
+ "learning_rate": 0.0004326823758682526,
333
+ "loss": 0.0464,
334
+ "mean_token_accuracy": 0.9861399519074824,
335
+ "num_tokens": 4837216.0,
336
  "step": 850
337
  },
338
  {
339
  "epoch": 2.1086300543150274,
340
+ "grad_norm": 0.05397653952240944,
341
+ "learning_rate": 0.0004271659788869223,
342
+ "loss": 0.033,
343
+ "mean_token_accuracy": 0.9902914655208588,
344
+ "num_tokens": 4969882.0,
345
  "step": 875
346
  },
347
  {
348
  "epoch": 2.1689800844900424,
349
+ "grad_norm": 0.07699369639158249,
350
+ "learning_rate": 0.0004215200356414093,
351
+ "loss": 0.0437,
352
+ "mean_token_accuracy": 0.9872231745719909,
353
+ "num_tokens": 5121640.0,
354
  "step": 900
355
  },
356
  {
357
  "epoch": 2.2293301146650575,
358
+ "grad_norm": 0.06484604626893997,
359
+ "learning_rate": 0.0004157489346149548,
360
+ "loss": 0.0334,
361
+ "mean_token_accuracy": 0.9897586101293564,
362
+ "num_tokens": 5254936.0,
363
  "step": 925
364
  },
365
  {
366
  "epoch": 2.2896801448400725,
367
+ "grad_norm": 0.06028969585895538,
368
+ "learning_rate": 0.0004098571615735394,
369
+ "loss": 0.0434,
370
+ "mean_token_accuracy": 0.9870266842842103,
371
+ "num_tokens": 5406478.0,
372
  "step": 950
373
  },
374
  {
375
  "epoch": 2.3500301750150876,
376
+ "grad_norm": 0.08240769803524017,
377
+ "learning_rate": 0.000403849296079183,
378
+ "loss": 0.0328,
379
+ "mean_token_accuracy": 0.9905992788076401,
380
+ "num_tokens": 5539680.0,
381
  "step": 975
382
  },
383
  {
384
  "epoch": 2.4103802051901027,
385
+ "grad_norm": 0.07461749762296677,
386
+ "learning_rate": 0.0003977300079303395,
387
+ "loss": 0.046,
388
+ "mean_token_accuracy": 0.9860885471105576,
389
+ "num_tokens": 5690925.0,
390
  "step": 1000
391
  },
392
  {
393
  "epoch": 2.4707302353651177,
394
+ "grad_norm": 0.044400863349437714,
395
+ "learning_rate": 0.0003915040535321517,
396
+ "loss": 0.031,
397
+ "mean_token_accuracy": 0.9906466883420945,
398
+ "num_tokens": 5822826.0,
399
  "step": 1025
400
  },
401
  {
402
  "epoch": 2.5310802655401328,
403
+ "grad_norm": 0.08952440321445465,
404
+ "learning_rate": 0.0003851762721993896,
405
+ "loss": 0.0446,
406
+ "mean_token_accuracy": 0.9869460552930832,
407
+ "num_tokens": 5974897.0,
408
  "step": 1050
409
  },
410
  {
411
  "epoch": 2.591430295715148,
412
+ "grad_norm": 0.08355443924665451,
413
+ "learning_rate": 0.00037875158239494345,
414
+ "loss": 0.0306,
415
+ "mean_token_accuracy": 0.9908532989025116,
416
+ "num_tokens": 6107881.0,
417
  "step": 1075
418
  },
419
  {
420
  "epoch": 2.651780325890163,
421
+ "grad_norm": 0.049891915172338486,
422
+ "learning_rate": 0.00037223497790679714,
423
+ "loss": 0.0456,
424
+ "mean_token_accuracy": 0.9867906486988067,
425
+ "num_tokens": 6258259.0,
426
  "step": 1100
427
  },
428
  {
429
  "epoch": 2.712130356065178,
430
+ "grad_norm": 0.053985197097063065,
431
+ "learning_rate": 0.0003656315239664528,
432
+ "loss": 0.0302,
433
+ "mean_token_accuracy": 0.9908502048254013,
434
+ "num_tokens": 6391420.0,
435
  "step": 1125
436
  },
437
  {
438
  "epoch": 2.772480386240193,
439
+ "grad_norm": 0.039380334317684174,
440
+ "learning_rate": 0.00035894635331182377,
441
+ "loss": 0.0453,
442
+ "mean_token_accuracy": 0.9865706026554107,
443
+ "num_tokens": 6544101.0,
444
  "step": 1150
445
  },
446
  {
447
  "epoch": 2.832830416415208,
448
+ "grad_norm": 0.05963930860161781,
449
+ "learning_rate": 0.0003521846621976556,
450
+ "loss": 0.0303,
451
+ "mean_token_accuracy": 0.9909638494253159,
452
+ "num_tokens": 6676760.0,
453
  "step": 1175
454
  },
455
  {
456
  "epoch": 2.8931804465902236,
457
+ "grad_norm": 0.03845281898975372,
458
+ "learning_rate": 0.00034535170635657743,
459
+ "loss": 0.0406,
460
+ "mean_token_accuracy": 0.9878826266527176,
461
+ "num_tokens": 6827493.0,
462
  "step": 1200
463
  },
464
  {
465
  "epoch": 2.9535304767652386,
466
+ "grad_norm": 0.06949020177125931,
467
+ "learning_rate": 0.0003384527969139218,
468
+ "loss": 0.0277,
469
+ "mean_token_accuracy": 0.9915235358476638,
470
+ "num_tokens": 6960271.0,
471
  "step": 1225
472
  },
473
  {
474
  "epoch": 3.0,
475
+ "eval_loss": 0.04539692401885986,
476
+ "eval_mean_token_accuracy": 0.9875544515815942,
477
  "eval_num_tokens": 7062540.0,
478
+ "eval_runtime": 16.016,
479
+ "eval_samples_per_second": 23.039,
480
+ "eval_steps_per_second": 11.551,
481
  "step": 1245
482
  },
483
  {
484
  "epoch": 3.012070006035003,
485
+ "grad_norm": 0.03516312316060066,
486
+ "learning_rate": 0.0003314932962594889,
487
+ "loss": 0.0362,
488
+ "mean_token_accuracy": 0.9887829656453476,
489
+ "num_tokens": 7099304.0,
490
  "step": 1250
491
  },
492
  {
493
  "epoch": 3.0724200362100182,
494
+ "grad_norm": 0.0641891360282898,
495
+ "learning_rate": 0.0003244786138794642,
496
+ "loss": 0.025,
497
+ "mean_token_accuracy": 0.9919194889068603,
498
+ "num_tokens": 7247194.0,
499
  "step": 1275
500
  },
501
  {
502
  "epoch": 3.1327700663850333,
503
+ "grad_norm": 0.05540559068322182,
504
+ "learning_rate": 0.0003174142021517284,
505
+ "loss": 0.0273,
506
+ "mean_token_accuracy": 0.9912847858667374,
507
+ "num_tokens": 7383605.0,
508
  "step": 1300
509
  },
510
  {
511
  "epoch": 3.1931200965600484,
512
+ "grad_norm": 0.036626674234867096,
513
+ "learning_rate": 0.0003103055521078291,
514
+ "loss": 0.0268,
515
+ "mean_token_accuracy": 0.9917373913526535,
516
+ "num_tokens": 7532425.0,
517
  "step": 1325
518
  },
519
  {
520
  "epoch": 3.2534701267350634,
521
+ "grad_norm": 0.07257969677448273,
522
+ "learning_rate": 0.00030315818916490736,
523
+ "loss": 0.0313,
524
+ "mean_token_accuracy": 0.9903122758865357,
525
+ "num_tokens": 7668583.0,
526
  "step": 1350
527
  },
528
  {
529
  "epoch": 3.3138201569100785,
530
+ "grad_norm": 0.026177173480391502,
531
+ "learning_rate": 0.00029597766883089697,
532
+ "loss": 0.0261,
533
+ "mean_token_accuracy": 0.9917293560504913,
534
+ "num_tokens": 7816713.0,
535
  "step": 1375
536
  },
537
  {
538
  "epoch": 3.3741701870850935,
539
+ "grad_norm": 0.058491937816143036,
540
+ "learning_rate": 0.0002887695723863352,
541
+ "loss": 0.0293,
542
+ "mean_token_accuracy": 0.9907053065299988,
543
+ "num_tokens": 7953359.0,
544
  "step": 1400
545
  },
546
  {
547
  "epoch": 3.4345202172601086,
548
+ "grad_norm": 0.04674854129552841,
549
+ "learning_rate": 0.00028153950254614015,
550
+ "loss": 0.0247,
551
+ "mean_token_accuracy": 0.992136053442955,
552
+ "num_tokens": 8103024.0,
553
  "step": 1425
554
  },
555
  {
556
  "epoch": 3.4948702474351236,
557
+ "grad_norm": 0.06961289793252945,
558
+ "learning_rate": 0.0002742930791047282,
559
+ "loss": 0.0299,
560
+ "mean_token_accuracy": 0.9910703629255295,
561
+ "num_tokens": 8239193.0,
562
  "step": 1450
563
  },
564
  {
565
  "epoch": 3.5552202776101387,
566
+ "grad_norm": 0.052263230085372925,
567
+ "learning_rate": 0.0002670359345678563,
568
+ "loss": 0.026,
569
+ "mean_token_accuracy": 0.9916572499275208,
570
+ "num_tokens": 8388210.0,
571
  "step": 1475
572
  },
573
  {
574
  "epoch": 3.6155703077851538,
575
+ "grad_norm": 0.0689224824309349,
576
+ "learning_rate": 0.00025977370977458246,
577
+ "loss": 0.0262,
578
+ "mean_token_accuracy": 0.9915121608972549,
579
+ "num_tokens": 8524498.0,
580
  "step": 1500
581
  },
582
  {
583
  "epoch": 3.675920337960169,
584
+ "grad_norm": 0.026073528453707695,
585
+ "learning_rate": 0.0002525120495127509,
586
+ "loss": 0.0253,
587
+ "mean_token_accuracy": 0.9922972655296326,
588
+ "num_tokens": 8673299.0,
589
  "step": 1525
590
  },
591
  {
592
  "epoch": 3.736270368135184,
593
+ "grad_norm": 0.06121857091784477,
594
+ "learning_rate": 0.0002452565981314058,
595
+ "loss": 0.0279,
596
+ "mean_token_accuracy": 0.9911224627494812,
597
+ "num_tokens": 8808991.0,
598
  "step": 1550
599
  },
600
  {
601
  "epoch": 3.796620398310199,
602
+ "grad_norm": 0.022890372201800346,
603
+ "learning_rate": 0.00023801299515354842,
604
+ "loss": 0.0225,
605
+ "mean_token_accuracy": 0.9928586632013321,
606
+ "num_tokens": 8956517.0,
607
  "step": 1575
608
  },
609
  {
610
  "epoch": 3.856970428485214,
611
+ "grad_norm": 0.05721288546919823,
612
+ "learning_rate": 0.00023078687089264326,
613
+ "loss": 0.0262,
614
+ "mean_token_accuracy": 0.9914663273096085,
615
+ "num_tokens": 9091016.0,
616
  "step": 1600
617
  },
618
  {
619
  "epoch": 3.9173204586602295,
620
+ "grad_norm": 0.02130250260233879,
621
+ "learning_rate": 0.00022358384207628367,
622
+ "loss": 0.0234,
623
+ "mean_token_accuracy": 0.9922544056177139,
624
+ "num_tokens": 9237731.0,
625
  "step": 1625
626
  },
627
  {
628
  "epoch": 3.9776704888352445,
629
+ "grad_norm": 0.04317730665206909,
630
+ "learning_rate": 0.00021640950748041802,
631
+ "loss": 0.0247,
632
+ "mean_token_accuracy": 0.9921871078014374,
633
+ "num_tokens": 9369515.0,
634
  "step": 1650
635
  },
636
  {
637
  "epoch": 4.0,
638
+ "eval_loss": 0.04090578854084015,
639
+ "eval_mean_token_accuracy": 0.9890131476763132,
640
  "eval_num_tokens": 9416720.0,
641
+ "eval_runtime": 16.0268,
642
+ "eval_samples_per_second": 23.024,
643
+ "eval_steps_per_second": 11.543,
644
  "step": 1660
645
  },
646
  {
647
  "epoch": 4.036210018105009,
648
+ "grad_norm": 0.036448780447244644,
649
+ "learning_rate": 0.00020926944357752775,
650
+ "loss": 0.022,
651
+ "mean_token_accuracy": 0.9928884309591707,
652
+ "num_tokens": 9515736.0,
653
  "step": 1675
654
  },
655
  {
656
  "epoch": 4.096560048280024,
657
+ "grad_norm": 0.027000512927770615,
658
+ "learning_rate": 0.00020216920020214326,
659
+ "loss": 0.0188,
660
+ "mean_token_accuracy": 0.9937462592124939,
661
+ "num_tokens": 9651890.0,
662
  "step": 1700
663
  },
664
  {
665
  "epoch": 4.15691007845504,
666
+ "grad_norm": 0.0342092290520668,
667
+ "learning_rate": 0.00019511429623706353,
668
+ "loss": 0.0229,
669
+ "mean_token_accuracy": 0.9928898781538009,
670
+ "num_tokens": 9797002.0,
671
  "step": 1725
672
  },
673
  {
674
  "epoch": 4.217260108630055,
675
+ "grad_norm": 0.031318746507167816,
676
+ "learning_rate": 0.00018811021532363489,
677
+ "loss": 0.0181,
678
+ "mean_token_accuracy": 0.9940047591924668,
679
+ "num_tokens": 9935038.0,
680
  "step": 1750
681
  },
682
  {
683
  "epoch": 4.27761013880507,
684
+ "grad_norm": 0.025446726009249687,
685
+ "learning_rate": 0.00018116240159942172,
686
  "loss": 0.0233,
687
+ "mean_token_accuracy": 0.9922664022445679,
688
+ "num_tokens": 10083216.0,
689
  "step": 1775
690
  },
691
  {
692
  "epoch": 4.337960168980085,
693
+ "grad_norm": 0.03149225190281868,
694
+ "learning_rate": 0.0001742762554665831,
695
+ "loss": 0.0185,
696
+ "mean_token_accuracy": 0.9938816410303116,
697
+ "num_tokens": 10222100.0,
698
  "step": 1800
699
  },
700
  {
701
  "epoch": 4.3983101991551,
702
+ "grad_norm": 0.03767989203333855,
703
+ "learning_rate": 0.00016745712939424356,
704
+ "loss": 0.0226,
705
+ "mean_token_accuracy": 0.9924985402822495,
706
+ "num_tokens": 10370077.0,
707
  "step": 1825
708
  },
709
  {
710
  "epoch": 4.458660229330115,
711
+ "grad_norm": 0.02160339243710041,
712
+ "learning_rate": 0.00016071032375812225,
713
+ "loss": 0.0179,
714
+ "mean_token_accuracy": 0.9938990676403046,
715
+ "num_tokens": 10507974.0,
716
  "step": 1850
717
  },
718
  {
719
  "epoch": 4.51901025950513,
720
+ "grad_norm": 0.04912654682993889,
721
+ "learning_rate": 0.00015404108272065175,
722
+ "loss": 0.022,
723
+ "mean_token_accuracy": 0.9925913631916046,
724
+ "num_tokens": 10653228.0,
725
  "step": 1875
726
  },
727
  {
728
  "epoch": 4.579360289680145,
729
+ "grad_norm": 0.02648838609457016,
730
+ "learning_rate": 0.00014745459015479262,
731
+ "loss": 0.0187,
732
+ "mean_token_accuracy": 0.9936872887611389,
733
+ "num_tokens": 10790348.0,
734
  "step": 1900
735
  },
736
  {
737
  "epoch": 4.63971031985516,
738
+ "grad_norm": 0.030189577490091324,
739
+ "learning_rate": 0.0001409559656147072,
740
+ "loss": 0.0212,
741
+ "mean_token_accuracy": 0.993015621304512,
742
+ "num_tokens": 10937297.0,
743
  "step": 1925
744
  },
745
  {
746
  "epoch": 4.700060350030175,
747
+ "grad_norm": 0.026134416460990906,
748
+ "learning_rate": 0.00013455026035642952,
749
+ "loss": 0.0186,
750
+ "mean_token_accuracy": 0.9938356405496598,
751
+ "num_tokens": 11075319.0,
752
  "step": 1950
753
  },
754
  {
755
  "epoch": 4.76041038020519,
756
+ "grad_norm": 0.029894286766648293,
757
+ "learning_rate": 0.00012824245341162046,
758
+ "loss": 0.0205,
759
+ "mean_token_accuracy": 0.9930271410942078,
760
+ "num_tokens": 11220693.0,
761
  "step": 1975
762
  },
763
  {
764
  "epoch": 4.820760410380205,
765
+ "grad_norm": 0.022440843284130096,
766
+ "learning_rate": 0.00012203744771746207,
767
+ "loss": 0.0181,
768
+ "mean_token_accuracy": 0.9940010941028595,
769
+ "num_tokens": 11357759.0,
770
  "step": 2000
771
  },
772
  {
773
  "epoch": 4.88111044055522,
774
+ "grad_norm": 0.04004226252436638,
775
+ "learning_rate": 0.00011594006630569889,
776
+ "loss": 0.0243,
777
+ "mean_token_accuracy": 0.9918814218044281,
778
+ "num_tokens": 11506389.0,
779
  "step": 2025
780
  },
781
  {
782
  "epoch": 4.941460470730235,
783
+ "grad_norm": 0.025478657335042953,
784
+ "learning_rate": 0.00010995504855378823,
785
+ "loss": 0.0176,
786
+ "mean_token_accuracy": 0.9940668666362762,
787
+ "num_tokens": 11644773.0,
788
  "step": 2050
789
  },
790
  {
791
  "epoch": 5.0,
792
+ "grad_norm": 0.0995091125369072,
793
+ "learning_rate": 0.00010408704650107208,
794
+ "loss": 0.0191,
795
+ "mean_token_accuracy": 0.9937616193417421,
796
  "num_tokens": 11770900.0,
797
  "step": 2075
798
  },
799
  {
800
  "epoch": 5.0,
801
+ "eval_loss": 0.03994831442832947,
802
+ "eval_mean_token_accuracy": 0.9898936400542389,
803
  "eval_num_tokens": 11770900.0,
804
+ "eval_runtime": 15.9599,
805
+ "eval_samples_per_second": 23.12,
806
+ "eval_steps_per_second": 11.592,
807
  "step": 2075
808
  }
809
  ],
 
824
  "attributes": {}
825
  }
826
  },
827
+ "total_flos": 5.1141133004634624e+17,
828
  "train_batch_size": 2,
829
  "trial_name": null,
830
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40794b3fd456b024024487eaa2ab17e954722e38bbaed4ada4bd298dc482abd9
3
  size 6097
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8df2475c9956609c364fb58b0499f81bebce8d55121af31d0d0dbdfdf66aab4c
3
  size 6097