robertou2 commited on
Commit
b595f50
·
verified ·
1 Parent(s): de57a8e

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5fdb3758f29d5e94c8350c37ab504d7ff02c3cccbb8332057d66d431ef10f682
3
  size 161515608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adbf32216f68817ac7b8e81d84ec05581ee1d4aec78db3102b8b8bfda9c3203a
3
  size 161515608
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b03158d3c32281c45bae11494452aff7910950a34011e853f3d6c1c18d8651b
3
  size 323181259
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b74bcd870dc58a45d5857957da63a7b34ce5562b9a8ed24f282d74c1daa703e
3
  size 323181259
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2534e434cd5abbb8f7668d3eab0549db0ef95d6a797a3efa86b712e8e32266a7
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5b517d1b8e2b0f837c8b00170b154961d4d989feba4326ac25583df7a55c57a
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f35223b4162b3f25fe602e5e4c5a2349c08c0134f11cd20a82d190f37cb0842a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e3ed70b691deef80930296c31c1f2faec5c46190c3c196aae31c4481cc14ad8
3
  size 1465
trainer_state.json CHANGED
@@ -2,1018 +2,768 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.875912408759124,
6
  "eval_steps": 500,
7
- "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 2.134004071354866,
14
- "epoch": 0.029197080291970802,
15
- "grad_norm": 19.125,
16
  "learning_rate": 0.0,
17
- "loss": 2.5766,
18
- "mean_token_accuracy": 0.42293117567896843,
19
- "num_tokens": 1699.0,
20
  "step": 1
21
  },
22
  {
23
- "entropy": 2.1332614570856094,
24
- "epoch": 0.058394160583941604,
25
- "grad_norm": 15.375,
26
  "learning_rate": 2e-06,
27
- "loss": 2.3501,
28
- "mean_token_accuracy": 0.43237315863370895,
29
- "num_tokens": 3890.0,
30
  "step": 2
31
  },
32
  {
33
- "entropy": 2.3441822230815887,
34
- "epoch": 0.08759124087591241,
35
- "grad_norm": 22.875,
36
  "learning_rate": 4e-06,
37
- "loss": 3.134,
38
- "mean_token_accuracy": 0.3771548382937908,
39
- "num_tokens": 5114.0,
40
  "step": 3
41
  },
42
  {
43
- "entropy": 2.169568419456482,
44
- "epoch": 0.11678832116788321,
45
- "grad_norm": 15.0625,
46
  "learning_rate": 6e-06,
47
- "loss": 2.2609,
48
- "mean_token_accuracy": 0.4582384452223778,
49
- "num_tokens": 6825.0,
50
  "step": 4
51
  },
52
  {
53
- "entropy": 2.3993491530418396,
54
- "epoch": 0.145985401459854,
55
- "grad_norm": 11.5625,
56
  "learning_rate": 8e-06,
57
- "loss": 2.2625,
58
- "mean_token_accuracy": 0.44751258939504623,
59
- "num_tokens": 8794.0,
60
  "step": 5
61
  },
62
  {
63
- "entropy": 2.38617005944252,
64
- "epoch": 0.17518248175182483,
65
- "grad_norm": 11.0,
66
  "learning_rate": 9.999999999999999e-06,
67
- "loss": 2.2774,
68
- "mean_token_accuracy": 0.4528072811663151,
69
- "num_tokens": 10473.0,
70
  "step": 6
71
  },
72
  {
73
- "entropy": 2.17643141746521,
74
- "epoch": 0.20437956204379562,
75
- "grad_norm": 7.21875,
76
  "learning_rate": 1.2e-05,
77
- "loss": 1.911,
78
- "mean_token_accuracy": 0.5113650299608707,
79
- "num_tokens": 12709.0,
80
  "step": 7
81
  },
82
  {
83
- "entropy": 2.2615339010953903,
84
- "epoch": 0.23357664233576642,
85
- "grad_norm": 5.9375,
86
  "learning_rate": 1.4e-05,
87
- "loss": 1.8747,
88
- "mean_token_accuracy": 0.5006480813026428,
89
- "num_tokens": 15657.0,
90
  "step": 8
91
  },
92
  {
93
- "entropy": 2.416978284716606,
94
- "epoch": 0.26277372262773724,
95
- "grad_norm": 8.4375,
96
  "learning_rate": 1.6e-05,
97
- "loss": 1.9924,
98
- "mean_token_accuracy": 0.4901970997452736,
99
- "num_tokens": 17681.0,
100
  "step": 9
101
  },
102
  {
103
- "entropy": 2.3273025155067444,
104
- "epoch": 0.291970802919708,
105
- "grad_norm": 6.09375,
106
  "learning_rate": 1.8e-05,
107
- "loss": 1.8238,
108
- "mean_token_accuracy": 0.4891773872077465,
109
- "num_tokens": 20159.0,
110
  "step": 10
111
  },
112
  {
113
- "entropy": 2.295111373066902,
114
- "epoch": 0.32116788321167883,
115
- "grad_norm": 5.9375,
116
  "learning_rate": 1.9999999999999998e-05,
117
- "loss": 1.8838,
118
- "mean_token_accuracy": 0.528899259865284,
119
- "num_tokens": 22380.0,
120
  "step": 11
121
  },
122
  {
123
- "entropy": 2.4463636726140976,
124
- "epoch": 0.35036496350364965,
125
- "grad_norm": 7.28125,
126
  "learning_rate": 2.2e-05,
127
- "loss": 2.0672,
128
- "mean_token_accuracy": 0.4942050985991955,
129
- "num_tokens": 23949.0,
130
  "step": 12
131
  },
132
  {
133
- "entropy": 2.2411956042051315,
134
- "epoch": 0.3795620437956204,
135
- "grad_norm": 6.625,
136
  "learning_rate": 2.4e-05,
137
- "loss": 1.7258,
138
- "mean_token_accuracy": 0.5641119256615639,
139
- "num_tokens": 25626.0,
140
  "step": 13
141
  },
142
  {
143
- "entropy": 2.1571693122386932,
144
- "epoch": 0.40875912408759124,
145
- "grad_norm": 6.3125,
146
  "learning_rate": 2.6000000000000002e-05,
147
- "loss": 1.7421,
148
- "mean_token_accuracy": 0.5413074977695942,
149
- "num_tokens": 27703.0,
150
  "step": 14
151
  },
152
  {
153
- "entropy": 2.0649050027132034,
154
- "epoch": 0.43795620437956206,
155
- "grad_norm": 5.65625,
156
  "learning_rate": 2.8e-05,
157
- "loss": 1.7653,
158
- "mean_token_accuracy": 0.5364297069609165,
159
- "num_tokens": 29910.0,
160
  "step": 15
161
  },
162
  {
163
- "entropy": 2.0259645730257034,
164
- "epoch": 0.46715328467153283,
165
- "grad_norm": 5.5,
166
  "learning_rate": 3e-05,
167
- "loss": 1.586,
168
- "mean_token_accuracy": 0.5716114267706871,
169
- "num_tokens": 32243.0,
170
  "step": 16
171
  },
172
  {
173
- "entropy": 2.2259650826454163,
174
- "epoch": 0.49635036496350365,
175
- "grad_norm": 6.46875,
176
- "learning_rate": 2.9990862405286438e-05,
177
- "loss": 1.8815,
178
- "mean_token_accuracy": 0.5339390859007835,
179
- "num_tokens": 33704.0,
180
  "step": 17
181
  },
182
  {
183
- "entropy": 2.2045857161283493,
184
- "epoch": 0.5255474452554745,
185
- "grad_norm": 7.0,
186
- "learning_rate": 2.9963460753897364e-05,
187
- "loss": 1.8033,
188
- "mean_token_accuracy": 0.5426613725721836,
189
- "num_tokens": 35222.0,
190
  "step": 18
191
  },
192
  {
193
- "entropy": 2.0502880662679672,
194
- "epoch": 0.5547445255474452,
195
- "grad_norm": 5.625,
196
- "learning_rate": 2.99178284305241e-05,
197
- "loss": 1.6822,
198
- "mean_token_accuracy": 0.5445077642798424,
199
- "num_tokens": 37281.0,
200
  "step": 19
201
  },
202
  {
203
- "entropy": 1.9275199472904205,
204
- "epoch": 0.583941605839416,
205
- "grad_norm": 5.625,
206
- "learning_rate": 2.9854021031123555e-05,
207
- "loss": 1.5569,
208
- "mean_token_accuracy": 0.5689515694975853,
209
- "num_tokens": 39208.0,
210
  "step": 20
211
  },
212
  {
213
- "entropy": 2.1408673971891403,
214
- "epoch": 0.6131386861313869,
215
- "grad_norm": 6.5,
216
- "learning_rate": 2.977211629518312e-05,
217
- "loss": 1.8479,
218
- "mean_token_accuracy": 0.5382610447704792,
219
- "num_tokens": 40754.0,
220
  "step": 21
221
  },
222
  {
223
- "entropy": 2.138097256422043,
224
- "epoch": 0.6423357664233577,
225
- "grad_norm": 6.03125,
226
- "learning_rate": 2.9672214011007087e-05,
227
- "loss": 1.7691,
228
- "mean_token_accuracy": 0.5337589606642723,
229
- "num_tokens": 42447.0,
230
  "step": 22
231
  },
232
  {
233
- "entropy": 1.9584687054157257,
234
- "epoch": 0.6715328467153284,
235
- "grad_norm": 4.59375,
236
- "learning_rate": 2.9554435894139945e-05,
237
- "loss": 1.502,
238
- "mean_token_accuracy": 0.5679651834070683,
239
- "num_tokens": 44963.0,
240
  "step": 23
241
  },
242
  {
243
- "entropy": 2.0382106602191925,
244
- "epoch": 0.7007299270072993,
245
- "grad_norm": 5.03125,
246
- "learning_rate": 2.9418925439074784e-05,
247
- "loss": 1.6539,
248
- "mean_token_accuracy": 0.5411265380680561,
249
- "num_tokens": 47138.0,
250
  "step": 24
251
  },
252
  {
253
- "entropy": 2.0515516996383667,
254
- "epoch": 0.7299270072992701,
255
- "grad_norm": 5.4375,
256
- "learning_rate": 2.9265847744427305e-05,
257
- "loss": 1.7007,
258
- "mean_token_accuracy": 0.5707135051488876,
259
- "num_tokens": 49154.0,
260
  "step": 25
261
  },
262
  {
263
- "entropy": 1.96835595369339,
264
- "epoch": 0.7591240875912408,
265
- "grad_norm": 4.875,
266
- "learning_rate": 2.9095389311788626e-05,
267
- "loss": 1.5182,
268
- "mean_token_accuracy": 0.5940572991967201,
269
- "num_tokens": 51009.0,
270
  "step": 26
271
  },
272
  {
273
- "entropy": 1.9829230606555939,
274
- "epoch": 0.7883211678832117,
275
- "grad_norm": 4.78125,
276
- "learning_rate": 2.890775781850181e-05,
277
- "loss": 1.5441,
278
- "mean_token_accuracy": 0.5696061700582504,
279
- "num_tokens": 52866.0,
280
  "step": 27
281
  },
282
  {
283
- "entropy": 1.9901328533887863,
284
- "epoch": 0.8175182481751825,
285
- "grad_norm": 4.53125,
286
- "learning_rate": 2.8703181864639013e-05,
287
- "loss": 1.5227,
288
- "mean_token_accuracy": 0.5771616920828819,
289
- "num_tokens": 55235.0,
290
  "step": 28
291
  },
292
  {
293
- "entropy": 2.176472947001457,
294
- "epoch": 0.8467153284671532,
295
- "grad_norm": 7.34375,
296
- "learning_rate": 2.8481910694487507e-05,
297
- "loss": 1.784,
298
- "mean_token_accuracy": 0.5394799076020718,
299
- "num_tokens": 56468.0,
300
  "step": 29
301
  },
302
  {
303
- "entropy": 2.0398730635643005,
304
- "epoch": 0.8759124087591241,
305
- "grad_norm": 5.5,
306
- "learning_rate": 2.8244213892883907e-05,
307
- "loss": 1.584,
308
- "mean_token_accuracy": 0.564793273806572,
309
- "num_tokens": 58219.0,
310
  "step": 30
311
  },
312
  {
313
- "entropy": 1.7868350446224213,
314
- "epoch": 0.9051094890510949,
315
- "grad_norm": 3.71875,
316
- "learning_rate": 2.7990381056766583e-05,
317
- "loss": 1.4897,
318
- "mean_token_accuracy": 0.5773478448390961,
319
- "num_tokens": 61246.0,
320
  "step": 31
321
  },
322
  {
323
- "entropy": 1.8927763998508453,
324
- "epoch": 0.9343065693430657,
325
- "grad_norm": 5.03125,
326
- "learning_rate": 2.772072144234639e-05,
327
- "loss": 1.4658,
328
- "mean_token_accuracy": 0.5965544059872627,
329
- "num_tokens": 63057.0,
330
  "step": 32
331
  },
332
  {
333
- "entropy": 1.9243939369916916,
334
- "epoch": 0.9635036496350365,
335
- "grad_norm": 4.9375,
336
- "learning_rate": 2.7435563588325627e-05,
337
- "loss": 1.5646,
338
- "mean_token_accuracy": 0.551388930529356,
339
- "num_tokens": 64856.0,
340
  "step": 33
341
  },
342
  {
343
- "entropy": 1.945557788014412,
344
- "epoch": 0.9927007299270073,
345
- "grad_norm": 5.34375,
346
- "learning_rate": 2.7135254915624213e-05,
347
- "loss": 1.6558,
348
- "mean_token_accuracy": 0.5641069300472736,
349
- "num_tokens": 66564.0,
350
  "step": 34
351
  },
352
  {
353
- "entropy": 1.8289813995361328,
354
- "epoch": 1.0,
355
- "grad_norm": 12.5,
356
- "learning_rate": 2.6820161304100828e-05,
357
- "loss": 1.6743,
358
- "mean_token_accuracy": 0.5590097606182098,
359
- "num_tokens": 66897.0,
360
  "step": 35
361
  },
362
  {
363
- "entropy": 1.8240835815668106,
364
- "epoch": 1.0291970802919708,
365
- "grad_norm": 4.0,
366
- "learning_rate": 2.649066664678467e-05,
367
- "loss": 1.2519,
368
- "mean_token_accuracy": 0.6510025560855865,
369
- "num_tokens": 69125.0,
370
  "step": 36
371
  },
372
  {
373
- "entropy": 1.7388608753681183,
374
- "epoch": 1.0583941605839415,
375
- "grad_norm": 3.671875,
376
- "learning_rate": 2.6147172382160913e-05,
377
- "loss": 1.145,
378
- "mean_token_accuracy": 0.6592915058135986,
379
- "num_tokens": 71403.0,
380
  "step": 37
381
  },
382
  {
383
- "entropy": 1.7314125299453735,
384
- "epoch": 1.0875912408759123,
385
- "grad_norm": 3.84375,
386
- "learning_rate": 2.5790097005079766e-05,
387
- "loss": 1.2177,
388
- "mean_token_accuracy": 0.6403542906045914,
389
- "num_tokens": 73853.0,
390
  "step": 38
391
  },
392
  {
393
- "entropy": 1.9059295356273651,
394
- "epoch": 1.1167883211678833,
395
- "grad_norm": 5.09375,
396
- "learning_rate": 2.541987555688496e-05,
397
- "loss": 1.3537,
398
- "mean_token_accuracy": 0.5938370451331139,
399
- "num_tokens": 75484.0,
400
  "step": 39
401
  },
402
  {
403
- "entropy": 1.8351815044879913,
404
- "epoch": 1.145985401459854,
405
- "grad_norm": 5.03125,
406
- "learning_rate": 2.5036959095382875e-05,
407
- "loss": 1.1891,
408
- "mean_token_accuracy": 0.6363263987004757,
409
- "num_tokens": 77263.0,
410
  "step": 40
411
  },
412
  {
413
- "entropy": 1.856779396533966,
414
- "epoch": 1.1751824817518248,
415
- "grad_norm": 4.8125,
416
- "learning_rate": 2.464181414529809e-05,
417
- "loss": 1.3116,
418
- "mean_token_accuracy": 0.625493511557579,
419
- "num_tokens": 79113.0,
420
  "step": 41
421
  },
422
  {
423
- "entropy": 1.7603202909231186,
424
- "epoch": 1.2043795620437956,
425
- "grad_norm": 4.90625,
426
- "learning_rate": 2.4234922129884873e-05,
427
- "loss": 1.2056,
428
- "mean_token_accuracy": 0.6308283284306526,
429
- "num_tokens": 80962.0,
430
  "step": 42
431
  },
432
  {
433
- "entropy": 1.6366319358348846,
434
- "epoch": 1.2335766423357664,
435
- "grad_norm": 4.6875,
436
- "learning_rate": 2.3816778784387097e-05,
437
- "loss": 1.2438,
438
- "mean_token_accuracy": 0.6533086150884628,
439
- "num_tokens": 83095.0,
440
  "step": 43
441
  },
442
  {
443
- "entropy": 1.6320330947637558,
444
- "epoch": 1.2627737226277373,
445
- "grad_norm": 4.1875,
446
- "learning_rate": 2.3387893552061202e-05,
447
- "loss": 1.1647,
448
- "mean_token_accuracy": 0.6589736789464951,
449
- "num_tokens": 85383.0,
450
  "step": 44
451
  },
452
  {
453
- "entropy": 1.575496032834053,
454
- "epoch": 1.2919708029197081,
455
- "grad_norm": 4.65625,
456
- "learning_rate": 2.2948788963498073e-05,
457
- "loss": 1.1654,
458
- "mean_token_accuracy": 0.6555850505828857,
459
- "num_tokens": 87754.0,
460
  "step": 45
461
  },
462
  {
463
- "entropy": 1.64286208152771,
464
- "epoch": 1.3211678832116789,
465
- "grad_norm": 5.8125,
466
- "learning_rate": 2.25e-05,
467
- "loss": 1.3359,
468
- "mean_token_accuracy": 0.649970181286335,
469
- "num_tokens": 89289.0,
470
  "step": 46
471
  },
472
  {
473
- "entropy": 1.457002505660057,
474
- "epoch": 1.3503649635036497,
475
- "grad_norm": 4.75,
476
- "learning_rate": 2.2042073441788363e-05,
477
- "loss": 1.1513,
478
- "mean_token_accuracy": 0.6784967109560966,
479
- "num_tokens": 91666.0,
480
  "step": 47
481
  },
482
  {
483
- "entropy": 1.567281499505043,
484
- "epoch": 1.3795620437956204,
485
- "grad_norm": 6.78125,
486
- "learning_rate": 2.157556720183616e-05,
487
- "loss": 1.212,
488
- "mean_token_accuracy": 0.6601979807019234,
489
- "num_tokens": 93407.0,
490
  "step": 48
491
  },
492
  {
493
- "entropy": 1.4496354460716248,
494
- "epoch": 1.4087591240875912,
495
- "grad_norm": 4.90625,
496
- "learning_rate": 2.1101049646137008e-05,
497
- "loss": 1.074,
498
- "mean_token_accuracy": 0.6734104976058006,
499
- "num_tokens": 95819.0,
500
  "step": 49
501
  },
502
  {
503
- "entropy": 1.5027115792036057,
504
- "epoch": 1.437956204379562,
505
- "grad_norm": 4.65625,
506
- "learning_rate": 2.0619098901238684e-05,
507
- "loss": 1.1059,
508
- "mean_token_accuracy": 0.6857927665114403,
509
- "num_tokens": 98052.0,
510
  "step": 50
511
  },
512
  {
513
- "entropy": 1.5403490960597992,
514
- "epoch": 1.4671532846715327,
515
- "grad_norm": 5.75,
516
- "learning_rate": 2.0130302149885033e-05,
517
- "loss": 1.1573,
518
- "mean_token_accuracy": 0.6808772906661034,
519
- "num_tokens": 99865.0,
520
  "step": 51
521
  },
522
  {
523
- "entropy": 1.3851112127304077,
524
- "epoch": 1.4963503649635037,
525
- "grad_norm": 4.3125,
526
- "learning_rate": 1.963525491562421e-05,
527
- "loss": 1.0986,
528
- "mean_token_accuracy": 0.669769361615181,
529
- "num_tokens": 102444.0,
530
  "step": 52
531
  },
532
  {
533
- "entropy": 1.6086822748184204,
534
- "epoch": 1.5255474452554745,
535
- "grad_norm": 5.9375,
536
- "learning_rate": 1.9134560337254986e-05,
537
- "loss": 1.2058,
538
- "mean_token_accuracy": 0.6342265903949738,
539
- "num_tokens": 104135.0,
540
  "step": 53
541
  },
542
  {
543
- "entropy": 1.6186174154281616,
544
- "epoch": 1.5547445255474452,
545
- "grad_norm": 5.75,
546
- "learning_rate": 1.8628828433995013e-05,
547
- "loss": 1.1878,
548
- "mean_token_accuracy": 0.6471928432583809,
549
- "num_tokens": 105888.0,
550
  "step": 54
551
  },
552
  {
553
- "entropy": 1.636601522564888,
554
- "epoch": 1.583941605839416,
555
- "grad_norm": 6.40625,
556
- "learning_rate": 1.8118675362266388e-05,
557
- "loss": 1.2144,
558
- "mean_token_accuracy": 0.669179767370224,
559
- "num_tokens": 107324.0,
560
  "step": 55
561
  },
562
  {
563
- "entropy": 1.6150267571210861,
564
- "epoch": 1.613138686131387,
565
- "grad_norm": 6.21875,
566
- "learning_rate": 1.760472266500396e-05,
567
- "loss": 1.2551,
568
- "mean_token_accuracy": 0.6627604365348816,
569
- "num_tokens": 108844.0,
570
  "step": 56
571
  },
572
  {
573
- "entropy": 1.7444928288459778,
574
- "epoch": 1.6423357664233578,
575
- "grad_norm": 6.34375,
576
- "learning_rate": 1.7087596514400982e-05,
577
- "loss": 1.2656,
578
- "mean_token_accuracy": 0.6279268711805344,
579
- "num_tokens": 110263.0,
580
  "step": 57
581
  },
582
  {
583
- "entropy": 1.5423792004585266,
584
- "epoch": 1.6715328467153285,
585
- "grad_norm": 5.53125,
586
- "learning_rate": 1.6567926949014805e-05,
587
- "loss": 1.2103,
588
- "mean_token_accuracy": 0.6224785149097443,
589
- "num_tokens": 112199.0,
590
  "step": 58
591
  },
592
  {
593
- "entropy": 1.6031899452209473,
594
- "epoch": 1.7007299270072993,
595
- "grad_norm": 6.5,
596
- "learning_rate": 1.604634710616188e-05,
597
- "loss": 1.2274,
598
- "mean_token_accuracy": 0.6428026333451271,
599
- "num_tokens": 113911.0,
600
  "step": 59
601
  },
602
  {
603
- "entropy": 1.7055649012327194,
604
- "epoch": 1.72992700729927,
605
- "grad_norm": 6.6875,
606
- "learning_rate": 1.552349245053752e-05,
607
- "loss": 1.2889,
608
- "mean_token_accuracy": 0.6419094651937485,
609
- "num_tokens": 115316.0,
610
  "step": 60
611
  },
612
  {
613
- "entropy": 1.5212641060352325,
614
- "epoch": 1.7591240875912408,
615
- "grad_norm": 4.4375,
616
- "learning_rate": 1.5e-05,
617
- "loss": 1.0935,
618
- "mean_token_accuracy": 0.6695626378059387,
619
- "num_tokens": 118007.0,
620
  "step": 61
621
  },
622
  {
623
- "entropy": 1.781775563955307,
624
- "epoch": 1.7883211678832116,
625
- "grad_norm": 7.0,
626
- "learning_rate": 1.447650754946249e-05,
627
- "loss": 1.2709,
628
- "mean_token_accuracy": 0.6656767651438713,
629
- "num_tokens": 119232.0,
630
  "step": 62
631
  },
632
  {
633
- "entropy": 1.616694524884224,
634
- "epoch": 1.8175182481751824,
635
- "grad_norm": 6.3125,
636
- "learning_rate": 1.3953652893838121e-05,
637
- "loss": 1.2435,
638
- "mean_token_accuracy": 0.6494908779859543,
639
- "num_tokens": 120725.0,
640
  "step": 63
641
  },
642
  {
643
- "entropy": 1.7247931063175201,
644
- "epoch": 1.8467153284671531,
645
- "grad_norm": 7.15625,
646
- "learning_rate": 1.3432073050985201e-05,
647
- "loss": 1.3701,
648
- "mean_token_accuracy": 0.6305030956864357,
649
- "num_tokens": 122093.0,
650
  "step": 64
651
  },
652
  {
653
- "entropy": 1.590467780828476,
654
- "epoch": 1.8759124087591241,
655
- "grad_norm": 5.0,
656
- "learning_rate": 1.2912403485599022e-05,
657
- "loss": 1.263,
658
- "mean_token_accuracy": 0.6583547666668892,
659
- "num_tokens": 124333.0,
660
  "step": 65
661
  },
662
  {
663
- "entropy": 1.6301420778036118,
664
- "epoch": 1.905109489051095,
665
- "grad_norm": 5.3125,
666
- "learning_rate": 1.2395277334996045e-05,
667
- "loss": 1.1125,
668
- "mean_token_accuracy": 0.650074191391468,
669
- "num_tokens": 126272.0,
670
  "step": 66
671
  },
672
  {
673
- "entropy": 1.5050681680440903,
674
- "epoch": 1.9343065693430657,
675
- "grad_norm": 4.28125,
676
- "learning_rate": 1.1881324637733613e-05,
677
- "loss": 1.037,
678
- "mean_token_accuracy": 0.6733650118112564,
679
- "num_tokens": 128615.0,
680
  "step": 67
681
  },
682
  {
683
- "entropy": 1.5582159608602524,
684
- "epoch": 1.9635036496350367,
685
- "grad_norm": 4.34375,
686
- "learning_rate": 1.1371171566004986e-05,
687
- "loss": 1.0951,
688
- "mean_token_accuracy": 0.6506948918104172,
689
- "num_tokens": 131279.0,
690
  "step": 68
691
  },
692
  {
693
- "entropy": 1.6561681628227234,
694
- "epoch": 1.9927007299270074,
695
- "grad_norm": 5.75,
696
- "learning_rate": 1.0865439662745013e-05,
697
- "loss": 1.1486,
698
- "mean_token_accuracy": 0.6755311414599419,
699
- "num_tokens": 132847.0,
700
  "step": 69
701
  },
702
  {
703
- "entropy": 1.4383031129837036,
704
- "epoch": 2.0,
705
- "grad_norm": 7.4375,
706
- "learning_rate": 1.036474508437579e-05,
707
- "loss": 1.1032,
708
- "mean_token_accuracy": 0.6792386174201965,
709
- "num_tokens": 133794.0,
710
  "step": 70
711
  },
712
  {
713
- "entropy": 1.5033023059368134,
714
- "epoch": 2.0291970802919708,
715
- "grad_norm": 4.09375,
716
- "learning_rate": 9.86969785011497e-06,
717
- "loss": 0.8414,
718
- "mean_token_accuracy": 0.7257160544395447,
719
- "num_tokens": 135994.0,
720
  "step": 71
721
  },
722
  {
723
- "entropy": 1.588482990860939,
724
- "epoch": 2.0583941605839415,
725
- "grad_norm": 5.4375,
726
- "learning_rate": 9.380901098761319e-06,
727
- "loss": 0.8667,
728
- "mean_token_accuracy": 0.7469649091362953,
729
- "num_tokens": 137554.0,
730
  "step": 72
731
  },
732
  {
733
- "entropy": 1.539756417274475,
734
- "epoch": 2.0875912408759123,
735
- "grad_norm": 4.3125,
736
- "learning_rate": 8.898950353863e-06,
737
- "loss": 0.8192,
738
- "mean_token_accuracy": 0.7514503225684166,
739
- "num_tokens": 139542.0,
740
  "step": 73
741
  },
742
  {
743
- "entropy": 1.5114945620298386,
744
- "epoch": 2.116788321167883,
745
- "grad_norm": 4.25,
746
- "learning_rate": 8.424432798163838e-06,
747
- "loss": 0.9041,
748
- "mean_token_accuracy": 0.7257768511772156,
749
- "num_tokens": 141721.0,
750
  "step": 74
751
  },
752
  {
753
- "entropy": 1.4715029448270798,
754
- "epoch": 2.145985401459854,
755
- "grad_norm": 4.375,
756
- "learning_rate": 7.957926558211643e-06,
757
- "loss": 0.8884,
758
- "mean_token_accuracy": 0.7411475032567978,
759
- "num_tokens": 143837.0,
760
  "step": 75
761
- },
762
- {
763
- "entropy": 1.375910922884941,
764
- "epoch": 2.1751824817518246,
765
- "grad_norm": 4.0625,
766
- "learning_rate": 7.500000000000004e-06,
767
- "loss": 0.8403,
768
- "mean_token_accuracy": 0.7337475717067719,
769
- "num_tokens": 146069.0,
770
- "step": 76
771
- },
772
- {
773
- "entropy": 1.530395969748497,
774
- "epoch": 2.204379562043796,
775
- "grad_norm": 4.8125,
776
- "learning_rate": 7.051211036501928e-06,
777
- "loss": 0.9023,
778
- "mean_token_accuracy": 0.7458862364292145,
779
- "num_tokens": 147948.0,
780
- "step": 77
781
- },
782
- {
783
- "entropy": 1.5619382560253143,
784
- "epoch": 2.2335766423357666,
785
- "grad_norm": 5.375,
786
- "learning_rate": 6.6121064479388e-06,
787
- "loss": 0.9471,
788
- "mean_token_accuracy": 0.7247473746538162,
789
- "num_tokens": 149664.0,
790
- "step": 78
791
- },
792
- {
793
- "entropy": 1.4002738296985626,
794
- "epoch": 2.2627737226277373,
795
- "grad_norm": 4.90625,
796
- "learning_rate": 6.1832212156129045e-06,
797
- "loss": 0.8002,
798
- "mean_token_accuracy": 0.7359691336750984,
799
- "num_tokens": 151422.0,
800
- "step": 79
801
- },
802
- {
803
- "entropy": 1.3783821165561676,
804
- "epoch": 2.291970802919708,
805
- "grad_norm": 4.875,
806
- "learning_rate": 5.765077870115126e-06,
807
- "loss": 0.9352,
808
- "mean_token_accuracy": 0.7229901030659676,
809
- "num_tokens": 153330.0,
810
- "step": 80
811
- },
812
- {
813
- "entropy": 1.3214146196842194,
814
- "epoch": 2.321167883211679,
815
- "grad_norm": 4.875,
816
- "learning_rate": 5.3581858547019095e-06,
817
- "loss": 0.7626,
818
- "mean_token_accuracy": 0.7818252220749855,
819
- "num_tokens": 155088.0,
820
- "step": 81
821
- },
822
- {
823
- "entropy": 1.2702767699956894,
824
- "epoch": 2.3503649635036497,
825
- "grad_norm": 4.375,
826
- "learning_rate": 4.963040904617131e-06,
827
- "loss": 0.7893,
828
- "mean_token_accuracy": 0.7699355036020279,
829
- "num_tokens": 157396.0,
830
- "step": 82
831
- },
832
- {
833
- "entropy": 1.397829994559288,
834
- "epoch": 2.3795620437956204,
835
- "grad_norm": 5.25,
836
- "learning_rate": 4.58012444311504e-06,
837
- "loss": 0.9191,
838
- "mean_token_accuracy": 0.7331462875008583,
839
- "num_tokens": 159218.0,
840
- "step": 83
841
- },
842
- {
843
- "entropy": 1.2017180174589157,
844
- "epoch": 2.408759124087591,
845
- "grad_norm": 3.6875,
846
- "learning_rate": 4.209902994920236e-06,
847
- "loss": 0.8082,
848
- "mean_token_accuracy": 0.7587887346744537,
849
- "num_tokens": 162386.0,
850
- "step": 84
851
- },
852
- {
853
- "entropy": 1.374891072511673,
854
- "epoch": 2.437956204379562,
855
- "grad_norm": 5.09375,
856
- "learning_rate": 3.852827617839085e-06,
857
- "loss": 0.8665,
858
- "mean_token_accuracy": 0.7603413909673691,
859
- "num_tokens": 164138.0,
860
- "step": 85
861
- },
862
- {
863
- "entropy": 1.3341291099786758,
864
- "epoch": 2.4671532846715327,
865
- "grad_norm": 4.6875,
866
- "learning_rate": 3.5093333532153316e-06,
867
- "loss": 0.8604,
868
- "mean_token_accuracy": 0.7294721901416779,
869
- "num_tokens": 166308.0,
870
- "step": 86
871
- },
872
- {
873
- "entropy": 1.3214628398418427,
874
- "epoch": 2.4963503649635035,
875
- "grad_norm": 5.4375,
876
- "learning_rate": 3.1798386958991715e-06,
877
- "loss": 0.8978,
878
- "mean_token_accuracy": 0.7371588498353958,
879
- "num_tokens": 168073.0,
880
- "step": 87
881
- },
882
- {
883
- "entropy": 1.358703538775444,
884
- "epoch": 2.5255474452554747,
885
- "grad_norm": 5.125,
886
- "learning_rate": 2.86474508437579e-06,
887
- "loss": 0.859,
888
- "mean_token_accuracy": 0.7255095988512039,
889
- "num_tokens": 169979.0,
890
- "step": 88
891
- },
892
- {
893
- "entropy": 1.258324310183525,
894
- "epoch": 2.554744525547445,
895
- "grad_norm": 4.15625,
896
- "learning_rate": 2.564436411674376e-06,
897
- "loss": 0.825,
898
- "mean_token_accuracy": 0.7614458128809929,
899
- "num_tokens": 172706.0,
900
- "step": 89
901
- },
902
- {
903
- "entropy": 1.329784169793129,
904
- "epoch": 2.5839416058394162,
905
- "grad_norm": 5.40625,
906
- "learning_rate": 2.279278557653611e-06,
907
- "loss": 0.8799,
908
- "mean_token_accuracy": 0.7584780603647232,
909
- "num_tokens": 174586.0,
910
- "step": 90
911
- },
912
- {
913
- "entropy": 1.2622641026973724,
914
- "epoch": 2.613138686131387,
915
- "grad_norm": 5.125,
916
- "learning_rate": 2.0096189432334194e-06,
917
- "loss": 0.8348,
918
- "mean_token_accuracy": 0.7513260990381241,
919
- "num_tokens": 176525.0,
920
- "step": 91
921
- },
922
- {
923
- "entropy": 1.2846813797950745,
924
- "epoch": 2.6423357664233578,
925
- "grad_norm": 5.0,
926
- "learning_rate": 1.7557861071160953e-06,
927
- "loss": 0.7697,
928
- "mean_token_accuracy": 0.7566402554512024,
929
- "num_tokens": 178535.0,
930
- "step": 92
931
- },
932
- {
933
- "entropy": 1.2429047673940659,
934
- "epoch": 2.6715328467153285,
935
- "grad_norm": 4.1875,
936
- "learning_rate": 1.518089305512498e-06,
937
- "loss": 0.8523,
938
- "mean_token_accuracy": 0.7609995678067207,
939
- "num_tokens": 181688.0,
940
- "step": 93
941
- },
942
- {
943
- "entropy": 1.2306764125823975,
944
- "epoch": 2.7007299270072993,
945
- "grad_norm": 5.6875,
946
- "learning_rate": 1.2968181353609854e-06,
947
- "loss": 0.795,
948
- "mean_token_accuracy": 0.7538608759641647,
949
- "num_tokens": 183350.0,
950
- "step": 94
951
- },
952
- {
953
- "entropy": 1.2729838192462921,
954
- "epoch": 2.72992700729927,
955
- "grad_norm": 5.25,
956
- "learning_rate": 1.0922421814981904e-06,
957
- "loss": 0.8463,
958
- "mean_token_accuracy": 0.7443541586399078,
959
- "num_tokens": 185369.0,
960
- "step": 95
961
- },
962
- {
963
- "entropy": 1.2911252602934837,
964
- "epoch": 2.759124087591241,
965
- "grad_norm": 5.125,
966
- "learning_rate": 9.046106882113753e-07,
967
- "loss": 0.7471,
968
- "mean_token_accuracy": 0.752311646938324,
969
- "num_tokens": 187493.0,
970
- "step": 96
971
- },
972
- {
973
- "entropy": 1.28748519718647,
974
- "epoch": 2.7883211678832116,
975
- "grad_norm": 6.4375,
976
- "learning_rate": 7.341522555726971e-07,
977
- "loss": 0.7536,
978
- "mean_token_accuracy": 0.7757409885525703,
979
- "num_tokens": 188864.0,
980
- "step": 97
981
- },
982
- {
983
- "entropy": 1.2816387563943863,
984
- "epoch": 2.8175182481751824,
985
- "grad_norm": 5.46875,
986
- "learning_rate": 5.810745609252166e-07,
987
- "loss": 0.9127,
988
- "mean_token_accuracy": 0.7290580719709396,
989
- "num_tokens": 190843.0,
990
- "step": 98
991
- },
992
- {
993
- "entropy": 1.4024466425180435,
994
- "epoch": 2.846715328467153,
995
- "grad_norm": 6.71875,
996
- "learning_rate": 4.455641058600529e-07,
997
- "loss": 0.9032,
998
- "mean_token_accuracy": 0.7520110681653023,
999
- "num_tokens": 192230.0,
1000
- "step": 99
1001
- },
1002
- {
1003
- "entropy": 1.354932889342308,
1004
- "epoch": 2.875912408759124,
1005
- "grad_norm": 6.71875,
1006
- "learning_rate": 3.277859889929147e-07,
1007
- "loss": 0.7987,
1008
- "mean_token_accuracy": 0.785490907728672,
1009
- "num_tokens": 193518.0,
1010
- "step": 100
1011
  }
1012
  ],
1013
  "logging_steps": 1,
1014
- "max_steps": 105,
1015
  "num_input_tokens_seen": 0,
1016
- "num_train_epochs": 3,
1017
  "save_steps": 5,
1018
  "stateful_callbacks": {
1019
  "TrainerControl": {
@@ -1027,7 +777,7 @@
1027
  "attributes": {}
1028
  }
1029
  },
1030
- "total_flos": 5186447183892480.0,
1031
  "train_batch_size": 2,
1032
  "trial_name": null,
1033
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.175182481751825,
6
  "eval_steps": 500,
7
+ "global_step": 75,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 2.1336327642202377,
14
+ "epoch": 0.058394160583941604,
15
+ "grad_norm": 16.25,
16
  "learning_rate": 0.0,
17
+ "loss": 2.4507,
18
+ "mean_token_accuracy": 0.4276521671563387,
19
+ "num_tokens": 3890.0,
20
  "step": 1
21
  },
22
  {
23
+ "entropy": 2.222498059272766,
24
+ "epoch": 0.11678832116788321,
25
+ "grad_norm": 17.875,
26
  "learning_rate": 2e-06,
27
+ "loss": 2.6879,
28
+ "mean_token_accuracy": 0.4140724800527096,
29
+ "num_tokens": 6825.0,
30
  "step": 2
31
  },
32
  {
33
+ "entropy": 2.262238770723343,
34
+ "epoch": 0.17518248175182483,
35
+ "grad_norm": 14.25,
36
  "learning_rate": 4e-06,
37
+ "loss": 2.4973,
38
+ "mean_token_accuracy": 0.42698577605187893,
39
+ "num_tokens": 10473.0,
40
  "step": 3
41
  },
42
  {
43
+ "entropy": 2.1316296458244324,
44
+ "epoch": 0.23357664233576642,
45
+ "grad_norm": 10.0,
46
  "learning_rate": 6e-06,
47
+ "loss": 2.0863,
48
+ "mean_token_accuracy": 0.4788516294211149,
49
+ "num_tokens": 15657.0,
50
  "step": 4
51
  },
52
  {
53
+ "entropy": 2.3611446991562843,
54
+ "epoch": 0.291970802919708,
55
+ "grad_norm": 10.1875,
56
  "learning_rate": 8e-06,
57
+ "loss": 2.1751,
58
+ "mean_token_accuracy": 0.4456866979598999,
59
+ "num_tokens": 20159.0,
60
  "step": 5
61
  },
62
  {
63
+ "entropy": 2.460206426680088,
64
+ "epoch": 0.35036496350364965,
65
+ "grad_norm": 8.875,
66
  "learning_rate": 9.999999999999999e-06,
67
+ "loss": 2.2655,
68
+ "mean_token_accuracy": 0.4509607646614313,
69
+ "num_tokens": 23949.0,
70
  "step": 6
71
  },
72
  {
73
+ "entropy": 2.321817234158516,
74
+ "epoch": 0.40875912408759124,
75
+ "grad_norm": 7.125,
76
  "learning_rate": 1.2e-05,
77
+ "loss": 2.0123,
78
+ "mean_token_accuracy": 0.5055391453206539,
79
+ "num_tokens": 27703.0,
80
  "step": 7
81
  },
82
  {
83
+ "entropy": 2.2407592684030533,
84
+ "epoch": 0.46715328467153283,
85
+ "grad_norm": 5.4375,
86
  "learning_rate": 1.4e-05,
87
+ "loss": 1.8516,
88
+ "mean_token_accuracy": 0.5130146574229002,
89
+ "num_tokens": 32243.0,
90
  "step": 8
91
  },
92
  {
93
+ "entropy": 2.46332585811615,
94
+ "epoch": 0.5255474452554745,
95
+ "grad_norm": 7.09375,
96
  "learning_rate": 1.6e-05,
97
+ "loss": 2.0974,
98
+ "mean_token_accuracy": 0.5035313870757818,
99
+ "num_tokens": 35222.0,
100
  "step": 9
101
  },
102
  {
103
+ "entropy": 2.237804166972637,
104
+ "epoch": 0.583941605839416,
105
+ "grad_norm": 5.65625,
106
  "learning_rate": 1.8e-05,
107
+ "loss": 1.7838,
108
+ "mean_token_accuracy": 0.5259560514241457,
109
+ "num_tokens": 39208.0,
110
  "step": 10
111
  },
112
  {
113
+ "entropy": 2.352365091443062,
114
+ "epoch": 0.6423357664233577,
115
+ "grad_norm": 5.84375,
116
  "learning_rate": 1.9999999999999998e-05,
117
+ "loss": 2.0078,
118
+ "mean_token_accuracy": 0.5078456345945597,
119
+ "num_tokens": 42447.0,
120
  "step": 11
121
  },
122
  {
123
+ "entropy": 2.1229992732405663,
124
+ "epoch": 0.7007299270072993,
125
+ "grad_norm": 4.5625,
126
  "learning_rate": 2.2e-05,
127
+ "loss": 1.7155,
128
+ "mean_token_accuracy": 0.5374241229146719,
129
+ "num_tokens": 47138.0,
130
  "step": 12
131
  },
132
  {
133
+ "entropy": 2.121931955218315,
134
+ "epoch": 0.7591240875912408,
135
+ "grad_norm": 4.625,
136
  "learning_rate": 2.4e-05,
137
+ "loss": 1.7379,
138
+ "mean_token_accuracy": 0.5694513749331236,
139
+ "num_tokens": 51009.0,
140
  "step": 13
141
  },
142
  {
143
+ "entropy": 2.085137240588665,
144
+ "epoch": 0.8175182481751825,
145
+ "grad_norm": 4.25,
146
  "learning_rate": 2.6000000000000002e-05,
147
+ "loss": 1.6524,
148
+ "mean_token_accuracy": 0.5468352809548378,
149
+ "num_tokens": 55235.0,
150
  "step": 14
151
  },
152
  {
153
+ "entropy": 2.1976606771349907,
154
+ "epoch": 0.8759124087591241,
155
+ "grad_norm": 5.625,
156
  "learning_rate": 2.8e-05,
157
+ "loss": 1.8096,
158
+ "mean_token_accuracy": 0.5231170020997524,
159
+ "num_tokens": 58219.0,
160
  "step": 15
161
  },
162
  {
163
+ "entropy": 1.9179195016622543,
164
+ "epoch": 0.9343065693430657,
165
+ "grad_norm": 3.84375,
166
  "learning_rate": 3e-05,
167
+ "loss": 1.5974,
168
+ "mean_token_accuracy": 0.5759452320635319,
169
+ "num_tokens": 63057.0,
170
  "step": 16
171
  },
172
  {
173
+ "entropy": 2.0428223088383675,
174
+ "epoch": 0.9927007299270073,
175
+ "grad_norm": 4.53125,
176
+ "learning_rate": 2.9986842451482876e-05,
177
+ "loss": 1.7372,
178
+ "mean_token_accuracy": 0.5385774970054626,
179
+ "num_tokens": 66564.0,
180
  "step": 17
181
  },
182
  {
183
+ "entropy": 1.9300671219825745,
184
+ "epoch": 1.0,
185
+ "grad_norm": 12.9375,
186
+ "learning_rate": 2.9947392888742566e-05,
187
+ "loss": 1.7476,
188
+ "mean_token_accuracy": 0.5453733801841736,
189
+ "num_tokens": 66897.0,
190
  "step": 18
191
  },
192
  {
193
+ "entropy": 1.935000792145729,
194
+ "epoch": 1.0583941605839415,
195
+ "grad_norm": 3.484375,
196
+ "learning_rate": 2.988172051971717e-05,
197
+ "loss": 1.4249,
198
+ "mean_token_accuracy": 0.6101336404681206,
199
+ "num_tokens": 71403.0,
200
  "step": 19
201
  },
202
  {
203
+ "entropy": 2.0335680916905403,
204
+ "epoch": 1.1167883211678833,
205
+ "grad_norm": 3.84375,
206
+ "learning_rate": 2.9789940556057574e-05,
207
+ "loss": 1.5345,
208
+ "mean_token_accuracy": 0.5629026051610708,
209
+ "num_tokens": 75484.0,
210
  "step": 20
211
  },
212
  {
213
+ "entropy": 2.10165449231863,
214
+ "epoch": 1.1751824817518248,
215
+ "grad_norm": 4.0625,
216
+ "learning_rate": 2.9672214011007087e-05,
217
+ "loss": 1.4949,
218
+ "mean_token_accuracy": 0.5799959097057581,
219
+ "num_tokens": 79113.0,
220
  "step": 21
221
  },
222
  {
223
+ "entropy": 1.992341309785843,
224
+ "epoch": 1.2335766423357664,
225
+ "grad_norm": 3.59375,
226
+ "learning_rate": 2.9528747416929467e-05,
227
+ "loss": 1.4678,
228
+ "mean_token_accuracy": 0.5918100215494633,
229
+ "num_tokens": 83095.0,
230
  "step": 22
231
  },
232
  {
233
+ "entropy": 1.9140778183937073,
234
+ "epoch": 1.2919708029197081,
235
+ "grad_norm": 3.375,
236
+ "learning_rate": 2.9359792462981007e-05,
237
+ "loss": 1.4038,
238
+ "mean_token_accuracy": 0.6022733096033335,
239
+ "num_tokens": 87754.0,
240
  "step": 23
241
  },
242
  {
243
+ "entropy": 1.8838416188955307,
244
+ "epoch": 1.3503649635036497,
245
+ "grad_norm": 3.8125,
246
+ "learning_rate": 2.9165645553562215e-05,
247
+ "loss": 1.4554,
248
+ "mean_token_accuracy": 0.6133127138018608,
249
+ "num_tokens": 91666.0,
250
  "step": 24
251
  },
252
  {
253
+ "entropy": 1.816191054880619,
254
+ "epoch": 1.4087591240875912,
255
+ "grad_norm": 3.859375,
256
+ "learning_rate": 2.894664728832377e-05,
257
+ "loss": 1.3643,
258
+ "mean_token_accuracy": 0.6147295907139778,
259
+ "num_tokens": 95819.0,
260
  "step": 25
261
  },
262
  {
263
+ "entropy": 1.7681904509663582,
264
+ "epoch": 1.4671532846715327,
265
+ "grad_norm": 3.609375,
266
+ "learning_rate": 2.8703181864639013e-05,
267
+ "loss": 1.3711,
268
+ "mean_token_accuracy": 0.6297403201460838,
269
+ "num_tokens": 99865.0,
270
  "step": 26
271
  },
272
  {
273
+ "entropy": 1.7096636295318604,
274
+ "epoch": 1.5255474452554745,
275
+ "grad_norm": 3.390625,
276
+ "learning_rate": 2.8435676403591193e-05,
277
+ "loss": 1.3362,
278
+ "mean_token_accuracy": 0.6145001202821732,
279
+ "num_tokens": 104135.0,
280
  "step": 27
281
  },
282
  {
283
+ "entropy": 1.828701414167881,
284
+ "epoch": 1.583941605839416,
285
+ "grad_norm": 4.3125,
286
+ "learning_rate": 2.8144600200657953e-05,
287
+ "loss": 1.4266,
288
+ "mean_token_accuracy": 0.5893764644861221,
289
+ "num_tokens": 107324.0,
290
  "step": 28
291
  },
292
  {
293
+ "entropy": 1.8768919259309769,
294
+ "epoch": 1.6423357664233578,
295
+ "grad_norm": 4.65625,
296
+ "learning_rate": 2.78304639024076e-05,
297
+ "loss": 1.5031,
298
+ "mean_token_accuracy": 0.5983850117772818,
299
+ "num_tokens": 110263.0,
300
  "step": 29
301
  },
302
  {
303
+ "entropy": 1.7338064908981323,
304
+ "epoch": 1.7007299270072993,
305
+ "grad_norm": 4.34375,
306
+ "learning_rate": 2.7493818610651493e-05,
307
+ "loss": 1.4431,
308
+ "mean_token_accuracy": 0.5914898477494717,
309
+ "num_tokens": 113911.0,
310
  "step": 30
311
  },
312
  {
313
+ "entropy": 1.7540361359715462,
314
+ "epoch": 1.7591240875912408,
315
+ "grad_norm": 3.734375,
316
+ "learning_rate": 2.7135254915624213e-05,
317
+ "loss": 1.3489,
318
+ "mean_token_accuracy": 0.6010549142956734,
319
+ "num_tokens": 118007.0,
320
  "step": 31
321
  },
322
  {
323
+ "entropy": 1.8890240713953972,
324
+ "epoch": 1.8175182481751824,
325
+ "grad_norm": 4.65625,
326
+ "learning_rate": 2.6755401859887598e-05,
327
+ "loss": 1.4448,
328
+ "mean_token_accuracy": 0.6083299573510885,
329
+ "num_tokens": 120725.0,
330
  "step": 32
331
  },
332
  {
333
+ "entropy": 1.850830078125,
334
+ "epoch": 1.8759124087591241,
335
+ "grad_norm": 4.28125,
336
+ "learning_rate": 2.6354925834776346e-05,
337
+ "loss": 1.502,
338
+ "mean_token_accuracy": 0.6061263754963875,
339
+ "num_tokens": 124333.0,
340
  "step": 33
341
  },
342
  {
343
+ "entropy": 1.7397000417113304,
344
+ "epoch": 1.9343065693430657,
345
+ "grad_norm": 3.671875,
346
+ "learning_rate": 2.5934529411321174e-05,
347
+ "loss": 1.2539,
348
+ "mean_token_accuracy": 0.6317082159221172,
349
+ "num_tokens": 128615.0,
350
  "step": 34
351
  },
352
  {
353
+ "entropy": 1.813131682574749,
354
+ "epoch": 1.9927007299270074,
355
+ "grad_norm": 3.5625,
356
+ "learning_rate": 2.5494950107700482e-05,
357
+ "loss": 1.3284,
358
+ "mean_token_accuracy": 0.6140319798141718,
359
+ "num_tokens": 132847.0,
360
  "step": 35
361
  },
362
  {
363
+ "entropy": 1.5973615646362305,
364
+ "epoch": 2.0,
365
+ "grad_norm": 7.46875,
366
+ "learning_rate": 2.5036959095382875e-05,
367
+ "loss": 1.2697,
368
+ "mean_token_accuracy": 0.6285321712493896,
369
+ "num_tokens": 133794.0,
370
  "step": 36
371
  },
372
  {
373
+ "entropy": 1.7645720839500427,
374
+ "epoch": 2.0583941605839415,
375
+ "grad_norm": 3.859375,
376
+ "learning_rate": 2.4561359846230346e-05,
377
+ "loss": 1.0785,
378
+ "mean_token_accuracy": 0.6664150357246399,
379
+ "num_tokens": 137554.0,
380
  "step": 37
381
  },
382
  {
383
+ "entropy": 1.753688521683216,
384
+ "epoch": 2.116788321167883,
385
+ "grad_norm": 3.3125,
386
+ "learning_rate": 2.4068986722935625e-05,
387
+ "loss": 1.0716,
388
+ "mean_token_accuracy": 0.6744864694774151,
389
+ "num_tokens": 141721.0,
390
  "step": 38
391
  },
392
  {
393
+ "entropy": 1.6263050064444542,
394
+ "epoch": 2.1751824817518246,
395
+ "grad_norm": 4.3125,
396
+ "learning_rate": 2.356070351526648e-05,
397
+ "loss": 1.0687,
398
+ "mean_token_accuracy": 0.6837072521448135,
399
+ "num_tokens": 146069.0,
400
  "step": 39
401
  },
402
  {
403
+ "entropy": 1.8026663437485695,
404
+ "epoch": 2.2335766423357666,
405
+ "grad_norm": 3.84375,
406
+ "learning_rate": 2.303740192468495e-05,
407
+ "loss": 1.1566,
408
+ "mean_token_accuracy": 0.6734990328550339,
409
+ "num_tokens": 149664.0,
410
  "step": 40
411
  },
412
  {
413
+ "entropy": 1.5874073877930641,
414
+ "epoch": 2.291970802919708,
415
+ "grad_norm": 3.53125,
416
+ "learning_rate": 2.25e-05,
417
+ "loss": 1.0591,
418
+ "mean_token_accuracy": 0.6754884608089924,
419
+ "num_tokens": 153330.0,
420
  "step": 41
421
  },
422
  {
423
+ "entropy": 1.4746350944042206,
424
+ "epoch": 2.3503649635036497,
425
+ "grad_norm": 3.453125,
426
+ "learning_rate": 2.1949440526797928e-05,
427
+ "loss": 0.9312,
428
+ "mean_token_accuracy": 0.7215368486940861,
429
+ "num_tokens": 157396.0,
430
  "step": 42
431
  },
432
  {
433
+ "entropy": 1.4334233030676842,
434
+ "epoch": 2.408759124087591,
435
+ "grad_norm": 6.96875,
436
+ "learning_rate": 2.138668937347609e-05,
437
+ "loss": 0.9952,
438
+ "mean_token_accuracy": 0.7047883793711662,
439
+ "num_tokens": 162386.0,
440
  "step": 43
441
  },
442
  {
443
+ "entropy": 1.4815244674682617,
444
+ "epoch": 2.4671532846715327,
445
+ "grad_norm": 3.9375,
446
+ "learning_rate": 2.0812733796781544e-05,
447
+ "loss": 1.0847,
448
+ "mean_token_accuracy": 0.680337205529213,
449
+ "num_tokens": 166308.0,
450
  "step": 44
451
  },
452
  {
453
+ "entropy": 1.4337811917066574,
454
+ "epoch": 2.5255474452554747,
455
+ "grad_norm": 4.21875,
456
+ "learning_rate": 2.022858070982723e-05,
457
+ "loss": 1.0594,
458
+ "mean_token_accuracy": 0.686751551926136,
459
+ "num_tokens": 169979.0,
460
  "step": 45
461
  },
462
  {
463
+ "entropy": 1.380111612379551,
464
+ "epoch": 2.5839416058394162,
465
+ "grad_norm": 3.984375,
466
+ "learning_rate": 1.963525491562421e-05,
467
+ "loss": 0.9718,
468
+ "mean_token_accuracy": 0.7241853773593903,
469
+ "num_tokens": 174586.0,
470
  "step": 46
471
  },
472
  {
473
+ "entropy": 1.339597962796688,
474
+ "epoch": 2.6423357664233578,
475
+ "grad_norm": 4.0625,
476
+ "learning_rate": 1.9033797309228984e-05,
477
+ "loss": 0.9445,
478
+ "mean_token_accuracy": 0.7082682773470879,
479
+ "num_tokens": 178535.0,
480
  "step": 47
481
  },
482
  {
483
+ "entropy": 1.293665699660778,
484
+ "epoch": 2.7007299270072993,
485
+ "grad_norm": 3.765625,
486
+ "learning_rate": 1.8425263051659838e-05,
487
+ "loss": 0.9213,
488
+ "mean_token_accuracy": 0.7238599583506584,
489
+ "num_tokens": 183350.0,
490
  "step": 48
491
  },
492
  {
493
+ "entropy": 1.3446906879544258,
494
+ "epoch": 2.759124087591241,
495
+ "grad_norm": 4.46875,
496
+ "learning_rate": 1.781071971878587e-05,
497
+ "loss": 0.9652,
498
+ "mean_token_accuracy": 0.6951282061636448,
499
+ "num_tokens": 187493.0,
500
  "step": 49
501
  },
502
  {
503
+ "entropy": 1.3415213227272034,
504
+ "epoch": 2.8175182481751824,
505
+ "grad_norm": 4.8125,
506
+ "learning_rate": 1.7191245428436175e-05,
507
+ "loss": 1.0102,
508
+ "mean_token_accuracy": 0.7021605856716633,
509
+ "num_tokens": 190843.0,
510
  "step": 50
511
  },
512
  {
513
+ "entropy": 1.4499380737543106,
514
+ "epoch": 2.875912408759124,
515
+ "grad_norm": 5.71875,
516
+ "learning_rate": 1.6567926949014805e-05,
517
+ "loss": 1.0649,
518
+ "mean_token_accuracy": 0.7037234976887703,
519
+ "num_tokens": 193518.0,
520
  "step": 51
521
  },
522
  {
523
+ "entropy": 1.3929353207349777,
524
+ "epoch": 2.9343065693430654,
525
+ "grad_norm": 4.75,
526
+ "learning_rate": 1.5941857792939702e-05,
527
+ "loss": 1.0284,
528
+ "mean_token_accuracy": 0.6902767680585384,
529
+ "num_tokens": 196895.0,
530
  "step": 52
531
  },
532
  {
533
+ "entropy": 1.4459699764847755,
534
+ "epoch": 2.9927007299270074,
535
+ "grad_norm": 4.75,
536
+ "learning_rate": 1.5314136298250355e-05,
537
+ "loss": 1.013,
538
+ "mean_token_accuracy": 0.6965249925851822,
539
+ "num_tokens": 200296.0,
540
  "step": 53
541
  },
542
  {
543
+ "entropy": 1.399910032749176,
544
+ "epoch": 3.0,
545
+ "grad_norm": 13.0625,
546
+ "learning_rate": 1.4685863701749648e-05,
547
+ "loss": 1.0552,
548
+ "mean_token_accuracy": 0.6890038251876831,
549
+ "num_tokens": 200691.0,
550
  "step": 54
551
  },
552
  {
553
+ "entropy": 1.3579635098576546,
554
+ "epoch": 3.0583941605839415,
555
+ "grad_norm": 4.28125,
556
+ "learning_rate": 1.40581422070603e-05,
557
+ "loss": 0.7865,
558
+ "mean_token_accuracy": 0.765391580760479,
559
+ "num_tokens": 204197.0,
560
  "step": 55
561
  },
562
  {
563
+ "entropy": 1.411361187696457,
564
+ "epoch": 3.116788321167883,
565
+ "grad_norm": 4.21875,
566
+ "learning_rate": 1.3432073050985201e-05,
567
+ "loss": 0.7665,
568
+ "mean_token_accuracy": 0.7553833983838558,
569
+ "num_tokens": 207610.0,
570
  "step": 56
571
  },
572
  {
573
+ "entropy": 1.3223325684666634,
574
+ "epoch": 3.1751824817518246,
575
+ "grad_norm": 3.71875,
576
+ "learning_rate": 1.2808754571563827e-05,
577
+ "loss": 0.804,
578
+ "mean_token_accuracy": 0.7530029378831387,
579
+ "num_tokens": 211730.0,
580
  "step": 57
581
  },
582
  {
583
+ "entropy": 1.2704328149557114,
584
+ "epoch": 3.2335766423357666,
585
+ "grad_norm": 3.46875,
586
+ "learning_rate": 1.2189280281214128e-05,
587
+ "loss": 0.7542,
588
+ "mean_token_accuracy": 0.775670263916254,
589
+ "num_tokens": 216415.0,
590
  "step": 58
591
  },
592
  {
593
+ "entropy": 1.3555709198117256,
594
+ "epoch": 3.291970802919708,
595
+ "grad_norm": 3.9375,
596
+ "learning_rate": 1.1574736948340163e-05,
597
+ "loss": 0.7992,
598
+ "mean_token_accuracy": 0.7488890923559666,
599
+ "num_tokens": 219953.0,
600
  "step": 59
601
  },
602
  {
603
+ "entropy": 1.2632866501808167,
604
+ "epoch": 3.3503649635036497,
605
+ "grad_norm": 3.578125,
606
+ "learning_rate": 1.0966202690771015e-05,
607
+ "loss": 0.75,
608
+ "mean_token_accuracy": 0.7654453739523888,
609
+ "num_tokens": 224335.0,
610
  "step": 60
611
  },
612
  {
613
+ "entropy": 1.2773741334676743,
614
+ "epoch": 3.408759124087591,
615
+ "grad_norm": 4.125,
616
+ "learning_rate": 1.036474508437579e-05,
617
+ "loss": 0.8394,
618
+ "mean_token_accuracy": 0.7538279145956039,
619
+ "num_tokens": 228300.0,
620
  "step": 61
621
  },
622
  {
623
+ "entropy": 1.2203935906291008,
624
+ "epoch": 3.4671532846715327,
625
+ "grad_norm": 4.3125,
626
+ "learning_rate": 9.771419290172776e-06,
627
+ "loss": 0.7866,
628
+ "mean_token_accuracy": 0.7759390734136105,
629
+ "num_tokens": 231820.0,
630
  "step": 62
631
  },
632
  {
633
+ "entropy": 1.2281916178762913,
634
+ "epoch": 3.5255474452554747,
635
+ "grad_norm": 4.5,
636
+ "learning_rate": 9.187266203218457e-06,
637
+ "loss": 0.7456,
638
+ "mean_token_accuracy": 0.7896540127694607,
639
+ "num_tokens": 235502.0,
640
  "step": 63
641
  },
642
  {
643
+ "entropy": 1.1479723155498505,
644
+ "epoch": 3.5839416058394162,
645
+ "grad_norm": 3.84375,
646
+ "learning_rate": 8.61331062652391e-06,
647
+ "loss": 0.6779,
648
+ "mean_token_accuracy": 0.7954859808087349,
649
+ "num_tokens": 239847.0,
650
  "step": 64
651
  },
652
  {
653
+ "entropy": 1.227071214467287,
654
+ "epoch": 3.6423357664233578,
655
+ "grad_norm": 4.78125,
656
+ "learning_rate": 8.050559473202078e-06,
657
+ "loss": 0.7642,
658
+ "mean_token_accuracy": 0.7581925354897976,
659
+ "num_tokens": 243356.0,
660
  "step": 65
661
  },
662
  {
663
+ "entropy": 1.131257489323616,
664
+ "epoch": 3.7007299270072993,
665
+ "grad_norm": 3.5625,
666
+ "learning_rate": 7.500000000000004e-06,
667
+ "loss": 0.7819,
668
+ "mean_token_accuracy": 0.7654204778373241,
669
+ "num_tokens": 249682.0,
670
  "step": 66
671
  },
672
  {
673
+ "entropy": 1.16723557934165,
674
+ "epoch": 3.759124087591241,
675
+ "grad_norm": 4.5,
676
+ "learning_rate": 6.962598075315047e-06,
677
+ "loss": 0.6689,
678
+ "mean_token_accuracy": 0.783266007900238,
679
+ "num_tokens": 253238.0,
680
  "step": 67
681
  },
682
  {
683
+ "entropy": 1.2070689871907234,
684
+ "epoch": 3.8175182481751824,
685
+ "grad_norm": 5.1875,
686
+ "learning_rate": 6.439296484733526e-06,
687
+ "loss": 0.7421,
688
+ "mean_token_accuracy": 0.7796755991876125,
689
+ "num_tokens": 256423.0,
690
  "step": 68
691
  },
692
  {
693
+ "entropy": 1.1488405130803585,
694
+ "epoch": 3.875912408759124,
695
+ "grad_norm": 5.34375,
696
+ "learning_rate": 5.931013277064377e-06,
697
+ "loss": 0.7267,
698
+ "mean_token_accuracy": 0.7691169492900372,
699
+ "num_tokens": 259934.0,
700
  "step": 69
701
  },
702
  {
703
+ "entropy": 1.130510926246643,
704
+ "epoch": 3.9343065693430654,
705
+ "grad_norm": 5.25,
706
+ "learning_rate": 5.438640153769654e-06,
707
+ "loss": 0.7209,
708
+ "mean_token_accuracy": 0.7871466726064682,
709
+ "num_tokens": 263187.0,
710
  "step": 70
711
  },
712
  {
713
+ "entropy": 1.1477855034172535,
714
+ "epoch": 3.9927007299270074,
715
+ "grad_norm": 4.75,
716
+ "learning_rate": 4.963040904617131e-06,
717
+ "loss": 0.7762,
718
+ "mean_token_accuracy": 0.7656804099678993,
719
+ "num_tokens": 267097.0,
720
  "step": 71
721
  },
722
  {
723
+ "entropy": 1.09878408908844,
724
+ "epoch": 4.0,
725
+ "grad_norm": 12.875,
726
+ "learning_rate": 4.505049892299517e-06,
727
+ "loss": 0.7072,
728
+ "mean_token_accuracy": 0.7617444694042206,
729
+ "num_tokens": 267588.0,
730
  "step": 72
731
  },
732
  {
733
+ "entropy": 1.0318926461040974,
734
+ "epoch": 4.0583941605839415,
735
+ "grad_norm": 4.28125,
736
+ "learning_rate": 4.06547058867883e-06,
737
+ "loss": 0.5992,
738
+ "mean_token_accuracy": 0.8166146464645863,
739
+ "num_tokens": 271589.0,
740
  "step": 73
741
  },
742
  {
743
+ "entropy": 1.1504660807549953,
744
+ "epoch": 4.116788321167883,
745
+ "grad_norm": 4.78125,
746
+ "learning_rate": 3.645074165223656e-06,
747
+ "loss": 0.606,
748
+ "mean_token_accuracy": 0.8282722532749176,
749
+ "num_tokens": 274468.0,
750
  "step": 74
751
  },
752
  {
753
+ "entropy": 1.1046061255037785,
754
+ "epoch": 4.175182481751825,
755
+ "grad_norm": 3.671875,
756
+ "learning_rate": 3.244598140112404e-06,
757
+ "loss": 0.6325,
758
+ "mean_token_accuracy": 0.8047133162617683,
759
+ "num_tokens": 278830.0,
760
  "step": 75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
  }
762
  ],
763
  "logging_steps": 1,
764
+ "max_steps": 90,
765
  "num_input_tokens_seen": 0,
766
+ "num_train_epochs": 5,
767
  "save_steps": 5,
768
  "stateful_callbacks": {
769
  "TrainerControl": {
 
777
  "attributes": {}
778
  }
779
  },
780
+ "total_flos": 7471994807169024.0,
781
  "train_batch_size": 2,
782
  "trial_name": null,
783
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:219c136550741cd19a46976f6919256d0586a1eb4e41646e6baa81475a2cc056
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4299c868efdf07b9f67c43aca6993615cee8602c1155c2c9e52cf027fcd29126
3
  size 6353