File size: 14,653 Bytes
a14fae7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.4624954262714964,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 0.7402557095512747,
      "epoch": 0.029271862422246615,
      "grad_norm": 29.125,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 0.8764,
      "mean_token_accuracy": 0.8633198849856853,
      "num_tokens": 178219.0,
      "step": 10
    },
    {
      "entropy": 0.6707062933593988,
      "epoch": 0.05854372484449323,
      "grad_norm": 29.625,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 0.7056,
      "mean_token_accuracy": 0.8833413228392601,
      "num_tokens": 364483.0,
      "step": 20
    },
    {
      "entropy": 0.6742518592625857,
      "epoch": 0.08781558726673985,
      "grad_norm": 8.4375,
      "learning_rate": 5.8e-06,
      "loss": 0.4525,
      "mean_token_accuracy": 0.9048154704272747,
      "num_tokens": 544552.0,
      "step": 30
    },
    {
      "entropy": 0.7069467414170504,
      "epoch": 0.11708744968898646,
      "grad_norm": 10.0,
      "learning_rate": 7.800000000000002e-06,
      "loss": 0.29,
      "mean_token_accuracy": 0.9340683862566947,
      "num_tokens": 717853.0,
      "step": 40
    },
    {
      "entropy": 0.6330054465681314,
      "epoch": 0.14635931211123307,
      "grad_norm": 2.453125,
      "learning_rate": 9.800000000000001e-06,
      "loss": 0.1453,
      "mean_token_accuracy": 0.9656811647117138,
      "num_tokens": 892769.0,
      "step": 50
    },
    {
      "entropy": 0.6911797292530537,
      "epoch": 0.1756311745334797,
      "grad_norm": 4.71875,
      "learning_rate": 9.997902051783373e-06,
      "loss": 0.1719,
      "mean_token_accuracy": 0.9569758839905262,
      "num_tokens": 1062544.0,
      "step": 60
    },
    {
      "entropy": 0.658262687176466,
      "epoch": 0.20490303695572631,
      "grad_norm": 8.9375,
      "learning_rate": 9.990652145366113e-06,
      "loss": 0.1144,
      "mean_token_accuracy": 0.971483013778925,
      "num_tokens": 1238635.0,
      "step": 70
    },
    {
      "entropy": 0.6544046986848115,
      "epoch": 0.23417489937797292,
      "grad_norm": 11.125,
      "learning_rate": 9.978231889316302e-06,
      "loss": 0.117,
      "mean_token_accuracy": 0.9697644971311092,
      "num_tokens": 1422527.0,
      "step": 80
    },
    {
      "entropy": 0.6321761136874556,
      "epoch": 0.26344676180021953,
      "grad_norm": 5.8125,
      "learning_rate": 9.960654151103846e-06,
      "loss": 0.1172,
      "mean_token_accuracy": 0.9699941977858544,
      "num_tokens": 1604412.0,
      "step": 90
    },
    {
      "entropy": 0.6520156674087048,
      "epoch": 0.29271862422246614,
      "grad_norm": 2.75,
      "learning_rate": 9.937937141385323e-06,
      "loss": 0.127,
      "mean_token_accuracy": 0.9669150829315185,
      "num_tokens": 1785808.0,
      "step": 100
    },
    {
      "entropy": 0.7009117918089032,
      "epoch": 0.32199048664471275,
      "grad_norm": 4.5625,
      "learning_rate": 9.91010439513761e-06,
      "loss": 0.166,
      "mean_token_accuracy": 0.9596201993525029,
      "num_tokens": 1963662.0,
      "step": 110
    },
    {
      "entropy": 0.699804374948144,
      "epoch": 0.3512623490669594,
      "grad_norm": 4.5,
      "learning_rate": 9.87718474727549e-06,
      "loss": 0.1242,
      "mean_token_accuracy": 0.9688111513853073,
      "num_tokens": 2138861.0,
      "step": 120
    },
    {
      "entropy": 0.6593301335349679,
      "epoch": 0.380534211489206,
      "grad_norm": 3.828125,
      "learning_rate": 9.839212302778493e-06,
      "loss": 0.0984,
      "mean_token_accuracy": 0.9735250800848008,
      "num_tokens": 2314376.0,
      "step": 130
    },
    {
      "entropy": 0.6733343230560422,
      "epoch": 0.40980607391145263,
      "grad_norm": 4.03125,
      "learning_rate": 9.796226401357884e-06,
      "loss": 0.1292,
      "mean_token_accuracy": 0.9677626974880695,
      "num_tokens": 2494288.0,
      "step": 140
    },
    {
      "entropy": 0.6483910661190748,
      "epoch": 0.43907793633369924,
      "grad_norm": 9.3125,
      "learning_rate": 9.748271576700476e-06,
      "loss": 0.1171,
      "mean_token_accuracy": 0.9706188321113587,
      "num_tokens": 2682554.0,
      "step": 150
    },
    {
      "entropy": 0.6520246665924787,
      "epoch": 0.46834979875594585,
      "grad_norm": 3.15625,
      "learning_rate": 9.69539751033141e-06,
      "loss": 0.1038,
      "mean_token_accuracy": 0.9712590672075748,
      "num_tokens": 2855437.0,
      "step": 160
    },
    {
      "entropy": 0.6615954734385013,
      "epoch": 0.49762166117819245,
      "grad_norm": 7.0625,
      "learning_rate": 9.637658980143771e-06,
      "loss": 0.0976,
      "mean_token_accuracy": 0.9749271534383297,
      "num_tokens": 3034970.0,
      "step": 170
    },
    {
      "entropy": 0.6598344139754773,
      "epoch": 0.5268935236004391,
      "grad_norm": 3.59375,
      "learning_rate": 9.575115803648303e-06,
      "loss": 0.1018,
      "mean_token_accuracy": 0.9722783856093884,
      "num_tokens": 3208412.0,
      "step": 180
    },
    {
      "entropy": 0.681413133814931,
      "epoch": 0.5561653860226857,
      "grad_norm": 3.78125,
      "learning_rate": 9.507832776002069e-06,
      "loss": 0.106,
      "mean_token_accuracy": 0.9714638628065586,
      "num_tokens": 3395494.0,
      "step": 190
    },
    {
      "entropy": 0.7187394430860877,
      "epoch": 0.5854372484449323,
      "grad_norm": 2.65625,
      "learning_rate": 9.43587960288023e-06,
      "loss": 0.1434,
      "mean_token_accuracy": 0.9640420243144036,
      "num_tokens": 3573674.0,
      "step": 200
    },
    {
      "entropy": 0.7018351562321186,
      "epoch": 0.6147091108671789,
      "grad_norm": 4.9375,
      "learning_rate": 9.359330828260477e-06,
      "loss": 0.1488,
      "mean_token_accuracy": 0.9625116638839245,
      "num_tokens": 3749797.0,
      "step": 210
    },
    {
      "entropy": 0.6656203528866171,
      "epoch": 0.6439809732894255,
      "grad_norm": 22.0,
      "learning_rate": 9.278265757194983e-06,
      "loss": 0.1026,
      "mean_token_accuracy": 0.9728666849434375,
      "num_tokens": 3927226.0,
      "step": 220
    },
    {
      "entropy": 0.7618660530075431,
      "epoch": 0.6732528357116722,
      "grad_norm": 7.3125,
      "learning_rate": 9.1927683736498e-06,
      "loss": 0.1411,
      "mean_token_accuracy": 0.962882998585701,
      "num_tokens": 4107295.0,
      "step": 230
    },
    {
      "entropy": 0.6730721700936556,
      "epoch": 0.7025246981339188,
      "grad_norm": 11.0,
      "learning_rate": 9.102927253496926e-06,
      "loss": 0.1263,
      "mean_token_accuracy": 0.9679374843835831,
      "num_tokens": 4286908.0,
      "step": 240
    },
    {
      "entropy": 0.6370995994657278,
      "epoch": 0.7317965605561654,
      "grad_norm": 6.21875,
      "learning_rate": 9.008835472749085e-06,
      "loss": 0.0971,
      "mean_token_accuracy": 0.9734279833734035,
      "num_tokens": 4467461.0,
      "step": 250
    },
    {
      "entropy": 0.626228180155158,
      "epoch": 0.761068422978412,
      "grad_norm": 2.90625,
      "learning_rate": 8.910590511132339e-06,
      "loss": 0.0872,
      "mean_token_accuracy": 0.976653154194355,
      "num_tokens": 4646737.0,
      "step": 260
    },
    {
      "entropy": 0.6526400525122881,
      "epoch": 0.7903402854006586,
      "grad_norm": 8.5625,
      "learning_rate": 8.808294151096436e-06,
      "loss": 0.1004,
      "mean_token_accuracy": 0.9733942933380604,
      "num_tokens": 4821010.0,
      "step": 270
    },
    {
      "entropy": 0.5949247144162655,
      "epoch": 0.8196121478229053,
      "grad_norm": 4.625,
      "learning_rate": 8.702052372367496e-06,
      "loss": 0.0946,
      "mean_token_accuracy": 0.9757313847541809,
      "num_tokens": 4999440.0,
      "step": 280
    },
    {
      "entropy": 0.6587195005267859,
      "epoch": 0.8488840102451518,
      "grad_norm": 9.75,
      "learning_rate": 8.591975242152293e-06,
      "loss": 0.1173,
      "mean_token_accuracy": 0.9692915640771389,
      "num_tokens": 5171265.0,
      "step": 290
    },
    {
      "entropy": 0.6832986019551754,
      "epoch": 0.8781558726673985,
      "grad_norm": 9.75,
      "learning_rate": 8.478176801107872e-06,
      "loss": 0.132,
      "mean_token_accuracy": 0.9648732647299767,
      "num_tokens": 5338761.0,
      "step": 300
    },
    {
      "entropy": 0.6479978006333112,
      "epoch": 0.907427735089645,
      "grad_norm": 6.59375,
      "learning_rate": 8.360774945194666e-06,
      "loss": 0.1102,
      "mean_token_accuracy": 0.9708870485424995,
      "num_tokens": 5515062.0,
      "step": 310
    },
    {
      "entropy": 0.6245086956769228,
      "epoch": 0.9366995975118917,
      "grad_norm": 33.75,
      "learning_rate": 8.239891303535457e-06,
      "loss": 0.1039,
      "mean_token_accuracy": 0.9724696420133114,
      "num_tokens": 5696911.0,
      "step": 320
    },
    {
      "entropy": 0.6312450472265482,
      "epoch": 0.9659714599341384,
      "grad_norm": 6.3125,
      "learning_rate": 8.1156511124068e-06,
      "loss": 0.0879,
      "mean_token_accuracy": 0.9753462255001069,
      "num_tokens": 5869301.0,
      "step": 330
    },
    {
      "entropy": 0.6530843697488308,
      "epoch": 0.9952433223563849,
      "grad_norm": 5.40625,
      "learning_rate": 7.988183085493362e-06,
      "loss": 0.1105,
      "mean_token_accuracy": 0.969614065438509,
      "num_tokens": 6044424.0,
      "step": 340
    },
    {
      "entropy": 0.6368226534747458,
      "epoch": 1.0234174899377972,
      "grad_norm": 2.8125,
      "learning_rate": 7.85761928053969e-06,
      "loss": 0.0885,
      "mean_token_accuracy": 0.9756075329594798,
      "num_tokens": 6217116.0,
      "step": 350
    },
    {
      "entropy": 0.6533633038401604,
      "epoch": 1.0526893523600438,
      "grad_norm": 4.09375,
      "learning_rate": 7.72409496253747e-06,
      "loss": 0.1008,
      "mean_token_accuracy": 0.9739545792341232,
      "num_tokens": 6387512.0,
      "step": 360
    },
    {
      "entropy": 0.6239726161584258,
      "epoch": 1.0819612147822906,
      "grad_norm": 3.421875,
      "learning_rate": 7.5877484635900876e-06,
      "loss": 0.1032,
      "mean_token_accuracy": 0.9725366532802582,
      "num_tokens": 6573367.0,
      "step": 370
    },
    {
      "entropy": 0.6464782979339361,
      "epoch": 1.1112330772045371,
      "grad_norm": 2.640625,
      "learning_rate": 7.448721039599616e-06,
      "loss": 0.1105,
      "mean_token_accuracy": 0.9710234113037586,
      "num_tokens": 6751810.0,
      "step": 380
    },
    {
      "entropy": 0.6139115165919066,
      "epoch": 1.1405049396267837,
      "grad_norm": 7.84375,
      "learning_rate": 7.307156723924742e-06,
      "loss": 0.0907,
      "mean_token_accuracy": 0.9750210918486119,
      "num_tokens": 6940710.0,
      "step": 390
    },
    {
      "entropy": 0.6495381936430931,
      "epoch": 1.1697768020490305,
      "grad_norm": 2.578125,
      "learning_rate": 7.1632021781612305e-06,
      "loss": 0.0888,
      "mean_token_accuracy": 0.9754416085779667,
      "num_tokens": 7120616.0,
      "step": 400
    },
    {
      "entropy": 0.6580816496163606,
      "epoch": 1.199048664471277,
      "grad_norm": 4.53125,
      "learning_rate": 7.017006540199501e-06,
      "loss": 0.1054,
      "mean_token_accuracy": 0.9714232549071312,
      "num_tokens": 7295346.0,
      "step": 410
    },
    {
      "entropy": 0.6860831430181861,
      "epoch": 1.2283205268935236,
      "grad_norm": 39.75,
      "learning_rate": 6.8687212697167685e-06,
      "loss": 0.1315,
      "mean_token_accuracy": 0.9658499620854855,
      "num_tokens": 7473599.0,
      "step": 420
    },
    {
      "entropy": 0.6426566727459431,
      "epoch": 1.25759238931577,
      "grad_norm": 2.640625,
      "learning_rate": 6.718499991263776e-06,
      "loss": 0.0855,
      "mean_token_accuracy": 0.9763770438730717,
      "num_tokens": 7666324.0,
      "step": 430
    },
    {
      "entropy": 0.6955473996698857,
      "epoch": 1.2868642517380169,
      "grad_norm": 51.75,
      "learning_rate": 6.566498335108719e-06,
      "loss": 0.148,
      "mean_token_accuracy": 0.9621038816869258,
      "num_tokens": 7842308.0,
      "step": 440
    },
    {
      "entropy": 0.6305533852428198,
      "epoch": 1.3161361141602634,
      "grad_norm": 6.84375,
      "learning_rate": 6.412873776003224e-06,
      "loss": 0.0829,
      "mean_token_accuracy": 0.9776231050491333,
      "num_tokens": 8027201.0,
      "step": 450
    },
    {
      "entropy": 0.6824685353785753,
      "epoch": 1.34540797658251,
      "grad_norm": 11.0,
      "learning_rate": 6.2577854700374326e-06,
      "loss": 0.0748,
      "mean_token_accuracy": 0.9776533439755439,
      "num_tokens": 8198350.0,
      "step": 460
    },
    {
      "entropy": 0.6491625333204866,
      "epoch": 1.3746798390047568,
      "grad_norm": 2.9375,
      "learning_rate": 6.101394089753215e-06,
      "loss": 0.096,
      "mean_token_accuracy": 0.9740288414061069,
      "num_tokens": 8384460.0,
      "step": 470
    },
    {
      "entropy": 0.6950589299201966,
      "epoch": 1.4039517014270033,
      "grad_norm": 2.265625,
      "learning_rate": 5.9438616576863085e-06,
      "loss": 0.0866,
      "mean_token_accuracy": 0.9760893404483795,
      "num_tokens": 8555391.0,
      "step": 480
    },
    {
      "entropy": 0.681521375477314,
      "epoch": 1.4332235638492499,
      "grad_norm": 8.375,
      "learning_rate": 5.785351378509875e-06,
      "loss": 0.0922,
      "mean_token_accuracy": 0.9748925127089023,
      "num_tokens": 8734253.0,
      "step": 490
    },
    {
      "entropy": 0.7467833517119289,
      "epoch": 1.4624954262714964,
      "grad_norm": 5.125,
      "learning_rate": 5.626027469953345e-06,
      "loss": 0.1173,
      "mean_token_accuracy": 0.9686291612684726,
      "num_tokens": 8910777.0,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 1026,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.1634477360256102e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}