Rubywong123 commited on
Commit
ec97ce5
·
verified ·
1 Parent(s): ef8c0a3

Upload trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_state.json +268 -457
trainer_state.json CHANGED
@@ -1,667 +1,478 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9991872121376321,
5
  "eval_steps": 500,
6
- "global_step": 922,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.001083717149823896,
13
- "grad_norm": 5.318704129667974,
14
- "learning_rate": 1.0752688172043012e-07,
15
- "loss": 1.4453,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.01083717149823896,
20
- "grad_norm": 3.1837302909790695,
21
- "learning_rate": 1.0752688172043011e-06,
22
- "loss": 1.3469,
23
  "step": 10
24
  },
25
  {
26
- "epoch": 0.02167434299647792,
27
- "grad_norm": 2.0057434878954625,
28
- "learning_rate": 2.1505376344086023e-06,
29
- "loss": 0.92,
30
  "step": 20
31
  },
32
  {
33
- "epoch": 0.03251151449471688,
34
- "grad_norm": 0.42015507215706815,
35
- "learning_rate": 3.225806451612903e-06,
36
- "loss": 0.4279,
37
  "step": 30
38
  },
39
  {
40
- "epoch": 0.04334868599295584,
41
- "grad_norm": 0.38825656740560066,
42
- "learning_rate": 4.3010752688172045e-06,
43
- "loss": 0.3302,
44
  "step": 40
45
  },
46
  {
47
- "epoch": 0.0541858574911948,
48
- "grad_norm": 0.3611453261381451,
49
- "learning_rate": 5.376344086021506e-06,
50
- "loss": 0.2733,
51
  "step": 50
52
  },
53
  {
54
- "epoch": 0.06502302898943375,
55
- "grad_norm": 0.3521773847454405,
56
- "learning_rate": 6.451612903225806e-06,
57
- "loss": 0.2653,
58
  "step": 60
59
  },
60
  {
61
- "epoch": 0.07586020048767272,
62
- "grad_norm": 0.3064772852895303,
63
- "learning_rate": 7.526881720430108e-06,
64
- "loss": 0.206,
65
  "step": 70
66
  },
67
  {
68
- "epoch": 0.08669737198591168,
69
- "grad_norm": 0.2736081121891908,
70
- "learning_rate": 8.602150537634409e-06,
71
- "loss": 0.2117,
72
  "step": 80
73
  },
74
  {
75
- "epoch": 0.09753454348415064,
76
- "grad_norm": 0.21841630964764266,
77
- "learning_rate": 9.67741935483871e-06,
78
- "loss": 0.1983,
79
  "step": 90
80
  },
81
  {
82
- "epoch": 0.1083717149823896,
83
- "grad_norm": 0.24938850336126014,
84
- "learning_rate": 9.998240856349384e-06,
85
- "loss": 0.218,
86
  "step": 100
87
  },
88
  {
89
- "epoch": 0.11920888648062855,
90
- "grad_norm": 0.2637157927920438,
91
- "learning_rate": 9.989627622294385e-06,
92
- "loss": 0.1933,
93
  "step": 110
94
  },
95
  {
96
- "epoch": 0.1300460579788675,
97
- "grad_norm": 0.1837162492085093,
98
- "learning_rate": 9.97384954250576e-06,
99
- "loss": 0.2105,
100
  "step": 120
101
  },
102
  {
103
- "epoch": 0.14088322947710646,
104
- "grad_norm": 0.1698850784961916,
105
- "learning_rate": 9.950929273485405e-06,
106
- "loss": 0.2108,
107
  "step": 130
108
  },
109
  {
110
- "epoch": 0.15172040097534545,
111
- "grad_norm": 0.18875484792589542,
112
- "learning_rate": 9.920899727547446e-06,
113
- "loss": 0.1929,
114
  "step": 140
115
  },
116
  {
117
- "epoch": 0.1625575724735844,
118
- "grad_norm": 0.2072673883751343,
119
- "learning_rate": 9.883804025557889e-06,
120
- "loss": 0.1783,
121
  "step": 150
122
  },
123
  {
124
- "epoch": 0.17339474397182336,
125
- "grad_norm": 0.1742168135536402,
126
- "learning_rate": 9.83969543501528e-06,
127
- "loss": 0.1822,
128
  "step": 160
129
  },
130
  {
131
- "epoch": 0.18423191547006232,
132
- "grad_norm": 0.20806507412526298,
133
- "learning_rate": 9.788637293561363e-06,
134
- "loss": 0.185,
135
  "step": 170
136
  },
137
  {
138
- "epoch": 0.19506908696830128,
139
- "grad_norm": 0.16955657764451743,
140
- "learning_rate": 9.730702918031512e-06,
141
- "loss": 0.1919,
142
  "step": 180
143
  },
144
  {
145
- "epoch": 0.20590625846654023,
146
- "grad_norm": 0.14025321301320962,
147
- "learning_rate": 9.665975499175571e-06,
148
- "loss": 0.1841,
149
  "step": 190
150
  },
151
  {
152
- "epoch": 0.2167434299647792,
153
- "grad_norm": 0.18097982224244188,
154
- "learning_rate": 9.594547982200266e-06,
155
- "loss": 0.1687,
156
  "step": 200
157
  },
158
  {
159
- "epoch": 0.22758060146301815,
160
- "grad_norm": 0.15675428424195145,
161
- "learning_rate": 9.516522933304721e-06,
162
- "loss": 0.1753,
163
  "step": 210
164
  },
165
  {
166
- "epoch": 0.2384177729612571,
167
- "grad_norm": 0.18314622330890096,
168
- "learning_rate": 9.432012392400734e-06,
169
- "loss": 0.1907,
170
  "step": 220
171
  },
172
  {
173
- "epoch": 0.24925494445949606,
174
- "grad_norm": 0.1436690313077452,
175
- "learning_rate": 9.341137712229282e-06,
176
- "loss": 0.1849,
177
  "step": 230
178
  },
179
  {
180
- "epoch": 0.260092115957735,
181
- "grad_norm": 0.13861916994808463,
182
- "learning_rate": 9.244029384104311e-06,
183
- "loss": 0.1785,
184
  "step": 240
185
  },
186
  {
187
- "epoch": 0.270929287455974,
188
- "grad_norm": 0.17179143403732375,
189
- "learning_rate": 9.140826850533989e-06,
190
- "loss": 0.1913,
191
  "step": 250
192
  },
193
  {
194
- "epoch": 0.28176645895421293,
195
- "grad_norm": 0.1814911526750455,
196
- "learning_rate": 9.031678304988509e-06,
197
- "loss": 0.1933,
198
  "step": 260
199
  },
200
  {
201
- "epoch": 0.2926036304524519,
202
- "grad_norm": 0.15112329360502205,
203
- "learning_rate": 8.916740479101994e-06,
204
- "loss": 0.1974,
205
  "step": 270
206
  },
207
  {
208
- "epoch": 0.3034408019506909,
209
- "grad_norm": 0.17817706909190312,
210
- "learning_rate": 8.796178417614008e-06,
211
- "loss": 0.1728,
212
  "step": 280
213
  },
214
  {
215
- "epoch": 0.3142779734489298,
216
- "grad_norm": 0.14475898393067627,
217
- "learning_rate": 8.670165241373891e-06,
218
- "loss": 0.1628,
219
  "step": 290
220
  },
221
  {
222
- "epoch": 0.3251151449471688,
223
- "grad_norm": 0.16367319934485383,
224
- "learning_rate": 8.53888189874824e-06,
225
- "loss": 0.1877,
226
  "step": 300
227
  },
228
  {
229
- "epoch": 0.33595231644540774,
230
- "grad_norm": 0.20564177370209544,
231
- "learning_rate": 8.402516905788455e-06,
232
- "loss": 0.1777,
233
  "step": 310
234
  },
235
  {
236
- "epoch": 0.3467894879436467,
237
- "grad_norm": 0.11306543588575382,
238
- "learning_rate": 8.261266075531494e-06,
239
- "loss": 0.1618,
240
  "step": 320
241
  },
242
  {
243
- "epoch": 0.35762665944188565,
244
- "grad_norm": 0.16204721387143328,
245
- "learning_rate": 8.115332236822542e-06,
246
- "loss": 0.1553,
247
  "step": 330
248
  },
249
  {
250
- "epoch": 0.36846383094012464,
251
- "grad_norm": 0.15978905440538324,
252
- "learning_rate": 7.964924943063342e-06,
253
- "loss": 0.1639,
254
  "step": 340
255
  },
256
  {
257
- "epoch": 0.37930100243836357,
258
- "grad_norm": 0.12238912187028675,
259
- "learning_rate": 7.8102601713044e-06,
260
- "loss": 0.1675,
261
  "step": 350
262
  },
263
  {
264
- "epoch": 0.39013817393660255,
265
- "grad_norm": 0.16586988923283047,
266
- "learning_rate": 7.651560012113183e-06,
267
- "loss": 0.1718,
268
  "step": 360
269
  },
270
  {
271
- "epoch": 0.4009753454348415,
272
- "grad_norm": 0.13360698107801403,
273
- "learning_rate": 7.489052350663611e-06,
274
- "loss": 0.1528,
275
  "step": 370
276
  },
277
  {
278
- "epoch": 0.41181251693308046,
279
- "grad_norm": 0.13665156197787387,
280
- "learning_rate": 7.322970539504802e-06,
281
- "loss": 0.1701,
282
  "step": 380
283
  },
284
  {
285
- "epoch": 0.42264968843131945,
286
- "grad_norm": 0.1460295753335572,
287
- "learning_rate": 7.153553063478953e-06,
288
- "loss": 0.1812,
289
  "step": 390
290
  },
291
  {
292
- "epoch": 0.4334868599295584,
293
- "grad_norm": 0.1783506259649917,
294
- "learning_rate": 6.981043197269504e-06,
295
- "loss": 0.1864,
296
  "step": 400
297
  },
298
  {
299
- "epoch": 0.44432403142779736,
300
- "grad_norm": 0.11762381396049168,
301
- "learning_rate": 6.805688656071354e-06,
302
- "loss": 0.1723,
303
  "step": 410
304
  },
305
  {
306
- "epoch": 0.4551612029260363,
307
- "grad_norm": 0.16980960385767685,
308
- "learning_rate": 6.627741239884716e-06,
309
- "loss": 0.1607,
310
  "step": 420
311
  },
312
  {
313
- "epoch": 0.4659983744242753,
314
- "grad_norm": 0.15882498938630946,
315
- "learning_rate": 6.447456471943428e-06,
316
- "loss": 0.1766,
317
  "step": 430
318
  },
319
  {
320
- "epoch": 0.4768355459225142,
321
- "grad_norm": 0.13005948970904896,
322
- "learning_rate": 6.265093231796864e-06,
323
- "loss": 0.179,
324
  "step": 440
325
  },
326
  {
327
- "epoch": 0.4876727174207532,
328
- "grad_norm": 0.16307475291475648,
329
- "learning_rate": 6.080913383572378e-06,
330
- "loss": 0.1666,
331
  "step": 450
332
  },
333
  {
334
- "epoch": 0.4985098889189921,
335
- "grad_norm": 0.17614846187699734,
336
- "learning_rate": 5.895181399952038e-06,
337
- "loss": 0.168,
338
  "step": 460
339
  },
340
  {
341
- "epoch": 0.5093470604172311,
342
- "grad_norm": 0.11637756882099502,
343
- "learning_rate": 5.708163982403601e-06,
344
- "loss": 0.1582,
345
  "step": 470
346
  },
347
  {
348
- "epoch": 0.52018423191547,
349
- "grad_norm": 0.09582274164381725,
350
- "learning_rate": 5.520129678211085e-06,
351
- "loss": 0.1533,
352
  "step": 480
353
  },
354
  {
355
- "epoch": 0.5310214034137091,
356
- "grad_norm": 0.1882573486521579,
357
- "learning_rate": 5.331348494854841e-06,
358
- "loss": 0.1643,
359
  "step": 490
360
  },
361
  {
362
- "epoch": 0.541858574911948,
363
- "grad_norm": 0.15334776587401486,
364
- "learning_rate": 5.1420915122948445e-06,
365
- "loss": 0.1857,
366
  "step": 500
367
  },
368
  {
369
- "epoch": 0.5526957464101869,
370
- "grad_norm": 0.13824840025643104,
371
- "learning_rate": 4.952630493713985e-06,
372
- "loss": 0.1417,
373
  "step": 510
374
  },
375
  {
376
- "epoch": 0.5635329179084259,
377
- "grad_norm": 0.15156143562433952,
378
- "learning_rate": 4.7632374952802575e-06,
379
- "loss": 0.1659,
380
  "step": 520
381
  },
382
  {
383
- "epoch": 0.5743700894066649,
384
- "grad_norm": 0.13535886605506386,
385
- "learning_rate": 4.5741844754882746e-06,
386
- "loss": 0.1576,
387
  "step": 530
388
  },
389
  {
390
- "epoch": 0.5852072609049038,
391
- "grad_norm": 0.15151732524219252,
392
- "learning_rate": 4.385742904640993e-06,
393
- "loss": 0.164,
394
  "step": 540
395
  },
396
  {
397
- "epoch": 0.5960444324031428,
398
- "grad_norm": 0.12194150826949314,
399
- "learning_rate": 4.198183375032493e-06,
400
- "loss": 0.1553,
401
  "step": 550
402
  },
403
  {
404
- "epoch": 0.6068816039013818,
405
- "grad_norm": 0.11452120572129604,
406
- "learning_rate": 4.011775212391517e-06,
407
- "loss": 0.165,
408
  "step": 560
409
  },
410
  {
411
- "epoch": 0.6177187753996207,
412
- "grad_norm": 0.12178354596175864,
413
- "learning_rate": 3.826786089143722e-06,
414
- "loss": 0.1587,
415
  "step": 570
416
  },
417
  {
418
- "epoch": 0.6285559468978597,
419
- "grad_norm": 0.1547146987299301,
420
- "learning_rate": 3.643481640048019e-06,
421
- "loss": 0.1452,
422
  "step": 580
423
  },
424
  {
425
- "epoch": 0.6393931183960986,
426
- "grad_norm": 0.13929495872226308,
427
- "learning_rate": 3.4621250807588524e-06,
428
- "loss": 0.1474,
429
  "step": 590
430
  },
431
  {
432
- "epoch": 0.6502302898943376,
433
- "grad_norm": 0.17786744047652958,
434
- "learning_rate": 3.2829768298622273e-06,
435
- "loss": 0.1434,
436
  "step": 600
437
  },
438
  {
439
- "epoch": 0.6610674613925765,
440
- "grad_norm": 0.13211770983307067,
441
- "learning_rate": 3.1062941349281596e-06,
442
- "loss": 0.1781,
443
  "step": 610
444
  },
445
  {
446
- "epoch": 0.6719046328908155,
447
- "grad_norm": 0.1776904769859249,
448
- "learning_rate": 2.9323307031165504e-06,
449
- "loss": 0.1441,
450
  "step": 620
451
  },
452
  {
453
- "epoch": 0.6827418043890544,
454
- "grad_norm": 0.12977600112353288,
455
- "learning_rate": 2.761336336866893e-06,
456
- "loss": 0.1694,
457
  "step": 630
458
  },
459
  {
460
- "epoch": 0.6935789758872934,
461
- "grad_norm": 0.12091563498467163,
462
- "learning_rate": 2.593556575194971e-06,
463
- "loss": 0.1687,
464
  "step": 640
465
  },
466
  {
467
- "epoch": 0.7044161473855324,
468
- "grad_norm": 0.11784060409307216,
469
- "learning_rate": 2.4292323411115965e-06,
470
- "loss": 0.145,
471
  "step": 650
472
- },
473
- {
474
- "epoch": 0.7152533188837713,
475
- "grad_norm": 0.1748624015228809,
476
- "learning_rate": 2.2685995956697037e-06,
477
- "loss": 0.1568,
478
- "step": 660
479
- },
480
- {
481
- "epoch": 0.7260904903820103,
482
- "grad_norm": 0.1723078141044498,
483
- "learning_rate": 2.1118889991365477e-06,
484
- "loss": 0.1544,
485
- "step": 670
486
- },
487
- {
488
- "epoch": 0.7369276618802493,
489
- "grad_norm": 0.14553092139119517,
490
- "learning_rate": 1.9593255797775578e-06,
491
- "loss": 0.1467,
492
- "step": 680
493
- },
494
- {
495
- "epoch": 0.7477648333784882,
496
- "grad_norm": 0.16342951046374027,
497
- "learning_rate": 1.811128410727454e-06,
498
- "loss": 0.1565,
499
- "step": 690
500
- },
501
- {
502
- "epoch": 0.7586020048767271,
503
- "grad_norm": 0.15898031074835672,
504
- "learning_rate": 1.6675102954126203e-06,
505
- "loss": 0.1672,
506
- "step": 700
507
- },
508
- {
509
- "epoch": 0.7694391763749662,
510
- "grad_norm": 0.07132360800064767,
511
- "learning_rate": 1.528677461976451e-06,
512
- "loss": 0.1411,
513
- "step": 710
514
- },
515
- {
516
- "epoch": 0.7802763478732051,
517
- "grad_norm": 0.14147704428723085,
518
- "learning_rate": 1.3948292671464708e-06,
519
- "loss": 0.1454,
520
- "step": 720
521
- },
522
- {
523
- "epoch": 0.791113519371444,
524
- "grad_norm": 0.1217153272659515,
525
- "learning_rate": 1.2661579099684345e-06,
526
- "loss": 0.1367,
527
- "step": 730
528
- },
529
- {
530
- "epoch": 0.801950690869683,
531
- "grad_norm": 0.14222802833323006,
532
- "learning_rate": 1.1428481558184985e-06,
533
- "loss": 0.1658,
534
- "step": 740
535
- },
536
- {
537
- "epoch": 0.812787862367922,
538
- "grad_norm": 0.15290868142041023,
539
- "learning_rate": 1.0250770710897513e-06,
540
- "loss": 0.1837,
541
- "step": 750
542
- },
543
- {
544
- "epoch": 0.8236250338661609,
545
- "grad_norm": 0.10220156710073305,
546
- "learning_rate": 9.13013768934084e-07,
547
- "loss": 0.1642,
548
- "step": 760
549
- },
550
- {
551
- "epoch": 0.8344622053643999,
552
- "grad_norm": 0.13901575395063445,
553
- "learning_rate": 8.068191664244945e-07,
554
- "loss": 0.1637,
555
- "step": 770
556
- },
557
- {
558
- "epoch": 0.8452993768626389,
559
- "grad_norm": 0.13154009484022863,
560
- "learning_rate": 7.066457534865529e-07,
561
- "loss": 0.1485,
562
- "step": 780
563
- },
564
- {
565
- "epoch": 0.8561365483608778,
566
- "grad_norm": 0.15411518006395045,
567
- "learning_rate": 6.126373739307856e-07,
568
- "loss": 0.1351,
569
- "step": 790
570
- },
571
- {
572
- "epoch": 0.8669737198591168,
573
- "grad_norm": 0.1465409663251719,
574
- "learning_rate": 5.249290189004552e-07,
575
- "loss": 0.1616,
576
- "step": 800
577
- },
578
- {
579
- "epoch": 0.8778108913573557,
580
- "grad_norm": 0.151355342159477,
581
- "learning_rate": 4.4364663303129664e-07,
582
- "loss": 0.1338,
583
- "step": 810
584
- },
585
- {
586
- "epoch": 0.8886480628555947,
587
- "grad_norm": 0.13076440882361037,
588
- "learning_rate": 3.6890693360157104e-07,
589
- "loss": 0.1415,
590
- "step": 820
591
- },
592
- {
593
- "epoch": 0.8994852343538337,
594
- "grad_norm": 0.1112996143342554,
595
- "learning_rate": 3.0081724293212653e-07,
596
- "loss": 0.1552,
597
- "step": 830
598
- },
599
- {
600
- "epoch": 0.9103224058520726,
601
- "grad_norm": 0.09563035386898121,
602
- "learning_rate": 2.3947533427712667e-07,
603
- "loss": 0.1501,
604
- "step": 840
605
- },
606
- {
607
- "epoch": 0.9211595773503116,
608
- "grad_norm": 0.15792781910845813,
609
- "learning_rate": 1.8496929142674424e-07,
610
- "loss": 0.1491,
611
- "step": 850
612
- },
613
- {
614
- "epoch": 0.9319967488485505,
615
- "grad_norm": 0.16164911200152776,
616
- "learning_rate": 1.3737738222341813e-07,
617
- "loss": 0.1552,
618
- "step": 860
619
- },
620
- {
621
- "epoch": 0.9428339203467895,
622
- "grad_norm": 0.15162312410480808,
623
- "learning_rate": 9.67679461733051e-08,
624
- "loss": 0.1379,
625
- "step": 870
626
- },
627
- {
628
- "epoch": 0.9536710918450284,
629
- "grad_norm": 0.11915620491115217,
630
- "learning_rate": 6.319929631430077e-08,
631
- "loss": 0.1203,
632
- "step": 880
633
- },
634
- {
635
- "epoch": 0.9645082633432674,
636
- "grad_norm": 0.16926863067932774,
637
- "learning_rate": 3.671963548155244e-08,
638
- "loss": 0.1523,
639
- "step": 890
640
- },
641
- {
642
- "epoch": 0.9753454348415064,
643
- "grad_norm": 0.14805382379357954,
644
- "learning_rate": 1.736698709069673e-08,
645
- "loss": 0.1425,
646
- "step": 900
647
- },
648
- {
649
- "epoch": 0.9861826063397453,
650
- "grad_norm": 0.17214363284720982,
651
- "learning_rate": 5.169140538207051e-09,
652
- "loss": 0.1496,
653
- "step": 910
654
- },
655
- {
656
- "epoch": 0.9970197778379842,
657
- "grad_norm": 0.15327122063501583,
658
- "learning_rate": 1.4361129727025193e-10,
659
- "loss": 0.1459,
660
- "step": 920
661
  }
662
  ],
663
  "logging_steps": 10,
664
- "max_steps": 922,
665
  "num_input_tokens_seen": 0,
666
  "num_train_epochs": 1,
667
  "save_steps": 500,
@@ -677,7 +488,7 @@
677
  "attributes": {}
678
  }
679
  },
680
- "total_flos": 40578576736256.0,
681
  "train_batch_size": 1,
682
  "trial_name": null,
683
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9988545246277205,
5
  "eval_steps": 500,
6
+ "global_step": 654,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0015273004963726614,
13
+ "grad_norm": 5.274756775371289,
14
+ "learning_rate": 1.5151515151515152e-07,
15
+ "loss": 1.3724,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.015273004963726614,
20
+ "grad_norm": 2.4356823026963044,
21
+ "learning_rate": 1.5151515151515152e-06,
22
+ "loss": 1.3215,
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.030546009927453228,
27
+ "grad_norm": 0.8415959934340891,
28
+ "learning_rate": 3.0303030303030305e-06,
29
+ "loss": 0.7041,
30
  "step": 20
31
  },
32
  {
33
+ "epoch": 0.045819014891179836,
34
+ "grad_norm": 0.3951472895674848,
35
+ "learning_rate": 4.5454545454545455e-06,
36
+ "loss": 0.3621,
37
  "step": 30
38
  },
39
  {
40
+ "epoch": 0.061092019854906456,
41
+ "grad_norm": 0.3302170549897179,
42
+ "learning_rate": 6.060606060606061e-06,
43
+ "loss": 0.2555,
44
  "step": 40
45
  },
46
  {
47
+ "epoch": 0.07636502481863307,
48
+ "grad_norm": 0.3855993238810659,
49
+ "learning_rate": 7.5757575757575764e-06,
50
+ "loss": 0.21,
51
  "step": 50
52
  },
53
  {
54
+ "epoch": 0.09163802978235967,
55
+ "grad_norm": 0.3643941392356843,
56
+ "learning_rate": 9.090909090909091e-06,
57
+ "loss": 0.1707,
58
  "step": 60
59
  },
60
  {
61
+ "epoch": 0.10691103474608629,
62
+ "grad_norm": 0.24356795945348528,
63
+ "learning_rate": 9.99885820390154e-06,
64
+ "loss": 0.1628,
65
  "step": 70
66
  },
67
  {
68
+ "epoch": 0.12218403970981291,
69
+ "grad_norm": 0.18282398542383,
70
+ "learning_rate": 9.986018985905901e-06,
71
+ "loss": 0.1537,
72
  "step": 80
73
  },
74
  {
75
+ "epoch": 0.13745704467353953,
76
+ "grad_norm": 0.17824704057472698,
77
+ "learning_rate": 9.95895006911623e-06,
78
+ "loss": 0.1472,
79
  "step": 90
80
  },
81
  {
82
+ "epoch": 0.15273004963726614,
83
+ "grad_norm": 0.20039388213383333,
84
+ "learning_rate": 9.917728706052765e-06,
85
+ "loss": 0.1512,
86
  "step": 100
87
  },
88
  {
89
+ "epoch": 0.16800305460099274,
90
+ "grad_norm": 0.21394666608149668,
91
+ "learning_rate": 9.862472539183757e-06,
92
+ "loss": 0.1446,
93
  "step": 110
94
  },
95
  {
96
+ "epoch": 0.18327605956471935,
97
+ "grad_norm": 0.1399694568156125,
98
+ "learning_rate": 9.793339265183303e-06,
99
+ "loss": 0.1379,
100
  "step": 120
101
  },
102
  {
103
+ "epoch": 0.19854906452844598,
104
+ "grad_norm": 0.17756723955927178,
105
+ "learning_rate": 9.710526184877667e-06,
106
+ "loss": 0.1446,
107
  "step": 130
108
  },
109
  {
110
+ "epoch": 0.21382206949217258,
111
+ "grad_norm": 0.1256183224074801,
112
+ "learning_rate": 9.61426964016452e-06,
113
+ "loss": 0.1598,
114
  "step": 140
115
  },
116
  {
117
+ "epoch": 0.2290950744558992,
118
+ "grad_norm": 0.1925308832459223,
119
+ "learning_rate": 9.504844339512096e-06,
120
+ "loss": 0.1703,
121
  "step": 150
122
  },
123
  {
124
+ "epoch": 0.24436807941962582,
125
+ "grad_norm": 0.18425814277453983,
126
+ "learning_rate": 9.382562573963238e-06,
127
+ "loss": 0.1347,
128
  "step": 160
129
  },
130
  {
131
+ "epoch": 0.2596410843833524,
132
+ "grad_norm": 0.16364274896349304,
133
+ "learning_rate": 9.24777332588177e-06,
134
+ "loss": 0.1291,
135
  "step": 170
136
  },
137
  {
138
+ "epoch": 0.27491408934707906,
139
+ "grad_norm": 0.1575283767974318,
140
+ "learning_rate": 9.10086127298478e-06,
141
+ "loss": 0.1455,
142
  "step": 180
143
  },
144
  {
145
+ "epoch": 0.29018709431080564,
146
+ "grad_norm": 0.16299547711079815,
147
+ "learning_rate": 8.94224569050324e-06,
148
+ "loss": 0.1448,
149
  "step": 190
150
  },
151
  {
152
+ "epoch": 0.30546009927453227,
153
+ "grad_norm": 0.1678304349758231,
154
+ "learning_rate": 8.772379254604074e-06,
155
+ "loss": 0.1329,
156
  "step": 200
157
  },
158
  {
159
+ "epoch": 0.3207331042382589,
160
+ "grad_norm": 0.18133792526251943,
161
+ "learning_rate": 8.591746750488639e-06,
162
+ "loss": 0.1523,
163
  "step": 210
164
  },
165
  {
166
+ "epoch": 0.3360061092019855,
167
+ "grad_norm": 0.138988177087972,
168
+ "learning_rate": 8.400863688854598e-06,
169
+ "loss": 0.1487,
170
  "step": 220
171
  },
172
  {
173
+ "epoch": 0.3512791141657121,
174
+ "grad_norm": 0.10975239156978413,
175
+ "learning_rate": 8.200274834669675e-06,
176
+ "loss": 0.1454,
177
  "step": 230
178
  },
179
  {
180
+ "epoch": 0.3665521191294387,
181
+ "grad_norm": 0.13022143377921225,
182
+ "learning_rate": 7.99055265245608e-06,
183
+ "loss": 0.1414,
184
  "step": 240
185
  },
186
  {
187
+ "epoch": 0.3818251240931653,
188
+ "grad_norm": 0.09325598021130543,
189
+ "learning_rate": 7.772295672522615e-06,
190
+ "loss": 0.1406,
191
  "step": 250
192
  },
193
  {
194
+ "epoch": 0.39709812905689196,
195
+ "grad_norm": 0.10192417440664955,
196
+ "learning_rate": 7.546126782807117e-06,
197
+ "loss": 0.1243,
198
  "step": 260
199
  },
200
  {
201
+ "epoch": 0.41237113402061853,
202
+ "grad_norm": 0.10214456159318729,
203
+ "learning_rate": 7.312691451204178e-06,
204
+ "loss": 0.1381,
205
  "step": 270
206
  },
207
  {
208
+ "epoch": 0.42764413898434517,
209
+ "grad_norm": 0.11650402115334278,
210
+ "learning_rate": 7.072655883451478e-06,
211
+ "loss": 0.1155,
212
  "step": 280
213
  },
214
  {
215
+ "epoch": 0.4429171439480718,
216
+ "grad_norm": 0.09899147898887324,
217
+ "learning_rate": 6.8267051218319766e-06,
218
+ "loss": 0.1372,
219
  "step": 290
220
  },
221
  {
222
+ "epoch": 0.4581901489117984,
223
+ "grad_norm": 0.11909991495605356,
224
+ "learning_rate": 6.575541090118105e-06,
225
+ "loss": 0.1449,
226
  "step": 300
227
  },
228
  {
229
+ "epoch": 0.473463153875525,
230
+ "grad_norm": 0.11324258672549159,
231
+ "learning_rate": 6.319880590337549e-06,
232
+ "loss": 0.1355,
233
  "step": 310
234
  },
235
  {
236
+ "epoch": 0.48873615883925164,
237
+ "grad_norm": 0.15684721556128514,
238
+ "learning_rate": 6.060453257077686e-06,
239
+ "loss": 0.1448,
240
  "step": 320
241
  },
242
  {
243
+ "epoch": 0.5040091638029782,
244
+ "grad_norm": 0.13428699506671887,
245
+ "learning_rate": 5.797999475166897e-06,
246
+ "loss": 0.161,
247
  "step": 330
248
  },
249
  {
250
+ "epoch": 0.5192821687667049,
251
+ "grad_norm": 0.16260953903247685,
252
+ "learning_rate": 5.533268266675601e-06,
253
+ "loss": 0.1301,
254
  "step": 340
255
  },
256
  {
257
+ "epoch": 0.5345551737304315,
258
+ "grad_norm": 0.138117775450493,
259
+ "learning_rate": 5.267015153267246e-06,
260
+ "loss": 0.1153,
261
  "step": 350
262
  },
263
  {
264
+ "epoch": 0.5498281786941581,
265
+ "grad_norm": 0.09342193422842268,
266
+ "learning_rate": 5e-06,
267
+ "loss": 0.1388,
268
  "step": 360
269
  },
270
  {
271
+ "epoch": 0.5651011836578846,
272
+ "grad_norm": 0.12521354455054481,
273
+ "learning_rate": 4.732984846732755e-06,
274
+ "loss": 0.1469,
275
  "step": 370
276
  },
277
  {
278
+ "epoch": 0.5803741886216113,
279
+ "grad_norm": 0.12074643796303608,
280
+ "learning_rate": 4.466731733324399e-06,
281
+ "loss": 0.1336,
282
  "step": 380
283
  },
284
  {
285
+ "epoch": 0.5956471935853379,
286
+ "grad_norm": 0.10811612216811932,
287
+ "learning_rate": 4.2020005248331056e-06,
288
+ "loss": 0.1444,
289
  "step": 390
290
  },
291
  {
292
+ "epoch": 0.6109201985490645,
293
+ "grad_norm": 0.07932149519427452,
294
+ "learning_rate": 3.939546742922318e-06,
295
+ "loss": 0.1217,
296
  "step": 400
297
  },
298
  {
299
+ "epoch": 0.6261932035127912,
300
+ "grad_norm": 0.13495076484320434,
301
+ "learning_rate": 3.6801194096624515e-06,
302
+ "loss": 0.1486,
303
  "step": 410
304
  },
305
  {
306
+ "epoch": 0.6414662084765178,
307
+ "grad_norm": 0.09577670394462665,
308
+ "learning_rate": 3.424458909881897e-06,
309
+ "loss": 0.1351,
310
  "step": 420
311
  },
312
  {
313
+ "epoch": 0.6567392134402443,
314
+ "grad_norm": 0.13052717933517727,
315
+ "learning_rate": 3.173294878168025e-06,
316
+ "loss": 0.1115,
317
  "step": 430
318
  },
319
  {
320
+ "epoch": 0.672012218403971,
321
+ "grad_norm": 0.10868560247173807,
322
+ "learning_rate": 2.9273441165485227e-06,
323
+ "loss": 0.1429,
324
  "step": 440
325
  },
326
  {
327
+ "epoch": 0.6872852233676976,
328
+ "grad_norm": 0.08492718279444092,
329
+ "learning_rate": 2.687308548795825e-06,
330
+ "loss": 0.1006,
331
  "step": 450
332
  },
333
  {
334
+ "epoch": 0.7025582283314242,
335
+ "grad_norm": 0.11879878396233616,
336
+ "learning_rate": 2.4538732171928847e-06,
337
+ "loss": 0.1441,
338
  "step": 460
339
  },
340
  {
341
+ "epoch": 0.7178312332951509,
342
+ "grad_norm": 0.08627366089386783,
343
+ "learning_rate": 2.2277043274773856e-06,
344
+ "loss": 0.121,
345
  "step": 470
346
  },
347
  {
348
+ "epoch": 0.7331042382588774,
349
+ "grad_norm": 0.11860493468588708,
350
+ "learning_rate": 2.00944734754392e-06,
351
+ "loss": 0.104,
352
  "step": 480
353
  },
354
  {
355
+ "epoch": 0.748377243222604,
356
+ "grad_norm": 0.13566719962532361,
357
+ "learning_rate": 1.7997251653303249e-06,
358
+ "loss": 0.1223,
359
  "step": 490
360
  },
361
  {
362
+ "epoch": 0.7636502481863306,
363
+ "grad_norm": 0.11713757986892694,
364
+ "learning_rate": 1.5991363111454023e-06,
365
+ "loss": 0.1297,
366
  "step": 500
367
  },
368
  {
369
+ "epoch": 0.7789232531500573,
370
+ "grad_norm": 0.0932474027124396,
371
+ "learning_rate": 1.4082532495113627e-06,
372
+ "loss": 0.129,
373
  "step": 510
374
  },
375
  {
376
+ "epoch": 0.7941962581137839,
377
+ "grad_norm": 0.06960734581371894,
378
+ "learning_rate": 1.2276207453959283e-06,
379
+ "loss": 0.102,
380
  "step": 520
381
  },
382
  {
383
+ "epoch": 0.8094692630775105,
384
+ "grad_norm": 0.1054188314922148,
385
+ "learning_rate": 1.0577543094967613e-06,
386
+ "loss": 0.1182,
387
  "step": 530
388
  },
389
  {
390
+ "epoch": 0.8247422680412371,
391
+ "grad_norm": 0.15433498716395788,
392
+ "learning_rate": 8.991387270152202e-07,
393
+ "loss": 0.1306,
394
  "step": 540
395
  },
396
  {
397
+ "epoch": 0.8400152730049637,
398
+ "grad_norm": 0.10854241303868371,
399
+ "learning_rate": 7.522266741182305e-07,
400
+ "loss": 0.11,
401
  "step": 550
402
  },
403
  {
404
+ "epoch": 0.8552882779686903,
405
+ "grad_norm": 0.13631634648737045,
406
+ "learning_rate": 6.174374260367611e-07,
407
+ "loss": 0.1162,
408
  "step": 560
409
  },
410
  {
411
+ "epoch": 0.870561282932417,
412
+ "grad_norm": 0.11447961702982919,
413
+ "learning_rate": 4.951556604879049e-07,
414
+ "loss": 0.1396,
415
  "step": 570
416
  },
417
  {
418
+ "epoch": 0.8858342878961436,
419
+ "grad_norm": 0.1527666931393435,
420
+ "learning_rate": 3.8573035983548167e-07,
421
+ "loss": 0.1353,
422
  "step": 580
423
  },
424
  {
425
+ "epoch": 0.9011072928598702,
426
+ "grad_norm": 0.13839722389025982,
427
+ "learning_rate": 2.894738151223331e-07,
428
+ "loss": 0.1193,
429
  "step": 590
430
  },
431
  {
432
+ "epoch": 0.9163802978235968,
433
+ "grad_norm": 0.13196785663691574,
434
+ "learning_rate": 2.0666073481669714e-07,
435
+ "loss": 0.1147,
436
  "step": 600
437
  },
438
  {
439
+ "epoch": 0.9316533027873234,
440
+ "grad_norm": 0.09878735062205103,
441
+ "learning_rate": 1.375274608162447e-07,
442
+ "loss": 0.1261,
443
  "step": 610
444
  },
445
  {
446
+ "epoch": 0.94692630775105,
447
+ "grad_norm": 0.07952980044315368,
448
+ "learning_rate": 8.227129394723643e-08,
449
+ "loss": 0.0992,
450
  "step": 620
451
  },
452
  {
453
+ "epoch": 0.9621993127147767,
454
+ "grad_norm": 0.09015417303190752,
455
+ "learning_rate": 4.104993088376974e-08,
456
+ "loss": 0.1212,
457
  "step": 630
458
  },
459
  {
460
+ "epoch": 0.9774723176785033,
461
+ "grad_norm": 0.12899056246523352,
462
+ "learning_rate": 1.3981014094099354e-08,
463
+ "loss": 0.131,
464
  "step": 640
465
  },
466
  {
467
+ "epoch": 0.9927453226422298,
468
+ "grad_norm": 0.08443201140015646,
469
+ "learning_rate": 1.1417960984605459e-09,
470
+ "loss": 0.1302,
471
  "step": 650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
  }
473
  ],
474
  "logging_steps": 10,
475
+ "max_steps": 654,
476
  "num_input_tokens_seen": 0,
477
  "num_train_epochs": 1,
478
  "save_steps": 500,
 
488
  "attributes": {}
489
  }
490
  },
491
+ "total_flos": 32812144386048.0,
492
  "train_batch_size": 1,
493
  "trial_name": null,
494
  "trial_params": null