train_rte_456_1760637784 / trainer_state.json
rbelanec's picture
End of training
a249076 verified
{
"best_global_step": 1992,
"best_metric": 0.16210927069187164,
"best_model_checkpoint": "saves_multiple/prefix-tuning/llama-3-8b-instruct/train_rte_456_1760637784/checkpoint-1992",
"epoch": 20.0,
"eval_steps": 996,
"global_step": 9960,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010040160642570281,
"grad_norm": 194.0540008544922,
"learning_rate": 4.016064257028112e-08,
"loss": 8.9772,
"num_input_tokens_seen": 2176,
"step": 5
},
{
"epoch": 0.020080321285140562,
"grad_norm": 214.29051208496094,
"learning_rate": 9.036144578313253e-08,
"loss": 9.1361,
"num_input_tokens_seen": 6144,
"step": 10
},
{
"epoch": 0.030120481927710843,
"grad_norm": 204.60433959960938,
"learning_rate": 1.4056224899598394e-07,
"loss": 8.8855,
"num_input_tokens_seen": 9408,
"step": 15
},
{
"epoch": 0.040160642570281124,
"grad_norm": 167.90016174316406,
"learning_rate": 1.9076305220883537e-07,
"loss": 8.6766,
"num_input_tokens_seen": 11840,
"step": 20
},
{
"epoch": 0.050200803212851405,
"grad_norm": 216.1599578857422,
"learning_rate": 2.409638554216868e-07,
"loss": 8.3983,
"num_input_tokens_seen": 14912,
"step": 25
},
{
"epoch": 0.060240963855421686,
"grad_norm": 162.7516326904297,
"learning_rate": 2.911646586345382e-07,
"loss": 8.1478,
"num_input_tokens_seen": 18304,
"step": 30
},
{
"epoch": 0.07028112449799197,
"grad_norm": 177.10873413085938,
"learning_rate": 3.413654618473896e-07,
"loss": 7.7226,
"num_input_tokens_seen": 21344,
"step": 35
},
{
"epoch": 0.08032128514056225,
"grad_norm": 161.67369079589844,
"learning_rate": 3.91566265060241e-07,
"loss": 7.4309,
"num_input_tokens_seen": 25088,
"step": 40
},
{
"epoch": 0.09036144578313253,
"grad_norm": 146.87649536132812,
"learning_rate": 4.417670682730924e-07,
"loss": 7.098,
"num_input_tokens_seen": 28224,
"step": 45
},
{
"epoch": 0.10040160642570281,
"grad_norm": 110.42221069335938,
"learning_rate": 4.919678714859438e-07,
"loss": 6.5589,
"num_input_tokens_seen": 31968,
"step": 50
},
{
"epoch": 0.11044176706827309,
"grad_norm": 104.6060791015625,
"learning_rate": 5.421686746987952e-07,
"loss": 6.3171,
"num_input_tokens_seen": 35104,
"step": 55
},
{
"epoch": 0.12048192771084337,
"grad_norm": 90.55043029785156,
"learning_rate": 5.923694779116467e-07,
"loss": 5.7753,
"num_input_tokens_seen": 38240,
"step": 60
},
{
"epoch": 0.13052208835341367,
"grad_norm": 82.31653594970703,
"learning_rate": 6.425702811244979e-07,
"loss": 5.6056,
"num_input_tokens_seen": 40992,
"step": 65
},
{
"epoch": 0.14056224899598393,
"grad_norm": 80.8585433959961,
"learning_rate": 6.927710843373495e-07,
"loss": 5.2274,
"num_input_tokens_seen": 44544,
"step": 70
},
{
"epoch": 0.15060240963855423,
"grad_norm": 78.3877182006836,
"learning_rate": 7.429718875502008e-07,
"loss": 4.6893,
"num_input_tokens_seen": 48000,
"step": 75
},
{
"epoch": 0.1606425702811245,
"grad_norm": 94.57144165039062,
"learning_rate": 7.931726907630523e-07,
"loss": 3.8478,
"num_input_tokens_seen": 51648,
"step": 80
},
{
"epoch": 0.1706827309236948,
"grad_norm": 75.64936828613281,
"learning_rate": 8.433734939759036e-07,
"loss": 3.5727,
"num_input_tokens_seen": 54656,
"step": 85
},
{
"epoch": 0.18072289156626506,
"grad_norm": 65.45319366455078,
"learning_rate": 8.935742971887551e-07,
"loss": 3.1493,
"num_input_tokens_seen": 57088,
"step": 90
},
{
"epoch": 0.19076305220883535,
"grad_norm": 68.6191635131836,
"learning_rate": 9.437751004016064e-07,
"loss": 2.8682,
"num_input_tokens_seen": 60256,
"step": 95
},
{
"epoch": 0.20080321285140562,
"grad_norm": 64.3413314819336,
"learning_rate": 9.93975903614458e-07,
"loss": 2.6081,
"num_input_tokens_seen": 63072,
"step": 100
},
{
"epoch": 0.21084337349397592,
"grad_norm": 59.27682113647461,
"learning_rate": 1.0441767068273092e-06,
"loss": 1.6849,
"num_input_tokens_seen": 65312,
"step": 105
},
{
"epoch": 0.22088353413654618,
"grad_norm": 43.84572219848633,
"learning_rate": 1.0943775100401608e-06,
"loss": 1.5056,
"num_input_tokens_seen": 68352,
"step": 110
},
{
"epoch": 0.23092369477911648,
"grad_norm": 45.57728576660156,
"learning_rate": 1.1445783132530121e-06,
"loss": 1.2485,
"num_input_tokens_seen": 71104,
"step": 115
},
{
"epoch": 0.24096385542168675,
"grad_norm": 46.22006607055664,
"learning_rate": 1.1947791164658635e-06,
"loss": 1.0266,
"num_input_tokens_seen": 75040,
"step": 120
},
{
"epoch": 0.25100401606425704,
"grad_norm": 53.21188735961914,
"learning_rate": 1.2449799196787148e-06,
"loss": 0.8694,
"num_input_tokens_seen": 77920,
"step": 125
},
{
"epoch": 0.26104417670682734,
"grad_norm": 31.533672332763672,
"learning_rate": 1.2951807228915664e-06,
"loss": 0.5457,
"num_input_tokens_seen": 80000,
"step": 130
},
{
"epoch": 0.2710843373493976,
"grad_norm": 50.55741500854492,
"learning_rate": 1.345381526104418e-06,
"loss": 0.6048,
"num_input_tokens_seen": 82944,
"step": 135
},
{
"epoch": 0.28112449799196787,
"grad_norm": 44.2205924987793,
"learning_rate": 1.395582329317269e-06,
"loss": 0.3919,
"num_input_tokens_seen": 86368,
"step": 140
},
{
"epoch": 0.29116465863453816,
"grad_norm": 34.489261627197266,
"learning_rate": 1.4457831325301204e-06,
"loss": 0.312,
"num_input_tokens_seen": 89344,
"step": 145
},
{
"epoch": 0.30120481927710846,
"grad_norm": 21.01329231262207,
"learning_rate": 1.495983935742972e-06,
"loss": 0.2272,
"num_input_tokens_seen": 92128,
"step": 150
},
{
"epoch": 0.3112449799196787,
"grad_norm": 23.1704158782959,
"learning_rate": 1.5461847389558236e-06,
"loss": 0.2378,
"num_input_tokens_seen": 95296,
"step": 155
},
{
"epoch": 0.321285140562249,
"grad_norm": 20.119722366333008,
"learning_rate": 1.5963855421686747e-06,
"loss": 0.1967,
"num_input_tokens_seen": 98272,
"step": 160
},
{
"epoch": 0.3313253012048193,
"grad_norm": 61.622379302978516,
"learning_rate": 1.6465863453815263e-06,
"loss": 0.2241,
"num_input_tokens_seen": 101344,
"step": 165
},
{
"epoch": 0.3413654618473896,
"grad_norm": 29.35689926147461,
"learning_rate": 1.6967871485943776e-06,
"loss": 0.1959,
"num_input_tokens_seen": 103808,
"step": 170
},
{
"epoch": 0.3514056224899598,
"grad_norm": 28.446813583374023,
"learning_rate": 1.7469879518072292e-06,
"loss": 0.1855,
"num_input_tokens_seen": 106880,
"step": 175
},
{
"epoch": 0.3614457831325301,
"grad_norm": 30.122529983520508,
"learning_rate": 1.7971887550200803e-06,
"loss": 0.1717,
"num_input_tokens_seen": 110016,
"step": 180
},
{
"epoch": 0.3714859437751004,
"grad_norm": 19.520801544189453,
"learning_rate": 1.8473895582329318e-06,
"loss": 0.1764,
"num_input_tokens_seen": 112896,
"step": 185
},
{
"epoch": 0.3815261044176707,
"grad_norm": 28.57624053955078,
"learning_rate": 1.8975903614457832e-06,
"loss": 0.2006,
"num_input_tokens_seen": 115808,
"step": 190
},
{
"epoch": 0.39156626506024095,
"grad_norm": 26.32992172241211,
"learning_rate": 1.947791164658635e-06,
"loss": 0.2027,
"num_input_tokens_seen": 118656,
"step": 195
},
{
"epoch": 0.40160642570281124,
"grad_norm": 30.263748168945312,
"learning_rate": 1.997991967871486e-06,
"loss": 0.2035,
"num_input_tokens_seen": 121728,
"step": 200
},
{
"epoch": 0.41164658634538154,
"grad_norm": 39.99821472167969,
"learning_rate": 2.0481927710843377e-06,
"loss": 0.335,
"num_input_tokens_seen": 124128,
"step": 205
},
{
"epoch": 0.42168674698795183,
"grad_norm": 7.4671549797058105,
"learning_rate": 2.098393574297189e-06,
"loss": 0.1571,
"num_input_tokens_seen": 127072,
"step": 210
},
{
"epoch": 0.43172690763052207,
"grad_norm": 24.52522087097168,
"learning_rate": 2.1485943775100404e-06,
"loss": 0.1704,
"num_input_tokens_seen": 129888,
"step": 215
},
{
"epoch": 0.44176706827309237,
"grad_norm": 11.86972427368164,
"learning_rate": 2.1987951807228917e-06,
"loss": 0.1659,
"num_input_tokens_seen": 132800,
"step": 220
},
{
"epoch": 0.45180722891566266,
"grad_norm": 16.253108978271484,
"learning_rate": 2.248995983935743e-06,
"loss": 0.1919,
"num_input_tokens_seen": 136480,
"step": 225
},
{
"epoch": 0.46184738955823296,
"grad_norm": 31.7047119140625,
"learning_rate": 2.2991967871485944e-06,
"loss": 0.1908,
"num_input_tokens_seen": 140352,
"step": 230
},
{
"epoch": 0.4718875502008032,
"grad_norm": 21.108055114746094,
"learning_rate": 2.349397590361446e-06,
"loss": 0.1697,
"num_input_tokens_seen": 143616,
"step": 235
},
{
"epoch": 0.4819277108433735,
"grad_norm": 15.181034088134766,
"learning_rate": 2.399598393574297e-06,
"loss": 0.1851,
"num_input_tokens_seen": 146656,
"step": 240
},
{
"epoch": 0.4919678714859438,
"grad_norm": 23.453062057495117,
"learning_rate": 2.449799196787149e-06,
"loss": 0.143,
"num_input_tokens_seen": 149664,
"step": 245
},
{
"epoch": 0.5020080321285141,
"grad_norm": 27.794658660888672,
"learning_rate": 2.5e-06,
"loss": 0.2288,
"num_input_tokens_seen": 153344,
"step": 250
},
{
"epoch": 0.5120481927710844,
"grad_norm": 39.275882720947266,
"learning_rate": 2.5502008032128516e-06,
"loss": 0.2078,
"num_input_tokens_seen": 156224,
"step": 255
},
{
"epoch": 0.5220883534136547,
"grad_norm": 25.21923065185547,
"learning_rate": 2.6004016064257033e-06,
"loss": 0.1754,
"num_input_tokens_seen": 159648,
"step": 260
},
{
"epoch": 0.5321285140562249,
"grad_norm": 15.785544395446777,
"learning_rate": 2.6506024096385547e-06,
"loss": 0.1866,
"num_input_tokens_seen": 162304,
"step": 265
},
{
"epoch": 0.5421686746987951,
"grad_norm": 32.11054229736328,
"learning_rate": 2.700803212851406e-06,
"loss": 0.1895,
"num_input_tokens_seen": 165632,
"step": 270
},
{
"epoch": 0.5522088353413654,
"grad_norm": 12.410296440124512,
"learning_rate": 2.751004016064257e-06,
"loss": 0.1845,
"num_input_tokens_seen": 169024,
"step": 275
},
{
"epoch": 0.5622489959839357,
"grad_norm": 16.866844177246094,
"learning_rate": 2.8012048192771087e-06,
"loss": 0.1919,
"num_input_tokens_seen": 173056,
"step": 280
},
{
"epoch": 0.572289156626506,
"grad_norm": 12.017806053161621,
"learning_rate": 2.85140562248996e-06,
"loss": 0.186,
"num_input_tokens_seen": 175904,
"step": 285
},
{
"epoch": 0.5823293172690763,
"grad_norm": 10.06004810333252,
"learning_rate": 2.9016064257028114e-06,
"loss": 0.1724,
"num_input_tokens_seen": 178816,
"step": 290
},
{
"epoch": 0.5923694779116466,
"grad_norm": 22.80934715270996,
"learning_rate": 2.9518072289156627e-06,
"loss": 0.1727,
"num_input_tokens_seen": 182464,
"step": 295
},
{
"epoch": 0.6024096385542169,
"grad_norm": 10.76999282836914,
"learning_rate": 3.0020080321285145e-06,
"loss": 0.1692,
"num_input_tokens_seen": 186368,
"step": 300
},
{
"epoch": 0.6124497991967871,
"grad_norm": 21.18651008605957,
"learning_rate": 3.052208835341366e-06,
"loss": 0.2044,
"num_input_tokens_seen": 189184,
"step": 305
},
{
"epoch": 0.6224899598393574,
"grad_norm": 25.353771209716797,
"learning_rate": 3.1024096385542172e-06,
"loss": 0.1926,
"num_input_tokens_seen": 192640,
"step": 310
},
{
"epoch": 0.6325301204819277,
"grad_norm": 15.923639297485352,
"learning_rate": 3.152610441767068e-06,
"loss": 0.1875,
"num_input_tokens_seen": 195968,
"step": 315
},
{
"epoch": 0.642570281124498,
"grad_norm": 11.498019218444824,
"learning_rate": 3.20281124497992e-06,
"loss": 0.1517,
"num_input_tokens_seen": 199168,
"step": 320
},
{
"epoch": 0.6526104417670683,
"grad_norm": 12.303227424621582,
"learning_rate": 3.2530120481927713e-06,
"loss": 0.1917,
"num_input_tokens_seen": 201728,
"step": 325
},
{
"epoch": 0.6626506024096386,
"grad_norm": 9.231768608093262,
"learning_rate": 3.3032128514056226e-06,
"loss": 0.1671,
"num_input_tokens_seen": 204704,
"step": 330
},
{
"epoch": 0.6726907630522089,
"grad_norm": 8.526322364807129,
"learning_rate": 3.3534136546184744e-06,
"loss": 0.1743,
"num_input_tokens_seen": 207616,
"step": 335
},
{
"epoch": 0.6827309236947792,
"grad_norm": 29.402254104614258,
"learning_rate": 3.4036144578313257e-06,
"loss": 0.1898,
"num_input_tokens_seen": 210528,
"step": 340
},
{
"epoch": 0.6927710843373494,
"grad_norm": 24.25574493408203,
"learning_rate": 3.453815261044177e-06,
"loss": 0.2041,
"num_input_tokens_seen": 214752,
"step": 345
},
{
"epoch": 0.7028112449799196,
"grad_norm": 7.983010768890381,
"learning_rate": 3.504016064257029e-06,
"loss": 0.2241,
"num_input_tokens_seen": 217408,
"step": 350
},
{
"epoch": 0.7128514056224899,
"grad_norm": 9.211853981018066,
"learning_rate": 3.5542168674698798e-06,
"loss": 0.2229,
"num_input_tokens_seen": 220480,
"step": 355
},
{
"epoch": 0.7228915662650602,
"grad_norm": 6.233531951904297,
"learning_rate": 3.604417670682731e-06,
"loss": 0.1506,
"num_input_tokens_seen": 223520,
"step": 360
},
{
"epoch": 0.7329317269076305,
"grad_norm": 20.131092071533203,
"learning_rate": 3.6546184738955825e-06,
"loss": 0.153,
"num_input_tokens_seen": 226752,
"step": 365
},
{
"epoch": 0.7429718875502008,
"grad_norm": 21.862136840820312,
"learning_rate": 3.7048192771084342e-06,
"loss": 0.1937,
"num_input_tokens_seen": 230080,
"step": 370
},
{
"epoch": 0.7530120481927711,
"grad_norm": 17.098079681396484,
"learning_rate": 3.7550200803212856e-06,
"loss": 0.1582,
"num_input_tokens_seen": 233280,
"step": 375
},
{
"epoch": 0.7630522088353414,
"grad_norm": 17.18459701538086,
"learning_rate": 3.805220883534137e-06,
"loss": 0.1743,
"num_input_tokens_seen": 236416,
"step": 380
},
{
"epoch": 0.7730923694779116,
"grad_norm": 8.394510269165039,
"learning_rate": 3.855421686746989e-06,
"loss": 0.1722,
"num_input_tokens_seen": 240352,
"step": 385
},
{
"epoch": 0.7831325301204819,
"grad_norm": 9.847726821899414,
"learning_rate": 3.90562248995984e-06,
"loss": 0.1773,
"num_input_tokens_seen": 243296,
"step": 390
},
{
"epoch": 0.7931726907630522,
"grad_norm": 16.50226593017578,
"learning_rate": 3.9558232931726905e-06,
"loss": 0.1752,
"num_input_tokens_seen": 246656,
"step": 395
},
{
"epoch": 0.8032128514056225,
"grad_norm": 7.0121355056762695,
"learning_rate": 4.006024096385543e-06,
"loss": 0.166,
"num_input_tokens_seen": 250112,
"step": 400
},
{
"epoch": 0.8132530120481928,
"grad_norm": 4.569051265716553,
"learning_rate": 4.056224899598394e-06,
"loss": 0.1758,
"num_input_tokens_seen": 253632,
"step": 405
},
{
"epoch": 0.8232931726907631,
"grad_norm": 8.35086441040039,
"learning_rate": 4.106425702811245e-06,
"loss": 0.1631,
"num_input_tokens_seen": 256864,
"step": 410
},
{
"epoch": 0.8333333333333334,
"grad_norm": 20.156761169433594,
"learning_rate": 4.156626506024097e-06,
"loss": 0.1934,
"num_input_tokens_seen": 259584,
"step": 415
},
{
"epoch": 0.8433734939759037,
"grad_norm": 23.27678680419922,
"learning_rate": 4.206827309236948e-06,
"loss": 0.1684,
"num_input_tokens_seen": 262560,
"step": 420
},
{
"epoch": 0.8534136546184738,
"grad_norm": 21.717554092407227,
"learning_rate": 4.2570281124497995e-06,
"loss": 0.26,
"num_input_tokens_seen": 265760,
"step": 425
},
{
"epoch": 0.8634538152610441,
"grad_norm": 20.287023544311523,
"learning_rate": 4.307228915662651e-06,
"loss": 0.1703,
"num_input_tokens_seen": 267904,
"step": 430
},
{
"epoch": 0.8734939759036144,
"grad_norm": 5.225078105926514,
"learning_rate": 4.357429718875502e-06,
"loss": 0.1868,
"num_input_tokens_seen": 272288,
"step": 435
},
{
"epoch": 0.8835341365461847,
"grad_norm": 5.046104907989502,
"learning_rate": 4.4076305220883535e-06,
"loss": 0.1644,
"num_input_tokens_seen": 275232,
"step": 440
},
{
"epoch": 0.893574297188755,
"grad_norm": 12.361194610595703,
"learning_rate": 4.457831325301205e-06,
"loss": 0.177,
"num_input_tokens_seen": 278624,
"step": 445
},
{
"epoch": 0.9036144578313253,
"grad_norm": 10.267531394958496,
"learning_rate": 4.508032128514056e-06,
"loss": 0.1735,
"num_input_tokens_seen": 281696,
"step": 450
},
{
"epoch": 0.9136546184738956,
"grad_norm": 9.60224437713623,
"learning_rate": 4.558232931726908e-06,
"loss": 0.1862,
"num_input_tokens_seen": 284832,
"step": 455
},
{
"epoch": 0.9236947791164659,
"grad_norm": 6.127063751220703,
"learning_rate": 4.60843373493976e-06,
"loss": 0.1664,
"num_input_tokens_seen": 287424,
"step": 460
},
{
"epoch": 0.9337349397590361,
"grad_norm": 4.402464866638184,
"learning_rate": 4.658634538152611e-06,
"loss": 0.1646,
"num_input_tokens_seen": 289792,
"step": 465
},
{
"epoch": 0.9437751004016064,
"grad_norm": 18.304546356201172,
"learning_rate": 4.7088353413654624e-06,
"loss": 0.1703,
"num_input_tokens_seen": 293152,
"step": 470
},
{
"epoch": 0.9538152610441767,
"grad_norm": 5.38660192489624,
"learning_rate": 4.759036144578314e-06,
"loss": 0.1596,
"num_input_tokens_seen": 296320,
"step": 475
},
{
"epoch": 0.963855421686747,
"grad_norm": 13.366776466369629,
"learning_rate": 4.809236947791165e-06,
"loss": 0.1694,
"num_input_tokens_seen": 299616,
"step": 480
},
{
"epoch": 0.9738955823293173,
"grad_norm": 7.804983615875244,
"learning_rate": 4.8594377510040165e-06,
"loss": 0.1726,
"num_input_tokens_seen": 302528,
"step": 485
},
{
"epoch": 0.9839357429718876,
"grad_norm": 10.998905181884766,
"learning_rate": 4.909638554216868e-06,
"loss": 0.1914,
"num_input_tokens_seen": 305120,
"step": 490
},
{
"epoch": 0.9939759036144579,
"grad_norm": 13.699494361877441,
"learning_rate": 4.959839357429719e-06,
"loss": 0.1552,
"num_input_tokens_seen": 308928,
"step": 495
},
{
"epoch": 1.0040160642570282,
"grad_norm": 6.078279495239258,
"learning_rate": 5.0100401606425705e-06,
"loss": 0.1611,
"num_input_tokens_seen": 312512,
"step": 500
},
{
"epoch": 1.0140562248995983,
"grad_norm": 14.086797714233398,
"learning_rate": 5.060240963855422e-06,
"loss": 0.182,
"num_input_tokens_seen": 315232,
"step": 505
},
{
"epoch": 1.0240963855421688,
"grad_norm": 15.124547004699707,
"learning_rate": 5.110441767068274e-06,
"loss": 0.217,
"num_input_tokens_seen": 317952,
"step": 510
},
{
"epoch": 1.034136546184739,
"grad_norm": 5.822263717651367,
"learning_rate": 5.1606425702811245e-06,
"loss": 0.1535,
"num_input_tokens_seen": 321696,
"step": 515
},
{
"epoch": 1.0441767068273093,
"grad_norm": 6.9545722007751465,
"learning_rate": 5.210843373493977e-06,
"loss": 0.1515,
"num_input_tokens_seen": 325184,
"step": 520
},
{
"epoch": 1.0542168674698795,
"grad_norm": 6.776535511016846,
"learning_rate": 5.261044176706827e-06,
"loss": 0.1761,
"num_input_tokens_seen": 328832,
"step": 525
},
{
"epoch": 1.0642570281124497,
"grad_norm": 6.585295677185059,
"learning_rate": 5.3112449799196794e-06,
"loss": 0.1718,
"num_input_tokens_seen": 332320,
"step": 530
},
{
"epoch": 1.0742971887550201,
"grad_norm": 5.499662399291992,
"learning_rate": 5.361445783132531e-06,
"loss": 0.1717,
"num_input_tokens_seen": 335584,
"step": 535
},
{
"epoch": 1.0843373493975903,
"grad_norm": 16.805959701538086,
"learning_rate": 5.411646586345381e-06,
"loss": 0.1412,
"num_input_tokens_seen": 338464,
"step": 540
},
{
"epoch": 1.0943775100401607,
"grad_norm": 4.637701034545898,
"learning_rate": 5.4618473895582335e-06,
"loss": 0.192,
"num_input_tokens_seen": 341248,
"step": 545
},
{
"epoch": 1.104417670682731,
"grad_norm": 8.762520790100098,
"learning_rate": 5.512048192771085e-06,
"loss": 0.1577,
"num_input_tokens_seen": 344288,
"step": 550
},
{
"epoch": 1.1144578313253013,
"grad_norm": 15.590083122253418,
"learning_rate": 5.562248995983936e-06,
"loss": 0.1656,
"num_input_tokens_seen": 347456,
"step": 555
},
{
"epoch": 1.1244979919678715,
"grad_norm": 3.6315433979034424,
"learning_rate": 5.6124497991967875e-06,
"loss": 0.144,
"num_input_tokens_seen": 350400,
"step": 560
},
{
"epoch": 1.1345381526104417,
"grad_norm": 7.174181938171387,
"learning_rate": 5.66265060240964e-06,
"loss": 0.1922,
"num_input_tokens_seen": 352896,
"step": 565
},
{
"epoch": 1.144578313253012,
"grad_norm": 13.603584289550781,
"learning_rate": 5.71285140562249e-06,
"loss": 0.1733,
"num_input_tokens_seen": 355328,
"step": 570
},
{
"epoch": 1.1546184738955823,
"grad_norm": 4.278202533721924,
"learning_rate": 5.7630522088353416e-06,
"loss": 0.1604,
"num_input_tokens_seen": 358240,
"step": 575
},
{
"epoch": 1.1646586345381527,
"grad_norm": 7.5159149169921875,
"learning_rate": 5.813253012048194e-06,
"loss": 0.2166,
"num_input_tokens_seen": 361792,
"step": 580
},
{
"epoch": 1.1746987951807228,
"grad_norm": 2.941038131713867,
"learning_rate": 5.863453815261044e-06,
"loss": 0.1522,
"num_input_tokens_seen": 364960,
"step": 585
},
{
"epoch": 1.1847389558232932,
"grad_norm": 14.475259780883789,
"learning_rate": 5.9136546184738964e-06,
"loss": 0.1469,
"num_input_tokens_seen": 368416,
"step": 590
},
{
"epoch": 1.1947791164658634,
"grad_norm": 9.602474212646484,
"learning_rate": 5.963855421686747e-06,
"loss": 0.1788,
"num_input_tokens_seen": 371872,
"step": 595
},
{
"epoch": 1.2048192771084336,
"grad_norm": 11.142339706420898,
"learning_rate": 6.014056224899599e-06,
"loss": 0.1649,
"num_input_tokens_seen": 374432,
"step": 600
},
{
"epoch": 1.214859437751004,
"grad_norm": 4.123423099517822,
"learning_rate": 6.0642570281124505e-06,
"loss": 0.1436,
"num_input_tokens_seen": 378080,
"step": 605
},
{
"epoch": 1.2248995983935742,
"grad_norm": 15.446894645690918,
"learning_rate": 6.114457831325302e-06,
"loss": 0.2001,
"num_input_tokens_seen": 380928,
"step": 610
},
{
"epoch": 1.2349397590361446,
"grad_norm": 12.775602340698242,
"learning_rate": 6.164658634538153e-06,
"loss": 0.1959,
"num_input_tokens_seen": 383872,
"step": 615
},
{
"epoch": 1.2449799196787148,
"grad_norm": 3.7326459884643555,
"learning_rate": 6.214859437751004e-06,
"loss": 0.1745,
"num_input_tokens_seen": 386752,
"step": 620
},
{
"epoch": 1.2550200803212852,
"grad_norm": 6.751396179199219,
"learning_rate": 6.265060240963856e-06,
"loss": 0.1546,
"num_input_tokens_seen": 389600,
"step": 625
},
{
"epoch": 1.2650602409638554,
"grad_norm": 8.127579689025879,
"learning_rate": 6.315261044176707e-06,
"loss": 0.1835,
"num_input_tokens_seen": 392672,
"step": 630
},
{
"epoch": 1.2751004016064256,
"grad_norm": 10.662273406982422,
"learning_rate": 6.365461847389559e-06,
"loss": 0.2297,
"num_input_tokens_seen": 395520,
"step": 635
},
{
"epoch": 1.285140562248996,
"grad_norm": 12.054793357849121,
"learning_rate": 6.41566265060241e-06,
"loss": 0.2924,
"num_input_tokens_seen": 398496,
"step": 640
},
{
"epoch": 1.2951807228915664,
"grad_norm": 5.321606636047363,
"learning_rate": 6.465863453815262e-06,
"loss": 0.1721,
"num_input_tokens_seen": 401312,
"step": 645
},
{
"epoch": 1.3052208835341366,
"grad_norm": 5.2592597007751465,
"learning_rate": 6.516064257028113e-06,
"loss": 0.187,
"num_input_tokens_seen": 404960,
"step": 650
},
{
"epoch": 1.3152610441767068,
"grad_norm": 6.9956183433532715,
"learning_rate": 6.566265060240964e-06,
"loss": 0.1458,
"num_input_tokens_seen": 408032,
"step": 655
},
{
"epoch": 1.3253012048192772,
"grad_norm": 3.2474989891052246,
"learning_rate": 6.616465863453816e-06,
"loss": 0.165,
"num_input_tokens_seen": 411968,
"step": 660
},
{
"epoch": 1.3353413654618473,
"grad_norm": 8.723363876342773,
"learning_rate": 6.666666666666667e-06,
"loss": 0.1554,
"num_input_tokens_seen": 415168,
"step": 665
},
{
"epoch": 1.3453815261044177,
"grad_norm": 14.606755256652832,
"learning_rate": 6.716867469879519e-06,
"loss": 0.1623,
"num_input_tokens_seen": 418112,
"step": 670
},
{
"epoch": 1.355421686746988,
"grad_norm": 5.352765083312988,
"learning_rate": 6.76706827309237e-06,
"loss": 0.1602,
"num_input_tokens_seen": 420608,
"step": 675
},
{
"epoch": 1.3654618473895583,
"grad_norm": 4.150283336639404,
"learning_rate": 6.8172690763052215e-06,
"loss": 0.1651,
"num_input_tokens_seen": 423680,
"step": 680
},
{
"epoch": 1.3755020080321285,
"grad_norm": 4.1706061363220215,
"learning_rate": 6.867469879518073e-06,
"loss": 0.15,
"num_input_tokens_seen": 426368,
"step": 685
},
{
"epoch": 1.3855421686746987,
"grad_norm": 9.573399543762207,
"learning_rate": 6.917670682730925e-06,
"loss": 0.1684,
"num_input_tokens_seen": 429568,
"step": 690
},
{
"epoch": 1.395582329317269,
"grad_norm": 24.372079849243164,
"learning_rate": 6.9678714859437756e-06,
"loss": 0.1996,
"num_input_tokens_seen": 432320,
"step": 695
},
{
"epoch": 1.4056224899598393,
"grad_norm": 14.810078620910645,
"learning_rate": 7.018072289156627e-06,
"loss": 0.2936,
"num_input_tokens_seen": 435360,
"step": 700
},
{
"epoch": 1.4156626506024097,
"grad_norm": 2.108583927154541,
"learning_rate": 7.068273092369478e-06,
"loss": 0.1966,
"num_input_tokens_seen": 438112,
"step": 705
},
{
"epoch": 1.4257028112449799,
"grad_norm": 7.824947357177734,
"learning_rate": 7.11847389558233e-06,
"loss": 0.2228,
"num_input_tokens_seen": 440672,
"step": 710
},
{
"epoch": 1.4357429718875503,
"grad_norm": 4.722110271453857,
"learning_rate": 7.168674698795182e-06,
"loss": 0.1611,
"num_input_tokens_seen": 443424,
"step": 715
},
{
"epoch": 1.4457831325301205,
"grad_norm": 9.50132942199707,
"learning_rate": 7.218875502008032e-06,
"loss": 0.1893,
"num_input_tokens_seen": 446528,
"step": 720
},
{
"epoch": 1.4558232931726907,
"grad_norm": 4.900175094604492,
"learning_rate": 7.2690763052208845e-06,
"loss": 0.164,
"num_input_tokens_seen": 450016,
"step": 725
},
{
"epoch": 1.465863453815261,
"grad_norm": 4.841922760009766,
"learning_rate": 7.319277108433736e-06,
"loss": 0.1482,
"num_input_tokens_seen": 453664,
"step": 730
},
{
"epoch": 1.4759036144578312,
"grad_norm": 3.754279851913452,
"learning_rate": 7.369477911646586e-06,
"loss": 0.1891,
"num_input_tokens_seen": 457216,
"step": 735
},
{
"epoch": 1.4859437751004017,
"grad_norm": 8.406981468200684,
"learning_rate": 7.4196787148594385e-06,
"loss": 0.1748,
"num_input_tokens_seen": 460416,
"step": 740
},
{
"epoch": 1.4959839357429718,
"grad_norm": 6.8733015060424805,
"learning_rate": 7.469879518072289e-06,
"loss": 0.1654,
"num_input_tokens_seen": 463776,
"step": 745
},
{
"epoch": 1.5060240963855422,
"grad_norm": 6.487129211425781,
"learning_rate": 7.520080321285141e-06,
"loss": 0.1918,
"num_input_tokens_seen": 467200,
"step": 750
},
{
"epoch": 1.5160642570281124,
"grad_norm": 3.5012166500091553,
"learning_rate": 7.570281124497993e-06,
"loss": 0.1481,
"num_input_tokens_seen": 469728,
"step": 755
},
{
"epoch": 1.5261044176706826,
"grad_norm": 2.2680747509002686,
"learning_rate": 7.620481927710845e-06,
"loss": 0.1537,
"num_input_tokens_seen": 472832,
"step": 760
},
{
"epoch": 1.536144578313253,
"grad_norm": 6.076399326324463,
"learning_rate": 7.670682730923695e-06,
"loss": 0.1617,
"num_input_tokens_seen": 476256,
"step": 765
},
{
"epoch": 1.5461847389558234,
"grad_norm": 2.6769142150878906,
"learning_rate": 7.720883534136547e-06,
"loss": 0.1653,
"num_input_tokens_seen": 478784,
"step": 770
},
{
"epoch": 1.5562248995983936,
"grad_norm": 2.721581220626831,
"learning_rate": 7.771084337349398e-06,
"loss": 0.1399,
"num_input_tokens_seen": 481920,
"step": 775
},
{
"epoch": 1.5662650602409638,
"grad_norm": 2.004193067550659,
"learning_rate": 7.82128514056225e-06,
"loss": 0.1345,
"num_input_tokens_seen": 485344,
"step": 780
},
{
"epoch": 1.5763052208835342,
"grad_norm": 5.72704553604126,
"learning_rate": 7.8714859437751e-06,
"loss": 0.1843,
"num_input_tokens_seen": 487968,
"step": 785
},
{
"epoch": 1.5863453815261044,
"grad_norm": 4.490149021148682,
"learning_rate": 7.921686746987952e-06,
"loss": 0.1692,
"num_input_tokens_seen": 490880,
"step": 790
},
{
"epoch": 1.5963855421686746,
"grad_norm": 2.184390068054199,
"learning_rate": 7.971887550200803e-06,
"loss": 0.1647,
"num_input_tokens_seen": 493568,
"step": 795
},
{
"epoch": 1.606425702811245,
"grad_norm": 6.28218412399292,
"learning_rate": 8.022088353413655e-06,
"loss": 0.1725,
"num_input_tokens_seen": 497184,
"step": 800
},
{
"epoch": 1.6164658634538154,
"grad_norm": 3.339231014251709,
"learning_rate": 8.072289156626508e-06,
"loss": 0.1703,
"num_input_tokens_seen": 499680,
"step": 805
},
{
"epoch": 1.6265060240963856,
"grad_norm": 3.661980152130127,
"learning_rate": 8.122489959839357e-06,
"loss": 0.1438,
"num_input_tokens_seen": 502656,
"step": 810
},
{
"epoch": 1.6365461847389557,
"grad_norm": 1.343388557434082,
"learning_rate": 8.172690763052209e-06,
"loss": 0.1673,
"num_input_tokens_seen": 505056,
"step": 815
},
{
"epoch": 1.6465863453815262,
"grad_norm": 2.127262592315674,
"learning_rate": 8.222891566265062e-06,
"loss": 0.1565,
"num_input_tokens_seen": 508320,
"step": 820
},
{
"epoch": 1.6566265060240963,
"grad_norm": 4.568302631378174,
"learning_rate": 8.273092369477911e-06,
"loss": 0.1429,
"num_input_tokens_seen": 512032,
"step": 825
},
{
"epoch": 1.6666666666666665,
"grad_norm": 6.50462007522583,
"learning_rate": 8.323293172690764e-06,
"loss": 0.2157,
"num_input_tokens_seen": 514752,
"step": 830
},
{
"epoch": 1.676706827309237,
"grad_norm": 3.402589797973633,
"learning_rate": 8.373493975903614e-06,
"loss": 0.1982,
"num_input_tokens_seen": 518176,
"step": 835
},
{
"epoch": 1.6867469879518073,
"grad_norm": 6.337620258331299,
"learning_rate": 8.423694779116467e-06,
"loss": 0.1626,
"num_input_tokens_seen": 521216,
"step": 840
},
{
"epoch": 1.6967871485943775,
"grad_norm": 4.079775810241699,
"learning_rate": 8.473895582329319e-06,
"loss": 0.1674,
"num_input_tokens_seen": 525216,
"step": 845
},
{
"epoch": 1.7068273092369477,
"grad_norm": 7.396521091461182,
"learning_rate": 8.52409638554217e-06,
"loss": 0.1472,
"num_input_tokens_seen": 528896,
"step": 850
},
{
"epoch": 1.716867469879518,
"grad_norm": 9.742034912109375,
"learning_rate": 8.574297188755021e-06,
"loss": 0.2251,
"num_input_tokens_seen": 531744,
"step": 855
},
{
"epoch": 1.7269076305220885,
"grad_norm": 6.10853385925293,
"learning_rate": 8.624497991967873e-06,
"loss": 0.1385,
"num_input_tokens_seen": 534464,
"step": 860
},
{
"epoch": 1.7369477911646585,
"grad_norm": 1.38788640499115,
"learning_rate": 8.674698795180724e-06,
"loss": 0.1555,
"num_input_tokens_seen": 538208,
"step": 865
},
{
"epoch": 1.7469879518072289,
"grad_norm": 3.5134339332580566,
"learning_rate": 8.724899598393575e-06,
"loss": 0.1201,
"num_input_tokens_seen": 541440,
"step": 870
},
{
"epoch": 1.7570281124497993,
"grad_norm": 4.719893932342529,
"learning_rate": 8.775100401606427e-06,
"loss": 0.2551,
"num_input_tokens_seen": 543968,
"step": 875
},
{
"epoch": 1.7670682730923695,
"grad_norm": 8.232791900634766,
"learning_rate": 8.825301204819278e-06,
"loss": 0.1633,
"num_input_tokens_seen": 546752,
"step": 880
},
{
"epoch": 1.7771084337349397,
"grad_norm": 2.3899688720703125,
"learning_rate": 8.87550200803213e-06,
"loss": 0.1441,
"num_input_tokens_seen": 549632,
"step": 885
},
{
"epoch": 1.78714859437751,
"grad_norm": 5.850103378295898,
"learning_rate": 8.92570281124498e-06,
"loss": 0.1765,
"num_input_tokens_seen": 552736,
"step": 890
},
{
"epoch": 1.7971887550200805,
"grad_norm": 2.2977206707000732,
"learning_rate": 8.975903614457832e-06,
"loss": 0.1459,
"num_input_tokens_seen": 555808,
"step": 895
},
{
"epoch": 1.8072289156626506,
"grad_norm": 2.4535903930664062,
"learning_rate": 9.026104417670683e-06,
"loss": 0.1618,
"num_input_tokens_seen": 558624,
"step": 900
},
{
"epoch": 1.8172690763052208,
"grad_norm": 2.2958626747131348,
"learning_rate": 9.076305220883535e-06,
"loss": 0.1617,
"num_input_tokens_seen": 562368,
"step": 905
},
{
"epoch": 1.8273092369477912,
"grad_norm": 2.7728888988494873,
"learning_rate": 9.126506024096386e-06,
"loss": 0.1613,
"num_input_tokens_seen": 565824,
"step": 910
},
{
"epoch": 1.8373493975903614,
"grad_norm": 8.566793441772461,
"learning_rate": 9.176706827309237e-06,
"loss": 0.1839,
"num_input_tokens_seen": 569888,
"step": 915
},
{
"epoch": 1.8473895582329316,
"grad_norm": 8.40902042388916,
"learning_rate": 9.226907630522089e-06,
"loss": 0.216,
"num_input_tokens_seen": 572160,
"step": 920
},
{
"epoch": 1.857429718875502,
"grad_norm": 2.0097877979278564,
"learning_rate": 9.27710843373494e-06,
"loss": 0.1682,
"num_input_tokens_seen": 575072,
"step": 925
},
{
"epoch": 1.8674698795180724,
"grad_norm": 2.2905232906341553,
"learning_rate": 9.327309236947793e-06,
"loss": 0.1811,
"num_input_tokens_seen": 578112,
"step": 930
},
{
"epoch": 1.8775100401606426,
"grad_norm": 4.189212322235107,
"learning_rate": 9.377510040160643e-06,
"loss": 0.1639,
"num_input_tokens_seen": 581248,
"step": 935
},
{
"epoch": 1.8875502008032128,
"grad_norm": 3.09934401512146,
"learning_rate": 9.427710843373494e-06,
"loss": 0.1592,
"num_input_tokens_seen": 584608,
"step": 940
},
{
"epoch": 1.8975903614457832,
"grad_norm": 2.5684139728546143,
"learning_rate": 9.477911646586347e-06,
"loss": 0.1492,
"num_input_tokens_seen": 588192,
"step": 945
},
{
"epoch": 1.9076305220883534,
"grad_norm": 2.9508438110351562,
"learning_rate": 9.528112449799197e-06,
"loss": 0.1581,
"num_input_tokens_seen": 591552,
"step": 950
},
{
"epoch": 1.9176706827309236,
"grad_norm": 4.4684247970581055,
"learning_rate": 9.57831325301205e-06,
"loss": 0.1515,
"num_input_tokens_seen": 594784,
"step": 955
},
{
"epoch": 1.927710843373494,
"grad_norm": 4.187466144561768,
"learning_rate": 9.6285140562249e-06,
"loss": 0.1574,
"num_input_tokens_seen": 598016,
"step": 960
},
{
"epoch": 1.9377510040160644,
"grad_norm": 1.2344138622283936,
"learning_rate": 9.678714859437753e-06,
"loss": 0.166,
"num_input_tokens_seen": 601216,
"step": 965
},
{
"epoch": 1.9477911646586346,
"grad_norm": 5.241006374359131,
"learning_rate": 9.728915662650604e-06,
"loss": 0.1514,
"num_input_tokens_seen": 604640,
"step": 970
},
{
"epoch": 1.9578313253012047,
"grad_norm": 1.7580270767211914,
"learning_rate": 9.779116465863454e-06,
"loss": 0.1461,
"num_input_tokens_seen": 607680,
"step": 975
},
{
"epoch": 1.9678714859437751,
"grad_norm": 2.8425869941711426,
"learning_rate": 9.829317269076307e-06,
"loss": 0.1787,
"num_input_tokens_seen": 611296,
"step": 980
},
{
"epoch": 1.9779116465863453,
"grad_norm": 2.437546730041504,
"learning_rate": 9.879518072289156e-06,
"loss": 0.152,
"num_input_tokens_seen": 614912,
"step": 985
},
{
"epoch": 1.9879518072289155,
"grad_norm": 5.1114726066589355,
"learning_rate": 9.92971887550201e-06,
"loss": 0.2004,
"num_input_tokens_seen": 618272,
"step": 990
},
{
"epoch": 1.997991967871486,
"grad_norm": 3.159273147583008,
"learning_rate": 9.97991967871486e-06,
"loss": 0.1312,
"num_input_tokens_seen": 621824,
"step": 995
},
{
"epoch": 2.0,
"eval_loss": 0.16479995846748352,
"eval_runtime": 8.0659,
"eval_samples_per_second": 61.742,
"eval_steps_per_second": 15.497,
"num_input_tokens_seen": 622720,
"step": 996
},
{
"epoch": 2.0080321285140563,
"grad_norm": 0.732082724571228,
"learning_rate": 9.999997236378723e-06,
"loss": 0.1477,
"num_input_tokens_seen": 625344,
"step": 1000
},
{
"epoch": 2.0180722891566263,
"grad_norm": 1.478649616241455,
"learning_rate": 9.999980347593088e-06,
"loss": 0.1797,
"num_input_tokens_seen": 629056,
"step": 1005
},
{
"epoch": 2.0281124497991967,
"grad_norm": 1.8761229515075684,
"learning_rate": 9.999948105418771e-06,
"loss": 0.1524,
"num_input_tokens_seen": 632640,
"step": 1010
},
{
"epoch": 2.038152610441767,
"grad_norm": 2.1610300540924072,
"learning_rate": 9.999900509954779e-06,
"loss": 0.1683,
"num_input_tokens_seen": 636032,
"step": 1015
},
{
"epoch": 2.0481927710843375,
"grad_norm": 2.8162364959716797,
"learning_rate": 9.999837561347259e-06,
"loss": 0.1756,
"num_input_tokens_seen": 639008,
"step": 1020
},
{
"epoch": 2.0582329317269075,
"grad_norm": 0.9307097792625427,
"learning_rate": 9.99975925978951e-06,
"loss": 0.1659,
"num_input_tokens_seen": 642048,
"step": 1025
},
{
"epoch": 2.068273092369478,
"grad_norm": 2.646435260772705,
"learning_rate": 9.99966560552197e-06,
"loss": 0.1371,
"num_input_tokens_seen": 645312,
"step": 1030
},
{
"epoch": 2.0783132530120483,
"grad_norm": 3.3150744438171387,
"learning_rate": 9.999556598832224e-06,
"loss": 0.1625,
"num_input_tokens_seen": 648864,
"step": 1035
},
{
"epoch": 2.0883534136546187,
"grad_norm": 3.675476551055908,
"learning_rate": 9.999432240054994e-06,
"loss": 0.1701,
"num_input_tokens_seen": 651616,
"step": 1040
},
{
"epoch": 2.0983935742971886,
"grad_norm": 4.225813865661621,
"learning_rate": 9.999292529572152e-06,
"loss": 0.1761,
"num_input_tokens_seen": 654912,
"step": 1045
},
{
"epoch": 2.108433734939759,
"grad_norm": 5.603430271148682,
"learning_rate": 9.9991374678127e-06,
"loss": 0.1769,
"num_input_tokens_seen": 658400,
"step": 1050
},
{
"epoch": 2.1184738955823295,
"grad_norm": 2.353452205657959,
"learning_rate": 9.998967055252791e-06,
"loss": 0.1754,
"num_input_tokens_seen": 661536,
"step": 1055
},
{
"epoch": 2.1285140562248994,
"grad_norm": 2.809842109680176,
"learning_rate": 9.998781292415705e-06,
"loss": 0.1485,
"num_input_tokens_seen": 664736,
"step": 1060
},
{
"epoch": 2.13855421686747,
"grad_norm": 1.9131516218185425,
"learning_rate": 9.998580179871864e-06,
"loss": 0.1628,
"num_input_tokens_seen": 667520,
"step": 1065
},
{
"epoch": 2.1485943775100402,
"grad_norm": 3.7786145210266113,
"learning_rate": 9.998363718238819e-06,
"loss": 0.1636,
"num_input_tokens_seen": 670976,
"step": 1070
},
{
"epoch": 2.1586345381526106,
"grad_norm": 5.997560501098633,
"learning_rate": 9.998131908181262e-06,
"loss": 0.1487,
"num_input_tokens_seen": 674816,
"step": 1075
},
{
"epoch": 2.1686746987951806,
"grad_norm": 2.7555580139160156,
"learning_rate": 9.997884750411004e-06,
"loss": 0.153,
"num_input_tokens_seen": 678080,
"step": 1080
},
{
"epoch": 2.178714859437751,
"grad_norm": 2.348747730255127,
"learning_rate": 9.997622245686993e-06,
"loss": 0.1557,
"num_input_tokens_seen": 680736,
"step": 1085
},
{
"epoch": 2.1887550200803214,
"grad_norm": 3.540637493133545,
"learning_rate": 9.997344394815298e-06,
"loss": 0.1665,
"num_input_tokens_seen": 684064,
"step": 1090
},
{
"epoch": 2.1987951807228914,
"grad_norm": 4.5958476066589355,
"learning_rate": 9.997051198649117e-06,
"loss": 0.1743,
"num_input_tokens_seen": 686560,
"step": 1095
},
{
"epoch": 2.208835341365462,
"grad_norm": 1.7110495567321777,
"learning_rate": 9.996742658088759e-06,
"loss": 0.183,
"num_input_tokens_seen": 689312,
"step": 1100
},
{
"epoch": 2.218875502008032,
"grad_norm": 3.817140817642212,
"learning_rate": 9.996418774081658e-06,
"loss": 0.1928,
"num_input_tokens_seen": 692320,
"step": 1105
},
{
"epoch": 2.2289156626506026,
"grad_norm": 0.9928373098373413,
"learning_rate": 9.996079547622362e-06,
"loss": 0.1581,
"num_input_tokens_seen": 695040,
"step": 1110
},
{
"epoch": 2.2389558232931726,
"grad_norm": 1.4945827722549438,
"learning_rate": 9.995724979752533e-06,
"loss": 0.1677,
"num_input_tokens_seen": 697952,
"step": 1115
},
{
"epoch": 2.248995983935743,
"grad_norm": 3.399029016494751,
"learning_rate": 9.995355071560933e-06,
"loss": 0.1657,
"num_input_tokens_seen": 700992,
"step": 1120
},
{
"epoch": 2.2590361445783134,
"grad_norm": 5.047579288482666,
"learning_rate": 9.994969824183441e-06,
"loss": 0.157,
"num_input_tokens_seen": 703936,
"step": 1125
},
{
"epoch": 2.2690763052208833,
"grad_norm": 2.0834877490997314,
"learning_rate": 9.994569238803027e-06,
"loss": 0.1444,
"num_input_tokens_seen": 707424,
"step": 1130
},
{
"epoch": 2.2791164658634537,
"grad_norm": 2.91137957572937,
"learning_rate": 9.994153316649769e-06,
"loss": 0.1327,
"num_input_tokens_seen": 710592,
"step": 1135
},
{
"epoch": 2.289156626506024,
"grad_norm": 11.498336791992188,
"learning_rate": 9.993722059000833e-06,
"loss": 0.2246,
"num_input_tokens_seen": 714368,
"step": 1140
},
{
"epoch": 2.2991967871485945,
"grad_norm": 4.052079677581787,
"learning_rate": 9.993275467180476e-06,
"loss": 0.1575,
"num_input_tokens_seen": 717344,
"step": 1145
},
{
"epoch": 2.3092369477911645,
"grad_norm": 1.9310601949691772,
"learning_rate": 9.992813542560045e-06,
"loss": 0.1554,
"num_input_tokens_seen": 720576,
"step": 1150
},
{
"epoch": 2.319277108433735,
"grad_norm": 9.181812286376953,
"learning_rate": 9.992336286557967e-06,
"loss": 0.1799,
"num_input_tokens_seen": 723296,
"step": 1155
},
{
"epoch": 2.3293172690763053,
"grad_norm": 1.3441740274429321,
"learning_rate": 9.991843700639747e-06,
"loss": 0.1408,
"num_input_tokens_seen": 726720,
"step": 1160
},
{
"epoch": 2.3393574297188753,
"grad_norm": 5.780555248260498,
"learning_rate": 9.991335786317964e-06,
"loss": 0.1704,
"num_input_tokens_seen": 730240,
"step": 1165
},
{
"epoch": 2.3493975903614457,
"grad_norm": 1.7718957662582397,
"learning_rate": 9.990812545152264e-06,
"loss": 0.1711,
"num_input_tokens_seen": 733984,
"step": 1170
},
{
"epoch": 2.359437751004016,
"grad_norm": 1.9247894287109375,
"learning_rate": 9.990273978749358e-06,
"loss": 0.1465,
"num_input_tokens_seen": 737056,
"step": 1175
},
{
"epoch": 2.3694779116465865,
"grad_norm": 2.258086681365967,
"learning_rate": 9.98972008876302e-06,
"loss": 0.1695,
"num_input_tokens_seen": 739744,
"step": 1180
},
{
"epoch": 2.3795180722891565,
"grad_norm": 1.809032917022705,
"learning_rate": 9.98915087689407e-06,
"loss": 0.169,
"num_input_tokens_seen": 742912,
"step": 1185
},
{
"epoch": 2.389558232931727,
"grad_norm": 2.019859552383423,
"learning_rate": 9.988566344890383e-06,
"loss": 0.1525,
"num_input_tokens_seen": 746400,
"step": 1190
},
{
"epoch": 2.3995983935742973,
"grad_norm": 1.4076141119003296,
"learning_rate": 9.987966494546873e-06,
"loss": 0.1502,
"num_input_tokens_seen": 750144,
"step": 1195
},
{
"epoch": 2.4096385542168672,
"grad_norm": 1.8707914352416992,
"learning_rate": 9.987351327705498e-06,
"loss": 0.1395,
"num_input_tokens_seen": 752640,
"step": 1200
},
{
"epoch": 2.4196787148594376,
"grad_norm": 3.6235058307647705,
"learning_rate": 9.986720846255244e-06,
"loss": 0.1586,
"num_input_tokens_seen": 755584,
"step": 1205
},
{
"epoch": 2.429718875502008,
"grad_norm": 1.7184830904006958,
"learning_rate": 9.986075052132124e-06,
"loss": 0.1645,
"num_input_tokens_seen": 758656,
"step": 1210
},
{
"epoch": 2.4397590361445785,
"grad_norm": 2.1086556911468506,
"learning_rate": 9.98541394731917e-06,
"loss": 0.149,
"num_input_tokens_seen": 761280,
"step": 1215
},
{
"epoch": 2.4497991967871484,
"grad_norm": 1.5263396501541138,
"learning_rate": 9.984737533846429e-06,
"loss": 0.1714,
"num_input_tokens_seen": 765440,
"step": 1220
},
{
"epoch": 2.459839357429719,
"grad_norm": 2.3205933570861816,
"learning_rate": 9.984045813790959e-06,
"loss": 0.1557,
"num_input_tokens_seen": 769184,
"step": 1225
},
{
"epoch": 2.4698795180722892,
"grad_norm": 1.9002354145050049,
"learning_rate": 9.983338789276817e-06,
"loss": 0.1521,
"num_input_tokens_seen": 771584,
"step": 1230
},
{
"epoch": 2.479919678714859,
"grad_norm": 1.5902975797653198,
"learning_rate": 9.982616462475055e-06,
"loss": 0.1492,
"num_input_tokens_seen": 774464,
"step": 1235
},
{
"epoch": 2.4899598393574296,
"grad_norm": 3.1418979167938232,
"learning_rate": 9.981878835603718e-06,
"loss": 0.171,
"num_input_tokens_seen": 777568,
"step": 1240
},
{
"epoch": 2.5,
"grad_norm": 1.7949987649917603,
"learning_rate": 9.981125910927824e-06,
"loss": 0.1484,
"num_input_tokens_seen": 779936,
"step": 1245
},
{
"epoch": 2.5100401606425704,
"grad_norm": 0.7500330805778503,
"learning_rate": 9.980357690759376e-06,
"loss": 0.1717,
"num_input_tokens_seen": 782752,
"step": 1250
},
{
"epoch": 2.520080321285141,
"grad_norm": 1.0766161680221558,
"learning_rate": 9.979574177457337e-06,
"loss": 0.1508,
"num_input_tokens_seen": 785984,
"step": 1255
},
{
"epoch": 2.5301204819277108,
"grad_norm": 2.405531406402588,
"learning_rate": 9.978775373427634e-06,
"loss": 0.1551,
"num_input_tokens_seen": 789280,
"step": 1260
},
{
"epoch": 2.540160642570281,
"grad_norm": 2.476111888885498,
"learning_rate": 9.977961281123146e-06,
"loss": 0.1623,
"num_input_tokens_seen": 792384,
"step": 1265
},
{
"epoch": 2.550200803212851,
"grad_norm": 4.53995418548584,
"learning_rate": 9.9771319030437e-06,
"loss": 0.1581,
"num_input_tokens_seen": 795200,
"step": 1270
},
{
"epoch": 2.5602409638554215,
"grad_norm": 3.0957741737365723,
"learning_rate": 9.976287241736055e-06,
"loss": 0.1561,
"num_input_tokens_seen": 798144,
"step": 1275
},
{
"epoch": 2.570281124497992,
"grad_norm": 2.2821428775787354,
"learning_rate": 9.975427299793908e-06,
"loss": 0.1669,
"num_input_tokens_seen": 800992,
"step": 1280
},
{
"epoch": 2.5803212851405624,
"grad_norm": 2.018181324005127,
"learning_rate": 9.974552079857873e-06,
"loss": 0.1416,
"num_input_tokens_seen": 803872,
"step": 1285
},
{
"epoch": 2.5903614457831328,
"grad_norm": 1.4955519437789917,
"learning_rate": 9.973661584615476e-06,
"loss": 0.1636,
"num_input_tokens_seen": 807616,
"step": 1290
},
{
"epoch": 2.6004016064257027,
"grad_norm": 2.3408939838409424,
"learning_rate": 9.972755816801155e-06,
"loss": 0.1555,
"num_input_tokens_seen": 810592,
"step": 1295
},
{
"epoch": 2.610441767068273,
"grad_norm": 12.589311599731445,
"learning_rate": 9.971834779196238e-06,
"loss": 0.1738,
"num_input_tokens_seen": 812992,
"step": 1300
},
{
"epoch": 2.6204819277108435,
"grad_norm": 2.955625295639038,
"learning_rate": 9.970898474628951e-06,
"loss": 0.1559,
"num_input_tokens_seen": 816544,
"step": 1305
},
{
"epoch": 2.6305220883534135,
"grad_norm": 3.5705697536468506,
"learning_rate": 9.969946905974392e-06,
"loss": 0.1491,
"num_input_tokens_seen": 819904,
"step": 1310
},
{
"epoch": 2.640562248995984,
"grad_norm": 3.9580843448638916,
"learning_rate": 9.968980076154533e-06,
"loss": 0.1472,
"num_input_tokens_seen": 822848,
"step": 1315
},
{
"epoch": 2.6506024096385543,
"grad_norm": 2.570261001586914,
"learning_rate": 9.96799798813821e-06,
"loss": 0.1526,
"num_input_tokens_seen": 825696,
"step": 1320
},
{
"epoch": 2.6606425702811247,
"grad_norm": 2.2151670455932617,
"learning_rate": 9.96700064494111e-06,
"loss": 0.138,
"num_input_tokens_seen": 828704,
"step": 1325
},
{
"epoch": 2.6706827309236947,
"grad_norm": 3.911294937133789,
"learning_rate": 9.965988049625763e-06,
"loss": 0.1962,
"num_input_tokens_seen": 831744,
"step": 1330
},
{
"epoch": 2.680722891566265,
"grad_norm": 3.1704447269439697,
"learning_rate": 9.964960205301534e-06,
"loss": 0.1459,
"num_input_tokens_seen": 834720,
"step": 1335
},
{
"epoch": 2.6907630522088355,
"grad_norm": 3.241382360458374,
"learning_rate": 9.963917115124621e-06,
"loss": 0.1723,
"num_input_tokens_seen": 838048,
"step": 1340
},
{
"epoch": 2.7008032128514055,
"grad_norm": 3.085642099380493,
"learning_rate": 9.962858782298023e-06,
"loss": 0.1566,
"num_input_tokens_seen": 841216,
"step": 1345
},
{
"epoch": 2.710843373493976,
"grad_norm": 3.294822931289673,
"learning_rate": 9.961785210071554e-06,
"loss": 0.1866,
"num_input_tokens_seen": 844576,
"step": 1350
},
{
"epoch": 2.7208835341365463,
"grad_norm": 1.466409683227539,
"learning_rate": 9.960696401741825e-06,
"loss": 0.1571,
"num_input_tokens_seen": 847872,
"step": 1355
},
{
"epoch": 2.7309236947791167,
"grad_norm": 1.720889687538147,
"learning_rate": 9.959592360652224e-06,
"loss": 0.1448,
"num_input_tokens_seen": 850848,
"step": 1360
},
{
"epoch": 2.7409638554216866,
"grad_norm": 2.6980252265930176,
"learning_rate": 9.95847309019292e-06,
"loss": 0.1496,
"num_input_tokens_seen": 853664,
"step": 1365
},
{
"epoch": 2.751004016064257,
"grad_norm": 3.6680150032043457,
"learning_rate": 9.957338593800844e-06,
"loss": 0.1483,
"num_input_tokens_seen": 856928,
"step": 1370
},
{
"epoch": 2.7610441767068274,
"grad_norm": 8.08189868927002,
"learning_rate": 9.956188874959686e-06,
"loss": 0.1877,
"num_input_tokens_seen": 860192,
"step": 1375
},
{
"epoch": 2.7710843373493974,
"grad_norm": 3.0358712673187256,
"learning_rate": 9.955023937199876e-06,
"loss": 0.1748,
"num_input_tokens_seen": 863616,
"step": 1380
},
{
"epoch": 2.781124497991968,
"grad_norm": 2.551198720932007,
"learning_rate": 9.953843784098573e-06,
"loss": 0.1268,
"num_input_tokens_seen": 867296,
"step": 1385
},
{
"epoch": 2.791164658634538,
"grad_norm": 3.2741429805755615,
"learning_rate": 9.952648419279662e-06,
"loss": 0.1956,
"num_input_tokens_seen": 870368,
"step": 1390
},
{
"epoch": 2.8012048192771086,
"grad_norm": 4.114120006561279,
"learning_rate": 9.951437846413738e-06,
"loss": 0.2096,
"num_input_tokens_seen": 873472,
"step": 1395
},
{
"epoch": 2.8112449799196786,
"grad_norm": 3.035440444946289,
"learning_rate": 9.950212069218095e-06,
"loss": 0.1534,
"num_input_tokens_seen": 876224,
"step": 1400
},
{
"epoch": 2.821285140562249,
"grad_norm": 2.0006754398345947,
"learning_rate": 9.948971091456715e-06,
"loss": 0.1606,
"num_input_tokens_seen": 879392,
"step": 1405
},
{
"epoch": 2.8313253012048194,
"grad_norm": 3.954497814178467,
"learning_rate": 9.947714916940257e-06,
"loss": 0.1265,
"num_input_tokens_seen": 882656,
"step": 1410
},
{
"epoch": 2.8413654618473894,
"grad_norm": 4.2772417068481445,
"learning_rate": 9.946443549526041e-06,
"loss": 0.1417,
"num_input_tokens_seen": 885696,
"step": 1415
},
{
"epoch": 2.8514056224899598,
"grad_norm": 4.809564590454102,
"learning_rate": 9.945156993118042e-06,
"loss": 0.1702,
"num_input_tokens_seen": 888640,
"step": 1420
},
{
"epoch": 2.86144578313253,
"grad_norm": 3.161689043045044,
"learning_rate": 9.943855251666873e-06,
"loss": 0.1291,
"num_input_tokens_seen": 892384,
"step": 1425
},
{
"epoch": 2.8714859437751006,
"grad_norm": 2.3096818923950195,
"learning_rate": 9.942538329169786e-06,
"loss": 0.1453,
"num_input_tokens_seen": 895328,
"step": 1430
},
{
"epoch": 2.8815261044176705,
"grad_norm": 12.994061470031738,
"learning_rate": 9.941206229670634e-06,
"loss": 0.181,
"num_input_tokens_seen": 897952,
"step": 1435
},
{
"epoch": 2.891566265060241,
"grad_norm": 3.6042988300323486,
"learning_rate": 9.939858957259887e-06,
"loss": 0.1356,
"num_input_tokens_seen": 901792,
"step": 1440
},
{
"epoch": 2.9016064257028114,
"grad_norm": 3.180940866470337,
"learning_rate": 9.938496516074597e-06,
"loss": 0.1256,
"num_input_tokens_seen": 905664,
"step": 1445
},
{
"epoch": 2.9116465863453813,
"grad_norm": 5.906370162963867,
"learning_rate": 9.937118910298398e-06,
"loss": 0.1685,
"num_input_tokens_seen": 907904,
"step": 1450
},
{
"epoch": 2.9216867469879517,
"grad_norm": 3.592151403427124,
"learning_rate": 9.935726144161492e-06,
"loss": 0.1314,
"num_input_tokens_seen": 910816,
"step": 1455
},
{
"epoch": 2.931726907630522,
"grad_norm": 4.761033058166504,
"learning_rate": 9.934318221940632e-06,
"loss": 0.1309,
"num_input_tokens_seen": 913568,
"step": 1460
},
{
"epoch": 2.9417670682730925,
"grad_norm": 8.582740783691406,
"learning_rate": 9.932895147959106e-06,
"loss": 0.3052,
"num_input_tokens_seen": 916320,
"step": 1465
},
{
"epoch": 2.9518072289156625,
"grad_norm": 5.78909158706665,
"learning_rate": 9.931456926586738e-06,
"loss": 0.1818,
"num_input_tokens_seen": 919136,
"step": 1470
},
{
"epoch": 2.961847389558233,
"grad_norm": 3.7451789379119873,
"learning_rate": 9.930003562239858e-06,
"loss": 0.1883,
"num_input_tokens_seen": 922080,
"step": 1475
},
{
"epoch": 2.9718875502008033,
"grad_norm": 2.4840102195739746,
"learning_rate": 9.928535059381298e-06,
"loss": 0.1681,
"num_input_tokens_seen": 925088,
"step": 1480
},
{
"epoch": 2.9819277108433733,
"grad_norm": 1.527234435081482,
"learning_rate": 9.927051422520373e-06,
"loss": 0.1436,
"num_input_tokens_seen": 928160,
"step": 1485
},
{
"epoch": 2.9919678714859437,
"grad_norm": 3.8078765869140625,
"learning_rate": 9.925552656212871e-06,
"loss": 0.1555,
"num_input_tokens_seen": 930688,
"step": 1490
},
{
"epoch": 3.002008032128514,
"grad_norm": 1.0531964302062988,
"learning_rate": 9.924038765061042e-06,
"loss": 0.1497,
"num_input_tokens_seen": 933504,
"step": 1495
},
{
"epoch": 3.0120481927710845,
"grad_norm": 2.8714663982391357,
"learning_rate": 9.922509753713572e-06,
"loss": 0.1453,
"num_input_tokens_seen": 936448,
"step": 1500
},
{
"epoch": 3.0220883534136544,
"grad_norm": 4.788018226623535,
"learning_rate": 9.920965626865582e-06,
"loss": 0.1549,
"num_input_tokens_seen": 939488,
"step": 1505
},
{
"epoch": 3.032128514056225,
"grad_norm": 4.411100387573242,
"learning_rate": 9.919406389258607e-06,
"loss": 0.145,
"num_input_tokens_seen": 942240,
"step": 1510
},
{
"epoch": 3.0421686746987953,
"grad_norm": 2.9873862266540527,
"learning_rate": 9.917832045680584e-06,
"loss": 0.1603,
"num_input_tokens_seen": 946048,
"step": 1515
},
{
"epoch": 3.0522088353413657,
"grad_norm": 5.309106826782227,
"learning_rate": 9.91624260096583e-06,
"loss": 0.1407,
"num_input_tokens_seen": 948672,
"step": 1520
},
{
"epoch": 3.0622489959839356,
"grad_norm": 4.250939846038818,
"learning_rate": 9.91463805999504e-06,
"loss": 0.1642,
"num_input_tokens_seen": 951744,
"step": 1525
},
{
"epoch": 3.072289156626506,
"grad_norm": 3.651625871658325,
"learning_rate": 9.913018427695257e-06,
"loss": 0.1516,
"num_input_tokens_seen": 955136,
"step": 1530
},
{
"epoch": 3.0823293172690764,
"grad_norm": 2.637655258178711,
"learning_rate": 9.911383709039876e-06,
"loss": 0.1336,
"num_input_tokens_seen": 958240,
"step": 1535
},
{
"epoch": 3.0923694779116464,
"grad_norm": 6.339487552642822,
"learning_rate": 9.909733909048606e-06,
"loss": 0.1601,
"num_input_tokens_seen": 961056,
"step": 1540
},
{
"epoch": 3.102409638554217,
"grad_norm": 3.671630382537842,
"learning_rate": 9.908069032787473e-06,
"loss": 0.1588,
"num_input_tokens_seen": 963808,
"step": 1545
},
{
"epoch": 3.112449799196787,
"grad_norm": 3.158716917037964,
"learning_rate": 9.906389085368792e-06,
"loss": 0.1487,
"num_input_tokens_seen": 967168,
"step": 1550
},
{
"epoch": 3.1224899598393576,
"grad_norm": 1.9967892169952393,
"learning_rate": 9.904694071951167e-06,
"loss": 0.1448,
"num_input_tokens_seen": 970272,
"step": 1555
},
{
"epoch": 3.1325301204819276,
"grad_norm": 2.418212413787842,
"learning_rate": 9.902983997739453e-06,
"loss": 0.1227,
"num_input_tokens_seen": 972960,
"step": 1560
},
{
"epoch": 3.142570281124498,
"grad_norm": 6.753594875335693,
"learning_rate": 9.90125886798476e-06,
"loss": 0.1283,
"num_input_tokens_seen": 976064,
"step": 1565
},
{
"epoch": 3.1526104417670684,
"grad_norm": 4.105684280395508,
"learning_rate": 9.899518687984424e-06,
"loss": 0.1485,
"num_input_tokens_seen": 979168,
"step": 1570
},
{
"epoch": 3.1626506024096384,
"grad_norm": 4.228103160858154,
"learning_rate": 9.897763463082e-06,
"loss": 0.1622,
"num_input_tokens_seen": 982528,
"step": 1575
},
{
"epoch": 3.1726907630522088,
"grad_norm": 4.657632350921631,
"learning_rate": 9.89599319866724e-06,
"loss": 0.1166,
"num_input_tokens_seen": 985472,
"step": 1580
},
{
"epoch": 3.182730923694779,
"grad_norm": 3.033367872238159,
"learning_rate": 9.894207900176074e-06,
"loss": 0.1433,
"num_input_tokens_seen": 988448,
"step": 1585
},
{
"epoch": 3.1927710843373496,
"grad_norm": 3.6265132427215576,
"learning_rate": 9.892407573090603e-06,
"loss": 0.1531,
"num_input_tokens_seen": 991392,
"step": 1590
},
{
"epoch": 3.2028112449799195,
"grad_norm": 3.417398691177368,
"learning_rate": 9.890592222939071e-06,
"loss": 0.191,
"num_input_tokens_seen": 993760,
"step": 1595
},
{
"epoch": 3.21285140562249,
"grad_norm": 2.1362621784210205,
"learning_rate": 9.888761855295855e-06,
"loss": 0.1723,
"num_input_tokens_seen": 997216,
"step": 1600
},
{
"epoch": 3.2228915662650603,
"grad_norm": 2.6160507202148438,
"learning_rate": 9.886916475781448e-06,
"loss": 0.1387,
"num_input_tokens_seen": 1000160,
"step": 1605
},
{
"epoch": 3.2329317269076308,
"grad_norm": 1.8726855516433716,
"learning_rate": 9.885056090062436e-06,
"loss": 0.1349,
"num_input_tokens_seen": 1003424,
"step": 1610
},
{
"epoch": 3.2429718875502007,
"grad_norm": 1.9157968759536743,
"learning_rate": 9.883180703851488e-06,
"loss": 0.1236,
"num_input_tokens_seen": 1006080,
"step": 1615
},
{
"epoch": 3.253012048192771,
"grad_norm": 4.427457809448242,
"learning_rate": 9.881290322907332e-06,
"loss": 0.1659,
"num_input_tokens_seen": 1009472,
"step": 1620
},
{
"epoch": 3.2630522088353415,
"grad_norm": 3.359321355819702,
"learning_rate": 9.879384953034745e-06,
"loss": 0.1175,
"num_input_tokens_seen": 1012576,
"step": 1625
},
{
"epoch": 3.2730923694779115,
"grad_norm": 4.155451774597168,
"learning_rate": 9.877464600084521e-06,
"loss": 0.1796,
"num_input_tokens_seen": 1015744,
"step": 1630
},
{
"epoch": 3.283132530120482,
"grad_norm": 3.184243679046631,
"learning_rate": 9.875529269953474e-06,
"loss": 0.1309,
"num_input_tokens_seen": 1018336,
"step": 1635
},
{
"epoch": 3.2931726907630523,
"grad_norm": 3.387753963470459,
"learning_rate": 9.873578968584399e-06,
"loss": 0.152,
"num_input_tokens_seen": 1021056,
"step": 1640
},
{
"epoch": 3.3032128514056227,
"grad_norm": 1.9127076864242554,
"learning_rate": 9.871613701966067e-06,
"loss": 0.1473,
"num_input_tokens_seen": 1024576,
"step": 1645
},
{
"epoch": 3.3132530120481927,
"grad_norm": 1.9807343482971191,
"learning_rate": 9.869633476133205e-06,
"loss": 0.1158,
"num_input_tokens_seen": 1027840,
"step": 1650
},
{
"epoch": 3.323293172690763,
"grad_norm": 2.7899370193481445,
"learning_rate": 9.867638297166467e-06,
"loss": 0.1114,
"num_input_tokens_seen": 1031232,
"step": 1655
},
{
"epoch": 3.3333333333333335,
"grad_norm": 7.52949333190918,
"learning_rate": 9.865628171192432e-06,
"loss": 0.1627,
"num_input_tokens_seen": 1034624,
"step": 1660
},
{
"epoch": 3.3433734939759034,
"grad_norm": 3.0961153507232666,
"learning_rate": 9.863603104383575e-06,
"loss": 0.1279,
"num_input_tokens_seen": 1037792,
"step": 1665
},
{
"epoch": 3.353413654618474,
"grad_norm": 4.504798889160156,
"learning_rate": 9.861563102958243e-06,
"loss": 0.1248,
"num_input_tokens_seen": 1040352,
"step": 1670
},
{
"epoch": 3.3634538152610443,
"grad_norm": 3.05240797996521,
"learning_rate": 9.859508173180653e-06,
"loss": 0.1567,
"num_input_tokens_seen": 1043328,
"step": 1675
},
{
"epoch": 3.3734939759036147,
"grad_norm": 2.5293796062469482,
"learning_rate": 9.857438321360853e-06,
"loss": 0.1332,
"num_input_tokens_seen": 1046912,
"step": 1680
},
{
"epoch": 3.3835341365461846,
"grad_norm": 3.3068597316741943,
"learning_rate": 9.855353553854719e-06,
"loss": 0.1333,
"num_input_tokens_seen": 1050272,
"step": 1685
},
{
"epoch": 3.393574297188755,
"grad_norm": 1.7536911964416504,
"learning_rate": 9.853253877063922e-06,
"loss": 0.1552,
"num_input_tokens_seen": 1052512,
"step": 1690
},
{
"epoch": 3.4036144578313254,
"grad_norm": 1.41140615940094,
"learning_rate": 9.85113929743592e-06,
"loss": 0.1335,
"num_input_tokens_seen": 1056000,
"step": 1695
},
{
"epoch": 3.4136546184738954,
"grad_norm": 9.492226600646973,
"learning_rate": 9.849009821463931e-06,
"loss": 0.179,
"num_input_tokens_seen": 1058624,
"step": 1700
},
{
"epoch": 3.423694779116466,
"grad_norm": 1.4923559427261353,
"learning_rate": 9.846865455686915e-06,
"loss": 0.1076,
"num_input_tokens_seen": 1061280,
"step": 1705
},
{
"epoch": 3.433734939759036,
"grad_norm": 2.582712173461914,
"learning_rate": 9.844706206689557e-06,
"loss": 0.1559,
"num_input_tokens_seen": 1064576,
"step": 1710
},
{
"epoch": 3.4437751004016066,
"grad_norm": 7.048799514770508,
"learning_rate": 9.842532081102234e-06,
"loss": 0.1566,
"num_input_tokens_seen": 1067232,
"step": 1715
},
{
"epoch": 3.4538152610441766,
"grad_norm": 3.5871365070343018,
"learning_rate": 9.840343085601018e-06,
"loss": 0.1286,
"num_input_tokens_seen": 1070624,
"step": 1720
},
{
"epoch": 3.463855421686747,
"grad_norm": 11.149718284606934,
"learning_rate": 9.838139226907631e-06,
"loss": 0.171,
"num_input_tokens_seen": 1074208,
"step": 1725
},
{
"epoch": 3.4738955823293174,
"grad_norm": 2.5568132400512695,
"learning_rate": 9.835920511789441e-06,
"loss": 0.1862,
"num_input_tokens_seen": 1078144,
"step": 1730
},
{
"epoch": 3.4839357429718874,
"grad_norm": 2.298809051513672,
"learning_rate": 9.833686947059436e-06,
"loss": 0.1129,
"num_input_tokens_seen": 1081728,
"step": 1735
},
{
"epoch": 3.4939759036144578,
"grad_norm": 2.6856706142425537,
"learning_rate": 9.831438539576194e-06,
"loss": 0.1192,
"num_input_tokens_seen": 1084320,
"step": 1740
},
{
"epoch": 3.504016064257028,
"grad_norm": 2.332166910171509,
"learning_rate": 9.829175296243885e-06,
"loss": 0.1328,
"num_input_tokens_seen": 1087168,
"step": 1745
},
{
"epoch": 3.5140562248995986,
"grad_norm": 3.9273953437805176,
"learning_rate": 9.826897224012221e-06,
"loss": 0.1197,
"num_input_tokens_seen": 1090304,
"step": 1750
},
{
"epoch": 3.5240963855421685,
"grad_norm": 2.3180413246154785,
"learning_rate": 9.82460432987646e-06,
"loss": 0.1532,
"num_input_tokens_seen": 1093248,
"step": 1755
},
{
"epoch": 3.534136546184739,
"grad_norm": 3.322129011154175,
"learning_rate": 9.822296620877364e-06,
"loss": 0.1532,
"num_input_tokens_seen": 1096160,
"step": 1760
},
{
"epoch": 3.5441767068273093,
"grad_norm": 2.576061248779297,
"learning_rate": 9.819974104101198e-06,
"loss": 0.1268,
"num_input_tokens_seen": 1099712,
"step": 1765
},
{
"epoch": 3.5542168674698793,
"grad_norm": 2.1420481204986572,
"learning_rate": 9.817636786679682e-06,
"loss": 0.1301,
"num_input_tokens_seen": 1102528,
"step": 1770
},
{
"epoch": 3.5642570281124497,
"grad_norm": 3.8734970092773438,
"learning_rate": 9.815284675789999e-06,
"loss": 0.1342,
"num_input_tokens_seen": 1106368,
"step": 1775
},
{
"epoch": 3.57429718875502,
"grad_norm": 3.125272035598755,
"learning_rate": 9.81291777865475e-06,
"loss": 0.1162,
"num_input_tokens_seen": 1109344,
"step": 1780
},
{
"epoch": 3.5843373493975905,
"grad_norm": 3.877983570098877,
"learning_rate": 9.810536102541941e-06,
"loss": 0.0825,
"num_input_tokens_seen": 1112480,
"step": 1785
},
{
"epoch": 3.5943775100401605,
"grad_norm": 4.222232818603516,
"learning_rate": 9.808139654764962e-06,
"loss": 0.2169,
"num_input_tokens_seen": 1115104,
"step": 1790
},
{
"epoch": 3.604417670682731,
"grad_norm": 4.677252292633057,
"learning_rate": 9.80572844268256e-06,
"loss": 0.21,
"num_input_tokens_seen": 1117376,
"step": 1795
},
{
"epoch": 3.6144578313253013,
"grad_norm": 2.8699440956115723,
"learning_rate": 9.80330247369882e-06,
"loss": 0.1643,
"num_input_tokens_seen": 1120576,
"step": 1800
},
{
"epoch": 3.6244979919678713,
"grad_norm": 3.1444013118743896,
"learning_rate": 9.800861755263141e-06,
"loss": 0.1449,
"num_input_tokens_seen": 1123712,
"step": 1805
},
{
"epoch": 3.6345381526104417,
"grad_norm": 0.9077942371368408,
"learning_rate": 9.79840629487021e-06,
"loss": 0.1334,
"num_input_tokens_seen": 1127232,
"step": 1810
},
{
"epoch": 3.644578313253012,
"grad_norm": 5.333505153656006,
"learning_rate": 9.795936100059986e-06,
"loss": 0.1875,
"num_input_tokens_seen": 1130016,
"step": 1815
},
{
"epoch": 3.6546184738955825,
"grad_norm": 2.171931028366089,
"learning_rate": 9.79345117841767e-06,
"loss": 0.1633,
"num_input_tokens_seen": 1133632,
"step": 1820
},
{
"epoch": 3.664658634538153,
"grad_norm": 3.1967790126800537,
"learning_rate": 9.790951537573686e-06,
"loss": 0.1679,
"num_input_tokens_seen": 1136512,
"step": 1825
},
{
"epoch": 3.674698795180723,
"grad_norm": 1.7990355491638184,
"learning_rate": 9.788437185203655e-06,
"loss": 0.152,
"num_input_tokens_seen": 1139424,
"step": 1830
},
{
"epoch": 3.6847389558232932,
"grad_norm": 1.3819535970687866,
"learning_rate": 9.785908129028374e-06,
"loss": 0.1428,
"num_input_tokens_seen": 1142976,
"step": 1835
},
{
"epoch": 3.694779116465863,
"grad_norm": 2.1355533599853516,
"learning_rate": 9.78336437681379e-06,
"loss": 0.1709,
"num_input_tokens_seen": 1146624,
"step": 1840
},
{
"epoch": 3.7048192771084336,
"grad_norm": 3.2626771926879883,
"learning_rate": 9.780805936370976e-06,
"loss": 0.1462,
"num_input_tokens_seen": 1149632,
"step": 1845
},
{
"epoch": 3.714859437751004,
"grad_norm": 2.5651373863220215,
"learning_rate": 9.77823281555611e-06,
"loss": 0.1669,
"num_input_tokens_seen": 1153760,
"step": 1850
},
{
"epoch": 3.7248995983935744,
"grad_norm": 2.5659170150756836,
"learning_rate": 9.775645022270448e-06,
"loss": 0.1465,
"num_input_tokens_seen": 1156992,
"step": 1855
},
{
"epoch": 3.734939759036145,
"grad_norm": 1.7440698146820068,
"learning_rate": 9.773042564460299e-06,
"loss": 0.1401,
"num_input_tokens_seen": 1160032,
"step": 1860
},
{
"epoch": 3.744979919678715,
"grad_norm": 2.497500419616699,
"learning_rate": 9.770425450117005e-06,
"loss": 0.1428,
"num_input_tokens_seen": 1164128,
"step": 1865
},
{
"epoch": 3.755020080321285,
"grad_norm": 2.873211622238159,
"learning_rate": 9.767793687276913e-06,
"loss": 0.123,
"num_input_tokens_seen": 1167264,
"step": 1870
},
{
"epoch": 3.765060240963855,
"grad_norm": 2.283451795578003,
"learning_rate": 9.76514728402135e-06,
"loss": 0.1539,
"num_input_tokens_seen": 1169920,
"step": 1875
},
{
"epoch": 3.7751004016064256,
"grad_norm": 1.920371174812317,
"learning_rate": 9.762486248476597e-06,
"loss": 0.1462,
"num_input_tokens_seen": 1172640,
"step": 1880
},
{
"epoch": 3.785140562248996,
"grad_norm": 9.184189796447754,
"learning_rate": 9.759810588813872e-06,
"loss": 0.1893,
"num_input_tokens_seen": 1174816,
"step": 1885
},
{
"epoch": 3.7951807228915664,
"grad_norm": 2.4213876724243164,
"learning_rate": 9.757120313249292e-06,
"loss": 0.1554,
"num_input_tokens_seen": 1177568,
"step": 1890
},
{
"epoch": 3.805220883534137,
"grad_norm": 2.758159875869751,
"learning_rate": 9.754415430043864e-06,
"loss": 0.1431,
"num_input_tokens_seen": 1181472,
"step": 1895
},
{
"epoch": 3.8152610441767068,
"grad_norm": 1.6902081966400146,
"learning_rate": 9.751695947503442e-06,
"loss": 0.1324,
"num_input_tokens_seen": 1184064,
"step": 1900
},
{
"epoch": 3.825301204819277,
"grad_norm": 1.6394085884094238,
"learning_rate": 9.748961873978713e-06,
"loss": 0.1494,
"num_input_tokens_seen": 1186976,
"step": 1905
},
{
"epoch": 3.835341365461847,
"grad_norm": 2.8501033782958984,
"learning_rate": 9.74621321786517e-06,
"loss": 0.1644,
"num_input_tokens_seen": 1190080,
"step": 1910
},
{
"epoch": 3.8453815261044175,
"grad_norm": 1.711416482925415,
"learning_rate": 9.743449987603082e-06,
"loss": 0.1484,
"num_input_tokens_seen": 1192800,
"step": 1915
},
{
"epoch": 3.855421686746988,
"grad_norm": 2.7892041206359863,
"learning_rate": 9.740672191677474e-06,
"loss": 0.1237,
"num_input_tokens_seen": 1195936,
"step": 1920
},
{
"epoch": 3.8654618473895583,
"grad_norm": 2.6959807872772217,
"learning_rate": 9.737879838618095e-06,
"loss": 0.1634,
"num_input_tokens_seen": 1199232,
"step": 1925
},
{
"epoch": 3.8755020080321287,
"grad_norm": 1.1288927793502808,
"learning_rate": 9.735072936999392e-06,
"loss": 0.1529,
"num_input_tokens_seen": 1202464,
"step": 1930
},
{
"epoch": 3.8855421686746987,
"grad_norm": 6.117106914520264,
"learning_rate": 9.732251495440495e-06,
"loss": 0.1659,
"num_input_tokens_seen": 1205632,
"step": 1935
},
{
"epoch": 3.895582329317269,
"grad_norm": 3.2092738151550293,
"learning_rate": 9.729415522605171e-06,
"loss": 0.1869,
"num_input_tokens_seen": 1208768,
"step": 1940
},
{
"epoch": 3.9056224899598395,
"grad_norm": 2.194758176803589,
"learning_rate": 9.726565027201813e-06,
"loss": 0.14,
"num_input_tokens_seen": 1211872,
"step": 1945
},
{
"epoch": 3.9156626506024095,
"grad_norm": 6.0824971199035645,
"learning_rate": 9.72370001798341e-06,
"loss": 0.1424,
"num_input_tokens_seen": 1215360,
"step": 1950
},
{
"epoch": 3.92570281124498,
"grad_norm": 3.891514539718628,
"learning_rate": 9.720820503747517e-06,
"loss": 0.1403,
"num_input_tokens_seen": 1218080,
"step": 1955
},
{
"epoch": 3.9357429718875503,
"grad_norm": 5.07526159286499,
"learning_rate": 9.717926493336227e-06,
"loss": 0.167,
"num_input_tokens_seen": 1221216,
"step": 1960
},
{
"epoch": 3.9457831325301207,
"grad_norm": 9.483345031738281,
"learning_rate": 9.715017995636151e-06,
"loss": 0.1711,
"num_input_tokens_seen": 1224096,
"step": 1965
},
{
"epoch": 3.9558232931726907,
"grad_norm": 14.616900444030762,
"learning_rate": 9.712095019578382e-06,
"loss": 0.1591,
"num_input_tokens_seen": 1227584,
"step": 1970
},
{
"epoch": 3.965863453815261,
"grad_norm": 2.052248239517212,
"learning_rate": 9.70915757413847e-06,
"loss": 0.1023,
"num_input_tokens_seen": 1230592,
"step": 1975
},
{
"epoch": 3.9759036144578315,
"grad_norm": 2.4284846782684326,
"learning_rate": 9.706205668336404e-06,
"loss": 0.1923,
"num_input_tokens_seen": 1234592,
"step": 1980
},
{
"epoch": 3.9859437751004014,
"grad_norm": 4.282687664031982,
"learning_rate": 9.703239311236567e-06,
"loss": 0.1158,
"num_input_tokens_seen": 1238464,
"step": 1985
},
{
"epoch": 3.995983935742972,
"grad_norm": 1.9346394538879395,
"learning_rate": 9.700258511947722e-06,
"loss": 0.1786,
"num_input_tokens_seen": 1241760,
"step": 1990
},
{
"epoch": 4.0,
"eval_loss": 0.16210927069187164,
"eval_runtime": 8.0638,
"eval_samples_per_second": 61.757,
"eval_steps_per_second": 15.501,
"num_input_tokens_seen": 1242912,
"step": 1992
},
{
"epoch": 4.006024096385542,
"grad_norm": 2.6092324256896973,
"learning_rate": 9.697263279622982e-06,
"loss": 0.1517,
"num_input_tokens_seen": 1245120,
"step": 1995
},
{
"epoch": 4.016064257028113,
"grad_norm": 4.089380264282227,
"learning_rate": 9.694253623459773e-06,
"loss": 0.1196,
"num_input_tokens_seen": 1247680,
"step": 2000
},
{
"epoch": 4.026104417670683,
"grad_norm": 1.3615751266479492,
"learning_rate": 9.691229552699817e-06,
"loss": 0.1008,
"num_input_tokens_seen": 1250944,
"step": 2005
},
{
"epoch": 4.036144578313253,
"grad_norm": 5.772199630737305,
"learning_rate": 9.688191076629096e-06,
"loss": 0.0652,
"num_input_tokens_seen": 1253888,
"step": 2010
},
{
"epoch": 4.046184738955823,
"grad_norm": 8.530108451843262,
"learning_rate": 9.685138204577829e-06,
"loss": 0.1276,
"num_input_tokens_seen": 1257312,
"step": 2015
},
{
"epoch": 4.056224899598393,
"grad_norm": 0.802332878112793,
"learning_rate": 9.682070945920437e-06,
"loss": 0.1594,
"num_input_tokens_seen": 1260320,
"step": 2020
},
{
"epoch": 4.066265060240964,
"grad_norm": 2.7650415897369385,
"learning_rate": 9.678989310075524e-06,
"loss": 0.1536,
"num_input_tokens_seen": 1263968,
"step": 2025
},
{
"epoch": 4.076305220883534,
"grad_norm": 11.720952033996582,
"learning_rate": 9.675893306505834e-06,
"loss": 0.2476,
"num_input_tokens_seen": 1266912,
"step": 2030
},
{
"epoch": 4.086345381526105,
"grad_norm": 1.9159353971481323,
"learning_rate": 9.672782944718234e-06,
"loss": 0.1311,
"num_input_tokens_seen": 1270016,
"step": 2035
},
{
"epoch": 4.096385542168675,
"grad_norm": 2.143418312072754,
"learning_rate": 9.669658234263682e-06,
"loss": 0.1124,
"num_input_tokens_seen": 1273984,
"step": 2040
},
{
"epoch": 4.106425702811245,
"grad_norm": 2.5297770500183105,
"learning_rate": 9.666519184737193e-06,
"loss": 0.1372,
"num_input_tokens_seen": 1276992,
"step": 2045
},
{
"epoch": 4.116465863453815,
"grad_norm": 4.700195789337158,
"learning_rate": 9.663365805777815e-06,
"loss": 0.1124,
"num_input_tokens_seen": 1279520,
"step": 2050
},
{
"epoch": 4.126506024096385,
"grad_norm": 7.075033664703369,
"learning_rate": 9.660198107068597e-06,
"loss": 0.1087,
"num_input_tokens_seen": 1282496,
"step": 2055
},
{
"epoch": 4.136546184738956,
"grad_norm": 3.068173408508301,
"learning_rate": 9.657016098336557e-06,
"loss": 0.1004,
"num_input_tokens_seen": 1284960,
"step": 2060
},
{
"epoch": 4.146586345381526,
"grad_norm": 2.9260811805725098,
"learning_rate": 9.65381978935266e-06,
"loss": 0.1622,
"num_input_tokens_seen": 1288544,
"step": 2065
},
{
"epoch": 4.156626506024097,
"grad_norm": 1.7771689891815186,
"learning_rate": 9.650609189931778e-06,
"loss": 0.1515,
"num_input_tokens_seen": 1291904,
"step": 2070
},
{
"epoch": 4.166666666666667,
"grad_norm": 3.8127546310424805,
"learning_rate": 9.647384309932665e-06,
"loss": 0.1402,
"num_input_tokens_seen": 1294880,
"step": 2075
},
{
"epoch": 4.176706827309237,
"grad_norm": 1.967512607574463,
"learning_rate": 9.644145159257928e-06,
"loss": 0.194,
"num_input_tokens_seen": 1298432,
"step": 2080
},
{
"epoch": 4.186746987951807,
"grad_norm": 2.751413106918335,
"learning_rate": 9.640891747853995e-06,
"loss": 0.1261,
"num_input_tokens_seen": 1301568,
"step": 2085
},
{
"epoch": 4.196787148594377,
"grad_norm": 4.411474704742432,
"learning_rate": 9.63762408571108e-06,
"loss": 0.1632,
"num_input_tokens_seen": 1304288,
"step": 2090
},
{
"epoch": 4.206827309236948,
"grad_norm": 1.2499157190322876,
"learning_rate": 9.634342182863163e-06,
"loss": 0.1517,
"num_input_tokens_seen": 1306784,
"step": 2095
},
{
"epoch": 4.216867469879518,
"grad_norm": 2.7209055423736572,
"learning_rate": 9.63104604938795e-06,
"loss": 0.1451,
"num_input_tokens_seen": 1309760,
"step": 2100
},
{
"epoch": 4.2269076305220885,
"grad_norm": 1.8991820812225342,
"learning_rate": 9.627735695406842e-06,
"loss": 0.1158,
"num_input_tokens_seen": 1312928,
"step": 2105
},
{
"epoch": 4.236947791164659,
"grad_norm": 8.971945762634277,
"learning_rate": 9.62441113108491e-06,
"loss": 0.1374,
"num_input_tokens_seen": 1316192,
"step": 2110
},
{
"epoch": 4.246987951807229,
"grad_norm": 2.6375224590301514,
"learning_rate": 9.621072366630859e-06,
"loss": 0.1737,
"num_input_tokens_seen": 1319072,
"step": 2115
},
{
"epoch": 4.257028112449799,
"grad_norm": 2.076023578643799,
"learning_rate": 9.617719412297002e-06,
"loss": 0.1038,
"num_input_tokens_seen": 1321760,
"step": 2120
},
{
"epoch": 4.267068273092369,
"grad_norm": 4.5401225090026855,
"learning_rate": 9.614352278379217e-06,
"loss": 0.1736,
"num_input_tokens_seen": 1325600,
"step": 2125
},
{
"epoch": 4.27710843373494,
"grad_norm": 3.4096763134002686,
"learning_rate": 9.610970975216933e-06,
"loss": 0.1458,
"num_input_tokens_seen": 1328992,
"step": 2130
},
{
"epoch": 4.28714859437751,
"grad_norm": 6.173547744750977,
"learning_rate": 9.60757551319308e-06,
"loss": 0.1654,
"num_input_tokens_seen": 1333152,
"step": 2135
},
{
"epoch": 4.2971887550200805,
"grad_norm": 3.055192232131958,
"learning_rate": 9.604165902734069e-06,
"loss": 0.1271,
"num_input_tokens_seen": 1335488,
"step": 2140
},
{
"epoch": 4.307228915662651,
"grad_norm": 2.3239598274230957,
"learning_rate": 9.600742154309756e-06,
"loss": 0.1365,
"num_input_tokens_seen": 1338720,
"step": 2145
},
{
"epoch": 4.317269076305221,
"grad_norm": 4.2289347648620605,
"learning_rate": 9.59730427843341e-06,
"loss": 0.1103,
"num_input_tokens_seen": 1342272,
"step": 2150
},
{
"epoch": 4.327309236947791,
"grad_norm": 2.9112138748168945,
"learning_rate": 9.593852285661684e-06,
"loss": 0.1479,
"num_input_tokens_seen": 1344704,
"step": 2155
},
{
"epoch": 4.337349397590361,
"grad_norm": 3.277313232421875,
"learning_rate": 9.590386186594574e-06,
"loss": 0.1618,
"num_input_tokens_seen": 1347392,
"step": 2160
},
{
"epoch": 4.347389558232932,
"grad_norm": 1.3917313814163208,
"learning_rate": 9.586905991875397e-06,
"loss": 0.1114,
"num_input_tokens_seen": 1350912,
"step": 2165
},
{
"epoch": 4.357429718875502,
"grad_norm": 1.485863208770752,
"learning_rate": 9.583411712190749e-06,
"loss": 0.1447,
"num_input_tokens_seen": 1353824,
"step": 2170
},
{
"epoch": 4.367469879518072,
"grad_norm": 3.005549907684326,
"learning_rate": 9.579903358270482e-06,
"loss": 0.1457,
"num_input_tokens_seen": 1356416,
"step": 2175
},
{
"epoch": 4.377510040160643,
"grad_norm": 1.8012841939926147,
"learning_rate": 9.576380940887661e-06,
"loss": 0.1266,
"num_input_tokens_seen": 1359712,
"step": 2180
},
{
"epoch": 4.387550200803213,
"grad_norm": 1.5120962858200073,
"learning_rate": 9.572844470858537e-06,
"loss": 0.1224,
"num_input_tokens_seen": 1362368,
"step": 2185
},
{
"epoch": 4.397590361445783,
"grad_norm": 6.252309322357178,
"learning_rate": 9.569293959042513e-06,
"loss": 0.1564,
"num_input_tokens_seen": 1365312,
"step": 2190
},
{
"epoch": 4.407630522088353,
"grad_norm": 4.203272819519043,
"learning_rate": 9.56572941634211e-06,
"loss": 0.2695,
"num_input_tokens_seen": 1368416,
"step": 2195
},
{
"epoch": 4.417670682730924,
"grad_norm": 2.676398515701294,
"learning_rate": 9.562150853702931e-06,
"loss": 0.1535,
"num_input_tokens_seen": 1371456,
"step": 2200
},
{
"epoch": 4.427710843373494,
"grad_norm": 1.36383056640625,
"learning_rate": 9.558558282113634e-06,
"loss": 0.1436,
"num_input_tokens_seen": 1375040,
"step": 2205
},
{
"epoch": 4.437751004016064,
"grad_norm": 3.8174405097961426,
"learning_rate": 9.554951712605891e-06,
"loss": 0.1319,
"num_input_tokens_seen": 1379104,
"step": 2210
},
{
"epoch": 4.447791164658635,
"grad_norm": 5.0451154708862305,
"learning_rate": 9.551331156254358e-06,
"loss": 0.1159,
"num_input_tokens_seen": 1382848,
"step": 2215
},
{
"epoch": 4.457831325301205,
"grad_norm": 2.2237813472747803,
"learning_rate": 9.547696624176642e-06,
"loss": 0.1147,
"num_input_tokens_seen": 1386432,
"step": 2220
},
{
"epoch": 4.467871485943775,
"grad_norm": 6.175833702087402,
"learning_rate": 9.544048127533262e-06,
"loss": 0.1554,
"num_input_tokens_seen": 1390048,
"step": 2225
},
{
"epoch": 4.477911646586345,
"grad_norm": 11.775164604187012,
"learning_rate": 9.540385677527617e-06,
"loss": 0.2124,
"num_input_tokens_seen": 1393344,
"step": 2230
},
{
"epoch": 4.4879518072289155,
"grad_norm": 1.2393088340759277,
"learning_rate": 9.53670928540596e-06,
"loss": 0.1014,
"num_input_tokens_seen": 1396096,
"step": 2235
},
{
"epoch": 4.497991967871486,
"grad_norm": 2.5581798553466797,
"learning_rate": 9.533018962457347e-06,
"loss": 0.0831,
"num_input_tokens_seen": 1399168,
"step": 2240
},
{
"epoch": 4.508032128514056,
"grad_norm": 4.483763217926025,
"learning_rate": 9.529314720013618e-06,
"loss": 0.1206,
"num_input_tokens_seen": 1402976,
"step": 2245
},
{
"epoch": 4.518072289156627,
"grad_norm": 1.5558322668075562,
"learning_rate": 9.52559656944935e-06,
"loss": 0.1574,
"num_input_tokens_seen": 1406496,
"step": 2250
},
{
"epoch": 4.528112449799197,
"grad_norm": 5.891838550567627,
"learning_rate": 9.521864522181834e-06,
"loss": 0.1341,
"num_input_tokens_seen": 1409344,
"step": 2255
},
{
"epoch": 4.538152610441767,
"grad_norm": 3.315089702606201,
"learning_rate": 9.518118589671025e-06,
"loss": 0.1485,
"num_input_tokens_seen": 1412544,
"step": 2260
},
{
"epoch": 4.548192771084337,
"grad_norm": 12.753541946411133,
"learning_rate": 9.514358783419518e-06,
"loss": 0.1299,
"num_input_tokens_seen": 1415680,
"step": 2265
},
{
"epoch": 4.5582329317269075,
"grad_norm": 3.243870735168457,
"learning_rate": 9.510585114972518e-06,
"loss": 0.2371,
"num_input_tokens_seen": 1419040,
"step": 2270
},
{
"epoch": 4.568273092369478,
"grad_norm": 2.4356415271759033,
"learning_rate": 9.506797595917787e-06,
"loss": 0.1112,
"num_input_tokens_seen": 1422048,
"step": 2275
},
{
"epoch": 4.578313253012048,
"grad_norm": 3.8205935955047607,
"learning_rate": 9.502996237885623e-06,
"loss": 0.1132,
"num_input_tokens_seen": 1425504,
"step": 2280
},
{
"epoch": 4.588353413654619,
"grad_norm": 4.907054424285889,
"learning_rate": 9.499181052548813e-06,
"loss": 0.1357,
"num_input_tokens_seen": 1428608,
"step": 2285
},
{
"epoch": 4.598393574297189,
"grad_norm": 4.737507343292236,
"learning_rate": 9.495352051622612e-06,
"loss": 0.1227,
"num_input_tokens_seen": 1430752,
"step": 2290
},
{
"epoch": 4.608433734939759,
"grad_norm": 2.2299745082855225,
"learning_rate": 9.491509246864691e-06,
"loss": 0.1814,
"num_input_tokens_seen": 1433600,
"step": 2295
},
{
"epoch": 4.618473895582329,
"grad_norm": 1.2637619972229004,
"learning_rate": 9.487652650075116e-06,
"loss": 0.1479,
"num_input_tokens_seen": 1436352,
"step": 2300
},
{
"epoch": 4.628514056224899,
"grad_norm": 2.430283546447754,
"learning_rate": 9.483782273096295e-06,
"loss": 0.1756,
"num_input_tokens_seen": 1439296,
"step": 2305
},
{
"epoch": 4.63855421686747,
"grad_norm": 6.702330589294434,
"learning_rate": 9.479898127812957e-06,
"loss": 0.1802,
"num_input_tokens_seen": 1443456,
"step": 2310
},
{
"epoch": 4.64859437751004,
"grad_norm": 3.452070474624634,
"learning_rate": 9.476000226152107e-06,
"loss": 0.1391,
"num_input_tokens_seen": 1446624,
"step": 2315
},
{
"epoch": 4.658634538152611,
"grad_norm": 2.66707706451416,
"learning_rate": 9.472088580082991e-06,
"loss": 0.1071,
"num_input_tokens_seen": 1450016,
"step": 2320
},
{
"epoch": 4.668674698795181,
"grad_norm": 1.9608304500579834,
"learning_rate": 9.468163201617063e-06,
"loss": 0.1438,
"num_input_tokens_seen": 1453856,
"step": 2325
},
{
"epoch": 4.678714859437751,
"grad_norm": 2.3657283782958984,
"learning_rate": 9.46422410280794e-06,
"loss": 0.0999,
"num_input_tokens_seen": 1456832,
"step": 2330
},
{
"epoch": 4.688755020080321,
"grad_norm": 8.516715049743652,
"learning_rate": 9.460271295751373e-06,
"loss": 0.2223,
"num_input_tokens_seen": 1459488,
"step": 2335
},
{
"epoch": 4.698795180722891,
"grad_norm": 0.987719714641571,
"learning_rate": 9.456304792585207e-06,
"loss": 0.1433,
"num_input_tokens_seen": 1462400,
"step": 2340
},
{
"epoch": 4.708835341365462,
"grad_norm": 2.08788800239563,
"learning_rate": 9.452324605489344e-06,
"loss": 0.1415,
"num_input_tokens_seen": 1465248,
"step": 2345
},
{
"epoch": 4.718875502008032,
"grad_norm": 6.095128059387207,
"learning_rate": 9.448330746685704e-06,
"loss": 0.1313,
"num_input_tokens_seen": 1468128,
"step": 2350
},
{
"epoch": 4.728915662650603,
"grad_norm": 35.532806396484375,
"learning_rate": 9.444323228438186e-06,
"loss": 0.2186,
"num_input_tokens_seen": 1471040,
"step": 2355
},
{
"epoch": 4.738955823293173,
"grad_norm": 5.288688659667969,
"learning_rate": 9.440302063052638e-06,
"loss": 0.1963,
"num_input_tokens_seen": 1473568,
"step": 2360
},
{
"epoch": 4.7489959839357425,
"grad_norm": 4.212613582611084,
"learning_rate": 9.436267262876808e-06,
"loss": 0.1266,
"num_input_tokens_seen": 1477184,
"step": 2365
},
{
"epoch": 4.759036144578313,
"grad_norm": 2.8863844871520996,
"learning_rate": 9.43221884030032e-06,
"loss": 0.1391,
"num_input_tokens_seen": 1480512,
"step": 2370
},
{
"epoch": 4.769076305220883,
"grad_norm": 4.399484157562256,
"learning_rate": 9.428156807754622e-06,
"loss": 0.1569,
"num_input_tokens_seen": 1483776,
"step": 2375
},
{
"epoch": 4.779116465863454,
"grad_norm": 2.0701286792755127,
"learning_rate": 9.424081177712955e-06,
"loss": 0.1241,
"num_input_tokens_seen": 1486464,
"step": 2380
},
{
"epoch": 4.789156626506024,
"grad_norm": 2.4887611865997314,
"learning_rate": 9.419991962690317e-06,
"loss": 0.1112,
"num_input_tokens_seen": 1489056,
"step": 2385
},
{
"epoch": 4.7991967871485945,
"grad_norm": 3.219498872756958,
"learning_rate": 9.415889175243416e-06,
"loss": 0.1215,
"num_input_tokens_seen": 1491808,
"step": 2390
},
{
"epoch": 4.809236947791165,
"grad_norm": 5.092496871948242,
"learning_rate": 9.411772827970642e-06,
"loss": 0.1055,
"num_input_tokens_seen": 1495008,
"step": 2395
},
{
"epoch": 4.8192771084337345,
"grad_norm": 4.519360542297363,
"learning_rate": 9.40764293351202e-06,
"loss": 0.1365,
"num_input_tokens_seen": 1497760,
"step": 2400
},
{
"epoch": 4.829317269076305,
"grad_norm": 1.1300933361053467,
"learning_rate": 9.403499504549174e-06,
"loss": 0.175,
"num_input_tokens_seen": 1500544,
"step": 2405
},
{
"epoch": 4.839357429718875,
"grad_norm": 2.9588253498077393,
"learning_rate": 9.399342553805289e-06,
"loss": 0.1112,
"num_input_tokens_seen": 1503232,
"step": 2410
},
{
"epoch": 4.849397590361446,
"grad_norm": 3.217325448989868,
"learning_rate": 9.395172094045073e-06,
"loss": 0.1045,
"num_input_tokens_seen": 1506432,
"step": 2415
},
{
"epoch": 4.859437751004016,
"grad_norm": 5.2091569900512695,
"learning_rate": 9.390988138074713e-06,
"loss": 0.1387,
"num_input_tokens_seen": 1510336,
"step": 2420
},
{
"epoch": 4.8694779116465865,
"grad_norm": 8.337028503417969,
"learning_rate": 9.38679069874184e-06,
"loss": 0.2045,
"num_input_tokens_seen": 1512928,
"step": 2425
},
{
"epoch": 4.879518072289157,
"grad_norm": 4.4194111824035645,
"learning_rate": 9.382579788935487e-06,
"loss": 0.1364,
"num_input_tokens_seen": 1515968,
"step": 2430
},
{
"epoch": 4.889558232931726,
"grad_norm": 2.803941011428833,
"learning_rate": 9.378355421586053e-06,
"loss": 0.1537,
"num_input_tokens_seen": 1519168,
"step": 2435
},
{
"epoch": 4.899598393574297,
"grad_norm": 5.292468547821045,
"learning_rate": 9.374117609665263e-06,
"loss": 0.1241,
"num_input_tokens_seen": 1522432,
"step": 2440
},
{
"epoch": 4.909638554216867,
"grad_norm": 3.3019838333129883,
"learning_rate": 9.369866366186116e-06,
"loss": 0.1524,
"num_input_tokens_seen": 1525696,
"step": 2445
},
{
"epoch": 4.919678714859438,
"grad_norm": 7.390207767486572,
"learning_rate": 9.365601704202869e-06,
"loss": 0.1753,
"num_input_tokens_seen": 1528736,
"step": 2450
},
{
"epoch": 4.929718875502008,
"grad_norm": 2.4274277687072754,
"learning_rate": 9.36132363681097e-06,
"loss": 0.1714,
"num_input_tokens_seen": 1531648,
"step": 2455
},
{
"epoch": 4.9397590361445785,
"grad_norm": 1.9142770767211914,
"learning_rate": 9.35703217714704e-06,
"loss": 0.1605,
"num_input_tokens_seen": 1534720,
"step": 2460
},
{
"epoch": 4.949799196787149,
"grad_norm": 3.327009677886963,
"learning_rate": 9.35272733838882e-06,
"loss": 0.195,
"num_input_tokens_seen": 1536928,
"step": 2465
},
{
"epoch": 4.959839357429718,
"grad_norm": 2.2915942668914795,
"learning_rate": 9.348409133755137e-06,
"loss": 0.1326,
"num_input_tokens_seen": 1539648,
"step": 2470
},
{
"epoch": 4.969879518072289,
"grad_norm": 4.97459602355957,
"learning_rate": 9.344077576505853e-06,
"loss": 0.1515,
"num_input_tokens_seen": 1543552,
"step": 2475
},
{
"epoch": 4.979919678714859,
"grad_norm": 2.44541335105896,
"learning_rate": 9.339732679941842e-06,
"loss": 0.1143,
"num_input_tokens_seen": 1546912,
"step": 2480
},
{
"epoch": 4.98995983935743,
"grad_norm": 2.091432809829712,
"learning_rate": 9.335374457404928e-06,
"loss": 0.1388,
"num_input_tokens_seen": 1550688,
"step": 2485
},
{
"epoch": 5.0,
"grad_norm": 2.9539296627044678,
"learning_rate": 9.331002922277865e-06,
"loss": 0.1338,
"num_input_tokens_seen": 1553472,
"step": 2490
},
{
"epoch": 5.01004016064257,
"grad_norm": 1.839915156364441,
"learning_rate": 9.326618087984278e-06,
"loss": 0.0774,
"num_input_tokens_seen": 1557056,
"step": 2495
},
{
"epoch": 5.020080321285141,
"grad_norm": 3.636218786239624,
"learning_rate": 9.322219967988638e-06,
"loss": 0.1358,
"num_input_tokens_seen": 1559968,
"step": 2500
},
{
"epoch": 5.030120481927711,
"grad_norm": 11.675433158874512,
"learning_rate": 9.317808575796202e-06,
"loss": 0.1477,
"num_input_tokens_seen": 1563040,
"step": 2505
},
{
"epoch": 5.040160642570281,
"grad_norm": 2.875239372253418,
"learning_rate": 9.313383924952988e-06,
"loss": 0.1027,
"num_input_tokens_seen": 1565760,
"step": 2510
},
{
"epoch": 5.050200803212851,
"grad_norm": 3.811685085296631,
"learning_rate": 9.308946029045726e-06,
"loss": 0.0806,
"num_input_tokens_seen": 1568928,
"step": 2515
},
{
"epoch": 5.0602409638554215,
"grad_norm": 1.25583815574646,
"learning_rate": 9.304494901701821e-06,
"loss": 0.1202,
"num_input_tokens_seen": 1571808,
"step": 2520
},
{
"epoch": 5.070281124497992,
"grad_norm": 3.814985513687134,
"learning_rate": 9.300030556589303e-06,
"loss": 0.1632,
"num_input_tokens_seen": 1575200,
"step": 2525
},
{
"epoch": 5.080321285140562,
"grad_norm": 4.336440086364746,
"learning_rate": 9.29555300741679e-06,
"loss": 0.1406,
"num_input_tokens_seen": 1578880,
"step": 2530
},
{
"epoch": 5.090361445783133,
"grad_norm": 3.6324188709259033,
"learning_rate": 9.291062267933446e-06,
"loss": 0.1248,
"num_input_tokens_seen": 1581120,
"step": 2535
},
{
"epoch": 5.100401606425703,
"grad_norm": 7.417630195617676,
"learning_rate": 9.28655835192894e-06,
"loss": 0.1795,
"num_input_tokens_seen": 1584064,
"step": 2540
},
{
"epoch": 5.110441767068273,
"grad_norm": 5.5329155921936035,
"learning_rate": 9.282041273233402e-06,
"loss": 0.1542,
"num_input_tokens_seen": 1587744,
"step": 2545
},
{
"epoch": 5.120481927710843,
"grad_norm": 5.481621265411377,
"learning_rate": 9.277511045717377e-06,
"loss": 0.1454,
"num_input_tokens_seen": 1590624,
"step": 2550
},
{
"epoch": 5.1305220883534135,
"grad_norm": 5.395864009857178,
"learning_rate": 9.27296768329179e-06,
"loss": 0.1246,
"num_input_tokens_seen": 1594016,
"step": 2555
},
{
"epoch": 5.140562248995984,
"grad_norm": 2.902698278427124,
"learning_rate": 9.268411199907898e-06,
"loss": 0.1345,
"num_input_tokens_seen": 1596640,
"step": 2560
},
{
"epoch": 5.150602409638554,
"grad_norm": 14.940276145935059,
"learning_rate": 9.263841609557247e-06,
"loss": 0.1773,
"num_input_tokens_seen": 1599840,
"step": 2565
},
{
"epoch": 5.160642570281125,
"grad_norm": 3.3749656677246094,
"learning_rate": 9.259258926271632e-06,
"loss": 0.1523,
"num_input_tokens_seen": 1602656,
"step": 2570
},
{
"epoch": 5.170682730923695,
"grad_norm": 9.682775497436523,
"learning_rate": 9.254663164123052e-06,
"loss": 0.119,
"num_input_tokens_seen": 1606176,
"step": 2575
},
{
"epoch": 5.180722891566265,
"grad_norm": 7.369882583618164,
"learning_rate": 9.250054337223666e-06,
"loss": 0.1502,
"num_input_tokens_seen": 1608768,
"step": 2580
},
{
"epoch": 5.190763052208835,
"grad_norm": 2.4143364429473877,
"learning_rate": 9.245432459725754e-06,
"loss": 0.1165,
"num_input_tokens_seen": 1611168,
"step": 2585
},
{
"epoch": 5.2008032128514055,
"grad_norm": 4.51724100112915,
"learning_rate": 9.240797545821666e-06,
"loss": 0.1484,
"num_input_tokens_seen": 1614720,
"step": 2590
},
{
"epoch": 5.210843373493976,
"grad_norm": 3.192105293273926,
"learning_rate": 9.236149609743786e-06,
"loss": 0.1225,
"num_input_tokens_seen": 1617504,
"step": 2595
},
{
"epoch": 5.220883534136546,
"grad_norm": 1.9884601831436157,
"learning_rate": 9.231488665764485e-06,
"loss": 0.0974,
"num_input_tokens_seen": 1620672,
"step": 2600
},
{
"epoch": 5.230923694779117,
"grad_norm": 3.4887187480926514,
"learning_rate": 9.226814728196072e-06,
"loss": 0.1373,
"num_input_tokens_seen": 1623488,
"step": 2605
},
{
"epoch": 5.240963855421687,
"grad_norm": 4.321260452270508,
"learning_rate": 9.222127811390765e-06,
"loss": 0.1718,
"num_input_tokens_seen": 1626080,
"step": 2610
},
{
"epoch": 5.2510040160642575,
"grad_norm": 1.5512542724609375,
"learning_rate": 9.217427929740625e-06,
"loss": 0.0963,
"num_input_tokens_seen": 1629536,
"step": 2615
},
{
"epoch": 5.261044176706827,
"grad_norm": 1.7714067697525024,
"learning_rate": 9.212715097677537e-06,
"loss": 0.1076,
"num_input_tokens_seen": 1632768,
"step": 2620
},
{
"epoch": 5.271084337349397,
"grad_norm": 5.0114264488220215,
"learning_rate": 9.207989329673143e-06,
"loss": 0.1039,
"num_input_tokens_seen": 1636256,
"step": 2625
},
{
"epoch": 5.281124497991968,
"grad_norm": 2.490394353866577,
"learning_rate": 9.203250640238813e-06,
"loss": 0.1435,
"num_input_tokens_seen": 1639264,
"step": 2630
},
{
"epoch": 5.291164658634538,
"grad_norm": 1.5815428495407104,
"learning_rate": 9.198499043925591e-06,
"loss": 0.1152,
"num_input_tokens_seen": 1642432,
"step": 2635
},
{
"epoch": 5.301204819277109,
"grad_norm": 2.301948308944702,
"learning_rate": 9.193734555324154e-06,
"loss": 0.1229,
"num_input_tokens_seen": 1645600,
"step": 2640
},
{
"epoch": 5.311244979919679,
"grad_norm": 7.6477179527282715,
"learning_rate": 9.18895718906477e-06,
"loss": 0.1259,
"num_input_tokens_seen": 1648192,
"step": 2645
},
{
"epoch": 5.321285140562249,
"grad_norm": 11.513751029968262,
"learning_rate": 9.184166959817247e-06,
"loss": 0.1592,
"num_input_tokens_seen": 1651776,
"step": 2650
},
{
"epoch": 5.331325301204819,
"grad_norm": 7.205603122711182,
"learning_rate": 9.179363882290896e-06,
"loss": 0.0776,
"num_input_tokens_seen": 1654944,
"step": 2655
},
{
"epoch": 5.341365461847389,
"grad_norm": 3.9215004444122314,
"learning_rate": 9.17454797123448e-06,
"loss": 0.1496,
"num_input_tokens_seen": 1657344,
"step": 2660
},
{
"epoch": 5.35140562248996,
"grad_norm": 2.7696170806884766,
"learning_rate": 9.169719241436162e-06,
"loss": 0.0683,
"num_input_tokens_seen": 1659680,
"step": 2665
},
{
"epoch": 5.36144578313253,
"grad_norm": 7.695875644683838,
"learning_rate": 9.164877707723476e-06,
"loss": 0.2095,
"num_input_tokens_seen": 1662560,
"step": 2670
},
{
"epoch": 5.371485943775101,
"grad_norm": 7.6388020515441895,
"learning_rate": 9.160023384963271e-06,
"loss": 0.114,
"num_input_tokens_seen": 1665728,
"step": 2675
},
{
"epoch": 5.381526104417671,
"grad_norm": 11.195474624633789,
"learning_rate": 9.155156288061666e-06,
"loss": 0.1034,
"num_input_tokens_seen": 1669216,
"step": 2680
},
{
"epoch": 5.391566265060241,
"grad_norm": 7.711627006530762,
"learning_rate": 9.150276431964007e-06,
"loss": 0.154,
"num_input_tokens_seen": 1672768,
"step": 2685
},
{
"epoch": 5.401606425702811,
"grad_norm": 5.470666885375977,
"learning_rate": 9.145383831654814e-06,
"loss": 0.1459,
"num_input_tokens_seen": 1675520,
"step": 2690
},
{
"epoch": 5.411646586345381,
"grad_norm": 3.5182509422302246,
"learning_rate": 9.14047850215775e-06,
"loss": 0.0966,
"num_input_tokens_seen": 1678784,
"step": 2695
},
{
"epoch": 5.421686746987952,
"grad_norm": 2.275167942047119,
"learning_rate": 9.13556045853556e-06,
"loss": 0.1964,
"num_input_tokens_seen": 1681376,
"step": 2700
},
{
"epoch": 5.431726907630522,
"grad_norm": 1.8862415552139282,
"learning_rate": 9.130629715890027e-06,
"loss": 0.1234,
"num_input_tokens_seen": 1684864,
"step": 2705
},
{
"epoch": 5.4417670682730925,
"grad_norm": 2.1846301555633545,
"learning_rate": 9.125686289361935e-06,
"loss": 0.1196,
"num_input_tokens_seen": 1688896,
"step": 2710
},
{
"epoch": 5.451807228915663,
"grad_norm": 2.483438491821289,
"learning_rate": 9.120730194131011e-06,
"loss": 0.0986,
"num_input_tokens_seen": 1692288,
"step": 2715
},
{
"epoch": 5.461847389558233,
"grad_norm": 4.106010913848877,
"learning_rate": 9.115761445415887e-06,
"loss": 0.1174,
"num_input_tokens_seen": 1695200,
"step": 2720
},
{
"epoch": 5.471887550200803,
"grad_norm": 0.8280296325683594,
"learning_rate": 9.110780058474052e-06,
"loss": 0.1247,
"num_input_tokens_seen": 1698720,
"step": 2725
},
{
"epoch": 5.481927710843373,
"grad_norm": 1.9926586151123047,
"learning_rate": 9.105786048601795e-06,
"loss": 0.1565,
"num_input_tokens_seen": 1701536,
"step": 2730
},
{
"epoch": 5.491967871485944,
"grad_norm": 2.2215921878814697,
"learning_rate": 9.100779431134175e-06,
"loss": 0.1237,
"num_input_tokens_seen": 1704864,
"step": 2735
},
{
"epoch": 5.502008032128514,
"grad_norm": 4.604000091552734,
"learning_rate": 9.09576022144496e-06,
"loss": 0.1348,
"num_input_tokens_seen": 1708000,
"step": 2740
},
{
"epoch": 5.5120481927710845,
"grad_norm": 1.5299623012542725,
"learning_rate": 9.090728434946584e-06,
"loss": 0.1286,
"num_input_tokens_seen": 1711296,
"step": 2745
},
{
"epoch": 5.522088353413655,
"grad_norm": 6.929166316986084,
"learning_rate": 9.085684087090108e-06,
"loss": 0.1311,
"num_input_tokens_seen": 1714880,
"step": 2750
},
{
"epoch": 5.532128514056225,
"grad_norm": 17.860334396362305,
"learning_rate": 9.080627193365155e-06,
"loss": 0.1346,
"num_input_tokens_seen": 1717728,
"step": 2755
},
{
"epoch": 5.542168674698795,
"grad_norm": 1.9477763175964355,
"learning_rate": 9.075557769299877e-06,
"loss": 0.1556,
"num_input_tokens_seen": 1721280,
"step": 2760
},
{
"epoch": 5.552208835341365,
"grad_norm": 1.6732323169708252,
"learning_rate": 9.070475830460906e-06,
"loss": 0.1214,
"num_input_tokens_seen": 1723968,
"step": 2765
},
{
"epoch": 5.562248995983936,
"grad_norm": 10.01714038848877,
"learning_rate": 9.065381392453296e-06,
"loss": 0.1406,
"num_input_tokens_seen": 1727424,
"step": 2770
},
{
"epoch": 5.572289156626506,
"grad_norm": 3.083617687225342,
"learning_rate": 9.060274470920487e-06,
"loss": 0.1231,
"num_input_tokens_seen": 1730528,
"step": 2775
},
{
"epoch": 5.582329317269076,
"grad_norm": 2.5101115703582764,
"learning_rate": 9.055155081544253e-06,
"loss": 0.1405,
"num_input_tokens_seen": 1734208,
"step": 2780
},
{
"epoch": 5.592369477911647,
"grad_norm": 4.0800580978393555,
"learning_rate": 9.050023240044649e-06,
"loss": 0.1144,
"num_input_tokens_seen": 1737728,
"step": 2785
},
{
"epoch": 5.602409638554217,
"grad_norm": 8.37118911743164,
"learning_rate": 9.044878962179968e-06,
"loss": 0.1405,
"num_input_tokens_seen": 1740800,
"step": 2790
},
{
"epoch": 5.612449799196787,
"grad_norm": 3.184250593185425,
"learning_rate": 9.039722263746693e-06,
"loss": 0.1596,
"num_input_tokens_seen": 1744096,
"step": 2795
},
{
"epoch": 5.622489959839357,
"grad_norm": 2.413656711578369,
"learning_rate": 9.034553160579444e-06,
"loss": 0.0979,
"num_input_tokens_seen": 1746720,
"step": 2800
},
{
"epoch": 5.632530120481928,
"grad_norm": 13.534455299377441,
"learning_rate": 9.029371668550933e-06,
"loss": 0.1587,
"num_input_tokens_seen": 1750304,
"step": 2805
},
{
"epoch": 5.642570281124498,
"grad_norm": 1.7071505784988403,
"learning_rate": 9.024177803571917e-06,
"loss": 0.13,
"num_input_tokens_seen": 1753600,
"step": 2810
},
{
"epoch": 5.652610441767068,
"grad_norm": 4.593850135803223,
"learning_rate": 9.018971581591141e-06,
"loss": 0.1681,
"num_input_tokens_seen": 1756096,
"step": 2815
},
{
"epoch": 5.662650602409639,
"grad_norm": 6.782101154327393,
"learning_rate": 9.013753018595302e-06,
"loss": 0.1039,
"num_input_tokens_seen": 1759072,
"step": 2820
},
{
"epoch": 5.672690763052209,
"grad_norm": 2.9062986373901367,
"learning_rate": 9.008522130608984e-06,
"loss": 0.0958,
"num_input_tokens_seen": 1762720,
"step": 2825
},
{
"epoch": 5.682730923694779,
"grad_norm": 2.183579444885254,
"learning_rate": 9.003278933694625e-06,
"loss": 0.1527,
"num_input_tokens_seen": 1765472,
"step": 2830
},
{
"epoch": 5.692771084337349,
"grad_norm": 5.613504409790039,
"learning_rate": 8.998023443952453e-06,
"loss": 0.0948,
"num_input_tokens_seen": 1769472,
"step": 2835
},
{
"epoch": 5.7028112449799195,
"grad_norm": 8.625523567199707,
"learning_rate": 8.992755677520448e-06,
"loss": 0.1371,
"num_input_tokens_seen": 1772640,
"step": 2840
},
{
"epoch": 5.71285140562249,
"grad_norm": 3.748905897140503,
"learning_rate": 8.987475650574289e-06,
"loss": 0.1788,
"num_input_tokens_seen": 1775744,
"step": 2845
},
{
"epoch": 5.72289156626506,
"grad_norm": 4.181033611297607,
"learning_rate": 8.982183379327299e-06,
"loss": 0.1061,
"num_input_tokens_seen": 1778944,
"step": 2850
},
{
"epoch": 5.732931726907631,
"grad_norm": 4.234264373779297,
"learning_rate": 8.9768788800304e-06,
"loss": 0.1245,
"num_input_tokens_seen": 1782400,
"step": 2855
},
{
"epoch": 5.742971887550201,
"grad_norm": 2.122401714324951,
"learning_rate": 8.971562168972065e-06,
"loss": 0.144,
"num_input_tokens_seen": 1784416,
"step": 2860
},
{
"epoch": 5.753012048192771,
"grad_norm": 6.364081859588623,
"learning_rate": 8.966233262478266e-06,
"loss": 0.1747,
"num_input_tokens_seen": 1787392,
"step": 2865
},
{
"epoch": 5.763052208835341,
"grad_norm": 4.270403861999512,
"learning_rate": 8.960892176912418e-06,
"loss": 0.1084,
"num_input_tokens_seen": 1790976,
"step": 2870
},
{
"epoch": 5.7730923694779115,
"grad_norm": 7.241430759429932,
"learning_rate": 8.955538928675343e-06,
"loss": 0.1494,
"num_input_tokens_seen": 1793952,
"step": 2875
},
{
"epoch": 5.783132530120482,
"grad_norm": 2.145287036895752,
"learning_rate": 8.950173534205202e-06,
"loss": 0.1379,
"num_input_tokens_seen": 1797568,
"step": 2880
},
{
"epoch": 5.793172690763052,
"grad_norm": 3.997114896774292,
"learning_rate": 8.944796009977459e-06,
"loss": 0.1645,
"num_input_tokens_seen": 1800128,
"step": 2885
},
{
"epoch": 5.803212851405623,
"grad_norm": 4.003738880157471,
"learning_rate": 8.939406372504823e-06,
"loss": 0.1543,
"num_input_tokens_seen": 1803712,
"step": 2890
},
{
"epoch": 5.813253012048193,
"grad_norm": 4.363209247589111,
"learning_rate": 8.934004638337197e-06,
"loss": 0.0882,
"num_input_tokens_seen": 1806784,
"step": 2895
},
{
"epoch": 5.823293172690763,
"grad_norm": 2.9139928817749023,
"learning_rate": 8.928590824061633e-06,
"loss": 0.0888,
"num_input_tokens_seen": 1809312,
"step": 2900
},
{
"epoch": 5.833333333333333,
"grad_norm": 5.311194896697998,
"learning_rate": 8.923164946302274e-06,
"loss": 0.1286,
"num_input_tokens_seen": 1812192,
"step": 2905
},
{
"epoch": 5.843373493975903,
"grad_norm": 2.110668420791626,
"learning_rate": 8.917727021720308e-06,
"loss": 0.1309,
"num_input_tokens_seen": 1815168,
"step": 2910
},
{
"epoch": 5.853413654618474,
"grad_norm": 8.800142288208008,
"learning_rate": 8.912277067013914e-06,
"loss": 0.123,
"num_input_tokens_seen": 1818176,
"step": 2915
},
{
"epoch": 5.863453815261044,
"grad_norm": 2.967930555343628,
"learning_rate": 8.906815098918214e-06,
"loss": 0.1411,
"num_input_tokens_seen": 1821120,
"step": 2920
},
{
"epoch": 5.873493975903615,
"grad_norm": 2.1932666301727295,
"learning_rate": 8.901341134205214e-06,
"loss": 0.1104,
"num_input_tokens_seen": 1823840,
"step": 2925
},
{
"epoch": 5.883534136546185,
"grad_norm": 12.93001937866211,
"learning_rate": 8.895855189683768e-06,
"loss": 0.1315,
"num_input_tokens_seen": 1827168,
"step": 2930
},
{
"epoch": 5.893574297188755,
"grad_norm": 3.48124361038208,
"learning_rate": 8.890357282199504e-06,
"loss": 0.1622,
"num_input_tokens_seen": 1829504,
"step": 2935
},
{
"epoch": 5.903614457831325,
"grad_norm": 8.852481842041016,
"learning_rate": 8.884847428634792e-06,
"loss": 0.156,
"num_input_tokens_seen": 1832640,
"step": 2940
},
{
"epoch": 5.913654618473895,
"grad_norm": 4.551000595092773,
"learning_rate": 8.879325645908686e-06,
"loss": 0.1578,
"num_input_tokens_seen": 1836448,
"step": 2945
},
{
"epoch": 5.923694779116466,
"grad_norm": 5.305069446563721,
"learning_rate": 8.873791950976865e-06,
"loss": 0.1497,
"num_input_tokens_seen": 1839104,
"step": 2950
},
{
"epoch": 5.933734939759036,
"grad_norm": 9.336993217468262,
"learning_rate": 8.868246360831589e-06,
"loss": 0.1957,
"num_input_tokens_seen": 1841952,
"step": 2955
},
{
"epoch": 5.943775100401607,
"grad_norm": 2.6634745597839355,
"learning_rate": 8.862688892501648e-06,
"loss": 0.1177,
"num_input_tokens_seen": 1846272,
"step": 2960
},
{
"epoch": 5.953815261044177,
"grad_norm": 2.5295801162719727,
"learning_rate": 8.857119563052301e-06,
"loss": 0.1179,
"num_input_tokens_seen": 1849888,
"step": 2965
},
{
"epoch": 5.9638554216867465,
"grad_norm": 1.7765917778015137,
"learning_rate": 8.851538389585234e-06,
"loss": 0.1344,
"num_input_tokens_seen": 1853152,
"step": 2970
},
{
"epoch": 5.973895582329317,
"grad_norm": 1.9854011535644531,
"learning_rate": 8.845945389238496e-06,
"loss": 0.0961,
"num_input_tokens_seen": 1856128,
"step": 2975
},
{
"epoch": 5.983935742971887,
"grad_norm": 10.58693790435791,
"learning_rate": 8.840340579186457e-06,
"loss": 0.1184,
"num_input_tokens_seen": 1858496,
"step": 2980
},
{
"epoch": 5.993975903614458,
"grad_norm": 0.9854339957237244,
"learning_rate": 8.834723976639752e-06,
"loss": 0.1139,
"num_input_tokens_seen": 1860928,
"step": 2985
},
{
"epoch": 6.0,
"eval_loss": 0.19226641952991486,
"eval_runtime": 8.0748,
"eval_samples_per_second": 61.673,
"eval_steps_per_second": 15.48,
"num_input_tokens_seen": 1862848,
"step": 2988
},
{
"epoch": 6.004016064257028,
"grad_norm": 2.078145742416382,
"learning_rate": 8.829095598845224e-06,
"loss": 0.1233,
"num_input_tokens_seen": 1864064,
"step": 2990
},
{
"epoch": 6.014056224899599,
"grad_norm": 12.33251667022705,
"learning_rate": 8.823455463085873e-06,
"loss": 0.0896,
"num_input_tokens_seen": 1867360,
"step": 2995
},
{
"epoch": 6.024096385542169,
"grad_norm": 4.962930202484131,
"learning_rate": 8.81780358668081e-06,
"loss": 0.114,
"num_input_tokens_seen": 1870112,
"step": 3000
},
{
"epoch": 6.034136546184739,
"grad_norm": 18.131410598754883,
"learning_rate": 8.812139986985194e-06,
"loss": 0.0927,
"num_input_tokens_seen": 1873632,
"step": 3005
},
{
"epoch": 6.044176706827309,
"grad_norm": 2.0991618633270264,
"learning_rate": 8.806464681390182e-06,
"loss": 0.1233,
"num_input_tokens_seen": 1876480,
"step": 3010
},
{
"epoch": 6.054216867469879,
"grad_norm": 5.510014057159424,
"learning_rate": 8.800777687322875e-06,
"loss": 0.1091,
"num_input_tokens_seen": 1880032,
"step": 3015
},
{
"epoch": 6.06425702811245,
"grad_norm": 0.9164973497390747,
"learning_rate": 8.795079022246269e-06,
"loss": 0.1362,
"num_input_tokens_seen": 1882400,
"step": 3020
},
{
"epoch": 6.07429718875502,
"grad_norm": 2.5013444423675537,
"learning_rate": 8.789368703659199e-06,
"loss": 0.1909,
"num_input_tokens_seen": 1885632,
"step": 3025
},
{
"epoch": 6.0843373493975905,
"grad_norm": 7.10783052444458,
"learning_rate": 8.78364674909628e-06,
"loss": 0.1584,
"num_input_tokens_seen": 1889088,
"step": 3030
},
{
"epoch": 6.094377510040161,
"grad_norm": 12.30833911895752,
"learning_rate": 8.777913176127859e-06,
"loss": 0.1345,
"num_input_tokens_seen": 1893056,
"step": 3035
},
{
"epoch": 6.104417670682731,
"grad_norm": 3.545053720474243,
"learning_rate": 8.772168002359962e-06,
"loss": 0.1093,
"num_input_tokens_seen": 1896896,
"step": 3040
},
{
"epoch": 6.114457831325301,
"grad_norm": 1.529579758644104,
"learning_rate": 8.766411245434234e-06,
"loss": 0.081,
"num_input_tokens_seen": 1899968,
"step": 3045
},
{
"epoch": 6.124497991967871,
"grad_norm": 13.889845848083496,
"learning_rate": 8.760642923027888e-06,
"loss": 0.0967,
"num_input_tokens_seen": 1902944,
"step": 3050
},
{
"epoch": 6.134538152610442,
"grad_norm": 5.338316917419434,
"learning_rate": 8.754863052853658e-06,
"loss": 0.1337,
"num_input_tokens_seen": 1906368,
"step": 3055
},
{
"epoch": 6.144578313253012,
"grad_norm": 7.715173721313477,
"learning_rate": 8.74907165265973e-06,
"loss": 0.1273,
"num_input_tokens_seen": 1908832,
"step": 3060
},
{
"epoch": 6.1546184738955825,
"grad_norm": 7.223991870880127,
"learning_rate": 8.743268740229693e-06,
"loss": 0.1598,
"num_input_tokens_seen": 1911360,
"step": 3065
},
{
"epoch": 6.164658634538153,
"grad_norm": 6.749616622924805,
"learning_rate": 8.7374543333825e-06,
"loss": 0.1322,
"num_input_tokens_seen": 1914816,
"step": 3070
},
{
"epoch": 6.174698795180723,
"grad_norm": 5.285307884216309,
"learning_rate": 8.731628449972382e-06,
"loss": 0.0712,
"num_input_tokens_seen": 1917728,
"step": 3075
},
{
"epoch": 6.184738955823293,
"grad_norm": 12.172269821166992,
"learning_rate": 8.725791107888825e-06,
"loss": 0.0885,
"num_input_tokens_seen": 1920672,
"step": 3080
},
{
"epoch": 6.194779116465863,
"grad_norm": 8.986129760742188,
"learning_rate": 8.719942325056496e-06,
"loss": 0.1114,
"num_input_tokens_seen": 1923776,
"step": 3085
},
{
"epoch": 6.204819277108434,
"grad_norm": 4.885204315185547,
"learning_rate": 8.71408211943519e-06,
"loss": 0.1036,
"num_input_tokens_seen": 1926464,
"step": 3090
},
{
"epoch": 6.214859437751004,
"grad_norm": 9.752225875854492,
"learning_rate": 8.70821050901978e-06,
"loss": 0.1345,
"num_input_tokens_seen": 1929792,
"step": 3095
},
{
"epoch": 6.224899598393574,
"grad_norm": 4.695402145385742,
"learning_rate": 8.702327511840165e-06,
"loss": 0.0932,
"num_input_tokens_seen": 1933664,
"step": 3100
},
{
"epoch": 6.234939759036145,
"grad_norm": 2.340029001235962,
"learning_rate": 8.6964331459612e-06,
"loss": 0.1173,
"num_input_tokens_seen": 1936704,
"step": 3105
},
{
"epoch": 6.244979919678715,
"grad_norm": 11.728029251098633,
"learning_rate": 8.690527429482658e-06,
"loss": 0.1965,
"num_input_tokens_seen": 1939552,
"step": 3110
},
{
"epoch": 6.255020080321285,
"grad_norm": 1.949618935585022,
"learning_rate": 8.68461038053916e-06,
"loss": 0.1401,
"num_input_tokens_seen": 1942944,
"step": 3115
},
{
"epoch": 6.265060240963855,
"grad_norm": 2.2153432369232178,
"learning_rate": 8.678682017300126e-06,
"loss": 0.0998,
"num_input_tokens_seen": 1945600,
"step": 3120
},
{
"epoch": 6.275100401606426,
"grad_norm": 2.1864218711853027,
"learning_rate": 8.672742357969724e-06,
"loss": 0.1296,
"num_input_tokens_seen": 1948416,
"step": 3125
},
{
"epoch": 6.285140562248996,
"grad_norm": 16.295392990112305,
"learning_rate": 8.666791420786805e-06,
"loss": 0.1204,
"num_input_tokens_seen": 1951296,
"step": 3130
},
{
"epoch": 6.295180722891566,
"grad_norm": 5.46299934387207,
"learning_rate": 8.660829224024849e-06,
"loss": 0.1233,
"num_input_tokens_seen": 1954784,
"step": 3135
},
{
"epoch": 6.305220883534137,
"grad_norm": 4.170035362243652,
"learning_rate": 8.654855785991915e-06,
"loss": 0.134,
"num_input_tokens_seen": 1957664,
"step": 3140
},
{
"epoch": 6.315261044176707,
"grad_norm": 9.24455451965332,
"learning_rate": 8.648871125030576e-06,
"loss": 0.078,
"num_input_tokens_seen": 1960736,
"step": 3145
},
{
"epoch": 6.325301204819277,
"grad_norm": 4.151712417602539,
"learning_rate": 8.642875259517871e-06,
"loss": 0.0517,
"num_input_tokens_seen": 1964448,
"step": 3150
},
{
"epoch": 6.335341365461847,
"grad_norm": 2.0245907306671143,
"learning_rate": 8.636868207865244e-06,
"loss": 0.1463,
"num_input_tokens_seen": 1967808,
"step": 3155
},
{
"epoch": 6.3453815261044175,
"grad_norm": 3.730196475982666,
"learning_rate": 8.630849988518486e-06,
"loss": 0.0814,
"num_input_tokens_seen": 1970592,
"step": 3160
},
{
"epoch": 6.355421686746988,
"grad_norm": 0.7733585238456726,
"learning_rate": 8.62482061995768e-06,
"loss": 0.0911,
"num_input_tokens_seen": 1973856,
"step": 3165
},
{
"epoch": 6.365461847389558,
"grad_norm": 13.04262638092041,
"learning_rate": 8.618780120697152e-06,
"loss": 0.1716,
"num_input_tokens_seen": 1977760,
"step": 3170
},
{
"epoch": 6.375502008032129,
"grad_norm": 27.22624397277832,
"learning_rate": 8.612728509285395e-06,
"loss": 0.1568,
"num_input_tokens_seen": 1981408,
"step": 3175
},
{
"epoch": 6.385542168674699,
"grad_norm": 1.5505954027175903,
"learning_rate": 8.606665804305034e-06,
"loss": 0.0847,
"num_input_tokens_seen": 1985056,
"step": 3180
},
{
"epoch": 6.395582329317269,
"grad_norm": 7.722294330596924,
"learning_rate": 8.600592024372756e-06,
"loss": 0.1526,
"num_input_tokens_seen": 1988000,
"step": 3185
},
{
"epoch": 6.405622489959839,
"grad_norm": 4.913753509521484,
"learning_rate": 8.594507188139251e-06,
"loss": 0.1492,
"num_input_tokens_seen": 1991168,
"step": 3190
},
{
"epoch": 6.4156626506024095,
"grad_norm": 5.078114032745361,
"learning_rate": 8.588411314289169e-06,
"loss": 0.0747,
"num_input_tokens_seen": 1994560,
"step": 3195
},
{
"epoch": 6.42570281124498,
"grad_norm": 1.480210542678833,
"learning_rate": 8.582304421541045e-06,
"loss": 0.1097,
"num_input_tokens_seen": 1997248,
"step": 3200
},
{
"epoch": 6.43574297188755,
"grad_norm": 27.352954864501953,
"learning_rate": 8.576186528647253e-06,
"loss": 0.1648,
"num_input_tokens_seen": 2000736,
"step": 3205
},
{
"epoch": 6.445783132530121,
"grad_norm": 2.2470853328704834,
"learning_rate": 8.570057654393943e-06,
"loss": 0.0742,
"num_input_tokens_seen": 2004192,
"step": 3210
},
{
"epoch": 6.455823293172691,
"grad_norm": 6.305330753326416,
"learning_rate": 8.563917817600988e-06,
"loss": 0.1466,
"num_input_tokens_seen": 2007616,
"step": 3215
},
{
"epoch": 6.4658634538152615,
"grad_norm": 7.9827752113342285,
"learning_rate": 8.557767037121923e-06,
"loss": 0.1116,
"num_input_tokens_seen": 2010720,
"step": 3220
},
{
"epoch": 6.475903614457831,
"grad_norm": 4.350990295410156,
"learning_rate": 8.551605331843885e-06,
"loss": 0.1186,
"num_input_tokens_seen": 2014368,
"step": 3225
},
{
"epoch": 6.485943775100401,
"grad_norm": 2.8370282649993896,
"learning_rate": 8.545432720687558e-06,
"loss": 0.1393,
"num_input_tokens_seen": 2017280,
"step": 3230
},
{
"epoch": 6.495983935742972,
"grad_norm": 3.6251227855682373,
"learning_rate": 8.53924922260712e-06,
"loss": 0.2241,
"num_input_tokens_seen": 2020256,
"step": 3235
},
{
"epoch": 6.506024096385542,
"grad_norm": 2.832481861114502,
"learning_rate": 8.533054856590175e-06,
"loss": 0.1532,
"num_input_tokens_seen": 2023968,
"step": 3240
},
{
"epoch": 6.516064257028113,
"grad_norm": 7.332273483276367,
"learning_rate": 8.526849641657697e-06,
"loss": 0.1268,
"num_input_tokens_seen": 2026208,
"step": 3245
},
{
"epoch": 6.526104417670683,
"grad_norm": 15.295309066772461,
"learning_rate": 8.520633596863978e-06,
"loss": 0.1392,
"num_input_tokens_seen": 2028512,
"step": 3250
},
{
"epoch": 6.5361445783132535,
"grad_norm": 3.655897617340088,
"learning_rate": 8.514406741296565e-06,
"loss": 0.161,
"num_input_tokens_seen": 2031456,
"step": 3255
},
{
"epoch": 6.546184738955823,
"grad_norm": 3.424639940261841,
"learning_rate": 8.508169094076197e-06,
"loss": 0.108,
"num_input_tokens_seen": 2033888,
"step": 3260
},
{
"epoch": 6.556224899598393,
"grad_norm": 3.003465175628662,
"learning_rate": 8.501920674356755e-06,
"loss": 0.1776,
"num_input_tokens_seen": 2037312,
"step": 3265
},
{
"epoch": 6.566265060240964,
"grad_norm": 2.6872596740722656,
"learning_rate": 8.495661501325197e-06,
"loss": 0.1337,
"num_input_tokens_seen": 2040448,
"step": 3270
},
{
"epoch": 6.576305220883534,
"grad_norm": 5.785903453826904,
"learning_rate": 8.489391594201503e-06,
"loss": 0.1077,
"num_input_tokens_seen": 2043968,
"step": 3275
},
{
"epoch": 6.586345381526105,
"grad_norm": 4.747533798217773,
"learning_rate": 8.483110972238612e-06,
"loss": 0.1124,
"num_input_tokens_seen": 2047584,
"step": 3280
},
{
"epoch": 6.596385542168675,
"grad_norm": 9.165260314941406,
"learning_rate": 8.476819654722365e-06,
"loss": 0.0998,
"num_input_tokens_seen": 2051136,
"step": 3285
},
{
"epoch": 6.606425702811245,
"grad_norm": 3.9798431396484375,
"learning_rate": 8.47051766097145e-06,
"loss": 0.1312,
"num_input_tokens_seen": 2054784,
"step": 3290
},
{
"epoch": 6.616465863453815,
"grad_norm": 3.7183773517608643,
"learning_rate": 8.46420501033733e-06,
"loss": 0.1411,
"num_input_tokens_seen": 2058176,
"step": 3295
},
{
"epoch": 6.626506024096385,
"grad_norm": 1.8856403827667236,
"learning_rate": 8.457881722204201e-06,
"loss": 0.1068,
"num_input_tokens_seen": 2061472,
"step": 3300
},
{
"epoch": 6.636546184738956,
"grad_norm": 6.899387359619141,
"learning_rate": 8.45154781598892e-06,
"loss": 0.1221,
"num_input_tokens_seen": 2064288,
"step": 3305
},
{
"epoch": 6.646586345381526,
"grad_norm": 6.279878616333008,
"learning_rate": 8.445203311140944e-06,
"loss": 0.0784,
"num_input_tokens_seen": 2067936,
"step": 3310
},
{
"epoch": 6.656626506024097,
"grad_norm": 5.195173263549805,
"learning_rate": 8.438848227142282e-06,
"loss": 0.1184,
"num_input_tokens_seen": 2070752,
"step": 3315
},
{
"epoch": 6.666666666666667,
"grad_norm": 2.3702588081359863,
"learning_rate": 8.432482583507425e-06,
"loss": 0.0321,
"num_input_tokens_seen": 2073664,
"step": 3320
},
{
"epoch": 6.676706827309237,
"grad_norm": 0.9415891170501709,
"learning_rate": 8.42610639978329e-06,
"loss": 0.1295,
"num_input_tokens_seen": 2077376,
"step": 3325
},
{
"epoch": 6.686746987951807,
"grad_norm": 0.7066106796264648,
"learning_rate": 8.41971969554916e-06,
"loss": 0.074,
"num_input_tokens_seen": 2080608,
"step": 3330
},
{
"epoch": 6.696787148594377,
"grad_norm": 2.8969779014587402,
"learning_rate": 8.413322490416623e-06,
"loss": 0.1991,
"num_input_tokens_seen": 2083104,
"step": 3335
},
{
"epoch": 6.706827309236948,
"grad_norm": 2.466632843017578,
"learning_rate": 8.40691480402951e-06,
"loss": 0.1012,
"num_input_tokens_seen": 2085856,
"step": 3340
},
{
"epoch": 6.716867469879518,
"grad_norm": 8.746673583984375,
"learning_rate": 8.40049665606384e-06,
"loss": 0.1205,
"num_input_tokens_seen": 2088928,
"step": 3345
},
{
"epoch": 6.7269076305220885,
"grad_norm": 4.114537715911865,
"learning_rate": 8.394068066227752e-06,
"loss": 0.2032,
"num_input_tokens_seen": 2091616,
"step": 3350
},
{
"epoch": 6.736947791164659,
"grad_norm": 11.16876220703125,
"learning_rate": 8.387629054261454e-06,
"loss": 0.1548,
"num_input_tokens_seen": 2094272,
"step": 3355
},
{
"epoch": 6.746987951807229,
"grad_norm": 3.5258936882019043,
"learning_rate": 8.381179639937152e-06,
"loss": 0.1488,
"num_input_tokens_seen": 2097152,
"step": 3360
},
{
"epoch": 6.757028112449799,
"grad_norm": 5.957228660583496,
"learning_rate": 8.374719843059e-06,
"loss": 0.1051,
"num_input_tokens_seen": 2100480,
"step": 3365
},
{
"epoch": 6.767068273092369,
"grad_norm": 3.576631784439087,
"learning_rate": 8.368249683463028e-06,
"loss": 0.1419,
"num_input_tokens_seen": 2103552,
"step": 3370
},
{
"epoch": 6.77710843373494,
"grad_norm": 5.911473274230957,
"learning_rate": 8.361769181017089e-06,
"loss": 0.1153,
"num_input_tokens_seen": 2106848,
"step": 3375
},
{
"epoch": 6.78714859437751,
"grad_norm": 7.324421405792236,
"learning_rate": 8.355278355620795e-06,
"loss": 0.1921,
"num_input_tokens_seen": 2109632,
"step": 3380
},
{
"epoch": 6.7971887550200805,
"grad_norm": 1.6949467658996582,
"learning_rate": 8.348777227205462e-06,
"loss": 0.1737,
"num_input_tokens_seen": 2112128,
"step": 3385
},
{
"epoch": 6.807228915662651,
"grad_norm": 4.560188293457031,
"learning_rate": 8.342265815734034e-06,
"loss": 0.0897,
"num_input_tokens_seen": 2114592,
"step": 3390
},
{
"epoch": 6.817269076305221,
"grad_norm": 10.877038955688477,
"learning_rate": 8.335744141201037e-06,
"loss": 0.1537,
"num_input_tokens_seen": 2117728,
"step": 3395
},
{
"epoch": 6.827309236947791,
"grad_norm": 9.559526443481445,
"learning_rate": 8.329212223632511e-06,
"loss": 0.1561,
"num_input_tokens_seen": 2121792,
"step": 3400
},
{
"epoch": 6.837349397590361,
"grad_norm": 2.976337194442749,
"learning_rate": 8.32267008308595e-06,
"loss": 0.1087,
"num_input_tokens_seen": 2124736,
"step": 3405
},
{
"epoch": 6.847389558232932,
"grad_norm": 3.3281893730163574,
"learning_rate": 8.316117739650235e-06,
"loss": 0.133,
"num_input_tokens_seen": 2127456,
"step": 3410
},
{
"epoch": 6.857429718875502,
"grad_norm": 3.8284034729003906,
"learning_rate": 8.309555213445583e-06,
"loss": 0.1316,
"num_input_tokens_seen": 2130720,
"step": 3415
},
{
"epoch": 6.867469879518072,
"grad_norm": 5.083580017089844,
"learning_rate": 8.302982524623475e-06,
"loss": 0.0751,
"num_input_tokens_seen": 2133376,
"step": 3420
},
{
"epoch": 6.877510040160643,
"grad_norm": 2.2928466796875,
"learning_rate": 8.296399693366601e-06,
"loss": 0.1011,
"num_input_tokens_seen": 2135872,
"step": 3425
},
{
"epoch": 6.887550200803213,
"grad_norm": 0.9877734184265137,
"learning_rate": 8.289806739888791e-06,
"loss": 0.1031,
"num_input_tokens_seen": 2138592,
"step": 3430
},
{
"epoch": 6.897590361445783,
"grad_norm": 12.757080078125,
"learning_rate": 8.283203684434963e-06,
"loss": 0.1485,
"num_input_tokens_seen": 2141312,
"step": 3435
},
{
"epoch": 6.907630522088353,
"grad_norm": 2.4897549152374268,
"learning_rate": 8.27659054728105e-06,
"loss": 0.1108,
"num_input_tokens_seen": 2143936,
"step": 3440
},
{
"epoch": 6.917670682730924,
"grad_norm": 1.2650902271270752,
"learning_rate": 8.269967348733947e-06,
"loss": 0.1032,
"num_input_tokens_seen": 2147456,
"step": 3445
},
{
"epoch": 6.927710843373494,
"grad_norm": 11.644404411315918,
"learning_rate": 8.26333410913144e-06,
"loss": 0.1131,
"num_input_tokens_seen": 2150624,
"step": 3450
},
{
"epoch": 6.937751004016064,
"grad_norm": 4.924938201904297,
"learning_rate": 8.256690848842153e-06,
"loss": 0.124,
"num_input_tokens_seen": 2154176,
"step": 3455
},
{
"epoch": 6.947791164658635,
"grad_norm": 15.072479248046875,
"learning_rate": 8.250037588265473e-06,
"loss": 0.1661,
"num_input_tokens_seen": 2157056,
"step": 3460
},
{
"epoch": 6.957831325301205,
"grad_norm": 1.2311360836029053,
"learning_rate": 8.243374347831505e-06,
"loss": 0.0956,
"num_input_tokens_seen": 2160480,
"step": 3465
},
{
"epoch": 6.967871485943775,
"grad_norm": 4.724909782409668,
"learning_rate": 8.236701148000989e-06,
"loss": 0.1597,
"num_input_tokens_seen": 2163840,
"step": 3470
},
{
"epoch": 6.977911646586345,
"grad_norm": 4.713107585906982,
"learning_rate": 8.230018009265255e-06,
"loss": 0.1122,
"num_input_tokens_seen": 2166848,
"step": 3475
},
{
"epoch": 6.9879518072289155,
"grad_norm": 3.1349759101867676,
"learning_rate": 8.223324952146145e-06,
"loss": 0.1319,
"num_input_tokens_seen": 2169568,
"step": 3480
},
{
"epoch": 6.997991967871486,
"grad_norm": 6.093204021453857,
"learning_rate": 8.216621997195966e-06,
"loss": 0.0853,
"num_input_tokens_seen": 2172288,
"step": 3485
},
{
"epoch": 7.008032128514056,
"grad_norm": 2.002347946166992,
"learning_rate": 8.209909164997409e-06,
"loss": 0.1287,
"num_input_tokens_seen": 2175136,
"step": 3490
},
{
"epoch": 7.018072289156627,
"grad_norm": 9.41158390045166,
"learning_rate": 8.203186476163503e-06,
"loss": 0.0723,
"num_input_tokens_seen": 2178848,
"step": 3495
},
{
"epoch": 7.028112449799197,
"grad_norm": 11.366344451904297,
"learning_rate": 8.196453951337538e-06,
"loss": 0.0719,
"num_input_tokens_seen": 2181568,
"step": 3500
},
{
"epoch": 7.038152610441767,
"grad_norm": 0.7649056315422058,
"learning_rate": 8.189711611193012e-06,
"loss": 0.1081,
"num_input_tokens_seen": 2185664,
"step": 3505
},
{
"epoch": 7.048192771084337,
"grad_norm": 3.984673500061035,
"learning_rate": 8.182959476433555e-06,
"loss": 0.1156,
"num_input_tokens_seen": 2189536,
"step": 3510
},
{
"epoch": 7.0582329317269075,
"grad_norm": 12.080748558044434,
"learning_rate": 8.176197567792883e-06,
"loss": 0.0488,
"num_input_tokens_seen": 2192672,
"step": 3515
},
{
"epoch": 7.068273092369478,
"grad_norm": 4.369615077972412,
"learning_rate": 8.169425906034718e-06,
"loss": 0.1427,
"num_input_tokens_seen": 2195136,
"step": 3520
},
{
"epoch": 7.078313253012048,
"grad_norm": 7.291754722595215,
"learning_rate": 8.162644511952735e-06,
"loss": 0.1883,
"num_input_tokens_seen": 2198368,
"step": 3525
},
{
"epoch": 7.088353413654619,
"grad_norm": 1.121962308883667,
"learning_rate": 8.155853406370488e-06,
"loss": 0.1243,
"num_input_tokens_seen": 2201376,
"step": 3530
},
{
"epoch": 7.098393574297189,
"grad_norm": 5.575697422027588,
"learning_rate": 8.149052610141357e-06,
"loss": 0.1082,
"num_input_tokens_seen": 2204160,
"step": 3535
},
{
"epoch": 7.108433734939759,
"grad_norm": 11.145264625549316,
"learning_rate": 8.142242144148478e-06,
"loss": 0.1217,
"num_input_tokens_seen": 2207296,
"step": 3540
},
{
"epoch": 7.118473895582329,
"grad_norm": 0.5861991047859192,
"learning_rate": 8.135422029304682e-06,
"loss": 0.0637,
"num_input_tokens_seen": 2210528,
"step": 3545
},
{
"epoch": 7.128514056224899,
"grad_norm": 10.110943794250488,
"learning_rate": 8.128592286552422e-06,
"loss": 0.101,
"num_input_tokens_seen": 2212960,
"step": 3550
},
{
"epoch": 7.13855421686747,
"grad_norm": 22.04067039489746,
"learning_rate": 8.12175293686372e-06,
"loss": 0.114,
"num_input_tokens_seen": 2216032,
"step": 3555
},
{
"epoch": 7.14859437751004,
"grad_norm": 30.2784366607666,
"learning_rate": 8.1149040012401e-06,
"loss": 0.1794,
"num_input_tokens_seen": 2218944,
"step": 3560
},
{
"epoch": 7.158634538152611,
"grad_norm": 17.950511932373047,
"learning_rate": 8.108045500712518e-06,
"loss": 0.1161,
"num_input_tokens_seen": 2222336,
"step": 3565
},
{
"epoch": 7.168674698795181,
"grad_norm": 28.468650817871094,
"learning_rate": 8.101177456341301e-06,
"loss": 0.1088,
"num_input_tokens_seen": 2225472,
"step": 3570
},
{
"epoch": 7.178714859437751,
"grad_norm": 10.21017074584961,
"learning_rate": 8.094299889216081e-06,
"loss": 0.1078,
"num_input_tokens_seen": 2228320,
"step": 3575
},
{
"epoch": 7.188755020080321,
"grad_norm": 6.275518417358398,
"learning_rate": 8.087412820455738e-06,
"loss": 0.1111,
"num_input_tokens_seen": 2231648,
"step": 3580
},
{
"epoch": 7.198795180722891,
"grad_norm": 3.738758087158203,
"learning_rate": 8.080516271208319e-06,
"loss": 0.0929,
"num_input_tokens_seen": 2234560,
"step": 3585
},
{
"epoch": 7.208835341365462,
"grad_norm": 4.079806327819824,
"learning_rate": 8.07361026265099e-06,
"loss": 0.1326,
"num_input_tokens_seen": 2237728,
"step": 3590
},
{
"epoch": 7.218875502008032,
"grad_norm": 4.522058010101318,
"learning_rate": 8.066694815989961e-06,
"loss": 0.0802,
"num_input_tokens_seen": 2240992,
"step": 3595
},
{
"epoch": 7.228915662650603,
"grad_norm": 7.339025020599365,
"learning_rate": 8.059769952460423e-06,
"loss": 0.1238,
"num_input_tokens_seen": 2244608,
"step": 3600
},
{
"epoch": 7.238955823293173,
"grad_norm": 4.954975128173828,
"learning_rate": 8.052835693326484e-06,
"loss": 0.1064,
"num_input_tokens_seen": 2247840,
"step": 3605
},
{
"epoch": 7.2489959839357425,
"grad_norm": 31.905029296875,
"learning_rate": 8.045892059881101e-06,
"loss": 0.2156,
"num_input_tokens_seen": 2251104,
"step": 3610
},
{
"epoch": 7.259036144578313,
"grad_norm": 2.187739372253418,
"learning_rate": 8.038939073446022e-06,
"loss": 0.136,
"num_input_tokens_seen": 2254240,
"step": 3615
},
{
"epoch": 7.269076305220883,
"grad_norm": 1.4481827020645142,
"learning_rate": 8.031976755371709e-06,
"loss": 0.119,
"num_input_tokens_seen": 2257472,
"step": 3620
},
{
"epoch": 7.279116465863454,
"grad_norm": 1.2066372632980347,
"learning_rate": 8.025005127037282e-06,
"loss": 0.0584,
"num_input_tokens_seen": 2260640,
"step": 3625
},
{
"epoch": 7.289156626506024,
"grad_norm": 6.761447429656982,
"learning_rate": 8.018024209850448e-06,
"loss": 0.1104,
"num_input_tokens_seen": 2264544,
"step": 3630
},
{
"epoch": 7.2991967871485945,
"grad_norm": 3.3396995067596436,
"learning_rate": 8.01103402524744e-06,
"loss": 0.1133,
"num_input_tokens_seen": 2268064,
"step": 3635
},
{
"epoch": 7.309236947791165,
"grad_norm": 13.4473876953125,
"learning_rate": 8.004034594692946e-06,
"loss": 0.098,
"num_input_tokens_seen": 2271136,
"step": 3640
},
{
"epoch": 7.3192771084337345,
"grad_norm": 4.999135971069336,
"learning_rate": 7.997025939680047e-06,
"loss": 0.0922,
"num_input_tokens_seen": 2274016,
"step": 3645
},
{
"epoch": 7.329317269076305,
"grad_norm": 7.845126628875732,
"learning_rate": 7.990008081730145e-06,
"loss": 0.1477,
"num_input_tokens_seen": 2277344,
"step": 3650
},
{
"epoch": 7.339357429718875,
"grad_norm": 5.341637134552002,
"learning_rate": 7.982981042392907e-06,
"loss": 0.0949,
"num_input_tokens_seen": 2280480,
"step": 3655
},
{
"epoch": 7.349397590361446,
"grad_norm": 10.169334411621094,
"learning_rate": 7.975944843246195e-06,
"loss": 0.1056,
"num_input_tokens_seen": 2283616,
"step": 3660
},
{
"epoch": 7.359437751004016,
"grad_norm": 8.788763046264648,
"learning_rate": 7.968899505895987e-06,
"loss": 0.0823,
"num_input_tokens_seen": 2285888,
"step": 3665
},
{
"epoch": 7.3694779116465865,
"grad_norm": 3.360013008117676,
"learning_rate": 7.961845051976334e-06,
"loss": 0.0945,
"num_input_tokens_seen": 2289920,
"step": 3670
},
{
"epoch": 7.379518072289157,
"grad_norm": 5.178712844848633,
"learning_rate": 7.954781503149272e-06,
"loss": 0.1121,
"num_input_tokens_seen": 2293152,
"step": 3675
},
{
"epoch": 7.389558232931727,
"grad_norm": 4.379055976867676,
"learning_rate": 7.94770888110477e-06,
"loss": 0.1065,
"num_input_tokens_seen": 2295680,
"step": 3680
},
{
"epoch": 7.399598393574297,
"grad_norm": 7.7414069175720215,
"learning_rate": 7.940627207560655e-06,
"loss": 0.1099,
"num_input_tokens_seen": 2299264,
"step": 3685
},
{
"epoch": 7.409638554216867,
"grad_norm": 11.378726959228516,
"learning_rate": 7.933536504262554e-06,
"loss": 0.1326,
"num_input_tokens_seen": 2302528,
"step": 3690
},
{
"epoch": 7.419678714859438,
"grad_norm": 4.260462284088135,
"learning_rate": 7.926436792983813e-06,
"loss": 0.2,
"num_input_tokens_seen": 2305344,
"step": 3695
},
{
"epoch": 7.429718875502008,
"grad_norm": 12.941370964050293,
"learning_rate": 7.919328095525446e-06,
"loss": 0.1095,
"num_input_tokens_seen": 2308480,
"step": 3700
},
{
"epoch": 7.4397590361445785,
"grad_norm": 14.121201515197754,
"learning_rate": 7.912210433716054e-06,
"loss": 0.0761,
"num_input_tokens_seen": 2311712,
"step": 3705
},
{
"epoch": 7.449799196787149,
"grad_norm": 1.1341758966445923,
"learning_rate": 7.90508382941177e-06,
"loss": 0.1312,
"num_input_tokens_seen": 2314816,
"step": 3710
},
{
"epoch": 7.459839357429719,
"grad_norm": 2.659677743911743,
"learning_rate": 7.897948304496189e-06,
"loss": 0.1492,
"num_input_tokens_seen": 2317088,
"step": 3715
},
{
"epoch": 7.469879518072289,
"grad_norm": 1.2822554111480713,
"learning_rate": 7.890803880880291e-06,
"loss": 0.0939,
"num_input_tokens_seen": 2320192,
"step": 3720
},
{
"epoch": 7.479919678714859,
"grad_norm": 0.40696293115615845,
"learning_rate": 7.883650580502384e-06,
"loss": 0.1041,
"num_input_tokens_seen": 2323328,
"step": 3725
},
{
"epoch": 7.48995983935743,
"grad_norm": 7.722894191741943,
"learning_rate": 7.876488425328037e-06,
"loss": 0.1267,
"num_input_tokens_seen": 2325760,
"step": 3730
},
{
"epoch": 7.5,
"grad_norm": 0.4684428870677948,
"learning_rate": 7.869317437350007e-06,
"loss": 0.0458,
"num_input_tokens_seen": 2329280,
"step": 3735
},
{
"epoch": 7.51004016064257,
"grad_norm": 6.411214351654053,
"learning_rate": 7.862137638588171e-06,
"loss": 0.1071,
"num_input_tokens_seen": 2332544,
"step": 3740
},
{
"epoch": 7.520080321285141,
"grad_norm": 7.014988422393799,
"learning_rate": 7.854949051089467e-06,
"loss": 0.1379,
"num_input_tokens_seen": 2336352,
"step": 3745
},
{
"epoch": 7.530120481927711,
"grad_norm": 0.7333240509033203,
"learning_rate": 7.847751696927813e-06,
"loss": 0.0351,
"num_input_tokens_seen": 2339840,
"step": 3750
},
{
"epoch": 7.540160642570282,
"grad_norm": 2.350759267807007,
"learning_rate": 7.840545598204056e-06,
"loss": 0.1339,
"num_input_tokens_seen": 2343808,
"step": 3755
},
{
"epoch": 7.550200803212851,
"grad_norm": 11.666228294372559,
"learning_rate": 7.833330777045886e-06,
"loss": 0.1692,
"num_input_tokens_seen": 2346816,
"step": 3760
},
{
"epoch": 7.5602409638554215,
"grad_norm": 13.286596298217773,
"learning_rate": 7.826107255607784e-06,
"loss": 0.1368,
"num_input_tokens_seen": 2349888,
"step": 3765
},
{
"epoch": 7.570281124497992,
"grad_norm": 8.502277374267578,
"learning_rate": 7.818875056070944e-06,
"loss": 0.0795,
"num_input_tokens_seen": 2352960,
"step": 3770
},
{
"epoch": 7.580321285140562,
"grad_norm": 9.710450172424316,
"learning_rate": 7.811634200643202e-06,
"loss": 0.1426,
"num_input_tokens_seen": 2356672,
"step": 3775
},
{
"epoch": 7.590361445783133,
"grad_norm": 13.459878921508789,
"learning_rate": 7.804384711558983e-06,
"loss": 0.1388,
"num_input_tokens_seen": 2360032,
"step": 3780
},
{
"epoch": 7.600401606425703,
"grad_norm": 5.783694267272949,
"learning_rate": 7.797126611079219e-06,
"loss": 0.0769,
"num_input_tokens_seen": 2363456,
"step": 3785
},
{
"epoch": 7.610441767068274,
"grad_norm": 13.784896850585938,
"learning_rate": 7.789859921491288e-06,
"loss": 0.1245,
"num_input_tokens_seen": 2366912,
"step": 3790
},
{
"epoch": 7.620481927710843,
"grad_norm": 6.058487892150879,
"learning_rate": 7.782584665108934e-06,
"loss": 0.1209,
"num_input_tokens_seen": 2371008,
"step": 3795
},
{
"epoch": 7.6305220883534135,
"grad_norm": 10.394042015075684,
"learning_rate": 7.775300864272214e-06,
"loss": 0.0855,
"num_input_tokens_seen": 2374016,
"step": 3800
},
{
"epoch": 7.640562248995984,
"grad_norm": 9.327715873718262,
"learning_rate": 7.768008541347423e-06,
"loss": 0.166,
"num_input_tokens_seen": 2377408,
"step": 3805
},
{
"epoch": 7.650602409638554,
"grad_norm": 27.500553131103516,
"learning_rate": 7.760707718727023e-06,
"loss": 0.2516,
"num_input_tokens_seen": 2379680,
"step": 3810
},
{
"epoch": 7.660642570281125,
"grad_norm": 11.609461784362793,
"learning_rate": 7.753398418829572e-06,
"loss": 0.1049,
"num_input_tokens_seen": 2383200,
"step": 3815
},
{
"epoch": 7.670682730923695,
"grad_norm": 1.3528823852539062,
"learning_rate": 7.746080664099667e-06,
"loss": 0.0645,
"num_input_tokens_seen": 2386048,
"step": 3820
},
{
"epoch": 7.6807228915662655,
"grad_norm": 1.679419994354248,
"learning_rate": 7.73875447700786e-06,
"loss": 0.0484,
"num_input_tokens_seen": 2389152,
"step": 3825
},
{
"epoch": 7.690763052208835,
"grad_norm": 21.143573760986328,
"learning_rate": 7.731419880050599e-06,
"loss": 0.198,
"num_input_tokens_seen": 2392064,
"step": 3830
},
{
"epoch": 7.7008032128514055,
"grad_norm": 11.375654220581055,
"learning_rate": 7.72407689575016e-06,
"loss": 0.1013,
"num_input_tokens_seen": 2395488,
"step": 3835
},
{
"epoch": 7.710843373493976,
"grad_norm": 38.27085876464844,
"learning_rate": 7.716725546654564e-06,
"loss": 0.1659,
"num_input_tokens_seen": 2398496,
"step": 3840
},
{
"epoch": 7.720883534136546,
"grad_norm": 11.487302780151367,
"learning_rate": 7.709365855337528e-06,
"loss": 0.0891,
"num_input_tokens_seen": 2401728,
"step": 3845
},
{
"epoch": 7.730923694779117,
"grad_norm": 3.648078203201294,
"learning_rate": 7.701997844398379e-06,
"loss": 0.1085,
"num_input_tokens_seen": 2404320,
"step": 3850
},
{
"epoch": 7.740963855421687,
"grad_norm": 13.500536918640137,
"learning_rate": 7.694621536461995e-06,
"loss": 0.1266,
"num_input_tokens_seen": 2407424,
"step": 3855
},
{
"epoch": 7.7510040160642575,
"grad_norm": 2.1225502490997314,
"learning_rate": 7.687236954178729e-06,
"loss": 0.0699,
"num_input_tokens_seen": 2411136,
"step": 3860
},
{
"epoch": 7.761044176706827,
"grad_norm": 7.856932640075684,
"learning_rate": 7.67984412022434e-06,
"loss": 0.099,
"num_input_tokens_seen": 2414080,
"step": 3865
},
{
"epoch": 7.771084337349397,
"grad_norm": 9.14748477935791,
"learning_rate": 7.672443057299931e-06,
"loss": 0.1714,
"num_input_tokens_seen": 2416832,
"step": 3870
},
{
"epoch": 7.781124497991968,
"grad_norm": 0.6267831325531006,
"learning_rate": 7.665033788131869e-06,
"loss": 0.0734,
"num_input_tokens_seen": 2419680,
"step": 3875
},
{
"epoch": 7.791164658634538,
"grad_norm": 4.2000250816345215,
"learning_rate": 7.657616335471723e-06,
"loss": 0.1235,
"num_input_tokens_seen": 2422848,
"step": 3880
},
{
"epoch": 7.801204819277109,
"grad_norm": 29.275188446044922,
"learning_rate": 7.650190722096188e-06,
"loss": 0.1255,
"num_input_tokens_seen": 2426816,
"step": 3885
},
{
"epoch": 7.811244979919679,
"grad_norm": 4.945994853973389,
"learning_rate": 7.64275697080702e-06,
"loss": 0.1027,
"num_input_tokens_seen": 2429440,
"step": 3890
},
{
"epoch": 7.821285140562249,
"grad_norm": 3.7940685749053955,
"learning_rate": 7.635315104430959e-06,
"loss": 0.0784,
"num_input_tokens_seen": 2432064,
"step": 3895
},
{
"epoch": 7.831325301204819,
"grad_norm": 1.6572811603546143,
"learning_rate": 7.6278651458196724e-06,
"loss": 0.0596,
"num_input_tokens_seen": 2435328,
"step": 3900
},
{
"epoch": 7.841365461847389,
"grad_norm": 15.092347145080566,
"learning_rate": 7.620407117849674e-06,
"loss": 0.1102,
"num_input_tokens_seen": 2438240,
"step": 3905
},
{
"epoch": 7.85140562248996,
"grad_norm": 0.5130079388618469,
"learning_rate": 7.6129410434222505e-06,
"loss": 0.036,
"num_input_tokens_seen": 2441312,
"step": 3910
},
{
"epoch": 7.86144578313253,
"grad_norm": 25.864091873168945,
"learning_rate": 7.6054669454634025e-06,
"loss": 0.0824,
"num_input_tokens_seen": 2444288,
"step": 3915
},
{
"epoch": 7.871485943775101,
"grad_norm": 36.89551544189453,
"learning_rate": 7.597984846923765e-06,
"loss": 0.0896,
"num_input_tokens_seen": 2447360,
"step": 3920
},
{
"epoch": 7.881526104417671,
"grad_norm": 33.04966354370117,
"learning_rate": 7.5904947707785434e-06,
"loss": 0.1538,
"num_input_tokens_seen": 2449920,
"step": 3925
},
{
"epoch": 7.891566265060241,
"grad_norm": 40.92861557006836,
"learning_rate": 7.582996740027438e-06,
"loss": 0.2447,
"num_input_tokens_seen": 2453120,
"step": 3930
},
{
"epoch": 7.901606425702811,
"grad_norm": 7.3696770668029785,
"learning_rate": 7.575490777694572e-06,
"loss": 0.1763,
"num_input_tokens_seen": 2456512,
"step": 3935
},
{
"epoch": 7.911646586345381,
"grad_norm": 10.613399505615234,
"learning_rate": 7.567976906828431e-06,
"loss": 0.1581,
"num_input_tokens_seen": 2459488,
"step": 3940
},
{
"epoch": 7.921686746987952,
"grad_norm": 10.762381553649902,
"learning_rate": 7.560455150501781e-06,
"loss": 0.1783,
"num_input_tokens_seen": 2462880,
"step": 3945
},
{
"epoch": 7.931726907630522,
"grad_norm": 9.3571138381958,
"learning_rate": 7.552925531811601e-06,
"loss": 0.1394,
"num_input_tokens_seen": 2466432,
"step": 3950
},
{
"epoch": 7.9417670682730925,
"grad_norm": 7.034041881561279,
"learning_rate": 7.545388073879018e-06,
"loss": 0.125,
"num_input_tokens_seen": 2470048,
"step": 3955
},
{
"epoch": 7.951807228915663,
"grad_norm": 3.9113035202026367,
"learning_rate": 7.537842799849223e-06,
"loss": 0.1295,
"num_input_tokens_seen": 2473344,
"step": 3960
},
{
"epoch": 7.961847389558233,
"grad_norm": 3.321641445159912,
"learning_rate": 7.530289732891415e-06,
"loss": 0.09,
"num_input_tokens_seen": 2475904,
"step": 3965
},
{
"epoch": 7.971887550200803,
"grad_norm": 4.113049507141113,
"learning_rate": 7.522728896198718e-06,
"loss": 0.1257,
"num_input_tokens_seen": 2479584,
"step": 3970
},
{
"epoch": 7.981927710843373,
"grad_norm": 4.71891975402832,
"learning_rate": 7.515160312988117e-06,
"loss": 0.0629,
"num_input_tokens_seen": 2482208,
"step": 3975
},
{
"epoch": 7.991967871485944,
"grad_norm": 3.222778081893921,
"learning_rate": 7.507584006500381e-06,
"loss": 0.0842,
"num_input_tokens_seen": 2485760,
"step": 3980
},
{
"epoch": 8.0,
"eval_loss": 0.1914680153131485,
"eval_runtime": 8.0714,
"eval_samples_per_second": 61.699,
"eval_steps_per_second": 15.487,
"num_input_tokens_seen": 2487712,
"step": 3984
},
{
"epoch": 8.002008032128513,
"grad_norm": 9.23020076751709,
"learning_rate": 7.500000000000001e-06,
"loss": 0.128,
"num_input_tokens_seen": 2488608,
"step": 3985
},
{
"epoch": 8.012048192771084,
"grad_norm": 21.651845932006836,
"learning_rate": 7.492408316775105e-06,
"loss": 0.1054,
"num_input_tokens_seen": 2491424,
"step": 3990
},
{
"epoch": 8.022088353413654,
"grad_norm": 17.216054916381836,
"learning_rate": 7.4848089801374005e-06,
"loss": 0.1303,
"num_input_tokens_seen": 2495136,
"step": 3995
},
{
"epoch": 8.032128514056225,
"grad_norm": 10.99027156829834,
"learning_rate": 7.47720201342209e-06,
"loss": 0.0562,
"num_input_tokens_seen": 2497504,
"step": 4000
},
{
"epoch": 8.042168674698795,
"grad_norm": 15.369956970214844,
"learning_rate": 7.469587439987811e-06,
"loss": 0.0594,
"num_input_tokens_seen": 2500928,
"step": 4005
},
{
"epoch": 8.052208835341366,
"grad_norm": 1.2656151056289673,
"learning_rate": 7.461965283216557e-06,
"loss": 0.0109,
"num_input_tokens_seen": 2504288,
"step": 4010
},
{
"epoch": 8.062248995983936,
"grad_norm": 1.8451780080795288,
"learning_rate": 7.454335566513603e-06,
"loss": 0.1269,
"num_input_tokens_seen": 2507072,
"step": 4015
},
{
"epoch": 8.072289156626505,
"grad_norm": 10.279462814331055,
"learning_rate": 7.446698313307445e-06,
"loss": 0.0798,
"num_input_tokens_seen": 2510176,
"step": 4020
},
{
"epoch": 8.082329317269076,
"grad_norm": 18.197235107421875,
"learning_rate": 7.43905354704972e-06,
"loss": 0.0449,
"num_input_tokens_seen": 2512576,
"step": 4025
},
{
"epoch": 8.092369477911646,
"grad_norm": 5.9086761474609375,
"learning_rate": 7.431401291215131e-06,
"loss": 0.1062,
"num_input_tokens_seen": 2515744,
"step": 4030
},
{
"epoch": 8.102409638554217,
"grad_norm": 5.934508800506592,
"learning_rate": 7.4237415693013846e-06,
"loss": 0.1092,
"num_input_tokens_seen": 2519136,
"step": 4035
},
{
"epoch": 8.112449799196787,
"grad_norm": 2.209650754928589,
"learning_rate": 7.416074404829108e-06,
"loss": 0.1781,
"num_input_tokens_seen": 2522432,
"step": 4040
},
{
"epoch": 8.122489959839358,
"grad_norm": 28.947336196899414,
"learning_rate": 7.408399821341787e-06,
"loss": 0.0839,
"num_input_tokens_seen": 2525856,
"step": 4045
},
{
"epoch": 8.132530120481928,
"grad_norm": 1.6147160530090332,
"learning_rate": 7.400717842405688e-06,
"loss": 0.0912,
"num_input_tokens_seen": 2528736,
"step": 4050
},
{
"epoch": 8.142570281124499,
"grad_norm": 20.141408920288086,
"learning_rate": 7.393028491609782e-06,
"loss": 0.1502,
"num_input_tokens_seen": 2532448,
"step": 4055
},
{
"epoch": 8.152610441767068,
"grad_norm": 6.467395782470703,
"learning_rate": 7.385331792565682e-06,
"loss": 0.1389,
"num_input_tokens_seen": 2535872,
"step": 4060
},
{
"epoch": 8.162650602409638,
"grad_norm": 6.237977981567383,
"learning_rate": 7.377627768907563e-06,
"loss": 0.1405,
"num_input_tokens_seen": 2538880,
"step": 4065
},
{
"epoch": 8.17269076305221,
"grad_norm": 0.7651534080505371,
"learning_rate": 7.369916444292092e-06,
"loss": 0.0228,
"num_input_tokens_seen": 2541888,
"step": 4070
},
{
"epoch": 8.182730923694779,
"grad_norm": 13.309526443481445,
"learning_rate": 7.362197842398355e-06,
"loss": 0.0577,
"num_input_tokens_seen": 2545216,
"step": 4075
},
{
"epoch": 8.19277108433735,
"grad_norm": 14.438260078430176,
"learning_rate": 7.354471986927785e-06,
"loss": 0.107,
"num_input_tokens_seen": 2548768,
"step": 4080
},
{
"epoch": 8.20281124497992,
"grad_norm": 1.590266466140747,
"learning_rate": 7.346738901604086e-06,
"loss": 0.0757,
"num_input_tokens_seen": 2551776,
"step": 4085
},
{
"epoch": 8.21285140562249,
"grad_norm": 9.592524528503418,
"learning_rate": 7.338998610173166e-06,
"loss": 0.1362,
"num_input_tokens_seen": 2555872,
"step": 4090
},
{
"epoch": 8.22289156626506,
"grad_norm": 0.7704935073852539,
"learning_rate": 7.331251136403057e-06,
"loss": 0.1629,
"num_input_tokens_seen": 2559040,
"step": 4095
},
{
"epoch": 8.23293172690763,
"grad_norm": 26.26668357849121,
"learning_rate": 7.323496504083849e-06,
"loss": 0.1447,
"num_input_tokens_seen": 2562560,
"step": 4100
},
{
"epoch": 8.242971887550201,
"grad_norm": 13.633817672729492,
"learning_rate": 7.315734737027612e-06,
"loss": 0.11,
"num_input_tokens_seen": 2565504,
"step": 4105
},
{
"epoch": 8.25301204819277,
"grad_norm": 17.625274658203125,
"learning_rate": 7.307965859068324e-06,
"loss": 0.0784,
"num_input_tokens_seen": 2568256,
"step": 4110
},
{
"epoch": 8.263052208835342,
"grad_norm": 28.72048568725586,
"learning_rate": 7.300189894061802e-06,
"loss": 0.085,
"num_input_tokens_seen": 2571648,
"step": 4115
},
{
"epoch": 8.273092369477911,
"grad_norm": 6.543407917022705,
"learning_rate": 7.292406865885619e-06,
"loss": 0.0661,
"num_input_tokens_seen": 2575104,
"step": 4120
},
{
"epoch": 8.283132530120483,
"grad_norm": 51.86807632446289,
"learning_rate": 7.284616798439045e-06,
"loss": 0.1056,
"num_input_tokens_seen": 2578400,
"step": 4125
},
{
"epoch": 8.293172690763052,
"grad_norm": 13.690403938293457,
"learning_rate": 7.2768197156429564e-06,
"loss": 0.1329,
"num_input_tokens_seen": 2581376,
"step": 4130
},
{
"epoch": 8.303212851405622,
"grad_norm": 13.89306640625,
"learning_rate": 7.2690156414397775e-06,
"loss": 0.0798,
"num_input_tokens_seen": 2584192,
"step": 4135
},
{
"epoch": 8.313253012048193,
"grad_norm": 10.305960655212402,
"learning_rate": 7.261204599793399e-06,
"loss": 0.1572,
"num_input_tokens_seen": 2587040,
"step": 4140
},
{
"epoch": 8.323293172690763,
"grad_norm": 3.0132784843444824,
"learning_rate": 7.2533866146891085e-06,
"loss": 0.0726,
"num_input_tokens_seen": 2590112,
"step": 4145
},
{
"epoch": 8.333333333333334,
"grad_norm": 13.267522811889648,
"learning_rate": 7.245561710133511e-06,
"loss": 0.1047,
"num_input_tokens_seen": 2592640,
"step": 4150
},
{
"epoch": 8.343373493975903,
"grad_norm": 14.66914176940918,
"learning_rate": 7.23772991015446e-06,
"loss": 0.0888,
"num_input_tokens_seen": 2595616,
"step": 4155
},
{
"epoch": 8.353413654618475,
"grad_norm": 5.335896015167236,
"learning_rate": 7.229891238800988e-06,
"loss": 0.182,
"num_input_tokens_seen": 2599936,
"step": 4160
},
{
"epoch": 8.363453815261044,
"grad_norm": 34.975765228271484,
"learning_rate": 7.22204572014322e-06,
"loss": 0.1266,
"num_input_tokens_seen": 2602912,
"step": 4165
},
{
"epoch": 8.373493975903614,
"grad_norm": 6.264699935913086,
"learning_rate": 7.214193378272312e-06,
"loss": 0.0899,
"num_input_tokens_seen": 2605536,
"step": 4170
},
{
"epoch": 8.383534136546185,
"grad_norm": 26.686826705932617,
"learning_rate": 7.2063342373003676e-06,
"loss": 0.1795,
"num_input_tokens_seen": 2607936,
"step": 4175
},
{
"epoch": 8.393574297188755,
"grad_norm": 4.992021083831787,
"learning_rate": 7.198468321360376e-06,
"loss": 0.0467,
"num_input_tokens_seen": 2610656,
"step": 4180
},
{
"epoch": 8.403614457831326,
"grad_norm": 10.714327812194824,
"learning_rate": 7.190595654606118e-06,
"loss": 0.1586,
"num_input_tokens_seen": 2613952,
"step": 4185
},
{
"epoch": 8.413654618473895,
"grad_norm": 26.027496337890625,
"learning_rate": 7.182716261212116e-06,
"loss": 0.1357,
"num_input_tokens_seen": 2616864,
"step": 4190
},
{
"epoch": 8.423694779116467,
"grad_norm": 12.172310829162598,
"learning_rate": 7.174830165373542e-06,
"loss": 0.1129,
"num_input_tokens_seen": 2620480,
"step": 4195
},
{
"epoch": 8.433734939759036,
"grad_norm": 22.3388671875,
"learning_rate": 7.1669373913061505e-06,
"loss": 0.081,
"num_input_tokens_seen": 2623392,
"step": 4200
},
{
"epoch": 8.443775100401606,
"grad_norm": 8.700096130371094,
"learning_rate": 7.1590379632462004e-06,
"loss": 0.0565,
"num_input_tokens_seen": 2626432,
"step": 4205
},
{
"epoch": 8.453815261044177,
"grad_norm": 17.64000701904297,
"learning_rate": 7.151131905450386e-06,
"loss": 0.0768,
"num_input_tokens_seen": 2629888,
"step": 4210
},
{
"epoch": 8.463855421686747,
"grad_norm": 22.10565948486328,
"learning_rate": 7.14321924219576e-06,
"loss": 0.1255,
"num_input_tokens_seen": 2632864,
"step": 4215
},
{
"epoch": 8.473895582329318,
"grad_norm": 11.11665153503418,
"learning_rate": 7.1352999977796565e-06,
"loss": 0.1375,
"num_input_tokens_seen": 2636096,
"step": 4220
},
{
"epoch": 8.483935742971887,
"grad_norm": 16.72979164123535,
"learning_rate": 7.127374196519616e-06,
"loss": 0.0704,
"num_input_tokens_seen": 2638368,
"step": 4225
},
{
"epoch": 8.493975903614459,
"grad_norm": 0.9664208889007568,
"learning_rate": 7.119441862753316e-06,
"loss": 0.059,
"num_input_tokens_seen": 2642080,
"step": 4230
},
{
"epoch": 8.504016064257028,
"grad_norm": 8.617629051208496,
"learning_rate": 7.111503020838495e-06,
"loss": 0.1087,
"num_input_tokens_seen": 2644640,
"step": 4235
},
{
"epoch": 8.514056224899598,
"grad_norm": 15.599228858947754,
"learning_rate": 7.103557695152874e-06,
"loss": 0.0577,
"num_input_tokens_seen": 2647616,
"step": 4240
},
{
"epoch": 8.524096385542169,
"grad_norm": 0.8850897550582886,
"learning_rate": 7.095605910094081e-06,
"loss": 0.0759,
"num_input_tokens_seen": 2650560,
"step": 4245
},
{
"epoch": 8.534136546184738,
"grad_norm": 23.22264862060547,
"learning_rate": 7.087647690079584e-06,
"loss": 0.2256,
"num_input_tokens_seen": 2653216,
"step": 4250
},
{
"epoch": 8.54417670682731,
"grad_norm": 1.2891957759857178,
"learning_rate": 7.079683059546607e-06,
"loss": 0.0457,
"num_input_tokens_seen": 2656256,
"step": 4255
},
{
"epoch": 8.55421686746988,
"grad_norm": 38.172786712646484,
"learning_rate": 7.071712042952061e-06,
"loss": 0.0834,
"num_input_tokens_seen": 2659040,
"step": 4260
},
{
"epoch": 8.56425702811245,
"grad_norm": 1.834019660949707,
"learning_rate": 7.063734664772461e-06,
"loss": 0.1075,
"num_input_tokens_seen": 2661824,
"step": 4265
},
{
"epoch": 8.57429718875502,
"grad_norm": 2.8576161861419678,
"learning_rate": 7.055750949503867e-06,
"loss": 0.0369,
"num_input_tokens_seen": 2664576,
"step": 4270
},
{
"epoch": 8.58433734939759,
"grad_norm": 7.74680233001709,
"learning_rate": 7.047760921661788e-06,
"loss": 0.0959,
"num_input_tokens_seen": 2667712,
"step": 4275
},
{
"epoch": 8.594377510040161,
"grad_norm": 43.823997497558594,
"learning_rate": 7.039764605781121e-06,
"loss": 0.1003,
"num_input_tokens_seen": 2670944,
"step": 4280
},
{
"epoch": 8.60441767068273,
"grad_norm": 38.42998123168945,
"learning_rate": 7.031762026416074e-06,
"loss": 0.0871,
"num_input_tokens_seen": 2674336,
"step": 4285
},
{
"epoch": 8.614457831325302,
"grad_norm": 0.930115818977356,
"learning_rate": 7.023753208140084e-06,
"loss": 0.0914,
"num_input_tokens_seen": 2677824,
"step": 4290
},
{
"epoch": 8.624497991967871,
"grad_norm": 55.05161666870117,
"learning_rate": 7.01573817554575e-06,
"loss": 0.068,
"num_input_tokens_seen": 2680544,
"step": 4295
},
{
"epoch": 8.634538152610443,
"grad_norm": 0.6600883603096008,
"learning_rate": 7.0077169532447474e-06,
"loss": 0.022,
"num_input_tokens_seen": 2683904,
"step": 4300
},
{
"epoch": 8.644578313253012,
"grad_norm": 0.6066603660583496,
"learning_rate": 6.999689565867764e-06,
"loss": 0.0531,
"num_input_tokens_seen": 2687200,
"step": 4305
},
{
"epoch": 8.654618473895582,
"grad_norm": 55.28033447265625,
"learning_rate": 6.991656038064416e-06,
"loss": 0.1906,
"num_input_tokens_seen": 2690560,
"step": 4310
},
{
"epoch": 8.664658634538153,
"grad_norm": 34.07001495361328,
"learning_rate": 6.983616394503177e-06,
"loss": 0.0982,
"num_input_tokens_seen": 2693728,
"step": 4315
},
{
"epoch": 8.674698795180722,
"grad_norm": 15.091057777404785,
"learning_rate": 6.975570659871295e-06,
"loss": 0.1921,
"num_input_tokens_seen": 2697312,
"step": 4320
},
{
"epoch": 8.684738955823294,
"grad_norm": 25.58677101135254,
"learning_rate": 6.967518858874727e-06,
"loss": 0.0666,
"num_input_tokens_seen": 2700480,
"step": 4325
},
{
"epoch": 8.694779116465863,
"grad_norm": 6.8150529861450195,
"learning_rate": 6.959461016238056e-06,
"loss": 0.1458,
"num_input_tokens_seen": 2703520,
"step": 4330
},
{
"epoch": 8.704819277108435,
"grad_norm": 15.582724571228027,
"learning_rate": 6.951397156704418e-06,
"loss": 0.0863,
"num_input_tokens_seen": 2706688,
"step": 4335
},
{
"epoch": 8.714859437751004,
"grad_norm": 68.85173797607422,
"learning_rate": 6.943327305035424e-06,
"loss": 0.1132,
"num_input_tokens_seen": 2709440,
"step": 4340
},
{
"epoch": 8.724899598393574,
"grad_norm": 17.743255615234375,
"learning_rate": 6.9352514860110876e-06,
"loss": 0.0675,
"num_input_tokens_seen": 2712512,
"step": 4345
},
{
"epoch": 8.734939759036145,
"grad_norm": 4.8611297607421875,
"learning_rate": 6.927169724429737e-06,
"loss": 0.0307,
"num_input_tokens_seen": 2715296,
"step": 4350
},
{
"epoch": 8.744979919678714,
"grad_norm": 39.961326599121094,
"learning_rate": 6.919082045107963e-06,
"loss": 0.0683,
"num_input_tokens_seen": 2718720,
"step": 4355
},
{
"epoch": 8.755020080321286,
"grad_norm": 8.697257041931152,
"learning_rate": 6.910988472880515e-06,
"loss": 0.0932,
"num_input_tokens_seen": 2721536,
"step": 4360
},
{
"epoch": 8.765060240963855,
"grad_norm": 4.740288257598877,
"learning_rate": 6.902889032600245e-06,
"loss": 0.0802,
"num_input_tokens_seen": 2725024,
"step": 4365
},
{
"epoch": 8.775100401606426,
"grad_norm": 42.13846206665039,
"learning_rate": 6.894783749138021e-06,
"loss": 0.1271,
"num_input_tokens_seen": 2728288,
"step": 4370
},
{
"epoch": 8.785140562248996,
"grad_norm": 0.44398602843284607,
"learning_rate": 6.886672647382653e-06,
"loss": 0.1137,
"num_input_tokens_seen": 2731424,
"step": 4375
},
{
"epoch": 8.795180722891565,
"grad_norm": 1.1284281015396118,
"learning_rate": 6.878555752240821e-06,
"loss": 0.1214,
"num_input_tokens_seen": 2735008,
"step": 4380
},
{
"epoch": 8.805220883534137,
"grad_norm": 27.683223724365234,
"learning_rate": 6.870433088636992e-06,
"loss": 0.0553,
"num_input_tokens_seen": 2737728,
"step": 4385
},
{
"epoch": 8.815261044176706,
"grad_norm": 10.467280387878418,
"learning_rate": 6.862304681513344e-06,
"loss": 0.2023,
"num_input_tokens_seen": 2741120,
"step": 4390
},
{
"epoch": 8.825301204819278,
"grad_norm": 76.25865173339844,
"learning_rate": 6.8541705558296954e-06,
"loss": 0.1059,
"num_input_tokens_seen": 2744384,
"step": 4395
},
{
"epoch": 8.835341365461847,
"grad_norm": 5.276292324066162,
"learning_rate": 6.8460307365634225e-06,
"loss": 0.0959,
"num_input_tokens_seen": 2747296,
"step": 4400
},
{
"epoch": 8.845381526104418,
"grad_norm": 4.337225914001465,
"learning_rate": 6.837885248709386e-06,
"loss": 0.0289,
"num_input_tokens_seen": 2750880,
"step": 4405
},
{
"epoch": 8.855421686746988,
"grad_norm": 9.778970718383789,
"learning_rate": 6.829734117279853e-06,
"loss": 0.121,
"num_input_tokens_seen": 2753696,
"step": 4410
},
{
"epoch": 8.865461847389557,
"grad_norm": 9.512594223022461,
"learning_rate": 6.8215773673044175e-06,
"loss": 0.1068,
"num_input_tokens_seen": 2756544,
"step": 4415
},
{
"epoch": 8.875502008032129,
"grad_norm": 0.6467050909996033,
"learning_rate": 6.81341502382993e-06,
"loss": 0.1459,
"num_input_tokens_seen": 2759008,
"step": 4420
},
{
"epoch": 8.885542168674698,
"grad_norm": 11.161055564880371,
"learning_rate": 6.805247111920416e-06,
"loss": 0.1331,
"num_input_tokens_seen": 2762112,
"step": 4425
},
{
"epoch": 8.89558232931727,
"grad_norm": 25.896102905273438,
"learning_rate": 6.797073656656998e-06,
"loss": 0.0897,
"num_input_tokens_seen": 2765216,
"step": 4430
},
{
"epoch": 8.905622489959839,
"grad_norm": 8.515508651733398,
"learning_rate": 6.788894683137822e-06,
"loss": 0.0792,
"num_input_tokens_seen": 2767744,
"step": 4435
},
{
"epoch": 8.91566265060241,
"grad_norm": 3.874532699584961,
"learning_rate": 6.780710216477979e-06,
"loss": 0.0705,
"num_input_tokens_seen": 2770976,
"step": 4440
},
{
"epoch": 8.92570281124498,
"grad_norm": 11.629859924316406,
"learning_rate": 6.772520281809426e-06,
"loss": 0.1015,
"num_input_tokens_seen": 2774016,
"step": 4445
},
{
"epoch": 8.93574297188755,
"grad_norm": 17.52058219909668,
"learning_rate": 6.7643249042809146e-06,
"loss": 0.153,
"num_input_tokens_seen": 2776768,
"step": 4450
},
{
"epoch": 8.94578313253012,
"grad_norm": 13.657364845275879,
"learning_rate": 6.7561241090579045e-06,
"loss": 0.0843,
"num_input_tokens_seen": 2779520,
"step": 4455
},
{
"epoch": 8.95582329317269,
"grad_norm": 9.738412857055664,
"learning_rate": 6.747917921322496e-06,
"loss": 0.0886,
"num_input_tokens_seen": 2783136,
"step": 4460
},
{
"epoch": 8.965863453815262,
"grad_norm": 4.442209720611572,
"learning_rate": 6.739706366273346e-06,
"loss": 0.0707,
"num_input_tokens_seen": 2786688,
"step": 4465
},
{
"epoch": 8.975903614457831,
"grad_norm": 2.1606521606445312,
"learning_rate": 6.731489469125591e-06,
"loss": 0.0703,
"num_input_tokens_seen": 2790432,
"step": 4470
},
{
"epoch": 8.985943775100402,
"grad_norm": 5.452800750732422,
"learning_rate": 6.723267255110773e-06,
"loss": 0.1706,
"num_input_tokens_seen": 2793696,
"step": 4475
},
{
"epoch": 8.995983935742972,
"grad_norm": 1.0247032642364502,
"learning_rate": 6.715039749476764e-06,
"loss": 0.1357,
"num_input_tokens_seen": 2796512,
"step": 4480
},
{
"epoch": 9.006024096385541,
"grad_norm": 7.481777191162109,
"learning_rate": 6.7068069774876785e-06,
"loss": 0.1072,
"num_input_tokens_seen": 2799168,
"step": 4485
},
{
"epoch": 9.016064257028113,
"grad_norm": 3.347785472869873,
"learning_rate": 6.698568964423808e-06,
"loss": 0.0907,
"num_input_tokens_seen": 2802304,
"step": 4490
},
{
"epoch": 9.026104417670682,
"grad_norm": 10.880471229553223,
"learning_rate": 6.690325735581532e-06,
"loss": 0.0329,
"num_input_tokens_seen": 2805952,
"step": 4495
},
{
"epoch": 9.036144578313253,
"grad_norm": 6.32849645614624,
"learning_rate": 6.682077316273252e-06,
"loss": 0.1234,
"num_input_tokens_seen": 2809440,
"step": 4500
},
{
"epoch": 9.046184738955823,
"grad_norm": 0.3392082750797272,
"learning_rate": 6.673823731827306e-06,
"loss": 0.1117,
"num_input_tokens_seen": 2812576,
"step": 4505
},
{
"epoch": 9.056224899598394,
"grad_norm": 5.663910388946533,
"learning_rate": 6.665565007587888e-06,
"loss": 0.0835,
"num_input_tokens_seen": 2815552,
"step": 4510
},
{
"epoch": 9.066265060240964,
"grad_norm": 21.429214477539062,
"learning_rate": 6.657301168914983e-06,
"loss": 0.0963,
"num_input_tokens_seen": 2818304,
"step": 4515
},
{
"epoch": 9.076305220883533,
"grad_norm": 0.16872386634349823,
"learning_rate": 6.649032241184271e-06,
"loss": 0.068,
"num_input_tokens_seen": 2822016,
"step": 4520
},
{
"epoch": 9.086345381526105,
"grad_norm": 0.11669722944498062,
"learning_rate": 6.640758249787067e-06,
"loss": 0.0016,
"num_input_tokens_seen": 2824992,
"step": 4525
},
{
"epoch": 9.096385542168674,
"grad_norm": 10.158061981201172,
"learning_rate": 6.632479220130232e-06,
"loss": 0.2172,
"num_input_tokens_seen": 2828384,
"step": 4530
},
{
"epoch": 9.106425702811245,
"grad_norm": 0.6976897120475769,
"learning_rate": 6.624195177636098e-06,
"loss": 0.0183,
"num_input_tokens_seen": 2831616,
"step": 4535
},
{
"epoch": 9.116465863453815,
"grad_norm": 21.54926872253418,
"learning_rate": 6.615906147742389e-06,
"loss": 0.0771,
"num_input_tokens_seen": 2834912,
"step": 4540
},
{
"epoch": 9.126506024096386,
"grad_norm": 0.19631217420101166,
"learning_rate": 6.6076121559021445e-06,
"loss": 0.0624,
"num_input_tokens_seen": 2838080,
"step": 4545
},
{
"epoch": 9.136546184738956,
"grad_norm": 1.3235666751861572,
"learning_rate": 6.599313227583642e-06,
"loss": 0.0352,
"num_input_tokens_seen": 2841056,
"step": 4550
},
{
"epoch": 9.146586345381525,
"grad_norm": 0.08484455943107605,
"learning_rate": 6.591009388270315e-06,
"loss": 0.084,
"num_input_tokens_seen": 2844192,
"step": 4555
},
{
"epoch": 9.156626506024097,
"grad_norm": 0.18264135718345642,
"learning_rate": 6.582700663460679e-06,
"loss": 0.0175,
"num_input_tokens_seen": 2847296,
"step": 4560
},
{
"epoch": 9.166666666666666,
"grad_norm": 0.31725814938545227,
"learning_rate": 6.57438707866825e-06,
"loss": 0.0691,
"num_input_tokens_seen": 2850656,
"step": 4565
},
{
"epoch": 9.176706827309237,
"grad_norm": 0.6224172115325928,
"learning_rate": 6.566068659421467e-06,
"loss": 0.1824,
"num_input_tokens_seen": 2854272,
"step": 4570
},
{
"epoch": 9.186746987951807,
"grad_norm": 25.0843563079834,
"learning_rate": 6.557745431263617e-06,
"loss": 0.105,
"num_input_tokens_seen": 2857248,
"step": 4575
},
{
"epoch": 9.196787148594378,
"grad_norm": 10.956290245056152,
"learning_rate": 6.5494174197527515e-06,
"loss": 0.1361,
"num_input_tokens_seen": 2859392,
"step": 4580
},
{
"epoch": 9.206827309236948,
"grad_norm": 16.801361083984375,
"learning_rate": 6.54108465046161e-06,
"loss": 0.0357,
"num_input_tokens_seen": 2862432,
"step": 4585
},
{
"epoch": 9.216867469879517,
"grad_norm": 44.96355056762695,
"learning_rate": 6.532747148977543e-06,
"loss": 0.1851,
"num_input_tokens_seen": 2865728,
"step": 4590
},
{
"epoch": 9.226907630522089,
"grad_norm": 7.259774684906006,
"learning_rate": 6.52440494090243e-06,
"loss": 0.0511,
"num_input_tokens_seen": 2868448,
"step": 4595
},
{
"epoch": 9.236947791164658,
"grad_norm": 2.497241497039795,
"learning_rate": 6.516058051852605e-06,
"loss": 0.1878,
"num_input_tokens_seen": 2871168,
"step": 4600
},
{
"epoch": 9.24698795180723,
"grad_norm": 9.350099563598633,
"learning_rate": 6.507706507458776e-06,
"loss": 0.0551,
"num_input_tokens_seen": 2873728,
"step": 4605
},
{
"epoch": 9.257028112449799,
"grad_norm": 13.690436363220215,
"learning_rate": 6.499350333365945e-06,
"loss": 0.0495,
"num_input_tokens_seen": 2877376,
"step": 4610
},
{
"epoch": 9.26706827309237,
"grad_norm": 8.819265365600586,
"learning_rate": 6.490989555233328e-06,
"loss": 0.0838,
"num_input_tokens_seen": 2880864,
"step": 4615
},
{
"epoch": 9.27710843373494,
"grad_norm": 0.7378672361373901,
"learning_rate": 6.482624198734284e-06,
"loss": 0.0913,
"num_input_tokens_seen": 2883584,
"step": 4620
},
{
"epoch": 9.28714859437751,
"grad_norm": 0.7477921843528748,
"learning_rate": 6.4742542895562276e-06,
"loss": 0.0817,
"num_input_tokens_seen": 2886272,
"step": 4625
},
{
"epoch": 9.29718875502008,
"grad_norm": 1.3603986501693726,
"learning_rate": 6.465879853400553e-06,
"loss": 0.1111,
"num_input_tokens_seen": 2889216,
"step": 4630
},
{
"epoch": 9.30722891566265,
"grad_norm": 2.1609530448913574,
"learning_rate": 6.457500915982555e-06,
"loss": 0.1591,
"num_input_tokens_seen": 2892224,
"step": 4635
},
{
"epoch": 9.317269076305221,
"grad_norm": 15.493494987487793,
"learning_rate": 6.449117503031355e-06,
"loss": 0.1163,
"num_input_tokens_seen": 2895584,
"step": 4640
},
{
"epoch": 9.32730923694779,
"grad_norm": 26.324840545654297,
"learning_rate": 6.440729640289809e-06,
"loss": 0.126,
"num_input_tokens_seen": 2897920,
"step": 4645
},
{
"epoch": 9.337349397590362,
"grad_norm": 8.079339027404785,
"learning_rate": 6.432337353514444e-06,
"loss": 0.111,
"num_input_tokens_seen": 2900224,
"step": 4650
},
{
"epoch": 9.347389558232932,
"grad_norm": 0.812627911567688,
"learning_rate": 6.4239406684753695e-06,
"loss": 0.1038,
"num_input_tokens_seen": 2903200,
"step": 4655
},
{
"epoch": 9.357429718875501,
"grad_norm": 33.051639556884766,
"learning_rate": 6.4155396109561995e-06,
"loss": 0.1554,
"num_input_tokens_seen": 2905824,
"step": 4660
},
{
"epoch": 9.367469879518072,
"grad_norm": 0.4954128563404083,
"learning_rate": 6.407134206753977e-06,
"loss": 0.0318,
"num_input_tokens_seen": 2909312,
"step": 4665
},
{
"epoch": 9.377510040160642,
"grad_norm": 35.997711181640625,
"learning_rate": 6.39872448167909e-06,
"loss": 0.067,
"num_input_tokens_seen": 2912416,
"step": 4670
},
{
"epoch": 9.387550200803213,
"grad_norm": 38.44462203979492,
"learning_rate": 6.3903104615551956e-06,
"loss": 0.075,
"num_input_tokens_seen": 2915232,
"step": 4675
},
{
"epoch": 9.397590361445783,
"grad_norm": 0.6909103393554688,
"learning_rate": 6.381892172219142e-06,
"loss": 0.0786,
"num_input_tokens_seen": 2918624,
"step": 4680
},
{
"epoch": 9.407630522088354,
"grad_norm": 0.7175284028053284,
"learning_rate": 6.373469639520881e-06,
"loss": 0.0335,
"num_input_tokens_seen": 2922112,
"step": 4685
},
{
"epoch": 9.417670682730924,
"grad_norm": 0.5845101475715637,
"learning_rate": 6.3650428893234e-06,
"loss": 0.0375,
"num_input_tokens_seen": 2924800,
"step": 4690
},
{
"epoch": 9.427710843373493,
"grad_norm": 1.1597626209259033,
"learning_rate": 6.356611947502633e-06,
"loss": 0.0801,
"num_input_tokens_seen": 2928064,
"step": 4695
},
{
"epoch": 9.437751004016064,
"grad_norm": 5.7896504402160645,
"learning_rate": 6.348176839947389e-06,
"loss": 0.1439,
"num_input_tokens_seen": 2931168,
"step": 4700
},
{
"epoch": 9.447791164658634,
"grad_norm": 65.76715087890625,
"learning_rate": 6.3397375925592675e-06,
"loss": 0.223,
"num_input_tokens_seen": 2934496,
"step": 4705
},
{
"epoch": 9.457831325301205,
"grad_norm": 0.3577728271484375,
"learning_rate": 6.331294231252576e-06,
"loss": 0.0278,
"num_input_tokens_seen": 2937984,
"step": 4710
},
{
"epoch": 9.467871485943775,
"grad_norm": 10.809001922607422,
"learning_rate": 6.3228467819542606e-06,
"loss": 0.0346,
"num_input_tokens_seen": 2940928,
"step": 4715
},
{
"epoch": 9.477911646586346,
"grad_norm": 0.2872489094734192,
"learning_rate": 6.314395270603819e-06,
"loss": 0.0529,
"num_input_tokens_seen": 2944448,
"step": 4720
},
{
"epoch": 9.487951807228916,
"grad_norm": 0.39773765206336975,
"learning_rate": 6.305939723153218e-06,
"loss": 0.052,
"num_input_tokens_seen": 2947584,
"step": 4725
},
{
"epoch": 9.497991967871485,
"grad_norm": 0.39810657501220703,
"learning_rate": 6.297480165566823e-06,
"loss": 0.0856,
"num_input_tokens_seen": 2951136,
"step": 4730
},
{
"epoch": 9.508032128514056,
"grad_norm": 20.94434928894043,
"learning_rate": 6.289016623821308e-06,
"loss": 0.1398,
"num_input_tokens_seen": 2953760,
"step": 4735
},
{
"epoch": 9.518072289156626,
"grad_norm": 0.32911476492881775,
"learning_rate": 6.280549123905588e-06,
"loss": 0.0166,
"num_input_tokens_seen": 2956736,
"step": 4740
},
{
"epoch": 9.528112449799197,
"grad_norm": 30.02129554748535,
"learning_rate": 6.2720776918207285e-06,
"loss": 0.094,
"num_input_tokens_seen": 2960224,
"step": 4745
},
{
"epoch": 9.538152610441767,
"grad_norm": 1.1178313493728638,
"learning_rate": 6.263602353579868e-06,
"loss": 0.1783,
"num_input_tokens_seen": 2963616,
"step": 4750
},
{
"epoch": 9.548192771084338,
"grad_norm": 42.708438873291016,
"learning_rate": 6.255123135208141e-06,
"loss": 0.0599,
"num_input_tokens_seen": 2966848,
"step": 4755
},
{
"epoch": 9.558232931726907,
"grad_norm": 44.14406967163086,
"learning_rate": 6.246640062742598e-06,
"loss": 0.1511,
"num_input_tokens_seen": 2969568,
"step": 4760
},
{
"epoch": 9.568273092369477,
"grad_norm": 43.29121398925781,
"learning_rate": 6.2381531622321234e-06,
"loss": 0.1446,
"num_input_tokens_seen": 2971488,
"step": 4765
},
{
"epoch": 9.578313253012048,
"grad_norm": 77.22794342041016,
"learning_rate": 6.229662459737354e-06,
"loss": 0.1338,
"num_input_tokens_seen": 2974656,
"step": 4770
},
{
"epoch": 9.588353413654618,
"grad_norm": 3.0715153217315674,
"learning_rate": 6.221167981330607e-06,
"loss": 0.1145,
"num_input_tokens_seen": 2977536,
"step": 4775
},
{
"epoch": 9.598393574297189,
"grad_norm": 1.3485350608825684,
"learning_rate": 6.212669753095788e-06,
"loss": 0.1066,
"num_input_tokens_seen": 2981632,
"step": 4780
},
{
"epoch": 9.608433734939759,
"grad_norm": 19.530614852905273,
"learning_rate": 6.204167801128319e-06,
"loss": 0.109,
"num_input_tokens_seen": 2985184,
"step": 4785
},
{
"epoch": 9.61847389558233,
"grad_norm": 10.45262622833252,
"learning_rate": 6.19566215153506e-06,
"loss": 0.1409,
"num_input_tokens_seen": 2988352,
"step": 4790
},
{
"epoch": 9.6285140562249,
"grad_norm": 29.81780242919922,
"learning_rate": 6.18715283043422e-06,
"loss": 0.0658,
"num_input_tokens_seen": 2991808,
"step": 4795
},
{
"epoch": 9.638554216867469,
"grad_norm": 26.38890266418457,
"learning_rate": 6.178639863955287e-06,
"loss": 0.1023,
"num_input_tokens_seen": 2994688,
"step": 4800
},
{
"epoch": 9.64859437751004,
"grad_norm": 5.988182067871094,
"learning_rate": 6.170123278238939e-06,
"loss": 0.0813,
"num_input_tokens_seen": 2998304,
"step": 4805
},
{
"epoch": 9.65863453815261,
"grad_norm": 0.7817508578300476,
"learning_rate": 6.161603099436968e-06,
"loss": 0.0725,
"num_input_tokens_seen": 3000928,
"step": 4810
},
{
"epoch": 9.668674698795181,
"grad_norm": 7.621532917022705,
"learning_rate": 6.153079353712201e-06,
"loss": 0.0623,
"num_input_tokens_seen": 3004224,
"step": 4815
},
{
"epoch": 9.67871485943775,
"grad_norm": 28.789592742919922,
"learning_rate": 6.144552067238418e-06,
"loss": 0.1374,
"num_input_tokens_seen": 3007200,
"step": 4820
},
{
"epoch": 9.688755020080322,
"grad_norm": 12.002851486206055,
"learning_rate": 6.136021266200271e-06,
"loss": 0.0868,
"num_input_tokens_seen": 3009920,
"step": 4825
},
{
"epoch": 9.698795180722891,
"grad_norm": 17.804859161376953,
"learning_rate": 6.1274869767932e-06,
"loss": 0.073,
"num_input_tokens_seen": 3013152,
"step": 4830
},
{
"epoch": 9.708835341365463,
"grad_norm": 37.982666015625,
"learning_rate": 6.118949225223365e-06,
"loss": 0.0502,
"num_input_tokens_seen": 3015936,
"step": 4835
},
{
"epoch": 9.718875502008032,
"grad_norm": 0.0973651334643364,
"learning_rate": 6.110408037707551e-06,
"loss": 0.0402,
"num_input_tokens_seen": 3019424,
"step": 4840
},
{
"epoch": 9.728915662650602,
"grad_norm": 9.358402252197266,
"learning_rate": 6.1018634404730945e-06,
"loss": 0.207,
"num_input_tokens_seen": 3023040,
"step": 4845
},
{
"epoch": 9.738955823293173,
"grad_norm": 59.125640869140625,
"learning_rate": 6.093315459757807e-06,
"loss": 0.0914,
"num_input_tokens_seen": 3025728,
"step": 4850
},
{
"epoch": 9.748995983935743,
"grad_norm": 15.047895431518555,
"learning_rate": 6.084764121809878e-06,
"loss": 0.0947,
"num_input_tokens_seen": 3028352,
"step": 4855
},
{
"epoch": 9.759036144578314,
"grad_norm": 46.9537239074707,
"learning_rate": 6.076209452887821e-06,
"loss": 0.0236,
"num_input_tokens_seen": 3031968,
"step": 4860
},
{
"epoch": 9.769076305220883,
"grad_norm": 6.795496463775635,
"learning_rate": 6.067651479260368e-06,
"loss": 0.0284,
"num_input_tokens_seen": 3035072,
"step": 4865
},
{
"epoch": 9.779116465863455,
"grad_norm": 1.5318920612335205,
"learning_rate": 6.059090227206402e-06,
"loss": 0.0481,
"num_input_tokens_seen": 3037568,
"step": 4870
},
{
"epoch": 9.789156626506024,
"grad_norm": 0.43727999925613403,
"learning_rate": 6.0505257230148715e-06,
"loss": 0.0885,
"num_input_tokens_seen": 3040384,
"step": 4875
},
{
"epoch": 9.799196787148594,
"grad_norm": 1.1928132772445679,
"learning_rate": 6.041957992984711e-06,
"loss": 0.0831,
"num_input_tokens_seen": 3043104,
"step": 4880
},
{
"epoch": 9.809236947791165,
"grad_norm": 7.331599235534668,
"learning_rate": 6.033387063424765e-06,
"loss": 0.1508,
"num_input_tokens_seen": 3046240,
"step": 4885
},
{
"epoch": 9.819277108433734,
"grad_norm": 7.764211177825928,
"learning_rate": 6.0248129606536984e-06,
"loss": 0.0875,
"num_input_tokens_seen": 3049792,
"step": 4890
},
{
"epoch": 9.829317269076306,
"grad_norm": 1.7602699995040894,
"learning_rate": 6.01623571099992e-06,
"loss": 0.1742,
"num_input_tokens_seen": 3052928,
"step": 4895
},
{
"epoch": 9.839357429718875,
"grad_norm": 0.1980728656053543,
"learning_rate": 6.0076553408015035e-06,
"loss": 0.0964,
"num_input_tokens_seen": 3056416,
"step": 4900
},
{
"epoch": 9.849397590361447,
"grad_norm": 64.95994567871094,
"learning_rate": 5.999071876406104e-06,
"loss": 0.1073,
"num_input_tokens_seen": 3059456,
"step": 4905
},
{
"epoch": 9.859437751004016,
"grad_norm": 29.12643814086914,
"learning_rate": 5.990485344170879e-06,
"loss": 0.2183,
"num_input_tokens_seen": 3062816,
"step": 4910
},
{
"epoch": 9.869477911646586,
"grad_norm": 52.59812545776367,
"learning_rate": 5.9818957704624046e-06,
"loss": 0.1465,
"num_input_tokens_seen": 3065472,
"step": 4915
},
{
"epoch": 9.879518072289157,
"grad_norm": 46.38893127441406,
"learning_rate": 5.973303181656597e-06,
"loss": 0.1317,
"num_input_tokens_seen": 3068480,
"step": 4920
},
{
"epoch": 9.889558232931726,
"grad_norm": 8.468809127807617,
"learning_rate": 5.964707604138632e-06,
"loss": 0.1163,
"num_input_tokens_seen": 3072032,
"step": 4925
},
{
"epoch": 9.899598393574298,
"grad_norm": 10.626388549804688,
"learning_rate": 5.956109064302862e-06,
"loss": 0.0549,
"num_input_tokens_seen": 3074336,
"step": 4930
},
{
"epoch": 9.909638554216867,
"grad_norm": 6.986921310424805,
"learning_rate": 5.947507588552734e-06,
"loss": 0.0499,
"num_input_tokens_seen": 3077728,
"step": 4935
},
{
"epoch": 9.919678714859439,
"grad_norm": 21.21433448791504,
"learning_rate": 5.9389032033007135e-06,
"loss": 0.0673,
"num_input_tokens_seen": 3080992,
"step": 4940
},
{
"epoch": 9.929718875502008,
"grad_norm": 5.551342010498047,
"learning_rate": 5.930295934968197e-06,
"loss": 0.0601,
"num_input_tokens_seen": 3084768,
"step": 4945
},
{
"epoch": 9.939759036144578,
"grad_norm": 7.235230445861816,
"learning_rate": 5.9216858099854365e-06,
"loss": 0.036,
"num_input_tokens_seen": 3088160,
"step": 4950
},
{
"epoch": 9.949799196787149,
"grad_norm": 8.167715072631836,
"learning_rate": 5.913072854791458e-06,
"loss": 0.0386,
"num_input_tokens_seen": 3091104,
"step": 4955
},
{
"epoch": 9.959839357429718,
"grad_norm": 34.79731369018555,
"learning_rate": 5.90445709583397e-06,
"loss": 0.2039,
"num_input_tokens_seen": 3094272,
"step": 4960
},
{
"epoch": 9.96987951807229,
"grad_norm": 15.844338417053223,
"learning_rate": 5.895838559569298e-06,
"loss": 0.0434,
"num_input_tokens_seen": 3098240,
"step": 4965
},
{
"epoch": 9.97991967871486,
"grad_norm": 7.910551071166992,
"learning_rate": 5.887217272462295e-06,
"loss": 0.1129,
"num_input_tokens_seen": 3101056,
"step": 4970
},
{
"epoch": 9.98995983935743,
"grad_norm": 11.544766426086426,
"learning_rate": 5.878593260986256e-06,
"loss": 0.0883,
"num_input_tokens_seen": 3104576,
"step": 4975
},
{
"epoch": 10.0,
"grad_norm": 19.9595890045166,
"learning_rate": 5.869966551622848e-06,
"loss": 0.0943,
"num_input_tokens_seen": 3108288,
"step": 4980
},
{
"epoch": 10.0,
"eval_loss": 0.29868796467781067,
"eval_runtime": 8.071,
"eval_samples_per_second": 61.703,
"eval_steps_per_second": 15.488,
"num_input_tokens_seen": 3108288,
"step": 4980
},
{
"epoch": 10.01004016064257,
"grad_norm": 7.36317253112793,
"learning_rate": 5.861337170862018e-06,
"loss": 0.0089,
"num_input_tokens_seen": 3111360,
"step": 4985
},
{
"epoch": 10.02008032128514,
"grad_norm": 1.2009451389312744,
"learning_rate": 5.852705145201919e-06,
"loss": 0.0079,
"num_input_tokens_seen": 3114496,
"step": 4990
},
{
"epoch": 10.03012048192771,
"grad_norm": 0.47578689455986023,
"learning_rate": 5.844070501148823e-06,
"loss": 0.1431,
"num_input_tokens_seen": 3117120,
"step": 4995
},
{
"epoch": 10.040160642570282,
"grad_norm": 35.32996368408203,
"learning_rate": 5.835433265217043e-06,
"loss": 0.0972,
"num_input_tokens_seen": 3121376,
"step": 5000
},
{
"epoch": 10.050200803212851,
"grad_norm": 36.080787658691406,
"learning_rate": 5.8267934639288525e-06,
"loss": 0.1294,
"num_input_tokens_seen": 3124352,
"step": 5005
},
{
"epoch": 10.060240963855422,
"grad_norm": 47.2873649597168,
"learning_rate": 5.818151123814401e-06,
"loss": 0.1062,
"num_input_tokens_seen": 3127264,
"step": 5010
},
{
"epoch": 10.070281124497992,
"grad_norm": 22.52366065979004,
"learning_rate": 5.809506271411635e-06,
"loss": 0.0603,
"num_input_tokens_seen": 3130368,
"step": 5015
},
{
"epoch": 10.080321285140561,
"grad_norm": 1.4479087591171265,
"learning_rate": 5.800858933266214e-06,
"loss": 0.044,
"num_input_tokens_seen": 3134016,
"step": 5020
},
{
"epoch": 10.090361445783133,
"grad_norm": 2.216742515563965,
"learning_rate": 5.792209135931428e-06,
"loss": 0.0111,
"num_input_tokens_seen": 3137120,
"step": 5025
},
{
"epoch": 10.100401606425702,
"grad_norm": 8.245423316955566,
"learning_rate": 5.7835569059681255e-06,
"loss": 0.0852,
"num_input_tokens_seen": 3140288,
"step": 5030
},
{
"epoch": 10.110441767068274,
"grad_norm": 13.112469673156738,
"learning_rate": 5.77490226994462e-06,
"loss": 0.1236,
"num_input_tokens_seen": 3142912,
"step": 5035
},
{
"epoch": 10.120481927710843,
"grad_norm": 27.564695358276367,
"learning_rate": 5.766245254436613e-06,
"loss": 0.0678,
"num_input_tokens_seen": 3146112,
"step": 5040
},
{
"epoch": 10.130522088353414,
"grad_norm": 43.371463775634766,
"learning_rate": 5.757585886027114e-06,
"loss": 0.0703,
"num_input_tokens_seen": 3148928,
"step": 5045
},
{
"epoch": 10.140562248995984,
"grad_norm": 3.151542901992798,
"learning_rate": 5.748924191306359e-06,
"loss": 0.021,
"num_input_tokens_seen": 3152256,
"step": 5050
},
{
"epoch": 10.150602409638553,
"grad_norm": 21.960346221923828,
"learning_rate": 5.740260196871726e-06,
"loss": 0.1104,
"num_input_tokens_seen": 3155360,
"step": 5055
},
{
"epoch": 10.160642570281125,
"grad_norm": 0.18042011559009552,
"learning_rate": 5.73159392932765e-06,
"loss": 0.0009,
"num_input_tokens_seen": 3158784,
"step": 5060
},
{
"epoch": 10.170682730923694,
"grad_norm": 54.923194885253906,
"learning_rate": 5.722925415285555e-06,
"loss": 0.1475,
"num_input_tokens_seen": 3161952,
"step": 5065
},
{
"epoch": 10.180722891566266,
"grad_norm": 0.09134513884782791,
"learning_rate": 5.714254681363756e-06,
"loss": 0.0859,
"num_input_tokens_seen": 3165536,
"step": 5070
},
{
"epoch": 10.190763052208835,
"grad_norm": 3.2933976650238037,
"learning_rate": 5.705581754187387e-06,
"loss": 0.0771,
"num_input_tokens_seen": 3168064,
"step": 5075
},
{
"epoch": 10.200803212851406,
"grad_norm": 0.30672141909599304,
"learning_rate": 5.69690666038832e-06,
"loss": 0.0067,
"num_input_tokens_seen": 3171232,
"step": 5080
},
{
"epoch": 10.210843373493976,
"grad_norm": 60.519798278808594,
"learning_rate": 5.688229426605075e-06,
"loss": 0.089,
"num_input_tokens_seen": 3174368,
"step": 5085
},
{
"epoch": 10.220883534136545,
"grad_norm": 0.8040590286254883,
"learning_rate": 5.679550079482747e-06,
"loss": 0.0532,
"num_input_tokens_seen": 3177792,
"step": 5090
},
{
"epoch": 10.230923694779117,
"grad_norm": 9.954076766967773,
"learning_rate": 5.670868645672916e-06,
"loss": 0.0561,
"num_input_tokens_seen": 3180704,
"step": 5095
},
{
"epoch": 10.240963855421686,
"grad_norm": 77.2634048461914,
"learning_rate": 5.6621851518335725e-06,
"loss": 0.085,
"num_input_tokens_seen": 3184320,
"step": 5100
},
{
"epoch": 10.251004016064257,
"grad_norm": 22.160968780517578,
"learning_rate": 5.653499624629035e-06,
"loss": 0.0311,
"num_input_tokens_seen": 3187552,
"step": 5105
},
{
"epoch": 10.261044176706827,
"grad_norm": 0.13508616387844086,
"learning_rate": 5.644812090729863e-06,
"loss": 0.0029,
"num_input_tokens_seen": 3190496,
"step": 5110
},
{
"epoch": 10.271084337349398,
"grad_norm": 0.18858233094215393,
"learning_rate": 5.636122576812776e-06,
"loss": 0.0672,
"num_input_tokens_seen": 3193760,
"step": 5115
},
{
"epoch": 10.281124497991968,
"grad_norm": 0.03702637925744057,
"learning_rate": 5.627431109560577e-06,
"loss": 0.0477,
"num_input_tokens_seen": 3197536,
"step": 5120
},
{
"epoch": 10.291164658634537,
"grad_norm": 0.5422800779342651,
"learning_rate": 5.618737715662067e-06,
"loss": 0.0529,
"num_input_tokens_seen": 3201536,
"step": 5125
},
{
"epoch": 10.301204819277109,
"grad_norm": 34.62778854370117,
"learning_rate": 5.61004242181196e-06,
"loss": 0.1767,
"num_input_tokens_seen": 3205056,
"step": 5130
},
{
"epoch": 10.311244979919678,
"grad_norm": 0.6812612414360046,
"learning_rate": 5.601345254710808e-06,
"loss": 0.0998,
"num_input_tokens_seen": 3208608,
"step": 5135
},
{
"epoch": 10.32128514056225,
"grad_norm": 90.58126831054688,
"learning_rate": 5.592646241064913e-06,
"loss": 0.1761,
"num_input_tokens_seen": 3211648,
"step": 5140
},
{
"epoch": 10.331325301204819,
"grad_norm": 39.69200134277344,
"learning_rate": 5.583945407586247e-06,
"loss": 0.0906,
"num_input_tokens_seen": 3214560,
"step": 5145
},
{
"epoch": 10.34136546184739,
"grad_norm": 45.602333068847656,
"learning_rate": 5.5752427809923704e-06,
"loss": 0.0525,
"num_input_tokens_seen": 3218112,
"step": 5150
},
{
"epoch": 10.35140562248996,
"grad_norm": 0.26112157106399536,
"learning_rate": 5.566538388006351e-06,
"loss": 0.1533,
"num_input_tokens_seen": 3220992,
"step": 5155
},
{
"epoch": 10.36144578313253,
"grad_norm": 0.5084413886070251,
"learning_rate": 5.557832255356677e-06,
"loss": 0.0048,
"num_input_tokens_seen": 3224128,
"step": 5160
},
{
"epoch": 10.3714859437751,
"grad_norm": 27.925601959228516,
"learning_rate": 5.549124409777185e-06,
"loss": 0.1247,
"num_input_tokens_seen": 3227648,
"step": 5165
},
{
"epoch": 10.38152610441767,
"grad_norm": 5.449309349060059,
"learning_rate": 5.540414878006965e-06,
"loss": 0.0086,
"num_input_tokens_seen": 3230848,
"step": 5170
},
{
"epoch": 10.391566265060241,
"grad_norm": 0.9163462519645691,
"learning_rate": 5.5317036867902885e-06,
"loss": 0.0039,
"num_input_tokens_seen": 3234656,
"step": 5175
},
{
"epoch": 10.401606425702811,
"grad_norm": 83.0402603149414,
"learning_rate": 5.52299086287652e-06,
"loss": 0.0651,
"num_input_tokens_seen": 3237760,
"step": 5180
},
{
"epoch": 10.411646586345382,
"grad_norm": 0.2617938220500946,
"learning_rate": 5.514276433020044e-06,
"loss": 0.0379,
"num_input_tokens_seen": 3240928,
"step": 5185
},
{
"epoch": 10.421686746987952,
"grad_norm": 0.2599602937698364,
"learning_rate": 5.505560423980164e-06,
"loss": 0.0041,
"num_input_tokens_seen": 3244512,
"step": 5190
},
{
"epoch": 10.431726907630521,
"grad_norm": 0.06482914090156555,
"learning_rate": 5.496842862521046e-06,
"loss": 0.1364,
"num_input_tokens_seen": 3247488,
"step": 5195
},
{
"epoch": 10.441767068273093,
"grad_norm": 0.03865060955286026,
"learning_rate": 5.4881237754116135e-06,
"loss": 0.1852,
"num_input_tokens_seen": 3249952,
"step": 5200
},
{
"epoch": 10.451807228915662,
"grad_norm": 16.58278465270996,
"learning_rate": 5.479403189425481e-06,
"loss": 0.2229,
"num_input_tokens_seen": 3253248,
"step": 5205
},
{
"epoch": 10.461847389558233,
"grad_norm": 1.5567082166671753,
"learning_rate": 5.4706811313408616e-06,
"loss": 0.0127,
"num_input_tokens_seen": 3255808,
"step": 5210
},
{
"epoch": 10.471887550200803,
"grad_norm": 0.2649865746498108,
"learning_rate": 5.461957627940489e-06,
"loss": 0.0817,
"num_input_tokens_seen": 3259008,
"step": 5215
},
{
"epoch": 10.481927710843374,
"grad_norm": 41.40445327758789,
"learning_rate": 5.453232706011539e-06,
"loss": 0.1075,
"num_input_tokens_seen": 3262208,
"step": 5220
},
{
"epoch": 10.491967871485944,
"grad_norm": 74.19205474853516,
"learning_rate": 5.44450639234554e-06,
"loss": 0.0785,
"num_input_tokens_seen": 3265216,
"step": 5225
},
{
"epoch": 10.502008032128515,
"grad_norm": 0.2126062661409378,
"learning_rate": 5.435778713738292e-06,
"loss": 0.0788,
"num_input_tokens_seen": 3267936,
"step": 5230
},
{
"epoch": 10.512048192771084,
"grad_norm": 49.84227752685547,
"learning_rate": 5.427049696989792e-06,
"loss": 0.0193,
"num_input_tokens_seen": 3271552,
"step": 5235
},
{
"epoch": 10.522088353413654,
"grad_norm": 15.486082077026367,
"learning_rate": 5.418319368904137e-06,
"loss": 0.1446,
"num_input_tokens_seen": 3274304,
"step": 5240
},
{
"epoch": 10.532128514056225,
"grad_norm": 114.0123519897461,
"learning_rate": 5.409587756289462e-06,
"loss": 0.1745,
"num_input_tokens_seen": 3277056,
"step": 5245
},
{
"epoch": 10.542168674698795,
"grad_norm": 14.48117446899414,
"learning_rate": 5.40085488595784e-06,
"loss": 0.025,
"num_input_tokens_seen": 3280448,
"step": 5250
},
{
"epoch": 10.552208835341366,
"grad_norm": 0.0807952955365181,
"learning_rate": 5.392120784725206e-06,
"loss": 0.068,
"num_input_tokens_seen": 3284672,
"step": 5255
},
{
"epoch": 10.562248995983936,
"grad_norm": 0.8095536231994629,
"learning_rate": 5.383385479411276e-06,
"loss": 0.055,
"num_input_tokens_seen": 3287648,
"step": 5260
},
{
"epoch": 10.572289156626507,
"grad_norm": 67.51933288574219,
"learning_rate": 5.374648996839462e-06,
"loss": 0.0597,
"num_input_tokens_seen": 3291040,
"step": 5265
},
{
"epoch": 10.582329317269076,
"grad_norm": 35.59123992919922,
"learning_rate": 5.3659113638367936e-06,
"loss": 0.0567,
"num_input_tokens_seen": 3293536,
"step": 5270
},
{
"epoch": 10.592369477911646,
"grad_norm": 0.17060035467147827,
"learning_rate": 5.357172607233831e-06,
"loss": 0.0485,
"num_input_tokens_seen": 3296704,
"step": 5275
},
{
"epoch": 10.602409638554217,
"grad_norm": 11.730257034301758,
"learning_rate": 5.348432753864582e-06,
"loss": 0.0804,
"num_input_tokens_seen": 3299744,
"step": 5280
},
{
"epoch": 10.612449799196787,
"grad_norm": 8.082003593444824,
"learning_rate": 5.339691830566428e-06,
"loss": 0.2024,
"num_input_tokens_seen": 3302432,
"step": 5285
},
{
"epoch": 10.622489959839358,
"grad_norm": 39.70650100708008,
"learning_rate": 5.330949864180034e-06,
"loss": 0.0596,
"num_input_tokens_seen": 3305760,
"step": 5290
},
{
"epoch": 10.632530120481928,
"grad_norm": 0.2803881764411926,
"learning_rate": 5.322206881549266e-06,
"loss": 0.0486,
"num_input_tokens_seen": 3309312,
"step": 5295
},
{
"epoch": 10.642570281124499,
"grad_norm": 0.115199975669384,
"learning_rate": 5.313462909521111e-06,
"loss": 0.0613,
"num_input_tokens_seen": 3312224,
"step": 5300
},
{
"epoch": 10.652610441767068,
"grad_norm": 23.403295516967773,
"learning_rate": 5.304717974945596e-06,
"loss": 0.1876,
"num_input_tokens_seen": 3314912,
"step": 5305
},
{
"epoch": 10.662650602409638,
"grad_norm": 0.8288989067077637,
"learning_rate": 5.2959721046757004e-06,
"loss": 0.1077,
"num_input_tokens_seen": 3317824,
"step": 5310
},
{
"epoch": 10.67269076305221,
"grad_norm": 1.3652393817901611,
"learning_rate": 5.287225325567281e-06,
"loss": 0.0748,
"num_input_tokens_seen": 3321216,
"step": 5315
},
{
"epoch": 10.682730923694779,
"grad_norm": 0.05525651574134827,
"learning_rate": 5.2784776644789825e-06,
"loss": 0.0298,
"num_input_tokens_seen": 3324640,
"step": 5320
},
{
"epoch": 10.69277108433735,
"grad_norm": 8.195664405822754,
"learning_rate": 5.269729148272158e-06,
"loss": 0.1266,
"num_input_tokens_seen": 3327232,
"step": 5325
},
{
"epoch": 10.70281124497992,
"grad_norm": 26.851442337036133,
"learning_rate": 5.260979803810787e-06,
"loss": 0.0633,
"num_input_tokens_seen": 3330304,
"step": 5330
},
{
"epoch": 10.71285140562249,
"grad_norm": 0.7560333013534546,
"learning_rate": 5.252229657961394e-06,
"loss": 0.0565,
"num_input_tokens_seen": 3333472,
"step": 5335
},
{
"epoch": 10.72289156626506,
"grad_norm": 10.821014404296875,
"learning_rate": 5.2434787375929605e-06,
"loss": 0.0313,
"num_input_tokens_seen": 3336704,
"step": 5340
},
{
"epoch": 10.73293172690763,
"grad_norm": 1.1041162014007568,
"learning_rate": 5.2347270695768505e-06,
"loss": 0.0202,
"num_input_tokens_seen": 3339392,
"step": 5345
},
{
"epoch": 10.742971887550201,
"grad_norm": 50.26311492919922,
"learning_rate": 5.225974680786721e-06,
"loss": 0.1127,
"num_input_tokens_seen": 3342400,
"step": 5350
},
{
"epoch": 10.75301204819277,
"grad_norm": 34.33879470825195,
"learning_rate": 5.217221598098444e-06,
"loss": 0.1213,
"num_input_tokens_seen": 3345792,
"step": 5355
},
{
"epoch": 10.763052208835342,
"grad_norm": 26.683555603027344,
"learning_rate": 5.208467848390018e-06,
"loss": 0.1532,
"num_input_tokens_seen": 3349248,
"step": 5360
},
{
"epoch": 10.773092369477911,
"grad_norm": 14.137528419494629,
"learning_rate": 5.199713458541495e-06,
"loss": 0.0453,
"num_input_tokens_seen": 3352384,
"step": 5365
},
{
"epoch": 10.783132530120483,
"grad_norm": 3.5798494815826416,
"learning_rate": 5.190958455434891e-06,
"loss": 0.0667,
"num_input_tokens_seen": 3355648,
"step": 5370
},
{
"epoch": 10.793172690763052,
"grad_norm": 5.3735761642456055,
"learning_rate": 5.182202865954105e-06,
"loss": 0.1253,
"num_input_tokens_seen": 3358400,
"step": 5375
},
{
"epoch": 10.803212851405622,
"grad_norm": 2.211503267288208,
"learning_rate": 5.173446716984837e-06,
"loss": 0.0201,
"num_input_tokens_seen": 3361408,
"step": 5380
},
{
"epoch": 10.813253012048193,
"grad_norm": 9.731382369995117,
"learning_rate": 5.164690035414501e-06,
"loss": 0.0566,
"num_input_tokens_seen": 3365216,
"step": 5385
},
{
"epoch": 10.823293172690763,
"grad_norm": 20.726686477661133,
"learning_rate": 5.155932848132155e-06,
"loss": 0.0725,
"num_input_tokens_seen": 3368736,
"step": 5390
},
{
"epoch": 10.833333333333334,
"grad_norm": 0.17550311982631683,
"learning_rate": 5.1471751820284e-06,
"loss": 0.0465,
"num_input_tokens_seen": 3372096,
"step": 5395
},
{
"epoch": 10.843373493975903,
"grad_norm": 1.0040000677108765,
"learning_rate": 5.138417063995315e-06,
"loss": 0.0601,
"num_input_tokens_seen": 3375296,
"step": 5400
},
{
"epoch": 10.853413654618475,
"grad_norm": 5.7787065505981445,
"learning_rate": 5.129658520926361e-06,
"loss": 0.0839,
"num_input_tokens_seen": 3378880,
"step": 5405
},
{
"epoch": 10.863453815261044,
"grad_norm": 9.573227882385254,
"learning_rate": 5.1208995797163085e-06,
"loss": 0.162,
"num_input_tokens_seen": 3381600,
"step": 5410
},
{
"epoch": 10.873493975903614,
"grad_norm": 27.446544647216797,
"learning_rate": 5.112140267261151e-06,
"loss": 0.0322,
"num_input_tokens_seen": 3385024,
"step": 5415
},
{
"epoch": 10.883534136546185,
"grad_norm": 0.08992139995098114,
"learning_rate": 5.103380610458016e-06,
"loss": 0.1112,
"num_input_tokens_seen": 3387744,
"step": 5420
},
{
"epoch": 10.893574297188755,
"grad_norm": 0.1688283532857895,
"learning_rate": 5.094620636205096e-06,
"loss": 0.1092,
"num_input_tokens_seen": 3390464,
"step": 5425
},
{
"epoch": 10.903614457831326,
"grad_norm": 0.43699127435684204,
"learning_rate": 5.085860371401552e-06,
"loss": 0.1259,
"num_input_tokens_seen": 3393312,
"step": 5430
},
{
"epoch": 10.913654618473895,
"grad_norm": 107.20014953613281,
"learning_rate": 5.077099842947441e-06,
"loss": 0.1288,
"num_input_tokens_seen": 3396704,
"step": 5435
},
{
"epoch": 10.923694779116467,
"grad_norm": 12.245671272277832,
"learning_rate": 5.068339077743629e-06,
"loss": 0.0167,
"num_input_tokens_seen": 3399264,
"step": 5440
},
{
"epoch": 10.933734939759036,
"grad_norm": 0.1279267817735672,
"learning_rate": 5.059578102691707e-06,
"loss": 0.0114,
"num_input_tokens_seen": 3402144,
"step": 5445
},
{
"epoch": 10.943775100401606,
"grad_norm": 0.1616511046886444,
"learning_rate": 5.050816944693913e-06,
"loss": 0.002,
"num_input_tokens_seen": 3404608,
"step": 5450
},
{
"epoch": 10.953815261044177,
"grad_norm": 22.025815963745117,
"learning_rate": 5.042055630653042e-06,
"loss": 0.0584,
"num_input_tokens_seen": 3407584,
"step": 5455
},
{
"epoch": 10.963855421686747,
"grad_norm": 0.27465957403182983,
"learning_rate": 5.0332941874723775e-06,
"loss": 0.0499,
"num_input_tokens_seen": 3410848,
"step": 5460
},
{
"epoch": 10.973895582329318,
"grad_norm": 4.218213081359863,
"learning_rate": 5.02453264205559e-06,
"loss": 0.0993,
"num_input_tokens_seen": 3413792,
"step": 5465
},
{
"epoch": 10.983935742971887,
"grad_norm": 15.106077194213867,
"learning_rate": 5.01577102130667e-06,
"loss": 0.091,
"num_input_tokens_seen": 3416896,
"step": 5470
},
{
"epoch": 10.993975903614459,
"grad_norm": 8.20037841796875,
"learning_rate": 5.007009352129835e-06,
"loss": 0.0081,
"num_input_tokens_seen": 3419712,
"step": 5475
},
{
"epoch": 11.004016064257028,
"grad_norm": 0.12353216856718063,
"learning_rate": 4.998247661429453e-06,
"loss": 0.0095,
"num_input_tokens_seen": 3423168,
"step": 5480
},
{
"epoch": 11.014056224899598,
"grad_norm": 0.3131659924983978,
"learning_rate": 4.98948597610996e-06,
"loss": 0.0066,
"num_input_tokens_seen": 3426688,
"step": 5485
},
{
"epoch": 11.024096385542169,
"grad_norm": 11.450029373168945,
"learning_rate": 4.980724323075772e-06,
"loss": 0.1703,
"num_input_tokens_seen": 3429952,
"step": 5490
},
{
"epoch": 11.034136546184738,
"grad_norm": 0.36036553978919983,
"learning_rate": 4.971962729231211e-06,
"loss": 0.006,
"num_input_tokens_seen": 3433088,
"step": 5495
},
{
"epoch": 11.04417670682731,
"grad_norm": 0.09164968132972717,
"learning_rate": 4.9632012214804086e-06,
"loss": 0.0025,
"num_input_tokens_seen": 3435840,
"step": 5500
},
{
"epoch": 11.05421686746988,
"grad_norm": 0.10280407220125198,
"learning_rate": 4.954439826727243e-06,
"loss": 0.0105,
"num_input_tokens_seen": 3438976,
"step": 5505
},
{
"epoch": 11.06425702811245,
"grad_norm": 0.31124135851860046,
"learning_rate": 4.945678571875234e-06,
"loss": 0.0452,
"num_input_tokens_seen": 3442208,
"step": 5510
},
{
"epoch": 11.07429718875502,
"grad_norm": 23.169099807739258,
"learning_rate": 4.936917483827483e-06,
"loss": 0.007,
"num_input_tokens_seen": 3445632,
"step": 5515
},
{
"epoch": 11.08433734939759,
"grad_norm": 0.12971937656402588,
"learning_rate": 4.928156589486571e-06,
"loss": 0.1426,
"num_input_tokens_seen": 3448608,
"step": 5520
},
{
"epoch": 11.094377510040161,
"grad_norm": 0.3725661635398865,
"learning_rate": 4.919395915754486e-06,
"loss": 0.0008,
"num_input_tokens_seen": 3451264,
"step": 5525
},
{
"epoch": 11.10441767068273,
"grad_norm": 163.01356506347656,
"learning_rate": 4.910635489532543e-06,
"loss": 0.0699,
"num_input_tokens_seen": 3454496,
"step": 5530
},
{
"epoch": 11.114457831325302,
"grad_norm": 17.35430335998535,
"learning_rate": 4.901875337721289e-06,
"loss": 0.1167,
"num_input_tokens_seen": 3458016,
"step": 5535
},
{
"epoch": 11.124497991967871,
"grad_norm": 51.6560173034668,
"learning_rate": 4.893115487220434e-06,
"loss": 0.0807,
"num_input_tokens_seen": 3461344,
"step": 5540
},
{
"epoch": 11.134538152610443,
"grad_norm": 11.429407119750977,
"learning_rate": 4.884355964928767e-06,
"loss": 0.1003,
"num_input_tokens_seen": 3463424,
"step": 5545
},
{
"epoch": 11.144578313253012,
"grad_norm": 0.01519758440554142,
"learning_rate": 4.875596797744056e-06,
"loss": 0.0127,
"num_input_tokens_seen": 3466560,
"step": 5550
},
{
"epoch": 11.154618473895582,
"grad_norm": 5.342797756195068,
"learning_rate": 4.866838012562993e-06,
"loss": 0.1129,
"num_input_tokens_seen": 3469664,
"step": 5555
},
{
"epoch": 11.164658634538153,
"grad_norm": 1.1703826189041138,
"learning_rate": 4.858079636281086e-06,
"loss": 0.0025,
"num_input_tokens_seen": 3472544,
"step": 5560
},
{
"epoch": 11.174698795180722,
"grad_norm": 39.996116638183594,
"learning_rate": 4.8493216957925915e-06,
"loss": 0.0965,
"num_input_tokens_seen": 3475072,
"step": 5565
},
{
"epoch": 11.184738955823294,
"grad_norm": 35.59189987182617,
"learning_rate": 4.840564217990432e-06,
"loss": 0.0605,
"num_input_tokens_seen": 3477984,
"step": 5570
},
{
"epoch": 11.194779116465863,
"grad_norm": 0.035814475268125534,
"learning_rate": 4.831807229766101e-06,
"loss": 0.0005,
"num_input_tokens_seen": 3481152,
"step": 5575
},
{
"epoch": 11.204819277108435,
"grad_norm": 23.645578384399414,
"learning_rate": 4.823050758009597e-06,
"loss": 0.0041,
"num_input_tokens_seen": 3484800,
"step": 5580
},
{
"epoch": 11.214859437751004,
"grad_norm": 0.532557487487793,
"learning_rate": 4.814294829609325e-06,
"loss": 0.0005,
"num_input_tokens_seen": 3487776,
"step": 5585
},
{
"epoch": 11.224899598393574,
"grad_norm": 0.031553834676742554,
"learning_rate": 4.805539471452026e-06,
"loss": 0.1039,
"num_input_tokens_seen": 3491552,
"step": 5590
},
{
"epoch": 11.234939759036145,
"grad_norm": 33.89256286621094,
"learning_rate": 4.796784710422692e-06,
"loss": 0.0078,
"num_input_tokens_seen": 3495296,
"step": 5595
},
{
"epoch": 11.244979919678714,
"grad_norm": 7.040841102600098,
"learning_rate": 4.788030573404475e-06,
"loss": 0.0828,
"num_input_tokens_seen": 3498208,
"step": 5600
},
{
"epoch": 11.255020080321286,
"grad_norm": 0.027555659413337708,
"learning_rate": 4.779277087278615e-06,
"loss": 0.0301,
"num_input_tokens_seen": 3501472,
"step": 5605
},
{
"epoch": 11.265060240963855,
"grad_norm": 0.02722824178636074,
"learning_rate": 4.770524278924353e-06,
"loss": 0.0149,
"num_input_tokens_seen": 3504352,
"step": 5610
},
{
"epoch": 11.275100401606426,
"grad_norm": 0.2287973165512085,
"learning_rate": 4.761772175218848e-06,
"loss": 0.061,
"num_input_tokens_seen": 3507904,
"step": 5615
},
{
"epoch": 11.285140562248996,
"grad_norm": 0.07063689827919006,
"learning_rate": 4.753020803037098e-06,
"loss": 0.0376,
"num_input_tokens_seen": 3511328,
"step": 5620
},
{
"epoch": 11.295180722891565,
"grad_norm": 0.009446073323488235,
"learning_rate": 4.744270189251848e-06,
"loss": 0.0009,
"num_input_tokens_seen": 3514432,
"step": 5625
},
{
"epoch": 11.305220883534137,
"grad_norm": 60.107078552246094,
"learning_rate": 4.735520360733523e-06,
"loss": 0.073,
"num_input_tokens_seen": 3517824,
"step": 5630
},
{
"epoch": 11.315261044176706,
"grad_norm": 0.03736821189522743,
"learning_rate": 4.7267713443501274e-06,
"loss": 0.0801,
"num_input_tokens_seen": 3520416,
"step": 5635
},
{
"epoch": 11.325301204819278,
"grad_norm": 0.0403904914855957,
"learning_rate": 4.718023166967181e-06,
"loss": 0.1055,
"num_input_tokens_seen": 3523648,
"step": 5640
},
{
"epoch": 11.335341365461847,
"grad_norm": 0.44946715235710144,
"learning_rate": 4.7092758554476215e-06,
"loss": 0.0693,
"num_input_tokens_seen": 3526624,
"step": 5645
},
{
"epoch": 11.345381526104418,
"grad_norm": 0.4542866349220276,
"learning_rate": 4.700529436651729e-06,
"loss": 0.1391,
"num_input_tokens_seen": 3530080,
"step": 5650
},
{
"epoch": 11.355421686746988,
"grad_norm": 0.5738996267318726,
"learning_rate": 4.691783937437043e-06,
"loss": 0.0008,
"num_input_tokens_seen": 3533184,
"step": 5655
},
{
"epoch": 11.365461847389557,
"grad_norm": 0.34505975246429443,
"learning_rate": 4.683039384658276e-06,
"loss": 0.0281,
"num_input_tokens_seen": 3536608,
"step": 5660
},
{
"epoch": 11.375502008032129,
"grad_norm": 0.1401638686656952,
"learning_rate": 4.67429580516724e-06,
"loss": 0.105,
"num_input_tokens_seen": 3539840,
"step": 5665
},
{
"epoch": 11.385542168674698,
"grad_norm": 0.23378750681877136,
"learning_rate": 4.665553225812758e-06,
"loss": 0.0015,
"num_input_tokens_seen": 3541952,
"step": 5670
},
{
"epoch": 11.39558232931727,
"grad_norm": 18.162364959716797,
"learning_rate": 4.656811673440572e-06,
"loss": 0.1544,
"num_input_tokens_seen": 3544992,
"step": 5675
},
{
"epoch": 11.405622489959839,
"grad_norm": 34.31365203857422,
"learning_rate": 4.648071174893285e-06,
"loss": 0.0318,
"num_input_tokens_seen": 3547872,
"step": 5680
},
{
"epoch": 11.41566265060241,
"grad_norm": 0.40164482593536377,
"learning_rate": 4.6393317570102505e-06,
"loss": 0.0131,
"num_input_tokens_seen": 3550880,
"step": 5685
},
{
"epoch": 11.42570281124498,
"grad_norm": 0.06918898224830627,
"learning_rate": 4.6305934466275145e-06,
"loss": 0.0499,
"num_input_tokens_seen": 3554464,
"step": 5690
},
{
"epoch": 11.43574297188755,
"grad_norm": 0.07604250311851501,
"learning_rate": 4.6218562705777185e-06,
"loss": 0.0011,
"num_input_tokens_seen": 3557344,
"step": 5695
},
{
"epoch": 11.44578313253012,
"grad_norm": 0.8705164194107056,
"learning_rate": 4.613120255690014e-06,
"loss": 0.1489,
"num_input_tokens_seen": 3560096,
"step": 5700
},
{
"epoch": 11.45582329317269,
"grad_norm": 0.12667706608772278,
"learning_rate": 4.604385428789997e-06,
"loss": 0.0229,
"num_input_tokens_seen": 3562560,
"step": 5705
},
{
"epoch": 11.465863453815262,
"grad_norm": 0.11770451068878174,
"learning_rate": 4.595651816699612e-06,
"loss": 0.0591,
"num_input_tokens_seen": 3565472,
"step": 5710
},
{
"epoch": 11.475903614457831,
"grad_norm": 94.34481811523438,
"learning_rate": 4.586919446237071e-06,
"loss": 0.0946,
"num_input_tokens_seen": 3568288,
"step": 5715
},
{
"epoch": 11.485943775100402,
"grad_norm": 0.2393973171710968,
"learning_rate": 4.578188344216777e-06,
"loss": 0.0236,
"num_input_tokens_seen": 3571712,
"step": 5720
},
{
"epoch": 11.495983935742972,
"grad_norm": 156.00209045410156,
"learning_rate": 4.5694585374492314e-06,
"loss": 0.0604,
"num_input_tokens_seen": 3574528,
"step": 5725
},
{
"epoch": 11.506024096385541,
"grad_norm": 2.538038492202759,
"learning_rate": 4.560730052740967e-06,
"loss": 0.0027,
"num_input_tokens_seen": 3577504,
"step": 5730
},
{
"epoch": 11.516064257028113,
"grad_norm": 0.1259893774986267,
"learning_rate": 4.552002916894454e-06,
"loss": 0.0029,
"num_input_tokens_seen": 3581024,
"step": 5735
},
{
"epoch": 11.526104417670682,
"grad_norm": 0.10142702609300613,
"learning_rate": 4.543277156708013e-06,
"loss": 0.0853,
"num_input_tokens_seen": 3583552,
"step": 5740
},
{
"epoch": 11.536144578313253,
"grad_norm": 0.24175593256950378,
"learning_rate": 4.534552798975755e-06,
"loss": 0.0414,
"num_input_tokens_seen": 3587136,
"step": 5745
},
{
"epoch": 11.546184738955823,
"grad_norm": 0.05081957206130028,
"learning_rate": 4.525829870487468e-06,
"loss": 0.0038,
"num_input_tokens_seen": 3590368,
"step": 5750
},
{
"epoch": 11.556224899598394,
"grad_norm": 0.03754664212465286,
"learning_rate": 4.517108398028566e-06,
"loss": 0.0486,
"num_input_tokens_seen": 3592896,
"step": 5755
},
{
"epoch": 11.566265060240964,
"grad_norm": 1.1687461137771606,
"learning_rate": 4.508388408379985e-06,
"loss": 0.0376,
"num_input_tokens_seen": 3595424,
"step": 5760
},
{
"epoch": 11.576305220883533,
"grad_norm": 0.5024897456169128,
"learning_rate": 4.499669928318105e-06,
"loss": 0.0384,
"num_input_tokens_seen": 3599136,
"step": 5765
},
{
"epoch": 11.586345381526105,
"grad_norm": 0.012420962564647198,
"learning_rate": 4.490952984614676e-06,
"loss": 0.0472,
"num_input_tokens_seen": 3602496,
"step": 5770
},
{
"epoch": 11.596385542168674,
"grad_norm": 53.2065544128418,
"learning_rate": 4.482237604036729e-06,
"loss": 0.0978,
"num_input_tokens_seen": 3605824,
"step": 5775
},
{
"epoch": 11.606425702811245,
"grad_norm": 25.26783561706543,
"learning_rate": 4.473523813346491e-06,
"loss": 0.1101,
"num_input_tokens_seen": 3608544,
"step": 5780
},
{
"epoch": 11.616465863453815,
"grad_norm": 0.04346461594104767,
"learning_rate": 4.464811639301314e-06,
"loss": 0.0407,
"num_input_tokens_seen": 3611328,
"step": 5785
},
{
"epoch": 11.626506024096386,
"grad_norm": 31.01521873474121,
"learning_rate": 4.456101108653579e-06,
"loss": 0.065,
"num_input_tokens_seen": 3613376,
"step": 5790
},
{
"epoch": 11.636546184738956,
"grad_norm": 54.869712829589844,
"learning_rate": 4.447392248150627e-06,
"loss": 0.1865,
"num_input_tokens_seen": 3616032,
"step": 5795
},
{
"epoch": 11.646586345381525,
"grad_norm": 3.378321647644043,
"learning_rate": 4.438685084534663e-06,
"loss": 0.022,
"num_input_tokens_seen": 3619552,
"step": 5800
},
{
"epoch": 11.656626506024097,
"grad_norm": 0.15011410415172577,
"learning_rate": 4.429979644542689e-06,
"loss": 0.0459,
"num_input_tokens_seen": 3623200,
"step": 5805
},
{
"epoch": 11.666666666666666,
"grad_norm": 44.3188591003418,
"learning_rate": 4.421275954906409e-06,
"loss": 0.0709,
"num_input_tokens_seen": 3626208,
"step": 5810
},
{
"epoch": 11.676706827309237,
"grad_norm": 0.18596585094928741,
"learning_rate": 4.412574042352156e-06,
"loss": 0.0649,
"num_input_tokens_seen": 3629632,
"step": 5815
},
{
"epoch": 11.686746987951807,
"grad_norm": 0.8435150384902954,
"learning_rate": 4.403873933600803e-06,
"loss": 0.0237,
"num_input_tokens_seen": 3632224,
"step": 5820
},
{
"epoch": 11.696787148594378,
"grad_norm": 0.5117378234863281,
"learning_rate": 4.395175655367682e-06,
"loss": 0.0045,
"num_input_tokens_seen": 3635424,
"step": 5825
},
{
"epoch": 11.706827309236948,
"grad_norm": 31.202089309692383,
"learning_rate": 4.386479234362512e-06,
"loss": 0.1638,
"num_input_tokens_seen": 3638560,
"step": 5830
},
{
"epoch": 11.716867469879517,
"grad_norm": 0.47105512022972107,
"learning_rate": 4.377784697289304e-06,
"loss": 0.0549,
"num_input_tokens_seen": 3642560,
"step": 5835
},
{
"epoch": 11.726907630522089,
"grad_norm": 1.4522799253463745,
"learning_rate": 4.36909207084628e-06,
"loss": 0.0198,
"num_input_tokens_seen": 3646016,
"step": 5840
},
{
"epoch": 11.736947791164658,
"grad_norm": 1.9571293592453003,
"learning_rate": 4.360401381725806e-06,
"loss": 0.0741,
"num_input_tokens_seen": 3649152,
"step": 5845
},
{
"epoch": 11.74698795180723,
"grad_norm": 38.02421569824219,
"learning_rate": 4.3517126566142864e-06,
"loss": 0.0736,
"num_input_tokens_seen": 3652096,
"step": 5850
},
{
"epoch": 11.757028112449799,
"grad_norm": 53.22858810424805,
"learning_rate": 4.343025922192104e-06,
"loss": 0.2828,
"num_input_tokens_seen": 3655776,
"step": 5855
},
{
"epoch": 11.76706827309237,
"grad_norm": 15.424005508422852,
"learning_rate": 4.334341205133527e-06,
"loss": 0.0433,
"num_input_tokens_seen": 3658656,
"step": 5860
},
{
"epoch": 11.77710843373494,
"grad_norm": 0.08036069571971893,
"learning_rate": 4.325658532106623e-06,
"loss": 0.0372,
"num_input_tokens_seen": 3661440,
"step": 5865
},
{
"epoch": 11.78714859437751,
"grad_norm": 24.228103637695312,
"learning_rate": 4.316977929773191e-06,
"loss": 0.0765,
"num_input_tokens_seen": 3664288,
"step": 5870
},
{
"epoch": 11.79718875502008,
"grad_norm": 0.32403498888015747,
"learning_rate": 4.308299424788667e-06,
"loss": 0.0367,
"num_input_tokens_seen": 3667744,
"step": 5875
},
{
"epoch": 11.80722891566265,
"grad_norm": 13.046582221984863,
"learning_rate": 4.299623043802046e-06,
"loss": 0.0453,
"num_input_tokens_seen": 3670624,
"step": 5880
},
{
"epoch": 11.817269076305221,
"grad_norm": 0.5890432596206665,
"learning_rate": 4.2909488134558086e-06,
"loss": 0.0436,
"num_input_tokens_seen": 3673600,
"step": 5885
},
{
"epoch": 11.82730923694779,
"grad_norm": 0.2298094779253006,
"learning_rate": 4.2822767603858185e-06,
"loss": 0.0026,
"num_input_tokens_seen": 3676928,
"step": 5890
},
{
"epoch": 11.837349397590362,
"grad_norm": 0.3329310119152069,
"learning_rate": 4.2736069112212656e-06,
"loss": 0.1138,
"num_input_tokens_seen": 3680064,
"step": 5895
},
{
"epoch": 11.847389558232932,
"grad_norm": 0.017066599801182747,
"learning_rate": 4.264939292584565e-06,
"loss": 0.0151,
"num_input_tokens_seen": 3683040,
"step": 5900
},
{
"epoch": 11.857429718875501,
"grad_norm": 0.07971663773059845,
"learning_rate": 4.256273931091284e-06,
"loss": 0.0137,
"num_input_tokens_seen": 3686400,
"step": 5905
},
{
"epoch": 11.867469879518072,
"grad_norm": 14.964932441711426,
"learning_rate": 4.247610853350063e-06,
"loss": 0.0368,
"num_input_tokens_seen": 3689216,
"step": 5910
},
{
"epoch": 11.877510040160642,
"grad_norm": 16.467721939086914,
"learning_rate": 4.238950085962522e-06,
"loss": 0.0593,
"num_input_tokens_seen": 3692288,
"step": 5915
},
{
"epoch": 11.887550200803213,
"grad_norm": 23.15678596496582,
"learning_rate": 4.230291655523197e-06,
"loss": 0.028,
"num_input_tokens_seen": 3696288,
"step": 5920
},
{
"epoch": 11.897590361445783,
"grad_norm": 0.4339298903942108,
"learning_rate": 4.2216355886194355e-06,
"loss": 0.0461,
"num_input_tokens_seen": 3699456,
"step": 5925
},
{
"epoch": 11.907630522088354,
"grad_norm": 74.98004150390625,
"learning_rate": 4.212981911831338e-06,
"loss": 0.0741,
"num_input_tokens_seen": 3703232,
"step": 5930
},
{
"epoch": 11.917670682730924,
"grad_norm": 0.4801337420940399,
"learning_rate": 4.204330651731662e-06,
"loss": 0.0208,
"num_input_tokens_seen": 3705568,
"step": 5935
},
{
"epoch": 11.927710843373493,
"grad_norm": 1.0646982192993164,
"learning_rate": 4.195681834885743e-06,
"loss": 0.0302,
"num_input_tokens_seen": 3709152,
"step": 5940
},
{
"epoch": 11.937751004016064,
"grad_norm": 0.011887975037097931,
"learning_rate": 4.187035487851412e-06,
"loss": 0.0003,
"num_input_tokens_seen": 3713056,
"step": 5945
},
{
"epoch": 11.947791164658634,
"grad_norm": 11.081732749938965,
"learning_rate": 4.178391637178923e-06,
"loss": 0.0046,
"num_input_tokens_seen": 3715744,
"step": 5950
},
{
"epoch": 11.957831325301205,
"grad_norm": 0.07225532084703445,
"learning_rate": 4.169750309410856e-06,
"loss": 0.0004,
"num_input_tokens_seen": 3718912,
"step": 5955
},
{
"epoch": 11.967871485943775,
"grad_norm": 0.05781983956694603,
"learning_rate": 4.161111531082052e-06,
"loss": 0.3039,
"num_input_tokens_seen": 3721504,
"step": 5960
},
{
"epoch": 11.977911646586346,
"grad_norm": 42.374271392822266,
"learning_rate": 4.152475328719517e-06,
"loss": 0.0095,
"num_input_tokens_seen": 3724960,
"step": 5965
},
{
"epoch": 11.987951807228916,
"grad_norm": 206.51792907714844,
"learning_rate": 4.14384172884235e-06,
"loss": 0.0668,
"num_input_tokens_seen": 3728160,
"step": 5970
},
{
"epoch": 11.997991967871485,
"grad_norm": 14.492680549621582,
"learning_rate": 4.13521075796166e-06,
"loss": 0.1454,
"num_input_tokens_seen": 3732000,
"step": 5975
},
{
"epoch": 12.0,
"eval_loss": 0.5769983530044556,
"eval_runtime": 8.0773,
"eval_samples_per_second": 61.655,
"eval_steps_per_second": 15.476,
"num_input_tokens_seen": 3732864,
"step": 5976
},
{
"epoch": 12.008032128514056,
"grad_norm": 0.06490268558263779,
"learning_rate": 4.126582442580478e-06,
"loss": 0.0756,
"num_input_tokens_seen": 3735424,
"step": 5980
},
{
"epoch": 12.018072289156626,
"grad_norm": 0.2292412966489792,
"learning_rate": 4.117956809193687e-06,
"loss": 0.0007,
"num_input_tokens_seen": 3738816,
"step": 5985
},
{
"epoch": 12.028112449799197,
"grad_norm": 0.057011283934116364,
"learning_rate": 4.109333884287929e-06,
"loss": 0.0439,
"num_input_tokens_seen": 3742176,
"step": 5990
},
{
"epoch": 12.038152610441767,
"grad_norm": 0.018794192001223564,
"learning_rate": 4.1007136943415325e-06,
"loss": 0.067,
"num_input_tokens_seen": 3744928,
"step": 5995
},
{
"epoch": 12.048192771084338,
"grad_norm": 0.15085923671722412,
"learning_rate": 4.092096265824429e-06,
"loss": 0.0044,
"num_input_tokens_seen": 3748288,
"step": 6000
},
{
"epoch": 12.058232931726907,
"grad_norm": 0.032142024487257004,
"learning_rate": 4.083481625198065e-06,
"loss": 0.0213,
"num_input_tokens_seen": 3751744,
"step": 6005
},
{
"epoch": 12.068273092369479,
"grad_norm": 0.4201153516769409,
"learning_rate": 4.074869798915333e-06,
"loss": 0.0013,
"num_input_tokens_seen": 3754624,
"step": 6010
},
{
"epoch": 12.078313253012048,
"grad_norm": 1.5055813789367676,
"learning_rate": 4.066260813420477e-06,
"loss": 0.0007,
"num_input_tokens_seen": 3757120,
"step": 6015
},
{
"epoch": 12.088353413654618,
"grad_norm": 0.006558163091540337,
"learning_rate": 4.0576546951490225e-06,
"loss": 0.0007,
"num_input_tokens_seen": 3759872,
"step": 6020
},
{
"epoch": 12.098393574297189,
"grad_norm": 0.017629623413085938,
"learning_rate": 4.049051470527692e-06,
"loss": 0.0741,
"num_input_tokens_seen": 3762848,
"step": 6025
},
{
"epoch": 12.108433734939759,
"grad_norm": 0.0402434803545475,
"learning_rate": 4.040451165974313e-06,
"loss": 0.0594,
"num_input_tokens_seen": 3766080,
"step": 6030
},
{
"epoch": 12.11847389558233,
"grad_norm": 0.3141787052154541,
"learning_rate": 4.031853807897759e-06,
"loss": 0.0397,
"num_input_tokens_seen": 3769216,
"step": 6035
},
{
"epoch": 12.1285140562249,
"grad_norm": 0.008833534084260464,
"learning_rate": 4.023259422697846e-06,
"loss": 0.0017,
"num_input_tokens_seen": 3772480,
"step": 6040
},
{
"epoch": 12.13855421686747,
"grad_norm": 38.78855895996094,
"learning_rate": 4.014668036765267e-06,
"loss": 0.0134,
"num_input_tokens_seen": 3776096,
"step": 6045
},
{
"epoch": 12.14859437751004,
"grad_norm": 0.13365915417671204,
"learning_rate": 4.006079676481504e-06,
"loss": 0.0005,
"num_input_tokens_seen": 3779520,
"step": 6050
},
{
"epoch": 12.15863453815261,
"grad_norm": 0.04451864957809448,
"learning_rate": 3.997494368218745e-06,
"loss": 0.0451,
"num_input_tokens_seen": 3782560,
"step": 6055
},
{
"epoch": 12.168674698795181,
"grad_norm": 0.008807549253106117,
"learning_rate": 3.988912138339812e-06,
"loss": 0.0011,
"num_input_tokens_seen": 3785216,
"step": 6060
},
{
"epoch": 12.17871485943775,
"grad_norm": 0.057633060961961746,
"learning_rate": 3.980333013198067e-06,
"loss": 0.0215,
"num_input_tokens_seen": 3788256,
"step": 6065
},
{
"epoch": 12.188755020080322,
"grad_norm": 0.4633331894874573,
"learning_rate": 3.971757019137342e-06,
"loss": 0.0067,
"num_input_tokens_seen": 3791552,
"step": 6070
},
{
"epoch": 12.198795180722891,
"grad_norm": 0.022058850154280663,
"learning_rate": 3.9631841824918585e-06,
"loss": 0.0042,
"num_input_tokens_seen": 3795008,
"step": 6075
},
{
"epoch": 12.208835341365463,
"grad_norm": 25.464744567871094,
"learning_rate": 3.954614529586135e-06,
"loss": 0.0195,
"num_input_tokens_seen": 3797504,
"step": 6080
},
{
"epoch": 12.218875502008032,
"grad_norm": 0.02658812515437603,
"learning_rate": 3.946048086734921e-06,
"loss": 0.0001,
"num_input_tokens_seen": 3800768,
"step": 6085
},
{
"epoch": 12.228915662650602,
"grad_norm": 27.161558151245117,
"learning_rate": 3.9374848802430995e-06,
"loss": 0.0444,
"num_input_tokens_seen": 3804032,
"step": 6090
},
{
"epoch": 12.238955823293173,
"grad_norm": 0.11181746423244476,
"learning_rate": 3.928924936405625e-06,
"loss": 0.0084,
"num_input_tokens_seen": 3807360,
"step": 6095
},
{
"epoch": 12.248995983935743,
"grad_norm": 58.37928009033203,
"learning_rate": 3.920368281507431e-06,
"loss": 0.0319,
"num_input_tokens_seen": 3810304,
"step": 6100
},
{
"epoch": 12.259036144578314,
"grad_norm": 0.0796833336353302,
"learning_rate": 3.911814941823349e-06,
"loss": 0.0386,
"num_input_tokens_seen": 3813504,
"step": 6105
},
{
"epoch": 12.269076305220883,
"grad_norm": 0.011359083466231823,
"learning_rate": 3.9032649436180325e-06,
"loss": 0.0416,
"num_input_tokens_seen": 3815584,
"step": 6110
},
{
"epoch": 12.279116465863455,
"grad_norm": 0.0377965085208416,
"learning_rate": 3.894718313145873e-06,
"loss": 0.0038,
"num_input_tokens_seen": 3819360,
"step": 6115
},
{
"epoch": 12.289156626506024,
"grad_norm": 0.0067506153136491776,
"learning_rate": 3.88617507665092e-06,
"loss": 0.0426,
"num_input_tokens_seen": 3822336,
"step": 6120
},
{
"epoch": 12.299196787148594,
"grad_norm": 0.011676016263663769,
"learning_rate": 3.877635260366807e-06,
"loss": 0.0295,
"num_input_tokens_seen": 3825184,
"step": 6125
},
{
"epoch": 12.309236947791165,
"grad_norm": 0.05182220786809921,
"learning_rate": 3.869098890516656e-06,
"loss": 0.0001,
"num_input_tokens_seen": 3828864,
"step": 6130
},
{
"epoch": 12.319277108433734,
"grad_norm": 17.081266403198242,
"learning_rate": 3.8605659933130165e-06,
"loss": 0.0386,
"num_input_tokens_seen": 3831168,
"step": 6135
},
{
"epoch": 12.329317269076306,
"grad_norm": 0.012902950868010521,
"learning_rate": 3.852036594957762e-06,
"loss": 0.0117,
"num_input_tokens_seen": 3834304,
"step": 6140
},
{
"epoch": 12.339357429718875,
"grad_norm": 0.16895687580108643,
"learning_rate": 3.843510721642036e-06,
"loss": 0.0005,
"num_input_tokens_seen": 3837792,
"step": 6145
},
{
"epoch": 12.349397590361447,
"grad_norm": 154.15745544433594,
"learning_rate": 3.834988399546145e-06,
"loss": 0.0596,
"num_input_tokens_seen": 3840736,
"step": 6150
},
{
"epoch": 12.359437751004016,
"grad_norm": 0.030498886480927467,
"learning_rate": 3.826469654839501e-06,
"loss": 0.0105,
"num_input_tokens_seen": 3843968,
"step": 6155
},
{
"epoch": 12.369477911646586,
"grad_norm": 0.03108968771994114,
"learning_rate": 3.817954513680524e-06,
"loss": 0.0002,
"num_input_tokens_seen": 3846560,
"step": 6160
},
{
"epoch": 12.379518072289157,
"grad_norm": 16.620052337646484,
"learning_rate": 3.8094430022165713e-06,
"loss": 0.0571,
"num_input_tokens_seen": 3849728,
"step": 6165
},
{
"epoch": 12.389558232931726,
"grad_norm": 0.028063397854566574,
"learning_rate": 3.800935146583854e-06,
"loss": 0.0173,
"num_input_tokens_seen": 3852416,
"step": 6170
},
{
"epoch": 12.399598393574298,
"grad_norm": 0.2141476422548294,
"learning_rate": 3.7924309729073616e-06,
"loss": 0.0484,
"num_input_tokens_seen": 3855968,
"step": 6175
},
{
"epoch": 12.409638554216867,
"grad_norm": 11.455037117004395,
"learning_rate": 3.7839305073007675e-06,
"loss": 0.0015,
"num_input_tokens_seen": 3859552,
"step": 6180
},
{
"epoch": 12.419678714859439,
"grad_norm": 28.907880783081055,
"learning_rate": 3.775433775866369e-06,
"loss": 0.0115,
"num_input_tokens_seen": 3862112,
"step": 6185
},
{
"epoch": 12.429718875502008,
"grad_norm": 2.5717875957489014,
"learning_rate": 3.766940804694992e-06,
"loss": 0.0004,
"num_input_tokens_seen": 3865536,
"step": 6190
},
{
"epoch": 12.439759036144578,
"grad_norm": 0.01706070452928543,
"learning_rate": 3.758451619865915e-06,
"loss": 0.0134,
"num_input_tokens_seen": 3868512,
"step": 6195
},
{
"epoch": 12.449799196787149,
"grad_norm": 4.302461624145508,
"learning_rate": 3.749966247446794e-06,
"loss": 0.0032,
"num_input_tokens_seen": 3870912,
"step": 6200
},
{
"epoch": 12.459839357429718,
"grad_norm": 12.282868385314941,
"learning_rate": 3.7414847134935716e-06,
"loss": 0.1196,
"num_input_tokens_seen": 3873568,
"step": 6205
},
{
"epoch": 12.46987951807229,
"grad_norm": 20.93626594543457,
"learning_rate": 3.7330070440504097e-06,
"loss": 0.0025,
"num_input_tokens_seen": 3876608,
"step": 6210
},
{
"epoch": 12.47991967871486,
"grad_norm": 0.0019983912352472544,
"learning_rate": 3.7245332651496038e-06,
"loss": 0.0249,
"num_input_tokens_seen": 3879232,
"step": 6215
},
{
"epoch": 12.48995983935743,
"grad_norm": 0.015782205387949944,
"learning_rate": 3.716063402811496e-06,
"loss": 0.0179,
"num_input_tokens_seen": 3882752,
"step": 6220
},
{
"epoch": 12.5,
"grad_norm": 0.0136459581553936,
"learning_rate": 3.707597483044411e-06,
"loss": 0.0003,
"num_input_tokens_seen": 3885344,
"step": 6225
},
{
"epoch": 12.51004016064257,
"grad_norm": 4.084256172180176,
"learning_rate": 3.699135531844559e-06,
"loss": 0.0002,
"num_input_tokens_seen": 3887648,
"step": 6230
},
{
"epoch": 12.52008032128514,
"grad_norm": 0.046021297574043274,
"learning_rate": 3.6906775751959667e-06,
"loss": 0.001,
"num_input_tokens_seen": 3891008,
"step": 6235
},
{
"epoch": 12.53012048192771,
"grad_norm": 0.012802932411432266,
"learning_rate": 3.682223639070398e-06,
"loss": 0.0003,
"num_input_tokens_seen": 3894016,
"step": 6240
},
{
"epoch": 12.540160642570282,
"grad_norm": 0.008216789923608303,
"learning_rate": 3.673773749427266e-06,
"loss": 0.0022,
"num_input_tokens_seen": 3897056,
"step": 6245
},
{
"epoch": 12.550200803212851,
"grad_norm": 0.049423910677433014,
"learning_rate": 3.6653279322135637e-06,
"loss": 0.0298,
"num_input_tokens_seen": 3900064,
"step": 6250
},
{
"epoch": 12.560240963855422,
"grad_norm": 3.1109814643859863,
"learning_rate": 3.656886213363772e-06,
"loss": 0.0707,
"num_input_tokens_seen": 3903424,
"step": 6255
},
{
"epoch": 12.570281124497992,
"grad_norm": 0.013427079655230045,
"learning_rate": 3.6484486187997927e-06,
"loss": 0.0367,
"num_input_tokens_seen": 3906528,
"step": 6260
},
{
"epoch": 12.580321285140561,
"grad_norm": 0.43069183826446533,
"learning_rate": 3.640015174430864e-06,
"loss": 0.0005,
"num_input_tokens_seen": 3909728,
"step": 6265
},
{
"epoch": 12.590361445783133,
"grad_norm": 0.037010353058576584,
"learning_rate": 3.6315859061534743e-06,
"loss": 0.1657,
"num_input_tokens_seen": 3913056,
"step": 6270
},
{
"epoch": 12.600401606425702,
"grad_norm": 1.318424940109253,
"learning_rate": 3.623160839851292e-06,
"loss": 0.1218,
"num_input_tokens_seen": 3916032,
"step": 6275
},
{
"epoch": 12.610441767068274,
"grad_norm": 0.23550517857074738,
"learning_rate": 3.6147400013950833e-06,
"loss": 0.0096,
"num_input_tokens_seen": 3919200,
"step": 6280
},
{
"epoch": 12.620481927710843,
"grad_norm": 0.05175193399190903,
"learning_rate": 3.60632341664263e-06,
"loss": 0.0043,
"num_input_tokens_seen": 3922048,
"step": 6285
},
{
"epoch": 12.630522088353414,
"grad_norm": 0.21679647266864777,
"learning_rate": 3.5979111114386556e-06,
"loss": 0.0002,
"num_input_tokens_seen": 3926208,
"step": 6290
},
{
"epoch": 12.640562248995984,
"grad_norm": 0.030079122632741928,
"learning_rate": 3.5895031116147355e-06,
"loss": 0.038,
"num_input_tokens_seen": 3929792,
"step": 6295
},
{
"epoch": 12.650602409638553,
"grad_norm": 0.020691825076937675,
"learning_rate": 3.5810994429892343e-06,
"loss": 0.0355,
"num_input_tokens_seen": 3932768,
"step": 6300
},
{
"epoch": 12.660642570281125,
"grad_norm": 0.17100581526756287,
"learning_rate": 3.5727001313672073e-06,
"loss": 0.1505,
"num_input_tokens_seen": 3936032,
"step": 6305
},
{
"epoch": 12.670682730923694,
"grad_norm": 0.002263088943436742,
"learning_rate": 3.5643052025403366e-06,
"loss": 0.0018,
"num_input_tokens_seen": 3939136,
"step": 6310
},
{
"epoch": 12.680722891566266,
"grad_norm": 0.3044714033603668,
"learning_rate": 3.555914682286845e-06,
"loss": 0.0002,
"num_input_tokens_seen": 3942688,
"step": 6315
},
{
"epoch": 12.690763052208835,
"grad_norm": 6.629093647003174,
"learning_rate": 3.547528596371418e-06,
"loss": 0.0031,
"num_input_tokens_seen": 3945472,
"step": 6320
},
{
"epoch": 12.700803212851406,
"grad_norm": 0.17978572845458984,
"learning_rate": 3.539146970545124e-06,
"loss": 0.0006,
"num_input_tokens_seen": 3948224,
"step": 6325
},
{
"epoch": 12.710843373493976,
"grad_norm": 6.806268692016602,
"learning_rate": 3.530769830545333e-06,
"loss": 0.0669,
"num_input_tokens_seen": 3951840,
"step": 6330
},
{
"epoch": 12.720883534136545,
"grad_norm": 12.894400596618652,
"learning_rate": 3.5223972020956454e-06,
"loss": 0.1137,
"num_input_tokens_seen": 3955424,
"step": 6335
},
{
"epoch": 12.730923694779117,
"grad_norm": 1.2106947898864746,
"learning_rate": 3.514029110905809e-06,
"loss": 0.0008,
"num_input_tokens_seen": 3957952,
"step": 6340
},
{
"epoch": 12.740963855421686,
"grad_norm": 0.024472283199429512,
"learning_rate": 3.505665582671631e-06,
"loss": 0.0704,
"num_input_tokens_seen": 3961152,
"step": 6345
},
{
"epoch": 12.751004016064257,
"grad_norm": 0.2990010976791382,
"learning_rate": 3.4973066430749175e-06,
"loss": 0.0007,
"num_input_tokens_seen": 3964480,
"step": 6350
},
{
"epoch": 12.761044176706827,
"grad_norm": 0.4378281831741333,
"learning_rate": 3.488952317783374e-06,
"loss": 0.024,
"num_input_tokens_seen": 3966912,
"step": 6355
},
{
"epoch": 12.771084337349398,
"grad_norm": 0.05266943201422691,
"learning_rate": 3.480602632450545e-06,
"loss": 0.0229,
"num_input_tokens_seen": 3969152,
"step": 6360
},
{
"epoch": 12.781124497991968,
"grad_norm": 0.08936101943254471,
"learning_rate": 3.4722576127157244e-06,
"loss": 0.0341,
"num_input_tokens_seen": 3972160,
"step": 6365
},
{
"epoch": 12.791164658634537,
"grad_norm": 0.012653462588787079,
"learning_rate": 3.4639172842038766e-06,
"loss": 0.0002,
"num_input_tokens_seen": 3974784,
"step": 6370
},
{
"epoch": 12.801204819277109,
"grad_norm": 0.008684534579515457,
"learning_rate": 3.4555816725255666e-06,
"loss": 0.0721,
"num_input_tokens_seen": 3978592,
"step": 6375
},
{
"epoch": 12.811244979919678,
"grad_norm": 14.522257804870605,
"learning_rate": 3.447250803276869e-06,
"loss": 0.076,
"num_input_tokens_seen": 3982272,
"step": 6380
},
{
"epoch": 12.82128514056225,
"grad_norm": 0.026460807770490646,
"learning_rate": 3.438924702039301e-06,
"loss": 0.0026,
"num_input_tokens_seen": 3985344,
"step": 6385
},
{
"epoch": 12.831325301204819,
"grad_norm": 0.39210912585258484,
"learning_rate": 3.430603394379738e-06,
"loss": 0.003,
"num_input_tokens_seen": 3988064,
"step": 6390
},
{
"epoch": 12.84136546184739,
"grad_norm": 0.493512362241745,
"learning_rate": 3.422286905850332e-06,
"loss": 0.014,
"num_input_tokens_seen": 3990976,
"step": 6395
},
{
"epoch": 12.85140562248996,
"grad_norm": 0.03762149438261986,
"learning_rate": 3.4139752619884415e-06,
"loss": 0.1316,
"num_input_tokens_seen": 3994848,
"step": 6400
},
{
"epoch": 12.861445783132531,
"grad_norm": 23.084840774536133,
"learning_rate": 3.4056684883165454e-06,
"loss": 0.0486,
"num_input_tokens_seen": 3997984,
"step": 6405
},
{
"epoch": 12.8714859437751,
"grad_norm": 0.08375642448663712,
"learning_rate": 3.3973666103421675e-06,
"loss": 0.0143,
"num_input_tokens_seen": 4000896,
"step": 6410
},
{
"epoch": 12.88152610441767,
"grad_norm": 0.24988406896591187,
"learning_rate": 3.389069653557805e-06,
"loss": 0.0106,
"num_input_tokens_seen": 4003776,
"step": 6415
},
{
"epoch": 12.891566265060241,
"grad_norm": 0.08291018009185791,
"learning_rate": 3.3807776434408326e-06,
"loss": 0.0806,
"num_input_tokens_seen": 4006656,
"step": 6420
},
{
"epoch": 12.901606425702811,
"grad_norm": 14.322249412536621,
"learning_rate": 3.3724906054534434e-06,
"loss": 0.0295,
"num_input_tokens_seen": 4010432,
"step": 6425
},
{
"epoch": 12.911646586345382,
"grad_norm": 0.2638911008834839,
"learning_rate": 3.3642085650425625e-06,
"loss": 0.0012,
"num_input_tokens_seen": 4013312,
"step": 6430
},
{
"epoch": 12.921686746987952,
"grad_norm": 0.043951284140348434,
"learning_rate": 3.355931547639764e-06,
"loss": 0.0029,
"num_input_tokens_seen": 4016256,
"step": 6435
},
{
"epoch": 12.931726907630523,
"grad_norm": 10.951936721801758,
"learning_rate": 3.3476595786612044e-06,
"loss": 0.006,
"num_input_tokens_seen": 4019264,
"step": 6440
},
{
"epoch": 12.941767068273093,
"grad_norm": 0.3326930105686188,
"learning_rate": 3.3393926835075307e-06,
"loss": 0.0607,
"num_input_tokens_seen": 4022496,
"step": 6445
},
{
"epoch": 12.951807228915662,
"grad_norm": 0.16518734395503998,
"learning_rate": 3.331130887563815e-06,
"loss": 0.0022,
"num_input_tokens_seen": 4025504,
"step": 6450
},
{
"epoch": 12.961847389558233,
"grad_norm": 105.1984634399414,
"learning_rate": 3.322874216199471e-06,
"loss": 0.0381,
"num_input_tokens_seen": 4028672,
"step": 6455
},
{
"epoch": 12.971887550200803,
"grad_norm": 0.021035606041550636,
"learning_rate": 3.3146226947681724e-06,
"loss": 0.152,
"num_input_tokens_seen": 4032672,
"step": 6460
},
{
"epoch": 12.981927710843374,
"grad_norm": 7.944100856781006,
"learning_rate": 3.306376348607787e-06,
"loss": 0.0037,
"num_input_tokens_seen": 4035968,
"step": 6465
},
{
"epoch": 12.991967871485944,
"grad_norm": 0.0038000282365828753,
"learning_rate": 3.2981352030402795e-06,
"loss": 0.0083,
"num_input_tokens_seen": 4039200,
"step": 6470
},
{
"epoch": 13.002008032128513,
"grad_norm": 9.69698429107666,
"learning_rate": 3.289899283371657e-06,
"loss": 0.0037,
"num_input_tokens_seen": 4042080,
"step": 6475
},
{
"epoch": 13.012048192771084,
"grad_norm": 0.06602434813976288,
"learning_rate": 3.2816686148918708e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4045088,
"step": 6480
},
{
"epoch": 13.022088353413654,
"grad_norm": 0.011850385926663876,
"learning_rate": 3.2734432228747527e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4048736,
"step": 6485
},
{
"epoch": 13.032128514056225,
"grad_norm": 0.028685562312602997,
"learning_rate": 3.26522313257793e-06,
"loss": 0.0145,
"num_input_tokens_seen": 4052416,
"step": 6490
},
{
"epoch": 13.042168674698795,
"grad_norm": 0.20602266490459442,
"learning_rate": 3.2570083692427474e-06,
"loss": 0.0338,
"num_input_tokens_seen": 4055328,
"step": 6495
},
{
"epoch": 13.052208835341366,
"grad_norm": 0.17084050178527832,
"learning_rate": 3.248798958094197e-06,
"loss": 0.0124,
"num_input_tokens_seen": 4058496,
"step": 6500
},
{
"epoch": 13.062248995983936,
"grad_norm": 56.381507873535156,
"learning_rate": 3.240594924340835e-06,
"loss": 0.018,
"num_input_tokens_seen": 4060832,
"step": 6505
},
{
"epoch": 13.072289156626505,
"grad_norm": 0.294530987739563,
"learning_rate": 3.232396293174702e-06,
"loss": 0.0559,
"num_input_tokens_seen": 4063584,
"step": 6510
},
{
"epoch": 13.082329317269076,
"grad_norm": 0.17592327296733856,
"learning_rate": 3.224203089771254e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4066368,
"step": 6515
},
{
"epoch": 13.092369477911646,
"grad_norm": 0.05794261023402214,
"learning_rate": 3.2160153392892737e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4069312,
"step": 6520
},
{
"epoch": 13.102409638554217,
"grad_norm": 0.054852936416864395,
"learning_rate": 3.2078330668708057e-06,
"loss": 0.0104,
"num_input_tokens_seen": 4072416,
"step": 6525
},
{
"epoch": 13.112449799196787,
"grad_norm": 0.00928029976785183,
"learning_rate": 3.19965629764107e-06,
"loss": 0.0019,
"num_input_tokens_seen": 4075424,
"step": 6530
},
{
"epoch": 13.122489959839358,
"grad_norm": 0.12830331921577454,
"learning_rate": 3.1914850567083866e-06,
"loss": 0.028,
"num_input_tokens_seen": 4078656,
"step": 6535
},
{
"epoch": 13.132530120481928,
"grad_norm": 0.1490611582994461,
"learning_rate": 3.1833193691641045e-06,
"loss": 0.061,
"num_input_tokens_seen": 4081216,
"step": 6540
},
{
"epoch": 13.142570281124499,
"grad_norm": 1.6569007635116577,
"learning_rate": 3.1751592600825143e-06,
"loss": 0.0281,
"num_input_tokens_seen": 4084256,
"step": 6545
},
{
"epoch": 13.152610441767068,
"grad_norm": 0.006913966964930296,
"learning_rate": 3.1670047545207817e-06,
"loss": 0.0015,
"num_input_tokens_seen": 4087712,
"step": 6550
},
{
"epoch": 13.162650602409638,
"grad_norm": 0.2316937893629074,
"learning_rate": 3.1588558775188647e-06,
"loss": 0.0005,
"num_input_tokens_seen": 4090464,
"step": 6555
},
{
"epoch": 13.17269076305221,
"grad_norm": 0.1495800018310547,
"learning_rate": 3.1507126540994337e-06,
"loss": 0.0249,
"num_input_tokens_seen": 4093600,
"step": 6560
},
{
"epoch": 13.182730923694779,
"grad_norm": 0.04467432200908661,
"learning_rate": 3.1425751092678064e-06,
"loss": 0.019,
"num_input_tokens_seen": 4096864,
"step": 6565
},
{
"epoch": 13.19277108433735,
"grad_norm": 1.2640222311019897,
"learning_rate": 3.134443268011855e-06,
"loss": 0.0003,
"num_input_tokens_seen": 4100480,
"step": 6570
},
{
"epoch": 13.20281124497992,
"grad_norm": 0.034024544060230255,
"learning_rate": 3.126317155301941e-06,
"loss": 0.1201,
"num_input_tokens_seen": 4103712,
"step": 6575
},
{
"epoch": 13.21285140562249,
"grad_norm": 2.878603219985962,
"learning_rate": 3.11819679609084e-06,
"loss": 0.0016,
"num_input_tokens_seen": 4106976,
"step": 6580
},
{
"epoch": 13.22289156626506,
"grad_norm": 8.289514541625977,
"learning_rate": 3.1100822153136513e-06,
"loss": 0.0047,
"num_input_tokens_seen": 4110464,
"step": 6585
},
{
"epoch": 13.23293172690763,
"grad_norm": 0.03438607603311539,
"learning_rate": 3.1019734378877403e-06,
"loss": 0.012,
"num_input_tokens_seen": 4113600,
"step": 6590
},
{
"epoch": 13.242971887550201,
"grad_norm": 0.005333933513611555,
"learning_rate": 3.0938704887126425e-06,
"loss": 0.0006,
"num_input_tokens_seen": 4116800,
"step": 6595
},
{
"epoch": 13.25301204819277,
"grad_norm": 0.6303266286849976,
"learning_rate": 3.0857733926700033e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4120256,
"step": 6600
},
{
"epoch": 13.263052208835342,
"grad_norm": 75.38514709472656,
"learning_rate": 3.077682174623495e-06,
"loss": 0.0218,
"num_input_tokens_seen": 4123136,
"step": 6605
},
{
"epoch": 13.273092369477911,
"grad_norm": 0.02914111502468586,
"learning_rate": 3.0695968594187366e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4126752,
"step": 6610
},
{
"epoch": 13.283132530120483,
"grad_norm": 0.02487659826874733,
"learning_rate": 3.0615174718832218e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4130080,
"step": 6615
},
{
"epoch": 13.293172690763052,
"grad_norm": 0.07562565803527832,
"learning_rate": 3.053444036826246e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4133184,
"step": 6620
},
{
"epoch": 13.303212851405622,
"grad_norm": 0.03663274273276329,
"learning_rate": 3.045376579038821e-06,
"loss": 0.0007,
"num_input_tokens_seen": 4136192,
"step": 6625
},
{
"epoch": 13.313253012048193,
"grad_norm": 0.007939444854855537,
"learning_rate": 3.037315123293611e-06,
"loss": 0.1104,
"num_input_tokens_seen": 4139552,
"step": 6630
},
{
"epoch": 13.323293172690763,
"grad_norm": 0.00802676472812891,
"learning_rate": 3.0292596943448416e-06,
"loss": 0.0125,
"num_input_tokens_seen": 4143040,
"step": 6635
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.20601074397563934,
"learning_rate": 3.0212103169282415e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4146240,
"step": 6640
},
{
"epoch": 13.343373493975903,
"grad_norm": 0.12495647370815277,
"learning_rate": 3.013167015760946e-06,
"loss": 0.0003,
"num_input_tokens_seen": 4150272,
"step": 6645
},
{
"epoch": 13.353413654618475,
"grad_norm": 1.3309050798416138,
"learning_rate": 3.0051298155414426e-06,
"loss": 0.0003,
"num_input_tokens_seen": 4154624,
"step": 6650
},
{
"epoch": 13.363453815261044,
"grad_norm": 0.1865474134683609,
"learning_rate": 2.9970987409494784e-06,
"loss": 0.0158,
"num_input_tokens_seen": 4157152,
"step": 6655
},
{
"epoch": 13.373493975903614,
"grad_norm": 0.7567386031150818,
"learning_rate": 2.989073816645992e-06,
"loss": 0.0005,
"num_input_tokens_seen": 4159552,
"step": 6660
},
{
"epoch": 13.383534136546185,
"grad_norm": 1.041918158531189,
"learning_rate": 2.9810550672730367e-06,
"loss": 0.0344,
"num_input_tokens_seen": 4163008,
"step": 6665
},
{
"epoch": 13.393574297188755,
"grad_norm": 0.1040244847536087,
"learning_rate": 2.9730425174537057e-06,
"loss": 0.0005,
"num_input_tokens_seen": 4166432,
"step": 6670
},
{
"epoch": 13.403614457831326,
"grad_norm": 0.002490447601303458,
"learning_rate": 2.965036191792052e-06,
"loss": 0.0,
"num_input_tokens_seen": 4169472,
"step": 6675
},
{
"epoch": 13.413654618473895,
"grad_norm": 0.009692768566310406,
"learning_rate": 2.9570361148730213e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4172704,
"step": 6680
},
{
"epoch": 13.423694779116467,
"grad_norm": 0.05210626497864723,
"learning_rate": 2.9490423112623646e-06,
"loss": 0.0648,
"num_input_tokens_seen": 4176000,
"step": 6685
},
{
"epoch": 13.433734939759036,
"grad_norm": 0.009161800146102905,
"learning_rate": 2.9410548055065748e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4178720,
"step": 6690
},
{
"epoch": 13.443775100401606,
"grad_norm": 0.010142548009753227,
"learning_rate": 2.933073622132806e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4181760,
"step": 6695
},
{
"epoch": 13.453815261044177,
"grad_norm": 8.248490333557129,
"learning_rate": 2.9250987856487932e-06,
"loss": 0.0604,
"num_input_tokens_seen": 4185152,
"step": 6700
},
{
"epoch": 13.463855421686747,
"grad_norm": 0.09948313981294632,
"learning_rate": 2.9171303205427883e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4188320,
"step": 6705
},
{
"epoch": 13.473895582329318,
"grad_norm": 0.012316162697970867,
"learning_rate": 2.909168251283474e-06,
"loss": 0.0507,
"num_input_tokens_seen": 4191776,
"step": 6710
},
{
"epoch": 13.483935742971887,
"grad_norm": 0.01065347995609045,
"learning_rate": 2.9012126023198973e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4194752,
"step": 6715
},
{
"epoch": 13.493975903614459,
"grad_norm": 0.005095439963042736,
"learning_rate": 2.893263398081386e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4197280,
"step": 6720
},
{
"epoch": 13.504016064257028,
"grad_norm": 0.004570506047457457,
"learning_rate": 2.8853206629774823e-06,
"loss": 0.0579,
"num_input_tokens_seen": 4200736,
"step": 6725
},
{
"epoch": 13.514056224899598,
"grad_norm": 0.0041636135429143906,
"learning_rate": 2.877384421397862e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4203968,
"step": 6730
},
{
"epoch": 13.524096385542169,
"grad_norm": 0.006220974028110504,
"learning_rate": 2.8694546977122595e-06,
"loss": 0.0273,
"num_input_tokens_seen": 4206528,
"step": 6735
},
{
"epoch": 13.534136546184738,
"grad_norm": 0.13364745676517487,
"learning_rate": 2.8615315162703962e-06,
"loss": 0.0669,
"num_input_tokens_seen": 4209472,
"step": 6740
},
{
"epoch": 13.54417670682731,
"grad_norm": 0.054688554257154465,
"learning_rate": 2.853614901401909e-06,
"loss": 0.0193,
"num_input_tokens_seen": 4212960,
"step": 6745
},
{
"epoch": 13.55421686746988,
"grad_norm": 0.6423424482345581,
"learning_rate": 2.84570487741626e-06,
"loss": 0.0445,
"num_input_tokens_seen": 4216160,
"step": 6750
},
{
"epoch": 13.56425702811245,
"grad_norm": 0.0701519250869751,
"learning_rate": 2.837801468602687e-06,
"loss": 0.0006,
"num_input_tokens_seen": 4219232,
"step": 6755
},
{
"epoch": 13.57429718875502,
"grad_norm": 20.19487953186035,
"learning_rate": 2.8299046992300995e-06,
"loss": 0.1348,
"num_input_tokens_seen": 4221920,
"step": 6760
},
{
"epoch": 13.58433734939759,
"grad_norm": 0.07352690398693085,
"learning_rate": 2.8220145935470276e-06,
"loss": 0.0464,
"num_input_tokens_seen": 4225152,
"step": 6765
},
{
"epoch": 13.594377510040161,
"grad_norm": 0.26299330592155457,
"learning_rate": 2.8141311757815454e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4228736,
"step": 6770
},
{
"epoch": 13.60441767068273,
"grad_norm": 0.006310007069259882,
"learning_rate": 2.806254470141174e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4231872,
"step": 6775
},
{
"epoch": 13.614457831325302,
"grad_norm": 76.36661529541016,
"learning_rate": 2.798384500812842e-06,
"loss": 0.028,
"num_input_tokens_seen": 4234784,
"step": 6780
},
{
"epoch": 13.624497991967871,
"grad_norm": 0.17646919190883636,
"learning_rate": 2.790521291962775e-06,
"loss": 0.0424,
"num_input_tokens_seen": 4237696,
"step": 6785
},
{
"epoch": 13.634538152610443,
"grad_norm": 42.649139404296875,
"learning_rate": 2.7826648677364555e-06,
"loss": 0.0216,
"num_input_tokens_seen": 4240928,
"step": 6790
},
{
"epoch": 13.644578313253012,
"grad_norm": 0.010923897847533226,
"learning_rate": 2.774815252258522e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4244480,
"step": 6795
},
{
"epoch": 13.654618473895582,
"grad_norm": 23.903295516967773,
"learning_rate": 2.7669724696327094e-06,
"loss": 0.0276,
"num_input_tokens_seen": 4247552,
"step": 6800
},
{
"epoch": 13.664658634538153,
"grad_norm": 0.018534662202000618,
"learning_rate": 2.759136543941773e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4250304,
"step": 6805
},
{
"epoch": 13.674698795180722,
"grad_norm": 0.014366156421601772,
"learning_rate": 2.751307499247403e-06,
"loss": 0.0005,
"num_input_tokens_seen": 4254016,
"step": 6810
},
{
"epoch": 13.684738955823294,
"grad_norm": 0.11602424085140228,
"learning_rate": 2.743485359590173e-06,
"loss": 0.011,
"num_input_tokens_seen": 4256704,
"step": 6815
},
{
"epoch": 13.694779116465863,
"grad_norm": 0.0550687350332737,
"learning_rate": 2.7356701489894468e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4259584,
"step": 6820
},
{
"epoch": 13.704819277108435,
"grad_norm": 41.35027313232422,
"learning_rate": 2.7278618914433105e-06,
"loss": 0.0145,
"num_input_tokens_seen": 4262368,
"step": 6825
},
{
"epoch": 13.714859437751004,
"grad_norm": 0.009249622002243996,
"learning_rate": 2.720060610928501e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4265792,
"step": 6830
},
{
"epoch": 13.724899598393574,
"grad_norm": 0.05187319219112396,
"learning_rate": 2.712266331400332e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4268448,
"step": 6835
},
{
"epoch": 13.734939759036145,
"grad_norm": 0.032774023711681366,
"learning_rate": 2.704479076792618e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4272192,
"step": 6840
},
{
"epoch": 13.744979919678714,
"grad_norm": 0.01256249938160181,
"learning_rate": 2.696698871017601e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4275040,
"step": 6845
},
{
"epoch": 13.755020080321286,
"grad_norm": 0.006609546486288309,
"learning_rate": 2.6889257379658804e-06,
"loss": 0.0166,
"num_input_tokens_seen": 4278144,
"step": 6850
},
{
"epoch": 13.765060240963855,
"grad_norm": 0.03675635904073715,
"learning_rate": 2.6811597015063373e-06,
"loss": 0.0872,
"num_input_tokens_seen": 4281344,
"step": 6855
},
{
"epoch": 13.775100401606426,
"grad_norm": 38.53529739379883,
"learning_rate": 2.6734007854860596e-06,
"loss": 0.034,
"num_input_tokens_seen": 4284032,
"step": 6860
},
{
"epoch": 13.785140562248996,
"grad_norm": 1.9347114562988281,
"learning_rate": 2.66564901373027e-06,
"loss": 0.0428,
"num_input_tokens_seen": 4286848,
"step": 6865
},
{
"epoch": 13.795180722891565,
"grad_norm": 28.785078048706055,
"learning_rate": 2.657904410042261e-06,
"loss": 0.048,
"num_input_tokens_seen": 4289536,
"step": 6870
},
{
"epoch": 13.805220883534137,
"grad_norm": 0.011087162420153618,
"learning_rate": 2.6501669982033006e-06,
"loss": 0.0023,
"num_input_tokens_seen": 4292960,
"step": 6875
},
{
"epoch": 13.815261044176706,
"grad_norm": 1.338516116142273,
"learning_rate": 2.6424368019725877e-06,
"loss": 0.0009,
"num_input_tokens_seen": 4296064,
"step": 6880
},
{
"epoch": 13.825301204819278,
"grad_norm": 0.04255475848913193,
"learning_rate": 2.634713845087152e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4299744,
"step": 6885
},
{
"epoch": 13.835341365461847,
"grad_norm": 0.021324431523680687,
"learning_rate": 2.626998151261798e-06,
"loss": 0.0173,
"num_input_tokens_seen": 4302912,
"step": 6890
},
{
"epoch": 13.845381526104418,
"grad_norm": 0.011794524267315865,
"learning_rate": 2.6192897441890337e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4306464,
"step": 6895
},
{
"epoch": 13.855421686746988,
"grad_norm": 253.84390258789062,
"learning_rate": 2.6115886475389786e-06,
"loss": 0.0425,
"num_input_tokens_seen": 4310240,
"step": 6900
},
{
"epoch": 13.865461847389557,
"grad_norm": 0.6399533152580261,
"learning_rate": 2.603894884959317e-06,
"loss": 0.1078,
"num_input_tokens_seen": 4313568,
"step": 6905
},
{
"epoch": 13.875502008032129,
"grad_norm": 0.027019036933779716,
"learning_rate": 2.5962084800752064e-06,
"loss": 0.0262,
"num_input_tokens_seen": 4316832,
"step": 6910
},
{
"epoch": 13.885542168674698,
"grad_norm": 0.009591503068804741,
"learning_rate": 2.588529456489211e-06,
"loss": 0.002,
"num_input_tokens_seen": 4319904,
"step": 6915
},
{
"epoch": 13.89558232931727,
"grad_norm": 0.010872164741158485,
"learning_rate": 2.580857837781231e-06,
"loss": 0.0003,
"num_input_tokens_seen": 4322496,
"step": 6920
},
{
"epoch": 13.905622489959839,
"grad_norm": 0.021072586998343468,
"learning_rate": 2.573193647508426e-06,
"loss": 0.0508,
"num_input_tokens_seen": 4325696,
"step": 6925
},
{
"epoch": 13.91566265060241,
"grad_norm": 2.339646577835083,
"learning_rate": 2.5655369092051495e-06,
"loss": 0.0536,
"num_input_tokens_seen": 4328672,
"step": 6930
},
{
"epoch": 13.92570281124498,
"grad_norm": 0.1687999665737152,
"learning_rate": 2.557887646382868e-06,
"loss": 0.03,
"num_input_tokens_seen": 4331680,
"step": 6935
},
{
"epoch": 13.93574297188755,
"grad_norm": 0.004229373764246702,
"learning_rate": 2.5502458825300956e-06,
"loss": 0.0061,
"num_input_tokens_seen": 4334688,
"step": 6940
},
{
"epoch": 13.94578313253012,
"grad_norm": 25.813377380371094,
"learning_rate": 2.542611641112318e-06,
"loss": 0.0367,
"num_input_tokens_seen": 4338240,
"step": 6945
},
{
"epoch": 13.95582329317269,
"grad_norm": 0.0337495282292366,
"learning_rate": 2.534984945571923e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4341824,
"step": 6950
},
{
"epoch": 13.965863453815262,
"grad_norm": 0.009060739539563656,
"learning_rate": 2.5273658193281252e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4344800,
"step": 6955
},
{
"epoch": 13.975903614457831,
"grad_norm": 182.3167266845703,
"learning_rate": 2.519754285776903e-06,
"loss": 0.0111,
"num_input_tokens_seen": 4347936,
"step": 6960
},
{
"epoch": 13.985943775100402,
"grad_norm": 0.032948389649391174,
"learning_rate": 2.5121503682909095e-06,
"loss": 0.0019,
"num_input_tokens_seen": 4350976,
"step": 6965
},
{
"epoch": 13.995983935742972,
"grad_norm": 3.3692076206207275,
"learning_rate": 2.504554090219418e-06,
"loss": 0.0009,
"num_input_tokens_seen": 4354016,
"step": 6970
},
{
"epoch": 14.0,
"eval_loss": 0.8435496687889099,
"eval_runtime": 8.0735,
"eval_samples_per_second": 61.684,
"eval_steps_per_second": 15.483,
"num_input_tokens_seen": 4355328,
"step": 6972
},
{
"epoch": 14.006024096385541,
"grad_norm": 0.05708456039428711,
"learning_rate": 2.496965474888243e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4356832,
"step": 6975
},
{
"epoch": 14.016064257028113,
"grad_norm": 0.002826336305588484,
"learning_rate": 2.489384545599666e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4360320,
"step": 6980
},
{
"epoch": 14.026104417670682,
"grad_norm": 0.015671400353312492,
"learning_rate": 2.4818113256323745e-06,
"loss": 0.0025,
"num_input_tokens_seen": 4363424,
"step": 6985
},
{
"epoch": 14.036144578313253,
"grad_norm": 0.03397877886891365,
"learning_rate": 2.474245838241371e-06,
"loss": 0.0003,
"num_input_tokens_seen": 4366240,
"step": 6990
},
{
"epoch": 14.046184738955823,
"grad_norm": 0.002914144191890955,
"learning_rate": 2.466688106657927e-06,
"loss": 0.0,
"num_input_tokens_seen": 4368704,
"step": 6995
},
{
"epoch": 14.056224899598394,
"grad_norm": 0.005297825671732426,
"learning_rate": 2.459138154089486e-06,
"loss": 0.0157,
"num_input_tokens_seen": 4372320,
"step": 7000
},
{
"epoch": 14.066265060240964,
"grad_norm": 0.021198710426688194,
"learning_rate": 2.4515960037196146e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4375104,
"step": 7005
},
{
"epoch": 14.076305220883533,
"grad_norm": 0.1993994563817978,
"learning_rate": 2.444061678707915e-06,
"loss": 0.017,
"num_input_tokens_seen": 4377888,
"step": 7010
},
{
"epoch": 14.086345381526105,
"grad_norm": 0.2237624228000641,
"learning_rate": 2.4365352021899635e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4381536,
"step": 7015
},
{
"epoch": 14.096385542168674,
"grad_norm": 0.02824407070875168,
"learning_rate": 2.4290165972772363e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4384096,
"step": 7020
},
{
"epoch": 14.106425702811245,
"grad_norm": 0.0019500048365443945,
"learning_rate": 2.42150588705703e-06,
"loss": 0.0072,
"num_input_tokens_seen": 4387168,
"step": 7025
},
{
"epoch": 14.116465863453815,
"grad_norm": 34.514892578125,
"learning_rate": 2.4140030945924137e-06,
"loss": 0.0612,
"num_input_tokens_seen": 4389728,
"step": 7030
},
{
"epoch": 14.126506024096386,
"grad_norm": 0.023807184770703316,
"learning_rate": 2.4065082429221315e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4393184,
"step": 7035
},
{
"epoch": 14.136546184738956,
"grad_norm": 0.007570713758468628,
"learning_rate": 2.3990213550605496e-06,
"loss": 0.0024,
"num_input_tokens_seen": 4396608,
"step": 7040
},
{
"epoch": 14.146586345381525,
"grad_norm": 0.03999396786093712,
"learning_rate": 2.391542453997578e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4399520,
"step": 7045
},
{
"epoch": 14.156626506024097,
"grad_norm": 4.837028503417969,
"learning_rate": 2.3840715626986016e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4402784,
"step": 7050
},
{
"epoch": 14.166666666666666,
"grad_norm": 0.004308843053877354,
"learning_rate": 2.37660870410441e-06,
"loss": 0.0016,
"num_input_tokens_seen": 4406016,
"step": 7055
},
{
"epoch": 14.176706827309237,
"grad_norm": 0.013919214718043804,
"learning_rate": 2.3691539011311276e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4409600,
"step": 7060
},
{
"epoch": 14.186746987951807,
"grad_norm": 0.06342492997646332,
"learning_rate": 2.3617071766701415e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4412352,
"step": 7065
},
{
"epoch": 14.196787148594378,
"grad_norm": 0.05328844487667084,
"learning_rate": 2.354268553588033e-06,
"loss": 0.0,
"num_input_tokens_seen": 4415072,
"step": 7070
},
{
"epoch": 14.206827309236948,
"grad_norm": 0.01577562279999256,
"learning_rate": 2.346838054726505e-06,
"loss": 0.0006,
"num_input_tokens_seen": 4418848,
"step": 7075
},
{
"epoch": 14.216867469879517,
"grad_norm": 1.3071502447128296,
"learning_rate": 2.3394157029023145e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4421664,
"step": 7080
},
{
"epoch": 14.226907630522089,
"grad_norm": 0.034468941390514374,
"learning_rate": 2.3320015209072056e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4424736,
"step": 7085
},
{
"epoch": 14.236947791164658,
"grad_norm": 102.81072998046875,
"learning_rate": 2.324595531507827e-06,
"loss": 0.0074,
"num_input_tokens_seen": 4427296,
"step": 7090
},
{
"epoch": 14.24698795180723,
"grad_norm": 0.0323479101061821,
"learning_rate": 2.317197757445676e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4430848,
"step": 7095
},
{
"epoch": 14.257028112449799,
"grad_norm": 0.03737180680036545,
"learning_rate": 2.309808221437022e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4433536,
"step": 7100
},
{
"epoch": 14.26706827309237,
"grad_norm": 0.0033175817225128412,
"learning_rate": 2.302426946172836e-06,
"loss": 0.0003,
"num_input_tokens_seen": 4436544,
"step": 7105
},
{
"epoch": 14.27710843373494,
"grad_norm": 7.275900363922119,
"learning_rate": 2.295053954318731e-06,
"loss": 0.0113,
"num_input_tokens_seen": 4439424,
"step": 7110
},
{
"epoch": 14.28714859437751,
"grad_norm": 0.00315207545645535,
"learning_rate": 2.2876892685148696e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4442400,
"step": 7115
},
{
"epoch": 14.29718875502008,
"grad_norm": 0.015147917903959751,
"learning_rate": 2.2803329113759256e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4445408,
"step": 7120
},
{
"epoch": 14.30722891566265,
"grad_norm": 36.320823669433594,
"learning_rate": 2.2729849054909812e-06,
"loss": 0.0087,
"num_input_tokens_seen": 4448928,
"step": 7125
},
{
"epoch": 14.317269076305221,
"grad_norm": 0.006546009331941605,
"learning_rate": 2.26564527342349e-06,
"loss": 0.1558,
"num_input_tokens_seen": 4452416,
"step": 7130
},
{
"epoch": 14.32730923694779,
"grad_norm": 0.018252495676279068,
"learning_rate": 2.258314037711184e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4454976,
"step": 7135
},
{
"epoch": 14.337349397590362,
"grad_norm": 0.1240961030125618,
"learning_rate": 2.2509912208660125e-06,
"loss": 0.0016,
"num_input_tokens_seen": 4457984,
"step": 7140
},
{
"epoch": 14.347389558232932,
"grad_norm": 25.80049705505371,
"learning_rate": 2.2436768453740743e-06,
"loss": 0.0348,
"num_input_tokens_seen": 4460992,
"step": 7145
},
{
"epoch": 14.357429718875501,
"grad_norm": 0.013544696383178234,
"learning_rate": 2.236370933695549e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4463904,
"step": 7150
},
{
"epoch": 14.367469879518072,
"grad_norm": 0.022857604548335075,
"learning_rate": 2.2290735082646254e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4466656,
"step": 7155
},
{
"epoch": 14.377510040160642,
"grad_norm": 352.6915283203125,
"learning_rate": 2.2217845914894315e-06,
"loss": 0.0789,
"num_input_tokens_seen": 4470208,
"step": 7160
},
{
"epoch": 14.387550200803213,
"grad_norm": 0.05897314473986626,
"learning_rate": 2.214504205751971e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4474144,
"step": 7165
},
{
"epoch": 14.397590361445783,
"grad_norm": 0.042249646037817,
"learning_rate": 2.2072323734080503e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4477184,
"step": 7170
},
{
"epoch": 14.407630522088354,
"grad_norm": 0.08079247176647186,
"learning_rate": 2.1999691167872107e-06,
"loss": 0.0005,
"num_input_tokens_seen": 4480064,
"step": 7175
},
{
"epoch": 14.417670682730924,
"grad_norm": 0.4898930788040161,
"learning_rate": 2.1927144581926597e-06,
"loss": 0.034,
"num_input_tokens_seen": 4483616,
"step": 7180
},
{
"epoch": 14.427710843373493,
"grad_norm": 0.01570323295891285,
"learning_rate": 2.1854684199012036e-06,
"loss": 0.0182,
"num_input_tokens_seen": 4487488,
"step": 7185
},
{
"epoch": 14.437751004016064,
"grad_norm": 0.021506065502762794,
"learning_rate": 2.178231024163179e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4489696,
"step": 7190
},
{
"epoch": 14.447791164658634,
"grad_norm": 0.21415483951568604,
"learning_rate": 2.1710022932023805e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4493088,
"step": 7195
},
{
"epoch": 14.457831325301205,
"grad_norm": 0.007669499143958092,
"learning_rate": 2.163782249216005e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4497024,
"step": 7200
},
{
"epoch": 14.467871485943775,
"grad_norm": 0.014763305895030499,
"learning_rate": 2.15657091437456e-06,
"loss": 0.0022,
"num_input_tokens_seen": 4499712,
"step": 7205
},
{
"epoch": 14.477911646586346,
"grad_norm": 2.469639778137207,
"learning_rate": 2.1493683108218254e-06,
"loss": 0.0003,
"num_input_tokens_seen": 4502400,
"step": 7210
},
{
"epoch": 14.487951807228916,
"grad_norm": 0.02391161024570465,
"learning_rate": 2.142174460674755e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4505088,
"step": 7215
},
{
"epoch": 14.497991967871485,
"grad_norm": 0.13245859742164612,
"learning_rate": 2.134989386023437e-06,
"loss": 0.0474,
"num_input_tokens_seen": 4508384,
"step": 7220
},
{
"epoch": 14.508032128514056,
"grad_norm": 0.012419681996107101,
"learning_rate": 2.127813108931007e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4511584,
"step": 7225
},
{
"epoch": 14.518072289156626,
"grad_norm": 0.014137223362922668,
"learning_rate": 2.1206456514335794e-06,
"loss": 0.0,
"num_input_tokens_seen": 4514816,
"step": 7230
},
{
"epoch": 14.528112449799197,
"grad_norm": 0.2595354914665222,
"learning_rate": 2.113487035540201e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4517824,
"step": 7235
},
{
"epoch": 14.538152610441767,
"grad_norm": 0.010352588258683681,
"learning_rate": 2.1063372832327535e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4521088,
"step": 7240
},
{
"epoch": 14.548192771084338,
"grad_norm": 0.018129389733076096,
"learning_rate": 2.099196416465913e-06,
"loss": 0.0092,
"num_input_tokens_seen": 4524416,
"step": 7245
},
{
"epoch": 14.558232931726907,
"grad_norm": 0.005967453587800264,
"learning_rate": 2.092064457167066e-06,
"loss": 0.0919,
"num_input_tokens_seen": 4527520,
"step": 7250
},
{
"epoch": 14.568273092369477,
"grad_norm": 0.004777940455824137,
"learning_rate": 2.084941427236245e-06,
"loss": 0.0014,
"num_input_tokens_seen": 4530976,
"step": 7255
},
{
"epoch": 14.578313253012048,
"grad_norm": 0.009696507826447487,
"learning_rate": 2.0778273485460677e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4534048,
"step": 7260
},
{
"epoch": 14.588353413654618,
"grad_norm": 0.009302028454840183,
"learning_rate": 2.0707222429416613e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4537536,
"step": 7265
},
{
"epoch": 14.598393574297189,
"grad_norm": 0.008531935513019562,
"learning_rate": 2.063626132240602e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4540256,
"step": 7270
},
{
"epoch": 14.608433734939759,
"grad_norm": 0.00231292680837214,
"learning_rate": 2.0565390382328448e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4543552,
"step": 7275
},
{
"epoch": 14.61847389558233,
"grad_norm": 0.018894299864768982,
"learning_rate": 2.049460982680656e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4546304,
"step": 7280
},
{
"epoch": 14.6285140562249,
"grad_norm": 0.0022834010887891054,
"learning_rate": 2.04239198731855e-06,
"loss": 0.0,
"num_input_tokens_seen": 4549312,
"step": 7285
},
{
"epoch": 14.638554216867469,
"grad_norm": 0.20074783265590668,
"learning_rate": 2.035332073853217e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4553152,
"step": 7290
},
{
"epoch": 14.64859437751004,
"grad_norm": 0.9416258335113525,
"learning_rate": 2.0282812639634636e-06,
"loss": 0.0692,
"num_input_tokens_seen": 4555712,
"step": 7295
},
{
"epoch": 14.65863453815261,
"grad_norm": 0.0018730978481471539,
"learning_rate": 2.0212395793001384e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4558304,
"step": 7300
},
{
"epoch": 14.668674698795181,
"grad_norm": 0.027835896238684654,
"learning_rate": 2.0142070414860704e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4560992,
"step": 7305
},
{
"epoch": 14.67871485943775,
"grad_norm": 35.51850509643555,
"learning_rate": 2.007183672116002e-06,
"loss": 0.0026,
"num_input_tokens_seen": 4564384,
"step": 7310
},
{
"epoch": 14.688755020080322,
"grad_norm": 0.07114183902740479,
"learning_rate": 2.000169492756523e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4567936,
"step": 7315
},
{
"epoch": 14.698795180722891,
"grad_norm": 1.0696017742156982,
"learning_rate": 1.9931645249459997e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4571072,
"step": 7320
},
{
"epoch": 14.708835341365463,
"grad_norm": 0.0018745064735412598,
"learning_rate": 1.986168790194521e-06,
"loss": 0.0013,
"num_input_tokens_seen": 4574496,
"step": 7325
},
{
"epoch": 14.718875502008032,
"grad_norm": 0.03353925794363022,
"learning_rate": 1.9791823099838107e-06,
"loss": 0.0039,
"num_input_tokens_seen": 4577440,
"step": 7330
},
{
"epoch": 14.728915662650602,
"grad_norm": 0.01741660013794899,
"learning_rate": 1.9722051057671896e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4580608,
"step": 7335
},
{
"epoch": 14.738955823293173,
"grad_norm": 0.011953890323638916,
"learning_rate": 1.965237198969481e-06,
"loss": 0.0201,
"num_input_tokens_seen": 4584160,
"step": 7340
},
{
"epoch": 14.748995983935743,
"grad_norm": 0.27335840463638306,
"learning_rate": 1.9582786109869713e-06,
"loss": 0.0036,
"num_input_tokens_seen": 4587072,
"step": 7345
},
{
"epoch": 14.759036144578314,
"grad_norm": 341.1443176269531,
"learning_rate": 1.951329363187323e-06,
"loss": 0.0585,
"num_input_tokens_seen": 4590272,
"step": 7350
},
{
"epoch": 14.769076305220883,
"grad_norm": 0.007021079305559397,
"learning_rate": 1.944389476909518e-06,
"loss": 0.0,
"num_input_tokens_seen": 4593824,
"step": 7355
},
{
"epoch": 14.779116465863455,
"grad_norm": 0.0021303293760865927,
"learning_rate": 1.9374589734638e-06,
"loss": 0.0,
"num_input_tokens_seen": 4596352,
"step": 7360
},
{
"epoch": 14.789156626506024,
"grad_norm": 35.135520935058594,
"learning_rate": 1.930537874131588e-06,
"loss": 0.0116,
"num_input_tokens_seen": 4599616,
"step": 7365
},
{
"epoch": 14.799196787148594,
"grad_norm": 0.003619756083935499,
"learning_rate": 1.9236262001654372e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4603584,
"step": 7370
},
{
"epoch": 14.809236947791165,
"grad_norm": 0.004386731423437595,
"learning_rate": 1.9167239727889527e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4607136,
"step": 7375
},
{
"epoch": 14.819277108433734,
"grad_norm": 0.012418882921338081,
"learning_rate": 1.9098312131967327e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4609888,
"step": 7380
},
{
"epoch": 14.829317269076306,
"grad_norm": 0.008296665735542774,
"learning_rate": 1.9029479425543052e-06,
"loss": 0.0,
"num_input_tokens_seen": 4612384,
"step": 7385
},
{
"epoch": 14.839357429718875,
"grad_norm": 10.158157348632812,
"learning_rate": 1.8960741819980576e-06,
"loss": 0.0029,
"num_input_tokens_seen": 4615424,
"step": 7390
},
{
"epoch": 14.849397590361447,
"grad_norm": 0.003470318391919136,
"learning_rate": 1.889209952635178e-06,
"loss": 0.0015,
"num_input_tokens_seen": 4618848,
"step": 7395
},
{
"epoch": 14.859437751004016,
"grad_norm": 0.006565614603459835,
"learning_rate": 1.8823552755435847e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4622176,
"step": 7400
},
{
"epoch": 14.869477911646586,
"grad_norm": 0.001681746100075543,
"learning_rate": 1.875510171771865e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4625088,
"step": 7405
},
{
"epoch": 14.879518072289157,
"grad_norm": 0.003082460956647992,
"learning_rate": 1.868674662339207e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4628640,
"step": 7410
},
{
"epoch": 14.889558232931726,
"grad_norm": 0.011504475958645344,
"learning_rate": 1.8618487682353453e-06,
"loss": 0.0,
"num_input_tokens_seen": 4631808,
"step": 7415
},
{
"epoch": 14.899598393574298,
"grad_norm": 0.10181345045566559,
"learning_rate": 1.855032510420477e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4634176,
"step": 7420
},
{
"epoch": 14.909638554216867,
"grad_norm": 0.0036355298943817616,
"learning_rate": 1.848225909825222e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4636704,
"step": 7425
},
{
"epoch": 14.919678714859439,
"grad_norm": 0.012753061018884182,
"learning_rate": 1.8414289873505337e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4639776,
"step": 7430
},
{
"epoch": 14.929718875502008,
"grad_norm": 0.1419248729944229,
"learning_rate": 1.8346417638676533e-06,
"loss": 0.0,
"num_input_tokens_seen": 4642848,
"step": 7435
},
{
"epoch": 14.939759036144578,
"grad_norm": 0.014822770841419697,
"learning_rate": 1.8278642602180435e-06,
"loss": 0.0051,
"num_input_tokens_seen": 4646400,
"step": 7440
},
{
"epoch": 14.949799196787149,
"grad_norm": 28.095958709716797,
"learning_rate": 1.8210964972133095e-06,
"loss": 0.0857,
"num_input_tokens_seen": 4649088,
"step": 7445
},
{
"epoch": 14.959839357429718,
"grad_norm": 0.0036780142690986395,
"learning_rate": 1.814338495635158e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4652512,
"step": 7450
},
{
"epoch": 14.96987951807229,
"grad_norm": 0.06401392817497253,
"learning_rate": 1.8075902762353093e-06,
"loss": 0.0551,
"num_input_tokens_seen": 4655584,
"step": 7455
},
{
"epoch": 14.97991967871486,
"grad_norm": 0.010513795539736748,
"learning_rate": 1.8008518597354575e-06,
"loss": 0.0,
"num_input_tokens_seen": 4658272,
"step": 7460
},
{
"epoch": 14.98995983935743,
"grad_norm": 0.012088480405509472,
"learning_rate": 1.7941232668271863e-06,
"loss": 0.053,
"num_input_tokens_seen": 4662432,
"step": 7465
},
{
"epoch": 15.0,
"grad_norm": 0.0037560504861176014,
"learning_rate": 1.787404518171919e-06,
"loss": 0.1219,
"num_input_tokens_seen": 4665120,
"step": 7470
},
{
"epoch": 15.01004016064257,
"grad_norm": 0.07317201793193817,
"learning_rate": 1.7806956344008475e-06,
"loss": 0.0,
"num_input_tokens_seen": 4668640,
"step": 7475
},
{
"epoch": 15.02008032128514,
"grad_norm": 0.10222109407186508,
"learning_rate": 1.773996636114873e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4672064,
"step": 7480
},
{
"epoch": 15.03012048192771,
"grad_norm": 0.01750057004392147,
"learning_rate": 1.7673075438845423e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4675264,
"step": 7485
},
{
"epoch": 15.040160642570282,
"grad_norm": 0.009696793742477894,
"learning_rate": 1.7606283782499812e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4677728,
"step": 7490
},
{
"epoch": 15.050200803212851,
"grad_norm": 0.049690455198287964,
"learning_rate": 1.753959159720836e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4680608,
"step": 7495
},
{
"epoch": 15.060240963855422,
"grad_norm": 0.006616874132305384,
"learning_rate": 1.7472999087762081e-06,
"loss": 0.0488,
"num_input_tokens_seen": 4683712,
"step": 7500
},
{
"epoch": 15.070281124497992,
"grad_norm": 0.013991329818964005,
"learning_rate": 1.7406506458645923e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4687520,
"step": 7505
},
{
"epoch": 15.080321285140561,
"grad_norm": 0.00402922835201025,
"learning_rate": 1.7340113914038115e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4690560,
"step": 7510
},
{
"epoch": 15.090361445783133,
"grad_norm": 5.365704536437988,
"learning_rate": 1.727382165780957e-06,
"loss": 0.0018,
"num_input_tokens_seen": 4693696,
"step": 7515
},
{
"epoch": 15.100401606425702,
"grad_norm": 0.006192977540194988,
"learning_rate": 1.7207629893523236e-06,
"loss": 0.0672,
"num_input_tokens_seen": 4696800,
"step": 7520
},
{
"epoch": 15.110441767068274,
"grad_norm": 0.11119314283132553,
"learning_rate": 1.7141538824433506e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4699776,
"step": 7525
},
{
"epoch": 15.120481927710843,
"grad_norm": 0.08953115344047546,
"learning_rate": 1.7075548653485535e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4702528,
"step": 7530
},
{
"epoch": 15.130522088353414,
"grad_norm": 0.05335596948862076,
"learning_rate": 1.7009659583314659e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4705152,
"step": 7535
},
{
"epoch": 15.140562248995984,
"grad_norm": 0.05979358032345772,
"learning_rate": 1.6943871816245826e-06,
"loss": 0.0005,
"num_input_tokens_seen": 4707776,
"step": 7540
},
{
"epoch": 15.150602409638553,
"grad_norm": 0.070265032351017,
"learning_rate": 1.6878185554292787e-06,
"loss": 0.0003,
"num_input_tokens_seen": 4710368,
"step": 7545
},
{
"epoch": 15.160642570281125,
"grad_norm": 0.025123678147792816,
"learning_rate": 1.6812600999157753e-06,
"loss": 0.0005,
"num_input_tokens_seen": 4713536,
"step": 7550
},
{
"epoch": 15.170682730923694,
"grad_norm": 0.019195713102817535,
"learning_rate": 1.6747118352230495e-06,
"loss": 0.0,
"num_input_tokens_seen": 4716672,
"step": 7555
},
{
"epoch": 15.180722891566266,
"grad_norm": 0.003091650316491723,
"learning_rate": 1.6681737814587912e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4719872,
"step": 7560
},
{
"epoch": 15.190763052208835,
"grad_norm": 0.008971435017883778,
"learning_rate": 1.6616459586993394e-06,
"loss": 0.0,
"num_input_tokens_seen": 4723776,
"step": 7565
},
{
"epoch": 15.200803212851406,
"grad_norm": 0.007008485496044159,
"learning_rate": 1.6551283869896073e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4726976,
"step": 7570
},
{
"epoch": 15.210843373493976,
"grad_norm": 0.6058262586593628,
"learning_rate": 1.6486210863430424e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4730176,
"step": 7575
},
{
"epoch": 15.220883534136545,
"grad_norm": 0.013418142683804035,
"learning_rate": 1.6421240767415397e-06,
"loss": 0.0,
"num_input_tokens_seen": 4733152,
"step": 7580
},
{
"epoch": 15.230923694779117,
"grad_norm": 0.005203918553888798,
"learning_rate": 1.6356373781354058e-06,
"loss": 0.0,
"num_input_tokens_seen": 4735648,
"step": 7585
},
{
"epoch": 15.240963855421686,
"grad_norm": 0.004218837711960077,
"learning_rate": 1.629161010443277e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4739136,
"step": 7590
},
{
"epoch": 15.251004016064257,
"grad_norm": 0.0019372202223166823,
"learning_rate": 1.6226949935520708e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4742432,
"step": 7595
},
{
"epoch": 15.261044176706827,
"grad_norm": 0.0026460830122232437,
"learning_rate": 1.6162393473169186e-06,
"loss": 0.0,
"num_input_tokens_seen": 4746304,
"step": 7600
},
{
"epoch": 15.271084337349398,
"grad_norm": 0.02616897039115429,
"learning_rate": 1.6097940915611082e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4749536,
"step": 7605
},
{
"epoch": 15.281124497991968,
"grad_norm": 0.002473875880241394,
"learning_rate": 1.60335924607602e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4753120,
"step": 7610
},
{
"epoch": 15.291164658634537,
"grad_norm": 0.007338056806474924,
"learning_rate": 1.5969348306210692e-06,
"loss": 0.0,
"num_input_tokens_seen": 4755968,
"step": 7615
},
{
"epoch": 15.301204819277109,
"grad_norm": 0.018916072323918343,
"learning_rate": 1.5905208649236426e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4758560,
"step": 7620
},
{
"epoch": 15.311244979919678,
"grad_norm": 0.007263466715812683,
"learning_rate": 1.5841173686790368e-06,
"loss": 0.0139,
"num_input_tokens_seen": 4762368,
"step": 7625
},
{
"epoch": 15.32128514056225,
"grad_norm": 0.1382419317960739,
"learning_rate": 1.5777243615504085e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4765888,
"step": 7630
},
{
"epoch": 15.331325301204819,
"grad_norm": 0.0016897142631933093,
"learning_rate": 1.5713418631686938e-06,
"loss": 0.0022,
"num_input_tokens_seen": 4768928,
"step": 7635
},
{
"epoch": 15.34136546184739,
"grad_norm": 0.21704323589801788,
"learning_rate": 1.564969893132568e-06,
"loss": 0.0003,
"num_input_tokens_seen": 4771904,
"step": 7640
},
{
"epoch": 15.35140562248996,
"grad_norm": 0.009838147088885307,
"learning_rate": 1.5586084710083737e-06,
"loss": 0.0,
"num_input_tokens_seen": 4775104,
"step": 7645
},
{
"epoch": 15.36144578313253,
"grad_norm": 0.0028061573393642902,
"learning_rate": 1.5522576163300635e-06,
"loss": 0.0,
"num_input_tokens_seen": 4778496,
"step": 7650
},
{
"epoch": 15.3714859437751,
"grad_norm": 0.0043613361194729805,
"learning_rate": 1.545917348599147e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4781344,
"step": 7655
},
{
"epoch": 15.38152610441767,
"grad_norm": 0.00559863680973649,
"learning_rate": 1.5395876872846132e-06,
"loss": 0.0213,
"num_input_tokens_seen": 4784352,
"step": 7660
},
{
"epoch": 15.391566265060241,
"grad_norm": 0.004609005060046911,
"learning_rate": 1.5332686518228951e-06,
"loss": 0.0,
"num_input_tokens_seen": 4787424,
"step": 7665
},
{
"epoch": 15.401606425702811,
"grad_norm": 0.00308047141879797,
"learning_rate": 1.5269602616177842e-06,
"loss": 0.0,
"num_input_tokens_seen": 4790656,
"step": 7670
},
{
"epoch": 15.411646586345382,
"grad_norm": 0.033832404762506485,
"learning_rate": 1.5206625360403943e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4793536,
"step": 7675
},
{
"epoch": 15.421686746987952,
"grad_norm": 0.011339832097291946,
"learning_rate": 1.5143754944290862e-06,
"loss": 0.0,
"num_input_tokens_seen": 4796704,
"step": 7680
},
{
"epoch": 15.431726907630521,
"grad_norm": 0.09786521643400192,
"learning_rate": 1.5080991560894142e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4800032,
"step": 7685
},
{
"epoch": 15.441767068273093,
"grad_norm": 0.003034294117242098,
"learning_rate": 1.5018335402940681e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4803552,
"step": 7690
},
{
"epoch": 15.451807228915662,
"grad_norm": 0.002162687247619033,
"learning_rate": 1.4955786662828053e-06,
"loss": 0.0,
"num_input_tokens_seen": 4806848,
"step": 7695
},
{
"epoch": 15.461847389558233,
"grad_norm": 0.022024383768439293,
"learning_rate": 1.4893345532624086e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4809152,
"step": 7700
},
{
"epoch": 15.471887550200803,
"grad_norm": 0.005043026525527239,
"learning_rate": 1.4831012204066114e-06,
"loss": 0.0,
"num_input_tokens_seen": 4812064,
"step": 7705
},
{
"epoch": 15.481927710843374,
"grad_norm": 0.007820216938853264,
"learning_rate": 1.4768786868560443e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4815040,
"step": 7710
},
{
"epoch": 15.491967871485944,
"grad_norm": 0.003925836179405451,
"learning_rate": 1.4706669717181782e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4818880,
"step": 7715
},
{
"epoch": 15.502008032128515,
"grad_norm": 0.030795995146036148,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.0345,
"num_input_tokens_seen": 4821696,
"step": 7720
},
{
"epoch": 15.512048192771084,
"grad_norm": 0.003375057829543948,
"learning_rate": 1.4582760729442707e-06,
"loss": 0.0,
"num_input_tokens_seen": 4824608,
"step": 7725
},
{
"epoch": 15.522088353413654,
"grad_norm": 1.4269546270370483,
"learning_rate": 1.4520969273568364e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4827360,
"step": 7730
},
{
"epoch": 15.532128514056225,
"grad_norm": 0.0026180180720984936,
"learning_rate": 1.445928676279199e-06,
"loss": 0.0,
"num_input_tokens_seen": 4830496,
"step": 7735
},
{
"epoch": 15.542168674698795,
"grad_norm": 0.8371602892875671,
"learning_rate": 1.4397713386521444e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4833536,
"step": 7740
},
{
"epoch": 15.552208835341366,
"grad_norm": 0.0011596613330766559,
"learning_rate": 1.4336249333829466e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4836192,
"step": 7745
},
{
"epoch": 15.562248995983936,
"grad_norm": 0.0891217365860939,
"learning_rate": 1.4274894793453075e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4840320,
"step": 7750
},
{
"epoch": 15.572289156626507,
"grad_norm": 22.992704391479492,
"learning_rate": 1.421364995379309e-06,
"loss": 0.0766,
"num_input_tokens_seen": 4843744,
"step": 7755
},
{
"epoch": 15.582329317269076,
"grad_norm": 0.0040291850455105305,
"learning_rate": 1.4152515002913358e-06,
"loss": 0.0003,
"num_input_tokens_seen": 4846464,
"step": 7760
},
{
"epoch": 15.592369477911646,
"grad_norm": 0.005161698441952467,
"learning_rate": 1.4091490128540374e-06,
"loss": 0.0,
"num_input_tokens_seen": 4849184,
"step": 7765
},
{
"epoch": 15.602409638554217,
"grad_norm": 43.281211853027344,
"learning_rate": 1.403057551806259e-06,
"loss": 0.0025,
"num_input_tokens_seen": 4851936,
"step": 7770
},
{
"epoch": 15.612449799196787,
"grad_norm": 0.0021972369868308306,
"learning_rate": 1.3969771358529866e-06,
"loss": 0.0003,
"num_input_tokens_seen": 4855040,
"step": 7775
},
{
"epoch": 15.622489959839358,
"grad_norm": 0.014925581403076649,
"learning_rate": 1.3909077836652968e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4857952,
"step": 7780
},
{
"epoch": 15.632530120481928,
"grad_norm": 0.30831727385520935,
"learning_rate": 1.3848495138802803e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4860960,
"step": 7785
},
{
"epoch": 15.642570281124499,
"grad_norm": 0.010729658417403698,
"learning_rate": 1.3788023451010114e-06,
"loss": 0.0,
"num_input_tokens_seen": 4864544,
"step": 7790
},
{
"epoch": 15.652610441767068,
"grad_norm": 0.10216429084539413,
"learning_rate": 1.3727662958964627e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4867616,
"step": 7795
},
{
"epoch": 15.662650602409638,
"grad_norm": 0.001300434349104762,
"learning_rate": 1.3667413848014738e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4870304,
"step": 7800
},
{
"epoch": 15.67269076305221,
"grad_norm": 0.012358872219920158,
"learning_rate": 1.3607276303166766e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4874240,
"step": 7805
},
{
"epoch": 15.682730923694779,
"grad_norm": 0.035743288695812225,
"learning_rate": 1.3547250509084453e-06,
"loss": 0.0249,
"num_input_tokens_seen": 4876960,
"step": 7810
},
{
"epoch": 15.69277108433735,
"grad_norm": 0.014657980762422085,
"learning_rate": 1.3487336650088417e-06,
"loss": 0.0,
"num_input_tokens_seen": 4879872,
"step": 7815
},
{
"epoch": 15.70281124497992,
"grad_norm": 0.03732848912477493,
"learning_rate": 1.3427534910155475e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4883424,
"step": 7820
},
{
"epoch": 15.71285140562249,
"grad_norm": 0.0021094426047056913,
"learning_rate": 1.3367845472918272e-06,
"loss": 0.0061,
"num_input_tokens_seen": 4886912,
"step": 7825
},
{
"epoch": 15.72289156626506,
"grad_norm": 0.0018562499899417162,
"learning_rate": 1.330826852166454e-06,
"loss": 0.0611,
"num_input_tokens_seen": 4890336,
"step": 7830
},
{
"epoch": 15.73293172690763,
"grad_norm": 0.0016846376238390803,
"learning_rate": 1.3248804239336616e-06,
"loss": 0.0278,
"num_input_tokens_seen": 4894144,
"step": 7835
},
{
"epoch": 15.742971887550201,
"grad_norm": 0.022804176434874535,
"learning_rate": 1.3189452808530866e-06,
"loss": 0.0006,
"num_input_tokens_seen": 4897536,
"step": 7840
},
{
"epoch": 15.75301204819277,
"grad_norm": 0.023388752713799477,
"learning_rate": 1.3130214411497121e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4900544,
"step": 7845
},
{
"epoch": 15.763052208835342,
"grad_norm": 0.05187452584505081,
"learning_rate": 1.3071089230138124e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4903680,
"step": 7850
},
{
"epoch": 15.773092369477911,
"grad_norm": 0.017268287017941475,
"learning_rate": 1.3012077446008969e-06,
"loss": 0.0004,
"num_input_tokens_seen": 4906528,
"step": 7855
},
{
"epoch": 15.783132530120483,
"grad_norm": 0.013094757683575153,
"learning_rate": 1.2953179240316533e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4910176,
"step": 7860
},
{
"epoch": 15.793172690763052,
"grad_norm": 0.0332111194729805,
"learning_rate": 1.289439479391893e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4913184,
"step": 7865
},
{
"epoch": 15.803212851405622,
"grad_norm": 0.003533572657033801,
"learning_rate": 1.2835724287325001e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4916320,
"step": 7870
},
{
"epoch": 15.813253012048193,
"grad_norm": 0.00497691472992301,
"learning_rate": 1.277716790069361e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4919360,
"step": 7875
},
{
"epoch": 15.823293172690763,
"grad_norm": 0.03421995788812637,
"learning_rate": 1.2718725813833322e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4922880,
"step": 7880
},
{
"epoch": 15.833333333333334,
"grad_norm": 0.13940131664276123,
"learning_rate": 1.266039820620159e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4925408,
"step": 7885
},
{
"epoch": 15.843373493975903,
"grad_norm": 0.011046111583709717,
"learning_rate": 1.2602185256904453e-06,
"loss": 0.0,
"num_input_tokens_seen": 4928896,
"step": 7890
},
{
"epoch": 15.853413654618475,
"grad_norm": 0.0013307826593518257,
"learning_rate": 1.2544087144695826e-06,
"loss": 0.0,
"num_input_tokens_seen": 4931872,
"step": 7895
},
{
"epoch": 15.863453815261044,
"grad_norm": 0.014156588353216648,
"learning_rate": 1.2486104047976937e-06,
"loss": 0.0,
"num_input_tokens_seen": 4935136,
"step": 7900
},
{
"epoch": 15.873493975903614,
"grad_norm": 0.005145099479705095,
"learning_rate": 1.2428236144795959e-06,
"loss": 0.0,
"num_input_tokens_seen": 4938176,
"step": 7905
},
{
"epoch": 15.883534136546185,
"grad_norm": 0.02292685955762863,
"learning_rate": 1.2370483612847201e-06,
"loss": 0.0,
"num_input_tokens_seen": 4940672,
"step": 7910
},
{
"epoch": 15.893574297188755,
"grad_norm": 0.009769303724169731,
"learning_rate": 1.2312846629470826e-06,
"loss": 0.0,
"num_input_tokens_seen": 4944192,
"step": 7915
},
{
"epoch": 15.903614457831326,
"grad_norm": 0.0011390167055651546,
"learning_rate": 1.225532537165211e-06,
"loss": 0.0,
"num_input_tokens_seen": 4947488,
"step": 7920
},
{
"epoch": 15.913654618473895,
"grad_norm": 0.0024841073900461197,
"learning_rate": 1.219792001602101e-06,
"loss": 0.0,
"num_input_tokens_seen": 4949824,
"step": 7925
},
{
"epoch": 15.923694779116467,
"grad_norm": 0.0029108019080013037,
"learning_rate": 1.2140630738851544e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4952768,
"step": 7930
},
{
"epoch": 15.933734939759036,
"grad_norm": 0.09127728641033173,
"learning_rate": 1.2083457716061326e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4956544,
"step": 7935
},
{
"epoch": 15.943775100401606,
"grad_norm": 0.0016728171613067389,
"learning_rate": 1.2026401123210968e-06,
"loss": 0.0005,
"num_input_tokens_seen": 4959648,
"step": 7940
},
{
"epoch": 15.953815261044177,
"grad_norm": 0.002108454005792737,
"learning_rate": 1.1969461135503573e-06,
"loss": 0.0104,
"num_input_tokens_seen": 4961888,
"step": 7945
},
{
"epoch": 15.963855421686747,
"grad_norm": 0.0029469470027834177,
"learning_rate": 1.1912637927784176e-06,
"loss": 0.0,
"num_input_tokens_seen": 4965216,
"step": 7950
},
{
"epoch": 15.973895582329318,
"grad_norm": 0.0053405375219881535,
"learning_rate": 1.1855931674539222e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4968608,
"step": 7955
},
{
"epoch": 15.983935742971887,
"grad_norm": 0.004863628186285496,
"learning_rate": 1.1799342549896027e-06,
"loss": 0.0,
"num_input_tokens_seen": 4971456,
"step": 7960
},
{
"epoch": 15.993975903614459,
"grad_norm": 0.0013398093869909644,
"learning_rate": 1.174287072762224e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4974112,
"step": 7965
},
{
"epoch": 16.0,
"eval_loss": 1.0401936769485474,
"eval_runtime": 8.0743,
"eval_samples_per_second": 61.677,
"eval_steps_per_second": 15.481,
"num_input_tokens_seen": 4976032,
"step": 7968
},
{
"epoch": 16.004016064257026,
"grad_norm": 0.005879928823560476,
"learning_rate": 1.1686516381125307e-06,
"loss": 0.0017,
"num_input_tokens_seen": 4977152,
"step": 7970
},
{
"epoch": 16.014056224899598,
"grad_norm": 0.06779361516237259,
"learning_rate": 1.163027968345195e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4981088,
"step": 7975
},
{
"epoch": 16.02409638554217,
"grad_norm": 0.015680238604545593,
"learning_rate": 1.1574160807287615e-06,
"loss": 0.0,
"num_input_tokens_seen": 4984064,
"step": 7980
},
{
"epoch": 16.03413654618474,
"grad_norm": 0.0018723757239058614,
"learning_rate": 1.1518159924955974e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4987424,
"step": 7985
},
{
"epoch": 16.044176706827308,
"grad_norm": 0.14989595115184784,
"learning_rate": 1.1462277208418338e-06,
"loss": 0.0001,
"num_input_tokens_seen": 4990112,
"step": 7990
},
{
"epoch": 16.05421686746988,
"grad_norm": 0.0020622089505195618,
"learning_rate": 1.1406512829273253e-06,
"loss": 0.0,
"num_input_tokens_seen": 4993600,
"step": 7995
},
{
"epoch": 16.06425702811245,
"grad_norm": 0.009413612075150013,
"learning_rate": 1.1350866958755757e-06,
"loss": 0.0,
"num_input_tokens_seen": 4996512,
"step": 8000
},
{
"epoch": 16.07429718875502,
"grad_norm": 0.18731117248535156,
"learning_rate": 1.1295339767737125e-06,
"loss": 0.0002,
"num_input_tokens_seen": 4999168,
"step": 8005
},
{
"epoch": 16.08433734939759,
"grad_norm": 0.019764700904488564,
"learning_rate": 1.1239931426724076e-06,
"loss": 0.0,
"num_input_tokens_seen": 5002336,
"step": 8010
},
{
"epoch": 16.09437751004016,
"grad_norm": 0.006744361482560635,
"learning_rate": 1.1184642105858484e-06,
"loss": 0.0001,
"num_input_tokens_seen": 5005536,
"step": 8015
},
{
"epoch": 16.104417670682732,
"grad_norm": 0.16189798712730408,
"learning_rate": 1.1129471974916696e-06,
"loss": 0.0001,
"num_input_tokens_seen": 5008192,
"step": 8020
},
{
"epoch": 16.1144578313253,
"grad_norm": 0.0014629423385486007,
"learning_rate": 1.1074421203309033e-06,
"loss": 0.0001,
"num_input_tokens_seen": 5010944,
"step": 8025
},
{
"epoch": 16.12449799196787,
"grad_norm": 0.007868721149861813,
"learning_rate": 1.1019489960079389e-06,
"loss": 0.0,
"num_input_tokens_seen": 5013888,
"step": 8030
},
{
"epoch": 16.134538152610443,
"grad_norm": 0.09024116396903992,
"learning_rate": 1.0964678413904529e-06,
"loss": 0.0001,
"num_input_tokens_seen": 5017184,
"step": 8035
},
{
"epoch": 16.14457831325301,
"grad_norm": 0.0039504412561655045,
"learning_rate": 1.0909986733093737e-06,
"loss": 0.0002,
"num_input_tokens_seen": 5020256,
"step": 8040
},
{
"epoch": 16.15461847389558,
"grad_norm": 0.001555570401251316,
"learning_rate": 1.0855415085588194e-06,
"loss": 0.0,
"num_input_tokens_seen": 5023040,
"step": 8045
},
{
"epoch": 16.164658634538153,
"grad_norm": 0.011033423244953156,
"learning_rate": 1.08009636389605e-06,
"loss": 0.0,
"num_input_tokens_seen": 5026752,
"step": 8050
},
{
"epoch": 16.174698795180724,
"grad_norm": 0.0012489588698372245,
"learning_rate": 1.0746632560414154e-06,
"loss": 0.0004,
"num_input_tokens_seen": 5029536,
"step": 8055
},
{
"epoch": 16.184738955823292,
"grad_norm": 0.0016975250327959657,
"learning_rate": 1.069242201678305e-06,
"loss": 0.0001,
"num_input_tokens_seen": 5032832,
"step": 8060
},
{
"epoch": 16.194779116465863,
"grad_norm": 0.0009509876254014671,
"learning_rate": 1.0638332174530953e-06,
"loss": 0.0,
"num_input_tokens_seen": 5036416,
"step": 8065
},
{
"epoch": 16.204819277108435,
"grad_norm": 0.0030300356447696686,
"learning_rate": 1.058436319975098e-06,
"loss": 0.0,
"num_input_tokens_seen": 5039392,
"step": 8070
},
{
"epoch": 16.214859437751002,
"grad_norm": 0.008067947812378407,
"learning_rate": 1.053051525816512e-06,
"loss": 0.0,
"num_input_tokens_seen": 5042720,
"step": 8075
},
{
"epoch": 16.224899598393574,
"grad_norm": 0.0013017337769269943,
"learning_rate": 1.0476788515123687e-06,
"loss": 0.0,
"num_input_tokens_seen": 5045760,
"step": 8080
},
{
"epoch": 16.234939759036145,
"grad_norm": 0.001618090900592506,
"learning_rate": 1.0423183135604874e-06,
"loss": 0.0,
"num_input_tokens_seen": 5048032,
"step": 8085
},
{
"epoch": 16.244979919678716,
"grad_norm": 0.012940296903252602,
"learning_rate": 1.036969928421413e-06,
"loss": 0.0001,
"num_input_tokens_seen": 5051040,
"step": 8090
},
{
"epoch": 16.255020080321284,
"grad_norm": 0.004096478223800659,
"learning_rate": 1.0316337125183817e-06,
"loss": 0.0,
"num_input_tokens_seen": 5054080,
"step": 8095
},
{
"epoch": 16.265060240963855,
"grad_norm": 0.003049603197723627,
"learning_rate": 1.0263096822372537e-06,
"loss": 0.0,
"num_input_tokens_seen": 5057088,
"step": 8100
},
{
"epoch": 16.275100401606426,
"grad_norm": 0.016989678144454956,
"learning_rate": 1.0209978539264747e-06,
"loss": 0.0008,
"num_input_tokens_seen": 5059904,
"step": 8105
},
{
"epoch": 16.285140562248998,
"grad_norm": 0.0049511161632835865,
"learning_rate": 1.0156982438970254e-06,
"loss": 0.0,
"num_input_tokens_seen": 5062656,
"step": 8110
},
{
"epoch": 16.295180722891565,
"grad_norm": 0.007807273417711258,
"learning_rate": 1.010410868422359e-06,
"loss": 0.0,
"num_input_tokens_seen": 5066240,
"step": 8115
},
{
"epoch": 16.305220883534137,
"grad_norm": 0.00864755641669035,
"learning_rate": 1.0051357437383708e-06,
"loss": 0.0018,
"num_input_tokens_seen": 5069600,
"step": 8120
},
{
"epoch": 16.315261044176708,
"grad_norm": 0.0649455189704895,
"learning_rate": 9.998728860433277e-07,
"loss": 0.0,
"num_input_tokens_seen": 5073280,
"step": 8125
},
{
"epoch": 16.325301204819276,
"grad_norm": 0.012507440522313118,
"learning_rate": 9.94622311497836e-07,
"loss": 0.0,
"num_input_tokens_seen": 5076128,
"step": 8130
},
{
"epoch": 16.335341365461847,
"grad_norm": 0.0012168296379968524,
"learning_rate": 9.893840362247809e-07,
"loss": 0.0,
"num_input_tokens_seen": 5079776,
"step": 8135
},
{
"epoch": 16.34538152610442,
"grad_norm": 0.0013548055430874228,
"learning_rate": 9.841580763092812e-07,
"loss": 0.0139,
"num_input_tokens_seen": 5083168,
"step": 8140
},
{
"epoch": 16.355421686746986,
"grad_norm": 3.4245684146881104,
"learning_rate": 9.789444477986375e-07,
"loss": 0.0023,
"num_input_tokens_seen": 5085792,
"step": 8145
},
{
"epoch": 16.365461847389557,
"grad_norm": 0.003231622511520982,
"learning_rate": 9.737431667022866e-07,
"loss": 0.0,
"num_input_tokens_seen": 5089632,
"step": 8150
},
{
"epoch": 16.37550200803213,
"grad_norm": 0.011450034566223621,
"learning_rate": 9.685542489917494e-07,
"loss": 0.0,
"num_input_tokens_seen": 5092064,
"step": 8155
},
{
"epoch": 16.3855421686747,
"grad_norm": 0.00948801077902317,
"learning_rate": 9.633777106005826e-07,
"loss": 0.0003,
"num_input_tokens_seen": 5095488,
"step": 8160
},
{
"epoch": 16.395582329317268,
"grad_norm": 0.0029805630911141634,
"learning_rate": 9.582135674243292e-07,
"loss": 0.0,
"num_input_tokens_seen": 5098944,
"step": 8165
},
{
"epoch": 16.40562248995984,
"grad_norm": 0.002190305618569255,
"learning_rate": 9.530618353204718e-07,
"loss": 0.0,
"num_input_tokens_seen": 5101600,
"step": 8170
},
{
"epoch": 16.41566265060241,
"grad_norm": 0.0043740589171648026,
"learning_rate": 9.479225301083811e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5103904,
"step": 8175
},
{
"epoch": 16.42570281124498,
"grad_norm": 0.0013769206125289202,
"learning_rate": 9.427956675692695e-07,
"loss": 0.0,
"num_input_tokens_seen": 5107616,
"step": 8180
},
{
"epoch": 16.43574297188755,
"grad_norm": 0.04764657840132713,
"learning_rate": 9.376812634461418e-07,
"loss": 0.0,
"num_input_tokens_seen": 5110400,
"step": 8185
},
{
"epoch": 16.44578313253012,
"grad_norm": 0.0012534708948805928,
"learning_rate": 9.32579333443746e-07,
"loss": 0.0033,
"num_input_tokens_seen": 5113504,
"step": 8190
},
{
"epoch": 16.455823293172692,
"grad_norm": 0.015134445391595364,
"learning_rate": 9.27489893228527e-07,
"loss": 0.0,
"num_input_tokens_seen": 5116768,
"step": 8195
},
{
"epoch": 16.46586345381526,
"grad_norm": 0.013975398615002632,
"learning_rate": 9.224129584285768e-07,
"loss": 0.0013,
"num_input_tokens_seen": 5120224,
"step": 8200
},
{
"epoch": 16.47590361445783,
"grad_norm": 0.2885809540748596,
"learning_rate": 9.173485446335862e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5123584,
"step": 8205
},
{
"epoch": 16.485943775100402,
"grad_norm": 0.027181854471564293,
"learning_rate": 9.122966673948025e-07,
"loss": 0.0,
"num_input_tokens_seen": 5126752,
"step": 8210
},
{
"epoch": 16.495983935742974,
"grad_norm": 0.007998216897249222,
"learning_rate": 9.072573422249692e-07,
"loss": 0.0,
"num_input_tokens_seen": 5129312,
"step": 8215
},
{
"epoch": 16.50602409638554,
"grad_norm": 0.001557971932925284,
"learning_rate": 9.022305845982948e-07,
"loss": 0.0584,
"num_input_tokens_seen": 5132192,
"step": 8220
},
{
"epoch": 16.516064257028113,
"grad_norm": 0.004761459771543741,
"learning_rate": 8.972164099503899e-07,
"loss": 0.0002,
"num_input_tokens_seen": 5135520,
"step": 8225
},
{
"epoch": 16.526104417670684,
"grad_norm": 0.049848757684230804,
"learning_rate": 8.922148336782288e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5138432,
"step": 8230
},
{
"epoch": 16.53614457831325,
"grad_norm": 10.381841659545898,
"learning_rate": 8.87225871140105e-07,
"loss": 0.004,
"num_input_tokens_seen": 5141952,
"step": 8235
},
{
"epoch": 16.546184738955823,
"grad_norm": 0.0021748128347098827,
"learning_rate": 8.822495376555695e-07,
"loss": 0.0,
"num_input_tokens_seen": 5145344,
"step": 8240
},
{
"epoch": 16.556224899598394,
"grad_norm": 0.06430647522211075,
"learning_rate": 8.772858485054042e-07,
"loss": 0.0003,
"num_input_tokens_seen": 5148096,
"step": 8245
},
{
"epoch": 16.566265060240966,
"grad_norm": 0.0011095363879576325,
"learning_rate": 8.723348189315534e-07,
"loss": 0.0003,
"num_input_tokens_seen": 5150784,
"step": 8250
},
{
"epoch": 16.576305220883533,
"grad_norm": 0.0028599195647984743,
"learning_rate": 8.673964641370974e-07,
"loss": 0.0,
"num_input_tokens_seen": 5153056,
"step": 8255
},
{
"epoch": 16.586345381526105,
"grad_norm": 0.0014031616738066077,
"learning_rate": 8.624707992861897e-07,
"loss": 0.0,
"num_input_tokens_seen": 5156448,
"step": 8260
},
{
"epoch": 16.596385542168676,
"grad_norm": 0.002864877926185727,
"learning_rate": 8.575578395040202e-07,
"loss": 0.0,
"num_input_tokens_seen": 5160672,
"step": 8265
},
{
"epoch": 16.606425702811244,
"grad_norm": 0.0012285938719287515,
"learning_rate": 8.526575998767638e-07,
"loss": 0.0,
"num_input_tokens_seen": 5163840,
"step": 8270
},
{
"epoch": 16.616465863453815,
"grad_norm": 0.0011251465184614062,
"learning_rate": 8.477700954515372e-07,
"loss": 0.0,
"num_input_tokens_seen": 5167552,
"step": 8275
},
{
"epoch": 16.626506024096386,
"grad_norm": 0.09981559216976166,
"learning_rate": 8.428953412363495e-07,
"loss": 0.0,
"num_input_tokens_seen": 5170496,
"step": 8280
},
{
"epoch": 16.636546184738958,
"grad_norm": 0.004276310559362173,
"learning_rate": 8.380333522000588e-07,
"loss": 0.0,
"num_input_tokens_seen": 5173504,
"step": 8285
},
{
"epoch": 16.646586345381525,
"grad_norm": 0.037310317158699036,
"learning_rate": 8.331841432723253e-07,
"loss": 0.0,
"num_input_tokens_seen": 5176640,
"step": 8290
},
{
"epoch": 16.656626506024097,
"grad_norm": 0.0023100704420357943,
"learning_rate": 8.28347729343566e-07,
"loss": 0.0,
"num_input_tokens_seen": 5180096,
"step": 8295
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.06090432405471802,
"learning_rate": 8.235241252649073e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5183616,
"step": 8300
},
{
"epoch": 16.676706827309236,
"grad_norm": 24.074342727661133,
"learning_rate": 8.187133458481416e-07,
"loss": 0.0765,
"num_input_tokens_seen": 5186720,
"step": 8305
},
{
"epoch": 16.686746987951807,
"grad_norm": 0.030896145850419998,
"learning_rate": 8.139154058656801e-07,
"loss": 0.0,
"num_input_tokens_seen": 5190560,
"step": 8310
},
{
"epoch": 16.696787148594378,
"grad_norm": 0.006053561810404062,
"learning_rate": 8.091303200505074e-07,
"loss": 0.0,
"num_input_tokens_seen": 5194304,
"step": 8315
},
{
"epoch": 16.70682730923695,
"grad_norm": 0.0027364008128643036,
"learning_rate": 8.043581030961372e-07,
"loss": 0.0,
"num_input_tokens_seen": 5197792,
"step": 8320
},
{
"epoch": 16.716867469879517,
"grad_norm": 0.002491719089448452,
"learning_rate": 7.99598769656571e-07,
"loss": 0.0155,
"num_input_tokens_seen": 5201280,
"step": 8325
},
{
"epoch": 16.72690763052209,
"grad_norm": 0.015071824193000793,
"learning_rate": 7.948523343462411e-07,
"loss": 0.0,
"num_input_tokens_seen": 5204704,
"step": 8330
},
{
"epoch": 16.73694779116466,
"grad_norm": 0.0035658315755426884,
"learning_rate": 7.901188117399817e-07,
"loss": 0.0,
"num_input_tokens_seen": 5208320,
"step": 8335
},
{
"epoch": 16.746987951807228,
"grad_norm": 0.0027445750311017036,
"learning_rate": 7.853982163729684e-07,
"loss": 0.0,
"num_input_tokens_seen": 5211136,
"step": 8340
},
{
"epoch": 16.7570281124498,
"grad_norm": 0.006281423382461071,
"learning_rate": 7.806905627406891e-07,
"loss": 0.0,
"num_input_tokens_seen": 5214528,
"step": 8345
},
{
"epoch": 16.76706827309237,
"grad_norm": 0.005827156826853752,
"learning_rate": 7.759958652988858e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5218048,
"step": 8350
},
{
"epoch": 16.77710843373494,
"grad_norm": 0.009136940352618694,
"learning_rate": 7.713141384635186e-07,
"loss": 0.0,
"num_input_tokens_seen": 5221248,
"step": 8355
},
{
"epoch": 16.78714859437751,
"grad_norm": 0.005150144919753075,
"learning_rate": 7.666453966107201e-07,
"loss": 0.0,
"num_input_tokens_seen": 5223776,
"step": 8360
},
{
"epoch": 16.79718875502008,
"grad_norm": 0.004599465057253838,
"learning_rate": 7.619896540767435e-07,
"loss": 0.0,
"num_input_tokens_seen": 5226176,
"step": 8365
},
{
"epoch": 16.80722891566265,
"grad_norm": 0.006183779798448086,
"learning_rate": 7.573469251579346e-07,
"loss": 0.0,
"num_input_tokens_seen": 5229312,
"step": 8370
},
{
"epoch": 16.81726907630522,
"grad_norm": 0.0017236809944733977,
"learning_rate": 7.527172241106718e-07,
"loss": 0.0067,
"num_input_tokens_seen": 5231744,
"step": 8375
},
{
"epoch": 16.82730923694779,
"grad_norm": 0.010270994156599045,
"learning_rate": 7.481005651513312e-07,
"loss": 0.0,
"num_input_tokens_seen": 5234464,
"step": 8380
},
{
"epoch": 16.837349397590362,
"grad_norm": 0.0016576339257881045,
"learning_rate": 7.434969624562405e-07,
"loss": 0.0,
"num_input_tokens_seen": 5238368,
"step": 8385
},
{
"epoch": 16.847389558232933,
"grad_norm": 0.001661753747612238,
"learning_rate": 7.389064301616355e-07,
"loss": 0.0,
"num_input_tokens_seen": 5241792,
"step": 8390
},
{
"epoch": 16.8574297188755,
"grad_norm": 0.0016957195475697517,
"learning_rate": 7.343289823636168e-07,
"loss": 0.0,
"num_input_tokens_seen": 5244960,
"step": 8395
},
{
"epoch": 16.867469879518072,
"grad_norm": 0.013983628712594509,
"learning_rate": 7.297646331181069e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5247520,
"step": 8400
},
{
"epoch": 16.877510040160644,
"grad_norm": 0.00148250802885741,
"learning_rate": 7.252133964408065e-07,
"loss": 0.0,
"num_input_tokens_seen": 5250272,
"step": 8405
},
{
"epoch": 16.88755020080321,
"grad_norm": 0.017954643815755844,
"learning_rate": 7.206752863071515e-07,
"loss": 0.0,
"num_input_tokens_seen": 5252864,
"step": 8410
},
{
"epoch": 16.897590361445783,
"grad_norm": 0.0016218151431530714,
"learning_rate": 7.161503166522704e-07,
"loss": 0.0002,
"num_input_tokens_seen": 5255840,
"step": 8415
},
{
"epoch": 16.907630522088354,
"grad_norm": 0.0025824366603046656,
"learning_rate": 7.116385013709404e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5258400,
"step": 8420
},
{
"epoch": 16.917670682730925,
"grad_norm": 0.0045729330740869045,
"learning_rate": 7.0713985431755e-07,
"loss": 0.0,
"num_input_tokens_seen": 5261376,
"step": 8425
},
{
"epoch": 16.927710843373493,
"grad_norm": 0.0545303151011467,
"learning_rate": 7.026543893060456e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5263968,
"step": 8430
},
{
"epoch": 16.937751004016064,
"grad_norm": 0.12884321808815002,
"learning_rate": 6.981821201098999e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5267488,
"step": 8435
},
{
"epoch": 16.947791164658636,
"grad_norm": 0.0024236757308244705,
"learning_rate": 6.937230604620642e-07,
"loss": 0.0,
"num_input_tokens_seen": 5270528,
"step": 8440
},
{
"epoch": 16.957831325301203,
"grad_norm": 0.131021186709404,
"learning_rate": 6.892772240549267e-07,
"loss": 0.0,
"num_input_tokens_seen": 5274048,
"step": 8445
},
{
"epoch": 16.967871485943775,
"grad_norm": 0.0023038825020194054,
"learning_rate": 6.848446245402751e-07,
"loss": 0.0,
"num_input_tokens_seen": 5276320,
"step": 8450
},
{
"epoch": 16.977911646586346,
"grad_norm": 0.0010659729596227407,
"learning_rate": 6.804252755292429e-07,
"loss": 0.0,
"num_input_tokens_seen": 5278688,
"step": 8455
},
{
"epoch": 16.987951807228917,
"grad_norm": 0.0022493836004287004,
"learning_rate": 6.760191905922847e-07,
"loss": 0.0002,
"num_input_tokens_seen": 5281120,
"step": 8460
},
{
"epoch": 16.997991967871485,
"grad_norm": 0.0067631215788424015,
"learning_rate": 6.716263832591163e-07,
"loss": 0.0,
"num_input_tokens_seen": 5284064,
"step": 8465
},
{
"epoch": 17.008032128514056,
"grad_norm": 0.03593532368540764,
"learning_rate": 6.672468670186899e-07,
"loss": 0.0,
"num_input_tokens_seen": 5287968,
"step": 8470
},
{
"epoch": 17.018072289156628,
"grad_norm": 0.0016852463595569134,
"learning_rate": 6.628806553191397e-07,
"loss": 0.0,
"num_input_tokens_seen": 5291744,
"step": 8475
},
{
"epoch": 17.028112449799195,
"grad_norm": 0.0035839611664414406,
"learning_rate": 6.585277615677472e-07,
"loss": 0.0,
"num_input_tokens_seen": 5293984,
"step": 8480
},
{
"epoch": 17.038152610441767,
"grad_norm": 0.0018862821161746979,
"learning_rate": 6.541881991309013e-07,
"loss": 0.0,
"num_input_tokens_seen": 5296704,
"step": 8485
},
{
"epoch": 17.048192771084338,
"grad_norm": 0.0017078607343137264,
"learning_rate": 6.498619813340473e-07,
"loss": 0.0,
"num_input_tokens_seen": 5299872,
"step": 8490
},
{
"epoch": 17.05823293172691,
"grad_norm": 0.0025925240479409695,
"learning_rate": 6.455491214616622e-07,
"loss": 0.0,
"num_input_tokens_seen": 5303584,
"step": 8495
},
{
"epoch": 17.068273092369477,
"grad_norm": 0.002037809230387211,
"learning_rate": 6.412496327571999e-07,
"loss": 0.0,
"num_input_tokens_seen": 5307488,
"step": 8500
},
{
"epoch": 17.07831325301205,
"grad_norm": 0.006466969382017851,
"learning_rate": 6.369635284230563e-07,
"loss": 0.0,
"num_input_tokens_seen": 5311328,
"step": 8505
},
{
"epoch": 17.08835341365462,
"grad_norm": 0.0013107856502756476,
"learning_rate": 6.32690821620528e-07,
"loss": 0.0,
"num_input_tokens_seen": 5314720,
"step": 8510
},
{
"epoch": 17.098393574297187,
"grad_norm": 0.023452557623386383,
"learning_rate": 6.284315254697726e-07,
"loss": 0.0,
"num_input_tokens_seen": 5318752,
"step": 8515
},
{
"epoch": 17.10843373493976,
"grad_norm": 0.0020837204065173864,
"learning_rate": 6.241856530497669e-07,
"loss": 0.0,
"num_input_tokens_seen": 5321952,
"step": 8520
},
{
"epoch": 17.11847389558233,
"grad_norm": 0.0032065757550299168,
"learning_rate": 6.199532173982692e-07,
"loss": 0.0,
"num_input_tokens_seen": 5325056,
"step": 8525
},
{
"epoch": 17.1285140562249,
"grad_norm": 0.004327481612563133,
"learning_rate": 6.157342315117754e-07,
"loss": 0.0,
"num_input_tokens_seen": 5327936,
"step": 8530
},
{
"epoch": 17.13855421686747,
"grad_norm": 0.0018718891078606248,
"learning_rate": 6.115287083454823e-07,
"loss": 0.0,
"num_input_tokens_seen": 5331968,
"step": 8535
},
{
"epoch": 17.14859437751004,
"grad_norm": 0.0009799738181754947,
"learning_rate": 6.073366608132481e-07,
"loss": 0.0,
"num_input_tokens_seen": 5334144,
"step": 8540
},
{
"epoch": 17.15863453815261,
"grad_norm": 0.0023493673652410507,
"learning_rate": 6.031581017875482e-07,
"loss": 0.0,
"num_input_tokens_seen": 5336928,
"step": 8545
},
{
"epoch": 17.16867469879518,
"grad_norm": 0.0019072717987000942,
"learning_rate": 5.989930440994451e-07,
"loss": 0.0,
"num_input_tokens_seen": 5339904,
"step": 8550
},
{
"epoch": 17.17871485943775,
"grad_norm": 0.003958335146307945,
"learning_rate": 5.948415005385344e-07,
"loss": 0.0,
"num_input_tokens_seen": 5343552,
"step": 8555
},
{
"epoch": 17.188755020080322,
"grad_norm": 0.6150096654891968,
"learning_rate": 5.907034838529224e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5346752,
"step": 8560
},
{
"epoch": 17.198795180722893,
"grad_norm": 0.0013654690701514482,
"learning_rate": 5.865790067491739e-07,
"loss": 0.0,
"num_input_tokens_seen": 5349952,
"step": 8565
},
{
"epoch": 17.20883534136546,
"grad_norm": 0.8217050433158875,
"learning_rate": 5.824680818922762e-07,
"loss": 0.0002,
"num_input_tokens_seen": 5352448,
"step": 8570
},
{
"epoch": 17.218875502008032,
"grad_norm": 0.0032255176920443773,
"learning_rate": 5.783707219056078e-07,
"loss": 0.0,
"num_input_tokens_seen": 5356032,
"step": 8575
},
{
"epoch": 17.228915662650603,
"grad_norm": 0.019121866673231125,
"learning_rate": 5.742869393708872e-07,
"loss": 0.0,
"num_input_tokens_seen": 5358368,
"step": 8580
},
{
"epoch": 17.23895582329317,
"grad_norm": 0.002871948992833495,
"learning_rate": 5.702167468281461e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5361216,
"step": 8585
},
{
"epoch": 17.248995983935743,
"grad_norm": 0.0008276253938674927,
"learning_rate": 5.661601567756819e-07,
"loss": 0.0,
"num_input_tokens_seen": 5364128,
"step": 8590
},
{
"epoch": 17.259036144578314,
"grad_norm": 0.00140668754465878,
"learning_rate": 5.621171816700249e-07,
"loss": 0.0,
"num_input_tokens_seen": 5367200,
"step": 8595
},
{
"epoch": 17.269076305220885,
"grad_norm": 0.0022226141300052404,
"learning_rate": 5.580878339258978e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5370144,
"step": 8600
},
{
"epoch": 17.279116465863453,
"grad_norm": 0.0009751960169523954,
"learning_rate": 5.540721259161774e-07,
"loss": 0.0,
"num_input_tokens_seen": 5373024,
"step": 8605
},
{
"epoch": 17.289156626506024,
"grad_norm": 0.018170801922678947,
"learning_rate": 5.500700699718564e-07,
"loss": 0.0,
"num_input_tokens_seen": 5375904,
"step": 8610
},
{
"epoch": 17.299196787148595,
"grad_norm": 0.0021682889200747013,
"learning_rate": 5.460816783820089e-07,
"loss": 0.0,
"num_input_tokens_seen": 5379264,
"step": 8615
},
{
"epoch": 17.309236947791163,
"grad_norm": 0.0069380393251776695,
"learning_rate": 5.42106963393747e-07,
"loss": 0.0,
"num_input_tokens_seen": 5382208,
"step": 8620
},
{
"epoch": 17.319277108433734,
"grad_norm": 0.11309646815061569,
"learning_rate": 5.381459372121878e-07,
"loss": 0.0,
"num_input_tokens_seen": 5385568,
"step": 8625
},
{
"epoch": 17.329317269076306,
"grad_norm": 0.007803209591656923,
"learning_rate": 5.341986120004145e-07,
"loss": 0.0,
"num_input_tokens_seen": 5389056,
"step": 8630
},
{
"epoch": 17.339357429718877,
"grad_norm": 0.001378397922962904,
"learning_rate": 5.302649998794368e-07,
"loss": 0.0,
"num_input_tokens_seen": 5391840,
"step": 8635
},
{
"epoch": 17.349397590361445,
"grad_norm": 0.008238635957241058,
"learning_rate": 5.263451129281605e-07,
"loss": 0.0,
"num_input_tokens_seen": 5395008,
"step": 8640
},
{
"epoch": 17.359437751004016,
"grad_norm": 0.0009625194361433387,
"learning_rate": 5.224389631833393e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5397728,
"step": 8645
},
{
"epoch": 17.369477911646587,
"grad_norm": 0.019273938611149788,
"learning_rate": 5.185465626395486e-07,
"loss": 0.0,
"num_input_tokens_seen": 5401248,
"step": 8650
},
{
"epoch": 17.379518072289155,
"grad_norm": 0.0024227574467658997,
"learning_rate": 5.146679232491436e-07,
"loss": 0.0,
"num_input_tokens_seen": 5405024,
"step": 8655
},
{
"epoch": 17.389558232931726,
"grad_norm": 0.007197659928351641,
"learning_rate": 5.108030569222211e-07,
"loss": 0.0,
"num_input_tokens_seen": 5407968,
"step": 8660
},
{
"epoch": 17.399598393574298,
"grad_norm": 0.009146859869360924,
"learning_rate": 5.0695197552659e-07,
"loss": 0.0,
"num_input_tokens_seen": 5410944,
"step": 8665
},
{
"epoch": 17.40963855421687,
"grad_norm": 0.002492484636604786,
"learning_rate": 5.031146908877221e-07,
"loss": 0.0,
"num_input_tokens_seen": 5414240,
"step": 8670
},
{
"epoch": 17.419678714859437,
"grad_norm": 0.007068297825753689,
"learning_rate": 4.99291214788733e-07,
"loss": 0.0,
"num_input_tokens_seen": 5416896,
"step": 8675
},
{
"epoch": 17.429718875502008,
"grad_norm": 0.0012975713470950723,
"learning_rate": 4.954815589703277e-07,
"loss": 0.0,
"num_input_tokens_seen": 5419744,
"step": 8680
},
{
"epoch": 17.43975903614458,
"grad_norm": 0.0051535964012146,
"learning_rate": 4.916857351307802e-07,
"loss": 0.0,
"num_input_tokens_seen": 5422560,
"step": 8685
},
{
"epoch": 17.449799196787147,
"grad_norm": 0.004130291286855936,
"learning_rate": 4.879037549258875e-07,
"loss": 0.0,
"num_input_tokens_seen": 5426016,
"step": 8690
},
{
"epoch": 17.45983935742972,
"grad_norm": 0.0020685733761638403,
"learning_rate": 4.841356299689359e-07,
"loss": 0.0,
"num_input_tokens_seen": 5429280,
"step": 8695
},
{
"epoch": 17.46987951807229,
"grad_norm": 0.0011084630386903882,
"learning_rate": 4.803813718306716e-07,
"loss": 0.0,
"num_input_tokens_seen": 5432576,
"step": 8700
},
{
"epoch": 17.47991967871486,
"grad_norm": 0.008571324869990349,
"learning_rate": 4.7664099203925284e-07,
"loss": 0.0,
"num_input_tokens_seen": 5436064,
"step": 8705
},
{
"epoch": 17.48995983935743,
"grad_norm": 0.0010640741093084216,
"learning_rate": 4.7291450208022836e-07,
"loss": 0.0,
"num_input_tokens_seen": 5438880,
"step": 8710
},
{
"epoch": 17.5,
"grad_norm": 0.0077699050307273865,
"learning_rate": 4.692019133964931e-07,
"loss": 0.0,
"num_input_tokens_seen": 5441696,
"step": 8715
},
{
"epoch": 17.51004016064257,
"grad_norm": 0.0019389991648495197,
"learning_rate": 4.65503237388254e-07,
"loss": 0.0009,
"num_input_tokens_seen": 5444448,
"step": 8720
},
{
"epoch": 17.52008032128514,
"grad_norm": 0.0016935811145231128,
"learning_rate": 4.618184854129981e-07,
"loss": 0.0,
"num_input_tokens_seen": 5447424,
"step": 8725
},
{
"epoch": 17.53012048192771,
"grad_norm": 0.0011479026870802045,
"learning_rate": 4.581476687854558e-07,
"loss": 0.0619,
"num_input_tokens_seen": 5450688,
"step": 8730
},
{
"epoch": 17.54016064257028,
"grad_norm": 0.0018761102110147476,
"learning_rate": 4.5449079877756653e-07,
"loss": 0.0,
"num_input_tokens_seen": 5453472,
"step": 8735
},
{
"epoch": 17.550200803212853,
"grad_norm": 0.001550107728689909,
"learning_rate": 4.508478866184435e-07,
"loss": 0.0,
"num_input_tokens_seen": 5456800,
"step": 8740
},
{
"epoch": 17.56024096385542,
"grad_norm": 0.0016778951976448298,
"learning_rate": 4.4721894349434027e-07,
"loss": 0.0,
"num_input_tokens_seen": 5460256,
"step": 8745
},
{
"epoch": 17.570281124497992,
"grad_norm": 0.0011093484936282039,
"learning_rate": 4.4360398054861473e-07,
"loss": 0.0,
"num_input_tokens_seen": 5463712,
"step": 8750
},
{
"epoch": 17.580321285140563,
"grad_norm": 0.011861991137266159,
"learning_rate": 4.4000300888169753e-07,
"loss": 0.0,
"num_input_tokens_seen": 5467104,
"step": 8755
},
{
"epoch": 17.59036144578313,
"grad_norm": 0.002717207185924053,
"learning_rate": 4.364160395510547e-07,
"loss": 0.0,
"num_input_tokens_seen": 5469888,
"step": 8760
},
{
"epoch": 17.600401606425702,
"grad_norm": 0.010903539136052132,
"learning_rate": 4.328430835711589e-07,
"loss": 0.0,
"num_input_tokens_seen": 5473216,
"step": 8765
},
{
"epoch": 17.610441767068274,
"grad_norm": 0.0062487199902534485,
"learning_rate": 4.2928415191344664e-07,
"loss": 0.0,
"num_input_tokens_seen": 5476768,
"step": 8770
},
{
"epoch": 17.620481927710845,
"grad_norm": 0.008080641739070415,
"learning_rate": 4.2573925550629393e-07,
"loss": 0.0002,
"num_input_tokens_seen": 5479648,
"step": 8775
},
{
"epoch": 17.630522088353413,
"grad_norm": 199.20767211914062,
"learning_rate": 4.2220840523497896e-07,
"loss": 0.0178,
"num_input_tokens_seen": 5483360,
"step": 8780
},
{
"epoch": 17.640562248995984,
"grad_norm": 0.0291756484657526,
"learning_rate": 4.1869161194164565e-07,
"loss": 0.0,
"num_input_tokens_seen": 5486528,
"step": 8785
},
{
"epoch": 17.650602409638555,
"grad_norm": 0.0011264854110777378,
"learning_rate": 4.15188886425279e-07,
"loss": 0.0,
"num_input_tokens_seen": 5489728,
"step": 8790
},
{
"epoch": 17.660642570281123,
"grad_norm": 0.0019059681799262762,
"learning_rate": 4.117002394416586e-07,
"loss": 0.0,
"num_input_tokens_seen": 5492320,
"step": 8795
},
{
"epoch": 17.670682730923694,
"grad_norm": 0.0012327972799539566,
"learning_rate": 4.082256817033392e-07,
"loss": 0.0,
"num_input_tokens_seen": 5495840,
"step": 8800
},
{
"epoch": 17.680722891566266,
"grad_norm": 0.0454624705016613,
"learning_rate": 4.047652238796096e-07,
"loss": 0.0002,
"num_input_tokens_seen": 5498784,
"step": 8805
},
{
"epoch": 17.690763052208837,
"grad_norm": 0.02524404413998127,
"learning_rate": 4.0131887659646265e-07,
"loss": 0.0,
"num_input_tokens_seen": 5501088,
"step": 8810
},
{
"epoch": 17.700803212851405,
"grad_norm": 0.0016588432481512427,
"learning_rate": 3.9788665043656083e-07,
"loss": 0.0,
"num_input_tokens_seen": 5504512,
"step": 8815
},
{
"epoch": 17.710843373493976,
"grad_norm": 0.004111200571060181,
"learning_rate": 3.94468555939207e-07,
"loss": 0.0,
"num_input_tokens_seen": 5507616,
"step": 8820
},
{
"epoch": 17.720883534136547,
"grad_norm": 0.056082893162965775,
"learning_rate": 3.9106460360030853e-07,
"loss": 0.0,
"num_input_tokens_seen": 5510624,
"step": 8825
},
{
"epoch": 17.730923694779115,
"grad_norm": 0.0011907127918675542,
"learning_rate": 3.8767480387234714e-07,
"loss": 0.0,
"num_input_tokens_seen": 5513952,
"step": 8830
},
{
"epoch": 17.740963855421686,
"grad_norm": 0.001506793312728405,
"learning_rate": 3.84299167164347e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5516704,
"step": 8835
},
{
"epoch": 17.751004016064257,
"grad_norm": 0.001404171111062169,
"learning_rate": 3.809377038418405e-07,
"loss": 0.0,
"num_input_tokens_seen": 5519328,
"step": 8840
},
{
"epoch": 17.76104417670683,
"grad_norm": 0.0012987729860469699,
"learning_rate": 3.775904242268391e-07,
"loss": 0.0,
"num_input_tokens_seen": 5522688,
"step": 8845
},
{
"epoch": 17.771084337349397,
"grad_norm": 0.0013029174879193306,
"learning_rate": 3.742573385977999e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5525024,
"step": 8850
},
{
"epoch": 17.781124497991968,
"grad_norm": 0.001227586530148983,
"learning_rate": 3.7093845718959575e-07,
"loss": 0.0,
"num_input_tokens_seen": 5527808,
"step": 8855
},
{
"epoch": 17.79116465863454,
"grad_norm": 1.0415514707565308,
"learning_rate": 3.676337901934812e-07,
"loss": 0.0003,
"num_input_tokens_seen": 5530688,
"step": 8860
},
{
"epoch": 17.801204819277107,
"grad_norm": 0.0038885881658643484,
"learning_rate": 3.6434334775706403e-07,
"loss": 0.0,
"num_input_tokens_seen": 5533696,
"step": 8865
},
{
"epoch": 17.811244979919678,
"grad_norm": 0.12921766936779022,
"learning_rate": 3.610671399842719e-07,
"loss": 0.0,
"num_input_tokens_seen": 5536448,
"step": 8870
},
{
"epoch": 17.82128514056225,
"grad_norm": 0.024864312261343002,
"learning_rate": 3.578051769353219e-07,
"loss": 0.0,
"num_input_tokens_seen": 5539808,
"step": 8875
},
{
"epoch": 17.83132530120482,
"grad_norm": 0.0016896923771128058,
"learning_rate": 3.5455746862669336e-07,
"loss": 0.0,
"num_input_tokens_seen": 5542848,
"step": 8880
},
{
"epoch": 17.84136546184739,
"grad_norm": 0.0020967130549252033,
"learning_rate": 3.513240250310873e-07,
"loss": 0.0,
"num_input_tokens_seen": 5545376,
"step": 8885
},
{
"epoch": 17.85140562248996,
"grad_norm": 0.01755034364759922,
"learning_rate": 3.4810485607740975e-07,
"loss": 0.0411,
"num_input_tokens_seen": 5549088,
"step": 8890
},
{
"epoch": 17.86144578313253,
"grad_norm": 0.47964027523994446,
"learning_rate": 3.4489997165072785e-07,
"loss": 0.0003,
"num_input_tokens_seen": 5551712,
"step": 8895
},
{
"epoch": 17.8714859437751,
"grad_norm": 0.014545414596796036,
"learning_rate": 3.4170938159224675e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5554432,
"step": 8900
},
{
"epoch": 17.88152610441767,
"grad_norm": 0.0012772183399647474,
"learning_rate": 3.385330956992816e-07,
"loss": 0.0,
"num_input_tokens_seen": 5557504,
"step": 8905
},
{
"epoch": 17.89156626506024,
"grad_norm": 0.004275870509445667,
"learning_rate": 3.3537112372521777e-07,
"loss": 0.0,
"num_input_tokens_seen": 5560608,
"step": 8910
},
{
"epoch": 17.901606425702813,
"grad_norm": 0.016472170129418373,
"learning_rate": 3.3222347537949395e-07,
"loss": 0.0,
"num_input_tokens_seen": 5563584,
"step": 8915
},
{
"epoch": 17.91164658634538,
"grad_norm": 0.0016320595750585198,
"learning_rate": 3.290901603275587e-07,
"loss": 0.0,
"num_input_tokens_seen": 5566592,
"step": 8920
},
{
"epoch": 17.92168674698795,
"grad_norm": 0.002692002570256591,
"learning_rate": 3.2597118819085227e-07,
"loss": 0.0,
"num_input_tokens_seen": 5569536,
"step": 8925
},
{
"epoch": 17.931726907630523,
"grad_norm": 0.08017129451036453,
"learning_rate": 3.228665685467702e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5572448,
"step": 8930
},
{
"epoch": 17.94176706827309,
"grad_norm": 0.0011358013143762946,
"learning_rate": 3.1977631092863613e-07,
"loss": 0.0,
"num_input_tokens_seen": 5575296,
"step": 8935
},
{
"epoch": 17.951807228915662,
"grad_norm": 0.04995585232973099,
"learning_rate": 3.167004248256733e-07,
"loss": 0.0,
"num_input_tokens_seen": 5578912,
"step": 8940
},
{
"epoch": 17.961847389558233,
"grad_norm": 0.010064242407679558,
"learning_rate": 3.1363891968297367e-07,
"loss": 0.0,
"num_input_tokens_seen": 5581056,
"step": 8945
},
{
"epoch": 17.971887550200805,
"grad_norm": 0.0012515847338363528,
"learning_rate": 3.105918049014689e-07,
"loss": 0.0,
"num_input_tokens_seen": 5584352,
"step": 8950
},
{
"epoch": 17.981927710843372,
"grad_norm": 0.0015645629027858377,
"learning_rate": 3.075590898379044e-07,
"loss": 0.0,
"num_input_tokens_seen": 5587872,
"step": 8955
},
{
"epoch": 17.991967871485944,
"grad_norm": 0.01546509936451912,
"learning_rate": 3.04540783804807e-07,
"loss": 0.0,
"num_input_tokens_seen": 5591680,
"step": 8960
},
{
"epoch": 18.0,
"eval_loss": 1.093041181564331,
"eval_runtime": 8.0723,
"eval_samples_per_second": 61.692,
"eval_steps_per_second": 15.485,
"num_input_tokens_seen": 5594752,
"step": 8964
},
{
"epoch": 18.002008032128515,
"grad_norm": 0.3365046977996826,
"learning_rate": 3.015368960704584e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5595360,
"step": 8965
},
{
"epoch": 18.012048192771083,
"grad_norm": 0.017432406544685364,
"learning_rate": 2.985474358588658e-07,
"loss": 0.0,
"num_input_tokens_seen": 5598368,
"step": 8970
},
{
"epoch": 18.022088353413654,
"grad_norm": 0.003238762030377984,
"learning_rate": 2.9557241234973446e-07,
"loss": 0.0,
"num_input_tokens_seen": 5601664,
"step": 8975
},
{
"epoch": 18.032128514056225,
"grad_norm": 0.02988354116678238,
"learning_rate": 2.926118346784379e-07,
"loss": 0.0,
"num_input_tokens_seen": 5604736,
"step": 8980
},
{
"epoch": 18.042168674698797,
"grad_norm": 0.001696154591627419,
"learning_rate": 2.8966571193599304e-07,
"loss": 0.0,
"num_input_tokens_seen": 5607936,
"step": 8985
},
{
"epoch": 18.052208835341364,
"grad_norm": 0.0027232773136347532,
"learning_rate": 2.8673405316902824e-07,
"loss": 0.0157,
"num_input_tokens_seen": 5611200,
"step": 8990
},
{
"epoch": 18.062248995983936,
"grad_norm": 0.004504075739532709,
"learning_rate": 2.8381686737975867e-07,
"loss": 0.0,
"num_input_tokens_seen": 5613856,
"step": 8995
},
{
"epoch": 18.072289156626507,
"grad_norm": 0.002720191143453121,
"learning_rate": 2.809141635259555e-07,
"loss": 0.0002,
"num_input_tokens_seen": 5617152,
"step": 9000
},
{
"epoch": 18.082329317269075,
"grad_norm": 0.0020942571572959423,
"learning_rate": 2.780259505209249e-07,
"loss": 0.0,
"num_input_tokens_seen": 5620160,
"step": 9005
},
{
"epoch": 18.092369477911646,
"grad_norm": 0.017170244827866554,
"learning_rate": 2.7515223723346974e-07,
"loss": 0.0,
"num_input_tokens_seen": 5623424,
"step": 9010
},
{
"epoch": 18.102409638554217,
"grad_norm": 0.008794093504548073,
"learning_rate": 2.722930324878748e-07,
"loss": 0.0,
"num_input_tokens_seen": 5626208,
"step": 9015
},
{
"epoch": 18.11244979919679,
"grad_norm": 0.00417192792519927,
"learning_rate": 2.694483450638685e-07,
"loss": 0.0,
"num_input_tokens_seen": 5629280,
"step": 9020
},
{
"epoch": 18.122489959839356,
"grad_norm": 0.031785111874341965,
"learning_rate": 2.666181836966053e-07,
"loss": 0.0,
"num_input_tokens_seen": 5632256,
"step": 9025
},
{
"epoch": 18.132530120481928,
"grad_norm": 0.004091055132448673,
"learning_rate": 2.6380255707663285e-07,
"loss": 0.0002,
"num_input_tokens_seen": 5634688,
"step": 9030
},
{
"epoch": 18.1425702811245,
"grad_norm": 0.06754046678543091,
"learning_rate": 2.610014738498656e-07,
"loss": 0.0,
"num_input_tokens_seen": 5637984,
"step": 9035
},
{
"epoch": 18.152610441767067,
"grad_norm": 0.012670686468482018,
"learning_rate": 2.5821494261756284e-07,
"loss": 0.0,
"num_input_tokens_seen": 5641440,
"step": 9040
},
{
"epoch": 18.162650602409638,
"grad_norm": 17.17615509033203,
"learning_rate": 2.554429719362972e-07,
"loss": 0.0529,
"num_input_tokens_seen": 5644960,
"step": 9045
},
{
"epoch": 18.17269076305221,
"grad_norm": 0.0012001717695966363,
"learning_rate": 2.526855703179304e-07,
"loss": 0.0,
"num_input_tokens_seen": 5648512,
"step": 9050
},
{
"epoch": 18.18273092369478,
"grad_norm": 0.005140680354088545,
"learning_rate": 2.4994274622958726e-07,
"loss": 0.0,
"num_input_tokens_seen": 5651584,
"step": 9055
},
{
"epoch": 18.19277108433735,
"grad_norm": 0.025966104120016098,
"learning_rate": 2.4721450809363054e-07,
"loss": 0.0,
"num_input_tokens_seen": 5654720,
"step": 9060
},
{
"epoch": 18.20281124497992,
"grad_norm": 0.003484898479655385,
"learning_rate": 2.4450086428763345e-07,
"loss": 0.0,
"num_input_tokens_seen": 5657952,
"step": 9065
},
{
"epoch": 18.21285140562249,
"grad_norm": 0.00537458062171936,
"learning_rate": 2.4180182314435305e-07,
"loss": 0.0,
"num_input_tokens_seen": 5661120,
"step": 9070
},
{
"epoch": 18.22289156626506,
"grad_norm": 0.06362808495759964,
"learning_rate": 2.3911739295170875e-07,
"loss": 0.0,
"num_input_tokens_seen": 5664704,
"step": 9075
},
{
"epoch": 18.23293172690763,
"grad_norm": 0.013311965391039848,
"learning_rate": 2.364475819527523e-07,
"loss": 0.0,
"num_input_tokens_seen": 5667744,
"step": 9080
},
{
"epoch": 18.2429718875502,
"grad_norm": 0.016375111415982246,
"learning_rate": 2.3379239834564526e-07,
"loss": 0.0,
"num_input_tokens_seen": 5670496,
"step": 9085
},
{
"epoch": 18.253012048192772,
"grad_norm": 0.008078474551439285,
"learning_rate": 2.3115185028363186e-07,
"loss": 0.0,
"num_input_tokens_seen": 5673632,
"step": 9090
},
{
"epoch": 18.26305220883534,
"grad_norm": 0.0013551748124882579,
"learning_rate": 2.2852594587501887e-07,
"loss": 0.0,
"num_input_tokens_seen": 5676672,
"step": 9095
},
{
"epoch": 18.27309236947791,
"grad_norm": 0.009037856943905354,
"learning_rate": 2.259146931831413e-07,
"loss": 0.0,
"num_input_tokens_seen": 5680352,
"step": 9100
},
{
"epoch": 18.283132530120483,
"grad_norm": 0.005470726173371077,
"learning_rate": 2.2331810022634847e-07,
"loss": 0.0,
"num_input_tokens_seen": 5683104,
"step": 9105
},
{
"epoch": 18.29317269076305,
"grad_norm": 0.001181105268187821,
"learning_rate": 2.2073617497797018e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5686688,
"step": 9110
},
{
"epoch": 18.303212851405622,
"grad_norm": 0.0015586339868605137,
"learning_rate": 2.1816892536629775e-07,
"loss": 0.0,
"num_input_tokens_seen": 5689600,
"step": 9115
},
{
"epoch": 18.313253012048193,
"grad_norm": 0.0013247845927253366,
"learning_rate": 2.1561635927456083e-07,
"loss": 0.0,
"num_input_tokens_seen": 5692768,
"step": 9120
},
{
"epoch": 18.323293172690764,
"grad_norm": 0.0010992807801812887,
"learning_rate": 2.1307848454089452e-07,
"loss": 0.0,
"num_input_tokens_seen": 5695584,
"step": 9125
},
{
"epoch": 18.333333333333332,
"grad_norm": 0.006108762696385384,
"learning_rate": 2.1055530895832897e-07,
"loss": 0.0,
"num_input_tokens_seen": 5698784,
"step": 9130
},
{
"epoch": 18.343373493975903,
"grad_norm": 0.006278180982917547,
"learning_rate": 2.0804684027474987e-07,
"loss": 0.0,
"num_input_tokens_seen": 5701504,
"step": 9135
},
{
"epoch": 18.353413654618475,
"grad_norm": 0.0014436625642701983,
"learning_rate": 2.055530861928884e-07,
"loss": 0.0,
"num_input_tokens_seen": 5705216,
"step": 9140
},
{
"epoch": 18.363453815261042,
"grad_norm": 0.009470781311392784,
"learning_rate": 2.0307405437029027e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5708576,
"step": 9145
},
{
"epoch": 18.373493975903614,
"grad_norm": 0.00394744286313653,
"learning_rate": 2.006097524192918e-07,
"loss": 0.0,
"num_input_tokens_seen": 5712288,
"step": 9150
},
{
"epoch": 18.383534136546185,
"grad_norm": 0.007234565913677216,
"learning_rate": 1.9816018790700165e-07,
"loss": 0.0,
"num_input_tokens_seen": 5715648,
"step": 9155
},
{
"epoch": 18.393574297188756,
"grad_norm": 0.0013976708287373185,
"learning_rate": 1.9572536835527013e-07,
"loss": 0.0,
"num_input_tokens_seen": 5718720,
"step": 9160
},
{
"epoch": 18.403614457831324,
"grad_norm": 0.016879552975296974,
"learning_rate": 1.933053012406749e-07,
"loss": 0.0,
"num_input_tokens_seen": 5722560,
"step": 9165
},
{
"epoch": 18.413654618473895,
"grad_norm": 0.0010123576503247023,
"learning_rate": 1.908999939944911e-07,
"loss": 0.0,
"num_input_tokens_seen": 5725408,
"step": 9170
},
{
"epoch": 18.423694779116467,
"grad_norm": 0.0012418956030160189,
"learning_rate": 1.8850945400266994e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5729024,
"step": 9175
},
{
"epoch": 18.433734939759034,
"grad_norm": 0.0023059435188770294,
"learning_rate": 1.861336886058196e-07,
"loss": 0.0,
"num_input_tokens_seen": 5731584,
"step": 9180
},
{
"epoch": 18.443775100401606,
"grad_norm": 0.015861524268984795,
"learning_rate": 1.8377270509917777e-07,
"loss": 0.0,
"num_input_tokens_seen": 5734624,
"step": 9185
},
{
"epoch": 18.453815261044177,
"grad_norm": 0.001924677286297083,
"learning_rate": 1.81426510732593e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5737920,
"step": 9190
},
{
"epoch": 18.46385542168675,
"grad_norm": 0.004833642393350601,
"learning_rate": 1.7909511271050006e-07,
"loss": 0.0,
"num_input_tokens_seen": 5740896,
"step": 9195
},
{
"epoch": 18.473895582329316,
"grad_norm": 0.0023220451548695564,
"learning_rate": 1.7677851819189907e-07,
"loss": 0.0,
"num_input_tokens_seen": 5744000,
"step": 9200
},
{
"epoch": 18.483935742971887,
"grad_norm": 0.01160132884979248,
"learning_rate": 1.7447673429033361e-07,
"loss": 0.0,
"num_input_tokens_seen": 5746816,
"step": 9205
},
{
"epoch": 18.49397590361446,
"grad_norm": 0.005880299024283886,
"learning_rate": 1.7218976807386767e-07,
"loss": 0.0,
"num_input_tokens_seen": 5749696,
"step": 9210
},
{
"epoch": 18.50401606425703,
"grad_norm": 0.003148122224956751,
"learning_rate": 1.6991762656506483e-07,
"loss": 0.0,
"num_input_tokens_seen": 5752544,
"step": 9215
},
{
"epoch": 18.514056224899598,
"grad_norm": 0.0012982593616470695,
"learning_rate": 1.6766031674096795e-07,
"loss": 0.0,
"num_input_tokens_seen": 5756672,
"step": 9220
},
{
"epoch": 18.52409638554217,
"grad_norm": 0.02512902021408081,
"learning_rate": 1.654178455330735e-07,
"loss": 0.0,
"num_input_tokens_seen": 5759520,
"step": 9225
},
{
"epoch": 18.53413654618474,
"grad_norm": 0.002674127696081996,
"learning_rate": 1.631902198273172e-07,
"loss": 0.0,
"num_input_tokens_seen": 5762848,
"step": 9230
},
{
"epoch": 18.544176706827308,
"grad_norm": 0.002228042809292674,
"learning_rate": 1.6097744646404457e-07,
"loss": 0.0,
"num_input_tokens_seen": 5766496,
"step": 9235
},
{
"epoch": 18.55421686746988,
"grad_norm": 0.003513950854539871,
"learning_rate": 1.5877953223799703e-07,
"loss": 0.0,
"num_input_tokens_seen": 5769600,
"step": 9240
},
{
"epoch": 18.56425702811245,
"grad_norm": 0.004237438552081585,
"learning_rate": 1.565964838982881e-07,
"loss": 0.0,
"num_input_tokens_seen": 5772800,
"step": 9245
},
{
"epoch": 18.57429718875502,
"grad_norm": 0.0016228174790740013,
"learning_rate": 1.544283081483805e-07,
"loss": 0.0,
"num_input_tokens_seen": 5776416,
"step": 9250
},
{
"epoch": 18.58433734939759,
"grad_norm": 0.0069399080239236355,
"learning_rate": 1.5227501164607138e-07,
"loss": 0.0,
"num_input_tokens_seen": 5778976,
"step": 9255
},
{
"epoch": 18.59437751004016,
"grad_norm": 0.006944271270185709,
"learning_rate": 1.501366010034644e-07,
"loss": 0.0,
"num_input_tokens_seen": 5782400,
"step": 9260
},
{
"epoch": 18.604417670682732,
"grad_norm": 0.013072614558041096,
"learning_rate": 1.4801308278695636e-07,
"loss": 0.0,
"num_input_tokens_seen": 5784640,
"step": 9265
},
{
"epoch": 18.6144578313253,
"grad_norm": 0.0017089575994759798,
"learning_rate": 1.45904463517213e-07,
"loss": 0.0,
"num_input_tokens_seen": 5787936,
"step": 9270
},
{
"epoch": 18.62449799196787,
"grad_norm": 0.0009865846950560808,
"learning_rate": 1.4381074966914987e-07,
"loss": 0.0,
"num_input_tokens_seen": 5791584,
"step": 9275
},
{
"epoch": 18.634538152610443,
"grad_norm": 0.0016180879902094603,
"learning_rate": 1.4173194767191257e-07,
"loss": 0.0,
"num_input_tokens_seen": 5794912,
"step": 9280
},
{
"epoch": 18.644578313253014,
"grad_norm": 0.0051255906000733376,
"learning_rate": 1.396680639088571e-07,
"loss": 0.0,
"num_input_tokens_seen": 5797568,
"step": 9285
},
{
"epoch": 18.65461847389558,
"grad_norm": 0.0013111892621964216,
"learning_rate": 1.3761910471753126e-07,
"loss": 0.0,
"num_input_tokens_seen": 5801088,
"step": 9290
},
{
"epoch": 18.664658634538153,
"grad_norm": 0.0022690477780997753,
"learning_rate": 1.3558507638965158e-07,
"loss": 0.0,
"num_input_tokens_seen": 5804096,
"step": 9295
},
{
"epoch": 18.674698795180724,
"grad_norm": 0.008249749429523945,
"learning_rate": 1.3356598517108966e-07,
"loss": 0.0001,
"num_input_tokens_seen": 5807392,
"step": 9300
},
{
"epoch": 18.684738955823292,
"grad_norm": 0.009947913698852062,
"learning_rate": 1.3156183726184657e-07,
"loss": 0.0,
"num_input_tokens_seen": 5810848,
"step": 9305
},
{
"epoch": 18.694779116465863,
"grad_norm": 0.0014019834343343973,
"learning_rate": 1.295726388160412e-07,
"loss": 0.0,
"num_input_tokens_seen": 5814176,
"step": 9310
},
{
"epoch": 18.704819277108435,
"grad_norm": 0.0023297134321182966,
"learning_rate": 1.2759839594188307e-07,
"loss": 0.0,
"num_input_tokens_seen": 5816736,
"step": 9315
},
{
"epoch": 18.714859437751002,
"grad_norm": 0.001655187108553946,
"learning_rate": 1.2563911470166057e-07,
"loss": 0.0,
"num_input_tokens_seen": 5819360,
"step": 9320
},
{
"epoch": 18.724899598393574,
"grad_norm": 0.002192035550251603,
"learning_rate": 1.2369480111171784e-07,
"loss": 0.0,
"num_input_tokens_seen": 5822304,
"step": 9325
},
{
"epoch": 18.734939759036145,
"grad_norm": 0.0010390159441158175,
"learning_rate": 1.2176546114243903e-07,
"loss": 0.0,
"num_input_tokens_seen": 5824768,
"step": 9330
},
{
"epoch": 18.744979919678716,
"grad_norm": 0.0041192579083144665,
"learning_rate": 1.198511007182296e-07,
"loss": 0.0,
"num_input_tokens_seen": 5827488,
"step": 9335
},
{
"epoch": 18.755020080321284,
"grad_norm": 0.0012042642338201404,
"learning_rate": 1.1795172571749503e-07,
"loss": 0.0,
"num_input_tokens_seen": 5830496,
"step": 9340
},
{
"epoch": 18.765060240963855,
"grad_norm": 0.0018591363914310932,
"learning_rate": 1.160673419726288e-07,
"loss": 0.0,
"num_input_tokens_seen": 5833952,
"step": 9345
},
{
"epoch": 18.775100401606426,
"grad_norm": 0.005543689243495464,
"learning_rate": 1.1419795526998679e-07,
"loss": 0.0,
"num_input_tokens_seen": 5837280,
"step": 9350
},
{
"epoch": 18.785140562248998,
"grad_norm": 0.0025698868557810783,
"learning_rate": 1.1234357134987717e-07,
"loss": 0.0,
"num_input_tokens_seen": 5839936,
"step": 9355
},
{
"epoch": 18.795180722891565,
"grad_norm": 0.023136422038078308,
"learning_rate": 1.1050419590653726e-07,
"loss": 0.0002,
"num_input_tokens_seen": 5843584,
"step": 9360
},
{
"epoch": 18.805220883534137,
"grad_norm": 0.001498569967225194,
"learning_rate": 1.0867983458811792e-07,
"loss": 0.0,
"num_input_tokens_seen": 5846624,
"step": 9365
},
{
"epoch": 18.815261044176708,
"grad_norm": 0.06827542185783386,
"learning_rate": 1.0687049299666796e-07,
"loss": 0.0,
"num_input_tokens_seen": 5850112,
"step": 9370
},
{
"epoch": 18.825301204819276,
"grad_norm": 0.004503254778683186,
"learning_rate": 1.050761766881131e-07,
"loss": 0.0,
"num_input_tokens_seen": 5853856,
"step": 9375
},
{
"epoch": 18.835341365461847,
"grad_norm": 0.021971486508846283,
"learning_rate": 1.0329689117224262e-07,
"loss": 0.0,
"num_input_tokens_seen": 5857024,
"step": 9380
},
{
"epoch": 18.84538152610442,
"grad_norm": 0.0739317312836647,
"learning_rate": 1.0153264191269052e-07,
"loss": 0.0,
"num_input_tokens_seen": 5860128,
"step": 9385
},
{
"epoch": 18.855421686746986,
"grad_norm": 0.0029136035591363907,
"learning_rate": 9.978343432691884e-08,
"loss": 0.0,
"num_input_tokens_seen": 5862336,
"step": 9390
},
{
"epoch": 18.865461847389557,
"grad_norm": 0.002241392619907856,
"learning_rate": 9.804927378620155e-08,
"loss": 0.0,
"num_input_tokens_seen": 5865792,
"step": 9395
},
{
"epoch": 18.87550200803213,
"grad_norm": 0.0049470034427940845,
"learning_rate": 9.633016561560793e-08,
"loss": 0.0,
"num_input_tokens_seen": 5869280,
"step": 9400
},
{
"epoch": 18.8855421686747,
"grad_norm": 0.0011665538186207414,
"learning_rate": 9.462611509398534e-08,
"loss": 0.0,
"num_input_tokens_seen": 5872288,
"step": 9405
},
{
"epoch": 18.895582329317268,
"grad_norm": 0.0019446617225185037,
"learning_rate": 9.293712745394479e-08,
"loss": 0.0001,
"num_input_tokens_seen": 5874688,
"step": 9410
},
{
"epoch": 18.90562248995984,
"grad_norm": 0.12640930712223053,
"learning_rate": 9.126320788184374e-08,
"loss": 0.0,
"num_input_tokens_seen": 5877824,
"step": 9415
},
{
"epoch": 18.91566265060241,
"grad_norm": 0.014082228764891624,
"learning_rate": 8.960436151776886e-08,
"loss": 0.0,
"num_input_tokens_seen": 5881056,
"step": 9420
},
{
"epoch": 18.92570281124498,
"grad_norm": 0.0034161340445280075,
"learning_rate": 8.796059345552389e-08,
"loss": 0.0,
"num_input_tokens_seen": 5884320,
"step": 9425
},
{
"epoch": 18.93574297188755,
"grad_norm": 0.0019884402863681316,
"learning_rate": 8.633190874261011e-08,
"loss": 0.0,
"num_input_tokens_seen": 5887648,
"step": 9430
},
{
"epoch": 18.94578313253012,
"grad_norm": 0.013954327441751957,
"learning_rate": 8.471831238021366e-08,
"loss": 0.0,
"num_input_tokens_seen": 5890976,
"step": 9435
},
{
"epoch": 18.955823293172692,
"grad_norm": 0.04434090852737427,
"learning_rate": 8.31198093231872e-08,
"loss": 0.0,
"num_input_tokens_seen": 5893344,
"step": 9440
},
{
"epoch": 18.96586345381526,
"grad_norm": 0.006404112558811903,
"learning_rate": 8.153640448003875e-08,
"loss": 0.0,
"num_input_tokens_seen": 5895808,
"step": 9445
},
{
"epoch": 18.97590361445783,
"grad_norm": 0.0015161640476435423,
"learning_rate": 7.996810271291344e-08,
"loss": 0.0,
"num_input_tokens_seen": 5899200,
"step": 9450
},
{
"epoch": 18.985943775100402,
"grad_norm": 0.09693264961242676,
"learning_rate": 7.841490883757907e-08,
"loss": 0.0,
"num_input_tokens_seen": 5902336,
"step": 9455
},
{
"epoch": 18.99598393574297,
"grad_norm": 0.005461865570396185,
"learning_rate": 7.687682762341276e-08,
"loss": 0.0,
"num_input_tokens_seen": 5905248,
"step": 9460
},
{
"epoch": 19.00602409638554,
"grad_norm": 0.0033444438595324755,
"learning_rate": 7.535386379338371e-08,
"loss": 0.0,
"num_input_tokens_seen": 5908704,
"step": 9465
},
{
"epoch": 19.016064257028113,
"grad_norm": 0.019571533426642418,
"learning_rate": 7.384602202404335e-08,
"loss": 0.0,
"num_input_tokens_seen": 5912832,
"step": 9470
},
{
"epoch": 19.026104417670684,
"grad_norm": 0.0012603729264810681,
"learning_rate": 7.235330694550402e-08,
"loss": 0.0,
"num_input_tokens_seen": 5917056,
"step": 9475
},
{
"epoch": 19.03614457831325,
"grad_norm": 0.014011326245963573,
"learning_rate": 7.087572314143198e-08,
"loss": 0.0,
"num_input_tokens_seen": 5920192,
"step": 9480
},
{
"epoch": 19.046184738955823,
"grad_norm": 0.0010616021463647485,
"learning_rate": 6.94132751490284e-08,
"loss": 0.0059,
"num_input_tokens_seen": 5922368,
"step": 9485
},
{
"epoch": 19.056224899598394,
"grad_norm": 0.008679233491420746,
"learning_rate": 6.796596745901717e-08,
"loss": 0.0,
"num_input_tokens_seen": 5925056,
"step": 9490
},
{
"epoch": 19.066265060240966,
"grad_norm": 0.0020861446391791105,
"learning_rate": 6.653380451563219e-08,
"loss": 0.0,
"num_input_tokens_seen": 5928256,
"step": 9495
},
{
"epoch": 19.076305220883533,
"grad_norm": 0.00580920884385705,
"learning_rate": 6.511679071659949e-08,
"loss": 0.0,
"num_input_tokens_seen": 5931392,
"step": 9500
},
{
"epoch": 19.086345381526105,
"grad_norm": 0.005442566704005003,
"learning_rate": 6.371493041313126e-08,
"loss": 0.0,
"num_input_tokens_seen": 5934464,
"step": 9505
},
{
"epoch": 19.096385542168676,
"grad_norm": 0.001245411578565836,
"learning_rate": 6.232822790990467e-08,
"loss": 0.0,
"num_input_tokens_seen": 5937568,
"step": 9510
},
{
"epoch": 19.106425702811244,
"grad_norm": 0.04193849116563797,
"learning_rate": 6.095668746505245e-08,
"loss": 0.0,
"num_input_tokens_seen": 5940768,
"step": 9515
},
{
"epoch": 19.116465863453815,
"grad_norm": 0.003646423341706395,
"learning_rate": 5.96003132901507e-08,
"loss": 0.0001,
"num_input_tokens_seen": 5944032,
"step": 9520
},
{
"epoch": 19.126506024096386,
"grad_norm": 0.29433780908584595,
"learning_rate": 5.825910955020386e-08,
"loss": 0.0001,
"num_input_tokens_seen": 5947168,
"step": 9525
},
{
"epoch": 19.136546184738958,
"grad_norm": 0.0010262697469443083,
"learning_rate": 5.693308036363143e-08,
"loss": 0.0,
"num_input_tokens_seen": 5950592,
"step": 9530
},
{
"epoch": 19.146586345381525,
"grad_norm": 0.00138461880851537,
"learning_rate": 5.562222980225907e-08,
"loss": 0.0,
"num_input_tokens_seen": 5952960,
"step": 9535
},
{
"epoch": 19.156626506024097,
"grad_norm": 0.011420581489801407,
"learning_rate": 5.432656189130137e-08,
"loss": 0.0,
"num_input_tokens_seen": 5956288,
"step": 9540
},
{
"epoch": 19.166666666666668,
"grad_norm": 0.003382657188922167,
"learning_rate": 5.3046080609352455e-08,
"loss": 0.0,
"num_input_tokens_seen": 5958752,
"step": 9545
},
{
"epoch": 19.176706827309236,
"grad_norm": 0.0018110686214640737,
"learning_rate": 5.178078988837432e-08,
"loss": 0.0,
"num_input_tokens_seen": 5962144,
"step": 9550
},
{
"epoch": 19.186746987951807,
"grad_norm": 0.0012735830387100577,
"learning_rate": 5.053069361368068e-08,
"loss": 0.0,
"num_input_tokens_seen": 5965280,
"step": 9555
},
{
"epoch": 19.196787148594378,
"grad_norm": 0.0035719373263418674,
"learning_rate": 4.9295795623930945e-08,
"loss": 0.0,
"num_input_tokens_seen": 5968192,
"step": 9560
},
{
"epoch": 19.20682730923695,
"grad_norm": 0.0012759892269968987,
"learning_rate": 4.807609971111238e-08,
"loss": 0.0,
"num_input_tokens_seen": 5971264,
"step": 9565
},
{
"epoch": 19.216867469879517,
"grad_norm": 0.031974148005247116,
"learning_rate": 4.68716096205335e-08,
"loss": 0.0,
"num_input_tokens_seen": 5973344,
"step": 9570
},
{
"epoch": 19.22690763052209,
"grad_norm": 0.0022920300252735615,
"learning_rate": 4.5682329050810715e-08,
"loss": 0.0,
"num_input_tokens_seen": 5977248,
"step": 9575
},
{
"epoch": 19.23694779116466,
"grad_norm": 0.011636001989245415,
"learning_rate": 4.450826165385336e-08,
"loss": 0.0,
"num_input_tokens_seen": 5980704,
"step": 9580
},
{
"epoch": 19.246987951807228,
"grad_norm": 0.0016436435980722308,
"learning_rate": 4.33494110348609e-08,
"loss": 0.0,
"num_input_tokens_seen": 5983936,
"step": 9585
},
{
"epoch": 19.2570281124498,
"grad_norm": 0.03811986371874809,
"learning_rate": 4.2205780752301865e-08,
"loss": 0.0,
"num_input_tokens_seen": 5987424,
"step": 9590
},
{
"epoch": 19.26706827309237,
"grad_norm": 0.001627400633879006,
"learning_rate": 4.107737431791159e-08,
"loss": 0.0,
"num_input_tokens_seen": 5990112,
"step": 9595
},
{
"epoch": 19.27710843373494,
"grad_norm": 0.002810975071042776,
"learning_rate": 3.996419519667505e-08,
"loss": 0.0,
"num_input_tokens_seen": 5993472,
"step": 9600
},
{
"epoch": 19.28714859437751,
"grad_norm": 0.0020102846901863813,
"learning_rate": 3.8866246806821273e-08,
"loss": 0.0,
"num_input_tokens_seen": 5996320,
"step": 9605
},
{
"epoch": 19.29718875502008,
"grad_norm": 0.0016769138164818287,
"learning_rate": 3.7783532519808376e-08,
"loss": 0.0,
"num_input_tokens_seen": 5999360,
"step": 9610
},
{
"epoch": 19.30722891566265,
"grad_norm": 0.0018848153995350003,
"learning_rate": 3.671605566031633e-08,
"loss": 0.0,
"num_input_tokens_seen": 6002016,
"step": 9615
},
{
"epoch": 19.31726907630522,
"grad_norm": 0.0016581976087763906,
"learning_rate": 3.566381950623588e-08,
"loss": 0.0,
"num_input_tokens_seen": 6004448,
"step": 9620
},
{
"epoch": 19.32730923694779,
"grad_norm": 0.0015075030969455838,
"learning_rate": 3.462682728865685e-08,
"loss": 0.0,
"num_input_tokens_seen": 6007392,
"step": 9625
},
{
"epoch": 19.337349397590362,
"grad_norm": 0.011512084864079952,
"learning_rate": 3.3605082191860985e-08,
"loss": 0.0,
"num_input_tokens_seen": 6010176,
"step": 9630
},
{
"epoch": 19.347389558232933,
"grad_norm": 0.0016901310300454497,
"learning_rate": 3.259858735331134e-08,
"loss": 0.0,
"num_input_tokens_seen": 6013120,
"step": 9635
},
{
"epoch": 19.3574297188755,
"grad_norm": 0.027195386588573456,
"learning_rate": 3.1607345863640114e-08,
"loss": 0.0,
"num_input_tokens_seen": 6015296,
"step": 9640
},
{
"epoch": 19.367469879518072,
"grad_norm": 0.02286182902753353,
"learning_rate": 3.063136076664364e-08,
"loss": 0.0001,
"num_input_tokens_seen": 6018016,
"step": 9645
},
{
"epoch": 19.377510040160644,
"grad_norm": 17.473379135131836,
"learning_rate": 2.967063505926848e-08,
"loss": 0.0474,
"num_input_tokens_seen": 6021408,
"step": 9650
},
{
"epoch": 19.38755020080321,
"grad_norm": 0.001090511097572744,
"learning_rate": 2.8725171691605934e-08,
"loss": 0.0,
"num_input_tokens_seen": 6024544,
"step": 9655
},
{
"epoch": 19.397590361445783,
"grad_norm": 0.007710614707320929,
"learning_rate": 2.7794973566880323e-08,
"loss": 0.0,
"num_input_tokens_seen": 6027872,
"step": 9660
},
{
"epoch": 19.407630522088354,
"grad_norm": 0.012881418690085411,
"learning_rate": 2.6880043541441804e-08,
"loss": 0.0,
"num_input_tokens_seen": 6030784,
"step": 9665
},
{
"epoch": 19.417670682730925,
"grad_norm": 0.0014010410523042083,
"learning_rate": 2.5980384424756366e-08,
"loss": 0.0,
"num_input_tokens_seen": 6034208,
"step": 9670
},
{
"epoch": 19.427710843373493,
"grad_norm": 0.00728964526206255,
"learning_rate": 2.5095998979398628e-08,
"loss": 0.0001,
"num_input_tokens_seen": 6037632,
"step": 9675
},
{
"epoch": 19.437751004016064,
"grad_norm": 0.0008074513752944767,
"learning_rate": 2.4226889921041273e-08,
"loss": 0.0,
"num_input_tokens_seen": 6040928,
"step": 9680
},
{
"epoch": 19.447791164658636,
"grad_norm": 0.0009832337964326143,
"learning_rate": 2.3373059918448958e-08,
"loss": 0.0,
"num_input_tokens_seen": 6044096,
"step": 9685
},
{
"epoch": 19.457831325301203,
"grad_norm": 0.03476724773645401,
"learning_rate": 2.2534511593468866e-08,
"loss": 0.0,
"num_input_tokens_seen": 6047456,
"step": 9690
},
{
"epoch": 19.467871485943775,
"grad_norm": 0.2634872794151306,
"learning_rate": 2.171124752102238e-08,
"loss": 0.0,
"num_input_tokens_seen": 6049856,
"step": 9695
},
{
"epoch": 19.477911646586346,
"grad_norm": 0.001191947259940207,
"learning_rate": 2.0903270229098992e-08,
"loss": 0.0,
"num_input_tokens_seen": 6052704,
"step": 9700
},
{
"epoch": 19.487951807228917,
"grad_norm": 0.01017684955149889,
"learning_rate": 2.0110582198745177e-08,
"loss": 0.0,
"num_input_tokens_seen": 6055456,
"step": 9705
},
{
"epoch": 19.497991967871485,
"grad_norm": 0.0016589167062193155,
"learning_rate": 1.9333185864061077e-08,
"loss": 0.0,
"num_input_tokens_seen": 6058304,
"step": 9710
},
{
"epoch": 19.508032128514056,
"grad_norm": 0.0013716747052967548,
"learning_rate": 1.8571083612188845e-08,
"loss": 0.0,
"num_input_tokens_seen": 6061440,
"step": 9715
},
{
"epoch": 19.518072289156628,
"grad_norm": 0.005404317285865545,
"learning_rate": 1.7824277783308197e-08,
"loss": 0.0,
"num_input_tokens_seen": 6065024,
"step": 9720
},
{
"epoch": 19.528112449799195,
"grad_norm": 0.02352655865252018,
"learning_rate": 1.7092770670628644e-08,
"loss": 0.0,
"num_input_tokens_seen": 6068256,
"step": 9725
},
{
"epoch": 19.538152610441767,
"grad_norm": 0.003068252932280302,
"learning_rate": 1.637656452038172e-08,
"loss": 0.0,
"num_input_tokens_seen": 6071200,
"step": 9730
},
{
"epoch": 19.548192771084338,
"grad_norm": 0.0032745555508881807,
"learning_rate": 1.5675661531813215e-08,
"loss": 0.0,
"num_input_tokens_seen": 6074656,
"step": 9735
},
{
"epoch": 19.55823293172691,
"grad_norm": 0.009923930279910564,
"learning_rate": 1.4990063857180383e-08,
"loss": 0.0,
"num_input_tokens_seen": 6077408,
"step": 9740
},
{
"epoch": 19.568273092369477,
"grad_norm": 0.00499066011980176,
"learning_rate": 1.431977360173975e-08,
"loss": 0.0,
"num_input_tokens_seen": 6080352,
"step": 9745
},
{
"epoch": 19.57831325301205,
"grad_norm": 0.002552691148594022,
"learning_rate": 1.3664792823745442e-08,
"loss": 0.0,
"num_input_tokens_seen": 6082848,
"step": 9750
},
{
"epoch": 19.58835341365462,
"grad_norm": 0.0025797574780881405,
"learning_rate": 1.3025123534440299e-08,
"loss": 0.0,
"num_input_tokens_seen": 6085664,
"step": 9755
},
{
"epoch": 19.598393574297187,
"grad_norm": 0.0038030825089663267,
"learning_rate": 1.240076769804921e-08,
"loss": 0.0,
"num_input_tokens_seen": 6088608,
"step": 9760
},
{
"epoch": 19.60843373493976,
"grad_norm": 0.022904515266418457,
"learning_rate": 1.1791727231776906e-08,
"loss": 0.0,
"num_input_tokens_seen": 6091296,
"step": 9765
},
{
"epoch": 19.61847389558233,
"grad_norm": 0.0012896016705781221,
"learning_rate": 1.1198004005796847e-08,
"loss": 0.0,
"num_input_tokens_seen": 6094880,
"step": 9770
},
{
"epoch": 19.6285140562249,
"grad_norm": 0.024325761944055557,
"learning_rate": 1.0619599843249006e-08,
"loss": 0.0,
"num_input_tokens_seen": 6098208,
"step": 9775
},
{
"epoch": 19.63855421686747,
"grad_norm": 0.0039957738481462,
"learning_rate": 1.0056516520232651e-08,
"loss": 0.0,
"num_input_tokens_seen": 6101024,
"step": 9780
},
{
"epoch": 19.64859437751004,
"grad_norm": 0.012007399462163448,
"learning_rate": 9.508755765802457e-09,
"loss": 0.0,
"num_input_tokens_seen": 6103904,
"step": 9785
},
{
"epoch": 19.65863453815261,
"grad_norm": 0.0020268342923372984,
"learning_rate": 8.976319261962407e-09,
"loss": 0.0,
"num_input_tokens_seen": 6106816,
"step": 9790
},
{
"epoch": 19.66867469879518,
"grad_norm": 0.004168129526078701,
"learning_rate": 8.459208643659122e-09,
"loss": 0.0,
"num_input_tokens_seen": 6110368,
"step": 9795
},
{
"epoch": 19.67871485943775,
"grad_norm": 0.003230429720133543,
"learning_rate": 7.957425498778537e-09,
"loss": 0.0,
"num_input_tokens_seen": 6113856,
"step": 9800
},
{
"epoch": 19.688755020080322,
"grad_norm": 0.01263430342078209,
"learning_rate": 7.470971368142011e-09,
"loss": 0.0,
"num_input_tokens_seen": 6116448,
"step": 9805
},
{
"epoch": 19.698795180722893,
"grad_norm": 0.0028617747593671083,
"learning_rate": 6.999847745498556e-09,
"loss": 0.0,
"num_input_tokens_seen": 6120096,
"step": 9810
},
{
"epoch": 19.70883534136546,
"grad_norm": 0.0074871014803647995,
"learning_rate": 6.544056077523175e-09,
"loss": 0.0,
"num_input_tokens_seen": 6123008,
"step": 9815
},
{
"epoch": 19.718875502008032,
"grad_norm": 0.004571064841002226,
"learning_rate": 6.1035977638101985e-09,
"loss": 0.0,
"num_input_tokens_seen": 6126720,
"step": 9820
},
{
"epoch": 19.728915662650603,
"grad_norm": 0.0010783788748085499,
"learning_rate": 5.678474156871061e-09,
"loss": 0.0,
"num_input_tokens_seen": 6129760,
"step": 9825
},
{
"epoch": 19.73895582329317,
"grad_norm": 0.0013977461494505405,
"learning_rate": 5.268686562127645e-09,
"loss": 0.0,
"num_input_tokens_seen": 6133344,
"step": 9830
},
{
"epoch": 19.748995983935743,
"grad_norm": 0.00812604185193777,
"learning_rate": 4.874236237911723e-09,
"loss": 0.0,
"num_input_tokens_seen": 6136576,
"step": 9835
},
{
"epoch": 19.759036144578314,
"grad_norm": 0.0008148096385411918,
"learning_rate": 4.495124395456629e-09,
"loss": 0.0,
"num_input_tokens_seen": 6139136,
"step": 9840
},
{
"epoch": 19.769076305220885,
"grad_norm": 0.006598465144634247,
"learning_rate": 4.1313521988983754e-09,
"loss": 0.0,
"num_input_tokens_seen": 6142240,
"step": 9845
},
{
"epoch": 19.779116465863453,
"grad_norm": 0.005599440075457096,
"learning_rate": 3.7829207652673175e-09,
"loss": 0.0,
"num_input_tokens_seen": 6145088,
"step": 9850
},
{
"epoch": 19.789156626506024,
"grad_norm": 0.018151333555579185,
"learning_rate": 3.44983116448927e-09,
"loss": 0.0,
"num_input_tokens_seen": 6147968,
"step": 9855
},
{
"epoch": 19.799196787148595,
"grad_norm": 0.015009772963821888,
"learning_rate": 3.1320844193788445e-09,
"loss": 0.0,
"num_input_tokens_seen": 6151296,
"step": 9860
},
{
"epoch": 19.809236947791163,
"grad_norm": 0.0013042399659752846,
"learning_rate": 2.8296815056377824e-09,
"loss": 0.0,
"num_input_tokens_seen": 6154880,
"step": 9865
},
{
"epoch": 19.819277108433734,
"grad_norm": 0.012515922077000141,
"learning_rate": 2.54262335185107e-09,
"loss": 0.0,
"num_input_tokens_seen": 6158528,
"step": 9870
},
{
"epoch": 19.829317269076306,
"grad_norm": 0.0033991907257586718,
"learning_rate": 2.2709108394863845e-09,
"loss": 0.0,
"num_input_tokens_seen": 6161600,
"step": 9875
},
{
"epoch": 19.839357429718877,
"grad_norm": 0.0012220785720273852,
"learning_rate": 2.0145448028874305e-09,
"loss": 0.0002,
"num_input_tokens_seen": 6164288,
"step": 9880
},
{
"epoch": 19.849397590361445,
"grad_norm": 0.0031056124716997147,
"learning_rate": 1.7735260292750522e-09,
"loss": 0.0,
"num_input_tokens_seen": 6167904,
"step": 9885
},
{
"epoch": 19.859437751004016,
"grad_norm": 0.00221474701538682,
"learning_rate": 1.547855258743347e-09,
"loss": 0.0,
"num_input_tokens_seen": 6171360,
"step": 9890
},
{
"epoch": 19.869477911646587,
"grad_norm": 0.006416819524019957,
"learning_rate": 1.3375331842574446e-09,
"loss": 0.0,
"num_input_tokens_seen": 6174848,
"step": 9895
},
{
"epoch": 19.879518072289155,
"grad_norm": 0.011413169093430042,
"learning_rate": 1.1425604516512868e-09,
"loss": 0.0,
"num_input_tokens_seen": 6177472,
"step": 9900
},
{
"epoch": 19.889558232931726,
"grad_norm": 0.04103207588195801,
"learning_rate": 9.629376596248518e-10,
"loss": 0.0,
"num_input_tokens_seen": 6180576,
"step": 9905
},
{
"epoch": 19.899598393574298,
"grad_norm": 0.003425066592171788,
"learning_rate": 7.986653597447102e-10,
"loss": 0.0,
"num_input_tokens_seen": 6183520,
"step": 9910
},
{
"epoch": 19.90963855421687,
"grad_norm": 0.00264586228877306,
"learning_rate": 6.497440564395829e-10,
"loss": 0.0,
"num_input_tokens_seen": 6188000,
"step": 9915
},
{
"epoch": 19.919678714859437,
"grad_norm": 0.00533295376226306,
"learning_rate": 5.161742070014519e-10,
"loss": 0.0,
"num_input_tokens_seen": 6190560,
"step": 9920
},
{
"epoch": 19.929718875502008,
"grad_norm": 0.0016277596587315202,
"learning_rate": 3.9795622158111945e-10,
"loss": 0.0,
"num_input_tokens_seen": 6194080,
"step": 9925
},
{
"epoch": 19.93975903614458,
"grad_norm": 0.004762938711792231,
"learning_rate": 2.950904631893181e-10,
"loss": 0.0,
"num_input_tokens_seen": 6197728,
"step": 9930
},
{
"epoch": 19.949799196787147,
"grad_norm": 0.0040444061160087585,
"learning_rate": 2.0757724769560062e-10,
"loss": 0.0,
"num_input_tokens_seen": 6201088,
"step": 9935
},
{
"epoch": 19.95983935742972,
"grad_norm": 0.05396367609500885,
"learning_rate": 1.354168438255643e-10,
"loss": 0.0,
"num_input_tokens_seen": 6204672,
"step": 9940
},
{
"epoch": 19.96987951807229,
"grad_norm": 0.0035722742322832346,
"learning_rate": 7.860947316140621e-11,
"loss": 0.0,
"num_input_tokens_seen": 6207360,
"step": 9945
},
{
"epoch": 19.97991967871486,
"grad_norm": 0.0020782332867383957,
"learning_rate": 3.715531014025775e-11,
"loss": 0.0,
"num_input_tokens_seen": 6210368,
"step": 9950
},
{
"epoch": 19.98995983935743,
"grad_norm": 0.02569785714149475,
"learning_rate": 1.1054482056405136e-11,
"loss": 0.0,
"num_input_tokens_seen": 6212800,
"step": 9955
},
{
"epoch": 20.0,
"grad_norm": 0.0010948892449960113,
"learning_rate": 3.0706905573829603e-13,
"loss": 0.0001,
"num_input_tokens_seen": 6215968,
"step": 9960
},
{
"epoch": 20.0,
"eval_loss": 1.102670431137085,
"eval_runtime": 8.0733,
"eval_samples_per_second": 61.685,
"eval_steps_per_second": 15.483,
"num_input_tokens_seen": 6215968,
"step": 9960
},
{
"epoch": 20.0,
"num_input_tokens_seen": 6215968,
"step": 9960,
"total_flos": 2.7990222962137498e+17,
"train_loss": 0.14292573261673278,
"train_runtime": 1559.9375,
"train_samples_per_second": 25.539,
"train_steps_per_second": 6.385
}
],
"logging_steps": 5,
"max_steps": 9960,
"num_input_tokens_seen": 6215968,
"num_train_epochs": 20,
"save_steps": 996,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.7990222962137498e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}