time-extraction / trainer_state.json
datasetsANDmodels's picture
Upload 11 files
707824c verified
raw
history blame
408 kB
{
"best_metric": 0.051427390426397324,
"best_model_checkpoint": "time_base/checkpoint-2340",
"epoch": 20.0,
"eval_steps": 500,
"global_step": 2340,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008547008547008548,
"grad_norm": 221.6373748779297,
"learning_rate": 9.995726495726496e-06,
"loss": 37.5765,
"step": 1
},
{
"epoch": 0.017094017094017096,
"grad_norm": 219.50563049316406,
"learning_rate": 9.991452991452993e-06,
"loss": 38.6173,
"step": 2
},
{
"epoch": 0.02564102564102564,
"grad_norm": 180.23829650878906,
"learning_rate": 9.987179487179488e-06,
"loss": 40.3853,
"step": 3
},
{
"epoch": 0.03418803418803419,
"grad_norm": 166.3365478515625,
"learning_rate": 9.982905982905984e-06,
"loss": 35.9724,
"step": 4
},
{
"epoch": 0.042735042735042736,
"grad_norm": 199.6571044921875,
"learning_rate": 9.97863247863248e-06,
"loss": 35.0186,
"step": 5
},
{
"epoch": 0.05128205128205128,
"grad_norm": 180.9748992919922,
"learning_rate": 9.974358974358974e-06,
"loss": 39.3679,
"step": 6
},
{
"epoch": 0.05982905982905983,
"grad_norm": 200.05496215820312,
"learning_rate": 9.970085470085471e-06,
"loss": 37.1519,
"step": 7
},
{
"epoch": 0.06837606837606838,
"grad_norm": 154.3177032470703,
"learning_rate": 9.965811965811966e-06,
"loss": 33.9309,
"step": 8
},
{
"epoch": 0.07692307692307693,
"grad_norm": 198.05914306640625,
"learning_rate": 9.961538461538463e-06,
"loss": 34.8814,
"step": 9
},
{
"epoch": 0.08547008547008547,
"grad_norm": 168.3035430908203,
"learning_rate": 9.957264957264958e-06,
"loss": 33.184,
"step": 10
},
{
"epoch": 0.09401709401709402,
"grad_norm": 201.83705139160156,
"learning_rate": 9.952991452991455e-06,
"loss": 35.4025,
"step": 11
},
{
"epoch": 0.10256410256410256,
"grad_norm": 224.4587860107422,
"learning_rate": 9.94871794871795e-06,
"loss": 39.222,
"step": 12
},
{
"epoch": 0.1111111111111111,
"grad_norm": 192.1949005126953,
"learning_rate": 9.944444444444445e-06,
"loss": 37.1982,
"step": 13
},
{
"epoch": 0.11965811965811966,
"grad_norm": 193.05662536621094,
"learning_rate": 9.940170940170942e-06,
"loss": 38.1325,
"step": 14
},
{
"epoch": 0.1282051282051282,
"grad_norm": 150.61575317382812,
"learning_rate": 9.935897435897437e-06,
"loss": 34.8682,
"step": 15
},
{
"epoch": 0.13675213675213677,
"grad_norm": 170.1510772705078,
"learning_rate": 9.931623931623933e-06,
"loss": 33.3652,
"step": 16
},
{
"epoch": 0.1452991452991453,
"grad_norm": 193.86875915527344,
"learning_rate": 9.927350427350428e-06,
"loss": 35.0785,
"step": 17
},
{
"epoch": 0.15384615384615385,
"grad_norm": 164.41986083984375,
"learning_rate": 9.923076923076923e-06,
"loss": 31.9719,
"step": 18
},
{
"epoch": 0.1623931623931624,
"grad_norm": 166.08953857421875,
"learning_rate": 9.91880341880342e-06,
"loss": 34.5398,
"step": 19
},
{
"epoch": 0.17094017094017094,
"grad_norm": 152.2139892578125,
"learning_rate": 9.914529914529915e-06,
"loss": 36.9092,
"step": 20
},
{
"epoch": 0.1794871794871795,
"grad_norm": 198.23095703125,
"learning_rate": 9.910256410256412e-06,
"loss": 35.6744,
"step": 21
},
{
"epoch": 0.18803418803418803,
"grad_norm": 174.7784881591797,
"learning_rate": 9.905982905982907e-06,
"loss": 32.8258,
"step": 22
},
{
"epoch": 0.19658119658119658,
"grad_norm": 133.69859313964844,
"learning_rate": 9.901709401709402e-06,
"loss": 31.431,
"step": 23
},
{
"epoch": 0.20512820512820512,
"grad_norm": 217.17169189453125,
"learning_rate": 9.897435897435899e-06,
"loss": 38.5649,
"step": 24
},
{
"epoch": 0.21367521367521367,
"grad_norm": 172.4914093017578,
"learning_rate": 9.893162393162394e-06,
"loss": 33.9858,
"step": 25
},
{
"epoch": 0.2222222222222222,
"grad_norm": 186.39654541015625,
"learning_rate": 9.88888888888889e-06,
"loss": 32.8029,
"step": 26
},
{
"epoch": 0.23076923076923078,
"grad_norm": 183.65159606933594,
"learning_rate": 9.884615384615386e-06,
"loss": 35.8633,
"step": 27
},
{
"epoch": 0.23931623931623933,
"grad_norm": 228.352294921875,
"learning_rate": 9.880341880341882e-06,
"loss": 35.0285,
"step": 28
},
{
"epoch": 0.24786324786324787,
"grad_norm": 156.77906799316406,
"learning_rate": 9.876068376068377e-06,
"loss": 29.2608,
"step": 29
},
{
"epoch": 0.2564102564102564,
"grad_norm": 232.8336181640625,
"learning_rate": 9.871794871794872e-06,
"loss": 35.0349,
"step": 30
},
{
"epoch": 0.26495726495726496,
"grad_norm": 248.63247680664062,
"learning_rate": 9.86752136752137e-06,
"loss": 34.5067,
"step": 31
},
{
"epoch": 0.27350427350427353,
"grad_norm": 183.5840606689453,
"learning_rate": 9.863247863247864e-06,
"loss": 30.4758,
"step": 32
},
{
"epoch": 0.28205128205128205,
"grad_norm": 160.54530334472656,
"learning_rate": 9.858974358974361e-06,
"loss": 31.7959,
"step": 33
},
{
"epoch": 0.2905982905982906,
"grad_norm": 199.88156127929688,
"learning_rate": 9.854700854700856e-06,
"loss": 35.6482,
"step": 34
},
{
"epoch": 0.29914529914529914,
"grad_norm": 272.9530029296875,
"learning_rate": 9.850427350427351e-06,
"loss": 33.0804,
"step": 35
},
{
"epoch": 0.3076923076923077,
"grad_norm": 200.0990447998047,
"learning_rate": 9.846153846153848e-06,
"loss": 33.2675,
"step": 36
},
{
"epoch": 0.3162393162393162,
"grad_norm": 202.014404296875,
"learning_rate": 9.841880341880343e-06,
"loss": 30.8991,
"step": 37
},
{
"epoch": 0.3247863247863248,
"grad_norm": 181.14865112304688,
"learning_rate": 9.837606837606838e-06,
"loss": 32.3643,
"step": 38
},
{
"epoch": 0.3333333333333333,
"grad_norm": 134.43423461914062,
"learning_rate": 9.833333333333333e-06,
"loss": 30.8094,
"step": 39
},
{
"epoch": 0.3418803418803419,
"grad_norm": 155.96640014648438,
"learning_rate": 9.82905982905983e-06,
"loss": 31.7564,
"step": 40
},
{
"epoch": 0.3504273504273504,
"grad_norm": 146.9285888671875,
"learning_rate": 9.824786324786325e-06,
"loss": 31.9905,
"step": 41
},
{
"epoch": 0.358974358974359,
"grad_norm": 159.67974853515625,
"learning_rate": 9.820512820512821e-06,
"loss": 32.5029,
"step": 42
},
{
"epoch": 0.36752136752136755,
"grad_norm": 172.4975128173828,
"learning_rate": 9.816239316239316e-06,
"loss": 31.2049,
"step": 43
},
{
"epoch": 0.37606837606837606,
"grad_norm": 148.97573852539062,
"learning_rate": 9.811965811965812e-06,
"loss": 27.1673,
"step": 44
},
{
"epoch": 0.38461538461538464,
"grad_norm": 115.93009185791016,
"learning_rate": 9.807692307692308e-06,
"loss": 30.3342,
"step": 45
},
{
"epoch": 0.39316239316239315,
"grad_norm": 184.13145446777344,
"learning_rate": 9.803418803418803e-06,
"loss": 32.317,
"step": 46
},
{
"epoch": 0.4017094017094017,
"grad_norm": 139.3995361328125,
"learning_rate": 9.7991452991453e-06,
"loss": 29.9643,
"step": 47
},
{
"epoch": 0.41025641025641024,
"grad_norm": 184.97996520996094,
"learning_rate": 9.794871794871795e-06,
"loss": 30.6427,
"step": 48
},
{
"epoch": 0.4188034188034188,
"grad_norm": 120.04417419433594,
"learning_rate": 9.790598290598292e-06,
"loss": 26.9772,
"step": 49
},
{
"epoch": 0.42735042735042733,
"grad_norm": 183.2873077392578,
"learning_rate": 9.786324786324787e-06,
"loss": 31.6688,
"step": 50
},
{
"epoch": 0.4358974358974359,
"grad_norm": 206.44898986816406,
"learning_rate": 9.782051282051282e-06,
"loss": 32.0574,
"step": 51
},
{
"epoch": 0.4444444444444444,
"grad_norm": 180.7601318359375,
"learning_rate": 9.777777777777779e-06,
"loss": 31.2178,
"step": 52
},
{
"epoch": 0.452991452991453,
"grad_norm": 150.44012451171875,
"learning_rate": 9.773504273504274e-06,
"loss": 29.9826,
"step": 53
},
{
"epoch": 0.46153846153846156,
"grad_norm": 119.02840423583984,
"learning_rate": 9.76923076923077e-06,
"loss": 26.876,
"step": 54
},
{
"epoch": 0.4700854700854701,
"grad_norm": 164.58209228515625,
"learning_rate": 9.764957264957265e-06,
"loss": 28.1059,
"step": 55
},
{
"epoch": 0.47863247863247865,
"grad_norm": 160.416259765625,
"learning_rate": 9.76068376068376e-06,
"loss": 28.7022,
"step": 56
},
{
"epoch": 0.48717948717948717,
"grad_norm": 177.29747009277344,
"learning_rate": 9.756410256410257e-06,
"loss": 30.7275,
"step": 57
},
{
"epoch": 0.49572649572649574,
"grad_norm": 153.59686279296875,
"learning_rate": 9.752136752136752e-06,
"loss": 28.5575,
"step": 58
},
{
"epoch": 0.5042735042735043,
"grad_norm": 155.79617309570312,
"learning_rate": 9.747863247863249e-06,
"loss": 28.1139,
"step": 59
},
{
"epoch": 0.5128205128205128,
"grad_norm": 173.02581787109375,
"learning_rate": 9.743589743589744e-06,
"loss": 30.4744,
"step": 60
},
{
"epoch": 0.5213675213675214,
"grad_norm": 125.31639862060547,
"learning_rate": 9.739316239316239e-06,
"loss": 26.5559,
"step": 61
},
{
"epoch": 0.5299145299145299,
"grad_norm": 149.00302124023438,
"learning_rate": 9.735042735042736e-06,
"loss": 30.4065,
"step": 62
},
{
"epoch": 0.5384615384615384,
"grad_norm": 101.76395416259766,
"learning_rate": 9.730769230769231e-06,
"loss": 25.8895,
"step": 63
},
{
"epoch": 0.5470085470085471,
"grad_norm": 134.40159606933594,
"learning_rate": 9.726495726495728e-06,
"loss": 26.9317,
"step": 64
},
{
"epoch": 0.5555555555555556,
"grad_norm": 151.01914978027344,
"learning_rate": 9.722222222222223e-06,
"loss": 27.9913,
"step": 65
},
{
"epoch": 0.5641025641025641,
"grad_norm": 124.92068481445312,
"learning_rate": 9.71794871794872e-06,
"loss": 26.7874,
"step": 66
},
{
"epoch": 0.5726495726495726,
"grad_norm": 131.29762268066406,
"learning_rate": 9.713675213675214e-06,
"loss": 27.4047,
"step": 67
},
{
"epoch": 0.5811965811965812,
"grad_norm": 154.37120056152344,
"learning_rate": 9.70940170940171e-06,
"loss": 26.6812,
"step": 68
},
{
"epoch": 0.5897435897435898,
"grad_norm": 86.31095886230469,
"learning_rate": 9.705128205128206e-06,
"loss": 22.9869,
"step": 69
},
{
"epoch": 0.5982905982905983,
"grad_norm": 224.42613220214844,
"learning_rate": 9.700854700854701e-06,
"loss": 28.4812,
"step": 70
},
{
"epoch": 0.6068376068376068,
"grad_norm": 156.15228271484375,
"learning_rate": 9.696581196581198e-06,
"loss": 26.1761,
"step": 71
},
{
"epoch": 0.6153846153846154,
"grad_norm": 117.7806167602539,
"learning_rate": 9.692307692307693e-06,
"loss": 20.7307,
"step": 72
},
{
"epoch": 0.6239316239316239,
"grad_norm": 169.99154663085938,
"learning_rate": 9.688034188034188e-06,
"loss": 27.6369,
"step": 73
},
{
"epoch": 0.6324786324786325,
"grad_norm": 98.81549072265625,
"learning_rate": 9.683760683760685e-06,
"loss": 24.5898,
"step": 74
},
{
"epoch": 0.6410256410256411,
"grad_norm": 199.0179443359375,
"learning_rate": 9.67948717948718e-06,
"loss": 27.664,
"step": 75
},
{
"epoch": 0.6495726495726496,
"grad_norm": 129.81033325195312,
"learning_rate": 9.675213675213677e-06,
"loss": 25.2547,
"step": 76
},
{
"epoch": 0.6581196581196581,
"grad_norm": 140.1121826171875,
"learning_rate": 9.670940170940172e-06,
"loss": 27.4914,
"step": 77
},
{
"epoch": 0.6666666666666666,
"grad_norm": 139.8365478515625,
"learning_rate": 9.666666666666667e-06,
"loss": 24.0178,
"step": 78
},
{
"epoch": 0.6752136752136753,
"grad_norm": 147.24945068359375,
"learning_rate": 9.662393162393163e-06,
"loss": 27.1404,
"step": 79
},
{
"epoch": 0.6837606837606838,
"grad_norm": 165.67242431640625,
"learning_rate": 9.658119658119659e-06,
"loss": 25.6604,
"step": 80
},
{
"epoch": 0.6923076923076923,
"grad_norm": 114.36772918701172,
"learning_rate": 9.653846153846155e-06,
"loss": 24.3695,
"step": 81
},
{
"epoch": 0.7008547008547008,
"grad_norm": 149.76258850097656,
"learning_rate": 9.64957264957265e-06,
"loss": 26.5265,
"step": 82
},
{
"epoch": 0.7094017094017094,
"grad_norm": 121.9085693359375,
"learning_rate": 9.645299145299147e-06,
"loss": 25.7008,
"step": 83
},
{
"epoch": 0.717948717948718,
"grad_norm": 106.49151611328125,
"learning_rate": 9.641025641025642e-06,
"loss": 20.5777,
"step": 84
},
{
"epoch": 0.7264957264957265,
"grad_norm": 114.2357406616211,
"learning_rate": 9.636752136752137e-06,
"loss": 23.3429,
"step": 85
},
{
"epoch": 0.7350427350427351,
"grad_norm": 107.38651275634766,
"learning_rate": 9.632478632478634e-06,
"loss": 24.6408,
"step": 86
},
{
"epoch": 0.7435897435897436,
"grad_norm": 120.4283218383789,
"learning_rate": 9.628205128205129e-06,
"loss": 23.4563,
"step": 87
},
{
"epoch": 0.7521367521367521,
"grad_norm": 165.21783447265625,
"learning_rate": 9.623931623931626e-06,
"loss": 25.878,
"step": 88
},
{
"epoch": 0.7606837606837606,
"grad_norm": 105.8712387084961,
"learning_rate": 9.61965811965812e-06,
"loss": 23.605,
"step": 89
},
{
"epoch": 0.7692307692307693,
"grad_norm": 147.31253051757812,
"learning_rate": 9.615384615384616e-06,
"loss": 24.537,
"step": 90
},
{
"epoch": 0.7777777777777778,
"grad_norm": 127.37718963623047,
"learning_rate": 9.611111111111112e-06,
"loss": 24.6762,
"step": 91
},
{
"epoch": 0.7863247863247863,
"grad_norm": 139.40553283691406,
"learning_rate": 9.606837606837607e-06,
"loss": 23.6076,
"step": 92
},
{
"epoch": 0.7948717948717948,
"grad_norm": 218.39170837402344,
"learning_rate": 9.602564102564104e-06,
"loss": 25.2559,
"step": 93
},
{
"epoch": 0.8034188034188035,
"grad_norm": 115.83401489257812,
"learning_rate": 9.5982905982906e-06,
"loss": 23.6758,
"step": 94
},
{
"epoch": 0.811965811965812,
"grad_norm": 115.8538818359375,
"learning_rate": 9.594017094017094e-06,
"loss": 24.2789,
"step": 95
},
{
"epoch": 0.8205128205128205,
"grad_norm": 122.31534576416016,
"learning_rate": 9.589743589743591e-06,
"loss": 23.5114,
"step": 96
},
{
"epoch": 0.8290598290598291,
"grad_norm": 171.58558654785156,
"learning_rate": 9.585470085470086e-06,
"loss": 24.7028,
"step": 97
},
{
"epoch": 0.8376068376068376,
"grad_norm": 113.29806518554688,
"learning_rate": 9.581196581196583e-06,
"loss": 24.9667,
"step": 98
},
{
"epoch": 0.8461538461538461,
"grad_norm": 183.74928283691406,
"learning_rate": 9.576923076923078e-06,
"loss": 24.7776,
"step": 99
},
{
"epoch": 0.8547008547008547,
"grad_norm": 139.84701538085938,
"learning_rate": 9.572649572649575e-06,
"loss": 22.1558,
"step": 100
},
{
"epoch": 0.8632478632478633,
"grad_norm": 145.9014129638672,
"learning_rate": 9.56837606837607e-06,
"loss": 23.0282,
"step": 101
},
{
"epoch": 0.8717948717948718,
"grad_norm": 195.9859619140625,
"learning_rate": 9.564102564102565e-06,
"loss": 23.7194,
"step": 102
},
{
"epoch": 0.8803418803418803,
"grad_norm": 70.51985168457031,
"learning_rate": 9.559829059829061e-06,
"loss": 16.9605,
"step": 103
},
{
"epoch": 0.8888888888888888,
"grad_norm": 184.04209899902344,
"learning_rate": 9.555555555555556e-06,
"loss": 23.4229,
"step": 104
},
{
"epoch": 0.8974358974358975,
"grad_norm": 177.86727905273438,
"learning_rate": 9.551282051282053e-06,
"loss": 23.6004,
"step": 105
},
{
"epoch": 0.905982905982906,
"grad_norm": 154.30784606933594,
"learning_rate": 9.547008547008548e-06,
"loss": 21.6725,
"step": 106
},
{
"epoch": 0.9145299145299145,
"grad_norm": 104.27069854736328,
"learning_rate": 9.542735042735043e-06,
"loss": 22.856,
"step": 107
},
{
"epoch": 0.9230769230769231,
"grad_norm": 157.4270477294922,
"learning_rate": 9.53846153846154e-06,
"loss": 24.398,
"step": 108
},
{
"epoch": 0.9316239316239316,
"grad_norm": 123.56739807128906,
"learning_rate": 9.534188034188035e-06,
"loss": 20.6925,
"step": 109
},
{
"epoch": 0.9401709401709402,
"grad_norm": 106.64054870605469,
"learning_rate": 9.52991452991453e-06,
"loss": 23.5794,
"step": 110
},
{
"epoch": 0.9487179487179487,
"grad_norm": 88.68234252929688,
"learning_rate": 9.525641025641025e-06,
"loss": 20.729,
"step": 111
},
{
"epoch": 0.9572649572649573,
"grad_norm": 91.86422729492188,
"learning_rate": 9.521367521367522e-06,
"loss": 18.7701,
"step": 112
},
{
"epoch": 0.9658119658119658,
"grad_norm": 118.74354553222656,
"learning_rate": 9.517094017094017e-06,
"loss": 20.8439,
"step": 113
},
{
"epoch": 0.9743589743589743,
"grad_norm": 120.72904968261719,
"learning_rate": 9.512820512820514e-06,
"loss": 21.1903,
"step": 114
},
{
"epoch": 0.9829059829059829,
"grad_norm": 107.36665344238281,
"learning_rate": 9.508547008547009e-06,
"loss": 19.3457,
"step": 115
},
{
"epoch": 0.9914529914529915,
"grad_norm": 131.74441528320312,
"learning_rate": 9.504273504273504e-06,
"loss": 21.4035,
"step": 116
},
{
"epoch": 1.0,
"grad_norm": 161.97703552246094,
"learning_rate": 9.5e-06,
"loss": 22.3831,
"step": 117
},
{
"epoch": 1.0,
"eval_loss": 17.230430603027344,
"eval_runtime": 9.9187,
"eval_samples_per_second": 46.982,
"eval_steps_per_second": 5.948,
"step": 117
},
{
"epoch": 1.0085470085470085,
"grad_norm": 109.44770050048828,
"learning_rate": 9.495726495726496e-06,
"loss": 20.3406,
"step": 118
},
{
"epoch": 1.017094017094017,
"grad_norm": 96.50030517578125,
"learning_rate": 9.491452991452992e-06,
"loss": 19.8086,
"step": 119
},
{
"epoch": 1.0256410256410255,
"grad_norm": 91.27509307861328,
"learning_rate": 9.487179487179487e-06,
"loss": 18.8737,
"step": 120
},
{
"epoch": 1.0341880341880343,
"grad_norm": 123.94478607177734,
"learning_rate": 9.482905982905984e-06,
"loss": 20.1785,
"step": 121
},
{
"epoch": 1.0427350427350428,
"grad_norm": 109.29426574707031,
"learning_rate": 9.478632478632479e-06,
"loss": 18.8151,
"step": 122
},
{
"epoch": 1.0512820512820513,
"grad_norm": 104.0233383178711,
"learning_rate": 9.474358974358974e-06,
"loss": 19.6281,
"step": 123
},
{
"epoch": 1.0598290598290598,
"grad_norm": 75.7523193359375,
"learning_rate": 9.470085470085471e-06,
"loss": 18.5031,
"step": 124
},
{
"epoch": 1.0683760683760684,
"grad_norm": 103.1374740600586,
"learning_rate": 9.465811965811966e-06,
"loss": 19.6443,
"step": 125
},
{
"epoch": 1.0769230769230769,
"grad_norm": 92.68035888671875,
"learning_rate": 9.461538461538463e-06,
"loss": 19.7327,
"step": 126
},
{
"epoch": 1.0854700854700854,
"grad_norm": 88.10079193115234,
"learning_rate": 9.457264957264958e-06,
"loss": 17.8832,
"step": 127
},
{
"epoch": 1.0940170940170941,
"grad_norm": 80.04244232177734,
"learning_rate": 9.452991452991453e-06,
"loss": 16.4485,
"step": 128
},
{
"epoch": 1.1025641025641026,
"grad_norm": 81.02445983886719,
"learning_rate": 9.44871794871795e-06,
"loss": 17.3035,
"step": 129
},
{
"epoch": 1.1111111111111112,
"grad_norm": 98.95979309082031,
"learning_rate": 9.444444444444445e-06,
"loss": 17.5734,
"step": 130
},
{
"epoch": 1.1196581196581197,
"grad_norm": 109.76984405517578,
"learning_rate": 9.440170940170941e-06,
"loss": 20.3985,
"step": 131
},
{
"epoch": 1.1282051282051282,
"grad_norm": 98.52857208251953,
"learning_rate": 9.435897435897436e-06,
"loss": 17.7275,
"step": 132
},
{
"epoch": 1.1367521367521367,
"grad_norm": 91.28802490234375,
"learning_rate": 9.431623931623931e-06,
"loss": 17.9107,
"step": 133
},
{
"epoch": 1.1452991452991452,
"grad_norm": 92.89081573486328,
"learning_rate": 9.427350427350428e-06,
"loss": 18.2876,
"step": 134
},
{
"epoch": 1.1538461538461537,
"grad_norm": 78.9795150756836,
"learning_rate": 9.423076923076923e-06,
"loss": 15.5738,
"step": 135
},
{
"epoch": 1.1623931623931625,
"grad_norm": 83.77166748046875,
"learning_rate": 9.41880341880342e-06,
"loss": 16.0825,
"step": 136
},
{
"epoch": 1.170940170940171,
"grad_norm": 129.62966918945312,
"learning_rate": 9.414529914529915e-06,
"loss": 18.4077,
"step": 137
},
{
"epoch": 1.1794871794871795,
"grad_norm": 110.26199340820312,
"learning_rate": 9.410256410256412e-06,
"loss": 17.6436,
"step": 138
},
{
"epoch": 1.188034188034188,
"grad_norm": 95.36865997314453,
"learning_rate": 9.405982905982907e-06,
"loss": 19.0424,
"step": 139
},
{
"epoch": 1.1965811965811965,
"grad_norm": 98.36263275146484,
"learning_rate": 9.401709401709402e-06,
"loss": 16.6122,
"step": 140
},
{
"epoch": 1.205128205128205,
"grad_norm": 83.68401336669922,
"learning_rate": 9.397435897435899e-06,
"loss": 14.9218,
"step": 141
},
{
"epoch": 1.2136752136752136,
"grad_norm": 92.4602279663086,
"learning_rate": 9.393162393162394e-06,
"loss": 16.3563,
"step": 142
},
{
"epoch": 1.2222222222222223,
"grad_norm": 106.41629791259766,
"learning_rate": 9.38888888888889e-06,
"loss": 16.4447,
"step": 143
},
{
"epoch": 1.2307692307692308,
"grad_norm": 97.70237731933594,
"learning_rate": 9.384615384615385e-06,
"loss": 16.8154,
"step": 144
},
{
"epoch": 1.2393162393162394,
"grad_norm": 76.88361358642578,
"learning_rate": 9.38034188034188e-06,
"loss": 15.7116,
"step": 145
},
{
"epoch": 1.2478632478632479,
"grad_norm": 104.20966339111328,
"learning_rate": 9.376068376068377e-06,
"loss": 15.2283,
"step": 146
},
{
"epoch": 1.2564102564102564,
"grad_norm": 80.29965209960938,
"learning_rate": 9.371794871794872e-06,
"loss": 15.3238,
"step": 147
},
{
"epoch": 1.264957264957265,
"grad_norm": 72.6979751586914,
"learning_rate": 9.367521367521369e-06,
"loss": 14.2293,
"step": 148
},
{
"epoch": 1.2735042735042734,
"grad_norm": 80.29464721679688,
"learning_rate": 9.363247863247864e-06,
"loss": 11.9706,
"step": 149
},
{
"epoch": 1.282051282051282,
"grad_norm": 97.80663299560547,
"learning_rate": 9.358974358974359e-06,
"loss": 14.3517,
"step": 150
},
{
"epoch": 1.2905982905982907,
"grad_norm": 75.88921356201172,
"learning_rate": 9.354700854700856e-06,
"loss": 12.8289,
"step": 151
},
{
"epoch": 1.2991452991452992,
"grad_norm": 75.71963500976562,
"learning_rate": 9.35042735042735e-06,
"loss": 15.2496,
"step": 152
},
{
"epoch": 1.3076923076923077,
"grad_norm": 84.5454330444336,
"learning_rate": 9.346153846153847e-06,
"loss": 15.7946,
"step": 153
},
{
"epoch": 1.3162393162393162,
"grad_norm": 92.24919128417969,
"learning_rate": 9.341880341880343e-06,
"loss": 13.2751,
"step": 154
},
{
"epoch": 1.3247863247863247,
"grad_norm": 76.51255798339844,
"learning_rate": 9.33760683760684e-06,
"loss": 14.1861,
"step": 155
},
{
"epoch": 1.3333333333333333,
"grad_norm": 74.59149169921875,
"learning_rate": 9.333333333333334e-06,
"loss": 12.1881,
"step": 156
},
{
"epoch": 1.341880341880342,
"grad_norm": 69.84959411621094,
"learning_rate": 9.32905982905983e-06,
"loss": 13.1244,
"step": 157
},
{
"epoch": 1.3504273504273505,
"grad_norm": 82.09815979003906,
"learning_rate": 9.324786324786326e-06,
"loss": 12.7492,
"step": 158
},
{
"epoch": 1.358974358974359,
"grad_norm": 87.25080108642578,
"learning_rate": 9.320512820512821e-06,
"loss": 15.5268,
"step": 159
},
{
"epoch": 1.3675213675213675,
"grad_norm": 51.60975646972656,
"learning_rate": 9.316239316239318e-06,
"loss": 10.9868,
"step": 160
},
{
"epoch": 1.376068376068376,
"grad_norm": 65.10023498535156,
"learning_rate": 9.311965811965813e-06,
"loss": 13.2106,
"step": 161
},
{
"epoch": 1.3846153846153846,
"grad_norm": 86.3865737915039,
"learning_rate": 9.307692307692308e-06,
"loss": 12.4873,
"step": 162
},
{
"epoch": 1.393162393162393,
"grad_norm": 89.5868911743164,
"learning_rate": 9.303418803418805e-06,
"loss": 12.3125,
"step": 163
},
{
"epoch": 1.4017094017094016,
"grad_norm": 87.308837890625,
"learning_rate": 9.2991452991453e-06,
"loss": 13.1855,
"step": 164
},
{
"epoch": 1.4102564102564101,
"grad_norm": 79.86372375488281,
"learning_rate": 9.294871794871796e-06,
"loss": 11.2756,
"step": 165
},
{
"epoch": 1.4188034188034189,
"grad_norm": 64.42597961425781,
"learning_rate": 9.290598290598292e-06,
"loss": 11.7395,
"step": 166
},
{
"epoch": 1.4273504273504274,
"grad_norm": 64.65245056152344,
"learning_rate": 9.286324786324787e-06,
"loss": 10.2739,
"step": 167
},
{
"epoch": 1.435897435897436,
"grad_norm": 49.57310104370117,
"learning_rate": 9.282051282051283e-06,
"loss": 11.4798,
"step": 168
},
{
"epoch": 1.4444444444444444,
"grad_norm": 89.93653106689453,
"learning_rate": 9.277777777777778e-06,
"loss": 13.8041,
"step": 169
},
{
"epoch": 1.452991452991453,
"grad_norm": 59.6973876953125,
"learning_rate": 9.273504273504275e-06,
"loss": 11.0414,
"step": 170
},
{
"epoch": 1.4615384615384617,
"grad_norm": 63.07640838623047,
"learning_rate": 9.26923076923077e-06,
"loss": 10.2649,
"step": 171
},
{
"epoch": 1.4700854700854702,
"grad_norm": 121.3633041381836,
"learning_rate": 9.264957264957267e-06,
"loss": 11.9233,
"step": 172
},
{
"epoch": 1.4786324786324787,
"grad_norm": 50.96989822387695,
"learning_rate": 9.260683760683762e-06,
"loss": 8.3527,
"step": 173
},
{
"epoch": 1.4871794871794872,
"grad_norm": 71.61744689941406,
"learning_rate": 9.256410256410257e-06,
"loss": 11.4237,
"step": 174
},
{
"epoch": 1.4957264957264957,
"grad_norm": 69.43048858642578,
"learning_rate": 9.252136752136754e-06,
"loss": 9.9193,
"step": 175
},
{
"epoch": 1.5042735042735043,
"grad_norm": 130.2714385986328,
"learning_rate": 9.247863247863249e-06,
"loss": 12.0676,
"step": 176
},
{
"epoch": 1.5128205128205128,
"grad_norm": 51.40456008911133,
"learning_rate": 9.243589743589745e-06,
"loss": 9.2348,
"step": 177
},
{
"epoch": 1.5213675213675213,
"grad_norm": 48.94670486450195,
"learning_rate": 9.23931623931624e-06,
"loss": 8.8217,
"step": 178
},
{
"epoch": 1.5299145299145298,
"grad_norm": 54.54533386230469,
"learning_rate": 9.235042735042736e-06,
"loss": 9.2478,
"step": 179
},
{
"epoch": 1.5384615384615383,
"grad_norm": 46.581939697265625,
"learning_rate": 9.230769230769232e-06,
"loss": 8.746,
"step": 180
},
{
"epoch": 1.547008547008547,
"grad_norm": 49.31954574584961,
"learning_rate": 9.226495726495727e-06,
"loss": 8.7889,
"step": 181
},
{
"epoch": 1.5555555555555556,
"grad_norm": 48.5145378112793,
"learning_rate": 9.222222222222224e-06,
"loss": 8.4478,
"step": 182
},
{
"epoch": 1.564102564102564,
"grad_norm": 49.587825775146484,
"learning_rate": 9.217948717948717e-06,
"loss": 10.5022,
"step": 183
},
{
"epoch": 1.5726495726495726,
"grad_norm": 47.89423751831055,
"learning_rate": 9.213675213675214e-06,
"loss": 8.7681,
"step": 184
},
{
"epoch": 1.5811965811965814,
"grad_norm": 59.971920013427734,
"learning_rate": 9.20940170940171e-06,
"loss": 9.6469,
"step": 185
},
{
"epoch": 1.5897435897435899,
"grad_norm": 41.139957427978516,
"learning_rate": 9.205128205128206e-06,
"loss": 8.5196,
"step": 186
},
{
"epoch": 1.5982905982905984,
"grad_norm": 36.8078498840332,
"learning_rate": 9.200854700854701e-06,
"loss": 8.2513,
"step": 187
},
{
"epoch": 1.606837606837607,
"grad_norm": 62.23011016845703,
"learning_rate": 9.196581196581196e-06,
"loss": 9.239,
"step": 188
},
{
"epoch": 1.6153846153846154,
"grad_norm": 41.35377502441406,
"learning_rate": 9.192307692307693e-06,
"loss": 8.6788,
"step": 189
},
{
"epoch": 1.623931623931624,
"grad_norm": 53.734134674072266,
"learning_rate": 9.188034188034188e-06,
"loss": 8.2624,
"step": 190
},
{
"epoch": 1.6324786324786325,
"grad_norm": 60.738887786865234,
"learning_rate": 9.183760683760685e-06,
"loss": 9.2777,
"step": 191
},
{
"epoch": 1.641025641025641,
"grad_norm": 26.411643981933594,
"learning_rate": 9.17948717948718e-06,
"loss": 7.6894,
"step": 192
},
{
"epoch": 1.6495726495726495,
"grad_norm": 37.81135940551758,
"learning_rate": 9.175213675213676e-06,
"loss": 8.009,
"step": 193
},
{
"epoch": 1.658119658119658,
"grad_norm": 42.451080322265625,
"learning_rate": 9.170940170940171e-06,
"loss": 8.309,
"step": 194
},
{
"epoch": 1.6666666666666665,
"grad_norm": 54.87519073486328,
"learning_rate": 9.166666666666666e-06,
"loss": 8.3505,
"step": 195
},
{
"epoch": 1.6752136752136753,
"grad_norm": 47.997737884521484,
"learning_rate": 9.162393162393163e-06,
"loss": 8.9444,
"step": 196
},
{
"epoch": 1.6837606837606838,
"grad_norm": 33.1911506652832,
"learning_rate": 9.158119658119658e-06,
"loss": 6.8856,
"step": 197
},
{
"epoch": 1.6923076923076923,
"grad_norm": 28.42953872680664,
"learning_rate": 9.153846153846155e-06,
"loss": 7.0575,
"step": 198
},
{
"epoch": 1.7008547008547008,
"grad_norm": 34.74330520629883,
"learning_rate": 9.14957264957265e-06,
"loss": 7.6837,
"step": 199
},
{
"epoch": 1.7094017094017095,
"grad_norm": 27.730812072753906,
"learning_rate": 9.145299145299145e-06,
"loss": 7.2591,
"step": 200
},
{
"epoch": 1.717948717948718,
"grad_norm": 36.658966064453125,
"learning_rate": 9.141025641025642e-06,
"loss": 7.6744,
"step": 201
},
{
"epoch": 1.7264957264957266,
"grad_norm": 52.580074310302734,
"learning_rate": 9.136752136752137e-06,
"loss": 8.9746,
"step": 202
},
{
"epoch": 1.735042735042735,
"grad_norm": 26.30430030822754,
"learning_rate": 9.132478632478634e-06,
"loss": 7.0829,
"step": 203
},
{
"epoch": 1.7435897435897436,
"grad_norm": 35.77456283569336,
"learning_rate": 9.128205128205129e-06,
"loss": 7.46,
"step": 204
},
{
"epoch": 1.7521367521367521,
"grad_norm": 46.80126953125,
"learning_rate": 9.123931623931624e-06,
"loss": 8.0331,
"step": 205
},
{
"epoch": 1.7606837606837606,
"grad_norm": 26.510988235473633,
"learning_rate": 9.11965811965812e-06,
"loss": 7.0434,
"step": 206
},
{
"epoch": 1.7692307692307692,
"grad_norm": 30.846357345581055,
"learning_rate": 9.115384615384615e-06,
"loss": 6.9022,
"step": 207
},
{
"epoch": 1.7777777777777777,
"grad_norm": 45.06099319458008,
"learning_rate": 9.111111111111112e-06,
"loss": 7.108,
"step": 208
},
{
"epoch": 1.7863247863247862,
"grad_norm": 40.050079345703125,
"learning_rate": 9.106837606837607e-06,
"loss": 7.3628,
"step": 209
},
{
"epoch": 1.7948717948717947,
"grad_norm": 32.066261291503906,
"learning_rate": 9.102564102564104e-06,
"loss": 7.3292,
"step": 210
},
{
"epoch": 1.8034188034188035,
"grad_norm": 29.196252822875977,
"learning_rate": 9.098290598290599e-06,
"loss": 6.6194,
"step": 211
},
{
"epoch": 1.811965811965812,
"grad_norm": 34.54549026489258,
"learning_rate": 9.094017094017094e-06,
"loss": 7.224,
"step": 212
},
{
"epoch": 1.8205128205128205,
"grad_norm": 31.863550186157227,
"learning_rate": 9.08974358974359e-06,
"loss": 7.141,
"step": 213
},
{
"epoch": 1.8290598290598292,
"grad_norm": 36.79090118408203,
"learning_rate": 9.085470085470086e-06,
"loss": 6.9572,
"step": 214
},
{
"epoch": 1.8376068376068377,
"grad_norm": 24.298635482788086,
"learning_rate": 9.081196581196583e-06,
"loss": 6.6881,
"step": 215
},
{
"epoch": 1.8461538461538463,
"grad_norm": 16.75456428527832,
"learning_rate": 9.076923076923078e-06,
"loss": 6.4055,
"step": 216
},
{
"epoch": 1.8547008547008548,
"grad_norm": 20.152400970458984,
"learning_rate": 9.072649572649573e-06,
"loss": 6.9078,
"step": 217
},
{
"epoch": 1.8632478632478633,
"grad_norm": 34.73337173461914,
"learning_rate": 9.06837606837607e-06,
"loss": 6.7923,
"step": 218
},
{
"epoch": 1.8717948717948718,
"grad_norm": 28.418310165405273,
"learning_rate": 9.064102564102564e-06,
"loss": 6.9382,
"step": 219
},
{
"epoch": 1.8803418803418803,
"grad_norm": 13.454174995422363,
"learning_rate": 9.059829059829061e-06,
"loss": 4.5504,
"step": 220
},
{
"epoch": 1.8888888888888888,
"grad_norm": 20.746938705444336,
"learning_rate": 9.055555555555556e-06,
"loss": 6.4711,
"step": 221
},
{
"epoch": 1.8974358974358974,
"grad_norm": 23.29437828063965,
"learning_rate": 9.051282051282051e-06,
"loss": 6.1381,
"step": 222
},
{
"epoch": 1.9059829059829059,
"grad_norm": 31.720672607421875,
"learning_rate": 9.047008547008548e-06,
"loss": 6.7716,
"step": 223
},
{
"epoch": 1.9145299145299144,
"grad_norm": 16.971572875976562,
"learning_rate": 9.042735042735043e-06,
"loss": 6.4734,
"step": 224
},
{
"epoch": 1.9230769230769231,
"grad_norm": 25.185396194458008,
"learning_rate": 9.03846153846154e-06,
"loss": 6.2505,
"step": 225
},
{
"epoch": 1.9316239316239316,
"grad_norm": 42.373863220214844,
"learning_rate": 9.034188034188035e-06,
"loss": 7.1968,
"step": 226
},
{
"epoch": 1.9401709401709402,
"grad_norm": 21.06004524230957,
"learning_rate": 9.029914529914532e-06,
"loss": 6.082,
"step": 227
},
{
"epoch": 1.9487179487179487,
"grad_norm": 21.413599014282227,
"learning_rate": 9.025641025641027e-06,
"loss": 6.2279,
"step": 228
},
{
"epoch": 1.9572649572649574,
"grad_norm": 18.379974365234375,
"learning_rate": 9.021367521367522e-06,
"loss": 6.6032,
"step": 229
},
{
"epoch": 1.965811965811966,
"grad_norm": 28.239042282104492,
"learning_rate": 9.017094017094018e-06,
"loss": 6.5428,
"step": 230
},
{
"epoch": 1.9743589743589745,
"grad_norm": 17.92879867553711,
"learning_rate": 9.012820512820513e-06,
"loss": 5.986,
"step": 231
},
{
"epoch": 1.982905982905983,
"grad_norm": 15.501392364501953,
"learning_rate": 9.00854700854701e-06,
"loss": 5.9526,
"step": 232
},
{
"epoch": 1.9914529914529915,
"grad_norm": 23.742633819580078,
"learning_rate": 9.004273504273505e-06,
"loss": 6.2462,
"step": 233
},
{
"epoch": 2.0,
"grad_norm": 28.22560691833496,
"learning_rate": 9e-06,
"loss": 5.8705,
"step": 234
},
{
"epoch": 2.0,
"eval_loss": 5.379393577575684,
"eval_runtime": 9.2791,
"eval_samples_per_second": 50.22,
"eval_steps_per_second": 6.358,
"step": 234
},
{
"epoch": 2.0085470085470085,
"grad_norm": 21.7072696685791,
"learning_rate": 8.995726495726497e-06,
"loss": 6.2757,
"step": 235
},
{
"epoch": 2.017094017094017,
"grad_norm": 20.955190658569336,
"learning_rate": 8.991452991452992e-06,
"loss": 5.7265,
"step": 236
},
{
"epoch": 2.0256410256410255,
"grad_norm": 15.186567306518555,
"learning_rate": 8.987179487179489e-06,
"loss": 6.1958,
"step": 237
},
{
"epoch": 2.034188034188034,
"grad_norm": 20.938766479492188,
"learning_rate": 8.982905982905984e-06,
"loss": 6.2317,
"step": 238
},
{
"epoch": 2.0427350427350426,
"grad_norm": 18.457494735717773,
"learning_rate": 8.978632478632479e-06,
"loss": 6.4711,
"step": 239
},
{
"epoch": 2.051282051282051,
"grad_norm": 43.505149841308594,
"learning_rate": 8.974358974358976e-06,
"loss": 5.9632,
"step": 240
},
{
"epoch": 2.0598290598290596,
"grad_norm": 15.558544158935547,
"learning_rate": 8.97008547008547e-06,
"loss": 5.8099,
"step": 241
},
{
"epoch": 2.0683760683760686,
"grad_norm": 22.20660400390625,
"learning_rate": 8.965811965811967e-06,
"loss": 5.7939,
"step": 242
},
{
"epoch": 2.076923076923077,
"grad_norm": 15.866617202758789,
"learning_rate": 8.961538461538462e-06,
"loss": 5.9473,
"step": 243
},
{
"epoch": 2.0854700854700856,
"grad_norm": 20.30729103088379,
"learning_rate": 8.957264957264959e-06,
"loss": 6.2028,
"step": 244
},
{
"epoch": 2.094017094017094,
"grad_norm": 15.517614364624023,
"learning_rate": 8.952991452991454e-06,
"loss": 5.906,
"step": 245
},
{
"epoch": 2.1025641025641026,
"grad_norm": 21.30764389038086,
"learning_rate": 8.94871794871795e-06,
"loss": 6.1907,
"step": 246
},
{
"epoch": 2.111111111111111,
"grad_norm": 19.973115921020508,
"learning_rate": 8.944444444444446e-06,
"loss": 5.6895,
"step": 247
},
{
"epoch": 2.1196581196581197,
"grad_norm": 17.40595817565918,
"learning_rate": 8.940170940170941e-06,
"loss": 5.4836,
"step": 248
},
{
"epoch": 2.128205128205128,
"grad_norm": 27.667421340942383,
"learning_rate": 8.935897435897438e-06,
"loss": 5.9082,
"step": 249
},
{
"epoch": 2.1367521367521367,
"grad_norm": 18.151315689086914,
"learning_rate": 8.931623931623933e-06,
"loss": 5.8102,
"step": 250
},
{
"epoch": 2.1452991452991452,
"grad_norm": 15.390297889709473,
"learning_rate": 8.927350427350428e-06,
"loss": 5.5504,
"step": 251
},
{
"epoch": 2.1538461538461537,
"grad_norm": 17.257841110229492,
"learning_rate": 8.923076923076925e-06,
"loss": 5.9043,
"step": 252
},
{
"epoch": 2.1623931623931623,
"grad_norm": 19.2503604888916,
"learning_rate": 8.91880341880342e-06,
"loss": 5.8349,
"step": 253
},
{
"epoch": 2.1709401709401708,
"grad_norm": 25.236759185791016,
"learning_rate": 8.914529914529916e-06,
"loss": 5.2908,
"step": 254
},
{
"epoch": 2.1794871794871793,
"grad_norm": 13.771193504333496,
"learning_rate": 8.910256410256411e-06,
"loss": 5.4743,
"step": 255
},
{
"epoch": 2.1880341880341883,
"grad_norm": 17.406471252441406,
"learning_rate": 8.905982905982906e-06,
"loss": 5.6856,
"step": 256
},
{
"epoch": 2.1965811965811968,
"grad_norm": 14.727091789245605,
"learning_rate": 8.901709401709401e-06,
"loss": 5.7937,
"step": 257
},
{
"epoch": 2.2051282051282053,
"grad_norm": 18.193246841430664,
"learning_rate": 8.897435897435898e-06,
"loss": 5.5704,
"step": 258
},
{
"epoch": 2.213675213675214,
"grad_norm": 21.573726654052734,
"learning_rate": 8.893162393162393e-06,
"loss": 5.479,
"step": 259
},
{
"epoch": 2.2222222222222223,
"grad_norm": 28.72640037536621,
"learning_rate": 8.888888888888888e-06,
"loss": 5.5096,
"step": 260
},
{
"epoch": 2.230769230769231,
"grad_norm": 15.4992094039917,
"learning_rate": 8.884615384615385e-06,
"loss": 5.217,
"step": 261
},
{
"epoch": 2.2393162393162394,
"grad_norm": 17.753416061401367,
"learning_rate": 8.88034188034188e-06,
"loss": 5.8173,
"step": 262
},
{
"epoch": 2.247863247863248,
"grad_norm": 15.91961669921875,
"learning_rate": 8.876068376068377e-06,
"loss": 5.7171,
"step": 263
},
{
"epoch": 2.2564102564102564,
"grad_norm": 23.30504035949707,
"learning_rate": 8.871794871794872e-06,
"loss": 5.6214,
"step": 264
},
{
"epoch": 2.264957264957265,
"grad_norm": 15.583686828613281,
"learning_rate": 8.867521367521369e-06,
"loss": 5.2343,
"step": 265
},
{
"epoch": 2.2735042735042734,
"grad_norm": 24.482046127319336,
"learning_rate": 8.863247863247864e-06,
"loss": 5.0747,
"step": 266
},
{
"epoch": 2.282051282051282,
"grad_norm": 16.17924690246582,
"learning_rate": 8.858974358974359e-06,
"loss": 5.2645,
"step": 267
},
{
"epoch": 2.2905982905982905,
"grad_norm": 19.538314819335938,
"learning_rate": 8.854700854700855e-06,
"loss": 5.3484,
"step": 268
},
{
"epoch": 2.299145299145299,
"grad_norm": 14.472186088562012,
"learning_rate": 8.85042735042735e-06,
"loss": 5.8159,
"step": 269
},
{
"epoch": 2.3076923076923075,
"grad_norm": 16.797805786132812,
"learning_rate": 8.846153846153847e-06,
"loss": 5.4466,
"step": 270
},
{
"epoch": 2.316239316239316,
"grad_norm": 13.237580299377441,
"learning_rate": 8.841880341880342e-06,
"loss": 5.2189,
"step": 271
},
{
"epoch": 2.324786324786325,
"grad_norm": 16.685317993164062,
"learning_rate": 8.837606837606837e-06,
"loss": 5.7098,
"step": 272
},
{
"epoch": 2.3333333333333335,
"grad_norm": 16.63880729675293,
"learning_rate": 8.833333333333334e-06,
"loss": 5.0714,
"step": 273
},
{
"epoch": 2.341880341880342,
"grad_norm": 20.871978759765625,
"learning_rate": 8.829059829059829e-06,
"loss": 4.9509,
"step": 274
},
{
"epoch": 2.3504273504273505,
"grad_norm": 16.95268440246582,
"learning_rate": 8.824786324786326e-06,
"loss": 5.4166,
"step": 275
},
{
"epoch": 2.358974358974359,
"grad_norm": 15.446279525756836,
"learning_rate": 8.820512820512821e-06,
"loss": 4.5967,
"step": 276
},
{
"epoch": 2.3675213675213675,
"grad_norm": 17.148235321044922,
"learning_rate": 8.816239316239316e-06,
"loss": 5.2542,
"step": 277
},
{
"epoch": 2.376068376068376,
"grad_norm": 17.014827728271484,
"learning_rate": 8.811965811965813e-06,
"loss": 5.4702,
"step": 278
},
{
"epoch": 2.3846153846153846,
"grad_norm": 15.313383102416992,
"learning_rate": 8.807692307692308e-06,
"loss": 5.2119,
"step": 279
},
{
"epoch": 2.393162393162393,
"grad_norm": 20.2298641204834,
"learning_rate": 8.803418803418804e-06,
"loss": 5.4064,
"step": 280
},
{
"epoch": 2.4017094017094016,
"grad_norm": 14.982254981994629,
"learning_rate": 8.7991452991453e-06,
"loss": 5.2545,
"step": 281
},
{
"epoch": 2.41025641025641,
"grad_norm": 16.258047103881836,
"learning_rate": 8.794871794871796e-06,
"loss": 5.0141,
"step": 282
},
{
"epoch": 2.4188034188034186,
"grad_norm": 22.5199031829834,
"learning_rate": 8.790598290598291e-06,
"loss": 5.3486,
"step": 283
},
{
"epoch": 2.427350427350427,
"grad_norm": 17.546480178833008,
"learning_rate": 8.786324786324786e-06,
"loss": 5.2785,
"step": 284
},
{
"epoch": 2.435897435897436,
"grad_norm": 22.07866668701172,
"learning_rate": 8.782051282051283e-06,
"loss": 5.4471,
"step": 285
},
{
"epoch": 2.4444444444444446,
"grad_norm": 409.2532043457031,
"learning_rate": 8.777777777777778e-06,
"loss": 6.0948,
"step": 286
},
{
"epoch": 2.452991452991453,
"grad_norm": 185.7334747314453,
"learning_rate": 8.773504273504275e-06,
"loss": 5.5538,
"step": 287
},
{
"epoch": 2.4615384615384617,
"grad_norm": 30.8182430267334,
"learning_rate": 8.76923076923077e-06,
"loss": 4.9661,
"step": 288
},
{
"epoch": 2.47008547008547,
"grad_norm": 18.584409713745117,
"learning_rate": 8.764957264957265e-06,
"loss": 5.0947,
"step": 289
},
{
"epoch": 2.4786324786324787,
"grad_norm": 18.128522872924805,
"learning_rate": 8.760683760683762e-06,
"loss": 4.8816,
"step": 290
},
{
"epoch": 2.4871794871794872,
"grad_norm": 18.800090789794922,
"learning_rate": 8.756410256410257e-06,
"loss": 5.0952,
"step": 291
},
{
"epoch": 2.4957264957264957,
"grad_norm": 22.140430450439453,
"learning_rate": 8.752136752136753e-06,
"loss": 4.5408,
"step": 292
},
{
"epoch": 2.5042735042735043,
"grad_norm": 19.867111206054688,
"learning_rate": 8.747863247863248e-06,
"loss": 4.7435,
"step": 293
},
{
"epoch": 2.5128205128205128,
"grad_norm": 19.437868118286133,
"learning_rate": 8.743589743589743e-06,
"loss": 5.2643,
"step": 294
},
{
"epoch": 2.5213675213675213,
"grad_norm": 18.256561279296875,
"learning_rate": 8.73931623931624e-06,
"loss": 5.2531,
"step": 295
},
{
"epoch": 2.52991452991453,
"grad_norm": 18.65209197998047,
"learning_rate": 8.735042735042735e-06,
"loss": 4.8646,
"step": 296
},
{
"epoch": 2.5384615384615383,
"grad_norm": 14.704927444458008,
"learning_rate": 8.730769230769232e-06,
"loss": 4.8343,
"step": 297
},
{
"epoch": 2.547008547008547,
"grad_norm": 15.522851943969727,
"learning_rate": 8.726495726495727e-06,
"loss": 4.898,
"step": 298
},
{
"epoch": 2.5555555555555554,
"grad_norm": 21.7825927734375,
"learning_rate": 8.722222222222224e-06,
"loss": 5.0732,
"step": 299
},
{
"epoch": 2.564102564102564,
"grad_norm": 17.963552474975586,
"learning_rate": 8.717948717948719e-06,
"loss": 4.9684,
"step": 300
},
{
"epoch": 2.5726495726495724,
"grad_norm": 16.14459991455078,
"learning_rate": 8.713675213675214e-06,
"loss": 4.8802,
"step": 301
},
{
"epoch": 2.5811965811965814,
"grad_norm": 18.386646270751953,
"learning_rate": 8.70940170940171e-06,
"loss": 4.8837,
"step": 302
},
{
"epoch": 2.58974358974359,
"grad_norm": 19.471376419067383,
"learning_rate": 8.705128205128206e-06,
"loss": 4.6325,
"step": 303
},
{
"epoch": 2.5982905982905984,
"grad_norm": 17.839717864990234,
"learning_rate": 8.700854700854702e-06,
"loss": 4.7851,
"step": 304
},
{
"epoch": 2.606837606837607,
"grad_norm": 26.519363403320312,
"learning_rate": 8.696581196581197e-06,
"loss": 5.0576,
"step": 305
},
{
"epoch": 2.6153846153846154,
"grad_norm": 14.135244369506836,
"learning_rate": 8.692307692307692e-06,
"loss": 4.7719,
"step": 306
},
{
"epoch": 2.623931623931624,
"grad_norm": 16.5241641998291,
"learning_rate": 8.68803418803419e-06,
"loss": 4.5826,
"step": 307
},
{
"epoch": 2.6324786324786325,
"grad_norm": 23.982437133789062,
"learning_rate": 8.683760683760684e-06,
"loss": 4.4878,
"step": 308
},
{
"epoch": 2.641025641025641,
"grad_norm": 16.036361694335938,
"learning_rate": 8.679487179487181e-06,
"loss": 4.3867,
"step": 309
},
{
"epoch": 2.6495726495726495,
"grad_norm": 16.19298553466797,
"learning_rate": 8.675213675213676e-06,
"loss": 4.763,
"step": 310
},
{
"epoch": 2.658119658119658,
"grad_norm": 19.32802963256836,
"learning_rate": 8.670940170940171e-06,
"loss": 4.4083,
"step": 311
},
{
"epoch": 2.6666666666666665,
"grad_norm": 21.75898551940918,
"learning_rate": 8.666666666666668e-06,
"loss": 4.8782,
"step": 312
},
{
"epoch": 2.6752136752136755,
"grad_norm": 905.6954956054688,
"learning_rate": 8.662393162393163e-06,
"loss": 5.7901,
"step": 313
},
{
"epoch": 2.683760683760684,
"grad_norm": 21.126985549926758,
"learning_rate": 8.65811965811966e-06,
"loss": 4.918,
"step": 314
},
{
"epoch": 2.6923076923076925,
"grad_norm": 22.190237045288086,
"learning_rate": 8.653846153846155e-06,
"loss": 4.4327,
"step": 315
},
{
"epoch": 2.700854700854701,
"grad_norm": 90.69184875488281,
"learning_rate": 8.649572649572651e-06,
"loss": 5.1477,
"step": 316
},
{
"epoch": 2.7094017094017095,
"grad_norm": 43.43864059448242,
"learning_rate": 8.645299145299146e-06,
"loss": 4.5476,
"step": 317
},
{
"epoch": 2.717948717948718,
"grad_norm": 19.24538230895996,
"learning_rate": 8.641025641025641e-06,
"loss": 4.4304,
"step": 318
},
{
"epoch": 2.7264957264957266,
"grad_norm": 21.809600830078125,
"learning_rate": 8.636752136752138e-06,
"loss": 4.4215,
"step": 319
},
{
"epoch": 2.735042735042735,
"grad_norm": 21.406156539916992,
"learning_rate": 8.632478632478633e-06,
"loss": 4.5411,
"step": 320
},
{
"epoch": 2.7435897435897436,
"grad_norm": 17.57236099243164,
"learning_rate": 8.62820512820513e-06,
"loss": 4.7952,
"step": 321
},
{
"epoch": 2.752136752136752,
"grad_norm": 21.049169540405273,
"learning_rate": 8.623931623931625e-06,
"loss": 4.4596,
"step": 322
},
{
"epoch": 2.7606837606837606,
"grad_norm": 20.04981803894043,
"learning_rate": 8.61965811965812e-06,
"loss": 4.4705,
"step": 323
},
{
"epoch": 2.769230769230769,
"grad_norm": 21.146499633789062,
"learning_rate": 8.615384615384617e-06,
"loss": 4.6081,
"step": 324
},
{
"epoch": 2.7777777777777777,
"grad_norm": 20.9805908203125,
"learning_rate": 8.611111111111112e-06,
"loss": 4.8387,
"step": 325
},
{
"epoch": 2.786324786324786,
"grad_norm": 17.708343505859375,
"learning_rate": 8.606837606837609e-06,
"loss": 4.3455,
"step": 326
},
{
"epoch": 2.7948717948717947,
"grad_norm": 25.657032012939453,
"learning_rate": 8.602564102564104e-06,
"loss": 4.3119,
"step": 327
},
{
"epoch": 2.8034188034188032,
"grad_norm": 17.713972091674805,
"learning_rate": 8.598290598290599e-06,
"loss": 4.5597,
"step": 328
},
{
"epoch": 2.8119658119658117,
"grad_norm": 22.297082901000977,
"learning_rate": 8.594017094017095e-06,
"loss": 3.8398,
"step": 329
},
{
"epoch": 2.8205128205128203,
"grad_norm": 16.11454200744629,
"learning_rate": 8.58974358974359e-06,
"loss": 3.2049,
"step": 330
},
{
"epoch": 2.8290598290598292,
"grad_norm": 27.323585510253906,
"learning_rate": 8.585470085470086e-06,
"loss": 4.0371,
"step": 331
},
{
"epoch": 2.8376068376068377,
"grad_norm": 21.090797424316406,
"learning_rate": 8.58119658119658e-06,
"loss": 4.5193,
"step": 332
},
{
"epoch": 2.8461538461538463,
"grad_norm": 39.087432861328125,
"learning_rate": 8.576923076923077e-06,
"loss": 4.3537,
"step": 333
},
{
"epoch": 2.8547008547008548,
"grad_norm": 18.49846839904785,
"learning_rate": 8.572649572649572e-06,
"loss": 4.614,
"step": 334
},
{
"epoch": 2.8632478632478633,
"grad_norm": 26.671632766723633,
"learning_rate": 8.568376068376069e-06,
"loss": 4.2224,
"step": 335
},
{
"epoch": 2.871794871794872,
"grad_norm": 25.799545288085938,
"learning_rate": 8.564102564102564e-06,
"loss": 4.2209,
"step": 336
},
{
"epoch": 2.8803418803418803,
"grad_norm": 20.131961822509766,
"learning_rate": 8.559829059829061e-06,
"loss": 4.5194,
"step": 337
},
{
"epoch": 2.888888888888889,
"grad_norm": 20.193859100341797,
"learning_rate": 8.555555555555556e-06,
"loss": 3.9966,
"step": 338
},
{
"epoch": 2.8974358974358974,
"grad_norm": 20.06737518310547,
"learning_rate": 8.551282051282051e-06,
"loss": 3.7394,
"step": 339
},
{
"epoch": 2.905982905982906,
"grad_norm": 438.34429931640625,
"learning_rate": 8.547008547008548e-06,
"loss": 5.1558,
"step": 340
},
{
"epoch": 2.9145299145299144,
"grad_norm": 22.152528762817383,
"learning_rate": 8.542735042735043e-06,
"loss": 3.9014,
"step": 341
},
{
"epoch": 2.9230769230769234,
"grad_norm": 29.279739379882812,
"learning_rate": 8.53846153846154e-06,
"loss": 4.0479,
"step": 342
},
{
"epoch": 2.931623931623932,
"grad_norm": 26.182645797729492,
"learning_rate": 8.534188034188035e-06,
"loss": 4.2022,
"step": 343
},
{
"epoch": 2.9401709401709404,
"grad_norm": 22.329736709594727,
"learning_rate": 8.52991452991453e-06,
"loss": 3.8777,
"step": 344
},
{
"epoch": 2.948717948717949,
"grad_norm": 20.62833023071289,
"learning_rate": 8.525641025641026e-06,
"loss": 4.2189,
"step": 345
},
{
"epoch": 2.9572649572649574,
"grad_norm": 20.176612854003906,
"learning_rate": 8.521367521367521e-06,
"loss": 4.0124,
"step": 346
},
{
"epoch": 2.965811965811966,
"grad_norm": 18.77017593383789,
"learning_rate": 8.517094017094018e-06,
"loss": 3.3286,
"step": 347
},
{
"epoch": 2.9743589743589745,
"grad_norm": 226.93701171875,
"learning_rate": 8.512820512820513e-06,
"loss": 4.6969,
"step": 348
},
{
"epoch": 2.982905982905983,
"grad_norm": 675.1133422851562,
"learning_rate": 8.508547008547008e-06,
"loss": 4.6717,
"step": 349
},
{
"epoch": 2.9914529914529915,
"grad_norm": 19.938486099243164,
"learning_rate": 8.504273504273505e-06,
"loss": 4.0103,
"step": 350
},
{
"epoch": 3.0,
"grad_norm": 15.917003631591797,
"learning_rate": 8.5e-06,
"loss": 3.1643,
"step": 351
},
{
"epoch": 3.0,
"eval_loss": 3.4197537899017334,
"eval_runtime": 9.289,
"eval_samples_per_second": 50.167,
"eval_steps_per_second": 6.352,
"step": 351
},
{
"epoch": 3.0085470085470085,
"grad_norm": 22.22833251953125,
"learning_rate": 8.495726495726497e-06,
"loss": 4.3458,
"step": 352
},
{
"epoch": 3.017094017094017,
"grad_norm": 16.4627685546875,
"learning_rate": 8.491452991452992e-06,
"loss": 3.5374,
"step": 353
},
{
"epoch": 3.0256410256410255,
"grad_norm": 16.389379501342773,
"learning_rate": 8.487179487179488e-06,
"loss": 4.1384,
"step": 354
},
{
"epoch": 3.034188034188034,
"grad_norm": 19.589706420898438,
"learning_rate": 8.482905982905983e-06,
"loss": 3.9522,
"step": 355
},
{
"epoch": 3.0427350427350426,
"grad_norm": 21.66250228881836,
"learning_rate": 8.478632478632479e-06,
"loss": 4.0197,
"step": 356
},
{
"epoch": 3.051282051282051,
"grad_norm": 42.1422119140625,
"learning_rate": 8.474358974358975e-06,
"loss": 3.9432,
"step": 357
},
{
"epoch": 3.0598290598290596,
"grad_norm": 23.0153751373291,
"learning_rate": 8.47008547008547e-06,
"loss": 3.9146,
"step": 358
},
{
"epoch": 3.0683760683760686,
"grad_norm": 20.847400665283203,
"learning_rate": 8.465811965811967e-06,
"loss": 3.9736,
"step": 359
},
{
"epoch": 3.076923076923077,
"grad_norm": 23.553855895996094,
"learning_rate": 8.461538461538462e-06,
"loss": 3.646,
"step": 360
},
{
"epoch": 3.0854700854700856,
"grad_norm": 18.651151657104492,
"learning_rate": 8.457264957264957e-06,
"loss": 3.761,
"step": 361
},
{
"epoch": 3.094017094017094,
"grad_norm": 23.437379837036133,
"learning_rate": 8.452991452991454e-06,
"loss": 3.9258,
"step": 362
},
{
"epoch": 3.1025641025641026,
"grad_norm": 19.025928497314453,
"learning_rate": 8.448717948717949e-06,
"loss": 3.4911,
"step": 363
},
{
"epoch": 3.111111111111111,
"grad_norm": 25.955963134765625,
"learning_rate": 8.444444444444446e-06,
"loss": 3.7231,
"step": 364
},
{
"epoch": 3.1196581196581197,
"grad_norm": 19.691673278808594,
"learning_rate": 8.44017094017094e-06,
"loss": 3.9225,
"step": 365
},
{
"epoch": 3.128205128205128,
"grad_norm": 19.47168731689453,
"learning_rate": 8.435897435897436e-06,
"loss": 3.6261,
"step": 366
},
{
"epoch": 3.1367521367521367,
"grad_norm": 20.50010108947754,
"learning_rate": 8.431623931623932e-06,
"loss": 3.3306,
"step": 367
},
{
"epoch": 3.1452991452991452,
"grad_norm": 21.198938369750977,
"learning_rate": 8.427350427350428e-06,
"loss": 3.6388,
"step": 368
},
{
"epoch": 3.1538461538461537,
"grad_norm": 16.93203353881836,
"learning_rate": 8.423076923076924e-06,
"loss": 3.9556,
"step": 369
},
{
"epoch": 3.1623931623931623,
"grad_norm": 15.074128150939941,
"learning_rate": 8.41880341880342e-06,
"loss": 2.9899,
"step": 370
},
{
"epoch": 3.1709401709401708,
"grad_norm": 23.041452407836914,
"learning_rate": 8.414529914529916e-06,
"loss": 3.291,
"step": 371
},
{
"epoch": 3.1794871794871793,
"grad_norm": 24.146419525146484,
"learning_rate": 8.410256410256411e-06,
"loss": 4.0683,
"step": 372
},
{
"epoch": 3.1880341880341883,
"grad_norm": 27.864879608154297,
"learning_rate": 8.405982905982906e-06,
"loss": 3.6171,
"step": 373
},
{
"epoch": 3.1965811965811968,
"grad_norm": 33.83136749267578,
"learning_rate": 8.401709401709403e-06,
"loss": 3.7324,
"step": 374
},
{
"epoch": 3.2051282051282053,
"grad_norm": 21.020702362060547,
"learning_rate": 8.397435897435898e-06,
"loss": 3.5688,
"step": 375
},
{
"epoch": 3.213675213675214,
"grad_norm": 23.521453857421875,
"learning_rate": 8.393162393162395e-06,
"loss": 3.6917,
"step": 376
},
{
"epoch": 3.2222222222222223,
"grad_norm": 35.85578536987305,
"learning_rate": 8.38888888888889e-06,
"loss": 3.6532,
"step": 377
},
{
"epoch": 3.230769230769231,
"grad_norm": 26.080968856811523,
"learning_rate": 8.384615384615385e-06,
"loss": 3.8828,
"step": 378
},
{
"epoch": 3.2393162393162394,
"grad_norm": 20.829381942749023,
"learning_rate": 8.380341880341881e-06,
"loss": 3.8374,
"step": 379
},
{
"epoch": 3.247863247863248,
"grad_norm": 20.85077476501465,
"learning_rate": 8.376068376068377e-06,
"loss": 3.2896,
"step": 380
},
{
"epoch": 3.2564102564102564,
"grad_norm": 19.036088943481445,
"learning_rate": 8.371794871794873e-06,
"loss": 3.4996,
"step": 381
},
{
"epoch": 3.264957264957265,
"grad_norm": 23.725513458251953,
"learning_rate": 8.367521367521368e-06,
"loss": 3.7686,
"step": 382
},
{
"epoch": 3.2735042735042734,
"grad_norm": 22.553386688232422,
"learning_rate": 8.363247863247865e-06,
"loss": 3.8476,
"step": 383
},
{
"epoch": 3.282051282051282,
"grad_norm": 20.263992309570312,
"learning_rate": 8.35897435897436e-06,
"loss": 3.3278,
"step": 384
},
{
"epoch": 3.2905982905982905,
"grad_norm": 22.47858238220215,
"learning_rate": 8.354700854700855e-06,
"loss": 3.5437,
"step": 385
},
{
"epoch": 3.299145299145299,
"grad_norm": 24.14532470703125,
"learning_rate": 8.350427350427352e-06,
"loss": 3.696,
"step": 386
},
{
"epoch": 3.3076923076923075,
"grad_norm": 31.457847595214844,
"learning_rate": 8.346153846153847e-06,
"loss": 4.3065,
"step": 387
},
{
"epoch": 3.316239316239316,
"grad_norm": 24.503095626831055,
"learning_rate": 8.341880341880344e-06,
"loss": 3.4798,
"step": 388
},
{
"epoch": 3.324786324786325,
"grad_norm": 19.798818588256836,
"learning_rate": 8.337606837606839e-06,
"loss": 3.5323,
"step": 389
},
{
"epoch": 3.3333333333333335,
"grad_norm": 22.023189544677734,
"learning_rate": 8.333333333333334e-06,
"loss": 3.4088,
"step": 390
},
{
"epoch": 3.341880341880342,
"grad_norm": 17.314960479736328,
"learning_rate": 8.32905982905983e-06,
"loss": 3.2462,
"step": 391
},
{
"epoch": 3.3504273504273505,
"grad_norm": 22.714536666870117,
"learning_rate": 8.324786324786326e-06,
"loss": 3.7863,
"step": 392
},
{
"epoch": 3.358974358974359,
"grad_norm": 27.710514068603516,
"learning_rate": 8.320512820512822e-06,
"loss": 3.6032,
"step": 393
},
{
"epoch": 3.3675213675213675,
"grad_norm": 23.35419464111328,
"learning_rate": 8.316239316239317e-06,
"loss": 3.5599,
"step": 394
},
{
"epoch": 3.376068376068376,
"grad_norm": 24.0956974029541,
"learning_rate": 8.311965811965812e-06,
"loss": 3.5186,
"step": 395
},
{
"epoch": 3.3846153846153846,
"grad_norm": 22.09107780456543,
"learning_rate": 8.307692307692309e-06,
"loss": 3.4843,
"step": 396
},
{
"epoch": 3.393162393162393,
"grad_norm": 23.956623077392578,
"learning_rate": 8.303418803418804e-06,
"loss": 3.1625,
"step": 397
},
{
"epoch": 3.4017094017094016,
"grad_norm": 18.875917434692383,
"learning_rate": 8.299145299145301e-06,
"loss": 3.3494,
"step": 398
},
{
"epoch": 3.41025641025641,
"grad_norm": 33.475467681884766,
"learning_rate": 8.294871794871796e-06,
"loss": 3.9247,
"step": 399
},
{
"epoch": 3.4188034188034186,
"grad_norm": 16.28295135498047,
"learning_rate": 8.290598290598293e-06,
"loss": 3.7446,
"step": 400
},
{
"epoch": 3.427350427350427,
"grad_norm": 24.205049514770508,
"learning_rate": 8.286324786324788e-06,
"loss": 3.343,
"step": 401
},
{
"epoch": 3.435897435897436,
"grad_norm": 21.21460723876953,
"learning_rate": 8.282051282051283e-06,
"loss": 3.2437,
"step": 402
},
{
"epoch": 3.4444444444444446,
"grad_norm": 36.8713264465332,
"learning_rate": 8.277777777777778e-06,
"loss": 3.5009,
"step": 403
},
{
"epoch": 3.452991452991453,
"grad_norm": 26.85513687133789,
"learning_rate": 8.273504273504273e-06,
"loss": 3.7271,
"step": 404
},
{
"epoch": 3.4615384615384617,
"grad_norm": 18.184600830078125,
"learning_rate": 8.26923076923077e-06,
"loss": 3.2216,
"step": 405
},
{
"epoch": 3.47008547008547,
"grad_norm": 27.03692054748535,
"learning_rate": 8.264957264957265e-06,
"loss": 3.516,
"step": 406
},
{
"epoch": 3.4786324786324787,
"grad_norm": 20.63736915588379,
"learning_rate": 8.260683760683761e-06,
"loss": 3.1349,
"step": 407
},
{
"epoch": 3.4871794871794872,
"grad_norm": 22.467845916748047,
"learning_rate": 8.256410256410256e-06,
"loss": 3.3878,
"step": 408
},
{
"epoch": 3.4957264957264957,
"grad_norm": 21.25887107849121,
"learning_rate": 8.252136752136753e-06,
"loss": 3.8298,
"step": 409
},
{
"epoch": 3.5042735042735043,
"grad_norm": 47.3256721496582,
"learning_rate": 8.247863247863248e-06,
"loss": 3.5321,
"step": 410
},
{
"epoch": 3.5128205128205128,
"grad_norm": 22.103790283203125,
"learning_rate": 8.243589743589743e-06,
"loss": 3.335,
"step": 411
},
{
"epoch": 3.5213675213675213,
"grad_norm": 25.779077529907227,
"learning_rate": 8.23931623931624e-06,
"loss": 3.5047,
"step": 412
},
{
"epoch": 3.52991452991453,
"grad_norm": 22.78207778930664,
"learning_rate": 8.235042735042735e-06,
"loss": 3.3827,
"step": 413
},
{
"epoch": 3.5384615384615383,
"grad_norm": 22.41836166381836,
"learning_rate": 8.230769230769232e-06,
"loss": 3.4521,
"step": 414
},
{
"epoch": 3.547008547008547,
"grad_norm": 60.29216384887695,
"learning_rate": 8.226495726495727e-06,
"loss": 3.4598,
"step": 415
},
{
"epoch": 3.5555555555555554,
"grad_norm": 25.27474021911621,
"learning_rate": 8.222222222222222e-06,
"loss": 3.7443,
"step": 416
},
{
"epoch": 3.564102564102564,
"grad_norm": 25.297466278076172,
"learning_rate": 8.217948717948719e-06,
"loss": 3.3123,
"step": 417
},
{
"epoch": 3.5726495726495724,
"grad_norm": 28.5858154296875,
"learning_rate": 8.213675213675214e-06,
"loss": 3.1801,
"step": 418
},
{
"epoch": 3.5811965811965814,
"grad_norm": 20.05567741394043,
"learning_rate": 8.20940170940171e-06,
"loss": 3.7242,
"step": 419
},
{
"epoch": 3.58974358974359,
"grad_norm": 32.33693313598633,
"learning_rate": 8.205128205128205e-06,
"loss": 3.3587,
"step": 420
},
{
"epoch": 3.5982905982905984,
"grad_norm": 36.1716194152832,
"learning_rate": 8.200854700854702e-06,
"loss": 3.1573,
"step": 421
},
{
"epoch": 3.606837606837607,
"grad_norm": 33.39027404785156,
"learning_rate": 8.196581196581197e-06,
"loss": 3.098,
"step": 422
},
{
"epoch": 3.6153846153846154,
"grad_norm": 28.4794864654541,
"learning_rate": 8.192307692307692e-06,
"loss": 3.6403,
"step": 423
},
{
"epoch": 3.623931623931624,
"grad_norm": 29.702611923217773,
"learning_rate": 8.188034188034189e-06,
"loss": 3.2569,
"step": 424
},
{
"epoch": 3.6324786324786325,
"grad_norm": 24.73663902282715,
"learning_rate": 8.183760683760684e-06,
"loss": 3.0508,
"step": 425
},
{
"epoch": 3.641025641025641,
"grad_norm": 29.606807708740234,
"learning_rate": 8.17948717948718e-06,
"loss": 3.2524,
"step": 426
},
{
"epoch": 3.6495726495726495,
"grad_norm": 22.721933364868164,
"learning_rate": 8.175213675213676e-06,
"loss": 3.2583,
"step": 427
},
{
"epoch": 3.658119658119658,
"grad_norm": 25.009403228759766,
"learning_rate": 8.17094017094017e-06,
"loss": 3.0678,
"step": 428
},
{
"epoch": 3.6666666666666665,
"grad_norm": 25.776636123657227,
"learning_rate": 8.166666666666668e-06,
"loss": 3.1676,
"step": 429
},
{
"epoch": 3.6752136752136755,
"grad_norm": 28.210241317749023,
"learning_rate": 8.162393162393163e-06,
"loss": 3.2869,
"step": 430
},
{
"epoch": 3.683760683760684,
"grad_norm": 26.29328155517578,
"learning_rate": 8.15811965811966e-06,
"loss": 3.3618,
"step": 431
},
{
"epoch": 3.6923076923076925,
"grad_norm": 19.813465118408203,
"learning_rate": 8.153846153846154e-06,
"loss": 3.0655,
"step": 432
},
{
"epoch": 3.700854700854701,
"grad_norm": 29.718812942504883,
"learning_rate": 8.14957264957265e-06,
"loss": 3.1538,
"step": 433
},
{
"epoch": 3.7094017094017095,
"grad_norm": 30.629135131835938,
"learning_rate": 8.145299145299146e-06,
"loss": 3.3252,
"step": 434
},
{
"epoch": 3.717948717948718,
"grad_norm": 27.716825485229492,
"learning_rate": 8.141025641025641e-06,
"loss": 3.4083,
"step": 435
},
{
"epoch": 3.7264957264957266,
"grad_norm": 39.23820877075195,
"learning_rate": 8.136752136752138e-06,
"loss": 3.3074,
"step": 436
},
{
"epoch": 3.735042735042735,
"grad_norm": 34.516422271728516,
"learning_rate": 8.132478632478633e-06,
"loss": 3.3529,
"step": 437
},
{
"epoch": 3.7435897435897436,
"grad_norm": 41.98606872558594,
"learning_rate": 8.12820512820513e-06,
"loss": 3.248,
"step": 438
},
{
"epoch": 3.752136752136752,
"grad_norm": 27.99711799621582,
"learning_rate": 8.123931623931625e-06,
"loss": 3.3054,
"step": 439
},
{
"epoch": 3.7606837606837606,
"grad_norm": 25.21969985961914,
"learning_rate": 8.11965811965812e-06,
"loss": 2.8518,
"step": 440
},
{
"epoch": 3.769230769230769,
"grad_norm": 29.14298439025879,
"learning_rate": 8.115384615384617e-06,
"loss": 3.0063,
"step": 441
},
{
"epoch": 3.7777777777777777,
"grad_norm": 27.040063858032227,
"learning_rate": 8.111111111111112e-06,
"loss": 3.3066,
"step": 442
},
{
"epoch": 3.786324786324786,
"grad_norm": 365.3290100097656,
"learning_rate": 8.106837606837608e-06,
"loss": 3.8057,
"step": 443
},
{
"epoch": 3.7948717948717947,
"grad_norm": 32.89745330810547,
"learning_rate": 8.102564102564103e-06,
"loss": 3.0903,
"step": 444
},
{
"epoch": 3.8034188034188032,
"grad_norm": 29.448022842407227,
"learning_rate": 8.098290598290598e-06,
"loss": 3.2723,
"step": 445
},
{
"epoch": 3.8119658119658117,
"grad_norm": 27.838123321533203,
"learning_rate": 8.094017094017095e-06,
"loss": 3.2903,
"step": 446
},
{
"epoch": 3.8205128205128203,
"grad_norm": 29.047847747802734,
"learning_rate": 8.08974358974359e-06,
"loss": 2.9048,
"step": 447
},
{
"epoch": 3.8290598290598292,
"grad_norm": 28.666589736938477,
"learning_rate": 8.085470085470087e-06,
"loss": 3.2186,
"step": 448
},
{
"epoch": 3.8376068376068377,
"grad_norm": 31.796804428100586,
"learning_rate": 8.081196581196582e-06,
"loss": 3.2668,
"step": 449
},
{
"epoch": 3.8461538461538463,
"grad_norm": 22.665220260620117,
"learning_rate": 8.076923076923077e-06,
"loss": 3.0965,
"step": 450
},
{
"epoch": 3.8547008547008548,
"grad_norm": 32.7353630065918,
"learning_rate": 8.072649572649574e-06,
"loss": 3.1759,
"step": 451
},
{
"epoch": 3.8632478632478633,
"grad_norm": 32.95683670043945,
"learning_rate": 8.068376068376069e-06,
"loss": 2.9589,
"step": 452
},
{
"epoch": 3.871794871794872,
"grad_norm": 30.04659652709961,
"learning_rate": 8.064102564102566e-06,
"loss": 3.4709,
"step": 453
},
{
"epoch": 3.8803418803418803,
"grad_norm": 30.41158676147461,
"learning_rate": 8.05982905982906e-06,
"loss": 2.9385,
"step": 454
},
{
"epoch": 3.888888888888889,
"grad_norm": 30.059635162353516,
"learning_rate": 8.055555555555557e-06,
"loss": 3.0099,
"step": 455
},
{
"epoch": 3.8974358974358974,
"grad_norm": 24.83198356628418,
"learning_rate": 8.051282051282052e-06,
"loss": 2.9783,
"step": 456
},
{
"epoch": 3.905982905982906,
"grad_norm": 25.38758087158203,
"learning_rate": 8.047008547008547e-06,
"loss": 3.0275,
"step": 457
},
{
"epoch": 3.9145299145299144,
"grad_norm": 25.21868133544922,
"learning_rate": 8.042735042735044e-06,
"loss": 2.9096,
"step": 458
},
{
"epoch": 3.9230769230769234,
"grad_norm": 32.02922058105469,
"learning_rate": 8.03846153846154e-06,
"loss": 3.059,
"step": 459
},
{
"epoch": 3.931623931623932,
"grad_norm": 22.240680694580078,
"learning_rate": 8.034188034188036e-06,
"loss": 2.9473,
"step": 460
},
{
"epoch": 3.9401709401709404,
"grad_norm": 27.61838150024414,
"learning_rate": 8.029914529914531e-06,
"loss": 2.4506,
"step": 461
},
{
"epoch": 3.948717948717949,
"grad_norm": 27.742216110229492,
"learning_rate": 8.025641025641026e-06,
"loss": 2.9082,
"step": 462
},
{
"epoch": 3.9572649572649574,
"grad_norm": 29.965059280395508,
"learning_rate": 8.021367521367523e-06,
"loss": 2.8268,
"step": 463
},
{
"epoch": 3.965811965811966,
"grad_norm": 31.429990768432617,
"learning_rate": 8.017094017094018e-06,
"loss": 3.1805,
"step": 464
},
{
"epoch": 3.9743589743589745,
"grad_norm": 31.162532806396484,
"learning_rate": 8.012820512820515e-06,
"loss": 2.64,
"step": 465
},
{
"epoch": 3.982905982905983,
"grad_norm": 28.240577697753906,
"learning_rate": 8.00854700854701e-06,
"loss": 3.249,
"step": 466
},
{
"epoch": 3.9914529914529915,
"grad_norm": 48.52914810180664,
"learning_rate": 8.004273504273505e-06,
"loss": 3.1619,
"step": 467
},
{
"epoch": 4.0,
"grad_norm": 36.80685806274414,
"learning_rate": 8.000000000000001e-06,
"loss": 3.5337,
"step": 468
},
{
"epoch": 4.0,
"eval_loss": 2.1340389251708984,
"eval_runtime": 9.2211,
"eval_samples_per_second": 50.536,
"eval_steps_per_second": 6.398,
"step": 468
},
{
"epoch": 4.0085470085470085,
"grad_norm": 45.45211410522461,
"learning_rate": 7.995726495726496e-06,
"loss": 3.5596,
"step": 469
},
{
"epoch": 4.017094017094017,
"grad_norm": 32.711669921875,
"learning_rate": 7.991452991452993e-06,
"loss": 2.9362,
"step": 470
},
{
"epoch": 4.0256410256410255,
"grad_norm": 26.151872634887695,
"learning_rate": 7.987179487179488e-06,
"loss": 2.6796,
"step": 471
},
{
"epoch": 4.034188034188034,
"grad_norm": 33.02329635620117,
"learning_rate": 7.982905982905985e-06,
"loss": 2.7147,
"step": 472
},
{
"epoch": 4.042735042735043,
"grad_norm": 31.1684513092041,
"learning_rate": 7.97863247863248e-06,
"loss": 3.2356,
"step": 473
},
{
"epoch": 4.051282051282051,
"grad_norm": 37.0435905456543,
"learning_rate": 7.974358974358975e-06,
"loss": 2.9954,
"step": 474
},
{
"epoch": 4.05982905982906,
"grad_norm": 25.989973068237305,
"learning_rate": 7.970085470085472e-06,
"loss": 3.2143,
"step": 475
},
{
"epoch": 4.068376068376068,
"grad_norm": 27.048690795898438,
"learning_rate": 7.965811965811967e-06,
"loss": 2.5087,
"step": 476
},
{
"epoch": 4.076923076923077,
"grad_norm": 26.857696533203125,
"learning_rate": 7.961538461538462e-06,
"loss": 2.6466,
"step": 477
},
{
"epoch": 4.085470085470085,
"grad_norm": 33.342193603515625,
"learning_rate": 7.957264957264957e-06,
"loss": 2.6591,
"step": 478
},
{
"epoch": 4.094017094017094,
"grad_norm": 64.21253967285156,
"learning_rate": 7.952991452991454e-06,
"loss": 3.0295,
"step": 479
},
{
"epoch": 4.102564102564102,
"grad_norm": 31.240161895751953,
"learning_rate": 7.948717948717949e-06,
"loss": 2.9374,
"step": 480
},
{
"epoch": 4.111111111111111,
"grad_norm": 29.338851928710938,
"learning_rate": 7.944444444444445e-06,
"loss": 2.5019,
"step": 481
},
{
"epoch": 4.119658119658119,
"grad_norm": 36.79518127441406,
"learning_rate": 7.94017094017094e-06,
"loss": 2.7649,
"step": 482
},
{
"epoch": 4.128205128205128,
"grad_norm": 37.036739349365234,
"learning_rate": 7.935897435897435e-06,
"loss": 2.5182,
"step": 483
},
{
"epoch": 4.136752136752137,
"grad_norm": 42.571163177490234,
"learning_rate": 7.931623931623932e-06,
"loss": 2.767,
"step": 484
},
{
"epoch": 4.145299145299146,
"grad_norm": 33.72893524169922,
"learning_rate": 7.927350427350427e-06,
"loss": 3.1404,
"step": 485
},
{
"epoch": 4.153846153846154,
"grad_norm": 27.06032943725586,
"learning_rate": 7.923076923076924e-06,
"loss": 2.6825,
"step": 486
},
{
"epoch": 4.162393162393163,
"grad_norm": 31.8147029876709,
"learning_rate": 7.918803418803419e-06,
"loss": 2.5129,
"step": 487
},
{
"epoch": 4.170940170940171,
"grad_norm": 35.681793212890625,
"learning_rate": 7.914529914529914e-06,
"loss": 2.4793,
"step": 488
},
{
"epoch": 4.17948717948718,
"grad_norm": 159.4467315673828,
"learning_rate": 7.91025641025641e-06,
"loss": 3.5531,
"step": 489
},
{
"epoch": 4.188034188034188,
"grad_norm": 40.12252426147461,
"learning_rate": 7.905982905982906e-06,
"loss": 2.7095,
"step": 490
},
{
"epoch": 4.196581196581197,
"grad_norm": 27.05786895751953,
"learning_rate": 7.901709401709403e-06,
"loss": 2.5984,
"step": 491
},
{
"epoch": 4.205128205128205,
"grad_norm": 24.31035614013672,
"learning_rate": 7.897435897435898e-06,
"loss": 2.89,
"step": 492
},
{
"epoch": 4.213675213675214,
"grad_norm": 277.16156005859375,
"learning_rate": 7.893162393162394e-06,
"loss": 3.8076,
"step": 493
},
{
"epoch": 4.222222222222222,
"grad_norm": 29.722867965698242,
"learning_rate": 7.88888888888889e-06,
"loss": 2.4189,
"step": 494
},
{
"epoch": 4.230769230769231,
"grad_norm": 40.47605514526367,
"learning_rate": 7.884615384615384e-06,
"loss": 2.6225,
"step": 495
},
{
"epoch": 4.239316239316239,
"grad_norm": 29.136499404907227,
"learning_rate": 7.880341880341881e-06,
"loss": 2.5223,
"step": 496
},
{
"epoch": 4.247863247863248,
"grad_norm": 78.86258697509766,
"learning_rate": 7.876068376068376e-06,
"loss": 2.6587,
"step": 497
},
{
"epoch": 4.256410256410256,
"grad_norm": 24.473243713378906,
"learning_rate": 7.871794871794873e-06,
"loss": 2.456,
"step": 498
},
{
"epoch": 4.264957264957265,
"grad_norm": 80.45248413085938,
"learning_rate": 7.867521367521368e-06,
"loss": 3.1893,
"step": 499
},
{
"epoch": 4.273504273504273,
"grad_norm": 194.2708282470703,
"learning_rate": 7.863247863247863e-06,
"loss": 3.8294,
"step": 500
},
{
"epoch": 4.282051282051282,
"grad_norm": 27.74302101135254,
"learning_rate": 7.85897435897436e-06,
"loss": 2.2506,
"step": 501
},
{
"epoch": 4.2905982905982905,
"grad_norm": 21.90385627746582,
"learning_rate": 7.854700854700855e-06,
"loss": 3.0985,
"step": 502
},
{
"epoch": 4.299145299145299,
"grad_norm": 50.30342102050781,
"learning_rate": 7.850427350427352e-06,
"loss": 2.526,
"step": 503
},
{
"epoch": 4.3076923076923075,
"grad_norm": 28.666881561279297,
"learning_rate": 7.846153846153847e-06,
"loss": 2.4213,
"step": 504
},
{
"epoch": 4.316239316239316,
"grad_norm": 27.927257537841797,
"learning_rate": 7.841880341880342e-06,
"loss": 2.6731,
"step": 505
},
{
"epoch": 4.3247863247863245,
"grad_norm": 36.12032699584961,
"learning_rate": 7.837606837606838e-06,
"loss": 2.3323,
"step": 506
},
{
"epoch": 4.333333333333333,
"grad_norm": 31.632287979125977,
"learning_rate": 7.833333333333333e-06,
"loss": 2.2966,
"step": 507
},
{
"epoch": 4.3418803418803416,
"grad_norm": 26.511537551879883,
"learning_rate": 7.82905982905983e-06,
"loss": 2.3422,
"step": 508
},
{
"epoch": 4.35042735042735,
"grad_norm": 31.429107666015625,
"learning_rate": 7.824786324786325e-06,
"loss": 2.6764,
"step": 509
},
{
"epoch": 4.358974358974359,
"grad_norm": 29.8817138671875,
"learning_rate": 7.820512820512822e-06,
"loss": 2.4358,
"step": 510
},
{
"epoch": 4.367521367521368,
"grad_norm": 29.293964385986328,
"learning_rate": 7.816239316239317e-06,
"loss": 2.504,
"step": 511
},
{
"epoch": 4.3760683760683765,
"grad_norm": 23.624290466308594,
"learning_rate": 7.811965811965812e-06,
"loss": 2.0312,
"step": 512
},
{
"epoch": 4.384615384615385,
"grad_norm": 25.336505889892578,
"learning_rate": 7.807692307692309e-06,
"loss": 2.1045,
"step": 513
},
{
"epoch": 4.3931623931623935,
"grad_norm": 24.755443572998047,
"learning_rate": 7.803418803418804e-06,
"loss": 2.5754,
"step": 514
},
{
"epoch": 4.401709401709402,
"grad_norm": 29.29696273803711,
"learning_rate": 7.7991452991453e-06,
"loss": 2.562,
"step": 515
},
{
"epoch": 4.410256410256411,
"grad_norm": 28.054868698120117,
"learning_rate": 7.794871794871796e-06,
"loss": 1.9815,
"step": 516
},
{
"epoch": 4.418803418803419,
"grad_norm": 20.894853591918945,
"learning_rate": 7.79059829059829e-06,
"loss": 2.5668,
"step": 517
},
{
"epoch": 4.427350427350428,
"grad_norm": 19.532094955444336,
"learning_rate": 7.786324786324787e-06,
"loss": 2.2314,
"step": 518
},
{
"epoch": 4.435897435897436,
"grad_norm": 27.919715881347656,
"learning_rate": 7.782051282051282e-06,
"loss": 1.9523,
"step": 519
},
{
"epoch": 4.444444444444445,
"grad_norm": 21.91543960571289,
"learning_rate": 7.77777777777778e-06,
"loss": 2.559,
"step": 520
},
{
"epoch": 4.452991452991453,
"grad_norm": 26.20106315612793,
"learning_rate": 7.773504273504274e-06,
"loss": 2.367,
"step": 521
},
{
"epoch": 4.461538461538462,
"grad_norm": 23.455419540405273,
"learning_rate": 7.76923076923077e-06,
"loss": 2.4132,
"step": 522
},
{
"epoch": 4.47008547008547,
"grad_norm": 49.62391662597656,
"learning_rate": 7.764957264957266e-06,
"loss": 1.8896,
"step": 523
},
{
"epoch": 4.478632478632479,
"grad_norm": 25.721101760864258,
"learning_rate": 7.760683760683761e-06,
"loss": 1.9918,
"step": 524
},
{
"epoch": 4.487179487179487,
"grad_norm": 22.906694412231445,
"learning_rate": 7.756410256410258e-06,
"loss": 2.1819,
"step": 525
},
{
"epoch": 4.495726495726496,
"grad_norm": 28.5809268951416,
"learning_rate": 7.752136752136753e-06,
"loss": 2.0516,
"step": 526
},
{
"epoch": 4.504273504273504,
"grad_norm": 26.47665023803711,
"learning_rate": 7.74786324786325e-06,
"loss": 2.0081,
"step": 527
},
{
"epoch": 4.512820512820513,
"grad_norm": 27.221372604370117,
"learning_rate": 7.743589743589745e-06,
"loss": 2.0414,
"step": 528
},
{
"epoch": 4.521367521367521,
"grad_norm": 27.931568145751953,
"learning_rate": 7.73931623931624e-06,
"loss": 2.0335,
"step": 529
},
{
"epoch": 4.52991452991453,
"grad_norm": 25.567049026489258,
"learning_rate": 7.735042735042736e-06,
"loss": 2.0129,
"step": 530
},
{
"epoch": 4.538461538461538,
"grad_norm": 30.897083282470703,
"learning_rate": 7.730769230769231e-06,
"loss": 2.3941,
"step": 531
},
{
"epoch": 4.547008547008547,
"grad_norm": 21.92133903503418,
"learning_rate": 7.726495726495728e-06,
"loss": 2.2563,
"step": 532
},
{
"epoch": 4.555555555555555,
"grad_norm": 27.053892135620117,
"learning_rate": 7.722222222222223e-06,
"loss": 2.2463,
"step": 533
},
{
"epoch": 4.564102564102564,
"grad_norm": 29.3230037689209,
"learning_rate": 7.717948717948718e-06,
"loss": 1.9167,
"step": 534
},
{
"epoch": 4.572649572649572,
"grad_norm": 36.06028747558594,
"learning_rate": 7.713675213675215e-06,
"loss": 1.9106,
"step": 535
},
{
"epoch": 4.581196581196581,
"grad_norm": 24.622135162353516,
"learning_rate": 7.70940170940171e-06,
"loss": 2.2899,
"step": 536
},
{
"epoch": 4.589743589743589,
"grad_norm": 21.3137264251709,
"learning_rate": 7.705128205128207e-06,
"loss": 2.0166,
"step": 537
},
{
"epoch": 4.598290598290598,
"grad_norm": 21.939279556274414,
"learning_rate": 7.700854700854702e-06,
"loss": 2.3319,
"step": 538
},
{
"epoch": 4.6068376068376065,
"grad_norm": 25.496994018554688,
"learning_rate": 7.696581196581197e-06,
"loss": 2.6162,
"step": 539
},
{
"epoch": 4.615384615384615,
"grad_norm": 24.095666885375977,
"learning_rate": 7.692307692307694e-06,
"loss": 2.2863,
"step": 540
},
{
"epoch": 4.6239316239316235,
"grad_norm": 31.96511459350586,
"learning_rate": 7.688034188034189e-06,
"loss": 2.0261,
"step": 541
},
{
"epoch": 4.632478632478632,
"grad_norm": 22.66115379333496,
"learning_rate": 7.683760683760685e-06,
"loss": 2.2786,
"step": 542
},
{
"epoch": 4.641025641025641,
"grad_norm": 23.661611557006836,
"learning_rate": 7.67948717948718e-06,
"loss": 1.7113,
"step": 543
},
{
"epoch": 4.64957264957265,
"grad_norm": 18.64708709716797,
"learning_rate": 7.675213675213677e-06,
"loss": 2.1389,
"step": 544
},
{
"epoch": 4.6581196581196584,
"grad_norm": 20.55480194091797,
"learning_rate": 7.670940170940172e-06,
"loss": 2.0831,
"step": 545
},
{
"epoch": 4.666666666666667,
"grad_norm": 27.876964569091797,
"learning_rate": 7.666666666666667e-06,
"loss": 2.0358,
"step": 546
},
{
"epoch": 4.6752136752136755,
"grad_norm": 20.236507415771484,
"learning_rate": 7.662393162393164e-06,
"loss": 1.5596,
"step": 547
},
{
"epoch": 4.683760683760684,
"grad_norm": 23.360782623291016,
"learning_rate": 7.658119658119659e-06,
"loss": 1.9623,
"step": 548
},
{
"epoch": 4.6923076923076925,
"grad_norm": 41.7568359375,
"learning_rate": 7.653846153846154e-06,
"loss": 1.9884,
"step": 549
},
{
"epoch": 4.700854700854701,
"grad_norm": 28.651065826416016,
"learning_rate": 7.649572649572649e-06,
"loss": 2.1491,
"step": 550
},
{
"epoch": 4.7094017094017095,
"grad_norm": 23.636432647705078,
"learning_rate": 7.645299145299146e-06,
"loss": 1.9352,
"step": 551
},
{
"epoch": 4.717948717948718,
"grad_norm": 25.313966751098633,
"learning_rate": 7.641025641025641e-06,
"loss": 2.4112,
"step": 552
},
{
"epoch": 4.726495726495727,
"grad_norm": 32.4974479675293,
"learning_rate": 7.636752136752138e-06,
"loss": 1.7017,
"step": 553
},
{
"epoch": 4.735042735042735,
"grad_norm": 20.644481658935547,
"learning_rate": 7.632478632478633e-06,
"loss": 1.6904,
"step": 554
},
{
"epoch": 4.743589743589744,
"grad_norm": 26.526721954345703,
"learning_rate": 7.6282051282051286e-06,
"loss": 2.1666,
"step": 555
},
{
"epoch": 4.752136752136752,
"grad_norm": 23.375839233398438,
"learning_rate": 7.6239316239316244e-06,
"loss": 1.5555,
"step": 556
},
{
"epoch": 4.760683760683761,
"grad_norm": 29.890501022338867,
"learning_rate": 7.6196581196581195e-06,
"loss": 2.0195,
"step": 557
},
{
"epoch": 4.769230769230769,
"grad_norm": 687.5745239257812,
"learning_rate": 7.615384615384615e-06,
"loss": 2.4286,
"step": 558
},
{
"epoch": 4.777777777777778,
"grad_norm": 22.844587326049805,
"learning_rate": 7.611111111111111e-06,
"loss": 2.2335,
"step": 559
},
{
"epoch": 4.786324786324786,
"grad_norm": 29.633562088012695,
"learning_rate": 7.606837606837607e-06,
"loss": 1.7579,
"step": 560
},
{
"epoch": 4.794871794871795,
"grad_norm": 48.04582977294922,
"learning_rate": 7.602564102564103e-06,
"loss": 2.3846,
"step": 561
},
{
"epoch": 4.803418803418803,
"grad_norm": 27.2290096282959,
"learning_rate": 7.598290598290599e-06,
"loss": 2.2234,
"step": 562
},
{
"epoch": 4.811965811965812,
"grad_norm": 29.782209396362305,
"learning_rate": 7.594017094017094e-06,
"loss": 2.0365,
"step": 563
},
{
"epoch": 4.82051282051282,
"grad_norm": 32.457061767578125,
"learning_rate": 7.58974358974359e-06,
"loss": 2.0451,
"step": 564
},
{
"epoch": 4.829059829059829,
"grad_norm": 22.089427947998047,
"learning_rate": 7.585470085470086e-06,
"loss": 1.7105,
"step": 565
},
{
"epoch": 4.837606837606837,
"grad_norm": 23.105140686035156,
"learning_rate": 7.581196581196582e-06,
"loss": 1.6817,
"step": 566
},
{
"epoch": 4.846153846153846,
"grad_norm": 24.513713836669922,
"learning_rate": 7.5769230769230775e-06,
"loss": 1.9553,
"step": 567
},
{
"epoch": 4.854700854700854,
"grad_norm": 22.187759399414062,
"learning_rate": 7.572649572649573e-06,
"loss": 2.0309,
"step": 568
},
{
"epoch": 4.863247863247864,
"grad_norm": 53.56728744506836,
"learning_rate": 7.5683760683760685e-06,
"loss": 2.6508,
"step": 569
},
{
"epoch": 4.871794871794872,
"grad_norm": 27.983978271484375,
"learning_rate": 7.564102564102564e-06,
"loss": 2.1942,
"step": 570
},
{
"epoch": 4.880341880341881,
"grad_norm": 25.610252380371094,
"learning_rate": 7.55982905982906e-06,
"loss": 1.4151,
"step": 571
},
{
"epoch": 4.888888888888889,
"grad_norm": 19.856618881225586,
"learning_rate": 7.555555555555556e-06,
"loss": 1.6968,
"step": 572
},
{
"epoch": 4.897435897435898,
"grad_norm": 20.288606643676758,
"learning_rate": 7.551282051282052e-06,
"loss": 1.7494,
"step": 573
},
{
"epoch": 4.905982905982906,
"grad_norm": 23.206768035888672,
"learning_rate": 7.547008547008547e-06,
"loss": 2.1255,
"step": 574
},
{
"epoch": 4.914529914529915,
"grad_norm": 21.275257110595703,
"learning_rate": 7.542735042735043e-06,
"loss": 1.7442,
"step": 575
},
{
"epoch": 4.923076923076923,
"grad_norm": 22.635417938232422,
"learning_rate": 7.538461538461539e-06,
"loss": 1.9129,
"step": 576
},
{
"epoch": 4.931623931623932,
"grad_norm": 21.440109252929688,
"learning_rate": 7.534188034188035e-06,
"loss": 2.0056,
"step": 577
},
{
"epoch": 4.94017094017094,
"grad_norm": 20.939407348632812,
"learning_rate": 7.529914529914531e-06,
"loss": 1.7231,
"step": 578
},
{
"epoch": 4.948717948717949,
"grad_norm": 16.189861297607422,
"learning_rate": 7.5256410256410265e-06,
"loss": 1.4255,
"step": 579
},
{
"epoch": 4.957264957264957,
"grad_norm": 23.6302547454834,
"learning_rate": 7.521367521367522e-06,
"loss": 1.6748,
"step": 580
},
{
"epoch": 4.965811965811966,
"grad_norm": 22.29713249206543,
"learning_rate": 7.5170940170940175e-06,
"loss": 1.5285,
"step": 581
},
{
"epoch": 4.9743589743589745,
"grad_norm": 22.831275939941406,
"learning_rate": 7.512820512820513e-06,
"loss": 1.7742,
"step": 582
},
{
"epoch": 4.982905982905983,
"grad_norm": 630.5899658203125,
"learning_rate": 7.508547008547009e-06,
"loss": 2.8598,
"step": 583
},
{
"epoch": 4.9914529914529915,
"grad_norm": 22.880647659301758,
"learning_rate": 7.504273504273505e-06,
"loss": 1.6231,
"step": 584
},
{
"epoch": 5.0,
"grad_norm": 21.379072189331055,
"learning_rate": 7.500000000000001e-06,
"loss": 1.3506,
"step": 585
},
{
"epoch": 5.0,
"eval_loss": 0.8325614333152771,
"eval_runtime": 9.2303,
"eval_samples_per_second": 50.486,
"eval_steps_per_second": 6.392,
"step": 585
},
{
"epoch": 5.0085470085470085,
"grad_norm": 23.968698501586914,
"learning_rate": 7.495726495726496e-06,
"loss": 1.4263,
"step": 586
},
{
"epoch": 5.017094017094017,
"grad_norm": 24.880769729614258,
"learning_rate": 7.491452991452992e-06,
"loss": 1.4994,
"step": 587
},
{
"epoch": 5.0256410256410255,
"grad_norm": 23.4547176361084,
"learning_rate": 7.487179487179488e-06,
"loss": 1.671,
"step": 588
},
{
"epoch": 5.034188034188034,
"grad_norm": 17.382152557373047,
"learning_rate": 7.482905982905984e-06,
"loss": 1.3935,
"step": 589
},
{
"epoch": 5.042735042735043,
"grad_norm": 19.607717514038086,
"learning_rate": 7.47863247863248e-06,
"loss": 1.5652,
"step": 590
},
{
"epoch": 5.051282051282051,
"grad_norm": 27.735240936279297,
"learning_rate": 7.474358974358975e-06,
"loss": 1.5491,
"step": 591
},
{
"epoch": 5.05982905982906,
"grad_norm": 20.493412017822266,
"learning_rate": 7.4700854700854706e-06,
"loss": 1.9229,
"step": 592
},
{
"epoch": 5.068376068376068,
"grad_norm": 20.492137908935547,
"learning_rate": 7.4658119658119665e-06,
"loss": 1.5066,
"step": 593
},
{
"epoch": 5.076923076923077,
"grad_norm": 27.650495529174805,
"learning_rate": 7.461538461538462e-06,
"loss": 1.4228,
"step": 594
},
{
"epoch": 5.085470085470085,
"grad_norm": 22.38190269470215,
"learning_rate": 7.457264957264958e-06,
"loss": 1.6243,
"step": 595
},
{
"epoch": 5.094017094017094,
"grad_norm": 22.862489700317383,
"learning_rate": 7.452991452991454e-06,
"loss": 1.9224,
"step": 596
},
{
"epoch": 5.102564102564102,
"grad_norm": 17.368051528930664,
"learning_rate": 7.448717948717949e-06,
"loss": 1.3642,
"step": 597
},
{
"epoch": 5.111111111111111,
"grad_norm": 20.587018966674805,
"learning_rate": 7.444444444444445e-06,
"loss": 1.471,
"step": 598
},
{
"epoch": 5.119658119658119,
"grad_norm": 18.502887725830078,
"learning_rate": 7.440170940170941e-06,
"loss": 1.9841,
"step": 599
},
{
"epoch": 5.128205128205128,
"grad_norm": 21.305294036865234,
"learning_rate": 7.435897435897437e-06,
"loss": 1.8564,
"step": 600
},
{
"epoch": 5.136752136752137,
"grad_norm": 20.61264419555664,
"learning_rate": 7.431623931623933e-06,
"loss": 1.3554,
"step": 601
},
{
"epoch": 5.145299145299146,
"grad_norm": 19.05555534362793,
"learning_rate": 7.427350427350429e-06,
"loss": 1.6612,
"step": 602
},
{
"epoch": 5.153846153846154,
"grad_norm": 20.392446517944336,
"learning_rate": 7.423076923076924e-06,
"loss": 1.5071,
"step": 603
},
{
"epoch": 5.162393162393163,
"grad_norm": 22.007591247558594,
"learning_rate": 7.4188034188034196e-06,
"loss": 1.3356,
"step": 604
},
{
"epoch": 5.170940170940171,
"grad_norm": 18.928104400634766,
"learning_rate": 7.4145299145299155e-06,
"loss": 1.6214,
"step": 605
},
{
"epoch": 5.17948717948718,
"grad_norm": 21.151193618774414,
"learning_rate": 7.410256410256411e-06,
"loss": 1.5275,
"step": 606
},
{
"epoch": 5.188034188034188,
"grad_norm": 16.272262573242188,
"learning_rate": 7.405982905982907e-06,
"loss": 1.2773,
"step": 607
},
{
"epoch": 5.196581196581197,
"grad_norm": 21.59275245666504,
"learning_rate": 7.401709401709402e-06,
"loss": 1.3503,
"step": 608
},
{
"epoch": 5.205128205128205,
"grad_norm": 84.31806182861328,
"learning_rate": 7.397435897435898e-06,
"loss": 1.8618,
"step": 609
},
{
"epoch": 5.213675213675214,
"grad_norm": 20.374465942382812,
"learning_rate": 7.393162393162394e-06,
"loss": 1.6153,
"step": 610
},
{
"epoch": 5.222222222222222,
"grad_norm": 18.569623947143555,
"learning_rate": 7.38888888888889e-06,
"loss": 1.7101,
"step": 611
},
{
"epoch": 5.230769230769231,
"grad_norm": 19.51409339904785,
"learning_rate": 7.384615384615386e-06,
"loss": 1.5801,
"step": 612
},
{
"epoch": 5.239316239316239,
"grad_norm": 19.45322608947754,
"learning_rate": 7.380341880341882e-06,
"loss": 1.1376,
"step": 613
},
{
"epoch": 5.247863247863248,
"grad_norm": 23.474557876586914,
"learning_rate": 7.376068376068377e-06,
"loss": 1.442,
"step": 614
},
{
"epoch": 5.256410256410256,
"grad_norm": 21.458847045898438,
"learning_rate": 7.371794871794873e-06,
"loss": 1.2769,
"step": 615
},
{
"epoch": 5.264957264957265,
"grad_norm": 25.741121292114258,
"learning_rate": 7.3675213675213686e-06,
"loss": 1.3321,
"step": 616
},
{
"epoch": 5.273504273504273,
"grad_norm": 15.394718170166016,
"learning_rate": 7.3632478632478645e-06,
"loss": 1.2335,
"step": 617
},
{
"epoch": 5.282051282051282,
"grad_norm": 20.938871383666992,
"learning_rate": 7.35897435897436e-06,
"loss": 1.5741,
"step": 618
},
{
"epoch": 5.2905982905982905,
"grad_norm": 19.348268508911133,
"learning_rate": 7.354700854700856e-06,
"loss": 1.2493,
"step": 619
},
{
"epoch": 5.299145299145299,
"grad_norm": 25.26751708984375,
"learning_rate": 7.350427350427351e-06,
"loss": 1.5167,
"step": 620
},
{
"epoch": 5.3076923076923075,
"grad_norm": 22.099227905273438,
"learning_rate": 7.346153846153847e-06,
"loss": 1.3269,
"step": 621
},
{
"epoch": 5.316239316239316,
"grad_norm": 21.483428955078125,
"learning_rate": 7.341880341880342e-06,
"loss": 1.4249,
"step": 622
},
{
"epoch": 5.3247863247863245,
"grad_norm": 20.089691162109375,
"learning_rate": 7.337606837606837e-06,
"loss": 1.351,
"step": 623
},
{
"epoch": 5.333333333333333,
"grad_norm": 138.9898223876953,
"learning_rate": 7.333333333333333e-06,
"loss": 1.5682,
"step": 624
},
{
"epoch": 5.3418803418803416,
"grad_norm": 16.808000564575195,
"learning_rate": 7.329059829059829e-06,
"loss": 1.4794,
"step": 625
},
{
"epoch": 5.35042735042735,
"grad_norm": 18.58464813232422,
"learning_rate": 7.324786324786325e-06,
"loss": 1.4486,
"step": 626
},
{
"epoch": 5.358974358974359,
"grad_norm": 15.074477195739746,
"learning_rate": 7.320512820512821e-06,
"loss": 1.3124,
"step": 627
},
{
"epoch": 5.367521367521368,
"grad_norm": 15.800148963928223,
"learning_rate": 7.316239316239317e-06,
"loss": 1.7055,
"step": 628
},
{
"epoch": 5.3760683760683765,
"grad_norm": 19.166179656982422,
"learning_rate": 7.311965811965812e-06,
"loss": 1.7306,
"step": 629
},
{
"epoch": 5.384615384615385,
"grad_norm": 55.91648864746094,
"learning_rate": 7.307692307692308e-06,
"loss": 1.2376,
"step": 630
},
{
"epoch": 5.3931623931623935,
"grad_norm": 16.606033325195312,
"learning_rate": 7.3034188034188035e-06,
"loss": 1.1159,
"step": 631
},
{
"epoch": 5.401709401709402,
"grad_norm": 17.0134220123291,
"learning_rate": 7.299145299145299e-06,
"loss": 1.2124,
"step": 632
},
{
"epoch": 5.410256410256411,
"grad_norm": 17.511932373046875,
"learning_rate": 7.294871794871795e-06,
"loss": 1.4221,
"step": 633
},
{
"epoch": 5.418803418803419,
"grad_norm": 44.53416061401367,
"learning_rate": 7.290598290598291e-06,
"loss": 1.9583,
"step": 634
},
{
"epoch": 5.427350427350428,
"grad_norm": 16.546630859375,
"learning_rate": 7.286324786324786e-06,
"loss": 1.1722,
"step": 635
},
{
"epoch": 5.435897435897436,
"grad_norm": 39.90822982788086,
"learning_rate": 7.282051282051282e-06,
"loss": 1.7482,
"step": 636
},
{
"epoch": 5.444444444444445,
"grad_norm": 16.186573028564453,
"learning_rate": 7.277777777777778e-06,
"loss": 1.3422,
"step": 637
},
{
"epoch": 5.452991452991453,
"grad_norm": 18.84516143798828,
"learning_rate": 7.273504273504274e-06,
"loss": 1.3299,
"step": 638
},
{
"epoch": 5.461538461538462,
"grad_norm": 14.620058059692383,
"learning_rate": 7.26923076923077e-06,
"loss": 1.0604,
"step": 639
},
{
"epoch": 5.47008547008547,
"grad_norm": 16.5911865234375,
"learning_rate": 7.264957264957266e-06,
"loss": 1.1138,
"step": 640
},
{
"epoch": 5.478632478632479,
"grad_norm": 15.44485092163086,
"learning_rate": 7.260683760683761e-06,
"loss": 1.435,
"step": 641
},
{
"epoch": 5.487179487179487,
"grad_norm": 121.76724243164062,
"learning_rate": 7.256410256410257e-06,
"loss": 1.7167,
"step": 642
},
{
"epoch": 5.495726495726496,
"grad_norm": 1996.141357421875,
"learning_rate": 7.2521367521367525e-06,
"loss": 4.0296,
"step": 643
},
{
"epoch": 5.504273504273504,
"grad_norm": 15.072067260742188,
"learning_rate": 7.247863247863248e-06,
"loss": 1.0455,
"step": 644
},
{
"epoch": 5.512820512820513,
"grad_norm": 16.684345245361328,
"learning_rate": 7.243589743589744e-06,
"loss": 1.7565,
"step": 645
},
{
"epoch": 5.521367521367521,
"grad_norm": 15.515148162841797,
"learning_rate": 7.239316239316239e-06,
"loss": 1.4601,
"step": 646
},
{
"epoch": 5.52991452991453,
"grad_norm": 20.1015625,
"learning_rate": 7.235042735042735e-06,
"loss": 1.073,
"step": 647
},
{
"epoch": 5.538461538461538,
"grad_norm": 67.10873413085938,
"learning_rate": 7.230769230769231e-06,
"loss": 1.8586,
"step": 648
},
{
"epoch": 5.547008547008547,
"grad_norm": 13.775193214416504,
"learning_rate": 7.226495726495727e-06,
"loss": 1.2891,
"step": 649
},
{
"epoch": 5.555555555555555,
"grad_norm": 14.612048149108887,
"learning_rate": 7.222222222222223e-06,
"loss": 1.033,
"step": 650
},
{
"epoch": 5.564102564102564,
"grad_norm": 14.512042999267578,
"learning_rate": 7.217948717948719e-06,
"loss": 1.1446,
"step": 651
},
{
"epoch": 5.572649572649572,
"grad_norm": 13.720820426940918,
"learning_rate": 7.213675213675214e-06,
"loss": 1.1246,
"step": 652
},
{
"epoch": 5.581196581196581,
"grad_norm": 16.548046112060547,
"learning_rate": 7.20940170940171e-06,
"loss": 1.3162,
"step": 653
},
{
"epoch": 5.589743589743589,
"grad_norm": 20.535181045532227,
"learning_rate": 7.205128205128206e-06,
"loss": 1.3019,
"step": 654
},
{
"epoch": 5.598290598290598,
"grad_norm": 14.317465782165527,
"learning_rate": 7.2008547008547015e-06,
"loss": 1.5447,
"step": 655
},
{
"epoch": 5.6068376068376065,
"grad_norm": 16.23088836669922,
"learning_rate": 7.196581196581197e-06,
"loss": 1.2701,
"step": 656
},
{
"epoch": 5.615384615384615,
"grad_norm": 13.754173278808594,
"learning_rate": 7.192307692307693e-06,
"loss": 1.2218,
"step": 657
},
{
"epoch": 5.6239316239316235,
"grad_norm": 75.77688598632812,
"learning_rate": 7.188034188034188e-06,
"loss": 1.7547,
"step": 658
},
{
"epoch": 5.632478632478632,
"grad_norm": 19.452077865600586,
"learning_rate": 7.183760683760684e-06,
"loss": 1.1446,
"step": 659
},
{
"epoch": 5.641025641025641,
"grad_norm": 14.513677597045898,
"learning_rate": 7.17948717948718e-06,
"loss": 1.0527,
"step": 660
},
{
"epoch": 5.64957264957265,
"grad_norm": 27.67446517944336,
"learning_rate": 7.175213675213676e-06,
"loss": 1.1953,
"step": 661
},
{
"epoch": 5.6581196581196584,
"grad_norm": 12.137639999389648,
"learning_rate": 7.170940170940172e-06,
"loss": 1.1127,
"step": 662
},
{
"epoch": 5.666666666666667,
"grad_norm": 17.2878475189209,
"learning_rate": 7.166666666666667e-06,
"loss": 1.0475,
"step": 663
},
{
"epoch": 5.6752136752136755,
"grad_norm": 28.070842742919922,
"learning_rate": 7.162393162393163e-06,
"loss": 1.6271,
"step": 664
},
{
"epoch": 5.683760683760684,
"grad_norm": 17.74942398071289,
"learning_rate": 7.158119658119659e-06,
"loss": 1.1759,
"step": 665
},
{
"epoch": 5.6923076923076925,
"grad_norm": 19.545486450195312,
"learning_rate": 7.153846153846155e-06,
"loss": 0.9753,
"step": 666
},
{
"epoch": 5.700854700854701,
"grad_norm": 24.34153938293457,
"learning_rate": 7.1495726495726505e-06,
"loss": 1.0905,
"step": 667
},
{
"epoch": 5.7094017094017095,
"grad_norm": 211.7845001220703,
"learning_rate": 7.145299145299146e-06,
"loss": 1.6455,
"step": 668
},
{
"epoch": 5.717948717948718,
"grad_norm": 14.03074836730957,
"learning_rate": 7.1410256410256414e-06,
"loss": 1.3728,
"step": 669
},
{
"epoch": 5.726495726495727,
"grad_norm": 27.600345611572266,
"learning_rate": 7.136752136752137e-06,
"loss": 1.4212,
"step": 670
},
{
"epoch": 5.735042735042735,
"grad_norm": 15.755846977233887,
"learning_rate": 7.132478632478633e-06,
"loss": 1.148,
"step": 671
},
{
"epoch": 5.743589743589744,
"grad_norm": 12.816133499145508,
"learning_rate": 7.128205128205129e-06,
"loss": 1.0053,
"step": 672
},
{
"epoch": 5.752136752136752,
"grad_norm": 25.097660064697266,
"learning_rate": 7.123931623931625e-06,
"loss": 1.1561,
"step": 673
},
{
"epoch": 5.760683760683761,
"grad_norm": 19.249279022216797,
"learning_rate": 7.119658119658121e-06,
"loss": 1.2582,
"step": 674
},
{
"epoch": 5.769230769230769,
"grad_norm": 18.606924057006836,
"learning_rate": 7.115384615384616e-06,
"loss": 0.8569,
"step": 675
},
{
"epoch": 5.777777777777778,
"grad_norm": 20.2148380279541,
"learning_rate": 7.111111111111112e-06,
"loss": 1.1126,
"step": 676
},
{
"epoch": 5.786324786324786,
"grad_norm": 18.623268127441406,
"learning_rate": 7.106837606837608e-06,
"loss": 1.6129,
"step": 677
},
{
"epoch": 5.794871794871795,
"grad_norm": 14.888258934020996,
"learning_rate": 7.102564102564104e-06,
"loss": 1.2533,
"step": 678
},
{
"epoch": 5.803418803418803,
"grad_norm": 15.351551055908203,
"learning_rate": 7.0982905982905995e-06,
"loss": 1.2392,
"step": 679
},
{
"epoch": 5.811965811965812,
"grad_norm": 23.243993759155273,
"learning_rate": 7.0940170940170945e-06,
"loss": 1.3136,
"step": 680
},
{
"epoch": 5.82051282051282,
"grad_norm": 18.346277236938477,
"learning_rate": 7.0897435897435904e-06,
"loss": 1.5691,
"step": 681
},
{
"epoch": 5.829059829059829,
"grad_norm": 12.904829025268555,
"learning_rate": 7.085470085470086e-06,
"loss": 0.9248,
"step": 682
},
{
"epoch": 5.837606837606837,
"grad_norm": 13.263056755065918,
"learning_rate": 7.081196581196582e-06,
"loss": 1.0555,
"step": 683
},
{
"epoch": 5.846153846153846,
"grad_norm": 19.311899185180664,
"learning_rate": 7.076923076923078e-06,
"loss": 1.4341,
"step": 684
},
{
"epoch": 5.854700854700854,
"grad_norm": 282.1452331542969,
"learning_rate": 7.072649572649574e-06,
"loss": 1.9797,
"step": 685
},
{
"epoch": 5.863247863247864,
"grad_norm": 14.317438125610352,
"learning_rate": 7.068376068376069e-06,
"loss": 0.839,
"step": 686
},
{
"epoch": 5.871794871794872,
"grad_norm": 13.549150466918945,
"learning_rate": 7.064102564102565e-06,
"loss": 1.1003,
"step": 687
},
{
"epoch": 5.880341880341881,
"grad_norm": 14.283610343933105,
"learning_rate": 7.059829059829061e-06,
"loss": 1.0297,
"step": 688
},
{
"epoch": 5.888888888888889,
"grad_norm": 18.737884521484375,
"learning_rate": 7.055555555555557e-06,
"loss": 0.9817,
"step": 689
},
{
"epoch": 5.897435897435898,
"grad_norm": 24.12625503540039,
"learning_rate": 7.051282051282053e-06,
"loss": 1.1837,
"step": 690
},
{
"epoch": 5.905982905982906,
"grad_norm": 11.760732650756836,
"learning_rate": 7.0470085470085485e-06,
"loss": 1.5131,
"step": 691
},
{
"epoch": 5.914529914529915,
"grad_norm": 16.138668060302734,
"learning_rate": 7.0427350427350435e-06,
"loss": 0.9569,
"step": 692
},
{
"epoch": 5.923076923076923,
"grad_norm": 17.727285385131836,
"learning_rate": 7.038461538461539e-06,
"loss": 0.9834,
"step": 693
},
{
"epoch": 5.931623931623932,
"grad_norm": 13.434252738952637,
"learning_rate": 7.034188034188035e-06,
"loss": 1.3635,
"step": 694
},
{
"epoch": 5.94017094017094,
"grad_norm": 15.587186813354492,
"learning_rate": 7.02991452991453e-06,
"loss": 1.4814,
"step": 695
},
{
"epoch": 5.948717948717949,
"grad_norm": 31.379039764404297,
"learning_rate": 7.025641025641025e-06,
"loss": 0.8792,
"step": 696
},
{
"epoch": 5.957264957264957,
"grad_norm": 14.575559616088867,
"learning_rate": 7.021367521367521e-06,
"loss": 0.8865,
"step": 697
},
{
"epoch": 5.965811965811966,
"grad_norm": 13.55718994140625,
"learning_rate": 7.017094017094017e-06,
"loss": 0.9564,
"step": 698
},
{
"epoch": 5.9743589743589745,
"grad_norm": 13.288110733032227,
"learning_rate": 7.012820512820513e-06,
"loss": 0.8117,
"step": 699
},
{
"epoch": 5.982905982905983,
"grad_norm": 14.522254943847656,
"learning_rate": 7.008547008547009e-06,
"loss": 1.2037,
"step": 700
},
{
"epoch": 5.9914529914529915,
"grad_norm": 14.575456619262695,
"learning_rate": 7.004273504273504e-06,
"loss": 1.028,
"step": 701
},
{
"epoch": 6.0,
"grad_norm": 13.18249225616455,
"learning_rate": 7e-06,
"loss": 0.6528,
"step": 702
},
{
"epoch": 6.0,
"eval_loss": 0.4769609868526459,
"eval_runtime": 9.253,
"eval_samples_per_second": 50.362,
"eval_steps_per_second": 6.376,
"step": 702
},
{
"epoch": 6.0085470085470085,
"grad_norm": 17.034433364868164,
"learning_rate": 6.995726495726496e-06,
"loss": 0.847,
"step": 703
},
{
"epoch": 6.017094017094017,
"grad_norm": 13.455194473266602,
"learning_rate": 6.991452991452992e-06,
"loss": 0.8545,
"step": 704
},
{
"epoch": 6.0256410256410255,
"grad_norm": 14.511704444885254,
"learning_rate": 6.9871794871794876e-06,
"loss": 0.9365,
"step": 705
},
{
"epoch": 6.034188034188034,
"grad_norm": 14.325255393981934,
"learning_rate": 6.9829059829059835e-06,
"loss": 0.869,
"step": 706
},
{
"epoch": 6.042735042735043,
"grad_norm": 12.944524765014648,
"learning_rate": 6.9786324786324785e-06,
"loss": 1.1417,
"step": 707
},
{
"epoch": 6.051282051282051,
"grad_norm": 14.992669105529785,
"learning_rate": 6.974358974358974e-06,
"loss": 1.4935,
"step": 708
},
{
"epoch": 6.05982905982906,
"grad_norm": 15.394392013549805,
"learning_rate": 6.97008547008547e-06,
"loss": 1.519,
"step": 709
},
{
"epoch": 6.068376068376068,
"grad_norm": 12.605085372924805,
"learning_rate": 6.965811965811966e-06,
"loss": 1.4419,
"step": 710
},
{
"epoch": 6.076923076923077,
"grad_norm": 16.47636604309082,
"learning_rate": 6.961538461538462e-06,
"loss": 0.9552,
"step": 711
},
{
"epoch": 6.085470085470085,
"grad_norm": 17.04586410522461,
"learning_rate": 6.957264957264958e-06,
"loss": 0.9847,
"step": 712
},
{
"epoch": 6.094017094017094,
"grad_norm": 15.464738845825195,
"learning_rate": 6.952991452991453e-06,
"loss": 0.9272,
"step": 713
},
{
"epoch": 6.102564102564102,
"grad_norm": 11.837206840515137,
"learning_rate": 6.948717948717949e-06,
"loss": 1.1682,
"step": 714
},
{
"epoch": 6.111111111111111,
"grad_norm": 11.013447761535645,
"learning_rate": 6.944444444444445e-06,
"loss": 1.222,
"step": 715
},
{
"epoch": 6.119658119658119,
"grad_norm": 15.37415885925293,
"learning_rate": 6.940170940170941e-06,
"loss": 0.9668,
"step": 716
},
{
"epoch": 6.128205128205128,
"grad_norm": 14.077155113220215,
"learning_rate": 6.9358974358974366e-06,
"loss": 0.8448,
"step": 717
},
{
"epoch": 6.136752136752137,
"grad_norm": 13.440519332885742,
"learning_rate": 6.931623931623932e-06,
"loss": 0.891,
"step": 718
},
{
"epoch": 6.145299145299146,
"grad_norm": 13.059304237365723,
"learning_rate": 6.9273504273504275e-06,
"loss": 0.655,
"step": 719
},
{
"epoch": 6.153846153846154,
"grad_norm": 12.96674633026123,
"learning_rate": 6.923076923076923e-06,
"loss": 0.7755,
"step": 720
},
{
"epoch": 6.162393162393163,
"grad_norm": 10.921567916870117,
"learning_rate": 6.918803418803419e-06,
"loss": 0.8533,
"step": 721
},
{
"epoch": 6.170940170940171,
"grad_norm": 10.439260482788086,
"learning_rate": 6.914529914529915e-06,
"loss": 0.8294,
"step": 722
},
{
"epoch": 6.17948717948718,
"grad_norm": 14.948200225830078,
"learning_rate": 6.910256410256411e-06,
"loss": 0.7326,
"step": 723
},
{
"epoch": 6.188034188034188,
"grad_norm": 12.733176231384277,
"learning_rate": 6.905982905982906e-06,
"loss": 1.0244,
"step": 724
},
{
"epoch": 6.196581196581197,
"grad_norm": 12.432938575744629,
"learning_rate": 6.901709401709402e-06,
"loss": 0.7375,
"step": 725
},
{
"epoch": 6.205128205128205,
"grad_norm": 12.047768592834473,
"learning_rate": 6.897435897435898e-06,
"loss": 0.8348,
"step": 726
},
{
"epoch": 6.213675213675214,
"grad_norm": 19.029287338256836,
"learning_rate": 6.893162393162394e-06,
"loss": 0.6091,
"step": 727
},
{
"epoch": 6.222222222222222,
"grad_norm": 11.650983810424805,
"learning_rate": 6.88888888888889e-06,
"loss": 0.9925,
"step": 728
},
{
"epoch": 6.230769230769231,
"grad_norm": 12.12030029296875,
"learning_rate": 6.8846153846153855e-06,
"loss": 1.0205,
"step": 729
},
{
"epoch": 6.239316239316239,
"grad_norm": 10.283143997192383,
"learning_rate": 6.880341880341881e-06,
"loss": 0.7726,
"step": 730
},
{
"epoch": 6.247863247863248,
"grad_norm": 12.965302467346191,
"learning_rate": 6.8760683760683765e-06,
"loss": 1.1761,
"step": 731
},
{
"epoch": 6.256410256410256,
"grad_norm": 9.0562105178833,
"learning_rate": 6.871794871794872e-06,
"loss": 0.9769,
"step": 732
},
{
"epoch": 6.264957264957265,
"grad_norm": 13.647340774536133,
"learning_rate": 6.867521367521368e-06,
"loss": 0.7613,
"step": 733
},
{
"epoch": 6.273504273504273,
"grad_norm": 11.598361015319824,
"learning_rate": 6.863247863247864e-06,
"loss": 0.6236,
"step": 734
},
{
"epoch": 6.282051282051282,
"grad_norm": 10.453935623168945,
"learning_rate": 6.858974358974359e-06,
"loss": 0.9752,
"step": 735
},
{
"epoch": 6.2905982905982905,
"grad_norm": 14.108942985534668,
"learning_rate": 6.854700854700855e-06,
"loss": 0.9212,
"step": 736
},
{
"epoch": 6.299145299145299,
"grad_norm": 21.230859756469727,
"learning_rate": 6.850427350427351e-06,
"loss": 0.9213,
"step": 737
},
{
"epoch": 6.3076923076923075,
"grad_norm": 11.801465034484863,
"learning_rate": 6.846153846153847e-06,
"loss": 0.8182,
"step": 738
},
{
"epoch": 6.316239316239316,
"grad_norm": 18.9310302734375,
"learning_rate": 6.841880341880343e-06,
"loss": 0.6214,
"step": 739
},
{
"epoch": 6.3247863247863245,
"grad_norm": 11.773117065429688,
"learning_rate": 6.837606837606839e-06,
"loss": 0.6221,
"step": 740
},
{
"epoch": 6.333333333333333,
"grad_norm": 187.00250244140625,
"learning_rate": 6.833333333333334e-06,
"loss": 1.5211,
"step": 741
},
{
"epoch": 6.3418803418803416,
"grad_norm": 70.96250915527344,
"learning_rate": 6.82905982905983e-06,
"loss": 1.3472,
"step": 742
},
{
"epoch": 6.35042735042735,
"grad_norm": 11.787941932678223,
"learning_rate": 6.8247863247863255e-06,
"loss": 0.8831,
"step": 743
},
{
"epoch": 6.358974358974359,
"grad_norm": 11.33661937713623,
"learning_rate": 6.820512820512821e-06,
"loss": 1.0555,
"step": 744
},
{
"epoch": 6.367521367521368,
"grad_norm": 14.255888938903809,
"learning_rate": 6.816239316239317e-06,
"loss": 0.8246,
"step": 745
},
{
"epoch": 6.3760683760683765,
"grad_norm": 10.89616870880127,
"learning_rate": 6.811965811965813e-06,
"loss": 1.0179,
"step": 746
},
{
"epoch": 6.384615384615385,
"grad_norm": 9.160380363464355,
"learning_rate": 6.807692307692308e-06,
"loss": 0.9019,
"step": 747
},
{
"epoch": 6.3931623931623935,
"grad_norm": 12.984644889831543,
"learning_rate": 6.803418803418804e-06,
"loss": 0.649,
"step": 748
},
{
"epoch": 6.401709401709402,
"grad_norm": 14.073376655578613,
"learning_rate": 6.7991452991453e-06,
"loss": 0.608,
"step": 749
},
{
"epoch": 6.410256410256411,
"grad_norm": 10.354485511779785,
"learning_rate": 6.794871794871796e-06,
"loss": 0.8812,
"step": 750
},
{
"epoch": 6.418803418803419,
"grad_norm": 9.121294975280762,
"learning_rate": 6.790598290598292e-06,
"loss": 0.768,
"step": 751
},
{
"epoch": 6.427350427350428,
"grad_norm": 10.909361839294434,
"learning_rate": 6.786324786324787e-06,
"loss": 0.8697,
"step": 752
},
{
"epoch": 6.435897435897436,
"grad_norm": 26.324186325073242,
"learning_rate": 6.782051282051283e-06,
"loss": 1.2437,
"step": 753
},
{
"epoch": 6.444444444444445,
"grad_norm": 11.972411155700684,
"learning_rate": 6.777777777777779e-06,
"loss": 0.6366,
"step": 754
},
{
"epoch": 6.452991452991453,
"grad_norm": 25.042150497436523,
"learning_rate": 6.7735042735042745e-06,
"loss": 1.0371,
"step": 755
},
{
"epoch": 6.461538461538462,
"grad_norm": 10.331900596618652,
"learning_rate": 6.76923076923077e-06,
"loss": 0.5618,
"step": 756
},
{
"epoch": 6.47008547008547,
"grad_norm": 11.925344467163086,
"learning_rate": 6.764957264957266e-06,
"loss": 0.629,
"step": 757
},
{
"epoch": 6.478632478632479,
"grad_norm": 10.309441566467285,
"learning_rate": 6.760683760683761e-06,
"loss": 0.7158,
"step": 758
},
{
"epoch": 6.487179487179487,
"grad_norm": 11.374105453491211,
"learning_rate": 6.756410256410257e-06,
"loss": 0.6909,
"step": 759
},
{
"epoch": 6.495726495726496,
"grad_norm": 11.613142967224121,
"learning_rate": 6.752136752136753e-06,
"loss": 0.6139,
"step": 760
},
{
"epoch": 6.504273504273504,
"grad_norm": 14.499147415161133,
"learning_rate": 6.747863247863249e-06,
"loss": 0.7242,
"step": 761
},
{
"epoch": 6.512820512820513,
"grad_norm": 13.683001518249512,
"learning_rate": 6.743589743589745e-06,
"loss": 0.9246,
"step": 762
},
{
"epoch": 6.521367521367521,
"grad_norm": 11.068865776062012,
"learning_rate": 6.739316239316241e-06,
"loss": 0.8866,
"step": 763
},
{
"epoch": 6.52991452991453,
"grad_norm": 13.0232572555542,
"learning_rate": 6.735042735042736e-06,
"loss": 0.86,
"step": 764
},
{
"epoch": 6.538461538461538,
"grad_norm": 10.639331817626953,
"learning_rate": 6.730769230769232e-06,
"loss": 0.6928,
"step": 765
},
{
"epoch": 6.547008547008547,
"grad_norm": 11.792994499206543,
"learning_rate": 6.7264957264957276e-06,
"loss": 0.6571,
"step": 766
},
{
"epoch": 6.555555555555555,
"grad_norm": 15.907414436340332,
"learning_rate": 6.7222222222222235e-06,
"loss": 1.1426,
"step": 767
},
{
"epoch": 6.564102564102564,
"grad_norm": 12.207514762878418,
"learning_rate": 6.717948717948718e-06,
"loss": 1.0932,
"step": 768
},
{
"epoch": 6.572649572649572,
"grad_norm": 20.145288467407227,
"learning_rate": 6.7136752136752135e-06,
"loss": 0.9706,
"step": 769
},
{
"epoch": 6.581196581196581,
"grad_norm": 9.820805549621582,
"learning_rate": 6.7094017094017094e-06,
"loss": 0.4955,
"step": 770
},
{
"epoch": 6.589743589743589,
"grad_norm": 10.385655403137207,
"learning_rate": 6.705128205128205e-06,
"loss": 1.0172,
"step": 771
},
{
"epoch": 6.598290598290598,
"grad_norm": 11.708373069763184,
"learning_rate": 6.700854700854701e-06,
"loss": 0.8048,
"step": 772
},
{
"epoch": 6.6068376068376065,
"grad_norm": 9.812984466552734,
"learning_rate": 6.696581196581196e-06,
"loss": 0.4831,
"step": 773
},
{
"epoch": 6.615384615384615,
"grad_norm": 9.146960258483887,
"learning_rate": 6.692307692307692e-06,
"loss": 0.6178,
"step": 774
},
{
"epoch": 6.6239316239316235,
"grad_norm": 13.61231517791748,
"learning_rate": 6.688034188034188e-06,
"loss": 0.7812,
"step": 775
},
{
"epoch": 6.632478632478632,
"grad_norm": 10.349262237548828,
"learning_rate": 6.683760683760684e-06,
"loss": 0.819,
"step": 776
},
{
"epoch": 6.641025641025641,
"grad_norm": 48.387847900390625,
"learning_rate": 6.67948717948718e-06,
"loss": 1.5294,
"step": 777
},
{
"epoch": 6.64957264957265,
"grad_norm": 9.540630340576172,
"learning_rate": 6.675213675213676e-06,
"loss": 0.6564,
"step": 778
},
{
"epoch": 6.6581196581196584,
"grad_norm": 10.83983039855957,
"learning_rate": 6.670940170940171e-06,
"loss": 0.5109,
"step": 779
},
{
"epoch": 6.666666666666667,
"grad_norm": 15.380743026733398,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6504,
"step": 780
},
{
"epoch": 6.6752136752136755,
"grad_norm": 16.796918869018555,
"learning_rate": 6.6623931623931625e-06,
"loss": 0.7944,
"step": 781
},
{
"epoch": 6.683760683760684,
"grad_norm": 39.64078140258789,
"learning_rate": 6.6581196581196584e-06,
"loss": 0.6929,
"step": 782
},
{
"epoch": 6.6923076923076925,
"grad_norm": 7.730568885803223,
"learning_rate": 6.653846153846154e-06,
"loss": 0.6284,
"step": 783
},
{
"epoch": 6.700854700854701,
"grad_norm": 7.840725898742676,
"learning_rate": 6.64957264957265e-06,
"loss": 0.5113,
"step": 784
},
{
"epoch": 6.7094017094017095,
"grad_norm": 13.925577163696289,
"learning_rate": 6.645299145299145e-06,
"loss": 0.6846,
"step": 785
},
{
"epoch": 6.717948717948718,
"grad_norm": 10.926531791687012,
"learning_rate": 6.641025641025641e-06,
"loss": 1.3245,
"step": 786
},
{
"epoch": 6.726495726495727,
"grad_norm": 10.698541641235352,
"learning_rate": 6.636752136752137e-06,
"loss": 0.6025,
"step": 787
},
{
"epoch": 6.735042735042735,
"grad_norm": 7.572136878967285,
"learning_rate": 6.632478632478633e-06,
"loss": 0.5473,
"step": 788
},
{
"epoch": 6.743589743589744,
"grad_norm": 26.242990493774414,
"learning_rate": 6.628205128205129e-06,
"loss": 0.5637,
"step": 789
},
{
"epoch": 6.752136752136752,
"grad_norm": 8.79776668548584,
"learning_rate": 6.623931623931624e-06,
"loss": 0.7595,
"step": 790
},
{
"epoch": 6.760683760683761,
"grad_norm": 8.951017379760742,
"learning_rate": 6.61965811965812e-06,
"loss": 1.0365,
"step": 791
},
{
"epoch": 6.769230769230769,
"grad_norm": 13.799118041992188,
"learning_rate": 6.615384615384616e-06,
"loss": 1.4206,
"step": 792
},
{
"epoch": 6.777777777777778,
"grad_norm": 674.3671875,
"learning_rate": 6.6111111111111115e-06,
"loss": 1.1752,
"step": 793
},
{
"epoch": 6.786324786324786,
"grad_norm": 8.110879898071289,
"learning_rate": 6.606837606837607e-06,
"loss": 0.4668,
"step": 794
},
{
"epoch": 6.794871794871795,
"grad_norm": 8.119854927062988,
"learning_rate": 6.602564102564103e-06,
"loss": 0.7689,
"step": 795
},
{
"epoch": 6.803418803418803,
"grad_norm": 11.039762496948242,
"learning_rate": 6.598290598290598e-06,
"loss": 0.5636,
"step": 796
},
{
"epoch": 6.811965811965812,
"grad_norm": 12.724084854125977,
"learning_rate": 6.594017094017094e-06,
"loss": 0.5072,
"step": 797
},
{
"epoch": 6.82051282051282,
"grad_norm": 12.196049690246582,
"learning_rate": 6.58974358974359e-06,
"loss": 0.5073,
"step": 798
},
{
"epoch": 6.829059829059829,
"grad_norm": 9.072951316833496,
"learning_rate": 6.585470085470086e-06,
"loss": 0.4855,
"step": 799
},
{
"epoch": 6.837606837606837,
"grad_norm": 10.53836441040039,
"learning_rate": 6.581196581196582e-06,
"loss": 1.0017,
"step": 800
},
{
"epoch": 6.846153846153846,
"grad_norm": 7.728690147399902,
"learning_rate": 6.576923076923078e-06,
"loss": 0.5784,
"step": 801
},
{
"epoch": 6.854700854700854,
"grad_norm": 28.362455368041992,
"learning_rate": 6.572649572649573e-06,
"loss": 1.0295,
"step": 802
},
{
"epoch": 6.863247863247864,
"grad_norm": 7.291123390197754,
"learning_rate": 6.568376068376069e-06,
"loss": 0.7836,
"step": 803
},
{
"epoch": 6.871794871794872,
"grad_norm": 9.566614151000977,
"learning_rate": 6.564102564102565e-06,
"loss": 0.9979,
"step": 804
},
{
"epoch": 6.880341880341881,
"grad_norm": 13.544408798217773,
"learning_rate": 6.5598290598290605e-06,
"loss": 0.5354,
"step": 805
},
{
"epoch": 6.888888888888889,
"grad_norm": 8.546881675720215,
"learning_rate": 6.555555555555556e-06,
"loss": 0.4689,
"step": 806
},
{
"epoch": 6.897435897435898,
"grad_norm": 8.94822883605957,
"learning_rate": 6.5512820512820515e-06,
"loss": 0.4432,
"step": 807
},
{
"epoch": 6.905982905982906,
"grad_norm": 6.5176544189453125,
"learning_rate": 6.547008547008547e-06,
"loss": 0.6747,
"step": 808
},
{
"epoch": 6.914529914529915,
"grad_norm": 9.48947811126709,
"learning_rate": 6.542735042735043e-06,
"loss": 0.4268,
"step": 809
},
{
"epoch": 6.923076923076923,
"grad_norm": 11.432586669921875,
"learning_rate": 6.538461538461539e-06,
"loss": 0.5486,
"step": 810
},
{
"epoch": 6.931623931623932,
"grad_norm": 7.585604667663574,
"learning_rate": 6.534188034188035e-06,
"loss": 0.4412,
"step": 811
},
{
"epoch": 6.94017094017094,
"grad_norm": 7.860292911529541,
"learning_rate": 6.529914529914531e-06,
"loss": 0.6428,
"step": 812
},
{
"epoch": 6.948717948717949,
"grad_norm": 27.83890151977539,
"learning_rate": 6.525641025641026e-06,
"loss": 0.6735,
"step": 813
},
{
"epoch": 6.957264957264957,
"grad_norm": 10.266451835632324,
"learning_rate": 6.521367521367522e-06,
"loss": 0.6757,
"step": 814
},
{
"epoch": 6.965811965811966,
"grad_norm": 8.839099884033203,
"learning_rate": 6.517094017094018e-06,
"loss": 0.7897,
"step": 815
},
{
"epoch": 6.9743589743589745,
"grad_norm": 10.037760734558105,
"learning_rate": 6.512820512820514e-06,
"loss": 0.7133,
"step": 816
},
{
"epoch": 6.982905982905983,
"grad_norm": 14.50278377532959,
"learning_rate": 6.5085470085470095e-06,
"loss": 1.0051,
"step": 817
},
{
"epoch": 6.9914529914529915,
"grad_norm": 8.775527000427246,
"learning_rate": 6.504273504273505e-06,
"loss": 0.8769,
"step": 818
},
{
"epoch": 7.0,
"grad_norm": 8.891378402709961,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.9586,
"step": 819
},
{
"epoch": 7.0,
"eval_loss": 0.23673956096172333,
"eval_runtime": 9.3447,
"eval_samples_per_second": 49.868,
"eval_steps_per_second": 6.314,
"step": 819
},
{
"epoch": 7.0085470085470085,
"grad_norm": 8.925857543945312,
"learning_rate": 6.495726495726496e-06,
"loss": 1.0406,
"step": 820
},
{
"epoch": 7.017094017094017,
"grad_norm": 8.222796440124512,
"learning_rate": 6.491452991452992e-06,
"loss": 0.4911,
"step": 821
},
{
"epoch": 7.0256410256410255,
"grad_norm": 11.528886795043945,
"learning_rate": 6.487179487179488e-06,
"loss": 0.8292,
"step": 822
},
{
"epoch": 7.034188034188034,
"grad_norm": 7.9031524658203125,
"learning_rate": 6.482905982905984e-06,
"loss": 0.5319,
"step": 823
},
{
"epoch": 7.042735042735043,
"grad_norm": 6.788857936859131,
"learning_rate": 6.478632478632479e-06,
"loss": 0.431,
"step": 824
},
{
"epoch": 7.051282051282051,
"grad_norm": 8.84765911102295,
"learning_rate": 6.474358974358975e-06,
"loss": 0.6417,
"step": 825
},
{
"epoch": 7.05982905982906,
"grad_norm": 7.517561435699463,
"learning_rate": 6.470085470085471e-06,
"loss": 0.5828,
"step": 826
},
{
"epoch": 7.068376068376068,
"grad_norm": 9.86832332611084,
"learning_rate": 6.465811965811967e-06,
"loss": 0.5851,
"step": 827
},
{
"epoch": 7.076923076923077,
"grad_norm": 9.632494926452637,
"learning_rate": 6.461538461538463e-06,
"loss": 0.769,
"step": 828
},
{
"epoch": 7.085470085470085,
"grad_norm": 9.874857902526855,
"learning_rate": 6.4572649572649585e-06,
"loss": 0.4393,
"step": 829
},
{
"epoch": 7.094017094017094,
"grad_norm": 11.78085994720459,
"learning_rate": 6.4529914529914535e-06,
"loss": 0.8784,
"step": 830
},
{
"epoch": 7.102564102564102,
"grad_norm": 8.85053825378418,
"learning_rate": 6.4487179487179494e-06,
"loss": 0.5911,
"step": 831
},
{
"epoch": 7.111111111111111,
"grad_norm": 12.405013084411621,
"learning_rate": 6.444444444444445e-06,
"loss": 0.4941,
"step": 832
},
{
"epoch": 7.119658119658119,
"grad_norm": 12.237760543823242,
"learning_rate": 6.440170940170941e-06,
"loss": 0.4468,
"step": 833
},
{
"epoch": 7.128205128205128,
"grad_norm": 7.945899486541748,
"learning_rate": 6.435897435897437e-06,
"loss": 0.4101,
"step": 834
},
{
"epoch": 7.136752136752137,
"grad_norm": 10.743217468261719,
"learning_rate": 6.431623931623933e-06,
"loss": 0.679,
"step": 835
},
{
"epoch": 7.145299145299146,
"grad_norm": 7.700406551361084,
"learning_rate": 6.427350427350428e-06,
"loss": 0.5067,
"step": 836
},
{
"epoch": 7.153846153846154,
"grad_norm": 8.401918411254883,
"learning_rate": 6.423076923076924e-06,
"loss": 0.5893,
"step": 837
},
{
"epoch": 7.162393162393163,
"grad_norm": 23.065881729125977,
"learning_rate": 6.41880341880342e-06,
"loss": 0.6768,
"step": 838
},
{
"epoch": 7.170940170940171,
"grad_norm": 38.71855545043945,
"learning_rate": 6.414529914529916e-06,
"loss": 0.8828,
"step": 839
},
{
"epoch": 7.17948717948718,
"grad_norm": 12.142110824584961,
"learning_rate": 6.410256410256412e-06,
"loss": 0.5444,
"step": 840
},
{
"epoch": 7.188034188034188,
"grad_norm": 69.4731674194336,
"learning_rate": 6.405982905982906e-06,
"loss": 0.7768,
"step": 841
},
{
"epoch": 7.196581196581197,
"grad_norm": 15.926841735839844,
"learning_rate": 6.401709401709402e-06,
"loss": 0.4348,
"step": 842
},
{
"epoch": 7.205128205128205,
"grad_norm": 6.8418965339660645,
"learning_rate": 6.397435897435898e-06,
"loss": 0.3821,
"step": 843
},
{
"epoch": 7.213675213675214,
"grad_norm": 6.716574192047119,
"learning_rate": 6.3931623931623935e-06,
"loss": 0.3621,
"step": 844
},
{
"epoch": 7.222222222222222,
"grad_norm": 7.452919006347656,
"learning_rate": 6.3888888888888885e-06,
"loss": 0.4997,
"step": 845
},
{
"epoch": 7.230769230769231,
"grad_norm": 11.502019882202148,
"learning_rate": 6.384615384615384e-06,
"loss": 0.8017,
"step": 846
},
{
"epoch": 7.239316239316239,
"grad_norm": 7.349746227264404,
"learning_rate": 6.38034188034188e-06,
"loss": 0.2745,
"step": 847
},
{
"epoch": 7.247863247863248,
"grad_norm": 6.269787311553955,
"learning_rate": 6.376068376068376e-06,
"loss": 0.4131,
"step": 848
},
{
"epoch": 7.256410256410256,
"grad_norm": 9.56203842163086,
"learning_rate": 6.371794871794872e-06,
"loss": 0.8147,
"step": 849
},
{
"epoch": 7.264957264957265,
"grad_norm": 7.358108043670654,
"learning_rate": 6.367521367521368e-06,
"loss": 0.3552,
"step": 850
},
{
"epoch": 7.273504273504273,
"grad_norm": 7.6359782218933105,
"learning_rate": 6.363247863247863e-06,
"loss": 0.3302,
"step": 851
},
{
"epoch": 7.282051282051282,
"grad_norm": 7.356925010681152,
"learning_rate": 6.358974358974359e-06,
"loss": 0.2927,
"step": 852
},
{
"epoch": 7.2905982905982905,
"grad_norm": 11.097757339477539,
"learning_rate": 6.354700854700855e-06,
"loss": 0.8117,
"step": 853
},
{
"epoch": 7.299145299145299,
"grad_norm": 10.301170349121094,
"learning_rate": 6.350427350427351e-06,
"loss": 0.4044,
"step": 854
},
{
"epoch": 7.3076923076923075,
"grad_norm": 7.116042613983154,
"learning_rate": 6.3461538461538466e-06,
"loss": 0.289,
"step": 855
},
{
"epoch": 7.316239316239316,
"grad_norm": 7.453964710235596,
"learning_rate": 6.3418803418803425e-06,
"loss": 0.4652,
"step": 856
},
{
"epoch": 7.3247863247863245,
"grad_norm": 11.864774703979492,
"learning_rate": 6.3376068376068375e-06,
"loss": 0.4667,
"step": 857
},
{
"epoch": 7.333333333333333,
"grad_norm": 8.79547119140625,
"learning_rate": 6.333333333333333e-06,
"loss": 0.2874,
"step": 858
},
{
"epoch": 7.3418803418803416,
"grad_norm": 10.173043251037598,
"learning_rate": 6.329059829059829e-06,
"loss": 0.6844,
"step": 859
},
{
"epoch": 7.35042735042735,
"grad_norm": 9.26555061340332,
"learning_rate": 6.324786324786325e-06,
"loss": 0.2903,
"step": 860
},
{
"epoch": 7.358974358974359,
"grad_norm": 10.274518013000488,
"learning_rate": 6.320512820512821e-06,
"loss": 0.7824,
"step": 861
},
{
"epoch": 7.367521367521368,
"grad_norm": 7.104451656341553,
"learning_rate": 6.316239316239316e-06,
"loss": 0.3024,
"step": 862
},
{
"epoch": 7.3760683760683765,
"grad_norm": 9.522738456726074,
"learning_rate": 6.311965811965812e-06,
"loss": 0.3219,
"step": 863
},
{
"epoch": 7.384615384615385,
"grad_norm": 10.145588874816895,
"learning_rate": 6.307692307692308e-06,
"loss": 0.5319,
"step": 864
},
{
"epoch": 7.3931623931623935,
"grad_norm": 8.828988075256348,
"learning_rate": 6.303418803418804e-06,
"loss": 0.3286,
"step": 865
},
{
"epoch": 7.401709401709402,
"grad_norm": 7.314462661743164,
"learning_rate": 6.2991452991453e-06,
"loss": 0.2951,
"step": 866
},
{
"epoch": 7.410256410256411,
"grad_norm": 13.465666770935059,
"learning_rate": 6.2948717948717956e-06,
"loss": 0.4046,
"step": 867
},
{
"epoch": 7.418803418803419,
"grad_norm": 12.40607738494873,
"learning_rate": 6.290598290598291e-06,
"loss": 0.71,
"step": 868
},
{
"epoch": 7.427350427350428,
"grad_norm": 9.282904624938965,
"learning_rate": 6.2863247863247865e-06,
"loss": 0.4083,
"step": 869
},
{
"epoch": 7.435897435897436,
"grad_norm": 5.755247116088867,
"learning_rate": 6.282051282051282e-06,
"loss": 0.3858,
"step": 870
},
{
"epoch": 7.444444444444445,
"grad_norm": 6.996497631072998,
"learning_rate": 6.277777777777778e-06,
"loss": 0.2692,
"step": 871
},
{
"epoch": 7.452991452991453,
"grad_norm": 7.235395431518555,
"learning_rate": 6.273504273504274e-06,
"loss": 0.3936,
"step": 872
},
{
"epoch": 7.461538461538462,
"grad_norm": 14.275704383850098,
"learning_rate": 6.26923076923077e-06,
"loss": 0.4022,
"step": 873
},
{
"epoch": 7.47008547008547,
"grad_norm": 10.365689277648926,
"learning_rate": 6.264957264957265e-06,
"loss": 1.0508,
"step": 874
},
{
"epoch": 7.478632478632479,
"grad_norm": 5.840590000152588,
"learning_rate": 6.260683760683761e-06,
"loss": 0.2511,
"step": 875
},
{
"epoch": 7.487179487179487,
"grad_norm": 10.25346851348877,
"learning_rate": 6.256410256410257e-06,
"loss": 0.5836,
"step": 876
},
{
"epoch": 7.495726495726496,
"grad_norm": 27.662694931030273,
"learning_rate": 6.252136752136753e-06,
"loss": 0.7677,
"step": 877
},
{
"epoch": 7.504273504273504,
"grad_norm": 5.840217590332031,
"learning_rate": 6.247863247863249e-06,
"loss": 0.3889,
"step": 878
},
{
"epoch": 7.512820512820513,
"grad_norm": 9.813179016113281,
"learning_rate": 6.243589743589744e-06,
"loss": 0.8929,
"step": 879
},
{
"epoch": 7.521367521367521,
"grad_norm": 5.49755334854126,
"learning_rate": 6.23931623931624e-06,
"loss": 0.2712,
"step": 880
},
{
"epoch": 7.52991452991453,
"grad_norm": 7.17311429977417,
"learning_rate": 6.2350427350427355e-06,
"loss": 0.3071,
"step": 881
},
{
"epoch": 7.538461538461538,
"grad_norm": 7.706870079040527,
"learning_rate": 6.230769230769231e-06,
"loss": 0.3797,
"step": 882
},
{
"epoch": 7.547008547008547,
"grad_norm": 7.891415596008301,
"learning_rate": 6.226495726495727e-06,
"loss": 0.5352,
"step": 883
},
{
"epoch": 7.555555555555555,
"grad_norm": 8.746044158935547,
"learning_rate": 6.222222222222223e-06,
"loss": 0.263,
"step": 884
},
{
"epoch": 7.564102564102564,
"grad_norm": 9.096441268920898,
"learning_rate": 6.217948717948718e-06,
"loss": 0.2736,
"step": 885
},
{
"epoch": 7.572649572649572,
"grad_norm": 7.031003475189209,
"learning_rate": 6.213675213675214e-06,
"loss": 0.4705,
"step": 886
},
{
"epoch": 7.581196581196581,
"grad_norm": 6.6503143310546875,
"learning_rate": 6.20940170940171e-06,
"loss": 0.3285,
"step": 887
},
{
"epoch": 7.589743589743589,
"grad_norm": 5.398913383483887,
"learning_rate": 6.205128205128206e-06,
"loss": 0.41,
"step": 888
},
{
"epoch": 7.598290598290598,
"grad_norm": 7.47569465637207,
"learning_rate": 6.200854700854702e-06,
"loss": 0.4005,
"step": 889
},
{
"epoch": 7.6068376068376065,
"grad_norm": 8.79906940460205,
"learning_rate": 6.196581196581198e-06,
"loss": 0.2608,
"step": 890
},
{
"epoch": 7.615384615384615,
"grad_norm": 7.604002475738525,
"learning_rate": 6.192307692307693e-06,
"loss": 0.577,
"step": 891
},
{
"epoch": 7.6239316239316235,
"grad_norm": 12.666848182678223,
"learning_rate": 6.188034188034189e-06,
"loss": 0.7296,
"step": 892
},
{
"epoch": 7.632478632478632,
"grad_norm": 20.92390251159668,
"learning_rate": 6.1837606837606845e-06,
"loss": 0.9276,
"step": 893
},
{
"epoch": 7.641025641025641,
"grad_norm": 6.779317855834961,
"learning_rate": 6.17948717948718e-06,
"loss": 0.818,
"step": 894
},
{
"epoch": 7.64957264957265,
"grad_norm": 5.249539852142334,
"learning_rate": 6.175213675213676e-06,
"loss": 0.2117,
"step": 895
},
{
"epoch": 7.6581196581196584,
"grad_norm": 23.55508041381836,
"learning_rate": 6.170940170940171e-06,
"loss": 0.5239,
"step": 896
},
{
"epoch": 7.666666666666667,
"grad_norm": 11.711256980895996,
"learning_rate": 6.166666666666667e-06,
"loss": 0.6595,
"step": 897
},
{
"epoch": 7.6752136752136755,
"grad_norm": 6.641115188598633,
"learning_rate": 6.162393162393163e-06,
"loss": 0.4888,
"step": 898
},
{
"epoch": 7.683760683760684,
"grad_norm": 7.913390159606934,
"learning_rate": 6.158119658119659e-06,
"loss": 0.66,
"step": 899
},
{
"epoch": 7.6923076923076925,
"grad_norm": 17.927574157714844,
"learning_rate": 6.153846153846155e-06,
"loss": 0.9603,
"step": 900
},
{
"epoch": 7.700854700854701,
"grad_norm": 4.567203998565674,
"learning_rate": 6.149572649572651e-06,
"loss": 0.1638,
"step": 901
},
{
"epoch": 7.7094017094017095,
"grad_norm": 5.995935440063477,
"learning_rate": 6.145299145299146e-06,
"loss": 0.6852,
"step": 902
},
{
"epoch": 7.717948717948718,
"grad_norm": 8.323802947998047,
"learning_rate": 6.141025641025642e-06,
"loss": 0.5293,
"step": 903
},
{
"epoch": 7.726495726495727,
"grad_norm": 6.8586859703063965,
"learning_rate": 6.136752136752138e-06,
"loss": 0.3265,
"step": 904
},
{
"epoch": 7.735042735042735,
"grad_norm": 6.507427215576172,
"learning_rate": 6.1324786324786335e-06,
"loss": 0.2841,
"step": 905
},
{
"epoch": 7.743589743589744,
"grad_norm": 6.789999485015869,
"learning_rate": 6.128205128205129e-06,
"loss": 0.4236,
"step": 906
},
{
"epoch": 7.752136752136752,
"grad_norm": 19.444454193115234,
"learning_rate": 6.123931623931625e-06,
"loss": 0.2829,
"step": 907
},
{
"epoch": 7.760683760683761,
"grad_norm": 31.564800262451172,
"learning_rate": 6.11965811965812e-06,
"loss": 1.093,
"step": 908
},
{
"epoch": 7.769230769230769,
"grad_norm": 9.956007957458496,
"learning_rate": 6.115384615384616e-06,
"loss": 0.6749,
"step": 909
},
{
"epoch": 7.777777777777778,
"grad_norm": 5.193087577819824,
"learning_rate": 6.111111111111112e-06,
"loss": 0.1986,
"step": 910
},
{
"epoch": 7.786324786324786,
"grad_norm": 4.792945384979248,
"learning_rate": 6.106837606837608e-06,
"loss": 0.5179,
"step": 911
},
{
"epoch": 7.794871794871795,
"grad_norm": 20.602317810058594,
"learning_rate": 6.102564102564104e-06,
"loss": 1.0343,
"step": 912
},
{
"epoch": 7.803418803418803,
"grad_norm": 22.205543518066406,
"learning_rate": 6.098290598290599e-06,
"loss": 0.4921,
"step": 913
},
{
"epoch": 7.811965811965812,
"grad_norm": 13.392712593078613,
"learning_rate": 6.094017094017095e-06,
"loss": 0.9058,
"step": 914
},
{
"epoch": 7.82051282051282,
"grad_norm": 6.262679100036621,
"learning_rate": 6.08974358974359e-06,
"loss": 0.3877,
"step": 915
},
{
"epoch": 7.829059829059829,
"grad_norm": 12.727428436279297,
"learning_rate": 6.085470085470086e-06,
"loss": 0.4477,
"step": 916
},
{
"epoch": 7.837606837606837,
"grad_norm": 6.595224380493164,
"learning_rate": 6.081196581196581e-06,
"loss": 0.5553,
"step": 917
},
{
"epoch": 7.846153846153846,
"grad_norm": 6.815043926239014,
"learning_rate": 6.076923076923077e-06,
"loss": 0.2978,
"step": 918
},
{
"epoch": 7.854700854700854,
"grad_norm": 11.751949310302734,
"learning_rate": 6.0726495726495726e-06,
"loss": 0.5509,
"step": 919
},
{
"epoch": 7.863247863247864,
"grad_norm": 6.067570209503174,
"learning_rate": 6.0683760683760684e-06,
"loss": 0.475,
"step": 920
},
{
"epoch": 7.871794871794872,
"grad_norm": 7.4297919273376465,
"learning_rate": 6.064102564102564e-06,
"loss": 0.5073,
"step": 921
},
{
"epoch": 7.880341880341881,
"grad_norm": 6.778268337249756,
"learning_rate": 6.05982905982906e-06,
"loss": 0.4718,
"step": 922
},
{
"epoch": 7.888888888888889,
"grad_norm": 9.401915550231934,
"learning_rate": 6.055555555555555e-06,
"loss": 0.7151,
"step": 923
},
{
"epoch": 7.897435897435898,
"grad_norm": 6.359888553619385,
"learning_rate": 6.051282051282051e-06,
"loss": 0.3175,
"step": 924
},
{
"epoch": 7.905982905982906,
"grad_norm": 7.036016464233398,
"learning_rate": 6.047008547008547e-06,
"loss": 0.3172,
"step": 925
},
{
"epoch": 7.914529914529915,
"grad_norm": 5.980124473571777,
"learning_rate": 6.042735042735043e-06,
"loss": 0.2949,
"step": 926
},
{
"epoch": 7.923076923076923,
"grad_norm": 5.738795280456543,
"learning_rate": 6.038461538461539e-06,
"loss": 0.2454,
"step": 927
},
{
"epoch": 7.931623931623932,
"grad_norm": 4.688748359680176,
"learning_rate": 6.034188034188035e-06,
"loss": 0.1949,
"step": 928
},
{
"epoch": 7.94017094017094,
"grad_norm": 7.2333984375,
"learning_rate": 6.02991452991453e-06,
"loss": 0.2174,
"step": 929
},
{
"epoch": 7.948717948717949,
"grad_norm": 6.005523204803467,
"learning_rate": 6.025641025641026e-06,
"loss": 0.4216,
"step": 930
},
{
"epoch": 7.957264957264957,
"grad_norm": 6.017541885375977,
"learning_rate": 6.0213675213675215e-06,
"loss": 0.4904,
"step": 931
},
{
"epoch": 7.965811965811966,
"grad_norm": 19.559003829956055,
"learning_rate": 6.0170940170940174e-06,
"loss": 0.2616,
"step": 932
},
{
"epoch": 7.9743589743589745,
"grad_norm": 5.360724449157715,
"learning_rate": 6.012820512820513e-06,
"loss": 0.3629,
"step": 933
},
{
"epoch": 7.982905982905983,
"grad_norm": 9.472721099853516,
"learning_rate": 6.008547008547008e-06,
"loss": 0.5044,
"step": 934
},
{
"epoch": 7.9914529914529915,
"grad_norm": 6.453597068786621,
"learning_rate": 6.004273504273504e-06,
"loss": 0.4742,
"step": 935
},
{
"epoch": 8.0,
"grad_norm": 7.647386074066162,
"learning_rate": 6e-06,
"loss": 0.402,
"step": 936
},
{
"epoch": 8.0,
"eval_loss": 0.1672903448343277,
"eval_runtime": 9.3047,
"eval_samples_per_second": 50.082,
"eval_steps_per_second": 6.341,
"step": 936
},
{
"epoch": 8.008547008547009,
"grad_norm": 5.8361663818359375,
"learning_rate": 5.995726495726496e-06,
"loss": 0.164,
"step": 937
},
{
"epoch": 8.017094017094017,
"grad_norm": 5.801360130310059,
"learning_rate": 5.991452991452992e-06,
"loss": 0.2858,
"step": 938
},
{
"epoch": 8.025641025641026,
"grad_norm": 4.43051290512085,
"learning_rate": 5.987179487179488e-06,
"loss": 0.2068,
"step": 939
},
{
"epoch": 8.034188034188034,
"grad_norm": 6.544061660766602,
"learning_rate": 5.982905982905983e-06,
"loss": 0.3499,
"step": 940
},
{
"epoch": 8.042735042735043,
"grad_norm": 5.500844955444336,
"learning_rate": 5.978632478632479e-06,
"loss": 0.3134,
"step": 941
},
{
"epoch": 8.051282051282051,
"grad_norm": 4.286651611328125,
"learning_rate": 5.974358974358975e-06,
"loss": 0.1767,
"step": 942
},
{
"epoch": 8.05982905982906,
"grad_norm": 13.860437393188477,
"learning_rate": 5.9700854700854705e-06,
"loss": 0.3913,
"step": 943
},
{
"epoch": 8.068376068376068,
"grad_norm": 5.998767852783203,
"learning_rate": 5.9658119658119664e-06,
"loss": 0.2275,
"step": 944
},
{
"epoch": 8.076923076923077,
"grad_norm": 9.01196002960205,
"learning_rate": 5.961538461538462e-06,
"loss": 0.5202,
"step": 945
},
{
"epoch": 8.085470085470085,
"grad_norm": 6.81577730178833,
"learning_rate": 5.957264957264957e-06,
"loss": 0.5923,
"step": 946
},
{
"epoch": 8.094017094017094,
"grad_norm": 7.400684833526611,
"learning_rate": 5.952991452991453e-06,
"loss": 0.2883,
"step": 947
},
{
"epoch": 8.102564102564102,
"grad_norm": 16.18587875366211,
"learning_rate": 5.948717948717949e-06,
"loss": 0.3377,
"step": 948
},
{
"epoch": 8.11111111111111,
"grad_norm": 5.017345428466797,
"learning_rate": 5.944444444444445e-06,
"loss": 0.3912,
"step": 949
},
{
"epoch": 8.11965811965812,
"grad_norm": 5.300196647644043,
"learning_rate": 5.940170940170941e-06,
"loss": 0.4056,
"step": 950
},
{
"epoch": 8.128205128205128,
"grad_norm": 6.3473405838012695,
"learning_rate": 5.935897435897436e-06,
"loss": 0.2559,
"step": 951
},
{
"epoch": 8.136752136752136,
"grad_norm": 12.37689208984375,
"learning_rate": 5.931623931623932e-06,
"loss": 0.2216,
"step": 952
},
{
"epoch": 8.145299145299145,
"grad_norm": 5.573046684265137,
"learning_rate": 5.927350427350428e-06,
"loss": 0.2047,
"step": 953
},
{
"epoch": 8.153846153846153,
"grad_norm": 5.033559322357178,
"learning_rate": 5.923076923076924e-06,
"loss": 0.3661,
"step": 954
},
{
"epoch": 8.162393162393162,
"grad_norm": 5.341614246368408,
"learning_rate": 5.9188034188034195e-06,
"loss": 0.2597,
"step": 955
},
{
"epoch": 8.17094017094017,
"grad_norm": 8.67937183380127,
"learning_rate": 5.914529914529915e-06,
"loss": 0.4098,
"step": 956
},
{
"epoch": 8.179487179487179,
"grad_norm": 3.957489252090454,
"learning_rate": 5.9102564102564105e-06,
"loss": 0.18,
"step": 957
},
{
"epoch": 8.188034188034187,
"grad_norm": 6.377108573913574,
"learning_rate": 5.905982905982906e-06,
"loss": 0.3414,
"step": 958
},
{
"epoch": 8.196581196581196,
"grad_norm": 8.621227264404297,
"learning_rate": 5.901709401709402e-06,
"loss": 1.1625,
"step": 959
},
{
"epoch": 8.205128205128204,
"grad_norm": 5.775392532348633,
"learning_rate": 5.897435897435898e-06,
"loss": 0.4283,
"step": 960
},
{
"epoch": 8.213675213675213,
"grad_norm": 4.522337913513184,
"learning_rate": 5.893162393162394e-06,
"loss": 0.3432,
"step": 961
},
{
"epoch": 8.222222222222221,
"grad_norm": 5.594667434692383,
"learning_rate": 5.88888888888889e-06,
"loss": 0.5212,
"step": 962
},
{
"epoch": 8.23076923076923,
"grad_norm": 5.478531837463379,
"learning_rate": 5.884615384615385e-06,
"loss": 0.2273,
"step": 963
},
{
"epoch": 8.239316239316238,
"grad_norm": 6.08770751953125,
"learning_rate": 5.880341880341881e-06,
"loss": 0.2673,
"step": 964
},
{
"epoch": 8.247863247863247,
"grad_norm": 7.962898254394531,
"learning_rate": 5.876068376068377e-06,
"loss": 0.2654,
"step": 965
},
{
"epoch": 8.256410256410255,
"grad_norm": 6.443154335021973,
"learning_rate": 5.871794871794873e-06,
"loss": 0.2982,
"step": 966
},
{
"epoch": 8.264957264957266,
"grad_norm": 4.689123153686523,
"learning_rate": 5.8675213675213685e-06,
"loss": 0.3459,
"step": 967
},
{
"epoch": 8.273504273504274,
"grad_norm": 5.446859359741211,
"learning_rate": 5.863247863247864e-06,
"loss": 0.2792,
"step": 968
},
{
"epoch": 8.282051282051283,
"grad_norm": 5.562478542327881,
"learning_rate": 5.8589743589743595e-06,
"loss": 0.1939,
"step": 969
},
{
"epoch": 8.290598290598291,
"grad_norm": 4.726650714874268,
"learning_rate": 5.854700854700855e-06,
"loss": 0.1368,
"step": 970
},
{
"epoch": 8.2991452991453,
"grad_norm": 17.44293785095215,
"learning_rate": 5.850427350427351e-06,
"loss": 0.3836,
"step": 971
},
{
"epoch": 8.307692307692308,
"grad_norm": 5.568243980407715,
"learning_rate": 5.846153846153847e-06,
"loss": 0.3674,
"step": 972
},
{
"epoch": 8.316239316239317,
"grad_norm": 3.488147258758545,
"learning_rate": 5.841880341880343e-06,
"loss": 0.197,
"step": 973
},
{
"epoch": 8.324786324786325,
"grad_norm": 15.902129173278809,
"learning_rate": 5.837606837606838e-06,
"loss": 0.4199,
"step": 974
},
{
"epoch": 8.333333333333334,
"grad_norm": 8.055335998535156,
"learning_rate": 5.833333333333334e-06,
"loss": 0.277,
"step": 975
},
{
"epoch": 8.341880341880342,
"grad_norm": 8.122756004333496,
"learning_rate": 5.82905982905983e-06,
"loss": 0.5572,
"step": 976
},
{
"epoch": 8.350427350427351,
"grad_norm": 5.7439961433410645,
"learning_rate": 5.824786324786326e-06,
"loss": 0.2031,
"step": 977
},
{
"epoch": 8.35897435897436,
"grad_norm": 4.329511642456055,
"learning_rate": 5.820512820512822e-06,
"loss": 0.4405,
"step": 978
},
{
"epoch": 8.367521367521368,
"grad_norm": 10.946788787841797,
"learning_rate": 5.8162393162393175e-06,
"loss": 0.4619,
"step": 979
},
{
"epoch": 8.376068376068377,
"grad_norm": 6.0579352378845215,
"learning_rate": 5.8119658119658126e-06,
"loss": 0.4679,
"step": 980
},
{
"epoch": 8.384615384615385,
"grad_norm": 5.656944751739502,
"learning_rate": 5.8076923076923084e-06,
"loss": 0.2395,
"step": 981
},
{
"epoch": 8.393162393162394,
"grad_norm": 5.344303607940674,
"learning_rate": 5.803418803418804e-06,
"loss": 0.2516,
"step": 982
},
{
"epoch": 8.401709401709402,
"grad_norm": 7.070309638977051,
"learning_rate": 5.7991452991453e-06,
"loss": 0.3169,
"step": 983
},
{
"epoch": 8.41025641025641,
"grad_norm": 5.168705940246582,
"learning_rate": 5.794871794871796e-06,
"loss": 0.3007,
"step": 984
},
{
"epoch": 8.418803418803419,
"grad_norm": 3.556293249130249,
"learning_rate": 5.790598290598292e-06,
"loss": 0.2089,
"step": 985
},
{
"epoch": 8.427350427350428,
"grad_norm": 4.943065166473389,
"learning_rate": 5.786324786324787e-06,
"loss": 0.2093,
"step": 986
},
{
"epoch": 8.435897435897436,
"grad_norm": 6.991105556488037,
"learning_rate": 5.782051282051283e-06,
"loss": 0.4671,
"step": 987
},
{
"epoch": 8.444444444444445,
"grad_norm": 5.276190280914307,
"learning_rate": 5.777777777777778e-06,
"loss": 0.2092,
"step": 988
},
{
"epoch": 8.452991452991453,
"grad_norm": 77.91864776611328,
"learning_rate": 5.773504273504273e-06,
"loss": 1.7536,
"step": 989
},
{
"epoch": 8.461538461538462,
"grad_norm": 4.864828109741211,
"learning_rate": 5.769230769230769e-06,
"loss": 0.1669,
"step": 990
},
{
"epoch": 8.47008547008547,
"grad_norm": 4.416967391967773,
"learning_rate": 5.764957264957265e-06,
"loss": 0.2705,
"step": 991
},
{
"epoch": 8.478632478632479,
"grad_norm": 4.558652400970459,
"learning_rate": 5.760683760683761e-06,
"loss": 0.4332,
"step": 992
},
{
"epoch": 8.487179487179487,
"grad_norm": 8.17482852935791,
"learning_rate": 5.756410256410257e-06,
"loss": 0.7286,
"step": 993
},
{
"epoch": 8.495726495726496,
"grad_norm": 7.322425365447998,
"learning_rate": 5.7521367521367525e-06,
"loss": 0.8554,
"step": 994
},
{
"epoch": 8.504273504273504,
"grad_norm": 4.249075889587402,
"learning_rate": 5.7478632478632475e-06,
"loss": 0.2442,
"step": 995
},
{
"epoch": 8.512820512820513,
"grad_norm": 4.157267093658447,
"learning_rate": 5.743589743589743e-06,
"loss": 0.4207,
"step": 996
},
{
"epoch": 8.521367521367521,
"grad_norm": 4.118504047393799,
"learning_rate": 5.739316239316239e-06,
"loss": 0.1411,
"step": 997
},
{
"epoch": 8.52991452991453,
"grad_norm": 7.273322105407715,
"learning_rate": 5.735042735042735e-06,
"loss": 0.6269,
"step": 998
},
{
"epoch": 8.538461538461538,
"grad_norm": 4.7668633460998535,
"learning_rate": 5.730769230769231e-06,
"loss": 0.1894,
"step": 999
},
{
"epoch": 8.547008547008547,
"grad_norm": 5.869007110595703,
"learning_rate": 5.726495726495727e-06,
"loss": 0.7301,
"step": 1000
},
{
"epoch": 8.555555555555555,
"grad_norm": 5.987617015838623,
"learning_rate": 5.722222222222222e-06,
"loss": 0.29,
"step": 1001
},
{
"epoch": 8.564102564102564,
"grad_norm": 5.445812702178955,
"learning_rate": 5.717948717948718e-06,
"loss": 0.4278,
"step": 1002
},
{
"epoch": 8.572649572649572,
"grad_norm": 4.7509002685546875,
"learning_rate": 5.713675213675214e-06,
"loss": 0.3396,
"step": 1003
},
{
"epoch": 8.581196581196581,
"grad_norm": 5.584397315979004,
"learning_rate": 5.70940170940171e-06,
"loss": 0.1329,
"step": 1004
},
{
"epoch": 8.58974358974359,
"grad_norm": 4.627229690551758,
"learning_rate": 5.705128205128206e-06,
"loss": 0.3012,
"step": 1005
},
{
"epoch": 8.598290598290598,
"grad_norm": 7.724045276641846,
"learning_rate": 5.7008547008547015e-06,
"loss": 0.4876,
"step": 1006
},
{
"epoch": 8.606837606837606,
"grad_norm": 3.488499164581299,
"learning_rate": 5.6965811965811965e-06,
"loss": 0.2025,
"step": 1007
},
{
"epoch": 8.615384615384615,
"grad_norm": 14.487537384033203,
"learning_rate": 5.692307692307692e-06,
"loss": 0.6795,
"step": 1008
},
{
"epoch": 8.623931623931623,
"grad_norm": 4.03059196472168,
"learning_rate": 5.688034188034188e-06,
"loss": 0.2121,
"step": 1009
},
{
"epoch": 8.632478632478632,
"grad_norm": 3.278873920440674,
"learning_rate": 5.683760683760684e-06,
"loss": 0.3475,
"step": 1010
},
{
"epoch": 8.64102564102564,
"grad_norm": 4.599937915802002,
"learning_rate": 5.67948717948718e-06,
"loss": 0.2355,
"step": 1011
},
{
"epoch": 8.649572649572649,
"grad_norm": 6.314788818359375,
"learning_rate": 5.675213675213675e-06,
"loss": 0.2402,
"step": 1012
},
{
"epoch": 8.658119658119658,
"grad_norm": 3.4483532905578613,
"learning_rate": 5.670940170940171e-06,
"loss": 0.2189,
"step": 1013
},
{
"epoch": 8.666666666666666,
"grad_norm": 299.8923645019531,
"learning_rate": 5.666666666666667e-06,
"loss": 1.0473,
"step": 1014
},
{
"epoch": 8.675213675213675,
"grad_norm": 13.14855670928955,
"learning_rate": 5.662393162393163e-06,
"loss": 0.3723,
"step": 1015
},
{
"epoch": 8.683760683760683,
"grad_norm": 6.513180732727051,
"learning_rate": 5.658119658119659e-06,
"loss": 0.483,
"step": 1016
},
{
"epoch": 8.692307692307692,
"grad_norm": 5.026037693023682,
"learning_rate": 5.6538461538461546e-06,
"loss": 0.4417,
"step": 1017
},
{
"epoch": 8.7008547008547,
"grad_norm": 176.535888671875,
"learning_rate": 5.64957264957265e-06,
"loss": 0.5256,
"step": 1018
},
{
"epoch": 8.709401709401709,
"grad_norm": 6.023639678955078,
"learning_rate": 5.6452991452991455e-06,
"loss": 0.3708,
"step": 1019
},
{
"epoch": 8.717948717948717,
"grad_norm": 16.64018440246582,
"learning_rate": 5.641025641025641e-06,
"loss": 0.8908,
"step": 1020
},
{
"epoch": 8.726495726495726,
"grad_norm": 2.9167582988739014,
"learning_rate": 5.636752136752137e-06,
"loss": 0.077,
"step": 1021
},
{
"epoch": 8.735042735042736,
"grad_norm": 3.368325710296631,
"learning_rate": 5.632478632478633e-06,
"loss": 0.2495,
"step": 1022
},
{
"epoch": 8.743589743589745,
"grad_norm": 3.7961905002593994,
"learning_rate": 5.628205128205129e-06,
"loss": 0.4427,
"step": 1023
},
{
"epoch": 8.752136752136753,
"grad_norm": 4.661024570465088,
"learning_rate": 5.623931623931624e-06,
"loss": 0.3092,
"step": 1024
},
{
"epoch": 8.760683760683762,
"grad_norm": 5.1971588134765625,
"learning_rate": 5.61965811965812e-06,
"loss": 0.2213,
"step": 1025
},
{
"epoch": 8.76923076923077,
"grad_norm": 4.427041530609131,
"learning_rate": 5.615384615384616e-06,
"loss": 0.2885,
"step": 1026
},
{
"epoch": 8.777777777777779,
"grad_norm": 7.352906703948975,
"learning_rate": 5.611111111111112e-06,
"loss": 0.2689,
"step": 1027
},
{
"epoch": 8.786324786324787,
"grad_norm": 5.306934833526611,
"learning_rate": 5.606837606837608e-06,
"loss": 0.3758,
"step": 1028
},
{
"epoch": 8.794871794871796,
"grad_norm": 4.502418041229248,
"learning_rate": 5.602564102564103e-06,
"loss": 0.4655,
"step": 1029
},
{
"epoch": 8.803418803418804,
"grad_norm": 3.427734851837158,
"learning_rate": 5.598290598290599e-06,
"loss": 0.1145,
"step": 1030
},
{
"epoch": 8.811965811965813,
"grad_norm": 4.047433376312256,
"learning_rate": 5.5940170940170945e-06,
"loss": 0.1482,
"step": 1031
},
{
"epoch": 8.820512820512821,
"grad_norm": 3.6860435009002686,
"learning_rate": 5.58974358974359e-06,
"loss": 0.1152,
"step": 1032
},
{
"epoch": 8.82905982905983,
"grad_norm": 6.792733669281006,
"learning_rate": 5.585470085470086e-06,
"loss": 0.1732,
"step": 1033
},
{
"epoch": 8.837606837606838,
"grad_norm": 4.222206115722656,
"learning_rate": 5.581196581196582e-06,
"loss": 0.1259,
"step": 1034
},
{
"epoch": 8.846153846153847,
"grad_norm": 4.376220703125,
"learning_rate": 5.576923076923077e-06,
"loss": 0.2403,
"step": 1035
},
{
"epoch": 8.854700854700855,
"grad_norm": 3.459076166152954,
"learning_rate": 5.572649572649573e-06,
"loss": 0.2064,
"step": 1036
},
{
"epoch": 8.863247863247864,
"grad_norm": 6.312697410583496,
"learning_rate": 5.568376068376069e-06,
"loss": 0.5076,
"step": 1037
},
{
"epoch": 8.871794871794872,
"grad_norm": 10.137848854064941,
"learning_rate": 5.564102564102565e-06,
"loss": 0.1649,
"step": 1038
},
{
"epoch": 8.88034188034188,
"grad_norm": 6.605007171630859,
"learning_rate": 5.559829059829061e-06,
"loss": 0.4233,
"step": 1039
},
{
"epoch": 8.88888888888889,
"grad_norm": 3.9786465167999268,
"learning_rate": 5.555555555555557e-06,
"loss": 0.1801,
"step": 1040
},
{
"epoch": 8.897435897435898,
"grad_norm": 4.40491247177124,
"learning_rate": 5.551282051282052e-06,
"loss": 0.169,
"step": 1041
},
{
"epoch": 8.905982905982906,
"grad_norm": 4.719818592071533,
"learning_rate": 5.547008547008548e-06,
"loss": 0.1454,
"step": 1042
},
{
"epoch": 8.914529914529915,
"grad_norm": 2.384941577911377,
"learning_rate": 5.5427350427350435e-06,
"loss": 0.0723,
"step": 1043
},
{
"epoch": 8.923076923076923,
"grad_norm": 3.258315324783325,
"learning_rate": 5.538461538461539e-06,
"loss": 0.1023,
"step": 1044
},
{
"epoch": 8.931623931623932,
"grad_norm": 18.745052337646484,
"learning_rate": 5.534188034188035e-06,
"loss": 0.2673,
"step": 1045
},
{
"epoch": 8.94017094017094,
"grad_norm": 3.788177967071533,
"learning_rate": 5.52991452991453e-06,
"loss": 0.3173,
"step": 1046
},
{
"epoch": 8.948717948717949,
"grad_norm": 2.734895944595337,
"learning_rate": 5.525641025641026e-06,
"loss": 0.0834,
"step": 1047
},
{
"epoch": 8.957264957264957,
"grad_norm": 4.158284664154053,
"learning_rate": 5.521367521367522e-06,
"loss": 0.3414,
"step": 1048
},
{
"epoch": 8.965811965811966,
"grad_norm": 4.875148296356201,
"learning_rate": 5.517094017094018e-06,
"loss": 0.2729,
"step": 1049
},
{
"epoch": 8.974358974358974,
"grad_norm": 5.2556352615356445,
"learning_rate": 5.512820512820514e-06,
"loss": 0.1422,
"step": 1050
},
{
"epoch": 8.982905982905983,
"grad_norm": 3.817049980163574,
"learning_rate": 5.50854700854701e-06,
"loss": 0.2514,
"step": 1051
},
{
"epoch": 8.991452991452991,
"grad_norm": 2.247227668762207,
"learning_rate": 5.504273504273505e-06,
"loss": 0.0703,
"step": 1052
},
{
"epoch": 9.0,
"grad_norm": 34.36362838745117,
"learning_rate": 5.500000000000001e-06,
"loss": 0.7433,
"step": 1053
},
{
"epoch": 9.0,
"eval_loss": 0.12675683200359344,
"eval_runtime": 9.3141,
"eval_samples_per_second": 50.032,
"eval_steps_per_second": 6.334,
"step": 1053
},
{
"epoch": 9.008547008547009,
"grad_norm": 5.314228057861328,
"learning_rate": 5.495726495726497e-06,
"loss": 0.2576,
"step": 1054
},
{
"epoch": 9.017094017094017,
"grad_norm": 34.33782958984375,
"learning_rate": 5.4914529914529925e-06,
"loss": 0.3833,
"step": 1055
},
{
"epoch": 9.025641025641026,
"grad_norm": 5.440598964691162,
"learning_rate": 5.487179487179488e-06,
"loss": 0.3898,
"step": 1056
},
{
"epoch": 9.034188034188034,
"grad_norm": 3.561518907546997,
"learning_rate": 5.482905982905984e-06,
"loss": 0.2197,
"step": 1057
},
{
"epoch": 9.042735042735043,
"grad_norm": 4.7679762840271,
"learning_rate": 5.478632478632479e-06,
"loss": 0.3885,
"step": 1058
},
{
"epoch": 9.051282051282051,
"grad_norm": 4.694134712219238,
"learning_rate": 5.474358974358975e-06,
"loss": 0.2532,
"step": 1059
},
{
"epoch": 9.05982905982906,
"grad_norm": 4.347025394439697,
"learning_rate": 5.470085470085471e-06,
"loss": 0.1949,
"step": 1060
},
{
"epoch": 9.068376068376068,
"grad_norm": 4.064525127410889,
"learning_rate": 5.465811965811966e-06,
"loss": 0.1597,
"step": 1061
},
{
"epoch": 9.076923076923077,
"grad_norm": 3.78560471534729,
"learning_rate": 5.461538461538461e-06,
"loss": 0.18,
"step": 1062
},
{
"epoch": 9.085470085470085,
"grad_norm": 7.843743324279785,
"learning_rate": 5.457264957264957e-06,
"loss": 0.3146,
"step": 1063
},
{
"epoch": 9.094017094017094,
"grad_norm": 8.152037620544434,
"learning_rate": 5.452991452991453e-06,
"loss": 0.3384,
"step": 1064
},
{
"epoch": 9.102564102564102,
"grad_norm": 3.987872838973999,
"learning_rate": 5.448717948717949e-06,
"loss": 0.2071,
"step": 1065
},
{
"epoch": 9.11111111111111,
"grad_norm": 3.478532552719116,
"learning_rate": 5.444444444444445e-06,
"loss": 0.1788,
"step": 1066
},
{
"epoch": 9.11965811965812,
"grad_norm": 3.6598286628723145,
"learning_rate": 5.44017094017094e-06,
"loss": 0.2459,
"step": 1067
},
{
"epoch": 9.128205128205128,
"grad_norm": 9.528829574584961,
"learning_rate": 5.435897435897436e-06,
"loss": 0.2046,
"step": 1068
},
{
"epoch": 9.136752136752136,
"grad_norm": 3.3274407386779785,
"learning_rate": 5.4316239316239316e-06,
"loss": 0.1414,
"step": 1069
},
{
"epoch": 9.145299145299145,
"grad_norm": 5.117324352264404,
"learning_rate": 5.4273504273504275e-06,
"loss": 0.3636,
"step": 1070
},
{
"epoch": 9.153846153846153,
"grad_norm": 8.604976654052734,
"learning_rate": 5.423076923076923e-06,
"loss": 0.2723,
"step": 1071
},
{
"epoch": 9.162393162393162,
"grad_norm": 72.67993927001953,
"learning_rate": 5.418803418803419e-06,
"loss": 0.5863,
"step": 1072
},
{
"epoch": 9.17094017094017,
"grad_norm": 3.8609094619750977,
"learning_rate": 5.414529914529914e-06,
"loss": 0.1778,
"step": 1073
},
{
"epoch": 9.179487179487179,
"grad_norm": 21.24209976196289,
"learning_rate": 5.41025641025641e-06,
"loss": 0.2062,
"step": 1074
},
{
"epoch": 9.188034188034187,
"grad_norm": 5.552285194396973,
"learning_rate": 5.405982905982906e-06,
"loss": 0.4685,
"step": 1075
},
{
"epoch": 9.196581196581196,
"grad_norm": 12.241254806518555,
"learning_rate": 5.401709401709402e-06,
"loss": 0.4309,
"step": 1076
},
{
"epoch": 9.205128205128204,
"grad_norm": 3.6276049613952637,
"learning_rate": 5.397435897435898e-06,
"loss": 0.0924,
"step": 1077
},
{
"epoch": 9.213675213675213,
"grad_norm": 10.98838996887207,
"learning_rate": 5.393162393162394e-06,
"loss": 0.7616,
"step": 1078
},
{
"epoch": 9.222222222222221,
"grad_norm": 4.689146041870117,
"learning_rate": 5.388888888888889e-06,
"loss": 0.346,
"step": 1079
},
{
"epoch": 9.23076923076923,
"grad_norm": 6.385439872741699,
"learning_rate": 5.384615384615385e-06,
"loss": 0.2945,
"step": 1080
},
{
"epoch": 9.239316239316238,
"grad_norm": 2.4931023120880127,
"learning_rate": 5.3803418803418806e-06,
"loss": 0.172,
"step": 1081
},
{
"epoch": 9.247863247863247,
"grad_norm": 3.797539472579956,
"learning_rate": 5.3760683760683764e-06,
"loss": 0.0927,
"step": 1082
},
{
"epoch": 9.256410256410255,
"grad_norm": 2.7136716842651367,
"learning_rate": 5.371794871794872e-06,
"loss": 0.0932,
"step": 1083
},
{
"epoch": 9.264957264957266,
"grad_norm": 5.207858085632324,
"learning_rate": 5.367521367521367e-06,
"loss": 0.1176,
"step": 1084
},
{
"epoch": 9.273504273504274,
"grad_norm": 3.95009183883667,
"learning_rate": 5.363247863247863e-06,
"loss": 0.3045,
"step": 1085
},
{
"epoch": 9.282051282051283,
"grad_norm": 1.9097685813903809,
"learning_rate": 5.358974358974359e-06,
"loss": 0.1793,
"step": 1086
},
{
"epoch": 9.290598290598291,
"grad_norm": 3.205216407775879,
"learning_rate": 5.354700854700855e-06,
"loss": 0.1071,
"step": 1087
},
{
"epoch": 9.2991452991453,
"grad_norm": 3.481822967529297,
"learning_rate": 5.350427350427351e-06,
"loss": 0.3885,
"step": 1088
},
{
"epoch": 9.307692307692308,
"grad_norm": 11.802562713623047,
"learning_rate": 5.346153846153847e-06,
"loss": 0.1769,
"step": 1089
},
{
"epoch": 9.316239316239317,
"grad_norm": 3.101505994796753,
"learning_rate": 5.341880341880342e-06,
"loss": 0.1265,
"step": 1090
},
{
"epoch": 9.324786324786325,
"grad_norm": 5.163032054901123,
"learning_rate": 5.337606837606838e-06,
"loss": 0.4768,
"step": 1091
},
{
"epoch": 9.333333333333334,
"grad_norm": 1.8217605352401733,
"learning_rate": 5.333333333333334e-06,
"loss": 0.053,
"step": 1092
},
{
"epoch": 9.341880341880342,
"grad_norm": 2.6139562129974365,
"learning_rate": 5.3290598290598295e-06,
"loss": 0.0848,
"step": 1093
},
{
"epoch": 9.350427350427351,
"grad_norm": 3.1172311305999756,
"learning_rate": 5.3247863247863254e-06,
"loss": 0.1076,
"step": 1094
},
{
"epoch": 9.35897435897436,
"grad_norm": 5.907342433929443,
"learning_rate": 5.320512820512821e-06,
"loss": 0.1737,
"step": 1095
},
{
"epoch": 9.367521367521368,
"grad_norm": 45.74967575073242,
"learning_rate": 5.316239316239316e-06,
"loss": 0.2455,
"step": 1096
},
{
"epoch": 9.376068376068377,
"grad_norm": 3.1865549087524414,
"learning_rate": 5.311965811965812e-06,
"loss": 0.2236,
"step": 1097
},
{
"epoch": 9.384615384615385,
"grad_norm": 4.028379917144775,
"learning_rate": 5.307692307692308e-06,
"loss": 0.1065,
"step": 1098
},
{
"epoch": 9.393162393162394,
"grad_norm": 5.388605117797852,
"learning_rate": 5.303418803418804e-06,
"loss": 0.2967,
"step": 1099
},
{
"epoch": 9.401709401709402,
"grad_norm": 3.661736249923706,
"learning_rate": 5.2991452991453e-06,
"loss": 0.1271,
"step": 1100
},
{
"epoch": 9.41025641025641,
"grad_norm": 4.693649768829346,
"learning_rate": 5.294871794871795e-06,
"loss": 0.7891,
"step": 1101
},
{
"epoch": 9.418803418803419,
"grad_norm": 14.75247573852539,
"learning_rate": 5.290598290598291e-06,
"loss": 0.707,
"step": 1102
},
{
"epoch": 9.427350427350428,
"grad_norm": 5.123616695404053,
"learning_rate": 5.286324786324787e-06,
"loss": 0.2424,
"step": 1103
},
{
"epoch": 9.435897435897436,
"grad_norm": 5.946259021759033,
"learning_rate": 5.282051282051283e-06,
"loss": 0.2558,
"step": 1104
},
{
"epoch": 9.444444444444445,
"grad_norm": 3.3757872581481934,
"learning_rate": 5.2777777777777785e-06,
"loss": 0.072,
"step": 1105
},
{
"epoch": 9.452991452991453,
"grad_norm": 4.639676094055176,
"learning_rate": 5.2735042735042744e-06,
"loss": 0.1483,
"step": 1106
},
{
"epoch": 9.461538461538462,
"grad_norm": 5.552156925201416,
"learning_rate": 5.2692307692307695e-06,
"loss": 0.341,
"step": 1107
},
{
"epoch": 9.47008547008547,
"grad_norm": 10.601661682128906,
"learning_rate": 5.264957264957265e-06,
"loss": 0.5964,
"step": 1108
},
{
"epoch": 9.478632478632479,
"grad_norm": 4.391530513763428,
"learning_rate": 5.260683760683761e-06,
"loss": 0.2346,
"step": 1109
},
{
"epoch": 9.487179487179487,
"grad_norm": 3.150240659713745,
"learning_rate": 5.256410256410257e-06,
"loss": 0.1,
"step": 1110
},
{
"epoch": 9.495726495726496,
"grad_norm": 5.60894775390625,
"learning_rate": 5.252136752136753e-06,
"loss": 0.397,
"step": 1111
},
{
"epoch": 9.504273504273504,
"grad_norm": 9.21768856048584,
"learning_rate": 5.247863247863249e-06,
"loss": 0.2292,
"step": 1112
},
{
"epoch": 9.512820512820513,
"grad_norm": 8.351348876953125,
"learning_rate": 5.243589743589744e-06,
"loss": 0.3129,
"step": 1113
},
{
"epoch": 9.521367521367521,
"grad_norm": 3.0813419818878174,
"learning_rate": 5.23931623931624e-06,
"loss": 0.2539,
"step": 1114
},
{
"epoch": 9.52991452991453,
"grad_norm": 5.553039073944092,
"learning_rate": 5.235042735042736e-06,
"loss": 0.1121,
"step": 1115
},
{
"epoch": 9.538461538461538,
"grad_norm": 3.973057746887207,
"learning_rate": 5.230769230769232e-06,
"loss": 0.4928,
"step": 1116
},
{
"epoch": 9.547008547008547,
"grad_norm": 4.753414630889893,
"learning_rate": 5.2264957264957275e-06,
"loss": 0.2247,
"step": 1117
},
{
"epoch": 9.555555555555555,
"grad_norm": 7.344094753265381,
"learning_rate": 5.2222222222222226e-06,
"loss": 0.1405,
"step": 1118
},
{
"epoch": 9.564102564102564,
"grad_norm": 47.83219528198242,
"learning_rate": 5.2179487179487185e-06,
"loss": 0.3108,
"step": 1119
},
{
"epoch": 9.572649572649572,
"grad_norm": 2.31591796875,
"learning_rate": 5.213675213675214e-06,
"loss": 0.1019,
"step": 1120
},
{
"epoch": 9.581196581196581,
"grad_norm": 3.871413230895996,
"learning_rate": 5.20940170940171e-06,
"loss": 0.2562,
"step": 1121
},
{
"epoch": 9.58974358974359,
"grad_norm": 2.1789255142211914,
"learning_rate": 5.205128205128206e-06,
"loss": 0.0571,
"step": 1122
},
{
"epoch": 9.598290598290598,
"grad_norm": 4.119174957275391,
"learning_rate": 5.200854700854702e-06,
"loss": 0.2799,
"step": 1123
},
{
"epoch": 9.606837606837606,
"grad_norm": 7.873704433441162,
"learning_rate": 5.196581196581197e-06,
"loss": 0.2154,
"step": 1124
},
{
"epoch": 9.615384615384615,
"grad_norm": 3.386780023574829,
"learning_rate": 5.192307692307693e-06,
"loss": 0.1607,
"step": 1125
},
{
"epoch": 9.623931623931623,
"grad_norm": 3.3607964515686035,
"learning_rate": 5.188034188034189e-06,
"loss": 0.22,
"step": 1126
},
{
"epoch": 9.632478632478632,
"grad_norm": 10.655082702636719,
"learning_rate": 5.183760683760685e-06,
"loss": 0.2102,
"step": 1127
},
{
"epoch": 9.64102564102564,
"grad_norm": 5.550488471984863,
"learning_rate": 5.179487179487181e-06,
"loss": 0.347,
"step": 1128
},
{
"epoch": 9.649572649572649,
"grad_norm": 4.184569835662842,
"learning_rate": 5.1752136752136765e-06,
"loss": 0.183,
"step": 1129
},
{
"epoch": 9.658119658119658,
"grad_norm": 4.892969131469727,
"learning_rate": 5.1709401709401716e-06,
"loss": 0.2896,
"step": 1130
},
{
"epoch": 9.666666666666666,
"grad_norm": 5.926670074462891,
"learning_rate": 5.1666666666666675e-06,
"loss": 0.3321,
"step": 1131
},
{
"epoch": 9.675213675213675,
"grad_norm": 11.719461441040039,
"learning_rate": 5.162393162393163e-06,
"loss": 0.4055,
"step": 1132
},
{
"epoch": 9.683760683760683,
"grad_norm": 3.5666840076446533,
"learning_rate": 5.158119658119659e-06,
"loss": 0.2318,
"step": 1133
},
{
"epoch": 9.692307692307692,
"grad_norm": 6.800848484039307,
"learning_rate": 5.1538461538461534e-06,
"loss": 0.1202,
"step": 1134
},
{
"epoch": 9.7008547008547,
"grad_norm": 4.50139856338501,
"learning_rate": 5.149572649572649e-06,
"loss": 0.1914,
"step": 1135
},
{
"epoch": 9.709401709401709,
"grad_norm": 2.599607467651367,
"learning_rate": 5.145299145299145e-06,
"loss": 0.0833,
"step": 1136
},
{
"epoch": 9.717948717948717,
"grad_norm": 6.084483623504639,
"learning_rate": 5.141025641025641e-06,
"loss": 0.0907,
"step": 1137
},
{
"epoch": 9.726495726495726,
"grad_norm": 4.542915344238281,
"learning_rate": 5.136752136752137e-06,
"loss": 0.4554,
"step": 1138
},
{
"epoch": 9.735042735042736,
"grad_norm": 3.871166229248047,
"learning_rate": 5.132478632478632e-06,
"loss": 0.3037,
"step": 1139
},
{
"epoch": 9.743589743589745,
"grad_norm": 5.121057033538818,
"learning_rate": 5.128205128205128e-06,
"loss": 0.1751,
"step": 1140
},
{
"epoch": 9.752136752136753,
"grad_norm": 3.7517125606536865,
"learning_rate": 5.123931623931624e-06,
"loss": 0.3144,
"step": 1141
},
{
"epoch": 9.760683760683762,
"grad_norm": 1.7604278326034546,
"learning_rate": 5.11965811965812e-06,
"loss": 0.0649,
"step": 1142
},
{
"epoch": 9.76923076923077,
"grad_norm": 13.68947982788086,
"learning_rate": 5.115384615384616e-06,
"loss": 0.2184,
"step": 1143
},
{
"epoch": 9.777777777777779,
"grad_norm": 5.716836452484131,
"learning_rate": 5.1111111111111115e-06,
"loss": 0.1876,
"step": 1144
},
{
"epoch": 9.786324786324787,
"grad_norm": 8.21943187713623,
"learning_rate": 5.1068376068376065e-06,
"loss": 0.349,
"step": 1145
},
{
"epoch": 9.794871794871796,
"grad_norm": 5.270402908325195,
"learning_rate": 5.1025641025641024e-06,
"loss": 0.4442,
"step": 1146
},
{
"epoch": 9.803418803418804,
"grad_norm": 2.3825948238372803,
"learning_rate": 5.098290598290598e-06,
"loss": 0.2237,
"step": 1147
},
{
"epoch": 9.811965811965813,
"grad_norm": 11.812047958374023,
"learning_rate": 5.094017094017094e-06,
"loss": 0.5122,
"step": 1148
},
{
"epoch": 9.820512820512821,
"grad_norm": 9.14202880859375,
"learning_rate": 5.08974358974359e-06,
"loss": 0.3407,
"step": 1149
},
{
"epoch": 9.82905982905983,
"grad_norm": 5.273305892944336,
"learning_rate": 5.085470085470086e-06,
"loss": 0.1702,
"step": 1150
},
{
"epoch": 9.837606837606838,
"grad_norm": 2.995126485824585,
"learning_rate": 5.081196581196581e-06,
"loss": 0.228,
"step": 1151
},
{
"epoch": 9.846153846153847,
"grad_norm": 4.077675819396973,
"learning_rate": 5.076923076923077e-06,
"loss": 0.4022,
"step": 1152
},
{
"epoch": 9.854700854700855,
"grad_norm": 2.1732425689697266,
"learning_rate": 5.072649572649573e-06,
"loss": 0.1178,
"step": 1153
},
{
"epoch": 9.863247863247864,
"grad_norm": 2.905172109603882,
"learning_rate": 5.068376068376069e-06,
"loss": 0.1718,
"step": 1154
},
{
"epoch": 9.871794871794872,
"grad_norm": 2.702521324157715,
"learning_rate": 5.064102564102565e-06,
"loss": 0.1488,
"step": 1155
},
{
"epoch": 9.88034188034188,
"grad_norm": 2.414088487625122,
"learning_rate": 5.05982905982906e-06,
"loss": 0.1034,
"step": 1156
},
{
"epoch": 9.88888888888889,
"grad_norm": 2.618173360824585,
"learning_rate": 5.0555555555555555e-06,
"loss": 0.0783,
"step": 1157
},
{
"epoch": 9.897435897435898,
"grad_norm": 5.002628803253174,
"learning_rate": 5.051282051282051e-06,
"loss": 0.1195,
"step": 1158
},
{
"epoch": 9.905982905982906,
"grad_norm": 2.84708833694458,
"learning_rate": 5.047008547008547e-06,
"loss": 0.0906,
"step": 1159
},
{
"epoch": 9.914529914529915,
"grad_norm": 5.564020156860352,
"learning_rate": 5.042735042735043e-06,
"loss": 0.2037,
"step": 1160
},
{
"epoch": 9.923076923076923,
"grad_norm": 3.7763166427612305,
"learning_rate": 5.038461538461539e-06,
"loss": 0.2067,
"step": 1161
},
{
"epoch": 9.931623931623932,
"grad_norm": 2.67268705368042,
"learning_rate": 5.034188034188034e-06,
"loss": 0.0557,
"step": 1162
},
{
"epoch": 9.94017094017094,
"grad_norm": 2.4144680500030518,
"learning_rate": 5.02991452991453e-06,
"loss": 0.194,
"step": 1163
},
{
"epoch": 9.948717948717949,
"grad_norm": 2.0716731548309326,
"learning_rate": 5.025641025641026e-06,
"loss": 0.1253,
"step": 1164
},
{
"epoch": 9.957264957264957,
"grad_norm": 13.20478630065918,
"learning_rate": 5.021367521367522e-06,
"loss": 0.268,
"step": 1165
},
{
"epoch": 9.965811965811966,
"grad_norm": 2.093698263168335,
"learning_rate": 5.017094017094018e-06,
"loss": 0.0738,
"step": 1166
},
{
"epoch": 9.974358974358974,
"grad_norm": 2.2758119106292725,
"learning_rate": 5.012820512820514e-06,
"loss": 0.0804,
"step": 1167
},
{
"epoch": 9.982905982905983,
"grad_norm": 21.843395233154297,
"learning_rate": 5.008547008547009e-06,
"loss": 0.3298,
"step": 1168
},
{
"epoch": 9.991452991452991,
"grad_norm": 3.0435073375701904,
"learning_rate": 5.0042735042735045e-06,
"loss": 0.1318,
"step": 1169
},
{
"epoch": 10.0,
"grad_norm": 8.449163436889648,
"learning_rate": 5e-06,
"loss": 0.1725,
"step": 1170
},
{
"epoch": 10.0,
"eval_loss": 0.10285739600658417,
"eval_runtime": 9.2384,
"eval_samples_per_second": 50.441,
"eval_steps_per_second": 6.386,
"step": 1170
},
{
"epoch": 10.008547008547009,
"grad_norm": 4.151456356048584,
"learning_rate": 4.995726495726496e-06,
"loss": 0.3336,
"step": 1171
},
{
"epoch": 10.017094017094017,
"grad_norm": 2.38647723197937,
"learning_rate": 4.991452991452992e-06,
"loss": 0.1138,
"step": 1172
},
{
"epoch": 10.025641025641026,
"grad_norm": 4.44817590713501,
"learning_rate": 4.987179487179487e-06,
"loss": 0.0954,
"step": 1173
},
{
"epoch": 10.034188034188034,
"grad_norm": 2.6213347911834717,
"learning_rate": 4.982905982905983e-06,
"loss": 0.0695,
"step": 1174
},
{
"epoch": 10.042735042735043,
"grad_norm": 4.664891719818115,
"learning_rate": 4.978632478632479e-06,
"loss": 0.1067,
"step": 1175
},
{
"epoch": 10.051282051282051,
"grad_norm": 1.7059048414230347,
"learning_rate": 4.974358974358975e-06,
"loss": 0.0321,
"step": 1176
},
{
"epoch": 10.05982905982906,
"grad_norm": 5.123709678649902,
"learning_rate": 4.970085470085471e-06,
"loss": 0.2117,
"step": 1177
},
{
"epoch": 10.068376068376068,
"grad_norm": 2.2717695236206055,
"learning_rate": 4.965811965811967e-06,
"loss": 0.2187,
"step": 1178
},
{
"epoch": 10.076923076923077,
"grad_norm": 4.669886112213135,
"learning_rate": 4.961538461538462e-06,
"loss": 0.4615,
"step": 1179
},
{
"epoch": 10.085470085470085,
"grad_norm": 18.739727020263672,
"learning_rate": 4.957264957264958e-06,
"loss": 0.3431,
"step": 1180
},
{
"epoch": 10.094017094017094,
"grad_norm": 7.798559188842773,
"learning_rate": 4.9529914529914535e-06,
"loss": 0.2483,
"step": 1181
},
{
"epoch": 10.102564102564102,
"grad_norm": 22.59453773498535,
"learning_rate": 4.948717948717949e-06,
"loss": 0.15,
"step": 1182
},
{
"epoch": 10.11111111111111,
"grad_norm": 2.5734364986419678,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0465,
"step": 1183
},
{
"epoch": 10.11965811965812,
"grad_norm": 3.1944875717163086,
"learning_rate": 4.940170940170941e-06,
"loss": 0.1429,
"step": 1184
},
{
"epoch": 10.128205128205128,
"grad_norm": 1.6943906545639038,
"learning_rate": 4.935897435897436e-06,
"loss": 0.0685,
"step": 1185
},
{
"epoch": 10.136752136752136,
"grad_norm": 4.497282981872559,
"learning_rate": 4.931623931623932e-06,
"loss": 0.2113,
"step": 1186
},
{
"epoch": 10.145299145299145,
"grad_norm": 2.9377167224884033,
"learning_rate": 4.927350427350428e-06,
"loss": 0.1352,
"step": 1187
},
{
"epoch": 10.153846153846153,
"grad_norm": 8.528215408325195,
"learning_rate": 4.923076923076924e-06,
"loss": 0.3268,
"step": 1188
},
{
"epoch": 10.162393162393162,
"grad_norm": 2.143850803375244,
"learning_rate": 4.918803418803419e-06,
"loss": 0.0923,
"step": 1189
},
{
"epoch": 10.17094017094017,
"grad_norm": 3.921250343322754,
"learning_rate": 4.914529914529915e-06,
"loss": 0.1451,
"step": 1190
},
{
"epoch": 10.179487179487179,
"grad_norm": 10.713285446166992,
"learning_rate": 4.910256410256411e-06,
"loss": 0.17,
"step": 1191
},
{
"epoch": 10.188034188034187,
"grad_norm": 2.450204849243164,
"learning_rate": 4.905982905982906e-06,
"loss": 0.0765,
"step": 1192
},
{
"epoch": 10.196581196581196,
"grad_norm": 4.750647068023682,
"learning_rate": 4.901709401709402e-06,
"loss": 0.2829,
"step": 1193
},
{
"epoch": 10.205128205128204,
"grad_norm": 12.714463233947754,
"learning_rate": 4.8974358974358975e-06,
"loss": 0.6767,
"step": 1194
},
{
"epoch": 10.213675213675213,
"grad_norm": 6.759951591491699,
"learning_rate": 4.8931623931623934e-06,
"loss": 0.2369,
"step": 1195
},
{
"epoch": 10.222222222222221,
"grad_norm": 8.592784881591797,
"learning_rate": 4.888888888888889e-06,
"loss": 0.4203,
"step": 1196
},
{
"epoch": 10.23076923076923,
"grad_norm": 5.04047155380249,
"learning_rate": 4.884615384615385e-06,
"loss": 0.1023,
"step": 1197
},
{
"epoch": 10.239316239316238,
"grad_norm": 38.112152099609375,
"learning_rate": 4.88034188034188e-06,
"loss": 0.4686,
"step": 1198
},
{
"epoch": 10.247863247863247,
"grad_norm": 6.751104354858398,
"learning_rate": 4.876068376068376e-06,
"loss": 0.085,
"step": 1199
},
{
"epoch": 10.256410256410255,
"grad_norm": 4.3117594718933105,
"learning_rate": 4.871794871794872e-06,
"loss": 0.1504,
"step": 1200
},
{
"epoch": 10.264957264957266,
"grad_norm": 2.251265287399292,
"learning_rate": 4.867521367521368e-06,
"loss": 0.1664,
"step": 1201
},
{
"epoch": 10.273504273504274,
"grad_norm": 2.1650373935699463,
"learning_rate": 4.863247863247864e-06,
"loss": 0.0959,
"step": 1202
},
{
"epoch": 10.282051282051283,
"grad_norm": 2.5863089561462402,
"learning_rate": 4.85897435897436e-06,
"loss": 0.1148,
"step": 1203
},
{
"epoch": 10.290598290598291,
"grad_norm": 1.974357008934021,
"learning_rate": 4.854700854700855e-06,
"loss": 0.0663,
"step": 1204
},
{
"epoch": 10.2991452991453,
"grad_norm": 2.3226940631866455,
"learning_rate": 4.850427350427351e-06,
"loss": 0.1363,
"step": 1205
},
{
"epoch": 10.307692307692308,
"grad_norm": 4.034085750579834,
"learning_rate": 4.8461538461538465e-06,
"loss": 0.3473,
"step": 1206
},
{
"epoch": 10.316239316239317,
"grad_norm": 2.492307186126709,
"learning_rate": 4.8418803418803424e-06,
"loss": 0.1742,
"step": 1207
},
{
"epoch": 10.324786324786325,
"grad_norm": 2.886432409286499,
"learning_rate": 4.837606837606838e-06,
"loss": 0.1382,
"step": 1208
},
{
"epoch": 10.333333333333334,
"grad_norm": 3.6314749717712402,
"learning_rate": 4.833333333333333e-06,
"loss": 0.1556,
"step": 1209
},
{
"epoch": 10.341880341880342,
"grad_norm": 2.2757928371429443,
"learning_rate": 4.829059829059829e-06,
"loss": 0.0434,
"step": 1210
},
{
"epoch": 10.350427350427351,
"grad_norm": 3.4152615070343018,
"learning_rate": 4.824786324786325e-06,
"loss": 0.2903,
"step": 1211
},
{
"epoch": 10.35897435897436,
"grad_norm": 3.873960256576538,
"learning_rate": 4.820512820512821e-06,
"loss": 0.2611,
"step": 1212
},
{
"epoch": 10.367521367521368,
"grad_norm": 4.2241291999816895,
"learning_rate": 4.816239316239317e-06,
"loss": 0.0954,
"step": 1213
},
{
"epoch": 10.376068376068377,
"grad_norm": 5.454725742340088,
"learning_rate": 4.811965811965813e-06,
"loss": 0.1361,
"step": 1214
},
{
"epoch": 10.384615384615385,
"grad_norm": 3.482558012008667,
"learning_rate": 4.807692307692308e-06,
"loss": 0.0861,
"step": 1215
},
{
"epoch": 10.393162393162394,
"grad_norm": 2.301254987716675,
"learning_rate": 4.803418803418804e-06,
"loss": 0.1571,
"step": 1216
},
{
"epoch": 10.401709401709402,
"grad_norm": 6.0665602684021,
"learning_rate": 4.7991452991453e-06,
"loss": 0.5323,
"step": 1217
},
{
"epoch": 10.41025641025641,
"grad_norm": 3.6052770614624023,
"learning_rate": 4.7948717948717955e-06,
"loss": 0.3789,
"step": 1218
},
{
"epoch": 10.418803418803419,
"grad_norm": 3.9434757232666016,
"learning_rate": 4.790598290598291e-06,
"loss": 0.0605,
"step": 1219
},
{
"epoch": 10.427350427350428,
"grad_norm": 5.260069847106934,
"learning_rate": 4.786324786324787e-06,
"loss": 0.3163,
"step": 1220
},
{
"epoch": 10.435897435897436,
"grad_norm": 5.219394207000732,
"learning_rate": 4.782051282051282e-06,
"loss": 0.4339,
"step": 1221
},
{
"epoch": 10.444444444444445,
"grad_norm": 2.7057230472564697,
"learning_rate": 4.777777777777778e-06,
"loss": 0.0787,
"step": 1222
},
{
"epoch": 10.452991452991453,
"grad_norm": 11.005247116088867,
"learning_rate": 4.773504273504274e-06,
"loss": 0.255,
"step": 1223
},
{
"epoch": 10.461538461538462,
"grad_norm": 1.7238801717758179,
"learning_rate": 4.76923076923077e-06,
"loss": 0.0605,
"step": 1224
},
{
"epoch": 10.47008547008547,
"grad_norm": 6.509312629699707,
"learning_rate": 4.764957264957265e-06,
"loss": 0.2899,
"step": 1225
},
{
"epoch": 10.478632478632479,
"grad_norm": 7.1476359367370605,
"learning_rate": 4.760683760683761e-06,
"loss": 0.336,
"step": 1226
},
{
"epoch": 10.487179487179487,
"grad_norm": 15.92902660369873,
"learning_rate": 4.756410256410257e-06,
"loss": 0.4864,
"step": 1227
},
{
"epoch": 10.495726495726496,
"grad_norm": 5.545684337615967,
"learning_rate": 4.752136752136752e-06,
"loss": 0.4741,
"step": 1228
},
{
"epoch": 10.504273504273504,
"grad_norm": 3.2521066665649414,
"learning_rate": 4.747863247863248e-06,
"loss": 0.0894,
"step": 1229
},
{
"epoch": 10.512820512820513,
"grad_norm": 2.696866512298584,
"learning_rate": 4.743589743589744e-06,
"loss": 0.111,
"step": 1230
},
{
"epoch": 10.521367521367521,
"grad_norm": 1.8362340927124023,
"learning_rate": 4.7393162393162396e-06,
"loss": 0.0579,
"step": 1231
},
{
"epoch": 10.52991452991453,
"grad_norm": 2.96872878074646,
"learning_rate": 4.7350427350427355e-06,
"loss": 0.0781,
"step": 1232
},
{
"epoch": 10.538461538461538,
"grad_norm": 1.5503445863723755,
"learning_rate": 4.730769230769231e-06,
"loss": 0.0451,
"step": 1233
},
{
"epoch": 10.547008547008547,
"grad_norm": 3.9600377082824707,
"learning_rate": 4.726495726495726e-06,
"loss": 0.1721,
"step": 1234
},
{
"epoch": 10.555555555555555,
"grad_norm": 3.3868823051452637,
"learning_rate": 4.722222222222222e-06,
"loss": 0.1803,
"step": 1235
},
{
"epoch": 10.564102564102564,
"grad_norm": 2.528111219406128,
"learning_rate": 4.717948717948718e-06,
"loss": 0.238,
"step": 1236
},
{
"epoch": 10.572649572649572,
"grad_norm": 6.960350036621094,
"learning_rate": 4.713675213675214e-06,
"loss": 0.4353,
"step": 1237
},
{
"epoch": 10.581196581196581,
"grad_norm": 2.3169686794281006,
"learning_rate": 4.70940170940171e-06,
"loss": 0.1891,
"step": 1238
},
{
"epoch": 10.58974358974359,
"grad_norm": 2.021212577819824,
"learning_rate": 4.705128205128206e-06,
"loss": 0.0865,
"step": 1239
},
{
"epoch": 10.598290598290598,
"grad_norm": 2.445462942123413,
"learning_rate": 4.700854700854701e-06,
"loss": 0.0973,
"step": 1240
},
{
"epoch": 10.606837606837606,
"grad_norm": 3.4490067958831787,
"learning_rate": 4.696581196581197e-06,
"loss": 0.1419,
"step": 1241
},
{
"epoch": 10.615384615384615,
"grad_norm": 3.2859914302825928,
"learning_rate": 4.692307692307693e-06,
"loss": 0.1587,
"step": 1242
},
{
"epoch": 10.623931623931623,
"grad_norm": 4.754831790924072,
"learning_rate": 4.6880341880341886e-06,
"loss": 0.2537,
"step": 1243
},
{
"epoch": 10.632478632478632,
"grad_norm": 3.220867156982422,
"learning_rate": 4.6837606837606844e-06,
"loss": 0.0941,
"step": 1244
},
{
"epoch": 10.64102564102564,
"grad_norm": 5.699328422546387,
"learning_rate": 4.6794871794871795e-06,
"loss": 0.255,
"step": 1245
},
{
"epoch": 10.649572649572649,
"grad_norm": 1.5174522399902344,
"learning_rate": 4.675213675213675e-06,
"loss": 0.048,
"step": 1246
},
{
"epoch": 10.658119658119658,
"grad_norm": 2.4277050495147705,
"learning_rate": 4.670940170940171e-06,
"loss": 0.1127,
"step": 1247
},
{
"epoch": 10.666666666666666,
"grad_norm": 2.079031229019165,
"learning_rate": 4.666666666666667e-06,
"loss": 0.1038,
"step": 1248
},
{
"epoch": 10.675213675213675,
"grad_norm": 953.4605102539062,
"learning_rate": 4.662393162393163e-06,
"loss": 1.1892,
"step": 1249
},
{
"epoch": 10.683760683760683,
"grad_norm": 9.190105438232422,
"learning_rate": 4.658119658119659e-06,
"loss": 0.3541,
"step": 1250
},
{
"epoch": 10.692307692307692,
"grad_norm": 2.3222947120666504,
"learning_rate": 4.653846153846154e-06,
"loss": 0.0842,
"step": 1251
},
{
"epoch": 10.7008547008547,
"grad_norm": 2.2312700748443604,
"learning_rate": 4.64957264957265e-06,
"loss": 0.088,
"step": 1252
},
{
"epoch": 10.709401709401709,
"grad_norm": 3.987630844116211,
"learning_rate": 4.645299145299146e-06,
"loss": 0.1667,
"step": 1253
},
{
"epoch": 10.717948717948717,
"grad_norm": 5.108981609344482,
"learning_rate": 4.641025641025642e-06,
"loss": 0.4291,
"step": 1254
},
{
"epoch": 10.726495726495726,
"grad_norm": 2.8597464561462402,
"learning_rate": 4.6367521367521375e-06,
"loss": 0.0564,
"step": 1255
},
{
"epoch": 10.735042735042736,
"grad_norm": 2.3642940521240234,
"learning_rate": 4.6324786324786334e-06,
"loss": 0.0909,
"step": 1256
},
{
"epoch": 10.743589743589745,
"grad_norm": 1.5703462362289429,
"learning_rate": 4.6282051282051285e-06,
"loss": 0.0395,
"step": 1257
},
{
"epoch": 10.752136752136753,
"grad_norm": 2.952786922454834,
"learning_rate": 4.623931623931624e-06,
"loss": 0.1824,
"step": 1258
},
{
"epoch": 10.760683760683762,
"grad_norm": 2.9027185440063477,
"learning_rate": 4.61965811965812e-06,
"loss": 0.0765,
"step": 1259
},
{
"epoch": 10.76923076923077,
"grad_norm": 2.4386038780212402,
"learning_rate": 4.615384615384616e-06,
"loss": 0.2761,
"step": 1260
},
{
"epoch": 10.777777777777779,
"grad_norm": 7.146468639373779,
"learning_rate": 4.611111111111112e-06,
"loss": 0.4427,
"step": 1261
},
{
"epoch": 10.786324786324787,
"grad_norm": 2.002096652984619,
"learning_rate": 4.606837606837607e-06,
"loss": 0.0879,
"step": 1262
},
{
"epoch": 10.794871794871796,
"grad_norm": 6.504697322845459,
"learning_rate": 4.602564102564103e-06,
"loss": 0.1805,
"step": 1263
},
{
"epoch": 10.803418803418804,
"grad_norm": 9.748340606689453,
"learning_rate": 4.598290598290598e-06,
"loss": 0.5813,
"step": 1264
},
{
"epoch": 10.811965811965813,
"grad_norm": 3.67153000831604,
"learning_rate": 4.594017094017094e-06,
"loss": 0.4175,
"step": 1265
},
{
"epoch": 10.820512820512821,
"grad_norm": 9.109044075012207,
"learning_rate": 4.58974358974359e-06,
"loss": 0.4505,
"step": 1266
},
{
"epoch": 10.82905982905983,
"grad_norm": 5.419683933258057,
"learning_rate": 4.585470085470086e-06,
"loss": 0.2316,
"step": 1267
},
{
"epoch": 10.837606837606838,
"grad_norm": 2.901182174682617,
"learning_rate": 4.581196581196582e-06,
"loss": 0.0583,
"step": 1268
},
{
"epoch": 10.846153846153847,
"grad_norm": 4.579897403717041,
"learning_rate": 4.5769230769230775e-06,
"loss": 0.0536,
"step": 1269
},
{
"epoch": 10.854700854700855,
"grad_norm": 4.232446670532227,
"learning_rate": 4.5726495726495725e-06,
"loss": 0.17,
"step": 1270
},
{
"epoch": 10.863247863247864,
"grad_norm": 8.059329986572266,
"learning_rate": 4.568376068376068e-06,
"loss": 0.256,
"step": 1271
},
{
"epoch": 10.871794871794872,
"grad_norm": 1.5736984014511108,
"learning_rate": 4.564102564102564e-06,
"loss": 0.058,
"step": 1272
},
{
"epoch": 10.88034188034188,
"grad_norm": 5.397885799407959,
"learning_rate": 4.55982905982906e-06,
"loss": 0.1299,
"step": 1273
},
{
"epoch": 10.88888888888889,
"grad_norm": 3.9831533432006836,
"learning_rate": 4.555555555555556e-06,
"loss": 0.1762,
"step": 1274
},
{
"epoch": 10.897435897435898,
"grad_norm": 2.170370101928711,
"learning_rate": 4.551282051282052e-06,
"loss": 0.1355,
"step": 1275
},
{
"epoch": 10.905982905982906,
"grad_norm": 5.151463508605957,
"learning_rate": 4.547008547008547e-06,
"loss": 0.3151,
"step": 1276
},
{
"epoch": 10.914529914529915,
"grad_norm": 2.215559482574463,
"learning_rate": 4.542735042735043e-06,
"loss": 0.1054,
"step": 1277
},
{
"epoch": 10.923076923076923,
"grad_norm": 3.62188458442688,
"learning_rate": 4.538461538461539e-06,
"loss": 0.3839,
"step": 1278
},
{
"epoch": 10.931623931623932,
"grad_norm": 1.8855514526367188,
"learning_rate": 4.534188034188035e-06,
"loss": 0.0639,
"step": 1279
},
{
"epoch": 10.94017094017094,
"grad_norm": 3.0260651111602783,
"learning_rate": 4.5299145299145306e-06,
"loss": 0.1216,
"step": 1280
},
{
"epoch": 10.948717948717949,
"grad_norm": 13.30820083618164,
"learning_rate": 4.525641025641026e-06,
"loss": 0.3337,
"step": 1281
},
{
"epoch": 10.957264957264957,
"grad_norm": 4.356720447540283,
"learning_rate": 4.5213675213675215e-06,
"loss": 0.2692,
"step": 1282
},
{
"epoch": 10.965811965811966,
"grad_norm": 2.077742576599121,
"learning_rate": 4.517094017094017e-06,
"loss": 0.1181,
"step": 1283
},
{
"epoch": 10.974358974358974,
"grad_norm": 6.6224284172058105,
"learning_rate": 4.512820512820513e-06,
"loss": 0.1526,
"step": 1284
},
{
"epoch": 10.982905982905983,
"grad_norm": 4.072678565979004,
"learning_rate": 4.508547008547009e-06,
"loss": 0.1804,
"step": 1285
},
{
"epoch": 10.991452991452991,
"grad_norm": 3.430922269821167,
"learning_rate": 4.504273504273505e-06,
"loss": 0.1316,
"step": 1286
},
{
"epoch": 11.0,
"grad_norm": 1.6371959447860718,
"learning_rate": 4.5e-06,
"loss": 0.0596,
"step": 1287
},
{
"epoch": 11.0,
"eval_loss": 0.08654214441776276,
"eval_runtime": 9.3013,
"eval_samples_per_second": 50.1,
"eval_steps_per_second": 6.343,
"step": 1287
},
{
"epoch": 11.008547008547009,
"grad_norm": 5.072701454162598,
"learning_rate": 4.495726495726496e-06,
"loss": 0.2195,
"step": 1288
},
{
"epoch": 11.017094017094017,
"grad_norm": 6.791895389556885,
"learning_rate": 4.491452991452992e-06,
"loss": 0.5354,
"step": 1289
},
{
"epoch": 11.025641025641026,
"grad_norm": 12.475218772888184,
"learning_rate": 4.487179487179488e-06,
"loss": 0.1828,
"step": 1290
},
{
"epoch": 11.034188034188034,
"grad_norm": 5.892624855041504,
"learning_rate": 4.482905982905984e-06,
"loss": 0.1617,
"step": 1291
},
{
"epoch": 11.042735042735043,
"grad_norm": 1.742074728012085,
"learning_rate": 4.4786324786324796e-06,
"loss": 0.0508,
"step": 1292
},
{
"epoch": 11.051282051282051,
"grad_norm": 2.389373302459717,
"learning_rate": 4.474358974358975e-06,
"loss": 0.1009,
"step": 1293
},
{
"epoch": 11.05982905982906,
"grad_norm": 3.7152106761932373,
"learning_rate": 4.4700854700854705e-06,
"loss": 0.2157,
"step": 1294
},
{
"epoch": 11.068376068376068,
"grad_norm": 7.217955112457275,
"learning_rate": 4.465811965811966e-06,
"loss": 0.2737,
"step": 1295
},
{
"epoch": 11.076923076923077,
"grad_norm": 2.0971977710723877,
"learning_rate": 4.461538461538462e-06,
"loss": 0.1273,
"step": 1296
},
{
"epoch": 11.085470085470085,
"grad_norm": 1.1616859436035156,
"learning_rate": 4.457264957264958e-06,
"loss": 0.0325,
"step": 1297
},
{
"epoch": 11.094017094017094,
"grad_norm": 3.4287424087524414,
"learning_rate": 4.452991452991453e-06,
"loss": 0.1136,
"step": 1298
},
{
"epoch": 11.102564102564102,
"grad_norm": 1.6207005977630615,
"learning_rate": 4.448717948717949e-06,
"loss": 0.0344,
"step": 1299
},
{
"epoch": 11.11111111111111,
"grad_norm": 3.009976863861084,
"learning_rate": 4.444444444444444e-06,
"loss": 0.1532,
"step": 1300
},
{
"epoch": 11.11965811965812,
"grad_norm": 2.9768505096435547,
"learning_rate": 4.44017094017094e-06,
"loss": 0.0874,
"step": 1301
},
{
"epoch": 11.128205128205128,
"grad_norm": 3.622715473175049,
"learning_rate": 4.435897435897436e-06,
"loss": 0.3132,
"step": 1302
},
{
"epoch": 11.136752136752136,
"grad_norm": 3.5741326808929443,
"learning_rate": 4.431623931623932e-06,
"loss": 0.0914,
"step": 1303
},
{
"epoch": 11.145299145299145,
"grad_norm": 7.436197280883789,
"learning_rate": 4.427350427350428e-06,
"loss": 0.329,
"step": 1304
},
{
"epoch": 11.153846153846153,
"grad_norm": 2.390066146850586,
"learning_rate": 4.423076923076924e-06,
"loss": 0.0867,
"step": 1305
},
{
"epoch": 11.162393162393162,
"grad_norm": 1.928227424621582,
"learning_rate": 4.418803418803419e-06,
"loss": 0.0294,
"step": 1306
},
{
"epoch": 11.17094017094017,
"grad_norm": 4.40464448928833,
"learning_rate": 4.4145299145299145e-06,
"loss": 0.3704,
"step": 1307
},
{
"epoch": 11.179487179487179,
"grad_norm": 22.183835983276367,
"learning_rate": 4.4102564102564104e-06,
"loss": 0.6011,
"step": 1308
},
{
"epoch": 11.188034188034187,
"grad_norm": 2.496633768081665,
"learning_rate": 4.405982905982906e-06,
"loss": 0.0494,
"step": 1309
},
{
"epoch": 11.196581196581196,
"grad_norm": 1.142687201499939,
"learning_rate": 4.401709401709402e-06,
"loss": 0.0292,
"step": 1310
},
{
"epoch": 11.205128205128204,
"grad_norm": 2.0762455463409424,
"learning_rate": 4.397435897435898e-06,
"loss": 0.1123,
"step": 1311
},
{
"epoch": 11.213675213675213,
"grad_norm": 1.5389565229415894,
"learning_rate": 4.393162393162393e-06,
"loss": 0.0316,
"step": 1312
},
{
"epoch": 11.222222222222221,
"grad_norm": 4.252040386199951,
"learning_rate": 4.388888888888889e-06,
"loss": 0.0832,
"step": 1313
},
{
"epoch": 11.23076923076923,
"grad_norm": 2.1999545097351074,
"learning_rate": 4.384615384615385e-06,
"loss": 0.1121,
"step": 1314
},
{
"epoch": 11.239316239316238,
"grad_norm": 3.3256099224090576,
"learning_rate": 4.380341880341881e-06,
"loss": 0.1288,
"step": 1315
},
{
"epoch": 11.247863247863247,
"grad_norm": 2.6664986610412598,
"learning_rate": 4.376068376068377e-06,
"loss": 0.1044,
"step": 1316
},
{
"epoch": 11.256410256410255,
"grad_norm": 4.103114604949951,
"learning_rate": 4.371794871794872e-06,
"loss": 0.3115,
"step": 1317
},
{
"epoch": 11.264957264957266,
"grad_norm": 2.717532157897949,
"learning_rate": 4.367521367521368e-06,
"loss": 0.1144,
"step": 1318
},
{
"epoch": 11.273504273504274,
"grad_norm": 2.7918317317962646,
"learning_rate": 4.3632478632478635e-06,
"loss": 0.1205,
"step": 1319
},
{
"epoch": 11.282051282051283,
"grad_norm": 2.439854383468628,
"learning_rate": 4.358974358974359e-06,
"loss": 0.05,
"step": 1320
},
{
"epoch": 11.290598290598291,
"grad_norm": 1.3528865575790405,
"learning_rate": 4.354700854700855e-06,
"loss": 0.0437,
"step": 1321
},
{
"epoch": 11.2991452991453,
"grad_norm": 3.3273401260375977,
"learning_rate": 4.350427350427351e-06,
"loss": 0.1417,
"step": 1322
},
{
"epoch": 11.307692307692308,
"grad_norm": 4.022815704345703,
"learning_rate": 4.346153846153846e-06,
"loss": 0.0845,
"step": 1323
},
{
"epoch": 11.316239316239317,
"grad_norm": 5.169338703155518,
"learning_rate": 4.341880341880342e-06,
"loss": 0.5235,
"step": 1324
},
{
"epoch": 11.324786324786325,
"grad_norm": 1.8199687004089355,
"learning_rate": 4.337606837606838e-06,
"loss": 0.0399,
"step": 1325
},
{
"epoch": 11.333333333333334,
"grad_norm": 3.3616087436676025,
"learning_rate": 4.333333333333334e-06,
"loss": 0.1428,
"step": 1326
},
{
"epoch": 11.341880341880342,
"grad_norm": 14.056232452392578,
"learning_rate": 4.32905982905983e-06,
"loss": 0.2921,
"step": 1327
},
{
"epoch": 11.350427350427351,
"grad_norm": 2.3905317783355713,
"learning_rate": 4.324786324786326e-06,
"loss": 0.0478,
"step": 1328
},
{
"epoch": 11.35897435897436,
"grad_norm": 9.876815795898438,
"learning_rate": 4.320512820512821e-06,
"loss": 0.1926,
"step": 1329
},
{
"epoch": 11.367521367521368,
"grad_norm": 1.3726049661636353,
"learning_rate": 4.316239316239317e-06,
"loss": 0.0416,
"step": 1330
},
{
"epoch": 11.376068376068377,
"grad_norm": 3.0890841484069824,
"learning_rate": 4.3119658119658125e-06,
"loss": 0.0614,
"step": 1331
},
{
"epoch": 11.384615384615385,
"grad_norm": 2.858560562133789,
"learning_rate": 4.307692307692308e-06,
"loss": 0.2068,
"step": 1332
},
{
"epoch": 11.393162393162394,
"grad_norm": 4.6819658279418945,
"learning_rate": 4.303418803418804e-06,
"loss": 0.5773,
"step": 1333
},
{
"epoch": 11.401709401709402,
"grad_norm": 1.741450548171997,
"learning_rate": 4.299145299145299e-06,
"loss": 0.0505,
"step": 1334
},
{
"epoch": 11.41025641025641,
"grad_norm": 3.5882327556610107,
"learning_rate": 4.294871794871795e-06,
"loss": 0.1797,
"step": 1335
},
{
"epoch": 11.418803418803419,
"grad_norm": 3.59714937210083,
"learning_rate": 4.29059829059829e-06,
"loss": 0.1531,
"step": 1336
},
{
"epoch": 11.427350427350428,
"grad_norm": 3.619572877883911,
"learning_rate": 4.286324786324786e-06,
"loss": 0.1028,
"step": 1337
},
{
"epoch": 11.435897435897436,
"grad_norm": 3.9230782985687256,
"learning_rate": 4.282051282051282e-06,
"loss": 0.2404,
"step": 1338
},
{
"epoch": 11.444444444444445,
"grad_norm": 3.6987717151641846,
"learning_rate": 4.277777777777778e-06,
"loss": 0.1795,
"step": 1339
},
{
"epoch": 11.452991452991453,
"grad_norm": 3.322707176208496,
"learning_rate": 4.273504273504274e-06,
"loss": 0.0968,
"step": 1340
},
{
"epoch": 11.461538461538462,
"grad_norm": 1.2378501892089844,
"learning_rate": 4.26923076923077e-06,
"loss": 0.0387,
"step": 1341
},
{
"epoch": 11.47008547008547,
"grad_norm": 2.6801578998565674,
"learning_rate": 4.264957264957265e-06,
"loss": 0.0475,
"step": 1342
},
{
"epoch": 11.478632478632479,
"grad_norm": 2.2003352642059326,
"learning_rate": 4.260683760683761e-06,
"loss": 0.0505,
"step": 1343
},
{
"epoch": 11.487179487179487,
"grad_norm": 1.701341152191162,
"learning_rate": 4.2564102564102566e-06,
"loss": 0.064,
"step": 1344
},
{
"epoch": 11.495726495726496,
"grad_norm": 9.939803123474121,
"learning_rate": 4.2521367521367524e-06,
"loss": 0.461,
"step": 1345
},
{
"epoch": 11.504273504273504,
"grad_norm": 3.2999305725097656,
"learning_rate": 4.247863247863248e-06,
"loss": 0.1653,
"step": 1346
},
{
"epoch": 11.512820512820513,
"grad_norm": 3.9968252182006836,
"learning_rate": 4.243589743589744e-06,
"loss": 0.123,
"step": 1347
},
{
"epoch": 11.521367521367521,
"grad_norm": 2.846968173980713,
"learning_rate": 4.239316239316239e-06,
"loss": 0.1161,
"step": 1348
},
{
"epoch": 11.52991452991453,
"grad_norm": 4.328092575073242,
"learning_rate": 4.235042735042735e-06,
"loss": 0.065,
"step": 1349
},
{
"epoch": 11.538461538461538,
"grad_norm": 3.649003267288208,
"learning_rate": 4.230769230769231e-06,
"loss": 0.1919,
"step": 1350
},
{
"epoch": 11.547008547008547,
"grad_norm": 4.094634056091309,
"learning_rate": 4.226495726495727e-06,
"loss": 0.1728,
"step": 1351
},
{
"epoch": 11.555555555555555,
"grad_norm": 2.3904240131378174,
"learning_rate": 4.222222222222223e-06,
"loss": 0.105,
"step": 1352
},
{
"epoch": 11.564102564102564,
"grad_norm": 1.8493746519088745,
"learning_rate": 4.217948717948718e-06,
"loss": 0.0373,
"step": 1353
},
{
"epoch": 11.572649572649572,
"grad_norm": 4.690928936004639,
"learning_rate": 4.213675213675214e-06,
"loss": 0.3405,
"step": 1354
},
{
"epoch": 11.581196581196581,
"grad_norm": 6.808948516845703,
"learning_rate": 4.20940170940171e-06,
"loss": 0.1308,
"step": 1355
},
{
"epoch": 11.58974358974359,
"grad_norm": 6.060946464538574,
"learning_rate": 4.2051282051282055e-06,
"loss": 0.1494,
"step": 1356
},
{
"epoch": 11.598290598290598,
"grad_norm": 1.5923279523849487,
"learning_rate": 4.2008547008547014e-06,
"loss": 0.044,
"step": 1357
},
{
"epoch": 11.606837606837606,
"grad_norm": 1.7796354293823242,
"learning_rate": 4.196581196581197e-06,
"loss": 0.0558,
"step": 1358
},
{
"epoch": 11.615384615384615,
"grad_norm": 1.2209490537643433,
"learning_rate": 4.192307692307692e-06,
"loss": 0.0492,
"step": 1359
},
{
"epoch": 11.623931623931623,
"grad_norm": 4.0859880447387695,
"learning_rate": 4.188034188034188e-06,
"loss": 0.0759,
"step": 1360
},
{
"epoch": 11.632478632478632,
"grad_norm": 3.5021755695343018,
"learning_rate": 4.183760683760684e-06,
"loss": 0.1263,
"step": 1361
},
{
"epoch": 11.64102564102564,
"grad_norm": 2.5915517807006836,
"learning_rate": 4.17948717948718e-06,
"loss": 0.1949,
"step": 1362
},
{
"epoch": 11.649572649572649,
"grad_norm": 2.8024656772613525,
"learning_rate": 4.175213675213676e-06,
"loss": 0.2325,
"step": 1363
},
{
"epoch": 11.658119658119658,
"grad_norm": 5.795172691345215,
"learning_rate": 4.170940170940172e-06,
"loss": 0.3253,
"step": 1364
},
{
"epoch": 11.666666666666666,
"grad_norm": 5.056031227111816,
"learning_rate": 4.166666666666667e-06,
"loss": 0.102,
"step": 1365
},
{
"epoch": 11.675213675213675,
"grad_norm": 6.092950820922852,
"learning_rate": 4.162393162393163e-06,
"loss": 0.1938,
"step": 1366
},
{
"epoch": 11.683760683760683,
"grad_norm": 4.44755744934082,
"learning_rate": 4.158119658119659e-06,
"loss": 0.1588,
"step": 1367
},
{
"epoch": 11.692307692307692,
"grad_norm": 171.19509887695312,
"learning_rate": 4.1538461538461545e-06,
"loss": 0.3077,
"step": 1368
},
{
"epoch": 11.7008547008547,
"grad_norm": 13.992602348327637,
"learning_rate": 4.1495726495726504e-06,
"loss": 0.4401,
"step": 1369
},
{
"epoch": 11.709401709401709,
"grad_norm": 2.2174923419952393,
"learning_rate": 4.145299145299146e-06,
"loss": 0.1751,
"step": 1370
},
{
"epoch": 11.717948717948717,
"grad_norm": 2.031663179397583,
"learning_rate": 4.141025641025641e-06,
"loss": 0.049,
"step": 1371
},
{
"epoch": 11.726495726495726,
"grad_norm": 4.201449394226074,
"learning_rate": 4.136752136752136e-06,
"loss": 0.1016,
"step": 1372
},
{
"epoch": 11.735042735042736,
"grad_norm": 3.953226089477539,
"learning_rate": 4.132478632478632e-06,
"loss": 0.1336,
"step": 1373
},
{
"epoch": 11.743589743589745,
"grad_norm": 1.4856081008911133,
"learning_rate": 4.128205128205128e-06,
"loss": 0.0537,
"step": 1374
},
{
"epoch": 11.752136752136753,
"grad_norm": 1.2989288568496704,
"learning_rate": 4.123931623931624e-06,
"loss": 0.0351,
"step": 1375
},
{
"epoch": 11.760683760683762,
"grad_norm": 4.335974216461182,
"learning_rate": 4.11965811965812e-06,
"loss": 0.0722,
"step": 1376
},
{
"epoch": 11.76923076923077,
"grad_norm": 6.298306941986084,
"learning_rate": 4.115384615384616e-06,
"loss": 0.2359,
"step": 1377
},
{
"epoch": 11.777777777777779,
"grad_norm": 0.7119566798210144,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0192,
"step": 1378
},
{
"epoch": 11.786324786324787,
"grad_norm": 2.7993624210357666,
"learning_rate": 4.106837606837607e-06,
"loss": 0.0605,
"step": 1379
},
{
"epoch": 11.794871794871796,
"grad_norm": 6.566782474517822,
"learning_rate": 4.102564102564103e-06,
"loss": 0.3883,
"step": 1380
},
{
"epoch": 11.803418803418804,
"grad_norm": 8.177830696105957,
"learning_rate": 4.0982905982905986e-06,
"loss": 0.257,
"step": 1381
},
{
"epoch": 11.811965811965813,
"grad_norm": 4.04230260848999,
"learning_rate": 4.0940170940170945e-06,
"loss": 0.0943,
"step": 1382
},
{
"epoch": 11.820512820512821,
"grad_norm": 3.595386505126953,
"learning_rate": 4.08974358974359e-06,
"loss": 0.0533,
"step": 1383
},
{
"epoch": 11.82905982905983,
"grad_norm": 3.755312204360962,
"learning_rate": 4.085470085470085e-06,
"loss": 0.0468,
"step": 1384
},
{
"epoch": 11.837606837606838,
"grad_norm": 2.0697362422943115,
"learning_rate": 4.081196581196581e-06,
"loss": 0.063,
"step": 1385
},
{
"epoch": 11.846153846153847,
"grad_norm": 7.690021991729736,
"learning_rate": 4.076923076923077e-06,
"loss": 0.2415,
"step": 1386
},
{
"epoch": 11.854700854700855,
"grad_norm": 3.0239031314849854,
"learning_rate": 4.072649572649573e-06,
"loss": 0.1257,
"step": 1387
},
{
"epoch": 11.863247863247864,
"grad_norm": 2.263847589492798,
"learning_rate": 4.068376068376069e-06,
"loss": 0.132,
"step": 1388
},
{
"epoch": 11.871794871794872,
"grad_norm": 2.9513261318206787,
"learning_rate": 4.064102564102565e-06,
"loss": 0.1229,
"step": 1389
},
{
"epoch": 11.88034188034188,
"grad_norm": 3.03973388671875,
"learning_rate": 4.05982905982906e-06,
"loss": 0.0966,
"step": 1390
},
{
"epoch": 11.88888888888889,
"grad_norm": 1.0075026750564575,
"learning_rate": 4.055555555555556e-06,
"loss": 0.0284,
"step": 1391
},
{
"epoch": 11.897435897435898,
"grad_norm": 1.5330802202224731,
"learning_rate": 4.051282051282052e-06,
"loss": 0.0614,
"step": 1392
},
{
"epoch": 11.905982905982906,
"grad_norm": 3.6498589515686035,
"learning_rate": 4.0470085470085476e-06,
"loss": 0.2236,
"step": 1393
},
{
"epoch": 11.914529914529915,
"grad_norm": 4.659658908843994,
"learning_rate": 4.0427350427350435e-06,
"loss": 0.3245,
"step": 1394
},
{
"epoch": 11.923076923076923,
"grad_norm": 3.921703815460205,
"learning_rate": 4.0384615384615385e-06,
"loss": 0.2981,
"step": 1395
},
{
"epoch": 11.931623931623932,
"grad_norm": 5.816749572753906,
"learning_rate": 4.034188034188034e-06,
"loss": 0.1606,
"step": 1396
},
{
"epoch": 11.94017094017094,
"grad_norm": 1.2831742763519287,
"learning_rate": 4.02991452991453e-06,
"loss": 0.0307,
"step": 1397
},
{
"epoch": 11.948717948717949,
"grad_norm": 5.745227813720703,
"learning_rate": 4.025641025641026e-06,
"loss": 0.5323,
"step": 1398
},
{
"epoch": 11.957264957264957,
"grad_norm": 2.4196462631225586,
"learning_rate": 4.021367521367522e-06,
"loss": 0.09,
"step": 1399
},
{
"epoch": 11.965811965811966,
"grad_norm": 8.084505081176758,
"learning_rate": 4.017094017094018e-06,
"loss": 0.2991,
"step": 1400
},
{
"epoch": 11.974358974358974,
"grad_norm": 3.786708116531372,
"learning_rate": 4.012820512820513e-06,
"loss": 0.2163,
"step": 1401
},
{
"epoch": 11.982905982905983,
"grad_norm": 4.76535701751709,
"learning_rate": 4.008547008547009e-06,
"loss": 0.2453,
"step": 1402
},
{
"epoch": 11.991452991452991,
"grad_norm": 7.380269527435303,
"learning_rate": 4.004273504273505e-06,
"loss": 0.3525,
"step": 1403
},
{
"epoch": 12.0,
"grad_norm": 41.21335983276367,
"learning_rate": 4.000000000000001e-06,
"loss": 0.2139,
"step": 1404
},
{
"epoch": 12.0,
"eval_loss": 0.07730000466108322,
"eval_runtime": 9.2426,
"eval_samples_per_second": 50.419,
"eval_steps_per_second": 6.383,
"step": 1404
},
{
"epoch": 12.008547008547009,
"grad_norm": 2.3692574501037598,
"learning_rate": 3.9957264957264966e-06,
"loss": 0.0939,
"step": 1405
},
{
"epoch": 12.017094017094017,
"grad_norm": 8.087658882141113,
"learning_rate": 3.9914529914529924e-06,
"loss": 0.2801,
"step": 1406
},
{
"epoch": 12.025641025641026,
"grad_norm": 8.448614120483398,
"learning_rate": 3.9871794871794875e-06,
"loss": 0.2069,
"step": 1407
},
{
"epoch": 12.034188034188034,
"grad_norm": 1.8581651449203491,
"learning_rate": 3.982905982905983e-06,
"loss": 0.0509,
"step": 1408
},
{
"epoch": 12.042735042735043,
"grad_norm": 1.711654543876648,
"learning_rate": 3.9786324786324784e-06,
"loss": 0.0464,
"step": 1409
},
{
"epoch": 12.051282051282051,
"grad_norm": 1.482553482055664,
"learning_rate": 3.974358974358974e-06,
"loss": 0.028,
"step": 1410
},
{
"epoch": 12.05982905982906,
"grad_norm": 8.005542755126953,
"learning_rate": 3.97008547008547e-06,
"loss": 0.2587,
"step": 1411
},
{
"epoch": 12.068376068376068,
"grad_norm": 2.1153948307037354,
"learning_rate": 3.965811965811966e-06,
"loss": 0.0563,
"step": 1412
},
{
"epoch": 12.076923076923077,
"grad_norm": 7.791186809539795,
"learning_rate": 3.961538461538462e-06,
"loss": 0.0587,
"step": 1413
},
{
"epoch": 12.085470085470085,
"grad_norm": 21.04537582397461,
"learning_rate": 3.957264957264957e-06,
"loss": 0.252,
"step": 1414
},
{
"epoch": 12.094017094017094,
"grad_norm": 3.144742727279663,
"learning_rate": 3.952991452991453e-06,
"loss": 0.2207,
"step": 1415
},
{
"epoch": 12.102564102564102,
"grad_norm": 2.23223614692688,
"learning_rate": 3.948717948717949e-06,
"loss": 0.0923,
"step": 1416
},
{
"epoch": 12.11111111111111,
"grad_norm": 3.5652217864990234,
"learning_rate": 3.944444444444445e-06,
"loss": 0.2197,
"step": 1417
},
{
"epoch": 12.11965811965812,
"grad_norm": 3.1105499267578125,
"learning_rate": 3.940170940170941e-06,
"loss": 0.071,
"step": 1418
},
{
"epoch": 12.128205128205128,
"grad_norm": 2.525405168533325,
"learning_rate": 3.9358974358974365e-06,
"loss": 0.0874,
"step": 1419
},
{
"epoch": 12.136752136752136,
"grad_norm": 4.479174613952637,
"learning_rate": 3.9316239316239315e-06,
"loss": 0.1872,
"step": 1420
},
{
"epoch": 12.145299145299145,
"grad_norm": 2.0484113693237305,
"learning_rate": 3.927350427350427e-06,
"loss": 0.0739,
"step": 1421
},
{
"epoch": 12.153846153846153,
"grad_norm": 2.014679431915283,
"learning_rate": 3.923076923076923e-06,
"loss": 0.1089,
"step": 1422
},
{
"epoch": 12.162393162393162,
"grad_norm": 4.71014404296875,
"learning_rate": 3.918803418803419e-06,
"loss": 0.3136,
"step": 1423
},
{
"epoch": 12.17094017094017,
"grad_norm": 2.1372437477111816,
"learning_rate": 3.914529914529915e-06,
"loss": 0.0458,
"step": 1424
},
{
"epoch": 12.179487179487179,
"grad_norm": 1.4595564603805542,
"learning_rate": 3.910256410256411e-06,
"loss": 0.0601,
"step": 1425
},
{
"epoch": 12.188034188034187,
"grad_norm": 4.45602560043335,
"learning_rate": 3.905982905982906e-06,
"loss": 0.091,
"step": 1426
},
{
"epoch": 12.196581196581196,
"grad_norm": 1.473585844039917,
"learning_rate": 3.901709401709402e-06,
"loss": 0.0515,
"step": 1427
},
{
"epoch": 12.205128205128204,
"grad_norm": 1.8761534690856934,
"learning_rate": 3.897435897435898e-06,
"loss": 0.055,
"step": 1428
},
{
"epoch": 12.213675213675213,
"grad_norm": 0.7121579647064209,
"learning_rate": 3.893162393162394e-06,
"loss": 0.0197,
"step": 1429
},
{
"epoch": 12.222222222222221,
"grad_norm": 2.0035219192504883,
"learning_rate": 3.88888888888889e-06,
"loss": 0.0904,
"step": 1430
},
{
"epoch": 12.23076923076923,
"grad_norm": 3.820181369781494,
"learning_rate": 3.884615384615385e-06,
"loss": 0.2415,
"step": 1431
},
{
"epoch": 12.239316239316238,
"grad_norm": 3.40633225440979,
"learning_rate": 3.8803418803418805e-06,
"loss": 0.0593,
"step": 1432
},
{
"epoch": 12.247863247863247,
"grad_norm": 7.093897342681885,
"learning_rate": 3.876068376068376e-06,
"loss": 0.2504,
"step": 1433
},
{
"epoch": 12.256410256410255,
"grad_norm": 2.1057517528533936,
"learning_rate": 3.871794871794872e-06,
"loss": 0.0573,
"step": 1434
},
{
"epoch": 12.264957264957266,
"grad_norm": 4.797401428222656,
"learning_rate": 3.867521367521368e-06,
"loss": 0.338,
"step": 1435
},
{
"epoch": 12.273504273504274,
"grad_norm": 20.711339950561523,
"learning_rate": 3.863247863247864e-06,
"loss": 0.1964,
"step": 1436
},
{
"epoch": 12.282051282051283,
"grad_norm": 2.725280523300171,
"learning_rate": 3.858974358974359e-06,
"loss": 0.1837,
"step": 1437
},
{
"epoch": 12.290598290598291,
"grad_norm": 0.9469479322433472,
"learning_rate": 3.854700854700855e-06,
"loss": 0.0231,
"step": 1438
},
{
"epoch": 12.2991452991453,
"grad_norm": 2.0424935817718506,
"learning_rate": 3.850427350427351e-06,
"loss": 0.1373,
"step": 1439
},
{
"epoch": 12.307692307692308,
"grad_norm": 1.4781558513641357,
"learning_rate": 3.846153846153847e-06,
"loss": 0.0393,
"step": 1440
},
{
"epoch": 12.316239316239317,
"grad_norm": 3.7576427459716797,
"learning_rate": 3.841880341880343e-06,
"loss": 0.1134,
"step": 1441
},
{
"epoch": 12.324786324786325,
"grad_norm": 299.5986633300781,
"learning_rate": 3.8376068376068386e-06,
"loss": 0.8017,
"step": 1442
},
{
"epoch": 12.333333333333334,
"grad_norm": 3.109199047088623,
"learning_rate": 3.833333333333334e-06,
"loss": 0.1014,
"step": 1443
},
{
"epoch": 12.341880341880342,
"grad_norm": 6.353960990905762,
"learning_rate": 3.8290598290598295e-06,
"loss": 0.3484,
"step": 1444
},
{
"epoch": 12.350427350427351,
"grad_norm": 12.957517623901367,
"learning_rate": 3.8247863247863246e-06,
"loss": 0.5644,
"step": 1445
},
{
"epoch": 12.35897435897436,
"grad_norm": 10.197676658630371,
"learning_rate": 3.8205128205128204e-06,
"loss": 0.1525,
"step": 1446
},
{
"epoch": 12.367521367521368,
"grad_norm": 1.7754546403884888,
"learning_rate": 3.816239316239316e-06,
"loss": 0.0259,
"step": 1447
},
{
"epoch": 12.376068376068377,
"grad_norm": 1.4237226247787476,
"learning_rate": 3.8119658119658122e-06,
"loss": 0.0307,
"step": 1448
},
{
"epoch": 12.384615384615385,
"grad_norm": 2.94474458694458,
"learning_rate": 3.8076923076923077e-06,
"loss": 0.1447,
"step": 1449
},
{
"epoch": 12.393162393162394,
"grad_norm": 3.7823615074157715,
"learning_rate": 3.8034188034188036e-06,
"loss": 0.071,
"step": 1450
},
{
"epoch": 12.401709401709402,
"grad_norm": 7.5281081199646,
"learning_rate": 3.7991452991452995e-06,
"loss": 0.1805,
"step": 1451
},
{
"epoch": 12.41025641025641,
"grad_norm": 2.523592233657837,
"learning_rate": 3.794871794871795e-06,
"loss": 0.0684,
"step": 1452
},
{
"epoch": 12.418803418803419,
"grad_norm": 2.423443078994751,
"learning_rate": 3.790598290598291e-06,
"loss": 0.0726,
"step": 1453
},
{
"epoch": 12.427350427350428,
"grad_norm": 6.3336005210876465,
"learning_rate": 3.7863247863247863e-06,
"loss": 0.1684,
"step": 1454
},
{
"epoch": 12.435897435897436,
"grad_norm": 248.31146240234375,
"learning_rate": 3.782051282051282e-06,
"loss": 0.6863,
"step": 1455
},
{
"epoch": 12.444444444444445,
"grad_norm": 3.0117695331573486,
"learning_rate": 3.777777777777778e-06,
"loss": 0.217,
"step": 1456
},
{
"epoch": 12.452991452991453,
"grad_norm": 1.4753539562225342,
"learning_rate": 3.7735042735042735e-06,
"loss": 0.0623,
"step": 1457
},
{
"epoch": 12.461538461538462,
"grad_norm": 2.095745325088501,
"learning_rate": 3.7692307692307694e-06,
"loss": 0.055,
"step": 1458
},
{
"epoch": 12.47008547008547,
"grad_norm": 3.508305788040161,
"learning_rate": 3.7649572649572653e-06,
"loss": 0.1097,
"step": 1459
},
{
"epoch": 12.478632478632479,
"grad_norm": 3.0965282917022705,
"learning_rate": 3.760683760683761e-06,
"loss": 0.3374,
"step": 1460
},
{
"epoch": 12.487179487179487,
"grad_norm": 0.7286785244941711,
"learning_rate": 3.7564102564102567e-06,
"loss": 0.0182,
"step": 1461
},
{
"epoch": 12.495726495726496,
"grad_norm": 5.957888126373291,
"learning_rate": 3.7521367521367526e-06,
"loss": 0.3498,
"step": 1462
},
{
"epoch": 12.504273504273504,
"grad_norm": 10.433263778686523,
"learning_rate": 3.747863247863248e-06,
"loss": 0.446,
"step": 1463
},
{
"epoch": 12.512820512820513,
"grad_norm": 4.565568923950195,
"learning_rate": 3.743589743589744e-06,
"loss": 0.1026,
"step": 1464
},
{
"epoch": 12.521367521367521,
"grad_norm": 2.607106924057007,
"learning_rate": 3.73931623931624e-06,
"loss": 0.0912,
"step": 1465
},
{
"epoch": 12.52991452991453,
"grad_norm": 2.415541410446167,
"learning_rate": 3.7350427350427353e-06,
"loss": 0.0594,
"step": 1466
},
{
"epoch": 12.538461538461538,
"grad_norm": 7.978870868682861,
"learning_rate": 3.730769230769231e-06,
"loss": 0.2617,
"step": 1467
},
{
"epoch": 12.547008547008547,
"grad_norm": 6.858293056488037,
"learning_rate": 3.726495726495727e-06,
"loss": 0.3642,
"step": 1468
},
{
"epoch": 12.555555555555555,
"grad_norm": 1.3900551795959473,
"learning_rate": 3.7222222222222225e-06,
"loss": 0.0445,
"step": 1469
},
{
"epoch": 12.564102564102564,
"grad_norm": 8.111970901489258,
"learning_rate": 3.7179487179487184e-06,
"loss": 0.1828,
"step": 1470
},
{
"epoch": 12.572649572649572,
"grad_norm": 2.731841802597046,
"learning_rate": 3.7136752136752143e-06,
"loss": 0.2027,
"step": 1471
},
{
"epoch": 12.581196581196581,
"grad_norm": 4.418527126312256,
"learning_rate": 3.7094017094017098e-06,
"loss": 0.1744,
"step": 1472
},
{
"epoch": 12.58974358974359,
"grad_norm": 2.8263015747070312,
"learning_rate": 3.7051282051282057e-06,
"loss": 0.1123,
"step": 1473
},
{
"epoch": 12.598290598290598,
"grad_norm": 2.3524725437164307,
"learning_rate": 3.700854700854701e-06,
"loss": 0.0999,
"step": 1474
},
{
"epoch": 12.606837606837606,
"grad_norm": 9.863709449768066,
"learning_rate": 3.696581196581197e-06,
"loss": 0.4589,
"step": 1475
},
{
"epoch": 12.615384615384615,
"grad_norm": 3.5506396293640137,
"learning_rate": 3.692307692307693e-06,
"loss": 0.2034,
"step": 1476
},
{
"epoch": 12.623931623931623,
"grad_norm": 2.4352779388427734,
"learning_rate": 3.6880341880341884e-06,
"loss": 0.0806,
"step": 1477
},
{
"epoch": 12.632478632478632,
"grad_norm": 1.8339797258377075,
"learning_rate": 3.6837606837606843e-06,
"loss": 0.0635,
"step": 1478
},
{
"epoch": 12.64102564102564,
"grad_norm": 4.63474178314209,
"learning_rate": 3.67948717948718e-06,
"loss": 0.4568,
"step": 1479
},
{
"epoch": 12.649572649572649,
"grad_norm": 7.696872711181641,
"learning_rate": 3.6752136752136756e-06,
"loss": 0.1769,
"step": 1480
},
{
"epoch": 12.658119658119658,
"grad_norm": 1.3894271850585938,
"learning_rate": 3.670940170940171e-06,
"loss": 0.0747,
"step": 1481
},
{
"epoch": 12.666666666666666,
"grad_norm": 5.607828140258789,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.1178,
"step": 1482
},
{
"epoch": 12.675213675213675,
"grad_norm": 2.120594024658203,
"learning_rate": 3.6623931623931625e-06,
"loss": 0.0497,
"step": 1483
},
{
"epoch": 12.683760683760683,
"grad_norm": 1.359381914138794,
"learning_rate": 3.6581196581196584e-06,
"loss": 0.035,
"step": 1484
},
{
"epoch": 12.692307692307692,
"grad_norm": 2.8533923625946045,
"learning_rate": 3.653846153846154e-06,
"loss": 0.1048,
"step": 1485
},
{
"epoch": 12.7008547008547,
"grad_norm": 6.021198749542236,
"learning_rate": 3.6495726495726497e-06,
"loss": 0.1604,
"step": 1486
},
{
"epoch": 12.709401709401709,
"grad_norm": 7.198216915130615,
"learning_rate": 3.6452991452991456e-06,
"loss": 0.1656,
"step": 1487
},
{
"epoch": 12.717948717948717,
"grad_norm": 1.4581981897354126,
"learning_rate": 3.641025641025641e-06,
"loss": 0.0398,
"step": 1488
},
{
"epoch": 12.726495726495726,
"grad_norm": 30.704627990722656,
"learning_rate": 3.636752136752137e-06,
"loss": 0.3371,
"step": 1489
},
{
"epoch": 12.735042735042736,
"grad_norm": 2.5204057693481445,
"learning_rate": 3.632478632478633e-06,
"loss": 0.0742,
"step": 1490
},
{
"epoch": 12.743589743589745,
"grad_norm": 2.3917508125305176,
"learning_rate": 3.6282051282051283e-06,
"loss": 0.1681,
"step": 1491
},
{
"epoch": 12.752136752136753,
"grad_norm": 1.4529337882995605,
"learning_rate": 3.623931623931624e-06,
"loss": 0.0247,
"step": 1492
},
{
"epoch": 12.760683760683762,
"grad_norm": 31.894805908203125,
"learning_rate": 3.6196581196581197e-06,
"loss": 0.2222,
"step": 1493
},
{
"epoch": 12.76923076923077,
"grad_norm": 3.4240164756774902,
"learning_rate": 3.6153846153846156e-06,
"loss": 0.1432,
"step": 1494
},
{
"epoch": 12.777777777777779,
"grad_norm": 2.0000102519989014,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.0383,
"step": 1495
},
{
"epoch": 12.786324786324787,
"grad_norm": 3.7665908336639404,
"learning_rate": 3.606837606837607e-06,
"loss": 0.2719,
"step": 1496
},
{
"epoch": 12.794871794871796,
"grad_norm": 2.0319290161132812,
"learning_rate": 3.602564102564103e-06,
"loss": 0.0741,
"step": 1497
},
{
"epoch": 12.803418803418804,
"grad_norm": 2.3379619121551514,
"learning_rate": 3.5982905982905987e-06,
"loss": 0.1155,
"step": 1498
},
{
"epoch": 12.811965811965813,
"grad_norm": 5.183985233306885,
"learning_rate": 3.594017094017094e-06,
"loss": 0.0815,
"step": 1499
},
{
"epoch": 12.820512820512821,
"grad_norm": 3.1432502269744873,
"learning_rate": 3.58974358974359e-06,
"loss": 0.1855,
"step": 1500
},
{
"epoch": 12.82905982905983,
"grad_norm": 4.5739946365356445,
"learning_rate": 3.585470085470086e-06,
"loss": 0.1424,
"step": 1501
},
{
"epoch": 12.837606837606838,
"grad_norm": 1.6006520986557007,
"learning_rate": 3.5811965811965814e-06,
"loss": 0.0305,
"step": 1502
},
{
"epoch": 12.846153846153847,
"grad_norm": 3.937011241912842,
"learning_rate": 3.5769230769230773e-06,
"loss": 0.2497,
"step": 1503
},
{
"epoch": 12.854700854700855,
"grad_norm": 2.6159651279449463,
"learning_rate": 3.572649572649573e-06,
"loss": 0.1067,
"step": 1504
},
{
"epoch": 12.863247863247864,
"grad_norm": 2.578547239303589,
"learning_rate": 3.5683760683760687e-06,
"loss": 0.0663,
"step": 1505
},
{
"epoch": 12.871794871794872,
"grad_norm": 2.3777639865875244,
"learning_rate": 3.5641025641025646e-06,
"loss": 0.0558,
"step": 1506
},
{
"epoch": 12.88034188034188,
"grad_norm": 7.5656561851501465,
"learning_rate": 3.5598290598290604e-06,
"loss": 0.2448,
"step": 1507
},
{
"epoch": 12.88888888888889,
"grad_norm": 4.21798849105835,
"learning_rate": 3.555555555555556e-06,
"loss": 0.1916,
"step": 1508
},
{
"epoch": 12.897435897435898,
"grad_norm": 1.318049669265747,
"learning_rate": 3.551282051282052e-06,
"loss": 0.0387,
"step": 1509
},
{
"epoch": 12.905982905982906,
"grad_norm": 2.4345362186431885,
"learning_rate": 3.5470085470085473e-06,
"loss": 0.061,
"step": 1510
},
{
"epoch": 12.914529914529915,
"grad_norm": 3.2767112255096436,
"learning_rate": 3.542735042735043e-06,
"loss": 0.1627,
"step": 1511
},
{
"epoch": 12.923076923076923,
"grad_norm": 6.881056785583496,
"learning_rate": 3.538461538461539e-06,
"loss": 0.2452,
"step": 1512
},
{
"epoch": 12.931623931623932,
"grad_norm": 8.017362594604492,
"learning_rate": 3.5341880341880345e-06,
"loss": 0.1972,
"step": 1513
},
{
"epoch": 12.94017094017094,
"grad_norm": 1.1411398649215698,
"learning_rate": 3.5299145299145304e-06,
"loss": 0.0243,
"step": 1514
},
{
"epoch": 12.948717948717949,
"grad_norm": 4.486563205718994,
"learning_rate": 3.5256410256410263e-06,
"loss": 0.1347,
"step": 1515
},
{
"epoch": 12.957264957264957,
"grad_norm": 2.348222494125366,
"learning_rate": 3.5213675213675218e-06,
"loss": 0.1828,
"step": 1516
},
{
"epoch": 12.965811965811966,
"grad_norm": 2.2855775356292725,
"learning_rate": 3.5170940170940177e-06,
"loss": 0.0465,
"step": 1517
},
{
"epoch": 12.974358974358974,
"grad_norm": 10.313456535339355,
"learning_rate": 3.5128205128205127e-06,
"loss": 0.3033,
"step": 1518
},
{
"epoch": 12.982905982905983,
"grad_norm": 12.115890502929688,
"learning_rate": 3.5085470085470086e-06,
"loss": 0.6762,
"step": 1519
},
{
"epoch": 12.991452991452991,
"grad_norm": 2.746267557144165,
"learning_rate": 3.5042735042735045e-06,
"loss": 0.123,
"step": 1520
},
{
"epoch": 13.0,
"grad_norm": 5.204991340637207,
"learning_rate": 3.5e-06,
"loss": 0.2086,
"step": 1521
},
{
"epoch": 13.0,
"eval_loss": 0.06878729909658432,
"eval_runtime": 9.2334,
"eval_samples_per_second": 50.469,
"eval_steps_per_second": 6.39,
"step": 1521
},
{
"epoch": 13.008547008547009,
"grad_norm": 1.8741862773895264,
"learning_rate": 3.495726495726496e-06,
"loss": 0.0594,
"step": 1522
},
{
"epoch": 13.017094017094017,
"grad_norm": 1.6060154438018799,
"learning_rate": 3.4914529914529917e-06,
"loss": 0.0426,
"step": 1523
},
{
"epoch": 13.025641025641026,
"grad_norm": 2.194714069366455,
"learning_rate": 3.487179487179487e-06,
"loss": 0.1907,
"step": 1524
},
{
"epoch": 13.034188034188034,
"grad_norm": 0.716149628162384,
"learning_rate": 3.482905982905983e-06,
"loss": 0.0177,
"step": 1525
},
{
"epoch": 13.042735042735043,
"grad_norm": 4.787989139556885,
"learning_rate": 3.478632478632479e-06,
"loss": 0.246,
"step": 1526
},
{
"epoch": 13.051282051282051,
"grad_norm": 1.662338137626648,
"learning_rate": 3.4743589743589744e-06,
"loss": 0.0561,
"step": 1527
},
{
"epoch": 13.05982905982906,
"grad_norm": 0.9663236737251282,
"learning_rate": 3.4700854700854703e-06,
"loss": 0.0392,
"step": 1528
},
{
"epoch": 13.068376068376068,
"grad_norm": 0.8232766389846802,
"learning_rate": 3.465811965811966e-06,
"loss": 0.0221,
"step": 1529
},
{
"epoch": 13.076923076923077,
"grad_norm": 2.434157609939575,
"learning_rate": 3.4615384615384617e-06,
"loss": 0.1777,
"step": 1530
},
{
"epoch": 13.085470085470085,
"grad_norm": 2.768070936203003,
"learning_rate": 3.4572649572649576e-06,
"loss": 0.1101,
"step": 1531
},
{
"epoch": 13.094017094017094,
"grad_norm": 2.061371088027954,
"learning_rate": 3.452991452991453e-06,
"loss": 0.0591,
"step": 1532
},
{
"epoch": 13.102564102564102,
"grad_norm": 1.6127598285675049,
"learning_rate": 3.448717948717949e-06,
"loss": 0.3858,
"step": 1533
},
{
"epoch": 13.11111111111111,
"grad_norm": 1.2561885118484497,
"learning_rate": 3.444444444444445e-06,
"loss": 0.0315,
"step": 1534
},
{
"epoch": 13.11965811965812,
"grad_norm": 2.2859408855438232,
"learning_rate": 3.4401709401709403e-06,
"loss": 0.047,
"step": 1535
},
{
"epoch": 13.128205128205128,
"grad_norm": 3.7528388500213623,
"learning_rate": 3.435897435897436e-06,
"loss": 0.1069,
"step": 1536
},
{
"epoch": 13.136752136752136,
"grad_norm": 5.547614574432373,
"learning_rate": 3.431623931623932e-06,
"loss": 0.1411,
"step": 1537
},
{
"epoch": 13.145299145299145,
"grad_norm": 1.6566565036773682,
"learning_rate": 3.4273504273504275e-06,
"loss": 0.0266,
"step": 1538
},
{
"epoch": 13.153846153846153,
"grad_norm": 5.280163288116455,
"learning_rate": 3.4230769230769234e-06,
"loss": 0.0843,
"step": 1539
},
{
"epoch": 13.162393162393162,
"grad_norm": 6.624744892120361,
"learning_rate": 3.4188034188034193e-06,
"loss": 0.1652,
"step": 1540
},
{
"epoch": 13.17094017094017,
"grad_norm": 5.325616359710693,
"learning_rate": 3.414529914529915e-06,
"loss": 0.077,
"step": 1541
},
{
"epoch": 13.179487179487179,
"grad_norm": 11.31779956817627,
"learning_rate": 3.4102564102564107e-06,
"loss": 0.4377,
"step": 1542
},
{
"epoch": 13.188034188034187,
"grad_norm": 4.86885404586792,
"learning_rate": 3.4059829059829066e-06,
"loss": 0.2312,
"step": 1543
},
{
"epoch": 13.196581196581196,
"grad_norm": 1.779068112373352,
"learning_rate": 3.401709401709402e-06,
"loss": 0.032,
"step": 1544
},
{
"epoch": 13.205128205128204,
"grad_norm": 1.9934108257293701,
"learning_rate": 3.397435897435898e-06,
"loss": 0.0861,
"step": 1545
},
{
"epoch": 13.213675213675213,
"grad_norm": 2.1829612255096436,
"learning_rate": 3.3931623931623934e-06,
"loss": 0.0855,
"step": 1546
},
{
"epoch": 13.222222222222221,
"grad_norm": 31.108810424804688,
"learning_rate": 3.3888888888888893e-06,
"loss": 0.334,
"step": 1547
},
{
"epoch": 13.23076923076923,
"grad_norm": 4.867705345153809,
"learning_rate": 3.384615384615385e-06,
"loss": 0.0808,
"step": 1548
},
{
"epoch": 13.239316239316238,
"grad_norm": 3.226783275604248,
"learning_rate": 3.3803418803418806e-06,
"loss": 0.1806,
"step": 1549
},
{
"epoch": 13.247863247863247,
"grad_norm": 1.4822824001312256,
"learning_rate": 3.3760683760683765e-06,
"loss": 0.0602,
"step": 1550
},
{
"epoch": 13.256410256410255,
"grad_norm": 4.529379844665527,
"learning_rate": 3.3717948717948724e-06,
"loss": 0.318,
"step": 1551
},
{
"epoch": 13.264957264957266,
"grad_norm": 3.2155706882476807,
"learning_rate": 3.367521367521368e-06,
"loss": 0.1006,
"step": 1552
},
{
"epoch": 13.273504273504274,
"grad_norm": 2.2805707454681396,
"learning_rate": 3.3632478632478638e-06,
"loss": 0.0774,
"step": 1553
},
{
"epoch": 13.282051282051283,
"grad_norm": 11.477370262145996,
"learning_rate": 3.358974358974359e-06,
"loss": 0.8342,
"step": 1554
},
{
"epoch": 13.290598290598291,
"grad_norm": 3.8596534729003906,
"learning_rate": 3.3547008547008547e-06,
"loss": 0.1924,
"step": 1555
},
{
"epoch": 13.2991452991453,
"grad_norm": 4.497336387634277,
"learning_rate": 3.3504273504273506e-06,
"loss": 0.2425,
"step": 1556
},
{
"epoch": 13.307692307692308,
"grad_norm": 1.4496978521347046,
"learning_rate": 3.346153846153846e-06,
"loss": 0.0168,
"step": 1557
},
{
"epoch": 13.316239316239317,
"grad_norm": 2.0277416706085205,
"learning_rate": 3.341880341880342e-06,
"loss": 0.0634,
"step": 1558
},
{
"epoch": 13.324786324786325,
"grad_norm": 2.9120066165924072,
"learning_rate": 3.337606837606838e-06,
"loss": 0.1153,
"step": 1559
},
{
"epoch": 13.333333333333334,
"grad_norm": 4.949625015258789,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.1412,
"step": 1560
},
{
"epoch": 13.341880341880342,
"grad_norm": 5.970853805541992,
"learning_rate": 3.3290598290598292e-06,
"loss": 0.1607,
"step": 1561
},
{
"epoch": 13.350427350427351,
"grad_norm": 2.1988022327423096,
"learning_rate": 3.324786324786325e-06,
"loss": 0.0329,
"step": 1562
},
{
"epoch": 13.35897435897436,
"grad_norm": 2.3578758239746094,
"learning_rate": 3.3205128205128206e-06,
"loss": 0.0711,
"step": 1563
},
{
"epoch": 13.367521367521368,
"grad_norm": 4.554023742675781,
"learning_rate": 3.3162393162393165e-06,
"loss": 0.1929,
"step": 1564
},
{
"epoch": 13.376068376068377,
"grad_norm": 3.577073335647583,
"learning_rate": 3.311965811965812e-06,
"loss": 0.0969,
"step": 1565
},
{
"epoch": 13.384615384615385,
"grad_norm": 3.3863015174865723,
"learning_rate": 3.307692307692308e-06,
"loss": 0.2402,
"step": 1566
},
{
"epoch": 13.393162393162394,
"grad_norm": 1.044550895690918,
"learning_rate": 3.3034188034188037e-06,
"loss": 0.026,
"step": 1567
},
{
"epoch": 13.401709401709402,
"grad_norm": 3.1525843143463135,
"learning_rate": 3.299145299145299e-06,
"loss": 0.0619,
"step": 1568
},
{
"epoch": 13.41025641025641,
"grad_norm": 2.0380606651306152,
"learning_rate": 3.294871794871795e-06,
"loss": 0.0477,
"step": 1569
},
{
"epoch": 13.418803418803419,
"grad_norm": 2.4260973930358887,
"learning_rate": 3.290598290598291e-06,
"loss": 0.0709,
"step": 1570
},
{
"epoch": 13.427350427350428,
"grad_norm": 20.958803176879883,
"learning_rate": 3.2863247863247864e-06,
"loss": 0.2297,
"step": 1571
},
{
"epoch": 13.435897435897436,
"grad_norm": 2.847252368927002,
"learning_rate": 3.2820512820512823e-06,
"loss": 0.0565,
"step": 1572
},
{
"epoch": 13.444444444444445,
"grad_norm": 3.646381139755249,
"learning_rate": 3.277777777777778e-06,
"loss": 0.3043,
"step": 1573
},
{
"epoch": 13.452991452991453,
"grad_norm": 3.0526609420776367,
"learning_rate": 3.2735042735042737e-06,
"loss": 0.0941,
"step": 1574
},
{
"epoch": 13.461538461538462,
"grad_norm": 1.6154388189315796,
"learning_rate": 3.2692307692307696e-06,
"loss": 0.0597,
"step": 1575
},
{
"epoch": 13.47008547008547,
"grad_norm": 1.0825392007827759,
"learning_rate": 3.2649572649572655e-06,
"loss": 0.0325,
"step": 1576
},
{
"epoch": 13.478632478632479,
"grad_norm": 6.045910358428955,
"learning_rate": 3.260683760683761e-06,
"loss": 0.2202,
"step": 1577
},
{
"epoch": 13.487179487179487,
"grad_norm": 3.0401153564453125,
"learning_rate": 3.256410256410257e-06,
"loss": 0.0923,
"step": 1578
},
{
"epoch": 13.495726495726496,
"grad_norm": 5.485551834106445,
"learning_rate": 3.2521367521367527e-06,
"loss": 0.3851,
"step": 1579
},
{
"epoch": 13.504273504273504,
"grad_norm": 2.575057029724121,
"learning_rate": 3.247863247863248e-06,
"loss": 0.0307,
"step": 1580
},
{
"epoch": 13.512820512820513,
"grad_norm": 2.7744545936584473,
"learning_rate": 3.243589743589744e-06,
"loss": 0.1791,
"step": 1581
},
{
"epoch": 13.521367521367521,
"grad_norm": 2.430640459060669,
"learning_rate": 3.2393162393162395e-06,
"loss": 0.1128,
"step": 1582
},
{
"epoch": 13.52991452991453,
"grad_norm": 4.902276992797852,
"learning_rate": 3.2350427350427354e-06,
"loss": 0.2661,
"step": 1583
},
{
"epoch": 13.538461538461538,
"grad_norm": 2.601134777069092,
"learning_rate": 3.2307692307692313e-06,
"loss": 0.1311,
"step": 1584
},
{
"epoch": 13.547008547008547,
"grad_norm": 6.309877395629883,
"learning_rate": 3.2264957264957268e-06,
"loss": 0.2621,
"step": 1585
},
{
"epoch": 13.555555555555555,
"grad_norm": 2.079618215560913,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.0702,
"step": 1586
},
{
"epoch": 13.564102564102564,
"grad_norm": 2.309541702270508,
"learning_rate": 3.2179487179487186e-06,
"loss": 0.1577,
"step": 1587
},
{
"epoch": 13.572649572649572,
"grad_norm": 4.723629951477051,
"learning_rate": 3.213675213675214e-06,
"loss": 0.142,
"step": 1588
},
{
"epoch": 13.581196581196581,
"grad_norm": 2.557123899459839,
"learning_rate": 3.20940170940171e-06,
"loss": 0.1506,
"step": 1589
},
{
"epoch": 13.58974358974359,
"grad_norm": 2.3154499530792236,
"learning_rate": 3.205128205128206e-06,
"loss": 0.1039,
"step": 1590
},
{
"epoch": 13.598290598290598,
"grad_norm": 1.5464012622833252,
"learning_rate": 3.200854700854701e-06,
"loss": 0.0989,
"step": 1591
},
{
"epoch": 13.606837606837606,
"grad_norm": 1.5885653495788574,
"learning_rate": 3.1965811965811967e-06,
"loss": 0.0278,
"step": 1592
},
{
"epoch": 13.615384615384615,
"grad_norm": 2.7710390090942383,
"learning_rate": 3.192307692307692e-06,
"loss": 0.0521,
"step": 1593
},
{
"epoch": 13.623931623931623,
"grad_norm": 4.587305545806885,
"learning_rate": 3.188034188034188e-06,
"loss": 0.2609,
"step": 1594
},
{
"epoch": 13.632478632478632,
"grad_norm": 4.343963623046875,
"learning_rate": 3.183760683760684e-06,
"loss": 0.1079,
"step": 1595
},
{
"epoch": 13.64102564102564,
"grad_norm": 2.7653536796569824,
"learning_rate": 3.1794871794871795e-06,
"loss": 0.1293,
"step": 1596
},
{
"epoch": 13.649572649572649,
"grad_norm": 3.1731350421905518,
"learning_rate": 3.1752136752136753e-06,
"loss": 0.1279,
"step": 1597
},
{
"epoch": 13.658119658119658,
"grad_norm": 8.032745361328125,
"learning_rate": 3.1709401709401712e-06,
"loss": 0.2114,
"step": 1598
},
{
"epoch": 13.666666666666666,
"grad_norm": 5.6177263259887695,
"learning_rate": 3.1666666666666667e-06,
"loss": 0.0926,
"step": 1599
},
{
"epoch": 13.675213675213675,
"grad_norm": 3.3568480014801025,
"learning_rate": 3.1623931623931626e-06,
"loss": 0.1299,
"step": 1600
},
{
"epoch": 13.683760683760683,
"grad_norm": 5.182860374450684,
"learning_rate": 3.158119658119658e-06,
"loss": 0.1688,
"step": 1601
},
{
"epoch": 13.692307692307692,
"grad_norm": 5.954287052154541,
"learning_rate": 3.153846153846154e-06,
"loss": 0.2634,
"step": 1602
},
{
"epoch": 13.7008547008547,
"grad_norm": 2.8563358783721924,
"learning_rate": 3.14957264957265e-06,
"loss": 0.0469,
"step": 1603
},
{
"epoch": 13.709401709401709,
"grad_norm": 1.6049034595489502,
"learning_rate": 3.1452991452991453e-06,
"loss": 0.0855,
"step": 1604
},
{
"epoch": 13.717948717948717,
"grad_norm": 1.9734570980072021,
"learning_rate": 3.141025641025641e-06,
"loss": 0.0554,
"step": 1605
},
{
"epoch": 13.726495726495726,
"grad_norm": 1.8398605585098267,
"learning_rate": 3.136752136752137e-06,
"loss": 0.1033,
"step": 1606
},
{
"epoch": 13.735042735042736,
"grad_norm": 3.3013346195220947,
"learning_rate": 3.1324786324786326e-06,
"loss": 0.1476,
"step": 1607
},
{
"epoch": 13.743589743589745,
"grad_norm": 1.2622041702270508,
"learning_rate": 3.1282051282051284e-06,
"loss": 0.0222,
"step": 1608
},
{
"epoch": 13.752136752136753,
"grad_norm": 3.983888626098633,
"learning_rate": 3.1239316239316243e-06,
"loss": 0.0861,
"step": 1609
},
{
"epoch": 13.760683760683762,
"grad_norm": 2.883335828781128,
"learning_rate": 3.11965811965812e-06,
"loss": 0.0737,
"step": 1610
},
{
"epoch": 13.76923076923077,
"grad_norm": 0.9045059680938721,
"learning_rate": 3.1153846153846157e-06,
"loss": 0.0232,
"step": 1611
},
{
"epoch": 13.777777777777779,
"grad_norm": 1.8752232789993286,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.0602,
"step": 1612
},
{
"epoch": 13.786324786324787,
"grad_norm": 3.088440418243408,
"learning_rate": 3.106837606837607e-06,
"loss": 0.102,
"step": 1613
},
{
"epoch": 13.794871794871796,
"grad_norm": 4.067224502563477,
"learning_rate": 3.102564102564103e-06,
"loss": 0.1461,
"step": 1614
},
{
"epoch": 13.803418803418804,
"grad_norm": 6.9123148918151855,
"learning_rate": 3.098290598290599e-06,
"loss": 0.0752,
"step": 1615
},
{
"epoch": 13.811965811965813,
"grad_norm": 17.15372657775879,
"learning_rate": 3.0940170940170943e-06,
"loss": 0.5163,
"step": 1616
},
{
"epoch": 13.820512820512821,
"grad_norm": 2.4951720237731934,
"learning_rate": 3.08974358974359e-06,
"loss": 0.1326,
"step": 1617
},
{
"epoch": 13.82905982905983,
"grad_norm": 2.1316449642181396,
"learning_rate": 3.0854700854700857e-06,
"loss": 0.0469,
"step": 1618
},
{
"epoch": 13.837606837606838,
"grad_norm": 2.5955941677093506,
"learning_rate": 3.0811965811965815e-06,
"loss": 0.1056,
"step": 1619
},
{
"epoch": 13.846153846153847,
"grad_norm": 14.360347747802734,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.4793,
"step": 1620
},
{
"epoch": 13.854700854700855,
"grad_norm": 1.9134567975997925,
"learning_rate": 3.072649572649573e-06,
"loss": 0.054,
"step": 1621
},
{
"epoch": 13.863247863247864,
"grad_norm": 3.1168692111968994,
"learning_rate": 3.068376068376069e-06,
"loss": 0.321,
"step": 1622
},
{
"epoch": 13.871794871794872,
"grad_norm": 4.940008163452148,
"learning_rate": 3.0641025641025647e-06,
"loss": 0.1452,
"step": 1623
},
{
"epoch": 13.88034188034188,
"grad_norm": 3.001660108566284,
"learning_rate": 3.05982905982906e-06,
"loss": 0.1094,
"step": 1624
},
{
"epoch": 13.88888888888889,
"grad_norm": 1.3110100030899048,
"learning_rate": 3.055555555555556e-06,
"loss": 0.0305,
"step": 1625
},
{
"epoch": 13.897435897435898,
"grad_norm": 269.3442077636719,
"learning_rate": 3.051282051282052e-06,
"loss": 0.8319,
"step": 1626
},
{
"epoch": 13.905982905982906,
"grad_norm": 1.5236955881118774,
"learning_rate": 3.0470085470085474e-06,
"loss": 0.0294,
"step": 1627
},
{
"epoch": 13.914529914529915,
"grad_norm": 1.8342583179473877,
"learning_rate": 3.042735042735043e-06,
"loss": 0.1122,
"step": 1628
},
{
"epoch": 13.923076923076923,
"grad_norm": 1.7902953624725342,
"learning_rate": 3.0384615384615383e-06,
"loss": 0.0426,
"step": 1629
},
{
"epoch": 13.931623931623932,
"grad_norm": 1.461769938468933,
"learning_rate": 3.0341880341880342e-06,
"loss": 0.0326,
"step": 1630
},
{
"epoch": 13.94017094017094,
"grad_norm": 2.2590038776397705,
"learning_rate": 3.02991452991453e-06,
"loss": 0.067,
"step": 1631
},
{
"epoch": 13.948717948717949,
"grad_norm": 0.8894402980804443,
"learning_rate": 3.0256410256410256e-06,
"loss": 0.0269,
"step": 1632
},
{
"epoch": 13.957264957264957,
"grad_norm": 2.097757339477539,
"learning_rate": 3.0213675213675215e-06,
"loss": 0.1211,
"step": 1633
},
{
"epoch": 13.965811965811966,
"grad_norm": 4.112930774688721,
"learning_rate": 3.0170940170940174e-06,
"loss": 0.1026,
"step": 1634
},
{
"epoch": 13.974358974358974,
"grad_norm": 4.55318021774292,
"learning_rate": 3.012820512820513e-06,
"loss": 0.2808,
"step": 1635
},
{
"epoch": 13.982905982905983,
"grad_norm": 2.1912014484405518,
"learning_rate": 3.0085470085470087e-06,
"loss": 0.0906,
"step": 1636
},
{
"epoch": 13.991452991452991,
"grad_norm": 4.612771511077881,
"learning_rate": 3.004273504273504e-06,
"loss": 0.17,
"step": 1637
},
{
"epoch": 14.0,
"grad_norm": 7.162411212921143,
"learning_rate": 3e-06,
"loss": 0.131,
"step": 1638
},
{
"epoch": 14.0,
"eval_loss": 0.06268326193094254,
"eval_runtime": 9.262,
"eval_samples_per_second": 50.313,
"eval_steps_per_second": 6.37,
"step": 1638
},
{
"epoch": 14.008547008547009,
"grad_norm": 4.41022253036499,
"learning_rate": 2.995726495726496e-06,
"loss": 0.1989,
"step": 1639
},
{
"epoch": 14.017094017094017,
"grad_norm": 2.2863216400146484,
"learning_rate": 2.9914529914529914e-06,
"loss": 0.0612,
"step": 1640
},
{
"epoch": 14.025641025641026,
"grad_norm": 1.5455230474472046,
"learning_rate": 2.9871794871794873e-06,
"loss": 0.0378,
"step": 1641
},
{
"epoch": 14.034188034188034,
"grad_norm": 0.9546025991439819,
"learning_rate": 2.9829059829059832e-06,
"loss": 0.0214,
"step": 1642
},
{
"epoch": 14.042735042735043,
"grad_norm": 5.546824932098389,
"learning_rate": 2.9786324786324787e-06,
"loss": 0.2502,
"step": 1643
},
{
"epoch": 14.051282051282051,
"grad_norm": 1.6261364221572876,
"learning_rate": 2.9743589743589746e-06,
"loss": 0.0271,
"step": 1644
},
{
"epoch": 14.05982905982906,
"grad_norm": 1.710256814956665,
"learning_rate": 2.9700854700854705e-06,
"loss": 0.0582,
"step": 1645
},
{
"epoch": 14.068376068376068,
"grad_norm": 1.2083494663238525,
"learning_rate": 2.965811965811966e-06,
"loss": 0.026,
"step": 1646
},
{
"epoch": 14.076923076923077,
"grad_norm": 3.6400561332702637,
"learning_rate": 2.961538461538462e-06,
"loss": 0.0896,
"step": 1647
},
{
"epoch": 14.085470085470085,
"grad_norm": 2.1084742546081543,
"learning_rate": 2.9572649572649577e-06,
"loss": 0.0269,
"step": 1648
},
{
"epoch": 14.094017094017094,
"grad_norm": 1.5661289691925049,
"learning_rate": 2.952991452991453e-06,
"loss": 0.0401,
"step": 1649
},
{
"epoch": 14.102564102564102,
"grad_norm": 23.358585357666016,
"learning_rate": 2.948717948717949e-06,
"loss": 0.2069,
"step": 1650
},
{
"epoch": 14.11111111111111,
"grad_norm": 9.171899795532227,
"learning_rate": 2.944444444444445e-06,
"loss": 0.2842,
"step": 1651
},
{
"epoch": 14.11965811965812,
"grad_norm": 1.3189946413040161,
"learning_rate": 2.9401709401709404e-06,
"loss": 0.0331,
"step": 1652
},
{
"epoch": 14.128205128205128,
"grad_norm": 3.6144192218780518,
"learning_rate": 2.9358974358974363e-06,
"loss": 0.2069,
"step": 1653
},
{
"epoch": 14.136752136752136,
"grad_norm": 2.764681577682495,
"learning_rate": 2.931623931623932e-06,
"loss": 0.0646,
"step": 1654
},
{
"epoch": 14.145299145299145,
"grad_norm": 2.073028564453125,
"learning_rate": 2.9273504273504277e-06,
"loss": 0.1223,
"step": 1655
},
{
"epoch": 14.153846153846153,
"grad_norm": 12.209549903869629,
"learning_rate": 2.9230769230769236e-06,
"loss": 0.1922,
"step": 1656
},
{
"epoch": 14.162393162393162,
"grad_norm": 3.1137638092041016,
"learning_rate": 2.918803418803419e-06,
"loss": 0.2586,
"step": 1657
},
{
"epoch": 14.17094017094017,
"grad_norm": 5.130307674407959,
"learning_rate": 2.914529914529915e-06,
"loss": 0.2695,
"step": 1658
},
{
"epoch": 14.179487179487179,
"grad_norm": 3.475097894668579,
"learning_rate": 2.910256410256411e-06,
"loss": 0.2131,
"step": 1659
},
{
"epoch": 14.188034188034187,
"grad_norm": 0.5851498246192932,
"learning_rate": 2.9059829059829063e-06,
"loss": 0.0167,
"step": 1660
},
{
"epoch": 14.196581196581196,
"grad_norm": 1.795509934425354,
"learning_rate": 2.901709401709402e-06,
"loss": 0.0857,
"step": 1661
},
{
"epoch": 14.205128205128204,
"grad_norm": 1.7123979330062866,
"learning_rate": 2.897435897435898e-06,
"loss": 0.0599,
"step": 1662
},
{
"epoch": 14.213675213675213,
"grad_norm": 1.230388879776001,
"learning_rate": 2.8931623931623935e-06,
"loss": 0.0255,
"step": 1663
},
{
"epoch": 14.222222222222221,
"grad_norm": 3.8747615814208984,
"learning_rate": 2.888888888888889e-06,
"loss": 0.1412,
"step": 1664
},
{
"epoch": 14.23076923076923,
"grad_norm": 2.233584403991699,
"learning_rate": 2.8846153846153845e-06,
"loss": 0.068,
"step": 1665
},
{
"epoch": 14.239316239316238,
"grad_norm": 5.327254772186279,
"learning_rate": 2.8803418803418804e-06,
"loss": 0.2616,
"step": 1666
},
{
"epoch": 14.247863247863247,
"grad_norm": 6.126563549041748,
"learning_rate": 2.8760683760683762e-06,
"loss": 0.0931,
"step": 1667
},
{
"epoch": 14.256410256410255,
"grad_norm": 1.4305050373077393,
"learning_rate": 2.8717948717948717e-06,
"loss": 0.0221,
"step": 1668
},
{
"epoch": 14.264957264957266,
"grad_norm": 3.0924506187438965,
"learning_rate": 2.8675213675213676e-06,
"loss": 0.0417,
"step": 1669
},
{
"epoch": 14.273504273504274,
"grad_norm": 2.548558235168457,
"learning_rate": 2.8632478632478635e-06,
"loss": 0.0744,
"step": 1670
},
{
"epoch": 14.282051282051283,
"grad_norm": 0.46632057428359985,
"learning_rate": 2.858974358974359e-06,
"loss": 0.0114,
"step": 1671
},
{
"epoch": 14.290598290598291,
"grad_norm": 2.5199391841888428,
"learning_rate": 2.854700854700855e-06,
"loss": 0.0819,
"step": 1672
},
{
"epoch": 14.2991452991453,
"grad_norm": 1.849133014678955,
"learning_rate": 2.8504273504273507e-06,
"loss": 0.0424,
"step": 1673
},
{
"epoch": 14.307692307692308,
"grad_norm": 2.9396777153015137,
"learning_rate": 2.846153846153846e-06,
"loss": 0.0836,
"step": 1674
},
{
"epoch": 14.316239316239317,
"grad_norm": 0.7128950953483582,
"learning_rate": 2.841880341880342e-06,
"loss": 0.0181,
"step": 1675
},
{
"epoch": 14.324786324786325,
"grad_norm": 2.1387767791748047,
"learning_rate": 2.8376068376068376e-06,
"loss": 0.0432,
"step": 1676
},
{
"epoch": 14.333333333333334,
"grad_norm": 7.104556083679199,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.1277,
"step": 1677
},
{
"epoch": 14.341880341880342,
"grad_norm": 3.718749761581421,
"learning_rate": 2.8290598290598293e-06,
"loss": 0.0738,
"step": 1678
},
{
"epoch": 14.350427350427351,
"grad_norm": 3.9387831687927246,
"learning_rate": 2.824786324786325e-06,
"loss": 0.1374,
"step": 1679
},
{
"epoch": 14.35897435897436,
"grad_norm": 2.1527843475341797,
"learning_rate": 2.8205128205128207e-06,
"loss": 0.1426,
"step": 1680
},
{
"epoch": 14.367521367521368,
"grad_norm": 1.0589011907577515,
"learning_rate": 2.8162393162393166e-06,
"loss": 0.0343,
"step": 1681
},
{
"epoch": 14.376068376068377,
"grad_norm": 3.55014967918396,
"learning_rate": 2.811965811965812e-06,
"loss": 0.2962,
"step": 1682
},
{
"epoch": 14.384615384615385,
"grad_norm": 3.996713399887085,
"learning_rate": 2.807692307692308e-06,
"loss": 0.1458,
"step": 1683
},
{
"epoch": 14.393162393162394,
"grad_norm": 73.28384399414062,
"learning_rate": 2.803418803418804e-06,
"loss": 0.6138,
"step": 1684
},
{
"epoch": 14.401709401709402,
"grad_norm": 5.780628681182861,
"learning_rate": 2.7991452991452993e-06,
"loss": 0.2619,
"step": 1685
},
{
"epoch": 14.41025641025641,
"grad_norm": 3.2047317028045654,
"learning_rate": 2.794871794871795e-06,
"loss": 0.1917,
"step": 1686
},
{
"epoch": 14.418803418803419,
"grad_norm": 7.041647434234619,
"learning_rate": 2.790598290598291e-06,
"loss": 0.2136,
"step": 1687
},
{
"epoch": 14.427350427350428,
"grad_norm": 3.391404867172241,
"learning_rate": 2.7863247863247866e-06,
"loss": 0.094,
"step": 1688
},
{
"epoch": 14.435897435897436,
"grad_norm": 0.5430964231491089,
"learning_rate": 2.7820512820512824e-06,
"loss": 0.0139,
"step": 1689
},
{
"epoch": 14.444444444444445,
"grad_norm": 5.696547985076904,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.5808,
"step": 1690
},
{
"epoch": 14.452991452991453,
"grad_norm": 3.5785481929779053,
"learning_rate": 2.773504273504274e-06,
"loss": 0.219,
"step": 1691
},
{
"epoch": 14.461538461538462,
"grad_norm": 6.63624906539917,
"learning_rate": 2.7692307692307697e-06,
"loss": 0.2586,
"step": 1692
},
{
"epoch": 14.47008547008547,
"grad_norm": 16.79705810546875,
"learning_rate": 2.764957264957265e-06,
"loss": 0.1762,
"step": 1693
},
{
"epoch": 14.478632478632479,
"grad_norm": 4.069973468780518,
"learning_rate": 2.760683760683761e-06,
"loss": 0.1191,
"step": 1694
},
{
"epoch": 14.487179487179487,
"grad_norm": 1.1191340684890747,
"learning_rate": 2.756410256410257e-06,
"loss": 0.0529,
"step": 1695
},
{
"epoch": 14.495726495726496,
"grad_norm": 2.23835825920105,
"learning_rate": 2.7521367521367524e-06,
"loss": 0.0681,
"step": 1696
},
{
"epoch": 14.504273504273504,
"grad_norm": 2.745694160461426,
"learning_rate": 2.7478632478632483e-06,
"loss": 0.1885,
"step": 1697
},
{
"epoch": 14.512820512820513,
"grad_norm": 3.642946720123291,
"learning_rate": 2.743589743589744e-06,
"loss": 0.2061,
"step": 1698
},
{
"epoch": 14.521367521367521,
"grad_norm": 2.7571651935577393,
"learning_rate": 2.7393162393162397e-06,
"loss": 0.074,
"step": 1699
},
{
"epoch": 14.52991452991453,
"grad_norm": 0.889057457447052,
"learning_rate": 2.7350427350427355e-06,
"loss": 0.0342,
"step": 1700
},
{
"epoch": 14.538461538461538,
"grad_norm": 0.5471668243408203,
"learning_rate": 2.7307692307692306e-06,
"loss": 0.0125,
"step": 1701
},
{
"epoch": 14.547008547008547,
"grad_norm": 6.883024215698242,
"learning_rate": 2.7264957264957265e-06,
"loss": 0.4102,
"step": 1702
},
{
"epoch": 14.555555555555555,
"grad_norm": 2.6678171157836914,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0872,
"step": 1703
},
{
"epoch": 14.564102564102564,
"grad_norm": 5.825995445251465,
"learning_rate": 2.717948717948718e-06,
"loss": 0.1081,
"step": 1704
},
{
"epoch": 14.572649572649572,
"grad_norm": 1.5447179079055786,
"learning_rate": 2.7136752136752137e-06,
"loss": 0.0838,
"step": 1705
},
{
"epoch": 14.581196581196581,
"grad_norm": 17.58099937438965,
"learning_rate": 2.7094017094017096e-06,
"loss": 0.6379,
"step": 1706
},
{
"epoch": 14.58974358974359,
"grad_norm": 0.9537908434867859,
"learning_rate": 2.705128205128205e-06,
"loss": 0.0221,
"step": 1707
},
{
"epoch": 14.598290598290598,
"grad_norm": 3.264037847518921,
"learning_rate": 2.700854700854701e-06,
"loss": 0.1282,
"step": 1708
},
{
"epoch": 14.606837606837606,
"grad_norm": 1.7752703428268433,
"learning_rate": 2.696581196581197e-06,
"loss": 0.0194,
"step": 1709
},
{
"epoch": 14.615384615384615,
"grad_norm": 4.8417649269104,
"learning_rate": 2.6923076923076923e-06,
"loss": 0.2217,
"step": 1710
},
{
"epoch": 14.623931623931623,
"grad_norm": 2.915694236755371,
"learning_rate": 2.6880341880341882e-06,
"loss": 0.1506,
"step": 1711
},
{
"epoch": 14.632478632478632,
"grad_norm": 10.983115196228027,
"learning_rate": 2.6837606837606837e-06,
"loss": 0.4307,
"step": 1712
},
{
"epoch": 14.64102564102564,
"grad_norm": 1.1121952533721924,
"learning_rate": 2.6794871794871796e-06,
"loss": 0.0211,
"step": 1713
},
{
"epoch": 14.649572649572649,
"grad_norm": 2.6676313877105713,
"learning_rate": 2.6752136752136755e-06,
"loss": 0.0997,
"step": 1714
},
{
"epoch": 14.658119658119658,
"grad_norm": 1.718767523765564,
"learning_rate": 2.670940170940171e-06,
"loss": 0.0533,
"step": 1715
},
{
"epoch": 14.666666666666666,
"grad_norm": 1.567866563796997,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0913,
"step": 1716
},
{
"epoch": 14.675213675213675,
"grad_norm": 3.0697431564331055,
"learning_rate": 2.6623931623931627e-06,
"loss": 0.1133,
"step": 1717
},
{
"epoch": 14.683760683760683,
"grad_norm": 2.2237489223480225,
"learning_rate": 2.658119658119658e-06,
"loss": 0.1091,
"step": 1718
},
{
"epoch": 14.692307692307692,
"grad_norm": 6.050041198730469,
"learning_rate": 2.653846153846154e-06,
"loss": 0.5622,
"step": 1719
},
{
"epoch": 14.7008547008547,
"grad_norm": 1.1796153783798218,
"learning_rate": 2.64957264957265e-06,
"loss": 0.0522,
"step": 1720
},
{
"epoch": 14.709401709401709,
"grad_norm": 2.4849863052368164,
"learning_rate": 2.6452991452991454e-06,
"loss": 0.0332,
"step": 1721
},
{
"epoch": 14.717948717948717,
"grad_norm": 1.771933674812317,
"learning_rate": 2.6410256410256413e-06,
"loss": 0.0692,
"step": 1722
},
{
"epoch": 14.726495726495726,
"grad_norm": 4.174441337585449,
"learning_rate": 2.6367521367521372e-06,
"loss": 0.1419,
"step": 1723
},
{
"epoch": 14.735042735042736,
"grad_norm": 4.145920276641846,
"learning_rate": 2.6324786324786327e-06,
"loss": 0.5196,
"step": 1724
},
{
"epoch": 14.743589743589745,
"grad_norm": 3.363537073135376,
"learning_rate": 2.6282051282051286e-06,
"loss": 0.1187,
"step": 1725
},
{
"epoch": 14.752136752136753,
"grad_norm": 1.9558751583099365,
"learning_rate": 2.6239316239316245e-06,
"loss": 0.0193,
"step": 1726
},
{
"epoch": 14.760683760683762,
"grad_norm": 2.8293466567993164,
"learning_rate": 2.61965811965812e-06,
"loss": 0.0551,
"step": 1727
},
{
"epoch": 14.76923076923077,
"grad_norm": 1.2654905319213867,
"learning_rate": 2.615384615384616e-06,
"loss": 0.0805,
"step": 1728
},
{
"epoch": 14.777777777777779,
"grad_norm": 0.9344054460525513,
"learning_rate": 2.6111111111111113e-06,
"loss": 0.0177,
"step": 1729
},
{
"epoch": 14.786324786324787,
"grad_norm": 1.268433690071106,
"learning_rate": 2.606837606837607e-06,
"loss": 0.0185,
"step": 1730
},
{
"epoch": 14.794871794871796,
"grad_norm": 2.5544192790985107,
"learning_rate": 2.602564102564103e-06,
"loss": 0.063,
"step": 1731
},
{
"epoch": 14.803418803418804,
"grad_norm": 2.1078386306762695,
"learning_rate": 2.5982905982905985e-06,
"loss": 0.1203,
"step": 1732
},
{
"epoch": 14.811965811965813,
"grad_norm": 1.526848554611206,
"learning_rate": 2.5940170940170944e-06,
"loss": 0.0524,
"step": 1733
},
{
"epoch": 14.820512820512821,
"grad_norm": 0.7479220628738403,
"learning_rate": 2.5897435897435903e-06,
"loss": 0.0197,
"step": 1734
},
{
"epoch": 14.82905982905983,
"grad_norm": 2.937556266784668,
"learning_rate": 2.5854700854700858e-06,
"loss": 0.1406,
"step": 1735
},
{
"epoch": 14.837606837606838,
"grad_norm": 2.3128576278686523,
"learning_rate": 2.5811965811965817e-06,
"loss": 0.056,
"step": 1736
},
{
"epoch": 14.846153846153847,
"grad_norm": 2.1093039512634277,
"learning_rate": 2.5769230769230767e-06,
"loss": 0.0645,
"step": 1737
},
{
"epoch": 14.854700854700855,
"grad_norm": 2.104214668273926,
"learning_rate": 2.5726495726495726e-06,
"loss": 0.1097,
"step": 1738
},
{
"epoch": 14.863247863247864,
"grad_norm": 3.781390428543091,
"learning_rate": 2.5683760683760685e-06,
"loss": 0.1214,
"step": 1739
},
{
"epoch": 14.871794871794872,
"grad_norm": 4.119661331176758,
"learning_rate": 2.564102564102564e-06,
"loss": 0.1797,
"step": 1740
},
{
"epoch": 14.88034188034188,
"grad_norm": 6.488205909729004,
"learning_rate": 2.55982905982906e-06,
"loss": 0.0679,
"step": 1741
},
{
"epoch": 14.88888888888889,
"grad_norm": 1.4211604595184326,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.0375,
"step": 1742
},
{
"epoch": 14.897435897435898,
"grad_norm": 3.577533721923828,
"learning_rate": 2.5512820512820512e-06,
"loss": 0.1914,
"step": 1743
},
{
"epoch": 14.905982905982906,
"grad_norm": 8.697205543518066,
"learning_rate": 2.547008547008547e-06,
"loss": 0.5511,
"step": 1744
},
{
"epoch": 14.914529914529915,
"grad_norm": 0.49716269969940186,
"learning_rate": 2.542735042735043e-06,
"loss": 0.0125,
"step": 1745
},
{
"epoch": 14.923076923076923,
"grad_norm": 2.8563008308410645,
"learning_rate": 2.5384615384615385e-06,
"loss": 0.0901,
"step": 1746
},
{
"epoch": 14.931623931623932,
"grad_norm": 3.6407926082611084,
"learning_rate": 2.5341880341880344e-06,
"loss": 0.0718,
"step": 1747
},
{
"epoch": 14.94017094017094,
"grad_norm": 1.2601441144943237,
"learning_rate": 2.52991452991453e-06,
"loss": 0.0451,
"step": 1748
},
{
"epoch": 14.948717948717949,
"grad_norm": 2.4402401447296143,
"learning_rate": 2.5256410256410257e-06,
"loss": 0.0771,
"step": 1749
},
{
"epoch": 14.957264957264957,
"grad_norm": 0.6150484681129456,
"learning_rate": 2.5213675213675216e-06,
"loss": 0.0151,
"step": 1750
},
{
"epoch": 14.965811965811966,
"grad_norm": 3.6569836139678955,
"learning_rate": 2.517094017094017e-06,
"loss": 0.0905,
"step": 1751
},
{
"epoch": 14.974358974358974,
"grad_norm": 3.4421300888061523,
"learning_rate": 2.512820512820513e-06,
"loss": 0.0456,
"step": 1752
},
{
"epoch": 14.982905982905983,
"grad_norm": 3.565871477127075,
"learning_rate": 2.508547008547009e-06,
"loss": 0.0491,
"step": 1753
},
{
"epoch": 14.991452991452991,
"grad_norm": 37.519065856933594,
"learning_rate": 2.5042735042735043e-06,
"loss": 0.1348,
"step": 1754
},
{
"epoch": 15.0,
"grad_norm": 5.1902899742126465,
"learning_rate": 2.5e-06,
"loss": 0.1099,
"step": 1755
},
{
"epoch": 15.0,
"eval_loss": 0.05930963531136513,
"eval_runtime": 9.2206,
"eval_samples_per_second": 50.539,
"eval_steps_per_second": 6.399,
"step": 1755
},
{
"epoch": 15.008547008547009,
"grad_norm": 5.6569342613220215,
"learning_rate": 2.495726495726496e-06,
"loss": 0.1931,
"step": 1756
},
{
"epoch": 15.017094017094017,
"grad_norm": 5.23728084564209,
"learning_rate": 2.4914529914529916e-06,
"loss": 0.2789,
"step": 1757
},
{
"epoch": 15.025641025641026,
"grad_norm": 0.8648807406425476,
"learning_rate": 2.4871794871794875e-06,
"loss": 0.0227,
"step": 1758
},
{
"epoch": 15.034188034188034,
"grad_norm": 3.0654587745666504,
"learning_rate": 2.4829059829059833e-06,
"loss": 0.0602,
"step": 1759
},
{
"epoch": 15.042735042735043,
"grad_norm": 4.374608039855957,
"learning_rate": 2.478632478632479e-06,
"loss": 0.2133,
"step": 1760
},
{
"epoch": 15.051282051282051,
"grad_norm": 1.2764301300048828,
"learning_rate": 2.4743589743589747e-06,
"loss": 0.0296,
"step": 1761
},
{
"epoch": 15.05982905982906,
"grad_norm": 0.9672349095344543,
"learning_rate": 2.4700854700854706e-06,
"loss": 0.0224,
"step": 1762
},
{
"epoch": 15.068376068376068,
"grad_norm": 8.807465553283691,
"learning_rate": 2.465811965811966e-06,
"loss": 0.0925,
"step": 1763
},
{
"epoch": 15.076923076923077,
"grad_norm": 1.4733474254608154,
"learning_rate": 2.461538461538462e-06,
"loss": 0.0286,
"step": 1764
},
{
"epoch": 15.085470085470085,
"grad_norm": 6.014289855957031,
"learning_rate": 2.4572649572649574e-06,
"loss": 0.1387,
"step": 1765
},
{
"epoch": 15.094017094017094,
"grad_norm": 1.899086356163025,
"learning_rate": 2.452991452991453e-06,
"loss": 0.07,
"step": 1766
},
{
"epoch": 15.102564102564102,
"grad_norm": 11.32197380065918,
"learning_rate": 2.4487179487179488e-06,
"loss": 0.2452,
"step": 1767
},
{
"epoch": 15.11111111111111,
"grad_norm": 3.223996639251709,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.139,
"step": 1768
},
{
"epoch": 15.11965811965812,
"grad_norm": 2.8729913234710693,
"learning_rate": 2.44017094017094e-06,
"loss": 0.1386,
"step": 1769
},
{
"epoch": 15.128205128205128,
"grad_norm": 1.9730579853057861,
"learning_rate": 2.435897435897436e-06,
"loss": 0.0882,
"step": 1770
},
{
"epoch": 15.136752136752136,
"grad_norm": 5.556413650512695,
"learning_rate": 2.431623931623932e-06,
"loss": 0.1554,
"step": 1771
},
{
"epoch": 15.145299145299145,
"grad_norm": 1.2356898784637451,
"learning_rate": 2.4273504273504274e-06,
"loss": 0.0217,
"step": 1772
},
{
"epoch": 15.153846153846153,
"grad_norm": 7.849127769470215,
"learning_rate": 2.4230769230769233e-06,
"loss": 0.221,
"step": 1773
},
{
"epoch": 15.162393162393162,
"grad_norm": 0.5792569518089294,
"learning_rate": 2.418803418803419e-06,
"loss": 0.017,
"step": 1774
},
{
"epoch": 15.17094017094017,
"grad_norm": 2.2549376487731934,
"learning_rate": 2.4145299145299146e-06,
"loss": 0.0499,
"step": 1775
},
{
"epoch": 15.179487179487179,
"grad_norm": 2.722200870513916,
"learning_rate": 2.4102564102564105e-06,
"loss": 0.0408,
"step": 1776
},
{
"epoch": 15.188034188034187,
"grad_norm": 3.1140944957733154,
"learning_rate": 2.4059829059829064e-06,
"loss": 0.1001,
"step": 1777
},
{
"epoch": 15.196581196581196,
"grad_norm": 4.461791515350342,
"learning_rate": 2.401709401709402e-06,
"loss": 0.3419,
"step": 1778
},
{
"epoch": 15.205128205128204,
"grad_norm": 1.8562372922897339,
"learning_rate": 2.3974358974358978e-06,
"loss": 0.1092,
"step": 1779
},
{
"epoch": 15.213675213675213,
"grad_norm": 5.2086181640625,
"learning_rate": 2.3931623931623937e-06,
"loss": 0.1767,
"step": 1780
},
{
"epoch": 15.222222222222221,
"grad_norm": 1.6226582527160645,
"learning_rate": 2.388888888888889e-06,
"loss": 0.0347,
"step": 1781
},
{
"epoch": 15.23076923076923,
"grad_norm": 2.8507306575775146,
"learning_rate": 2.384615384615385e-06,
"loss": 0.0934,
"step": 1782
},
{
"epoch": 15.239316239316238,
"grad_norm": 2.74642276763916,
"learning_rate": 2.3803418803418805e-06,
"loss": 0.0857,
"step": 1783
},
{
"epoch": 15.247863247863247,
"grad_norm": 3.4352660179138184,
"learning_rate": 2.376068376068376e-06,
"loss": 0.2336,
"step": 1784
},
{
"epoch": 15.256410256410255,
"grad_norm": 3.4673473834991455,
"learning_rate": 2.371794871794872e-06,
"loss": 0.1974,
"step": 1785
},
{
"epoch": 15.264957264957266,
"grad_norm": 21.467744827270508,
"learning_rate": 2.3675213675213677e-06,
"loss": 0.6836,
"step": 1786
},
{
"epoch": 15.273504273504274,
"grad_norm": 2.832465887069702,
"learning_rate": 2.363247863247863e-06,
"loss": 0.245,
"step": 1787
},
{
"epoch": 15.282051282051283,
"grad_norm": 9.717825889587402,
"learning_rate": 2.358974358974359e-06,
"loss": 0.5324,
"step": 1788
},
{
"epoch": 15.290598290598291,
"grad_norm": 2.209528923034668,
"learning_rate": 2.354700854700855e-06,
"loss": 0.0854,
"step": 1789
},
{
"epoch": 15.2991452991453,
"grad_norm": 4.554971218109131,
"learning_rate": 2.3504273504273504e-06,
"loss": 0.1271,
"step": 1790
},
{
"epoch": 15.307692307692308,
"grad_norm": 3.1280457973480225,
"learning_rate": 2.3461538461538463e-06,
"loss": 0.1265,
"step": 1791
},
{
"epoch": 15.316239316239317,
"grad_norm": 2.647224187850952,
"learning_rate": 2.3418803418803422e-06,
"loss": 0.1965,
"step": 1792
},
{
"epoch": 15.324786324786325,
"grad_norm": 2.7695155143737793,
"learning_rate": 2.3376068376068377e-06,
"loss": 0.0528,
"step": 1793
},
{
"epoch": 15.333333333333334,
"grad_norm": 20.151025772094727,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.2011,
"step": 1794
},
{
"epoch": 15.341880341880342,
"grad_norm": 2.8718080520629883,
"learning_rate": 2.3290598290598295e-06,
"loss": 0.0502,
"step": 1795
},
{
"epoch": 15.350427350427351,
"grad_norm": 2.17462158203125,
"learning_rate": 2.324786324786325e-06,
"loss": 0.0658,
"step": 1796
},
{
"epoch": 15.35897435897436,
"grad_norm": 4.324810981750488,
"learning_rate": 2.320512820512821e-06,
"loss": 0.1429,
"step": 1797
},
{
"epoch": 15.367521367521368,
"grad_norm": 184.52798461914062,
"learning_rate": 2.3162393162393167e-06,
"loss": 0.5155,
"step": 1798
},
{
"epoch": 15.376068376068377,
"grad_norm": 2.6076488494873047,
"learning_rate": 2.311965811965812e-06,
"loss": 0.0708,
"step": 1799
},
{
"epoch": 15.384615384615385,
"grad_norm": 3.0682790279388428,
"learning_rate": 2.307692307692308e-06,
"loss": 0.2662,
"step": 1800
},
{
"epoch": 15.393162393162394,
"grad_norm": 1.3366855382919312,
"learning_rate": 2.3034188034188035e-06,
"loss": 0.0136,
"step": 1801
},
{
"epoch": 15.401709401709402,
"grad_norm": 0.5489670634269714,
"learning_rate": 2.299145299145299e-06,
"loss": 0.0148,
"step": 1802
},
{
"epoch": 15.41025641025641,
"grad_norm": 1.080804705619812,
"learning_rate": 2.294871794871795e-06,
"loss": 0.025,
"step": 1803
},
{
"epoch": 15.418803418803419,
"grad_norm": 8.801629066467285,
"learning_rate": 2.290598290598291e-06,
"loss": 0.2038,
"step": 1804
},
{
"epoch": 15.427350427350428,
"grad_norm": 66.96419525146484,
"learning_rate": 2.2863247863247863e-06,
"loss": 0.4094,
"step": 1805
},
{
"epoch": 15.435897435897436,
"grad_norm": 1.3400782346725464,
"learning_rate": 2.282051282051282e-06,
"loss": 0.0452,
"step": 1806
},
{
"epoch": 15.444444444444445,
"grad_norm": 3.5850300788879395,
"learning_rate": 2.277777777777778e-06,
"loss": 0.0919,
"step": 1807
},
{
"epoch": 15.452991452991453,
"grad_norm": 8.670539855957031,
"learning_rate": 2.2735042735042735e-06,
"loss": 0.255,
"step": 1808
},
{
"epoch": 15.461538461538462,
"grad_norm": 3.609617233276367,
"learning_rate": 2.2692307692307694e-06,
"loss": 0.1203,
"step": 1809
},
{
"epoch": 15.47008547008547,
"grad_norm": 1.5857924222946167,
"learning_rate": 2.2649572649572653e-06,
"loss": 0.0371,
"step": 1810
},
{
"epoch": 15.478632478632479,
"grad_norm": 1.386805534362793,
"learning_rate": 2.2606837606837608e-06,
"loss": 0.0385,
"step": 1811
},
{
"epoch": 15.487179487179487,
"grad_norm": 4.130802631378174,
"learning_rate": 2.2564102564102566e-06,
"loss": 0.2261,
"step": 1812
},
{
"epoch": 15.495726495726496,
"grad_norm": 2.974247455596924,
"learning_rate": 2.2521367521367525e-06,
"loss": 0.0651,
"step": 1813
},
{
"epoch": 15.504273504273504,
"grad_norm": 1.2551554441452026,
"learning_rate": 2.247863247863248e-06,
"loss": 0.0229,
"step": 1814
},
{
"epoch": 15.512820512820513,
"grad_norm": 3.1401453018188477,
"learning_rate": 2.243589743589744e-06,
"loss": 0.0409,
"step": 1815
},
{
"epoch": 15.521367521367521,
"grad_norm": 1.3921948671340942,
"learning_rate": 2.2393162393162398e-06,
"loss": 0.0335,
"step": 1816
},
{
"epoch": 15.52991452991453,
"grad_norm": 5.457981586456299,
"learning_rate": 2.2350427350427353e-06,
"loss": 0.22,
"step": 1817
},
{
"epoch": 15.538461538461538,
"grad_norm": 0.9100427031517029,
"learning_rate": 2.230769230769231e-06,
"loss": 0.0217,
"step": 1818
},
{
"epoch": 15.547008547008547,
"grad_norm": 3.5890519618988037,
"learning_rate": 2.2264957264957266e-06,
"loss": 0.2241,
"step": 1819
},
{
"epoch": 15.555555555555555,
"grad_norm": 2.965954303741455,
"learning_rate": 2.222222222222222e-06,
"loss": 0.1453,
"step": 1820
},
{
"epoch": 15.564102564102564,
"grad_norm": 8.436135292053223,
"learning_rate": 2.217948717948718e-06,
"loss": 0.2784,
"step": 1821
},
{
"epoch": 15.572649572649572,
"grad_norm": 2.043687582015991,
"learning_rate": 2.213675213675214e-06,
"loss": 0.0755,
"step": 1822
},
{
"epoch": 15.581196581196581,
"grad_norm": 2.380276918411255,
"learning_rate": 2.2094017094017093e-06,
"loss": 0.1867,
"step": 1823
},
{
"epoch": 15.58974358974359,
"grad_norm": 2.5189390182495117,
"learning_rate": 2.2051282051282052e-06,
"loss": 0.0619,
"step": 1824
},
{
"epoch": 15.598290598290598,
"grad_norm": 1.123610258102417,
"learning_rate": 2.200854700854701e-06,
"loss": 0.0286,
"step": 1825
},
{
"epoch": 15.606837606837606,
"grad_norm": 3.0018534660339355,
"learning_rate": 2.1965811965811966e-06,
"loss": 0.1449,
"step": 1826
},
{
"epoch": 15.615384615384615,
"grad_norm": 2.178926706314087,
"learning_rate": 2.1923076923076925e-06,
"loss": 0.0859,
"step": 1827
},
{
"epoch": 15.623931623931623,
"grad_norm": 5.799438953399658,
"learning_rate": 2.1880341880341884e-06,
"loss": 0.2669,
"step": 1828
},
{
"epoch": 15.632478632478632,
"grad_norm": 2.0338144302368164,
"learning_rate": 2.183760683760684e-06,
"loss": 0.0616,
"step": 1829
},
{
"epoch": 15.64102564102564,
"grad_norm": 3.789525032043457,
"learning_rate": 2.1794871794871797e-06,
"loss": 0.0439,
"step": 1830
},
{
"epoch": 15.649572649572649,
"grad_norm": 2.3695919513702393,
"learning_rate": 2.1752136752136756e-06,
"loss": 0.0979,
"step": 1831
},
{
"epoch": 15.658119658119658,
"grad_norm": 0.8543546795845032,
"learning_rate": 2.170940170940171e-06,
"loss": 0.0171,
"step": 1832
},
{
"epoch": 15.666666666666666,
"grad_norm": 3.7921054363250732,
"learning_rate": 2.166666666666667e-06,
"loss": 0.1094,
"step": 1833
},
{
"epoch": 15.675213675213675,
"grad_norm": 1.9967904090881348,
"learning_rate": 2.162393162393163e-06,
"loss": 0.0382,
"step": 1834
},
{
"epoch": 15.683760683760683,
"grad_norm": 2.5073959827423096,
"learning_rate": 2.1581196581196583e-06,
"loss": 0.0554,
"step": 1835
},
{
"epoch": 15.692307692307692,
"grad_norm": 1.2741888761520386,
"learning_rate": 2.153846153846154e-06,
"loss": 0.056,
"step": 1836
},
{
"epoch": 15.7008547008547,
"grad_norm": 1.992280125617981,
"learning_rate": 2.1495726495726497e-06,
"loss": 0.0206,
"step": 1837
},
{
"epoch": 15.709401709401709,
"grad_norm": 1.0176990032196045,
"learning_rate": 2.145299145299145e-06,
"loss": 0.0276,
"step": 1838
},
{
"epoch": 15.717948717948717,
"grad_norm": 1.6685941219329834,
"learning_rate": 2.141025641025641e-06,
"loss": 0.0222,
"step": 1839
},
{
"epoch": 15.726495726495726,
"grad_norm": 3.171050548553467,
"learning_rate": 2.136752136752137e-06,
"loss": 0.1526,
"step": 1840
},
{
"epoch": 15.735042735042736,
"grad_norm": 1.5068336725234985,
"learning_rate": 2.1324786324786324e-06,
"loss": 0.0271,
"step": 1841
},
{
"epoch": 15.743589743589745,
"grad_norm": 3.171870708465576,
"learning_rate": 2.1282051282051283e-06,
"loss": 0.0628,
"step": 1842
},
{
"epoch": 15.752136752136753,
"grad_norm": 1.9212791919708252,
"learning_rate": 2.123931623931624e-06,
"loss": 0.1018,
"step": 1843
},
{
"epoch": 15.760683760683762,
"grad_norm": 4.073456287384033,
"learning_rate": 2.1196581196581196e-06,
"loss": 0.1144,
"step": 1844
},
{
"epoch": 15.76923076923077,
"grad_norm": 1.8453985452651978,
"learning_rate": 2.1153846153846155e-06,
"loss": 0.0995,
"step": 1845
},
{
"epoch": 15.777777777777779,
"grad_norm": 3.285759210586548,
"learning_rate": 2.1111111111111114e-06,
"loss": 0.1173,
"step": 1846
},
{
"epoch": 15.786324786324787,
"grad_norm": 3.709202289581299,
"learning_rate": 2.106837606837607e-06,
"loss": 0.1906,
"step": 1847
},
{
"epoch": 15.794871794871796,
"grad_norm": 1.951262354850769,
"learning_rate": 2.1025641025641028e-06,
"loss": 0.0954,
"step": 1848
},
{
"epoch": 15.803418803418804,
"grad_norm": 3.249171257019043,
"learning_rate": 2.0982905982905987e-06,
"loss": 0.1258,
"step": 1849
},
{
"epoch": 15.811965811965813,
"grad_norm": 0.5708752274513245,
"learning_rate": 2.094017094017094e-06,
"loss": 0.0128,
"step": 1850
},
{
"epoch": 15.820512820512821,
"grad_norm": 3.2894484996795654,
"learning_rate": 2.08974358974359e-06,
"loss": 0.0621,
"step": 1851
},
{
"epoch": 15.82905982905983,
"grad_norm": 0.8564540147781372,
"learning_rate": 2.085470085470086e-06,
"loss": 0.0194,
"step": 1852
},
{
"epoch": 15.837606837606838,
"grad_norm": 3.319011926651001,
"learning_rate": 2.0811965811965814e-06,
"loss": 0.1413,
"step": 1853
},
{
"epoch": 15.846153846153847,
"grad_norm": 1.5385066270828247,
"learning_rate": 2.0769230769230773e-06,
"loss": 0.0316,
"step": 1854
},
{
"epoch": 15.854700854700855,
"grad_norm": 4.076297283172607,
"learning_rate": 2.072649572649573e-06,
"loss": 0.2257,
"step": 1855
},
{
"epoch": 15.863247863247864,
"grad_norm": 4.738671779632568,
"learning_rate": 2.068376068376068e-06,
"loss": 0.1627,
"step": 1856
},
{
"epoch": 15.871794871794872,
"grad_norm": 5.589550495147705,
"learning_rate": 2.064102564102564e-06,
"loss": 0.3182,
"step": 1857
},
{
"epoch": 15.88034188034188,
"grad_norm": 1.6303757429122925,
"learning_rate": 2.05982905982906e-06,
"loss": 0.0384,
"step": 1858
},
{
"epoch": 15.88888888888889,
"grad_norm": 3.0257458686828613,
"learning_rate": 2.0555555555555555e-06,
"loss": 0.0967,
"step": 1859
},
{
"epoch": 15.897435897435898,
"grad_norm": 2.4926559925079346,
"learning_rate": 2.0512820512820513e-06,
"loss": 0.0703,
"step": 1860
},
{
"epoch": 15.905982905982906,
"grad_norm": 2.0784358978271484,
"learning_rate": 2.0470085470085472e-06,
"loss": 0.062,
"step": 1861
},
{
"epoch": 15.914529914529915,
"grad_norm": 4.92131233215332,
"learning_rate": 2.0427350427350427e-06,
"loss": 0.0875,
"step": 1862
},
{
"epoch": 15.923076923076923,
"grad_norm": 2.999511241912842,
"learning_rate": 2.0384615384615386e-06,
"loss": 0.0388,
"step": 1863
},
{
"epoch": 15.931623931623932,
"grad_norm": 5.770095348358154,
"learning_rate": 2.0341880341880345e-06,
"loss": 0.1257,
"step": 1864
},
{
"epoch": 15.94017094017094,
"grad_norm": 4.730950832366943,
"learning_rate": 2.02991452991453e-06,
"loss": 0.2386,
"step": 1865
},
{
"epoch": 15.948717948717949,
"grad_norm": 1.8125661611557007,
"learning_rate": 2.025641025641026e-06,
"loss": 0.0433,
"step": 1866
},
{
"epoch": 15.957264957264957,
"grad_norm": 5.433501243591309,
"learning_rate": 2.0213675213675217e-06,
"loss": 0.0536,
"step": 1867
},
{
"epoch": 15.965811965811966,
"grad_norm": 1.2565219402313232,
"learning_rate": 2.017094017094017e-06,
"loss": 0.0263,
"step": 1868
},
{
"epoch": 15.974358974358974,
"grad_norm": 1.5660192966461182,
"learning_rate": 2.012820512820513e-06,
"loss": 0.0387,
"step": 1869
},
{
"epoch": 15.982905982905983,
"grad_norm": 5.742929935455322,
"learning_rate": 2.008547008547009e-06,
"loss": 0.2158,
"step": 1870
},
{
"epoch": 15.991452991452991,
"grad_norm": 3.597506284713745,
"learning_rate": 2.0042735042735044e-06,
"loss": 0.0962,
"step": 1871
},
{
"epoch": 16.0,
"grad_norm": 1.753219485282898,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0193,
"step": 1872
},
{
"epoch": 16.0,
"eval_loss": 0.05589358136057854,
"eval_runtime": 9.2203,
"eval_samples_per_second": 50.541,
"eval_steps_per_second": 6.399,
"step": 1872
},
{
"epoch": 16.00854700854701,
"grad_norm": 9.627431869506836,
"learning_rate": 1.9957264957264962e-06,
"loss": 0.4748,
"step": 1873
},
{
"epoch": 16.017094017094017,
"grad_norm": 7.770556926727295,
"learning_rate": 1.9914529914529917e-06,
"loss": 0.2615,
"step": 1874
},
{
"epoch": 16.025641025641026,
"grad_norm": 1.7268822193145752,
"learning_rate": 1.987179487179487e-06,
"loss": 0.0808,
"step": 1875
},
{
"epoch": 16.034188034188034,
"grad_norm": 1.7209370136260986,
"learning_rate": 1.982905982905983e-06,
"loss": 0.0575,
"step": 1876
},
{
"epoch": 16.042735042735043,
"grad_norm": 2.6422786712646484,
"learning_rate": 1.9786324786324785e-06,
"loss": 0.0815,
"step": 1877
},
{
"epoch": 16.05128205128205,
"grad_norm": 0.9057373404502869,
"learning_rate": 1.9743589743589744e-06,
"loss": 0.0359,
"step": 1878
},
{
"epoch": 16.05982905982906,
"grad_norm": 1.4879076480865479,
"learning_rate": 1.9700854700854703e-06,
"loss": 0.0658,
"step": 1879
},
{
"epoch": 16.068376068376068,
"grad_norm": 2.1336488723754883,
"learning_rate": 1.9658119658119658e-06,
"loss": 0.0434,
"step": 1880
},
{
"epoch": 16.076923076923077,
"grad_norm": 2.642249822616577,
"learning_rate": 1.9615384615384617e-06,
"loss": 0.0768,
"step": 1881
},
{
"epoch": 16.085470085470085,
"grad_norm": 398.1800842285156,
"learning_rate": 1.9572649572649575e-06,
"loss": 1.7061,
"step": 1882
},
{
"epoch": 16.094017094017094,
"grad_norm": 1.6067556142807007,
"learning_rate": 1.952991452991453e-06,
"loss": 0.0492,
"step": 1883
},
{
"epoch": 16.102564102564102,
"grad_norm": 45.67499542236328,
"learning_rate": 1.948717948717949e-06,
"loss": 0.2883,
"step": 1884
},
{
"epoch": 16.11111111111111,
"grad_norm": 5.477624416351318,
"learning_rate": 1.944444444444445e-06,
"loss": 0.1107,
"step": 1885
},
{
"epoch": 16.11965811965812,
"grad_norm": 2.2795376777648926,
"learning_rate": 1.9401709401709403e-06,
"loss": 0.0427,
"step": 1886
},
{
"epoch": 16.128205128205128,
"grad_norm": 1.9572805166244507,
"learning_rate": 1.935897435897436e-06,
"loss": 0.04,
"step": 1887
},
{
"epoch": 16.136752136752136,
"grad_norm": 1.9205402135849,
"learning_rate": 1.931623931623932e-06,
"loss": 0.0384,
"step": 1888
},
{
"epoch": 16.145299145299145,
"grad_norm": 1.6124738454818726,
"learning_rate": 1.9273504273504275e-06,
"loss": 0.0322,
"step": 1889
},
{
"epoch": 16.153846153846153,
"grad_norm": 3.3396270275115967,
"learning_rate": 1.9230769230769234e-06,
"loss": 0.1302,
"step": 1890
},
{
"epoch": 16.162393162393162,
"grad_norm": 2.4800124168395996,
"learning_rate": 1.9188034188034193e-06,
"loss": 0.1181,
"step": 1891
},
{
"epoch": 16.17094017094017,
"grad_norm": 5.452153205871582,
"learning_rate": 1.9145299145299148e-06,
"loss": 0.2054,
"step": 1892
},
{
"epoch": 16.17948717948718,
"grad_norm": 4.445066452026367,
"learning_rate": 1.9102564102564102e-06,
"loss": 0.1649,
"step": 1893
},
{
"epoch": 16.188034188034187,
"grad_norm": 1.0402263402938843,
"learning_rate": 1.9059829059829061e-06,
"loss": 0.0285,
"step": 1894
},
{
"epoch": 16.196581196581196,
"grad_norm": 1.8124594688415527,
"learning_rate": 1.9017094017094018e-06,
"loss": 0.0717,
"step": 1895
},
{
"epoch": 16.205128205128204,
"grad_norm": 5.0620245933532715,
"learning_rate": 1.8974358974358975e-06,
"loss": 0.3833,
"step": 1896
},
{
"epoch": 16.213675213675213,
"grad_norm": 3.201596975326538,
"learning_rate": 1.8931623931623931e-06,
"loss": 0.0687,
"step": 1897
},
{
"epoch": 16.22222222222222,
"grad_norm": 0.9610732793807983,
"learning_rate": 1.888888888888889e-06,
"loss": 0.0165,
"step": 1898
},
{
"epoch": 16.23076923076923,
"grad_norm": 1.3409554958343506,
"learning_rate": 1.8846153846153847e-06,
"loss": 0.024,
"step": 1899
},
{
"epoch": 16.23931623931624,
"grad_norm": 1.2862681150436401,
"learning_rate": 1.8803418803418804e-06,
"loss": 0.042,
"step": 1900
},
{
"epoch": 16.247863247863247,
"grad_norm": 6.403625011444092,
"learning_rate": 1.8760683760683763e-06,
"loss": 0.5536,
"step": 1901
},
{
"epoch": 16.256410256410255,
"grad_norm": 3.241731882095337,
"learning_rate": 1.871794871794872e-06,
"loss": 0.1045,
"step": 1902
},
{
"epoch": 16.264957264957264,
"grad_norm": 1.1206634044647217,
"learning_rate": 1.8675213675213676e-06,
"loss": 0.0383,
"step": 1903
},
{
"epoch": 16.273504273504273,
"grad_norm": 3.3005762100219727,
"learning_rate": 1.8632478632478635e-06,
"loss": 0.0786,
"step": 1904
},
{
"epoch": 16.28205128205128,
"grad_norm": 0.44867634773254395,
"learning_rate": 1.8589743589743592e-06,
"loss": 0.0104,
"step": 1905
},
{
"epoch": 16.29059829059829,
"grad_norm": 2.7023422718048096,
"learning_rate": 1.8547008547008549e-06,
"loss": 0.1091,
"step": 1906
},
{
"epoch": 16.299145299145298,
"grad_norm": 0.9612734317779541,
"learning_rate": 1.8504273504273506e-06,
"loss": 0.0165,
"step": 1907
},
{
"epoch": 16.307692307692307,
"grad_norm": 3.0632894039154053,
"learning_rate": 1.8461538461538465e-06,
"loss": 0.1118,
"step": 1908
},
{
"epoch": 16.316239316239315,
"grad_norm": 3.932769775390625,
"learning_rate": 1.8418803418803421e-06,
"loss": 0.1084,
"step": 1909
},
{
"epoch": 16.324786324786324,
"grad_norm": 7.795356273651123,
"learning_rate": 1.8376068376068378e-06,
"loss": 0.2923,
"step": 1910
},
{
"epoch": 16.333333333333332,
"grad_norm": 1.4187766313552856,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.0408,
"step": 1911
},
{
"epoch": 16.34188034188034,
"grad_norm": 1.1020699739456177,
"learning_rate": 1.8290598290598292e-06,
"loss": 0.0168,
"step": 1912
},
{
"epoch": 16.35042735042735,
"grad_norm": 0.9890375733375549,
"learning_rate": 1.8247863247863249e-06,
"loss": 0.0391,
"step": 1913
},
{
"epoch": 16.358974358974358,
"grad_norm": 39.418235778808594,
"learning_rate": 1.8205128205128205e-06,
"loss": 0.2804,
"step": 1914
},
{
"epoch": 16.367521367521366,
"grad_norm": 1.6613589525222778,
"learning_rate": 1.8162393162393164e-06,
"loss": 0.0475,
"step": 1915
},
{
"epoch": 16.376068376068375,
"grad_norm": 4.359612464904785,
"learning_rate": 1.811965811965812e-06,
"loss": 0.2247,
"step": 1916
},
{
"epoch": 16.384615384615383,
"grad_norm": 1.970078706741333,
"learning_rate": 1.8076923076923078e-06,
"loss": 0.03,
"step": 1917
},
{
"epoch": 16.39316239316239,
"grad_norm": 2.046025037765503,
"learning_rate": 1.8034188034188035e-06,
"loss": 0.0277,
"step": 1918
},
{
"epoch": 16.4017094017094,
"grad_norm": 1.5775028467178345,
"learning_rate": 1.7991452991452994e-06,
"loss": 0.0764,
"step": 1919
},
{
"epoch": 16.41025641025641,
"grad_norm": 2.8837273120880127,
"learning_rate": 1.794871794871795e-06,
"loss": 0.0903,
"step": 1920
},
{
"epoch": 16.418803418803417,
"grad_norm": 7.059972763061523,
"learning_rate": 1.7905982905982907e-06,
"loss": 0.0679,
"step": 1921
},
{
"epoch": 16.427350427350426,
"grad_norm": 3.6101839542388916,
"learning_rate": 1.7863247863247866e-06,
"loss": 0.1402,
"step": 1922
},
{
"epoch": 16.435897435897434,
"grad_norm": 2.3459484577178955,
"learning_rate": 1.7820512820512823e-06,
"loss": 0.0751,
"step": 1923
},
{
"epoch": 16.444444444444443,
"grad_norm": 2.0556280612945557,
"learning_rate": 1.777777777777778e-06,
"loss": 0.0452,
"step": 1924
},
{
"epoch": 16.45299145299145,
"grad_norm": 0.5339368581771851,
"learning_rate": 1.7735042735042736e-06,
"loss": 0.013,
"step": 1925
},
{
"epoch": 16.46153846153846,
"grad_norm": 1.393329381942749,
"learning_rate": 1.7692307692307695e-06,
"loss": 0.038,
"step": 1926
},
{
"epoch": 16.47008547008547,
"grad_norm": 0.9439583420753479,
"learning_rate": 1.7649572649572652e-06,
"loss": 0.0228,
"step": 1927
},
{
"epoch": 16.478632478632477,
"grad_norm": 3.437713384628296,
"learning_rate": 1.7606837606837609e-06,
"loss": 0.2072,
"step": 1928
},
{
"epoch": 16.487179487179485,
"grad_norm": 1.725557804107666,
"learning_rate": 1.7564102564102563e-06,
"loss": 0.0494,
"step": 1929
},
{
"epoch": 16.495726495726494,
"grad_norm": 2.4226529598236084,
"learning_rate": 1.7521367521367522e-06,
"loss": 0.0796,
"step": 1930
},
{
"epoch": 16.504273504273506,
"grad_norm": 36.0551643371582,
"learning_rate": 1.747863247863248e-06,
"loss": 0.1966,
"step": 1931
},
{
"epoch": 16.51282051282051,
"grad_norm": 0.8370515704154968,
"learning_rate": 1.7435897435897436e-06,
"loss": 0.0346,
"step": 1932
},
{
"epoch": 16.521367521367523,
"grad_norm": 2.486854314804077,
"learning_rate": 1.7393162393162395e-06,
"loss": 0.1423,
"step": 1933
},
{
"epoch": 16.52991452991453,
"grad_norm": 3.2457993030548096,
"learning_rate": 1.7350427350427352e-06,
"loss": 0.1894,
"step": 1934
},
{
"epoch": 16.53846153846154,
"grad_norm": 2.1744906902313232,
"learning_rate": 1.7307692307692308e-06,
"loss": 0.0889,
"step": 1935
},
{
"epoch": 16.54700854700855,
"grad_norm": 1.9443250894546509,
"learning_rate": 1.7264957264957265e-06,
"loss": 0.0413,
"step": 1936
},
{
"epoch": 16.555555555555557,
"grad_norm": 2.0389249324798584,
"learning_rate": 1.7222222222222224e-06,
"loss": 0.0798,
"step": 1937
},
{
"epoch": 16.564102564102566,
"grad_norm": 4.600223064422607,
"learning_rate": 1.717948717948718e-06,
"loss": 0.0706,
"step": 1938
},
{
"epoch": 16.572649572649574,
"grad_norm": 1.4231921434402466,
"learning_rate": 1.7136752136752138e-06,
"loss": 0.0856,
"step": 1939
},
{
"epoch": 16.581196581196583,
"grad_norm": 4.8655290603637695,
"learning_rate": 1.7094017094017097e-06,
"loss": 0.2519,
"step": 1940
},
{
"epoch": 16.58974358974359,
"grad_norm": 2.6834962368011475,
"learning_rate": 1.7051282051282053e-06,
"loss": 0.0328,
"step": 1941
},
{
"epoch": 16.5982905982906,
"grad_norm": 0.625557541847229,
"learning_rate": 1.700854700854701e-06,
"loss": 0.0129,
"step": 1942
},
{
"epoch": 16.60683760683761,
"grad_norm": 10.57834243774414,
"learning_rate": 1.6965811965811967e-06,
"loss": 0.2987,
"step": 1943
},
{
"epoch": 16.615384615384617,
"grad_norm": 1.2357791662216187,
"learning_rate": 1.6923076923076926e-06,
"loss": 0.0294,
"step": 1944
},
{
"epoch": 16.623931623931625,
"grad_norm": 1.8380581140518188,
"learning_rate": 1.6880341880341883e-06,
"loss": 0.0298,
"step": 1945
},
{
"epoch": 16.632478632478634,
"grad_norm": 1.2370020151138306,
"learning_rate": 1.683760683760684e-06,
"loss": 0.0285,
"step": 1946
},
{
"epoch": 16.641025641025642,
"grad_norm": 5.922267913818359,
"learning_rate": 1.6794871794871794e-06,
"loss": 0.24,
"step": 1947
},
{
"epoch": 16.64957264957265,
"grad_norm": 2.439023494720459,
"learning_rate": 1.6752136752136753e-06,
"loss": 0.0988,
"step": 1948
},
{
"epoch": 16.65811965811966,
"grad_norm": 0.8908723592758179,
"learning_rate": 1.670940170940171e-06,
"loss": 0.026,
"step": 1949
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.8728394508361816,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.018,
"step": 1950
},
{
"epoch": 16.675213675213676,
"grad_norm": 2.7304019927978516,
"learning_rate": 1.6623931623931626e-06,
"loss": 0.1567,
"step": 1951
},
{
"epoch": 16.683760683760685,
"grad_norm": 2.8601150512695312,
"learning_rate": 1.6581196581196582e-06,
"loss": 0.0721,
"step": 1952
},
{
"epoch": 16.692307692307693,
"grad_norm": 2.5990025997161865,
"learning_rate": 1.653846153846154e-06,
"loss": 0.2296,
"step": 1953
},
{
"epoch": 16.700854700854702,
"grad_norm": 3.7956109046936035,
"learning_rate": 1.6495726495726496e-06,
"loss": 0.2565,
"step": 1954
},
{
"epoch": 16.70940170940171,
"grad_norm": 5.933072566986084,
"learning_rate": 1.6452991452991455e-06,
"loss": 0.2712,
"step": 1955
},
{
"epoch": 16.71794871794872,
"grad_norm": 0.5651862621307373,
"learning_rate": 1.6410256410256412e-06,
"loss": 0.0132,
"step": 1956
},
{
"epoch": 16.726495726495727,
"grad_norm": 3.033231735229492,
"learning_rate": 1.6367521367521368e-06,
"loss": 0.074,
"step": 1957
},
{
"epoch": 16.735042735042736,
"grad_norm": 1.3515870571136475,
"learning_rate": 1.6324786324786327e-06,
"loss": 0.0614,
"step": 1958
},
{
"epoch": 16.743589743589745,
"grad_norm": 3.091700792312622,
"learning_rate": 1.6282051282051284e-06,
"loss": 0.1284,
"step": 1959
},
{
"epoch": 16.752136752136753,
"grad_norm": 7.142216205596924,
"learning_rate": 1.623931623931624e-06,
"loss": 0.1965,
"step": 1960
},
{
"epoch": 16.76068376068376,
"grad_norm": 7.488593578338623,
"learning_rate": 1.6196581196581198e-06,
"loss": 0.2498,
"step": 1961
},
{
"epoch": 16.76923076923077,
"grad_norm": 3.943833351135254,
"learning_rate": 1.6153846153846157e-06,
"loss": 0.0967,
"step": 1962
},
{
"epoch": 16.77777777777778,
"grad_norm": 1.8732318878173828,
"learning_rate": 1.6111111111111113e-06,
"loss": 0.029,
"step": 1963
},
{
"epoch": 16.786324786324787,
"grad_norm": 2.5445902347564697,
"learning_rate": 1.606837606837607e-06,
"loss": 0.0808,
"step": 1964
},
{
"epoch": 16.794871794871796,
"grad_norm": 4.969367504119873,
"learning_rate": 1.602564102564103e-06,
"loss": 0.164,
"step": 1965
},
{
"epoch": 16.803418803418804,
"grad_norm": 1.6954468488693237,
"learning_rate": 1.5982905982905984e-06,
"loss": 0.0645,
"step": 1966
},
{
"epoch": 16.811965811965813,
"grad_norm": 1.536352276802063,
"learning_rate": 1.594017094017094e-06,
"loss": 0.0595,
"step": 1967
},
{
"epoch": 16.82051282051282,
"grad_norm": 0.7326592803001404,
"learning_rate": 1.5897435897435897e-06,
"loss": 0.0153,
"step": 1968
},
{
"epoch": 16.82905982905983,
"grad_norm": 10.959025382995605,
"learning_rate": 1.5854700854700856e-06,
"loss": 0.3274,
"step": 1969
},
{
"epoch": 16.837606837606838,
"grad_norm": 10.305845260620117,
"learning_rate": 1.5811965811965813e-06,
"loss": 0.1404,
"step": 1970
},
{
"epoch": 16.846153846153847,
"grad_norm": 7.498697280883789,
"learning_rate": 1.576923076923077e-06,
"loss": 0.2269,
"step": 1971
},
{
"epoch": 16.854700854700855,
"grad_norm": 0.29253125190734863,
"learning_rate": 1.5726495726495727e-06,
"loss": 0.0074,
"step": 1972
},
{
"epoch": 16.863247863247864,
"grad_norm": 9.320234298706055,
"learning_rate": 1.5683760683760685e-06,
"loss": 0.067,
"step": 1973
},
{
"epoch": 16.871794871794872,
"grad_norm": 6.572272300720215,
"learning_rate": 1.5641025641025642e-06,
"loss": 0.4577,
"step": 1974
},
{
"epoch": 16.88034188034188,
"grad_norm": 5.368937969207764,
"learning_rate": 1.55982905982906e-06,
"loss": 0.2016,
"step": 1975
},
{
"epoch": 16.88888888888889,
"grad_norm": 0.5891698598861694,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.0174,
"step": 1976
},
{
"epoch": 16.897435897435898,
"grad_norm": 3.045989751815796,
"learning_rate": 1.5512820512820515e-06,
"loss": 0.1748,
"step": 1977
},
{
"epoch": 16.905982905982906,
"grad_norm": 3.013834238052368,
"learning_rate": 1.5470085470085471e-06,
"loss": 0.2283,
"step": 1978
},
{
"epoch": 16.914529914529915,
"grad_norm": 1.2644447088241577,
"learning_rate": 1.5427350427350428e-06,
"loss": 0.0302,
"step": 1979
},
{
"epoch": 16.923076923076923,
"grad_norm": 4.429958820343018,
"learning_rate": 1.5384615384615387e-06,
"loss": 0.2458,
"step": 1980
},
{
"epoch": 16.931623931623932,
"grad_norm": 1.1556981801986694,
"learning_rate": 1.5341880341880344e-06,
"loss": 0.0179,
"step": 1981
},
{
"epoch": 16.94017094017094,
"grad_norm": 1.4588316679000854,
"learning_rate": 1.52991452991453e-06,
"loss": 0.1063,
"step": 1982
},
{
"epoch": 16.94871794871795,
"grad_norm": 1.124496340751648,
"learning_rate": 1.525641025641026e-06,
"loss": 0.0278,
"step": 1983
},
{
"epoch": 16.957264957264957,
"grad_norm": 0.7231981754302979,
"learning_rate": 1.5213675213675214e-06,
"loss": 0.0141,
"step": 1984
},
{
"epoch": 16.965811965811966,
"grad_norm": 1.4819642305374146,
"learning_rate": 1.5170940170940171e-06,
"loss": 0.0601,
"step": 1985
},
{
"epoch": 16.974358974358974,
"grad_norm": 0.7296791672706604,
"learning_rate": 1.5128205128205128e-06,
"loss": 0.0215,
"step": 1986
},
{
"epoch": 16.982905982905983,
"grad_norm": 15.651564598083496,
"learning_rate": 1.5085470085470087e-06,
"loss": 0.2954,
"step": 1987
},
{
"epoch": 16.99145299145299,
"grad_norm": 0.48891735076904297,
"learning_rate": 1.5042735042735044e-06,
"loss": 0.015,
"step": 1988
},
{
"epoch": 17.0,
"grad_norm": 7.363093376159668,
"learning_rate": 1.5e-06,
"loss": 0.2366,
"step": 1989
},
{
"epoch": 17.0,
"eval_loss": 0.05406723916530609,
"eval_runtime": 9.389,
"eval_samples_per_second": 49.633,
"eval_steps_per_second": 6.284,
"step": 1989
},
{
"epoch": 17.00854700854701,
"grad_norm": 2.8626017570495605,
"learning_rate": 1.4957264957264957e-06,
"loss": 0.0902,
"step": 1990
},
{
"epoch": 17.017094017094017,
"grad_norm": 2.461879253387451,
"learning_rate": 1.4914529914529916e-06,
"loss": 0.0387,
"step": 1991
},
{
"epoch": 17.025641025641026,
"grad_norm": 6.336863994598389,
"learning_rate": 1.4871794871794873e-06,
"loss": 0.196,
"step": 1992
},
{
"epoch": 17.034188034188034,
"grad_norm": 1.1044467687606812,
"learning_rate": 1.482905982905983e-06,
"loss": 0.0352,
"step": 1993
},
{
"epoch": 17.042735042735043,
"grad_norm": 3.3509342670440674,
"learning_rate": 1.4786324786324789e-06,
"loss": 0.1459,
"step": 1994
},
{
"epoch": 17.05128205128205,
"grad_norm": 3.2349629402160645,
"learning_rate": 1.4743589743589745e-06,
"loss": 0.0179,
"step": 1995
},
{
"epoch": 17.05982905982906,
"grad_norm": 3.650749921798706,
"learning_rate": 1.4700854700854702e-06,
"loss": 0.1549,
"step": 1996
},
{
"epoch": 17.068376068376068,
"grad_norm": 1.6349891424179077,
"learning_rate": 1.465811965811966e-06,
"loss": 0.0713,
"step": 1997
},
{
"epoch": 17.076923076923077,
"grad_norm": 8.602070808410645,
"learning_rate": 1.4615384615384618e-06,
"loss": 0.3582,
"step": 1998
},
{
"epoch": 17.085470085470085,
"grad_norm": 3.1162590980529785,
"learning_rate": 1.4572649572649575e-06,
"loss": 0.2455,
"step": 1999
},
{
"epoch": 17.094017094017094,
"grad_norm": 1.4878407716751099,
"learning_rate": 1.4529914529914531e-06,
"loss": 0.0195,
"step": 2000
},
{
"epoch": 17.102564102564102,
"grad_norm": 2.565297842025757,
"learning_rate": 1.448717948717949e-06,
"loss": 0.1126,
"step": 2001
},
{
"epoch": 17.11111111111111,
"grad_norm": 4.169450759887695,
"learning_rate": 1.4444444444444445e-06,
"loss": 0.1774,
"step": 2002
},
{
"epoch": 17.11965811965812,
"grad_norm": 1.8476792573928833,
"learning_rate": 1.4401709401709402e-06,
"loss": 0.0288,
"step": 2003
},
{
"epoch": 17.128205128205128,
"grad_norm": 0.7279506921768188,
"learning_rate": 1.4358974358974359e-06,
"loss": 0.0217,
"step": 2004
},
{
"epoch": 17.136752136752136,
"grad_norm": 7.387227535247803,
"learning_rate": 1.4316239316239317e-06,
"loss": 0.248,
"step": 2005
},
{
"epoch": 17.145299145299145,
"grad_norm": 2.9455361366271973,
"learning_rate": 1.4273504273504274e-06,
"loss": 0.0439,
"step": 2006
},
{
"epoch": 17.153846153846153,
"grad_norm": 6.015694618225098,
"learning_rate": 1.423076923076923e-06,
"loss": 0.0656,
"step": 2007
},
{
"epoch": 17.162393162393162,
"grad_norm": 1.741774320602417,
"learning_rate": 1.4188034188034188e-06,
"loss": 0.0344,
"step": 2008
},
{
"epoch": 17.17094017094017,
"grad_norm": 0.5282659530639648,
"learning_rate": 1.4145299145299147e-06,
"loss": 0.0128,
"step": 2009
},
{
"epoch": 17.17948717948718,
"grad_norm": 2.4927468299865723,
"learning_rate": 1.4102564102564104e-06,
"loss": 0.1839,
"step": 2010
},
{
"epoch": 17.188034188034187,
"grad_norm": 0.7872166037559509,
"learning_rate": 1.405982905982906e-06,
"loss": 0.0204,
"step": 2011
},
{
"epoch": 17.196581196581196,
"grad_norm": 0.7072253227233887,
"learning_rate": 1.401709401709402e-06,
"loss": 0.0206,
"step": 2012
},
{
"epoch": 17.205128205128204,
"grad_norm": 1.0154236555099487,
"learning_rate": 1.3974358974358976e-06,
"loss": 0.0238,
"step": 2013
},
{
"epoch": 17.213675213675213,
"grad_norm": 2.9798424243927,
"learning_rate": 1.3931623931623933e-06,
"loss": 0.0542,
"step": 2014
},
{
"epoch": 17.22222222222222,
"grad_norm": 0.9568426012992859,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.0239,
"step": 2015
},
{
"epoch": 17.23076923076923,
"grad_norm": 10.525039672851562,
"learning_rate": 1.3846153846153848e-06,
"loss": 0.1768,
"step": 2016
},
{
"epoch": 17.23931623931624,
"grad_norm": 1.697314977645874,
"learning_rate": 1.3803418803418805e-06,
"loss": 0.0453,
"step": 2017
},
{
"epoch": 17.247863247863247,
"grad_norm": 0.6436419486999512,
"learning_rate": 1.3760683760683762e-06,
"loss": 0.0163,
"step": 2018
},
{
"epoch": 17.256410256410255,
"grad_norm": 4.984555721282959,
"learning_rate": 1.371794871794872e-06,
"loss": 0.1157,
"step": 2019
},
{
"epoch": 17.264957264957264,
"grad_norm": 9.088909149169922,
"learning_rate": 1.3675213675213678e-06,
"loss": 0.2842,
"step": 2020
},
{
"epoch": 17.273504273504273,
"grad_norm": 10.398246765136719,
"learning_rate": 1.3632478632478632e-06,
"loss": 0.2528,
"step": 2021
},
{
"epoch": 17.28205128205128,
"grad_norm": 3.60273814201355,
"learning_rate": 1.358974358974359e-06,
"loss": 0.1799,
"step": 2022
},
{
"epoch": 17.29059829059829,
"grad_norm": 0.6845250129699707,
"learning_rate": 1.3547008547008548e-06,
"loss": 0.0196,
"step": 2023
},
{
"epoch": 17.299145299145298,
"grad_norm": 0.5363795161247253,
"learning_rate": 1.3504273504273505e-06,
"loss": 0.0136,
"step": 2024
},
{
"epoch": 17.307692307692307,
"grad_norm": 3.880434274673462,
"learning_rate": 1.3461538461538462e-06,
"loss": 0.3665,
"step": 2025
},
{
"epoch": 17.316239316239315,
"grad_norm": 4.580989360809326,
"learning_rate": 1.3418803418803418e-06,
"loss": 0.2593,
"step": 2026
},
{
"epoch": 17.324786324786324,
"grad_norm": 2.781501293182373,
"learning_rate": 1.3376068376068377e-06,
"loss": 0.1777,
"step": 2027
},
{
"epoch": 17.333333333333332,
"grad_norm": 5.605004787445068,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.3633,
"step": 2028
},
{
"epoch": 17.34188034188034,
"grad_norm": 1.696486473083496,
"learning_rate": 1.329059829059829e-06,
"loss": 0.0353,
"step": 2029
},
{
"epoch": 17.35042735042735,
"grad_norm": 3.4415268898010254,
"learning_rate": 1.324786324786325e-06,
"loss": 0.0906,
"step": 2030
},
{
"epoch": 17.358974358974358,
"grad_norm": 7.722592353820801,
"learning_rate": 1.3205128205128207e-06,
"loss": 0.1804,
"step": 2031
},
{
"epoch": 17.367521367521366,
"grad_norm": 3.3161542415618896,
"learning_rate": 1.3162393162393163e-06,
"loss": 0.1336,
"step": 2032
},
{
"epoch": 17.376068376068375,
"grad_norm": 2.568871021270752,
"learning_rate": 1.3119658119658122e-06,
"loss": 0.0658,
"step": 2033
},
{
"epoch": 17.384615384615383,
"grad_norm": 3.5799806118011475,
"learning_rate": 1.307692307692308e-06,
"loss": 0.0652,
"step": 2034
},
{
"epoch": 17.39316239316239,
"grad_norm": 1.1399949789047241,
"learning_rate": 1.3034188034188036e-06,
"loss": 0.0196,
"step": 2035
},
{
"epoch": 17.4017094017094,
"grad_norm": 2.3688738346099854,
"learning_rate": 1.2991452991452993e-06,
"loss": 0.0706,
"step": 2036
},
{
"epoch": 17.41025641025641,
"grad_norm": 12.726486206054688,
"learning_rate": 1.2948717948717952e-06,
"loss": 0.2506,
"step": 2037
},
{
"epoch": 17.418803418803417,
"grad_norm": 2.249285936355591,
"learning_rate": 1.2905982905982908e-06,
"loss": 0.0532,
"step": 2038
},
{
"epoch": 17.427350427350426,
"grad_norm": 0.7129601836204529,
"learning_rate": 1.2863247863247863e-06,
"loss": 0.0207,
"step": 2039
},
{
"epoch": 17.435897435897434,
"grad_norm": 1.9362183809280396,
"learning_rate": 1.282051282051282e-06,
"loss": 0.0311,
"step": 2040
},
{
"epoch": 17.444444444444443,
"grad_norm": 2.253690242767334,
"learning_rate": 1.2777777777777779e-06,
"loss": 0.1203,
"step": 2041
},
{
"epoch": 17.45299145299145,
"grad_norm": 3.835174798965454,
"learning_rate": 1.2735042735042736e-06,
"loss": 0.0928,
"step": 2042
},
{
"epoch": 17.46153846153846,
"grad_norm": 143.36563110351562,
"learning_rate": 1.2692307692307692e-06,
"loss": 0.2984,
"step": 2043
},
{
"epoch": 17.47008547008547,
"grad_norm": 0.6122754216194153,
"learning_rate": 1.264957264957265e-06,
"loss": 0.0171,
"step": 2044
},
{
"epoch": 17.478632478632477,
"grad_norm": 3.0697991847991943,
"learning_rate": 1.2606837606837608e-06,
"loss": 0.1412,
"step": 2045
},
{
"epoch": 17.487179487179485,
"grad_norm": 1.0684096813201904,
"learning_rate": 1.2564102564102565e-06,
"loss": 0.0278,
"step": 2046
},
{
"epoch": 17.495726495726494,
"grad_norm": 5.379480838775635,
"learning_rate": 1.2521367521367522e-06,
"loss": 0.1114,
"step": 2047
},
{
"epoch": 17.504273504273506,
"grad_norm": 3.893343448638916,
"learning_rate": 1.247863247863248e-06,
"loss": 0.1499,
"step": 2048
},
{
"epoch": 17.51282051282051,
"grad_norm": 1.0436211824417114,
"learning_rate": 1.2435897435897437e-06,
"loss": 0.0259,
"step": 2049
},
{
"epoch": 17.521367521367523,
"grad_norm": 2.8706037998199463,
"learning_rate": 1.2393162393162394e-06,
"loss": 0.1071,
"step": 2050
},
{
"epoch": 17.52991452991453,
"grad_norm": 1.5661158561706543,
"learning_rate": 1.2350427350427353e-06,
"loss": 0.0392,
"step": 2051
},
{
"epoch": 17.53846153846154,
"grad_norm": 3.7152199745178223,
"learning_rate": 1.230769230769231e-06,
"loss": 0.0698,
"step": 2052
},
{
"epoch": 17.54700854700855,
"grad_norm": 2.6527271270751953,
"learning_rate": 1.2264957264957264e-06,
"loss": 0.1276,
"step": 2053
},
{
"epoch": 17.555555555555557,
"grad_norm": 0.9018534421920776,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.066,
"step": 2054
},
{
"epoch": 17.564102564102566,
"grad_norm": 7.11035680770874,
"learning_rate": 1.217948717948718e-06,
"loss": 0.0836,
"step": 2055
},
{
"epoch": 17.572649572649574,
"grad_norm": 2.5168066024780273,
"learning_rate": 1.2136752136752137e-06,
"loss": 0.0662,
"step": 2056
},
{
"epoch": 17.581196581196583,
"grad_norm": 0.7215616703033447,
"learning_rate": 1.2094017094017096e-06,
"loss": 0.0186,
"step": 2057
},
{
"epoch": 17.58974358974359,
"grad_norm": 7.076876640319824,
"learning_rate": 1.2051282051282053e-06,
"loss": 0.1493,
"step": 2058
},
{
"epoch": 17.5982905982906,
"grad_norm": 1.1687662601470947,
"learning_rate": 1.200854700854701e-06,
"loss": 0.0368,
"step": 2059
},
{
"epoch": 17.60683760683761,
"grad_norm": 2.5085737705230713,
"learning_rate": 1.1965811965811968e-06,
"loss": 0.1567,
"step": 2060
},
{
"epoch": 17.615384615384617,
"grad_norm": 0.43566644191741943,
"learning_rate": 1.1923076923076925e-06,
"loss": 0.0097,
"step": 2061
},
{
"epoch": 17.623931623931625,
"grad_norm": 0.7698078155517578,
"learning_rate": 1.188034188034188e-06,
"loss": 0.0231,
"step": 2062
},
{
"epoch": 17.632478632478634,
"grad_norm": 1.8352185487747192,
"learning_rate": 1.1837606837606839e-06,
"loss": 0.0324,
"step": 2063
},
{
"epoch": 17.641025641025642,
"grad_norm": 12.11907958984375,
"learning_rate": 1.1794871794871795e-06,
"loss": 0.6052,
"step": 2064
},
{
"epoch": 17.64957264957265,
"grad_norm": 0.49942728877067566,
"learning_rate": 1.1752136752136752e-06,
"loss": 0.0111,
"step": 2065
},
{
"epoch": 17.65811965811966,
"grad_norm": 3.579129457473755,
"learning_rate": 1.1709401709401711e-06,
"loss": 0.1706,
"step": 2066
},
{
"epoch": 17.666666666666668,
"grad_norm": 2.112550973892212,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.0438,
"step": 2067
},
{
"epoch": 17.675213675213676,
"grad_norm": 2.4429895877838135,
"learning_rate": 1.1623931623931625e-06,
"loss": 0.0498,
"step": 2068
},
{
"epoch": 17.683760683760685,
"grad_norm": 1.8436684608459473,
"learning_rate": 1.1581196581196584e-06,
"loss": 0.1228,
"step": 2069
},
{
"epoch": 17.692307692307693,
"grad_norm": 4.679569244384766,
"learning_rate": 1.153846153846154e-06,
"loss": 0.1505,
"step": 2070
},
{
"epoch": 17.700854700854702,
"grad_norm": 2.4409713745117188,
"learning_rate": 1.1495726495726495e-06,
"loss": 0.0603,
"step": 2071
},
{
"epoch": 17.70940170940171,
"grad_norm": 3.577721118927002,
"learning_rate": 1.1452991452991454e-06,
"loss": 0.1078,
"step": 2072
},
{
"epoch": 17.71794871794872,
"grad_norm": 3.774958372116089,
"learning_rate": 1.141025641025641e-06,
"loss": 0.3782,
"step": 2073
},
{
"epoch": 17.726495726495727,
"grad_norm": 2.9011383056640625,
"learning_rate": 1.1367521367521368e-06,
"loss": 0.0714,
"step": 2074
},
{
"epoch": 17.735042735042736,
"grad_norm": 1.7296162843704224,
"learning_rate": 1.1324786324786326e-06,
"loss": 0.0463,
"step": 2075
},
{
"epoch": 17.743589743589745,
"grad_norm": 1.8955838680267334,
"learning_rate": 1.1282051282051283e-06,
"loss": 0.0641,
"step": 2076
},
{
"epoch": 17.752136752136753,
"grad_norm": 3.0198490619659424,
"learning_rate": 1.123931623931624e-06,
"loss": 0.1516,
"step": 2077
},
{
"epoch": 17.76068376068376,
"grad_norm": 1.5012823343276978,
"learning_rate": 1.1196581196581199e-06,
"loss": 0.0206,
"step": 2078
},
{
"epoch": 17.76923076923077,
"grad_norm": 2.4390790462493896,
"learning_rate": 1.1153846153846156e-06,
"loss": 0.0458,
"step": 2079
},
{
"epoch": 17.77777777777778,
"grad_norm": 5.728135585784912,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0443,
"step": 2080
},
{
"epoch": 17.786324786324787,
"grad_norm": 1.423771858215332,
"learning_rate": 1.106837606837607e-06,
"loss": 0.0223,
"step": 2081
},
{
"epoch": 17.794871794871796,
"grad_norm": 2.524941921234131,
"learning_rate": 1.1025641025641026e-06,
"loss": 0.0587,
"step": 2082
},
{
"epoch": 17.803418803418804,
"grad_norm": 0.9632331132888794,
"learning_rate": 1.0982905982905983e-06,
"loss": 0.0324,
"step": 2083
},
{
"epoch": 17.811965811965813,
"grad_norm": 1.8369181156158447,
"learning_rate": 1.0940170940170942e-06,
"loss": 0.0182,
"step": 2084
},
{
"epoch": 17.82051282051282,
"grad_norm": 2.547654867172241,
"learning_rate": 1.0897435897435899e-06,
"loss": 0.1395,
"step": 2085
},
{
"epoch": 17.82905982905983,
"grad_norm": 3.516977310180664,
"learning_rate": 1.0854700854700855e-06,
"loss": 0.1044,
"step": 2086
},
{
"epoch": 17.837606837606838,
"grad_norm": 1.7064217329025269,
"learning_rate": 1.0811965811965814e-06,
"loss": 0.0302,
"step": 2087
},
{
"epoch": 17.846153846153847,
"grad_norm": 1.7427505254745483,
"learning_rate": 1.076923076923077e-06,
"loss": 0.0298,
"step": 2088
},
{
"epoch": 17.854700854700855,
"grad_norm": 1.3395370244979858,
"learning_rate": 1.0726495726495726e-06,
"loss": 0.0302,
"step": 2089
},
{
"epoch": 17.863247863247864,
"grad_norm": 7.244344711303711,
"learning_rate": 1.0683760683760685e-06,
"loss": 0.1925,
"step": 2090
},
{
"epoch": 17.871794871794872,
"grad_norm": 5.942878723144531,
"learning_rate": 1.0641025641025641e-06,
"loss": 0.489,
"step": 2091
},
{
"epoch": 17.88034188034188,
"grad_norm": 3.244260787963867,
"learning_rate": 1.0598290598290598e-06,
"loss": 0.2538,
"step": 2092
},
{
"epoch": 17.88888888888889,
"grad_norm": 0.9833334684371948,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0215,
"step": 2093
},
{
"epoch": 17.897435897435898,
"grad_norm": 3.0194849967956543,
"learning_rate": 1.0512820512820514e-06,
"loss": 0.07,
"step": 2094
},
{
"epoch": 17.905982905982906,
"grad_norm": 0.48535388708114624,
"learning_rate": 1.047008547008547e-06,
"loss": 0.0113,
"step": 2095
},
{
"epoch": 17.914529914529915,
"grad_norm": 4.334452152252197,
"learning_rate": 1.042735042735043e-06,
"loss": 0.127,
"step": 2096
},
{
"epoch": 17.923076923076923,
"grad_norm": 3.54429030418396,
"learning_rate": 1.0384615384615386e-06,
"loss": 0.0704,
"step": 2097
},
{
"epoch": 17.931623931623932,
"grad_norm": 1.1745219230651855,
"learning_rate": 1.034188034188034e-06,
"loss": 0.0418,
"step": 2098
},
{
"epoch": 17.94017094017094,
"grad_norm": 5.157544136047363,
"learning_rate": 1.02991452991453e-06,
"loss": 0.2562,
"step": 2099
},
{
"epoch": 17.94871794871795,
"grad_norm": 4.454767227172852,
"learning_rate": 1.0256410256410257e-06,
"loss": 0.1141,
"step": 2100
},
{
"epoch": 17.957264957264957,
"grad_norm": 12.859573364257812,
"learning_rate": 1.0213675213675213e-06,
"loss": 0.3516,
"step": 2101
},
{
"epoch": 17.965811965811966,
"grad_norm": 5.780513763427734,
"learning_rate": 1.0170940170940172e-06,
"loss": 0.1663,
"step": 2102
},
{
"epoch": 17.974358974358974,
"grad_norm": 2.762153387069702,
"learning_rate": 1.012820512820513e-06,
"loss": 0.19,
"step": 2103
},
{
"epoch": 17.982905982905983,
"grad_norm": 5.649252891540527,
"learning_rate": 1.0085470085470086e-06,
"loss": 0.1736,
"step": 2104
},
{
"epoch": 17.99145299145299,
"grad_norm": 5.10836124420166,
"learning_rate": 1.0042735042735045e-06,
"loss": 0.1739,
"step": 2105
},
{
"epoch": 18.0,
"grad_norm": 6.474237442016602,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.3239,
"step": 2106
},
{
"epoch": 18.0,
"eval_loss": 0.052614517509937286,
"eval_runtime": 9.28,
"eval_samples_per_second": 50.216,
"eval_steps_per_second": 6.358,
"step": 2106
},
{
"epoch": 18.00854700854701,
"grad_norm": 0.8820164203643799,
"learning_rate": 9.957264957264958e-07,
"loss": 0.0237,
"step": 2107
},
{
"epoch": 18.017094017094017,
"grad_norm": 2.692166566848755,
"learning_rate": 9.914529914529915e-07,
"loss": 0.0962,
"step": 2108
},
{
"epoch": 18.025641025641026,
"grad_norm": 0.8048399090766907,
"learning_rate": 9.871794871794872e-07,
"loss": 0.0232,
"step": 2109
},
{
"epoch": 18.034188034188034,
"grad_norm": 4.4439826011657715,
"learning_rate": 9.829059829059829e-07,
"loss": 0.064,
"step": 2110
},
{
"epoch": 18.042735042735043,
"grad_norm": 1.62433660030365,
"learning_rate": 9.786324786324788e-07,
"loss": 0.1263,
"step": 2111
},
{
"epoch": 18.05128205128205,
"grad_norm": 4.766104221343994,
"learning_rate": 9.743589743589745e-07,
"loss": 0.2108,
"step": 2112
},
{
"epoch": 18.05982905982906,
"grad_norm": 139.34445190429688,
"learning_rate": 9.700854700854701e-07,
"loss": 0.237,
"step": 2113
},
{
"epoch": 18.068376068376068,
"grad_norm": 0.6069220900535583,
"learning_rate": 9.65811965811966e-07,
"loss": 0.0135,
"step": 2114
},
{
"epoch": 18.076923076923077,
"grad_norm": 2.7833995819091797,
"learning_rate": 9.615384615384617e-07,
"loss": 0.1677,
"step": 2115
},
{
"epoch": 18.085470085470085,
"grad_norm": 4.570268630981445,
"learning_rate": 9.572649572649574e-07,
"loss": 0.2304,
"step": 2116
},
{
"epoch": 18.094017094017094,
"grad_norm": 4.7644805908203125,
"learning_rate": 9.529914529914531e-07,
"loss": 0.138,
"step": 2117
},
{
"epoch": 18.102564102564102,
"grad_norm": 1.9438762664794922,
"learning_rate": 9.487179487179487e-07,
"loss": 0.0488,
"step": 2118
},
{
"epoch": 18.11111111111111,
"grad_norm": 1.4188040494918823,
"learning_rate": 9.444444444444445e-07,
"loss": 0.0545,
"step": 2119
},
{
"epoch": 18.11965811965812,
"grad_norm": 0.357928603887558,
"learning_rate": 9.401709401709402e-07,
"loss": 0.0092,
"step": 2120
},
{
"epoch": 18.128205128205128,
"grad_norm": 1.8646256923675537,
"learning_rate": 9.35897435897436e-07,
"loss": 0.086,
"step": 2121
},
{
"epoch": 18.136752136752136,
"grad_norm": 2.111544609069824,
"learning_rate": 9.316239316239318e-07,
"loss": 0.0319,
"step": 2122
},
{
"epoch": 18.145299145299145,
"grad_norm": 3.0686893463134766,
"learning_rate": 9.273504273504274e-07,
"loss": 0.0689,
"step": 2123
},
{
"epoch": 18.153846153846153,
"grad_norm": 4.028079509735107,
"learning_rate": 9.230769230769232e-07,
"loss": 0.125,
"step": 2124
},
{
"epoch": 18.162393162393162,
"grad_norm": 1.0433181524276733,
"learning_rate": 9.188034188034189e-07,
"loss": 0.0174,
"step": 2125
},
{
"epoch": 18.17094017094017,
"grad_norm": 3.4533402919769287,
"learning_rate": 9.145299145299146e-07,
"loss": 0.1556,
"step": 2126
},
{
"epoch": 18.17948717948718,
"grad_norm": 11.187241554260254,
"learning_rate": 9.102564102564103e-07,
"loss": 0.2578,
"step": 2127
},
{
"epoch": 18.188034188034187,
"grad_norm": 2.544975757598877,
"learning_rate": 9.05982905982906e-07,
"loss": 0.0868,
"step": 2128
},
{
"epoch": 18.196581196581196,
"grad_norm": 2.490493059158325,
"learning_rate": 9.017094017094017e-07,
"loss": 0.1575,
"step": 2129
},
{
"epoch": 18.205128205128204,
"grad_norm": 4.665895938873291,
"learning_rate": 8.974358974358975e-07,
"loss": 0.1644,
"step": 2130
},
{
"epoch": 18.213675213675213,
"grad_norm": 3.135772943496704,
"learning_rate": 8.931623931623933e-07,
"loss": 0.205,
"step": 2131
},
{
"epoch": 18.22222222222222,
"grad_norm": 1.5636606216430664,
"learning_rate": 8.88888888888889e-07,
"loss": 0.0541,
"step": 2132
},
{
"epoch": 18.23076923076923,
"grad_norm": 3.603691816329956,
"learning_rate": 8.846153846153848e-07,
"loss": 0.0478,
"step": 2133
},
{
"epoch": 18.23931623931624,
"grad_norm": 2.6537222862243652,
"learning_rate": 8.803418803418804e-07,
"loss": 0.1206,
"step": 2134
},
{
"epoch": 18.247863247863247,
"grad_norm": 5.086421966552734,
"learning_rate": 8.760683760683761e-07,
"loss": 0.1212,
"step": 2135
},
{
"epoch": 18.256410256410255,
"grad_norm": 4.673394203186035,
"learning_rate": 8.717948717948718e-07,
"loss": 0.0588,
"step": 2136
},
{
"epoch": 18.264957264957264,
"grad_norm": 2.1376845836639404,
"learning_rate": 8.675213675213676e-07,
"loss": 0.0492,
"step": 2137
},
{
"epoch": 18.273504273504273,
"grad_norm": 2.8616504669189453,
"learning_rate": 8.632478632478633e-07,
"loss": 0.1834,
"step": 2138
},
{
"epoch": 18.28205128205128,
"grad_norm": 2.7179784774780273,
"learning_rate": 8.58974358974359e-07,
"loss": 0.1508,
"step": 2139
},
{
"epoch": 18.29059829059829,
"grad_norm": 1.1909416913986206,
"learning_rate": 8.547008547008548e-07,
"loss": 0.0721,
"step": 2140
},
{
"epoch": 18.299145299145298,
"grad_norm": 1.8272216320037842,
"learning_rate": 8.504273504273505e-07,
"loss": 0.0797,
"step": 2141
},
{
"epoch": 18.307692307692307,
"grad_norm": 4.394528388977051,
"learning_rate": 8.461538461538463e-07,
"loss": 0.2762,
"step": 2142
},
{
"epoch": 18.316239316239315,
"grad_norm": 4.276169776916504,
"learning_rate": 8.41880341880342e-07,
"loss": 0.0969,
"step": 2143
},
{
"epoch": 18.324786324786324,
"grad_norm": 2.0932376384735107,
"learning_rate": 8.376068376068377e-07,
"loss": 0.0595,
"step": 2144
},
{
"epoch": 18.333333333333332,
"grad_norm": 5.714378833770752,
"learning_rate": 8.333333333333333e-07,
"loss": 0.1176,
"step": 2145
},
{
"epoch": 18.34188034188034,
"grad_norm": 1.1050394773483276,
"learning_rate": 8.290598290598291e-07,
"loss": 0.0284,
"step": 2146
},
{
"epoch": 18.35042735042735,
"grad_norm": 3.2809271812438965,
"learning_rate": 8.247863247863248e-07,
"loss": 0.0737,
"step": 2147
},
{
"epoch": 18.358974358974358,
"grad_norm": 2.102889060974121,
"learning_rate": 8.205128205128206e-07,
"loss": 0.0477,
"step": 2148
},
{
"epoch": 18.367521367521366,
"grad_norm": 1.5728402137756348,
"learning_rate": 8.162393162393164e-07,
"loss": 0.0476,
"step": 2149
},
{
"epoch": 18.376068376068375,
"grad_norm": 2.0337905883789062,
"learning_rate": 8.11965811965812e-07,
"loss": 0.019,
"step": 2150
},
{
"epoch": 18.384615384615383,
"grad_norm": 5.475340843200684,
"learning_rate": 8.076923076923078e-07,
"loss": 0.1625,
"step": 2151
},
{
"epoch": 18.39316239316239,
"grad_norm": 0.4993753135204315,
"learning_rate": 8.034188034188035e-07,
"loss": 0.0132,
"step": 2152
},
{
"epoch": 18.4017094017094,
"grad_norm": 4.052933216094971,
"learning_rate": 7.991452991452992e-07,
"loss": 0.1603,
"step": 2153
},
{
"epoch": 18.41025641025641,
"grad_norm": 3.005293607711792,
"learning_rate": 7.948717948717949e-07,
"loss": 0.0399,
"step": 2154
},
{
"epoch": 18.418803418803417,
"grad_norm": 3.0186731815338135,
"learning_rate": 7.905982905982906e-07,
"loss": 0.0564,
"step": 2155
},
{
"epoch": 18.427350427350426,
"grad_norm": 5.522226333618164,
"learning_rate": 7.863247863247863e-07,
"loss": 0.1138,
"step": 2156
},
{
"epoch": 18.435897435897434,
"grad_norm": 5.463916301727295,
"learning_rate": 7.820512820512821e-07,
"loss": 0.4811,
"step": 2157
},
{
"epoch": 18.444444444444443,
"grad_norm": 0.41404595971107483,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0114,
"step": 2158
},
{
"epoch": 18.45299145299145,
"grad_norm": 0.9279537200927734,
"learning_rate": 7.735042735042736e-07,
"loss": 0.0268,
"step": 2159
},
{
"epoch": 18.46153846153846,
"grad_norm": 0.5745738744735718,
"learning_rate": 7.692307692307694e-07,
"loss": 0.0155,
"step": 2160
},
{
"epoch": 18.47008547008547,
"grad_norm": 2.329507827758789,
"learning_rate": 7.64957264957265e-07,
"loss": 0.0421,
"step": 2161
},
{
"epoch": 18.478632478632477,
"grad_norm": 2.934424638748169,
"learning_rate": 7.606837606837607e-07,
"loss": 0.0925,
"step": 2162
},
{
"epoch": 18.487179487179485,
"grad_norm": 3.226261854171753,
"learning_rate": 7.564102564102564e-07,
"loss": 0.1914,
"step": 2163
},
{
"epoch": 18.495726495726494,
"grad_norm": 1.2033684253692627,
"learning_rate": 7.521367521367522e-07,
"loss": 0.0218,
"step": 2164
},
{
"epoch": 18.504273504273506,
"grad_norm": 1.092015266418457,
"learning_rate": 7.478632478632479e-07,
"loss": 0.0165,
"step": 2165
},
{
"epoch": 18.51282051282051,
"grad_norm": 1.2283809185028076,
"learning_rate": 7.435897435897436e-07,
"loss": 0.025,
"step": 2166
},
{
"epoch": 18.521367521367523,
"grad_norm": 6.3457722663879395,
"learning_rate": 7.393162393162394e-07,
"loss": 0.2224,
"step": 2167
},
{
"epoch": 18.52991452991453,
"grad_norm": 4.920536518096924,
"learning_rate": 7.350427350427351e-07,
"loss": 0.1381,
"step": 2168
},
{
"epoch": 18.53846153846154,
"grad_norm": 4.16088342666626,
"learning_rate": 7.307692307692309e-07,
"loss": 0.2725,
"step": 2169
},
{
"epoch": 18.54700854700855,
"grad_norm": 1.4776932001113892,
"learning_rate": 7.264957264957266e-07,
"loss": 0.0236,
"step": 2170
},
{
"epoch": 18.555555555555557,
"grad_norm": 5.517492294311523,
"learning_rate": 7.222222222222222e-07,
"loss": 0.3427,
"step": 2171
},
{
"epoch": 18.564102564102566,
"grad_norm": 0.7798398733139038,
"learning_rate": 7.179487179487179e-07,
"loss": 0.0139,
"step": 2172
},
{
"epoch": 18.572649572649574,
"grad_norm": 0.7174245119094849,
"learning_rate": 7.136752136752137e-07,
"loss": 0.0144,
"step": 2173
},
{
"epoch": 18.581196581196583,
"grad_norm": 5.118779182434082,
"learning_rate": 7.094017094017094e-07,
"loss": 0.1899,
"step": 2174
},
{
"epoch": 18.58974358974359,
"grad_norm": 2.8726353645324707,
"learning_rate": 7.051282051282052e-07,
"loss": 0.1177,
"step": 2175
},
{
"epoch": 18.5982905982906,
"grad_norm": 2.3775036334991455,
"learning_rate": 7.00854700854701e-07,
"loss": 0.1183,
"step": 2176
},
{
"epoch": 18.60683760683761,
"grad_norm": 19.23975944519043,
"learning_rate": 6.965811965811966e-07,
"loss": 0.4534,
"step": 2177
},
{
"epoch": 18.615384615384617,
"grad_norm": 1.3832803964614868,
"learning_rate": 6.923076923076924e-07,
"loss": 0.0309,
"step": 2178
},
{
"epoch": 18.623931623931625,
"grad_norm": 1.6752214431762695,
"learning_rate": 6.880341880341881e-07,
"loss": 0.0201,
"step": 2179
},
{
"epoch": 18.632478632478634,
"grad_norm": 3.1885950565338135,
"learning_rate": 6.837606837606839e-07,
"loss": 0.1242,
"step": 2180
},
{
"epoch": 18.641025641025642,
"grad_norm": 0.9290790557861328,
"learning_rate": 6.794871794871795e-07,
"loss": 0.0189,
"step": 2181
},
{
"epoch": 18.64957264957265,
"grad_norm": 0.25725051760673523,
"learning_rate": 6.752136752136752e-07,
"loss": 0.0065,
"step": 2182
},
{
"epoch": 18.65811965811966,
"grad_norm": 1.9815839529037476,
"learning_rate": 6.709401709401709e-07,
"loss": 0.0576,
"step": 2183
},
{
"epoch": 18.666666666666668,
"grad_norm": 1.924490213394165,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0671,
"step": 2184
},
{
"epoch": 18.675213675213676,
"grad_norm": 2.9947164058685303,
"learning_rate": 6.623931623931625e-07,
"loss": 0.1859,
"step": 2185
},
{
"epoch": 18.683760683760685,
"grad_norm": 1.8680211305618286,
"learning_rate": 6.581196581196582e-07,
"loss": 0.1028,
"step": 2186
},
{
"epoch": 18.692307692307693,
"grad_norm": 0.823103666305542,
"learning_rate": 6.53846153846154e-07,
"loss": 0.0198,
"step": 2187
},
{
"epoch": 18.700854700854702,
"grad_norm": 2.3616061210632324,
"learning_rate": 6.495726495726496e-07,
"loss": 0.1025,
"step": 2188
},
{
"epoch": 18.70940170940171,
"grad_norm": 3.1370067596435547,
"learning_rate": 6.452991452991454e-07,
"loss": 0.0438,
"step": 2189
},
{
"epoch": 18.71794871794872,
"grad_norm": 8.058025360107422,
"learning_rate": 6.41025641025641e-07,
"loss": 0.082,
"step": 2190
},
{
"epoch": 18.726495726495727,
"grad_norm": 2.1969916820526123,
"learning_rate": 6.367521367521368e-07,
"loss": 0.1074,
"step": 2191
},
{
"epoch": 18.735042735042736,
"grad_norm": 2.5845255851745605,
"learning_rate": 6.324786324786325e-07,
"loss": 0.0795,
"step": 2192
},
{
"epoch": 18.743589743589745,
"grad_norm": 3.578331708908081,
"learning_rate": 6.282051282051282e-07,
"loss": 0.1111,
"step": 2193
},
{
"epoch": 18.752136752136753,
"grad_norm": 1.5390626192092896,
"learning_rate": 6.23931623931624e-07,
"loss": 0.064,
"step": 2194
},
{
"epoch": 18.76068376068376,
"grad_norm": 3.1742804050445557,
"learning_rate": 6.196581196581197e-07,
"loss": 0.0971,
"step": 2195
},
{
"epoch": 18.76923076923077,
"grad_norm": 1.7017542123794556,
"learning_rate": 6.153846153846155e-07,
"loss": 0.0424,
"step": 2196
},
{
"epoch": 18.77777777777778,
"grad_norm": 2.642102003097534,
"learning_rate": 6.111111111111112e-07,
"loss": 0.1243,
"step": 2197
},
{
"epoch": 18.786324786324787,
"grad_norm": 1.2010291814804077,
"learning_rate": 6.068376068376068e-07,
"loss": 0.0375,
"step": 2198
},
{
"epoch": 18.794871794871796,
"grad_norm": 3.1580190658569336,
"learning_rate": 6.025641025641026e-07,
"loss": 0.0565,
"step": 2199
},
{
"epoch": 18.803418803418804,
"grad_norm": 2.7660391330718994,
"learning_rate": 5.982905982905984e-07,
"loss": 0.0385,
"step": 2200
},
{
"epoch": 18.811965811965813,
"grad_norm": 0.7716617584228516,
"learning_rate": 5.94017094017094e-07,
"loss": 0.0159,
"step": 2201
},
{
"epoch": 18.82051282051282,
"grad_norm": 3.190251588821411,
"learning_rate": 5.897435897435898e-07,
"loss": 0.241,
"step": 2202
},
{
"epoch": 18.82905982905983,
"grad_norm": 7.115220069885254,
"learning_rate": 5.854700854700856e-07,
"loss": 0.1777,
"step": 2203
},
{
"epoch": 18.837606837606838,
"grad_norm": 5.071573257446289,
"learning_rate": 5.811965811965812e-07,
"loss": 0.5421,
"step": 2204
},
{
"epoch": 18.846153846153847,
"grad_norm": 3.8419785499572754,
"learning_rate": 5.76923076923077e-07,
"loss": 0.0784,
"step": 2205
},
{
"epoch": 18.854700854700855,
"grad_norm": 2.8234896659851074,
"learning_rate": 5.726495726495727e-07,
"loss": 0.1071,
"step": 2206
},
{
"epoch": 18.863247863247864,
"grad_norm": 1.4067480564117432,
"learning_rate": 5.683760683760684e-07,
"loss": 0.0375,
"step": 2207
},
{
"epoch": 18.871794871794872,
"grad_norm": 2.508589029312134,
"learning_rate": 5.641025641025642e-07,
"loss": 0.0921,
"step": 2208
},
{
"epoch": 18.88034188034188,
"grad_norm": 7.314038276672363,
"learning_rate": 5.598290598290599e-07,
"loss": 0.3581,
"step": 2209
},
{
"epoch": 18.88888888888889,
"grad_norm": 4.375041961669922,
"learning_rate": 5.555555555555555e-07,
"loss": 0.1115,
"step": 2210
},
{
"epoch": 18.897435897435898,
"grad_norm": 4.789741516113281,
"learning_rate": 5.512820512820513e-07,
"loss": 0.1813,
"step": 2211
},
{
"epoch": 18.905982905982906,
"grad_norm": 3.008720874786377,
"learning_rate": 5.470085470085471e-07,
"loss": 0.104,
"step": 2212
},
{
"epoch": 18.914529914529915,
"grad_norm": 0.6364433765411377,
"learning_rate": 5.427350427350428e-07,
"loss": 0.0153,
"step": 2213
},
{
"epoch": 18.923076923076923,
"grad_norm": 1.4009958505630493,
"learning_rate": 5.384615384615386e-07,
"loss": 0.0499,
"step": 2214
},
{
"epoch": 18.931623931623932,
"grad_norm": 4.53135347366333,
"learning_rate": 5.341880341880342e-07,
"loss": 0.1021,
"step": 2215
},
{
"epoch": 18.94017094017094,
"grad_norm": 0.7855163216590881,
"learning_rate": 5.299145299145299e-07,
"loss": 0.0297,
"step": 2216
},
{
"epoch": 18.94871794871795,
"grad_norm": 1.5316343307495117,
"learning_rate": 5.256410256410257e-07,
"loss": 0.0438,
"step": 2217
},
{
"epoch": 18.957264957264957,
"grad_norm": 1.2713849544525146,
"learning_rate": 5.213675213675215e-07,
"loss": 0.0311,
"step": 2218
},
{
"epoch": 18.965811965811966,
"grad_norm": 1.612418293952942,
"learning_rate": 5.17094017094017e-07,
"loss": 0.0796,
"step": 2219
},
{
"epoch": 18.974358974358974,
"grad_norm": 6.046596527099609,
"learning_rate": 5.128205128205128e-07,
"loss": 0.0835,
"step": 2220
},
{
"epoch": 18.982905982905983,
"grad_norm": 2.527993679046631,
"learning_rate": 5.085470085470086e-07,
"loss": 0.0448,
"step": 2221
},
{
"epoch": 18.99145299145299,
"grad_norm": 0.9519897699356079,
"learning_rate": 5.042735042735043e-07,
"loss": 0.0223,
"step": 2222
},
{
"epoch": 19.0,
"grad_norm": 14.08708667755127,
"learning_rate": 5.000000000000001e-07,
"loss": 0.6753,
"step": 2223
},
{
"epoch": 19.0,
"eval_loss": 0.05170569196343422,
"eval_runtime": 9.3972,
"eval_samples_per_second": 49.589,
"eval_steps_per_second": 6.278,
"step": 2223
},
{
"epoch": 19.00854700854701,
"grad_norm": 5.215019702911377,
"learning_rate": 4.957264957264958e-07,
"loss": 0.1614,
"step": 2224
},
{
"epoch": 19.017094017094017,
"grad_norm": 2.855567216873169,
"learning_rate": 4.914529914529914e-07,
"loss": 0.1051,
"step": 2225
},
{
"epoch": 19.025641025641026,
"grad_norm": 4.078762054443359,
"learning_rate": 4.871794871794872e-07,
"loss": 0.2859,
"step": 2226
},
{
"epoch": 19.034188034188034,
"grad_norm": 0.9259152412414551,
"learning_rate": 4.82905982905983e-07,
"loss": 0.0257,
"step": 2227
},
{
"epoch": 19.042735042735043,
"grad_norm": 3.629925012588501,
"learning_rate": 4.786324786324787e-07,
"loss": 0.1283,
"step": 2228
},
{
"epoch": 19.05128205128205,
"grad_norm": 3.104196310043335,
"learning_rate": 4.7435897435897437e-07,
"loss": 0.0701,
"step": 2229
},
{
"epoch": 19.05982905982906,
"grad_norm": 8.760592460632324,
"learning_rate": 4.700854700854701e-07,
"loss": 0.5793,
"step": 2230
},
{
"epoch": 19.068376068376068,
"grad_norm": 1.2966917753219604,
"learning_rate": 4.658119658119659e-07,
"loss": 0.0573,
"step": 2231
},
{
"epoch": 19.076923076923077,
"grad_norm": 1.7045038938522339,
"learning_rate": 4.615384615384616e-07,
"loss": 0.0497,
"step": 2232
},
{
"epoch": 19.085470085470085,
"grad_norm": 7.805142402648926,
"learning_rate": 4.572649572649573e-07,
"loss": 0.2898,
"step": 2233
},
{
"epoch": 19.094017094017094,
"grad_norm": 0.5019100308418274,
"learning_rate": 4.52991452991453e-07,
"loss": 0.0132,
"step": 2234
},
{
"epoch": 19.102564102564102,
"grad_norm": 3.1100540161132812,
"learning_rate": 4.4871794871794876e-07,
"loss": 0.0874,
"step": 2235
},
{
"epoch": 19.11111111111111,
"grad_norm": 0.40422680974006653,
"learning_rate": 4.444444444444445e-07,
"loss": 0.012,
"step": 2236
},
{
"epoch": 19.11965811965812,
"grad_norm": 1.2845938205718994,
"learning_rate": 4.401709401709402e-07,
"loss": 0.0259,
"step": 2237
},
{
"epoch": 19.128205128205128,
"grad_norm": 4.621537208557129,
"learning_rate": 4.358974358974359e-07,
"loss": 0.246,
"step": 2238
},
{
"epoch": 19.136752136752136,
"grad_norm": 1.1688278913497925,
"learning_rate": 4.3162393162393163e-07,
"loss": 0.0804,
"step": 2239
},
{
"epoch": 19.145299145299145,
"grad_norm": 10.896872520446777,
"learning_rate": 4.273504273504274e-07,
"loss": 0.2695,
"step": 2240
},
{
"epoch": 19.153846153846153,
"grad_norm": 2.7485415935516357,
"learning_rate": 4.2307692307692315e-07,
"loss": 0.0474,
"step": 2241
},
{
"epoch": 19.162393162393162,
"grad_norm": 1.1686739921569824,
"learning_rate": 4.188034188034188e-07,
"loss": 0.0257,
"step": 2242
},
{
"epoch": 19.17094017094017,
"grad_norm": 3.5579254627227783,
"learning_rate": 4.1452991452991456e-07,
"loss": 0.0419,
"step": 2243
},
{
"epoch": 19.17948717948718,
"grad_norm": 3.088649034500122,
"learning_rate": 4.102564102564103e-07,
"loss": 0.1229,
"step": 2244
},
{
"epoch": 19.188034188034187,
"grad_norm": 1.4894665479660034,
"learning_rate": 4.05982905982906e-07,
"loss": 0.0414,
"step": 2245
},
{
"epoch": 19.196581196581196,
"grad_norm": 5.022091865539551,
"learning_rate": 4.0170940170940175e-07,
"loss": 0.1423,
"step": 2246
},
{
"epoch": 19.205128205128204,
"grad_norm": 1.6117054224014282,
"learning_rate": 3.9743589743589743e-07,
"loss": 0.0244,
"step": 2247
},
{
"epoch": 19.213675213675213,
"grad_norm": 0.5429085493087769,
"learning_rate": 3.9316239316239316e-07,
"loss": 0.0122,
"step": 2248
},
{
"epoch": 19.22222222222222,
"grad_norm": 7.429282188415527,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.122,
"step": 2249
},
{
"epoch": 19.23076923076923,
"grad_norm": 4.492022514343262,
"learning_rate": 3.846153846153847e-07,
"loss": 0.3181,
"step": 2250
},
{
"epoch": 19.23931623931624,
"grad_norm": 5.219499588012695,
"learning_rate": 3.8034188034188036e-07,
"loss": 0.1374,
"step": 2251
},
{
"epoch": 19.247863247863247,
"grad_norm": 3.454345941543579,
"learning_rate": 3.760683760683761e-07,
"loss": 0.147,
"step": 2252
},
{
"epoch": 19.256410256410255,
"grad_norm": 0.6370477080345154,
"learning_rate": 3.717948717948718e-07,
"loss": 0.0154,
"step": 2253
},
{
"epoch": 19.264957264957264,
"grad_norm": 1.7189971208572388,
"learning_rate": 3.6752136752136755e-07,
"loss": 0.0635,
"step": 2254
},
{
"epoch": 19.273504273504273,
"grad_norm": 2.716744899749756,
"learning_rate": 3.632478632478633e-07,
"loss": 0.0966,
"step": 2255
},
{
"epoch": 19.28205128205128,
"grad_norm": 2.4959864616394043,
"learning_rate": 3.5897435897435896e-07,
"loss": 0.0779,
"step": 2256
},
{
"epoch": 19.29059829059829,
"grad_norm": 3.625793218612671,
"learning_rate": 3.547008547008547e-07,
"loss": 0.3238,
"step": 2257
},
{
"epoch": 19.299145299145298,
"grad_norm": 1.8783844709396362,
"learning_rate": 3.504273504273505e-07,
"loss": 0.0319,
"step": 2258
},
{
"epoch": 19.307692307692307,
"grad_norm": 1.6740922927856445,
"learning_rate": 3.461538461538462e-07,
"loss": 0.0844,
"step": 2259
},
{
"epoch": 19.316239316239315,
"grad_norm": 2.8891098499298096,
"learning_rate": 3.4188034188034194e-07,
"loss": 0.1916,
"step": 2260
},
{
"epoch": 19.324786324786324,
"grad_norm": 0.9975456595420837,
"learning_rate": 3.376068376068376e-07,
"loss": 0.0266,
"step": 2261
},
{
"epoch": 19.333333333333332,
"grad_norm": 2.576789379119873,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0722,
"step": 2262
},
{
"epoch": 19.34188034188034,
"grad_norm": 9.070858001708984,
"learning_rate": 3.290598290598291e-07,
"loss": 0.2998,
"step": 2263
},
{
"epoch": 19.35042735042735,
"grad_norm": 3.052319049835205,
"learning_rate": 3.247863247863248e-07,
"loss": 0.0435,
"step": 2264
},
{
"epoch": 19.358974358974358,
"grad_norm": 0.8035821318626404,
"learning_rate": 3.205128205128205e-07,
"loss": 0.0233,
"step": 2265
},
{
"epoch": 19.367521367521366,
"grad_norm": 3.7658371925354004,
"learning_rate": 3.1623931623931623e-07,
"loss": 0.3007,
"step": 2266
},
{
"epoch": 19.376068376068375,
"grad_norm": 1.210494875907898,
"learning_rate": 3.11965811965812e-07,
"loss": 0.0344,
"step": 2267
},
{
"epoch": 19.384615384615383,
"grad_norm": 1.1121772527694702,
"learning_rate": 3.0769230769230774e-07,
"loss": 0.054,
"step": 2268
},
{
"epoch": 19.39316239316239,
"grad_norm": 2.842228412628174,
"learning_rate": 3.034188034188034e-07,
"loss": 0.0814,
"step": 2269
},
{
"epoch": 19.4017094017094,
"grad_norm": 1.9269556999206543,
"learning_rate": 2.991452991452992e-07,
"loss": 0.0354,
"step": 2270
},
{
"epoch": 19.41025641025641,
"grad_norm": 7.359715938568115,
"learning_rate": 2.948717948717949e-07,
"loss": 0.3288,
"step": 2271
},
{
"epoch": 19.418803418803417,
"grad_norm": 1.7621564865112305,
"learning_rate": 2.905982905982906e-07,
"loss": 0.0313,
"step": 2272
},
{
"epoch": 19.427350427350426,
"grad_norm": 2.5410284996032715,
"learning_rate": 2.8632478632478635e-07,
"loss": 0.076,
"step": 2273
},
{
"epoch": 19.435897435897434,
"grad_norm": 5.633874416351318,
"learning_rate": 2.820512820512821e-07,
"loss": 0.1903,
"step": 2274
},
{
"epoch": 19.444444444444443,
"grad_norm": 1.935703158378601,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.3778,
"step": 2275
},
{
"epoch": 19.45299145299145,
"grad_norm": 7.559366703033447,
"learning_rate": 2.7350427350427354e-07,
"loss": 0.2684,
"step": 2276
},
{
"epoch": 19.46153846153846,
"grad_norm": 9.240869522094727,
"learning_rate": 2.692307692307693e-07,
"loss": 0.2982,
"step": 2277
},
{
"epoch": 19.47008547008547,
"grad_norm": 6.940350532531738,
"learning_rate": 2.6495726495726495e-07,
"loss": 0.3131,
"step": 2278
},
{
"epoch": 19.478632478632477,
"grad_norm": 1.3201594352722168,
"learning_rate": 2.6068376068376074e-07,
"loss": 0.0191,
"step": 2279
},
{
"epoch": 19.487179487179485,
"grad_norm": 1.626806616783142,
"learning_rate": 2.564102564102564e-07,
"loss": 0.0361,
"step": 2280
},
{
"epoch": 19.495726495726494,
"grad_norm": 8.687582969665527,
"learning_rate": 2.5213675213675215e-07,
"loss": 0.1942,
"step": 2281
},
{
"epoch": 19.504273504273506,
"grad_norm": 5.104561805725098,
"learning_rate": 2.478632478632479e-07,
"loss": 0.1906,
"step": 2282
},
{
"epoch": 19.51282051282051,
"grad_norm": 2.8611207008361816,
"learning_rate": 2.435897435897436e-07,
"loss": 0.1258,
"step": 2283
},
{
"epoch": 19.521367521367523,
"grad_norm": 1.2258422374725342,
"learning_rate": 2.3931623931623934e-07,
"loss": 0.0186,
"step": 2284
},
{
"epoch": 19.52991452991453,
"grad_norm": 5.307450294494629,
"learning_rate": 2.3504273504273505e-07,
"loss": 0.1356,
"step": 2285
},
{
"epoch": 19.53846153846154,
"grad_norm": 2.0854647159576416,
"learning_rate": 2.307692307692308e-07,
"loss": 0.0533,
"step": 2286
},
{
"epoch": 19.54700854700855,
"grad_norm": 1.8560184240341187,
"learning_rate": 2.264957264957265e-07,
"loss": 0.048,
"step": 2287
},
{
"epoch": 19.555555555555557,
"grad_norm": 5.781933307647705,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.2769,
"step": 2288
},
{
"epoch": 19.564102564102566,
"grad_norm": 4.858759880065918,
"learning_rate": 2.1794871794871795e-07,
"loss": 0.4217,
"step": 2289
},
{
"epoch": 19.572649572649574,
"grad_norm": 3.7598235607147217,
"learning_rate": 2.136752136752137e-07,
"loss": 0.162,
"step": 2290
},
{
"epoch": 19.581196581196583,
"grad_norm": 0.5706556439399719,
"learning_rate": 2.094017094017094e-07,
"loss": 0.0151,
"step": 2291
},
{
"epoch": 19.58974358974359,
"grad_norm": 5.697900295257568,
"learning_rate": 2.0512820512820514e-07,
"loss": 0.1015,
"step": 2292
},
{
"epoch": 19.5982905982906,
"grad_norm": 4.635442733764648,
"learning_rate": 2.0085470085470088e-07,
"loss": 0.1827,
"step": 2293
},
{
"epoch": 19.60683760683761,
"grad_norm": 3.070131778717041,
"learning_rate": 1.9658119658119658e-07,
"loss": 0.0802,
"step": 2294
},
{
"epoch": 19.615384615384617,
"grad_norm": 0.979217529296875,
"learning_rate": 1.9230769230769234e-07,
"loss": 0.0237,
"step": 2295
},
{
"epoch": 19.623931623931625,
"grad_norm": 5.640648365020752,
"learning_rate": 1.8803418803418804e-07,
"loss": 0.0588,
"step": 2296
},
{
"epoch": 19.632478632478634,
"grad_norm": 7.1512861251831055,
"learning_rate": 1.8376068376068378e-07,
"loss": 0.1942,
"step": 2297
},
{
"epoch": 19.641025641025642,
"grad_norm": 12.868803024291992,
"learning_rate": 1.7948717948717948e-07,
"loss": 0.2771,
"step": 2298
},
{
"epoch": 19.64957264957265,
"grad_norm": 2.954000234603882,
"learning_rate": 1.7521367521367524e-07,
"loss": 0.1124,
"step": 2299
},
{
"epoch": 19.65811965811966,
"grad_norm": 0.47206825017929077,
"learning_rate": 1.7094017094017097e-07,
"loss": 0.0104,
"step": 2300
},
{
"epoch": 19.666666666666668,
"grad_norm": 0.6243001818656921,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.0145,
"step": 2301
},
{
"epoch": 19.675213675213676,
"grad_norm": 1.6680350303649902,
"learning_rate": 1.623931623931624e-07,
"loss": 0.0634,
"step": 2302
},
{
"epoch": 19.683760683760685,
"grad_norm": 6.298573017120361,
"learning_rate": 1.5811965811965811e-07,
"loss": 0.2083,
"step": 2303
},
{
"epoch": 19.692307692307693,
"grad_norm": 0.622466504573822,
"learning_rate": 1.5384615384615387e-07,
"loss": 0.0155,
"step": 2304
},
{
"epoch": 19.700854700854702,
"grad_norm": 2.289080858230591,
"learning_rate": 1.495726495726496e-07,
"loss": 0.0698,
"step": 2305
},
{
"epoch": 19.70940170940171,
"grad_norm": 13.065472602844238,
"learning_rate": 1.452991452991453e-07,
"loss": 0.2587,
"step": 2306
},
{
"epoch": 19.71794871794872,
"grad_norm": 0.903513491153717,
"learning_rate": 1.4102564102564104e-07,
"loss": 0.0222,
"step": 2307
},
{
"epoch": 19.726495726495727,
"grad_norm": 1.3763283491134644,
"learning_rate": 1.3675213675213677e-07,
"loss": 0.042,
"step": 2308
},
{
"epoch": 19.735042735042736,
"grad_norm": 3.3493802547454834,
"learning_rate": 1.3247863247863248e-07,
"loss": 0.1042,
"step": 2309
},
{
"epoch": 19.743589743589745,
"grad_norm": 12.862226486206055,
"learning_rate": 1.282051282051282e-07,
"loss": 0.359,
"step": 2310
},
{
"epoch": 19.752136752136753,
"grad_norm": 5.56069278717041,
"learning_rate": 1.2393162393162394e-07,
"loss": 0.1645,
"step": 2311
},
{
"epoch": 19.76068376068376,
"grad_norm": 2.900381326675415,
"learning_rate": 1.1965811965811967e-07,
"loss": 0.1641,
"step": 2312
},
{
"epoch": 19.76923076923077,
"grad_norm": 1.3674333095550537,
"learning_rate": 1.153846153846154e-07,
"loss": 0.0428,
"step": 2313
},
{
"epoch": 19.77777777777778,
"grad_norm": 2.06278657913208,
"learning_rate": 1.1111111111111112e-07,
"loss": 0.0404,
"step": 2314
},
{
"epoch": 19.786324786324787,
"grad_norm": 5.760499954223633,
"learning_rate": 1.0683760683760685e-07,
"loss": 0.1298,
"step": 2315
},
{
"epoch": 19.794871794871796,
"grad_norm": 3.2554516792297363,
"learning_rate": 1.0256410256410257e-07,
"loss": 0.0432,
"step": 2316
},
{
"epoch": 19.803418803418804,
"grad_norm": 1.7984355688095093,
"learning_rate": 9.829059829059829e-08,
"loss": 0.0461,
"step": 2317
},
{
"epoch": 19.811965811965813,
"grad_norm": 1.633736491203308,
"learning_rate": 9.401709401709402e-08,
"loss": 0.0746,
"step": 2318
},
{
"epoch": 19.82051282051282,
"grad_norm": 2.6958866119384766,
"learning_rate": 8.974358974358974e-08,
"loss": 0.0852,
"step": 2319
},
{
"epoch": 19.82905982905983,
"grad_norm": 0.9744161367416382,
"learning_rate": 8.547008547008549e-08,
"loss": 0.0368,
"step": 2320
},
{
"epoch": 19.837606837606838,
"grad_norm": 1.2404037714004517,
"learning_rate": 8.11965811965812e-08,
"loss": 0.0547,
"step": 2321
},
{
"epoch": 19.846153846153847,
"grad_norm": 1.6044564247131348,
"learning_rate": 7.692307692307694e-08,
"loss": 0.0441,
"step": 2322
},
{
"epoch": 19.854700854700855,
"grad_norm": 0.47167596220970154,
"learning_rate": 7.264957264957265e-08,
"loss": 0.0099,
"step": 2323
},
{
"epoch": 19.863247863247864,
"grad_norm": 1.6729376316070557,
"learning_rate": 6.837606837606839e-08,
"loss": 0.0258,
"step": 2324
},
{
"epoch": 19.871794871794872,
"grad_norm": 0.5823857188224792,
"learning_rate": 6.41025641025641e-08,
"loss": 0.0131,
"step": 2325
},
{
"epoch": 19.88034188034188,
"grad_norm": 4.055545806884766,
"learning_rate": 5.982905982905984e-08,
"loss": 0.073,
"step": 2326
},
{
"epoch": 19.88888888888889,
"grad_norm": 2.693838596343994,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0845,
"step": 2327
},
{
"epoch": 19.897435897435898,
"grad_norm": 0.9895898103713989,
"learning_rate": 5.1282051282051286e-08,
"loss": 0.0205,
"step": 2328
},
{
"epoch": 19.905982905982906,
"grad_norm": 3.560816526412964,
"learning_rate": 4.700854700854701e-08,
"loss": 0.0989,
"step": 2329
},
{
"epoch": 19.914529914529915,
"grad_norm": 5.152528762817383,
"learning_rate": 4.273504273504274e-08,
"loss": 0.0133,
"step": 2330
},
{
"epoch": 19.923076923076923,
"grad_norm": 1.709021806716919,
"learning_rate": 3.846153846153847e-08,
"loss": 0.068,
"step": 2331
},
{
"epoch": 19.931623931623932,
"grad_norm": 0.4786951541900635,
"learning_rate": 3.418803418803419e-08,
"loss": 0.0141,
"step": 2332
},
{
"epoch": 19.94017094017094,
"grad_norm": 1.5413727760314941,
"learning_rate": 2.991452991452992e-08,
"loss": 0.0246,
"step": 2333
},
{
"epoch": 19.94871794871795,
"grad_norm": 1.019601583480835,
"learning_rate": 2.5641025641025643e-08,
"loss": 0.0199,
"step": 2334
},
{
"epoch": 19.957264957264957,
"grad_norm": 1.6115524768829346,
"learning_rate": 2.136752136752137e-08,
"loss": 0.0752,
"step": 2335
},
{
"epoch": 19.965811965811966,
"grad_norm": 2.381624698638916,
"learning_rate": 1.7094017094017096e-08,
"loss": 0.0609,
"step": 2336
},
{
"epoch": 19.974358974358974,
"grad_norm": 1.688704013824463,
"learning_rate": 1.2820512820512822e-08,
"loss": 0.0419,
"step": 2337
},
{
"epoch": 19.982905982905983,
"grad_norm": 1.643002986907959,
"learning_rate": 8.547008547008548e-09,
"loss": 0.0456,
"step": 2338
},
{
"epoch": 19.99145299145299,
"grad_norm": 3.5371882915496826,
"learning_rate": 4.273504273504274e-09,
"loss": 0.0392,
"step": 2339
},
{
"epoch": 20.0,
"grad_norm": 4.692568302154541,
"learning_rate": 0.0,
"loss": 0.1751,
"step": 2340
},
{
"epoch": 20.0,
"eval_loss": 0.051427390426397324,
"eval_runtime": 9.301,
"eval_samples_per_second": 50.102,
"eval_steps_per_second": 6.343,
"step": 2340
}
],
"logging_steps": 1,
"max_steps": 2340,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 560912565657600.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}