{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.987012987012987,
"eval_steps": 500,
"global_step": 360,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013852813852813853,
"grad_norm": 10.690503047217376,
"learning_rate": 2.222222222222222e-06,
"loss": 1.664,
"step": 1
},
{
"epoch": 0.027705627705627706,
"grad_norm": 10.549038500876918,
"learning_rate": 4.444444444444444e-06,
"loss": 1.6687,
"step": 2
},
{
"epoch": 0.04155844155844156,
"grad_norm": 9.952372502868275,
"learning_rate": 6.666666666666667e-06,
"loss": 1.6436,
"step": 3
},
{
"epoch": 0.05541125541125541,
"grad_norm": 7.525381680312214,
"learning_rate": 8.888888888888888e-06,
"loss": 1.5751,
"step": 4
},
{
"epoch": 0.06926406926406926,
"grad_norm": 3.6488707097222806,
"learning_rate": 1.1111111111111113e-05,
"loss": 1.4732,
"step": 5
},
{
"epoch": 0.08311688311688312,
"grad_norm": 5.9440833747387405,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.4929,
"step": 6
},
{
"epoch": 0.09696969696969697,
"grad_norm": 7.008224469434576,
"learning_rate": 1.555555555555556e-05,
"loss": 1.4342,
"step": 7
},
{
"epoch": 0.11082251082251082,
"grad_norm": 9.129791969259458,
"learning_rate": 1.7777777777777777e-05,
"loss": 1.4508,
"step": 8
},
{
"epoch": 0.12467532467532468,
"grad_norm": 7.157661170613076,
"learning_rate": 2e-05,
"loss": 1.3993,
"step": 9
},
{
"epoch": 0.13852813852813853,
"grad_norm": 5.878397281654449,
"learning_rate": 2.2222222222222227e-05,
"loss": 1.3716,
"step": 10
},
{
"epoch": 0.1523809523809524,
"grad_norm": 4.04814199716087,
"learning_rate": 2.444444444444445e-05,
"loss": 1.3279,
"step": 11
},
{
"epoch": 0.16623376623376623,
"grad_norm": 4.367325147342624,
"learning_rate": 2.6666666666666667e-05,
"loss": 1.2918,
"step": 12
},
{
"epoch": 0.1800865800865801,
"grad_norm": 3.013051181093589,
"learning_rate": 2.888888888888889e-05,
"loss": 1.2683,
"step": 13
},
{
"epoch": 0.19393939393939394,
"grad_norm": 2.7017616202077597,
"learning_rate": 3.111111111111112e-05,
"loss": 1.2741,
"step": 14
},
{
"epoch": 0.2077922077922078,
"grad_norm": 2.4447347796035936,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.2498,
"step": 15
},
{
"epoch": 0.22164502164502164,
"grad_norm": 2.3013073090511016,
"learning_rate": 3.555555555555555e-05,
"loss": 1.2356,
"step": 16
},
{
"epoch": 0.2354978354978355,
"grad_norm": 2.676331737240606,
"learning_rate": 3.777777777777778e-05,
"loss": 1.2226,
"step": 17
},
{
"epoch": 0.24935064935064935,
"grad_norm": 1.8653678395700215,
"learning_rate": 4e-05,
"loss": 1.1883,
"step": 18
},
{
"epoch": 0.2632034632034632,
"grad_norm": 2.489502341694411,
"learning_rate": 4.222222222222223e-05,
"loss": 1.1903,
"step": 19
},
{
"epoch": 0.27705627705627706,
"grad_norm": 2.2381168497877746,
"learning_rate": 4.444444444444445e-05,
"loss": 1.1823,
"step": 20
},
{
"epoch": 0.2909090909090909,
"grad_norm": 1.0658561341621282,
"learning_rate": 4.666666666666667e-05,
"loss": 1.1644,
"step": 21
},
{
"epoch": 0.3047619047619048,
"grad_norm": 3.3353632520282024,
"learning_rate": 4.88888888888889e-05,
"loss": 1.1866,
"step": 22
},
{
"epoch": 0.31861471861471863,
"grad_norm": 2.0828413940584256,
"learning_rate": 5.111111111111111e-05,
"loss": 1.1606,
"step": 23
},
{
"epoch": 0.33246753246753247,
"grad_norm": 2.0722285174850334,
"learning_rate": 5.333333333333333e-05,
"loss": 1.1689,
"step": 24
},
{
"epoch": 0.3463203463203463,
"grad_norm": 2.6579102865439035,
"learning_rate": 5.555555555555556e-05,
"loss": 1.1555,
"step": 25
},
{
"epoch": 0.3601731601731602,
"grad_norm": 1.9616156182284334,
"learning_rate": 5.777777777777778e-05,
"loss": 1.1683,
"step": 26
},
{
"epoch": 0.37402597402597404,
"grad_norm": 3.2895161663522225,
"learning_rate": 6.000000000000001e-05,
"loss": 1.162,
"step": 27
},
{
"epoch": 0.3878787878787879,
"grad_norm": 2.2524763564895447,
"learning_rate": 6.222222222222223e-05,
"loss": 1.1588,
"step": 28
},
{
"epoch": 0.4017316017316017,
"grad_norm": 2.9587565231476036,
"learning_rate": 6.444444444444446e-05,
"loss": 1.1477,
"step": 29
},
{
"epoch": 0.4155844155844156,
"grad_norm": 2.0001168739095387,
"learning_rate": 6.666666666666667e-05,
"loss": 1.1463,
"step": 30
},
{
"epoch": 0.42943722943722945,
"grad_norm": 3.0781839410346756,
"learning_rate": 6.88888888888889e-05,
"loss": 1.1273,
"step": 31
},
{
"epoch": 0.4432900432900433,
"grad_norm": 2.155490334097704,
"learning_rate": 7.11111111111111e-05,
"loss": 1.1468,
"step": 32
},
{
"epoch": 0.45714285714285713,
"grad_norm": 2.3875247457053566,
"learning_rate": 7.333333333333333e-05,
"loss": 1.1379,
"step": 33
},
{
"epoch": 0.470995670995671,
"grad_norm": 1.71586428053475,
"learning_rate": 7.555555555555556e-05,
"loss": 1.1309,
"step": 34
},
{
"epoch": 0.48484848484848486,
"grad_norm": 2.6858291279872,
"learning_rate": 7.777777777777778e-05,
"loss": 1.1318,
"step": 35
},
{
"epoch": 0.4987012987012987,
"grad_norm": 1.997759995167864,
"learning_rate": 8e-05,
"loss": 1.1323,
"step": 36
},
{
"epoch": 0.5125541125541125,
"grad_norm": 2.629649063991005,
"learning_rate": 7.999811966028904e-05,
"loss": 1.1398,
"step": 37
},
{
"epoch": 0.5264069264069264,
"grad_norm": 2.6927398202491544,
"learning_rate": 7.999247881794007e-05,
"loss": 1.1272,
"step": 38
},
{
"epoch": 0.5402597402597402,
"grad_norm": 1.0260444389642347,
"learning_rate": 7.998307800328803e-05,
"loss": 1.1148,
"step": 39
},
{
"epoch": 0.5541125541125541,
"grad_norm": 3.1260836757156496,
"learning_rate": 7.996991810016922e-05,
"loss": 1.1581,
"step": 40
},
{
"epoch": 0.567965367965368,
"grad_norm": 2.408162449515958,
"learning_rate": 7.995300034583802e-05,
"loss": 1.1579,
"step": 41
},
{
"epoch": 0.5818181818181818,
"grad_norm": 1.7233621870783713,
"learning_rate": 7.993232633085074e-05,
"loss": 1.1154,
"step": 42
},
{
"epoch": 0.5956709956709957,
"grad_norm": 3.2143011392314524,
"learning_rate": 7.990789799891592e-05,
"loss": 1.1361,
"step": 43
},
{
"epoch": 0.6095238095238096,
"grad_norm": 2.541057275107033,
"learning_rate": 7.987971764671168e-05,
"loss": 1.1437,
"step": 44
},
{
"epoch": 0.6233766233766234,
"grad_norm": 2.554077948353239,
"learning_rate": 7.984778792366983e-05,
"loss": 1.1278,
"step": 45
},
{
"epoch": 0.6372294372294373,
"grad_norm": 1.9556507030666455,
"learning_rate": 7.981211183172663e-05,
"loss": 1.125,
"step": 46
},
{
"epoch": 0.651082251082251,
"grad_norm": 2.4591106418916024,
"learning_rate": 7.977269272504075e-05,
"loss": 1.1113,
"step": 47
},
{
"epoch": 0.6649350649350649,
"grad_norm": 1.7374508763969678,
"learning_rate": 7.972953430967773e-05,
"loss": 1.1119,
"step": 48
},
{
"epoch": 0.6787878787878788,
"grad_norm": 2.271122042411741,
"learning_rate": 7.96826406432617e-05,
"loss": 1.1047,
"step": 49
},
{
"epoch": 0.6926406926406926,
"grad_norm": 1.385329225067948,
"learning_rate": 7.963201613459381e-05,
"loss": 1.1104,
"step": 50
},
{
"epoch": 0.7064935064935065,
"grad_norm": 2.0797667060906853,
"learning_rate": 7.957766554323778e-05,
"loss": 1.1008,
"step": 51
},
{
"epoch": 0.7203463203463204,
"grad_norm": 1.4769275764871517,
"learning_rate": 7.951959397907237e-05,
"loss": 1.1063,
"step": 52
},
{
"epoch": 0.7341991341991342,
"grad_norm": 1.5969040026842134,
"learning_rate": 7.945780690181096e-05,
"loss": 1.0958,
"step": 53
},
{
"epoch": 0.7480519480519481,
"grad_norm": 1.5076777523334957,
"learning_rate": 7.939231012048833e-05,
"loss": 1.1038,
"step": 54
},
{
"epoch": 0.7619047619047619,
"grad_norm": 1.5353741235556218,
"learning_rate": 7.932310979291441e-05,
"loss": 1.088,
"step": 55
},
{
"epoch": 0.7757575757575758,
"grad_norm": 1.6688683700597435,
"learning_rate": 7.925021242509539e-05,
"loss": 1.1005,
"step": 56
},
{
"epoch": 0.7896103896103897,
"grad_norm": 1.5907176050250653,
"learning_rate": 7.917362487062207e-05,
"loss": 1.0885,
"step": 57
},
{
"epoch": 0.8034632034632034,
"grad_norm": 1.5886283739500444,
"learning_rate": 7.909335433002543e-05,
"loss": 1.0889,
"step": 58
},
{
"epoch": 0.8173160173160173,
"grad_norm": 1.1345065452265992,
"learning_rate": 7.900940835009974e-05,
"loss": 1.0809,
"step": 59
},
{
"epoch": 0.8311688311688312,
"grad_norm": 1.6727620200346303,
"learning_rate": 7.892179482319297e-05,
"loss": 1.0844,
"step": 60
},
{
"epoch": 0.845021645021645,
"grad_norm": 1.726654683160669,
"learning_rate": 7.883052198646481e-05,
"loss": 1.0868,
"step": 61
},
{
"epoch": 0.8588744588744589,
"grad_norm": 0.7828989407478679,
"learning_rate": 7.873559842111225e-05,
"loss": 1.0711,
"step": 62
},
{
"epoch": 0.8727272727272727,
"grad_norm": 1.3882694170960725,
"learning_rate": 7.863703305156273e-05,
"loss": 1.0752,
"step": 63
},
{
"epoch": 0.8865800865800866,
"grad_norm": 1.5779873659792967,
"learning_rate": 7.853483514463521e-05,
"loss": 1.0766,
"step": 64
},
{
"epoch": 0.9004329004329005,
"grad_norm": 1.4180034460400448,
"learning_rate": 7.842901430866882e-05,
"loss": 1.0725,
"step": 65
},
{
"epoch": 0.9142857142857143,
"grad_norm": 0.9127219395084748,
"learning_rate": 7.831958049261956e-05,
"loss": 1.0612,
"step": 66
},
{
"epoch": 0.9281385281385282,
"grad_norm": 1.0847846746337275,
"learning_rate": 7.820654398512492e-05,
"loss": 1.074,
"step": 67
},
{
"epoch": 0.941991341991342,
"grad_norm": 1.8013647852774308,
"learning_rate": 7.808991541353662e-05,
"loss": 1.0954,
"step": 68
},
{
"epoch": 0.9558441558441558,
"grad_norm": 1.377128616335908,
"learning_rate": 7.796970574292136e-05,
"loss": 1.0752,
"step": 69
},
{
"epoch": 0.9696969696969697,
"grad_norm": 1.6958522149590192,
"learning_rate": 7.784592627503004e-05,
"loss": 1.0821,
"step": 70
},
{
"epoch": 0.9835497835497835,
"grad_norm": 1.0049024746726356,
"learning_rate": 7.771858864723504e-05,
"loss": 1.068,
"step": 71
},
{
"epoch": 0.9974025974025974,
"grad_norm": 2.6484071234844953,
"learning_rate": 7.758770483143634e-05,
"loss": 1.0771,
"step": 72
},
{
"epoch": 1.0112554112554113,
"grad_norm": 4.246067022400895,
"learning_rate": 7.745328713293573e-05,
"loss": 1.948,
"step": 73
},
{
"epoch": 1.025108225108225,
"grad_norm": 1.7220828208048158,
"learning_rate": 7.731534818928004e-05,
"loss": 1.0427,
"step": 74
},
{
"epoch": 1.0389610389610389,
"grad_norm": 1.8447923963725428,
"learning_rate": 7.71739009690729e-05,
"loss": 1.0479,
"step": 75
},
{
"epoch": 1.0528138528138529,
"grad_norm": 0.9341938628888585,
"learning_rate": 7.702895877075563e-05,
"loss": 1.0333,
"step": 76
},
{
"epoch": 1.0666666666666667,
"grad_norm": 2.424773237088678,
"learning_rate": 7.688053522135675e-05,
"loss": 1.0579,
"step": 77
},
{
"epoch": 1.0805194805194804,
"grad_norm": 1.6058600540175567,
"learning_rate": 7.672864427521097e-05,
"loss": 1.0636,
"step": 78
},
{
"epoch": 1.0943722943722944,
"grad_norm": 2.091045151793165,
"learning_rate": 7.657330021264718e-05,
"loss": 1.0442,
"step": 79
},
{
"epoch": 1.1082251082251082,
"grad_norm": 1.318962052033536,
"learning_rate": 7.641451763864587e-05,
"loss": 1.045,
"step": 80
},
{
"epoch": 1.122077922077922,
"grad_norm": 2.317561720529343,
"learning_rate": 7.625231148146601e-05,
"loss": 1.0484,
"step": 81
},
{
"epoch": 1.135930735930736,
"grad_norm": 1.4987484149413424,
"learning_rate": 7.608669699124153e-05,
"loss": 1.0484,
"step": 82
},
{
"epoch": 1.1497835497835498,
"grad_norm": 2.3968225100015816,
"learning_rate": 7.591768973854753e-05,
"loss": 1.0453,
"step": 83
},
{
"epoch": 1.1636363636363636,
"grad_norm": 2.0769969941809454,
"learning_rate": 7.57453056129365e-05,
"loss": 1.0473,
"step": 84
},
{
"epoch": 1.1774891774891776,
"grad_norm": 1.5328425512954666,
"learning_rate": 7.556956082144425e-05,
"loss": 1.0432,
"step": 85
},
{
"epoch": 1.1913419913419914,
"grad_norm": 1.5329379349699184,
"learning_rate": 7.539047188706631e-05,
"loss": 1.0502,
"step": 86
},
{
"epoch": 1.2051948051948052,
"grad_norm": 1.2635424997786673,
"learning_rate": 7.520805564720444e-05,
"loss": 1.0389,
"step": 87
},
{
"epoch": 1.2190476190476192,
"grad_norm": 0.9180899722416639,
"learning_rate": 7.502232925208365e-05,
"loss": 1.0297,
"step": 88
},
{
"epoch": 1.232900432900433,
"grad_norm": 0.9088421536152287,
"learning_rate": 7.483331016313969e-05,
"loss": 1.026,
"step": 89
},
{
"epoch": 1.2467532467532467,
"grad_norm": 0.9759584263195824,
"learning_rate": 7.464101615137756e-05,
"loss": 1.042,
"step": 90
},
{
"epoch": 1.2606060606060607,
"grad_norm": 1.7816477052359974,
"learning_rate": 7.444546529570055e-05,
"loss": 1.0375,
"step": 91
},
{
"epoch": 1.2744588744588745,
"grad_norm": 1.0505006199756568,
"learning_rate": 7.424667598121067e-05,
"loss": 1.0232,
"step": 92
},
{
"epoch": 1.2883116883116883,
"grad_norm": 1.1076363899720796,
"learning_rate": 7.404466689747999e-05,
"loss": 1.0358,
"step": 93
},
{
"epoch": 1.3021645021645023,
"grad_norm": 1.766746417129588,
"learning_rate": 7.383945703679365e-05,
"loss": 1.041,
"step": 94
},
{
"epoch": 1.316017316017316,
"grad_norm": 1.1727210609875833,
"learning_rate": 7.363106569236413e-05,
"loss": 1.0373,
"step": 95
},
{
"epoch": 1.3298701298701299,
"grad_norm": 1.3811377730593195,
"learning_rate": 7.341951245651747e-05,
"loss": 1.0232,
"step": 96
},
{
"epoch": 1.3437229437229437,
"grad_norm": 1.8848088994220173,
"learning_rate": 7.320481721885116e-05,
"loss": 1.0331,
"step": 97
},
{
"epoch": 1.3575757575757577,
"grad_norm": 1.5407669706222948,
"learning_rate": 7.298700016436427e-05,
"loss": 1.0392,
"step": 98
},
{
"epoch": 1.3714285714285714,
"grad_norm": 1.6439258533934764,
"learning_rate": 7.276608177155968e-05,
"loss": 1.0302,
"step": 99
},
{
"epoch": 1.3852813852813852,
"grad_norm": 1.6555083210158104,
"learning_rate": 7.254208281051871e-05,
"loss": 1.0359,
"step": 100
},
{
"epoch": 1.399134199134199,
"grad_norm": 1.2444215446875204,
"learning_rate": 7.231502434094845e-05,
"loss": 1.0203,
"step": 101
},
{
"epoch": 1.412987012987013,
"grad_norm": 1.4648122676877777,
"learning_rate": 7.208492771020176e-05,
"loss": 1.0198,
"step": 102
},
{
"epoch": 1.4268398268398268,
"grad_norm": 0.9173692823505156,
"learning_rate": 7.185181455127023e-05,
"loss": 1.0217,
"step": 103
},
{
"epoch": 1.4406926406926406,
"grad_norm": 1.1009749853774418,
"learning_rate": 7.161570678075038e-05,
"loss": 1.0128,
"step": 104
},
{
"epoch": 1.4545454545454546,
"grad_norm": 1.0933932370696173,
"learning_rate": 7.137662659678303e-05,
"loss": 1.0238,
"step": 105
},
{
"epoch": 1.4683982683982684,
"grad_norm": 1.1757437604660779,
"learning_rate": 7.113459647696641e-05,
"loss": 1.0182,
"step": 106
},
{
"epoch": 1.4822510822510822,
"grad_norm": 0.7527900271083177,
"learning_rate": 7.088963917624277e-05,
"loss": 1.012,
"step": 107
},
{
"epoch": 1.4961038961038962,
"grad_norm": 1.1702807594476543,
"learning_rate": 7.064177772475912e-05,
"loss": 1.0264,
"step": 108
},
{
"epoch": 1.50995670995671,
"grad_norm": 0.6981814585755302,
"learning_rate": 7.039103542570199e-05,
"loss": 1.0151,
"step": 109
},
{
"epoch": 1.5238095238095237,
"grad_norm": 1.1192032445094018,
"learning_rate": 7.013743585310642e-05,
"loss": 1.0162,
"step": 110
},
{
"epoch": 1.5376623376623377,
"grad_norm": 1.0770568024481744,
"learning_rate": 6.988100284963985e-05,
"loss": 1.0199,
"step": 111
},
{
"epoch": 1.5515151515151515,
"grad_norm": 1.2005325967972154,
"learning_rate": 6.96217605243602e-05,
"loss": 1.0242,
"step": 112
},
{
"epoch": 1.5653679653679653,
"grad_norm": 0.7699858239179544,
"learning_rate": 6.935973325044941e-05,
"loss": 1.0241,
"step": 113
},
{
"epoch": 1.5792207792207793,
"grad_norm": 1.1064626845196381,
"learning_rate": 6.909494566292195e-05,
"loss": 1.0082,
"step": 114
},
{
"epoch": 1.593073593073593,
"grad_norm": 1.4162206055932687,
"learning_rate": 6.882742265630859e-05,
"loss": 1.0161,
"step": 115
},
{
"epoch": 1.6069264069264069,
"grad_norm": 0.9857373401383442,
"learning_rate": 6.855718938231597e-05,
"loss": 1.0223,
"step": 116
},
{
"epoch": 1.6207792207792209,
"grad_norm": 1.4328471449116547,
"learning_rate": 6.828427124746191e-05,
"loss": 1.0059,
"step": 117
},
{
"epoch": 1.6346320346320347,
"grad_norm": 0.929598786782075,
"learning_rate": 6.800869391068674e-05,
"loss": 1.0161,
"step": 118
},
{
"epoch": 1.6484848484848484,
"grad_norm": 1.5271277070860276,
"learning_rate": 6.773048328094097e-05,
"loss": 1.0109,
"step": 119
},
{
"epoch": 1.6623376623376624,
"grad_norm": 0.7369342923177392,
"learning_rate": 6.744966551474936e-05,
"loss": 1.0187,
"step": 120
},
{
"epoch": 1.6761904761904762,
"grad_norm": 1.1411511227164497,
"learning_rate": 6.716626701375174e-05,
"loss": 1.0131,
"step": 121
},
{
"epoch": 1.69004329004329,
"grad_norm": 1.2904195611318852,
"learning_rate": 6.688031442222091e-05,
"loss": 1.0084,
"step": 122
},
{
"epoch": 1.703896103896104,
"grad_norm": 0.5757097623806057,
"learning_rate": 6.659183462455751e-05,
"loss": 1.0095,
"step": 123
},
{
"epoch": 1.7177489177489178,
"grad_norm": 0.9291802416250161,
"learning_rate": 6.630085474276256e-05,
"loss": 1.0117,
"step": 124
},
{
"epoch": 1.7316017316017316,
"grad_norm": 1.0033464839111939,
"learning_rate": 6.600740213388735e-05,
"loss": 1.0055,
"step": 125
},
{
"epoch": 1.7454545454545456,
"grad_norm": 1.0577865447630987,
"learning_rate": 6.571150438746157e-05,
"loss": 0.9998,
"step": 126
},
{
"epoch": 1.7593073593073592,
"grad_norm": 0.9644457639091424,
"learning_rate": 6.54131893228994e-05,
"loss": 1.003,
"step": 127
},
{
"epoch": 1.7731601731601732,
"grad_norm": 0.80334378142282,
"learning_rate": 6.511248498688396e-05,
"loss": 1.0044,
"step": 128
},
{
"epoch": 1.7870129870129872,
"grad_norm": 0.823547694775696,
"learning_rate": 6.480941965073041e-05,
"loss": 1.0109,
"step": 129
},
{
"epoch": 1.8008658008658007,
"grad_norm": 0.7273863270792912,
"learning_rate": 6.450402180772811e-05,
"loss": 1.0017,
"step": 130
},
{
"epoch": 1.8147186147186147,
"grad_norm": 0.762963999004941,
"learning_rate": 6.419632017046167e-05,
"loss": 1.0018,
"step": 131
},
{
"epoch": 1.8285714285714287,
"grad_norm": 0.8148201089426899,
"learning_rate": 6.388634366811146e-05,
"loss": 0.9993,
"step": 132
},
{
"epoch": 1.8424242424242423,
"grad_norm": 0.8416363889161061,
"learning_rate": 6.35741214437338e-05,
"loss": 1.0095,
"step": 133
},
{
"epoch": 1.8562770562770563,
"grad_norm": 1.142390867021583,
"learning_rate": 6.325968285152107e-05,
"loss": 1.0062,
"step": 134
},
{
"epoch": 1.87012987012987,
"grad_norm": 0.7962536559784616,
"learning_rate": 6.294305745404185e-05,
"loss": 1.0052,
"step": 135
},
{
"epoch": 1.8839826839826839,
"grad_norm": 0.5650336880636371,
"learning_rate": 6.262427501946155e-05,
"loss": 1.0067,
"step": 136
},
{
"epoch": 1.8978354978354979,
"grad_norm": 0.5818038902731943,
"learning_rate": 6.230336551874372e-05,
"loss": 1.0063,
"step": 137
},
{
"epoch": 1.9116883116883117,
"grad_norm": 0.9977727916003996,
"learning_rate": 6.198035912283225e-05,
"loss": 1.0011,
"step": 138
},
{
"epoch": 1.9255411255411254,
"grad_norm": 0.9993134068472553,
"learning_rate": 6.165528619981479e-05,
"loss": 0.9934,
"step": 139
},
{
"epoch": 1.9393939393939394,
"grad_norm": 0.6309774026937955,
"learning_rate": 6.132817731206766e-05,
"loss": 1.0023,
"step": 140
},
{
"epoch": 1.9532467532467532,
"grad_norm": 0.5631788726393073,
"learning_rate": 6.099906321338241e-05,
"loss": 0.9875,
"step": 141
},
{
"epoch": 1.967099567099567,
"grad_norm": 0.6994904876843244,
"learning_rate": 6.0667974846074524e-05,
"loss": 0.9969,
"step": 142
},
{
"epoch": 1.980952380952381,
"grad_norm": 0.6611818685825782,
"learning_rate": 6.033494333807422e-05,
"loss": 1.0052,
"step": 143
},
{
"epoch": 1.9948051948051948,
"grad_norm": 0.5004771960590909,
"learning_rate": 6.000000000000001e-05,
"loss": 0.9857,
"step": 144
},
{
"epoch": 2.0086580086580086,
"grad_norm": 1.0514858543746186,
"learning_rate": 5.9663176322214826e-05,
"loss": 1.8002,
"step": 145
},
{
"epoch": 2.0225108225108226,
"grad_norm": 1.5590490021626622,
"learning_rate": 5.9324503971865545e-05,
"loss": 0.9591,
"step": 146
},
{
"epoch": 2.036363636363636,
"grad_norm": 0.613252686965761,
"learning_rate": 5.8984014789905625e-05,
"loss": 0.9555,
"step": 147
},
{
"epoch": 2.05021645021645,
"grad_norm": 1.5183857859367584,
"learning_rate": 5.8641740788101566e-05,
"loss": 0.9637,
"step": 148
},
{
"epoch": 2.064069264069264,
"grad_norm": 0.599406946967003,
"learning_rate": 5.8297714146023236e-05,
"loss": 0.9396,
"step": 149
},
{
"epoch": 2.0779220779220777,
"grad_norm": 1.171195638149606,
"learning_rate": 5.79519672080185e-05,
"loss": 0.9523,
"step": 150
},
{
"epoch": 2.0917748917748917,
"grad_norm": 0.6714077570634802,
"learning_rate": 5.76045324801722e-05,
"loss": 0.9595,
"step": 151
},
{
"epoch": 2.1056277056277057,
"grad_norm": 1.2318697934613918,
"learning_rate": 5.7255442627250146e-05,
"loss": 0.9514,
"step": 152
},
{
"epoch": 2.1194805194805193,
"grad_norm": 0.746989496141657,
"learning_rate": 5.6904730469627985e-05,
"loss": 0.9482,
"step": 153
},
{
"epoch": 2.1333333333333333,
"grad_norm": 0.901261215101538,
"learning_rate": 5.6552428980205575e-05,
"loss": 0.9587,
"step": 154
},
{
"epoch": 2.1471861471861473,
"grad_norm": 0.674529916922478,
"learning_rate": 5.619857128130695e-05,
"loss": 0.9562,
"step": 155
},
{
"epoch": 2.161038961038961,
"grad_norm": 0.8844375890562896,
"learning_rate": 5.584319064156628e-05,
"loss": 0.9459,
"step": 156
},
{
"epoch": 2.174891774891775,
"grad_norm": 0.5176842951829833,
"learning_rate": 5.548632047280003e-05,
"loss": 0.9528,
"step": 157
},
{
"epoch": 2.188744588744589,
"grad_norm": 0.6248120662111469,
"learning_rate": 5.5127994326865706e-05,
"loss": 0.9482,
"step": 158
},
{
"epoch": 2.2025974025974024,
"grad_norm": 0.6093640758186603,
"learning_rate": 5.476824589250738e-05,
"loss": 0.9429,
"step": 159
},
{
"epoch": 2.2164502164502164,
"grad_norm": 0.5492958980107647,
"learning_rate": 5.440710899218842e-05,
"loss": 0.9674,
"step": 160
},
{
"epoch": 2.2303030303030305,
"grad_norm": 0.5903789766574798,
"learning_rate": 5.404461757891156e-05,
"loss": 0.9667,
"step": 161
},
{
"epoch": 2.244155844155844,
"grad_norm": 0.5486871479315714,
"learning_rate": 5.368080573302676e-05,
"loss": 0.9478,
"step": 162
},
{
"epoch": 2.258008658008658,
"grad_norm": 0.45428134417688254,
"learning_rate": 5.331570765902706e-05,
"loss": 0.9409,
"step": 163
},
{
"epoch": 2.271861471861472,
"grad_norm": 0.42847012632012216,
"learning_rate": 5.294935768233285e-05,
"loss": 0.9416,
"step": 164
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.4848698252601225,
"learning_rate": 5.258179024606455e-05,
"loss": 0.9463,
"step": 165
},
{
"epoch": 2.2995670995670996,
"grad_norm": 0.3534788789389581,
"learning_rate": 5.2213039907804535e-05,
"loss": 0.9491,
"step": 166
},
{
"epoch": 2.3134199134199136,
"grad_norm": 0.5082308518432114,
"learning_rate": 5.1843141336348e-05,
"loss": 0.95,
"step": 167
},
{
"epoch": 2.327272727272727,
"grad_norm": 0.33208032748656197,
"learning_rate": 5.1472129308443616e-05,
"loss": 0.953,
"step": 168
},
{
"epoch": 2.341125541125541,
"grad_norm": 0.35843782187780426,
"learning_rate": 5.1100038705523834e-05,
"loss": 0.957,
"step": 169
},
{
"epoch": 2.354978354978355,
"grad_norm": 0.33243645634228375,
"learning_rate": 5.07269045104255e-05,
"loss": 0.9348,
"step": 170
},
{
"epoch": 2.3688311688311687,
"grad_norm": 0.37544932004082693,
"learning_rate": 5.0352761804100835e-05,
"loss": 0.9501,
"step": 171
},
{
"epoch": 2.3826839826839827,
"grad_norm": 0.3396549463565156,
"learning_rate": 4.9977645762319255e-05,
"loss": 0.9548,
"step": 172
},
{
"epoch": 2.3965367965367967,
"grad_norm": 0.27413219762637864,
"learning_rate": 4.9601591652360244e-05,
"loss": 0.9516,
"step": 173
},
{
"epoch": 2.4103896103896103,
"grad_norm": 0.2935194857656813,
"learning_rate": 4.922463482969761e-05,
"loss": 0.9537,
"step": 174
},
{
"epoch": 2.4242424242424243,
"grad_norm": 0.31679378581933954,
"learning_rate": 4.884681073467551e-05,
"loss": 0.9566,
"step": 175
},
{
"epoch": 2.4380952380952383,
"grad_norm": 0.2917510642085385,
"learning_rate": 4.846815488917644e-05,
"loss": 0.9602,
"step": 176
},
{
"epoch": 2.451948051948052,
"grad_norm": 0.29512012950255556,
"learning_rate": 4.808870289328153e-05,
"loss": 0.9513,
"step": 177
},
{
"epoch": 2.465800865800866,
"grad_norm": 0.24808203045159094,
"learning_rate": 4.7708490421923596e-05,
"loss": 0.9453,
"step": 178
},
{
"epoch": 2.47965367965368,
"grad_norm": 0.21937289844225158,
"learning_rate": 4.7327553221533074e-05,
"loss": 0.9581,
"step": 179
},
{
"epoch": 2.4935064935064934,
"grad_norm": 0.20437241337234358,
"learning_rate": 4.694592710667723e-05,
"loss": 0.948,
"step": 180
},
{
"epoch": 2.5073593073593075,
"grad_norm": 0.20182625174185811,
"learning_rate": 4.656364795669297e-05,
"loss": 0.9505,
"step": 181
},
{
"epoch": 2.5212121212121215,
"grad_norm": 0.2157700828054003,
"learning_rate": 4.618075171231363e-05,
"loss": 0.955,
"step": 182
},
{
"epoch": 2.535064935064935,
"grad_norm": 0.20198999241369922,
"learning_rate": 4.579727437228987e-05,
"loss": 0.9479,
"step": 183
},
{
"epoch": 2.548917748917749,
"grad_norm": 0.19349997377276865,
"learning_rate": 4.541325199000525e-05,
"loss": 0.9444,
"step": 184
},
{
"epoch": 2.562770562770563,
"grad_norm": 0.20821593855670595,
"learning_rate": 4.502872067008652e-05,
"loss": 0.9484,
"step": 185
},
{
"epoch": 2.5766233766233766,
"grad_norm": 0.22714292711765166,
"learning_rate": 4.464371656500921e-05,
"loss": 0.9478,
"step": 186
},
{
"epoch": 2.5904761904761906,
"grad_norm": 0.22439821970405607,
"learning_rate": 4.425827587169873e-05,
"loss": 0.9642,
"step": 187
},
{
"epoch": 2.6043290043290046,
"grad_norm": 0.19017166723603593,
"learning_rate": 4.387243482812717e-05,
"loss": 0.9354,
"step": 188
},
{
"epoch": 2.618181818181818,
"grad_norm": 0.2338760203213592,
"learning_rate": 4.348622970990634e-05,
"loss": 0.9608,
"step": 189
},
{
"epoch": 2.632034632034632,
"grad_norm": 0.19433184424361064,
"learning_rate": 4.309969682687724e-05,
"loss": 0.9365,
"step": 190
},
{
"epoch": 2.6458874458874457,
"grad_norm": 0.2006639594796061,
"learning_rate": 4.271287251969637e-05,
"loss": 0.943,
"step": 191
},
{
"epoch": 2.6597402597402597,
"grad_norm": 0.19675542180216962,
"learning_rate": 4.2325793156419035e-05,
"loss": 0.9629,
"step": 192
},
{
"epoch": 2.6735930735930737,
"grad_norm": 0.22882862992661218,
"learning_rate": 4.193849512908013e-05,
"loss": 0.9399,
"step": 193
},
{
"epoch": 2.6874458874458873,
"grad_norm": 0.27628995792251587,
"learning_rate": 4.155101485027268e-05,
"loss": 0.9517,
"step": 194
},
{
"epoch": 2.7012987012987013,
"grad_norm": 0.25152494788624064,
"learning_rate": 4.116338874972446e-05,
"loss": 0.9532,
"step": 195
},
{
"epoch": 2.7151515151515153,
"grad_norm": 0.17237631990944813,
"learning_rate": 4.077565327087298e-05,
"loss": 0.9443,
"step": 196
},
{
"epoch": 2.729004329004329,
"grad_norm": 0.22052058799944804,
"learning_rate": 4.0387844867439143e-05,
"loss": 0.9384,
"step": 197
},
{
"epoch": 2.742857142857143,
"grad_norm": 0.2821185693525401,
"learning_rate": 4e-05,
"loss": 0.9506,
"step": 198
},
{
"epoch": 2.7567099567099564,
"grad_norm": 0.23974193332071514,
"learning_rate": 3.961215513256086e-05,
"loss": 0.944,
"step": 199
},
{
"epoch": 2.7705627705627704,
"grad_norm": 0.23881720962641614,
"learning_rate": 3.9224346729127034e-05,
"loss": 0.9423,
"step": 200
},
{
"epoch": 2.7844155844155845,
"grad_norm": 0.1774343946075327,
"learning_rate": 3.8836611250275546e-05,
"loss": 0.9355,
"step": 201
},
{
"epoch": 2.798268398268398,
"grad_norm": 0.23570113544248983,
"learning_rate": 3.844898514972733e-05,
"loss": 0.9519,
"step": 202
},
{
"epoch": 2.812121212121212,
"grad_norm": 0.21653970566029948,
"learning_rate": 3.806150487091989e-05,
"loss": 0.951,
"step": 203
},
{
"epoch": 2.825974025974026,
"grad_norm": 0.1881655573837289,
"learning_rate": 3.767420684358097e-05,
"loss": 0.9425,
"step": 204
},
{
"epoch": 2.8398268398268396,
"grad_norm": 0.19487964543004402,
"learning_rate": 3.7287127480303634e-05,
"loss": 0.9496,
"step": 205
},
{
"epoch": 2.8536796536796536,
"grad_norm": 0.21940934921746677,
"learning_rate": 3.690030317312277e-05,
"loss": 0.9326,
"step": 206
},
{
"epoch": 2.8675324675324676,
"grad_norm": 0.22419835861035028,
"learning_rate": 3.6513770290093674e-05,
"loss": 0.958,
"step": 207
},
{
"epoch": 2.881385281385281,
"grad_norm": 0.20379473922199545,
"learning_rate": 3.612756517187284e-05,
"loss": 0.9475,
"step": 208
},
{
"epoch": 2.895238095238095,
"grad_norm": 0.15734328009114276,
"learning_rate": 3.574172412830127e-05,
"loss": 0.9446,
"step": 209
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.2577374514137676,
"learning_rate": 3.535628343499079e-05,
"loss": 0.9518,
"step": 210
},
{
"epoch": 2.9229437229437227,
"grad_norm": 0.21560632289046236,
"learning_rate": 3.49712793299135e-05,
"loss": 0.9321,
"step": 211
},
{
"epoch": 2.9367965367965367,
"grad_norm": 0.19086166501572058,
"learning_rate": 3.458674800999477e-05,
"loss": 0.939,
"step": 212
},
{
"epoch": 2.9506493506493507,
"grad_norm": 0.1635737725085455,
"learning_rate": 3.4202725627710136e-05,
"loss": 0.9519,
"step": 213
},
{
"epoch": 2.9645021645021643,
"grad_norm": 0.2063878664719065,
"learning_rate": 3.3819248287686386e-05,
"loss": 0.9408,
"step": 214
},
{
"epoch": 2.9783549783549783,
"grad_norm": 0.21758034147643424,
"learning_rate": 3.343635204330704e-05,
"loss": 0.9366,
"step": 215
},
{
"epoch": 2.9922077922077923,
"grad_norm": 0.1756516719858461,
"learning_rate": 3.305407289332279e-05,
"loss": 0.9261,
"step": 216
},
{
"epoch": 3.006060606060606,
"grad_norm": 0.44437950709772883,
"learning_rate": 3.267244677846693e-05,
"loss": 1.6737,
"step": 217
},
{
"epoch": 3.01991341991342,
"grad_norm": 0.5202547459859553,
"learning_rate": 3.229150957807641e-05,
"loss": 0.9065,
"step": 218
},
{
"epoch": 3.033766233766234,
"grad_norm": 0.4201768177217496,
"learning_rate": 3.191129710671849e-05,
"loss": 0.8993,
"step": 219
},
{
"epoch": 3.0476190476190474,
"grad_norm": 0.3469006955241282,
"learning_rate": 3.153184511082359e-05,
"loss": 0.8924,
"step": 220
},
{
"epoch": 3.0614718614718615,
"grad_norm": 0.34894763894121467,
"learning_rate": 3.1153189265324494e-05,
"loss": 0.9091,
"step": 221
},
{
"epoch": 3.0753246753246755,
"grad_norm": 0.3951659967368868,
"learning_rate": 3.07753651703024e-05,
"loss": 0.9103,
"step": 222
},
{
"epoch": 3.089177489177489,
"grad_norm": 0.33506373060928457,
"learning_rate": 3.0398408347639773e-05,
"loss": 0.8895,
"step": 223
},
{
"epoch": 3.103030303030303,
"grad_norm": 0.2808678451376146,
"learning_rate": 3.0022354237680752e-05,
"loss": 0.8954,
"step": 224
},
{
"epoch": 3.116883116883117,
"grad_norm": 0.3452617358086684,
"learning_rate": 2.9647238195899168e-05,
"loss": 0.8954,
"step": 225
},
{
"epoch": 3.1307359307359306,
"grad_norm": 0.32553230647238945,
"learning_rate": 2.9273095489574502e-05,
"loss": 0.897,
"step": 226
},
{
"epoch": 3.1445887445887446,
"grad_norm": 0.2604914839354281,
"learning_rate": 2.889996129447618e-05,
"loss": 0.907,
"step": 227
},
{
"epoch": 3.1584415584415586,
"grad_norm": 0.34111866816202957,
"learning_rate": 2.8527870691556404e-05,
"loss": 0.8981,
"step": 228
},
{
"epoch": 3.172294372294372,
"grad_norm": 0.28026302405180475,
"learning_rate": 2.8156858663652015e-05,
"loss": 0.9033,
"step": 229
},
{
"epoch": 3.186147186147186,
"grad_norm": 0.26870372034953893,
"learning_rate": 2.778696009219548e-05,
"loss": 0.9059,
"step": 230
},
{
"epoch": 3.2,
"grad_norm": 0.3798626491614641,
"learning_rate": 2.7418209753935464e-05,
"loss": 0.8894,
"step": 231
},
{
"epoch": 3.2138528138528137,
"grad_norm": 0.21379716918544014,
"learning_rate": 2.7050642317667164e-05,
"loss": 0.8937,
"step": 232
},
{
"epoch": 3.2277056277056277,
"grad_norm": 0.31956814421124774,
"learning_rate": 2.6684292340972936e-05,
"loss": 0.9068,
"step": 233
},
{
"epoch": 3.2415584415584417,
"grad_norm": 0.194502129845176,
"learning_rate": 2.6319194266973256e-05,
"loss": 0.8999,
"step": 234
},
{
"epoch": 3.2554112554112553,
"grad_norm": 0.25288436825501515,
"learning_rate": 2.5955382421088457e-05,
"loss": 0.8876,
"step": 235
},
{
"epoch": 3.2692640692640693,
"grad_norm": 0.2045328796636946,
"learning_rate": 2.5592891007811594e-05,
"loss": 0.9056,
"step": 236
},
{
"epoch": 3.2831168831168833,
"grad_norm": 0.17690924985251477,
"learning_rate": 2.523175410749263e-05,
"loss": 0.9068,
"step": 237
},
{
"epoch": 3.296969696969697,
"grad_norm": 0.20432688291964138,
"learning_rate": 2.4872005673134307e-05,
"loss": 0.8916,
"step": 238
},
{
"epoch": 3.310822510822511,
"grad_norm": 0.17738981903795317,
"learning_rate": 2.4513679527199986e-05,
"loss": 0.9115,
"step": 239
},
{
"epoch": 3.324675324675325,
"grad_norm": 0.16833331057473214,
"learning_rate": 2.4156809358433728e-05,
"loss": 0.8891,
"step": 240
},
{
"epoch": 3.3385281385281385,
"grad_norm": 0.17407822439034182,
"learning_rate": 2.3801428718693055e-05,
"loss": 0.8936,
"step": 241
},
{
"epoch": 3.3523809523809525,
"grad_norm": 0.16434385080662373,
"learning_rate": 2.3447571019794438e-05,
"loss": 0.9079,
"step": 242
},
{
"epoch": 3.3662337662337665,
"grad_norm": 0.1647420511208294,
"learning_rate": 2.3095269530372032e-05,
"loss": 0.8904,
"step": 243
},
{
"epoch": 3.38008658008658,
"grad_norm": 0.16465200281562736,
"learning_rate": 2.274455737274987e-05,
"loss": 0.8965,
"step": 244
},
{
"epoch": 3.393939393939394,
"grad_norm": 0.1942259697042446,
"learning_rate": 2.239546751982782e-05,
"loss": 0.9039,
"step": 245
},
{
"epoch": 3.407792207792208,
"grad_norm": 0.15418958599426286,
"learning_rate": 2.2048032791981515e-05,
"loss": 0.8921,
"step": 246
},
{
"epoch": 3.4216450216450216,
"grad_norm": 0.15256309020808106,
"learning_rate": 2.1702285853976774e-05,
"loss": 0.8972,
"step": 247
},
{
"epoch": 3.4354978354978356,
"grad_norm": 0.14590845303296213,
"learning_rate": 2.135825921189846e-05,
"loss": 0.8967,
"step": 248
},
{
"epoch": 3.449350649350649,
"grad_norm": 0.1756342017642444,
"learning_rate": 2.1015985210094385e-05,
"loss": 0.9089,
"step": 249
},
{
"epoch": 3.463203463203463,
"grad_norm": 0.14928130402546771,
"learning_rate": 2.067549602813446e-05,
"loss": 0.9116,
"step": 250
},
{
"epoch": 3.477056277056277,
"grad_norm": 0.19622196885081308,
"learning_rate": 2.033682367778518e-05,
"loss": 0.9035,
"step": 251
},
{
"epoch": 3.4909090909090907,
"grad_norm": 0.16833682605095,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.9049,
"step": 252
},
{
"epoch": 3.5047619047619047,
"grad_norm": 0.1700606136967009,
"learning_rate": 1.966505666192579e-05,
"loss": 0.9013,
"step": 253
},
{
"epoch": 3.5186147186147188,
"grad_norm": 0.1795362591013133,
"learning_rate": 1.9332025153925486e-05,
"loss": 0.887,
"step": 254
},
{
"epoch": 3.5324675324675323,
"grad_norm": 0.16623457555792936,
"learning_rate": 1.90009367866176e-05,
"loss": 0.9025,
"step": 255
},
{
"epoch": 3.5463203463203463,
"grad_norm": 0.1724331408670692,
"learning_rate": 1.867182268793236e-05,
"loss": 0.902,
"step": 256
},
{
"epoch": 3.5601731601731603,
"grad_norm": 0.156738658049747,
"learning_rate": 1.8344713800185215e-05,
"loss": 0.8935,
"step": 257
},
{
"epoch": 3.574025974025974,
"grad_norm": 0.16288790800709219,
"learning_rate": 1.8019640877167763e-05,
"loss": 0.898,
"step": 258
},
{
"epoch": 3.587878787878788,
"grad_norm": 0.15690946638171066,
"learning_rate": 1.7696634481256293e-05,
"loss": 0.8959,
"step": 259
},
{
"epoch": 3.601731601731602,
"grad_norm": 0.16001262583220252,
"learning_rate": 1.7375724980538465e-05,
"loss": 0.8888,
"step": 260
},
{
"epoch": 3.6155844155844155,
"grad_norm": 0.15064377615121663,
"learning_rate": 1.7056942545958167e-05,
"loss": 0.9089,
"step": 261
},
{
"epoch": 3.6294372294372295,
"grad_norm": 0.13096790236650285,
"learning_rate": 1.6740317148478932e-05,
"loss": 0.9055,
"step": 262
},
{
"epoch": 3.643290043290043,
"grad_norm": 0.14921599598853594,
"learning_rate": 1.642587855626621e-05,
"loss": 0.9154,
"step": 263
},
{
"epoch": 3.657142857142857,
"grad_norm": 0.13367750739235254,
"learning_rate": 1.6113656331888563e-05,
"loss": 0.8954,
"step": 264
},
{
"epoch": 3.670995670995671,
"grad_norm": 0.14168194296838715,
"learning_rate": 1.580367982953833e-05,
"loss": 0.8939,
"step": 265
},
{
"epoch": 3.6848484848484846,
"grad_norm": 0.14492593957298525,
"learning_rate": 1.5495978192271887e-05,
"loss": 0.91,
"step": 266
},
{
"epoch": 3.6987012987012986,
"grad_norm": 0.1316497818256666,
"learning_rate": 1.5190580349269604e-05,
"loss": 0.9027,
"step": 267
},
{
"epoch": 3.7125541125541126,
"grad_norm": 0.15841380793742146,
"learning_rate": 1.4887515013116067e-05,
"loss": 0.9106,
"step": 268
},
{
"epoch": 3.726406926406926,
"grad_norm": 0.13126491215447147,
"learning_rate": 1.4586810677100608e-05,
"loss": 0.8937,
"step": 269
},
{
"epoch": 3.74025974025974,
"grad_norm": 0.1495403663254427,
"learning_rate": 1.4288495612538427e-05,
"loss": 0.9034,
"step": 270
},
{
"epoch": 3.754112554112554,
"grad_norm": 0.12429246476808327,
"learning_rate": 1.3992597866112667e-05,
"loss": 0.8975,
"step": 271
},
{
"epoch": 3.7679653679653677,
"grad_norm": 0.13097022929593902,
"learning_rate": 1.369914525723746e-05,
"loss": 0.8882,
"step": 272
},
{
"epoch": 3.7818181818181817,
"grad_norm": 0.13482171999455558,
"learning_rate": 1.3408165375442486e-05,
"loss": 0.8906,
"step": 273
},
{
"epoch": 3.7956709956709958,
"grad_norm": 0.12515899928871424,
"learning_rate": 1.3119685577779105e-05,
"loss": 0.9008,
"step": 274
},
{
"epoch": 3.8095238095238093,
"grad_norm": 0.13069692054136395,
"learning_rate": 1.2833732986248277e-05,
"loss": 0.8853,
"step": 275
},
{
"epoch": 3.8233766233766233,
"grad_norm": 0.13447223817691295,
"learning_rate": 1.2550334485250661e-05,
"loss": 0.9051,
"step": 276
},
{
"epoch": 3.8372294372294373,
"grad_norm": 0.12306949358534137,
"learning_rate": 1.2269516719059041e-05,
"loss": 0.8979,
"step": 277
},
{
"epoch": 3.851082251082251,
"grad_norm": 0.13274764900634733,
"learning_rate": 1.1991306089313261e-05,
"loss": 0.901,
"step": 278
},
{
"epoch": 3.864935064935065,
"grad_norm": 0.12496506975650054,
"learning_rate": 1.1715728752538103e-05,
"loss": 0.8851,
"step": 279
},
{
"epoch": 3.878787878787879,
"grad_norm": 0.12342700776133213,
"learning_rate": 1.1442810617684046e-05,
"loss": 0.8906,
"step": 280
},
{
"epoch": 3.8926406926406925,
"grad_norm": 0.11718555769651504,
"learning_rate": 1.1172577343691415e-05,
"loss": 0.8945,
"step": 281
},
{
"epoch": 3.9064935064935065,
"grad_norm": 0.11900571530829156,
"learning_rate": 1.0905054337078051e-05,
"loss": 0.8939,
"step": 282
},
{
"epoch": 3.9203463203463205,
"grad_norm": 0.11761709393948508,
"learning_rate": 1.0640266749550593e-05,
"loss": 0.8987,
"step": 283
},
{
"epoch": 3.934199134199134,
"grad_norm": 0.12426098474964,
"learning_rate": 1.0378239475639823e-05,
"loss": 0.8954,
"step": 284
},
{
"epoch": 3.948051948051948,
"grad_norm": 0.11342564958505907,
"learning_rate": 1.0118997150360169e-05,
"loss": 0.8967,
"step": 285
},
{
"epoch": 3.961904761904762,
"grad_norm": 0.12414751882404233,
"learning_rate": 9.862564146893571e-06,
"loss": 0.8942,
"step": 286
},
{
"epoch": 3.9757575757575756,
"grad_norm": 0.11821007668599343,
"learning_rate": 9.60896457429803e-06,
"loss": 0.8981,
"step": 287
},
{
"epoch": 3.9896103896103896,
"grad_norm": 0.11207748566968422,
"learning_rate": 9.358222275240884e-06,
"loss": 0.8969,
"step": 288
},
{
"epoch": 4.003463203463204,
"grad_norm": 0.24776696231966608,
"learning_rate": 9.110360823757235e-06,
"loss": 1.6175,
"step": 289
},
{
"epoch": 4.017316017316017,
"grad_norm": 0.1639268139321257,
"learning_rate": 8.8654035230336e-06,
"loss": 0.8757,
"step": 290
},
{
"epoch": 4.031168831168831,
"grad_norm": 0.1430026414045171,
"learning_rate": 8.623373403216972e-06,
"loss": 0.8619,
"step": 291
},
{
"epoch": 4.045021645021645,
"grad_norm": 0.13983259059672157,
"learning_rate": 8.384293219249633e-06,
"loss": 0.875,
"step": 292
},
{
"epoch": 4.058874458874459,
"grad_norm": 0.14776698103121835,
"learning_rate": 8.148185448729778e-06,
"loss": 0.8712,
"step": 293
},
{
"epoch": 4.072727272727272,
"grad_norm": 0.1453264011082169,
"learning_rate": 7.915072289798247e-06,
"loss": 0.8859,
"step": 294
},
{
"epoch": 4.086580086580087,
"grad_norm": 0.15943779551259862,
"learning_rate": 7.684975659051557e-06,
"loss": 0.8662,
"step": 295
},
{
"epoch": 4.1004329004329,
"grad_norm": 0.1456231807293276,
"learning_rate": 7.457917189481301e-06,
"loss": 0.8774,
"step": 296
},
{
"epoch": 4.114285714285714,
"grad_norm": 0.14340143561096827,
"learning_rate": 7.233918228440324e-06,
"loss": 0.8774,
"step": 297
},
{
"epoch": 4.128138528138528,
"grad_norm": 0.14023071744580373,
"learning_rate": 7.0129998356357295e-06,
"loss": 0.863,
"step": 298
},
{
"epoch": 4.141991341991342,
"grad_norm": 0.14172173606520722,
"learning_rate": 6.795182781148848e-06,
"loss": 0.8767,
"step": 299
},
{
"epoch": 4.1558441558441555,
"grad_norm": 0.1318876467621652,
"learning_rate": 6.58048754348255e-06,
"loss": 0.8709,
"step": 300
},
{
"epoch": 4.16969696969697,
"grad_norm": 0.1517460979685681,
"learning_rate": 6.368934307635881e-06,
"loss": 0.8716,
"step": 301
},
{
"epoch": 4.1835497835497835,
"grad_norm": 0.15120519716651545,
"learning_rate": 6.160542963206357e-06,
"loss": 0.8697,
"step": 302
},
{
"epoch": 4.197402597402597,
"grad_norm": 0.12276533641084203,
"learning_rate": 5.955333102520011e-06,
"loss": 0.8628,
"step": 303
},
{
"epoch": 4.2112554112554115,
"grad_norm": 0.1303847318332295,
"learning_rate": 5.753324018789346e-06,
"loss": 0.8708,
"step": 304
},
{
"epoch": 4.225108225108225,
"grad_norm": 0.13706452110864129,
"learning_rate": 5.554534704299448e-06,
"loss": 0.8566,
"step": 305
},
{
"epoch": 4.238961038961039,
"grad_norm": 0.15781002543920747,
"learning_rate": 5.358983848622452e-06,
"loss": 0.8764,
"step": 306
},
{
"epoch": 4.252813852813853,
"grad_norm": 0.11520912795530423,
"learning_rate": 5.1666898368603195e-06,
"loss": 0.8749,
"step": 307
},
{
"epoch": 4.266666666666667,
"grad_norm": 0.11508546810833122,
"learning_rate": 4.97767074791637e-06,
"loss": 0.8657,
"step": 308
},
{
"epoch": 4.28051948051948,
"grad_norm": 0.14352142083453215,
"learning_rate": 4.791944352795561e-06,
"loss": 0.8919,
"step": 309
},
{
"epoch": 4.294372294372295,
"grad_norm": 0.13642778141475553,
"learning_rate": 4.609528112933688e-06,
"loss": 0.8575,
"step": 310
},
{
"epoch": 4.308225108225108,
"grad_norm": 0.11645525287361383,
"learning_rate": 4.430439178555759e-06,
"loss": 0.874,
"step": 311
},
{
"epoch": 4.322077922077922,
"grad_norm": 0.11198885083380229,
"learning_rate": 4.254694387063514e-06,
"loss": 0.866,
"step": 312
},
{
"epoch": 4.335930735930736,
"grad_norm": 0.11999719505276203,
"learning_rate": 4.082310261452471e-06,
"loss": 0.8809,
"step": 313
},
{
"epoch": 4.34978354978355,
"grad_norm": 0.11431861199461578,
"learning_rate": 3.913303008758491e-06,
"loss": 0.8739,
"step": 314
},
{
"epoch": 4.363636363636363,
"grad_norm": 0.1089763284328194,
"learning_rate": 3.747688518534003e-06,
"loss": 0.8764,
"step": 315
},
{
"epoch": 4.377489177489178,
"grad_norm": 0.11083535668146678,
"learning_rate": 3.585482361354138e-06,
"loss": 0.874,
"step": 316
},
{
"epoch": 4.391341991341991,
"grad_norm": 0.10462111723473196,
"learning_rate": 3.42669978735283e-06,
"loss": 0.8712,
"step": 317
},
{
"epoch": 4.405194805194805,
"grad_norm": 0.11192874060919457,
"learning_rate": 3.2713557247890447e-06,
"loss": 0.865,
"step": 318
},
{
"epoch": 4.419047619047619,
"grad_norm": 0.0998639300176411,
"learning_rate": 3.1194647786432663e-06,
"loss": 0.8628,
"step": 319
},
{
"epoch": 4.432900432900433,
"grad_norm": 0.1037388404966585,
"learning_rate": 2.9710412292443868e-06,
"loss": 0.8744,
"step": 320
},
{
"epoch": 4.4467532467532465,
"grad_norm": 0.10341839983438926,
"learning_rate": 2.8260990309270987e-06,
"loss": 0.8707,
"step": 321
},
{
"epoch": 4.460606060606061,
"grad_norm": 0.10245055505097513,
"learning_rate": 2.6846518107199782e-06,
"loss": 0.869,
"step": 322
},
{
"epoch": 4.4744588744588745,
"grad_norm": 0.10245685258161713,
"learning_rate": 2.546712867064276e-06,
"loss": 0.866,
"step": 323
},
{
"epoch": 4.488311688311688,
"grad_norm": 0.10246348212442796,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.869,
"step": 324
},
{
"epoch": 4.5021645021645025,
"grad_norm": 0.10133630585516906,
"learning_rate": 2.281411352764966e-06,
"loss": 0.8661,
"step": 325
},
{
"epoch": 4.516017316017316,
"grad_norm": 0.10385457357599492,
"learning_rate": 2.1540737249699893e-06,
"loss": 0.8665,
"step": 326
},
{
"epoch": 4.52987012987013,
"grad_norm": 0.09787930849328196,
"learning_rate": 2.0302942570786446e-06,
"loss": 0.8587,
"step": 327
},
{
"epoch": 4.543722943722944,
"grad_norm": 0.09875061097653641,
"learning_rate": 1.9100845864633875e-06,
"loss": 0.862,
"step": 328
},
{
"epoch": 4.557575757575758,
"grad_norm": 0.10019109859451927,
"learning_rate": 1.793456014875079e-06,
"loss": 0.8667,
"step": 329
},
{
"epoch": 4.571428571428571,
"grad_norm": 0.09607007590769094,
"learning_rate": 1.6804195073804442e-06,
"loss": 0.8609,
"step": 330
},
{
"epoch": 4.585281385281386,
"grad_norm": 0.0995091150688806,
"learning_rate": 1.5709856913311795e-06,
"loss": 0.8631,
"step": 331
},
{
"epoch": 4.599134199134199,
"grad_norm": 0.10237535339157534,
"learning_rate": 1.4651648553647869e-06,
"loss": 0.874,
"step": 332
},
{
"epoch": 4.612987012987013,
"grad_norm": 0.09685943360360758,
"learning_rate": 1.3629669484372722e-06,
"loss": 0.8608,
"step": 333
},
{
"epoch": 4.626839826839827,
"grad_norm": 0.10088872360008577,
"learning_rate": 1.2644015788877684e-06,
"loss": 0.8776,
"step": 334
},
{
"epoch": 4.640692640692641,
"grad_norm": 0.09659731541025765,
"learning_rate": 1.1694780135352013e-06,
"loss": 0.8659,
"step": 335
},
{
"epoch": 4.654545454545454,
"grad_norm": 0.09754069143347813,
"learning_rate": 1.0782051768070477e-06,
"loss": 0.8822,
"step": 336
},
{
"epoch": 4.668398268398269,
"grad_norm": 0.09529068088084004,
"learning_rate": 9.905916499002787e-07,
"loss": 0.8632,
"step": 337
},
{
"epoch": 4.682251082251082,
"grad_norm": 0.09443098915190634,
"learning_rate": 9.066456699745774e-07,
"loss": 0.8686,
"step": 338
},
{
"epoch": 4.696103896103896,
"grad_norm": 0.09719204747726426,
"learning_rate": 8.263751293779409e-07,
"loss": 0.8709,
"step": 339
},
{
"epoch": 4.70995670995671,
"grad_norm": 0.0989300648418707,
"learning_rate": 7.497875749046124e-07,
"loss": 0.8706,
"step": 340
},
{
"epoch": 4.723809523809524,
"grad_norm": 0.09639393839499397,
"learning_rate": 6.768902070856031e-07,
"loss": 0.8661,
"step": 341
},
{
"epoch": 4.7376623376623375,
"grad_norm": 0.09557188345066484,
"learning_rate": 6.076898795116792e-07,
"loss": 0.8662,
"step": 342
},
{
"epoch": 4.751515151515152,
"grad_norm": 0.09944408893779064,
"learning_rate": 5.421930981890455e-07,
"loss": 0.877,
"step": 343
},
{
"epoch": 4.7653679653679655,
"grad_norm": 0.0977504011176678,
"learning_rate": 4.804060209276396e-07,
"loss": 0.8658,
"step": 344
},
{
"epoch": 4.779220779220779,
"grad_norm": 0.09464762553229625,
"learning_rate": 4.223344567622212e-07,
"loss": 0.8718,
"step": 345
},
{
"epoch": 4.7930735930735935,
"grad_norm": 0.09515637845594775,
"learning_rate": 3.679838654061874e-07,
"loss": 0.8672,
"step": 346
},
{
"epoch": 4.806926406926407,
"grad_norm": 0.09692757545190614,
"learning_rate": 3.173593567383071e-07,
"loss": 0.8762,
"step": 347
},
{
"epoch": 4.820779220779221,
"grad_norm": 0.09525175615621749,
"learning_rate": 2.704656903222791e-07,
"loss": 0.8792,
"step": 348
},
{
"epoch": 4.834632034632035,
"grad_norm": 0.09621257866702408,
"learning_rate": 2.273072749592631e-07,
"loss": 0.864,
"step": 349
},
{
"epoch": 4.848484848484849,
"grad_norm": 0.09435391607466348,
"learning_rate": 1.8788816827336686e-07,
"loss": 0.8827,
"step": 350
},
{
"epoch": 4.862337662337662,
"grad_norm": 0.09330676760639534,
"learning_rate": 1.522120763301782e-07,
"loss": 0.8634,
"step": 351
},
{
"epoch": 4.876190476190477,
"grad_norm": 0.09377768092440732,
"learning_rate": 1.2028235328831906e-07,
"loss": 0.8782,
"step": 352
},
{
"epoch": 4.89004329004329,
"grad_norm": 0.09540719747182097,
"learning_rate": 9.21020010840934e-08,
"loss": 0.8721,
"step": 353
},
{
"epoch": 4.903896103896104,
"grad_norm": 0.09356725286148478,
"learning_rate": 6.767366914927298e-08,
"loss": 0.8784,
"step": 354
},
{
"epoch": 4.917748917748918,
"grad_norm": 0.09257225973193513,
"learning_rate": 4.699965416198549e-08,
"loss": 0.8794,
"step": 355
},
{
"epoch": 4.931601731601732,
"grad_norm": 0.09315617718680014,
"learning_rate": 3.0081899830798345e-08,
"loss": 0.8658,
"step": 356
},
{
"epoch": 4.945454545454545,
"grad_norm": 0.09320193350709476,
"learning_rate": 1.6921996711976028e-08,
"loss": 0.8666,
"step": 357
},
{
"epoch": 4.95930735930736,
"grad_norm": 0.09451963386678745,
"learning_rate": 7.521182059946342e-09,
"loss": 0.866,
"step": 358
},
{
"epoch": 4.973160173160173,
"grad_norm": 0.09250072566157394,
"learning_rate": 1.8803397109534715e-09,
"loss": 0.8639,
"step": 359
},
{
"epoch": 4.987012987012987,
"grad_norm": 0.09138839375450408,
"learning_rate": 0.0,
"loss": 0.8814,
"step": 360
},
{
"epoch": 4.987012987012987,
"step": 360,
"total_flos": 9.572466247992345e+18,
"train_loss": 0.0,
"train_runtime": 2.6399,
"train_samples_per_second": 69987.374,
"train_steps_per_second": 136.369
}
],
"logging_steps": 1,
"max_steps": 360,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.572466247992345e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}