{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.942630185348632,
"eval_steps": 500,
"global_step": 350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01412180052956752,
"grad_norm": 5.979931009348192,
"learning_rate": 2.285714285714286e-06,
"loss": 0.8154,
"step": 1
},
{
"epoch": 0.02824360105913504,
"grad_norm": 6.047301062181586,
"learning_rate": 4.571428571428572e-06,
"loss": 0.8243,
"step": 2
},
{
"epoch": 0.04236540158870256,
"grad_norm": 5.589397434568418,
"learning_rate": 6.857142857142858e-06,
"loss": 0.8057,
"step": 3
},
{
"epoch": 0.05648720211827008,
"grad_norm": 4.017126644109988,
"learning_rate": 9.142857142857144e-06,
"loss": 0.7602,
"step": 4
},
{
"epoch": 0.0706090026478376,
"grad_norm": 2.1627060531795967,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.7197,
"step": 5
},
{
"epoch": 0.08473080317740513,
"grad_norm": 4.667143281538081,
"learning_rate": 1.3714285714285716e-05,
"loss": 0.7374,
"step": 6
},
{
"epoch": 0.09885260370697264,
"grad_norm": 6.545492179001986,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.7325,
"step": 7
},
{
"epoch": 0.11297440423654016,
"grad_norm": 7.281711055430636,
"learning_rate": 1.8285714285714288e-05,
"loss": 0.7502,
"step": 8
},
{
"epoch": 0.12709620476610767,
"grad_norm": 4.392674048119666,
"learning_rate": 2.057142857142857e-05,
"loss": 0.7051,
"step": 9
},
{
"epoch": 0.1412180052956752,
"grad_norm": 2.9074563359884973,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.6593,
"step": 10
},
{
"epoch": 0.1553398058252427,
"grad_norm": 2.205323794009288,
"learning_rate": 2.5142857142857143e-05,
"loss": 0.6276,
"step": 11
},
{
"epoch": 0.16946160635481025,
"grad_norm": 1.4150033586173336,
"learning_rate": 2.742857142857143e-05,
"loss": 0.6013,
"step": 12
},
{
"epoch": 0.18358340688437777,
"grad_norm": 1.3251831618492345,
"learning_rate": 2.9714285714285717e-05,
"loss": 0.5859,
"step": 13
},
{
"epoch": 0.1977052074139453,
"grad_norm": 3.1805546649918544,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.5824,
"step": 14
},
{
"epoch": 0.2118270079435128,
"grad_norm": 1.4989971887780031,
"learning_rate": 3.4285714285714284e-05,
"loss": 0.5708,
"step": 15
},
{
"epoch": 0.22594880847308033,
"grad_norm": 1.203788524759605,
"learning_rate": 3.6571428571428576e-05,
"loss": 0.5563,
"step": 16
},
{
"epoch": 0.24007060900264784,
"grad_norm": 1.3578810581727971,
"learning_rate": 3.885714285714286e-05,
"loss": 0.5566,
"step": 17
},
{
"epoch": 0.25419240953221534,
"grad_norm": 0.8074275719021523,
"learning_rate": 4.114285714285714e-05,
"loss": 0.5507,
"step": 18
},
{
"epoch": 0.26831421006178285,
"grad_norm": 1.1227844753364196,
"learning_rate": 4.342857142857143e-05,
"loss": 0.5418,
"step": 19
},
{
"epoch": 0.2824360105913504,
"grad_norm": 0.8760389990351023,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.5333,
"step": 20
},
{
"epoch": 0.2965578111209179,
"grad_norm": 1.1881496989284341,
"learning_rate": 4.8e-05,
"loss": 0.5305,
"step": 21
},
{
"epoch": 0.3106796116504854,
"grad_norm": 0.9681459485298871,
"learning_rate": 5.0285714285714286e-05,
"loss": 0.5189,
"step": 22
},
{
"epoch": 0.324801412180053,
"grad_norm": 1.4344211614758422,
"learning_rate": 5.257142857142858e-05,
"loss": 0.5198,
"step": 23
},
{
"epoch": 0.3389232127096205,
"grad_norm": 0.8492140987790524,
"learning_rate": 5.485714285714286e-05,
"loss": 0.5102,
"step": 24
},
{
"epoch": 0.353045013239188,
"grad_norm": 1.2351656998878342,
"learning_rate": 5.714285714285715e-05,
"loss": 0.5098,
"step": 25
},
{
"epoch": 0.36716681376875554,
"grad_norm": 0.898578091846962,
"learning_rate": 5.9428571428571434e-05,
"loss": 0.5015,
"step": 26
},
{
"epoch": 0.38128861429832306,
"grad_norm": 1.5089247050740433,
"learning_rate": 6.171428571428573e-05,
"loss": 0.507,
"step": 27
},
{
"epoch": 0.3954104148278906,
"grad_norm": 0.9864208925736987,
"learning_rate": 6.400000000000001e-05,
"loss": 0.5034,
"step": 28
},
{
"epoch": 0.4095322153574581,
"grad_norm": 1.0206985743120736,
"learning_rate": 6.62857142857143e-05,
"loss": 0.4983,
"step": 29
},
{
"epoch": 0.4236540158870256,
"grad_norm": 1.4229934179471342,
"learning_rate": 6.857142857142857e-05,
"loss": 0.508,
"step": 30
},
{
"epoch": 0.43777581641659313,
"grad_norm": 0.9625460430091453,
"learning_rate": 7.085714285714287e-05,
"loss": 0.5016,
"step": 31
},
{
"epoch": 0.45189761694616065,
"grad_norm": 1.1144628190568628,
"learning_rate": 7.314285714285715e-05,
"loss": 0.4939,
"step": 32
},
{
"epoch": 0.46601941747572817,
"grad_norm": 0.9463549200929555,
"learning_rate": 7.542857142857144e-05,
"loss": 0.4924,
"step": 33
},
{
"epoch": 0.4801412180052957,
"grad_norm": 0.8892598203382347,
"learning_rate": 7.771428571428572e-05,
"loss": 0.4902,
"step": 34
},
{
"epoch": 0.4942630185348632,
"grad_norm": 0.9413854750510515,
"learning_rate": 8e-05,
"loss": 0.4852,
"step": 35
},
{
"epoch": 0.5083848190644307,
"grad_norm": 0.9034630826375731,
"learning_rate": 7.999801067823773e-05,
"loss": 0.4853,
"step": 36
},
{
"epoch": 0.5225066195939982,
"grad_norm": 1.2269858722315412,
"learning_rate": 7.999204291082095e-05,
"loss": 0.4764,
"step": 37
},
{
"epoch": 0.5366284201235657,
"grad_norm": 0.9045227868888749,
"learning_rate": 7.998209729134014e-05,
"loss": 0.4809,
"step": 38
},
{
"epoch": 0.5507502206531333,
"grad_norm": 1.2176978127037603,
"learning_rate": 7.996817480904718e-05,
"loss": 0.4719,
"step": 39
},
{
"epoch": 0.5648720211827007,
"grad_norm": 1.2333419409501036,
"learning_rate": 7.99502768487569e-05,
"loss": 0.477,
"step": 40
},
{
"epoch": 0.5789938217122683,
"grad_norm": 0.7483281237491166,
"learning_rate": 7.99284051907094e-05,
"loss": 0.4724,
"step": 41
},
{
"epoch": 0.5931156222418358,
"grad_norm": 0.6151558817864059,
"learning_rate": 7.990256201039297e-05,
"loss": 0.4662,
"step": 42
},
{
"epoch": 0.6072374227714034,
"grad_norm": 0.6814568571856022,
"learning_rate": 7.987274987832764e-05,
"loss": 0.4621,
"step": 43
},
{
"epoch": 0.6213592233009708,
"grad_norm": 0.9240497143419791,
"learning_rate": 7.983897175980957e-05,
"loss": 0.4665,
"step": 44
},
{
"epoch": 0.6354810238305384,
"grad_norm": 1.2034986971304784,
"learning_rate": 7.980123101461606e-05,
"loss": 0.4761,
"step": 45
},
{
"epoch": 0.649602824360106,
"grad_norm": 0.8192788227089312,
"learning_rate": 7.975953139667141e-05,
"loss": 0.4652,
"step": 46
},
{
"epoch": 0.6637246248896734,
"grad_norm": 0.7683306980890072,
"learning_rate": 7.97138770536735e-05,
"loss": 0.4619,
"step": 47
},
{
"epoch": 0.677846425419241,
"grad_norm": 0.7567015766907312,
"learning_rate": 7.966427252668121e-05,
"loss": 0.4638,
"step": 48
},
{
"epoch": 0.6919682259488085,
"grad_norm": 0.6846820764750615,
"learning_rate": 7.961072274966282e-05,
"loss": 0.4527,
"step": 49
},
{
"epoch": 0.706090026478376,
"grad_norm": 0.7395598100512276,
"learning_rate": 7.955323304900514e-05,
"loss": 0.4571,
"step": 50
},
{
"epoch": 0.7202118270079435,
"grad_norm": 0.5351232158771764,
"learning_rate": 7.949180914298383e-05,
"loss": 0.4496,
"step": 51
},
{
"epoch": 0.7343336275375111,
"grad_norm": 0.6324424558337066,
"learning_rate": 7.942645714119452e-05,
"loss": 0.4593,
"step": 52
},
{
"epoch": 0.7484554280670785,
"grad_norm": 0.547964739600884,
"learning_rate": 7.93571835439452e-05,
"loss": 0.4502,
"step": 53
},
{
"epoch": 0.7625772285966461,
"grad_norm": 0.7115536296101671,
"learning_rate": 7.928399524160956e-05,
"loss": 0.447,
"step": 54
},
{
"epoch": 0.7766990291262136,
"grad_norm": 0.742782814289987,
"learning_rate": 7.920689951394175e-05,
"loss": 0.4461,
"step": 55
},
{
"epoch": 0.7908208296557812,
"grad_norm": 0.6862659469941464,
"learning_rate": 7.912590402935223e-05,
"loss": 0.4473,
"step": 56
},
{
"epoch": 0.8049426301853486,
"grad_norm": 0.6235041641613883,
"learning_rate": 7.904101684414498e-05,
"loss": 0.4472,
"step": 57
},
{
"epoch": 0.8190644307149162,
"grad_norm": 0.44600795869954046,
"learning_rate": 7.895224640171625e-05,
"loss": 0.4442,
"step": 58
},
{
"epoch": 0.8331862312444837,
"grad_norm": 0.48251979778530707,
"learning_rate": 7.88596015317147e-05,
"loss": 0.4449,
"step": 59
},
{
"epoch": 0.8473080317740512,
"grad_norm": 0.4787558150068957,
"learning_rate": 7.876309144916312e-05,
"loss": 0.4433,
"step": 60
},
{
"epoch": 0.8614298323036187,
"grad_norm": 0.41827598666685606,
"learning_rate": 7.86627257535419e-05,
"loss": 0.4401,
"step": 61
},
{
"epoch": 0.8755516328331863,
"grad_norm": 0.4724100749619687,
"learning_rate": 7.855851442783414e-05,
"loss": 0.4374,
"step": 62
},
{
"epoch": 0.8896734333627537,
"grad_norm": 0.6571994588226032,
"learning_rate": 7.845046783753276e-05,
"loss": 0.4409,
"step": 63
},
{
"epoch": 0.9037952338923213,
"grad_norm": 0.9369294338435781,
"learning_rate": 7.833859672960943e-05,
"loss": 0.4407,
"step": 64
},
{
"epoch": 0.9179170344218888,
"grad_norm": 1.1601560447987704,
"learning_rate": 7.822291223144564e-05,
"loss": 0.4602,
"step": 65
},
{
"epoch": 0.9320388349514563,
"grad_norm": 0.6934703654331164,
"learning_rate": 7.810342584972585e-05,
"loss": 0.4369,
"step": 66
},
{
"epoch": 0.9461606354810238,
"grad_norm": 0.5791439547503463,
"learning_rate": 7.798014946929306e-05,
"loss": 0.4356,
"step": 67
},
{
"epoch": 0.9602824360105914,
"grad_norm": 0.8373041828808443,
"learning_rate": 7.785309535196657e-05,
"loss": 0.4504,
"step": 68
},
{
"epoch": 0.9744042365401588,
"grad_norm": 0.6796500376958069,
"learning_rate": 7.772227613532242e-05,
"loss": 0.4392,
"step": 69
},
{
"epoch": 0.9885260370697264,
"grad_norm": 0.6686880597044009,
"learning_rate": 7.758770483143634e-05,
"loss": 0.4474,
"step": 70
},
{
"epoch": 1.002647837599294,
"grad_norm": 0.6901488338737102,
"learning_rate": 7.74493948255895e-05,
"loss": 0.5108,
"step": 71
},
{
"epoch": 1.0167696381288613,
"grad_norm": 0.7139924415191212,
"learning_rate": 7.730735987493711e-05,
"loss": 0.4227,
"step": 72
},
{
"epoch": 1.030891438658429,
"grad_norm": 0.7623382444431029,
"learning_rate": 7.71616141071401e-05,
"loss": 0.419,
"step": 73
},
{
"epoch": 1.0450132391879965,
"grad_norm": 0.8179708530719029,
"learning_rate": 7.701217201895987e-05,
"loss": 0.4182,
"step": 74
},
{
"epoch": 1.059135039717564,
"grad_norm": 0.6036364923611257,
"learning_rate": 7.685904847481631e-05,
"loss": 0.4147,
"step": 75
},
{
"epoch": 1.0732568402471314,
"grad_norm": 0.5415944966587694,
"learning_rate": 7.670225870530936e-05,
"loss": 0.4192,
"step": 76
},
{
"epoch": 1.087378640776699,
"grad_norm": 0.548496642769106,
"learning_rate": 7.654181830570404e-05,
"loss": 0.4193,
"step": 77
},
{
"epoch": 1.1015004413062666,
"grad_norm": 0.4357435844414465,
"learning_rate": 7.637774323437929e-05,
"loss": 0.4126,
"step": 78
},
{
"epoch": 1.1156222418358341,
"grad_norm": 0.5890851003105865,
"learning_rate": 7.62100498112406e-05,
"loss": 0.4193,
"step": 79
},
{
"epoch": 1.1297440423654015,
"grad_norm": 0.5417176133106055,
"learning_rate": 7.603875471609677e-05,
"loss": 0.4069,
"step": 80
},
{
"epoch": 1.143865842894969,
"grad_norm": 0.5234067170715418,
"learning_rate": 7.586387498700084e-05,
"loss": 0.4187,
"step": 81
},
{
"epoch": 1.1579876434245366,
"grad_norm": 0.4795761329002007,
"learning_rate": 7.568542801855535e-05,
"loss": 0.4101,
"step": 82
},
{
"epoch": 1.1721094439541042,
"grad_norm": 0.510485207368403,
"learning_rate": 7.550343156018217e-05,
"loss": 0.4074,
"step": 83
},
{
"epoch": 1.1862312444836718,
"grad_norm": 0.5160993194955293,
"learning_rate": 7.531790371435709e-05,
"loss": 0.4105,
"step": 84
},
{
"epoch": 1.2003530450132391,
"grad_norm": 0.6272135654421417,
"learning_rate": 7.512886293480914e-05,
"loss": 0.4131,
"step": 85
},
{
"epoch": 1.2144748455428067,
"grad_norm": 0.7144516241332823,
"learning_rate": 7.49363280246852e-05,
"loss": 0.4123,
"step": 86
},
{
"epoch": 1.2285966460723743,
"grad_norm": 1.0197175196301183,
"learning_rate": 7.474031813467956e-05,
"loss": 0.4199,
"step": 87
},
{
"epoch": 1.2427184466019416,
"grad_norm": 0.9885970877399597,
"learning_rate": 7.454085276112925e-05,
"loss": 0.4152,
"step": 88
},
{
"epoch": 1.2568402471315092,
"grad_norm": 0.5518795345815659,
"learning_rate": 7.433795174407465e-05,
"loss": 0.4064,
"step": 89
},
{
"epoch": 1.2709620476610768,
"grad_norm": 0.42697954065556326,
"learning_rate": 7.413163526528623e-05,
"loss": 0.409,
"step": 90
},
{
"epoch": 1.2850838481906444,
"grad_norm": 0.698380780251885,
"learning_rate": 7.392192384625704e-05,
"loss": 0.4054,
"step": 91
},
{
"epoch": 1.299205648720212,
"grad_norm": 0.6703174317830842,
"learning_rate": 7.370883834616157e-05,
"loss": 0.4099,
"step": 92
},
{
"epoch": 1.3133274492497793,
"grad_norm": 0.3951173073488556,
"learning_rate": 7.349239995978095e-05,
"loss": 0.4084,
"step": 93
},
{
"epoch": 1.3274492497793469,
"grad_norm": 0.43174109319559356,
"learning_rate": 7.327263021539478e-05,
"loss": 0.4048,
"step": 94
},
{
"epoch": 1.3415710503089144,
"grad_norm": 0.5360712514545947,
"learning_rate": 7.30495509726398e-05,
"loss": 0.4068,
"step": 95
},
{
"epoch": 1.3556928508384818,
"grad_norm": 0.42774436448586106,
"learning_rate": 7.282318442033567e-05,
"loss": 0.4034,
"step": 96
},
{
"epoch": 1.3698146513680494,
"grad_norm": 0.5210499488927217,
"learning_rate": 7.259355307427781e-05,
"loss": 0.4078,
"step": 97
},
{
"epoch": 1.383936451897617,
"grad_norm": 0.7093148406292331,
"learning_rate": 7.236067977499791e-05,
"loss": 0.4084,
"step": 98
},
{
"epoch": 1.3980582524271845,
"grad_norm": 0.8196300420238753,
"learning_rate": 7.212458768549208e-05,
"loss": 0.4069,
"step": 99
},
{
"epoch": 1.412180052956752,
"grad_norm": 0.9973540383790642,
"learning_rate": 7.188530028891691e-05,
"loss": 0.4047,
"step": 100
},
{
"epoch": 1.4263018534863194,
"grad_norm": 1.0704815886198962,
"learning_rate": 7.164284138625367e-05,
"loss": 0.4075,
"step": 101
},
{
"epoch": 1.440423654015887,
"grad_norm": 0.5848553932345868,
"learning_rate": 7.13972350939409e-05,
"loss": 0.4036,
"step": 102
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.37289550464762866,
"learning_rate": 7.114850584147577e-05,
"loss": 0.4068,
"step": 103
},
{
"epoch": 1.468667255075022,
"grad_norm": 0.6651429035225815,
"learning_rate": 7.089667836898399e-05,
"loss": 0.4053,
"step": 104
},
{
"epoch": 1.4827890556045895,
"grad_norm": 0.6931193008736451,
"learning_rate": 7.064177772475912e-05,
"loss": 0.4002,
"step": 105
},
{
"epoch": 1.496910856134157,
"grad_norm": 0.3938085941153356,
"learning_rate": 7.038382926277113e-05,
"loss": 0.4013,
"step": 106
},
{
"epoch": 1.5110326566637247,
"grad_norm": 0.410899316731272,
"learning_rate": 7.012285864014445e-05,
"loss": 0.404,
"step": 107
},
{
"epoch": 1.5251544571932922,
"grad_norm": 0.5933306150673846,
"learning_rate": 6.985889181460602e-05,
"loss": 0.3992,
"step": 108
},
{
"epoch": 1.5392762577228596,
"grad_norm": 0.47465582200581674,
"learning_rate": 6.959195504190337e-05,
"loss": 0.4022,
"step": 109
},
{
"epoch": 1.5533980582524272,
"grad_norm": 0.29047076547162964,
"learning_rate": 6.932207487319305e-05,
"loss": 0.3933,
"step": 110
},
{
"epoch": 1.5675198587819947,
"grad_norm": 0.3955673661524972,
"learning_rate": 6.904927815239972e-05,
"loss": 0.4014,
"step": 111
},
{
"epoch": 1.581641659311562,
"grad_norm": 0.4729958849916794,
"learning_rate": 6.877359201354606e-05,
"loss": 0.4014,
"step": 112
},
{
"epoch": 1.5957634598411299,
"grad_norm": 0.3117935062342313,
"learning_rate": 6.84950438780538e-05,
"loss": 0.4021,
"step": 113
},
{
"epoch": 1.6098852603706972,
"grad_norm": 0.29707807435124145,
"learning_rate": 6.821366145201636e-05,
"loss": 0.4003,
"step": 114
},
{
"epoch": 1.6240070609002648,
"grad_norm": 0.43753418225532925,
"learning_rate": 6.792947272344292e-05,
"loss": 0.3992,
"step": 115
},
{
"epoch": 1.6381288614298324,
"grad_norm": 0.27791625901461003,
"learning_rate": 6.76425059594746e-05,
"loss": 0.3982,
"step": 116
},
{
"epoch": 1.6522506619593997,
"grad_norm": 0.2525501356536547,
"learning_rate": 6.73527897035728e-05,
"loss": 0.4053,
"step": 117
},
{
"epoch": 1.6663724624889673,
"grad_norm": 0.3669470139311434,
"learning_rate": 6.706035277268022e-05,
"loss": 0.4024,
"step": 118
},
{
"epoch": 1.6804942630185349,
"grad_norm": 0.30825865476024705,
"learning_rate": 6.676522425435433e-05,
"loss": 0.3945,
"step": 119
},
{
"epoch": 1.6946160635481022,
"grad_norm": 0.28018108144253323,
"learning_rate": 6.646743350387438e-05,
"loss": 0.3984,
"step": 120
},
{
"epoch": 1.70873786407767,
"grad_norm": 0.30987982118204843,
"learning_rate": 6.616701014132138e-05,
"loss": 0.4021,
"step": 121
},
{
"epoch": 1.7228596646072374,
"grad_norm": 0.3633571089136772,
"learning_rate": 6.586398404863198e-05,
"loss": 0.4026,
"step": 122
},
{
"epoch": 1.736981465136805,
"grad_norm": 0.36013913213670684,
"learning_rate": 6.555838536662624e-05,
"loss": 0.3925,
"step": 123
},
{
"epoch": 1.7511032656663725,
"grad_norm": 0.36709693358558493,
"learning_rate": 6.525024449200956e-05,
"loss": 0.3976,
"step": 124
},
{
"epoch": 1.7652250661959399,
"grad_norm": 0.44695442666677676,
"learning_rate": 6.493959207434934e-05,
"loss": 0.3982,
"step": 125
},
{
"epoch": 1.7793468667255075,
"grad_norm": 0.4500722428050271,
"learning_rate": 6.462645901302633e-05,
"loss": 0.3947,
"step": 126
},
{
"epoch": 1.793468667255075,
"grad_norm": 0.39821702121821073,
"learning_rate": 6.431087645416121e-05,
"loss": 0.4015,
"step": 127
},
{
"epoch": 1.8075904677846424,
"grad_norm": 0.42798393839154475,
"learning_rate": 6.399287578751656e-05,
"loss": 0.3959,
"step": 128
},
{
"epoch": 1.8217122683142102,
"grad_norm": 0.4978207058435827,
"learning_rate": 6.367248864337471e-05,
"loss": 0.3975,
"step": 129
},
{
"epoch": 1.8358340688437775,
"grad_norm": 0.4727933665511357,
"learning_rate": 6.334974688939161e-05,
"loss": 0.3961,
"step": 130
},
{
"epoch": 1.849955869373345,
"grad_norm": 0.30157997491072186,
"learning_rate": 6.302468262742695e-05,
"loss": 0.3923,
"step": 131
},
{
"epoch": 1.8640776699029127,
"grad_norm": 0.30111696128126747,
"learning_rate": 6.269732819035128e-05,
"loss": 0.3895,
"step": 132
},
{
"epoch": 1.87819947043248,
"grad_norm": 0.33866239621320493,
"learning_rate": 6.236771613882987e-05,
"loss": 0.3933,
"step": 133
},
{
"epoch": 1.8923212709620476,
"grad_norm": 0.2963866045397337,
"learning_rate": 6.20358792580841e-05,
"loss": 0.3865,
"step": 134
},
{
"epoch": 1.9064430714916152,
"grad_norm": 0.2821832464959724,
"learning_rate": 6.170185055463039e-05,
"loss": 0.3985,
"step": 135
},
{
"epoch": 1.9205648720211828,
"grad_norm": 0.26513081199542754,
"learning_rate": 6.136566325299715e-05,
"loss": 0.3972,
"step": 136
},
{
"epoch": 1.9346866725507503,
"grad_norm": 0.25040847849987535,
"learning_rate": 6.102735079242019e-05,
"loss": 0.398,
"step": 137
},
{
"epoch": 1.9488084730803177,
"grad_norm": 0.303971008854815,
"learning_rate": 6.068694682351651e-05,
"loss": 0.3957,
"step": 138
},
{
"epoch": 1.9629302736098853,
"grad_norm": 0.2610849344447032,
"learning_rate": 6.0344485204937274e-05,
"loss": 0.3953,
"step": 139
},
{
"epoch": 1.9770520741394528,
"grad_norm": 0.24540141466965165,
"learning_rate": 6.000000000000001e-05,
"loss": 0.3955,
"step": 140
},
{
"epoch": 1.9911738746690202,
"grad_norm": 0.2807585102662493,
"learning_rate": 5.965352547330046e-05,
"loss": 0.4096,
"step": 141
},
{
"epoch": 2.005295675198588,
"grad_norm": 0.4269953277008037,
"learning_rate": 5.930509608730444e-05,
"loss": 0.4441,
"step": 142
},
{
"epoch": 2.0194174757281553,
"grad_norm": 0.5672907609303462,
"learning_rate": 5.895474649891995e-05,
"loss": 0.3728,
"step": 143
},
{
"epoch": 2.0335392762577227,
"grad_norm": 0.7266748405757633,
"learning_rate": 5.860251155605003e-05,
"loss": 0.3745,
"step": 144
},
{
"epoch": 2.0476610767872905,
"grad_norm": 1.0069160934332146,
"learning_rate": 5.824842629412653e-05,
"loss": 0.3832,
"step": 145
},
{
"epoch": 2.061782877316858,
"grad_norm": 0.9424187541004289,
"learning_rate": 5.7892525932625305e-05,
"loss": 0.3779,
"step": 146
},
{
"epoch": 2.0759046778464256,
"grad_norm": 0.42574191446629944,
"learning_rate": 5.75348458715631e-05,
"loss": 0.3718,
"step": 147
},
{
"epoch": 2.090026478375993,
"grad_norm": 0.7408316783846461,
"learning_rate": 5.7175421687976374e-05,
"loss": 0.3699,
"step": 148
},
{
"epoch": 2.1041482789055603,
"grad_norm": 0.6750908749341442,
"learning_rate": 5.681428913238263e-05,
"loss": 0.367,
"step": 149
},
{
"epoch": 2.118270079435128,
"grad_norm": 0.486610272879909,
"learning_rate": 5.645148412522447e-05,
"loss": 0.3752,
"step": 150
},
{
"epoch": 2.1323918799646955,
"grad_norm": 0.5306866815139071,
"learning_rate": 5.60870427532967e-05,
"loss": 0.3657,
"step": 151
},
{
"epoch": 2.146513680494263,
"grad_norm": 0.4884339447717486,
"learning_rate": 5.572100126615695e-05,
"loss": 0.3701,
"step": 152
},
{
"epoch": 2.1606354810238306,
"grad_norm": 0.39396923901380754,
"learning_rate": 5.535339607252003e-05,
"loss": 0.364,
"step": 153
},
{
"epoch": 2.174757281553398,
"grad_norm": 0.3784748162116266,
"learning_rate": 5.4984263736636494e-05,
"loss": 0.3641,
"step": 154
},
{
"epoch": 2.1888790820829658,
"grad_norm": 0.38537106208995364,
"learning_rate": 5.461364097465581e-05,
"loss": 0.3634,
"step": 155
},
{
"epoch": 2.203000882612533,
"grad_norm": 0.33639666599879814,
"learning_rate": 5.424156465097428e-05,
"loss": 0.3676,
"step": 156
},
{
"epoch": 2.2171226831421005,
"grad_norm": 0.3286791724075738,
"learning_rate": 5.38680717745683e-05,
"loss": 0.3649,
"step": 157
},
{
"epoch": 2.2312444836716683,
"grad_norm": 0.3129994921836922,
"learning_rate": 5.349319949531321e-05,
"loss": 0.3646,
"step": 158
},
{
"epoch": 2.2453662842012356,
"grad_norm": 0.3031016329231297,
"learning_rate": 5.3116985100288185e-05,
"loss": 0.3682,
"step": 159
},
{
"epoch": 2.259488084730803,
"grad_norm": 0.27541410223019297,
"learning_rate": 5.2739466010067385e-05,
"loss": 0.3606,
"step": 160
},
{
"epoch": 2.2736098852603708,
"grad_norm": 0.36257443661095795,
"learning_rate": 5.23606797749979e-05,
"loss": 0.3638,
"step": 161
},
{
"epoch": 2.287731685789938,
"grad_norm": 0.2567579985831816,
"learning_rate": 5.1980664071464776e-05,
"loss": 0.3667,
"step": 162
},
{
"epoch": 2.301853486319506,
"grad_norm": 0.31190867351244567,
"learning_rate": 5.159945669814345e-05,
"loss": 0.3696,
"step": 163
},
{
"epoch": 2.3159752868490733,
"grad_norm": 0.2937382011800516,
"learning_rate": 5.121709557224011e-05,
"loss": 0.3606,
"step": 164
},
{
"epoch": 2.3300970873786406,
"grad_norm": 0.2256249267158452,
"learning_rate": 5.0833618725720214e-05,
"loss": 0.365,
"step": 165
},
{
"epoch": 2.3442188879082084,
"grad_norm": 0.298331814145165,
"learning_rate": 5.044906430152554e-05,
"loss": 0.3667,
"step": 166
},
{
"epoch": 2.358340688437776,
"grad_norm": 0.18895739371171252,
"learning_rate": 5.006347054978035e-05,
"loss": 0.3699,
"step": 167
},
{
"epoch": 2.3724624889673436,
"grad_norm": 0.25034317840687215,
"learning_rate": 4.967687582398671e-05,
"loss": 0.3587,
"step": 168
},
{
"epoch": 2.386584289496911,
"grad_norm": 0.17907966208059622,
"learning_rate": 4.9289318577209706e-05,
"loss": 0.3636,
"step": 169
},
{
"epoch": 2.4007060900264783,
"grad_norm": 0.21210095036882018,
"learning_rate": 4.890083735825258e-05,
"loss": 0.3605,
"step": 170
},
{
"epoch": 2.414827890556046,
"grad_norm": 0.16489305774518265,
"learning_rate": 4.851147080782249e-05,
"loss": 0.3648,
"step": 171
},
{
"epoch": 2.4289496910856134,
"grad_norm": 0.19143993377462817,
"learning_rate": 4.812125765468705e-05,
"loss": 0.3606,
"step": 172
},
{
"epoch": 2.443071491615181,
"grad_norm": 0.17804983590295367,
"learning_rate": 4.773023671182213e-05,
"loss": 0.3637,
"step": 173
},
{
"epoch": 2.4571932921447486,
"grad_norm": 0.16707259472270428,
"learning_rate": 4.73384468725513e-05,
"loss": 0.3636,
"step": 174
},
{
"epoch": 2.471315092674316,
"grad_norm": 0.17481885632199456,
"learning_rate": 4.694592710667723e-05,
"loss": 0.3645,
"step": 175
},
{
"epoch": 2.4854368932038833,
"grad_norm": 0.1681053608116463,
"learning_rate": 4.6552716456605514e-05,
"loss": 0.3605,
"step": 176
},
{
"epoch": 2.499558693733451,
"grad_norm": 0.14964611415536702,
"learning_rate": 4.615885403346134e-05,
"loss": 0.3562,
"step": 177
},
{
"epoch": 2.5136804942630184,
"grad_norm": 0.14164675176141614,
"learning_rate": 4.576437901319921e-05,
"loss": 0.3636,
"step": 178
},
{
"epoch": 2.5278022947925862,
"grad_norm": 0.16548274190466053,
"learning_rate": 4.5369330632706223e-05,
"loss": 0.3648,
"step": 179
},
{
"epoch": 2.5419240953221536,
"grad_norm": 0.15269683467677936,
"learning_rate": 4.4973748185899416e-05,
"loss": 0.3612,
"step": 180
},
{
"epoch": 2.556045895851721,
"grad_norm": 0.16869434151649507,
"learning_rate": 4.457767101981728e-05,
"loss": 0.3677,
"step": 181
},
{
"epoch": 2.5701676963812887,
"grad_norm": 0.13337265767063033,
"learning_rate": 4.418113853070614e-05,
"loss": 0.3626,
"step": 182
},
{
"epoch": 2.584289496910856,
"grad_norm": 0.14682144236789746,
"learning_rate": 4.378419016010149e-05,
"loss": 0.364,
"step": 183
},
{
"epoch": 2.598411297440424,
"grad_norm": 0.150937900490833,
"learning_rate": 4.338686539090493e-05,
"loss": 0.3615,
"step": 184
},
{
"epoch": 2.6125330979699912,
"grad_norm": 0.1341377364551312,
"learning_rate": 4.298920374345698e-05,
"loss": 0.3596,
"step": 185
},
{
"epoch": 2.6266548984995586,
"grad_norm": 0.15572962430762588,
"learning_rate": 4.259124477160607e-05,
"loss": 0.3625,
"step": 186
},
{
"epoch": 2.6407766990291264,
"grad_norm": 0.1475404012486826,
"learning_rate": 4.219302805877441e-05,
"loss": 0.3617,
"step": 187
},
{
"epoch": 2.6548984995586937,
"grad_norm": 0.1781262720167099,
"learning_rate": 4.17945932140206e-05,
"loss": 0.3666,
"step": 188
},
{
"epoch": 2.6690203000882615,
"grad_norm": 0.13824587532461255,
"learning_rate": 4.139597986810005e-05,
"loss": 0.3629,
"step": 189
},
{
"epoch": 2.683142100617829,
"grad_norm": 0.15963593698467365,
"learning_rate": 4.0997227669522924e-05,
"loss": 0.3628,
"step": 190
},
{
"epoch": 2.6972639011473962,
"grad_norm": 0.12511715922220792,
"learning_rate": 4.059837628061055e-05,
"loss": 0.3638,
"step": 191
},
{
"epoch": 2.7113857016769636,
"grad_norm": 0.15752313446706914,
"learning_rate": 4.019946537355033e-05,
"loss": 0.3614,
"step": 192
},
{
"epoch": 2.7255075022065314,
"grad_norm": 0.13647413322377422,
"learning_rate": 3.9800534626449683e-05,
"loss": 0.3634,
"step": 193
},
{
"epoch": 2.7396293027360987,
"grad_norm": 0.13525074863232164,
"learning_rate": 3.940162371938947e-05,
"loss": 0.3587,
"step": 194
},
{
"epoch": 2.7537511032656665,
"grad_norm": 0.13297285710552217,
"learning_rate": 3.9002772330477096e-05,
"loss": 0.3599,
"step": 195
},
{
"epoch": 2.767872903795234,
"grad_norm": 0.14225004712058384,
"learning_rate": 3.860402013189998e-05,
"loss": 0.3575,
"step": 196
},
{
"epoch": 2.7819947043248012,
"grad_norm": 0.13373630438071715,
"learning_rate": 3.820540678597942e-05,
"loss": 0.3648,
"step": 197
},
{
"epoch": 2.796116504854369,
"grad_norm": 0.12615478953418785,
"learning_rate": 3.78069719412256e-05,
"loss": 0.3609,
"step": 198
},
{
"epoch": 2.8102383053839364,
"grad_norm": 0.12669967225071216,
"learning_rate": 3.740875522839393e-05,
"loss": 0.3608,
"step": 199
},
{
"epoch": 2.824360105913504,
"grad_norm": 0.13635382545910668,
"learning_rate": 3.7010796256543034e-05,
"loss": 0.3549,
"step": 200
},
{
"epoch": 2.8384819064430715,
"grad_norm": 0.11546629160995592,
"learning_rate": 3.661313460909507e-05,
"loss": 0.3593,
"step": 201
},
{
"epoch": 2.852603706972639,
"grad_norm": 0.12139128794186867,
"learning_rate": 3.621580983989852e-05,
"loss": 0.3608,
"step": 202
},
{
"epoch": 2.8667255075022067,
"grad_norm": 0.12319344865206981,
"learning_rate": 3.581886146929387e-05,
"loss": 0.3605,
"step": 203
},
{
"epoch": 2.880847308031774,
"grad_norm": 0.14742473593815408,
"learning_rate": 3.542232898018273e-05,
"loss": 0.3582,
"step": 204
},
{
"epoch": 2.894969108561342,
"grad_norm": 0.11086460953888361,
"learning_rate": 3.5026251814100604e-05,
"loss": 0.359,
"step": 205
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.13533789741325936,
"learning_rate": 3.4630669367293797e-05,
"loss": 0.3562,
"step": 206
},
{
"epoch": 2.9232127096204765,
"grad_norm": 0.11573276006772669,
"learning_rate": 3.4235620986800806e-05,
"loss": 0.3641,
"step": 207
},
{
"epoch": 2.937334510150044,
"grad_norm": 0.12838446326005826,
"learning_rate": 3.384114596653866e-05,
"loss": 0.361,
"step": 208
},
{
"epoch": 2.9514563106796117,
"grad_norm": 0.12304575149956651,
"learning_rate": 3.344728354339449e-05,
"loss": 0.3586,
"step": 209
},
{
"epoch": 2.965578111209179,
"grad_norm": 0.12773291501034634,
"learning_rate": 3.305407289332279e-05,
"loss": 0.3559,
"step": 210
},
{
"epoch": 2.979699911738747,
"grad_norm": 0.16335068209235123,
"learning_rate": 3.266155312744871e-05,
"loss": 0.3631,
"step": 211
},
{
"epoch": 2.993821712268314,
"grad_norm": 0.1186978138033666,
"learning_rate": 3.226976328817788e-05,
"loss": 0.3927,
"step": 212
},
{
"epoch": 3.0079435127978815,
"grad_norm": 0.16211984652497452,
"learning_rate": 3.187874234531296e-05,
"loss": 0.3822,
"step": 213
},
{
"epoch": 3.0220653133274493,
"grad_norm": 0.14214772364476422,
"learning_rate": 3.1488529192177526e-05,
"loss": 0.3393,
"step": 214
},
{
"epoch": 3.0361871138570167,
"grad_norm": 0.13255124874063956,
"learning_rate": 3.109916264174743e-05,
"loss": 0.3373,
"step": 215
},
{
"epoch": 3.0503089143865845,
"grad_norm": 0.16606000923059963,
"learning_rate": 3.071068142279031e-05,
"loss": 0.3371,
"step": 216
},
{
"epoch": 3.064430714916152,
"grad_norm": 0.14657630327267304,
"learning_rate": 3.0323124176013297e-05,
"loss": 0.3355,
"step": 217
},
{
"epoch": 3.078552515445719,
"grad_norm": 0.1341605905929287,
"learning_rate": 2.993652945021966e-05,
"loss": 0.3377,
"step": 218
},
{
"epoch": 3.092674315975287,
"grad_norm": 0.14490108611743277,
"learning_rate": 2.955093569847447e-05,
"loss": 0.3366,
"step": 219
},
{
"epoch": 3.1067961165048543,
"grad_norm": 0.13919821523407064,
"learning_rate": 2.9166381274279803e-05,
"loss": 0.3312,
"step": 220
},
{
"epoch": 3.120917917034422,
"grad_norm": 0.16300975058477254,
"learning_rate": 2.8782904427759898e-05,
"loss": 0.3311,
"step": 221
},
{
"epoch": 3.1350397175639895,
"grad_norm": 0.1183225077661534,
"learning_rate": 2.8400543301856553e-05,
"loss": 0.3282,
"step": 222
},
{
"epoch": 3.149161518093557,
"grad_norm": 0.14092204872317698,
"learning_rate": 2.8019335928535234e-05,
"loss": 0.3297,
"step": 223
},
{
"epoch": 3.1632833186231246,
"grad_norm": 0.1282390396455681,
"learning_rate": 2.7639320225002108e-05,
"loss": 0.327,
"step": 224
},
{
"epoch": 3.177405119152692,
"grad_norm": 0.12936573725572997,
"learning_rate": 2.7260533989932628e-05,
"loss": 0.3346,
"step": 225
},
{
"epoch": 3.1915269196822593,
"grad_norm": 0.11727309920196596,
"learning_rate": 2.688301489971183e-05,
"loss": 0.3271,
"step": 226
},
{
"epoch": 3.205648720211827,
"grad_norm": 0.12274146196879084,
"learning_rate": 2.6506800504686806e-05,
"loss": 0.328,
"step": 227
},
{
"epoch": 3.2197705207413945,
"grad_norm": 0.11029811005681434,
"learning_rate": 2.6131928225431713e-05,
"loss": 0.33,
"step": 228
},
{
"epoch": 3.233892321270962,
"grad_norm": 0.12463320131443856,
"learning_rate": 2.575843534902573e-05,
"loss": 0.3358,
"step": 229
},
{
"epoch": 3.2480141218005296,
"grad_norm": 0.11256203223325899,
"learning_rate": 2.53863590253442e-05,
"loss": 0.3364,
"step": 230
},
{
"epoch": 3.262135922330097,
"grad_norm": 0.10841743259905046,
"learning_rate": 2.501573626336352e-05,
"loss": 0.3337,
"step": 231
},
{
"epoch": 3.2762577228596648,
"grad_norm": 0.11593566286716334,
"learning_rate": 2.464660392747999e-05,
"loss": 0.3301,
"step": 232
},
{
"epoch": 3.290379523389232,
"grad_norm": 0.10969283000201786,
"learning_rate": 2.427899873384306e-05,
"loss": 0.332,
"step": 233
},
{
"epoch": 3.3045013239187995,
"grad_norm": 0.12033857141829916,
"learning_rate": 2.3912957246703305e-05,
"loss": 0.3377,
"step": 234
},
{
"epoch": 3.3186231244483673,
"grad_norm": 0.10210001952439796,
"learning_rate": 2.3548515874775547e-05,
"loss": 0.3297,
"step": 235
},
{
"epoch": 3.3327449249779346,
"grad_norm": 0.12241287674636975,
"learning_rate": 2.3185710867617387e-05,
"loss": 0.3361,
"step": 236
},
{
"epoch": 3.3468667255075024,
"grad_norm": 0.10969299118083352,
"learning_rate": 2.2824578312023632e-05,
"loss": 0.3322,
"step": 237
},
{
"epoch": 3.3609885260370698,
"grad_norm": 0.12151530040465547,
"learning_rate": 2.24651541284369e-05,
"loss": 0.3361,
"step": 238
},
{
"epoch": 3.375110326566637,
"grad_norm": 0.10631863902215113,
"learning_rate": 2.210747406737469e-05,
"loss": 0.3344,
"step": 239
},
{
"epoch": 3.389232127096205,
"grad_norm": 0.11983276963310185,
"learning_rate": 2.175157370587348e-05,
"loss": 0.3324,
"step": 240
},
{
"epoch": 3.4033539276257723,
"grad_norm": 0.10203118790788067,
"learning_rate": 2.1397488443949985e-05,
"loss": 0.3366,
"step": 241
},
{
"epoch": 3.4174757281553396,
"grad_norm": 0.11460733945580791,
"learning_rate": 2.1045253501080058e-05,
"loss": 0.3335,
"step": 242
},
{
"epoch": 3.4315975286849074,
"grad_norm": 0.10361959122829918,
"learning_rate": 2.0694903912695574e-05,
"loss": 0.3342,
"step": 243
},
{
"epoch": 3.4457193292144748,
"grad_norm": 0.10602009006473866,
"learning_rate": 2.0346474526699552e-05,
"loss": 0.3343,
"step": 244
},
{
"epoch": 3.459841129744042,
"grad_norm": 0.0981614565374733,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.3342,
"step": 245
},
{
"epoch": 3.47396293027361,
"grad_norm": 0.10563881070295801,
"learning_rate": 1.9655514795062746e-05,
"loss": 0.3317,
"step": 246
},
{
"epoch": 3.4880847308031773,
"grad_norm": 0.0982393867459211,
"learning_rate": 1.931305317648349e-05,
"loss": 0.336,
"step": 247
},
{
"epoch": 3.502206531332745,
"grad_norm": 0.10341107342114168,
"learning_rate": 1.897264920757981e-05,
"loss": 0.3329,
"step": 248
},
{
"epoch": 3.5163283318623124,
"grad_norm": 0.1009205150822494,
"learning_rate": 1.8634336747002853e-05,
"loss": 0.3363,
"step": 249
},
{
"epoch": 3.5304501323918798,
"grad_norm": 0.09562831286129422,
"learning_rate": 1.829814944536963e-05,
"loss": 0.3366,
"step": 250
},
{
"epoch": 3.5445719329214476,
"grad_norm": 0.10055162803558056,
"learning_rate": 1.7964120741915905e-05,
"loss": 0.3359,
"step": 251
},
{
"epoch": 3.558693733451015,
"grad_norm": 0.10362087580690618,
"learning_rate": 1.7632283861170135e-05,
"loss": 0.33,
"step": 252
},
{
"epoch": 3.5728155339805827,
"grad_norm": 0.09578324331311534,
"learning_rate": 1.7302671809648735e-05,
"loss": 0.3336,
"step": 253
},
{
"epoch": 3.58693733451015,
"grad_norm": 0.1021943484963981,
"learning_rate": 1.6975317372573066e-05,
"loss": 0.334,
"step": 254
},
{
"epoch": 3.6010591350397174,
"grad_norm": 0.10104477227737499,
"learning_rate": 1.6650253110608415e-05,
"loss": 0.3352,
"step": 255
},
{
"epoch": 3.615180935569285,
"grad_norm": 0.09719144111824624,
"learning_rate": 1.6327511356625302e-05,
"loss": 0.3339,
"step": 256
},
{
"epoch": 3.6293027360988526,
"grad_norm": 0.10082549447043057,
"learning_rate": 1.6007124212483453e-05,
"loss": 0.3303,
"step": 257
},
{
"epoch": 3.6434245366284204,
"grad_norm": 0.09855344501708733,
"learning_rate": 1.5689123545838804e-05,
"loss": 0.3319,
"step": 258
},
{
"epoch": 3.6575463371579877,
"grad_norm": 0.10038693196972406,
"learning_rate": 1.537354098697367e-05,
"loss": 0.3285,
"step": 259
},
{
"epoch": 3.671668137687555,
"grad_norm": 0.10993218050906065,
"learning_rate": 1.5060407925650662e-05,
"loss": 0.3346,
"step": 260
},
{
"epoch": 3.6857899382171224,
"grad_norm": 0.09881058692426582,
"learning_rate": 1.4749755507990449e-05,
"loss": 0.3265,
"step": 261
},
{
"epoch": 3.69991173874669,
"grad_norm": 0.11110424733317653,
"learning_rate": 1.4441614633373773e-05,
"loss": 0.3367,
"step": 262
},
{
"epoch": 3.7140335392762576,
"grad_norm": 0.09507466207790345,
"learning_rate": 1.413601595136802e-05,
"loss": 0.335,
"step": 263
},
{
"epoch": 3.7281553398058254,
"grad_norm": 0.10341229060389236,
"learning_rate": 1.383298985867863e-05,
"loss": 0.3324,
"step": 264
},
{
"epoch": 3.7422771403353927,
"grad_norm": 0.09734360531860331,
"learning_rate": 1.3532566496125634e-05,
"loss": 0.3313,
"step": 265
},
{
"epoch": 3.75639894086496,
"grad_norm": 0.09174570798780135,
"learning_rate": 1.3234775745645684e-05,
"loss": 0.3351,
"step": 266
},
{
"epoch": 3.770520741394528,
"grad_norm": 0.10147835781586892,
"learning_rate": 1.2939647227319791e-05,
"loss": 0.3353,
"step": 267
},
{
"epoch": 3.784642541924095,
"grad_norm": 0.09808246222031777,
"learning_rate": 1.2647210296427197e-05,
"loss": 0.3323,
"step": 268
},
{
"epoch": 3.798764342453663,
"grad_norm": 0.09735163985861015,
"learning_rate": 1.2357494040525416e-05,
"loss": 0.3391,
"step": 269
},
{
"epoch": 3.8128861429832304,
"grad_norm": 0.08930562493255255,
"learning_rate": 1.2070527276557092e-05,
"loss": 0.3327,
"step": 270
},
{
"epoch": 3.8270079435127977,
"grad_norm": 0.09744814905553326,
"learning_rate": 1.178633854798365e-05,
"loss": 0.33,
"step": 271
},
{
"epoch": 3.8411297440423655,
"grad_norm": 0.09183836496663382,
"learning_rate": 1.1504956121946216e-05,
"loss": 0.3317,
"step": 272
},
{
"epoch": 3.855251544571933,
"grad_norm": 0.08801876422756064,
"learning_rate": 1.1226407986453963e-05,
"loss": 0.3294,
"step": 273
},
{
"epoch": 3.8693733451015007,
"grad_norm": 0.08798928229950856,
"learning_rate": 1.0950721847600282e-05,
"loss": 0.3282,
"step": 274
},
{
"epoch": 3.883495145631068,
"grad_norm": 0.09000845113363774,
"learning_rate": 1.0677925126806956e-05,
"loss": 0.335,
"step": 275
},
{
"epoch": 3.8976169461606354,
"grad_norm": 0.09609952332604478,
"learning_rate": 1.040804495809665e-05,
"loss": 0.3352,
"step": 276
},
{
"epoch": 3.911738746690203,
"grad_norm": 0.09426777621829556,
"learning_rate": 1.0141108185393995e-05,
"loss": 0.3307,
"step": 277
},
{
"epoch": 3.9258605472197705,
"grad_norm": 0.08749576305220681,
"learning_rate": 9.877141359855567e-06,
"loss": 0.3316,
"step": 278
},
{
"epoch": 3.9399823477493383,
"grad_norm": 0.08573388419725536,
"learning_rate": 9.616170737228882e-06,
"loss": 0.3301,
"step": 279
},
{
"epoch": 3.9541041482789057,
"grad_norm": 0.08677743094561904,
"learning_rate": 9.358222275240884e-06,
"loss": 0.3309,
"step": 280
},
{
"epoch": 3.968225948808473,
"grad_norm": 0.08456912932018501,
"learning_rate": 9.103321631016024e-06,
"loss": 0.3294,
"step": 281
},
{
"epoch": 3.9823477493380404,
"grad_norm": 0.0892840459688823,
"learning_rate": 8.851494158524242e-06,
"loss": 0.3299,
"step": 282
},
{
"epoch": 3.996469549867608,
"grad_norm": 0.09785834932292316,
"learning_rate": 8.602764906059109e-06,
"loss": 0.3734,
"step": 283
},
{
"epoch": 4.010591350397176,
"grad_norm": 0.1159182382828669,
"learning_rate": 8.35715861374636e-06,
"loss": 0.3432,
"step": 284
},
{
"epoch": 4.024713150926743,
"grad_norm": 0.11348869033645836,
"learning_rate": 8.114699711083113e-06,
"loss": 0.3187,
"step": 285
},
{
"epoch": 4.038834951456311,
"grad_norm": 0.09626843456466473,
"learning_rate": 7.875412314507942e-06,
"loss": 0.3213,
"step": 286
},
{
"epoch": 4.052956751985878,
"grad_norm": 0.0918806636447836,
"learning_rate": 7.639320225002106e-06,
"loss": 0.3169,
"step": 287
},
{
"epoch": 4.067078552515445,
"grad_norm": 0.09514043448978982,
"learning_rate": 7.406446925722211e-06,
"loss": 0.3148,
"step": 288
},
{
"epoch": 4.081200353045014,
"grad_norm": 0.10508295602012874,
"learning_rate": 7.176815579664343e-06,
"loss": 0.3132,
"step": 289
},
{
"epoch": 4.095322153574581,
"grad_norm": 0.10091079365331981,
"learning_rate": 6.950449027360213e-06,
"loss": 0.3175,
"step": 290
},
{
"epoch": 4.109443954104148,
"grad_norm": 0.0973346460822993,
"learning_rate": 6.7273697846052515e-06,
"loss": 0.3184,
"step": 291
},
{
"epoch": 4.123565754633716,
"grad_norm": 0.09115379235697503,
"learning_rate": 6.507600040219073e-06,
"loss": 0.3164,
"step": 292
},
{
"epoch": 4.137687555163283,
"grad_norm": 0.08901902718597547,
"learning_rate": 6.291161653838434e-06,
"loss": 0.3177,
"step": 293
},
{
"epoch": 4.151809355692851,
"grad_norm": 0.09132299423316595,
"learning_rate": 6.078076153742962e-06,
"loss": 0.3131,
"step": 294
},
{
"epoch": 4.165931156222419,
"grad_norm": 0.09543903005749907,
"learning_rate": 5.868364734713776e-06,
"loss": 0.3142,
"step": 295
},
{
"epoch": 4.180052956751986,
"grad_norm": 0.09061531269851537,
"learning_rate": 5.662048255925357e-06,
"loss": 0.3204,
"step": 296
},
{
"epoch": 4.194174757281553,
"grad_norm": 0.08551951038992002,
"learning_rate": 5.459147238870768e-06,
"loss": 0.3158,
"step": 297
},
{
"epoch": 4.208296557811121,
"grad_norm": 0.08387425510980595,
"learning_rate": 5.259681865320447e-06,
"loss": 0.3194,
"step": 298
},
{
"epoch": 4.222418358340688,
"grad_norm": 0.0901228464398898,
"learning_rate": 5.063671975314814e-06,
"loss": 0.3163,
"step": 299
},
{
"epoch": 4.236540158870256,
"grad_norm": 0.08691256583540367,
"learning_rate": 4.871137065190854e-06,
"loss": 0.315,
"step": 300
},
{
"epoch": 4.250661959399824,
"grad_norm": 0.0878527835574059,
"learning_rate": 4.6820962856429205e-06,
"loss": 0.3176,
"step": 301
},
{
"epoch": 4.264783759929391,
"grad_norm": 0.0840437037057203,
"learning_rate": 4.496568439817836e-06,
"loss": 0.322,
"step": 302
},
{
"epoch": 4.278905560458958,
"grad_norm": 0.08904988122589128,
"learning_rate": 4.314571981444666e-06,
"loss": 0.311,
"step": 303
},
{
"epoch": 4.293027360988526,
"grad_norm": 0.08120215219780037,
"learning_rate": 4.136125012999168e-06,
"loss": 0.3203,
"step": 304
},
{
"epoch": 4.307149161518094,
"grad_norm": 0.08522052695009742,
"learning_rate": 3.961245283903239e-06,
"loss": 0.3161,
"step": 305
},
{
"epoch": 4.321270962047661,
"grad_norm": 0.08319753808748938,
"learning_rate": 3.7899501887594102e-06,
"loss": 0.315,
"step": 306
},
{
"epoch": 4.335392762577229,
"grad_norm": 0.08198211403858394,
"learning_rate": 3.622256765620713e-06,
"loss": 0.3165,
"step": 307
},
{
"epoch": 4.349514563106796,
"grad_norm": 0.07827444542073485,
"learning_rate": 3.458181694295961e-06,
"loss": 0.3114,
"step": 308
},
{
"epoch": 4.363636363636363,
"grad_norm": 0.07827005931051699,
"learning_rate": 3.297741294690644e-06,
"loss": 0.3125,
"step": 309
},
{
"epoch": 4.3777581641659316,
"grad_norm": 0.07833274350751808,
"learning_rate": 3.140951525183691e-06,
"loss": 0.3156,
"step": 310
},
{
"epoch": 4.391879964695499,
"grad_norm": 0.08055700180528477,
"learning_rate": 2.987827981040132e-06,
"loss": 0.3144,
"step": 311
},
{
"epoch": 4.406001765225066,
"grad_norm": 0.0799614180245514,
"learning_rate": 2.8383858928598963e-06,
"loss": 0.3157,
"step": 312
},
{
"epoch": 4.420123565754634,
"grad_norm": 0.0722165779006397,
"learning_rate": 2.692640125062895e-06,
"loss": 0.3116,
"step": 313
},
{
"epoch": 4.434245366284201,
"grad_norm": 0.07776220076295337,
"learning_rate": 2.550605174410512e-06,
"loss": 0.3206,
"step": 314
},
{
"epoch": 4.448367166813769,
"grad_norm": 0.07577160557474086,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.3119,
"step": 315
},
{
"epoch": 4.4624889673433366,
"grad_norm": 0.07292199486310709,
"learning_rate": 2.2777238646775768e-06,
"loss": 0.314,
"step": 316
},
{
"epoch": 4.476610767872904,
"grad_norm": 0.07321270589774292,
"learning_rate": 2.14690464803343e-06,
"loss": 0.3116,
"step": 317
},
{
"epoch": 4.490732568402471,
"grad_norm": 0.07971761444372055,
"learning_rate": 2.0198505307069462e-06,
"loss": 0.3162,
"step": 318
},
{
"epoch": 4.504854368932039,
"grad_norm": 0.0823725656624792,
"learning_rate": 1.896574150274151e-06,
"loss": 0.318,
"step": 319
},
{
"epoch": 4.518976169461606,
"grad_norm": 0.07311612247681858,
"learning_rate": 1.7770877685543687e-06,
"loss": 0.3146,
"step": 320
},
{
"epoch": 4.533097969991174,
"grad_norm": 0.0754285797360244,
"learning_rate": 1.6614032703905714e-06,
"loss": 0.3188,
"step": 321
},
{
"epoch": 4.5472197705207416,
"grad_norm": 0.07192329712907819,
"learning_rate": 1.5495321624672443e-06,
"loss": 0.3117,
"step": 322
},
{
"epoch": 4.561341571050309,
"grad_norm": 0.07683729191513318,
"learning_rate": 1.4414855721658705e-06,
"loss": 0.3179,
"step": 323
},
{
"epoch": 4.575463371579876,
"grad_norm": 0.07466087193345237,
"learning_rate": 1.3372742464581134e-06,
"loss": 0.3169,
"step": 324
},
{
"epoch": 4.589585172109444,
"grad_norm": 0.07472750780066512,
"learning_rate": 1.2369085508368862e-06,
"loss": 0.313,
"step": 325
},
{
"epoch": 4.603706972639012,
"grad_norm": 0.07567268942020543,
"learning_rate": 1.1403984682852998e-06,
"loss": 0.3162,
"step": 326
},
{
"epoch": 4.617828773168579,
"grad_norm": 0.07193466653913613,
"learning_rate": 1.0477535982837473e-06,
"loss": 0.3169,
"step": 327
},
{
"epoch": 4.631950573698147,
"grad_norm": 0.07310364397796111,
"learning_rate": 9.589831558550222e-07,
"loss": 0.3147,
"step": 328
},
{
"epoch": 4.646072374227714,
"grad_norm": 0.07226831665121733,
"learning_rate": 8.740959706477725e-07,
"loss": 0.3155,
"step": 329
},
{
"epoch": 4.660194174757281,
"grad_norm": 0.07380784680617208,
"learning_rate": 7.93100486058247e-07,
"loss": 0.3172,
"step": 330
},
{
"epoch": 4.674315975286849,
"grad_norm": 0.07265097137199653,
"learning_rate": 7.160047583904473e-07,
"loss": 0.3123,
"step": 331
},
{
"epoch": 4.688437775816417,
"grad_norm": 0.07526606061681983,
"learning_rate": 6.428164560548134e-07,
"loss": 0.3126,
"step": 332
},
{
"epoch": 4.702559576345984,
"grad_norm": 0.07096951660387449,
"learning_rate": 5.735428588054825e-07,
"loss": 0.3091,
"step": 333
},
{
"epoch": 4.716681376875552,
"grad_norm": 0.07491929428893927,
"learning_rate": 5.081908570161753e-07,
"loss": 0.3168,
"step": 334
},
{
"epoch": 4.730803177405119,
"grad_norm": 0.07068035565889964,
"learning_rate": 4.467669509948591e-07,
"loss": 0.3168,
"step": 335
},
{
"epoch": 4.744924977934687,
"grad_norm": 0.07006153238881019,
"learning_rate": 3.8927725033718553e-07,
"loss": 0.3096,
"step": 336
},
{
"epoch": 4.7590467784642545,
"grad_norm": 0.07031296479074185,
"learning_rate": 3.3572747331878984e-07,
"loss": 0.3127,
"step": 337
},
{
"epoch": 4.773168578993822,
"grad_norm": 0.07086156685048181,
"learning_rate": 2.8612294632650586e-07,
"loss": 0.3165,
"step": 338
},
{
"epoch": 4.787290379523389,
"grad_norm": 0.07041702874195928,
"learning_rate": 2.404686033285897e-07,
"loss": 0.3211,
"step": 339
},
{
"epoch": 4.801412180052957,
"grad_norm": 0.07111545002538634,
"learning_rate": 1.9876898538394362e-07,
"loss": 0.3139,
"step": 340
},
{
"epoch": 4.815533980582524,
"grad_norm": 0.06964445264833816,
"learning_rate": 1.6102824019043728e-07,
"loss": 0.3119,
"step": 341
},
{
"epoch": 4.829655781112092,
"grad_norm": 0.07185826317569316,
"learning_rate": 1.2725012167236207e-07,
"loss": 0.3189,
"step": 342
},
{
"epoch": 4.8437775816416595,
"grad_norm": 0.07175971991165786,
"learning_rate": 9.74379896070321e-08,
"loss": 0.3144,
"step": 343
},
{
"epoch": 4.857899382171227,
"grad_norm": 0.07027377563502572,
"learning_rate": 7.159480929059381e-08,
"loss": 0.3208,
"step": 344
},
{
"epoch": 4.872021182700794,
"grad_norm": 0.07130198834034268,
"learning_rate": 4.9723151243106225e-08,
"loss": 0.3164,
"step": 345
},
{
"epoch": 4.886142983230362,
"grad_norm": 0.07512577557190175,
"learning_rate": 3.1825190952829986e-08,
"loss": 0.3183,
"step": 346
},
{
"epoch": 4.90026478375993,
"grad_norm": 0.0718819094759202,
"learning_rate": 1.7902708659867096e-08,
"loss": 0.3185,
"step": 347
},
{
"epoch": 4.914386584289497,
"grad_norm": 0.0706893833001464,
"learning_rate": 7.957089179058131e-09,
"loss": 0.3142,
"step": 348
},
{
"epoch": 4.9285083848190645,
"grad_norm": 0.07170028442056126,
"learning_rate": 1.9893217622790616e-09,
"loss": 0.3181,
"step": 349
},
{
"epoch": 4.942630185348632,
"grad_norm": 0.07142066838497432,
"learning_rate": 0.0,
"loss": 0.313,
"step": 350
},
{
"epoch": 4.942630185348632,
"step": 350,
"total_flos": 9.306564393200255e+18,
"train_loss": 0.0,
"train_runtime": 1.9909,
"train_samples_per_second": 91040.986,
"train_steps_per_second": 175.798
}
],
"logging_steps": 1,
"max_steps": 350,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.306564393200255e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}