UniAE-MoE / trainer_state.json
Syclus's picture
Add model weights and config
6b27366 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 80000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.25e-05,
"grad_norm": 61.570499420166016,
"learning_rate": 0.0,
"loss": 10.836095809936523,
"step": 1
},
{
"epoch": 0.00125,
"grad_norm": 127.05734252929688,
"learning_rate": 1.98e-06,
"loss": 19.720439255839647,
"step": 100
},
{
"epoch": 0.0025,
"grad_norm": 240.12950134277344,
"learning_rate": 3.980000000000001e-06,
"loss": 19.435438232421873,
"step": 200
},
{
"epoch": 0.00375,
"grad_norm": 165.71978759765625,
"learning_rate": 5.98e-06,
"loss": 17.33456298828125,
"step": 300
},
{
"epoch": 0.005,
"grad_norm": 476.399169921875,
"learning_rate": 7.980000000000002e-06,
"loss": 14.20736328125,
"step": 400
},
{
"epoch": 0.00625,
"grad_norm": 73.51789855957031,
"learning_rate": 9.980000000000001e-06,
"loss": 10.920599365234375,
"step": 500
},
{
"epoch": 0.0075,
"grad_norm": 50.991634368896484,
"learning_rate": 9.987547169811321e-06,
"loss": 9.143378295898437,
"step": 600
},
{
"epoch": 0.00875,
"grad_norm": 150.61373901367188,
"learning_rate": 9.97496855345912e-06,
"loss": 7.4310791015625,
"step": 700
},
{
"epoch": 0.01,
"grad_norm": 188.94094848632812,
"learning_rate": 9.962389937106918e-06,
"loss": 6.5367333984375,
"step": 800
},
{
"epoch": 0.01125,
"grad_norm": 37.20081329345703,
"learning_rate": 9.949811320754717e-06,
"loss": 7.25988525390625,
"step": 900
},
{
"epoch": 0.0125,
"grad_norm": 52.72426223754883,
"learning_rate": 9.937232704402516e-06,
"loss": 6.920318603515625,
"step": 1000
},
{
"epoch": 0.01375,
"grad_norm": 77.21712493896484,
"learning_rate": 9.924654088050316e-06,
"loss": 6.468406982421875,
"step": 1100
},
{
"epoch": 0.015,
"grad_norm": 73.15080261230469,
"learning_rate": 9.912075471698114e-06,
"loss": 6.187958374023437,
"step": 1200
},
{
"epoch": 0.01625,
"grad_norm": 21.300718307495117,
"learning_rate": 9.899496855345913e-06,
"loss": 5.908817749023438,
"step": 1300
},
{
"epoch": 0.0175,
"grad_norm": 33.9874153137207,
"learning_rate": 9.886918238993712e-06,
"loss": 5.750390625,
"step": 1400
},
{
"epoch": 0.01875,
"grad_norm": 46.610294342041016,
"learning_rate": 9.87433962264151e-06,
"loss": 5.010645446777343,
"step": 1500
},
{
"epoch": 0.02,
"grad_norm": 116.64041137695312,
"learning_rate": 9.861761006289309e-06,
"loss": 4.999881591796875,
"step": 1600
},
{
"epoch": 0.02125,
"grad_norm": 16.1014347076416,
"learning_rate": 9.849182389937107e-06,
"loss": 5.607005004882812,
"step": 1700
},
{
"epoch": 0.0225,
"grad_norm": 22.09562873840332,
"learning_rate": 9.836603773584906e-06,
"loss": 6.2442431640625,
"step": 1800
},
{
"epoch": 0.02375,
"grad_norm": 62.120033264160156,
"learning_rate": 9.824025157232704e-06,
"loss": 5.164766845703125,
"step": 1900
},
{
"epoch": 0.025,
"grad_norm": 12.194104194641113,
"learning_rate": 9.811446540880503e-06,
"loss": 4.69855224609375,
"step": 2000
},
{
"epoch": 0.02625,
"grad_norm": 21.063518524169922,
"learning_rate": 9.798867924528302e-06,
"loss": 5.364295654296875,
"step": 2100
},
{
"epoch": 0.0275,
"grad_norm": 28.930068969726562,
"learning_rate": 9.786289308176102e-06,
"loss": 5.119418334960938,
"step": 2200
},
{
"epoch": 0.02875,
"grad_norm": 46.796871185302734,
"learning_rate": 9.7737106918239e-06,
"loss": 5.237764892578125,
"step": 2300
},
{
"epoch": 0.03,
"grad_norm": 17.024911880493164,
"learning_rate": 9.761132075471699e-06,
"loss": 5.059874877929688,
"step": 2400
},
{
"epoch": 0.03125,
"grad_norm": 74.16903686523438,
"learning_rate": 9.748553459119498e-06,
"loss": 4.9656503295898435,
"step": 2500
},
{
"epoch": 0.0325,
"grad_norm": 22.257841110229492,
"learning_rate": 9.735974842767296e-06,
"loss": 5.18156982421875,
"step": 2600
},
{
"epoch": 0.03375,
"grad_norm": 47.79827117919922,
"learning_rate": 9.723396226415095e-06,
"loss": 4.9468075561523435,
"step": 2700
},
{
"epoch": 0.035,
"grad_norm": 33.0962028503418,
"learning_rate": 9.710817610062893e-06,
"loss": 4.722409362792969,
"step": 2800
},
{
"epoch": 0.03625,
"grad_norm": 20.282230377197266,
"learning_rate": 9.698238993710692e-06,
"loss": 5.242295532226563,
"step": 2900
},
{
"epoch": 0.0375,
"grad_norm": 25.866464614868164,
"learning_rate": 9.68566037735849e-06,
"loss": 4.773485717773437,
"step": 3000
},
{
"epoch": 0.03875,
"grad_norm": 26.524415969848633,
"learning_rate": 9.67308176100629e-06,
"loss": 5.017108154296875,
"step": 3100
},
{
"epoch": 0.04,
"grad_norm": 18.342044830322266,
"learning_rate": 9.66050314465409e-06,
"loss": 4.6452767944335935,
"step": 3200
},
{
"epoch": 0.04125,
"grad_norm": 20.58109474182129,
"learning_rate": 9.647924528301888e-06,
"loss": 5.00500732421875,
"step": 3300
},
{
"epoch": 0.0425,
"grad_norm": 294.05224609375,
"learning_rate": 9.635345911949687e-06,
"loss": 4.595547790527344,
"step": 3400
},
{
"epoch": 0.04375,
"grad_norm": 32.15380096435547,
"learning_rate": 9.622767295597485e-06,
"loss": 4.3764559936523435,
"step": 3500
},
{
"epoch": 0.045,
"grad_norm": 28.0123233795166,
"learning_rate": 9.610188679245284e-06,
"loss": 4.383021850585937,
"step": 3600
},
{
"epoch": 0.04625,
"grad_norm": 33.30596923828125,
"learning_rate": 9.597610062893082e-06,
"loss": 4.354810180664063,
"step": 3700
},
{
"epoch": 0.0475,
"grad_norm": 26.94670295715332,
"learning_rate": 9.585031446540881e-06,
"loss": 3.8970941162109374,
"step": 3800
},
{
"epoch": 0.04875,
"grad_norm": 28.224117279052734,
"learning_rate": 9.57245283018868e-06,
"loss": 3.8559701538085935,
"step": 3900
},
{
"epoch": 0.05,
"grad_norm": 20.57750701904297,
"learning_rate": 9.559874213836478e-06,
"loss": 4.92269287109375,
"step": 4000
},
{
"epoch": 0.05125,
"grad_norm": 18.131223678588867,
"learning_rate": 9.547295597484277e-06,
"loss": 4.633423461914062,
"step": 4100
},
{
"epoch": 0.0525,
"grad_norm": 29.649024963378906,
"learning_rate": 9.534716981132075e-06,
"loss": 4.432715759277344,
"step": 4200
},
{
"epoch": 0.05375,
"grad_norm": 34.53123474121094,
"learning_rate": 9.522138364779876e-06,
"loss": 4.242776489257812,
"step": 4300
},
{
"epoch": 0.055,
"grad_norm": 19.823360443115234,
"learning_rate": 9.509559748427674e-06,
"loss": 4.448049011230469,
"step": 4400
},
{
"epoch": 0.05625,
"grad_norm": 20.857120513916016,
"learning_rate": 9.496981132075473e-06,
"loss": 4.65389404296875,
"step": 4500
},
{
"epoch": 0.0575,
"grad_norm": 20.397987365722656,
"learning_rate": 9.484402515723271e-06,
"loss": 4.350996704101562,
"step": 4600
},
{
"epoch": 0.05875,
"grad_norm": 22.439617156982422,
"learning_rate": 9.47182389937107e-06,
"loss": 4.1993505859375,
"step": 4700
},
{
"epoch": 0.06,
"grad_norm": 5.18347692489624,
"learning_rate": 9.459245283018869e-06,
"loss": 4.38233154296875,
"step": 4800
},
{
"epoch": 0.06125,
"grad_norm": 20.546842575073242,
"learning_rate": 9.446666666666667e-06,
"loss": 4.136148681640625,
"step": 4900
},
{
"epoch": 0.0625,
"grad_norm": 30.28652572631836,
"learning_rate": 9.434088050314466e-06,
"loss": 4.002590026855469,
"step": 5000
},
{
"epoch": 0.06375,
"grad_norm": 12.638691902160645,
"learning_rate": 9.421509433962264e-06,
"loss": 3.565917663574219,
"step": 5100
},
{
"epoch": 0.065,
"grad_norm": 22.384016036987305,
"learning_rate": 9.408930817610063e-06,
"loss": 3.9356478881835937,
"step": 5200
},
{
"epoch": 0.06625,
"grad_norm": 18.600645065307617,
"learning_rate": 9.396352201257861e-06,
"loss": 4.357646484375,
"step": 5300
},
{
"epoch": 0.0675,
"grad_norm": 19.07636260986328,
"learning_rate": 9.383773584905662e-06,
"loss": 4.550914306640625,
"step": 5400
},
{
"epoch": 0.06875,
"grad_norm": 36.520381927490234,
"learning_rate": 9.37119496855346e-06,
"loss": 4.02431884765625,
"step": 5500
},
{
"epoch": 0.07,
"grad_norm": 31.87358856201172,
"learning_rate": 9.358616352201259e-06,
"loss": 4.209980163574219,
"step": 5600
},
{
"epoch": 0.07125,
"grad_norm": 25.108585357666016,
"learning_rate": 9.346037735849058e-06,
"loss": 4.5496435546875,
"step": 5700
},
{
"epoch": 0.0725,
"grad_norm": 18.682506561279297,
"learning_rate": 9.333459119496856e-06,
"loss": 4.466670837402344,
"step": 5800
},
{
"epoch": 0.07375,
"grad_norm": 18.071388244628906,
"learning_rate": 9.320880503144655e-06,
"loss": 3.757862243652344,
"step": 5900
},
{
"epoch": 0.075,
"grad_norm": 54.04276657104492,
"learning_rate": 9.308301886792453e-06,
"loss": 4.119568786621094,
"step": 6000
},
{
"epoch": 0.07625,
"grad_norm": 20.94058609008789,
"learning_rate": 9.295723270440252e-06,
"loss": 3.6099752807617187,
"step": 6100
},
{
"epoch": 0.0775,
"grad_norm": 28.546890258789062,
"learning_rate": 9.28314465408805e-06,
"loss": 4.167696533203125,
"step": 6200
},
{
"epoch": 0.07875,
"grad_norm": 22.076147079467773,
"learning_rate": 9.270566037735849e-06,
"loss": 3.8074371337890627,
"step": 6300
},
{
"epoch": 0.08,
"grad_norm": 11.4718599319458,
"learning_rate": 9.25798742138365e-06,
"loss": 3.5398162841796874,
"step": 6400
},
{
"epoch": 0.08125,
"grad_norm": 20.66476058959961,
"learning_rate": 9.245408805031448e-06,
"loss": 3.8792694091796873,
"step": 6500
},
{
"epoch": 0.0825,
"grad_norm": 22.23533058166504,
"learning_rate": 9.232830188679246e-06,
"loss": 3.6433590698242186,
"step": 6600
},
{
"epoch": 0.08375,
"grad_norm": 17.721628189086914,
"learning_rate": 9.220251572327045e-06,
"loss": 3.73576171875,
"step": 6700
},
{
"epoch": 0.085,
"grad_norm": 53.24715805053711,
"learning_rate": 9.207672955974844e-06,
"loss": 3.7794915771484376,
"step": 6800
},
{
"epoch": 0.08625,
"grad_norm": 20.755985260009766,
"learning_rate": 9.195094339622642e-06,
"loss": 3.84636962890625,
"step": 6900
},
{
"epoch": 0.0875,
"grad_norm": 26.36724281311035,
"learning_rate": 9.18251572327044e-06,
"loss": 3.7252166748046873,
"step": 7000
},
{
"epoch": 0.08875,
"grad_norm": 24.380794525146484,
"learning_rate": 9.16993710691824e-06,
"loss": 3.7519256591796877,
"step": 7100
},
{
"epoch": 0.09,
"grad_norm": 30.122207641601562,
"learning_rate": 9.157358490566038e-06,
"loss": 3.6482272338867188,
"step": 7200
},
{
"epoch": 0.09125,
"grad_norm": 21.8885555267334,
"learning_rate": 9.144779874213837e-06,
"loss": 3.8224606323242187,
"step": 7300
},
{
"epoch": 0.0925,
"grad_norm": 81.36309814453125,
"learning_rate": 9.132201257861635e-06,
"loss": 3.9120883178710937,
"step": 7400
},
{
"epoch": 0.09375,
"grad_norm": 33.38832473754883,
"learning_rate": 9.119622641509435e-06,
"loss": 3.3836550903320313,
"step": 7500
},
{
"epoch": 0.095,
"grad_norm": 3.9297144412994385,
"learning_rate": 9.107044025157234e-06,
"loss": 3.252887878417969,
"step": 7600
},
{
"epoch": 0.09625,
"grad_norm": 25.273887634277344,
"learning_rate": 9.094465408805033e-06,
"loss": 3.3780868530273436,
"step": 7700
},
{
"epoch": 0.0975,
"grad_norm": 32.79280471801758,
"learning_rate": 9.081886792452831e-06,
"loss": 3.3074383544921875,
"step": 7800
},
{
"epoch": 0.09875,
"grad_norm": 27.27655601501465,
"learning_rate": 9.06930817610063e-06,
"loss": 3.4328045654296875,
"step": 7900
},
{
"epoch": 0.1,
"grad_norm": 18.67003631591797,
"learning_rate": 9.056729559748428e-06,
"loss": 3.4785955810546874,
"step": 8000
},
{
"epoch": 0.10125,
"grad_norm": 16.710369110107422,
"learning_rate": 9.044150943396227e-06,
"loss": 3.604156188964844,
"step": 8100
},
{
"epoch": 0.1025,
"grad_norm": 39.6090202331543,
"learning_rate": 9.031572327044026e-06,
"loss": 3.240309753417969,
"step": 8200
},
{
"epoch": 0.10375,
"grad_norm": 50.57542419433594,
"learning_rate": 9.018993710691824e-06,
"loss": 3.2084616088867186,
"step": 8300
},
{
"epoch": 0.105,
"grad_norm": 3.647831916809082,
"learning_rate": 9.006415094339623e-06,
"loss": 3.3365228271484373,
"step": 8400
},
{
"epoch": 0.10625,
"grad_norm": 25.59325408935547,
"learning_rate": 8.993836477987421e-06,
"loss": 3.3707135009765623,
"step": 8500
},
{
"epoch": 0.1075,
"grad_norm": 29.391414642333984,
"learning_rate": 8.981257861635222e-06,
"loss": 3.698001708984375,
"step": 8600
},
{
"epoch": 0.10875,
"grad_norm": 20.907129287719727,
"learning_rate": 8.96867924528302e-06,
"loss": 3.4555450439453126,
"step": 8700
},
{
"epoch": 0.11,
"grad_norm": 1.1572871208190918,
"learning_rate": 8.956100628930819e-06,
"loss": 3.2748760986328125,
"step": 8800
},
{
"epoch": 0.11125,
"grad_norm": 31.631296157836914,
"learning_rate": 8.943522012578617e-06,
"loss": 3.3183383178710937,
"step": 8900
},
{
"epoch": 0.1125,
"grad_norm": 15.066047668457031,
"learning_rate": 8.930943396226416e-06,
"loss": 3.3248934936523438,
"step": 9000
},
{
"epoch": 0.11375,
"grad_norm": 40.130699157714844,
"learning_rate": 8.918364779874215e-06,
"loss": 3.0782876586914063,
"step": 9100
},
{
"epoch": 0.115,
"grad_norm": 6.560884952545166,
"learning_rate": 8.905786163522013e-06,
"loss": 3.1166537475585936,
"step": 9200
},
{
"epoch": 0.11625,
"grad_norm": 19.945913314819336,
"learning_rate": 8.893207547169812e-06,
"loss": 3.104024658203125,
"step": 9300
},
{
"epoch": 0.1175,
"grad_norm": 26.818218231201172,
"learning_rate": 8.88062893081761e-06,
"loss": 3.5535888671875,
"step": 9400
},
{
"epoch": 0.11875,
"grad_norm": 8.911575317382812,
"learning_rate": 8.868050314465409e-06,
"loss": 3.237396240234375,
"step": 9500
},
{
"epoch": 0.12,
"grad_norm": 27.469289779663086,
"learning_rate": 8.855471698113209e-06,
"loss": 3.37412841796875,
"step": 9600
},
{
"epoch": 0.12125,
"grad_norm": 21.267345428466797,
"learning_rate": 8.842893081761008e-06,
"loss": 3.5673455810546875,
"step": 9700
},
{
"epoch": 0.1225,
"grad_norm": 18.69856834411621,
"learning_rate": 8.830314465408806e-06,
"loss": 3.2800216674804688,
"step": 9800
},
{
"epoch": 0.12375,
"grad_norm": 30.507801055908203,
"learning_rate": 8.817735849056605e-06,
"loss": 3.042904052734375,
"step": 9900
},
{
"epoch": 0.125,
"grad_norm": 4.167088508605957,
"learning_rate": 8.805157232704403e-06,
"loss": 3.038970642089844,
"step": 10000
},
{
"epoch": 0.12625,
"grad_norm": 22.224905014038086,
"learning_rate": 8.792578616352202e-06,
"loss": 3.6609432983398436,
"step": 10100
},
{
"epoch": 0.1275,
"grad_norm": 23.52140235900879,
"learning_rate": 8.78e-06,
"loss": 3.3873190307617187,
"step": 10200
},
{
"epoch": 0.12875,
"grad_norm": 19.35887336730957,
"learning_rate": 8.7674213836478e-06,
"loss": 2.98621337890625,
"step": 10300
},
{
"epoch": 0.13,
"grad_norm": 36.2899169921875,
"learning_rate": 8.754842767295598e-06,
"loss": 3.1030731201171875,
"step": 10400
},
{
"epoch": 0.13125,
"grad_norm": 23.118587493896484,
"learning_rate": 8.742264150943396e-06,
"loss": 3.2017132568359377,
"step": 10500
},
{
"epoch": 0.1325,
"grad_norm": 18.85630989074707,
"learning_rate": 8.729685534591195e-06,
"loss": 3.136984558105469,
"step": 10600
},
{
"epoch": 0.13375,
"grad_norm": 10.303237915039062,
"learning_rate": 8.717106918238995e-06,
"loss": 3.201553955078125,
"step": 10700
},
{
"epoch": 0.135,
"grad_norm": 23.776470184326172,
"learning_rate": 8.704528301886794e-06,
"loss": 2.64281494140625,
"step": 10800
},
{
"epoch": 0.13625,
"grad_norm": 19.486194610595703,
"learning_rate": 8.691949685534592e-06,
"loss": 3.0275909423828127,
"step": 10900
},
{
"epoch": 0.1375,
"grad_norm": 23.196096420288086,
"learning_rate": 8.679371069182391e-06,
"loss": 3.320228271484375,
"step": 11000
},
{
"epoch": 0.13875,
"grad_norm": 18.966537475585938,
"learning_rate": 8.66679245283019e-06,
"loss": 3.0495703125,
"step": 11100
},
{
"epoch": 0.14,
"grad_norm": 24.352205276489258,
"learning_rate": 8.654213836477988e-06,
"loss": 2.7282373046875,
"step": 11200
},
{
"epoch": 0.14125,
"grad_norm": 21.45029067993164,
"learning_rate": 8.641635220125787e-06,
"loss": 2.840162353515625,
"step": 11300
},
{
"epoch": 0.1425,
"grad_norm": 27.577434539794922,
"learning_rate": 8.629056603773585e-06,
"loss": 2.837103271484375,
"step": 11400
},
{
"epoch": 0.14375,
"grad_norm": 26.380931854248047,
"learning_rate": 8.616477987421384e-06,
"loss": 2.9257269287109375,
"step": 11500
},
{
"epoch": 0.145,
"grad_norm": 33.0853271484375,
"learning_rate": 8.603899371069183e-06,
"loss": 2.7715283203125,
"step": 11600
},
{
"epoch": 0.14625,
"grad_norm": 26.68023109436035,
"learning_rate": 8.591320754716981e-06,
"loss": 2.801429443359375,
"step": 11700
},
{
"epoch": 0.1475,
"grad_norm": 23.897310256958008,
"learning_rate": 8.578742138364781e-06,
"loss": 2.7405438232421875,
"step": 11800
},
{
"epoch": 0.14875,
"grad_norm": 24.198909759521484,
"learning_rate": 8.56616352201258e-06,
"loss": 2.8841351318359374,
"step": 11900
},
{
"epoch": 0.15,
"grad_norm": 12.656911849975586,
"learning_rate": 8.553584905660379e-06,
"loss": 2.8034808349609377,
"step": 12000
},
{
"epoch": 0.15125,
"grad_norm": 21.627422332763672,
"learning_rate": 8.541006289308177e-06,
"loss": 3.3647225952148436,
"step": 12100
},
{
"epoch": 0.1525,
"grad_norm": 20.174375534057617,
"learning_rate": 8.528427672955976e-06,
"loss": 3.007825927734375,
"step": 12200
},
{
"epoch": 0.15375,
"grad_norm": 44.98274230957031,
"learning_rate": 8.515849056603774e-06,
"loss": 3.088919677734375,
"step": 12300
},
{
"epoch": 0.155,
"grad_norm": 19.29104995727539,
"learning_rate": 8.503270440251573e-06,
"loss": 3.018197021484375,
"step": 12400
},
{
"epoch": 0.15625,
"grad_norm": 24.123010635375977,
"learning_rate": 8.490691823899371e-06,
"loss": 2.9341339111328124,
"step": 12500
},
{
"epoch": 0.1575,
"grad_norm": 24.07183837890625,
"learning_rate": 8.47811320754717e-06,
"loss": 3.0051565551757813,
"step": 12600
},
{
"epoch": 0.15875,
"grad_norm": 5.301960468292236,
"learning_rate": 8.465534591194969e-06,
"loss": 2.80382568359375,
"step": 12700
},
{
"epoch": 0.16,
"grad_norm": 0.05698124319314957,
"learning_rate": 8.452955974842767e-06,
"loss": 2.8222991943359377,
"step": 12800
},
{
"epoch": 0.16125,
"grad_norm": 18.93758773803711,
"learning_rate": 8.440377358490568e-06,
"loss": 3.1378509521484377,
"step": 12900
},
{
"epoch": 0.1625,
"grad_norm": 20.055757522583008,
"learning_rate": 8.427798742138366e-06,
"loss": 2.886328125,
"step": 13000
},
{
"epoch": 0.16375,
"grad_norm": 12.077770233154297,
"learning_rate": 8.415220125786165e-06,
"loss": 2.8731988525390624,
"step": 13100
},
{
"epoch": 0.165,
"grad_norm": 44.130096435546875,
"learning_rate": 8.402641509433963e-06,
"loss": 2.7432611083984373,
"step": 13200
},
{
"epoch": 0.16625,
"grad_norm": 41.551048278808594,
"learning_rate": 8.390062893081762e-06,
"loss": 3.06159912109375,
"step": 13300
},
{
"epoch": 0.1675,
"grad_norm": 18.465818405151367,
"learning_rate": 8.37748427672956e-06,
"loss": 2.8289349365234373,
"step": 13400
},
{
"epoch": 0.16875,
"grad_norm": 17.856552124023438,
"learning_rate": 8.364905660377359e-06,
"loss": 2.7455133056640624,
"step": 13500
},
{
"epoch": 0.17,
"grad_norm": 38.62467956542969,
"learning_rate": 8.352327044025158e-06,
"loss": 2.51541748046875,
"step": 13600
},
{
"epoch": 0.17125,
"grad_norm": 19.783123016357422,
"learning_rate": 8.339748427672956e-06,
"loss": 3.161922607421875,
"step": 13700
},
{
"epoch": 0.1725,
"grad_norm": 19.746681213378906,
"learning_rate": 8.327169811320755e-06,
"loss": 2.948106384277344,
"step": 13800
},
{
"epoch": 0.17375,
"grad_norm": 11.665578842163086,
"learning_rate": 8.314591194968555e-06,
"loss": 2.707484130859375,
"step": 13900
},
{
"epoch": 0.175,
"grad_norm": 32.41368865966797,
"learning_rate": 8.302012578616354e-06,
"loss": 2.551041259765625,
"step": 14000
},
{
"epoch": 0.17625,
"grad_norm": 19.156505584716797,
"learning_rate": 8.289433962264152e-06,
"loss": 3.110626220703125,
"step": 14100
},
{
"epoch": 0.1775,
"grad_norm": 24.206411361694336,
"learning_rate": 8.27685534591195e-06,
"loss": 2.96248291015625,
"step": 14200
},
{
"epoch": 0.17875,
"grad_norm": 28.069095611572266,
"learning_rate": 8.26427672955975e-06,
"loss": 2.8258160400390624,
"step": 14300
},
{
"epoch": 0.18,
"grad_norm": 29.488801956176758,
"learning_rate": 8.251698113207548e-06,
"loss": 2.697049560546875,
"step": 14400
},
{
"epoch": 0.18125,
"grad_norm": 24.96384620666504,
"learning_rate": 8.239119496855347e-06,
"loss": 2.985064697265625,
"step": 14500
},
{
"epoch": 0.1825,
"grad_norm": 23.861825942993164,
"learning_rate": 8.226540880503145e-06,
"loss": 3.27437255859375,
"step": 14600
},
{
"epoch": 0.18375,
"grad_norm": 17.654409408569336,
"learning_rate": 8.213962264150944e-06,
"loss": 2.7916656494140626,
"step": 14700
},
{
"epoch": 0.185,
"grad_norm": 13.089086532592773,
"learning_rate": 8.201383647798742e-06,
"loss": 2.95639404296875,
"step": 14800
},
{
"epoch": 0.18625,
"grad_norm": 25.958925247192383,
"learning_rate": 8.188805031446541e-06,
"loss": 3.0765805053710937,
"step": 14900
},
{
"epoch": 0.1875,
"grad_norm": 23.476421356201172,
"learning_rate": 8.176226415094341e-06,
"loss": 3.1241455078125,
"step": 15000
},
{
"epoch": 0.18875,
"grad_norm": 6.848545551300049,
"learning_rate": 8.16364779874214e-06,
"loss": 3.0716900634765625,
"step": 15100
},
{
"epoch": 0.19,
"grad_norm": 35.278076171875,
"learning_rate": 8.151069182389938e-06,
"loss": 2.738138427734375,
"step": 15200
},
{
"epoch": 0.19125,
"grad_norm": 18.48978614807129,
"learning_rate": 8.138490566037737e-06,
"loss": 3.2392678833007813,
"step": 15300
},
{
"epoch": 0.1925,
"grad_norm": 25.764511108398438,
"learning_rate": 8.125911949685536e-06,
"loss": 2.935572509765625,
"step": 15400
},
{
"epoch": 0.19375,
"grad_norm": 17.244510650634766,
"learning_rate": 8.113333333333334e-06,
"loss": 2.686405029296875,
"step": 15500
},
{
"epoch": 0.195,
"grad_norm": 58.790367126464844,
"learning_rate": 8.100754716981133e-06,
"loss": 2.930709228515625,
"step": 15600
},
{
"epoch": 0.19625,
"grad_norm": 23.248153686523438,
"learning_rate": 8.088176100628931e-06,
"loss": 3.134276123046875,
"step": 15700
},
{
"epoch": 0.1975,
"grad_norm": 22.906993865966797,
"learning_rate": 8.07559748427673e-06,
"loss": 3.294012451171875,
"step": 15800
},
{
"epoch": 0.19875,
"grad_norm": 18.57663917541504,
"learning_rate": 8.063018867924528e-06,
"loss": 2.659776611328125,
"step": 15900
},
{
"epoch": 0.2,
"grad_norm": 23.491619110107422,
"learning_rate": 8.050440251572327e-06,
"loss": 2.69767822265625,
"step": 16000
},
{
"epoch": 0.20125,
"grad_norm": 34.74635696411133,
"learning_rate": 8.037861635220127e-06,
"loss": 2.9187213134765626,
"step": 16100
},
{
"epoch": 0.2025,
"grad_norm": 17.31464195251465,
"learning_rate": 8.025283018867926e-06,
"loss": 2.6978961181640626,
"step": 16200
},
{
"epoch": 0.20375,
"grad_norm": 15.085783958435059,
"learning_rate": 8.012704402515725e-06,
"loss": 2.227297668457031,
"step": 16300
},
{
"epoch": 0.205,
"grad_norm": 40.41267013549805,
"learning_rate": 8.000125786163523e-06,
"loss": 2.5455712890625,
"step": 16400
},
{
"epoch": 0.20625,
"grad_norm": 21.266006469726562,
"learning_rate": 7.987547169811322e-06,
"loss": 2.6894076538085936,
"step": 16500
},
{
"epoch": 0.2075,
"grad_norm": 27.872455596923828,
"learning_rate": 7.97496855345912e-06,
"loss": 2.957840576171875,
"step": 16600
},
{
"epoch": 0.20875,
"grad_norm": 11.036195755004883,
"learning_rate": 7.962389937106919e-06,
"loss": 2.9388262939453127,
"step": 16700
},
{
"epoch": 0.21,
"grad_norm": 0.358200341463089,
"learning_rate": 7.949811320754717e-06,
"loss": 2.550313720703125,
"step": 16800
},
{
"epoch": 0.21125,
"grad_norm": 17.692394256591797,
"learning_rate": 7.937232704402516e-06,
"loss": 2.9567965698242187,
"step": 16900
},
{
"epoch": 0.2125,
"grad_norm": 20.59417724609375,
"learning_rate": 7.924654088050315e-06,
"loss": 2.8677490234375,
"step": 17000
},
{
"epoch": 0.21375,
"grad_norm": 21.882822036743164,
"learning_rate": 7.912075471698115e-06,
"loss": 2.6563641357421877,
"step": 17100
},
{
"epoch": 0.215,
"grad_norm": 41.62974548339844,
"learning_rate": 7.899496855345913e-06,
"loss": 2.938742370605469,
"step": 17200
},
{
"epoch": 0.21625,
"grad_norm": 17.829666137695312,
"learning_rate": 7.886918238993712e-06,
"loss": 2.936795654296875,
"step": 17300
},
{
"epoch": 0.2175,
"grad_norm": 23.082691192626953,
"learning_rate": 7.87433962264151e-06,
"loss": 3.0123855590820314,
"step": 17400
},
{
"epoch": 0.21875,
"grad_norm": 30.395631790161133,
"learning_rate": 7.86176100628931e-06,
"loss": 2.8054989624023436,
"step": 17500
},
{
"epoch": 0.22,
"grad_norm": 0.134773388504982,
"learning_rate": 7.849182389937108e-06,
"loss": 2.675752868652344,
"step": 17600
},
{
"epoch": 0.22125,
"grad_norm": 20.807125091552734,
"learning_rate": 7.836603773584906e-06,
"loss": 2.7376922607421874,
"step": 17700
},
{
"epoch": 0.2225,
"grad_norm": 43.09901809692383,
"learning_rate": 7.824025157232705e-06,
"loss": 2.9282843017578126,
"step": 17800
},
{
"epoch": 0.22375,
"grad_norm": 25.02425193786621,
"learning_rate": 7.811446540880504e-06,
"loss": 2.571319580078125,
"step": 17900
},
{
"epoch": 0.225,
"grad_norm": 15.398431777954102,
"learning_rate": 7.798867924528302e-06,
"loss": 2.492489929199219,
"step": 18000
},
{
"epoch": 0.22625,
"grad_norm": 19.006174087524414,
"learning_rate": 7.7862893081761e-06,
"loss": 2.7379693603515625,
"step": 18100
},
{
"epoch": 0.2275,
"grad_norm": 21.487060546875,
"learning_rate": 7.773710691823901e-06,
"loss": 3.009235534667969,
"step": 18200
},
{
"epoch": 0.22875,
"grad_norm": 22.86688804626465,
"learning_rate": 7.7611320754717e-06,
"loss": 2.937470703125,
"step": 18300
},
{
"epoch": 0.23,
"grad_norm": 4.950404167175293,
"learning_rate": 7.748553459119498e-06,
"loss": 2.7051239013671875,
"step": 18400
},
{
"epoch": 0.23125,
"grad_norm": 22.454504013061523,
"learning_rate": 7.735974842767297e-06,
"loss": 2.8736590576171874,
"step": 18500
},
{
"epoch": 0.2325,
"grad_norm": 29.73653221130371,
"learning_rate": 7.723396226415095e-06,
"loss": 2.908158874511719,
"step": 18600
},
{
"epoch": 0.23375,
"grad_norm": 28.101703643798828,
"learning_rate": 7.710817610062894e-06,
"loss": 2.6522030639648437,
"step": 18700
},
{
"epoch": 0.235,
"grad_norm": 4.781414985656738,
"learning_rate": 7.698238993710693e-06,
"loss": 2.4308651733398436,
"step": 18800
},
{
"epoch": 0.23625,
"grad_norm": 17.258634567260742,
"learning_rate": 7.685660377358491e-06,
"loss": 3.040390625,
"step": 18900
},
{
"epoch": 0.2375,
"grad_norm": 17.386960983276367,
"learning_rate": 7.67308176100629e-06,
"loss": 2.769420471191406,
"step": 19000
},
{
"epoch": 0.23875,
"grad_norm": 25.329315185546875,
"learning_rate": 7.660503144654088e-06,
"loss": 2.7375177001953124,
"step": 19100
},
{
"epoch": 0.24,
"grad_norm": 6.02203369140625,
"learning_rate": 7.647924528301887e-06,
"loss": 2.505074920654297,
"step": 19200
},
{
"epoch": 0.24125,
"grad_norm": 20.155107498168945,
"learning_rate": 7.635345911949687e-06,
"loss": 2.837679138183594,
"step": 19300
},
{
"epoch": 0.2425,
"grad_norm": 18.765806198120117,
"learning_rate": 7.622767295597485e-06,
"loss": 2.907704162597656,
"step": 19400
},
{
"epoch": 0.24375,
"grad_norm": 17.111177444458008,
"learning_rate": 7.610188679245284e-06,
"loss": 2.3241337585449218,
"step": 19500
},
{
"epoch": 0.245,
"grad_norm": 26.34480094909668,
"learning_rate": 7.597610062893083e-06,
"loss": 2.534189453125,
"step": 19600
},
{
"epoch": 0.24625,
"grad_norm": 26.818429946899414,
"learning_rate": 7.5850314465408815e-06,
"loss": 2.6678060913085937,
"step": 19700
},
{
"epoch": 0.2475,
"grad_norm": 22.52924919128418,
"learning_rate": 7.57245283018868e-06,
"loss": 2.7501620483398437,
"step": 19800
},
{
"epoch": 0.24875,
"grad_norm": 2.6518571376800537,
"learning_rate": 7.559874213836479e-06,
"loss": 2.606297607421875,
"step": 19900
},
{
"epoch": 0.25,
"grad_norm": 13.926076889038086,
"learning_rate": 7.547295597484277e-06,
"loss": 2.6911566162109377,
"step": 20000
},
{
"epoch": 0.25125,
"grad_norm": 21.900789260864258,
"learning_rate": 7.534716981132077e-06,
"loss": 2.6743511962890625,
"step": 20100
},
{
"epoch": 0.2525,
"grad_norm": 17.92312240600586,
"learning_rate": 7.522138364779875e-06,
"loss": 3.041307373046875,
"step": 20200
},
{
"epoch": 0.25375,
"grad_norm": 16.685823440551758,
"learning_rate": 7.509559748427674e-06,
"loss": 2.7370196533203126,
"step": 20300
},
{
"epoch": 0.255,
"grad_norm": 17.445106506347656,
"learning_rate": 7.4969811320754725e-06,
"loss": 2.769534912109375,
"step": 20400
},
{
"epoch": 0.25625,
"grad_norm": 14.679756164550781,
"learning_rate": 7.484402515723271e-06,
"loss": 2.6487054443359375,
"step": 20500
},
{
"epoch": 0.2575,
"grad_norm": 29.46009063720703,
"learning_rate": 7.4718238993710705e-06,
"loss": 2.8649749755859375,
"step": 20600
},
{
"epoch": 0.25875,
"grad_norm": 25.532323837280273,
"learning_rate": 7.459245283018869e-06,
"loss": 2.5376904296875,
"step": 20700
},
{
"epoch": 0.26,
"grad_norm": 0.2213641107082367,
"learning_rate": 7.446666666666668e-06,
"loss": 2.5324951171875,
"step": 20800
},
{
"epoch": 0.26125,
"grad_norm": 26.14183807373047,
"learning_rate": 7.434088050314466e-06,
"loss": 2.9009820556640626,
"step": 20900
},
{
"epoch": 0.2625,
"grad_norm": 20.510589599609375,
"learning_rate": 7.421509433962265e-06,
"loss": 2.8423321533203123,
"step": 21000
},
{
"epoch": 0.26375,
"grad_norm": 19.538053512573242,
"learning_rate": 7.408930817610064e-06,
"loss": 2.8104873657226563,
"step": 21100
},
{
"epoch": 0.265,
"grad_norm": 29.258617401123047,
"learning_rate": 7.396352201257863e-06,
"loss": 2.460407562255859,
"step": 21200
},
{
"epoch": 0.26625,
"grad_norm": 24.23659324645996,
"learning_rate": 7.3837735849056614e-06,
"loss": 2.797374267578125,
"step": 21300
},
{
"epoch": 0.2675,
"grad_norm": 19.262876510620117,
"learning_rate": 7.37119496855346e-06,
"loss": 2.8027474975585935,
"step": 21400
},
{
"epoch": 0.26875,
"grad_norm": 18.131284713745117,
"learning_rate": 7.358616352201259e-06,
"loss": 2.899300842285156,
"step": 21500
},
{
"epoch": 0.27,
"grad_norm": 0.3336597979068756,
"learning_rate": 7.346037735849057e-06,
"loss": 2.505666961669922,
"step": 21600
},
{
"epoch": 0.27125,
"grad_norm": 16.87041473388672,
"learning_rate": 7.333459119496857e-06,
"loss": 2.7630340576171877,
"step": 21700
},
{
"epoch": 0.2725,
"grad_norm": 21.505661010742188,
"learning_rate": 7.320880503144655e-06,
"loss": 2.52651611328125,
"step": 21800
},
{
"epoch": 0.27375,
"grad_norm": 3.3604607582092285,
"learning_rate": 7.308301886792454e-06,
"loss": 2.6651840209960938,
"step": 21900
},
{
"epoch": 0.275,
"grad_norm": 0.0993848517537117,
"learning_rate": 7.295723270440252e-06,
"loss": 2.635201416015625,
"step": 22000
},
{
"epoch": 0.27625,
"grad_norm": 28.343442916870117,
"learning_rate": 7.283144654088051e-06,
"loss": 3.0521551513671876,
"step": 22100
},
{
"epoch": 0.2775,
"grad_norm": 22.11837387084961,
"learning_rate": 7.27056603773585e-06,
"loss": 2.7989617919921876,
"step": 22200
},
{
"epoch": 0.27875,
"grad_norm": 14.693900108337402,
"learning_rate": 7.257987421383649e-06,
"loss": 2.586000671386719,
"step": 22300
},
{
"epoch": 0.28,
"grad_norm": 12.320152282714844,
"learning_rate": 7.2454088050314476e-06,
"loss": 2.6524945068359376,
"step": 22400
},
{
"epoch": 0.28125,
"grad_norm": 16.291311264038086,
"learning_rate": 7.232830188679246e-06,
"loss": 2.5608685302734373,
"step": 22500
},
{
"epoch": 0.2825,
"grad_norm": 19.380096435546875,
"learning_rate": 7.220251572327045e-06,
"loss": 2.84384521484375,
"step": 22600
},
{
"epoch": 0.28375,
"grad_norm": 23.465559005737305,
"learning_rate": 7.207672955974843e-06,
"loss": 2.8349188232421874,
"step": 22700
},
{
"epoch": 0.285,
"grad_norm": 18.676286697387695,
"learning_rate": 7.195094339622643e-06,
"loss": 2.5051495361328127,
"step": 22800
},
{
"epoch": 0.28625,
"grad_norm": 20.133834838867188,
"learning_rate": 7.182515723270441e-06,
"loss": 2.7532232666015624,
"step": 22900
},
{
"epoch": 0.2875,
"grad_norm": 19.53656005859375,
"learning_rate": 7.16993710691824e-06,
"loss": 2.9203271484375,
"step": 23000
},
{
"epoch": 0.28875,
"grad_norm": 18.04520034790039,
"learning_rate": 7.1573584905660385e-06,
"loss": 2.6057052612304688,
"step": 23100
},
{
"epoch": 0.29,
"grad_norm": 0.5314738154411316,
"learning_rate": 7.144779874213837e-06,
"loss": 2.6248223876953123,
"step": 23200
},
{
"epoch": 0.29125,
"grad_norm": 16.670757293701172,
"learning_rate": 7.1322012578616365e-06,
"loss": 2.8461798095703124,
"step": 23300
},
{
"epoch": 0.2925,
"grad_norm": 23.886486053466797,
"learning_rate": 7.119622641509435e-06,
"loss": 2.57169921875,
"step": 23400
},
{
"epoch": 0.29375,
"grad_norm": 8.540549278259277,
"learning_rate": 7.107044025157234e-06,
"loss": 2.620181884765625,
"step": 23500
},
{
"epoch": 0.295,
"grad_norm": 34.379066467285156,
"learning_rate": 7.094465408805032e-06,
"loss": 2.5019407653808594,
"step": 23600
},
{
"epoch": 0.29625,
"grad_norm": 24.65458106994629,
"learning_rate": 7.081886792452831e-06,
"loss": 2.421243133544922,
"step": 23700
},
{
"epoch": 0.2975,
"grad_norm": 24.197660446166992,
"learning_rate": 7.06930817610063e-06,
"loss": 2.658455810546875,
"step": 23800
},
{
"epoch": 0.29875,
"grad_norm": 26.06961441040039,
"learning_rate": 7.056729559748429e-06,
"loss": 2.8529171752929687,
"step": 23900
},
{
"epoch": 0.3,
"grad_norm": 17.869230270385742,
"learning_rate": 7.0441509433962275e-06,
"loss": 2.4935687255859373,
"step": 24000
},
{
"epoch": 0.30125,
"grad_norm": 24.912609100341797,
"learning_rate": 7.031572327044026e-06,
"loss": 2.6119781494140626,
"step": 24100
},
{
"epoch": 0.3025,
"grad_norm": 22.447463989257812,
"learning_rate": 7.018993710691825e-06,
"loss": 2.824261474609375,
"step": 24200
},
{
"epoch": 0.30375,
"grad_norm": 4.013281345367432,
"learning_rate": 7.006415094339623e-06,
"loss": 2.564319763183594,
"step": 24300
},
{
"epoch": 0.305,
"grad_norm": 20.67205810546875,
"learning_rate": 6.993836477987423e-06,
"loss": 2.513914794921875,
"step": 24400
},
{
"epoch": 0.30625,
"grad_norm": 24.64101219177246,
"learning_rate": 6.981257861635221e-06,
"loss": 2.6228094482421875,
"step": 24500
},
{
"epoch": 0.3075,
"grad_norm": 21.346033096313477,
"learning_rate": 6.96867924528302e-06,
"loss": 2.74439208984375,
"step": 24600
},
{
"epoch": 0.30875,
"grad_norm": 10.391414642333984,
"learning_rate": 6.956100628930818e-06,
"loss": 2.2809228515625,
"step": 24700
},
{
"epoch": 0.31,
"grad_norm": 1.595000982284546,
"learning_rate": 6.943522012578617e-06,
"loss": 2.4967855834960937,
"step": 24800
},
{
"epoch": 0.31125,
"grad_norm": 20.543582916259766,
"learning_rate": 6.9309433962264164e-06,
"loss": 2.585915222167969,
"step": 24900
},
{
"epoch": 0.3125,
"grad_norm": 18.013103485107422,
"learning_rate": 6.918364779874215e-06,
"loss": 2.967261962890625,
"step": 25000
},
{
"epoch": 0.31375,
"grad_norm": 2.930172920227051,
"learning_rate": 6.905786163522014e-06,
"loss": 2.45760986328125,
"step": 25100
},
{
"epoch": 0.315,
"grad_norm": 0.313312292098999,
"learning_rate": 6.893207547169812e-06,
"loss": 2.421280822753906,
"step": 25200
},
{
"epoch": 0.31625,
"grad_norm": 19.377904891967773,
"learning_rate": 6.880628930817611e-06,
"loss": 2.710804748535156,
"step": 25300
},
{
"epoch": 0.3175,
"grad_norm": 29.901630401611328,
"learning_rate": 6.86805031446541e-06,
"loss": 2.6909103393554688,
"step": 25400
},
{
"epoch": 0.31875,
"grad_norm": 0.17036622762680054,
"learning_rate": 6.855471698113209e-06,
"loss": 2.56456787109375,
"step": 25500
},
{
"epoch": 0.32,
"grad_norm": 0.009640435688197613,
"learning_rate": 6.842893081761007e-06,
"loss": 2.603443603515625,
"step": 25600
},
{
"epoch": 0.32125,
"grad_norm": 18.46479606628418,
"learning_rate": 6.830314465408806e-06,
"loss": 2.427742156982422,
"step": 25700
},
{
"epoch": 0.3225,
"grad_norm": 20.460947036743164,
"learning_rate": 6.8177358490566046e-06,
"loss": 2.503583984375,
"step": 25800
},
{
"epoch": 0.32375,
"grad_norm": 36.58029556274414,
"learning_rate": 6.805157232704403e-06,
"loss": 2.641945495605469,
"step": 25900
},
{
"epoch": 0.325,
"grad_norm": 0.01783256232738495,
"learning_rate": 6.7925786163522026e-06,
"loss": 2.334880065917969,
"step": 26000
},
{
"epoch": 0.32625,
"grad_norm": 16.67290496826172,
"learning_rate": 6.780000000000001e-06,
"loss": 2.31351806640625,
"step": 26100
},
{
"epoch": 0.3275,
"grad_norm": 22.63792610168457,
"learning_rate": 6.7674213836478e-06,
"loss": 2.6967156982421874,
"step": 26200
},
{
"epoch": 0.32875,
"grad_norm": 9.776408195495605,
"learning_rate": 6.754842767295598e-06,
"loss": 2.56522705078125,
"step": 26300
},
{
"epoch": 0.33,
"grad_norm": 28.92633628845215,
"learning_rate": 6.742264150943397e-06,
"loss": 2.3877410888671875,
"step": 26400
},
{
"epoch": 0.33125,
"grad_norm": 24.558900833129883,
"learning_rate": 6.729685534591196e-06,
"loss": 2.6090899658203126,
"step": 26500
},
{
"epoch": 0.3325,
"grad_norm": 22.432323455810547,
"learning_rate": 6.717106918238995e-06,
"loss": 2.537680358886719,
"step": 26600
},
{
"epoch": 0.33375,
"grad_norm": 16.474348068237305,
"learning_rate": 6.7045283018867935e-06,
"loss": 2.5907723999023435,
"step": 26700
},
{
"epoch": 0.335,
"grad_norm": 0.12232944369316101,
"learning_rate": 6.691949685534592e-06,
"loss": 2.2261618041992186,
"step": 26800
},
{
"epoch": 0.33625,
"grad_norm": 23.504776000976562,
"learning_rate": 6.679371069182391e-06,
"loss": 2.601636962890625,
"step": 26900
},
{
"epoch": 0.3375,
"grad_norm": 25.80137062072754,
"learning_rate": 6.66679245283019e-06,
"loss": 2.415384521484375,
"step": 27000
},
{
"epoch": 0.33875,
"grad_norm": 10.791342735290527,
"learning_rate": 6.654213836477989e-06,
"loss": 2.3595407104492185,
"step": 27100
},
{
"epoch": 0.34,
"grad_norm": 0.23251843452453613,
"learning_rate": 6.641635220125787e-06,
"loss": 2.3029635620117186,
"step": 27200
},
{
"epoch": 0.34125,
"grad_norm": 22.788803100585938,
"learning_rate": 6.629056603773586e-06,
"loss": 2.467623748779297,
"step": 27300
},
{
"epoch": 0.3425,
"grad_norm": 26.63360023498535,
"learning_rate": 6.6164779874213845e-06,
"loss": 2.717623291015625,
"step": 27400
},
{
"epoch": 0.34375,
"grad_norm": 16.987070083618164,
"learning_rate": 6.603899371069183e-06,
"loss": 2.4102777099609374,
"step": 27500
},
{
"epoch": 0.345,
"grad_norm": 0.01087226066738367,
"learning_rate": 6.5913207547169825e-06,
"loss": 2.3370285034179688,
"step": 27600
},
{
"epoch": 0.34625,
"grad_norm": 23.72052574157715,
"learning_rate": 6.578742138364781e-06,
"loss": 2.6314013671875,
"step": 27700
},
{
"epoch": 0.3475,
"grad_norm": 23.36273765563965,
"learning_rate": 6.56616352201258e-06,
"loss": 2.7635400390625,
"step": 27800
},
{
"epoch": 0.34875,
"grad_norm": 1.7312610149383545,
"learning_rate": 6.553584905660378e-06,
"loss": 2.5564617919921875,
"step": 27900
},
{
"epoch": 0.35,
"grad_norm": 35.84255599975586,
"learning_rate": 6.541006289308177e-06,
"loss": 2.7504351806640623,
"step": 28000
},
{
"epoch": 0.35125,
"grad_norm": 26.242603302001953,
"learning_rate": 6.528427672955976e-06,
"loss": 2.373230285644531,
"step": 28100
},
{
"epoch": 0.3525,
"grad_norm": 18.80271339416504,
"learning_rate": 6.515849056603775e-06,
"loss": 2.472604064941406,
"step": 28200
},
{
"epoch": 0.35375,
"grad_norm": 0.8052368760108948,
"learning_rate": 6.5032704402515734e-06,
"loss": 2.4384115600585936,
"step": 28300
},
{
"epoch": 0.355,
"grad_norm": 9.235365867614746,
"learning_rate": 6.490691823899372e-06,
"loss": 2.026627502441406,
"step": 28400
},
{
"epoch": 0.35625,
"grad_norm": 20.106481552124023,
"learning_rate": 6.478113207547171e-06,
"loss": 2.431183319091797,
"step": 28500
},
{
"epoch": 0.3575,
"grad_norm": 26.855960845947266,
"learning_rate": 6.46553459119497e-06,
"loss": 2.78699462890625,
"step": 28600
},
{
"epoch": 0.35875,
"grad_norm": 3.0411767959594727,
"learning_rate": 6.452955974842769e-06,
"loss": 2.862520751953125,
"step": 28700
},
{
"epoch": 0.36,
"grad_norm": 17.747953414916992,
"learning_rate": 6.440377358490567e-06,
"loss": 2.4523715209960937,
"step": 28800
},
{
"epoch": 0.36125,
"grad_norm": 17.899240493774414,
"learning_rate": 6.427798742138366e-06,
"loss": 2.606719970703125,
"step": 28900
},
{
"epoch": 0.3625,
"grad_norm": 27.6655216217041,
"learning_rate": 6.415220125786164e-06,
"loss": 2.9831976318359374,
"step": 29000
},
{
"epoch": 0.36375,
"grad_norm": 11.997615814208984,
"learning_rate": 6.402641509433963e-06,
"loss": 2.345184326171875,
"step": 29100
},
{
"epoch": 0.365,
"grad_norm": 4.737981796264648,
"learning_rate": 6.390062893081761e-06,
"loss": 2.16708251953125,
"step": 29200
},
{
"epoch": 0.36625,
"grad_norm": 26.705848693847656,
"learning_rate": 6.37748427672956e-06,
"loss": 2.63515869140625,
"step": 29300
},
{
"epoch": 0.3675,
"grad_norm": 19.237810134887695,
"learning_rate": 6.364905660377359e-06,
"loss": 2.532059326171875,
"step": 29400
},
{
"epoch": 0.36875,
"grad_norm": 17.01352882385254,
"learning_rate": 6.352327044025157e-06,
"loss": 2.55763671875,
"step": 29500
},
{
"epoch": 0.37,
"grad_norm": 27.392210006713867,
"learning_rate": 6.339748427672956e-06,
"loss": 2.2140924072265626,
"step": 29600
},
{
"epoch": 0.37125,
"grad_norm": 22.72922134399414,
"learning_rate": 6.3271698113207545e-06,
"loss": 2.749759521484375,
"step": 29700
},
{
"epoch": 0.3725,
"grad_norm": 30.038837432861328,
"learning_rate": 6.314591194968553e-06,
"loss": 2.596155700683594,
"step": 29800
},
{
"epoch": 0.37375,
"grad_norm": 11.441932678222656,
"learning_rate": 6.3020125786163525e-06,
"loss": 2.4867178344726564,
"step": 29900
},
{
"epoch": 0.375,
"grad_norm": 1.1486650705337524,
"learning_rate": 6.289433962264151e-06,
"loss": 2.6022509765625,
"step": 30000
},
{
"epoch": 0.37625,
"grad_norm": 19.94702911376953,
"learning_rate": 6.27685534591195e-06,
"loss": 2.4900088500976563,
"step": 30100
},
{
"epoch": 0.3775,
"grad_norm": 23.304832458496094,
"learning_rate": 6.264276729559748e-06,
"loss": 2.6295010375976564,
"step": 30200
},
{
"epoch": 0.37875,
"grad_norm": 0.9189664721488953,
"learning_rate": 6.251698113207547e-06,
"loss": 2.614851379394531,
"step": 30300
},
{
"epoch": 0.38,
"grad_norm": 7.372312545776367,
"learning_rate": 6.239119496855346e-06,
"loss": 2.43028564453125,
"step": 30400
},
{
"epoch": 0.38125,
"grad_norm": 24.777666091918945,
"learning_rate": 6.226540880503145e-06,
"loss": 2.790203857421875,
"step": 30500
},
{
"epoch": 0.3825,
"grad_norm": 24.64716339111328,
"learning_rate": 6.2139622641509434e-06,
"loss": 2.7274844360351564,
"step": 30600
},
{
"epoch": 0.38375,
"grad_norm": 10.4207763671875,
"learning_rate": 6.201383647798742e-06,
"loss": 2.1123760986328124,
"step": 30700
},
{
"epoch": 0.385,
"grad_norm": 0.07715418934822083,
"learning_rate": 6.188805031446541e-06,
"loss": 2.5373687744140625,
"step": 30800
},
{
"epoch": 0.38625,
"grad_norm": 23.558998107910156,
"learning_rate": 6.17622641509434e-06,
"loss": 2.632052001953125,
"step": 30900
},
{
"epoch": 0.3875,
"grad_norm": 26.116592407226562,
"learning_rate": 6.163647798742139e-06,
"loss": 2.494936218261719,
"step": 31000
},
{
"epoch": 0.38875,
"grad_norm": 8.466816902160645,
"learning_rate": 6.151069182389937e-06,
"loss": 2.3917417907714844,
"step": 31100
},
{
"epoch": 0.39,
"grad_norm": 6.2974748611450195,
"learning_rate": 6.138490566037736e-06,
"loss": 2.315247039794922,
"step": 31200
},
{
"epoch": 0.39125,
"grad_norm": 26.84358024597168,
"learning_rate": 6.125911949685534e-06,
"loss": 2.6386013793945313,
"step": 31300
},
{
"epoch": 0.3925,
"grad_norm": 18.76458740234375,
"learning_rate": 6.113333333333333e-06,
"loss": 2.304689636230469,
"step": 31400
},
{
"epoch": 0.39375,
"grad_norm": 12.886114120483398,
"learning_rate": 6.100754716981132e-06,
"loss": 2.476997528076172,
"step": 31500
},
{
"epoch": 0.395,
"grad_norm": 14.977468490600586,
"learning_rate": 6.088176100628931e-06,
"loss": 2.3447396850585935,
"step": 31600
},
{
"epoch": 0.39625,
"grad_norm": 29.1169490814209,
"learning_rate": 6.0755974842767296e-06,
"loss": 2.186744842529297,
"step": 31700
},
{
"epoch": 0.3975,
"grad_norm": 27.776119232177734,
"learning_rate": 6.063018867924528e-06,
"loss": 2.382681884765625,
"step": 31800
},
{
"epoch": 0.39875,
"grad_norm": 48.07829284667969,
"learning_rate": 6.050440251572327e-06,
"loss": 2.3385089111328123,
"step": 31900
},
{
"epoch": 0.4,
"grad_norm": 0.7851316332817078,
"learning_rate": 6.037861635220126e-06,
"loss": 2.229570770263672,
"step": 32000
},
{
"epoch": 0.40125,
"grad_norm": 20.3125057220459,
"learning_rate": 6.025283018867925e-06,
"loss": 2.5623980712890626,
"step": 32100
},
{
"epoch": 0.4025,
"grad_norm": 21.488149642944336,
"learning_rate": 6.012704402515723e-06,
"loss": 2.912520751953125,
"step": 32200
},
{
"epoch": 0.40375,
"grad_norm": 32.51865005493164,
"learning_rate": 6.000125786163522e-06,
"loss": 2.3843609619140627,
"step": 32300
},
{
"epoch": 0.405,
"grad_norm": 1.5765758752822876,
"learning_rate": 5.9875471698113205e-06,
"loss": 2.381104736328125,
"step": 32400
},
{
"epoch": 0.40625,
"grad_norm": 19.67698860168457,
"learning_rate": 5.97496855345912e-06,
"loss": 2.60897216796875,
"step": 32500
},
{
"epoch": 0.4075,
"grad_norm": 25.434328079223633,
"learning_rate": 5.9623899371069185e-06,
"loss": 2.505929260253906,
"step": 32600
},
{
"epoch": 0.40875,
"grad_norm": 1.3922165632247925,
"learning_rate": 5.949811320754717e-06,
"loss": 2.7238693237304688,
"step": 32700
},
{
"epoch": 0.41,
"grad_norm": 34.48624038696289,
"learning_rate": 5.937232704402516e-06,
"loss": 2.2143258666992187,
"step": 32800
},
{
"epoch": 0.41125,
"grad_norm": 23.105432510375977,
"learning_rate": 5.924654088050314e-06,
"loss": 2.609884033203125,
"step": 32900
},
{
"epoch": 0.4125,
"grad_norm": 25.032276153564453,
"learning_rate": 5.912075471698113e-06,
"loss": 2.4755784606933595,
"step": 33000
},
{
"epoch": 0.41375,
"grad_norm": 2.3799238204956055,
"learning_rate": 5.899496855345912e-06,
"loss": 2.575408630371094,
"step": 33100
},
{
"epoch": 0.415,
"grad_norm": 0.27088427543640137,
"learning_rate": 5.886918238993711e-06,
"loss": 2.577947998046875,
"step": 33200
},
{
"epoch": 0.41625,
"grad_norm": 21.328598022460938,
"learning_rate": 5.8743396226415095e-06,
"loss": 2.358548583984375,
"step": 33300
},
{
"epoch": 0.4175,
"grad_norm": 22.012718200683594,
"learning_rate": 5.861761006289308e-06,
"loss": 2.5863958740234376,
"step": 33400
},
{
"epoch": 0.41875,
"grad_norm": 11.000739097595215,
"learning_rate": 5.849182389937107e-06,
"loss": 2.410167236328125,
"step": 33500
},
{
"epoch": 0.42,
"grad_norm": 0.4935649633407593,
"learning_rate": 5.836603773584906e-06,
"loss": 2.570559387207031,
"step": 33600
},
{
"epoch": 0.42125,
"grad_norm": 16.273447036743164,
"learning_rate": 5.824025157232705e-06,
"loss": 2.599737548828125,
"step": 33700
},
{
"epoch": 0.4225,
"grad_norm": 17.573291778564453,
"learning_rate": 5.811446540880503e-06,
"loss": 2.647215576171875,
"step": 33800
},
{
"epoch": 0.42375,
"grad_norm": 11.290910720825195,
"learning_rate": 5.798867924528302e-06,
"loss": 2.6486666870117186,
"step": 33900
},
{
"epoch": 0.425,
"grad_norm": 0.27160441875457764,
"learning_rate": 5.7862893081761004e-06,
"loss": 2.48810791015625,
"step": 34000
},
{
"epoch": 0.42625,
"grad_norm": 33.099632263183594,
"learning_rate": 5.773710691823899e-06,
"loss": 2.4821075439453124,
"step": 34100
},
{
"epoch": 0.4275,
"grad_norm": 18.253433227539062,
"learning_rate": 5.7611320754716984e-06,
"loss": 2.4092727661132813,
"step": 34200
},
{
"epoch": 0.42875,
"grad_norm": 6.506767749786377,
"learning_rate": 5.748553459119497e-06,
"loss": 2.461749267578125,
"step": 34300
},
{
"epoch": 0.43,
"grad_norm": 0.04523186758160591,
"learning_rate": 5.735974842767296e-06,
"loss": 2.4135496520996096,
"step": 34400
},
{
"epoch": 0.43125,
"grad_norm": 22.646928787231445,
"learning_rate": 5.723396226415094e-06,
"loss": 2.4821810913085938,
"step": 34500
},
{
"epoch": 0.4325,
"grad_norm": 24.81723976135254,
"learning_rate": 5.710817610062893e-06,
"loss": 2.5469622802734375,
"step": 34600
},
{
"epoch": 0.43375,
"grad_norm": 13.340218544006348,
"learning_rate": 5.698238993710692e-06,
"loss": 2.5624932861328125,
"step": 34700
},
{
"epoch": 0.435,
"grad_norm": 0.1735188513994217,
"learning_rate": 5.685660377358491e-06,
"loss": 2.5089077758789062,
"step": 34800
},
{
"epoch": 0.43625,
"grad_norm": 21.547271728515625,
"learning_rate": 5.673081761006289e-06,
"loss": 2.6526663208007815,
"step": 34900
},
{
"epoch": 0.4375,
"grad_norm": 19.01319694519043,
"learning_rate": 5.660503144654088e-06,
"loss": 2.860760498046875,
"step": 35000
},
{
"epoch": 0.43875,
"grad_norm": 34.29476547241211,
"learning_rate": 5.6479245283018866e-06,
"loss": 2.6115411376953124,
"step": 35100
},
{
"epoch": 0.44,
"grad_norm": 49.28584671020508,
"learning_rate": 5.635345911949686e-06,
"loss": 2.1887362670898436,
"step": 35200
},
{
"epoch": 0.44125,
"grad_norm": 22.770751953125,
"learning_rate": 5.622767295597485e-06,
"loss": 2.5814306640625,
"step": 35300
},
{
"epoch": 0.4425,
"grad_norm": 30.632217407226562,
"learning_rate": 5.610188679245283e-06,
"loss": 2.3137057495117186,
"step": 35400
},
{
"epoch": 0.44375,
"grad_norm": 1.2359341382980347,
"learning_rate": 5.597610062893082e-06,
"loss": 2.060030517578125,
"step": 35500
},
{
"epoch": 0.445,
"grad_norm": 0.3209603428840637,
"learning_rate": 5.58503144654088e-06,
"loss": 2.113879852294922,
"step": 35600
},
{
"epoch": 0.44625,
"grad_norm": 18.14032554626465,
"learning_rate": 5.572452830188679e-06,
"loss": 2.4774960327148436,
"step": 35700
},
{
"epoch": 0.4475,
"grad_norm": 28.4627628326416,
"learning_rate": 5.559874213836478e-06,
"loss": 2.282992248535156,
"step": 35800
},
{
"epoch": 0.44875,
"grad_norm": 9.6217041015625,
"learning_rate": 5.547295597484277e-06,
"loss": 2.6865158081054688,
"step": 35900
},
{
"epoch": 0.45,
"grad_norm": 0.1811428815126419,
"learning_rate": 5.5347169811320755e-06,
"loss": 2.6177047729492187,
"step": 36000
},
{
"epoch": 0.45125,
"grad_norm": 29.922924041748047,
"learning_rate": 5.522138364779874e-06,
"loss": 2.666334533691406,
"step": 36100
},
{
"epoch": 0.4525,
"grad_norm": 22.7872371673584,
"learning_rate": 5.509559748427673e-06,
"loss": 2.8180126953125,
"step": 36200
},
{
"epoch": 0.45375,
"grad_norm": 47.54941177368164,
"learning_rate": 5.496981132075472e-06,
"loss": 2.5696286010742186,
"step": 36300
},
{
"epoch": 0.455,
"grad_norm": 0.02181134559214115,
"learning_rate": 5.484402515723271e-06,
"loss": 2.3324235534667968,
"step": 36400
},
{
"epoch": 0.45625,
"grad_norm": 29.27972412109375,
"learning_rate": 5.471823899371069e-06,
"loss": 2.377205657958984,
"step": 36500
},
{
"epoch": 0.4575,
"grad_norm": 27.066911697387695,
"learning_rate": 5.459245283018868e-06,
"loss": 2.6452157592773435,
"step": 36600
},
{
"epoch": 0.45875,
"grad_norm": 5.8870849609375,
"learning_rate": 5.4466666666666665e-06,
"loss": 2.6054962158203123,
"step": 36700
},
{
"epoch": 0.46,
"grad_norm": 9.92285442352295,
"learning_rate": 5.434088050314466e-06,
"loss": 2.3585281372070312,
"step": 36800
},
{
"epoch": 0.46125,
"grad_norm": 25.87106704711914,
"learning_rate": 5.4215094339622645e-06,
"loss": 2.3194918823242188,
"step": 36900
},
{
"epoch": 0.4625,
"grad_norm": 24.31088638305664,
"learning_rate": 5.408930817610063e-06,
"loss": 2.558509521484375,
"step": 37000
},
{
"epoch": 0.46375,
"grad_norm": 8.893757820129395,
"learning_rate": 5.396352201257862e-06,
"loss": 2.501906433105469,
"step": 37100
},
{
"epoch": 0.465,
"grad_norm": 0.19256582856178284,
"learning_rate": 5.38377358490566e-06,
"loss": 2.4328764343261717,
"step": 37200
},
{
"epoch": 0.46625,
"grad_norm": 36.3577995300293,
"learning_rate": 5.371194968553459e-06,
"loss": 2.593095397949219,
"step": 37300
},
{
"epoch": 0.4675,
"grad_norm": 28.993858337402344,
"learning_rate": 5.358616352201258e-06,
"loss": 2.5382254028320315,
"step": 37400
},
{
"epoch": 0.46875,
"grad_norm": 1.2644715309143066,
"learning_rate": 5.346037735849057e-06,
"loss": 2.5988037109375,
"step": 37500
},
{
"epoch": 0.47,
"grad_norm": 1.758081078529358,
"learning_rate": 5.3334591194968554e-06,
"loss": 2.3407969665527344,
"step": 37600
},
{
"epoch": 0.47125,
"grad_norm": 29.136577606201172,
"learning_rate": 5.320880503144654e-06,
"loss": 2.649151611328125,
"step": 37700
},
{
"epoch": 0.4725,
"grad_norm": 25.28328514099121,
"learning_rate": 5.308301886792453e-06,
"loss": 2.5671429443359375,
"step": 37800
},
{
"epoch": 0.47375,
"grad_norm": 15.502847671508789,
"learning_rate": 5.295723270440252e-06,
"loss": 2.6158074951171875,
"step": 37900
},
{
"epoch": 0.475,
"grad_norm": 0.22777187824249268,
"learning_rate": 5.283144654088051e-06,
"loss": 2.15365966796875,
"step": 38000
},
{
"epoch": 0.47625,
"grad_norm": 11.893803596496582,
"learning_rate": 5.270566037735849e-06,
"loss": 2.3156301879882815,
"step": 38100
},
{
"epoch": 0.4775,
"grad_norm": 25.753210067749023,
"learning_rate": 5.257987421383648e-06,
"loss": 2.503589324951172,
"step": 38200
},
{
"epoch": 0.47875,
"grad_norm": 16.314041137695312,
"learning_rate": 5.245408805031446e-06,
"loss": 2.639173583984375,
"step": 38300
},
{
"epoch": 0.48,
"grad_norm": 0.021451743319630623,
"learning_rate": 5.232830188679246e-06,
"loss": 2.311227111816406,
"step": 38400
},
{
"epoch": 0.48125,
"grad_norm": 23.82429313659668,
"learning_rate": 5.220251572327044e-06,
"loss": 2.3693087768554686,
"step": 38500
},
{
"epoch": 0.4825,
"grad_norm": 27.395193099975586,
"learning_rate": 5.207672955974843e-06,
"loss": 2.6940673828125,
"step": 38600
},
{
"epoch": 0.48375,
"grad_norm": 4.3216071128845215,
"learning_rate": 5.1950943396226416e-06,
"loss": 2.763636474609375,
"step": 38700
},
{
"epoch": 0.485,
"grad_norm": 7.015088081359863,
"learning_rate": 5.18251572327044e-06,
"loss": 2.432607879638672,
"step": 38800
},
{
"epoch": 0.48625,
"grad_norm": 22.117116928100586,
"learning_rate": 5.169937106918239e-06,
"loss": 2.6036077880859376,
"step": 38900
},
{
"epoch": 0.4875,
"grad_norm": 19.56247329711914,
"learning_rate": 5.157358490566038e-06,
"loss": 2.742894287109375,
"step": 39000
},
{
"epoch": 0.48875,
"grad_norm": 9.732338905334473,
"learning_rate": 5.144779874213837e-06,
"loss": 2.3442169189453126,
"step": 39100
},
{
"epoch": 0.49,
"grad_norm": 12.13438892364502,
"learning_rate": 5.132201257861635e-06,
"loss": 2.3807391357421874,
"step": 39200
},
{
"epoch": 0.49125,
"grad_norm": 20.819271087646484,
"learning_rate": 5.119622641509434e-06,
"loss": 2.489746551513672,
"step": 39300
},
{
"epoch": 0.4925,
"grad_norm": 22.794889450073242,
"learning_rate": 5.1070440251572325e-06,
"loss": 2.7172930908203123,
"step": 39400
},
{
"epoch": 0.49375,
"grad_norm": 9.96747875213623,
"learning_rate": 5.094465408805032e-06,
"loss": 2.375674133300781,
"step": 39500
},
{
"epoch": 0.495,
"grad_norm": 0.012898732908070087,
"learning_rate": 5.0818867924528305e-06,
"loss": 2.238426513671875,
"step": 39600
},
{
"epoch": 0.49625,
"grad_norm": 23.519365310668945,
"learning_rate": 5.069308176100629e-06,
"loss": 2.3515843200683593,
"step": 39700
},
{
"epoch": 0.4975,
"grad_norm": 24.20486068725586,
"learning_rate": 5.056729559748428e-06,
"loss": 2.6381719970703124,
"step": 39800
},
{
"epoch": 0.49875,
"grad_norm": 25.931121826171875,
"learning_rate": 5.044150943396226e-06,
"loss": 2.2815740966796874,
"step": 39900
},
{
"epoch": 0.5,
"grad_norm": 0.4895442724227905,
"learning_rate": 5.031572327044026e-06,
"loss": 2.1325482177734374,
"step": 40000
},
{
"epoch": 0.50125,
"grad_norm": 22.385295867919922,
"learning_rate": 5.018993710691824e-06,
"loss": 2.376383514404297,
"step": 40100
},
{
"epoch": 0.5025,
"grad_norm": 24.816694259643555,
"learning_rate": 5.006415094339623e-06,
"loss": 2.4902183532714846,
"step": 40200
},
{
"epoch": 0.50375,
"grad_norm": 0.2846035957336426,
"learning_rate": 4.9938364779874215e-06,
"loss": 2.5287326049804686,
"step": 40300
},
{
"epoch": 0.505,
"grad_norm": 2.7115321159362793,
"learning_rate": 4.98125786163522e-06,
"loss": 2.357637939453125,
"step": 40400
},
{
"epoch": 0.50625,
"grad_norm": 23.814891815185547,
"learning_rate": 4.968679245283019e-06,
"loss": 2.194998016357422,
"step": 40500
},
{
"epoch": 0.5075,
"grad_norm": 22.298978805541992,
"learning_rate": 4.956100628930818e-06,
"loss": 2.3045025634765626,
"step": 40600
},
{
"epoch": 0.50875,
"grad_norm": 1.843037724494934,
"learning_rate": 4.943522012578617e-06,
"loss": 2.532041473388672,
"step": 40700
},
{
"epoch": 0.51,
"grad_norm": 0.05613021180033684,
"learning_rate": 4.930943396226415e-06,
"loss": 2.358630828857422,
"step": 40800
},
{
"epoch": 0.51125,
"grad_norm": 26.92227554321289,
"learning_rate": 4.918364779874214e-06,
"loss": 2.375510711669922,
"step": 40900
},
{
"epoch": 0.5125,
"grad_norm": 24.53324317932129,
"learning_rate": 4.9057861635220124e-06,
"loss": 2.3742355346679687,
"step": 41000
},
{
"epoch": 0.51375,
"grad_norm": 1.7757785320281982,
"learning_rate": 4.893207547169812e-06,
"loss": 2.1228904724121094,
"step": 41100
},
{
"epoch": 0.515,
"grad_norm": 1.370865821838379,
"learning_rate": 4.8806289308176104e-06,
"loss": 2.2730003356933595,
"step": 41200
},
{
"epoch": 0.51625,
"grad_norm": 27.052255630493164,
"learning_rate": 4.868050314465409e-06,
"loss": 2.1805772399902343,
"step": 41300
},
{
"epoch": 0.5175,
"grad_norm": 19.289770126342773,
"learning_rate": 4.855471698113208e-06,
"loss": 2.3394677734375,
"step": 41400
},
{
"epoch": 0.51875,
"grad_norm": 0.14031943678855896,
"learning_rate": 4.842893081761006e-06,
"loss": 2.16936767578125,
"step": 41500
},
{
"epoch": 0.52,
"grad_norm": 0.3180188536643982,
"learning_rate": 4.830314465408806e-06,
"loss": 2.288330383300781,
"step": 41600
},
{
"epoch": 0.52125,
"grad_norm": 28.790279388427734,
"learning_rate": 4.817735849056604e-06,
"loss": 2.2974285888671875,
"step": 41700
},
{
"epoch": 0.5225,
"grad_norm": 29.82061767578125,
"learning_rate": 4.805157232704403e-06,
"loss": 2.2505938720703127,
"step": 41800
},
{
"epoch": 0.52375,
"grad_norm": 1.3207628726959229,
"learning_rate": 4.792578616352201e-06,
"loss": 2.1726535034179686,
"step": 41900
},
{
"epoch": 0.525,
"grad_norm": 19.538122177124023,
"learning_rate": 4.78e-06,
"loss": 2.3601947021484375,
"step": 42000
},
{
"epoch": 0.52625,
"grad_norm": 23.45235252380371,
"learning_rate": 4.7674213836477986e-06,
"loss": 2.3694439697265626,
"step": 42100
},
{
"epoch": 0.5275,
"grad_norm": 29.030534744262695,
"learning_rate": 4.754842767295598e-06,
"loss": 2.375699462890625,
"step": 42200
},
{
"epoch": 0.52875,
"grad_norm": 0.05739065632224083,
"learning_rate": 4.742264150943397e-06,
"loss": 2.2624081420898436,
"step": 42300
},
{
"epoch": 0.53,
"grad_norm": 0.09302949905395508,
"learning_rate": 4.729685534591195e-06,
"loss": 2.418585205078125,
"step": 42400
},
{
"epoch": 0.53125,
"grad_norm": 20.36932945251465,
"learning_rate": 4.717106918238994e-06,
"loss": 2.7232342529296876,
"step": 42500
},
{
"epoch": 0.5325,
"grad_norm": 25.583152770996094,
"learning_rate": 4.704528301886792e-06,
"loss": 2.5170713806152345,
"step": 42600
},
{
"epoch": 0.53375,
"grad_norm": 27.926185607910156,
"learning_rate": 4.691949685534592e-06,
"loss": 2.2554306030273437,
"step": 42700
},
{
"epoch": 0.535,
"grad_norm": 0.015048661269247532,
"learning_rate": 4.67937106918239e-06,
"loss": 2.410753173828125,
"step": 42800
},
{
"epoch": 0.53625,
"grad_norm": 22.782848358154297,
"learning_rate": 4.666792452830189e-06,
"loss": 2.506884765625,
"step": 42900
},
{
"epoch": 0.5375,
"grad_norm": 27.83270263671875,
"learning_rate": 4.6542138364779875e-06,
"loss": 2.42444091796875,
"step": 43000
},
{
"epoch": 0.53875,
"grad_norm": 14.226387023925781,
"learning_rate": 4.641635220125786e-06,
"loss": 2.2237681579589843,
"step": 43100
},
{
"epoch": 0.54,
"grad_norm": 30.74934196472168,
"learning_rate": 4.6290566037735855e-06,
"loss": 1.8440298461914062,
"step": 43200
},
{
"epoch": 0.54125,
"grad_norm": 26.56260871887207,
"learning_rate": 4.616477987421384e-06,
"loss": 2.5520880126953127,
"step": 43300
},
{
"epoch": 0.5425,
"grad_norm": 26.363876342773438,
"learning_rate": 4.603899371069183e-06,
"loss": 2.6202496337890624,
"step": 43400
},
{
"epoch": 0.54375,
"grad_norm": 24.67633628845215,
"learning_rate": 4.591320754716981e-06,
"loss": 2.253356170654297,
"step": 43500
},
{
"epoch": 0.545,
"grad_norm": 0.01842404529452324,
"learning_rate": 4.57874213836478e-06,
"loss": 2.283622283935547,
"step": 43600
},
{
"epoch": 0.54625,
"grad_norm": 15.855024337768555,
"learning_rate": 4.5661635220125785e-06,
"loss": 2.501055908203125,
"step": 43700
},
{
"epoch": 0.5475,
"grad_norm": 26.492700576782227,
"learning_rate": 4.553584905660378e-06,
"loss": 2.426959228515625,
"step": 43800
},
{
"epoch": 0.54875,
"grad_norm": 7.623685836791992,
"learning_rate": 4.5410062893081765e-06,
"loss": 1.9829179382324218,
"step": 43900
},
{
"epoch": 0.55,
"grad_norm": 17.01411247253418,
"learning_rate": 4.528427672955975e-06,
"loss": 2.1798980712890623,
"step": 44000
},
{
"epoch": 0.55125,
"grad_norm": 21.383556365966797,
"learning_rate": 4.515849056603774e-06,
"loss": 2.0453337097167967,
"step": 44100
},
{
"epoch": 0.5525,
"grad_norm": 22.872831344604492,
"learning_rate": 4.503270440251572e-06,
"loss": 2.4065341186523437,
"step": 44200
},
{
"epoch": 0.55375,
"grad_norm": 0.3448590934276581,
"learning_rate": 4.490691823899372e-06,
"loss": 2.0755601501464844,
"step": 44300
},
{
"epoch": 0.555,
"grad_norm": 64.99383544921875,
"learning_rate": 4.47811320754717e-06,
"loss": 2.167461242675781,
"step": 44400
},
{
"epoch": 0.55625,
"grad_norm": 26.849061965942383,
"learning_rate": 4.465534591194969e-06,
"loss": 2.392653045654297,
"step": 44500
},
{
"epoch": 0.5575,
"grad_norm": 22.663536071777344,
"learning_rate": 4.4529559748427674e-06,
"loss": 2.6425860595703123,
"step": 44600
},
{
"epoch": 0.55875,
"grad_norm": 0.09723508358001709,
"learning_rate": 4.440377358490566e-06,
"loss": 2.041617431640625,
"step": 44700
},
{
"epoch": 0.56,
"grad_norm": 0.6772779822349548,
"learning_rate": 4.427798742138365e-06,
"loss": 2.0510581970214843,
"step": 44800
},
{
"epoch": 0.56125,
"grad_norm": 23.326744079589844,
"learning_rate": 4.415220125786164e-06,
"loss": 2.28655029296875,
"step": 44900
},
{
"epoch": 0.5625,
"grad_norm": 22.986696243286133,
"learning_rate": 4.402641509433963e-06,
"loss": 2.162388916015625,
"step": 45000
},
{
"epoch": 0.56375,
"grad_norm": 0.11825785040855408,
"learning_rate": 4.390062893081761e-06,
"loss": 2.4120408630371095,
"step": 45100
},
{
"epoch": 0.565,
"grad_norm": 0.07316289842128754,
"learning_rate": 4.37748427672956e-06,
"loss": 1.9428927612304687,
"step": 45200
},
{
"epoch": 0.56625,
"grad_norm": 19.97572135925293,
"learning_rate": 4.364905660377358e-06,
"loss": 2.409757537841797,
"step": 45300
},
{
"epoch": 0.5675,
"grad_norm": 21.209138870239258,
"learning_rate": 4.352327044025158e-06,
"loss": 2.303609619140625,
"step": 45400
},
{
"epoch": 0.56875,
"grad_norm": 0.48939380049705505,
"learning_rate": 4.339748427672956e-06,
"loss": 2.2687278747558595,
"step": 45500
},
{
"epoch": 0.57,
"grad_norm": 2.690356492996216,
"learning_rate": 4.327169811320755e-06,
"loss": 2.068523406982422,
"step": 45600
},
{
"epoch": 0.57125,
"grad_norm": 24.331865310668945,
"learning_rate": 4.3145911949685536e-06,
"loss": 2.371395721435547,
"step": 45700
},
{
"epoch": 0.5725,
"grad_norm": 19.168750762939453,
"learning_rate": 4.302012578616352e-06,
"loss": 2.48111083984375,
"step": 45800
},
{
"epoch": 0.57375,
"grad_norm": 1.1316941976547241,
"learning_rate": 4.289433962264152e-06,
"loss": 2.1914462280273437,
"step": 45900
},
{
"epoch": 0.575,
"grad_norm": 0.7238625884056091,
"learning_rate": 4.27685534591195e-06,
"loss": 2.408410949707031,
"step": 46000
},
{
"epoch": 0.57625,
"grad_norm": 18.509540557861328,
"learning_rate": 4.264276729559749e-06,
"loss": 2.5385202026367186,
"step": 46100
},
{
"epoch": 0.5775,
"grad_norm": 24.170326232910156,
"learning_rate": 4.251698113207547e-06,
"loss": 2.726481628417969,
"step": 46200
},
{
"epoch": 0.57875,
"grad_norm": 5.890894889831543,
"learning_rate": 4.239119496855346e-06,
"loss": 2.3738558959960936,
"step": 46300
},
{
"epoch": 0.58,
"grad_norm": 0.009730951860547066,
"learning_rate": 4.2265408805031445e-06,
"loss": 2.6054721069335938,
"step": 46400
},
{
"epoch": 0.58125,
"grad_norm": 24.269445419311523,
"learning_rate": 4.213962264150944e-06,
"loss": 2.3926129150390625,
"step": 46500
},
{
"epoch": 0.5825,
"grad_norm": 20.738441467285156,
"learning_rate": 4.2013836477987425e-06,
"loss": 2.491241455078125,
"step": 46600
},
{
"epoch": 0.58375,
"grad_norm": 1.0278619527816772,
"learning_rate": 4.188805031446541e-06,
"loss": 2.247518768310547,
"step": 46700
},
{
"epoch": 0.585,
"grad_norm": 4.836909294128418,
"learning_rate": 4.17622641509434e-06,
"loss": 2.104373016357422,
"step": 46800
},
{
"epoch": 0.58625,
"grad_norm": 31.359527587890625,
"learning_rate": 4.163647798742138e-06,
"loss": 2.190884704589844,
"step": 46900
},
{
"epoch": 0.5875,
"grad_norm": 33.07499313354492,
"learning_rate": 4.151069182389938e-06,
"loss": 2.332211608886719,
"step": 47000
},
{
"epoch": 0.58875,
"grad_norm": 18.392343521118164,
"learning_rate": 4.138490566037736e-06,
"loss": 2.077041015625,
"step": 47100
},
{
"epoch": 0.59,
"grad_norm": 0.6778843998908997,
"learning_rate": 4.125911949685535e-06,
"loss": 2.148955993652344,
"step": 47200
},
{
"epoch": 0.59125,
"grad_norm": 29.361854553222656,
"learning_rate": 4.1133333333333335e-06,
"loss": 2.404347381591797,
"step": 47300
},
{
"epoch": 0.5925,
"grad_norm": 21.975435256958008,
"learning_rate": 4.100754716981132e-06,
"loss": 2.4371409606933594,
"step": 47400
},
{
"epoch": 0.59375,
"grad_norm": 0.4038066864013672,
"learning_rate": 4.0881761006289315e-06,
"loss": 2.4506686401367186,
"step": 47500
},
{
"epoch": 0.595,
"grad_norm": 0.0259912870824337,
"learning_rate": 4.07559748427673e-06,
"loss": 2.2095245361328124,
"step": 47600
},
{
"epoch": 0.59625,
"grad_norm": 24.107616424560547,
"learning_rate": 4.063018867924529e-06,
"loss": 2.3150425720214844,
"step": 47700
},
{
"epoch": 0.5975,
"grad_norm": 22.568410873413086,
"learning_rate": 4.050440251572327e-06,
"loss": 2.3050765991210938,
"step": 47800
},
{
"epoch": 0.59875,
"grad_norm": 1.4052597284317017,
"learning_rate": 4.037861635220126e-06,
"loss": 2.3776676940917967,
"step": 47900
},
{
"epoch": 0.6,
"grad_norm": 5.013542652130127,
"learning_rate": 4.0252830188679244e-06,
"loss": 2.1164060974121095,
"step": 48000
},
{
"epoch": 0.60125,
"grad_norm": 24.288990020751953,
"learning_rate": 4.012704402515724e-06,
"loss": 2.0022381591796874,
"step": 48100
},
{
"epoch": 0.6025,
"grad_norm": 26.07032012939453,
"learning_rate": 4.0001257861635224e-06,
"loss": 2.1755996704101563,
"step": 48200
},
{
"epoch": 0.60375,
"grad_norm": 1.911749005317688,
"learning_rate": 3.987547169811321e-06,
"loss": 2.01885986328125,
"step": 48300
},
{
"epoch": 0.605,
"grad_norm": 12.477325439453125,
"learning_rate": 3.97496855345912e-06,
"loss": 2.368613739013672,
"step": 48400
},
{
"epoch": 0.60625,
"grad_norm": 21.526126861572266,
"learning_rate": 3.962389937106918e-06,
"loss": 2.3319718933105467,
"step": 48500
},
{
"epoch": 0.6075,
"grad_norm": 21.947265625,
"learning_rate": 3.949811320754718e-06,
"loss": 2.5695175170898437,
"step": 48600
},
{
"epoch": 0.60875,
"grad_norm": 9.083207130432129,
"learning_rate": 3.937232704402516e-06,
"loss": 2.4009765625,
"step": 48700
},
{
"epoch": 0.61,
"grad_norm": 9.588478088378906,
"learning_rate": 3.924654088050315e-06,
"loss": 2.1275369262695314,
"step": 48800
},
{
"epoch": 0.61125,
"grad_norm": 16.444225311279297,
"learning_rate": 3.912075471698113e-06,
"loss": 2.22287841796875,
"step": 48900
},
{
"epoch": 0.6125,
"grad_norm": 20.83136558532715,
"learning_rate": 3.899496855345912e-06,
"loss": 2.3845196533203126,
"step": 49000
},
{
"epoch": 0.61375,
"grad_norm": 10.17677116394043,
"learning_rate": 3.886918238993711e-06,
"loss": 2.190762176513672,
"step": 49100
},
{
"epoch": 0.615,
"grad_norm": 0.006293127313256264,
"learning_rate": 3.87433962264151e-06,
"loss": 2.2727685546875,
"step": 49200
},
{
"epoch": 0.61625,
"grad_norm": 25.517677307128906,
"learning_rate": 3.861761006289309e-06,
"loss": 2.267947082519531,
"step": 49300
},
{
"epoch": 0.6175,
"grad_norm": 28.428192138671875,
"learning_rate": 3.849182389937107e-06,
"loss": 2.418175048828125,
"step": 49400
},
{
"epoch": 0.61875,
"grad_norm": 27.646535873413086,
"learning_rate": 3.836603773584906e-06,
"loss": 2.280270233154297,
"step": 49500
},
{
"epoch": 0.62,
"grad_norm": 0.03348470479249954,
"learning_rate": 3.824025157232704e-06,
"loss": 2.0512631225585936,
"step": 49600
},
{
"epoch": 0.62125,
"grad_norm": 29.742656707763672,
"learning_rate": 3.8114465408805033e-06,
"loss": 2.4671990966796873,
"step": 49700
},
{
"epoch": 0.6225,
"grad_norm": 23.064598083496094,
"learning_rate": 3.7988679245283024e-06,
"loss": 2.4474307250976564,
"step": 49800
},
{
"epoch": 0.62375,
"grad_norm": 2.3941636085510254,
"learning_rate": 3.786289308176101e-06,
"loss": 2.03037109375,
"step": 49900
},
{
"epoch": 0.625,
"grad_norm": 0.19696100056171417,
"learning_rate": 3.7737106918238995e-06,
"loss": 2.0240495300292967,
"step": 50000
},
{
"epoch": 0.62625,
"grad_norm": 26.428871154785156,
"learning_rate": 3.7611320754716985e-06,
"loss": 2.200367889404297,
"step": 50100
},
{
"epoch": 0.6275,
"grad_norm": 23.72710609436035,
"learning_rate": 3.748553459119497e-06,
"loss": 2.7000701904296873,
"step": 50200
},
{
"epoch": 0.62875,
"grad_norm": 27.32769012451172,
"learning_rate": 3.735974842767296e-06,
"loss": 2.0785140991210938,
"step": 50300
},
{
"epoch": 0.63,
"grad_norm": 32.80459213256836,
"learning_rate": 3.7233962264150947e-06,
"loss": 2.2123362731933596,
"step": 50400
},
{
"epoch": 0.63125,
"grad_norm": 26.444387435913086,
"learning_rate": 3.7108176100628933e-06,
"loss": 2.479073944091797,
"step": 50500
},
{
"epoch": 0.6325,
"grad_norm": 28.786527633666992,
"learning_rate": 3.6982389937106923e-06,
"loss": 2.289686737060547,
"step": 50600
},
{
"epoch": 0.63375,
"grad_norm": 0.1663801074028015,
"learning_rate": 3.685660377358491e-06,
"loss": 2.1976898193359373,
"step": 50700
},
{
"epoch": 0.635,
"grad_norm": 30.683305740356445,
"learning_rate": 3.6730817610062895e-06,
"loss": 2.2509585571289064,
"step": 50800
},
{
"epoch": 0.63625,
"grad_norm": 22.308727264404297,
"learning_rate": 3.6605031446540885e-06,
"loss": 2.5132717895507812,
"step": 50900
},
{
"epoch": 0.6375,
"grad_norm": 19.899017333984375,
"learning_rate": 3.647924528301887e-06,
"loss": 2.246257781982422,
"step": 51000
},
{
"epoch": 0.63875,
"grad_norm": 0.14116325974464417,
"learning_rate": 3.635345911949686e-06,
"loss": 2.05459228515625,
"step": 51100
},
{
"epoch": 0.64,
"grad_norm": 0.020385975018143654,
"learning_rate": 3.6227672955974847e-06,
"loss": 2.099994812011719,
"step": 51200
},
{
"epoch": 0.64125,
"grad_norm": 17.793832778930664,
"learning_rate": 3.6101886792452833e-06,
"loss": 2.259058074951172,
"step": 51300
},
{
"epoch": 0.6425,
"grad_norm": 21.187685012817383,
"learning_rate": 3.5976100628930823e-06,
"loss": 2.204525146484375,
"step": 51400
},
{
"epoch": 0.64375,
"grad_norm": 1.0163244009017944,
"learning_rate": 3.585031446540881e-06,
"loss": 2.2062753295898436,
"step": 51500
},
{
"epoch": 0.645,
"grad_norm": 0.44557446241378784,
"learning_rate": 3.5724528301886794e-06,
"loss": 2.1636553955078126,
"step": 51600
},
{
"epoch": 0.64625,
"grad_norm": 22.489730834960938,
"learning_rate": 3.5598742138364784e-06,
"loss": 2.343136444091797,
"step": 51700
},
{
"epoch": 0.6475,
"grad_norm": 27.255525588989258,
"learning_rate": 3.547295597484277e-06,
"loss": 2.585829162597656,
"step": 51800
},
{
"epoch": 0.64875,
"grad_norm": 0.30436962842941284,
"learning_rate": 3.534716981132076e-06,
"loss": 2.0592010498046873,
"step": 51900
},
{
"epoch": 0.65,
"grad_norm": 0.008368916809558868,
"learning_rate": 3.5221383647798746e-06,
"loss": 2.0964291381835936,
"step": 52000
},
{
"epoch": 0.65125,
"grad_norm": 25.623943328857422,
"learning_rate": 3.509559748427673e-06,
"loss": 2.2132614135742186,
"step": 52100
},
{
"epoch": 0.6525,
"grad_norm": 23.970529556274414,
"learning_rate": 3.4969811320754722e-06,
"loss": 2.2939547729492187,
"step": 52200
},
{
"epoch": 0.65375,
"grad_norm": 0.13999006152153015,
"learning_rate": 3.484402515723271e-06,
"loss": 2.1940087890625,
"step": 52300
},
{
"epoch": 0.655,
"grad_norm": 6.885776519775391,
"learning_rate": 3.4718238993710694e-06,
"loss": 2.3822996520996096,
"step": 52400
},
{
"epoch": 0.65625,
"grad_norm": 28.326892852783203,
"learning_rate": 3.4592452830188684e-06,
"loss": 2.7556744384765626,
"step": 52500
},
{
"epoch": 0.6575,
"grad_norm": 29.53263282775879,
"learning_rate": 3.446666666666667e-06,
"loss": 2.4048992919921877,
"step": 52600
},
{
"epoch": 0.65875,
"grad_norm": 9.168279647827148,
"learning_rate": 3.434088050314466e-06,
"loss": 2.3623193359375,
"step": 52700
},
{
"epoch": 0.66,
"grad_norm": 51.9276123046875,
"learning_rate": 3.4215094339622646e-06,
"loss": 2.3159774780273437,
"step": 52800
},
{
"epoch": 0.66125,
"grad_norm": 34.09861373901367,
"learning_rate": 3.408930817610063e-06,
"loss": 2.254779968261719,
"step": 52900
},
{
"epoch": 0.6625,
"grad_norm": 24.50155258178711,
"learning_rate": 3.396352201257862e-06,
"loss": 2.4943356323242187,
"step": 53000
},
{
"epoch": 0.66375,
"grad_norm": 1.6096267700195312,
"learning_rate": 3.3837735849056608e-06,
"loss": 2.4191680908203126,
"step": 53100
},
{
"epoch": 0.665,
"grad_norm": 9.347710609436035,
"learning_rate": 3.3711949685534593e-06,
"loss": 2.3084637451171877,
"step": 53200
},
{
"epoch": 0.66625,
"grad_norm": 27.17746925354004,
"learning_rate": 3.3586163522012584e-06,
"loss": 2.3632272338867186,
"step": 53300
},
{
"epoch": 0.6675,
"grad_norm": 19.401647567749023,
"learning_rate": 3.346037735849057e-06,
"loss": 2.4689659118652343,
"step": 53400
},
{
"epoch": 0.66875,
"grad_norm": 0.02893674187362194,
"learning_rate": 3.333459119496856e-06,
"loss": 2.2349473571777345,
"step": 53500
},
{
"epoch": 0.67,
"grad_norm": 0.308145672082901,
"learning_rate": 3.3208805031446545e-06,
"loss": 2.4012570190429687,
"step": 53600
},
{
"epoch": 0.67125,
"grad_norm": 27.814510345458984,
"learning_rate": 3.308301886792453e-06,
"loss": 2.364227752685547,
"step": 53700
},
{
"epoch": 0.6725,
"grad_norm": 30.748811721801758,
"learning_rate": 3.295723270440252e-06,
"loss": 2.298586883544922,
"step": 53800
},
{
"epoch": 0.67375,
"grad_norm": 12.556225776672363,
"learning_rate": 3.2831446540880507e-06,
"loss": 2.026658935546875,
"step": 53900
},
{
"epoch": 0.675,
"grad_norm": 32.828857421875,
"learning_rate": 3.2705660377358493e-06,
"loss": 2.2101339721679687,
"step": 54000
},
{
"epoch": 0.67625,
"grad_norm": 41.711971282958984,
"learning_rate": 3.2579874213836483e-06,
"loss": 2.2615397644042967,
"step": 54100
},
{
"epoch": 0.6775,
"grad_norm": 22.22212791442871,
"learning_rate": 3.245408805031447e-06,
"loss": 2.264314880371094,
"step": 54200
},
{
"epoch": 0.67875,
"grad_norm": 0.31344112753868103,
"learning_rate": 3.232830188679246e-06,
"loss": 1.843946990966797,
"step": 54300
},
{
"epoch": 0.68,
"grad_norm": 11.150541305541992,
"learning_rate": 3.2202515723270445e-06,
"loss": 2.107444305419922,
"step": 54400
},
{
"epoch": 0.68125,
"grad_norm": 27.848705291748047,
"learning_rate": 3.207672955974843e-06,
"loss": 2.6434942626953126,
"step": 54500
},
{
"epoch": 0.6825,
"grad_norm": 20.511205673217773,
"learning_rate": 3.195094339622642e-06,
"loss": 2.2614511108398436,
"step": 54600
},
{
"epoch": 0.68375,
"grad_norm": 15.717256546020508,
"learning_rate": 3.1825157232704407e-06,
"loss": 2.4019781494140626,
"step": 54700
},
{
"epoch": 0.685,
"grad_norm": 15.28030014038086,
"learning_rate": 3.1699371069182393e-06,
"loss": 2.1407426452636718,
"step": 54800
},
{
"epoch": 0.68625,
"grad_norm": 26.36769676208496,
"learning_rate": 3.1573584905660383e-06,
"loss": 2.4883404541015626,
"step": 54900
},
{
"epoch": 0.6875,
"grad_norm": 30.11307144165039,
"learning_rate": 3.144779874213837e-06,
"loss": 2.4928182983398437,
"step": 55000
},
{
"epoch": 0.68875,
"grad_norm": 3.050213575363159,
"learning_rate": 3.132201257861636e-06,
"loss": 2.242222900390625,
"step": 55100
},
{
"epoch": 0.69,
"grad_norm": 0.08332010358572006,
"learning_rate": 3.1196226415094344e-06,
"loss": 2.095365447998047,
"step": 55200
},
{
"epoch": 0.69125,
"grad_norm": 25.02206802368164,
"learning_rate": 3.107044025157233e-06,
"loss": 2.2118115234375,
"step": 55300
},
{
"epoch": 0.6925,
"grad_norm": 21.8033390045166,
"learning_rate": 3.094465408805032e-06,
"loss": 2.372667236328125,
"step": 55400
},
{
"epoch": 0.69375,
"grad_norm": 0.31131941080093384,
"learning_rate": 3.0818867924528306e-06,
"loss": 2.4498031616210936,
"step": 55500
},
{
"epoch": 0.695,
"grad_norm": 1.234559416770935,
"learning_rate": 3.069308176100629e-06,
"loss": 2.095816650390625,
"step": 55600
},
{
"epoch": 0.69625,
"grad_norm": 24.62751007080078,
"learning_rate": 3.0567295597484282e-06,
"loss": 2.246084442138672,
"step": 55700
},
{
"epoch": 0.6975,
"grad_norm": 32.08023452758789,
"learning_rate": 3.044150943396227e-06,
"loss": 2.2728422546386717,
"step": 55800
},
{
"epoch": 0.69875,
"grad_norm": 1.7580287456512451,
"learning_rate": 3.0315723270440254e-06,
"loss": 1.912986297607422,
"step": 55900
},
{
"epoch": 0.7,
"grad_norm": 0.024206427857279778,
"learning_rate": 3.0189937106918244e-06,
"loss": 2.12907470703125,
"step": 56000
},
{
"epoch": 0.70125,
"grad_norm": 26.95330810546875,
"learning_rate": 3.006415094339623e-06,
"loss": 2.421792297363281,
"step": 56100
},
{
"epoch": 0.7025,
"grad_norm": 22.763641357421875,
"learning_rate": 2.993836477987422e-06,
"loss": 2.2753536987304686,
"step": 56200
},
{
"epoch": 0.70375,
"grad_norm": 6.763653755187988,
"learning_rate": 2.9812578616352206e-06,
"loss": 2.017315673828125,
"step": 56300
},
{
"epoch": 0.705,
"grad_norm": 0.030549824237823486,
"learning_rate": 2.968679245283019e-06,
"loss": 2.2717471313476563,
"step": 56400
},
{
"epoch": 0.70625,
"grad_norm": 30.307323455810547,
"learning_rate": 2.956100628930818e-06,
"loss": 2.375959777832031,
"step": 56500
},
{
"epoch": 0.7075,
"grad_norm": 22.114675521850586,
"learning_rate": 2.9435220125786168e-06,
"loss": 2.2014059448242187,
"step": 56600
},
{
"epoch": 0.70875,
"grad_norm": 0.3245386779308319,
"learning_rate": 2.9309433962264153e-06,
"loss": 1.9942924499511718,
"step": 56700
},
{
"epoch": 0.71,
"grad_norm": 0.012478250078856945,
"learning_rate": 2.9183647798742144e-06,
"loss": 2.166055908203125,
"step": 56800
},
{
"epoch": 0.71125,
"grad_norm": 25.91703987121582,
"learning_rate": 2.905786163522013e-06,
"loss": 2.2955760192871093,
"step": 56900
},
{
"epoch": 0.7125,
"grad_norm": 23.741256713867188,
"learning_rate": 2.893207547169812e-06,
"loss": 2.308643493652344,
"step": 57000
},
{
"epoch": 0.71375,
"grad_norm": 32.72561264038086,
"learning_rate": 2.8806289308176105e-06,
"loss": 2.2564122009277345,
"step": 57100
},
{
"epoch": 0.715,
"grad_norm": 0.09459064900875092,
"learning_rate": 2.868050314465409e-06,
"loss": 2.263880615234375,
"step": 57200
},
{
"epoch": 0.71625,
"grad_norm": 31.514862060546875,
"learning_rate": 2.855471698113208e-06,
"loss": 2.22376953125,
"step": 57300
},
{
"epoch": 0.7175,
"grad_norm": 23.945999145507812,
"learning_rate": 2.8428930817610067e-06,
"loss": 2.121052703857422,
"step": 57400
},
{
"epoch": 0.71875,
"grad_norm": 11.593914985656738,
"learning_rate": 2.8303144654088053e-06,
"loss": 2.0279052734375,
"step": 57500
},
{
"epoch": 0.72,
"grad_norm": 0.03185856342315674,
"learning_rate": 2.8177358490566043e-06,
"loss": 1.9632949829101562,
"step": 57600
},
{
"epoch": 0.72125,
"grad_norm": 26.430574417114258,
"learning_rate": 2.805157232704403e-06,
"loss": 2.224911804199219,
"step": 57700
},
{
"epoch": 0.7225,
"grad_norm": 23.202377319335938,
"learning_rate": 2.792578616352202e-06,
"loss": 2.3928271484375,
"step": 57800
},
{
"epoch": 0.72375,
"grad_norm": 6.8675127029418945,
"learning_rate": 2.7800000000000005e-06,
"loss": 2.345365447998047,
"step": 57900
},
{
"epoch": 0.725,
"grad_norm": 7.9162750244140625,
"learning_rate": 2.767421383647799e-06,
"loss": 2.40666015625,
"step": 58000
},
{
"epoch": 0.72625,
"grad_norm": 28.883695602416992,
"learning_rate": 2.754842767295598e-06,
"loss": 2.1290104675292967,
"step": 58100
},
{
"epoch": 0.7275,
"grad_norm": 28.86813735961914,
"learning_rate": 2.7422641509433967e-06,
"loss": 2.3013229370117188,
"step": 58200
},
{
"epoch": 0.72875,
"grad_norm": 0.05526283383369446,
"learning_rate": 2.7296855345911953e-06,
"loss": 2.349634246826172,
"step": 58300
},
{
"epoch": 0.73,
"grad_norm": 0.46888020634651184,
"learning_rate": 2.7171069182389943e-06,
"loss": 2.269898681640625,
"step": 58400
},
{
"epoch": 0.73125,
"grad_norm": 21.197877883911133,
"learning_rate": 2.704528301886793e-06,
"loss": 2.294650115966797,
"step": 58500
},
{
"epoch": 0.7325,
"grad_norm": 28.490921020507812,
"learning_rate": 2.691949685534592e-06,
"loss": 2.507839813232422,
"step": 58600
},
{
"epoch": 0.73375,
"grad_norm": 0.9312068819999695,
"learning_rate": 2.6793710691823904e-06,
"loss": 2.1638980102539063,
"step": 58700
},
{
"epoch": 0.735,
"grad_norm": 7.32960319519043,
"learning_rate": 2.666792452830189e-06,
"loss": 2.1146189880371096,
"step": 58800
},
{
"epoch": 0.73625,
"grad_norm": 16.3245792388916,
"learning_rate": 2.654213836477988e-06,
"loss": 2.353025360107422,
"step": 58900
},
{
"epoch": 0.7375,
"grad_norm": 28.876869201660156,
"learning_rate": 2.641635220125786e-06,
"loss": 2.3220205688476563,
"step": 59000
},
{
"epoch": 0.73875,
"grad_norm": 1.1006479263305664,
"learning_rate": 2.6290566037735848e-06,
"loss": 2.3847764587402343,
"step": 59100
},
{
"epoch": 0.74,
"grad_norm": 0.052632566541433334,
"learning_rate": 2.616477987421384e-06,
"loss": 1.9747433471679687,
"step": 59200
},
{
"epoch": 0.74125,
"grad_norm": 27.36806297302246,
"learning_rate": 2.6038993710691824e-06,
"loss": 2.234722137451172,
"step": 59300
},
{
"epoch": 0.7425,
"grad_norm": 32.45076370239258,
"learning_rate": 2.591320754716981e-06,
"loss": 2.2224740600585937,
"step": 59400
},
{
"epoch": 0.74375,
"grad_norm": 0.18155953288078308,
"learning_rate": 2.57874213836478e-06,
"loss": 2.1847653198242187,
"step": 59500
},
{
"epoch": 0.745,
"grad_norm": 21.358049392700195,
"learning_rate": 2.5661635220125786e-06,
"loss": 2.1930168151855467,
"step": 59600
},
{
"epoch": 0.74625,
"grad_norm": 23.599361419677734,
"learning_rate": 2.553584905660377e-06,
"loss": 2.13337646484375,
"step": 59700
},
{
"epoch": 0.7475,
"grad_norm": 25.336759567260742,
"learning_rate": 2.541006289308176e-06,
"loss": 2.245163879394531,
"step": 59800
},
{
"epoch": 0.74875,
"grad_norm": 14.410560607910156,
"learning_rate": 2.5284276729559747e-06,
"loss": 2.1882379150390623,
"step": 59900
},
{
"epoch": 0.75,
"grad_norm": 17.1163330078125,
"learning_rate": 2.5158490566037737e-06,
"loss": 1.899642791748047,
"step": 60000
},
{
"epoch": 0.75125,
"grad_norm": 32.30154800415039,
"learning_rate": 2.5032704402515723e-06,
"loss": 2.124136962890625,
"step": 60100
},
{
"epoch": 0.7525,
"grad_norm": 34.646514892578125,
"learning_rate": 2.4906918238993713e-06,
"loss": 2.2636474609375,
"step": 60200
},
{
"epoch": 0.75375,
"grad_norm": 0.17916053533554077,
"learning_rate": 2.47811320754717e-06,
"loss": 2.10548095703125,
"step": 60300
},
{
"epoch": 0.755,
"grad_norm": 0.3836560845375061,
"learning_rate": 2.465534591194969e-06,
"loss": 2.248681640625,
"step": 60400
},
{
"epoch": 0.75625,
"grad_norm": 25.696950912475586,
"learning_rate": 2.4529559748427675e-06,
"loss": 2.4749154663085937,
"step": 60500
},
{
"epoch": 0.7575,
"grad_norm": 11.523337364196777,
"learning_rate": 2.440377358490566e-06,
"loss": 2.1865379333496096,
"step": 60600
},
{
"epoch": 0.75875,
"grad_norm": 0.2725467383861542,
"learning_rate": 2.427798742138365e-06,
"loss": 2.069666442871094,
"step": 60700
},
{
"epoch": 0.76,
"grad_norm": 0.008206835016608238,
"learning_rate": 2.4152201257861637e-06,
"loss": 2.1584426879882814,
"step": 60800
},
{
"epoch": 0.76125,
"grad_norm": 28.906057357788086,
"learning_rate": 2.4026415094339627e-06,
"loss": 2.254979705810547,
"step": 60900
},
{
"epoch": 0.7625,
"grad_norm": 12.576886177062988,
"learning_rate": 2.3900628930817613e-06,
"loss": 2.25433349609375,
"step": 61000
},
{
"epoch": 0.76375,
"grad_norm": 10.839340209960938,
"learning_rate": 2.37748427672956e-06,
"loss": 2.084885711669922,
"step": 61100
},
{
"epoch": 0.765,
"grad_norm": 0.06599520146846771,
"learning_rate": 2.364905660377359e-06,
"loss": 2.23760986328125,
"step": 61200
},
{
"epoch": 0.76625,
"grad_norm": 31.791736602783203,
"learning_rate": 2.3523270440251575e-06,
"loss": 2.42175537109375,
"step": 61300
},
{
"epoch": 0.7675,
"grad_norm": 29.30803680419922,
"learning_rate": 2.339748427672956e-06,
"loss": 2.215085754394531,
"step": 61400
},
{
"epoch": 0.76875,
"grad_norm": 5.6741461753845215,
"learning_rate": 2.327169811320755e-06,
"loss": 2.066580810546875,
"step": 61500
},
{
"epoch": 0.77,
"grad_norm": 0.006725333631038666,
"learning_rate": 2.3145911949685537e-06,
"loss": 2.1789097595214844,
"step": 61600
},
{
"epoch": 0.77125,
"grad_norm": 21.000843048095703,
"learning_rate": 2.3020125786163527e-06,
"loss": 2.4083587646484377,
"step": 61700
},
{
"epoch": 0.7725,
"grad_norm": 24.83283042907715,
"learning_rate": 2.2894339622641513e-06,
"loss": 2.813940124511719,
"step": 61800
},
{
"epoch": 0.77375,
"grad_norm": 0.34976866841316223,
"learning_rate": 2.27685534591195e-06,
"loss": 2.1371139526367187,
"step": 61900
},
{
"epoch": 0.775,
"grad_norm": 0.2197369635105133,
"learning_rate": 2.264276729559749e-06,
"loss": 1.9622064208984376,
"step": 62000
},
{
"epoch": 0.77625,
"grad_norm": 23.31989860534668,
"learning_rate": 2.2516981132075474e-06,
"loss": 2.1933587646484374,
"step": 62100
},
{
"epoch": 0.7775,
"grad_norm": 26.234939575195312,
"learning_rate": 2.239119496855346e-06,
"loss": 2.341946716308594,
"step": 62200
},
{
"epoch": 0.77875,
"grad_norm": 27.059640884399414,
"learning_rate": 2.226540880503145e-06,
"loss": 2.026497497558594,
"step": 62300
},
{
"epoch": 0.78,
"grad_norm": 10.47779369354248,
"learning_rate": 2.2139622641509436e-06,
"loss": 2.1496810913085938,
"step": 62400
},
{
"epoch": 0.78125,
"grad_norm": 14.565492630004883,
"learning_rate": 2.2013836477987426e-06,
"loss": 2.1073020935058593,
"step": 62500
},
{
"epoch": 0.7825,
"grad_norm": 26.76126480102539,
"learning_rate": 2.188805031446541e-06,
"loss": 2.1374913024902344,
"step": 62600
},
{
"epoch": 0.78375,
"grad_norm": 1.1460466384887695,
"learning_rate": 2.17622641509434e-06,
"loss": 2.4106900024414064,
"step": 62700
},
{
"epoch": 0.785,
"grad_norm": 0.04275045916438103,
"learning_rate": 2.163647798742139e-06,
"loss": 1.977836151123047,
"step": 62800
},
{
"epoch": 0.78625,
"grad_norm": 29.69572639465332,
"learning_rate": 2.1510691823899374e-06,
"loss": 2.211481628417969,
"step": 62900
},
{
"epoch": 0.7875,
"grad_norm": 27.114084243774414,
"learning_rate": 2.138490566037736e-06,
"loss": 2.3308370971679686,
"step": 63000
},
{
"epoch": 0.78875,
"grad_norm": 0.024876583367586136,
"learning_rate": 2.125911949685535e-06,
"loss": 2.2788821411132814,
"step": 63100
},
{
"epoch": 0.79,
"grad_norm": 0.011269732378423214,
"learning_rate": 2.1133333333333336e-06,
"loss": 2.2254403686523436,
"step": 63200
},
{
"epoch": 0.79125,
"grad_norm": 29.721452713012695,
"learning_rate": 2.1007547169811326e-06,
"loss": 2.2361138916015624,
"step": 63300
},
{
"epoch": 0.7925,
"grad_norm": 26.52519416809082,
"learning_rate": 2.088176100628931e-06,
"loss": 2.304778747558594,
"step": 63400
},
{
"epoch": 0.79375,
"grad_norm": 0.33907511830329895,
"learning_rate": 2.0755974842767297e-06,
"loss": 2.1444125366210938,
"step": 63500
},
{
"epoch": 0.795,
"grad_norm": 0.848170280456543,
"learning_rate": 2.0630188679245288e-06,
"loss": 2.224850311279297,
"step": 63600
},
{
"epoch": 0.79625,
"grad_norm": 30.878385543823242,
"learning_rate": 2.0504402515723273e-06,
"loss": 2.3003367614746093,
"step": 63700
},
{
"epoch": 0.7975,
"grad_norm": 21.298805236816406,
"learning_rate": 2.037861635220126e-06,
"loss": 2.251806640625,
"step": 63800
},
{
"epoch": 0.79875,
"grad_norm": 1.4935413599014282,
"learning_rate": 2.025283018867925e-06,
"loss": 2.0355984497070314,
"step": 63900
},
{
"epoch": 0.8,
"grad_norm": 0.23573997616767883,
"learning_rate": 2.012704402515723e-06,
"loss": 2.3094590759277343,
"step": 64000
},
{
"epoch": 0.80125,
"grad_norm": 21.646190643310547,
"learning_rate": 2.000125786163522e-06,
"loss": 2.45231689453125,
"step": 64100
},
{
"epoch": 0.8025,
"grad_norm": 26.4865779876709,
"learning_rate": 1.9875471698113207e-06,
"loss": 2.481203918457031,
"step": 64200
},
{
"epoch": 0.80375,
"grad_norm": 16.815086364746094,
"learning_rate": 1.9749685534591197e-06,
"loss": 2.274866180419922,
"step": 64300
},
{
"epoch": 0.805,
"grad_norm": 1.5168429613113403,
"learning_rate": 1.9623899371069183e-06,
"loss": 2.090848083496094,
"step": 64400
},
{
"epoch": 0.80625,
"grad_norm": 24.665088653564453,
"learning_rate": 1.949811320754717e-06,
"loss": 2.149015197753906,
"step": 64500
},
{
"epoch": 0.8075,
"grad_norm": 44.3161735534668,
"learning_rate": 1.937232704402516e-06,
"loss": 2.267424774169922,
"step": 64600
},
{
"epoch": 0.80875,
"grad_norm": 14.768138885498047,
"learning_rate": 1.9246540880503145e-06,
"loss": 2.1676406860351562,
"step": 64700
},
{
"epoch": 0.81,
"grad_norm": 23.735509872436523,
"learning_rate": 1.912075471698113e-06,
"loss": 2.066508026123047,
"step": 64800
},
{
"epoch": 0.81125,
"grad_norm": 27.178951263427734,
"learning_rate": 1.899496855345912e-06,
"loss": 2.202220916748047,
"step": 64900
},
{
"epoch": 0.8125,
"grad_norm": 27.76671600341797,
"learning_rate": 1.8869182389937107e-06,
"loss": 2.390093994140625,
"step": 65000
},
{
"epoch": 0.81375,
"grad_norm": 22.02654266357422,
"learning_rate": 1.8743396226415094e-06,
"loss": 2.02405517578125,
"step": 65100
},
{
"epoch": 0.815,
"grad_norm": 0.019850876182317734,
"learning_rate": 1.8617610062893082e-06,
"loss": 2.1032846069335935,
"step": 65200
},
{
"epoch": 0.81625,
"grad_norm": 26.451114654541016,
"learning_rate": 1.849182389937107e-06,
"loss": 2.0573655700683595,
"step": 65300
},
{
"epoch": 0.8175,
"grad_norm": 25.203014373779297,
"learning_rate": 1.8366037735849056e-06,
"loss": 2.23822265625,
"step": 65400
},
{
"epoch": 0.81875,
"grad_norm": 0.07573448121547699,
"learning_rate": 1.8240251572327044e-06,
"loss": 2.170495910644531,
"step": 65500
},
{
"epoch": 0.82,
"grad_norm": 0.6197527647018433,
"learning_rate": 1.8114465408805032e-06,
"loss": 2.0060690307617186,
"step": 65600
},
{
"epoch": 0.82125,
"grad_norm": 29.55573081970215,
"learning_rate": 1.798867924528302e-06,
"loss": 2.104353942871094,
"step": 65700
},
{
"epoch": 0.8225,
"grad_norm": 13.039762496948242,
"learning_rate": 1.7862893081761006e-06,
"loss": 1.940672607421875,
"step": 65800
},
{
"epoch": 0.82375,
"grad_norm": 10.955610275268555,
"learning_rate": 1.7737106918238994e-06,
"loss": 1.850831298828125,
"step": 65900
},
{
"epoch": 0.825,
"grad_norm": 28.8968505859375,
"learning_rate": 1.7611320754716982e-06,
"loss": 2.2148202514648436,
"step": 66000
},
{
"epoch": 0.82625,
"grad_norm": 20.05891990661621,
"learning_rate": 1.748553459119497e-06,
"loss": 2.0501080322265626,
"step": 66100
},
{
"epoch": 0.8275,
"grad_norm": 29.854055404663086,
"learning_rate": 1.7359748427672956e-06,
"loss": 2.338354187011719,
"step": 66200
},
{
"epoch": 0.82875,
"grad_norm": 0.17976756393909454,
"learning_rate": 1.7233962264150944e-06,
"loss": 2.372988739013672,
"step": 66300
},
{
"epoch": 0.83,
"grad_norm": 14.72498893737793,
"learning_rate": 1.7108176100628932e-06,
"loss": 2.317147674560547,
"step": 66400
},
{
"epoch": 0.83125,
"grad_norm": 25.00243377685547,
"learning_rate": 1.698238993710692e-06,
"loss": 2.546220550537109,
"step": 66500
},
{
"epoch": 0.8325,
"grad_norm": 28.58661460876465,
"learning_rate": 1.6856603773584906e-06,
"loss": 2.251160125732422,
"step": 66600
},
{
"epoch": 0.83375,
"grad_norm": 24.616119384765625,
"learning_rate": 1.6730817610062894e-06,
"loss": 1.8937945556640625,
"step": 66700
},
{
"epoch": 0.835,
"grad_norm": 0.021497901529073715,
"learning_rate": 1.6605031446540882e-06,
"loss": 2.2238792419433593,
"step": 66800
},
{
"epoch": 0.83625,
"grad_norm": 20.581586837768555,
"learning_rate": 1.647924528301887e-06,
"loss": 2.025327606201172,
"step": 66900
},
{
"epoch": 0.8375,
"grad_norm": 29.323177337646484,
"learning_rate": 1.6353459119496855e-06,
"loss": 2.465900115966797,
"step": 67000
},
{
"epoch": 0.83875,
"grad_norm": 39.928653717041016,
"learning_rate": 1.6227672955974843e-06,
"loss": 2.1216752624511717,
"step": 67100
},
{
"epoch": 0.84,
"grad_norm": 0.043776318430900574,
"learning_rate": 1.6101886792452831e-06,
"loss": 2.0655088806152344,
"step": 67200
},
{
"epoch": 0.84125,
"grad_norm": 25.438045501708984,
"learning_rate": 1.597610062893082e-06,
"loss": 2.087451171875,
"step": 67300
},
{
"epoch": 0.8425,
"grad_norm": 27.504056930541992,
"learning_rate": 1.5850314465408805e-06,
"loss": 2.1888687133789064,
"step": 67400
},
{
"epoch": 0.84375,
"grad_norm": 3.616384744644165,
"learning_rate": 1.5724528301886793e-06,
"loss": 2.11557861328125,
"step": 67500
},
{
"epoch": 0.845,
"grad_norm": 0.0108040077611804,
"learning_rate": 1.5598742138364781e-06,
"loss": 1.8791226196289061,
"step": 67600
},
{
"epoch": 0.84625,
"grad_norm": 24.401247024536133,
"learning_rate": 1.547295597484277e-06,
"loss": 2.1741673278808595,
"step": 67700
},
{
"epoch": 0.8475,
"grad_norm": 30.91659164428711,
"learning_rate": 1.5347169811320755e-06,
"loss": 2.1437338256835936,
"step": 67800
},
{
"epoch": 0.84875,
"grad_norm": 0.09920002520084381,
"learning_rate": 1.5221383647798743e-06,
"loss": 2.361597900390625,
"step": 67900
},
{
"epoch": 0.85,
"grad_norm": 0.24567686021327972,
"learning_rate": 1.509559748427673e-06,
"loss": 2.162798309326172,
"step": 68000
},
{
"epoch": 0.85125,
"grad_norm": 30.701915740966797,
"learning_rate": 1.4969811320754719e-06,
"loss": 2.3576284790039064,
"step": 68100
},
{
"epoch": 0.8525,
"grad_norm": 27.654489517211914,
"learning_rate": 1.4844025157232705e-06,
"loss": 2.233973083496094,
"step": 68200
},
{
"epoch": 0.85375,
"grad_norm": 3.693110227584839,
"learning_rate": 1.4718238993710693e-06,
"loss": 2.0266494750976562,
"step": 68300
},
{
"epoch": 0.855,
"grad_norm": 24.848302841186523,
"learning_rate": 1.459245283018868e-06,
"loss": 2.010597839355469,
"step": 68400
},
{
"epoch": 0.85625,
"grad_norm": 29.43360137939453,
"learning_rate": 1.4466666666666669e-06,
"loss": 2.4028634643554687,
"step": 68500
},
{
"epoch": 0.8575,
"grad_norm": 19.056543350219727,
"learning_rate": 1.4340880503144654e-06,
"loss": 2.272985076904297,
"step": 68600
},
{
"epoch": 0.85875,
"grad_norm": 1.6909524202346802,
"learning_rate": 1.4215094339622642e-06,
"loss": 2.3617742919921874,
"step": 68700
},
{
"epoch": 0.86,
"grad_norm": 21.4754695892334,
"learning_rate": 1.408930817610063e-06,
"loss": 2.270048828125,
"step": 68800
},
{
"epoch": 0.86125,
"grad_norm": 23.546085357666016,
"learning_rate": 1.3963522012578618e-06,
"loss": 2.2229489135742186,
"step": 68900
},
{
"epoch": 0.8625,
"grad_norm": 31.1220760345459,
"learning_rate": 1.3837735849056604e-06,
"loss": 2.3392955017089845,
"step": 69000
},
{
"epoch": 0.86375,
"grad_norm": 0.902050256729126,
"learning_rate": 1.3711949685534592e-06,
"loss": 2.018695526123047,
"step": 69100
},
{
"epoch": 0.865,
"grad_norm": 0.15184037387371063,
"learning_rate": 1.358616352201258e-06,
"loss": 2.03200439453125,
"step": 69200
},
{
"epoch": 0.86625,
"grad_norm": 25.513137817382812,
"learning_rate": 1.3460377358490568e-06,
"loss": 2.394747619628906,
"step": 69300
},
{
"epoch": 0.8675,
"grad_norm": 22.844642639160156,
"learning_rate": 1.3334591194968554e-06,
"loss": 2.346148681640625,
"step": 69400
},
{
"epoch": 0.86875,
"grad_norm": 0.07020768523216248,
"learning_rate": 1.3208805031446542e-06,
"loss": 2.269253387451172,
"step": 69500
},
{
"epoch": 0.87,
"grad_norm": 0.2192593514919281,
"learning_rate": 1.308301886792453e-06,
"loss": 2.37726806640625,
"step": 69600
},
{
"epoch": 0.87125,
"grad_norm": 33.360755920410156,
"learning_rate": 1.2957232704402518e-06,
"loss": 2.214630584716797,
"step": 69700
},
{
"epoch": 0.8725,
"grad_norm": 23.013324737548828,
"learning_rate": 1.2831446540880504e-06,
"loss": 2.24765869140625,
"step": 69800
},
{
"epoch": 0.87375,
"grad_norm": 0.6654048562049866,
"learning_rate": 1.2705660377358492e-06,
"loss": 2.5640155029296876,
"step": 69900
},
{
"epoch": 0.875,
"grad_norm": 3.4219138622283936,
"learning_rate": 1.257987421383648e-06,
"loss": 2.13403564453125,
"step": 70000
},
{
"epoch": 0.87625,
"grad_norm": 29.68235969543457,
"learning_rate": 1.2454088050314468e-06,
"loss": 2.29338134765625,
"step": 70100
},
{
"epoch": 0.8775,
"grad_norm": 23.957015991210938,
"learning_rate": 1.2328301886792454e-06,
"loss": 2.2298233032226564,
"step": 70200
},
{
"epoch": 0.87875,
"grad_norm": 17.787887573242188,
"learning_rate": 1.2202515723270442e-06,
"loss": 2.1456781005859376,
"step": 70300
},
{
"epoch": 0.88,
"grad_norm": 6.154158115386963,
"learning_rate": 1.207672955974843e-06,
"loss": 2.085262908935547,
"step": 70400
},
{
"epoch": 0.88125,
"grad_norm": 27.724740982055664,
"learning_rate": 1.1950943396226417e-06,
"loss": 2.1977059936523435,
"step": 70500
},
{
"epoch": 0.8825,
"grad_norm": 25.416337966918945,
"learning_rate": 1.1825157232704403e-06,
"loss": 2.048145751953125,
"step": 70600
},
{
"epoch": 0.88375,
"grad_norm": 30.56559181213379,
"learning_rate": 1.1699371069182391e-06,
"loss": 2.236173553466797,
"step": 70700
},
{
"epoch": 0.885,
"grad_norm": 0.02905646152794361,
"learning_rate": 1.157358490566038e-06,
"loss": 2.1224407958984375,
"step": 70800
},
{
"epoch": 0.88625,
"grad_norm": 22.59400749206543,
"learning_rate": 1.1447798742138367e-06,
"loss": 2.273807373046875,
"step": 70900
},
{
"epoch": 0.8875,
"grad_norm": 20.05291175842285,
"learning_rate": 1.1322012578616353e-06,
"loss": 2.122344207763672,
"step": 71000
},
{
"epoch": 0.88875,
"grad_norm": 0.9370853900909424,
"learning_rate": 1.1196226415094341e-06,
"loss": 2.123062744140625,
"step": 71100
},
{
"epoch": 0.89,
"grad_norm": 33.95699691772461,
"learning_rate": 1.107044025157233e-06,
"loss": 2.0445166015625,
"step": 71200
},
{
"epoch": 0.89125,
"grad_norm": 18.907957077026367,
"learning_rate": 1.0944654088050317e-06,
"loss": 1.96984619140625,
"step": 71300
},
{
"epoch": 0.8925,
"grad_norm": 28.72160530090332,
"learning_rate": 1.0818867924528303e-06,
"loss": 2.2017242431640627,
"step": 71400
},
{
"epoch": 0.89375,
"grad_norm": 0.1414874643087387,
"learning_rate": 1.0693081761006289e-06,
"loss": 2.244500732421875,
"step": 71500
},
{
"epoch": 0.895,
"grad_norm": 9.050920486450195,
"learning_rate": 1.0567295597484277e-06,
"loss": 2.038321990966797,
"step": 71600
},
{
"epoch": 0.89625,
"grad_norm": 20.498157501220703,
"learning_rate": 1.0441509433962265e-06,
"loss": 2.200545349121094,
"step": 71700
},
{
"epoch": 0.8975,
"grad_norm": 28.807994842529297,
"learning_rate": 1.0315723270440253e-06,
"loss": 2.4168437194824217,
"step": 71800
},
{
"epoch": 0.89875,
"grad_norm": 13.557960510253906,
"learning_rate": 1.0189937106918239e-06,
"loss": 1.8951638793945313,
"step": 71900
},
{
"epoch": 0.9,
"grad_norm": 0.04173203930258751,
"learning_rate": 1.0064150943396227e-06,
"loss": 1.8145158386230469,
"step": 72000
},
{
"epoch": 0.90125,
"grad_norm": 26.1096248626709,
"learning_rate": 9.938364779874214e-07,
"loss": 1.9488568115234375,
"step": 72100
},
{
"epoch": 0.9025,
"grad_norm": 26.35661506652832,
"learning_rate": 9.812578616352202e-07,
"loss": 2.003428649902344,
"step": 72200
},
{
"epoch": 0.90375,
"grad_norm": 0.6807858943939209,
"learning_rate": 9.686792452830188e-07,
"loss": 2.1505252075195314,
"step": 72300
},
{
"epoch": 0.905,
"grad_norm": 8.778648376464844,
"learning_rate": 9.561006289308176e-07,
"loss": 1.9629728698730469,
"step": 72400
},
{
"epoch": 0.90625,
"grad_norm": 21.047719955444336,
"learning_rate": 9.435220125786164e-07,
"loss": 2.397852020263672,
"step": 72500
},
{
"epoch": 0.9075,
"grad_norm": 28.41111946105957,
"learning_rate": 9.309433962264151e-07,
"loss": 2.4121713256835937,
"step": 72600
},
{
"epoch": 0.90875,
"grad_norm": 0.15218345820903778,
"learning_rate": 9.183647798742139e-07,
"loss": 2.357157897949219,
"step": 72700
},
{
"epoch": 0.91,
"grad_norm": 0.017881672829389572,
"learning_rate": 9.057861635220126e-07,
"loss": 1.9150712585449219,
"step": 72800
},
{
"epoch": 0.91125,
"grad_norm": 25.772994995117188,
"learning_rate": 8.932075471698114e-07,
"loss": 2.163759460449219,
"step": 72900
},
{
"epoch": 0.9125,
"grad_norm": 23.19663429260254,
"learning_rate": 8.806289308176101e-07,
"loss": 2.012795867919922,
"step": 73000
},
{
"epoch": 0.91375,
"grad_norm": 4.303622722625732,
"learning_rate": 8.680503144654089e-07,
"loss": 2.200050048828125,
"step": 73100
},
{
"epoch": 0.915,
"grad_norm": 0.030221115797758102,
"learning_rate": 8.554716981132076e-07,
"loss": 2.2937960815429688,
"step": 73200
},
{
"epoch": 0.91625,
"grad_norm": 26.862483978271484,
"learning_rate": 8.428930817610064e-07,
"loss": 2.150227508544922,
"step": 73300
},
{
"epoch": 0.9175,
"grad_norm": 21.228776931762695,
"learning_rate": 8.303144654088051e-07,
"loss": 2.2184078979492186,
"step": 73400
},
{
"epoch": 0.91875,
"grad_norm": 4.426553726196289,
"learning_rate": 8.177358490566039e-07,
"loss": 2.0852456665039063,
"step": 73500
},
{
"epoch": 0.92,
"grad_norm": 5.594974994659424,
"learning_rate": 8.051572327044026e-07,
"loss": 2.157823944091797,
"step": 73600
},
{
"epoch": 0.92125,
"grad_norm": 27.547351837158203,
"learning_rate": 7.925786163522014e-07,
"loss": 2.1868157958984376,
"step": 73700
},
{
"epoch": 0.9225,
"grad_norm": 29.176279067993164,
"learning_rate": 7.8e-07,
"loss": 2.2395770263671877,
"step": 73800
},
{
"epoch": 0.92375,
"grad_norm": 0.281236857175827,
"learning_rate": 7.674213836477988e-07,
"loss": 2.0369967651367187,
"step": 73900
},
{
"epoch": 0.925,
"grad_norm": 16.027692794799805,
"learning_rate": 7.548427672955975e-07,
"loss": 2.113655853271484,
"step": 74000
},
{
"epoch": 0.92625,
"grad_norm": 31.675695419311523,
"learning_rate": 7.422641509433963e-07,
"loss": 2.0642041015625,
"step": 74100
},
{
"epoch": 0.9275,
"grad_norm": 36.27760696411133,
"learning_rate": 7.29685534591195e-07,
"loss": 1.925348358154297,
"step": 74200
},
{
"epoch": 0.92875,
"grad_norm": 0.32410475611686707,
"learning_rate": 7.171069182389938e-07,
"loss": 1.969832763671875,
"step": 74300
},
{
"epoch": 0.93,
"grad_norm": 0.028427617624402046,
"learning_rate": 7.045283018867925e-07,
"loss": 1.9005357360839843,
"step": 74400
},
{
"epoch": 0.93125,
"grad_norm": 34.29441833496094,
"learning_rate": 6.919496855345913e-07,
"loss": 2.174887390136719,
"step": 74500
},
{
"epoch": 0.9325,
"grad_norm": 11.873321533203125,
"learning_rate": 6.7937106918239e-07,
"loss": 2.1516876220703125,
"step": 74600
},
{
"epoch": 0.93375,
"grad_norm": 3.312832832336426,
"learning_rate": 6.667924528301888e-07,
"loss": 2.204561767578125,
"step": 74700
},
{
"epoch": 0.935,
"grad_norm": 0.09160123765468597,
"learning_rate": 6.542138364779875e-07,
"loss": 2.13657958984375,
"step": 74800
},
{
"epoch": 0.93625,
"grad_norm": 23.657917022705078,
"learning_rate": 6.416352201257863e-07,
"loss": 2.3265130615234373,
"step": 74900
},
{
"epoch": 0.9375,
"grad_norm": 21.326000213623047,
"learning_rate": 6.29056603773585e-07,
"loss": 2.133411865234375,
"step": 75000
},
{
"epoch": 0.93875,
"grad_norm": 0.5653764009475708,
"learning_rate": 6.164779874213837e-07,
"loss": 1.9549205017089843,
"step": 75100
},
{
"epoch": 0.94,
"grad_norm": 20.32880210876465,
"learning_rate": 6.038993710691824e-07,
"loss": 1.80646240234375,
"step": 75200
},
{
"epoch": 0.94125,
"grad_norm": 24.76996421813965,
"learning_rate": 5.913207547169812e-07,
"loss": 2.3081170654296876,
"step": 75300
},
{
"epoch": 0.9425,
"grad_norm": 26.569751739501953,
"learning_rate": 5.787421383647799e-07,
"loss": 2.0019149780273438,
"step": 75400
},
{
"epoch": 0.94375,
"grad_norm": 0.3244832158088684,
"learning_rate": 5.661635220125787e-07,
"loss": 2.1780990600585937,
"step": 75500
},
{
"epoch": 0.945,
"grad_norm": 0.22316910326480865,
"learning_rate": 5.535849056603773e-07,
"loss": 2.0430670166015625,
"step": 75600
},
{
"epoch": 0.94625,
"grad_norm": 27.824186325073242,
"learning_rate": 5.410062893081761e-07,
"loss": 2.4939804077148438,
"step": 75700
},
{
"epoch": 0.9475,
"grad_norm": 31.06242561340332,
"learning_rate": 5.284276729559748e-07,
"loss": 2.15638916015625,
"step": 75800
},
{
"epoch": 0.94875,
"grad_norm": 3.928506851196289,
"learning_rate": 5.158490566037736e-07,
"loss": 2.05089111328125,
"step": 75900
},
{
"epoch": 0.95,
"grad_norm": 11.446708679199219,
"learning_rate": 5.032704402515723e-07,
"loss": 2.08630615234375,
"step": 76000
},
{
"epoch": 0.95125,
"grad_norm": 26.42768096923828,
"learning_rate": 4.906918238993711e-07,
"loss": 2.343414764404297,
"step": 76100
},
{
"epoch": 0.9525,
"grad_norm": 28.124576568603516,
"learning_rate": 4.781132075471698e-07,
"loss": 2.227467498779297,
"step": 76200
},
{
"epoch": 0.95375,
"grad_norm": 4.6327290534973145,
"learning_rate": 4.655345911949686e-07,
"loss": 2.305023193359375,
"step": 76300
},
{
"epoch": 0.955,
"grad_norm": 0.009622328914701939,
"learning_rate": 4.5295597484276735e-07,
"loss": 1.961500701904297,
"step": 76400
},
{
"epoch": 0.95625,
"grad_norm": 24.499088287353516,
"learning_rate": 4.403773584905661e-07,
"loss": 2.338931121826172,
"step": 76500
},
{
"epoch": 0.9575,
"grad_norm": 32.421878814697266,
"learning_rate": 4.2779874213836484e-07,
"loss": 2.53737548828125,
"step": 76600
},
{
"epoch": 0.95875,
"grad_norm": 0.04140196368098259,
"learning_rate": 4.152201257861636e-07,
"loss": 2.1324703979492186,
"step": 76700
},
{
"epoch": 0.96,
"grad_norm": 0.056152064353227615,
"learning_rate": 4.0264150943396233e-07,
"loss": 2.1852513122558594,
"step": 76800
},
{
"epoch": 0.96125,
"grad_norm": 28.970626831054688,
"learning_rate": 3.9006289308176107e-07,
"loss": 2.2562815856933596,
"step": 76900
},
{
"epoch": 0.9625,
"grad_norm": 28.90065574645996,
"learning_rate": 3.774842767295598e-07,
"loss": 2.4266934204101562,
"step": 77000
},
{
"epoch": 0.96375,
"grad_norm": 0.12267394363880157,
"learning_rate": 3.6490566037735856e-07,
"loss": 2.2760604858398437,
"step": 77100
},
{
"epoch": 0.965,
"grad_norm": 55.57836151123047,
"learning_rate": 3.523270440251573e-07,
"loss": 2.051964111328125,
"step": 77200
},
{
"epoch": 0.96625,
"grad_norm": 27.030414581298828,
"learning_rate": 3.3974842767295605e-07,
"loss": 2.140052490234375,
"step": 77300
},
{
"epoch": 0.9675,
"grad_norm": 30.74704933166504,
"learning_rate": 3.271698113207548e-07,
"loss": 2.362886657714844,
"step": 77400
},
{
"epoch": 0.96875,
"grad_norm": 0.04671184718608856,
"learning_rate": 3.1459119496855354e-07,
"loss": 2.0699423217773436,
"step": 77500
},
{
"epoch": 0.97,
"grad_norm": 0.007876750081777573,
"learning_rate": 3.0201257861635223e-07,
"loss": 1.9259161376953124,
"step": 77600
},
{
"epoch": 0.97125,
"grad_norm": 29.24554443359375,
"learning_rate": 2.89433962264151e-07,
"loss": 2.4339639282226564,
"step": 77700
},
{
"epoch": 0.9725,
"grad_norm": 29.591955184936523,
"learning_rate": 2.768553459119497e-07,
"loss": 2.3606378173828126,
"step": 77800
},
{
"epoch": 0.97375,
"grad_norm": 12.657386779785156,
"learning_rate": 2.6427672955974846e-07,
"loss": 2.428155822753906,
"step": 77900
},
{
"epoch": 0.975,
"grad_norm": 0.024583876132965088,
"learning_rate": 2.516981132075472e-07,
"loss": 2.105915985107422,
"step": 78000
},
{
"epoch": 0.97625,
"grad_norm": 21.956266403198242,
"learning_rate": 2.3911949685534595e-07,
"loss": 2.1736614990234373,
"step": 78100
},
{
"epoch": 0.9775,
"grad_norm": 25.43096923828125,
"learning_rate": 2.2654088050314464e-07,
"loss": 2.178233337402344,
"step": 78200
},
{
"epoch": 0.97875,
"grad_norm": 25.53718376159668,
"learning_rate": 2.139622641509434e-07,
"loss": 2.3168376159667967,
"step": 78300
},
{
"epoch": 0.98,
"grad_norm": 0.009442003443837166,
"learning_rate": 2.0138364779874213e-07,
"loss": 2.333406677246094,
"step": 78400
},
{
"epoch": 0.98125,
"grad_norm": 33.688865661621094,
"learning_rate": 1.8880503144654088e-07,
"loss": 2.119158935546875,
"step": 78500
},
{
"epoch": 0.9825,
"grad_norm": 19.21613883972168,
"learning_rate": 1.7622641509433962e-07,
"loss": 2.3394792175292967,
"step": 78600
},
{
"epoch": 0.98375,
"grad_norm": 10.80718994140625,
"learning_rate": 1.6364779874213836e-07,
"loss": 2.2544096374511717,
"step": 78700
},
{
"epoch": 0.985,
"grad_norm": 0.019625332206487656,
"learning_rate": 1.510691823899371e-07,
"loss": 2.086898193359375,
"step": 78800
},
{
"epoch": 0.98625,
"grad_norm": 23.536130905151367,
"learning_rate": 1.3849056603773585e-07,
"loss": 2.2242425537109374,
"step": 78900
},
{
"epoch": 0.9875,
"grad_norm": 28.006027221679688,
"learning_rate": 1.259119496855346e-07,
"loss": 2.1889492797851564,
"step": 79000
},
{
"epoch": 0.98875,
"grad_norm": 0.1388503462076187,
"learning_rate": 1.1333333333333336e-07,
"loss": 2.131824188232422,
"step": 79100
},
{
"epoch": 0.99,
"grad_norm": 0.1504867821931839,
"learning_rate": 1.0075471698113207e-07,
"loss": 2.0242990112304686,
"step": 79200
},
{
"epoch": 0.99125,
"grad_norm": 26.41069793701172,
"learning_rate": 8.817610062893082e-08,
"loss": 2.210938720703125,
"step": 79300
},
{
"epoch": 0.9925,
"grad_norm": 17.642717361450195,
"learning_rate": 7.559748427672956e-08,
"loss": 1.8704595947265625,
"step": 79400
},
{
"epoch": 0.99375,
"grad_norm": 0.1886824667453766,
"learning_rate": 6.30188679245283e-08,
"loss": 2.071743621826172,
"step": 79500
},
{
"epoch": 0.995,
"grad_norm": 0.07498449087142944,
"learning_rate": 5.0440251572327044e-08,
"loss": 2.117955780029297,
"step": 79600
},
{
"epoch": 0.99625,
"grad_norm": 22.661165237426758,
"learning_rate": 3.786163522012579e-08,
"loss": 2.199887542724609,
"step": 79700
},
{
"epoch": 0.9975,
"grad_norm": 22.61822509765625,
"learning_rate": 2.5283018867924533e-08,
"loss": 2.135641784667969,
"step": 79800
},
{
"epoch": 0.99875,
"grad_norm": 0.6577712297439575,
"learning_rate": 1.2704402515723271e-08,
"loss": 1.8882171630859375,
"step": 79900
},
{
"epoch": 1.0,
"grad_norm": 0.028230739757418633,
"learning_rate": 1.257861635220126e-10,
"loss": 2.1253326416015623,
"step": 80000
}
],
"logging_steps": 100,
"max_steps": 80000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.228147083365581e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}