{
"best_global_step": 89000,
"best_metric": 3.533585786819458,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_0.7_resemble_to_push_5039/checkpoint-30000",
"epoch": 31.778425655976676,
"eval_steps": 1000,
"global_step": 109000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014577259475218658,
"grad_norm": 0.7874077558517456,
"learning_rate": 0.000294,
"loss": 8.5058,
"step": 50
},
{
"epoch": 0.029154518950437316,
"grad_norm": 0.595557451248169,
"learning_rate": 0.0005939999999999999,
"loss": 6.7716,
"step": 100
},
{
"epoch": 0.043731778425655975,
"grad_norm": 0.5917447209358215,
"learning_rate": 0.000599828471411902,
"loss": 6.36,
"step": 150
},
{
"epoch": 0.05830903790087463,
"grad_norm": 0.6458065509796143,
"learning_rate": 0.0005996534422403733,
"loss": 6.1614,
"step": 200
},
{
"epoch": 0.0728862973760933,
"grad_norm": 0.5044608116149902,
"learning_rate": 0.0005994784130688448,
"loss": 6.0021,
"step": 250
},
{
"epoch": 0.08746355685131195,
"grad_norm": 0.45285528898239136,
"learning_rate": 0.0005993033838973162,
"loss": 5.8768,
"step": 300
},
{
"epoch": 0.10204081632653061,
"grad_norm": 0.4967013895511627,
"learning_rate": 0.0005991283547257876,
"loss": 5.7439,
"step": 350
},
{
"epoch": 0.11661807580174927,
"grad_norm": 0.6553553342819214,
"learning_rate": 0.000598953325554259,
"loss": 5.6286,
"step": 400
},
{
"epoch": 0.13119533527696792,
"grad_norm": 0.5217544436454773,
"learning_rate": 0.0005987782963827304,
"loss": 5.5172,
"step": 450
},
{
"epoch": 0.1457725947521866,
"grad_norm": 0.4531030058860779,
"learning_rate": 0.0005986032672112018,
"loss": 5.4139,
"step": 500
},
{
"epoch": 0.16034985422740525,
"grad_norm": 0.4770136773586273,
"learning_rate": 0.0005984282380396732,
"loss": 5.3499,
"step": 550
},
{
"epoch": 0.1749271137026239,
"grad_norm": 0.5866448283195496,
"learning_rate": 0.0005982532088681447,
"loss": 5.2578,
"step": 600
},
{
"epoch": 0.18950437317784258,
"grad_norm": 0.4144020676612854,
"learning_rate": 0.000598078179696616,
"loss": 5.199,
"step": 650
},
{
"epoch": 0.20408163265306123,
"grad_norm": 0.4637407958507538,
"learning_rate": 0.0005979031505250875,
"loss": 5.1355,
"step": 700
},
{
"epoch": 0.21865889212827988,
"grad_norm": 0.4051854908466339,
"learning_rate": 0.0005977281213535589,
"loss": 5.0826,
"step": 750
},
{
"epoch": 0.23323615160349853,
"grad_norm": 0.4222441613674164,
"learning_rate": 0.0005975530921820304,
"loss": 5.0161,
"step": 800
},
{
"epoch": 0.2478134110787172,
"grad_norm": 0.43692415952682495,
"learning_rate": 0.0005973780630105017,
"loss": 4.9604,
"step": 850
},
{
"epoch": 0.26239067055393583,
"grad_norm": 0.4478825032711029,
"learning_rate": 0.0005972030338389731,
"loss": 4.9278,
"step": 900
},
{
"epoch": 0.27696793002915454,
"grad_norm": 0.458936870098114,
"learning_rate": 0.0005970280046674445,
"loss": 4.895,
"step": 950
},
{
"epoch": 0.2915451895043732,
"grad_norm": 0.4285873472690582,
"learning_rate": 0.000596852975495916,
"loss": 4.8427,
"step": 1000
},
{
"epoch": 0.2915451895043732,
"eval_accuracy": 0.25348940253781577,
"eval_loss": 4.757139205932617,
"eval_runtime": 183.1867,
"eval_samples_per_second": 90.749,
"eval_steps_per_second": 5.672,
"step": 1000
},
{
"epoch": 0.30612244897959184,
"grad_norm": 0.45091402530670166,
"learning_rate": 0.0005966779463243873,
"loss": 4.7713,
"step": 1050
},
{
"epoch": 0.3206997084548105,
"grad_norm": 0.43677228689193726,
"learning_rate": 0.0005965029171528587,
"loss": 4.7492,
"step": 1100
},
{
"epoch": 0.33527696793002915,
"grad_norm": 0.4576217830181122,
"learning_rate": 0.0005963278879813302,
"loss": 4.7098,
"step": 1150
},
{
"epoch": 0.3498542274052478,
"grad_norm": 0.517709493637085,
"learning_rate": 0.0005961528588098016,
"loss": 4.6695,
"step": 1200
},
{
"epoch": 0.36443148688046645,
"grad_norm": 0.5433350801467896,
"learning_rate": 0.0005959778296382729,
"loss": 4.6536,
"step": 1250
},
{
"epoch": 0.37900874635568516,
"grad_norm": 0.41080689430236816,
"learning_rate": 0.0005958028004667444,
"loss": 4.6072,
"step": 1300
},
{
"epoch": 0.3935860058309038,
"grad_norm": 0.4460156559944153,
"learning_rate": 0.0005956277712952158,
"loss": 4.5744,
"step": 1350
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.4368729889392853,
"learning_rate": 0.0005954527421236873,
"loss": 4.554,
"step": 1400
},
{
"epoch": 0.4227405247813411,
"grad_norm": 0.4751439094543457,
"learning_rate": 0.0005952777129521586,
"loss": 4.5263,
"step": 1450
},
{
"epoch": 0.43731778425655976,
"grad_norm": 0.4113054573535919,
"learning_rate": 0.0005951026837806301,
"loss": 4.5212,
"step": 1500
},
{
"epoch": 0.4518950437317784,
"grad_norm": 0.41762205958366394,
"learning_rate": 0.0005949276546091015,
"loss": 4.4888,
"step": 1550
},
{
"epoch": 0.46647230320699706,
"grad_norm": 0.3934441804885864,
"learning_rate": 0.0005947526254375729,
"loss": 4.4659,
"step": 1600
},
{
"epoch": 0.48104956268221577,
"grad_norm": 0.47389641404151917,
"learning_rate": 0.0005945775962660442,
"loss": 4.4567,
"step": 1650
},
{
"epoch": 0.4956268221574344,
"grad_norm": 0.4341316521167755,
"learning_rate": 0.0005944025670945157,
"loss": 4.4355,
"step": 1700
},
{
"epoch": 0.5102040816326531,
"grad_norm": 0.4070115387439728,
"learning_rate": 0.0005942275379229871,
"loss": 4.4144,
"step": 1750
},
{
"epoch": 0.5247813411078717,
"grad_norm": 0.4244054853916168,
"learning_rate": 0.0005940525087514585,
"loss": 4.3984,
"step": 1800
},
{
"epoch": 0.5393586005830904,
"grad_norm": 0.42848894000053406,
"learning_rate": 0.00059387747957993,
"loss": 4.3872,
"step": 1850
},
{
"epoch": 0.5539358600583091,
"grad_norm": 0.4671320915222168,
"learning_rate": 0.0005937024504084013,
"loss": 4.3684,
"step": 1900
},
{
"epoch": 0.5685131195335277,
"grad_norm": 0.38709279894828796,
"learning_rate": 0.0005935274212368728,
"loss": 4.3662,
"step": 1950
},
{
"epoch": 0.5830903790087464,
"grad_norm": 0.4209407866001129,
"learning_rate": 0.0005933523920653442,
"loss": 4.3511,
"step": 2000
},
{
"epoch": 0.5830903790087464,
"eval_accuracy": 0.29855510341307406,
"eval_loss": 4.285789966583252,
"eval_runtime": 182.7746,
"eval_samples_per_second": 90.954,
"eval_steps_per_second": 5.685,
"step": 2000
},
{
"epoch": 0.597667638483965,
"grad_norm": 0.39591220021247864,
"learning_rate": 0.0005931773628938157,
"loss": 4.3369,
"step": 2050
},
{
"epoch": 0.6122448979591837,
"grad_norm": 0.36318960785865784,
"learning_rate": 0.000593002333722287,
"loss": 4.3037,
"step": 2100
},
{
"epoch": 0.6268221574344023,
"grad_norm": 0.41896095871925354,
"learning_rate": 0.0005928273045507584,
"loss": 4.3111,
"step": 2150
},
{
"epoch": 0.641399416909621,
"grad_norm": 0.42866140604019165,
"learning_rate": 0.0005926522753792298,
"loss": 4.2861,
"step": 2200
},
{
"epoch": 0.6559766763848397,
"grad_norm": 0.39964374899864197,
"learning_rate": 0.0005924772462077013,
"loss": 4.2784,
"step": 2250
},
{
"epoch": 0.6705539358600583,
"grad_norm": 0.37104126811027527,
"learning_rate": 0.0005923022170361726,
"loss": 4.2805,
"step": 2300
},
{
"epoch": 0.685131195335277,
"grad_norm": 0.37753456830978394,
"learning_rate": 0.000592127187864644,
"loss": 4.2608,
"step": 2350
},
{
"epoch": 0.6997084548104956,
"grad_norm": 0.39387914538383484,
"learning_rate": 0.0005919521586931155,
"loss": 4.2389,
"step": 2400
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.38472071290016174,
"learning_rate": 0.0005917771295215869,
"loss": 4.2527,
"step": 2450
},
{
"epoch": 0.7288629737609329,
"grad_norm": 0.3931500315666199,
"learning_rate": 0.0005916021003500583,
"loss": 4.23,
"step": 2500
},
{
"epoch": 0.7434402332361516,
"grad_norm": 0.36343446373939514,
"learning_rate": 0.0005914270711785297,
"loss": 4.2279,
"step": 2550
},
{
"epoch": 0.7580174927113703,
"grad_norm": 0.4511328935623169,
"learning_rate": 0.0005912520420070012,
"loss": 4.2169,
"step": 2600
},
{
"epoch": 0.7725947521865889,
"grad_norm": 0.4024074971675873,
"learning_rate": 0.0005910770128354726,
"loss": 4.2055,
"step": 2650
},
{
"epoch": 0.7871720116618076,
"grad_norm": 0.3848779499530792,
"learning_rate": 0.0005909019836639439,
"loss": 4.1972,
"step": 2700
},
{
"epoch": 0.8017492711370262,
"grad_norm": 0.3447161912918091,
"learning_rate": 0.0005907269544924154,
"loss": 4.1825,
"step": 2750
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.39848795533180237,
"learning_rate": 0.0005905519253208868,
"loss": 4.1709,
"step": 2800
},
{
"epoch": 0.8309037900874635,
"grad_norm": 1.0386348962783813,
"learning_rate": 0.0005903768961493582,
"loss": 4.1715,
"step": 2850
},
{
"epoch": 0.8454810495626822,
"grad_norm": 0.3463401794433594,
"learning_rate": 0.0005902018669778295,
"loss": 4.1826,
"step": 2900
},
{
"epoch": 0.8600583090379009,
"grad_norm": 0.3431394100189209,
"learning_rate": 0.000590026837806301,
"loss": 4.1599,
"step": 2950
},
{
"epoch": 0.8746355685131195,
"grad_norm": 0.37830331921577454,
"learning_rate": 0.0005898518086347724,
"loss": 4.1523,
"step": 3000
},
{
"epoch": 0.8746355685131195,
"eval_accuracy": 0.3145779614600069,
"eval_loss": 4.103229522705078,
"eval_runtime": 183.0069,
"eval_samples_per_second": 90.838,
"eval_steps_per_second": 5.677,
"step": 3000
},
{
"epoch": 0.8892128279883382,
"grad_norm": 0.366996705532074,
"learning_rate": 0.0005896767794632438,
"loss": 4.1463,
"step": 3050
},
{
"epoch": 0.9037900874635568,
"grad_norm": 0.3830026388168335,
"learning_rate": 0.0005895017502917152,
"loss": 4.1389,
"step": 3100
},
{
"epoch": 0.9183673469387755,
"grad_norm": 0.3352486491203308,
"learning_rate": 0.0005893267211201866,
"loss": 4.1402,
"step": 3150
},
{
"epoch": 0.9329446064139941,
"grad_norm": 0.3650812804698944,
"learning_rate": 0.0005891516919486581,
"loss": 4.1292,
"step": 3200
},
{
"epoch": 0.9475218658892128,
"grad_norm": 0.3521343171596527,
"learning_rate": 0.0005889766627771295,
"loss": 4.1101,
"step": 3250
},
{
"epoch": 0.9620991253644315,
"grad_norm": 0.34803035855293274,
"learning_rate": 0.0005888016336056009,
"loss": 4.1055,
"step": 3300
},
{
"epoch": 0.9766763848396501,
"grad_norm": 0.4116007089614868,
"learning_rate": 0.0005886266044340723,
"loss": 4.1019,
"step": 3350
},
{
"epoch": 0.9912536443148688,
"grad_norm": 0.3484455347061157,
"learning_rate": 0.0005884515752625437,
"loss": 4.1101,
"step": 3400
},
{
"epoch": 1.0058309037900874,
"grad_norm": 0.3504573702812195,
"learning_rate": 0.0005882765460910151,
"loss": 4.0796,
"step": 3450
},
{
"epoch": 1.0204081632653061,
"grad_norm": 0.3820534944534302,
"learning_rate": 0.0005881015169194865,
"loss": 4.0189,
"step": 3500
},
{
"epoch": 1.0349854227405249,
"grad_norm": 0.34919607639312744,
"learning_rate": 0.0005879264877479579,
"loss": 4.0246,
"step": 3550
},
{
"epoch": 1.0495626822157433,
"grad_norm": 0.36303994059562683,
"learning_rate": 0.0005877514585764293,
"loss": 4.0159,
"step": 3600
},
{
"epoch": 1.064139941690962,
"grad_norm": 0.3435100018978119,
"learning_rate": 0.0005875764294049008,
"loss": 4.0158,
"step": 3650
},
{
"epoch": 1.0787172011661808,
"grad_norm": 0.3518615961074829,
"learning_rate": 0.0005874014002333721,
"loss": 4.0298,
"step": 3700
},
{
"epoch": 1.0932944606413995,
"grad_norm": 0.3429192304611206,
"learning_rate": 0.0005872263710618436,
"loss": 4.0135,
"step": 3750
},
{
"epoch": 1.1078717201166182,
"grad_norm": 0.3353997468948364,
"learning_rate": 0.000587051341890315,
"loss": 4.0039,
"step": 3800
},
{
"epoch": 1.1224489795918366,
"grad_norm": 0.33416813611984253,
"learning_rate": 0.0005868763127187865,
"loss": 4.0146,
"step": 3850
},
{
"epoch": 1.1370262390670554,
"grad_norm": 0.34684616327285767,
"learning_rate": 0.0005867012835472578,
"loss": 3.9853,
"step": 3900
},
{
"epoch": 1.151603498542274,
"grad_norm": 0.33499616384506226,
"learning_rate": 0.0005865262543757292,
"loss": 3.9963,
"step": 3950
},
{
"epoch": 1.1661807580174928,
"grad_norm": 0.3389959931373596,
"learning_rate": 0.0005863512252042007,
"loss": 3.9998,
"step": 4000
},
{
"epoch": 1.1661807580174928,
"eval_accuracy": 0.3246784174531811,
"eval_loss": 3.99202299118042,
"eval_runtime": 182.9762,
"eval_samples_per_second": 90.853,
"eval_steps_per_second": 5.678,
"step": 4000
},
{
"epoch": 1.1807580174927113,
"grad_norm": 0.3295277953147888,
"learning_rate": 0.0005861761960326721,
"loss": 3.9965,
"step": 4050
},
{
"epoch": 1.19533527696793,
"grad_norm": 0.3454665541648865,
"learning_rate": 0.0005860011668611434,
"loss": 3.9911,
"step": 4100
},
{
"epoch": 1.2099125364431487,
"grad_norm": 0.33836084604263306,
"learning_rate": 0.0005858261376896148,
"loss": 3.9786,
"step": 4150
},
{
"epoch": 1.2244897959183674,
"grad_norm": 0.3491062521934509,
"learning_rate": 0.0005856511085180863,
"loss": 3.9913,
"step": 4200
},
{
"epoch": 1.239067055393586,
"grad_norm": 0.3459561765193939,
"learning_rate": 0.0005854760793465577,
"loss": 3.9858,
"step": 4250
},
{
"epoch": 1.2536443148688048,
"grad_norm": 0.3346427083015442,
"learning_rate": 0.0005853010501750292,
"loss": 3.97,
"step": 4300
},
{
"epoch": 1.2682215743440233,
"grad_norm": 0.3376955986022949,
"learning_rate": 0.0005851260210035005,
"loss": 3.9775,
"step": 4350
},
{
"epoch": 1.282798833819242,
"grad_norm": 0.3495452404022217,
"learning_rate": 0.000584950991831972,
"loss": 3.9714,
"step": 4400
},
{
"epoch": 1.2973760932944607,
"grad_norm": 0.343971848487854,
"learning_rate": 0.0005847759626604434,
"loss": 3.9665,
"step": 4450
},
{
"epoch": 1.3119533527696792,
"grad_norm": 0.3238019645214081,
"learning_rate": 0.0005846009334889148,
"loss": 3.9599,
"step": 4500
},
{
"epoch": 1.3265306122448979,
"grad_norm": 0.33076655864715576,
"learning_rate": 0.0005844259043173862,
"loss": 3.9599,
"step": 4550
},
{
"epoch": 1.3411078717201166,
"grad_norm": 0.34467896819114685,
"learning_rate": 0.0005842508751458576,
"loss": 3.9573,
"step": 4600
},
{
"epoch": 1.3556851311953353,
"grad_norm": 0.3411879539489746,
"learning_rate": 0.000584075845974329,
"loss": 3.9609,
"step": 4650
},
{
"epoch": 1.370262390670554,
"grad_norm": 0.33475279808044434,
"learning_rate": 0.0005839008168028004,
"loss": 3.9461,
"step": 4700
},
{
"epoch": 1.3848396501457727,
"grad_norm": 0.33972883224487305,
"learning_rate": 0.0005837257876312718,
"loss": 3.9545,
"step": 4750
},
{
"epoch": 1.3994169096209912,
"grad_norm": 0.3224533498287201,
"learning_rate": 0.0005835507584597432,
"loss": 3.9628,
"step": 4800
},
{
"epoch": 1.41399416909621,
"grad_norm": 0.3264342248439789,
"learning_rate": 0.0005833757292882146,
"loss": 3.947,
"step": 4850
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.34044086933135986,
"learning_rate": 0.0005832007001166861,
"loss": 3.9474,
"step": 4900
},
{
"epoch": 1.4431486880466473,
"grad_norm": 0.3255755305290222,
"learning_rate": 0.0005830256709451574,
"loss": 3.9333,
"step": 4950
},
{
"epoch": 1.4577259475218658,
"grad_norm": 0.3284819424152374,
"learning_rate": 0.0005828506417736289,
"loss": 3.9337,
"step": 5000
},
{
"epoch": 1.4577259475218658,
"eval_accuracy": 0.33094985393527193,
"eval_loss": 3.9178109169006348,
"eval_runtime": 182.8911,
"eval_samples_per_second": 90.896,
"eval_steps_per_second": 5.681,
"step": 5000
},
{
"epoch": 1.4723032069970845,
"grad_norm": 0.3412162661552429,
"learning_rate": 0.0005826756126021003,
"loss": 3.9258,
"step": 5050
},
{
"epoch": 1.4868804664723032,
"grad_norm": 0.3400501012802124,
"learning_rate": 0.0005825005834305718,
"loss": 3.9309,
"step": 5100
},
{
"epoch": 1.501457725947522,
"grad_norm": 0.32828906178474426,
"learning_rate": 0.0005823255542590431,
"loss": 3.9236,
"step": 5150
},
{
"epoch": 1.5160349854227406,
"grad_norm": 0.32428792119026184,
"learning_rate": 0.0005821505250875145,
"loss": 3.9297,
"step": 5200
},
{
"epoch": 1.5306122448979593,
"grad_norm": 0.3326531946659088,
"learning_rate": 0.000581975495915986,
"loss": 3.9204,
"step": 5250
},
{
"epoch": 1.5451895043731778,
"grad_norm": 0.3215237855911255,
"learning_rate": 0.0005818004667444574,
"loss": 3.9227,
"step": 5300
},
{
"epoch": 1.5597667638483965,
"grad_norm": 0.3173177242279053,
"learning_rate": 0.0005816254375729287,
"loss": 3.9008,
"step": 5350
},
{
"epoch": 1.574344023323615,
"grad_norm": 0.33386069536209106,
"learning_rate": 0.0005814504084014001,
"loss": 3.915,
"step": 5400
},
{
"epoch": 1.5889212827988337,
"grad_norm": 0.31523650884628296,
"learning_rate": 0.0005812753792298716,
"loss": 3.9,
"step": 5450
},
{
"epoch": 1.6034985422740524,
"grad_norm": 0.320574015378952,
"learning_rate": 0.000581100350058343,
"loss": 3.9083,
"step": 5500
},
{
"epoch": 1.6180758017492711,
"grad_norm": 0.30293840169906616,
"learning_rate": 0.0005809253208868144,
"loss": 3.9042,
"step": 5550
},
{
"epoch": 1.6326530612244898,
"grad_norm": 0.31950491666793823,
"learning_rate": 0.0005807502917152858,
"loss": 3.9096,
"step": 5600
},
{
"epoch": 1.6472303206997085,
"grad_norm": 0.3188002109527588,
"learning_rate": 0.0005805752625437573,
"loss": 3.902,
"step": 5650
},
{
"epoch": 1.6618075801749272,
"grad_norm": 0.3315708041191101,
"learning_rate": 0.0005804002333722287,
"loss": 3.9093,
"step": 5700
},
{
"epoch": 1.6763848396501457,
"grad_norm": 0.3257363438606262,
"learning_rate": 0.0005802252042007,
"loss": 3.8996,
"step": 5750
},
{
"epoch": 1.6909620991253644,
"grad_norm": 0.3249954879283905,
"learning_rate": 0.0005800501750291715,
"loss": 3.8859,
"step": 5800
},
{
"epoch": 1.7055393586005831,
"grad_norm": 0.333447128534317,
"learning_rate": 0.0005798751458576429,
"loss": 3.9072,
"step": 5850
},
{
"epoch": 1.7201166180758016,
"grad_norm": 0.32005417346954346,
"learning_rate": 0.0005797001166861143,
"loss": 3.8911,
"step": 5900
},
{
"epoch": 1.7346938775510203,
"grad_norm": 0.3107415735721588,
"learning_rate": 0.0005795250875145856,
"loss": 3.8948,
"step": 5950
},
{
"epoch": 1.749271137026239,
"grad_norm": 0.3174699544906616,
"learning_rate": 0.0005793500583430571,
"loss": 3.8708,
"step": 6000
},
{
"epoch": 1.749271137026239,
"eval_accuracy": 0.3363507644148276,
"eval_loss": 3.858534812927246,
"eval_runtime": 182.5563,
"eval_samples_per_second": 91.062,
"eval_steps_per_second": 5.691,
"step": 6000
},
{
"epoch": 1.7638483965014577,
"grad_norm": 0.3156437575817108,
"learning_rate": 0.0005791750291715285,
"loss": 3.8716,
"step": 6050
},
{
"epoch": 1.7784256559766765,
"grad_norm": 0.3218945860862732,
"learning_rate": 0.000579,
"loss": 3.8865,
"step": 6100
},
{
"epoch": 1.7930029154518952,
"grad_norm": 0.3210793137550354,
"learning_rate": 0.0005788249708284714,
"loss": 3.8891,
"step": 6150
},
{
"epoch": 1.8075801749271136,
"grad_norm": 0.3157784044742584,
"learning_rate": 0.0005786499416569428,
"loss": 3.8754,
"step": 6200
},
{
"epoch": 1.8221574344023324,
"grad_norm": 0.3452181816101074,
"learning_rate": 0.0005784749124854142,
"loss": 3.8703,
"step": 6250
},
{
"epoch": 1.836734693877551,
"grad_norm": 0.3172648549079895,
"learning_rate": 0.0005782998833138856,
"loss": 3.8612,
"step": 6300
},
{
"epoch": 1.8513119533527695,
"grad_norm": 0.33045291900634766,
"learning_rate": 0.0005781248541423571,
"loss": 3.8765,
"step": 6350
},
{
"epoch": 1.8658892128279883,
"grad_norm": 0.305176317691803,
"learning_rate": 0.0005779498249708284,
"loss": 3.8544,
"step": 6400
},
{
"epoch": 1.880466472303207,
"grad_norm": 0.31116318702697754,
"learning_rate": 0.0005777747957992998,
"loss": 3.8652,
"step": 6450
},
{
"epoch": 1.8950437317784257,
"grad_norm": 0.303443044424057,
"learning_rate": 0.0005775997666277713,
"loss": 3.8628,
"step": 6500
},
{
"epoch": 1.9096209912536444,
"grad_norm": 0.32800889015197754,
"learning_rate": 0.0005774247374562427,
"loss": 3.8523,
"step": 6550
},
{
"epoch": 1.924198250728863,
"grad_norm": 0.3365955948829651,
"learning_rate": 0.000577249708284714,
"loss": 3.8625,
"step": 6600
},
{
"epoch": 1.9387755102040818,
"grad_norm": 0.3247227668762207,
"learning_rate": 0.0005770746791131854,
"loss": 3.8558,
"step": 6650
},
{
"epoch": 1.9533527696793003,
"grad_norm": 0.33500850200653076,
"learning_rate": 0.0005768996499416569,
"loss": 3.8482,
"step": 6700
},
{
"epoch": 1.967930029154519,
"grad_norm": 0.307003915309906,
"learning_rate": 0.0005767246207701283,
"loss": 3.8539,
"step": 6750
},
{
"epoch": 1.9825072886297375,
"grad_norm": 0.31048765778541565,
"learning_rate": 0.0005765495915985997,
"loss": 3.8608,
"step": 6800
},
{
"epoch": 1.9970845481049562,
"grad_norm": 0.30202147364616394,
"learning_rate": 0.0005763745624270711,
"loss": 3.8392,
"step": 6850
},
{
"epoch": 2.011661807580175,
"grad_norm": 0.31619763374328613,
"learning_rate": 0.0005761995332555426,
"loss": 3.7587,
"step": 6900
},
{
"epoch": 2.0262390670553936,
"grad_norm": 0.3118601143360138,
"learning_rate": 0.000576024504084014,
"loss": 3.7478,
"step": 6950
},
{
"epoch": 2.0408163265306123,
"grad_norm": 0.3118847906589508,
"learning_rate": 0.0005758494749124853,
"loss": 3.7474,
"step": 7000
},
{
"epoch": 2.0408163265306123,
"eval_accuracy": 0.3408536028357841,
"eval_loss": 3.8182103633880615,
"eval_runtime": 182.8244,
"eval_samples_per_second": 90.929,
"eval_steps_per_second": 5.683,
"step": 7000
},
{
"epoch": 2.055393586005831,
"grad_norm": 0.3154442608356476,
"learning_rate": 0.0005756744457409568,
"loss": 3.7527,
"step": 7050
},
{
"epoch": 2.0699708454810497,
"grad_norm": 0.31959810853004456,
"learning_rate": 0.0005754994165694282,
"loss": 3.7533,
"step": 7100
},
{
"epoch": 2.0845481049562684,
"grad_norm": 0.3303550183773041,
"learning_rate": 0.0005753243873978996,
"loss": 3.7602,
"step": 7150
},
{
"epoch": 2.0991253644314867,
"grad_norm": 0.3240067958831787,
"learning_rate": 0.0005751493582263709,
"loss": 3.7683,
"step": 7200
},
{
"epoch": 2.1137026239067054,
"grad_norm": 0.3330995738506317,
"learning_rate": 0.0005749743290548424,
"loss": 3.7604,
"step": 7250
},
{
"epoch": 2.128279883381924,
"grad_norm": 0.3294008672237396,
"learning_rate": 0.0005747992998833138,
"loss": 3.7791,
"step": 7300
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.3313888609409332,
"learning_rate": 0.0005746242707117853,
"loss": 3.7583,
"step": 7350
},
{
"epoch": 2.1574344023323615,
"grad_norm": 0.32828205823898315,
"learning_rate": 0.0005744492415402566,
"loss": 3.7621,
"step": 7400
},
{
"epoch": 2.17201166180758,
"grad_norm": 0.339874267578125,
"learning_rate": 0.0005742742123687281,
"loss": 3.7587,
"step": 7450
},
{
"epoch": 2.186588921282799,
"grad_norm": 0.3092948794364929,
"learning_rate": 0.0005740991831971995,
"loss": 3.7532,
"step": 7500
},
{
"epoch": 2.2011661807580176,
"grad_norm": 0.320710152387619,
"learning_rate": 0.0005739241540256709,
"loss": 3.7672,
"step": 7550
},
{
"epoch": 2.2157434402332363,
"grad_norm": 0.31004834175109863,
"learning_rate": 0.0005737491248541423,
"loss": 3.7649,
"step": 7600
},
{
"epoch": 2.2303206997084546,
"grad_norm": 0.3079014718532562,
"learning_rate": 0.0005735740956826137,
"loss": 3.7555,
"step": 7650
},
{
"epoch": 2.2448979591836733,
"grad_norm": 0.3182198107242584,
"learning_rate": 0.0005733990665110851,
"loss": 3.7582,
"step": 7700
},
{
"epoch": 2.259475218658892,
"grad_norm": 0.32147687673568726,
"learning_rate": 0.0005732240373395566,
"loss": 3.7592,
"step": 7750
},
{
"epoch": 2.2740524781341107,
"grad_norm": 0.31712988018989563,
"learning_rate": 0.000573049008168028,
"loss": 3.7643,
"step": 7800
},
{
"epoch": 2.2886297376093294,
"grad_norm": 0.3175004720687866,
"learning_rate": 0.0005728739789964993,
"loss": 3.7593,
"step": 7850
},
{
"epoch": 2.303206997084548,
"grad_norm": 0.30601269006729126,
"learning_rate": 0.0005726989498249708,
"loss": 3.7656,
"step": 7900
},
{
"epoch": 2.317784256559767,
"grad_norm": 0.3067971467971802,
"learning_rate": 0.0005725239206534422,
"loss": 3.7622,
"step": 7950
},
{
"epoch": 2.3323615160349855,
"grad_norm": 0.304423451423645,
"learning_rate": 0.0005723488914819137,
"loss": 3.7592,
"step": 8000
},
{
"epoch": 2.3323615160349855,
"eval_accuracy": 0.34391568835004305,
"eval_loss": 3.7847554683685303,
"eval_runtime": 182.6857,
"eval_samples_per_second": 90.998,
"eval_steps_per_second": 5.687,
"step": 8000
},
{
"epoch": 2.3469387755102042,
"grad_norm": 0.31666499376296997,
"learning_rate": 0.000572173862310385,
"loss": 3.7609,
"step": 8050
},
{
"epoch": 2.3615160349854225,
"grad_norm": 0.3182964324951172,
"learning_rate": 0.0005719988331388564,
"loss": 3.7533,
"step": 8100
},
{
"epoch": 2.376093294460641,
"grad_norm": 0.3177492916584015,
"learning_rate": 0.0005718238039673279,
"loss": 3.7578,
"step": 8150
},
{
"epoch": 2.39067055393586,
"grad_norm": 0.3210289776325226,
"learning_rate": 0.0005716487747957993,
"loss": 3.7552,
"step": 8200
},
{
"epoch": 2.4052478134110786,
"grad_norm": 0.31907418370246887,
"learning_rate": 0.0005714737456242706,
"loss": 3.7558,
"step": 8250
},
{
"epoch": 2.4198250728862973,
"grad_norm": 0.3094446659088135,
"learning_rate": 0.0005712987164527421,
"loss": 3.7434,
"step": 8300
},
{
"epoch": 2.434402332361516,
"grad_norm": 0.3326069414615631,
"learning_rate": 0.0005711236872812135,
"loss": 3.7418,
"step": 8350
},
{
"epoch": 2.4489795918367347,
"grad_norm": 0.32391783595085144,
"learning_rate": 0.0005709486581096849,
"loss": 3.7372,
"step": 8400
},
{
"epoch": 2.4635568513119535,
"grad_norm": 0.3140561580657959,
"learning_rate": 0.0005707736289381562,
"loss": 3.7612,
"step": 8450
},
{
"epoch": 2.478134110787172,
"grad_norm": 0.2998807728290558,
"learning_rate": 0.0005705985997666277,
"loss": 3.7458,
"step": 8500
},
{
"epoch": 2.4927113702623904,
"grad_norm": 0.3217441141605377,
"learning_rate": 0.0005704235705950991,
"loss": 3.755,
"step": 8550
},
{
"epoch": 2.5072886297376096,
"grad_norm": 0.35609811544418335,
"learning_rate": 0.0005702485414235706,
"loss": 3.7465,
"step": 8600
},
{
"epoch": 2.521865889212828,
"grad_norm": 0.3114137351512909,
"learning_rate": 0.0005700735122520419,
"loss": 3.7363,
"step": 8650
},
{
"epoch": 2.5364431486880465,
"grad_norm": 0.3186824321746826,
"learning_rate": 0.0005698984830805134,
"loss": 3.747,
"step": 8700
},
{
"epoch": 2.5510204081632653,
"grad_norm": 0.29359111189842224,
"learning_rate": 0.0005697234539089848,
"loss": 3.7405,
"step": 8750
},
{
"epoch": 2.565597667638484,
"grad_norm": 0.3077991008758545,
"learning_rate": 0.0005695484247374562,
"loss": 3.7468,
"step": 8800
},
{
"epoch": 2.5801749271137027,
"grad_norm": 0.3335474729537964,
"learning_rate": 0.0005693733955659276,
"loss": 3.7459,
"step": 8850
},
{
"epoch": 2.5947521865889214,
"grad_norm": 0.32796528935432434,
"learning_rate": 0.000569198366394399,
"loss": 3.7428,
"step": 8900
},
{
"epoch": 2.60932944606414,
"grad_norm": 0.31937655806541443,
"learning_rate": 0.0005690233372228704,
"loss": 3.7475,
"step": 8950
},
{
"epoch": 2.6239067055393583,
"grad_norm": 0.3045900762081146,
"learning_rate": 0.0005688483080513419,
"loss": 3.7338,
"step": 9000
},
{
"epoch": 2.6239067055393583,
"eval_accuracy": 0.34635598639366094,
"eval_loss": 3.756603240966797,
"eval_runtime": 182.4333,
"eval_samples_per_second": 91.124,
"eval_steps_per_second": 5.695,
"step": 9000
},
{
"epoch": 2.6384839650145775,
"grad_norm": 0.2968880534172058,
"learning_rate": 0.0005686732788798132,
"loss": 3.7332,
"step": 9050
},
{
"epoch": 2.6530612244897958,
"grad_norm": 0.3103148937225342,
"learning_rate": 0.0005684982497082846,
"loss": 3.7447,
"step": 9100
},
{
"epoch": 2.6676384839650145,
"grad_norm": 0.3076816201210022,
"learning_rate": 0.0005683232205367561,
"loss": 3.733,
"step": 9150
},
{
"epoch": 2.682215743440233,
"grad_norm": 0.31646642088890076,
"learning_rate": 0.0005681481913652275,
"loss": 3.7393,
"step": 9200
},
{
"epoch": 2.696793002915452,
"grad_norm": 0.3197529911994934,
"learning_rate": 0.000567973162193699,
"loss": 3.7425,
"step": 9250
},
{
"epoch": 2.7113702623906706,
"grad_norm": 0.2880556881427765,
"learning_rate": 0.0005677981330221703,
"loss": 3.7393,
"step": 9300
},
{
"epoch": 2.7259475218658893,
"grad_norm": 0.3087301552295685,
"learning_rate": 0.0005676231038506417,
"loss": 3.7316,
"step": 9350
},
{
"epoch": 2.740524781341108,
"grad_norm": 0.3143419027328491,
"learning_rate": 0.0005674480746791132,
"loss": 3.7278,
"step": 9400
},
{
"epoch": 2.7551020408163263,
"grad_norm": 0.2983132600784302,
"learning_rate": 0.0005672730455075845,
"loss": 3.7314,
"step": 9450
},
{
"epoch": 2.7696793002915454,
"grad_norm": 0.3293115198612213,
"learning_rate": 0.0005670980163360559,
"loss": 3.7273,
"step": 9500
},
{
"epoch": 2.7842565597667637,
"grad_norm": 0.30666598677635193,
"learning_rate": 0.0005669229871645274,
"loss": 3.7354,
"step": 9550
},
{
"epoch": 2.7988338192419824,
"grad_norm": 0.3132167160511017,
"learning_rate": 0.0005667479579929988,
"loss": 3.7246,
"step": 9600
},
{
"epoch": 2.813411078717201,
"grad_norm": 0.2964831590652466,
"learning_rate": 0.0005665729288214701,
"loss": 3.7363,
"step": 9650
},
{
"epoch": 2.82798833819242,
"grad_norm": 0.30478906631469727,
"learning_rate": 0.0005663978996499416,
"loss": 3.7301,
"step": 9700
},
{
"epoch": 2.8425655976676385,
"grad_norm": 0.30541983246803284,
"learning_rate": 0.000566222870478413,
"loss": 3.7306,
"step": 9750
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.29288506507873535,
"learning_rate": 0.0005660478413068845,
"loss": 3.7356,
"step": 9800
},
{
"epoch": 2.871720116618076,
"grad_norm": 0.31285762786865234,
"learning_rate": 0.0005658728121353559,
"loss": 3.7214,
"step": 9850
},
{
"epoch": 2.8862973760932946,
"grad_norm": 0.3204987645149231,
"learning_rate": 0.0005656977829638273,
"loss": 3.7282,
"step": 9900
},
{
"epoch": 2.9008746355685133,
"grad_norm": 0.30404728651046753,
"learning_rate": 0.0005655227537922987,
"loss": 3.722,
"step": 9950
},
{
"epoch": 2.9154518950437316,
"grad_norm": 0.3133791387081146,
"learning_rate": 0.0005653477246207701,
"loss": 3.722,
"step": 10000
},
{
"epoch": 2.9154518950437316,
"eval_accuracy": 0.3486605553661601,
"eval_loss": 3.7310969829559326,
"eval_runtime": 182.8035,
"eval_samples_per_second": 90.939,
"eval_steps_per_second": 5.684,
"step": 10000
},
{
"epoch": 2.9300291545189503,
"grad_norm": 0.3230670392513275,
"learning_rate": 0.0005651726954492415,
"loss": 3.7355,
"step": 10050
},
{
"epoch": 2.944606413994169,
"grad_norm": 0.30999648571014404,
"learning_rate": 0.0005649976662777129,
"loss": 3.7289,
"step": 10100
},
{
"epoch": 2.9591836734693877,
"grad_norm": 0.3303220868110657,
"learning_rate": 0.0005648226371061843,
"loss": 3.7192,
"step": 10150
},
{
"epoch": 2.9737609329446064,
"grad_norm": 0.31200259923934937,
"learning_rate": 0.0005646476079346557,
"loss": 3.7102,
"step": 10200
},
{
"epoch": 2.988338192419825,
"grad_norm": 0.29743126034736633,
"learning_rate": 0.0005644725787631272,
"loss": 3.7206,
"step": 10250
},
{
"epoch": 3.002915451895044,
"grad_norm": 0.3076407015323639,
"learning_rate": 0.0005642975495915985,
"loss": 3.7061,
"step": 10300
},
{
"epoch": 3.0174927113702625,
"grad_norm": 0.3168206512928009,
"learning_rate": 0.0005641225204200699,
"loss": 3.6213,
"step": 10350
},
{
"epoch": 3.0320699708454812,
"grad_norm": 0.3298965096473694,
"learning_rate": 0.0005639474912485414,
"loss": 3.6165,
"step": 10400
},
{
"epoch": 3.0466472303206995,
"grad_norm": 0.3050922453403473,
"learning_rate": 0.0005637724620770128,
"loss": 3.6171,
"step": 10450
},
{
"epoch": 3.061224489795918,
"grad_norm": 0.2993483543395996,
"learning_rate": 0.0005635974329054842,
"loss": 3.6079,
"step": 10500
},
{
"epoch": 3.075801749271137,
"grad_norm": 0.3267271816730499,
"learning_rate": 0.0005634224037339556,
"loss": 3.6364,
"step": 10550
},
{
"epoch": 3.0903790087463556,
"grad_norm": 0.3062150776386261,
"learning_rate": 0.000563247374562427,
"loss": 3.6323,
"step": 10600
},
{
"epoch": 3.1049562682215743,
"grad_norm": 0.31601178646087646,
"learning_rate": 0.0005630723453908985,
"loss": 3.6353,
"step": 10650
},
{
"epoch": 3.119533527696793,
"grad_norm": 0.30633261799812317,
"learning_rate": 0.0005628973162193698,
"loss": 3.6308,
"step": 10700
},
{
"epoch": 3.1341107871720117,
"grad_norm": 0.3371477425098419,
"learning_rate": 0.0005627222870478412,
"loss": 3.6318,
"step": 10750
},
{
"epoch": 3.1486880466472305,
"grad_norm": 0.28911158442497253,
"learning_rate": 0.0005625472578763127,
"loss": 3.64,
"step": 10800
},
{
"epoch": 3.163265306122449,
"grad_norm": 0.3178606927394867,
"learning_rate": 0.0005623722287047841,
"loss": 3.6445,
"step": 10850
},
{
"epoch": 3.1778425655976674,
"grad_norm": 0.3044661283493042,
"learning_rate": 0.0005621971995332554,
"loss": 3.6358,
"step": 10900
},
{
"epoch": 3.192419825072886,
"grad_norm": 0.32306867837905884,
"learning_rate": 0.0005620221703617269,
"loss": 3.6349,
"step": 10950
},
{
"epoch": 3.206997084548105,
"grad_norm": 0.3064355254173279,
"learning_rate": 0.0005618471411901983,
"loss": 3.6432,
"step": 11000
},
{
"epoch": 3.206997084548105,
"eval_accuracy": 0.3509007324896549,
"eval_loss": 3.7172634601593018,
"eval_runtime": 182.6273,
"eval_samples_per_second": 91.027,
"eval_steps_per_second": 5.689,
"step": 11000
},
{
"epoch": 3.2215743440233235,
"grad_norm": 0.3162972629070282,
"learning_rate": 0.0005616721120186698,
"loss": 3.6392,
"step": 11050
},
{
"epoch": 3.2361516034985423,
"grad_norm": 0.31746646761894226,
"learning_rate": 0.0005614970828471411,
"loss": 3.64,
"step": 11100
},
{
"epoch": 3.250728862973761,
"grad_norm": 0.31247153878211975,
"learning_rate": 0.0005613220536756126,
"loss": 3.6393,
"step": 11150
},
{
"epoch": 3.2653061224489797,
"grad_norm": 0.31435567140579224,
"learning_rate": 0.000561147024504084,
"loss": 3.6414,
"step": 11200
},
{
"epoch": 3.2798833819241984,
"grad_norm": 0.3080807328224182,
"learning_rate": 0.0005609719953325554,
"loss": 3.627,
"step": 11250
},
{
"epoch": 3.294460641399417,
"grad_norm": 0.30449172854423523,
"learning_rate": 0.0005607969661610267,
"loss": 3.6524,
"step": 11300
},
{
"epoch": 3.3090379008746353,
"grad_norm": 0.31494206190109253,
"learning_rate": 0.0005606219369894982,
"loss": 3.6241,
"step": 11350
},
{
"epoch": 3.323615160349854,
"grad_norm": 0.30978018045425415,
"learning_rate": 0.0005604469078179696,
"loss": 3.6578,
"step": 11400
},
{
"epoch": 3.3381924198250728,
"grad_norm": 0.31096816062927246,
"learning_rate": 0.000560271878646441,
"loss": 3.6475,
"step": 11450
},
{
"epoch": 3.3527696793002915,
"grad_norm": 0.29989251494407654,
"learning_rate": 0.0005600968494749124,
"loss": 3.6372,
"step": 11500
},
{
"epoch": 3.36734693877551,
"grad_norm": 0.319997638463974,
"learning_rate": 0.0005599218203033838,
"loss": 3.6548,
"step": 11550
},
{
"epoch": 3.381924198250729,
"grad_norm": 0.3196156919002533,
"learning_rate": 0.0005597467911318553,
"loss": 3.6365,
"step": 11600
},
{
"epoch": 3.3965014577259476,
"grad_norm": 0.3064935803413391,
"learning_rate": 0.0005595717619603267,
"loss": 3.6479,
"step": 11650
},
{
"epoch": 3.4110787172011663,
"grad_norm": 0.30868738889694214,
"learning_rate": 0.0005593967327887982,
"loss": 3.6481,
"step": 11700
},
{
"epoch": 3.425655976676385,
"grad_norm": 0.31745100021362305,
"learning_rate": 0.0005592217036172695,
"loss": 3.6495,
"step": 11750
},
{
"epoch": 3.4402332361516033,
"grad_norm": 0.29115235805511475,
"learning_rate": 0.0005590466744457409,
"loss": 3.6573,
"step": 11800
},
{
"epoch": 3.454810495626822,
"grad_norm": 0.2950221300125122,
"learning_rate": 0.0005588716452742123,
"loss": 3.6449,
"step": 11850
},
{
"epoch": 3.4693877551020407,
"grad_norm": 0.3181147277355194,
"learning_rate": 0.0005586966161026838,
"loss": 3.6513,
"step": 11900
},
{
"epoch": 3.4839650145772594,
"grad_norm": 0.3124925196170807,
"learning_rate": 0.0005585215869311551,
"loss": 3.6395,
"step": 11950
},
{
"epoch": 3.498542274052478,
"grad_norm": 0.3157157301902771,
"learning_rate": 0.0005583465577596265,
"loss": 3.6514,
"step": 12000
},
{
"epoch": 3.498542274052478,
"eval_accuracy": 0.3525496111532804,
"eval_loss": 3.69809889793396,
"eval_runtime": 182.4166,
"eval_samples_per_second": 91.132,
"eval_steps_per_second": 5.696,
"step": 12000
},
{
"epoch": 3.513119533527697,
"grad_norm": 0.3136838972568512,
"learning_rate": 0.000558171528588098,
"loss": 3.6535,
"step": 12050
},
{
"epoch": 3.5276967930029155,
"grad_norm": 0.32249802350997925,
"learning_rate": 0.0005579964994165694,
"loss": 3.6552,
"step": 12100
},
{
"epoch": 3.542274052478134,
"grad_norm": 0.32572638988494873,
"learning_rate": 0.0005578214702450407,
"loss": 3.65,
"step": 12150
},
{
"epoch": 3.556851311953353,
"grad_norm": 0.3193070888519287,
"learning_rate": 0.0005576464410735122,
"loss": 3.6407,
"step": 12200
},
{
"epoch": 3.571428571428571,
"grad_norm": 0.2971830368041992,
"learning_rate": 0.0005574714119019836,
"loss": 3.6456,
"step": 12250
},
{
"epoch": 3.5860058309037903,
"grad_norm": 0.3185986280441284,
"learning_rate": 0.0005572963827304551,
"loss": 3.6469,
"step": 12300
},
{
"epoch": 3.6005830903790086,
"grad_norm": 0.3031637370586395,
"learning_rate": 0.0005571213535589264,
"loss": 3.6425,
"step": 12350
},
{
"epoch": 3.6151603498542273,
"grad_norm": 0.31252244114875793,
"learning_rate": 0.0005569463243873979,
"loss": 3.6485,
"step": 12400
},
{
"epoch": 3.629737609329446,
"grad_norm": 0.3112841248512268,
"learning_rate": 0.0005567712952158693,
"loss": 3.6345,
"step": 12450
},
{
"epoch": 3.6443148688046647,
"grad_norm": 0.3132002353668213,
"learning_rate": 0.0005565962660443407,
"loss": 3.6553,
"step": 12500
},
{
"epoch": 3.6588921282798834,
"grad_norm": 0.3197864592075348,
"learning_rate": 0.000556421236872812,
"loss": 3.6468,
"step": 12550
},
{
"epoch": 3.673469387755102,
"grad_norm": 0.3167048692703247,
"learning_rate": 0.0005562462077012835,
"loss": 3.6386,
"step": 12600
},
{
"epoch": 3.688046647230321,
"grad_norm": 0.319764643907547,
"learning_rate": 0.0005560711785297549,
"loss": 3.6401,
"step": 12650
},
{
"epoch": 3.702623906705539,
"grad_norm": 0.31712111830711365,
"learning_rate": 0.0005558961493582263,
"loss": 3.6502,
"step": 12700
},
{
"epoch": 3.7172011661807582,
"grad_norm": 0.3105193078517914,
"learning_rate": 0.0005557211201866977,
"loss": 3.6501,
"step": 12750
},
{
"epoch": 3.7317784256559765,
"grad_norm": 0.3264790177345276,
"learning_rate": 0.0005555460910151691,
"loss": 3.6548,
"step": 12800
},
{
"epoch": 3.746355685131195,
"grad_norm": 0.3052196502685547,
"learning_rate": 0.0005553710618436406,
"loss": 3.6431,
"step": 12850
},
{
"epoch": 3.760932944606414,
"grad_norm": 0.3182891011238098,
"learning_rate": 0.000555196032672112,
"loss": 3.6298,
"step": 12900
},
{
"epoch": 3.7755102040816326,
"grad_norm": 0.31025004386901855,
"learning_rate": 0.0005550210035005834,
"loss": 3.6411,
"step": 12950
},
{
"epoch": 3.7900874635568513,
"grad_norm": 0.3146236836910248,
"learning_rate": 0.0005548459743290548,
"loss": 3.6411,
"step": 13000
},
{
"epoch": 3.7900874635568513,
"eval_accuracy": 0.3540398056990671,
"eval_loss": 3.681382179260254,
"eval_runtime": 182.6914,
"eval_samples_per_second": 90.995,
"eval_steps_per_second": 5.687,
"step": 13000
},
{
"epoch": 3.80466472303207,
"grad_norm": 0.2979094982147217,
"learning_rate": 0.0005546709451575262,
"loss": 3.6437,
"step": 13050
},
{
"epoch": 3.8192419825072887,
"grad_norm": 0.2954643666744232,
"learning_rate": 0.0005544959159859976,
"loss": 3.643,
"step": 13100
},
{
"epoch": 3.8338192419825075,
"grad_norm": 0.31352123618125916,
"learning_rate": 0.000554320886814469,
"loss": 3.6516,
"step": 13150
},
{
"epoch": 3.848396501457726,
"grad_norm": 0.312225878238678,
"learning_rate": 0.0005541458576429404,
"loss": 3.641,
"step": 13200
},
{
"epoch": 3.8629737609329444,
"grad_norm": 0.30493468046188354,
"learning_rate": 0.0005539708284714118,
"loss": 3.6328,
"step": 13250
},
{
"epoch": 3.877551020408163,
"grad_norm": 0.30350905656814575,
"learning_rate": 0.0005537957992998833,
"loss": 3.6445,
"step": 13300
},
{
"epoch": 3.892128279883382,
"grad_norm": 0.3183624744415283,
"learning_rate": 0.0005536207701283546,
"loss": 3.6384,
"step": 13350
},
{
"epoch": 3.9067055393586005,
"grad_norm": 0.29714614152908325,
"learning_rate": 0.0005534457409568261,
"loss": 3.6475,
"step": 13400
},
{
"epoch": 3.9212827988338192,
"grad_norm": 0.3224017024040222,
"learning_rate": 0.0005532707117852975,
"loss": 3.6286,
"step": 13450
},
{
"epoch": 3.935860058309038,
"grad_norm": 0.31913307309150696,
"learning_rate": 0.000553095682613769,
"loss": 3.6291,
"step": 13500
},
{
"epoch": 3.9504373177842567,
"grad_norm": 0.31245312094688416,
"learning_rate": 0.0005529206534422404,
"loss": 3.6403,
"step": 13550
},
{
"epoch": 3.9650145772594754,
"grad_norm": 0.3012782335281372,
"learning_rate": 0.0005527456242707117,
"loss": 3.6385,
"step": 13600
},
{
"epoch": 3.979591836734694,
"grad_norm": 0.30770647525787354,
"learning_rate": 0.0005525705950991832,
"loss": 3.6343,
"step": 13650
},
{
"epoch": 3.9941690962099123,
"grad_norm": 0.31351521611213684,
"learning_rate": 0.0005523955659276546,
"loss": 3.6295,
"step": 13700
},
{
"epoch": 4.0087463556851315,
"grad_norm": 0.3147214949131012,
"learning_rate": 0.000552220536756126,
"loss": 3.5772,
"step": 13750
},
{
"epoch": 4.02332361516035,
"grad_norm": 0.32505854964256287,
"learning_rate": 0.0005520455075845973,
"loss": 3.5361,
"step": 13800
},
{
"epoch": 4.037900874635569,
"grad_norm": 0.29005855321884155,
"learning_rate": 0.0005518704784130688,
"loss": 3.5295,
"step": 13850
},
{
"epoch": 4.052478134110787,
"grad_norm": 0.328256756067276,
"learning_rate": 0.0005516954492415402,
"loss": 3.5362,
"step": 13900
},
{
"epoch": 4.067055393586005,
"grad_norm": 0.32246100902557373,
"learning_rate": 0.0005515204200700117,
"loss": 3.544,
"step": 13950
},
{
"epoch": 4.081632653061225,
"grad_norm": 0.31613844633102417,
"learning_rate": 0.000551345390898483,
"loss": 3.5514,
"step": 14000
},
{
"epoch": 4.081632653061225,
"eval_accuracy": 0.3552499486748699,
"eval_loss": 3.6744368076324463,
"eval_runtime": 182.424,
"eval_samples_per_second": 91.128,
"eval_steps_per_second": 5.696,
"step": 14000
},
{
"epoch": 4.096209912536443,
"grad_norm": 0.3086166977882385,
"learning_rate": 0.0005511703617269544,
"loss": 3.5407,
"step": 14050
},
{
"epoch": 4.110787172011662,
"grad_norm": 0.32620468735694885,
"learning_rate": 0.0005509953325554259,
"loss": 3.5484,
"step": 14100
},
{
"epoch": 4.12536443148688,
"grad_norm": 0.31899869441986084,
"learning_rate": 0.0005508203033838973,
"loss": 3.5562,
"step": 14150
},
{
"epoch": 4.139941690962099,
"grad_norm": 0.3118620812892914,
"learning_rate": 0.0005506452742123687,
"loss": 3.5522,
"step": 14200
},
{
"epoch": 4.154518950437318,
"grad_norm": 0.3356049954891205,
"learning_rate": 0.0005504702450408401,
"loss": 3.5622,
"step": 14250
},
{
"epoch": 4.169096209912537,
"grad_norm": 0.30787017941474915,
"learning_rate": 0.0005502952158693115,
"loss": 3.5693,
"step": 14300
},
{
"epoch": 4.183673469387755,
"grad_norm": 0.3089492619037628,
"learning_rate": 0.0005501201866977829,
"loss": 3.5681,
"step": 14350
},
{
"epoch": 4.198250728862973,
"grad_norm": 0.3012238144874573,
"learning_rate": 0.0005499451575262543,
"loss": 3.549,
"step": 14400
},
{
"epoch": 4.2128279883381925,
"grad_norm": 0.3236016631126404,
"learning_rate": 0.0005497701283547257,
"loss": 3.5571,
"step": 14450
},
{
"epoch": 4.227405247813411,
"grad_norm": 0.31478026509284973,
"learning_rate": 0.0005495950991831971,
"loss": 3.5518,
"step": 14500
},
{
"epoch": 4.24198250728863,
"grad_norm": 0.3320184350013733,
"learning_rate": 0.0005494200700116686,
"loss": 3.5767,
"step": 14550
},
{
"epoch": 4.256559766763848,
"grad_norm": 0.331966370344162,
"learning_rate": 0.0005492450408401399,
"loss": 3.5634,
"step": 14600
},
{
"epoch": 4.271137026239067,
"grad_norm": 0.3124910593032837,
"learning_rate": 0.0005490700116686114,
"loss": 3.5565,
"step": 14650
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.3230664134025574,
"learning_rate": 0.0005488949824970828,
"loss": 3.5659,
"step": 14700
},
{
"epoch": 4.300291545189505,
"grad_norm": 0.3078579008579254,
"learning_rate": 0.0005487199533255543,
"loss": 3.5637,
"step": 14750
},
{
"epoch": 4.314868804664723,
"grad_norm": 0.3371805250644684,
"learning_rate": 0.0005485449241540256,
"loss": 3.5832,
"step": 14800
},
{
"epoch": 4.329446064139942,
"grad_norm": 0.31937262415885925,
"learning_rate": 0.000548369894982497,
"loss": 3.5686,
"step": 14850
},
{
"epoch": 4.34402332361516,
"grad_norm": 0.32466357946395874,
"learning_rate": 0.0005481948658109685,
"loss": 3.566,
"step": 14900
},
{
"epoch": 4.358600583090379,
"grad_norm": 0.3096526861190796,
"learning_rate": 0.0005480198366394399,
"loss": 3.5755,
"step": 14950
},
{
"epoch": 4.373177842565598,
"grad_norm": 0.3323536515235901,
"learning_rate": 0.0005478448074679112,
"loss": 3.564,
"step": 15000
},
{
"epoch": 4.373177842565598,
"eval_accuracy": 0.3562713893948155,
"eval_loss": 3.663257360458374,
"eval_runtime": 182.5662,
"eval_samples_per_second": 91.057,
"eval_steps_per_second": 5.691,
"step": 15000
},
{
"epoch": 4.387755102040816,
"grad_norm": 0.3572273254394531,
"learning_rate": 0.0005476697782963826,
"loss": 3.5658,
"step": 15050
},
{
"epoch": 4.402332361516035,
"grad_norm": 0.3238251507282257,
"learning_rate": 0.0005474947491248541,
"loss": 3.5685,
"step": 15100
},
{
"epoch": 4.4169096209912535,
"grad_norm": 0.32899653911590576,
"learning_rate": 0.0005473197199533255,
"loss": 3.5743,
"step": 15150
},
{
"epoch": 4.431486880466473,
"grad_norm": 0.3337746560573578,
"learning_rate": 0.000547144690781797,
"loss": 3.5724,
"step": 15200
},
{
"epoch": 4.446064139941691,
"grad_norm": 0.305427610874176,
"learning_rate": 0.0005469696616102683,
"loss": 3.566,
"step": 15250
},
{
"epoch": 4.460641399416909,
"grad_norm": 0.33104851841926575,
"learning_rate": 0.0005467946324387398,
"loss": 3.5729,
"step": 15300
},
{
"epoch": 4.475218658892128,
"grad_norm": 0.3255021572113037,
"learning_rate": 0.0005466196032672112,
"loss": 3.5754,
"step": 15350
},
{
"epoch": 4.489795918367347,
"grad_norm": 0.3108825087547302,
"learning_rate": 0.0005464445740956825,
"loss": 3.5705,
"step": 15400
},
{
"epoch": 4.504373177842566,
"grad_norm": 0.31072282791137695,
"learning_rate": 0.000546269544924154,
"loss": 3.5848,
"step": 15450
},
{
"epoch": 4.518950437317784,
"grad_norm": 0.31263136863708496,
"learning_rate": 0.0005460945157526254,
"loss": 3.5806,
"step": 15500
},
{
"epoch": 4.533527696793003,
"grad_norm": 0.30347201228141785,
"learning_rate": 0.0005459194865810968,
"loss": 3.5909,
"step": 15550
},
{
"epoch": 4.548104956268221,
"grad_norm": 0.30945679545402527,
"learning_rate": 0.0005457444574095681,
"loss": 3.5817,
"step": 15600
},
{
"epoch": 4.562682215743441,
"grad_norm": 0.3195247948169708,
"learning_rate": 0.0005455694282380396,
"loss": 3.5663,
"step": 15650
},
{
"epoch": 4.577259475218659,
"grad_norm": 0.30114883184432983,
"learning_rate": 0.000545394399066511,
"loss": 3.5784,
"step": 15700
},
{
"epoch": 4.591836734693878,
"grad_norm": 0.3206930160522461,
"learning_rate": 0.0005452193698949825,
"loss": 3.5735,
"step": 15750
},
{
"epoch": 4.606413994169096,
"grad_norm": 0.30759623646736145,
"learning_rate": 0.0005450443407234539,
"loss": 3.5805,
"step": 15800
},
{
"epoch": 4.6209912536443145,
"grad_norm": 0.30484503507614136,
"learning_rate": 0.0005448693115519253,
"loss": 3.5848,
"step": 15850
},
{
"epoch": 4.635568513119534,
"grad_norm": 0.32482293248176575,
"learning_rate": 0.0005446942823803967,
"loss": 3.5718,
"step": 15900
},
{
"epoch": 4.650145772594752,
"grad_norm": 0.3221893012523651,
"learning_rate": 0.0005445192532088681,
"loss": 3.5716,
"step": 15950
},
{
"epoch": 4.664723032069971,
"grad_norm": 0.31135523319244385,
"learning_rate": 0.0005443442240373396,
"loss": 3.5733,
"step": 16000
},
{
"epoch": 4.664723032069971,
"eval_accuracy": 0.35775051843090133,
"eval_loss": 3.649756669998169,
"eval_runtime": 182.7531,
"eval_samples_per_second": 90.964,
"eval_steps_per_second": 5.685,
"step": 16000
},
{
"epoch": 4.679300291545189,
"grad_norm": 0.3205879330635071,
"learning_rate": 0.0005441691948658109,
"loss": 3.577,
"step": 16050
},
{
"epoch": 4.6938775510204085,
"grad_norm": 0.31886178255081177,
"learning_rate": 0.0005439941656942823,
"loss": 3.5797,
"step": 16100
},
{
"epoch": 4.708454810495627,
"grad_norm": 0.2951045334339142,
"learning_rate": 0.0005438191365227538,
"loss": 3.5806,
"step": 16150
},
{
"epoch": 4.723032069970845,
"grad_norm": 0.31141290068626404,
"learning_rate": 0.0005436441073512252,
"loss": 3.5731,
"step": 16200
},
{
"epoch": 4.737609329446064,
"grad_norm": 0.3445282280445099,
"learning_rate": 0.0005434690781796965,
"loss": 3.5716,
"step": 16250
},
{
"epoch": 4.752186588921282,
"grad_norm": 0.30949369072914124,
"learning_rate": 0.0005432940490081679,
"loss": 3.5776,
"step": 16300
},
{
"epoch": 4.766763848396502,
"grad_norm": 0.3139699101448059,
"learning_rate": 0.0005431190198366394,
"loss": 3.5786,
"step": 16350
},
{
"epoch": 4.78134110787172,
"grad_norm": 0.3123582601547241,
"learning_rate": 0.0005429439906651108,
"loss": 3.5759,
"step": 16400
},
{
"epoch": 4.795918367346939,
"grad_norm": 0.3623657822608948,
"learning_rate": 0.0005427689614935822,
"loss": 3.5991,
"step": 16450
},
{
"epoch": 4.810495626822157,
"grad_norm": 0.32249704003334045,
"learning_rate": 0.0005425939323220536,
"loss": 3.5727,
"step": 16500
},
{
"epoch": 4.825072886297376,
"grad_norm": 0.3091285824775696,
"learning_rate": 0.0005424189031505251,
"loss": 3.5919,
"step": 16550
},
{
"epoch": 4.839650145772595,
"grad_norm": 0.2962740659713745,
"learning_rate": 0.0005422438739789965,
"loss": 3.585,
"step": 16600
},
{
"epoch": 4.854227405247814,
"grad_norm": 0.29750195145606995,
"learning_rate": 0.0005420688448074678,
"loss": 3.5741,
"step": 16650
},
{
"epoch": 4.868804664723032,
"grad_norm": 0.30312204360961914,
"learning_rate": 0.0005418938156359393,
"loss": 3.5765,
"step": 16700
},
{
"epoch": 4.88338192419825,
"grad_norm": 0.2950429916381836,
"learning_rate": 0.0005417187864644107,
"loss": 3.5762,
"step": 16750
},
{
"epoch": 4.8979591836734695,
"grad_norm": 0.3145018517971039,
"learning_rate": 0.0005415437572928821,
"loss": 3.5797,
"step": 16800
},
{
"epoch": 4.912536443148688,
"grad_norm": 0.3003164827823639,
"learning_rate": 0.0005413687281213534,
"loss": 3.5873,
"step": 16850
},
{
"epoch": 4.927113702623907,
"grad_norm": 0.308309942483902,
"learning_rate": 0.0005411936989498249,
"loss": 3.5877,
"step": 16900
},
{
"epoch": 4.941690962099125,
"grad_norm": 0.29779040813446045,
"learning_rate": 0.0005410186697782963,
"loss": 3.5722,
"step": 16950
},
{
"epoch": 4.956268221574344,
"grad_norm": 0.3193836212158203,
"learning_rate": 0.0005408436406067678,
"loss": 3.5731,
"step": 17000
},
{
"epoch": 4.956268221574344,
"eval_accuracy": 0.3591342957344579,
"eval_loss": 3.6355204582214355,
"eval_runtime": 182.745,
"eval_samples_per_second": 90.968,
"eval_steps_per_second": 5.686,
"step": 17000
},
{
"epoch": 4.970845481049563,
"grad_norm": 0.3114926517009735,
"learning_rate": 0.0005406686114352391,
"loss": 3.571,
"step": 17050
},
{
"epoch": 4.985422740524781,
"grad_norm": 0.3078295588493347,
"learning_rate": 0.0005404935822637106,
"loss": 3.579,
"step": 17100
},
{
"epoch": 5.0,
"grad_norm": 0.33132073283195496,
"learning_rate": 0.000540318553092182,
"loss": 3.5752,
"step": 17150
},
{
"epoch": 5.014577259475218,
"grad_norm": 0.33800360560417175,
"learning_rate": 0.0005401435239206534,
"loss": 3.4588,
"step": 17200
},
{
"epoch": 5.029154518950437,
"grad_norm": 0.33555933833122253,
"learning_rate": 0.0005399684947491249,
"loss": 3.475,
"step": 17250
},
{
"epoch": 5.043731778425656,
"grad_norm": 0.3214545249938965,
"learning_rate": 0.0005397934655775962,
"loss": 3.4702,
"step": 17300
},
{
"epoch": 5.058309037900875,
"grad_norm": 0.32993388175964355,
"learning_rate": 0.0005396184364060676,
"loss": 3.4876,
"step": 17350
},
{
"epoch": 5.072886297376093,
"grad_norm": 0.3097357153892517,
"learning_rate": 0.0005394434072345391,
"loss": 3.4873,
"step": 17400
},
{
"epoch": 5.087463556851312,
"grad_norm": 0.3072371482849121,
"learning_rate": 0.0005392683780630105,
"loss": 3.4899,
"step": 17450
},
{
"epoch": 5.1020408163265305,
"grad_norm": 0.3170117139816284,
"learning_rate": 0.0005390933488914818,
"loss": 3.4837,
"step": 17500
},
{
"epoch": 5.11661807580175,
"grad_norm": 0.30793899297714233,
"learning_rate": 0.0005389183197199533,
"loss": 3.4947,
"step": 17550
},
{
"epoch": 5.131195335276968,
"grad_norm": 0.32247328758239746,
"learning_rate": 0.0005387432905484247,
"loss": 3.479,
"step": 17600
},
{
"epoch": 5.145772594752186,
"grad_norm": 0.3052769601345062,
"learning_rate": 0.0005385682613768962,
"loss": 3.4999,
"step": 17650
},
{
"epoch": 5.160349854227405,
"grad_norm": 0.31978240609169006,
"learning_rate": 0.0005383932322053675,
"loss": 3.495,
"step": 17700
},
{
"epoch": 5.174927113702624,
"grad_norm": 0.3358873426914215,
"learning_rate": 0.0005382182030338389,
"loss": 3.5045,
"step": 17750
},
{
"epoch": 5.189504373177843,
"grad_norm": 0.3461468815803528,
"learning_rate": 0.0005380431738623104,
"loss": 3.4943,
"step": 17800
},
{
"epoch": 5.204081632653061,
"grad_norm": 0.3337807357311249,
"learning_rate": 0.0005378681446907818,
"loss": 3.5,
"step": 17850
},
{
"epoch": 5.21865889212828,
"grad_norm": 0.34245598316192627,
"learning_rate": 0.0005376931155192531,
"loss": 3.5111,
"step": 17900
},
{
"epoch": 5.233236151603498,
"grad_norm": 0.32653409242630005,
"learning_rate": 0.0005375180863477246,
"loss": 3.5028,
"step": 17950
},
{
"epoch": 5.247813411078718,
"grad_norm": 0.330037921667099,
"learning_rate": 0.000537343057176196,
"loss": 3.5143,
"step": 18000
},
{
"epoch": 5.247813411078718,
"eval_accuracy": 0.3593785609751963,
"eval_loss": 3.6401000022888184,
"eval_runtime": 182.7236,
"eval_samples_per_second": 90.979,
"eval_steps_per_second": 5.686,
"step": 18000
},
{
"epoch": 5.262390670553936,
"grad_norm": 0.30893272161483765,
"learning_rate": 0.0005371680280046674,
"loss": 3.5003,
"step": 18050
},
{
"epoch": 5.276967930029155,
"grad_norm": 0.3184139132499695,
"learning_rate": 0.0005369929988331387,
"loss": 3.5157,
"step": 18100
},
{
"epoch": 5.291545189504373,
"grad_norm": 0.3190644681453705,
"learning_rate": 0.0005368179696616102,
"loss": 3.5079,
"step": 18150
},
{
"epoch": 5.3061224489795915,
"grad_norm": 0.31882479786872864,
"learning_rate": 0.0005366429404900816,
"loss": 3.499,
"step": 18200
},
{
"epoch": 5.320699708454811,
"grad_norm": 0.31382501125335693,
"learning_rate": 0.0005364679113185531,
"loss": 3.5072,
"step": 18250
},
{
"epoch": 5.335276967930029,
"grad_norm": 0.3288552463054657,
"learning_rate": 0.0005362928821470244,
"loss": 3.504,
"step": 18300
},
{
"epoch": 5.349854227405248,
"grad_norm": 0.29884201288223267,
"learning_rate": 0.0005361178529754959,
"loss": 3.5116,
"step": 18350
},
{
"epoch": 5.364431486880466,
"grad_norm": 0.3015066087245941,
"learning_rate": 0.0005359428238039673,
"loss": 3.5072,
"step": 18400
},
{
"epoch": 5.3790087463556855,
"grad_norm": 0.334290474653244,
"learning_rate": 0.0005357677946324387,
"loss": 3.5136,
"step": 18450
},
{
"epoch": 5.393586005830904,
"grad_norm": 0.3181033730506897,
"learning_rate": 0.0005355927654609101,
"loss": 3.524,
"step": 18500
},
{
"epoch": 5.408163265306122,
"grad_norm": 0.33072495460510254,
"learning_rate": 0.0005354177362893815,
"loss": 3.5146,
"step": 18550
},
{
"epoch": 5.422740524781341,
"grad_norm": 0.33785611391067505,
"learning_rate": 0.0005352427071178529,
"loss": 3.5204,
"step": 18600
},
{
"epoch": 5.437317784256559,
"grad_norm": 0.31333351135253906,
"learning_rate": 0.0005350676779463244,
"loss": 3.5243,
"step": 18650
},
{
"epoch": 5.451895043731779,
"grad_norm": 0.32065486907958984,
"learning_rate": 0.0005348926487747957,
"loss": 3.5175,
"step": 18700
},
{
"epoch": 5.466472303206997,
"grad_norm": 0.3304148018360138,
"learning_rate": 0.0005347176196032671,
"loss": 3.528,
"step": 18750
},
{
"epoch": 5.481049562682216,
"grad_norm": 0.31169185042381287,
"learning_rate": 0.0005345425904317386,
"loss": 3.5387,
"step": 18800
},
{
"epoch": 5.495626822157434,
"grad_norm": 0.3135135769844055,
"learning_rate": 0.00053436756126021,
"loss": 3.5116,
"step": 18850
},
{
"epoch": 5.510204081632653,
"grad_norm": 0.30703073740005493,
"learning_rate": 0.0005341925320886814,
"loss": 3.5188,
"step": 18900
},
{
"epoch": 5.524781341107872,
"grad_norm": 0.302874892950058,
"learning_rate": 0.0005340175029171528,
"loss": 3.5403,
"step": 18950
},
{
"epoch": 5.539358600583091,
"grad_norm": 0.32535436749458313,
"learning_rate": 0.0005338424737456243,
"loss": 3.5218,
"step": 19000
},
{
"epoch": 5.539358600583091,
"eval_accuracy": 0.36021494870312226,
"eval_loss": 3.628293752670288,
"eval_runtime": 182.4386,
"eval_samples_per_second": 91.121,
"eval_steps_per_second": 5.695,
"step": 19000
},
{
"epoch": 5.553935860058309,
"grad_norm": 0.3248957395553589,
"learning_rate": 0.0005336674445740957,
"loss": 3.5256,
"step": 19050
},
{
"epoch": 5.568513119533527,
"grad_norm": 0.31554681062698364,
"learning_rate": 0.000533492415402567,
"loss": 3.5352,
"step": 19100
},
{
"epoch": 5.5830903790087465,
"grad_norm": 0.3147410452365875,
"learning_rate": 0.0005333173862310384,
"loss": 3.5234,
"step": 19150
},
{
"epoch": 5.597667638483965,
"grad_norm": 0.3296838402748108,
"learning_rate": 0.0005331423570595099,
"loss": 3.5218,
"step": 19200
},
{
"epoch": 5.612244897959184,
"grad_norm": 0.34120234847068787,
"learning_rate": 0.0005329673278879813,
"loss": 3.5284,
"step": 19250
},
{
"epoch": 5.626822157434402,
"grad_norm": 0.3368380665779114,
"learning_rate": 0.0005327922987164526,
"loss": 3.5254,
"step": 19300
},
{
"epoch": 5.641399416909621,
"grad_norm": 0.31956803798675537,
"learning_rate": 0.0005326172695449241,
"loss": 3.5199,
"step": 19350
},
{
"epoch": 5.65597667638484,
"grad_norm": 0.3001638352870941,
"learning_rate": 0.0005324422403733955,
"loss": 3.5285,
"step": 19400
},
{
"epoch": 5.670553935860058,
"grad_norm": 0.3221629559993744,
"learning_rate": 0.000532267211201867,
"loss": 3.5179,
"step": 19450
},
{
"epoch": 5.685131195335277,
"grad_norm": 0.30188408493995667,
"learning_rate": 0.0005320921820303384,
"loss": 3.5368,
"step": 19500
},
{
"epoch": 5.699708454810495,
"grad_norm": 0.3237511217594147,
"learning_rate": 0.0005319171528588098,
"loss": 3.5325,
"step": 19550
},
{
"epoch": 5.714285714285714,
"grad_norm": 0.30857592821121216,
"learning_rate": 0.0005317421236872812,
"loss": 3.5249,
"step": 19600
},
{
"epoch": 5.728862973760933,
"grad_norm": 0.3192301094532013,
"learning_rate": 0.0005315670945157526,
"loss": 3.5469,
"step": 19650
},
{
"epoch": 5.743440233236152,
"grad_norm": 0.29896432161331177,
"learning_rate": 0.000531392065344224,
"loss": 3.526,
"step": 19700
},
{
"epoch": 5.75801749271137,
"grad_norm": 0.3077335059642792,
"learning_rate": 0.0005312170361726954,
"loss": 3.535,
"step": 19750
},
{
"epoch": 5.772594752186589,
"grad_norm": 0.3408810496330261,
"learning_rate": 0.0005310420070011668,
"loss": 3.5384,
"step": 19800
},
{
"epoch": 5.7871720116618075,
"grad_norm": 0.32832178473472595,
"learning_rate": 0.0005308669778296382,
"loss": 3.5298,
"step": 19850
},
{
"epoch": 5.801749271137027,
"grad_norm": 0.3311011493206024,
"learning_rate": 0.0005306919486581097,
"loss": 3.5337,
"step": 19900
},
{
"epoch": 5.816326530612245,
"grad_norm": 0.3176785707473755,
"learning_rate": 0.000530516919486581,
"loss": 3.5446,
"step": 19950
},
{
"epoch": 5.830903790087463,
"grad_norm": 0.321762353181839,
"learning_rate": 0.0005303418903150524,
"loss": 3.528,
"step": 20000
},
{
"epoch": 5.830903790087463,
"eval_accuracy": 0.3612652303792033,
"eval_loss": 3.6179208755493164,
"eval_runtime": 182.0938,
"eval_samples_per_second": 91.294,
"eval_steps_per_second": 5.706,
"step": 20000
},
{
"epoch": 5.845481049562682,
"grad_norm": 0.30524224042892456,
"learning_rate": 0.0005301668611435239,
"loss": 3.5328,
"step": 20050
},
{
"epoch": 5.860058309037901,
"grad_norm": 0.333676278591156,
"learning_rate": 0.0005299918319719953,
"loss": 3.5335,
"step": 20100
},
{
"epoch": 5.87463556851312,
"grad_norm": 0.30587267875671387,
"learning_rate": 0.0005298168028004667,
"loss": 3.528,
"step": 20150
},
{
"epoch": 5.889212827988338,
"grad_norm": 0.309893399477005,
"learning_rate": 0.0005296417736289381,
"loss": 3.5382,
"step": 20200
},
{
"epoch": 5.903790087463557,
"grad_norm": 0.3243180215358734,
"learning_rate": 0.0005294667444574096,
"loss": 3.5356,
"step": 20250
},
{
"epoch": 5.918367346938775,
"grad_norm": 0.3317978084087372,
"learning_rate": 0.000529291715285881,
"loss": 3.5329,
"step": 20300
},
{
"epoch": 5.932944606413994,
"grad_norm": 0.3489520847797394,
"learning_rate": 0.0005291166861143523,
"loss": 3.5281,
"step": 20350
},
{
"epoch": 5.947521865889213,
"grad_norm": 0.3286189138889313,
"learning_rate": 0.0005289416569428237,
"loss": 3.53,
"step": 20400
},
{
"epoch": 5.962099125364432,
"grad_norm": 0.32896900177001953,
"learning_rate": 0.0005287666277712952,
"loss": 3.5365,
"step": 20450
},
{
"epoch": 5.97667638483965,
"grad_norm": 0.3125670254230499,
"learning_rate": 0.0005285915985997666,
"loss": 3.5392,
"step": 20500
},
{
"epoch": 5.9912536443148685,
"grad_norm": 0.3274867534637451,
"learning_rate": 0.0005284165694282379,
"loss": 3.5389,
"step": 20550
},
{
"epoch": 6.005830903790088,
"grad_norm": 0.3022937774658203,
"learning_rate": 0.0005282415402567094,
"loss": 3.4769,
"step": 20600
},
{
"epoch": 6.020408163265306,
"grad_norm": 0.319007009267807,
"learning_rate": 0.0005280665110851808,
"loss": 3.423,
"step": 20650
},
{
"epoch": 6.034985422740525,
"grad_norm": 0.33166149258613586,
"learning_rate": 0.0005278914819136523,
"loss": 3.4239,
"step": 20700
},
{
"epoch": 6.049562682215743,
"grad_norm": 0.31625524163246155,
"learning_rate": 0.0005277164527421236,
"loss": 3.4286,
"step": 20750
},
{
"epoch": 6.0641399416909625,
"grad_norm": 0.32785487174987793,
"learning_rate": 0.0005275414235705951,
"loss": 3.4299,
"step": 20800
},
{
"epoch": 6.078717201166181,
"grad_norm": 0.32926735281944275,
"learning_rate": 0.0005273663943990665,
"loss": 3.4416,
"step": 20850
},
{
"epoch": 6.093294460641399,
"grad_norm": 0.316875696182251,
"learning_rate": 0.0005271913652275379,
"loss": 3.4401,
"step": 20900
},
{
"epoch": 6.107871720116618,
"grad_norm": 0.3242247402667999,
"learning_rate": 0.0005270163360560092,
"loss": 3.4508,
"step": 20950
},
{
"epoch": 6.122448979591836,
"grad_norm": 0.3237857222557068,
"learning_rate": 0.0005268413068844807,
"loss": 3.446,
"step": 21000
},
{
"epoch": 6.122448979591836,
"eval_accuracy": 0.36163439461773605,
"eval_loss": 3.6209189891815186,
"eval_runtime": 182.3861,
"eval_samples_per_second": 91.147,
"eval_steps_per_second": 5.697,
"step": 21000
},
{
"epoch": 6.137026239067056,
"grad_norm": 0.34047290682792664,
"learning_rate": 0.0005266662777129521,
"loss": 3.4436,
"step": 21050
},
{
"epoch": 6.151603498542274,
"grad_norm": 0.31786465644836426,
"learning_rate": 0.0005264912485414235,
"loss": 3.4569,
"step": 21100
},
{
"epoch": 6.166180758017493,
"grad_norm": 0.3193897604942322,
"learning_rate": 0.000526316219369895,
"loss": 3.4581,
"step": 21150
},
{
"epoch": 6.180758017492711,
"grad_norm": 0.32340869307518005,
"learning_rate": 0.0005261411901983663,
"loss": 3.4661,
"step": 21200
},
{
"epoch": 6.19533527696793,
"grad_norm": 0.3188331723213196,
"learning_rate": 0.0005259661610268378,
"loss": 3.4469,
"step": 21250
},
{
"epoch": 6.209912536443149,
"grad_norm": 0.3093229830265045,
"learning_rate": 0.0005257911318553092,
"loss": 3.4518,
"step": 21300
},
{
"epoch": 6.224489795918367,
"grad_norm": 0.3273768424987793,
"learning_rate": 0.0005256161026837807,
"loss": 3.4675,
"step": 21350
},
{
"epoch": 6.239067055393586,
"grad_norm": 0.3344630300998688,
"learning_rate": 0.000525441073512252,
"loss": 3.4561,
"step": 21400
},
{
"epoch": 6.253644314868804,
"grad_norm": 0.3129141628742218,
"learning_rate": 0.0005252660443407234,
"loss": 3.4563,
"step": 21450
},
{
"epoch": 6.2682215743440235,
"grad_norm": 0.32025453448295593,
"learning_rate": 0.0005250910151691949,
"loss": 3.4571,
"step": 21500
},
{
"epoch": 6.282798833819242,
"grad_norm": 0.3402857482433319,
"learning_rate": 0.0005249159859976663,
"loss": 3.4777,
"step": 21550
},
{
"epoch": 6.297376093294461,
"grad_norm": 0.32615235447883606,
"learning_rate": 0.0005247409568261376,
"loss": 3.4708,
"step": 21600
},
{
"epoch": 6.311953352769679,
"grad_norm": 0.33531254529953003,
"learning_rate": 0.000524565927654609,
"loss": 3.4549,
"step": 21650
},
{
"epoch": 6.326530612244898,
"grad_norm": 0.33214399218559265,
"learning_rate": 0.0005243908984830805,
"loss": 3.4745,
"step": 21700
},
{
"epoch": 6.341107871720117,
"grad_norm": 0.34065625071525574,
"learning_rate": 0.0005242158693115519,
"loss": 3.4731,
"step": 21750
},
{
"epoch": 6.355685131195335,
"grad_norm": 0.35216084122657776,
"learning_rate": 0.0005240408401400232,
"loss": 3.4692,
"step": 21800
},
{
"epoch": 6.370262390670554,
"grad_norm": 0.337293803691864,
"learning_rate": 0.0005238658109684947,
"loss": 3.4719,
"step": 21850
},
{
"epoch": 6.384839650145772,
"grad_norm": 0.3386717140674591,
"learning_rate": 0.0005236907817969661,
"loss": 3.4759,
"step": 21900
},
{
"epoch": 6.399416909620991,
"grad_norm": 0.3118914067745209,
"learning_rate": 0.0005235157526254376,
"loss": 3.4722,
"step": 21950
},
{
"epoch": 6.41399416909621,
"grad_norm": 0.31701526045799255,
"learning_rate": 0.0005233407234539089,
"loss": 3.479,
"step": 22000
},
{
"epoch": 6.41399416909621,
"eval_accuracy": 0.36226359833424054,
"eval_loss": 3.6132073402404785,
"eval_runtime": 182.1237,
"eval_samples_per_second": 91.279,
"eval_steps_per_second": 5.705,
"step": 22000
},
{
"epoch": 6.428571428571429,
"grad_norm": 0.33094874024391174,
"learning_rate": 0.0005231656942823804,
"loss": 3.4769,
"step": 22050
},
{
"epoch": 6.443148688046647,
"grad_norm": 0.31896036863327026,
"learning_rate": 0.0005229906651108518,
"loss": 3.4843,
"step": 22100
},
{
"epoch": 6.457725947521866,
"grad_norm": 0.33175987005233765,
"learning_rate": 0.0005228156359393232,
"loss": 3.4735,
"step": 22150
},
{
"epoch": 6.4723032069970845,
"grad_norm": 0.3212224543094635,
"learning_rate": 0.0005226406067677945,
"loss": 3.4855,
"step": 22200
},
{
"epoch": 6.486880466472304,
"grad_norm": 0.3497124910354614,
"learning_rate": 0.000522465577596266,
"loss": 3.4818,
"step": 22250
},
{
"epoch": 6.501457725947522,
"grad_norm": 0.3271764814853668,
"learning_rate": 0.0005222905484247374,
"loss": 3.4805,
"step": 22300
},
{
"epoch": 6.51603498542274,
"grad_norm": 0.31307926774024963,
"learning_rate": 0.0005221155192532088,
"loss": 3.4754,
"step": 22350
},
{
"epoch": 6.530612244897959,
"grad_norm": 0.3359667956829071,
"learning_rate": 0.0005219404900816802,
"loss": 3.4805,
"step": 22400
},
{
"epoch": 6.545189504373178,
"grad_norm": 0.3326173722743988,
"learning_rate": 0.0005217654609101516,
"loss": 3.4845,
"step": 22450
},
{
"epoch": 6.559766763848397,
"grad_norm": 0.3412718176841736,
"learning_rate": 0.0005215904317386231,
"loss": 3.4866,
"step": 22500
},
{
"epoch": 6.574344023323615,
"grad_norm": 0.33276066184043884,
"learning_rate": 0.0005214154025670945,
"loss": 3.4816,
"step": 22550
},
{
"epoch": 6.588921282798834,
"grad_norm": 0.30933451652526855,
"learning_rate": 0.000521240373395566,
"loss": 3.4847,
"step": 22600
},
{
"epoch": 6.603498542274052,
"grad_norm": 0.38130906224250793,
"learning_rate": 0.0005210653442240373,
"loss": 3.4943,
"step": 22650
},
{
"epoch": 6.618075801749271,
"grad_norm": 0.3150744140148163,
"learning_rate": 0.0005208903150525087,
"loss": 3.4801,
"step": 22700
},
{
"epoch": 6.63265306122449,
"grad_norm": 0.31047096848487854,
"learning_rate": 0.0005207152858809802,
"loss": 3.4828,
"step": 22750
},
{
"epoch": 6.647230320699708,
"grad_norm": 0.32708585262298584,
"learning_rate": 0.0005205402567094515,
"loss": 3.4926,
"step": 22800
},
{
"epoch": 6.661807580174927,
"grad_norm": 0.36320850253105164,
"learning_rate": 0.0005203652275379229,
"loss": 3.4778,
"step": 22850
},
{
"epoch": 6.6763848396501455,
"grad_norm": 0.3161414861679077,
"learning_rate": 0.0005201901983663943,
"loss": 3.4908,
"step": 22900
},
{
"epoch": 6.690962099125365,
"grad_norm": 0.3203990161418915,
"learning_rate": 0.0005200151691948658,
"loss": 3.5021,
"step": 22950
},
{
"epoch": 6.705539358600583,
"grad_norm": 0.31184670329093933,
"learning_rate": 0.0005198401400233371,
"loss": 3.4919,
"step": 23000
},
{
"epoch": 6.705539358600583,
"eval_accuracy": 0.363203107195124,
"eval_loss": 3.604387044906616,
"eval_runtime": 183.0205,
"eval_samples_per_second": 90.831,
"eval_steps_per_second": 5.677,
"step": 23000
},
{
"epoch": 6.720116618075802,
"grad_norm": 0.33052515983581543,
"learning_rate": 0.0005196651108518086,
"loss": 3.4911,
"step": 23050
},
{
"epoch": 6.73469387755102,
"grad_norm": 0.35309287905693054,
"learning_rate": 0.00051949008168028,
"loss": 3.4878,
"step": 23100
},
{
"epoch": 6.7492711370262395,
"grad_norm": 0.3378291726112366,
"learning_rate": 0.0005193150525087515,
"loss": 3.4892,
"step": 23150
},
{
"epoch": 6.763848396501458,
"grad_norm": 0.3175658583641052,
"learning_rate": 0.0005191400233372229,
"loss": 3.4935,
"step": 23200
},
{
"epoch": 6.778425655976676,
"grad_norm": 0.31248053908348083,
"learning_rate": 0.0005189649941656942,
"loss": 3.5044,
"step": 23250
},
{
"epoch": 6.793002915451895,
"grad_norm": 0.30260851979255676,
"learning_rate": 0.0005187899649941657,
"loss": 3.505,
"step": 23300
},
{
"epoch": 6.807580174927113,
"grad_norm": 0.3121766149997711,
"learning_rate": 0.0005186149358226371,
"loss": 3.4833,
"step": 23350
},
{
"epoch": 6.822157434402333,
"grad_norm": 0.3289031982421875,
"learning_rate": 0.0005184399066511085,
"loss": 3.4957,
"step": 23400
},
{
"epoch": 6.836734693877551,
"grad_norm": 0.32009539008140564,
"learning_rate": 0.0005182648774795798,
"loss": 3.4924,
"step": 23450
},
{
"epoch": 6.85131195335277,
"grad_norm": 0.3235000669956207,
"learning_rate": 0.0005180898483080513,
"loss": 3.4927,
"step": 23500
},
{
"epoch": 6.865889212827988,
"grad_norm": 0.3122546970844269,
"learning_rate": 0.0005179148191365227,
"loss": 3.5006,
"step": 23550
},
{
"epoch": 6.8804664723032065,
"grad_norm": 0.3026493191719055,
"learning_rate": 0.000517739789964994,
"loss": 3.4974,
"step": 23600
},
{
"epoch": 6.895043731778426,
"grad_norm": 0.30712851881980896,
"learning_rate": 0.0005175647607934655,
"loss": 3.4842,
"step": 23650
},
{
"epoch": 6.909620991253644,
"grad_norm": 0.32958361506462097,
"learning_rate": 0.0005173897316219369,
"loss": 3.5019,
"step": 23700
},
{
"epoch": 6.924198250728863,
"grad_norm": 0.32281431555747986,
"learning_rate": 0.0005172147024504084,
"loss": 3.4878,
"step": 23750
},
{
"epoch": 6.938775510204081,
"grad_norm": 0.34471213817596436,
"learning_rate": 0.0005170396732788798,
"loss": 3.4986,
"step": 23800
},
{
"epoch": 6.9533527696793005,
"grad_norm": 0.31065723299980164,
"learning_rate": 0.0005168646441073512,
"loss": 3.4782,
"step": 23850
},
{
"epoch": 6.967930029154519,
"grad_norm": 0.3147946894168854,
"learning_rate": 0.0005166896149358226,
"loss": 3.5057,
"step": 23900
},
{
"epoch": 6.982507288629738,
"grad_norm": 0.3277629613876343,
"learning_rate": 0.000516514585764294,
"loss": 3.4993,
"step": 23950
},
{
"epoch": 6.997084548104956,
"grad_norm": 0.3514486849308014,
"learning_rate": 0.0005163395565927655,
"loss": 3.505,
"step": 24000
},
{
"epoch": 6.997084548104956,
"eval_accuracy": 0.3639886406657011,
"eval_loss": 3.593942642211914,
"eval_runtime": 187.1371,
"eval_samples_per_second": 88.833,
"eval_steps_per_second": 5.552,
"step": 24000
},
{
"epoch": 7.011661807580175,
"grad_norm": 0.3496927320957184,
"learning_rate": 0.0005161645274212368,
"loss": 3.3972,
"step": 24050
},
{
"epoch": 7.026239067055394,
"grad_norm": 0.3268931806087494,
"learning_rate": 0.0005159894982497082,
"loss": 3.3784,
"step": 24100
},
{
"epoch": 7.040816326530612,
"grad_norm": 0.34126970171928406,
"learning_rate": 0.0005158144690781796,
"loss": 3.3877,
"step": 24150
},
{
"epoch": 7.055393586005831,
"grad_norm": 0.3556489646434784,
"learning_rate": 0.0005156394399066511,
"loss": 3.3915,
"step": 24200
},
{
"epoch": 7.069970845481049,
"grad_norm": 0.341993123292923,
"learning_rate": 0.0005154644107351224,
"loss": 3.3872,
"step": 24250
},
{
"epoch": 7.084548104956268,
"grad_norm": 0.32109686732292175,
"learning_rate": 0.0005152893815635939,
"loss": 3.3911,
"step": 24300
},
{
"epoch": 7.099125364431487,
"grad_norm": 0.33900973200798035,
"learning_rate": 0.0005151143523920653,
"loss": 3.3986,
"step": 24350
},
{
"epoch": 7.113702623906706,
"grad_norm": 0.3317485749721527,
"learning_rate": 0.0005149393232205368,
"loss": 3.411,
"step": 24400
},
{
"epoch": 7.128279883381924,
"grad_norm": 0.31022968888282776,
"learning_rate": 0.0005147642940490081,
"loss": 3.4088,
"step": 24450
},
{
"epoch": 7.142857142857143,
"grad_norm": 0.3452603220939636,
"learning_rate": 0.0005145892648774795,
"loss": 3.4057,
"step": 24500
},
{
"epoch": 7.1574344023323615,
"grad_norm": 0.3521261513233185,
"learning_rate": 0.000514414235705951,
"loss": 3.4058,
"step": 24550
},
{
"epoch": 7.17201166180758,
"grad_norm": 0.32531312108039856,
"learning_rate": 0.0005142392065344224,
"loss": 3.4137,
"step": 24600
},
{
"epoch": 7.186588921282799,
"grad_norm": 0.32090941071510315,
"learning_rate": 0.0005140641773628937,
"loss": 3.4222,
"step": 24650
},
{
"epoch": 7.201166180758017,
"grad_norm": 0.35693666338920593,
"learning_rate": 0.0005138891481913651,
"loss": 3.4178,
"step": 24700
},
{
"epoch": 7.215743440233236,
"grad_norm": 0.3247111141681671,
"learning_rate": 0.0005137141190198366,
"loss": 3.4282,
"step": 24750
},
{
"epoch": 7.230320699708455,
"grad_norm": 0.3346105217933655,
"learning_rate": 0.000513539089848308,
"loss": 3.4282,
"step": 24800
},
{
"epoch": 7.244897959183674,
"grad_norm": 0.328382670879364,
"learning_rate": 0.0005133640606767795,
"loss": 3.4372,
"step": 24850
},
{
"epoch": 7.259475218658892,
"grad_norm": 0.33675169944763184,
"learning_rate": 0.0005131890315052508,
"loss": 3.4343,
"step": 24900
},
{
"epoch": 7.274052478134111,
"grad_norm": 0.3539164960384369,
"learning_rate": 0.0005130140023337223,
"loss": 3.4344,
"step": 24950
},
{
"epoch": 7.288629737609329,
"grad_norm": 0.33521610498428345,
"learning_rate": 0.0005128389731621937,
"loss": 3.4421,
"step": 25000
},
{
"epoch": 7.288629737609329,
"eval_accuracy": 0.36366291443865373,
"eval_loss": 3.6036746501922607,
"eval_runtime": 183.2654,
"eval_samples_per_second": 90.71,
"eval_steps_per_second": 5.669,
"step": 25000
},
{
"epoch": 7.303206997084548,
"grad_norm": 0.3432515561580658,
"learning_rate": 0.000512663943990665,
"loss": 3.4345,
"step": 25050
},
{
"epoch": 7.317784256559767,
"grad_norm": 0.3308854103088379,
"learning_rate": 0.0005124889148191365,
"loss": 3.4333,
"step": 25100
},
{
"epoch": 7.332361516034985,
"grad_norm": 0.3173339068889618,
"learning_rate": 0.0005123138856476079,
"loss": 3.4399,
"step": 25150
},
{
"epoch": 7.346938775510204,
"grad_norm": 0.3661780059337616,
"learning_rate": 0.0005121388564760793,
"loss": 3.4486,
"step": 25200
},
{
"epoch": 7.3615160349854225,
"grad_norm": 0.3121057152748108,
"learning_rate": 0.0005119638273045508,
"loss": 3.4385,
"step": 25250
},
{
"epoch": 7.376093294460642,
"grad_norm": 0.33542507886886597,
"learning_rate": 0.0005117887981330221,
"loss": 3.4495,
"step": 25300
},
{
"epoch": 7.39067055393586,
"grad_norm": 0.3543615937232971,
"learning_rate": 0.0005116137689614935,
"loss": 3.443,
"step": 25350
},
{
"epoch": 7.405247813411079,
"grad_norm": 0.3314652144908905,
"learning_rate": 0.0005114387397899649,
"loss": 3.4458,
"step": 25400
},
{
"epoch": 7.419825072886297,
"grad_norm": 0.34093043208122253,
"learning_rate": 0.0005112637106184364,
"loss": 3.4395,
"step": 25450
},
{
"epoch": 7.4344023323615165,
"grad_norm": 0.32408225536346436,
"learning_rate": 0.0005110886814469077,
"loss": 3.4368,
"step": 25500
},
{
"epoch": 7.448979591836735,
"grad_norm": 0.3401145935058594,
"learning_rate": 0.0005109136522753792,
"loss": 3.4453,
"step": 25550
},
{
"epoch": 7.463556851311953,
"grad_norm": 0.3173987865447998,
"learning_rate": 0.0005107386231038506,
"loss": 3.4492,
"step": 25600
},
{
"epoch": 7.478134110787172,
"grad_norm": 0.3380630612373352,
"learning_rate": 0.0005105635939323221,
"loss": 3.4549,
"step": 25650
},
{
"epoch": 7.49271137026239,
"grad_norm": 0.32536083459854126,
"learning_rate": 0.0005103885647607934,
"loss": 3.4493,
"step": 25700
},
{
"epoch": 7.50728862973761,
"grad_norm": 0.32971838116645813,
"learning_rate": 0.0005102135355892648,
"loss": 3.4459,
"step": 25750
},
{
"epoch": 7.521865889212828,
"grad_norm": 0.3344475328922272,
"learning_rate": 0.0005100385064177363,
"loss": 3.4463,
"step": 25800
},
{
"epoch": 7.536443148688047,
"grad_norm": 0.32902637124061584,
"learning_rate": 0.0005098634772462077,
"loss": 3.4497,
"step": 25850
},
{
"epoch": 7.551020408163265,
"grad_norm": 0.314765602350235,
"learning_rate": 0.000509688448074679,
"loss": 3.439,
"step": 25900
},
{
"epoch": 7.5655976676384835,
"grad_norm": 0.3501645028591156,
"learning_rate": 0.0005095134189031504,
"loss": 3.4522,
"step": 25950
},
{
"epoch": 7.580174927113703,
"grad_norm": 0.3293103277683258,
"learning_rate": 0.0005093383897316219,
"loss": 3.4527,
"step": 26000
},
{
"epoch": 7.580174927113703,
"eval_accuracy": 0.36490955005283193,
"eval_loss": 3.5910089015960693,
"eval_runtime": 182.9055,
"eval_samples_per_second": 90.888,
"eval_steps_per_second": 5.681,
"step": 26000
},
{
"epoch": 7.594752186588921,
"grad_norm": 0.3306674659252167,
"learning_rate": 0.0005091633605600933,
"loss": 3.4567,
"step": 26050
},
{
"epoch": 7.60932944606414,
"grad_norm": 0.33794093132019043,
"learning_rate": 0.0005089883313885647,
"loss": 3.4417,
"step": 26100
},
{
"epoch": 7.623906705539358,
"grad_norm": 0.32369881868362427,
"learning_rate": 0.0005088133022170361,
"loss": 3.4365,
"step": 26150
},
{
"epoch": 7.6384839650145775,
"grad_norm": 0.33512216806411743,
"learning_rate": 0.0005086382730455076,
"loss": 3.4494,
"step": 26200
},
{
"epoch": 7.653061224489796,
"grad_norm": 0.32545676827430725,
"learning_rate": 0.000508463243873979,
"loss": 3.4495,
"step": 26250
},
{
"epoch": 7.667638483965015,
"grad_norm": 0.32898396253585815,
"learning_rate": 0.0005082882147024503,
"loss": 3.4623,
"step": 26300
},
{
"epoch": 7.682215743440233,
"grad_norm": 0.33407220244407654,
"learning_rate": 0.0005081131855309218,
"loss": 3.4538,
"step": 26350
},
{
"epoch": 7.696793002915452,
"grad_norm": 0.32770851254463196,
"learning_rate": 0.0005079381563593932,
"loss": 3.4612,
"step": 26400
},
{
"epoch": 7.711370262390671,
"grad_norm": 0.3490068018436432,
"learning_rate": 0.0005077631271878646,
"loss": 3.4593,
"step": 26450
},
{
"epoch": 7.725947521865889,
"grad_norm": 0.3532838523387909,
"learning_rate": 0.000507588098016336,
"loss": 3.4614,
"step": 26500
},
{
"epoch": 7.740524781341108,
"grad_norm": 0.3085959255695343,
"learning_rate": 0.0005074130688448074,
"loss": 3.4539,
"step": 26550
},
{
"epoch": 7.755102040816326,
"grad_norm": 0.32022443413734436,
"learning_rate": 0.0005072380396732788,
"loss": 3.4592,
"step": 26600
},
{
"epoch": 7.769679300291545,
"grad_norm": 0.32210254669189453,
"learning_rate": 0.0005070630105017503,
"loss": 3.4593,
"step": 26650
},
{
"epoch": 7.784256559766764,
"grad_norm": 0.33314940333366394,
"learning_rate": 0.0005068879813302216,
"loss": 3.4635,
"step": 26700
},
{
"epoch": 7.798833819241983,
"grad_norm": 0.3578518331050873,
"learning_rate": 0.0005067129521586931,
"loss": 3.461,
"step": 26750
},
{
"epoch": 7.813411078717201,
"grad_norm": 0.33715447783470154,
"learning_rate": 0.0005065379229871645,
"loss": 3.4692,
"step": 26800
},
{
"epoch": 7.827988338192419,
"grad_norm": 0.32819151878356934,
"learning_rate": 0.0005063628938156359,
"loss": 3.4603,
"step": 26850
},
{
"epoch": 7.8425655976676385,
"grad_norm": 0.3273194134235382,
"learning_rate": 0.0005061878646441074,
"loss": 3.4548,
"step": 26900
},
{
"epoch": 7.857142857142857,
"grad_norm": 0.31844374537467957,
"learning_rate": 0.0005060128354725787,
"loss": 3.4487,
"step": 26950
},
{
"epoch": 7.871720116618076,
"grad_norm": 0.33120185136795044,
"learning_rate": 0.0005058378063010501,
"loss": 3.4547,
"step": 27000
},
{
"epoch": 7.871720116618076,
"eval_accuracy": 0.36534510734957026,
"eval_loss": 3.584082841873169,
"eval_runtime": 183.1945,
"eval_samples_per_second": 90.745,
"eval_steps_per_second": 5.672,
"step": 27000
},
{
"epoch": 7.886297376093294,
"grad_norm": 0.319904088973999,
"learning_rate": 0.0005056627771295216,
"loss": 3.4783,
"step": 27050
},
{
"epoch": 7.900874635568513,
"grad_norm": 0.34819895029067993,
"learning_rate": 0.000505487747957993,
"loss": 3.4657,
"step": 27100
},
{
"epoch": 7.915451895043732,
"grad_norm": 0.33924388885498047,
"learning_rate": 0.0005053127187864643,
"loss": 3.4583,
"step": 27150
},
{
"epoch": 7.930029154518951,
"grad_norm": 0.3199351727962494,
"learning_rate": 0.0005051376896149357,
"loss": 3.4614,
"step": 27200
},
{
"epoch": 7.944606413994169,
"grad_norm": 0.3343052268028259,
"learning_rate": 0.0005049626604434072,
"loss": 3.4726,
"step": 27250
},
{
"epoch": 7.959183673469388,
"grad_norm": 0.30949947237968445,
"learning_rate": 0.0005047876312718786,
"loss": 3.4688,
"step": 27300
},
{
"epoch": 7.973760932944606,
"grad_norm": 0.3280562162399292,
"learning_rate": 0.00050461260210035,
"loss": 3.4816,
"step": 27350
},
{
"epoch": 7.988338192419825,
"grad_norm": 0.3171611428260803,
"learning_rate": 0.0005044375729288214,
"loss": 3.4537,
"step": 27400
},
{
"epoch": 8.002915451895044,
"grad_norm": 0.3121030628681183,
"learning_rate": 0.0005042625437572929,
"loss": 3.4459,
"step": 27450
},
{
"epoch": 8.017492711370263,
"grad_norm": 0.3203401565551758,
"learning_rate": 0.0005040875145857643,
"loss": 3.3559,
"step": 27500
},
{
"epoch": 8.03206997084548,
"grad_norm": 0.333235502243042,
"learning_rate": 0.0005039124854142356,
"loss": 3.3655,
"step": 27550
},
{
"epoch": 8.0466472303207,
"grad_norm": 0.3198469579219818,
"learning_rate": 0.0005037374562427071,
"loss": 3.3627,
"step": 27600
},
{
"epoch": 8.061224489795919,
"grad_norm": 0.325804740190506,
"learning_rate": 0.0005035624270711785,
"loss": 3.3693,
"step": 27650
},
{
"epoch": 8.075801749271138,
"grad_norm": 0.32386887073516846,
"learning_rate": 0.0005033873978996499,
"loss": 3.3771,
"step": 27700
},
{
"epoch": 8.090379008746355,
"grad_norm": 0.34717753529548645,
"learning_rate": 0.0005032123687281213,
"loss": 3.3663,
"step": 27750
},
{
"epoch": 8.104956268221574,
"grad_norm": 0.32944604754447937,
"learning_rate": 0.0005030373395565927,
"loss": 3.373,
"step": 27800
},
{
"epoch": 8.119533527696793,
"grad_norm": 0.3473806083202362,
"learning_rate": 0.0005028623103850641,
"loss": 3.3747,
"step": 27850
},
{
"epoch": 8.13411078717201,
"grad_norm": 0.32745233178138733,
"learning_rate": 0.0005026872812135356,
"loss": 3.3825,
"step": 27900
},
{
"epoch": 8.14868804664723,
"grad_norm": 0.34420016407966614,
"learning_rate": 0.0005025122520420069,
"loss": 3.3733,
"step": 27950
},
{
"epoch": 8.16326530612245,
"grad_norm": 0.35828736424446106,
"learning_rate": 0.0005023372228704784,
"loss": 3.3729,
"step": 28000
},
{
"epoch": 8.16326530612245,
"eval_accuracy": 0.3654113826895875,
"eval_loss": 3.5947518348693848,
"eval_runtime": 183.5384,
"eval_samples_per_second": 90.575,
"eval_steps_per_second": 5.661,
"step": 28000
},
{
"epoch": 8.177842565597668,
"grad_norm": 0.3347868323326111,
"learning_rate": 0.0005021621936989498,
"loss": 3.3855,
"step": 28050
},
{
"epoch": 8.192419825072886,
"grad_norm": 0.3392854928970337,
"learning_rate": 0.0005019871645274213,
"loss": 3.3912,
"step": 28100
},
{
"epoch": 8.206997084548105,
"grad_norm": 0.3293224573135376,
"learning_rate": 0.0005018121353558926,
"loss": 3.3988,
"step": 28150
},
{
"epoch": 8.221574344023324,
"grad_norm": 0.34300434589385986,
"learning_rate": 0.000501637106184364,
"loss": 3.3908,
"step": 28200
},
{
"epoch": 8.236151603498543,
"grad_norm": 0.3153841197490692,
"learning_rate": 0.0005014620770128354,
"loss": 3.4038,
"step": 28250
},
{
"epoch": 8.25072886297376,
"grad_norm": 0.3276742994785309,
"learning_rate": 0.0005012870478413069,
"loss": 3.3991,
"step": 28300
},
{
"epoch": 8.26530612244898,
"grad_norm": 0.338752806186676,
"learning_rate": 0.0005011120186697782,
"loss": 3.403,
"step": 28350
},
{
"epoch": 8.279883381924199,
"grad_norm": 0.33559349179267883,
"learning_rate": 0.0005009369894982496,
"loss": 3.3886,
"step": 28400
},
{
"epoch": 8.294460641399416,
"grad_norm": 0.33547982573509216,
"learning_rate": 0.0005007619603267211,
"loss": 3.406,
"step": 28450
},
{
"epoch": 8.309037900874635,
"grad_norm": 0.38501545786857605,
"learning_rate": 0.0005005869311551925,
"loss": 3.3903,
"step": 28500
},
{
"epoch": 8.323615160349854,
"grad_norm": 0.305745393037796,
"learning_rate": 0.000500411901983664,
"loss": 3.4011,
"step": 28550
},
{
"epoch": 8.338192419825074,
"grad_norm": 0.3493766188621521,
"learning_rate": 0.0005002368728121353,
"loss": 3.4158,
"step": 28600
},
{
"epoch": 8.352769679300291,
"grad_norm": 0.3287374675273895,
"learning_rate": 0.0005000618436406068,
"loss": 3.4062,
"step": 28650
},
{
"epoch": 8.36734693877551,
"grad_norm": 0.3492163419723511,
"learning_rate": 0.0004998868144690782,
"loss": 3.4161,
"step": 28700
},
{
"epoch": 8.38192419825073,
"grad_norm": 0.3114756941795349,
"learning_rate": 0.0004997117852975495,
"loss": 3.3955,
"step": 28750
},
{
"epoch": 8.396501457725947,
"grad_norm": 0.3263934254646301,
"learning_rate": 0.0004995367561260209,
"loss": 3.3945,
"step": 28800
},
{
"epoch": 8.411078717201166,
"grad_norm": 0.3255505859851837,
"learning_rate": 0.0004993617269544924,
"loss": 3.4063,
"step": 28850
},
{
"epoch": 8.425655976676385,
"grad_norm": 0.3421190083026886,
"learning_rate": 0.0004991866977829638,
"loss": 3.4224,
"step": 28900
},
{
"epoch": 8.440233236151604,
"grad_norm": 0.34055987000465393,
"learning_rate": 0.0004990116686114351,
"loss": 3.4091,
"step": 28950
},
{
"epoch": 8.454810495626822,
"grad_norm": 0.30603712797164917,
"learning_rate": 0.0004988366394399066,
"loss": 3.4161,
"step": 29000
},
{
"epoch": 8.454810495626822,
"eval_accuracy": 0.36562303999216467,
"eval_loss": 3.589662790298462,
"eval_runtime": 183.1607,
"eval_samples_per_second": 90.762,
"eval_steps_per_second": 5.673,
"step": 29000
},
{
"epoch": 8.46938775510204,
"grad_norm": 0.32788148522377014,
"learning_rate": 0.000498661610268378,
"loss": 3.4013,
"step": 29050
},
{
"epoch": 8.48396501457726,
"grad_norm": 0.3365269899368286,
"learning_rate": 0.0004984865810968494,
"loss": 3.4032,
"step": 29100
},
{
"epoch": 8.498542274052479,
"grad_norm": 0.3457231819629669,
"learning_rate": 0.0004983115519253209,
"loss": 3.42,
"step": 29150
},
{
"epoch": 8.513119533527696,
"grad_norm": 0.32088690996170044,
"learning_rate": 0.0004981365227537922,
"loss": 3.4214,
"step": 29200
},
{
"epoch": 8.527696793002915,
"grad_norm": 0.3353572189807892,
"learning_rate": 0.0004979614935822637,
"loss": 3.43,
"step": 29250
},
{
"epoch": 8.542274052478135,
"grad_norm": 0.3345482349395752,
"learning_rate": 0.0004977864644107351,
"loss": 3.4276,
"step": 29300
},
{
"epoch": 8.556851311953352,
"grad_norm": 0.3418853282928467,
"learning_rate": 0.0004976114352392066,
"loss": 3.4262,
"step": 29350
},
{
"epoch": 8.571428571428571,
"grad_norm": 0.3535926938056946,
"learning_rate": 0.0004974364060676779,
"loss": 3.4243,
"step": 29400
},
{
"epoch": 8.58600583090379,
"grad_norm": 0.3353271186351776,
"learning_rate": 0.0004972613768961493,
"loss": 3.4257,
"step": 29450
},
{
"epoch": 8.60058309037901,
"grad_norm": 0.33887091279029846,
"learning_rate": 0.0004970863477246207,
"loss": 3.4173,
"step": 29500
},
{
"epoch": 8.615160349854227,
"grad_norm": 0.3468446433544159,
"learning_rate": 0.0004969113185530922,
"loss": 3.4244,
"step": 29550
},
{
"epoch": 8.629737609329446,
"grad_norm": 0.33011898398399353,
"learning_rate": 0.0004967362893815635,
"loss": 3.4207,
"step": 29600
},
{
"epoch": 8.644314868804665,
"grad_norm": 0.32781869173049927,
"learning_rate": 0.0004965612602100349,
"loss": 3.4287,
"step": 29650
},
{
"epoch": 8.658892128279884,
"grad_norm": 0.3354378342628479,
"learning_rate": 0.0004963862310385064,
"loss": 3.4278,
"step": 29700
},
{
"epoch": 8.673469387755102,
"grad_norm": 0.32178574800491333,
"learning_rate": 0.0004962112018669778,
"loss": 3.4176,
"step": 29750
},
{
"epoch": 8.68804664723032,
"grad_norm": 0.3298463523387909,
"learning_rate": 0.0004960361726954492,
"loss": 3.4381,
"step": 29800
},
{
"epoch": 8.70262390670554,
"grad_norm": 0.3558914065361023,
"learning_rate": 0.0004958611435239206,
"loss": 3.4415,
"step": 29850
},
{
"epoch": 8.717201166180757,
"grad_norm": 0.33775535225868225,
"learning_rate": 0.0004956861143523921,
"loss": 3.4368,
"step": 29900
},
{
"epoch": 8.731778425655977,
"grad_norm": 0.33008068799972534,
"learning_rate": 0.0004955110851808635,
"loss": 3.4347,
"step": 29950
},
{
"epoch": 8.746355685131196,
"grad_norm": 0.3202441930770874,
"learning_rate": 0.0004953360560093348,
"loss": 3.4295,
"step": 30000
},
{
"epoch": 8.746355685131196,
"eval_accuracy": 0.3664582505382076,
"eval_loss": 3.5774459838867188,
"eval_runtime": 183.0331,
"eval_samples_per_second": 90.825,
"eval_steps_per_second": 5.677,
"step": 30000
},
{
"epoch": 8.760932944606415,
"grad_norm": 0.33541247248649597,
"learning_rate": 0.0004951610268378062,
"loss": 3.4232,
"step": 30050
},
{
"epoch": 8.775510204081632,
"grad_norm": 0.3246397376060486,
"learning_rate": 0.0004949859976662777,
"loss": 3.4325,
"step": 30100
},
{
"epoch": 8.790087463556851,
"grad_norm": 0.3221427798271179,
"learning_rate": 0.0004948109684947491,
"loss": 3.4291,
"step": 30150
},
{
"epoch": 8.80466472303207,
"grad_norm": 0.3057732880115509,
"learning_rate": 0.0004946359393232204,
"loss": 3.4421,
"step": 30200
},
{
"epoch": 8.819241982507288,
"grad_norm": 0.32524847984313965,
"learning_rate": 0.0004944609101516919,
"loss": 3.4218,
"step": 30250
},
{
"epoch": 8.833819241982507,
"grad_norm": 0.33912190794944763,
"learning_rate": 0.0004942858809801633,
"loss": 3.4362,
"step": 30300
},
{
"epoch": 8.848396501457726,
"grad_norm": 0.33464741706848145,
"learning_rate": 0.0004941108518086348,
"loss": 3.4409,
"step": 30350
},
{
"epoch": 8.862973760932945,
"grad_norm": 0.3329154849052429,
"learning_rate": 0.0004939358226371061,
"loss": 3.4305,
"step": 30400
},
{
"epoch": 8.877551020408163,
"grad_norm": 0.34702932834625244,
"learning_rate": 0.0004937607934655776,
"loss": 3.4224,
"step": 30450
},
{
"epoch": 8.892128279883382,
"grad_norm": 0.33861494064331055,
"learning_rate": 0.000493585764294049,
"loss": 3.4326,
"step": 30500
},
{
"epoch": 8.906705539358601,
"grad_norm": 0.3515016436576843,
"learning_rate": 0.0004934107351225204,
"loss": 3.443,
"step": 30550
},
{
"epoch": 8.921282798833818,
"grad_norm": 0.32505327463150024,
"learning_rate": 0.0004932357059509919,
"loss": 3.4282,
"step": 30600
},
{
"epoch": 8.935860058309038,
"grad_norm": 0.3279268741607666,
"learning_rate": 0.0004930606767794632,
"loss": 3.4436,
"step": 30650
},
{
"epoch": 8.950437317784257,
"grad_norm": 0.32305216789245605,
"learning_rate": 0.0004928856476079346,
"loss": 3.4385,
"step": 30700
},
{
"epoch": 8.965014577259476,
"grad_norm": 0.32832610607147217,
"learning_rate": 0.000492710618436406,
"loss": 3.4272,
"step": 30750
},
{
"epoch": 8.979591836734693,
"grad_norm": 0.35155633091926575,
"learning_rate": 0.0004925355892648775,
"loss": 3.4509,
"step": 30800
},
{
"epoch": 8.994169096209912,
"grad_norm": 0.32218649983406067,
"learning_rate": 0.0004923605600933488,
"loss": 3.4463,
"step": 30850
},
{
"epoch": 9.008746355685131,
"grad_norm": 0.3552287817001343,
"learning_rate": 0.0004921855309218202,
"loss": 3.3747,
"step": 30900
},
{
"epoch": 9.02332361516035,
"grad_norm": 0.31643298268318176,
"learning_rate": 0.0004920105017502917,
"loss": 3.3176,
"step": 30950
},
{
"epoch": 9.037900874635568,
"grad_norm": 0.3509899079799652,
"learning_rate": 0.000491835472578763,
"loss": 3.3256,
"step": 31000
},
{
"epoch": 9.037900874635568,
"eval_accuracy": 0.3664020989623848,
"eval_loss": 3.5828676223754883,
"eval_runtime": 182.7157,
"eval_samples_per_second": 90.983,
"eval_steps_per_second": 5.686,
"step": 31000
},
{
"epoch": 9.052478134110787,
"grad_norm": 0.34235477447509766,
"learning_rate": 0.0004916604434072345,
"loss": 3.341,
"step": 31050
},
{
"epoch": 9.067055393586006,
"grad_norm": 0.35295766592025757,
"learning_rate": 0.0004914854142357059,
"loss": 3.3398,
"step": 31100
},
{
"epoch": 9.081632653061224,
"grad_norm": 0.3388746380805969,
"learning_rate": 0.0004913103850641774,
"loss": 3.3436,
"step": 31150
},
{
"epoch": 9.096209912536443,
"grad_norm": 0.37326183915138245,
"learning_rate": 0.0004911353558926488,
"loss": 3.3369,
"step": 31200
},
{
"epoch": 9.110787172011662,
"grad_norm": 0.33712223172187805,
"learning_rate": 0.0004909603267211201,
"loss": 3.3502,
"step": 31250
},
{
"epoch": 9.125364431486881,
"grad_norm": 0.32785558700561523,
"learning_rate": 0.0004907852975495915,
"loss": 3.3526,
"step": 31300
},
{
"epoch": 9.139941690962099,
"grad_norm": 0.34080231189727783,
"learning_rate": 0.000490610268378063,
"loss": 3.3697,
"step": 31350
},
{
"epoch": 9.154518950437318,
"grad_norm": 0.3284015357494354,
"learning_rate": 0.0004904352392065344,
"loss": 3.3574,
"step": 31400
},
{
"epoch": 9.169096209912537,
"grad_norm": 0.34510546922683716,
"learning_rate": 0.0004902602100350057,
"loss": 3.3552,
"step": 31450
},
{
"epoch": 9.183673469387756,
"grad_norm": 0.3248676657676697,
"learning_rate": 0.0004900851808634772,
"loss": 3.3581,
"step": 31500
},
{
"epoch": 9.198250728862973,
"grad_norm": 0.3320048749446869,
"learning_rate": 0.0004899101516919486,
"loss": 3.3597,
"step": 31550
},
{
"epoch": 9.212827988338192,
"grad_norm": 0.3347003757953644,
"learning_rate": 0.0004897351225204201,
"loss": 3.3533,
"step": 31600
},
{
"epoch": 9.227405247813412,
"grad_norm": 0.34604883193969727,
"learning_rate": 0.0004895600933488914,
"loss": 3.3575,
"step": 31650
},
{
"epoch": 9.241982507288629,
"grad_norm": 0.33016645908355713,
"learning_rate": 0.0004893850641773629,
"loss": 3.3726,
"step": 31700
},
{
"epoch": 9.256559766763848,
"grad_norm": 0.34527072310447693,
"learning_rate": 0.0004892100350058343,
"loss": 3.3707,
"step": 31750
},
{
"epoch": 9.271137026239067,
"grad_norm": 0.33581438660621643,
"learning_rate": 0.0004890350058343057,
"loss": 3.3656,
"step": 31800
},
{
"epoch": 9.285714285714286,
"grad_norm": 0.33454182744026184,
"learning_rate": 0.0004888599766627771,
"loss": 3.3743,
"step": 31850
},
{
"epoch": 9.300291545189504,
"grad_norm": 0.3457191586494446,
"learning_rate": 0.0004886849474912485,
"loss": 3.3819,
"step": 31900
},
{
"epoch": 9.314868804664723,
"grad_norm": 0.36336174607276917,
"learning_rate": 0.0004885099183197199,
"loss": 3.3725,
"step": 31950
},
{
"epoch": 9.329446064139942,
"grad_norm": 0.3674074113368988,
"learning_rate": 0.0004883348891481913,
"loss": 3.3733,
"step": 32000
},
{
"epoch": 9.329446064139942,
"eval_accuracy": 0.36651228318664075,
"eval_loss": 3.5811729431152344,
"eval_runtime": 182.8386,
"eval_samples_per_second": 90.922,
"eval_steps_per_second": 5.683,
"step": 32000
},
{
"epoch": 9.34402332361516,
"grad_norm": 0.3265478312969208,
"learning_rate": 0.00048815985997666274,
"loss": 3.3797,
"step": 32050
},
{
"epoch": 9.358600583090379,
"grad_norm": 0.36413219571113586,
"learning_rate": 0.00048798483080513417,
"loss": 3.3817,
"step": 32100
},
{
"epoch": 9.373177842565598,
"grad_norm": 0.36710673570632935,
"learning_rate": 0.00048780980163360554,
"loss": 3.3872,
"step": 32150
},
{
"epoch": 9.387755102040817,
"grad_norm": 0.3260069191455841,
"learning_rate": 0.0004876347724620769,
"loss": 3.3867,
"step": 32200
},
{
"epoch": 9.402332361516034,
"grad_norm": 0.34948107600212097,
"learning_rate": 0.0004874597432905484,
"loss": 3.388,
"step": 32250
},
{
"epoch": 9.416909620991254,
"grad_norm": 0.3187805712223053,
"learning_rate": 0.00048728471411901977,
"loss": 3.385,
"step": 32300
},
{
"epoch": 9.431486880466473,
"grad_norm": 0.3865429759025574,
"learning_rate": 0.0004871096849474912,
"loss": 3.3729,
"step": 32350
},
{
"epoch": 9.446064139941692,
"grad_norm": 0.3081527352333069,
"learning_rate": 0.0004869346557759626,
"loss": 3.3893,
"step": 32400
},
{
"epoch": 9.46064139941691,
"grad_norm": 0.33219993114471436,
"learning_rate": 0.00048675962660443405,
"loss": 3.3854,
"step": 32450
},
{
"epoch": 9.475218658892128,
"grad_norm": 0.3221242427825928,
"learning_rate": 0.0004865845974329054,
"loss": 3.4055,
"step": 32500
},
{
"epoch": 9.489795918367347,
"grad_norm": 0.33291497826576233,
"learning_rate": 0.00048640956826137685,
"loss": 3.3888,
"step": 32550
},
{
"epoch": 9.504373177842565,
"grad_norm": 0.35030466318130493,
"learning_rate": 0.0004862345390898483,
"loss": 3.4062,
"step": 32600
},
{
"epoch": 9.518950437317784,
"grad_norm": 0.33631065487861633,
"learning_rate": 0.0004860595099183197,
"loss": 3.3959,
"step": 32650
},
{
"epoch": 9.533527696793003,
"grad_norm": 0.3579002618789673,
"learning_rate": 0.0004858844807467911,
"loss": 3.3959,
"step": 32700
},
{
"epoch": 9.548104956268222,
"grad_norm": 0.3267410099506378,
"learning_rate": 0.00048570945157526256,
"loss": 3.4049,
"step": 32750
},
{
"epoch": 9.56268221574344,
"grad_norm": 0.329932302236557,
"learning_rate": 0.00048553442240373393,
"loss": 3.3952,
"step": 32800
},
{
"epoch": 9.577259475218659,
"grad_norm": 0.33361557126045227,
"learning_rate": 0.0004853593932322053,
"loss": 3.391,
"step": 32850
},
{
"epoch": 9.591836734693878,
"grad_norm": 0.37531715631484985,
"learning_rate": 0.00048518436406067673,
"loss": 3.4024,
"step": 32900
},
{
"epoch": 9.606413994169095,
"grad_norm": 0.3536641597747803,
"learning_rate": 0.00048500933488914816,
"loss": 3.4074,
"step": 32950
},
{
"epoch": 9.620991253644315,
"grad_norm": 0.33492809534072876,
"learning_rate": 0.0004848343057176196,
"loss": 3.4062,
"step": 33000
},
{
"epoch": 9.620991253644315,
"eval_accuracy": 0.36730181907562026,
"eval_loss": 3.5756776332855225,
"eval_runtime": 182.9487,
"eval_samples_per_second": 90.867,
"eval_steps_per_second": 5.679,
"step": 33000
},
{
"epoch": 9.635568513119534,
"grad_norm": 0.34546518325805664,
"learning_rate": 0.00048465927654609095,
"loss": 3.3993,
"step": 33050
},
{
"epoch": 9.650145772594753,
"grad_norm": 0.3619083762168884,
"learning_rate": 0.00048448424737456244,
"loss": 3.4156,
"step": 33100
},
{
"epoch": 9.66472303206997,
"grad_norm": 0.331717848777771,
"learning_rate": 0.0004843092182030338,
"loss": 3.4023,
"step": 33150
},
{
"epoch": 9.67930029154519,
"grad_norm": 0.3297898471355438,
"learning_rate": 0.0004841341890315052,
"loss": 3.4045,
"step": 33200
},
{
"epoch": 9.693877551020408,
"grad_norm": 0.30799806118011475,
"learning_rate": 0.0004839591598599766,
"loss": 3.3953,
"step": 33250
},
{
"epoch": 9.708454810495628,
"grad_norm": 0.34676632285118103,
"learning_rate": 0.00048378413068844803,
"loss": 3.4142,
"step": 33300
},
{
"epoch": 9.723032069970845,
"grad_norm": 0.32712289690971375,
"learning_rate": 0.00048360910151691946,
"loss": 3.411,
"step": 33350
},
{
"epoch": 9.737609329446064,
"grad_norm": 0.36372727155685425,
"learning_rate": 0.00048343407234539083,
"loss": 3.3969,
"step": 33400
},
{
"epoch": 9.752186588921283,
"grad_norm": 0.3348037898540497,
"learning_rate": 0.00048325904317386226,
"loss": 3.4057,
"step": 33450
},
{
"epoch": 9.7667638483965,
"grad_norm": 0.3365459442138672,
"learning_rate": 0.0004830840140023337,
"loss": 3.4077,
"step": 33500
},
{
"epoch": 9.78134110787172,
"grad_norm": 0.3372909426689148,
"learning_rate": 0.0004829089848308051,
"loss": 3.4113,
"step": 33550
},
{
"epoch": 9.795918367346939,
"grad_norm": 0.34129634499549866,
"learning_rate": 0.0004827339556592765,
"loss": 3.4069,
"step": 33600
},
{
"epoch": 9.810495626822158,
"grad_norm": 0.3516503870487213,
"learning_rate": 0.00048255892648774797,
"loss": 3.4067,
"step": 33650
},
{
"epoch": 9.825072886297376,
"grad_norm": 0.3345436453819275,
"learning_rate": 0.00048238389731621934,
"loss": 3.4148,
"step": 33700
},
{
"epoch": 9.839650145772595,
"grad_norm": 0.32488852739334106,
"learning_rate": 0.0004822088681446907,
"loss": 3.4156,
"step": 33750
},
{
"epoch": 9.854227405247814,
"grad_norm": 0.32768747210502625,
"learning_rate": 0.00048203383897316214,
"loss": 3.4029,
"step": 33800
},
{
"epoch": 9.868804664723033,
"grad_norm": 0.3539787828922272,
"learning_rate": 0.00048185880980163357,
"loss": 3.409,
"step": 33850
},
{
"epoch": 9.88338192419825,
"grad_norm": 0.3092935085296631,
"learning_rate": 0.000481683780630105,
"loss": 3.4199,
"step": 33900
},
{
"epoch": 9.89795918367347,
"grad_norm": 0.34841668605804443,
"learning_rate": 0.00048150875145857637,
"loss": 3.4128,
"step": 33950
},
{
"epoch": 9.912536443148689,
"grad_norm": 0.3587806522846222,
"learning_rate": 0.00048133372228704785,
"loss": 3.4021,
"step": 34000
},
{
"epoch": 9.912536443148689,
"eval_accuracy": 0.3678635702702245,
"eval_loss": 3.563772678375244,
"eval_runtime": 181.4798,
"eval_samples_per_second": 91.602,
"eval_steps_per_second": 5.725,
"step": 34000
},
{
"epoch": 9.927113702623906,
"grad_norm": 0.32694658637046814,
"learning_rate": 0.0004811586931155192,
"loss": 3.4201,
"step": 34050
},
{
"epoch": 9.941690962099125,
"grad_norm": 0.33355414867401123,
"learning_rate": 0.0004809836639439906,
"loss": 3.4142,
"step": 34100
},
{
"epoch": 9.956268221574344,
"grad_norm": 0.3344906270503998,
"learning_rate": 0.000480808634772462,
"loss": 3.4114,
"step": 34150
},
{
"epoch": 9.970845481049563,
"grad_norm": 0.31297606229782104,
"learning_rate": 0.00048063360560093345,
"loss": 3.4114,
"step": 34200
},
{
"epoch": 9.98542274052478,
"grad_norm": 0.34211552143096924,
"learning_rate": 0.00048045857642940487,
"loss": 3.4133,
"step": 34250
},
{
"epoch": 10.0,
"grad_norm": 0.347125768661499,
"learning_rate": 0.00048028354725787625,
"loss": 3.4196,
"step": 34300
},
{
"epoch": 10.01457725947522,
"grad_norm": 0.3528343737125397,
"learning_rate": 0.0004801085180863477,
"loss": 3.2922,
"step": 34350
},
{
"epoch": 10.029154518950437,
"grad_norm": 0.3449252247810364,
"learning_rate": 0.0004799334889148191,
"loss": 3.3206,
"step": 34400
},
{
"epoch": 10.043731778425656,
"grad_norm": 0.3424980640411377,
"learning_rate": 0.0004797584597432905,
"loss": 3.3048,
"step": 34450
},
{
"epoch": 10.058309037900875,
"grad_norm": 0.348306804895401,
"learning_rate": 0.0004795834305717619,
"loss": 3.3034,
"step": 34500
},
{
"epoch": 10.072886297376094,
"grad_norm": 0.3183503746986389,
"learning_rate": 0.0004794084014002334,
"loss": 3.3098,
"step": 34550
},
{
"epoch": 10.087463556851311,
"grad_norm": 0.34545329213142395,
"learning_rate": 0.00047923337222870475,
"loss": 3.3213,
"step": 34600
},
{
"epoch": 10.10204081632653,
"grad_norm": 0.3453138470649719,
"learning_rate": 0.0004790583430571761,
"loss": 3.335,
"step": 34650
},
{
"epoch": 10.11661807580175,
"grad_norm": 0.32295459508895874,
"learning_rate": 0.00047888331388564755,
"loss": 3.329,
"step": 34700
},
{
"epoch": 10.131195335276967,
"grad_norm": 0.32609206438064575,
"learning_rate": 0.000478708284714119,
"loss": 3.3368,
"step": 34750
},
{
"epoch": 10.145772594752186,
"grad_norm": 0.32333531975746155,
"learning_rate": 0.0004785332555425904,
"loss": 3.3365,
"step": 34800
},
{
"epoch": 10.160349854227405,
"grad_norm": 0.31967732310295105,
"learning_rate": 0.0004783582263710618,
"loss": 3.3437,
"step": 34850
},
{
"epoch": 10.174927113702624,
"grad_norm": 0.3523292541503906,
"learning_rate": 0.00047818319719953326,
"loss": 3.3288,
"step": 34900
},
{
"epoch": 10.189504373177842,
"grad_norm": 0.3372192084789276,
"learning_rate": 0.00047800816802800463,
"loss": 3.3424,
"step": 34950
},
{
"epoch": 10.204081632653061,
"grad_norm": 0.3291529715061188,
"learning_rate": 0.000477833138856476,
"loss": 3.3341,
"step": 35000
},
{
"epoch": 10.204081632653061,
"eval_accuracy": 0.36714042743945047,
"eval_loss": 3.5801405906677246,
"eval_runtime": 188.68,
"eval_samples_per_second": 88.107,
"eval_steps_per_second": 5.507,
"step": 35000
},
{
"epoch": 10.21865889212828,
"grad_norm": 0.3474276661872864,
"learning_rate": 0.00047765810968494743,
"loss": 3.3483,
"step": 35050
},
{
"epoch": 10.2332361516035,
"grad_norm": 0.330358624458313,
"learning_rate": 0.00047748308051341886,
"loss": 3.3368,
"step": 35100
},
{
"epoch": 10.247813411078717,
"grad_norm": 0.38131415843963623,
"learning_rate": 0.0004773080513418903,
"loss": 3.3401,
"step": 35150
},
{
"epoch": 10.262390670553936,
"grad_norm": 0.33875641226768494,
"learning_rate": 0.00047713302217036166,
"loss": 3.3439,
"step": 35200
},
{
"epoch": 10.276967930029155,
"grad_norm": 0.34253689646720886,
"learning_rate": 0.00047695799299883314,
"loss": 3.3633,
"step": 35250
},
{
"epoch": 10.291545189504372,
"grad_norm": 0.3439371585845947,
"learning_rate": 0.0004767829638273045,
"loss": 3.3574,
"step": 35300
},
{
"epoch": 10.306122448979592,
"grad_norm": 0.33233463764190674,
"learning_rate": 0.00047660793465577594,
"loss": 3.3527,
"step": 35350
},
{
"epoch": 10.32069970845481,
"grad_norm": 0.3227967321872711,
"learning_rate": 0.0004764329054842473,
"loss": 3.3656,
"step": 35400
},
{
"epoch": 10.33527696793003,
"grad_norm": 0.3533599078655243,
"learning_rate": 0.0004762578763127188,
"loss": 3.3613,
"step": 35450
},
{
"epoch": 10.349854227405247,
"grad_norm": 0.36060336232185364,
"learning_rate": 0.00047608284714119016,
"loss": 3.3651,
"step": 35500
},
{
"epoch": 10.364431486880466,
"grad_norm": 0.33997446298599243,
"learning_rate": 0.00047590781796966154,
"loss": 3.3645,
"step": 35550
},
{
"epoch": 10.379008746355685,
"grad_norm": 0.3344101309776306,
"learning_rate": 0.000475732788798133,
"loss": 3.3598,
"step": 35600
},
{
"epoch": 10.393586005830905,
"grad_norm": 0.38466399908065796,
"learning_rate": 0.0004755577596266044,
"loss": 3.3571,
"step": 35650
},
{
"epoch": 10.408163265306122,
"grad_norm": 0.3226321041584015,
"learning_rate": 0.0004753827304550758,
"loss": 3.3667,
"step": 35700
},
{
"epoch": 10.422740524781341,
"grad_norm": 0.3315838575363159,
"learning_rate": 0.0004752077012835472,
"loss": 3.3521,
"step": 35750
},
{
"epoch": 10.43731778425656,
"grad_norm": 0.31704387068748474,
"learning_rate": 0.00047503267211201867,
"loss": 3.3738,
"step": 35800
},
{
"epoch": 10.451895043731778,
"grad_norm": 0.3453398048877716,
"learning_rate": 0.00047485764294049004,
"loss": 3.3588,
"step": 35850
},
{
"epoch": 10.466472303206997,
"grad_norm": 0.3315698802471161,
"learning_rate": 0.0004746826137689614,
"loss": 3.375,
"step": 35900
},
{
"epoch": 10.481049562682216,
"grad_norm": 0.31997567415237427,
"learning_rate": 0.00047450758459743284,
"loss": 3.3575,
"step": 35950
},
{
"epoch": 10.495626822157435,
"grad_norm": 0.3380381166934967,
"learning_rate": 0.00047433255542590427,
"loss": 3.3611,
"step": 36000
},
{
"epoch": 10.495626822157435,
"eval_accuracy": 0.36805368514434134,
"eval_loss": 3.5723817348480225,
"eval_runtime": 181.6702,
"eval_samples_per_second": 91.506,
"eval_steps_per_second": 5.719,
"step": 36000
},
{
"epoch": 10.510204081632653,
"grad_norm": 0.3401227295398712,
"learning_rate": 0.0004741575262543757,
"loss": 3.3722,
"step": 36050
},
{
"epoch": 10.524781341107872,
"grad_norm": 0.3410283625125885,
"learning_rate": 0.00047398249708284707,
"loss": 3.3738,
"step": 36100
},
{
"epoch": 10.53935860058309,
"grad_norm": 0.3501899540424347,
"learning_rate": 0.00047380746791131855,
"loss": 3.3763,
"step": 36150
},
{
"epoch": 10.55393586005831,
"grad_norm": 0.33715593814849854,
"learning_rate": 0.0004736324387397899,
"loss": 3.3773,
"step": 36200
},
{
"epoch": 10.568513119533527,
"grad_norm": 0.33864983916282654,
"learning_rate": 0.00047345740956826135,
"loss": 3.3685,
"step": 36250
},
{
"epoch": 10.583090379008746,
"grad_norm": 0.3323419690132141,
"learning_rate": 0.0004732823803967327,
"loss": 3.3754,
"step": 36300
},
{
"epoch": 10.597667638483966,
"grad_norm": 0.33451366424560547,
"learning_rate": 0.0004731073512252042,
"loss": 3.3812,
"step": 36350
},
{
"epoch": 10.612244897959183,
"grad_norm": 0.33620506525039673,
"learning_rate": 0.0004729323220536756,
"loss": 3.3679,
"step": 36400
},
{
"epoch": 10.626822157434402,
"grad_norm": 0.35062161087989807,
"learning_rate": 0.00047275729288214695,
"loss": 3.3713,
"step": 36450
},
{
"epoch": 10.641399416909621,
"grad_norm": 0.34864264726638794,
"learning_rate": 0.00047258226371061843,
"loss": 3.3778,
"step": 36500
},
{
"epoch": 10.65597667638484,
"grad_norm": 0.3500450849533081,
"learning_rate": 0.0004724072345390898,
"loss": 3.385,
"step": 36550
},
{
"epoch": 10.670553935860058,
"grad_norm": 0.3300856947898865,
"learning_rate": 0.00047223220536756123,
"loss": 3.3774,
"step": 36600
},
{
"epoch": 10.685131195335277,
"grad_norm": 0.3436342179775238,
"learning_rate": 0.0004720571761960326,
"loss": 3.3858,
"step": 36650
},
{
"epoch": 10.699708454810496,
"grad_norm": 0.3223649561405182,
"learning_rate": 0.0004718821470245041,
"loss": 3.39,
"step": 36700
},
{
"epoch": 10.714285714285714,
"grad_norm": 0.3425664007663727,
"learning_rate": 0.00047170711785297545,
"loss": 3.3773,
"step": 36750
},
{
"epoch": 10.728862973760933,
"grad_norm": 0.35593581199645996,
"learning_rate": 0.00047153208868144683,
"loss": 3.3936,
"step": 36800
},
{
"epoch": 10.743440233236152,
"grad_norm": 0.3750920295715332,
"learning_rate": 0.0004713570595099183,
"loss": 3.3842,
"step": 36850
},
{
"epoch": 10.758017492711371,
"grad_norm": 0.3294471800327301,
"learning_rate": 0.0004711820303383897,
"loss": 3.3889,
"step": 36900
},
{
"epoch": 10.772594752186588,
"grad_norm": 0.3309391140937805,
"learning_rate": 0.0004710070011668611,
"loss": 3.4021,
"step": 36950
},
{
"epoch": 10.787172011661808,
"grad_norm": 0.37308335304260254,
"learning_rate": 0.0004708319719953325,
"loss": 3.3845,
"step": 37000
},
{
"epoch": 10.787172011661808,
"eval_accuracy": 0.36864180521312645,
"eval_loss": 3.5626156330108643,
"eval_runtime": 182.1455,
"eval_samples_per_second": 91.268,
"eval_steps_per_second": 5.704,
"step": 37000
},
{
"epoch": 10.801749271137027,
"grad_norm": 0.34260591864585876,
"learning_rate": 0.00047065694282380396,
"loss": 3.3928,
"step": 37050
},
{
"epoch": 10.816326530612244,
"grad_norm": 0.3432435691356659,
"learning_rate": 0.00047048191365227533,
"loss": 3.3851,
"step": 37100
},
{
"epoch": 10.830903790087463,
"grad_norm": 0.3411959111690521,
"learning_rate": 0.00047030688448074676,
"loss": 3.3789,
"step": 37150
},
{
"epoch": 10.845481049562682,
"grad_norm": 0.3398871123790741,
"learning_rate": 0.00047013185530921813,
"loss": 3.3843,
"step": 37200
},
{
"epoch": 10.860058309037901,
"grad_norm": 0.33319368958473206,
"learning_rate": 0.0004699568261376896,
"loss": 3.3983,
"step": 37250
},
{
"epoch": 10.874635568513119,
"grad_norm": 0.48678529262542725,
"learning_rate": 0.000469781796966161,
"loss": 3.3963,
"step": 37300
},
{
"epoch": 10.889212827988338,
"grad_norm": 0.33169659972190857,
"learning_rate": 0.00046960676779463236,
"loss": 3.394,
"step": 37350
},
{
"epoch": 10.903790087463557,
"grad_norm": 0.3148620128631592,
"learning_rate": 0.00046943173862310384,
"loss": 3.3961,
"step": 37400
},
{
"epoch": 10.918367346938776,
"grad_norm": 0.3453501760959625,
"learning_rate": 0.0004692567094515752,
"loss": 3.4005,
"step": 37450
},
{
"epoch": 10.932944606413994,
"grad_norm": 0.36119741201400757,
"learning_rate": 0.00046908168028004664,
"loss": 3.3911,
"step": 37500
},
{
"epoch": 10.947521865889213,
"grad_norm": 0.3385680019855499,
"learning_rate": 0.000468906651108518,
"loss": 3.3792,
"step": 37550
},
{
"epoch": 10.962099125364432,
"grad_norm": 0.30895158648490906,
"learning_rate": 0.0004687316219369895,
"loss": 3.3978,
"step": 37600
},
{
"epoch": 10.97667638483965,
"grad_norm": 0.3564043939113617,
"learning_rate": 0.00046855659276546087,
"loss": 3.388,
"step": 37650
},
{
"epoch": 10.991253644314869,
"grad_norm": 0.3299904465675354,
"learning_rate": 0.00046838156359393224,
"loss": 3.3916,
"step": 37700
},
{
"epoch": 11.005830903790088,
"grad_norm": 0.3511546850204468,
"learning_rate": 0.0004682065344224037,
"loss": 3.3494,
"step": 37750
},
{
"epoch": 11.020408163265307,
"grad_norm": 0.37850990891456604,
"learning_rate": 0.0004680315052508751,
"loss": 3.289,
"step": 37800
},
{
"epoch": 11.034985422740524,
"grad_norm": 0.3183746933937073,
"learning_rate": 0.0004678564760793465,
"loss": 3.2999,
"step": 37850
},
{
"epoch": 11.049562682215743,
"grad_norm": 0.3424279987812042,
"learning_rate": 0.0004676814469078179,
"loss": 3.3011,
"step": 37900
},
{
"epoch": 11.064139941690962,
"grad_norm": 0.34383082389831543,
"learning_rate": 0.00046750641773628937,
"loss": 3.2857,
"step": 37950
},
{
"epoch": 11.078717201166182,
"grad_norm": 0.34979602694511414,
"learning_rate": 0.00046733138856476075,
"loss": 3.2814,
"step": 38000
},
{
"epoch": 11.078717201166182,
"eval_accuracy": 0.36815092036788344,
"eval_loss": 3.574191093444824,
"eval_runtime": 182.0074,
"eval_samples_per_second": 91.337,
"eval_steps_per_second": 5.709,
"step": 38000
},
{
"epoch": 11.093294460641399,
"grad_norm": 0.3627917170524597,
"learning_rate": 0.00046715635939323217,
"loss": 3.2924,
"step": 38050
},
{
"epoch": 11.107871720116618,
"grad_norm": 0.36935657262802124,
"learning_rate": 0.0004669813302217036,
"loss": 3.3067,
"step": 38100
},
{
"epoch": 11.122448979591837,
"grad_norm": 0.3449645936489105,
"learning_rate": 0.000466806301050175,
"loss": 3.3012,
"step": 38150
},
{
"epoch": 11.137026239067055,
"grad_norm": 0.3319040834903717,
"learning_rate": 0.0004666312718786464,
"loss": 3.3077,
"step": 38200
},
{
"epoch": 11.151603498542274,
"grad_norm": 0.3427700102329254,
"learning_rate": 0.00046645624270711777,
"loss": 3.3209,
"step": 38250
},
{
"epoch": 11.166180758017493,
"grad_norm": 0.3540455996990204,
"learning_rate": 0.00046628121353558925,
"loss": 3.3128,
"step": 38300
},
{
"epoch": 11.180758017492712,
"grad_norm": 0.34144529700279236,
"learning_rate": 0.0004661061843640606,
"loss": 3.3151,
"step": 38350
},
{
"epoch": 11.19533527696793,
"grad_norm": 0.3432770371437073,
"learning_rate": 0.00046593115519253205,
"loss": 3.3182,
"step": 38400
},
{
"epoch": 11.209912536443149,
"grad_norm": 0.36312398314476013,
"learning_rate": 0.0004657561260210034,
"loss": 3.3341,
"step": 38450
},
{
"epoch": 11.224489795918368,
"grad_norm": 0.3511596620082855,
"learning_rate": 0.0004655810968494749,
"loss": 3.3246,
"step": 38500
},
{
"epoch": 11.239067055393585,
"grad_norm": 0.3314606249332428,
"learning_rate": 0.0004654060676779463,
"loss": 3.3157,
"step": 38550
},
{
"epoch": 11.253644314868804,
"grad_norm": 0.32473742961883545,
"learning_rate": 0.00046523103850641765,
"loss": 3.3226,
"step": 38600
},
{
"epoch": 11.268221574344023,
"grad_norm": 0.3840942084789276,
"learning_rate": 0.00046505600933488913,
"loss": 3.3321,
"step": 38650
},
{
"epoch": 11.282798833819243,
"grad_norm": 0.33107447624206543,
"learning_rate": 0.0004648809801633605,
"loss": 3.3308,
"step": 38700
},
{
"epoch": 11.29737609329446,
"grad_norm": 0.37443867325782776,
"learning_rate": 0.00046470595099183193,
"loss": 3.3342,
"step": 38750
},
{
"epoch": 11.31195335276968,
"grad_norm": 0.3259379267692566,
"learning_rate": 0.0004645309218203033,
"loss": 3.3435,
"step": 38800
},
{
"epoch": 11.326530612244898,
"grad_norm": 0.34641292691230774,
"learning_rate": 0.0004643558926487748,
"loss": 3.3338,
"step": 38850
},
{
"epoch": 11.341107871720117,
"grad_norm": 0.36578723788261414,
"learning_rate": 0.00046418086347724616,
"loss": 3.3361,
"step": 38900
},
{
"epoch": 11.355685131195335,
"grad_norm": 0.37702852487564087,
"learning_rate": 0.0004640058343057176,
"loss": 3.3376,
"step": 38950
},
{
"epoch": 11.370262390670554,
"grad_norm": 0.3479772210121155,
"learning_rate": 0.000463830805134189,
"loss": 3.3361,
"step": 39000
},
{
"epoch": 11.370262390670554,
"eval_accuracy": 0.36859189270128395,
"eval_loss": 3.5683250427246094,
"eval_runtime": 182.349,
"eval_samples_per_second": 91.166,
"eval_steps_per_second": 5.698,
"step": 39000
},
{
"epoch": 11.384839650145773,
"grad_norm": 0.35327914357185364,
"learning_rate": 0.00046365577596266044,
"loss": 3.3319,
"step": 39050
},
{
"epoch": 11.39941690962099,
"grad_norm": 0.3379618227481842,
"learning_rate": 0.0004634807467911318,
"loss": 3.3439,
"step": 39100
},
{
"epoch": 11.41399416909621,
"grad_norm": 0.34011024236679077,
"learning_rate": 0.0004633057176196032,
"loss": 3.3306,
"step": 39150
},
{
"epoch": 11.428571428571429,
"grad_norm": 0.3477155566215515,
"learning_rate": 0.00046313068844807466,
"loss": 3.3477,
"step": 39200
},
{
"epoch": 11.443148688046648,
"grad_norm": 0.3348955512046814,
"learning_rate": 0.00046295565927654604,
"loss": 3.3553,
"step": 39250
},
{
"epoch": 11.457725947521865,
"grad_norm": 0.3500663936138153,
"learning_rate": 0.00046278063010501746,
"loss": 3.3497,
"step": 39300
},
{
"epoch": 11.472303206997085,
"grad_norm": 0.3484138250350952,
"learning_rate": 0.0004626056009334889,
"loss": 3.3546,
"step": 39350
},
{
"epoch": 11.486880466472304,
"grad_norm": 0.3597988784313202,
"learning_rate": 0.0004624305717619603,
"loss": 3.3544,
"step": 39400
},
{
"epoch": 11.501457725947521,
"grad_norm": 0.33883535861968994,
"learning_rate": 0.0004622555425904317,
"loss": 3.3627,
"step": 39450
},
{
"epoch": 11.51603498542274,
"grad_norm": 0.3259831964969635,
"learning_rate": 0.00046208051341890306,
"loss": 3.3501,
"step": 39500
},
{
"epoch": 11.53061224489796,
"grad_norm": 0.3629702925682068,
"learning_rate": 0.00046190548424737454,
"loss": 3.3493,
"step": 39550
},
{
"epoch": 11.545189504373178,
"grad_norm": 0.3257094919681549,
"learning_rate": 0.0004617304550758459,
"loss": 3.3555,
"step": 39600
},
{
"epoch": 11.559766763848396,
"grad_norm": 0.3216487169265747,
"learning_rate": 0.00046155542590431734,
"loss": 3.3588,
"step": 39650
},
{
"epoch": 11.574344023323615,
"grad_norm": 0.3487645089626312,
"learning_rate": 0.0004613803967327887,
"loss": 3.3672,
"step": 39700
},
{
"epoch": 11.588921282798834,
"grad_norm": 0.3463019132614136,
"learning_rate": 0.0004612053675612602,
"loss": 3.3684,
"step": 39750
},
{
"epoch": 11.603498542274053,
"grad_norm": 0.36498206853866577,
"learning_rate": 0.00046103033838973157,
"loss": 3.3629,
"step": 39800
},
{
"epoch": 11.61807580174927,
"grad_norm": 0.3331190347671509,
"learning_rate": 0.000460855309218203,
"loss": 3.363,
"step": 39850
},
{
"epoch": 11.63265306122449,
"grad_norm": 0.36024343967437744,
"learning_rate": 0.0004606802800466744,
"loss": 3.3487,
"step": 39900
},
{
"epoch": 11.647230320699709,
"grad_norm": 0.3634866774082184,
"learning_rate": 0.00046050525087514585,
"loss": 3.3566,
"step": 39950
},
{
"epoch": 11.661807580174926,
"grad_norm": 0.34901607036590576,
"learning_rate": 0.0004603302217036172,
"loss": 3.3702,
"step": 40000
},
{
"epoch": 11.661807580174926,
"eval_accuracy": 0.36894645988446667,
"eval_loss": 3.562626361846924,
"eval_runtime": 182.1151,
"eval_samples_per_second": 91.283,
"eval_steps_per_second": 5.705,
"step": 40000
},
{
"epoch": 11.676384839650146,
"grad_norm": 0.33021676540374756,
"learning_rate": 0.0004601551925320886,
"loss": 3.3612,
"step": 40050
},
{
"epoch": 11.690962099125365,
"grad_norm": 0.33711355924606323,
"learning_rate": 0.0004599801633605601,
"loss": 3.3578,
"step": 40100
},
{
"epoch": 11.705539358600584,
"grad_norm": 0.3925802409648895,
"learning_rate": 0.00045980513418903145,
"loss": 3.3587,
"step": 40150
},
{
"epoch": 11.720116618075801,
"grad_norm": 0.3565693199634552,
"learning_rate": 0.0004596301050175029,
"loss": 3.366,
"step": 40200
},
{
"epoch": 11.73469387755102,
"grad_norm": 0.33789220452308655,
"learning_rate": 0.0004594550758459743,
"loss": 3.3658,
"step": 40250
},
{
"epoch": 11.74927113702624,
"grad_norm": 0.326738178730011,
"learning_rate": 0.00045928004667444573,
"loss": 3.3584,
"step": 40300
},
{
"epoch": 11.763848396501459,
"grad_norm": 0.34021317958831787,
"learning_rate": 0.0004591050175029171,
"loss": 3.373,
"step": 40350
},
{
"epoch": 11.778425655976676,
"grad_norm": 0.3672606348991394,
"learning_rate": 0.0004589299883313885,
"loss": 3.3564,
"step": 40400
},
{
"epoch": 11.793002915451895,
"grad_norm": 0.34738481044769287,
"learning_rate": 0.00045875495915985995,
"loss": 3.3651,
"step": 40450
},
{
"epoch": 11.807580174927114,
"grad_norm": 0.33744242787361145,
"learning_rate": 0.00045857992998833133,
"loss": 3.3682,
"step": 40500
},
{
"epoch": 11.822157434402332,
"grad_norm": 0.35602742433547974,
"learning_rate": 0.00045840490081680275,
"loss": 3.3661,
"step": 40550
},
{
"epoch": 11.83673469387755,
"grad_norm": 0.3444158434867859,
"learning_rate": 0.0004582298716452742,
"loss": 3.364,
"step": 40600
},
{
"epoch": 11.85131195335277,
"grad_norm": 0.342560738325119,
"learning_rate": 0.0004580548424737456,
"loss": 3.3817,
"step": 40650
},
{
"epoch": 11.86588921282799,
"grad_norm": 0.364653617143631,
"learning_rate": 0.000457879813302217,
"loss": 3.3652,
"step": 40700
},
{
"epoch": 11.880466472303207,
"grad_norm": 0.34066253900527954,
"learning_rate": 0.0004577047841306884,
"loss": 3.3719,
"step": 40750
},
{
"epoch": 11.895043731778426,
"grad_norm": 0.3448898196220398,
"learning_rate": 0.00045752975495915983,
"loss": 3.3707,
"step": 40800
},
{
"epoch": 11.909620991253645,
"grad_norm": 0.31567615270614624,
"learning_rate": 0.00045735472578763126,
"loss": 3.3814,
"step": 40850
},
{
"epoch": 11.924198250728862,
"grad_norm": 0.3457653820514679,
"learning_rate": 0.00045717969661610263,
"loss": 3.3668,
"step": 40900
},
{
"epoch": 11.938775510204081,
"grad_norm": 0.34771329164505005,
"learning_rate": 0.000457004667444574,
"loss": 3.377,
"step": 40950
},
{
"epoch": 11.9533527696793,
"grad_norm": 0.3419136106967926,
"learning_rate": 0.0004568296382730455,
"loss": 3.3664,
"step": 41000
},
{
"epoch": 11.9533527696793,
"eval_accuracy": 0.36959120240182775,
"eval_loss": 3.554091691970825,
"eval_runtime": 181.934,
"eval_samples_per_second": 91.374,
"eval_steps_per_second": 5.711,
"step": 41000
},
{
"epoch": 11.96793002915452,
"grad_norm": 0.3192198872566223,
"learning_rate": 0.00045665460910151686,
"loss": 3.3775,
"step": 41050
},
{
"epoch": 11.982507288629737,
"grad_norm": 0.36953675746917725,
"learning_rate": 0.0004564795799299883,
"loss": 3.3717,
"step": 41100
},
{
"epoch": 11.997084548104956,
"grad_norm": 0.3306615948677063,
"learning_rate": 0.0004563045507584597,
"loss": 3.377,
"step": 41150
},
{
"epoch": 12.011661807580175,
"grad_norm": 0.3414677083492279,
"learning_rate": 0.00045612952158693114,
"loss": 3.2798,
"step": 41200
},
{
"epoch": 12.026239067055394,
"grad_norm": 0.3555249869823456,
"learning_rate": 0.0004559544924154025,
"loss": 3.2622,
"step": 41250
},
{
"epoch": 12.040816326530612,
"grad_norm": 0.34148281812667847,
"learning_rate": 0.0004557794632438739,
"loss": 3.2654,
"step": 41300
},
{
"epoch": 12.055393586005831,
"grad_norm": 0.340695858001709,
"learning_rate": 0.00045560443407234537,
"loss": 3.28,
"step": 41350
},
{
"epoch": 12.06997084548105,
"grad_norm": 0.342260479927063,
"learning_rate": 0.00045542940490081674,
"loss": 3.2805,
"step": 41400
},
{
"epoch": 12.084548104956268,
"grad_norm": 0.3923518657684326,
"learning_rate": 0.00045525437572928817,
"loss": 3.2852,
"step": 41450
},
{
"epoch": 12.099125364431487,
"grad_norm": 0.331145703792572,
"learning_rate": 0.0004550793465577596,
"loss": 3.262,
"step": 41500
},
{
"epoch": 12.113702623906706,
"grad_norm": 0.35945871472358704,
"learning_rate": 0.000454904317386231,
"loss": 3.2814,
"step": 41550
},
{
"epoch": 12.128279883381925,
"grad_norm": 0.34431734681129456,
"learning_rate": 0.0004547292882147024,
"loss": 3.3014,
"step": 41600
},
{
"epoch": 12.142857142857142,
"grad_norm": 0.3369593322277069,
"learning_rate": 0.0004545542590431738,
"loss": 3.2962,
"step": 41650
},
{
"epoch": 12.157434402332362,
"grad_norm": 0.34433311223983765,
"learning_rate": 0.00045437922987164525,
"loss": 3.2765,
"step": 41700
},
{
"epoch": 12.17201166180758,
"grad_norm": 0.3524353802204132,
"learning_rate": 0.00045420420070011667,
"loss": 3.2916,
"step": 41750
},
{
"epoch": 12.186588921282798,
"grad_norm": 0.3536403477191925,
"learning_rate": 0.00045402917152858804,
"loss": 3.2998,
"step": 41800
},
{
"epoch": 12.201166180758017,
"grad_norm": 0.3326958119869232,
"learning_rate": 0.0004538541423570595,
"loss": 3.3098,
"step": 41850
},
{
"epoch": 12.215743440233236,
"grad_norm": 0.34114643931388855,
"learning_rate": 0.0004536791131855309,
"loss": 3.301,
"step": 41900
},
{
"epoch": 12.230320699708455,
"grad_norm": 0.37027707695961,
"learning_rate": 0.00045350408401400227,
"loss": 3.2952,
"step": 41950
},
{
"epoch": 12.244897959183673,
"grad_norm": 0.346375435590744,
"learning_rate": 0.0004533290548424737,
"loss": 3.3073,
"step": 42000
},
{
"epoch": 12.244897959183673,
"eval_accuracy": 0.3688454576788987,
"eval_loss": 3.5696043968200684,
"eval_runtime": 181.8129,
"eval_samples_per_second": 91.435,
"eval_steps_per_second": 5.715,
"step": 42000
},
{
"epoch": 12.259475218658892,
"grad_norm": 0.3549197316169739,
"learning_rate": 0.0004531540256709451,
"loss": 3.3238,
"step": 42050
},
{
"epoch": 12.274052478134111,
"grad_norm": 0.3610092103481293,
"learning_rate": 0.00045297899649941655,
"loss": 3.3197,
"step": 42100
},
{
"epoch": 12.28862973760933,
"grad_norm": 0.3515133857727051,
"learning_rate": 0.0004528039673278879,
"loss": 3.316,
"step": 42150
},
{
"epoch": 12.303206997084548,
"grad_norm": 0.3595082461833954,
"learning_rate": 0.0004526289381563593,
"loss": 3.3178,
"step": 42200
},
{
"epoch": 12.317784256559767,
"grad_norm": 0.34504249691963196,
"learning_rate": 0.0004524539089848308,
"loss": 3.3047,
"step": 42250
},
{
"epoch": 12.332361516034986,
"grad_norm": 0.33892661333084106,
"learning_rate": 0.00045227887981330215,
"loss": 3.3231,
"step": 42300
},
{
"epoch": 12.346938775510203,
"grad_norm": 0.33925503492355347,
"learning_rate": 0.0004521038506417736,
"loss": 3.3194,
"step": 42350
},
{
"epoch": 12.361516034985423,
"grad_norm": 0.3226408064365387,
"learning_rate": 0.000451928821470245,
"loss": 3.3263,
"step": 42400
},
{
"epoch": 12.376093294460642,
"grad_norm": 0.33827292919158936,
"learning_rate": 0.00045175379229871643,
"loss": 3.3174,
"step": 42450
},
{
"epoch": 12.39067055393586,
"grad_norm": 0.3600287437438965,
"learning_rate": 0.0004515787631271878,
"loss": 3.3228,
"step": 42500
},
{
"epoch": 12.405247813411078,
"grad_norm": 0.353061318397522,
"learning_rate": 0.00045140373395565923,
"loss": 3.3152,
"step": 42550
},
{
"epoch": 12.419825072886297,
"grad_norm": 0.36559176445007324,
"learning_rate": 0.00045122870478413066,
"loss": 3.3145,
"step": 42600
},
{
"epoch": 12.434402332361516,
"grad_norm": 0.34186482429504395,
"learning_rate": 0.0004510536756126021,
"loss": 3.341,
"step": 42650
},
{
"epoch": 12.448979591836734,
"grad_norm": 0.3551715016365051,
"learning_rate": 0.00045087864644107346,
"loss": 3.3345,
"step": 42700
},
{
"epoch": 12.463556851311953,
"grad_norm": 0.3601856529712677,
"learning_rate": 0.00045070361726954494,
"loss": 3.3279,
"step": 42750
},
{
"epoch": 12.478134110787172,
"grad_norm": 0.33436763286590576,
"learning_rate": 0.0004505285880980163,
"loss": 3.3244,
"step": 42800
},
{
"epoch": 12.492711370262391,
"grad_norm": 0.33347827196121216,
"learning_rate": 0.0004503535589264877,
"loss": 3.3318,
"step": 42850
},
{
"epoch": 12.507288629737609,
"grad_norm": 0.3343319892883301,
"learning_rate": 0.0004501785297549591,
"loss": 3.3318,
"step": 42900
},
{
"epoch": 12.521865889212828,
"grad_norm": 0.3466685712337494,
"learning_rate": 0.00045000350058343054,
"loss": 3.3361,
"step": 42950
},
{
"epoch": 12.536443148688047,
"grad_norm": 0.3229898512363434,
"learning_rate": 0.00044982847141190196,
"loss": 3.3408,
"step": 43000
},
{
"epoch": 12.536443148688047,
"eval_accuracy": 0.36942121733791145,
"eval_loss": 3.55837345123291,
"eval_runtime": 181.7545,
"eval_samples_per_second": 91.464,
"eval_steps_per_second": 5.717,
"step": 43000
},
{
"epoch": 12.551020408163264,
"grad_norm": 0.34018051624298096,
"learning_rate": 0.00044965344224037334,
"loss": 3.329,
"step": 43050
},
{
"epoch": 12.565597667638484,
"grad_norm": 0.36611273884773254,
"learning_rate": 0.0004494784130688448,
"loss": 3.337,
"step": 43100
},
{
"epoch": 12.580174927113703,
"grad_norm": 0.3752545714378357,
"learning_rate": 0.0004493033838973162,
"loss": 3.3445,
"step": 43150
},
{
"epoch": 12.594752186588922,
"grad_norm": 0.3641834259033203,
"learning_rate": 0.00044912835472578756,
"loss": 3.3453,
"step": 43200
},
{
"epoch": 12.60932944606414,
"grad_norm": 0.36704397201538086,
"learning_rate": 0.000448953325554259,
"loss": 3.341,
"step": 43250
},
{
"epoch": 12.623906705539358,
"grad_norm": 0.33914434909820557,
"learning_rate": 0.0004487782963827304,
"loss": 3.3488,
"step": 43300
},
{
"epoch": 12.638483965014577,
"grad_norm": 0.3278019428253174,
"learning_rate": 0.00044860326721120184,
"loss": 3.3478,
"step": 43350
},
{
"epoch": 12.653061224489797,
"grad_norm": 0.35495081543922424,
"learning_rate": 0.0004484282380396732,
"loss": 3.3384,
"step": 43400
},
{
"epoch": 12.667638483965014,
"grad_norm": 0.35847678780555725,
"learning_rate": 0.00044825320886814464,
"loss": 3.3517,
"step": 43450
},
{
"epoch": 12.682215743440233,
"grad_norm": 0.3521868884563446,
"learning_rate": 0.00044807817969661607,
"loss": 3.3402,
"step": 43500
},
{
"epoch": 12.696793002915452,
"grad_norm": 0.36590296030044556,
"learning_rate": 0.0004479031505250875,
"loss": 3.3511,
"step": 43550
},
{
"epoch": 12.71137026239067,
"grad_norm": 0.3575020432472229,
"learning_rate": 0.00044772812135355887,
"loss": 3.3492,
"step": 43600
},
{
"epoch": 12.725947521865889,
"grad_norm": 0.31945955753326416,
"learning_rate": 0.00044755309218203035,
"loss": 3.347,
"step": 43650
},
{
"epoch": 12.740524781341108,
"grad_norm": 0.3386148512363434,
"learning_rate": 0.0004473780630105017,
"loss": 3.3531,
"step": 43700
},
{
"epoch": 12.755102040816327,
"grad_norm": 0.3749040365219116,
"learning_rate": 0.0004472030338389731,
"loss": 3.3309,
"step": 43750
},
{
"epoch": 12.769679300291545,
"grad_norm": 0.35152819752693176,
"learning_rate": 0.0004470280046674445,
"loss": 3.353,
"step": 43800
},
{
"epoch": 12.784256559766764,
"grad_norm": 0.3418697416782379,
"learning_rate": 0.00044685297549591595,
"loss": 3.3484,
"step": 43850
},
{
"epoch": 12.798833819241983,
"grad_norm": 0.33918634057044983,
"learning_rate": 0.0004466779463243874,
"loss": 3.3562,
"step": 43900
},
{
"epoch": 12.813411078717202,
"grad_norm": 0.355820894241333,
"learning_rate": 0.00044650291715285875,
"loss": 3.3487,
"step": 43950
},
{
"epoch": 12.82798833819242,
"grad_norm": 0.339217871427536,
"learning_rate": 0.00044632788798133023,
"loss": 3.3521,
"step": 44000
},
{
"epoch": 12.82798833819242,
"eval_accuracy": 0.37001840170719624,
"eval_loss": 3.549464464187622,
"eval_runtime": 181.7706,
"eval_samples_per_second": 91.456,
"eval_steps_per_second": 5.716,
"step": 44000
},
{
"epoch": 12.842565597667638,
"grad_norm": 0.35124990344047546,
"learning_rate": 0.0004461528588098016,
"loss": 3.3483,
"step": 44050
},
{
"epoch": 12.857142857142858,
"grad_norm": 0.3328862190246582,
"learning_rate": 0.000445977829638273,
"loss": 3.3474,
"step": 44100
},
{
"epoch": 12.871720116618075,
"grad_norm": 0.330714613199234,
"learning_rate": 0.0004458028004667444,
"loss": 3.3629,
"step": 44150
},
{
"epoch": 12.886297376093294,
"grad_norm": 0.362414687871933,
"learning_rate": 0.00044562777129521583,
"loss": 3.3562,
"step": 44200
},
{
"epoch": 12.900874635568513,
"grad_norm": 0.3345976769924164,
"learning_rate": 0.00044545274212368725,
"loss": 3.3585,
"step": 44250
},
{
"epoch": 12.915451895043732,
"grad_norm": 0.40451279282569885,
"learning_rate": 0.0004452777129521586,
"loss": 3.3541,
"step": 44300
},
{
"epoch": 12.93002915451895,
"grad_norm": 0.3470134139060974,
"learning_rate": 0.0004451026837806301,
"loss": 3.3578,
"step": 44350
},
{
"epoch": 12.944606413994169,
"grad_norm": 0.3472599983215332,
"learning_rate": 0.0004449276546091015,
"loss": 3.3493,
"step": 44400
},
{
"epoch": 12.959183673469388,
"grad_norm": 0.3843700885772705,
"learning_rate": 0.0004447526254375729,
"loss": 3.3645,
"step": 44450
},
{
"epoch": 12.973760932944607,
"grad_norm": 0.3511066436767578,
"learning_rate": 0.0004445775962660443,
"loss": 3.3645,
"step": 44500
},
{
"epoch": 12.988338192419825,
"grad_norm": 0.3752990961074829,
"learning_rate": 0.00044440256709451576,
"loss": 3.3557,
"step": 44550
},
{
"epoch": 13.002915451895044,
"grad_norm": 0.35009706020355225,
"learning_rate": 0.00044422753792298713,
"loss": 3.3437,
"step": 44600
},
{
"epoch": 13.017492711370263,
"grad_norm": 0.36041513085365295,
"learning_rate": 0.0004440525087514585,
"loss": 3.25,
"step": 44650
},
{
"epoch": 13.03206997084548,
"grad_norm": 0.373749703168869,
"learning_rate": 0.00044387747957992993,
"loss": 3.2508,
"step": 44700
},
{
"epoch": 13.0466472303207,
"grad_norm": 0.34117427468299866,
"learning_rate": 0.00044370245040840136,
"loss": 3.2498,
"step": 44750
},
{
"epoch": 13.061224489795919,
"grad_norm": 0.3592318296432495,
"learning_rate": 0.0004435274212368728,
"loss": 3.2561,
"step": 44800
},
{
"epoch": 13.075801749271138,
"grad_norm": 0.3574276864528656,
"learning_rate": 0.00044335239206534416,
"loss": 3.2736,
"step": 44850
},
{
"epoch": 13.090379008746355,
"grad_norm": 0.35315731167793274,
"learning_rate": 0.00044317736289381564,
"loss": 3.267,
"step": 44900
},
{
"epoch": 13.104956268221574,
"grad_norm": 0.35178348422050476,
"learning_rate": 0.000443002333722287,
"loss": 3.2747,
"step": 44950
},
{
"epoch": 13.119533527696793,
"grad_norm": 0.4218001365661621,
"learning_rate": 0.0004428273045507584,
"loss": 3.2711,
"step": 45000
},
{
"epoch": 13.119533527696793,
"eval_accuracy": 0.36915187812306355,
"eval_loss": 3.5657756328582764,
"eval_runtime": 181.8618,
"eval_samples_per_second": 91.41,
"eval_steps_per_second": 5.713,
"step": 45000
},
{
"epoch": 13.13411078717201,
"grad_norm": 0.37173452973365784,
"learning_rate": 0.0004426522753792298,
"loss": 3.277,
"step": 45050
},
{
"epoch": 13.14868804664723,
"grad_norm": 0.34601107239723206,
"learning_rate": 0.00044247724620770124,
"loss": 3.2698,
"step": 45100
},
{
"epoch": 13.16326530612245,
"grad_norm": 0.37380507588386536,
"learning_rate": 0.00044230221703617267,
"loss": 3.2775,
"step": 45150
},
{
"epoch": 13.177842565597668,
"grad_norm": 0.34601885080337524,
"learning_rate": 0.00044212718786464404,
"loss": 3.2846,
"step": 45200
},
{
"epoch": 13.192419825072886,
"grad_norm": 0.343649685382843,
"learning_rate": 0.0004419521586931155,
"loss": 3.2878,
"step": 45250
},
{
"epoch": 13.206997084548105,
"grad_norm": 0.3760887086391449,
"learning_rate": 0.0004417771295215869,
"loss": 3.2891,
"step": 45300
},
{
"epoch": 13.221574344023324,
"grad_norm": 0.34684184193611145,
"learning_rate": 0.0004416021003500583,
"loss": 3.2816,
"step": 45350
},
{
"epoch": 13.236151603498543,
"grad_norm": 0.37741604447364807,
"learning_rate": 0.0004414270711785297,
"loss": 3.2905,
"step": 45400
},
{
"epoch": 13.25072886297376,
"grad_norm": 0.3436417281627655,
"learning_rate": 0.00044125204200700117,
"loss": 3.2854,
"step": 45450
},
{
"epoch": 13.26530612244898,
"grad_norm": 0.3912341892719269,
"learning_rate": 0.00044107701283547254,
"loss": 3.304,
"step": 45500
},
{
"epoch": 13.279883381924199,
"grad_norm": 0.34772416949272156,
"learning_rate": 0.0004409019836639439,
"loss": 3.2981,
"step": 45550
},
{
"epoch": 13.294460641399416,
"grad_norm": 0.35889309644699097,
"learning_rate": 0.0004407269544924154,
"loss": 3.299,
"step": 45600
},
{
"epoch": 13.309037900874635,
"grad_norm": 0.3548545241355896,
"learning_rate": 0.00044055192532088677,
"loss": 3.2913,
"step": 45650
},
{
"epoch": 13.323615160349854,
"grad_norm": 0.3576371669769287,
"learning_rate": 0.0004403768961493582,
"loss": 3.2963,
"step": 45700
},
{
"epoch": 13.338192419825074,
"grad_norm": 0.35445261001586914,
"learning_rate": 0.00044020186697782957,
"loss": 3.3088,
"step": 45750
},
{
"epoch": 13.352769679300291,
"grad_norm": 0.3462010622024536,
"learning_rate": 0.00044002683780630105,
"loss": 3.3021,
"step": 45800
},
{
"epoch": 13.36734693877551,
"grad_norm": 0.36478662490844727,
"learning_rate": 0.0004398518086347724,
"loss": 3.3029,
"step": 45850
},
{
"epoch": 13.38192419825073,
"grad_norm": 0.36026138067245483,
"learning_rate": 0.0004396767794632438,
"loss": 3.3103,
"step": 45900
},
{
"epoch": 13.396501457725947,
"grad_norm": 0.34346291422843933,
"learning_rate": 0.0004395017502917152,
"loss": 3.3082,
"step": 45950
},
{
"epoch": 13.411078717201166,
"grad_norm": 0.38728317618370056,
"learning_rate": 0.00043932672112018665,
"loss": 3.303,
"step": 46000
},
{
"epoch": 13.411078717201166,
"eval_accuracy": 0.3698803182723114,
"eval_loss": 3.5604703426361084,
"eval_runtime": 181.7583,
"eval_samples_per_second": 91.462,
"eval_steps_per_second": 5.716,
"step": 46000
},
{
"epoch": 13.425655976676385,
"grad_norm": 0.37464943528175354,
"learning_rate": 0.0004391516919486581,
"loss": 3.3086,
"step": 46050
},
{
"epoch": 13.440233236151604,
"grad_norm": 0.3361836373806,
"learning_rate": 0.00043897666277712945,
"loss": 3.3045,
"step": 46100
},
{
"epoch": 13.454810495626822,
"grad_norm": 0.37110447883605957,
"learning_rate": 0.00043880163360560093,
"loss": 3.3114,
"step": 46150
},
{
"epoch": 13.46938775510204,
"grad_norm": 0.3804593086242676,
"learning_rate": 0.0004386266044340723,
"loss": 3.3161,
"step": 46200
},
{
"epoch": 13.48396501457726,
"grad_norm": 0.3696232736110687,
"learning_rate": 0.00043845157526254373,
"loss": 3.3119,
"step": 46250
},
{
"epoch": 13.498542274052479,
"grad_norm": 0.3362153172492981,
"learning_rate": 0.0004382765460910151,
"loss": 3.3166,
"step": 46300
},
{
"epoch": 13.513119533527696,
"grad_norm": 0.355120986700058,
"learning_rate": 0.0004381015169194866,
"loss": 3.3145,
"step": 46350
},
{
"epoch": 13.527696793002915,
"grad_norm": 0.3447381854057312,
"learning_rate": 0.00043792648774795796,
"loss": 3.3336,
"step": 46400
},
{
"epoch": 13.542274052478135,
"grad_norm": 0.3290923535823822,
"learning_rate": 0.00043775145857642933,
"loss": 3.3119,
"step": 46450
},
{
"epoch": 13.556851311953352,
"grad_norm": 0.3647269010543823,
"learning_rate": 0.0004375764294049008,
"loss": 3.314,
"step": 46500
},
{
"epoch": 13.571428571428571,
"grad_norm": 0.34759560227394104,
"learning_rate": 0.0004374014002333722,
"loss": 3.3183,
"step": 46550
},
{
"epoch": 13.58600583090379,
"grad_norm": 0.35087084770202637,
"learning_rate": 0.0004372263710618436,
"loss": 3.314,
"step": 46600
},
{
"epoch": 13.60058309037901,
"grad_norm": 0.3470548987388611,
"learning_rate": 0.000437051341890315,
"loss": 3.3172,
"step": 46650
},
{
"epoch": 13.615160349854227,
"grad_norm": 0.34616619348526,
"learning_rate": 0.00043687631271878646,
"loss": 3.3246,
"step": 46700
},
{
"epoch": 13.629737609329446,
"grad_norm": 0.3519126772880554,
"learning_rate": 0.00043670128354725784,
"loss": 3.3239,
"step": 46750
},
{
"epoch": 13.644314868804665,
"grad_norm": 0.3683834969997406,
"learning_rate": 0.0004365262543757292,
"loss": 3.32,
"step": 46800
},
{
"epoch": 13.658892128279884,
"grad_norm": 0.37900853157043457,
"learning_rate": 0.0004363512252042007,
"loss": 3.3224,
"step": 46850
},
{
"epoch": 13.673469387755102,
"grad_norm": 0.3418913781642914,
"learning_rate": 0.00043617619603267206,
"loss": 3.3192,
"step": 46900
},
{
"epoch": 13.68804664723032,
"grad_norm": 0.33223602175712585,
"learning_rate": 0.0004360011668611435,
"loss": 3.3318,
"step": 46950
},
{
"epoch": 13.70262390670554,
"grad_norm": 0.3742575943470001,
"learning_rate": 0.00043582613768961486,
"loss": 3.3211,
"step": 47000
},
{
"epoch": 13.70262390670554,
"eval_accuracy": 0.37053812750857457,
"eval_loss": 3.552196741104126,
"eval_runtime": 181.8675,
"eval_samples_per_second": 91.407,
"eval_steps_per_second": 5.713,
"step": 47000
},
{
"epoch": 13.717201166180757,
"grad_norm": 0.3494127690792084,
"learning_rate": 0.00043565110851808634,
"loss": 3.3215,
"step": 47050
},
{
"epoch": 13.731778425655977,
"grad_norm": 0.35944312810897827,
"learning_rate": 0.0004354760793465577,
"loss": 3.3346,
"step": 47100
},
{
"epoch": 13.746355685131196,
"grad_norm": 0.34206297993659973,
"learning_rate": 0.00043530105017502914,
"loss": 3.3435,
"step": 47150
},
{
"epoch": 13.760932944606415,
"grad_norm": 0.3618668019771576,
"learning_rate": 0.0004351260210035005,
"loss": 3.3161,
"step": 47200
},
{
"epoch": 13.775510204081632,
"grad_norm": 0.34841158986091614,
"learning_rate": 0.000434950991831972,
"loss": 3.3357,
"step": 47250
},
{
"epoch": 13.790087463556851,
"grad_norm": 0.3624778091907501,
"learning_rate": 0.00043477596266044337,
"loss": 3.3493,
"step": 47300
},
{
"epoch": 13.80466472303207,
"grad_norm": 0.37030959129333496,
"learning_rate": 0.00043460093348891474,
"loss": 3.3391,
"step": 47350
},
{
"epoch": 13.819241982507288,
"grad_norm": 0.37484002113342285,
"learning_rate": 0.0004344259043173862,
"loss": 3.3422,
"step": 47400
},
{
"epoch": 13.833819241982507,
"grad_norm": 0.3513255715370178,
"learning_rate": 0.0004342508751458576,
"loss": 3.3304,
"step": 47450
},
{
"epoch": 13.848396501457726,
"grad_norm": 0.33904463052749634,
"learning_rate": 0.000434075845974329,
"loss": 3.3344,
"step": 47500
},
{
"epoch": 13.862973760932945,
"grad_norm": 0.3457725942134857,
"learning_rate": 0.0004339008168028004,
"loss": 3.34,
"step": 47550
},
{
"epoch": 13.877551020408163,
"grad_norm": 0.324558287858963,
"learning_rate": 0.0004337257876312719,
"loss": 3.3388,
"step": 47600
},
{
"epoch": 13.892128279883382,
"grad_norm": 0.34971484541893005,
"learning_rate": 0.00043355075845974325,
"loss": 3.3423,
"step": 47650
},
{
"epoch": 13.906705539358601,
"grad_norm": 0.3688257038593292,
"learning_rate": 0.0004333757292882146,
"loss": 3.3374,
"step": 47700
},
{
"epoch": 13.921282798833818,
"grad_norm": 0.39048126339912415,
"learning_rate": 0.0004332007001166861,
"loss": 3.3488,
"step": 47750
},
{
"epoch": 13.935860058309038,
"grad_norm": 0.35178861021995544,
"learning_rate": 0.0004330256709451575,
"loss": 3.34,
"step": 47800
},
{
"epoch": 13.950437317784257,
"grad_norm": 0.31503215432167053,
"learning_rate": 0.0004328506417736289,
"loss": 3.3474,
"step": 47850
},
{
"epoch": 13.965014577259476,
"grad_norm": 0.40469351410865784,
"learning_rate": 0.0004326756126021003,
"loss": 3.3482,
"step": 47900
},
{
"epoch": 13.979591836734693,
"grad_norm": 0.3635316491127014,
"learning_rate": 0.00043250058343057175,
"loss": 3.3434,
"step": 47950
},
{
"epoch": 13.994169096209912,
"grad_norm": 0.3199983835220337,
"learning_rate": 0.0004323255542590431,
"loss": 3.3581,
"step": 48000
},
{
"epoch": 13.994169096209912,
"eval_accuracy": 0.3712570324845695,
"eval_loss": 3.541492223739624,
"eval_runtime": 181.829,
"eval_samples_per_second": 91.427,
"eval_steps_per_second": 5.714,
"step": 48000
},
{
"epoch": 14.008746355685131,
"grad_norm": 0.3561180531978607,
"learning_rate": 0.00043215052508751455,
"loss": 3.2609,
"step": 48050
},
{
"epoch": 14.02332361516035,
"grad_norm": 0.34569546580314636,
"learning_rate": 0.000431975495915986,
"loss": 3.2238,
"step": 48100
},
{
"epoch": 14.037900874635568,
"grad_norm": 0.37043094635009766,
"learning_rate": 0.0004318004667444574,
"loss": 3.2315,
"step": 48150
},
{
"epoch": 14.052478134110787,
"grad_norm": 0.3671160936355591,
"learning_rate": 0.0004316254375729288,
"loss": 3.2376,
"step": 48200
},
{
"epoch": 14.067055393586006,
"grad_norm": 0.37406212091445923,
"learning_rate": 0.00043145040840140015,
"loss": 3.2348,
"step": 48250
},
{
"epoch": 14.081632653061224,
"grad_norm": 0.4101208448410034,
"learning_rate": 0.00043127537922987163,
"loss": 3.2382,
"step": 48300
},
{
"epoch": 14.096209912536443,
"grad_norm": 0.36454081535339355,
"learning_rate": 0.000431100350058343,
"loss": 3.2494,
"step": 48350
},
{
"epoch": 14.110787172011662,
"grad_norm": 0.3647298514842987,
"learning_rate": 0.00043092532088681443,
"loss": 3.2586,
"step": 48400
},
{
"epoch": 14.125364431486881,
"grad_norm": 0.3764522075653076,
"learning_rate": 0.0004307502917152858,
"loss": 3.2495,
"step": 48450
},
{
"epoch": 14.139941690962099,
"grad_norm": 0.3430420160293579,
"learning_rate": 0.0004305752625437573,
"loss": 3.2612,
"step": 48500
},
{
"epoch": 14.154518950437318,
"grad_norm": 0.3535497784614563,
"learning_rate": 0.00043040023337222866,
"loss": 3.2669,
"step": 48550
},
{
"epoch": 14.169096209912537,
"grad_norm": 0.3597559630870819,
"learning_rate": 0.00043022520420070003,
"loss": 3.26,
"step": 48600
},
{
"epoch": 14.183673469387756,
"grad_norm": 0.3598780930042267,
"learning_rate": 0.0004300501750291715,
"loss": 3.2653,
"step": 48650
},
{
"epoch": 14.198250728862973,
"grad_norm": 0.3604069650173187,
"learning_rate": 0.0004298751458576429,
"loss": 3.2714,
"step": 48700
},
{
"epoch": 14.212827988338192,
"grad_norm": 0.39230746030807495,
"learning_rate": 0.0004297001166861143,
"loss": 3.2626,
"step": 48750
},
{
"epoch": 14.227405247813412,
"grad_norm": 0.3730916678905487,
"learning_rate": 0.0004295250875145857,
"loss": 3.2742,
"step": 48800
},
{
"epoch": 14.241982507288629,
"grad_norm": 0.3509989380836487,
"learning_rate": 0.00042935005834305717,
"loss": 3.277,
"step": 48850
},
{
"epoch": 14.256559766763848,
"grad_norm": 0.36396312713623047,
"learning_rate": 0.00042917502917152854,
"loss": 3.2784,
"step": 48900
},
{
"epoch": 14.271137026239067,
"grad_norm": 0.3386274576187134,
"learning_rate": 0.00042899999999999997,
"loss": 3.2694,
"step": 48950
},
{
"epoch": 14.285714285714286,
"grad_norm": 0.37972214818000793,
"learning_rate": 0.0004288249708284714,
"loss": 3.2851,
"step": 49000
},
{
"epoch": 14.285714285714286,
"eval_accuracy": 0.37012011022189406,
"eval_loss": 3.5599772930145264,
"eval_runtime": 181.7492,
"eval_samples_per_second": 91.467,
"eval_steps_per_second": 5.717,
"step": 49000
},
{
"epoch": 14.300291545189504,
"grad_norm": 0.3713630437850952,
"learning_rate": 0.0004286499416569428,
"loss": 3.2855,
"step": 49050
},
{
"epoch": 14.314868804664723,
"grad_norm": 0.3884792625904083,
"learning_rate": 0.0004284749124854142,
"loss": 3.2723,
"step": 49100
},
{
"epoch": 14.329446064139942,
"grad_norm": 0.3575059771537781,
"learning_rate": 0.00042829988331388556,
"loss": 3.2844,
"step": 49150
},
{
"epoch": 14.34402332361516,
"grad_norm": 0.35480329394340515,
"learning_rate": 0.00042812485414235704,
"loss": 3.2873,
"step": 49200
},
{
"epoch": 14.358600583090379,
"grad_norm": 0.3646618127822876,
"learning_rate": 0.0004279498249708284,
"loss": 3.2825,
"step": 49250
},
{
"epoch": 14.373177842565598,
"grad_norm": 0.3766191303730011,
"learning_rate": 0.00042777479579929984,
"loss": 3.2883,
"step": 49300
},
{
"epoch": 14.387755102040817,
"grad_norm": 0.3955742418766022,
"learning_rate": 0.00042759976662777127,
"loss": 3.2963,
"step": 49350
},
{
"epoch": 14.402332361516034,
"grad_norm": 0.3547273874282837,
"learning_rate": 0.0004274247374562427,
"loss": 3.2965,
"step": 49400
},
{
"epoch": 14.416909620991254,
"grad_norm": 0.34324541687965393,
"learning_rate": 0.00042724970828471407,
"loss": 3.2945,
"step": 49450
},
{
"epoch": 14.431486880466473,
"grad_norm": 0.36863452196121216,
"learning_rate": 0.00042707467911318544,
"loss": 3.2996,
"step": 49500
},
{
"epoch": 14.446064139941692,
"grad_norm": 0.37455543875694275,
"learning_rate": 0.0004268996499416569,
"loss": 3.2931,
"step": 49550
},
{
"epoch": 14.46064139941691,
"grad_norm": 0.32714369893074036,
"learning_rate": 0.0004267246207701283,
"loss": 3.3118,
"step": 49600
},
{
"epoch": 14.475218658892128,
"grad_norm": 0.3829602003097534,
"learning_rate": 0.0004265495915985997,
"loss": 3.3034,
"step": 49650
},
{
"epoch": 14.489795918367347,
"grad_norm": 0.37818360328674316,
"learning_rate": 0.0004263745624270711,
"loss": 3.3091,
"step": 49700
},
{
"epoch": 14.504373177842565,
"grad_norm": 0.36534905433654785,
"learning_rate": 0.0004261995332555426,
"loss": 3.3183,
"step": 49750
},
{
"epoch": 14.518950437317784,
"grad_norm": 0.3587852716445923,
"learning_rate": 0.00042602450408401395,
"loss": 3.3065,
"step": 49800
},
{
"epoch": 14.533527696793003,
"grad_norm": 0.3622785210609436,
"learning_rate": 0.0004258494749124854,
"loss": 3.3013,
"step": 49850
},
{
"epoch": 14.548104956268222,
"grad_norm": 0.38269826769828796,
"learning_rate": 0.0004256744457409568,
"loss": 3.3107,
"step": 49900
},
{
"epoch": 14.56268221574344,
"grad_norm": 0.3362688720226288,
"learning_rate": 0.00042549941656942823,
"loss": 3.3043,
"step": 49950
},
{
"epoch": 14.577259475218659,
"grad_norm": 0.3540298044681549,
"learning_rate": 0.0004253243873978996,
"loss": 3.3056,
"step": 50000
},
{
"epoch": 14.577259475218659,
"eval_accuracy": 0.3706933977989524,
"eval_loss": 3.552320718765259,
"eval_runtime": 181.7913,
"eval_samples_per_second": 91.446,
"eval_steps_per_second": 5.715,
"step": 50000
},
{
"epoch": 14.591836734693878,
"grad_norm": 0.35744428634643555,
"learning_rate": 0.000425149358226371,
"loss": 3.3006,
"step": 50050
},
{
"epoch": 14.606413994169095,
"grad_norm": 0.35320979356765747,
"learning_rate": 0.00042497432905484246,
"loss": 3.3105,
"step": 50100
},
{
"epoch": 14.620991253644315,
"grad_norm": 0.3753153085708618,
"learning_rate": 0.00042479929988331383,
"loss": 3.3255,
"step": 50150
},
{
"epoch": 14.635568513119534,
"grad_norm": 0.3381422162055969,
"learning_rate": 0.00042462427071178526,
"loss": 3.3203,
"step": 50200
},
{
"epoch": 14.650145772594753,
"grad_norm": 0.37024828791618347,
"learning_rate": 0.0004244492415402567,
"loss": 3.3146,
"step": 50250
},
{
"epoch": 14.66472303206997,
"grad_norm": 0.3409423828125,
"learning_rate": 0.0004242742123687281,
"loss": 3.2963,
"step": 50300
},
{
"epoch": 14.67930029154519,
"grad_norm": 0.33922427892684937,
"learning_rate": 0.0004240991831971995,
"loss": 3.3112,
"step": 50350
},
{
"epoch": 14.693877551020408,
"grad_norm": 0.38118499517440796,
"learning_rate": 0.00042392415402567085,
"loss": 3.3151,
"step": 50400
},
{
"epoch": 14.708454810495628,
"grad_norm": 0.34291163086891174,
"learning_rate": 0.00042374912485414234,
"loss": 3.3245,
"step": 50450
},
{
"epoch": 14.723032069970845,
"grad_norm": 0.3485682010650635,
"learning_rate": 0.0004235740956826137,
"loss": 3.3257,
"step": 50500
},
{
"epoch": 14.737609329446064,
"grad_norm": 0.3919681906700134,
"learning_rate": 0.00042339906651108514,
"loss": 3.3112,
"step": 50550
},
{
"epoch": 14.752186588921283,
"grad_norm": 0.36933663487434387,
"learning_rate": 0.00042322403733955656,
"loss": 3.3269,
"step": 50600
},
{
"epoch": 14.7667638483965,
"grad_norm": 0.37549716234207153,
"learning_rate": 0.000423049008168028,
"loss": 3.3143,
"step": 50650
},
{
"epoch": 14.78134110787172,
"grad_norm": 0.37795278429985046,
"learning_rate": 0.00042287397899649936,
"loss": 3.3174,
"step": 50700
},
{
"epoch": 14.795918367346939,
"grad_norm": 0.3924393951892853,
"learning_rate": 0.0004226989498249708,
"loss": 3.3155,
"step": 50750
},
{
"epoch": 14.810495626822158,
"grad_norm": 0.38116684556007385,
"learning_rate": 0.0004225239206534422,
"loss": 3.3216,
"step": 50800
},
{
"epoch": 14.825072886297376,
"grad_norm": 0.3596772253513336,
"learning_rate": 0.00042234889148191364,
"loss": 3.3287,
"step": 50850
},
{
"epoch": 14.839650145772595,
"grad_norm": 0.34857410192489624,
"learning_rate": 0.000422173862310385,
"loss": 3.3243,
"step": 50900
},
{
"epoch": 14.854227405247814,
"grad_norm": 0.33267682790756226,
"learning_rate": 0.0004219988331388564,
"loss": 3.318,
"step": 50950
},
{
"epoch": 14.868804664723033,
"grad_norm": 0.3842490613460541,
"learning_rate": 0.00042182380396732787,
"loss": 3.3251,
"step": 51000
},
{
"epoch": 14.868804664723033,
"eval_accuracy": 0.3712048833271492,
"eval_loss": 3.5443015098571777,
"eval_runtime": 181.8147,
"eval_samples_per_second": 91.434,
"eval_steps_per_second": 5.715,
"step": 51000
},
{
"epoch": 14.88338192419825,
"grad_norm": 0.3622199296951294,
"learning_rate": 0.00042164877479579924,
"loss": 3.3409,
"step": 51050
},
{
"epoch": 14.89795918367347,
"grad_norm": 0.35461923480033875,
"learning_rate": 0.00042147374562427067,
"loss": 3.3252,
"step": 51100
},
{
"epoch": 14.912536443148689,
"grad_norm": 0.3685915172100067,
"learning_rate": 0.0004212987164527421,
"loss": 3.3284,
"step": 51150
},
{
"epoch": 14.927113702623906,
"grad_norm": 0.3246322274208069,
"learning_rate": 0.0004211236872812135,
"loss": 3.3201,
"step": 51200
},
{
"epoch": 14.941690962099125,
"grad_norm": 0.36358892917633057,
"learning_rate": 0.0004209486581096849,
"loss": 3.322,
"step": 51250
},
{
"epoch": 14.956268221574344,
"grad_norm": 0.3402910828590393,
"learning_rate": 0.00042077362893815627,
"loss": 3.3405,
"step": 51300
},
{
"epoch": 14.970845481049563,
"grad_norm": 0.3693722188472748,
"learning_rate": 0.00042059859976662775,
"loss": 3.3269,
"step": 51350
},
{
"epoch": 14.98542274052478,
"grad_norm": 0.34414148330688477,
"learning_rate": 0.0004204235705950991,
"loss": 3.3193,
"step": 51400
},
{
"epoch": 15.0,
"grad_norm": 0.35340461134910583,
"learning_rate": 0.00042024854142357055,
"loss": 3.3147,
"step": 51450
},
{
"epoch": 15.01457725947522,
"grad_norm": 0.3598097562789917,
"learning_rate": 0.000420073512252042,
"loss": 3.2213,
"step": 51500
},
{
"epoch": 15.029154518950437,
"grad_norm": 0.36583930253982544,
"learning_rate": 0.0004198984830805134,
"loss": 3.2128,
"step": 51550
},
{
"epoch": 15.043731778425656,
"grad_norm": 0.3556552231311798,
"learning_rate": 0.0004197234539089848,
"loss": 3.2216,
"step": 51600
},
{
"epoch": 15.058309037900875,
"grad_norm": 0.35826733708381653,
"learning_rate": 0.0004195484247374562,
"loss": 3.2234,
"step": 51650
},
{
"epoch": 15.072886297376094,
"grad_norm": 0.35535216331481934,
"learning_rate": 0.0004193733955659276,
"loss": 3.2361,
"step": 51700
},
{
"epoch": 15.087463556851311,
"grad_norm": 0.34739965200424194,
"learning_rate": 0.00041919836639439905,
"loss": 3.2338,
"step": 51750
},
{
"epoch": 15.10204081632653,
"grad_norm": 0.3416599631309509,
"learning_rate": 0.0004190233372228704,
"loss": 3.2461,
"step": 51800
},
{
"epoch": 15.11661807580175,
"grad_norm": 0.3742870092391968,
"learning_rate": 0.0004188483080513419,
"loss": 3.2419,
"step": 51850
},
{
"epoch": 15.131195335276967,
"grad_norm": 0.3667660057544708,
"learning_rate": 0.0004186732788798133,
"loss": 3.2593,
"step": 51900
},
{
"epoch": 15.145772594752186,
"grad_norm": 0.38009244203567505,
"learning_rate": 0.00041849824970828465,
"loss": 3.2449,
"step": 51950
},
{
"epoch": 15.160349854227405,
"grad_norm": 0.352700799703598,
"learning_rate": 0.0004183232205367561,
"loss": 3.2478,
"step": 52000
},
{
"epoch": 15.160349854227405,
"eval_accuracy": 0.3707503734020933,
"eval_loss": 3.559041976928711,
"eval_runtime": 181.6538,
"eval_samples_per_second": 91.515,
"eval_steps_per_second": 5.72,
"step": 52000
},
{
"epoch": 15.174927113702624,
"grad_norm": 0.38742712140083313,
"learning_rate": 0.0004181481913652275,
"loss": 3.2548,
"step": 52050
},
{
"epoch": 15.189504373177842,
"grad_norm": 0.39204293489456177,
"learning_rate": 0.00041797316219369893,
"loss": 3.2614,
"step": 52100
},
{
"epoch": 15.204081632653061,
"grad_norm": 0.3423581123352051,
"learning_rate": 0.0004177981330221703,
"loss": 3.2599,
"step": 52150
},
{
"epoch": 15.21865889212828,
"grad_norm": 0.3979237973690033,
"learning_rate": 0.0004176231038506417,
"loss": 3.2615,
"step": 52200
},
{
"epoch": 15.2332361516035,
"grad_norm": 0.37007424235343933,
"learning_rate": 0.00041744807467911316,
"loss": 3.2607,
"step": 52250
},
{
"epoch": 15.247813411078717,
"grad_norm": 0.34523463249206543,
"learning_rate": 0.00041727304550758453,
"loss": 3.2592,
"step": 52300
},
{
"epoch": 15.262390670553936,
"grad_norm": 0.3491571843624115,
"learning_rate": 0.00041709801633605596,
"loss": 3.2602,
"step": 52350
},
{
"epoch": 15.276967930029155,
"grad_norm": 0.3450250029563904,
"learning_rate": 0.0004169229871645274,
"loss": 3.2702,
"step": 52400
},
{
"epoch": 15.291545189504372,
"grad_norm": 0.3784157633781433,
"learning_rate": 0.0004167479579929988,
"loss": 3.2883,
"step": 52450
},
{
"epoch": 15.306122448979592,
"grad_norm": 0.3518495261669159,
"learning_rate": 0.0004165729288214702,
"loss": 3.2777,
"step": 52500
},
{
"epoch": 15.32069970845481,
"grad_norm": 0.37740063667297363,
"learning_rate": 0.0004163978996499416,
"loss": 3.2679,
"step": 52550
},
{
"epoch": 15.33527696793003,
"grad_norm": 0.3901689648628235,
"learning_rate": 0.00041622287047841304,
"loss": 3.2735,
"step": 52600
},
{
"epoch": 15.349854227405247,
"grad_norm": 0.38509419560432434,
"learning_rate": 0.00041604784130688447,
"loss": 3.2671,
"step": 52650
},
{
"epoch": 15.364431486880466,
"grad_norm": 0.35500845313072205,
"learning_rate": 0.00041587281213535584,
"loss": 3.2684,
"step": 52700
},
{
"epoch": 15.379008746355685,
"grad_norm": 0.37637239694595337,
"learning_rate": 0.0004156977829638273,
"loss": 3.27,
"step": 52750
},
{
"epoch": 15.393586005830905,
"grad_norm": 0.3534194827079773,
"learning_rate": 0.0004155227537922987,
"loss": 3.2818,
"step": 52800
},
{
"epoch": 15.408163265306122,
"grad_norm": 0.37404513359069824,
"learning_rate": 0.00041534772462077006,
"loss": 3.2755,
"step": 52850
},
{
"epoch": 15.422740524781341,
"grad_norm": 0.33428794145584106,
"learning_rate": 0.0004151726954492415,
"loss": 3.2799,
"step": 52900
},
{
"epoch": 15.43731778425656,
"grad_norm": 0.37358441948890686,
"learning_rate": 0.0004149976662777129,
"loss": 3.2847,
"step": 52950
},
{
"epoch": 15.451895043731778,
"grad_norm": 0.3440437912940979,
"learning_rate": 0.00041482263710618434,
"loss": 3.2882,
"step": 53000
},
{
"epoch": 15.451895043731778,
"eval_accuracy": 0.3708990514739259,
"eval_loss": 3.5549874305725098,
"eval_runtime": 181.6144,
"eval_samples_per_second": 91.535,
"eval_steps_per_second": 5.721,
"step": 53000
},
{
"epoch": 15.466472303206997,
"grad_norm": 0.41005825996398926,
"learning_rate": 0.0004146476079346557,
"loss": 3.2757,
"step": 53050
},
{
"epoch": 15.481049562682216,
"grad_norm": 0.36211097240448,
"learning_rate": 0.0004144725787631272,
"loss": 3.2745,
"step": 53100
},
{
"epoch": 15.495626822157435,
"grad_norm": 0.37981894612312317,
"learning_rate": 0.00041429754959159857,
"loss": 3.2811,
"step": 53150
},
{
"epoch": 15.510204081632653,
"grad_norm": 0.35121893882751465,
"learning_rate": 0.00041412252042006994,
"loss": 3.2869,
"step": 53200
},
{
"epoch": 15.524781341107872,
"grad_norm": 0.38234373927116394,
"learning_rate": 0.00041394749124854137,
"loss": 3.3016,
"step": 53250
},
{
"epoch": 15.53935860058309,
"grad_norm": 0.3564067482948303,
"learning_rate": 0.0004137724620770128,
"loss": 3.2852,
"step": 53300
},
{
"epoch": 15.55393586005831,
"grad_norm": 0.37256014347076416,
"learning_rate": 0.0004135974329054842,
"loss": 3.2858,
"step": 53350
},
{
"epoch": 15.568513119533527,
"grad_norm": 0.35131582617759705,
"learning_rate": 0.0004134224037339556,
"loss": 3.2981,
"step": 53400
},
{
"epoch": 15.583090379008746,
"grad_norm": 0.3604174852371216,
"learning_rate": 0.000413247374562427,
"loss": 3.2876,
"step": 53450
},
{
"epoch": 15.597667638483966,
"grad_norm": 0.3844100534915924,
"learning_rate": 0.00041307234539089845,
"loss": 3.2905,
"step": 53500
},
{
"epoch": 15.612244897959183,
"grad_norm": 0.33866187930107117,
"learning_rate": 0.0004128973162193699,
"loss": 3.2859,
"step": 53550
},
{
"epoch": 15.626822157434402,
"grad_norm": 0.37496766448020935,
"learning_rate": 0.00041272228704784125,
"loss": 3.2932,
"step": 53600
},
{
"epoch": 15.641399416909621,
"grad_norm": 0.34779778122901917,
"learning_rate": 0.00041254725787631273,
"loss": 3.2996,
"step": 53650
},
{
"epoch": 15.65597667638484,
"grad_norm": 0.3477199673652649,
"learning_rate": 0.0004123722287047841,
"loss": 3.3018,
"step": 53700
},
{
"epoch": 15.670553935860058,
"grad_norm": 0.3835027515888214,
"learning_rate": 0.0004121971995332555,
"loss": 3.3079,
"step": 53750
},
{
"epoch": 15.685131195335277,
"grad_norm": 0.35675570368766785,
"learning_rate": 0.0004120221703617269,
"loss": 3.3029,
"step": 53800
},
{
"epoch": 15.699708454810496,
"grad_norm": 0.3724403977394104,
"learning_rate": 0.00041184714119019833,
"loss": 3.296,
"step": 53850
},
{
"epoch": 15.714285714285714,
"grad_norm": 0.38267725706100464,
"learning_rate": 0.00041167211201866976,
"loss": 3.3099,
"step": 53900
},
{
"epoch": 15.728862973760933,
"grad_norm": 0.3567516803741455,
"learning_rate": 0.00041149708284714113,
"loss": 3.2896,
"step": 53950
},
{
"epoch": 15.743440233236152,
"grad_norm": 0.39038965106010437,
"learning_rate": 0.0004113220536756126,
"loss": 3.3035,
"step": 54000
},
{
"epoch": 15.743440233236152,
"eval_accuracy": 0.3714218379482002,
"eval_loss": 3.546982526779175,
"eval_runtime": 181.7086,
"eval_samples_per_second": 91.487,
"eval_steps_per_second": 5.718,
"step": 54000
},
{
"epoch": 15.758017492711371,
"grad_norm": 0.3851892948150635,
"learning_rate": 0.000411147024504084,
"loss": 3.3165,
"step": 54050
},
{
"epoch": 15.772594752186588,
"grad_norm": 0.36965999007225037,
"learning_rate": 0.00041097199533255535,
"loss": 3.3061,
"step": 54100
},
{
"epoch": 15.787172011661808,
"grad_norm": 0.3408694863319397,
"learning_rate": 0.0004107969661610268,
"loss": 3.3097,
"step": 54150
},
{
"epoch": 15.801749271137027,
"grad_norm": 0.33933934569358826,
"learning_rate": 0.0004106219369894982,
"loss": 3.2994,
"step": 54200
},
{
"epoch": 15.816326530612244,
"grad_norm": 0.5129480957984924,
"learning_rate": 0.00041044690781796964,
"loss": 3.299,
"step": 54250
},
{
"epoch": 15.830903790087463,
"grad_norm": 0.37039506435394287,
"learning_rate": 0.000410271878646441,
"loss": 3.3087,
"step": 54300
},
{
"epoch": 15.845481049562682,
"grad_norm": 0.3602822721004486,
"learning_rate": 0.0004100968494749125,
"loss": 3.3002,
"step": 54350
},
{
"epoch": 15.860058309037901,
"grad_norm": 0.3565203547477722,
"learning_rate": 0.00040992182030338386,
"loss": 3.3034,
"step": 54400
},
{
"epoch": 15.874635568513119,
"grad_norm": 0.3605969548225403,
"learning_rate": 0.0004097467911318553,
"loss": 3.3129,
"step": 54450
},
{
"epoch": 15.889212827988338,
"grad_norm": 0.3564240038394928,
"learning_rate": 0.00040957176196032666,
"loss": 3.308,
"step": 54500
},
{
"epoch": 15.903790087463557,
"grad_norm": 0.3524434268474579,
"learning_rate": 0.00040939673278879814,
"loss": 3.3183,
"step": 54550
},
{
"epoch": 15.918367346938776,
"grad_norm": 0.34038567543029785,
"learning_rate": 0.0004092217036172695,
"loss": 3.3084,
"step": 54600
},
{
"epoch": 15.932944606413994,
"grad_norm": 0.3590739965438843,
"learning_rate": 0.0004090466744457409,
"loss": 3.3132,
"step": 54650
},
{
"epoch": 15.947521865889213,
"grad_norm": 0.3675514757633209,
"learning_rate": 0.0004088716452742123,
"loss": 3.3113,
"step": 54700
},
{
"epoch": 15.962099125364432,
"grad_norm": 0.3558247983455658,
"learning_rate": 0.00040869661610268374,
"loss": 3.3172,
"step": 54750
},
{
"epoch": 15.97667638483965,
"grad_norm": 0.35684889554977417,
"learning_rate": 0.00040852158693115517,
"loss": 3.3083,
"step": 54800
},
{
"epoch": 15.991253644314869,
"grad_norm": 0.3759230375289917,
"learning_rate": 0.00040834655775962654,
"loss": 3.3054,
"step": 54850
},
{
"epoch": 16.005830903790088,
"grad_norm": 0.37320634722709656,
"learning_rate": 0.000408171528588098,
"loss": 3.2709,
"step": 54900
},
{
"epoch": 16.020408163265305,
"grad_norm": 0.3853946626186371,
"learning_rate": 0.0004079964994165694,
"loss": 3.2058,
"step": 54950
},
{
"epoch": 16.034985422740526,
"grad_norm": 0.40219175815582275,
"learning_rate": 0.00040782147024504077,
"loss": 3.1955,
"step": 55000
},
{
"epoch": 16.034985422740526,
"eval_accuracy": 0.37107209721073814,
"eval_loss": 3.5545384883880615,
"eval_runtime": 181.9202,
"eval_samples_per_second": 91.381,
"eval_steps_per_second": 5.711,
"step": 55000
},
{
"epoch": 16.049562682215743,
"grad_norm": 0.38274168968200684,
"learning_rate": 0.0004076464410735122,
"loss": 3.2219,
"step": 55050
},
{
"epoch": 16.06413994169096,
"grad_norm": 0.3740251660346985,
"learning_rate": 0.0004074714119019836,
"loss": 3.2159,
"step": 55100
},
{
"epoch": 16.07871720116618,
"grad_norm": 0.37743881344795227,
"learning_rate": 0.00040729638273045505,
"loss": 3.2209,
"step": 55150
},
{
"epoch": 16.0932944606414,
"grad_norm": 0.40411314368247986,
"learning_rate": 0.0004071213535589264,
"loss": 3.2181,
"step": 55200
},
{
"epoch": 16.107871720116616,
"grad_norm": 0.3726896345615387,
"learning_rate": 0.0004069463243873979,
"loss": 3.2248,
"step": 55250
},
{
"epoch": 16.122448979591837,
"grad_norm": 0.3628726899623871,
"learning_rate": 0.0004067712952158693,
"loss": 3.2261,
"step": 55300
},
{
"epoch": 16.137026239067055,
"grad_norm": 0.35304582118988037,
"learning_rate": 0.0004065962660443407,
"loss": 3.2226,
"step": 55350
},
{
"epoch": 16.151603498542276,
"grad_norm": 0.34734106063842773,
"learning_rate": 0.00040642123687281207,
"loss": 3.2242,
"step": 55400
},
{
"epoch": 16.166180758017493,
"grad_norm": 0.38421186804771423,
"learning_rate": 0.00040624620770128355,
"loss": 3.2378,
"step": 55450
},
{
"epoch": 16.18075801749271,
"grad_norm": 0.3777143955230713,
"learning_rate": 0.0004060711785297549,
"loss": 3.2481,
"step": 55500
},
{
"epoch": 16.19533527696793,
"grad_norm": 0.3493998944759369,
"learning_rate": 0.0004058961493582263,
"loss": 3.239,
"step": 55550
},
{
"epoch": 16.20991253644315,
"grad_norm": 0.39447450637817383,
"learning_rate": 0.0004057211201866978,
"loss": 3.2466,
"step": 55600
},
{
"epoch": 16.224489795918366,
"grad_norm": 0.3562599718570709,
"learning_rate": 0.00040554609101516915,
"loss": 3.237,
"step": 55650
},
{
"epoch": 16.239067055393587,
"grad_norm": 0.38712114095687866,
"learning_rate": 0.0004053710618436406,
"loss": 3.2464,
"step": 55700
},
{
"epoch": 16.253644314868804,
"grad_norm": 0.34968268871307373,
"learning_rate": 0.00040519603267211195,
"loss": 3.2474,
"step": 55750
},
{
"epoch": 16.26822157434402,
"grad_norm": 0.3648940920829773,
"learning_rate": 0.00040502100350058343,
"loss": 3.2563,
"step": 55800
},
{
"epoch": 16.282798833819243,
"grad_norm": 0.378246933221817,
"learning_rate": 0.0004048459743290548,
"loss": 3.2449,
"step": 55850
},
{
"epoch": 16.29737609329446,
"grad_norm": 0.3541260063648224,
"learning_rate": 0.0004046709451575262,
"loss": 3.2452,
"step": 55900
},
{
"epoch": 16.31195335276968,
"grad_norm": 0.3614341914653778,
"learning_rate": 0.0004044959159859976,
"loss": 3.2649,
"step": 55950
},
{
"epoch": 16.3265306122449,
"grad_norm": 0.35265859961509705,
"learning_rate": 0.00040432088681446903,
"loss": 3.2497,
"step": 56000
},
{
"epoch": 16.3265306122449,
"eval_accuracy": 0.37080417061414994,
"eval_loss": 3.555210590362549,
"eval_runtime": 182.0266,
"eval_samples_per_second": 91.327,
"eval_steps_per_second": 5.708,
"step": 56000
},
{
"epoch": 16.341107871720116,
"grad_norm": 0.37136510014533997,
"learning_rate": 0.00040414585764294046,
"loss": 3.2598,
"step": 56050
},
{
"epoch": 16.355685131195337,
"grad_norm": 0.3642927408218384,
"learning_rate": 0.00040397082847141183,
"loss": 3.2527,
"step": 56100
},
{
"epoch": 16.370262390670554,
"grad_norm": 0.41137054562568665,
"learning_rate": 0.0004037957992998833,
"loss": 3.2589,
"step": 56150
},
{
"epoch": 16.38483965014577,
"grad_norm": 0.3674943447113037,
"learning_rate": 0.0004036207701283547,
"loss": 3.2586,
"step": 56200
},
{
"epoch": 16.399416909620992,
"grad_norm": 0.3654170334339142,
"learning_rate": 0.0004034457409568261,
"loss": 3.2585,
"step": 56250
},
{
"epoch": 16.41399416909621,
"grad_norm": 0.4025888741016388,
"learning_rate": 0.0004032707117852975,
"loss": 3.2654,
"step": 56300
},
{
"epoch": 16.428571428571427,
"grad_norm": 0.395749568939209,
"learning_rate": 0.00040309568261376897,
"loss": 3.2618,
"step": 56350
},
{
"epoch": 16.443148688046648,
"grad_norm": 0.37377673387527466,
"learning_rate": 0.00040292065344224034,
"loss": 3.2805,
"step": 56400
},
{
"epoch": 16.457725947521865,
"grad_norm": 0.34729164838790894,
"learning_rate": 0.0004027456242707117,
"loss": 3.2746,
"step": 56450
},
{
"epoch": 16.472303206997086,
"grad_norm": 0.39409855008125305,
"learning_rate": 0.0004025705950991832,
"loss": 3.2704,
"step": 56500
},
{
"epoch": 16.486880466472304,
"grad_norm": 0.38538074493408203,
"learning_rate": 0.00040239556592765456,
"loss": 3.2669,
"step": 56550
},
{
"epoch": 16.50145772594752,
"grad_norm": 0.34272196888923645,
"learning_rate": 0.000402220536756126,
"loss": 3.2761,
"step": 56600
},
{
"epoch": 16.516034985422742,
"grad_norm": 0.34145087003707886,
"learning_rate": 0.00040204550758459736,
"loss": 3.2916,
"step": 56650
},
{
"epoch": 16.53061224489796,
"grad_norm": 0.3740696907043457,
"learning_rate": 0.00040187047841306884,
"loss": 3.2784,
"step": 56700
},
{
"epoch": 16.545189504373177,
"grad_norm": 0.34579378366470337,
"learning_rate": 0.0004016954492415402,
"loss": 3.2782,
"step": 56750
},
{
"epoch": 16.559766763848398,
"grad_norm": 0.3800579011440277,
"learning_rate": 0.00040152042007001164,
"loss": 3.2874,
"step": 56800
},
{
"epoch": 16.574344023323615,
"grad_norm": 0.3693027198314667,
"learning_rate": 0.00040134539089848307,
"loss": 3.2819,
"step": 56850
},
{
"epoch": 16.588921282798832,
"grad_norm": 0.3319258987903595,
"learning_rate": 0.00040117036172695444,
"loss": 3.2907,
"step": 56900
},
{
"epoch": 16.603498542274053,
"grad_norm": 0.3513771593570709,
"learning_rate": 0.00040099533255542587,
"loss": 3.2793,
"step": 56950
},
{
"epoch": 16.61807580174927,
"grad_norm": 0.37214353680610657,
"learning_rate": 0.00040082030338389724,
"loss": 3.2795,
"step": 57000
},
{
"epoch": 16.61807580174927,
"eval_accuracy": 0.37194886227725366,
"eval_loss": 3.544847249984741,
"eval_runtime": 181.8259,
"eval_samples_per_second": 91.428,
"eval_steps_per_second": 5.714,
"step": 57000
},
{
"epoch": 16.632653061224488,
"grad_norm": 0.37557870149612427,
"learning_rate": 0.0004006452742123687,
"loss": 3.2815,
"step": 57050
},
{
"epoch": 16.64723032069971,
"grad_norm": 0.3824288547039032,
"learning_rate": 0.0004004702450408401,
"loss": 3.2786,
"step": 57100
},
{
"epoch": 16.661807580174926,
"grad_norm": 0.4045317769050598,
"learning_rate": 0.0004002952158693115,
"loss": 3.2844,
"step": 57150
},
{
"epoch": 16.676384839650147,
"grad_norm": 0.36842742562294006,
"learning_rate": 0.0004001201866977829,
"loss": 3.2794,
"step": 57200
},
{
"epoch": 16.690962099125365,
"grad_norm": 0.35082441568374634,
"learning_rate": 0.0003999451575262544,
"loss": 3.2956,
"step": 57250
},
{
"epoch": 16.705539358600582,
"grad_norm": 0.3607030510902405,
"learning_rate": 0.00039977012835472575,
"loss": 3.2963,
"step": 57300
},
{
"epoch": 16.720116618075803,
"grad_norm": 0.3500318229198456,
"learning_rate": 0.0003995950991831971,
"loss": 3.2832,
"step": 57350
},
{
"epoch": 16.73469387755102,
"grad_norm": 0.3622967004776001,
"learning_rate": 0.0003994200700116686,
"loss": 3.2963,
"step": 57400
},
{
"epoch": 16.749271137026238,
"grad_norm": 0.3940633237361908,
"learning_rate": 0.00039924504084014,
"loss": 3.2942,
"step": 57450
},
{
"epoch": 16.76384839650146,
"grad_norm": 0.3878549635410309,
"learning_rate": 0.0003990700116686114,
"loss": 3.2843,
"step": 57500
},
{
"epoch": 16.778425655976676,
"grad_norm": 0.34530991315841675,
"learning_rate": 0.0003988949824970828,
"loss": 3.299,
"step": 57550
},
{
"epoch": 16.793002915451893,
"grad_norm": 0.3695226013660431,
"learning_rate": 0.00039871995332555426,
"loss": 3.2941,
"step": 57600
},
{
"epoch": 16.807580174927114,
"grad_norm": 0.3550872802734375,
"learning_rate": 0.00039854492415402563,
"loss": 3.2898,
"step": 57650
},
{
"epoch": 16.82215743440233,
"grad_norm": 0.3620699346065521,
"learning_rate": 0.00039836989498249706,
"loss": 3.3027,
"step": 57700
},
{
"epoch": 16.836734693877553,
"grad_norm": 0.36660054326057434,
"learning_rate": 0.0003981948658109685,
"loss": 3.3013,
"step": 57750
},
{
"epoch": 16.85131195335277,
"grad_norm": 0.37367725372314453,
"learning_rate": 0.00039801983663943985,
"loss": 3.2955,
"step": 57800
},
{
"epoch": 16.865889212827987,
"grad_norm": 0.35791313648223877,
"learning_rate": 0.0003978448074679113,
"loss": 3.3011,
"step": 57850
},
{
"epoch": 16.88046647230321,
"grad_norm": 0.36037060618400574,
"learning_rate": 0.00039766977829638265,
"loss": 3.309,
"step": 57900
},
{
"epoch": 16.895043731778426,
"grad_norm": 0.35850104689598083,
"learning_rate": 0.00039749474912485414,
"loss": 3.295,
"step": 57950
},
{
"epoch": 16.909620991253643,
"grad_norm": 0.3768969178199768,
"learning_rate": 0.0003973197199533255,
"loss": 3.2888,
"step": 58000
},
{
"epoch": 16.909620991253643,
"eval_accuracy": 0.37238277151935567,
"eval_loss": 3.539978265762329,
"eval_runtime": 181.464,
"eval_samples_per_second": 91.61,
"eval_steps_per_second": 5.726,
"step": 58000
},
{
"epoch": 16.924198250728864,
"grad_norm": 0.3688867688179016,
"learning_rate": 0.00039714469078179693,
"loss": 3.3078,
"step": 58050
},
{
"epoch": 16.93877551020408,
"grad_norm": 0.4198916554450989,
"learning_rate": 0.00039696966161026836,
"loss": 3.3041,
"step": 58100
},
{
"epoch": 16.9533527696793,
"grad_norm": 0.3668155372142792,
"learning_rate": 0.0003967946324387398,
"loss": 3.3077,
"step": 58150
},
{
"epoch": 16.96793002915452,
"grad_norm": 0.359727680683136,
"learning_rate": 0.00039661960326721116,
"loss": 3.3038,
"step": 58200
},
{
"epoch": 16.982507288629737,
"grad_norm": 0.34617719054222107,
"learning_rate": 0.00039644457409568253,
"loss": 3.2923,
"step": 58250
},
{
"epoch": 16.997084548104958,
"grad_norm": 0.34785595536231995,
"learning_rate": 0.000396269544924154,
"loss": 3.3034,
"step": 58300
},
{
"epoch": 17.011661807580175,
"grad_norm": 0.3586324453353882,
"learning_rate": 0.0003960945157526254,
"loss": 3.202,
"step": 58350
},
{
"epoch": 17.026239067055393,
"grad_norm": 0.4087458848953247,
"learning_rate": 0.0003959194865810968,
"loss": 3.2051,
"step": 58400
},
{
"epoch": 17.040816326530614,
"grad_norm": 0.3719448745250702,
"learning_rate": 0.0003957444574095682,
"loss": 3.1944,
"step": 58450
},
{
"epoch": 17.05539358600583,
"grad_norm": 0.3907237648963928,
"learning_rate": 0.00039556942823803967,
"loss": 3.2072,
"step": 58500
},
{
"epoch": 17.06997084548105,
"grad_norm": 0.3574305772781372,
"learning_rate": 0.00039539439906651104,
"loss": 3.216,
"step": 58550
},
{
"epoch": 17.08454810495627,
"grad_norm": 0.3930377960205078,
"learning_rate": 0.00039521936989498247,
"loss": 3.195,
"step": 58600
},
{
"epoch": 17.099125364431487,
"grad_norm": 0.38136032223701477,
"learning_rate": 0.0003950443407234539,
"loss": 3.2201,
"step": 58650
},
{
"epoch": 17.113702623906704,
"grad_norm": 0.42671507596969604,
"learning_rate": 0.0003948693115519253,
"loss": 3.2237,
"step": 58700
},
{
"epoch": 17.128279883381925,
"grad_norm": 0.3523384928703308,
"learning_rate": 0.0003946942823803967,
"loss": 3.2151,
"step": 58750
},
{
"epoch": 17.142857142857142,
"grad_norm": 0.38585031032562256,
"learning_rate": 0.00039451925320886807,
"loss": 3.2146,
"step": 58800
},
{
"epoch": 17.157434402332363,
"grad_norm": 0.35892701148986816,
"learning_rate": 0.00039434422403733955,
"loss": 3.2247,
"step": 58850
},
{
"epoch": 17.17201166180758,
"grad_norm": 0.3723486363887787,
"learning_rate": 0.0003941691948658109,
"loss": 3.2318,
"step": 58900
},
{
"epoch": 17.186588921282798,
"grad_norm": 0.3572362959384918,
"learning_rate": 0.00039399416569428235,
"loss": 3.2172,
"step": 58950
},
{
"epoch": 17.20116618075802,
"grad_norm": 0.37814998626708984,
"learning_rate": 0.0003938191365227538,
"loss": 3.2231,
"step": 59000
},
{
"epoch": 17.20116618075802,
"eval_accuracy": 0.3713283697066839,
"eval_loss": 3.558633804321289,
"eval_runtime": 181.6477,
"eval_samples_per_second": 91.518,
"eval_steps_per_second": 5.72,
"step": 59000
},
{
"epoch": 17.215743440233236,
"grad_norm": 0.3887748122215271,
"learning_rate": 0.0003936441073512252,
"loss": 3.24,
"step": 59050
},
{
"epoch": 17.230320699708454,
"grad_norm": 0.39903128147125244,
"learning_rate": 0.00039346907817969657,
"loss": 3.2365,
"step": 59100
},
{
"epoch": 17.244897959183675,
"grad_norm": 0.3754963278770447,
"learning_rate": 0.00039329404900816795,
"loss": 3.234,
"step": 59150
},
{
"epoch": 17.259475218658892,
"grad_norm": 0.37241125106811523,
"learning_rate": 0.0003931190198366394,
"loss": 3.2384,
"step": 59200
},
{
"epoch": 17.27405247813411,
"grad_norm": 0.36812448501586914,
"learning_rate": 0.0003929439906651108,
"loss": 3.2294,
"step": 59250
},
{
"epoch": 17.28862973760933,
"grad_norm": 0.3592478632926941,
"learning_rate": 0.0003927689614935822,
"loss": 3.2397,
"step": 59300
},
{
"epoch": 17.303206997084548,
"grad_norm": 0.367058664560318,
"learning_rate": 0.00039259393232205365,
"loss": 3.2444,
"step": 59350
},
{
"epoch": 17.317784256559765,
"grad_norm": 0.3607527017593384,
"learning_rate": 0.0003924189031505251,
"loss": 3.2395,
"step": 59400
},
{
"epoch": 17.332361516034986,
"grad_norm": 0.3962002694606781,
"learning_rate": 0.00039224387397899645,
"loss": 3.2428,
"step": 59450
},
{
"epoch": 17.346938775510203,
"grad_norm": 0.37258464097976685,
"learning_rate": 0.0003920688448074679,
"loss": 3.2478,
"step": 59500
},
{
"epoch": 17.361516034985424,
"grad_norm": 0.4007439613342285,
"learning_rate": 0.0003918938156359393,
"loss": 3.2549,
"step": 59550
},
{
"epoch": 17.37609329446064,
"grad_norm": 0.40145668387413025,
"learning_rate": 0.00039171878646441073,
"loss": 3.2573,
"step": 59600
},
{
"epoch": 17.39067055393586,
"grad_norm": 0.3657708764076233,
"learning_rate": 0.0003915437572928821,
"loss": 3.2602,
"step": 59650
},
{
"epoch": 17.40524781341108,
"grad_norm": 0.34860551357269287,
"learning_rate": 0.0003913687281213535,
"loss": 3.2522,
"step": 59700
},
{
"epoch": 17.419825072886297,
"grad_norm": 0.36920854449272156,
"learning_rate": 0.00039119369894982496,
"loss": 3.2609,
"step": 59750
},
{
"epoch": 17.434402332361515,
"grad_norm": 0.33922135829925537,
"learning_rate": 0.00039101866977829633,
"loss": 3.2541,
"step": 59800
},
{
"epoch": 17.448979591836736,
"grad_norm": 0.37371137738227844,
"learning_rate": 0.00039084364060676776,
"loss": 3.2537,
"step": 59850
},
{
"epoch": 17.463556851311953,
"grad_norm": 0.37898826599121094,
"learning_rate": 0.0003906686114352392,
"loss": 3.2568,
"step": 59900
},
{
"epoch": 17.47813411078717,
"grad_norm": 0.37351587414741516,
"learning_rate": 0.0003904935822637106,
"loss": 3.2739,
"step": 59950
},
{
"epoch": 17.49271137026239,
"grad_norm": 0.3553948402404785,
"learning_rate": 0.000390318553092182,
"loss": 3.2615,
"step": 60000
},
{
"epoch": 17.49271137026239,
"eval_accuracy": 0.3715660427288771,
"eval_loss": 3.5488474369049072,
"eval_runtime": 181.4626,
"eval_samples_per_second": 91.611,
"eval_steps_per_second": 5.726,
"step": 60000
},
{
"epoch": 17.50728862973761,
"grad_norm": 0.3778980076313019,
"learning_rate": 0.00039014352392065336,
"loss": 3.2639,
"step": 60050
},
{
"epoch": 17.52186588921283,
"grad_norm": 0.3963741958141327,
"learning_rate": 0.00038996849474912484,
"loss": 3.2614,
"step": 60100
},
{
"epoch": 17.536443148688047,
"grad_norm": 0.3608973026275635,
"learning_rate": 0.0003897934655775962,
"loss": 3.2699,
"step": 60150
},
{
"epoch": 17.551020408163264,
"grad_norm": 0.3675440549850464,
"learning_rate": 0.00038961843640606764,
"loss": 3.2676,
"step": 60200
},
{
"epoch": 17.565597667638485,
"grad_norm": 0.36255237460136414,
"learning_rate": 0.00038944340723453906,
"loss": 3.2672,
"step": 60250
},
{
"epoch": 17.580174927113703,
"grad_norm": 0.38902583718299866,
"learning_rate": 0.0003892683780630105,
"loss": 3.2571,
"step": 60300
},
{
"epoch": 17.59475218658892,
"grad_norm": 0.38929134607315063,
"learning_rate": 0.00038909334889148186,
"loss": 3.2739,
"step": 60350
},
{
"epoch": 17.60932944606414,
"grad_norm": 0.3460721969604492,
"learning_rate": 0.0003889183197199533,
"loss": 3.2634,
"step": 60400
},
{
"epoch": 17.62390670553936,
"grad_norm": 0.3623656928539276,
"learning_rate": 0.0003887432905484247,
"loss": 3.2716,
"step": 60450
},
{
"epoch": 17.638483965014576,
"grad_norm": 0.38560980558395386,
"learning_rate": 0.00038856826137689614,
"loss": 3.2748,
"step": 60500
},
{
"epoch": 17.653061224489797,
"grad_norm": 0.39393141865730286,
"learning_rate": 0.0003883932322053675,
"loss": 3.2795,
"step": 60550
},
{
"epoch": 17.667638483965014,
"grad_norm": 0.3850944936275482,
"learning_rate": 0.00038821820303383894,
"loss": 3.2676,
"step": 60600
},
{
"epoch": 17.682215743440235,
"grad_norm": 0.3653624653816223,
"learning_rate": 0.00038804317386231037,
"loss": 3.2848,
"step": 60650
},
{
"epoch": 17.696793002915452,
"grad_norm": 0.35563892126083374,
"learning_rate": 0.00038786814469078174,
"loss": 3.2879,
"step": 60700
},
{
"epoch": 17.71137026239067,
"grad_norm": 0.38104385137557983,
"learning_rate": 0.00038769311551925317,
"loss": 3.2737,
"step": 60750
},
{
"epoch": 17.72594752186589,
"grad_norm": 0.3962397873401642,
"learning_rate": 0.0003875180863477246,
"loss": 3.2767,
"step": 60800
},
{
"epoch": 17.740524781341108,
"grad_norm": 0.3726269006729126,
"learning_rate": 0.000387343057176196,
"loss": 3.2919,
"step": 60850
},
{
"epoch": 17.755102040816325,
"grad_norm": 0.3684369623661041,
"learning_rate": 0.0003871680280046674,
"loss": 3.2791,
"step": 60900
},
{
"epoch": 17.769679300291546,
"grad_norm": 0.39243683218955994,
"learning_rate": 0.00038699299883313877,
"loss": 3.2804,
"step": 60950
},
{
"epoch": 17.784256559766764,
"grad_norm": 0.349883109331131,
"learning_rate": 0.00038681796966161025,
"loss": 3.272,
"step": 61000
},
{
"epoch": 17.784256559766764,
"eval_accuracy": 0.37223009102912064,
"eval_loss": 3.541855812072754,
"eval_runtime": 181.727,
"eval_samples_per_second": 91.478,
"eval_steps_per_second": 5.717,
"step": 61000
},
{
"epoch": 17.79883381924198,
"grad_norm": 0.35422345995903015,
"learning_rate": 0.0003866429404900816,
"loss": 3.2824,
"step": 61050
},
{
"epoch": 17.813411078717202,
"grad_norm": 0.37883347272872925,
"learning_rate": 0.00038646791131855305,
"loss": 3.2653,
"step": 61100
},
{
"epoch": 17.82798833819242,
"grad_norm": 0.3556387722492218,
"learning_rate": 0.0003862928821470245,
"loss": 3.2756,
"step": 61150
},
{
"epoch": 17.842565597667637,
"grad_norm": 0.37966683506965637,
"learning_rate": 0.0003861178529754959,
"loss": 3.2827,
"step": 61200
},
{
"epoch": 17.857142857142858,
"grad_norm": 0.3499722480773926,
"learning_rate": 0.0003859428238039673,
"loss": 3.2842,
"step": 61250
},
{
"epoch": 17.871720116618075,
"grad_norm": 0.36272355914115906,
"learning_rate": 0.0003857677946324387,
"loss": 3.271,
"step": 61300
},
{
"epoch": 17.886297376093296,
"grad_norm": 0.3606753647327423,
"learning_rate": 0.00038559276546091013,
"loss": 3.2813,
"step": 61350
},
{
"epoch": 17.900874635568513,
"grad_norm": 0.37976881861686707,
"learning_rate": 0.00038541773628938156,
"loss": 3.2749,
"step": 61400
},
{
"epoch": 17.91545189504373,
"grad_norm": 0.34956464171409607,
"learning_rate": 0.00038524270711785293,
"loss": 3.2873,
"step": 61450
},
{
"epoch": 17.93002915451895,
"grad_norm": 0.38344627618789673,
"learning_rate": 0.0003850676779463244,
"loss": 3.2827,
"step": 61500
},
{
"epoch": 17.94460641399417,
"grad_norm": 0.3596233129501343,
"learning_rate": 0.0003848926487747958,
"loss": 3.2865,
"step": 61550
},
{
"epoch": 17.959183673469386,
"grad_norm": 0.367553174495697,
"learning_rate": 0.00038471761960326715,
"loss": 3.2868,
"step": 61600
},
{
"epoch": 17.973760932944607,
"grad_norm": 0.35922616720199585,
"learning_rate": 0.0003845425904317386,
"loss": 3.2856,
"step": 61650
},
{
"epoch": 17.988338192419825,
"grad_norm": 0.370110422372818,
"learning_rate": 0.00038436756126021,
"loss": 3.291,
"step": 61700
},
{
"epoch": 18.002915451895042,
"grad_norm": 0.6661171913146973,
"learning_rate": 0.00038419253208868143,
"loss": 3.2759,
"step": 61750
},
{
"epoch": 18.017492711370263,
"grad_norm": 0.3718918561935425,
"learning_rate": 0.0003840175029171528,
"loss": 3.1828,
"step": 61800
},
{
"epoch": 18.03206997084548,
"grad_norm": 0.39824941754341125,
"learning_rate": 0.0003838424737456243,
"loss": 3.1886,
"step": 61850
},
{
"epoch": 18.0466472303207,
"grad_norm": 0.389596164226532,
"learning_rate": 0.00038366744457409566,
"loss": 3.1964,
"step": 61900
},
{
"epoch": 18.06122448979592,
"grad_norm": 0.3624783158302307,
"learning_rate": 0.00038349241540256703,
"loss": 3.1964,
"step": 61950
},
{
"epoch": 18.075801749271136,
"grad_norm": 0.3595573902130127,
"learning_rate": 0.00038331738623103846,
"loss": 3.203,
"step": 62000
},
{
"epoch": 18.075801749271136,
"eval_accuracy": 0.3718941233196906,
"eval_loss": 3.557060480117798,
"eval_runtime": 181.5132,
"eval_samples_per_second": 91.586,
"eval_steps_per_second": 5.724,
"step": 62000
},
{
"epoch": 18.090379008746357,
"grad_norm": 0.3661964535713196,
"learning_rate": 0.0003831423570595099,
"loss": 3.1916,
"step": 62050
},
{
"epoch": 18.104956268221574,
"grad_norm": 0.43816515803337097,
"learning_rate": 0.0003829673278879813,
"loss": 3.1928,
"step": 62100
},
{
"epoch": 18.11953352769679,
"grad_norm": 0.38191908597946167,
"learning_rate": 0.0003827922987164527,
"loss": 3.213,
"step": 62150
},
{
"epoch": 18.134110787172013,
"grad_norm": 0.3885975778102875,
"learning_rate": 0.0003826172695449241,
"loss": 3.2124,
"step": 62200
},
{
"epoch": 18.14868804664723,
"grad_norm": 0.4196251928806305,
"learning_rate": 0.00038244224037339554,
"loss": 3.2082,
"step": 62250
},
{
"epoch": 18.163265306122447,
"grad_norm": 0.38539016246795654,
"learning_rate": 0.00038226721120186697,
"loss": 3.2086,
"step": 62300
},
{
"epoch": 18.17784256559767,
"grad_norm": 0.38454657793045044,
"learning_rate": 0.00038209218203033834,
"loss": 3.2176,
"step": 62350
},
{
"epoch": 18.192419825072886,
"grad_norm": 0.4121674597263336,
"learning_rate": 0.0003819171528588098,
"loss": 3.2164,
"step": 62400
},
{
"epoch": 18.206997084548107,
"grad_norm": 0.3765960931777954,
"learning_rate": 0.0003817421236872812,
"loss": 3.2222,
"step": 62450
},
{
"epoch": 18.221574344023324,
"grad_norm": 0.3664701581001282,
"learning_rate": 0.00038156709451575257,
"loss": 3.2252,
"step": 62500
},
{
"epoch": 18.23615160349854,
"grad_norm": 0.36911651492118835,
"learning_rate": 0.000381392065344224,
"loss": 3.2341,
"step": 62550
},
{
"epoch": 18.250728862973762,
"grad_norm": 0.393379807472229,
"learning_rate": 0.0003812170361726954,
"loss": 3.2249,
"step": 62600
},
{
"epoch": 18.26530612244898,
"grad_norm": 0.3650977611541748,
"learning_rate": 0.00038104200700116685,
"loss": 3.2255,
"step": 62650
},
{
"epoch": 18.279883381924197,
"grad_norm": 0.3733424246311188,
"learning_rate": 0.0003808669778296382,
"loss": 3.2305,
"step": 62700
},
{
"epoch": 18.294460641399418,
"grad_norm": 0.3693566620349884,
"learning_rate": 0.0003806919486581097,
"loss": 3.2214,
"step": 62750
},
{
"epoch": 18.309037900874635,
"grad_norm": 0.3484310805797577,
"learning_rate": 0.00038051691948658107,
"loss": 3.2257,
"step": 62800
},
{
"epoch": 18.323615160349853,
"grad_norm": 0.3896331191062927,
"learning_rate": 0.00038034189031505245,
"loss": 3.2346,
"step": 62850
},
{
"epoch": 18.338192419825074,
"grad_norm": 0.3532673716545105,
"learning_rate": 0.00038016686114352387,
"loss": 3.2408,
"step": 62900
},
{
"epoch": 18.35276967930029,
"grad_norm": 0.3979147672653198,
"learning_rate": 0.0003799918319719953,
"loss": 3.2279,
"step": 62950
},
{
"epoch": 18.367346938775512,
"grad_norm": 0.3705503046512604,
"learning_rate": 0.0003798168028004667,
"loss": 3.2376,
"step": 63000
},
{
"epoch": 18.367346938775512,
"eval_accuracy": 0.37182149119750474,
"eval_loss": 3.552273750305176,
"eval_runtime": 181.406,
"eval_samples_per_second": 91.64,
"eval_steps_per_second": 5.727,
"step": 63000
},
{
"epoch": 18.38192419825073,
"grad_norm": 0.38964420557022095,
"learning_rate": 0.0003796417736289381,
"loss": 3.228,
"step": 63050
},
{
"epoch": 18.396501457725947,
"grad_norm": 0.37392449378967285,
"learning_rate": 0.0003794667444574096,
"loss": 3.2566,
"step": 63100
},
{
"epoch": 18.411078717201168,
"grad_norm": 0.3898674547672272,
"learning_rate": 0.00037929171528588095,
"loss": 3.2463,
"step": 63150
},
{
"epoch": 18.425655976676385,
"grad_norm": 0.37367600202560425,
"learning_rate": 0.0003791166861143524,
"loss": 3.2472,
"step": 63200
},
{
"epoch": 18.440233236151602,
"grad_norm": 0.3786541819572449,
"learning_rate": 0.00037894165694282375,
"loss": 3.243,
"step": 63250
},
{
"epoch": 18.454810495626823,
"grad_norm": 0.37856510281562805,
"learning_rate": 0.00037876662777129523,
"loss": 3.2575,
"step": 63300
},
{
"epoch": 18.46938775510204,
"grad_norm": 0.3948310911655426,
"learning_rate": 0.0003785915985997666,
"loss": 3.2487,
"step": 63350
},
{
"epoch": 18.483965014577258,
"grad_norm": 0.3751145303249359,
"learning_rate": 0.000378416569428238,
"loss": 3.2465,
"step": 63400
},
{
"epoch": 18.49854227405248,
"grad_norm": 0.36819157004356384,
"learning_rate": 0.0003782415402567094,
"loss": 3.2507,
"step": 63450
},
{
"epoch": 18.513119533527696,
"grad_norm": 0.35814929008483887,
"learning_rate": 0.00037806651108518083,
"loss": 3.26,
"step": 63500
},
{
"epoch": 18.527696793002917,
"grad_norm": 0.3531430959701538,
"learning_rate": 0.00037789148191365226,
"loss": 3.2505,
"step": 63550
},
{
"epoch": 18.542274052478135,
"grad_norm": 0.37971770763397217,
"learning_rate": 0.00037771645274212363,
"loss": 3.246,
"step": 63600
},
{
"epoch": 18.556851311953352,
"grad_norm": 0.37397658824920654,
"learning_rate": 0.0003775414235705951,
"loss": 3.2506,
"step": 63650
},
{
"epoch": 18.571428571428573,
"grad_norm": 0.35488125681877136,
"learning_rate": 0.0003773663943990665,
"loss": 3.266,
"step": 63700
},
{
"epoch": 18.58600583090379,
"grad_norm": 0.3492448925971985,
"learning_rate": 0.00037719136522753786,
"loss": 3.2565,
"step": 63750
},
{
"epoch": 18.600583090379008,
"grad_norm": 0.37872013449668884,
"learning_rate": 0.0003770163360560093,
"loss": 3.2577,
"step": 63800
},
{
"epoch": 18.61516034985423,
"grad_norm": 0.35813814401626587,
"learning_rate": 0.0003768413068844807,
"loss": 3.2374,
"step": 63850
},
{
"epoch": 18.629737609329446,
"grad_norm": 0.37753552198410034,
"learning_rate": 0.00037666627771295214,
"loss": 3.2499,
"step": 63900
},
{
"epoch": 18.644314868804663,
"grad_norm": 0.35299044847488403,
"learning_rate": 0.0003764912485414235,
"loss": 3.2535,
"step": 63950
},
{
"epoch": 18.658892128279884,
"grad_norm": 0.3655620217323303,
"learning_rate": 0.000376316219369895,
"loss": 3.2659,
"step": 64000
},
{
"epoch": 18.658892128279884,
"eval_accuracy": 0.3724969581620141,
"eval_loss": 3.543386936187744,
"eval_runtime": 181.6734,
"eval_samples_per_second": 91.505,
"eval_steps_per_second": 5.719,
"step": 64000
},
{
"epoch": 18.6734693877551,
"grad_norm": 0.358867883682251,
"learning_rate": 0.00037614119019836636,
"loss": 3.2715,
"step": 64050
},
{
"epoch": 18.68804664723032,
"grad_norm": 0.3794868588447571,
"learning_rate": 0.0003759661610268378,
"loss": 3.2644,
"step": 64100
},
{
"epoch": 18.70262390670554,
"grad_norm": 0.42781543731689453,
"learning_rate": 0.00037579113185530916,
"loss": 3.2612,
"step": 64150
},
{
"epoch": 18.717201166180757,
"grad_norm": 0.40130743384361267,
"learning_rate": 0.00037561610268378064,
"loss": 3.2794,
"step": 64200
},
{
"epoch": 18.73177842565598,
"grad_norm": 0.38567736744880676,
"learning_rate": 0.000375441073512252,
"loss": 3.2603,
"step": 64250
},
{
"epoch": 18.746355685131196,
"grad_norm": 0.3566703498363495,
"learning_rate": 0.0003752660443407234,
"loss": 3.2642,
"step": 64300
},
{
"epoch": 18.760932944606413,
"grad_norm": 0.3746073544025421,
"learning_rate": 0.00037509101516919487,
"loss": 3.264,
"step": 64350
},
{
"epoch": 18.775510204081634,
"grad_norm": 0.37497463822364807,
"learning_rate": 0.00037491598599766624,
"loss": 3.2568,
"step": 64400
},
{
"epoch": 18.79008746355685,
"grad_norm": 0.37049469351768494,
"learning_rate": 0.00037474095682613767,
"loss": 3.2802,
"step": 64450
},
{
"epoch": 18.80466472303207,
"grad_norm": 0.3565693795681,
"learning_rate": 0.00037456592765460904,
"loss": 3.2713,
"step": 64500
},
{
"epoch": 18.81924198250729,
"grad_norm": 0.36381709575653076,
"learning_rate": 0.0003743908984830805,
"loss": 3.2586,
"step": 64550
},
{
"epoch": 18.833819241982507,
"grad_norm": 0.34240925312042236,
"learning_rate": 0.0003742158693115519,
"loss": 3.2755,
"step": 64600
},
{
"epoch": 18.848396501457724,
"grad_norm": 0.3438854515552521,
"learning_rate": 0.00037404084014002327,
"loss": 3.2651,
"step": 64650
},
{
"epoch": 18.862973760932945,
"grad_norm": 0.3836745023727417,
"learning_rate": 0.0003738658109684947,
"loss": 3.259,
"step": 64700
},
{
"epoch": 18.877551020408163,
"grad_norm": 0.3804767429828644,
"learning_rate": 0.0003736907817969661,
"loss": 3.2655,
"step": 64750
},
{
"epoch": 18.892128279883384,
"grad_norm": 0.36783847212791443,
"learning_rate": 0.00037351575262543755,
"loss": 3.277,
"step": 64800
},
{
"epoch": 18.9067055393586,
"grad_norm": 0.3522402346134186,
"learning_rate": 0.0003733407234539089,
"loss": 3.2743,
"step": 64850
},
{
"epoch": 18.92128279883382,
"grad_norm": 0.3842095732688904,
"learning_rate": 0.0003731656942823804,
"loss": 3.2749,
"step": 64900
},
{
"epoch": 18.93586005830904,
"grad_norm": 0.34390994906425476,
"learning_rate": 0.0003729906651108518,
"loss": 3.2834,
"step": 64950
},
{
"epoch": 18.950437317784257,
"grad_norm": 0.352525919675827,
"learning_rate": 0.0003728156359393232,
"loss": 3.2758,
"step": 65000
},
{
"epoch": 18.950437317784257,
"eval_accuracy": 0.37309014011289643,
"eval_loss": 3.5348432064056396,
"eval_runtime": 181.6813,
"eval_samples_per_second": 91.501,
"eval_steps_per_second": 5.719,
"step": 65000
},
{
"epoch": 18.965014577259474,
"grad_norm": 0.36129817366600037,
"learning_rate": 0.0003726406067677946,
"loss": 3.2741,
"step": 65050
},
{
"epoch": 18.979591836734695,
"grad_norm": 0.36104297637939453,
"learning_rate": 0.00037246557759626606,
"loss": 3.2763,
"step": 65100
},
{
"epoch": 18.994169096209912,
"grad_norm": 0.3394487202167511,
"learning_rate": 0.00037229054842473743,
"loss": 3.2746,
"step": 65150
},
{
"epoch": 19.00874635568513,
"grad_norm": 0.3638574779033661,
"learning_rate": 0.0003721155192532088,
"loss": 3.2164,
"step": 65200
},
{
"epoch": 19.02332361516035,
"grad_norm": 0.41334033012390137,
"learning_rate": 0.0003719404900816803,
"loss": 3.1806,
"step": 65250
},
{
"epoch": 19.037900874635568,
"grad_norm": 0.34988388419151306,
"learning_rate": 0.00037176546091015165,
"loss": 3.1682,
"step": 65300
},
{
"epoch": 19.05247813411079,
"grad_norm": 0.3725140690803528,
"learning_rate": 0.0003715904317386231,
"loss": 3.1849,
"step": 65350
},
{
"epoch": 19.067055393586006,
"grad_norm": 0.37143728137016296,
"learning_rate": 0.00037141540256709445,
"loss": 3.1845,
"step": 65400
},
{
"epoch": 19.081632653061224,
"grad_norm": 0.42141345143318176,
"learning_rate": 0.00037124037339556593,
"loss": 3.1886,
"step": 65450
},
{
"epoch": 19.096209912536445,
"grad_norm": 0.3842184245586395,
"learning_rate": 0.0003710653442240373,
"loss": 3.1938,
"step": 65500
},
{
"epoch": 19.110787172011662,
"grad_norm": 0.3778875172138214,
"learning_rate": 0.0003708903150525087,
"loss": 3.1953,
"step": 65550
},
{
"epoch": 19.12536443148688,
"grad_norm": 0.39668551087379456,
"learning_rate": 0.00037071528588098016,
"loss": 3.1955,
"step": 65600
},
{
"epoch": 19.1399416909621,
"grad_norm": 0.4067428708076477,
"learning_rate": 0.00037054025670945153,
"loss": 3.1883,
"step": 65650
},
{
"epoch": 19.154518950437318,
"grad_norm": 0.3798586428165436,
"learning_rate": 0.00037036522753792296,
"loss": 3.1912,
"step": 65700
},
{
"epoch": 19.169096209912535,
"grad_norm": 0.37418290972709656,
"learning_rate": 0.00037019019836639433,
"loss": 3.2043,
"step": 65750
},
{
"epoch": 19.183673469387756,
"grad_norm": 0.36611396074295044,
"learning_rate": 0.0003700151691948658,
"loss": 3.189,
"step": 65800
},
{
"epoch": 19.198250728862973,
"grad_norm": 0.39174631237983704,
"learning_rate": 0.0003698401400233372,
"loss": 3.2102,
"step": 65850
},
{
"epoch": 19.21282798833819,
"grad_norm": 0.37841376662254333,
"learning_rate": 0.0003696651108518086,
"loss": 3.2186,
"step": 65900
},
{
"epoch": 19.22740524781341,
"grad_norm": 0.4026353657245636,
"learning_rate": 0.00036949008168028,
"loss": 3.1962,
"step": 65950
},
{
"epoch": 19.24198250728863,
"grad_norm": 0.36771753430366516,
"learning_rate": 0.00036931505250875147,
"loss": 3.2112,
"step": 66000
},
{
"epoch": 19.24198250728863,
"eval_accuracy": 0.3721091944497287,
"eval_loss": 3.553576707839966,
"eval_runtime": 181.571,
"eval_samples_per_second": 91.556,
"eval_steps_per_second": 5.722,
"step": 66000
},
{
"epoch": 19.25655976676385,
"grad_norm": 0.36355555057525635,
"learning_rate": 0.00036914002333722284,
"loss": 3.2189,
"step": 66050
},
{
"epoch": 19.271137026239067,
"grad_norm": 0.36505192518234253,
"learning_rate": 0.0003689649941656942,
"loss": 3.2245,
"step": 66100
},
{
"epoch": 19.285714285714285,
"grad_norm": 0.37705734372138977,
"learning_rate": 0.0003687899649941657,
"loss": 3.2141,
"step": 66150
},
{
"epoch": 19.300291545189506,
"grad_norm": 0.399637907743454,
"learning_rate": 0.00036861493582263707,
"loss": 3.2206,
"step": 66200
},
{
"epoch": 19.314868804664723,
"grad_norm": 0.3929356634616852,
"learning_rate": 0.0003684399066511085,
"loss": 3.2324,
"step": 66250
},
{
"epoch": 19.32944606413994,
"grad_norm": 0.37350767850875854,
"learning_rate": 0.00036826487747957987,
"loss": 3.2241,
"step": 66300
},
{
"epoch": 19.34402332361516,
"grad_norm": 0.36935678124427795,
"learning_rate": 0.00036808984830805135,
"loss": 3.2237,
"step": 66350
},
{
"epoch": 19.35860058309038,
"grad_norm": 0.38563114404678345,
"learning_rate": 0.0003679148191365227,
"loss": 3.2205,
"step": 66400
},
{
"epoch": 19.373177842565596,
"grad_norm": 0.36945411562919617,
"learning_rate": 0.0003677397899649941,
"loss": 3.2197,
"step": 66450
},
{
"epoch": 19.387755102040817,
"grad_norm": 0.3770820200443268,
"learning_rate": 0.00036756476079346557,
"loss": 3.2432,
"step": 66500
},
{
"epoch": 19.402332361516034,
"grad_norm": 0.3747752010822296,
"learning_rate": 0.00036738973162193695,
"loss": 3.2299,
"step": 66550
},
{
"epoch": 19.416909620991255,
"grad_norm": 0.38895103335380554,
"learning_rate": 0.00036721470245040837,
"loss": 3.2374,
"step": 66600
},
{
"epoch": 19.431486880466473,
"grad_norm": 0.44169071316719055,
"learning_rate": 0.00036703967327887974,
"loss": 3.2261,
"step": 66650
},
{
"epoch": 19.44606413994169,
"grad_norm": 0.4019821286201477,
"learning_rate": 0.0003668646441073512,
"loss": 3.242,
"step": 66700
},
{
"epoch": 19.46064139941691,
"grad_norm": 0.40160903334617615,
"learning_rate": 0.0003666896149358226,
"loss": 3.2431,
"step": 66750
},
{
"epoch": 19.47521865889213,
"grad_norm": 0.3881227970123291,
"learning_rate": 0.000366514585764294,
"loss": 3.229,
"step": 66800
},
{
"epoch": 19.489795918367346,
"grad_norm": 0.38421711325645447,
"learning_rate": 0.00036633955659276545,
"loss": 3.2255,
"step": 66850
},
{
"epoch": 19.504373177842567,
"grad_norm": 0.39073434472084045,
"learning_rate": 0.0003661645274212369,
"loss": 3.2359,
"step": 66900
},
{
"epoch": 19.518950437317784,
"grad_norm": 0.3665856122970581,
"learning_rate": 0.00036598949824970825,
"loss": 3.2408,
"step": 66950
},
{
"epoch": 19.533527696793,
"grad_norm": 0.40631458163261414,
"learning_rate": 0.0003658144690781796,
"loss": 3.2374,
"step": 67000
},
{
"epoch": 19.533527696793,
"eval_accuracy": 0.3723500458630062,
"eval_loss": 3.5477726459503174,
"eval_runtime": 181.6063,
"eval_samples_per_second": 91.539,
"eval_steps_per_second": 5.721,
"step": 67000
},
{
"epoch": 19.548104956268222,
"grad_norm": 0.382445365190506,
"learning_rate": 0.0003656394399066511,
"loss": 3.2436,
"step": 67050
},
{
"epoch": 19.56268221574344,
"grad_norm": 0.3691641390323639,
"learning_rate": 0.0003654644107351225,
"loss": 3.2403,
"step": 67100
},
{
"epoch": 19.57725947521866,
"grad_norm": 0.36600974202156067,
"learning_rate": 0.0003652893815635939,
"loss": 3.2469,
"step": 67150
},
{
"epoch": 19.591836734693878,
"grad_norm": 0.39847928285598755,
"learning_rate": 0.0003651143523920653,
"loss": 3.2514,
"step": 67200
},
{
"epoch": 19.606413994169095,
"grad_norm": 0.3576008975505829,
"learning_rate": 0.00036493932322053676,
"loss": 3.2444,
"step": 67250
},
{
"epoch": 19.620991253644316,
"grad_norm": 0.3533256947994232,
"learning_rate": 0.00036476429404900813,
"loss": 3.2448,
"step": 67300
},
{
"epoch": 19.635568513119534,
"grad_norm": 0.36868008971214294,
"learning_rate": 0.0003645892648774795,
"loss": 3.2503,
"step": 67350
},
{
"epoch": 19.65014577259475,
"grad_norm": 0.36897534132003784,
"learning_rate": 0.000364414235705951,
"loss": 3.2405,
"step": 67400
},
{
"epoch": 19.664723032069972,
"grad_norm": 0.4246874749660492,
"learning_rate": 0.00036423920653442236,
"loss": 3.2477,
"step": 67450
},
{
"epoch": 19.67930029154519,
"grad_norm": 0.38615843653678894,
"learning_rate": 0.0003640641773628938,
"loss": 3.2607,
"step": 67500
},
{
"epoch": 19.693877551020407,
"grad_norm": 0.3596295118331909,
"learning_rate": 0.00036388914819136516,
"loss": 3.2486,
"step": 67550
},
{
"epoch": 19.708454810495628,
"grad_norm": 0.4084990918636322,
"learning_rate": 0.00036371411901983664,
"loss": 3.2494,
"step": 67600
},
{
"epoch": 19.723032069970845,
"grad_norm": 0.41460683941841125,
"learning_rate": 0.000363539089848308,
"loss": 3.261,
"step": 67650
},
{
"epoch": 19.737609329446066,
"grad_norm": 0.372470498085022,
"learning_rate": 0.00036336406067677944,
"loss": 3.2581,
"step": 67700
},
{
"epoch": 19.752186588921283,
"grad_norm": 0.3677177131175995,
"learning_rate": 0.00036318903150525086,
"loss": 3.2554,
"step": 67750
},
{
"epoch": 19.7667638483965,
"grad_norm": 0.36630979180336,
"learning_rate": 0.0003630140023337223,
"loss": 3.2522,
"step": 67800
},
{
"epoch": 19.78134110787172,
"grad_norm": 0.3879360258579254,
"learning_rate": 0.00036283897316219366,
"loss": 3.2672,
"step": 67850
},
{
"epoch": 19.79591836734694,
"grad_norm": 0.39751380681991577,
"learning_rate": 0.00036266394399066504,
"loss": 3.2533,
"step": 67900
},
{
"epoch": 19.810495626822156,
"grad_norm": 0.38645872473716736,
"learning_rate": 0.0003624889148191365,
"loss": 3.2545,
"step": 67950
},
{
"epoch": 19.825072886297377,
"grad_norm": 0.3810601830482483,
"learning_rate": 0.0003623138856476079,
"loss": 3.2682,
"step": 68000
},
{
"epoch": 19.825072886297377,
"eval_accuracy": 0.3729859595162442,
"eval_loss": 3.536771059036255,
"eval_runtime": 181.5477,
"eval_samples_per_second": 91.568,
"eval_steps_per_second": 5.723,
"step": 68000
},
{
"epoch": 19.839650145772595,
"grad_norm": 0.36461886763572693,
"learning_rate": 0.0003621388564760793,
"loss": 3.2574,
"step": 68050
},
{
"epoch": 19.854227405247812,
"grad_norm": 0.3770570456981659,
"learning_rate": 0.00036196382730455074,
"loss": 3.2661,
"step": 68100
},
{
"epoch": 19.868804664723033,
"grad_norm": 0.3571155071258545,
"learning_rate": 0.00036178879813302217,
"loss": 3.2521,
"step": 68150
},
{
"epoch": 19.88338192419825,
"grad_norm": 0.4131433665752411,
"learning_rate": 0.00036161376896149354,
"loss": 3.2681,
"step": 68200
},
{
"epoch": 19.897959183673468,
"grad_norm": 0.385781466960907,
"learning_rate": 0.0003614387397899649,
"loss": 3.2668,
"step": 68250
},
{
"epoch": 19.91253644314869,
"grad_norm": 0.386671245098114,
"learning_rate": 0.0003612637106184364,
"loss": 3.2753,
"step": 68300
},
{
"epoch": 19.927113702623906,
"grad_norm": 0.3846534490585327,
"learning_rate": 0.00036108868144690777,
"loss": 3.2616,
"step": 68350
},
{
"epoch": 19.941690962099127,
"grad_norm": 0.38834303617477417,
"learning_rate": 0.0003609136522753792,
"loss": 3.2649,
"step": 68400
},
{
"epoch": 19.956268221574344,
"grad_norm": 0.3715721070766449,
"learning_rate": 0.00036073862310385057,
"loss": 3.2694,
"step": 68450
},
{
"epoch": 19.97084548104956,
"grad_norm": 0.35815873742103577,
"learning_rate": 0.00036056359393232205,
"loss": 3.2493,
"step": 68500
},
{
"epoch": 19.985422740524783,
"grad_norm": 0.41704729199409485,
"learning_rate": 0.0003603885647607934,
"loss": 3.2713,
"step": 68550
},
{
"epoch": 20.0,
"grad_norm": 0.39834514260292053,
"learning_rate": 0.00036021353558926485,
"loss": 3.274,
"step": 68600
},
{
"epoch": 20.014577259475217,
"grad_norm": 0.40443840622901917,
"learning_rate": 0.0003600385064177363,
"loss": 3.1637,
"step": 68650
},
{
"epoch": 20.02915451895044,
"grad_norm": 0.3989391624927521,
"learning_rate": 0.0003598634772462077,
"loss": 3.161,
"step": 68700
},
{
"epoch": 20.043731778425656,
"grad_norm": 0.3869283199310303,
"learning_rate": 0.0003596884480746791,
"loss": 3.1616,
"step": 68750
},
{
"epoch": 20.058309037900873,
"grad_norm": 0.3743650019168854,
"learning_rate": 0.00035951341890315045,
"loss": 3.1794,
"step": 68800
},
{
"epoch": 20.072886297376094,
"grad_norm": 0.3653443157672882,
"learning_rate": 0.00035933838973162193,
"loss": 3.1577,
"step": 68850
},
{
"epoch": 20.08746355685131,
"grad_norm": 0.36639904975891113,
"learning_rate": 0.0003591633605600933,
"loss": 3.1708,
"step": 68900
},
{
"epoch": 20.102040816326532,
"grad_norm": 0.3923637568950653,
"learning_rate": 0.00035898833138856473,
"loss": 3.1798,
"step": 68950
},
{
"epoch": 20.11661807580175,
"grad_norm": 0.3729211091995239,
"learning_rate": 0.00035881330221703615,
"loss": 3.1777,
"step": 69000
},
{
"epoch": 20.11661807580175,
"eval_accuracy": 0.3722634052764117,
"eval_loss": 3.555449962615967,
"eval_runtime": 181.5409,
"eval_samples_per_second": 91.572,
"eval_steps_per_second": 5.723,
"step": 69000
},
{
"epoch": 20.131195335276967,
"grad_norm": 0.3827017545700073,
"learning_rate": 0.0003586382730455076,
"loss": 3.1923,
"step": 69050
},
{
"epoch": 20.145772594752188,
"grad_norm": 0.39228540658950806,
"learning_rate": 0.00035846324387397895,
"loss": 3.1933,
"step": 69100
},
{
"epoch": 20.160349854227405,
"grad_norm": 0.4029063880443573,
"learning_rate": 0.0003582882147024503,
"loss": 3.1833,
"step": 69150
},
{
"epoch": 20.174927113702623,
"grad_norm": 0.39909449219703674,
"learning_rate": 0.0003581131855309218,
"loss": 3.1918,
"step": 69200
},
{
"epoch": 20.189504373177844,
"grad_norm": 0.3867656886577606,
"learning_rate": 0.0003579381563593932,
"loss": 3.1897,
"step": 69250
},
{
"epoch": 20.20408163265306,
"grad_norm": 0.3900681734085083,
"learning_rate": 0.0003577631271878646,
"loss": 3.2048,
"step": 69300
},
{
"epoch": 20.21865889212828,
"grad_norm": 0.3752710521221161,
"learning_rate": 0.00035758809801633603,
"loss": 3.2034,
"step": 69350
},
{
"epoch": 20.2332361516035,
"grad_norm": 0.3584527373313904,
"learning_rate": 0.00035741306884480746,
"loss": 3.2036,
"step": 69400
},
{
"epoch": 20.247813411078717,
"grad_norm": 0.41200047731399536,
"learning_rate": 0.00035723803967327883,
"loss": 3.1984,
"step": 69450
},
{
"epoch": 20.262390670553934,
"grad_norm": 0.3794945478439331,
"learning_rate": 0.00035706301050175026,
"loss": 3.2055,
"step": 69500
},
{
"epoch": 20.276967930029155,
"grad_norm": 0.3582080602645874,
"learning_rate": 0.0003568879813302217,
"loss": 3.1918,
"step": 69550
},
{
"epoch": 20.291545189504372,
"grad_norm": 0.3873555064201355,
"learning_rate": 0.0003567129521586931,
"loss": 3.2128,
"step": 69600
},
{
"epoch": 20.306122448979593,
"grad_norm": 0.3871956169605255,
"learning_rate": 0.0003565379229871645,
"loss": 3.2295,
"step": 69650
},
{
"epoch": 20.32069970845481,
"grad_norm": 0.39271074533462524,
"learning_rate": 0.00035636289381563586,
"loss": 3.2124,
"step": 69700
},
{
"epoch": 20.335276967930028,
"grad_norm": 0.37518835067749023,
"learning_rate": 0.00035618786464410734,
"loss": 3.2265,
"step": 69750
},
{
"epoch": 20.34985422740525,
"grad_norm": 0.41981539130210876,
"learning_rate": 0.0003560128354725787,
"loss": 3.2192,
"step": 69800
},
{
"epoch": 20.364431486880466,
"grad_norm": 0.3705388903617859,
"learning_rate": 0.00035583780630105014,
"loss": 3.2053,
"step": 69850
},
{
"epoch": 20.379008746355684,
"grad_norm": 0.3765951693058014,
"learning_rate": 0.00035566277712952157,
"loss": 3.2039,
"step": 69900
},
{
"epoch": 20.393586005830905,
"grad_norm": 0.3718447983264923,
"learning_rate": 0.000355487747957993,
"loss": 3.2242,
"step": 69950
},
{
"epoch": 20.408163265306122,
"grad_norm": 0.3854503929615021,
"learning_rate": 0.00035531271878646437,
"loss": 3.227,
"step": 70000
},
{
"epoch": 20.408163265306122,
"eval_accuracy": 0.3724764751972486,
"eval_loss": 3.549376964569092,
"eval_runtime": 181.7953,
"eval_samples_per_second": 91.444,
"eval_steps_per_second": 5.715,
"step": 70000
},
{
"epoch": 20.42274052478134,
"grad_norm": 0.4070553183555603,
"learning_rate": 0.00035513768961493574,
"loss": 3.2329,
"step": 70050
},
{
"epoch": 20.43731778425656,
"grad_norm": 0.3540206849575043,
"learning_rate": 0.0003549626604434072,
"loss": 3.2228,
"step": 70100
},
{
"epoch": 20.451895043731778,
"grad_norm": 0.36275020241737366,
"learning_rate": 0.0003547876312718786,
"loss": 3.2215,
"step": 70150
},
{
"epoch": 20.466472303207,
"grad_norm": 0.3675984740257263,
"learning_rate": 0.00035461260210035,
"loss": 3.2286,
"step": 70200
},
{
"epoch": 20.481049562682216,
"grad_norm": 0.39644601941108704,
"learning_rate": 0.00035443757292882145,
"loss": 3.2337,
"step": 70250
},
{
"epoch": 20.495626822157433,
"grad_norm": 0.3630872070789337,
"learning_rate": 0.00035426254375729287,
"loss": 3.2301,
"step": 70300
},
{
"epoch": 20.510204081632654,
"grad_norm": 0.3842460811138153,
"learning_rate": 0.00035408751458576424,
"loss": 3.2319,
"step": 70350
},
{
"epoch": 20.52478134110787,
"grad_norm": 0.4168015122413635,
"learning_rate": 0.00035391248541423567,
"loss": 3.2215,
"step": 70400
},
{
"epoch": 20.53935860058309,
"grad_norm": 0.3930748999118805,
"learning_rate": 0.0003537374562427071,
"loss": 3.2295,
"step": 70450
},
{
"epoch": 20.55393586005831,
"grad_norm": 0.4178406894207001,
"learning_rate": 0.0003535624270711785,
"loss": 3.242,
"step": 70500
},
{
"epoch": 20.568513119533527,
"grad_norm": 0.3626347482204437,
"learning_rate": 0.0003533873978996499,
"loss": 3.2405,
"step": 70550
},
{
"epoch": 20.583090379008745,
"grad_norm": 0.3929009437561035,
"learning_rate": 0.0003532123687281214,
"loss": 3.2233,
"step": 70600
},
{
"epoch": 20.597667638483966,
"grad_norm": 0.40020325779914856,
"learning_rate": 0.00035303733955659275,
"loss": 3.2492,
"step": 70650
},
{
"epoch": 20.612244897959183,
"grad_norm": 0.39611926674842834,
"learning_rate": 0.0003528623103850641,
"loss": 3.234,
"step": 70700
},
{
"epoch": 20.626822157434404,
"grad_norm": 0.386678546667099,
"learning_rate": 0.00035268728121353555,
"loss": 3.2254,
"step": 70750
},
{
"epoch": 20.64139941690962,
"grad_norm": 0.4055517911911011,
"learning_rate": 0.000352512252042007,
"loss": 3.2339,
"step": 70800
},
{
"epoch": 20.65597667638484,
"grad_norm": 0.365493506193161,
"learning_rate": 0.0003523372228704784,
"loss": 3.2382,
"step": 70850
},
{
"epoch": 20.67055393586006,
"grad_norm": 0.3866759240627289,
"learning_rate": 0.0003521621936989498,
"loss": 3.2367,
"step": 70900
},
{
"epoch": 20.685131195335277,
"grad_norm": 0.37846946716308594,
"learning_rate": 0.00035198716452742126,
"loss": 3.2349,
"step": 70950
},
{
"epoch": 20.699708454810494,
"grad_norm": 0.3800721764564514,
"learning_rate": 0.00035181213535589263,
"loss": 3.2344,
"step": 71000
},
{
"epoch": 20.699708454810494,
"eval_accuracy": 0.37278112986858886,
"eval_loss": 3.5447607040405273,
"eval_runtime": 181.6259,
"eval_samples_per_second": 91.529,
"eval_steps_per_second": 5.721,
"step": 71000
},
{
"epoch": 20.714285714285715,
"grad_norm": 0.39008185267448425,
"learning_rate": 0.000351637106184364,
"loss": 3.2458,
"step": 71050
},
{
"epoch": 20.728862973760933,
"grad_norm": 0.36584028601646423,
"learning_rate": 0.00035146207701283543,
"loss": 3.2476,
"step": 71100
},
{
"epoch": 20.74344023323615,
"grad_norm": 0.3683364689350128,
"learning_rate": 0.00035128704784130686,
"loss": 3.2401,
"step": 71150
},
{
"epoch": 20.75801749271137,
"grad_norm": 0.36139705777168274,
"learning_rate": 0.0003511120186697783,
"loss": 3.2551,
"step": 71200
},
{
"epoch": 20.77259475218659,
"grad_norm": 0.37175899744033813,
"learning_rate": 0.00035093698949824966,
"loss": 3.249,
"step": 71250
},
{
"epoch": 20.78717201166181,
"grad_norm": 0.3636099100112915,
"learning_rate": 0.0003507619603267211,
"loss": 3.2516,
"step": 71300
},
{
"epoch": 20.801749271137027,
"grad_norm": 0.3795507252216339,
"learning_rate": 0.0003505869311551925,
"loss": 3.2419,
"step": 71350
},
{
"epoch": 20.816326530612244,
"grad_norm": 0.3986833095550537,
"learning_rate": 0.00035041190198366394,
"loss": 3.2476,
"step": 71400
},
{
"epoch": 20.830903790087465,
"grad_norm": 0.38115549087524414,
"learning_rate": 0.0003502368728121353,
"loss": 3.2498,
"step": 71450
},
{
"epoch": 20.845481049562682,
"grad_norm": 0.38317617774009705,
"learning_rate": 0.0003500618436406068,
"loss": 3.242,
"step": 71500
},
{
"epoch": 20.8600583090379,
"grad_norm": 0.3691622018814087,
"learning_rate": 0.00034988681446907816,
"loss": 3.2586,
"step": 71550
},
{
"epoch": 20.87463556851312,
"grad_norm": 0.3669494092464447,
"learning_rate": 0.00034971178529754954,
"loss": 3.2447,
"step": 71600
},
{
"epoch": 20.889212827988338,
"grad_norm": 0.37029528617858887,
"learning_rate": 0.00034953675612602096,
"loss": 3.2453,
"step": 71650
},
{
"epoch": 20.903790087463555,
"grad_norm": 0.3869434893131256,
"learning_rate": 0.0003493617269544924,
"loss": 3.2529,
"step": 71700
},
{
"epoch": 20.918367346938776,
"grad_norm": 0.3559544086456299,
"learning_rate": 0.0003491866977829638,
"loss": 3.2472,
"step": 71750
},
{
"epoch": 20.932944606413994,
"grad_norm": 0.3823868930339813,
"learning_rate": 0.0003490116686114352,
"loss": 3.2558,
"step": 71800
},
{
"epoch": 20.947521865889215,
"grad_norm": 0.35983943939208984,
"learning_rate": 0.00034883663943990667,
"loss": 3.2524,
"step": 71850
},
{
"epoch": 20.962099125364432,
"grad_norm": 0.37556958198547363,
"learning_rate": 0.00034866161026837804,
"loss": 3.253,
"step": 71900
},
{
"epoch": 20.97667638483965,
"grad_norm": 0.3773977756500244,
"learning_rate": 0.0003484865810968494,
"loss": 3.2518,
"step": 71950
},
{
"epoch": 20.99125364431487,
"grad_norm": 0.37112659215927124,
"learning_rate": 0.00034831155192532084,
"loss": 3.2637,
"step": 72000
},
{
"epoch": 20.99125364431487,
"eval_accuracy": 0.3733703094010687,
"eval_loss": 3.5348708629608154,
"eval_runtime": 181.5452,
"eval_samples_per_second": 91.569,
"eval_steps_per_second": 5.723,
"step": 72000
},
{
"epoch": 21.005830903790088,
"grad_norm": 0.3866562843322754,
"learning_rate": 0.00034813652275379227,
"loss": 3.2154,
"step": 72050
},
{
"epoch": 21.020408163265305,
"grad_norm": 0.392566055059433,
"learning_rate": 0.0003479614935822637,
"loss": 3.1381,
"step": 72100
},
{
"epoch": 21.034985422740526,
"grad_norm": 0.378366619348526,
"learning_rate": 0.00034778646441073507,
"loss": 3.1497,
"step": 72150
},
{
"epoch": 21.049562682215743,
"grad_norm": 0.40309053659439087,
"learning_rate": 0.00034761143523920655,
"loss": 3.1555,
"step": 72200
},
{
"epoch": 21.06413994169096,
"grad_norm": 0.3924875557422638,
"learning_rate": 0.0003474364060676779,
"loss": 3.1664,
"step": 72250
},
{
"epoch": 21.07871720116618,
"grad_norm": 0.4179859757423401,
"learning_rate": 0.00034726137689614935,
"loss": 3.1594,
"step": 72300
},
{
"epoch": 21.0932944606414,
"grad_norm": 0.3732832670211792,
"learning_rate": 0.0003470863477246207,
"loss": 3.1762,
"step": 72350
},
{
"epoch": 21.107871720116616,
"grad_norm": 0.364841103553772,
"learning_rate": 0.0003469113185530922,
"loss": 3.1754,
"step": 72400
},
{
"epoch": 21.122448979591837,
"grad_norm": 0.3762074112892151,
"learning_rate": 0.0003467362893815636,
"loss": 3.1719,
"step": 72450
},
{
"epoch": 21.137026239067055,
"grad_norm": 0.3844176232814789,
"learning_rate": 0.00034656126021003495,
"loss": 3.1781,
"step": 72500
},
{
"epoch": 21.151603498542276,
"grad_norm": 0.4166364073753357,
"learning_rate": 0.0003463862310385064,
"loss": 3.1648,
"step": 72550
},
{
"epoch": 21.166180758017493,
"grad_norm": 0.42058807611465454,
"learning_rate": 0.0003462112018669778,
"loss": 3.1758,
"step": 72600
},
{
"epoch": 21.18075801749271,
"grad_norm": 0.427311509847641,
"learning_rate": 0.00034603617269544923,
"loss": 3.186,
"step": 72650
},
{
"epoch": 21.19533527696793,
"grad_norm": 0.405745267868042,
"learning_rate": 0.0003458611435239206,
"loss": 3.2023,
"step": 72700
},
{
"epoch": 21.20991253644315,
"grad_norm": 0.39251014590263367,
"learning_rate": 0.0003456861143523921,
"loss": 3.1813,
"step": 72750
},
{
"epoch": 21.224489795918366,
"grad_norm": 0.39821499586105347,
"learning_rate": 0.00034551108518086345,
"loss": 3.1856,
"step": 72800
},
{
"epoch": 21.239067055393587,
"grad_norm": 0.38475996255874634,
"learning_rate": 0.0003453360560093348,
"loss": 3.1818,
"step": 72850
},
{
"epoch": 21.253644314868804,
"grad_norm": 0.41487428545951843,
"learning_rate": 0.00034516102683780625,
"loss": 3.2021,
"step": 72900
},
{
"epoch": 21.26822157434402,
"grad_norm": 0.37080344557762146,
"learning_rate": 0.0003449859976662777,
"loss": 3.1959,
"step": 72950
},
{
"epoch": 21.282798833819243,
"grad_norm": 0.40031132102012634,
"learning_rate": 0.0003448109684947491,
"loss": 3.206,
"step": 73000
},
{
"epoch": 21.282798833819243,
"eval_accuracy": 0.3722320922383219,
"eval_loss": 3.5537631511688232,
"eval_runtime": 181.5434,
"eval_samples_per_second": 91.57,
"eval_steps_per_second": 5.723,
"step": 73000
},
{
"epoch": 21.29737609329446,
"grad_norm": 0.41430559754371643,
"learning_rate": 0.0003446359393232205,
"loss": 3.2039,
"step": 73050
},
{
"epoch": 21.31195335276968,
"grad_norm": 0.4002816677093506,
"learning_rate": 0.00034446091015169196,
"loss": 3.1988,
"step": 73100
},
{
"epoch": 21.3265306122449,
"grad_norm": 0.3660390079021454,
"learning_rate": 0.00034428588098016333,
"loss": 3.1918,
"step": 73150
},
{
"epoch": 21.341107871720116,
"grad_norm": 0.38535523414611816,
"learning_rate": 0.00034411085180863476,
"loss": 3.2009,
"step": 73200
},
{
"epoch": 21.355685131195337,
"grad_norm": 0.3930363655090332,
"learning_rate": 0.00034393582263710613,
"loss": 3.2043,
"step": 73250
},
{
"epoch": 21.370262390670554,
"grad_norm": 0.3775531053543091,
"learning_rate": 0.0003437607934655776,
"loss": 3.2063,
"step": 73300
},
{
"epoch": 21.38483965014577,
"grad_norm": 0.40686893463134766,
"learning_rate": 0.000343585764294049,
"loss": 3.1895,
"step": 73350
},
{
"epoch": 21.399416909620992,
"grad_norm": 0.4103091359138489,
"learning_rate": 0.00034341073512252036,
"loss": 3.209,
"step": 73400
},
{
"epoch": 21.41399416909621,
"grad_norm": 0.36624330282211304,
"learning_rate": 0.00034323570595099184,
"loss": 3.211,
"step": 73450
},
{
"epoch": 21.428571428571427,
"grad_norm": 0.43008551001548767,
"learning_rate": 0.0003430606767794632,
"loss": 3.217,
"step": 73500
},
{
"epoch": 21.443148688046648,
"grad_norm": 0.37908637523651123,
"learning_rate": 0.00034288564760793464,
"loss": 3.221,
"step": 73550
},
{
"epoch": 21.457725947521865,
"grad_norm": 0.37392768263816833,
"learning_rate": 0.000342710618436406,
"loss": 3.212,
"step": 73600
},
{
"epoch": 21.472303206997086,
"grad_norm": 0.40657246112823486,
"learning_rate": 0.0003425355892648775,
"loss": 3.213,
"step": 73650
},
{
"epoch": 21.486880466472304,
"grad_norm": 0.3826930820941925,
"learning_rate": 0.00034236056009334887,
"loss": 3.2103,
"step": 73700
},
{
"epoch": 21.50145772594752,
"grad_norm": 0.41526952385902405,
"learning_rate": 0.00034218553092182024,
"loss": 3.2188,
"step": 73750
},
{
"epoch": 21.516034985422742,
"grad_norm": 0.3791206479072571,
"learning_rate": 0.00034201050175029166,
"loss": 3.2171,
"step": 73800
},
{
"epoch": 21.53061224489796,
"grad_norm": 0.36035192012786865,
"learning_rate": 0.0003418354725787631,
"loss": 3.2221,
"step": 73850
},
{
"epoch": 21.545189504373177,
"grad_norm": 0.4223874509334564,
"learning_rate": 0.0003416604434072345,
"loss": 3.2262,
"step": 73900
},
{
"epoch": 21.559766763848398,
"grad_norm": 0.3832871615886688,
"learning_rate": 0.0003414854142357059,
"loss": 3.22,
"step": 73950
},
{
"epoch": 21.574344023323615,
"grad_norm": 0.39104190468788147,
"learning_rate": 0.00034131038506417737,
"loss": 3.227,
"step": 74000
},
{
"epoch": 21.574344023323615,
"eval_accuracy": 0.3729764243429913,
"eval_loss": 3.546137809753418,
"eval_runtime": 182.4945,
"eval_samples_per_second": 91.093,
"eval_steps_per_second": 5.693,
"step": 74000
},
{
"epoch": 21.588921282798832,
"grad_norm": 0.4001672863960266,
"learning_rate": 0.00034113535589264874,
"loss": 3.2296,
"step": 74050
},
{
"epoch": 21.603498542274053,
"grad_norm": 0.40520548820495605,
"learning_rate": 0.00034096032672112017,
"loss": 3.2265,
"step": 74100
},
{
"epoch": 21.61807580174927,
"grad_norm": 0.36908841133117676,
"learning_rate": 0.00034078529754959154,
"loss": 3.2176,
"step": 74150
},
{
"epoch": 21.632653061224488,
"grad_norm": 0.4079746603965759,
"learning_rate": 0.000340610268378063,
"loss": 3.2411,
"step": 74200
},
{
"epoch": 21.64723032069971,
"grad_norm": 0.39124277234077454,
"learning_rate": 0.0003404352392065344,
"loss": 3.2292,
"step": 74250
},
{
"epoch": 21.661807580174926,
"grad_norm": 0.39536523818969727,
"learning_rate": 0.00034026021003500577,
"loss": 3.2268,
"step": 74300
},
{
"epoch": 21.676384839650147,
"grad_norm": 0.3833157420158386,
"learning_rate": 0.00034008518086347725,
"loss": 3.2262,
"step": 74350
},
{
"epoch": 21.690962099125365,
"grad_norm": 0.37845858931541443,
"learning_rate": 0.0003399101516919486,
"loss": 3.2272,
"step": 74400
},
{
"epoch": 21.705539358600582,
"grad_norm": 0.41482678055763245,
"learning_rate": 0.00033973512252042005,
"loss": 3.2352,
"step": 74450
},
{
"epoch": 21.720116618075803,
"grad_norm": 0.3896600306034088,
"learning_rate": 0.0003395600933488914,
"loss": 3.2399,
"step": 74500
},
{
"epoch": 21.73469387755102,
"grad_norm": 0.36860349774360657,
"learning_rate": 0.0003393850641773629,
"loss": 3.2342,
"step": 74550
},
{
"epoch": 21.749271137026238,
"grad_norm": 0.380167692899704,
"learning_rate": 0.0003392100350058343,
"loss": 3.2331,
"step": 74600
},
{
"epoch": 21.76384839650146,
"grad_norm": 0.3892786204814911,
"learning_rate": 0.00033903500583430565,
"loss": 3.2339,
"step": 74650
},
{
"epoch": 21.778425655976676,
"grad_norm": 0.3820129334926605,
"learning_rate": 0.00033885997666277713,
"loss": 3.237,
"step": 74700
},
{
"epoch": 21.793002915451893,
"grad_norm": 0.41555044054985046,
"learning_rate": 0.0003386849474912485,
"loss": 3.2339,
"step": 74750
},
{
"epoch": 21.807580174927114,
"grad_norm": 0.41377243399620056,
"learning_rate": 0.00033850991831971993,
"loss": 3.2387,
"step": 74800
},
{
"epoch": 21.82215743440233,
"grad_norm": 0.38593509793281555,
"learning_rate": 0.0003383348891481913,
"loss": 3.2413,
"step": 74850
},
{
"epoch": 21.836734693877553,
"grad_norm": 0.37622013688087463,
"learning_rate": 0.0003381598599766628,
"loss": 3.244,
"step": 74900
},
{
"epoch": 21.85131195335277,
"grad_norm": 0.38674411177635193,
"learning_rate": 0.00033798483080513416,
"loss": 3.244,
"step": 74950
},
{
"epoch": 21.865889212827987,
"grad_norm": 0.3747655749320984,
"learning_rate": 0.0003378098016336056,
"loss": 3.2301,
"step": 75000
},
{
"epoch": 21.865889212827987,
"eval_accuracy": 0.373476020334169,
"eval_loss": 3.5379791259765625,
"eval_runtime": 181.5105,
"eval_samples_per_second": 91.587,
"eval_steps_per_second": 5.724,
"step": 75000
},
{
"epoch": 21.88046647230321,
"grad_norm": 0.4235680103302002,
"learning_rate": 0.00033763477246207696,
"loss": 3.2357,
"step": 75050
},
{
"epoch": 21.895043731778426,
"grad_norm": 0.3823429048061371,
"learning_rate": 0.00033745974329054844,
"loss": 3.2449,
"step": 75100
},
{
"epoch": 21.909620991253643,
"grad_norm": 0.41767048835754395,
"learning_rate": 0.0003372847141190198,
"loss": 3.2541,
"step": 75150
},
{
"epoch": 21.924198250728864,
"grad_norm": 0.39289626479148865,
"learning_rate": 0.0003371096849474912,
"loss": 3.2522,
"step": 75200
},
{
"epoch": 21.93877551020408,
"grad_norm": 0.4047614336013794,
"learning_rate": 0.00033693465577596266,
"loss": 3.2426,
"step": 75250
},
{
"epoch": 21.9533527696793,
"grad_norm": 0.3937784731388092,
"learning_rate": 0.00033675962660443404,
"loss": 3.2358,
"step": 75300
},
{
"epoch": 21.96793002915452,
"grad_norm": 0.4089590609073639,
"learning_rate": 0.00033658459743290546,
"loss": 3.241,
"step": 75350
},
{
"epoch": 21.982507288629737,
"grad_norm": 0.4090530574321747,
"learning_rate": 0.00033640956826137684,
"loss": 3.2541,
"step": 75400
},
{
"epoch": 21.997084548104958,
"grad_norm": 0.38579609990119934,
"learning_rate": 0.0003362345390898483,
"loss": 3.2452,
"step": 75450
},
{
"epoch": 22.011661807580175,
"grad_norm": 0.3704177439212799,
"learning_rate": 0.0003360595099183197,
"loss": 3.168,
"step": 75500
},
{
"epoch": 22.026239067055393,
"grad_norm": 0.3894430100917816,
"learning_rate": 0.00033588448074679106,
"loss": 3.1481,
"step": 75550
},
{
"epoch": 22.040816326530614,
"grad_norm": 0.42577895522117615,
"learning_rate": 0.00033570945157526254,
"loss": 3.1475,
"step": 75600
},
{
"epoch": 22.05539358600583,
"grad_norm": 0.3754096031188965,
"learning_rate": 0.0003355344224037339,
"loss": 3.1548,
"step": 75650
},
{
"epoch": 22.06997084548105,
"grad_norm": 0.3804377615451813,
"learning_rate": 0.00033535939323220534,
"loss": 3.1528,
"step": 75700
},
{
"epoch": 22.08454810495627,
"grad_norm": 0.4568529427051544,
"learning_rate": 0.0003351843640606767,
"loss": 3.1464,
"step": 75750
},
{
"epoch": 22.099125364431487,
"grad_norm": 0.40900859236717224,
"learning_rate": 0.0003350093348891482,
"loss": 3.1626,
"step": 75800
},
{
"epoch": 22.113702623906704,
"grad_norm": 0.4063328802585602,
"learning_rate": 0.00033483430571761957,
"loss": 3.1597,
"step": 75850
},
{
"epoch": 22.128279883381925,
"grad_norm": 0.40163469314575195,
"learning_rate": 0.000334659276546091,
"loss": 3.1652,
"step": 75900
},
{
"epoch": 22.142857142857142,
"grad_norm": 0.40903109312057495,
"learning_rate": 0.0003344842473745624,
"loss": 3.168,
"step": 75950
},
{
"epoch": 22.157434402332363,
"grad_norm": 0.3817201256752014,
"learning_rate": 0.00033430921820303385,
"loss": 3.1727,
"step": 76000
},
{
"epoch": 22.157434402332363,
"eval_accuracy": 0.3725627626292781,
"eval_loss": 3.5528297424316406,
"eval_runtime": 181.4979,
"eval_samples_per_second": 91.593,
"eval_steps_per_second": 5.725,
"step": 76000
},
{
"epoch": 22.17201166180758,
"grad_norm": 0.3955742120742798,
"learning_rate": 0.0003341341890315052,
"loss": 3.1637,
"step": 76050
},
{
"epoch": 22.186588921282798,
"grad_norm": 0.4320707321166992,
"learning_rate": 0.0003339591598599766,
"loss": 3.1777,
"step": 76100
},
{
"epoch": 22.20116618075802,
"grad_norm": 0.3852517902851105,
"learning_rate": 0.0003337841306884481,
"loss": 3.1858,
"step": 76150
},
{
"epoch": 22.215743440233236,
"grad_norm": 0.4392651319503784,
"learning_rate": 0.00033360910151691945,
"loss": 3.1815,
"step": 76200
},
{
"epoch": 22.230320699708454,
"grad_norm": 0.39310264587402344,
"learning_rate": 0.0003334340723453909,
"loss": 3.1695,
"step": 76250
},
{
"epoch": 22.244897959183675,
"grad_norm": 0.388776570558548,
"learning_rate": 0.00033325904317386225,
"loss": 3.1865,
"step": 76300
},
{
"epoch": 22.259475218658892,
"grad_norm": 0.3850609064102173,
"learning_rate": 0.00033308401400233373,
"loss": 3.1881,
"step": 76350
},
{
"epoch": 22.27405247813411,
"grad_norm": 0.3893771171569824,
"learning_rate": 0.0003329089848308051,
"loss": 3.1883,
"step": 76400
},
{
"epoch": 22.28862973760933,
"grad_norm": 0.4042390286922455,
"learning_rate": 0.0003327339556592765,
"loss": 3.1814,
"step": 76450
},
{
"epoch": 22.303206997084548,
"grad_norm": 0.37913334369659424,
"learning_rate": 0.00033255892648774795,
"loss": 3.185,
"step": 76500
},
{
"epoch": 22.317784256559765,
"grad_norm": 0.43061673641204834,
"learning_rate": 0.0003323838973162193,
"loss": 3.2021,
"step": 76550
},
{
"epoch": 22.332361516034986,
"grad_norm": 0.47006747126579285,
"learning_rate": 0.00033220886814469075,
"loss": 3.1932,
"step": 76600
},
{
"epoch": 22.346938775510203,
"grad_norm": 0.40174809098243713,
"learning_rate": 0.0003320338389731621,
"loss": 3.2008,
"step": 76650
},
{
"epoch": 22.361516034985424,
"grad_norm": 0.3931922912597656,
"learning_rate": 0.0003318588098016336,
"loss": 3.1946,
"step": 76700
},
{
"epoch": 22.37609329446064,
"grad_norm": 0.3940417468547821,
"learning_rate": 0.000331683780630105,
"loss": 3.1994,
"step": 76750
},
{
"epoch": 22.39067055393586,
"grad_norm": 0.37044382095336914,
"learning_rate": 0.0003315087514585764,
"loss": 3.198,
"step": 76800
},
{
"epoch": 22.40524781341108,
"grad_norm": 0.41557997465133667,
"learning_rate": 0.00033133372228704783,
"loss": 3.1991,
"step": 76850
},
{
"epoch": 22.419825072886297,
"grad_norm": 0.41059786081314087,
"learning_rate": 0.00033115869311551926,
"loss": 3.1885,
"step": 76900
},
{
"epoch": 22.434402332361515,
"grad_norm": 0.38646191358566284,
"learning_rate": 0.00033098366394399063,
"loss": 3.2002,
"step": 76950
},
{
"epoch": 22.448979591836736,
"grad_norm": 0.3769567608833313,
"learning_rate": 0.000330808634772462,
"loss": 3.2089,
"step": 77000
},
{
"epoch": 22.448979591836736,
"eval_accuracy": 0.37277524395917344,
"eval_loss": 3.5467875003814697,
"eval_runtime": 181.4854,
"eval_samples_per_second": 91.6,
"eval_steps_per_second": 5.725,
"step": 77000
},
{
"epoch": 22.463556851311953,
"grad_norm": 0.3794631063938141,
"learning_rate": 0.0003306336056009335,
"loss": 3.2089,
"step": 77050
},
{
"epoch": 22.47813411078717,
"grad_norm": 0.4073309600353241,
"learning_rate": 0.00033045857642940486,
"loss": 3.1998,
"step": 77100
},
{
"epoch": 22.49271137026239,
"grad_norm": 0.40729716420173645,
"learning_rate": 0.0003302835472578763,
"loss": 3.2111,
"step": 77150
},
{
"epoch": 22.50728862973761,
"grad_norm": 0.4218553602695465,
"learning_rate": 0.0003301085180863477,
"loss": 3.2033,
"step": 77200
},
{
"epoch": 22.52186588921283,
"grad_norm": 0.410984605550766,
"learning_rate": 0.00032993348891481914,
"loss": 3.1964,
"step": 77250
},
{
"epoch": 22.536443148688047,
"grad_norm": 0.3754239082336426,
"learning_rate": 0.0003297584597432905,
"loss": 3.2193,
"step": 77300
},
{
"epoch": 22.551020408163264,
"grad_norm": 0.40258893370628357,
"learning_rate": 0.0003295834305717619,
"loss": 3.2121,
"step": 77350
},
{
"epoch": 22.565597667638485,
"grad_norm": 0.4331571161746979,
"learning_rate": 0.00032940840140023337,
"loss": 3.2096,
"step": 77400
},
{
"epoch": 22.580174927113703,
"grad_norm": 0.39986467361450195,
"learning_rate": 0.00032923337222870474,
"loss": 3.2039,
"step": 77450
},
{
"epoch": 22.59475218658892,
"grad_norm": 0.39954131841659546,
"learning_rate": 0.00032905834305717616,
"loss": 3.22,
"step": 77500
},
{
"epoch": 22.60932944606414,
"grad_norm": 0.3756577968597412,
"learning_rate": 0.00032888331388564754,
"loss": 3.2104,
"step": 77550
},
{
"epoch": 22.62390670553936,
"grad_norm": 0.43170756101608276,
"learning_rate": 0.000328708284714119,
"loss": 3.2222,
"step": 77600
},
{
"epoch": 22.638483965014576,
"grad_norm": 0.4028014838695526,
"learning_rate": 0.0003285332555425904,
"loss": 3.2124,
"step": 77650
},
{
"epoch": 22.653061224489797,
"grad_norm": 0.4260345995426178,
"learning_rate": 0.0003283582263710618,
"loss": 3.223,
"step": 77700
},
{
"epoch": 22.667638483965014,
"grad_norm": 0.40206658840179443,
"learning_rate": 0.00032818319719953324,
"loss": 3.2277,
"step": 77750
},
{
"epoch": 22.682215743440235,
"grad_norm": 0.4037342369556427,
"learning_rate": 0.00032800816802800467,
"loss": 3.2276,
"step": 77800
},
{
"epoch": 22.696793002915452,
"grad_norm": 0.3831779360771179,
"learning_rate": 0.00032783313885647604,
"loss": 3.2048,
"step": 77850
},
{
"epoch": 22.71137026239067,
"grad_norm": 0.41773656010627747,
"learning_rate": 0.0003276581096849474,
"loss": 3.2226,
"step": 77900
},
{
"epoch": 22.72594752186589,
"grad_norm": 0.40552884340286255,
"learning_rate": 0.0003274830805134189,
"loss": 3.2213,
"step": 77950
},
{
"epoch": 22.740524781341108,
"grad_norm": 0.374747097492218,
"learning_rate": 0.00032730805134189027,
"loss": 3.2303,
"step": 78000
},
{
"epoch": 22.740524781341108,
"eval_accuracy": 0.3737029810012262,
"eval_loss": 3.538172721862793,
"eval_runtime": 183.0931,
"eval_samples_per_second": 90.795,
"eval_steps_per_second": 5.675,
"step": 78000
},
{
"epoch": 22.755102040816325,
"grad_norm": 0.3842918276786804,
"learning_rate": 0.0003271330221703617,
"loss": 3.2269,
"step": 78050
},
{
"epoch": 22.769679300291546,
"grad_norm": 0.3732106387615204,
"learning_rate": 0.0003269579929988331,
"loss": 3.2353,
"step": 78100
},
{
"epoch": 22.784256559766764,
"grad_norm": 0.39892974495887756,
"learning_rate": 0.00032678296382730455,
"loss": 3.2265,
"step": 78150
},
{
"epoch": 22.79883381924198,
"grad_norm": 0.3784988224506378,
"learning_rate": 0.0003266079346557759,
"loss": 3.2361,
"step": 78200
},
{
"epoch": 22.813411078717202,
"grad_norm": 0.4117169678211212,
"learning_rate": 0.0003264329054842473,
"loss": 3.2218,
"step": 78250
},
{
"epoch": 22.82798833819242,
"grad_norm": 0.3687457740306854,
"learning_rate": 0.0003262578763127188,
"loss": 3.2365,
"step": 78300
},
{
"epoch": 22.842565597667637,
"grad_norm": 0.4006026089191437,
"learning_rate": 0.00032608284714119015,
"loss": 3.2334,
"step": 78350
},
{
"epoch": 22.857142857142858,
"grad_norm": 0.38996270298957825,
"learning_rate": 0.0003259078179696616,
"loss": 3.2307,
"step": 78400
},
{
"epoch": 22.871720116618075,
"grad_norm": 0.39698663353919983,
"learning_rate": 0.000325732788798133,
"loss": 3.2265,
"step": 78450
},
{
"epoch": 22.886297376093296,
"grad_norm": 0.38513946533203125,
"learning_rate": 0.00032555775962660443,
"loss": 3.2263,
"step": 78500
},
{
"epoch": 22.900874635568513,
"grad_norm": 0.40252456068992615,
"learning_rate": 0.0003253827304550758,
"loss": 3.2238,
"step": 78550
},
{
"epoch": 22.91545189504373,
"grad_norm": 0.3779289424419403,
"learning_rate": 0.00032520770128354723,
"loss": 3.2383,
"step": 78600
},
{
"epoch": 22.93002915451895,
"grad_norm": 0.3678314983844757,
"learning_rate": 0.00032503267211201866,
"loss": 3.2413,
"step": 78650
},
{
"epoch": 22.94460641399417,
"grad_norm": 0.3785208761692047,
"learning_rate": 0.0003248576429404901,
"loss": 3.2335,
"step": 78700
},
{
"epoch": 22.959183673469386,
"grad_norm": 0.3818875849246979,
"learning_rate": 0.00032468261376896146,
"loss": 3.2398,
"step": 78750
},
{
"epoch": 22.973760932944607,
"grad_norm": 0.40353357791900635,
"learning_rate": 0.00032450758459743283,
"loss": 3.2332,
"step": 78800
},
{
"epoch": 22.988338192419825,
"grad_norm": 0.37254422903060913,
"learning_rate": 0.0003243325554259043,
"loss": 3.2311,
"step": 78850
},
{
"epoch": 23.002915451895042,
"grad_norm": 0.4074246883392334,
"learning_rate": 0.0003241575262543757,
"loss": 3.1996,
"step": 78900
},
{
"epoch": 23.017492711370263,
"grad_norm": 0.4083156883716583,
"learning_rate": 0.0003239824970828471,
"loss": 3.129,
"step": 78950
},
{
"epoch": 23.03206997084548,
"grad_norm": 0.40061071515083313,
"learning_rate": 0.00032380746791131854,
"loss": 3.1451,
"step": 79000
},
{
"epoch": 23.03206997084548,
"eval_accuracy": 0.3729666537333617,
"eval_loss": 3.5513992309570312,
"eval_runtime": 183.3231,
"eval_samples_per_second": 90.681,
"eval_steps_per_second": 5.668,
"step": 79000
},
{
"epoch": 23.0466472303207,
"grad_norm": 0.4193076193332672,
"learning_rate": 0.00032363243873978996,
"loss": 3.1481,
"step": 79050
},
{
"epoch": 23.06122448979592,
"grad_norm": 0.42379242181777954,
"learning_rate": 0.00032345740956826134,
"loss": 3.1376,
"step": 79100
},
{
"epoch": 23.075801749271136,
"grad_norm": 0.4095369279384613,
"learning_rate": 0.0003232823803967327,
"loss": 3.1415,
"step": 79150
},
{
"epoch": 23.090379008746357,
"grad_norm": 0.3854268491268158,
"learning_rate": 0.0003231073512252042,
"loss": 3.1506,
"step": 79200
},
{
"epoch": 23.104956268221574,
"grad_norm": 0.4058293104171753,
"learning_rate": 0.00032293232205367556,
"loss": 3.1607,
"step": 79250
},
{
"epoch": 23.11953352769679,
"grad_norm": 0.40621069073677063,
"learning_rate": 0.000322757292882147,
"loss": 3.1575,
"step": 79300
},
{
"epoch": 23.134110787172013,
"grad_norm": 0.4140237867832184,
"learning_rate": 0.0003225822637106184,
"loss": 3.1554,
"step": 79350
},
{
"epoch": 23.14868804664723,
"grad_norm": 0.38233980536460876,
"learning_rate": 0.00032240723453908984,
"loss": 3.1596,
"step": 79400
},
{
"epoch": 23.163265306122447,
"grad_norm": 0.3980560600757599,
"learning_rate": 0.0003222322053675612,
"loss": 3.1618,
"step": 79450
},
{
"epoch": 23.17784256559767,
"grad_norm": 0.40507349371910095,
"learning_rate": 0.00032205717619603264,
"loss": 3.1599,
"step": 79500
},
{
"epoch": 23.192419825072886,
"grad_norm": 0.4133625030517578,
"learning_rate": 0.00032188214702450407,
"loss": 3.1661,
"step": 79550
},
{
"epoch": 23.206997084548107,
"grad_norm": 0.4333428144454956,
"learning_rate": 0.0003217071178529755,
"loss": 3.1745,
"step": 79600
},
{
"epoch": 23.221574344023324,
"grad_norm": 0.41459041833877563,
"learning_rate": 0.00032153208868144687,
"loss": 3.1547,
"step": 79650
},
{
"epoch": 23.23615160349854,
"grad_norm": 0.3677409887313843,
"learning_rate": 0.00032135705950991835,
"loss": 3.1699,
"step": 79700
},
{
"epoch": 23.250728862973762,
"grad_norm": 0.3948548138141632,
"learning_rate": 0.0003211820303383897,
"loss": 3.1748,
"step": 79750
},
{
"epoch": 23.26530612244898,
"grad_norm": 0.3827493488788605,
"learning_rate": 0.0003210070011668611,
"loss": 3.1882,
"step": 79800
},
{
"epoch": 23.279883381924197,
"grad_norm": 0.40871936082839966,
"learning_rate": 0.0003208319719953325,
"loss": 3.1778,
"step": 79850
},
{
"epoch": 23.294460641399418,
"grad_norm": 0.3977854251861572,
"learning_rate": 0.00032065694282380395,
"loss": 3.1791,
"step": 79900
},
{
"epoch": 23.309037900874635,
"grad_norm": 0.38805514574050903,
"learning_rate": 0.0003204819136522754,
"loss": 3.1588,
"step": 79950
},
{
"epoch": 23.323615160349853,
"grad_norm": 0.41924408078193665,
"learning_rate": 0.00032030688448074675,
"loss": 3.1934,
"step": 80000
},
{
"epoch": 23.323615160349853,
"eval_accuracy": 0.3729095604120325,
"eval_loss": 3.5495593547821045,
"eval_runtime": 182.8971,
"eval_samples_per_second": 90.893,
"eval_steps_per_second": 5.681,
"step": 80000
},
{
"epoch": 23.338192419825074,
"grad_norm": 0.40964633226394653,
"learning_rate": 0.0003201318553092181,
"loss": 3.1388,
"step": 80050
},
{
"epoch": 23.35276967930029,
"grad_norm": 0.40592989325523376,
"learning_rate": 0.0003199568261376896,
"loss": 3.1356,
"step": 80100
},
{
"epoch": 23.367346938775512,
"grad_norm": 0.40325120091438293,
"learning_rate": 0.000319781796966161,
"loss": 3.1501,
"step": 80150
},
{
"epoch": 23.38192419825073,
"grad_norm": 0.37463781237602234,
"learning_rate": 0.0003196067677946324,
"loss": 3.1391,
"step": 80200
},
{
"epoch": 23.396501457725947,
"grad_norm": 0.3974815309047699,
"learning_rate": 0.0003194317386231038,
"loss": 3.1471,
"step": 80250
},
{
"epoch": 23.411078717201168,
"grad_norm": 0.3901289701461792,
"learning_rate": 0.00031925670945157525,
"loss": 3.1546,
"step": 80300
},
{
"epoch": 23.425655976676385,
"grad_norm": 0.3855806291103363,
"learning_rate": 0.0003190816802800466,
"loss": 3.1602,
"step": 80350
},
{
"epoch": 23.440233236151602,
"grad_norm": 0.4171423017978668,
"learning_rate": 0.00031890665110851805,
"loss": 3.1602,
"step": 80400
},
{
"epoch": 23.454810495626823,
"grad_norm": 0.4090583920478821,
"learning_rate": 0.0003187316219369895,
"loss": 3.1586,
"step": 80450
},
{
"epoch": 23.46938775510204,
"grad_norm": 0.40589964389801025,
"learning_rate": 0.0003185565927654609,
"loss": 3.161,
"step": 80500
},
{
"epoch": 23.483965014577258,
"grad_norm": 0.4126732647418976,
"learning_rate": 0.0003183815635939323,
"loss": 3.1643,
"step": 80550
},
{
"epoch": 23.49854227405248,
"grad_norm": 0.38648244738578796,
"learning_rate": 0.00031820653442240376,
"loss": 3.1724,
"step": 80600
},
{
"epoch": 23.513119533527696,
"grad_norm": 0.4161723852157593,
"learning_rate": 0.00031803150525087513,
"loss": 3.1661,
"step": 80650
},
{
"epoch": 23.527696793002917,
"grad_norm": 0.463445782661438,
"learning_rate": 0.0003178564760793465,
"loss": 3.1686,
"step": 80700
},
{
"epoch": 23.542274052478135,
"grad_norm": 0.3802230954170227,
"learning_rate": 0.00031768144690781793,
"loss": 3.1774,
"step": 80750
},
{
"epoch": 23.556851311953352,
"grad_norm": 0.40115004777908325,
"learning_rate": 0.00031750641773628936,
"loss": 3.177,
"step": 80800
},
{
"epoch": 23.571428571428573,
"grad_norm": 0.3771075904369354,
"learning_rate": 0.0003173313885647608,
"loss": 3.1733,
"step": 80850
},
{
"epoch": 23.58600583090379,
"grad_norm": 0.4014439284801483,
"learning_rate": 0.00031715635939323216,
"loss": 3.1715,
"step": 80900
},
{
"epoch": 23.600583090379008,
"grad_norm": 0.4014812707901001,
"learning_rate": 0.00031698133022170364,
"loss": 3.1898,
"step": 80950
},
{
"epoch": 23.61516034985423,
"grad_norm": 0.3988043963909149,
"learning_rate": 0.000316806301050175,
"loss": 3.1733,
"step": 81000
},
{
"epoch": 23.61516034985423,
"eval_accuracy": 0.37280632156088667,
"eval_loss": 3.553154468536377,
"eval_runtime": 183.8151,
"eval_samples_per_second": 90.439,
"eval_steps_per_second": 5.652,
"step": 81000
},
{
"epoch": 23.629737609329446,
"grad_norm": 0.41778406500816345,
"learning_rate": 0.0003166312718786464,
"loss": 3.1837,
"step": 81050
},
{
"epoch": 23.644314868804663,
"grad_norm": 0.4018956124782562,
"learning_rate": 0.0003164562427071178,
"loss": 3.1654,
"step": 81100
},
{
"epoch": 23.658892128279884,
"grad_norm": 0.4062495529651642,
"learning_rate": 0.00031628121353558924,
"loss": 3.1817,
"step": 81150
},
{
"epoch": 23.6734693877551,
"grad_norm": 0.42387402057647705,
"learning_rate": 0.00031610618436406066,
"loss": 3.1934,
"step": 81200
},
{
"epoch": 23.68804664723032,
"grad_norm": 0.41637706756591797,
"learning_rate": 0.00031593115519253204,
"loss": 3.1909,
"step": 81250
},
{
"epoch": 23.70262390670554,
"grad_norm": 0.4212613105773926,
"learning_rate": 0.00031575612602100346,
"loss": 3.1765,
"step": 81300
},
{
"epoch": 23.717201166180757,
"grad_norm": 0.404997318983078,
"learning_rate": 0.0003155810968494749,
"loss": 3.1889,
"step": 81350
},
{
"epoch": 23.73177842565598,
"grad_norm": 0.3999521732330322,
"learning_rate": 0.0003154060676779463,
"loss": 3.1951,
"step": 81400
},
{
"epoch": 23.746355685131196,
"grad_norm": 0.4004182517528534,
"learning_rate": 0.0003152310385064177,
"loss": 3.1944,
"step": 81450
},
{
"epoch": 23.760932944606413,
"grad_norm": 0.3951795697212219,
"learning_rate": 0.00031505600933488917,
"loss": 3.1882,
"step": 81500
},
{
"epoch": 23.775510204081634,
"grad_norm": 0.4371464252471924,
"learning_rate": 0.00031488098016336054,
"loss": 3.1815,
"step": 81550
},
{
"epoch": 23.79008746355685,
"grad_norm": 0.4174312949180603,
"learning_rate": 0.0003147059509918319,
"loss": 3.1957,
"step": 81600
},
{
"epoch": 23.80466472303207,
"grad_norm": 0.37675541639328003,
"learning_rate": 0.00031453092182030334,
"loss": 3.1946,
"step": 81650
},
{
"epoch": 23.81924198250729,
"grad_norm": 0.40973252058029175,
"learning_rate": 0.00031435589264877477,
"loss": 3.1906,
"step": 81700
},
{
"epoch": 23.833819241982507,
"grad_norm": 0.41370999813079834,
"learning_rate": 0.0003141808634772462,
"loss": 3.1871,
"step": 81750
},
{
"epoch": 23.848396501457724,
"grad_norm": 0.3945741057395935,
"learning_rate": 0.00031400583430571757,
"loss": 3.2006,
"step": 81800
},
{
"epoch": 23.862973760932945,
"grad_norm": 0.409297913312912,
"learning_rate": 0.00031383080513418905,
"loss": 3.2033,
"step": 81850
},
{
"epoch": 23.877551020408163,
"grad_norm": 0.4507441818714142,
"learning_rate": 0.0003136557759626604,
"loss": 3.2035,
"step": 81900
},
{
"epoch": 23.892128279883384,
"grad_norm": 0.3964073359966278,
"learning_rate": 0.0003134807467911318,
"loss": 3.2036,
"step": 81950
},
{
"epoch": 23.9067055393586,
"grad_norm": 0.4502336084842682,
"learning_rate": 0.0003133057176196032,
"loss": 3.2024,
"step": 82000
},
{
"epoch": 23.9067055393586,
"eval_accuracy": 0.37321904152909335,
"eval_loss": 3.5442070960998535,
"eval_runtime": 183.0942,
"eval_samples_per_second": 90.795,
"eval_steps_per_second": 5.675,
"step": 82000
},
{
"epoch": 23.92128279883382,
"grad_norm": 0.3888959586620331,
"learning_rate": 0.00031313068844807465,
"loss": 3.2061,
"step": 82050
},
{
"epoch": 23.93586005830904,
"grad_norm": 0.3921671509742737,
"learning_rate": 0.0003129556592765461,
"loss": 3.2084,
"step": 82100
},
{
"epoch": 23.950437317784257,
"grad_norm": 0.40496307611465454,
"learning_rate": 0.00031278063010501745,
"loss": 3.1999,
"step": 82150
},
{
"epoch": 23.965014577259474,
"grad_norm": 0.40495121479034424,
"learning_rate": 0.00031260560093348893,
"loss": 3.2047,
"step": 82200
},
{
"epoch": 23.979591836734695,
"grad_norm": 0.3954131305217743,
"learning_rate": 0.0003124305717619603,
"loss": 3.2014,
"step": 82250
},
{
"epoch": 23.994169096209912,
"grad_norm": 0.4274619519710541,
"learning_rate": 0.00031225554259043173,
"loss": 3.2172,
"step": 82300
},
{
"epoch": 24.00874635568513,
"grad_norm": 0.4204201102256775,
"learning_rate": 0.0003120805134189031,
"loss": 3.1609,
"step": 82350
},
{
"epoch": 24.02332361516035,
"grad_norm": 0.4107939302921295,
"learning_rate": 0.0003119054842473746,
"loss": 3.1226,
"step": 82400
},
{
"epoch": 24.037900874635568,
"grad_norm": 0.7560588717460632,
"learning_rate": 0.00031173045507584596,
"loss": 3.1357,
"step": 82450
},
{
"epoch": 24.05247813411079,
"grad_norm": 0.42420998215675354,
"learning_rate": 0.00031155542590431733,
"loss": 3.1287,
"step": 82500
},
{
"epoch": 24.067055393586006,
"grad_norm": 0.3994666337966919,
"learning_rate": 0.00031138039673278876,
"loss": 3.1346,
"step": 82550
},
{
"epoch": 24.081632653061224,
"grad_norm": 0.42727458477020264,
"learning_rate": 0.0003112053675612602,
"loss": 3.1467,
"step": 82600
},
{
"epoch": 24.096209912536445,
"grad_norm": 0.4032772183418274,
"learning_rate": 0.0003110303383897316,
"loss": 3.1444,
"step": 82650
},
{
"epoch": 24.110787172011662,
"grad_norm": 0.40215247869491577,
"learning_rate": 0.000310855309218203,
"loss": 3.154,
"step": 82700
},
{
"epoch": 24.12536443148688,
"grad_norm": 0.39542341232299805,
"learning_rate": 0.00031068028004667446,
"loss": 3.1496,
"step": 82750
},
{
"epoch": 24.1399416909621,
"grad_norm": 0.4189421534538269,
"learning_rate": 0.00031050525087514584,
"loss": 3.1573,
"step": 82800
},
{
"epoch": 24.154518950437318,
"grad_norm": 0.42610371112823486,
"learning_rate": 0.0003103302217036172,
"loss": 3.1536,
"step": 82850
},
{
"epoch": 24.169096209912535,
"grad_norm": 0.41727983951568604,
"learning_rate": 0.00031015519253208863,
"loss": 3.1639,
"step": 82900
},
{
"epoch": 24.183673469387756,
"grad_norm": 0.3976685702800751,
"learning_rate": 0.00030998016336056006,
"loss": 3.1504,
"step": 82950
},
{
"epoch": 24.198250728862973,
"grad_norm": 0.42174771428108215,
"learning_rate": 0.0003098051341890315,
"loss": 3.1588,
"step": 83000
},
{
"epoch": 24.198250728862973,
"eval_accuracy": 0.3729486428505506,
"eval_loss": 3.5548107624053955,
"eval_runtime": 182.5732,
"eval_samples_per_second": 91.054,
"eval_steps_per_second": 5.691,
"step": 83000
},
{
"epoch": 24.21282798833819,
"grad_norm": 0.43244871497154236,
"learning_rate": 0.00030963010501750286,
"loss": 3.1648,
"step": 83050
},
{
"epoch": 24.22740524781341,
"grad_norm": 0.4010060727596283,
"learning_rate": 0.00030945507584597434,
"loss": 3.1591,
"step": 83100
},
{
"epoch": 24.24198250728863,
"grad_norm": 0.409901887178421,
"learning_rate": 0.0003092800466744457,
"loss": 3.1687,
"step": 83150
},
{
"epoch": 24.25655976676385,
"grad_norm": 0.41060811281204224,
"learning_rate": 0.00030910501750291714,
"loss": 3.1708,
"step": 83200
},
{
"epoch": 24.271137026239067,
"grad_norm": 0.4232666790485382,
"learning_rate": 0.0003089299883313885,
"loss": 3.1688,
"step": 83250
},
{
"epoch": 24.285714285714285,
"grad_norm": 0.39276716113090515,
"learning_rate": 0.00030875495915986,
"loss": 3.1786,
"step": 83300
},
{
"epoch": 24.300291545189506,
"grad_norm": 0.38360247015953064,
"learning_rate": 0.00030857992998833137,
"loss": 3.1669,
"step": 83350
},
{
"epoch": 24.314868804664723,
"grad_norm": 0.3872515857219696,
"learning_rate": 0.00030840490081680274,
"loss": 3.1863,
"step": 83400
},
{
"epoch": 24.32944606413994,
"grad_norm": 0.438064306974411,
"learning_rate": 0.0003082298716452742,
"loss": 3.1957,
"step": 83450
},
{
"epoch": 24.34402332361516,
"grad_norm": 0.4066021144390106,
"learning_rate": 0.0003080548424737456,
"loss": 3.1811,
"step": 83500
},
{
"epoch": 24.35860058309038,
"grad_norm": 0.3971489369869232,
"learning_rate": 0.000307879813302217,
"loss": 3.1741,
"step": 83550
},
{
"epoch": 24.373177842565596,
"grad_norm": 0.4131510853767395,
"learning_rate": 0.0003077047841306884,
"loss": 3.1878,
"step": 83600
},
{
"epoch": 24.387755102040817,
"grad_norm": 0.4342701733112335,
"learning_rate": 0.0003075297549591599,
"loss": 3.18,
"step": 83650
},
{
"epoch": 24.402332361516034,
"grad_norm": 0.4129961133003235,
"learning_rate": 0.00030735472578763125,
"loss": 3.1804,
"step": 83700
},
{
"epoch": 24.416909620991255,
"grad_norm": 0.4195544123649597,
"learning_rate": 0.0003071796966161026,
"loss": 3.1703,
"step": 83750
},
{
"epoch": 24.431486880466473,
"grad_norm": 0.4069652557373047,
"learning_rate": 0.00030700466744457405,
"loss": 3.1859,
"step": 83800
},
{
"epoch": 24.44606413994169,
"grad_norm": 0.41638901829719543,
"learning_rate": 0.0003068296382730455,
"loss": 3.1812,
"step": 83850
},
{
"epoch": 24.46064139941691,
"grad_norm": 0.3933592736721039,
"learning_rate": 0.0003066546091015169,
"loss": 3.1845,
"step": 83900
},
{
"epoch": 24.47521865889213,
"grad_norm": 0.44356077909469604,
"learning_rate": 0.00030647957992998827,
"loss": 3.1838,
"step": 83950
},
{
"epoch": 24.489795918367346,
"grad_norm": 0.39887094497680664,
"learning_rate": 0.00030630455075845975,
"loss": 3.1918,
"step": 84000
},
{
"epoch": 24.489795918367346,
"eval_accuracy": 0.3730551778109691,
"eval_loss": 3.548759937286377,
"eval_runtime": 182.4228,
"eval_samples_per_second": 91.129,
"eval_steps_per_second": 5.696,
"step": 84000
},
{
"epoch": 24.504373177842567,
"grad_norm": 0.3996953070163727,
"learning_rate": 0.0003061295215869311,
"loss": 3.186,
"step": 84050
},
{
"epoch": 24.518950437317784,
"grad_norm": 0.4084763526916504,
"learning_rate": 0.00030595449241540255,
"loss": 3.2012,
"step": 84100
},
{
"epoch": 24.533527696793,
"grad_norm": 0.4281042516231537,
"learning_rate": 0.0003057794632438739,
"loss": 3.2031,
"step": 84150
},
{
"epoch": 24.548104956268222,
"grad_norm": 0.4151524305343628,
"learning_rate": 0.0003056044340723454,
"loss": 3.1991,
"step": 84200
},
{
"epoch": 24.56268221574344,
"grad_norm": 0.41447973251342773,
"learning_rate": 0.0003054294049008168,
"loss": 3.1998,
"step": 84250
},
{
"epoch": 24.57725947521866,
"grad_norm": 0.43595123291015625,
"learning_rate": 0.00030525437572928815,
"loss": 3.2024,
"step": 84300
},
{
"epoch": 24.591836734693878,
"grad_norm": 0.392225980758667,
"learning_rate": 0.00030507934655775963,
"loss": 3.2039,
"step": 84350
},
{
"epoch": 24.606413994169095,
"grad_norm": 0.4449690580368042,
"learning_rate": 0.000304904317386231,
"loss": 3.1923,
"step": 84400
},
{
"epoch": 24.620991253644316,
"grad_norm": 0.44518622756004333,
"learning_rate": 0.00030472928821470243,
"loss": 3.1881,
"step": 84450
},
{
"epoch": 24.635568513119534,
"grad_norm": 0.411631315946579,
"learning_rate": 0.0003045542590431738,
"loss": 3.2078,
"step": 84500
},
{
"epoch": 24.65014577259475,
"grad_norm": 0.41331884264945984,
"learning_rate": 0.0003043792298716453,
"loss": 3.2033,
"step": 84550
},
{
"epoch": 24.664723032069972,
"grad_norm": 0.4580352008342743,
"learning_rate": 0.00030420420070011666,
"loss": 3.2081,
"step": 84600
},
{
"epoch": 24.67930029154519,
"grad_norm": 0.38811588287353516,
"learning_rate": 0.00030402917152858803,
"loss": 3.1988,
"step": 84650
},
{
"epoch": 24.693877551020407,
"grad_norm": 0.384888619184494,
"learning_rate": 0.0003038541423570595,
"loss": 3.2015,
"step": 84700
},
{
"epoch": 24.708454810495628,
"grad_norm": 0.4199409484863281,
"learning_rate": 0.0003036791131855309,
"loss": 3.1869,
"step": 84750
},
{
"epoch": 24.723032069970845,
"grad_norm": 0.40446436405181885,
"learning_rate": 0.0003035040840140023,
"loss": 3.1997,
"step": 84800
},
{
"epoch": 24.737609329446066,
"grad_norm": 0.3871822953224182,
"learning_rate": 0.0003033290548424737,
"loss": 3.2059,
"step": 84850
},
{
"epoch": 24.752186588921283,
"grad_norm": 0.41010650992393494,
"learning_rate": 0.00030315402567094517,
"loss": 3.2043,
"step": 84900
},
{
"epoch": 24.7667638483965,
"grad_norm": 0.3981761634349823,
"learning_rate": 0.00030297899649941654,
"loss": 3.2215,
"step": 84950
},
{
"epoch": 24.78134110787172,
"grad_norm": 0.4111586809158325,
"learning_rate": 0.00030280396732788796,
"loss": 3.204,
"step": 85000
},
{
"epoch": 24.78134110787172,
"eval_accuracy": 0.3736616619171302,
"eval_loss": 3.53930401802063,
"eval_runtime": 182.661,
"eval_samples_per_second": 91.01,
"eval_steps_per_second": 5.688,
"step": 85000
},
{
"epoch": 24.79591836734694,
"grad_norm": 0.41816040873527527,
"learning_rate": 0.00030262893815635934,
"loss": 3.2047,
"step": 85050
},
{
"epoch": 24.810495626822156,
"grad_norm": 0.3968750238418579,
"learning_rate": 0.0003024539089848308,
"loss": 3.2043,
"step": 85100
},
{
"epoch": 24.825072886297377,
"grad_norm": 0.395974338054657,
"learning_rate": 0.0003022788798133022,
"loss": 3.2157,
"step": 85150
},
{
"epoch": 24.839650145772595,
"grad_norm": 0.41137680411338806,
"learning_rate": 0.00030210385064177356,
"loss": 3.2179,
"step": 85200
},
{
"epoch": 24.854227405247812,
"grad_norm": 0.40540027618408203,
"learning_rate": 0.00030192882147024504,
"loss": 3.2213,
"step": 85250
},
{
"epoch": 24.868804664723033,
"grad_norm": 0.39145246148109436,
"learning_rate": 0.0003017537922987164,
"loss": 3.2152,
"step": 85300
},
{
"epoch": 24.88338192419825,
"grad_norm": 0.3660285770893097,
"learning_rate": 0.00030157876312718784,
"loss": 3.2211,
"step": 85350
},
{
"epoch": 24.897959183673468,
"grad_norm": 0.3978833556175232,
"learning_rate": 0.0003014037339556592,
"loss": 3.2169,
"step": 85400
},
{
"epoch": 24.91253644314869,
"grad_norm": 0.411643385887146,
"learning_rate": 0.0003012287047841307,
"loss": 3.2271,
"step": 85450
},
{
"epoch": 24.927113702623906,
"grad_norm": 0.4295453727245331,
"learning_rate": 0.00030105367561260207,
"loss": 3.2275,
"step": 85500
},
{
"epoch": 24.941690962099127,
"grad_norm": 0.40958255529403687,
"learning_rate": 0.00030087864644107344,
"loss": 3.2266,
"step": 85550
},
{
"epoch": 24.956268221574344,
"grad_norm": 0.4321279525756836,
"learning_rate": 0.0003007036172695449,
"loss": 3.2174,
"step": 85600
},
{
"epoch": 24.97084548104956,
"grad_norm": 0.4050544500350952,
"learning_rate": 0.0003005285880980163,
"loss": 3.2255,
"step": 85650
},
{
"epoch": 24.985422740524783,
"grad_norm": 0.4148904085159302,
"learning_rate": 0.0003003535589264877,
"loss": 3.2268,
"step": 85700
},
{
"epoch": 25.0,
"grad_norm": 0.39421969652175903,
"learning_rate": 0.0003001785297549591,
"loss": 3.2183,
"step": 85750
},
{
"epoch": 25.014577259475217,
"grad_norm": 0.4394013285636902,
"learning_rate": 0.0003000035005834306,
"loss": 3.1087,
"step": 85800
},
{
"epoch": 25.02915451895044,
"grad_norm": 0.42048442363739014,
"learning_rate": 0.00029982847141190195,
"loss": 3.1202,
"step": 85850
},
{
"epoch": 25.043731778425656,
"grad_norm": 0.45491907000541687,
"learning_rate": 0.0002996534422403734,
"loss": 3.1221,
"step": 85900
},
{
"epoch": 25.058309037900873,
"grad_norm": 0.43091389536857605,
"learning_rate": 0.0002994784130688448,
"loss": 3.1416,
"step": 85950
},
{
"epoch": 25.072886297376094,
"grad_norm": 0.4495410621166229,
"learning_rate": 0.00029930338389731623,
"loss": 3.1397,
"step": 86000
},
{
"epoch": 25.072886297376094,
"eval_accuracy": 0.373125926442142,
"eval_loss": 3.553874969482422,
"eval_runtime": 182.5843,
"eval_samples_per_second": 91.048,
"eval_steps_per_second": 5.691,
"step": 86000
},
{
"epoch": 25.08746355685131,
"grad_norm": 0.412474125623703,
"learning_rate": 0.0002991283547257876,
"loss": 3.1423,
"step": 86050
},
{
"epoch": 25.102040816326532,
"grad_norm": 0.3976951539516449,
"learning_rate": 0.00029895332555425903,
"loss": 3.1351,
"step": 86100
},
{
"epoch": 25.11661807580175,
"grad_norm": 0.40554651618003845,
"learning_rate": 0.0002987782963827304,
"loss": 3.13,
"step": 86150
},
{
"epoch": 25.131195335276967,
"grad_norm": 0.4113754332065582,
"learning_rate": 0.00029860326721120183,
"loss": 3.1354,
"step": 86200
},
{
"epoch": 25.145772594752188,
"grad_norm": 0.39526087045669556,
"learning_rate": 0.00029842823803967326,
"loss": 3.1438,
"step": 86250
},
{
"epoch": 25.160349854227405,
"grad_norm": 0.42777061462402344,
"learning_rate": 0.0002982532088681447,
"loss": 3.1446,
"step": 86300
},
{
"epoch": 25.174927113702623,
"grad_norm": 0.41080230474472046,
"learning_rate": 0.0002980781796966161,
"loss": 3.1497,
"step": 86350
},
{
"epoch": 25.189504373177844,
"grad_norm": 0.40987899899482727,
"learning_rate": 0.0002979031505250875,
"loss": 3.166,
"step": 86400
},
{
"epoch": 25.20408163265306,
"grad_norm": 0.42429783940315247,
"learning_rate": 0.0002977281213535589,
"loss": 3.1537,
"step": 86450
},
{
"epoch": 25.21865889212828,
"grad_norm": 0.4256708323955536,
"learning_rate": 0.0002975530921820303,
"loss": 3.1527,
"step": 86500
},
{
"epoch": 25.2332361516035,
"grad_norm": 0.41000252962112427,
"learning_rate": 0.0002973780630105017,
"loss": 3.1513,
"step": 86550
},
{
"epoch": 25.247813411078717,
"grad_norm": 0.4126875102519989,
"learning_rate": 0.00029720303383897313,
"loss": 3.1622,
"step": 86600
},
{
"epoch": 25.262390670553934,
"grad_norm": 0.3998701572418213,
"learning_rate": 0.00029702800466744456,
"loss": 3.1508,
"step": 86650
},
{
"epoch": 25.276967930029155,
"grad_norm": 0.4031747877597809,
"learning_rate": 0.00029685297549591593,
"loss": 3.1588,
"step": 86700
},
{
"epoch": 25.291545189504372,
"grad_norm": 0.411139577627182,
"learning_rate": 0.00029667794632438736,
"loss": 3.1485,
"step": 86750
},
{
"epoch": 25.306122448979593,
"grad_norm": 0.41821885108947754,
"learning_rate": 0.0002965029171528588,
"loss": 3.165,
"step": 86800
},
{
"epoch": 25.32069970845481,
"grad_norm": 0.4090188443660736,
"learning_rate": 0.0002963278879813302,
"loss": 3.1456,
"step": 86850
},
{
"epoch": 25.335276967930028,
"grad_norm": 0.4112292230129242,
"learning_rate": 0.00029615285880980164,
"loss": 3.1734,
"step": 86900
},
{
"epoch": 25.34985422740525,
"grad_norm": 0.40837302803993225,
"learning_rate": 0.000295977829638273,
"loss": 3.1718,
"step": 86950
},
{
"epoch": 25.364431486880466,
"grad_norm": 0.4170677363872528,
"learning_rate": 0.00029580280046674444,
"loss": 3.1854,
"step": 87000
},
{
"epoch": 25.364431486880466,
"eval_accuracy": 0.3733238107166872,
"eval_loss": 3.5502431392669678,
"eval_runtime": 182.4009,
"eval_samples_per_second": 91.14,
"eval_steps_per_second": 5.696,
"step": 87000
},
{
"epoch": 25.379008746355684,
"grad_norm": 0.41980138421058655,
"learning_rate": 0.0002956277712952158,
"loss": 3.1741,
"step": 87050
},
{
"epoch": 25.393586005830905,
"grad_norm": 0.4188307225704193,
"learning_rate": 0.00029545274212368724,
"loss": 3.1697,
"step": 87100
},
{
"epoch": 25.408163265306122,
"grad_norm": 0.43848034739494324,
"learning_rate": 0.00029527771295215867,
"loss": 3.185,
"step": 87150
},
{
"epoch": 25.42274052478134,
"grad_norm": 0.39616858959198,
"learning_rate": 0.0002951026837806301,
"loss": 3.1797,
"step": 87200
},
{
"epoch": 25.43731778425656,
"grad_norm": 0.41143250465393066,
"learning_rate": 0.0002949276546091015,
"loss": 3.1751,
"step": 87250
},
{
"epoch": 25.451895043731778,
"grad_norm": 0.4165962338447571,
"learning_rate": 0.0002947526254375729,
"loss": 3.1736,
"step": 87300
},
{
"epoch": 25.466472303207,
"grad_norm": 0.47294145822525024,
"learning_rate": 0.0002945775962660443,
"loss": 3.181,
"step": 87350
},
{
"epoch": 25.481049562682216,
"grad_norm": 0.4045226275920868,
"learning_rate": 0.0002944025670945157,
"loss": 3.1849,
"step": 87400
},
{
"epoch": 25.495626822157433,
"grad_norm": 0.4241320788860321,
"learning_rate": 0.0002942275379229871,
"loss": 3.1866,
"step": 87450
},
{
"epoch": 25.510204081632654,
"grad_norm": 0.42185652256011963,
"learning_rate": 0.00029405250875145855,
"loss": 3.1941,
"step": 87500
},
{
"epoch": 25.52478134110787,
"grad_norm": 0.4017044007778168,
"learning_rate": 0.00029387747957993,
"loss": 3.1831,
"step": 87550
},
{
"epoch": 25.53935860058309,
"grad_norm": 0.4189952313899994,
"learning_rate": 0.0002937024504084014,
"loss": 3.187,
"step": 87600
},
{
"epoch": 25.55393586005831,
"grad_norm": 0.4023108184337616,
"learning_rate": 0.00029352742123687277,
"loss": 3.1787,
"step": 87650
},
{
"epoch": 25.568513119533527,
"grad_norm": 0.41992291808128357,
"learning_rate": 0.0002933523920653442,
"loss": 3.185,
"step": 87700
},
{
"epoch": 25.583090379008745,
"grad_norm": 0.40843623876571655,
"learning_rate": 0.0002931773628938156,
"loss": 3.1947,
"step": 87750
},
{
"epoch": 25.597667638483966,
"grad_norm": 0.474385142326355,
"learning_rate": 0.00029300233372228705,
"loss": 3.1747,
"step": 87800
},
{
"epoch": 25.612244897959183,
"grad_norm": 0.39669227600097656,
"learning_rate": 0.0002928273045507584,
"loss": 3.1836,
"step": 87850
},
{
"epoch": 25.626822157434404,
"grad_norm": 0.40173813700675964,
"learning_rate": 0.00029265227537922985,
"loss": 3.1895,
"step": 87900
},
{
"epoch": 25.64139941690962,
"grad_norm": 0.424064964056015,
"learning_rate": 0.0002924772462077012,
"loss": 3.1904,
"step": 87950
},
{
"epoch": 25.65597667638484,
"grad_norm": 0.4088822603225708,
"learning_rate": 0.00029230221703617265,
"loss": 3.1857,
"step": 88000
},
{
"epoch": 25.65597667638484,
"eval_accuracy": 0.37384942242748087,
"eval_loss": 3.5422239303588867,
"eval_runtime": 182.3439,
"eval_samples_per_second": 91.168,
"eval_steps_per_second": 5.698,
"step": 88000
},
{
"epoch": 25.67055393586006,
"grad_norm": 0.4238326847553253,
"learning_rate": 0.0002921271878646441,
"loss": 3.1843,
"step": 88050
},
{
"epoch": 25.685131195335277,
"grad_norm": 0.4173918664455414,
"learning_rate": 0.0002919521586931155,
"loss": 3.1886,
"step": 88100
},
{
"epoch": 25.699708454810494,
"grad_norm": 0.4285440444946289,
"learning_rate": 0.00029177712952158693,
"loss": 3.1955,
"step": 88150
},
{
"epoch": 25.714285714285715,
"grad_norm": 0.42694538831710815,
"learning_rate": 0.0002916021003500583,
"loss": 3.1886,
"step": 88200
},
{
"epoch": 25.728862973760933,
"grad_norm": 0.4153097867965698,
"learning_rate": 0.00029142707117852973,
"loss": 3.2084,
"step": 88250
},
{
"epoch": 25.74344023323615,
"grad_norm": 0.4131809175014496,
"learning_rate": 0.0002912520420070011,
"loss": 3.1952,
"step": 88300
},
{
"epoch": 25.75801749271137,
"grad_norm": 0.4065682888031006,
"learning_rate": 0.00029107701283547253,
"loss": 3.2032,
"step": 88350
},
{
"epoch": 25.77259475218659,
"grad_norm": 0.41387709975242615,
"learning_rate": 0.00029090198366394396,
"loss": 3.1827,
"step": 88400
},
{
"epoch": 25.78717201166181,
"grad_norm": 0.4111236333847046,
"learning_rate": 0.0002907269544924154,
"loss": 3.1942,
"step": 88450
},
{
"epoch": 25.801749271137027,
"grad_norm": 0.3958951532840729,
"learning_rate": 0.0002905519253208868,
"loss": 3.2109,
"step": 88500
},
{
"epoch": 25.816326530612244,
"grad_norm": 0.38538554310798645,
"learning_rate": 0.0002903768961493582,
"loss": 3.1974,
"step": 88550
},
{
"epoch": 25.830903790087465,
"grad_norm": 0.43065372109413147,
"learning_rate": 0.0002902018669778296,
"loss": 3.2017,
"step": 88600
},
{
"epoch": 25.845481049562682,
"grad_norm": 0.4160745143890381,
"learning_rate": 0.00029002683780630104,
"loss": 3.2081,
"step": 88650
},
{
"epoch": 25.8600583090379,
"grad_norm": 0.4086984694004059,
"learning_rate": 0.00028985180863477246,
"loss": 3.2086,
"step": 88700
},
{
"epoch": 25.87463556851312,
"grad_norm": 0.4291251003742218,
"learning_rate": 0.00028967677946324384,
"loss": 3.1999,
"step": 88750
},
{
"epoch": 25.889212827988338,
"grad_norm": 0.4759675860404968,
"learning_rate": 0.00028950175029171526,
"loss": 3.2125,
"step": 88800
},
{
"epoch": 25.903790087463555,
"grad_norm": 0.4120366871356964,
"learning_rate": 0.0002893267211201867,
"loss": 3.1932,
"step": 88850
},
{
"epoch": 25.918367346938776,
"grad_norm": 0.42245393991470337,
"learning_rate": 0.00028915169194865806,
"loss": 3.1992,
"step": 88900
},
{
"epoch": 25.932944606413994,
"grad_norm": 0.4053804874420166,
"learning_rate": 0.0002889766627771295,
"loss": 3.1935,
"step": 88950
},
{
"epoch": 25.947521865889215,
"grad_norm": 0.4270218014717102,
"learning_rate": 0.0002888016336056009,
"loss": 3.2093,
"step": 89000
},
{
"epoch": 25.947521865889215,
"eval_accuracy": 0.374267675150538,
"eval_loss": 3.533585786819458,
"eval_runtime": 182.2598,
"eval_samples_per_second": 91.21,
"eval_steps_per_second": 5.701,
"step": 89000
},
{
"epoch": 25.962099125364432,
"grad_norm": 0.4217967987060547,
"learning_rate": 0.00028862660443407234,
"loss": 3.2001,
"step": 89050
},
{
"epoch": 25.97667638483965,
"grad_norm": 0.4118807911872864,
"learning_rate": 0.0002884515752625437,
"loss": 3.2036,
"step": 89100
},
{
"epoch": 25.99125364431487,
"grad_norm": 0.42750346660614014,
"learning_rate": 0.00028827654609101514,
"loss": 3.2162,
"step": 89150
},
{
"epoch": 26.005830903790088,
"grad_norm": 0.418365478515625,
"learning_rate": 0.0002881015169194865,
"loss": 3.1788,
"step": 89200
},
{
"epoch": 26.020408163265305,
"grad_norm": 0.4194610118865967,
"learning_rate": 0.00028792648774795794,
"loss": 3.1106,
"step": 89250
},
{
"epoch": 26.034985422740526,
"grad_norm": 0.4437054991722107,
"learning_rate": 0.00028775145857642937,
"loss": 3.1061,
"step": 89300
},
{
"epoch": 26.049562682215743,
"grad_norm": 0.424712598323822,
"learning_rate": 0.0002875764294049008,
"loss": 3.1115,
"step": 89350
},
{
"epoch": 26.06413994169096,
"grad_norm": 0.39572784304618835,
"learning_rate": 0.0002874014002333722,
"loss": 3.1158,
"step": 89400
},
{
"epoch": 26.07871720116618,
"grad_norm": 0.4253915846347809,
"learning_rate": 0.0002872263710618436,
"loss": 3.1204,
"step": 89450
},
{
"epoch": 26.0932944606414,
"grad_norm": 0.4415355324745178,
"learning_rate": 0.000287051341890315,
"loss": 3.1092,
"step": 89500
},
{
"epoch": 26.107871720116616,
"grad_norm": 0.4195983111858368,
"learning_rate": 0.00028687631271878645,
"loss": 3.1289,
"step": 89550
},
{
"epoch": 26.122448979591837,
"grad_norm": 0.4112869203090668,
"learning_rate": 0.0002867012835472579,
"loss": 3.1264,
"step": 89600
},
{
"epoch": 26.137026239067055,
"grad_norm": 0.4152833819389343,
"learning_rate": 0.00028652625437572925,
"loss": 3.1271,
"step": 89650
},
{
"epoch": 26.151603498542276,
"grad_norm": 0.4177822172641754,
"learning_rate": 0.0002863512252042007,
"loss": 3.1187,
"step": 89700
},
{
"epoch": 26.166180758017493,
"grad_norm": 0.3986111581325531,
"learning_rate": 0.0002861761960326721,
"loss": 3.1502,
"step": 89750
},
{
"epoch": 26.18075801749271,
"grad_norm": 0.4292623698711395,
"learning_rate": 0.0002860011668611435,
"loss": 3.1275,
"step": 89800
},
{
"epoch": 26.19533527696793,
"grad_norm": 0.4087209701538086,
"learning_rate": 0.0002858261376896149,
"loss": 3.1387,
"step": 89850
},
{
"epoch": 26.20991253644315,
"grad_norm": 0.44936609268188477,
"learning_rate": 0.00028565110851808633,
"loss": 3.1336,
"step": 89900
},
{
"epoch": 26.224489795918366,
"grad_norm": 0.46571049094200134,
"learning_rate": 0.00028547607934655776,
"loss": 3.1375,
"step": 89950
},
{
"epoch": 26.239067055393587,
"grad_norm": 0.4057162404060364,
"learning_rate": 0.00028530105017502913,
"loss": 3.1503,
"step": 90000
},
{
"epoch": 26.239067055393587,
"eval_accuracy": 0.3732979127152595,
"eval_loss": 3.5532782077789307,
"eval_runtime": 182.1912,
"eval_samples_per_second": 91.245,
"eval_steps_per_second": 5.703,
"step": 90000
},
{
"epoch": 26.253644314868804,
"grad_norm": 0.4367867112159729,
"learning_rate": 0.00028512602100350055,
"loss": 3.1442,
"step": 90050
},
{
"epoch": 26.26822157434402,
"grad_norm": 0.45157337188720703,
"learning_rate": 0.000284950991831972,
"loss": 3.1556,
"step": 90100
},
{
"epoch": 26.282798833819243,
"grad_norm": 0.3997921347618103,
"learning_rate": 0.00028477596266044335,
"loss": 3.1671,
"step": 90150
},
{
"epoch": 26.29737609329446,
"grad_norm": 0.5283579230308533,
"learning_rate": 0.0002846009334889148,
"loss": 3.1531,
"step": 90200
},
{
"epoch": 26.31195335276968,
"grad_norm": 0.4433899223804474,
"learning_rate": 0.0002844259043173862,
"loss": 3.1575,
"step": 90250
},
{
"epoch": 26.3265306122449,
"grad_norm": 0.4299410283565521,
"learning_rate": 0.00028425087514585763,
"loss": 3.149,
"step": 90300
},
{
"epoch": 26.341107871720116,
"grad_norm": 0.39908355474472046,
"learning_rate": 0.000284075845974329,
"loss": 3.1512,
"step": 90350
},
{
"epoch": 26.355685131195337,
"grad_norm": 0.4303843080997467,
"learning_rate": 0.00028390081680280043,
"loss": 3.1551,
"step": 90400
},
{
"epoch": 26.370262390670554,
"grad_norm": 0.4121764004230499,
"learning_rate": 0.00028372578763127186,
"loss": 3.1556,
"step": 90450
},
{
"epoch": 26.38483965014577,
"grad_norm": 0.4113008677959442,
"learning_rate": 0.0002835507584597433,
"loss": 3.1623,
"step": 90500
},
{
"epoch": 26.399416909620992,
"grad_norm": 0.4334237575531006,
"learning_rate": 0.0002833757292882147,
"loss": 3.1508,
"step": 90550
},
{
"epoch": 26.41399416909621,
"grad_norm": 0.4339134693145752,
"learning_rate": 0.0002832007001166861,
"loss": 3.1664,
"step": 90600
},
{
"epoch": 26.428571428571427,
"grad_norm": 0.42791739106178284,
"learning_rate": 0.0002830256709451575,
"loss": 3.1589,
"step": 90650
},
{
"epoch": 26.443148688046648,
"grad_norm": 0.4143845736980438,
"learning_rate": 0.0002828506417736289,
"loss": 3.1752,
"step": 90700
},
{
"epoch": 26.457725947521865,
"grad_norm": 0.40137404203414917,
"learning_rate": 0.0002826756126021003,
"loss": 3.1681,
"step": 90750
},
{
"epoch": 26.472303206997086,
"grad_norm": 0.4495770335197449,
"learning_rate": 0.00028250058343057174,
"loss": 3.1732,
"step": 90800
},
{
"epoch": 26.486880466472304,
"grad_norm": 0.42186465859413147,
"learning_rate": 0.00028232555425904317,
"loss": 3.1692,
"step": 90850
},
{
"epoch": 26.50145772594752,
"grad_norm": 0.4350731074810028,
"learning_rate": 0.00028215052508751454,
"loss": 3.1882,
"step": 90900
},
{
"epoch": 26.516034985422742,
"grad_norm": 0.40870970487594604,
"learning_rate": 0.00028197549591598597,
"loss": 3.1807,
"step": 90950
},
{
"epoch": 26.53061224489796,
"grad_norm": 0.39996737241744995,
"learning_rate": 0.0002818004667444574,
"loss": 3.1896,
"step": 91000
},
{
"epoch": 26.53061224489796,
"eval_accuracy": 0.3737564250587178,
"eval_loss": 3.5460140705108643,
"eval_runtime": 182.2217,
"eval_samples_per_second": 91.23,
"eval_steps_per_second": 5.702,
"step": 91000
},
{
"epoch": 26.545189504373177,
"grad_norm": 0.43439486622810364,
"learning_rate": 0.00028162543757292877,
"loss": 3.1718,
"step": 91050
},
{
"epoch": 26.559766763848398,
"grad_norm": 0.4343793988227844,
"learning_rate": 0.0002814504084014002,
"loss": 3.181,
"step": 91100
},
{
"epoch": 26.574344023323615,
"grad_norm": 0.4268733263015747,
"learning_rate": 0.0002812753792298716,
"loss": 3.1965,
"step": 91150
},
{
"epoch": 26.588921282798832,
"grad_norm": 0.42242977023124695,
"learning_rate": 0.00028110035005834305,
"loss": 3.1764,
"step": 91200
},
{
"epoch": 26.603498542274053,
"grad_norm": 0.4688321053981781,
"learning_rate": 0.0002809253208868144,
"loss": 3.1757,
"step": 91250
},
{
"epoch": 26.61807580174927,
"grad_norm": 0.4145612120628357,
"learning_rate": 0.00028075029171528585,
"loss": 3.18,
"step": 91300
},
{
"epoch": 26.632653061224488,
"grad_norm": 0.4039236009120941,
"learning_rate": 0.00028057526254375727,
"loss": 3.185,
"step": 91350
},
{
"epoch": 26.64723032069971,
"grad_norm": 0.44142764806747437,
"learning_rate": 0.0002804002333722287,
"loss": 3.1799,
"step": 91400
},
{
"epoch": 26.661807580174926,
"grad_norm": 0.45887356996536255,
"learning_rate": 0.0002802252042007001,
"loss": 3.1985,
"step": 91450
},
{
"epoch": 26.676384839650147,
"grad_norm": 0.4100106954574585,
"learning_rate": 0.0002800501750291715,
"loss": 3.1803,
"step": 91500
},
{
"epoch": 26.690962099125365,
"grad_norm": 0.3970423638820648,
"learning_rate": 0.0002798751458576429,
"loss": 3.2064,
"step": 91550
},
{
"epoch": 26.705539358600582,
"grad_norm": 0.43205025792121887,
"learning_rate": 0.0002797001166861143,
"loss": 3.1793,
"step": 91600
},
{
"epoch": 26.720116618075803,
"grad_norm": 0.405561238527298,
"learning_rate": 0.0002795250875145857,
"loss": 3.1937,
"step": 91650
},
{
"epoch": 26.73469387755102,
"grad_norm": 0.4518439769744873,
"learning_rate": 0.00027935005834305715,
"loss": 3.1808,
"step": 91700
},
{
"epoch": 26.749271137026238,
"grad_norm": 0.41998225450515747,
"learning_rate": 0.0002791750291715286,
"loss": 3.1911,
"step": 91750
},
{
"epoch": 26.76384839650146,
"grad_norm": 0.41252264380455017,
"learning_rate": 0.000279,
"loss": 3.1918,
"step": 91800
},
{
"epoch": 26.778425655976676,
"grad_norm": 0.3939790427684784,
"learning_rate": 0.0002788249708284714,
"loss": 3.1961,
"step": 91850
},
{
"epoch": 26.793002915451893,
"grad_norm": 0.42222079634666443,
"learning_rate": 0.0002786499416569428,
"loss": 3.1969,
"step": 91900
},
{
"epoch": 26.807580174927114,
"grad_norm": 0.45875927805900574,
"learning_rate": 0.0002784749124854142,
"loss": 3.1919,
"step": 91950
},
{
"epoch": 26.82215743440233,
"grad_norm": 0.43025851249694824,
"learning_rate": 0.0002782998833138856,
"loss": 3.182,
"step": 92000
},
{
"epoch": 26.82215743440233,
"eval_accuracy": 0.3743816263568198,
"eval_loss": 3.536444902420044,
"eval_runtime": 182.341,
"eval_samples_per_second": 91.17,
"eval_steps_per_second": 5.698,
"step": 92000
},
{
"epoch": 26.836734693877553,
"grad_norm": 0.39891061186790466,
"learning_rate": 0.00027812485414235703,
"loss": 3.206,
"step": 92050
},
{
"epoch": 26.85131195335277,
"grad_norm": 0.4320659935474396,
"learning_rate": 0.00027794982497082846,
"loss": 3.2002,
"step": 92100
},
{
"epoch": 26.865889212827987,
"grad_norm": 0.4368121027946472,
"learning_rate": 0.00027777479579929983,
"loss": 3.1955,
"step": 92150
},
{
"epoch": 26.88046647230321,
"grad_norm": 0.41299423575401306,
"learning_rate": 0.00027759976662777126,
"loss": 3.1841,
"step": 92200
},
{
"epoch": 26.895043731778426,
"grad_norm": 0.46896249055862427,
"learning_rate": 0.0002774247374562427,
"loss": 3.1841,
"step": 92250
},
{
"epoch": 26.909620991253643,
"grad_norm": 0.41294729709625244,
"learning_rate": 0.0002772497082847141,
"loss": 3.1996,
"step": 92300
},
{
"epoch": 26.924198250728864,
"grad_norm": 0.41092661023139954,
"learning_rate": 0.00027707467911318554,
"loss": 3.1942,
"step": 92350
},
{
"epoch": 26.93877551020408,
"grad_norm": 0.42133837938308716,
"learning_rate": 0.0002768996499416569,
"loss": 3.1925,
"step": 92400
},
{
"epoch": 26.9533527696793,
"grad_norm": 0.4176836609840393,
"learning_rate": 0.00027672462077012834,
"loss": 3.1996,
"step": 92450
},
{
"epoch": 26.96793002915452,
"grad_norm": 0.41681262850761414,
"learning_rate": 0.0002765495915985997,
"loss": 3.2085,
"step": 92500
},
{
"epoch": 26.982507288629737,
"grad_norm": 0.4322628080844879,
"learning_rate": 0.00027637456242707114,
"loss": 3.1991,
"step": 92550
},
{
"epoch": 26.997084548104958,
"grad_norm": 0.4040561318397522,
"learning_rate": 0.00027619953325554256,
"loss": 3.1959,
"step": 92600
},
{
"epoch": 27.011661807580175,
"grad_norm": 0.4253135919570923,
"learning_rate": 0.000276024504084014,
"loss": 3.1227,
"step": 92650
},
{
"epoch": 27.026239067055393,
"grad_norm": 0.406501829624176,
"learning_rate": 0.0002758494749124854,
"loss": 3.1051,
"step": 92700
},
{
"epoch": 27.040816326530614,
"grad_norm": 0.42616963386535645,
"learning_rate": 0.0002756744457409568,
"loss": 3.1009,
"step": 92750
},
{
"epoch": 27.05539358600583,
"grad_norm": 0.41401493549346924,
"learning_rate": 0.0002754994165694282,
"loss": 3.1149,
"step": 92800
},
{
"epoch": 27.06997084548105,
"grad_norm": 0.46265655755996704,
"learning_rate": 0.0002753243873978996,
"loss": 3.1054,
"step": 92850
},
{
"epoch": 27.08454810495627,
"grad_norm": 0.4060516059398651,
"learning_rate": 0.000275149358226371,
"loss": 3.1261,
"step": 92900
},
{
"epoch": 27.099125364431487,
"grad_norm": 0.4009002149105072,
"learning_rate": 0.00027497432905484244,
"loss": 3.1259,
"step": 92950
},
{
"epoch": 27.113702623906704,
"grad_norm": 0.40886712074279785,
"learning_rate": 0.00027479929988331387,
"loss": 3.132,
"step": 93000
},
{
"epoch": 27.113702623906704,
"eval_accuracy": 0.37335924389136776,
"eval_loss": 3.5546579360961914,
"eval_runtime": 182.3175,
"eval_samples_per_second": 91.182,
"eval_steps_per_second": 5.699,
"step": 93000
},
{
"epoch": 27.128279883381925,
"grad_norm": 0.44393426179885864,
"learning_rate": 0.0002746242707117853,
"loss": 3.1276,
"step": 93050
},
{
"epoch": 27.142857142857142,
"grad_norm": 0.4561188519001007,
"learning_rate": 0.00027444924154025667,
"loss": 3.1334,
"step": 93100
},
{
"epoch": 27.157434402332363,
"grad_norm": 0.41121312975883484,
"learning_rate": 0.0002742742123687281,
"loss": 3.1328,
"step": 93150
},
{
"epoch": 27.17201166180758,
"grad_norm": 0.41144025325775146,
"learning_rate": 0.0002740991831971995,
"loss": 3.1183,
"step": 93200
},
{
"epoch": 27.186588921282798,
"grad_norm": 0.42008209228515625,
"learning_rate": 0.00027392415402567095,
"loss": 3.1315,
"step": 93250
},
{
"epoch": 27.20116618075802,
"grad_norm": 0.3998580276966095,
"learning_rate": 0.0002737491248541423,
"loss": 3.1425,
"step": 93300
},
{
"epoch": 27.215743440233236,
"grad_norm": 0.4630013108253479,
"learning_rate": 0.00027357409568261375,
"loss": 3.1392,
"step": 93350
},
{
"epoch": 27.230320699708454,
"grad_norm": 0.4226396381855011,
"learning_rate": 0.0002733990665110851,
"loss": 3.1364,
"step": 93400
},
{
"epoch": 27.244897959183675,
"grad_norm": 0.41518664360046387,
"learning_rate": 0.00027322403733955655,
"loss": 3.1334,
"step": 93450
},
{
"epoch": 27.259475218658892,
"grad_norm": 0.42913633584976196,
"learning_rate": 0.000273049008168028,
"loss": 3.1365,
"step": 93500
},
{
"epoch": 27.27405247813411,
"grad_norm": 0.42923083901405334,
"learning_rate": 0.0002728739789964994,
"loss": 3.1459,
"step": 93550
},
{
"epoch": 27.28862973760933,
"grad_norm": 0.4026493728160858,
"learning_rate": 0.00027269894982497083,
"loss": 3.1444,
"step": 93600
},
{
"epoch": 27.303206997084548,
"grad_norm": 0.41380855441093445,
"learning_rate": 0.0002725239206534422,
"loss": 3.1418,
"step": 93650
},
{
"epoch": 27.317784256559765,
"grad_norm": 0.3932145833969116,
"learning_rate": 0.00027234889148191363,
"loss": 3.1472,
"step": 93700
},
{
"epoch": 27.332361516034986,
"grad_norm": 0.39401277899742126,
"learning_rate": 0.000272173862310385,
"loss": 3.1552,
"step": 93750
},
{
"epoch": 27.346938775510203,
"grad_norm": 0.42028194665908813,
"learning_rate": 0.00027199883313885643,
"loss": 3.1562,
"step": 93800
},
{
"epoch": 27.361516034985424,
"grad_norm": 0.4550812244415283,
"learning_rate": 0.00027182380396732785,
"loss": 3.1481,
"step": 93850
},
{
"epoch": 27.37609329446064,
"grad_norm": 0.44632673263549805,
"learning_rate": 0.0002716487747957993,
"loss": 3.1464,
"step": 93900
},
{
"epoch": 27.39067055393586,
"grad_norm": 0.42995485663414,
"learning_rate": 0.0002714737456242707,
"loss": 3.1464,
"step": 93950
},
{
"epoch": 27.40524781341108,
"grad_norm": 0.43392640352249146,
"learning_rate": 0.0002712987164527421,
"loss": 3.1532,
"step": 94000
},
{
"epoch": 27.40524781341108,
"eval_accuracy": 0.3739876235805541,
"eval_loss": 3.5490474700927734,
"eval_runtime": 182.2581,
"eval_samples_per_second": 91.211,
"eval_steps_per_second": 5.701,
"step": 94000
},
{
"epoch": 27.419825072886297,
"grad_norm": 0.47143322229385376,
"learning_rate": 0.0002711236872812135,
"loss": 3.1634,
"step": 94050
},
{
"epoch": 27.434402332361515,
"grad_norm": 0.42636147141456604,
"learning_rate": 0.00027094865810968493,
"loss": 3.1727,
"step": 94100
},
{
"epoch": 27.448979591836736,
"grad_norm": 0.42488712072372437,
"learning_rate": 0.00027077362893815636,
"loss": 3.1599,
"step": 94150
},
{
"epoch": 27.463556851311953,
"grad_norm": 0.41161873936653137,
"learning_rate": 0.00027059859976662773,
"loss": 3.1518,
"step": 94200
},
{
"epoch": 27.47813411078717,
"grad_norm": 0.4553752541542053,
"learning_rate": 0.00027042357059509916,
"loss": 3.1597,
"step": 94250
},
{
"epoch": 27.49271137026239,
"grad_norm": 0.44867464900016785,
"learning_rate": 0.0002702485414235706,
"loss": 3.1586,
"step": 94300
},
{
"epoch": 27.50728862973761,
"grad_norm": 0.4399424195289612,
"learning_rate": 0.00027007351225204196,
"loss": 3.1653,
"step": 94350
},
{
"epoch": 27.52186588921283,
"grad_norm": 0.40725404024124146,
"learning_rate": 0.0002698984830805134,
"loss": 3.1611,
"step": 94400
},
{
"epoch": 27.536443148688047,
"grad_norm": 0.4193655252456665,
"learning_rate": 0.0002697234539089848,
"loss": 3.1653,
"step": 94450
},
{
"epoch": 27.551020408163264,
"grad_norm": 0.4339175224304199,
"learning_rate": 0.00026954842473745624,
"loss": 3.1616,
"step": 94500
},
{
"epoch": 27.565597667638485,
"grad_norm": 0.44409579038619995,
"learning_rate": 0.0002693733955659276,
"loss": 3.1797,
"step": 94550
},
{
"epoch": 27.580174927113703,
"grad_norm": 0.43605875968933105,
"learning_rate": 0.00026919836639439904,
"loss": 3.1683,
"step": 94600
},
{
"epoch": 27.59475218658892,
"grad_norm": 0.4129311442375183,
"learning_rate": 0.0002690233372228704,
"loss": 3.1685,
"step": 94650
},
{
"epoch": 27.60932944606414,
"grad_norm": 0.4296424388885498,
"learning_rate": 0.00026884830805134184,
"loss": 3.1754,
"step": 94700
},
{
"epoch": 27.62390670553936,
"grad_norm": 0.4571221172809601,
"learning_rate": 0.00026867327887981327,
"loss": 3.1584,
"step": 94750
},
{
"epoch": 27.638483965014576,
"grad_norm": 0.4272068440914154,
"learning_rate": 0.0002684982497082847,
"loss": 3.1751,
"step": 94800
},
{
"epoch": 27.653061224489797,
"grad_norm": 0.41919034719467163,
"learning_rate": 0.0002683232205367561,
"loss": 3.1656,
"step": 94850
},
{
"epoch": 27.667638483965014,
"grad_norm": 0.42660972476005554,
"learning_rate": 0.0002681481913652275,
"loss": 3.1793,
"step": 94900
},
{
"epoch": 27.682215743440235,
"grad_norm": 0.4040818214416504,
"learning_rate": 0.0002679731621936989,
"loss": 3.1682,
"step": 94950
},
{
"epoch": 27.696793002915452,
"grad_norm": 0.4299778342247009,
"learning_rate": 0.00026779813302217035,
"loss": 3.183,
"step": 95000
},
{
"epoch": 27.696793002915452,
"eval_accuracy": 0.374235891239695,
"eval_loss": 3.542213201522827,
"eval_runtime": 182.3118,
"eval_samples_per_second": 91.184,
"eval_steps_per_second": 5.699,
"step": 95000
},
{
"epoch": 27.71137026239067,
"grad_norm": 0.417683869600296,
"learning_rate": 0.00026762310385064177,
"loss": 3.1681,
"step": 95050
},
{
"epoch": 27.72594752186589,
"grad_norm": 0.46521857380867004,
"learning_rate": 0.0002674480746791132,
"loss": 3.1957,
"step": 95100
},
{
"epoch": 27.740524781341108,
"grad_norm": 0.43779146671295166,
"learning_rate": 0.00026727304550758457,
"loss": 3.174,
"step": 95150
},
{
"epoch": 27.755102040816325,
"grad_norm": 0.43895670771598816,
"learning_rate": 0.000267098016336056,
"loss": 3.1767,
"step": 95200
},
{
"epoch": 27.769679300291546,
"grad_norm": 0.4087444245815277,
"learning_rate": 0.00026692298716452737,
"loss": 3.1691,
"step": 95250
},
{
"epoch": 27.784256559766764,
"grad_norm": 0.4138129949569702,
"learning_rate": 0.0002667479579929988,
"loss": 3.1816,
"step": 95300
},
{
"epoch": 27.79883381924198,
"grad_norm": 0.4436318278312683,
"learning_rate": 0.0002665729288214702,
"loss": 3.1838,
"step": 95350
},
{
"epoch": 27.813411078717202,
"grad_norm": 0.40394556522369385,
"learning_rate": 0.00026639789964994165,
"loss": 3.1816,
"step": 95400
},
{
"epoch": 27.82798833819242,
"grad_norm": 0.3930664360523224,
"learning_rate": 0.000266222870478413,
"loss": 3.1737,
"step": 95450
},
{
"epoch": 27.842565597667637,
"grad_norm": 0.41876089572906494,
"learning_rate": 0.00026604784130688445,
"loss": 3.1655,
"step": 95500
},
{
"epoch": 27.857142857142858,
"grad_norm": 0.4264007806777954,
"learning_rate": 0.0002658728121353559,
"loss": 3.1931,
"step": 95550
},
{
"epoch": 27.871720116618075,
"grad_norm": 0.4150126874446869,
"learning_rate": 0.00026569778296382725,
"loss": 3.1939,
"step": 95600
},
{
"epoch": 27.886297376093296,
"grad_norm": 0.42122021317481995,
"learning_rate": 0.0002655227537922987,
"loss": 3.1902,
"step": 95650
},
{
"epoch": 27.900874635568513,
"grad_norm": 0.41215264797210693,
"learning_rate": 0.0002653477246207701,
"loss": 3.1878,
"step": 95700
},
{
"epoch": 27.91545189504373,
"grad_norm": 0.4309203624725342,
"learning_rate": 0.00026517269544924153,
"loss": 3.181,
"step": 95750
},
{
"epoch": 27.93002915451895,
"grad_norm": 0.42477867007255554,
"learning_rate": 0.0002649976662777129,
"loss": 3.1958,
"step": 95800
},
{
"epoch": 27.94460641399417,
"grad_norm": 0.44038382172584534,
"learning_rate": 0.00026482263710618433,
"loss": 3.1817,
"step": 95850
},
{
"epoch": 27.959183673469386,
"grad_norm": 0.41017380356788635,
"learning_rate": 0.00026464760793465576,
"loss": 3.1799,
"step": 95900
},
{
"epoch": 27.973760932944607,
"grad_norm": 0.40887367725372314,
"learning_rate": 0.0002644725787631272,
"loss": 3.1856,
"step": 95950
},
{
"epoch": 27.988338192419825,
"grad_norm": 0.4441905915737152,
"learning_rate": 0.0002642975495915986,
"loss": 3.1996,
"step": 96000
},
{
"epoch": 27.988338192419825,
"eval_accuracy": 0.3744487257241552,
"eval_loss": 3.5362908840179443,
"eval_runtime": 182.2992,
"eval_samples_per_second": 91.191,
"eval_steps_per_second": 5.699,
"step": 96000
},
{
"epoch": 28.002915451895042,
"grad_norm": 0.4339207112789154,
"learning_rate": 0.00026412252042007,
"loss": 3.1705,
"step": 96050
},
{
"epoch": 28.017492711370263,
"grad_norm": 0.4093478322029114,
"learning_rate": 0.0002639474912485414,
"loss": 3.0895,
"step": 96100
},
{
"epoch": 28.03206997084548,
"grad_norm": 0.41591787338256836,
"learning_rate": 0.0002637724620770128,
"loss": 3.0936,
"step": 96150
},
{
"epoch": 28.0466472303207,
"grad_norm": 0.41636621952056885,
"learning_rate": 0.0002635974329054842,
"loss": 3.0817,
"step": 96200
},
{
"epoch": 28.06122448979592,
"grad_norm": 0.4211430847644806,
"learning_rate": 0.00026342240373395564,
"loss": 3.1026,
"step": 96250
},
{
"epoch": 28.075801749271136,
"grad_norm": 0.4202626049518585,
"learning_rate": 0.00026324737456242706,
"loss": 3.1065,
"step": 96300
},
{
"epoch": 28.090379008746357,
"grad_norm": 0.4492689371109009,
"learning_rate": 0.0002630723453908985,
"loss": 3.1069,
"step": 96350
},
{
"epoch": 28.104956268221574,
"grad_norm": 0.4519897997379303,
"learning_rate": 0.00026289731621936986,
"loss": 3.1109,
"step": 96400
},
{
"epoch": 28.11953352769679,
"grad_norm": 0.43334949016571045,
"learning_rate": 0.0002627222870478413,
"loss": 3.1146,
"step": 96450
},
{
"epoch": 28.134110787172013,
"grad_norm": 0.44164830446243286,
"learning_rate": 0.00026254725787631266,
"loss": 3.1177,
"step": 96500
},
{
"epoch": 28.14868804664723,
"grad_norm": 0.4185495972633362,
"learning_rate": 0.0002623722287047841,
"loss": 3.1323,
"step": 96550
},
{
"epoch": 28.163265306122447,
"grad_norm": 0.42650115489959717,
"learning_rate": 0.0002621971995332555,
"loss": 3.1329,
"step": 96600
},
{
"epoch": 28.17784256559767,
"grad_norm": 0.4367852807044983,
"learning_rate": 0.00026202217036172694,
"loss": 3.1257,
"step": 96650
},
{
"epoch": 28.192419825072886,
"grad_norm": 0.45244693756103516,
"learning_rate": 0.0002618471411901983,
"loss": 3.1137,
"step": 96700
},
{
"epoch": 28.206997084548107,
"grad_norm": 0.4422908425331116,
"learning_rate": 0.00026167211201866974,
"loss": 3.1321,
"step": 96750
},
{
"epoch": 28.221574344023324,
"grad_norm": 0.43210363388061523,
"learning_rate": 0.00026149708284714117,
"loss": 3.1408,
"step": 96800
},
{
"epoch": 28.23615160349854,
"grad_norm": 0.45949944853782654,
"learning_rate": 0.0002613220536756126,
"loss": 3.1283,
"step": 96850
},
{
"epoch": 28.250728862973762,
"grad_norm": 0.41855332255363464,
"learning_rate": 0.000261147024504084,
"loss": 3.1189,
"step": 96900
},
{
"epoch": 28.26530612244898,
"grad_norm": 0.42870011925697327,
"learning_rate": 0.0002609719953325554,
"loss": 3.1422,
"step": 96950
},
{
"epoch": 28.279883381924197,
"grad_norm": 0.43530869483947754,
"learning_rate": 0.0002607969661610268,
"loss": 3.1371,
"step": 97000
},
{
"epoch": 28.279883381924197,
"eval_accuracy": 0.3735036841084213,
"eval_loss": 3.553114652633667,
"eval_runtime": 182.2048,
"eval_samples_per_second": 91.238,
"eval_steps_per_second": 5.702,
"step": 97000
},
{
"epoch": 28.294460641399418,
"grad_norm": 0.42264923453330994,
"learning_rate": 0.0002606219369894982,
"loss": 3.1177,
"step": 97050
},
{
"epoch": 28.309037900874635,
"grad_norm": 0.4130590260028839,
"learning_rate": 0.0002604469078179696,
"loss": 3.1388,
"step": 97100
},
{
"epoch": 28.323615160349853,
"grad_norm": 0.40744999051094055,
"learning_rate": 0.00026027187864644105,
"loss": 3.1388,
"step": 97150
},
{
"epoch": 28.338192419825074,
"grad_norm": 0.46660253405570984,
"learning_rate": 0.0002600968494749125,
"loss": 3.1389,
"step": 97200
},
{
"epoch": 28.35276967930029,
"grad_norm": 0.43974146246910095,
"learning_rate": 0.0002599218203033839,
"loss": 3.1489,
"step": 97250
},
{
"epoch": 28.367346938775512,
"grad_norm": 0.43621718883514404,
"learning_rate": 0.0002597467911318553,
"loss": 3.1529,
"step": 97300
},
{
"epoch": 28.38192419825073,
"grad_norm": 0.42073875665664673,
"learning_rate": 0.0002595717619603267,
"loss": 3.1414,
"step": 97350
},
{
"epoch": 28.396501457725947,
"grad_norm": 0.4213981032371521,
"learning_rate": 0.0002593967327887981,
"loss": 3.1533,
"step": 97400
},
{
"epoch": 28.411078717201168,
"grad_norm": 0.4182080030441284,
"learning_rate": 0.0002592217036172695,
"loss": 3.1385,
"step": 97450
},
{
"epoch": 28.425655976676385,
"grad_norm": 0.4123639464378357,
"learning_rate": 0.00025904667444574093,
"loss": 3.1479,
"step": 97500
},
{
"epoch": 28.440233236151602,
"grad_norm": 0.4333662688732147,
"learning_rate": 0.00025887164527421235,
"loss": 3.1448,
"step": 97550
},
{
"epoch": 28.454810495626823,
"grad_norm": 0.4333436191082001,
"learning_rate": 0.0002586966161026838,
"loss": 3.1457,
"step": 97600
},
{
"epoch": 28.46938775510204,
"grad_norm": 0.4152778089046478,
"learning_rate": 0.00025852158693115515,
"loss": 3.1555,
"step": 97650
},
{
"epoch": 28.483965014577258,
"grad_norm": 0.4503307044506073,
"learning_rate": 0.0002583465577596266,
"loss": 3.1413,
"step": 97700
},
{
"epoch": 28.49854227405248,
"grad_norm": 0.4560645520687103,
"learning_rate": 0.000258171528588098,
"loss": 3.1536,
"step": 97750
},
{
"epoch": 28.513119533527696,
"grad_norm": 0.4178006947040558,
"learning_rate": 0.00025799649941656943,
"loss": 3.1502,
"step": 97800
},
{
"epoch": 28.527696793002917,
"grad_norm": 0.4329889416694641,
"learning_rate": 0.0002578214702450408,
"loss": 3.1638,
"step": 97850
},
{
"epoch": 28.542274052478135,
"grad_norm": 0.42388278245925903,
"learning_rate": 0.00025764644107351223,
"loss": 3.1715,
"step": 97900
},
{
"epoch": 28.556851311953352,
"grad_norm": 0.43577128648757935,
"learning_rate": 0.0002574714119019836,
"loss": 3.1687,
"step": 97950
},
{
"epoch": 28.571428571428573,
"grad_norm": 0.4220936894416809,
"learning_rate": 0.00025729638273045503,
"loss": 3.1674,
"step": 98000
},
{
"epoch": 28.571428571428573,
"eval_accuracy": 0.3740367120650784,
"eval_loss": 3.5461931228637695,
"eval_runtime": 182.2109,
"eval_samples_per_second": 91.235,
"eval_steps_per_second": 5.702,
"step": 98000
},
{
"epoch": 28.58600583090379,
"grad_norm": 0.43830054998397827,
"learning_rate": 0.00025712135355892646,
"loss": 3.1637,
"step": 98050
},
{
"epoch": 28.600583090379008,
"grad_norm": 0.4078274071216583,
"learning_rate": 0.0002569463243873979,
"loss": 3.16,
"step": 98100
},
{
"epoch": 28.61516034985423,
"grad_norm": 0.4261452555656433,
"learning_rate": 0.0002567712952158693,
"loss": 3.1605,
"step": 98150
},
{
"epoch": 28.629737609329446,
"grad_norm": 0.41544103622436523,
"learning_rate": 0.0002565962660443407,
"loss": 3.1539,
"step": 98200
},
{
"epoch": 28.644314868804663,
"grad_norm": 0.45621052384376526,
"learning_rate": 0.0002564212368728121,
"loss": 3.1737,
"step": 98250
},
{
"epoch": 28.658892128279884,
"grad_norm": 0.4310225248336792,
"learning_rate": 0.0002562462077012835,
"loss": 3.1557,
"step": 98300
},
{
"epoch": 28.6734693877551,
"grad_norm": 0.4316820800304413,
"learning_rate": 0.0002560711785297549,
"loss": 3.1647,
"step": 98350
},
{
"epoch": 28.68804664723032,
"grad_norm": 0.43771031498908997,
"learning_rate": 0.00025589614935822634,
"loss": 3.1694,
"step": 98400
},
{
"epoch": 28.70262390670554,
"grad_norm": 0.4118792414665222,
"learning_rate": 0.00025572112018669777,
"loss": 3.1737,
"step": 98450
},
{
"epoch": 28.717201166180757,
"grad_norm": 0.4393903315067291,
"learning_rate": 0.0002555460910151692,
"loss": 3.1775,
"step": 98500
},
{
"epoch": 28.73177842565598,
"grad_norm": 0.43764179944992065,
"learning_rate": 0.00025537106184364057,
"loss": 3.1672,
"step": 98550
},
{
"epoch": 28.746355685131196,
"grad_norm": 0.4366304874420166,
"learning_rate": 0.000255196032672112,
"loss": 3.169,
"step": 98600
},
{
"epoch": 28.760932944606413,
"grad_norm": 0.45967209339141846,
"learning_rate": 0.0002550210035005834,
"loss": 3.1722,
"step": 98650
},
{
"epoch": 28.775510204081634,
"grad_norm": 0.43880704045295715,
"learning_rate": 0.00025484597432905485,
"loss": 3.1695,
"step": 98700
},
{
"epoch": 28.79008746355685,
"grad_norm": 0.43362289667129517,
"learning_rate": 0.0002546709451575262,
"loss": 3.1679,
"step": 98750
},
{
"epoch": 28.80466472303207,
"grad_norm": 0.4177137613296509,
"learning_rate": 0.00025449591598599765,
"loss": 3.1803,
"step": 98800
},
{
"epoch": 28.81924198250729,
"grad_norm": 0.4206119179725647,
"learning_rate": 0.00025432088681446907,
"loss": 3.1719,
"step": 98850
},
{
"epoch": 28.833819241982507,
"grad_norm": 0.41267478466033936,
"learning_rate": 0.00025414585764294044,
"loss": 3.1715,
"step": 98900
},
{
"epoch": 28.848396501457724,
"grad_norm": 0.44268399477005005,
"learning_rate": 0.00025397082847141187,
"loss": 3.1749,
"step": 98950
},
{
"epoch": 28.862973760932945,
"grad_norm": 0.4493129849433899,
"learning_rate": 0.0002537957992998833,
"loss": 3.1818,
"step": 99000
},
{
"epoch": 28.862973760932945,
"eval_accuracy": 0.3747328974307299,
"eval_loss": 3.538551092147827,
"eval_runtime": 182.3234,
"eval_samples_per_second": 91.179,
"eval_steps_per_second": 5.699,
"step": 99000
},
{
"epoch": 28.877551020408163,
"grad_norm": 0.4239693880081177,
"learning_rate": 0.0002536207701283547,
"loss": 3.1753,
"step": 99050
},
{
"epoch": 28.892128279883384,
"grad_norm": 0.4513675570487976,
"learning_rate": 0.0002534457409568261,
"loss": 3.1725,
"step": 99100
},
{
"epoch": 28.9067055393586,
"grad_norm": 0.40344876050949097,
"learning_rate": 0.0002532707117852975,
"loss": 3.1764,
"step": 99150
},
{
"epoch": 28.92128279883382,
"grad_norm": 0.43702664971351624,
"learning_rate": 0.00025309568261376895,
"loss": 3.1875,
"step": 99200
},
{
"epoch": 28.93586005830904,
"grad_norm": 0.4061313271522522,
"learning_rate": 0.0002529206534422403,
"loss": 3.1757,
"step": 99250
},
{
"epoch": 28.950437317784257,
"grad_norm": 0.43192869424819946,
"learning_rate": 0.00025274562427071175,
"loss": 3.1771,
"step": 99300
},
{
"epoch": 28.965014577259474,
"grad_norm": 0.4228961169719696,
"learning_rate": 0.0002525705950991832,
"loss": 3.1744,
"step": 99350
},
{
"epoch": 28.979591836734695,
"grad_norm": 0.4422495365142822,
"learning_rate": 0.0002523955659276546,
"loss": 3.179,
"step": 99400
},
{
"epoch": 28.994169096209912,
"grad_norm": 0.4269513487815857,
"learning_rate": 0.000252220536756126,
"loss": 3.1753,
"step": 99450
},
{
"epoch": 29.00874635568513,
"grad_norm": 0.48027440905570984,
"learning_rate": 0.0002520455075845974,
"loss": 3.1244,
"step": 99500
},
{
"epoch": 29.02332361516035,
"grad_norm": 0.45419687032699585,
"learning_rate": 0.00025187047841306883,
"loss": 3.0918,
"step": 99550
},
{
"epoch": 29.037900874635568,
"grad_norm": 0.45154982805252075,
"learning_rate": 0.00025169544924154026,
"loss": 3.0949,
"step": 99600
},
{
"epoch": 29.05247813411079,
"grad_norm": 0.455363005399704,
"learning_rate": 0.00025152042007001163,
"loss": 3.0937,
"step": 99650
},
{
"epoch": 29.067055393586006,
"grad_norm": 0.4598212242126465,
"learning_rate": 0.00025134539089848306,
"loss": 3.0988,
"step": 99700
},
{
"epoch": 29.081632653061224,
"grad_norm": 0.4252084791660309,
"learning_rate": 0.0002511703617269545,
"loss": 3.1003,
"step": 99750
},
{
"epoch": 29.096209912536445,
"grad_norm": 0.4729098081588745,
"learning_rate": 0.00025099533255542586,
"loss": 3.092,
"step": 99800
},
{
"epoch": 29.110787172011662,
"grad_norm": 0.4432501792907715,
"learning_rate": 0.0002508203033838973,
"loss": 3.1063,
"step": 99850
},
{
"epoch": 29.12536443148688,
"grad_norm": 0.42664968967437744,
"learning_rate": 0.0002506452742123687,
"loss": 3.0943,
"step": 99900
},
{
"epoch": 29.1399416909621,
"grad_norm": 0.4493328332901001,
"learning_rate": 0.00025047024504084014,
"loss": 3.108,
"step": 99950
},
{
"epoch": 29.154518950437318,
"grad_norm": 0.4413515627384186,
"learning_rate": 0.0002502952158693115,
"loss": 3.1052,
"step": 100000
},
{
"epoch": 29.154518950437318,
"eval_accuracy": 0.3735989181227622,
"eval_loss": 3.555105686187744,
"eval_runtime": 182.4652,
"eval_samples_per_second": 91.108,
"eval_steps_per_second": 5.694,
"step": 100000
},
{
"epoch": 29.169096209912535,
"grad_norm": 0.46616682410240173,
"learning_rate": 0.00025012018669778294,
"loss": 3.11,
"step": 100050
},
{
"epoch": 29.183673469387756,
"grad_norm": 0.42780494689941406,
"learning_rate": 0.00024994515752625436,
"loss": 3.108,
"step": 100100
},
{
"epoch": 29.198250728862973,
"grad_norm": 0.4434570074081421,
"learning_rate": 0.0002497701283547258,
"loss": 3.1273,
"step": 100150
},
{
"epoch": 29.21282798833819,
"grad_norm": 0.45673868060112,
"learning_rate": 0.00024959509918319716,
"loss": 3.1163,
"step": 100200
},
{
"epoch": 29.22740524781341,
"grad_norm": 0.43610596656799316,
"learning_rate": 0.0002494200700116686,
"loss": 3.1242,
"step": 100250
},
{
"epoch": 29.24198250728863,
"grad_norm": 0.46187329292297363,
"learning_rate": 0.00024924504084014,
"loss": 3.1222,
"step": 100300
},
{
"epoch": 29.25655976676385,
"grad_norm": 0.4247560501098633,
"learning_rate": 0.0002490700116686114,
"loss": 3.1194,
"step": 100350
},
{
"epoch": 29.271137026239067,
"grad_norm": 0.4311138391494751,
"learning_rate": 0.0002488949824970828,
"loss": 3.1222,
"step": 100400
},
{
"epoch": 29.285714285714285,
"grad_norm": 0.4713177978992462,
"learning_rate": 0.00024871995332555424,
"loss": 3.1259,
"step": 100450
},
{
"epoch": 29.300291545189506,
"grad_norm": 0.48516809940338135,
"learning_rate": 0.00024854492415402567,
"loss": 3.144,
"step": 100500
},
{
"epoch": 29.314868804664723,
"grad_norm": 0.4494832754135132,
"learning_rate": 0.0002483698949824971,
"loss": 3.1357,
"step": 100550
},
{
"epoch": 29.32944606413994,
"grad_norm": 0.4127950966358185,
"learning_rate": 0.00024819486581096847,
"loss": 3.1354,
"step": 100600
},
{
"epoch": 29.34402332361516,
"grad_norm": 0.4290008544921875,
"learning_rate": 0.0002480198366394399,
"loss": 3.1288,
"step": 100650
},
{
"epoch": 29.35860058309038,
"grad_norm": 0.4509317874908447,
"learning_rate": 0.00024784480746791127,
"loss": 3.1309,
"step": 100700
},
{
"epoch": 29.373177842565596,
"grad_norm": 0.45486029982566833,
"learning_rate": 0.0002476697782963827,
"loss": 3.1393,
"step": 100750
},
{
"epoch": 29.387755102040817,
"grad_norm": 0.42254355549812317,
"learning_rate": 0.0002474947491248541,
"loss": 3.1312,
"step": 100800
},
{
"epoch": 29.402332361516034,
"grad_norm": 0.42709168791770935,
"learning_rate": 0.00024731971995332555,
"loss": 3.1449,
"step": 100850
},
{
"epoch": 29.416909620991255,
"grad_norm": 0.4207426905632019,
"learning_rate": 0.0002471446907817969,
"loss": 3.1467,
"step": 100900
},
{
"epoch": 29.431486880466473,
"grad_norm": 0.46891316771507263,
"learning_rate": 0.00024696966161026835,
"loss": 3.1429,
"step": 100950
},
{
"epoch": 29.44606413994169,
"grad_norm": 0.449980229139328,
"learning_rate": 0.0002467946324387398,
"loss": 3.1293,
"step": 101000
},
{
"epoch": 29.44606413994169,
"eval_accuracy": 0.37401199124553375,
"eval_loss": 3.549697160720825,
"eval_runtime": 182.3939,
"eval_samples_per_second": 91.143,
"eval_steps_per_second": 5.696,
"step": 101000
},
{
"epoch": 29.46064139941691,
"grad_norm": 0.4162759780883789,
"learning_rate": 0.0002466196032672112,
"loss": 3.1306,
"step": 101050
},
{
"epoch": 29.47521865889213,
"grad_norm": 0.44192177057266235,
"learning_rate": 0.0002464445740956826,
"loss": 3.1477,
"step": 101100
},
{
"epoch": 29.489795918367346,
"grad_norm": 0.44102761149406433,
"learning_rate": 0.000246269544924154,
"loss": 3.144,
"step": 101150
},
{
"epoch": 29.504373177842567,
"grad_norm": 0.4400504231452942,
"learning_rate": 0.00024609451575262543,
"loss": 3.1505,
"step": 101200
},
{
"epoch": 29.518950437317784,
"grad_norm": 0.46671465039253235,
"learning_rate": 0.0002459194865810968,
"loss": 3.1379,
"step": 101250
},
{
"epoch": 29.533527696793,
"grad_norm": 0.427625447511673,
"learning_rate": 0.0002457444574095682,
"loss": 3.1468,
"step": 101300
},
{
"epoch": 29.548104956268222,
"grad_norm": 0.458279550075531,
"learning_rate": 0.00024556942823803965,
"loss": 3.1479,
"step": 101350
},
{
"epoch": 29.56268221574344,
"grad_norm": 0.4169410169124603,
"learning_rate": 0.0002453943990665111,
"loss": 3.1595,
"step": 101400
},
{
"epoch": 29.57725947521866,
"grad_norm": 0.4758301079273224,
"learning_rate": 0.0002452193698949825,
"loss": 3.145,
"step": 101450
},
{
"epoch": 29.591836734693878,
"grad_norm": 0.4280540347099304,
"learning_rate": 0.0002450443407234539,
"loss": 3.1523,
"step": 101500
},
{
"epoch": 29.606413994169095,
"grad_norm": 0.43501847982406616,
"learning_rate": 0.0002448693115519253,
"loss": 3.1499,
"step": 101550
},
{
"epoch": 29.620991253644316,
"grad_norm": 0.43182411789894104,
"learning_rate": 0.0002446942823803967,
"loss": 3.1456,
"step": 101600
},
{
"epoch": 29.635568513119534,
"grad_norm": 0.4231785237789154,
"learning_rate": 0.0002445192532088681,
"loss": 3.1622,
"step": 101650
},
{
"epoch": 29.65014577259475,
"grad_norm": 0.4608800709247589,
"learning_rate": 0.00024434422403733953,
"loss": 3.1573,
"step": 101700
},
{
"epoch": 29.664723032069972,
"grad_norm": 0.45372170209884644,
"learning_rate": 0.00024416919486581096,
"loss": 3.1656,
"step": 101750
},
{
"epoch": 29.67930029154519,
"grad_norm": 0.43550124764442444,
"learning_rate": 0.00024399416569428236,
"loss": 3.1573,
"step": 101800
},
{
"epoch": 29.693877551020407,
"grad_norm": 0.4416351318359375,
"learning_rate": 0.00024381913652275376,
"loss": 3.1695,
"step": 101850
},
{
"epoch": 29.708454810495628,
"grad_norm": 0.4657592177391052,
"learning_rate": 0.00024364410735122519,
"loss": 3.1636,
"step": 101900
},
{
"epoch": 29.723032069970845,
"grad_norm": 0.46444663405418396,
"learning_rate": 0.00024346907817969659,
"loss": 3.17,
"step": 101950
},
{
"epoch": 29.737609329446066,
"grad_norm": 0.45541027188301086,
"learning_rate": 0.000243294049008168,
"loss": 3.1532,
"step": 102000
},
{
"epoch": 29.737609329446066,
"eval_accuracy": 0.3743919855573909,
"eval_loss": 3.5395925045013428,
"eval_runtime": 182.8413,
"eval_samples_per_second": 90.92,
"eval_steps_per_second": 5.683,
"step": 102000
},
{
"epoch": 29.752186588921283,
"grad_norm": 0.4434316158294678,
"learning_rate": 0.0002431190198366394,
"loss": 3.1644,
"step": 102050
},
{
"epoch": 29.7667638483965,
"grad_norm": 0.42412641644477844,
"learning_rate": 0.00024294399066511084,
"loss": 3.1669,
"step": 102100
},
{
"epoch": 29.78134110787172,
"grad_norm": 0.45610010623931885,
"learning_rate": 0.00024276896149358224,
"loss": 3.1668,
"step": 102150
},
{
"epoch": 29.79591836734694,
"grad_norm": 0.4540196359157562,
"learning_rate": 0.00024259393232205367,
"loss": 3.1735,
"step": 102200
},
{
"epoch": 29.810495626822156,
"grad_norm": 0.4640849530696869,
"learning_rate": 0.0002424189031505251,
"loss": 3.1642,
"step": 102250
},
{
"epoch": 29.825072886297377,
"grad_norm": 0.43845105171203613,
"learning_rate": 0.00024224387397899647,
"loss": 3.1592,
"step": 102300
},
{
"epoch": 29.839650145772595,
"grad_norm": 0.44450509548187256,
"learning_rate": 0.0002420688448074679,
"loss": 3.1668,
"step": 102350
},
{
"epoch": 29.854227405247812,
"grad_norm": 0.4392869472503662,
"learning_rate": 0.0002418938156359393,
"loss": 3.1691,
"step": 102400
},
{
"epoch": 29.868804664723033,
"grad_norm": 0.4368334412574768,
"learning_rate": 0.00024171878646441072,
"loss": 3.1672,
"step": 102450
},
{
"epoch": 29.88338192419825,
"grad_norm": 0.4234115779399872,
"learning_rate": 0.00024154375729288212,
"loss": 3.159,
"step": 102500
},
{
"epoch": 29.897959183673468,
"grad_norm": 0.43562474846839905,
"learning_rate": 0.00024136872812135354,
"loss": 3.1774,
"step": 102550
},
{
"epoch": 29.91253644314869,
"grad_norm": 0.48254790902137756,
"learning_rate": 0.00024119369894982497,
"loss": 3.1659,
"step": 102600
},
{
"epoch": 29.927113702623906,
"grad_norm": 0.4869457185268402,
"learning_rate": 0.00024101866977829637,
"loss": 3.1699,
"step": 102650
},
{
"epoch": 29.941690962099127,
"grad_norm": 0.44558846950531006,
"learning_rate": 0.0002408436406067678,
"loss": 3.1649,
"step": 102700
},
{
"epoch": 29.956268221574344,
"grad_norm": 0.4565999209880829,
"learning_rate": 0.00024066861143523917,
"loss": 3.1667,
"step": 102750
},
{
"epoch": 29.97084548104956,
"grad_norm": 0.4984128475189209,
"learning_rate": 0.0002404935822637106,
"loss": 3.1783,
"step": 102800
},
{
"epoch": 29.985422740524783,
"grad_norm": 0.430683970451355,
"learning_rate": 0.000240318553092182,
"loss": 3.167,
"step": 102850
},
{
"epoch": 30.0,
"grad_norm": 0.47676679491996765,
"learning_rate": 0.00024014352392065342,
"loss": 3.1735,
"step": 102900
},
{
"epoch": 30.014577259475217,
"grad_norm": 0.4585511386394501,
"learning_rate": 0.00023996849474912482,
"loss": 3.0723,
"step": 102950
},
{
"epoch": 30.02915451895044,
"grad_norm": 0.44532549381256104,
"learning_rate": 0.00023979346557759625,
"loss": 3.0842,
"step": 103000
},
{
"epoch": 30.02915451895044,
"eval_accuracy": 0.3739382996596532,
"eval_loss": 3.555943250656128,
"eval_runtime": 182.3297,
"eval_samples_per_second": 91.176,
"eval_steps_per_second": 5.698,
"step": 103000
},
{
"epoch": 30.043731778425656,
"grad_norm": 0.4594990909099579,
"learning_rate": 0.00023961843640606768,
"loss": 3.0802,
"step": 103050
},
{
"epoch": 30.058309037900873,
"grad_norm": 0.46604761481285095,
"learning_rate": 0.00023944340723453908,
"loss": 3.0899,
"step": 103100
},
{
"epoch": 30.072886297376094,
"grad_norm": 0.4697359502315521,
"learning_rate": 0.0002392683780630105,
"loss": 3.0908,
"step": 103150
},
{
"epoch": 30.08746355685131,
"grad_norm": 0.43642377853393555,
"learning_rate": 0.00023909334889148188,
"loss": 3.0955,
"step": 103200
},
{
"epoch": 30.102040816326532,
"grad_norm": 0.47640782594680786,
"learning_rate": 0.0002389183197199533,
"loss": 3.0973,
"step": 103250
},
{
"epoch": 30.11661807580175,
"grad_norm": 0.41699403524398804,
"learning_rate": 0.0002387432905484247,
"loss": 3.101,
"step": 103300
},
{
"epoch": 30.131195335276967,
"grad_norm": 0.4841386079788208,
"learning_rate": 0.00023856826137689613,
"loss": 3.1155,
"step": 103350
},
{
"epoch": 30.145772594752188,
"grad_norm": 0.47871625423431396,
"learning_rate": 0.00023839323220536756,
"loss": 3.101,
"step": 103400
},
{
"epoch": 30.160349854227405,
"grad_norm": 0.4356785714626312,
"learning_rate": 0.00023821820303383896,
"loss": 3.1043,
"step": 103450
},
{
"epoch": 30.174927113702623,
"grad_norm": 0.5174174308776855,
"learning_rate": 0.00023804317386231038,
"loss": 3.1073,
"step": 103500
},
{
"epoch": 30.189504373177844,
"grad_norm": 0.44571802020072937,
"learning_rate": 0.00023786814469078178,
"loss": 3.1039,
"step": 103550
},
{
"epoch": 30.20408163265306,
"grad_norm": 0.45123785734176636,
"learning_rate": 0.0002376931155192532,
"loss": 3.1133,
"step": 103600
},
{
"epoch": 30.21865889212828,
"grad_norm": 0.4339126646518707,
"learning_rate": 0.00023751808634772458,
"loss": 3.1271,
"step": 103650
},
{
"epoch": 30.2332361516035,
"grad_norm": 0.4566538631916046,
"learning_rate": 0.000237343057176196,
"loss": 3.1273,
"step": 103700
},
{
"epoch": 30.247813411078717,
"grad_norm": 0.5019856691360474,
"learning_rate": 0.0002371680280046674,
"loss": 3.1091,
"step": 103750
},
{
"epoch": 30.262390670553934,
"grad_norm": 0.4492882490158081,
"learning_rate": 0.00023699299883313884,
"loss": 3.0973,
"step": 103800
},
{
"epoch": 30.276967930029155,
"grad_norm": 0.44337910413742065,
"learning_rate": 0.00023681796966161026,
"loss": 3.1213,
"step": 103850
},
{
"epoch": 30.291545189504372,
"grad_norm": 0.4558570683002472,
"learning_rate": 0.00023664294049008166,
"loss": 3.1266,
"step": 103900
},
{
"epoch": 30.306122448979593,
"grad_norm": 0.4441608786582947,
"learning_rate": 0.0002364679113185531,
"loss": 3.1165,
"step": 103950
},
{
"epoch": 30.32069970845481,
"grad_norm": 0.4322434365749359,
"learning_rate": 0.0002362928821470245,
"loss": 3.1202,
"step": 104000
},
{
"epoch": 30.32069970845481,
"eval_accuracy": 0.37388403157484335,
"eval_loss": 3.5529487133026123,
"eval_runtime": 182.4101,
"eval_samples_per_second": 91.135,
"eval_steps_per_second": 5.696,
"step": 104000
},
{
"epoch": 30.335276967930028,
"grad_norm": 0.4699609875679016,
"learning_rate": 0.00023611785297549592,
"loss": 3.1181,
"step": 104050
},
{
"epoch": 30.34985422740525,
"grad_norm": 0.4313887655735016,
"learning_rate": 0.0002359428238039673,
"loss": 3.1199,
"step": 104100
},
{
"epoch": 30.364431486880466,
"grad_norm": 0.4372082054615021,
"learning_rate": 0.00023576779463243872,
"loss": 3.1379,
"step": 104150
},
{
"epoch": 30.379008746355684,
"grad_norm": 0.48427414894104004,
"learning_rate": 0.00023559276546091011,
"loss": 3.1321,
"step": 104200
},
{
"epoch": 30.393586005830905,
"grad_norm": 0.47731345891952515,
"learning_rate": 0.00023541773628938154,
"loss": 3.138,
"step": 104250
},
{
"epoch": 30.408163265306122,
"grad_norm": 0.44795680046081543,
"learning_rate": 0.00023524270711785297,
"loss": 3.1338,
"step": 104300
},
{
"epoch": 30.42274052478134,
"grad_norm": 0.4745403230190277,
"learning_rate": 0.00023506767794632437,
"loss": 3.1312,
"step": 104350
},
{
"epoch": 30.43731778425656,
"grad_norm": 0.4556351602077484,
"learning_rate": 0.0002348926487747958,
"loss": 3.1254,
"step": 104400
},
{
"epoch": 30.451895043731778,
"grad_norm": 0.4545431435108185,
"learning_rate": 0.0002347176196032672,
"loss": 3.1416,
"step": 104450
},
{
"epoch": 30.466472303207,
"grad_norm": 0.4592565596103668,
"learning_rate": 0.00023454259043173862,
"loss": 3.1285,
"step": 104500
},
{
"epoch": 30.481049562682216,
"grad_norm": 0.456776887178421,
"learning_rate": 0.00023436756126021,
"loss": 3.1428,
"step": 104550
},
{
"epoch": 30.495626822157433,
"grad_norm": 0.4269309639930725,
"learning_rate": 0.00023419253208868142,
"loss": 3.1424,
"step": 104600
},
{
"epoch": 30.510204081632654,
"grad_norm": 0.436102032661438,
"learning_rate": 0.00023401750291715285,
"loss": 3.1285,
"step": 104650
},
{
"epoch": 30.52478134110787,
"grad_norm": 0.43040576577186584,
"learning_rate": 0.00023384247374562425,
"loss": 3.1361,
"step": 104700
},
{
"epoch": 30.53935860058309,
"grad_norm": 0.48119938373565674,
"learning_rate": 0.00023366744457409567,
"loss": 3.1409,
"step": 104750
},
{
"epoch": 30.55393586005831,
"grad_norm": 0.4720640778541565,
"learning_rate": 0.00023349241540256707,
"loss": 3.1386,
"step": 104800
},
{
"epoch": 30.568513119533527,
"grad_norm": 0.45356231927871704,
"learning_rate": 0.0002333173862310385,
"loss": 3.1327,
"step": 104850
},
{
"epoch": 30.583090379008745,
"grad_norm": 0.4686661958694458,
"learning_rate": 0.0002331423570595099,
"loss": 3.1445,
"step": 104900
},
{
"epoch": 30.597667638483966,
"grad_norm": 0.44459545612335205,
"learning_rate": 0.00023296732788798133,
"loss": 3.1407,
"step": 104950
},
{
"epoch": 30.612244897959183,
"grad_norm": 0.45365190505981445,
"learning_rate": 0.0002327922987164527,
"loss": 3.1488,
"step": 105000
},
{
"epoch": 30.612244897959183,
"eval_accuracy": 0.37435149050061306,
"eval_loss": 3.5447380542755127,
"eval_runtime": 182.8207,
"eval_samples_per_second": 90.931,
"eval_steps_per_second": 5.683,
"step": 105000
},
{
"epoch": 30.626822157434404,
"grad_norm": 0.4460262656211853,
"learning_rate": 0.00023261726954492413,
"loss": 3.1396,
"step": 105050
},
{
"epoch": 30.64139941690962,
"grad_norm": 0.45241525769233704,
"learning_rate": 0.00023244224037339555,
"loss": 3.1427,
"step": 105100
},
{
"epoch": 30.65597667638484,
"grad_norm": 0.44525688886642456,
"learning_rate": 0.00023226721120186695,
"loss": 3.1477,
"step": 105150
},
{
"epoch": 30.67055393586006,
"grad_norm": 0.4357472360134125,
"learning_rate": 0.00023209218203033838,
"loss": 3.1524,
"step": 105200
},
{
"epoch": 30.685131195335277,
"grad_norm": 0.46180203557014465,
"learning_rate": 0.00023191715285880978,
"loss": 3.1406,
"step": 105250
},
{
"epoch": 30.699708454810494,
"grad_norm": 0.43893539905548096,
"learning_rate": 0.0002317421236872812,
"loss": 3.1644,
"step": 105300
},
{
"epoch": 30.714285714285715,
"grad_norm": 0.45540228486061096,
"learning_rate": 0.0002315670945157526,
"loss": 3.1574,
"step": 105350
},
{
"epoch": 30.728862973760933,
"grad_norm": 0.4692342281341553,
"learning_rate": 0.00023139206534422403,
"loss": 3.1624,
"step": 105400
},
{
"epoch": 30.74344023323615,
"grad_norm": 0.4671376049518585,
"learning_rate": 0.0002312170361726954,
"loss": 3.1451,
"step": 105450
},
{
"epoch": 30.75801749271137,
"grad_norm": 0.48794859647750854,
"learning_rate": 0.00023104200700116683,
"loss": 3.1566,
"step": 105500
},
{
"epoch": 30.77259475218659,
"grad_norm": 0.4582727551460266,
"learning_rate": 0.00023086697782963826,
"loss": 3.1436,
"step": 105550
},
{
"epoch": 30.78717201166181,
"grad_norm": 0.5062020421028137,
"learning_rate": 0.00023069194865810966,
"loss": 3.1498,
"step": 105600
},
{
"epoch": 30.801749271137027,
"grad_norm": 0.4618031978607178,
"learning_rate": 0.00023051691948658109,
"loss": 3.1509,
"step": 105650
},
{
"epoch": 30.816326530612244,
"grad_norm": 0.4281717538833618,
"learning_rate": 0.00023034189031505249,
"loss": 3.1481,
"step": 105700
},
{
"epoch": 30.830903790087465,
"grad_norm": 0.45748892426490784,
"learning_rate": 0.0002301668611435239,
"loss": 3.1511,
"step": 105750
},
{
"epoch": 30.845481049562682,
"grad_norm": 0.4601440131664276,
"learning_rate": 0.0002299918319719953,
"loss": 3.1632,
"step": 105800
},
{
"epoch": 30.8600583090379,
"grad_norm": 0.45072951912879944,
"learning_rate": 0.00022981680280046674,
"loss": 3.1635,
"step": 105850
},
{
"epoch": 30.87463556851312,
"grad_norm": 0.45450711250305176,
"learning_rate": 0.00022964177362893817,
"loss": 3.1546,
"step": 105900
},
{
"epoch": 30.889212827988338,
"grad_norm": 0.4641033113002777,
"learning_rate": 0.00022946674445740954,
"loss": 3.1569,
"step": 105950
},
{
"epoch": 30.903790087463555,
"grad_norm": 0.4180297553539276,
"learning_rate": 0.00022929171528588097,
"loss": 3.1591,
"step": 106000
},
{
"epoch": 30.903790087463555,
"eval_accuracy": 0.3749691578346634,
"eval_loss": 3.5384469032287598,
"eval_runtime": 182.6987,
"eval_samples_per_second": 90.991,
"eval_steps_per_second": 5.687,
"step": 106000
},
{
"epoch": 30.918367346938776,
"grad_norm": 0.4221542179584503,
"learning_rate": 0.00022911668611435236,
"loss": 3.1583,
"step": 106050
},
{
"epoch": 30.932944606413994,
"grad_norm": 0.47019174695014954,
"learning_rate": 0.0002289416569428238,
"loss": 3.1603,
"step": 106100
},
{
"epoch": 30.947521865889215,
"grad_norm": 0.43644431233406067,
"learning_rate": 0.0002287666277712952,
"loss": 3.1593,
"step": 106150
},
{
"epoch": 30.962099125364432,
"grad_norm": 0.4441497027873993,
"learning_rate": 0.00022859159859976662,
"loss": 3.1701,
"step": 106200
},
{
"epoch": 30.97667638483965,
"grad_norm": 0.43236950039863586,
"learning_rate": 0.00022841656942823802,
"loss": 3.1581,
"step": 106250
},
{
"epoch": 30.99125364431487,
"grad_norm": 0.4527025520801544,
"learning_rate": 0.00022824154025670944,
"loss": 3.1718,
"step": 106300
},
{
"epoch": 31.005830903790088,
"grad_norm": 0.4435344934463501,
"learning_rate": 0.00022806651108518087,
"loss": 3.1195,
"step": 106350
},
{
"epoch": 31.020408163265305,
"grad_norm": 0.4386453926563263,
"learning_rate": 0.00022789148191365224,
"loss": 3.072,
"step": 106400
},
{
"epoch": 31.034985422740526,
"grad_norm": 0.45795875787734985,
"learning_rate": 0.00022771645274212367,
"loss": 3.0613,
"step": 106450
},
{
"epoch": 31.049562682215743,
"grad_norm": 0.424356609582901,
"learning_rate": 0.00022754142357059507,
"loss": 3.0828,
"step": 106500
},
{
"epoch": 31.06413994169096,
"grad_norm": 0.44901180267333984,
"learning_rate": 0.0002273663943990665,
"loss": 3.0799,
"step": 106550
},
{
"epoch": 31.07871720116618,
"grad_norm": 0.4648335576057434,
"learning_rate": 0.0002271913652275379,
"loss": 3.0804,
"step": 106600
},
{
"epoch": 31.0932944606414,
"grad_norm": 0.4268786311149597,
"learning_rate": 0.00022701633605600932,
"loss": 3.0826,
"step": 106650
},
{
"epoch": 31.107871720116616,
"grad_norm": 0.5035303831100464,
"learning_rate": 0.00022684130688448072,
"loss": 3.0941,
"step": 106700
},
{
"epoch": 31.122448979591837,
"grad_norm": 0.46598684787750244,
"learning_rate": 0.00022666627771295215,
"loss": 3.0922,
"step": 106750
},
{
"epoch": 31.137026239067055,
"grad_norm": 0.4177006483078003,
"learning_rate": 0.00022649124854142358,
"loss": 3.1001,
"step": 106800
},
{
"epoch": 31.151603498542276,
"grad_norm": 0.44359204173088074,
"learning_rate": 0.00022631621936989495,
"loss": 3.0948,
"step": 106850
},
{
"epoch": 31.166180758017493,
"grad_norm": 0.4663641154766083,
"learning_rate": 0.00022614119019836638,
"loss": 3.1159,
"step": 106900
},
{
"epoch": 31.18075801749271,
"grad_norm": 0.43660664558410645,
"learning_rate": 0.00022596616102683778,
"loss": 3.0966,
"step": 106950
},
{
"epoch": 31.19533527696793,
"grad_norm": 0.45154282450675964,
"learning_rate": 0.0002257911318553092,
"loss": 3.1026,
"step": 107000
},
{
"epoch": 31.19533527696793,
"eval_accuracy": 0.3739278227408938,
"eval_loss": 3.558981418609619,
"eval_runtime": 182.4643,
"eval_samples_per_second": 91.108,
"eval_steps_per_second": 5.694,
"step": 107000
},
{
"epoch": 31.20991253644315,
"grad_norm": 0.4914431571960449,
"learning_rate": 0.0002256161026837806,
"loss": 3.1109,
"step": 107050
},
{
"epoch": 31.224489795918366,
"grad_norm": 0.4672841727733612,
"learning_rate": 0.00022544107351225203,
"loss": 3.0984,
"step": 107100
},
{
"epoch": 31.239067055393587,
"grad_norm": 0.46421849727630615,
"learning_rate": 0.00022526604434072346,
"loss": 3.1086,
"step": 107150
},
{
"epoch": 31.253644314868804,
"grad_norm": 0.4598676860332489,
"learning_rate": 0.00022509101516919486,
"loss": 3.0982,
"step": 107200
},
{
"epoch": 31.26822157434402,
"grad_norm": 0.4586013853549957,
"learning_rate": 0.00022491598599766628,
"loss": 3.0929,
"step": 107250
},
{
"epoch": 31.282798833819243,
"grad_norm": 0.44894787669181824,
"learning_rate": 0.00022474095682613766,
"loss": 3.1093,
"step": 107300
},
{
"epoch": 31.29737609329446,
"grad_norm": 0.43039682507514954,
"learning_rate": 0.00022456592765460908,
"loss": 3.1057,
"step": 107350
},
{
"epoch": 31.31195335276968,
"grad_norm": 0.45393604040145874,
"learning_rate": 0.00022439089848308048,
"loss": 3.1132,
"step": 107400
},
{
"epoch": 31.3265306122449,
"grad_norm": 0.5165916681289673,
"learning_rate": 0.0002242158693115519,
"loss": 3.1134,
"step": 107450
},
{
"epoch": 31.341107871720116,
"grad_norm": 0.4439025819301605,
"learning_rate": 0.0002240408401400233,
"loss": 3.126,
"step": 107500
},
{
"epoch": 31.355685131195337,
"grad_norm": 0.4372089207172394,
"learning_rate": 0.00022386581096849474,
"loss": 3.1231,
"step": 107550
},
{
"epoch": 31.370262390670554,
"grad_norm": 0.4908539056777954,
"learning_rate": 0.00022369078179696616,
"loss": 3.113,
"step": 107600
},
{
"epoch": 31.38483965014577,
"grad_norm": 0.47500312328338623,
"learning_rate": 0.00022351575262543756,
"loss": 3.1198,
"step": 107650
},
{
"epoch": 31.399416909620992,
"grad_norm": 0.49036237597465515,
"learning_rate": 0.000223340723453909,
"loss": 3.1199,
"step": 107700
},
{
"epoch": 31.41399416909621,
"grad_norm": 0.4451589286327362,
"learning_rate": 0.00022316569428238036,
"loss": 3.1185,
"step": 107750
},
{
"epoch": 31.428571428571427,
"grad_norm": 0.4553769528865814,
"learning_rate": 0.0002229906651108518,
"loss": 3.126,
"step": 107800
},
{
"epoch": 31.443148688046648,
"grad_norm": 0.43727388978004456,
"learning_rate": 0.0002228156359393232,
"loss": 3.1303,
"step": 107850
},
{
"epoch": 31.457725947521865,
"grad_norm": 0.46281135082244873,
"learning_rate": 0.00022264060676779461,
"loss": 3.1318,
"step": 107900
},
{
"epoch": 31.472303206997086,
"grad_norm": 0.48131096363067627,
"learning_rate": 0.00022246557759626601,
"loss": 3.1295,
"step": 107950
},
{
"epoch": 31.486880466472304,
"grad_norm": 0.4911995828151703,
"learning_rate": 0.00022229054842473744,
"loss": 3.1355,
"step": 108000
},
{
"epoch": 31.486880466472304,
"eval_accuracy": 0.3740834461858365,
"eval_loss": 3.5502305030822754,
"eval_runtime": 182.3882,
"eval_samples_per_second": 91.146,
"eval_steps_per_second": 5.697,
"step": 108000
},
{
"epoch": 31.50145772594752,
"grad_norm": 0.508841335773468,
"learning_rate": 0.00022211551925320887,
"loss": 3.121,
"step": 108050
},
{
"epoch": 31.516034985422742,
"grad_norm": 0.488435834646225,
"learning_rate": 0.00022194049008168027,
"loss": 3.1243,
"step": 108100
},
{
"epoch": 31.53061224489796,
"grad_norm": 0.46807485818862915,
"learning_rate": 0.0002217654609101517,
"loss": 3.1247,
"step": 108150
},
{
"epoch": 31.545189504373177,
"grad_norm": 0.4565454125404358,
"learning_rate": 0.00022159043173862307,
"loss": 3.1369,
"step": 108200
},
{
"epoch": 31.559766763848398,
"grad_norm": 0.47071194648742676,
"learning_rate": 0.0002214154025670945,
"loss": 3.1403,
"step": 108250
},
{
"epoch": 31.574344023323615,
"grad_norm": 0.4571170210838318,
"learning_rate": 0.0002212403733955659,
"loss": 3.1373,
"step": 108300
},
{
"epoch": 31.588921282798832,
"grad_norm": 0.46521154046058655,
"learning_rate": 0.00022106534422403732,
"loss": 3.1306,
"step": 108350
},
{
"epoch": 31.603498542274053,
"grad_norm": 0.46532467007637024,
"learning_rate": 0.00022089031505250875,
"loss": 3.1411,
"step": 108400
},
{
"epoch": 31.61807580174927,
"grad_norm": 0.46492308378219604,
"learning_rate": 0.00022071528588098015,
"loss": 3.1392,
"step": 108450
},
{
"epoch": 31.632653061224488,
"grad_norm": 0.4574034810066223,
"learning_rate": 0.00022054025670945157,
"loss": 3.1387,
"step": 108500
},
{
"epoch": 31.64723032069971,
"grad_norm": 0.4768873453140259,
"learning_rate": 0.00022036522753792297,
"loss": 3.1437,
"step": 108550
},
{
"epoch": 31.661807580174926,
"grad_norm": 0.4694845676422119,
"learning_rate": 0.0002201901983663944,
"loss": 3.1375,
"step": 108600
},
{
"epoch": 31.676384839650147,
"grad_norm": 0.4521714746952057,
"learning_rate": 0.00022001516919486577,
"loss": 3.1324,
"step": 108650
},
{
"epoch": 31.690962099125365,
"grad_norm": 0.44469496607780457,
"learning_rate": 0.0002198401400233372,
"loss": 3.1336,
"step": 108700
},
{
"epoch": 31.705539358600582,
"grad_norm": 0.43908751010894775,
"learning_rate": 0.0002196651108518086,
"loss": 3.1413,
"step": 108750
},
{
"epoch": 31.720116618075803,
"grad_norm": 0.4636845588684082,
"learning_rate": 0.00021949008168028003,
"loss": 3.1506,
"step": 108800
},
{
"epoch": 31.73469387755102,
"grad_norm": 0.45257994532585144,
"learning_rate": 0.00021931505250875145,
"loss": 3.1482,
"step": 108850
},
{
"epoch": 31.749271137026238,
"grad_norm": 0.4656467139720917,
"learning_rate": 0.00021914002333722285,
"loss": 3.1473,
"step": 108900
},
{
"epoch": 31.76384839650146,
"grad_norm": 0.4685870409011841,
"learning_rate": 0.00021896499416569428,
"loss": 3.1573,
"step": 108950
},
{
"epoch": 31.778425655976676,
"grad_norm": 0.4359848201274872,
"learning_rate": 0.00021878996499416568,
"loss": 3.1469,
"step": 109000
},
{
"epoch": 31.778425655976676,
"eval_accuracy": 0.37454784443871025,
"eval_loss": 3.5438437461853027,
"eval_runtime": 182.5841,
"eval_samples_per_second": 91.048,
"eval_steps_per_second": 5.691,
"step": 109000
},
{
"epoch": 31.778425655976676,
"step": 109000,
"total_flos": 2.278393618563072e+18,
"train_loss": 0.8399101010943771,
"train_runtime": 58098.688,
"train_samples_per_second": 236.142,
"train_steps_per_second": 2.952
}
],
"logging_steps": 50,
"max_steps": 171500,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 11
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.278393618563072e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}