{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 80000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.25e-05, "grad_norm": 61.570499420166016, "learning_rate": 0.0, "loss": 10.836095809936523, "step": 1 }, { "epoch": 0.00125, "grad_norm": 127.05734252929688, "learning_rate": 1.98e-06, "loss": 19.720439255839647, "step": 100 }, { "epoch": 0.0025, "grad_norm": 240.12950134277344, "learning_rate": 3.980000000000001e-06, "loss": 19.435438232421873, "step": 200 }, { "epoch": 0.00375, "grad_norm": 165.71978759765625, "learning_rate": 5.98e-06, "loss": 17.33456298828125, "step": 300 }, { "epoch": 0.005, "grad_norm": 476.399169921875, "learning_rate": 7.980000000000002e-06, "loss": 14.20736328125, "step": 400 }, { "epoch": 0.00625, "grad_norm": 73.51789855957031, "learning_rate": 9.980000000000001e-06, "loss": 10.920599365234375, "step": 500 }, { "epoch": 0.0075, "grad_norm": 50.991634368896484, "learning_rate": 9.987547169811321e-06, "loss": 9.143378295898437, "step": 600 }, { "epoch": 0.00875, "grad_norm": 150.61373901367188, "learning_rate": 9.97496855345912e-06, "loss": 7.4310791015625, "step": 700 }, { "epoch": 0.01, "grad_norm": 188.94094848632812, "learning_rate": 9.962389937106918e-06, "loss": 6.5367333984375, "step": 800 }, { "epoch": 0.01125, "grad_norm": 37.20081329345703, "learning_rate": 9.949811320754717e-06, "loss": 7.25988525390625, "step": 900 }, { "epoch": 0.0125, "grad_norm": 52.72426223754883, "learning_rate": 9.937232704402516e-06, "loss": 6.920318603515625, "step": 1000 }, { "epoch": 0.01375, "grad_norm": 77.21712493896484, "learning_rate": 9.924654088050316e-06, "loss": 6.468406982421875, "step": 1100 }, { "epoch": 0.015, "grad_norm": 73.15080261230469, "learning_rate": 9.912075471698114e-06, "loss": 6.187958374023437, "step": 1200 }, { "epoch": 0.01625, "grad_norm": 21.300718307495117, "learning_rate": 9.899496855345913e-06, "loss": 5.908817749023438, "step": 1300 }, { "epoch": 0.0175, "grad_norm": 33.9874153137207, "learning_rate": 9.886918238993712e-06, "loss": 5.750390625, "step": 1400 }, { "epoch": 0.01875, "grad_norm": 46.610294342041016, "learning_rate": 9.87433962264151e-06, "loss": 5.010645446777343, "step": 1500 }, { "epoch": 0.02, "grad_norm": 116.64041137695312, "learning_rate": 9.861761006289309e-06, "loss": 4.999881591796875, "step": 1600 }, { "epoch": 0.02125, "grad_norm": 16.1014347076416, "learning_rate": 9.849182389937107e-06, "loss": 5.607005004882812, "step": 1700 }, { "epoch": 0.0225, "grad_norm": 22.09562873840332, "learning_rate": 9.836603773584906e-06, "loss": 6.2442431640625, "step": 1800 }, { "epoch": 0.02375, "grad_norm": 62.120033264160156, "learning_rate": 9.824025157232704e-06, "loss": 5.164766845703125, "step": 1900 }, { "epoch": 0.025, "grad_norm": 12.194104194641113, "learning_rate": 9.811446540880503e-06, "loss": 4.69855224609375, "step": 2000 }, { "epoch": 0.02625, "grad_norm": 21.063518524169922, "learning_rate": 9.798867924528302e-06, "loss": 5.364295654296875, "step": 2100 }, { "epoch": 0.0275, "grad_norm": 28.930068969726562, "learning_rate": 9.786289308176102e-06, "loss": 5.119418334960938, "step": 2200 }, { "epoch": 0.02875, "grad_norm": 46.796871185302734, "learning_rate": 9.7737106918239e-06, "loss": 5.237764892578125, "step": 2300 }, { "epoch": 0.03, "grad_norm": 17.024911880493164, "learning_rate": 9.761132075471699e-06, "loss": 5.059874877929688, "step": 2400 }, { "epoch": 0.03125, "grad_norm": 74.16903686523438, "learning_rate": 9.748553459119498e-06, "loss": 4.9656503295898435, "step": 2500 }, { "epoch": 0.0325, "grad_norm": 22.257841110229492, "learning_rate": 9.735974842767296e-06, "loss": 5.18156982421875, "step": 2600 }, { "epoch": 0.03375, "grad_norm": 47.79827117919922, "learning_rate": 9.723396226415095e-06, "loss": 4.9468075561523435, "step": 2700 }, { "epoch": 0.035, "grad_norm": 33.0962028503418, "learning_rate": 9.710817610062893e-06, "loss": 4.722409362792969, "step": 2800 }, { "epoch": 0.03625, "grad_norm": 20.282230377197266, "learning_rate": 9.698238993710692e-06, "loss": 5.242295532226563, "step": 2900 }, { "epoch": 0.0375, "grad_norm": 25.866464614868164, "learning_rate": 9.68566037735849e-06, "loss": 4.773485717773437, "step": 3000 }, { "epoch": 0.03875, "grad_norm": 26.524415969848633, "learning_rate": 9.67308176100629e-06, "loss": 5.017108154296875, "step": 3100 }, { "epoch": 0.04, "grad_norm": 18.342044830322266, "learning_rate": 9.66050314465409e-06, "loss": 4.6452767944335935, "step": 3200 }, { "epoch": 0.04125, "grad_norm": 20.58109474182129, "learning_rate": 9.647924528301888e-06, "loss": 5.00500732421875, "step": 3300 }, { "epoch": 0.0425, "grad_norm": 294.05224609375, "learning_rate": 9.635345911949687e-06, "loss": 4.595547790527344, "step": 3400 }, { "epoch": 0.04375, "grad_norm": 32.15380096435547, "learning_rate": 9.622767295597485e-06, "loss": 4.3764559936523435, "step": 3500 }, { "epoch": 0.045, "grad_norm": 28.0123233795166, "learning_rate": 9.610188679245284e-06, "loss": 4.383021850585937, "step": 3600 }, { "epoch": 0.04625, "grad_norm": 33.30596923828125, "learning_rate": 9.597610062893082e-06, "loss": 4.354810180664063, "step": 3700 }, { "epoch": 0.0475, "grad_norm": 26.94670295715332, "learning_rate": 9.585031446540881e-06, "loss": 3.8970941162109374, "step": 3800 }, { "epoch": 0.04875, "grad_norm": 28.224117279052734, "learning_rate": 9.57245283018868e-06, "loss": 3.8559701538085935, "step": 3900 }, { "epoch": 0.05, "grad_norm": 20.57750701904297, "learning_rate": 9.559874213836478e-06, "loss": 4.92269287109375, "step": 4000 }, { "epoch": 0.05125, "grad_norm": 18.131223678588867, "learning_rate": 9.547295597484277e-06, "loss": 4.633423461914062, "step": 4100 }, { "epoch": 0.0525, "grad_norm": 29.649024963378906, "learning_rate": 9.534716981132075e-06, "loss": 4.432715759277344, "step": 4200 }, { "epoch": 0.05375, "grad_norm": 34.53123474121094, "learning_rate": 9.522138364779876e-06, "loss": 4.242776489257812, "step": 4300 }, { "epoch": 0.055, "grad_norm": 19.823360443115234, "learning_rate": 9.509559748427674e-06, "loss": 4.448049011230469, "step": 4400 }, { "epoch": 0.05625, "grad_norm": 20.857120513916016, "learning_rate": 9.496981132075473e-06, "loss": 4.65389404296875, "step": 4500 }, { "epoch": 0.0575, "grad_norm": 20.397987365722656, "learning_rate": 9.484402515723271e-06, "loss": 4.350996704101562, "step": 4600 }, { "epoch": 0.05875, "grad_norm": 22.439617156982422, "learning_rate": 9.47182389937107e-06, "loss": 4.1993505859375, "step": 4700 }, { "epoch": 0.06, "grad_norm": 5.18347692489624, "learning_rate": 9.459245283018869e-06, "loss": 4.38233154296875, "step": 4800 }, { "epoch": 0.06125, "grad_norm": 20.546842575073242, "learning_rate": 9.446666666666667e-06, "loss": 4.136148681640625, "step": 4900 }, { "epoch": 0.0625, "grad_norm": 30.28652572631836, "learning_rate": 9.434088050314466e-06, "loss": 4.002590026855469, "step": 5000 }, { "epoch": 0.06375, "grad_norm": 12.638691902160645, "learning_rate": 9.421509433962264e-06, "loss": 3.565917663574219, "step": 5100 }, { "epoch": 0.065, "grad_norm": 22.384016036987305, "learning_rate": 9.408930817610063e-06, "loss": 3.9356478881835937, "step": 5200 }, { "epoch": 0.06625, "grad_norm": 18.600645065307617, "learning_rate": 9.396352201257861e-06, "loss": 4.357646484375, "step": 5300 }, { "epoch": 0.0675, "grad_norm": 19.07636260986328, "learning_rate": 9.383773584905662e-06, "loss": 4.550914306640625, "step": 5400 }, { "epoch": 0.06875, "grad_norm": 36.520381927490234, "learning_rate": 9.37119496855346e-06, "loss": 4.02431884765625, "step": 5500 }, { "epoch": 0.07, "grad_norm": 31.87358856201172, "learning_rate": 9.358616352201259e-06, "loss": 4.209980163574219, "step": 5600 }, { "epoch": 0.07125, "grad_norm": 25.108585357666016, "learning_rate": 9.346037735849058e-06, "loss": 4.5496435546875, "step": 5700 }, { "epoch": 0.0725, "grad_norm": 18.682506561279297, "learning_rate": 9.333459119496856e-06, "loss": 4.466670837402344, "step": 5800 }, { "epoch": 0.07375, "grad_norm": 18.071388244628906, "learning_rate": 9.320880503144655e-06, "loss": 3.757862243652344, "step": 5900 }, { "epoch": 0.075, "grad_norm": 54.04276657104492, "learning_rate": 9.308301886792453e-06, "loss": 4.119568786621094, "step": 6000 }, { "epoch": 0.07625, "grad_norm": 20.94058609008789, "learning_rate": 9.295723270440252e-06, "loss": 3.6099752807617187, "step": 6100 }, { "epoch": 0.0775, "grad_norm": 28.546890258789062, "learning_rate": 9.28314465408805e-06, "loss": 4.167696533203125, "step": 6200 }, { "epoch": 0.07875, "grad_norm": 22.076147079467773, "learning_rate": 9.270566037735849e-06, "loss": 3.8074371337890627, "step": 6300 }, { "epoch": 0.08, "grad_norm": 11.4718599319458, "learning_rate": 9.25798742138365e-06, "loss": 3.5398162841796874, "step": 6400 }, { "epoch": 0.08125, "grad_norm": 20.66476058959961, "learning_rate": 9.245408805031448e-06, "loss": 3.8792694091796873, "step": 6500 }, { "epoch": 0.0825, "grad_norm": 22.23533058166504, "learning_rate": 9.232830188679246e-06, "loss": 3.6433590698242186, "step": 6600 }, { "epoch": 0.08375, "grad_norm": 17.721628189086914, "learning_rate": 9.220251572327045e-06, "loss": 3.73576171875, "step": 6700 }, { "epoch": 0.085, "grad_norm": 53.24715805053711, "learning_rate": 9.207672955974844e-06, "loss": 3.7794915771484376, "step": 6800 }, { "epoch": 0.08625, "grad_norm": 20.755985260009766, "learning_rate": 9.195094339622642e-06, "loss": 3.84636962890625, "step": 6900 }, { "epoch": 0.0875, "grad_norm": 26.36724281311035, "learning_rate": 9.18251572327044e-06, "loss": 3.7252166748046873, "step": 7000 }, { "epoch": 0.08875, "grad_norm": 24.380794525146484, "learning_rate": 9.16993710691824e-06, "loss": 3.7519256591796877, "step": 7100 }, { "epoch": 0.09, "grad_norm": 30.122207641601562, "learning_rate": 9.157358490566038e-06, "loss": 3.6482272338867188, "step": 7200 }, { "epoch": 0.09125, "grad_norm": 21.8885555267334, "learning_rate": 9.144779874213837e-06, "loss": 3.8224606323242187, "step": 7300 }, { "epoch": 0.0925, "grad_norm": 81.36309814453125, "learning_rate": 9.132201257861635e-06, "loss": 3.9120883178710937, "step": 7400 }, { "epoch": 0.09375, "grad_norm": 33.38832473754883, "learning_rate": 9.119622641509435e-06, "loss": 3.3836550903320313, "step": 7500 }, { "epoch": 0.095, "grad_norm": 3.9297144412994385, "learning_rate": 9.107044025157234e-06, "loss": 3.252887878417969, "step": 7600 }, { "epoch": 0.09625, "grad_norm": 25.273887634277344, "learning_rate": 9.094465408805033e-06, "loss": 3.3780868530273436, "step": 7700 }, { "epoch": 0.0975, "grad_norm": 32.79280471801758, "learning_rate": 9.081886792452831e-06, "loss": 3.3074383544921875, "step": 7800 }, { "epoch": 0.09875, "grad_norm": 27.27655601501465, "learning_rate": 9.06930817610063e-06, "loss": 3.4328045654296875, "step": 7900 }, { "epoch": 0.1, "grad_norm": 18.67003631591797, "learning_rate": 9.056729559748428e-06, "loss": 3.4785955810546874, "step": 8000 }, { "epoch": 0.10125, "grad_norm": 16.710369110107422, "learning_rate": 9.044150943396227e-06, "loss": 3.604156188964844, "step": 8100 }, { "epoch": 0.1025, "grad_norm": 39.6090202331543, "learning_rate": 9.031572327044026e-06, "loss": 3.240309753417969, "step": 8200 }, { "epoch": 0.10375, "grad_norm": 50.57542419433594, "learning_rate": 9.018993710691824e-06, "loss": 3.2084616088867186, "step": 8300 }, { "epoch": 0.105, "grad_norm": 3.647831916809082, "learning_rate": 9.006415094339623e-06, "loss": 3.3365228271484373, "step": 8400 }, { "epoch": 0.10625, "grad_norm": 25.59325408935547, "learning_rate": 8.993836477987421e-06, "loss": 3.3707135009765623, "step": 8500 }, { "epoch": 0.1075, "grad_norm": 29.391414642333984, "learning_rate": 8.981257861635222e-06, "loss": 3.698001708984375, "step": 8600 }, { "epoch": 0.10875, "grad_norm": 20.907129287719727, "learning_rate": 8.96867924528302e-06, "loss": 3.4555450439453126, "step": 8700 }, { "epoch": 0.11, "grad_norm": 1.1572871208190918, "learning_rate": 8.956100628930819e-06, "loss": 3.2748760986328125, "step": 8800 }, { "epoch": 0.11125, "grad_norm": 31.631296157836914, "learning_rate": 8.943522012578617e-06, "loss": 3.3183383178710937, "step": 8900 }, { "epoch": 0.1125, "grad_norm": 15.066047668457031, "learning_rate": 8.930943396226416e-06, "loss": 3.3248934936523438, "step": 9000 }, { "epoch": 0.11375, "grad_norm": 40.130699157714844, "learning_rate": 8.918364779874215e-06, "loss": 3.0782876586914063, "step": 9100 }, { "epoch": 0.115, "grad_norm": 6.560884952545166, "learning_rate": 8.905786163522013e-06, "loss": 3.1166537475585936, "step": 9200 }, { "epoch": 0.11625, "grad_norm": 19.945913314819336, "learning_rate": 8.893207547169812e-06, "loss": 3.104024658203125, "step": 9300 }, { "epoch": 0.1175, "grad_norm": 26.818218231201172, "learning_rate": 8.88062893081761e-06, "loss": 3.5535888671875, "step": 9400 }, { "epoch": 0.11875, "grad_norm": 8.911575317382812, "learning_rate": 8.868050314465409e-06, "loss": 3.237396240234375, "step": 9500 }, { "epoch": 0.12, "grad_norm": 27.469289779663086, "learning_rate": 8.855471698113209e-06, "loss": 3.37412841796875, "step": 9600 }, { "epoch": 0.12125, "grad_norm": 21.267345428466797, "learning_rate": 8.842893081761008e-06, "loss": 3.5673455810546875, "step": 9700 }, { "epoch": 0.1225, "grad_norm": 18.69856834411621, "learning_rate": 8.830314465408806e-06, "loss": 3.2800216674804688, "step": 9800 }, { "epoch": 0.12375, "grad_norm": 30.507801055908203, "learning_rate": 8.817735849056605e-06, "loss": 3.042904052734375, "step": 9900 }, { "epoch": 0.125, "grad_norm": 4.167088508605957, "learning_rate": 8.805157232704403e-06, "loss": 3.038970642089844, "step": 10000 }, { "epoch": 0.12625, "grad_norm": 22.224905014038086, "learning_rate": 8.792578616352202e-06, "loss": 3.6609432983398436, "step": 10100 }, { "epoch": 0.1275, "grad_norm": 23.52140235900879, "learning_rate": 8.78e-06, "loss": 3.3873190307617187, "step": 10200 }, { "epoch": 0.12875, "grad_norm": 19.35887336730957, "learning_rate": 8.7674213836478e-06, "loss": 2.98621337890625, "step": 10300 }, { "epoch": 0.13, "grad_norm": 36.2899169921875, "learning_rate": 8.754842767295598e-06, "loss": 3.1030731201171875, "step": 10400 }, { "epoch": 0.13125, "grad_norm": 23.118587493896484, "learning_rate": 8.742264150943396e-06, "loss": 3.2017132568359377, "step": 10500 }, { "epoch": 0.1325, "grad_norm": 18.85630989074707, "learning_rate": 8.729685534591195e-06, "loss": 3.136984558105469, "step": 10600 }, { "epoch": 0.13375, "grad_norm": 10.303237915039062, "learning_rate": 8.717106918238995e-06, "loss": 3.201553955078125, "step": 10700 }, { "epoch": 0.135, "grad_norm": 23.776470184326172, "learning_rate": 8.704528301886794e-06, "loss": 2.64281494140625, "step": 10800 }, { "epoch": 0.13625, "grad_norm": 19.486194610595703, "learning_rate": 8.691949685534592e-06, "loss": 3.0275909423828127, "step": 10900 }, { "epoch": 0.1375, "grad_norm": 23.196096420288086, "learning_rate": 8.679371069182391e-06, "loss": 3.320228271484375, "step": 11000 }, { "epoch": 0.13875, "grad_norm": 18.966537475585938, "learning_rate": 8.66679245283019e-06, "loss": 3.0495703125, "step": 11100 }, { "epoch": 0.14, "grad_norm": 24.352205276489258, "learning_rate": 8.654213836477988e-06, "loss": 2.7282373046875, "step": 11200 }, { "epoch": 0.14125, "grad_norm": 21.45029067993164, "learning_rate": 8.641635220125787e-06, "loss": 2.840162353515625, "step": 11300 }, { "epoch": 0.1425, "grad_norm": 27.577434539794922, "learning_rate": 8.629056603773585e-06, "loss": 2.837103271484375, "step": 11400 }, { "epoch": 0.14375, "grad_norm": 26.380931854248047, "learning_rate": 8.616477987421384e-06, "loss": 2.9257269287109375, "step": 11500 }, { "epoch": 0.145, "grad_norm": 33.0853271484375, "learning_rate": 8.603899371069183e-06, "loss": 2.7715283203125, "step": 11600 }, { "epoch": 0.14625, "grad_norm": 26.68023109436035, "learning_rate": 8.591320754716981e-06, "loss": 2.801429443359375, "step": 11700 }, { "epoch": 0.1475, "grad_norm": 23.897310256958008, "learning_rate": 8.578742138364781e-06, "loss": 2.7405438232421875, "step": 11800 }, { "epoch": 0.14875, "grad_norm": 24.198909759521484, "learning_rate": 8.56616352201258e-06, "loss": 2.8841351318359374, "step": 11900 }, { "epoch": 0.15, "grad_norm": 12.656911849975586, "learning_rate": 8.553584905660379e-06, "loss": 2.8034808349609377, "step": 12000 }, { "epoch": 0.15125, "grad_norm": 21.627422332763672, "learning_rate": 8.541006289308177e-06, "loss": 3.3647225952148436, "step": 12100 }, { "epoch": 0.1525, "grad_norm": 20.174375534057617, "learning_rate": 8.528427672955976e-06, "loss": 3.007825927734375, "step": 12200 }, { "epoch": 0.15375, "grad_norm": 44.98274230957031, "learning_rate": 8.515849056603774e-06, "loss": 3.088919677734375, "step": 12300 }, { "epoch": 0.155, "grad_norm": 19.29104995727539, "learning_rate": 8.503270440251573e-06, "loss": 3.018197021484375, "step": 12400 }, { "epoch": 0.15625, "grad_norm": 24.123010635375977, "learning_rate": 8.490691823899371e-06, "loss": 2.9341339111328124, "step": 12500 }, { "epoch": 0.1575, "grad_norm": 24.07183837890625, "learning_rate": 8.47811320754717e-06, "loss": 3.0051565551757813, "step": 12600 }, { "epoch": 0.15875, "grad_norm": 5.301960468292236, "learning_rate": 8.465534591194969e-06, "loss": 2.80382568359375, "step": 12700 }, { "epoch": 0.16, "grad_norm": 0.05698124319314957, "learning_rate": 8.452955974842767e-06, "loss": 2.8222991943359377, "step": 12800 }, { "epoch": 0.16125, "grad_norm": 18.93758773803711, "learning_rate": 8.440377358490568e-06, "loss": 3.1378509521484377, "step": 12900 }, { "epoch": 0.1625, "grad_norm": 20.055757522583008, "learning_rate": 8.427798742138366e-06, "loss": 2.886328125, "step": 13000 }, { "epoch": 0.16375, "grad_norm": 12.077770233154297, "learning_rate": 8.415220125786165e-06, "loss": 2.8731988525390624, "step": 13100 }, { "epoch": 0.165, "grad_norm": 44.130096435546875, "learning_rate": 8.402641509433963e-06, "loss": 2.7432611083984373, "step": 13200 }, { "epoch": 0.16625, "grad_norm": 41.551048278808594, "learning_rate": 8.390062893081762e-06, "loss": 3.06159912109375, "step": 13300 }, { "epoch": 0.1675, "grad_norm": 18.465818405151367, "learning_rate": 8.37748427672956e-06, "loss": 2.8289349365234373, "step": 13400 }, { "epoch": 0.16875, "grad_norm": 17.856552124023438, "learning_rate": 8.364905660377359e-06, "loss": 2.7455133056640624, "step": 13500 }, { "epoch": 0.17, "grad_norm": 38.62467956542969, "learning_rate": 8.352327044025158e-06, "loss": 2.51541748046875, "step": 13600 }, { "epoch": 0.17125, "grad_norm": 19.783123016357422, "learning_rate": 8.339748427672956e-06, "loss": 3.161922607421875, "step": 13700 }, { "epoch": 0.1725, "grad_norm": 19.746681213378906, "learning_rate": 8.327169811320755e-06, "loss": 2.948106384277344, "step": 13800 }, { "epoch": 0.17375, "grad_norm": 11.665578842163086, "learning_rate": 8.314591194968555e-06, "loss": 2.707484130859375, "step": 13900 }, { "epoch": 0.175, "grad_norm": 32.41368865966797, "learning_rate": 8.302012578616354e-06, "loss": 2.551041259765625, "step": 14000 }, { "epoch": 0.17625, "grad_norm": 19.156505584716797, "learning_rate": 8.289433962264152e-06, "loss": 3.110626220703125, "step": 14100 }, { "epoch": 0.1775, "grad_norm": 24.206411361694336, "learning_rate": 8.27685534591195e-06, "loss": 2.96248291015625, "step": 14200 }, { "epoch": 0.17875, "grad_norm": 28.069095611572266, "learning_rate": 8.26427672955975e-06, "loss": 2.8258160400390624, "step": 14300 }, { "epoch": 0.18, "grad_norm": 29.488801956176758, "learning_rate": 8.251698113207548e-06, "loss": 2.697049560546875, "step": 14400 }, { "epoch": 0.18125, "grad_norm": 24.96384620666504, "learning_rate": 8.239119496855347e-06, "loss": 2.985064697265625, "step": 14500 }, { "epoch": 0.1825, "grad_norm": 23.861825942993164, "learning_rate": 8.226540880503145e-06, "loss": 3.27437255859375, "step": 14600 }, { "epoch": 0.18375, "grad_norm": 17.654409408569336, "learning_rate": 8.213962264150944e-06, "loss": 2.7916656494140626, "step": 14700 }, { "epoch": 0.185, "grad_norm": 13.089086532592773, "learning_rate": 8.201383647798742e-06, "loss": 2.95639404296875, "step": 14800 }, { "epoch": 0.18625, "grad_norm": 25.958925247192383, "learning_rate": 8.188805031446541e-06, "loss": 3.0765805053710937, "step": 14900 }, { "epoch": 0.1875, "grad_norm": 23.476421356201172, "learning_rate": 8.176226415094341e-06, "loss": 3.1241455078125, "step": 15000 }, { "epoch": 0.18875, "grad_norm": 6.848545551300049, "learning_rate": 8.16364779874214e-06, "loss": 3.0716900634765625, "step": 15100 }, { "epoch": 0.19, "grad_norm": 35.278076171875, "learning_rate": 8.151069182389938e-06, "loss": 2.738138427734375, "step": 15200 }, { "epoch": 0.19125, "grad_norm": 18.48978614807129, "learning_rate": 8.138490566037737e-06, "loss": 3.2392678833007813, "step": 15300 }, { "epoch": 0.1925, "grad_norm": 25.764511108398438, "learning_rate": 8.125911949685536e-06, "loss": 2.935572509765625, "step": 15400 }, { "epoch": 0.19375, "grad_norm": 17.244510650634766, "learning_rate": 8.113333333333334e-06, "loss": 2.686405029296875, "step": 15500 }, { "epoch": 0.195, "grad_norm": 58.790367126464844, "learning_rate": 8.100754716981133e-06, "loss": 2.930709228515625, "step": 15600 }, { "epoch": 0.19625, "grad_norm": 23.248153686523438, "learning_rate": 8.088176100628931e-06, "loss": 3.134276123046875, "step": 15700 }, { "epoch": 0.1975, "grad_norm": 22.906993865966797, "learning_rate": 8.07559748427673e-06, "loss": 3.294012451171875, "step": 15800 }, { "epoch": 0.19875, "grad_norm": 18.57663917541504, "learning_rate": 8.063018867924528e-06, "loss": 2.659776611328125, "step": 15900 }, { "epoch": 0.2, "grad_norm": 23.491619110107422, "learning_rate": 8.050440251572327e-06, "loss": 2.69767822265625, "step": 16000 }, { "epoch": 0.20125, "grad_norm": 34.74635696411133, "learning_rate": 8.037861635220127e-06, "loss": 2.9187213134765626, "step": 16100 }, { "epoch": 0.2025, "grad_norm": 17.31464195251465, "learning_rate": 8.025283018867926e-06, "loss": 2.6978961181640626, "step": 16200 }, { "epoch": 0.20375, "grad_norm": 15.085783958435059, "learning_rate": 8.012704402515725e-06, "loss": 2.227297668457031, "step": 16300 }, { "epoch": 0.205, "grad_norm": 40.41267013549805, "learning_rate": 8.000125786163523e-06, "loss": 2.5455712890625, "step": 16400 }, { "epoch": 0.20625, "grad_norm": 21.266006469726562, "learning_rate": 7.987547169811322e-06, "loss": 2.6894076538085936, "step": 16500 }, { "epoch": 0.2075, "grad_norm": 27.872455596923828, "learning_rate": 7.97496855345912e-06, "loss": 2.957840576171875, "step": 16600 }, { "epoch": 0.20875, "grad_norm": 11.036195755004883, "learning_rate": 7.962389937106919e-06, "loss": 2.9388262939453127, "step": 16700 }, { "epoch": 0.21, "grad_norm": 0.358200341463089, "learning_rate": 7.949811320754717e-06, "loss": 2.550313720703125, "step": 16800 }, { "epoch": 0.21125, "grad_norm": 17.692394256591797, "learning_rate": 7.937232704402516e-06, "loss": 2.9567965698242187, "step": 16900 }, { "epoch": 0.2125, "grad_norm": 20.59417724609375, "learning_rate": 7.924654088050315e-06, "loss": 2.8677490234375, "step": 17000 }, { "epoch": 0.21375, "grad_norm": 21.882822036743164, "learning_rate": 7.912075471698115e-06, "loss": 2.6563641357421877, "step": 17100 }, { "epoch": 0.215, "grad_norm": 41.62974548339844, "learning_rate": 7.899496855345913e-06, "loss": 2.938742370605469, "step": 17200 }, { "epoch": 0.21625, "grad_norm": 17.829666137695312, "learning_rate": 7.886918238993712e-06, "loss": 2.936795654296875, "step": 17300 }, { "epoch": 0.2175, "grad_norm": 23.082691192626953, "learning_rate": 7.87433962264151e-06, "loss": 3.0123855590820314, "step": 17400 }, { "epoch": 0.21875, "grad_norm": 30.395631790161133, "learning_rate": 7.86176100628931e-06, "loss": 2.8054989624023436, "step": 17500 }, { "epoch": 0.22, "grad_norm": 0.134773388504982, "learning_rate": 7.849182389937108e-06, "loss": 2.675752868652344, "step": 17600 }, { "epoch": 0.22125, "grad_norm": 20.807125091552734, "learning_rate": 7.836603773584906e-06, "loss": 2.7376922607421874, "step": 17700 }, { "epoch": 0.2225, "grad_norm": 43.09901809692383, "learning_rate": 7.824025157232705e-06, "loss": 2.9282843017578126, "step": 17800 }, { "epoch": 0.22375, "grad_norm": 25.02425193786621, "learning_rate": 7.811446540880504e-06, "loss": 2.571319580078125, "step": 17900 }, { "epoch": 0.225, "grad_norm": 15.398431777954102, "learning_rate": 7.798867924528302e-06, "loss": 2.492489929199219, "step": 18000 }, { "epoch": 0.22625, "grad_norm": 19.006174087524414, "learning_rate": 7.7862893081761e-06, "loss": 2.7379693603515625, "step": 18100 }, { "epoch": 0.2275, "grad_norm": 21.487060546875, "learning_rate": 7.773710691823901e-06, "loss": 3.009235534667969, "step": 18200 }, { "epoch": 0.22875, "grad_norm": 22.86688804626465, "learning_rate": 7.7611320754717e-06, "loss": 2.937470703125, "step": 18300 }, { "epoch": 0.23, "grad_norm": 4.950404167175293, "learning_rate": 7.748553459119498e-06, "loss": 2.7051239013671875, "step": 18400 }, { "epoch": 0.23125, "grad_norm": 22.454504013061523, "learning_rate": 7.735974842767297e-06, "loss": 2.8736590576171874, "step": 18500 }, { "epoch": 0.2325, "grad_norm": 29.73653221130371, "learning_rate": 7.723396226415095e-06, "loss": 2.908158874511719, "step": 18600 }, { "epoch": 0.23375, "grad_norm": 28.101703643798828, "learning_rate": 7.710817610062894e-06, "loss": 2.6522030639648437, "step": 18700 }, { "epoch": 0.235, "grad_norm": 4.781414985656738, "learning_rate": 7.698238993710693e-06, "loss": 2.4308651733398436, "step": 18800 }, { "epoch": 0.23625, "grad_norm": 17.258634567260742, "learning_rate": 7.685660377358491e-06, "loss": 3.040390625, "step": 18900 }, { "epoch": 0.2375, "grad_norm": 17.386960983276367, "learning_rate": 7.67308176100629e-06, "loss": 2.769420471191406, "step": 19000 }, { "epoch": 0.23875, "grad_norm": 25.329315185546875, "learning_rate": 7.660503144654088e-06, "loss": 2.7375177001953124, "step": 19100 }, { "epoch": 0.24, "grad_norm": 6.02203369140625, "learning_rate": 7.647924528301887e-06, "loss": 2.505074920654297, "step": 19200 }, { "epoch": 0.24125, "grad_norm": 20.155107498168945, "learning_rate": 7.635345911949687e-06, "loss": 2.837679138183594, "step": 19300 }, { "epoch": 0.2425, "grad_norm": 18.765806198120117, "learning_rate": 7.622767295597485e-06, "loss": 2.907704162597656, "step": 19400 }, { "epoch": 0.24375, "grad_norm": 17.111177444458008, "learning_rate": 7.610188679245284e-06, "loss": 2.3241337585449218, "step": 19500 }, { "epoch": 0.245, "grad_norm": 26.34480094909668, "learning_rate": 7.597610062893083e-06, "loss": 2.534189453125, "step": 19600 }, { "epoch": 0.24625, "grad_norm": 26.818429946899414, "learning_rate": 7.5850314465408815e-06, "loss": 2.6678060913085937, "step": 19700 }, { "epoch": 0.2475, "grad_norm": 22.52924919128418, "learning_rate": 7.57245283018868e-06, "loss": 2.7501620483398437, "step": 19800 }, { "epoch": 0.24875, "grad_norm": 2.6518571376800537, "learning_rate": 7.559874213836479e-06, "loss": 2.606297607421875, "step": 19900 }, { "epoch": 0.25, "grad_norm": 13.926076889038086, "learning_rate": 7.547295597484277e-06, "loss": 2.6911566162109377, "step": 20000 }, { "epoch": 0.25125, "grad_norm": 21.900789260864258, "learning_rate": 7.534716981132077e-06, "loss": 2.6743511962890625, "step": 20100 }, { "epoch": 0.2525, "grad_norm": 17.92312240600586, "learning_rate": 7.522138364779875e-06, "loss": 3.041307373046875, "step": 20200 }, { "epoch": 0.25375, "grad_norm": 16.685823440551758, "learning_rate": 7.509559748427674e-06, "loss": 2.7370196533203126, "step": 20300 }, { "epoch": 0.255, "grad_norm": 17.445106506347656, "learning_rate": 7.4969811320754725e-06, "loss": 2.769534912109375, "step": 20400 }, { "epoch": 0.25625, "grad_norm": 14.679756164550781, "learning_rate": 7.484402515723271e-06, "loss": 2.6487054443359375, "step": 20500 }, { "epoch": 0.2575, "grad_norm": 29.46009063720703, "learning_rate": 7.4718238993710705e-06, "loss": 2.8649749755859375, "step": 20600 }, { "epoch": 0.25875, "grad_norm": 25.532323837280273, "learning_rate": 7.459245283018869e-06, "loss": 2.5376904296875, "step": 20700 }, { "epoch": 0.26, "grad_norm": 0.2213641107082367, "learning_rate": 7.446666666666668e-06, "loss": 2.5324951171875, "step": 20800 }, { "epoch": 0.26125, "grad_norm": 26.14183807373047, "learning_rate": 7.434088050314466e-06, "loss": 2.9009820556640626, "step": 20900 }, { "epoch": 0.2625, "grad_norm": 20.510589599609375, "learning_rate": 7.421509433962265e-06, "loss": 2.8423321533203123, "step": 21000 }, { "epoch": 0.26375, "grad_norm": 19.538053512573242, "learning_rate": 7.408930817610064e-06, "loss": 2.8104873657226563, "step": 21100 }, { "epoch": 0.265, "grad_norm": 29.258617401123047, "learning_rate": 7.396352201257863e-06, "loss": 2.460407562255859, "step": 21200 }, { "epoch": 0.26625, "grad_norm": 24.23659324645996, "learning_rate": 7.3837735849056614e-06, "loss": 2.797374267578125, "step": 21300 }, { "epoch": 0.2675, "grad_norm": 19.262876510620117, "learning_rate": 7.37119496855346e-06, "loss": 2.8027474975585935, "step": 21400 }, { "epoch": 0.26875, "grad_norm": 18.131284713745117, "learning_rate": 7.358616352201259e-06, "loss": 2.899300842285156, "step": 21500 }, { "epoch": 0.27, "grad_norm": 0.3336597979068756, "learning_rate": 7.346037735849057e-06, "loss": 2.505666961669922, "step": 21600 }, { "epoch": 0.27125, "grad_norm": 16.87041473388672, "learning_rate": 7.333459119496857e-06, "loss": 2.7630340576171877, "step": 21700 }, { "epoch": 0.2725, "grad_norm": 21.505661010742188, "learning_rate": 7.320880503144655e-06, "loss": 2.52651611328125, "step": 21800 }, { "epoch": 0.27375, "grad_norm": 3.3604607582092285, "learning_rate": 7.308301886792454e-06, "loss": 2.6651840209960938, "step": 21900 }, { "epoch": 0.275, "grad_norm": 0.0993848517537117, "learning_rate": 7.295723270440252e-06, "loss": 2.635201416015625, "step": 22000 }, { "epoch": 0.27625, "grad_norm": 28.343442916870117, "learning_rate": 7.283144654088051e-06, "loss": 3.0521551513671876, "step": 22100 }, { "epoch": 0.2775, "grad_norm": 22.11837387084961, "learning_rate": 7.27056603773585e-06, "loss": 2.7989617919921876, "step": 22200 }, { "epoch": 0.27875, "grad_norm": 14.693900108337402, "learning_rate": 7.257987421383649e-06, "loss": 2.586000671386719, "step": 22300 }, { "epoch": 0.28, "grad_norm": 12.320152282714844, "learning_rate": 7.2454088050314476e-06, "loss": 2.6524945068359376, "step": 22400 }, { "epoch": 0.28125, "grad_norm": 16.291311264038086, "learning_rate": 7.232830188679246e-06, "loss": 2.5608685302734373, "step": 22500 }, { "epoch": 0.2825, "grad_norm": 19.380096435546875, "learning_rate": 7.220251572327045e-06, "loss": 2.84384521484375, "step": 22600 }, { "epoch": 0.28375, "grad_norm": 23.465559005737305, "learning_rate": 7.207672955974843e-06, "loss": 2.8349188232421874, "step": 22700 }, { "epoch": 0.285, "grad_norm": 18.676286697387695, "learning_rate": 7.195094339622643e-06, "loss": 2.5051495361328127, "step": 22800 }, { "epoch": 0.28625, "grad_norm": 20.133834838867188, "learning_rate": 7.182515723270441e-06, "loss": 2.7532232666015624, "step": 22900 }, { "epoch": 0.2875, "grad_norm": 19.53656005859375, "learning_rate": 7.16993710691824e-06, "loss": 2.9203271484375, "step": 23000 }, { "epoch": 0.28875, "grad_norm": 18.04520034790039, "learning_rate": 7.1573584905660385e-06, "loss": 2.6057052612304688, "step": 23100 }, { "epoch": 0.29, "grad_norm": 0.5314738154411316, "learning_rate": 7.144779874213837e-06, "loss": 2.6248223876953123, "step": 23200 }, { "epoch": 0.29125, "grad_norm": 16.670757293701172, "learning_rate": 7.1322012578616365e-06, "loss": 2.8461798095703124, "step": 23300 }, { "epoch": 0.2925, "grad_norm": 23.886486053466797, "learning_rate": 7.119622641509435e-06, "loss": 2.57169921875, "step": 23400 }, { "epoch": 0.29375, "grad_norm": 8.540549278259277, "learning_rate": 7.107044025157234e-06, "loss": 2.620181884765625, "step": 23500 }, { "epoch": 0.295, "grad_norm": 34.379066467285156, "learning_rate": 7.094465408805032e-06, "loss": 2.5019407653808594, "step": 23600 }, { "epoch": 0.29625, "grad_norm": 24.65458106994629, "learning_rate": 7.081886792452831e-06, "loss": 2.421243133544922, "step": 23700 }, { "epoch": 0.2975, "grad_norm": 24.197660446166992, "learning_rate": 7.06930817610063e-06, "loss": 2.658455810546875, "step": 23800 }, { "epoch": 0.29875, "grad_norm": 26.06961441040039, "learning_rate": 7.056729559748429e-06, "loss": 2.8529171752929687, "step": 23900 }, { "epoch": 0.3, "grad_norm": 17.869230270385742, "learning_rate": 7.0441509433962275e-06, "loss": 2.4935687255859373, "step": 24000 }, { "epoch": 0.30125, "grad_norm": 24.912609100341797, "learning_rate": 7.031572327044026e-06, "loss": 2.6119781494140626, "step": 24100 }, { "epoch": 0.3025, "grad_norm": 22.447463989257812, "learning_rate": 7.018993710691825e-06, "loss": 2.824261474609375, "step": 24200 }, { "epoch": 0.30375, "grad_norm": 4.013281345367432, "learning_rate": 7.006415094339623e-06, "loss": 2.564319763183594, "step": 24300 }, { "epoch": 0.305, "grad_norm": 20.67205810546875, "learning_rate": 6.993836477987423e-06, "loss": 2.513914794921875, "step": 24400 }, { "epoch": 0.30625, "grad_norm": 24.64101219177246, "learning_rate": 6.981257861635221e-06, "loss": 2.6228094482421875, "step": 24500 }, { "epoch": 0.3075, "grad_norm": 21.346033096313477, "learning_rate": 6.96867924528302e-06, "loss": 2.74439208984375, "step": 24600 }, { "epoch": 0.30875, "grad_norm": 10.391414642333984, "learning_rate": 6.956100628930818e-06, "loss": 2.2809228515625, "step": 24700 }, { "epoch": 0.31, "grad_norm": 1.595000982284546, "learning_rate": 6.943522012578617e-06, "loss": 2.4967855834960937, "step": 24800 }, { "epoch": 0.31125, "grad_norm": 20.543582916259766, "learning_rate": 6.9309433962264164e-06, "loss": 2.585915222167969, "step": 24900 }, { "epoch": 0.3125, "grad_norm": 18.013103485107422, "learning_rate": 6.918364779874215e-06, "loss": 2.967261962890625, "step": 25000 }, { "epoch": 0.31375, "grad_norm": 2.930172920227051, "learning_rate": 6.905786163522014e-06, "loss": 2.45760986328125, "step": 25100 }, { "epoch": 0.315, "grad_norm": 0.313312292098999, "learning_rate": 6.893207547169812e-06, "loss": 2.421280822753906, "step": 25200 }, { "epoch": 0.31625, "grad_norm": 19.377904891967773, "learning_rate": 6.880628930817611e-06, "loss": 2.710804748535156, "step": 25300 }, { "epoch": 0.3175, "grad_norm": 29.901630401611328, "learning_rate": 6.86805031446541e-06, "loss": 2.6909103393554688, "step": 25400 }, { "epoch": 0.31875, "grad_norm": 0.17036622762680054, "learning_rate": 6.855471698113209e-06, "loss": 2.56456787109375, "step": 25500 }, { "epoch": 0.32, "grad_norm": 0.009640435688197613, "learning_rate": 6.842893081761007e-06, "loss": 2.603443603515625, "step": 25600 }, { "epoch": 0.32125, "grad_norm": 18.46479606628418, "learning_rate": 6.830314465408806e-06, "loss": 2.427742156982422, "step": 25700 }, { "epoch": 0.3225, "grad_norm": 20.460947036743164, "learning_rate": 6.8177358490566046e-06, "loss": 2.503583984375, "step": 25800 }, { "epoch": 0.32375, "grad_norm": 36.58029556274414, "learning_rate": 6.805157232704403e-06, "loss": 2.641945495605469, "step": 25900 }, { "epoch": 0.325, "grad_norm": 0.01783256232738495, "learning_rate": 6.7925786163522026e-06, "loss": 2.334880065917969, "step": 26000 }, { "epoch": 0.32625, "grad_norm": 16.67290496826172, "learning_rate": 6.780000000000001e-06, "loss": 2.31351806640625, "step": 26100 }, { "epoch": 0.3275, "grad_norm": 22.63792610168457, "learning_rate": 6.7674213836478e-06, "loss": 2.6967156982421874, "step": 26200 }, { "epoch": 0.32875, "grad_norm": 9.776408195495605, "learning_rate": 6.754842767295598e-06, "loss": 2.56522705078125, "step": 26300 }, { "epoch": 0.33, "grad_norm": 28.92633628845215, "learning_rate": 6.742264150943397e-06, "loss": 2.3877410888671875, "step": 26400 }, { "epoch": 0.33125, "grad_norm": 24.558900833129883, "learning_rate": 6.729685534591196e-06, "loss": 2.6090899658203126, "step": 26500 }, { "epoch": 0.3325, "grad_norm": 22.432323455810547, "learning_rate": 6.717106918238995e-06, "loss": 2.537680358886719, "step": 26600 }, { "epoch": 0.33375, "grad_norm": 16.474348068237305, "learning_rate": 6.7045283018867935e-06, "loss": 2.5907723999023435, "step": 26700 }, { "epoch": 0.335, "grad_norm": 0.12232944369316101, "learning_rate": 6.691949685534592e-06, "loss": 2.2261618041992186, "step": 26800 }, { "epoch": 0.33625, "grad_norm": 23.504776000976562, "learning_rate": 6.679371069182391e-06, "loss": 2.601636962890625, "step": 26900 }, { "epoch": 0.3375, "grad_norm": 25.80137062072754, "learning_rate": 6.66679245283019e-06, "loss": 2.415384521484375, "step": 27000 }, { "epoch": 0.33875, "grad_norm": 10.791342735290527, "learning_rate": 6.654213836477989e-06, "loss": 2.3595407104492185, "step": 27100 }, { "epoch": 0.34, "grad_norm": 0.23251843452453613, "learning_rate": 6.641635220125787e-06, "loss": 2.3029635620117186, "step": 27200 }, { "epoch": 0.34125, "grad_norm": 22.788803100585938, "learning_rate": 6.629056603773586e-06, "loss": 2.467623748779297, "step": 27300 }, { "epoch": 0.3425, "grad_norm": 26.63360023498535, "learning_rate": 6.6164779874213845e-06, "loss": 2.717623291015625, "step": 27400 }, { "epoch": 0.34375, "grad_norm": 16.987070083618164, "learning_rate": 6.603899371069183e-06, "loss": 2.4102777099609374, "step": 27500 }, { "epoch": 0.345, "grad_norm": 0.01087226066738367, "learning_rate": 6.5913207547169825e-06, "loss": 2.3370285034179688, "step": 27600 }, { "epoch": 0.34625, "grad_norm": 23.72052574157715, "learning_rate": 6.578742138364781e-06, "loss": 2.6314013671875, "step": 27700 }, { "epoch": 0.3475, "grad_norm": 23.36273765563965, "learning_rate": 6.56616352201258e-06, "loss": 2.7635400390625, "step": 27800 }, { "epoch": 0.34875, "grad_norm": 1.7312610149383545, "learning_rate": 6.553584905660378e-06, "loss": 2.5564617919921875, "step": 27900 }, { "epoch": 0.35, "grad_norm": 35.84255599975586, "learning_rate": 6.541006289308177e-06, "loss": 2.7504351806640623, "step": 28000 }, { "epoch": 0.35125, "grad_norm": 26.242603302001953, "learning_rate": 6.528427672955976e-06, "loss": 2.373230285644531, "step": 28100 }, { "epoch": 0.3525, "grad_norm": 18.80271339416504, "learning_rate": 6.515849056603775e-06, "loss": 2.472604064941406, "step": 28200 }, { "epoch": 0.35375, "grad_norm": 0.8052368760108948, "learning_rate": 6.5032704402515734e-06, "loss": 2.4384115600585936, "step": 28300 }, { "epoch": 0.355, "grad_norm": 9.235365867614746, "learning_rate": 6.490691823899372e-06, "loss": 2.026627502441406, "step": 28400 }, { "epoch": 0.35625, "grad_norm": 20.106481552124023, "learning_rate": 6.478113207547171e-06, "loss": 2.431183319091797, "step": 28500 }, { "epoch": 0.3575, "grad_norm": 26.855960845947266, "learning_rate": 6.46553459119497e-06, "loss": 2.78699462890625, "step": 28600 }, { "epoch": 0.35875, "grad_norm": 3.0411767959594727, "learning_rate": 6.452955974842769e-06, "loss": 2.862520751953125, "step": 28700 }, { "epoch": 0.36, "grad_norm": 17.747953414916992, "learning_rate": 6.440377358490567e-06, "loss": 2.4523715209960937, "step": 28800 }, { "epoch": 0.36125, "grad_norm": 17.899240493774414, "learning_rate": 6.427798742138366e-06, "loss": 2.606719970703125, "step": 28900 }, { "epoch": 0.3625, "grad_norm": 27.6655216217041, "learning_rate": 6.415220125786164e-06, "loss": 2.9831976318359374, "step": 29000 }, { "epoch": 0.36375, "grad_norm": 11.997615814208984, "learning_rate": 6.402641509433963e-06, "loss": 2.345184326171875, "step": 29100 }, { "epoch": 0.365, "grad_norm": 4.737981796264648, "learning_rate": 6.390062893081761e-06, "loss": 2.16708251953125, "step": 29200 }, { "epoch": 0.36625, "grad_norm": 26.705848693847656, "learning_rate": 6.37748427672956e-06, "loss": 2.63515869140625, "step": 29300 }, { "epoch": 0.3675, "grad_norm": 19.237810134887695, "learning_rate": 6.364905660377359e-06, "loss": 2.532059326171875, "step": 29400 }, { "epoch": 0.36875, "grad_norm": 17.01352882385254, "learning_rate": 6.352327044025157e-06, "loss": 2.55763671875, "step": 29500 }, { "epoch": 0.37, "grad_norm": 27.392210006713867, "learning_rate": 6.339748427672956e-06, "loss": 2.2140924072265626, "step": 29600 }, { "epoch": 0.37125, "grad_norm": 22.72922134399414, "learning_rate": 6.3271698113207545e-06, "loss": 2.749759521484375, "step": 29700 }, { "epoch": 0.3725, "grad_norm": 30.038837432861328, "learning_rate": 6.314591194968553e-06, "loss": 2.596155700683594, "step": 29800 }, { "epoch": 0.37375, "grad_norm": 11.441932678222656, "learning_rate": 6.3020125786163525e-06, "loss": 2.4867178344726564, "step": 29900 }, { "epoch": 0.375, "grad_norm": 1.1486650705337524, "learning_rate": 6.289433962264151e-06, "loss": 2.6022509765625, "step": 30000 }, { "epoch": 0.37625, "grad_norm": 19.94702911376953, "learning_rate": 6.27685534591195e-06, "loss": 2.4900088500976563, "step": 30100 }, { "epoch": 0.3775, "grad_norm": 23.304832458496094, "learning_rate": 6.264276729559748e-06, "loss": 2.6295010375976564, "step": 30200 }, { "epoch": 0.37875, "grad_norm": 0.9189664721488953, "learning_rate": 6.251698113207547e-06, "loss": 2.614851379394531, "step": 30300 }, { "epoch": 0.38, "grad_norm": 7.372312545776367, "learning_rate": 6.239119496855346e-06, "loss": 2.43028564453125, "step": 30400 }, { "epoch": 0.38125, "grad_norm": 24.777666091918945, "learning_rate": 6.226540880503145e-06, "loss": 2.790203857421875, "step": 30500 }, { "epoch": 0.3825, "grad_norm": 24.64716339111328, "learning_rate": 6.2139622641509434e-06, "loss": 2.7274844360351564, "step": 30600 }, { "epoch": 0.38375, "grad_norm": 10.4207763671875, "learning_rate": 6.201383647798742e-06, "loss": 2.1123760986328124, "step": 30700 }, { "epoch": 0.385, "grad_norm": 0.07715418934822083, "learning_rate": 6.188805031446541e-06, "loss": 2.5373687744140625, "step": 30800 }, { "epoch": 0.38625, "grad_norm": 23.558998107910156, "learning_rate": 6.17622641509434e-06, "loss": 2.632052001953125, "step": 30900 }, { "epoch": 0.3875, "grad_norm": 26.116592407226562, "learning_rate": 6.163647798742139e-06, "loss": 2.494936218261719, "step": 31000 }, { "epoch": 0.38875, "grad_norm": 8.466816902160645, "learning_rate": 6.151069182389937e-06, "loss": 2.3917417907714844, "step": 31100 }, { "epoch": 0.39, "grad_norm": 6.2974748611450195, "learning_rate": 6.138490566037736e-06, "loss": 2.315247039794922, "step": 31200 }, { "epoch": 0.39125, "grad_norm": 26.84358024597168, "learning_rate": 6.125911949685534e-06, "loss": 2.6386013793945313, "step": 31300 }, { "epoch": 0.3925, "grad_norm": 18.76458740234375, "learning_rate": 6.113333333333333e-06, "loss": 2.304689636230469, "step": 31400 }, { "epoch": 0.39375, "grad_norm": 12.886114120483398, "learning_rate": 6.100754716981132e-06, "loss": 2.476997528076172, "step": 31500 }, { "epoch": 0.395, "grad_norm": 14.977468490600586, "learning_rate": 6.088176100628931e-06, "loss": 2.3447396850585935, "step": 31600 }, { "epoch": 0.39625, "grad_norm": 29.1169490814209, "learning_rate": 6.0755974842767296e-06, "loss": 2.186744842529297, "step": 31700 }, { "epoch": 0.3975, "grad_norm": 27.776119232177734, "learning_rate": 6.063018867924528e-06, "loss": 2.382681884765625, "step": 31800 }, { "epoch": 0.39875, "grad_norm": 48.07829284667969, "learning_rate": 6.050440251572327e-06, "loss": 2.3385089111328123, "step": 31900 }, { "epoch": 0.4, "grad_norm": 0.7851316332817078, "learning_rate": 6.037861635220126e-06, "loss": 2.229570770263672, "step": 32000 }, { "epoch": 0.40125, "grad_norm": 20.3125057220459, "learning_rate": 6.025283018867925e-06, "loss": 2.5623980712890626, "step": 32100 }, { "epoch": 0.4025, "grad_norm": 21.488149642944336, "learning_rate": 6.012704402515723e-06, "loss": 2.912520751953125, "step": 32200 }, { "epoch": 0.40375, "grad_norm": 32.51865005493164, "learning_rate": 6.000125786163522e-06, "loss": 2.3843609619140627, "step": 32300 }, { "epoch": 0.405, "grad_norm": 1.5765758752822876, "learning_rate": 5.9875471698113205e-06, "loss": 2.381104736328125, "step": 32400 }, { "epoch": 0.40625, "grad_norm": 19.67698860168457, "learning_rate": 5.97496855345912e-06, "loss": 2.60897216796875, "step": 32500 }, { "epoch": 0.4075, "grad_norm": 25.434328079223633, "learning_rate": 5.9623899371069185e-06, "loss": 2.505929260253906, "step": 32600 }, { "epoch": 0.40875, "grad_norm": 1.3922165632247925, "learning_rate": 5.949811320754717e-06, "loss": 2.7238693237304688, "step": 32700 }, { "epoch": 0.41, "grad_norm": 34.48624038696289, "learning_rate": 5.937232704402516e-06, "loss": 2.2143258666992187, "step": 32800 }, { "epoch": 0.41125, "grad_norm": 23.105432510375977, "learning_rate": 5.924654088050314e-06, "loss": 2.609884033203125, "step": 32900 }, { "epoch": 0.4125, "grad_norm": 25.032276153564453, "learning_rate": 5.912075471698113e-06, "loss": 2.4755784606933595, "step": 33000 }, { "epoch": 0.41375, "grad_norm": 2.3799238204956055, "learning_rate": 5.899496855345912e-06, "loss": 2.575408630371094, "step": 33100 }, { "epoch": 0.415, "grad_norm": 0.27088427543640137, "learning_rate": 5.886918238993711e-06, "loss": 2.577947998046875, "step": 33200 }, { "epoch": 0.41625, "grad_norm": 21.328598022460938, "learning_rate": 5.8743396226415095e-06, "loss": 2.358548583984375, "step": 33300 }, { "epoch": 0.4175, "grad_norm": 22.012718200683594, "learning_rate": 5.861761006289308e-06, "loss": 2.5863958740234376, "step": 33400 }, { "epoch": 0.41875, "grad_norm": 11.000739097595215, "learning_rate": 5.849182389937107e-06, "loss": 2.410167236328125, "step": 33500 }, { "epoch": 0.42, "grad_norm": 0.4935649633407593, "learning_rate": 5.836603773584906e-06, "loss": 2.570559387207031, "step": 33600 }, { "epoch": 0.42125, "grad_norm": 16.273447036743164, "learning_rate": 5.824025157232705e-06, "loss": 2.599737548828125, "step": 33700 }, { "epoch": 0.4225, "grad_norm": 17.573291778564453, "learning_rate": 5.811446540880503e-06, "loss": 2.647215576171875, "step": 33800 }, { "epoch": 0.42375, "grad_norm": 11.290910720825195, "learning_rate": 5.798867924528302e-06, "loss": 2.6486666870117186, "step": 33900 }, { "epoch": 0.425, "grad_norm": 0.27160441875457764, "learning_rate": 5.7862893081761004e-06, "loss": 2.48810791015625, "step": 34000 }, { "epoch": 0.42625, "grad_norm": 33.099632263183594, "learning_rate": 5.773710691823899e-06, "loss": 2.4821075439453124, "step": 34100 }, { "epoch": 0.4275, "grad_norm": 18.253433227539062, "learning_rate": 5.7611320754716984e-06, "loss": 2.4092727661132813, "step": 34200 }, { "epoch": 0.42875, "grad_norm": 6.506767749786377, "learning_rate": 5.748553459119497e-06, "loss": 2.461749267578125, "step": 34300 }, { "epoch": 0.43, "grad_norm": 0.04523186758160591, "learning_rate": 5.735974842767296e-06, "loss": 2.4135496520996096, "step": 34400 }, { "epoch": 0.43125, "grad_norm": 22.646928787231445, "learning_rate": 5.723396226415094e-06, "loss": 2.4821810913085938, "step": 34500 }, { "epoch": 0.4325, "grad_norm": 24.81723976135254, "learning_rate": 5.710817610062893e-06, "loss": 2.5469622802734375, "step": 34600 }, { "epoch": 0.43375, "grad_norm": 13.340218544006348, "learning_rate": 5.698238993710692e-06, "loss": 2.5624932861328125, "step": 34700 }, { "epoch": 0.435, "grad_norm": 0.1735188513994217, "learning_rate": 5.685660377358491e-06, "loss": 2.5089077758789062, "step": 34800 }, { "epoch": 0.43625, "grad_norm": 21.547271728515625, "learning_rate": 5.673081761006289e-06, "loss": 2.6526663208007815, "step": 34900 }, { "epoch": 0.4375, "grad_norm": 19.01319694519043, "learning_rate": 5.660503144654088e-06, "loss": 2.860760498046875, "step": 35000 }, { "epoch": 0.43875, "grad_norm": 34.29476547241211, "learning_rate": 5.6479245283018866e-06, "loss": 2.6115411376953124, "step": 35100 }, { "epoch": 0.44, "grad_norm": 49.28584671020508, "learning_rate": 5.635345911949686e-06, "loss": 2.1887362670898436, "step": 35200 }, { "epoch": 0.44125, "grad_norm": 22.770751953125, "learning_rate": 5.622767295597485e-06, "loss": 2.5814306640625, "step": 35300 }, { "epoch": 0.4425, "grad_norm": 30.632217407226562, "learning_rate": 5.610188679245283e-06, "loss": 2.3137057495117186, "step": 35400 }, { "epoch": 0.44375, "grad_norm": 1.2359341382980347, "learning_rate": 5.597610062893082e-06, "loss": 2.060030517578125, "step": 35500 }, { "epoch": 0.445, "grad_norm": 0.3209603428840637, "learning_rate": 5.58503144654088e-06, "loss": 2.113879852294922, "step": 35600 }, { "epoch": 0.44625, "grad_norm": 18.14032554626465, "learning_rate": 5.572452830188679e-06, "loss": 2.4774960327148436, "step": 35700 }, { "epoch": 0.4475, "grad_norm": 28.4627628326416, "learning_rate": 5.559874213836478e-06, "loss": 2.282992248535156, "step": 35800 }, { "epoch": 0.44875, "grad_norm": 9.6217041015625, "learning_rate": 5.547295597484277e-06, "loss": 2.6865158081054688, "step": 35900 }, { "epoch": 0.45, "grad_norm": 0.1811428815126419, "learning_rate": 5.5347169811320755e-06, "loss": 2.6177047729492187, "step": 36000 }, { "epoch": 0.45125, "grad_norm": 29.922924041748047, "learning_rate": 5.522138364779874e-06, "loss": 2.666334533691406, "step": 36100 }, { "epoch": 0.4525, "grad_norm": 22.7872371673584, "learning_rate": 5.509559748427673e-06, "loss": 2.8180126953125, "step": 36200 }, { "epoch": 0.45375, "grad_norm": 47.54941177368164, "learning_rate": 5.496981132075472e-06, "loss": 2.5696286010742186, "step": 36300 }, { "epoch": 0.455, "grad_norm": 0.02181134559214115, "learning_rate": 5.484402515723271e-06, "loss": 2.3324235534667968, "step": 36400 }, { "epoch": 0.45625, "grad_norm": 29.27972412109375, "learning_rate": 5.471823899371069e-06, "loss": 2.377205657958984, "step": 36500 }, { "epoch": 0.4575, "grad_norm": 27.066911697387695, "learning_rate": 5.459245283018868e-06, "loss": 2.6452157592773435, "step": 36600 }, { "epoch": 0.45875, "grad_norm": 5.8870849609375, "learning_rate": 5.4466666666666665e-06, "loss": 2.6054962158203123, "step": 36700 }, { "epoch": 0.46, "grad_norm": 9.92285442352295, "learning_rate": 5.434088050314466e-06, "loss": 2.3585281372070312, "step": 36800 }, { "epoch": 0.46125, "grad_norm": 25.87106704711914, "learning_rate": 5.4215094339622645e-06, "loss": 2.3194918823242188, "step": 36900 }, { "epoch": 0.4625, "grad_norm": 24.31088638305664, "learning_rate": 5.408930817610063e-06, "loss": 2.558509521484375, "step": 37000 }, { "epoch": 0.46375, "grad_norm": 8.893757820129395, "learning_rate": 5.396352201257862e-06, "loss": 2.501906433105469, "step": 37100 }, { "epoch": 0.465, "grad_norm": 0.19256582856178284, "learning_rate": 5.38377358490566e-06, "loss": 2.4328764343261717, "step": 37200 }, { "epoch": 0.46625, "grad_norm": 36.3577995300293, "learning_rate": 5.371194968553459e-06, "loss": 2.593095397949219, "step": 37300 }, { "epoch": 0.4675, "grad_norm": 28.993858337402344, "learning_rate": 5.358616352201258e-06, "loss": 2.5382254028320315, "step": 37400 }, { "epoch": 0.46875, "grad_norm": 1.2644715309143066, "learning_rate": 5.346037735849057e-06, "loss": 2.5988037109375, "step": 37500 }, { "epoch": 0.47, "grad_norm": 1.758081078529358, "learning_rate": 5.3334591194968554e-06, "loss": 2.3407969665527344, "step": 37600 }, { "epoch": 0.47125, "grad_norm": 29.136577606201172, "learning_rate": 5.320880503144654e-06, "loss": 2.649151611328125, "step": 37700 }, { "epoch": 0.4725, "grad_norm": 25.28328514099121, "learning_rate": 5.308301886792453e-06, "loss": 2.5671429443359375, "step": 37800 }, { "epoch": 0.47375, "grad_norm": 15.502847671508789, "learning_rate": 5.295723270440252e-06, "loss": 2.6158074951171875, "step": 37900 }, { "epoch": 0.475, "grad_norm": 0.22777187824249268, "learning_rate": 5.283144654088051e-06, "loss": 2.15365966796875, "step": 38000 }, { "epoch": 0.47625, "grad_norm": 11.893803596496582, "learning_rate": 5.270566037735849e-06, "loss": 2.3156301879882815, "step": 38100 }, { "epoch": 0.4775, "grad_norm": 25.753210067749023, "learning_rate": 5.257987421383648e-06, "loss": 2.503589324951172, "step": 38200 }, { "epoch": 0.47875, "grad_norm": 16.314041137695312, "learning_rate": 5.245408805031446e-06, "loss": 2.639173583984375, "step": 38300 }, { "epoch": 0.48, "grad_norm": 0.021451743319630623, "learning_rate": 5.232830188679246e-06, "loss": 2.311227111816406, "step": 38400 }, { "epoch": 0.48125, "grad_norm": 23.82429313659668, "learning_rate": 5.220251572327044e-06, "loss": 2.3693087768554686, "step": 38500 }, { "epoch": 0.4825, "grad_norm": 27.395193099975586, "learning_rate": 5.207672955974843e-06, "loss": 2.6940673828125, "step": 38600 }, { "epoch": 0.48375, "grad_norm": 4.3216071128845215, "learning_rate": 5.1950943396226416e-06, "loss": 2.763636474609375, "step": 38700 }, { "epoch": 0.485, "grad_norm": 7.015088081359863, "learning_rate": 5.18251572327044e-06, "loss": 2.432607879638672, "step": 38800 }, { "epoch": 0.48625, "grad_norm": 22.117116928100586, "learning_rate": 5.169937106918239e-06, "loss": 2.6036077880859376, "step": 38900 }, { "epoch": 0.4875, "grad_norm": 19.56247329711914, "learning_rate": 5.157358490566038e-06, "loss": 2.742894287109375, "step": 39000 }, { "epoch": 0.48875, "grad_norm": 9.732338905334473, "learning_rate": 5.144779874213837e-06, "loss": 2.3442169189453126, "step": 39100 }, { "epoch": 0.49, "grad_norm": 12.13438892364502, "learning_rate": 5.132201257861635e-06, "loss": 2.3807391357421874, "step": 39200 }, { "epoch": 0.49125, "grad_norm": 20.819271087646484, "learning_rate": 5.119622641509434e-06, "loss": 2.489746551513672, "step": 39300 }, { "epoch": 0.4925, "grad_norm": 22.794889450073242, "learning_rate": 5.1070440251572325e-06, "loss": 2.7172930908203123, "step": 39400 }, { "epoch": 0.49375, "grad_norm": 9.96747875213623, "learning_rate": 5.094465408805032e-06, "loss": 2.375674133300781, "step": 39500 }, { "epoch": 0.495, "grad_norm": 0.012898732908070087, "learning_rate": 5.0818867924528305e-06, "loss": 2.238426513671875, "step": 39600 }, { "epoch": 0.49625, "grad_norm": 23.519365310668945, "learning_rate": 5.069308176100629e-06, "loss": 2.3515843200683593, "step": 39700 }, { "epoch": 0.4975, "grad_norm": 24.20486068725586, "learning_rate": 5.056729559748428e-06, "loss": 2.6381719970703124, "step": 39800 }, { "epoch": 0.49875, "grad_norm": 25.931121826171875, "learning_rate": 5.044150943396226e-06, "loss": 2.2815740966796874, "step": 39900 }, { "epoch": 0.5, "grad_norm": 0.4895442724227905, "learning_rate": 5.031572327044026e-06, "loss": 2.1325482177734374, "step": 40000 }, { "epoch": 0.50125, "grad_norm": 22.385295867919922, "learning_rate": 5.018993710691824e-06, "loss": 2.376383514404297, "step": 40100 }, { "epoch": 0.5025, "grad_norm": 24.816694259643555, "learning_rate": 5.006415094339623e-06, "loss": 2.4902183532714846, "step": 40200 }, { "epoch": 0.50375, "grad_norm": 0.2846035957336426, "learning_rate": 4.9938364779874215e-06, "loss": 2.5287326049804686, "step": 40300 }, { "epoch": 0.505, "grad_norm": 2.7115321159362793, "learning_rate": 4.98125786163522e-06, "loss": 2.357637939453125, "step": 40400 }, { "epoch": 0.50625, "grad_norm": 23.814891815185547, "learning_rate": 4.968679245283019e-06, "loss": 2.194998016357422, "step": 40500 }, { "epoch": 0.5075, "grad_norm": 22.298978805541992, "learning_rate": 4.956100628930818e-06, "loss": 2.3045025634765626, "step": 40600 }, { "epoch": 0.50875, "grad_norm": 1.843037724494934, "learning_rate": 4.943522012578617e-06, "loss": 2.532041473388672, "step": 40700 }, { "epoch": 0.51, "grad_norm": 0.05613021180033684, "learning_rate": 4.930943396226415e-06, "loss": 2.358630828857422, "step": 40800 }, { "epoch": 0.51125, "grad_norm": 26.92227554321289, "learning_rate": 4.918364779874214e-06, "loss": 2.375510711669922, "step": 40900 }, { "epoch": 0.5125, "grad_norm": 24.53324317932129, "learning_rate": 4.9057861635220124e-06, "loss": 2.3742355346679687, "step": 41000 }, { "epoch": 0.51375, "grad_norm": 1.7757785320281982, "learning_rate": 4.893207547169812e-06, "loss": 2.1228904724121094, "step": 41100 }, { "epoch": 0.515, "grad_norm": 1.370865821838379, "learning_rate": 4.8806289308176104e-06, "loss": 2.2730003356933595, "step": 41200 }, { "epoch": 0.51625, "grad_norm": 27.052255630493164, "learning_rate": 4.868050314465409e-06, "loss": 2.1805772399902343, "step": 41300 }, { "epoch": 0.5175, "grad_norm": 19.289770126342773, "learning_rate": 4.855471698113208e-06, "loss": 2.3394677734375, "step": 41400 }, { "epoch": 0.51875, "grad_norm": 0.14031943678855896, "learning_rate": 4.842893081761006e-06, "loss": 2.16936767578125, "step": 41500 }, { "epoch": 0.52, "grad_norm": 0.3180188536643982, "learning_rate": 4.830314465408806e-06, "loss": 2.288330383300781, "step": 41600 }, { "epoch": 0.52125, "grad_norm": 28.790279388427734, "learning_rate": 4.817735849056604e-06, "loss": 2.2974285888671875, "step": 41700 }, { "epoch": 0.5225, "grad_norm": 29.82061767578125, "learning_rate": 4.805157232704403e-06, "loss": 2.2505938720703127, "step": 41800 }, { "epoch": 0.52375, "grad_norm": 1.3207628726959229, "learning_rate": 4.792578616352201e-06, "loss": 2.1726535034179686, "step": 41900 }, { "epoch": 0.525, "grad_norm": 19.538122177124023, "learning_rate": 4.78e-06, "loss": 2.3601947021484375, "step": 42000 }, { "epoch": 0.52625, "grad_norm": 23.45235252380371, "learning_rate": 4.7674213836477986e-06, "loss": 2.3694439697265626, "step": 42100 }, { "epoch": 0.5275, "grad_norm": 29.030534744262695, "learning_rate": 4.754842767295598e-06, "loss": 2.375699462890625, "step": 42200 }, { "epoch": 0.52875, "grad_norm": 0.05739065632224083, "learning_rate": 4.742264150943397e-06, "loss": 2.2624081420898436, "step": 42300 }, { "epoch": 0.53, "grad_norm": 0.09302949905395508, "learning_rate": 4.729685534591195e-06, "loss": 2.418585205078125, "step": 42400 }, { "epoch": 0.53125, "grad_norm": 20.36932945251465, "learning_rate": 4.717106918238994e-06, "loss": 2.7232342529296876, "step": 42500 }, { "epoch": 0.5325, "grad_norm": 25.583152770996094, "learning_rate": 4.704528301886792e-06, "loss": 2.5170713806152345, "step": 42600 }, { "epoch": 0.53375, "grad_norm": 27.926185607910156, "learning_rate": 4.691949685534592e-06, "loss": 2.2554306030273437, "step": 42700 }, { "epoch": 0.535, "grad_norm": 0.015048661269247532, "learning_rate": 4.67937106918239e-06, "loss": 2.410753173828125, "step": 42800 }, { "epoch": 0.53625, "grad_norm": 22.782848358154297, "learning_rate": 4.666792452830189e-06, "loss": 2.506884765625, "step": 42900 }, { "epoch": 0.5375, "grad_norm": 27.83270263671875, "learning_rate": 4.6542138364779875e-06, "loss": 2.42444091796875, "step": 43000 }, { "epoch": 0.53875, "grad_norm": 14.226387023925781, "learning_rate": 4.641635220125786e-06, "loss": 2.2237681579589843, "step": 43100 }, { "epoch": 0.54, "grad_norm": 30.74934196472168, "learning_rate": 4.6290566037735855e-06, "loss": 1.8440298461914062, "step": 43200 }, { "epoch": 0.54125, "grad_norm": 26.56260871887207, "learning_rate": 4.616477987421384e-06, "loss": 2.5520880126953127, "step": 43300 }, { "epoch": 0.5425, "grad_norm": 26.363876342773438, "learning_rate": 4.603899371069183e-06, "loss": 2.6202496337890624, "step": 43400 }, { "epoch": 0.54375, "grad_norm": 24.67633628845215, "learning_rate": 4.591320754716981e-06, "loss": 2.253356170654297, "step": 43500 }, { "epoch": 0.545, "grad_norm": 0.01842404529452324, "learning_rate": 4.57874213836478e-06, "loss": 2.283622283935547, "step": 43600 }, { "epoch": 0.54625, "grad_norm": 15.855024337768555, "learning_rate": 4.5661635220125785e-06, "loss": 2.501055908203125, "step": 43700 }, { "epoch": 0.5475, "grad_norm": 26.492700576782227, "learning_rate": 4.553584905660378e-06, "loss": 2.426959228515625, "step": 43800 }, { "epoch": 0.54875, "grad_norm": 7.623685836791992, "learning_rate": 4.5410062893081765e-06, "loss": 1.9829179382324218, "step": 43900 }, { "epoch": 0.55, "grad_norm": 17.01411247253418, "learning_rate": 4.528427672955975e-06, "loss": 2.1798980712890623, "step": 44000 }, { "epoch": 0.55125, "grad_norm": 21.383556365966797, "learning_rate": 4.515849056603774e-06, "loss": 2.0453337097167967, "step": 44100 }, { "epoch": 0.5525, "grad_norm": 22.872831344604492, "learning_rate": 4.503270440251572e-06, "loss": 2.4065341186523437, "step": 44200 }, { "epoch": 0.55375, "grad_norm": 0.3448590934276581, "learning_rate": 4.490691823899372e-06, "loss": 2.0755601501464844, "step": 44300 }, { "epoch": 0.555, "grad_norm": 64.99383544921875, "learning_rate": 4.47811320754717e-06, "loss": 2.167461242675781, "step": 44400 }, { "epoch": 0.55625, "grad_norm": 26.849061965942383, "learning_rate": 4.465534591194969e-06, "loss": 2.392653045654297, "step": 44500 }, { "epoch": 0.5575, "grad_norm": 22.663536071777344, "learning_rate": 4.4529559748427674e-06, "loss": 2.6425860595703123, "step": 44600 }, { "epoch": 0.55875, "grad_norm": 0.09723508358001709, "learning_rate": 4.440377358490566e-06, "loss": 2.041617431640625, "step": 44700 }, { "epoch": 0.56, "grad_norm": 0.6772779822349548, "learning_rate": 4.427798742138365e-06, "loss": 2.0510581970214843, "step": 44800 }, { "epoch": 0.56125, "grad_norm": 23.326744079589844, "learning_rate": 4.415220125786164e-06, "loss": 2.28655029296875, "step": 44900 }, { "epoch": 0.5625, "grad_norm": 22.986696243286133, "learning_rate": 4.402641509433963e-06, "loss": 2.162388916015625, "step": 45000 }, { "epoch": 0.56375, "grad_norm": 0.11825785040855408, "learning_rate": 4.390062893081761e-06, "loss": 2.4120408630371095, "step": 45100 }, { "epoch": 0.565, "grad_norm": 0.07316289842128754, "learning_rate": 4.37748427672956e-06, "loss": 1.9428927612304687, "step": 45200 }, { "epoch": 0.56625, "grad_norm": 19.97572135925293, "learning_rate": 4.364905660377358e-06, "loss": 2.409757537841797, "step": 45300 }, { "epoch": 0.5675, "grad_norm": 21.209138870239258, "learning_rate": 4.352327044025158e-06, "loss": 2.303609619140625, "step": 45400 }, { "epoch": 0.56875, "grad_norm": 0.48939380049705505, "learning_rate": 4.339748427672956e-06, "loss": 2.2687278747558595, "step": 45500 }, { "epoch": 0.57, "grad_norm": 2.690356492996216, "learning_rate": 4.327169811320755e-06, "loss": 2.068523406982422, "step": 45600 }, { "epoch": 0.57125, "grad_norm": 24.331865310668945, "learning_rate": 4.3145911949685536e-06, "loss": 2.371395721435547, "step": 45700 }, { "epoch": 0.5725, "grad_norm": 19.168750762939453, "learning_rate": 4.302012578616352e-06, "loss": 2.48111083984375, "step": 45800 }, { "epoch": 0.57375, "grad_norm": 1.1316941976547241, "learning_rate": 4.289433962264152e-06, "loss": 2.1914462280273437, "step": 45900 }, { "epoch": 0.575, "grad_norm": 0.7238625884056091, "learning_rate": 4.27685534591195e-06, "loss": 2.408410949707031, "step": 46000 }, { "epoch": 0.57625, "grad_norm": 18.509540557861328, "learning_rate": 4.264276729559749e-06, "loss": 2.5385202026367186, "step": 46100 }, { "epoch": 0.5775, "grad_norm": 24.170326232910156, "learning_rate": 4.251698113207547e-06, "loss": 2.726481628417969, "step": 46200 }, { "epoch": 0.57875, "grad_norm": 5.890894889831543, "learning_rate": 4.239119496855346e-06, "loss": 2.3738558959960936, "step": 46300 }, { "epoch": 0.58, "grad_norm": 0.009730951860547066, "learning_rate": 4.2265408805031445e-06, "loss": 2.6054721069335938, "step": 46400 }, { "epoch": 0.58125, "grad_norm": 24.269445419311523, "learning_rate": 4.213962264150944e-06, "loss": 2.3926129150390625, "step": 46500 }, { "epoch": 0.5825, "grad_norm": 20.738441467285156, "learning_rate": 4.2013836477987425e-06, "loss": 2.491241455078125, "step": 46600 }, { "epoch": 0.58375, "grad_norm": 1.0278619527816772, "learning_rate": 4.188805031446541e-06, "loss": 2.247518768310547, "step": 46700 }, { "epoch": 0.585, "grad_norm": 4.836909294128418, "learning_rate": 4.17622641509434e-06, "loss": 2.104373016357422, "step": 46800 }, { "epoch": 0.58625, "grad_norm": 31.359527587890625, "learning_rate": 4.163647798742138e-06, "loss": 2.190884704589844, "step": 46900 }, { "epoch": 0.5875, "grad_norm": 33.07499313354492, "learning_rate": 4.151069182389938e-06, "loss": 2.332211608886719, "step": 47000 }, { "epoch": 0.58875, "grad_norm": 18.392343521118164, "learning_rate": 4.138490566037736e-06, "loss": 2.077041015625, "step": 47100 }, { "epoch": 0.59, "grad_norm": 0.6778843998908997, "learning_rate": 4.125911949685535e-06, "loss": 2.148955993652344, "step": 47200 }, { "epoch": 0.59125, "grad_norm": 29.361854553222656, "learning_rate": 4.1133333333333335e-06, "loss": 2.404347381591797, "step": 47300 }, { "epoch": 0.5925, "grad_norm": 21.975435256958008, "learning_rate": 4.100754716981132e-06, "loss": 2.4371409606933594, "step": 47400 }, { "epoch": 0.59375, "grad_norm": 0.4038066864013672, "learning_rate": 4.0881761006289315e-06, "loss": 2.4506686401367186, "step": 47500 }, { "epoch": 0.595, "grad_norm": 0.0259912870824337, "learning_rate": 4.07559748427673e-06, "loss": 2.2095245361328124, "step": 47600 }, { "epoch": 0.59625, "grad_norm": 24.107616424560547, "learning_rate": 4.063018867924529e-06, "loss": 2.3150425720214844, "step": 47700 }, { "epoch": 0.5975, "grad_norm": 22.568410873413086, "learning_rate": 4.050440251572327e-06, "loss": 2.3050765991210938, "step": 47800 }, { "epoch": 0.59875, "grad_norm": 1.4052597284317017, "learning_rate": 4.037861635220126e-06, "loss": 2.3776676940917967, "step": 47900 }, { "epoch": 0.6, "grad_norm": 5.013542652130127, "learning_rate": 4.0252830188679244e-06, "loss": 2.1164060974121095, "step": 48000 }, { "epoch": 0.60125, "grad_norm": 24.288990020751953, "learning_rate": 4.012704402515724e-06, "loss": 2.0022381591796874, "step": 48100 }, { "epoch": 0.6025, "grad_norm": 26.07032012939453, "learning_rate": 4.0001257861635224e-06, "loss": 2.1755996704101563, "step": 48200 }, { "epoch": 0.60375, "grad_norm": 1.911749005317688, "learning_rate": 3.987547169811321e-06, "loss": 2.01885986328125, "step": 48300 }, { "epoch": 0.605, "grad_norm": 12.477325439453125, "learning_rate": 3.97496855345912e-06, "loss": 2.368613739013672, "step": 48400 }, { "epoch": 0.60625, "grad_norm": 21.526126861572266, "learning_rate": 3.962389937106918e-06, "loss": 2.3319718933105467, "step": 48500 }, { "epoch": 0.6075, "grad_norm": 21.947265625, "learning_rate": 3.949811320754718e-06, "loss": 2.5695175170898437, "step": 48600 }, { "epoch": 0.60875, "grad_norm": 9.083207130432129, "learning_rate": 3.937232704402516e-06, "loss": 2.4009765625, "step": 48700 }, { "epoch": 0.61, "grad_norm": 9.588478088378906, "learning_rate": 3.924654088050315e-06, "loss": 2.1275369262695314, "step": 48800 }, { "epoch": 0.61125, "grad_norm": 16.444225311279297, "learning_rate": 3.912075471698113e-06, "loss": 2.22287841796875, "step": 48900 }, { "epoch": 0.6125, "grad_norm": 20.83136558532715, "learning_rate": 3.899496855345912e-06, "loss": 2.3845196533203126, "step": 49000 }, { "epoch": 0.61375, "grad_norm": 10.17677116394043, "learning_rate": 3.886918238993711e-06, "loss": 2.190762176513672, "step": 49100 }, { "epoch": 0.615, "grad_norm": 0.006293127313256264, "learning_rate": 3.87433962264151e-06, "loss": 2.2727685546875, "step": 49200 }, { "epoch": 0.61625, "grad_norm": 25.517677307128906, "learning_rate": 3.861761006289309e-06, "loss": 2.267947082519531, "step": 49300 }, { "epoch": 0.6175, "grad_norm": 28.428192138671875, "learning_rate": 3.849182389937107e-06, "loss": 2.418175048828125, "step": 49400 }, { "epoch": 0.61875, "grad_norm": 27.646535873413086, "learning_rate": 3.836603773584906e-06, "loss": 2.280270233154297, "step": 49500 }, { "epoch": 0.62, "grad_norm": 0.03348470479249954, "learning_rate": 3.824025157232704e-06, "loss": 2.0512631225585936, "step": 49600 }, { "epoch": 0.62125, "grad_norm": 29.742656707763672, "learning_rate": 3.8114465408805033e-06, "loss": 2.4671990966796873, "step": 49700 }, { "epoch": 0.6225, "grad_norm": 23.064598083496094, "learning_rate": 3.7988679245283024e-06, "loss": 2.4474307250976564, "step": 49800 }, { "epoch": 0.62375, "grad_norm": 2.3941636085510254, "learning_rate": 3.786289308176101e-06, "loss": 2.03037109375, "step": 49900 }, { "epoch": 0.625, "grad_norm": 0.19696100056171417, "learning_rate": 3.7737106918238995e-06, "loss": 2.0240495300292967, "step": 50000 }, { "epoch": 0.62625, "grad_norm": 26.428871154785156, "learning_rate": 3.7611320754716985e-06, "loss": 2.200367889404297, "step": 50100 }, { "epoch": 0.6275, "grad_norm": 23.72710609436035, "learning_rate": 3.748553459119497e-06, "loss": 2.7000701904296873, "step": 50200 }, { "epoch": 0.62875, "grad_norm": 27.32769012451172, "learning_rate": 3.735974842767296e-06, "loss": 2.0785140991210938, "step": 50300 }, { "epoch": 0.63, "grad_norm": 32.80459213256836, "learning_rate": 3.7233962264150947e-06, "loss": 2.2123362731933596, "step": 50400 }, { "epoch": 0.63125, "grad_norm": 26.444387435913086, "learning_rate": 3.7108176100628933e-06, "loss": 2.479073944091797, "step": 50500 }, { "epoch": 0.6325, "grad_norm": 28.786527633666992, "learning_rate": 3.6982389937106923e-06, "loss": 2.289686737060547, "step": 50600 }, { "epoch": 0.63375, "grad_norm": 0.1663801074028015, "learning_rate": 3.685660377358491e-06, "loss": 2.1976898193359373, "step": 50700 }, { "epoch": 0.635, "grad_norm": 30.683305740356445, "learning_rate": 3.6730817610062895e-06, "loss": 2.2509585571289064, "step": 50800 }, { "epoch": 0.63625, "grad_norm": 22.308727264404297, "learning_rate": 3.6605031446540885e-06, "loss": 2.5132717895507812, "step": 50900 }, { "epoch": 0.6375, "grad_norm": 19.899017333984375, "learning_rate": 3.647924528301887e-06, "loss": 2.246257781982422, "step": 51000 }, { "epoch": 0.63875, "grad_norm": 0.14116325974464417, "learning_rate": 3.635345911949686e-06, "loss": 2.05459228515625, "step": 51100 }, { "epoch": 0.64, "grad_norm": 0.020385975018143654, "learning_rate": 3.6227672955974847e-06, "loss": 2.099994812011719, "step": 51200 }, { "epoch": 0.64125, "grad_norm": 17.793832778930664, "learning_rate": 3.6101886792452833e-06, "loss": 2.259058074951172, "step": 51300 }, { "epoch": 0.6425, "grad_norm": 21.187685012817383, "learning_rate": 3.5976100628930823e-06, "loss": 2.204525146484375, "step": 51400 }, { "epoch": 0.64375, "grad_norm": 1.0163244009017944, "learning_rate": 3.585031446540881e-06, "loss": 2.2062753295898436, "step": 51500 }, { "epoch": 0.645, "grad_norm": 0.44557446241378784, "learning_rate": 3.5724528301886794e-06, "loss": 2.1636553955078126, "step": 51600 }, { "epoch": 0.64625, "grad_norm": 22.489730834960938, "learning_rate": 3.5598742138364784e-06, "loss": 2.343136444091797, "step": 51700 }, { "epoch": 0.6475, "grad_norm": 27.255525588989258, "learning_rate": 3.547295597484277e-06, "loss": 2.585829162597656, "step": 51800 }, { "epoch": 0.64875, "grad_norm": 0.30436962842941284, "learning_rate": 3.534716981132076e-06, "loss": 2.0592010498046873, "step": 51900 }, { "epoch": 0.65, "grad_norm": 0.008368916809558868, "learning_rate": 3.5221383647798746e-06, "loss": 2.0964291381835936, "step": 52000 }, { "epoch": 0.65125, "grad_norm": 25.623943328857422, "learning_rate": 3.509559748427673e-06, "loss": 2.2132614135742186, "step": 52100 }, { "epoch": 0.6525, "grad_norm": 23.970529556274414, "learning_rate": 3.4969811320754722e-06, "loss": 2.2939547729492187, "step": 52200 }, { "epoch": 0.65375, "grad_norm": 0.13999006152153015, "learning_rate": 3.484402515723271e-06, "loss": 2.1940087890625, "step": 52300 }, { "epoch": 0.655, "grad_norm": 6.885776519775391, "learning_rate": 3.4718238993710694e-06, "loss": 2.3822996520996096, "step": 52400 }, { "epoch": 0.65625, "grad_norm": 28.326892852783203, "learning_rate": 3.4592452830188684e-06, "loss": 2.7556744384765626, "step": 52500 }, { "epoch": 0.6575, "grad_norm": 29.53263282775879, "learning_rate": 3.446666666666667e-06, "loss": 2.4048992919921877, "step": 52600 }, { "epoch": 0.65875, "grad_norm": 9.168279647827148, "learning_rate": 3.434088050314466e-06, "loss": 2.3623193359375, "step": 52700 }, { "epoch": 0.66, "grad_norm": 51.9276123046875, "learning_rate": 3.4215094339622646e-06, "loss": 2.3159774780273437, "step": 52800 }, { "epoch": 0.66125, "grad_norm": 34.09861373901367, "learning_rate": 3.408930817610063e-06, "loss": 2.254779968261719, "step": 52900 }, { "epoch": 0.6625, "grad_norm": 24.50155258178711, "learning_rate": 3.396352201257862e-06, "loss": 2.4943356323242187, "step": 53000 }, { "epoch": 0.66375, "grad_norm": 1.6096267700195312, "learning_rate": 3.3837735849056608e-06, "loss": 2.4191680908203126, "step": 53100 }, { "epoch": 0.665, "grad_norm": 9.347710609436035, "learning_rate": 3.3711949685534593e-06, "loss": 2.3084637451171877, "step": 53200 }, { "epoch": 0.66625, "grad_norm": 27.17746925354004, "learning_rate": 3.3586163522012584e-06, "loss": 2.3632272338867186, "step": 53300 }, { "epoch": 0.6675, "grad_norm": 19.401647567749023, "learning_rate": 3.346037735849057e-06, "loss": 2.4689659118652343, "step": 53400 }, { "epoch": 0.66875, "grad_norm": 0.02893674187362194, "learning_rate": 3.333459119496856e-06, "loss": 2.2349473571777345, "step": 53500 }, { "epoch": 0.67, "grad_norm": 0.308145672082901, "learning_rate": 3.3208805031446545e-06, "loss": 2.4012570190429687, "step": 53600 }, { "epoch": 0.67125, "grad_norm": 27.814510345458984, "learning_rate": 3.308301886792453e-06, "loss": 2.364227752685547, "step": 53700 }, { "epoch": 0.6725, "grad_norm": 30.748811721801758, "learning_rate": 3.295723270440252e-06, "loss": 2.298586883544922, "step": 53800 }, { "epoch": 0.67375, "grad_norm": 12.556225776672363, "learning_rate": 3.2831446540880507e-06, "loss": 2.026658935546875, "step": 53900 }, { "epoch": 0.675, "grad_norm": 32.828857421875, "learning_rate": 3.2705660377358493e-06, "loss": 2.2101339721679687, "step": 54000 }, { "epoch": 0.67625, "grad_norm": 41.711971282958984, "learning_rate": 3.2579874213836483e-06, "loss": 2.2615397644042967, "step": 54100 }, { "epoch": 0.6775, "grad_norm": 22.22212791442871, "learning_rate": 3.245408805031447e-06, "loss": 2.264314880371094, "step": 54200 }, { "epoch": 0.67875, "grad_norm": 0.31344112753868103, "learning_rate": 3.232830188679246e-06, "loss": 1.843946990966797, "step": 54300 }, { "epoch": 0.68, "grad_norm": 11.150541305541992, "learning_rate": 3.2202515723270445e-06, "loss": 2.107444305419922, "step": 54400 }, { "epoch": 0.68125, "grad_norm": 27.848705291748047, "learning_rate": 3.207672955974843e-06, "loss": 2.6434942626953126, "step": 54500 }, { "epoch": 0.6825, "grad_norm": 20.511205673217773, "learning_rate": 3.195094339622642e-06, "loss": 2.2614511108398436, "step": 54600 }, { "epoch": 0.68375, "grad_norm": 15.717256546020508, "learning_rate": 3.1825157232704407e-06, "loss": 2.4019781494140626, "step": 54700 }, { "epoch": 0.685, "grad_norm": 15.28030014038086, "learning_rate": 3.1699371069182393e-06, "loss": 2.1407426452636718, "step": 54800 }, { "epoch": 0.68625, "grad_norm": 26.36769676208496, "learning_rate": 3.1573584905660383e-06, "loss": 2.4883404541015626, "step": 54900 }, { "epoch": 0.6875, "grad_norm": 30.11307144165039, "learning_rate": 3.144779874213837e-06, "loss": 2.4928182983398437, "step": 55000 }, { "epoch": 0.68875, "grad_norm": 3.050213575363159, "learning_rate": 3.132201257861636e-06, "loss": 2.242222900390625, "step": 55100 }, { "epoch": 0.69, "grad_norm": 0.08332010358572006, "learning_rate": 3.1196226415094344e-06, "loss": 2.095365447998047, "step": 55200 }, { "epoch": 0.69125, "grad_norm": 25.02206802368164, "learning_rate": 3.107044025157233e-06, "loss": 2.2118115234375, "step": 55300 }, { "epoch": 0.6925, "grad_norm": 21.8033390045166, "learning_rate": 3.094465408805032e-06, "loss": 2.372667236328125, "step": 55400 }, { "epoch": 0.69375, "grad_norm": 0.31131941080093384, "learning_rate": 3.0818867924528306e-06, "loss": 2.4498031616210936, "step": 55500 }, { "epoch": 0.695, "grad_norm": 1.234559416770935, "learning_rate": 3.069308176100629e-06, "loss": 2.095816650390625, "step": 55600 }, { "epoch": 0.69625, "grad_norm": 24.62751007080078, "learning_rate": 3.0567295597484282e-06, "loss": 2.246084442138672, "step": 55700 }, { "epoch": 0.6975, "grad_norm": 32.08023452758789, "learning_rate": 3.044150943396227e-06, "loss": 2.2728422546386717, "step": 55800 }, { "epoch": 0.69875, "grad_norm": 1.7580287456512451, "learning_rate": 3.0315723270440254e-06, "loss": 1.912986297607422, "step": 55900 }, { "epoch": 0.7, "grad_norm": 0.024206427857279778, "learning_rate": 3.0189937106918244e-06, "loss": 2.12907470703125, "step": 56000 }, { "epoch": 0.70125, "grad_norm": 26.95330810546875, "learning_rate": 3.006415094339623e-06, "loss": 2.421792297363281, "step": 56100 }, { "epoch": 0.7025, "grad_norm": 22.763641357421875, "learning_rate": 2.993836477987422e-06, "loss": 2.2753536987304686, "step": 56200 }, { "epoch": 0.70375, "grad_norm": 6.763653755187988, "learning_rate": 2.9812578616352206e-06, "loss": 2.017315673828125, "step": 56300 }, { "epoch": 0.705, "grad_norm": 0.030549824237823486, "learning_rate": 2.968679245283019e-06, "loss": 2.2717471313476563, "step": 56400 }, { "epoch": 0.70625, "grad_norm": 30.307323455810547, "learning_rate": 2.956100628930818e-06, "loss": 2.375959777832031, "step": 56500 }, { "epoch": 0.7075, "grad_norm": 22.114675521850586, "learning_rate": 2.9435220125786168e-06, "loss": 2.2014059448242187, "step": 56600 }, { "epoch": 0.70875, "grad_norm": 0.3245386779308319, "learning_rate": 2.9309433962264153e-06, "loss": 1.9942924499511718, "step": 56700 }, { "epoch": 0.71, "grad_norm": 0.012478250078856945, "learning_rate": 2.9183647798742144e-06, "loss": 2.166055908203125, "step": 56800 }, { "epoch": 0.71125, "grad_norm": 25.91703987121582, "learning_rate": 2.905786163522013e-06, "loss": 2.2955760192871093, "step": 56900 }, { "epoch": 0.7125, "grad_norm": 23.741256713867188, "learning_rate": 2.893207547169812e-06, "loss": 2.308643493652344, "step": 57000 }, { "epoch": 0.71375, "grad_norm": 32.72561264038086, "learning_rate": 2.8806289308176105e-06, "loss": 2.2564122009277345, "step": 57100 }, { "epoch": 0.715, "grad_norm": 0.09459064900875092, "learning_rate": 2.868050314465409e-06, "loss": 2.263880615234375, "step": 57200 }, { "epoch": 0.71625, "grad_norm": 31.514862060546875, "learning_rate": 2.855471698113208e-06, "loss": 2.22376953125, "step": 57300 }, { "epoch": 0.7175, "grad_norm": 23.945999145507812, "learning_rate": 2.8428930817610067e-06, "loss": 2.121052703857422, "step": 57400 }, { "epoch": 0.71875, "grad_norm": 11.593914985656738, "learning_rate": 2.8303144654088053e-06, "loss": 2.0279052734375, "step": 57500 }, { "epoch": 0.72, "grad_norm": 0.03185856342315674, "learning_rate": 2.8177358490566043e-06, "loss": 1.9632949829101562, "step": 57600 }, { "epoch": 0.72125, "grad_norm": 26.430574417114258, "learning_rate": 2.805157232704403e-06, "loss": 2.224911804199219, "step": 57700 }, { "epoch": 0.7225, "grad_norm": 23.202377319335938, "learning_rate": 2.792578616352202e-06, "loss": 2.3928271484375, "step": 57800 }, { "epoch": 0.72375, "grad_norm": 6.8675127029418945, "learning_rate": 2.7800000000000005e-06, "loss": 2.345365447998047, "step": 57900 }, { "epoch": 0.725, "grad_norm": 7.9162750244140625, "learning_rate": 2.767421383647799e-06, "loss": 2.40666015625, "step": 58000 }, { "epoch": 0.72625, "grad_norm": 28.883695602416992, "learning_rate": 2.754842767295598e-06, "loss": 2.1290104675292967, "step": 58100 }, { "epoch": 0.7275, "grad_norm": 28.86813735961914, "learning_rate": 2.7422641509433967e-06, "loss": 2.3013229370117188, "step": 58200 }, { "epoch": 0.72875, "grad_norm": 0.05526283383369446, "learning_rate": 2.7296855345911953e-06, "loss": 2.349634246826172, "step": 58300 }, { "epoch": 0.73, "grad_norm": 0.46888020634651184, "learning_rate": 2.7171069182389943e-06, "loss": 2.269898681640625, "step": 58400 }, { "epoch": 0.73125, "grad_norm": 21.197877883911133, "learning_rate": 2.704528301886793e-06, "loss": 2.294650115966797, "step": 58500 }, { "epoch": 0.7325, "grad_norm": 28.490921020507812, "learning_rate": 2.691949685534592e-06, "loss": 2.507839813232422, "step": 58600 }, { "epoch": 0.73375, "grad_norm": 0.9312068819999695, "learning_rate": 2.6793710691823904e-06, "loss": 2.1638980102539063, "step": 58700 }, { "epoch": 0.735, "grad_norm": 7.32960319519043, "learning_rate": 2.666792452830189e-06, "loss": 2.1146189880371096, "step": 58800 }, { "epoch": 0.73625, "grad_norm": 16.3245792388916, "learning_rate": 2.654213836477988e-06, "loss": 2.353025360107422, "step": 58900 }, { "epoch": 0.7375, "grad_norm": 28.876869201660156, "learning_rate": 2.641635220125786e-06, "loss": 2.3220205688476563, "step": 59000 }, { "epoch": 0.73875, "grad_norm": 1.1006479263305664, "learning_rate": 2.6290566037735848e-06, "loss": 2.3847764587402343, "step": 59100 }, { "epoch": 0.74, "grad_norm": 0.052632566541433334, "learning_rate": 2.616477987421384e-06, "loss": 1.9747433471679687, "step": 59200 }, { "epoch": 0.74125, "grad_norm": 27.36806297302246, "learning_rate": 2.6038993710691824e-06, "loss": 2.234722137451172, "step": 59300 }, { "epoch": 0.7425, "grad_norm": 32.45076370239258, "learning_rate": 2.591320754716981e-06, "loss": 2.2224740600585937, "step": 59400 }, { "epoch": 0.74375, "grad_norm": 0.18155953288078308, "learning_rate": 2.57874213836478e-06, "loss": 2.1847653198242187, "step": 59500 }, { "epoch": 0.745, "grad_norm": 21.358049392700195, "learning_rate": 2.5661635220125786e-06, "loss": 2.1930168151855467, "step": 59600 }, { "epoch": 0.74625, "grad_norm": 23.599361419677734, "learning_rate": 2.553584905660377e-06, "loss": 2.13337646484375, "step": 59700 }, { "epoch": 0.7475, "grad_norm": 25.336759567260742, "learning_rate": 2.541006289308176e-06, "loss": 2.245163879394531, "step": 59800 }, { "epoch": 0.74875, "grad_norm": 14.410560607910156, "learning_rate": 2.5284276729559747e-06, "loss": 2.1882379150390623, "step": 59900 }, { "epoch": 0.75, "grad_norm": 17.1163330078125, "learning_rate": 2.5158490566037737e-06, "loss": 1.899642791748047, "step": 60000 }, { "epoch": 0.75125, "grad_norm": 32.30154800415039, "learning_rate": 2.5032704402515723e-06, "loss": 2.124136962890625, "step": 60100 }, { "epoch": 0.7525, "grad_norm": 34.646514892578125, "learning_rate": 2.4906918238993713e-06, "loss": 2.2636474609375, "step": 60200 }, { "epoch": 0.75375, "grad_norm": 0.17916053533554077, "learning_rate": 2.47811320754717e-06, "loss": 2.10548095703125, "step": 60300 }, { "epoch": 0.755, "grad_norm": 0.3836560845375061, "learning_rate": 2.465534591194969e-06, "loss": 2.248681640625, "step": 60400 }, { "epoch": 0.75625, "grad_norm": 25.696950912475586, "learning_rate": 2.4529559748427675e-06, "loss": 2.4749154663085937, "step": 60500 }, { "epoch": 0.7575, "grad_norm": 11.523337364196777, "learning_rate": 2.440377358490566e-06, "loss": 2.1865379333496096, "step": 60600 }, { "epoch": 0.75875, "grad_norm": 0.2725467383861542, "learning_rate": 2.427798742138365e-06, "loss": 2.069666442871094, "step": 60700 }, { "epoch": 0.76, "grad_norm": 0.008206835016608238, "learning_rate": 2.4152201257861637e-06, "loss": 2.1584426879882814, "step": 60800 }, { "epoch": 0.76125, "grad_norm": 28.906057357788086, "learning_rate": 2.4026415094339627e-06, "loss": 2.254979705810547, "step": 60900 }, { "epoch": 0.7625, "grad_norm": 12.576886177062988, "learning_rate": 2.3900628930817613e-06, "loss": 2.25433349609375, "step": 61000 }, { "epoch": 0.76375, "grad_norm": 10.839340209960938, "learning_rate": 2.37748427672956e-06, "loss": 2.084885711669922, "step": 61100 }, { "epoch": 0.765, "grad_norm": 0.06599520146846771, "learning_rate": 2.364905660377359e-06, "loss": 2.23760986328125, "step": 61200 }, { "epoch": 0.76625, "grad_norm": 31.791736602783203, "learning_rate": 2.3523270440251575e-06, "loss": 2.42175537109375, "step": 61300 }, { "epoch": 0.7675, "grad_norm": 29.30803680419922, "learning_rate": 2.339748427672956e-06, "loss": 2.215085754394531, "step": 61400 }, { "epoch": 0.76875, "grad_norm": 5.6741461753845215, "learning_rate": 2.327169811320755e-06, "loss": 2.066580810546875, "step": 61500 }, { "epoch": 0.77, "grad_norm": 0.006725333631038666, "learning_rate": 2.3145911949685537e-06, "loss": 2.1789097595214844, "step": 61600 }, { "epoch": 0.77125, "grad_norm": 21.000843048095703, "learning_rate": 2.3020125786163527e-06, "loss": 2.4083587646484377, "step": 61700 }, { "epoch": 0.7725, "grad_norm": 24.83283042907715, "learning_rate": 2.2894339622641513e-06, "loss": 2.813940124511719, "step": 61800 }, { "epoch": 0.77375, "grad_norm": 0.34976866841316223, "learning_rate": 2.27685534591195e-06, "loss": 2.1371139526367187, "step": 61900 }, { "epoch": 0.775, "grad_norm": 0.2197369635105133, "learning_rate": 2.264276729559749e-06, "loss": 1.9622064208984376, "step": 62000 }, { "epoch": 0.77625, "grad_norm": 23.31989860534668, "learning_rate": 2.2516981132075474e-06, "loss": 2.1933587646484374, "step": 62100 }, { "epoch": 0.7775, "grad_norm": 26.234939575195312, "learning_rate": 2.239119496855346e-06, "loss": 2.341946716308594, "step": 62200 }, { "epoch": 0.77875, "grad_norm": 27.059640884399414, "learning_rate": 2.226540880503145e-06, "loss": 2.026497497558594, "step": 62300 }, { "epoch": 0.78, "grad_norm": 10.47779369354248, "learning_rate": 2.2139622641509436e-06, "loss": 2.1496810913085938, "step": 62400 }, { "epoch": 0.78125, "grad_norm": 14.565492630004883, "learning_rate": 2.2013836477987426e-06, "loss": 2.1073020935058593, "step": 62500 }, { "epoch": 0.7825, "grad_norm": 26.76126480102539, "learning_rate": 2.188805031446541e-06, "loss": 2.1374913024902344, "step": 62600 }, { "epoch": 0.78375, "grad_norm": 1.1460466384887695, "learning_rate": 2.17622641509434e-06, "loss": 2.4106900024414064, "step": 62700 }, { "epoch": 0.785, "grad_norm": 0.04275045916438103, "learning_rate": 2.163647798742139e-06, "loss": 1.977836151123047, "step": 62800 }, { "epoch": 0.78625, "grad_norm": 29.69572639465332, "learning_rate": 2.1510691823899374e-06, "loss": 2.211481628417969, "step": 62900 }, { "epoch": 0.7875, "grad_norm": 27.114084243774414, "learning_rate": 2.138490566037736e-06, "loss": 2.3308370971679686, "step": 63000 }, { "epoch": 0.78875, "grad_norm": 0.024876583367586136, "learning_rate": 2.125911949685535e-06, "loss": 2.2788821411132814, "step": 63100 }, { "epoch": 0.79, "grad_norm": 0.011269732378423214, "learning_rate": 2.1133333333333336e-06, "loss": 2.2254403686523436, "step": 63200 }, { "epoch": 0.79125, "grad_norm": 29.721452713012695, "learning_rate": 2.1007547169811326e-06, "loss": 2.2361138916015624, "step": 63300 }, { "epoch": 0.7925, "grad_norm": 26.52519416809082, "learning_rate": 2.088176100628931e-06, "loss": 2.304778747558594, "step": 63400 }, { "epoch": 0.79375, "grad_norm": 0.33907511830329895, "learning_rate": 2.0755974842767297e-06, "loss": 2.1444125366210938, "step": 63500 }, { "epoch": 0.795, "grad_norm": 0.848170280456543, "learning_rate": 2.0630188679245288e-06, "loss": 2.224850311279297, "step": 63600 }, { "epoch": 0.79625, "grad_norm": 30.878385543823242, "learning_rate": 2.0504402515723273e-06, "loss": 2.3003367614746093, "step": 63700 }, { "epoch": 0.7975, "grad_norm": 21.298805236816406, "learning_rate": 2.037861635220126e-06, "loss": 2.251806640625, "step": 63800 }, { "epoch": 0.79875, "grad_norm": 1.4935413599014282, "learning_rate": 2.025283018867925e-06, "loss": 2.0355984497070314, "step": 63900 }, { "epoch": 0.8, "grad_norm": 0.23573997616767883, "learning_rate": 2.012704402515723e-06, "loss": 2.3094590759277343, "step": 64000 }, { "epoch": 0.80125, "grad_norm": 21.646190643310547, "learning_rate": 2.000125786163522e-06, "loss": 2.45231689453125, "step": 64100 }, { "epoch": 0.8025, "grad_norm": 26.4865779876709, "learning_rate": 1.9875471698113207e-06, "loss": 2.481203918457031, "step": 64200 }, { "epoch": 0.80375, "grad_norm": 16.815086364746094, "learning_rate": 1.9749685534591197e-06, "loss": 2.274866180419922, "step": 64300 }, { "epoch": 0.805, "grad_norm": 1.5168429613113403, "learning_rate": 1.9623899371069183e-06, "loss": 2.090848083496094, "step": 64400 }, { "epoch": 0.80625, "grad_norm": 24.665088653564453, "learning_rate": 1.949811320754717e-06, "loss": 2.149015197753906, "step": 64500 }, { "epoch": 0.8075, "grad_norm": 44.3161735534668, "learning_rate": 1.937232704402516e-06, "loss": 2.267424774169922, "step": 64600 }, { "epoch": 0.80875, "grad_norm": 14.768138885498047, "learning_rate": 1.9246540880503145e-06, "loss": 2.1676406860351562, "step": 64700 }, { "epoch": 0.81, "grad_norm": 23.735509872436523, "learning_rate": 1.912075471698113e-06, "loss": 2.066508026123047, "step": 64800 }, { "epoch": 0.81125, "grad_norm": 27.178951263427734, "learning_rate": 1.899496855345912e-06, "loss": 2.202220916748047, "step": 64900 }, { "epoch": 0.8125, "grad_norm": 27.76671600341797, "learning_rate": 1.8869182389937107e-06, "loss": 2.390093994140625, "step": 65000 }, { "epoch": 0.81375, "grad_norm": 22.02654266357422, "learning_rate": 1.8743396226415094e-06, "loss": 2.02405517578125, "step": 65100 }, { "epoch": 0.815, "grad_norm": 0.019850876182317734, "learning_rate": 1.8617610062893082e-06, "loss": 2.1032846069335935, "step": 65200 }, { "epoch": 0.81625, "grad_norm": 26.451114654541016, "learning_rate": 1.849182389937107e-06, "loss": 2.0573655700683595, "step": 65300 }, { "epoch": 0.8175, "grad_norm": 25.203014373779297, "learning_rate": 1.8366037735849056e-06, "loss": 2.23822265625, "step": 65400 }, { "epoch": 0.81875, "grad_norm": 0.07573448121547699, "learning_rate": 1.8240251572327044e-06, "loss": 2.170495910644531, "step": 65500 }, { "epoch": 0.82, "grad_norm": 0.6197527647018433, "learning_rate": 1.8114465408805032e-06, "loss": 2.0060690307617186, "step": 65600 }, { "epoch": 0.82125, "grad_norm": 29.55573081970215, "learning_rate": 1.798867924528302e-06, "loss": 2.104353942871094, "step": 65700 }, { "epoch": 0.8225, "grad_norm": 13.039762496948242, "learning_rate": 1.7862893081761006e-06, "loss": 1.940672607421875, "step": 65800 }, { "epoch": 0.82375, "grad_norm": 10.955610275268555, "learning_rate": 1.7737106918238994e-06, "loss": 1.850831298828125, "step": 65900 }, { "epoch": 0.825, "grad_norm": 28.8968505859375, "learning_rate": 1.7611320754716982e-06, "loss": 2.2148202514648436, "step": 66000 }, { "epoch": 0.82625, "grad_norm": 20.05891990661621, "learning_rate": 1.748553459119497e-06, "loss": 2.0501080322265626, "step": 66100 }, { "epoch": 0.8275, "grad_norm": 29.854055404663086, "learning_rate": 1.7359748427672956e-06, "loss": 2.338354187011719, "step": 66200 }, { "epoch": 0.82875, "grad_norm": 0.17976756393909454, "learning_rate": 1.7233962264150944e-06, "loss": 2.372988739013672, "step": 66300 }, { "epoch": 0.83, "grad_norm": 14.72498893737793, "learning_rate": 1.7108176100628932e-06, "loss": 2.317147674560547, "step": 66400 }, { "epoch": 0.83125, "grad_norm": 25.00243377685547, "learning_rate": 1.698238993710692e-06, "loss": 2.546220550537109, "step": 66500 }, { "epoch": 0.8325, "grad_norm": 28.58661460876465, "learning_rate": 1.6856603773584906e-06, "loss": 2.251160125732422, "step": 66600 }, { "epoch": 0.83375, "grad_norm": 24.616119384765625, "learning_rate": 1.6730817610062894e-06, "loss": 1.8937945556640625, "step": 66700 }, { "epoch": 0.835, "grad_norm": 0.021497901529073715, "learning_rate": 1.6605031446540882e-06, "loss": 2.2238792419433593, "step": 66800 }, { "epoch": 0.83625, "grad_norm": 20.581586837768555, "learning_rate": 1.647924528301887e-06, "loss": 2.025327606201172, "step": 66900 }, { "epoch": 0.8375, "grad_norm": 29.323177337646484, "learning_rate": 1.6353459119496855e-06, "loss": 2.465900115966797, "step": 67000 }, { "epoch": 0.83875, "grad_norm": 39.928653717041016, "learning_rate": 1.6227672955974843e-06, "loss": 2.1216752624511717, "step": 67100 }, { "epoch": 0.84, "grad_norm": 0.043776318430900574, "learning_rate": 1.6101886792452831e-06, "loss": 2.0655088806152344, "step": 67200 }, { "epoch": 0.84125, "grad_norm": 25.438045501708984, "learning_rate": 1.597610062893082e-06, "loss": 2.087451171875, "step": 67300 }, { "epoch": 0.8425, "grad_norm": 27.504056930541992, "learning_rate": 1.5850314465408805e-06, "loss": 2.1888687133789064, "step": 67400 }, { "epoch": 0.84375, "grad_norm": 3.616384744644165, "learning_rate": 1.5724528301886793e-06, "loss": 2.11557861328125, "step": 67500 }, { "epoch": 0.845, "grad_norm": 0.0108040077611804, "learning_rate": 1.5598742138364781e-06, "loss": 1.8791226196289061, "step": 67600 }, { "epoch": 0.84625, "grad_norm": 24.401247024536133, "learning_rate": 1.547295597484277e-06, "loss": 2.1741673278808595, "step": 67700 }, { "epoch": 0.8475, "grad_norm": 30.91659164428711, "learning_rate": 1.5347169811320755e-06, "loss": 2.1437338256835936, "step": 67800 }, { "epoch": 0.84875, "grad_norm": 0.09920002520084381, "learning_rate": 1.5221383647798743e-06, "loss": 2.361597900390625, "step": 67900 }, { "epoch": 0.85, "grad_norm": 0.24567686021327972, "learning_rate": 1.509559748427673e-06, "loss": 2.162798309326172, "step": 68000 }, { "epoch": 0.85125, "grad_norm": 30.701915740966797, "learning_rate": 1.4969811320754719e-06, "loss": 2.3576284790039064, "step": 68100 }, { "epoch": 0.8525, "grad_norm": 27.654489517211914, "learning_rate": 1.4844025157232705e-06, "loss": 2.233973083496094, "step": 68200 }, { "epoch": 0.85375, "grad_norm": 3.693110227584839, "learning_rate": 1.4718238993710693e-06, "loss": 2.0266494750976562, "step": 68300 }, { "epoch": 0.855, "grad_norm": 24.848302841186523, "learning_rate": 1.459245283018868e-06, "loss": 2.010597839355469, "step": 68400 }, { "epoch": 0.85625, "grad_norm": 29.43360137939453, "learning_rate": 1.4466666666666669e-06, "loss": 2.4028634643554687, "step": 68500 }, { "epoch": 0.8575, "grad_norm": 19.056543350219727, "learning_rate": 1.4340880503144654e-06, "loss": 2.272985076904297, "step": 68600 }, { "epoch": 0.85875, "grad_norm": 1.6909524202346802, "learning_rate": 1.4215094339622642e-06, "loss": 2.3617742919921874, "step": 68700 }, { "epoch": 0.86, "grad_norm": 21.4754695892334, "learning_rate": 1.408930817610063e-06, "loss": 2.270048828125, "step": 68800 }, { "epoch": 0.86125, "grad_norm": 23.546085357666016, "learning_rate": 1.3963522012578618e-06, "loss": 2.2229489135742186, "step": 68900 }, { "epoch": 0.8625, "grad_norm": 31.1220760345459, "learning_rate": 1.3837735849056604e-06, "loss": 2.3392955017089845, "step": 69000 }, { "epoch": 0.86375, "grad_norm": 0.902050256729126, "learning_rate": 1.3711949685534592e-06, "loss": 2.018695526123047, "step": 69100 }, { "epoch": 0.865, "grad_norm": 0.15184037387371063, "learning_rate": 1.358616352201258e-06, "loss": 2.03200439453125, "step": 69200 }, { "epoch": 0.86625, "grad_norm": 25.513137817382812, "learning_rate": 1.3460377358490568e-06, "loss": 2.394747619628906, "step": 69300 }, { "epoch": 0.8675, "grad_norm": 22.844642639160156, "learning_rate": 1.3334591194968554e-06, "loss": 2.346148681640625, "step": 69400 }, { "epoch": 0.86875, "grad_norm": 0.07020768523216248, "learning_rate": 1.3208805031446542e-06, "loss": 2.269253387451172, "step": 69500 }, { "epoch": 0.87, "grad_norm": 0.2192593514919281, "learning_rate": 1.308301886792453e-06, "loss": 2.37726806640625, "step": 69600 }, { "epoch": 0.87125, "grad_norm": 33.360755920410156, "learning_rate": 1.2957232704402518e-06, "loss": 2.214630584716797, "step": 69700 }, { "epoch": 0.8725, "grad_norm": 23.013324737548828, "learning_rate": 1.2831446540880504e-06, "loss": 2.24765869140625, "step": 69800 }, { "epoch": 0.87375, "grad_norm": 0.6654048562049866, "learning_rate": 1.2705660377358492e-06, "loss": 2.5640155029296876, "step": 69900 }, { "epoch": 0.875, "grad_norm": 3.4219138622283936, "learning_rate": 1.257987421383648e-06, "loss": 2.13403564453125, "step": 70000 }, { "epoch": 0.87625, "grad_norm": 29.68235969543457, "learning_rate": 1.2454088050314468e-06, "loss": 2.29338134765625, "step": 70100 }, { "epoch": 0.8775, "grad_norm": 23.957015991210938, "learning_rate": 1.2328301886792454e-06, "loss": 2.2298233032226564, "step": 70200 }, { "epoch": 0.87875, "grad_norm": 17.787887573242188, "learning_rate": 1.2202515723270442e-06, "loss": 2.1456781005859376, "step": 70300 }, { "epoch": 0.88, "grad_norm": 6.154158115386963, "learning_rate": 1.207672955974843e-06, "loss": 2.085262908935547, "step": 70400 }, { "epoch": 0.88125, "grad_norm": 27.724740982055664, "learning_rate": 1.1950943396226417e-06, "loss": 2.1977059936523435, "step": 70500 }, { "epoch": 0.8825, "grad_norm": 25.416337966918945, "learning_rate": 1.1825157232704403e-06, "loss": 2.048145751953125, "step": 70600 }, { "epoch": 0.88375, "grad_norm": 30.56559181213379, "learning_rate": 1.1699371069182391e-06, "loss": 2.236173553466797, "step": 70700 }, { "epoch": 0.885, "grad_norm": 0.02905646152794361, "learning_rate": 1.157358490566038e-06, "loss": 2.1224407958984375, "step": 70800 }, { "epoch": 0.88625, "grad_norm": 22.59400749206543, "learning_rate": 1.1447798742138367e-06, "loss": 2.273807373046875, "step": 70900 }, { "epoch": 0.8875, "grad_norm": 20.05291175842285, "learning_rate": 1.1322012578616353e-06, "loss": 2.122344207763672, "step": 71000 }, { "epoch": 0.88875, "grad_norm": 0.9370853900909424, "learning_rate": 1.1196226415094341e-06, "loss": 2.123062744140625, "step": 71100 }, { "epoch": 0.89, "grad_norm": 33.95699691772461, "learning_rate": 1.107044025157233e-06, "loss": 2.0445166015625, "step": 71200 }, { "epoch": 0.89125, "grad_norm": 18.907957077026367, "learning_rate": 1.0944654088050317e-06, "loss": 1.96984619140625, "step": 71300 }, { "epoch": 0.8925, "grad_norm": 28.72160530090332, "learning_rate": 1.0818867924528303e-06, "loss": 2.2017242431640627, "step": 71400 }, { "epoch": 0.89375, "grad_norm": 0.1414874643087387, "learning_rate": 1.0693081761006289e-06, "loss": 2.244500732421875, "step": 71500 }, { "epoch": 0.895, "grad_norm": 9.050920486450195, "learning_rate": 1.0567295597484277e-06, "loss": 2.038321990966797, "step": 71600 }, { "epoch": 0.89625, "grad_norm": 20.498157501220703, "learning_rate": 1.0441509433962265e-06, "loss": 2.200545349121094, "step": 71700 }, { "epoch": 0.8975, "grad_norm": 28.807994842529297, "learning_rate": 1.0315723270440253e-06, "loss": 2.4168437194824217, "step": 71800 }, { "epoch": 0.89875, "grad_norm": 13.557960510253906, "learning_rate": 1.0189937106918239e-06, "loss": 1.8951638793945313, "step": 71900 }, { "epoch": 0.9, "grad_norm": 0.04173203930258751, "learning_rate": 1.0064150943396227e-06, "loss": 1.8145158386230469, "step": 72000 }, { "epoch": 0.90125, "grad_norm": 26.1096248626709, "learning_rate": 9.938364779874214e-07, "loss": 1.9488568115234375, "step": 72100 }, { "epoch": 0.9025, "grad_norm": 26.35661506652832, "learning_rate": 9.812578616352202e-07, "loss": 2.003428649902344, "step": 72200 }, { "epoch": 0.90375, "grad_norm": 0.6807858943939209, "learning_rate": 9.686792452830188e-07, "loss": 2.1505252075195314, "step": 72300 }, { "epoch": 0.905, "grad_norm": 8.778648376464844, "learning_rate": 9.561006289308176e-07, "loss": 1.9629728698730469, "step": 72400 }, { "epoch": 0.90625, "grad_norm": 21.047719955444336, "learning_rate": 9.435220125786164e-07, "loss": 2.397852020263672, "step": 72500 }, { "epoch": 0.9075, "grad_norm": 28.41111946105957, "learning_rate": 9.309433962264151e-07, "loss": 2.4121713256835937, "step": 72600 }, { "epoch": 0.90875, "grad_norm": 0.15218345820903778, "learning_rate": 9.183647798742139e-07, "loss": 2.357157897949219, "step": 72700 }, { "epoch": 0.91, "grad_norm": 0.017881672829389572, "learning_rate": 9.057861635220126e-07, "loss": 1.9150712585449219, "step": 72800 }, { "epoch": 0.91125, "grad_norm": 25.772994995117188, "learning_rate": 8.932075471698114e-07, "loss": 2.163759460449219, "step": 72900 }, { "epoch": 0.9125, "grad_norm": 23.19663429260254, "learning_rate": 8.806289308176101e-07, "loss": 2.012795867919922, "step": 73000 }, { "epoch": 0.91375, "grad_norm": 4.303622722625732, "learning_rate": 8.680503144654089e-07, "loss": 2.200050048828125, "step": 73100 }, { "epoch": 0.915, "grad_norm": 0.030221115797758102, "learning_rate": 8.554716981132076e-07, "loss": 2.2937960815429688, "step": 73200 }, { "epoch": 0.91625, "grad_norm": 26.862483978271484, "learning_rate": 8.428930817610064e-07, "loss": 2.150227508544922, "step": 73300 }, { "epoch": 0.9175, "grad_norm": 21.228776931762695, "learning_rate": 8.303144654088051e-07, "loss": 2.2184078979492186, "step": 73400 }, { "epoch": 0.91875, "grad_norm": 4.426553726196289, "learning_rate": 8.177358490566039e-07, "loss": 2.0852456665039063, "step": 73500 }, { "epoch": 0.92, "grad_norm": 5.594974994659424, "learning_rate": 8.051572327044026e-07, "loss": 2.157823944091797, "step": 73600 }, { "epoch": 0.92125, "grad_norm": 27.547351837158203, "learning_rate": 7.925786163522014e-07, "loss": 2.1868157958984376, "step": 73700 }, { "epoch": 0.9225, "grad_norm": 29.176279067993164, "learning_rate": 7.8e-07, "loss": 2.2395770263671877, "step": 73800 }, { "epoch": 0.92375, "grad_norm": 0.281236857175827, "learning_rate": 7.674213836477988e-07, "loss": 2.0369967651367187, "step": 73900 }, { "epoch": 0.925, "grad_norm": 16.027692794799805, "learning_rate": 7.548427672955975e-07, "loss": 2.113655853271484, "step": 74000 }, { "epoch": 0.92625, "grad_norm": 31.675695419311523, "learning_rate": 7.422641509433963e-07, "loss": 2.0642041015625, "step": 74100 }, { "epoch": 0.9275, "grad_norm": 36.27760696411133, "learning_rate": 7.29685534591195e-07, "loss": 1.925348358154297, "step": 74200 }, { "epoch": 0.92875, "grad_norm": 0.32410475611686707, "learning_rate": 7.171069182389938e-07, "loss": 1.969832763671875, "step": 74300 }, { "epoch": 0.93, "grad_norm": 0.028427617624402046, "learning_rate": 7.045283018867925e-07, "loss": 1.9005357360839843, "step": 74400 }, { "epoch": 0.93125, "grad_norm": 34.29441833496094, "learning_rate": 6.919496855345913e-07, "loss": 2.174887390136719, "step": 74500 }, { "epoch": 0.9325, "grad_norm": 11.873321533203125, "learning_rate": 6.7937106918239e-07, "loss": 2.1516876220703125, "step": 74600 }, { "epoch": 0.93375, "grad_norm": 3.312832832336426, "learning_rate": 6.667924528301888e-07, "loss": 2.204561767578125, "step": 74700 }, { "epoch": 0.935, "grad_norm": 0.09160123765468597, "learning_rate": 6.542138364779875e-07, "loss": 2.13657958984375, "step": 74800 }, { "epoch": 0.93625, "grad_norm": 23.657917022705078, "learning_rate": 6.416352201257863e-07, "loss": 2.3265130615234373, "step": 74900 }, { "epoch": 0.9375, "grad_norm": 21.326000213623047, "learning_rate": 6.29056603773585e-07, "loss": 2.133411865234375, "step": 75000 }, { "epoch": 0.93875, "grad_norm": 0.5653764009475708, "learning_rate": 6.164779874213837e-07, "loss": 1.9549205017089843, "step": 75100 }, { "epoch": 0.94, "grad_norm": 20.32880210876465, "learning_rate": 6.038993710691824e-07, "loss": 1.80646240234375, "step": 75200 }, { "epoch": 0.94125, "grad_norm": 24.76996421813965, "learning_rate": 5.913207547169812e-07, "loss": 2.3081170654296876, "step": 75300 }, { "epoch": 0.9425, "grad_norm": 26.569751739501953, "learning_rate": 5.787421383647799e-07, "loss": 2.0019149780273438, "step": 75400 }, { "epoch": 0.94375, "grad_norm": 0.3244832158088684, "learning_rate": 5.661635220125787e-07, "loss": 2.1780990600585937, "step": 75500 }, { "epoch": 0.945, "grad_norm": 0.22316910326480865, "learning_rate": 5.535849056603773e-07, "loss": 2.0430670166015625, "step": 75600 }, { "epoch": 0.94625, "grad_norm": 27.824186325073242, "learning_rate": 5.410062893081761e-07, "loss": 2.4939804077148438, "step": 75700 }, { "epoch": 0.9475, "grad_norm": 31.06242561340332, "learning_rate": 5.284276729559748e-07, "loss": 2.15638916015625, "step": 75800 }, { "epoch": 0.94875, "grad_norm": 3.928506851196289, "learning_rate": 5.158490566037736e-07, "loss": 2.05089111328125, "step": 75900 }, { "epoch": 0.95, "grad_norm": 11.446708679199219, "learning_rate": 5.032704402515723e-07, "loss": 2.08630615234375, "step": 76000 }, { "epoch": 0.95125, "grad_norm": 26.42768096923828, "learning_rate": 4.906918238993711e-07, "loss": 2.343414764404297, "step": 76100 }, { "epoch": 0.9525, "grad_norm": 28.124576568603516, "learning_rate": 4.781132075471698e-07, "loss": 2.227467498779297, "step": 76200 }, { "epoch": 0.95375, "grad_norm": 4.6327290534973145, "learning_rate": 4.655345911949686e-07, "loss": 2.305023193359375, "step": 76300 }, { "epoch": 0.955, "grad_norm": 0.009622328914701939, "learning_rate": 4.5295597484276735e-07, "loss": 1.961500701904297, "step": 76400 }, { "epoch": 0.95625, "grad_norm": 24.499088287353516, "learning_rate": 4.403773584905661e-07, "loss": 2.338931121826172, "step": 76500 }, { "epoch": 0.9575, "grad_norm": 32.421878814697266, "learning_rate": 4.2779874213836484e-07, "loss": 2.53737548828125, "step": 76600 }, { "epoch": 0.95875, "grad_norm": 0.04140196368098259, "learning_rate": 4.152201257861636e-07, "loss": 2.1324703979492186, "step": 76700 }, { "epoch": 0.96, "grad_norm": 0.056152064353227615, "learning_rate": 4.0264150943396233e-07, "loss": 2.1852513122558594, "step": 76800 }, { "epoch": 0.96125, "grad_norm": 28.970626831054688, "learning_rate": 3.9006289308176107e-07, "loss": 2.2562815856933596, "step": 76900 }, { "epoch": 0.9625, "grad_norm": 28.90065574645996, "learning_rate": 3.774842767295598e-07, "loss": 2.4266934204101562, "step": 77000 }, { "epoch": 0.96375, "grad_norm": 0.12267394363880157, "learning_rate": 3.6490566037735856e-07, "loss": 2.2760604858398437, "step": 77100 }, { "epoch": 0.965, "grad_norm": 55.57836151123047, "learning_rate": 3.523270440251573e-07, "loss": 2.051964111328125, "step": 77200 }, { "epoch": 0.96625, "grad_norm": 27.030414581298828, "learning_rate": 3.3974842767295605e-07, "loss": 2.140052490234375, "step": 77300 }, { "epoch": 0.9675, "grad_norm": 30.74704933166504, "learning_rate": 3.271698113207548e-07, "loss": 2.362886657714844, "step": 77400 }, { "epoch": 0.96875, "grad_norm": 0.04671184718608856, "learning_rate": 3.1459119496855354e-07, "loss": 2.0699423217773436, "step": 77500 }, { "epoch": 0.97, "grad_norm": 0.007876750081777573, "learning_rate": 3.0201257861635223e-07, "loss": 1.9259161376953124, "step": 77600 }, { "epoch": 0.97125, "grad_norm": 29.24554443359375, "learning_rate": 2.89433962264151e-07, "loss": 2.4339639282226564, "step": 77700 }, { "epoch": 0.9725, "grad_norm": 29.591955184936523, "learning_rate": 2.768553459119497e-07, "loss": 2.3606378173828126, "step": 77800 }, { "epoch": 0.97375, "grad_norm": 12.657386779785156, "learning_rate": 2.6427672955974846e-07, "loss": 2.428155822753906, "step": 77900 }, { "epoch": 0.975, "grad_norm": 0.024583876132965088, "learning_rate": 2.516981132075472e-07, "loss": 2.105915985107422, "step": 78000 }, { "epoch": 0.97625, "grad_norm": 21.956266403198242, "learning_rate": 2.3911949685534595e-07, "loss": 2.1736614990234373, "step": 78100 }, { "epoch": 0.9775, "grad_norm": 25.43096923828125, "learning_rate": 2.2654088050314464e-07, "loss": 2.178233337402344, "step": 78200 }, { "epoch": 0.97875, "grad_norm": 25.53718376159668, "learning_rate": 2.139622641509434e-07, "loss": 2.3168376159667967, "step": 78300 }, { "epoch": 0.98, "grad_norm": 0.009442003443837166, "learning_rate": 2.0138364779874213e-07, "loss": 2.333406677246094, "step": 78400 }, { "epoch": 0.98125, "grad_norm": 33.688865661621094, "learning_rate": 1.8880503144654088e-07, "loss": 2.119158935546875, "step": 78500 }, { "epoch": 0.9825, "grad_norm": 19.21613883972168, "learning_rate": 1.7622641509433962e-07, "loss": 2.3394792175292967, "step": 78600 }, { "epoch": 0.98375, "grad_norm": 10.80718994140625, "learning_rate": 1.6364779874213836e-07, "loss": 2.2544096374511717, "step": 78700 }, { "epoch": 0.985, "grad_norm": 0.019625332206487656, "learning_rate": 1.510691823899371e-07, "loss": 2.086898193359375, "step": 78800 }, { "epoch": 0.98625, "grad_norm": 23.536130905151367, "learning_rate": 1.3849056603773585e-07, "loss": 2.2242425537109374, "step": 78900 }, { "epoch": 0.9875, "grad_norm": 28.006027221679688, "learning_rate": 1.259119496855346e-07, "loss": 2.1889492797851564, "step": 79000 }, { "epoch": 0.98875, "grad_norm": 0.1388503462076187, "learning_rate": 1.1333333333333336e-07, "loss": 2.131824188232422, "step": 79100 }, { "epoch": 0.99, "grad_norm": 0.1504867821931839, "learning_rate": 1.0075471698113207e-07, "loss": 2.0242990112304686, "step": 79200 }, { "epoch": 0.99125, "grad_norm": 26.41069793701172, "learning_rate": 8.817610062893082e-08, "loss": 2.210938720703125, "step": 79300 }, { "epoch": 0.9925, "grad_norm": 17.642717361450195, "learning_rate": 7.559748427672956e-08, "loss": 1.8704595947265625, "step": 79400 }, { "epoch": 0.99375, "grad_norm": 0.1886824667453766, "learning_rate": 6.30188679245283e-08, "loss": 2.071743621826172, "step": 79500 }, { "epoch": 0.995, "grad_norm": 0.07498449087142944, "learning_rate": 5.0440251572327044e-08, "loss": 2.117955780029297, "step": 79600 }, { "epoch": 0.99625, "grad_norm": 22.661165237426758, "learning_rate": 3.786163522012579e-08, "loss": 2.199887542724609, "step": 79700 }, { "epoch": 0.9975, "grad_norm": 22.61822509765625, "learning_rate": 2.5283018867924533e-08, "loss": 2.135641784667969, "step": 79800 }, { "epoch": 0.99875, "grad_norm": 0.6577712297439575, "learning_rate": 1.2704402515723271e-08, "loss": 1.8882171630859375, "step": 79900 }, { "epoch": 1.0, "grad_norm": 0.028230739757418633, "learning_rate": 1.257861635220126e-10, "loss": 2.1253326416015623, "step": 80000 } ], "logging_steps": 100, "max_steps": 80000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.228147083365581e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }