{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 54.200542005420054, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02710027100271003, "grad_norm": 25.68050765991211, "learning_rate": 9e-07, "loss": 2.739, "step": 10 }, { "epoch": 0.05420054200542006, "grad_norm": 16.476688385009766, "learning_rate": 1.9e-06, "loss": 2.6161, "step": 20 }, { "epoch": 0.08130081300813008, "grad_norm": 22.8586368560791, "learning_rate": 2.9e-06, "loss": 2.4219, "step": 30 }, { "epoch": 0.10840108401084012, "grad_norm": 6.454858779907227, "learning_rate": 3.9e-06, "loss": 2.1524, "step": 40 }, { "epoch": 0.13550135501355012, "grad_norm": 8.03986930847168, "learning_rate": 4.9000000000000005e-06, "loss": 1.7294, "step": 50 }, { "epoch": 0.16260162601626016, "grad_norm": 7.49747371673584, "learning_rate": 5.9e-06, "loss": 1.3177, "step": 60 }, { "epoch": 0.1897018970189702, "grad_norm": 4.262423515319824, "learning_rate": 6.900000000000001e-06, "loss": 1.0574, "step": 70 }, { "epoch": 0.21680216802168023, "grad_norm": 2.7643113136291504, "learning_rate": 7.9e-06, "loss": 0.7514, "step": 80 }, { "epoch": 0.24390243902439024, "grad_norm": 1.5491983890533447, "learning_rate": 8.9e-06, "loss": 0.7234, "step": 90 }, { "epoch": 0.27100271002710025, "grad_norm": 0.9604158997535706, "learning_rate": 9.900000000000002e-06, "loss": 0.5774, "step": 100 }, { "epoch": 0.2981029810298103, "grad_norm": 1.1531180143356323, "learning_rate": 1.09e-05, "loss": 0.523, "step": 110 }, { "epoch": 0.3252032520325203, "grad_norm": 0.8287228345870972, "learning_rate": 1.19e-05, "loss": 0.4477, "step": 120 }, { "epoch": 0.3523035230352303, "grad_norm": 0.8339620232582092, "learning_rate": 1.29e-05, "loss": 0.4255, "step": 130 }, { "epoch": 0.3794037940379404, "grad_norm": 0.8637806177139282, "learning_rate": 1.3900000000000002e-05, "loss": 0.3747, "step": 140 }, { "epoch": 0.4065040650406504, "grad_norm": 0.8784481883049011, "learning_rate": 1.49e-05, "loss": 0.3209, "step": 150 }, { "epoch": 0.43360433604336046, "grad_norm": 1.0232354402542114, "learning_rate": 1.59e-05, "loss": 0.3179, "step": 160 }, { "epoch": 0.46070460704607047, "grad_norm": 0.7439730763435364, "learning_rate": 1.69e-05, "loss": 0.2599, "step": 170 }, { "epoch": 0.4878048780487805, "grad_norm": 1.1967895030975342, "learning_rate": 1.79e-05, "loss": 0.2481, "step": 180 }, { "epoch": 0.5149051490514905, "grad_norm": 0.7557039260864258, "learning_rate": 1.8900000000000002e-05, "loss": 0.2351, "step": 190 }, { "epoch": 0.5420054200542005, "grad_norm": 0.9632775783538818, "learning_rate": 1.9900000000000003e-05, "loss": 0.2124, "step": 200 }, { "epoch": 0.5691056910569106, "grad_norm": 0.6838817596435547, "learning_rate": 2.09e-05, "loss": 0.1774, "step": 210 }, { "epoch": 0.5962059620596206, "grad_norm": 0.5964762568473816, "learning_rate": 2.19e-05, "loss": 0.1603, "step": 220 }, { "epoch": 0.6233062330623306, "grad_norm": 0.6729223728179932, "learning_rate": 2.29e-05, "loss": 0.1472, "step": 230 }, { "epoch": 0.6504065040650406, "grad_norm": 0.7734860777854919, "learning_rate": 2.39e-05, "loss": 0.1499, "step": 240 }, { "epoch": 0.6775067750677507, "grad_norm": 0.5255666971206665, "learning_rate": 2.4900000000000002e-05, "loss": 0.1267, "step": 250 }, { "epoch": 0.7046070460704607, "grad_norm": 0.7393779158592224, "learning_rate": 2.5900000000000003e-05, "loss": 0.1287, "step": 260 }, { "epoch": 0.7317073170731707, "grad_norm": 0.6468977332115173, "learning_rate": 2.6900000000000003e-05, "loss": 0.1203, "step": 270 }, { "epoch": 0.7588075880758808, "grad_norm": 0.8121851682662964, "learning_rate": 2.7900000000000004e-05, "loss": 0.1147, "step": 280 }, { "epoch": 0.7859078590785907, "grad_norm": 0.6745753884315491, "learning_rate": 2.8899999999999998e-05, "loss": 0.097, "step": 290 }, { "epoch": 0.8130081300813008, "grad_norm": 0.6112396121025085, "learning_rate": 2.9900000000000002e-05, "loss": 0.0912, "step": 300 }, { "epoch": 0.8401084010840109, "grad_norm": 0.6905887722969055, "learning_rate": 3.09e-05, "loss": 0.0833, "step": 310 }, { "epoch": 0.8672086720867209, "grad_norm": 0.6157616972923279, "learning_rate": 3.19e-05, "loss": 0.0817, "step": 320 }, { "epoch": 0.8943089430894309, "grad_norm": 0.46547698974609375, "learning_rate": 3.29e-05, "loss": 0.0826, "step": 330 }, { "epoch": 0.9214092140921409, "grad_norm": 0.7103986740112305, "learning_rate": 3.3900000000000004e-05, "loss": 0.0751, "step": 340 }, { "epoch": 0.948509485094851, "grad_norm": 0.7030147314071655, "learning_rate": 3.49e-05, "loss": 0.0721, "step": 350 }, { "epoch": 0.975609756097561, "grad_norm": 0.5930088758468628, "learning_rate": 3.59e-05, "loss": 0.0688, "step": 360 }, { "epoch": 1.002710027100271, "grad_norm": 0.6263824105262756, "learning_rate": 3.69e-05, "loss": 0.0775, "step": 370 }, { "epoch": 1.029810298102981, "grad_norm": 0.5269455909729004, "learning_rate": 3.79e-05, "loss": 0.0664, "step": 380 }, { "epoch": 1.056910569105691, "grad_norm": 0.5635387301445007, "learning_rate": 3.8900000000000004e-05, "loss": 0.0681, "step": 390 }, { "epoch": 1.084010840108401, "grad_norm": 1.0020140409469604, "learning_rate": 3.99e-05, "loss": 0.0667, "step": 400 }, { "epoch": 1.1111111111111112, "grad_norm": 0.6214953064918518, "learning_rate": 4.09e-05, "loss": 0.0715, "step": 410 }, { "epoch": 1.1382113821138211, "grad_norm": 0.7158096432685852, "learning_rate": 4.19e-05, "loss": 0.064, "step": 420 }, { "epoch": 1.165311653116531, "grad_norm": 0.8390541076660156, "learning_rate": 4.29e-05, "loss": 0.0645, "step": 430 }, { "epoch": 1.1924119241192412, "grad_norm": 0.5527065992355347, "learning_rate": 4.39e-05, "loss": 0.0597, "step": 440 }, { "epoch": 1.2195121951219512, "grad_norm": 0.6157588362693787, "learning_rate": 4.49e-05, "loss": 0.0599, "step": 450 }, { "epoch": 1.2466124661246614, "grad_norm": 1.01018488407135, "learning_rate": 4.5900000000000004e-05, "loss": 0.0554, "step": 460 }, { "epoch": 1.2737127371273713, "grad_norm": 1.1752318143844604, "learning_rate": 4.69e-05, "loss": 0.0572, "step": 470 }, { "epoch": 1.3008130081300813, "grad_norm": 0.8633020520210266, "learning_rate": 4.79e-05, "loss": 0.0535, "step": 480 }, { "epoch": 1.3279132791327912, "grad_norm": 0.6312143206596375, "learning_rate": 4.89e-05, "loss": 0.0519, "step": 490 }, { "epoch": 1.3550135501355014, "grad_norm": 0.9312700033187866, "learning_rate": 4.99e-05, "loss": 0.0529, "step": 500 }, { "epoch": 1.3821138211382114, "grad_norm": 0.5876649618148804, "learning_rate": 5.0900000000000004e-05, "loss": 0.0477, "step": 510 }, { "epoch": 1.4092140921409215, "grad_norm": 0.6448817253112793, "learning_rate": 5.19e-05, "loss": 0.051, "step": 520 }, { "epoch": 1.4363143631436315, "grad_norm": 0.7280922532081604, "learning_rate": 5.2900000000000005e-05, "loss": 0.052, "step": 530 }, { "epoch": 1.4634146341463414, "grad_norm": 0.9772777557373047, "learning_rate": 5.390000000000001e-05, "loss": 0.0583, "step": 540 }, { "epoch": 1.4905149051490514, "grad_norm": 0.8739373087882996, "learning_rate": 5.4900000000000006e-05, "loss": 0.0486, "step": 550 }, { "epoch": 1.5176151761517616, "grad_norm": 0.6847813129425049, "learning_rate": 5.590000000000001e-05, "loss": 0.0467, "step": 560 }, { "epoch": 1.5447154471544715, "grad_norm": 0.8980885744094849, "learning_rate": 5.69e-05, "loss": 0.0455, "step": 570 }, { "epoch": 1.5718157181571817, "grad_norm": 0.7034522891044617, "learning_rate": 5.79e-05, "loss": 0.0495, "step": 580 }, { "epoch": 1.5989159891598916, "grad_norm": 0.7497943639755249, "learning_rate": 5.89e-05, "loss": 0.0479, "step": 590 }, { "epoch": 1.6260162601626016, "grad_norm": 0.48810479044914246, "learning_rate": 5.99e-05, "loss": 0.0521, "step": 600 }, { "epoch": 1.6531165311653115, "grad_norm": 0.7041803598403931, "learning_rate": 6.09e-05, "loss": 0.0486, "step": 610 }, { "epoch": 1.6802168021680217, "grad_norm": 0.7603042125701904, "learning_rate": 6.19e-05, "loss": 0.0458, "step": 620 }, { "epoch": 1.7073170731707317, "grad_norm": 0.7024151682853699, "learning_rate": 6.29e-05, "loss": 0.0511, "step": 630 }, { "epoch": 1.7344173441734418, "grad_norm": 0.7122645974159241, "learning_rate": 6.390000000000001e-05, "loss": 0.0459, "step": 640 }, { "epoch": 1.7615176151761518, "grad_norm": 0.8667749762535095, "learning_rate": 6.49e-05, "loss": 0.0453, "step": 650 }, { "epoch": 1.7886178861788617, "grad_norm": 0.8447861075401306, "learning_rate": 6.59e-05, "loss": 0.0408, "step": 660 }, { "epoch": 1.8157181571815717, "grad_norm": 0.6153303384780884, "learning_rate": 6.690000000000001e-05, "loss": 0.0418, "step": 670 }, { "epoch": 1.8428184281842819, "grad_norm": 1.0534039735794067, "learning_rate": 6.790000000000001e-05, "loss": 0.0414, "step": 680 }, { "epoch": 1.8699186991869918, "grad_norm": 0.8003451228141785, "learning_rate": 6.89e-05, "loss": 0.0407, "step": 690 }, { "epoch": 1.897018970189702, "grad_norm": 0.8086575865745544, "learning_rate": 6.99e-05, "loss": 0.0361, "step": 700 }, { "epoch": 1.924119241192412, "grad_norm": 0.6868980526924133, "learning_rate": 7.09e-05, "loss": 0.0419, "step": 710 }, { "epoch": 1.951219512195122, "grad_norm": 0.7608501315116882, "learning_rate": 7.19e-05, "loss": 0.044, "step": 720 }, { "epoch": 1.9783197831978319, "grad_norm": 0.7567887306213379, "learning_rate": 7.29e-05, "loss": 0.0409, "step": 730 }, { "epoch": 2.005420054200542, "grad_norm": 0.6403924226760864, "learning_rate": 7.390000000000001e-05, "loss": 0.0412, "step": 740 }, { "epoch": 2.032520325203252, "grad_norm": 0.6750456690788269, "learning_rate": 7.49e-05, "loss": 0.0392, "step": 750 }, { "epoch": 2.059620596205962, "grad_norm": 0.7049127817153931, "learning_rate": 7.59e-05, "loss": 0.0404, "step": 760 }, { "epoch": 2.086720867208672, "grad_norm": 0.8951629400253296, "learning_rate": 7.69e-05, "loss": 0.0437, "step": 770 }, { "epoch": 2.113821138211382, "grad_norm": 0.8579763770103455, "learning_rate": 7.790000000000001e-05, "loss": 0.0365, "step": 780 }, { "epoch": 2.140921409214092, "grad_norm": 0.7486189007759094, "learning_rate": 7.890000000000001e-05, "loss": 0.036, "step": 790 }, { "epoch": 2.168021680216802, "grad_norm": 0.7760418057441711, "learning_rate": 7.99e-05, "loss": 0.0406, "step": 800 }, { "epoch": 2.1951219512195124, "grad_norm": 0.6048120856285095, "learning_rate": 8.090000000000001e-05, "loss": 0.0342, "step": 810 }, { "epoch": 2.2222222222222223, "grad_norm": 0.7512058019638062, "learning_rate": 8.19e-05, "loss": 0.0394, "step": 820 }, { "epoch": 2.2493224932249323, "grad_norm": 0.7544538378715515, "learning_rate": 8.29e-05, "loss": 0.0349, "step": 830 }, { "epoch": 2.2764227642276422, "grad_norm": 0.6632108688354492, "learning_rate": 8.39e-05, "loss": 0.0393, "step": 840 }, { "epoch": 2.303523035230352, "grad_norm": 0.6719874143600464, "learning_rate": 8.49e-05, "loss": 0.0398, "step": 850 }, { "epoch": 2.330623306233062, "grad_norm": 0.6815747022628784, "learning_rate": 8.59e-05, "loss": 0.0328, "step": 860 }, { "epoch": 2.3577235772357725, "grad_norm": 0.5879821181297302, "learning_rate": 8.69e-05, "loss": 0.0341, "step": 870 }, { "epoch": 2.3848238482384825, "grad_norm": 0.7946475148200989, "learning_rate": 8.790000000000001e-05, "loss": 0.0381, "step": 880 }, { "epoch": 2.4119241192411924, "grad_norm": 0.7261311411857605, "learning_rate": 8.89e-05, "loss": 0.0339, "step": 890 }, { "epoch": 2.4390243902439024, "grad_norm": 0.7886880040168762, "learning_rate": 8.99e-05, "loss": 0.0361, "step": 900 }, { "epoch": 2.4661246612466123, "grad_norm": 0.9318199157714844, "learning_rate": 9.090000000000001e-05, "loss": 0.0345, "step": 910 }, { "epoch": 2.4932249322493227, "grad_norm": 0.5139050483703613, "learning_rate": 9.190000000000001e-05, "loss": 0.0342, "step": 920 }, { "epoch": 2.5203252032520327, "grad_norm": 0.6596904397010803, "learning_rate": 9.290000000000001e-05, "loss": 0.0343, "step": 930 }, { "epoch": 2.5474254742547426, "grad_norm": 0.5028601884841919, "learning_rate": 9.39e-05, "loss": 0.0366, "step": 940 }, { "epoch": 2.5745257452574526, "grad_norm": 0.6389950513839722, "learning_rate": 9.49e-05, "loss": 0.0387, "step": 950 }, { "epoch": 2.6016260162601625, "grad_norm": 0.5984148979187012, "learning_rate": 9.59e-05, "loss": 0.0376, "step": 960 }, { "epoch": 2.6287262872628725, "grad_norm": 0.731736958026886, "learning_rate": 9.69e-05, "loss": 0.0272, "step": 970 }, { "epoch": 2.6558265582655824, "grad_norm": 0.5914941430091858, "learning_rate": 9.790000000000001e-05, "loss": 0.0325, "step": 980 }, { "epoch": 2.682926829268293, "grad_norm": 0.49021193385124207, "learning_rate": 9.89e-05, "loss": 0.0372, "step": 990 }, { "epoch": 2.710027100271003, "grad_norm": 0.7593311071395874, "learning_rate": 9.99e-05, "loss": 0.0319, "step": 1000 }, { "epoch": 2.7371273712737128, "grad_norm": 0.5248816013336182, "learning_rate": 9.999994463727085e-05, "loss": 0.0403, "step": 1010 }, { "epoch": 2.7642276422764227, "grad_norm": 0.7394348382949829, "learning_rate": 9.999975326009292e-05, "loss": 0.0328, "step": 1020 }, { "epoch": 2.7913279132791327, "grad_norm": 0.685234546661377, "learning_rate": 9.999942518549879e-05, "loss": 0.0328, "step": 1030 }, { "epoch": 2.818428184281843, "grad_norm": 0.6012423634529114, "learning_rate": 9.999896041438544e-05, "loss": 0.0319, "step": 1040 }, { "epoch": 2.845528455284553, "grad_norm": 0.6731837391853333, "learning_rate": 9.999835894802353e-05, "loss": 0.0326, "step": 1050 }, { "epoch": 2.872628726287263, "grad_norm": 0.4991224706172943, "learning_rate": 9.999762078805743e-05, "loss": 0.033, "step": 1060 }, { "epoch": 2.899728997289973, "grad_norm": 0.5958859920501709, "learning_rate": 9.999674593650526e-05, "loss": 0.033, "step": 1070 }, { "epoch": 2.926829268292683, "grad_norm": 0.578107476234436, "learning_rate": 9.99957343957588e-05, "loss": 0.0308, "step": 1080 }, { "epoch": 2.953929539295393, "grad_norm": 0.5320321321487427, "learning_rate": 9.99945861685836e-05, "loss": 0.0312, "step": 1090 }, { "epoch": 2.9810298102981028, "grad_norm": 0.5564437508583069, "learning_rate": 9.999330125811884e-05, "loss": 0.031, "step": 1100 }, { "epoch": 3.008130081300813, "grad_norm": 0.6094812750816345, "learning_rate": 9.999187966787744e-05, "loss": 0.0301, "step": 1110 }, { "epoch": 3.035230352303523, "grad_norm": 0.3020085096359253, "learning_rate": 9.999032140174595e-05, "loss": 0.0324, "step": 1120 }, { "epoch": 3.062330623306233, "grad_norm": 0.5955097079277039, "learning_rate": 9.998862646398464e-05, "loss": 0.032, "step": 1130 }, { "epoch": 3.089430894308943, "grad_norm": 0.6850976943969727, "learning_rate": 9.998679485922739e-05, "loss": 0.038, "step": 1140 }, { "epoch": 3.116531165311653, "grad_norm": 0.8790730237960815, "learning_rate": 9.998482659248174e-05, "loss": 0.0318, "step": 1150 }, { "epoch": 3.1436314363143634, "grad_norm": 0.6181038618087769, "learning_rate": 9.998272166912883e-05, "loss": 0.0324, "step": 1160 }, { "epoch": 3.1707317073170733, "grad_norm": 0.7787555456161499, "learning_rate": 9.998048009492347e-05, "loss": 0.0329, "step": 1170 }, { "epoch": 3.1978319783197833, "grad_norm": 0.46410030126571655, "learning_rate": 9.997810187599403e-05, "loss": 0.0329, "step": 1180 }, { "epoch": 3.2249322493224932, "grad_norm": 0.48053669929504395, "learning_rate": 9.997558701884249e-05, "loss": 0.0309, "step": 1190 }, { "epoch": 3.252032520325203, "grad_norm": 0.5332421064376831, "learning_rate": 9.997293553034433e-05, "loss": 0.0282, "step": 1200 }, { "epoch": 3.279132791327913, "grad_norm": 0.5572290420532227, "learning_rate": 9.997014741774866e-05, "loss": 0.0272, "step": 1210 }, { "epoch": 3.306233062330623, "grad_norm": 0.5009008049964905, "learning_rate": 9.996722268867803e-05, "loss": 0.0248, "step": 1220 }, { "epoch": 3.3333333333333335, "grad_norm": 0.3966503143310547, "learning_rate": 9.996416135112858e-05, "loss": 0.0282, "step": 1230 }, { "epoch": 3.3604336043360434, "grad_norm": 0.7144346833229065, "learning_rate": 9.996096341346988e-05, "loss": 0.0279, "step": 1240 }, { "epoch": 3.3875338753387534, "grad_norm": 0.4181624948978424, "learning_rate": 9.995762888444495e-05, "loss": 0.0261, "step": 1250 }, { "epoch": 3.4146341463414633, "grad_norm": 0.5477401614189148, "learning_rate": 9.995415777317027e-05, "loss": 0.0294, "step": 1260 }, { "epoch": 3.4417344173441733, "grad_norm": 0.926315188407898, "learning_rate": 9.995055008913574e-05, "loss": 0.0292, "step": 1270 }, { "epoch": 3.4688346883468837, "grad_norm": 0.7191458940505981, "learning_rate": 9.994680584220463e-05, "loss": 0.0274, "step": 1280 }, { "epoch": 3.4959349593495936, "grad_norm": 0.493465393781662, "learning_rate": 9.994292504261355e-05, "loss": 0.0297, "step": 1290 }, { "epoch": 3.5230352303523036, "grad_norm": 0.8189237117767334, "learning_rate": 9.993890770097247e-05, "loss": 0.0271, "step": 1300 }, { "epoch": 3.5501355013550135, "grad_norm": 0.5424594283103943, "learning_rate": 9.993475382826467e-05, "loss": 0.0316, "step": 1310 }, { "epoch": 3.5772357723577235, "grad_norm": 0.46002599596977234, "learning_rate": 9.993046343584664e-05, "loss": 0.0315, "step": 1320 }, { "epoch": 3.6043360433604335, "grad_norm": 0.6831870675086975, "learning_rate": 9.992603653544816e-05, "loss": 0.0299, "step": 1330 }, { "epoch": 3.6314363143631434, "grad_norm": 0.5977823734283447, "learning_rate": 9.992147313917222e-05, "loss": 0.0313, "step": 1340 }, { "epoch": 3.658536585365854, "grad_norm": 0.4205773174762726, "learning_rate": 9.991677325949497e-05, "loss": 0.0285, "step": 1350 }, { "epoch": 3.6856368563685638, "grad_norm": 0.7994751334190369, "learning_rate": 9.991193690926568e-05, "loss": 0.0296, "step": 1360 }, { "epoch": 3.7127371273712737, "grad_norm": 0.6613404154777527, "learning_rate": 9.990696410170678e-05, "loss": 0.0342, "step": 1370 }, { "epoch": 3.7398373983739837, "grad_norm": 0.7653180360794067, "learning_rate": 9.990185485041371e-05, "loss": 0.0296, "step": 1380 }, { "epoch": 3.7669376693766936, "grad_norm": 0.6346215605735779, "learning_rate": 9.989660916935498e-05, "loss": 0.0276, "step": 1390 }, { "epoch": 3.794037940379404, "grad_norm": 0.44078266620635986, "learning_rate": 9.989122707287208e-05, "loss": 0.0276, "step": 1400 }, { "epoch": 3.821138211382114, "grad_norm": 0.4286399781703949, "learning_rate": 9.988570857567945e-05, "loss": 0.0279, "step": 1410 }, { "epoch": 3.848238482384824, "grad_norm": 0.6921868920326233, "learning_rate": 9.988005369286446e-05, "loss": 0.0268, "step": 1420 }, { "epoch": 3.875338753387534, "grad_norm": 0.343708336353302, "learning_rate": 9.987426243988734e-05, "loss": 0.0244, "step": 1430 }, { "epoch": 3.902439024390244, "grad_norm": 0.5138548612594604, "learning_rate": 9.986833483258114e-05, "loss": 0.0225, "step": 1440 }, { "epoch": 3.9295392953929538, "grad_norm": 0.41098707914352417, "learning_rate": 9.986227088715173e-05, "loss": 0.0231, "step": 1450 }, { "epoch": 3.9566395663956637, "grad_norm": 0.5508394837379456, "learning_rate": 9.98560706201777e-05, "loss": 0.0272, "step": 1460 }, { "epoch": 3.983739837398374, "grad_norm": 0.5881540775299072, "learning_rate": 9.984973404861036e-05, "loss": 0.025, "step": 1470 }, { "epoch": 4.010840108401084, "grad_norm": 0.4218249022960663, "learning_rate": 9.984326118977361e-05, "loss": 0.0243, "step": 1480 }, { "epoch": 4.0379403794037945, "grad_norm": 0.7339116334915161, "learning_rate": 9.983665206136406e-05, "loss": 0.0253, "step": 1490 }, { "epoch": 4.065040650406504, "grad_norm": 0.457883358001709, "learning_rate": 9.982990668145075e-05, "loss": 0.0283, "step": 1500 }, { "epoch": 4.092140921409214, "grad_norm": 0.5153295993804932, "learning_rate": 9.982302506847534e-05, "loss": 0.0244, "step": 1510 }, { "epoch": 4.119241192411924, "grad_norm": 0.582353413105011, "learning_rate": 9.981600724125189e-05, "loss": 0.026, "step": 1520 }, { "epoch": 4.146341463414634, "grad_norm": 0.641364574432373, "learning_rate": 9.980885321896685e-05, "loss": 0.0277, "step": 1530 }, { "epoch": 4.173441734417344, "grad_norm": 0.6253752708435059, "learning_rate": 9.980156302117905e-05, "loss": 0.0278, "step": 1540 }, { "epoch": 4.200542005420054, "grad_norm": 0.47998499870300293, "learning_rate": 9.979413666781963e-05, "loss": 0.0247, "step": 1550 }, { "epoch": 4.227642276422764, "grad_norm": 0.6375162601470947, "learning_rate": 9.978657417919193e-05, "loss": 0.0287, "step": 1560 }, { "epoch": 4.254742547425474, "grad_norm": 0.5357074737548828, "learning_rate": 9.977887557597153e-05, "loss": 0.0257, "step": 1570 }, { "epoch": 4.281842818428184, "grad_norm": 0.44970202445983887, "learning_rate": 9.97710408792061e-05, "loss": 0.028, "step": 1580 }, { "epoch": 4.308943089430894, "grad_norm": 0.6543464064598083, "learning_rate": 9.976307011031542e-05, "loss": 0.0283, "step": 1590 }, { "epoch": 4.336043360433604, "grad_norm": 0.4034106731414795, "learning_rate": 9.975496329109126e-05, "loss": 0.0259, "step": 1600 }, { "epoch": 4.363143631436314, "grad_norm": 0.7689071893692017, "learning_rate": 9.974672044369732e-05, "loss": 0.0249, "step": 1610 }, { "epoch": 4.390243902439025, "grad_norm": 0.4343847632408142, "learning_rate": 9.97383415906693e-05, "loss": 0.0213, "step": 1620 }, { "epoch": 4.417344173441735, "grad_norm": 0.6475924253463745, "learning_rate": 9.97298267549146e-05, "loss": 0.0242, "step": 1630 }, { "epoch": 4.444444444444445, "grad_norm": 0.5033401846885681, "learning_rate": 9.972117595971249e-05, "loss": 0.0253, "step": 1640 }, { "epoch": 4.471544715447155, "grad_norm": 0.6843452453613281, "learning_rate": 9.971238922871391e-05, "loss": 0.027, "step": 1650 }, { "epoch": 4.4986449864498645, "grad_norm": 0.5454314947128296, "learning_rate": 9.970346658594142e-05, "loss": 0.0243, "step": 1660 }, { "epoch": 4.5257452574525745, "grad_norm": 0.5678504705429077, "learning_rate": 9.969440805578923e-05, "loss": 0.0259, "step": 1670 }, { "epoch": 4.5528455284552845, "grad_norm": 0.48004239797592163, "learning_rate": 9.968521366302298e-05, "loss": 0.0216, "step": 1680 }, { "epoch": 4.579945799457994, "grad_norm": 0.6484246850013733, "learning_rate": 9.967588343277981e-05, "loss": 0.0289, "step": 1690 }, { "epoch": 4.607046070460704, "grad_norm": 0.9131383895874023, "learning_rate": 9.966641739056818e-05, "loss": 0.0258, "step": 1700 }, { "epoch": 4.634146341463414, "grad_norm": 0.47160568833351135, "learning_rate": 9.965681556226793e-05, "loss": 0.024, "step": 1710 }, { "epoch": 4.661246612466124, "grad_norm": 0.6287451982498169, "learning_rate": 9.964707797413006e-05, "loss": 0.0268, "step": 1720 }, { "epoch": 4.688346883468835, "grad_norm": 0.7461898922920227, "learning_rate": 9.963720465277679e-05, "loss": 0.0251, "step": 1730 }, { "epoch": 4.715447154471545, "grad_norm": 0.48007503151893616, "learning_rate": 9.96271956252014e-05, "loss": 0.0213, "step": 1740 }, { "epoch": 4.742547425474255, "grad_norm": 0.5224120616912842, "learning_rate": 9.961705091876816e-05, "loss": 0.026, "step": 1750 }, { "epoch": 4.769647696476965, "grad_norm": 0.5104272961616516, "learning_rate": 9.960677056121235e-05, "loss": 0.0297, "step": 1760 }, { "epoch": 4.796747967479675, "grad_norm": 0.49895179271698, "learning_rate": 9.959635458064005e-05, "loss": 0.0237, "step": 1770 }, { "epoch": 4.823848238482385, "grad_norm": 0.6408563852310181, "learning_rate": 9.958580300552815e-05, "loss": 0.0216, "step": 1780 }, { "epoch": 4.850948509485095, "grad_norm": 0.4692358672618866, "learning_rate": 9.957511586472426e-05, "loss": 0.0221, "step": 1790 }, { "epoch": 4.878048780487805, "grad_norm": 0.805867612361908, "learning_rate": 9.956429318744662e-05, "loss": 0.0256, "step": 1800 }, { "epoch": 4.905149051490515, "grad_norm": 0.452920138835907, "learning_rate": 9.955333500328404e-05, "loss": 0.0262, "step": 1810 }, { "epoch": 4.932249322493225, "grad_norm": 0.4028685688972473, "learning_rate": 9.95422413421957e-05, "loss": 0.0246, "step": 1820 }, { "epoch": 4.959349593495935, "grad_norm": 0.5676128268241882, "learning_rate": 9.953101223451133e-05, "loss": 0.0212, "step": 1830 }, { "epoch": 4.9864498644986455, "grad_norm": 0.3310299217700958, "learning_rate": 9.951964771093085e-05, "loss": 0.0223, "step": 1840 }, { "epoch": 5.013550135501355, "grad_norm": 0.4734501540660858, "learning_rate": 9.950814780252442e-05, "loss": 0.0212, "step": 1850 }, { "epoch": 5.040650406504065, "grad_norm": 0.500758707523346, "learning_rate": 9.949651254073236e-05, "loss": 0.0207, "step": 1860 }, { "epoch": 5.067750677506775, "grad_norm": 0.36089006066322327, "learning_rate": 9.948474195736504e-05, "loss": 0.0201, "step": 1870 }, { "epoch": 5.094850948509485, "grad_norm": 0.5360018610954285, "learning_rate": 9.947283608460277e-05, "loss": 0.021, "step": 1880 }, { "epoch": 5.121951219512195, "grad_norm": 0.4199448525905609, "learning_rate": 9.946079495499577e-05, "loss": 0.0231, "step": 1890 }, { "epoch": 5.149051490514905, "grad_norm": 0.6473070383071899, "learning_rate": 9.944861860146401e-05, "loss": 0.0225, "step": 1900 }, { "epoch": 5.176151761517615, "grad_norm": 0.3879552483558655, "learning_rate": 9.943630705729719e-05, "loss": 0.028, "step": 1910 }, { "epoch": 5.203252032520325, "grad_norm": 0.34856536984443665, "learning_rate": 9.942386035615459e-05, "loss": 0.0228, "step": 1920 }, { "epoch": 5.230352303523035, "grad_norm": 0.4734755754470825, "learning_rate": 9.941127853206503e-05, "loss": 0.0202, "step": 1930 }, { "epoch": 5.257452574525745, "grad_norm": 0.25228434801101685, "learning_rate": 9.939856161942673e-05, "loss": 0.023, "step": 1940 }, { "epoch": 5.284552845528455, "grad_norm": 0.43759065866470337, "learning_rate": 9.938570965300724e-05, "loss": 0.0211, "step": 1950 }, { "epoch": 5.311653116531165, "grad_norm": 0.40182581543922424, "learning_rate": 9.937272266794335e-05, "loss": 0.0211, "step": 1960 }, { "epoch": 5.338753387533876, "grad_norm": 0.2800775170326233, "learning_rate": 9.935960069974096e-05, "loss": 0.0181, "step": 1970 }, { "epoch": 5.365853658536586, "grad_norm": 0.4255676567554474, "learning_rate": 9.934634378427506e-05, "loss": 0.0215, "step": 1980 }, { "epoch": 5.392953929539296, "grad_norm": 0.399319052696228, "learning_rate": 9.933295195778954e-05, "loss": 0.0211, "step": 1990 }, { "epoch": 5.420054200542006, "grad_norm": 0.5038173794746399, "learning_rate": 9.931942525689715e-05, "loss": 0.0226, "step": 2000 }, { "epoch": 5.4471544715447155, "grad_norm": 0.4495754837989807, "learning_rate": 9.930576371857936e-05, "loss": 0.0215, "step": 2010 }, { "epoch": 5.4742547425474255, "grad_norm": 0.46714332699775696, "learning_rate": 9.929196738018629e-05, "loss": 0.0219, "step": 2020 }, { "epoch": 5.5013550135501355, "grad_norm": 0.3134072422981262, "learning_rate": 9.927803627943662e-05, "loss": 0.022, "step": 2030 }, { "epoch": 5.528455284552845, "grad_norm": 0.7098971009254456, "learning_rate": 9.926397045441744e-05, "loss": 0.0215, "step": 2040 }, { "epoch": 5.555555555555555, "grad_norm": 0.4939674139022827, "learning_rate": 9.924976994358417e-05, "loss": 0.0231, "step": 2050 }, { "epoch": 5.582655826558265, "grad_norm": 0.47558271884918213, "learning_rate": 9.923543478576048e-05, "loss": 0.0201, "step": 2060 }, { "epoch": 5.609756097560975, "grad_norm": 0.49327653646469116, "learning_rate": 9.922096502013813e-05, "loss": 0.0264, "step": 2070 }, { "epoch": 5.636856368563686, "grad_norm": 0.37078234553337097, "learning_rate": 9.92063606862769e-05, "loss": 0.0221, "step": 2080 }, { "epoch": 5.663956639566395, "grad_norm": 0.5918521881103516, "learning_rate": 9.919162182410453e-05, "loss": 0.0277, "step": 2090 }, { "epoch": 5.691056910569106, "grad_norm": 0.5480566024780273, "learning_rate": 9.917674847391645e-05, "loss": 0.0196, "step": 2100 }, { "epoch": 5.718157181571816, "grad_norm": 0.5047476291656494, "learning_rate": 9.916174067637584e-05, "loss": 0.0223, "step": 2110 }, { "epoch": 5.745257452574526, "grad_norm": 0.7184700965881348, "learning_rate": 9.914659847251348e-05, "loss": 0.0215, "step": 2120 }, { "epoch": 5.772357723577236, "grad_norm": 0.41346004605293274, "learning_rate": 9.913132190372753e-05, "loss": 0.0248, "step": 2130 }, { "epoch": 5.799457994579946, "grad_norm": 0.5142463445663452, "learning_rate": 9.911591101178359e-05, "loss": 0.021, "step": 2140 }, { "epoch": 5.826558265582656, "grad_norm": 0.453156441450119, "learning_rate": 9.910036583881443e-05, "loss": 0.0223, "step": 2150 }, { "epoch": 5.853658536585366, "grad_norm": 0.42456310987472534, "learning_rate": 9.908468642731995e-05, "loss": 0.0182, "step": 2160 }, { "epoch": 5.880758807588076, "grad_norm": 0.4065864384174347, "learning_rate": 9.906887282016707e-05, "loss": 0.0215, "step": 2170 }, { "epoch": 5.907859078590786, "grad_norm": 0.44205188751220703, "learning_rate": 9.90529250605896e-05, "loss": 0.02, "step": 2180 }, { "epoch": 5.934959349593496, "grad_norm": 0.48631078004837036, "learning_rate": 9.903684319218809e-05, "loss": 0.0234, "step": 2190 }, { "epoch": 5.9620596205962055, "grad_norm": 0.4705348312854767, "learning_rate": 9.902062725892976e-05, "loss": 0.019, "step": 2200 }, { "epoch": 5.989159891598916, "grad_norm": 0.3450539708137512, "learning_rate": 9.900427730514834e-05, "loss": 0.0201, "step": 2210 }, { "epoch": 6.016260162601626, "grad_norm": 0.4834741950035095, "learning_rate": 9.8987793375544e-05, "loss": 0.0205, "step": 2220 }, { "epoch": 6.043360433604336, "grad_norm": 0.4779105484485626, "learning_rate": 9.897117551518318e-05, "loss": 0.0213, "step": 2230 }, { "epoch": 6.070460704607046, "grad_norm": 0.5280022621154785, "learning_rate": 9.895442376949844e-05, "loss": 0.0194, "step": 2240 }, { "epoch": 6.097560975609756, "grad_norm": 0.4040134847164154, "learning_rate": 9.893753818428845e-05, "loss": 0.0183, "step": 2250 }, { "epoch": 6.124661246612466, "grad_norm": 0.4311538338661194, "learning_rate": 9.892051880571773e-05, "loss": 0.0217, "step": 2260 }, { "epoch": 6.151761517615176, "grad_norm": 0.6705392599105835, "learning_rate": 9.890336568031663e-05, "loss": 0.0255, "step": 2270 }, { "epoch": 6.178861788617886, "grad_norm": 0.4386080801486969, "learning_rate": 9.888607885498113e-05, "loss": 0.0202, "step": 2280 }, { "epoch": 6.205962059620596, "grad_norm": 0.3043297529220581, "learning_rate": 9.886865837697275e-05, "loss": 0.0223, "step": 2290 }, { "epoch": 6.233062330623306, "grad_norm": 0.5876930952072144, "learning_rate": 9.88511042939184e-05, "loss": 0.0239, "step": 2300 }, { "epoch": 6.260162601626016, "grad_norm": 0.49726149439811707, "learning_rate": 9.883341665381028e-05, "loss": 0.0195, "step": 2310 }, { "epoch": 6.287262872628727, "grad_norm": 0.4605817198753357, "learning_rate": 9.881559550500575e-05, "loss": 0.0191, "step": 2320 }, { "epoch": 6.314363143631437, "grad_norm": 0.4122980833053589, "learning_rate": 9.879764089622712e-05, "loss": 0.0191, "step": 2330 }, { "epoch": 6.341463414634147, "grad_norm": 0.383861243724823, "learning_rate": 9.87795528765616e-05, "loss": 0.0191, "step": 2340 }, { "epoch": 6.368563685636857, "grad_norm": 0.46537500619888306, "learning_rate": 9.876133149546118e-05, "loss": 0.0166, "step": 2350 }, { "epoch": 6.3956639566395665, "grad_norm": 0.34608832001686096, "learning_rate": 9.874297680274238e-05, "loss": 0.0289, "step": 2360 }, { "epoch": 6.4227642276422765, "grad_norm": 0.42512011528015137, "learning_rate": 9.872448884858624e-05, "loss": 0.0218, "step": 2370 }, { "epoch": 6.4498644986449865, "grad_norm": 0.4939259886741638, "learning_rate": 9.870586768353815e-05, "loss": 0.0231, "step": 2380 }, { "epoch": 6.476964769647696, "grad_norm": 0.4002433121204376, "learning_rate": 9.868711335850764e-05, "loss": 0.02, "step": 2390 }, { "epoch": 6.504065040650406, "grad_norm": 0.3523513972759247, "learning_rate": 9.866822592476833e-05, "loss": 0.0217, "step": 2400 }, { "epoch": 6.531165311653116, "grad_norm": 0.47775450348854065, "learning_rate": 9.86492054339577e-05, "loss": 0.0246, "step": 2410 }, { "epoch": 6.558265582655826, "grad_norm": 0.43510180711746216, "learning_rate": 9.863005193807711e-05, "loss": 0.0209, "step": 2420 }, { "epoch": 6.585365853658536, "grad_norm": 0.5826720595359802, "learning_rate": 9.861076548949143e-05, "loss": 0.0226, "step": 2430 }, { "epoch": 6.612466124661246, "grad_norm": 0.6292181611061096, "learning_rate": 9.859134614092912e-05, "loss": 0.0207, "step": 2440 }, { "epoch": 6.639566395663957, "grad_norm": 0.4380645751953125, "learning_rate": 9.857179394548191e-05, "loss": 0.0188, "step": 2450 }, { "epoch": 6.666666666666667, "grad_norm": 0.7023690342903137, "learning_rate": 9.855210895660477e-05, "loss": 0.0209, "step": 2460 }, { "epoch": 6.693766937669377, "grad_norm": 0.45018765330314636, "learning_rate": 9.853229122811568e-05, "loss": 0.0213, "step": 2470 }, { "epoch": 6.720867208672087, "grad_norm": 0.5172533392906189, "learning_rate": 9.851234081419559e-05, "loss": 0.0196, "step": 2480 }, { "epoch": 6.747967479674797, "grad_norm": 0.29907092452049255, "learning_rate": 9.849225776938814e-05, "loss": 0.0194, "step": 2490 }, { "epoch": 6.775067750677507, "grad_norm": 0.3988078832626343, "learning_rate": 9.847204214859964e-05, "loss": 0.0208, "step": 2500 }, { "epoch": 6.802168021680217, "grad_norm": 0.6814501285552979, "learning_rate": 9.845169400709879e-05, "loss": 0.0189, "step": 2510 }, { "epoch": 6.829268292682927, "grad_norm": 0.2823296785354614, "learning_rate": 9.843121340051664e-05, "loss": 0.0173, "step": 2520 }, { "epoch": 6.856368563685637, "grad_norm": 0.6137546300888062, "learning_rate": 9.841060038484641e-05, "loss": 0.0215, "step": 2530 }, { "epoch": 6.883468834688347, "grad_norm": 0.2892005443572998, "learning_rate": 9.838985501644328e-05, "loss": 0.0199, "step": 2540 }, { "epoch": 6.9105691056910565, "grad_norm": 0.6212902069091797, "learning_rate": 9.83689773520243e-05, "loss": 0.0185, "step": 2550 }, { "epoch": 6.937669376693767, "grad_norm": 0.4318417012691498, "learning_rate": 9.834796744866819e-05, "loss": 0.0174, "step": 2560 }, { "epoch": 6.964769647696477, "grad_norm": 0.39863017201423645, "learning_rate": 9.832682536381525e-05, "loss": 0.0179, "step": 2570 }, { "epoch": 6.991869918699187, "grad_norm": 0.3057374656200409, "learning_rate": 9.830555115526711e-05, "loss": 0.0215, "step": 2580 }, { "epoch": 7.018970189701897, "grad_norm": 0.36670976877212524, "learning_rate": 9.828414488118667e-05, "loss": 0.0236, "step": 2590 }, { "epoch": 7.046070460704607, "grad_norm": 0.34429365396499634, "learning_rate": 9.826260660009785e-05, "loss": 0.0177, "step": 2600 }, { "epoch": 7.073170731707317, "grad_norm": 0.4960726499557495, "learning_rate": 9.824093637088547e-05, "loss": 0.0169, "step": 2610 }, { "epoch": 7.100271002710027, "grad_norm": 0.5577384829521179, "learning_rate": 9.821913425279514e-05, "loss": 0.0189, "step": 2620 }, { "epoch": 7.127371273712737, "grad_norm": 0.3764692544937134, "learning_rate": 9.8197200305433e-05, "loss": 0.0214, "step": 2630 }, { "epoch": 7.154471544715447, "grad_norm": 0.35031089186668396, "learning_rate": 9.817513458876564e-05, "loss": 0.021, "step": 2640 }, { "epoch": 7.181571815718157, "grad_norm": 0.45250627398490906, "learning_rate": 9.815293716311987e-05, "loss": 0.0187, "step": 2650 }, { "epoch": 7.208672086720867, "grad_norm": 0.3953530192375183, "learning_rate": 9.813060808918262e-05, "loss": 0.0189, "step": 2660 }, { "epoch": 7.235772357723577, "grad_norm": 0.557940661907196, "learning_rate": 9.810814742800069e-05, "loss": 0.0209, "step": 2670 }, { "epoch": 7.262872628726287, "grad_norm": 0.4466969966888428, "learning_rate": 9.808555524098074e-05, "loss": 0.0189, "step": 2680 }, { "epoch": 7.289972899728998, "grad_norm": 0.45605015754699707, "learning_rate": 9.806283158988887e-05, "loss": 0.0218, "step": 2690 }, { "epoch": 7.317073170731708, "grad_norm": 0.4991740584373474, "learning_rate": 9.803997653685072e-05, "loss": 0.0164, "step": 2700 }, { "epoch": 7.3441734417344176, "grad_norm": 0.4535083472728729, "learning_rate": 9.801699014435112e-05, "loss": 0.0196, "step": 2710 }, { "epoch": 7.3712737127371275, "grad_norm": 0.44736096262931824, "learning_rate": 9.799387247523398e-05, "loss": 0.0201, "step": 2720 }, { "epoch": 7.3983739837398375, "grad_norm": 0.8168508410453796, "learning_rate": 9.797062359270215e-05, "loss": 0.0221, "step": 2730 }, { "epoch": 7.425474254742547, "grad_norm": 0.43144160509109497, "learning_rate": 9.794724356031715e-05, "loss": 0.0177, "step": 2740 }, { "epoch": 7.452574525745257, "grad_norm": 0.4047624468803406, "learning_rate": 9.792373244199913e-05, "loss": 0.0191, "step": 2750 }, { "epoch": 7.479674796747967, "grad_norm": 0.6585750579833984, "learning_rate": 9.790009030202658e-05, "loss": 0.0206, "step": 2760 }, { "epoch": 7.506775067750677, "grad_norm": 0.3563913106918335, "learning_rate": 9.78763172050362e-05, "loss": 0.0225, "step": 2770 }, { "epoch": 7.533875338753387, "grad_norm": 0.5692076683044434, "learning_rate": 9.785241321602274e-05, "loss": 0.0209, "step": 2780 }, { "epoch": 7.560975609756097, "grad_norm": 0.4811396598815918, "learning_rate": 9.782837840033879e-05, "loss": 0.0201, "step": 2790 }, { "epoch": 7.588075880758808, "grad_norm": 0.5702165961265564, "learning_rate": 9.780421282369461e-05, "loss": 0.0192, "step": 2800 }, { "epoch": 7.615176151761518, "grad_norm": 0.4224900007247925, "learning_rate": 9.777991655215797e-05, "loss": 0.0179, "step": 2810 }, { "epoch": 7.642276422764228, "grad_norm": 0.3861061632633209, "learning_rate": 9.775548965215394e-05, "loss": 0.0208, "step": 2820 }, { "epoch": 7.669376693766938, "grad_norm": 0.545329749584198, "learning_rate": 9.773093219046474e-05, "loss": 0.0182, "step": 2830 }, { "epoch": 7.696476964769648, "grad_norm": 0.5261024236679077, "learning_rate": 9.770624423422954e-05, "loss": 0.0201, "step": 2840 }, { "epoch": 7.723577235772358, "grad_norm": 0.41190841794013977, "learning_rate": 9.768142585094426e-05, "loss": 0.0206, "step": 2850 }, { "epoch": 7.750677506775068, "grad_norm": 0.5202215313911438, "learning_rate": 9.765647710846142e-05, "loss": 0.017, "step": 2860 }, { "epoch": 7.777777777777778, "grad_norm": 0.38157495856285095, "learning_rate": 9.763139807498991e-05, "loss": 0.0177, "step": 2870 }, { "epoch": 7.804878048780488, "grad_norm": 0.33343902230262756, "learning_rate": 9.760618881909487e-05, "loss": 0.0149, "step": 2880 }, { "epoch": 7.831978319783198, "grad_norm": 0.3842809498310089, "learning_rate": 9.758084940969744e-05, "loss": 0.0155, "step": 2890 }, { "epoch": 7.8590785907859075, "grad_norm": 0.36252161860466003, "learning_rate": 9.755537991607459e-05, "loss": 0.0198, "step": 2900 }, { "epoch": 7.886178861788618, "grad_norm": 0.4581364095211029, "learning_rate": 9.752978040785895e-05, "loss": 0.0196, "step": 2910 }, { "epoch": 7.913279132791327, "grad_norm": 0.32870733737945557, "learning_rate": 9.750405095503859e-05, "loss": 0.017, "step": 2920 }, { "epoch": 7.940379403794038, "grad_norm": 0.3550833761692047, "learning_rate": 9.747819162795686e-05, "loss": 0.0163, "step": 2930 }, { "epoch": 7.967479674796748, "grad_norm": 0.36273545026779175, "learning_rate": 9.745220249731217e-05, "loss": 0.0158, "step": 2940 }, { "epoch": 7.994579945799458, "grad_norm": 0.5983418822288513, "learning_rate": 9.742608363415781e-05, "loss": 0.0183, "step": 2950 }, { "epoch": 8.021680216802167, "grad_norm": 0.3098985254764557, "learning_rate": 9.739983510990176e-05, "loss": 0.0186, "step": 2960 }, { "epoch": 8.048780487804878, "grad_norm": 0.42058148980140686, "learning_rate": 9.737345699630647e-05, "loss": 0.0181, "step": 2970 }, { "epoch": 8.075880758807589, "grad_norm": 0.37942612171173096, "learning_rate": 9.734694936548869e-05, "loss": 0.0189, "step": 2980 }, { "epoch": 8.102981029810298, "grad_norm": 0.39531970024108887, "learning_rate": 9.732031228991932e-05, "loss": 0.0206, "step": 2990 }, { "epoch": 8.130081300813009, "grad_norm": 0.3539341390132904, "learning_rate": 9.729354584242302e-05, "loss": 0.0195, "step": 3000 }, { "epoch": 8.157181571815718, "grad_norm": 0.4826233983039856, "learning_rate": 9.726665009617832e-05, "loss": 0.0211, "step": 3010 }, { "epoch": 8.184281842818429, "grad_norm": 0.30337023735046387, "learning_rate": 9.723962512471714e-05, "loss": 0.0199, "step": 3020 }, { "epoch": 8.211382113821138, "grad_norm": 0.36965081095695496, "learning_rate": 9.72124710019247e-05, "loss": 0.0222, "step": 3030 }, { "epoch": 8.238482384823849, "grad_norm": 0.38344642519950867, "learning_rate": 9.718518780203934e-05, "loss": 0.0205, "step": 3040 }, { "epoch": 8.265582655826558, "grad_norm": 0.33391132950782776, "learning_rate": 9.715777559965228e-05, "loss": 0.017, "step": 3050 }, { "epoch": 8.292682926829269, "grad_norm": 0.35611701011657715, "learning_rate": 9.713023446970746e-05, "loss": 0.0193, "step": 3060 }, { "epoch": 8.319783197831978, "grad_norm": 0.386920690536499, "learning_rate": 9.710256448750126e-05, "loss": 0.0182, "step": 3070 }, { "epoch": 8.346883468834688, "grad_norm": 0.31678175926208496, "learning_rate": 9.707476572868235e-05, "loss": 0.0193, "step": 3080 }, { "epoch": 8.373983739837398, "grad_norm": 0.5725724101066589, "learning_rate": 9.704683826925149e-05, "loss": 0.0173, "step": 3090 }, { "epoch": 8.401084010840108, "grad_norm": 0.47054409980773926, "learning_rate": 9.701878218556129e-05, "loss": 0.0179, "step": 3100 }, { "epoch": 8.42818428184282, "grad_norm": 0.40350958704948425, "learning_rate": 9.699059755431598e-05, "loss": 0.0232, "step": 3110 }, { "epoch": 8.455284552845528, "grad_norm": 0.44046106934547424, "learning_rate": 9.696228445257132e-05, "loss": 0.0203, "step": 3120 }, { "epoch": 8.482384823848239, "grad_norm": 0.4044431746006012, "learning_rate": 9.693384295773419e-05, "loss": 0.0172, "step": 3130 }, { "epoch": 8.509485094850948, "grad_norm": 0.6307073831558228, "learning_rate": 9.690527314756259e-05, "loss": 0.0191, "step": 3140 }, { "epoch": 8.536585365853659, "grad_norm": 0.47090259194374084, "learning_rate": 9.687657510016527e-05, "loss": 0.0208, "step": 3150 }, { "epoch": 8.563685636856368, "grad_norm": 0.40962645411491394, "learning_rate": 9.684774889400161e-05, "loss": 0.0196, "step": 3160 }, { "epoch": 8.590785907859079, "grad_norm": 0.3790275454521179, "learning_rate": 9.681879460788135e-05, "loss": 0.0199, "step": 3170 }, { "epoch": 8.617886178861788, "grad_norm": 0.40508076548576355, "learning_rate": 9.67897123209644e-05, "loss": 0.0192, "step": 3180 }, { "epoch": 8.644986449864499, "grad_norm": 0.3037061095237732, "learning_rate": 9.676050211276062e-05, "loss": 0.0198, "step": 3190 }, { "epoch": 8.672086720867208, "grad_norm": 0.40899658203125, "learning_rate": 9.673116406312962e-05, "loss": 0.0193, "step": 3200 }, { "epoch": 8.699186991869919, "grad_norm": 0.41937410831451416, "learning_rate": 9.67016982522805e-05, "loss": 0.016, "step": 3210 }, { "epoch": 8.726287262872628, "grad_norm": 0.4744589626789093, "learning_rate": 9.667210476077164e-05, "loss": 0.0172, "step": 3220 }, { "epoch": 8.753387533875339, "grad_norm": 0.41886743903160095, "learning_rate": 9.664238366951055e-05, "loss": 0.0205, "step": 3230 }, { "epoch": 8.78048780487805, "grad_norm": 0.31115201115608215, "learning_rate": 9.661253505975355e-05, "loss": 0.0177, "step": 3240 }, { "epoch": 8.807588075880759, "grad_norm": 0.439206600189209, "learning_rate": 9.658255901310557e-05, "loss": 0.0191, "step": 3250 }, { "epoch": 8.83468834688347, "grad_norm": 0.4345906972885132, "learning_rate": 9.655245561152e-05, "loss": 0.0189, "step": 3260 }, { "epoch": 8.861788617886178, "grad_norm": 0.4281444251537323, "learning_rate": 9.65222249372984e-05, "loss": 0.0199, "step": 3270 }, { "epoch": 8.88888888888889, "grad_norm": 0.5230737924575806, "learning_rate": 9.649186707309026e-05, "loss": 0.0201, "step": 3280 }, { "epoch": 8.915989159891598, "grad_norm": 0.48609423637390137, "learning_rate": 9.646138210189283e-05, "loss": 0.0166, "step": 3290 }, { "epoch": 8.94308943089431, "grad_norm": 0.5135719776153564, "learning_rate": 9.643077010705087e-05, "loss": 0.024, "step": 3300 }, { "epoch": 8.970189701897018, "grad_norm": 0.5724334716796875, "learning_rate": 9.640003117225637e-05, "loss": 0.0161, "step": 3310 }, { "epoch": 8.997289972899729, "grad_norm": 0.3117789328098297, "learning_rate": 9.636916538154846e-05, "loss": 0.0171, "step": 3320 }, { "epoch": 9.024390243902438, "grad_norm": 0.4648281931877136, "learning_rate": 9.633817281931296e-05, "loss": 0.02, "step": 3330 }, { "epoch": 9.051490514905149, "grad_norm": 0.44250431656837463, "learning_rate": 9.630705357028242e-05, "loss": 0.0163, "step": 3340 }, { "epoch": 9.07859078590786, "grad_norm": 0.36273911595344543, "learning_rate": 9.627580771953563e-05, "loss": 0.0227, "step": 3350 }, { "epoch": 9.105691056910569, "grad_norm": 0.2801952064037323, "learning_rate": 9.624443535249759e-05, "loss": 0.0187, "step": 3360 }, { "epoch": 9.13279132791328, "grad_norm": 0.32204383611679077, "learning_rate": 9.621293655493913e-05, "loss": 0.0151, "step": 3370 }, { "epoch": 9.159891598915989, "grad_norm": 0.41413235664367676, "learning_rate": 9.618131141297675e-05, "loss": 0.02, "step": 3380 }, { "epoch": 9.1869918699187, "grad_norm": 0.6155425906181335, "learning_rate": 9.614956001307242e-05, "loss": 0.0199, "step": 3390 }, { "epoch": 9.214092140921409, "grad_norm": 0.37619081139564514, "learning_rate": 9.611768244203321e-05, "loss": 0.0197, "step": 3400 }, { "epoch": 9.24119241192412, "grad_norm": 0.3616086542606354, "learning_rate": 9.60856787870112e-05, "loss": 0.0168, "step": 3410 }, { "epoch": 9.268292682926829, "grad_norm": 0.4029463827610016, "learning_rate": 9.605354913550318e-05, "loss": 0.0172, "step": 3420 }, { "epoch": 9.29539295392954, "grad_norm": 0.4518592655658722, "learning_rate": 9.602129357535037e-05, "loss": 0.0208, "step": 3430 }, { "epoch": 9.322493224932249, "grad_norm": 0.46268966794013977, "learning_rate": 9.598891219473825e-05, "loss": 0.0173, "step": 3440 }, { "epoch": 9.34959349593496, "grad_norm": 0.3157885670661926, "learning_rate": 9.595640508219625e-05, "loss": 0.0174, "step": 3450 }, { "epoch": 9.37669376693767, "grad_norm": 0.3849833011627197, "learning_rate": 9.592377232659761e-05, "loss": 0.0173, "step": 3460 }, { "epoch": 9.40379403794038, "grad_norm": 0.5847901105880737, "learning_rate": 9.589101401715904e-05, "loss": 0.0171, "step": 3470 }, { "epoch": 9.43089430894309, "grad_norm": 0.35015201568603516, "learning_rate": 9.585813024344045e-05, "loss": 0.0176, "step": 3480 }, { "epoch": 9.4579945799458, "grad_norm": 0.28172650933265686, "learning_rate": 9.58251210953449e-05, "loss": 0.0171, "step": 3490 }, { "epoch": 9.48509485094851, "grad_norm": 0.47807207703590393, "learning_rate": 9.579198666311809e-05, "loss": 0.0177, "step": 3500 }, { "epoch": 9.512195121951219, "grad_norm": 0.4642459750175476, "learning_rate": 9.575872703734832e-05, "loss": 0.0195, "step": 3510 }, { "epoch": 9.53929539295393, "grad_norm": 0.43509867787361145, "learning_rate": 9.572534230896611e-05, "loss": 0.0166, "step": 3520 }, { "epoch": 9.566395663956639, "grad_norm": 0.3764246702194214, "learning_rate": 9.569183256924403e-05, "loss": 0.0175, "step": 3530 }, { "epoch": 9.59349593495935, "grad_norm": 0.4278287887573242, "learning_rate": 9.565819790979646e-05, "loss": 0.0162, "step": 3540 }, { "epoch": 9.620596205962059, "grad_norm": 0.4532361328601837, "learning_rate": 9.562443842257925e-05, "loss": 0.0154, "step": 3550 }, { "epoch": 9.64769647696477, "grad_norm": 0.33848798274993896, "learning_rate": 9.559055419988956e-05, "loss": 0.0174, "step": 3560 }, { "epoch": 9.67479674796748, "grad_norm": 0.36627906560897827, "learning_rate": 9.555654533436557e-05, "loss": 0.016, "step": 3570 }, { "epoch": 9.70189701897019, "grad_norm": 0.37938615679740906, "learning_rate": 9.552241191898621e-05, "loss": 0.0173, "step": 3580 }, { "epoch": 9.7289972899729, "grad_norm": 0.3889431953430176, "learning_rate": 9.548815404707092e-05, "loss": 0.0202, "step": 3590 }, { "epoch": 9.75609756097561, "grad_norm": 0.46049967408180237, "learning_rate": 9.545377181227942e-05, "loss": 0.0148, "step": 3600 }, { "epoch": 9.78319783197832, "grad_norm": 0.36862003803253174, "learning_rate": 9.541926530861145e-05, "loss": 0.0203, "step": 3610 }, { "epoch": 9.81029810298103, "grad_norm": 0.39069220423698425, "learning_rate": 9.538463463040645e-05, "loss": 0.0177, "step": 3620 }, { "epoch": 9.83739837398374, "grad_norm": 0.49851489067077637, "learning_rate": 9.534987987234337e-05, "loss": 0.018, "step": 3630 }, { "epoch": 9.86449864498645, "grad_norm": 0.2856561541557312, "learning_rate": 9.53150011294404e-05, "loss": 0.0155, "step": 3640 }, { "epoch": 9.89159891598916, "grad_norm": 0.26302456855773926, "learning_rate": 9.527999849705471e-05, "loss": 0.0176, "step": 3650 }, { "epoch": 9.91869918699187, "grad_norm": 0.33188334107398987, "learning_rate": 9.524487207088213e-05, "loss": 0.0168, "step": 3660 }, { "epoch": 9.94579945799458, "grad_norm": 0.24069060385227203, "learning_rate": 9.520962194695698e-05, "loss": 0.0168, "step": 3670 }, { "epoch": 9.97289972899729, "grad_norm": 0.38009288907051086, "learning_rate": 9.517424822165175e-05, "loss": 0.0183, "step": 3680 }, { "epoch": 10.0, "grad_norm": 0.3994998037815094, "learning_rate": 9.513875099167685e-05, "loss": 0.0154, "step": 3690 }, { "epoch": 10.02710027100271, "grad_norm": 0.5198771953582764, "learning_rate": 9.510313035408035e-05, "loss": 0.016, "step": 3700 }, { "epoch": 10.05420054200542, "grad_norm": 0.3195599913597107, "learning_rate": 9.506738640624775e-05, "loss": 0.0182, "step": 3710 }, { "epoch": 10.08130081300813, "grad_norm": 0.40803587436676025, "learning_rate": 9.50315192459016e-05, "loss": 0.0204, "step": 3720 }, { "epoch": 10.10840108401084, "grad_norm": 0.49372756481170654, "learning_rate": 9.499552897110136e-05, "loss": 0.0166, "step": 3730 }, { "epoch": 10.13550135501355, "grad_norm": 0.5622862577438354, "learning_rate": 9.495941568024304e-05, "loss": 0.015, "step": 3740 }, { "epoch": 10.16260162601626, "grad_norm": 0.3338458836078644, "learning_rate": 9.492317947205904e-05, "loss": 0.0151, "step": 3750 }, { "epoch": 10.18970189701897, "grad_norm": 0.34991493821144104, "learning_rate": 9.488682044561775e-05, "loss": 0.0155, "step": 3760 }, { "epoch": 10.21680216802168, "grad_norm": 0.5052261352539062, "learning_rate": 9.485033870032335e-05, "loss": 0.0188, "step": 3770 }, { "epoch": 10.24390243902439, "grad_norm": 0.388963520526886, "learning_rate": 9.481373433591556e-05, "loss": 0.0204, "step": 3780 }, { "epoch": 10.2710027100271, "grad_norm": 0.32736435532569885, "learning_rate": 9.47770074524693e-05, "loss": 0.016, "step": 3790 }, { "epoch": 10.29810298102981, "grad_norm": 0.31952229142189026, "learning_rate": 9.474015815039446e-05, "loss": 0.0166, "step": 3800 }, { "epoch": 10.32520325203252, "grad_norm": 0.4527367353439331, "learning_rate": 9.470318653043565e-05, "loss": 0.0146, "step": 3810 }, { "epoch": 10.35230352303523, "grad_norm": 0.4061071574687958, "learning_rate": 9.466609269367185e-05, "loss": 0.016, "step": 3820 }, { "epoch": 10.379403794037941, "grad_norm": 0.3556763529777527, "learning_rate": 9.46288767415162e-05, "loss": 0.0145, "step": 3830 }, { "epoch": 10.40650406504065, "grad_norm": 0.20510871708393097, "learning_rate": 9.459153877571567e-05, "loss": 0.017, "step": 3840 }, { "epoch": 10.433604336043361, "grad_norm": 0.47999557852745056, "learning_rate": 9.455407889835087e-05, "loss": 0.0147, "step": 3850 }, { "epoch": 10.46070460704607, "grad_norm": 0.3334876298904419, "learning_rate": 9.451649721183564e-05, "loss": 0.0167, "step": 3860 }, { "epoch": 10.487804878048781, "grad_norm": 0.30660128593444824, "learning_rate": 9.447879381891692e-05, "loss": 0.0143, "step": 3870 }, { "epoch": 10.51490514905149, "grad_norm": 0.30837514996528625, "learning_rate": 9.444096882267428e-05, "loss": 0.0168, "step": 3880 }, { "epoch": 10.5420054200542, "grad_norm": 0.2502671778202057, "learning_rate": 9.440302232651988e-05, "loss": 0.0147, "step": 3890 }, { "epoch": 10.56910569105691, "grad_norm": 0.49576520919799805, "learning_rate": 9.436495443419795e-05, "loss": 0.0161, "step": 3900 }, { "epoch": 10.59620596205962, "grad_norm": 0.3976099491119385, "learning_rate": 9.432676524978466e-05, "loss": 0.0216, "step": 3910 }, { "epoch": 10.62330623306233, "grad_norm": 0.3256088197231293, "learning_rate": 9.42884548776878e-05, "loss": 0.0193, "step": 3920 }, { "epoch": 10.65040650406504, "grad_norm": 0.2998936176300049, "learning_rate": 9.425002342264646e-05, "loss": 0.0196, "step": 3930 }, { "epoch": 10.677506775067751, "grad_norm": 0.3825250566005707, "learning_rate": 9.421147098973077e-05, "loss": 0.0169, "step": 3940 }, { "epoch": 10.70460704607046, "grad_norm": 0.4926713705062866, "learning_rate": 9.41727976843416e-05, "loss": 0.022, "step": 3950 }, { "epoch": 10.731707317073171, "grad_norm": 0.26846325397491455, "learning_rate": 9.413400361221029e-05, "loss": 0.0177, "step": 3960 }, { "epoch": 10.75880758807588, "grad_norm": 0.4044261574745178, "learning_rate": 9.409508887939835e-05, "loss": 0.0152, "step": 3970 }, { "epoch": 10.785907859078591, "grad_norm": 0.399044930934906, "learning_rate": 9.40560535922972e-05, "loss": 0.017, "step": 3980 }, { "epoch": 10.8130081300813, "grad_norm": 0.2825317978858948, "learning_rate": 9.40168978576278e-05, "loss": 0.017, "step": 3990 }, { "epoch": 10.840108401084011, "grad_norm": 0.2855791747570038, "learning_rate": 9.397762178244043e-05, "loss": 0.0173, "step": 4000 }, { "epoch": 10.86720867208672, "grad_norm": 0.42250627279281616, "learning_rate": 9.393822547411439e-05, "loss": 0.0201, "step": 4010 }, { "epoch": 10.894308943089431, "grad_norm": 0.38725337386131287, "learning_rate": 9.389870904035769e-05, "loss": 0.0153, "step": 4020 }, { "epoch": 10.92140921409214, "grad_norm": 0.44211089611053467, "learning_rate": 9.385907258920672e-05, "loss": 0.0168, "step": 4030 }, { "epoch": 10.948509485094851, "grad_norm": 0.42326247692108154, "learning_rate": 9.381931622902607e-05, "loss": 0.0151, "step": 4040 }, { "epoch": 10.975609756097562, "grad_norm": 0.4364258050918579, "learning_rate": 9.377944006850807e-05, "loss": 0.0138, "step": 4050 }, { "epoch": 11.002710027100271, "grad_norm": 0.28477299213409424, "learning_rate": 9.373944421667265e-05, "loss": 0.0162, "step": 4060 }, { "epoch": 11.029810298102982, "grad_norm": 0.37012505531311035, "learning_rate": 9.369932878286691e-05, "loss": 0.015, "step": 4070 }, { "epoch": 11.05691056910569, "grad_norm": 0.2979791462421417, "learning_rate": 9.365909387676494e-05, "loss": 0.0154, "step": 4080 }, { "epoch": 11.084010840108402, "grad_norm": 0.2585712671279907, "learning_rate": 9.361873960836744e-05, "loss": 0.0155, "step": 4090 }, { "epoch": 11.11111111111111, "grad_norm": 0.37353387475013733, "learning_rate": 9.357826608800142e-05, "loss": 0.0176, "step": 4100 }, { "epoch": 11.138211382113822, "grad_norm": 0.4338681399822235, "learning_rate": 9.353767342631994e-05, "loss": 0.0173, "step": 4110 }, { "epoch": 11.16531165311653, "grad_norm": 0.3354649841785431, "learning_rate": 9.34969617343018e-05, "loss": 0.0161, "step": 4120 }, { "epoch": 11.192411924119241, "grad_norm": 0.34548553824424744, "learning_rate": 9.345613112325122e-05, "loss": 0.0184, "step": 4130 }, { "epoch": 11.21951219512195, "grad_norm": 0.5056374669075012, "learning_rate": 9.34151817047975e-05, "loss": 0.0253, "step": 4140 }, { "epoch": 11.246612466124661, "grad_norm": 0.6679131388664246, "learning_rate": 9.33741135908948e-05, "loss": 0.0175, "step": 4150 }, { "epoch": 11.27371273712737, "grad_norm": 0.3348436951637268, "learning_rate": 9.33329268938218e-05, "loss": 0.0131, "step": 4160 }, { "epoch": 11.300813008130081, "grad_norm": 0.3799762427806854, "learning_rate": 9.329162172618132e-05, "loss": 0.0189, "step": 4170 }, { "epoch": 11.327913279132792, "grad_norm": 0.23194169998168945, "learning_rate": 9.325019820090013e-05, "loss": 0.0201, "step": 4180 }, { "epoch": 11.355013550135501, "grad_norm": 0.32834088802337646, "learning_rate": 9.320865643122855e-05, "loss": 0.0164, "step": 4190 }, { "epoch": 11.382113821138212, "grad_norm": 0.32032129168510437, "learning_rate": 9.316699653074023e-05, "loss": 0.0178, "step": 4200 }, { "epoch": 11.409214092140921, "grad_norm": 0.26543691754341125, "learning_rate": 9.312521861333172e-05, "loss": 0.0144, "step": 4210 }, { "epoch": 11.436314363143632, "grad_norm": 0.2884538471698761, "learning_rate": 9.308332279322224e-05, "loss": 0.0151, "step": 4220 }, { "epoch": 11.463414634146341, "grad_norm": 0.38935190439224243, "learning_rate": 9.304130918495338e-05, "loss": 0.0168, "step": 4230 }, { "epoch": 11.490514905149052, "grad_norm": 0.28828880190849304, "learning_rate": 9.299917790338874e-05, "loss": 0.0153, "step": 4240 }, { "epoch": 11.517615176151761, "grad_norm": 0.3479684293270111, "learning_rate": 9.295692906371363e-05, "loss": 0.0175, "step": 4250 }, { "epoch": 11.544715447154472, "grad_norm": 0.2746388614177704, "learning_rate": 9.291456278143476e-05, "loss": 0.0149, "step": 4260 }, { "epoch": 11.57181571815718, "grad_norm": 0.26983797550201416, "learning_rate": 9.287207917237994e-05, "loss": 0.0165, "step": 4270 }, { "epoch": 11.598915989159892, "grad_norm": 0.42016008496284485, "learning_rate": 9.282947835269773e-05, "loss": 0.0162, "step": 4280 }, { "epoch": 11.6260162601626, "grad_norm": 0.33001309633255005, "learning_rate": 9.278676043885715e-05, "loss": 0.0152, "step": 4290 }, { "epoch": 11.653116531165312, "grad_norm": 0.27096110582351685, "learning_rate": 9.274392554764733e-05, "loss": 0.0248, "step": 4300 }, { "epoch": 11.680216802168022, "grad_norm": 0.30113741755485535, "learning_rate": 9.270097379617723e-05, "loss": 0.016, "step": 4310 }, { "epoch": 11.707317073170731, "grad_norm": 0.31313565373420715, "learning_rate": 9.26579053018753e-05, "loss": 0.0167, "step": 4320 }, { "epoch": 11.734417344173442, "grad_norm": 0.3242699205875397, "learning_rate": 9.261472018248918e-05, "loss": 0.0152, "step": 4330 }, { "epoch": 11.761517615176151, "grad_norm": 0.3740980923175812, "learning_rate": 9.25714185560853e-05, "loss": 0.0169, "step": 4340 }, { "epoch": 11.788617886178862, "grad_norm": 0.3985002934932709, "learning_rate": 9.252800054104868e-05, "loss": 0.0178, "step": 4350 }, { "epoch": 11.815718157181571, "grad_norm": 0.474053293466568, "learning_rate": 9.248446625608252e-05, "loss": 0.0154, "step": 4360 }, { "epoch": 11.842818428184282, "grad_norm": 0.3749001920223236, "learning_rate": 9.244081582020789e-05, "loss": 0.0144, "step": 4370 }, { "epoch": 11.869918699186991, "grad_norm": 0.2207919955253601, "learning_rate": 9.239704935276339e-05, "loss": 0.0156, "step": 4380 }, { "epoch": 11.897018970189702, "grad_norm": 0.3618600368499756, "learning_rate": 9.235316697340489e-05, "loss": 0.0142, "step": 4390 }, { "epoch": 11.924119241192411, "grad_norm": 0.3082793056964874, "learning_rate": 9.230916880210512e-05, "loss": 0.0168, "step": 4400 }, { "epoch": 11.951219512195122, "grad_norm": 0.45993348956108093, "learning_rate": 9.226505495915342e-05, "loss": 0.0132, "step": 4410 }, { "epoch": 11.978319783197833, "grad_norm": 0.2353542447090149, "learning_rate": 9.222082556515536e-05, "loss": 0.0195, "step": 4420 }, { "epoch": 12.005420054200542, "grad_norm": 0.3026447296142578, "learning_rate": 9.217648074103242e-05, "loss": 0.0193, "step": 4430 }, { "epoch": 12.032520325203253, "grad_norm": 0.3371847867965698, "learning_rate": 9.213202060802161e-05, "loss": 0.0162, "step": 4440 }, { "epoch": 12.059620596205962, "grad_norm": 0.4865823984146118, "learning_rate": 9.208744528767528e-05, "loss": 0.0143, "step": 4450 }, { "epoch": 12.086720867208673, "grad_norm": 0.4073032736778259, "learning_rate": 9.204275490186064e-05, "loss": 0.0185, "step": 4460 }, { "epoch": 12.113821138211382, "grad_norm": 0.43152526021003723, "learning_rate": 9.199794957275949e-05, "loss": 0.015, "step": 4470 }, { "epoch": 12.140921409214092, "grad_norm": 0.4283233880996704, "learning_rate": 9.19530294228679e-05, "loss": 0.0149, "step": 4480 }, { "epoch": 12.168021680216802, "grad_norm": 0.36988264322280884, "learning_rate": 9.190799457499583e-05, "loss": 0.0173, "step": 4490 }, { "epoch": 12.195121951219512, "grad_norm": 0.2985976040363312, "learning_rate": 9.186284515226686e-05, "loss": 0.017, "step": 4500 }, { "epoch": 12.222222222222221, "grad_norm": 0.34660500288009644, "learning_rate": 9.181758127811777e-05, "loss": 0.0159, "step": 4510 }, { "epoch": 12.249322493224932, "grad_norm": 0.3189932703971863, "learning_rate": 9.177220307629825e-05, "loss": 0.0174, "step": 4520 }, { "epoch": 12.276422764227643, "grad_norm": 0.3113153874874115, "learning_rate": 9.172671067087059e-05, "loss": 0.0202, "step": 4530 }, { "epoch": 12.303523035230352, "grad_norm": 0.4450657069683075, "learning_rate": 9.16811041862093e-05, "loss": 0.0153, "step": 4540 }, { "epoch": 12.330623306233063, "grad_norm": 0.2966851592063904, "learning_rate": 9.163538374700076e-05, "loss": 0.0207, "step": 4550 }, { "epoch": 12.357723577235772, "grad_norm": 0.37889787554740906, "learning_rate": 9.158954947824287e-05, "loss": 0.0147, "step": 4560 }, { "epoch": 12.384823848238483, "grad_norm": 0.5613305568695068, "learning_rate": 9.154360150524482e-05, "loss": 0.0149, "step": 4570 }, { "epoch": 12.411924119241192, "grad_norm": 0.3804514706134796, "learning_rate": 9.14975399536266e-05, "loss": 0.0161, "step": 4580 }, { "epoch": 12.439024390243903, "grad_norm": 0.3592541813850403, "learning_rate": 9.14513649493187e-05, "loss": 0.0145, "step": 4590 }, { "epoch": 12.466124661246612, "grad_norm": 0.29926979541778564, "learning_rate": 9.140507661856187e-05, "loss": 0.0144, "step": 4600 }, { "epoch": 12.493224932249323, "grad_norm": 0.37076982855796814, "learning_rate": 9.135867508790661e-05, "loss": 0.0188, "step": 4610 }, { "epoch": 12.520325203252032, "grad_norm": 0.3265802562236786, "learning_rate": 9.131216048421291e-05, "loss": 0.0175, "step": 4620 }, { "epoch": 12.547425474254743, "grad_norm": 0.2562776505947113, "learning_rate": 9.126553293464998e-05, "loss": 0.0169, "step": 4630 }, { "epoch": 12.574525745257453, "grad_norm": 0.3824804723262787, "learning_rate": 9.121879256669572e-05, "loss": 0.0158, "step": 4640 }, { "epoch": 12.601626016260163, "grad_norm": 0.23421670496463776, "learning_rate": 9.117193950813652e-05, "loss": 0.0161, "step": 4650 }, { "epoch": 12.628726287262873, "grad_norm": 0.40515634417533875, "learning_rate": 9.112497388706685e-05, "loss": 0.014, "step": 4660 }, { "epoch": 12.655826558265582, "grad_norm": 0.533778727054596, "learning_rate": 9.10778958318889e-05, "loss": 0.0151, "step": 4670 }, { "epoch": 12.682926829268293, "grad_norm": 0.6320534944534302, "learning_rate": 9.103070547131232e-05, "loss": 0.017, "step": 4680 }, { "epoch": 12.710027100271002, "grad_norm": 0.3805629312992096, "learning_rate": 9.098340293435375e-05, "loss": 0.0163, "step": 4690 }, { "epoch": 12.737127371273713, "grad_norm": 0.3636171221733093, "learning_rate": 9.093598835033649e-05, "loss": 0.0159, "step": 4700 }, { "epoch": 12.764227642276422, "grad_norm": 0.3911455571651459, "learning_rate": 9.088846184889021e-05, "loss": 0.0149, "step": 4710 }, { "epoch": 12.791327913279133, "grad_norm": 0.5690465569496155, "learning_rate": 9.084082355995057e-05, "loss": 0.0161, "step": 4720 }, { "epoch": 12.818428184281842, "grad_norm": 0.23277707397937775, "learning_rate": 9.079307361375882e-05, "loss": 0.0161, "step": 4730 }, { "epoch": 12.845528455284553, "grad_norm": 0.5010408163070679, "learning_rate": 9.074521214086149e-05, "loss": 0.0159, "step": 4740 }, { "epoch": 12.872628726287262, "grad_norm": 0.2508997321128845, "learning_rate": 9.069723927211001e-05, "loss": 0.014, "step": 4750 }, { "epoch": 12.899728997289973, "grad_norm": 0.3578003942966461, "learning_rate": 9.064915513866037e-05, "loss": 0.0176, "step": 4760 }, { "epoch": 12.926829268292684, "grad_norm": 0.41484764218330383, "learning_rate": 9.060095987197279e-05, "loss": 0.0171, "step": 4770 }, { "epoch": 12.953929539295393, "grad_norm": 0.412264347076416, "learning_rate": 9.055265360381126e-05, "loss": 0.015, "step": 4780 }, { "epoch": 12.981029810298104, "grad_norm": 0.38076651096343994, "learning_rate": 9.050423646624326e-05, "loss": 0.0145, "step": 4790 }, { "epoch": 13.008130081300813, "grad_norm": 0.6705567240715027, "learning_rate": 9.045570859163943e-05, "loss": 0.0157, "step": 4800 }, { "epoch": 13.035230352303524, "grad_norm": 0.34331464767456055, "learning_rate": 9.04070701126731e-05, "loss": 0.0166, "step": 4810 }, { "epoch": 13.062330623306233, "grad_norm": 0.4220140278339386, "learning_rate": 9.035832116232001e-05, "loss": 0.014, "step": 4820 }, { "epoch": 13.089430894308943, "grad_norm": 0.43213409185409546, "learning_rate": 9.030946187385796e-05, "loss": 0.0154, "step": 4830 }, { "epoch": 13.116531165311653, "grad_norm": 0.4724159836769104, "learning_rate": 9.026049238086635e-05, "loss": 0.0201, "step": 4840 }, { "epoch": 13.143631436314363, "grad_norm": 0.33291006088256836, "learning_rate": 9.021141281722591e-05, "loss": 0.0138, "step": 4850 }, { "epoch": 13.170731707317072, "grad_norm": 0.22058884799480438, "learning_rate": 9.01622233171183e-05, "loss": 0.0172, "step": 4860 }, { "epoch": 13.197831978319783, "grad_norm": 0.33321505784988403, "learning_rate": 9.011292401502574e-05, "loss": 0.0147, "step": 4870 }, { "epoch": 13.224932249322492, "grad_norm": 0.5321431159973145, "learning_rate": 9.006351504573063e-05, "loss": 0.0137, "step": 4880 }, { "epoch": 13.252032520325203, "grad_norm": 0.3020470440387726, "learning_rate": 9.001399654431519e-05, "loss": 0.0145, "step": 4890 }, { "epoch": 13.279132791327914, "grad_norm": 0.3236350119113922, "learning_rate": 8.996436864616116e-05, "loss": 0.015, "step": 4900 }, { "epoch": 13.306233062330623, "grad_norm": 0.2712349593639374, "learning_rate": 8.991463148694925e-05, "loss": 0.0142, "step": 4910 }, { "epoch": 13.333333333333334, "grad_norm": 0.44283074140548706, "learning_rate": 8.986478520265902e-05, "loss": 0.0196, "step": 4920 }, { "epoch": 13.360433604336043, "grad_norm": 0.22564706206321716, "learning_rate": 8.981482992956827e-05, "loss": 0.0127, "step": 4930 }, { "epoch": 13.387533875338754, "grad_norm": 0.3248700797557831, "learning_rate": 8.976476580425282e-05, "loss": 0.0144, "step": 4940 }, { "epoch": 13.414634146341463, "grad_norm": 0.4053505063056946, "learning_rate": 8.971459296358606e-05, "loss": 0.0143, "step": 4950 }, { "epoch": 13.441734417344174, "grad_norm": 0.3838721513748169, "learning_rate": 8.966431154473864e-05, "loss": 0.0169, "step": 4960 }, { "epoch": 13.468834688346883, "grad_norm": 0.23523004353046417, "learning_rate": 8.961392168517803e-05, "loss": 0.0146, "step": 4970 }, { "epoch": 13.495934959349594, "grad_norm": 0.35759395360946655, "learning_rate": 8.956342352266821e-05, "loss": 0.0193, "step": 4980 }, { "epoch": 13.523035230352303, "grad_norm": 0.3079873323440552, "learning_rate": 8.95128171952692e-05, "loss": 0.0148, "step": 4990 }, { "epoch": 13.550135501355014, "grad_norm": 0.2992299795150757, "learning_rate": 8.946210284133676e-05, "loss": 0.0143, "step": 5000 }, { "epoch": 13.577235772357724, "grad_norm": 0.24212828278541565, "learning_rate": 8.941128059952201e-05, "loss": 0.0146, "step": 5010 }, { "epoch": 13.604336043360433, "grad_norm": 0.25437813997268677, "learning_rate": 8.936035060877102e-05, "loss": 0.016, "step": 5020 }, { "epoch": 13.631436314363144, "grad_norm": 0.2603600025177002, "learning_rate": 8.930931300832443e-05, "loss": 0.0146, "step": 5030 }, { "epoch": 13.658536585365853, "grad_norm": 0.4143906235694885, "learning_rate": 8.925816793771711e-05, "loss": 0.0154, "step": 5040 }, { "epoch": 13.685636856368564, "grad_norm": 0.35346177220344543, "learning_rate": 8.92069155367777e-05, "loss": 0.0148, "step": 5050 }, { "epoch": 13.712737127371273, "grad_norm": 0.2627791166305542, "learning_rate": 8.915555594562834e-05, "loss": 0.0166, "step": 5060 }, { "epoch": 13.739837398373984, "grad_norm": 0.3682796359062195, "learning_rate": 8.910408930468416e-05, "loss": 0.0128, "step": 5070 }, { "epoch": 13.766937669376693, "grad_norm": 0.3088335692882538, "learning_rate": 8.905251575465303e-05, "loss": 0.0186, "step": 5080 }, { "epoch": 13.794037940379404, "grad_norm": 0.3770740330219269, "learning_rate": 8.900083543653502e-05, "loss": 0.0188, "step": 5090 }, { "epoch": 13.821138211382113, "grad_norm": 0.2978116273880005, "learning_rate": 8.894904849162218e-05, "loss": 0.0162, "step": 5100 }, { "epoch": 13.848238482384824, "grad_norm": 0.33441388607025146, "learning_rate": 8.889715506149802e-05, "loss": 0.0139, "step": 5110 }, { "epoch": 13.875338753387535, "grad_norm": 0.27596601843833923, "learning_rate": 8.884515528803722e-05, "loss": 0.0127, "step": 5120 }, { "epoch": 13.902439024390244, "grad_norm": 0.35711896419525146, "learning_rate": 8.879304931340517e-05, "loss": 0.0145, "step": 5130 }, { "epoch": 13.929539295392955, "grad_norm": 0.2310306876897812, "learning_rate": 8.874083728005759e-05, "loss": 0.0163, "step": 5140 }, { "epoch": 13.956639566395664, "grad_norm": 0.24063165485858917, "learning_rate": 8.868851933074021e-05, "loss": 0.0161, "step": 5150 }, { "epoch": 13.983739837398375, "grad_norm": 0.2985197603702545, "learning_rate": 8.863609560848829e-05, "loss": 0.0132, "step": 5160 }, { "epoch": 14.010840108401084, "grad_norm": 0.23364852368831635, "learning_rate": 8.85835662566263e-05, "loss": 0.0128, "step": 5170 }, { "epoch": 14.037940379403794, "grad_norm": 0.310087114572525, "learning_rate": 8.853093141876747e-05, "loss": 0.0183, "step": 5180 }, { "epoch": 14.065040650406504, "grad_norm": 0.2995472550392151, "learning_rate": 8.847819123881343e-05, "loss": 0.0141, "step": 5190 }, { "epoch": 14.092140921409214, "grad_norm": 0.2630060613155365, "learning_rate": 8.842534586095383e-05, "loss": 0.0208, "step": 5200 }, { "epoch": 14.119241192411923, "grad_norm": 0.3494822084903717, "learning_rate": 8.837239542966593e-05, "loss": 0.0139, "step": 5210 }, { "epoch": 14.146341463414634, "grad_norm": 0.35882875323295593, "learning_rate": 8.831934008971417e-05, "loss": 0.0151, "step": 5220 }, { "epoch": 14.173441734417343, "grad_norm": 0.2592933177947998, "learning_rate": 8.826617998614982e-05, "loss": 0.0215, "step": 5230 }, { "epoch": 14.200542005420054, "grad_norm": 0.3262818157672882, "learning_rate": 8.821291526431056e-05, "loss": 0.0157, "step": 5240 }, { "epoch": 14.227642276422765, "grad_norm": 0.3307357132434845, "learning_rate": 8.815954606982015e-05, "loss": 0.0174, "step": 5250 }, { "epoch": 14.254742547425474, "grad_norm": 0.23086190223693848, "learning_rate": 8.810607254858789e-05, "loss": 0.0165, "step": 5260 }, { "epoch": 14.281842818428185, "grad_norm": 0.491158127784729, "learning_rate": 8.805249484680838e-05, "loss": 0.0157, "step": 5270 }, { "epoch": 14.308943089430894, "grad_norm": 0.2703213691711426, "learning_rate": 8.799881311096096e-05, "loss": 0.0142, "step": 5280 }, { "epoch": 14.336043360433605, "grad_norm": 0.25312045216560364, "learning_rate": 8.794502748780949e-05, "loss": 0.0135, "step": 5290 }, { "epoch": 14.363143631436314, "grad_norm": 0.24803681671619415, "learning_rate": 8.78911381244018e-05, "loss": 0.0135, "step": 5300 }, { "epoch": 14.390243902439025, "grad_norm": 0.3293529152870178, "learning_rate": 8.783714516806933e-05, "loss": 0.014, "step": 5310 }, { "epoch": 14.417344173441734, "grad_norm": 0.3023531436920166, "learning_rate": 8.77830487664268e-05, "loss": 0.0142, "step": 5320 }, { "epoch": 14.444444444444445, "grad_norm": 0.26721900701522827, "learning_rate": 8.772884906737167e-05, "loss": 0.0161, "step": 5330 }, { "epoch": 14.471544715447154, "grad_norm": 0.23062612116336823, "learning_rate": 8.767454621908387e-05, "loss": 0.013, "step": 5340 }, { "epoch": 14.498644986449865, "grad_norm": 0.36114969849586487, "learning_rate": 8.76201403700253e-05, "loss": 0.0149, "step": 5350 }, { "epoch": 14.525745257452574, "grad_norm": 0.4007132351398468, "learning_rate": 8.756563166893949e-05, "loss": 0.0135, "step": 5360 }, { "epoch": 14.552845528455284, "grad_norm": 0.2818199098110199, "learning_rate": 8.751102026485113e-05, "loss": 0.0155, "step": 5370 }, { "epoch": 14.579945799457995, "grad_norm": 0.33242857456207275, "learning_rate": 8.745630630706571e-05, "loss": 0.0112, "step": 5380 }, { "epoch": 14.607046070460704, "grad_norm": 0.2962556481361389, "learning_rate": 8.740148994516912e-05, "loss": 0.0146, "step": 5390 }, { "epoch": 14.634146341463415, "grad_norm": 0.38459765911102295, "learning_rate": 8.73465713290272e-05, "loss": 0.0148, "step": 5400 }, { "epoch": 14.661246612466124, "grad_norm": 0.26727205514907837, "learning_rate": 8.729155060878533e-05, "loss": 0.0147, "step": 5410 }, { "epoch": 14.688346883468835, "grad_norm": 0.2319994866847992, "learning_rate": 8.723642793486809e-05, "loss": 0.0119, "step": 5420 }, { "epoch": 14.715447154471544, "grad_norm": 0.2850014567375183, "learning_rate": 8.718120345797873e-05, "loss": 0.0134, "step": 5430 }, { "epoch": 14.742547425474255, "grad_norm": 0.3637334108352661, "learning_rate": 8.712587732909889e-05, "loss": 0.0137, "step": 5440 }, { "epoch": 14.769647696476964, "grad_norm": 0.30565622448921204, "learning_rate": 8.707044969948806e-05, "loss": 0.014, "step": 5450 }, { "epoch": 14.796747967479675, "grad_norm": 0.24348856508731842, "learning_rate": 8.701492072068329e-05, "loss": 0.0122, "step": 5460 }, { "epoch": 14.823848238482384, "grad_norm": 0.28320083022117615, "learning_rate": 8.695929054449869e-05, "loss": 0.0131, "step": 5470 }, { "epoch": 14.850948509485095, "grad_norm": 0.2357870638370514, "learning_rate": 8.690355932302501e-05, "loss": 0.0152, "step": 5480 }, { "epoch": 14.878048780487806, "grad_norm": 0.39876553416252136, "learning_rate": 8.684772720862931e-05, "loss": 0.0158, "step": 5490 }, { "epoch": 14.905149051490515, "grad_norm": 0.24273578822612762, "learning_rate": 8.679179435395446e-05, "loss": 0.0146, "step": 5500 }, { "epoch": 14.932249322493226, "grad_norm": 0.24913246929645538, "learning_rate": 8.673576091191874e-05, "loss": 0.0154, "step": 5510 }, { "epoch": 14.959349593495935, "grad_norm": 0.22345690429210663, "learning_rate": 8.667962703571541e-05, "loss": 0.0132, "step": 5520 }, { "epoch": 14.986449864498645, "grad_norm": 0.32146909832954407, "learning_rate": 8.662339287881238e-05, "loss": 0.0141, "step": 5530 }, { "epoch": 15.013550135501355, "grad_norm": 0.4377402365207672, "learning_rate": 8.656705859495169e-05, "loss": 0.0135, "step": 5540 }, { "epoch": 15.040650406504065, "grad_norm": 0.44416552782058716, "learning_rate": 8.651062433814912e-05, "loss": 0.0148, "step": 5550 }, { "epoch": 15.067750677506774, "grad_norm": 0.3092197775840759, "learning_rate": 8.645409026269375e-05, "loss": 0.0161, "step": 5560 }, { "epoch": 15.094850948509485, "grad_norm": 0.5030121207237244, "learning_rate": 8.639745652314759e-05, "loss": 0.0166, "step": 5570 }, { "epoch": 15.121951219512194, "grad_norm": 0.2838561534881592, "learning_rate": 8.634072327434515e-05, "loss": 0.0131, "step": 5580 }, { "epoch": 15.149051490514905, "grad_norm": 0.3340374529361725, "learning_rate": 8.628389067139294e-05, "loss": 0.0172, "step": 5590 }, { "epoch": 15.176151761517616, "grad_norm": 0.38063105940818787, "learning_rate": 8.622695886966911e-05, "loss": 0.0123, "step": 5600 }, { "epoch": 15.203252032520325, "grad_norm": 0.2871711254119873, "learning_rate": 8.616992802482308e-05, "loss": 0.014, "step": 5610 }, { "epoch": 15.230352303523036, "grad_norm": 0.2697857618331909, "learning_rate": 8.611279829277496e-05, "loss": 0.013, "step": 5620 }, { "epoch": 15.257452574525745, "grad_norm": 0.22135888040065765, "learning_rate": 8.605556982971528e-05, "loss": 0.0142, "step": 5630 }, { "epoch": 15.284552845528456, "grad_norm": 0.2637561857700348, "learning_rate": 8.599824279210447e-05, "loss": 0.012, "step": 5640 }, { "epoch": 15.311653116531165, "grad_norm": 0.5207611918449402, "learning_rate": 8.594081733667243e-05, "loss": 0.0117, "step": 5650 }, { "epoch": 15.338753387533876, "grad_norm": 0.20667213201522827, "learning_rate": 8.58832936204182e-05, "loss": 0.0153, "step": 5660 }, { "epoch": 15.365853658536585, "grad_norm": 0.2879221439361572, "learning_rate": 8.582567180060942e-05, "loss": 0.0132, "step": 5670 }, { "epoch": 15.392953929539296, "grad_norm": 0.2962784171104431, "learning_rate": 8.576795203478194e-05, "loss": 0.0146, "step": 5680 }, { "epoch": 15.420054200542005, "grad_norm": 0.2572590112686157, "learning_rate": 8.571013448073939e-05, "loss": 0.0139, "step": 5690 }, { "epoch": 15.447154471544716, "grad_norm": 0.3388477563858032, "learning_rate": 8.565221929655275e-05, "loss": 0.0142, "step": 5700 }, { "epoch": 15.474254742547426, "grad_norm": 0.28191912174224854, "learning_rate": 8.559420664055992e-05, "loss": 0.0135, "step": 5710 }, { "epoch": 15.501355013550135, "grad_norm": 0.35937583446502686, "learning_rate": 8.553609667136532e-05, "loss": 0.0131, "step": 5720 }, { "epoch": 15.528455284552846, "grad_norm": 0.21419385075569153, "learning_rate": 8.547788954783936e-05, "loss": 0.0126, "step": 5730 }, { "epoch": 15.555555555555555, "grad_norm": 0.28029024600982666, "learning_rate": 8.541958542911808e-05, "loss": 0.0118, "step": 5740 }, { "epoch": 15.582655826558266, "grad_norm": 0.3047841787338257, "learning_rate": 8.536118447460275e-05, "loss": 0.0144, "step": 5750 }, { "epoch": 15.609756097560975, "grad_norm": 0.25910335779190063, "learning_rate": 8.530268684395932e-05, "loss": 0.0127, "step": 5760 }, { "epoch": 15.636856368563686, "grad_norm": 0.26464134454727173, "learning_rate": 8.524409269711807e-05, "loss": 0.0121, "step": 5770 }, { "epoch": 15.663956639566395, "grad_norm": 0.23470868170261383, "learning_rate": 8.51854021942732e-05, "loss": 0.013, "step": 5780 }, { "epoch": 15.691056910569106, "grad_norm": 0.2764555513858795, "learning_rate": 8.512661549588227e-05, "loss": 0.0126, "step": 5790 }, { "epoch": 15.718157181571815, "grad_norm": 0.3263511657714844, "learning_rate": 8.506773276266588e-05, "loss": 0.0166, "step": 5800 }, { "epoch": 15.745257452574526, "grad_norm": 0.20919333398342133, "learning_rate": 8.500875415560721e-05, "loss": 0.0133, "step": 5810 }, { "epoch": 15.772357723577235, "grad_norm": 0.1642041802406311, "learning_rate": 8.494967983595144e-05, "loss": 0.0123, "step": 5820 }, { "epoch": 15.799457994579946, "grad_norm": 0.31499746441841125, "learning_rate": 8.489050996520558e-05, "loss": 0.0146, "step": 5830 }, { "epoch": 15.826558265582655, "grad_norm": 0.3515714406967163, "learning_rate": 8.483124470513775e-05, "loss": 0.017, "step": 5840 }, { "epoch": 15.853658536585366, "grad_norm": 0.25549376010894775, "learning_rate": 8.477188421777692e-05, "loss": 0.0187, "step": 5850 }, { "epoch": 15.880758807588077, "grad_norm": 0.30778366327285767, "learning_rate": 8.47124286654124e-05, "loss": 0.0123, "step": 5860 }, { "epoch": 15.907859078590786, "grad_norm": 0.27209848165512085, "learning_rate": 8.465287821059341e-05, "loss": 0.0122, "step": 5870 }, { "epoch": 15.934959349593496, "grad_norm": 0.32061418890953064, "learning_rate": 8.45932330161286e-05, "loss": 0.0147, "step": 5880 }, { "epoch": 15.962059620596206, "grad_norm": 0.28618741035461426, "learning_rate": 8.453349324508567e-05, "loss": 0.0143, "step": 5890 }, { "epoch": 15.989159891598916, "grad_norm": 0.31425946950912476, "learning_rate": 8.447365906079088e-05, "loss": 0.0129, "step": 5900 }, { "epoch": 16.016260162601625, "grad_norm": 0.2510456144809723, "learning_rate": 8.441373062682856e-05, "loss": 0.0129, "step": 5910 }, { "epoch": 16.043360433604335, "grad_norm": 0.44366341829299927, "learning_rate": 8.43537081070408e-05, "loss": 0.014, "step": 5920 }, { "epoch": 16.070460704607047, "grad_norm": 0.21848124265670776, "learning_rate": 8.429359166552689e-05, "loss": 0.0119, "step": 5930 }, { "epoch": 16.097560975609756, "grad_norm": 0.2114468663930893, "learning_rate": 8.423338146664284e-05, "loss": 0.0149, "step": 5940 }, { "epoch": 16.124661246612465, "grad_norm": 0.24260343611240387, "learning_rate": 8.417307767500107e-05, "loss": 0.0128, "step": 5950 }, { "epoch": 16.151761517615178, "grad_norm": 0.38527002930641174, "learning_rate": 8.411268045546983e-05, "loss": 0.0124, "step": 5960 }, { "epoch": 16.178861788617887, "grad_norm": 0.3239612877368927, "learning_rate": 8.405218997317281e-05, "loss": 0.0137, "step": 5970 }, { "epoch": 16.205962059620596, "grad_norm": 0.4456019997596741, "learning_rate": 8.399160639348869e-05, "loss": 0.0136, "step": 5980 }, { "epoch": 16.233062330623305, "grad_norm": 0.3233359754085541, "learning_rate": 8.393092988205065e-05, "loss": 0.0162, "step": 5990 }, { "epoch": 16.260162601626018, "grad_norm": 0.4605883061885834, "learning_rate": 8.387016060474597e-05, "loss": 0.0126, "step": 6000 }, { "epoch": 16.287262872628727, "grad_norm": 0.27295324206352234, "learning_rate": 8.380929872771551e-05, "loss": 0.0124, "step": 6010 }, { "epoch": 16.314363143631436, "grad_norm": 0.26586586236953735, "learning_rate": 8.374834441735335e-05, "loss": 0.0119, "step": 6020 }, { "epoch": 16.341463414634145, "grad_norm": 0.4113512635231018, "learning_rate": 8.368729784030622e-05, "loss": 0.0132, "step": 6030 }, { "epoch": 16.368563685636857, "grad_norm": 0.31470584869384766, "learning_rate": 8.362615916347315e-05, "loss": 0.015, "step": 6040 }, { "epoch": 16.395663956639567, "grad_norm": 0.37046849727630615, "learning_rate": 8.356492855400493e-05, "loss": 0.0144, "step": 6050 }, { "epoch": 16.422764227642276, "grad_norm": 0.2530243992805481, "learning_rate": 8.350360617930371e-05, "loss": 0.0126, "step": 6060 }, { "epoch": 16.449864498644985, "grad_norm": 0.2795167565345764, "learning_rate": 8.344219220702255e-05, "loss": 0.0131, "step": 6070 }, { "epoch": 16.476964769647697, "grad_norm": 0.3026662766933441, "learning_rate": 8.338068680506485e-05, "loss": 0.0132, "step": 6080 }, { "epoch": 16.504065040650406, "grad_norm": 0.41488534212112427, "learning_rate": 8.33190901415841e-05, "loss": 0.0116, "step": 6090 }, { "epoch": 16.531165311653115, "grad_norm": 0.3055301010608673, "learning_rate": 8.325740238498317e-05, "loss": 0.0157, "step": 6100 }, { "epoch": 16.558265582655828, "grad_norm": 0.3128865659236908, "learning_rate": 8.319562370391406e-05, "loss": 0.0146, "step": 6110 }, { "epoch": 16.585365853658537, "grad_norm": 0.22164388000965118, "learning_rate": 8.31337542672773e-05, "loss": 0.0166, "step": 6120 }, { "epoch": 16.612466124661246, "grad_norm": 0.3535219728946686, "learning_rate": 8.307179424422158e-05, "loss": 0.0135, "step": 6130 }, { "epoch": 16.639566395663955, "grad_norm": 0.3227238059043884, "learning_rate": 8.300974380414327e-05, "loss": 0.0145, "step": 6140 }, { "epoch": 16.666666666666668, "grad_norm": 0.3335765600204468, "learning_rate": 8.294760311668586e-05, "loss": 0.0184, "step": 6150 }, { "epoch": 16.693766937669377, "grad_norm": 0.5285273790359497, "learning_rate": 8.288537235173961e-05, "loss": 0.0125, "step": 6160 }, { "epoch": 16.720867208672086, "grad_norm": 0.30432599782943726, "learning_rate": 8.282305167944108e-05, "loss": 0.0139, "step": 6170 }, { "epoch": 16.747967479674795, "grad_norm": 0.25811246037483215, "learning_rate": 8.276064127017262e-05, "loss": 0.0135, "step": 6180 }, { "epoch": 16.775067750677508, "grad_norm": 0.23955923318862915, "learning_rate": 8.269814129456189e-05, "loss": 0.0119, "step": 6190 }, { "epoch": 16.802168021680217, "grad_norm": 0.4376921057701111, "learning_rate": 8.263555192348143e-05, "loss": 0.0162, "step": 6200 }, { "epoch": 16.829268292682926, "grad_norm": 0.24173936247825623, "learning_rate": 8.257287332804819e-05, "loss": 0.0131, "step": 6210 }, { "epoch": 16.85636856368564, "grad_norm": 0.4144163131713867, "learning_rate": 8.251010567962307e-05, "loss": 0.0146, "step": 6220 }, { "epoch": 16.883468834688347, "grad_norm": 0.2001628875732422, "learning_rate": 8.244724914981041e-05, "loss": 0.0135, "step": 6230 }, { "epoch": 16.910569105691057, "grad_norm": 0.34540918469429016, "learning_rate": 8.238430391045757e-05, "loss": 0.0111, "step": 6240 }, { "epoch": 16.937669376693766, "grad_norm": 0.34133610129356384, "learning_rate": 8.232127013365445e-05, "loss": 0.0136, "step": 6250 }, { "epoch": 16.964769647696478, "grad_norm": 0.37593787908554077, "learning_rate": 8.225814799173295e-05, "loss": 0.0147, "step": 6260 }, { "epoch": 16.991869918699187, "grad_norm": 0.2167070060968399, "learning_rate": 8.219493765726663e-05, "loss": 0.0175, "step": 6270 }, { "epoch": 17.018970189701896, "grad_norm": 0.348193496465683, "learning_rate": 8.21316393030701e-05, "loss": 0.0143, "step": 6280 }, { "epoch": 17.046070460704605, "grad_norm": 0.30778640508651733, "learning_rate": 8.206825310219865e-05, "loss": 0.0116, "step": 6290 }, { "epoch": 17.073170731707318, "grad_norm": 0.25451111793518066, "learning_rate": 8.200477922794776e-05, "loss": 0.0125, "step": 6300 }, { "epoch": 17.100271002710027, "grad_norm": 0.33044618368148804, "learning_rate": 8.194121785385256e-05, "loss": 0.0136, "step": 6310 }, { "epoch": 17.127371273712736, "grad_norm": 0.391546368598938, "learning_rate": 8.187756915368741e-05, "loss": 0.013, "step": 6320 }, { "epoch": 17.15447154471545, "grad_norm": 0.30948764085769653, "learning_rate": 8.181383330146544e-05, "loss": 0.0142, "step": 6330 }, { "epoch": 17.181571815718158, "grad_norm": 0.3148198425769806, "learning_rate": 8.175001047143804e-05, "loss": 0.0135, "step": 6340 }, { "epoch": 17.208672086720867, "grad_norm": 0.3118777573108673, "learning_rate": 8.168610083809438e-05, "loss": 0.0167, "step": 6350 }, { "epoch": 17.235772357723576, "grad_norm": 0.2305026650428772, "learning_rate": 8.162210457616095e-05, "loss": 0.0136, "step": 6360 }, { "epoch": 17.26287262872629, "grad_norm": 0.16907799243927002, "learning_rate": 8.155802186060109e-05, "loss": 0.0129, "step": 6370 }, { "epoch": 17.289972899728998, "grad_norm": 0.20824038982391357, "learning_rate": 8.149385286661453e-05, "loss": 0.0127, "step": 6380 }, { "epoch": 17.317073170731707, "grad_norm": 0.2678251266479492, "learning_rate": 8.14295977696368e-05, "loss": 0.0176, "step": 6390 }, { "epoch": 17.344173441734416, "grad_norm": 0.22059978544712067, "learning_rate": 8.13652567453389e-05, "loss": 0.0119, "step": 6400 }, { "epoch": 17.37127371273713, "grad_norm": 0.397085964679718, "learning_rate": 8.130082996962676e-05, "loss": 0.0107, "step": 6410 }, { "epoch": 17.398373983739837, "grad_norm": 0.33665013313293457, "learning_rate": 8.123631761864068e-05, "loss": 0.0157, "step": 6420 }, { "epoch": 17.425474254742547, "grad_norm": 0.24077743291854858, "learning_rate": 8.1171719868755e-05, "loss": 0.0119, "step": 6430 }, { "epoch": 17.45257452574526, "grad_norm": 0.22126083076000214, "learning_rate": 8.110703689657748e-05, "loss": 0.0144, "step": 6440 }, { "epoch": 17.479674796747968, "grad_norm": 0.2559535503387451, "learning_rate": 8.104226887894892e-05, "loss": 0.0124, "step": 6450 }, { "epoch": 17.506775067750677, "grad_norm": 0.35967516899108887, "learning_rate": 8.097741599294257e-05, "loss": 0.0131, "step": 6460 }, { "epoch": 17.533875338753386, "grad_norm": 0.4884195327758789, "learning_rate": 8.091247841586378e-05, "loss": 0.0123, "step": 6470 }, { "epoch": 17.5609756097561, "grad_norm": 0.2754122316837311, "learning_rate": 8.084745632524939e-05, "loss": 0.0127, "step": 6480 }, { "epoch": 17.588075880758808, "grad_norm": 0.3094245195388794, "learning_rate": 8.07823498988673e-05, "loss": 0.0137, "step": 6490 }, { "epoch": 17.615176151761517, "grad_norm": 0.3144064247608185, "learning_rate": 8.071715931471602e-05, "loss": 0.0164, "step": 6500 }, { "epoch": 17.642276422764226, "grad_norm": 0.33935287594795227, "learning_rate": 8.06518847510241e-05, "loss": 0.0134, "step": 6510 }, { "epoch": 17.66937669376694, "grad_norm": 0.5852866768836975, "learning_rate": 8.058652638624971e-05, "loss": 0.0147, "step": 6520 }, { "epoch": 17.696476964769648, "grad_norm": 0.31919071078300476, "learning_rate": 8.052108439908013e-05, "loss": 0.0136, "step": 6530 }, { "epoch": 17.723577235772357, "grad_norm": 0.21925532817840576, "learning_rate": 8.045555896843125e-05, "loss": 0.0113, "step": 6540 }, { "epoch": 17.75067750677507, "grad_norm": 0.34029707312583923, "learning_rate": 8.03899502734471e-05, "loss": 0.0131, "step": 6550 }, { "epoch": 17.77777777777778, "grad_norm": 0.3233349621295929, "learning_rate": 8.032425849349931e-05, "loss": 0.014, "step": 6560 }, { "epoch": 17.804878048780488, "grad_norm": 0.30175909399986267, "learning_rate": 8.025848380818674e-05, "loss": 0.0154, "step": 6570 }, { "epoch": 17.831978319783197, "grad_norm": 0.2058003842830658, "learning_rate": 8.019262639733487e-05, "loss": 0.0129, "step": 6580 }, { "epoch": 17.85907859078591, "grad_norm": 0.22683003544807434, "learning_rate": 8.012668644099531e-05, "loss": 0.0123, "step": 6590 }, { "epoch": 17.88617886178862, "grad_norm": 0.23022229969501495, "learning_rate": 8.006066411944542e-05, "loss": 0.0133, "step": 6600 }, { "epoch": 17.913279132791327, "grad_norm": 0.23451191186904907, "learning_rate": 7.999455961318769e-05, "loss": 0.0156, "step": 6610 }, { "epoch": 17.940379403794037, "grad_norm": 0.40380334854125977, "learning_rate": 7.992837310294932e-05, "loss": 0.0111, "step": 6620 }, { "epoch": 17.96747967479675, "grad_norm": 0.4539012908935547, "learning_rate": 7.986210476968167e-05, "loss": 0.0134, "step": 6630 }, { "epoch": 17.994579945799458, "grad_norm": 0.3891666531562805, "learning_rate": 7.97957547945599e-05, "loss": 0.0122, "step": 6640 }, { "epoch": 18.021680216802167, "grad_norm": 0.22853240370750427, "learning_rate": 7.972932335898226e-05, "loss": 0.0136, "step": 6650 }, { "epoch": 18.048780487804876, "grad_norm": 0.29574054479599, "learning_rate": 7.966281064456975e-05, "loss": 0.014, "step": 6660 }, { "epoch": 18.07588075880759, "grad_norm": 0.30253735184669495, "learning_rate": 7.959621683316563e-05, "loss": 0.0125, "step": 6670 }, { "epoch": 18.102981029810298, "grad_norm": 0.30880478024482727, "learning_rate": 7.952954210683481e-05, "loss": 0.0136, "step": 6680 }, { "epoch": 18.130081300813007, "grad_norm": 0.2935745418071747, "learning_rate": 7.946278664786345e-05, "loss": 0.0125, "step": 6690 }, { "epoch": 18.15718157181572, "grad_norm": 0.24606838822364807, "learning_rate": 7.939595063875842e-05, "loss": 0.0133, "step": 6700 }, { "epoch": 18.18428184281843, "grad_norm": 0.503983736038208, "learning_rate": 7.932903426224683e-05, "loss": 0.0112, "step": 6710 }, { "epoch": 18.211382113821138, "grad_norm": 0.4081834852695465, "learning_rate": 7.926203770127552e-05, "loss": 0.0138, "step": 6720 }, { "epoch": 18.238482384823847, "grad_norm": 0.3164560794830322, "learning_rate": 7.919496113901046e-05, "loss": 0.0139, "step": 6730 }, { "epoch": 18.26558265582656, "grad_norm": 0.24816696345806122, "learning_rate": 7.912780475883649e-05, "loss": 0.0149, "step": 6740 }, { "epoch": 18.29268292682927, "grad_norm": 0.30337637662887573, "learning_rate": 7.906056874435652e-05, "loss": 0.0134, "step": 6750 }, { "epoch": 18.319783197831978, "grad_norm": 0.29605939984321594, "learning_rate": 7.899325327939131e-05, "loss": 0.0123, "step": 6760 }, { "epoch": 18.346883468834687, "grad_norm": 0.3168525993824005, "learning_rate": 7.892585854797872e-05, "loss": 0.0119, "step": 6770 }, { "epoch": 18.3739837398374, "grad_norm": 0.40726765990257263, "learning_rate": 7.88583847343734e-05, "loss": 0.0121, "step": 6780 }, { "epoch": 18.40108401084011, "grad_norm": 0.19942456483840942, "learning_rate": 7.879083202304616e-05, "loss": 0.0118, "step": 6790 }, { "epoch": 18.428184281842817, "grad_norm": 0.2977119982242584, "learning_rate": 7.872320059868355e-05, "loss": 0.0155, "step": 6800 }, { "epoch": 18.45528455284553, "grad_norm": 0.21141929924488068, "learning_rate": 7.865549064618729e-05, "loss": 0.0124, "step": 6810 }, { "epoch": 18.48238482384824, "grad_norm": 0.5234606862068176, "learning_rate": 7.858770235067381e-05, "loss": 0.0108, "step": 6820 }, { "epoch": 18.509485094850948, "grad_norm": 0.4079539179801941, "learning_rate": 7.851983589747374e-05, "loss": 0.0134, "step": 6830 }, { "epoch": 18.536585365853657, "grad_norm": 0.31129997968673706, "learning_rate": 7.845189147213133e-05, "loss": 0.0105, "step": 6840 }, { "epoch": 18.56368563685637, "grad_norm": 0.2529675364494324, "learning_rate": 7.838386926040407e-05, "loss": 0.0115, "step": 6850 }, { "epoch": 18.59078590785908, "grad_norm": 0.21681976318359375, "learning_rate": 7.83157694482621e-05, "loss": 0.0126, "step": 6860 }, { "epoch": 18.617886178861788, "grad_norm": 0.33184707164764404, "learning_rate": 7.824759222188768e-05, "loss": 0.0148, "step": 6870 }, { "epoch": 18.644986449864497, "grad_norm": 0.22313524782657623, "learning_rate": 7.817933776767478e-05, "loss": 0.0118, "step": 6880 }, { "epoch": 18.67208672086721, "grad_norm": 0.2769754230976105, "learning_rate": 7.811100627222842e-05, "loss": 0.0117, "step": 6890 }, { "epoch": 18.69918699186992, "grad_norm": 0.19215968251228333, "learning_rate": 7.804259792236435e-05, "loss": 0.0109, "step": 6900 }, { "epoch": 18.726287262872628, "grad_norm": 0.2409733086824417, "learning_rate": 7.797411290510835e-05, "loss": 0.011, "step": 6910 }, { "epoch": 18.75338753387534, "grad_norm": 0.18799059092998505, "learning_rate": 7.790555140769586e-05, "loss": 0.0115, "step": 6920 }, { "epoch": 18.78048780487805, "grad_norm": 0.22320333123207092, "learning_rate": 7.78369136175714e-05, "loss": 0.0123, "step": 6930 }, { "epoch": 18.80758807588076, "grad_norm": 0.25411784648895264, "learning_rate": 7.776819972238806e-05, "loss": 0.0108, "step": 6940 }, { "epoch": 18.834688346883468, "grad_norm": 0.19237157702445984, "learning_rate": 7.7699409910007e-05, "loss": 0.0144, "step": 6950 }, { "epoch": 18.86178861788618, "grad_norm": 0.27256980538368225, "learning_rate": 7.763054436849694e-05, "loss": 0.0175, "step": 6960 }, { "epoch": 18.88888888888889, "grad_norm": 0.2838786840438843, "learning_rate": 7.756160328613364e-05, "loss": 0.0126, "step": 6970 }, { "epoch": 18.9159891598916, "grad_norm": 0.2777400314807892, "learning_rate": 7.749258685139942e-05, "loss": 0.012, "step": 6980 }, { "epoch": 18.943089430894307, "grad_norm": 0.20568206906318665, "learning_rate": 7.742349525298253e-05, "loss": 0.0109, "step": 6990 }, { "epoch": 18.97018970189702, "grad_norm": 0.37436798214912415, "learning_rate": 7.735432867977679e-05, "loss": 0.012, "step": 7000 }, { "epoch": 18.99728997289973, "grad_norm": 0.28983551263809204, "learning_rate": 7.728508732088096e-05, "loss": 0.0114, "step": 7010 }, { "epoch": 19.024390243902438, "grad_norm": 0.18332475423812866, "learning_rate": 7.721577136559825e-05, "loss": 0.0115, "step": 7020 }, { "epoch": 19.05149051490515, "grad_norm": 0.28062835335731506, "learning_rate": 7.714638100343588e-05, "loss": 0.0119, "step": 7030 }, { "epoch": 19.07859078590786, "grad_norm": 0.29514268040657043, "learning_rate": 7.707691642410444e-05, "loss": 0.0125, "step": 7040 }, { "epoch": 19.10569105691057, "grad_norm": 0.24552655220031738, "learning_rate": 7.70073778175174e-05, "loss": 0.0127, "step": 7050 }, { "epoch": 19.132791327913278, "grad_norm": 0.29238954186439514, "learning_rate": 7.69377653737907e-05, "loss": 0.0137, "step": 7060 }, { "epoch": 19.15989159891599, "grad_norm": 0.4231591522693634, "learning_rate": 7.686807928324209e-05, "loss": 0.018, "step": 7070 }, { "epoch": 19.1869918699187, "grad_norm": 0.36399078369140625, "learning_rate": 7.679831973639065e-05, "loss": 0.0153, "step": 7080 }, { "epoch": 19.21409214092141, "grad_norm": 0.2743346691131592, "learning_rate": 7.672848692395637e-05, "loss": 0.0163, "step": 7090 }, { "epoch": 19.241192411924118, "grad_norm": 0.28001290559768677, "learning_rate": 7.665858103685944e-05, "loss": 0.0168, "step": 7100 }, { "epoch": 19.26829268292683, "grad_norm": 0.23035390675067902, "learning_rate": 7.658860226621991e-05, "loss": 0.0138, "step": 7110 }, { "epoch": 19.29539295392954, "grad_norm": 0.2746807336807251, "learning_rate": 7.651855080335708e-05, "loss": 0.0142, "step": 7120 }, { "epoch": 19.32249322493225, "grad_norm": 0.43805965781211853, "learning_rate": 7.644842683978896e-05, "loss": 0.0146, "step": 7130 }, { "epoch": 19.34959349593496, "grad_norm": 0.3070857524871826, "learning_rate": 7.63782305672318e-05, "loss": 0.0124, "step": 7140 }, { "epoch": 19.37669376693767, "grad_norm": 0.226077601313591, "learning_rate": 7.63079621775995e-05, "loss": 0.0104, "step": 7150 }, { "epoch": 19.40379403794038, "grad_norm": 0.3512376546859741, "learning_rate": 7.623762186300319e-05, "loss": 0.0116, "step": 7160 }, { "epoch": 19.43089430894309, "grad_norm": 0.2383955717086792, "learning_rate": 7.616720981575057e-05, "loss": 0.0132, "step": 7170 }, { "epoch": 19.4579945799458, "grad_norm": 0.3899403214454651, "learning_rate": 7.609672622834552e-05, "loss": 0.0118, "step": 7180 }, { "epoch": 19.48509485094851, "grad_norm": 0.3335207402706146, "learning_rate": 7.602617129348747e-05, "loss": 0.0096, "step": 7190 }, { "epoch": 19.51219512195122, "grad_norm": 0.219376802444458, "learning_rate": 7.595554520407088e-05, "loss": 0.011, "step": 7200 }, { "epoch": 19.539295392953928, "grad_norm": 0.3635261356830597, "learning_rate": 7.588484815318484e-05, "loss": 0.0127, "step": 7210 }, { "epoch": 19.56639566395664, "grad_norm": 0.20308203995227814, "learning_rate": 7.581408033411234e-05, "loss": 0.0118, "step": 7220 }, { "epoch": 19.59349593495935, "grad_norm": 0.2728954553604126, "learning_rate": 7.574324194032995e-05, "loss": 0.0093, "step": 7230 }, { "epoch": 19.62059620596206, "grad_norm": 0.2343025505542755, "learning_rate": 7.567233316550705e-05, "loss": 0.0141, "step": 7240 }, { "epoch": 19.647696476964768, "grad_norm": 0.30419185757637024, "learning_rate": 7.560135420350562e-05, "loss": 0.013, "step": 7250 }, { "epoch": 19.67479674796748, "grad_norm": 0.2150462418794632, "learning_rate": 7.553030524837935e-05, "loss": 0.0114, "step": 7260 }, { "epoch": 19.70189701897019, "grad_norm": 0.22288647294044495, "learning_rate": 7.545918649437341e-05, "loss": 0.0131, "step": 7270 }, { "epoch": 19.7289972899729, "grad_norm": 0.29139092564582825, "learning_rate": 7.538799813592377e-05, "loss": 0.0125, "step": 7280 }, { "epoch": 19.75609756097561, "grad_norm": 0.17364515364170074, "learning_rate": 7.531674036765662e-05, "loss": 0.0113, "step": 7290 }, { "epoch": 19.78319783197832, "grad_norm": 0.18761621415615082, "learning_rate": 7.524541338438807e-05, "loss": 0.0152, "step": 7300 }, { "epoch": 19.81029810298103, "grad_norm": 0.2571414113044739, "learning_rate": 7.517401738112328e-05, "loss": 0.0123, "step": 7310 }, { "epoch": 19.83739837398374, "grad_norm": 0.269074946641922, "learning_rate": 7.510255255305628e-05, "loss": 0.0115, "step": 7320 }, { "epoch": 19.86449864498645, "grad_norm": 0.28819751739501953, "learning_rate": 7.503101909556911e-05, "loss": 0.0105, "step": 7330 }, { "epoch": 19.89159891598916, "grad_norm": 0.16180764138698578, "learning_rate": 7.495941720423154e-05, "loss": 0.011, "step": 7340 }, { "epoch": 19.91869918699187, "grad_norm": 0.2613115608692169, "learning_rate": 7.488774707480042e-05, "loss": 0.0132, "step": 7350 }, { "epoch": 19.94579945799458, "grad_norm": 0.35061144828796387, "learning_rate": 7.481600890321911e-05, "loss": 0.0144, "step": 7360 }, { "epoch": 19.97289972899729, "grad_norm": 0.24568265676498413, "learning_rate": 7.474420288561708e-05, "loss": 0.0114, "step": 7370 }, { "epoch": 20.0, "grad_norm": 0.38036760687828064, "learning_rate": 7.467232921830921e-05, "loss": 0.0116, "step": 7380 }, { "epoch": 20.02710027100271, "grad_norm": 0.285768985748291, "learning_rate": 7.460038809779537e-05, "loss": 0.0153, "step": 7390 }, { "epoch": 20.05420054200542, "grad_norm": 0.35664132237434387, "learning_rate": 7.452837972075983e-05, "loss": 0.0121, "step": 7400 }, { "epoch": 20.08130081300813, "grad_norm": 0.2919205129146576, "learning_rate": 7.445630428407074e-05, "loss": 0.0122, "step": 7410 }, { "epoch": 20.10840108401084, "grad_norm": 0.3644979000091553, "learning_rate": 7.43841619847796e-05, "loss": 0.0135, "step": 7420 }, { "epoch": 20.13550135501355, "grad_norm": 0.3541443645954132, "learning_rate": 7.431195302012072e-05, "loss": 0.0104, "step": 7430 }, { "epoch": 20.16260162601626, "grad_norm": 0.23304247856140137, "learning_rate": 7.423967758751061e-05, "loss": 0.0097, "step": 7440 }, { "epoch": 20.18970189701897, "grad_norm": 0.20723876357078552, "learning_rate": 7.416733588454758e-05, "loss": 0.0127, "step": 7450 }, { "epoch": 20.21680216802168, "grad_norm": 0.18784432113170624, "learning_rate": 7.409492810901106e-05, "loss": 0.0139, "step": 7460 }, { "epoch": 20.24390243902439, "grad_norm": 0.28555965423583984, "learning_rate": 7.402245445886116e-05, "loss": 0.0097, "step": 7470 }, { "epoch": 20.2710027100271, "grad_norm": 0.19868451356887817, "learning_rate": 7.394991513223806e-05, "loss": 0.0104, "step": 7480 }, { "epoch": 20.29810298102981, "grad_norm": 0.18871113657951355, "learning_rate": 7.38773103274615e-05, "loss": 0.0108, "step": 7490 }, { "epoch": 20.32520325203252, "grad_norm": 0.21186703443527222, "learning_rate": 7.380464024303028e-05, "loss": 0.0104, "step": 7500 }, { "epoch": 20.352303523035232, "grad_norm": 0.3026733100414276, "learning_rate": 7.373190507762162e-05, "loss": 0.0119, "step": 7510 }, { "epoch": 20.37940379403794, "grad_norm": 0.2740308344364166, "learning_rate": 7.365910503009066e-05, "loss": 0.0095, "step": 7520 }, { "epoch": 20.40650406504065, "grad_norm": 0.2183270901441574, "learning_rate": 7.358624029946996e-05, "loss": 0.0098, "step": 7530 }, { "epoch": 20.43360433604336, "grad_norm": 0.24252358078956604, "learning_rate": 7.351331108496893e-05, "loss": 0.0172, "step": 7540 }, { "epoch": 20.460704607046072, "grad_norm": 0.2679365277290344, "learning_rate": 7.344031758597325e-05, "loss": 0.0122, "step": 7550 }, { "epoch": 20.48780487804878, "grad_norm": 0.19067543745040894, "learning_rate": 7.336726000204435e-05, "loss": 0.0112, "step": 7560 }, { "epoch": 20.51490514905149, "grad_norm": 0.21411678194999695, "learning_rate": 7.32941385329189e-05, "loss": 0.0123, "step": 7570 }, { "epoch": 20.5420054200542, "grad_norm": 0.33751896023750305, "learning_rate": 7.322095337850816e-05, "loss": 0.0113, "step": 7580 }, { "epoch": 20.56910569105691, "grad_norm": 0.25086501240730286, "learning_rate": 7.314770473889758e-05, "loss": 0.0097, "step": 7590 }, { "epoch": 20.59620596205962, "grad_norm": 0.29047250747680664, "learning_rate": 7.307439281434615e-05, "loss": 0.0124, "step": 7600 }, { "epoch": 20.62330623306233, "grad_norm": 0.2418135106563568, "learning_rate": 7.300101780528585e-05, "loss": 0.0097, "step": 7610 }, { "epoch": 20.65040650406504, "grad_norm": 0.26836565136909485, "learning_rate": 7.292757991232117e-05, "loss": 0.0121, "step": 7620 }, { "epoch": 20.67750677506775, "grad_norm": 0.2631871700286865, "learning_rate": 7.285407933622848e-05, "loss": 0.0097, "step": 7630 }, { "epoch": 20.70460704607046, "grad_norm": 0.2312232255935669, "learning_rate": 7.278051627795557e-05, "loss": 0.012, "step": 7640 }, { "epoch": 20.73170731707317, "grad_norm": 0.21694892644882202, "learning_rate": 7.270689093862105e-05, "loss": 0.0116, "step": 7650 }, { "epoch": 20.758807588075882, "grad_norm": 0.2796224057674408, "learning_rate": 7.263320351951374e-05, "loss": 0.0132, "step": 7660 }, { "epoch": 20.78590785907859, "grad_norm": 0.2762765884399414, "learning_rate": 7.255945422209227e-05, "loss": 0.0104, "step": 7670 }, { "epoch": 20.8130081300813, "grad_norm": 0.19890008866786957, "learning_rate": 7.248564324798437e-05, "loss": 0.0105, "step": 7680 }, { "epoch": 20.84010840108401, "grad_norm": 0.1991325169801712, "learning_rate": 7.241177079898644e-05, "loss": 0.012, "step": 7690 }, { "epoch": 20.867208672086722, "grad_norm": 0.26839953660964966, "learning_rate": 7.233783707706295e-05, "loss": 0.0115, "step": 7700 }, { "epoch": 20.89430894308943, "grad_norm": 0.29301589727401733, "learning_rate": 7.226384228434586e-05, "loss": 0.0112, "step": 7710 }, { "epoch": 20.92140921409214, "grad_norm": 0.21608991920948029, "learning_rate": 7.21897866231341e-05, "loss": 0.0122, "step": 7720 }, { "epoch": 20.948509485094853, "grad_norm": 0.2854992151260376, "learning_rate": 7.211567029589303e-05, "loss": 0.0132, "step": 7730 }, { "epoch": 20.975609756097562, "grad_norm": 0.17529141902923584, "learning_rate": 7.204149350525387e-05, "loss": 0.0092, "step": 7740 }, { "epoch": 21.00271002710027, "grad_norm": 0.24121476709842682, "learning_rate": 7.196725645401309e-05, "loss": 0.0111, "step": 7750 }, { "epoch": 21.02981029810298, "grad_norm": 0.2377505898475647, "learning_rate": 7.1892959345132e-05, "loss": 0.0123, "step": 7760 }, { "epoch": 21.056910569105693, "grad_norm": 0.18046687543392181, "learning_rate": 7.181860238173605e-05, "loss": 0.0108, "step": 7770 }, { "epoch": 21.0840108401084, "grad_norm": 0.24501566588878632, "learning_rate": 7.174418576711432e-05, "loss": 0.0121, "step": 7780 }, { "epoch": 21.11111111111111, "grad_norm": 0.2011549174785614, "learning_rate": 7.1669709704719e-05, "loss": 0.0114, "step": 7790 }, { "epoch": 21.13821138211382, "grad_norm": 0.22397097945213318, "learning_rate": 7.159517439816481e-05, "loss": 0.0123, "step": 7800 }, { "epoch": 21.165311653116532, "grad_norm": 0.28474003076553345, "learning_rate": 7.152058005122842e-05, "loss": 0.0136, "step": 7810 }, { "epoch": 21.19241192411924, "grad_norm": 0.26511913537979126, "learning_rate": 7.144592686784793e-05, "loss": 0.011, "step": 7820 }, { "epoch": 21.21951219512195, "grad_norm": 0.22060564160346985, "learning_rate": 7.137121505212229e-05, "loss": 0.0145, "step": 7830 }, { "epoch": 21.24661246612466, "grad_norm": 0.30577537417411804, "learning_rate": 7.129644480831077e-05, "loss": 0.0133, "step": 7840 }, { "epoch": 21.273712737127372, "grad_norm": 0.329862505197525, "learning_rate": 7.122161634083234e-05, "loss": 0.012, "step": 7850 }, { "epoch": 21.30081300813008, "grad_norm": 0.2549801766872406, "learning_rate": 7.114672985426516e-05, "loss": 0.01, "step": 7860 }, { "epoch": 21.32791327913279, "grad_norm": 0.247748002409935, "learning_rate": 7.107178555334606e-05, "loss": 0.0139, "step": 7870 }, { "epoch": 21.355013550135503, "grad_norm": 0.30858439207077026, "learning_rate": 7.099678364296989e-05, "loss": 0.0123, "step": 7880 }, { "epoch": 21.382113821138212, "grad_norm": 0.2301647663116455, "learning_rate": 7.0921724328189e-05, "loss": 0.0098, "step": 7890 }, { "epoch": 21.40921409214092, "grad_norm": 0.3250274658203125, "learning_rate": 7.084660781421268e-05, "loss": 0.0124, "step": 7900 }, { "epoch": 21.43631436314363, "grad_norm": 0.2447092980146408, "learning_rate": 7.077143430640662e-05, "loss": 0.012, "step": 7910 }, { "epoch": 21.463414634146343, "grad_norm": 0.25585901737213135, "learning_rate": 7.069620401029232e-05, "loss": 0.0122, "step": 7920 }, { "epoch": 21.490514905149052, "grad_norm": 0.2708868086338043, "learning_rate": 7.062091713154655e-05, "loss": 0.0103, "step": 7930 }, { "epoch": 21.51761517615176, "grad_norm": 0.32642513513565063, "learning_rate": 7.054557387600075e-05, "loss": 0.0108, "step": 7940 }, { "epoch": 21.54471544715447, "grad_norm": 0.2739064395427704, "learning_rate": 7.04701744496405e-05, "loss": 0.0123, "step": 7950 }, { "epoch": 21.571815718157183, "grad_norm": 0.3300195336341858, "learning_rate": 7.039471905860495e-05, "loss": 0.011, "step": 7960 }, { "epoch": 21.59891598915989, "grad_norm": 0.19702139496803284, "learning_rate": 7.031920790918628e-05, "loss": 0.0119, "step": 7970 }, { "epoch": 21.6260162601626, "grad_norm": 0.19812731444835663, "learning_rate": 7.024364120782906e-05, "loss": 0.0124, "step": 7980 }, { "epoch": 21.653116531165313, "grad_norm": 0.19966275990009308, "learning_rate": 7.016801916112978e-05, "loss": 0.0124, "step": 7990 }, { "epoch": 21.680216802168022, "grad_norm": 0.20520451664924622, "learning_rate": 7.009234197583623e-05, "loss": 0.0144, "step": 8000 }, { "epoch": 21.70731707317073, "grad_norm": 0.20923396944999695, "learning_rate": 7.001660985884692e-05, "loss": 0.0108, "step": 8010 }, { "epoch": 21.73441734417344, "grad_norm": 0.19509470462799072, "learning_rate": 6.994082301721063e-05, "loss": 0.01, "step": 8020 }, { "epoch": 21.761517615176153, "grad_norm": 0.28363120555877686, "learning_rate": 6.986498165812563e-05, "loss": 0.0101, "step": 8030 }, { "epoch": 21.788617886178862, "grad_norm": 0.24193283915519714, "learning_rate": 6.978908598893932e-05, "loss": 0.0094, "step": 8040 }, { "epoch": 21.81571815718157, "grad_norm": 0.2378820776939392, "learning_rate": 6.971313621714756e-05, "loss": 0.0126, "step": 8050 }, { "epoch": 21.84281842818428, "grad_norm": 0.2070934772491455, "learning_rate": 6.96371325503941e-05, "loss": 0.0095, "step": 8060 }, { "epoch": 21.869918699186993, "grad_norm": 0.21266591548919678, "learning_rate": 6.956107519647014e-05, "loss": 0.0119, "step": 8070 }, { "epoch": 21.897018970189702, "grad_norm": 0.1898735910654068, "learning_rate": 6.94849643633135e-05, "loss": 0.0114, "step": 8080 }, { "epoch": 21.92411924119241, "grad_norm": 0.1878700852394104, "learning_rate": 6.940880025900834e-05, "loss": 0.0134, "step": 8090 }, { "epoch": 21.951219512195124, "grad_norm": 0.3573300242424011, "learning_rate": 6.933258309178438e-05, "loss": 0.0119, "step": 8100 }, { "epoch": 21.978319783197833, "grad_norm": 0.28731608390808105, "learning_rate": 6.925631307001646e-05, "loss": 0.0098, "step": 8110 }, { "epoch": 22.005420054200542, "grad_norm": 0.2779543399810791, "learning_rate": 6.91799904022239e-05, "loss": 0.0095, "step": 8120 }, { "epoch": 22.03252032520325, "grad_norm": 0.23148395121097565, "learning_rate": 6.910361529706997e-05, "loss": 0.0124, "step": 8130 }, { "epoch": 22.059620596205963, "grad_norm": 0.2788093686103821, "learning_rate": 6.902718796336131e-05, "loss": 0.0115, "step": 8140 }, { "epoch": 22.086720867208673, "grad_norm": 0.2624422609806061, "learning_rate": 6.895070861004729e-05, "loss": 0.0123, "step": 8150 }, { "epoch": 22.11382113821138, "grad_norm": 0.40877601504325867, "learning_rate": 6.887417744621956e-05, "loss": 0.0132, "step": 8160 }, { "epoch": 22.14092140921409, "grad_norm": 0.2030528485774994, "learning_rate": 6.87975946811114e-05, "loss": 0.0097, "step": 8170 }, { "epoch": 22.168021680216803, "grad_norm": 0.2767268419265747, "learning_rate": 6.872096052409718e-05, "loss": 0.0124, "step": 8180 }, { "epoch": 22.195121951219512, "grad_norm": 0.2516579329967499, "learning_rate": 6.864427518469174e-05, "loss": 0.0146, "step": 8190 }, { "epoch": 22.22222222222222, "grad_norm": 0.21494616568088531, "learning_rate": 6.856753887254986e-05, "loss": 0.0134, "step": 8200 }, { "epoch": 22.24932249322493, "grad_norm": 0.13836491107940674, "learning_rate": 6.849075179746572e-05, "loss": 0.0112, "step": 8210 }, { "epoch": 22.276422764227643, "grad_norm": 0.249604269862175, "learning_rate": 6.841391416937221e-05, "loss": 0.0132, "step": 8220 }, { "epoch": 22.303523035230352, "grad_norm": 0.21627816557884216, "learning_rate": 6.833702619834053e-05, "loss": 0.0112, "step": 8230 }, { "epoch": 22.33062330623306, "grad_norm": 0.2148495763540268, "learning_rate": 6.82600880945794e-05, "loss": 0.0119, "step": 8240 }, { "epoch": 22.357723577235774, "grad_norm": 0.17187964916229248, "learning_rate": 6.818310006843468e-05, "loss": 0.0095, "step": 8250 }, { "epoch": 22.384823848238483, "grad_norm": 0.22951912879943848, "learning_rate": 6.810606233038868e-05, "loss": 0.0098, "step": 8260 }, { "epoch": 22.411924119241192, "grad_norm": 0.24293231964111328, "learning_rate": 6.802897509105966e-05, "loss": 0.0125, "step": 8270 }, { "epoch": 22.4390243902439, "grad_norm": 0.2736617922782898, "learning_rate": 6.79518385612012e-05, "loss": 0.0108, "step": 8280 }, { "epoch": 22.466124661246614, "grad_norm": 0.22746963798999786, "learning_rate": 6.787465295170157e-05, "loss": 0.011, "step": 8290 }, { "epoch": 22.493224932249323, "grad_norm": 0.22849337756633759, "learning_rate": 6.779741847358332e-05, "loss": 0.0128, "step": 8300 }, { "epoch": 22.520325203252032, "grad_norm": 0.20831696689128876, "learning_rate": 6.772013533800256e-05, "loss": 0.0103, "step": 8310 }, { "epoch": 22.54742547425474, "grad_norm": 0.21329282224178314, "learning_rate": 6.764280375624843e-05, "loss": 0.0138, "step": 8320 }, { "epoch": 22.574525745257453, "grad_norm": 0.23073112964630127, "learning_rate": 6.756542393974252e-05, "loss": 0.0139, "step": 8330 }, { "epoch": 22.601626016260163, "grad_norm": 0.27821218967437744, "learning_rate": 6.748799610003828e-05, "loss": 0.0139, "step": 8340 }, { "epoch": 22.62872628726287, "grad_norm": 0.21537446975708008, "learning_rate": 6.741052044882048e-05, "loss": 0.0119, "step": 8350 }, { "epoch": 22.655826558265584, "grad_norm": 0.32870370149612427, "learning_rate": 6.73329971979046e-05, "loss": 0.01, "step": 8360 }, { "epoch": 22.682926829268293, "grad_norm": 0.21009846031665802, "learning_rate": 6.725542655923625e-05, "loss": 0.0118, "step": 8370 }, { "epoch": 22.710027100271002, "grad_norm": 0.20062941312789917, "learning_rate": 6.717780874489057e-05, "loss": 0.0096, "step": 8380 }, { "epoch": 22.73712737127371, "grad_norm": 0.21154946088790894, "learning_rate": 6.710014396707172e-05, "loss": 0.0127, "step": 8390 }, { "epoch": 22.764227642276424, "grad_norm": 0.20511792600154877, "learning_rate": 6.702243243811221e-05, "loss": 0.0109, "step": 8400 }, { "epoch": 22.791327913279133, "grad_norm": 0.3249759078025818, "learning_rate": 6.694467437047244e-05, "loss": 0.013, "step": 8410 }, { "epoch": 22.818428184281842, "grad_norm": 0.25875338912010193, "learning_rate": 6.686686997673997e-05, "loss": 0.0112, "step": 8420 }, { "epoch": 22.84552845528455, "grad_norm": 0.302944540977478, "learning_rate": 6.678901946962903e-05, "loss": 0.011, "step": 8430 }, { "epoch": 22.872628726287264, "grad_norm": 0.17626437544822693, "learning_rate": 6.671112306197996e-05, "loss": 0.0113, "step": 8440 }, { "epoch": 22.899728997289973, "grad_norm": 0.3278419077396393, "learning_rate": 6.663318096675854e-05, "loss": 0.0155, "step": 8450 }, { "epoch": 22.926829268292682, "grad_norm": 0.19566477835178375, "learning_rate": 6.655519339705552e-05, "loss": 0.0121, "step": 8460 }, { "epoch": 22.953929539295395, "grad_norm": 0.252599835395813, "learning_rate": 6.647716056608588e-05, "loss": 0.0123, "step": 8470 }, { "epoch": 22.981029810298104, "grad_norm": 0.3959272503852844, "learning_rate": 6.639908268718843e-05, "loss": 0.0129, "step": 8480 }, { "epoch": 23.008130081300813, "grad_norm": 0.30370986461639404, "learning_rate": 6.632095997382514e-05, "loss": 0.0128, "step": 8490 }, { "epoch": 23.035230352303522, "grad_norm": 0.23918046057224274, "learning_rate": 6.624279263958047e-05, "loss": 0.012, "step": 8500 }, { "epoch": 23.062330623306234, "grad_norm": 0.3235885798931122, "learning_rate": 6.616458089816097e-05, "loss": 0.0127, "step": 8510 }, { "epoch": 23.089430894308943, "grad_norm": 0.18199728429317474, "learning_rate": 6.608632496339454e-05, "loss": 0.0126, "step": 8520 }, { "epoch": 23.116531165311653, "grad_norm": 0.20208559930324554, "learning_rate": 6.600802504922988e-05, "loss": 0.0085, "step": 8530 }, { "epoch": 23.14363143631436, "grad_norm": 0.26532524824142456, "learning_rate": 6.592968136973604e-05, "loss": 0.0142, "step": 8540 }, { "epoch": 23.170731707317074, "grad_norm": 0.17399199306964874, "learning_rate": 6.585129413910159e-05, "loss": 0.0134, "step": 8550 }, { "epoch": 23.197831978319783, "grad_norm": 0.23326626420021057, "learning_rate": 6.577286357163424e-05, "loss": 0.0136, "step": 8560 }, { "epoch": 23.224932249322492, "grad_norm": 0.43324798345565796, "learning_rate": 6.569438988176018e-05, "loss": 0.0099, "step": 8570 }, { "epoch": 23.252032520325205, "grad_norm": 0.3293931484222412, "learning_rate": 6.561587328402347e-05, "loss": 0.0126, "step": 8580 }, { "epoch": 23.279132791327914, "grad_norm": 0.2818412482738495, "learning_rate": 6.553731399308549e-05, "loss": 0.0105, "step": 8590 }, { "epoch": 23.306233062330623, "grad_norm": 0.29026150703430176, "learning_rate": 6.545871222372436e-05, "loss": 0.0111, "step": 8600 }, { "epoch": 23.333333333333332, "grad_norm": 0.189578577876091, "learning_rate": 6.538006819083426e-05, "loss": 0.0116, "step": 8610 }, { "epoch": 23.360433604336045, "grad_norm": 0.24786244332790375, "learning_rate": 6.530138210942505e-05, "loss": 0.0108, "step": 8620 }, { "epoch": 23.387533875338754, "grad_norm": 0.24465318024158478, "learning_rate": 6.522265419462141e-05, "loss": 0.012, "step": 8630 }, { "epoch": 23.414634146341463, "grad_norm": 0.14947378635406494, "learning_rate": 6.514388466166248e-05, "loss": 0.0118, "step": 8640 }, { "epoch": 23.441734417344172, "grad_norm": 0.2520354986190796, "learning_rate": 6.506507372590119e-05, "loss": 0.0118, "step": 8650 }, { "epoch": 23.468834688346885, "grad_norm": 0.2702355682849884, "learning_rate": 6.498622160280355e-05, "loss": 0.0102, "step": 8660 }, { "epoch": 23.495934959349594, "grad_norm": 0.29349973797798157, "learning_rate": 6.490732850794832e-05, "loss": 0.0109, "step": 8670 }, { "epoch": 23.523035230352303, "grad_norm": 0.23947285115718842, "learning_rate": 6.482839465702616e-05, "loss": 0.0111, "step": 8680 }, { "epoch": 23.550135501355015, "grad_norm": 0.22570790350437164, "learning_rate": 6.474942026583923e-05, "loss": 0.0106, "step": 8690 }, { "epoch": 23.577235772357724, "grad_norm": 0.2933264970779419, "learning_rate": 6.467040555030052e-05, "loss": 0.012, "step": 8700 }, { "epoch": 23.604336043360433, "grad_norm": 0.23203973472118378, "learning_rate": 6.459135072643321e-05, "loss": 0.0096, "step": 8710 }, { "epoch": 23.631436314363143, "grad_norm": 0.3269520401954651, "learning_rate": 6.451225601037019e-05, "loss": 0.0117, "step": 8720 }, { "epoch": 23.658536585365855, "grad_norm": 0.32235240936279297, "learning_rate": 6.443312161835338e-05, "loss": 0.0119, "step": 8730 }, { "epoch": 23.685636856368564, "grad_norm": 0.23889046907424927, "learning_rate": 6.43539477667332e-05, "loss": 0.0105, "step": 8740 }, { "epoch": 23.712737127371273, "grad_norm": 0.36458832025527954, "learning_rate": 6.427473467196793e-05, "loss": 0.0122, "step": 8750 }, { "epoch": 23.739837398373982, "grad_norm": 0.23192261159420013, "learning_rate": 6.419548255062315e-05, "loss": 0.014, "step": 8760 }, { "epoch": 23.766937669376695, "grad_norm": 0.17228098213672638, "learning_rate": 6.411619161937112e-05, "loss": 0.0101, "step": 8770 }, { "epoch": 23.794037940379404, "grad_norm": 0.33537566661834717, "learning_rate": 6.403686209499022e-05, "loss": 0.0132, "step": 8780 }, { "epoch": 23.821138211382113, "grad_norm": 0.24017778038978577, "learning_rate": 6.395749419436437e-05, "loss": 0.0126, "step": 8790 }, { "epoch": 23.848238482384822, "grad_norm": 0.1928717941045761, "learning_rate": 6.387808813448234e-05, "loss": 0.0123, "step": 8800 }, { "epoch": 23.875338753387535, "grad_norm": 0.30530938506126404, "learning_rate": 6.37986441324373e-05, "loss": 0.014, "step": 8810 }, { "epoch": 23.902439024390244, "grad_norm": 0.25739938020706177, "learning_rate": 6.37191624054261e-05, "loss": 0.0094, "step": 8820 }, { "epoch": 23.929539295392953, "grad_norm": 0.3330337703227997, "learning_rate": 6.363964317074872e-05, "loss": 0.0112, "step": 8830 }, { "epoch": 23.956639566395665, "grad_norm": 0.16190795600414276, "learning_rate": 6.356008664580776e-05, "loss": 0.0112, "step": 8840 }, { "epoch": 23.983739837398375, "grad_norm": 0.19831615686416626, "learning_rate": 6.348049304810771e-05, "loss": 0.0126, "step": 8850 }, { "epoch": 24.010840108401084, "grad_norm": 0.30547383427619934, "learning_rate": 6.340086259525442e-05, "loss": 0.0094, "step": 8860 }, { "epoch": 24.037940379403793, "grad_norm": 0.2850518822669983, "learning_rate": 6.332119550495448e-05, "loss": 0.0112, "step": 8870 }, { "epoch": 24.065040650406505, "grad_norm": 0.20962141454219818, "learning_rate": 6.324149199501473e-05, "loss": 0.0111, "step": 8880 }, { "epoch": 24.092140921409214, "grad_norm": 0.18956692516803741, "learning_rate": 6.316175228334146e-05, "loss": 0.0123, "step": 8890 }, { "epoch": 24.119241192411923, "grad_norm": 0.29513657093048096, "learning_rate": 6.308197658794003e-05, "loss": 0.0113, "step": 8900 }, { "epoch": 24.146341463414632, "grad_norm": 0.24792727828025818, "learning_rate": 6.300216512691417e-05, "loss": 0.0112, "step": 8910 }, { "epoch": 24.173441734417345, "grad_norm": 0.21345826983451843, "learning_rate": 6.292231811846532e-05, "loss": 0.0104, "step": 8920 }, { "epoch": 24.200542005420054, "grad_norm": 0.2763778269290924, "learning_rate": 6.284243578089217e-05, "loss": 0.0136, "step": 8930 }, { "epoch": 24.227642276422763, "grad_norm": 0.22362560033798218, "learning_rate": 6.276251833258999e-05, "loss": 0.0107, "step": 8940 }, { "epoch": 24.254742547425476, "grad_norm": 0.28958410024642944, "learning_rate": 6.268256599205003e-05, "loss": 0.012, "step": 8950 }, { "epoch": 24.281842818428185, "grad_norm": 0.21744602918624878, "learning_rate": 6.260257897785892e-05, "loss": 0.0104, "step": 8960 }, { "epoch": 24.308943089430894, "grad_norm": 0.32774683833122253, "learning_rate": 6.252255750869811e-05, "loss": 0.0101, "step": 8970 }, { "epoch": 24.336043360433603, "grad_norm": 0.21691296994686127, "learning_rate": 6.244250180334325e-05, "loss": 0.0097, "step": 8980 }, { "epoch": 24.363143631436316, "grad_norm": 0.1868792325258255, "learning_rate": 6.236241208066356e-05, "loss": 0.0104, "step": 8990 }, { "epoch": 24.390243902439025, "grad_norm": 0.18055908381938934, "learning_rate": 6.228228855962133e-05, "loss": 0.0095, "step": 9000 }, { "epoch": 24.417344173441734, "grad_norm": 0.2408614605665207, "learning_rate": 6.220213145927115e-05, "loss": 0.0095, "step": 9010 }, { "epoch": 24.444444444444443, "grad_norm": 0.25389906764030457, "learning_rate": 6.212194099875951e-05, "loss": 0.0103, "step": 9020 }, { "epoch": 24.471544715447155, "grad_norm": 0.5490466356277466, "learning_rate": 6.204171739732405e-05, "loss": 0.01, "step": 9030 }, { "epoch": 24.498644986449865, "grad_norm": 0.2801071107387543, "learning_rate": 6.196146087429303e-05, "loss": 0.0109, "step": 9040 }, { "epoch": 24.525745257452574, "grad_norm": 0.24728502333164215, "learning_rate": 6.188117164908474e-05, "loss": 0.0109, "step": 9050 }, { "epoch": 24.552845528455286, "grad_norm": 0.19893233478069305, "learning_rate": 6.180084994120684e-05, "loss": 0.0098, "step": 9060 }, { "epoch": 24.579945799457995, "grad_norm": 0.21125246584415436, "learning_rate": 6.17204959702558e-05, "loss": 0.0112, "step": 9070 }, { "epoch": 24.607046070460704, "grad_norm": 0.3140222430229187, "learning_rate": 6.164010995591635e-05, "loss": 0.0097, "step": 9080 }, { "epoch": 24.634146341463413, "grad_norm": 0.16721956431865692, "learning_rate": 6.155969211796076e-05, "loss": 0.0119, "step": 9090 }, { "epoch": 24.661246612466126, "grad_norm": 0.2043803632259369, "learning_rate": 6.147924267624829e-05, "loss": 0.0095, "step": 9100 }, { "epoch": 24.688346883468835, "grad_norm": 0.21668685972690582, "learning_rate": 6.13987618507247e-05, "loss": 0.009, "step": 9110 }, { "epoch": 24.715447154471544, "grad_norm": 0.3114379346370697, "learning_rate": 6.131824986142147e-05, "loss": 0.0118, "step": 9120 }, { "epoch": 24.742547425474253, "grad_norm": 0.3436083197593689, "learning_rate": 6.123770692845529e-05, "loss": 0.0102, "step": 9130 }, { "epoch": 24.769647696476966, "grad_norm": 0.19676075875759125, "learning_rate": 6.11571332720275e-05, "loss": 0.0128, "step": 9140 }, { "epoch": 24.796747967479675, "grad_norm": 0.49857836961746216, "learning_rate": 6.107652911242336e-05, "loss": 0.0119, "step": 9150 }, { "epoch": 24.823848238482384, "grad_norm": 0.295742392539978, "learning_rate": 6.0995894670011586e-05, "loss": 0.0082, "step": 9160 }, { "epoch": 24.850948509485093, "grad_norm": 0.2950915992259979, "learning_rate": 6.091523016524368e-05, "loss": 0.0125, "step": 9170 }, { "epoch": 24.878048780487806, "grad_norm": 0.2882350981235504, "learning_rate": 6.083453581865328e-05, "loss": 0.0094, "step": 9180 }, { "epoch": 24.905149051490515, "grad_norm": 0.14469066262245178, "learning_rate": 6.075381185085568e-05, "loss": 0.0085, "step": 9190 }, { "epoch": 24.932249322493224, "grad_norm": 0.31285911798477173, "learning_rate": 6.067305848254709e-05, "loss": 0.0098, "step": 9200 }, { "epoch": 24.959349593495936, "grad_norm": 0.23078671097755432, "learning_rate": 6.059227593450418e-05, "loss": 0.0081, "step": 9210 }, { "epoch": 24.986449864498645, "grad_norm": 0.2411949336528778, "learning_rate": 6.051146442758333e-05, "loss": 0.0103, "step": 9220 }, { "epoch": 25.013550135501355, "grad_norm": 0.17821933329105377, "learning_rate": 6.043062418272012e-05, "loss": 0.0103, "step": 9230 }, { "epoch": 25.040650406504064, "grad_norm": 0.18621286749839783, "learning_rate": 6.0349755420928666e-05, "loss": 0.0121, "step": 9240 }, { "epoch": 25.067750677506776, "grad_norm": 0.24895775318145752, "learning_rate": 6.0268858363301105e-05, "loss": 0.0109, "step": 9250 }, { "epoch": 25.094850948509485, "grad_norm": 0.21642254292964935, "learning_rate": 6.018793323100689e-05, "loss": 0.01, "step": 9260 }, { "epoch": 25.121951219512194, "grad_norm": 0.1981739103794098, "learning_rate": 6.0106980245292255e-05, "loss": 0.0089, "step": 9270 }, { "epoch": 25.149051490514903, "grad_norm": 0.28569433093070984, "learning_rate": 6.002599962747957e-05, "loss": 0.0126, "step": 9280 }, { "epoch": 25.176151761517616, "grad_norm": 0.35556259751319885, "learning_rate": 5.994499159896673e-05, "loss": 0.0109, "step": 9290 }, { "epoch": 25.203252032520325, "grad_norm": 0.34785205125808716, "learning_rate": 5.9863956381226607e-05, "loss": 0.0097, "step": 9300 }, { "epoch": 25.230352303523034, "grad_norm": 0.27562469244003296, "learning_rate": 5.9782894195806394e-05, "loss": 0.0128, "step": 9310 }, { "epoch": 25.257452574525747, "grad_norm": 0.19445456564426422, "learning_rate": 5.9701805264327004e-05, "loss": 0.0102, "step": 9320 }, { "epoch": 25.284552845528456, "grad_norm": 0.22713123261928558, "learning_rate": 5.96206898084825e-05, "loss": 0.0102, "step": 9330 }, { "epoch": 25.311653116531165, "grad_norm": 0.28242313861846924, "learning_rate": 5.953954805003942e-05, "loss": 0.01, "step": 9340 }, { "epoch": 25.338753387533874, "grad_norm": 0.35152554512023926, "learning_rate": 5.945838021083623e-05, "loss": 0.0125, "step": 9350 }, { "epoch": 25.365853658536587, "grad_norm": 0.30694636702537537, "learning_rate": 5.9377186512782714e-05, "loss": 0.0147, "step": 9360 }, { "epoch": 25.392953929539296, "grad_norm": 0.27566540241241455, "learning_rate": 5.929596717785935e-05, "loss": 0.0095, "step": 9370 }, { "epoch": 25.420054200542005, "grad_norm": 0.18263423442840576, "learning_rate": 5.921472242811668e-05, "loss": 0.0082, "step": 9380 }, { "epoch": 25.447154471544714, "grad_norm": 0.33191996812820435, "learning_rate": 5.913345248567475e-05, "loss": 0.0098, "step": 9390 }, { "epoch": 25.474254742547426, "grad_norm": 0.18025335669517517, "learning_rate": 5.905215757272248e-05, "loss": 0.0093, "step": 9400 }, { "epoch": 25.501355013550135, "grad_norm": 0.23538079857826233, "learning_rate": 5.897083791151706e-05, "loss": 0.0092, "step": 9410 }, { "epoch": 25.528455284552845, "grad_norm": 0.21025508642196655, "learning_rate": 5.888949372438336e-05, "loss": 0.0097, "step": 9420 }, { "epoch": 25.555555555555557, "grad_norm": 0.31268182396888733, "learning_rate": 5.8808125233713255e-05, "loss": 0.0097, "step": 9430 }, { "epoch": 25.582655826558266, "grad_norm": 0.19327910244464874, "learning_rate": 5.872673266196509e-05, "loss": 0.0087, "step": 9440 }, { "epoch": 25.609756097560975, "grad_norm": 0.23275558650493622, "learning_rate": 5.864531623166305e-05, "loss": 0.0112, "step": 9450 }, { "epoch": 25.636856368563684, "grad_norm": 0.5367358922958374, "learning_rate": 5.856387616539656e-05, "loss": 0.0085, "step": 9460 }, { "epoch": 25.663956639566397, "grad_norm": 0.2615271508693695, "learning_rate": 5.848241268581967e-05, "loss": 0.0112, "step": 9470 }, { "epoch": 25.691056910569106, "grad_norm": 0.24285034835338593, "learning_rate": 5.840092601565037e-05, "loss": 0.01, "step": 9480 }, { "epoch": 25.718157181571815, "grad_norm": 0.3718586564064026, "learning_rate": 5.8319416377670144e-05, "loss": 0.0104, "step": 9490 }, { "epoch": 25.745257452574524, "grad_norm": 0.21943219006061554, "learning_rate": 5.82378839947232e-05, "loss": 0.01, "step": 9500 }, { "epoch": 25.772357723577237, "grad_norm": 0.2546646296977997, "learning_rate": 5.815632908971599e-05, "loss": 0.0095, "step": 9510 }, { "epoch": 25.799457994579946, "grad_norm": 0.3516315221786499, "learning_rate": 5.80747518856165e-05, "loss": 0.0126, "step": 9520 }, { "epoch": 25.826558265582655, "grad_norm": 0.16821983456611633, "learning_rate": 5.799315260545367e-05, "loss": 0.0078, "step": 9530 }, { "epoch": 25.853658536585368, "grad_norm": 0.16675671935081482, "learning_rate": 5.791153147231686e-05, "loss": 0.01, "step": 9540 }, { "epoch": 25.880758807588077, "grad_norm": 0.18590198457241058, "learning_rate": 5.782988870935509e-05, "loss": 0.0097, "step": 9550 }, { "epoch": 25.907859078590786, "grad_norm": 0.21138140559196472, "learning_rate": 5.774822453977657e-05, "loss": 0.0084, "step": 9560 }, { "epoch": 25.934959349593495, "grad_norm": 0.19732439517974854, "learning_rate": 5.7666539186848036e-05, "loss": 0.0107, "step": 9570 }, { "epoch": 25.962059620596207, "grad_norm": 0.23027576506137848, "learning_rate": 5.758483287389411e-05, "loss": 0.0103, "step": 9580 }, { "epoch": 25.989159891598916, "grad_norm": 0.2292662411928177, "learning_rate": 5.7503105824296735e-05, "loss": 0.0109, "step": 9590 }, { "epoch": 26.016260162601625, "grad_norm": 0.2171003371477127, "learning_rate": 5.742135826149453e-05, "loss": 0.0094, "step": 9600 }, { "epoch": 26.043360433604335, "grad_norm": 0.20604640245437622, "learning_rate": 5.7339590408982223e-05, "loss": 0.0093, "step": 9610 }, { "epoch": 26.070460704607047, "grad_norm": 0.21843145787715912, "learning_rate": 5.725780249031e-05, "loss": 0.0122, "step": 9620 }, { "epoch": 26.097560975609756, "grad_norm": 0.22876539826393127, "learning_rate": 5.717599472908292e-05, "loss": 0.011, "step": 9630 }, { "epoch": 26.124661246612465, "grad_norm": 0.18329569697380066, "learning_rate": 5.7094167348960237e-05, "loss": 0.0102, "step": 9640 }, { "epoch": 26.151761517615178, "grad_norm": 0.22197484970092773, "learning_rate": 5.7012320573654945e-05, "loss": 0.0086, "step": 9650 }, { "epoch": 26.178861788617887, "grad_norm": 0.2530835270881653, "learning_rate": 5.693045462693295e-05, "loss": 0.0109, "step": 9660 }, { "epoch": 26.205962059620596, "grad_norm": 0.494010329246521, "learning_rate": 5.684856973261266e-05, "loss": 0.0092, "step": 9670 }, { "epoch": 26.233062330623305, "grad_norm": 0.26427462697029114, "learning_rate": 5.6766666114564215e-05, "loss": 0.0095, "step": 9680 }, { "epoch": 26.260162601626018, "grad_norm": 0.2843179404735565, "learning_rate": 5.668474399670899e-05, "loss": 0.0093, "step": 9690 }, { "epoch": 26.287262872628727, "grad_norm": 0.24057792127132416, "learning_rate": 5.660280360301896e-05, "loss": 0.0091, "step": 9700 }, { "epoch": 26.314363143631436, "grad_norm": 0.22438950836658478, "learning_rate": 5.652084515751599e-05, "loss": 0.0087, "step": 9710 }, { "epoch": 26.341463414634145, "grad_norm": 0.22201256453990936, "learning_rate": 5.643886888427137e-05, "loss": 0.0087, "step": 9720 }, { "epoch": 26.368563685636857, "grad_norm": 0.19960157573223114, "learning_rate": 5.6356875007405074e-05, "loss": 0.0097, "step": 9730 }, { "epoch": 26.395663956639567, "grad_norm": 0.32149961590766907, "learning_rate": 5.627486375108525e-05, "loss": 0.0091, "step": 9740 }, { "epoch": 26.422764227642276, "grad_norm": 0.2582100033760071, "learning_rate": 5.619283533952754e-05, "loss": 0.0087, "step": 9750 }, { "epoch": 26.449864498644985, "grad_norm": 0.25565508008003235, "learning_rate": 5.6110789996994474e-05, "loss": 0.0099, "step": 9760 }, { "epoch": 26.476964769647697, "grad_norm": 0.20315691828727722, "learning_rate": 5.602872794779491e-05, "loss": 0.0109, "step": 9770 }, { "epoch": 26.504065040650406, "grad_norm": 0.20167042315006256, "learning_rate": 5.594664941628334e-05, "loss": 0.0114, "step": 9780 }, { "epoch": 26.531165311653115, "grad_norm": 0.21197772026062012, "learning_rate": 5.5864554626859324e-05, "loss": 0.0095, "step": 9790 }, { "epoch": 26.558265582655828, "grad_norm": 0.36763903498649597, "learning_rate": 5.578244380396691e-05, "loss": 0.0126, "step": 9800 }, { "epoch": 26.585365853658537, "grad_norm": 0.2250450700521469, "learning_rate": 5.570031717209394e-05, "loss": 0.008, "step": 9810 }, { "epoch": 26.612466124661246, "grad_norm": 0.2771260440349579, "learning_rate": 5.561817495577147e-05, "loss": 0.0098, "step": 9820 }, { "epoch": 26.639566395663955, "grad_norm": 0.38420456647872925, "learning_rate": 5.5536017379573215e-05, "loss": 0.0078, "step": 9830 }, { "epoch": 26.666666666666668, "grad_norm": 0.34184426069259644, "learning_rate": 5.545384466811483e-05, "loss": 0.0114, "step": 9840 }, { "epoch": 26.693766937669377, "grad_norm": 0.22177670896053314, "learning_rate": 5.5371657046053384e-05, "loss": 0.0097, "step": 9850 }, { "epoch": 26.720867208672086, "grad_norm": 0.36918172240257263, "learning_rate": 5.528945473808669e-05, "loss": 0.0095, "step": 9860 }, { "epoch": 26.747967479674795, "grad_norm": 0.21962010860443115, "learning_rate": 5.520723796895272e-05, "loss": 0.0124, "step": 9870 }, { "epoch": 26.775067750677508, "grad_norm": 0.34287169575691223, "learning_rate": 5.512500696342897e-05, "loss": 0.0099, "step": 9880 }, { "epoch": 26.802168021680217, "grad_norm": 0.25180432200431824, "learning_rate": 5.504276194633188e-05, "loss": 0.0095, "step": 9890 }, { "epoch": 26.829268292682926, "grad_norm": 0.23744545876979828, "learning_rate": 5.49605031425162e-05, "loss": 0.0124, "step": 9900 }, { "epoch": 26.85636856368564, "grad_norm": 0.30820950865745544, "learning_rate": 5.487823077687434e-05, "loss": 0.014, "step": 9910 }, { "epoch": 26.883468834688347, "grad_norm": 0.23990483582019806, "learning_rate": 5.4795945074335806e-05, "loss": 0.0122, "step": 9920 }, { "epoch": 26.910569105691057, "grad_norm": 0.24560090899467468, "learning_rate": 5.471364625986657e-05, "loss": 0.0112, "step": 9930 }, { "epoch": 26.937669376693766, "grad_norm": 0.2016904652118683, "learning_rate": 5.463133455846845e-05, "loss": 0.0092, "step": 9940 }, { "epoch": 26.964769647696478, "grad_norm": 0.22600828111171722, "learning_rate": 5.4549010195178505e-05, "loss": 0.0149, "step": 9950 }, { "epoch": 26.991869918699187, "grad_norm": 0.23661823570728302, "learning_rate": 5.446667339506838e-05, "loss": 0.0132, "step": 9960 }, { "epoch": 27.018970189701896, "grad_norm": 0.2753617763519287, "learning_rate": 5.4384324383243756e-05, "loss": 0.0098, "step": 9970 }, { "epoch": 27.046070460704605, "grad_norm": 0.31090274453163147, "learning_rate": 5.430196338484368e-05, "loss": 0.0108, "step": 9980 }, { "epoch": 27.073170731707318, "grad_norm": 0.2717534005641937, "learning_rate": 5.4219590625039975e-05, "loss": 0.0115, "step": 9990 }, { "epoch": 27.100271002710027, "grad_norm": 0.2773531377315521, "learning_rate": 5.413720632903664e-05, "loss": 0.0125, "step": 10000 }, { "epoch": 27.127371273712736, "grad_norm": 0.25817978382110596, "learning_rate": 5.405481072206917e-05, "loss": 0.0083, "step": 10010 }, { "epoch": 27.15447154471545, "grad_norm": 0.13323499262332916, "learning_rate": 5.397240402940402e-05, "loss": 0.0096, "step": 10020 }, { "epoch": 27.181571815718158, "grad_norm": 0.23895642161369324, "learning_rate": 5.388998647633794e-05, "loss": 0.0096, "step": 10030 }, { "epoch": 27.208672086720867, "grad_norm": 0.2319730967283249, "learning_rate": 5.380755828819737e-05, "loss": 0.0123, "step": 10040 }, { "epoch": 27.235772357723576, "grad_norm": 0.1617361605167389, "learning_rate": 5.3725119690337846e-05, "loss": 0.0096, "step": 10050 }, { "epoch": 27.26287262872629, "grad_norm": 0.17864736914634705, "learning_rate": 5.3642670908143324e-05, "loss": 0.0098, "step": 10060 }, { "epoch": 27.289972899728998, "grad_norm": 0.2091377228498459, "learning_rate": 5.356021216702562e-05, "loss": 0.0108, "step": 10070 }, { "epoch": 27.317073170731707, "grad_norm": 0.27795013785362244, "learning_rate": 5.347774369242381e-05, "loss": 0.0111, "step": 10080 }, { "epoch": 27.344173441734416, "grad_norm": 0.27009475231170654, "learning_rate": 5.3395265709803545e-05, "loss": 0.0087, "step": 10090 }, { "epoch": 27.37127371273713, "grad_norm": 0.24812111258506775, "learning_rate": 5.331277844465647e-05, "loss": 0.0094, "step": 10100 }, { "epoch": 27.398373983739837, "grad_norm": 0.2370874434709549, "learning_rate": 5.323028212249963e-05, "loss": 0.0108, "step": 10110 }, { "epoch": 27.425474254742547, "grad_norm": 0.2158573567867279, "learning_rate": 5.314777696887481e-05, "loss": 0.0082, "step": 10120 }, { "epoch": 27.45257452574526, "grad_norm": 0.22901524603366852, "learning_rate": 5.306526320934796e-05, "loss": 0.0097, "step": 10130 }, { "epoch": 27.479674796747968, "grad_norm": 0.20319697260856628, "learning_rate": 5.298274106950854e-05, "loss": 0.0095, "step": 10140 }, { "epoch": 27.506775067750677, "grad_norm": 0.21797645092010498, "learning_rate": 5.290021077496893e-05, "loss": 0.0102, "step": 10150 }, { "epoch": 27.533875338753386, "grad_norm": 0.45942726731300354, "learning_rate": 5.2817672551363816e-05, "loss": 0.011, "step": 10160 }, { "epoch": 27.5609756097561, "grad_norm": 0.15264716744422913, "learning_rate": 5.273512662434952e-05, "loss": 0.0082, "step": 10170 }, { "epoch": 27.588075880758808, "grad_norm": 0.1930069625377655, "learning_rate": 5.265257321960349e-05, "loss": 0.0105, "step": 10180 }, { "epoch": 27.615176151761517, "grad_norm": 0.27007415890693665, "learning_rate": 5.257001256282357e-05, "loss": 0.0138, "step": 10190 }, { "epoch": 27.642276422764226, "grad_norm": 0.16888189315795898, "learning_rate": 5.248744487972742e-05, "loss": 0.0109, "step": 10200 }, { "epoch": 27.66937669376694, "grad_norm": 0.2357468456029892, "learning_rate": 5.240487039605196e-05, "loss": 0.0076, "step": 10210 }, { "epoch": 27.696476964769648, "grad_norm": 0.2905060052871704, "learning_rate": 5.232228933755267e-05, "loss": 0.0093, "step": 10220 }, { "epoch": 27.723577235772357, "grad_norm": 0.26273080706596375, "learning_rate": 5.2239701930003006e-05, "loss": 0.0095, "step": 10230 }, { "epoch": 27.75067750677507, "grad_norm": 0.20355886220932007, "learning_rate": 5.215710839919379e-05, "loss": 0.0096, "step": 10240 }, { "epoch": 27.77777777777778, "grad_norm": 0.21583794057369232, "learning_rate": 5.207450897093257e-05, "loss": 0.0101, "step": 10250 }, { "epoch": 27.804878048780488, "grad_norm": 0.233380988240242, "learning_rate": 5.1991903871043046e-05, "loss": 0.009, "step": 10260 }, { "epoch": 27.831978319783197, "grad_norm": 0.14136740565299988, "learning_rate": 5.190929332536439e-05, "loss": 0.0094, "step": 10270 }, { "epoch": 27.85907859078591, "grad_norm": 0.2062251716852188, "learning_rate": 5.182667755975071e-05, "loss": 0.0092, "step": 10280 }, { "epoch": 27.88617886178862, "grad_norm": 0.19068439304828644, "learning_rate": 5.1744056800070315e-05, "loss": 0.0074, "step": 10290 }, { "epoch": 27.913279132791327, "grad_norm": 0.2547304630279541, "learning_rate": 5.166143127220524e-05, "loss": 0.01, "step": 10300 }, { "epoch": 27.940379403794037, "grad_norm": 0.24935346841812134, "learning_rate": 5.1578801202050485e-05, "loss": 0.0083, "step": 10310 }, { "epoch": 27.96747967479675, "grad_norm": 0.1791897714138031, "learning_rate": 5.149616681551355e-05, "loss": 0.0091, "step": 10320 }, { "epoch": 27.994579945799458, "grad_norm": 0.3285914957523346, "learning_rate": 5.141352833851367e-05, "loss": 0.0108, "step": 10330 }, { "epoch": 28.021680216802167, "grad_norm": 0.29654961824417114, "learning_rate": 5.1330885996981285e-05, "loss": 0.0109, "step": 10340 }, { "epoch": 28.048780487804876, "grad_norm": 0.26343834400177, "learning_rate": 5.124824001685741e-05, "loss": 0.0085, "step": 10350 }, { "epoch": 28.07588075880759, "grad_norm": 0.1574537754058838, "learning_rate": 5.116559062409298e-05, "loss": 0.0082, "step": 10360 }, { "epoch": 28.102981029810298, "grad_norm": 0.23039774596691132, "learning_rate": 5.10829380446483e-05, "loss": 0.0083, "step": 10370 }, { "epoch": 28.130081300813007, "grad_norm": 0.2999382019042969, "learning_rate": 5.100028250449235e-05, "loss": 0.0089, "step": 10380 }, { "epoch": 28.15718157181572, "grad_norm": 0.2223888337612152, "learning_rate": 5.0917624229602234e-05, "loss": 0.0088, "step": 10390 }, { "epoch": 28.18428184281843, "grad_norm": 0.20657284557819366, "learning_rate": 5.0834963445962524e-05, "loss": 0.0107, "step": 10400 }, { "epoch": 28.211382113821138, "grad_norm": 0.31858357787132263, "learning_rate": 5.075230037956461e-05, "loss": 0.0121, "step": 10410 }, { "epoch": 28.238482384823847, "grad_norm": 0.2919972538948059, "learning_rate": 5.0669635256406213e-05, "loss": 0.0087, "step": 10420 }, { "epoch": 28.26558265582656, "grad_norm": 0.25429847836494446, "learning_rate": 5.058696830249058e-05, "loss": 0.0087, "step": 10430 }, { "epoch": 28.29268292682927, "grad_norm": 0.2559034824371338, "learning_rate": 5.050429974382602e-05, "loss": 0.0088, "step": 10440 }, { "epoch": 28.319783197831978, "grad_norm": 0.13975472748279572, "learning_rate": 5.042162980642523e-05, "loss": 0.0084, "step": 10450 }, { "epoch": 28.346883468834687, "grad_norm": 0.24851307272911072, "learning_rate": 5.033895871630462e-05, "loss": 0.0081, "step": 10460 }, { "epoch": 28.3739837398374, "grad_norm": 0.17998260259628296, "learning_rate": 5.025628669948386e-05, "loss": 0.0079, "step": 10470 }, { "epoch": 28.40108401084011, "grad_norm": 0.20013007521629333, "learning_rate": 5.017361398198502e-05, "loss": 0.0097, "step": 10480 }, { "epoch": 28.428184281842817, "grad_norm": 0.13714627921581268, "learning_rate": 5.009094078983221e-05, "loss": 0.0084, "step": 10490 }, { "epoch": 28.45528455284553, "grad_norm": 0.1751294583082199, "learning_rate": 5.000826734905073e-05, "loss": 0.0089, "step": 10500 }, { "epoch": 28.48238482384824, "grad_norm": 0.2014438807964325, "learning_rate": 4.9925593885666645e-05, "loss": 0.011, "step": 10510 }, { "epoch": 28.509485094850948, "grad_norm": 0.28628212213516235, "learning_rate": 4.984292062570602e-05, "loss": 0.0076, "step": 10520 }, { "epoch": 28.536585365853657, "grad_norm": 0.2230406552553177, "learning_rate": 4.976024779519442e-05, "loss": 0.0115, "step": 10530 }, { "epoch": 28.56368563685637, "grad_norm": 0.24361906945705414, "learning_rate": 4.9677575620156194e-05, "loss": 0.0095, "step": 10540 }, { "epoch": 28.59078590785908, "grad_norm": 0.1769743263721466, "learning_rate": 4.959490432661391e-05, "loss": 0.0122, "step": 10550 }, { "epoch": 28.617886178861788, "grad_norm": 0.17878976464271545, "learning_rate": 4.9512234140587726e-05, "loss": 0.0091, "step": 10560 }, { "epoch": 28.644986449864497, "grad_norm": 0.24179363250732422, "learning_rate": 4.942956528809477e-05, "loss": 0.011, "step": 10570 }, { "epoch": 28.67208672086721, "grad_norm": 0.14366692304611206, "learning_rate": 4.934689799514854e-05, "loss": 0.0113, "step": 10580 }, { "epoch": 28.69918699186992, "grad_norm": 0.27632275223731995, "learning_rate": 4.926423248775827e-05, "loss": 0.0094, "step": 10590 }, { "epoch": 28.726287262872628, "grad_norm": 0.18203632533550262, "learning_rate": 4.918156899192826e-05, "loss": 0.0076, "step": 10600 }, { "epoch": 28.75338753387534, "grad_norm": 0.24471981823444366, "learning_rate": 4.909890773365738e-05, "loss": 0.0093, "step": 10610 }, { "epoch": 28.78048780487805, "grad_norm": 0.17523936927318573, "learning_rate": 4.9016248938938344e-05, "loss": 0.01, "step": 10620 }, { "epoch": 28.80758807588076, "grad_norm": 0.1523134410381317, "learning_rate": 4.8933592833757156e-05, "loss": 0.0094, "step": 10630 }, { "epoch": 28.834688346883468, "grad_norm": 0.20734430849552155, "learning_rate": 4.8850939644092435e-05, "loss": 0.01, "step": 10640 }, { "epoch": 28.86178861788618, "grad_norm": 0.24345751106739044, "learning_rate": 4.876828959591485e-05, "loss": 0.0082, "step": 10650 }, { "epoch": 28.88888888888889, "grad_norm": 0.15275612473487854, "learning_rate": 4.8685642915186474e-05, "loss": 0.0071, "step": 10660 }, { "epoch": 28.9159891598916, "grad_norm": 0.2331627458333969, "learning_rate": 4.860299982786018e-05, "loss": 0.0084, "step": 10670 }, { "epoch": 28.943089430894307, "grad_norm": 0.16987304389476776, "learning_rate": 4.852036055987901e-05, "loss": 0.0078, "step": 10680 }, { "epoch": 28.97018970189702, "grad_norm": 0.21140897274017334, "learning_rate": 4.843772533717558e-05, "loss": 0.0118, "step": 10690 }, { "epoch": 28.99728997289973, "grad_norm": 0.22642479836940765, "learning_rate": 4.835509438567142e-05, "loss": 0.0075, "step": 10700 }, { "epoch": 29.024390243902438, "grad_norm": 0.208902969956398, "learning_rate": 4.827246793127639e-05, "loss": 0.0094, "step": 10710 }, { "epoch": 29.05149051490515, "grad_norm": 0.20861651003360748, "learning_rate": 4.818984619988807e-05, "loss": 0.0104, "step": 10720 }, { "epoch": 29.07859078590786, "grad_norm": 0.11864374577999115, "learning_rate": 4.810722941739115e-05, "loss": 0.0077, "step": 10730 }, { "epoch": 29.10569105691057, "grad_norm": 0.14475736021995544, "learning_rate": 4.8024617809656684e-05, "loss": 0.0072, "step": 10740 }, { "epoch": 29.132791327913278, "grad_norm": 0.172056645154953, "learning_rate": 4.794201160254171e-05, "loss": 0.0085, "step": 10750 }, { "epoch": 29.15989159891599, "grad_norm": 0.1704128384590149, "learning_rate": 4.785941102188844e-05, "loss": 0.0085, "step": 10760 }, { "epoch": 29.1869918699187, "grad_norm": 0.21947519481182098, "learning_rate": 4.7776816293523686e-05, "loss": 0.0088, "step": 10770 }, { "epoch": 29.21409214092141, "grad_norm": 0.22815948724746704, "learning_rate": 4.769422764325832e-05, "loss": 0.0092, "step": 10780 }, { "epoch": 29.241192411924118, "grad_norm": 0.23963424563407898, "learning_rate": 4.76116452968865e-05, "loss": 0.0089, "step": 10790 }, { "epoch": 29.26829268292683, "grad_norm": 0.1836363822221756, "learning_rate": 4.752906948018525e-05, "loss": 0.0087, "step": 10800 }, { "epoch": 29.29539295392954, "grad_norm": 0.26972365379333496, "learning_rate": 4.7446500418913684e-05, "loss": 0.0143, "step": 10810 }, { "epoch": 29.32249322493225, "grad_norm": 0.271626353263855, "learning_rate": 4.736393833881247e-05, "loss": 0.0095, "step": 10820 }, { "epoch": 29.34959349593496, "grad_norm": 0.251496821641922, "learning_rate": 4.7281383465603194e-05, "loss": 0.0108, "step": 10830 }, { "epoch": 29.37669376693767, "grad_norm": 0.13773640990257263, "learning_rate": 4.71988360249877e-05, "loss": 0.0079, "step": 10840 }, { "epoch": 29.40379403794038, "grad_norm": 0.18810562789440155, "learning_rate": 4.7116296242647554e-05, "loss": 0.0086, "step": 10850 }, { "epoch": 29.43089430894309, "grad_norm": 0.24679908156394958, "learning_rate": 4.703376434424336e-05, "loss": 0.0098, "step": 10860 }, { "epoch": 29.4579945799458, "grad_norm": 0.2613721787929535, "learning_rate": 4.695124055541421e-05, "loss": 0.0119, "step": 10870 }, { "epoch": 29.48509485094851, "grad_norm": 0.20030477643013, "learning_rate": 4.6868725101776934e-05, "loss": 0.0091, "step": 10880 }, { "epoch": 29.51219512195122, "grad_norm": 0.16847921907901764, "learning_rate": 4.678621820892567e-05, "loss": 0.0093, "step": 10890 }, { "epoch": 29.539295392953928, "grad_norm": 0.25625744462013245, "learning_rate": 4.670372010243111e-05, "loss": 0.0101, "step": 10900 }, { "epoch": 29.56639566395664, "grad_norm": 0.21095527708530426, "learning_rate": 4.662123100783992e-05, "loss": 0.0099, "step": 10910 }, { "epoch": 29.59349593495935, "grad_norm": 0.41053110361099243, "learning_rate": 4.653875115067415e-05, "loss": 0.0096, "step": 10920 }, { "epoch": 29.62059620596206, "grad_norm": 0.21923263370990753, "learning_rate": 4.6456280756430545e-05, "loss": 0.0074, "step": 10930 }, { "epoch": 29.647696476964768, "grad_norm": 0.1689096987247467, "learning_rate": 4.637382005058004e-05, "loss": 0.0081, "step": 10940 }, { "epoch": 29.67479674796748, "grad_norm": 0.24143432080745697, "learning_rate": 4.629136925856705e-05, "loss": 0.0083, "step": 10950 }, { "epoch": 29.70189701897019, "grad_norm": 0.2895428538322449, "learning_rate": 4.6208928605808895e-05, "loss": 0.0075, "step": 10960 }, { "epoch": 29.7289972899729, "grad_norm": 0.2900696396827698, "learning_rate": 4.612649831769519e-05, "loss": 0.0078, "step": 10970 }, { "epoch": 29.75609756097561, "grad_norm": 0.2108275145292282, "learning_rate": 4.604407861958715e-05, "loss": 0.0087, "step": 10980 }, { "epoch": 29.78319783197832, "grad_norm": 0.16948525607585907, "learning_rate": 4.5961669736817114e-05, "loss": 0.0079, "step": 10990 }, { "epoch": 29.81029810298103, "grad_norm": 0.22885608673095703, "learning_rate": 4.5879271894687814e-05, "loss": 0.0085, "step": 11000 }, { "epoch": 29.83739837398374, "grad_norm": 0.3430503308773041, "learning_rate": 4.5796885318471826e-05, "loss": 0.0084, "step": 11010 }, { "epoch": 29.86449864498645, "grad_norm": 0.17772792279720306, "learning_rate": 4.571451023341086e-05, "loss": 0.01, "step": 11020 }, { "epoch": 29.89159891598916, "grad_norm": 0.1943613886833191, "learning_rate": 4.563214686471527e-05, "loss": 0.01, "step": 11030 }, { "epoch": 29.91869918699187, "grad_norm": 0.13370704650878906, "learning_rate": 4.5549795437563365e-05, "loss": 0.0098, "step": 11040 }, { "epoch": 29.94579945799458, "grad_norm": 0.36264464259147644, "learning_rate": 4.546745617710081e-05, "loss": 0.0095, "step": 11050 }, { "epoch": 29.97289972899729, "grad_norm": 0.4971444308757782, "learning_rate": 4.5385129308440014e-05, "loss": 0.0074, "step": 11060 }, { "epoch": 30.0, "grad_norm": 0.24251338839530945, "learning_rate": 4.530281505665944e-05, "loss": 0.008, "step": 11070 }, { "epoch": 30.02710027100271, "grad_norm": 0.1949591040611267, "learning_rate": 4.5220513646803134e-05, "loss": 0.0097, "step": 11080 }, { "epoch": 30.05420054200542, "grad_norm": 0.19876912236213684, "learning_rate": 4.513822530388003e-05, "loss": 0.0094, "step": 11090 }, { "epoch": 30.08130081300813, "grad_norm": 0.1825476586818695, "learning_rate": 4.5055950252863296e-05, "loss": 0.0088, "step": 11100 }, { "epoch": 30.10840108401084, "grad_norm": 0.22995005548000336, "learning_rate": 4.4973688718689803e-05, "loss": 0.0072, "step": 11110 }, { "epoch": 30.13550135501355, "grad_norm": 0.19757214188575745, "learning_rate": 4.4891440926259406e-05, "loss": 0.0079, "step": 11120 }, { "epoch": 30.16260162601626, "grad_norm": 0.19297824800014496, "learning_rate": 4.480920710043443e-05, "loss": 0.0077, "step": 11130 }, { "epoch": 30.18970189701897, "grad_norm": 0.22252869606018066, "learning_rate": 4.4726987466039044e-05, "loss": 0.0084, "step": 11140 }, { "epoch": 30.21680216802168, "grad_norm": 0.19187916815280914, "learning_rate": 4.46447822478586e-05, "loss": 0.0078, "step": 11150 }, { "epoch": 30.24390243902439, "grad_norm": 0.2608235478401184, "learning_rate": 4.4562591670638974e-05, "loss": 0.01, "step": 11160 }, { "epoch": 30.2710027100271, "grad_norm": 0.16121909022331238, "learning_rate": 4.4480415959086105e-05, "loss": 0.007, "step": 11170 }, { "epoch": 30.29810298102981, "grad_norm": 0.25860926508903503, "learning_rate": 4.439825533786522e-05, "loss": 0.0114, "step": 11180 }, { "epoch": 30.32520325203252, "grad_norm": 0.2266981154680252, "learning_rate": 4.431611003160035e-05, "loss": 0.0086, "step": 11190 }, { "epoch": 30.352303523035232, "grad_norm": 0.1929105669260025, "learning_rate": 4.4233980264873636e-05, "loss": 0.0088, "step": 11200 }, { "epoch": 30.37940379403794, "grad_norm": 0.23155008256435394, "learning_rate": 4.4151866262224684e-05, "loss": 0.0082, "step": 11210 }, { "epoch": 30.40650406504065, "grad_norm": 0.14779236912727356, "learning_rate": 4.406976824815006e-05, "loss": 0.0075, "step": 11220 }, { "epoch": 30.43360433604336, "grad_norm": 0.26337969303131104, "learning_rate": 4.3987686447102595e-05, "loss": 0.0104, "step": 11230 }, { "epoch": 30.460704607046072, "grad_norm": 0.22290772199630737, "learning_rate": 4.3905621083490804e-05, "loss": 0.0081, "step": 11240 }, { "epoch": 30.48780487804878, "grad_norm": 0.13936036825180054, "learning_rate": 4.3823572381678286e-05, "loss": 0.0081, "step": 11250 }, { "epoch": 30.51490514905149, "grad_norm": 0.2738422751426697, "learning_rate": 4.374154056598301e-05, "loss": 0.0101, "step": 11260 }, { "epoch": 30.5420054200542, "grad_norm": 0.1776702105998993, "learning_rate": 4.3659525860676845e-05, "loss": 0.0092, "step": 11270 }, { "epoch": 30.56910569105691, "grad_norm": 0.23093067109584808, "learning_rate": 4.3577528489984854e-05, "loss": 0.0102, "step": 11280 }, { "epoch": 30.59620596205962, "grad_norm": 0.22996915876865387, "learning_rate": 4.349554867808476e-05, "loss": 0.01, "step": 11290 }, { "epoch": 30.62330623306233, "grad_norm": 0.3196538984775543, "learning_rate": 4.34135866491062e-05, "loss": 0.0073, "step": 11300 }, { "epoch": 30.65040650406504, "grad_norm": 0.2693580985069275, "learning_rate": 4.333164262713022e-05, "loss": 0.0087, "step": 11310 }, { "epoch": 30.67750677506775, "grad_norm": 0.15247127413749695, "learning_rate": 4.324971683618868e-05, "loss": 0.0082, "step": 11320 }, { "epoch": 30.70460704607046, "grad_norm": 0.22294805943965912, "learning_rate": 4.316780950026354e-05, "loss": 0.0083, "step": 11330 }, { "epoch": 30.73170731707317, "grad_norm": 0.16558168828487396, "learning_rate": 4.308592084328637e-05, "loss": 0.0079, "step": 11340 }, { "epoch": 30.758807588075882, "grad_norm": 0.1501348912715912, "learning_rate": 4.3004051089137576e-05, "loss": 0.0084, "step": 11350 }, { "epoch": 30.78590785907859, "grad_norm": 0.1832951456308365, "learning_rate": 4.292220046164597e-05, "loss": 0.0117, "step": 11360 }, { "epoch": 30.8130081300813, "grad_norm": 0.33504074811935425, "learning_rate": 4.2840369184588035e-05, "loss": 0.0102, "step": 11370 }, { "epoch": 30.84010840108401, "grad_norm": 0.18531225621700287, "learning_rate": 4.2758557481687345e-05, "loss": 0.0113, "step": 11380 }, { "epoch": 30.867208672086722, "grad_norm": 0.21974162757396698, "learning_rate": 4.267676557661403e-05, "loss": 0.0079, "step": 11390 }, { "epoch": 30.89430894308943, "grad_norm": 0.4932694137096405, "learning_rate": 4.2594993692983955e-05, "loss": 0.0088, "step": 11400 }, { "epoch": 30.92140921409214, "grad_norm": 0.18396183848381042, "learning_rate": 4.251324205435837e-05, "loss": 0.0099, "step": 11410 }, { "epoch": 30.948509485094853, "grad_norm": 0.22145944833755493, "learning_rate": 4.243151088424312e-05, "loss": 0.0074, "step": 11420 }, { "epoch": 30.975609756097562, "grad_norm": 0.15765045583248138, "learning_rate": 4.234980040608813e-05, "loss": 0.0077, "step": 11430 }, { "epoch": 31.00271002710027, "grad_norm": 0.2245842069387436, "learning_rate": 4.22681108432867e-05, "loss": 0.0085, "step": 11440 }, { "epoch": 31.02981029810298, "grad_norm": 0.21836137771606445, "learning_rate": 4.2186442419174984e-05, "loss": 0.0095, "step": 11450 }, { "epoch": 31.056910569105693, "grad_norm": 0.26330411434173584, "learning_rate": 4.210479535703133e-05, "loss": 0.0087, "step": 11460 }, { "epoch": 31.0840108401084, "grad_norm": 0.2704036235809326, "learning_rate": 4.202316988007567e-05, "loss": 0.0068, "step": 11470 }, { "epoch": 31.11111111111111, "grad_norm": 0.18362843990325928, "learning_rate": 4.194156621146901e-05, "loss": 0.0084, "step": 11480 }, { "epoch": 31.13821138211382, "grad_norm": 0.19126276671886444, "learning_rate": 4.1859984574312596e-05, "loss": 0.0082, "step": 11490 }, { "epoch": 31.165311653116532, "grad_norm": 0.14742405712604523, "learning_rate": 4.177842519164752e-05, "loss": 0.0067, "step": 11500 }, { "epoch": 31.19241192411924, "grad_norm": 0.28757932782173157, "learning_rate": 4.169688828645404e-05, "loss": 0.0083, "step": 11510 }, { "epoch": 31.21951219512195, "grad_norm": 0.17363041639328003, "learning_rate": 4.161537408165092e-05, "loss": 0.0084, "step": 11520 }, { "epoch": 31.24661246612466, "grad_norm": 0.1997385323047638, "learning_rate": 4.1533882800094924e-05, "loss": 0.0114, "step": 11530 }, { "epoch": 31.273712737127372, "grad_norm": 0.2140190452337265, "learning_rate": 4.145241466458005e-05, "loss": 0.0087, "step": 11540 }, { "epoch": 31.30081300813008, "grad_norm": 0.309038370847702, "learning_rate": 4.13709698978371e-05, "loss": 0.0083, "step": 11550 }, { "epoch": 31.32791327913279, "grad_norm": 0.19694943726062775, "learning_rate": 4.1289548722532944e-05, "loss": 0.0109, "step": 11560 }, { "epoch": 31.355013550135503, "grad_norm": 0.3354754149913788, "learning_rate": 4.120815136126999e-05, "loss": 0.009, "step": 11570 }, { "epoch": 31.382113821138212, "grad_norm": 0.17481008172035217, "learning_rate": 4.112677803658548e-05, "loss": 0.0084, "step": 11580 }, { "epoch": 31.40921409214092, "grad_norm": 0.17288072407245636, "learning_rate": 4.1045428970951e-05, "loss": 0.0077, "step": 11590 }, { "epoch": 31.43631436314363, "grad_norm": 0.21863844990730286, "learning_rate": 4.0964104386771785e-05, "loss": 0.0082, "step": 11600 }, { "epoch": 31.463414634146343, "grad_norm": 0.12841087579727173, "learning_rate": 4.0882804506386144e-05, "loss": 0.0086, "step": 11610 }, { "epoch": 31.490514905149052, "grad_norm": 0.15767671167850494, "learning_rate": 4.080152955206485e-05, "loss": 0.0059, "step": 11620 }, { "epoch": 31.51761517615176, "grad_norm": 0.20117005705833435, "learning_rate": 4.0720279746010505e-05, "loss": 0.0085, "step": 11630 }, { "epoch": 31.54471544715447, "grad_norm": 0.2255721539258957, "learning_rate": 4.063905531035699e-05, "loss": 0.0083, "step": 11640 }, { "epoch": 31.571815718157183, "grad_norm": 0.15057919919490814, "learning_rate": 4.055785646716882e-05, "loss": 0.008, "step": 11650 }, { "epoch": 31.59891598915989, "grad_norm": 0.19472286105155945, "learning_rate": 4.047668343844051e-05, "loss": 0.0083, "step": 11660 }, { "epoch": 31.6260162601626, "grad_norm": 0.16337215900421143, "learning_rate": 4.039553644609604e-05, "loss": 0.0109, "step": 11670 }, { "epoch": 31.653116531165313, "grad_norm": 0.2147749811410904, "learning_rate": 4.0314415711988176e-05, "loss": 0.0078, "step": 11680 }, { "epoch": 31.680216802168022, "grad_norm": 0.1452392339706421, "learning_rate": 4.023332145789792e-05, "loss": 0.0076, "step": 11690 }, { "epoch": 31.70731707317073, "grad_norm": 0.17313015460968018, "learning_rate": 4.015225390553385e-05, "loss": 0.0065, "step": 11700 }, { "epoch": 31.73441734417344, "grad_norm": 0.22431375086307526, "learning_rate": 4.007121327653158e-05, "loss": 0.0078, "step": 11710 }, { "epoch": 31.761517615176153, "grad_norm": 0.19897514581680298, "learning_rate": 3.9990199792453064e-05, "loss": 0.0071, "step": 11720 }, { "epoch": 31.788617886178862, "grad_norm": 0.27744340896606445, "learning_rate": 3.9909213674786103e-05, "loss": 0.0099, "step": 11730 }, { "epoch": 31.81571815718157, "grad_norm": 0.21343548595905304, "learning_rate": 3.982825514494363e-05, "loss": 0.009, "step": 11740 }, { "epoch": 31.84281842818428, "grad_norm": 0.2803768217563629, "learning_rate": 3.974732442426319e-05, "loss": 0.0081, "step": 11750 }, { "epoch": 31.869918699186993, "grad_norm": 0.18224956095218658, "learning_rate": 3.966642173400629e-05, "loss": 0.0085, "step": 11760 }, { "epoch": 31.897018970189702, "grad_norm": 0.1809883564710617, "learning_rate": 3.9585547295357764e-05, "loss": 0.0101, "step": 11770 }, { "epoch": 31.92411924119241, "grad_norm": 0.29575446248054504, "learning_rate": 3.950470132942526e-05, "loss": 0.0091, "step": 11780 }, { "epoch": 31.951219512195124, "grad_norm": 0.19209258258342743, "learning_rate": 3.942388405723856e-05, "loss": 0.0101, "step": 11790 }, { "epoch": 31.978319783197833, "grad_norm": 0.14948388934135437, "learning_rate": 3.9343095699749e-05, "loss": 0.0114, "step": 11800 }, { "epoch": 32.00542005420054, "grad_norm": 0.26820212602615356, "learning_rate": 3.9262336477828874e-05, "loss": 0.0076, "step": 11810 }, { "epoch": 32.03252032520325, "grad_norm": 0.21456654369831085, "learning_rate": 3.9181606612270794e-05, "loss": 0.0122, "step": 11820 }, { "epoch": 32.05962059620596, "grad_norm": 0.17459581792354584, "learning_rate": 3.910090632378713e-05, "loss": 0.0071, "step": 11830 }, { "epoch": 32.08672086720867, "grad_norm": 0.17015302181243896, "learning_rate": 3.90202358330094e-05, "loss": 0.0082, "step": 11840 }, { "epoch": 32.113821138211385, "grad_norm": 0.18265244364738464, "learning_rate": 3.8939595360487656e-05, "loss": 0.0093, "step": 11850 }, { "epoch": 32.140921409214094, "grad_norm": 0.1902509480714798, "learning_rate": 3.885898512668984e-05, "loss": 0.0072, "step": 11860 }, { "epoch": 32.1680216802168, "grad_norm": 0.13339625298976898, "learning_rate": 3.877840535200127e-05, "loss": 0.0079, "step": 11870 }, { "epoch": 32.19512195121951, "grad_norm": 0.16690488159656525, "learning_rate": 3.869785625672397e-05, "loss": 0.008, "step": 11880 }, { "epoch": 32.22222222222222, "grad_norm": 0.19106239080429077, "learning_rate": 3.8617338061076094e-05, "loss": 0.0081, "step": 11890 }, { "epoch": 32.24932249322493, "grad_norm": 0.16632361710071564, "learning_rate": 3.853685098519132e-05, "loss": 0.0094, "step": 11900 }, { "epoch": 32.27642276422764, "grad_norm": 0.16504329442977905, "learning_rate": 3.845639524911823e-05, "loss": 0.0066, "step": 11910 }, { "epoch": 32.303523035230356, "grad_norm": 0.2537733018398285, "learning_rate": 3.837597107281974e-05, "loss": 0.0077, "step": 11920 }, { "epoch": 32.330623306233065, "grad_norm": 0.2595174312591553, "learning_rate": 3.829557867617247e-05, "loss": 0.0083, "step": 11930 }, { "epoch": 32.357723577235774, "grad_norm": 0.2974930703639984, "learning_rate": 3.821521827896618e-05, "loss": 0.0094, "step": 11940 }, { "epoch": 32.38482384823848, "grad_norm": 0.16895994544029236, "learning_rate": 3.81348901009031e-05, "loss": 0.0089, "step": 11950 }, { "epoch": 32.41192411924119, "grad_norm": 0.27898970246315, "learning_rate": 3.805459436159741e-05, "loss": 0.0091, "step": 11960 }, { "epoch": 32.4390243902439, "grad_norm": 0.3496355414390564, "learning_rate": 3.797433128057461e-05, "loss": 0.0097, "step": 11970 }, { "epoch": 32.46612466124661, "grad_norm": 0.2783356308937073, "learning_rate": 3.789410107727089e-05, "loss": 0.0085, "step": 11980 }, { "epoch": 32.49322493224932, "grad_norm": 0.24896495044231415, "learning_rate": 3.781390397103257e-05, "loss": 0.0106, "step": 11990 }, { "epoch": 32.520325203252035, "grad_norm": 0.20614685118198395, "learning_rate": 3.7733740181115455e-05, "loss": 0.0117, "step": 12000 }, { "epoch": 32.547425474254744, "grad_norm": 0.1776188611984253, "learning_rate": 3.7653609926684306e-05, "loss": 0.0107, "step": 12010 }, { "epoch": 32.57452574525745, "grad_norm": 0.2159109115600586, "learning_rate": 3.757351342681217e-05, "loss": 0.0065, "step": 12020 }, { "epoch": 32.60162601626016, "grad_norm": 0.16483139991760254, "learning_rate": 3.749345090047982e-05, "loss": 0.0081, "step": 12030 }, { "epoch": 32.62872628726287, "grad_norm": 0.30450138449668884, "learning_rate": 3.741342256657515e-05, "loss": 0.0102, "step": 12040 }, { "epoch": 32.65582655826558, "grad_norm": 0.22272224724292755, "learning_rate": 3.7333428643892567e-05, "loss": 0.0097, "step": 12050 }, { "epoch": 32.68292682926829, "grad_norm": 0.14604122936725616, "learning_rate": 3.725346935113239e-05, "loss": 0.0075, "step": 12060 }, { "epoch": 32.710027100271006, "grad_norm": 0.21522843837738037, "learning_rate": 3.717354490690029e-05, "loss": 0.0111, "step": 12070 }, { "epoch": 32.737127371273715, "grad_norm": 0.12360890954732895, "learning_rate": 3.709365552970664e-05, "loss": 0.0072, "step": 12080 }, { "epoch": 32.764227642276424, "grad_norm": 0.16261635720729828, "learning_rate": 3.7013801437965945e-05, "loss": 0.0093, "step": 12090 }, { "epoch": 32.79132791327913, "grad_norm": 0.14510896801948547, "learning_rate": 3.693398284999623e-05, "loss": 0.0084, "step": 12100 }, { "epoch": 32.81842818428184, "grad_norm": 0.18989792466163635, "learning_rate": 3.6854199984018484e-05, "loss": 0.0088, "step": 12110 }, { "epoch": 32.84552845528455, "grad_norm": 0.16261082887649536, "learning_rate": 3.677445305815601e-05, "loss": 0.0079, "step": 12120 }, { "epoch": 32.87262872628726, "grad_norm": 0.14783169329166412, "learning_rate": 3.669474229043387e-05, "loss": 0.0084, "step": 12130 }, { "epoch": 32.89972899728997, "grad_norm": 0.22671401500701904, "learning_rate": 3.6615067898778235e-05, "loss": 0.0087, "step": 12140 }, { "epoch": 32.926829268292686, "grad_norm": 0.22855931520462036, "learning_rate": 3.6535430101015866e-05, "loss": 0.0065, "step": 12150 }, { "epoch": 32.953929539295395, "grad_norm": 0.21125288307666779, "learning_rate": 3.645582911487345e-05, "loss": 0.01, "step": 12160 }, { "epoch": 32.981029810298104, "grad_norm": 0.25644397735595703, "learning_rate": 3.637626515797706e-05, "loss": 0.0077, "step": 12170 }, { "epoch": 33.00813008130081, "grad_norm": 0.1850961595773697, "learning_rate": 3.629673844785152e-05, "loss": 0.0071, "step": 12180 }, { "epoch": 33.03523035230352, "grad_norm": 0.13218265771865845, "learning_rate": 3.621724920191979e-05, "loss": 0.0075, "step": 12190 }, { "epoch": 33.06233062330623, "grad_norm": 0.20934203267097473, "learning_rate": 3.6137797637502444e-05, "loss": 0.0072, "step": 12200 }, { "epoch": 33.08943089430894, "grad_norm": 0.2797098457813263, "learning_rate": 3.6058383971817035e-05, "loss": 0.0072, "step": 12210 }, { "epoch": 33.116531165311656, "grad_norm": 0.2015627920627594, "learning_rate": 3.59790084219775e-05, "loss": 0.0105, "step": 12220 }, { "epoch": 33.143631436314365, "grad_norm": 0.22654017806053162, "learning_rate": 3.589967120499353e-05, "loss": 0.0079, "step": 12230 }, { "epoch": 33.170731707317074, "grad_norm": 0.2165304273366928, "learning_rate": 3.5820372537770075e-05, "loss": 0.0076, "step": 12240 }, { "epoch": 33.19783197831978, "grad_norm": 0.1719217151403427, "learning_rate": 3.5741112637106655e-05, "loss": 0.0083, "step": 12250 }, { "epoch": 33.22493224932249, "grad_norm": 0.2400035411119461, "learning_rate": 3.5661891719696804e-05, "loss": 0.008, "step": 12260 }, { "epoch": 33.2520325203252, "grad_norm": 0.4389023184776306, "learning_rate": 3.5582710002127504e-05, "loss": 0.0108, "step": 12270 }, { "epoch": 33.27913279132791, "grad_norm": 0.21456770598888397, "learning_rate": 3.550356770087853e-05, "loss": 0.0095, "step": 12280 }, { "epoch": 33.30623306233063, "grad_norm": 0.12460032105445862, "learning_rate": 3.5424465032321914e-05, "loss": 0.0096, "step": 12290 }, { "epoch": 33.333333333333336, "grad_norm": 0.21859782934188843, "learning_rate": 3.5345402212721335e-05, "loss": 0.008, "step": 12300 }, { "epoch": 33.360433604336045, "grad_norm": 0.23087798058986664, "learning_rate": 3.526637945823152e-05, "loss": 0.0075, "step": 12310 }, { "epoch": 33.387533875338754, "grad_norm": 0.2130027413368225, "learning_rate": 3.518739698489767e-05, "loss": 0.0087, "step": 12320 }, { "epoch": 33.41463414634146, "grad_norm": 0.1770734339952469, "learning_rate": 3.510845500865485e-05, "loss": 0.0078, "step": 12330 }, { "epoch": 33.44173441734417, "grad_norm": 0.18947899341583252, "learning_rate": 3.502955374532739e-05, "loss": 0.0073, "step": 12340 }, { "epoch": 33.46883468834688, "grad_norm": 0.17567571997642517, "learning_rate": 3.495069341062836e-05, "loss": 0.0083, "step": 12350 }, { "epoch": 33.49593495934959, "grad_norm": 0.18556010723114014, "learning_rate": 3.4871874220158896e-05, "loss": 0.0082, "step": 12360 }, { "epoch": 33.523035230352306, "grad_norm": 0.24670737981796265, "learning_rate": 3.479309638940762e-05, "loss": 0.0081, "step": 12370 }, { "epoch": 33.550135501355015, "grad_norm": 0.12711697816848755, "learning_rate": 3.4714360133750146e-05, "loss": 0.0082, "step": 12380 }, { "epoch": 33.577235772357724, "grad_norm": 0.29980677366256714, "learning_rate": 3.463566566844839e-05, "loss": 0.0084, "step": 12390 }, { "epoch": 33.60433604336043, "grad_norm": 0.18874973058700562, "learning_rate": 3.4557013208650016e-05, "loss": 0.0092, "step": 12400 }, { "epoch": 33.63143631436314, "grad_norm": 0.25362005829811096, "learning_rate": 3.4478402969387857e-05, "loss": 0.0083, "step": 12410 }, { "epoch": 33.65853658536585, "grad_norm": 0.2993088364601135, "learning_rate": 3.4399835165579266e-05, "loss": 0.007, "step": 12420 }, { "epoch": 33.68563685636856, "grad_norm": 0.1768006980419159, "learning_rate": 3.4321310012025645e-05, "loss": 0.0075, "step": 12430 }, { "epoch": 33.71273712737128, "grad_norm": 0.2027633935213089, "learning_rate": 3.424282772341176e-05, "loss": 0.0067, "step": 12440 }, { "epoch": 33.739837398373986, "grad_norm": 0.7875013947486877, "learning_rate": 3.416438851430519e-05, "loss": 0.0069, "step": 12450 }, { "epoch": 33.766937669376695, "grad_norm": 0.24583064019680023, "learning_rate": 3.408599259915577e-05, "loss": 0.0094, "step": 12460 }, { "epoch": 33.794037940379404, "grad_norm": 0.28233206272125244, "learning_rate": 3.400764019229487e-05, "loss": 0.0077, "step": 12470 }, { "epoch": 33.82113821138211, "grad_norm": 0.2651706635951996, "learning_rate": 3.3929331507935035e-05, "loss": 0.0074, "step": 12480 }, { "epoch": 33.84823848238482, "grad_norm": 0.1905028522014618, "learning_rate": 3.3851066760169196e-05, "loss": 0.0059, "step": 12490 }, { "epoch": 33.87533875338753, "grad_norm": 0.17781716585159302, "learning_rate": 3.377284616297021e-05, "loss": 0.0069, "step": 12500 }, { "epoch": 33.90243902439025, "grad_norm": 0.18644750118255615, "learning_rate": 3.3694669930190166e-05, "loss": 0.0087, "step": 12510 }, { "epoch": 33.929539295392956, "grad_norm": 0.16904477775096893, "learning_rate": 3.36165382755599e-05, "loss": 0.0057, "step": 12520 }, { "epoch": 33.956639566395665, "grad_norm": 0.16569726169109344, "learning_rate": 3.35384514126884e-05, "loss": 0.0066, "step": 12530 }, { "epoch": 33.983739837398375, "grad_norm": 0.13223528861999512, "learning_rate": 3.3460409555062154e-05, "loss": 0.0066, "step": 12540 }, { "epoch": 34.010840108401084, "grad_norm": 0.2539064884185791, "learning_rate": 3.3382412916044645e-05, "loss": 0.0067, "step": 12550 }, { "epoch": 34.03794037940379, "grad_norm": 0.1991189867258072, "learning_rate": 3.330446170887566e-05, "loss": 0.0081, "step": 12560 }, { "epoch": 34.0650406504065, "grad_norm": 0.17691747844219208, "learning_rate": 3.3226556146670834e-05, "loss": 0.0087, "step": 12570 }, { "epoch": 34.09214092140921, "grad_norm": 0.17945216596126556, "learning_rate": 3.314869644242102e-05, "loss": 0.0094, "step": 12580 }, { "epoch": 34.11924119241193, "grad_norm": 0.27042844891548157, "learning_rate": 3.3070882808991674e-05, "loss": 0.006, "step": 12590 }, { "epoch": 34.146341463414636, "grad_norm": 0.1671561449766159, "learning_rate": 3.2993115459122305e-05, "loss": 0.0093, "step": 12600 }, { "epoch": 34.173441734417345, "grad_norm": 0.24590574204921722, "learning_rate": 3.2915394605425835e-05, "loss": 0.0075, "step": 12610 }, { "epoch": 34.200542005420054, "grad_norm": 0.19082117080688477, "learning_rate": 3.283772046038816e-05, "loss": 0.0075, "step": 12620 }, { "epoch": 34.22764227642276, "grad_norm": 0.34057337045669556, "learning_rate": 3.276009323636739e-05, "loss": 0.0074, "step": 12630 }, { "epoch": 34.25474254742547, "grad_norm": 0.16938942670822144, "learning_rate": 3.268251314559344e-05, "loss": 0.0094, "step": 12640 }, { "epoch": 34.28184281842818, "grad_norm": 0.3068222999572754, "learning_rate": 3.2604980400167254e-05, "loss": 0.0079, "step": 12650 }, { "epoch": 34.3089430894309, "grad_norm": 0.13214971125125885, "learning_rate": 3.252749521206042e-05, "loss": 0.0071, "step": 12660 }, { "epoch": 34.33604336043361, "grad_norm": 0.3195270597934723, "learning_rate": 3.2450057793114494e-05, "loss": 0.0097, "step": 12670 }, { "epoch": 34.363143631436316, "grad_norm": 0.1508561223745346, "learning_rate": 3.2372668355040435e-05, "loss": 0.0067, "step": 12680 }, { "epoch": 34.390243902439025, "grad_norm": 0.18666337430477142, "learning_rate": 3.2295327109418005e-05, "loss": 0.0064, "step": 12690 }, { "epoch": 34.417344173441734, "grad_norm": 0.15276911854743958, "learning_rate": 3.221803426769518e-05, "loss": 0.0061, "step": 12700 }, { "epoch": 34.44444444444444, "grad_norm": 0.21455082297325134, "learning_rate": 3.214079004118768e-05, "loss": 0.0084, "step": 12710 }, { "epoch": 34.47154471544715, "grad_norm": 0.17252852022647858, "learning_rate": 3.2063594641078234e-05, "loss": 0.0061, "step": 12720 }, { "epoch": 34.49864498644986, "grad_norm": 0.16442935168743134, "learning_rate": 3.198644827841616e-05, "loss": 0.009, "step": 12730 }, { "epoch": 34.52574525745258, "grad_norm": 0.38062527775764465, "learning_rate": 3.1909351164116654e-05, "loss": 0.0084, "step": 12740 }, { "epoch": 34.552845528455286, "grad_norm": 0.17739443480968475, "learning_rate": 3.183230350896026e-05, "loss": 0.009, "step": 12750 }, { "epoch": 34.579945799457995, "grad_norm": 0.18803781270980835, "learning_rate": 3.1755305523592337e-05, "loss": 0.0074, "step": 12760 }, { "epoch": 34.607046070460704, "grad_norm": 0.19971145689487457, "learning_rate": 3.167835741852245e-05, "loss": 0.0102, "step": 12770 }, { "epoch": 34.63414634146341, "grad_norm": 0.2661511301994324, "learning_rate": 3.160145940412378e-05, "loss": 0.0081, "step": 12780 }, { "epoch": 34.66124661246612, "grad_norm": 0.15748707950115204, "learning_rate": 3.1524611690632545e-05, "loss": 0.0081, "step": 12790 }, { "epoch": 34.68834688346883, "grad_norm": 0.15716984868049622, "learning_rate": 3.144781448814746e-05, "loss": 0.0067, "step": 12800 }, { "epoch": 34.71544715447155, "grad_norm": 0.33909112215042114, "learning_rate": 3.1371068006629145e-05, "loss": 0.0089, "step": 12810 }, { "epoch": 34.74254742547426, "grad_norm": 0.143507182598114, "learning_rate": 3.129437245589956e-05, "loss": 0.0058, "step": 12820 }, { "epoch": 34.769647696476966, "grad_norm": 0.12052647024393082, "learning_rate": 3.121772804564143e-05, "loss": 0.0058, "step": 12830 }, { "epoch": 34.796747967479675, "grad_norm": 0.18451768159866333, "learning_rate": 3.11411349853976e-05, "loss": 0.0079, "step": 12840 }, { "epoch": 34.823848238482384, "grad_norm": 0.1808987259864807, "learning_rate": 3.10645934845706e-05, "loss": 0.0103, "step": 12850 }, { "epoch": 34.85094850948509, "grad_norm": 0.1912742406129837, "learning_rate": 3.098810375242196e-05, "loss": 0.0085, "step": 12860 }, { "epoch": 34.8780487804878, "grad_norm": 0.15933136641979218, "learning_rate": 3.0911665998071704e-05, "loss": 0.0101, "step": 12870 }, { "epoch": 34.90514905149052, "grad_norm": 0.16721640527248383, "learning_rate": 3.083528043049774e-05, "loss": 0.0074, "step": 12880 }, { "epoch": 34.93224932249323, "grad_norm": 0.19409671425819397, "learning_rate": 3.0758947258535255e-05, "loss": 0.0053, "step": 12890 }, { "epoch": 34.959349593495936, "grad_norm": 0.14766128361225128, "learning_rate": 3.068266669087625e-05, "loss": 0.0065, "step": 12900 }, { "epoch": 34.986449864498645, "grad_norm": 0.1747574806213379, "learning_rate": 3.060643893606887e-05, "loss": 0.0072, "step": 12910 }, { "epoch": 35.013550135501355, "grad_norm": 0.27278900146484375, "learning_rate": 3.053026420251693e-05, "loss": 0.0075, "step": 12920 }, { "epoch": 35.040650406504064, "grad_norm": 0.24823696911334991, "learning_rate": 3.0454142698479183e-05, "loss": 0.0067, "step": 12930 }, { "epoch": 35.06775067750677, "grad_norm": 0.2055366337299347, "learning_rate": 3.0378074632068954e-05, "loss": 0.0084, "step": 12940 }, { "epoch": 35.09485094850948, "grad_norm": 0.4046635925769806, "learning_rate": 3.0302060211253408e-05, "loss": 0.0081, "step": 12950 }, { "epoch": 35.1219512195122, "grad_norm": 0.17768128216266632, "learning_rate": 3.0226099643853073e-05, "loss": 0.0075, "step": 12960 }, { "epoch": 35.14905149051491, "grad_norm": 0.27226102352142334, "learning_rate": 3.0150193137541283e-05, "loss": 0.0078, "step": 12970 }, { "epoch": 35.176151761517616, "grad_norm": 0.22766919434070587, "learning_rate": 3.0074340899843467e-05, "loss": 0.0073, "step": 12980 }, { "epoch": 35.203252032520325, "grad_norm": 0.22336988151073456, "learning_rate": 2.999854313813677e-05, "loss": 0.0082, "step": 12990 }, { "epoch": 35.230352303523034, "grad_norm": 0.2287459820508957, "learning_rate": 2.9922800059649382e-05, "loss": 0.0102, "step": 13000 }, { "epoch": 35.25745257452574, "grad_norm": 0.17669586837291718, "learning_rate": 2.9847111871459976e-05, "loss": 0.0087, "step": 13010 }, { "epoch": 35.28455284552845, "grad_norm": 0.14224295318126678, "learning_rate": 2.977147878049721e-05, "loss": 0.0073, "step": 13020 }, { "epoch": 35.31165311653117, "grad_norm": 0.1542416512966156, "learning_rate": 2.9695900993539006e-05, "loss": 0.0068, "step": 13030 }, { "epoch": 35.33875338753388, "grad_norm": 0.11448461562395096, "learning_rate": 2.9620378717212183e-05, "loss": 0.0056, "step": 13040 }, { "epoch": 35.36585365853659, "grad_norm": 0.1375311017036438, "learning_rate": 2.9544912157991745e-05, "loss": 0.0075, "step": 13050 }, { "epoch": 35.392953929539296, "grad_norm": 0.1756557822227478, "learning_rate": 2.9469501522200405e-05, "loss": 0.0065, "step": 13060 }, { "epoch": 35.420054200542005, "grad_norm": 0.13950446248054504, "learning_rate": 2.9394147016007946e-05, "loss": 0.0113, "step": 13070 }, { "epoch": 35.447154471544714, "grad_norm": 0.18171489238739014, "learning_rate": 2.9318848845430702e-05, "loss": 0.0093, "step": 13080 }, { "epoch": 35.47425474254742, "grad_norm": 0.16960223019123077, "learning_rate": 2.9243607216331013e-05, "loss": 0.0082, "step": 13090 }, { "epoch": 35.50135501355014, "grad_norm": 0.18758150935173035, "learning_rate": 2.916842233441661e-05, "loss": 0.0107, "step": 13100 }, { "epoch": 35.52845528455285, "grad_norm": 0.2391832321882248, "learning_rate": 2.90932944052401e-05, "loss": 0.0082, "step": 13110 }, { "epoch": 35.55555555555556, "grad_norm": 0.11851340532302856, "learning_rate": 2.9018223634198354e-05, "loss": 0.0099, "step": 13120 }, { "epoch": 35.582655826558266, "grad_norm": 0.22041457891464233, "learning_rate": 2.8943210226532025e-05, "loss": 0.01, "step": 13130 }, { "epoch": 35.609756097560975, "grad_norm": 0.1936119645833969, "learning_rate": 2.8868254387324857e-05, "loss": 0.0074, "step": 13140 }, { "epoch": 35.636856368563684, "grad_norm": 0.08397537469863892, "learning_rate": 2.8793356321503306e-05, "loss": 0.0075, "step": 13150 }, { "epoch": 35.66395663956639, "grad_norm": 0.18800604343414307, "learning_rate": 2.87185162338358e-05, "loss": 0.0065, "step": 13160 }, { "epoch": 35.6910569105691, "grad_norm": 0.2120666354894638, "learning_rate": 2.8643734328932253e-05, "loss": 0.0066, "step": 13170 }, { "epoch": 35.71815718157182, "grad_norm": 0.2982398569583893, "learning_rate": 2.856901081124359e-05, "loss": 0.0071, "step": 13180 }, { "epoch": 35.74525745257453, "grad_norm": 0.18741829693317413, "learning_rate": 2.8494345885061002e-05, "loss": 0.0059, "step": 13190 }, { "epoch": 35.77235772357724, "grad_norm": 0.1518445461988449, "learning_rate": 2.8419739754515616e-05, "loss": 0.0071, "step": 13200 }, { "epoch": 35.799457994579946, "grad_norm": 0.21891917288303375, "learning_rate": 2.8345192623577666e-05, "loss": 0.0074, "step": 13210 }, { "epoch": 35.826558265582655, "grad_norm": 0.2544057071208954, "learning_rate": 2.8270704696056193e-05, "loss": 0.0071, "step": 13220 }, { "epoch": 35.853658536585364, "grad_norm": 0.14287658035755157, "learning_rate": 2.8196276175598367e-05, "loss": 0.0061, "step": 13230 }, { "epoch": 35.88075880758807, "grad_norm": 0.12916241586208344, "learning_rate": 2.8121907265688884e-05, "loss": 0.0076, "step": 13240 }, { "epoch": 35.90785907859079, "grad_norm": 0.14093485474586487, "learning_rate": 2.804759816964957e-05, "loss": 0.0059, "step": 13250 }, { "epoch": 35.9349593495935, "grad_norm": 0.1611955761909485, "learning_rate": 2.797334909063857e-05, "loss": 0.0092, "step": 13260 }, { "epoch": 35.96205962059621, "grad_norm": 0.10550932586193085, "learning_rate": 2.7899160231650056e-05, "loss": 0.0082, "step": 13270 }, { "epoch": 35.989159891598916, "grad_norm": 0.29210400581359863, "learning_rate": 2.7825031795513585e-05, "loss": 0.0069, "step": 13280 }, { "epoch": 36.016260162601625, "grad_norm": 0.1624496579170227, "learning_rate": 2.775096398489341e-05, "loss": 0.0069, "step": 13290 }, { "epoch": 36.043360433604335, "grad_norm": 0.18560156226158142, "learning_rate": 2.7676957002288163e-05, "loss": 0.0054, "step": 13300 }, { "epoch": 36.070460704607044, "grad_norm": 0.16980290412902832, "learning_rate": 2.760301105003003e-05, "loss": 0.0058, "step": 13310 }, { "epoch": 36.09756097560975, "grad_norm": 0.1851630061864853, "learning_rate": 2.752912633028446e-05, "loss": 0.0078, "step": 13320 }, { "epoch": 36.12466124661247, "grad_norm": 0.2316962480545044, "learning_rate": 2.7455303045049474e-05, "loss": 0.0071, "step": 13330 }, { "epoch": 36.15176151761518, "grad_norm": 0.239768385887146, "learning_rate": 2.7381541396155098e-05, "loss": 0.0061, "step": 13340 }, { "epoch": 36.17886178861789, "grad_norm": 0.44044336676597595, "learning_rate": 2.730784158526286e-05, "loss": 0.0062, "step": 13350 }, { "epoch": 36.205962059620596, "grad_norm": 0.6850070953369141, "learning_rate": 2.723420381386521e-05, "loss": 0.0088, "step": 13360 }, { "epoch": 36.233062330623305, "grad_norm": 0.23442471027374268, "learning_rate": 2.7160628283285018e-05, "loss": 0.0085, "step": 13370 }, { "epoch": 36.260162601626014, "grad_norm": 0.17065224051475525, "learning_rate": 2.7087115194675007e-05, "loss": 0.0071, "step": 13380 }, { "epoch": 36.28726287262872, "grad_norm": 0.1544269174337387, "learning_rate": 2.701366474901712e-05, "loss": 0.0069, "step": 13390 }, { "epoch": 36.31436314363144, "grad_norm": 0.17808552086353302, "learning_rate": 2.6940277147122085e-05, "loss": 0.0066, "step": 13400 }, { "epoch": 36.34146341463415, "grad_norm": 0.2015807032585144, "learning_rate": 2.686695258962878e-05, "loss": 0.007, "step": 13410 }, { "epoch": 36.36856368563686, "grad_norm": 0.11304455250501633, "learning_rate": 2.679369127700375e-05, "loss": 0.0061, "step": 13420 }, { "epoch": 36.39566395663957, "grad_norm": 0.1619618982076645, "learning_rate": 2.672049340954067e-05, "loss": 0.0087, "step": 13430 }, { "epoch": 36.422764227642276, "grad_norm": 0.14021410048007965, "learning_rate": 2.6647359187359676e-05, "loss": 0.0097, "step": 13440 }, { "epoch": 36.449864498644985, "grad_norm": 0.23788203299045563, "learning_rate": 2.6574288810406946e-05, "loss": 0.0071, "step": 13450 }, { "epoch": 36.476964769647694, "grad_norm": 0.2884807586669922, "learning_rate": 2.6501282478454083e-05, "loss": 0.0066, "step": 13460 }, { "epoch": 36.50406504065041, "grad_norm": 0.1832629293203354, "learning_rate": 2.6428340391097618e-05, "loss": 0.0063, "step": 13470 }, { "epoch": 36.53116531165312, "grad_norm": 0.23150089383125305, "learning_rate": 2.6355462747758485e-05, "loss": 0.0052, "step": 13480 }, { "epoch": 36.55826558265583, "grad_norm": 0.2992226183414459, "learning_rate": 2.6282649747681304e-05, "loss": 0.0068, "step": 13490 }, { "epoch": 36.58536585365854, "grad_norm": 0.14088834822177887, "learning_rate": 2.620990158993406e-05, "loss": 0.0068, "step": 13500 }, { "epoch": 36.612466124661246, "grad_norm": 0.2998476028442383, "learning_rate": 2.6137218473407477e-05, "loss": 0.008, "step": 13510 }, { "epoch": 36.639566395663955, "grad_norm": 0.5066817998886108, "learning_rate": 2.606460059681436e-05, "loss": 0.006, "step": 13520 }, { "epoch": 36.666666666666664, "grad_norm": 0.15699605643749237, "learning_rate": 2.599204815868928e-05, "loss": 0.0064, "step": 13530 }, { "epoch": 36.69376693766937, "grad_norm": 0.16631172597408295, "learning_rate": 2.5919561357387756e-05, "loss": 0.0068, "step": 13540 }, { "epoch": 36.72086720867209, "grad_norm": 0.18928463757038116, "learning_rate": 2.5847140391085972e-05, "loss": 0.0093, "step": 13550 }, { "epoch": 36.7479674796748, "grad_norm": 0.18820270895957947, "learning_rate": 2.5774785457780103e-05, "loss": 0.0058, "step": 13560 }, { "epoch": 36.77506775067751, "grad_norm": 0.20964524149894714, "learning_rate": 2.5702496755285753e-05, "loss": 0.0083, "step": 13570 }, { "epoch": 36.80216802168022, "grad_norm": 0.16885870695114136, "learning_rate": 2.5630274481237483e-05, "loss": 0.0063, "step": 13580 }, { "epoch": 36.829268292682926, "grad_norm": 0.19431626796722412, "learning_rate": 2.5558118833088197e-05, "loss": 0.0082, "step": 13590 }, { "epoch": 36.856368563685635, "grad_norm": 0.1777656227350235, "learning_rate": 2.548603000810872e-05, "loss": 0.0066, "step": 13600 }, { "epoch": 36.883468834688344, "grad_norm": 0.19363994896411896, "learning_rate": 2.5414008203387152e-05, "loss": 0.0084, "step": 13610 }, { "epoch": 36.91056910569106, "grad_norm": 0.2294522374868393, "learning_rate": 2.534205361582834e-05, "loss": 0.0111, "step": 13620 }, { "epoch": 36.93766937669377, "grad_norm": 0.12288045883178711, "learning_rate": 2.527016644215338e-05, "loss": 0.0066, "step": 13630 }, { "epoch": 36.96476964769648, "grad_norm": 0.21495701372623444, "learning_rate": 2.519834687889905e-05, "loss": 0.0065, "step": 13640 }, { "epoch": 36.99186991869919, "grad_norm": 0.1644265055656433, "learning_rate": 2.5126595122417295e-05, "loss": 0.0066, "step": 13650 }, { "epoch": 37.018970189701896, "grad_norm": 0.23176752030849457, "learning_rate": 2.5054911368874713e-05, "loss": 0.0064, "step": 13660 }, { "epoch": 37.046070460704605, "grad_norm": 0.11874737590551376, "learning_rate": 2.4983295814251916e-05, "loss": 0.0065, "step": 13670 }, { "epoch": 37.073170731707314, "grad_norm": 0.14342182874679565, "learning_rate": 2.4911748654343105e-05, "loss": 0.0097, "step": 13680 }, { "epoch": 37.10027100271003, "grad_norm": 0.19524219632148743, "learning_rate": 2.4840270084755463e-05, "loss": 0.0092, "step": 13690 }, { "epoch": 37.12737127371274, "grad_norm": 0.19951020181179047, "learning_rate": 2.4768860300908685e-05, "loss": 0.0072, "step": 13700 }, { "epoch": 37.15447154471545, "grad_norm": 0.2914043664932251, "learning_rate": 2.469751949803443e-05, "loss": 0.0072, "step": 13710 }, { "epoch": 37.18157181571816, "grad_norm": 0.18333879113197327, "learning_rate": 2.4626247871175666e-05, "loss": 0.0067, "step": 13720 }, { "epoch": 37.20867208672087, "grad_norm": 0.13867692649364471, "learning_rate": 2.4555045615186346e-05, "loss": 0.0086, "step": 13730 }, { "epoch": 37.235772357723576, "grad_norm": 0.1423323005437851, "learning_rate": 2.4483912924730677e-05, "loss": 0.0064, "step": 13740 }, { "epoch": 37.262872628726285, "grad_norm": 0.13120156526565552, "learning_rate": 2.4412849994282742e-05, "loss": 0.0068, "step": 13750 }, { "epoch": 37.289972899728994, "grad_norm": 0.16872307658195496, "learning_rate": 2.434185701812592e-05, "loss": 0.0065, "step": 13760 }, { "epoch": 37.31707317073171, "grad_norm": 0.13427405059337616, "learning_rate": 2.4270934190352218e-05, "loss": 0.0082, "step": 13770 }, { "epoch": 37.34417344173442, "grad_norm": 0.11225791275501251, "learning_rate": 2.4200081704861998e-05, "loss": 0.0064, "step": 13780 }, { "epoch": 37.37127371273713, "grad_norm": 0.17972229421138763, "learning_rate": 2.412929975536321e-05, "loss": 0.0071, "step": 13790 }, { "epoch": 37.39837398373984, "grad_norm": 0.19697989523410797, "learning_rate": 2.4058588535371017e-05, "loss": 0.0102, "step": 13800 }, { "epoch": 37.42547425474255, "grad_norm": 0.159907728433609, "learning_rate": 2.3987948238207243e-05, "loss": 0.0071, "step": 13810 }, { "epoch": 37.452574525745256, "grad_norm": 0.42949384450912476, "learning_rate": 2.3917379056999678e-05, "loss": 0.008, "step": 13820 }, { "epoch": 37.479674796747965, "grad_norm": 0.29631245136260986, "learning_rate": 2.3846881184681824e-05, "loss": 0.008, "step": 13830 }, { "epoch": 37.50677506775068, "grad_norm": 0.17850087583065033, "learning_rate": 2.377645481399214e-05, "loss": 0.0056, "step": 13840 }, { "epoch": 37.53387533875339, "grad_norm": 0.21160969138145447, "learning_rate": 2.3706100137473667e-05, "loss": 0.007, "step": 13850 }, { "epoch": 37.5609756097561, "grad_norm": 0.19388577342033386, "learning_rate": 2.3635817347473394e-05, "loss": 0.0075, "step": 13860 }, { "epoch": 37.58807588075881, "grad_norm": 0.16958138346672058, "learning_rate": 2.3565606636141757e-05, "loss": 0.0059, "step": 13870 }, { "epoch": 37.61517615176152, "grad_norm": 0.27087029814720154, "learning_rate": 2.3495468195432203e-05, "loss": 0.0078, "step": 13880 }, { "epoch": 37.642276422764226, "grad_norm": 0.22888928651809692, "learning_rate": 2.3425402217100507e-05, "loss": 0.0049, "step": 13890 }, { "epoch": 37.669376693766935, "grad_norm": 0.27046632766723633, "learning_rate": 2.3355408892704424e-05, "loss": 0.0058, "step": 13900 }, { "epoch": 37.696476964769644, "grad_norm": 0.15023523569107056, "learning_rate": 2.3285488413603003e-05, "loss": 0.0091, "step": 13910 }, { "epoch": 37.72357723577236, "grad_norm": 0.1310701072216034, "learning_rate": 2.321564097095615e-05, "loss": 0.008, "step": 13920 }, { "epoch": 37.75067750677507, "grad_norm": 0.28465476632118225, "learning_rate": 2.3145866755724142e-05, "loss": 0.0062, "step": 13930 }, { "epoch": 37.77777777777778, "grad_norm": 0.18440328538417816, "learning_rate": 2.307616595866699e-05, "loss": 0.0058, "step": 13940 }, { "epoch": 37.80487804878049, "grad_norm": 0.20021019876003265, "learning_rate": 2.3006538770344032e-05, "loss": 0.0067, "step": 13950 }, { "epoch": 37.8319783197832, "grad_norm": 0.16673649847507477, "learning_rate": 2.293698538111334e-05, "loss": 0.0074, "step": 13960 }, { "epoch": 37.859078590785906, "grad_norm": 0.1467021107673645, "learning_rate": 2.28675059811312e-05, "loss": 0.0065, "step": 13970 }, { "epoch": 37.886178861788615, "grad_norm": 0.12668804824352264, "learning_rate": 2.279810076035167e-05, "loss": 0.0052, "step": 13980 }, { "epoch": 37.91327913279133, "grad_norm": 0.20830276608467102, "learning_rate": 2.272876990852596e-05, "loss": 0.0074, "step": 13990 }, { "epoch": 37.94037940379404, "grad_norm": 0.19295884668827057, "learning_rate": 2.265951361520195e-05, "loss": 0.0055, "step": 14000 }, { "epoch": 37.96747967479675, "grad_norm": 0.28534308075904846, "learning_rate": 2.2590332069723748e-05, "loss": 0.0072, "step": 14010 }, { "epoch": 37.99457994579946, "grad_norm": 0.18349437415599823, "learning_rate": 2.2521225461231004e-05, "loss": 0.0067, "step": 14020 }, { "epoch": 38.02168021680217, "grad_norm": 0.13209030032157898, "learning_rate": 2.2452193978658597e-05, "loss": 0.007, "step": 14030 }, { "epoch": 38.048780487804876, "grad_norm": 0.12526516616344452, "learning_rate": 2.238323781073594e-05, "loss": 0.007, "step": 14040 }, { "epoch": 38.075880758807585, "grad_norm": 0.13823744654655457, "learning_rate": 2.2314357145986552e-05, "loss": 0.0073, "step": 14050 }, { "epoch": 38.1029810298103, "grad_norm": 0.13085457682609558, "learning_rate": 2.224555217272757e-05, "loss": 0.0088, "step": 14060 }, { "epoch": 38.13008130081301, "grad_norm": 0.2153085321187973, "learning_rate": 2.2176823079069127e-05, "loss": 0.0063, "step": 14070 }, { "epoch": 38.15718157181572, "grad_norm": 0.14179079234600067, "learning_rate": 2.210817005291398e-05, "loss": 0.0063, "step": 14080 }, { "epoch": 38.18428184281843, "grad_norm": 0.27023130655288696, "learning_rate": 2.203959328195686e-05, "loss": 0.0087, "step": 14090 }, { "epoch": 38.21138211382114, "grad_norm": 0.1771167367696762, "learning_rate": 2.1971092953684026e-05, "loss": 0.0064, "step": 14100 }, { "epoch": 38.23848238482385, "grad_norm": 0.26843690872192383, "learning_rate": 2.1902669255372788e-05, "loss": 0.0075, "step": 14110 }, { "epoch": 38.265582655826556, "grad_norm": 0.13176365196704865, "learning_rate": 2.1834322374090897e-05, "loss": 0.0079, "step": 14120 }, { "epoch": 38.292682926829265, "grad_norm": 0.1854613721370697, "learning_rate": 2.1766052496696153e-05, "loss": 0.0082, "step": 14130 }, { "epoch": 38.31978319783198, "grad_norm": 0.1490379124879837, "learning_rate": 2.169785980983577e-05, "loss": 0.0087, "step": 14140 }, { "epoch": 38.34688346883469, "grad_norm": 0.1480277180671692, "learning_rate": 2.162974449994593e-05, "loss": 0.0069, "step": 14150 }, { "epoch": 38.3739837398374, "grad_norm": 0.212471142411232, "learning_rate": 2.1561706753251337e-05, "loss": 0.0114, "step": 14160 }, { "epoch": 38.40108401084011, "grad_norm": 0.16021981835365295, "learning_rate": 2.1493746755764544e-05, "loss": 0.0055, "step": 14170 }, { "epoch": 38.42818428184282, "grad_norm": 0.18070632219314575, "learning_rate": 2.1425864693285635e-05, "loss": 0.0082, "step": 14180 }, { "epoch": 38.45528455284553, "grad_norm": 0.6733684539794922, "learning_rate": 2.1358060751401547e-05, "loss": 0.0088, "step": 14190 }, { "epoch": 38.482384823848236, "grad_norm": 0.13270796835422516, "learning_rate": 2.129033511548566e-05, "loss": 0.0055, "step": 14200 }, { "epoch": 38.50948509485095, "grad_norm": 0.1656384915113449, "learning_rate": 2.1222687970697315e-05, "loss": 0.0071, "step": 14210 }, { "epoch": 38.53658536585366, "grad_norm": 0.2169555276632309, "learning_rate": 2.1155119501981173e-05, "loss": 0.0069, "step": 14220 }, { "epoch": 38.56368563685637, "grad_norm": 0.159426748752594, "learning_rate": 2.1087629894066895e-05, "loss": 0.0053, "step": 14230 }, { "epoch": 38.59078590785908, "grad_norm": 0.12559357285499573, "learning_rate": 2.1020219331468473e-05, "loss": 0.0048, "step": 14240 }, { "epoch": 38.61788617886179, "grad_norm": 0.16586895287036896, "learning_rate": 2.095288799848379e-05, "loss": 0.0066, "step": 14250 }, { "epoch": 38.6449864498645, "grad_norm": 0.5561702847480774, "learning_rate": 2.088563607919417e-05, "loss": 0.0059, "step": 14260 }, { "epoch": 38.672086720867206, "grad_norm": 0.22358690202236176, "learning_rate": 2.0818463757463786e-05, "loss": 0.0067, "step": 14270 }, { "epoch": 38.69918699186992, "grad_norm": 0.17953239381313324, "learning_rate": 2.0751371216939175e-05, "loss": 0.0051, "step": 14280 }, { "epoch": 38.72628726287263, "grad_norm": 0.11877129226922989, "learning_rate": 2.068435864104882e-05, "loss": 0.0083, "step": 14290 }, { "epoch": 38.75338753387534, "grad_norm": 0.11580316722393036, "learning_rate": 2.0617426213002506e-05, "loss": 0.0069, "step": 14300 }, { "epoch": 38.78048780487805, "grad_norm": 0.4734462797641754, "learning_rate": 2.055057411579097e-05, "loss": 0.0072, "step": 14310 }, { "epoch": 38.80758807588076, "grad_norm": 0.3175680339336395, "learning_rate": 2.0483802532185286e-05, "loss": 0.0066, "step": 14320 }, { "epoch": 38.83468834688347, "grad_norm": 0.18677327036857605, "learning_rate": 2.041711164473638e-05, "loss": 0.0059, "step": 14330 }, { "epoch": 38.86178861788618, "grad_norm": 0.12488168478012085, "learning_rate": 2.0350501635774637e-05, "loss": 0.0053, "step": 14340 }, { "epoch": 38.888888888888886, "grad_norm": 0.13501761853694916, "learning_rate": 2.0283972687409247e-05, "loss": 0.0086, "step": 14350 }, { "epoch": 38.9159891598916, "grad_norm": 0.16577276587486267, "learning_rate": 2.021752498152784e-05, "loss": 0.0079, "step": 14360 }, { "epoch": 38.94308943089431, "grad_norm": 0.1932990700006485, "learning_rate": 2.015115869979589e-05, "loss": 0.0059, "step": 14370 }, { "epoch": 38.97018970189702, "grad_norm": 0.13587765395641327, "learning_rate": 2.0084874023656265e-05, "loss": 0.0079, "step": 14380 }, { "epoch": 38.99728997289973, "grad_norm": 0.2684161067008972, "learning_rate": 2.001867113432877e-05, "loss": 0.0069, "step": 14390 }, { "epoch": 39.02439024390244, "grad_norm": 0.13740691542625427, "learning_rate": 1.995255021280954e-05, "loss": 0.0079, "step": 14400 }, { "epoch": 39.05149051490515, "grad_norm": 0.17065179347991943, "learning_rate": 1.9886511439870688e-05, "loss": 0.0051, "step": 14410 }, { "epoch": 39.078590785907856, "grad_norm": 0.17003221809864044, "learning_rate": 1.9820554996059675e-05, "loss": 0.0062, "step": 14420 }, { "epoch": 39.10569105691057, "grad_norm": 0.11185391247272491, "learning_rate": 1.9754681061698893e-05, "loss": 0.0066, "step": 14430 }, { "epoch": 39.13279132791328, "grad_norm": 0.24696999788284302, "learning_rate": 1.9688889816885185e-05, "loss": 0.0078, "step": 14440 }, { "epoch": 39.15989159891599, "grad_norm": 0.1950426548719406, "learning_rate": 1.962318144148928e-05, "loss": 0.0056, "step": 14450 }, { "epoch": 39.1869918699187, "grad_norm": 0.13897421956062317, "learning_rate": 1.955755611515539e-05, "loss": 0.0073, "step": 14460 }, { "epoch": 39.21409214092141, "grad_norm": 0.20904766023159027, "learning_rate": 1.9492014017300642e-05, "loss": 0.0056, "step": 14470 }, { "epoch": 39.24119241192412, "grad_norm": 0.14363570511341095, "learning_rate": 1.942655532711461e-05, "loss": 0.0078, "step": 14480 }, { "epoch": 39.26829268292683, "grad_norm": 0.15833771228790283, "learning_rate": 1.9361180223558882e-05, "loss": 0.0065, "step": 14490 }, { "epoch": 39.295392953929536, "grad_norm": 0.1422295868396759, "learning_rate": 1.929588888536647e-05, "loss": 0.0068, "step": 14500 }, { "epoch": 39.32249322493225, "grad_norm": 0.15797455608844757, "learning_rate": 1.9230681491041425e-05, "loss": 0.0064, "step": 14510 }, { "epoch": 39.34959349593496, "grad_norm": 0.10724328458309174, "learning_rate": 1.9165558218858264e-05, "loss": 0.0078, "step": 14520 }, { "epoch": 39.37669376693767, "grad_norm": 0.09831748902797699, "learning_rate": 1.9100519246861505e-05, "loss": 0.0051, "step": 14530 }, { "epoch": 39.40379403794038, "grad_norm": 0.22848092019557953, "learning_rate": 1.9035564752865248e-05, "loss": 0.0067, "step": 14540 }, { "epoch": 39.43089430894309, "grad_norm": 0.1680644452571869, "learning_rate": 1.897069491445258e-05, "loss": 0.0063, "step": 14550 }, { "epoch": 39.4579945799458, "grad_norm": 0.14080646634101868, "learning_rate": 1.890590990897515e-05, "loss": 0.0079, "step": 14560 }, { "epoch": 39.48509485094851, "grad_norm": 0.13812215626239777, "learning_rate": 1.884120991355272e-05, "loss": 0.0069, "step": 14570 }, { "epoch": 39.51219512195122, "grad_norm": 0.1742081642150879, "learning_rate": 1.8776595105072576e-05, "loss": 0.006, "step": 14580 }, { "epoch": 39.53929539295393, "grad_norm": 0.20583809912204742, "learning_rate": 1.8712065660189166e-05, "loss": 0.0075, "step": 14590 }, { "epoch": 39.56639566395664, "grad_norm": 0.12186416983604431, "learning_rate": 1.8647621755323513e-05, "loss": 0.0046, "step": 14600 }, { "epoch": 39.59349593495935, "grad_norm": 0.09667308628559113, "learning_rate": 1.858326356666278e-05, "loss": 0.0052, "step": 14610 }, { "epoch": 39.62059620596206, "grad_norm": 0.11311681568622589, "learning_rate": 1.851899127015983e-05, "loss": 0.0054, "step": 14620 }, { "epoch": 39.64769647696477, "grad_norm": 0.1594165563583374, "learning_rate": 1.8454805041532626e-05, "loss": 0.0057, "step": 14630 }, { "epoch": 39.67479674796748, "grad_norm": 0.11635188013315201, "learning_rate": 1.8390705056263906e-05, "loss": 0.0069, "step": 14640 }, { "epoch": 39.70189701897019, "grad_norm": 0.164677694439888, "learning_rate": 1.832669148960057e-05, "loss": 0.0067, "step": 14650 }, { "epoch": 39.7289972899729, "grad_norm": 0.09936928749084473, "learning_rate": 1.8262764516553233e-05, "loss": 0.0075, "step": 14660 }, { "epoch": 39.75609756097561, "grad_norm": 0.27220606803894043, "learning_rate": 1.8198924311895843e-05, "loss": 0.0057, "step": 14670 }, { "epoch": 39.78319783197832, "grad_norm": 0.19956834614276886, "learning_rate": 1.813517105016505e-05, "loss": 0.0074, "step": 14680 }, { "epoch": 39.81029810298103, "grad_norm": 0.10844115912914276, "learning_rate": 1.8071504905659888e-05, "loss": 0.0065, "step": 14690 }, { "epoch": 39.83739837398374, "grad_norm": 0.20101509988307953, "learning_rate": 1.800792605244109e-05, "loss": 0.0056, "step": 14700 }, { "epoch": 39.86449864498645, "grad_norm": 0.29316991567611694, "learning_rate": 1.7944434664330844e-05, "loss": 0.0066, "step": 14710 }, { "epoch": 39.89159891598916, "grad_norm": 0.18381386995315552, "learning_rate": 1.7881030914912212e-05, "loss": 0.0059, "step": 14720 }, { "epoch": 39.91869918699187, "grad_norm": 0.14670105278491974, "learning_rate": 1.7817714977528577e-05, "loss": 0.0069, "step": 14730 }, { "epoch": 39.94579945799458, "grad_norm": 0.27042171359062195, "learning_rate": 1.7754487025283332e-05, "loss": 0.0072, "step": 14740 }, { "epoch": 39.97289972899729, "grad_norm": 0.4190824329853058, "learning_rate": 1.7691347231039275e-05, "loss": 0.0053, "step": 14750 }, { "epoch": 40.0, "grad_norm": 0.12946806848049164, "learning_rate": 1.7628295767418164e-05, "loss": 0.0067, "step": 14760 }, { "epoch": 40.02710027100271, "grad_norm": 0.2278592586517334, "learning_rate": 1.7565332806800333e-05, "loss": 0.007, "step": 14770 }, { "epoch": 40.05420054200542, "grad_norm": 0.1610528528690338, "learning_rate": 1.750245852132408e-05, "loss": 0.006, "step": 14780 }, { "epoch": 40.08130081300813, "grad_norm": 0.1977321058511734, "learning_rate": 1.7439673082885323e-05, "loss": 0.0054, "step": 14790 }, { "epoch": 40.10840108401084, "grad_norm": 0.08471622318029404, "learning_rate": 1.7376976663137047e-05, "loss": 0.0067, "step": 14800 }, { "epoch": 40.13550135501355, "grad_norm": 0.18286339938640594, "learning_rate": 1.7314369433488853e-05, "loss": 0.0058, "step": 14810 }, { "epoch": 40.16260162601626, "grad_norm": 0.4733876883983612, "learning_rate": 1.7251851565106548e-05, "loss": 0.0062, "step": 14820 }, { "epoch": 40.18970189701897, "grad_norm": 0.22591553628444672, "learning_rate": 1.7189423228911574e-05, "loss": 0.0058, "step": 14830 }, { "epoch": 40.21680216802168, "grad_norm": 0.09818581491708755, "learning_rate": 1.7127084595580606e-05, "loss": 0.007, "step": 14840 }, { "epoch": 40.24390243902439, "grad_norm": 0.13922473788261414, "learning_rate": 1.706483583554513e-05, "loss": 0.0085, "step": 14850 }, { "epoch": 40.2710027100271, "grad_norm": 0.2723208963871002, "learning_rate": 1.700267711899083e-05, "loss": 0.0081, "step": 14860 }, { "epoch": 40.29810298102981, "grad_norm": 0.10639534890651703, "learning_rate": 1.69406086158573e-05, "loss": 0.0058, "step": 14870 }, { "epoch": 40.32520325203252, "grad_norm": 0.09317772090435028, "learning_rate": 1.6878630495837455e-05, "loss": 0.0049, "step": 14880 }, { "epoch": 40.35230352303523, "grad_norm": 0.356636106967926, "learning_rate": 1.681674292837707e-05, "loss": 0.0052, "step": 14890 }, { "epoch": 40.37940379403794, "grad_norm": 0.19073514640331268, "learning_rate": 1.6754946082674444e-05, "loss": 0.0068, "step": 14900 }, { "epoch": 40.40650406504065, "grad_norm": 0.08838856965303421, "learning_rate": 1.6693240127679748e-05, "loss": 0.0049, "step": 14910 }, { "epoch": 40.43360433604336, "grad_norm": 0.30231767892837524, "learning_rate": 1.663162523209475e-05, "loss": 0.0069, "step": 14920 }, { "epoch": 40.46070460704607, "grad_norm": 0.2967580258846283, "learning_rate": 1.6570101564372193e-05, "loss": 0.0072, "step": 14930 }, { "epoch": 40.48780487804878, "grad_norm": 0.18485017120838165, "learning_rate": 1.650866929271543e-05, "loss": 0.0078, "step": 14940 }, { "epoch": 40.51490514905149, "grad_norm": 0.16824975609779358, "learning_rate": 1.644732858507797e-05, "loss": 0.0053, "step": 14950 }, { "epoch": 40.5420054200542, "grad_norm": 0.25375404953956604, "learning_rate": 1.6386079609162943e-05, "loss": 0.0059, "step": 14960 }, { "epoch": 40.56910569105691, "grad_norm": 0.15846814215183258, "learning_rate": 1.6324922532422742e-05, "loss": 0.0057, "step": 14970 }, { "epoch": 40.59620596205962, "grad_norm": 0.2749636173248291, "learning_rate": 1.6263857522058434e-05, "loss": 0.0062, "step": 14980 }, { "epoch": 40.62330623306233, "grad_norm": 0.15305157005786896, "learning_rate": 1.6202884745019443e-05, "loss": 0.0048, "step": 14990 }, { "epoch": 40.65040650406504, "grad_norm": 0.2381352186203003, "learning_rate": 1.614200436800304e-05, "loss": 0.0067, "step": 15000 }, { "epoch": 40.67750677506775, "grad_norm": 0.15931585431098938, "learning_rate": 1.6081216557453814e-05, "loss": 0.0079, "step": 15010 }, { "epoch": 40.704607046070464, "grad_norm": 0.2737705409526825, "learning_rate": 1.6020521479563367e-05, "loss": 0.0095, "step": 15020 }, { "epoch": 40.73170731707317, "grad_norm": 0.2357729822397232, "learning_rate": 1.5959919300269654e-05, "loss": 0.007, "step": 15030 }, { "epoch": 40.75880758807588, "grad_norm": 0.13825517892837524, "learning_rate": 1.5899410185256764e-05, "loss": 0.0053, "step": 15040 }, { "epoch": 40.78590785907859, "grad_norm": 0.17495086789131165, "learning_rate": 1.583899429995431e-05, "loss": 0.0055, "step": 15050 }, { "epoch": 40.8130081300813, "grad_norm": 0.10616852343082428, "learning_rate": 1.5778671809536993e-05, "loss": 0.0054, "step": 15060 }, { "epoch": 40.84010840108401, "grad_norm": 0.09753458201885223, "learning_rate": 1.5718442878924246e-05, "loss": 0.0056, "step": 15070 }, { "epoch": 40.86720867208672, "grad_norm": 0.13864010572433472, "learning_rate": 1.5658307672779593e-05, "loss": 0.0085, "step": 15080 }, { "epoch": 40.89430894308943, "grad_norm": 0.13822434842586517, "learning_rate": 1.5598266355510427e-05, "loss": 0.0051, "step": 15090 }, { "epoch": 40.921409214092144, "grad_norm": 0.21132275462150574, "learning_rate": 1.553831909126744e-05, "loss": 0.0077, "step": 15100 }, { "epoch": 40.94850948509485, "grad_norm": 0.19039276242256165, "learning_rate": 1.5478466043944135e-05, "loss": 0.0067, "step": 15110 }, { "epoch": 40.97560975609756, "grad_norm": 0.11550827324390411, "learning_rate": 1.5418707377176468e-05, "loss": 0.0061, "step": 15120 }, { "epoch": 41.00271002710027, "grad_norm": 0.18926581740379333, "learning_rate": 1.535904325434233e-05, "loss": 0.0072, "step": 15130 }, { "epoch": 41.02981029810298, "grad_norm": 0.1374940723180771, "learning_rate": 1.529947383856118e-05, "loss": 0.0059, "step": 15140 }, { "epoch": 41.05691056910569, "grad_norm": 0.12394358962774277, "learning_rate": 1.5239999292693524e-05, "loss": 0.0057, "step": 15150 }, { "epoch": 41.0840108401084, "grad_norm": 0.17972442507743835, "learning_rate": 1.5180619779340505e-05, "loss": 0.0057, "step": 15160 }, { "epoch": 41.111111111111114, "grad_norm": 0.19728465378284454, "learning_rate": 1.5121335460843428e-05, "loss": 0.0051, "step": 15170 }, { "epoch": 41.13821138211382, "grad_norm": 0.14743106067180634, "learning_rate": 1.5062146499283347e-05, "loss": 0.0095, "step": 15180 }, { "epoch": 41.16531165311653, "grad_norm": 0.18473105132579803, "learning_rate": 1.5003053056480643e-05, "loss": 0.0044, "step": 15190 }, { "epoch": 41.19241192411924, "grad_norm": 0.22731715440750122, "learning_rate": 1.4944055293994551e-05, "loss": 0.0066, "step": 15200 }, { "epoch": 41.21951219512195, "grad_norm": 0.284103125333786, "learning_rate": 1.4885153373122656e-05, "loss": 0.0076, "step": 15210 }, { "epoch": 41.24661246612466, "grad_norm": 0.12830741703510284, "learning_rate": 1.482634745490059e-05, "loss": 0.0073, "step": 15220 }, { "epoch": 41.27371273712737, "grad_norm": 0.23458468914031982, "learning_rate": 1.4767637700101466e-05, "loss": 0.0102, "step": 15230 }, { "epoch": 41.300813008130085, "grad_norm": 0.1784856915473938, "learning_rate": 1.4709024269235528e-05, "loss": 0.0055, "step": 15240 }, { "epoch": 41.327913279132794, "grad_norm": 0.19092804193496704, "learning_rate": 1.4650507322549684e-05, "loss": 0.0062, "step": 15250 }, { "epoch": 41.3550135501355, "grad_norm": 0.1643650233745575, "learning_rate": 1.4592087020026972e-05, "loss": 0.0063, "step": 15260 }, { "epoch": 41.38211382113821, "grad_norm": 0.23264330625534058, "learning_rate": 1.4533763521386318e-05, "loss": 0.0065, "step": 15270 }, { "epoch": 41.40921409214092, "grad_norm": 0.1387760192155838, "learning_rate": 1.44755369860819e-05, "loss": 0.0056, "step": 15280 }, { "epoch": 41.43631436314363, "grad_norm": 0.1651674062013626, "learning_rate": 1.441740757330287e-05, "loss": 0.0081, "step": 15290 }, { "epoch": 41.46341463414634, "grad_norm": 0.1573343425989151, "learning_rate": 1.4359375441972844e-05, "loss": 0.0064, "step": 15300 }, { "epoch": 41.49051490514905, "grad_norm": 0.1500045359134674, "learning_rate": 1.4301440750749395e-05, "loss": 0.0064, "step": 15310 }, { "epoch": 41.517615176151764, "grad_norm": 0.1026371568441391, "learning_rate": 1.4243603658023808e-05, "loss": 0.0046, "step": 15320 }, { "epoch": 41.54471544715447, "grad_norm": 0.17724239826202393, "learning_rate": 1.4185864321920444e-05, "loss": 0.0098, "step": 15330 }, { "epoch": 41.57181571815718, "grad_norm": 0.1293010115623474, "learning_rate": 1.4128222900296485e-05, "loss": 0.0054, "step": 15340 }, { "epoch": 41.59891598915989, "grad_norm": 0.19513213634490967, "learning_rate": 1.407067955074135e-05, "loss": 0.0066, "step": 15350 }, { "epoch": 41.6260162601626, "grad_norm": 0.1645292043685913, "learning_rate": 1.4013234430576356e-05, "loss": 0.0074, "step": 15360 }, { "epoch": 41.65311653116531, "grad_norm": 0.19021093845367432, "learning_rate": 1.3955887696854286e-05, "loss": 0.0057, "step": 15370 }, { "epoch": 41.68021680216802, "grad_norm": 0.10806591063737869, "learning_rate": 1.38986395063589e-05, "loss": 0.0067, "step": 15380 }, { "epoch": 41.707317073170735, "grad_norm": 0.10623554140329361, "learning_rate": 1.3841490015604597e-05, "loss": 0.0078, "step": 15390 }, { "epoch": 41.734417344173444, "grad_norm": 0.2214043140411377, "learning_rate": 1.3784439380835879e-05, "loss": 0.0047, "step": 15400 }, { "epoch": 41.76151761517615, "grad_norm": 0.13813938200473785, "learning_rate": 1.3727487758026986e-05, "loss": 0.0079, "step": 15410 }, { "epoch": 41.78861788617886, "grad_norm": 0.094864122569561, "learning_rate": 1.3670635302881525e-05, "loss": 0.0068, "step": 15420 }, { "epoch": 41.81571815718157, "grad_norm": 0.18754640221595764, "learning_rate": 1.3613882170831888e-05, "loss": 0.0061, "step": 15430 }, { "epoch": 41.84281842818428, "grad_norm": 0.10638555884361267, "learning_rate": 1.355722851703901e-05, "loss": 0.0052, "step": 15440 }, { "epoch": 41.86991869918699, "grad_norm": 0.17453154921531677, "learning_rate": 1.3500674496391814e-05, "loss": 0.0047, "step": 15450 }, { "epoch": 41.8970189701897, "grad_norm": 0.14862652122974396, "learning_rate": 1.3444220263506795e-05, "loss": 0.0043, "step": 15460 }, { "epoch": 41.924119241192415, "grad_norm": 0.1318141520023346, "learning_rate": 1.3387865972727714e-05, "loss": 0.0051, "step": 15470 }, { "epoch": 41.951219512195124, "grad_norm": 0.13167433440685272, "learning_rate": 1.3331611778125036e-05, "loss": 0.0061, "step": 15480 }, { "epoch": 41.97831978319783, "grad_norm": 0.17797520756721497, "learning_rate": 1.3275457833495564e-05, "loss": 0.0068, "step": 15490 }, { "epoch": 42.00542005420054, "grad_norm": 0.3770260214805603, "learning_rate": 1.3219404292362065e-05, "loss": 0.0072, "step": 15500 }, { "epoch": 42.03252032520325, "grad_norm": 0.1358630210161209, "learning_rate": 1.3163451307972751e-05, "loss": 0.0055, "step": 15510 }, { "epoch": 42.05962059620596, "grad_norm": 0.22232897579669952, "learning_rate": 1.3107599033300977e-05, "loss": 0.0059, "step": 15520 }, { "epoch": 42.08672086720867, "grad_norm": 0.16818757355213165, "learning_rate": 1.305184762104471e-05, "loss": 0.0057, "step": 15530 }, { "epoch": 42.113821138211385, "grad_norm": 0.27974140644073486, "learning_rate": 1.2996197223626178e-05, "loss": 0.006, "step": 15540 }, { "epoch": 42.140921409214094, "grad_norm": 0.17872421443462372, "learning_rate": 1.2940647993191457e-05, "loss": 0.0046, "step": 15550 }, { "epoch": 42.1680216802168, "grad_norm": 0.21095164120197296, "learning_rate": 1.2885200081610005e-05, "loss": 0.0047, "step": 15560 }, { "epoch": 42.19512195121951, "grad_norm": 0.13203759491443634, "learning_rate": 1.2829853640474316e-05, "loss": 0.0055, "step": 15570 }, { "epoch": 42.22222222222222, "grad_norm": 0.165818989276886, "learning_rate": 1.2774608821099438e-05, "loss": 0.0052, "step": 15580 }, { "epoch": 42.24932249322493, "grad_norm": 0.1977515071630478, "learning_rate": 1.2719465774522577e-05, "loss": 0.009, "step": 15590 }, { "epoch": 42.27642276422764, "grad_norm": 0.19988669455051422, "learning_rate": 1.2664424651502755e-05, "loss": 0.007, "step": 15600 }, { "epoch": 42.303523035230356, "grad_norm": 0.16021305322647095, "learning_rate": 1.260948560252026e-05, "loss": 0.0067, "step": 15610 }, { "epoch": 42.330623306233065, "grad_norm": 0.156336709856987, "learning_rate": 1.2554648777776396e-05, "loss": 0.007, "step": 15620 }, { "epoch": 42.357723577235774, "grad_norm": 0.13185253739356995, "learning_rate": 1.2499914327192919e-05, "loss": 0.006, "step": 15630 }, { "epoch": 42.38482384823848, "grad_norm": 0.17337509989738464, "learning_rate": 1.2445282400411722e-05, "loss": 0.004, "step": 15640 }, { "epoch": 42.41192411924119, "grad_norm": 0.11312593519687653, "learning_rate": 1.2390753146794437e-05, "loss": 0.0042, "step": 15650 }, { "epoch": 42.4390243902439, "grad_norm": 0.11690627783536911, "learning_rate": 1.2336326715421925e-05, "loss": 0.0069, "step": 15660 }, { "epoch": 42.46612466124661, "grad_norm": 0.13633222877979279, "learning_rate": 1.2282003255094005e-05, "loss": 0.005, "step": 15670 }, { "epoch": 42.49322493224932, "grad_norm": 0.18237242102622986, "learning_rate": 1.2227782914328928e-05, "loss": 0.0072, "step": 15680 }, { "epoch": 42.520325203252035, "grad_norm": 0.13380283117294312, "learning_rate": 1.2173665841363018e-05, "loss": 0.005, "step": 15690 }, { "epoch": 42.547425474254744, "grad_norm": 0.13192123174667358, "learning_rate": 1.211965218415032e-05, "loss": 0.0069, "step": 15700 }, { "epoch": 42.57452574525745, "grad_norm": 0.09512990713119507, "learning_rate": 1.2065742090362082e-05, "loss": 0.0076, "step": 15710 }, { "epoch": 42.60162601626016, "grad_norm": 0.09815787523984909, "learning_rate": 1.2011935707386457e-05, "loss": 0.0054, "step": 15720 }, { "epoch": 42.62872628726287, "grad_norm": 0.2036513090133667, "learning_rate": 1.1958233182328044e-05, "loss": 0.0089, "step": 15730 }, { "epoch": 42.65582655826558, "grad_norm": 0.1400594264268875, "learning_rate": 1.1904634662007474e-05, "loss": 0.0059, "step": 15740 }, { "epoch": 42.68292682926829, "grad_norm": 0.32511183619499207, "learning_rate": 1.1851140292961088e-05, "loss": 0.0056, "step": 15750 }, { "epoch": 42.710027100271006, "grad_norm": 0.11255430430173874, "learning_rate": 1.1797750221440424e-05, "loss": 0.0078, "step": 15760 }, { "epoch": 42.737127371273715, "grad_norm": 0.148197203874588, "learning_rate": 1.1744464593411897e-05, "loss": 0.006, "step": 15770 }, { "epoch": 42.764227642276424, "grad_norm": 0.1130661889910698, "learning_rate": 1.1691283554556399e-05, "loss": 0.0049, "step": 15780 }, { "epoch": 42.79132791327913, "grad_norm": 0.18451468646526337, "learning_rate": 1.1638207250268834e-05, "loss": 0.0081, "step": 15790 }, { "epoch": 42.81842818428184, "grad_norm": 0.09941829741001129, "learning_rate": 1.158523582565782e-05, "loss": 0.0044, "step": 15800 }, { "epoch": 42.84552845528455, "grad_norm": 0.1289730817079544, "learning_rate": 1.1532369425545192e-05, "loss": 0.0081, "step": 15810 }, { "epoch": 42.87262872628726, "grad_norm": 0.1633101999759674, "learning_rate": 1.1479608194465662e-05, "loss": 0.0062, "step": 15820 }, { "epoch": 42.89972899728997, "grad_norm": 0.20012009143829346, "learning_rate": 1.1426952276666442e-05, "loss": 0.0057, "step": 15830 }, { "epoch": 42.926829268292686, "grad_norm": 0.15768368542194366, "learning_rate": 1.1374401816106778e-05, "loss": 0.0053, "step": 15840 }, { "epoch": 42.953929539295395, "grad_norm": 0.11256875097751617, "learning_rate": 1.1321956956457646e-05, "loss": 0.0053, "step": 15850 }, { "epoch": 42.981029810298104, "grad_norm": 0.0994715765118599, "learning_rate": 1.1269617841101277e-05, "loss": 0.0079, "step": 15860 }, { "epoch": 43.00813008130081, "grad_norm": 0.3231642544269562, "learning_rate": 1.1217384613130804e-05, "loss": 0.0099, "step": 15870 }, { "epoch": 43.03523035230352, "grad_norm": 0.16685017943382263, "learning_rate": 1.11652574153499e-05, "loss": 0.0055, "step": 15880 }, { "epoch": 43.06233062330623, "grad_norm": 0.10406318306922913, "learning_rate": 1.1113236390272303e-05, "loss": 0.0051, "step": 15890 }, { "epoch": 43.08943089430894, "grad_norm": 0.12919719517230988, "learning_rate": 1.106132168012155e-05, "loss": 0.0058, "step": 15900 }, { "epoch": 43.116531165311656, "grad_norm": 0.25328418612480164, "learning_rate": 1.1009513426830448e-05, "loss": 0.0064, "step": 15910 }, { "epoch": 43.143631436314365, "grad_norm": 0.13947467505931854, "learning_rate": 1.0957811772040777e-05, "loss": 0.0047, "step": 15920 }, { "epoch": 43.170731707317074, "grad_norm": 0.2402535080909729, "learning_rate": 1.0906216857102913e-05, "loss": 0.0089, "step": 15930 }, { "epoch": 43.19783197831978, "grad_norm": 0.291524201631546, "learning_rate": 1.0854728823075355e-05, "loss": 0.0073, "step": 15940 }, { "epoch": 43.22493224932249, "grad_norm": 0.161453515291214, "learning_rate": 1.0803347810724452e-05, "loss": 0.0059, "step": 15950 }, { "epoch": 43.2520325203252, "grad_norm": 0.11064847558736801, "learning_rate": 1.0752073960523911e-05, "loss": 0.0061, "step": 15960 }, { "epoch": 43.27913279132791, "grad_norm": 0.25043243169784546, "learning_rate": 1.070090741265447e-05, "loss": 0.0089, "step": 15970 }, { "epoch": 43.30623306233063, "grad_norm": 0.19931381940841675, "learning_rate": 1.0649848307003547e-05, "loss": 0.0064, "step": 15980 }, { "epoch": 43.333333333333336, "grad_norm": 0.16349555552005768, "learning_rate": 1.0598896783164757e-05, "loss": 0.0064, "step": 15990 }, { "epoch": 43.360433604336045, "grad_norm": 0.40111517906188965, "learning_rate": 1.0548052980437645e-05, "loss": 0.0069, "step": 16000 }, { "epoch": 43.387533875338754, "grad_norm": 0.3496374189853668, "learning_rate": 1.049731703782722e-05, "loss": 0.007, "step": 16010 }, { "epoch": 43.41463414634146, "grad_norm": 0.15300680696964264, "learning_rate": 1.0446689094043587e-05, "loss": 0.0064, "step": 16020 }, { "epoch": 43.44173441734417, "grad_norm": 0.18439817428588867, "learning_rate": 1.039616928750165e-05, "loss": 0.0069, "step": 16030 }, { "epoch": 43.46883468834688, "grad_norm": 0.15217368304729462, "learning_rate": 1.0345757756320612e-05, "loss": 0.0076, "step": 16040 }, { "epoch": 43.49593495934959, "grad_norm": 0.269040048122406, "learning_rate": 1.0295454638323666e-05, "loss": 0.0099, "step": 16050 }, { "epoch": 43.523035230352306, "grad_norm": 0.11595907807350159, "learning_rate": 1.0245260071037632e-05, "loss": 0.0047, "step": 16060 }, { "epoch": 43.550135501355015, "grad_norm": 0.13107401132583618, "learning_rate": 1.0195174191692518e-05, "loss": 0.0057, "step": 16070 }, { "epoch": 43.577235772357724, "grad_norm": 0.13428780436515808, "learning_rate": 1.014519713722124e-05, "loss": 0.0047, "step": 16080 }, { "epoch": 43.60433604336043, "grad_norm": 0.2077709287405014, "learning_rate": 1.0095329044259132e-05, "loss": 0.0063, "step": 16090 }, { "epoch": 43.63143631436314, "grad_norm": 0.07851365953683853, "learning_rate": 1.004557004914365e-05, "loss": 0.0053, "step": 16100 }, { "epoch": 43.65853658536585, "grad_norm": 0.34477925300598145, "learning_rate": 9.995920287914007e-06, "loss": 0.0051, "step": 16110 }, { "epoch": 43.68563685636856, "grad_norm": 0.20998863875865936, "learning_rate": 9.946379896310737e-06, "loss": 0.005, "step": 16120 }, { "epoch": 43.71273712737128, "grad_norm": 0.12561562657356262, "learning_rate": 9.896949009775396e-06, "loss": 0.0063, "step": 16130 }, { "epoch": 43.739837398373986, "grad_norm": 0.1736477166414261, "learning_rate": 9.847627763450134e-06, "loss": 0.0056, "step": 16140 }, { "epoch": 43.766937669376695, "grad_norm": 0.0867067500948906, "learning_rate": 9.798416292177337e-06, "loss": 0.0052, "step": 16150 }, { "epoch": 43.794037940379404, "grad_norm": 0.1699327677488327, "learning_rate": 9.74931473049932e-06, "loss": 0.0069, "step": 16160 }, { "epoch": 43.82113821138211, "grad_norm": 0.11137860268354416, "learning_rate": 9.700323212657847e-06, "loss": 0.0061, "step": 16170 }, { "epoch": 43.84823848238482, "grad_norm": 0.07396666705608368, "learning_rate": 9.65144187259388e-06, "loss": 0.0042, "step": 16180 }, { "epoch": 43.87533875338753, "grad_norm": 0.16854752600193024, "learning_rate": 9.602670843947132e-06, "loss": 0.0045, "step": 16190 }, { "epoch": 43.90243902439025, "grad_norm": 0.16244308650493622, "learning_rate": 9.554010260055713e-06, "loss": 0.0055, "step": 16200 }, { "epoch": 43.929539295392956, "grad_norm": 0.22828274965286255, "learning_rate": 9.505460253955834e-06, "loss": 0.0073, "step": 16210 }, { "epoch": 43.956639566395665, "grad_norm": 0.12531310319900513, "learning_rate": 9.457020958381324e-06, "loss": 0.0056, "step": 16220 }, { "epoch": 43.983739837398375, "grad_norm": 0.14961346983909607, "learning_rate": 9.408692505763395e-06, "loss": 0.006, "step": 16230 }, { "epoch": 44.010840108401084, "grad_norm": 0.13769058883190155, "learning_rate": 9.360475028230181e-06, "loss": 0.0053, "step": 16240 }, { "epoch": 44.03794037940379, "grad_norm": 0.19581179320812225, "learning_rate": 9.312368657606412e-06, "loss": 0.0049, "step": 16250 }, { "epoch": 44.0650406504065, "grad_norm": 0.332977294921875, "learning_rate": 9.264373525413096e-06, "loss": 0.0068, "step": 16260 }, { "epoch": 44.09214092140921, "grad_norm": 0.13656285405158997, "learning_rate": 9.216489762867058e-06, "loss": 0.0063, "step": 16270 }, { "epoch": 44.11924119241193, "grad_norm": 0.10972703248262405, "learning_rate": 9.168717500880708e-06, "loss": 0.0045, "step": 16280 }, { "epoch": 44.146341463414636, "grad_norm": 0.09800173342227936, "learning_rate": 9.121056870061574e-06, "loss": 0.0043, "step": 16290 }, { "epoch": 44.173441734417345, "grad_norm": 0.26469528675079346, "learning_rate": 9.073508000711983e-06, "loss": 0.006, "step": 16300 }, { "epoch": 44.200542005420054, "grad_norm": 0.12709784507751465, "learning_rate": 9.026071022828758e-06, "loss": 0.0047, "step": 16310 }, { "epoch": 44.22764227642276, "grad_norm": 0.2942989468574524, "learning_rate": 8.978746066102771e-06, "loss": 0.0065, "step": 16320 }, { "epoch": 44.25474254742547, "grad_norm": 0.08226390928030014, "learning_rate": 8.931533259918634e-06, "loss": 0.0073, "step": 16330 }, { "epoch": 44.28184281842818, "grad_norm": 0.20180447399616241, "learning_rate": 8.884432733354382e-06, "loss": 0.0054, "step": 16340 }, { "epoch": 44.3089430894309, "grad_norm": 0.2385869175195694, "learning_rate": 8.837444615181029e-06, "loss": 0.0076, "step": 16350 }, { "epoch": 44.33604336043361, "grad_norm": 0.08991343528032303, "learning_rate": 8.790569033862323e-06, "loss": 0.0067, "step": 16360 }, { "epoch": 44.363143631436316, "grad_norm": 0.22100304067134857, "learning_rate": 8.7438061175543e-06, "loss": 0.0072, "step": 16370 }, { "epoch": 44.390243902439025, "grad_norm": 0.21797777712345123, "learning_rate": 8.697155994104978e-06, "loss": 0.0056, "step": 16380 }, { "epoch": 44.417344173441734, "grad_norm": 0.16675469279289246, "learning_rate": 8.650618791054033e-06, "loss": 0.0048, "step": 16390 }, { "epoch": 44.44444444444444, "grad_norm": 0.123349130153656, "learning_rate": 8.604194635632373e-06, "loss": 0.0056, "step": 16400 }, { "epoch": 44.47154471544715, "grad_norm": 0.5237106084823608, "learning_rate": 8.557883654761906e-06, "loss": 0.0068, "step": 16410 }, { "epoch": 44.49864498644986, "grad_norm": 0.07697708159685135, "learning_rate": 8.511685975055061e-06, "loss": 0.0042, "step": 16420 }, { "epoch": 44.52574525745258, "grad_norm": 0.26826637983322144, "learning_rate": 8.46560172281452e-06, "loss": 0.0056, "step": 16430 }, { "epoch": 44.552845528455286, "grad_norm": 0.19480153918266296, "learning_rate": 8.419631024032893e-06, "loss": 0.0062, "step": 16440 }, { "epoch": 44.579945799457995, "grad_norm": 0.10711710155010223, "learning_rate": 8.373774004392293e-06, "loss": 0.006, "step": 16450 }, { "epoch": 44.607046070460704, "grad_norm": 0.11224622279405594, "learning_rate": 8.32803078926409e-06, "loss": 0.0075, "step": 16460 }, { "epoch": 44.63414634146341, "grad_norm": 0.22442279756069183, "learning_rate": 8.282401503708454e-06, "loss": 0.0058, "step": 16470 }, { "epoch": 44.66124661246612, "grad_norm": 0.15553249418735504, "learning_rate": 8.23688627247412e-06, "loss": 0.0081, "step": 16480 }, { "epoch": 44.68834688346883, "grad_norm": 0.08998789638280869, "learning_rate": 8.191485219998007e-06, "loss": 0.0044, "step": 16490 }, { "epoch": 44.71544715447155, "grad_norm": 0.21985961496829987, "learning_rate": 8.146198470404843e-06, "loss": 0.0081, "step": 16500 }, { "epoch": 44.74254742547426, "grad_norm": 0.1146453320980072, "learning_rate": 8.101026147506897e-06, "loss": 0.0043, "step": 16510 }, { "epoch": 44.769647696476966, "grad_norm": 0.1846509575843811, "learning_rate": 8.05596837480353e-06, "loss": 0.0065, "step": 16520 }, { "epoch": 44.796747967479675, "grad_norm": 0.11517076939344406, "learning_rate": 8.011025275480998e-06, "loss": 0.0065, "step": 16530 }, { "epoch": 44.823848238482384, "grad_norm": 0.1309705525636673, "learning_rate": 7.966196972412027e-06, "loss": 0.0067, "step": 16540 }, { "epoch": 44.85094850948509, "grad_norm": 0.14068907499313354, "learning_rate": 7.92148358815547e-06, "loss": 0.0049, "step": 16550 }, { "epoch": 44.8780487804878, "grad_norm": 0.15621615946292877, "learning_rate": 7.87688524495604e-06, "loss": 0.0052, "step": 16560 }, { "epoch": 44.90514905149052, "grad_norm": 0.3268279731273651, "learning_rate": 7.83240206474386e-06, "loss": 0.0043, "step": 16570 }, { "epoch": 44.93224932249323, "grad_norm": 0.15452805161476135, "learning_rate": 7.788034169134272e-06, "loss": 0.0049, "step": 16580 }, { "epoch": 44.959349593495936, "grad_norm": 0.16033874452114105, "learning_rate": 7.743781679427414e-06, "loss": 0.0064, "step": 16590 }, { "epoch": 44.986449864498645, "grad_norm": 0.1832730770111084, "learning_rate": 7.699644716607895e-06, "loss": 0.0122, "step": 16600 }, { "epoch": 45.013550135501355, "grad_norm": 0.1255677491426468, "learning_rate": 7.655623401344486e-06, "loss": 0.0049, "step": 16610 }, { "epoch": 45.040650406504064, "grad_norm": 0.17634032666683197, "learning_rate": 7.611717853989775e-06, "loss": 0.0083, "step": 16620 }, { "epoch": 45.06775067750677, "grad_norm": 0.10568185895681381, "learning_rate": 7.567928194579854e-06, "loss": 0.0061, "step": 16630 }, { "epoch": 45.09485094850948, "grad_norm": 0.2042877972126007, "learning_rate": 7.524254542833997e-06, "loss": 0.0051, "step": 16640 }, { "epoch": 45.1219512195122, "grad_norm": 0.14467550814151764, "learning_rate": 7.480697018154286e-06, "loss": 0.0046, "step": 16650 }, { "epoch": 45.14905149051491, "grad_norm": 0.1753573715686798, "learning_rate": 7.437255739625332e-06, "loss": 0.0049, "step": 16660 }, { "epoch": 45.176151761517616, "grad_norm": 0.22939461469650269, "learning_rate": 7.393930826013923e-06, "loss": 0.0049, "step": 16670 }, { "epoch": 45.203252032520325, "grad_norm": 0.2235630750656128, "learning_rate": 7.350722395768722e-06, "loss": 0.0049, "step": 16680 }, { "epoch": 45.230352303523034, "grad_norm": 0.0906524658203125, "learning_rate": 7.307630567019963e-06, "loss": 0.0049, "step": 16690 }, { "epoch": 45.25745257452574, "grad_norm": 0.15578493475914001, "learning_rate": 7.264655457579e-06, "loss": 0.0058, "step": 16700 }, { "epoch": 45.28455284552845, "grad_norm": 0.08713225275278091, "learning_rate": 7.221797184938184e-06, "loss": 0.0069, "step": 16710 }, { "epoch": 45.31165311653117, "grad_norm": 0.19288066029548645, "learning_rate": 7.179055866270373e-06, "loss": 0.006, "step": 16720 }, { "epoch": 45.33875338753388, "grad_norm": 0.1365078091621399, "learning_rate": 7.136431618428707e-06, "loss": 0.0057, "step": 16730 }, { "epoch": 45.36585365853659, "grad_norm": 0.17542476952075958, "learning_rate": 7.09392455794628e-06, "loss": 0.0045, "step": 16740 }, { "epoch": 45.392953929539296, "grad_norm": 0.26298168301582336, "learning_rate": 7.051534801035725e-06, "loss": 0.0046, "step": 16750 }, { "epoch": 45.420054200542005, "grad_norm": 0.19980652630329132, "learning_rate": 7.00926246358905e-06, "loss": 0.0048, "step": 16760 }, { "epoch": 45.447154471544714, "grad_norm": 0.1623387336730957, "learning_rate": 6.967107661177191e-06, "loss": 0.0046, "step": 16770 }, { "epoch": 45.47425474254742, "grad_norm": 0.147375226020813, "learning_rate": 6.925070509049786e-06, "loss": 0.0051, "step": 16780 }, { "epoch": 45.50135501355014, "grad_norm": 0.09257940202951431, "learning_rate": 6.883151122134812e-06, "loss": 0.0053, "step": 16790 }, { "epoch": 45.52845528455285, "grad_norm": 0.08058512210845947, "learning_rate": 6.8413496150382394e-06, "loss": 0.0063, "step": 16800 }, { "epoch": 45.55555555555556, "grad_norm": 0.13982714712619781, "learning_rate": 6.7996661020438165e-06, "loss": 0.0052, "step": 16810 }, { "epoch": 45.582655826558266, "grad_norm": 0.19218283891677856, "learning_rate": 6.758100697112662e-06, "loss": 0.0068, "step": 16820 }, { "epoch": 45.609756097560975, "grad_norm": 0.17297238111495972, "learning_rate": 6.716653513883026e-06, "loss": 0.0051, "step": 16830 }, { "epoch": 45.636856368563684, "grad_norm": 0.19390827417373657, "learning_rate": 6.675324665669913e-06, "loss": 0.0043, "step": 16840 }, { "epoch": 45.66395663956639, "grad_norm": 0.18421435356140137, "learning_rate": 6.634114265464803e-06, "loss": 0.0056, "step": 16850 }, { "epoch": 45.6910569105691, "grad_norm": 0.19145499169826508, "learning_rate": 6.59302242593538e-06, "loss": 0.0049, "step": 16860 }, { "epoch": 45.71815718157182, "grad_norm": 0.11658080667257309, "learning_rate": 6.552049259425141e-06, "loss": 0.0045, "step": 16870 }, { "epoch": 45.74525745257453, "grad_norm": 0.2357105314731598, "learning_rate": 6.511194877953181e-06, "loss": 0.0035, "step": 16880 }, { "epoch": 45.77235772357724, "grad_norm": 0.1650124490261078, "learning_rate": 6.470459393213813e-06, "loss": 0.0048, "step": 16890 }, { "epoch": 45.799457994579946, "grad_norm": 0.19807708263397217, "learning_rate": 6.429842916576279e-06, "loss": 0.0046, "step": 16900 }, { "epoch": 45.826558265582655, "grad_norm": 0.08891675621271133, "learning_rate": 6.389345559084503e-06, "loss": 0.0065, "step": 16910 }, { "epoch": 45.853658536585364, "grad_norm": 0.21796709299087524, "learning_rate": 6.348967431456682e-06, "loss": 0.0052, "step": 16920 }, { "epoch": 45.88075880758807, "grad_norm": 0.061334095895290375, "learning_rate": 6.30870864408511e-06, "loss": 0.0046, "step": 16930 }, { "epoch": 45.90785907859079, "grad_norm": 0.10369174927473068, "learning_rate": 6.268569307035754e-06, "loss": 0.0042, "step": 16940 }, { "epoch": 45.9349593495935, "grad_norm": 0.2762027978897095, "learning_rate": 6.228549530048022e-06, "loss": 0.0047, "step": 16950 }, { "epoch": 45.96205962059621, "grad_norm": 0.1696929633617401, "learning_rate": 6.1886494225344814e-06, "loss": 0.0049, "step": 16960 }, { "epoch": 45.989159891598916, "grad_norm": 0.19755671918392181, "learning_rate": 6.148869093580479e-06, "loss": 0.0041, "step": 16970 }, { "epoch": 46.016260162601625, "grad_norm": 0.32033243775367737, "learning_rate": 6.109208651943921e-06, "loss": 0.0052, "step": 16980 }, { "epoch": 46.043360433604335, "grad_norm": 0.20722919702529907, "learning_rate": 6.069668206054946e-06, "loss": 0.0043, "step": 16990 }, { "epoch": 46.070460704607044, "grad_norm": 0.09935317933559418, "learning_rate": 6.0302478640156145e-06, "loss": 0.0051, "step": 17000 }, { "epoch": 46.09756097560975, "grad_norm": 0.13332107663154602, "learning_rate": 5.990947733599644e-06, "loss": 0.0065, "step": 17010 }, { "epoch": 46.12466124661247, "grad_norm": 0.2013438642024994, "learning_rate": 5.951767922252105e-06, "loss": 0.0052, "step": 17020 }, { "epoch": 46.15176151761518, "grad_norm": 0.2627830505371094, "learning_rate": 5.912708537089068e-06, "loss": 0.0049, "step": 17030 }, { "epoch": 46.17886178861789, "grad_norm": 0.20139412581920624, "learning_rate": 5.873769684897434e-06, "loss": 0.008, "step": 17040 }, { "epoch": 46.205962059620596, "grad_norm": 0.19920307397842407, "learning_rate": 5.834951472134514e-06, "loss": 0.005, "step": 17050 }, { "epoch": 46.233062330623305, "grad_norm": 0.3315029442310333, "learning_rate": 5.796254004927832e-06, "loss": 0.0048, "step": 17060 }, { "epoch": 46.260162601626014, "grad_norm": 0.45195749402046204, "learning_rate": 5.757677389074806e-06, "loss": 0.0053, "step": 17070 }, { "epoch": 46.28726287262872, "grad_norm": 0.15126870572566986, "learning_rate": 5.719221730042385e-06, "loss": 0.0047, "step": 17080 }, { "epoch": 46.31436314363144, "grad_norm": 0.11279576271772385, "learning_rate": 5.680887132966911e-06, "loss": 0.006, "step": 17090 }, { "epoch": 46.34146341463415, "grad_norm": 0.2803502380847931, "learning_rate": 5.642673702653683e-06, "loss": 0.0047, "step": 17100 }, { "epoch": 46.36856368563686, "grad_norm": 0.05484018847346306, "learning_rate": 5.604581543576781e-06, "loss": 0.0058, "step": 17110 }, { "epoch": 46.39566395663957, "grad_norm": 0.1561799794435501, "learning_rate": 5.566610759878704e-06, "loss": 0.0053, "step": 17120 }, { "epoch": 46.422764227642276, "grad_norm": 0.07656382769346237, "learning_rate": 5.528761455370119e-06, "loss": 0.0043, "step": 17130 }, { "epoch": 46.449864498644985, "grad_norm": 0.3825330138206482, "learning_rate": 5.491033733529594e-06, "loss": 0.0067, "step": 17140 }, { "epoch": 46.476964769647694, "grad_norm": 0.16067424416542053, "learning_rate": 5.453427697503255e-06, "loss": 0.0056, "step": 17150 }, { "epoch": 46.50406504065041, "grad_norm": 0.16018998622894287, "learning_rate": 5.415943450104599e-06, "loss": 0.0039, "step": 17160 }, { "epoch": 46.53116531165312, "grad_norm": 0.18528050184249878, "learning_rate": 5.378581093814111e-06, "loss": 0.0047, "step": 17170 }, { "epoch": 46.55826558265583, "grad_norm": 0.32171863317489624, "learning_rate": 5.3413407307790375e-06, "loss": 0.0057, "step": 17180 }, { "epoch": 46.58536585365854, "grad_norm": 0.282651424407959, "learning_rate": 5.30422246281313e-06, "loss": 0.0054, "step": 17190 }, { "epoch": 46.612466124661246, "grad_norm": 0.18001292645931244, "learning_rate": 5.267226391396296e-06, "loss": 0.0058, "step": 17200 }, { "epoch": 46.639566395663955, "grad_norm": 0.15819020569324493, "learning_rate": 5.2303526176744e-06, "loss": 0.0043, "step": 17210 }, { "epoch": 46.666666666666664, "grad_norm": 0.13377858698368073, "learning_rate": 5.193601242458929e-06, "loss": 0.0049, "step": 17220 }, { "epoch": 46.69376693766937, "grad_norm": 0.10663385689258575, "learning_rate": 5.156972366226714e-06, "loss": 0.0046, "step": 17230 }, { "epoch": 46.72086720867209, "grad_norm": 0.19713780283927917, "learning_rate": 5.120466089119735e-06, "loss": 0.0059, "step": 17240 }, { "epoch": 46.7479674796748, "grad_norm": 0.08138268440961838, "learning_rate": 5.084082510944749e-06, "loss": 0.0051, "step": 17250 }, { "epoch": 46.77506775067751, "grad_norm": 0.09034677594900131, "learning_rate": 5.047821731173058e-06, "loss": 0.0062, "step": 17260 }, { "epoch": 46.80216802168022, "grad_norm": 0.21803443133831024, "learning_rate": 5.011683848940274e-06, "loss": 0.005, "step": 17270 }, { "epoch": 46.829268292682926, "grad_norm": 0.1058468297123909, "learning_rate": 4.975668963045954e-06, "loss": 0.0054, "step": 17280 }, { "epoch": 46.856368563685635, "grad_norm": 0.11975191533565521, "learning_rate": 4.9397771719534525e-06, "loss": 0.0041, "step": 17290 }, { "epoch": 46.883468834688344, "grad_norm": 0.09365298599004745, "learning_rate": 4.904008573789548e-06, "loss": 0.0041, "step": 17300 }, { "epoch": 46.91056910569106, "grad_norm": 0.29151850938796997, "learning_rate": 4.8683632663442005e-06, "loss": 0.0071, "step": 17310 }, { "epoch": 46.93766937669377, "grad_norm": 0.10066478699445724, "learning_rate": 4.832841347070343e-06, "loss": 0.0085, "step": 17320 }, { "epoch": 46.96476964769648, "grad_norm": 0.09221629798412323, "learning_rate": 4.797442913083539e-06, "loss": 0.0046, "step": 17330 }, { "epoch": 46.99186991869919, "grad_norm": 0.1054675281047821, "learning_rate": 4.7621680611617596e-06, "loss": 0.0047, "step": 17340 }, { "epoch": 47.018970189701896, "grad_norm": 0.30412593483924866, "learning_rate": 4.727016887745095e-06, "loss": 0.0061, "step": 17350 }, { "epoch": 47.046070460704605, "grad_norm": 0.14220406115055084, "learning_rate": 4.691989488935511e-06, "loss": 0.0058, "step": 17360 }, { "epoch": 47.073170731707314, "grad_norm": 0.219927579164505, "learning_rate": 4.657085960496588e-06, "loss": 0.0045, "step": 17370 }, { "epoch": 47.10027100271003, "grad_norm": 0.1132938414812088, "learning_rate": 4.6223063978532265e-06, "loss": 0.0089, "step": 17380 }, { "epoch": 47.12737127371274, "grad_norm": 0.27833452820777893, "learning_rate": 4.587650896091439e-06, "loss": 0.0065, "step": 17390 }, { "epoch": 47.15447154471545, "grad_norm": 0.17135611176490784, "learning_rate": 4.553119549958035e-06, "loss": 0.0061, "step": 17400 }, { "epoch": 47.18157181571816, "grad_norm": 0.1284724473953247, "learning_rate": 4.518712453860385e-06, "loss": 0.0044, "step": 17410 }, { "epoch": 47.20867208672087, "grad_norm": 0.39931365847587585, "learning_rate": 4.484429701866205e-06, "loss": 0.0049, "step": 17420 }, { "epoch": 47.235772357723576, "grad_norm": 0.0940791442990303, "learning_rate": 4.4502713877031975e-06, "loss": 0.0041, "step": 17430 }, { "epoch": 47.262872628726285, "grad_norm": 0.11864673346281052, "learning_rate": 4.416237604758911e-06, "loss": 0.0047, "step": 17440 }, { "epoch": 47.289972899728994, "grad_norm": 0.2692002058029175, "learning_rate": 4.3823284460804025e-06, "loss": 0.0048, "step": 17450 }, { "epoch": 47.31707317073171, "grad_norm": 0.11034056544303894, "learning_rate": 4.348544004374011e-06, "loss": 0.0035, "step": 17460 }, { "epoch": 47.34417344173442, "grad_norm": 0.09027573466300964, "learning_rate": 4.314884372005123e-06, "loss": 0.0045, "step": 17470 }, { "epoch": 47.37127371273713, "grad_norm": 0.2091233879327774, "learning_rate": 4.281349640997867e-06, "loss": 0.0061, "step": 17480 }, { "epoch": 47.39837398373984, "grad_norm": 0.18246790766716003, "learning_rate": 4.247939903034942e-06, "loss": 0.0046, "step": 17490 }, { "epoch": 47.42547425474255, "grad_norm": 0.1971135139465332, "learning_rate": 4.214655249457284e-06, "loss": 0.0051, "step": 17500 }, { "epoch": 47.452574525745256, "grad_norm": 0.15376883745193481, "learning_rate": 4.181495771263855e-06, "loss": 0.0076, "step": 17510 }, { "epoch": 47.479674796747965, "grad_norm": 0.1267101764678955, "learning_rate": 4.148461559111427e-06, "loss": 0.0075, "step": 17520 }, { "epoch": 47.50677506775068, "grad_norm": 0.1592373251914978, "learning_rate": 4.115552703314252e-06, "loss": 0.0056, "step": 17530 }, { "epoch": 47.53387533875339, "grad_norm": 0.19724491238594055, "learning_rate": 4.082769293843886e-06, "loss": 0.0079, "step": 17540 }, { "epoch": 47.5609756097561, "grad_norm": 0.09814240783452988, "learning_rate": 4.050111420328939e-06, "loss": 0.0049, "step": 17550 }, { "epoch": 47.58807588075881, "grad_norm": 0.18811175227165222, "learning_rate": 4.017579172054764e-06, "loss": 0.0064, "step": 17560 }, { "epoch": 47.61517615176152, "grad_norm": 0.17727218568325043, "learning_rate": 3.985172637963308e-06, "loss": 0.0056, "step": 17570 }, { "epoch": 47.642276422764226, "grad_norm": 0.1361125111579895, "learning_rate": 3.952891906652784e-06, "loss": 0.0068, "step": 17580 }, { "epoch": 47.669376693766935, "grad_norm": 0.25143107771873474, "learning_rate": 3.920737066377478e-06, "loss": 0.0062, "step": 17590 }, { "epoch": 47.696476964769644, "grad_norm": 0.22111794352531433, "learning_rate": 3.888708205047509e-06, "loss": 0.0047, "step": 17600 }, { "epoch": 47.72357723577236, "grad_norm": 0.09195035696029663, "learning_rate": 3.856805410228542e-06, "loss": 0.005, "step": 17610 }, { "epoch": 47.75067750677507, "grad_norm": 0.2917189598083496, "learning_rate": 3.82502876914162e-06, "loss": 0.0062, "step": 17620 }, { "epoch": 47.77777777777778, "grad_norm": 0.15350060164928436, "learning_rate": 3.7933783686628586e-06, "loss": 0.0067, "step": 17630 }, { "epoch": 47.80487804878049, "grad_norm": 0.10943206399679184, "learning_rate": 3.7618542953232306e-06, "loss": 0.0065, "step": 17640 }, { "epoch": 47.8319783197832, "grad_norm": 0.12656757235527039, "learning_rate": 3.7304566353083658e-06, "loss": 0.007, "step": 17650 }, { "epoch": 47.859078590785906, "grad_norm": 0.1274162381887436, "learning_rate": 3.6991854744582555e-06, "loss": 0.0057, "step": 17660 }, { "epoch": 47.886178861788615, "grad_norm": 0.10538295656442642, "learning_rate": 3.6680408982670777e-06, "loss": 0.0045, "step": 17670 }, { "epoch": 47.91327913279133, "grad_norm": 0.0886586382985115, "learning_rate": 3.637022991882899e-06, "loss": 0.0054, "step": 17680 }, { "epoch": 47.94037940379404, "grad_norm": 0.1359531134366989, "learning_rate": 3.606131840107485e-06, "loss": 0.0048, "step": 17690 }, { "epoch": 47.96747967479675, "grad_norm": 0.09876429289579391, "learning_rate": 3.575367527396084e-06, "loss": 0.004, "step": 17700 }, { "epoch": 47.99457994579946, "grad_norm": 0.126203253865242, "learning_rate": 3.5447301378571386e-06, "loss": 0.0045, "step": 17710 }, { "epoch": 48.02168021680217, "grad_norm": 0.0810219943523407, "learning_rate": 3.514219755252113e-06, "loss": 0.0038, "step": 17720 }, { "epoch": 48.048780487804876, "grad_norm": 0.14066103100776672, "learning_rate": 3.4838364629952213e-06, "loss": 0.0051, "step": 17730 }, { "epoch": 48.075880758807585, "grad_norm": 0.1597338169813156, "learning_rate": 3.4535803441532123e-06, "loss": 0.0052, "step": 17740 }, { "epoch": 48.1029810298103, "grad_norm": 0.10610620677471161, "learning_rate": 3.4234514814451836e-06, "loss": 0.006, "step": 17750 }, { "epoch": 48.13008130081301, "grad_norm": 0.12110180407762527, "learning_rate": 3.393449957242273e-06, "loss": 0.0038, "step": 17760 }, { "epoch": 48.15718157181572, "grad_norm": 0.10479318350553513, "learning_rate": 3.363575853567524e-06, "loss": 0.0056, "step": 17770 }, { "epoch": 48.18428184281843, "grad_norm": 0.16386792063713074, "learning_rate": 3.3338292520955826e-06, "loss": 0.0046, "step": 17780 }, { "epoch": 48.21138211382114, "grad_norm": 0.1825025975704193, "learning_rate": 3.304210234152516e-06, "loss": 0.0051, "step": 17790 }, { "epoch": 48.23848238482385, "grad_norm": 0.4550725519657135, "learning_rate": 3.2747188807155993e-06, "loss": 0.0047, "step": 17800 }, { "epoch": 48.265582655826556, "grad_norm": 0.16348780691623688, "learning_rate": 3.2453552724130643e-06, "loss": 0.0051, "step": 17810 }, { "epoch": 48.292682926829265, "grad_norm": 0.12621180713176727, "learning_rate": 3.216119489523889e-06, "loss": 0.0048, "step": 17820 }, { "epoch": 48.31978319783198, "grad_norm": 0.11769979447126389, "learning_rate": 3.1870116119775917e-06, "loss": 0.0059, "step": 17830 }, { "epoch": 48.34688346883469, "grad_norm": 0.16212886571884155, "learning_rate": 3.158031719353999e-06, "loss": 0.0048, "step": 17840 }, { "epoch": 48.3739837398374, "grad_norm": 0.24824529886245728, "learning_rate": 3.1291798908830273e-06, "loss": 0.0043, "step": 17850 }, { "epoch": 48.40108401084011, "grad_norm": 0.1723768562078476, "learning_rate": 3.1004562054444853e-06, "loss": 0.0048, "step": 17860 }, { "epoch": 48.42818428184282, "grad_norm": 0.15021464228630066, "learning_rate": 3.071860741567806e-06, "loss": 0.0057, "step": 17870 }, { "epoch": 48.45528455284553, "grad_norm": 0.11917204409837723, "learning_rate": 3.04339357743193e-06, "loss": 0.0032, "step": 17880 }, { "epoch": 48.482384823848236, "grad_norm": 0.4273058772087097, "learning_rate": 3.0150547908649628e-06, "loss": 0.0042, "step": 17890 }, { "epoch": 48.50948509485095, "grad_norm": 0.1360849142074585, "learning_rate": 2.9868444593440957e-06, "loss": 0.0053, "step": 17900 }, { "epoch": 48.53658536585366, "grad_norm": 0.10374443233013153, "learning_rate": 2.9587626599952846e-06, "loss": 0.0074, "step": 17910 }, { "epoch": 48.56368563685637, "grad_norm": 0.25186583399772644, "learning_rate": 2.930809469593082e-06, "loss": 0.0063, "step": 17920 }, { "epoch": 48.59078590785908, "grad_norm": 0.15119154751300812, "learning_rate": 2.9029849645604733e-06, "loss": 0.004, "step": 17930 }, { "epoch": 48.61788617886179, "grad_norm": 0.07773621380329132, "learning_rate": 2.8752892209685632e-06, "loss": 0.0072, "step": 17940 }, { "epoch": 48.6449864498645, "grad_norm": 0.3180740773677826, "learning_rate": 2.847722314536483e-06, "loss": 0.0043, "step": 17950 }, { "epoch": 48.672086720867206, "grad_norm": 0.1575130820274353, "learning_rate": 2.820284320631078e-06, "loss": 0.0095, "step": 17960 }, { "epoch": 48.69918699186992, "grad_norm": 0.17196351289749146, "learning_rate": 2.792975314266788e-06, "loss": 0.004, "step": 17970 }, { "epoch": 48.72628726287263, "grad_norm": 0.17340238392353058, "learning_rate": 2.7657953701054007e-06, "loss": 0.0053, "step": 17980 }, { "epoch": 48.75338753387534, "grad_norm": 0.21593770384788513, "learning_rate": 2.7387445624558306e-06, "loss": 0.0047, "step": 17990 }, { "epoch": 48.78048780487805, "grad_norm": 0.20095482468605042, "learning_rate": 2.7118229652739747e-06, "loss": 0.0039, "step": 18000 }, { "epoch": 48.80758807588076, "grad_norm": 0.22448702156543732, "learning_rate": 2.6850306521624236e-06, "loss": 0.0055, "step": 18010 }, { "epoch": 48.83468834688347, "grad_norm": 0.13658004999160767, "learning_rate": 2.6583676963703507e-06, "loss": 0.0044, "step": 18020 }, { "epoch": 48.86178861788618, "grad_norm": 0.12292136251926422, "learning_rate": 2.631834170793268e-06, "loss": 0.0038, "step": 18030 }, { "epoch": 48.888888888888886, "grad_norm": 0.10635823756456375, "learning_rate": 2.6054301479728036e-06, "loss": 0.0046, "step": 18040 }, { "epoch": 48.9159891598916, "grad_norm": 0.12728004157543182, "learning_rate": 2.579155700096575e-06, "loss": 0.0038, "step": 18050 }, { "epoch": 48.94308943089431, "grad_norm": 0.19667483866214752, "learning_rate": 2.5530108989978873e-06, "loss": 0.0045, "step": 18060 }, { "epoch": 48.97018970189702, "grad_norm": 0.14875462651252747, "learning_rate": 2.5269958161556416e-06, "loss": 0.0051, "step": 18070 }, { "epoch": 48.99728997289973, "grad_norm": 0.24510788917541504, "learning_rate": 2.5011105226940888e-06, "loss": 0.0083, "step": 18080 }, { "epoch": 49.02439024390244, "grad_norm": 0.19768071174621582, "learning_rate": 2.4753550893826248e-06, "loss": 0.0044, "step": 18090 }, { "epoch": 49.05149051490515, "grad_norm": 0.08882863074541092, "learning_rate": 2.4497295866356296e-06, "loss": 0.0046, "step": 18100 }, { "epoch": 49.078590785907856, "grad_norm": 0.1337045133113861, "learning_rate": 2.424234084512228e-06, "loss": 0.0039, "step": 18110 }, { "epoch": 49.10569105691057, "grad_norm": 0.14051270484924316, "learning_rate": 2.3988686527161687e-06, "loss": 0.0056, "step": 18120 }, { "epoch": 49.13279132791328, "grad_norm": 0.10506237298250198, "learning_rate": 2.373633360595573e-06, "loss": 0.0037, "step": 18130 }, { "epoch": 49.15989159891599, "grad_norm": 0.1801370531320572, "learning_rate": 2.3485282771427585e-06, "loss": 0.0043, "step": 18140 }, { "epoch": 49.1869918699187, "grad_norm": 0.09127062559127808, "learning_rate": 2.3235534709940665e-06, "loss": 0.0055, "step": 18150 }, { "epoch": 49.21409214092141, "grad_norm": 0.1351483166217804, "learning_rate": 2.2987090104296617e-06, "loss": 0.0055, "step": 18160 }, { "epoch": 49.24119241192412, "grad_norm": 0.0842880979180336, "learning_rate": 2.273994963373355e-06, "loss": 0.0058, "step": 18170 }, { "epoch": 49.26829268292683, "grad_norm": 0.10625439137220383, "learning_rate": 2.249411397392409e-06, "loss": 0.0043, "step": 18180 }, { "epoch": 49.295392953929536, "grad_norm": 0.12923267483711243, "learning_rate": 2.2249583796973506e-06, "loss": 0.0059, "step": 18190 }, { "epoch": 49.32249322493225, "grad_norm": 0.18930329382419586, "learning_rate": 2.200635977141796e-06, "loss": 0.0045, "step": 18200 }, { "epoch": 49.34959349593496, "grad_norm": 0.15852610766887665, "learning_rate": 2.17644425622226e-06, "loss": 0.0042, "step": 18210 }, { "epoch": 49.37669376693767, "grad_norm": 0.08264784514904022, "learning_rate": 2.152383283077991e-06, "loss": 0.0043, "step": 18220 }, { "epoch": 49.40379403794038, "grad_norm": 0.1611468344926834, "learning_rate": 2.128453123490781e-06, "loss": 0.0076, "step": 18230 }, { "epoch": 49.43089430894309, "grad_norm": 0.22530703246593475, "learning_rate": 2.1046538428847462e-06, "loss": 0.0049, "step": 18240 }, { "epoch": 49.4579945799458, "grad_norm": 0.4359319806098938, "learning_rate": 2.0809855063262273e-06, "loss": 0.0052, "step": 18250 }, { "epoch": 49.48509485094851, "grad_norm": 0.17958980798721313, "learning_rate": 2.057448178523558e-06, "loss": 0.0051, "step": 18260 }, { "epoch": 49.51219512195122, "grad_norm": 0.10429201275110245, "learning_rate": 2.034041923826885e-06, "loss": 0.0047, "step": 18270 }, { "epoch": 49.53929539295393, "grad_norm": 0.21783709526062012, "learning_rate": 2.0107668062280204e-06, "loss": 0.0064, "step": 18280 }, { "epoch": 49.56639566395664, "grad_norm": 0.07852937281131744, "learning_rate": 1.9876228893602357e-06, "loss": 0.006, "step": 18290 }, { "epoch": 49.59349593495935, "grad_norm": 0.16991955041885376, "learning_rate": 1.9646102364981266e-06, "loss": 0.0057, "step": 18300 }, { "epoch": 49.62059620596206, "grad_norm": 0.11703486740589142, "learning_rate": 1.9417289105574053e-06, "loss": 0.005, "step": 18310 }, { "epoch": 49.64769647696477, "grad_norm": 0.20216898620128632, "learning_rate": 1.9189789740947427e-06, "loss": 0.0061, "step": 18320 }, { "epoch": 49.67479674796748, "grad_norm": 0.2134554088115692, "learning_rate": 1.896360489307597e-06, "loss": 0.0038, "step": 18330 }, { "epoch": 49.70189701897019, "grad_norm": 0.12339705228805542, "learning_rate": 1.8738735180340362e-06, "loss": 0.004, "step": 18340 }, { "epoch": 49.7289972899729, "grad_norm": 0.18699929118156433, "learning_rate": 1.8515181217525824e-06, "loss": 0.004, "step": 18350 }, { "epoch": 49.75609756097561, "grad_norm": 0.08322204649448395, "learning_rate": 1.8292943615820457e-06, "loss": 0.0058, "step": 18360 }, { "epoch": 49.78319783197832, "grad_norm": 0.5303295254707336, "learning_rate": 1.8072022982813296e-06, "loss": 0.0066, "step": 18370 }, { "epoch": 49.81029810298103, "grad_norm": 0.09343548119068146, "learning_rate": 1.7852419922492925e-06, "loss": 0.004, "step": 18380 }, { "epoch": 49.83739837398374, "grad_norm": 0.16755147278308868, "learning_rate": 1.763413503524569e-06, "loss": 0.0051, "step": 18390 }, { "epoch": 49.86449864498645, "grad_norm": 0.15111427009105682, "learning_rate": 1.7417168917854165e-06, "loss": 0.0046, "step": 18400 }, { "epoch": 49.89159891598916, "grad_norm": 0.19859519600868225, "learning_rate": 1.720152216349552e-06, "loss": 0.004, "step": 18410 }, { "epoch": 49.91869918699187, "grad_norm": 0.11992624402046204, "learning_rate": 1.6987195361739595e-06, "loss": 0.0052, "step": 18420 }, { "epoch": 49.94579945799458, "grad_norm": 0.11703237146139145, "learning_rate": 1.6774189098547832e-06, "loss": 0.005, "step": 18430 }, { "epoch": 49.97289972899729, "grad_norm": 0.1666322648525238, "learning_rate": 1.6562503956271069e-06, "loss": 0.0038, "step": 18440 }, { "epoch": 50.0, "grad_norm": 0.10679597407579422, "learning_rate": 1.6352140513648417e-06, "loss": 0.0037, "step": 18450 }, { "epoch": 50.02710027100271, "grad_norm": 0.1131080612540245, "learning_rate": 1.6143099345805712e-06, "loss": 0.0052, "step": 18460 }, { "epoch": 50.05420054200542, "grad_norm": 0.11193347722291946, "learning_rate": 1.5935381024253293e-06, "loss": 0.0054, "step": 18470 }, { "epoch": 50.08130081300813, "grad_norm": 0.15964744985103607, "learning_rate": 1.572898611688517e-06, "loss": 0.0063, "step": 18480 }, { "epoch": 50.10840108401084, "grad_norm": 0.17212416231632233, "learning_rate": 1.5523915187977133e-06, "loss": 0.0053, "step": 18490 }, { "epoch": 50.13550135501355, "grad_norm": 0.42601072788238525, "learning_rate": 1.532016879818532e-06, "loss": 0.0052, "step": 18500 }, { "epoch": 50.16260162601626, "grad_norm": 0.30006563663482666, "learning_rate": 1.51177475045447e-06, "loss": 0.0038, "step": 18510 }, { "epoch": 50.18970189701897, "grad_norm": 0.15745452046394348, "learning_rate": 1.4916651860467035e-06, "loss": 0.0049, "step": 18520 }, { "epoch": 50.21680216802168, "grad_norm": 0.06367097049951553, "learning_rate": 1.471688241574043e-06, "loss": 0.0068, "step": 18530 }, { "epoch": 50.24390243902439, "grad_norm": 0.08158767968416214, "learning_rate": 1.451843971652672e-06, "loss": 0.0038, "step": 18540 }, { "epoch": 50.2710027100271, "grad_norm": 0.12777452170848846, "learning_rate": 1.432132430536076e-06, "loss": 0.0036, "step": 18550 }, { "epoch": 50.29810298102981, "grad_norm": 0.13680694997310638, "learning_rate": 1.412553672114869e-06, "loss": 0.0044, "step": 18560 }, { "epoch": 50.32520325203252, "grad_norm": 0.15993328392505646, "learning_rate": 1.3931077499166056e-06, "loss": 0.0043, "step": 18570 }, { "epoch": 50.35230352303523, "grad_norm": 0.22809302806854248, "learning_rate": 1.3737947171057085e-06, "loss": 0.0043, "step": 18580 }, { "epoch": 50.37940379403794, "grad_norm": 0.2573760747909546, "learning_rate": 1.3546146264832582e-06, "loss": 0.005, "step": 18590 }, { "epoch": 50.40650406504065, "grad_norm": 0.09156999737024307, "learning_rate": 1.3355675304869086e-06, "loss": 0.0056, "step": 18600 }, { "epoch": 50.43360433604336, "grad_norm": 0.08990399539470673, "learning_rate": 1.3166534811906827e-06, "loss": 0.0043, "step": 18610 }, { "epoch": 50.46070460704607, "grad_norm": 0.1309632509946823, "learning_rate": 1.2978725303048666e-06, "loss": 0.0049, "step": 18620 }, { "epoch": 50.48780487804878, "grad_norm": 0.3899499475955963, "learning_rate": 1.2792247291758762e-06, "loss": 0.0048, "step": 18630 }, { "epoch": 50.51490514905149, "grad_norm": 0.16595973074436188, "learning_rate": 1.2607101287860635e-06, "loss": 0.0062, "step": 18640 }, { "epoch": 50.5420054200542, "grad_norm": 0.14789938926696777, "learning_rate": 1.2423287797536654e-06, "loss": 0.0057, "step": 18650 }, { "epoch": 50.56910569105691, "grad_norm": 0.09703000634908676, "learning_rate": 1.2240807323325776e-06, "loss": 0.0055, "step": 18660 }, { "epoch": 50.59620596205962, "grad_norm": 0.2631438076496124, "learning_rate": 1.205966036412254e-06, "loss": 0.0077, "step": 18670 }, { "epoch": 50.62330623306233, "grad_norm": 0.10016387701034546, "learning_rate": 1.1879847415175949e-06, "loss": 0.0039, "step": 18680 }, { "epoch": 50.65040650406504, "grad_norm": 0.11013240367174149, "learning_rate": 1.1701368968087712e-06, "loss": 0.0051, "step": 18690 }, { "epoch": 50.67750677506775, "grad_norm": 0.17379704117774963, "learning_rate": 1.1524225510811116e-06, "loss": 0.0038, "step": 18700 }, { "epoch": 50.704607046070464, "grad_norm": 0.12348836660385132, "learning_rate": 1.1348417527649535e-06, "loss": 0.0044, "step": 18710 }, { "epoch": 50.73170731707317, "grad_norm": 0.2123788595199585, "learning_rate": 1.1173945499255268e-06, "loss": 0.0056, "step": 18720 }, { "epoch": 50.75880758807588, "grad_norm": 0.42153918743133545, "learning_rate": 1.1000809902628307e-06, "loss": 0.0042, "step": 18730 }, { "epoch": 50.78590785907859, "grad_norm": 0.16688768565654755, "learning_rate": 1.082901121111468e-06, "loss": 0.0077, "step": 18740 }, { "epoch": 50.8130081300813, "grad_norm": 0.12890347838401794, "learning_rate": 1.0658549894405456e-06, "loss": 0.0041, "step": 18750 }, { "epoch": 50.84010840108401, "grad_norm": 0.19007909297943115, "learning_rate": 1.0489426418535342e-06, "loss": 0.0046, "step": 18760 }, { "epoch": 50.86720867208672, "grad_norm": 0.1304931342601776, "learning_rate": 1.0321641245881474e-06, "loss": 0.0044, "step": 18770 }, { "epoch": 50.89430894308943, "grad_norm": 0.17433351278305054, "learning_rate": 1.015519483516214e-06, "loss": 0.0053, "step": 18780 }, { "epoch": 50.921409214092144, "grad_norm": 0.13205666840076447, "learning_rate": 9.990087641435443e-07, "loss": 0.0051, "step": 18790 }, { "epoch": 50.94850948509485, "grad_norm": 0.3140515387058258, "learning_rate": 9.826320116098132e-07, "loss": 0.0048, "step": 18800 }, { "epoch": 50.97560975609756, "grad_norm": 0.18461419641971588, "learning_rate": 9.663892706884447e-07, "loss": 0.0038, "step": 18810 }, { "epoch": 51.00271002710027, "grad_norm": 0.0949225202202797, "learning_rate": 9.502805857864616e-07, "loss": 0.005, "step": 18820 }, { "epoch": 51.02981029810298, "grad_norm": 0.0969328060746193, "learning_rate": 9.34306000944396e-07, "loss": 0.004, "step": 18830 }, { "epoch": 51.05691056910569, "grad_norm": 0.18764281272888184, "learning_rate": 9.184655598361624e-07, "loss": 0.005, "step": 18840 }, { "epoch": 51.0840108401084, "grad_norm": 0.15977919101715088, "learning_rate": 9.027593057689076e-07, "loss": 0.0043, "step": 18850 }, { "epoch": 51.111111111111114, "grad_norm": 0.11880920827388763, "learning_rate": 8.871872816829441e-07, "loss": 0.0047, "step": 18860 }, { "epoch": 51.13821138211382, "grad_norm": 0.1275797337293625, "learning_rate": 8.717495301515777e-07, "loss": 0.0055, "step": 18870 }, { "epoch": 51.16531165311653, "grad_norm": 0.1307823807001114, "learning_rate": 8.564460933810415e-07, "loss": 0.0051, "step": 18880 }, { "epoch": 51.19241192411924, "grad_norm": 0.12237460166215897, "learning_rate": 8.412770132103453e-07, "loss": 0.0051, "step": 18890 }, { "epoch": 51.21951219512195, "grad_norm": 0.1375521868467331, "learning_rate": 8.262423311111711e-07, "loss": 0.0072, "step": 18900 }, { "epoch": 51.24661246612466, "grad_norm": 0.22906029224395752, "learning_rate": 8.113420881877665e-07, "loss": 0.0068, "step": 18910 }, { "epoch": 51.27371273712737, "grad_norm": 0.236679807305336, "learning_rate": 7.965763251768288e-07, "loss": 0.0045, "step": 18920 }, { "epoch": 51.300813008130085, "grad_norm": 0.27824822068214417, "learning_rate": 7.819450824473995e-07, "loss": 0.0064, "step": 18930 }, { "epoch": 51.327913279132794, "grad_norm": 0.1453292816877365, "learning_rate": 7.674484000007198e-07, "loss": 0.004, "step": 18940 }, { "epoch": 51.3550135501355, "grad_norm": 0.3477244973182678, "learning_rate": 7.530863174701752e-07, "loss": 0.0041, "step": 18950 }, { "epoch": 51.38211382113821, "grad_norm": 0.2643730044364929, "learning_rate": 7.38858874121151e-07, "loss": 0.0048, "step": 18960 }, { "epoch": 51.40921409214092, "grad_norm": 0.08208220452070236, "learning_rate": 7.247661088509328e-07, "loss": 0.0042, "step": 18970 }, { "epoch": 51.43631436314363, "grad_norm": 0.07534940540790558, "learning_rate": 7.108080601886002e-07, "loss": 0.0048, "step": 18980 }, { "epoch": 51.46341463414634, "grad_norm": 0.1122741773724556, "learning_rate": 6.969847662949336e-07, "loss": 0.008, "step": 18990 }, { "epoch": 51.49051490514905, "grad_norm": 0.14368808269500732, "learning_rate": 6.832962649622798e-07, "loss": 0.0058, "step": 19000 }, { "epoch": 51.517615176151764, "grad_norm": 0.5208157896995544, "learning_rate": 6.697425936144863e-07, "loss": 0.0043, "step": 19010 }, { "epoch": 51.54471544715447, "grad_norm": 0.17327289283275604, "learning_rate": 6.563237893067731e-07, "loss": 0.0055, "step": 19020 }, { "epoch": 51.57181571815718, "grad_norm": 0.13503023982048035, "learning_rate": 6.430398887256328e-07, "loss": 0.0045, "step": 19030 }, { "epoch": 51.59891598915989, "grad_norm": 0.250845342874527, "learning_rate": 6.298909281887478e-07, "loss": 0.0052, "step": 19040 }, { "epoch": 51.6260162601626, "grad_norm": 0.1254211813211441, "learning_rate": 6.168769436448673e-07, "loss": 0.0048, "step": 19050 }, { "epoch": 51.65311653116531, "grad_norm": 0.12398107349872589, "learning_rate": 6.03997970673742e-07, "loss": 0.0064, "step": 19060 }, { "epoch": 51.68021680216802, "grad_norm": 0.12783107161521912, "learning_rate": 5.912540444859782e-07, "loss": 0.0065, "step": 19070 }, { "epoch": 51.707317073170735, "grad_norm": 0.20372964441776276, "learning_rate": 5.786451999229837e-07, "loss": 0.0045, "step": 19080 }, { "epoch": 51.734417344173444, "grad_norm": 0.20626503229141235, "learning_rate": 5.661714714568722e-07, "loss": 0.0053, "step": 19090 }, { "epoch": 51.76151761517615, "grad_norm": 0.0989387109875679, "learning_rate": 5.538328931903259e-07, "loss": 0.0044, "step": 19100 }, { "epoch": 51.78861788617886, "grad_norm": 0.06986941397190094, "learning_rate": 5.416294988565551e-07, "loss": 0.0053, "step": 19110 }, { "epoch": 51.81571815718157, "grad_norm": 0.24736253917217255, "learning_rate": 5.29561321819172e-07, "loss": 0.0047, "step": 19120 }, { "epoch": 51.84281842818428, "grad_norm": 0.10518695414066315, "learning_rate": 5.176283950721061e-07, "loss": 0.0052, "step": 19130 }, { "epoch": 51.86991869918699, "grad_norm": 0.11050674319267273, "learning_rate": 5.058307512395332e-07, "loss": 0.0041, "step": 19140 }, { "epoch": 51.8970189701897, "grad_norm": 0.35872170329093933, "learning_rate": 4.941684225757526e-07, "loss": 0.0044, "step": 19150 }, { "epoch": 51.924119241192415, "grad_norm": 0.17557621002197266, "learning_rate": 4.826414409651314e-07, "loss": 0.0039, "step": 19160 }, { "epoch": 51.951219512195124, "grad_norm": 0.14846773445606232, "learning_rate": 4.712498379219943e-07, "loss": 0.0044, "step": 19170 }, { "epoch": 51.97831978319783, "grad_norm": 0.11166596412658691, "learning_rate": 4.599936445905506e-07, "loss": 0.0042, "step": 19180 }, { "epoch": 52.00542005420054, "grad_norm": 0.16070547699928284, "learning_rate": 4.4887289174480594e-07, "loss": 0.0061, "step": 19190 }, { "epoch": 52.03252032520325, "grad_norm": 0.11862843483686447, "learning_rate": 4.378876097884621e-07, "loss": 0.0043, "step": 19200 }, { "epoch": 52.05962059620596, "grad_norm": 0.2427193522453308, "learning_rate": 4.2703782875487264e-07, "loss": 0.0058, "step": 19210 }, { "epoch": 52.08672086720867, "grad_norm": 0.122115857899189, "learning_rate": 4.163235783069208e-07, "loss": 0.0069, "step": 19220 }, { "epoch": 52.113821138211385, "grad_norm": 0.11167177557945251, "learning_rate": 4.057448877369585e-07, "loss": 0.0034, "step": 19230 }, { "epoch": 52.140921409214094, "grad_norm": 0.12154641002416611, "learning_rate": 3.9530178596672295e-07, "loss": 0.0041, "step": 19240 }, { "epoch": 52.1680216802168, "grad_norm": 0.23783494532108307, "learning_rate": 3.849943015472479e-07, "loss": 0.0046, "step": 19250 }, { "epoch": 52.19512195121951, "grad_norm": 0.2064845860004425, "learning_rate": 3.748224626588137e-07, "loss": 0.0044, "step": 19260 }, { "epoch": 52.22222222222222, "grad_norm": 0.197647824883461, "learning_rate": 3.647862971108307e-07, "loss": 0.0048, "step": 19270 }, { "epoch": 52.24932249322493, "grad_norm": 0.1339246779680252, "learning_rate": 3.5488583234179473e-07, "loss": 0.0052, "step": 19280 }, { "epoch": 52.27642276422764, "grad_norm": 0.1916918158531189, "learning_rate": 3.4512109541920413e-07, "loss": 0.0062, "step": 19290 }, { "epoch": 52.303523035230356, "grad_norm": 0.5827029943466187, "learning_rate": 3.354921130394706e-07, "loss": 0.0048, "step": 19300 }, { "epoch": 52.330623306233065, "grad_norm": 0.13093264400959015, "learning_rate": 3.259989115278639e-07, "loss": 0.0045, "step": 19310 }, { "epoch": 52.357723577235774, "grad_norm": 0.09943904727697372, "learning_rate": 3.1664151683843403e-07, "loss": 0.0054, "step": 19320 }, { "epoch": 52.38482384823848, "grad_norm": 0.40851300954818726, "learning_rate": 3.074199545539447e-07, "loss": 0.0044, "step": 19330 }, { "epoch": 52.41192411924119, "grad_norm": 0.08994726091623306, "learning_rate": 2.983342498857955e-07, "loss": 0.0049, "step": 19340 }, { "epoch": 52.4390243902439, "grad_norm": 0.4190889000892639, "learning_rate": 2.893844276739499e-07, "loss": 0.0045, "step": 19350 }, { "epoch": 52.46612466124661, "grad_norm": 0.14843754470348358, "learning_rate": 2.8057051238688514e-07, "loss": 0.0047, "step": 19360 }, { "epoch": 52.49322493224932, "grad_norm": 0.14286349713802338, "learning_rate": 2.71892528121509e-07, "loss": 0.0048, "step": 19370 }, { "epoch": 52.520325203252035, "grad_norm": 0.150453120470047, "learning_rate": 2.633504986030988e-07, "loss": 0.005, "step": 19380 }, { "epoch": 52.547425474254744, "grad_norm": 0.3468497693538666, "learning_rate": 2.549444471852347e-07, "loss": 0.0074, "step": 19390 }, { "epoch": 52.57452574525745, "grad_norm": 0.4678441882133484, "learning_rate": 2.4667439684974423e-07, "loss": 0.0041, "step": 19400 }, { "epoch": 52.60162601626016, "grad_norm": 0.23654618859291077, "learning_rate": 2.3854037020662467e-07, "loss": 0.0041, "step": 19410 }, { "epoch": 52.62872628726287, "grad_norm": 0.17082145810127258, "learning_rate": 2.3054238949399288e-07, "loss": 0.005, "step": 19420 }, { "epoch": 52.65582655826558, "grad_norm": 0.44038328528404236, "learning_rate": 2.2268047657802993e-07, "loss": 0.0052, "step": 19430 }, { "epoch": 52.68292682926829, "grad_norm": 0.15363441407680511, "learning_rate": 2.149546529529034e-07, "loss": 0.0047, "step": 19440 }, { "epoch": 52.710027100271006, "grad_norm": 0.09303904324769974, "learning_rate": 2.0736493974071736e-07, "loss": 0.004, "step": 19450 }, { "epoch": 52.737127371273715, "grad_norm": 0.1057671308517456, "learning_rate": 1.9991135769145686e-07, "loss": 0.0046, "step": 19460 }, { "epoch": 52.764227642276424, "grad_norm": 0.1695941537618637, "learning_rate": 1.9259392718293245e-07, "loss": 0.0045, "step": 19470 }, { "epoch": 52.79132791327913, "grad_norm": 0.057480666786432266, "learning_rate": 1.8541266822072467e-07, "loss": 0.0048, "step": 19480 }, { "epoch": 52.81842818428184, "grad_norm": 0.18144655227661133, "learning_rate": 1.7836760043811184e-07, "loss": 0.0057, "step": 19490 }, { "epoch": 52.84552845528455, "grad_norm": 0.1123758926987648, "learning_rate": 1.7145874309604792e-07, "loss": 0.0054, "step": 19500 }, { "epoch": 52.87262872628726, "grad_norm": 0.1041613444685936, "learning_rate": 1.6468611508308474e-07, "loss": 0.0047, "step": 19510 }, { "epoch": 52.89972899728997, "grad_norm": 0.10000970959663391, "learning_rate": 1.5804973491532204e-07, "loss": 0.0045, "step": 19520 }, { "epoch": 52.926829268292686, "grad_norm": 0.37011274695396423, "learning_rate": 1.5154962073637424e-07, "loss": 0.0034, "step": 19530 }, { "epoch": 52.953929539295395, "grad_norm": 0.06438801437616348, "learning_rate": 1.4518579031730372e-07, "loss": 0.008, "step": 19540 }, { "epoch": 52.981029810298104, "grad_norm": 0.09758870303630829, "learning_rate": 1.389582610565876e-07, "loss": 0.0042, "step": 19550 }, { "epoch": 53.00813008130081, "grad_norm": 0.18722441792488098, "learning_rate": 1.3286704998003995e-07, "loss": 0.0047, "step": 19560 }, { "epoch": 53.03523035230352, "grad_norm": 0.12624219059944153, "learning_rate": 1.2691217374080632e-07, "loss": 0.0057, "step": 19570 }, { "epoch": 53.06233062330623, "grad_norm": 0.14944744110107422, "learning_rate": 1.2109364861929705e-07, "loss": 0.0045, "step": 19580 }, { "epoch": 53.08943089430894, "grad_norm": 0.10548335313796997, "learning_rate": 1.1541149052312628e-07, "loss": 0.0049, "step": 19590 }, { "epoch": 53.116531165311656, "grad_norm": 0.13670559227466583, "learning_rate": 1.0986571498710074e-07, "loss": 0.0043, "step": 19600 }, { "epoch": 53.143631436314365, "grad_norm": 0.213542640209198, "learning_rate": 1.0445633717316438e-07, "loss": 0.0034, "step": 19610 }, { "epoch": 53.170731707317074, "grad_norm": 0.11260993033647537, "learning_rate": 9.918337187034277e-08, "loss": 0.004, "step": 19620 }, { "epoch": 53.19783197831978, "grad_norm": 0.20435650646686554, "learning_rate": 9.404683349472643e-08, "loss": 0.005, "step": 19630 }, { "epoch": 53.22493224932249, "grad_norm": 0.13499966263771057, "learning_rate": 8.904673608940983e-08, "loss": 0.0047, "step": 19640 }, { "epoch": 53.2520325203252, "grad_norm": 0.17751647531986237, "learning_rate": 8.418309332447471e-08, "loss": 0.0054, "step": 19650 }, { "epoch": 53.27913279132791, "grad_norm": 0.10488054901361465, "learning_rate": 7.945591849692902e-08, "loss": 0.0058, "step": 19660 }, { "epoch": 53.30623306233063, "grad_norm": 0.22118496894836426, "learning_rate": 7.486522453069578e-08, "loss": 0.0048, "step": 19670 }, { "epoch": 53.333333333333336, "grad_norm": 0.0999939814209938, "learning_rate": 7.041102397655208e-08, "loss": 0.0039, "step": 19680 }, { "epoch": 53.360433604336045, "grad_norm": 0.18741941452026367, "learning_rate": 6.609332901210685e-08, "loss": 0.005, "step": 19690 }, { "epoch": 53.387533875338754, "grad_norm": 0.05051896721124649, "learning_rate": 6.191215144178419e-08, "loss": 0.0047, "step": 19700 }, { "epoch": 53.41463414634146, "grad_norm": 0.10814210027456284, "learning_rate": 5.786750269675678e-08, "loss": 0.0041, "step": 19710 }, { "epoch": 53.44173441734417, "grad_norm": 0.1713084727525711, "learning_rate": 5.395939383494031e-08, "loss": 0.0055, "step": 19720 }, { "epoch": 53.46883468834688, "grad_norm": 0.094144806265831, "learning_rate": 5.018783554095463e-08, "loss": 0.0039, "step": 19730 }, { "epoch": 53.49593495934959, "grad_norm": 0.24812133610248566, "learning_rate": 4.655283812610156e-08, "loss": 0.0042, "step": 19740 }, { "epoch": 53.523035230352306, "grad_norm": 0.12969903647899628, "learning_rate": 4.305441152831491e-08, "loss": 0.0055, "step": 19750 }, { "epoch": 53.550135501355015, "grad_norm": 0.21154777705669403, "learning_rate": 3.9692565312171584e-08, "loss": 0.005, "step": 19760 }, { "epoch": 53.577235772357724, "grad_norm": 0.1688470095396042, "learning_rate": 3.6467308668824975e-08, "loss": 0.0058, "step": 19770 }, { "epoch": 53.60433604336043, "grad_norm": 0.11661141365766525, "learning_rate": 3.3378650416004964e-08, "loss": 0.005, "step": 19780 }, { "epoch": 53.63143631436314, "grad_norm": 0.18576888740062714, "learning_rate": 3.042659899797906e-08, "loss": 0.0051, "step": 19790 }, { "epoch": 53.65853658536585, "grad_norm": 0.2684761583805084, "learning_rate": 2.76111624855524e-08, "loss": 0.005, "step": 19800 }, { "epoch": 53.68563685636856, "grad_norm": 0.06501578539609909, "learning_rate": 2.4932348576017784e-08, "loss": 0.0031, "step": 19810 }, { "epoch": 53.71273712737128, "grad_norm": 0.0858672633767128, "learning_rate": 2.239016459314458e-08, "loss": 0.0037, "step": 19820 }, { "epoch": 53.739837398373986, "grad_norm": 0.13379603624343872, "learning_rate": 1.9984617487173174e-08, "loss": 0.0036, "step": 19830 }, { "epoch": 53.766937669376695, "grad_norm": 0.17467908561229706, "learning_rate": 1.7715713834776105e-08, "loss": 0.0065, "step": 19840 }, { "epoch": 53.794037940379404, "grad_norm": 0.15586039423942566, "learning_rate": 1.5583459839046964e-08, "loss": 0.0062, "step": 19850 }, { "epoch": 53.82113821138211, "grad_norm": 0.17013944685459137, "learning_rate": 1.3587861329489304e-08, "loss": 0.0056, "step": 19860 }, { "epoch": 53.84823848238482, "grad_norm": 0.11505021899938583, "learning_rate": 1.1728923761994415e-08, "loss": 0.0036, "step": 19870 }, { "epoch": 53.87533875338753, "grad_norm": 0.13016805052757263, "learning_rate": 1.0006652218819135e-08, "loss": 0.0043, "step": 19880 }, { "epoch": 53.90243902439025, "grad_norm": 0.1075243204832077, "learning_rate": 8.421051408596947e-09, "loss": 0.0071, "step": 19890 }, { "epoch": 53.929539295392956, "grad_norm": 0.03719748556613922, "learning_rate": 6.972125666299123e-09, "loss": 0.0038, "step": 19900 }, { "epoch": 53.956639566395665, "grad_norm": 0.16308626532554626, "learning_rate": 5.659878953229169e-09, "loss": 0.007, "step": 19910 }, { "epoch": 53.983739837398375, "grad_norm": 0.14838647842407227, "learning_rate": 4.48431485701728e-09, "loss": 0.0058, "step": 19920 }, { "epoch": 54.010840108401084, "grad_norm": 0.09518647938966751, "learning_rate": 3.4454365916203322e-09, "loss": 0.004, "step": 19930 }, { "epoch": 54.03794037940379, "grad_norm": 0.21079345047473907, "learning_rate": 2.5432469972830332e-09, "loss": 0.0049, "step": 19940 }, { "epoch": 54.0650406504065, "grad_norm": 0.19580811262130737, "learning_rate": 1.7777485405601203e-09, "loss": 0.0067, "step": 19950 }, { "epoch": 54.09214092140921, "grad_norm": 0.23285380005836487, "learning_rate": 1.1489433142941597e-09, "loss": 0.0063, "step": 19960 }, { "epoch": 54.11924119241193, "grad_norm": 0.10448484867811203, "learning_rate": 6.568330376210963e-10, "loss": 0.0048, "step": 19970 }, { "epoch": 54.146341463414636, "grad_norm": 0.13178621232509613, "learning_rate": 3.0141905594249787e-10, "loss": 0.0044, "step": 19980 }, { "epoch": 54.173441734417345, "grad_norm": 0.14288213849067688, "learning_rate": 8.270234094776008e-11, "loss": 0.0058, "step": 19990 }, { "epoch": 54.200542005420054, "grad_norm": 0.11834049224853516, "learning_rate": 6.834906085551041e-13, "loss": 0.005, "step": 20000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 55, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 36, "trial_name": null, "trial_params": null }