{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 49.66887417218543, "eval_steps": 500, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.033112582781456956, "grad_norm": 11.272758483886719, "learning_rate": 9e-07, "loss": 0.7783, "step": 10 }, { "epoch": 0.06622516556291391, "grad_norm": 6.836533546447754, "learning_rate": 1.9e-06, "loss": 0.7189, "step": 20 }, { "epoch": 0.09933774834437085, "grad_norm": 3.0679104328155518, "learning_rate": 2.9e-06, "loss": 0.4214, "step": 30 }, { "epoch": 0.13245033112582782, "grad_norm": 2.2359883785247803, "learning_rate": 3.9e-06, "loss": 0.2344, "step": 40 }, { "epoch": 0.16556291390728478, "grad_norm": 1.2197821140289307, "learning_rate": 4.9000000000000005e-06, "loss": 0.1888, "step": 50 }, { "epoch": 0.1986754966887417, "grad_norm": 1.0855205059051514, "learning_rate": 5.9e-06, "loss": 0.1526, "step": 60 }, { "epoch": 0.23178807947019867, "grad_norm": 1.0838931798934937, "learning_rate": 6.900000000000001e-06, "loss": 0.1289, "step": 70 }, { "epoch": 0.26490066225165565, "grad_norm": 1.1162554025650024, "learning_rate": 7.9e-06, "loss": 0.13, "step": 80 }, { "epoch": 0.2980132450331126, "grad_norm": 0.739594578742981, "learning_rate": 8.9e-06, "loss": 0.114, "step": 90 }, { "epoch": 0.33112582781456956, "grad_norm": 0.9306089282035828, "learning_rate": 9.900000000000002e-06, "loss": 0.1068, "step": 100 }, { "epoch": 0.36423841059602646, "grad_norm": 0.9236593842506409, "learning_rate": 1.09e-05, "loss": 0.1119, "step": 110 }, { "epoch": 0.3973509933774834, "grad_norm": 1.2485811710357666, "learning_rate": 1.19e-05, "loss": 0.1116, "step": 120 }, { "epoch": 0.4304635761589404, "grad_norm": 0.8547218441963196, "learning_rate": 1.29e-05, "loss": 0.0983, "step": 130 }, { "epoch": 0.46357615894039733, "grad_norm": 0.7687177062034607, "learning_rate": 1.3900000000000002e-05, "loss": 0.0975, "step": 140 }, { "epoch": 0.4966887417218543, "grad_norm": 0.7976408004760742, "learning_rate": 1.49e-05, "loss": 0.1081, "step": 150 }, { "epoch": 0.5298013245033113, "grad_norm": 0.86927330493927, "learning_rate": 1.59e-05, "loss": 0.1088, "step": 160 }, { "epoch": 0.5629139072847682, "grad_norm": 1.422742247581482, "learning_rate": 1.69e-05, "loss": 0.096, "step": 170 }, { "epoch": 0.5960264900662252, "grad_norm": 0.6284838914871216, "learning_rate": 1.79e-05, "loss": 0.0964, "step": 180 }, { "epoch": 0.6291390728476821, "grad_norm": 0.9891707301139832, "learning_rate": 1.8900000000000002e-05, "loss": 0.0935, "step": 190 }, { "epoch": 0.6622516556291391, "grad_norm": 0.5493897795677185, "learning_rate": 1.9900000000000003e-05, "loss": 0.0867, "step": 200 }, { "epoch": 0.695364238410596, "grad_norm": 0.5843174457550049, "learning_rate": 2.09e-05, "loss": 0.0867, "step": 210 }, { "epoch": 0.7284768211920529, "grad_norm": 0.48971569538116455, "learning_rate": 2.19e-05, "loss": 0.0877, "step": 220 }, { "epoch": 0.7615894039735099, "grad_norm": 0.8286858797073364, "learning_rate": 2.29e-05, "loss": 0.0901, "step": 230 }, { "epoch": 0.7947019867549668, "grad_norm": 0.5216439366340637, "learning_rate": 2.39e-05, "loss": 0.095, "step": 240 }, { "epoch": 0.8278145695364238, "grad_norm": 0.8459084033966064, "learning_rate": 2.4900000000000002e-05, "loss": 0.0852, "step": 250 }, { "epoch": 0.8609271523178808, "grad_norm": 0.4757920801639557, "learning_rate": 2.5900000000000003e-05, "loss": 0.0803, "step": 260 }, { "epoch": 0.8940397350993378, "grad_norm": 0.5093938708305359, "learning_rate": 2.6900000000000003e-05, "loss": 0.088, "step": 270 }, { "epoch": 0.9271523178807947, "grad_norm": 0.9054393768310547, "learning_rate": 2.7900000000000004e-05, "loss": 0.0839, "step": 280 }, { "epoch": 0.9602649006622517, "grad_norm": 0.6595967411994934, "learning_rate": 2.8899999999999998e-05, "loss": 0.08, "step": 290 }, { "epoch": 0.9933774834437086, "grad_norm": 0.38437893986701965, "learning_rate": 2.9900000000000002e-05, "loss": 0.0891, "step": 300 }, { "epoch": 1.0264900662251655, "grad_norm": 0.8347054719924927, "learning_rate": 3.09e-05, "loss": 0.0876, "step": 310 }, { "epoch": 1.0596026490066226, "grad_norm": 0.5943841934204102, "learning_rate": 3.19e-05, "loss": 0.0758, "step": 320 }, { "epoch": 1.0927152317880795, "grad_norm": 0.5209283232688904, "learning_rate": 3.29e-05, "loss": 0.0819, "step": 330 }, { "epoch": 1.1258278145695364, "grad_norm": 0.5600361227989197, "learning_rate": 3.3900000000000004e-05, "loss": 0.0875, "step": 340 }, { "epoch": 1.1589403973509933, "grad_norm": 0.4793728291988373, "learning_rate": 3.49e-05, "loss": 0.0892, "step": 350 }, { "epoch": 1.1920529801324504, "grad_norm": 0.42760252952575684, "learning_rate": 3.59e-05, "loss": 0.0807, "step": 360 }, { "epoch": 1.2251655629139073, "grad_norm": 0.7746989727020264, "learning_rate": 3.69e-05, "loss": 0.0705, "step": 370 }, { "epoch": 1.2582781456953642, "grad_norm": 0.6529927849769592, "learning_rate": 3.79e-05, "loss": 0.0812, "step": 380 }, { "epoch": 1.2913907284768211, "grad_norm": 0.5801954865455627, "learning_rate": 3.8900000000000004e-05, "loss": 0.0817, "step": 390 }, { "epoch": 1.3245033112582782, "grad_norm": 1.0298762321472168, "learning_rate": 3.99e-05, "loss": 0.0911, "step": 400 }, { "epoch": 1.3576158940397351, "grad_norm": 0.553765058517456, "learning_rate": 4.09e-05, "loss": 0.0878, "step": 410 }, { "epoch": 1.390728476821192, "grad_norm": 0.5896804928779602, "learning_rate": 4.19e-05, "loss": 0.0803, "step": 420 }, { "epoch": 1.423841059602649, "grad_norm": 0.649569571018219, "learning_rate": 4.29e-05, "loss": 0.0873, "step": 430 }, { "epoch": 1.4569536423841059, "grad_norm": 0.47723865509033203, "learning_rate": 4.39e-05, "loss": 0.0778, "step": 440 }, { "epoch": 1.490066225165563, "grad_norm": 0.8546996712684631, "learning_rate": 4.49e-05, "loss": 0.0815, "step": 450 }, { "epoch": 1.5231788079470199, "grad_norm": 1.0322091579437256, "learning_rate": 4.5900000000000004e-05, "loss": 0.0789, "step": 460 }, { "epoch": 1.5562913907284768, "grad_norm": 0.7527519464492798, "learning_rate": 4.69e-05, "loss": 0.079, "step": 470 }, { "epoch": 1.589403973509934, "grad_norm": 0.4126027822494507, "learning_rate": 4.79e-05, "loss": 0.0762, "step": 480 }, { "epoch": 1.6225165562913908, "grad_norm": 0.5786799788475037, "learning_rate": 4.89e-05, "loss": 0.0788, "step": 490 }, { "epoch": 1.6556291390728477, "grad_norm": 0.48556241393089294, "learning_rate": 4.99e-05, "loss": 0.0809, "step": 500 }, { "epoch": 1.6887417218543046, "grad_norm": 0.4278988242149353, "learning_rate": 5.0900000000000004e-05, "loss": 0.0788, "step": 510 }, { "epoch": 1.7218543046357615, "grad_norm": 0.7481850981712341, "learning_rate": 5.19e-05, "loss": 0.0814, "step": 520 }, { "epoch": 1.7549668874172184, "grad_norm": 0.804981529712677, "learning_rate": 5.2900000000000005e-05, "loss": 0.0812, "step": 530 }, { "epoch": 1.7880794701986755, "grad_norm": 0.48933446407318115, "learning_rate": 5.390000000000001e-05, "loss": 0.0799, "step": 540 }, { "epoch": 1.8211920529801324, "grad_norm": 0.654542863368988, "learning_rate": 5.4900000000000006e-05, "loss": 0.0715, "step": 550 }, { "epoch": 1.8543046357615895, "grad_norm": 0.45833292603492737, "learning_rate": 5.590000000000001e-05, "loss": 0.0787, "step": 560 }, { "epoch": 1.8874172185430464, "grad_norm": 0.5363119840621948, "learning_rate": 5.69e-05, "loss": 0.0781, "step": 570 }, { "epoch": 1.9205298013245033, "grad_norm": 0.8770874738693237, "learning_rate": 5.79e-05, "loss": 0.0802, "step": 580 }, { "epoch": 1.9536423841059603, "grad_norm": 0.530733048915863, "learning_rate": 5.89e-05, "loss": 0.0784, "step": 590 }, { "epoch": 1.9867549668874172, "grad_norm": 0.5182718634605408, "learning_rate": 5.99e-05, "loss": 0.0761, "step": 600 }, { "epoch": 2.019867549668874, "grad_norm": 0.7403817176818848, "learning_rate": 6.09e-05, "loss": 0.0779, "step": 610 }, { "epoch": 2.052980132450331, "grad_norm": 0.5862758755683899, "learning_rate": 6.19e-05, "loss": 0.0808, "step": 620 }, { "epoch": 2.0860927152317883, "grad_norm": 0.5413907170295715, "learning_rate": 6.29e-05, "loss": 0.0804, "step": 630 }, { "epoch": 2.119205298013245, "grad_norm": 0.6511678695678711, "learning_rate": 6.390000000000001e-05, "loss": 0.0782, "step": 640 }, { "epoch": 2.152317880794702, "grad_norm": 0.3867063820362091, "learning_rate": 6.49e-05, "loss": 0.081, "step": 650 }, { "epoch": 2.185430463576159, "grad_norm": 0.48696842789649963, "learning_rate": 6.59e-05, "loss": 0.0781, "step": 660 }, { "epoch": 2.218543046357616, "grad_norm": 0.32975149154663086, "learning_rate": 6.690000000000001e-05, "loss": 0.0726, "step": 670 }, { "epoch": 2.251655629139073, "grad_norm": 0.41110119223594666, "learning_rate": 6.790000000000001e-05, "loss": 0.0734, "step": 680 }, { "epoch": 2.2847682119205297, "grad_norm": 0.39215582609176636, "learning_rate": 6.89e-05, "loss": 0.0746, "step": 690 }, { "epoch": 2.3178807947019866, "grad_norm": 0.8009976148605347, "learning_rate": 6.99e-05, "loss": 0.0767, "step": 700 }, { "epoch": 2.3509933774834435, "grad_norm": 0.35606980323791504, "learning_rate": 7.09e-05, "loss": 0.0736, "step": 710 }, { "epoch": 2.384105960264901, "grad_norm": 0.39610183238983154, "learning_rate": 7.19e-05, "loss": 0.0706, "step": 720 }, { "epoch": 2.4172185430463577, "grad_norm": 0.5188985466957092, "learning_rate": 7.29e-05, "loss": 0.0727, "step": 730 }, { "epoch": 2.4503311258278146, "grad_norm": 0.6609029173851013, "learning_rate": 7.390000000000001e-05, "loss": 0.07, "step": 740 }, { "epoch": 2.4834437086092715, "grad_norm": 0.512077808380127, "learning_rate": 7.49e-05, "loss": 0.0687, "step": 750 }, { "epoch": 2.5165562913907285, "grad_norm": 0.5282275080680847, "learning_rate": 7.59e-05, "loss": 0.0772, "step": 760 }, { "epoch": 2.5496688741721854, "grad_norm": 0.6372500061988831, "learning_rate": 7.69e-05, "loss": 0.0772, "step": 770 }, { "epoch": 2.5827814569536423, "grad_norm": 0.7868066430091858, "learning_rate": 7.790000000000001e-05, "loss": 0.0881, "step": 780 }, { "epoch": 2.6158940397350996, "grad_norm": 0.811579167842865, "learning_rate": 7.890000000000001e-05, "loss": 0.0771, "step": 790 }, { "epoch": 2.6490066225165565, "grad_norm": 0.7840661406517029, "learning_rate": 7.99e-05, "loss": 0.0811, "step": 800 }, { "epoch": 2.6821192052980134, "grad_norm": 0.4495397210121155, "learning_rate": 8.090000000000001e-05, "loss": 0.0769, "step": 810 }, { "epoch": 2.7152317880794703, "grad_norm": 0.6220383048057556, "learning_rate": 8.19e-05, "loss": 0.0828, "step": 820 }, { "epoch": 2.748344370860927, "grad_norm": 0.4503801763057709, "learning_rate": 8.29e-05, "loss": 0.08, "step": 830 }, { "epoch": 2.781456953642384, "grad_norm": 0.5046451687812805, "learning_rate": 8.39e-05, "loss": 0.0747, "step": 840 }, { "epoch": 2.814569536423841, "grad_norm": 0.8233490586280823, "learning_rate": 8.49e-05, "loss": 0.0728, "step": 850 }, { "epoch": 2.847682119205298, "grad_norm": 0.5818904042243958, "learning_rate": 8.59e-05, "loss": 0.0743, "step": 860 }, { "epoch": 2.880794701986755, "grad_norm": 0.7405814528465271, "learning_rate": 8.69e-05, "loss": 0.0757, "step": 870 }, { "epoch": 2.9139072847682117, "grad_norm": 0.7239719033241272, "learning_rate": 8.790000000000001e-05, "loss": 0.0848, "step": 880 }, { "epoch": 2.9470198675496686, "grad_norm": 0.4691993296146393, "learning_rate": 8.89e-05, "loss": 0.0782, "step": 890 }, { "epoch": 2.980132450331126, "grad_norm": 0.2698413133621216, "learning_rate": 8.99e-05, "loss": 0.0728, "step": 900 }, { "epoch": 3.013245033112583, "grad_norm": 0.3984912633895874, "learning_rate": 9.090000000000001e-05, "loss": 0.0698, "step": 910 }, { "epoch": 3.0463576158940397, "grad_norm": 0.5739409327507019, "learning_rate": 9.190000000000001e-05, "loss": 0.0704, "step": 920 }, { "epoch": 3.0794701986754967, "grad_norm": 0.5284174680709839, "learning_rate": 9.290000000000001e-05, "loss": 0.0774, "step": 930 }, { "epoch": 3.1125827814569536, "grad_norm": 0.5192115306854248, "learning_rate": 9.39e-05, "loss": 0.0668, "step": 940 }, { "epoch": 3.1456953642384105, "grad_norm": 0.307212769985199, "learning_rate": 9.49e-05, "loss": 0.0721, "step": 950 }, { "epoch": 3.1788079470198674, "grad_norm": 0.5918645858764648, "learning_rate": 9.59e-05, "loss": 0.0728, "step": 960 }, { "epoch": 3.2119205298013247, "grad_norm": 0.3695594072341919, "learning_rate": 9.69e-05, "loss": 0.0729, "step": 970 }, { "epoch": 3.2450331125827816, "grad_norm": 0.4149673283100128, "learning_rate": 9.790000000000001e-05, "loss": 0.069, "step": 980 }, { "epoch": 3.2781456953642385, "grad_norm": 0.4721461832523346, "learning_rate": 9.89e-05, "loss": 0.0694, "step": 990 }, { "epoch": 3.3112582781456954, "grad_norm": 0.5365404486656189, "learning_rate": 9.99e-05, "loss": 0.0674, "step": 1000 }, { "epoch": 3.3443708609271523, "grad_norm": 0.6196780204772949, "learning_rate": 9.999994463727085e-05, "loss": 0.0686, "step": 1010 }, { "epoch": 3.377483443708609, "grad_norm": 0.3526443541049957, "learning_rate": 9.999975326009292e-05, "loss": 0.0818, "step": 1020 }, { "epoch": 3.410596026490066, "grad_norm": 0.8027057647705078, "learning_rate": 9.999942518549879e-05, "loss": 0.0734, "step": 1030 }, { "epoch": 3.443708609271523, "grad_norm": 0.42216959595680237, "learning_rate": 9.999896041438544e-05, "loss": 0.0687, "step": 1040 }, { "epoch": 3.47682119205298, "grad_norm": 0.4121258854866028, "learning_rate": 9.999835894802353e-05, "loss": 0.0672, "step": 1050 }, { "epoch": 3.5099337748344372, "grad_norm": 0.6335343718528748, "learning_rate": 9.999762078805743e-05, "loss": 0.0671, "step": 1060 }, { "epoch": 3.543046357615894, "grad_norm": 0.2850275933742523, "learning_rate": 9.999674593650526e-05, "loss": 0.0684, "step": 1070 }, { "epoch": 3.576158940397351, "grad_norm": 0.4279652237892151, "learning_rate": 9.99957343957588e-05, "loss": 0.0726, "step": 1080 }, { "epoch": 3.609271523178808, "grad_norm": 0.31030815839767456, "learning_rate": 9.99945861685836e-05, "loss": 0.0691, "step": 1090 }, { "epoch": 3.642384105960265, "grad_norm": 0.3121446967124939, "learning_rate": 9.999330125811884e-05, "loss": 0.0683, "step": 1100 }, { "epoch": 3.6754966887417218, "grad_norm": 0.3846411108970642, "learning_rate": 9.999187966787744e-05, "loss": 0.0647, "step": 1110 }, { "epoch": 3.7086092715231787, "grad_norm": 0.586017370223999, "learning_rate": 9.999032140174595e-05, "loss": 0.0744, "step": 1120 }, { "epoch": 3.741721854304636, "grad_norm": 0.6339834928512573, "learning_rate": 9.998862646398464e-05, "loss": 0.0721, "step": 1130 }, { "epoch": 3.774834437086093, "grad_norm": 0.2831915318965912, "learning_rate": 9.998679485922739e-05, "loss": 0.0659, "step": 1140 }, { "epoch": 3.80794701986755, "grad_norm": 0.47010165452957153, "learning_rate": 9.998482659248174e-05, "loss": 0.0673, "step": 1150 }, { "epoch": 3.8410596026490067, "grad_norm": 0.5042463541030884, "learning_rate": 9.998272166912883e-05, "loss": 0.0681, "step": 1160 }, { "epoch": 3.8741721854304636, "grad_norm": 0.4812171161174774, "learning_rate": 9.998048009492347e-05, "loss": 0.0735, "step": 1170 }, { "epoch": 3.9072847682119205, "grad_norm": 1.1552042961120605, "learning_rate": 9.997810187599403e-05, "loss": 0.0726, "step": 1180 }, { "epoch": 3.9403973509933774, "grad_norm": 0.5132084488868713, "learning_rate": 9.997558701884249e-05, "loss": 0.0685, "step": 1190 }, { "epoch": 3.9735099337748343, "grad_norm": 0.25289177894592285, "learning_rate": 9.997293553034433e-05, "loss": 0.0713, "step": 1200 }, { "epoch": 4.006622516556291, "grad_norm": 0.41046833992004395, "learning_rate": 9.997014741774866e-05, "loss": 0.0743, "step": 1210 }, { "epoch": 4.039735099337748, "grad_norm": 0.6460240483283997, "learning_rate": 9.996722268867803e-05, "loss": 0.0687, "step": 1220 }, { "epoch": 4.072847682119205, "grad_norm": 0.5020318031311035, "learning_rate": 9.996416135112858e-05, "loss": 0.0671, "step": 1230 }, { "epoch": 4.105960264900662, "grad_norm": 0.4620524048805237, "learning_rate": 9.996096341346988e-05, "loss": 0.07, "step": 1240 }, { "epoch": 4.139072847682119, "grad_norm": 0.42971736192703247, "learning_rate": 9.995762888444495e-05, "loss": 0.0696, "step": 1250 }, { "epoch": 4.172185430463577, "grad_norm": 0.2649601697921753, "learning_rate": 9.995415777317027e-05, "loss": 0.0667, "step": 1260 }, { "epoch": 4.2052980132450335, "grad_norm": 0.5057312250137329, "learning_rate": 9.995055008913574e-05, "loss": 0.0699, "step": 1270 }, { "epoch": 4.23841059602649, "grad_norm": 0.27768081426620483, "learning_rate": 9.994680584220463e-05, "loss": 0.0691, "step": 1280 }, { "epoch": 4.271523178807947, "grad_norm": 0.6923494338989258, "learning_rate": 9.994292504261355e-05, "loss": 0.0716, "step": 1290 }, { "epoch": 4.304635761589404, "grad_norm": 0.4541308581829071, "learning_rate": 9.993890770097247e-05, "loss": 0.0694, "step": 1300 }, { "epoch": 4.337748344370861, "grad_norm": 0.32371750473976135, "learning_rate": 9.993475382826467e-05, "loss": 0.073, "step": 1310 }, { "epoch": 4.370860927152318, "grad_norm": 0.5184516310691833, "learning_rate": 9.993046343584664e-05, "loss": 0.0641, "step": 1320 }, { "epoch": 4.403973509933775, "grad_norm": 0.4610058069229126, "learning_rate": 9.992603653544816e-05, "loss": 0.0663, "step": 1330 }, { "epoch": 4.437086092715232, "grad_norm": 0.6592222452163696, "learning_rate": 9.992147313917222e-05, "loss": 0.07, "step": 1340 }, { "epoch": 4.470198675496689, "grad_norm": 0.35688820481300354, "learning_rate": 9.991677325949497e-05, "loss": 0.0684, "step": 1350 }, { "epoch": 4.503311258278146, "grad_norm": 0.6169222593307495, "learning_rate": 9.991193690926568e-05, "loss": 0.0692, "step": 1360 }, { "epoch": 4.5364238410596025, "grad_norm": 0.5066049098968506, "learning_rate": 9.990696410170678e-05, "loss": 0.0701, "step": 1370 }, { "epoch": 4.569536423841059, "grad_norm": 0.5569772124290466, "learning_rate": 9.990185485041371e-05, "loss": 0.0607, "step": 1380 }, { "epoch": 4.602649006622516, "grad_norm": 0.4323882758617401, "learning_rate": 9.989660916935498e-05, "loss": 0.0618, "step": 1390 }, { "epoch": 4.635761589403973, "grad_norm": 0.484758585691452, "learning_rate": 9.989122707287208e-05, "loss": 0.0619, "step": 1400 }, { "epoch": 4.66887417218543, "grad_norm": 0.30767735838890076, "learning_rate": 9.988570857567945e-05, "loss": 0.0623, "step": 1410 }, { "epoch": 4.701986754966887, "grad_norm": 0.7736018896102905, "learning_rate": 9.988005369286446e-05, "loss": 0.0683, "step": 1420 }, { "epoch": 4.735099337748345, "grad_norm": 0.46966439485549927, "learning_rate": 9.987426243988734e-05, "loss": 0.0632, "step": 1430 }, { "epoch": 4.768211920529802, "grad_norm": 0.5283108949661255, "learning_rate": 9.986833483258114e-05, "loss": 0.064, "step": 1440 }, { "epoch": 4.801324503311259, "grad_norm": 0.34168267250061035, "learning_rate": 9.986227088715173e-05, "loss": 0.0616, "step": 1450 }, { "epoch": 4.8344370860927155, "grad_norm": 0.3154952824115753, "learning_rate": 9.98560706201777e-05, "loss": 0.0709, "step": 1460 }, { "epoch": 4.867549668874172, "grad_norm": 0.5397589206695557, "learning_rate": 9.984973404861036e-05, "loss": 0.0619, "step": 1470 }, { "epoch": 4.900662251655629, "grad_norm": 0.37355950474739075, "learning_rate": 9.984326118977361e-05, "loss": 0.0625, "step": 1480 }, { "epoch": 4.933774834437086, "grad_norm": 0.3682405650615692, "learning_rate": 9.983665206136406e-05, "loss": 0.0664, "step": 1490 }, { "epoch": 4.966887417218543, "grad_norm": 0.29455462098121643, "learning_rate": 9.982990668145075e-05, "loss": 0.066, "step": 1500 }, { "epoch": 5.0, "grad_norm": 0.3286159932613373, "learning_rate": 9.982302506847534e-05, "loss": 0.0675, "step": 1510 }, { "epoch": 5.033112582781457, "grad_norm": 0.4320835769176483, "learning_rate": 9.981600724125189e-05, "loss": 0.0668, "step": 1520 }, { "epoch": 5.066225165562914, "grad_norm": 0.20314979553222656, "learning_rate": 9.980885321896685e-05, "loss": 0.066, "step": 1530 }, { "epoch": 5.099337748344371, "grad_norm": 0.6292283535003662, "learning_rate": 9.980156302117905e-05, "loss": 0.0711, "step": 1540 }, { "epoch": 5.132450331125828, "grad_norm": 0.49716177582740784, "learning_rate": 9.979413666781963e-05, "loss": 0.0647, "step": 1550 }, { "epoch": 5.1655629139072845, "grad_norm": 0.34228673577308655, "learning_rate": 9.978657417919193e-05, "loss": 0.0708, "step": 1560 }, { "epoch": 5.198675496688741, "grad_norm": 0.343089759349823, "learning_rate": 9.977887557597153e-05, "loss": 0.0622, "step": 1570 }, { "epoch": 5.231788079470198, "grad_norm": 0.47524014115333557, "learning_rate": 9.97710408792061e-05, "loss": 0.063, "step": 1580 }, { "epoch": 5.264900662251655, "grad_norm": 0.3913901448249817, "learning_rate": 9.976307011031542e-05, "loss": 0.0628, "step": 1590 }, { "epoch": 5.298013245033113, "grad_norm": 0.6190195083618164, "learning_rate": 9.975496329109126e-05, "loss": 0.0683, "step": 1600 }, { "epoch": 5.33112582781457, "grad_norm": 0.30605730414390564, "learning_rate": 9.974672044369732e-05, "loss": 0.0635, "step": 1610 }, { "epoch": 5.364238410596027, "grad_norm": 0.3592502474784851, "learning_rate": 9.97383415906693e-05, "loss": 0.0706, "step": 1620 }, { "epoch": 5.397350993377484, "grad_norm": 0.23145179450511932, "learning_rate": 9.97298267549146e-05, "loss": 0.0645, "step": 1630 }, { "epoch": 5.430463576158941, "grad_norm": 0.2897457480430603, "learning_rate": 9.972117595971249e-05, "loss": 0.0654, "step": 1640 }, { "epoch": 5.4635761589403975, "grad_norm": 0.27629318833351135, "learning_rate": 9.971238922871391e-05, "loss": 0.0615, "step": 1650 }, { "epoch": 5.496688741721854, "grad_norm": 0.2666998505592346, "learning_rate": 9.970346658594142e-05, "loss": 0.0618, "step": 1660 }, { "epoch": 5.529801324503311, "grad_norm": 0.20623719692230225, "learning_rate": 9.969440805578923e-05, "loss": 0.0598, "step": 1670 }, { "epoch": 5.562913907284768, "grad_norm": 0.4561695456504822, "learning_rate": 9.968521366302298e-05, "loss": 0.0645, "step": 1680 }, { "epoch": 5.596026490066225, "grad_norm": 0.38180306553840637, "learning_rate": 9.967588343277981e-05, "loss": 0.072, "step": 1690 }, { "epoch": 5.629139072847682, "grad_norm": 0.3241891860961914, "learning_rate": 9.966641739056818e-05, "loss": 0.0715, "step": 1700 }, { "epoch": 5.662251655629139, "grad_norm": 0.5120295882225037, "learning_rate": 9.965681556226793e-05, "loss": 0.0724, "step": 1710 }, { "epoch": 5.695364238410596, "grad_norm": 0.37431472539901733, "learning_rate": 9.964707797413006e-05, "loss": 0.06, "step": 1720 }, { "epoch": 5.728476821192053, "grad_norm": 0.41069701313972473, "learning_rate": 9.963720465277679e-05, "loss": 0.0661, "step": 1730 }, { "epoch": 5.76158940397351, "grad_norm": 0.465186208486557, "learning_rate": 9.96271956252014e-05, "loss": 0.0643, "step": 1740 }, { "epoch": 5.7947019867549665, "grad_norm": 0.4171273410320282, "learning_rate": 9.961705091876816e-05, "loss": 0.0709, "step": 1750 }, { "epoch": 5.827814569536423, "grad_norm": 0.5710222721099854, "learning_rate": 9.960677056121235e-05, "loss": 0.0667, "step": 1760 }, { "epoch": 5.860927152317881, "grad_norm": 0.5049121379852295, "learning_rate": 9.959635458064005e-05, "loss": 0.0678, "step": 1770 }, { "epoch": 5.894039735099338, "grad_norm": 0.42478466033935547, "learning_rate": 9.958580300552815e-05, "loss": 0.0647, "step": 1780 }, { "epoch": 5.927152317880795, "grad_norm": 0.5328264832496643, "learning_rate": 9.957511586472426e-05, "loss": 0.0687, "step": 1790 }, { "epoch": 5.960264900662252, "grad_norm": 0.3081049919128418, "learning_rate": 9.956429318744662e-05, "loss": 0.0661, "step": 1800 }, { "epoch": 5.993377483443709, "grad_norm": 0.4146735966205597, "learning_rate": 9.955333500328404e-05, "loss": 0.0612, "step": 1810 }, { "epoch": 6.026490066225166, "grad_norm": 0.23737876117229462, "learning_rate": 9.95422413421957e-05, "loss": 0.0648, "step": 1820 }, { "epoch": 6.059602649006623, "grad_norm": 0.34124910831451416, "learning_rate": 9.953101223451133e-05, "loss": 0.065, "step": 1830 }, { "epoch": 6.0927152317880795, "grad_norm": 0.5083723664283752, "learning_rate": 9.951964771093085e-05, "loss": 0.0643, "step": 1840 }, { "epoch": 6.125827814569536, "grad_norm": 0.347522497177124, "learning_rate": 9.950814780252442e-05, "loss": 0.0659, "step": 1850 }, { "epoch": 6.158940397350993, "grad_norm": 0.32610011100769043, "learning_rate": 9.949651254073236e-05, "loss": 0.0641, "step": 1860 }, { "epoch": 6.19205298013245, "grad_norm": 0.5385106801986694, "learning_rate": 9.948474195736504e-05, "loss": 0.0622, "step": 1870 }, { "epoch": 6.225165562913907, "grad_norm": 0.2900354266166687, "learning_rate": 9.947283608460277e-05, "loss": 0.0606, "step": 1880 }, { "epoch": 6.258278145695364, "grad_norm": 0.5221039056777954, "learning_rate": 9.946079495499577e-05, "loss": 0.0702, "step": 1890 }, { "epoch": 6.291390728476821, "grad_norm": 0.44581082463264465, "learning_rate": 9.944861860146401e-05, "loss": 0.0643, "step": 1900 }, { "epoch": 6.324503311258278, "grad_norm": 0.47844311594963074, "learning_rate": 9.943630705729719e-05, "loss": 0.0691, "step": 1910 }, { "epoch": 6.357615894039735, "grad_norm": 0.2434798777103424, "learning_rate": 9.942386035615459e-05, "loss": 0.0646, "step": 1920 }, { "epoch": 6.390728476821192, "grad_norm": 0.3171127438545227, "learning_rate": 9.941127853206503e-05, "loss": 0.066, "step": 1930 }, { "epoch": 6.423841059602649, "grad_norm": 0.2915511727333069, "learning_rate": 9.939856161942673e-05, "loss": 0.065, "step": 1940 }, { "epoch": 6.456953642384106, "grad_norm": 0.4793040156364441, "learning_rate": 9.938570965300724e-05, "loss": 0.0629, "step": 1950 }, { "epoch": 6.490066225165563, "grad_norm": 0.3607787489891052, "learning_rate": 9.937272266794335e-05, "loss": 0.06, "step": 1960 }, { "epoch": 6.52317880794702, "grad_norm": 0.34582194685935974, "learning_rate": 9.935960069974096e-05, "loss": 0.0677, "step": 1970 }, { "epoch": 6.556291390728477, "grad_norm": 0.34532251954078674, "learning_rate": 9.934634378427506e-05, "loss": 0.0633, "step": 1980 }, { "epoch": 6.589403973509934, "grad_norm": 0.30875545740127563, "learning_rate": 9.933295195778954e-05, "loss": 0.0578, "step": 1990 }, { "epoch": 6.622516556291391, "grad_norm": 0.5000781416893005, "learning_rate": 9.931942525689715e-05, "loss": 0.0624, "step": 2000 }, { "epoch": 6.655629139072848, "grad_norm": 0.41086098551750183, "learning_rate": 9.930576371857936e-05, "loss": 0.0638, "step": 2010 }, { "epoch": 6.688741721854305, "grad_norm": 0.4273587465286255, "learning_rate": 9.929196738018629e-05, "loss": 0.0652, "step": 2020 }, { "epoch": 6.7218543046357615, "grad_norm": 0.3414785861968994, "learning_rate": 9.927803627943662e-05, "loss": 0.0628, "step": 2030 }, { "epoch": 6.754966887417218, "grad_norm": 0.21039006114006042, "learning_rate": 9.926397045441744e-05, "loss": 0.0616, "step": 2040 }, { "epoch": 6.788079470198675, "grad_norm": 0.244539275765419, "learning_rate": 9.924976994358417e-05, "loss": 0.0613, "step": 2050 }, { "epoch": 6.821192052980132, "grad_norm": 0.30621302127838135, "learning_rate": 9.923543478576048e-05, "loss": 0.0629, "step": 2060 }, { "epoch": 6.854304635761589, "grad_norm": 0.3515874743461609, "learning_rate": 9.922096502013813e-05, "loss": 0.0624, "step": 2070 }, { "epoch": 6.887417218543046, "grad_norm": 0.32597121596336365, "learning_rate": 9.92063606862769e-05, "loss": 0.0597, "step": 2080 }, { "epoch": 6.920529801324504, "grad_norm": 0.2507514953613281, "learning_rate": 9.919162182410453e-05, "loss": 0.0659, "step": 2090 }, { "epoch": 6.95364238410596, "grad_norm": 0.49123650789260864, "learning_rate": 9.917674847391645e-05, "loss": 0.067, "step": 2100 }, { "epoch": 6.986754966887418, "grad_norm": 0.3320463299751282, "learning_rate": 9.916174067637584e-05, "loss": 0.0612, "step": 2110 }, { "epoch": 7.0198675496688745, "grad_norm": 0.5160515904426575, "learning_rate": 9.914659847251348e-05, "loss": 0.0594, "step": 2120 }, { "epoch": 7.052980132450331, "grad_norm": 0.27490440011024475, "learning_rate": 9.913132190372753e-05, "loss": 0.0676, "step": 2130 }, { "epoch": 7.086092715231788, "grad_norm": 0.4232550263404846, "learning_rate": 9.911591101178359e-05, "loss": 0.0639, "step": 2140 }, { "epoch": 7.119205298013245, "grad_norm": 0.26283782720565796, "learning_rate": 9.910036583881443e-05, "loss": 0.0598, "step": 2150 }, { "epoch": 7.152317880794702, "grad_norm": 0.2822170853614807, "learning_rate": 9.908468642731995e-05, "loss": 0.0608, "step": 2160 }, { "epoch": 7.185430463576159, "grad_norm": 0.19591639935970306, "learning_rate": 9.906887282016707e-05, "loss": 0.0608, "step": 2170 }, { "epoch": 7.218543046357616, "grad_norm": 0.28580793738365173, "learning_rate": 9.90529250605896e-05, "loss": 0.0633, "step": 2180 }, { "epoch": 7.251655629139073, "grad_norm": 0.3640088737010956, "learning_rate": 9.903684319218809e-05, "loss": 0.0581, "step": 2190 }, { "epoch": 7.28476821192053, "grad_norm": 0.44236427545547485, "learning_rate": 9.902062725892976e-05, "loss": 0.0664, "step": 2200 }, { "epoch": 7.317880794701987, "grad_norm": 0.22968073189258575, "learning_rate": 9.900427730514834e-05, "loss": 0.0628, "step": 2210 }, { "epoch": 7.3509933774834435, "grad_norm": 0.2978042960166931, "learning_rate": 9.8987793375544e-05, "loss": 0.0625, "step": 2220 }, { "epoch": 7.3841059602649, "grad_norm": 0.33108794689178467, "learning_rate": 9.897117551518318e-05, "loss": 0.0628, "step": 2230 }, { "epoch": 7.417218543046357, "grad_norm": 0.21153508126735687, "learning_rate": 9.895442376949844e-05, "loss": 0.0617, "step": 2240 }, { "epoch": 7.450331125827814, "grad_norm": 0.3166658282279968, "learning_rate": 9.893753818428845e-05, "loss": 0.0625, "step": 2250 }, { "epoch": 7.483443708609272, "grad_norm": 0.32127508521080017, "learning_rate": 9.892051880571773e-05, "loss": 0.0631, "step": 2260 }, { "epoch": 7.516556291390728, "grad_norm": 0.6129347681999207, "learning_rate": 9.890336568031663e-05, "loss": 0.0592, "step": 2270 }, { "epoch": 7.549668874172186, "grad_norm": 0.5134275555610657, "learning_rate": 9.888607885498113e-05, "loss": 0.0575, "step": 2280 }, { "epoch": 7.582781456953643, "grad_norm": 0.43336644768714905, "learning_rate": 9.886865837697275e-05, "loss": 0.0579, "step": 2290 }, { "epoch": 7.6158940397351, "grad_norm": 0.30430659651756287, "learning_rate": 9.88511042939184e-05, "loss": 0.0584, "step": 2300 }, { "epoch": 7.6490066225165565, "grad_norm": 0.36088964343070984, "learning_rate": 9.883341665381028e-05, "loss": 0.0604, "step": 2310 }, { "epoch": 7.682119205298013, "grad_norm": 0.44607266783714294, "learning_rate": 9.881559550500575e-05, "loss": 0.0596, "step": 2320 }, { "epoch": 7.71523178807947, "grad_norm": 0.34343472123146057, "learning_rate": 9.879764089622712e-05, "loss": 0.0631, "step": 2330 }, { "epoch": 7.748344370860927, "grad_norm": 0.5981128215789795, "learning_rate": 9.87795528765616e-05, "loss": 0.0651, "step": 2340 }, { "epoch": 7.781456953642384, "grad_norm": 0.3051721751689911, "learning_rate": 9.876133149546118e-05, "loss": 0.0643, "step": 2350 }, { "epoch": 7.814569536423841, "grad_norm": 0.49914315342903137, "learning_rate": 9.874297680274238e-05, "loss": 0.0646, "step": 2360 }, { "epoch": 7.847682119205298, "grad_norm": 0.26899829506874084, "learning_rate": 9.872448884858624e-05, "loss": 0.0665, "step": 2370 }, { "epoch": 7.880794701986755, "grad_norm": 0.3715449869632721, "learning_rate": 9.870586768353815e-05, "loss": 0.065, "step": 2380 }, { "epoch": 7.913907284768212, "grad_norm": 0.4118329882621765, "learning_rate": 9.868711335850764e-05, "loss": 0.0651, "step": 2390 }, { "epoch": 7.947019867549669, "grad_norm": 0.3028053939342499, "learning_rate": 9.866822592476833e-05, "loss": 0.063, "step": 2400 }, { "epoch": 7.9801324503311255, "grad_norm": 0.40981510281562805, "learning_rate": 9.86492054339577e-05, "loss": 0.0647, "step": 2410 }, { "epoch": 8.013245033112582, "grad_norm": 0.6474801301956177, "learning_rate": 9.863005193807711e-05, "loss": 0.0663, "step": 2420 }, { "epoch": 8.04635761589404, "grad_norm": 0.3966490626335144, "learning_rate": 9.861076548949143e-05, "loss": 0.0617, "step": 2430 }, { "epoch": 8.079470198675496, "grad_norm": 0.35986804962158203, "learning_rate": 9.859134614092912e-05, "loss": 0.0633, "step": 2440 }, { "epoch": 8.112582781456954, "grad_norm": 0.38414672017097473, "learning_rate": 9.857179394548191e-05, "loss": 0.0628, "step": 2450 }, { "epoch": 8.14569536423841, "grad_norm": 0.5009891986846924, "learning_rate": 9.855210895660477e-05, "loss": 0.0566, "step": 2460 }, { "epoch": 8.178807947019868, "grad_norm": 0.35315462946891785, "learning_rate": 9.853229122811568e-05, "loss": 0.0598, "step": 2470 }, { "epoch": 8.211920529801324, "grad_norm": 0.35116562247276306, "learning_rate": 9.851234081419559e-05, "loss": 0.0574, "step": 2480 }, { "epoch": 8.245033112582782, "grad_norm": 0.25362998247146606, "learning_rate": 9.849225776938814e-05, "loss": 0.0618, "step": 2490 }, { "epoch": 8.278145695364238, "grad_norm": 0.4382838308811188, "learning_rate": 9.847204214859964e-05, "loss": 0.0609, "step": 2500 }, { "epoch": 8.311258278145695, "grad_norm": 0.22599613666534424, "learning_rate": 9.845169400709879e-05, "loss": 0.0622, "step": 2510 }, { "epoch": 8.344370860927153, "grad_norm": 0.31164127588272095, "learning_rate": 9.843121340051664e-05, "loss": 0.0576, "step": 2520 }, { "epoch": 8.37748344370861, "grad_norm": 0.1694200336933136, "learning_rate": 9.841060038484641e-05, "loss": 0.0598, "step": 2530 }, { "epoch": 8.410596026490067, "grad_norm": 0.45784375071525574, "learning_rate": 9.838985501644328e-05, "loss": 0.0621, "step": 2540 }, { "epoch": 8.443708609271523, "grad_norm": 0.34973156452178955, "learning_rate": 9.83689773520243e-05, "loss": 0.0623, "step": 2550 }, { "epoch": 8.47682119205298, "grad_norm": 0.24540765583515167, "learning_rate": 9.834796744866819e-05, "loss": 0.0653, "step": 2560 }, { "epoch": 8.509933774834437, "grad_norm": 0.40534868836402893, "learning_rate": 9.832682536381525e-05, "loss": 0.0611, "step": 2570 }, { "epoch": 8.543046357615895, "grad_norm": 0.29260650277137756, "learning_rate": 9.830555115526711e-05, "loss": 0.0604, "step": 2580 }, { "epoch": 8.57615894039735, "grad_norm": 0.2636522948741913, "learning_rate": 9.828414488118667e-05, "loss": 0.0586, "step": 2590 }, { "epoch": 8.609271523178808, "grad_norm": 0.25658273696899414, "learning_rate": 9.826260660009785e-05, "loss": 0.0603, "step": 2600 }, { "epoch": 8.642384105960264, "grad_norm": 0.25631648302078247, "learning_rate": 9.824093637088547e-05, "loss": 0.0627, "step": 2610 }, { "epoch": 8.675496688741722, "grad_norm": 0.3292628824710846, "learning_rate": 9.821913425279514e-05, "loss": 0.0651, "step": 2620 }, { "epoch": 8.708609271523178, "grad_norm": 0.4831010401248932, "learning_rate": 9.8197200305433e-05, "loss": 0.0605, "step": 2630 }, { "epoch": 8.741721854304636, "grad_norm": 0.2148185819387436, "learning_rate": 9.817513458876564e-05, "loss": 0.0612, "step": 2640 }, { "epoch": 8.774834437086092, "grad_norm": 0.3227541744709015, "learning_rate": 9.815293716311987e-05, "loss": 0.0587, "step": 2650 }, { "epoch": 8.80794701986755, "grad_norm": 0.3363269865512848, "learning_rate": 9.813060808918262e-05, "loss": 0.0575, "step": 2660 }, { "epoch": 8.841059602649006, "grad_norm": 0.46669331192970276, "learning_rate": 9.810814742800069e-05, "loss": 0.0612, "step": 2670 }, { "epoch": 8.874172185430464, "grad_norm": 0.42400455474853516, "learning_rate": 9.808555524098074e-05, "loss": 0.0588, "step": 2680 }, { "epoch": 8.90728476821192, "grad_norm": 0.38816598057746887, "learning_rate": 9.806283158988887e-05, "loss": 0.0678, "step": 2690 }, { "epoch": 8.940397350993377, "grad_norm": 0.3489267826080322, "learning_rate": 9.803997653685072e-05, "loss": 0.0608, "step": 2700 }, { "epoch": 8.973509933774835, "grad_norm": 0.3504479229450226, "learning_rate": 9.801699014435112e-05, "loss": 0.0602, "step": 2710 }, { "epoch": 9.006622516556291, "grad_norm": 0.40530872344970703, "learning_rate": 9.799387247523398e-05, "loss": 0.056, "step": 2720 }, { "epoch": 9.039735099337749, "grad_norm": 0.30909809470176697, "learning_rate": 9.797062359270215e-05, "loss": 0.0632, "step": 2730 }, { "epoch": 9.072847682119205, "grad_norm": 0.38244327902793884, "learning_rate": 9.794724356031715e-05, "loss": 0.056, "step": 2740 }, { "epoch": 9.105960264900663, "grad_norm": 0.2798317074775696, "learning_rate": 9.792373244199913e-05, "loss": 0.0623, "step": 2750 }, { "epoch": 9.139072847682119, "grad_norm": 0.26110270619392395, "learning_rate": 9.790009030202658e-05, "loss": 0.0592, "step": 2760 }, { "epoch": 9.172185430463577, "grad_norm": 0.3762434422969818, "learning_rate": 9.78763172050362e-05, "loss": 0.0613, "step": 2770 }, { "epoch": 9.205298013245033, "grad_norm": 0.4685998857021332, "learning_rate": 9.785241321602274e-05, "loss": 0.0579, "step": 2780 }, { "epoch": 9.23841059602649, "grad_norm": 0.4478963315486908, "learning_rate": 9.782837840033879e-05, "loss": 0.0625, "step": 2790 }, { "epoch": 9.271523178807946, "grad_norm": 0.42982766032218933, "learning_rate": 9.780421282369461e-05, "loss": 0.0593, "step": 2800 }, { "epoch": 9.304635761589404, "grad_norm": 0.2951209545135498, "learning_rate": 9.777991655215797e-05, "loss": 0.0621, "step": 2810 }, { "epoch": 9.33774834437086, "grad_norm": 0.26976823806762695, "learning_rate": 9.775548965215394e-05, "loss": 0.0577, "step": 2820 }, { "epoch": 9.370860927152318, "grad_norm": 0.2779231369495392, "learning_rate": 9.773093219046474e-05, "loss": 0.0599, "step": 2830 }, { "epoch": 9.403973509933774, "grad_norm": 0.2434268444776535, "learning_rate": 9.770624423422954e-05, "loss": 0.0613, "step": 2840 }, { "epoch": 9.437086092715232, "grad_norm": 0.5225670337677002, "learning_rate": 9.768142585094426e-05, "loss": 0.0633, "step": 2850 }, { "epoch": 9.47019867549669, "grad_norm": 0.2580816447734833, "learning_rate": 9.765647710846142e-05, "loss": 0.0592, "step": 2860 }, { "epoch": 9.503311258278146, "grad_norm": 0.24676990509033203, "learning_rate": 9.763139807498991e-05, "loss": 0.0621, "step": 2870 }, { "epoch": 9.536423841059603, "grad_norm": 0.3819582760334015, "learning_rate": 9.760618881909487e-05, "loss": 0.0567, "step": 2880 }, { "epoch": 9.56953642384106, "grad_norm": 0.22858983278274536, "learning_rate": 9.758084940969744e-05, "loss": 0.0585, "step": 2890 }, { "epoch": 9.602649006622517, "grad_norm": 0.31290411949157715, "learning_rate": 9.755537991607459e-05, "loss": 0.0594, "step": 2900 }, { "epoch": 9.635761589403973, "grad_norm": 0.5019789934158325, "learning_rate": 9.752978040785895e-05, "loss": 0.0555, "step": 2910 }, { "epoch": 9.668874172185431, "grad_norm": 0.37787845730781555, "learning_rate": 9.750405095503859e-05, "loss": 0.0573, "step": 2920 }, { "epoch": 9.701986754966887, "grad_norm": 0.25667133927345276, "learning_rate": 9.747819162795686e-05, "loss": 0.0652, "step": 2930 }, { "epoch": 9.735099337748345, "grad_norm": 0.4140005111694336, "learning_rate": 9.745220249731217e-05, "loss": 0.0588, "step": 2940 }, { "epoch": 9.7682119205298, "grad_norm": 0.23670446872711182, "learning_rate": 9.742608363415781e-05, "loss": 0.0611, "step": 2950 }, { "epoch": 9.801324503311259, "grad_norm": 0.3898259699344635, "learning_rate": 9.739983510990176e-05, "loss": 0.0566, "step": 2960 }, { "epoch": 9.834437086092715, "grad_norm": 0.43201398849487305, "learning_rate": 9.737345699630647e-05, "loss": 0.0572, "step": 2970 }, { "epoch": 9.867549668874172, "grad_norm": 0.3268705904483795, "learning_rate": 9.734694936548869e-05, "loss": 0.06, "step": 2980 }, { "epoch": 9.900662251655628, "grad_norm": 0.3355279266834259, "learning_rate": 9.732031228991932e-05, "loss": 0.0555, "step": 2990 }, { "epoch": 9.933774834437086, "grad_norm": 0.21696588397026062, "learning_rate": 9.729354584242302e-05, "loss": 0.0565, "step": 3000 }, { "epoch": 9.966887417218544, "grad_norm": 0.47815877199172974, "learning_rate": 9.726665009617832e-05, "loss": 0.0559, "step": 3010 }, { "epoch": 10.0, "grad_norm": 0.3026737570762634, "learning_rate": 9.723962512471714e-05, "loss": 0.0568, "step": 3020 }, { "epoch": 10.033112582781458, "grad_norm": 0.42634478211402893, "learning_rate": 9.72124710019247e-05, "loss": 0.0544, "step": 3030 }, { "epoch": 10.066225165562914, "grad_norm": 0.3247314393520355, "learning_rate": 9.718518780203934e-05, "loss": 0.0596, "step": 3040 }, { "epoch": 10.099337748344372, "grad_norm": 0.37585797905921936, "learning_rate": 9.715777559965228e-05, "loss": 0.0595, "step": 3050 }, { "epoch": 10.132450331125828, "grad_norm": 0.4026211202144623, "learning_rate": 9.713023446970746e-05, "loss": 0.0537, "step": 3060 }, { "epoch": 10.165562913907285, "grad_norm": 0.2530674636363983, "learning_rate": 9.710256448750126e-05, "loss": 0.0581, "step": 3070 }, { "epoch": 10.198675496688741, "grad_norm": 0.2540310323238373, "learning_rate": 9.707476572868235e-05, "loss": 0.0565, "step": 3080 }, { "epoch": 10.2317880794702, "grad_norm": 0.40987494587898254, "learning_rate": 9.704683826925149e-05, "loss": 0.0597, "step": 3090 }, { "epoch": 10.264900662251655, "grad_norm": 0.31499749422073364, "learning_rate": 9.701878218556129e-05, "loss": 0.0597, "step": 3100 }, { "epoch": 10.298013245033113, "grad_norm": 0.38515767455101013, "learning_rate": 9.699059755431598e-05, "loss": 0.0558, "step": 3110 }, { "epoch": 10.331125827814569, "grad_norm": 0.2933037281036377, "learning_rate": 9.696228445257132e-05, "loss": 0.0546, "step": 3120 }, { "epoch": 10.364238410596027, "grad_norm": 0.26463568210601807, "learning_rate": 9.693384295773419e-05, "loss": 0.058, "step": 3130 }, { "epoch": 10.397350993377483, "grad_norm": 0.23882460594177246, "learning_rate": 9.690527314756259e-05, "loss": 0.0588, "step": 3140 }, { "epoch": 10.43046357615894, "grad_norm": 0.23210971057415009, "learning_rate": 9.687657510016527e-05, "loss": 0.0574, "step": 3150 }, { "epoch": 10.463576158940397, "grad_norm": 0.2671436071395874, "learning_rate": 9.684774889400161e-05, "loss": 0.0534, "step": 3160 }, { "epoch": 10.496688741721854, "grad_norm": 0.33807018399238586, "learning_rate": 9.681879460788135e-05, "loss": 0.0521, "step": 3170 }, { "epoch": 10.52980132450331, "grad_norm": 0.3162677586078644, "learning_rate": 9.67897123209644e-05, "loss": 0.0509, "step": 3180 }, { "epoch": 10.562913907284768, "grad_norm": 0.23942869901657104, "learning_rate": 9.676050211276062e-05, "loss": 0.056, "step": 3190 }, { "epoch": 10.596026490066226, "grad_norm": 0.4066106677055359, "learning_rate": 9.673116406312962e-05, "loss": 0.0565, "step": 3200 }, { "epoch": 10.629139072847682, "grad_norm": 0.2288457602262497, "learning_rate": 9.67016982522805e-05, "loss": 0.0593, "step": 3210 }, { "epoch": 10.66225165562914, "grad_norm": 0.3817801773548126, "learning_rate": 9.667210476077164e-05, "loss": 0.0559, "step": 3220 }, { "epoch": 10.695364238410596, "grad_norm": 0.355571448802948, "learning_rate": 9.664238366951055e-05, "loss": 0.0551, "step": 3230 }, { "epoch": 10.728476821192054, "grad_norm": 0.3130621016025543, "learning_rate": 9.661253505975355e-05, "loss": 0.0568, "step": 3240 }, { "epoch": 10.76158940397351, "grad_norm": 0.19243387877941132, "learning_rate": 9.658255901310557e-05, "loss": 0.0535, "step": 3250 }, { "epoch": 10.794701986754967, "grad_norm": 0.4788419306278229, "learning_rate": 9.655245561152e-05, "loss": 0.054, "step": 3260 }, { "epoch": 10.827814569536423, "grad_norm": 0.19252415001392365, "learning_rate": 9.65222249372984e-05, "loss": 0.0591, "step": 3270 }, { "epoch": 10.860927152317881, "grad_norm": 0.31182000041007996, "learning_rate": 9.649186707309026e-05, "loss": 0.0562, "step": 3280 }, { "epoch": 10.894039735099337, "grad_norm": 0.2479945570230484, "learning_rate": 9.646138210189283e-05, "loss": 0.0628, "step": 3290 }, { "epoch": 10.927152317880795, "grad_norm": 0.3453010618686676, "learning_rate": 9.643077010705087e-05, "loss": 0.0608, "step": 3300 }, { "epoch": 10.960264900662251, "grad_norm": 0.35571587085723877, "learning_rate": 9.640003117225637e-05, "loss": 0.0577, "step": 3310 }, { "epoch": 10.993377483443709, "grad_norm": 0.2610296607017517, "learning_rate": 9.636916538154846e-05, "loss": 0.0589, "step": 3320 }, { "epoch": 11.026490066225165, "grad_norm": 0.2517532408237457, "learning_rate": 9.633817281931296e-05, "loss": 0.0567, "step": 3330 }, { "epoch": 11.059602649006623, "grad_norm": 0.3228975832462311, "learning_rate": 9.630705357028242e-05, "loss": 0.0541, "step": 3340 }, { "epoch": 11.092715231788079, "grad_norm": 0.3950878381729126, "learning_rate": 9.627580771953563e-05, "loss": 0.0595, "step": 3350 }, { "epoch": 11.125827814569536, "grad_norm": 0.3295522928237915, "learning_rate": 9.624443535249759e-05, "loss": 0.0585, "step": 3360 }, { "epoch": 11.158940397350994, "grad_norm": 0.4770802855491638, "learning_rate": 9.621293655493913e-05, "loss": 0.0562, "step": 3370 }, { "epoch": 11.19205298013245, "grad_norm": 0.31397074460983276, "learning_rate": 9.618131141297675e-05, "loss": 0.0618, "step": 3380 }, { "epoch": 11.225165562913908, "grad_norm": 0.21584492921829224, "learning_rate": 9.614956001307242e-05, "loss": 0.0515, "step": 3390 }, { "epoch": 11.258278145695364, "grad_norm": 0.33990344405174255, "learning_rate": 9.611768244203321e-05, "loss": 0.0581, "step": 3400 }, { "epoch": 11.291390728476822, "grad_norm": 0.25237536430358887, "learning_rate": 9.60856787870112e-05, "loss": 0.0581, "step": 3410 }, { "epoch": 11.324503311258278, "grad_norm": 0.4137415885925293, "learning_rate": 9.605354913550318e-05, "loss": 0.0492, "step": 3420 }, { "epoch": 11.357615894039736, "grad_norm": 0.2941897511482239, "learning_rate": 9.602129357535037e-05, "loss": 0.0537, "step": 3430 }, { "epoch": 11.390728476821192, "grad_norm": 0.2811073362827301, "learning_rate": 9.598891219473825e-05, "loss": 0.0571, "step": 3440 }, { "epoch": 11.42384105960265, "grad_norm": 0.40189027786254883, "learning_rate": 9.595640508219625e-05, "loss": 0.0611, "step": 3450 }, { "epoch": 11.456953642384105, "grad_norm": 0.3597954213619232, "learning_rate": 9.592377232659761e-05, "loss": 0.0568, "step": 3460 }, { "epoch": 11.490066225165563, "grad_norm": 0.31246596574783325, "learning_rate": 9.589101401715904e-05, "loss": 0.058, "step": 3470 }, { "epoch": 11.52317880794702, "grad_norm": 0.27412205934524536, "learning_rate": 9.585813024344045e-05, "loss": 0.0609, "step": 3480 }, { "epoch": 11.556291390728477, "grad_norm": 0.20753033459186554, "learning_rate": 9.58251210953449e-05, "loss": 0.0574, "step": 3490 }, { "epoch": 11.589403973509933, "grad_norm": 0.20783717930316925, "learning_rate": 9.579198666311809e-05, "loss": 0.056, "step": 3500 }, { "epoch": 11.62251655629139, "grad_norm": 0.2541440725326538, "learning_rate": 9.575872703734832e-05, "loss": 0.0561, "step": 3510 }, { "epoch": 11.655629139072847, "grad_norm": 0.44244036078453064, "learning_rate": 9.572534230896611e-05, "loss": 0.0515, "step": 3520 }, { "epoch": 11.688741721854305, "grad_norm": 0.43785685300827026, "learning_rate": 9.569183256924403e-05, "loss": 0.0609, "step": 3530 }, { "epoch": 11.721854304635762, "grad_norm": 0.33773308992385864, "learning_rate": 9.565819790979646e-05, "loss": 0.052, "step": 3540 }, { "epoch": 11.754966887417218, "grad_norm": 0.5158550143241882, "learning_rate": 9.562443842257925e-05, "loss": 0.053, "step": 3550 }, { "epoch": 11.788079470198676, "grad_norm": 0.2844068706035614, "learning_rate": 9.559055419988956e-05, "loss": 0.0555, "step": 3560 }, { "epoch": 11.821192052980132, "grad_norm": 0.3475736975669861, "learning_rate": 9.555654533436557e-05, "loss": 0.0564, "step": 3570 }, { "epoch": 11.85430463576159, "grad_norm": 0.2760317623615265, "learning_rate": 9.552241191898621e-05, "loss": 0.0578, "step": 3580 }, { "epoch": 11.887417218543046, "grad_norm": 0.38130131363868713, "learning_rate": 9.548815404707092e-05, "loss": 0.0592, "step": 3590 }, { "epoch": 11.920529801324504, "grad_norm": 0.29025664925575256, "learning_rate": 9.545377181227942e-05, "loss": 0.0599, "step": 3600 }, { "epoch": 11.95364238410596, "grad_norm": 0.555117666721344, "learning_rate": 9.541926530861145e-05, "loss": 0.0539, "step": 3610 }, { "epoch": 11.986754966887418, "grad_norm": 0.377044141292572, "learning_rate": 9.538463463040645e-05, "loss": 0.0573, "step": 3620 }, { "epoch": 12.019867549668874, "grad_norm": 0.30155807733535767, "learning_rate": 9.534987987234337e-05, "loss": 0.0541, "step": 3630 }, { "epoch": 12.052980132450331, "grad_norm": 0.22209548950195312, "learning_rate": 9.53150011294404e-05, "loss": 0.0554, "step": 3640 }, { "epoch": 12.086092715231787, "grad_norm": 0.30140480399131775, "learning_rate": 9.527999849705471e-05, "loss": 0.0528, "step": 3650 }, { "epoch": 12.119205298013245, "grad_norm": 0.4048808813095093, "learning_rate": 9.524487207088213e-05, "loss": 0.0588, "step": 3660 }, { "epoch": 12.152317880794701, "grad_norm": 0.23815444111824036, "learning_rate": 9.520962194695698e-05, "loss": 0.0529, "step": 3670 }, { "epoch": 12.185430463576159, "grad_norm": 0.27641287446022034, "learning_rate": 9.517424822165175e-05, "loss": 0.0502, "step": 3680 }, { "epoch": 12.218543046357617, "grad_norm": 0.2630961239337921, "learning_rate": 9.513875099167685e-05, "loss": 0.0532, "step": 3690 }, { "epoch": 12.251655629139073, "grad_norm": 0.4269041419029236, "learning_rate": 9.510313035408035e-05, "loss": 0.059, "step": 3700 }, { "epoch": 12.28476821192053, "grad_norm": 0.480225145816803, "learning_rate": 9.506738640624775e-05, "loss": 0.0501, "step": 3710 }, { "epoch": 12.317880794701987, "grad_norm": 0.2649463713169098, "learning_rate": 9.50315192459016e-05, "loss": 0.0553, "step": 3720 }, { "epoch": 12.350993377483444, "grad_norm": 0.27226024866104126, "learning_rate": 9.499552897110136e-05, "loss": 0.0542, "step": 3730 }, { "epoch": 12.3841059602649, "grad_norm": 0.5471118092536926, "learning_rate": 9.495941568024304e-05, "loss": 0.0581, "step": 3740 }, { "epoch": 12.417218543046358, "grad_norm": 0.31663334369659424, "learning_rate": 9.492317947205904e-05, "loss": 0.056, "step": 3750 }, { "epoch": 12.450331125827814, "grad_norm": 0.18205411732196808, "learning_rate": 9.488682044561775e-05, "loss": 0.0534, "step": 3760 }, { "epoch": 12.483443708609272, "grad_norm": 0.3155178129673004, "learning_rate": 9.485033870032335e-05, "loss": 0.0594, "step": 3770 }, { "epoch": 12.516556291390728, "grad_norm": 0.3822356164455414, "learning_rate": 9.481373433591556e-05, "loss": 0.0534, "step": 3780 }, { "epoch": 12.549668874172186, "grad_norm": 0.27331840991973877, "learning_rate": 9.47770074524693e-05, "loss": 0.0576, "step": 3790 }, { "epoch": 12.582781456953642, "grad_norm": 0.34020712971687317, "learning_rate": 9.474015815039446e-05, "loss": 0.0602, "step": 3800 }, { "epoch": 12.6158940397351, "grad_norm": 0.2069326490163803, "learning_rate": 9.470318653043565e-05, "loss": 0.0572, "step": 3810 }, { "epoch": 12.649006622516556, "grad_norm": 0.30206841230392456, "learning_rate": 9.466609269367185e-05, "loss": 0.0545, "step": 3820 }, { "epoch": 12.682119205298013, "grad_norm": 0.23368656635284424, "learning_rate": 9.46288767415162e-05, "loss": 0.0584, "step": 3830 }, { "epoch": 12.71523178807947, "grad_norm": 0.32714197039604187, "learning_rate": 9.459153877571567e-05, "loss": 0.0546, "step": 3840 }, { "epoch": 12.748344370860927, "grad_norm": 0.3084418475627899, "learning_rate": 9.455407889835087e-05, "loss": 0.053, "step": 3850 }, { "epoch": 12.781456953642383, "grad_norm": 0.21246743202209473, "learning_rate": 9.451649721183564e-05, "loss": 0.0532, "step": 3860 }, { "epoch": 12.814569536423841, "grad_norm": 0.4070841670036316, "learning_rate": 9.447879381891692e-05, "loss": 0.0565, "step": 3870 }, { "epoch": 12.847682119205299, "grad_norm": 0.2797333896160126, "learning_rate": 9.444096882267428e-05, "loss": 0.0519, "step": 3880 }, { "epoch": 12.880794701986755, "grad_norm": 0.2666597366333008, "learning_rate": 9.440302232651988e-05, "loss": 0.0534, "step": 3890 }, { "epoch": 12.913907284768213, "grad_norm": 0.22058582305908203, "learning_rate": 9.436495443419795e-05, "loss": 0.0499, "step": 3900 }, { "epoch": 12.947019867549669, "grad_norm": 0.27836939692497253, "learning_rate": 9.432676524978466e-05, "loss": 0.0529, "step": 3910 }, { "epoch": 12.980132450331126, "grad_norm": 0.24895453453063965, "learning_rate": 9.42884548776878e-05, "loss": 0.059, "step": 3920 }, { "epoch": 13.013245033112582, "grad_norm": 0.38560667634010315, "learning_rate": 9.425002342264646e-05, "loss": 0.0553, "step": 3930 }, { "epoch": 13.04635761589404, "grad_norm": 0.40569379925727844, "learning_rate": 9.421147098973077e-05, "loss": 0.0485, "step": 3940 }, { "epoch": 13.079470198675496, "grad_norm": 0.28403738141059875, "learning_rate": 9.41727976843416e-05, "loss": 0.0598, "step": 3950 }, { "epoch": 13.112582781456954, "grad_norm": 0.3417736291885376, "learning_rate": 9.413400361221029e-05, "loss": 0.0556, "step": 3960 }, { "epoch": 13.14569536423841, "grad_norm": 0.25839293003082275, "learning_rate": 9.409508887939835e-05, "loss": 0.0559, "step": 3970 }, { "epoch": 13.178807947019868, "grad_norm": 0.3978862166404724, "learning_rate": 9.40560535922972e-05, "loss": 0.0549, "step": 3980 }, { "epoch": 13.211920529801324, "grad_norm": 0.7343702912330627, "learning_rate": 9.40168978576278e-05, "loss": 0.0577, "step": 3990 }, { "epoch": 13.245033112582782, "grad_norm": 0.24029166996479034, "learning_rate": 9.397762178244043e-05, "loss": 0.0628, "step": 4000 }, { "epoch": 13.278145695364238, "grad_norm": 0.3237532079219818, "learning_rate": 9.393822547411439e-05, "loss": 0.0543, "step": 4010 }, { "epoch": 13.311258278145695, "grad_norm": 0.5030490756034851, "learning_rate": 9.389870904035769e-05, "loss": 0.0588, "step": 4020 }, { "epoch": 13.344370860927153, "grad_norm": 0.2587463855743408, "learning_rate": 9.385907258920672e-05, "loss": 0.0505, "step": 4030 }, { "epoch": 13.37748344370861, "grad_norm": 0.2764757573604584, "learning_rate": 9.381931622902607e-05, "loss": 0.0529, "step": 4040 }, { "epoch": 13.410596026490067, "grad_norm": 0.4327974021434784, "learning_rate": 9.377944006850807e-05, "loss": 0.0497, "step": 4050 }, { "epoch": 13.443708609271523, "grad_norm": 0.2790825366973877, "learning_rate": 9.373944421667265e-05, "loss": 0.0513, "step": 4060 }, { "epoch": 13.47682119205298, "grad_norm": 0.4502870738506317, "learning_rate": 9.369932878286691e-05, "loss": 0.0561, "step": 4070 }, { "epoch": 13.509933774834437, "grad_norm": 0.41858136653900146, "learning_rate": 9.365909387676494e-05, "loss": 0.0601, "step": 4080 }, { "epoch": 13.543046357615895, "grad_norm": 0.3691936731338501, "learning_rate": 9.361873960836744e-05, "loss": 0.056, "step": 4090 }, { "epoch": 13.57615894039735, "grad_norm": 0.47950518131256104, "learning_rate": 9.357826608800142e-05, "loss": 0.0563, "step": 4100 }, { "epoch": 13.609271523178808, "grad_norm": 0.5044728517532349, "learning_rate": 9.353767342631994e-05, "loss": 0.0554, "step": 4110 }, { "epoch": 13.642384105960264, "grad_norm": 0.26135769486427307, "learning_rate": 9.34969617343018e-05, "loss": 0.0576, "step": 4120 }, { "epoch": 13.675496688741722, "grad_norm": 0.4602676033973694, "learning_rate": 9.345613112325122e-05, "loss": 0.0507, "step": 4130 }, { "epoch": 13.708609271523178, "grad_norm": 0.3314308226108551, "learning_rate": 9.34151817047975e-05, "loss": 0.0581, "step": 4140 }, { "epoch": 13.741721854304636, "grad_norm": 0.30388274788856506, "learning_rate": 9.33741135908948e-05, "loss": 0.054, "step": 4150 }, { "epoch": 13.774834437086092, "grad_norm": 0.37620946764945984, "learning_rate": 9.33329268938218e-05, "loss": 0.0531, "step": 4160 }, { "epoch": 13.80794701986755, "grad_norm": 0.6620563268661499, "learning_rate": 9.329162172618132e-05, "loss": 0.0541, "step": 4170 }, { "epoch": 13.841059602649006, "grad_norm": 0.3122325539588928, "learning_rate": 9.325019820090013e-05, "loss": 0.0534, "step": 4180 }, { "epoch": 13.874172185430464, "grad_norm": 0.34020987153053284, "learning_rate": 9.320865643122855e-05, "loss": 0.0477, "step": 4190 }, { "epoch": 13.90728476821192, "grad_norm": 0.27751025557518005, "learning_rate": 9.316699653074023e-05, "loss": 0.0557, "step": 4200 }, { "epoch": 13.940397350993377, "grad_norm": 0.22059102356433868, "learning_rate": 9.312521861333172e-05, "loss": 0.0487, "step": 4210 }, { "epoch": 13.973509933774835, "grad_norm": 0.33522048592567444, "learning_rate": 9.308332279322224e-05, "loss": 0.0546, "step": 4220 }, { "epoch": 14.006622516556291, "grad_norm": 0.26130959391593933, "learning_rate": 9.304130918495338e-05, "loss": 0.0505, "step": 4230 }, { "epoch": 14.039735099337749, "grad_norm": 0.20113135874271393, "learning_rate": 9.299917790338874e-05, "loss": 0.0485, "step": 4240 }, { "epoch": 14.072847682119205, "grad_norm": 0.34097424149513245, "learning_rate": 9.295692906371363e-05, "loss": 0.0548, "step": 4250 }, { "epoch": 14.105960264900663, "grad_norm": 0.39684292674064636, "learning_rate": 9.291456278143476e-05, "loss": 0.053, "step": 4260 }, { "epoch": 14.139072847682119, "grad_norm": 0.36868414282798767, "learning_rate": 9.287207917237994e-05, "loss": 0.0499, "step": 4270 }, { "epoch": 14.172185430463577, "grad_norm": 0.20515242218971252, "learning_rate": 9.282947835269773e-05, "loss": 0.0538, "step": 4280 }, { "epoch": 14.205298013245033, "grad_norm": 0.2462504804134369, "learning_rate": 9.278676043885715e-05, "loss": 0.053, "step": 4290 }, { "epoch": 14.23841059602649, "grad_norm": 0.22033865749835968, "learning_rate": 9.274392554764733e-05, "loss": 0.0482, "step": 4300 }, { "epoch": 14.271523178807946, "grad_norm": 0.43178895115852356, "learning_rate": 9.270097379617723e-05, "loss": 0.0524, "step": 4310 }, { "epoch": 14.304635761589404, "grad_norm": 0.19731228053569794, "learning_rate": 9.26579053018753e-05, "loss": 0.0519, "step": 4320 }, { "epoch": 14.33774834437086, "grad_norm": 0.22813649475574493, "learning_rate": 9.261472018248918e-05, "loss": 0.0554, "step": 4330 }, { "epoch": 14.370860927152318, "grad_norm": 0.41562557220458984, "learning_rate": 9.25714185560853e-05, "loss": 0.0525, "step": 4340 }, { "epoch": 14.403973509933774, "grad_norm": 0.2197459638118744, "learning_rate": 9.252800054104868e-05, "loss": 0.0533, "step": 4350 }, { "epoch": 14.437086092715232, "grad_norm": 0.3540545105934143, "learning_rate": 9.248446625608252e-05, "loss": 0.0537, "step": 4360 }, { "epoch": 14.47019867549669, "grad_norm": 0.2596363127231598, "learning_rate": 9.244081582020789e-05, "loss": 0.055, "step": 4370 }, { "epoch": 14.503311258278146, "grad_norm": 0.49597832560539246, "learning_rate": 9.239704935276339e-05, "loss": 0.0552, "step": 4380 }, { "epoch": 14.536423841059603, "grad_norm": 0.346963107585907, "learning_rate": 9.235316697340489e-05, "loss": 0.0566, "step": 4390 }, { "epoch": 14.56953642384106, "grad_norm": 0.32065990567207336, "learning_rate": 9.230916880210512e-05, "loss": 0.0581, "step": 4400 }, { "epoch": 14.602649006622517, "grad_norm": 0.3738252520561218, "learning_rate": 9.226505495915342e-05, "loss": 0.0576, "step": 4410 }, { "epoch": 14.635761589403973, "grad_norm": 0.2920328974723816, "learning_rate": 9.222082556515536e-05, "loss": 0.0549, "step": 4420 }, { "epoch": 14.668874172185431, "grad_norm": 0.2182052880525589, "learning_rate": 9.217648074103242e-05, "loss": 0.0545, "step": 4430 }, { "epoch": 14.701986754966887, "grad_norm": 0.22154872119426727, "learning_rate": 9.213202060802161e-05, "loss": 0.0498, "step": 4440 }, { "epoch": 14.735099337748345, "grad_norm": 0.3167935609817505, "learning_rate": 9.208744528767528e-05, "loss": 0.053, "step": 4450 }, { "epoch": 14.7682119205298, "grad_norm": 0.266191691160202, "learning_rate": 9.204275490186064e-05, "loss": 0.051, "step": 4460 }, { "epoch": 14.801324503311259, "grad_norm": 0.30615562200546265, "learning_rate": 9.199794957275949e-05, "loss": 0.0487, "step": 4470 }, { "epoch": 14.834437086092715, "grad_norm": 0.25950491428375244, "learning_rate": 9.19530294228679e-05, "loss": 0.0488, "step": 4480 }, { "epoch": 14.867549668874172, "grad_norm": 0.40980663895606995, "learning_rate": 9.190799457499583e-05, "loss": 0.0519, "step": 4490 }, { "epoch": 14.900662251655628, "grad_norm": 0.301404744386673, "learning_rate": 9.186284515226686e-05, "loss": 0.0536, "step": 4500 }, { "epoch": 14.933774834437086, "grad_norm": 0.4593862295150757, "learning_rate": 9.181758127811777e-05, "loss": 0.0514, "step": 4510 }, { "epoch": 14.966887417218544, "grad_norm": 0.3321060538291931, "learning_rate": 9.177220307629825e-05, "loss": 0.0488, "step": 4520 }, { "epoch": 15.0, "grad_norm": 0.3440271019935608, "learning_rate": 9.172671067087059e-05, "loss": 0.0495, "step": 4530 }, { "epoch": 15.033112582781458, "grad_norm": 0.3458652198314667, "learning_rate": 9.16811041862093e-05, "loss": 0.0487, "step": 4540 }, { "epoch": 15.066225165562914, "grad_norm": 0.3206164836883545, "learning_rate": 9.163538374700076e-05, "loss": 0.055, "step": 4550 }, { "epoch": 15.099337748344372, "grad_norm": 0.2858612537384033, "learning_rate": 9.158954947824287e-05, "loss": 0.0529, "step": 4560 }, { "epoch": 15.132450331125828, "grad_norm": 0.1733393520116806, "learning_rate": 9.154360150524482e-05, "loss": 0.0448, "step": 4570 }, { "epoch": 15.165562913907285, "grad_norm": 0.37576815485954285, "learning_rate": 9.14975399536266e-05, "loss": 0.0523, "step": 4580 }, { "epoch": 15.198675496688741, "grad_norm": 0.3717908263206482, "learning_rate": 9.14513649493187e-05, "loss": 0.054, "step": 4590 }, { "epoch": 15.2317880794702, "grad_norm": 0.16278734803199768, "learning_rate": 9.140507661856187e-05, "loss": 0.0553, "step": 4600 }, { "epoch": 15.264900662251655, "grad_norm": 0.2441447228193283, "learning_rate": 9.135867508790661e-05, "loss": 0.0519, "step": 4610 }, { "epoch": 15.298013245033113, "grad_norm": 0.3409784138202667, "learning_rate": 9.131216048421291e-05, "loss": 0.0526, "step": 4620 }, { "epoch": 15.331125827814569, "grad_norm": 0.3123800456523895, "learning_rate": 9.126553293464998e-05, "loss": 0.0499, "step": 4630 }, { "epoch": 15.364238410596027, "grad_norm": 0.7394290566444397, "learning_rate": 9.121879256669572e-05, "loss": 0.053, "step": 4640 }, { "epoch": 15.397350993377483, "grad_norm": 0.21700596809387207, "learning_rate": 9.117193950813652e-05, "loss": 0.0502, "step": 4650 }, { "epoch": 15.43046357615894, "grad_norm": 0.19841018319129944, "learning_rate": 9.112497388706685e-05, "loss": 0.0482, "step": 4660 }, { "epoch": 15.463576158940397, "grad_norm": 0.316501259803772, "learning_rate": 9.10778958318889e-05, "loss": 0.0499, "step": 4670 }, { "epoch": 15.496688741721854, "grad_norm": 0.33726245164871216, "learning_rate": 9.103070547131232e-05, "loss": 0.0537, "step": 4680 }, { "epoch": 15.52980132450331, "grad_norm": 0.6032313704490662, "learning_rate": 9.098340293435375e-05, "loss": 0.0556, "step": 4690 }, { "epoch": 15.562913907284768, "grad_norm": 0.5185425877571106, "learning_rate": 9.093598835033649e-05, "loss": 0.0559, "step": 4700 }, { "epoch": 15.596026490066226, "grad_norm": 0.4050739109516144, "learning_rate": 9.088846184889021e-05, "loss": 0.0481, "step": 4710 }, { "epoch": 15.629139072847682, "grad_norm": 0.2776203453540802, "learning_rate": 9.084082355995057e-05, "loss": 0.0528, "step": 4720 }, { "epoch": 15.66225165562914, "grad_norm": 0.30099448561668396, "learning_rate": 9.079307361375882e-05, "loss": 0.0522, "step": 4730 }, { "epoch": 15.695364238410596, "grad_norm": 0.344683974981308, "learning_rate": 9.074521214086149e-05, "loss": 0.05, "step": 4740 }, { "epoch": 15.728476821192054, "grad_norm": 0.27057892084121704, "learning_rate": 9.069723927211001e-05, "loss": 0.0507, "step": 4750 }, { "epoch": 15.76158940397351, "grad_norm": 0.3077874779701233, "learning_rate": 9.064915513866037e-05, "loss": 0.0539, "step": 4760 }, { "epoch": 15.794701986754967, "grad_norm": 0.1907300055027008, "learning_rate": 9.060095987197279e-05, "loss": 0.0544, "step": 4770 }, { "epoch": 15.827814569536423, "grad_norm": 0.24715468287467957, "learning_rate": 9.055265360381126e-05, "loss": 0.055, "step": 4780 }, { "epoch": 15.860927152317881, "grad_norm": 0.32824012637138367, "learning_rate": 9.050423646624326e-05, "loss": 0.052, "step": 4790 }, { "epoch": 15.894039735099337, "grad_norm": 0.28079748153686523, "learning_rate": 9.045570859163943e-05, "loss": 0.049, "step": 4800 }, { "epoch": 15.927152317880795, "grad_norm": 0.30994635820388794, "learning_rate": 9.04070701126731e-05, "loss": 0.0477, "step": 4810 }, { "epoch": 15.960264900662251, "grad_norm": 0.42453500628471375, "learning_rate": 9.035832116232001e-05, "loss": 0.0548, "step": 4820 }, { "epoch": 15.993377483443709, "grad_norm": 0.18040865659713745, "learning_rate": 9.030946187385796e-05, "loss": 0.0489, "step": 4830 }, { "epoch": 16.026490066225165, "grad_norm": 0.3745681643486023, "learning_rate": 9.026049238086635e-05, "loss": 0.0498, "step": 4840 }, { "epoch": 16.05960264900662, "grad_norm": 0.2618829011917114, "learning_rate": 9.021141281722591e-05, "loss": 0.0489, "step": 4850 }, { "epoch": 16.09271523178808, "grad_norm": 0.22814440727233887, "learning_rate": 9.01622233171183e-05, "loss": 0.0479, "step": 4860 }, { "epoch": 16.125827814569536, "grad_norm": 0.2320789396762848, "learning_rate": 9.011292401502574e-05, "loss": 0.0504, "step": 4870 }, { "epoch": 16.158940397350992, "grad_norm": 0.2136622816324234, "learning_rate": 9.006351504573063e-05, "loss": 0.0515, "step": 4880 }, { "epoch": 16.192052980132452, "grad_norm": 0.2421858161687851, "learning_rate": 9.001399654431519e-05, "loss": 0.0475, "step": 4890 }, { "epoch": 16.225165562913908, "grad_norm": 0.2304915338754654, "learning_rate": 8.996436864616116e-05, "loss": 0.0475, "step": 4900 }, { "epoch": 16.258278145695364, "grad_norm": 0.20679974555969238, "learning_rate": 8.991463148694925e-05, "loss": 0.0459, "step": 4910 }, { "epoch": 16.29139072847682, "grad_norm": 0.19499890506267548, "learning_rate": 8.986478520265902e-05, "loss": 0.0492, "step": 4920 }, { "epoch": 16.32450331125828, "grad_norm": 0.23162683844566345, "learning_rate": 8.981482992956827e-05, "loss": 0.0509, "step": 4930 }, { "epoch": 16.357615894039736, "grad_norm": 0.42436715960502625, "learning_rate": 8.976476580425282e-05, "loss": 0.0506, "step": 4940 }, { "epoch": 16.39072847682119, "grad_norm": 0.17514273524284363, "learning_rate": 8.971459296358606e-05, "loss": 0.0525, "step": 4950 }, { "epoch": 16.423841059602648, "grad_norm": 0.22839798033237457, "learning_rate": 8.966431154473864e-05, "loss": 0.0501, "step": 4960 }, { "epoch": 16.456953642384107, "grad_norm": 0.2785276174545288, "learning_rate": 8.961392168517803e-05, "loss": 0.0486, "step": 4970 }, { "epoch": 16.490066225165563, "grad_norm": 0.2775113582611084, "learning_rate": 8.956342352266821e-05, "loss": 0.0544, "step": 4980 }, { "epoch": 16.52317880794702, "grad_norm": 0.4839721620082855, "learning_rate": 8.95128171952692e-05, "loss": 0.0548, "step": 4990 }, { "epoch": 16.556291390728475, "grad_norm": 0.27179333567619324, "learning_rate": 8.946210284133676e-05, "loss": 0.0525, "step": 5000 }, { "epoch": 16.589403973509935, "grad_norm": 0.2629265785217285, "learning_rate": 8.941128059952201e-05, "loss": 0.052, "step": 5010 }, { "epoch": 16.62251655629139, "grad_norm": 0.4592989981174469, "learning_rate": 8.936035060877102e-05, "loss": 0.0523, "step": 5020 }, { "epoch": 16.655629139072847, "grad_norm": 0.3389127552509308, "learning_rate": 8.930931300832443e-05, "loss": 0.055, "step": 5030 }, { "epoch": 16.688741721854306, "grad_norm": 0.3871324062347412, "learning_rate": 8.925816793771711e-05, "loss": 0.0539, "step": 5040 }, { "epoch": 16.721854304635762, "grad_norm": 0.19742853939533234, "learning_rate": 8.92069155367777e-05, "loss": 0.0486, "step": 5050 }, { "epoch": 16.75496688741722, "grad_norm": 0.3362676501274109, "learning_rate": 8.915555594562834e-05, "loss": 0.0505, "step": 5060 }, { "epoch": 16.788079470198674, "grad_norm": 0.2406490445137024, "learning_rate": 8.910408930468416e-05, "loss": 0.0472, "step": 5070 }, { "epoch": 16.821192052980134, "grad_norm": 0.2260250598192215, "learning_rate": 8.905251575465303e-05, "loss": 0.0469, "step": 5080 }, { "epoch": 16.85430463576159, "grad_norm": 0.24575504660606384, "learning_rate": 8.900083543653502e-05, "loss": 0.048, "step": 5090 }, { "epoch": 16.887417218543046, "grad_norm": 0.30263665318489075, "learning_rate": 8.894904849162218e-05, "loss": 0.0505, "step": 5100 }, { "epoch": 16.920529801324502, "grad_norm": 0.24957509338855743, "learning_rate": 8.889715506149802e-05, "loss": 0.0496, "step": 5110 }, { "epoch": 16.95364238410596, "grad_norm": 0.2903924584388733, "learning_rate": 8.884515528803722e-05, "loss": 0.0501, "step": 5120 }, { "epoch": 16.986754966887418, "grad_norm": 0.2967614233493805, "learning_rate": 8.879304931340517e-05, "loss": 0.0492, "step": 5130 }, { "epoch": 17.019867549668874, "grad_norm": 0.29281461238861084, "learning_rate": 8.874083728005759e-05, "loss": 0.0478, "step": 5140 }, { "epoch": 17.05298013245033, "grad_norm": 0.26834797859191895, "learning_rate": 8.868851933074021e-05, "loss": 0.0537, "step": 5150 }, { "epoch": 17.08609271523179, "grad_norm": 0.2625283896923065, "learning_rate": 8.863609560848829e-05, "loss": 0.05, "step": 5160 }, { "epoch": 17.119205298013245, "grad_norm": 0.28712624311447144, "learning_rate": 8.85835662566263e-05, "loss": 0.0494, "step": 5170 }, { "epoch": 17.1523178807947, "grad_norm": 0.31634753942489624, "learning_rate": 8.853093141876747e-05, "loss": 0.0472, "step": 5180 }, { "epoch": 17.185430463576157, "grad_norm": 0.2730175852775574, "learning_rate": 8.847819123881343e-05, "loss": 0.0439, "step": 5190 }, { "epoch": 17.218543046357617, "grad_norm": 0.3013550937175751, "learning_rate": 8.842534586095383e-05, "loss": 0.0495, "step": 5200 }, { "epoch": 17.251655629139073, "grad_norm": 0.1916494369506836, "learning_rate": 8.837239542966593e-05, "loss": 0.0495, "step": 5210 }, { "epoch": 17.28476821192053, "grad_norm": 0.2952755093574524, "learning_rate": 8.831934008971417e-05, "loss": 0.0478, "step": 5220 }, { "epoch": 17.31788079470199, "grad_norm": 0.2587955892086029, "learning_rate": 8.826617998614982e-05, "loss": 0.0488, "step": 5230 }, { "epoch": 17.350993377483444, "grad_norm": 0.40977925062179565, "learning_rate": 8.821291526431056e-05, "loss": 0.0468, "step": 5240 }, { "epoch": 17.3841059602649, "grad_norm": 0.24762938916683197, "learning_rate": 8.815954606982015e-05, "loss": 0.0474, "step": 5250 }, { "epoch": 17.417218543046356, "grad_norm": 0.3435589373111725, "learning_rate": 8.810607254858789e-05, "loss": 0.0504, "step": 5260 }, { "epoch": 17.450331125827816, "grad_norm": 0.3456942141056061, "learning_rate": 8.805249484680838e-05, "loss": 0.0527, "step": 5270 }, { "epoch": 17.483443708609272, "grad_norm": 0.35484451055526733, "learning_rate": 8.799881311096096e-05, "loss": 0.0525, "step": 5280 }, { "epoch": 17.516556291390728, "grad_norm": 0.36420899629592896, "learning_rate": 8.794502748780949e-05, "loss": 0.0488, "step": 5290 }, { "epoch": 17.549668874172184, "grad_norm": 0.37120816111564636, "learning_rate": 8.78911381244018e-05, "loss": 0.0485, "step": 5300 }, { "epoch": 17.582781456953644, "grad_norm": 0.20918512344360352, "learning_rate": 8.783714516806933e-05, "loss": 0.0495, "step": 5310 }, { "epoch": 17.6158940397351, "grad_norm": 0.2788046896457672, "learning_rate": 8.77830487664268e-05, "loss": 0.0483, "step": 5320 }, { "epoch": 17.649006622516556, "grad_norm": 0.23161277174949646, "learning_rate": 8.772884906737167e-05, "loss": 0.0463, "step": 5330 }, { "epoch": 17.68211920529801, "grad_norm": 0.3572770059108734, "learning_rate": 8.767454621908387e-05, "loss": 0.0482, "step": 5340 }, { "epoch": 17.71523178807947, "grad_norm": 0.417128324508667, "learning_rate": 8.76201403700253e-05, "loss": 0.0503, "step": 5350 }, { "epoch": 17.748344370860927, "grad_norm": 0.32680389285087585, "learning_rate": 8.756563166893949e-05, "loss": 0.0484, "step": 5360 }, { "epoch": 17.781456953642383, "grad_norm": 0.2871386706829071, "learning_rate": 8.751102026485113e-05, "loss": 0.0472, "step": 5370 }, { "epoch": 17.814569536423843, "grad_norm": 0.3247740864753723, "learning_rate": 8.745630630706571e-05, "loss": 0.0507, "step": 5380 }, { "epoch": 17.8476821192053, "grad_norm": 0.22287116944789886, "learning_rate": 8.740148994516912e-05, "loss": 0.05, "step": 5390 }, { "epoch": 17.880794701986755, "grad_norm": 0.3685692548751831, "learning_rate": 8.73465713290272e-05, "loss": 0.0453, "step": 5400 }, { "epoch": 17.91390728476821, "grad_norm": 0.24796898663043976, "learning_rate": 8.729155060878533e-05, "loss": 0.0477, "step": 5410 }, { "epoch": 17.94701986754967, "grad_norm": 0.30694589018821716, "learning_rate": 8.723642793486809e-05, "loss": 0.0488, "step": 5420 }, { "epoch": 17.980132450331126, "grad_norm": 0.2931412160396576, "learning_rate": 8.718120345797873e-05, "loss": 0.049, "step": 5430 }, { "epoch": 18.013245033112582, "grad_norm": 0.24440760910511017, "learning_rate": 8.712587732909889e-05, "loss": 0.0476, "step": 5440 }, { "epoch": 18.04635761589404, "grad_norm": 0.3994658291339874, "learning_rate": 8.707044969948806e-05, "loss": 0.0465, "step": 5450 }, { "epoch": 18.079470198675498, "grad_norm": 0.36141183972358704, "learning_rate": 8.701492072068329e-05, "loss": 0.0488, "step": 5460 }, { "epoch": 18.112582781456954, "grad_norm": 0.31564465165138245, "learning_rate": 8.695929054449869e-05, "loss": 0.0467, "step": 5470 }, { "epoch": 18.14569536423841, "grad_norm": 0.5005706548690796, "learning_rate": 8.690355932302501e-05, "loss": 0.0482, "step": 5480 }, { "epoch": 18.178807947019866, "grad_norm": 0.3739710748195648, "learning_rate": 8.684772720862931e-05, "loss": 0.0457, "step": 5490 }, { "epoch": 18.211920529801326, "grad_norm": 0.24996857345104218, "learning_rate": 8.679179435395446e-05, "loss": 0.05, "step": 5500 }, { "epoch": 18.24503311258278, "grad_norm": 0.32123425602912903, "learning_rate": 8.673576091191874e-05, "loss": 0.0527, "step": 5510 }, { "epoch": 18.278145695364238, "grad_norm": 0.2880495488643646, "learning_rate": 8.667962703571541e-05, "loss": 0.0482, "step": 5520 }, { "epoch": 18.311258278145694, "grad_norm": 0.27904170751571655, "learning_rate": 8.662339287881238e-05, "loss": 0.0462, "step": 5530 }, { "epoch": 18.344370860927153, "grad_norm": 0.2851518988609314, "learning_rate": 8.656705859495169e-05, "loss": 0.0524, "step": 5540 }, { "epoch": 18.37748344370861, "grad_norm": 0.3574736416339874, "learning_rate": 8.651062433814912e-05, "loss": 0.0485, "step": 5550 }, { "epoch": 18.410596026490065, "grad_norm": 0.24048876762390137, "learning_rate": 8.645409026269375e-05, "loss": 0.0528, "step": 5560 }, { "epoch": 18.443708609271525, "grad_norm": 0.17228512465953827, "learning_rate": 8.639745652314759e-05, "loss": 0.0486, "step": 5570 }, { "epoch": 18.47682119205298, "grad_norm": 0.2497442364692688, "learning_rate": 8.634072327434515e-05, "loss": 0.0482, "step": 5580 }, { "epoch": 18.509933774834437, "grad_norm": 0.2625841200351715, "learning_rate": 8.628389067139294e-05, "loss": 0.049, "step": 5590 }, { "epoch": 18.543046357615893, "grad_norm": 0.2676774561405182, "learning_rate": 8.622695886966911e-05, "loss": 0.0494, "step": 5600 }, { "epoch": 18.576158940397352, "grad_norm": 0.3793989419937134, "learning_rate": 8.616992802482308e-05, "loss": 0.0538, "step": 5610 }, { "epoch": 18.60927152317881, "grad_norm": 0.2918393611907959, "learning_rate": 8.611279829277496e-05, "loss": 0.0507, "step": 5620 }, { "epoch": 18.642384105960264, "grad_norm": 0.19793328642845154, "learning_rate": 8.605556982971528e-05, "loss": 0.0529, "step": 5630 }, { "epoch": 18.67549668874172, "grad_norm": 0.39510205388069153, "learning_rate": 8.599824279210447e-05, "loss": 0.0484, "step": 5640 }, { "epoch": 18.70860927152318, "grad_norm": 0.2644934356212616, "learning_rate": 8.594081733667243e-05, "loss": 0.0484, "step": 5650 }, { "epoch": 18.741721854304636, "grad_norm": 0.29609838128089905, "learning_rate": 8.58832936204182e-05, "loss": 0.051, "step": 5660 }, { "epoch": 18.774834437086092, "grad_norm": 0.20625938475131989, "learning_rate": 8.582567180060942e-05, "loss": 0.0485, "step": 5670 }, { "epoch": 18.807947019867548, "grad_norm": 0.1631142646074295, "learning_rate": 8.576795203478194e-05, "loss": 0.0474, "step": 5680 }, { "epoch": 18.841059602649008, "grad_norm": 0.2637156844139099, "learning_rate": 8.571013448073939e-05, "loss": 0.0458, "step": 5690 }, { "epoch": 18.874172185430464, "grad_norm": 0.35963174700737, "learning_rate": 8.565221929655275e-05, "loss": 0.0461, "step": 5700 }, { "epoch": 18.90728476821192, "grad_norm": 0.3065672218799591, "learning_rate": 8.559420664055992e-05, "loss": 0.0456, "step": 5710 }, { "epoch": 18.94039735099338, "grad_norm": 0.35327818989753723, "learning_rate": 8.553609667136532e-05, "loss": 0.049, "step": 5720 }, { "epoch": 18.973509933774835, "grad_norm": 0.3045600354671478, "learning_rate": 8.547788954783936e-05, "loss": 0.0449, "step": 5730 }, { "epoch": 19.00662251655629, "grad_norm": 0.20336578786373138, "learning_rate": 8.541958542911808e-05, "loss": 0.0503, "step": 5740 }, { "epoch": 19.039735099337747, "grad_norm": 0.2766091227531433, "learning_rate": 8.536118447460275e-05, "loss": 0.045, "step": 5750 }, { "epoch": 19.072847682119207, "grad_norm": 0.3710872530937195, "learning_rate": 8.530268684395932e-05, "loss": 0.0448, "step": 5760 }, { "epoch": 19.105960264900663, "grad_norm": 0.3155226707458496, "learning_rate": 8.524409269711807e-05, "loss": 0.0479, "step": 5770 }, { "epoch": 19.13907284768212, "grad_norm": 0.28983038663864136, "learning_rate": 8.51854021942732e-05, "loss": 0.0445, "step": 5780 }, { "epoch": 19.172185430463575, "grad_norm": 0.24907587468624115, "learning_rate": 8.512661549588227e-05, "loss": 0.0481, "step": 5790 }, { "epoch": 19.205298013245034, "grad_norm": 0.3750559985637665, "learning_rate": 8.506773276266588e-05, "loss": 0.0448, "step": 5800 }, { "epoch": 19.23841059602649, "grad_norm": 0.3185857832431793, "learning_rate": 8.500875415560721e-05, "loss": 0.0472, "step": 5810 }, { "epoch": 19.271523178807946, "grad_norm": 0.37759584188461304, "learning_rate": 8.494967983595144e-05, "loss": 0.0484, "step": 5820 }, { "epoch": 19.304635761589402, "grad_norm": 0.2041579931974411, "learning_rate": 8.489050996520558e-05, "loss": 0.0497, "step": 5830 }, { "epoch": 19.337748344370862, "grad_norm": 0.208077535033226, "learning_rate": 8.483124470513775e-05, "loss": 0.0454, "step": 5840 }, { "epoch": 19.370860927152318, "grad_norm": 0.30169370770454407, "learning_rate": 8.477188421777692e-05, "loss": 0.0449, "step": 5850 }, { "epoch": 19.403973509933774, "grad_norm": 0.19728776812553406, "learning_rate": 8.47124286654124e-05, "loss": 0.0492, "step": 5860 }, { "epoch": 19.437086092715234, "grad_norm": 0.34709495306015015, "learning_rate": 8.465287821059341e-05, "loss": 0.0413, "step": 5870 }, { "epoch": 19.47019867549669, "grad_norm": 0.2884853184223175, "learning_rate": 8.45932330161286e-05, "loss": 0.044, "step": 5880 }, { "epoch": 19.503311258278146, "grad_norm": 0.2834859788417816, "learning_rate": 8.453349324508567e-05, "loss": 0.0485, "step": 5890 }, { "epoch": 19.5364238410596, "grad_norm": 0.23814116418361664, "learning_rate": 8.447365906079088e-05, "loss": 0.0469, "step": 5900 }, { "epoch": 19.56953642384106, "grad_norm": 0.28024622797966003, "learning_rate": 8.441373062682856e-05, "loss": 0.0441, "step": 5910 }, { "epoch": 19.602649006622517, "grad_norm": 0.2971237003803253, "learning_rate": 8.43537081070408e-05, "loss": 0.0479, "step": 5920 }, { "epoch": 19.635761589403973, "grad_norm": 0.2392372339963913, "learning_rate": 8.429359166552689e-05, "loss": 0.0469, "step": 5930 }, { "epoch": 19.66887417218543, "grad_norm": 0.25242742896080017, "learning_rate": 8.423338146664284e-05, "loss": 0.0499, "step": 5940 }, { "epoch": 19.70198675496689, "grad_norm": 0.16030198335647583, "learning_rate": 8.417307767500107e-05, "loss": 0.043, "step": 5950 }, { "epoch": 19.735099337748345, "grad_norm": 0.2851664125919342, "learning_rate": 8.411268045546983e-05, "loss": 0.0513, "step": 5960 }, { "epoch": 19.7682119205298, "grad_norm": 0.44885265827178955, "learning_rate": 8.405218997317281e-05, "loss": 0.0475, "step": 5970 }, { "epoch": 19.801324503311257, "grad_norm": 0.20197376608848572, "learning_rate": 8.399160639348869e-05, "loss": 0.047, "step": 5980 }, { "epoch": 19.834437086092716, "grad_norm": 0.303156316280365, "learning_rate": 8.393092988205065e-05, "loss": 0.0471, "step": 5990 }, { "epoch": 19.867549668874172, "grad_norm": 0.2801036238670349, "learning_rate": 8.387016060474597e-05, "loss": 0.0442, "step": 6000 }, { "epoch": 19.90066225165563, "grad_norm": 0.2651307284832001, "learning_rate": 8.380929872771551e-05, "loss": 0.0441, "step": 6010 }, { "epoch": 19.933774834437084, "grad_norm": 0.24375686049461365, "learning_rate": 8.374834441735335e-05, "loss": 0.0487, "step": 6020 }, { "epoch": 19.966887417218544, "grad_norm": 0.31468185782432556, "learning_rate": 8.368729784030622e-05, "loss": 0.0465, "step": 6030 }, { "epoch": 20.0, "grad_norm": 0.5277534127235413, "learning_rate": 8.362615916347315e-05, "loss": 0.05, "step": 6040 }, { "epoch": 20.033112582781456, "grad_norm": 0.21887578070163727, "learning_rate": 8.356492855400493e-05, "loss": 0.0423, "step": 6050 }, { "epoch": 20.066225165562916, "grad_norm": 0.36684879660606384, "learning_rate": 8.350360617930371e-05, "loss": 0.0446, "step": 6060 }, { "epoch": 20.09933774834437, "grad_norm": 0.4326893985271454, "learning_rate": 8.344219220702255e-05, "loss": 0.0459, "step": 6070 }, { "epoch": 20.132450331125828, "grad_norm": 0.20481698215007782, "learning_rate": 8.338068680506485e-05, "loss": 0.0455, "step": 6080 }, { "epoch": 20.165562913907284, "grad_norm": 0.5505645275115967, "learning_rate": 8.33190901415841e-05, "loss": 0.0469, "step": 6090 }, { "epoch": 20.198675496688743, "grad_norm": 0.30030176043510437, "learning_rate": 8.325740238498317e-05, "loss": 0.0458, "step": 6100 }, { "epoch": 20.2317880794702, "grad_norm": 0.25492092967033386, "learning_rate": 8.319562370391406e-05, "loss": 0.0425, "step": 6110 }, { "epoch": 20.264900662251655, "grad_norm": 0.26933717727661133, "learning_rate": 8.31337542672773e-05, "loss": 0.0481, "step": 6120 }, { "epoch": 20.29801324503311, "grad_norm": 0.2844066619873047, "learning_rate": 8.307179424422158e-05, "loss": 0.0475, "step": 6130 }, { "epoch": 20.33112582781457, "grad_norm": 0.22873665392398834, "learning_rate": 8.300974380414327e-05, "loss": 0.0465, "step": 6140 }, { "epoch": 20.364238410596027, "grad_norm": 0.20232853293418884, "learning_rate": 8.294760311668586e-05, "loss": 0.047, "step": 6150 }, { "epoch": 20.397350993377483, "grad_norm": 0.2537810206413269, "learning_rate": 8.288537235173961e-05, "loss": 0.0428, "step": 6160 }, { "epoch": 20.43046357615894, "grad_norm": 0.2643328011035919, "learning_rate": 8.282305167944108e-05, "loss": 0.0458, "step": 6170 }, { "epoch": 20.4635761589404, "grad_norm": 0.23945002257823944, "learning_rate": 8.276064127017262e-05, "loss": 0.0428, "step": 6180 }, { "epoch": 20.496688741721854, "grad_norm": 0.2556137144565582, "learning_rate": 8.269814129456189e-05, "loss": 0.0443, "step": 6190 }, { "epoch": 20.52980132450331, "grad_norm": 0.2918807864189148, "learning_rate": 8.263555192348143e-05, "loss": 0.0467, "step": 6200 }, { "epoch": 20.562913907284766, "grad_norm": 0.23681600391864777, "learning_rate": 8.257287332804819e-05, "loss": 0.0498, "step": 6210 }, { "epoch": 20.596026490066226, "grad_norm": 0.31582027673721313, "learning_rate": 8.251010567962307e-05, "loss": 0.0467, "step": 6220 }, { "epoch": 20.629139072847682, "grad_norm": 0.26111549139022827, "learning_rate": 8.244724914981041e-05, "loss": 0.0422, "step": 6230 }, { "epoch": 20.662251655629138, "grad_norm": 0.3107320964336395, "learning_rate": 8.238430391045757e-05, "loss": 0.0479, "step": 6240 }, { "epoch": 20.695364238410598, "grad_norm": 0.2813695967197418, "learning_rate": 8.232127013365445e-05, "loss": 0.0444, "step": 6250 }, { "epoch": 20.728476821192054, "grad_norm": 0.3972276449203491, "learning_rate": 8.225814799173295e-05, "loss": 0.0459, "step": 6260 }, { "epoch": 20.76158940397351, "grad_norm": 0.37497857213020325, "learning_rate": 8.219493765726663e-05, "loss": 0.0436, "step": 6270 }, { "epoch": 20.794701986754966, "grad_norm": 0.2163100391626358, "learning_rate": 8.21316393030701e-05, "loss": 0.0436, "step": 6280 }, { "epoch": 20.827814569536425, "grad_norm": 0.31823134422302246, "learning_rate": 8.206825310219865e-05, "loss": 0.0459, "step": 6290 }, { "epoch": 20.86092715231788, "grad_norm": 0.23597359657287598, "learning_rate": 8.200477922794776e-05, "loss": 0.0458, "step": 6300 }, { "epoch": 20.894039735099337, "grad_norm": 0.35591644048690796, "learning_rate": 8.194121785385256e-05, "loss": 0.0419, "step": 6310 }, { "epoch": 20.927152317880793, "grad_norm": 0.2987116277217865, "learning_rate": 8.187756915368741e-05, "loss": 0.0416, "step": 6320 }, { "epoch": 20.960264900662253, "grad_norm": 0.23300248384475708, "learning_rate": 8.181383330146544e-05, "loss": 0.0446, "step": 6330 }, { "epoch": 20.99337748344371, "grad_norm": 0.37123557925224304, "learning_rate": 8.175001047143804e-05, "loss": 0.0441, "step": 6340 }, { "epoch": 21.026490066225165, "grad_norm": 0.27428799867630005, "learning_rate": 8.168610083809438e-05, "loss": 0.0452, "step": 6350 }, { "epoch": 21.05960264900662, "grad_norm": 0.30260396003723145, "learning_rate": 8.162210457616095e-05, "loss": 0.0469, "step": 6360 }, { "epoch": 21.09271523178808, "grad_norm": 0.34331098198890686, "learning_rate": 8.155802186060109e-05, "loss": 0.0434, "step": 6370 }, { "epoch": 21.125827814569536, "grad_norm": 0.36323457956314087, "learning_rate": 8.149385286661453e-05, "loss": 0.0454, "step": 6380 }, { "epoch": 21.158940397350992, "grad_norm": 0.28714850544929504, "learning_rate": 8.14295977696368e-05, "loss": 0.0433, "step": 6390 }, { "epoch": 21.192052980132452, "grad_norm": 0.23781517148017883, "learning_rate": 8.13652567453389e-05, "loss": 0.0449, "step": 6400 }, { "epoch": 21.225165562913908, "grad_norm": 0.27831393480300903, "learning_rate": 8.130082996962676e-05, "loss": 0.0523, "step": 6410 }, { "epoch": 21.258278145695364, "grad_norm": 0.3475172221660614, "learning_rate": 8.123631761864068e-05, "loss": 0.0473, "step": 6420 }, { "epoch": 21.29139072847682, "grad_norm": 0.3229403793811798, "learning_rate": 8.1171719868755e-05, "loss": 0.0452, "step": 6430 }, { "epoch": 21.32450331125828, "grad_norm": 0.24031709134578705, "learning_rate": 8.110703689657748e-05, "loss": 0.042, "step": 6440 }, { "epoch": 21.357615894039736, "grad_norm": 0.19280578196048737, "learning_rate": 8.104226887894892e-05, "loss": 0.0462, "step": 6450 }, { "epoch": 21.39072847682119, "grad_norm": 0.260720431804657, "learning_rate": 8.097741599294257e-05, "loss": 0.0441, "step": 6460 }, { "epoch": 21.423841059602648, "grad_norm": 0.20740565657615662, "learning_rate": 8.091247841586378e-05, "loss": 0.0484, "step": 6470 }, { "epoch": 21.456953642384107, "grad_norm": 0.24257272481918335, "learning_rate": 8.084745632524939e-05, "loss": 0.0429, "step": 6480 }, { "epoch": 21.490066225165563, "grad_norm": 0.49715572595596313, "learning_rate": 8.07823498988673e-05, "loss": 0.0434, "step": 6490 }, { "epoch": 21.52317880794702, "grad_norm": 0.27258336544036865, "learning_rate": 8.071715931471602e-05, "loss": 0.0433, "step": 6500 }, { "epoch": 21.556291390728475, "grad_norm": 0.326820969581604, "learning_rate": 8.06518847510241e-05, "loss": 0.0481, "step": 6510 }, { "epoch": 21.589403973509935, "grad_norm": 0.2046242356300354, "learning_rate": 8.058652638624971e-05, "loss": 0.0454, "step": 6520 }, { "epoch": 21.62251655629139, "grad_norm": 0.20394554734230042, "learning_rate": 8.052108439908013e-05, "loss": 0.0411, "step": 6530 }, { "epoch": 21.655629139072847, "grad_norm": 0.3296065032482147, "learning_rate": 8.045555896843125e-05, "loss": 0.0456, "step": 6540 }, { "epoch": 21.688741721854306, "grad_norm": 0.2835526168346405, "learning_rate": 8.03899502734471e-05, "loss": 0.0433, "step": 6550 }, { "epoch": 21.721854304635762, "grad_norm": 0.4771028757095337, "learning_rate": 8.032425849349931e-05, "loss": 0.0456, "step": 6560 }, { "epoch": 21.75496688741722, "grad_norm": 0.13579747080802917, "learning_rate": 8.025848380818674e-05, "loss": 0.0485, "step": 6570 }, { "epoch": 21.788079470198674, "grad_norm": 0.23113711178302765, "learning_rate": 8.019262639733487e-05, "loss": 0.0435, "step": 6580 }, { "epoch": 21.821192052980134, "grad_norm": 0.49168169498443604, "learning_rate": 8.012668644099531e-05, "loss": 0.0408, "step": 6590 }, { "epoch": 21.85430463576159, "grad_norm": 0.22659792006015778, "learning_rate": 8.006066411944542e-05, "loss": 0.0434, "step": 6600 }, { "epoch": 21.887417218543046, "grad_norm": 0.15820032358169556, "learning_rate": 7.999455961318769e-05, "loss": 0.0463, "step": 6610 }, { "epoch": 21.920529801324502, "grad_norm": 0.2989514470100403, "learning_rate": 7.992837310294932e-05, "loss": 0.0469, "step": 6620 }, { "epoch": 21.95364238410596, "grad_norm": 0.2574496865272522, "learning_rate": 7.986210476968167e-05, "loss": 0.0487, "step": 6630 }, { "epoch": 21.986754966887418, "grad_norm": 0.35213711857795715, "learning_rate": 7.97957547945599e-05, "loss": 0.0468, "step": 6640 }, { "epoch": 22.019867549668874, "grad_norm": 0.2406720519065857, "learning_rate": 7.972932335898226e-05, "loss": 0.0473, "step": 6650 }, { "epoch": 22.05298013245033, "grad_norm": 0.29066550731658936, "learning_rate": 7.966281064456975e-05, "loss": 0.0471, "step": 6660 }, { "epoch": 22.08609271523179, "grad_norm": 0.3156541883945465, "learning_rate": 7.959621683316563e-05, "loss": 0.0393, "step": 6670 }, { "epoch": 22.119205298013245, "grad_norm": 0.2464132457971573, "learning_rate": 7.952954210683481e-05, "loss": 0.0472, "step": 6680 }, { "epoch": 22.1523178807947, "grad_norm": 0.2683815062046051, "learning_rate": 7.946278664786345e-05, "loss": 0.041, "step": 6690 }, { "epoch": 22.185430463576157, "grad_norm": 0.265022873878479, "learning_rate": 7.939595063875842e-05, "loss": 0.0452, "step": 6700 }, { "epoch": 22.218543046357617, "grad_norm": 0.1707126796245575, "learning_rate": 7.932903426224683e-05, "loss": 0.0431, "step": 6710 }, { "epoch": 22.251655629139073, "grad_norm": 0.231646329164505, "learning_rate": 7.926203770127552e-05, "loss": 0.047, "step": 6720 }, { "epoch": 22.28476821192053, "grad_norm": 0.23318573832511902, "learning_rate": 7.919496113901046e-05, "loss": 0.0458, "step": 6730 }, { "epoch": 22.31788079470199, "grad_norm": 0.35598325729370117, "learning_rate": 7.912780475883649e-05, "loss": 0.0455, "step": 6740 }, { "epoch": 22.350993377483444, "grad_norm": 0.24500803649425507, "learning_rate": 7.906056874435652e-05, "loss": 0.0456, "step": 6750 }, { "epoch": 22.3841059602649, "grad_norm": 0.19846856594085693, "learning_rate": 7.899325327939131e-05, "loss": 0.0472, "step": 6760 }, { "epoch": 22.417218543046356, "grad_norm": 0.25153249502182007, "learning_rate": 7.892585854797872e-05, "loss": 0.0422, "step": 6770 }, { "epoch": 22.450331125827816, "grad_norm": 0.2677418291568756, "learning_rate": 7.88583847343734e-05, "loss": 0.0472, "step": 6780 }, { "epoch": 22.483443708609272, "grad_norm": 0.23182639479637146, "learning_rate": 7.879083202304616e-05, "loss": 0.0445, "step": 6790 }, { "epoch": 22.516556291390728, "grad_norm": 0.3407961428165436, "learning_rate": 7.872320059868355e-05, "loss": 0.047, "step": 6800 }, { "epoch": 22.549668874172184, "grad_norm": 0.43899449706077576, "learning_rate": 7.865549064618729e-05, "loss": 0.0447, "step": 6810 }, { "epoch": 22.582781456953644, "grad_norm": 0.3127565383911133, "learning_rate": 7.858770235067381e-05, "loss": 0.0455, "step": 6820 }, { "epoch": 22.6158940397351, "grad_norm": 0.20309969782829285, "learning_rate": 7.851983589747374e-05, "loss": 0.0434, "step": 6830 }, { "epoch": 22.649006622516556, "grad_norm": 0.2274715155363083, "learning_rate": 7.845189147213133e-05, "loss": 0.0429, "step": 6840 }, { "epoch": 22.68211920529801, "grad_norm": 0.43601083755493164, "learning_rate": 7.838386926040407e-05, "loss": 0.0439, "step": 6850 }, { "epoch": 22.71523178807947, "grad_norm": 0.42478787899017334, "learning_rate": 7.83157694482621e-05, "loss": 0.0448, "step": 6860 }, { "epoch": 22.748344370860927, "grad_norm": 0.25616151094436646, "learning_rate": 7.824759222188768e-05, "loss": 0.0514, "step": 6870 }, { "epoch": 22.781456953642383, "grad_norm": 0.29273074865341187, "learning_rate": 7.817933776767478e-05, "loss": 0.0474, "step": 6880 }, { "epoch": 22.814569536423843, "grad_norm": 0.288250207901001, "learning_rate": 7.811100627222842e-05, "loss": 0.0464, "step": 6890 }, { "epoch": 22.8476821192053, "grad_norm": 0.34893709421157837, "learning_rate": 7.804259792236435e-05, "loss": 0.0429, "step": 6900 }, { "epoch": 22.880794701986755, "grad_norm": 0.22524340450763702, "learning_rate": 7.797411290510835e-05, "loss": 0.0441, "step": 6910 }, { "epoch": 22.91390728476821, "grad_norm": 0.2282683551311493, "learning_rate": 7.790555140769586e-05, "loss": 0.0469, "step": 6920 }, { "epoch": 22.94701986754967, "grad_norm": 0.3836049735546112, "learning_rate": 7.78369136175714e-05, "loss": 0.0415, "step": 6930 }, { "epoch": 22.980132450331126, "grad_norm": 0.285179078578949, "learning_rate": 7.776819972238806e-05, "loss": 0.0454, "step": 6940 }, { "epoch": 23.013245033112582, "grad_norm": 0.40329691767692566, "learning_rate": 7.7699409910007e-05, "loss": 0.0417, "step": 6950 }, { "epoch": 23.04635761589404, "grad_norm": 0.25568804144859314, "learning_rate": 7.763054436849694e-05, "loss": 0.0421, "step": 6960 }, { "epoch": 23.079470198675498, "grad_norm": 1.1990829706192017, "learning_rate": 7.756160328613364e-05, "loss": 0.0473, "step": 6970 }, { "epoch": 23.112582781456954, "grad_norm": 0.2340088188648224, "learning_rate": 7.749258685139942e-05, "loss": 0.0423, "step": 6980 }, { "epoch": 23.14569536423841, "grad_norm": 0.21429742872714996, "learning_rate": 7.742349525298253e-05, "loss": 0.042, "step": 6990 }, { "epoch": 23.178807947019866, "grad_norm": 0.2315322309732437, "learning_rate": 7.735432867977679e-05, "loss": 0.0425, "step": 7000 }, { "epoch": 23.211920529801326, "grad_norm": 0.3559403419494629, "learning_rate": 7.728508732088096e-05, "loss": 0.0415, "step": 7010 }, { "epoch": 23.24503311258278, "grad_norm": 0.17586569488048553, "learning_rate": 7.721577136559825e-05, "loss": 0.0454, "step": 7020 }, { "epoch": 23.278145695364238, "grad_norm": 0.3238493800163269, "learning_rate": 7.714638100343588e-05, "loss": 0.0416, "step": 7030 }, { "epoch": 23.311258278145694, "grad_norm": 0.28565070033073425, "learning_rate": 7.707691642410444e-05, "loss": 0.0473, "step": 7040 }, { "epoch": 23.344370860927153, "grad_norm": 0.27038830518722534, "learning_rate": 7.70073778175174e-05, "loss": 0.0429, "step": 7050 }, { "epoch": 23.37748344370861, "grad_norm": 0.22363823652267456, "learning_rate": 7.69377653737907e-05, "loss": 0.0436, "step": 7060 }, { "epoch": 23.410596026490065, "grad_norm": 0.27239683270454407, "learning_rate": 7.686807928324209e-05, "loss": 0.0408, "step": 7070 }, { "epoch": 23.443708609271525, "grad_norm": 0.1702318638563156, "learning_rate": 7.679831973639065e-05, "loss": 0.0465, "step": 7080 }, { "epoch": 23.47682119205298, "grad_norm": 0.21600832045078278, "learning_rate": 7.672848692395637e-05, "loss": 0.04, "step": 7090 }, { "epoch": 23.509933774834437, "grad_norm": 0.2722924053668976, "learning_rate": 7.665858103685944e-05, "loss": 0.0426, "step": 7100 }, { "epoch": 23.543046357615893, "grad_norm": 0.2611531615257263, "learning_rate": 7.658860226621991e-05, "loss": 0.0451, "step": 7110 }, { "epoch": 23.576158940397352, "grad_norm": 0.20654314756393433, "learning_rate": 7.651855080335708e-05, "loss": 0.042, "step": 7120 }, { "epoch": 23.60927152317881, "grad_norm": 0.2541739344596863, "learning_rate": 7.644842683978896e-05, "loss": 0.0427, "step": 7130 }, { "epoch": 23.642384105960264, "grad_norm": 0.23382434248924255, "learning_rate": 7.63782305672318e-05, "loss": 0.0487, "step": 7140 }, { "epoch": 23.67549668874172, "grad_norm": 0.23742124438285828, "learning_rate": 7.63079621775995e-05, "loss": 0.0422, "step": 7150 }, { "epoch": 23.70860927152318, "grad_norm": 0.321322500705719, "learning_rate": 7.623762186300319e-05, "loss": 0.0406, "step": 7160 }, { "epoch": 23.741721854304636, "grad_norm": 0.3420529067516327, "learning_rate": 7.616720981575057e-05, "loss": 0.0436, "step": 7170 }, { "epoch": 23.774834437086092, "grad_norm": 0.39397263526916504, "learning_rate": 7.609672622834552e-05, "loss": 0.0461, "step": 7180 }, { "epoch": 23.807947019867548, "grad_norm": 0.31546032428741455, "learning_rate": 7.602617129348747e-05, "loss": 0.0447, "step": 7190 }, { "epoch": 23.841059602649008, "grad_norm": 0.2628043293952942, "learning_rate": 7.595554520407088e-05, "loss": 0.0448, "step": 7200 }, { "epoch": 23.874172185430464, "grad_norm": 0.28553488850593567, "learning_rate": 7.588484815318484e-05, "loss": 0.0438, "step": 7210 }, { "epoch": 23.90728476821192, "grad_norm": 0.34233078360557556, "learning_rate": 7.581408033411234e-05, "loss": 0.0406, "step": 7220 }, { "epoch": 23.94039735099338, "grad_norm": 0.23430036008358002, "learning_rate": 7.574324194032995e-05, "loss": 0.0384, "step": 7230 }, { "epoch": 23.973509933774835, "grad_norm": 0.21433497965335846, "learning_rate": 7.567233316550705e-05, "loss": 0.0438, "step": 7240 }, { "epoch": 24.00662251655629, "grad_norm": 0.271681010723114, "learning_rate": 7.560135420350562e-05, "loss": 0.0485, "step": 7250 }, { "epoch": 24.039735099337747, "grad_norm": 0.2231360375881195, "learning_rate": 7.553030524837935e-05, "loss": 0.041, "step": 7260 }, { "epoch": 24.072847682119207, "grad_norm": 0.2169087678194046, "learning_rate": 7.545918649437341e-05, "loss": 0.0395, "step": 7270 }, { "epoch": 24.105960264900663, "grad_norm": 0.22116117179393768, "learning_rate": 7.538799813592377e-05, "loss": 0.042, "step": 7280 }, { "epoch": 24.13907284768212, "grad_norm": 0.24826359748840332, "learning_rate": 7.531674036765662e-05, "loss": 0.0436, "step": 7290 }, { "epoch": 24.172185430463575, "grad_norm": 0.27867329120635986, "learning_rate": 7.524541338438807e-05, "loss": 0.0418, "step": 7300 }, { "epoch": 24.205298013245034, "grad_norm": 0.23993730545043945, "learning_rate": 7.517401738112328e-05, "loss": 0.0414, "step": 7310 }, { "epoch": 24.23841059602649, "grad_norm": 0.2976135015487671, "learning_rate": 7.510255255305628e-05, "loss": 0.0424, "step": 7320 }, { "epoch": 24.271523178807946, "grad_norm": 0.3636080324649811, "learning_rate": 7.503101909556911e-05, "loss": 0.0426, "step": 7330 }, { "epoch": 24.304635761589402, "grad_norm": 0.267316997051239, "learning_rate": 7.495941720423154e-05, "loss": 0.0413, "step": 7340 }, { "epoch": 24.337748344370862, "grad_norm": 0.2943784296512604, "learning_rate": 7.488774707480042e-05, "loss": 0.0434, "step": 7350 }, { "epoch": 24.370860927152318, "grad_norm": 0.41523075103759766, "learning_rate": 7.481600890321911e-05, "loss": 0.0452, "step": 7360 }, { "epoch": 24.403973509933774, "grad_norm": 0.3176235258579254, "learning_rate": 7.474420288561708e-05, "loss": 0.0481, "step": 7370 }, { "epoch": 24.437086092715234, "grad_norm": 0.2263539582490921, "learning_rate": 7.467232921830921e-05, "loss": 0.0439, "step": 7380 }, { "epoch": 24.47019867549669, "grad_norm": 0.2514092028141022, "learning_rate": 7.460038809779537e-05, "loss": 0.043, "step": 7390 }, { "epoch": 24.503311258278146, "grad_norm": 0.5345478653907776, "learning_rate": 7.452837972075983e-05, "loss": 0.0445, "step": 7400 }, { "epoch": 24.5364238410596, "grad_norm": 0.35810139775276184, "learning_rate": 7.445630428407074e-05, "loss": 0.042, "step": 7410 }, { "epoch": 24.56953642384106, "grad_norm": 0.21379345655441284, "learning_rate": 7.43841619847796e-05, "loss": 0.0427, "step": 7420 }, { "epoch": 24.602649006622517, "grad_norm": 0.2651136815547943, "learning_rate": 7.431195302012072e-05, "loss": 0.0401, "step": 7430 }, { "epoch": 24.635761589403973, "grad_norm": 0.27217385172843933, "learning_rate": 7.423967758751061e-05, "loss": 0.0367, "step": 7440 }, { "epoch": 24.66887417218543, "grad_norm": 0.3457459509372711, "learning_rate": 7.416733588454758e-05, "loss": 0.0388, "step": 7450 }, { "epoch": 24.70198675496689, "grad_norm": 0.2479846030473709, "learning_rate": 7.409492810901106e-05, "loss": 0.0395, "step": 7460 }, { "epoch": 24.735099337748345, "grad_norm": 0.20149891078472137, "learning_rate": 7.402245445886116e-05, "loss": 0.0443, "step": 7470 }, { "epoch": 24.7682119205298, "grad_norm": 0.20882965624332428, "learning_rate": 7.394991513223806e-05, "loss": 0.0446, "step": 7480 }, { "epoch": 24.801324503311257, "grad_norm": 0.3814091682434082, "learning_rate": 7.38773103274615e-05, "loss": 0.0386, "step": 7490 }, { "epoch": 24.834437086092716, "grad_norm": 0.17472979426383972, "learning_rate": 7.380464024303028e-05, "loss": 0.0371, "step": 7500 }, { "epoch": 24.867549668874172, "grad_norm": 0.2399348020553589, "learning_rate": 7.373190507762162e-05, "loss": 0.0423, "step": 7510 }, { "epoch": 24.90066225165563, "grad_norm": 0.3357790410518646, "learning_rate": 7.365910503009066e-05, "loss": 0.0398, "step": 7520 }, { "epoch": 24.933774834437084, "grad_norm": 0.46532532572746277, "learning_rate": 7.358624029946996e-05, "loss": 0.0428, "step": 7530 }, { "epoch": 24.966887417218544, "grad_norm": 0.30107513070106506, "learning_rate": 7.351331108496893e-05, "loss": 0.0403, "step": 7540 }, { "epoch": 25.0, "grad_norm": 0.2312031388282776, "learning_rate": 7.344031758597325e-05, "loss": 0.0409, "step": 7550 }, { "epoch": 25.033112582781456, "grad_norm": 0.24951495230197906, "learning_rate": 7.336726000204435e-05, "loss": 0.04, "step": 7560 }, { "epoch": 25.066225165562916, "grad_norm": 0.22746464610099792, "learning_rate": 7.32941385329189e-05, "loss": 0.044, "step": 7570 }, { "epoch": 25.09933774834437, "grad_norm": 0.24596048891544342, "learning_rate": 7.322095337850816e-05, "loss": 0.0403, "step": 7580 }, { "epoch": 25.132450331125828, "grad_norm": 0.19115141034126282, "learning_rate": 7.314770473889758e-05, "loss": 0.0389, "step": 7590 }, { "epoch": 25.165562913907284, "grad_norm": 0.28532323241233826, "learning_rate": 7.307439281434615e-05, "loss": 0.0389, "step": 7600 }, { "epoch": 25.198675496688743, "grad_norm": 0.3170408010482788, "learning_rate": 7.300101780528585e-05, "loss": 0.041, "step": 7610 }, { "epoch": 25.2317880794702, "grad_norm": 0.21404165029525757, "learning_rate": 7.292757991232117e-05, "loss": 0.036, "step": 7620 }, { "epoch": 25.264900662251655, "grad_norm": 0.164994016289711, "learning_rate": 7.285407933622848e-05, "loss": 0.0394, "step": 7630 }, { "epoch": 25.29801324503311, "grad_norm": 0.25994402170181274, "learning_rate": 7.278051627795557e-05, "loss": 0.0427, "step": 7640 }, { "epoch": 25.33112582781457, "grad_norm": 0.29878076910972595, "learning_rate": 7.270689093862105e-05, "loss": 0.0423, "step": 7650 }, { "epoch": 25.364238410596027, "grad_norm": 0.23664246499538422, "learning_rate": 7.263320351951374e-05, "loss": 0.0396, "step": 7660 }, { "epoch": 25.397350993377483, "grad_norm": 0.22399583458900452, "learning_rate": 7.255945422209227e-05, "loss": 0.0465, "step": 7670 }, { "epoch": 25.43046357615894, "grad_norm": 0.24424488842487335, "learning_rate": 7.248564324798437e-05, "loss": 0.0408, "step": 7680 }, { "epoch": 25.4635761589404, "grad_norm": 0.1838204264640808, "learning_rate": 7.241177079898644e-05, "loss": 0.037, "step": 7690 }, { "epoch": 25.496688741721854, "grad_norm": 0.22885197401046753, "learning_rate": 7.233783707706295e-05, "loss": 0.0413, "step": 7700 }, { "epoch": 25.52980132450331, "grad_norm": 0.29734522104263306, "learning_rate": 7.226384228434586e-05, "loss": 0.0474, "step": 7710 }, { "epoch": 25.562913907284766, "grad_norm": 0.3032153248786926, "learning_rate": 7.21897866231341e-05, "loss": 0.0391, "step": 7720 }, { "epoch": 25.596026490066226, "grad_norm": 0.22387151420116425, "learning_rate": 7.211567029589303e-05, "loss": 0.0375, "step": 7730 }, { "epoch": 25.629139072847682, "grad_norm": 0.2761561870574951, "learning_rate": 7.204149350525387e-05, "loss": 0.0375, "step": 7740 }, { "epoch": 25.662251655629138, "grad_norm": 0.16986067593097687, "learning_rate": 7.196725645401309e-05, "loss": 0.0361, "step": 7750 }, { "epoch": 25.695364238410598, "grad_norm": 0.2819330096244812, "learning_rate": 7.1892959345132e-05, "loss": 0.0397, "step": 7760 }, { "epoch": 25.728476821192054, "grad_norm": 0.2068263441324234, "learning_rate": 7.181860238173605e-05, "loss": 0.0402, "step": 7770 }, { "epoch": 25.76158940397351, "grad_norm": 0.2126781940460205, "learning_rate": 7.174418576711432e-05, "loss": 0.0438, "step": 7780 }, { "epoch": 25.794701986754966, "grad_norm": 0.2722189724445343, "learning_rate": 7.1669709704719e-05, "loss": 0.0388, "step": 7790 }, { "epoch": 25.827814569536425, "grad_norm": 0.32165783643722534, "learning_rate": 7.159517439816481e-05, "loss": 0.0395, "step": 7800 }, { "epoch": 25.86092715231788, "grad_norm": 0.3317587971687317, "learning_rate": 7.152058005122842e-05, "loss": 0.0423, "step": 7810 }, { "epoch": 25.894039735099337, "grad_norm": 0.6758538484573364, "learning_rate": 7.144592686784793e-05, "loss": 0.0371, "step": 7820 }, { "epoch": 25.927152317880793, "grad_norm": 0.532970130443573, "learning_rate": 7.137121505212229e-05, "loss": 0.0375, "step": 7830 }, { "epoch": 25.960264900662253, "grad_norm": 0.225266233086586, "learning_rate": 7.129644480831077e-05, "loss": 0.0403, "step": 7840 }, { "epoch": 25.99337748344371, "grad_norm": 0.20892295241355896, "learning_rate": 7.122161634083234e-05, "loss": 0.0417, "step": 7850 }, { "epoch": 26.026490066225165, "grad_norm": 0.2937694191932678, "learning_rate": 7.114672985426516e-05, "loss": 0.0401, "step": 7860 }, { "epoch": 26.05960264900662, "grad_norm": 0.26536044478416443, "learning_rate": 7.107178555334606e-05, "loss": 0.045, "step": 7870 }, { "epoch": 26.09271523178808, "grad_norm": 0.19423232972621918, "learning_rate": 7.099678364296989e-05, "loss": 0.0407, "step": 7880 }, { "epoch": 26.125827814569536, "grad_norm": 0.20651349425315857, "learning_rate": 7.0921724328189e-05, "loss": 0.0404, "step": 7890 }, { "epoch": 26.158940397350992, "grad_norm": 0.249784916639328, "learning_rate": 7.084660781421268e-05, "loss": 0.0428, "step": 7900 }, { "epoch": 26.192052980132452, "grad_norm": 0.27849239110946655, "learning_rate": 7.077143430640662e-05, "loss": 0.0428, "step": 7910 }, { "epoch": 26.225165562913908, "grad_norm": 0.2091342806816101, "learning_rate": 7.069620401029232e-05, "loss": 0.0473, "step": 7920 }, { "epoch": 26.258278145695364, "grad_norm": 0.21407970786094666, "learning_rate": 7.062091713154655e-05, "loss": 0.0388, "step": 7930 }, { "epoch": 26.29139072847682, "grad_norm": 0.15458589792251587, "learning_rate": 7.054557387600075e-05, "loss": 0.0419, "step": 7940 }, { "epoch": 26.32450331125828, "grad_norm": 0.22888121008872986, "learning_rate": 7.04701744496405e-05, "loss": 0.042, "step": 7950 }, { "epoch": 26.357615894039736, "grad_norm": 0.2171410471200943, "learning_rate": 7.039471905860495e-05, "loss": 0.0432, "step": 7960 }, { "epoch": 26.39072847682119, "grad_norm": 0.281112402677536, "learning_rate": 7.031920790918628e-05, "loss": 0.0423, "step": 7970 }, { "epoch": 26.423841059602648, "grad_norm": 0.27414464950561523, "learning_rate": 7.024364120782906e-05, "loss": 0.0423, "step": 7980 }, { "epoch": 26.456953642384107, "grad_norm": 0.29030996561050415, "learning_rate": 7.016801916112978e-05, "loss": 0.0424, "step": 7990 }, { "epoch": 26.490066225165563, "grad_norm": 0.2741679251194, "learning_rate": 7.009234197583623e-05, "loss": 0.0393, "step": 8000 }, { "epoch": 26.52317880794702, "grad_norm": 0.258553683757782, "learning_rate": 7.001660985884692e-05, "loss": 0.0387, "step": 8010 }, { "epoch": 26.556291390728475, "grad_norm": 0.3444920778274536, "learning_rate": 6.994082301721063e-05, "loss": 0.0367, "step": 8020 }, { "epoch": 26.589403973509935, "grad_norm": 0.20599070191383362, "learning_rate": 6.986498165812563e-05, "loss": 0.0384, "step": 8030 }, { "epoch": 26.62251655629139, "grad_norm": 0.31933361291885376, "learning_rate": 6.978908598893932e-05, "loss": 0.0411, "step": 8040 }, { "epoch": 26.655629139072847, "grad_norm": 0.22647035121917725, "learning_rate": 6.971313621714756e-05, "loss": 0.0415, "step": 8050 }, { "epoch": 26.688741721854306, "grad_norm": 0.23651236295700073, "learning_rate": 6.96371325503941e-05, "loss": 0.0365, "step": 8060 }, { "epoch": 26.721854304635762, "grad_norm": 0.2775920629501343, "learning_rate": 6.956107519647014e-05, "loss": 0.0419, "step": 8070 }, { "epoch": 26.75496688741722, "grad_norm": 0.2546255886554718, "learning_rate": 6.94849643633135e-05, "loss": 0.0377, "step": 8080 }, { "epoch": 26.788079470198674, "grad_norm": 0.2640795409679413, "learning_rate": 6.940880025900834e-05, "loss": 0.0405, "step": 8090 }, { "epoch": 26.821192052980134, "grad_norm": 0.3056033253669739, "learning_rate": 6.933258309178438e-05, "loss": 0.0401, "step": 8100 }, { "epoch": 26.85430463576159, "grad_norm": 0.2560013234615326, "learning_rate": 6.925631307001646e-05, "loss": 0.0398, "step": 8110 }, { "epoch": 26.887417218543046, "grad_norm": 0.3284982144832611, "learning_rate": 6.91799904022239e-05, "loss": 0.0387, "step": 8120 }, { "epoch": 26.920529801324502, "grad_norm": 0.2504599988460541, "learning_rate": 6.910361529706997e-05, "loss": 0.0414, "step": 8130 }, { "epoch": 26.95364238410596, "grad_norm": 0.21117530763149261, "learning_rate": 6.902718796336131e-05, "loss": 0.0406, "step": 8140 }, { "epoch": 26.986754966887418, "grad_norm": 0.36615896224975586, "learning_rate": 6.895070861004729e-05, "loss": 0.0415, "step": 8150 }, { "epoch": 27.019867549668874, "grad_norm": 0.3322663903236389, "learning_rate": 6.887417744621956e-05, "loss": 0.041, "step": 8160 }, { "epoch": 27.05298013245033, "grad_norm": 0.2852466106414795, "learning_rate": 6.87975946811114e-05, "loss": 0.0401, "step": 8170 }, { "epoch": 27.08609271523179, "grad_norm": 0.21037110686302185, "learning_rate": 6.872096052409718e-05, "loss": 0.038, "step": 8180 }, { "epoch": 27.119205298013245, "grad_norm": 0.2465747445821762, "learning_rate": 6.864427518469174e-05, "loss": 0.0374, "step": 8190 }, { "epoch": 27.1523178807947, "grad_norm": 0.28945982456207275, "learning_rate": 6.856753887254986e-05, "loss": 0.0402, "step": 8200 }, { "epoch": 27.185430463576157, "grad_norm": 0.2734522521495819, "learning_rate": 6.849075179746572e-05, "loss": 0.0357, "step": 8210 }, { "epoch": 27.218543046357617, "grad_norm": 0.14553239941596985, "learning_rate": 6.841391416937221e-05, "loss": 0.0364, "step": 8220 }, { "epoch": 27.251655629139073, "grad_norm": 0.19808602333068848, "learning_rate": 6.833702619834053e-05, "loss": 0.0351, "step": 8230 }, { "epoch": 27.28476821192053, "grad_norm": 0.3412145972251892, "learning_rate": 6.82600880945794e-05, "loss": 0.0376, "step": 8240 }, { "epoch": 27.31788079470199, "grad_norm": 0.25483885407447815, "learning_rate": 6.818310006843468e-05, "loss": 0.0389, "step": 8250 }, { "epoch": 27.350993377483444, "grad_norm": 0.31506574153900146, "learning_rate": 6.810606233038868e-05, "loss": 0.0367, "step": 8260 }, { "epoch": 27.3841059602649, "grad_norm": 0.3476342558860779, "learning_rate": 6.802897509105966e-05, "loss": 0.0345, "step": 8270 }, { "epoch": 27.417218543046356, "grad_norm": 0.3301520347595215, "learning_rate": 6.79518385612012e-05, "loss": 0.0381, "step": 8280 }, { "epoch": 27.450331125827816, "grad_norm": 0.23731929063796997, "learning_rate": 6.787465295170157e-05, "loss": 0.0374, "step": 8290 }, { "epoch": 27.483443708609272, "grad_norm": 0.17476530373096466, "learning_rate": 6.779741847358332e-05, "loss": 0.0389, "step": 8300 }, { "epoch": 27.516556291390728, "grad_norm": 0.30568990111351013, "learning_rate": 6.772013533800256e-05, "loss": 0.0393, "step": 8310 }, { "epoch": 27.549668874172184, "grad_norm": 0.2570163607597351, "learning_rate": 6.764280375624843e-05, "loss": 0.0393, "step": 8320 }, { "epoch": 27.582781456953644, "grad_norm": 0.20715026557445526, "learning_rate": 6.756542393974252e-05, "loss": 0.0372, "step": 8330 }, { "epoch": 27.6158940397351, "grad_norm": 0.3439162075519562, "learning_rate": 6.748799610003828e-05, "loss": 0.0374, "step": 8340 }, { "epoch": 27.649006622516556, "grad_norm": 0.2586623430252075, "learning_rate": 6.741052044882048e-05, "loss": 0.0375, "step": 8350 }, { "epoch": 27.68211920529801, "grad_norm": 0.3403407633304596, "learning_rate": 6.73329971979046e-05, "loss": 0.0388, "step": 8360 }, { "epoch": 27.71523178807947, "grad_norm": 0.3346109986305237, "learning_rate": 6.725542655923625e-05, "loss": 0.0397, "step": 8370 }, { "epoch": 27.748344370860927, "grad_norm": 0.2793143391609192, "learning_rate": 6.717780874489057e-05, "loss": 0.0338, "step": 8380 }, { "epoch": 27.781456953642383, "grad_norm": 0.21956293284893036, "learning_rate": 6.710014396707172e-05, "loss": 0.0368, "step": 8390 }, { "epoch": 27.814569536423843, "grad_norm": 0.1664106547832489, "learning_rate": 6.702243243811221e-05, "loss": 0.0395, "step": 8400 }, { "epoch": 27.8476821192053, "grad_norm": 0.2635118365287781, "learning_rate": 6.694467437047244e-05, "loss": 0.0384, "step": 8410 }, { "epoch": 27.880794701986755, "grad_norm": 0.2720617353916168, "learning_rate": 6.686686997673997e-05, "loss": 0.0425, "step": 8420 }, { "epoch": 27.91390728476821, "grad_norm": 0.3802785575389862, "learning_rate": 6.678901946962903e-05, "loss": 0.0396, "step": 8430 }, { "epoch": 27.94701986754967, "grad_norm": 0.40870964527130127, "learning_rate": 6.671112306197996e-05, "loss": 0.0374, "step": 8440 }, { "epoch": 27.980132450331126, "grad_norm": 0.24894554913043976, "learning_rate": 6.663318096675854e-05, "loss": 0.0389, "step": 8450 }, { "epoch": 28.013245033112582, "grad_norm": 0.4067665636539459, "learning_rate": 6.655519339705552e-05, "loss": 0.0449, "step": 8460 }, { "epoch": 28.04635761589404, "grad_norm": 0.1885935515165329, "learning_rate": 6.647716056608588e-05, "loss": 0.0372, "step": 8470 }, { "epoch": 28.079470198675498, "grad_norm": 0.2343471795320511, "learning_rate": 6.639908268718843e-05, "loss": 0.039, "step": 8480 }, { "epoch": 28.112582781456954, "grad_norm": 0.29440125823020935, "learning_rate": 6.632095997382514e-05, "loss": 0.038, "step": 8490 }, { "epoch": 28.14569536423841, "grad_norm": 0.48126116394996643, "learning_rate": 6.624279263958047e-05, "loss": 0.041, "step": 8500 }, { "epoch": 28.178807947019866, "grad_norm": 0.20583835244178772, "learning_rate": 6.616458089816097e-05, "loss": 0.0435, "step": 8510 }, { "epoch": 28.211920529801326, "grad_norm": 0.1693679839372635, "learning_rate": 6.608632496339454e-05, "loss": 0.0421, "step": 8520 }, { "epoch": 28.24503311258278, "grad_norm": 0.22596974670886993, "learning_rate": 6.600802504922988e-05, "loss": 0.0418, "step": 8530 }, { "epoch": 28.278145695364238, "grad_norm": 0.19928045570850372, "learning_rate": 6.592968136973604e-05, "loss": 0.0376, "step": 8540 }, { "epoch": 28.311258278145694, "grad_norm": 0.25728437304496765, "learning_rate": 6.585129413910159e-05, "loss": 0.0406, "step": 8550 }, { "epoch": 28.344370860927153, "grad_norm": 0.21682079136371613, "learning_rate": 6.577286357163424e-05, "loss": 0.0365, "step": 8560 }, { "epoch": 28.37748344370861, "grad_norm": 0.2388421595096588, "learning_rate": 6.569438988176018e-05, "loss": 0.038, "step": 8570 }, { "epoch": 28.410596026490065, "grad_norm": 0.2655351758003235, "learning_rate": 6.561587328402347e-05, "loss": 0.041, "step": 8580 }, { "epoch": 28.443708609271525, "grad_norm": 0.3812657594680786, "learning_rate": 6.553731399308549e-05, "loss": 0.0389, "step": 8590 }, { "epoch": 28.47682119205298, "grad_norm": 0.45246249437332153, "learning_rate": 6.545871222372436e-05, "loss": 0.0375, "step": 8600 }, { "epoch": 28.509933774834437, "grad_norm": 0.3398737907409668, "learning_rate": 6.538006819083426e-05, "loss": 0.039, "step": 8610 }, { "epoch": 28.543046357615893, "grad_norm": 0.28014498949050903, "learning_rate": 6.530138210942505e-05, "loss": 0.0406, "step": 8620 }, { "epoch": 28.576158940397352, "grad_norm": 0.2871643304824829, "learning_rate": 6.522265419462141e-05, "loss": 0.0371, "step": 8630 }, { "epoch": 28.60927152317881, "grad_norm": 0.2815556526184082, "learning_rate": 6.514388466166248e-05, "loss": 0.042, "step": 8640 }, { "epoch": 28.642384105960264, "grad_norm": 0.3473415970802307, "learning_rate": 6.506507372590119e-05, "loss": 0.0403, "step": 8650 }, { "epoch": 28.67549668874172, "grad_norm": 0.2806711196899414, "learning_rate": 6.498622160280355e-05, "loss": 0.0412, "step": 8660 }, { "epoch": 28.70860927152318, "grad_norm": 0.18617627024650574, "learning_rate": 6.490732850794832e-05, "loss": 0.0367, "step": 8670 }, { "epoch": 28.741721854304636, "grad_norm": 0.31807005405426025, "learning_rate": 6.482839465702616e-05, "loss": 0.039, "step": 8680 }, { "epoch": 28.774834437086092, "grad_norm": 0.347241073846817, "learning_rate": 6.474942026583923e-05, "loss": 0.0409, "step": 8690 }, { "epoch": 28.807947019867548, "grad_norm": 0.2914639413356781, "learning_rate": 6.467040555030052e-05, "loss": 0.0358, "step": 8700 }, { "epoch": 28.841059602649008, "grad_norm": 0.22341950237751007, "learning_rate": 6.459135072643321e-05, "loss": 0.0321, "step": 8710 }, { "epoch": 28.874172185430464, "grad_norm": 0.2481926828622818, "learning_rate": 6.451225601037019e-05, "loss": 0.0393, "step": 8720 }, { "epoch": 28.90728476821192, "grad_norm": 0.2582075893878937, "learning_rate": 6.443312161835338e-05, "loss": 0.04, "step": 8730 }, { "epoch": 28.94039735099338, "grad_norm": 0.25580933690071106, "learning_rate": 6.43539477667332e-05, "loss": 0.0386, "step": 8740 }, { "epoch": 28.973509933774835, "grad_norm": 0.24333177506923676, "learning_rate": 6.427473467196793e-05, "loss": 0.0388, "step": 8750 }, { "epoch": 29.00662251655629, "grad_norm": 0.2547275722026825, "learning_rate": 6.419548255062315e-05, "loss": 0.0334, "step": 8760 }, { "epoch": 29.039735099337747, "grad_norm": 0.16192732751369476, "learning_rate": 6.411619161937112e-05, "loss": 0.0369, "step": 8770 }, { "epoch": 29.072847682119207, "grad_norm": 0.2602890729904175, "learning_rate": 6.403686209499022e-05, "loss": 0.0396, "step": 8780 }, { "epoch": 29.105960264900663, "grad_norm": 0.3310868442058563, "learning_rate": 6.395749419436437e-05, "loss": 0.0396, "step": 8790 }, { "epoch": 29.13907284768212, "grad_norm": 0.17539499700069427, "learning_rate": 6.387808813448234e-05, "loss": 0.0367, "step": 8800 }, { "epoch": 29.172185430463575, "grad_norm": 0.2949528694152832, "learning_rate": 6.37986441324373e-05, "loss": 0.0374, "step": 8810 }, { "epoch": 29.205298013245034, "grad_norm": 0.3170853853225708, "learning_rate": 6.37191624054261e-05, "loss": 0.0334, "step": 8820 }, { "epoch": 29.23841059602649, "grad_norm": 0.2562275230884552, "learning_rate": 6.363964317074872e-05, "loss": 0.0377, "step": 8830 }, { "epoch": 29.271523178807946, "grad_norm": 0.2412496656179428, "learning_rate": 6.356008664580776e-05, "loss": 0.0375, "step": 8840 }, { "epoch": 29.304635761589402, "grad_norm": 0.291025847196579, "learning_rate": 6.348049304810771e-05, "loss": 0.0395, "step": 8850 }, { "epoch": 29.337748344370862, "grad_norm": 0.3089853525161743, "learning_rate": 6.340086259525442e-05, "loss": 0.0422, "step": 8860 }, { "epoch": 29.370860927152318, "grad_norm": 0.36182618141174316, "learning_rate": 6.332119550495448e-05, "loss": 0.0374, "step": 8870 }, { "epoch": 29.403973509933774, "grad_norm": 0.20298461616039276, "learning_rate": 6.324149199501473e-05, "loss": 0.0401, "step": 8880 }, { "epoch": 29.437086092715234, "grad_norm": 0.3029368817806244, "learning_rate": 6.316175228334146e-05, "loss": 0.036, "step": 8890 }, { "epoch": 29.47019867549669, "grad_norm": 0.21909299492835999, "learning_rate": 6.308197658794003e-05, "loss": 0.0376, "step": 8900 }, { "epoch": 29.503311258278146, "grad_norm": 0.41992664337158203, "learning_rate": 6.300216512691417e-05, "loss": 0.0358, "step": 8910 }, { "epoch": 29.5364238410596, "grad_norm": 0.22992579638957977, "learning_rate": 6.292231811846532e-05, "loss": 0.0377, "step": 8920 }, { "epoch": 29.56953642384106, "grad_norm": 0.2940419018268585, "learning_rate": 6.284243578089217e-05, "loss": 0.0374, "step": 8930 }, { "epoch": 29.602649006622517, "grad_norm": 0.3546594977378845, "learning_rate": 6.276251833258999e-05, "loss": 0.0401, "step": 8940 }, { "epoch": 29.635761589403973, "grad_norm": 0.23110158741474152, "learning_rate": 6.268256599205003e-05, "loss": 0.0373, "step": 8950 }, { "epoch": 29.66887417218543, "grad_norm": 0.24438811838626862, "learning_rate": 6.260257897785892e-05, "loss": 0.0406, "step": 8960 }, { "epoch": 29.70198675496689, "grad_norm": 0.20318980515003204, "learning_rate": 6.252255750869811e-05, "loss": 0.0383, "step": 8970 }, { "epoch": 29.735099337748345, "grad_norm": 0.2617165148258209, "learning_rate": 6.244250180334325e-05, "loss": 0.0406, "step": 8980 }, { "epoch": 29.7682119205298, "grad_norm": 0.12334084510803223, "learning_rate": 6.236241208066356e-05, "loss": 0.0382, "step": 8990 }, { "epoch": 29.801324503311257, "grad_norm": 0.2487781047821045, "learning_rate": 6.228228855962133e-05, "loss": 0.0369, "step": 9000 }, { "epoch": 29.834437086092716, "grad_norm": 0.32601532340049744, "learning_rate": 6.220213145927115e-05, "loss": 0.0393, "step": 9010 }, { "epoch": 29.867549668874172, "grad_norm": 0.6535779237747192, "learning_rate": 6.212194099875951e-05, "loss": 0.0401, "step": 9020 }, { "epoch": 29.90066225165563, "grad_norm": 0.17556065320968628, "learning_rate": 6.204171739732405e-05, "loss": 0.0376, "step": 9030 }, { "epoch": 29.933774834437084, "grad_norm": 0.17014305293560028, "learning_rate": 6.196146087429303e-05, "loss": 0.041, "step": 9040 }, { "epoch": 29.966887417218544, "grad_norm": 0.1754930168390274, "learning_rate": 6.188117164908474e-05, "loss": 0.035, "step": 9050 }, { "epoch": 30.0, "grad_norm": 0.2303304523229599, "learning_rate": 6.180084994120684e-05, "loss": 0.0361, "step": 9060 }, { "epoch": 30.033112582781456, "grad_norm": 0.17113472521305084, "learning_rate": 6.17204959702558e-05, "loss": 0.035, "step": 9070 }, { "epoch": 30.066225165562916, "grad_norm": 0.16054780781269073, "learning_rate": 6.164010995591635e-05, "loss": 0.036, "step": 9080 }, { "epoch": 30.09933774834437, "grad_norm": 0.3483756482601166, "learning_rate": 6.155969211796076e-05, "loss": 0.0392, "step": 9090 }, { "epoch": 30.132450331125828, "grad_norm": 0.21537001430988312, "learning_rate": 6.147924267624829e-05, "loss": 0.0363, "step": 9100 }, { "epoch": 30.165562913907284, "grad_norm": 0.22557206451892853, "learning_rate": 6.13987618507247e-05, "loss": 0.0358, "step": 9110 }, { "epoch": 30.198675496688743, "grad_norm": 0.2419864386320114, "learning_rate": 6.131824986142147e-05, "loss": 0.0334, "step": 9120 }, { "epoch": 30.2317880794702, "grad_norm": 0.3194415867328644, "learning_rate": 6.123770692845529e-05, "loss": 0.0397, "step": 9130 }, { "epoch": 30.264900662251655, "grad_norm": 0.38300296664237976, "learning_rate": 6.11571332720275e-05, "loss": 0.0378, "step": 9140 }, { "epoch": 30.29801324503311, "grad_norm": 0.22396212816238403, "learning_rate": 6.107652911242336e-05, "loss": 0.0354, "step": 9150 }, { "epoch": 30.33112582781457, "grad_norm": 0.1722584068775177, "learning_rate": 6.0995894670011586e-05, "loss": 0.0378, "step": 9160 }, { "epoch": 30.364238410596027, "grad_norm": 0.19905656576156616, "learning_rate": 6.091523016524368e-05, "loss": 0.0324, "step": 9170 }, { "epoch": 30.397350993377483, "grad_norm": 0.14751461148262024, "learning_rate": 6.083453581865328e-05, "loss": 0.0382, "step": 9180 }, { "epoch": 30.43046357615894, "grad_norm": 0.29925334453582764, "learning_rate": 6.075381185085568e-05, "loss": 0.0329, "step": 9190 }, { "epoch": 30.4635761589404, "grad_norm": 0.2548972964286804, "learning_rate": 6.067305848254709e-05, "loss": 0.0394, "step": 9200 }, { "epoch": 30.496688741721854, "grad_norm": 0.1922713667154312, "learning_rate": 6.059227593450418e-05, "loss": 0.0374, "step": 9210 }, { "epoch": 30.52980132450331, "grad_norm": 0.2586852014064789, "learning_rate": 6.051146442758333e-05, "loss": 0.0406, "step": 9220 }, { "epoch": 30.562913907284766, "grad_norm": 0.22139191627502441, "learning_rate": 6.043062418272012e-05, "loss": 0.0392, "step": 9230 }, { "epoch": 30.596026490066226, "grad_norm": 0.2914575934410095, "learning_rate": 6.0349755420928666e-05, "loss": 0.0387, "step": 9240 }, { "epoch": 30.629139072847682, "grad_norm": 0.2100769728422165, "learning_rate": 6.0268858363301105e-05, "loss": 0.0335, "step": 9250 }, { "epoch": 30.662251655629138, "grad_norm": 0.22080926597118378, "learning_rate": 6.018793323100689e-05, "loss": 0.0356, "step": 9260 }, { "epoch": 30.695364238410598, "grad_norm": 0.42547768354415894, "learning_rate": 6.0106980245292255e-05, "loss": 0.0373, "step": 9270 }, { "epoch": 30.728476821192054, "grad_norm": 0.19902274012565613, "learning_rate": 6.002599962747957e-05, "loss": 0.0369, "step": 9280 }, { "epoch": 30.76158940397351, "grad_norm": 0.24458260834217072, "learning_rate": 5.994499159896673e-05, "loss": 0.0378, "step": 9290 }, { "epoch": 30.794701986754966, "grad_norm": 0.17163531482219696, "learning_rate": 5.9863956381226607e-05, "loss": 0.0335, "step": 9300 }, { "epoch": 30.827814569536425, "grad_norm": 0.2232498675584793, "learning_rate": 5.9782894195806394e-05, "loss": 0.0392, "step": 9310 }, { "epoch": 30.86092715231788, "grad_norm": 0.268479585647583, "learning_rate": 5.9701805264327004e-05, "loss": 0.0344, "step": 9320 }, { "epoch": 30.894039735099337, "grad_norm": 0.2328217327594757, "learning_rate": 5.96206898084825e-05, "loss": 0.0344, "step": 9330 }, { "epoch": 30.927152317880793, "grad_norm": 0.26818543672561646, "learning_rate": 5.953954805003942e-05, "loss": 0.035, "step": 9340 }, { "epoch": 30.960264900662253, "grad_norm": 0.3924226760864258, "learning_rate": 5.945838021083623e-05, "loss": 0.0402, "step": 9350 }, { "epoch": 30.99337748344371, "grad_norm": 0.2961652874946594, "learning_rate": 5.9377186512782714e-05, "loss": 0.0364, "step": 9360 }, { "epoch": 31.026490066225165, "grad_norm": 0.24418044090270996, "learning_rate": 5.929596717785935e-05, "loss": 0.0349, "step": 9370 }, { "epoch": 31.05960264900662, "grad_norm": 0.19145238399505615, "learning_rate": 5.921472242811668e-05, "loss": 0.0347, "step": 9380 }, { "epoch": 31.09271523178808, "grad_norm": 0.22511261701583862, "learning_rate": 5.913345248567475e-05, "loss": 0.032, "step": 9390 }, { "epoch": 31.125827814569536, "grad_norm": 0.2850934863090515, "learning_rate": 5.905215757272248e-05, "loss": 0.0353, "step": 9400 }, { "epoch": 31.158940397350992, "grad_norm": 0.24544374644756317, "learning_rate": 5.897083791151706e-05, "loss": 0.0404, "step": 9410 }, { "epoch": 31.192052980132452, "grad_norm": 0.2776612341403961, "learning_rate": 5.888949372438336e-05, "loss": 0.0371, "step": 9420 }, { "epoch": 31.225165562913908, "grad_norm": 0.18803074955940247, "learning_rate": 5.8808125233713255e-05, "loss": 0.0349, "step": 9430 }, { "epoch": 31.258278145695364, "grad_norm": 0.27445507049560547, "learning_rate": 5.872673266196509e-05, "loss": 0.0353, "step": 9440 }, { "epoch": 31.29139072847682, "grad_norm": 0.30535560846328735, "learning_rate": 5.864531623166305e-05, "loss": 0.0346, "step": 9450 }, { "epoch": 31.32450331125828, "grad_norm": 0.32384318113327026, "learning_rate": 5.856387616539656e-05, "loss": 0.0361, "step": 9460 }, { "epoch": 31.357615894039736, "grad_norm": 0.2613276243209839, "learning_rate": 5.848241268581967e-05, "loss": 0.0361, "step": 9470 }, { "epoch": 31.39072847682119, "grad_norm": 0.2466685026884079, "learning_rate": 5.840092601565037e-05, "loss": 0.0375, "step": 9480 }, { "epoch": 31.423841059602648, "grad_norm": 0.20146478712558746, "learning_rate": 5.8319416377670144e-05, "loss": 0.0327, "step": 9490 }, { "epoch": 31.456953642384107, "grad_norm": 0.25195345282554626, "learning_rate": 5.82378839947232e-05, "loss": 0.0344, "step": 9500 }, { "epoch": 31.490066225165563, "grad_norm": 0.27356091141700745, "learning_rate": 5.815632908971599e-05, "loss": 0.0324, "step": 9510 }, { "epoch": 31.52317880794702, "grad_norm": 0.3405700623989105, "learning_rate": 5.80747518856165e-05, "loss": 0.0331, "step": 9520 }, { "epoch": 31.556291390728475, "grad_norm": 0.19591087102890015, "learning_rate": 5.799315260545367e-05, "loss": 0.037, "step": 9530 }, { "epoch": 31.589403973509935, "grad_norm": 0.19023580849170685, "learning_rate": 5.791153147231686e-05, "loss": 0.0315, "step": 9540 }, { "epoch": 31.62251655629139, "grad_norm": 0.18196910619735718, "learning_rate": 5.782988870935509e-05, "loss": 0.0333, "step": 9550 }, { "epoch": 31.655629139072847, "grad_norm": 0.14420269429683685, "learning_rate": 5.774822453977657e-05, "loss": 0.0328, "step": 9560 }, { "epoch": 31.688741721854306, "grad_norm": 0.26820048689842224, "learning_rate": 5.7666539186848036e-05, "loss": 0.0327, "step": 9570 }, { "epoch": 31.721854304635762, "grad_norm": 0.1775560975074768, "learning_rate": 5.758483287389411e-05, "loss": 0.034, "step": 9580 }, { "epoch": 31.75496688741722, "grad_norm": 0.3094513416290283, "learning_rate": 5.7503105824296735e-05, "loss": 0.0361, "step": 9590 }, { "epoch": 31.788079470198674, "grad_norm": 0.5413017868995667, "learning_rate": 5.742135826149453e-05, "loss": 0.0359, "step": 9600 }, { "epoch": 31.821192052980134, "grad_norm": 0.27401477098464966, "learning_rate": 5.7339590408982223e-05, "loss": 0.0363, "step": 9610 }, { "epoch": 31.85430463576159, "grad_norm": 0.46224838495254517, "learning_rate": 5.725780249031e-05, "loss": 0.0331, "step": 9620 }, { "epoch": 31.887417218543046, "grad_norm": 0.22892165184020996, "learning_rate": 5.717599472908292e-05, "loss": 0.0333, "step": 9630 }, { "epoch": 31.920529801324502, "grad_norm": 0.3308574855327606, "learning_rate": 5.7094167348960237e-05, "loss": 0.0378, "step": 9640 }, { "epoch": 31.95364238410596, "grad_norm": 0.2394508272409439, "learning_rate": 5.7012320573654945e-05, "loss": 0.0355, "step": 9650 }, { "epoch": 31.986754966887418, "grad_norm": 0.25356248021125793, "learning_rate": 5.693045462693295e-05, "loss": 0.0371, "step": 9660 }, { "epoch": 32.019867549668874, "grad_norm": 0.24230192601680756, "learning_rate": 5.684856973261266e-05, "loss": 0.0311, "step": 9670 }, { "epoch": 32.05298013245033, "grad_norm": 0.27070727944374084, "learning_rate": 5.6766666114564215e-05, "loss": 0.0356, "step": 9680 }, { "epoch": 32.086092715231786, "grad_norm": 0.33190855383872986, "learning_rate": 5.668474399670899e-05, "loss": 0.035, "step": 9690 }, { "epoch": 32.11920529801324, "grad_norm": 0.36471137404441833, "learning_rate": 5.660280360301896e-05, "loss": 0.0351, "step": 9700 }, { "epoch": 32.152317880794705, "grad_norm": 0.17476685345172882, "learning_rate": 5.652084515751599e-05, "loss": 0.0346, "step": 9710 }, { "epoch": 32.18543046357616, "grad_norm": 0.21883909404277802, "learning_rate": 5.643886888427137e-05, "loss": 0.0343, "step": 9720 }, { "epoch": 32.21854304635762, "grad_norm": 0.20835182070732117, "learning_rate": 5.6356875007405074e-05, "loss": 0.0341, "step": 9730 }, { "epoch": 32.25165562913907, "grad_norm": 0.2572871446609497, "learning_rate": 5.627486375108525e-05, "loss": 0.0346, "step": 9740 }, { "epoch": 32.28476821192053, "grad_norm": 0.44495582580566406, "learning_rate": 5.619283533952754e-05, "loss": 0.0375, "step": 9750 }, { "epoch": 32.317880794701985, "grad_norm": 0.17944549024105072, "learning_rate": 5.6110789996994474e-05, "loss": 0.0348, "step": 9760 }, { "epoch": 32.35099337748344, "grad_norm": 0.4825090169906616, "learning_rate": 5.602872794779491e-05, "loss": 0.0356, "step": 9770 }, { "epoch": 32.384105960264904, "grad_norm": 0.20879022777080536, "learning_rate": 5.594664941628334e-05, "loss": 0.0316, "step": 9780 }, { "epoch": 32.41721854304636, "grad_norm": 0.2649371922016144, "learning_rate": 5.5864554626859324e-05, "loss": 0.0371, "step": 9790 }, { "epoch": 32.450331125827816, "grad_norm": 0.3210839629173279, "learning_rate": 5.578244380396691e-05, "loss": 0.0374, "step": 9800 }, { "epoch": 32.48344370860927, "grad_norm": 0.2758302688598633, "learning_rate": 5.570031717209394e-05, "loss": 0.0322, "step": 9810 }, { "epoch": 32.51655629139073, "grad_norm": 0.25773128867149353, "learning_rate": 5.561817495577147e-05, "loss": 0.0302, "step": 9820 }, { "epoch": 32.549668874172184, "grad_norm": 0.41572508215904236, "learning_rate": 5.5536017379573215e-05, "loss": 0.0326, "step": 9830 }, { "epoch": 32.58278145695364, "grad_norm": 0.3170468509197235, "learning_rate": 5.545384466811483e-05, "loss": 0.0322, "step": 9840 }, { "epoch": 32.615894039735096, "grad_norm": 0.26432445645332336, "learning_rate": 5.5371657046053384e-05, "loss": 0.036, "step": 9850 }, { "epoch": 32.64900662251656, "grad_norm": 0.1895545870065689, "learning_rate": 5.528945473808669e-05, "loss": 0.0355, "step": 9860 }, { "epoch": 32.682119205298015, "grad_norm": 0.1715487241744995, "learning_rate": 5.520723796895272e-05, "loss": 0.0362, "step": 9870 }, { "epoch": 32.71523178807947, "grad_norm": 0.21888187527656555, "learning_rate": 5.512500696342897e-05, "loss": 0.0348, "step": 9880 }, { "epoch": 32.74834437086093, "grad_norm": 0.16479165852069855, "learning_rate": 5.504276194633188e-05, "loss": 0.0369, "step": 9890 }, { "epoch": 32.78145695364238, "grad_norm": 0.3059069514274597, "learning_rate": 5.49605031425162e-05, "loss": 0.0346, "step": 9900 }, { "epoch": 32.81456953642384, "grad_norm": 0.20589354634284973, "learning_rate": 5.487823077687434e-05, "loss": 0.034, "step": 9910 }, { "epoch": 32.847682119205295, "grad_norm": 0.15338502824306488, "learning_rate": 5.4795945074335806e-05, "loss": 0.0346, "step": 9920 }, { "epoch": 32.88079470198676, "grad_norm": 0.23455652594566345, "learning_rate": 5.471364625986657e-05, "loss": 0.0376, "step": 9930 }, { "epoch": 32.913907284768214, "grad_norm": 0.25778454542160034, "learning_rate": 5.463133455846845e-05, "loss": 0.0377, "step": 9940 }, { "epoch": 32.94701986754967, "grad_norm": 0.3645855784416199, "learning_rate": 5.4549010195178505e-05, "loss": 0.0364, "step": 9950 }, { "epoch": 32.980132450331126, "grad_norm": 0.20957821607589722, "learning_rate": 5.446667339506838e-05, "loss": 0.0321, "step": 9960 }, { "epoch": 33.01324503311258, "grad_norm": 0.21152363717556, "learning_rate": 5.4384324383243756e-05, "loss": 0.0339, "step": 9970 }, { "epoch": 33.04635761589404, "grad_norm": 0.17473343014717102, "learning_rate": 5.430196338484368e-05, "loss": 0.0298, "step": 9980 }, { "epoch": 33.079470198675494, "grad_norm": 0.16461780667304993, "learning_rate": 5.4219590625039975e-05, "loss": 0.0339, "step": 9990 }, { "epoch": 33.11258278145695, "grad_norm": 0.21120722591876984, "learning_rate": 5.413720632903664e-05, "loss": 0.0324, "step": 10000 }, { "epoch": 33.145695364238414, "grad_norm": 0.2197161465883255, "learning_rate": 5.405481072206917e-05, "loss": 0.0341, "step": 10010 }, { "epoch": 33.17880794701987, "grad_norm": 0.27445507049560547, "learning_rate": 5.397240402940402e-05, "loss": 0.0326, "step": 10020 }, { "epoch": 33.211920529801326, "grad_norm": 0.24972335994243622, "learning_rate": 5.388998647633794e-05, "loss": 0.0324, "step": 10030 }, { "epoch": 33.24503311258278, "grad_norm": 0.21517805755138397, "learning_rate": 5.380755828819737e-05, "loss": 0.0339, "step": 10040 }, { "epoch": 33.27814569536424, "grad_norm": 0.3300723135471344, "learning_rate": 5.3725119690337846e-05, "loss": 0.0352, "step": 10050 }, { "epoch": 33.311258278145694, "grad_norm": 0.17584434151649475, "learning_rate": 5.3642670908143324e-05, "loss": 0.0315, "step": 10060 }, { "epoch": 33.34437086092715, "grad_norm": 0.21206383407115936, "learning_rate": 5.356021216702562e-05, "loss": 0.0283, "step": 10070 }, { "epoch": 33.37748344370861, "grad_norm": 0.26914459466934204, "learning_rate": 5.347774369242381e-05, "loss": 0.0343, "step": 10080 }, { "epoch": 33.41059602649007, "grad_norm": 0.20885267853736877, "learning_rate": 5.3395265709803545e-05, "loss": 0.0369, "step": 10090 }, { "epoch": 33.443708609271525, "grad_norm": 0.2572750151157379, "learning_rate": 5.331277844465647e-05, "loss": 0.0349, "step": 10100 }, { "epoch": 33.47682119205298, "grad_norm": 0.1972154676914215, "learning_rate": 5.323028212249963e-05, "loss": 0.0318, "step": 10110 }, { "epoch": 33.50993377483444, "grad_norm": 0.14460426568984985, "learning_rate": 5.314777696887481e-05, "loss": 0.0376, "step": 10120 }, { "epoch": 33.54304635761589, "grad_norm": 0.3620637357234955, "learning_rate": 5.306526320934796e-05, "loss": 0.0333, "step": 10130 }, { "epoch": 33.57615894039735, "grad_norm": 0.27463585138320923, "learning_rate": 5.298274106950854e-05, "loss": 0.0321, "step": 10140 }, { "epoch": 33.609271523178805, "grad_norm": 0.19789670407772064, "learning_rate": 5.290021077496893e-05, "loss": 0.0316, "step": 10150 }, { "epoch": 33.64238410596027, "grad_norm": 0.26203078031539917, "learning_rate": 5.2817672551363816e-05, "loss": 0.0305, "step": 10160 }, { "epoch": 33.675496688741724, "grad_norm": 0.21039330959320068, "learning_rate": 5.273512662434952e-05, "loss": 0.0321, "step": 10170 }, { "epoch": 33.70860927152318, "grad_norm": 0.4152960777282715, "learning_rate": 5.265257321960349e-05, "loss": 0.0338, "step": 10180 }, { "epoch": 33.741721854304636, "grad_norm": 0.19181127846240997, "learning_rate": 5.257001256282357e-05, "loss": 0.0363, "step": 10190 }, { "epoch": 33.77483443708609, "grad_norm": 0.18149568140506744, "learning_rate": 5.248744487972742e-05, "loss": 0.0318, "step": 10200 }, { "epoch": 33.80794701986755, "grad_norm": 0.5489351749420166, "learning_rate": 5.240487039605196e-05, "loss": 0.0304, "step": 10210 }, { "epoch": 33.841059602649004, "grad_norm": 0.15656758844852448, "learning_rate": 5.232228933755267e-05, "loss": 0.033, "step": 10220 }, { "epoch": 33.87417218543047, "grad_norm": 0.23301753401756287, "learning_rate": 5.2239701930003006e-05, "loss": 0.0335, "step": 10230 }, { "epoch": 33.90728476821192, "grad_norm": 0.23011118173599243, "learning_rate": 5.215710839919379e-05, "loss": 0.032, "step": 10240 }, { "epoch": 33.94039735099338, "grad_norm": 0.20455700159072876, "learning_rate": 5.207450897093257e-05, "loss": 0.0314, "step": 10250 }, { "epoch": 33.973509933774835, "grad_norm": 0.4475822448730469, "learning_rate": 5.1991903871043046e-05, "loss": 0.0329, "step": 10260 }, { "epoch": 34.00662251655629, "grad_norm": 0.15076115727424622, "learning_rate": 5.190929332536439e-05, "loss": 0.0316, "step": 10270 }, { "epoch": 34.03973509933775, "grad_norm": 0.23971900343894958, "learning_rate": 5.182667755975071e-05, "loss": 0.0333, "step": 10280 }, { "epoch": 34.0728476821192, "grad_norm": 0.2574482858181, "learning_rate": 5.1744056800070315e-05, "loss": 0.0323, "step": 10290 }, { "epoch": 34.10596026490066, "grad_norm": 0.3156982660293579, "learning_rate": 5.166143127220524e-05, "loss": 0.0335, "step": 10300 }, { "epoch": 34.13907284768212, "grad_norm": 0.33129143714904785, "learning_rate": 5.1578801202050485e-05, "loss": 0.0339, "step": 10310 }, { "epoch": 34.17218543046358, "grad_norm": 0.2537616491317749, "learning_rate": 5.149616681551355e-05, "loss": 0.035, "step": 10320 }, { "epoch": 34.205298013245034, "grad_norm": 0.21245171129703522, "learning_rate": 5.141352833851367e-05, "loss": 0.0325, "step": 10330 }, { "epoch": 34.23841059602649, "grad_norm": 0.3326011002063751, "learning_rate": 5.1330885996981285e-05, "loss": 0.0332, "step": 10340 }, { "epoch": 34.271523178807946, "grad_norm": 0.2851768136024475, "learning_rate": 5.124824001685741e-05, "loss": 0.0349, "step": 10350 }, { "epoch": 34.3046357615894, "grad_norm": 0.33392295241355896, "learning_rate": 5.116559062409298e-05, "loss": 0.0314, "step": 10360 }, { "epoch": 34.33774834437086, "grad_norm": 0.19532805681228638, "learning_rate": 5.10829380446483e-05, "loss": 0.0337, "step": 10370 }, { "epoch": 34.370860927152314, "grad_norm": 0.14206373691558838, "learning_rate": 5.100028250449235e-05, "loss": 0.0318, "step": 10380 }, { "epoch": 34.40397350993378, "grad_norm": 0.2943340837955475, "learning_rate": 5.0917624229602234e-05, "loss": 0.0291, "step": 10390 }, { "epoch": 34.437086092715234, "grad_norm": 0.26831772923469543, "learning_rate": 5.0834963445962524e-05, "loss": 0.034, "step": 10400 }, { "epoch": 34.47019867549669, "grad_norm": 0.4701237976551056, "learning_rate": 5.075230037956461e-05, "loss": 0.0358, "step": 10410 }, { "epoch": 34.503311258278146, "grad_norm": 0.2895570993423462, "learning_rate": 5.0669635256406213e-05, "loss": 0.033, "step": 10420 }, { "epoch": 34.5364238410596, "grad_norm": 0.12950770556926727, "learning_rate": 5.058696830249058e-05, "loss": 0.0307, "step": 10430 }, { "epoch": 34.56953642384106, "grad_norm": 0.35114461183547974, "learning_rate": 5.050429974382602e-05, "loss": 0.0352, "step": 10440 }, { "epoch": 34.602649006622514, "grad_norm": 0.19343434274196625, "learning_rate": 5.042162980642523e-05, "loss": 0.0316, "step": 10450 }, { "epoch": 34.63576158940398, "grad_norm": 0.2576303482055664, "learning_rate": 5.033895871630462e-05, "loss": 0.0316, "step": 10460 }, { "epoch": 34.66887417218543, "grad_norm": 0.15583279728889465, "learning_rate": 5.025628669948386e-05, "loss": 0.0338, "step": 10470 }, { "epoch": 34.70198675496689, "grad_norm": 0.2718207836151123, "learning_rate": 5.017361398198502e-05, "loss": 0.0358, "step": 10480 }, { "epoch": 34.735099337748345, "grad_norm": 0.3082188069820404, "learning_rate": 5.009094078983221e-05, "loss": 0.0338, "step": 10490 }, { "epoch": 34.7682119205298, "grad_norm": 0.6136208176612854, "learning_rate": 5.000826734905073e-05, "loss": 0.0362, "step": 10500 }, { "epoch": 34.80132450331126, "grad_norm": 0.20909234881401062, "learning_rate": 4.9925593885666645e-05, "loss": 0.0372, "step": 10510 }, { "epoch": 34.83443708609271, "grad_norm": 0.3481629490852356, "learning_rate": 4.984292062570602e-05, "loss": 0.0312, "step": 10520 }, { "epoch": 34.86754966887417, "grad_norm": 1.2787091732025146, "learning_rate": 4.976024779519442e-05, "loss": 0.0312, "step": 10530 }, { "epoch": 34.90066225165563, "grad_norm": 0.1571429818868637, "learning_rate": 4.9677575620156194e-05, "loss": 0.032, "step": 10540 }, { "epoch": 34.93377483443709, "grad_norm": 0.28299441933631897, "learning_rate": 4.959490432661391e-05, "loss": 0.0338, "step": 10550 }, { "epoch": 34.966887417218544, "grad_norm": 0.18108460307121277, "learning_rate": 4.9512234140587726e-05, "loss": 0.0322, "step": 10560 }, { "epoch": 35.0, "grad_norm": 0.3064558207988739, "learning_rate": 4.942956528809477e-05, "loss": 0.0322, "step": 10570 }, { "epoch": 35.033112582781456, "grad_norm": 0.26309308409690857, "learning_rate": 4.934689799514854e-05, "loss": 0.0289, "step": 10580 }, { "epoch": 35.06622516556291, "grad_norm": 0.2917889952659607, "learning_rate": 4.926423248775827e-05, "loss": 0.0346, "step": 10590 }, { "epoch": 35.09933774834437, "grad_norm": 0.20297817885875702, "learning_rate": 4.918156899192826e-05, "loss": 0.0307, "step": 10600 }, { "epoch": 35.13245033112583, "grad_norm": 0.23024588823318481, "learning_rate": 4.909890773365738e-05, "loss": 0.0299, "step": 10610 }, { "epoch": 35.16556291390729, "grad_norm": 0.3907947838306427, "learning_rate": 4.9016248938938344e-05, "loss": 0.0311, "step": 10620 }, { "epoch": 35.19867549668874, "grad_norm": 0.24886567890644073, "learning_rate": 4.8933592833757156e-05, "loss": 0.0352, "step": 10630 }, { "epoch": 35.2317880794702, "grad_norm": 0.25502219796180725, "learning_rate": 4.8850939644092435e-05, "loss": 0.0339, "step": 10640 }, { "epoch": 35.264900662251655, "grad_norm": 0.28451308608055115, "learning_rate": 4.876828959591485e-05, "loss": 0.0359, "step": 10650 }, { "epoch": 35.29801324503311, "grad_norm": 0.13937613368034363, "learning_rate": 4.8685642915186474e-05, "loss": 0.0326, "step": 10660 }, { "epoch": 35.33112582781457, "grad_norm": 0.22906582057476044, "learning_rate": 4.860299982786018e-05, "loss": 0.0295, "step": 10670 }, { "epoch": 35.36423841059602, "grad_norm": 0.437084823846817, "learning_rate": 4.852036055987901e-05, "loss": 0.0338, "step": 10680 }, { "epoch": 35.397350993377486, "grad_norm": 0.2696670591831207, "learning_rate": 4.843772533717558e-05, "loss": 0.0318, "step": 10690 }, { "epoch": 35.43046357615894, "grad_norm": 0.25491827726364136, "learning_rate": 4.835509438567142e-05, "loss": 0.0358, "step": 10700 }, { "epoch": 35.4635761589404, "grad_norm": 0.16642053425312042, "learning_rate": 4.827246793127639e-05, "loss": 0.0346, "step": 10710 }, { "epoch": 35.496688741721854, "grad_norm": 0.3987714350223541, "learning_rate": 4.818984619988807e-05, "loss": 0.0338, "step": 10720 }, { "epoch": 35.52980132450331, "grad_norm": 0.24792352318763733, "learning_rate": 4.810722941739115e-05, "loss": 0.0317, "step": 10730 }, { "epoch": 35.562913907284766, "grad_norm": 0.22430919110774994, "learning_rate": 4.8024617809656684e-05, "loss": 0.0337, "step": 10740 }, { "epoch": 35.59602649006622, "grad_norm": 0.2487117052078247, "learning_rate": 4.794201160254171e-05, "loss": 0.0301, "step": 10750 }, { "epoch": 35.629139072847686, "grad_norm": 0.21340730786323547, "learning_rate": 4.785941102188844e-05, "loss": 0.0335, "step": 10760 }, { "epoch": 35.66225165562914, "grad_norm": 0.2508874535560608, "learning_rate": 4.7776816293523686e-05, "loss": 0.0312, "step": 10770 }, { "epoch": 35.6953642384106, "grad_norm": 0.19691202044487, "learning_rate": 4.769422764325832e-05, "loss": 0.0293, "step": 10780 }, { "epoch": 35.728476821192054, "grad_norm": 0.3388879895210266, "learning_rate": 4.76116452968865e-05, "loss": 0.0302, "step": 10790 }, { "epoch": 35.76158940397351, "grad_norm": 0.21686053276062012, "learning_rate": 4.752906948018525e-05, "loss": 0.0322, "step": 10800 }, { "epoch": 35.794701986754966, "grad_norm": 0.4091481566429138, "learning_rate": 4.7446500418913684e-05, "loss": 0.0342, "step": 10810 }, { "epoch": 35.82781456953642, "grad_norm": 0.2506748139858246, "learning_rate": 4.736393833881247e-05, "loss": 0.0325, "step": 10820 }, { "epoch": 35.86092715231788, "grad_norm": 0.27399787306785583, "learning_rate": 4.7281383465603194e-05, "loss": 0.0313, "step": 10830 }, { "epoch": 35.89403973509934, "grad_norm": 0.4147755801677704, "learning_rate": 4.71988360249877e-05, "loss": 0.0305, "step": 10840 }, { "epoch": 35.9271523178808, "grad_norm": 0.3651663661003113, "learning_rate": 4.7116296242647554e-05, "loss": 0.0315, "step": 10850 }, { "epoch": 35.96026490066225, "grad_norm": 0.37585246562957764, "learning_rate": 4.703376434424336e-05, "loss": 0.0328, "step": 10860 }, { "epoch": 35.99337748344371, "grad_norm": 0.37144047021865845, "learning_rate": 4.695124055541421e-05, "loss": 0.0348, "step": 10870 }, { "epoch": 36.026490066225165, "grad_norm": 0.259626567363739, "learning_rate": 4.6868725101776934e-05, "loss": 0.0316, "step": 10880 }, { "epoch": 36.05960264900662, "grad_norm": 0.2662050426006317, "learning_rate": 4.678621820892567e-05, "loss": 0.0322, "step": 10890 }, { "epoch": 36.09271523178808, "grad_norm": 0.19502747058868408, "learning_rate": 4.670372010243111e-05, "loss": 0.0265, "step": 10900 }, { "epoch": 36.12582781456954, "grad_norm": 0.17268115282058716, "learning_rate": 4.662123100783992e-05, "loss": 0.0334, "step": 10910 }, { "epoch": 36.158940397350996, "grad_norm": 0.2282090038061142, "learning_rate": 4.653875115067415e-05, "loss": 0.0323, "step": 10920 }, { "epoch": 36.19205298013245, "grad_norm": 0.2659374177455902, "learning_rate": 4.6456280756430545e-05, "loss": 0.0298, "step": 10930 }, { "epoch": 36.22516556291391, "grad_norm": 0.2202862948179245, "learning_rate": 4.637382005058004e-05, "loss": 0.0332, "step": 10940 }, { "epoch": 36.258278145695364, "grad_norm": 0.19263967871665955, "learning_rate": 4.629136925856705e-05, "loss": 0.0315, "step": 10950 }, { "epoch": 36.29139072847682, "grad_norm": 0.3398522436618805, "learning_rate": 4.6208928605808895e-05, "loss": 0.0317, "step": 10960 }, { "epoch": 36.324503311258276, "grad_norm": 0.13398945331573486, "learning_rate": 4.612649831769519e-05, "loss": 0.0335, "step": 10970 }, { "epoch": 36.35761589403973, "grad_norm": 0.25498366355895996, "learning_rate": 4.604407861958715e-05, "loss": 0.0332, "step": 10980 }, { "epoch": 36.390728476821195, "grad_norm": 0.20276212692260742, "learning_rate": 4.5961669736817114e-05, "loss": 0.0329, "step": 10990 }, { "epoch": 36.42384105960265, "grad_norm": 0.24882645905017853, "learning_rate": 4.5879271894687814e-05, "loss": 0.0275, "step": 11000 }, { "epoch": 36.45695364238411, "grad_norm": 0.4265897274017334, "learning_rate": 4.5796885318471826e-05, "loss": 0.0351, "step": 11010 }, { "epoch": 36.49006622516556, "grad_norm": 0.2461688071489334, "learning_rate": 4.571451023341086e-05, "loss": 0.0313, "step": 11020 }, { "epoch": 36.52317880794702, "grad_norm": 0.22279717028141022, "learning_rate": 4.563214686471527e-05, "loss": 0.0346, "step": 11030 }, { "epoch": 36.556291390728475, "grad_norm": 0.2688103914260864, "learning_rate": 4.5549795437563365e-05, "loss": 0.0339, "step": 11040 }, { "epoch": 36.58940397350993, "grad_norm": 0.7398050427436829, "learning_rate": 4.546745617710081e-05, "loss": 0.0294, "step": 11050 }, { "epoch": 36.62251655629139, "grad_norm": 0.23308923840522766, "learning_rate": 4.5385129308440014e-05, "loss": 0.0334, "step": 11060 }, { "epoch": 36.65562913907285, "grad_norm": 0.19651198387145996, "learning_rate": 4.530281505665944e-05, "loss": 0.0355, "step": 11070 }, { "epoch": 36.688741721854306, "grad_norm": 0.15957315266132355, "learning_rate": 4.5220513646803134e-05, "loss": 0.0264, "step": 11080 }, { "epoch": 36.72185430463576, "grad_norm": 0.450812429189682, "learning_rate": 4.513822530388003e-05, "loss": 0.0318, "step": 11090 }, { "epoch": 36.75496688741722, "grad_norm": 0.29433169960975647, "learning_rate": 4.5055950252863296e-05, "loss": 0.0313, "step": 11100 }, { "epoch": 36.788079470198674, "grad_norm": 0.25340089201927185, "learning_rate": 4.4973688718689803e-05, "loss": 0.032, "step": 11110 }, { "epoch": 36.82119205298013, "grad_norm": 0.3011920750141144, "learning_rate": 4.4891440926259406e-05, "loss": 0.0302, "step": 11120 }, { "epoch": 36.854304635761586, "grad_norm": 0.24688753485679626, "learning_rate": 4.480920710043443e-05, "loss": 0.0328, "step": 11130 }, { "epoch": 36.88741721854305, "grad_norm": 0.2358098179101944, "learning_rate": 4.4726987466039044e-05, "loss": 0.0349, "step": 11140 }, { "epoch": 36.920529801324506, "grad_norm": 0.1336469203233719, "learning_rate": 4.46447822478586e-05, "loss": 0.0294, "step": 11150 }, { "epoch": 36.95364238410596, "grad_norm": 0.18710459768772125, "learning_rate": 4.4562591670638974e-05, "loss": 0.0265, "step": 11160 }, { "epoch": 36.98675496688742, "grad_norm": 0.3914312720298767, "learning_rate": 4.4480415959086105e-05, "loss": 0.0294, "step": 11170 }, { "epoch": 37.019867549668874, "grad_norm": 0.1925038993358612, "learning_rate": 4.439825533786522e-05, "loss": 0.0354, "step": 11180 }, { "epoch": 37.05298013245033, "grad_norm": 0.3509373962879181, "learning_rate": 4.431611003160035e-05, "loss": 0.0287, "step": 11190 }, { "epoch": 37.086092715231786, "grad_norm": 0.23057058453559875, "learning_rate": 4.4233980264873636e-05, "loss": 0.0334, "step": 11200 }, { "epoch": 37.11920529801324, "grad_norm": 0.31683772802352905, "learning_rate": 4.4151866262224684e-05, "loss": 0.0316, "step": 11210 }, { "epoch": 37.152317880794705, "grad_norm": 0.39582154154777527, "learning_rate": 4.406976824815006e-05, "loss": 0.0374, "step": 11220 }, { "epoch": 37.18543046357616, "grad_norm": 0.22726592421531677, "learning_rate": 4.3987686447102595e-05, "loss": 0.031, "step": 11230 }, { "epoch": 37.21854304635762, "grad_norm": 0.3297795057296753, "learning_rate": 4.3905621083490804e-05, "loss": 0.0317, "step": 11240 }, { "epoch": 37.25165562913907, "grad_norm": 0.17110410332679749, "learning_rate": 4.3823572381678286e-05, "loss": 0.0287, "step": 11250 }, { "epoch": 37.28476821192053, "grad_norm": 0.23990792036056519, "learning_rate": 4.374154056598301e-05, "loss": 0.0341, "step": 11260 }, { "epoch": 37.317880794701985, "grad_norm": 0.8576610684394836, "learning_rate": 4.3659525860676845e-05, "loss": 0.0303, "step": 11270 }, { "epoch": 37.35099337748344, "grad_norm": 0.2661357522010803, "learning_rate": 4.3577528489984854e-05, "loss": 0.0328, "step": 11280 }, { "epoch": 37.384105960264904, "grad_norm": 0.22212661802768707, "learning_rate": 4.349554867808476e-05, "loss": 0.0291, "step": 11290 }, { "epoch": 37.41721854304636, "grad_norm": 0.6875467896461487, "learning_rate": 4.34135866491062e-05, "loss": 0.0287, "step": 11300 }, { "epoch": 37.450331125827816, "grad_norm": 0.1712811440229416, "learning_rate": 4.333164262713022e-05, "loss": 0.028, "step": 11310 }, { "epoch": 37.48344370860927, "grad_norm": 0.1387980431318283, "learning_rate": 4.324971683618868e-05, "loss": 0.029, "step": 11320 }, { "epoch": 37.51655629139073, "grad_norm": 0.22156654298305511, "learning_rate": 4.316780950026354e-05, "loss": 0.0287, "step": 11330 }, { "epoch": 37.549668874172184, "grad_norm": 0.24115592241287231, "learning_rate": 4.308592084328637e-05, "loss": 0.03, "step": 11340 }, { "epoch": 37.58278145695364, "grad_norm": 0.42638343572616577, "learning_rate": 4.3004051089137576e-05, "loss": 0.0285, "step": 11350 }, { "epoch": 37.615894039735096, "grad_norm": 0.28123584389686584, "learning_rate": 4.292220046164597e-05, "loss": 0.0332, "step": 11360 }, { "epoch": 37.64900662251656, "grad_norm": 0.1709744930267334, "learning_rate": 4.2840369184588035e-05, "loss": 0.0305, "step": 11370 }, { "epoch": 37.682119205298015, "grad_norm": 0.3266321122646332, "learning_rate": 4.2758557481687345e-05, "loss": 0.0332, "step": 11380 }, { "epoch": 37.71523178807947, "grad_norm": 0.19532470405101776, "learning_rate": 4.267676557661403e-05, "loss": 0.0308, "step": 11390 }, { "epoch": 37.74834437086093, "grad_norm": 0.3276239037513733, "learning_rate": 4.2594993692983955e-05, "loss": 0.0307, "step": 11400 }, { "epoch": 37.78145695364238, "grad_norm": 0.2672770023345947, "learning_rate": 4.251324205435837e-05, "loss": 0.0293, "step": 11410 }, { "epoch": 37.81456953642384, "grad_norm": 0.21173541247844696, "learning_rate": 4.243151088424312e-05, "loss": 0.0322, "step": 11420 }, { "epoch": 37.847682119205295, "grad_norm": 0.35736680030822754, "learning_rate": 4.234980040608813e-05, "loss": 0.0282, "step": 11430 }, { "epoch": 37.88079470198676, "grad_norm": 0.34901079535484314, "learning_rate": 4.22681108432867e-05, "loss": 0.029, "step": 11440 }, { "epoch": 37.913907284768214, "grad_norm": 0.23467600345611572, "learning_rate": 4.2186442419174984e-05, "loss": 0.0285, "step": 11450 }, { "epoch": 37.94701986754967, "grad_norm": 0.2832430303096771, "learning_rate": 4.210479535703133e-05, "loss": 0.0304, "step": 11460 }, { "epoch": 37.980132450331126, "grad_norm": 0.3323991894721985, "learning_rate": 4.202316988007567e-05, "loss": 0.0302, "step": 11470 }, { "epoch": 38.01324503311258, "grad_norm": 0.24794968962669373, "learning_rate": 4.194156621146901e-05, "loss": 0.0299, "step": 11480 }, { "epoch": 38.04635761589404, "grad_norm": 0.32636773586273193, "learning_rate": 4.1859984574312596e-05, "loss": 0.028, "step": 11490 }, { "epoch": 38.079470198675494, "grad_norm": 0.25478073954582214, "learning_rate": 4.177842519164752e-05, "loss": 0.0257, "step": 11500 }, { "epoch": 38.11258278145695, "grad_norm": 0.1774207502603531, "learning_rate": 4.169688828645404e-05, "loss": 0.0288, "step": 11510 }, { "epoch": 38.145695364238414, "grad_norm": 0.20046834647655487, "learning_rate": 4.161537408165092e-05, "loss": 0.0299, "step": 11520 }, { "epoch": 38.17880794701987, "grad_norm": 0.29784736037254333, "learning_rate": 4.1533882800094924e-05, "loss": 0.0316, "step": 11530 }, { "epoch": 38.211920529801326, "grad_norm": 0.3994748592376709, "learning_rate": 4.145241466458005e-05, "loss": 0.0345, "step": 11540 }, { "epoch": 38.24503311258278, "grad_norm": 0.6250624060630798, "learning_rate": 4.13709698978371e-05, "loss": 0.0325, "step": 11550 }, { "epoch": 38.27814569536424, "grad_norm": 0.17900417745113373, "learning_rate": 4.1289548722532944e-05, "loss": 0.0321, "step": 11560 }, { "epoch": 38.311258278145694, "grad_norm": 0.3831682503223419, "learning_rate": 4.120815136126999e-05, "loss": 0.0321, "step": 11570 }, { "epoch": 38.34437086092715, "grad_norm": 0.25898477435112, "learning_rate": 4.112677803658548e-05, "loss": 0.0303, "step": 11580 }, { "epoch": 38.37748344370861, "grad_norm": 0.46061569452285767, "learning_rate": 4.1045428970951e-05, "loss": 0.0287, "step": 11590 }, { "epoch": 38.41059602649007, "grad_norm": 0.38484135270118713, "learning_rate": 4.0964104386771785e-05, "loss": 0.0318, "step": 11600 }, { "epoch": 38.443708609271525, "grad_norm": 0.2984238862991333, "learning_rate": 4.0882804506386144e-05, "loss": 0.0305, "step": 11610 }, { "epoch": 38.47682119205298, "grad_norm": 0.31329554319381714, "learning_rate": 4.080152955206485e-05, "loss": 0.0354, "step": 11620 }, { "epoch": 38.50993377483444, "grad_norm": 0.20007964968681335, "learning_rate": 4.0720279746010505e-05, "loss": 0.0363, "step": 11630 }, { "epoch": 38.54304635761589, "grad_norm": 0.21700434386730194, "learning_rate": 4.063905531035699e-05, "loss": 0.0292, "step": 11640 }, { "epoch": 38.57615894039735, "grad_norm": 0.19520461559295654, "learning_rate": 4.055785646716882e-05, "loss": 0.0308, "step": 11650 }, { "epoch": 38.609271523178805, "grad_norm": 0.21486185491085052, "learning_rate": 4.047668343844051e-05, "loss": 0.0303, "step": 11660 }, { "epoch": 38.64238410596027, "grad_norm": 0.3995651304721832, "learning_rate": 4.039553644609604e-05, "loss": 0.0299, "step": 11670 }, { "epoch": 38.675496688741724, "grad_norm": 0.23882295191287994, "learning_rate": 4.0314415711988176e-05, "loss": 0.0304, "step": 11680 }, { "epoch": 38.70860927152318, "grad_norm": 0.5273828506469727, "learning_rate": 4.023332145789792e-05, "loss": 0.0287, "step": 11690 }, { "epoch": 38.741721854304636, "grad_norm": 0.17762397229671478, "learning_rate": 4.015225390553385e-05, "loss": 0.0259, "step": 11700 }, { "epoch": 38.77483443708609, "grad_norm": 0.24309583008289337, "learning_rate": 4.007121327653158e-05, "loss": 0.0317, "step": 11710 }, { "epoch": 38.80794701986755, "grad_norm": 0.4205617308616638, "learning_rate": 3.9990199792453064e-05, "loss": 0.029, "step": 11720 }, { "epoch": 38.841059602649004, "grad_norm": 0.19731582701206207, "learning_rate": 3.9909213674786103e-05, "loss": 0.0291, "step": 11730 }, { "epoch": 38.87417218543047, "grad_norm": 0.2158842533826828, "learning_rate": 3.982825514494363e-05, "loss": 0.0286, "step": 11740 }, { "epoch": 38.90728476821192, "grad_norm": 0.21077322959899902, "learning_rate": 3.974732442426319e-05, "loss": 0.0314, "step": 11750 }, { "epoch": 38.94039735099338, "grad_norm": 0.30665168166160583, "learning_rate": 3.966642173400629e-05, "loss": 0.0288, "step": 11760 }, { "epoch": 38.973509933774835, "grad_norm": 0.2822461724281311, "learning_rate": 3.9585547295357764e-05, "loss": 0.0311, "step": 11770 }, { "epoch": 39.00662251655629, "grad_norm": 0.3230983018875122, "learning_rate": 3.950470132942526e-05, "loss": 0.0296, "step": 11780 }, { "epoch": 39.03973509933775, "grad_norm": 0.16945619881153107, "learning_rate": 3.942388405723856e-05, "loss": 0.0278, "step": 11790 }, { "epoch": 39.0728476821192, "grad_norm": 0.2642642855644226, "learning_rate": 3.9343095699749e-05, "loss": 0.0296, "step": 11800 }, { "epoch": 39.10596026490066, "grad_norm": 0.21198195219039917, "learning_rate": 3.9262336477828874e-05, "loss": 0.0336, "step": 11810 }, { "epoch": 39.13907284768212, "grad_norm": 0.2856614589691162, "learning_rate": 3.9181606612270794e-05, "loss": 0.0303, "step": 11820 }, { "epoch": 39.17218543046358, "grad_norm": 0.21097883582115173, "learning_rate": 3.910090632378713e-05, "loss": 0.028, "step": 11830 }, { "epoch": 39.205298013245034, "grad_norm": 0.2533677816390991, "learning_rate": 3.90202358330094e-05, "loss": 0.0299, "step": 11840 }, { "epoch": 39.23841059602649, "grad_norm": 0.9427504539489746, "learning_rate": 3.8939595360487656e-05, "loss": 0.0308, "step": 11850 }, { "epoch": 39.271523178807946, "grad_norm": 0.2323155254125595, "learning_rate": 3.885898512668984e-05, "loss": 0.0304, "step": 11860 }, { "epoch": 39.3046357615894, "grad_norm": 0.28928112983703613, "learning_rate": 3.877840535200127e-05, "loss": 0.0284, "step": 11870 }, { "epoch": 39.33774834437086, "grad_norm": 0.16505491733551025, "learning_rate": 3.869785625672397e-05, "loss": 0.0314, "step": 11880 }, { "epoch": 39.370860927152314, "grad_norm": 0.18097570538520813, "learning_rate": 3.8617338061076094e-05, "loss": 0.0281, "step": 11890 }, { "epoch": 39.40397350993378, "grad_norm": 0.17732447385787964, "learning_rate": 3.853685098519132e-05, "loss": 0.0301, "step": 11900 }, { "epoch": 39.437086092715234, "grad_norm": 0.17801932990550995, "learning_rate": 3.845639524911823e-05, "loss": 0.0305, "step": 11910 }, { "epoch": 39.47019867549669, "grad_norm": 0.5749412178993225, "learning_rate": 3.837597107281974e-05, "loss": 0.0307, "step": 11920 }, { "epoch": 39.503311258278146, "grad_norm": 0.2073831558227539, "learning_rate": 3.829557867617247e-05, "loss": 0.0294, "step": 11930 }, { "epoch": 39.5364238410596, "grad_norm": 0.31640639901161194, "learning_rate": 3.821521827896618e-05, "loss": 0.032, "step": 11940 }, { "epoch": 39.56953642384106, "grad_norm": 0.212114155292511, "learning_rate": 3.81348901009031e-05, "loss": 0.0311, "step": 11950 }, { "epoch": 39.602649006622514, "grad_norm": 0.2143804430961609, "learning_rate": 3.805459436159741e-05, "loss": 0.0333, "step": 11960 }, { "epoch": 39.63576158940398, "grad_norm": 0.5470573902130127, "learning_rate": 3.797433128057461e-05, "loss": 0.0291, "step": 11970 }, { "epoch": 39.66887417218543, "grad_norm": 0.32851001620292664, "learning_rate": 3.789410107727089e-05, "loss": 0.0326, "step": 11980 }, { "epoch": 39.70198675496689, "grad_norm": 0.16108420491218567, "learning_rate": 3.781390397103257e-05, "loss": 0.0288, "step": 11990 }, { "epoch": 39.735099337748345, "grad_norm": 0.24283955991268158, "learning_rate": 3.7733740181115455e-05, "loss": 0.0268, "step": 12000 }, { "epoch": 39.7682119205298, "grad_norm": 0.16471371054649353, "learning_rate": 3.7653609926684306e-05, "loss": 0.031, "step": 12010 }, { "epoch": 39.80132450331126, "grad_norm": 0.1810884326696396, "learning_rate": 3.757351342681217e-05, "loss": 0.0312, "step": 12020 }, { "epoch": 39.83443708609271, "grad_norm": 0.24186266958713531, "learning_rate": 3.749345090047982e-05, "loss": 0.0289, "step": 12030 }, { "epoch": 39.86754966887417, "grad_norm": 0.1484794020652771, "learning_rate": 3.741342256657515e-05, "loss": 0.0284, "step": 12040 }, { "epoch": 39.90066225165563, "grad_norm": 0.17371796071529388, "learning_rate": 3.7333428643892567e-05, "loss": 0.0285, "step": 12050 }, { "epoch": 39.93377483443709, "grad_norm": 0.9928889274597168, "learning_rate": 3.725346935113239e-05, "loss": 0.0311, "step": 12060 }, { "epoch": 39.966887417218544, "grad_norm": 0.1484404057264328, "learning_rate": 3.717354490690029e-05, "loss": 0.0273, "step": 12070 }, { "epoch": 40.0, "grad_norm": 0.35029229521751404, "learning_rate": 3.709365552970664e-05, "loss": 0.0266, "step": 12080 }, { "epoch": 40.033112582781456, "grad_norm": 0.20015496015548706, "learning_rate": 3.7013801437965945e-05, "loss": 0.026, "step": 12090 }, { "epoch": 40.06622516556291, "grad_norm": 0.24685202538967133, "learning_rate": 3.693398284999623e-05, "loss": 0.0318, "step": 12100 }, { "epoch": 40.09933774834437, "grad_norm": 0.23731215298175812, "learning_rate": 3.6854199984018484e-05, "loss": 0.0305, "step": 12110 }, { "epoch": 40.13245033112583, "grad_norm": 0.8071597218513489, "learning_rate": 3.677445305815601e-05, "loss": 0.029, "step": 12120 }, { "epoch": 40.16556291390729, "grad_norm": 0.21976056694984436, "learning_rate": 3.669474229043387e-05, "loss": 0.0291, "step": 12130 }, { "epoch": 40.19867549668874, "grad_norm": 0.25854623317718506, "learning_rate": 3.6615067898778235e-05, "loss": 0.0269, "step": 12140 }, { "epoch": 40.2317880794702, "grad_norm": 0.3673534393310547, "learning_rate": 3.6535430101015866e-05, "loss": 0.0299, "step": 12150 }, { "epoch": 40.264900662251655, "grad_norm": 0.3772508203983307, "learning_rate": 3.645582911487345e-05, "loss": 0.0301, "step": 12160 }, { "epoch": 40.29801324503311, "grad_norm": 0.2217094898223877, "learning_rate": 3.637626515797706e-05, "loss": 0.0284, "step": 12170 }, { "epoch": 40.33112582781457, "grad_norm": 0.3375459611415863, "learning_rate": 3.629673844785152e-05, "loss": 0.0343, "step": 12180 }, { "epoch": 40.36423841059602, "grad_norm": 0.2525017261505127, "learning_rate": 3.621724920191979e-05, "loss": 0.0294, "step": 12190 }, { "epoch": 40.397350993377486, "grad_norm": 0.5784022808074951, "learning_rate": 3.6137797637502444e-05, "loss": 0.0305, "step": 12200 }, { "epoch": 40.43046357615894, "grad_norm": 0.4149390161037445, "learning_rate": 3.6058383971817035e-05, "loss": 0.0325, "step": 12210 }, { "epoch": 40.4635761589404, "grad_norm": 0.1609480232000351, "learning_rate": 3.59790084219775e-05, "loss": 0.0282, "step": 12220 }, { "epoch": 40.496688741721854, "grad_norm": 0.312656432390213, "learning_rate": 3.589967120499353e-05, "loss": 0.0302, "step": 12230 }, { "epoch": 40.52980132450331, "grad_norm": 0.20036600530147552, "learning_rate": 3.5820372537770075e-05, "loss": 0.028, "step": 12240 }, { "epoch": 40.562913907284766, "grad_norm": 0.18584154546260834, "learning_rate": 3.5741112637106655e-05, "loss": 0.0312, "step": 12250 }, { "epoch": 40.59602649006622, "grad_norm": 0.20621097087860107, "learning_rate": 3.5661891719696804e-05, "loss": 0.0273, "step": 12260 }, { "epoch": 40.629139072847686, "grad_norm": 0.25101974606513977, "learning_rate": 3.5582710002127504e-05, "loss": 0.0255, "step": 12270 }, { "epoch": 40.66225165562914, "grad_norm": 0.17507719993591309, "learning_rate": 3.550356770087853e-05, "loss": 0.0292, "step": 12280 }, { "epoch": 40.6953642384106, "grad_norm": 0.4010429084300995, "learning_rate": 3.5424465032321914e-05, "loss": 0.0305, "step": 12290 }, { "epoch": 40.728476821192054, "grad_norm": 0.2750539481639862, "learning_rate": 3.5345402212721335e-05, "loss": 0.024, "step": 12300 }, { "epoch": 40.76158940397351, "grad_norm": 0.5297742486000061, "learning_rate": 3.526637945823152e-05, "loss": 0.0296, "step": 12310 }, { "epoch": 40.794701986754966, "grad_norm": 0.21984215080738068, "learning_rate": 3.518739698489767e-05, "loss": 0.0301, "step": 12320 }, { "epoch": 40.82781456953642, "grad_norm": 0.27345168590545654, "learning_rate": 3.510845500865485e-05, "loss": 0.0299, "step": 12330 }, { "epoch": 40.86092715231788, "grad_norm": 0.7299495935440063, "learning_rate": 3.502955374532739e-05, "loss": 0.0267, "step": 12340 }, { "epoch": 40.89403973509934, "grad_norm": 0.1416206806898117, "learning_rate": 3.495069341062836e-05, "loss": 0.0275, "step": 12350 }, { "epoch": 40.9271523178808, "grad_norm": 0.2925141155719757, "learning_rate": 3.4871874220158896e-05, "loss": 0.0313, "step": 12360 }, { "epoch": 40.96026490066225, "grad_norm": 0.20017962157726288, "learning_rate": 3.479309638940762e-05, "loss": 0.0265, "step": 12370 }, { "epoch": 40.99337748344371, "grad_norm": 0.22929278016090393, "learning_rate": 3.4714360133750146e-05, "loss": 0.0285, "step": 12380 }, { "epoch": 41.026490066225165, "grad_norm": 0.2136211395263672, "learning_rate": 3.463566566844839e-05, "loss": 0.0263, "step": 12390 }, { "epoch": 41.05960264900662, "grad_norm": 0.4275485873222351, "learning_rate": 3.4557013208650016e-05, "loss": 0.0238, "step": 12400 }, { "epoch": 41.09271523178808, "grad_norm": 0.2433047741651535, "learning_rate": 3.4478402969387857e-05, "loss": 0.0282, "step": 12410 }, { "epoch": 41.12582781456954, "grad_norm": 0.2757996618747711, "learning_rate": 3.4399835165579266e-05, "loss": 0.0271, "step": 12420 }, { "epoch": 41.158940397350996, "grad_norm": 0.19716499745845795, "learning_rate": 3.4321310012025645e-05, "loss": 0.0243, "step": 12430 }, { "epoch": 41.19205298013245, "grad_norm": 0.17559948563575745, "learning_rate": 3.424282772341176e-05, "loss": 0.0296, "step": 12440 }, { "epoch": 41.22516556291391, "grad_norm": 0.2857153117656708, "learning_rate": 3.416438851430519e-05, "loss": 0.0255, "step": 12450 }, { "epoch": 41.258278145695364, "grad_norm": 0.21669800579547882, "learning_rate": 3.408599259915577e-05, "loss": 0.0288, "step": 12460 }, { "epoch": 41.29139072847682, "grad_norm": 0.20823001861572266, "learning_rate": 3.400764019229487e-05, "loss": 0.0284, "step": 12470 }, { "epoch": 41.324503311258276, "grad_norm": 0.5184702277183533, "learning_rate": 3.3929331507935035e-05, "loss": 0.0256, "step": 12480 }, { "epoch": 41.35761589403973, "grad_norm": 0.22159075736999512, "learning_rate": 3.3851066760169196e-05, "loss": 0.0311, "step": 12490 }, { "epoch": 41.390728476821195, "grad_norm": 0.21630822122097015, "learning_rate": 3.377284616297021e-05, "loss": 0.0313, "step": 12500 }, { "epoch": 41.42384105960265, "grad_norm": 0.1872854381799698, "learning_rate": 3.3694669930190166e-05, "loss": 0.028, "step": 12510 }, { "epoch": 41.45695364238411, "grad_norm": 0.17626115679740906, "learning_rate": 3.36165382755599e-05, "loss": 0.0279, "step": 12520 }, { "epoch": 41.49006622516556, "grad_norm": 0.21602006256580353, "learning_rate": 3.35384514126884e-05, "loss": 0.0272, "step": 12530 }, { "epoch": 41.52317880794702, "grad_norm": 0.17685115337371826, "learning_rate": 3.3460409555062154e-05, "loss": 0.0264, "step": 12540 }, { "epoch": 41.556291390728475, "grad_norm": 0.2286406010389328, "learning_rate": 3.3382412916044645e-05, "loss": 0.0278, "step": 12550 }, { "epoch": 41.58940397350993, "grad_norm": 0.22394977509975433, "learning_rate": 3.330446170887566e-05, "loss": 0.0266, "step": 12560 }, { "epoch": 41.62251655629139, "grad_norm": 0.3603760898113251, "learning_rate": 3.3226556146670834e-05, "loss": 0.0291, "step": 12570 }, { "epoch": 41.65562913907285, "grad_norm": 0.18654073774814606, "learning_rate": 3.314869644242102e-05, "loss": 0.0286, "step": 12580 }, { "epoch": 41.688741721854306, "grad_norm": 0.21928901970386505, "learning_rate": 3.3070882808991674e-05, "loss": 0.03, "step": 12590 }, { "epoch": 41.72185430463576, "grad_norm": 0.21380895376205444, "learning_rate": 3.2993115459122305e-05, "loss": 0.029, "step": 12600 }, { "epoch": 41.75496688741722, "grad_norm": 0.2798177897930145, "learning_rate": 3.2915394605425835e-05, "loss": 0.0257, "step": 12610 }, { "epoch": 41.788079470198674, "grad_norm": 0.2315150499343872, "learning_rate": 3.283772046038816e-05, "loss": 0.0256, "step": 12620 }, { "epoch": 41.82119205298013, "grad_norm": 0.1403428018093109, "learning_rate": 3.276009323636739e-05, "loss": 0.0287, "step": 12630 }, { "epoch": 41.854304635761586, "grad_norm": 0.21724843978881836, "learning_rate": 3.268251314559344e-05, "loss": 0.027, "step": 12640 }, { "epoch": 41.88741721854305, "grad_norm": 0.15753726661205292, "learning_rate": 3.2604980400167254e-05, "loss": 0.0295, "step": 12650 }, { "epoch": 41.920529801324506, "grad_norm": 0.15565885603427887, "learning_rate": 3.252749521206042e-05, "loss": 0.0287, "step": 12660 }, { "epoch": 41.95364238410596, "grad_norm": 0.16612857580184937, "learning_rate": 3.2450057793114494e-05, "loss": 0.0267, "step": 12670 }, { "epoch": 41.98675496688742, "grad_norm": 0.47622784972190857, "learning_rate": 3.2372668355040435e-05, "loss": 0.0299, "step": 12680 }, { "epoch": 42.019867549668874, "grad_norm": 0.5689780712127686, "learning_rate": 3.2295327109418005e-05, "loss": 0.0295, "step": 12690 }, { "epoch": 42.05298013245033, "grad_norm": 0.21783658862113953, "learning_rate": 3.221803426769518e-05, "loss": 0.0298, "step": 12700 }, { "epoch": 42.086092715231786, "grad_norm": 0.27987921237945557, "learning_rate": 3.214079004118768e-05, "loss": 0.0265, "step": 12710 }, { "epoch": 42.11920529801324, "grad_norm": 0.20422133803367615, "learning_rate": 3.2063594641078234e-05, "loss": 0.025, "step": 12720 }, { "epoch": 42.152317880794705, "grad_norm": 0.282254159450531, "learning_rate": 3.198644827841616e-05, "loss": 0.0298, "step": 12730 }, { "epoch": 42.18543046357616, "grad_norm": 0.3692350387573242, "learning_rate": 3.1909351164116654e-05, "loss": 0.0313, "step": 12740 }, { "epoch": 42.21854304635762, "grad_norm": 0.2074042707681656, "learning_rate": 3.183230350896026e-05, "loss": 0.0297, "step": 12750 }, { "epoch": 42.25165562913907, "grad_norm": 0.19803418219089508, "learning_rate": 3.1755305523592337e-05, "loss": 0.0284, "step": 12760 }, { "epoch": 42.28476821192053, "grad_norm": 0.20594845712184906, "learning_rate": 3.167835741852245e-05, "loss": 0.032, "step": 12770 }, { "epoch": 42.317880794701985, "grad_norm": 0.2448321431875229, "learning_rate": 3.160145940412378e-05, "loss": 0.0322, "step": 12780 }, { "epoch": 42.35099337748344, "grad_norm": 0.3952292501926422, "learning_rate": 3.1524611690632545e-05, "loss": 0.0297, "step": 12790 }, { "epoch": 42.384105960264904, "grad_norm": 0.35092097520828247, "learning_rate": 3.144781448814746e-05, "loss": 0.0315, "step": 12800 }, { "epoch": 42.41721854304636, "grad_norm": 0.24274170398712158, "learning_rate": 3.1371068006629145e-05, "loss": 0.0286, "step": 12810 }, { "epoch": 42.450331125827816, "grad_norm": 0.2861262261867523, "learning_rate": 3.129437245589956e-05, "loss": 0.0262, "step": 12820 }, { "epoch": 42.48344370860927, "grad_norm": 0.21543806791305542, "learning_rate": 3.121772804564143e-05, "loss": 0.0276, "step": 12830 }, { "epoch": 42.51655629139073, "grad_norm": 0.35661378502845764, "learning_rate": 3.11411349853976e-05, "loss": 0.0266, "step": 12840 }, { "epoch": 42.549668874172184, "grad_norm": 0.20661170780658722, "learning_rate": 3.10645934845706e-05, "loss": 0.0258, "step": 12850 }, { "epoch": 42.58278145695364, "grad_norm": 0.19220682978630066, "learning_rate": 3.098810375242196e-05, "loss": 0.0269, "step": 12860 }, { "epoch": 42.615894039735096, "grad_norm": 0.2968003749847412, "learning_rate": 3.0911665998071704e-05, "loss": 0.0279, "step": 12870 }, { "epoch": 42.64900662251656, "grad_norm": 0.22863176465034485, "learning_rate": 3.083528043049774e-05, "loss": 0.0277, "step": 12880 }, { "epoch": 42.682119205298015, "grad_norm": 0.2997119724750519, "learning_rate": 3.0758947258535255e-05, "loss": 0.0273, "step": 12890 }, { "epoch": 42.71523178807947, "grad_norm": 0.3841931223869324, "learning_rate": 3.068266669087625e-05, "loss": 0.0277, "step": 12900 }, { "epoch": 42.74834437086093, "grad_norm": 0.2254512459039688, "learning_rate": 3.060643893606887e-05, "loss": 0.0301, "step": 12910 }, { "epoch": 42.78145695364238, "grad_norm": 0.23568278551101685, "learning_rate": 3.053026420251693e-05, "loss": 0.0255, "step": 12920 }, { "epoch": 42.81456953642384, "grad_norm": 0.3137474060058594, "learning_rate": 3.0454142698479183e-05, "loss": 0.0232, "step": 12930 }, { "epoch": 42.847682119205295, "grad_norm": 0.1857440322637558, "learning_rate": 3.0378074632068954e-05, "loss": 0.0302, "step": 12940 }, { "epoch": 42.88079470198676, "grad_norm": 0.3241572678089142, "learning_rate": 3.0302060211253408e-05, "loss": 0.0288, "step": 12950 }, { "epoch": 42.913907284768214, "grad_norm": 0.1517849713563919, "learning_rate": 3.0226099643853073e-05, "loss": 0.0271, "step": 12960 }, { "epoch": 42.94701986754967, "grad_norm": 0.3828745186328888, "learning_rate": 3.0150193137541283e-05, "loss": 0.028, "step": 12970 }, { "epoch": 42.980132450331126, "grad_norm": 0.16545556485652924, "learning_rate": 3.0074340899843467e-05, "loss": 0.0256, "step": 12980 }, { "epoch": 43.01324503311258, "grad_norm": 0.1494814157485962, "learning_rate": 2.999854313813677e-05, "loss": 0.0258, "step": 12990 }, { "epoch": 43.04635761589404, "grad_norm": 0.21991249918937683, "learning_rate": 2.9922800059649382e-05, "loss": 0.0255, "step": 13000 }, { "epoch": 43.079470198675494, "grad_norm": 0.27084603905677795, "learning_rate": 2.9847111871459976e-05, "loss": 0.0288, "step": 13010 }, { "epoch": 43.11258278145695, "grad_norm": 0.3045360743999481, "learning_rate": 2.977147878049721e-05, "loss": 0.0282, "step": 13020 }, { "epoch": 43.145695364238414, "grad_norm": 0.20979227125644684, "learning_rate": 2.9695900993539006e-05, "loss": 0.0262, "step": 13030 }, { "epoch": 43.17880794701987, "grad_norm": 0.2415732592344284, "learning_rate": 2.9620378717212183e-05, "loss": 0.0233, "step": 13040 }, { "epoch": 43.211920529801326, "grad_norm": 0.25760385394096375, "learning_rate": 2.9544912157991745e-05, "loss": 0.0243, "step": 13050 }, { "epoch": 43.24503311258278, "grad_norm": 0.163492351770401, "learning_rate": 2.9469501522200405e-05, "loss": 0.0272, "step": 13060 }, { "epoch": 43.27814569536424, "grad_norm": 0.29214584827423096, "learning_rate": 2.9394147016007946e-05, "loss": 0.0281, "step": 13070 }, { "epoch": 43.311258278145694, "grad_norm": 0.3045174181461334, "learning_rate": 2.9318848845430702e-05, "loss": 0.025, "step": 13080 }, { "epoch": 43.34437086092715, "grad_norm": 0.26643693447113037, "learning_rate": 2.9243607216331013e-05, "loss": 0.0246, "step": 13090 }, { "epoch": 43.37748344370861, "grad_norm": 0.1854688972234726, "learning_rate": 2.916842233441661e-05, "loss": 0.0272, "step": 13100 }, { "epoch": 43.41059602649007, "grad_norm": 0.20955538749694824, "learning_rate": 2.90932944052401e-05, "loss": 0.0287, "step": 13110 }, { "epoch": 43.443708609271525, "grad_norm": 0.2092038244009018, "learning_rate": 2.9018223634198354e-05, "loss": 0.0277, "step": 13120 }, { "epoch": 43.47682119205298, "grad_norm": 0.26196178793907166, "learning_rate": 2.8943210226532025e-05, "loss": 0.0273, "step": 13130 }, { "epoch": 43.50993377483444, "grad_norm": 0.36422455310821533, "learning_rate": 2.8868254387324857e-05, "loss": 0.0266, "step": 13140 }, { "epoch": 43.54304635761589, "grad_norm": 0.4578329026699066, "learning_rate": 2.8793356321503306e-05, "loss": 0.027, "step": 13150 }, { "epoch": 43.57615894039735, "grad_norm": 0.16047930717468262, "learning_rate": 2.87185162338358e-05, "loss": 0.0246, "step": 13160 }, { "epoch": 43.609271523178805, "grad_norm": 0.23854345083236694, "learning_rate": 2.8643734328932253e-05, "loss": 0.027, "step": 13170 }, { "epoch": 43.64238410596027, "grad_norm": 0.23841392993927002, "learning_rate": 2.856901081124359e-05, "loss": 0.027, "step": 13180 }, { "epoch": 43.675496688741724, "grad_norm": 0.21374045312404633, "learning_rate": 2.8494345885061002e-05, "loss": 0.0286, "step": 13190 }, { "epoch": 43.70860927152318, "grad_norm": 0.25766780972480774, "learning_rate": 2.8419739754515616e-05, "loss": 0.0283, "step": 13200 }, { "epoch": 43.741721854304636, "grad_norm": 0.39618444442749023, "learning_rate": 2.8345192623577666e-05, "loss": 0.0261, "step": 13210 }, { "epoch": 43.77483443708609, "grad_norm": 0.19191734492778778, "learning_rate": 2.8270704696056193e-05, "loss": 0.0248, "step": 13220 }, { "epoch": 43.80794701986755, "grad_norm": 0.21463987231254578, "learning_rate": 2.8196276175598367e-05, "loss": 0.0259, "step": 13230 }, { "epoch": 43.841059602649004, "grad_norm": 0.15801990032196045, "learning_rate": 2.8121907265688884e-05, "loss": 0.0259, "step": 13240 }, { "epoch": 43.87417218543047, "grad_norm": 0.3054826855659485, "learning_rate": 2.804759816964957e-05, "loss": 0.0274, "step": 13250 }, { "epoch": 43.90728476821192, "grad_norm": 0.21815967559814453, "learning_rate": 2.797334909063857e-05, "loss": 0.0262, "step": 13260 }, { "epoch": 43.94039735099338, "grad_norm": 0.15842215716838837, "learning_rate": 2.7899160231650056e-05, "loss": 0.0283, "step": 13270 }, { "epoch": 43.973509933774835, "grad_norm": 0.14069923758506775, "learning_rate": 2.7825031795513585e-05, "loss": 0.0283, "step": 13280 }, { "epoch": 44.00662251655629, "grad_norm": 0.22023382782936096, "learning_rate": 2.775096398489341e-05, "loss": 0.0265, "step": 13290 }, { "epoch": 44.03973509933775, "grad_norm": 0.129379004240036, "learning_rate": 2.7676957002288163e-05, "loss": 0.0279, "step": 13300 }, { "epoch": 44.0728476821192, "grad_norm": 0.18019552528858185, "learning_rate": 2.760301105003003e-05, "loss": 0.0296, "step": 13310 }, { "epoch": 44.10596026490066, "grad_norm": 0.2472902238368988, "learning_rate": 2.752912633028446e-05, "loss": 0.026, "step": 13320 }, { "epoch": 44.13907284768212, "grad_norm": 0.27610862255096436, "learning_rate": 2.7455303045049474e-05, "loss": 0.0247, "step": 13330 }, { "epoch": 44.17218543046358, "grad_norm": 0.19796645641326904, "learning_rate": 2.7381541396155098e-05, "loss": 0.0316, "step": 13340 }, { "epoch": 44.205298013245034, "grad_norm": 0.34906208515167236, "learning_rate": 2.730784158526286e-05, "loss": 0.0265, "step": 13350 }, { "epoch": 44.23841059602649, "grad_norm": 0.13970595598220825, "learning_rate": 2.723420381386521e-05, "loss": 0.0264, "step": 13360 }, { "epoch": 44.271523178807946, "grad_norm": 0.20864208042621613, "learning_rate": 2.7160628283285018e-05, "loss": 0.0306, "step": 13370 }, { "epoch": 44.3046357615894, "grad_norm": 0.20458504557609558, "learning_rate": 2.7087115194675007e-05, "loss": 0.0265, "step": 13380 }, { "epoch": 44.33774834437086, "grad_norm": 0.4178740978240967, "learning_rate": 2.701366474901712e-05, "loss": 0.0257, "step": 13390 }, { "epoch": 44.370860927152314, "grad_norm": 0.48967328667640686, "learning_rate": 2.6940277147122085e-05, "loss": 0.0286, "step": 13400 }, { "epoch": 44.40397350993378, "grad_norm": 0.14464400708675385, "learning_rate": 2.686695258962878e-05, "loss": 0.0258, "step": 13410 }, { "epoch": 44.437086092715234, "grad_norm": 0.17512960731983185, "learning_rate": 2.679369127700375e-05, "loss": 0.0275, "step": 13420 }, { "epoch": 44.47019867549669, "grad_norm": 0.4980560541152954, "learning_rate": 2.672049340954067e-05, "loss": 0.0271, "step": 13430 }, { "epoch": 44.503311258278146, "grad_norm": 0.23665954172611237, "learning_rate": 2.6647359187359676e-05, "loss": 0.0285, "step": 13440 }, { "epoch": 44.5364238410596, "grad_norm": 0.15417209267616272, "learning_rate": 2.6574288810406946e-05, "loss": 0.0294, "step": 13450 }, { "epoch": 44.56953642384106, "grad_norm": 0.24678270518779755, "learning_rate": 2.6501282478454083e-05, "loss": 0.0306, "step": 13460 }, { "epoch": 44.602649006622514, "grad_norm": 0.41801917552948, "learning_rate": 2.6428340391097618e-05, "loss": 0.0283, "step": 13470 }, { "epoch": 44.63576158940398, "grad_norm": 0.19247105717658997, "learning_rate": 2.6355462747758485e-05, "loss": 0.0233, "step": 13480 }, { "epoch": 44.66887417218543, "grad_norm": 0.1867855340242386, "learning_rate": 2.6282649747681304e-05, "loss": 0.0304, "step": 13490 }, { "epoch": 44.70198675496689, "grad_norm": 0.29475754499435425, "learning_rate": 2.620990158993406e-05, "loss": 0.0232, "step": 13500 }, { "epoch": 44.735099337748345, "grad_norm": 0.3131444454193115, "learning_rate": 2.6137218473407477e-05, "loss": 0.0265, "step": 13510 }, { "epoch": 44.7682119205298, "grad_norm": 0.27327319979667664, "learning_rate": 2.606460059681436e-05, "loss": 0.0267, "step": 13520 }, { "epoch": 44.80132450331126, "grad_norm": 0.2430776208639145, "learning_rate": 2.599204815868928e-05, "loss": 0.0252, "step": 13530 }, { "epoch": 44.83443708609271, "grad_norm": 0.19147177040576935, "learning_rate": 2.5919561357387756e-05, "loss": 0.0227, "step": 13540 }, { "epoch": 44.86754966887417, "grad_norm": 0.3629341721534729, "learning_rate": 2.5847140391085972e-05, "loss": 0.0251, "step": 13550 }, { "epoch": 44.90066225165563, "grad_norm": 0.5238823890686035, "learning_rate": 2.5774785457780103e-05, "loss": 0.0255, "step": 13560 }, { "epoch": 44.93377483443709, "grad_norm": 0.3454253077507019, "learning_rate": 2.5702496755285753e-05, "loss": 0.0281, "step": 13570 }, { "epoch": 44.966887417218544, "grad_norm": 0.18038779497146606, "learning_rate": 2.5630274481237483e-05, "loss": 0.0274, "step": 13580 }, { "epoch": 45.0, "grad_norm": 0.23894493281841278, "learning_rate": 2.5558118833088197e-05, "loss": 0.0256, "step": 13590 }, { "epoch": 45.033112582781456, "grad_norm": 0.15541237592697144, "learning_rate": 2.548603000810872e-05, "loss": 0.0235, "step": 13600 }, { "epoch": 45.06622516556291, "grad_norm": 0.22609639167785645, "learning_rate": 2.5414008203387152e-05, "loss": 0.0313, "step": 13610 }, { "epoch": 45.09933774834437, "grad_norm": 0.22861701250076294, "learning_rate": 2.534205361582834e-05, "loss": 0.0225, "step": 13620 }, { "epoch": 45.13245033112583, "grad_norm": 0.13009682297706604, "learning_rate": 2.527016644215338e-05, "loss": 0.0241, "step": 13630 }, { "epoch": 45.16556291390729, "grad_norm": 0.26994094252586365, "learning_rate": 2.519834687889905e-05, "loss": 0.0263, "step": 13640 }, { "epoch": 45.19867549668874, "grad_norm": 0.2084682136774063, "learning_rate": 2.5126595122417295e-05, "loss": 0.024, "step": 13650 }, { "epoch": 45.2317880794702, "grad_norm": 0.25466686487197876, "learning_rate": 2.5054911368874713e-05, "loss": 0.028, "step": 13660 }, { "epoch": 45.264900662251655, "grad_norm": 0.3024353086948395, "learning_rate": 2.4983295814251916e-05, "loss": 0.0261, "step": 13670 }, { "epoch": 45.29801324503311, "grad_norm": 0.35666775703430176, "learning_rate": 2.4911748654343105e-05, "loss": 0.0246, "step": 13680 }, { "epoch": 45.33112582781457, "grad_norm": 0.20765885710716248, "learning_rate": 2.4840270084755463e-05, "loss": 0.0272, "step": 13690 }, { "epoch": 45.36423841059602, "grad_norm": 0.3087199628353119, "learning_rate": 2.4768860300908685e-05, "loss": 0.0284, "step": 13700 }, { "epoch": 45.397350993377486, "grad_norm": 0.23970980942249298, "learning_rate": 2.469751949803443e-05, "loss": 0.027, "step": 13710 }, { "epoch": 45.43046357615894, "grad_norm": 0.1528858244419098, "learning_rate": 2.4626247871175666e-05, "loss": 0.0271, "step": 13720 }, { "epoch": 45.4635761589404, "grad_norm": 0.16680683195590973, "learning_rate": 2.4555045615186346e-05, "loss": 0.0243, "step": 13730 }, { "epoch": 45.496688741721854, "grad_norm": 0.4507889449596405, "learning_rate": 2.4483912924730677e-05, "loss": 0.0225, "step": 13740 }, { "epoch": 45.52980132450331, "grad_norm": 0.18513129651546478, "learning_rate": 2.4412849994282742e-05, "loss": 0.0273, "step": 13750 }, { "epoch": 45.562913907284766, "grad_norm": 0.26876774430274963, "learning_rate": 2.434185701812592e-05, "loss": 0.026, "step": 13760 }, { "epoch": 45.59602649006622, "grad_norm": 0.23975156247615814, "learning_rate": 2.4270934190352218e-05, "loss": 0.0284, "step": 13770 }, { "epoch": 45.629139072847686, "grad_norm": 0.24307018518447876, "learning_rate": 2.4200081704861998e-05, "loss": 0.0252, "step": 13780 }, { "epoch": 45.66225165562914, "grad_norm": 0.1909065544605255, "learning_rate": 2.412929975536321e-05, "loss": 0.0223, "step": 13790 }, { "epoch": 45.6953642384106, "grad_norm": 0.1662614345550537, "learning_rate": 2.4058588535371017e-05, "loss": 0.0249, "step": 13800 }, { "epoch": 45.728476821192054, "grad_norm": 0.18997307121753693, "learning_rate": 2.3987948238207243e-05, "loss": 0.0229, "step": 13810 }, { "epoch": 45.76158940397351, "grad_norm": 0.213860422372818, "learning_rate": 2.3917379056999678e-05, "loss": 0.0248, "step": 13820 }, { "epoch": 45.794701986754966, "grad_norm": 0.23504944145679474, "learning_rate": 2.3846881184681824e-05, "loss": 0.0234, "step": 13830 }, { "epoch": 45.82781456953642, "grad_norm": 0.347250372171402, "learning_rate": 2.377645481399214e-05, "loss": 0.0314, "step": 13840 }, { "epoch": 45.86092715231788, "grad_norm": 0.12555301189422607, "learning_rate": 2.3706100137473667e-05, "loss": 0.0259, "step": 13850 }, { "epoch": 45.89403973509934, "grad_norm": 0.25163450837135315, "learning_rate": 2.3635817347473394e-05, "loss": 0.0252, "step": 13860 }, { "epoch": 45.9271523178808, "grad_norm": 0.14467217028141022, "learning_rate": 2.3565606636141757e-05, "loss": 0.0248, "step": 13870 }, { "epoch": 45.96026490066225, "grad_norm": 0.15821459889411926, "learning_rate": 2.3495468195432203e-05, "loss": 0.0218, "step": 13880 }, { "epoch": 45.99337748344371, "grad_norm": 0.16929589211940765, "learning_rate": 2.3425402217100507e-05, "loss": 0.0278, "step": 13890 }, { "epoch": 46.026490066225165, "grad_norm": 0.13030746579170227, "learning_rate": 2.3355408892704424e-05, "loss": 0.0247, "step": 13900 }, { "epoch": 46.05960264900662, "grad_norm": 0.18366019427776337, "learning_rate": 2.3285488413603003e-05, "loss": 0.0271, "step": 13910 }, { "epoch": 46.09271523178808, "grad_norm": 0.19861802458763123, "learning_rate": 2.321564097095615e-05, "loss": 0.0254, "step": 13920 }, { "epoch": 46.12582781456954, "grad_norm": 0.4009852111339569, "learning_rate": 2.3145866755724142e-05, "loss": 0.0251, "step": 13930 }, { "epoch": 46.158940397350996, "grad_norm": 0.4459439218044281, "learning_rate": 2.307616595866699e-05, "loss": 0.0239, "step": 13940 }, { "epoch": 46.19205298013245, "grad_norm": 0.2984050214290619, "learning_rate": 2.3006538770344032e-05, "loss": 0.0201, "step": 13950 }, { "epoch": 46.22516556291391, "grad_norm": 0.17963097989559174, "learning_rate": 2.293698538111334e-05, "loss": 0.0223, "step": 13960 }, { "epoch": 46.258278145695364, "grad_norm": 0.25005754828453064, "learning_rate": 2.28675059811312e-05, "loss": 0.0255, "step": 13970 }, { "epoch": 46.29139072847682, "grad_norm": 0.18441782891750336, "learning_rate": 2.279810076035167e-05, "loss": 0.0264, "step": 13980 }, { "epoch": 46.324503311258276, "grad_norm": 0.4007423222064972, "learning_rate": 2.272876990852596e-05, "loss": 0.0268, "step": 13990 }, { "epoch": 46.35761589403973, "grad_norm": 0.24818086624145508, "learning_rate": 2.265951361520195e-05, "loss": 0.0229, "step": 14000 }, { "epoch": 46.390728476821195, "grad_norm": 0.17871522903442383, "learning_rate": 2.2590332069723748e-05, "loss": 0.0234, "step": 14010 }, { "epoch": 46.42384105960265, "grad_norm": 0.332699179649353, "learning_rate": 2.2521225461231004e-05, "loss": 0.0224, "step": 14020 }, { "epoch": 46.45695364238411, "grad_norm": 0.16376204788684845, "learning_rate": 2.2452193978658597e-05, "loss": 0.0287, "step": 14030 }, { "epoch": 46.49006622516556, "grad_norm": 0.15537136793136597, "learning_rate": 2.238323781073594e-05, "loss": 0.0248, "step": 14040 }, { "epoch": 46.52317880794702, "grad_norm": 0.14758358895778656, "learning_rate": 2.2314357145986552e-05, "loss": 0.0237, "step": 14050 }, { "epoch": 46.556291390728475, "grad_norm": 0.3315291404724121, "learning_rate": 2.224555217272757e-05, "loss": 0.0239, "step": 14060 }, { "epoch": 46.58940397350993, "grad_norm": 0.3423442244529724, "learning_rate": 2.2176823079069127e-05, "loss": 0.0267, "step": 14070 }, { "epoch": 46.62251655629139, "grad_norm": 0.18832775950431824, "learning_rate": 2.210817005291398e-05, "loss": 0.0253, "step": 14080 }, { "epoch": 46.65562913907285, "grad_norm": 0.30161669850349426, "learning_rate": 2.203959328195686e-05, "loss": 0.0205, "step": 14090 }, { "epoch": 46.688741721854306, "grad_norm": 0.8834372162818909, "learning_rate": 2.1971092953684026e-05, "loss": 0.0241, "step": 14100 }, { "epoch": 46.72185430463576, "grad_norm": 0.178205206990242, "learning_rate": 2.1902669255372788e-05, "loss": 0.0222, "step": 14110 }, { "epoch": 46.75496688741722, "grad_norm": 0.18602070212364197, "learning_rate": 2.1834322374090897e-05, "loss": 0.0237, "step": 14120 }, { "epoch": 46.788079470198674, "grad_norm": 0.20869125425815582, "learning_rate": 2.1766052496696153e-05, "loss": 0.0239, "step": 14130 }, { "epoch": 46.82119205298013, "grad_norm": 0.17276296019554138, "learning_rate": 2.169785980983577e-05, "loss": 0.0267, "step": 14140 }, { "epoch": 46.854304635761586, "grad_norm": 0.5190914869308472, "learning_rate": 2.162974449994593e-05, "loss": 0.026, "step": 14150 }, { "epoch": 46.88741721854305, "grad_norm": 0.39548423886299133, "learning_rate": 2.1561706753251337e-05, "loss": 0.0246, "step": 14160 }, { "epoch": 46.920529801324506, "grad_norm": 0.19796308875083923, "learning_rate": 2.1493746755764544e-05, "loss": 0.0231, "step": 14170 }, { "epoch": 46.95364238410596, "grad_norm": 0.22226747870445251, "learning_rate": 2.1425864693285635e-05, "loss": 0.0274, "step": 14180 }, { "epoch": 46.98675496688742, "grad_norm": 0.13562965393066406, "learning_rate": 2.1358060751401547e-05, "loss": 0.0221, "step": 14190 }, { "epoch": 47.019867549668874, "grad_norm": 0.19479365646839142, "learning_rate": 2.129033511548566e-05, "loss": 0.0254, "step": 14200 }, { "epoch": 47.05298013245033, "grad_norm": 0.156860813498497, "learning_rate": 2.1222687970697315e-05, "loss": 0.0228, "step": 14210 }, { "epoch": 47.086092715231786, "grad_norm": 0.222572922706604, "learning_rate": 2.1155119501981173e-05, "loss": 0.0249, "step": 14220 }, { "epoch": 47.11920529801324, "grad_norm": 0.4172445833683014, "learning_rate": 2.1087629894066895e-05, "loss": 0.0249, "step": 14230 }, { "epoch": 47.152317880794705, "grad_norm": 0.1780320107936859, "learning_rate": 2.1020219331468473e-05, "loss": 0.025, "step": 14240 }, { "epoch": 47.18543046357616, "grad_norm": 0.2399112433195114, "learning_rate": 2.095288799848379e-05, "loss": 0.0262, "step": 14250 }, { "epoch": 47.21854304635762, "grad_norm": 0.3587103486061096, "learning_rate": 2.088563607919417e-05, "loss": 0.0262, "step": 14260 }, { "epoch": 47.25165562913907, "grad_norm": 0.5932842493057251, "learning_rate": 2.0818463757463786e-05, "loss": 0.024, "step": 14270 }, { "epoch": 47.28476821192053, "grad_norm": 0.33412808179855347, "learning_rate": 2.0751371216939175e-05, "loss": 0.0259, "step": 14280 }, { "epoch": 47.317880794701985, "grad_norm": 0.172576442360878, "learning_rate": 2.068435864104882e-05, "loss": 0.0263, "step": 14290 }, { "epoch": 47.35099337748344, "grad_norm": 0.6864001750946045, "learning_rate": 2.0617426213002506e-05, "loss": 0.0235, "step": 14300 }, { "epoch": 47.384105960264904, "grad_norm": 0.20690110325813293, "learning_rate": 2.055057411579097e-05, "loss": 0.0233, "step": 14310 }, { "epoch": 47.41721854304636, "grad_norm": 0.20191584527492523, "learning_rate": 2.0483802532185286e-05, "loss": 0.0239, "step": 14320 }, { "epoch": 47.450331125827816, "grad_norm": 0.19868309795856476, "learning_rate": 2.041711164473638e-05, "loss": 0.0269, "step": 14330 }, { "epoch": 47.48344370860927, "grad_norm": 0.1512822061777115, "learning_rate": 2.0350501635774637e-05, "loss": 0.025, "step": 14340 }, { "epoch": 47.51655629139073, "grad_norm": 0.2615854740142822, "learning_rate": 2.0283972687409247e-05, "loss": 0.0233, "step": 14350 }, { "epoch": 47.549668874172184, "grad_norm": 0.21799106895923615, "learning_rate": 2.021752498152784e-05, "loss": 0.0269, "step": 14360 }, { "epoch": 47.58278145695364, "grad_norm": 0.1833494007587433, "learning_rate": 2.015115869979589e-05, "loss": 0.0219, "step": 14370 }, { "epoch": 47.615894039735096, "grad_norm": 0.22705607116222382, "learning_rate": 2.0084874023656265e-05, "loss": 0.027, "step": 14380 }, { "epoch": 47.64900662251656, "grad_norm": 0.23188988864421844, "learning_rate": 2.001867113432877e-05, "loss": 0.0203, "step": 14390 }, { "epoch": 47.682119205298015, "grad_norm": 0.5443418622016907, "learning_rate": 1.995255021280954e-05, "loss": 0.0265, "step": 14400 }, { "epoch": 47.71523178807947, "grad_norm": 0.3573514223098755, "learning_rate": 1.9886511439870688e-05, "loss": 0.0248, "step": 14410 }, { "epoch": 47.74834437086093, "grad_norm": 0.8361745476722717, "learning_rate": 1.9820554996059675e-05, "loss": 0.0244, "step": 14420 }, { "epoch": 47.78145695364238, "grad_norm": 0.27226555347442627, "learning_rate": 1.9754681061698893e-05, "loss": 0.0216, "step": 14430 }, { "epoch": 47.81456953642384, "grad_norm": 0.21065223217010498, "learning_rate": 1.9688889816885185e-05, "loss": 0.0224, "step": 14440 }, { "epoch": 47.847682119205295, "grad_norm": 0.20882853865623474, "learning_rate": 1.962318144148928e-05, "loss": 0.0234, "step": 14450 }, { "epoch": 47.88079470198676, "grad_norm": 0.3774722218513489, "learning_rate": 1.955755611515539e-05, "loss": 0.0264, "step": 14460 }, { "epoch": 47.913907284768214, "grad_norm": 0.14797475934028625, "learning_rate": 1.9492014017300642e-05, "loss": 0.0228, "step": 14470 }, { "epoch": 47.94701986754967, "grad_norm": 0.21411831676959991, "learning_rate": 1.942655532711461e-05, "loss": 0.0261, "step": 14480 }, { "epoch": 47.980132450331126, "grad_norm": 0.28465551137924194, "learning_rate": 1.9361180223558882e-05, "loss": 0.0228, "step": 14490 }, { "epoch": 48.01324503311258, "grad_norm": 0.23264336585998535, "learning_rate": 1.929588888536647e-05, "loss": 0.0229, "step": 14500 }, { "epoch": 48.04635761589404, "grad_norm": 0.14964714646339417, "learning_rate": 1.9230681491041425e-05, "loss": 0.022, "step": 14510 }, { "epoch": 48.079470198675494, "grad_norm": 0.24809028208255768, "learning_rate": 1.9165558218858264e-05, "loss": 0.0271, "step": 14520 }, { "epoch": 48.11258278145695, "grad_norm": 0.3449985384941101, "learning_rate": 1.9100519246861505e-05, "loss": 0.0223, "step": 14530 }, { "epoch": 48.145695364238414, "grad_norm": 0.7909275889396667, "learning_rate": 1.9035564752865248e-05, "loss": 0.023, "step": 14540 }, { "epoch": 48.17880794701987, "grad_norm": 0.3180926740169525, "learning_rate": 1.897069491445258e-05, "loss": 0.0237, "step": 14550 }, { "epoch": 48.211920529801326, "grad_norm": 0.21645359694957733, "learning_rate": 1.890590990897515e-05, "loss": 0.0231, "step": 14560 }, { "epoch": 48.24503311258278, "grad_norm": 0.18906641006469727, "learning_rate": 1.884120991355272e-05, "loss": 0.0272, "step": 14570 }, { "epoch": 48.27814569536424, "grad_norm": 0.23836173117160797, "learning_rate": 1.8776595105072576e-05, "loss": 0.0248, "step": 14580 }, { "epoch": 48.311258278145694, "grad_norm": 0.31565800309181213, "learning_rate": 1.8712065660189166e-05, "loss": 0.0223, "step": 14590 }, { "epoch": 48.34437086092715, "grad_norm": 0.2006607949733734, "learning_rate": 1.8647621755323513e-05, "loss": 0.0235, "step": 14600 }, { "epoch": 48.37748344370861, "grad_norm": 0.18316569924354553, "learning_rate": 1.858326356666278e-05, "loss": 0.0237, "step": 14610 }, { "epoch": 48.41059602649007, "grad_norm": 0.3317444622516632, "learning_rate": 1.851899127015983e-05, "loss": 0.0237, "step": 14620 }, { "epoch": 48.443708609271525, "grad_norm": 0.3682951331138611, "learning_rate": 1.8454805041532626e-05, "loss": 0.0239, "step": 14630 }, { "epoch": 48.47682119205298, "grad_norm": 0.20940051972866058, "learning_rate": 1.8390705056263906e-05, "loss": 0.0213, "step": 14640 }, { "epoch": 48.50993377483444, "grad_norm": 0.35365840792655945, "learning_rate": 1.832669148960057e-05, "loss": 0.0259, "step": 14650 }, { "epoch": 48.54304635761589, "grad_norm": 0.4733079671859741, "learning_rate": 1.8262764516553233e-05, "loss": 0.0217, "step": 14660 }, { "epoch": 48.57615894039735, "grad_norm": 0.24613884091377258, "learning_rate": 1.8198924311895843e-05, "loss": 0.0256, "step": 14670 }, { "epoch": 48.609271523178805, "grad_norm": 0.2479301393032074, "learning_rate": 1.813517105016505e-05, "loss": 0.0253, "step": 14680 }, { "epoch": 48.64238410596027, "grad_norm": 0.2340872883796692, "learning_rate": 1.8071504905659888e-05, "loss": 0.0236, "step": 14690 }, { "epoch": 48.675496688741724, "grad_norm": 0.2724156677722931, "learning_rate": 1.800792605244109e-05, "loss": 0.0226, "step": 14700 }, { "epoch": 48.70860927152318, "grad_norm": 0.19568641483783722, "learning_rate": 1.7944434664330844e-05, "loss": 0.0207, "step": 14710 }, { "epoch": 48.741721854304636, "grad_norm": 0.30155614018440247, "learning_rate": 1.7881030914912212e-05, "loss": 0.0257, "step": 14720 }, { "epoch": 48.77483443708609, "grad_norm": 0.44055917859077454, "learning_rate": 1.7817714977528577e-05, "loss": 0.0212, "step": 14730 }, { "epoch": 48.80794701986755, "grad_norm": 0.35490790009498596, "learning_rate": 1.7754487025283332e-05, "loss": 0.0241, "step": 14740 }, { "epoch": 48.841059602649004, "grad_norm": 0.2849467396736145, "learning_rate": 1.7691347231039275e-05, "loss": 0.0249, "step": 14750 }, { "epoch": 48.87417218543047, "grad_norm": 0.4596991539001465, "learning_rate": 1.7628295767418164e-05, "loss": 0.0226, "step": 14760 }, { "epoch": 48.90728476821192, "grad_norm": 0.24115458130836487, "learning_rate": 1.7565332806800333e-05, "loss": 0.0219, "step": 14770 }, { "epoch": 48.94039735099338, "grad_norm": 0.21470122039318085, "learning_rate": 1.750245852132408e-05, "loss": 0.0245, "step": 14780 }, { "epoch": 48.973509933774835, "grad_norm": 0.2219647765159607, "learning_rate": 1.7439673082885323e-05, "loss": 0.0231, "step": 14790 }, { "epoch": 49.00662251655629, "grad_norm": 0.24425294995307922, "learning_rate": 1.7376976663137047e-05, "loss": 0.0246, "step": 14800 }, { "epoch": 49.03973509933775, "grad_norm": 0.45767685770988464, "learning_rate": 1.7314369433488853e-05, "loss": 0.0249, "step": 14810 }, { "epoch": 49.0728476821192, "grad_norm": 0.1724698692560196, "learning_rate": 1.7251851565106548e-05, "loss": 0.0228, "step": 14820 }, { "epoch": 49.10596026490066, "grad_norm": 0.28500011563301086, "learning_rate": 1.7189423228911574e-05, "loss": 0.0239, "step": 14830 }, { "epoch": 49.13907284768212, "grad_norm": 0.12930689752101898, "learning_rate": 1.7127084595580606e-05, "loss": 0.0191, "step": 14840 }, { "epoch": 49.17218543046358, "grad_norm": 0.26708266139030457, "learning_rate": 1.706483583554513e-05, "loss": 0.0259, "step": 14850 }, { "epoch": 49.205298013245034, "grad_norm": 0.496579110622406, "learning_rate": 1.700267711899083e-05, "loss": 0.0231, "step": 14860 }, { "epoch": 49.23841059602649, "grad_norm": 0.4714255928993225, "learning_rate": 1.69406086158573e-05, "loss": 0.0209, "step": 14870 }, { "epoch": 49.271523178807946, "grad_norm": 0.18420591950416565, "learning_rate": 1.6878630495837455e-05, "loss": 0.0248, "step": 14880 }, { "epoch": 49.3046357615894, "grad_norm": 0.18520568311214447, "learning_rate": 1.681674292837707e-05, "loss": 0.0226, "step": 14890 }, { "epoch": 49.33774834437086, "grad_norm": 0.16793382167816162, "learning_rate": 1.6754946082674444e-05, "loss": 0.0237, "step": 14900 }, { "epoch": 49.370860927152314, "grad_norm": 0.45027685165405273, "learning_rate": 1.6693240127679748e-05, "loss": 0.0236, "step": 14910 }, { "epoch": 49.40397350993378, "grad_norm": 0.1665189564228058, "learning_rate": 1.663162523209475e-05, "loss": 0.0262, "step": 14920 }, { "epoch": 49.437086092715234, "grad_norm": 0.3784805238246918, "learning_rate": 1.6570101564372193e-05, "loss": 0.0228, "step": 14930 }, { "epoch": 49.47019867549669, "grad_norm": 0.23161712288856506, "learning_rate": 1.650866929271543e-05, "loss": 0.0245, "step": 14940 }, { "epoch": 49.503311258278146, "grad_norm": 0.1671711951494217, "learning_rate": 1.644732858507797e-05, "loss": 0.0267, "step": 14950 }, { "epoch": 49.5364238410596, "grad_norm": 0.21905918419361115, "learning_rate": 1.6386079609162943e-05, "loss": 0.0231, "step": 14960 }, { "epoch": 49.56953642384106, "grad_norm": 0.2753278315067291, "learning_rate": 1.6324922532422742e-05, "loss": 0.0223, "step": 14970 }, { "epoch": 49.602649006622514, "grad_norm": 0.20879550278186798, "learning_rate": 1.6263857522058434e-05, "loss": 0.0204, "step": 14980 }, { "epoch": 49.63576158940398, "grad_norm": 0.13701865077018738, "learning_rate": 1.6202884745019443e-05, "loss": 0.0244, "step": 14990 }, { "epoch": 49.66887417218543, "grad_norm": 0.13872459530830383, "learning_rate": 1.614200436800304e-05, "loss": 0.0255, "step": 15000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 67, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }