{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 28.050490883590463, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014025245441795231, "grad_norm": 22.17071533203125, "learning_rate": 9e-07, "loss": 2.3107, "step": 10 }, { "epoch": 0.028050490883590462, "grad_norm": 25.1187686920166, "learning_rate": 1.9e-06, "loss": 2.2373, "step": 20 }, { "epoch": 0.04207573632538569, "grad_norm": 19.94733428955078, "learning_rate": 2.9e-06, "loss": 1.8289, "step": 30 }, { "epoch": 0.056100981767180924, "grad_norm": 21.66728401184082, "learning_rate": 3.9e-06, "loss": 1.7518, "step": 40 }, { "epoch": 0.07012622720897616, "grad_norm": 18.539276123046875, "learning_rate": 4.9000000000000005e-06, "loss": 1.2757, "step": 50 }, { "epoch": 0.08415147265077139, "grad_norm": 14.955641746520996, "learning_rate": 5.9e-06, "loss": 1.0142, "step": 60 }, { "epoch": 0.09817671809256662, "grad_norm": 15.744616508483887, "learning_rate": 6.900000000000001e-06, "loss": 0.9439, "step": 70 }, { "epoch": 0.11220196353436185, "grad_norm": 15.010833740234375, "learning_rate": 7.9e-06, "loss": 0.7815, "step": 80 }, { "epoch": 0.12622720897615708, "grad_norm": 6.252669334411621, "learning_rate": 8.9e-06, "loss": 0.633, "step": 90 }, { "epoch": 0.1402524544179523, "grad_norm": 2.7483913898468018, "learning_rate": 9.900000000000002e-06, "loss": 0.5101, "step": 100 }, { "epoch": 0.15427769985974754, "grad_norm": 2.7423720359802246, "learning_rate": 1.09e-05, "loss": 0.4377, "step": 110 }, { "epoch": 0.16830294530154277, "grad_norm": 1.6129422187805176, "learning_rate": 1.19e-05, "loss": 0.389, "step": 120 }, { "epoch": 0.182328190743338, "grad_norm": 1.4542351961135864, "learning_rate": 1.29e-05, "loss": 0.3244, "step": 130 }, { "epoch": 0.19635343618513323, "grad_norm": 2.468302011489868, "learning_rate": 1.3900000000000002e-05, "loss": 0.2929, "step": 140 }, { "epoch": 0.21037868162692847, "grad_norm": 1.512667179107666, "learning_rate": 1.49e-05, "loss": 0.2632, "step": 150 }, { "epoch": 0.2244039270687237, "grad_norm": 1.0917623043060303, "learning_rate": 1.59e-05, "loss": 0.2165, "step": 160 }, { "epoch": 0.23842917251051893, "grad_norm": 0.8670994639396667, "learning_rate": 1.69e-05, "loss": 0.203, "step": 170 }, { "epoch": 0.25245441795231416, "grad_norm": 0.8572374582290649, "learning_rate": 1.79e-05, "loss": 0.1821, "step": 180 }, { "epoch": 0.2664796633941094, "grad_norm": 0.7816066145896912, "learning_rate": 1.8900000000000002e-05, "loss": 0.1655, "step": 190 }, { "epoch": 0.2805049088359046, "grad_norm": 0.7851330041885376, "learning_rate": 1.9900000000000003e-05, "loss": 0.1479, "step": 200 }, { "epoch": 0.29453015427769985, "grad_norm": 0.633863091468811, "learning_rate": 2.09e-05, "loss": 0.1429, "step": 210 }, { "epoch": 0.3085553997194951, "grad_norm": 0.6532235741615295, "learning_rate": 2.19e-05, "loss": 0.1196, "step": 220 }, { "epoch": 0.3225806451612903, "grad_norm": 0.58732008934021, "learning_rate": 2.29e-05, "loss": 0.1113, "step": 230 }, { "epoch": 0.33660589060308554, "grad_norm": 0.5805008411407471, "learning_rate": 2.39e-05, "loss": 0.1027, "step": 240 }, { "epoch": 0.3506311360448808, "grad_norm": 0.6197958588600159, "learning_rate": 2.4900000000000002e-05, "loss": 0.0959, "step": 250 }, { "epoch": 0.364656381486676, "grad_norm": 0.49735227227211, "learning_rate": 2.5900000000000003e-05, "loss": 0.0866, "step": 260 }, { "epoch": 0.37868162692847124, "grad_norm": 0.6848756670951843, "learning_rate": 2.6900000000000003e-05, "loss": 0.0848, "step": 270 }, { "epoch": 0.39270687237026647, "grad_norm": 0.7571144700050354, "learning_rate": 2.7900000000000004e-05, "loss": 0.0886, "step": 280 }, { "epoch": 0.4067321178120617, "grad_norm": 0.5750849843025208, "learning_rate": 2.8899999999999998e-05, "loss": 0.0803, "step": 290 }, { "epoch": 0.42075736325385693, "grad_norm": 0.7033276557922363, "learning_rate": 2.9900000000000002e-05, "loss": 0.0758, "step": 300 }, { "epoch": 0.43478260869565216, "grad_norm": 0.6850725412368774, "learning_rate": 3.09e-05, "loss": 0.0859, "step": 310 }, { "epoch": 0.4488078541374474, "grad_norm": 0.7713612914085388, "learning_rate": 3.19e-05, "loss": 0.0716, "step": 320 }, { "epoch": 0.4628330995792426, "grad_norm": 0.6409364938735962, "learning_rate": 3.29e-05, "loss": 0.0684, "step": 330 }, { "epoch": 0.47685834502103785, "grad_norm": 0.7150951623916626, "learning_rate": 3.3900000000000004e-05, "loss": 0.0671, "step": 340 }, { "epoch": 0.4908835904628331, "grad_norm": 0.5204799175262451, "learning_rate": 3.49e-05, "loss": 0.0629, "step": 350 }, { "epoch": 0.5049088359046283, "grad_norm": 0.5199767351150513, "learning_rate": 3.59e-05, "loss": 0.063, "step": 360 }, { "epoch": 0.5189340813464236, "grad_norm": 0.7538745999336243, "learning_rate": 3.69e-05, "loss": 0.0579, "step": 370 }, { "epoch": 0.5329593267882188, "grad_norm": 0.7714184522628784, "learning_rate": 3.79e-05, "loss": 0.0578, "step": 380 }, { "epoch": 0.5469845722300141, "grad_norm": 0.6017686128616333, "learning_rate": 3.8900000000000004e-05, "loss": 0.0561, "step": 390 }, { "epoch": 0.5610098176718092, "grad_norm": 0.5977626442909241, "learning_rate": 3.99e-05, "loss": 0.0569, "step": 400 }, { "epoch": 0.5750350631136045, "grad_norm": 0.6179803609848022, "learning_rate": 4.09e-05, "loss": 0.0574, "step": 410 }, { "epoch": 0.5890603085553997, "grad_norm": 0.5554933547973633, "learning_rate": 4.19e-05, "loss": 0.0501, "step": 420 }, { "epoch": 0.603085553997195, "grad_norm": 0.5608754754066467, "learning_rate": 4.29e-05, "loss": 0.0491, "step": 430 }, { "epoch": 0.6171107994389902, "grad_norm": 0.8249959945678711, "learning_rate": 4.39e-05, "loss": 0.0528, "step": 440 }, { "epoch": 0.6311360448807855, "grad_norm": 0.4761039614677429, "learning_rate": 4.49e-05, "loss": 0.0582, "step": 450 }, { "epoch": 0.6451612903225806, "grad_norm": 0.6753378510475159, "learning_rate": 4.5900000000000004e-05, "loss": 0.0536, "step": 460 }, { "epoch": 0.6591865357643759, "grad_norm": 0.8747177124023438, "learning_rate": 4.69e-05, "loss": 0.0542, "step": 470 }, { "epoch": 0.6732117812061711, "grad_norm": 0.7381353378295898, "learning_rate": 4.79e-05, "loss": 0.0521, "step": 480 }, { "epoch": 0.6872370266479664, "grad_norm": 0.843453049659729, "learning_rate": 4.89e-05, "loss": 0.0501, "step": 490 }, { "epoch": 0.7012622720897616, "grad_norm": 0.7646147012710571, "learning_rate": 4.99e-05, "loss": 0.0509, "step": 500 }, { "epoch": 0.7152875175315568, "grad_norm": 0.5345742702484131, "learning_rate": 5.0900000000000004e-05, "loss": 0.0442, "step": 510 }, { "epoch": 0.729312762973352, "grad_norm": 0.6116300225257874, "learning_rate": 5.19e-05, "loss": 0.0493, "step": 520 }, { "epoch": 0.7433380084151473, "grad_norm": 0.7407103180885315, "learning_rate": 5.2900000000000005e-05, "loss": 0.0515, "step": 530 }, { "epoch": 0.7573632538569425, "grad_norm": 0.5777375102043152, "learning_rate": 5.390000000000001e-05, "loss": 0.0493, "step": 540 }, { "epoch": 0.7713884992987378, "grad_norm": 0.634828507900238, "learning_rate": 5.4900000000000006e-05, "loss": 0.0438, "step": 550 }, { "epoch": 0.7854137447405329, "grad_norm": 0.6257818937301636, "learning_rate": 5.590000000000001e-05, "loss": 0.049, "step": 560 }, { "epoch": 0.7994389901823282, "grad_norm": 0.6375992894172668, "learning_rate": 5.69e-05, "loss": 0.0487, "step": 570 }, { "epoch": 0.8134642356241234, "grad_norm": 0.4552578926086426, "learning_rate": 5.79e-05, "loss": 0.0435, "step": 580 }, { "epoch": 0.8274894810659187, "grad_norm": 0.4636882543563843, "learning_rate": 5.89e-05, "loss": 0.0412, "step": 590 }, { "epoch": 0.8415147265077139, "grad_norm": 0.4828931987285614, "learning_rate": 5.99e-05, "loss": 0.0406, "step": 600 }, { "epoch": 0.8555399719495091, "grad_norm": 0.5254333019256592, "learning_rate": 6.09e-05, "loss": 0.0382, "step": 610 }, { "epoch": 0.8695652173913043, "grad_norm": 0.6792616248130798, "learning_rate": 6.19e-05, "loss": 0.04, "step": 620 }, { "epoch": 0.8835904628330996, "grad_norm": 0.5945351719856262, "learning_rate": 6.29e-05, "loss": 0.0403, "step": 630 }, { "epoch": 0.8976157082748948, "grad_norm": 0.8767504692077637, "learning_rate": 6.390000000000001e-05, "loss": 0.0387, "step": 640 }, { "epoch": 0.9116409537166901, "grad_norm": 0.6875373125076294, "learning_rate": 6.49e-05, "loss": 0.0372, "step": 650 }, { "epoch": 0.9256661991584852, "grad_norm": 0.6573301553726196, "learning_rate": 6.59e-05, "loss": 0.0378, "step": 660 }, { "epoch": 0.9396914446002805, "grad_norm": 0.6323565244674683, "learning_rate": 6.690000000000001e-05, "loss": 0.0385, "step": 670 }, { "epoch": 0.9537166900420757, "grad_norm": 0.7380919456481934, "learning_rate": 6.790000000000001e-05, "loss": 0.0352, "step": 680 }, { "epoch": 0.967741935483871, "grad_norm": 0.9897159337997437, "learning_rate": 6.89e-05, "loss": 0.0374, "step": 690 }, { "epoch": 0.9817671809256662, "grad_norm": 0.8483231663703918, "learning_rate": 6.99e-05, "loss": 0.0375, "step": 700 }, { "epoch": 0.9957924263674615, "grad_norm": 0.8681412935256958, "learning_rate": 7.09e-05, "loss": 0.0437, "step": 710 }, { "epoch": 1.0098176718092566, "grad_norm": 0.6575011610984802, "learning_rate": 7.19e-05, "loss": 0.0393, "step": 720 }, { "epoch": 1.023842917251052, "grad_norm": 0.6262069344520569, "learning_rate": 7.29e-05, "loss": 0.0338, "step": 730 }, { "epoch": 1.0378681626928472, "grad_norm": 0.6191099882125854, "learning_rate": 7.390000000000001e-05, "loss": 0.0399, "step": 740 }, { "epoch": 1.0518934081346423, "grad_norm": 0.5923285484313965, "learning_rate": 7.49e-05, "loss": 0.0373, "step": 750 }, { "epoch": 1.0659186535764376, "grad_norm": 0.6219848394393921, "learning_rate": 7.59e-05, "loss": 0.0436, "step": 760 }, { "epoch": 1.0799438990182328, "grad_norm": 0.5840882658958435, "learning_rate": 7.69e-05, "loss": 0.0393, "step": 770 }, { "epoch": 1.0939691444600281, "grad_norm": 0.6952173113822937, "learning_rate": 7.790000000000001e-05, "loss": 0.0375, "step": 780 }, { "epoch": 1.1079943899018232, "grad_norm": 0.7534716129302979, "learning_rate": 7.890000000000001e-05, "loss": 0.0382, "step": 790 }, { "epoch": 1.1220196353436185, "grad_norm": 0.5402746200561523, "learning_rate": 7.99e-05, "loss": 0.0343, "step": 800 }, { "epoch": 1.1360448807854138, "grad_norm": 1.0442235469818115, "learning_rate": 8.090000000000001e-05, "loss": 0.0343, "step": 810 }, { "epoch": 1.150070126227209, "grad_norm": 0.663788914680481, "learning_rate": 8.19e-05, "loss": 0.0364, "step": 820 }, { "epoch": 1.1640953716690041, "grad_norm": 0.491360604763031, "learning_rate": 8.29e-05, "loss": 0.0329, "step": 830 }, { "epoch": 1.1781206171107994, "grad_norm": 0.4959655702114105, "learning_rate": 8.39e-05, "loss": 0.0365, "step": 840 }, { "epoch": 1.1921458625525947, "grad_norm": 0.8006618618965149, "learning_rate": 8.49e-05, "loss": 0.0353, "step": 850 }, { "epoch": 1.20617110799439, "grad_norm": 0.45286810398101807, "learning_rate": 8.59e-05, "loss": 0.0373, "step": 860 }, { "epoch": 1.220196353436185, "grad_norm": 0.5148314833641052, "learning_rate": 8.69e-05, "loss": 0.0336, "step": 870 }, { "epoch": 1.2342215988779803, "grad_norm": 0.4781610667705536, "learning_rate": 8.790000000000001e-05, "loss": 0.0343, "step": 880 }, { "epoch": 1.2482468443197756, "grad_norm": 0.5660382509231567, "learning_rate": 8.89e-05, "loss": 0.0326, "step": 890 }, { "epoch": 1.262272089761571, "grad_norm": 0.4245665967464447, "learning_rate": 8.99e-05, "loss": 0.0305, "step": 900 }, { "epoch": 1.276297335203366, "grad_norm": 0.7362809181213379, "learning_rate": 9.090000000000001e-05, "loss": 0.0299, "step": 910 }, { "epoch": 1.2903225806451613, "grad_norm": 0.5507371425628662, "learning_rate": 9.190000000000001e-05, "loss": 0.0338, "step": 920 }, { "epoch": 1.3043478260869565, "grad_norm": 0.6224228143692017, "learning_rate": 9.290000000000001e-05, "loss": 0.0325, "step": 930 }, { "epoch": 1.3183730715287518, "grad_norm": 0.4563072621822357, "learning_rate": 9.39e-05, "loss": 0.0301, "step": 940 }, { "epoch": 1.3323983169705471, "grad_norm": 0.7094169855117798, "learning_rate": 9.49e-05, "loss": 0.0345, "step": 950 }, { "epoch": 1.3464235624123422, "grad_norm": 0.7290515899658203, "learning_rate": 9.59e-05, "loss": 0.0349, "step": 960 }, { "epoch": 1.3604488078541375, "grad_norm": 0.5639945864677429, "learning_rate": 9.69e-05, "loss": 0.0334, "step": 970 }, { "epoch": 1.3744740532959328, "grad_norm": 0.5375191569328308, "learning_rate": 9.790000000000001e-05, "loss": 0.0349, "step": 980 }, { "epoch": 1.3884992987377278, "grad_norm": 0.6211612820625305, "learning_rate": 9.89e-05, "loss": 0.0327, "step": 990 }, { "epoch": 1.402524544179523, "grad_norm": 0.5266867876052856, "learning_rate": 9.99e-05, "loss": 0.0341, "step": 1000 }, { "epoch": 1.4165497896213184, "grad_norm": 0.540449321269989, "learning_rate": 9.999994463727085e-05, "loss": 0.0366, "step": 1010 }, { "epoch": 1.4305750350631137, "grad_norm": 0.7496849894523621, "learning_rate": 9.999975326009292e-05, "loss": 0.0351, "step": 1020 }, { "epoch": 1.444600280504909, "grad_norm": 0.9782182574272156, "learning_rate": 9.999942518549879e-05, "loss": 0.0364, "step": 1030 }, { "epoch": 1.458625525946704, "grad_norm": 0.54149329662323, "learning_rate": 9.999896041438544e-05, "loss": 0.0355, "step": 1040 }, { "epoch": 1.4726507713884993, "grad_norm": 0.7279684543609619, "learning_rate": 9.999835894802353e-05, "loss": 0.0331, "step": 1050 }, { "epoch": 1.4866760168302946, "grad_norm": 0.5277300477027893, "learning_rate": 9.999762078805743e-05, "loss": 0.0334, "step": 1060 }, { "epoch": 1.5007012622720897, "grad_norm": 0.567764401435852, "learning_rate": 9.999674593650526e-05, "loss": 0.0291, "step": 1070 }, { "epoch": 1.514726507713885, "grad_norm": 0.6632901430130005, "learning_rate": 9.99957343957588e-05, "loss": 0.0273, "step": 1080 }, { "epoch": 1.5287517531556802, "grad_norm": 0.41641584038734436, "learning_rate": 9.99945861685836e-05, "loss": 0.0305, "step": 1090 }, { "epoch": 1.5427769985974753, "grad_norm": 0.6843762993812561, "learning_rate": 9.999330125811884e-05, "loss": 0.0329, "step": 1100 }, { "epoch": 1.5568022440392708, "grad_norm": 0.5566938519477844, "learning_rate": 9.999187966787744e-05, "loss": 0.0266, "step": 1110 }, { "epoch": 1.5708274894810659, "grad_norm": 0.4862300753593445, "learning_rate": 9.999032140174595e-05, "loss": 0.03, "step": 1120 }, { "epoch": 1.5848527349228612, "grad_norm": 0.47158119082450867, "learning_rate": 9.998862646398464e-05, "loss": 0.0304, "step": 1130 }, { "epoch": 1.5988779803646564, "grad_norm": 0.7062301635742188, "learning_rate": 9.998679485922739e-05, "loss": 0.0297, "step": 1140 }, { "epoch": 1.6129032258064515, "grad_norm": 0.6044575572013855, "learning_rate": 9.998482659248174e-05, "loss": 0.0298, "step": 1150 }, { "epoch": 1.6269284712482468, "grad_norm": 0.4926566481590271, "learning_rate": 9.998272166912883e-05, "loss": 0.0308, "step": 1160 }, { "epoch": 1.640953716690042, "grad_norm": 0.5544949173927307, "learning_rate": 9.998048009492347e-05, "loss": 0.0316, "step": 1170 }, { "epoch": 1.6549789621318372, "grad_norm": 0.7231107354164124, "learning_rate": 9.997810187599403e-05, "loss": 0.0305, "step": 1180 }, { "epoch": 1.6690042075736327, "grad_norm": 0.7011065483093262, "learning_rate": 9.997558701884249e-05, "loss": 0.0338, "step": 1190 }, { "epoch": 1.6830294530154277, "grad_norm": 0.5220640301704407, "learning_rate": 9.997293553034433e-05, "loss": 0.0309, "step": 1200 }, { "epoch": 1.697054698457223, "grad_norm": 0.5792560577392578, "learning_rate": 9.997014741774866e-05, "loss": 0.0321, "step": 1210 }, { "epoch": 1.7110799438990183, "grad_norm": 0.6253339648246765, "learning_rate": 9.996722268867803e-05, "loss": 0.0269, "step": 1220 }, { "epoch": 1.7251051893408134, "grad_norm": 0.533940851688385, "learning_rate": 9.996416135112858e-05, "loss": 0.0308, "step": 1230 }, { "epoch": 1.7391304347826086, "grad_norm": 0.6302036643028259, "learning_rate": 9.996096341346988e-05, "loss": 0.0299, "step": 1240 }, { "epoch": 1.753155680224404, "grad_norm": 0.6721609830856323, "learning_rate": 9.995762888444495e-05, "loss": 0.0296, "step": 1250 }, { "epoch": 1.767180925666199, "grad_norm": 0.5527097582817078, "learning_rate": 9.995415777317027e-05, "loss": 0.0262, "step": 1260 }, { "epoch": 1.7812061711079945, "grad_norm": 0.5017527937889099, "learning_rate": 9.995055008913574e-05, "loss": 0.0313, "step": 1270 }, { "epoch": 1.7952314165497896, "grad_norm": 0.6044161319732666, "learning_rate": 9.994680584220463e-05, "loss": 0.0283, "step": 1280 }, { "epoch": 1.8092566619915849, "grad_norm": 0.571279764175415, "learning_rate": 9.994292504261355e-05, "loss": 0.0296, "step": 1290 }, { "epoch": 1.8232819074333801, "grad_norm": 0.3652215003967285, "learning_rate": 9.993890770097247e-05, "loss": 0.0302, "step": 1300 }, { "epoch": 1.8373071528751752, "grad_norm": 0.5293875932693481, "learning_rate": 9.993475382826467e-05, "loss": 0.0271, "step": 1310 }, { "epoch": 1.8513323983169705, "grad_norm": 0.506298840045929, "learning_rate": 9.993046343584664e-05, "loss": 0.0334, "step": 1320 }, { "epoch": 1.8653576437587658, "grad_norm": 0.36554843187332153, "learning_rate": 9.992603653544816e-05, "loss": 0.0302, "step": 1330 }, { "epoch": 1.8793828892005608, "grad_norm": 0.3352556824684143, "learning_rate": 9.992147313917222e-05, "loss": 0.0242, "step": 1340 }, { "epoch": 1.8934081346423564, "grad_norm": 0.47855645418167114, "learning_rate": 9.991677325949497e-05, "loss": 0.0288, "step": 1350 }, { "epoch": 1.9074333800841514, "grad_norm": 0.5517224669456482, "learning_rate": 9.991193690926568e-05, "loss": 0.0335, "step": 1360 }, { "epoch": 1.9214586255259467, "grad_norm": 0.5608410239219666, "learning_rate": 9.990696410170678e-05, "loss": 0.0246, "step": 1370 }, { "epoch": 1.935483870967742, "grad_norm": 0.39530378580093384, "learning_rate": 9.990185485041371e-05, "loss": 0.0244, "step": 1380 }, { "epoch": 1.949509116409537, "grad_norm": 0.49722835421562195, "learning_rate": 9.989660916935498e-05, "loss": 0.0243, "step": 1390 }, { "epoch": 1.9635343618513323, "grad_norm": 0.5666421055793762, "learning_rate": 9.989122707287208e-05, "loss": 0.0223, "step": 1400 }, { "epoch": 1.9775596072931276, "grad_norm": 0.5772007703781128, "learning_rate": 9.988570857567945e-05, "loss": 0.0236, "step": 1410 }, { "epoch": 1.9915848527349227, "grad_norm": 0.48771339654922485, "learning_rate": 9.988005369286446e-05, "loss": 0.0249, "step": 1420 }, { "epoch": 2.005610098176718, "grad_norm": 0.756152331829071, "learning_rate": 9.987426243988734e-05, "loss": 0.0287, "step": 1430 }, { "epoch": 2.0196353436185133, "grad_norm": 0.5374137163162231, "learning_rate": 9.986833483258114e-05, "loss": 0.0224, "step": 1440 }, { "epoch": 2.0336605890603083, "grad_norm": 0.4821907579898834, "learning_rate": 9.986227088715173e-05, "loss": 0.0231, "step": 1450 }, { "epoch": 2.047685834502104, "grad_norm": 0.5262070298194885, "learning_rate": 9.98560706201777e-05, "loss": 0.0264, "step": 1460 }, { "epoch": 2.061711079943899, "grad_norm": 0.47368595004081726, "learning_rate": 9.984973404861036e-05, "loss": 0.0251, "step": 1470 }, { "epoch": 2.0757363253856944, "grad_norm": 0.3475220203399658, "learning_rate": 9.984326118977361e-05, "loss": 0.0235, "step": 1480 }, { "epoch": 2.0897615708274895, "grad_norm": 0.429341197013855, "learning_rate": 9.983665206136406e-05, "loss": 0.0271, "step": 1490 }, { "epoch": 2.1037868162692845, "grad_norm": 0.5548306107521057, "learning_rate": 9.982990668145075e-05, "loss": 0.0235, "step": 1500 }, { "epoch": 2.11781206171108, "grad_norm": 0.5300411581993103, "learning_rate": 9.982302506847534e-05, "loss": 0.0228, "step": 1510 }, { "epoch": 2.131837307152875, "grad_norm": 0.5319258570671082, "learning_rate": 9.981600724125189e-05, "loss": 0.0247, "step": 1520 }, { "epoch": 2.1458625525946706, "grad_norm": 0.5253931879997253, "learning_rate": 9.980885321896685e-05, "loss": 0.0268, "step": 1530 }, { "epoch": 2.1598877980364657, "grad_norm": 0.4208759367465973, "learning_rate": 9.980156302117905e-05, "loss": 0.0255, "step": 1540 }, { "epoch": 2.1739130434782608, "grad_norm": 0.4674943685531616, "learning_rate": 9.979413666781963e-05, "loss": 0.0262, "step": 1550 }, { "epoch": 2.1879382889200563, "grad_norm": 0.3543790876865387, "learning_rate": 9.978657417919193e-05, "loss": 0.0271, "step": 1560 }, { "epoch": 2.2019635343618513, "grad_norm": 0.43492788076400757, "learning_rate": 9.977887557597153e-05, "loss": 0.0262, "step": 1570 }, { "epoch": 2.2159887798036464, "grad_norm": 0.42585092782974243, "learning_rate": 9.97710408792061e-05, "loss": 0.0248, "step": 1580 }, { "epoch": 2.230014025245442, "grad_norm": 0.478304922580719, "learning_rate": 9.976307011031542e-05, "loss": 0.0215, "step": 1590 }, { "epoch": 2.244039270687237, "grad_norm": 0.3462429642677307, "learning_rate": 9.975496329109126e-05, "loss": 0.029, "step": 1600 }, { "epoch": 2.258064516129032, "grad_norm": 0.42855706810951233, "learning_rate": 9.974672044369732e-05, "loss": 0.0214, "step": 1610 }, { "epoch": 2.2720897615708275, "grad_norm": 0.4495585858821869, "learning_rate": 9.97383415906693e-05, "loss": 0.0224, "step": 1620 }, { "epoch": 2.2861150070126226, "grad_norm": 0.5009020566940308, "learning_rate": 9.97298267549146e-05, "loss": 0.0247, "step": 1630 }, { "epoch": 2.300140252454418, "grad_norm": 0.4345225989818573, "learning_rate": 9.972117595971249e-05, "loss": 0.0239, "step": 1640 }, { "epoch": 2.314165497896213, "grad_norm": 0.5536576509475708, "learning_rate": 9.971238922871391e-05, "loss": 0.0242, "step": 1650 }, { "epoch": 2.3281907433380082, "grad_norm": 0.5379785895347595, "learning_rate": 9.970346658594142e-05, "loss": 0.0229, "step": 1660 }, { "epoch": 2.3422159887798037, "grad_norm": 0.4857536554336548, "learning_rate": 9.969440805578923e-05, "loss": 0.025, "step": 1670 }, { "epoch": 2.356241234221599, "grad_norm": 0.30995380878448486, "learning_rate": 9.968521366302298e-05, "loss": 0.0228, "step": 1680 }, { "epoch": 2.3702664796633943, "grad_norm": 0.6400951147079468, "learning_rate": 9.967588343277981e-05, "loss": 0.022, "step": 1690 }, { "epoch": 2.3842917251051894, "grad_norm": 0.4108792245388031, "learning_rate": 9.966641739056818e-05, "loss": 0.024, "step": 1700 }, { "epoch": 2.3983169705469845, "grad_norm": 0.4272639751434326, "learning_rate": 9.965681556226793e-05, "loss": 0.0263, "step": 1710 }, { "epoch": 2.41234221598878, "grad_norm": 0.6711747050285339, "learning_rate": 9.964707797413006e-05, "loss": 0.0251, "step": 1720 }, { "epoch": 2.426367461430575, "grad_norm": 0.5560961961746216, "learning_rate": 9.963720465277679e-05, "loss": 0.0246, "step": 1730 }, { "epoch": 2.44039270687237, "grad_norm": 0.45505061745643616, "learning_rate": 9.96271956252014e-05, "loss": 0.0214, "step": 1740 }, { "epoch": 2.4544179523141656, "grad_norm": 0.4494926631450653, "learning_rate": 9.961705091876816e-05, "loss": 0.0233, "step": 1750 }, { "epoch": 2.4684431977559607, "grad_norm": 0.3989627957344055, "learning_rate": 9.960677056121235e-05, "loss": 0.0214, "step": 1760 }, { "epoch": 2.4824684431977557, "grad_norm": 0.289046972990036, "learning_rate": 9.959635458064005e-05, "loss": 0.026, "step": 1770 }, { "epoch": 2.4964936886395512, "grad_norm": 0.4921347200870514, "learning_rate": 9.958580300552815e-05, "loss": 0.0252, "step": 1780 }, { "epoch": 2.5105189340813463, "grad_norm": 0.2858704626560211, "learning_rate": 9.957511586472426e-05, "loss": 0.0243, "step": 1790 }, { "epoch": 2.524544179523142, "grad_norm": 0.5276514291763306, "learning_rate": 9.956429318744662e-05, "loss": 0.0255, "step": 1800 }, { "epoch": 2.538569424964937, "grad_norm": 0.5681992173194885, "learning_rate": 9.955333500328404e-05, "loss": 0.025, "step": 1810 }, { "epoch": 2.552594670406732, "grad_norm": 0.49297747015953064, "learning_rate": 9.95422413421957e-05, "loss": 0.0215, "step": 1820 }, { "epoch": 2.5666199158485274, "grad_norm": 0.4279099404811859, "learning_rate": 9.953101223451133e-05, "loss": 0.0232, "step": 1830 }, { "epoch": 2.5806451612903225, "grad_norm": 0.7610570788383484, "learning_rate": 9.951964771093085e-05, "loss": 0.0219, "step": 1840 }, { "epoch": 2.594670406732118, "grad_norm": 0.3718789517879486, "learning_rate": 9.950814780252442e-05, "loss": 0.0209, "step": 1850 }, { "epoch": 2.608695652173913, "grad_norm": 0.5335069298744202, "learning_rate": 9.949651254073236e-05, "loss": 0.0218, "step": 1860 }, { "epoch": 2.622720897615708, "grad_norm": 0.6871110200881958, "learning_rate": 9.948474195736504e-05, "loss": 0.0238, "step": 1870 }, { "epoch": 2.6367461430575037, "grad_norm": 0.481277197599411, "learning_rate": 9.947283608460277e-05, "loss": 0.0214, "step": 1880 }, { "epoch": 2.6507713884992987, "grad_norm": 0.5350204706192017, "learning_rate": 9.946079495499577e-05, "loss": 0.0229, "step": 1890 }, { "epoch": 2.6647966339410942, "grad_norm": 0.6527811884880066, "learning_rate": 9.944861860146401e-05, "loss": 0.0231, "step": 1900 }, { "epoch": 2.6788218793828893, "grad_norm": 0.3901378810405731, "learning_rate": 9.943630705729719e-05, "loss": 0.023, "step": 1910 }, { "epoch": 2.6928471248246844, "grad_norm": 0.5257353186607361, "learning_rate": 9.942386035615459e-05, "loss": 0.022, "step": 1920 }, { "epoch": 2.7068723702664794, "grad_norm": 0.599141001701355, "learning_rate": 9.941127853206503e-05, "loss": 0.0206, "step": 1930 }, { "epoch": 2.720897615708275, "grad_norm": 0.5852662920951843, "learning_rate": 9.939856161942673e-05, "loss": 0.0277, "step": 1940 }, { "epoch": 2.73492286115007, "grad_norm": 0.4969964027404785, "learning_rate": 9.938570965300724e-05, "loss": 0.0202, "step": 1950 }, { "epoch": 2.7489481065918655, "grad_norm": 0.3696187734603882, "learning_rate": 9.937272266794335e-05, "loss": 0.0219, "step": 1960 }, { "epoch": 2.7629733520336606, "grad_norm": 0.38774561882019043, "learning_rate": 9.935960069974096e-05, "loss": 0.0235, "step": 1970 }, { "epoch": 2.7769985974754556, "grad_norm": 0.4430711567401886, "learning_rate": 9.934634378427506e-05, "loss": 0.0261, "step": 1980 }, { "epoch": 2.791023842917251, "grad_norm": 0.45368659496307373, "learning_rate": 9.933295195778954e-05, "loss": 0.0222, "step": 1990 }, { "epoch": 2.805049088359046, "grad_norm": 0.473530650138855, "learning_rate": 9.931942525689715e-05, "loss": 0.0219, "step": 2000 }, { "epoch": 2.8190743338008417, "grad_norm": 0.8025186657905579, "learning_rate": 9.930576371857936e-05, "loss": 0.0235, "step": 2010 }, { "epoch": 2.833099579242637, "grad_norm": 0.5754848718643188, "learning_rate": 9.929196738018629e-05, "loss": 0.0223, "step": 2020 }, { "epoch": 2.847124824684432, "grad_norm": 0.35400086641311646, "learning_rate": 9.927803627943662e-05, "loss": 0.0203, "step": 2030 }, { "epoch": 2.8611500701262274, "grad_norm": 0.5690286755561829, "learning_rate": 9.926397045441744e-05, "loss": 0.0241, "step": 2040 }, { "epoch": 2.8751753155680224, "grad_norm": 0.39585182070732117, "learning_rate": 9.924976994358417e-05, "loss": 0.025, "step": 2050 }, { "epoch": 2.889200561009818, "grad_norm": 0.4901697337627411, "learning_rate": 9.923543478576048e-05, "loss": 0.0224, "step": 2060 }, { "epoch": 2.903225806451613, "grad_norm": 0.3694133758544922, "learning_rate": 9.922096502013813e-05, "loss": 0.025, "step": 2070 }, { "epoch": 2.917251051893408, "grad_norm": 0.9573909044265747, "learning_rate": 9.92063606862769e-05, "loss": 0.027, "step": 2080 }, { "epoch": 2.931276297335203, "grad_norm": 0.9195519685745239, "learning_rate": 9.919162182410453e-05, "loss": 0.0228, "step": 2090 }, { "epoch": 2.9453015427769986, "grad_norm": 0.4038443863391876, "learning_rate": 9.917674847391645e-05, "loss": 0.0261, "step": 2100 }, { "epoch": 2.9593267882187937, "grad_norm": 0.5364935994148254, "learning_rate": 9.916174067637584e-05, "loss": 0.0228, "step": 2110 }, { "epoch": 2.973352033660589, "grad_norm": 0.3995906412601471, "learning_rate": 9.914659847251348e-05, "loss": 0.0236, "step": 2120 }, { "epoch": 2.9873772791023843, "grad_norm": 0.3220432698726654, "learning_rate": 9.913132190372753e-05, "loss": 0.0229, "step": 2130 }, { "epoch": 3.0014025245441793, "grad_norm": 0.414442241191864, "learning_rate": 9.911591101178359e-05, "loss": 0.0224, "step": 2140 }, { "epoch": 3.015427769985975, "grad_norm": 0.6580541729927063, "learning_rate": 9.910036583881443e-05, "loss": 0.0213, "step": 2150 }, { "epoch": 3.02945301542777, "grad_norm": 0.500247061252594, "learning_rate": 9.908468642731995e-05, "loss": 0.0207, "step": 2160 }, { "epoch": 3.0434782608695654, "grad_norm": 0.5789839625358582, "learning_rate": 9.906887282016707e-05, "loss": 0.0212, "step": 2170 }, { "epoch": 3.0575035063113605, "grad_norm": 0.5131335854530334, "learning_rate": 9.90529250605896e-05, "loss": 0.0257, "step": 2180 }, { "epoch": 3.0715287517531555, "grad_norm": 0.4165535867214203, "learning_rate": 9.903684319218809e-05, "loss": 0.0225, "step": 2190 }, { "epoch": 3.085553997194951, "grad_norm": 0.49071380496025085, "learning_rate": 9.902062725892976e-05, "loss": 0.0195, "step": 2200 }, { "epoch": 3.099579242636746, "grad_norm": 0.3882966935634613, "learning_rate": 9.900427730514834e-05, "loss": 0.0196, "step": 2210 }, { "epoch": 3.113604488078541, "grad_norm": 0.3937629461288452, "learning_rate": 9.8987793375544e-05, "loss": 0.0187, "step": 2220 }, { "epoch": 3.1276297335203367, "grad_norm": 0.36516687273979187, "learning_rate": 9.897117551518318e-05, "loss": 0.0223, "step": 2230 }, { "epoch": 3.1416549789621318, "grad_norm": 0.4005696475505829, "learning_rate": 9.895442376949844e-05, "loss": 0.0243, "step": 2240 }, { "epoch": 3.1556802244039273, "grad_norm": 0.4739917814731598, "learning_rate": 9.893753818428845e-05, "loss": 0.0249, "step": 2250 }, { "epoch": 3.1697054698457223, "grad_norm": 0.39732709527015686, "learning_rate": 9.892051880571773e-05, "loss": 0.0222, "step": 2260 }, { "epoch": 3.1837307152875174, "grad_norm": 0.357051819562912, "learning_rate": 9.890336568031663e-05, "loss": 0.0234, "step": 2270 }, { "epoch": 3.197755960729313, "grad_norm": 0.40445685386657715, "learning_rate": 9.888607885498113e-05, "loss": 0.0227, "step": 2280 }, { "epoch": 3.211781206171108, "grad_norm": 0.44902658462524414, "learning_rate": 9.886865837697275e-05, "loss": 0.0204, "step": 2290 }, { "epoch": 3.225806451612903, "grad_norm": 0.30404961109161377, "learning_rate": 9.88511042939184e-05, "loss": 0.0218, "step": 2300 }, { "epoch": 3.2398316970546985, "grad_norm": 0.3803367018699646, "learning_rate": 9.883341665381028e-05, "loss": 0.0213, "step": 2310 }, { "epoch": 3.2538569424964936, "grad_norm": 0.3949420750141144, "learning_rate": 9.881559550500575e-05, "loss": 0.0196, "step": 2320 }, { "epoch": 3.267882187938289, "grad_norm": 0.35858362913131714, "learning_rate": 9.879764089622712e-05, "loss": 0.0232, "step": 2330 }, { "epoch": 3.281907433380084, "grad_norm": 0.4899305999279022, "learning_rate": 9.87795528765616e-05, "loss": 0.0239, "step": 2340 }, { "epoch": 3.2959326788218792, "grad_norm": 0.3478863835334778, "learning_rate": 9.876133149546118e-05, "loss": 0.0213, "step": 2350 }, { "epoch": 3.3099579242636747, "grad_norm": 0.2943876385688782, "learning_rate": 9.874297680274238e-05, "loss": 0.0211, "step": 2360 }, { "epoch": 3.32398316970547, "grad_norm": 0.446048766374588, "learning_rate": 9.872448884858624e-05, "loss": 0.0243, "step": 2370 }, { "epoch": 3.3380084151472653, "grad_norm": 0.44767916202545166, "learning_rate": 9.870586768353815e-05, "loss": 0.0217, "step": 2380 }, { "epoch": 3.3520336605890604, "grad_norm": 0.6240367889404297, "learning_rate": 9.868711335850764e-05, "loss": 0.0222, "step": 2390 }, { "epoch": 3.3660589060308554, "grad_norm": 0.540012538433075, "learning_rate": 9.866822592476833e-05, "loss": 0.0221, "step": 2400 }, { "epoch": 3.380084151472651, "grad_norm": 0.3778727948665619, "learning_rate": 9.86492054339577e-05, "loss": 0.0196, "step": 2410 }, { "epoch": 3.394109396914446, "grad_norm": 0.514531672000885, "learning_rate": 9.863005193807711e-05, "loss": 0.0217, "step": 2420 }, { "epoch": 3.408134642356241, "grad_norm": 0.4259992241859436, "learning_rate": 9.861076548949143e-05, "loss": 0.0211, "step": 2430 }, { "epoch": 3.4221598877980366, "grad_norm": 0.31876450777053833, "learning_rate": 9.859134614092912e-05, "loss": 0.0196, "step": 2440 }, { "epoch": 3.4361851332398317, "grad_norm": 0.3842601776123047, "learning_rate": 9.857179394548191e-05, "loss": 0.0204, "step": 2450 }, { "epoch": 3.4502103786816267, "grad_norm": 0.41069552302360535, "learning_rate": 9.855210895660477e-05, "loss": 0.0212, "step": 2460 }, { "epoch": 3.4642356241234222, "grad_norm": 0.4412436783313751, "learning_rate": 9.853229122811568e-05, "loss": 0.0207, "step": 2470 }, { "epoch": 3.4782608695652173, "grad_norm": 0.4764294922351837, "learning_rate": 9.851234081419559e-05, "loss": 0.0234, "step": 2480 }, { "epoch": 3.492286115007013, "grad_norm": 0.36828526854515076, "learning_rate": 9.849225776938814e-05, "loss": 0.0177, "step": 2490 }, { "epoch": 3.506311360448808, "grad_norm": 0.3346124589443207, "learning_rate": 9.847204214859964e-05, "loss": 0.022, "step": 2500 }, { "epoch": 3.520336605890603, "grad_norm": 0.31021836400032043, "learning_rate": 9.845169400709879e-05, "loss": 0.0202, "step": 2510 }, { "epoch": 3.5343618513323984, "grad_norm": 0.5396780967712402, "learning_rate": 9.843121340051664e-05, "loss": 0.0218, "step": 2520 }, { "epoch": 3.5483870967741935, "grad_norm": 0.34304407238960266, "learning_rate": 9.841060038484641e-05, "loss": 0.0187, "step": 2530 }, { "epoch": 3.562412342215989, "grad_norm": 0.5475286841392517, "learning_rate": 9.838985501644328e-05, "loss": 0.0215, "step": 2540 }, { "epoch": 3.576437587657784, "grad_norm": 0.49284565448760986, "learning_rate": 9.83689773520243e-05, "loss": 0.0233, "step": 2550 }, { "epoch": 3.590462833099579, "grad_norm": 0.3187533915042877, "learning_rate": 9.834796744866819e-05, "loss": 0.0213, "step": 2560 }, { "epoch": 3.604488078541374, "grad_norm": 0.37108996510505676, "learning_rate": 9.832682536381525e-05, "loss": 0.0197, "step": 2570 }, { "epoch": 3.6185133239831697, "grad_norm": 0.3552638590335846, "learning_rate": 9.830555115526711e-05, "loss": 0.0226, "step": 2580 }, { "epoch": 3.632538569424965, "grad_norm": 0.4856904149055481, "learning_rate": 9.828414488118667e-05, "loss": 0.0203, "step": 2590 }, { "epoch": 3.6465638148667603, "grad_norm": 0.38604024052619934, "learning_rate": 9.826260660009785e-05, "loss": 0.0202, "step": 2600 }, { "epoch": 3.6605890603085554, "grad_norm": 0.4104976952075958, "learning_rate": 9.824093637088547e-05, "loss": 0.0186, "step": 2610 }, { "epoch": 3.6746143057503504, "grad_norm": 0.43835514783859253, "learning_rate": 9.821913425279514e-05, "loss": 0.0197, "step": 2620 }, { "epoch": 3.688639551192146, "grad_norm": 0.4827972650527954, "learning_rate": 9.8197200305433e-05, "loss": 0.0217, "step": 2630 }, { "epoch": 3.702664796633941, "grad_norm": 0.5964663028717041, "learning_rate": 9.817513458876564e-05, "loss": 0.0226, "step": 2640 }, { "epoch": 3.7166900420757365, "grad_norm": 0.5359312891960144, "learning_rate": 9.815293716311987e-05, "loss": 0.0228, "step": 2650 }, { "epoch": 3.7307152875175316, "grad_norm": 0.496905654668808, "learning_rate": 9.813060808918262e-05, "loss": 0.0192, "step": 2660 }, { "epoch": 3.7447405329593266, "grad_norm": 0.5463829040527344, "learning_rate": 9.810814742800069e-05, "loss": 0.0199, "step": 2670 }, { "epoch": 3.758765778401122, "grad_norm": 0.6641817092895508, "learning_rate": 9.808555524098074e-05, "loss": 0.0193, "step": 2680 }, { "epoch": 3.772791023842917, "grad_norm": 0.42474281787872314, "learning_rate": 9.806283158988887e-05, "loss": 0.0205, "step": 2690 }, { "epoch": 3.7868162692847127, "grad_norm": 0.5956943035125732, "learning_rate": 9.803997653685072e-05, "loss": 0.0187, "step": 2700 }, { "epoch": 3.8008415147265078, "grad_norm": 0.4028650224208832, "learning_rate": 9.801699014435112e-05, "loss": 0.0238, "step": 2710 }, { "epoch": 3.814866760168303, "grad_norm": 0.5186575055122375, "learning_rate": 9.799387247523398e-05, "loss": 0.0191, "step": 2720 }, { "epoch": 3.828892005610098, "grad_norm": 0.4395618438720703, "learning_rate": 9.797062359270215e-05, "loss": 0.0202, "step": 2730 }, { "epoch": 3.8429172510518934, "grad_norm": 0.40920400619506836, "learning_rate": 9.794724356031715e-05, "loss": 0.0216, "step": 2740 }, { "epoch": 3.8569424964936885, "grad_norm": 0.4983864426612854, "learning_rate": 9.792373244199913e-05, "loss": 0.0224, "step": 2750 }, { "epoch": 3.870967741935484, "grad_norm": 0.48729512095451355, "learning_rate": 9.790009030202658e-05, "loss": 0.0216, "step": 2760 }, { "epoch": 3.884992987377279, "grad_norm": 0.3918007016181946, "learning_rate": 9.78763172050362e-05, "loss": 0.0188, "step": 2770 }, { "epoch": 3.899018232819074, "grad_norm": 0.4873197376728058, "learning_rate": 9.785241321602274e-05, "loss": 0.0224, "step": 2780 }, { "epoch": 3.9130434782608696, "grad_norm": 0.38729774951934814, "learning_rate": 9.782837840033879e-05, "loss": 0.0221, "step": 2790 }, { "epoch": 3.9270687237026647, "grad_norm": 0.4236994683742523, "learning_rate": 9.780421282369461e-05, "loss": 0.0213, "step": 2800 }, { "epoch": 3.94109396914446, "grad_norm": 0.422280877828598, "learning_rate": 9.777991655215797e-05, "loss": 0.0186, "step": 2810 }, { "epoch": 3.9551192145862553, "grad_norm": 0.30596524477005005, "learning_rate": 9.775548965215394e-05, "loss": 0.0188, "step": 2820 }, { "epoch": 3.9691444600280503, "grad_norm": 0.4848993122577667, "learning_rate": 9.773093219046474e-05, "loss": 0.0207, "step": 2830 }, { "epoch": 3.983169705469846, "grad_norm": 0.4128357470035553, "learning_rate": 9.770624423422954e-05, "loss": 0.0226, "step": 2840 }, { "epoch": 3.997194950911641, "grad_norm": 0.33639654517173767, "learning_rate": 9.768142585094426e-05, "loss": 0.0186, "step": 2850 }, { "epoch": 4.011220196353436, "grad_norm": 0.699684739112854, "learning_rate": 9.765647710846142e-05, "loss": 0.0178, "step": 2860 }, { "epoch": 4.0252454417952315, "grad_norm": 0.4630524814128876, "learning_rate": 9.763139807498991e-05, "loss": 0.021, "step": 2870 }, { "epoch": 4.0392706872370265, "grad_norm": 0.3919309973716736, "learning_rate": 9.760618881909487e-05, "loss": 0.0195, "step": 2880 }, { "epoch": 4.053295932678822, "grad_norm": 0.32120126485824585, "learning_rate": 9.758084940969744e-05, "loss": 0.0172, "step": 2890 }, { "epoch": 4.067321178120617, "grad_norm": 0.34362781047821045, "learning_rate": 9.755537991607459e-05, "loss": 0.0185, "step": 2900 }, { "epoch": 4.081346423562413, "grad_norm": 0.49441805481910706, "learning_rate": 9.752978040785895e-05, "loss": 0.0179, "step": 2910 }, { "epoch": 4.095371669004208, "grad_norm": 0.3296351432800293, "learning_rate": 9.750405095503859e-05, "loss": 0.0162, "step": 2920 }, { "epoch": 4.109396914446003, "grad_norm": 0.4605567753314972, "learning_rate": 9.747819162795686e-05, "loss": 0.0227, "step": 2930 }, { "epoch": 4.123422159887798, "grad_norm": 0.386587917804718, "learning_rate": 9.745220249731217e-05, "loss": 0.0217, "step": 2940 }, { "epoch": 4.137447405329593, "grad_norm": 0.3634292483329773, "learning_rate": 9.742608363415781e-05, "loss": 0.0211, "step": 2950 }, { "epoch": 4.151472650771389, "grad_norm": 0.3919965624809265, "learning_rate": 9.739983510990176e-05, "loss": 0.0212, "step": 2960 }, { "epoch": 4.165497896213184, "grad_norm": 0.49229857325553894, "learning_rate": 9.737345699630647e-05, "loss": 0.0188, "step": 2970 }, { "epoch": 4.179523141654979, "grad_norm": 0.4552806317806244, "learning_rate": 9.734694936548869e-05, "loss": 0.0206, "step": 2980 }, { "epoch": 4.193548387096774, "grad_norm": 0.5032992959022522, "learning_rate": 9.732031228991932e-05, "loss": 0.0185, "step": 2990 }, { "epoch": 4.207573632538569, "grad_norm": 0.28982898592948914, "learning_rate": 9.729354584242302e-05, "loss": 0.0206, "step": 3000 }, { "epoch": 4.221598877980365, "grad_norm": 0.5825907588005066, "learning_rate": 9.726665009617832e-05, "loss": 0.0233, "step": 3010 }, { "epoch": 4.23562412342216, "grad_norm": 0.33989542722702026, "learning_rate": 9.723962512471714e-05, "loss": 0.0225, "step": 3020 }, { "epoch": 4.249649368863955, "grad_norm": 0.3098219633102417, "learning_rate": 9.72124710019247e-05, "loss": 0.0191, "step": 3030 }, { "epoch": 4.26367461430575, "grad_norm": 0.3972424268722534, "learning_rate": 9.718518780203934e-05, "loss": 0.0176, "step": 3040 }, { "epoch": 4.277699859747545, "grad_norm": 0.49592509865760803, "learning_rate": 9.715777559965228e-05, "loss": 0.018, "step": 3050 }, { "epoch": 4.291725105189341, "grad_norm": 0.3602411150932312, "learning_rate": 9.713023446970746e-05, "loss": 0.0182, "step": 3060 }, { "epoch": 4.305750350631136, "grad_norm": 0.4067145586013794, "learning_rate": 9.710256448750126e-05, "loss": 0.0187, "step": 3070 }, { "epoch": 4.319775596072931, "grad_norm": 0.36915647983551025, "learning_rate": 9.707476572868235e-05, "loss": 0.021, "step": 3080 }, { "epoch": 4.333800841514726, "grad_norm": 0.3213123679161072, "learning_rate": 9.704683826925149e-05, "loss": 0.0184, "step": 3090 }, { "epoch": 4.3478260869565215, "grad_norm": 0.40447762608528137, "learning_rate": 9.701878218556129e-05, "loss": 0.0172, "step": 3100 }, { "epoch": 4.361851332398317, "grad_norm": 0.6009376049041748, "learning_rate": 9.699059755431598e-05, "loss": 0.0269, "step": 3110 }, { "epoch": 4.3758765778401125, "grad_norm": 0.3681100308895111, "learning_rate": 9.696228445257132e-05, "loss": 0.0174, "step": 3120 }, { "epoch": 4.389901823281908, "grad_norm": 0.4623655676841736, "learning_rate": 9.693384295773419e-05, "loss": 0.0183, "step": 3130 }, { "epoch": 4.403927068723703, "grad_norm": 0.5438552498817444, "learning_rate": 9.690527314756259e-05, "loss": 0.0214, "step": 3140 }, { "epoch": 4.417952314165498, "grad_norm": 0.3868160843849182, "learning_rate": 9.687657510016527e-05, "loss": 0.0197, "step": 3150 }, { "epoch": 4.431977559607293, "grad_norm": 0.4334091246128082, "learning_rate": 9.684774889400161e-05, "loss": 0.0196, "step": 3160 }, { "epoch": 4.446002805049089, "grad_norm": 0.3781043291091919, "learning_rate": 9.681879460788135e-05, "loss": 0.0198, "step": 3170 }, { "epoch": 4.460028050490884, "grad_norm": 0.30728963017463684, "learning_rate": 9.67897123209644e-05, "loss": 0.019, "step": 3180 }, { "epoch": 4.474053295932679, "grad_norm": 0.43715304136276245, "learning_rate": 9.676050211276062e-05, "loss": 0.0191, "step": 3190 }, { "epoch": 4.488078541374474, "grad_norm": 0.4904627203941345, "learning_rate": 9.673116406312962e-05, "loss": 0.0168, "step": 3200 }, { "epoch": 4.502103786816269, "grad_norm": 0.4652002155780792, "learning_rate": 9.67016982522805e-05, "loss": 0.0206, "step": 3210 }, { "epoch": 4.516129032258064, "grad_norm": 0.4601595103740692, "learning_rate": 9.667210476077164e-05, "loss": 0.016, "step": 3220 }, { "epoch": 4.53015427769986, "grad_norm": 0.41385766863822937, "learning_rate": 9.664238366951055e-05, "loss": 0.0265, "step": 3230 }, { "epoch": 4.544179523141655, "grad_norm": 0.32522597908973694, "learning_rate": 9.661253505975355e-05, "loss": 0.0172, "step": 3240 }, { "epoch": 4.55820476858345, "grad_norm": 0.3976089656352997, "learning_rate": 9.658255901310557e-05, "loss": 0.0201, "step": 3250 }, { "epoch": 4.572230014025245, "grad_norm": 0.36448755860328674, "learning_rate": 9.655245561152e-05, "loss": 0.0203, "step": 3260 }, { "epoch": 4.586255259467041, "grad_norm": 0.4955296218395233, "learning_rate": 9.65222249372984e-05, "loss": 0.0199, "step": 3270 }, { "epoch": 4.600280504908836, "grad_norm": 0.48822030425071716, "learning_rate": 9.649186707309026e-05, "loss": 0.018, "step": 3280 }, { "epoch": 4.614305750350631, "grad_norm": 0.3561914563179016, "learning_rate": 9.646138210189283e-05, "loss": 0.0204, "step": 3290 }, { "epoch": 4.628330995792426, "grad_norm": 0.3411588966846466, "learning_rate": 9.643077010705087e-05, "loss": 0.0183, "step": 3300 }, { "epoch": 4.642356241234221, "grad_norm": 0.5176354050636292, "learning_rate": 9.640003117225637e-05, "loss": 0.0175, "step": 3310 }, { "epoch": 4.6563814866760165, "grad_norm": 0.4721958041191101, "learning_rate": 9.636916538154846e-05, "loss": 0.0271, "step": 3320 }, { "epoch": 4.670406732117812, "grad_norm": 0.4027402400970459, "learning_rate": 9.633817281931296e-05, "loss": 0.0178, "step": 3330 }, { "epoch": 4.6844319775596075, "grad_norm": 0.30383777618408203, "learning_rate": 9.630705357028242e-05, "loss": 0.0177, "step": 3340 }, { "epoch": 4.698457223001403, "grad_norm": 0.28878289461135864, "learning_rate": 9.627580771953563e-05, "loss": 0.0219, "step": 3350 }, { "epoch": 4.712482468443198, "grad_norm": 0.33227187395095825, "learning_rate": 9.624443535249759e-05, "loss": 0.018, "step": 3360 }, { "epoch": 4.726507713884993, "grad_norm": 0.42790162563323975, "learning_rate": 9.621293655493913e-05, "loss": 0.0188, "step": 3370 }, { "epoch": 4.740532959326789, "grad_norm": 0.48498180508613586, "learning_rate": 9.618131141297675e-05, "loss": 0.0183, "step": 3380 }, { "epoch": 4.754558204768584, "grad_norm": 0.44318583607673645, "learning_rate": 9.614956001307242e-05, "loss": 0.0172, "step": 3390 }, { "epoch": 4.768583450210379, "grad_norm": 0.3985426425933838, "learning_rate": 9.611768244203321e-05, "loss": 0.0207, "step": 3400 }, { "epoch": 4.782608695652174, "grad_norm": 0.4970969259738922, "learning_rate": 9.60856787870112e-05, "loss": 0.0183, "step": 3410 }, { "epoch": 4.796633941093969, "grad_norm": 0.45009705424308777, "learning_rate": 9.605354913550318e-05, "loss": 0.0185, "step": 3420 }, { "epoch": 4.810659186535764, "grad_norm": 0.4902184307575226, "learning_rate": 9.602129357535037e-05, "loss": 0.0166, "step": 3430 }, { "epoch": 4.82468443197756, "grad_norm": 0.42166388034820557, "learning_rate": 9.598891219473825e-05, "loss": 0.0193, "step": 3440 }, { "epoch": 4.838709677419355, "grad_norm": 0.46719059348106384, "learning_rate": 9.595640508219625e-05, "loss": 0.0163, "step": 3450 }, { "epoch": 4.85273492286115, "grad_norm": 0.3171842694282532, "learning_rate": 9.592377232659761e-05, "loss": 0.02, "step": 3460 }, { "epoch": 4.866760168302945, "grad_norm": 0.2784029245376587, "learning_rate": 9.589101401715904e-05, "loss": 0.0196, "step": 3470 }, { "epoch": 4.88078541374474, "grad_norm": 0.26875120401382446, "learning_rate": 9.585813024344045e-05, "loss": 0.0184, "step": 3480 }, { "epoch": 4.894810659186536, "grad_norm": 0.4350612759590149, "learning_rate": 9.58251210953449e-05, "loss": 0.018, "step": 3490 }, { "epoch": 4.908835904628331, "grad_norm": 0.4602183401584625, "learning_rate": 9.579198666311809e-05, "loss": 0.0168, "step": 3500 }, { "epoch": 4.922861150070126, "grad_norm": 0.36949145793914795, "learning_rate": 9.575872703734832e-05, "loss": 0.0187, "step": 3510 }, { "epoch": 4.936886395511921, "grad_norm": 0.39185070991516113, "learning_rate": 9.572534230896611e-05, "loss": 0.0197, "step": 3520 }, { "epoch": 4.950911640953716, "grad_norm": 0.4085732400417328, "learning_rate": 9.569183256924403e-05, "loss": 0.0189, "step": 3530 }, { "epoch": 4.9649368863955115, "grad_norm": 0.40207141637802124, "learning_rate": 9.565819790979646e-05, "loss": 0.0187, "step": 3540 }, { "epoch": 4.978962131837307, "grad_norm": 0.3561556041240692, "learning_rate": 9.562443842257925e-05, "loss": 0.0188, "step": 3550 }, { "epoch": 4.9929873772791025, "grad_norm": 0.29750770330429077, "learning_rate": 9.559055419988956e-05, "loss": 0.0175, "step": 3560 }, { "epoch": 5.0070126227208975, "grad_norm": 0.4048471450805664, "learning_rate": 9.555654533436557e-05, "loss": 0.0179, "step": 3570 }, { "epoch": 5.021037868162693, "grad_norm": 0.2840149700641632, "learning_rate": 9.552241191898621e-05, "loss": 0.0199, "step": 3580 }, { "epoch": 5.035063113604488, "grad_norm": 0.34960347414016724, "learning_rate": 9.548815404707092e-05, "loss": 0.0192, "step": 3590 }, { "epoch": 5.049088359046284, "grad_norm": 0.41497114300727844, "learning_rate": 9.545377181227942e-05, "loss": 0.0179, "step": 3600 }, { "epoch": 5.063113604488079, "grad_norm": 0.47879934310913086, "learning_rate": 9.541926530861145e-05, "loss": 0.0212, "step": 3610 }, { "epoch": 5.077138849929874, "grad_norm": 0.3229731619358063, "learning_rate": 9.538463463040645e-05, "loss": 0.0169, "step": 3620 }, { "epoch": 5.091164095371669, "grad_norm": 0.3725450038909912, "learning_rate": 9.534987987234337e-05, "loss": 0.0184, "step": 3630 }, { "epoch": 5.105189340813464, "grad_norm": 0.4881027936935425, "learning_rate": 9.53150011294404e-05, "loss": 0.0183, "step": 3640 }, { "epoch": 5.11921458625526, "grad_norm": 0.4989643692970276, "learning_rate": 9.527999849705471e-05, "loss": 0.0188, "step": 3650 }, { "epoch": 5.133239831697055, "grad_norm": 0.4125017821788788, "learning_rate": 9.524487207088213e-05, "loss": 0.0176, "step": 3660 }, { "epoch": 5.14726507713885, "grad_norm": 0.30687186121940613, "learning_rate": 9.520962194695698e-05, "loss": 0.0156, "step": 3670 }, { "epoch": 5.161290322580645, "grad_norm": 0.43356555700302124, "learning_rate": 9.517424822165175e-05, "loss": 0.0171, "step": 3680 }, { "epoch": 5.17531556802244, "grad_norm": 0.4036951959133148, "learning_rate": 9.513875099167685e-05, "loss": 0.0205, "step": 3690 }, { "epoch": 5.189340813464236, "grad_norm": 0.33838722109794617, "learning_rate": 9.510313035408035e-05, "loss": 0.016, "step": 3700 }, { "epoch": 5.203366058906031, "grad_norm": 0.3640996217727661, "learning_rate": 9.506738640624775e-05, "loss": 0.0166, "step": 3710 }, { "epoch": 5.217391304347826, "grad_norm": 0.3356633484363556, "learning_rate": 9.50315192459016e-05, "loss": 0.0164, "step": 3720 }, { "epoch": 5.231416549789621, "grad_norm": 0.5182667970657349, "learning_rate": 9.499552897110136e-05, "loss": 0.0194, "step": 3730 }, { "epoch": 5.245441795231416, "grad_norm": 0.3254126310348511, "learning_rate": 9.495941568024304e-05, "loss": 0.0204, "step": 3740 }, { "epoch": 5.259467040673211, "grad_norm": 0.33048364520072937, "learning_rate": 9.492317947205904e-05, "loss": 0.0179, "step": 3750 }, { "epoch": 5.273492286115007, "grad_norm": 0.3442145884037018, "learning_rate": 9.488682044561775e-05, "loss": 0.0181, "step": 3760 }, { "epoch": 5.287517531556802, "grad_norm": 0.31233927607536316, "learning_rate": 9.485033870032335e-05, "loss": 0.0175, "step": 3770 }, { "epoch": 5.301542776998597, "grad_norm": 0.27989670634269714, "learning_rate": 9.481373433591556e-05, "loss": 0.0167, "step": 3780 }, { "epoch": 5.3155680224403925, "grad_norm": 0.27309074997901917, "learning_rate": 9.47770074524693e-05, "loss": 0.0179, "step": 3790 }, { "epoch": 5.329593267882188, "grad_norm": 0.3995862305164337, "learning_rate": 9.474015815039446e-05, "loss": 0.0187, "step": 3800 }, { "epoch": 5.3436185133239835, "grad_norm": 0.39817574620246887, "learning_rate": 9.470318653043565e-05, "loss": 0.0194, "step": 3810 }, { "epoch": 5.357643758765779, "grad_norm": 0.43617236614227295, "learning_rate": 9.466609269367185e-05, "loss": 0.0205, "step": 3820 }, { "epoch": 5.371669004207574, "grad_norm": 0.37327948212623596, "learning_rate": 9.46288767415162e-05, "loss": 0.0188, "step": 3830 }, { "epoch": 5.385694249649369, "grad_norm": 0.3820459842681885, "learning_rate": 9.459153877571567e-05, "loss": 0.0167, "step": 3840 }, { "epoch": 5.399719495091164, "grad_norm": 0.31756189465522766, "learning_rate": 9.455407889835087e-05, "loss": 0.0174, "step": 3850 }, { "epoch": 5.41374474053296, "grad_norm": 0.2555500268936157, "learning_rate": 9.451649721183564e-05, "loss": 0.0185, "step": 3860 }, { "epoch": 5.427769985974755, "grad_norm": 0.4270869493484497, "learning_rate": 9.447879381891692e-05, "loss": 0.0173, "step": 3870 }, { "epoch": 5.44179523141655, "grad_norm": 0.3319943845272064, "learning_rate": 9.444096882267428e-05, "loss": 0.0151, "step": 3880 }, { "epoch": 5.455820476858345, "grad_norm": 0.2833229899406433, "learning_rate": 9.440302232651988e-05, "loss": 0.0175, "step": 3890 }, { "epoch": 5.46984572230014, "grad_norm": 0.3331961929798126, "learning_rate": 9.436495443419795e-05, "loss": 0.019, "step": 3900 }, { "epoch": 5.483870967741936, "grad_norm": 0.30023398995399475, "learning_rate": 9.432676524978466e-05, "loss": 0.0194, "step": 3910 }, { "epoch": 5.497896213183731, "grad_norm": 0.358003169298172, "learning_rate": 9.42884548776878e-05, "loss": 0.0193, "step": 3920 }, { "epoch": 5.511921458625526, "grad_norm": 0.3799135088920593, "learning_rate": 9.425002342264646e-05, "loss": 0.019, "step": 3930 }, { "epoch": 5.525946704067321, "grad_norm": 0.5097178220748901, "learning_rate": 9.421147098973077e-05, "loss": 0.0182, "step": 3940 }, { "epoch": 5.539971949509116, "grad_norm": 0.2824934720993042, "learning_rate": 9.41727976843416e-05, "loss": 0.0181, "step": 3950 }, { "epoch": 5.553997194950911, "grad_norm": 0.3880980312824249, "learning_rate": 9.413400361221029e-05, "loss": 0.0171, "step": 3960 }, { "epoch": 5.568022440392707, "grad_norm": 0.3898027241230011, "learning_rate": 9.409508887939835e-05, "loss": 0.0176, "step": 3970 }, { "epoch": 5.582047685834502, "grad_norm": 0.36968794465065, "learning_rate": 9.40560535922972e-05, "loss": 0.0153, "step": 3980 }, { "epoch": 5.596072931276297, "grad_norm": 0.3463082015514374, "learning_rate": 9.40168978576278e-05, "loss": 0.0176, "step": 3990 }, { "epoch": 5.610098176718092, "grad_norm": 0.32816973328590393, "learning_rate": 9.397762178244043e-05, "loss": 0.0151, "step": 4000 }, { "epoch": 5.6241234221598875, "grad_norm": 0.37664294242858887, "learning_rate": 9.393822547411439e-05, "loss": 0.0158, "step": 4010 }, { "epoch": 5.638148667601683, "grad_norm": 0.41917476058006287, "learning_rate": 9.389870904035769e-05, "loss": 0.0178, "step": 4020 }, { "epoch": 5.6521739130434785, "grad_norm": 0.26848939061164856, "learning_rate": 9.385907258920672e-05, "loss": 0.0182, "step": 4030 }, { "epoch": 5.666199158485274, "grad_norm": 0.2848636209964752, "learning_rate": 9.381931622902607e-05, "loss": 0.0151, "step": 4040 }, { "epoch": 5.680224403927069, "grad_norm": 0.5861037969589233, "learning_rate": 9.377944006850807e-05, "loss": 0.0161, "step": 4050 }, { "epoch": 5.694249649368864, "grad_norm": 0.5171000957489014, "learning_rate": 9.373944421667265e-05, "loss": 0.0183, "step": 4060 }, { "epoch": 5.708274894810659, "grad_norm": 0.5463475584983826, "learning_rate": 9.369932878286691e-05, "loss": 0.0169, "step": 4070 }, { "epoch": 5.722300140252455, "grad_norm": 0.3080609440803528, "learning_rate": 9.365909387676494e-05, "loss": 0.0178, "step": 4080 }, { "epoch": 5.73632538569425, "grad_norm": 0.3004729747772217, "learning_rate": 9.361873960836744e-05, "loss": 0.0164, "step": 4090 }, { "epoch": 5.750350631136045, "grad_norm": 0.2695719599723816, "learning_rate": 9.357826608800142e-05, "loss": 0.0167, "step": 4100 }, { "epoch": 5.76437587657784, "grad_norm": 0.2622930407524109, "learning_rate": 9.353767342631994e-05, "loss": 0.0178, "step": 4110 }, { "epoch": 5.778401122019635, "grad_norm": 0.3648136258125305, "learning_rate": 9.34969617343018e-05, "loss": 0.0177, "step": 4120 }, { "epoch": 5.792426367461431, "grad_norm": 0.3646831810474396, "learning_rate": 9.345613112325122e-05, "loss": 0.0199, "step": 4130 }, { "epoch": 5.806451612903226, "grad_norm": 0.29547202587127686, "learning_rate": 9.34151817047975e-05, "loss": 0.0168, "step": 4140 }, { "epoch": 5.820476858345021, "grad_norm": 0.313324898481369, "learning_rate": 9.33741135908948e-05, "loss": 0.0169, "step": 4150 }, { "epoch": 5.834502103786816, "grad_norm": 0.3926902413368225, "learning_rate": 9.33329268938218e-05, "loss": 0.0166, "step": 4160 }, { "epoch": 5.848527349228611, "grad_norm": 0.48040062189102173, "learning_rate": 9.329162172618132e-05, "loss": 0.0171, "step": 4170 }, { "epoch": 5.862552594670406, "grad_norm": 0.4277091920375824, "learning_rate": 9.325019820090013e-05, "loss": 0.0199, "step": 4180 }, { "epoch": 5.876577840112202, "grad_norm": 0.3505248725414276, "learning_rate": 9.320865643122855e-05, "loss": 0.016, "step": 4190 }, { "epoch": 5.890603085553997, "grad_norm": 0.2751741409301758, "learning_rate": 9.316699653074023e-05, "loss": 0.0179, "step": 4200 }, { "epoch": 5.904628330995792, "grad_norm": 0.2688392102718353, "learning_rate": 9.312521861333172e-05, "loss": 0.0155, "step": 4210 }, { "epoch": 5.918653576437587, "grad_norm": 0.26209181547164917, "learning_rate": 9.308332279322224e-05, "loss": 0.0156, "step": 4220 }, { "epoch": 5.932678821879383, "grad_norm": 0.39522090554237366, "learning_rate": 9.304130918495338e-05, "loss": 0.0171, "step": 4230 }, { "epoch": 5.946704067321178, "grad_norm": 0.3307866156101227, "learning_rate": 9.299917790338874e-05, "loss": 0.0147, "step": 4240 }, { "epoch": 5.9607293127629735, "grad_norm": 0.2404823750257492, "learning_rate": 9.295692906371363e-05, "loss": 0.018, "step": 4250 }, { "epoch": 5.9747545582047685, "grad_norm": 0.24990908801555634, "learning_rate": 9.291456278143476e-05, "loss": 0.0175, "step": 4260 }, { "epoch": 5.988779803646564, "grad_norm": 0.3414025604724884, "learning_rate": 9.287207917237994e-05, "loss": 0.0147, "step": 4270 }, { "epoch": 6.002805049088359, "grad_norm": 0.38710200786590576, "learning_rate": 9.282947835269773e-05, "loss": 0.0162, "step": 4280 }, { "epoch": 6.016830294530155, "grad_norm": 0.41710081696510315, "learning_rate": 9.278676043885715e-05, "loss": 0.0155, "step": 4290 }, { "epoch": 6.03085553997195, "grad_norm": 0.3030624985694885, "learning_rate": 9.274392554764733e-05, "loss": 0.0154, "step": 4300 }, { "epoch": 6.044880785413745, "grad_norm": 0.48476335406303406, "learning_rate": 9.270097379617723e-05, "loss": 0.0144, "step": 4310 }, { "epoch": 6.05890603085554, "grad_norm": 0.34476009011268616, "learning_rate": 9.26579053018753e-05, "loss": 0.0175, "step": 4320 }, { "epoch": 6.072931276297335, "grad_norm": 0.37346845865249634, "learning_rate": 9.261472018248918e-05, "loss": 0.0147, "step": 4330 }, { "epoch": 6.086956521739131, "grad_norm": 0.4647091329097748, "learning_rate": 9.25714185560853e-05, "loss": 0.0154, "step": 4340 }, { "epoch": 6.100981767180926, "grad_norm": 0.2972899079322815, "learning_rate": 9.252800054104868e-05, "loss": 0.0173, "step": 4350 }, { "epoch": 6.115007012622721, "grad_norm": 0.3479161858558655, "learning_rate": 9.248446625608252e-05, "loss": 0.0169, "step": 4360 }, { "epoch": 6.129032258064516, "grad_norm": 0.2201370894908905, "learning_rate": 9.244081582020789e-05, "loss": 0.0167, "step": 4370 }, { "epoch": 6.143057503506311, "grad_norm": 0.3771252930164337, "learning_rate": 9.239704935276339e-05, "loss": 0.0174, "step": 4380 }, { "epoch": 6.157082748948106, "grad_norm": 0.43128442764282227, "learning_rate": 9.235316697340489e-05, "loss": 0.0223, "step": 4390 }, { "epoch": 6.171107994389902, "grad_norm": 0.3310973644256592, "learning_rate": 9.230916880210512e-05, "loss": 0.0159, "step": 4400 }, { "epoch": 6.185133239831697, "grad_norm": 0.27224138379096985, "learning_rate": 9.226505495915342e-05, "loss": 0.0168, "step": 4410 }, { "epoch": 6.199158485273492, "grad_norm": 0.2980877459049225, "learning_rate": 9.222082556515536e-05, "loss": 0.0164, "step": 4420 }, { "epoch": 6.213183730715287, "grad_norm": 0.2621767520904541, "learning_rate": 9.217648074103242e-05, "loss": 0.0183, "step": 4430 }, { "epoch": 6.227208976157082, "grad_norm": 0.31056857109069824, "learning_rate": 9.213202060802161e-05, "loss": 0.0144, "step": 4440 }, { "epoch": 6.241234221598878, "grad_norm": 0.3696659207344055, "learning_rate": 9.208744528767528e-05, "loss": 0.0165, "step": 4450 }, { "epoch": 6.255259467040673, "grad_norm": 0.4154193699359894, "learning_rate": 9.204275490186064e-05, "loss": 0.017, "step": 4460 }, { "epoch": 6.269284712482468, "grad_norm": 0.37429219484329224, "learning_rate": 9.199794957275949e-05, "loss": 0.0155, "step": 4470 }, { "epoch": 6.2833099579242635, "grad_norm": 0.30133599042892456, "learning_rate": 9.19530294228679e-05, "loss": 0.0162, "step": 4480 }, { "epoch": 6.297335203366059, "grad_norm": 0.22135549783706665, "learning_rate": 9.190799457499583e-05, "loss": 0.0161, "step": 4490 }, { "epoch": 6.3113604488078545, "grad_norm": 0.43155360221862793, "learning_rate": 9.186284515226686e-05, "loss": 0.0194, "step": 4500 }, { "epoch": 6.32538569424965, "grad_norm": 0.29513120651245117, "learning_rate": 9.181758127811777e-05, "loss": 0.0142, "step": 4510 }, { "epoch": 6.339410939691445, "grad_norm": 0.38059473037719727, "learning_rate": 9.177220307629825e-05, "loss": 0.0159, "step": 4520 }, { "epoch": 6.35343618513324, "grad_norm": 0.4392417371273041, "learning_rate": 9.172671067087059e-05, "loss": 0.0165, "step": 4530 }, { "epoch": 6.367461430575035, "grad_norm": 0.29749542474746704, "learning_rate": 9.16811041862093e-05, "loss": 0.0185, "step": 4540 }, { "epoch": 6.381486676016831, "grad_norm": 0.35591283440589905, "learning_rate": 9.163538374700076e-05, "loss": 0.0158, "step": 4550 }, { "epoch": 6.395511921458626, "grad_norm": 0.36862605810165405, "learning_rate": 9.158954947824287e-05, "loss": 0.0135, "step": 4560 }, { "epoch": 6.409537166900421, "grad_norm": 0.2780384123325348, "learning_rate": 9.154360150524482e-05, "loss": 0.0147, "step": 4570 }, { "epoch": 6.423562412342216, "grad_norm": 0.33799365162849426, "learning_rate": 9.14975399536266e-05, "loss": 0.0172, "step": 4580 }, { "epoch": 6.437587657784011, "grad_norm": 0.2954995334148407, "learning_rate": 9.14513649493187e-05, "loss": 0.0185, "step": 4590 }, { "epoch": 6.451612903225806, "grad_norm": 0.2516833245754242, "learning_rate": 9.140507661856187e-05, "loss": 0.0182, "step": 4600 }, { "epoch": 6.465638148667602, "grad_norm": 0.30376479029655457, "learning_rate": 9.135867508790661e-05, "loss": 0.0153, "step": 4610 }, { "epoch": 6.479663394109397, "grad_norm": 0.20895326137542725, "learning_rate": 9.131216048421291e-05, "loss": 0.0188, "step": 4620 }, { "epoch": 6.493688639551192, "grad_norm": 0.45658454298973083, "learning_rate": 9.126553293464998e-05, "loss": 0.0148, "step": 4630 }, { "epoch": 6.507713884992987, "grad_norm": 0.40105757117271423, "learning_rate": 9.121879256669572e-05, "loss": 0.0175, "step": 4640 }, { "epoch": 6.521739130434782, "grad_norm": 0.240820050239563, "learning_rate": 9.117193950813652e-05, "loss": 0.0157, "step": 4650 }, { "epoch": 6.535764375876578, "grad_norm": 0.3216880261898041, "learning_rate": 9.112497388706685e-05, "loss": 0.0185, "step": 4660 }, { "epoch": 6.549789621318373, "grad_norm": 0.3502776622772217, "learning_rate": 9.10778958318889e-05, "loss": 0.0157, "step": 4670 }, { "epoch": 6.563814866760168, "grad_norm": 0.26351398229599, "learning_rate": 9.103070547131232e-05, "loss": 0.0157, "step": 4680 }, { "epoch": 6.577840112201963, "grad_norm": 0.32146379351615906, "learning_rate": 9.098340293435375e-05, "loss": 0.0144, "step": 4690 }, { "epoch": 6.5918653576437585, "grad_norm": 0.38634830713272095, "learning_rate": 9.093598835033649e-05, "loss": 0.017, "step": 4700 }, { "epoch": 6.6058906030855535, "grad_norm": 0.2745518982410431, "learning_rate": 9.088846184889021e-05, "loss": 0.0172, "step": 4710 }, { "epoch": 6.6199158485273495, "grad_norm": 0.24883407354354858, "learning_rate": 9.084082355995057e-05, "loss": 0.0153, "step": 4720 }, { "epoch": 6.6339410939691446, "grad_norm": 0.28864675760269165, "learning_rate": 9.079307361375882e-05, "loss": 0.0148, "step": 4730 }, { "epoch": 6.64796633941094, "grad_norm": 0.43499740958213806, "learning_rate": 9.074521214086149e-05, "loss": 0.0167, "step": 4740 }, { "epoch": 6.661991584852735, "grad_norm": 0.32007405161857605, "learning_rate": 9.069723927211001e-05, "loss": 0.0166, "step": 4750 }, { "epoch": 6.676016830294531, "grad_norm": 0.49020981788635254, "learning_rate": 9.064915513866037e-05, "loss": 0.0158, "step": 4760 }, { "epoch": 6.690042075736326, "grad_norm": 0.4361474812030792, "learning_rate": 9.060095987197279e-05, "loss": 0.0184, "step": 4770 }, { "epoch": 6.704067321178121, "grad_norm": 0.5326139330863953, "learning_rate": 9.055265360381126e-05, "loss": 0.0175, "step": 4780 }, { "epoch": 6.718092566619916, "grad_norm": 0.35351288318634033, "learning_rate": 9.050423646624326e-05, "loss": 0.0154, "step": 4790 }, { "epoch": 6.732117812061711, "grad_norm": 0.3406434953212738, "learning_rate": 9.045570859163943e-05, "loss": 0.0179, "step": 4800 }, { "epoch": 6.746143057503506, "grad_norm": 0.4789683222770691, "learning_rate": 9.04070701126731e-05, "loss": 0.0156, "step": 4810 }, { "epoch": 6.760168302945302, "grad_norm": 0.21596045792102814, "learning_rate": 9.035832116232001e-05, "loss": 0.016, "step": 4820 }, { "epoch": 6.774193548387097, "grad_norm": 0.38774675130844116, "learning_rate": 9.030946187385796e-05, "loss": 0.0156, "step": 4830 }, { "epoch": 6.788218793828892, "grad_norm": 0.2434934824705124, "learning_rate": 9.026049238086635e-05, "loss": 0.0141, "step": 4840 }, { "epoch": 6.802244039270687, "grad_norm": 0.332832396030426, "learning_rate": 9.021141281722591e-05, "loss": 0.0167, "step": 4850 }, { "epoch": 6.816269284712482, "grad_norm": 0.3770451545715332, "learning_rate": 9.01622233171183e-05, "loss": 0.0171, "step": 4860 }, { "epoch": 6.830294530154278, "grad_norm": 0.41268274188041687, "learning_rate": 9.011292401502574e-05, "loss": 0.0175, "step": 4870 }, { "epoch": 6.844319775596073, "grad_norm": 0.27913346886634827, "learning_rate": 9.006351504573063e-05, "loss": 0.0167, "step": 4880 }, { "epoch": 6.858345021037868, "grad_norm": 0.32114824652671814, "learning_rate": 9.001399654431519e-05, "loss": 0.0184, "step": 4890 }, { "epoch": 6.872370266479663, "grad_norm": 0.32656899094581604, "learning_rate": 8.996436864616116e-05, "loss": 0.0153, "step": 4900 }, { "epoch": 6.886395511921458, "grad_norm": 0.3487681746482849, "learning_rate": 8.991463148694925e-05, "loss": 0.0139, "step": 4910 }, { "epoch": 6.900420757363253, "grad_norm": 0.37331855297088623, "learning_rate": 8.986478520265902e-05, "loss": 0.0176, "step": 4920 }, { "epoch": 6.914446002805049, "grad_norm": 0.35413599014282227, "learning_rate": 8.981482992956827e-05, "loss": 0.0158, "step": 4930 }, { "epoch": 6.9284712482468445, "grad_norm": 0.2919650077819824, "learning_rate": 8.976476580425282e-05, "loss": 0.0134, "step": 4940 }, { "epoch": 6.9424964936886395, "grad_norm": 0.4425271153450012, "learning_rate": 8.971459296358606e-05, "loss": 0.0174, "step": 4950 }, { "epoch": 6.956521739130435, "grad_norm": 0.28576064109802246, "learning_rate": 8.966431154473864e-05, "loss": 0.0171, "step": 4960 }, { "epoch": 6.97054698457223, "grad_norm": 0.4203834533691406, "learning_rate": 8.961392168517803e-05, "loss": 0.0155, "step": 4970 }, { "epoch": 6.984572230014026, "grad_norm": 0.32101160287857056, "learning_rate": 8.956342352266821e-05, "loss": 0.0175, "step": 4980 }, { "epoch": 6.998597475455821, "grad_norm": 0.25754424929618835, "learning_rate": 8.95128171952692e-05, "loss": 0.0161, "step": 4990 }, { "epoch": 7.012622720897616, "grad_norm": 0.4329180121421814, "learning_rate": 8.946210284133676e-05, "loss": 0.0163, "step": 5000 }, { "epoch": 7.026647966339411, "grad_norm": 0.31324008107185364, "learning_rate": 8.941128059952201e-05, "loss": 0.0164, "step": 5010 }, { "epoch": 7.040673211781206, "grad_norm": 0.3928026854991913, "learning_rate": 8.936035060877102e-05, "loss": 0.0177, "step": 5020 }, { "epoch": 7.054698457223002, "grad_norm": 0.2512287199497223, "learning_rate": 8.930931300832443e-05, "loss": 0.0169, "step": 5030 }, { "epoch": 7.068723702664797, "grad_norm": 0.333125501871109, "learning_rate": 8.925816793771711e-05, "loss": 0.0135, "step": 5040 }, { "epoch": 7.082748948106592, "grad_norm": 0.44627436995506287, "learning_rate": 8.92069155367777e-05, "loss": 0.0167, "step": 5050 }, { "epoch": 7.096774193548387, "grad_norm": 0.2636484205722809, "learning_rate": 8.915555594562834e-05, "loss": 0.0183, "step": 5060 }, { "epoch": 7.110799438990182, "grad_norm": 0.4147075414657593, "learning_rate": 8.910408930468416e-05, "loss": 0.0157, "step": 5070 }, { "epoch": 7.124824684431977, "grad_norm": 0.4243148863315582, "learning_rate": 8.905251575465303e-05, "loss": 0.0149, "step": 5080 }, { "epoch": 7.138849929873773, "grad_norm": 0.3053276240825653, "learning_rate": 8.900083543653502e-05, "loss": 0.0141, "step": 5090 }, { "epoch": 7.152875175315568, "grad_norm": 0.3795381486415863, "learning_rate": 8.894904849162218e-05, "loss": 0.0154, "step": 5100 }, { "epoch": 7.166900420757363, "grad_norm": 0.31169256567955017, "learning_rate": 8.889715506149802e-05, "loss": 0.0151, "step": 5110 }, { "epoch": 7.180925666199158, "grad_norm": 0.3113704025745392, "learning_rate": 8.884515528803722e-05, "loss": 0.0152, "step": 5120 }, { "epoch": 7.194950911640953, "grad_norm": 0.42375317215919495, "learning_rate": 8.879304931340517e-05, "loss": 0.0161, "step": 5130 }, { "epoch": 7.208976157082749, "grad_norm": 0.41431623697280884, "learning_rate": 8.874083728005759e-05, "loss": 0.0143, "step": 5140 }, { "epoch": 7.223001402524544, "grad_norm": 0.4907633066177368, "learning_rate": 8.868851933074021e-05, "loss": 0.015, "step": 5150 }, { "epoch": 7.237026647966339, "grad_norm": 0.2594669461250305, "learning_rate": 8.863609560848829e-05, "loss": 0.0161, "step": 5160 }, { "epoch": 7.2510518934081345, "grad_norm": 0.5131790041923523, "learning_rate": 8.85835662566263e-05, "loss": 0.0151, "step": 5170 }, { "epoch": 7.26507713884993, "grad_norm": 0.22146762907505035, "learning_rate": 8.853093141876747e-05, "loss": 0.0154, "step": 5180 }, { "epoch": 7.2791023842917255, "grad_norm": 0.35415077209472656, "learning_rate": 8.847819123881343e-05, "loss": 0.0151, "step": 5190 }, { "epoch": 7.293127629733521, "grad_norm": 0.31376850605010986, "learning_rate": 8.842534586095383e-05, "loss": 0.0155, "step": 5200 }, { "epoch": 7.307152875175316, "grad_norm": 0.2746514678001404, "learning_rate": 8.837239542966593e-05, "loss": 0.0156, "step": 5210 }, { "epoch": 7.321178120617111, "grad_norm": 0.4385754466056824, "learning_rate": 8.831934008971417e-05, "loss": 0.0191, "step": 5220 }, { "epoch": 7.335203366058906, "grad_norm": 0.5437304973602295, "learning_rate": 8.826617998614982e-05, "loss": 0.0188, "step": 5230 }, { "epoch": 7.349228611500701, "grad_norm": 0.3496556580066681, "learning_rate": 8.821291526431056e-05, "loss": 0.0162, "step": 5240 }, { "epoch": 7.363253856942497, "grad_norm": 0.24815425276756287, "learning_rate": 8.815954606982015e-05, "loss": 0.014, "step": 5250 }, { "epoch": 7.377279102384292, "grad_norm": 0.35493478178977966, "learning_rate": 8.810607254858789e-05, "loss": 0.0167, "step": 5260 }, { "epoch": 7.391304347826087, "grad_norm": 0.26886385679244995, "learning_rate": 8.805249484680838e-05, "loss": 0.0148, "step": 5270 }, { "epoch": 7.405329593267882, "grad_norm": 0.37361031770706177, "learning_rate": 8.799881311096096e-05, "loss": 0.0177, "step": 5280 }, { "epoch": 7.419354838709677, "grad_norm": 0.483303427696228, "learning_rate": 8.794502748780949e-05, "loss": 0.0149, "step": 5290 }, { "epoch": 7.433380084151473, "grad_norm": 0.2884332537651062, "learning_rate": 8.78911381244018e-05, "loss": 0.0179, "step": 5300 }, { "epoch": 7.447405329593268, "grad_norm": 0.27255380153656006, "learning_rate": 8.783714516806933e-05, "loss": 0.0206, "step": 5310 }, { "epoch": 7.461430575035063, "grad_norm": 0.3140561282634735, "learning_rate": 8.77830487664268e-05, "loss": 0.0187, "step": 5320 }, { "epoch": 7.475455820476858, "grad_norm": 0.2698211967945099, "learning_rate": 8.772884906737167e-05, "loss": 0.0194, "step": 5330 }, { "epoch": 7.489481065918653, "grad_norm": 0.34747305512428284, "learning_rate": 8.767454621908387e-05, "loss": 0.0155, "step": 5340 }, { "epoch": 7.503506311360448, "grad_norm": 0.42131921648979187, "learning_rate": 8.76201403700253e-05, "loss": 0.0175, "step": 5350 }, { "epoch": 7.517531556802244, "grad_norm": 0.3952893316745758, "learning_rate": 8.756563166893949e-05, "loss": 0.0159, "step": 5360 }, { "epoch": 7.531556802244039, "grad_norm": 0.38385137915611267, "learning_rate": 8.751102026485113e-05, "loss": 0.0135, "step": 5370 }, { "epoch": 7.545582047685834, "grad_norm": 0.3793047368526459, "learning_rate": 8.745630630706571e-05, "loss": 0.0166, "step": 5380 }, { "epoch": 7.5596072931276295, "grad_norm": 0.43306764960289, "learning_rate": 8.740148994516912e-05, "loss": 0.0149, "step": 5390 }, { "epoch": 7.573632538569425, "grad_norm": 0.3203562796115875, "learning_rate": 8.73465713290272e-05, "loss": 0.0134, "step": 5400 }, { "epoch": 7.5876577840112205, "grad_norm": 0.2613515555858612, "learning_rate": 8.729155060878533e-05, "loss": 0.0156, "step": 5410 }, { "epoch": 7.6016830294530155, "grad_norm": 0.28463563323020935, "learning_rate": 8.723642793486809e-05, "loss": 0.0157, "step": 5420 }, { "epoch": 7.615708274894811, "grad_norm": 0.4164149761199951, "learning_rate": 8.718120345797873e-05, "loss": 0.0153, "step": 5430 }, { "epoch": 7.629733520336606, "grad_norm": 0.3259846866130829, "learning_rate": 8.712587732909889e-05, "loss": 0.0168, "step": 5440 }, { "epoch": 7.643758765778401, "grad_norm": 0.3069482445716858, "learning_rate": 8.707044969948806e-05, "loss": 0.0138, "step": 5450 }, { "epoch": 7.657784011220197, "grad_norm": 0.2949478328227997, "learning_rate": 8.701492072068329e-05, "loss": 0.0157, "step": 5460 }, { "epoch": 7.671809256661992, "grad_norm": 0.45650017261505127, "learning_rate": 8.695929054449869e-05, "loss": 0.0158, "step": 5470 }, { "epoch": 7.685834502103787, "grad_norm": 0.4870288074016571, "learning_rate": 8.690355932302501e-05, "loss": 0.0164, "step": 5480 }, { "epoch": 7.699859747545582, "grad_norm": 0.4344072937965393, "learning_rate": 8.684772720862931e-05, "loss": 0.0155, "step": 5490 }, { "epoch": 7.713884992987377, "grad_norm": 0.2802102863788605, "learning_rate": 8.679179435395446e-05, "loss": 0.0145, "step": 5500 }, { "epoch": 7.727910238429173, "grad_norm": 0.3005281090736389, "learning_rate": 8.673576091191874e-05, "loss": 0.0152, "step": 5510 }, { "epoch": 7.741935483870968, "grad_norm": 0.34155651926994324, "learning_rate": 8.667962703571541e-05, "loss": 0.018, "step": 5520 }, { "epoch": 7.755960729312763, "grad_norm": 0.31168773770332336, "learning_rate": 8.662339287881238e-05, "loss": 0.0169, "step": 5530 }, { "epoch": 7.769985974754558, "grad_norm": 0.349179744720459, "learning_rate": 8.656705859495169e-05, "loss": 0.0139, "step": 5540 }, { "epoch": 7.784011220196353, "grad_norm": 0.2972019612789154, "learning_rate": 8.651062433814912e-05, "loss": 0.0183, "step": 5550 }, { "epoch": 7.798036465638148, "grad_norm": 0.2987116873264313, "learning_rate": 8.645409026269375e-05, "loss": 0.0176, "step": 5560 }, { "epoch": 7.812061711079944, "grad_norm": 0.3116677701473236, "learning_rate": 8.639745652314759e-05, "loss": 0.0162, "step": 5570 }, { "epoch": 7.826086956521739, "grad_norm": 0.38799554109573364, "learning_rate": 8.634072327434515e-05, "loss": 0.0137, "step": 5580 }, { "epoch": 7.840112201963534, "grad_norm": 0.2460748851299286, "learning_rate": 8.628389067139294e-05, "loss": 0.0138, "step": 5590 }, { "epoch": 7.854137447405329, "grad_norm": 0.21460546553134918, "learning_rate": 8.622695886966911e-05, "loss": 0.0154, "step": 5600 }, { "epoch": 7.868162692847124, "grad_norm": 0.3087299168109894, "learning_rate": 8.616992802482308e-05, "loss": 0.0134, "step": 5610 }, { "epoch": 7.88218793828892, "grad_norm": 0.46766749024391174, "learning_rate": 8.611279829277496e-05, "loss": 0.0148, "step": 5620 }, { "epoch": 7.8962131837307155, "grad_norm": 0.27129852771759033, "learning_rate": 8.605556982971528e-05, "loss": 0.0135, "step": 5630 }, { "epoch": 7.9102384291725105, "grad_norm": 0.2794802784919739, "learning_rate": 8.599824279210447e-05, "loss": 0.0141, "step": 5640 }, { "epoch": 7.924263674614306, "grad_norm": 0.36893999576568604, "learning_rate": 8.594081733667243e-05, "loss": 0.0138, "step": 5650 }, { "epoch": 7.938288920056101, "grad_norm": 0.3350895643234253, "learning_rate": 8.58832936204182e-05, "loss": 0.0175, "step": 5660 }, { "epoch": 7.952314165497896, "grad_norm": 0.336932897567749, "learning_rate": 8.582567180060942e-05, "loss": 0.0148, "step": 5670 }, { "epoch": 7.966339410939692, "grad_norm": 0.24746635556221008, "learning_rate": 8.576795203478194e-05, "loss": 0.0154, "step": 5680 }, { "epoch": 7.980364656381487, "grad_norm": 0.2953358590602875, "learning_rate": 8.571013448073939e-05, "loss": 0.0122, "step": 5690 }, { "epoch": 7.994389901823282, "grad_norm": 0.3385098874568939, "learning_rate": 8.565221929655275e-05, "loss": 0.0128, "step": 5700 }, { "epoch": 8.008415147265078, "grad_norm": 0.3411865234375, "learning_rate": 8.559420664055992e-05, "loss": 0.0137, "step": 5710 }, { "epoch": 8.022440392706873, "grad_norm": 0.3554166257381439, "learning_rate": 8.553609667136532e-05, "loss": 0.0141, "step": 5720 }, { "epoch": 8.036465638148668, "grad_norm": 0.40919095277786255, "learning_rate": 8.547788954783936e-05, "loss": 0.0121, "step": 5730 }, { "epoch": 8.050490883590463, "grad_norm": 0.2632893919944763, "learning_rate": 8.541958542911808e-05, "loss": 0.0159, "step": 5740 }, { "epoch": 8.064516129032258, "grad_norm": 0.3271380364894867, "learning_rate": 8.536118447460275e-05, "loss": 0.015, "step": 5750 }, { "epoch": 8.078541374474053, "grad_norm": 0.3509809672832489, "learning_rate": 8.530268684395932e-05, "loss": 0.0137, "step": 5760 }, { "epoch": 8.092566619915848, "grad_norm": 0.21382591128349304, "learning_rate": 8.524409269711807e-05, "loss": 0.0139, "step": 5770 }, { "epoch": 8.106591865357643, "grad_norm": 0.29120761156082153, "learning_rate": 8.51854021942732e-05, "loss": 0.0152, "step": 5780 }, { "epoch": 8.120617110799438, "grad_norm": 0.2509285807609558, "learning_rate": 8.512661549588227e-05, "loss": 0.0148, "step": 5790 }, { "epoch": 8.134642356241233, "grad_norm": 0.36911842226982117, "learning_rate": 8.506773276266588e-05, "loss": 0.0161, "step": 5800 }, { "epoch": 8.14866760168303, "grad_norm": 0.37581774592399597, "learning_rate": 8.500875415560721e-05, "loss": 0.0154, "step": 5810 }, { "epoch": 8.162692847124825, "grad_norm": 0.4053893983364105, "learning_rate": 8.494967983595144e-05, "loss": 0.0153, "step": 5820 }, { "epoch": 8.17671809256662, "grad_norm": 0.3030328154563904, "learning_rate": 8.489050996520558e-05, "loss": 0.0143, "step": 5830 }, { "epoch": 8.190743338008415, "grad_norm": 0.28341442346572876, "learning_rate": 8.483124470513775e-05, "loss": 0.0175, "step": 5840 }, { "epoch": 8.20476858345021, "grad_norm": 0.29413866996765137, "learning_rate": 8.477188421777692e-05, "loss": 0.0155, "step": 5850 }, { "epoch": 8.218793828892005, "grad_norm": 0.2574174404144287, "learning_rate": 8.47124286654124e-05, "loss": 0.0161, "step": 5860 }, { "epoch": 8.2328190743338, "grad_norm": 0.3629385828971863, "learning_rate": 8.465287821059341e-05, "loss": 0.0165, "step": 5870 }, { "epoch": 8.246844319775596, "grad_norm": 0.23818564414978027, "learning_rate": 8.45932330161286e-05, "loss": 0.0174, "step": 5880 }, { "epoch": 8.26086956521739, "grad_norm": 0.49800559878349304, "learning_rate": 8.453349324508567e-05, "loss": 0.0167, "step": 5890 }, { "epoch": 8.274894810659186, "grad_norm": 0.30231595039367676, "learning_rate": 8.447365906079088e-05, "loss": 0.0146, "step": 5900 }, { "epoch": 8.288920056100983, "grad_norm": 0.403148353099823, "learning_rate": 8.441373062682856e-05, "loss": 0.0162, "step": 5910 }, { "epoch": 8.302945301542778, "grad_norm": 0.2964518368244171, "learning_rate": 8.43537081070408e-05, "loss": 0.0125, "step": 5920 }, { "epoch": 8.316970546984573, "grad_norm": 0.47306790947914124, "learning_rate": 8.429359166552689e-05, "loss": 0.0142, "step": 5930 }, { "epoch": 8.330995792426368, "grad_norm": 0.40605348348617554, "learning_rate": 8.423338146664284e-05, "loss": 0.0175, "step": 5940 }, { "epoch": 8.345021037868163, "grad_norm": 0.3605150580406189, "learning_rate": 8.417307767500107e-05, "loss": 0.0132, "step": 5950 }, { "epoch": 8.359046283309958, "grad_norm": 0.25818341970443726, "learning_rate": 8.411268045546983e-05, "loss": 0.013, "step": 5960 }, { "epoch": 8.373071528751753, "grad_norm": 0.24147140979766846, "learning_rate": 8.405218997317281e-05, "loss": 0.0148, "step": 5970 }, { "epoch": 8.387096774193548, "grad_norm": 0.3710528612136841, "learning_rate": 8.399160639348869e-05, "loss": 0.0152, "step": 5980 }, { "epoch": 8.401122019635343, "grad_norm": 0.25166386365890503, "learning_rate": 8.393092988205065e-05, "loss": 0.0151, "step": 5990 }, { "epoch": 8.415147265077138, "grad_norm": 0.319888174533844, "learning_rate": 8.387016060474597e-05, "loss": 0.0146, "step": 6000 }, { "epoch": 8.429172510518933, "grad_norm": 0.25770360231399536, "learning_rate": 8.380929872771551e-05, "loss": 0.0139, "step": 6010 }, { "epoch": 8.44319775596073, "grad_norm": 0.3537949025630951, "learning_rate": 8.374834441735335e-05, "loss": 0.0125, "step": 6020 }, { "epoch": 8.457223001402525, "grad_norm": 0.3838384449481964, "learning_rate": 8.368729784030622e-05, "loss": 0.0116, "step": 6030 }, { "epoch": 8.47124824684432, "grad_norm": 0.28949517011642456, "learning_rate": 8.362615916347315e-05, "loss": 0.0129, "step": 6040 }, { "epoch": 8.485273492286115, "grad_norm": 0.2491881400346756, "learning_rate": 8.356492855400493e-05, "loss": 0.0139, "step": 6050 }, { "epoch": 8.49929873772791, "grad_norm": 0.22060507535934448, "learning_rate": 8.350360617930371e-05, "loss": 0.0148, "step": 6060 }, { "epoch": 8.513323983169705, "grad_norm": 0.3045375347137451, "learning_rate": 8.344219220702255e-05, "loss": 0.0134, "step": 6070 }, { "epoch": 8.5273492286115, "grad_norm": 0.21728628873825073, "learning_rate": 8.338068680506485e-05, "loss": 0.0148, "step": 6080 }, { "epoch": 8.541374474053296, "grad_norm": 0.27633270621299744, "learning_rate": 8.33190901415841e-05, "loss": 0.0119, "step": 6090 }, { "epoch": 8.55539971949509, "grad_norm": 0.20501497387886047, "learning_rate": 8.325740238498317e-05, "loss": 0.0123, "step": 6100 }, { "epoch": 8.569424964936886, "grad_norm": 0.27560821175575256, "learning_rate": 8.319562370391406e-05, "loss": 0.0165, "step": 6110 }, { "epoch": 8.583450210378682, "grad_norm": 0.2684819996356964, "learning_rate": 8.31337542672773e-05, "loss": 0.0163, "step": 6120 }, { "epoch": 8.597475455820478, "grad_norm": 0.23589953780174255, "learning_rate": 8.307179424422158e-05, "loss": 0.0137, "step": 6130 }, { "epoch": 8.611500701262273, "grad_norm": 0.3140094578266144, "learning_rate": 8.300974380414327e-05, "loss": 0.0127, "step": 6140 }, { "epoch": 8.625525946704068, "grad_norm": 0.31090205907821655, "learning_rate": 8.294760311668586e-05, "loss": 0.0157, "step": 6150 }, { "epoch": 8.639551192145863, "grad_norm": 0.3394724428653717, "learning_rate": 8.288537235173961e-05, "loss": 0.0155, "step": 6160 }, { "epoch": 8.653576437587658, "grad_norm": 0.33309412002563477, "learning_rate": 8.282305167944108e-05, "loss": 0.0151, "step": 6170 }, { "epoch": 8.667601683029453, "grad_norm": 0.36804866790771484, "learning_rate": 8.276064127017262e-05, "loss": 0.0144, "step": 6180 }, { "epoch": 8.681626928471248, "grad_norm": 0.2196921706199646, "learning_rate": 8.269814129456189e-05, "loss": 0.0124, "step": 6190 }, { "epoch": 8.695652173913043, "grad_norm": 0.22026048600673676, "learning_rate": 8.263555192348143e-05, "loss": 0.0129, "step": 6200 }, { "epoch": 8.709677419354838, "grad_norm": 0.39761364459991455, "learning_rate": 8.257287332804819e-05, "loss": 0.0174, "step": 6210 }, { "epoch": 8.723702664796633, "grad_norm": 0.44270944595336914, "learning_rate": 8.251010567962307e-05, "loss": 0.013, "step": 6220 }, { "epoch": 8.73772791023843, "grad_norm": 0.36799392104148865, "learning_rate": 8.244724914981041e-05, "loss": 0.0161, "step": 6230 }, { "epoch": 8.751753155680225, "grad_norm": 0.25583186745643616, "learning_rate": 8.238430391045757e-05, "loss": 0.0128, "step": 6240 }, { "epoch": 8.76577840112202, "grad_norm": 0.2449161559343338, "learning_rate": 8.232127013365445e-05, "loss": 0.0189, "step": 6250 }, { "epoch": 8.779803646563815, "grad_norm": 0.3229672312736511, "learning_rate": 8.225814799173295e-05, "loss": 0.0149, "step": 6260 }, { "epoch": 8.79382889200561, "grad_norm": 0.29948753118515015, "learning_rate": 8.219493765726663e-05, "loss": 0.0147, "step": 6270 }, { "epoch": 8.807854137447405, "grad_norm": 0.3357578217983246, "learning_rate": 8.21316393030701e-05, "loss": 0.0129, "step": 6280 }, { "epoch": 8.8218793828892, "grad_norm": 0.2750747799873352, "learning_rate": 8.206825310219865e-05, "loss": 0.0139, "step": 6290 }, { "epoch": 8.835904628330995, "grad_norm": 0.3543185293674469, "learning_rate": 8.200477922794776e-05, "loss": 0.0156, "step": 6300 }, { "epoch": 8.84992987377279, "grad_norm": 0.32587265968322754, "learning_rate": 8.194121785385256e-05, "loss": 0.0134, "step": 6310 }, { "epoch": 8.863955119214586, "grad_norm": 0.47169673442840576, "learning_rate": 8.187756915368741e-05, "loss": 0.0146, "step": 6320 }, { "epoch": 8.87798036465638, "grad_norm": 0.40850740671157837, "learning_rate": 8.181383330146544e-05, "loss": 0.0145, "step": 6330 }, { "epoch": 8.892005610098177, "grad_norm": 0.3680282533168793, "learning_rate": 8.175001047143804e-05, "loss": 0.0161, "step": 6340 }, { "epoch": 8.906030855539973, "grad_norm": 0.338113933801651, "learning_rate": 8.168610083809438e-05, "loss": 0.0156, "step": 6350 }, { "epoch": 8.920056100981768, "grad_norm": 0.26121440529823303, "learning_rate": 8.162210457616095e-05, "loss": 0.016, "step": 6360 }, { "epoch": 8.934081346423563, "grad_norm": 0.3677983582019806, "learning_rate": 8.155802186060109e-05, "loss": 0.0164, "step": 6370 }, { "epoch": 8.948106591865358, "grad_norm": 0.34072747826576233, "learning_rate": 8.149385286661453e-05, "loss": 0.0156, "step": 6380 }, { "epoch": 8.962131837307153, "grad_norm": 0.3578682839870453, "learning_rate": 8.14295977696368e-05, "loss": 0.0154, "step": 6390 }, { "epoch": 8.976157082748948, "grad_norm": 0.375152051448822, "learning_rate": 8.13652567453389e-05, "loss": 0.0141, "step": 6400 }, { "epoch": 8.990182328190743, "grad_norm": 0.3001749813556671, "learning_rate": 8.130082996962676e-05, "loss": 0.0155, "step": 6410 }, { "epoch": 9.004207573632538, "grad_norm": 0.2534882724285126, "learning_rate": 8.123631761864068e-05, "loss": 0.0163, "step": 6420 }, { "epoch": 9.018232819074333, "grad_norm": 0.1960490494966507, "learning_rate": 8.1171719868755e-05, "loss": 0.0153, "step": 6430 }, { "epoch": 9.03225806451613, "grad_norm": 0.2488713413476944, "learning_rate": 8.110703689657748e-05, "loss": 0.0137, "step": 6440 }, { "epoch": 9.046283309957925, "grad_norm": 0.26622283458709717, "learning_rate": 8.104226887894892e-05, "loss": 0.0148, "step": 6450 }, { "epoch": 9.06030855539972, "grad_norm": 0.32054656744003296, "learning_rate": 8.097741599294257e-05, "loss": 0.0135, "step": 6460 }, { "epoch": 9.074333800841515, "grad_norm": 0.31420671939849854, "learning_rate": 8.091247841586378e-05, "loss": 0.0127, "step": 6470 }, { "epoch": 9.08835904628331, "grad_norm": 0.2749091684818268, "learning_rate": 8.084745632524939e-05, "loss": 0.0162, "step": 6480 }, { "epoch": 9.102384291725105, "grad_norm": 0.35074183344841003, "learning_rate": 8.07823498988673e-05, "loss": 0.013, "step": 6490 }, { "epoch": 9.1164095371669, "grad_norm": 0.3741379380226135, "learning_rate": 8.071715931471602e-05, "loss": 0.0131, "step": 6500 }, { "epoch": 9.130434782608695, "grad_norm": 0.3500541150569916, "learning_rate": 8.06518847510241e-05, "loss": 0.0136, "step": 6510 }, { "epoch": 9.14446002805049, "grad_norm": 0.7191120386123657, "learning_rate": 8.058652638624971e-05, "loss": 0.0139, "step": 6520 }, { "epoch": 9.158485273492285, "grad_norm": 0.458975613117218, "learning_rate": 8.052108439908013e-05, "loss": 0.0123, "step": 6530 }, { "epoch": 9.17251051893408, "grad_norm": 0.3583080470561981, "learning_rate": 8.045555896843125e-05, "loss": 0.0151, "step": 6540 }, { "epoch": 9.186535764375877, "grad_norm": 0.31552088260650635, "learning_rate": 8.03899502734471e-05, "loss": 0.0127, "step": 6550 }, { "epoch": 9.200561009817672, "grad_norm": 0.22617825865745544, "learning_rate": 8.032425849349931e-05, "loss": 0.0158, "step": 6560 }, { "epoch": 9.214586255259468, "grad_norm": 0.27499106526374817, "learning_rate": 8.025848380818674e-05, "loss": 0.0149, "step": 6570 }, { "epoch": 9.228611500701263, "grad_norm": 0.2681017518043518, "learning_rate": 8.019262639733487e-05, "loss": 0.0176, "step": 6580 }, { "epoch": 9.242636746143058, "grad_norm": 0.2705209255218506, "learning_rate": 8.012668644099531e-05, "loss": 0.015, "step": 6590 }, { "epoch": 9.256661991584853, "grad_norm": 0.33764976263046265, "learning_rate": 8.006066411944542e-05, "loss": 0.0151, "step": 6600 }, { "epoch": 9.270687237026648, "grad_norm": 0.23848851025104523, "learning_rate": 7.999455961318769e-05, "loss": 0.0141, "step": 6610 }, { "epoch": 9.284712482468443, "grad_norm": 0.387762188911438, "learning_rate": 7.992837310294932e-05, "loss": 0.0159, "step": 6620 }, { "epoch": 9.298737727910238, "grad_norm": 0.21752434968948364, "learning_rate": 7.986210476968167e-05, "loss": 0.0141, "step": 6630 }, { "epoch": 9.312762973352033, "grad_norm": 0.3159080445766449, "learning_rate": 7.97957547945599e-05, "loss": 0.0148, "step": 6640 }, { "epoch": 9.32678821879383, "grad_norm": 0.2550439238548279, "learning_rate": 7.972932335898226e-05, "loss": 0.0126, "step": 6650 }, { "epoch": 9.340813464235625, "grad_norm": 0.30527979135513306, "learning_rate": 7.966281064456975e-05, "loss": 0.0138, "step": 6660 }, { "epoch": 9.35483870967742, "grad_norm": 0.18135583400726318, "learning_rate": 7.959621683316563e-05, "loss": 0.0154, "step": 6670 }, { "epoch": 9.368863955119215, "grad_norm": 0.23135150969028473, "learning_rate": 7.952954210683481e-05, "loss": 0.0139, "step": 6680 }, { "epoch": 9.38288920056101, "grad_norm": 0.15031686425209045, "learning_rate": 7.946278664786345e-05, "loss": 0.0133, "step": 6690 }, { "epoch": 9.396914446002805, "grad_norm": 0.18772602081298828, "learning_rate": 7.939595063875842e-05, "loss": 0.0138, "step": 6700 }, { "epoch": 9.4109396914446, "grad_norm": 0.22810089588165283, "learning_rate": 7.932903426224683e-05, "loss": 0.0142, "step": 6710 }, { "epoch": 9.424964936886395, "grad_norm": 0.36211958527565, "learning_rate": 7.926203770127552e-05, "loss": 0.0143, "step": 6720 }, { "epoch": 9.43899018232819, "grad_norm": 0.258968323469162, "learning_rate": 7.919496113901046e-05, "loss": 0.0121, "step": 6730 }, { "epoch": 9.453015427769985, "grad_norm": 0.4378112852573395, "learning_rate": 7.912780475883649e-05, "loss": 0.0123, "step": 6740 }, { "epoch": 9.46704067321178, "grad_norm": 0.33111339807510376, "learning_rate": 7.906056874435652e-05, "loss": 0.0138, "step": 6750 }, { "epoch": 9.481065918653577, "grad_norm": 0.32348525524139404, "learning_rate": 7.899325327939131e-05, "loss": 0.0141, "step": 6760 }, { "epoch": 9.495091164095372, "grad_norm": 0.1923413723707199, "learning_rate": 7.892585854797872e-05, "loss": 0.0119, "step": 6770 }, { "epoch": 9.509116409537167, "grad_norm": 0.28749755024909973, "learning_rate": 7.88583847343734e-05, "loss": 0.0117, "step": 6780 }, { "epoch": 9.523141654978962, "grad_norm": 0.24020026624202728, "learning_rate": 7.879083202304616e-05, "loss": 0.0142, "step": 6790 }, { "epoch": 9.537166900420758, "grad_norm": 0.2560575604438782, "learning_rate": 7.872320059868355e-05, "loss": 0.0146, "step": 6800 }, { "epoch": 9.551192145862553, "grad_norm": 0.23248526453971863, "learning_rate": 7.865549064618729e-05, "loss": 0.0145, "step": 6810 }, { "epoch": 9.565217391304348, "grad_norm": 0.2965206205844879, "learning_rate": 7.858770235067381e-05, "loss": 0.013, "step": 6820 }, { "epoch": 9.579242636746143, "grad_norm": 0.33843305706977844, "learning_rate": 7.851983589747374e-05, "loss": 0.0134, "step": 6830 }, { "epoch": 9.593267882187938, "grad_norm": 0.20501723885536194, "learning_rate": 7.845189147213133e-05, "loss": 0.012, "step": 6840 }, { "epoch": 9.607293127629733, "grad_norm": 0.2952137887477875, "learning_rate": 7.838386926040407e-05, "loss": 0.0159, "step": 6850 }, { "epoch": 9.621318373071528, "grad_norm": 0.28230202198028564, "learning_rate": 7.83157694482621e-05, "loss": 0.0165, "step": 6860 }, { "epoch": 9.635343618513325, "grad_norm": 0.30294808745384216, "learning_rate": 7.824759222188768e-05, "loss": 0.014, "step": 6870 }, { "epoch": 9.64936886395512, "grad_norm": 0.24781577289104462, "learning_rate": 7.817933776767478e-05, "loss": 0.0131, "step": 6880 }, { "epoch": 9.663394109396915, "grad_norm": 0.2430293709039688, "learning_rate": 7.811100627222842e-05, "loss": 0.0137, "step": 6890 }, { "epoch": 9.67741935483871, "grad_norm": 0.33456504344940186, "learning_rate": 7.804259792236435e-05, "loss": 0.0108, "step": 6900 }, { "epoch": 9.691444600280505, "grad_norm": 0.2977629005908966, "learning_rate": 7.797411290510835e-05, "loss": 0.0134, "step": 6910 }, { "epoch": 9.7054698457223, "grad_norm": 0.3851078152656555, "learning_rate": 7.790555140769586e-05, "loss": 0.0131, "step": 6920 }, { "epoch": 9.719495091164095, "grad_norm": 0.27562543749809265, "learning_rate": 7.78369136175714e-05, "loss": 0.0125, "step": 6930 }, { "epoch": 9.73352033660589, "grad_norm": 0.2668918967247009, "learning_rate": 7.776819972238806e-05, "loss": 0.0131, "step": 6940 }, { "epoch": 9.747545582047685, "grad_norm": 0.3092336058616638, "learning_rate": 7.7699409910007e-05, "loss": 0.0139, "step": 6950 }, { "epoch": 9.76157082748948, "grad_norm": 0.38904353976249695, "learning_rate": 7.763054436849694e-05, "loss": 0.0116, "step": 6960 }, { "epoch": 9.775596072931275, "grad_norm": 0.24014431238174438, "learning_rate": 7.756160328613364e-05, "loss": 0.0141, "step": 6970 }, { "epoch": 9.789621318373072, "grad_norm": 0.3114299476146698, "learning_rate": 7.749258685139942e-05, "loss": 0.0129, "step": 6980 }, { "epoch": 9.803646563814867, "grad_norm": 0.2867125868797302, "learning_rate": 7.742349525298253e-05, "loss": 0.0129, "step": 6990 }, { "epoch": 9.817671809256662, "grad_norm": 0.37857553362846375, "learning_rate": 7.735432867977679e-05, "loss": 0.0121, "step": 7000 }, { "epoch": 9.831697054698457, "grad_norm": 0.3341030478477478, "learning_rate": 7.728508732088096e-05, "loss": 0.0118, "step": 7010 }, { "epoch": 9.845722300140253, "grad_norm": 0.2617892622947693, "learning_rate": 7.721577136559825e-05, "loss": 0.016, "step": 7020 }, { "epoch": 9.859747545582048, "grad_norm": 0.26049408316612244, "learning_rate": 7.714638100343588e-05, "loss": 0.0143, "step": 7030 }, { "epoch": 9.873772791023843, "grad_norm": 0.5538707971572876, "learning_rate": 7.707691642410444e-05, "loss": 0.0123, "step": 7040 }, { "epoch": 9.887798036465638, "grad_norm": 0.31373026967048645, "learning_rate": 7.70073778175174e-05, "loss": 0.0134, "step": 7050 }, { "epoch": 9.901823281907433, "grad_norm": 0.26134005188941956, "learning_rate": 7.69377653737907e-05, "loss": 0.0124, "step": 7060 }, { "epoch": 9.915848527349228, "grad_norm": 0.34672558307647705, "learning_rate": 7.686807928324209e-05, "loss": 0.0142, "step": 7070 }, { "epoch": 9.929873772791023, "grad_norm": 0.2105439305305481, "learning_rate": 7.679831973639065e-05, "loss": 0.0128, "step": 7080 }, { "epoch": 9.94389901823282, "grad_norm": 0.22676028311252594, "learning_rate": 7.672848692395637e-05, "loss": 0.0131, "step": 7090 }, { "epoch": 9.957924263674615, "grad_norm": 0.2502754032611847, "learning_rate": 7.665858103685944e-05, "loss": 0.012, "step": 7100 }, { "epoch": 9.97194950911641, "grad_norm": 0.46591851115226746, "learning_rate": 7.658860226621991e-05, "loss": 0.0132, "step": 7110 }, { "epoch": 9.985974754558205, "grad_norm": 0.28925108909606934, "learning_rate": 7.651855080335708e-05, "loss": 0.0126, "step": 7120 }, { "epoch": 10.0, "grad_norm": 0.4484270215034485, "learning_rate": 7.644842683978896e-05, "loss": 0.0159, "step": 7130 }, { "epoch": 10.014025245441795, "grad_norm": 0.24588599801063538, "learning_rate": 7.63782305672318e-05, "loss": 0.0133, "step": 7140 }, { "epoch": 10.02805049088359, "grad_norm": 0.2676137387752533, "learning_rate": 7.63079621775995e-05, "loss": 0.0149, "step": 7150 }, { "epoch": 10.042075736325385, "grad_norm": 0.44632625579833984, "learning_rate": 7.623762186300319e-05, "loss": 0.0116, "step": 7160 }, { "epoch": 10.05610098176718, "grad_norm": 0.2945210039615631, "learning_rate": 7.616720981575057e-05, "loss": 0.0141, "step": 7170 }, { "epoch": 10.070126227208975, "grad_norm": 0.31680533289909363, "learning_rate": 7.609672622834552e-05, "loss": 0.0139, "step": 7180 }, { "epoch": 10.084151472650772, "grad_norm": 0.3076924681663513, "learning_rate": 7.602617129348747e-05, "loss": 0.0133, "step": 7190 }, { "epoch": 10.098176718092567, "grad_norm": 0.25141748785972595, "learning_rate": 7.595554520407088e-05, "loss": 0.0135, "step": 7200 }, { "epoch": 10.112201963534362, "grad_norm": 0.18562227487564087, "learning_rate": 7.588484815318484e-05, "loss": 0.0136, "step": 7210 }, { "epoch": 10.126227208976157, "grad_norm": 0.2315988540649414, "learning_rate": 7.581408033411234e-05, "loss": 0.0138, "step": 7220 }, { "epoch": 10.140252454417952, "grad_norm": 0.24984262883663177, "learning_rate": 7.574324194032995e-05, "loss": 0.0146, "step": 7230 }, { "epoch": 10.154277699859747, "grad_norm": 0.23141245543956757, "learning_rate": 7.567233316550705e-05, "loss": 0.0116, "step": 7240 }, { "epoch": 10.168302945301543, "grad_norm": 0.2513107359409332, "learning_rate": 7.560135420350562e-05, "loss": 0.0128, "step": 7250 }, { "epoch": 10.182328190743338, "grad_norm": 0.2561623156070709, "learning_rate": 7.553030524837935e-05, "loss": 0.0124, "step": 7260 }, { "epoch": 10.196353436185133, "grad_norm": 0.26084059476852417, "learning_rate": 7.545918649437341e-05, "loss": 0.014, "step": 7270 }, { "epoch": 10.210378681626928, "grad_norm": 0.27259716391563416, "learning_rate": 7.538799813592377e-05, "loss": 0.0122, "step": 7280 }, { "epoch": 10.224403927068725, "grad_norm": 0.25160273909568787, "learning_rate": 7.531674036765662e-05, "loss": 0.0119, "step": 7290 }, { "epoch": 10.23842917251052, "grad_norm": 0.2396249622106552, "learning_rate": 7.524541338438807e-05, "loss": 0.0124, "step": 7300 }, { "epoch": 10.252454417952315, "grad_norm": 0.23994582891464233, "learning_rate": 7.517401738112328e-05, "loss": 0.0113, "step": 7310 }, { "epoch": 10.26647966339411, "grad_norm": 0.29084017872810364, "learning_rate": 7.510255255305628e-05, "loss": 0.012, "step": 7320 }, { "epoch": 10.280504908835905, "grad_norm": 0.21431438624858856, "learning_rate": 7.503101909556911e-05, "loss": 0.013, "step": 7330 }, { "epoch": 10.2945301542777, "grad_norm": 0.3511161804199219, "learning_rate": 7.495941720423154e-05, "loss": 0.0159, "step": 7340 }, { "epoch": 10.308555399719495, "grad_norm": 0.27332809567451477, "learning_rate": 7.488774707480042e-05, "loss": 0.0109, "step": 7350 }, { "epoch": 10.32258064516129, "grad_norm": 0.23057657480239868, "learning_rate": 7.481600890321911e-05, "loss": 0.0138, "step": 7360 }, { "epoch": 10.336605890603085, "grad_norm": 0.2419021874666214, "learning_rate": 7.474420288561708e-05, "loss": 0.0115, "step": 7370 }, { "epoch": 10.35063113604488, "grad_norm": 0.2548217177391052, "learning_rate": 7.467232921830921e-05, "loss": 0.0116, "step": 7380 }, { "epoch": 10.364656381486675, "grad_norm": 0.1924692988395691, "learning_rate": 7.460038809779537e-05, "loss": 0.0136, "step": 7390 }, { "epoch": 10.378681626928472, "grad_norm": 0.28570497035980225, "learning_rate": 7.452837972075983e-05, "loss": 0.0117, "step": 7400 }, { "epoch": 10.392706872370267, "grad_norm": 0.19721192121505737, "learning_rate": 7.445630428407074e-05, "loss": 0.015, "step": 7410 }, { "epoch": 10.406732117812062, "grad_norm": 0.3168320059776306, "learning_rate": 7.43841619847796e-05, "loss": 0.0129, "step": 7420 }, { "epoch": 10.420757363253857, "grad_norm": 0.22577844560146332, "learning_rate": 7.431195302012072e-05, "loss": 0.012, "step": 7430 }, { "epoch": 10.434782608695652, "grad_norm": 0.3906562328338623, "learning_rate": 7.423967758751061e-05, "loss": 0.0158, "step": 7440 }, { "epoch": 10.448807854137447, "grad_norm": 0.41276058554649353, "learning_rate": 7.416733588454758e-05, "loss": 0.0139, "step": 7450 }, { "epoch": 10.462833099579242, "grad_norm": 0.2815922200679779, "learning_rate": 7.409492810901106e-05, "loss": 0.0131, "step": 7460 }, { "epoch": 10.476858345021038, "grad_norm": 0.3032444417476654, "learning_rate": 7.402245445886116e-05, "loss": 0.0149, "step": 7470 }, { "epoch": 10.490883590462833, "grad_norm": 0.33133697509765625, "learning_rate": 7.394991513223806e-05, "loss": 0.0152, "step": 7480 }, { "epoch": 10.504908835904628, "grad_norm": 0.333442360162735, "learning_rate": 7.38773103274615e-05, "loss": 0.0124, "step": 7490 }, { "epoch": 10.518934081346423, "grad_norm": 0.3000176250934601, "learning_rate": 7.380464024303028e-05, "loss": 0.0129, "step": 7500 }, { "epoch": 10.53295932678822, "grad_norm": 0.3007463812828064, "learning_rate": 7.373190507762162e-05, "loss": 0.0161, "step": 7510 }, { "epoch": 10.546984572230015, "grad_norm": 0.26446443796157837, "learning_rate": 7.365910503009066e-05, "loss": 0.0112, "step": 7520 }, { "epoch": 10.56100981767181, "grad_norm": 0.2666980028152466, "learning_rate": 7.358624029946996e-05, "loss": 0.0123, "step": 7530 }, { "epoch": 10.575035063113605, "grad_norm": 0.3061917722225189, "learning_rate": 7.351331108496893e-05, "loss": 0.014, "step": 7540 }, { "epoch": 10.5890603085554, "grad_norm": 0.23616346716880798, "learning_rate": 7.344031758597325e-05, "loss": 0.0135, "step": 7550 }, { "epoch": 10.603085553997195, "grad_norm": 0.23365937173366547, "learning_rate": 7.336726000204435e-05, "loss": 0.014, "step": 7560 }, { "epoch": 10.61711079943899, "grad_norm": 0.2168937623500824, "learning_rate": 7.32941385329189e-05, "loss": 0.0131, "step": 7570 }, { "epoch": 10.631136044880785, "grad_norm": 0.31507962942123413, "learning_rate": 7.322095337850816e-05, "loss": 0.0116, "step": 7580 }, { "epoch": 10.64516129032258, "grad_norm": 0.33646923303604126, "learning_rate": 7.314770473889758e-05, "loss": 0.0153, "step": 7590 }, { "epoch": 10.659186535764375, "grad_norm": 0.26420262455940247, "learning_rate": 7.307439281434615e-05, "loss": 0.0115, "step": 7600 }, { "epoch": 10.67321178120617, "grad_norm": 0.31156471371650696, "learning_rate": 7.300101780528585e-05, "loss": 0.0131, "step": 7610 }, { "epoch": 10.687237026647967, "grad_norm": 0.4221523404121399, "learning_rate": 7.292757991232117e-05, "loss": 0.0151, "step": 7620 }, { "epoch": 10.701262272089762, "grad_norm": 0.3199746608734131, "learning_rate": 7.285407933622848e-05, "loss": 0.012, "step": 7630 }, { "epoch": 10.715287517531557, "grad_norm": 0.33236730098724365, "learning_rate": 7.278051627795557e-05, "loss": 0.0136, "step": 7640 }, { "epoch": 10.729312762973352, "grad_norm": 0.2858358919620514, "learning_rate": 7.270689093862105e-05, "loss": 0.0172, "step": 7650 }, { "epoch": 10.743338008415147, "grad_norm": 0.22584158182144165, "learning_rate": 7.263320351951374e-05, "loss": 0.0125, "step": 7660 }, { "epoch": 10.757363253856942, "grad_norm": 0.2508135735988617, "learning_rate": 7.255945422209227e-05, "loss": 0.0107, "step": 7670 }, { "epoch": 10.771388499298737, "grad_norm": 0.20944556593894958, "learning_rate": 7.248564324798437e-05, "loss": 0.0123, "step": 7680 }, { "epoch": 10.785413744740532, "grad_norm": 0.20416951179504395, "learning_rate": 7.241177079898644e-05, "loss": 0.0125, "step": 7690 }, { "epoch": 10.799438990182328, "grad_norm": 0.2306576520204544, "learning_rate": 7.233783707706295e-05, "loss": 0.0119, "step": 7700 }, { "epoch": 10.813464235624123, "grad_norm": 0.22312051057815552, "learning_rate": 7.226384228434586e-05, "loss": 0.0135, "step": 7710 }, { "epoch": 10.82748948106592, "grad_norm": 0.26615700125694275, "learning_rate": 7.21897866231341e-05, "loss": 0.0131, "step": 7720 }, { "epoch": 10.841514726507715, "grad_norm": 0.2365492731332779, "learning_rate": 7.211567029589303e-05, "loss": 0.0115, "step": 7730 }, { "epoch": 10.85553997194951, "grad_norm": 0.3009771704673767, "learning_rate": 7.204149350525387e-05, "loss": 0.013, "step": 7740 }, { "epoch": 10.869565217391305, "grad_norm": 0.1905997097492218, "learning_rate": 7.196725645401309e-05, "loss": 0.0104, "step": 7750 }, { "epoch": 10.8835904628331, "grad_norm": 0.23142331838607788, "learning_rate": 7.1892959345132e-05, "loss": 0.0129, "step": 7760 }, { "epoch": 10.897615708274895, "grad_norm": 0.19406883418560028, "learning_rate": 7.181860238173605e-05, "loss": 0.013, "step": 7770 }, { "epoch": 10.91164095371669, "grad_norm": 0.2505205273628235, "learning_rate": 7.174418576711432e-05, "loss": 0.0103, "step": 7780 }, { "epoch": 10.925666199158485, "grad_norm": 0.5667808055877686, "learning_rate": 7.1669709704719e-05, "loss": 0.0124, "step": 7790 }, { "epoch": 10.93969144460028, "grad_norm": 0.3498063385486603, "learning_rate": 7.159517439816481e-05, "loss": 0.0128, "step": 7800 }, { "epoch": 10.953716690042075, "grad_norm": 0.49245595932006836, "learning_rate": 7.152058005122842e-05, "loss": 0.0164, "step": 7810 }, { "epoch": 10.967741935483872, "grad_norm": 0.2970128357410431, "learning_rate": 7.144592686784793e-05, "loss": 0.0106, "step": 7820 }, { "epoch": 10.981767180925667, "grad_norm": 0.2933286130428314, "learning_rate": 7.137121505212229e-05, "loss": 0.011, "step": 7830 }, { "epoch": 10.995792426367462, "grad_norm": 0.26924601197242737, "learning_rate": 7.129644480831077e-05, "loss": 0.0132, "step": 7840 }, { "epoch": 11.009817671809257, "grad_norm": 0.21312811970710754, "learning_rate": 7.122161634083234e-05, "loss": 0.0119, "step": 7850 }, { "epoch": 11.023842917251052, "grad_norm": 0.15351912379264832, "learning_rate": 7.114672985426516e-05, "loss": 0.01, "step": 7860 }, { "epoch": 11.037868162692847, "grad_norm": 0.3153599202632904, "learning_rate": 7.107178555334606e-05, "loss": 0.0125, "step": 7870 }, { "epoch": 11.051893408134642, "grad_norm": 0.3124891519546509, "learning_rate": 7.099678364296989e-05, "loss": 0.0125, "step": 7880 }, { "epoch": 11.065918653576437, "grad_norm": 0.33005765080451965, "learning_rate": 7.0921724328189e-05, "loss": 0.0136, "step": 7890 }, { "epoch": 11.079943899018232, "grad_norm": 0.18786385655403137, "learning_rate": 7.084660781421268e-05, "loss": 0.0115, "step": 7900 }, { "epoch": 11.093969144460027, "grad_norm": 0.3583833873271942, "learning_rate": 7.077143430640662e-05, "loss": 0.0138, "step": 7910 }, { "epoch": 11.107994389901823, "grad_norm": 0.2129838615655899, "learning_rate": 7.069620401029232e-05, "loss": 0.0153, "step": 7920 }, { "epoch": 11.12201963534362, "grad_norm": 0.35970836877822876, "learning_rate": 7.062091713154655e-05, "loss": 0.0114, "step": 7930 }, { "epoch": 11.136044880785414, "grad_norm": 0.3167235255241394, "learning_rate": 7.054557387600075e-05, "loss": 0.014, "step": 7940 }, { "epoch": 11.15007012622721, "grad_norm": 0.27666187286376953, "learning_rate": 7.04701744496405e-05, "loss": 0.0143, "step": 7950 }, { "epoch": 11.164095371669005, "grad_norm": 0.3016020655632019, "learning_rate": 7.039471905860495e-05, "loss": 0.0104, "step": 7960 }, { "epoch": 11.1781206171108, "grad_norm": 0.38499656319618225, "learning_rate": 7.031920790918628e-05, "loss": 0.0144, "step": 7970 }, { "epoch": 11.192145862552595, "grad_norm": 0.2365475296974182, "learning_rate": 7.024364120782906e-05, "loss": 0.0129, "step": 7980 }, { "epoch": 11.20617110799439, "grad_norm": 0.3164042830467224, "learning_rate": 7.016801916112978e-05, "loss": 0.0149, "step": 7990 }, { "epoch": 11.220196353436185, "grad_norm": 0.22480984032154083, "learning_rate": 7.009234197583623e-05, "loss": 0.0124, "step": 8000 }, { "epoch": 11.23422159887798, "grad_norm": 0.27391624450683594, "learning_rate": 7.001660985884692e-05, "loss": 0.0123, "step": 8010 }, { "epoch": 11.248246844319775, "grad_norm": 0.3034892976284027, "learning_rate": 6.994082301721063e-05, "loss": 0.0147, "step": 8020 }, { "epoch": 11.26227208976157, "grad_norm": 0.322370320558548, "learning_rate": 6.986498165812563e-05, "loss": 0.0126, "step": 8030 }, { "epoch": 11.276297335203367, "grad_norm": 0.2935252785682678, "learning_rate": 6.978908598893932e-05, "loss": 0.0126, "step": 8040 }, { "epoch": 11.290322580645162, "grad_norm": 0.3790438175201416, "learning_rate": 6.971313621714756e-05, "loss": 0.0156, "step": 8050 }, { "epoch": 11.304347826086957, "grad_norm": 0.3776082694530487, "learning_rate": 6.96371325503941e-05, "loss": 0.0103, "step": 8060 }, { "epoch": 11.318373071528752, "grad_norm": 0.3524962067604065, "learning_rate": 6.956107519647014e-05, "loss": 0.0123, "step": 8070 }, { "epoch": 11.332398316970547, "grad_norm": 0.24191118776798248, "learning_rate": 6.94849643633135e-05, "loss": 0.0129, "step": 8080 }, { "epoch": 11.346423562412342, "grad_norm": 0.25303730368614197, "learning_rate": 6.940880025900834e-05, "loss": 0.0111, "step": 8090 }, { "epoch": 11.360448807854137, "grad_norm": 0.21776911616325378, "learning_rate": 6.933258309178438e-05, "loss": 0.0107, "step": 8100 }, { "epoch": 11.374474053295932, "grad_norm": 0.40401002764701843, "learning_rate": 6.925631307001646e-05, "loss": 0.017, "step": 8110 }, { "epoch": 11.388499298737727, "grad_norm": 0.241632878780365, "learning_rate": 6.91799904022239e-05, "loss": 0.0113, "step": 8120 }, { "epoch": 11.402524544179522, "grad_norm": 0.20550435781478882, "learning_rate": 6.910361529706997e-05, "loss": 0.0123, "step": 8130 }, { "epoch": 11.416549789621318, "grad_norm": 0.27549752593040466, "learning_rate": 6.902718796336131e-05, "loss": 0.0109, "step": 8140 }, { "epoch": 11.430575035063114, "grad_norm": 0.3318646550178528, "learning_rate": 6.895070861004729e-05, "loss": 0.0129, "step": 8150 }, { "epoch": 11.44460028050491, "grad_norm": 0.31055837869644165, "learning_rate": 6.887417744621956e-05, "loss": 0.013, "step": 8160 }, { "epoch": 11.458625525946704, "grad_norm": 0.22803127765655518, "learning_rate": 6.87975946811114e-05, "loss": 0.0117, "step": 8170 }, { "epoch": 11.4726507713885, "grad_norm": 0.22135315835475922, "learning_rate": 6.872096052409718e-05, "loss": 0.0112, "step": 8180 }, { "epoch": 11.486676016830295, "grad_norm": 0.21517238020896912, "learning_rate": 6.864427518469174e-05, "loss": 0.0131, "step": 8190 }, { "epoch": 11.50070126227209, "grad_norm": 0.2759351432323456, "learning_rate": 6.856753887254986e-05, "loss": 0.0112, "step": 8200 }, { "epoch": 11.514726507713885, "grad_norm": 0.2010258138179779, "learning_rate": 6.849075179746572e-05, "loss": 0.0109, "step": 8210 }, { "epoch": 11.52875175315568, "grad_norm": 0.2100413590669632, "learning_rate": 6.841391416937221e-05, "loss": 0.0119, "step": 8220 }, { "epoch": 11.542776998597475, "grad_norm": 0.351193368434906, "learning_rate": 6.833702619834053e-05, "loss": 0.011, "step": 8230 }, { "epoch": 11.55680224403927, "grad_norm": 0.21107913553714752, "learning_rate": 6.82600880945794e-05, "loss": 0.011, "step": 8240 }, { "epoch": 11.570827489481065, "grad_norm": 0.3388189971446991, "learning_rate": 6.818310006843468e-05, "loss": 0.0096, "step": 8250 }, { "epoch": 11.584852734922862, "grad_norm": 0.384765088558197, "learning_rate": 6.810606233038868e-05, "loss": 0.0111, "step": 8260 }, { "epoch": 11.598877980364657, "grad_norm": 0.3356608748435974, "learning_rate": 6.802897509105966e-05, "loss": 0.0123, "step": 8270 }, { "epoch": 11.612903225806452, "grad_norm": 0.20528161525726318, "learning_rate": 6.79518385612012e-05, "loss": 0.0132, "step": 8280 }, { "epoch": 11.626928471248247, "grad_norm": 0.32332247495651245, "learning_rate": 6.787465295170157e-05, "loss": 0.014, "step": 8290 }, { "epoch": 11.640953716690042, "grad_norm": 0.2170192450284958, "learning_rate": 6.779741847358332e-05, "loss": 0.013, "step": 8300 }, { "epoch": 11.654978962131837, "grad_norm": 0.39857444167137146, "learning_rate": 6.772013533800256e-05, "loss": 0.0106, "step": 8310 }, { "epoch": 11.669004207573632, "grad_norm": 0.29388427734375, "learning_rate": 6.764280375624843e-05, "loss": 0.0126, "step": 8320 }, { "epoch": 11.683029453015427, "grad_norm": 0.31757795810699463, "learning_rate": 6.756542393974252e-05, "loss": 0.0125, "step": 8330 }, { "epoch": 11.697054698457222, "grad_norm": 0.3155818283557892, "learning_rate": 6.748799610003828e-05, "loss": 0.0107, "step": 8340 }, { "epoch": 11.711079943899017, "grad_norm": 0.2028730809688568, "learning_rate": 6.741052044882048e-05, "loss": 0.0119, "step": 8350 }, { "epoch": 11.725105189340814, "grad_norm": 0.31987878680229187, "learning_rate": 6.73329971979046e-05, "loss": 0.01, "step": 8360 }, { "epoch": 11.73913043478261, "grad_norm": 0.21690967679023743, "learning_rate": 6.725542655923625e-05, "loss": 0.0107, "step": 8370 }, { "epoch": 11.753155680224404, "grad_norm": 0.19967463612556458, "learning_rate": 6.717780874489057e-05, "loss": 0.0122, "step": 8380 }, { "epoch": 11.7671809256662, "grad_norm": 0.20786459743976593, "learning_rate": 6.710014396707172e-05, "loss": 0.013, "step": 8390 }, { "epoch": 11.781206171107995, "grad_norm": 0.2564167380332947, "learning_rate": 6.702243243811221e-05, "loss": 0.0116, "step": 8400 }, { "epoch": 11.79523141654979, "grad_norm": 0.23275944590568542, "learning_rate": 6.694467437047244e-05, "loss": 0.0112, "step": 8410 }, { "epoch": 11.809256661991585, "grad_norm": 0.34493371844291687, "learning_rate": 6.686686997673997e-05, "loss": 0.0116, "step": 8420 }, { "epoch": 11.82328190743338, "grad_norm": 0.3206663131713867, "learning_rate": 6.678901946962903e-05, "loss": 0.0126, "step": 8430 }, { "epoch": 11.837307152875175, "grad_norm": 0.18439380824565887, "learning_rate": 6.671112306197996e-05, "loss": 0.0106, "step": 8440 }, { "epoch": 11.85133239831697, "grad_norm": 0.1747751533985138, "learning_rate": 6.663318096675854e-05, "loss": 0.0117, "step": 8450 }, { "epoch": 11.865357643758767, "grad_norm": 0.20690591633319855, "learning_rate": 6.655519339705552e-05, "loss": 0.0105, "step": 8460 }, { "epoch": 11.879382889200562, "grad_norm": 0.24969153106212616, "learning_rate": 6.647716056608588e-05, "loss": 0.0135, "step": 8470 }, { "epoch": 11.893408134642357, "grad_norm": 0.2366046905517578, "learning_rate": 6.639908268718843e-05, "loss": 0.0117, "step": 8480 }, { "epoch": 11.907433380084152, "grad_norm": 0.22518107295036316, "learning_rate": 6.632095997382514e-05, "loss": 0.0109, "step": 8490 }, { "epoch": 11.921458625525947, "grad_norm": 0.2461056411266327, "learning_rate": 6.624279263958047e-05, "loss": 0.0118, "step": 8500 }, { "epoch": 11.935483870967742, "grad_norm": 0.20588627457618713, "learning_rate": 6.616458089816097e-05, "loss": 0.0115, "step": 8510 }, { "epoch": 11.949509116409537, "grad_norm": 0.27274614572525024, "learning_rate": 6.608632496339454e-05, "loss": 0.0108, "step": 8520 }, { "epoch": 11.963534361851332, "grad_norm": 0.2792292535305023, "learning_rate": 6.600802504922988e-05, "loss": 0.0132, "step": 8530 }, { "epoch": 11.977559607293127, "grad_norm": 0.17464491724967957, "learning_rate": 6.592968136973604e-05, "loss": 0.0136, "step": 8540 }, { "epoch": 11.991584852734922, "grad_norm": 0.31539052724838257, "learning_rate": 6.585129413910159e-05, "loss": 0.0131, "step": 8550 }, { "epoch": 12.005610098176717, "grad_norm": 0.2554822862148285, "learning_rate": 6.577286357163424e-05, "loss": 0.0116, "step": 8560 }, { "epoch": 12.019635343618514, "grad_norm": 0.34613344073295593, "learning_rate": 6.569438988176018e-05, "loss": 0.0122, "step": 8570 }, { "epoch": 12.03366058906031, "grad_norm": 0.39492544531822205, "learning_rate": 6.561587328402347e-05, "loss": 0.0108, "step": 8580 }, { "epoch": 12.047685834502104, "grad_norm": 0.2742479741573334, "learning_rate": 6.553731399308549e-05, "loss": 0.0127, "step": 8590 }, { "epoch": 12.0617110799439, "grad_norm": 0.34650036692619324, "learning_rate": 6.545871222372436e-05, "loss": 0.0122, "step": 8600 }, { "epoch": 12.075736325385694, "grad_norm": 0.33805087208747864, "learning_rate": 6.538006819083426e-05, "loss": 0.0109, "step": 8610 }, { "epoch": 12.08976157082749, "grad_norm": 0.1979604959487915, "learning_rate": 6.530138210942505e-05, "loss": 0.01, "step": 8620 }, { "epoch": 12.103786816269285, "grad_norm": 0.29442423582077026, "learning_rate": 6.522265419462141e-05, "loss": 0.0108, "step": 8630 }, { "epoch": 12.11781206171108, "grad_norm": 0.2686213254928589, "learning_rate": 6.514388466166248e-05, "loss": 0.0096, "step": 8640 }, { "epoch": 12.131837307152875, "grad_norm": 0.17319869995117188, "learning_rate": 6.506507372590119e-05, "loss": 0.0106, "step": 8650 }, { "epoch": 12.14586255259467, "grad_norm": 0.22584618628025055, "learning_rate": 6.498622160280355e-05, "loss": 0.0137, "step": 8660 }, { "epoch": 12.159887798036465, "grad_norm": 0.5674343109130859, "learning_rate": 6.490732850794832e-05, "loss": 0.0114, "step": 8670 }, { "epoch": 12.173913043478262, "grad_norm": 0.33759167790412903, "learning_rate": 6.482839465702616e-05, "loss": 0.0134, "step": 8680 }, { "epoch": 12.187938288920057, "grad_norm": 0.21656042337417603, "learning_rate": 6.474942026583923e-05, "loss": 0.0118, "step": 8690 }, { "epoch": 12.201963534361852, "grad_norm": 0.20470066368579865, "learning_rate": 6.467040555030052e-05, "loss": 0.0117, "step": 8700 }, { "epoch": 12.215988779803647, "grad_norm": 0.37844765186309814, "learning_rate": 6.459135072643321e-05, "loss": 0.0108, "step": 8710 }, { "epoch": 12.230014025245442, "grad_norm": 0.2910792827606201, "learning_rate": 6.451225601037019e-05, "loss": 0.0107, "step": 8720 }, { "epoch": 12.244039270687237, "grad_norm": 0.1600731760263443, "learning_rate": 6.443312161835338e-05, "loss": 0.012, "step": 8730 }, { "epoch": 12.258064516129032, "grad_norm": 0.1781173199415207, "learning_rate": 6.43539477667332e-05, "loss": 0.0101, "step": 8740 }, { "epoch": 12.272089761570827, "grad_norm": 0.19164744019508362, "learning_rate": 6.427473467196793e-05, "loss": 0.0102, "step": 8750 }, { "epoch": 12.286115007012622, "grad_norm": 0.28651610016822815, "learning_rate": 6.419548255062315e-05, "loss": 0.0143, "step": 8760 }, { "epoch": 12.300140252454417, "grad_norm": 0.2808370590209961, "learning_rate": 6.411619161937112e-05, "loss": 0.0102, "step": 8770 }, { "epoch": 12.314165497896212, "grad_norm": 0.22873498499393463, "learning_rate": 6.403686209499022e-05, "loss": 0.0096, "step": 8780 }, { "epoch": 12.32819074333801, "grad_norm": 0.16215233504772186, "learning_rate": 6.395749419436437e-05, "loss": 0.0111, "step": 8790 }, { "epoch": 12.342215988779804, "grad_norm": 0.1672547310590744, "learning_rate": 6.387808813448234e-05, "loss": 0.0099, "step": 8800 }, { "epoch": 12.3562412342216, "grad_norm": 0.2782524824142456, "learning_rate": 6.37986441324373e-05, "loss": 0.0132, "step": 8810 }, { "epoch": 12.370266479663394, "grad_norm": 0.3890216648578644, "learning_rate": 6.37191624054261e-05, "loss": 0.0113, "step": 8820 }, { "epoch": 12.38429172510519, "grad_norm": 0.34454333782196045, "learning_rate": 6.363964317074872e-05, "loss": 0.0123, "step": 8830 }, { "epoch": 12.398316970546984, "grad_norm": 1.4596099853515625, "learning_rate": 6.356008664580776e-05, "loss": 0.014, "step": 8840 }, { "epoch": 12.41234221598878, "grad_norm": 0.2937188744544983, "learning_rate": 6.348049304810771e-05, "loss": 0.0125, "step": 8850 }, { "epoch": 12.426367461430575, "grad_norm": 0.20359668135643005, "learning_rate": 6.340086259525442e-05, "loss": 0.0142, "step": 8860 }, { "epoch": 12.44039270687237, "grad_norm": 0.23790264129638672, "learning_rate": 6.332119550495448e-05, "loss": 0.0137, "step": 8870 }, { "epoch": 12.454417952314165, "grad_norm": 0.35698017477989197, "learning_rate": 6.324149199501473e-05, "loss": 0.0118, "step": 8880 }, { "epoch": 12.468443197755962, "grad_norm": 0.35678374767303467, "learning_rate": 6.316175228334146e-05, "loss": 0.0104, "step": 8890 }, { "epoch": 12.482468443197757, "grad_norm": 0.2556513249874115, "learning_rate": 6.308197658794003e-05, "loss": 0.0129, "step": 8900 }, { "epoch": 12.496493688639552, "grad_norm": 0.2054547220468521, "learning_rate": 6.300216512691417e-05, "loss": 0.01, "step": 8910 }, { "epoch": 12.510518934081347, "grad_norm": 0.20166020095348358, "learning_rate": 6.292231811846532e-05, "loss": 0.01, "step": 8920 }, { "epoch": 12.524544179523142, "grad_norm": 0.27720022201538086, "learning_rate": 6.284243578089217e-05, "loss": 0.0126, "step": 8930 }, { "epoch": 12.538569424964937, "grad_norm": 0.14872503280639648, "learning_rate": 6.276251833258999e-05, "loss": 0.0094, "step": 8940 }, { "epoch": 12.552594670406732, "grad_norm": 0.30750975012779236, "learning_rate": 6.268256599205003e-05, "loss": 0.0117, "step": 8950 }, { "epoch": 12.566619915848527, "grad_norm": 0.35392504930496216, "learning_rate": 6.260257897785892e-05, "loss": 0.0125, "step": 8960 }, { "epoch": 12.580645161290322, "grad_norm": 0.20921701192855835, "learning_rate": 6.252255750869811e-05, "loss": 0.0129, "step": 8970 }, { "epoch": 12.594670406732117, "grad_norm": 0.2126634120941162, "learning_rate": 6.244250180334325e-05, "loss": 0.0114, "step": 8980 }, { "epoch": 12.608695652173914, "grad_norm": 0.16483235359191895, "learning_rate": 6.236241208066356e-05, "loss": 0.0123, "step": 8990 }, { "epoch": 12.622720897615709, "grad_norm": 0.25106579065322876, "learning_rate": 6.228228855962133e-05, "loss": 0.0142, "step": 9000 }, { "epoch": 12.636746143057504, "grad_norm": 0.23183995485305786, "learning_rate": 6.220213145927115e-05, "loss": 0.0131, "step": 9010 }, { "epoch": 12.6507713884993, "grad_norm": 0.23641780018806458, "learning_rate": 6.212194099875951e-05, "loss": 0.0114, "step": 9020 }, { "epoch": 12.664796633941094, "grad_norm": 0.20260518789291382, "learning_rate": 6.204171739732405e-05, "loss": 0.011, "step": 9030 }, { "epoch": 12.67882187938289, "grad_norm": 0.38652658462524414, "learning_rate": 6.196146087429303e-05, "loss": 0.013, "step": 9040 }, { "epoch": 12.692847124824684, "grad_norm": 0.216811403632164, "learning_rate": 6.188117164908474e-05, "loss": 0.0114, "step": 9050 }, { "epoch": 12.70687237026648, "grad_norm": 0.34722769260406494, "learning_rate": 6.180084994120684e-05, "loss": 0.0129, "step": 9060 }, { "epoch": 12.720897615708274, "grad_norm": 0.2409057766199112, "learning_rate": 6.17204959702558e-05, "loss": 0.0122, "step": 9070 }, { "epoch": 12.73492286115007, "grad_norm": 0.2228439599275589, "learning_rate": 6.164010995591635e-05, "loss": 0.0115, "step": 9080 }, { "epoch": 12.748948106591865, "grad_norm": 0.20331965386867523, "learning_rate": 6.155969211796076e-05, "loss": 0.0134, "step": 9090 }, { "epoch": 12.762973352033661, "grad_norm": 0.204179048538208, "learning_rate": 6.147924267624829e-05, "loss": 0.0132, "step": 9100 }, { "epoch": 12.776998597475457, "grad_norm": 0.22098228335380554, "learning_rate": 6.13987618507247e-05, "loss": 0.0107, "step": 9110 }, { "epoch": 12.791023842917252, "grad_norm": 0.2090393602848053, "learning_rate": 6.131824986142147e-05, "loss": 0.0094, "step": 9120 }, { "epoch": 12.805049088359047, "grad_norm": 0.2727956473827362, "learning_rate": 6.123770692845529e-05, "loss": 0.012, "step": 9130 }, { "epoch": 12.819074333800842, "grad_norm": 0.41143274307250977, "learning_rate": 6.11571332720275e-05, "loss": 0.0121, "step": 9140 }, { "epoch": 12.833099579242637, "grad_norm": 0.26617541909217834, "learning_rate": 6.107652911242336e-05, "loss": 0.0116, "step": 9150 }, { "epoch": 12.847124824684432, "grad_norm": 0.2342241406440735, "learning_rate": 6.0995894670011586e-05, "loss": 0.0101, "step": 9160 }, { "epoch": 12.861150070126227, "grad_norm": 0.30086350440979004, "learning_rate": 6.091523016524368e-05, "loss": 0.014, "step": 9170 }, { "epoch": 12.875175315568022, "grad_norm": 0.2008330523967743, "learning_rate": 6.083453581865328e-05, "loss": 0.0119, "step": 9180 }, { "epoch": 12.889200561009817, "grad_norm": 0.2688964307308197, "learning_rate": 6.075381185085568e-05, "loss": 0.0103, "step": 9190 }, { "epoch": 12.903225806451612, "grad_norm": 0.24262602627277374, "learning_rate": 6.067305848254709e-05, "loss": 0.0118, "step": 9200 }, { "epoch": 12.917251051893409, "grad_norm": 0.3329535722732544, "learning_rate": 6.059227593450418e-05, "loss": 0.0115, "step": 9210 }, { "epoch": 12.931276297335204, "grad_norm": 0.2554514408111572, "learning_rate": 6.051146442758333e-05, "loss": 0.0131, "step": 9220 }, { "epoch": 12.945301542776999, "grad_norm": 0.2659347355365753, "learning_rate": 6.043062418272012e-05, "loss": 0.0123, "step": 9230 }, { "epoch": 12.959326788218794, "grad_norm": 0.2022310197353363, "learning_rate": 6.0349755420928666e-05, "loss": 0.0109, "step": 9240 }, { "epoch": 12.97335203366059, "grad_norm": 0.253539115190506, "learning_rate": 6.0268858363301105e-05, "loss": 0.0119, "step": 9250 }, { "epoch": 12.987377279102384, "grad_norm": 0.20514161884784698, "learning_rate": 6.018793323100689e-05, "loss": 0.0102, "step": 9260 }, { "epoch": 13.00140252454418, "grad_norm": 0.27027225494384766, "learning_rate": 6.0106980245292255e-05, "loss": 0.01, "step": 9270 }, { "epoch": 13.015427769985974, "grad_norm": 0.39708399772644043, "learning_rate": 6.002599962747957e-05, "loss": 0.0113, "step": 9280 }, { "epoch": 13.02945301542777, "grad_norm": 0.31212082505226135, "learning_rate": 5.994499159896673e-05, "loss": 0.0121, "step": 9290 }, { "epoch": 13.043478260869565, "grad_norm": 0.2223147302865982, "learning_rate": 5.9863956381226607e-05, "loss": 0.0107, "step": 9300 }, { "epoch": 13.05750350631136, "grad_norm": 0.224670872092247, "learning_rate": 5.9782894195806394e-05, "loss": 0.0123, "step": 9310 }, { "epoch": 13.071528751753156, "grad_norm": 0.27533408999443054, "learning_rate": 5.9701805264327004e-05, "loss": 0.0106, "step": 9320 }, { "epoch": 13.085553997194951, "grad_norm": 0.21523049473762512, "learning_rate": 5.96206898084825e-05, "loss": 0.011, "step": 9330 }, { "epoch": 13.099579242636747, "grad_norm": 0.2834705412387848, "learning_rate": 5.953954805003942e-05, "loss": 0.0102, "step": 9340 }, { "epoch": 13.113604488078542, "grad_norm": 0.2904050350189209, "learning_rate": 5.945838021083623e-05, "loss": 0.0104, "step": 9350 }, { "epoch": 13.127629733520337, "grad_norm": 0.3703821003437042, "learning_rate": 5.9377186512782714e-05, "loss": 0.0115, "step": 9360 }, { "epoch": 13.141654978962132, "grad_norm": 0.2685873806476593, "learning_rate": 5.929596717785935e-05, "loss": 0.0113, "step": 9370 }, { "epoch": 13.155680224403927, "grad_norm": 0.23943854868412018, "learning_rate": 5.921472242811668e-05, "loss": 0.0094, "step": 9380 }, { "epoch": 13.169705469845722, "grad_norm": 0.28761741518974304, "learning_rate": 5.913345248567475e-05, "loss": 0.0121, "step": 9390 }, { "epoch": 13.183730715287517, "grad_norm": 0.30681145191192627, "learning_rate": 5.905215757272248e-05, "loss": 0.0098, "step": 9400 }, { "epoch": 13.197755960729312, "grad_norm": 0.3276468813419342, "learning_rate": 5.897083791151706e-05, "loss": 0.0116, "step": 9410 }, { "epoch": 13.211781206171109, "grad_norm": 0.3020120859146118, "learning_rate": 5.888949372438336e-05, "loss": 0.0093, "step": 9420 }, { "epoch": 13.225806451612904, "grad_norm": 0.1698829084634781, "learning_rate": 5.8808125233713255e-05, "loss": 0.0105, "step": 9430 }, { "epoch": 13.239831697054699, "grad_norm": 0.24852901697158813, "learning_rate": 5.872673266196509e-05, "loss": 0.0109, "step": 9440 }, { "epoch": 13.253856942496494, "grad_norm": 0.19102632999420166, "learning_rate": 5.864531623166305e-05, "loss": 0.0103, "step": 9450 }, { "epoch": 13.267882187938289, "grad_norm": 0.15210527181625366, "learning_rate": 5.856387616539656e-05, "loss": 0.0126, "step": 9460 }, { "epoch": 13.281907433380084, "grad_norm": 0.28120526671409607, "learning_rate": 5.848241268581967e-05, "loss": 0.0104, "step": 9470 }, { "epoch": 13.29593267882188, "grad_norm": 0.326600581407547, "learning_rate": 5.840092601565037e-05, "loss": 0.0102, "step": 9480 }, { "epoch": 13.309957924263674, "grad_norm": 0.35304418206214905, "learning_rate": 5.8319416377670144e-05, "loss": 0.0112, "step": 9490 }, { "epoch": 13.32398316970547, "grad_norm": 0.30314168334007263, "learning_rate": 5.82378839947232e-05, "loss": 0.0112, "step": 9500 }, { "epoch": 13.338008415147264, "grad_norm": 0.35311195254325867, "learning_rate": 5.815632908971599e-05, "loss": 0.0124, "step": 9510 }, { "epoch": 13.35203366058906, "grad_norm": 0.1737484335899353, "learning_rate": 5.80747518856165e-05, "loss": 0.0093, "step": 9520 }, { "epoch": 13.366058906030856, "grad_norm": 0.17414887249469757, "learning_rate": 5.799315260545367e-05, "loss": 0.009, "step": 9530 }, { "epoch": 13.380084151472651, "grad_norm": 0.21562929451465607, "learning_rate": 5.791153147231686e-05, "loss": 0.0102, "step": 9540 }, { "epoch": 13.394109396914446, "grad_norm": 0.29961058497428894, "learning_rate": 5.782988870935509e-05, "loss": 0.0113, "step": 9550 }, { "epoch": 13.408134642356242, "grad_norm": 0.26014646887779236, "learning_rate": 5.774822453977657e-05, "loss": 0.0125, "step": 9560 }, { "epoch": 13.422159887798037, "grad_norm": 0.2996726930141449, "learning_rate": 5.7666539186848036e-05, "loss": 0.0114, "step": 9570 }, { "epoch": 13.436185133239832, "grad_norm": 0.24273833632469177, "learning_rate": 5.758483287389411e-05, "loss": 0.0097, "step": 9580 }, { "epoch": 13.450210378681627, "grad_norm": 0.3065846860408783, "learning_rate": 5.7503105824296735e-05, "loss": 0.0129, "step": 9590 }, { "epoch": 13.464235624123422, "grad_norm": 0.17452920973300934, "learning_rate": 5.742135826149453e-05, "loss": 0.0115, "step": 9600 }, { "epoch": 13.478260869565217, "grad_norm": 0.2796613872051239, "learning_rate": 5.7339590408982223e-05, "loss": 0.0095, "step": 9610 }, { "epoch": 13.492286115007012, "grad_norm": 0.16361594200134277, "learning_rate": 5.725780249031e-05, "loss": 0.0114, "step": 9620 }, { "epoch": 13.506311360448809, "grad_norm": 0.16809476912021637, "learning_rate": 5.717599472908292e-05, "loss": 0.0079, "step": 9630 }, { "epoch": 13.520336605890604, "grad_norm": 0.18605032563209534, "learning_rate": 5.7094167348960237e-05, "loss": 0.0103, "step": 9640 }, { "epoch": 13.534361851332399, "grad_norm": 0.3836943805217743, "learning_rate": 5.7012320573654945e-05, "loss": 0.0129, "step": 9650 }, { "epoch": 13.548387096774194, "grad_norm": 0.2775394916534424, "learning_rate": 5.693045462693295e-05, "loss": 0.0123, "step": 9660 }, { "epoch": 13.562412342215989, "grad_norm": 0.2936553955078125, "learning_rate": 5.684856973261266e-05, "loss": 0.0137, "step": 9670 }, { "epoch": 13.576437587657784, "grad_norm": 0.14475058019161224, "learning_rate": 5.6766666114564215e-05, "loss": 0.0092, "step": 9680 }, { "epoch": 13.59046283309958, "grad_norm": 0.21063731610774994, "learning_rate": 5.668474399670899e-05, "loss": 0.0099, "step": 9690 }, { "epoch": 13.604488078541374, "grad_norm": 0.19053435325622559, "learning_rate": 5.660280360301896e-05, "loss": 0.0117, "step": 9700 }, { "epoch": 13.61851332398317, "grad_norm": 0.16197437047958374, "learning_rate": 5.652084515751599e-05, "loss": 0.011, "step": 9710 }, { "epoch": 13.632538569424964, "grad_norm": 0.3279530704021454, "learning_rate": 5.643886888427137e-05, "loss": 0.0126, "step": 9720 }, { "epoch": 13.64656381486676, "grad_norm": 0.21187278628349304, "learning_rate": 5.6356875007405074e-05, "loss": 0.0112, "step": 9730 }, { "epoch": 13.660589060308556, "grad_norm": 0.2788470387458801, "learning_rate": 5.627486375108525e-05, "loss": 0.01, "step": 9740 }, { "epoch": 13.674614305750351, "grad_norm": 0.3709900975227356, "learning_rate": 5.619283533952754e-05, "loss": 0.0106, "step": 9750 }, { "epoch": 13.688639551192146, "grad_norm": 0.3447456657886505, "learning_rate": 5.6110789996994474e-05, "loss": 0.0128, "step": 9760 }, { "epoch": 13.702664796633941, "grad_norm": 0.24220311641693115, "learning_rate": 5.602872794779491e-05, "loss": 0.0101, "step": 9770 }, { "epoch": 13.716690042075736, "grad_norm": 0.2683880627155304, "learning_rate": 5.594664941628334e-05, "loss": 0.0104, "step": 9780 }, { "epoch": 13.730715287517532, "grad_norm": 0.23282812535762787, "learning_rate": 5.5864554626859324e-05, "loss": 0.0088, "step": 9790 }, { "epoch": 13.744740532959327, "grad_norm": 0.1769486963748932, "learning_rate": 5.578244380396691e-05, "loss": 0.01, "step": 9800 }, { "epoch": 13.758765778401122, "grad_norm": 0.22370177507400513, "learning_rate": 5.570031717209394e-05, "loss": 0.0109, "step": 9810 }, { "epoch": 13.772791023842917, "grad_norm": 0.1964796632528305, "learning_rate": 5.561817495577147e-05, "loss": 0.0145, "step": 9820 }, { "epoch": 13.786816269284712, "grad_norm": 0.3304186463356018, "learning_rate": 5.5536017379573215e-05, "loss": 0.0119, "step": 9830 }, { "epoch": 13.800841514726507, "grad_norm": 0.28713250160217285, "learning_rate": 5.545384466811483e-05, "loss": 0.0133, "step": 9840 }, { "epoch": 13.814866760168304, "grad_norm": 0.22271807491779327, "learning_rate": 5.5371657046053384e-05, "loss": 0.0142, "step": 9850 }, { "epoch": 13.828892005610099, "grad_norm": 0.23517246544361115, "learning_rate": 5.528945473808669e-05, "loss": 0.0127, "step": 9860 }, { "epoch": 13.842917251051894, "grad_norm": 0.19365061819553375, "learning_rate": 5.520723796895272e-05, "loss": 0.0108, "step": 9870 }, { "epoch": 13.856942496493689, "grad_norm": 0.22688962519168854, "learning_rate": 5.512500696342897e-05, "loss": 0.0089, "step": 9880 }, { "epoch": 13.870967741935484, "grad_norm": 0.3244739770889282, "learning_rate": 5.504276194633188e-05, "loss": 0.0097, "step": 9890 }, { "epoch": 13.884992987377279, "grad_norm": 0.204489603638649, "learning_rate": 5.49605031425162e-05, "loss": 0.0096, "step": 9900 }, { "epoch": 13.899018232819074, "grad_norm": 0.3114594519138336, "learning_rate": 5.487823077687434e-05, "loss": 0.0096, "step": 9910 }, { "epoch": 13.91304347826087, "grad_norm": 0.20596247911453247, "learning_rate": 5.4795945074335806e-05, "loss": 0.0099, "step": 9920 }, { "epoch": 13.927068723702664, "grad_norm": 0.25572720170021057, "learning_rate": 5.471364625986657e-05, "loss": 0.0098, "step": 9930 }, { "epoch": 13.94109396914446, "grad_norm": 0.19320949912071228, "learning_rate": 5.463133455846845e-05, "loss": 0.011, "step": 9940 }, { "epoch": 13.955119214586254, "grad_norm": 0.33283159136772156, "learning_rate": 5.4549010195178505e-05, "loss": 0.0091, "step": 9950 }, { "epoch": 13.969144460028051, "grad_norm": 0.15693148970603943, "learning_rate": 5.446667339506838e-05, "loss": 0.0129, "step": 9960 }, { "epoch": 13.983169705469846, "grad_norm": 0.22105562686920166, "learning_rate": 5.4384324383243756e-05, "loss": 0.0103, "step": 9970 }, { "epoch": 13.997194950911641, "grad_norm": 0.23915240168571472, "learning_rate": 5.430196338484368e-05, "loss": 0.01, "step": 9980 }, { "epoch": 14.011220196353436, "grad_norm": 0.18690083920955658, "learning_rate": 5.4219590625039975e-05, "loss": 0.0087, "step": 9990 }, { "epoch": 14.025245441795231, "grad_norm": 0.1831246167421341, "learning_rate": 5.413720632903664e-05, "loss": 0.0108, "step": 10000 }, { "epoch": 14.039270687237027, "grad_norm": 0.3131438195705414, "learning_rate": 5.405481072206917e-05, "loss": 0.0104, "step": 10010 }, { "epoch": 14.053295932678822, "grad_norm": 0.1970844864845276, "learning_rate": 5.397240402940402e-05, "loss": 0.0084, "step": 10020 }, { "epoch": 14.067321178120617, "grad_norm": 0.3347682058811188, "learning_rate": 5.388998647633794e-05, "loss": 0.0111, "step": 10030 }, { "epoch": 14.081346423562412, "grad_norm": 0.20134787261486053, "learning_rate": 5.380755828819737e-05, "loss": 0.0106, "step": 10040 }, { "epoch": 14.095371669004207, "grad_norm": 0.2193341702222824, "learning_rate": 5.3725119690337846e-05, "loss": 0.0098, "step": 10050 }, { "epoch": 14.109396914446004, "grad_norm": 0.23165634274482727, "learning_rate": 5.3642670908143324e-05, "loss": 0.01, "step": 10060 }, { "epoch": 14.123422159887799, "grad_norm": 0.18629907071590424, "learning_rate": 5.356021216702562e-05, "loss": 0.0087, "step": 10070 }, { "epoch": 14.137447405329594, "grad_norm": 0.269875168800354, "learning_rate": 5.347774369242381e-05, "loss": 0.0107, "step": 10080 }, { "epoch": 14.151472650771389, "grad_norm": 0.2701401114463806, "learning_rate": 5.3395265709803545e-05, "loss": 0.0126, "step": 10090 }, { "epoch": 14.165497896213184, "grad_norm": 0.8359184861183167, "learning_rate": 5.331277844465647e-05, "loss": 0.0097, "step": 10100 }, { "epoch": 14.179523141654979, "grad_norm": 0.20044049620628357, "learning_rate": 5.323028212249963e-05, "loss": 0.0089, "step": 10110 }, { "epoch": 14.193548387096774, "grad_norm": 0.19982966780662537, "learning_rate": 5.314777696887481e-05, "loss": 0.0096, "step": 10120 }, { "epoch": 14.207573632538569, "grad_norm": 0.5079901218414307, "learning_rate": 5.306526320934796e-05, "loss": 0.0111, "step": 10130 }, { "epoch": 14.221598877980364, "grad_norm": 0.208268404006958, "learning_rate": 5.298274106950854e-05, "loss": 0.0119, "step": 10140 }, { "epoch": 14.23562412342216, "grad_norm": 0.30693739652633667, "learning_rate": 5.290021077496893e-05, "loss": 0.0097, "step": 10150 }, { "epoch": 14.249649368863954, "grad_norm": 0.24054491519927979, "learning_rate": 5.2817672551363816e-05, "loss": 0.009, "step": 10160 }, { "epoch": 14.263674614305751, "grad_norm": 0.3145187795162201, "learning_rate": 5.273512662434952e-05, "loss": 0.0103, "step": 10170 }, { "epoch": 14.277699859747546, "grad_norm": 0.2332286238670349, "learning_rate": 5.265257321960349e-05, "loss": 0.0102, "step": 10180 }, { "epoch": 14.291725105189341, "grad_norm": 0.23023389279842377, "learning_rate": 5.257001256282357e-05, "loss": 0.0108, "step": 10190 }, { "epoch": 14.305750350631136, "grad_norm": 0.25392094254493713, "learning_rate": 5.248744487972742e-05, "loss": 0.0111, "step": 10200 }, { "epoch": 14.319775596072931, "grad_norm": 0.27073585987091064, "learning_rate": 5.240487039605196e-05, "loss": 0.0102, "step": 10210 }, { "epoch": 14.333800841514726, "grad_norm": 0.262440949678421, "learning_rate": 5.232228933755267e-05, "loss": 0.0127, "step": 10220 }, { "epoch": 14.347826086956522, "grad_norm": 0.2948419749736786, "learning_rate": 5.2239701930003006e-05, "loss": 0.0122, "step": 10230 }, { "epoch": 14.361851332398317, "grad_norm": 0.25508829951286316, "learning_rate": 5.215710839919379e-05, "loss": 0.0107, "step": 10240 }, { "epoch": 14.375876577840112, "grad_norm": 0.21171408891677856, "learning_rate": 5.207450897093257e-05, "loss": 0.0097, "step": 10250 }, { "epoch": 14.389901823281907, "grad_norm": 0.21929679811000824, "learning_rate": 5.1991903871043046e-05, "loss": 0.0091, "step": 10260 }, { "epoch": 14.403927068723704, "grad_norm": 0.25920408964157104, "learning_rate": 5.190929332536439e-05, "loss": 0.0125, "step": 10270 }, { "epoch": 14.417952314165499, "grad_norm": 0.16480973362922668, "learning_rate": 5.182667755975071e-05, "loss": 0.0111, "step": 10280 }, { "epoch": 14.431977559607294, "grad_norm": 0.21570950746536255, "learning_rate": 5.1744056800070315e-05, "loss": 0.0103, "step": 10290 }, { "epoch": 14.446002805049089, "grad_norm": 0.31480398774147034, "learning_rate": 5.166143127220524e-05, "loss": 0.0103, "step": 10300 }, { "epoch": 14.460028050490884, "grad_norm": 0.20343366265296936, "learning_rate": 5.1578801202050485e-05, "loss": 0.0112, "step": 10310 }, { "epoch": 14.474053295932679, "grad_norm": 0.2525949776172638, "learning_rate": 5.149616681551355e-05, "loss": 0.0097, "step": 10320 }, { "epoch": 14.488078541374474, "grad_norm": 0.442773699760437, "learning_rate": 5.141352833851367e-05, "loss": 0.0104, "step": 10330 }, { "epoch": 14.502103786816269, "grad_norm": 0.3243388235569, "learning_rate": 5.1330885996981285e-05, "loss": 0.0116, "step": 10340 }, { "epoch": 14.516129032258064, "grad_norm": 0.509351909160614, "learning_rate": 5.124824001685741e-05, "loss": 0.0095, "step": 10350 }, { "epoch": 14.53015427769986, "grad_norm": 0.23085267841815948, "learning_rate": 5.116559062409298e-05, "loss": 0.0128, "step": 10360 }, { "epoch": 14.544179523141654, "grad_norm": 0.29959747195243835, "learning_rate": 5.10829380446483e-05, "loss": 0.0114, "step": 10370 }, { "epoch": 14.558204768583451, "grad_norm": 0.6160489916801453, "learning_rate": 5.100028250449235e-05, "loss": 0.0105, "step": 10380 }, { "epoch": 14.572230014025246, "grad_norm": 0.19416698813438416, "learning_rate": 5.0917624229602234e-05, "loss": 0.0085, "step": 10390 }, { "epoch": 14.586255259467041, "grad_norm": 0.36912187933921814, "learning_rate": 5.0834963445962524e-05, "loss": 0.011, "step": 10400 }, { "epoch": 14.600280504908836, "grad_norm": 0.16205266118049622, "learning_rate": 5.075230037956461e-05, "loss": 0.0095, "step": 10410 }, { "epoch": 14.614305750350631, "grad_norm": 0.18376559019088745, "learning_rate": 5.0669635256406213e-05, "loss": 0.0093, "step": 10420 }, { "epoch": 14.628330995792426, "grad_norm": 0.31053248047828674, "learning_rate": 5.058696830249058e-05, "loss": 0.0073, "step": 10430 }, { "epoch": 14.642356241234221, "grad_norm": 0.18253426253795624, "learning_rate": 5.050429974382602e-05, "loss": 0.0117, "step": 10440 }, { "epoch": 14.656381486676016, "grad_norm": 0.15341848134994507, "learning_rate": 5.042162980642523e-05, "loss": 0.0096, "step": 10450 }, { "epoch": 14.670406732117812, "grad_norm": 0.47582679986953735, "learning_rate": 5.033895871630462e-05, "loss": 0.0104, "step": 10460 }, { "epoch": 14.684431977559607, "grad_norm": 0.19130483269691467, "learning_rate": 5.025628669948386e-05, "loss": 0.0097, "step": 10470 }, { "epoch": 14.698457223001402, "grad_norm": 0.150883287191391, "learning_rate": 5.017361398198502e-05, "loss": 0.0098, "step": 10480 }, { "epoch": 14.712482468443199, "grad_norm": 0.16283394396305084, "learning_rate": 5.009094078983221e-05, "loss": 0.0093, "step": 10490 }, { "epoch": 14.726507713884994, "grad_norm": 0.26326578855514526, "learning_rate": 5.000826734905073e-05, "loss": 0.009, "step": 10500 }, { "epoch": 14.740532959326789, "grad_norm": 0.26455122232437134, "learning_rate": 4.9925593885666645e-05, "loss": 0.0093, "step": 10510 }, { "epoch": 14.754558204768584, "grad_norm": 0.2943791449069977, "learning_rate": 4.984292062570602e-05, "loss": 0.0104, "step": 10520 }, { "epoch": 14.768583450210379, "grad_norm": 0.2905474305152893, "learning_rate": 4.976024779519442e-05, "loss": 0.0097, "step": 10530 }, { "epoch": 14.782608695652174, "grad_norm": 0.20852775871753693, "learning_rate": 4.9677575620156194e-05, "loss": 0.0096, "step": 10540 }, { "epoch": 14.796633941093969, "grad_norm": 0.1885327398777008, "learning_rate": 4.959490432661391e-05, "loss": 0.0097, "step": 10550 }, { "epoch": 14.810659186535764, "grad_norm": 0.20167937874794006, "learning_rate": 4.9512234140587726e-05, "loss": 0.0117, "step": 10560 }, { "epoch": 14.824684431977559, "grad_norm": 0.22184917330741882, "learning_rate": 4.942956528809477e-05, "loss": 0.0092, "step": 10570 }, { "epoch": 14.838709677419354, "grad_norm": 0.23452156782150269, "learning_rate": 4.934689799514854e-05, "loss": 0.0103, "step": 10580 }, { "epoch": 14.85273492286115, "grad_norm": 0.3102039098739624, "learning_rate": 4.926423248775827e-05, "loss": 0.0088, "step": 10590 }, { "epoch": 14.866760168302946, "grad_norm": 0.1670072376728058, "learning_rate": 4.918156899192826e-05, "loss": 0.0105, "step": 10600 }, { "epoch": 14.880785413744741, "grad_norm": 0.33866289258003235, "learning_rate": 4.909890773365738e-05, "loss": 0.011, "step": 10610 }, { "epoch": 14.894810659186536, "grad_norm": 0.19566893577575684, "learning_rate": 4.9016248938938344e-05, "loss": 0.0119, "step": 10620 }, { "epoch": 14.908835904628331, "grad_norm": 0.21510820090770721, "learning_rate": 4.8933592833757156e-05, "loss": 0.014, "step": 10630 }, { "epoch": 14.922861150070126, "grad_norm": 0.2446950525045395, "learning_rate": 4.8850939644092435e-05, "loss": 0.009, "step": 10640 }, { "epoch": 14.936886395511921, "grad_norm": 0.23482680320739746, "learning_rate": 4.876828959591485e-05, "loss": 0.0114, "step": 10650 }, { "epoch": 14.950911640953716, "grad_norm": 0.23506605625152588, "learning_rate": 4.8685642915186474e-05, "loss": 0.0088, "step": 10660 }, { "epoch": 14.964936886395511, "grad_norm": 0.1803177446126938, "learning_rate": 4.860299982786018e-05, "loss": 0.01, "step": 10670 }, { "epoch": 14.978962131837307, "grad_norm": 0.22896519303321838, "learning_rate": 4.852036055987901e-05, "loss": 0.009, "step": 10680 }, { "epoch": 14.992987377279102, "grad_norm": 0.24402359127998352, "learning_rate": 4.843772533717558e-05, "loss": 0.0086, "step": 10690 }, { "epoch": 15.007012622720898, "grad_norm": 0.199191153049469, "learning_rate": 4.835509438567142e-05, "loss": 0.0091, "step": 10700 }, { "epoch": 15.021037868162693, "grad_norm": 0.2728492319583893, "learning_rate": 4.827246793127639e-05, "loss": 0.0112, "step": 10710 }, { "epoch": 15.035063113604489, "grad_norm": 0.21238826215267181, "learning_rate": 4.818984619988807e-05, "loss": 0.008, "step": 10720 }, { "epoch": 15.049088359046284, "grad_norm": 0.2199954241514206, "learning_rate": 4.810722941739115e-05, "loss": 0.0106, "step": 10730 }, { "epoch": 15.063113604488079, "grad_norm": 0.2482915222644806, "learning_rate": 4.8024617809656684e-05, "loss": 0.0103, "step": 10740 }, { "epoch": 15.077138849929874, "grad_norm": 0.22265440225601196, "learning_rate": 4.794201160254171e-05, "loss": 0.0096, "step": 10750 }, { "epoch": 15.091164095371669, "grad_norm": 0.3280293047428131, "learning_rate": 4.785941102188844e-05, "loss": 0.01, "step": 10760 }, { "epoch": 15.105189340813464, "grad_norm": 0.35659781098365784, "learning_rate": 4.7776816293523686e-05, "loss": 0.0102, "step": 10770 }, { "epoch": 15.119214586255259, "grad_norm": 0.24240602552890778, "learning_rate": 4.769422764325832e-05, "loss": 0.0103, "step": 10780 }, { "epoch": 15.133239831697054, "grad_norm": 0.28238245844841003, "learning_rate": 4.76116452968865e-05, "loss": 0.0088, "step": 10790 }, { "epoch": 15.147265077138849, "grad_norm": 0.13241586089134216, "learning_rate": 4.752906948018525e-05, "loss": 0.0121, "step": 10800 }, { "epoch": 15.161290322580646, "grad_norm": 0.18027766048908234, "learning_rate": 4.7446500418913684e-05, "loss": 0.0106, "step": 10810 }, { "epoch": 15.175315568022441, "grad_norm": 0.28974977135658264, "learning_rate": 4.736393833881247e-05, "loss": 0.0083, "step": 10820 }, { "epoch": 15.189340813464236, "grad_norm": 0.18056179583072662, "learning_rate": 4.7281383465603194e-05, "loss": 0.0098, "step": 10830 }, { "epoch": 15.203366058906031, "grad_norm": 0.16724443435668945, "learning_rate": 4.71988360249877e-05, "loss": 0.0095, "step": 10840 }, { "epoch": 15.217391304347826, "grad_norm": 0.23898473381996155, "learning_rate": 4.7116296242647554e-05, "loss": 0.0092, "step": 10850 }, { "epoch": 15.231416549789621, "grad_norm": 0.33602017164230347, "learning_rate": 4.703376434424336e-05, "loss": 0.0089, "step": 10860 }, { "epoch": 15.245441795231416, "grad_norm": 0.20115704834461212, "learning_rate": 4.695124055541421e-05, "loss": 0.0106, "step": 10870 }, { "epoch": 15.259467040673211, "grad_norm": 0.21478301286697388, "learning_rate": 4.6868725101776934e-05, "loss": 0.0103, "step": 10880 }, { "epoch": 15.273492286115006, "grad_norm": 0.1635810285806656, "learning_rate": 4.678621820892567e-05, "loss": 0.0095, "step": 10890 }, { "epoch": 15.287517531556801, "grad_norm": 0.1828705221414566, "learning_rate": 4.670372010243111e-05, "loss": 0.0104, "step": 10900 }, { "epoch": 15.301542776998598, "grad_norm": 0.18363599479198456, "learning_rate": 4.662123100783992e-05, "loss": 0.0081, "step": 10910 }, { "epoch": 15.315568022440393, "grad_norm": 0.14420855045318604, "learning_rate": 4.653875115067415e-05, "loss": 0.0098, "step": 10920 }, { "epoch": 15.329593267882188, "grad_norm": 0.20186762511730194, "learning_rate": 4.6456280756430545e-05, "loss": 0.0108, "step": 10930 }, { "epoch": 15.343618513323984, "grad_norm": 0.23671378195285797, "learning_rate": 4.637382005058004e-05, "loss": 0.009, "step": 10940 }, { "epoch": 15.357643758765779, "grad_norm": 0.32644569873809814, "learning_rate": 4.629136925856705e-05, "loss": 0.0096, "step": 10950 }, { "epoch": 15.371669004207574, "grad_norm": 0.2668977975845337, "learning_rate": 4.6208928605808895e-05, "loss": 0.0075, "step": 10960 }, { "epoch": 15.385694249649369, "grad_norm": 0.20776093006134033, "learning_rate": 4.612649831769519e-05, "loss": 0.0083, "step": 10970 }, { "epoch": 15.399719495091164, "grad_norm": 0.19142881035804749, "learning_rate": 4.604407861958715e-05, "loss": 0.0082, "step": 10980 }, { "epoch": 15.413744740532959, "grad_norm": 0.19776487350463867, "learning_rate": 4.5961669736817114e-05, "loss": 0.0094, "step": 10990 }, { "epoch": 15.427769985974754, "grad_norm": 0.27646464109420776, "learning_rate": 4.5879271894687814e-05, "loss": 0.0098, "step": 11000 }, { "epoch": 15.441795231416549, "grad_norm": 0.15912969410419464, "learning_rate": 4.5796885318471826e-05, "loss": 0.0101, "step": 11010 }, { "epoch": 15.455820476858346, "grad_norm": 0.16419053077697754, "learning_rate": 4.571451023341086e-05, "loss": 0.009, "step": 11020 }, { "epoch": 15.46984572230014, "grad_norm": 0.19846050441265106, "learning_rate": 4.563214686471527e-05, "loss": 0.0097, "step": 11030 }, { "epoch": 15.483870967741936, "grad_norm": 0.3794410824775696, "learning_rate": 4.5549795437563365e-05, "loss": 0.0104, "step": 11040 }, { "epoch": 15.497896213183731, "grad_norm": 0.13084083795547485, "learning_rate": 4.546745617710081e-05, "loss": 0.008, "step": 11050 }, { "epoch": 15.511921458625526, "grad_norm": 0.2968774735927582, "learning_rate": 4.5385129308440014e-05, "loss": 0.0111, "step": 11060 }, { "epoch": 15.525946704067321, "grad_norm": 0.23118312656879425, "learning_rate": 4.530281505665944e-05, "loss": 0.0092, "step": 11070 }, { "epoch": 15.539971949509116, "grad_norm": 0.1832834929227829, "learning_rate": 4.5220513646803134e-05, "loss": 0.009, "step": 11080 }, { "epoch": 15.553997194950911, "grad_norm": 0.1981266885995865, "learning_rate": 4.513822530388003e-05, "loss": 0.0112, "step": 11090 }, { "epoch": 15.568022440392706, "grad_norm": 0.18244324624538422, "learning_rate": 4.5055950252863296e-05, "loss": 0.0097, "step": 11100 }, { "epoch": 15.582047685834501, "grad_norm": 0.22269488871097565, "learning_rate": 4.4973688718689803e-05, "loss": 0.0091, "step": 11110 }, { "epoch": 15.596072931276296, "grad_norm": 0.20695935189723969, "learning_rate": 4.4891440926259406e-05, "loss": 0.009, "step": 11120 }, { "epoch": 15.610098176718093, "grad_norm": 0.2235296070575714, "learning_rate": 4.480920710043443e-05, "loss": 0.0083, "step": 11130 }, { "epoch": 15.624123422159888, "grad_norm": 0.23440764844417572, "learning_rate": 4.4726987466039044e-05, "loss": 0.009, "step": 11140 }, { "epoch": 15.638148667601683, "grad_norm": 0.1440567821264267, "learning_rate": 4.46447822478586e-05, "loss": 0.0098, "step": 11150 }, { "epoch": 15.652173913043478, "grad_norm": 0.15349259972572327, "learning_rate": 4.4562591670638974e-05, "loss": 0.0118, "step": 11160 }, { "epoch": 15.666199158485274, "grad_norm": 0.16608506441116333, "learning_rate": 4.4480415959086105e-05, "loss": 0.0076, "step": 11170 }, { "epoch": 15.680224403927069, "grad_norm": 0.18879686295986176, "learning_rate": 4.439825533786522e-05, "loss": 0.0073, "step": 11180 }, { "epoch": 15.694249649368864, "grad_norm": 0.17607560753822327, "learning_rate": 4.431611003160035e-05, "loss": 0.0098, "step": 11190 }, { "epoch": 15.708274894810659, "grad_norm": 0.3354603052139282, "learning_rate": 4.4233980264873636e-05, "loss": 0.008, "step": 11200 }, { "epoch": 15.722300140252454, "grad_norm": 0.11948952823877335, "learning_rate": 4.4151866262224684e-05, "loss": 0.0098, "step": 11210 }, { "epoch": 15.736325385694249, "grad_norm": 0.5539141297340393, "learning_rate": 4.406976824815006e-05, "loss": 0.0118, "step": 11220 }, { "epoch": 15.750350631136044, "grad_norm": 0.23144300282001495, "learning_rate": 4.3987686447102595e-05, "loss": 0.0081, "step": 11230 }, { "epoch": 15.76437587657784, "grad_norm": 0.21100854873657227, "learning_rate": 4.3905621083490804e-05, "loss": 0.0093, "step": 11240 }, { "epoch": 15.778401122019636, "grad_norm": 0.19652733206748962, "learning_rate": 4.3823572381678286e-05, "loss": 0.0098, "step": 11250 }, { "epoch": 15.792426367461431, "grad_norm": 0.27094849944114685, "learning_rate": 4.374154056598301e-05, "loss": 0.0083, "step": 11260 }, { "epoch": 15.806451612903226, "grad_norm": 0.22976556420326233, "learning_rate": 4.3659525860676845e-05, "loss": 0.0114, "step": 11270 }, { "epoch": 15.820476858345021, "grad_norm": 0.18868762254714966, "learning_rate": 4.3577528489984854e-05, "loss": 0.0118, "step": 11280 }, { "epoch": 15.834502103786816, "grad_norm": 0.1960749328136444, "learning_rate": 4.349554867808476e-05, "loss": 0.0084, "step": 11290 }, { "epoch": 15.848527349228611, "grad_norm": 0.22553664445877075, "learning_rate": 4.34135866491062e-05, "loss": 0.008, "step": 11300 }, { "epoch": 15.862552594670406, "grad_norm": 0.20170699059963226, "learning_rate": 4.333164262713022e-05, "loss": 0.0099, "step": 11310 }, { "epoch": 15.876577840112201, "grad_norm": 0.19314607977867126, "learning_rate": 4.324971683618868e-05, "loss": 0.0116, "step": 11320 }, { "epoch": 15.890603085553996, "grad_norm": 0.1966048777103424, "learning_rate": 4.316780950026354e-05, "loss": 0.0078, "step": 11330 }, { "epoch": 15.904628330995793, "grad_norm": 0.24035994708538055, "learning_rate": 4.308592084328637e-05, "loss": 0.0098, "step": 11340 }, { "epoch": 15.918653576437588, "grad_norm": 0.19576811790466309, "learning_rate": 4.3004051089137576e-05, "loss": 0.0104, "step": 11350 }, { "epoch": 15.932678821879383, "grad_norm": 0.26939424872398376, "learning_rate": 4.292220046164597e-05, "loss": 0.01, "step": 11360 }, { "epoch": 15.946704067321178, "grad_norm": 0.16203773021697998, "learning_rate": 4.2840369184588035e-05, "loss": 0.0073, "step": 11370 }, { "epoch": 15.960729312762973, "grad_norm": 0.09849650412797928, "learning_rate": 4.2758557481687345e-05, "loss": 0.0084, "step": 11380 }, { "epoch": 15.974754558204769, "grad_norm": 0.23517513275146484, "learning_rate": 4.267676557661403e-05, "loss": 0.0093, "step": 11390 }, { "epoch": 15.988779803646564, "grad_norm": 0.21171016991138458, "learning_rate": 4.2594993692983955e-05, "loss": 0.0098, "step": 11400 }, { "epoch": 16.00280504908836, "grad_norm": 0.21014121174812317, "learning_rate": 4.251324205435837e-05, "loss": 0.0084, "step": 11410 }, { "epoch": 16.016830294530155, "grad_norm": 0.19628487527370453, "learning_rate": 4.243151088424312e-05, "loss": 0.0092, "step": 11420 }, { "epoch": 16.03085553997195, "grad_norm": 0.29743191599845886, "learning_rate": 4.234980040608813e-05, "loss": 0.0119, "step": 11430 }, { "epoch": 16.044880785413746, "grad_norm": 0.38979285955429077, "learning_rate": 4.22681108432867e-05, "loss": 0.0116, "step": 11440 }, { "epoch": 16.05890603085554, "grad_norm": 0.29463547468185425, "learning_rate": 4.2186442419174984e-05, "loss": 0.0088, "step": 11450 }, { "epoch": 16.072931276297336, "grad_norm": 0.1423317939043045, "learning_rate": 4.210479535703133e-05, "loss": 0.008, "step": 11460 }, { "epoch": 16.08695652173913, "grad_norm": 0.18004333972930908, "learning_rate": 4.202316988007567e-05, "loss": 0.0083, "step": 11470 }, { "epoch": 16.100981767180926, "grad_norm": 0.2123323380947113, "learning_rate": 4.194156621146901e-05, "loss": 0.0108, "step": 11480 }, { "epoch": 16.11500701262272, "grad_norm": 0.3256523907184601, "learning_rate": 4.1859984574312596e-05, "loss": 0.009, "step": 11490 }, { "epoch": 16.129032258064516, "grad_norm": 0.569152295589447, "learning_rate": 4.177842519164752e-05, "loss": 0.0077, "step": 11500 }, { "epoch": 16.143057503506313, "grad_norm": 0.1363021582365036, "learning_rate": 4.169688828645404e-05, "loss": 0.0107, "step": 11510 }, { "epoch": 16.157082748948106, "grad_norm": 0.3934583067893982, "learning_rate": 4.161537408165092e-05, "loss": 0.0092, "step": 11520 }, { "epoch": 16.171107994389903, "grad_norm": 0.22411219775676727, "learning_rate": 4.1533882800094924e-05, "loss": 0.0094, "step": 11530 }, { "epoch": 16.185133239831696, "grad_norm": 0.16756853461265564, "learning_rate": 4.145241466458005e-05, "loss": 0.0097, "step": 11540 }, { "epoch": 16.199158485273493, "grad_norm": 0.1403542459011078, "learning_rate": 4.13709698978371e-05, "loss": 0.0097, "step": 11550 }, { "epoch": 16.213183730715286, "grad_norm": 0.2111264318227768, "learning_rate": 4.1289548722532944e-05, "loss": 0.0088, "step": 11560 }, { "epoch": 16.227208976157083, "grad_norm": 0.15159165859222412, "learning_rate": 4.120815136126999e-05, "loss": 0.0109, "step": 11570 }, { "epoch": 16.241234221598877, "grad_norm": 0.23566585779190063, "learning_rate": 4.112677803658548e-05, "loss": 0.0093, "step": 11580 }, { "epoch": 16.255259467040673, "grad_norm": 0.20175804197788239, "learning_rate": 4.1045428970951e-05, "loss": 0.0085, "step": 11590 }, { "epoch": 16.269284712482467, "grad_norm": 0.2879919707775116, "learning_rate": 4.0964104386771785e-05, "loss": 0.0084, "step": 11600 }, { "epoch": 16.283309957924264, "grad_norm": 0.1662510186433792, "learning_rate": 4.0882804506386144e-05, "loss": 0.0083, "step": 11610 }, { "epoch": 16.29733520336606, "grad_norm": 0.32710763812065125, "learning_rate": 4.080152955206485e-05, "loss": 0.0086, "step": 11620 }, { "epoch": 16.311360448807854, "grad_norm": 0.2743833065032959, "learning_rate": 4.0720279746010505e-05, "loss": 0.0116, "step": 11630 }, { "epoch": 16.32538569424965, "grad_norm": 0.2028830647468567, "learning_rate": 4.063905531035699e-05, "loss": 0.0075, "step": 11640 }, { "epoch": 16.339410939691444, "grad_norm": 0.22976793348789215, "learning_rate": 4.055785646716882e-05, "loss": 0.0088, "step": 11650 }, { "epoch": 16.35343618513324, "grad_norm": 0.30104291439056396, "learning_rate": 4.047668343844051e-05, "loss": 0.0088, "step": 11660 }, { "epoch": 16.367461430575034, "grad_norm": 0.3315045237541199, "learning_rate": 4.039553644609604e-05, "loss": 0.0097, "step": 11670 }, { "epoch": 16.38148667601683, "grad_norm": 0.21908925473690033, "learning_rate": 4.0314415711988176e-05, "loss": 0.0077, "step": 11680 }, { "epoch": 16.395511921458624, "grad_norm": 0.2245943397283554, "learning_rate": 4.023332145789792e-05, "loss": 0.0084, "step": 11690 }, { "epoch": 16.40953716690042, "grad_norm": 0.16967611014842987, "learning_rate": 4.015225390553385e-05, "loss": 0.0099, "step": 11700 }, { "epoch": 16.423562412342218, "grad_norm": 0.20481960475444794, "learning_rate": 4.007121327653158e-05, "loss": 0.012, "step": 11710 }, { "epoch": 16.43758765778401, "grad_norm": 0.1941106766462326, "learning_rate": 3.9990199792453064e-05, "loss": 0.0093, "step": 11720 }, { "epoch": 16.451612903225808, "grad_norm": 0.2124784141778946, "learning_rate": 3.9909213674786103e-05, "loss": 0.0108, "step": 11730 }, { "epoch": 16.4656381486676, "grad_norm": 0.20621457695960999, "learning_rate": 3.982825514494363e-05, "loss": 0.0075, "step": 11740 }, { "epoch": 16.479663394109398, "grad_norm": 0.18255221843719482, "learning_rate": 3.974732442426319e-05, "loss": 0.0092, "step": 11750 }, { "epoch": 16.49368863955119, "grad_norm": 0.23338167369365692, "learning_rate": 3.966642173400629e-05, "loss": 0.0085, "step": 11760 }, { "epoch": 16.507713884992988, "grad_norm": 0.18127726018428802, "learning_rate": 3.9585547295357764e-05, "loss": 0.0089, "step": 11770 }, { "epoch": 16.52173913043478, "grad_norm": 0.18836653232574463, "learning_rate": 3.950470132942526e-05, "loss": 0.0081, "step": 11780 }, { "epoch": 16.535764375876578, "grad_norm": 0.2228149026632309, "learning_rate": 3.942388405723856e-05, "loss": 0.0105, "step": 11790 }, { "epoch": 16.54978962131837, "grad_norm": 0.26212188601493835, "learning_rate": 3.9343095699749e-05, "loss": 0.0075, "step": 11800 }, { "epoch": 16.56381486676017, "grad_norm": 0.22024713456630707, "learning_rate": 3.9262336477828874e-05, "loss": 0.0116, "step": 11810 }, { "epoch": 16.577840112201965, "grad_norm": 0.2278200089931488, "learning_rate": 3.9181606612270794e-05, "loss": 0.0078, "step": 11820 }, { "epoch": 16.59186535764376, "grad_norm": 0.2341119945049286, "learning_rate": 3.910090632378713e-05, "loss": 0.0098, "step": 11830 }, { "epoch": 16.605890603085555, "grad_norm": 0.16306468844413757, "learning_rate": 3.90202358330094e-05, "loss": 0.0094, "step": 11840 }, { "epoch": 16.61991584852735, "grad_norm": 0.2287190854549408, "learning_rate": 3.8939595360487656e-05, "loss": 0.0085, "step": 11850 }, { "epoch": 16.633941093969145, "grad_norm": 0.32766738533973694, "learning_rate": 3.885898512668984e-05, "loss": 0.0116, "step": 11860 }, { "epoch": 16.64796633941094, "grad_norm": 0.24914959073066711, "learning_rate": 3.877840535200127e-05, "loss": 0.0105, "step": 11870 }, { "epoch": 16.661991584852736, "grad_norm": 0.42789769172668457, "learning_rate": 3.869785625672397e-05, "loss": 0.0108, "step": 11880 }, { "epoch": 16.67601683029453, "grad_norm": 0.24434919655323029, "learning_rate": 3.8617338061076094e-05, "loss": 0.0084, "step": 11890 }, { "epoch": 16.690042075736326, "grad_norm": 0.236898273229599, "learning_rate": 3.853685098519132e-05, "loss": 0.0101, "step": 11900 }, { "epoch": 16.70406732117812, "grad_norm": 0.27654486894607544, "learning_rate": 3.845639524911823e-05, "loss": 0.0131, "step": 11910 }, { "epoch": 16.718092566619916, "grad_norm": 0.15202286839485168, "learning_rate": 3.837597107281974e-05, "loss": 0.0098, "step": 11920 }, { "epoch": 16.732117812061713, "grad_norm": 0.22438643872737885, "learning_rate": 3.829557867617247e-05, "loss": 0.0086, "step": 11930 }, { "epoch": 16.746143057503506, "grad_norm": 0.21500657498836517, "learning_rate": 3.821521827896618e-05, "loss": 0.0087, "step": 11940 }, { "epoch": 16.760168302945303, "grad_norm": 0.27082496881484985, "learning_rate": 3.81348901009031e-05, "loss": 0.0092, "step": 11950 }, { "epoch": 16.774193548387096, "grad_norm": 0.3330731689929962, "learning_rate": 3.805459436159741e-05, "loss": 0.0121, "step": 11960 }, { "epoch": 16.788218793828893, "grad_norm": 0.20970261096954346, "learning_rate": 3.797433128057461e-05, "loss": 0.0094, "step": 11970 }, { "epoch": 16.802244039270686, "grad_norm": 0.21010342240333557, "learning_rate": 3.789410107727089e-05, "loss": 0.0093, "step": 11980 }, { "epoch": 16.816269284712483, "grad_norm": 0.21578454971313477, "learning_rate": 3.781390397103257e-05, "loss": 0.0135, "step": 11990 }, { "epoch": 16.830294530154276, "grad_norm": 0.3264386057853699, "learning_rate": 3.7733740181115455e-05, "loss": 0.0086, "step": 12000 }, { "epoch": 16.844319775596073, "grad_norm": 0.24093574285507202, "learning_rate": 3.7653609926684306e-05, "loss": 0.01, "step": 12010 }, { "epoch": 16.858345021037866, "grad_norm": 0.17956684529781342, "learning_rate": 3.757351342681217e-05, "loss": 0.0119, "step": 12020 }, { "epoch": 16.872370266479663, "grad_norm": 0.16682109236717224, "learning_rate": 3.749345090047982e-05, "loss": 0.0105, "step": 12030 }, { "epoch": 16.88639551192146, "grad_norm": 0.1870986670255661, "learning_rate": 3.741342256657515e-05, "loss": 0.0092, "step": 12040 }, { "epoch": 16.900420757363253, "grad_norm": 0.3824915289878845, "learning_rate": 3.7333428643892567e-05, "loss": 0.0113, "step": 12050 }, { "epoch": 16.91444600280505, "grad_norm": 0.17323283851146698, "learning_rate": 3.725346935113239e-05, "loss": 0.0083, "step": 12060 }, { "epoch": 16.928471248246844, "grad_norm": 0.18815870583057404, "learning_rate": 3.717354490690029e-05, "loss": 0.0097, "step": 12070 }, { "epoch": 16.94249649368864, "grad_norm": 0.15187601745128632, "learning_rate": 3.709365552970664e-05, "loss": 0.0093, "step": 12080 }, { "epoch": 16.956521739130434, "grad_norm": 0.3310275971889496, "learning_rate": 3.7013801437965945e-05, "loss": 0.0085, "step": 12090 }, { "epoch": 16.97054698457223, "grad_norm": 0.23427362740039825, "learning_rate": 3.693398284999623e-05, "loss": 0.0084, "step": 12100 }, { "epoch": 16.984572230014024, "grad_norm": 0.1855204850435257, "learning_rate": 3.6854199984018484e-05, "loss": 0.0086, "step": 12110 }, { "epoch": 16.99859747545582, "grad_norm": 0.3212810754776001, "learning_rate": 3.677445305815601e-05, "loss": 0.0085, "step": 12120 }, { "epoch": 17.012622720897614, "grad_norm": 0.2827611267566681, "learning_rate": 3.669474229043387e-05, "loss": 0.0101, "step": 12130 }, { "epoch": 17.02664796633941, "grad_norm": 0.21065554022789001, "learning_rate": 3.6615067898778235e-05, "loss": 0.0091, "step": 12140 }, { "epoch": 17.040673211781208, "grad_norm": 0.2140539288520813, "learning_rate": 3.6535430101015866e-05, "loss": 0.0072, "step": 12150 }, { "epoch": 17.054698457223, "grad_norm": 0.15207549929618835, "learning_rate": 3.645582911487345e-05, "loss": 0.0075, "step": 12160 }, { "epoch": 17.068723702664798, "grad_norm": 0.1793794184923172, "learning_rate": 3.637626515797706e-05, "loss": 0.0104, "step": 12170 }, { "epoch": 17.08274894810659, "grad_norm": 0.17299926280975342, "learning_rate": 3.629673844785152e-05, "loss": 0.0097, "step": 12180 }, { "epoch": 17.096774193548388, "grad_norm": 0.22457382082939148, "learning_rate": 3.621724920191979e-05, "loss": 0.0081, "step": 12190 }, { "epoch": 17.11079943899018, "grad_norm": 0.152699813246727, "learning_rate": 3.6137797637502444e-05, "loss": 0.007, "step": 12200 }, { "epoch": 17.124824684431978, "grad_norm": 0.16693679988384247, "learning_rate": 3.6058383971817035e-05, "loss": 0.0095, "step": 12210 }, { "epoch": 17.13884992987377, "grad_norm": 0.21770069003105164, "learning_rate": 3.59790084219775e-05, "loss": 0.0087, "step": 12220 }, { "epoch": 17.152875175315568, "grad_norm": 0.2980039715766907, "learning_rate": 3.589967120499353e-05, "loss": 0.0072, "step": 12230 }, { "epoch": 17.166900420757365, "grad_norm": 0.20654623210430145, "learning_rate": 3.5820372537770075e-05, "loss": 0.0091, "step": 12240 }, { "epoch": 17.18092566619916, "grad_norm": 0.2156088501214981, "learning_rate": 3.5741112637106655e-05, "loss": 0.0084, "step": 12250 }, { "epoch": 17.194950911640955, "grad_norm": 0.20457704365253448, "learning_rate": 3.5661891719696804e-05, "loss": 0.0095, "step": 12260 }, { "epoch": 17.20897615708275, "grad_norm": 0.6470947861671448, "learning_rate": 3.5582710002127504e-05, "loss": 0.0089, "step": 12270 }, { "epoch": 17.223001402524545, "grad_norm": 0.29136642813682556, "learning_rate": 3.550356770087853e-05, "loss": 0.0107, "step": 12280 }, { "epoch": 17.23702664796634, "grad_norm": 0.2701510787010193, "learning_rate": 3.5424465032321914e-05, "loss": 0.0092, "step": 12290 }, { "epoch": 17.251051893408135, "grad_norm": 0.14231176674365997, "learning_rate": 3.5345402212721335e-05, "loss": 0.0088, "step": 12300 }, { "epoch": 17.26507713884993, "grad_norm": 0.18041643500328064, "learning_rate": 3.526637945823152e-05, "loss": 0.0091, "step": 12310 }, { "epoch": 17.279102384291726, "grad_norm": 0.15924806892871857, "learning_rate": 3.518739698489767e-05, "loss": 0.0097, "step": 12320 }, { "epoch": 17.29312762973352, "grad_norm": 0.16786174476146698, "learning_rate": 3.510845500865485e-05, "loss": 0.0075, "step": 12330 }, { "epoch": 17.307152875175316, "grad_norm": 0.3544633984565735, "learning_rate": 3.502955374532739e-05, "loss": 0.0093, "step": 12340 }, { "epoch": 17.321178120617112, "grad_norm": 0.18536585569381714, "learning_rate": 3.495069341062836e-05, "loss": 0.0098, "step": 12350 }, { "epoch": 17.335203366058906, "grad_norm": 0.2172236442565918, "learning_rate": 3.4871874220158896e-05, "loss": 0.011, "step": 12360 }, { "epoch": 17.349228611500703, "grad_norm": 0.26288652420043945, "learning_rate": 3.479309638940762e-05, "loss": 0.0094, "step": 12370 }, { "epoch": 17.363253856942496, "grad_norm": 0.2748238146305084, "learning_rate": 3.4714360133750146e-05, "loss": 0.01, "step": 12380 }, { "epoch": 17.377279102384293, "grad_norm": 0.21954891085624695, "learning_rate": 3.463566566844839e-05, "loss": 0.01, "step": 12390 }, { "epoch": 17.391304347826086, "grad_norm": 0.2408241182565689, "learning_rate": 3.4557013208650016e-05, "loss": 0.0084, "step": 12400 }, { "epoch": 17.405329593267883, "grad_norm": 0.1899658441543579, "learning_rate": 3.4478402969387857e-05, "loss": 0.0095, "step": 12410 }, { "epoch": 17.419354838709676, "grad_norm": 0.14918318390846252, "learning_rate": 3.4399835165579266e-05, "loss": 0.0084, "step": 12420 }, { "epoch": 17.433380084151473, "grad_norm": 0.29653528332710266, "learning_rate": 3.4321310012025645e-05, "loss": 0.0083, "step": 12430 }, { "epoch": 17.447405329593266, "grad_norm": 0.32442429661750793, "learning_rate": 3.424282772341176e-05, "loss": 0.0069, "step": 12440 }, { "epoch": 17.461430575035063, "grad_norm": 0.17963483929634094, "learning_rate": 3.416438851430519e-05, "loss": 0.0079, "step": 12450 }, { "epoch": 17.47545582047686, "grad_norm": 0.363046258687973, "learning_rate": 3.408599259915577e-05, "loss": 0.0149, "step": 12460 }, { "epoch": 17.489481065918653, "grad_norm": 0.23502252995967865, "learning_rate": 3.400764019229487e-05, "loss": 0.0089, "step": 12470 }, { "epoch": 17.50350631136045, "grad_norm": 0.23930932581424713, "learning_rate": 3.3929331507935035e-05, "loss": 0.0114, "step": 12480 }, { "epoch": 17.517531556802243, "grad_norm": 0.1738836020231247, "learning_rate": 3.3851066760169196e-05, "loss": 0.0074, "step": 12490 }, { "epoch": 17.53155680224404, "grad_norm": 0.2641429007053375, "learning_rate": 3.377284616297021e-05, "loss": 0.0076, "step": 12500 }, { "epoch": 17.545582047685834, "grad_norm": 0.19689929485321045, "learning_rate": 3.3694669930190166e-05, "loss": 0.0083, "step": 12510 }, { "epoch": 17.55960729312763, "grad_norm": 0.24558936059474945, "learning_rate": 3.36165382755599e-05, "loss": 0.0071, "step": 12520 }, { "epoch": 17.573632538569424, "grad_norm": 0.1259496957063675, "learning_rate": 3.35384514126884e-05, "loss": 0.0072, "step": 12530 }, { "epoch": 17.58765778401122, "grad_norm": 0.18422572314739227, "learning_rate": 3.3460409555062154e-05, "loss": 0.0109, "step": 12540 }, { "epoch": 17.601683029453014, "grad_norm": 0.16148732602596283, "learning_rate": 3.3382412916044645e-05, "loss": 0.0082, "step": 12550 }, { "epoch": 17.61570827489481, "grad_norm": 0.1914682239294052, "learning_rate": 3.330446170887566e-05, "loss": 0.0082, "step": 12560 }, { "epoch": 17.629733520336607, "grad_norm": 0.1489109992980957, "learning_rate": 3.3226556146670834e-05, "loss": 0.007, "step": 12570 }, { "epoch": 17.6437587657784, "grad_norm": 0.2028045356273651, "learning_rate": 3.314869644242102e-05, "loss": 0.0079, "step": 12580 }, { "epoch": 17.657784011220198, "grad_norm": 0.2968466281890869, "learning_rate": 3.3070882808991674e-05, "loss": 0.0082, "step": 12590 }, { "epoch": 17.67180925666199, "grad_norm": 0.19305787980556488, "learning_rate": 3.2993115459122305e-05, "loss": 0.0079, "step": 12600 }, { "epoch": 17.685834502103788, "grad_norm": 0.26344314217567444, "learning_rate": 3.2915394605425835e-05, "loss": 0.0084, "step": 12610 }, { "epoch": 17.69985974754558, "grad_norm": 0.2045954018831253, "learning_rate": 3.283772046038816e-05, "loss": 0.0086, "step": 12620 }, { "epoch": 17.713884992987378, "grad_norm": 0.20698758959770203, "learning_rate": 3.276009323636739e-05, "loss": 0.009, "step": 12630 }, { "epoch": 17.72791023842917, "grad_norm": 0.22774313390254974, "learning_rate": 3.268251314559344e-05, "loss": 0.0074, "step": 12640 }, { "epoch": 17.741935483870968, "grad_norm": 0.13738803565502167, "learning_rate": 3.2604980400167254e-05, "loss": 0.0079, "step": 12650 }, { "epoch": 17.75596072931276, "grad_norm": 0.17301692068576813, "learning_rate": 3.252749521206042e-05, "loss": 0.0077, "step": 12660 }, { "epoch": 17.769985974754558, "grad_norm": 0.12385193258523941, "learning_rate": 3.2450057793114494e-05, "loss": 0.0067, "step": 12670 }, { "epoch": 17.784011220196355, "grad_norm": 0.3599059283733368, "learning_rate": 3.2372668355040435e-05, "loss": 0.0066, "step": 12680 }, { "epoch": 17.79803646563815, "grad_norm": 0.1869467943906784, "learning_rate": 3.2295327109418005e-05, "loss": 0.0079, "step": 12690 }, { "epoch": 17.812061711079945, "grad_norm": 0.25779691338539124, "learning_rate": 3.221803426769518e-05, "loss": 0.0095, "step": 12700 }, { "epoch": 17.82608695652174, "grad_norm": 0.18111944198608398, "learning_rate": 3.214079004118768e-05, "loss": 0.0067, "step": 12710 }, { "epoch": 17.840112201963535, "grad_norm": 0.20176877081394196, "learning_rate": 3.2063594641078234e-05, "loss": 0.0074, "step": 12720 }, { "epoch": 17.85413744740533, "grad_norm": 0.1939600706100464, "learning_rate": 3.198644827841616e-05, "loss": 0.0087, "step": 12730 }, { "epoch": 17.868162692847125, "grad_norm": 0.2841373682022095, "learning_rate": 3.1909351164116654e-05, "loss": 0.0079, "step": 12740 }, { "epoch": 17.88218793828892, "grad_norm": 0.27916449308395386, "learning_rate": 3.183230350896026e-05, "loss": 0.0072, "step": 12750 }, { "epoch": 17.896213183730715, "grad_norm": 0.46009543538093567, "learning_rate": 3.1755305523592337e-05, "loss": 0.0075, "step": 12760 }, { "epoch": 17.91023842917251, "grad_norm": 0.2023129016160965, "learning_rate": 3.167835741852245e-05, "loss": 0.0086, "step": 12770 }, { "epoch": 17.924263674614306, "grad_norm": 0.1823815405368805, "learning_rate": 3.160145940412378e-05, "loss": 0.0078, "step": 12780 }, { "epoch": 17.938288920056102, "grad_norm": 0.1983368992805481, "learning_rate": 3.1524611690632545e-05, "loss": 0.0088, "step": 12790 }, { "epoch": 17.952314165497896, "grad_norm": 0.16487516462802887, "learning_rate": 3.144781448814746e-05, "loss": 0.0079, "step": 12800 }, { "epoch": 17.966339410939693, "grad_norm": 0.13761094212532043, "learning_rate": 3.1371068006629145e-05, "loss": 0.0081, "step": 12810 }, { "epoch": 17.980364656381486, "grad_norm": 0.2635535001754761, "learning_rate": 3.129437245589956e-05, "loss": 0.0075, "step": 12820 }, { "epoch": 17.994389901823283, "grad_norm": 0.17309655249118805, "learning_rate": 3.121772804564143e-05, "loss": 0.0087, "step": 12830 }, { "epoch": 18.008415147265076, "grad_norm": 0.11762574315071106, "learning_rate": 3.11411349853976e-05, "loss": 0.0073, "step": 12840 }, { "epoch": 18.022440392706873, "grad_norm": 0.1560138761997223, "learning_rate": 3.10645934845706e-05, "loss": 0.0091, "step": 12850 }, { "epoch": 18.036465638148666, "grad_norm": 0.14150527119636536, "learning_rate": 3.098810375242196e-05, "loss": 0.0071, "step": 12860 }, { "epoch": 18.050490883590463, "grad_norm": 0.22092695534229279, "learning_rate": 3.0911665998071704e-05, "loss": 0.0094, "step": 12870 }, { "epoch": 18.06451612903226, "grad_norm": 0.13987942039966583, "learning_rate": 3.083528043049774e-05, "loss": 0.0079, "step": 12880 }, { "epoch": 18.078541374474053, "grad_norm": 0.38499996066093445, "learning_rate": 3.0758947258535255e-05, "loss": 0.0079, "step": 12890 }, { "epoch": 18.09256661991585, "grad_norm": 0.2036462426185608, "learning_rate": 3.068266669087625e-05, "loss": 0.0078, "step": 12900 }, { "epoch": 18.106591865357643, "grad_norm": 0.19706113636493683, "learning_rate": 3.060643893606887e-05, "loss": 0.0081, "step": 12910 }, { "epoch": 18.12061711079944, "grad_norm": 0.1700439751148224, "learning_rate": 3.053026420251693e-05, "loss": 0.0086, "step": 12920 }, { "epoch": 18.134642356241233, "grad_norm": 0.1688614785671234, "learning_rate": 3.0454142698479183e-05, "loss": 0.0075, "step": 12930 }, { "epoch": 18.14866760168303, "grad_norm": 0.1469232141971588, "learning_rate": 3.0378074632068954e-05, "loss": 0.0081, "step": 12940 }, { "epoch": 18.162692847124823, "grad_norm": 0.19596649706363678, "learning_rate": 3.0302060211253408e-05, "loss": 0.0086, "step": 12950 }, { "epoch": 18.17671809256662, "grad_norm": 0.16602858901023865, "learning_rate": 3.0226099643853073e-05, "loss": 0.0077, "step": 12960 }, { "epoch": 18.190743338008414, "grad_norm": 0.14598309993743896, "learning_rate": 3.0150193137541283e-05, "loss": 0.0069, "step": 12970 }, { "epoch": 18.20476858345021, "grad_norm": 0.12520602345466614, "learning_rate": 3.0074340899843467e-05, "loss": 0.0084, "step": 12980 }, { "epoch": 18.218793828892007, "grad_norm": 0.16699177026748657, "learning_rate": 2.999854313813677e-05, "loss": 0.0084, "step": 12990 }, { "epoch": 18.2328190743338, "grad_norm": 0.20742231607437134, "learning_rate": 2.9922800059649382e-05, "loss": 0.011, "step": 13000 }, { "epoch": 18.246844319775597, "grad_norm": 0.16405554115772247, "learning_rate": 2.9847111871459976e-05, "loss": 0.0093, "step": 13010 }, { "epoch": 18.26086956521739, "grad_norm": 0.18367408215999603, "learning_rate": 2.977147878049721e-05, "loss": 0.0079, "step": 13020 }, { "epoch": 18.274894810659188, "grad_norm": 0.1830630600452423, "learning_rate": 2.9695900993539006e-05, "loss": 0.0087, "step": 13030 }, { "epoch": 18.28892005610098, "grad_norm": 0.17824967205524445, "learning_rate": 2.9620378717212183e-05, "loss": 0.0076, "step": 13040 }, { "epoch": 18.302945301542778, "grad_norm": 0.1837596893310547, "learning_rate": 2.9544912157991745e-05, "loss": 0.0113, "step": 13050 }, { "epoch": 18.31697054698457, "grad_norm": 0.19367700815200806, "learning_rate": 2.9469501522200405e-05, "loss": 0.0098, "step": 13060 }, { "epoch": 18.330995792426368, "grad_norm": 0.26784583926200867, "learning_rate": 2.9394147016007946e-05, "loss": 0.0085, "step": 13070 }, { "epoch": 18.34502103786816, "grad_norm": 0.19442759454250336, "learning_rate": 2.9318848845430702e-05, "loss": 0.0075, "step": 13080 }, { "epoch": 18.359046283309958, "grad_norm": 0.227598175406456, "learning_rate": 2.9243607216331013e-05, "loss": 0.0067, "step": 13090 }, { "epoch": 18.373071528751755, "grad_norm": 0.22267691791057587, "learning_rate": 2.916842233441661e-05, "loss": 0.0076, "step": 13100 }, { "epoch": 18.387096774193548, "grad_norm": 0.19828003644943237, "learning_rate": 2.90932944052401e-05, "loss": 0.006, "step": 13110 }, { "epoch": 18.401122019635345, "grad_norm": 0.1883724331855774, "learning_rate": 2.9018223634198354e-05, "loss": 0.0075, "step": 13120 }, { "epoch": 18.415147265077138, "grad_norm": 0.15983474254608154, "learning_rate": 2.8943210226532025e-05, "loss": 0.0082, "step": 13130 }, { "epoch": 18.429172510518935, "grad_norm": 0.23428162932395935, "learning_rate": 2.8868254387324857e-05, "loss": 0.0094, "step": 13140 }, { "epoch": 18.44319775596073, "grad_norm": 0.16369260847568512, "learning_rate": 2.8793356321503306e-05, "loss": 0.0093, "step": 13150 }, { "epoch": 18.457223001402525, "grad_norm": 0.15149912238121033, "learning_rate": 2.87185162338358e-05, "loss": 0.0075, "step": 13160 }, { "epoch": 18.47124824684432, "grad_norm": 0.16858907043933868, "learning_rate": 2.8643734328932253e-05, "loss": 0.0109, "step": 13170 }, { "epoch": 18.485273492286115, "grad_norm": 0.2064996063709259, "learning_rate": 2.856901081124359e-05, "loss": 0.0078, "step": 13180 }, { "epoch": 18.49929873772791, "grad_norm": 0.17161284387111664, "learning_rate": 2.8494345885061002e-05, "loss": 0.0083, "step": 13190 }, { "epoch": 18.513323983169705, "grad_norm": 0.20072968304157257, "learning_rate": 2.8419739754515616e-05, "loss": 0.009, "step": 13200 }, { "epoch": 18.527349228611502, "grad_norm": 0.28852778673171997, "learning_rate": 2.8345192623577666e-05, "loss": 0.0086, "step": 13210 }, { "epoch": 18.541374474053296, "grad_norm": 0.17270149290561676, "learning_rate": 2.8270704696056193e-05, "loss": 0.0085, "step": 13220 }, { "epoch": 18.555399719495092, "grad_norm": 0.19832520186901093, "learning_rate": 2.8196276175598367e-05, "loss": 0.0094, "step": 13230 }, { "epoch": 18.569424964936886, "grad_norm": 0.26108789443969727, "learning_rate": 2.8121907265688884e-05, "loss": 0.009, "step": 13240 }, { "epoch": 18.583450210378682, "grad_norm": 0.15912993252277374, "learning_rate": 2.804759816964957e-05, "loss": 0.011, "step": 13250 }, { "epoch": 18.597475455820476, "grad_norm": 0.19230160117149353, "learning_rate": 2.797334909063857e-05, "loss": 0.0082, "step": 13260 }, { "epoch": 18.611500701262273, "grad_norm": 0.17714239656925201, "learning_rate": 2.7899160231650056e-05, "loss": 0.008, "step": 13270 }, { "epoch": 18.625525946704066, "grad_norm": 0.18130367994308472, "learning_rate": 2.7825031795513585e-05, "loss": 0.0076, "step": 13280 }, { "epoch": 18.639551192145863, "grad_norm": 0.19681017100811005, "learning_rate": 2.775096398489341e-05, "loss": 0.0066, "step": 13290 }, { "epoch": 18.65357643758766, "grad_norm": 0.1091277152299881, "learning_rate": 2.7676957002288163e-05, "loss": 0.0062, "step": 13300 }, { "epoch": 18.667601683029453, "grad_norm": 0.29744064807891846, "learning_rate": 2.760301105003003e-05, "loss": 0.0076, "step": 13310 }, { "epoch": 18.68162692847125, "grad_norm": 0.3051035702228546, "learning_rate": 2.752912633028446e-05, "loss": 0.009, "step": 13320 }, { "epoch": 18.695652173913043, "grad_norm": 0.2553929388523102, "learning_rate": 2.7455303045049474e-05, "loss": 0.0069, "step": 13330 }, { "epoch": 18.70967741935484, "grad_norm": 0.19709406793117523, "learning_rate": 2.7381541396155098e-05, "loss": 0.0078, "step": 13340 }, { "epoch": 18.723702664796633, "grad_norm": 0.15913183987140656, "learning_rate": 2.730784158526286e-05, "loss": 0.0073, "step": 13350 }, { "epoch": 18.73772791023843, "grad_norm": 0.2333352416753769, "learning_rate": 2.723420381386521e-05, "loss": 0.0081, "step": 13360 }, { "epoch": 18.751753155680223, "grad_norm": 0.3027023673057556, "learning_rate": 2.7160628283285018e-05, "loss": 0.0071, "step": 13370 }, { "epoch": 18.76577840112202, "grad_norm": 0.25991368293762207, "learning_rate": 2.7087115194675007e-05, "loss": 0.0081, "step": 13380 }, { "epoch": 18.779803646563813, "grad_norm": 0.16294807195663452, "learning_rate": 2.701366474901712e-05, "loss": 0.0081, "step": 13390 }, { "epoch": 18.79382889200561, "grad_norm": 0.17931318283081055, "learning_rate": 2.6940277147122085e-05, "loss": 0.0064, "step": 13400 }, { "epoch": 18.807854137447407, "grad_norm": 0.14474117755889893, "learning_rate": 2.686695258962878e-05, "loss": 0.0074, "step": 13410 }, { "epoch": 18.8218793828892, "grad_norm": 0.4246097803115845, "learning_rate": 2.679369127700375e-05, "loss": 0.0064, "step": 13420 }, { "epoch": 18.835904628330997, "grad_norm": 0.20291243493556976, "learning_rate": 2.672049340954067e-05, "loss": 0.0062, "step": 13430 }, { "epoch": 18.84992987377279, "grad_norm": 0.2238149791955948, "learning_rate": 2.6647359187359676e-05, "loss": 0.0081, "step": 13440 }, { "epoch": 18.863955119214587, "grad_norm": 0.563483476638794, "learning_rate": 2.6574288810406946e-05, "loss": 0.0106, "step": 13450 }, { "epoch": 18.87798036465638, "grad_norm": 0.2689625322818756, "learning_rate": 2.6501282478454083e-05, "loss": 0.0062, "step": 13460 }, { "epoch": 18.892005610098177, "grad_norm": 0.18220138549804688, "learning_rate": 2.6428340391097618e-05, "loss": 0.009, "step": 13470 }, { "epoch": 18.90603085553997, "grad_norm": 0.13106879591941833, "learning_rate": 2.6355462747758485e-05, "loss": 0.0104, "step": 13480 }, { "epoch": 18.920056100981768, "grad_norm": 0.14439129829406738, "learning_rate": 2.6282649747681304e-05, "loss": 0.0071, "step": 13490 }, { "epoch": 18.93408134642356, "grad_norm": 0.14339104294776917, "learning_rate": 2.620990158993406e-05, "loss": 0.007, "step": 13500 }, { "epoch": 18.948106591865358, "grad_norm": 0.2263033092021942, "learning_rate": 2.6137218473407477e-05, "loss": 0.0082, "step": 13510 }, { "epoch": 18.962131837307155, "grad_norm": 0.16331800818443298, "learning_rate": 2.606460059681436e-05, "loss": 0.007, "step": 13520 }, { "epoch": 18.976157082748948, "grad_norm": 0.16371309757232666, "learning_rate": 2.599204815868928e-05, "loss": 0.0073, "step": 13530 }, { "epoch": 18.990182328190745, "grad_norm": 0.1515677273273468, "learning_rate": 2.5919561357387756e-05, "loss": 0.0063, "step": 13540 }, { "epoch": 19.004207573632538, "grad_norm": 0.12874668836593628, "learning_rate": 2.5847140391085972e-05, "loss": 0.0071, "step": 13550 }, { "epoch": 19.018232819074335, "grad_norm": 0.166687473654747, "learning_rate": 2.5774785457780103e-05, "loss": 0.0068, "step": 13560 }, { "epoch": 19.032258064516128, "grad_norm": 0.13182882964611053, "learning_rate": 2.5702496755285753e-05, "loss": 0.0082, "step": 13570 }, { "epoch": 19.046283309957925, "grad_norm": 0.24023155868053436, "learning_rate": 2.5630274481237483e-05, "loss": 0.0088, "step": 13580 }, { "epoch": 19.06030855539972, "grad_norm": 0.18903714418411255, "learning_rate": 2.5558118833088197e-05, "loss": 0.0071, "step": 13590 }, { "epoch": 19.074333800841515, "grad_norm": 0.172989621758461, "learning_rate": 2.548603000810872e-05, "loss": 0.007, "step": 13600 }, { "epoch": 19.08835904628331, "grad_norm": 0.17091317474842072, "learning_rate": 2.5414008203387152e-05, "loss": 0.0075, "step": 13610 }, { "epoch": 19.102384291725105, "grad_norm": 0.12304986268281937, "learning_rate": 2.534205361582834e-05, "loss": 0.0075, "step": 13620 }, { "epoch": 19.116409537166902, "grad_norm": 0.18321725726127625, "learning_rate": 2.527016644215338e-05, "loss": 0.0066, "step": 13630 }, { "epoch": 19.130434782608695, "grad_norm": 0.2508983910083771, "learning_rate": 2.519834687889905e-05, "loss": 0.0084, "step": 13640 }, { "epoch": 19.144460028050492, "grad_norm": 0.16794417798519135, "learning_rate": 2.5126595122417295e-05, "loss": 0.0071, "step": 13650 }, { "epoch": 19.158485273492285, "grad_norm": 0.1970127820968628, "learning_rate": 2.5054911368874713e-05, "loss": 0.0073, "step": 13660 }, { "epoch": 19.172510518934082, "grad_norm": 0.11517162621021271, "learning_rate": 2.4983295814251916e-05, "loss": 0.0064, "step": 13670 }, { "epoch": 19.186535764375876, "grad_norm": 0.12732157111167908, "learning_rate": 2.4911748654343105e-05, "loss": 0.0068, "step": 13680 }, { "epoch": 19.200561009817672, "grad_norm": 0.22805818915367126, "learning_rate": 2.4840270084755463e-05, "loss": 0.0101, "step": 13690 }, { "epoch": 19.214586255259466, "grad_norm": 0.19011084735393524, "learning_rate": 2.4768860300908685e-05, "loss": 0.0069, "step": 13700 }, { "epoch": 19.228611500701263, "grad_norm": 0.1539604514837265, "learning_rate": 2.469751949803443e-05, "loss": 0.0074, "step": 13710 }, { "epoch": 19.242636746143056, "grad_norm": 0.1273416429758072, "learning_rate": 2.4626247871175666e-05, "loss": 0.0091, "step": 13720 }, { "epoch": 19.256661991584853, "grad_norm": 0.13454587757587433, "learning_rate": 2.4555045615186346e-05, "loss": 0.0077, "step": 13730 }, { "epoch": 19.27068723702665, "grad_norm": 0.25041866302490234, "learning_rate": 2.4483912924730677e-05, "loss": 0.007, "step": 13740 }, { "epoch": 19.284712482468443, "grad_norm": 0.14507070183753967, "learning_rate": 2.4412849994282742e-05, "loss": 0.0088, "step": 13750 }, { "epoch": 19.29873772791024, "grad_norm": 0.24760393798351288, "learning_rate": 2.434185701812592e-05, "loss": 0.007, "step": 13760 }, { "epoch": 19.312762973352033, "grad_norm": 0.24445763230323792, "learning_rate": 2.4270934190352218e-05, "loss": 0.0071, "step": 13770 }, { "epoch": 19.32678821879383, "grad_norm": 0.2080240696668625, "learning_rate": 2.4200081704861998e-05, "loss": 0.0067, "step": 13780 }, { "epoch": 19.340813464235623, "grad_norm": 0.20751109719276428, "learning_rate": 2.412929975536321e-05, "loss": 0.0099, "step": 13790 }, { "epoch": 19.35483870967742, "grad_norm": 0.16393770277500153, "learning_rate": 2.4058588535371017e-05, "loss": 0.0063, "step": 13800 }, { "epoch": 19.368863955119213, "grad_norm": 0.3477884829044342, "learning_rate": 2.3987948238207243e-05, "loss": 0.0072, "step": 13810 }, { "epoch": 19.38288920056101, "grad_norm": 0.2302197366952896, "learning_rate": 2.3917379056999678e-05, "loss": 0.0069, "step": 13820 }, { "epoch": 19.396914446002803, "grad_norm": 0.37617552280426025, "learning_rate": 2.3846881184681824e-05, "loss": 0.0074, "step": 13830 }, { "epoch": 19.4109396914446, "grad_norm": 0.07922963798046112, "learning_rate": 2.377645481399214e-05, "loss": 0.0074, "step": 13840 }, { "epoch": 19.424964936886397, "grad_norm": 0.14312949776649475, "learning_rate": 2.3706100137473667e-05, "loss": 0.0054, "step": 13850 }, { "epoch": 19.43899018232819, "grad_norm": 0.2295207530260086, "learning_rate": 2.3635817347473394e-05, "loss": 0.0076, "step": 13860 }, { "epoch": 19.453015427769987, "grad_norm": 0.15568916499614716, "learning_rate": 2.3565606636141757e-05, "loss": 0.0091, "step": 13870 }, { "epoch": 19.46704067321178, "grad_norm": 0.15703842043876648, "learning_rate": 2.3495468195432203e-05, "loss": 0.0081, "step": 13880 }, { "epoch": 19.481065918653577, "grad_norm": 0.16539020836353302, "learning_rate": 2.3425402217100507e-05, "loss": 0.0071, "step": 13890 }, { "epoch": 19.49509116409537, "grad_norm": 0.16811136901378632, "learning_rate": 2.3355408892704424e-05, "loss": 0.0064, "step": 13900 }, { "epoch": 19.509116409537167, "grad_norm": 0.14770881831645966, "learning_rate": 2.3285488413603003e-05, "loss": 0.0069, "step": 13910 }, { "epoch": 19.52314165497896, "grad_norm": 0.09896960109472275, "learning_rate": 2.321564097095615e-05, "loss": 0.0057, "step": 13920 }, { "epoch": 19.537166900420758, "grad_norm": 0.12984874844551086, "learning_rate": 2.3145866755724142e-05, "loss": 0.0058, "step": 13930 }, { "epoch": 19.55119214586255, "grad_norm": 0.1452949196100235, "learning_rate": 2.307616595866699e-05, "loss": 0.0069, "step": 13940 }, { "epoch": 19.565217391304348, "grad_norm": 0.11993955075740814, "learning_rate": 2.3006538770344032e-05, "loss": 0.0076, "step": 13950 }, { "epoch": 19.579242636746145, "grad_norm": 0.372013658285141, "learning_rate": 2.293698538111334e-05, "loss": 0.0082, "step": 13960 }, { "epoch": 19.593267882187938, "grad_norm": 0.1610584557056427, "learning_rate": 2.28675059811312e-05, "loss": 0.0063, "step": 13970 }, { "epoch": 19.607293127629735, "grad_norm": 0.28237685561180115, "learning_rate": 2.279810076035167e-05, "loss": 0.0072, "step": 13980 }, { "epoch": 19.621318373071528, "grad_norm": 0.27008605003356934, "learning_rate": 2.272876990852596e-05, "loss": 0.0092, "step": 13990 }, { "epoch": 19.635343618513325, "grad_norm": 0.15301020443439484, "learning_rate": 2.265951361520195e-05, "loss": 0.0072, "step": 14000 }, { "epoch": 19.649368863955118, "grad_norm": 0.29237642884254456, "learning_rate": 2.2590332069723748e-05, "loss": 0.0103, "step": 14010 }, { "epoch": 19.663394109396915, "grad_norm": 0.19152340292930603, "learning_rate": 2.2521225461231004e-05, "loss": 0.0075, "step": 14020 }, { "epoch": 19.677419354838708, "grad_norm": 0.17558008432388306, "learning_rate": 2.2452193978658597e-05, "loss": 0.0071, "step": 14030 }, { "epoch": 19.691444600280505, "grad_norm": 0.25225555896759033, "learning_rate": 2.238323781073594e-05, "loss": 0.006, "step": 14040 }, { "epoch": 19.7054698457223, "grad_norm": 0.14680759608745575, "learning_rate": 2.2314357145986552e-05, "loss": 0.0075, "step": 14050 }, { "epoch": 19.719495091164095, "grad_norm": 0.2358541637659073, "learning_rate": 2.224555217272757e-05, "loss": 0.007, "step": 14060 }, { "epoch": 19.733520336605892, "grad_norm": 0.10504786670207977, "learning_rate": 2.2176823079069127e-05, "loss": 0.0098, "step": 14070 }, { "epoch": 19.747545582047685, "grad_norm": 0.28144943714141846, "learning_rate": 2.210817005291398e-05, "loss": 0.0077, "step": 14080 }, { "epoch": 19.761570827489482, "grad_norm": 0.14446254074573517, "learning_rate": 2.203959328195686e-05, "loss": 0.0064, "step": 14090 }, { "epoch": 19.775596072931275, "grad_norm": 0.30048227310180664, "learning_rate": 2.1971092953684026e-05, "loss": 0.0071, "step": 14100 }, { "epoch": 19.789621318373072, "grad_norm": 0.21808528900146484, "learning_rate": 2.1902669255372788e-05, "loss": 0.0074, "step": 14110 }, { "epoch": 19.803646563814866, "grad_norm": 0.27822932600975037, "learning_rate": 2.1834322374090897e-05, "loss": 0.0066, "step": 14120 }, { "epoch": 19.817671809256662, "grad_norm": 0.3172879219055176, "learning_rate": 2.1766052496696153e-05, "loss": 0.0069, "step": 14130 }, { "epoch": 19.831697054698456, "grad_norm": 0.28466466069221497, "learning_rate": 2.169785980983577e-05, "loss": 0.0062, "step": 14140 }, { "epoch": 19.845722300140253, "grad_norm": 0.11556751281023026, "learning_rate": 2.162974449994593e-05, "loss": 0.007, "step": 14150 }, { "epoch": 19.85974754558205, "grad_norm": 0.1840214878320694, "learning_rate": 2.1561706753251337e-05, "loss": 0.007, "step": 14160 }, { "epoch": 19.873772791023843, "grad_norm": 0.10644552111625671, "learning_rate": 2.1493746755764544e-05, "loss": 0.0078, "step": 14170 }, { "epoch": 19.88779803646564, "grad_norm": 0.16542315483093262, "learning_rate": 2.1425864693285635e-05, "loss": 0.0072, "step": 14180 }, { "epoch": 19.901823281907433, "grad_norm": 0.30109381675720215, "learning_rate": 2.1358060751401547e-05, "loss": 0.0081, "step": 14190 }, { "epoch": 19.91584852734923, "grad_norm": 0.225279301404953, "learning_rate": 2.129033511548566e-05, "loss": 0.0076, "step": 14200 }, { "epoch": 19.929873772791023, "grad_norm": 0.2283535748720169, "learning_rate": 2.1222687970697315e-05, "loss": 0.0076, "step": 14210 }, { "epoch": 19.94389901823282, "grad_norm": 0.1772790104150772, "learning_rate": 2.1155119501981173e-05, "loss": 0.0055, "step": 14220 }, { "epoch": 19.957924263674613, "grad_norm": 0.22033703327178955, "learning_rate": 2.1087629894066895e-05, "loss": 0.0073, "step": 14230 }, { "epoch": 19.97194950911641, "grad_norm": 0.2238875925540924, "learning_rate": 2.1020219331468473e-05, "loss": 0.0069, "step": 14240 }, { "epoch": 19.985974754558203, "grad_norm": 0.24747996032238007, "learning_rate": 2.095288799848379e-05, "loss": 0.0082, "step": 14250 }, { "epoch": 20.0, "grad_norm": 0.23594938218593597, "learning_rate": 2.088563607919417e-05, "loss": 0.0083, "step": 14260 }, { "epoch": 20.014025245441797, "grad_norm": 0.15593110024929047, "learning_rate": 2.0818463757463786e-05, "loss": 0.0076, "step": 14270 }, { "epoch": 20.02805049088359, "grad_norm": 0.12507948279380798, "learning_rate": 2.0751371216939175e-05, "loss": 0.0068, "step": 14280 }, { "epoch": 20.042075736325387, "grad_norm": 0.30134910345077515, "learning_rate": 2.068435864104882e-05, "loss": 0.0078, "step": 14290 }, { "epoch": 20.05610098176718, "grad_norm": 0.39370736479759216, "learning_rate": 2.0617426213002506e-05, "loss": 0.0068, "step": 14300 }, { "epoch": 20.070126227208977, "grad_norm": 0.24477088451385498, "learning_rate": 2.055057411579097e-05, "loss": 0.0097, "step": 14310 }, { "epoch": 20.08415147265077, "grad_norm": 0.18073125183582306, "learning_rate": 2.0483802532185286e-05, "loss": 0.007, "step": 14320 }, { "epoch": 20.098176718092567, "grad_norm": 0.16628111898899078, "learning_rate": 2.041711164473638e-05, "loss": 0.0067, "step": 14330 }, { "epoch": 20.11220196353436, "grad_norm": 0.2154167741537094, "learning_rate": 2.0350501635774637e-05, "loss": 0.0064, "step": 14340 }, { "epoch": 20.126227208976157, "grad_norm": 0.1076335459947586, "learning_rate": 2.0283972687409247e-05, "loss": 0.0082, "step": 14350 }, { "epoch": 20.14025245441795, "grad_norm": 0.19302693009376526, "learning_rate": 2.021752498152784e-05, "loss": 0.0077, "step": 14360 }, { "epoch": 20.154277699859747, "grad_norm": 0.25136733055114746, "learning_rate": 2.015115869979589e-05, "loss": 0.0065, "step": 14370 }, { "epoch": 20.168302945301544, "grad_norm": 0.17806677520275116, "learning_rate": 2.0084874023656265e-05, "loss": 0.0047, "step": 14380 }, { "epoch": 20.182328190743338, "grad_norm": 0.13459396362304688, "learning_rate": 2.001867113432877e-05, "loss": 0.0058, "step": 14390 }, { "epoch": 20.196353436185134, "grad_norm": 0.19269058108329773, "learning_rate": 1.995255021280954e-05, "loss": 0.0061, "step": 14400 }, { "epoch": 20.210378681626928, "grad_norm": 0.16255083680152893, "learning_rate": 1.9886511439870688e-05, "loss": 0.0085, "step": 14410 }, { "epoch": 20.224403927068725, "grad_norm": 0.1907542496919632, "learning_rate": 1.9820554996059675e-05, "loss": 0.0066, "step": 14420 }, { "epoch": 20.238429172510518, "grad_norm": 0.19394327700138092, "learning_rate": 1.9754681061698893e-05, "loss": 0.0086, "step": 14430 }, { "epoch": 20.252454417952315, "grad_norm": 0.18283472955226898, "learning_rate": 1.9688889816885185e-05, "loss": 0.0059, "step": 14440 }, { "epoch": 20.266479663394108, "grad_norm": 0.10845647752285004, "learning_rate": 1.962318144148928e-05, "loss": 0.006, "step": 14450 }, { "epoch": 20.280504908835905, "grad_norm": 0.13018639385700226, "learning_rate": 1.955755611515539e-05, "loss": 0.0071, "step": 14460 }, { "epoch": 20.294530154277698, "grad_norm": 0.27787429094314575, "learning_rate": 1.9492014017300642e-05, "loss": 0.0077, "step": 14470 }, { "epoch": 20.308555399719495, "grad_norm": 0.26059773564338684, "learning_rate": 1.942655532711461e-05, "loss": 0.008, "step": 14480 }, { "epoch": 20.322580645161292, "grad_norm": 0.19327989220619202, "learning_rate": 1.9361180223558882e-05, "loss": 0.0072, "step": 14490 }, { "epoch": 20.336605890603085, "grad_norm": 0.1103653609752655, "learning_rate": 1.929588888536647e-05, "loss": 0.0067, "step": 14500 }, { "epoch": 20.350631136044882, "grad_norm": 0.18493402004241943, "learning_rate": 1.9230681491041425e-05, "loss": 0.0071, "step": 14510 }, { "epoch": 20.364656381486675, "grad_norm": 0.16320565342903137, "learning_rate": 1.9165558218858264e-05, "loss": 0.0067, "step": 14520 }, { "epoch": 20.378681626928472, "grad_norm": 0.1366550475358963, "learning_rate": 1.9100519246861505e-05, "loss": 0.006, "step": 14530 }, { "epoch": 20.392706872370265, "grad_norm": 0.1936933398246765, "learning_rate": 1.9035564752865248e-05, "loss": 0.0073, "step": 14540 }, { "epoch": 20.406732117812062, "grad_norm": 0.1470431536436081, "learning_rate": 1.897069491445258e-05, "loss": 0.0067, "step": 14550 }, { "epoch": 20.420757363253855, "grad_norm": 0.14992178976535797, "learning_rate": 1.890590990897515e-05, "loss": 0.0074, "step": 14560 }, { "epoch": 20.434782608695652, "grad_norm": 0.11784632503986359, "learning_rate": 1.884120991355272e-05, "loss": 0.0069, "step": 14570 }, { "epoch": 20.44880785413745, "grad_norm": 0.21369604766368866, "learning_rate": 1.8776595105072576e-05, "loss": 0.0076, "step": 14580 }, { "epoch": 20.462833099579242, "grad_norm": 0.2703285813331604, "learning_rate": 1.8712065660189166e-05, "loss": 0.0071, "step": 14590 }, { "epoch": 20.47685834502104, "grad_norm": 0.16866883635520935, "learning_rate": 1.8647621755323513e-05, "loss": 0.0065, "step": 14600 }, { "epoch": 20.490883590462833, "grad_norm": 0.19836434721946716, "learning_rate": 1.858326356666278e-05, "loss": 0.0064, "step": 14610 }, { "epoch": 20.50490883590463, "grad_norm": 0.27467232942581177, "learning_rate": 1.851899127015983e-05, "loss": 0.0075, "step": 14620 }, { "epoch": 20.518934081346423, "grad_norm": 0.13578802347183228, "learning_rate": 1.8454805041532626e-05, "loss": 0.0058, "step": 14630 }, { "epoch": 20.53295932678822, "grad_norm": 0.16281990706920624, "learning_rate": 1.8390705056263906e-05, "loss": 0.0082, "step": 14640 }, { "epoch": 20.546984572230013, "grad_norm": 0.16111886501312256, "learning_rate": 1.832669148960057e-05, "loss": 0.0073, "step": 14650 }, { "epoch": 20.56100981767181, "grad_norm": 0.14917431771755219, "learning_rate": 1.8262764516553233e-05, "loss": 0.0046, "step": 14660 }, { "epoch": 20.575035063113603, "grad_norm": 0.18216311931610107, "learning_rate": 1.8198924311895843e-05, "loss": 0.0105, "step": 14670 }, { "epoch": 20.5890603085554, "grad_norm": 0.21832795441150665, "learning_rate": 1.813517105016505e-05, "loss": 0.0062, "step": 14680 }, { "epoch": 20.603085553997197, "grad_norm": 0.1526041328907013, "learning_rate": 1.8071504905659888e-05, "loss": 0.0076, "step": 14690 }, { "epoch": 20.61711079943899, "grad_norm": 0.17138662934303284, "learning_rate": 1.800792605244109e-05, "loss": 0.0071, "step": 14700 }, { "epoch": 20.631136044880787, "grad_norm": 0.23822371661663055, "learning_rate": 1.7944434664330844e-05, "loss": 0.0068, "step": 14710 }, { "epoch": 20.64516129032258, "grad_norm": 0.15203668177127838, "learning_rate": 1.7881030914912212e-05, "loss": 0.0056, "step": 14720 }, { "epoch": 20.659186535764377, "grad_norm": 0.2086193561553955, "learning_rate": 1.7817714977528577e-05, "loss": 0.0055, "step": 14730 }, { "epoch": 20.67321178120617, "grad_norm": 0.2718794643878937, "learning_rate": 1.7754487025283332e-05, "loss": 0.0074, "step": 14740 }, { "epoch": 20.687237026647967, "grad_norm": 0.13174577057361603, "learning_rate": 1.7691347231039275e-05, "loss": 0.0064, "step": 14750 }, { "epoch": 20.70126227208976, "grad_norm": 0.1665494740009308, "learning_rate": 1.7628295767418164e-05, "loss": 0.0084, "step": 14760 }, { "epoch": 20.715287517531557, "grad_norm": 0.20284776389598846, "learning_rate": 1.7565332806800333e-05, "loss": 0.0054, "step": 14770 }, { "epoch": 20.72931276297335, "grad_norm": 0.18042033910751343, "learning_rate": 1.750245852132408e-05, "loss": 0.0067, "step": 14780 }, { "epoch": 20.743338008415147, "grad_norm": 0.13275204598903656, "learning_rate": 1.7439673082885323e-05, "loss": 0.006, "step": 14790 }, { "epoch": 20.757363253856944, "grad_norm": 0.21596455574035645, "learning_rate": 1.7376976663137047e-05, "loss": 0.0061, "step": 14800 }, { "epoch": 20.771388499298737, "grad_norm": 0.151463121175766, "learning_rate": 1.7314369433488853e-05, "loss": 0.0063, "step": 14810 }, { "epoch": 20.785413744740534, "grad_norm": 0.1609039604663849, "learning_rate": 1.7251851565106548e-05, "loss": 0.0057, "step": 14820 }, { "epoch": 20.799438990182328, "grad_norm": 0.1356969177722931, "learning_rate": 1.7189423228911574e-05, "loss": 0.0076, "step": 14830 }, { "epoch": 20.813464235624124, "grad_norm": 0.4379889667034149, "learning_rate": 1.7127084595580606e-05, "loss": 0.0077, "step": 14840 }, { "epoch": 20.827489481065918, "grad_norm": 0.19752441346645355, "learning_rate": 1.706483583554513e-05, "loss": 0.0082, "step": 14850 }, { "epoch": 20.841514726507715, "grad_norm": 0.24038943648338318, "learning_rate": 1.700267711899083e-05, "loss": 0.0064, "step": 14860 }, { "epoch": 20.855539971949508, "grad_norm": 0.30050015449523926, "learning_rate": 1.69406086158573e-05, "loss": 0.005, "step": 14870 }, { "epoch": 20.869565217391305, "grad_norm": 0.17141461372375488, "learning_rate": 1.6878630495837455e-05, "loss": 0.0057, "step": 14880 }, { "epoch": 20.883590462833098, "grad_norm": 0.1641421914100647, "learning_rate": 1.681674292837707e-05, "loss": 0.0067, "step": 14890 }, { "epoch": 20.897615708274895, "grad_norm": 0.13408112525939941, "learning_rate": 1.6754946082674444e-05, "loss": 0.008, "step": 14900 }, { "epoch": 20.91164095371669, "grad_norm": 0.18544074892997742, "learning_rate": 1.6693240127679748e-05, "loss": 0.0072, "step": 14910 }, { "epoch": 20.925666199158485, "grad_norm": 0.16013793647289276, "learning_rate": 1.663162523209475e-05, "loss": 0.0057, "step": 14920 }, { "epoch": 20.93969144460028, "grad_norm": 0.28659787774086, "learning_rate": 1.6570101564372193e-05, "loss": 0.0084, "step": 14930 }, { "epoch": 20.953716690042075, "grad_norm": 0.22218941152095795, "learning_rate": 1.650866929271543e-05, "loss": 0.0063, "step": 14940 }, { "epoch": 20.967741935483872, "grad_norm": 0.15203070640563965, "learning_rate": 1.644732858507797e-05, "loss": 0.0073, "step": 14950 }, { "epoch": 20.981767180925665, "grad_norm": 0.20719200372695923, "learning_rate": 1.6386079609162943e-05, "loss": 0.0081, "step": 14960 }, { "epoch": 20.995792426367462, "grad_norm": 0.13344328105449677, "learning_rate": 1.6324922532422742e-05, "loss": 0.0068, "step": 14970 }, { "epoch": 21.009817671809255, "grad_norm": 0.16350655257701874, "learning_rate": 1.6263857522058434e-05, "loss": 0.0087, "step": 14980 }, { "epoch": 21.023842917251052, "grad_norm": 0.17923225462436676, "learning_rate": 1.6202884745019443e-05, "loss": 0.0069, "step": 14990 }, { "epoch": 21.037868162692845, "grad_norm": 0.11642610281705856, "learning_rate": 1.614200436800304e-05, "loss": 0.0064, "step": 15000 }, { "epoch": 21.051893408134642, "grad_norm": 0.1841658502817154, "learning_rate": 1.6081216557453814e-05, "loss": 0.0077, "step": 15010 }, { "epoch": 21.06591865357644, "grad_norm": 0.1421952247619629, "learning_rate": 1.6020521479563367e-05, "loss": 0.0058, "step": 15020 }, { "epoch": 21.079943899018232, "grad_norm": 0.15656137466430664, "learning_rate": 1.5959919300269654e-05, "loss": 0.0078, "step": 15030 }, { "epoch": 21.09396914446003, "grad_norm": 0.1332269161939621, "learning_rate": 1.5899410185256764e-05, "loss": 0.0053, "step": 15040 }, { "epoch": 21.107994389901823, "grad_norm": 0.15801821649074554, "learning_rate": 1.583899429995431e-05, "loss": 0.0058, "step": 15050 }, { "epoch": 21.12201963534362, "grad_norm": 0.22718067467212677, "learning_rate": 1.5778671809536993e-05, "loss": 0.006, "step": 15060 }, { "epoch": 21.136044880785413, "grad_norm": 0.1493034064769745, "learning_rate": 1.5718442878924246e-05, "loss": 0.005, "step": 15070 }, { "epoch": 21.15007012622721, "grad_norm": 0.17293235659599304, "learning_rate": 1.5658307672779593e-05, "loss": 0.0061, "step": 15080 }, { "epoch": 21.164095371669003, "grad_norm": 0.20705823600292206, "learning_rate": 1.5598266355510427e-05, "loss": 0.0068, "step": 15090 }, { "epoch": 21.1781206171108, "grad_norm": 0.2087600976228714, "learning_rate": 1.553831909126744e-05, "loss": 0.0056, "step": 15100 }, { "epoch": 21.192145862552593, "grad_norm": 0.32238414883613586, "learning_rate": 1.5478466043944135e-05, "loss": 0.0071, "step": 15110 }, { "epoch": 21.20617110799439, "grad_norm": 0.16787753999233246, "learning_rate": 1.5418707377176468e-05, "loss": 0.0052, "step": 15120 }, { "epoch": 21.220196353436187, "grad_norm": 0.10954579710960388, "learning_rate": 1.535904325434233e-05, "loss": 0.0062, "step": 15130 }, { "epoch": 21.23422159887798, "grad_norm": 0.16781756281852722, "learning_rate": 1.529947383856118e-05, "loss": 0.0089, "step": 15140 }, { "epoch": 21.248246844319777, "grad_norm": 0.1330592781305313, "learning_rate": 1.5239999292693524e-05, "loss": 0.0081, "step": 15150 }, { "epoch": 21.26227208976157, "grad_norm": 0.3218066692352295, "learning_rate": 1.5180619779340505e-05, "loss": 0.0067, "step": 15160 }, { "epoch": 21.276297335203367, "grad_norm": 0.18245480954647064, "learning_rate": 1.5121335460843428e-05, "loss": 0.0076, "step": 15170 }, { "epoch": 21.29032258064516, "grad_norm": 0.19858607649803162, "learning_rate": 1.5062146499283347e-05, "loss": 0.0061, "step": 15180 }, { "epoch": 21.304347826086957, "grad_norm": 0.1483883112668991, "learning_rate": 1.5003053056480643e-05, "loss": 0.0071, "step": 15190 }, { "epoch": 21.31837307152875, "grad_norm": 0.18094000220298767, "learning_rate": 1.4944055293994551e-05, "loss": 0.0069, "step": 15200 }, { "epoch": 21.332398316970547, "grad_norm": 0.13262853026390076, "learning_rate": 1.4885153373122656e-05, "loss": 0.0068, "step": 15210 }, { "epoch": 21.346423562412344, "grad_norm": 0.18970036506652832, "learning_rate": 1.482634745490059e-05, "loss": 0.0077, "step": 15220 }, { "epoch": 21.360448807854137, "grad_norm": 0.15184925496578217, "learning_rate": 1.4767637700101466e-05, "loss": 0.007, "step": 15230 }, { "epoch": 21.374474053295934, "grad_norm": 0.22040249407291412, "learning_rate": 1.4709024269235528e-05, "loss": 0.0058, "step": 15240 }, { "epoch": 21.388499298737727, "grad_norm": 0.3241593837738037, "learning_rate": 1.4650507322549684e-05, "loss": 0.006, "step": 15250 }, { "epoch": 21.402524544179524, "grad_norm": 0.2773667871952057, "learning_rate": 1.4592087020026972e-05, "loss": 0.0063, "step": 15260 }, { "epoch": 21.416549789621318, "grad_norm": 0.19129380583763123, "learning_rate": 1.4533763521386318e-05, "loss": 0.0057, "step": 15270 }, { "epoch": 21.430575035063114, "grad_norm": 0.2000211626291275, "learning_rate": 1.44755369860819e-05, "loss": 0.0069, "step": 15280 }, { "epoch": 21.444600280504908, "grad_norm": 0.14708629250526428, "learning_rate": 1.441740757330287e-05, "loss": 0.0078, "step": 15290 }, { "epoch": 21.458625525946704, "grad_norm": 0.2220766544342041, "learning_rate": 1.4359375441972844e-05, "loss": 0.0057, "step": 15300 }, { "epoch": 21.472650771388498, "grad_norm": 0.14772726595401764, "learning_rate": 1.4301440750749395e-05, "loss": 0.0053, "step": 15310 }, { "epoch": 21.486676016830295, "grad_norm": 0.21886007487773895, "learning_rate": 1.4243603658023808e-05, "loss": 0.0059, "step": 15320 }, { "epoch": 21.50070126227209, "grad_norm": 0.20060285925865173, "learning_rate": 1.4185864321920444e-05, "loss": 0.0062, "step": 15330 }, { "epoch": 21.514726507713885, "grad_norm": 0.12088347971439362, "learning_rate": 1.4128222900296485e-05, "loss": 0.006, "step": 15340 }, { "epoch": 21.52875175315568, "grad_norm": 0.18205010890960693, "learning_rate": 1.407067955074135e-05, "loss": 0.0063, "step": 15350 }, { "epoch": 21.542776998597475, "grad_norm": 0.2992248833179474, "learning_rate": 1.4013234430576356e-05, "loss": 0.0069, "step": 15360 }, { "epoch": 21.55680224403927, "grad_norm": 0.28014636039733887, "learning_rate": 1.3955887696854286e-05, "loss": 0.0075, "step": 15370 }, { "epoch": 21.570827489481065, "grad_norm": 0.2200162410736084, "learning_rate": 1.38986395063589e-05, "loss": 0.0107, "step": 15380 }, { "epoch": 21.584852734922862, "grad_norm": 0.1651024967432022, "learning_rate": 1.3841490015604597e-05, "loss": 0.0066, "step": 15390 }, { "epoch": 21.598877980364655, "grad_norm": 0.1545153558254242, "learning_rate": 1.3784439380835879e-05, "loss": 0.008, "step": 15400 }, { "epoch": 21.612903225806452, "grad_norm": 0.21049390733242035, "learning_rate": 1.3727487758026986e-05, "loss": 0.0064, "step": 15410 }, { "epoch": 21.626928471248245, "grad_norm": 0.16063366830348969, "learning_rate": 1.3670635302881525e-05, "loss": 0.0076, "step": 15420 }, { "epoch": 21.640953716690042, "grad_norm": 0.23348243534564972, "learning_rate": 1.3613882170831888e-05, "loss": 0.0069, "step": 15430 }, { "epoch": 21.65497896213184, "grad_norm": 0.14869913458824158, "learning_rate": 1.355722851703901e-05, "loss": 0.0058, "step": 15440 }, { "epoch": 21.669004207573632, "grad_norm": 0.19635066390037537, "learning_rate": 1.3500674496391814e-05, "loss": 0.0077, "step": 15450 }, { "epoch": 21.68302945301543, "grad_norm": 0.13266953825950623, "learning_rate": 1.3444220263506795e-05, "loss": 0.0067, "step": 15460 }, { "epoch": 21.697054698457222, "grad_norm": 0.15294387936592102, "learning_rate": 1.3387865972727714e-05, "loss": 0.0077, "step": 15470 }, { "epoch": 21.71107994389902, "grad_norm": 0.12518708407878876, "learning_rate": 1.3331611778125036e-05, "loss": 0.0062, "step": 15480 }, { "epoch": 21.725105189340812, "grad_norm": 0.2585180699825287, "learning_rate": 1.3275457833495564e-05, "loss": 0.0087, "step": 15490 }, { "epoch": 21.73913043478261, "grad_norm": 0.1742304563522339, "learning_rate": 1.3219404292362065e-05, "loss": 0.0089, "step": 15500 }, { "epoch": 21.753155680224403, "grad_norm": 0.3211561143398285, "learning_rate": 1.3163451307972751e-05, "loss": 0.0074, "step": 15510 }, { "epoch": 21.7671809256662, "grad_norm": 0.16707541048526764, "learning_rate": 1.3107599033300977e-05, "loss": 0.0078, "step": 15520 }, { "epoch": 21.781206171107993, "grad_norm": 0.16897344589233398, "learning_rate": 1.305184762104471e-05, "loss": 0.0064, "step": 15530 }, { "epoch": 21.79523141654979, "grad_norm": 0.1839171051979065, "learning_rate": 1.2996197223626178e-05, "loss": 0.0076, "step": 15540 }, { "epoch": 21.809256661991586, "grad_norm": 0.15664345026016235, "learning_rate": 1.2940647993191457e-05, "loss": 0.0064, "step": 15550 }, { "epoch": 21.82328190743338, "grad_norm": 0.1449873298406601, "learning_rate": 1.2885200081610005e-05, "loss": 0.0065, "step": 15560 }, { "epoch": 21.837307152875177, "grad_norm": 0.11293761432170868, "learning_rate": 1.2829853640474316e-05, "loss": 0.0056, "step": 15570 }, { "epoch": 21.85133239831697, "grad_norm": 0.09790601581335068, "learning_rate": 1.2774608821099438e-05, "loss": 0.0064, "step": 15580 }, { "epoch": 21.865357643758767, "grad_norm": 0.23209482431411743, "learning_rate": 1.2719465774522577e-05, "loss": 0.0052, "step": 15590 }, { "epoch": 21.87938288920056, "grad_norm": 0.170270174741745, "learning_rate": 1.2664424651502755e-05, "loss": 0.0059, "step": 15600 }, { "epoch": 21.893408134642357, "grad_norm": 0.21709603071212769, "learning_rate": 1.260948560252026e-05, "loss": 0.0077, "step": 15610 }, { "epoch": 21.90743338008415, "grad_norm": 0.10244540125131607, "learning_rate": 1.2554648777776396e-05, "loss": 0.0062, "step": 15620 }, { "epoch": 21.921458625525947, "grad_norm": 0.24043609201908112, "learning_rate": 1.2499914327192919e-05, "loss": 0.0068, "step": 15630 }, { "epoch": 21.93548387096774, "grad_norm": 0.2548508644104004, "learning_rate": 1.2445282400411722e-05, "loss": 0.0062, "step": 15640 }, { "epoch": 21.949509116409537, "grad_norm": 0.18006576597690582, "learning_rate": 1.2390753146794437e-05, "loss": 0.0065, "step": 15650 }, { "epoch": 21.963534361851334, "grad_norm": 0.14326010644435883, "learning_rate": 1.2336326715421925e-05, "loss": 0.0065, "step": 15660 }, { "epoch": 21.977559607293127, "grad_norm": 0.45003437995910645, "learning_rate": 1.2282003255094005e-05, "loss": 0.0074, "step": 15670 }, { "epoch": 21.991584852734924, "grad_norm": 0.17629209160804749, "learning_rate": 1.2227782914328928e-05, "loss": 0.0064, "step": 15680 }, { "epoch": 22.005610098176717, "grad_norm": 0.13864760100841522, "learning_rate": 1.2173665841363018e-05, "loss": 0.0054, "step": 15690 }, { "epoch": 22.019635343618514, "grad_norm": 0.11307646334171295, "learning_rate": 1.211965218415032e-05, "loss": 0.0074, "step": 15700 }, { "epoch": 22.033660589060307, "grad_norm": 0.18920494616031647, "learning_rate": 1.2065742090362082e-05, "loss": 0.0068, "step": 15710 }, { "epoch": 22.047685834502104, "grad_norm": 0.1704755574464798, "learning_rate": 1.2011935707386457e-05, "loss": 0.0055, "step": 15720 }, { "epoch": 22.061711079943898, "grad_norm": 0.16593042016029358, "learning_rate": 1.1958233182328044e-05, "loss": 0.0062, "step": 15730 }, { "epoch": 22.075736325385694, "grad_norm": 0.19483818113803864, "learning_rate": 1.1904634662007474e-05, "loss": 0.0063, "step": 15740 }, { "epoch": 22.08976157082749, "grad_norm": 0.2086101770401001, "learning_rate": 1.1851140292961088e-05, "loss": 0.007, "step": 15750 }, { "epoch": 22.103786816269285, "grad_norm": 0.10892780870199203, "learning_rate": 1.1797750221440424e-05, "loss": 0.0061, "step": 15760 }, { "epoch": 22.11781206171108, "grad_norm": 0.11583981662988663, "learning_rate": 1.1744464593411897e-05, "loss": 0.0055, "step": 15770 }, { "epoch": 22.131837307152875, "grad_norm": 0.22251443564891815, "learning_rate": 1.1691283554556399e-05, "loss": 0.0078, "step": 15780 }, { "epoch": 22.14586255259467, "grad_norm": 0.2820073068141937, "learning_rate": 1.1638207250268834e-05, "loss": 0.0076, "step": 15790 }, { "epoch": 22.159887798036465, "grad_norm": 0.1873839795589447, "learning_rate": 1.158523582565782e-05, "loss": 0.0062, "step": 15800 }, { "epoch": 22.17391304347826, "grad_norm": 0.20822811126708984, "learning_rate": 1.1532369425545192e-05, "loss": 0.0078, "step": 15810 }, { "epoch": 22.187938288920055, "grad_norm": 0.2628297805786133, "learning_rate": 1.1479608194465662e-05, "loss": 0.0055, "step": 15820 }, { "epoch": 22.20196353436185, "grad_norm": 0.27963295578956604, "learning_rate": 1.1426952276666442e-05, "loss": 0.0099, "step": 15830 }, { "epoch": 22.215988779803645, "grad_norm": 0.24205082654953003, "learning_rate": 1.1374401816106778e-05, "loss": 0.0055, "step": 15840 }, { "epoch": 22.230014025245442, "grad_norm": 0.12331650406122208, "learning_rate": 1.1321956956457646e-05, "loss": 0.0072, "step": 15850 }, { "epoch": 22.24403927068724, "grad_norm": 0.15445256233215332, "learning_rate": 1.1269617841101277e-05, "loss": 0.0068, "step": 15860 }, { "epoch": 22.258064516129032, "grad_norm": 0.1186363473534584, "learning_rate": 1.1217384613130804e-05, "loss": 0.0058, "step": 15870 }, { "epoch": 22.27208976157083, "grad_norm": 0.5906305909156799, "learning_rate": 1.11652574153499e-05, "loss": 0.0072, "step": 15880 }, { "epoch": 22.286115007012622, "grad_norm": 0.14717693626880646, "learning_rate": 1.1113236390272303e-05, "loss": 0.0067, "step": 15890 }, { "epoch": 22.30014025245442, "grad_norm": 0.17303548753261566, "learning_rate": 1.106132168012155e-05, "loss": 0.0071, "step": 15900 }, { "epoch": 22.314165497896212, "grad_norm": 0.19496387243270874, "learning_rate": 1.1009513426830448e-05, "loss": 0.007, "step": 15910 }, { "epoch": 22.32819074333801, "grad_norm": 0.13733799755573273, "learning_rate": 1.0957811772040777e-05, "loss": 0.0084, "step": 15920 }, { "epoch": 22.342215988779802, "grad_norm": 0.1528647541999817, "learning_rate": 1.0906216857102913e-05, "loss": 0.0077, "step": 15930 }, { "epoch": 22.3562412342216, "grad_norm": 0.36358335614204407, "learning_rate": 1.0854728823075355e-05, "loss": 0.0074, "step": 15940 }, { "epoch": 22.370266479663393, "grad_norm": 0.15399207174777985, "learning_rate": 1.0803347810724452e-05, "loss": 0.0055, "step": 15950 }, { "epoch": 22.38429172510519, "grad_norm": 0.3679558038711548, "learning_rate": 1.0752073960523911e-05, "loss": 0.0072, "step": 15960 }, { "epoch": 22.398316970546986, "grad_norm": 0.10134106874465942, "learning_rate": 1.070090741265447e-05, "loss": 0.008, "step": 15970 }, { "epoch": 22.41234221598878, "grad_norm": 0.1715242713689804, "learning_rate": 1.0649848307003547e-05, "loss": 0.0061, "step": 15980 }, { "epoch": 22.426367461430576, "grad_norm": 0.13522303104400635, "learning_rate": 1.0598896783164757e-05, "loss": 0.0056, "step": 15990 }, { "epoch": 22.44039270687237, "grad_norm": 0.1502041220664978, "learning_rate": 1.0548052980437645e-05, "loss": 0.0054, "step": 16000 }, { "epoch": 22.454417952314166, "grad_norm": 0.18261954188346863, "learning_rate": 1.049731703782722e-05, "loss": 0.0075, "step": 16010 }, { "epoch": 22.46844319775596, "grad_norm": 0.08858657628297806, "learning_rate": 1.0446689094043587e-05, "loss": 0.0067, "step": 16020 }, { "epoch": 22.482468443197757, "grad_norm": 0.25910377502441406, "learning_rate": 1.039616928750165e-05, "loss": 0.0076, "step": 16030 }, { "epoch": 22.49649368863955, "grad_norm": 0.1523277759552002, "learning_rate": 1.0345757756320612e-05, "loss": 0.0071, "step": 16040 }, { "epoch": 22.510518934081347, "grad_norm": 0.3862397372722626, "learning_rate": 1.0295454638323666e-05, "loss": 0.006, "step": 16050 }, { "epoch": 22.52454417952314, "grad_norm": 0.16586464643478394, "learning_rate": 1.0245260071037632e-05, "loss": 0.0054, "step": 16060 }, { "epoch": 22.538569424964937, "grad_norm": 0.1857137233018875, "learning_rate": 1.0195174191692518e-05, "loss": 0.0083, "step": 16070 }, { "epoch": 22.552594670406734, "grad_norm": 0.1868271827697754, "learning_rate": 1.014519713722124e-05, "loss": 0.0066, "step": 16080 }, { "epoch": 22.566619915848527, "grad_norm": 0.18186905980110168, "learning_rate": 1.0095329044259132e-05, "loss": 0.006, "step": 16090 }, { "epoch": 22.580645161290324, "grad_norm": 0.10191476345062256, "learning_rate": 1.004557004914365e-05, "loss": 0.0063, "step": 16100 }, { "epoch": 22.594670406732117, "grad_norm": 0.17087331414222717, "learning_rate": 9.995920287914007e-06, "loss": 0.0052, "step": 16110 }, { "epoch": 22.608695652173914, "grad_norm": 0.20131385326385498, "learning_rate": 9.946379896310737e-06, "loss": 0.0059, "step": 16120 }, { "epoch": 22.622720897615707, "grad_norm": 0.19129177927970886, "learning_rate": 9.896949009775396e-06, "loss": 0.0052, "step": 16130 }, { "epoch": 22.636746143057504, "grad_norm": 0.267251580953598, "learning_rate": 9.847627763450134e-06, "loss": 0.0063, "step": 16140 }, { "epoch": 22.650771388499297, "grad_norm": 0.17652274668216705, "learning_rate": 9.798416292177337e-06, "loss": 0.0062, "step": 16150 }, { "epoch": 22.664796633941094, "grad_norm": 0.41430479288101196, "learning_rate": 9.74931473049932e-06, "loss": 0.0058, "step": 16160 }, { "epoch": 22.678821879382888, "grad_norm": 0.11806749552488327, "learning_rate": 9.700323212657847e-06, "loss": 0.0065, "step": 16170 }, { "epoch": 22.692847124824684, "grad_norm": 0.25059929490089417, "learning_rate": 9.65144187259388e-06, "loss": 0.0062, "step": 16180 }, { "epoch": 22.70687237026648, "grad_norm": 0.08061333000659943, "learning_rate": 9.602670843947132e-06, "loss": 0.0071, "step": 16190 }, { "epoch": 22.720897615708274, "grad_norm": 0.36033788323402405, "learning_rate": 9.554010260055713e-06, "loss": 0.0056, "step": 16200 }, { "epoch": 22.73492286115007, "grad_norm": 0.21266064047813416, "learning_rate": 9.505460253955834e-06, "loss": 0.0051, "step": 16210 }, { "epoch": 22.748948106591865, "grad_norm": 0.1149720773100853, "learning_rate": 9.457020958381324e-06, "loss": 0.0062, "step": 16220 }, { "epoch": 22.76297335203366, "grad_norm": 0.1568177193403244, "learning_rate": 9.408692505763395e-06, "loss": 0.0078, "step": 16230 }, { "epoch": 22.776998597475455, "grad_norm": 0.1372225284576416, "learning_rate": 9.360475028230181e-06, "loss": 0.0063, "step": 16240 }, { "epoch": 22.79102384291725, "grad_norm": 0.1934565305709839, "learning_rate": 9.312368657606412e-06, "loss": 0.0062, "step": 16250 }, { "epoch": 22.805049088359045, "grad_norm": 0.09998218715190887, "learning_rate": 9.264373525413096e-06, "loss": 0.0057, "step": 16260 }, { "epoch": 22.81907433380084, "grad_norm": 0.1594570428133011, "learning_rate": 9.216489762867058e-06, "loss": 0.0055, "step": 16270 }, { "epoch": 22.833099579242635, "grad_norm": 0.22151117026805878, "learning_rate": 9.168717500880708e-06, "loss": 0.0088, "step": 16280 }, { "epoch": 22.847124824684432, "grad_norm": 0.21959038078784943, "learning_rate": 9.121056870061574e-06, "loss": 0.0058, "step": 16290 }, { "epoch": 22.86115007012623, "grad_norm": 0.16584433615207672, "learning_rate": 9.073508000711983e-06, "loss": 0.0077, "step": 16300 }, { "epoch": 22.875175315568022, "grad_norm": 0.14566433429718018, "learning_rate": 9.026071022828758e-06, "loss": 0.0051, "step": 16310 }, { "epoch": 22.88920056100982, "grad_norm": 0.1422049105167389, "learning_rate": 8.978746066102771e-06, "loss": 0.0075, "step": 16320 }, { "epoch": 22.903225806451612, "grad_norm": 0.13794741034507751, "learning_rate": 8.931533259918634e-06, "loss": 0.007, "step": 16330 }, { "epoch": 22.91725105189341, "grad_norm": 0.30651500821113586, "learning_rate": 8.884432733354382e-06, "loss": 0.0061, "step": 16340 }, { "epoch": 22.931276297335202, "grad_norm": 0.22934864461421967, "learning_rate": 8.837444615181029e-06, "loss": 0.0046, "step": 16350 }, { "epoch": 22.945301542777, "grad_norm": 0.19114024937152863, "learning_rate": 8.790569033862323e-06, "loss": 0.0059, "step": 16360 }, { "epoch": 22.959326788218792, "grad_norm": 0.1274573802947998, "learning_rate": 8.7438061175543e-06, "loss": 0.0074, "step": 16370 }, { "epoch": 22.97335203366059, "grad_norm": 0.2333650439977646, "learning_rate": 8.697155994104978e-06, "loss": 0.0065, "step": 16380 }, { "epoch": 22.987377279102382, "grad_norm": 0.13462717831134796, "learning_rate": 8.650618791054033e-06, "loss": 0.0056, "step": 16390 }, { "epoch": 23.00140252454418, "grad_norm": 0.1728411316871643, "learning_rate": 8.604194635632373e-06, "loss": 0.0047, "step": 16400 }, { "epoch": 23.015427769985976, "grad_norm": 0.1368180364370346, "learning_rate": 8.557883654761906e-06, "loss": 0.0068, "step": 16410 }, { "epoch": 23.02945301542777, "grad_norm": 0.14559707045555115, "learning_rate": 8.511685975055061e-06, "loss": 0.0078, "step": 16420 }, { "epoch": 23.043478260869566, "grad_norm": 0.14680859446525574, "learning_rate": 8.46560172281452e-06, "loss": 0.006, "step": 16430 }, { "epoch": 23.05750350631136, "grad_norm": 0.21323667466640472, "learning_rate": 8.419631024032893e-06, "loss": 0.005, "step": 16440 }, { "epoch": 23.071528751753156, "grad_norm": 0.15140073001384735, "learning_rate": 8.373774004392293e-06, "loss": 0.0051, "step": 16450 }, { "epoch": 23.08555399719495, "grad_norm": 0.1376831829547882, "learning_rate": 8.32803078926409e-06, "loss": 0.0058, "step": 16460 }, { "epoch": 23.099579242636747, "grad_norm": 0.15754319727420807, "learning_rate": 8.282401503708454e-06, "loss": 0.0068, "step": 16470 }, { "epoch": 23.11360448807854, "grad_norm": 0.1529756635427475, "learning_rate": 8.23688627247412e-06, "loss": 0.0069, "step": 16480 }, { "epoch": 23.127629733520337, "grad_norm": 0.18265068531036377, "learning_rate": 8.191485219998007e-06, "loss": 0.0049, "step": 16490 }, { "epoch": 23.141654978962134, "grad_norm": 0.1603613793849945, "learning_rate": 8.146198470404843e-06, "loss": 0.0068, "step": 16500 }, { "epoch": 23.155680224403927, "grad_norm": 0.19095063209533691, "learning_rate": 8.101026147506897e-06, "loss": 0.005, "step": 16510 }, { "epoch": 23.169705469845724, "grad_norm": 0.18111665546894073, "learning_rate": 8.05596837480353e-06, "loss": 0.0049, "step": 16520 }, { "epoch": 23.183730715287517, "grad_norm": 0.14339321851730347, "learning_rate": 8.011025275480998e-06, "loss": 0.0051, "step": 16530 }, { "epoch": 23.197755960729314, "grad_norm": 0.1645493358373642, "learning_rate": 7.966196972412027e-06, "loss": 0.0067, "step": 16540 }, { "epoch": 23.211781206171107, "grad_norm": 0.15270525217056274, "learning_rate": 7.92148358815547e-06, "loss": 0.0055, "step": 16550 }, { "epoch": 23.225806451612904, "grad_norm": 0.18424047529697418, "learning_rate": 7.87688524495604e-06, "loss": 0.0048, "step": 16560 }, { "epoch": 23.239831697054697, "grad_norm": 0.0955439880490303, "learning_rate": 7.83240206474386e-06, "loss": 0.0057, "step": 16570 }, { "epoch": 23.253856942496494, "grad_norm": 0.08628938347101212, "learning_rate": 7.788034169134272e-06, "loss": 0.007, "step": 16580 }, { "epoch": 23.267882187938287, "grad_norm": 0.13551084697246552, "learning_rate": 7.743781679427414e-06, "loss": 0.0063, "step": 16590 }, { "epoch": 23.281907433380084, "grad_norm": 0.24365343153476715, "learning_rate": 7.699644716607895e-06, "loss": 0.0053, "step": 16600 }, { "epoch": 23.29593267882188, "grad_norm": 0.10180027037858963, "learning_rate": 7.655623401344486e-06, "loss": 0.0049, "step": 16610 }, { "epoch": 23.309957924263674, "grad_norm": 0.0961398333311081, "learning_rate": 7.611717853989775e-06, "loss": 0.0045, "step": 16620 }, { "epoch": 23.32398316970547, "grad_norm": 0.12303480505943298, "learning_rate": 7.567928194579854e-06, "loss": 0.0047, "step": 16630 }, { "epoch": 23.338008415147264, "grad_norm": 0.19126975536346436, "learning_rate": 7.524254542833997e-06, "loss": 0.0066, "step": 16640 }, { "epoch": 23.35203366058906, "grad_norm": 0.14369745552539825, "learning_rate": 7.480697018154286e-06, "loss": 0.0068, "step": 16650 }, { "epoch": 23.366058906030855, "grad_norm": 0.5721383094787598, "learning_rate": 7.437255739625332e-06, "loss": 0.005, "step": 16660 }, { "epoch": 23.38008415147265, "grad_norm": 0.2046668976545334, "learning_rate": 7.393930826013923e-06, "loss": 0.0082, "step": 16670 }, { "epoch": 23.394109396914445, "grad_norm": 0.12505614757537842, "learning_rate": 7.350722395768722e-06, "loss": 0.0051, "step": 16680 }, { "epoch": 23.40813464235624, "grad_norm": 0.22786758840084076, "learning_rate": 7.307630567019963e-06, "loss": 0.0073, "step": 16690 }, { "epoch": 23.422159887798035, "grad_norm": 0.13843876123428345, "learning_rate": 7.264655457579e-06, "loss": 0.0053, "step": 16700 }, { "epoch": 23.43618513323983, "grad_norm": 0.11482658982276917, "learning_rate": 7.221797184938184e-06, "loss": 0.0048, "step": 16710 }, { "epoch": 23.45021037868163, "grad_norm": 0.1363949477672577, "learning_rate": 7.179055866270373e-06, "loss": 0.0059, "step": 16720 }, { "epoch": 23.464235624123422, "grad_norm": 0.11930033564567566, "learning_rate": 7.136431618428707e-06, "loss": 0.0062, "step": 16730 }, { "epoch": 23.47826086956522, "grad_norm": 0.13507001101970673, "learning_rate": 7.09392455794628e-06, "loss": 0.0045, "step": 16740 }, { "epoch": 23.492286115007012, "grad_norm": 0.27996110916137695, "learning_rate": 7.051534801035725e-06, "loss": 0.006, "step": 16750 }, { "epoch": 23.50631136044881, "grad_norm": 0.14572127163410187, "learning_rate": 7.00926246358905e-06, "loss": 0.0084, "step": 16760 }, { "epoch": 23.520336605890602, "grad_norm": 0.45944875478744507, "learning_rate": 6.967107661177191e-06, "loss": 0.0052, "step": 16770 }, { "epoch": 23.5343618513324, "grad_norm": 0.1066804975271225, "learning_rate": 6.925070509049786e-06, "loss": 0.0073, "step": 16780 }, { "epoch": 23.548387096774192, "grad_norm": 0.23222844302654266, "learning_rate": 6.883151122134812e-06, "loss": 0.0074, "step": 16790 }, { "epoch": 23.56241234221599, "grad_norm": 0.23283657431602478, "learning_rate": 6.8413496150382394e-06, "loss": 0.0058, "step": 16800 }, { "epoch": 23.576437587657782, "grad_norm": 0.17897731065750122, "learning_rate": 6.7996661020438165e-06, "loss": 0.0052, "step": 16810 }, { "epoch": 23.59046283309958, "grad_norm": 0.08074267208576202, "learning_rate": 6.758100697112662e-06, "loss": 0.005, "step": 16820 }, { "epoch": 23.604488078541376, "grad_norm": 0.139566570520401, "learning_rate": 6.716653513883026e-06, "loss": 0.0058, "step": 16830 }, { "epoch": 23.61851332398317, "grad_norm": 0.09632892906665802, "learning_rate": 6.675324665669913e-06, "loss": 0.0046, "step": 16840 }, { "epoch": 23.632538569424966, "grad_norm": 0.17086729407310486, "learning_rate": 6.634114265464803e-06, "loss": 0.0062, "step": 16850 }, { "epoch": 23.64656381486676, "grad_norm": 0.1661798506975174, "learning_rate": 6.59302242593538e-06, "loss": 0.0069, "step": 16860 }, { "epoch": 23.660589060308556, "grad_norm": 0.11296406388282776, "learning_rate": 6.552049259425141e-06, "loss": 0.0059, "step": 16870 }, { "epoch": 23.67461430575035, "grad_norm": 0.10868395864963531, "learning_rate": 6.511194877953181e-06, "loss": 0.0047, "step": 16880 }, { "epoch": 23.688639551192146, "grad_norm": 0.16406892240047455, "learning_rate": 6.470459393213813e-06, "loss": 0.0058, "step": 16890 }, { "epoch": 23.70266479663394, "grad_norm": 0.14150960743427277, "learning_rate": 6.429842916576279e-06, "loss": 0.0073, "step": 16900 }, { "epoch": 23.716690042075736, "grad_norm": 0.15457387268543243, "learning_rate": 6.389345559084503e-06, "loss": 0.0054, "step": 16910 }, { "epoch": 23.730715287517533, "grad_norm": 0.1621648371219635, "learning_rate": 6.348967431456682e-06, "loss": 0.0056, "step": 16920 }, { "epoch": 23.744740532959327, "grad_norm": 0.10804839432239532, "learning_rate": 6.30870864408511e-06, "loss": 0.0067, "step": 16930 }, { "epoch": 23.758765778401123, "grad_norm": 0.09462837874889374, "learning_rate": 6.268569307035754e-06, "loss": 0.0067, "step": 16940 }, { "epoch": 23.772791023842917, "grad_norm": 0.28971338272094727, "learning_rate": 6.228549530048022e-06, "loss": 0.0047, "step": 16950 }, { "epoch": 23.786816269284714, "grad_norm": 0.1780625432729721, "learning_rate": 6.1886494225344814e-06, "loss": 0.0056, "step": 16960 }, { "epoch": 23.800841514726507, "grad_norm": 0.15525014698505402, "learning_rate": 6.148869093580479e-06, "loss": 0.0062, "step": 16970 }, { "epoch": 23.814866760168304, "grad_norm": 0.2318730652332306, "learning_rate": 6.109208651943921e-06, "loss": 0.0073, "step": 16980 }, { "epoch": 23.828892005610097, "grad_norm": 0.10234996676445007, "learning_rate": 6.069668206054946e-06, "loss": 0.0066, "step": 16990 }, { "epoch": 23.842917251051894, "grad_norm": 0.14014525711536407, "learning_rate": 6.0302478640156145e-06, "loss": 0.0044, "step": 17000 }, { "epoch": 23.856942496493687, "grad_norm": 0.1874684989452362, "learning_rate": 5.990947733599644e-06, "loss": 0.0053, "step": 17010 }, { "epoch": 23.870967741935484, "grad_norm": 0.1636742502450943, "learning_rate": 5.951767922252105e-06, "loss": 0.0086, "step": 17020 }, { "epoch": 23.88499298737728, "grad_norm": 0.2046525478363037, "learning_rate": 5.912708537089068e-06, "loss": 0.006, "step": 17030 }, { "epoch": 23.899018232819074, "grad_norm": 0.18116988241672516, "learning_rate": 5.873769684897434e-06, "loss": 0.0068, "step": 17040 }, { "epoch": 23.91304347826087, "grad_norm": 0.15901286900043488, "learning_rate": 5.834951472134514e-06, "loss": 0.006, "step": 17050 }, { "epoch": 23.927068723702664, "grad_norm": 0.1574937254190445, "learning_rate": 5.796254004927832e-06, "loss": 0.0083, "step": 17060 }, { "epoch": 23.94109396914446, "grad_norm": 0.25888922810554504, "learning_rate": 5.757677389074806e-06, "loss": 0.0056, "step": 17070 }, { "epoch": 23.955119214586254, "grad_norm": 0.12081315368413925, "learning_rate": 5.719221730042385e-06, "loss": 0.007, "step": 17080 }, { "epoch": 23.96914446002805, "grad_norm": 0.2242681086063385, "learning_rate": 5.680887132966911e-06, "loss": 0.0055, "step": 17090 }, { "epoch": 23.983169705469845, "grad_norm": 0.21554933488368988, "learning_rate": 5.642673702653683e-06, "loss": 0.0071, "step": 17100 }, { "epoch": 23.99719495091164, "grad_norm": 0.14063890278339386, "learning_rate": 5.604581543576781e-06, "loss": 0.0051, "step": 17110 }, { "epoch": 24.011220196353435, "grad_norm": 0.20483236014842987, "learning_rate": 5.566610759878704e-06, "loss": 0.0055, "step": 17120 }, { "epoch": 24.02524544179523, "grad_norm": 0.18867062032222748, "learning_rate": 5.528761455370119e-06, "loss": 0.0047, "step": 17130 }, { "epoch": 24.03927068723703, "grad_norm": 0.11019424349069595, "learning_rate": 5.491033733529594e-06, "loss": 0.0069, "step": 17140 }, { "epoch": 24.05329593267882, "grad_norm": 0.2023036777973175, "learning_rate": 5.453427697503255e-06, "loss": 0.0069, "step": 17150 }, { "epoch": 24.06732117812062, "grad_norm": 0.130662739276886, "learning_rate": 5.415943450104599e-06, "loss": 0.005, "step": 17160 }, { "epoch": 24.08134642356241, "grad_norm": 0.20209714770317078, "learning_rate": 5.378581093814111e-06, "loss": 0.0068, "step": 17170 }, { "epoch": 24.09537166900421, "grad_norm": 0.19891436398029327, "learning_rate": 5.3413407307790375e-06, "loss": 0.005, "step": 17180 }, { "epoch": 24.109396914446002, "grad_norm": 0.2523287236690521, "learning_rate": 5.30422246281313e-06, "loss": 0.0062, "step": 17190 }, { "epoch": 24.1234221598878, "grad_norm": 0.11723382771015167, "learning_rate": 5.267226391396296e-06, "loss": 0.0045, "step": 17200 }, { "epoch": 24.137447405329592, "grad_norm": 0.15469062328338623, "learning_rate": 5.2303526176744e-06, "loss": 0.0074, "step": 17210 }, { "epoch": 24.15147265077139, "grad_norm": 0.12312845885753632, "learning_rate": 5.193601242458929e-06, "loss": 0.0047, "step": 17220 }, { "epoch": 24.165497896213182, "grad_norm": 0.1329345852136612, "learning_rate": 5.156972366226714e-06, "loss": 0.0049, "step": 17230 }, { "epoch": 24.17952314165498, "grad_norm": 0.15382882952690125, "learning_rate": 5.120466089119735e-06, "loss": 0.0059, "step": 17240 }, { "epoch": 24.193548387096776, "grad_norm": 0.2627010643482208, "learning_rate": 5.084082510944749e-06, "loss": 0.0067, "step": 17250 }, { "epoch": 24.20757363253857, "grad_norm": 0.17745499312877655, "learning_rate": 5.047821731173058e-06, "loss": 0.0062, "step": 17260 }, { "epoch": 24.221598877980366, "grad_norm": 0.16860170662403107, "learning_rate": 5.011683848940274e-06, "loss": 0.0062, "step": 17270 }, { "epoch": 24.23562412342216, "grad_norm": 0.09154239296913147, "learning_rate": 4.975668963045954e-06, "loss": 0.0065, "step": 17280 }, { "epoch": 24.249649368863956, "grad_norm": 0.2205512970685959, "learning_rate": 4.9397771719534525e-06, "loss": 0.0067, "step": 17290 }, { "epoch": 24.26367461430575, "grad_norm": 0.2345408946275711, "learning_rate": 4.904008573789548e-06, "loss": 0.0044, "step": 17300 }, { "epoch": 24.277699859747546, "grad_norm": 0.10893531888723373, "learning_rate": 4.8683632663442005e-06, "loss": 0.0049, "step": 17310 }, { "epoch": 24.29172510518934, "grad_norm": 0.1400335729122162, "learning_rate": 4.832841347070343e-06, "loss": 0.0053, "step": 17320 }, { "epoch": 24.305750350631136, "grad_norm": 0.19812679290771484, "learning_rate": 4.797442913083539e-06, "loss": 0.0074, "step": 17330 }, { "epoch": 24.31977559607293, "grad_norm": 0.20544840395450592, "learning_rate": 4.7621680611617596e-06, "loss": 0.0052, "step": 17340 }, { "epoch": 24.333800841514726, "grad_norm": 0.16722066700458527, "learning_rate": 4.727016887745095e-06, "loss": 0.0064, "step": 17350 }, { "epoch": 24.347826086956523, "grad_norm": 0.12429093569517136, "learning_rate": 4.691989488935511e-06, "loss": 0.0051, "step": 17360 }, { "epoch": 24.361851332398317, "grad_norm": 0.3147158622741699, "learning_rate": 4.657085960496588e-06, "loss": 0.0068, "step": 17370 }, { "epoch": 24.375876577840113, "grad_norm": 0.3221394717693329, "learning_rate": 4.6223063978532265e-06, "loss": 0.0051, "step": 17380 }, { "epoch": 24.389901823281907, "grad_norm": 0.17557242512702942, "learning_rate": 4.587650896091439e-06, "loss": 0.007, "step": 17390 }, { "epoch": 24.403927068723704, "grad_norm": 0.2569344937801361, "learning_rate": 4.553119549958035e-06, "loss": 0.0051, "step": 17400 }, { "epoch": 24.417952314165497, "grad_norm": 0.12447305023670197, "learning_rate": 4.518712453860385e-06, "loss": 0.0062, "step": 17410 }, { "epoch": 24.431977559607294, "grad_norm": 0.19200433790683746, "learning_rate": 4.484429701866205e-06, "loss": 0.0071, "step": 17420 }, { "epoch": 24.446002805049087, "grad_norm": 0.17701883614063263, "learning_rate": 4.4502713877031975e-06, "loss": 0.0069, "step": 17430 }, { "epoch": 24.460028050490884, "grad_norm": 0.15587218105793, "learning_rate": 4.416237604758911e-06, "loss": 0.0051, "step": 17440 }, { "epoch": 24.474053295932677, "grad_norm": 0.17829012870788574, "learning_rate": 4.3823284460804025e-06, "loss": 0.0059, "step": 17450 }, { "epoch": 24.488078541374474, "grad_norm": 0.2147398144006729, "learning_rate": 4.348544004374011e-06, "loss": 0.0053, "step": 17460 }, { "epoch": 24.50210378681627, "grad_norm": 0.19376811385154724, "learning_rate": 4.314884372005123e-06, "loss": 0.0071, "step": 17470 }, { "epoch": 24.516129032258064, "grad_norm": 0.27625319361686707, "learning_rate": 4.281349640997867e-06, "loss": 0.0049, "step": 17480 }, { "epoch": 24.53015427769986, "grad_norm": 0.14378704130649567, "learning_rate": 4.247939903034942e-06, "loss": 0.0067, "step": 17490 }, { "epoch": 24.544179523141654, "grad_norm": 0.2130182832479477, "learning_rate": 4.214655249457284e-06, "loss": 0.0067, "step": 17500 }, { "epoch": 24.55820476858345, "grad_norm": 0.17118370532989502, "learning_rate": 4.181495771263855e-06, "loss": 0.0054, "step": 17510 }, { "epoch": 24.572230014025244, "grad_norm": 0.1620892435312271, "learning_rate": 4.148461559111427e-06, "loss": 0.0062, "step": 17520 }, { "epoch": 24.58625525946704, "grad_norm": 0.16697004437446594, "learning_rate": 4.115552703314252e-06, "loss": 0.0057, "step": 17530 }, { "epoch": 24.600280504908834, "grad_norm": 0.14689257740974426, "learning_rate": 4.082769293843886e-06, "loss": 0.0048, "step": 17540 }, { "epoch": 24.61430575035063, "grad_norm": 0.1582125425338745, "learning_rate": 4.050111420328939e-06, "loss": 0.0049, "step": 17550 }, { "epoch": 24.628330995792425, "grad_norm": 0.19329874217510223, "learning_rate": 4.017579172054764e-06, "loss": 0.0052, "step": 17560 }, { "epoch": 24.64235624123422, "grad_norm": 0.6873732209205627, "learning_rate": 3.985172637963308e-06, "loss": 0.0056, "step": 17570 }, { "epoch": 24.65638148667602, "grad_norm": 0.20292721688747406, "learning_rate": 3.952891906652784e-06, "loss": 0.0054, "step": 17580 }, { "epoch": 24.67040673211781, "grad_norm": 0.14909766614437103, "learning_rate": 3.920737066377478e-06, "loss": 0.0054, "step": 17590 }, { "epoch": 24.68443197755961, "grad_norm": 0.17100068926811218, "learning_rate": 3.888708205047509e-06, "loss": 0.0067, "step": 17600 }, { "epoch": 24.6984572230014, "grad_norm": 0.08862011134624481, "learning_rate": 3.856805410228542e-06, "loss": 0.0051, "step": 17610 }, { "epoch": 24.7124824684432, "grad_norm": 0.09729751199483871, "learning_rate": 3.82502876914162e-06, "loss": 0.0054, "step": 17620 }, { "epoch": 24.726507713884992, "grad_norm": 0.16381946206092834, "learning_rate": 3.7933783686628586e-06, "loss": 0.0059, "step": 17630 }, { "epoch": 24.74053295932679, "grad_norm": 0.23795203864574432, "learning_rate": 3.7618542953232306e-06, "loss": 0.0055, "step": 17640 }, { "epoch": 24.754558204768582, "grad_norm": 0.5963837504386902, "learning_rate": 3.7304566353083658e-06, "loss": 0.0071, "step": 17650 }, { "epoch": 24.76858345021038, "grad_norm": 0.10808493196964264, "learning_rate": 3.6991854744582555e-06, "loss": 0.0052, "step": 17660 }, { "epoch": 24.782608695652176, "grad_norm": 0.15304909646511078, "learning_rate": 3.6680408982670777e-06, "loss": 0.0047, "step": 17670 }, { "epoch": 24.79663394109397, "grad_norm": 0.13608677685260773, "learning_rate": 3.637022991882899e-06, "loss": 0.0062, "step": 17680 }, { "epoch": 24.810659186535766, "grad_norm": 0.11889146268367767, "learning_rate": 3.606131840107485e-06, "loss": 0.0051, "step": 17690 }, { "epoch": 24.82468443197756, "grad_norm": 0.12367266416549683, "learning_rate": 3.575367527396084e-06, "loss": 0.0063, "step": 17700 }, { "epoch": 24.838709677419356, "grad_norm": 0.6616871356964111, "learning_rate": 3.5447301378571386e-06, "loss": 0.006, "step": 17710 }, { "epoch": 24.85273492286115, "grad_norm": 0.4886166751384735, "learning_rate": 3.514219755252113e-06, "loss": 0.0061, "step": 17720 }, { "epoch": 24.866760168302946, "grad_norm": 0.16492164134979248, "learning_rate": 3.4838364629952213e-06, "loss": 0.0054, "step": 17730 }, { "epoch": 24.88078541374474, "grad_norm": 0.173295259475708, "learning_rate": 3.4535803441532123e-06, "loss": 0.0066, "step": 17740 }, { "epoch": 24.894810659186536, "grad_norm": 0.31970512866973877, "learning_rate": 3.4234514814451836e-06, "loss": 0.0063, "step": 17750 }, { "epoch": 24.90883590462833, "grad_norm": 0.39803874492645264, "learning_rate": 3.393449957242273e-06, "loss": 0.0051, "step": 17760 }, { "epoch": 24.922861150070126, "grad_norm": 0.17242559790611267, "learning_rate": 3.363575853567524e-06, "loss": 0.0058, "step": 17770 }, { "epoch": 24.936886395511923, "grad_norm": 0.4912395775318146, "learning_rate": 3.3338292520955826e-06, "loss": 0.0085, "step": 17780 }, { "epoch": 24.950911640953716, "grad_norm": 0.7508920431137085, "learning_rate": 3.304210234152516e-06, "loss": 0.0073, "step": 17790 }, { "epoch": 24.964936886395513, "grad_norm": 0.228352889418602, "learning_rate": 3.2747188807155993e-06, "loss": 0.0042, "step": 17800 }, { "epoch": 24.978962131837307, "grad_norm": 0.09385259449481964, "learning_rate": 3.2453552724130643e-06, "loss": 0.0066, "step": 17810 }, { "epoch": 24.992987377279103, "grad_norm": 0.14199858903884888, "learning_rate": 3.216119489523889e-06, "loss": 0.0057, "step": 17820 }, { "epoch": 25.007012622720897, "grad_norm": 0.22223864495754242, "learning_rate": 3.1870116119775917e-06, "loss": 0.0075, "step": 17830 }, { "epoch": 25.021037868162693, "grad_norm": 0.21638356149196625, "learning_rate": 3.158031719353999e-06, "loss": 0.0049, "step": 17840 }, { "epoch": 25.035063113604487, "grad_norm": 0.17158985137939453, "learning_rate": 3.1291798908830273e-06, "loss": 0.005, "step": 17850 }, { "epoch": 25.049088359046284, "grad_norm": 0.0929037407040596, "learning_rate": 3.1004562054444853e-06, "loss": 0.0059, "step": 17860 }, { "epoch": 25.063113604488077, "grad_norm": 0.1040598601102829, "learning_rate": 3.071860741567806e-06, "loss": 0.005, "step": 17870 }, { "epoch": 25.077138849929874, "grad_norm": 0.25876984000205994, "learning_rate": 3.04339357743193e-06, "loss": 0.0059, "step": 17880 }, { "epoch": 25.09116409537167, "grad_norm": 0.15042296051979065, "learning_rate": 3.0150547908649628e-06, "loss": 0.0058, "step": 17890 }, { "epoch": 25.105189340813464, "grad_norm": 0.1687903106212616, "learning_rate": 2.9868444593440957e-06, "loss": 0.0052, "step": 17900 }, { "epoch": 25.11921458625526, "grad_norm": 0.24472977221012115, "learning_rate": 2.9587626599952846e-06, "loss": 0.0064, "step": 17910 }, { "epoch": 25.133239831697054, "grad_norm": 0.15147987008094788, "learning_rate": 2.930809469593082e-06, "loss": 0.0054, "step": 17920 }, { "epoch": 25.14726507713885, "grad_norm": 0.11569337546825409, "learning_rate": 2.9029849645604733e-06, "loss": 0.007, "step": 17930 }, { "epoch": 25.161290322580644, "grad_norm": 0.11992765963077545, "learning_rate": 2.8752892209685632e-06, "loss": 0.0055, "step": 17940 }, { "epoch": 25.17531556802244, "grad_norm": 0.16861701011657715, "learning_rate": 2.847722314536483e-06, "loss": 0.0061, "step": 17950 }, { "epoch": 25.189340813464234, "grad_norm": 0.12770086526870728, "learning_rate": 2.820284320631078e-06, "loss": 0.0056, "step": 17960 }, { "epoch": 25.20336605890603, "grad_norm": 0.2314135730266571, "learning_rate": 2.792975314266788e-06, "loss": 0.0048, "step": 17970 }, { "epoch": 25.217391304347824, "grad_norm": 0.2160494029521942, "learning_rate": 2.7657953701054007e-06, "loss": 0.0089, "step": 17980 }, { "epoch": 25.23141654978962, "grad_norm": 0.12879586219787598, "learning_rate": 2.7387445624558306e-06, "loss": 0.0053, "step": 17990 }, { "epoch": 25.245441795231418, "grad_norm": 0.1023537814617157, "learning_rate": 2.7118229652739747e-06, "loss": 0.0049, "step": 18000 }, { "epoch": 25.25946704067321, "grad_norm": 0.18424829840660095, "learning_rate": 2.6850306521624236e-06, "loss": 0.0071, "step": 18010 }, { "epoch": 25.273492286115008, "grad_norm": 0.23073258996009827, "learning_rate": 2.6583676963703507e-06, "loss": 0.0056, "step": 18020 }, { "epoch": 25.2875175315568, "grad_norm": 0.1838628053665161, "learning_rate": 2.631834170793268e-06, "loss": 0.0062, "step": 18030 }, { "epoch": 25.3015427769986, "grad_norm": 0.08284828066825867, "learning_rate": 2.6054301479728036e-06, "loss": 0.0048, "step": 18040 }, { "epoch": 25.31556802244039, "grad_norm": 0.20145320892333984, "learning_rate": 2.579155700096575e-06, "loss": 0.0049, "step": 18050 }, { "epoch": 25.32959326788219, "grad_norm": 0.15119099617004395, "learning_rate": 2.5530108989978873e-06, "loss": 0.0053, "step": 18060 }, { "epoch": 25.34361851332398, "grad_norm": 0.23806370794773102, "learning_rate": 2.5269958161556416e-06, "loss": 0.0067, "step": 18070 }, { "epoch": 25.35764375876578, "grad_norm": 0.191097229719162, "learning_rate": 2.5011105226940888e-06, "loss": 0.0073, "step": 18080 }, { "epoch": 25.371669004207575, "grad_norm": 0.1619851142168045, "learning_rate": 2.4753550893826248e-06, "loss": 0.0076, "step": 18090 }, { "epoch": 25.38569424964937, "grad_norm": 0.23346251249313354, "learning_rate": 2.4497295866356296e-06, "loss": 0.0049, "step": 18100 }, { "epoch": 25.399719495091166, "grad_norm": 0.11106345802545547, "learning_rate": 2.424234084512228e-06, "loss": 0.0062, "step": 18110 }, { "epoch": 25.41374474053296, "grad_norm": 0.1295623928308487, "learning_rate": 2.3988686527161687e-06, "loss": 0.0066, "step": 18120 }, { "epoch": 25.427769985974756, "grad_norm": 0.2167404741048813, "learning_rate": 2.373633360595573e-06, "loss": 0.005, "step": 18130 }, { "epoch": 25.44179523141655, "grad_norm": 0.378950834274292, "learning_rate": 2.3485282771427585e-06, "loss": 0.0074, "step": 18140 }, { "epoch": 25.455820476858346, "grad_norm": 0.09786229580640793, "learning_rate": 2.3235534709940665e-06, "loss": 0.0068, "step": 18150 }, { "epoch": 25.46984572230014, "grad_norm": 0.16675153374671936, "learning_rate": 2.2987090104296617e-06, "loss": 0.0049, "step": 18160 }, { "epoch": 25.483870967741936, "grad_norm": 0.15530745685100555, "learning_rate": 2.273994963373355e-06, "loss": 0.0049, "step": 18170 }, { "epoch": 25.49789621318373, "grad_norm": 0.13479913771152496, "learning_rate": 2.249411397392409e-06, "loss": 0.0049, "step": 18180 }, { "epoch": 25.511921458625526, "grad_norm": 0.10461646318435669, "learning_rate": 2.2249583796973506e-06, "loss": 0.0055, "step": 18190 }, { "epoch": 25.525946704067323, "grad_norm": 0.09507299959659576, "learning_rate": 2.200635977141796e-06, "loss": 0.006, "step": 18200 }, { "epoch": 25.539971949509116, "grad_norm": 0.126841738820076, "learning_rate": 2.17644425622226e-06, "loss": 0.007, "step": 18210 }, { "epoch": 25.553997194950913, "grad_norm": 0.11255265027284622, "learning_rate": 2.152383283077991e-06, "loss": 0.0061, "step": 18220 }, { "epoch": 25.568022440392706, "grad_norm": 0.11809038370847702, "learning_rate": 2.128453123490781e-06, "loss": 0.0062, "step": 18230 }, { "epoch": 25.582047685834503, "grad_norm": 0.13456396758556366, "learning_rate": 2.1046538428847462e-06, "loss": 0.0055, "step": 18240 }, { "epoch": 25.596072931276296, "grad_norm": 0.17136378586292267, "learning_rate": 2.0809855063262273e-06, "loss": 0.007, "step": 18250 }, { "epoch": 25.610098176718093, "grad_norm": 0.12171522527933121, "learning_rate": 2.057448178523558e-06, "loss": 0.0065, "step": 18260 }, { "epoch": 25.624123422159887, "grad_norm": 0.12996907532215118, "learning_rate": 2.034041923826885e-06, "loss": 0.0053, "step": 18270 }, { "epoch": 25.638148667601683, "grad_norm": 0.14325179159641266, "learning_rate": 2.0107668062280204e-06, "loss": 0.0058, "step": 18280 }, { "epoch": 25.652173913043477, "grad_norm": 0.09056848287582397, "learning_rate": 1.9876228893602357e-06, "loss": 0.0053, "step": 18290 }, { "epoch": 25.666199158485274, "grad_norm": 0.21342597901821136, "learning_rate": 1.9646102364981266e-06, "loss": 0.0058, "step": 18300 }, { "epoch": 25.68022440392707, "grad_norm": 0.07170359045267105, "learning_rate": 1.9417289105574053e-06, "loss": 0.0047, "step": 18310 }, { "epoch": 25.694249649368864, "grad_norm": 0.11547553539276123, "learning_rate": 1.9189789740947427e-06, "loss": 0.0059, "step": 18320 }, { "epoch": 25.70827489481066, "grad_norm": 0.25017064809799194, "learning_rate": 1.896360489307597e-06, "loss": 0.0076, "step": 18330 }, { "epoch": 25.722300140252454, "grad_norm": 0.17867837846279144, "learning_rate": 1.8738735180340362e-06, "loss": 0.0059, "step": 18340 }, { "epoch": 25.73632538569425, "grad_norm": 0.11420347541570663, "learning_rate": 1.8515181217525824e-06, "loss": 0.0063, "step": 18350 }, { "epoch": 25.750350631136044, "grad_norm": 0.23834669589996338, "learning_rate": 1.8292943615820457e-06, "loss": 0.006, "step": 18360 }, { "epoch": 25.76437587657784, "grad_norm": 0.14417681097984314, "learning_rate": 1.8072022982813296e-06, "loss": 0.0048, "step": 18370 }, { "epoch": 25.778401122019634, "grad_norm": 0.10893117636442184, "learning_rate": 1.7852419922492925e-06, "loss": 0.004, "step": 18380 }, { "epoch": 25.79242636746143, "grad_norm": 0.13436582684516907, "learning_rate": 1.763413503524569e-06, "loss": 0.0056, "step": 18390 }, { "epoch": 25.806451612903224, "grad_norm": 0.19151785969734192, "learning_rate": 1.7417168917854165e-06, "loss": 0.006, "step": 18400 }, { "epoch": 25.82047685834502, "grad_norm": 0.09058187156915665, "learning_rate": 1.720152216349552e-06, "loss": 0.0045, "step": 18410 }, { "epoch": 25.834502103786818, "grad_norm": 0.11893931776285172, "learning_rate": 1.6987195361739595e-06, "loss": 0.005, "step": 18420 }, { "epoch": 25.84852734922861, "grad_norm": 0.3219398260116577, "learning_rate": 1.6774189098547832e-06, "loss": 0.0053, "step": 18430 }, { "epoch": 25.862552594670408, "grad_norm": 0.094037264585495, "learning_rate": 1.6562503956271069e-06, "loss": 0.0055, "step": 18440 }, { "epoch": 25.8765778401122, "grad_norm": 0.11405256390571594, "learning_rate": 1.6352140513648417e-06, "loss": 0.0067, "step": 18450 }, { "epoch": 25.890603085553998, "grad_norm": 0.17566120624542236, "learning_rate": 1.6143099345805712e-06, "loss": 0.0059, "step": 18460 }, { "epoch": 25.90462833099579, "grad_norm": 0.07137107104063034, "learning_rate": 1.5935381024253293e-06, "loss": 0.006, "step": 18470 }, { "epoch": 25.91865357643759, "grad_norm": 0.22540970146656036, "learning_rate": 1.572898611688517e-06, "loss": 0.0052, "step": 18480 }, { "epoch": 25.93267882187938, "grad_norm": 0.11796361207962036, "learning_rate": 1.5523915187977133e-06, "loss": 0.0041, "step": 18490 }, { "epoch": 25.94670406732118, "grad_norm": 0.140743687748909, "learning_rate": 1.532016879818532e-06, "loss": 0.0045, "step": 18500 }, { "epoch": 25.96072931276297, "grad_norm": 0.15638090670108795, "learning_rate": 1.51177475045447e-06, "loss": 0.008, "step": 18510 }, { "epoch": 25.97475455820477, "grad_norm": 0.15216441452503204, "learning_rate": 1.4916651860467035e-06, "loss": 0.0054, "step": 18520 }, { "epoch": 25.988779803646565, "grad_norm": 0.3197179436683655, "learning_rate": 1.471688241574043e-06, "loss": 0.0056, "step": 18530 }, { "epoch": 26.00280504908836, "grad_norm": 0.22342169284820557, "learning_rate": 1.451843971652672e-06, "loss": 0.006, "step": 18540 }, { "epoch": 26.016830294530155, "grad_norm": 0.06622723489999771, "learning_rate": 1.432132430536076e-06, "loss": 0.0048, "step": 18550 }, { "epoch": 26.03085553997195, "grad_norm": 0.13959047198295593, "learning_rate": 1.412553672114869e-06, "loss": 0.0059, "step": 18560 }, { "epoch": 26.044880785413746, "grad_norm": 0.17865392565727234, "learning_rate": 1.3931077499166056e-06, "loss": 0.0061, "step": 18570 }, { "epoch": 26.05890603085554, "grad_norm": 0.18124058842658997, "learning_rate": 1.3737947171057085e-06, "loss": 0.0049, "step": 18580 }, { "epoch": 26.072931276297336, "grad_norm": 0.12035934627056122, "learning_rate": 1.3546146264832582e-06, "loss": 0.0047, "step": 18590 }, { "epoch": 26.08695652173913, "grad_norm": 0.0953848734498024, "learning_rate": 1.3355675304869086e-06, "loss": 0.0043, "step": 18600 }, { "epoch": 26.100981767180926, "grad_norm": 0.11604516208171844, "learning_rate": 1.3166534811906827e-06, "loss": 0.0057, "step": 18610 }, { "epoch": 26.11500701262272, "grad_norm": 0.13251323997974396, "learning_rate": 1.2978725303048666e-06, "loss": 0.0051, "step": 18620 }, { "epoch": 26.129032258064516, "grad_norm": 0.1484556496143341, "learning_rate": 1.2792247291758762e-06, "loss": 0.007, "step": 18630 }, { "epoch": 26.143057503506313, "grad_norm": 0.1012941375374794, "learning_rate": 1.2607101287860635e-06, "loss": 0.0066, "step": 18640 }, { "epoch": 26.157082748948106, "grad_norm": 0.15221020579338074, "learning_rate": 1.2423287797536654e-06, "loss": 0.0066, "step": 18650 }, { "epoch": 26.171107994389903, "grad_norm": 0.14325091242790222, "learning_rate": 1.2240807323325776e-06, "loss": 0.0045, "step": 18660 }, { "epoch": 26.185133239831696, "grad_norm": 0.25232720375061035, "learning_rate": 1.205966036412254e-06, "loss": 0.0053, "step": 18670 }, { "epoch": 26.199158485273493, "grad_norm": 0.2093740701675415, "learning_rate": 1.1879847415175949e-06, "loss": 0.0047, "step": 18680 }, { "epoch": 26.213183730715286, "grad_norm": 0.2265750616788864, "learning_rate": 1.1701368968087712e-06, "loss": 0.0041, "step": 18690 }, { "epoch": 26.227208976157083, "grad_norm": 0.5609393119812012, "learning_rate": 1.1524225510811116e-06, "loss": 0.0067, "step": 18700 }, { "epoch": 26.241234221598877, "grad_norm": 0.13895270228385925, "learning_rate": 1.1348417527649535e-06, "loss": 0.0065, "step": 18710 }, { "epoch": 26.255259467040673, "grad_norm": 0.10316239297389984, "learning_rate": 1.1173945499255268e-06, "loss": 0.0045, "step": 18720 }, { "epoch": 26.269284712482467, "grad_norm": 0.09768860042095184, "learning_rate": 1.1000809902628307e-06, "loss": 0.0046, "step": 18730 }, { "epoch": 26.283309957924264, "grad_norm": 0.16260455548763275, "learning_rate": 1.082901121111468e-06, "loss": 0.0044, "step": 18740 }, { "epoch": 26.29733520336606, "grad_norm": 0.24753205478191376, "learning_rate": 1.0658549894405456e-06, "loss": 0.0071, "step": 18750 }, { "epoch": 26.311360448807854, "grad_norm": 0.17114342749118805, "learning_rate": 1.0489426418535342e-06, "loss": 0.005, "step": 18760 }, { "epoch": 26.32538569424965, "grad_norm": 0.18181368708610535, "learning_rate": 1.0321641245881474e-06, "loss": 0.0044, "step": 18770 }, { "epoch": 26.339410939691444, "grad_norm": 0.1415439397096634, "learning_rate": 1.015519483516214e-06, "loss": 0.0037, "step": 18780 }, { "epoch": 26.35343618513324, "grad_norm": 0.10086134076118469, "learning_rate": 9.990087641435443e-07, "loss": 0.0048, "step": 18790 }, { "epoch": 26.367461430575034, "grad_norm": 0.1914493292570114, "learning_rate": 9.826320116098132e-07, "loss": 0.0052, "step": 18800 }, { "epoch": 26.38148667601683, "grad_norm": 0.24595479667186737, "learning_rate": 9.663892706884447e-07, "loss": 0.004, "step": 18810 }, { "epoch": 26.395511921458624, "grad_norm": 0.22313730418682098, "learning_rate": 9.502805857864616e-07, "loss": 0.0063, "step": 18820 }, { "epoch": 26.40953716690042, "grad_norm": 0.3261687457561493, "learning_rate": 9.34306000944396e-07, "loss": 0.0047, "step": 18830 }, { "epoch": 26.423562412342218, "grad_norm": 0.18756602704524994, "learning_rate": 9.184655598361624e-07, "loss": 0.0067, "step": 18840 }, { "epoch": 26.43758765778401, "grad_norm": 0.12422969937324524, "learning_rate": 9.027593057689076e-07, "loss": 0.0064, "step": 18850 }, { "epoch": 26.451612903225808, "grad_norm": 0.14848250150680542, "learning_rate": 8.871872816829441e-07, "loss": 0.0055, "step": 18860 }, { "epoch": 26.4656381486676, "grad_norm": 0.14649014174938202, "learning_rate": 8.717495301515777e-07, "loss": 0.0048, "step": 18870 }, { "epoch": 26.479663394109398, "grad_norm": 0.09324609488248825, "learning_rate": 8.564460933810415e-07, "loss": 0.0041, "step": 18880 }, { "epoch": 26.49368863955119, "grad_norm": 0.15295130014419556, "learning_rate": 8.412770132103453e-07, "loss": 0.0038, "step": 18890 }, { "epoch": 26.507713884992988, "grad_norm": 0.08609650284051895, "learning_rate": 8.262423311111711e-07, "loss": 0.0062, "step": 18900 }, { "epoch": 26.52173913043478, "grad_norm": 0.1464366763830185, "learning_rate": 8.113420881877665e-07, "loss": 0.0066, "step": 18910 }, { "epoch": 26.535764375876578, "grad_norm": 0.1295040398836136, "learning_rate": 7.965763251768288e-07, "loss": 0.0052, "step": 18920 }, { "epoch": 26.54978962131837, "grad_norm": 0.08843471109867096, "learning_rate": 7.819450824473995e-07, "loss": 0.0056, "step": 18930 }, { "epoch": 26.56381486676017, "grad_norm": 0.11945833265781403, "learning_rate": 7.674484000007198e-07, "loss": 0.0062, "step": 18940 }, { "epoch": 26.577840112201965, "grad_norm": 0.1147742047905922, "learning_rate": 7.530863174701752e-07, "loss": 0.0056, "step": 18950 }, { "epoch": 26.59186535764376, "grad_norm": 0.3011709749698639, "learning_rate": 7.38858874121151e-07, "loss": 0.0057, "step": 18960 }, { "epoch": 26.605890603085555, "grad_norm": 0.11512934416532516, "learning_rate": 7.247661088509328e-07, "loss": 0.0056, "step": 18970 }, { "epoch": 26.61991584852735, "grad_norm": 0.25181132555007935, "learning_rate": 7.108080601886002e-07, "loss": 0.006, "step": 18980 }, { "epoch": 26.633941093969145, "grad_norm": 0.17628054320812225, "learning_rate": 6.969847662949336e-07, "loss": 0.0049, "step": 18990 }, { "epoch": 26.64796633941094, "grad_norm": 0.11330592632293701, "learning_rate": 6.832962649622798e-07, "loss": 0.007, "step": 19000 }, { "epoch": 26.661991584852736, "grad_norm": 0.24960383772850037, "learning_rate": 6.697425936144863e-07, "loss": 0.0064, "step": 19010 }, { "epoch": 26.67601683029453, "grad_norm": 0.24250544607639313, "learning_rate": 6.563237893067731e-07, "loss": 0.0055, "step": 19020 }, { "epoch": 26.690042075736326, "grad_norm": 0.12342794984579086, "learning_rate": 6.430398887256328e-07, "loss": 0.0056, "step": 19030 }, { "epoch": 26.70406732117812, "grad_norm": 0.17919547855854034, "learning_rate": 6.298909281887478e-07, "loss": 0.0058, "step": 19040 }, { "epoch": 26.718092566619916, "grad_norm": 0.10655045509338379, "learning_rate": 6.168769436448673e-07, "loss": 0.0056, "step": 19050 }, { "epoch": 26.732117812061713, "grad_norm": 0.1881064772605896, "learning_rate": 6.03997970673742e-07, "loss": 0.0041, "step": 19060 }, { "epoch": 26.746143057503506, "grad_norm": 0.13836920261383057, "learning_rate": 5.912540444859782e-07, "loss": 0.0047, "step": 19070 }, { "epoch": 26.760168302945303, "grad_norm": 0.12139743566513062, "learning_rate": 5.786451999229837e-07, "loss": 0.0052, "step": 19080 }, { "epoch": 26.774193548387096, "grad_norm": 0.10941914469003677, "learning_rate": 5.661714714568722e-07, "loss": 0.0053, "step": 19090 }, { "epoch": 26.788218793828893, "grad_norm": 0.12025491148233414, "learning_rate": 5.538328931903259e-07, "loss": 0.0079, "step": 19100 }, { "epoch": 26.802244039270686, "grad_norm": 0.22854135930538177, "learning_rate": 5.416294988565551e-07, "loss": 0.0055, "step": 19110 }, { "epoch": 26.816269284712483, "grad_norm": 0.10310178250074387, "learning_rate": 5.29561321819172e-07, "loss": 0.0044, "step": 19120 }, { "epoch": 26.830294530154276, "grad_norm": 0.11622610688209534, "learning_rate": 5.176283950721061e-07, "loss": 0.0051, "step": 19130 }, { "epoch": 26.844319775596073, "grad_norm": 0.1699177622795105, "learning_rate": 5.058307512395332e-07, "loss": 0.0064, "step": 19140 }, { "epoch": 26.858345021037866, "grad_norm": 0.15938813984394073, "learning_rate": 4.941684225757526e-07, "loss": 0.0044, "step": 19150 }, { "epoch": 26.872370266479663, "grad_norm": 0.17857269942760468, "learning_rate": 4.826414409651314e-07, "loss": 0.0047, "step": 19160 }, { "epoch": 26.88639551192146, "grad_norm": 0.15329059958457947, "learning_rate": 4.712498379219943e-07, "loss": 0.0053, "step": 19170 }, { "epoch": 26.900420757363253, "grad_norm": 0.07742517441511154, "learning_rate": 4.599936445905506e-07, "loss": 0.0045, "step": 19180 }, { "epoch": 26.91444600280505, "grad_norm": 0.1650732308626175, "learning_rate": 4.4887289174480594e-07, "loss": 0.0059, "step": 19190 }, { "epoch": 26.928471248246844, "grad_norm": 0.1655762642621994, "learning_rate": 4.378876097884621e-07, "loss": 0.0059, "step": 19200 }, { "epoch": 26.94249649368864, "grad_norm": 0.09923025220632553, "learning_rate": 4.2703782875487264e-07, "loss": 0.0049, "step": 19210 }, { "epoch": 26.956521739130434, "grad_norm": 0.2799999713897705, "learning_rate": 4.163235783069208e-07, "loss": 0.0051, "step": 19220 }, { "epoch": 26.97054698457223, "grad_norm": 0.12660634517669678, "learning_rate": 4.057448877369585e-07, "loss": 0.0043, "step": 19230 }, { "epoch": 26.984572230014024, "grad_norm": 0.15346185863018036, "learning_rate": 3.9530178596672295e-07, "loss": 0.0041, "step": 19240 }, { "epoch": 26.99859747545582, "grad_norm": 0.15344399213790894, "learning_rate": 3.849943015472479e-07, "loss": 0.0052, "step": 19250 }, { "epoch": 27.012622720897614, "grad_norm": 0.14345623552799225, "learning_rate": 3.748224626588137e-07, "loss": 0.0062, "step": 19260 }, { "epoch": 27.02664796633941, "grad_norm": 0.07607021182775497, "learning_rate": 3.647862971108307e-07, "loss": 0.0063, "step": 19270 }, { "epoch": 27.040673211781208, "grad_norm": 0.14317457377910614, "learning_rate": 3.5488583234179473e-07, "loss": 0.0046, "step": 19280 }, { "epoch": 27.054698457223, "grad_norm": 0.12060989439487457, "learning_rate": 3.4512109541920413e-07, "loss": 0.0044, "step": 19290 }, { "epoch": 27.068723702664798, "grad_norm": 0.10588216036558151, "learning_rate": 3.354921130394706e-07, "loss": 0.0063, "step": 19300 }, { "epoch": 27.08274894810659, "grad_norm": 0.08687908202409744, "learning_rate": 3.259989115278639e-07, "loss": 0.0054, "step": 19310 }, { "epoch": 27.096774193548388, "grad_norm": 0.08043937385082245, "learning_rate": 3.1664151683843403e-07, "loss": 0.0039, "step": 19320 }, { "epoch": 27.11079943899018, "grad_norm": 0.24890583753585815, "learning_rate": 3.074199545539447e-07, "loss": 0.0052, "step": 19330 }, { "epoch": 27.124824684431978, "grad_norm": 0.38645756244659424, "learning_rate": 2.983342498857955e-07, "loss": 0.0053, "step": 19340 }, { "epoch": 27.13884992987377, "grad_norm": 0.16610848903656006, "learning_rate": 2.893844276739499e-07, "loss": 0.0056, "step": 19350 }, { "epoch": 27.152875175315568, "grad_norm": 0.11333078891038895, "learning_rate": 2.8057051238688514e-07, "loss": 0.0061, "step": 19360 }, { "epoch": 27.166900420757365, "grad_norm": 0.10864590853452682, "learning_rate": 2.71892528121509e-07, "loss": 0.0045, "step": 19370 }, { "epoch": 27.18092566619916, "grad_norm": 0.10376414656639099, "learning_rate": 2.633504986030988e-07, "loss": 0.0052, "step": 19380 }, { "epoch": 27.194950911640955, "grad_norm": 0.09328550100326538, "learning_rate": 2.549444471852347e-07, "loss": 0.0053, "step": 19390 }, { "epoch": 27.20897615708275, "grad_norm": 0.24033282697200775, "learning_rate": 2.4667439684974423e-07, "loss": 0.0051, "step": 19400 }, { "epoch": 27.223001402524545, "grad_norm": 0.17933455109596252, "learning_rate": 2.3854037020662467e-07, "loss": 0.0072, "step": 19410 }, { "epoch": 27.23702664796634, "grad_norm": 0.15358075499534607, "learning_rate": 2.3054238949399288e-07, "loss": 0.0048, "step": 19420 }, { "epoch": 27.251051893408135, "grad_norm": 0.18920458853244781, "learning_rate": 2.2268047657802993e-07, "loss": 0.0061, "step": 19430 }, { "epoch": 27.26507713884993, "grad_norm": 0.19364076852798462, "learning_rate": 2.149546529529034e-07, "loss": 0.0049, "step": 19440 }, { "epoch": 27.279102384291726, "grad_norm": 0.10651536285877228, "learning_rate": 2.0736493974071736e-07, "loss": 0.0051, "step": 19450 }, { "epoch": 27.29312762973352, "grad_norm": 0.09404205530881882, "learning_rate": 1.9991135769145686e-07, "loss": 0.0044, "step": 19460 }, { "epoch": 27.307152875175316, "grad_norm": 0.10683480650186539, "learning_rate": 1.9259392718293245e-07, "loss": 0.0055, "step": 19470 }, { "epoch": 27.321178120617112, "grad_norm": 0.16717615723609924, "learning_rate": 1.8541266822072467e-07, "loss": 0.0055, "step": 19480 }, { "epoch": 27.335203366058906, "grad_norm": 0.10378362238407135, "learning_rate": 1.7836760043811184e-07, "loss": 0.0049, "step": 19490 }, { "epoch": 27.349228611500703, "grad_norm": 0.09860067814588547, "learning_rate": 1.7145874309604792e-07, "loss": 0.0067, "step": 19500 }, { "epoch": 27.363253856942496, "grad_norm": 0.10378140956163406, "learning_rate": 1.6468611508308474e-07, "loss": 0.0042, "step": 19510 }, { "epoch": 27.377279102384293, "grad_norm": 0.16751717031002045, "learning_rate": 1.5804973491532204e-07, "loss": 0.0044, "step": 19520 }, { "epoch": 27.391304347826086, "grad_norm": 0.19904761016368866, "learning_rate": 1.5154962073637424e-07, "loss": 0.0069, "step": 19530 }, { "epoch": 27.405329593267883, "grad_norm": 0.164788156747818, "learning_rate": 1.4518579031730372e-07, "loss": 0.0044, "step": 19540 }, { "epoch": 27.419354838709676, "grad_norm": 0.17450521886348724, "learning_rate": 1.389582610565876e-07, "loss": 0.0053, "step": 19550 }, { "epoch": 27.433380084151473, "grad_norm": 0.13234011828899384, "learning_rate": 1.3286704998003995e-07, "loss": 0.0051, "step": 19560 }, { "epoch": 27.447405329593266, "grad_norm": 0.1500682234764099, "learning_rate": 1.2691217374080632e-07, "loss": 0.0056, "step": 19570 }, { "epoch": 27.461430575035063, "grad_norm": 0.07558313012123108, "learning_rate": 1.2109364861929705e-07, "loss": 0.0046, "step": 19580 }, { "epoch": 27.47545582047686, "grad_norm": 0.12733475863933563, "learning_rate": 1.1541149052312628e-07, "loss": 0.0061, "step": 19590 }, { "epoch": 27.489481065918653, "grad_norm": 0.11199061572551727, "learning_rate": 1.0986571498710074e-07, "loss": 0.0037, "step": 19600 }, { "epoch": 27.50350631136045, "grad_norm": 0.12610875070095062, "learning_rate": 1.0445633717316438e-07, "loss": 0.0053, "step": 19610 }, { "epoch": 27.517531556802243, "grad_norm": 0.09795690327882767, "learning_rate": 9.918337187034277e-08, "loss": 0.0062, "step": 19620 }, { "epoch": 27.53155680224404, "grad_norm": 0.09941105544567108, "learning_rate": 9.404683349472643e-08, "loss": 0.0045, "step": 19630 }, { "epoch": 27.545582047685834, "grad_norm": 0.15591807663440704, "learning_rate": 8.904673608940983e-08, "loss": 0.005, "step": 19640 }, { "epoch": 27.55960729312763, "grad_norm": 0.14267446100711823, "learning_rate": 8.418309332447471e-08, "loss": 0.004, "step": 19650 }, { "epoch": 27.573632538569424, "grad_norm": 0.13384677469730377, "learning_rate": 7.945591849692902e-08, "loss": 0.007, "step": 19660 }, { "epoch": 27.58765778401122, "grad_norm": 0.11289151012897491, "learning_rate": 7.486522453069578e-08, "loss": 0.0051, "step": 19670 }, { "epoch": 27.601683029453014, "grad_norm": 0.1474321037530899, "learning_rate": 7.041102397655208e-08, "loss": 0.0045, "step": 19680 }, { "epoch": 27.61570827489481, "grad_norm": 0.15024080872535706, "learning_rate": 6.609332901210685e-08, "loss": 0.0058, "step": 19690 }, { "epoch": 27.629733520336607, "grad_norm": 0.14180392026901245, "learning_rate": 6.191215144178419e-08, "loss": 0.0065, "step": 19700 }, { "epoch": 27.6437587657784, "grad_norm": 0.16007158160209656, "learning_rate": 5.786750269675678e-08, "loss": 0.0048, "step": 19710 }, { "epoch": 27.657784011220198, "grad_norm": 0.24490191042423248, "learning_rate": 5.395939383494031e-08, "loss": 0.0042, "step": 19720 }, { "epoch": 27.67180925666199, "grad_norm": 0.11154283583164215, "learning_rate": 5.018783554095463e-08, "loss": 0.0056, "step": 19730 }, { "epoch": 27.685834502103788, "grad_norm": 0.17513613402843475, "learning_rate": 4.655283812610156e-08, "loss": 0.0057, "step": 19740 }, { "epoch": 27.69985974754558, "grad_norm": 0.1378246396780014, "learning_rate": 4.305441152831491e-08, "loss": 0.0068, "step": 19750 }, { "epoch": 27.713884992987378, "grad_norm": 0.18052566051483154, "learning_rate": 3.9692565312171584e-08, "loss": 0.0051, "step": 19760 }, { "epoch": 27.72791023842917, "grad_norm": 0.1126767098903656, "learning_rate": 3.6467308668824975e-08, "loss": 0.0051, "step": 19770 }, { "epoch": 27.741935483870968, "grad_norm": 0.12774744629859924, "learning_rate": 3.3378650416004964e-08, "loss": 0.0047, "step": 19780 }, { "epoch": 27.75596072931276, "grad_norm": 0.1553681343793869, "learning_rate": 3.042659899797906e-08, "loss": 0.0068, "step": 19790 }, { "epoch": 27.769985974754558, "grad_norm": 0.26514869928359985, "learning_rate": 2.76111624855524e-08, "loss": 0.0054, "step": 19800 }, { "epoch": 27.784011220196355, "grad_norm": 0.102738156914711, "learning_rate": 2.4932348576017784e-08, "loss": 0.0046, "step": 19810 }, { "epoch": 27.79803646563815, "grad_norm": 0.15002243220806122, "learning_rate": 2.239016459314458e-08, "loss": 0.0042, "step": 19820 }, { "epoch": 27.812061711079945, "grad_norm": 0.12290273606777191, "learning_rate": 1.9984617487173174e-08, "loss": 0.0056, "step": 19830 }, { "epoch": 27.82608695652174, "grad_norm": 0.20850427448749542, "learning_rate": 1.7715713834776105e-08, "loss": 0.0052, "step": 19840 }, { "epoch": 27.840112201963535, "grad_norm": 0.188346728682518, "learning_rate": 1.5583459839046964e-08, "loss": 0.0062, "step": 19850 }, { "epoch": 27.85413744740533, "grad_norm": 0.14033252000808716, "learning_rate": 1.3587861329489304e-08, "loss": 0.0053, "step": 19860 }, { "epoch": 27.868162692847125, "grad_norm": 0.13280856609344482, "learning_rate": 1.1728923761994415e-08, "loss": 0.0046, "step": 19870 }, { "epoch": 27.88218793828892, "grad_norm": 0.2080083042383194, "learning_rate": 1.0006652218819135e-08, "loss": 0.0086, "step": 19880 }, { "epoch": 27.896213183730715, "grad_norm": 0.13207824528217316, "learning_rate": 8.421051408596947e-09, "loss": 0.0055, "step": 19890 }, { "epoch": 27.91023842917251, "grad_norm": 0.11125652492046356, "learning_rate": 6.972125666299123e-09, "loss": 0.0055, "step": 19900 }, { "epoch": 27.924263674614306, "grad_norm": 0.1703345775604248, "learning_rate": 5.659878953229169e-09, "loss": 0.0068, "step": 19910 }, { "epoch": 27.938288920056102, "grad_norm": 0.16810546815395355, "learning_rate": 4.48431485701728e-09, "loss": 0.0057, "step": 19920 }, { "epoch": 27.952314165497896, "grad_norm": 0.2110414206981659, "learning_rate": 3.4454365916203322e-09, "loss": 0.0071, "step": 19930 }, { "epoch": 27.966339410939693, "grad_norm": 0.09067033231258392, "learning_rate": 2.5432469972830332e-09, "loss": 0.0054, "step": 19940 }, { "epoch": 27.980364656381486, "grad_norm": 0.13856792449951172, "learning_rate": 1.7777485405601203e-09, "loss": 0.0053, "step": 19950 }, { "epoch": 27.994389901823283, "grad_norm": 0.16945311427116394, "learning_rate": 1.1489433142941597e-09, "loss": 0.005, "step": 19960 }, { "epoch": 28.008415147265076, "grad_norm": 0.17793531715869904, "learning_rate": 6.568330376210963e-10, "loss": 0.005, "step": 19970 }, { "epoch": 28.022440392706873, "grad_norm": 0.10413944721221924, "learning_rate": 3.0141905594249787e-10, "loss": 0.0057, "step": 19980 }, { "epoch": 28.036465638148666, "grad_norm": 0.5468075275421143, "learning_rate": 8.270234094776008e-11, "loss": 0.0057, "step": 19990 }, { "epoch": 28.050490883590463, "grad_norm": 0.18732772767543793, "learning_rate": 6.834906085551041e-13, "loss": 0.0047, "step": 20000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 29, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 48, "trial_name": null, "trial_params": null }