{ "best_global_step": 20000, "best_metric": 2.3699159622192383, "best_model_checkpoint": "saves/qwen2.5_3d/full/sft_7b_stage1_abl_lr/checkpoint-18000", "epoch": 2.9999080561225426, "eval_steps": 1000, "global_step": 20391, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001471102039315202, "grad_norm": 66.26342895448909, "learning_rate": 8.823529411764707e-08, "loss": 12.6904, "step": 10 }, { "epoch": 0.002942204078630404, "grad_norm": 62.09257322477435, "learning_rate": 1.8627450980392158e-07, "loss": 12.684, "step": 20 }, { "epoch": 0.004413306117945606, "grad_norm": 66.64704818506358, "learning_rate": 2.843137254901961e-07, "loss": 12.5976, "step": 30 }, { "epoch": 0.005884408157260808, "grad_norm": 63.44217139400408, "learning_rate": 3.8235294117647064e-07, "loss": 12.3092, "step": 40 }, { "epoch": 0.00735551019657601, "grad_norm": 59.991918948144864, "learning_rate": 4.803921568627452e-07, "loss": 11.4393, "step": 50 }, { "epoch": 0.008826612235891212, "grad_norm": 47.95271239722775, "learning_rate": 5.784313725490197e-07, "loss": 10.3303, "step": 60 }, { "epoch": 0.010297714275206414, "grad_norm": 37.35573566100412, "learning_rate": 6.764705882352941e-07, "loss": 8.3919, "step": 70 }, { "epoch": 0.011768816314521617, "grad_norm": 32.390860532432285, "learning_rate": 7.745098039215687e-07, "loss": 7.3661, "step": 80 }, { "epoch": 0.013239918353836817, "grad_norm": 31.32922806489914, "learning_rate": 8.725490196078432e-07, "loss": 6.5133, "step": 90 }, { "epoch": 0.01471102039315202, "grad_norm": 25.772925681561798, "learning_rate": 9.705882352941176e-07, "loss": 5.9613, "step": 100 }, { "epoch": 0.016182122432467222, "grad_norm": 25.481183834486995, "learning_rate": 1.0686274509803922e-06, "loss": 5.6447, "step": 110 }, { "epoch": 0.017653224471782424, "grad_norm": 24.640098498924495, "learning_rate": 1.1666666666666668e-06, "loss": 5.5362, "step": 120 }, { "epoch": 0.019124326511097627, "grad_norm": 23.060207746990525, "learning_rate": 1.2647058823529412e-06, "loss": 5.2691, "step": 130 }, { "epoch": 0.02059542855041283, "grad_norm": 22.333982608297, "learning_rate": 1.3627450980392157e-06, "loss": 5.0877, "step": 140 }, { "epoch": 0.02206653058972803, "grad_norm": 21.102818708630007, "learning_rate": 1.4607843137254903e-06, "loss": 5.0074, "step": 150 }, { "epoch": 0.023537632629043233, "grad_norm": 21.761777478005047, "learning_rate": 1.5588235294117649e-06, "loss": 4.8362, "step": 160 }, { "epoch": 0.025008734668358432, "grad_norm": 20.7883689634105, "learning_rate": 1.6568627450980392e-06, "loss": 4.7253, "step": 170 }, { "epoch": 0.026479836707673635, "grad_norm": 19.17544803199464, "learning_rate": 1.7549019607843138e-06, "loss": 4.7386, "step": 180 }, { "epoch": 0.027950938746988837, "grad_norm": 19.573645643496196, "learning_rate": 1.8529411764705884e-06, "loss": 4.6134, "step": 190 }, { "epoch": 0.02942204078630404, "grad_norm": 19.783930495686683, "learning_rate": 1.950980392156863e-06, "loss": 4.5944, "step": 200 }, { "epoch": 0.03089314282561924, "grad_norm": 17.358721935506342, "learning_rate": 2.0490196078431373e-06, "loss": 4.4126, "step": 210 }, { "epoch": 0.032364244864934444, "grad_norm": 17.665369005751753, "learning_rate": 2.1470588235294117e-06, "loss": 4.3401, "step": 220 }, { "epoch": 0.03383534690424964, "grad_norm": 16.62119680497179, "learning_rate": 2.2450980392156864e-06, "loss": 4.3538, "step": 230 }, { "epoch": 0.03530644894356485, "grad_norm": 15.716542913448349, "learning_rate": 2.343137254901961e-06, "loss": 4.3953, "step": 240 }, { "epoch": 0.03677755098288005, "grad_norm": 15.004729203857071, "learning_rate": 2.4411764705882356e-06, "loss": 4.2296, "step": 250 }, { "epoch": 0.03824865302219525, "grad_norm": 14.245531492645952, "learning_rate": 2.53921568627451e-06, "loss": 4.2308, "step": 260 }, { "epoch": 0.03971975506151045, "grad_norm": 13.871955518399535, "learning_rate": 2.6372549019607847e-06, "loss": 4.1817, "step": 270 }, { "epoch": 0.04119085710082566, "grad_norm": 12.329399865745838, "learning_rate": 2.7352941176470595e-06, "loss": 4.113, "step": 280 }, { "epoch": 0.04266195914014086, "grad_norm": 12.492057630850423, "learning_rate": 2.8333333333333335e-06, "loss": 4.0076, "step": 290 }, { "epoch": 0.04413306117945606, "grad_norm": 11.666813776228647, "learning_rate": 2.9313725490196082e-06, "loss": 3.9904, "step": 300 }, { "epoch": 0.04560416321877126, "grad_norm": 11.78667420479183, "learning_rate": 3.0294117647058826e-06, "loss": 3.9312, "step": 310 }, { "epoch": 0.04707526525808647, "grad_norm": 10.929653329305472, "learning_rate": 3.127450980392157e-06, "loss": 3.957, "step": 320 }, { "epoch": 0.048546367297401666, "grad_norm": 10.504111351202095, "learning_rate": 3.2254901960784317e-06, "loss": 3.8737, "step": 330 }, { "epoch": 0.050017469336716865, "grad_norm": 10.074169741733797, "learning_rate": 3.323529411764706e-06, "loss": 3.8004, "step": 340 }, { "epoch": 0.05148857137603207, "grad_norm": 9.746770117158373, "learning_rate": 3.421568627450981e-06, "loss": 3.8519, "step": 350 }, { "epoch": 0.05295967341534727, "grad_norm": 9.19271822109123, "learning_rate": 3.5196078431372552e-06, "loss": 3.8188, "step": 360 }, { "epoch": 0.054430775454662475, "grad_norm": 8.818896186972353, "learning_rate": 3.6176470588235296e-06, "loss": 3.6468, "step": 370 }, { "epoch": 0.055901877493977674, "grad_norm": 9.162119875515724, "learning_rate": 3.7156862745098044e-06, "loss": 3.773, "step": 380 }, { "epoch": 0.05737297953329288, "grad_norm": 8.47207355788738, "learning_rate": 3.8137254901960783e-06, "loss": 3.6994, "step": 390 }, { "epoch": 0.05884408157260808, "grad_norm": 8.669164401046018, "learning_rate": 3.911764705882353e-06, "loss": 3.5392, "step": 400 }, { "epoch": 0.060315183611923284, "grad_norm": 8.433069329511136, "learning_rate": 4.009803921568628e-06, "loss": 3.6185, "step": 410 }, { "epoch": 0.06178628565123848, "grad_norm": 8.646961624907743, "learning_rate": 4.107843137254902e-06, "loss": 3.4655, "step": 420 }, { "epoch": 0.06325738769055368, "grad_norm": 7.684094824122494, "learning_rate": 4.205882352941177e-06, "loss": 3.4955, "step": 430 }, { "epoch": 0.06472848972986889, "grad_norm": 7.885087938296234, "learning_rate": 4.303921568627451e-06, "loss": 3.2831, "step": 440 }, { "epoch": 0.0661995917691841, "grad_norm": 7.77085243676356, "learning_rate": 4.401960784313726e-06, "loss": 3.4924, "step": 450 }, { "epoch": 0.06767069380849929, "grad_norm": 8.062505991418812, "learning_rate": 4.5e-06, "loss": 3.475, "step": 460 }, { "epoch": 0.06914179584781449, "grad_norm": 8.597492546029347, "learning_rate": 4.598039215686275e-06, "loss": 3.3538, "step": 470 }, { "epoch": 0.0706128978871297, "grad_norm": 7.554645344135859, "learning_rate": 4.69607843137255e-06, "loss": 3.3784, "step": 480 }, { "epoch": 0.0720839999264449, "grad_norm": 7.206474032229274, "learning_rate": 4.7941176470588245e-06, "loss": 3.3747, "step": 490 }, { "epoch": 0.0735551019657601, "grad_norm": 7.034592400922699, "learning_rate": 4.892156862745098e-06, "loss": 3.3312, "step": 500 }, { "epoch": 0.0750262040050753, "grad_norm": 7.79744686715035, "learning_rate": 4.990196078431373e-06, "loss": 3.3913, "step": 510 }, { "epoch": 0.0764973060443905, "grad_norm": 7.134525483972959, "learning_rate": 5.088235294117647e-06, "loss": 3.3027, "step": 520 }, { "epoch": 0.07796840808370571, "grad_norm": 7.51569765481842, "learning_rate": 5.186274509803923e-06, "loss": 3.3859, "step": 530 }, { "epoch": 0.0794395101230209, "grad_norm": 7.282220822533848, "learning_rate": 5.284313725490197e-06, "loss": 3.527, "step": 540 }, { "epoch": 0.08091061216233611, "grad_norm": 7.158323924163281, "learning_rate": 5.382352941176471e-06, "loss": 3.3842, "step": 550 }, { "epoch": 0.08238171420165132, "grad_norm": 6.839068525683603, "learning_rate": 5.480392156862746e-06, "loss": 3.4176, "step": 560 }, { "epoch": 0.08385281624096651, "grad_norm": 7.2141182529571735, "learning_rate": 5.57843137254902e-06, "loss": 3.3212, "step": 570 }, { "epoch": 0.08532391828028171, "grad_norm": 6.7877413896296845, "learning_rate": 5.676470588235294e-06, "loss": 3.3625, "step": 580 }, { "epoch": 0.08679502031959692, "grad_norm": 6.5410142273182625, "learning_rate": 5.774509803921569e-06, "loss": 3.2821, "step": 590 }, { "epoch": 0.08826612235891212, "grad_norm": 7.22937143915621, "learning_rate": 5.872549019607844e-06, "loss": 3.2149, "step": 600 }, { "epoch": 0.08973722439822732, "grad_norm": 6.684762726165931, "learning_rate": 5.970588235294118e-06, "loss": 3.266, "step": 610 }, { "epoch": 0.09120832643754252, "grad_norm": 6.830232299360609, "learning_rate": 6.068627450980392e-06, "loss": 3.2835, "step": 620 }, { "epoch": 0.09267942847685773, "grad_norm": 6.734402427950244, "learning_rate": 6.166666666666667e-06, "loss": 3.2267, "step": 630 }, { "epoch": 0.09415053051617293, "grad_norm": 6.887138394222975, "learning_rate": 6.264705882352942e-06, "loss": 3.2807, "step": 640 }, { "epoch": 0.09562163255548813, "grad_norm": 6.899790864910463, "learning_rate": 6.362745098039216e-06, "loss": 3.3506, "step": 650 }, { "epoch": 0.09709273459480333, "grad_norm": 6.9438612351416324, "learning_rate": 6.460784313725491e-06, "loss": 3.1721, "step": 660 }, { "epoch": 0.09856383663411854, "grad_norm": 6.8264280130404105, "learning_rate": 6.5588235294117655e-06, "loss": 3.2669, "step": 670 }, { "epoch": 0.10003493867343373, "grad_norm": 6.793366678181268, "learning_rate": 6.656862745098039e-06, "loss": 3.2233, "step": 680 }, { "epoch": 0.10150604071274894, "grad_norm": 7.187933298379474, "learning_rate": 6.754901960784315e-06, "loss": 3.1987, "step": 690 }, { "epoch": 0.10297714275206414, "grad_norm": 6.695346784714423, "learning_rate": 6.852941176470589e-06, "loss": 3.236, "step": 700 }, { "epoch": 0.10444824479137935, "grad_norm": 6.5370852283846235, "learning_rate": 6.950980392156863e-06, "loss": 3.3095, "step": 710 }, { "epoch": 0.10591934683069454, "grad_norm": 7.407374460211865, "learning_rate": 7.0490196078431386e-06, "loss": 3.3362, "step": 720 }, { "epoch": 0.10739044887000974, "grad_norm": 7.284970526643557, "learning_rate": 7.1470588235294125e-06, "loss": 3.2484, "step": 730 }, { "epoch": 0.10886155090932495, "grad_norm": 6.6650695959102935, "learning_rate": 7.2450980392156864e-06, "loss": 3.2228, "step": 740 }, { "epoch": 0.11033265294864016, "grad_norm": 7.420341658391782, "learning_rate": 7.343137254901961e-06, "loss": 3.2486, "step": 750 }, { "epoch": 0.11180375498795535, "grad_norm": 6.49684331345131, "learning_rate": 7.441176470588236e-06, "loss": 3.2101, "step": 760 }, { "epoch": 0.11327485702727055, "grad_norm": 6.652881597600587, "learning_rate": 7.53921568627451e-06, "loss": 3.2805, "step": 770 }, { "epoch": 0.11474595906658576, "grad_norm": 6.239671966115607, "learning_rate": 7.637254901960784e-06, "loss": 3.3175, "step": 780 }, { "epoch": 0.11621706110590095, "grad_norm": 6.602741875168707, "learning_rate": 7.73529411764706e-06, "loss": 3.1907, "step": 790 }, { "epoch": 0.11768816314521616, "grad_norm": 6.792875053614203, "learning_rate": 7.833333333333333e-06, "loss": 3.2141, "step": 800 }, { "epoch": 0.11915926518453136, "grad_norm": 6.7327977319103685, "learning_rate": 7.931372549019607e-06, "loss": 3.1188, "step": 810 }, { "epoch": 0.12063036722384657, "grad_norm": 6.358373747628766, "learning_rate": 8.029411764705883e-06, "loss": 3.1704, "step": 820 }, { "epoch": 0.12210146926316176, "grad_norm": 6.657564283305632, "learning_rate": 8.127450980392157e-06, "loss": 3.1858, "step": 830 }, { "epoch": 0.12357257130247697, "grad_norm": 6.52623890356355, "learning_rate": 8.225490196078431e-06, "loss": 3.2666, "step": 840 }, { "epoch": 0.12504367334179217, "grad_norm": 5.8944819943401665, "learning_rate": 8.323529411764707e-06, "loss": 3.1964, "step": 850 }, { "epoch": 0.12651477538110736, "grad_norm": 6.056719875612138, "learning_rate": 8.42156862745098e-06, "loss": 3.2059, "step": 860 }, { "epoch": 0.12798587742042258, "grad_norm": 6.201845715677044, "learning_rate": 8.519607843137256e-06, "loss": 3.1135, "step": 870 }, { "epoch": 0.12945697945973778, "grad_norm": 6.420509296740124, "learning_rate": 8.61764705882353e-06, "loss": 3.1424, "step": 880 }, { "epoch": 0.13092808149905297, "grad_norm": 6.66328945725781, "learning_rate": 8.715686274509804e-06, "loss": 3.1376, "step": 890 }, { "epoch": 0.1323991835383682, "grad_norm": 6.197964660867368, "learning_rate": 8.81372549019608e-06, "loss": 3.0407, "step": 900 }, { "epoch": 0.13387028557768338, "grad_norm": 5.621529595646019, "learning_rate": 8.911764705882354e-06, "loss": 3.1452, "step": 910 }, { "epoch": 0.13534138761699857, "grad_norm": 6.295795702416077, "learning_rate": 9.009803921568629e-06, "loss": 3.2117, "step": 920 }, { "epoch": 0.1368124896563138, "grad_norm": 6.039321793458176, "learning_rate": 9.107843137254903e-06, "loss": 3.1871, "step": 930 }, { "epoch": 0.13828359169562898, "grad_norm": 5.847194000546661, "learning_rate": 9.205882352941177e-06, "loss": 3.1256, "step": 940 }, { "epoch": 0.1397546937349442, "grad_norm": 6.58051040176238, "learning_rate": 9.303921568627453e-06, "loss": 3.1181, "step": 950 }, { "epoch": 0.1412257957742594, "grad_norm": 5.69537908451469, "learning_rate": 9.401960784313727e-06, "loss": 3.1376, "step": 960 }, { "epoch": 0.14269689781357459, "grad_norm": 5.817700482263394, "learning_rate": 9.5e-06, "loss": 3.049, "step": 970 }, { "epoch": 0.1441679998528898, "grad_norm": 5.529511959059173, "learning_rate": 9.598039215686276e-06, "loss": 3.1162, "step": 980 }, { "epoch": 0.145639101892205, "grad_norm": 6.21762167441033, "learning_rate": 9.69607843137255e-06, "loss": 3.1539, "step": 990 }, { "epoch": 0.1471102039315202, "grad_norm": 5.726871844912853, "learning_rate": 9.794117647058824e-06, "loss": 3.1652, "step": 1000 }, { "epoch": 0.1471102039315202, "eval_loss": 2.999117136001587, "eval_runtime": 337.7515, "eval_samples_per_second": 271.173, "eval_steps_per_second": 8.477, "step": 1000 }, { "epoch": 0.1485813059708354, "grad_norm": 6.190793241950275, "learning_rate": 9.8921568627451e-06, "loss": 3.0528, "step": 1010 }, { "epoch": 0.1500524080101506, "grad_norm": 5.955157680162561, "learning_rate": 9.990196078431374e-06, "loss": 3.1431, "step": 1020 }, { "epoch": 0.1515235100494658, "grad_norm": 5.502151085259626, "learning_rate": 1.0088235294117648e-05, "loss": 3.1785, "step": 1030 }, { "epoch": 0.152994612088781, "grad_norm": 6.181010377227675, "learning_rate": 1.0186274509803921e-05, "loss": 3.0941, "step": 1040 }, { "epoch": 0.1544657141280962, "grad_norm": 5.153288378605397, "learning_rate": 1.0284313725490195e-05, "loss": 3.0664, "step": 1050 }, { "epoch": 0.15593681616741142, "grad_norm": 5.834588438411839, "learning_rate": 1.0382352941176473e-05, "loss": 3.172, "step": 1060 }, { "epoch": 0.15740791820672662, "grad_norm": 5.921432675823691, "learning_rate": 1.0480392156862747e-05, "loss": 3.1119, "step": 1070 }, { "epoch": 0.1588790202460418, "grad_norm": 5.449279807637706, "learning_rate": 1.057843137254902e-05, "loss": 3.0954, "step": 1080 }, { "epoch": 0.16035012228535703, "grad_norm": 5.394164526269914, "learning_rate": 1.0676470588235295e-05, "loss": 3.1932, "step": 1090 }, { "epoch": 0.16182122432467222, "grad_norm": 5.665649121367119, "learning_rate": 1.0774509803921568e-05, "loss": 3.0295, "step": 1100 }, { "epoch": 0.1632923263639874, "grad_norm": 5.662461579153995, "learning_rate": 1.0872549019607842e-05, "loss": 3.0707, "step": 1110 }, { "epoch": 0.16476342840330263, "grad_norm": 5.177458674068838, "learning_rate": 1.097058823529412e-05, "loss": 2.9795, "step": 1120 }, { "epoch": 0.16623453044261782, "grad_norm": 5.40126851780247, "learning_rate": 1.1068627450980394e-05, "loss": 3.1904, "step": 1130 }, { "epoch": 0.16770563248193301, "grad_norm": 4.953305818562837, "learning_rate": 1.1166666666666668e-05, "loss": 3.167, "step": 1140 }, { "epoch": 0.16917673452124823, "grad_norm": 5.435993325416515, "learning_rate": 1.1264705882352942e-05, "loss": 3.1197, "step": 1150 }, { "epoch": 0.17064783656056343, "grad_norm": 5.367666933958674, "learning_rate": 1.1362745098039215e-05, "loss": 3.1262, "step": 1160 }, { "epoch": 0.17211893859987865, "grad_norm": 5.257856451228682, "learning_rate": 1.1460784313725491e-05, "loss": 3.0218, "step": 1170 }, { "epoch": 0.17359004063919384, "grad_norm": 5.218586083618575, "learning_rate": 1.1558823529411765e-05, "loss": 3.0407, "step": 1180 }, { "epoch": 0.17506114267850903, "grad_norm": 5.440467125091453, "learning_rate": 1.165686274509804e-05, "loss": 3.0311, "step": 1190 }, { "epoch": 0.17653224471782425, "grad_norm": 5.700250048103754, "learning_rate": 1.1754901960784315e-05, "loss": 3.1875, "step": 1200 }, { "epoch": 0.17800334675713944, "grad_norm": 6.32577219550878, "learning_rate": 1.1852941176470589e-05, "loss": 3.0798, "step": 1210 }, { "epoch": 0.17947444879645463, "grad_norm": 6.044057328045771, "learning_rate": 1.1950980392156864e-05, "loss": 3.1151, "step": 1220 }, { "epoch": 0.18094555083576985, "grad_norm": 5.396481994465476, "learning_rate": 1.2049019607843138e-05, "loss": 3.0829, "step": 1230 }, { "epoch": 0.18241665287508504, "grad_norm": 4.925979837219996, "learning_rate": 1.2147058823529412e-05, "loss": 3.1077, "step": 1240 }, { "epoch": 0.18388775491440024, "grad_norm": 4.9961918014630795, "learning_rate": 1.2245098039215688e-05, "loss": 3.1207, "step": 1250 }, { "epoch": 0.18535885695371546, "grad_norm": 5.207063986398873, "learning_rate": 1.2343137254901962e-05, "loss": 3.1327, "step": 1260 }, { "epoch": 0.18682995899303065, "grad_norm": 5.125445437324428, "learning_rate": 1.2441176470588237e-05, "loss": 2.9861, "step": 1270 }, { "epoch": 0.18830106103234587, "grad_norm": 4.920442915419563, "learning_rate": 1.2539215686274511e-05, "loss": 3.1002, "step": 1280 }, { "epoch": 0.18977216307166106, "grad_norm": 4.751314714837663, "learning_rate": 1.2637254901960785e-05, "loss": 3.1466, "step": 1290 }, { "epoch": 0.19124326511097625, "grad_norm": 4.718357828422992, "learning_rate": 1.2735294117647059e-05, "loss": 3.0354, "step": 1300 }, { "epoch": 0.19271436715029147, "grad_norm": 4.8212504558062115, "learning_rate": 1.2833333333333335e-05, "loss": 3.0486, "step": 1310 }, { "epoch": 0.19418546918960666, "grad_norm": 4.793727108836257, "learning_rate": 1.293137254901961e-05, "loss": 2.9342, "step": 1320 }, { "epoch": 0.19565657122892186, "grad_norm": 4.958676457556465, "learning_rate": 1.3029411764705884e-05, "loss": 3.1118, "step": 1330 }, { "epoch": 0.19712767326823707, "grad_norm": 4.407408936591921, "learning_rate": 1.3127450980392158e-05, "loss": 2.9943, "step": 1340 }, { "epoch": 0.19859877530755227, "grad_norm": 5.087564160818564, "learning_rate": 1.3225490196078432e-05, "loss": 2.968, "step": 1350 }, { "epoch": 0.20006987734686746, "grad_norm": 5.043470282367098, "learning_rate": 1.3323529411764706e-05, "loss": 3.0488, "step": 1360 }, { "epoch": 0.20154097938618268, "grad_norm": 4.680641552477329, "learning_rate": 1.342156862745098e-05, "loss": 3.1118, "step": 1370 }, { "epoch": 0.20301208142549787, "grad_norm": 4.479784250653302, "learning_rate": 1.3519607843137257e-05, "loss": 3.0392, "step": 1380 }, { "epoch": 0.2044831834648131, "grad_norm": 4.664550392292978, "learning_rate": 1.3617647058823531e-05, "loss": 3.0802, "step": 1390 }, { "epoch": 0.20595428550412828, "grad_norm": 4.615102907673118, "learning_rate": 1.3715686274509805e-05, "loss": 3.0491, "step": 1400 }, { "epoch": 0.20742538754344347, "grad_norm": 4.4349275701752315, "learning_rate": 1.3813725490196079e-05, "loss": 3.0065, "step": 1410 }, { "epoch": 0.2088964895827587, "grad_norm": 4.521394623978541, "learning_rate": 1.3911764705882353e-05, "loss": 2.9944, "step": 1420 }, { "epoch": 0.21036759162207389, "grad_norm": 4.9215410107297295, "learning_rate": 1.4009803921568627e-05, "loss": 3.0211, "step": 1430 }, { "epoch": 0.21183869366138908, "grad_norm": 4.57673049996733, "learning_rate": 1.4107843137254904e-05, "loss": 2.9345, "step": 1440 }, { "epoch": 0.2133097957007043, "grad_norm": 4.548792792968484, "learning_rate": 1.4205882352941178e-05, "loss": 2.9116, "step": 1450 }, { "epoch": 0.2147808977400195, "grad_norm": 4.608092324673133, "learning_rate": 1.4303921568627452e-05, "loss": 3.0297, "step": 1460 }, { "epoch": 0.21625199977933468, "grad_norm": 4.1916717156705845, "learning_rate": 1.4401960784313726e-05, "loss": 2.922, "step": 1470 }, { "epoch": 0.2177231018186499, "grad_norm": 4.501076693970024, "learning_rate": 1.45e-05, "loss": 3.0251, "step": 1480 }, { "epoch": 0.2191942038579651, "grad_norm": 4.348611343272184, "learning_rate": 1.4598039215686276e-05, "loss": 2.9365, "step": 1490 }, { "epoch": 0.2206653058972803, "grad_norm": 4.2733598600410785, "learning_rate": 1.4696078431372551e-05, "loss": 3.0107, "step": 1500 }, { "epoch": 0.2221364079365955, "grad_norm": 4.919187347741377, "learning_rate": 1.4794117647058825e-05, "loss": 3.0395, "step": 1510 }, { "epoch": 0.2236075099759107, "grad_norm": 4.5571920189385215, "learning_rate": 1.48921568627451e-05, "loss": 2.9233, "step": 1520 }, { "epoch": 0.22507861201522592, "grad_norm": 4.329967409345819, "learning_rate": 1.4990196078431373e-05, "loss": 3.0054, "step": 1530 }, { "epoch": 0.2265497140545411, "grad_norm": 5.070531561309539, "learning_rate": 1.5088235294117649e-05, "loss": 3.15, "step": 1540 }, { "epoch": 0.2280208160938563, "grad_norm": 4.430904562625354, "learning_rate": 1.5186274509803923e-05, "loss": 3.0143, "step": 1550 }, { "epoch": 0.22949191813317152, "grad_norm": 4.163850708898494, "learning_rate": 1.5284313725490195e-05, "loss": 2.9884, "step": 1560 }, { "epoch": 0.2309630201724867, "grad_norm": 4.100004319335175, "learning_rate": 1.5382352941176474e-05, "loss": 2.9701, "step": 1570 }, { "epoch": 0.2324341222118019, "grad_norm": 4.021344078808187, "learning_rate": 1.5480392156862746e-05, "loss": 3.0129, "step": 1580 }, { "epoch": 0.23390522425111712, "grad_norm": 4.082107244451859, "learning_rate": 1.5578431372549022e-05, "loss": 2.9649, "step": 1590 }, { "epoch": 0.23537632629043231, "grad_norm": 3.998715364354105, "learning_rate": 1.5676470588235294e-05, "loss": 2.9254, "step": 1600 }, { "epoch": 0.23684742832974753, "grad_norm": 4.414392375212064, "learning_rate": 1.577450980392157e-05, "loss": 2.9938, "step": 1610 }, { "epoch": 0.23831853036906273, "grad_norm": 4.398963162641712, "learning_rate": 1.5872549019607842e-05, "loss": 2.9542, "step": 1620 }, { "epoch": 0.23978963240837792, "grad_norm": 3.8879357487063637, "learning_rate": 1.597058823529412e-05, "loss": 2.9114, "step": 1630 }, { "epoch": 0.24126073444769314, "grad_norm": 4.085338713226984, "learning_rate": 1.6068627450980393e-05, "loss": 3.0338, "step": 1640 }, { "epoch": 0.24273183648700833, "grad_norm": 4.513647248397561, "learning_rate": 1.616666666666667e-05, "loss": 3.0555, "step": 1650 }, { "epoch": 0.24420293852632352, "grad_norm": 3.801848789999424, "learning_rate": 1.626470588235294e-05, "loss": 2.961, "step": 1660 }, { "epoch": 0.24567404056563874, "grad_norm": 4.054106678537572, "learning_rate": 1.6362745098039217e-05, "loss": 2.8823, "step": 1670 }, { "epoch": 0.24714514260495393, "grad_norm": 4.206849659143871, "learning_rate": 1.6460784313725492e-05, "loss": 3.1235, "step": 1680 }, { "epoch": 0.24861624464426912, "grad_norm": 4.2160073978318495, "learning_rate": 1.6558823529411765e-05, "loss": 3.0495, "step": 1690 }, { "epoch": 0.25008734668358434, "grad_norm": 3.753404487160081, "learning_rate": 1.665686274509804e-05, "loss": 2.9682, "step": 1700 }, { "epoch": 0.25155844872289956, "grad_norm": 4.036107123017637, "learning_rate": 1.6754901960784316e-05, "loss": 2.915, "step": 1710 }, { "epoch": 0.2530295507622147, "grad_norm": 3.6374040269245915, "learning_rate": 1.6852941176470588e-05, "loss": 2.9384, "step": 1720 }, { "epoch": 0.25450065280152995, "grad_norm": 3.841655747763544, "learning_rate": 1.6950980392156864e-05, "loss": 2.9346, "step": 1730 }, { "epoch": 0.25597175484084517, "grad_norm": 3.554343440225196, "learning_rate": 1.704901960784314e-05, "loss": 2.9101, "step": 1740 }, { "epoch": 0.25744285688016033, "grad_norm": 4.0558758606979115, "learning_rate": 1.714705882352941e-05, "loss": 2.9682, "step": 1750 }, { "epoch": 0.25891395891947555, "grad_norm": 3.832075757537308, "learning_rate": 1.7245098039215687e-05, "loss": 2.9839, "step": 1760 }, { "epoch": 0.26038506095879077, "grad_norm": 4.481522120835596, "learning_rate": 1.7343137254901963e-05, "loss": 3.0276, "step": 1770 }, { "epoch": 0.26185616299810593, "grad_norm": 4.215997451616899, "learning_rate": 1.744117647058824e-05, "loss": 2.9995, "step": 1780 }, { "epoch": 0.26332726503742115, "grad_norm": 4.2421308662411805, "learning_rate": 1.753921568627451e-05, "loss": 3.0416, "step": 1790 }, { "epoch": 0.2647983670767364, "grad_norm": 3.5819704368749625, "learning_rate": 1.7637254901960786e-05, "loss": 2.9966, "step": 1800 }, { "epoch": 0.26626946911605154, "grad_norm": 4.101364908417362, "learning_rate": 1.773529411764706e-05, "loss": 3.0303, "step": 1810 }, { "epoch": 0.26774057115536676, "grad_norm": 4.045850844701724, "learning_rate": 1.7833333333333334e-05, "loss": 2.95, "step": 1820 }, { "epoch": 0.269211673194682, "grad_norm": 3.5506705286412843, "learning_rate": 1.793137254901961e-05, "loss": 2.9503, "step": 1830 }, { "epoch": 0.27068277523399714, "grad_norm": 3.507369218282485, "learning_rate": 1.8029411764705886e-05, "loss": 2.8873, "step": 1840 }, { "epoch": 0.27215387727331236, "grad_norm": 3.6867050490945243, "learning_rate": 1.8127450980392158e-05, "loss": 2.9835, "step": 1850 }, { "epoch": 0.2736249793126276, "grad_norm": 3.6622111404729334, "learning_rate": 1.8225490196078433e-05, "loss": 2.9712, "step": 1860 }, { "epoch": 0.2750960813519428, "grad_norm": 3.670461875302331, "learning_rate": 1.8323529411764706e-05, "loss": 3.1268, "step": 1870 }, { "epoch": 0.27656718339125796, "grad_norm": 3.1939864694310223, "learning_rate": 1.842156862745098e-05, "loss": 2.884, "step": 1880 }, { "epoch": 0.2780382854305732, "grad_norm": 3.519917267460838, "learning_rate": 1.8519607843137257e-05, "loss": 2.9223, "step": 1890 }, { "epoch": 0.2795093874698884, "grad_norm": 3.4050811240406733, "learning_rate": 1.8617647058823533e-05, "loss": 2.9678, "step": 1900 }, { "epoch": 0.28098048950920357, "grad_norm": 3.578781241096238, "learning_rate": 1.8715686274509805e-05, "loss": 2.8867, "step": 1910 }, { "epoch": 0.2824515915485188, "grad_norm": 3.343062694475462, "learning_rate": 1.881372549019608e-05, "loss": 3.0163, "step": 1920 }, { "epoch": 0.283922693587834, "grad_norm": 3.6217316765996386, "learning_rate": 1.8911764705882353e-05, "loss": 2.8875, "step": 1930 }, { "epoch": 0.28539379562714917, "grad_norm": 3.4480457254033987, "learning_rate": 1.9009803921568628e-05, "loss": 2.8928, "step": 1940 }, { "epoch": 0.2868648976664644, "grad_norm": 3.4679229936415608, "learning_rate": 1.9107843137254904e-05, "loss": 2.8991, "step": 1950 }, { "epoch": 0.2883359997057796, "grad_norm": 3.3703878211810885, "learning_rate": 1.920588235294118e-05, "loss": 3.012, "step": 1960 }, { "epoch": 0.2898071017450948, "grad_norm": 3.406153847268149, "learning_rate": 1.9303921568627452e-05, "loss": 2.9673, "step": 1970 }, { "epoch": 0.29127820378441, "grad_norm": 3.5732400346584114, "learning_rate": 1.9401960784313727e-05, "loss": 2.9527, "step": 1980 }, { "epoch": 0.2927493058237252, "grad_norm": 3.5617858266965463, "learning_rate": 1.95e-05, "loss": 2.9464, "step": 1990 }, { "epoch": 0.2942204078630404, "grad_norm": 3.5069896432361, "learning_rate": 1.9598039215686275e-05, "loss": 2.8946, "step": 2000 }, { "epoch": 0.2942204078630404, "eval_loss": 2.847656488418579, "eval_runtime": 340.875, "eval_samples_per_second": 268.688, "eval_steps_per_second": 8.399, "step": 2000 }, { "epoch": 0.2956915099023556, "grad_norm": 3.221526918879751, "learning_rate": 1.969607843137255e-05, "loss": 2.926, "step": 2010 }, { "epoch": 0.2971626119416708, "grad_norm": 3.359246292330086, "learning_rate": 1.9794117647058827e-05, "loss": 2.9087, "step": 2020 }, { "epoch": 0.298633713980986, "grad_norm": 3.357849562262364, "learning_rate": 1.98921568627451e-05, "loss": 2.8988, "step": 2030 }, { "epoch": 0.3001048160203012, "grad_norm": 3.168185472410491, "learning_rate": 1.9990196078431374e-05, "loss": 2.9612, "step": 2040 }, { "epoch": 0.3015759180596164, "grad_norm": 3.3550282552548656, "learning_rate": 1.9999988130423818e-05, "loss": 2.9422, "step": 2050 }, { "epoch": 0.3030470200989316, "grad_norm": 3.2028150715861283, "learning_rate": 1.9999947099826275e-05, "loss": 2.9079, "step": 2060 }, { "epoch": 0.3045181221382468, "grad_norm": 3.566764787955102, "learning_rate": 1.9999876761789614e-05, "loss": 2.8736, "step": 2070 }, { "epoch": 0.305989224177562, "grad_norm": 3.2603976283237217, "learning_rate": 1.9999777116519978e-05, "loss": 2.9916, "step": 2080 }, { "epoch": 0.30746032621687724, "grad_norm": 3.078005020454507, "learning_rate": 1.9999648164309408e-05, "loss": 2.9678, "step": 2090 }, { "epoch": 0.3089314282561924, "grad_norm": 2.9451303521442718, "learning_rate": 1.999948990553583e-05, "loss": 2.8639, "step": 2100 }, { "epoch": 0.31040253029550763, "grad_norm": 3.362797921444144, "learning_rate": 1.999930234066306e-05, "loss": 2.9435, "step": 2110 }, { "epoch": 0.31187363233482285, "grad_norm": 3.546104835498642, "learning_rate": 1.999908547024081e-05, "loss": 2.9608, "step": 2120 }, { "epoch": 0.313344734374138, "grad_norm": 3.078341477658518, "learning_rate": 1.999883929490467e-05, "loss": 2.8865, "step": 2130 }, { "epoch": 0.31481583641345323, "grad_norm": 3.495919652477174, "learning_rate": 1.999856381537612e-05, "loss": 3.0123, "step": 2140 }, { "epoch": 0.31628693845276845, "grad_norm": 3.200124724398863, "learning_rate": 1.9998259032462533e-05, "loss": 2.8452, "step": 2150 }, { "epoch": 0.3177580404920836, "grad_norm": 2.9761148882132, "learning_rate": 1.9997924947057146e-05, "loss": 2.9498, "step": 2160 }, { "epoch": 0.31922914253139884, "grad_norm": 3.1215818705738196, "learning_rate": 1.9997561560139086e-05, "loss": 2.8845, "step": 2170 }, { "epoch": 0.32070024457071405, "grad_norm": 3.263908425794416, "learning_rate": 1.9997168872773355e-05, "loss": 2.9407, "step": 2180 }, { "epoch": 0.3221713466100292, "grad_norm": 3.2022833767535213, "learning_rate": 1.999674688611082e-05, "loss": 2.9211, "step": 2190 }, { "epoch": 0.32364244864934444, "grad_norm": 3.1830337723875224, "learning_rate": 1.9996295601388226e-05, "loss": 2.9348, "step": 2200 }, { "epoch": 0.32511355068865966, "grad_norm": 2.9262850691199285, "learning_rate": 1.9995815019928178e-05, "loss": 2.9803, "step": 2210 }, { "epoch": 0.3265846527279748, "grad_norm": 3.054682944088222, "learning_rate": 1.9995305143139144e-05, "loss": 2.8835, "step": 2220 }, { "epoch": 0.32805575476729004, "grad_norm": 3.292185555497587, "learning_rate": 1.9994765972515452e-05, "loss": 2.8624, "step": 2230 }, { "epoch": 0.32952685680660526, "grad_norm": 3.1813837651657373, "learning_rate": 1.9994197509637282e-05, "loss": 3.067, "step": 2240 }, { "epoch": 0.3309979588459204, "grad_norm": 2.939018891358365, "learning_rate": 1.9993599756170664e-05, "loss": 2.8449, "step": 2250 }, { "epoch": 0.33246906088523565, "grad_norm": 3.1783121570370643, "learning_rate": 1.9992972713867462e-05, "loss": 2.8768, "step": 2260 }, { "epoch": 0.33394016292455087, "grad_norm": 2.71214858370158, "learning_rate": 1.9992316384565395e-05, "loss": 2.8915, "step": 2270 }, { "epoch": 0.33541126496386603, "grad_norm": 2.903326706819815, "learning_rate": 1.9991630770187996e-05, "loss": 2.8548, "step": 2280 }, { "epoch": 0.33688236700318125, "grad_norm": 3.081160690795643, "learning_rate": 1.9990915872744646e-05, "loss": 2.825, "step": 2290 }, { "epoch": 0.33835346904249647, "grad_norm": 2.8262282488197386, "learning_rate": 1.999017169433053e-05, "loss": 2.9211, "step": 2300 }, { "epoch": 0.3398245710818117, "grad_norm": 2.889736785732422, "learning_rate": 1.9989398237126663e-05, "loss": 2.9716, "step": 2310 }, { "epoch": 0.34129567312112685, "grad_norm": 3.1091208853268304, "learning_rate": 1.9988595503399856e-05, "loss": 2.797, "step": 2320 }, { "epoch": 0.34276677516044207, "grad_norm": 2.9235985906211543, "learning_rate": 1.998776349550273e-05, "loss": 2.7974, "step": 2330 }, { "epoch": 0.3442378771997573, "grad_norm": 2.8062922097304703, "learning_rate": 1.99869022158737e-05, "loss": 2.9073, "step": 2340 }, { "epoch": 0.34570897923907246, "grad_norm": 2.956294498689921, "learning_rate": 1.9986011667036974e-05, "loss": 2.8665, "step": 2350 }, { "epoch": 0.3471800812783877, "grad_norm": 2.9287227175052437, "learning_rate": 1.998509185160253e-05, "loss": 2.8546, "step": 2360 }, { "epoch": 0.3486511833177029, "grad_norm": 2.643572414855314, "learning_rate": 1.9984142772266124e-05, "loss": 2.8819, "step": 2370 }, { "epoch": 0.35012228535701806, "grad_norm": 3.055340227210005, "learning_rate": 1.998316443180929e-05, "loss": 3.0119, "step": 2380 }, { "epoch": 0.3515933873963333, "grad_norm": 3.3365470065288343, "learning_rate": 1.9982156833099295e-05, "loss": 2.8785, "step": 2390 }, { "epoch": 0.3530644894356485, "grad_norm": 2.820114968955479, "learning_rate": 1.998111997908918e-05, "loss": 2.8461, "step": 2400 }, { "epoch": 0.35453559147496366, "grad_norm": 2.8006394498666722, "learning_rate": 1.9980053872817708e-05, "loss": 2.83, "step": 2410 }, { "epoch": 0.3560066935142789, "grad_norm": 2.8788682467098594, "learning_rate": 1.9978958517409383e-05, "loss": 2.8615, "step": 2420 }, { "epoch": 0.3574777955535941, "grad_norm": 3.087282310362417, "learning_rate": 1.9977833916074424e-05, "loss": 2.8888, "step": 2430 }, { "epoch": 0.35894889759290927, "grad_norm": 2.852888629823497, "learning_rate": 1.997668007210877e-05, "loss": 2.7679, "step": 2440 }, { "epoch": 0.3604199996322245, "grad_norm": 2.7705968758206336, "learning_rate": 1.9975496988894064e-05, "loss": 2.8232, "step": 2450 }, { "epoch": 0.3618911016715397, "grad_norm": 2.7470510714135403, "learning_rate": 1.997428466989763e-05, "loss": 2.9085, "step": 2460 }, { "epoch": 0.36336220371085487, "grad_norm": 2.5676587038388665, "learning_rate": 1.9973043118672488e-05, "loss": 2.8171, "step": 2470 }, { "epoch": 0.3648333057501701, "grad_norm": 2.786797066208579, "learning_rate": 1.9971772338857327e-05, "loss": 2.8226, "step": 2480 }, { "epoch": 0.3663044077894853, "grad_norm": 2.6711553629681215, "learning_rate": 1.9970472334176495e-05, "loss": 2.9017, "step": 2490 }, { "epoch": 0.3677755098288005, "grad_norm": 3.229590243606091, "learning_rate": 1.9969143108439993e-05, "loss": 2.8699, "step": 2500 }, { "epoch": 0.3692466118681157, "grad_norm": 2.9210642964251887, "learning_rate": 1.9967784665543465e-05, "loss": 2.8483, "step": 2510 }, { "epoch": 0.3707177139074309, "grad_norm": 2.8443367638789416, "learning_rate": 1.9966397009468172e-05, "loss": 2.8673, "step": 2520 }, { "epoch": 0.37218881594674613, "grad_norm": 2.640541238300699, "learning_rate": 1.996498014428101e-05, "loss": 2.9184, "step": 2530 }, { "epoch": 0.3736599179860613, "grad_norm": 2.790018647234763, "learning_rate": 1.9963534074134464e-05, "loss": 2.7604, "step": 2540 }, { "epoch": 0.3751310200253765, "grad_norm": 2.6562858622660865, "learning_rate": 1.9962058803266618e-05, "loss": 2.91, "step": 2550 }, { "epoch": 0.37660212206469174, "grad_norm": 2.7951428545867114, "learning_rate": 1.996055433600114e-05, "loss": 2.8467, "step": 2560 }, { "epoch": 0.3780732241040069, "grad_norm": 3.152400782886881, "learning_rate": 1.9959020676747257e-05, "loss": 2.8909, "step": 2570 }, { "epoch": 0.3795443261433221, "grad_norm": 2.752725880165347, "learning_rate": 1.9957457829999754e-05, "loss": 2.833, "step": 2580 }, { "epoch": 0.38101542818263734, "grad_norm": 2.7822763219945275, "learning_rate": 1.995586580033897e-05, "loss": 2.9542, "step": 2590 }, { "epoch": 0.3824865302219525, "grad_norm": 2.847502702063513, "learning_rate": 1.9954244592430747e-05, "loss": 2.8881, "step": 2600 }, { "epoch": 0.3839576322612677, "grad_norm": 2.6484635854420553, "learning_rate": 1.995259421102646e-05, "loss": 2.8267, "step": 2610 }, { "epoch": 0.38542873430058294, "grad_norm": 2.8143115314356133, "learning_rate": 1.9950914660962984e-05, "loss": 2.7723, "step": 2620 }, { "epoch": 0.3868998363398981, "grad_norm": 2.6520908915416705, "learning_rate": 1.9949205947162676e-05, "loss": 2.8784, "step": 2630 }, { "epoch": 0.3883709383792133, "grad_norm": 2.63456765057146, "learning_rate": 1.9947468074633358e-05, "loss": 2.9055, "step": 2640 }, { "epoch": 0.38984204041852855, "grad_norm": 2.6848925985961305, "learning_rate": 1.9945701048468324e-05, "loss": 2.8522, "step": 2650 }, { "epoch": 0.3913131424578437, "grad_norm": 3.2965998254545275, "learning_rate": 1.9943904873846295e-05, "loss": 2.8727, "step": 2660 }, { "epoch": 0.39278424449715893, "grad_norm": 2.7968509533655075, "learning_rate": 1.9942079556031433e-05, "loss": 2.8635, "step": 2670 }, { "epoch": 0.39425534653647415, "grad_norm": 2.8614597776193706, "learning_rate": 1.9940225100373304e-05, "loss": 2.8606, "step": 2680 }, { "epoch": 0.3957264485757893, "grad_norm": 3.286583353699016, "learning_rate": 1.993834151230687e-05, "loss": 2.8457, "step": 2690 }, { "epoch": 0.39719755061510453, "grad_norm": 2.5508800643858205, "learning_rate": 1.9936428797352474e-05, "loss": 2.8221, "step": 2700 }, { "epoch": 0.39866865265441975, "grad_norm": 2.5952512163396686, "learning_rate": 1.9934486961115826e-05, "loss": 2.7899, "step": 2710 }, { "epoch": 0.4001397546937349, "grad_norm": 2.723803422150279, "learning_rate": 1.9932516009287977e-05, "loss": 2.8407, "step": 2720 }, { "epoch": 0.40161085673305014, "grad_norm": 2.5970246930000758, "learning_rate": 1.9930515947645316e-05, "loss": 2.8154, "step": 2730 }, { "epoch": 0.40308195877236536, "grad_norm": 2.6297908082893495, "learning_rate": 1.992848678204954e-05, "loss": 2.8862, "step": 2740 }, { "epoch": 0.4045530608116806, "grad_norm": 2.7859465498212113, "learning_rate": 1.992642851844765e-05, "loss": 2.7927, "step": 2750 }, { "epoch": 0.40602416285099574, "grad_norm": 2.661489102790221, "learning_rate": 1.9924341162871914e-05, "loss": 2.8373, "step": 2760 }, { "epoch": 0.40749526489031096, "grad_norm": 2.9439265623671966, "learning_rate": 1.9922224721439875e-05, "loss": 2.8623, "step": 2770 }, { "epoch": 0.4089663669296262, "grad_norm": 2.720103696202938, "learning_rate": 1.9920079200354312e-05, "loss": 2.7692, "step": 2780 }, { "epoch": 0.41043746896894134, "grad_norm": 3.0833720428274582, "learning_rate": 1.9917904605903232e-05, "loss": 2.8566, "step": 2790 }, { "epoch": 0.41190857100825656, "grad_norm": 2.585597042356791, "learning_rate": 1.9915700944459842e-05, "loss": 2.7973, "step": 2800 }, { "epoch": 0.4133796730475718, "grad_norm": 2.728601525735839, "learning_rate": 1.9913468222482556e-05, "loss": 2.793, "step": 2810 }, { "epoch": 0.41485077508688695, "grad_norm": 2.5090006967197693, "learning_rate": 1.9911206446514932e-05, "loss": 2.8638, "step": 2820 }, { "epoch": 0.41632187712620217, "grad_norm": 2.5401292112925677, "learning_rate": 1.9908915623185696e-05, "loss": 2.8224, "step": 2830 }, { "epoch": 0.4177929791655174, "grad_norm": 2.3748619381220095, "learning_rate": 1.9906595759208702e-05, "loss": 2.8442, "step": 2840 }, { "epoch": 0.41926408120483255, "grad_norm": 2.675192240844502, "learning_rate": 1.9904246861382913e-05, "loss": 2.7913, "step": 2850 }, { "epoch": 0.42073518324414777, "grad_norm": 2.5700726216129572, "learning_rate": 1.9901868936592383e-05, "loss": 2.8037, "step": 2860 }, { "epoch": 0.422206285283463, "grad_norm": 2.611639161949924, "learning_rate": 1.9899461991806234e-05, "loss": 2.7916, "step": 2870 }, { "epoch": 0.42367738732277815, "grad_norm": 2.8979219659298194, "learning_rate": 1.9897026034078647e-05, "loss": 2.7078, "step": 2880 }, { "epoch": 0.4251484893620934, "grad_norm": 2.5615246514048797, "learning_rate": 1.9894561070548823e-05, "loss": 2.7714, "step": 2890 }, { "epoch": 0.4266195914014086, "grad_norm": 2.6686013384809266, "learning_rate": 1.989206710844098e-05, "loss": 2.7742, "step": 2900 }, { "epoch": 0.42809069344072376, "grad_norm": 2.4360255762653518, "learning_rate": 1.9889544155064326e-05, "loss": 2.9265, "step": 2910 }, { "epoch": 0.429561795480039, "grad_norm": 2.673810642227496, "learning_rate": 1.988699221781302e-05, "loss": 2.9263, "step": 2920 }, { "epoch": 0.4310328975193542, "grad_norm": 2.5925045010194863, "learning_rate": 1.9884411304166186e-05, "loss": 2.7728, "step": 2930 }, { "epoch": 0.43250399955866936, "grad_norm": 2.699310729507785, "learning_rate": 1.988180142168785e-05, "loss": 2.7545, "step": 2940 }, { "epoch": 0.4339751015979846, "grad_norm": 2.5665018995875073, "learning_rate": 1.987916257802696e-05, "loss": 2.765, "step": 2950 }, { "epoch": 0.4354462036372998, "grad_norm": 3.1627304834919796, "learning_rate": 1.987649478091733e-05, "loss": 2.8427, "step": 2960 }, { "epoch": 0.436917305676615, "grad_norm": 2.9277181685325377, "learning_rate": 1.9873798038177625e-05, "loss": 2.9036, "step": 2970 }, { "epoch": 0.4383884077159302, "grad_norm": 2.564593763744071, "learning_rate": 1.9871072357711355e-05, "loss": 2.7938, "step": 2980 }, { "epoch": 0.4398595097552454, "grad_norm": 2.624353180819634, "learning_rate": 1.9868317747506834e-05, "loss": 2.8229, "step": 2990 }, { "epoch": 0.4413306117945606, "grad_norm": 2.231048110749228, "learning_rate": 1.9865534215637158e-05, "loss": 2.8231, "step": 3000 }, { "epoch": 0.4413306117945606, "eval_loss": 2.728628396987915, "eval_runtime": 350.6663, "eval_samples_per_second": 261.186, "eval_steps_per_second": 8.164, "step": 3000 }, { "epoch": 0.4428017138338758, "grad_norm": 2.6789003154076263, "learning_rate": 1.986272177026019e-05, "loss": 2.7488, "step": 3010 }, { "epoch": 0.444272815873191, "grad_norm": 2.525153539281344, "learning_rate": 1.9859880419618534e-05, "loss": 2.7594, "step": 3020 }, { "epoch": 0.4457439179125062, "grad_norm": 2.443743617454917, "learning_rate": 1.9857010172039507e-05, "loss": 2.7823, "step": 3030 }, { "epoch": 0.4472150199518214, "grad_norm": 2.5207912680281956, "learning_rate": 1.9854111035935105e-05, "loss": 2.8098, "step": 3040 }, { "epoch": 0.4486861219911366, "grad_norm": 2.710705979479822, "learning_rate": 1.9851183019802005e-05, "loss": 2.7749, "step": 3050 }, { "epoch": 0.45015722403045183, "grad_norm": 2.706798316922201, "learning_rate": 1.9848226132221516e-05, "loss": 2.7583, "step": 3060 }, { "epoch": 0.451628326069767, "grad_norm": 2.628549241555597, "learning_rate": 1.9845240381859567e-05, "loss": 2.8491, "step": 3070 }, { "epoch": 0.4530994281090822, "grad_norm": 2.4260173278912855, "learning_rate": 1.984222577746667e-05, "loss": 2.82, "step": 3080 }, { "epoch": 0.45457053014839743, "grad_norm": 2.473859226528826, "learning_rate": 1.9839182327877904e-05, "loss": 2.9255, "step": 3090 }, { "epoch": 0.4560416321877126, "grad_norm": 2.835854545651714, "learning_rate": 1.9836110042012894e-05, "loss": 2.8153, "step": 3100 }, { "epoch": 0.4575127342270278, "grad_norm": 2.43122477248373, "learning_rate": 1.9833008928875768e-05, "loss": 2.7526, "step": 3110 }, { "epoch": 0.45898383626634304, "grad_norm": 2.7327422398229544, "learning_rate": 1.9829878997555138e-05, "loss": 2.9282, "step": 3120 }, { "epoch": 0.4604549383056582, "grad_norm": 2.5789912801732062, "learning_rate": 1.9826720257224088e-05, "loss": 2.7628, "step": 3130 }, { "epoch": 0.4619260403449734, "grad_norm": 2.363579763249279, "learning_rate": 1.9823532717140118e-05, "loss": 2.7494, "step": 3140 }, { "epoch": 0.46339714238428864, "grad_norm": 2.8617072286143275, "learning_rate": 1.9820316386645144e-05, "loss": 2.9153, "step": 3150 }, { "epoch": 0.4648682444236038, "grad_norm": 2.726755019564634, "learning_rate": 1.981707127516546e-05, "loss": 2.8471, "step": 3160 }, { "epoch": 0.466339346462919, "grad_norm": 2.5296773886368262, "learning_rate": 1.9813797392211703e-05, "loss": 2.7534, "step": 3170 }, { "epoch": 0.46781044850223424, "grad_norm": 2.4122531952105395, "learning_rate": 1.981049474737884e-05, "loss": 2.729, "step": 3180 }, { "epoch": 0.46928155054154946, "grad_norm": 2.35794200359621, "learning_rate": 1.9807163350346123e-05, "loss": 2.7911, "step": 3190 }, { "epoch": 0.47075265258086463, "grad_norm": 2.5021122242200375, "learning_rate": 1.980380321087708e-05, "loss": 2.8164, "step": 3200 }, { "epoch": 0.47222375462017985, "grad_norm": 2.4667348291168034, "learning_rate": 1.9800414338819467e-05, "loss": 2.7982, "step": 3210 }, { "epoch": 0.47369485665949507, "grad_norm": 2.6340766506774616, "learning_rate": 1.979699674410526e-05, "loss": 2.8296, "step": 3220 }, { "epoch": 0.47516595869881023, "grad_norm": 2.461661968407773, "learning_rate": 1.9793550436750597e-05, "loss": 2.7262, "step": 3230 }, { "epoch": 0.47663706073812545, "grad_norm": 2.5367373002462514, "learning_rate": 1.9790075426855783e-05, "loss": 2.788, "step": 3240 }, { "epoch": 0.47810816277744067, "grad_norm": 2.6066117493610306, "learning_rate": 1.9786571724605236e-05, "loss": 2.7458, "step": 3250 }, { "epoch": 0.47957926481675583, "grad_norm": 2.462453676380383, "learning_rate": 1.978303934026746e-05, "loss": 2.8459, "step": 3260 }, { "epoch": 0.48105036685607105, "grad_norm": 2.491211968025449, "learning_rate": 1.9779478284195026e-05, "loss": 2.7984, "step": 3270 }, { "epoch": 0.4825214688953863, "grad_norm": 2.411685233992615, "learning_rate": 1.9775888566824533e-05, "loss": 2.8644, "step": 3280 }, { "epoch": 0.48399257093470144, "grad_norm": 2.5771627487083575, "learning_rate": 1.977227019867658e-05, "loss": 2.788, "step": 3290 }, { "epoch": 0.48546367297401666, "grad_norm": 2.294064858928463, "learning_rate": 1.9768623190355732e-05, "loss": 2.8633, "step": 3300 }, { "epoch": 0.4869347750133319, "grad_norm": 2.6387838746085683, "learning_rate": 1.976494755255049e-05, "loss": 2.7438, "step": 3310 }, { "epoch": 0.48840587705264704, "grad_norm": 2.2514747751210735, "learning_rate": 1.976124329603327e-05, "loss": 2.8076, "step": 3320 }, { "epoch": 0.48987697909196226, "grad_norm": 2.369982760766685, "learning_rate": 1.9757510431660352e-05, "loss": 2.8201, "step": 3330 }, { "epoch": 0.4913480811312775, "grad_norm": 2.52969857495081, "learning_rate": 1.975374897037187e-05, "loss": 2.7698, "step": 3340 }, { "epoch": 0.49281918317059265, "grad_norm": 2.387519445811719, "learning_rate": 1.974995892319175e-05, "loss": 2.7519, "step": 3350 }, { "epoch": 0.49429028520990786, "grad_norm": 2.3900687028877834, "learning_rate": 1.974614030122772e-05, "loss": 2.8503, "step": 3360 }, { "epoch": 0.4957613872492231, "grad_norm": 2.3030292027182955, "learning_rate": 1.974229311567124e-05, "loss": 2.7606, "step": 3370 }, { "epoch": 0.49723248928853825, "grad_norm": 2.5355290360764173, "learning_rate": 1.9738417377797478e-05, "loss": 2.8459, "step": 3380 }, { "epoch": 0.49870359132785347, "grad_norm": 2.2433223553369244, "learning_rate": 1.97345130989653e-05, "loss": 2.7965, "step": 3390 }, { "epoch": 0.5001746933671687, "grad_norm": 2.5667533758780974, "learning_rate": 1.97305802906172e-05, "loss": 2.8241, "step": 3400 }, { "epoch": 0.5016457954064839, "grad_norm": 2.380149697293527, "learning_rate": 1.9726618964279296e-05, "loss": 2.7956, "step": 3410 }, { "epoch": 0.5031168974457991, "grad_norm": 2.431222849258756, "learning_rate": 1.9722629131561282e-05, "loss": 2.8336, "step": 3420 }, { "epoch": 0.5045879994851142, "grad_norm": 2.219807808616715, "learning_rate": 1.97186108041564e-05, "loss": 2.7174, "step": 3430 }, { "epoch": 0.5060591015244295, "grad_norm": 2.268934113205678, "learning_rate": 1.9714563993841397e-05, "loss": 2.7433, "step": 3440 }, { "epoch": 0.5075302035637447, "grad_norm": 2.4703878735595994, "learning_rate": 1.97104887124765e-05, "loss": 2.7929, "step": 3450 }, { "epoch": 0.5090013056030599, "grad_norm": 2.312544414412648, "learning_rate": 1.9706384972005382e-05, "loss": 2.8627, "step": 3460 }, { "epoch": 0.5104724076423751, "grad_norm": 2.2480123252440705, "learning_rate": 1.9702252784455117e-05, "loss": 2.6762, "step": 3470 }, { "epoch": 0.5119435096816903, "grad_norm": 2.425337995985777, "learning_rate": 1.9698092161936154e-05, "loss": 2.8367, "step": 3480 }, { "epoch": 0.5134146117210054, "grad_norm": 2.417063423965001, "learning_rate": 1.9693903116642277e-05, "loss": 2.687, "step": 3490 }, { "epoch": 0.5148857137603207, "grad_norm": 2.2569745893400643, "learning_rate": 1.9689685660850567e-05, "loss": 2.7922, "step": 3500 }, { "epoch": 0.5163568157996359, "grad_norm": 2.352439460396855, "learning_rate": 1.9685439806921373e-05, "loss": 2.7235, "step": 3510 }, { "epoch": 0.5178279178389511, "grad_norm": 2.4527603435634737, "learning_rate": 1.9681165567298275e-05, "loss": 2.7928, "step": 3520 }, { "epoch": 0.5192990198782663, "grad_norm": 2.083747347134929, "learning_rate": 1.9676862954508036e-05, "loss": 2.6934, "step": 3530 }, { "epoch": 0.5207701219175815, "grad_norm": 2.5632264541779426, "learning_rate": 1.9672531981160584e-05, "loss": 2.7751, "step": 3540 }, { "epoch": 0.5222412239568968, "grad_norm": 2.4683213430301296, "learning_rate": 1.9668172659948956e-05, "loss": 2.7474, "step": 3550 }, { "epoch": 0.5237123259962119, "grad_norm": 2.3347411392047674, "learning_rate": 1.966378500364927e-05, "loss": 2.7984, "step": 3560 }, { "epoch": 0.5251834280355271, "grad_norm": 2.2586592888593744, "learning_rate": 1.9659369025120702e-05, "loss": 2.7851, "step": 3570 }, { "epoch": 0.5266545300748423, "grad_norm": 2.2355114406756647, "learning_rate": 1.9654924737305415e-05, "loss": 2.6771, "step": 3580 }, { "epoch": 0.5281256321141575, "grad_norm": 2.4085101496347225, "learning_rate": 1.9650452153228548e-05, "loss": 2.7808, "step": 3590 }, { "epoch": 0.5295967341534727, "grad_norm": 2.2781924900220814, "learning_rate": 1.9645951285998168e-05, "loss": 2.8199, "step": 3600 }, { "epoch": 0.531067836192788, "grad_norm": 2.3050916526748635, "learning_rate": 1.9641422148805242e-05, "loss": 2.7081, "step": 3610 }, { "epoch": 0.5325389382321031, "grad_norm": 2.2925647362997235, "learning_rate": 1.963686475492357e-05, "loss": 2.81, "step": 3620 }, { "epoch": 0.5340100402714183, "grad_norm": 2.217056781617939, "learning_rate": 1.963227911770978e-05, "loss": 2.7036, "step": 3630 }, { "epoch": 0.5354811423107335, "grad_norm": 2.2915915113996066, "learning_rate": 1.962766525060328e-05, "loss": 2.7759, "step": 3640 }, { "epoch": 0.5369522443500487, "grad_norm": 2.2465106746560424, "learning_rate": 1.9623023167126195e-05, "loss": 2.7342, "step": 3650 }, { "epoch": 0.538423346389364, "grad_norm": 2.274565597709604, "learning_rate": 1.961835288088336e-05, "loss": 2.7539, "step": 3660 }, { "epoch": 0.5398944484286792, "grad_norm": 2.271432730732139, "learning_rate": 1.9613654405562254e-05, "loss": 2.7774, "step": 3670 }, { "epoch": 0.5413655504679943, "grad_norm": 2.3233054931525876, "learning_rate": 1.9608927754932983e-05, "loss": 2.9002, "step": 3680 }, { "epoch": 0.5428366525073095, "grad_norm": 2.500761580859682, "learning_rate": 1.9604172942848224e-05, "loss": 2.746, "step": 3690 }, { "epoch": 0.5443077545466247, "grad_norm": 2.568703331399438, "learning_rate": 1.9599389983243188e-05, "loss": 2.7235, "step": 3700 }, { "epoch": 0.5457788565859399, "grad_norm": 2.2367600360138455, "learning_rate": 1.9594578890135574e-05, "loss": 2.7445, "step": 3710 }, { "epoch": 0.5472499586252552, "grad_norm": 2.2294554544202883, "learning_rate": 1.958973967762554e-05, "loss": 2.87, "step": 3720 }, { "epoch": 0.5487210606645704, "grad_norm": 2.215056041242684, "learning_rate": 1.9584872359895652e-05, "loss": 2.6933, "step": 3730 }, { "epoch": 0.5501921627038856, "grad_norm": 2.3503357253555897, "learning_rate": 1.9579976951210854e-05, "loss": 2.7394, "step": 3740 }, { "epoch": 0.5516632647432007, "grad_norm": 2.357625326448057, "learning_rate": 1.9575053465918404e-05, "loss": 2.7238, "step": 3750 }, { "epoch": 0.5531343667825159, "grad_norm": 2.2390385414579588, "learning_rate": 1.9570101918447857e-05, "loss": 2.8098, "step": 3760 }, { "epoch": 0.5546054688218311, "grad_norm": 2.205073868741591, "learning_rate": 1.9565122323311002e-05, "loss": 2.7859, "step": 3770 }, { "epoch": 0.5560765708611464, "grad_norm": 2.2757751711437235, "learning_rate": 1.9560114695101844e-05, "loss": 2.7278, "step": 3780 }, { "epoch": 0.5575476729004616, "grad_norm": 2.436549516867369, "learning_rate": 1.9555079048496527e-05, "loss": 2.7596, "step": 3790 }, { "epoch": 0.5590187749397768, "grad_norm": 2.434281083811189, "learning_rate": 1.9550015398253325e-05, "loss": 2.7138, "step": 3800 }, { "epoch": 0.5604898769790919, "grad_norm": 2.3161798292788287, "learning_rate": 1.9544923759212574e-05, "loss": 2.6578, "step": 3810 }, { "epoch": 0.5619609790184071, "grad_norm": 2.272313433603685, "learning_rate": 1.953980414629665e-05, "loss": 2.7595, "step": 3820 }, { "epoch": 0.5634320810577224, "grad_norm": 2.2348292248609902, "learning_rate": 1.9534656574509903e-05, "loss": 2.7863, "step": 3830 }, { "epoch": 0.5649031830970376, "grad_norm": 2.292114551137293, "learning_rate": 1.9529481058938624e-05, "loss": 2.7024, "step": 3840 }, { "epoch": 0.5663742851363528, "grad_norm": 2.3738870336578235, "learning_rate": 1.952427761475101e-05, "loss": 2.6893, "step": 3850 }, { "epoch": 0.567845387175668, "grad_norm": 2.237343951788618, "learning_rate": 1.9519046257197102e-05, "loss": 2.7918, "step": 3860 }, { "epoch": 0.5693164892149831, "grad_norm": 2.3015063101565585, "learning_rate": 1.951378700160875e-05, "loss": 2.7614, "step": 3870 }, { "epoch": 0.5707875912542983, "grad_norm": 2.4924110407990865, "learning_rate": 1.9508499863399564e-05, "loss": 2.7049, "step": 3880 }, { "epoch": 0.5722586932936136, "grad_norm": 2.3172680308164764, "learning_rate": 1.9503184858064877e-05, "loss": 2.6812, "step": 3890 }, { "epoch": 0.5737297953329288, "grad_norm": 2.2425198843132677, "learning_rate": 1.9497842001181692e-05, "loss": 2.7363, "step": 3900 }, { "epoch": 0.575200897372244, "grad_norm": 2.2399558775393213, "learning_rate": 1.9492471308408634e-05, "loss": 2.7241, "step": 3910 }, { "epoch": 0.5766719994115592, "grad_norm": 2.3023001117907307, "learning_rate": 1.9487072795485912e-05, "loss": 2.7691, "step": 3920 }, { "epoch": 0.5781431014508744, "grad_norm": 2.1890533669901213, "learning_rate": 1.9481646478235275e-05, "loss": 2.7392, "step": 3930 }, { "epoch": 0.5796142034901896, "grad_norm": 2.269830305760707, "learning_rate": 1.9476192372559947e-05, "loss": 2.7468, "step": 3940 }, { "epoch": 0.5810853055295048, "grad_norm": 2.3253263387064944, "learning_rate": 1.9470710494444597e-05, "loss": 2.782, "step": 3950 }, { "epoch": 0.58255640756882, "grad_norm": 2.514117990983834, "learning_rate": 1.9465200859955297e-05, "loss": 2.6993, "step": 3960 }, { "epoch": 0.5840275096081352, "grad_norm": 2.385823046795441, "learning_rate": 1.9459663485239458e-05, "loss": 2.7596, "step": 3970 }, { "epoch": 0.5854986116474504, "grad_norm": 2.0598984155221722, "learning_rate": 1.945409838652579e-05, "loss": 2.7764, "step": 3980 }, { "epoch": 0.5869697136867656, "grad_norm": 2.421421970558393, "learning_rate": 1.9448505580124265e-05, "loss": 2.7161, "step": 3990 }, { "epoch": 0.5884408157260808, "grad_norm": 2.2569578630135103, "learning_rate": 1.9442885082426045e-05, "loss": 2.6174, "step": 4000 }, { "epoch": 0.5884408157260808, "eval_loss": 2.6506688594818115, "eval_runtime": 345.3472, "eval_samples_per_second": 265.208, "eval_steps_per_second": 8.29, "step": 4000 }, { "epoch": 0.589911917765396, "grad_norm": 2.251141926974565, "learning_rate": 1.9437236909903462e-05, "loss": 2.7857, "step": 4010 }, { "epoch": 0.5913830198047112, "grad_norm": 2.3321898908880208, "learning_rate": 1.9431561079109955e-05, "loss": 2.8677, "step": 4020 }, { "epoch": 0.5928541218440264, "grad_norm": 2.239130882784565, "learning_rate": 1.942585760668001e-05, "loss": 2.7407, "step": 4030 }, { "epoch": 0.5943252238833416, "grad_norm": 2.0702218050114904, "learning_rate": 1.9420126509329133e-05, "loss": 2.6994, "step": 4040 }, { "epoch": 0.5957963259226569, "grad_norm": 2.1596930310418343, "learning_rate": 1.9414367803853794e-05, "loss": 2.7326, "step": 4050 }, { "epoch": 0.597267427961972, "grad_norm": 2.245826085931705, "learning_rate": 1.9408581507131376e-05, "loss": 2.7119, "step": 4060 }, { "epoch": 0.5987385300012872, "grad_norm": 2.198221417225154, "learning_rate": 1.9402767636120124e-05, "loss": 2.6448, "step": 4070 }, { "epoch": 0.6002096320406024, "grad_norm": 2.2217643709065724, "learning_rate": 1.9396926207859085e-05, "loss": 2.6294, "step": 4080 }, { "epoch": 0.6016807340799176, "grad_norm": 2.3145744127547467, "learning_rate": 1.939105723946809e-05, "loss": 2.7181, "step": 4090 }, { "epoch": 0.6031518361192328, "grad_norm": 2.3457202427121397, "learning_rate": 1.9385160748147667e-05, "loss": 2.7256, "step": 4100 }, { "epoch": 0.6046229381585481, "grad_norm": 2.454398655565611, "learning_rate": 1.9379236751179017e-05, "loss": 2.7592, "step": 4110 }, { "epoch": 0.6060940401978632, "grad_norm": 2.294078957632621, "learning_rate": 1.937328526592395e-05, "loss": 2.6627, "step": 4120 }, { "epoch": 0.6075651422371784, "grad_norm": 2.3753848656285523, "learning_rate": 1.9367306309824835e-05, "loss": 2.7204, "step": 4130 }, { "epoch": 0.6090362442764936, "grad_norm": 2.2694373404133685, "learning_rate": 1.9361299900404555e-05, "loss": 2.6619, "step": 4140 }, { "epoch": 0.6105073463158088, "grad_norm": 2.4874976297555897, "learning_rate": 1.935526605526645e-05, "loss": 2.8672, "step": 4150 }, { "epoch": 0.611978448355124, "grad_norm": 2.153572208315787, "learning_rate": 1.9349204792094274e-05, "loss": 2.7587, "step": 4160 }, { "epoch": 0.6134495503944393, "grad_norm": 2.2054207166258317, "learning_rate": 1.9343116128652123e-05, "loss": 2.7302, "step": 4170 }, { "epoch": 0.6149206524337545, "grad_norm": 2.385727703978854, "learning_rate": 1.933700008278441e-05, "loss": 2.6754, "step": 4180 }, { "epoch": 0.6163917544730696, "grad_norm": 2.210981528205463, "learning_rate": 1.9330856672415792e-05, "loss": 2.6777, "step": 4190 }, { "epoch": 0.6178628565123848, "grad_norm": 2.3257220491901407, "learning_rate": 1.932468591555113e-05, "loss": 2.737, "step": 4200 }, { "epoch": 0.6193339585517, "grad_norm": 2.315747931623236, "learning_rate": 1.931848783027542e-05, "loss": 2.6828, "step": 4210 }, { "epoch": 0.6208050605910153, "grad_norm": 2.1078438915634945, "learning_rate": 1.931226243475377e-05, "loss": 2.6755, "step": 4220 }, { "epoch": 0.6222761626303305, "grad_norm": 2.2720656012212697, "learning_rate": 1.9306009747231306e-05, "loss": 2.7007, "step": 4230 }, { "epoch": 0.6237472646696457, "grad_norm": 2.2615158724154827, "learning_rate": 1.9299729786033158e-05, "loss": 2.6982, "step": 4240 }, { "epoch": 0.6252183667089608, "grad_norm": 2.2114379548787007, "learning_rate": 1.9293422569564372e-05, "loss": 2.7983, "step": 4250 }, { "epoch": 0.626689468748276, "grad_norm": 2.070192112733516, "learning_rate": 1.9287088116309896e-05, "loss": 2.6675, "step": 4260 }, { "epoch": 0.6281605707875912, "grad_norm": 2.225790964046902, "learning_rate": 1.9280726444834478e-05, "loss": 2.7501, "step": 4270 }, { "epoch": 0.6296316728269065, "grad_norm": 2.1191282008926446, "learning_rate": 1.927433757378265e-05, "loss": 2.7098, "step": 4280 }, { "epoch": 0.6311027748662217, "grad_norm": 2.134221417728929, "learning_rate": 1.9267921521878655e-05, "loss": 2.6865, "step": 4290 }, { "epoch": 0.6325738769055369, "grad_norm": 2.27901239102652, "learning_rate": 1.92614783079264e-05, "loss": 2.7806, "step": 4300 }, { "epoch": 0.634044978944852, "grad_norm": 2.348571854133755, "learning_rate": 1.9255007950809395e-05, "loss": 2.6898, "step": 4310 }, { "epoch": 0.6355160809841672, "grad_norm": 2.340896064372768, "learning_rate": 1.9248510469490696e-05, "loss": 2.6878, "step": 4320 }, { "epoch": 0.6369871830234825, "grad_norm": 2.168514160647567, "learning_rate": 1.9241985883012856e-05, "loss": 2.7297, "step": 4330 }, { "epoch": 0.6384582850627977, "grad_norm": 2.33675865813672, "learning_rate": 1.9235434210497877e-05, "loss": 2.6589, "step": 4340 }, { "epoch": 0.6399293871021129, "grad_norm": 1.9959327722329425, "learning_rate": 1.9228855471147134e-05, "loss": 2.6788, "step": 4350 }, { "epoch": 0.6414004891414281, "grad_norm": 2.256422069826509, "learning_rate": 1.9222249684241317e-05, "loss": 2.709, "step": 4360 }, { "epoch": 0.6428715911807433, "grad_norm": 2.379699188952532, "learning_rate": 1.921561686914041e-05, "loss": 2.6675, "step": 4370 }, { "epoch": 0.6443426932200584, "grad_norm": 2.152580277025235, "learning_rate": 1.9208957045283595e-05, "loss": 2.7327, "step": 4380 }, { "epoch": 0.6458137952593737, "grad_norm": 2.174629145616789, "learning_rate": 1.9202270232189212e-05, "loss": 2.7629, "step": 4390 }, { "epoch": 0.6472848972986889, "grad_norm": 2.1959917571270924, "learning_rate": 1.9195556449454705e-05, "loss": 2.708, "step": 4400 }, { "epoch": 0.6487559993380041, "grad_norm": 2.640890257499646, "learning_rate": 1.918881571675655e-05, "loss": 2.7939, "step": 4410 }, { "epoch": 0.6502271013773193, "grad_norm": 2.077352802593, "learning_rate": 1.9182048053850216e-05, "loss": 2.7202, "step": 4420 }, { "epoch": 0.6516982034166345, "grad_norm": 2.0858843058817254, "learning_rate": 1.91752534805701e-05, "loss": 2.6808, "step": 4430 }, { "epoch": 0.6531693054559496, "grad_norm": 2.3816427188693807, "learning_rate": 1.9168432016829452e-05, "loss": 2.7742, "step": 4440 }, { "epoch": 0.6546404074952649, "grad_norm": 1.9113185215852326, "learning_rate": 1.9161583682620352e-05, "loss": 2.6574, "step": 4450 }, { "epoch": 0.6561115095345801, "grad_norm": 2.059934844956337, "learning_rate": 1.915470849801361e-05, "loss": 2.7066, "step": 4460 }, { "epoch": 0.6575826115738953, "grad_norm": 2.1620725415163635, "learning_rate": 1.914780648315874e-05, "loss": 2.6197, "step": 4470 }, { "epoch": 0.6590537136132105, "grad_norm": 2.3497944081107525, "learning_rate": 1.914087765828389e-05, "loss": 2.7131, "step": 4480 }, { "epoch": 0.6605248156525257, "grad_norm": 2.0931142169873156, "learning_rate": 1.913392204369578e-05, "loss": 2.6182, "step": 4490 }, { "epoch": 0.6619959176918409, "grad_norm": 2.2930008889942735, "learning_rate": 1.912693965977964e-05, "loss": 2.6488, "step": 4500 }, { "epoch": 0.6634670197311561, "grad_norm": 2.221256203333721, "learning_rate": 1.911993052699916e-05, "loss": 2.6405, "step": 4510 }, { "epoch": 0.6649381217704713, "grad_norm": 2.0817306366063657, "learning_rate": 1.9112894665896417e-05, "loss": 2.6903, "step": 4520 }, { "epoch": 0.6664092238097865, "grad_norm": 2.2366376524701215, "learning_rate": 1.910583209709183e-05, "loss": 2.6641, "step": 4530 }, { "epoch": 0.6678803258491017, "grad_norm": 2.1906653914532694, "learning_rate": 1.909874284128409e-05, "loss": 2.6865, "step": 4540 }, { "epoch": 0.669351427888417, "grad_norm": 2.2010887712919565, "learning_rate": 1.90916269192501e-05, "loss": 2.7252, "step": 4550 }, { "epoch": 0.6708225299277321, "grad_norm": 2.165542951228262, "learning_rate": 1.9084484351844912e-05, "loss": 2.7255, "step": 4560 }, { "epoch": 0.6722936319670473, "grad_norm": 2.094344038992154, "learning_rate": 1.9077315160001677e-05, "loss": 2.7009, "step": 4570 }, { "epoch": 0.6737647340063625, "grad_norm": 2.240582829517519, "learning_rate": 1.907011936473157e-05, "loss": 2.6126, "step": 4580 }, { "epoch": 0.6752358360456777, "grad_norm": 2.2472282532894496, "learning_rate": 1.9062896987123736e-05, "loss": 2.6881, "step": 4590 }, { "epoch": 0.6767069380849929, "grad_norm": 2.2477940690910847, "learning_rate": 1.9055648048345228e-05, "loss": 2.7522, "step": 4600 }, { "epoch": 0.6781780401243082, "grad_norm": 2.0472926052136287, "learning_rate": 1.9048372569640935e-05, "loss": 2.6653, "step": 4610 }, { "epoch": 0.6796491421636234, "grad_norm": 2.172652154527864, "learning_rate": 1.9041070572333543e-05, "loss": 2.5922, "step": 4620 }, { "epoch": 0.6811202442029385, "grad_norm": 2.541481788833177, "learning_rate": 1.9033742077823444e-05, "loss": 2.6959, "step": 4630 }, { "epoch": 0.6825913462422537, "grad_norm": 2.2543856673207787, "learning_rate": 1.9026387107588694e-05, "loss": 2.6975, "step": 4640 }, { "epoch": 0.6840624482815689, "grad_norm": 2.03452190681398, "learning_rate": 1.9019005683184943e-05, "loss": 2.7128, "step": 4650 }, { "epoch": 0.6855335503208841, "grad_norm": 2.1527660197425336, "learning_rate": 1.9011597826245364e-05, "loss": 2.7348, "step": 4660 }, { "epoch": 0.6870046523601994, "grad_norm": 2.0472901340861798, "learning_rate": 1.9004163558480612e-05, "loss": 2.6402, "step": 4670 }, { "epoch": 0.6884757543995146, "grad_norm": 2.3043600463845806, "learning_rate": 1.899670290167873e-05, "loss": 2.7407, "step": 4680 }, { "epoch": 0.6899468564388297, "grad_norm": 2.2792655013794865, "learning_rate": 1.898921587770511e-05, "loss": 2.6226, "step": 4690 }, { "epoch": 0.6914179584781449, "grad_norm": 2.2906706642572323, "learning_rate": 1.8981702508502424e-05, "loss": 2.7538, "step": 4700 }, { "epoch": 0.6928890605174601, "grad_norm": 2.007388981544943, "learning_rate": 1.8974162816090544e-05, "loss": 2.6662, "step": 4710 }, { "epoch": 0.6943601625567754, "grad_norm": 2.2644783773297292, "learning_rate": 1.896659682256649e-05, "loss": 2.7582, "step": 4720 }, { "epoch": 0.6958312645960906, "grad_norm": 2.102563442264312, "learning_rate": 1.8959004550104375e-05, "loss": 2.7209, "step": 4730 }, { "epoch": 0.6973023666354058, "grad_norm": 2.3045328001156147, "learning_rate": 1.8951386020955325e-05, "loss": 2.6468, "step": 4740 }, { "epoch": 0.6987734686747209, "grad_norm": 2.135717636086249, "learning_rate": 1.8943741257447413e-05, "loss": 2.6792, "step": 4750 }, { "epoch": 0.7002445707140361, "grad_norm": 2.172628982671641, "learning_rate": 1.89360702819856e-05, "loss": 2.6767, "step": 4760 }, { "epoch": 0.7017156727533513, "grad_norm": 2.0471473589122775, "learning_rate": 1.8928373117051672e-05, "loss": 2.6475, "step": 4770 }, { "epoch": 0.7031867747926666, "grad_norm": 2.1497745045054177, "learning_rate": 1.8920649785204164e-05, "loss": 2.7563, "step": 4780 }, { "epoch": 0.7046578768319818, "grad_norm": 2.2538661259621735, "learning_rate": 1.8912900309078307e-05, "loss": 2.6875, "step": 4790 }, { "epoch": 0.706128978871297, "grad_norm": 2.1339042899424903, "learning_rate": 1.8905124711385952e-05, "loss": 2.6653, "step": 4800 }, { "epoch": 0.7076000809106122, "grad_norm": 2.230136974798985, "learning_rate": 1.8897323014915504e-05, "loss": 2.683, "step": 4810 }, { "epoch": 0.7090711829499273, "grad_norm": 1.997577079611526, "learning_rate": 1.8889495242531858e-05, "loss": 2.6309, "step": 4820 }, { "epoch": 0.7105422849892425, "grad_norm": 2.091548711370225, "learning_rate": 1.888164141717633e-05, "loss": 2.698, "step": 4830 }, { "epoch": 0.7120133870285578, "grad_norm": 2.2022361337244787, "learning_rate": 1.8873761561866595e-05, "loss": 2.7228, "step": 4840 }, { "epoch": 0.713484489067873, "grad_norm": 2.124939877180749, "learning_rate": 1.8865855699696613e-05, "loss": 2.6967, "step": 4850 }, { "epoch": 0.7149555911071882, "grad_norm": 2.033121533795281, "learning_rate": 1.885792385383656e-05, "loss": 2.6968, "step": 4860 }, { "epoch": 0.7164266931465034, "grad_norm": 2.2289911636781023, "learning_rate": 1.8849966047532776e-05, "loss": 2.6963, "step": 4870 }, { "epoch": 0.7178977951858185, "grad_norm": 1.989245524060728, "learning_rate": 1.8841982304107665e-05, "loss": 2.7042, "step": 4880 }, { "epoch": 0.7193688972251338, "grad_norm": 1.920958631693306, "learning_rate": 1.8833972646959672e-05, "loss": 2.6641, "step": 4890 }, { "epoch": 0.720839999264449, "grad_norm": 2.297986288035815, "learning_rate": 1.8825937099563166e-05, "loss": 2.6483, "step": 4900 }, { "epoch": 0.7223111013037642, "grad_norm": 2.238767134616556, "learning_rate": 1.8817875685468405e-05, "loss": 2.7099, "step": 4910 }, { "epoch": 0.7237822033430794, "grad_norm": 2.136794956199546, "learning_rate": 1.8809788428301455e-05, "loss": 2.6356, "step": 4920 }, { "epoch": 0.7252533053823946, "grad_norm": 2.3084926461730526, "learning_rate": 1.880167535176412e-05, "loss": 2.6791, "step": 4930 }, { "epoch": 0.7267244074217097, "grad_norm": 2.19060281280178, "learning_rate": 1.8793536479633878e-05, "loss": 2.6597, "step": 4940 }, { "epoch": 0.728195509461025, "grad_norm": 2.2941095554260773, "learning_rate": 1.8785371835763803e-05, "loss": 2.6159, "step": 4950 }, { "epoch": 0.7296666115003402, "grad_norm": 2.159543517779126, "learning_rate": 1.8777181444082503e-05, "loss": 2.7106, "step": 4960 }, { "epoch": 0.7311377135396554, "grad_norm": 2.054103375896992, "learning_rate": 1.8768965328594042e-05, "loss": 2.6545, "step": 4970 }, { "epoch": 0.7326088155789706, "grad_norm": 2.286860472101357, "learning_rate": 1.876072351337788e-05, "loss": 2.6846, "step": 4980 }, { "epoch": 0.7340799176182858, "grad_norm": 2.1596568544371073, "learning_rate": 1.87524560225888e-05, "loss": 2.6839, "step": 4990 }, { "epoch": 0.735551019657601, "grad_norm": 2.180688416328862, "learning_rate": 1.8744162880456822e-05, "loss": 2.619, "step": 5000 }, { "epoch": 0.735551019657601, "eval_loss": 2.593979835510254, "eval_runtime": 336.695, "eval_samples_per_second": 272.024, "eval_steps_per_second": 8.503, "step": 5000 }, { "epoch": 0.7370221216969162, "grad_norm": 2.083175430743549, "learning_rate": 1.873584411128715e-05, "loss": 2.6915, "step": 5010 }, { "epoch": 0.7384932237362314, "grad_norm": 2.180123078250273, "learning_rate": 1.87274997394601e-05, "loss": 2.6562, "step": 5020 }, { "epoch": 0.7399643257755466, "grad_norm": 2.1836525363986814, "learning_rate": 1.871912978943101e-05, "loss": 2.7206, "step": 5030 }, { "epoch": 0.7414354278148618, "grad_norm": 2.115492838867108, "learning_rate": 1.87107342857302e-05, "loss": 2.6212, "step": 5040 }, { "epoch": 0.742906529854177, "grad_norm": 2.446510740857678, "learning_rate": 1.8702313252962864e-05, "loss": 2.6729, "step": 5050 }, { "epoch": 0.7443776318934923, "grad_norm": 2.300817015795549, "learning_rate": 1.8693866715809032e-05, "loss": 2.6869, "step": 5060 }, { "epoch": 0.7458487339328074, "grad_norm": 2.0747382538613017, "learning_rate": 1.868539469902346e-05, "loss": 2.6063, "step": 5070 }, { "epoch": 0.7473198359721226, "grad_norm": 2.143068905635851, "learning_rate": 1.8676897227435596e-05, "loss": 2.6972, "step": 5080 }, { "epoch": 0.7487909380114378, "grad_norm": 2.100027199333517, "learning_rate": 1.866837432594949e-05, "loss": 2.5892, "step": 5090 }, { "epoch": 0.750262040050753, "grad_norm": 2.2018178198843192, "learning_rate": 1.865982601954371e-05, "loss": 2.6239, "step": 5100 }, { "epoch": 0.7517331420900683, "grad_norm": 2.2357048912205157, "learning_rate": 1.8651252333271287e-05, "loss": 2.6418, "step": 5110 }, { "epoch": 0.7532042441293835, "grad_norm": 2.3916106700622675, "learning_rate": 1.8642653292259637e-05, "loss": 2.6556, "step": 5120 }, { "epoch": 0.7546753461686986, "grad_norm": 2.1398553292897513, "learning_rate": 1.8634028921710473e-05, "loss": 2.6742, "step": 5130 }, { "epoch": 0.7561464482080138, "grad_norm": 2.0707700960242454, "learning_rate": 1.8625379246899752e-05, "loss": 2.6355, "step": 5140 }, { "epoch": 0.757617550247329, "grad_norm": 2.068642929438997, "learning_rate": 1.8616704293177593e-05, "loss": 2.7374, "step": 5150 }, { "epoch": 0.7590886522866442, "grad_norm": 2.1510313914132415, "learning_rate": 1.86080040859682e-05, "loss": 2.6328, "step": 5160 }, { "epoch": 0.7605597543259595, "grad_norm": 1.9927955349115531, "learning_rate": 1.859927865076978e-05, "loss": 2.7375, "step": 5170 }, { "epoch": 0.7620308563652747, "grad_norm": 2.1497502261282238, "learning_rate": 1.859052801315449e-05, "loss": 2.7038, "step": 5180 }, { "epoch": 0.7635019584045898, "grad_norm": 2.2794587188398565, "learning_rate": 1.858175219876834e-05, "loss": 2.5664, "step": 5190 }, { "epoch": 0.764973060443905, "grad_norm": 2.1354201903384236, "learning_rate": 1.8572951233331135e-05, "loss": 2.6896, "step": 5200 }, { "epoch": 0.7664441624832202, "grad_norm": 2.249320519756091, "learning_rate": 1.8564125142636383e-05, "loss": 2.7026, "step": 5210 }, { "epoch": 0.7679152645225354, "grad_norm": 2.164922380218894, "learning_rate": 1.8555273952551234e-05, "loss": 2.6042, "step": 5220 }, { "epoch": 0.7693863665618507, "grad_norm": 2.2354388112183985, "learning_rate": 1.8546397689016397e-05, "loss": 2.6708, "step": 5230 }, { "epoch": 0.7708574686011659, "grad_norm": 2.296423825315972, "learning_rate": 1.8537496378046067e-05, "loss": 2.6711, "step": 5240 }, { "epoch": 0.7723285706404811, "grad_norm": 2.1597634736980384, "learning_rate": 1.852857004572784e-05, "loss": 2.6886, "step": 5250 }, { "epoch": 0.7737996726797962, "grad_norm": 2.446873143766915, "learning_rate": 1.8519618718222655e-05, "loss": 2.6835, "step": 5260 }, { "epoch": 0.7752707747191114, "grad_norm": 2.031759327286404, "learning_rate": 1.8510642421764694e-05, "loss": 2.6153, "step": 5270 }, { "epoch": 0.7767418767584267, "grad_norm": 2.131217325697457, "learning_rate": 1.850164118266132e-05, "loss": 2.7008, "step": 5280 }, { "epoch": 0.7782129787977419, "grad_norm": 2.0314744131498848, "learning_rate": 1.8492615027293012e-05, "loss": 2.6474, "step": 5290 }, { "epoch": 0.7796840808370571, "grad_norm": 2.079125989765076, "learning_rate": 1.8483563982113244e-05, "loss": 2.6375, "step": 5300 }, { "epoch": 0.7811551828763723, "grad_norm": 2.1186480014137516, "learning_rate": 1.8474488073648462e-05, "loss": 2.6767, "step": 5310 }, { "epoch": 0.7826262849156874, "grad_norm": 2.037969739168481, "learning_rate": 1.8465387328497966e-05, "loss": 2.5852, "step": 5320 }, { "epoch": 0.7840973869550026, "grad_norm": 2.3491315191718103, "learning_rate": 1.8456261773333852e-05, "loss": 2.6864, "step": 5330 }, { "epoch": 0.7855684889943179, "grad_norm": 1.9837979673317625, "learning_rate": 1.8447111434900927e-05, "loss": 2.5988, "step": 5340 }, { "epoch": 0.7870395910336331, "grad_norm": 2.015098224304477, "learning_rate": 1.843793634001663e-05, "loss": 2.5539, "step": 5350 }, { "epoch": 0.7885106930729483, "grad_norm": 2.0145323618401543, "learning_rate": 1.842873651557096e-05, "loss": 2.5896, "step": 5360 }, { "epoch": 0.7899817951122635, "grad_norm": 2.095837151207162, "learning_rate": 1.8419511988526385e-05, "loss": 2.5974, "step": 5370 }, { "epoch": 0.7914528971515786, "grad_norm": 2.169116837926995, "learning_rate": 1.8410262785917777e-05, "loss": 2.689, "step": 5380 }, { "epoch": 0.7929239991908938, "grad_norm": 2.0956536113322266, "learning_rate": 1.840098893485232e-05, "loss": 2.6847, "step": 5390 }, { "epoch": 0.7943951012302091, "grad_norm": 2.0333296401875374, "learning_rate": 1.8391690462509447e-05, "loss": 2.6478, "step": 5400 }, { "epoch": 0.7958662032695243, "grad_norm": 2.498191587323823, "learning_rate": 1.838236739614074e-05, "loss": 2.7491, "step": 5410 }, { "epoch": 0.7973373053088395, "grad_norm": 2.118964311970224, "learning_rate": 1.8373019763069858e-05, "loss": 2.6905, "step": 5420 }, { "epoch": 0.7988084073481547, "grad_norm": 2.2344535485457446, "learning_rate": 1.8363647590692473e-05, "loss": 2.595, "step": 5430 }, { "epoch": 0.8002795093874698, "grad_norm": 2.1489224078010083, "learning_rate": 1.8354250906476166e-05, "loss": 2.5674, "step": 5440 }, { "epoch": 0.801750611426785, "grad_norm": 2.0999372355191803, "learning_rate": 1.8344829737960356e-05, "loss": 2.7261, "step": 5450 }, { "epoch": 0.8032217134661003, "grad_norm": 2.109856492939514, "learning_rate": 1.8335384112756216e-05, "loss": 2.6261, "step": 5460 }, { "epoch": 0.8046928155054155, "grad_norm": 2.126611878099278, "learning_rate": 1.832591405854661e-05, "loss": 2.6184, "step": 5470 }, { "epoch": 0.8061639175447307, "grad_norm": 2.1322663047671, "learning_rate": 1.8316419603085986e-05, "loss": 2.7139, "step": 5480 }, { "epoch": 0.8076350195840459, "grad_norm": 2.017420471595554, "learning_rate": 1.8306900774200307e-05, "loss": 2.6552, "step": 5490 }, { "epoch": 0.8091061216233612, "grad_norm": 2.153036909926997, "learning_rate": 1.829735759978697e-05, "loss": 2.5845, "step": 5500 }, { "epoch": 0.8105772236626763, "grad_norm": 2.0235053608793123, "learning_rate": 1.8287790107814725e-05, "loss": 2.5323, "step": 5510 }, { "epoch": 0.8120483257019915, "grad_norm": 2.1628697587690238, "learning_rate": 1.8278198326323585e-05, "loss": 2.6526, "step": 5520 }, { "epoch": 0.8135194277413067, "grad_norm": 2.1555327407962626, "learning_rate": 1.8268582283424763e-05, "loss": 2.6744, "step": 5530 }, { "epoch": 0.8149905297806219, "grad_norm": 2.2490952421389094, "learning_rate": 1.825894200730056e-05, "loss": 2.7, "step": 5540 }, { "epoch": 0.8164616318199371, "grad_norm": 2.63155938129759, "learning_rate": 1.8249277526204306e-05, "loss": 2.6911, "step": 5550 }, { "epoch": 0.8179327338592524, "grad_norm": 2.0338150740453704, "learning_rate": 1.8239588868460273e-05, "loss": 2.6007, "step": 5560 }, { "epoch": 0.8194038358985675, "grad_norm": 2.0241971307083726, "learning_rate": 1.822987606246358e-05, "loss": 2.6393, "step": 5570 }, { "epoch": 0.8208749379378827, "grad_norm": 2.152314673388106, "learning_rate": 1.8220139136680135e-05, "loss": 2.6758, "step": 5580 }, { "epoch": 0.8223460399771979, "grad_norm": 2.047998790701278, "learning_rate": 1.821037811964652e-05, "loss": 2.6413, "step": 5590 }, { "epoch": 0.8238171420165131, "grad_norm": 1.9910131613796962, "learning_rate": 1.820059303996992e-05, "loss": 2.6656, "step": 5600 }, { "epoch": 0.8252882440558283, "grad_norm": 1.9863632735889307, "learning_rate": 1.819078392632806e-05, "loss": 2.5815, "step": 5610 }, { "epoch": 0.8267593460951436, "grad_norm": 2.052879721039735, "learning_rate": 1.818095080746909e-05, "loss": 2.5997, "step": 5620 }, { "epoch": 0.8282304481344587, "grad_norm": 2.1866901275633883, "learning_rate": 1.8171093712211506e-05, "loss": 2.6537, "step": 5630 }, { "epoch": 0.8297015501737739, "grad_norm": 2.0816881108977547, "learning_rate": 1.81612126694441e-05, "loss": 2.5558, "step": 5640 }, { "epoch": 0.8311726522130891, "grad_norm": 2.01825874124343, "learning_rate": 1.8151307708125815e-05, "loss": 2.5947, "step": 5650 }, { "epoch": 0.8326437542524043, "grad_norm": 2.0644371679590465, "learning_rate": 1.814137885728572e-05, "loss": 2.6468, "step": 5660 }, { "epoch": 0.8341148562917196, "grad_norm": 2.031562689707859, "learning_rate": 1.8131426146022885e-05, "loss": 2.618, "step": 5670 }, { "epoch": 0.8355859583310348, "grad_norm": 2.2502934451598064, "learning_rate": 1.8121449603506306e-05, "loss": 2.6695, "step": 5680 }, { "epoch": 0.83705706037035, "grad_norm": 2.1691363118385523, "learning_rate": 1.8111449258974843e-05, "loss": 2.5772, "step": 5690 }, { "epoch": 0.8385281624096651, "grad_norm": 2.057940869440366, "learning_rate": 1.8101425141737087e-05, "loss": 2.6721, "step": 5700 }, { "epoch": 0.8399992644489803, "grad_norm": 1.8960926871615984, "learning_rate": 1.8091377281171318e-05, "loss": 2.6438, "step": 5710 }, { "epoch": 0.8414703664882955, "grad_norm": 2.4904418103832144, "learning_rate": 1.80813057067254e-05, "loss": 2.6365, "step": 5720 }, { "epoch": 0.8429414685276108, "grad_norm": 2.1141269465587036, "learning_rate": 1.807121044791669e-05, "loss": 2.6574, "step": 5730 }, { "epoch": 0.844412570566926, "grad_norm": 2.1028289163267035, "learning_rate": 1.8061091534331966e-05, "loss": 2.6389, "step": 5740 }, { "epoch": 0.8458836726062412, "grad_norm": 2.1018705939483584, "learning_rate": 1.8050948995627332e-05, "loss": 2.7114, "step": 5750 }, { "epoch": 0.8473547746455563, "grad_norm": 2.0609500723441747, "learning_rate": 1.8040782861528126e-05, "loss": 2.6854, "step": 5760 }, { "epoch": 0.8488258766848715, "grad_norm": 2.114941383516259, "learning_rate": 1.803059316182884e-05, "loss": 2.6252, "step": 5770 }, { "epoch": 0.8502969787241867, "grad_norm": 2.1511316351245684, "learning_rate": 1.8020379926393033e-05, "loss": 2.5637, "step": 5780 }, { "epoch": 0.851768080763502, "grad_norm": 2.199924786399337, "learning_rate": 1.801014318515324e-05, "loss": 2.6004, "step": 5790 }, { "epoch": 0.8532391828028172, "grad_norm": 2.1045028207693073, "learning_rate": 1.799988296811089e-05, "loss": 2.6964, "step": 5800 }, { "epoch": 0.8547102848421324, "grad_norm": 1.9890364917323107, "learning_rate": 1.798959930533621e-05, "loss": 2.6082, "step": 5810 }, { "epoch": 0.8561813868814475, "grad_norm": 2.0610919302627373, "learning_rate": 1.797929222696814e-05, "loss": 2.6601, "step": 5820 }, { "epoch": 0.8576524889207627, "grad_norm": 2.4061830582587223, "learning_rate": 1.7968961763214247e-05, "loss": 2.6262, "step": 5830 }, { "epoch": 0.859123590960078, "grad_norm": 2.154415040219887, "learning_rate": 1.7958607944350633e-05, "loss": 2.6139, "step": 5840 }, { "epoch": 0.8605946929993932, "grad_norm": 2.048650554681721, "learning_rate": 1.7948230800721855e-05, "loss": 2.7144, "step": 5850 }, { "epoch": 0.8620657950387084, "grad_norm": 1.8286821585601212, "learning_rate": 1.793783036274082e-05, "loss": 2.6055, "step": 5860 }, { "epoch": 0.8635368970780236, "grad_norm": 2.057696957458442, "learning_rate": 1.7927406660888716e-05, "loss": 2.6564, "step": 5870 }, { "epoch": 0.8650079991173387, "grad_norm": 2.2060955653650334, "learning_rate": 1.7916959725714894e-05, "loss": 2.6106, "step": 5880 }, { "epoch": 0.8664791011566539, "grad_norm": 1.9590242735393282, "learning_rate": 1.790648958783682e-05, "loss": 2.7385, "step": 5890 }, { "epoch": 0.8679502031959692, "grad_norm": 2.221363383359242, "learning_rate": 1.7895996277939938e-05, "loss": 2.6319, "step": 5900 }, { "epoch": 0.8694213052352844, "grad_norm": 2.1511123032440835, "learning_rate": 1.7885479826777624e-05, "loss": 2.685, "step": 5910 }, { "epoch": 0.8708924072745996, "grad_norm": 1.9757304372402735, "learning_rate": 1.787494026517106e-05, "loss": 2.6378, "step": 5920 }, { "epoch": 0.8723635093139148, "grad_norm": 2.0253473102393857, "learning_rate": 1.786437762400917e-05, "loss": 2.5376, "step": 5930 }, { "epoch": 0.87383461135323, "grad_norm": 2.0263622779329906, "learning_rate": 1.785379193424851e-05, "loss": 2.5695, "step": 5940 }, { "epoch": 0.8753057133925451, "grad_norm": 1.8889742368998592, "learning_rate": 1.7843183226913193e-05, "loss": 2.6568, "step": 5950 }, { "epoch": 0.8767768154318604, "grad_norm": 2.089768094965981, "learning_rate": 1.7832551533094787e-05, "loss": 2.6112, "step": 5960 }, { "epoch": 0.8782479174711756, "grad_norm": 2.100608580848784, "learning_rate": 1.782189688395223e-05, "loss": 2.6606, "step": 5970 }, { "epoch": 0.8797190195104908, "grad_norm": 2.255829147283039, "learning_rate": 1.7811219310711734e-05, "loss": 2.6239, "step": 5980 }, { "epoch": 0.881190121549806, "grad_norm": 2.1876571713268023, "learning_rate": 1.7800518844666696e-05, "loss": 2.6451, "step": 5990 }, { "epoch": 0.8826612235891212, "grad_norm": 2.2285258178390306, "learning_rate": 1.7789795517177606e-05, "loss": 2.6865, "step": 6000 }, { "epoch": 0.8826612235891212, "eval_loss": 2.5522170066833496, "eval_runtime": 355.8828, "eval_samples_per_second": 257.357, "eval_steps_per_second": 8.045, "step": 6000 }, { "epoch": 0.8841323256284364, "grad_norm": 2.177355924339414, "learning_rate": 1.7779049359671963e-05, "loss": 2.6251, "step": 6010 }, { "epoch": 0.8856034276677516, "grad_norm": 1.9907170164959624, "learning_rate": 1.7768280403644165e-05, "loss": 2.6561, "step": 6020 }, { "epoch": 0.8870745297070668, "grad_norm": 2.1703052980364737, "learning_rate": 1.7757488680655434e-05, "loss": 2.6304, "step": 6030 }, { "epoch": 0.888545631746382, "grad_norm": 2.1555979615208307, "learning_rate": 1.7746674222333707e-05, "loss": 2.5529, "step": 6040 }, { "epoch": 0.8900167337856972, "grad_norm": 2.0512066556279462, "learning_rate": 1.773583706037357e-05, "loss": 2.5288, "step": 6050 }, { "epoch": 0.8914878358250125, "grad_norm": 2.188116346970898, "learning_rate": 1.772497722653613e-05, "loss": 2.5985, "step": 6060 }, { "epoch": 0.8929589378643276, "grad_norm": 2.232279841731029, "learning_rate": 1.771409475264895e-05, "loss": 2.5893, "step": 6070 }, { "epoch": 0.8944300399036428, "grad_norm": 2.5277210709028965, "learning_rate": 1.770318967060594e-05, "loss": 2.6238, "step": 6080 }, { "epoch": 0.895901141942958, "grad_norm": 2.032938312884567, "learning_rate": 1.7692262012367277e-05, "loss": 2.5758, "step": 6090 }, { "epoch": 0.8973722439822732, "grad_norm": 2.0027535786097093, "learning_rate": 1.768131180995929e-05, "loss": 2.563, "step": 6100 }, { "epoch": 0.8988433460215884, "grad_norm": 2.1306049048088, "learning_rate": 1.76703390954744e-05, "loss": 2.6733, "step": 6110 }, { "epoch": 0.9003144480609037, "grad_norm": 1.9920377031646166, "learning_rate": 1.7659343901070983e-05, "loss": 2.6009, "step": 6120 }, { "epoch": 0.9017855501002189, "grad_norm": 1.9499314070344718, "learning_rate": 1.7648326258973313e-05, "loss": 2.5515, "step": 6130 }, { "epoch": 0.903256652139534, "grad_norm": 2.092553381486599, "learning_rate": 1.7637286201471445e-05, "loss": 2.6473, "step": 6140 }, { "epoch": 0.9047277541788492, "grad_norm": 2.0910934188977324, "learning_rate": 1.7626223760921135e-05, "loss": 2.5877, "step": 6150 }, { "epoch": 0.9061988562181644, "grad_norm": 2.0906793047268315, "learning_rate": 1.7615138969743736e-05, "loss": 2.6334, "step": 6160 }, { "epoch": 0.9076699582574796, "grad_norm": 2.020587619139212, "learning_rate": 1.7604031860426098e-05, "loss": 2.6526, "step": 6170 }, { "epoch": 0.9091410602967949, "grad_norm": 2.094909238594079, "learning_rate": 1.7592902465520493e-05, "loss": 2.6103, "step": 6180 }, { "epoch": 0.9106121623361101, "grad_norm": 2.1268358608061555, "learning_rate": 1.758175081764449e-05, "loss": 2.5839, "step": 6190 }, { "epoch": 0.9120832643754252, "grad_norm": 2.17733809079248, "learning_rate": 1.75705769494809e-05, "loss": 2.6585, "step": 6200 }, { "epoch": 0.9135543664147404, "grad_norm": 2.1262203028989504, "learning_rate": 1.7559380893777635e-05, "loss": 2.6535, "step": 6210 }, { "epoch": 0.9150254684540556, "grad_norm": 2.1250230999090522, "learning_rate": 1.7548162683347637e-05, "loss": 2.6039, "step": 6220 }, { "epoch": 0.9164965704933709, "grad_norm": 2.1694466375084516, "learning_rate": 1.7536922351068785e-05, "loss": 2.6372, "step": 6230 }, { "epoch": 0.9179676725326861, "grad_norm": 1.9608370549001612, "learning_rate": 1.7525659929883793e-05, "loss": 2.6291, "step": 6240 }, { "epoch": 0.9194387745720013, "grad_norm": 2.120621501576332, "learning_rate": 1.7514375452800105e-05, "loss": 2.6435, "step": 6250 }, { "epoch": 0.9209098766113164, "grad_norm": 2.0308452001378, "learning_rate": 1.75030689528898e-05, "loss": 2.5684, "step": 6260 }, { "epoch": 0.9223809786506316, "grad_norm": 2.0689442659473785, "learning_rate": 1.7491740463289522e-05, "loss": 2.6225, "step": 6270 }, { "epoch": 0.9238520806899468, "grad_norm": 2.2745670756627216, "learning_rate": 1.7480390017200345e-05, "loss": 2.665, "step": 6280 }, { "epoch": 0.9253231827292621, "grad_norm": 2.0621700604772215, "learning_rate": 1.7469017647887693e-05, "loss": 2.6389, "step": 6290 }, { "epoch": 0.9267942847685773, "grad_norm": 2.0580932398099496, "learning_rate": 1.7457623388681244e-05, "loss": 2.7017, "step": 6300 }, { "epoch": 0.9282653868078925, "grad_norm": 2.2078094098828056, "learning_rate": 1.744620727297483e-05, "loss": 2.6325, "step": 6310 }, { "epoch": 0.9297364888472076, "grad_norm": 2.1392867251476435, "learning_rate": 1.7434769334226346e-05, "loss": 2.6522, "step": 6320 }, { "epoch": 0.9312075908865228, "grad_norm": 1.9289662907067964, "learning_rate": 1.742330960595763e-05, "loss": 2.6211, "step": 6330 }, { "epoch": 0.932678692925838, "grad_norm": 2.0832342514251705, "learning_rate": 1.7411828121754394e-05, "loss": 2.6068, "step": 6340 }, { "epoch": 0.9341497949651533, "grad_norm": 2.074010617303713, "learning_rate": 1.7400324915266105e-05, "loss": 2.6239, "step": 6350 }, { "epoch": 0.9356208970044685, "grad_norm": 2.081185443158738, "learning_rate": 1.7388800020205895e-05, "loss": 2.6021, "step": 6360 }, { "epoch": 0.9370919990437837, "grad_norm": 2.0647858389525466, "learning_rate": 1.7377253470350455e-05, "loss": 2.6182, "step": 6370 }, { "epoch": 0.9385631010830989, "grad_norm": 2.204662584940334, "learning_rate": 1.736568529953995e-05, "loss": 2.6619, "step": 6380 }, { "epoch": 0.940034203122414, "grad_norm": 1.9092770756970565, "learning_rate": 1.73540955416779e-05, "loss": 2.6336, "step": 6390 }, { "epoch": 0.9415053051617293, "grad_norm": 2.185192562994721, "learning_rate": 1.7342484230731095e-05, "loss": 2.5716, "step": 6400 }, { "epoch": 0.9429764072010445, "grad_norm": 2.195512008541002, "learning_rate": 1.73308514007295e-05, "loss": 2.5924, "step": 6410 }, { "epoch": 0.9444475092403597, "grad_norm": 2.054841176246332, "learning_rate": 1.7319197085766145e-05, "loss": 2.6588, "step": 6420 }, { "epoch": 0.9459186112796749, "grad_norm": 2.1449604205209085, "learning_rate": 1.7307521319997014e-05, "loss": 2.605, "step": 6430 }, { "epoch": 0.9473897133189901, "grad_norm": 2.0866470851472725, "learning_rate": 1.729582413764097e-05, "loss": 2.6592, "step": 6440 }, { "epoch": 0.9488608153583052, "grad_norm": 2.1735180334051947, "learning_rate": 1.728410557297964e-05, "loss": 2.5882, "step": 6450 }, { "epoch": 0.9503319173976205, "grad_norm": 2.2369843839699857, "learning_rate": 1.7272365660357323e-05, "loss": 2.6209, "step": 6460 }, { "epoch": 0.9518030194369357, "grad_norm": 2.07822199354975, "learning_rate": 1.7260604434180872e-05, "loss": 2.614, "step": 6470 }, { "epoch": 0.9532741214762509, "grad_norm": 2.0863412231370932, "learning_rate": 1.7248821928919614e-05, "loss": 2.636, "step": 6480 }, { "epoch": 0.9547452235155661, "grad_norm": 2.152271310000148, "learning_rate": 1.723701817910524e-05, "loss": 2.6113, "step": 6490 }, { "epoch": 0.9562163255548813, "grad_norm": 2.033801398025904, "learning_rate": 1.7225193219331694e-05, "loss": 2.559, "step": 6500 }, { "epoch": 0.9576874275941964, "grad_norm": 2.2164170300112658, "learning_rate": 1.721334708425509e-05, "loss": 2.5764, "step": 6510 }, { "epoch": 0.9591585296335117, "grad_norm": 2.2448743709055283, "learning_rate": 1.720147980859361e-05, "loss": 2.571, "step": 6520 }, { "epoch": 0.9606296316728269, "grad_norm": 2.0688358549503953, "learning_rate": 1.718959142712737e-05, "loss": 2.5361, "step": 6530 }, { "epoch": 0.9621007337121421, "grad_norm": 2.082981941257469, "learning_rate": 1.717768197469836e-05, "loss": 2.6368, "step": 6540 }, { "epoch": 0.9635718357514573, "grad_norm": 2.1214477880370706, "learning_rate": 1.7165751486210326e-05, "loss": 2.6341, "step": 6550 }, { "epoch": 0.9650429377907725, "grad_norm": 2.0655398853383518, "learning_rate": 1.7153799996628655e-05, "loss": 2.5988, "step": 6560 }, { "epoch": 0.9665140398300878, "grad_norm": 2.0463831708420104, "learning_rate": 1.714182754098029e-05, "loss": 2.6607, "step": 6570 }, { "epoch": 0.9679851418694029, "grad_norm": 2.1482124469183796, "learning_rate": 1.7129834154353607e-05, "loss": 2.6311, "step": 6580 }, { "epoch": 0.9694562439087181, "grad_norm": 2.091723581182737, "learning_rate": 1.7117819871898354e-05, "loss": 2.6874, "step": 6590 }, { "epoch": 0.9709273459480333, "grad_norm": 2.127070728964039, "learning_rate": 1.7105784728825492e-05, "loss": 2.5516, "step": 6600 }, { "epoch": 0.9723984479873485, "grad_norm": 2.028533307108539, "learning_rate": 1.709372876040713e-05, "loss": 2.6125, "step": 6610 }, { "epoch": 0.9738695500266638, "grad_norm": 2.146635994029564, "learning_rate": 1.7081652001976414e-05, "loss": 2.5767, "step": 6620 }, { "epoch": 0.975340652065979, "grad_norm": 2.153008092973894, "learning_rate": 1.706955448892742e-05, "loss": 2.6273, "step": 6630 }, { "epoch": 0.9768117541052941, "grad_norm": 2.2125530077735687, "learning_rate": 1.7057436256715038e-05, "loss": 2.5931, "step": 6640 }, { "epoch": 0.9782828561446093, "grad_norm": 2.152783889500107, "learning_rate": 1.7045297340854904e-05, "loss": 2.5328, "step": 6650 }, { "epoch": 0.9797539581839245, "grad_norm": 2.0269229104930795, "learning_rate": 1.703313777692325e-05, "loss": 2.6197, "step": 6660 }, { "epoch": 0.9812250602232397, "grad_norm": 1.9725574508962058, "learning_rate": 1.7020957600556837e-05, "loss": 2.5429, "step": 6670 }, { "epoch": 0.982696162262555, "grad_norm": 2.194590371650428, "learning_rate": 1.7008756847452834e-05, "loss": 2.5806, "step": 6680 }, { "epoch": 0.9841672643018702, "grad_norm": 2.162422995189865, "learning_rate": 1.6996535553368706e-05, "loss": 2.6539, "step": 6690 }, { "epoch": 0.9856383663411853, "grad_norm": 2.103452362272789, "learning_rate": 1.698429375412213e-05, "loss": 2.6682, "step": 6700 }, { "epoch": 0.9871094683805005, "grad_norm": 2.118498510830661, "learning_rate": 1.6972031485590874e-05, "loss": 2.6093, "step": 6710 }, { "epoch": 0.9885805704198157, "grad_norm": 2.0283431153796365, "learning_rate": 1.69597487837127e-05, "loss": 2.6064, "step": 6720 }, { "epoch": 0.990051672459131, "grad_norm": 2.1534789335081927, "learning_rate": 1.6947445684485244e-05, "loss": 2.5153, "step": 6730 }, { "epoch": 0.9915227744984462, "grad_norm": 2.0242343299575243, "learning_rate": 1.6935122223965937e-05, "loss": 2.6519, "step": 6740 }, { "epoch": 0.9929938765377614, "grad_norm": 2.105059586365848, "learning_rate": 1.6922778438271875e-05, "loss": 2.6132, "step": 6750 }, { "epoch": 0.9944649785770765, "grad_norm": 1.9895561301670714, "learning_rate": 1.6910414363579726e-05, "loss": 2.5556, "step": 6760 }, { "epoch": 0.9959360806163917, "grad_norm": 2.086798412774775, "learning_rate": 1.6898030036125615e-05, "loss": 2.6177, "step": 6770 }, { "epoch": 0.9974071826557069, "grad_norm": 1.8637160263522088, "learning_rate": 1.6885625492205026e-05, "loss": 2.603, "step": 6780 }, { "epoch": 0.9988782846950222, "grad_norm": 2.0646682245678014, "learning_rate": 1.68732007681727e-05, "loss": 2.61, "step": 6790 }, { "epoch": 1.000294220407863, "grad_norm": 2.019627803762242, "learning_rate": 1.68607559004425e-05, "loss": 2.5804, "step": 6800 }, { "epoch": 1.0017653224471783, "grad_norm": 2.081721652863151, "learning_rate": 1.6848290925487357e-05, "loss": 2.4762, "step": 6810 }, { "epoch": 1.0032364244864935, "grad_norm": 2.176120642565375, "learning_rate": 1.68358058798391e-05, "loss": 2.4498, "step": 6820 }, { "epoch": 1.0047075265258087, "grad_norm": 2.3100791911882554, "learning_rate": 1.6823300800088395e-05, "loss": 2.4447, "step": 6830 }, { "epoch": 1.0061786285651237, "grad_norm": 2.4584462164170757, "learning_rate": 1.6810775722884633e-05, "loss": 2.4253, "step": 6840 }, { "epoch": 1.007649730604439, "grad_norm": 2.276738579677762, "learning_rate": 1.679823068493579e-05, "loss": 2.3709, "step": 6850 }, { "epoch": 1.0091208326437542, "grad_norm": 2.074041259675006, "learning_rate": 1.678566572300836e-05, "loss": 2.3686, "step": 6860 }, { "epoch": 1.0105919346830694, "grad_norm": 2.2567424256749007, "learning_rate": 1.6773080873927225e-05, "loss": 2.3983, "step": 6870 }, { "epoch": 1.0120630367223846, "grad_norm": 2.28339717696511, "learning_rate": 1.676047617457554e-05, "loss": 2.422, "step": 6880 }, { "epoch": 1.0135341387616998, "grad_norm": 2.615085772004312, "learning_rate": 1.6747851661894664e-05, "loss": 2.4126, "step": 6890 }, { "epoch": 1.015005240801015, "grad_norm": 2.362588980503791, "learning_rate": 1.673520737288399e-05, "loss": 2.3918, "step": 6900 }, { "epoch": 1.0164763428403303, "grad_norm": 2.250116014940073, "learning_rate": 1.6722543344600893e-05, "loss": 2.4557, "step": 6910 }, { "epoch": 1.0179474448796455, "grad_norm": 2.470742984279999, "learning_rate": 1.6709859614160593e-05, "loss": 2.4094, "step": 6920 }, { "epoch": 1.0194185469189607, "grad_norm": 2.3965266014029805, "learning_rate": 1.6697156218736053e-05, "loss": 2.3915, "step": 6930 }, { "epoch": 1.020889648958276, "grad_norm": 2.5407259152634465, "learning_rate": 1.6684433195557867e-05, "loss": 2.379, "step": 6940 }, { "epoch": 1.0223607509975912, "grad_norm": 2.418198316257277, "learning_rate": 1.6671690581914157e-05, "loss": 2.3821, "step": 6950 }, { "epoch": 1.0238318530369064, "grad_norm": 2.448774797919758, "learning_rate": 1.665892841515046e-05, "loss": 2.4211, "step": 6960 }, { "epoch": 1.0253029550762214, "grad_norm": 2.5768480577674353, "learning_rate": 1.6646146732669614e-05, "loss": 2.453, "step": 6970 }, { "epoch": 1.0267740571155366, "grad_norm": 2.3137285855980596, "learning_rate": 1.663334557193165e-05, "loss": 2.4423, "step": 6980 }, { "epoch": 1.0282451591548518, "grad_norm": 2.306616974315616, "learning_rate": 1.6620524970453695e-05, "loss": 2.4649, "step": 6990 }, { "epoch": 1.029716261194167, "grad_norm": 2.512964213348438, "learning_rate": 1.6607684965809853e-05, "loss": 2.4415, "step": 7000 }, { "epoch": 1.029716261194167, "eval_loss": 2.522259473800659, "eval_runtime": 337.9931, "eval_samples_per_second": 270.979, "eval_steps_per_second": 8.471, "step": 7000 }, { "epoch": 1.0311873632334823, "grad_norm": 2.462416687333051, "learning_rate": 1.6594825595631082e-05, "loss": 2.3594, "step": 7010 }, { "epoch": 1.0326584652727975, "grad_norm": 2.4705295360920987, "learning_rate": 1.65819468976051e-05, "loss": 2.4248, "step": 7020 }, { "epoch": 1.0341295673121127, "grad_norm": 2.2871618710219606, "learning_rate": 1.656904890947628e-05, "loss": 2.3849, "step": 7030 }, { "epoch": 1.035600669351428, "grad_norm": 2.3257543797515945, "learning_rate": 1.6556131669045515e-05, "loss": 2.4629, "step": 7040 }, { "epoch": 1.0370717713907431, "grad_norm": 2.2955459114856724, "learning_rate": 1.6543195214170133e-05, "loss": 2.3798, "step": 7050 }, { "epoch": 1.0385428734300584, "grad_norm": 2.4681600265530332, "learning_rate": 1.6530239582763773e-05, "loss": 2.46, "step": 7060 }, { "epoch": 1.0400139754693736, "grad_norm": 2.1929927212057763, "learning_rate": 1.6517264812796265e-05, "loss": 2.3963, "step": 7070 }, { "epoch": 1.0414850775086888, "grad_norm": 2.530749882257626, "learning_rate": 1.650427094229355e-05, "loss": 2.4174, "step": 7080 }, { "epoch": 1.0429561795480038, "grad_norm": 2.1869949834164943, "learning_rate": 1.6491258009337526e-05, "loss": 2.3547, "step": 7090 }, { "epoch": 1.044427281587319, "grad_norm": 2.3001186027738747, "learning_rate": 1.6478226052065976e-05, "loss": 2.3516, "step": 7100 }, { "epoch": 1.0458983836266342, "grad_norm": 2.2617438209316267, "learning_rate": 1.6465175108672428e-05, "loss": 2.4048, "step": 7110 }, { "epoch": 1.0473694856659495, "grad_norm": 2.38595729140565, "learning_rate": 1.6452105217406054e-05, "loss": 2.4472, "step": 7120 }, { "epoch": 1.0488405877052647, "grad_norm": 2.5826948059614736, "learning_rate": 1.6439016416571573e-05, "loss": 2.4727, "step": 7130 }, { "epoch": 1.05031168974458, "grad_norm": 2.417696539452213, "learning_rate": 1.6425908744529096e-05, "loss": 2.4186, "step": 7140 }, { "epoch": 1.051782791783895, "grad_norm": 2.7414906811691457, "learning_rate": 1.6412782239694065e-05, "loss": 2.4278, "step": 7150 }, { "epoch": 1.0532538938232103, "grad_norm": 2.4700928435744043, "learning_rate": 1.639963694053711e-05, "loss": 2.4916, "step": 7160 }, { "epoch": 1.0547249958625255, "grad_norm": 2.3569624672708174, "learning_rate": 1.638647288558393e-05, "loss": 2.4195, "step": 7170 }, { "epoch": 1.0561960979018408, "grad_norm": 2.6525839600968877, "learning_rate": 1.637329011341521e-05, "loss": 2.432, "step": 7180 }, { "epoch": 1.057667199941156, "grad_norm": 2.3363543178025967, "learning_rate": 1.6360088662666485e-05, "loss": 2.3477, "step": 7190 }, { "epoch": 1.0591383019804712, "grad_norm": 2.3640219819702915, "learning_rate": 1.634686857202803e-05, "loss": 2.3908, "step": 7200 }, { "epoch": 1.0606094040197864, "grad_norm": 2.3161884304128697, "learning_rate": 1.6333629880244748e-05, "loss": 2.4505, "step": 7210 }, { "epoch": 1.0620805060591014, "grad_norm": 2.4927380336549727, "learning_rate": 1.632037262611606e-05, "loss": 2.4189, "step": 7220 }, { "epoch": 1.0635516080984166, "grad_norm": 2.504456816261851, "learning_rate": 1.6307096848495788e-05, "loss": 2.4425, "step": 7230 }, { "epoch": 1.0650227101377319, "grad_norm": 2.691433564001996, "learning_rate": 1.6293802586292047e-05, "loss": 2.4337, "step": 7240 }, { "epoch": 1.066493812177047, "grad_norm": 2.364954046579027, "learning_rate": 1.6280489878467113e-05, "loss": 2.3671, "step": 7250 }, { "epoch": 1.0679649142163623, "grad_norm": 2.6002781771983554, "learning_rate": 1.6267158764037336e-05, "loss": 2.4016, "step": 7260 }, { "epoch": 1.0694360162556775, "grad_norm": 2.5612549302534027, "learning_rate": 1.6253809282073005e-05, "loss": 2.3993, "step": 7270 }, { "epoch": 1.0709071182949927, "grad_norm": 2.4116753000571016, "learning_rate": 1.6240441471698235e-05, "loss": 2.4106, "step": 7280 }, { "epoch": 1.072378220334308, "grad_norm": 2.4799882272671447, "learning_rate": 1.6227055372090867e-05, "loss": 2.3686, "step": 7290 }, { "epoch": 1.0738493223736232, "grad_norm": 2.4948583252449117, "learning_rate": 1.6213651022482334e-05, "loss": 2.4438, "step": 7300 }, { "epoch": 1.0753204244129384, "grad_norm": 2.7698483237794127, "learning_rate": 1.620022846215757e-05, "loss": 2.4115, "step": 7310 }, { "epoch": 1.0767915264522536, "grad_norm": 2.2674468676008033, "learning_rate": 1.6186787730454857e-05, "loss": 2.3874, "step": 7320 }, { "epoch": 1.0782626284915688, "grad_norm": 2.421664846904039, "learning_rate": 1.6173328866765752e-05, "loss": 2.4069, "step": 7330 }, { "epoch": 1.0797337305308838, "grad_norm": 2.45458397129261, "learning_rate": 1.6159851910534947e-05, "loss": 2.4112, "step": 7340 }, { "epoch": 1.081204832570199, "grad_norm": 2.479616933138887, "learning_rate": 1.6146356901260157e-05, "loss": 2.4131, "step": 7350 }, { "epoch": 1.0826759346095143, "grad_norm": 2.5726491219593637, "learning_rate": 1.613284387849201e-05, "loss": 2.439, "step": 7360 }, { "epoch": 1.0841470366488295, "grad_norm": 2.211909647913895, "learning_rate": 1.611931288183392e-05, "loss": 2.3836, "step": 7370 }, { "epoch": 1.0856181386881447, "grad_norm": 2.47288635484619, "learning_rate": 1.6105763950941985e-05, "loss": 2.3782, "step": 7380 }, { "epoch": 1.08708924072746, "grad_norm": 2.192237125317018, "learning_rate": 1.6092197125524865e-05, "loss": 2.4127, "step": 7390 }, { "epoch": 1.0885603427667752, "grad_norm": 2.346722985871126, "learning_rate": 1.607861244534366e-05, "loss": 2.3882, "step": 7400 }, { "epoch": 1.0900314448060904, "grad_norm": 2.7490007721842273, "learning_rate": 1.6065009950211793e-05, "loss": 2.4892, "step": 7410 }, { "epoch": 1.0915025468454056, "grad_norm": 2.4121913635025547, "learning_rate": 1.6051389679994916e-05, "loss": 2.4236, "step": 7420 }, { "epoch": 1.0929736488847208, "grad_norm": 2.4564058026426356, "learning_rate": 1.6037751674610755e-05, "loss": 2.4114, "step": 7430 }, { "epoch": 1.094444750924036, "grad_norm": 2.790710408921306, "learning_rate": 1.602409597402902e-05, "loss": 2.4875, "step": 7440 }, { "epoch": 1.0959158529633513, "grad_norm": 2.311969190797976, "learning_rate": 1.60104226182713e-05, "loss": 2.4387, "step": 7450 }, { "epoch": 1.0973869550026665, "grad_norm": 2.1972715272022834, "learning_rate": 1.5996731647410894e-05, "loss": 2.3774, "step": 7460 }, { "epoch": 1.0988580570419815, "grad_norm": 2.763870253539534, "learning_rate": 1.598302310157275e-05, "loss": 2.3126, "step": 7470 }, { "epoch": 1.1003291590812967, "grad_norm": 2.506995721995503, "learning_rate": 1.5969297020933323e-05, "loss": 2.3094, "step": 7480 }, { "epoch": 1.101800261120612, "grad_norm": 2.7068552226877807, "learning_rate": 1.5955553445720444e-05, "loss": 2.4441, "step": 7490 }, { "epoch": 1.1032713631599271, "grad_norm": 2.625388767638196, "learning_rate": 1.5941792416213235e-05, "loss": 2.4564, "step": 7500 }, { "epoch": 1.1047424651992424, "grad_norm": 2.412493199740379, "learning_rate": 1.592801397274195e-05, "loss": 2.3537, "step": 7510 }, { "epoch": 1.1062135672385576, "grad_norm": 2.7034721679001694, "learning_rate": 1.591421815568791e-05, "loss": 2.4533, "step": 7520 }, { "epoch": 1.1076846692778728, "grad_norm": 2.227825400515432, "learning_rate": 1.5900405005483317e-05, "loss": 2.3674, "step": 7530 }, { "epoch": 1.109155771317188, "grad_norm": 2.5953485260388893, "learning_rate": 1.58865745626112e-05, "loss": 2.4589, "step": 7540 }, { "epoch": 1.1106268733565032, "grad_norm": 2.5292634359501815, "learning_rate": 1.5872726867605262e-05, "loss": 2.4, "step": 7550 }, { "epoch": 1.1120979753958184, "grad_norm": 2.703653865772252, "learning_rate": 1.5858861961049763e-05, "loss": 2.355, "step": 7560 }, { "epoch": 1.1135690774351337, "grad_norm": 2.4304175709250124, "learning_rate": 1.5844979883579404e-05, "loss": 2.4407, "step": 7570 }, { "epoch": 1.115040179474449, "grad_norm": 2.546113752768121, "learning_rate": 1.583108067587922e-05, "loss": 2.3582, "step": 7580 }, { "epoch": 1.1165112815137639, "grad_norm": 2.584352953206119, "learning_rate": 1.5817164378684443e-05, "loss": 2.3868, "step": 7590 }, { "epoch": 1.117982383553079, "grad_norm": 2.8002557710949922, "learning_rate": 1.580323103278039e-05, "loss": 2.4482, "step": 7600 }, { "epoch": 1.1194534855923943, "grad_norm": 2.3865222342078294, "learning_rate": 1.5789280679002353e-05, "loss": 2.3964, "step": 7610 }, { "epoch": 1.1209245876317095, "grad_norm": 2.6309746461457015, "learning_rate": 1.5775313358235455e-05, "loss": 2.4662, "step": 7620 }, { "epoch": 1.1223956896710248, "grad_norm": 2.3855175749722166, "learning_rate": 1.5761329111414557e-05, "loss": 2.4447, "step": 7630 }, { "epoch": 1.12386679171034, "grad_norm": 2.692227438660528, "learning_rate": 1.5747327979524114e-05, "loss": 2.4701, "step": 7640 }, { "epoch": 1.1253378937496552, "grad_norm": 2.4752886125753273, "learning_rate": 1.573331000359809e-05, "loss": 2.3622, "step": 7650 }, { "epoch": 1.1268089957889704, "grad_norm": 2.6586296585740934, "learning_rate": 1.5719275224719784e-05, "loss": 2.4486, "step": 7660 }, { "epoch": 1.1282800978282856, "grad_norm": 2.605177193814124, "learning_rate": 1.5705223684021768e-05, "loss": 2.3945, "step": 7670 }, { "epoch": 1.1297511998676009, "grad_norm": 2.33076026903591, "learning_rate": 1.569115542268572e-05, "loss": 2.4449, "step": 7680 }, { "epoch": 1.131222301906916, "grad_norm": 2.837483982983246, "learning_rate": 1.5677070481942337e-05, "loss": 2.3635, "step": 7690 }, { "epoch": 1.1326934039462313, "grad_norm": 2.6847795177639595, "learning_rate": 1.566296890307118e-05, "loss": 2.3355, "step": 7700 }, { "epoch": 1.1341645059855465, "grad_norm": 2.755997221359399, "learning_rate": 1.5648850727400597e-05, "loss": 2.3738, "step": 7710 }, { "epoch": 1.1356356080248617, "grad_norm": 2.3878493350999594, "learning_rate": 1.5634715996307558e-05, "loss": 2.4125, "step": 7720 }, { "epoch": 1.1371067100641767, "grad_norm": 2.670009872823281, "learning_rate": 1.5620564751217555e-05, "loss": 2.4502, "step": 7730 }, { "epoch": 1.138577812103492, "grad_norm": 2.3016432388185364, "learning_rate": 1.5606397033604493e-05, "loss": 2.4407, "step": 7740 }, { "epoch": 1.1400489141428072, "grad_norm": 2.8580162056335245, "learning_rate": 1.5592212884990536e-05, "loss": 2.392, "step": 7750 }, { "epoch": 1.1415200161821224, "grad_norm": 2.363812040656155, "learning_rate": 1.5578012346946008e-05, "loss": 2.3453, "step": 7760 }, { "epoch": 1.1429911182214376, "grad_norm": 2.394329832635475, "learning_rate": 1.556379546108928e-05, "loss": 2.4238, "step": 7770 }, { "epoch": 1.1444622202607528, "grad_norm": 2.3201168068252893, "learning_rate": 1.554956226908662e-05, "loss": 2.4436, "step": 7780 }, { "epoch": 1.145933322300068, "grad_norm": 2.443023233727837, "learning_rate": 1.5535312812652086e-05, "loss": 2.4361, "step": 7790 }, { "epoch": 1.1474044243393833, "grad_norm": 2.6564805674658927, "learning_rate": 1.552104713354741e-05, "loss": 2.4685, "step": 7800 }, { "epoch": 1.1488755263786985, "grad_norm": 2.799682668146018, "learning_rate": 1.5506765273581863e-05, "loss": 2.4472, "step": 7810 }, { "epoch": 1.1503466284180137, "grad_norm": 2.8667621974059574, "learning_rate": 1.5492467274612145e-05, "loss": 2.3565, "step": 7820 }, { "epoch": 1.151817730457329, "grad_norm": 2.235301123519444, "learning_rate": 1.5478153178542245e-05, "loss": 2.4079, "step": 7830 }, { "epoch": 1.153288832496644, "grad_norm": 2.436140987172138, "learning_rate": 1.546382302732334e-05, "loss": 2.3827, "step": 7840 }, { "epoch": 1.1547599345359592, "grad_norm": 2.652141642583632, "learning_rate": 1.5449476862953653e-05, "loss": 2.3754, "step": 7850 }, { "epoch": 1.1562310365752744, "grad_norm": 2.7936613853630985, "learning_rate": 1.5435114727478335e-05, "loss": 2.421, "step": 7860 }, { "epoch": 1.1577021386145896, "grad_norm": 2.5038583966927783, "learning_rate": 1.5420736662989362e-05, "loss": 2.3625, "step": 7870 }, { "epoch": 1.1591732406539048, "grad_norm": 2.620808668325254, "learning_rate": 1.5406342711625373e-05, "loss": 2.4151, "step": 7880 }, { "epoch": 1.16064434269322, "grad_norm": 2.49778265620219, "learning_rate": 1.5391932915571582e-05, "loss": 2.4548, "step": 7890 }, { "epoch": 1.1621154447325353, "grad_norm": 2.566532570044452, "learning_rate": 1.5377507317059627e-05, "loss": 2.3657, "step": 7900 }, { "epoch": 1.1635865467718505, "grad_norm": 2.666923865924686, "learning_rate": 1.5363065958367472e-05, "loss": 2.3389, "step": 7910 }, { "epoch": 1.1650576488111657, "grad_norm": 2.6960636814312644, "learning_rate": 1.5348608881819265e-05, "loss": 2.4475, "step": 7920 }, { "epoch": 1.166528750850481, "grad_norm": 2.584553371378225, "learning_rate": 1.533413612978522e-05, "loss": 2.4387, "step": 7930 }, { "epoch": 1.1679998528897961, "grad_norm": 2.6191080326115297, "learning_rate": 1.5319647744681484e-05, "loss": 2.4085, "step": 7940 }, { "epoch": 1.1694709549291114, "grad_norm": 2.7494722913332508, "learning_rate": 1.530514376897004e-05, "loss": 2.391, "step": 7950 }, { "epoch": 1.1709420569684266, "grad_norm": 2.5442419711978355, "learning_rate": 1.529062424515854e-05, "loss": 2.3957, "step": 7960 }, { "epoch": 1.1724131590077418, "grad_norm": 2.6542991221687036, "learning_rate": 1.527608921580022e-05, "loss": 2.4376, "step": 7970 }, { "epoch": 1.1738842610470568, "grad_norm": 2.575228049619445, "learning_rate": 1.5261538723493758e-05, "loss": 2.375, "step": 7980 }, { "epoch": 1.175355363086372, "grad_norm": 2.668934999938476, "learning_rate": 1.5246972810883138e-05, "loss": 2.3642, "step": 7990 }, { "epoch": 1.1768264651256872, "grad_norm": 2.4378918436975106, "learning_rate": 1.5232391520657555e-05, "loss": 2.4281, "step": 8000 }, { "epoch": 1.1768264651256872, "eval_loss": 2.4909818172454834, "eval_runtime": 350.6717, "eval_samples_per_second": 261.182, "eval_steps_per_second": 8.164, "step": 8000 }, { "epoch": 1.1782975671650024, "grad_norm": 2.7307204346686142, "learning_rate": 1.5217794895551254e-05, "loss": 2.3377, "step": 8010 }, { "epoch": 1.1797686692043177, "grad_norm": 2.464105225095492, "learning_rate": 1.5203182978343437e-05, "loss": 2.3695, "step": 8020 }, { "epoch": 1.1812397712436329, "grad_norm": 2.205473219068819, "learning_rate": 1.5188555811858114e-05, "loss": 2.3925, "step": 8030 }, { "epoch": 1.182710873282948, "grad_norm": 2.5303361306613312, "learning_rate": 1.5173913438963994e-05, "loss": 2.3867, "step": 8040 }, { "epoch": 1.1841819753222633, "grad_norm": 2.8897398080674788, "learning_rate": 1.5159255902574349e-05, "loss": 2.4299, "step": 8050 }, { "epoch": 1.1856530773615785, "grad_norm": 2.650998918309853, "learning_rate": 1.5144583245646889e-05, "loss": 2.4086, "step": 8060 }, { "epoch": 1.1871241794008938, "grad_norm": 2.8249668867022546, "learning_rate": 1.5129895511183645e-05, "loss": 2.465, "step": 8070 }, { "epoch": 1.188595281440209, "grad_norm": 2.1841119249201415, "learning_rate": 1.5115192742230828e-05, "loss": 2.3749, "step": 8080 }, { "epoch": 1.190066383479524, "grad_norm": 2.650758252896847, "learning_rate": 1.5100474981878725e-05, "loss": 2.3928, "step": 8090 }, { "epoch": 1.1915374855188392, "grad_norm": 2.7608855483465873, "learning_rate": 1.5085742273261543e-05, "loss": 2.4245, "step": 8100 }, { "epoch": 1.1930085875581544, "grad_norm": 2.985960058799002, "learning_rate": 1.5070994659557307e-05, "loss": 2.4067, "step": 8110 }, { "epoch": 1.1944796895974696, "grad_norm": 2.514782362794386, "learning_rate": 1.5056232183987729e-05, "loss": 2.3515, "step": 8120 }, { "epoch": 1.1959507916367849, "grad_norm": 2.3927333226428926, "learning_rate": 1.5041454889818075e-05, "loss": 2.4707, "step": 8130 }, { "epoch": 1.1974218936761, "grad_norm": 2.501793170369459, "learning_rate": 1.502666282035703e-05, "loss": 2.401, "step": 8140 }, { "epoch": 1.1988929957154153, "grad_norm": 2.4079653614698144, "learning_rate": 1.5011856018956596e-05, "loss": 2.3999, "step": 8150 }, { "epoch": 1.2003640977547305, "grad_norm": 2.371444275944208, "learning_rate": 1.499703452901195e-05, "loss": 2.3761, "step": 8160 }, { "epoch": 1.2018351997940457, "grad_norm": 2.704498067179267, "learning_rate": 1.4982198393961307e-05, "loss": 2.3815, "step": 8170 }, { "epoch": 1.203306301833361, "grad_norm": 2.665043281506683, "learning_rate": 1.4967347657285812e-05, "loss": 2.4569, "step": 8180 }, { "epoch": 1.2047774038726762, "grad_norm": 2.4526503771570836, "learning_rate": 1.49524823625094e-05, "loss": 2.4098, "step": 8190 }, { "epoch": 1.2062485059119914, "grad_norm": 2.412640003790122, "learning_rate": 1.4937602553198674e-05, "loss": 2.4275, "step": 8200 }, { "epoch": 1.2077196079513066, "grad_norm": 2.544173630229281, "learning_rate": 1.4922708272962773e-05, "loss": 2.4021, "step": 8210 }, { "epoch": 1.2091907099906218, "grad_norm": 2.7491981756311628, "learning_rate": 1.4907799565453253e-05, "loss": 2.3743, "step": 8220 }, { "epoch": 1.2106618120299368, "grad_norm": 2.5421850144739726, "learning_rate": 1.4892876474363948e-05, "loss": 2.4409, "step": 8230 }, { "epoch": 1.212132914069252, "grad_norm": 2.4204218906369848, "learning_rate": 1.4877939043430849e-05, "loss": 2.4047, "step": 8240 }, { "epoch": 1.2136040161085673, "grad_norm": 2.526472020555818, "learning_rate": 1.4862987316431969e-05, "loss": 2.4231, "step": 8250 }, { "epoch": 1.2150751181478825, "grad_norm": 2.6904960880423316, "learning_rate": 1.4848021337187226e-05, "loss": 2.42, "step": 8260 }, { "epoch": 1.2165462201871977, "grad_norm": 2.8177027340194813, "learning_rate": 1.48330411495583e-05, "loss": 2.4219, "step": 8270 }, { "epoch": 1.218017322226513, "grad_norm": 2.7988699674480744, "learning_rate": 1.481804679744852e-05, "loss": 2.3578, "step": 8280 }, { "epoch": 1.2194884242658282, "grad_norm": 2.8164151403816917, "learning_rate": 1.4803038324802728e-05, "loss": 2.401, "step": 8290 }, { "epoch": 1.2209595263051434, "grad_norm": 2.685966251247371, "learning_rate": 1.478801577560714e-05, "loss": 2.4036, "step": 8300 }, { "epoch": 1.2224306283444586, "grad_norm": 2.5528049404131044, "learning_rate": 1.4772979193889236e-05, "loss": 2.4493, "step": 8310 }, { "epoch": 1.2239017303837738, "grad_norm": 2.5615003621966634, "learning_rate": 1.4757928623717618e-05, "loss": 2.3984, "step": 8320 }, { "epoch": 1.225372832423089, "grad_norm": 2.6025438276020965, "learning_rate": 1.4742864109201894e-05, "loss": 2.4487, "step": 8330 }, { "epoch": 1.226843934462404, "grad_norm": 2.802045451382, "learning_rate": 1.472778569449252e-05, "loss": 2.4361, "step": 8340 }, { "epoch": 1.2283150365017192, "grad_norm": 2.5805463524520205, "learning_rate": 1.4712693423780712e-05, "loss": 2.4012, "step": 8350 }, { "epoch": 1.2297861385410345, "grad_norm": 2.60806656661521, "learning_rate": 1.4697587341298278e-05, "loss": 2.4648, "step": 8360 }, { "epoch": 1.2312572405803497, "grad_norm": 2.7286738882606496, "learning_rate": 1.4682467491317514e-05, "loss": 2.4099, "step": 8370 }, { "epoch": 1.232728342619665, "grad_norm": 2.3721317093766037, "learning_rate": 1.4667333918151063e-05, "loss": 2.3596, "step": 8380 }, { "epoch": 1.2341994446589801, "grad_norm": 2.2915505562900527, "learning_rate": 1.4652186666151787e-05, "loss": 2.3573, "step": 8390 }, { "epoch": 1.2356705466982953, "grad_norm": 2.476929819372251, "learning_rate": 1.4637025779712636e-05, "loss": 2.4244, "step": 8400 }, { "epoch": 1.2371416487376106, "grad_norm": 2.5354237346590036, "learning_rate": 1.462185130326652e-05, "loss": 2.3931, "step": 8410 }, { "epoch": 1.2386127507769258, "grad_norm": 2.5429449302998477, "learning_rate": 1.4606663281286181e-05, "loss": 2.3963, "step": 8420 }, { "epoch": 1.240083852816241, "grad_norm": 2.1608727812755815, "learning_rate": 1.4591461758284058e-05, "loss": 2.3888, "step": 8430 }, { "epoch": 1.2415549548555562, "grad_norm": 2.604924339507663, "learning_rate": 1.4576246778812155e-05, "loss": 2.4544, "step": 8440 }, { "epoch": 1.2430260568948714, "grad_norm": 2.6192957732814452, "learning_rate": 1.4561018387461917e-05, "loss": 2.4338, "step": 8450 }, { "epoch": 1.2444971589341867, "grad_norm": 2.41203178259359, "learning_rate": 1.4545776628864099e-05, "loss": 2.424, "step": 8460 }, { "epoch": 1.2459682609735019, "grad_norm": 2.5645317111076156, "learning_rate": 1.4530521547688624e-05, "loss": 2.388, "step": 8470 }, { "epoch": 1.2474393630128169, "grad_norm": 2.6076978455225905, "learning_rate": 1.4515253188644467e-05, "loss": 2.3978, "step": 8480 }, { "epoch": 1.248910465052132, "grad_norm": 2.4644487603321874, "learning_rate": 1.4499971596479515e-05, "loss": 2.4022, "step": 8490 }, { "epoch": 1.2503815670914473, "grad_norm": 2.601984965695311, "learning_rate": 1.4484676815980435e-05, "loss": 2.4259, "step": 8500 }, { "epoch": 1.2518526691307625, "grad_norm": 2.794452558243942, "learning_rate": 1.4469368891972551e-05, "loss": 2.4005, "step": 8510 }, { "epoch": 1.2533237711700778, "grad_norm": 2.36772763185603, "learning_rate": 1.4454047869319705e-05, "loss": 2.4279, "step": 8520 }, { "epoch": 1.254794873209393, "grad_norm": 2.639079160282587, "learning_rate": 1.443871379292413e-05, "loss": 2.3391, "step": 8530 }, { "epoch": 1.2562659752487082, "grad_norm": 2.407290050488924, "learning_rate": 1.4423366707726316e-05, "loss": 2.38, "step": 8540 }, { "epoch": 1.2577370772880234, "grad_norm": 2.7031719309786895, "learning_rate": 1.4408006658704876e-05, "loss": 2.3688, "step": 8550 }, { "epoch": 1.2592081793273386, "grad_norm": 2.360032440445141, "learning_rate": 1.4392633690876418e-05, "loss": 2.4782, "step": 8560 }, { "epoch": 1.2606792813666539, "grad_norm": 2.4767444938568297, "learning_rate": 1.4377247849295408e-05, "loss": 2.4276, "step": 8570 }, { "epoch": 1.262150383405969, "grad_norm": 2.482808488151245, "learning_rate": 1.4361849179054055e-05, "loss": 2.4521, "step": 8580 }, { "epoch": 1.263621485445284, "grad_norm": 2.2905266090853478, "learning_rate": 1.4346437725282147e-05, "loss": 2.4653, "step": 8590 }, { "epoch": 1.2650925874845993, "grad_norm": 2.3337626641312594, "learning_rate": 1.4331013533146956e-05, "loss": 2.4982, "step": 8600 }, { "epoch": 1.2665636895239145, "grad_norm": 2.7267912774309115, "learning_rate": 1.4315576647853072e-05, "loss": 2.4901, "step": 8610 }, { "epoch": 1.2680347915632297, "grad_norm": 2.721511888083161, "learning_rate": 1.4300127114642295e-05, "loss": 2.407, "step": 8620 }, { "epoch": 1.269505893602545, "grad_norm": 2.5328859223680444, "learning_rate": 1.4284664978793488e-05, "loss": 2.4544, "step": 8630 }, { "epoch": 1.2709769956418602, "grad_norm": 2.286614837365679, "learning_rate": 1.4269190285622454e-05, "loss": 2.3546, "step": 8640 }, { "epoch": 1.2724480976811754, "grad_norm": 2.3700938700600744, "learning_rate": 1.4253703080481793e-05, "loss": 2.3521, "step": 8650 }, { "epoch": 1.2739191997204906, "grad_norm": 2.405500059397626, "learning_rate": 1.4238203408760779e-05, "loss": 2.3417, "step": 8660 }, { "epoch": 1.2753903017598058, "grad_norm": 2.5188615827718173, "learning_rate": 1.4222691315885219e-05, "loss": 2.333, "step": 8670 }, { "epoch": 1.276861403799121, "grad_norm": 2.264032433935611, "learning_rate": 1.4207166847317329e-05, "loss": 2.3733, "step": 8680 }, { "epoch": 1.2783325058384363, "grad_norm": 2.4622879064379393, "learning_rate": 1.419163004855559e-05, "loss": 2.463, "step": 8690 }, { "epoch": 1.2798036078777515, "grad_norm": 2.585806577472414, "learning_rate": 1.4176080965134616e-05, "loss": 2.4747, "step": 8700 }, { "epoch": 1.2812747099170667, "grad_norm": 2.2371899494621243, "learning_rate": 1.4160519642625037e-05, "loss": 2.383, "step": 8710 }, { "epoch": 1.282745811956382, "grad_norm": 2.6014268007436554, "learning_rate": 1.4144946126633338e-05, "loss": 2.3954, "step": 8720 }, { "epoch": 1.2842169139956972, "grad_norm": 2.6075615939839407, "learning_rate": 1.4129360462801756e-05, "loss": 2.4116, "step": 8730 }, { "epoch": 1.2856880160350121, "grad_norm": 2.4477152800769093, "learning_rate": 1.4113762696808118e-05, "loss": 2.422, "step": 8740 }, { "epoch": 1.2871591180743274, "grad_norm": 2.2586983105283247, "learning_rate": 1.4098152874365723e-05, "loss": 2.3677, "step": 8750 }, { "epoch": 1.2886302201136426, "grad_norm": 2.5846658005380916, "learning_rate": 1.4082531041223204e-05, "loss": 2.4162, "step": 8760 }, { "epoch": 1.2901013221529578, "grad_norm": 2.4699691973737288, "learning_rate": 1.4066897243164398e-05, "loss": 2.4403, "step": 8770 }, { "epoch": 1.291572424192273, "grad_norm": 2.830075213229464, "learning_rate": 1.4051251526008205e-05, "loss": 2.4042, "step": 8780 }, { "epoch": 1.2930435262315882, "grad_norm": 2.460290760197572, "learning_rate": 1.4035593935608457e-05, "loss": 2.4471, "step": 8790 }, { "epoch": 1.2945146282709035, "grad_norm": 2.353792936977325, "learning_rate": 1.4019924517853789e-05, "loss": 2.476, "step": 8800 }, { "epoch": 1.2959857303102187, "grad_norm": 2.843376537027846, "learning_rate": 1.4004243318667488e-05, "loss": 2.5452, "step": 8810 }, { "epoch": 1.297456832349534, "grad_norm": 2.573645423347063, "learning_rate": 1.3988550384007379e-05, "loss": 2.4344, "step": 8820 }, { "epoch": 1.2989279343888491, "grad_norm": 2.2165040633047357, "learning_rate": 1.3972845759865677e-05, "loss": 2.3593, "step": 8830 }, { "epoch": 1.3003990364281641, "grad_norm": 2.5836978862834044, "learning_rate": 1.3957129492268862e-05, "loss": 2.3955, "step": 8840 }, { "epoch": 1.3018701384674793, "grad_norm": 2.6341661492862167, "learning_rate": 1.3941401627277526e-05, "loss": 2.4107, "step": 8850 }, { "epoch": 1.3033412405067946, "grad_norm": 2.8241575925248372, "learning_rate": 1.3925662210986262e-05, "loss": 2.4608, "step": 8860 }, { "epoch": 1.3048123425461098, "grad_norm": 2.197023906810008, "learning_rate": 1.3909911289523512e-05, "loss": 2.4199, "step": 8870 }, { "epoch": 1.306283444585425, "grad_norm": 2.6165300314271236, "learning_rate": 1.3894148909051437e-05, "loss": 2.3889, "step": 8880 }, { "epoch": 1.3077545466247402, "grad_norm": 2.6291085859185332, "learning_rate": 1.3878375115765781e-05, "loss": 2.354, "step": 8890 }, { "epoch": 1.3092256486640554, "grad_norm": 2.4897411126273368, "learning_rate": 1.3862589955895737e-05, "loss": 2.3478, "step": 8900 }, { "epoch": 1.3106967507033707, "grad_norm": 2.4705059638444977, "learning_rate": 1.3846793475703812e-05, "loss": 2.4007, "step": 8910 }, { "epoch": 1.3121678527426859, "grad_norm": 2.504836060078305, "learning_rate": 1.383098572148569e-05, "loss": 2.3732, "step": 8920 }, { "epoch": 1.313638954782001, "grad_norm": 2.6278259323522764, "learning_rate": 1.3815166739570094e-05, "loss": 2.3088, "step": 8930 }, { "epoch": 1.3151100568213163, "grad_norm": 2.803211319620455, "learning_rate": 1.3799336576318655e-05, "loss": 2.474, "step": 8940 }, { "epoch": 1.3165811588606315, "grad_norm": 2.4668873982040083, "learning_rate": 1.3783495278125771e-05, "loss": 2.3704, "step": 8950 }, { "epoch": 1.3180522608999468, "grad_norm": 2.374402922900511, "learning_rate": 1.376764289141848e-05, "loss": 2.3919, "step": 8960 }, { "epoch": 1.319523362939262, "grad_norm": 2.7890607244111076, "learning_rate": 1.375177946265631e-05, "loss": 2.4012, "step": 8970 }, { "epoch": 1.3209944649785772, "grad_norm": 2.6119535127920734, "learning_rate": 1.3735905038331155e-05, "loss": 2.3746, "step": 8980 }, { "epoch": 1.3224655670178922, "grad_norm": 2.711275319355986, "learning_rate": 1.3720019664967133e-05, "loss": 2.3585, "step": 8990 }, { "epoch": 1.3239366690572074, "grad_norm": 2.3348439408991113, "learning_rate": 1.3704123389120452e-05, "loss": 2.3066, "step": 9000 }, { "epoch": 1.3239366690572074, "eval_loss": 2.466747999191284, "eval_runtime": 344.116, "eval_samples_per_second": 266.157, "eval_steps_per_second": 8.32, "step": 9000 }, { "epoch": 1.3255548813004543, "grad_norm": 3.2559258621036005, "learning_rate": 1.3688216257379271e-05, "loss": 2.311, "step": 9010 }, { "epoch": 1.3270259833397695, "grad_norm": 2.676468929956256, "learning_rate": 1.3672298316363573e-05, "loss": 2.4017, "step": 9020 }, { "epoch": 1.3284970853790847, "grad_norm": 2.5669810321443784, "learning_rate": 1.3656369612725001e-05, "loss": 2.3839, "step": 9030 }, { "epoch": 1.3299681874183997, "grad_norm": 2.6927568696011277, "learning_rate": 1.3640430193146768e-05, "loss": 2.396, "step": 9040 }, { "epoch": 1.331439289457715, "grad_norm": 2.723080098914417, "learning_rate": 1.3624480104343463e-05, "loss": 2.4066, "step": 9050 }, { "epoch": 1.3329103914970302, "grad_norm": 2.2850599731383965, "learning_rate": 1.3608519393060971e-05, "loss": 2.3954, "step": 9060 }, { "epoch": 1.3343814935363454, "grad_norm": 2.3531895353754995, "learning_rate": 1.3592548106076291e-05, "loss": 2.4257, "step": 9070 }, { "epoch": 1.3358525955756606, "grad_norm": 2.4727194120079017, "learning_rate": 1.3576566290197421e-05, "loss": 2.3876, "step": 9080 }, { "epoch": 1.3373236976149758, "grad_norm": 2.5989870117347174, "learning_rate": 1.3560573992263224e-05, "loss": 2.429, "step": 9090 }, { "epoch": 1.338794799654291, "grad_norm": 2.705767925067454, "learning_rate": 1.3544571259143276e-05, "loss": 2.3843, "step": 9100 }, { "epoch": 1.3402659016936063, "grad_norm": 2.3078442848538927, "learning_rate": 1.3528558137737733e-05, "loss": 2.391, "step": 9110 }, { "epoch": 1.3417370037329215, "grad_norm": 2.396343324991712, "learning_rate": 1.3512534674977208e-05, "loss": 2.346, "step": 9120 }, { "epoch": 1.3432081057722367, "grad_norm": 2.8612820770621457, "learning_rate": 1.3496500917822608e-05, "loss": 2.3734, "step": 9130 }, { "epoch": 1.3446792078115517, "grad_norm": 2.8719420062152583, "learning_rate": 1.3480456913265027e-05, "loss": 2.3308, "step": 9140 }, { "epoch": 1.346150309850867, "grad_norm": 2.432081423230842, "learning_rate": 1.3464402708325573e-05, "loss": 2.3623, "step": 9150 }, { "epoch": 1.3476214118901821, "grad_norm": 2.788124523241804, "learning_rate": 1.3448338350055265e-05, "loss": 2.3909, "step": 9160 }, { "epoch": 1.3490925139294974, "grad_norm": 2.7521491765193535, "learning_rate": 1.3432263885534873e-05, "loss": 2.4006, "step": 9170 }, { "epoch": 1.3505636159688126, "grad_norm": 2.535537944702782, "learning_rate": 1.3416179361874777e-05, "loss": 2.373, "step": 9180 }, { "epoch": 1.3520347180081278, "grad_norm": 2.5986322062589213, "learning_rate": 1.3400084826214855e-05, "loss": 2.4123, "step": 9190 }, { "epoch": 1.353505820047443, "grad_norm": 2.6140784213349444, "learning_rate": 1.3383980325724315e-05, "loss": 2.4239, "step": 9200 }, { "epoch": 1.3549769220867582, "grad_norm": 2.34721909336437, "learning_rate": 1.3367865907601572e-05, "loss": 2.3925, "step": 9210 }, { "epoch": 1.3564480241260735, "grad_norm": 2.5377538668495947, "learning_rate": 1.335174161907411e-05, "loss": 2.3133, "step": 9220 }, { "epoch": 1.3579191261653887, "grad_norm": 2.667512166501538, "learning_rate": 1.333560750739834e-05, "loss": 2.364, "step": 9230 }, { "epoch": 1.359390228204704, "grad_norm": 2.647667300804029, "learning_rate": 1.331946361985946e-05, "loss": 2.4089, "step": 9240 }, { "epoch": 1.3608613302440191, "grad_norm": 2.295140784559217, "learning_rate": 1.3303310003771329e-05, "loss": 2.3606, "step": 9250 }, { "epoch": 1.3623324322833343, "grad_norm": 2.7874087959938385, "learning_rate": 1.3287146706476298e-05, "loss": 2.3779, "step": 9260 }, { "epoch": 1.3638035343226496, "grad_norm": 2.2710441619778736, "learning_rate": 1.3270973775345103e-05, "loss": 2.3473, "step": 9270 }, { "epoch": 1.3652746363619648, "grad_norm": 2.2837928502051015, "learning_rate": 1.3254791257776725e-05, "loss": 2.3426, "step": 9280 }, { "epoch": 1.36674573840128, "grad_norm": 2.904856256077315, "learning_rate": 1.3238599201198219e-05, "loss": 2.3636, "step": 9290 }, { "epoch": 1.368216840440595, "grad_norm": 2.6550423813890847, "learning_rate": 1.3222397653064614e-05, "loss": 2.3685, "step": 9300 }, { "epoch": 1.3696879424799102, "grad_norm": 2.734717282791015, "learning_rate": 1.3206186660858743e-05, "loss": 2.4104, "step": 9310 }, { "epoch": 1.3711590445192254, "grad_norm": 2.5962923779069214, "learning_rate": 1.3189966272091131e-05, "loss": 2.4619, "step": 9320 }, { "epoch": 1.3726301465585407, "grad_norm": 2.478799551300158, "learning_rate": 1.3173736534299832e-05, "loss": 2.4226, "step": 9330 }, { "epoch": 1.3741012485978559, "grad_norm": 2.6562924625741, "learning_rate": 1.3157497495050302e-05, "loss": 2.4197, "step": 9340 }, { "epoch": 1.375572350637171, "grad_norm": 2.297046538009216, "learning_rate": 1.314124920193526e-05, "loss": 2.3516, "step": 9350 }, { "epoch": 1.3770434526764863, "grad_norm": 2.450703513455938, "learning_rate": 1.3124991702574544e-05, "loss": 2.3672, "step": 9360 }, { "epoch": 1.3785145547158015, "grad_norm": 2.623803464914788, "learning_rate": 1.3108725044614969e-05, "loss": 2.375, "step": 9370 }, { "epoch": 1.3799856567551168, "grad_norm": 2.5847446394258524, "learning_rate": 1.3092449275730203e-05, "loss": 2.3516, "step": 9380 }, { "epoch": 1.3814567587944317, "grad_norm": 2.4739643730894554, "learning_rate": 1.3076164443620608e-05, "loss": 2.2741, "step": 9390 }, { "epoch": 1.382927860833747, "grad_norm": 2.5117700840508306, "learning_rate": 1.30598705960131e-05, "loss": 2.4414, "step": 9400 }, { "epoch": 1.3843989628730622, "grad_norm": 2.653173179810463, "learning_rate": 1.3043567780661038e-05, "loss": 2.4484, "step": 9410 }, { "epoch": 1.3858700649123774, "grad_norm": 2.7240418149554886, "learning_rate": 1.3027256045344041e-05, "loss": 2.4335, "step": 9420 }, { "epoch": 1.3873411669516926, "grad_norm": 2.6767895689164787, "learning_rate": 1.301093543786789e-05, "loss": 2.3947, "step": 9430 }, { "epoch": 1.3888122689910078, "grad_norm": 2.495640985036208, "learning_rate": 1.2994606006064352e-05, "loss": 2.3954, "step": 9440 }, { "epoch": 1.390283371030323, "grad_norm": 2.507207278955325, "learning_rate": 1.297826779779107e-05, "loss": 2.4007, "step": 9450 }, { "epoch": 1.3917544730696383, "grad_norm": 2.636945254694588, "learning_rate": 1.2961920860931398e-05, "loss": 2.3741, "step": 9460 }, { "epoch": 1.3932255751089535, "grad_norm": 2.61728854096052, "learning_rate": 1.2945565243394271e-05, "loss": 2.3338, "step": 9470 }, { "epoch": 1.3946966771482687, "grad_norm": 2.6106457847765157, "learning_rate": 1.2929200993114077e-05, "loss": 2.335, "step": 9480 }, { "epoch": 1.396167779187584, "grad_norm": 2.6031132772484384, "learning_rate": 1.2912828158050493e-05, "loss": 2.3382, "step": 9490 }, { "epoch": 1.3976388812268992, "grad_norm": 2.3263302161120607, "learning_rate": 1.2896446786188356e-05, "loss": 2.3727, "step": 9500 }, { "epoch": 1.3991099832662144, "grad_norm": 2.5012911356660235, "learning_rate": 1.2880056925537531e-05, "loss": 2.3111, "step": 9510 }, { "epoch": 1.4005810853055296, "grad_norm": 2.390037767245813, "learning_rate": 1.2863658624132755e-05, "loss": 2.3377, "step": 9520 }, { "epoch": 1.4020521873448448, "grad_norm": 2.4678468285247575, "learning_rate": 1.2847251930033505e-05, "loss": 2.3711, "step": 9530 }, { "epoch": 1.40352328938416, "grad_norm": 2.5848317033718566, "learning_rate": 1.2830836891323847e-05, "loss": 2.4182, "step": 9540 }, { "epoch": 1.404994391423475, "grad_norm": 2.3998299175931006, "learning_rate": 1.2814413556112322e-05, "loss": 2.3746, "step": 9550 }, { "epoch": 1.4064654934627903, "grad_norm": 2.4581076418687746, "learning_rate": 1.2797981972531766e-05, "loss": 2.3908, "step": 9560 }, { "epoch": 1.4079365955021055, "grad_norm": 2.6402384312209493, "learning_rate": 1.2781542188739203e-05, "loss": 2.3453, "step": 9570 }, { "epoch": 1.4094076975414207, "grad_norm": 2.6780326871531974, "learning_rate": 1.2765094252915677e-05, "loss": 2.3658, "step": 9580 }, { "epoch": 1.410878799580736, "grad_norm": 2.7843039282181006, "learning_rate": 1.2748638213266138e-05, "loss": 2.3418, "step": 9590 }, { "epoch": 1.4123499016200511, "grad_norm": 2.446077828032173, "learning_rate": 1.2732174118019274e-05, "loss": 2.3584, "step": 9600 }, { "epoch": 1.4138210036593664, "grad_norm": 2.5245956718568086, "learning_rate": 1.2715702015427389e-05, "loss": 2.3129, "step": 9610 }, { "epoch": 1.4152921056986816, "grad_norm": 2.45698860698678, "learning_rate": 1.2699221953766252e-05, "loss": 2.294, "step": 9620 }, { "epoch": 1.4167632077379968, "grad_norm": 2.837249811836683, "learning_rate": 1.2682733981334961e-05, "loss": 2.357, "step": 9630 }, { "epoch": 1.4182343097773118, "grad_norm": 2.5459507627273488, "learning_rate": 1.2666238146455794e-05, "loss": 2.3364, "step": 9640 }, { "epoch": 1.419705411816627, "grad_norm": 2.643099473986947, "learning_rate": 1.2649734497474078e-05, "loss": 2.3687, "step": 9650 }, { "epoch": 1.4211765138559422, "grad_norm": 2.5916814219711872, "learning_rate": 1.263322308275803e-05, "loss": 2.3331, "step": 9660 }, { "epoch": 1.4226476158952575, "grad_norm": 2.596797241870581, "learning_rate": 1.2616703950698644e-05, "loss": 2.3651, "step": 9670 }, { "epoch": 1.4241187179345727, "grad_norm": 2.439824537350601, "learning_rate": 1.2600177149709516e-05, "loss": 2.3901, "step": 9680 }, { "epoch": 1.425589819973888, "grad_norm": 2.598968558498181, "learning_rate": 1.2583642728226725e-05, "loss": 2.2917, "step": 9690 }, { "epoch": 1.4270609220132031, "grad_norm": 2.716662356712752, "learning_rate": 1.2567100734708681e-05, "loss": 2.3172, "step": 9700 }, { "epoch": 1.4285320240525183, "grad_norm": 2.735838761001265, "learning_rate": 1.2550551217635989e-05, "loss": 2.3793, "step": 9710 }, { "epoch": 1.4300031260918336, "grad_norm": 2.6283651984211347, "learning_rate": 1.2533994225511302e-05, "loss": 2.3964, "step": 9720 }, { "epoch": 1.4314742281311488, "grad_norm": 2.4668749221690223, "learning_rate": 1.2517429806859177e-05, "loss": 2.429, "step": 9730 }, { "epoch": 1.432945330170464, "grad_norm": 2.6959586034311913, "learning_rate": 1.2500858010225945e-05, "loss": 2.4097, "step": 9740 }, { "epoch": 1.4344164322097792, "grad_norm": 2.760907719231498, "learning_rate": 1.2484278884179555e-05, "loss": 2.455, "step": 9750 }, { "epoch": 1.4358875342490944, "grad_norm": 2.5681284743168007, "learning_rate": 1.246769247730943e-05, "loss": 2.3963, "step": 9760 }, { "epoch": 1.4373586362884097, "grad_norm": 2.560484050068811, "learning_rate": 1.2451098838226347e-05, "loss": 2.3923, "step": 9770 }, { "epoch": 1.4388297383277249, "grad_norm": 2.744422014199199, "learning_rate": 1.2434498015562264e-05, "loss": 2.4138, "step": 9780 }, { "epoch": 1.44030084036704, "grad_norm": 2.8061348902973733, "learning_rate": 1.2417890057970199e-05, "loss": 2.3776, "step": 9790 }, { "epoch": 1.441771942406355, "grad_norm": 2.5139754147794386, "learning_rate": 1.2401275014124084e-05, "loss": 2.328, "step": 9800 }, { "epoch": 1.4432430444456703, "grad_norm": 2.574251488534968, "learning_rate": 1.2384652932718611e-05, "loss": 2.3139, "step": 9810 }, { "epoch": 1.4447141464849855, "grad_norm": 2.7273783430796437, "learning_rate": 1.23680238624691e-05, "loss": 2.3621, "step": 9820 }, { "epoch": 1.4461852485243007, "grad_norm": 2.5991325656021305, "learning_rate": 1.235138785211136e-05, "loss": 2.4098, "step": 9830 }, { "epoch": 1.447656350563616, "grad_norm": 2.71132043260636, "learning_rate": 1.2334744950401536e-05, "loss": 2.3447, "step": 9840 }, { "epoch": 1.4491274526029312, "grad_norm": 2.697375312733348, "learning_rate": 1.231809520611596e-05, "loss": 2.3274, "step": 9850 }, { "epoch": 1.4505985546422464, "grad_norm": 2.4865577894009663, "learning_rate": 1.2301438668051038e-05, "loss": 2.3927, "step": 9860 }, { "epoch": 1.4520696566815616, "grad_norm": 2.73112210772354, "learning_rate": 1.2284775385023071e-05, "loss": 2.3998, "step": 9870 }, { "epoch": 1.4535407587208768, "grad_norm": 2.7038715183432553, "learning_rate": 1.2268105405868131e-05, "loss": 2.3857, "step": 9880 }, { "epoch": 1.4550118607601918, "grad_norm": 2.6924204606998843, "learning_rate": 1.2251428779441914e-05, "loss": 2.3652, "step": 9890 }, { "epoch": 1.456482962799507, "grad_norm": 2.8806261529421073, "learning_rate": 1.223474555461961e-05, "loss": 2.3626, "step": 9900 }, { "epoch": 1.4579540648388223, "grad_norm": 2.383752531991848, "learning_rate": 1.2218055780295726e-05, "loss": 2.3147, "step": 9910 }, { "epoch": 1.4594251668781375, "grad_norm": 2.5163510207657587, "learning_rate": 1.220135950538398e-05, "loss": 2.4219, "step": 9920 }, { "epoch": 1.4608962689174527, "grad_norm": 2.765008937921741, "learning_rate": 1.2184656778817137e-05, "loss": 2.3209, "step": 9930 }, { "epoch": 1.462367370956768, "grad_norm": 2.401479183393926, "learning_rate": 1.216794764954687e-05, "loss": 2.3944, "step": 9940 }, { "epoch": 1.4638384729960832, "grad_norm": 2.4321952084025393, "learning_rate": 1.2151232166543615e-05, "loss": 2.3092, "step": 9950 }, { "epoch": 1.4653095750353984, "grad_norm": 2.470044472346823, "learning_rate": 1.2134510378796439e-05, "loss": 2.3826, "step": 9960 }, { "epoch": 1.4667806770747136, "grad_norm": 2.748241388774167, "learning_rate": 1.2117782335312871e-05, "loss": 2.372, "step": 9970 }, { "epoch": 1.4682517791140288, "grad_norm": 2.4212904458143734, "learning_rate": 1.210104808511878e-05, "loss": 2.392, "step": 9980 }, { "epoch": 1.469722881153344, "grad_norm": 2.367894769432844, "learning_rate": 1.2084307677258235e-05, "loss": 2.4091, "step": 9990 }, { "epoch": 1.4711939831926593, "grad_norm": 2.880462237719193, "learning_rate": 1.2067561160793341e-05, "loss": 2.3294, "step": 10000 }, { "epoch": 1.4711939831926593, "eval_loss": 2.442768096923828, "eval_runtime": 570.5531, "eval_samples_per_second": 160.527, "eval_steps_per_second": 5.018, "step": 10000 }, { "epoch": 1.4726650852319745, "grad_norm": 2.6566669418925892, "learning_rate": 1.2050808584804105e-05, "loss": 2.3834, "step": 10010 }, { "epoch": 1.4741361872712897, "grad_norm": 2.80762020506157, "learning_rate": 1.2034049998388302e-05, "loss": 2.3308, "step": 10020 }, { "epoch": 1.475607289310605, "grad_norm": 2.675485066836816, "learning_rate": 1.2017285450661311e-05, "loss": 2.3077, "step": 10030 }, { "epoch": 1.4770783913499201, "grad_norm": 2.522032390819777, "learning_rate": 1.2000514990755998e-05, "loss": 2.3834, "step": 10040 }, { "epoch": 1.4785494933892351, "grad_norm": 2.5219548018649087, "learning_rate": 1.1983738667822535e-05, "loss": 2.3844, "step": 10050 }, { "epoch": 1.4800205954285504, "grad_norm": 2.672689958523864, "learning_rate": 1.1966956531028296e-05, "loss": 2.3464, "step": 10060 }, { "epoch": 1.4814916974678656, "grad_norm": 2.863300188244239, "learning_rate": 1.1950168629557684e-05, "loss": 2.4247, "step": 10070 }, { "epoch": 1.4829627995071808, "grad_norm": 2.572709537202734, "learning_rate": 1.1933375012611999e-05, "loss": 2.3197, "step": 10080 }, { "epoch": 1.484433901546496, "grad_norm": 2.4789100309068584, "learning_rate": 1.1916575729409292e-05, "loss": 2.3349, "step": 10090 }, { "epoch": 1.4859050035858112, "grad_norm": 2.7549743744902426, "learning_rate": 1.189977082918422e-05, "loss": 2.4032, "step": 10100 }, { "epoch": 1.4873761056251265, "grad_norm": 2.4362358151755754, "learning_rate": 1.1882960361187903e-05, "loss": 2.437, "step": 10110 }, { "epoch": 1.4888472076644417, "grad_norm": 2.53102755967527, "learning_rate": 1.186614437468778e-05, "loss": 2.3631, "step": 10120 }, { "epoch": 1.490318309703757, "grad_norm": 2.79593536463562, "learning_rate": 1.1849322918967458e-05, "loss": 2.4512, "step": 10130 }, { "epoch": 1.491789411743072, "grad_norm": 2.4887931495009226, "learning_rate": 1.183249604332658e-05, "loss": 2.3003, "step": 10140 }, { "epoch": 1.4932605137823871, "grad_norm": 2.574632306794161, "learning_rate": 1.181566379708067e-05, "loss": 2.3729, "step": 10150 }, { "epoch": 1.4947316158217023, "grad_norm": 2.6794611515993947, "learning_rate": 1.1798826229560993e-05, "loss": 2.4267, "step": 10160 }, { "epoch": 1.4962027178610176, "grad_norm": 2.6996200089240063, "learning_rate": 1.1781983390114404e-05, "loss": 2.3733, "step": 10170 }, { "epoch": 1.4976738199003328, "grad_norm": 2.440365018403851, "learning_rate": 1.1765135328103219e-05, "loss": 2.4151, "step": 10180 }, { "epoch": 1.499144921939648, "grad_norm": 2.5139638289426314, "learning_rate": 1.174828209290505e-05, "loss": 2.3773, "step": 10190 }, { "epoch": 1.5006160239789632, "grad_norm": 2.6026380886060316, "learning_rate": 1.1731423733912677e-05, "loss": 2.444, "step": 10200 }, { "epoch": 1.5020871260182784, "grad_norm": 2.533016696587374, "learning_rate": 1.1714560300533893e-05, "loss": 2.3453, "step": 10210 }, { "epoch": 1.5035582280575936, "grad_norm": 2.7055854916423536, "learning_rate": 1.1697691842191365e-05, "loss": 2.3232, "step": 10220 }, { "epoch": 1.5050293300969089, "grad_norm": 2.6752959439401263, "learning_rate": 1.1680818408322486e-05, "loss": 2.4159, "step": 10230 }, { "epoch": 1.506500432136224, "grad_norm": 2.370325248687303, "learning_rate": 1.166394004837923e-05, "loss": 2.3689, "step": 10240 }, { "epoch": 1.5079715341755393, "grad_norm": 2.789130928173859, "learning_rate": 1.1647056811828012e-05, "loss": 2.3319, "step": 10250 }, { "epoch": 1.5094426362148545, "grad_norm": 2.563767801404464, "learning_rate": 1.1630168748149537e-05, "loss": 2.3786, "step": 10260 }, { "epoch": 1.5109137382541697, "grad_norm": 2.6363438771394367, "learning_rate": 1.161327590683865e-05, "loss": 2.386, "step": 10270 }, { "epoch": 1.512384840293485, "grad_norm": 2.608839128897307, "learning_rate": 1.1596378337404215e-05, "loss": 2.3245, "step": 10280 }, { "epoch": 1.5138559423328002, "grad_norm": 2.4976569975401155, "learning_rate": 1.1579476089368934e-05, "loss": 2.3922, "step": 10290 }, { "epoch": 1.5153270443721154, "grad_norm": 2.3941446323390196, "learning_rate": 1.1562569212269232e-05, "loss": 2.4345, "step": 10300 }, { "epoch": 1.5167981464114306, "grad_norm": 2.4816100807737596, "learning_rate": 1.15456577556551e-05, "loss": 2.2807, "step": 10310 }, { "epoch": 1.5182692484507456, "grad_norm": 2.866291915160975, "learning_rate": 1.1528741769089949e-05, "loss": 2.4022, "step": 10320 }, { "epoch": 1.5197403504900608, "grad_norm": 2.750312885991737, "learning_rate": 1.1511821302150458e-05, "loss": 2.3603, "step": 10330 }, { "epoch": 1.521211452529376, "grad_norm": 2.8052797858013205, "learning_rate": 1.1494896404426456e-05, "loss": 2.3631, "step": 10340 }, { "epoch": 1.5226825545686913, "grad_norm": 3.0346570707633105, "learning_rate": 1.1477967125520744e-05, "loss": 2.343, "step": 10350 }, { "epoch": 1.5241536566080065, "grad_norm": 2.384289552656533, "learning_rate": 1.146103351504896e-05, "loss": 2.2945, "step": 10360 }, { "epoch": 1.5256247586473217, "grad_norm": 2.557726756567248, "learning_rate": 1.1444095622639446e-05, "loss": 2.3546, "step": 10370 }, { "epoch": 1.5270958606866367, "grad_norm": 2.3498143335275823, "learning_rate": 1.142715349793309e-05, "loss": 2.3533, "step": 10380 }, { "epoch": 1.528566962725952, "grad_norm": 2.6936507343287084, "learning_rate": 1.1410207190583183e-05, "loss": 2.3876, "step": 10390 }, { "epoch": 1.5300380647652672, "grad_norm": 2.467923828347904, "learning_rate": 1.139325675025527e-05, "loss": 2.3541, "step": 10400 }, { "epoch": 1.5315091668045824, "grad_norm": 2.473184343283628, "learning_rate": 1.1376302226627023e-05, "loss": 2.3588, "step": 10410 }, { "epoch": 1.5329802688438976, "grad_norm": 2.6682380425382557, "learning_rate": 1.1359343669388067e-05, "loss": 2.3174, "step": 10420 }, { "epoch": 1.5344513708832128, "grad_norm": 2.64478728656671, "learning_rate": 1.134238112823985e-05, "loss": 2.3432, "step": 10430 }, { "epoch": 1.535922472922528, "grad_norm": 2.5639370612391756, "learning_rate": 1.1325414652895503e-05, "loss": 2.3585, "step": 10440 }, { "epoch": 1.5373935749618433, "grad_norm": 2.8869396113432004, "learning_rate": 1.1308444293079685e-05, "loss": 2.3147, "step": 10450 }, { "epoch": 1.5388646770011585, "grad_norm": 2.841266603197888, "learning_rate": 1.1291470098528432e-05, "loss": 2.3472, "step": 10460 }, { "epoch": 1.5403357790404737, "grad_norm": 2.643295503349407, "learning_rate": 1.1274492118989032e-05, "loss": 2.3651, "step": 10470 }, { "epoch": 1.541806881079789, "grad_norm": 2.5484456036548337, "learning_rate": 1.1257510404219851e-05, "loss": 2.3913, "step": 10480 }, { "epoch": 1.5432779831191041, "grad_norm": 2.58687041175103, "learning_rate": 1.1240525003990217e-05, "loss": 2.3089, "step": 10490 }, { "epoch": 1.5447490851584194, "grad_norm": 2.664130844464465, "learning_rate": 1.1223535968080243e-05, "loss": 2.3262, "step": 10500 }, { "epoch": 1.5462201871977346, "grad_norm": 2.632577206181427, "learning_rate": 1.1206543346280714e-05, "loss": 2.2876, "step": 10510 }, { "epoch": 1.5476912892370498, "grad_norm": 2.633209981241062, "learning_rate": 1.1189547188392911e-05, "loss": 2.3958, "step": 10520 }, { "epoch": 1.549162391276365, "grad_norm": 2.6295767839702555, "learning_rate": 1.1172547544228484e-05, "loss": 2.3666, "step": 10530 }, { "epoch": 1.5506334933156802, "grad_norm": 2.4442699339038922, "learning_rate": 1.1155544463609303e-05, "loss": 2.3363, "step": 10540 }, { "epoch": 1.5521045953549955, "grad_norm": 2.3436411472382126, "learning_rate": 1.1138537996367305e-05, "loss": 2.3639, "step": 10550 }, { "epoch": 1.5535756973943107, "grad_norm": 2.563166785342026, "learning_rate": 1.1121528192344354e-05, "loss": 2.3703, "step": 10560 }, { "epoch": 1.5550467994336257, "grad_norm": 2.546413190879069, "learning_rate": 1.1104515101392095e-05, "loss": 2.299, "step": 10570 }, { "epoch": 1.556517901472941, "grad_norm": 2.658192819147526, "learning_rate": 1.1087498773371802e-05, "loss": 2.3225, "step": 10580 }, { "epoch": 1.557989003512256, "grad_norm": 2.8585896685656005, "learning_rate": 1.107047925815424e-05, "loss": 2.3521, "step": 10590 }, { "epoch": 1.5594601055515713, "grad_norm": 2.408909804984504, "learning_rate": 1.1053456605619514e-05, "loss": 2.3391, "step": 10600 }, { "epoch": 1.5609312075908865, "grad_norm": 2.554660365717682, "learning_rate": 1.1036430865656924e-05, "loss": 2.3712, "step": 10610 }, { "epoch": 1.5624023096302018, "grad_norm": 2.4940511064381057, "learning_rate": 1.1019402088164814e-05, "loss": 2.3628, "step": 10620 }, { "epoch": 1.5638734116695168, "grad_norm": 2.6976839034843967, "learning_rate": 1.1002370323050432e-05, "loss": 2.4167, "step": 10630 }, { "epoch": 1.565344513708832, "grad_norm": 2.4856764036334438, "learning_rate": 1.0985335620229791e-05, "loss": 2.3185, "step": 10640 }, { "epoch": 1.5668156157481472, "grad_norm": 2.5724373137926717, "learning_rate": 1.0968298029627506e-05, "loss": 2.3105, "step": 10650 }, { "epoch": 1.5682867177874624, "grad_norm": 2.7439738820083472, "learning_rate": 1.095125760117665e-05, "loss": 2.3456, "step": 10660 }, { "epoch": 1.5697578198267776, "grad_norm": 2.3853391571568086, "learning_rate": 1.0934214384818626e-05, "loss": 2.2816, "step": 10670 }, { "epoch": 1.5712289218660929, "grad_norm": 2.4286404405550908, "learning_rate": 1.0917168430502998e-05, "loss": 2.3574, "step": 10680 }, { "epoch": 1.572700023905408, "grad_norm": 2.432993144993741, "learning_rate": 1.0900119788187355e-05, "loss": 2.352, "step": 10690 }, { "epoch": 1.5741711259447233, "grad_norm": 2.703819298840324, "learning_rate": 1.0883068507837169e-05, "loss": 2.3909, "step": 10700 }, { "epoch": 1.5756422279840385, "grad_norm": 2.5007056399119647, "learning_rate": 1.086601463942564e-05, "loss": 2.3151, "step": 10710 }, { "epoch": 1.5771133300233537, "grad_norm": 2.7696315062262364, "learning_rate": 1.0848958232933548e-05, "loss": 2.3463, "step": 10720 }, { "epoch": 1.578584432062669, "grad_norm": 2.7042037741188216, "learning_rate": 1.0831899338349126e-05, "loss": 2.3305, "step": 10730 }, { "epoch": 1.5800555341019842, "grad_norm": 2.710370758482609, "learning_rate": 1.0814838005667883e-05, "loss": 2.3915, "step": 10740 }, { "epoch": 1.5815266361412994, "grad_norm": 2.471481011557249, "learning_rate": 1.0797774284892482e-05, "loss": 2.2959, "step": 10750 }, { "epoch": 1.5829977381806146, "grad_norm": 2.513011177596476, "learning_rate": 1.0780708226032585e-05, "loss": 2.3192, "step": 10760 }, { "epoch": 1.5844688402199298, "grad_norm": 2.485065402650857, "learning_rate": 1.0763639879104706e-05, "loss": 2.3452, "step": 10770 }, { "epoch": 1.585939942259245, "grad_norm": 2.5369822952619745, "learning_rate": 1.074656929413206e-05, "loss": 2.3135, "step": 10780 }, { "epoch": 1.5874110442985603, "grad_norm": 2.6218283519039107, "learning_rate": 1.0729496521144427e-05, "loss": 2.381, "step": 10790 }, { "epoch": 1.5888821463378755, "grad_norm": 2.29973240638017, "learning_rate": 1.0712421610177995e-05, "loss": 2.3688, "step": 10800 }, { "epoch": 1.5903532483771907, "grad_norm": 2.492507509533079, "learning_rate": 1.0695344611275222e-05, "loss": 2.2723, "step": 10810 }, { "epoch": 1.5918243504165057, "grad_norm": 2.6668095910485445, "learning_rate": 1.067826557448468e-05, "loss": 2.3253, "step": 10820 }, { "epoch": 1.593295452455821, "grad_norm": 2.5557079634875928, "learning_rate": 1.0661184549860923e-05, "loss": 2.3865, "step": 10830 }, { "epoch": 1.5947665544951362, "grad_norm": 2.4837601170031696, "learning_rate": 1.0644101587464317e-05, "loss": 2.3365, "step": 10840 }, { "epoch": 1.5962376565344514, "grad_norm": 2.460795770123061, "learning_rate": 1.0627016737360921e-05, "loss": 2.3137, "step": 10850 }, { "epoch": 1.5977087585737666, "grad_norm": 2.3115779390118085, "learning_rate": 1.060993004962232e-05, "loss": 2.3207, "step": 10860 }, { "epoch": 1.5991798606130818, "grad_norm": 2.616674845038197, "learning_rate": 1.0592841574325482e-05, "loss": 2.3445, "step": 10870 }, { "epoch": 1.6006509626523968, "grad_norm": 2.583593753849189, "learning_rate": 1.0575751361552618e-05, "loss": 2.3197, "step": 10880 }, { "epoch": 1.602122064691712, "grad_norm": 2.61797689050197, "learning_rate": 1.0558659461391032e-05, "loss": 2.3358, "step": 10890 }, { "epoch": 1.6035931667310273, "grad_norm": 2.4896204092423306, "learning_rate": 1.0541565923932968e-05, "loss": 2.3661, "step": 10900 }, { "epoch": 1.6050642687703425, "grad_norm": 2.6716451009334214, "learning_rate": 1.0524470799275473e-05, "loss": 2.3867, "step": 10910 }, { "epoch": 1.6065353708096577, "grad_norm": 2.7347936656383336, "learning_rate": 1.0507374137520245e-05, "loss": 2.3245, "step": 10920 }, { "epoch": 1.608006472848973, "grad_norm": 2.6019017437308367, "learning_rate": 1.049027598877349e-05, "loss": 2.3755, "step": 10930 }, { "epoch": 1.6094775748882881, "grad_norm": 2.40767287395477, "learning_rate": 1.0473176403145757e-05, "loss": 2.307, "step": 10940 }, { "epoch": 1.6109486769276034, "grad_norm": 2.896358913893573, "learning_rate": 1.0456075430751829e-05, "loss": 2.38, "step": 10950 }, { "epoch": 1.6124197789669186, "grad_norm": 2.8050938831163066, "learning_rate": 1.0438973121710541e-05, "loss": 2.3421, "step": 10960 }, { "epoch": 1.6138908810062338, "grad_norm": 2.378468465770187, "learning_rate": 1.0421869526144642e-05, "loss": 2.3512, "step": 10970 }, { "epoch": 1.615361983045549, "grad_norm": 2.603147346741723, "learning_rate": 1.0404764694180653e-05, "loss": 2.3445, "step": 10980 }, { "epoch": 1.6168330850848642, "grad_norm": 2.5836920955997935, "learning_rate": 1.038765867594873e-05, "loss": 2.2831, "step": 10990 }, { "epoch": 1.6183041871241794, "grad_norm": 2.5243412167879806, "learning_rate": 1.0370551521582492e-05, "loss": 2.346, "step": 11000 }, { "epoch": 1.6183041871241794, "eval_loss": 2.419686794281006, "eval_runtime": 331.7771, "eval_samples_per_second": 276.056, "eval_steps_per_second": 8.629, "step": 11000 }, { "epoch": 1.6197752891634947, "grad_norm": 2.3928482451703497, "learning_rate": 1.0353443281218893e-05, "loss": 2.2532, "step": 11010 }, { "epoch": 1.6212463912028099, "grad_norm": 2.82309414818636, "learning_rate": 1.033633400499807e-05, "loss": 2.341, "step": 11020 }, { "epoch": 1.622717493242125, "grad_norm": 2.612725588997264, "learning_rate": 1.0319223743063196e-05, "loss": 2.3884, "step": 11030 }, { "epoch": 1.6241885952814403, "grad_norm": 2.575901908712023, "learning_rate": 1.0302112545560328e-05, "loss": 2.3751, "step": 11040 }, { "epoch": 1.6256596973207555, "grad_norm": 2.5030362715666685, "learning_rate": 1.0285000462638273e-05, "loss": 2.3681, "step": 11050 }, { "epoch": 1.6271307993600708, "grad_norm": 2.5680807413752853, "learning_rate": 1.0267887544448431e-05, "loss": 2.2612, "step": 11060 }, { "epoch": 1.6286019013993858, "grad_norm": 2.7947114987482338, "learning_rate": 1.0250773841144645e-05, "loss": 2.3272, "step": 11070 }, { "epoch": 1.630073003438701, "grad_norm": 2.816805063401047, "learning_rate": 1.0233659402883062e-05, "loss": 2.3616, "step": 11080 }, { "epoch": 1.6315441054780162, "grad_norm": 2.7306295080954577, "learning_rate": 1.0216544279821988e-05, "loss": 2.311, "step": 11090 }, { "epoch": 1.6330152075173314, "grad_norm": 2.5995341833762016, "learning_rate": 1.0199428522121723e-05, "loss": 2.3757, "step": 11100 }, { "epoch": 1.6344863095566466, "grad_norm": 2.7909523876514295, "learning_rate": 1.0182312179944438e-05, "loss": 2.4008, "step": 11110 }, { "epoch": 1.6359574115959619, "grad_norm": 2.59644510039766, "learning_rate": 1.0165195303454019e-05, "loss": 2.3766, "step": 11120 }, { "epoch": 1.6374285136352769, "grad_norm": 2.604099665314221, "learning_rate": 1.0148077942815908e-05, "loss": 2.352, "step": 11130 }, { "epoch": 1.638899615674592, "grad_norm": 2.7703426268101947, "learning_rate": 1.0130960148196968e-05, "loss": 2.3616, "step": 11140 }, { "epoch": 1.6403707177139073, "grad_norm": 2.681836870714937, "learning_rate": 1.0113841969765343e-05, "loss": 2.298, "step": 11150 }, { "epoch": 1.6418418197532225, "grad_norm": 2.495863416288284, "learning_rate": 1.0096723457690296e-05, "loss": 2.3804, "step": 11160 }, { "epoch": 1.6433129217925377, "grad_norm": 2.7490420083059672, "learning_rate": 1.0079604662142062e-05, "loss": 2.2936, "step": 11170 }, { "epoch": 1.644784023831853, "grad_norm": 2.7206147589497003, "learning_rate": 1.0062485633291716e-05, "loss": 2.3554, "step": 11180 }, { "epoch": 1.6462551258711682, "grad_norm": 2.454752131106414, "learning_rate": 1.004536642131101e-05, "loss": 2.3641, "step": 11190 }, { "epoch": 1.6477262279104834, "grad_norm": 2.497471379575004, "learning_rate": 1.002824707637224e-05, "loss": 2.4022, "step": 11200 }, { "epoch": 1.6491973299497986, "grad_norm": 2.443670700240167, "learning_rate": 1.0011127648648085e-05, "loss": 2.3396, "step": 11210 }, { "epoch": 1.6506684319891138, "grad_norm": 2.4467667457410363, "learning_rate": 9.994008188311467e-06, "loss": 2.2525, "step": 11220 }, { "epoch": 1.652139534028429, "grad_norm": 2.917186869797221, "learning_rate": 9.97688874553541e-06, "loss": 2.3497, "step": 11230 }, { "epoch": 1.6536106360677443, "grad_norm": 2.3723100394023744, "learning_rate": 9.959769370492878e-06, "loss": 2.2876, "step": 11240 }, { "epoch": 1.6550817381070595, "grad_norm": 2.740079691055537, "learning_rate": 9.94265011335664e-06, "loss": 2.3887, "step": 11250 }, { "epoch": 1.6565528401463747, "grad_norm": 2.650723091909645, "learning_rate": 9.925531024299125e-06, "loss": 2.3094, "step": 11260 }, { "epoch": 1.65802394218569, "grad_norm": 2.695761056288828, "learning_rate": 9.908412153492261e-06, "loss": 2.3593, "step": 11270 }, { "epoch": 1.6594950442250052, "grad_norm": 2.908660267050911, "learning_rate": 9.891293551107339e-06, "loss": 2.3747, "step": 11280 }, { "epoch": 1.6609661462643204, "grad_norm": 2.6928429704096035, "learning_rate": 9.87417526731487e-06, "loss": 2.4147, "step": 11290 }, { "epoch": 1.6624372483036356, "grad_norm": 2.5026957186545435, "learning_rate": 9.857057352284418e-06, "loss": 2.3221, "step": 11300 }, { "epoch": 1.6639083503429508, "grad_norm": 2.4305550727758685, "learning_rate": 9.83993985618448e-06, "loss": 2.3428, "step": 11310 }, { "epoch": 1.6653794523822658, "grad_norm": 2.4884374749973683, "learning_rate": 9.822822829182311e-06, "loss": 2.3171, "step": 11320 }, { "epoch": 1.666850554421581, "grad_norm": 2.676835171960162, "learning_rate": 9.805706321443802e-06, "loss": 2.2935, "step": 11330 }, { "epoch": 1.6683216564608963, "grad_norm": 2.464715884638053, "learning_rate": 9.788590383133327e-06, "loss": 2.3676, "step": 11340 }, { "epoch": 1.6697927585002115, "grad_norm": 2.702246424755963, "learning_rate": 9.771475064413572e-06, "loss": 2.3925, "step": 11350 }, { "epoch": 1.6712638605395267, "grad_norm": 2.424742311226495, "learning_rate": 9.754360415445424e-06, "loss": 2.3008, "step": 11360 }, { "epoch": 1.672734962578842, "grad_norm": 2.5653060443933753, "learning_rate": 9.737246486387806e-06, "loss": 2.4218, "step": 11370 }, { "epoch": 1.6742060646181571, "grad_norm": 2.7375247562989857, "learning_rate": 9.720133327397517e-06, "loss": 2.3824, "step": 11380 }, { "epoch": 1.6756771666574721, "grad_norm": 2.677890977320229, "learning_rate": 9.703020988629116e-06, "loss": 2.3227, "step": 11390 }, { "epoch": 1.6771482686967873, "grad_norm": 2.625054240458854, "learning_rate": 9.68590952023475e-06, "loss": 2.357, "step": 11400 }, { "epoch": 1.6786193707361026, "grad_norm": 2.8356430898787948, "learning_rate": 9.668798972364014e-06, "loss": 2.3511, "step": 11410 }, { "epoch": 1.6800904727754178, "grad_norm": 2.8044997591544005, "learning_rate": 9.651689395163805e-06, "loss": 2.3045, "step": 11420 }, { "epoch": 1.681561574814733, "grad_norm": 2.7132235481937443, "learning_rate": 9.634580838778184e-06, "loss": 2.3584, "step": 11430 }, { "epoch": 1.6830326768540482, "grad_norm": 2.663070615336599, "learning_rate": 9.61747335334821e-06, "loss": 2.3473, "step": 11440 }, { "epoch": 1.6845037788933634, "grad_norm": 2.646968617288892, "learning_rate": 9.600366989011806e-06, "loss": 2.3491, "step": 11450 }, { "epoch": 1.6859748809326787, "grad_norm": 2.6565473919721354, "learning_rate": 9.583261795903613e-06, "loss": 2.3305, "step": 11460 }, { "epoch": 1.6874459829719939, "grad_norm": 2.8258750515843976, "learning_rate": 9.566157824154832e-06, "loss": 2.2872, "step": 11470 }, { "epoch": 1.688917085011309, "grad_norm": 2.8859414032664206, "learning_rate": 9.549055123893095e-06, "loss": 2.3382, "step": 11480 }, { "epoch": 1.6903881870506243, "grad_norm": 2.7147718443554876, "learning_rate": 9.531953745242302e-06, "loss": 2.2966, "step": 11490 }, { "epoch": 1.6918592890899395, "grad_norm": 2.6134770322126935, "learning_rate": 9.514853738322475e-06, "loss": 2.285, "step": 11500 }, { "epoch": 1.6933303911292548, "grad_norm": 2.767472440198078, "learning_rate": 9.49775515324962e-06, "loss": 2.3136, "step": 11510 }, { "epoch": 1.69480149316857, "grad_norm": 2.627200939455974, "learning_rate": 9.480658040135585e-06, "loss": 2.3568, "step": 11520 }, { "epoch": 1.6962725952078852, "grad_norm": 2.548294714831652, "learning_rate": 9.463562449087881e-06, "loss": 2.3207, "step": 11530 }, { "epoch": 1.6977436972472004, "grad_norm": 2.555800406806819, "learning_rate": 9.446468430209583e-06, "loss": 2.3474, "step": 11540 }, { "epoch": 1.6992147992865156, "grad_norm": 2.545454539737467, "learning_rate": 9.429376033599147e-06, "loss": 2.3046, "step": 11550 }, { "epoch": 1.7006859013258309, "grad_norm": 2.584064087594338, "learning_rate": 9.41228530935027e-06, "loss": 2.3063, "step": 11560 }, { "epoch": 1.7021570033651459, "grad_norm": 2.5860070437650458, "learning_rate": 9.395196307551758e-06, "loss": 2.3244, "step": 11570 }, { "epoch": 1.703628105404461, "grad_norm": 2.608870074717971, "learning_rate": 9.378109078287365e-06, "loss": 2.4009, "step": 11580 }, { "epoch": 1.7050992074437763, "grad_norm": 2.650990756876376, "learning_rate": 9.361023671635641e-06, "loss": 2.2536, "step": 11590 }, { "epoch": 1.7065703094830915, "grad_norm": 2.6449734926041018, "learning_rate": 9.343940137669813e-06, "loss": 2.3158, "step": 11600 }, { "epoch": 1.7080414115224067, "grad_norm": 2.604990627735937, "learning_rate": 9.326858526457598e-06, "loss": 2.2984, "step": 11610 }, { "epoch": 1.709512513561722, "grad_norm": 2.5050214468446774, "learning_rate": 9.309778888061095e-06, "loss": 2.299, "step": 11620 }, { "epoch": 1.7109836156010372, "grad_norm": 2.641123203423394, "learning_rate": 9.292701272536618e-06, "loss": 2.3225, "step": 11630 }, { "epoch": 1.7124547176403522, "grad_norm": 2.4421772819824854, "learning_rate": 9.275625729934545e-06, "loss": 2.3491, "step": 11640 }, { "epoch": 1.7139258196796674, "grad_norm": 2.5447484760768266, "learning_rate": 9.258552310299182e-06, "loss": 2.3401, "step": 11650 }, { "epoch": 1.7153969217189826, "grad_norm": 3.1103086777578204, "learning_rate": 9.24148106366862e-06, "loss": 2.3725, "step": 11660 }, { "epoch": 1.7168680237582978, "grad_norm": 2.7064416737965638, "learning_rate": 9.22441204007457e-06, "loss": 2.3734, "step": 11670 }, { "epoch": 1.718339125797613, "grad_norm": 2.4170929979318907, "learning_rate": 9.207345289542235e-06, "loss": 2.2426, "step": 11680 }, { "epoch": 1.7198102278369283, "grad_norm": 2.6982610748576747, "learning_rate": 9.19028086209016e-06, "loss": 2.3721, "step": 11690 }, { "epoch": 1.7212813298762435, "grad_norm": 2.894616948342068, "learning_rate": 9.17321880773007e-06, "loss": 2.3129, "step": 11700 }, { "epoch": 1.7227524319155587, "grad_norm": 2.7223974827288147, "learning_rate": 9.156159176466741e-06, "loss": 2.3641, "step": 11710 }, { "epoch": 1.724223533954874, "grad_norm": 2.6426545546701465, "learning_rate": 9.139102018297857e-06, "loss": 2.3066, "step": 11720 }, { "epoch": 1.7256946359941892, "grad_norm": 2.4892861111409412, "learning_rate": 9.122047383213835e-06, "loss": 2.2431, "step": 11730 }, { "epoch": 1.7271657380335044, "grad_norm": 2.7106784495175504, "learning_rate": 9.104995321197709e-06, "loss": 2.3632, "step": 11740 }, { "epoch": 1.7286368400728196, "grad_norm": 2.4044541271568445, "learning_rate": 9.087945882224974e-06, "loss": 2.3602, "step": 11750 }, { "epoch": 1.7301079421121348, "grad_norm": 2.5499970879851004, "learning_rate": 9.070899116263431e-06, "loss": 2.3384, "step": 11760 }, { "epoch": 1.73157904415145, "grad_norm": 2.604217027455695, "learning_rate": 9.053855073273051e-06, "loss": 2.2763, "step": 11770 }, { "epoch": 1.7330501461907653, "grad_norm": 2.8278443052364377, "learning_rate": 9.036813803205828e-06, "loss": 2.3795, "step": 11780 }, { "epoch": 1.7345212482300805, "grad_norm": 2.950981364070602, "learning_rate": 9.019775356005616e-06, "loss": 2.3289, "step": 11790 }, { "epoch": 1.7359923502693957, "grad_norm": 2.6628711332341153, "learning_rate": 9.002739781608008e-06, "loss": 2.3686, "step": 11800 }, { "epoch": 1.737463452308711, "grad_norm": 2.462458310284877, "learning_rate": 8.985707129940177e-06, "loss": 2.3115, "step": 11810 }, { "epoch": 1.7389345543480261, "grad_norm": 2.862953494564841, "learning_rate": 8.968677450920723e-06, "loss": 2.3899, "step": 11820 }, { "epoch": 1.7404056563873411, "grad_norm": 2.406904380398983, "learning_rate": 8.95165079445954e-06, "loss": 2.2795, "step": 11830 }, { "epoch": 1.7418767584266563, "grad_norm": 2.6137032741973285, "learning_rate": 8.934627210457663e-06, "loss": 2.3163, "step": 11840 }, { "epoch": 1.7433478604659716, "grad_norm": 2.853587443870121, "learning_rate": 8.917606748807121e-06, "loss": 2.2646, "step": 11850 }, { "epoch": 1.7448189625052868, "grad_norm": 2.721757910431071, "learning_rate": 8.900589459390788e-06, "loss": 2.3485, "step": 11860 }, { "epoch": 1.746290064544602, "grad_norm": 2.4505881122537287, "learning_rate": 8.883575392082252e-06, "loss": 2.2773, "step": 11870 }, { "epoch": 1.7477611665839172, "grad_norm": 2.678442124186317, "learning_rate": 8.866564596745644e-06, "loss": 2.2989, "step": 11880 }, { "epoch": 1.7492322686232322, "grad_norm": 2.506810039424921, "learning_rate": 8.84955712323552e-06, "loss": 2.2918, "step": 11890 }, { "epoch": 1.7507033706625474, "grad_norm": 2.7877706386965975, "learning_rate": 8.832553021396685e-06, "loss": 2.3824, "step": 11900 }, { "epoch": 1.7521744727018627, "grad_norm": 2.6496350121405534, "learning_rate": 8.815552341064074e-06, "loss": 2.3489, "step": 11910 }, { "epoch": 1.7536455747411779, "grad_norm": 2.674540191956863, "learning_rate": 8.798555132062597e-06, "loss": 2.3202, "step": 11920 }, { "epoch": 1.755116676780493, "grad_norm": 2.5375605247634194, "learning_rate": 8.781561444206974e-06, "loss": 2.3622, "step": 11930 }, { "epoch": 1.7565877788198083, "grad_norm": 2.237313738675246, "learning_rate": 8.76457132730162e-06, "loss": 2.3115, "step": 11940 }, { "epoch": 1.7580588808591235, "grad_norm": 2.721256608366041, "learning_rate": 8.747584831140482e-06, "loss": 2.3133, "step": 11950 }, { "epoch": 1.7595299828984388, "grad_norm": 2.5680850069295067, "learning_rate": 8.730602005506891e-06, "loss": 2.3481, "step": 11960 }, { "epoch": 1.761001084937754, "grad_norm": 2.271599183197794, "learning_rate": 8.713622900173424e-06, "loss": 2.3307, "step": 11970 }, { "epoch": 1.7624721869770692, "grad_norm": 2.8019634930107964, "learning_rate": 8.696647564901759e-06, "loss": 2.2007, "step": 11980 }, { "epoch": 1.7639432890163844, "grad_norm": 2.882717840434447, "learning_rate": 8.679676049442511e-06, "loss": 2.3474, "step": 11990 }, { "epoch": 1.7654143910556996, "grad_norm": 2.7230825367901113, "learning_rate": 8.662708403535112e-06, "loss": 2.3633, "step": 12000 }, { "epoch": 1.7654143910556996, "eval_loss": 2.3989651203155518, "eval_runtime": 340.3146, "eval_samples_per_second": 269.13, "eval_steps_per_second": 8.413, "step": 12000 }, { "epoch": 1.7668854930950149, "grad_norm": 2.572216356614912, "learning_rate": 8.645744676907655e-06, "loss": 2.3508, "step": 12010 }, { "epoch": 1.76835659513433, "grad_norm": 2.7986639531428477, "learning_rate": 8.628784919276732e-06, "loss": 2.2573, "step": 12020 }, { "epoch": 1.7698276971736453, "grad_norm": 3.165331163256029, "learning_rate": 8.611829180347318e-06, "loss": 2.3567, "step": 12030 }, { "epoch": 1.7712987992129605, "grad_norm": 2.358468379294268, "learning_rate": 8.594877509812605e-06, "loss": 2.2795, "step": 12040 }, { "epoch": 1.7727699012522757, "grad_norm": 2.8665339367044096, "learning_rate": 8.577929957353861e-06, "loss": 2.2664, "step": 12050 }, { "epoch": 1.774241003291591, "grad_norm": 2.494486540957126, "learning_rate": 8.560986572640281e-06, "loss": 2.3004, "step": 12060 }, { "epoch": 1.7757121053309062, "grad_norm": 2.5263889581899996, "learning_rate": 8.544047405328855e-06, "loss": 2.3164, "step": 12070 }, { "epoch": 1.7771832073702212, "grad_norm": 2.8354715231741814, "learning_rate": 8.527112505064201e-06, "loss": 2.3385, "step": 12080 }, { "epoch": 1.7786543094095364, "grad_norm": 2.6386236502673595, "learning_rate": 8.510181921478443e-06, "loss": 2.3694, "step": 12090 }, { "epoch": 1.7801254114488516, "grad_norm": 2.2922385235269354, "learning_rate": 8.493255704191048e-06, "loss": 2.3283, "step": 12100 }, { "epoch": 1.7815965134881668, "grad_norm": 2.753739046775812, "learning_rate": 8.476333902808682e-06, "loss": 2.3098, "step": 12110 }, { "epoch": 1.783067615527482, "grad_norm": 2.6051284353717894, "learning_rate": 8.459416566925073e-06, "loss": 2.3503, "step": 12120 }, { "epoch": 1.7845387175667973, "grad_norm": 2.6088959417027957, "learning_rate": 8.442503746120873e-06, "loss": 2.2364, "step": 12130 }, { "epoch": 1.7860098196061123, "grad_norm": 2.6979826475467843, "learning_rate": 8.42559548996348e-06, "loss": 2.2951, "step": 12140 }, { "epoch": 1.7874809216454275, "grad_norm": 2.382775544055562, "learning_rate": 8.408691848006927e-06, "loss": 2.3326, "step": 12150 }, { "epoch": 1.7889520236847427, "grad_norm": 2.4190321455763635, "learning_rate": 8.391792869791724e-06, "loss": 2.3606, "step": 12160 }, { "epoch": 1.790423125724058, "grad_norm": 2.6677829284537165, "learning_rate": 8.374898604844711e-06, "loss": 2.3564, "step": 12170 }, { "epoch": 1.7918942277633731, "grad_norm": 2.868096120661717, "learning_rate": 8.35800910267891e-06, "loss": 2.302, "step": 12180 }, { "epoch": 1.7933653298026884, "grad_norm": 2.5466534264703222, "learning_rate": 8.341124412793399e-06, "loss": 2.2643, "step": 12190 }, { "epoch": 1.7948364318420036, "grad_norm": 2.68568050659062, "learning_rate": 8.324244584673128e-06, "loss": 2.3176, "step": 12200 }, { "epoch": 1.7963075338813188, "grad_norm": 2.566454859744201, "learning_rate": 8.30736966778882e-06, "loss": 2.3477, "step": 12210 }, { "epoch": 1.797778635920634, "grad_norm": 2.6379437196212803, "learning_rate": 8.290499711596794e-06, "loss": 2.2874, "step": 12220 }, { "epoch": 1.7992497379599492, "grad_norm": 3.094941195788663, "learning_rate": 8.273634765538833e-06, "loss": 2.2746, "step": 12230 }, { "epoch": 1.8007208399992645, "grad_norm": 2.6140141527549416, "learning_rate": 8.25677487904204e-06, "loss": 2.3202, "step": 12240 }, { "epoch": 1.8021919420385797, "grad_norm": 2.8901396885227384, "learning_rate": 8.239920101518681e-06, "loss": 2.3373, "step": 12250 }, { "epoch": 1.803663044077895, "grad_norm": 2.581386586654735, "learning_rate": 8.223070482366058e-06, "loss": 2.3043, "step": 12260 }, { "epoch": 1.8051341461172101, "grad_norm": 2.551173206181835, "learning_rate": 8.20622607096635e-06, "loss": 2.2843, "step": 12270 }, { "epoch": 1.8066052481565253, "grad_norm": 2.854243310613355, "learning_rate": 8.189386916686472e-06, "loss": 2.2844, "step": 12280 }, { "epoch": 1.8080763501958406, "grad_norm": 2.649236911942285, "learning_rate": 8.172553068877934e-06, "loss": 2.3399, "step": 12290 }, { "epoch": 1.8095474522351558, "grad_norm": 2.8792485531015606, "learning_rate": 8.155724576876701e-06, "loss": 2.3162, "step": 12300 }, { "epoch": 1.811018554274471, "grad_norm": 2.4987141109359046, "learning_rate": 8.138901490003024e-06, "loss": 2.2665, "step": 12310 }, { "epoch": 1.8124896563137862, "grad_norm": 2.5446635090908964, "learning_rate": 8.12208385756133e-06, "loss": 2.3183, "step": 12320 }, { "epoch": 1.8139607583531012, "grad_norm": 2.560493164495115, "learning_rate": 8.105271728840054e-06, "loss": 2.3224, "step": 12330 }, { "epoch": 1.8154318603924164, "grad_norm": 2.485943784293368, "learning_rate": 8.088465153111495e-06, "loss": 2.2564, "step": 12340 }, { "epoch": 1.8169029624317317, "grad_norm": 2.419002745966915, "learning_rate": 8.071664179631686e-06, "loss": 2.3465, "step": 12350 }, { "epoch": 1.8183740644710469, "grad_norm": 2.632937310223138, "learning_rate": 8.054868857640241e-06, "loss": 2.3926, "step": 12360 }, { "epoch": 1.819845166510362, "grad_norm": 2.5726989937422013, "learning_rate": 8.038079236360205e-06, "loss": 2.3071, "step": 12370 }, { "epoch": 1.8213162685496773, "grad_norm": 2.569840756129319, "learning_rate": 8.021295364997917e-06, "loss": 2.3372, "step": 12380 }, { "epoch": 1.8227873705889923, "grad_norm": 2.8647833260063367, "learning_rate": 8.004517292742875e-06, "loss": 2.2828, "step": 12390 }, { "epoch": 1.8242584726283075, "grad_norm": 2.812427677125703, "learning_rate": 7.98774506876756e-06, "loss": 2.2886, "step": 12400 }, { "epoch": 1.8257295746676228, "grad_norm": 2.5122733634188505, "learning_rate": 7.970978742227332e-06, "loss": 2.2754, "step": 12410 }, { "epoch": 1.827200676706938, "grad_norm": 2.697004036589722, "learning_rate": 7.95421836226026e-06, "loss": 2.274, "step": 12420 }, { "epoch": 1.8286717787462532, "grad_norm": 2.6555842784580594, "learning_rate": 7.937463977986984e-06, "loss": 2.3115, "step": 12430 }, { "epoch": 1.8301428807855684, "grad_norm": 2.8010923378275168, "learning_rate": 7.920715638510574e-06, "loss": 2.3251, "step": 12440 }, { "epoch": 1.8316139828248836, "grad_norm": 2.43667862219009, "learning_rate": 7.903973392916386e-06, "loss": 2.3502, "step": 12450 }, { "epoch": 1.8330850848641989, "grad_norm": 2.7837584838837794, "learning_rate": 7.88723729027191e-06, "loss": 2.3157, "step": 12460 }, { "epoch": 1.834556186903514, "grad_norm": 2.5725006170102236, "learning_rate": 7.870507379626638e-06, "loss": 2.2648, "step": 12470 }, { "epoch": 1.8360272889428293, "grad_norm": 2.734050461804077, "learning_rate": 7.853783710011913e-06, "loss": 2.3143, "step": 12480 }, { "epoch": 1.8374983909821445, "grad_norm": 2.6717942523826816, "learning_rate": 7.837066330440787e-06, "loss": 2.3909, "step": 12490 }, { "epoch": 1.8389694930214597, "grad_norm": 2.776291632320834, "learning_rate": 7.820355289907881e-06, "loss": 2.31, "step": 12500 }, { "epoch": 1.840440595060775, "grad_norm": 3.0902895728068893, "learning_rate": 7.803650637389228e-06, "loss": 2.2938, "step": 12510 }, { "epoch": 1.8419116971000902, "grad_norm": 2.7373388344311977, "learning_rate": 7.78695242184215e-06, "loss": 2.3235, "step": 12520 }, { "epoch": 1.8433827991394054, "grad_norm": 2.6518745528619005, "learning_rate": 7.770260692205102e-06, "loss": 2.3678, "step": 12530 }, { "epoch": 1.8448539011787206, "grad_norm": 2.2872652565332396, "learning_rate": 7.75357549739752e-06, "loss": 2.2939, "step": 12540 }, { "epoch": 1.8463250032180358, "grad_norm": 2.8406421315061907, "learning_rate": 7.736896886319699e-06, "loss": 2.3572, "step": 12550 }, { "epoch": 1.847796105257351, "grad_norm": 2.500073490544943, "learning_rate": 7.72022490785264e-06, "loss": 2.3093, "step": 12560 }, { "epoch": 1.8492672072966663, "grad_norm": 2.6472071204927468, "learning_rate": 7.703559610857895e-06, "loss": 2.2855, "step": 12570 }, { "epoch": 1.8507383093359813, "grad_norm": 2.829369528941027, "learning_rate": 7.686901044177442e-06, "loss": 2.2787, "step": 12580 }, { "epoch": 1.8522094113752965, "grad_norm": 2.560657830464184, "learning_rate": 7.670249256633538e-06, "loss": 2.2685, "step": 12590 }, { "epoch": 1.8536805134146117, "grad_norm": 2.869378725127949, "learning_rate": 7.653604297028554e-06, "loss": 2.2765, "step": 12600 }, { "epoch": 1.855151615453927, "grad_norm": 2.6026114812569294, "learning_rate": 7.636966214144871e-06, "loss": 2.3728, "step": 12610 }, { "epoch": 1.8566227174932421, "grad_norm": 2.764363742526596, "learning_rate": 7.620335056744708e-06, "loss": 2.3198, "step": 12620 }, { "epoch": 1.8580938195325574, "grad_norm": 2.6189833999356273, "learning_rate": 7.603710873569979e-06, "loss": 2.342, "step": 12630 }, { "epoch": 1.8595649215718724, "grad_norm": 2.7077587359983983, "learning_rate": 7.587093713342171e-06, "loss": 2.3147, "step": 12640 }, { "epoch": 1.8610360236111876, "grad_norm": 2.3976279597747236, "learning_rate": 7.570483624762182e-06, "loss": 2.2527, "step": 12650 }, { "epoch": 1.8625071256505028, "grad_norm": 2.599465595864537, "learning_rate": 7.553880656510182e-06, "loss": 2.3801, "step": 12660 }, { "epoch": 1.863978227689818, "grad_norm": 2.578399274618669, "learning_rate": 7.537284857245481e-06, "loss": 2.294, "step": 12670 }, { "epoch": 1.8654493297291332, "grad_norm": 2.472868732564471, "learning_rate": 7.520696275606373e-06, "loss": 2.2485, "step": 12680 }, { "epoch": 1.8669204317684485, "grad_norm": 2.481284621660363, "learning_rate": 7.504114960209997e-06, "loss": 2.2654, "step": 12690 }, { "epoch": 1.8683915338077637, "grad_norm": 2.8490060840283493, "learning_rate": 7.487540959652201e-06, "loss": 2.3667, "step": 12700 }, { "epoch": 1.869862635847079, "grad_norm": 2.768465986602492, "learning_rate": 7.470974322507398e-06, "loss": 2.2485, "step": 12710 }, { "epoch": 1.8713337378863941, "grad_norm": 2.575913314668681, "learning_rate": 7.454415097328407e-06, "loss": 2.2775, "step": 12720 }, { "epoch": 1.8728048399257093, "grad_norm": 2.795350833180658, "learning_rate": 7.437863332646336e-06, "loss": 2.2907, "step": 12730 }, { "epoch": 1.8742759419650246, "grad_norm": 2.7931936354227713, "learning_rate": 7.421319076970433e-06, "loss": 2.3137, "step": 12740 }, { "epoch": 1.8757470440043398, "grad_norm": 2.682983384289179, "learning_rate": 7.404782378787918e-06, "loss": 2.2398, "step": 12750 }, { "epoch": 1.877218146043655, "grad_norm": 2.603419979581306, "learning_rate": 7.388253286563882e-06, "loss": 2.2567, "step": 12760 }, { "epoch": 1.8786892480829702, "grad_norm": 2.6797857214477, "learning_rate": 7.3717318487411195e-06, "loss": 2.2606, "step": 12770 }, { "epoch": 1.8801603501222854, "grad_norm": 2.615629547186917, "learning_rate": 7.355218113739986e-06, "loss": 2.3081, "step": 12780 }, { "epoch": 1.8816314521616007, "grad_norm": 2.560739499445025, "learning_rate": 7.3387121299582695e-06, "loss": 2.3537, "step": 12790 }, { "epoch": 1.8831025542009159, "grad_norm": 2.5471717470185467, "learning_rate": 7.322213945771034e-06, "loss": 2.3214, "step": 12800 }, { "epoch": 1.884573656240231, "grad_norm": 2.5090402884792486, "learning_rate": 7.305723609530488e-06, "loss": 2.3105, "step": 12810 }, { "epoch": 1.8860447582795463, "grad_norm": 2.2880219431713464, "learning_rate": 7.289241169565846e-06, "loss": 2.2994, "step": 12820 }, { "epoch": 1.8875158603188613, "grad_norm": 2.453477917965703, "learning_rate": 7.272766674183165e-06, "loss": 2.3249, "step": 12830 }, { "epoch": 1.8889869623581765, "grad_norm": 2.72396348044151, "learning_rate": 7.25630017166523e-06, "loss": 2.2753, "step": 12840 }, { "epoch": 1.8904580643974918, "grad_norm": 2.625390273326832, "learning_rate": 7.239841710271403e-06, "loss": 2.2755, "step": 12850 }, { "epoch": 1.891929166436807, "grad_norm": 2.5229700719151067, "learning_rate": 7.223391338237471e-06, "loss": 2.321, "step": 12860 }, { "epoch": 1.8934002684761222, "grad_norm": 2.9011858808331628, "learning_rate": 7.206949103775518e-06, "loss": 2.3389, "step": 12870 }, { "epoch": 1.8948713705154374, "grad_norm": 3.005681638448819, "learning_rate": 7.1905150550737775e-06, "loss": 2.2981, "step": 12880 }, { "epoch": 1.8963424725547526, "grad_norm": 2.599681864801287, "learning_rate": 7.17408924029649e-06, "loss": 2.2857, "step": 12890 }, { "epoch": 1.8978135745940676, "grad_norm": 2.588158852274977, "learning_rate": 7.157671707583768e-06, "loss": 2.3101, "step": 12900 }, { "epoch": 1.8992846766333829, "grad_norm": 2.6963748284739983, "learning_rate": 7.141262505051456e-06, "loss": 2.2997, "step": 12910 }, { "epoch": 1.900755778672698, "grad_norm": 2.8383420018877157, "learning_rate": 7.124861680790969e-06, "loss": 2.2824, "step": 12920 }, { "epoch": 1.9022268807120133, "grad_norm": 3.0046912924975757, "learning_rate": 7.10846928286918e-06, "loss": 2.3404, "step": 12930 }, { "epoch": 1.9036979827513285, "grad_norm": 2.8070315229199627, "learning_rate": 7.092085359328271e-06, "loss": 2.3122, "step": 12940 }, { "epoch": 1.9051690847906437, "grad_norm": 2.6323527239796167, "learning_rate": 7.07570995818557e-06, "loss": 2.3679, "step": 12950 }, { "epoch": 1.906640186829959, "grad_norm": 2.513794229649011, "learning_rate": 7.059343127433443e-06, "loss": 2.292, "step": 12960 }, { "epoch": 1.9081112888692742, "grad_norm": 2.6140780534030585, "learning_rate": 7.042984915039135e-06, "loss": 2.2315, "step": 12970 }, { "epoch": 1.9095823909085894, "grad_norm": 2.579661828657211, "learning_rate": 7.026635368944626e-06, "loss": 2.2316, "step": 12980 }, { "epoch": 1.9110534929479046, "grad_norm": 2.4326417626780574, "learning_rate": 7.010294537066504e-06, "loss": 2.2762, "step": 12990 }, { "epoch": 1.9125245949872198, "grad_norm": 2.824776052067323, "learning_rate": 6.993962467295823e-06, "loss": 2.3653, "step": 13000 }, { "epoch": 1.9125245949872198, "eval_loss": 2.37896466255188, "eval_runtime": 368.0749, "eval_samples_per_second": 248.832, "eval_steps_per_second": 7.778, "step": 13000 }, { "epoch": 1.913995697026535, "grad_norm": 2.6229054111529195, "learning_rate": 6.97763920749794e-06, "loss": 2.3255, "step": 13010 }, { "epoch": 1.9154667990658503, "grad_norm": 2.494685629648072, "learning_rate": 6.961324805512404e-06, "loss": 2.2892, "step": 13020 }, { "epoch": 1.9169379011051655, "grad_norm": 2.652258307292692, "learning_rate": 6.945019309152811e-06, "loss": 2.2672, "step": 13030 }, { "epoch": 1.9184090031444807, "grad_norm": 3.0932476680650596, "learning_rate": 6.928722766206637e-06, "loss": 2.2958, "step": 13040 }, { "epoch": 1.919880105183796, "grad_norm": 2.885846842710405, "learning_rate": 6.912435224435133e-06, "loss": 2.2467, "step": 13050 }, { "epoch": 1.9213512072231111, "grad_norm": 2.8027038211930413, "learning_rate": 6.896156731573167e-06, "loss": 2.3136, "step": 13060 }, { "epoch": 1.9228223092624264, "grad_norm": 2.6581422357473614, "learning_rate": 6.879887335329083e-06, "loss": 2.2348, "step": 13070 }, { "epoch": 1.9242934113017414, "grad_norm": 2.7258363481701915, "learning_rate": 6.863627083384571e-06, "loss": 2.3256, "step": 13080 }, { "epoch": 1.9257645133410566, "grad_norm": 2.7565710536996897, "learning_rate": 6.847376023394514e-06, "loss": 2.3428, "step": 13090 }, { "epoch": 1.9272356153803718, "grad_norm": 2.517644160540783, "learning_rate": 6.831134202986858e-06, "loss": 2.238, "step": 13100 }, { "epoch": 1.928706717419687, "grad_norm": 3.101368207147356, "learning_rate": 6.814901669762476e-06, "loss": 2.3236, "step": 13110 }, { "epoch": 1.9301778194590022, "grad_norm": 2.5461771117151315, "learning_rate": 6.798678471295011e-06, "loss": 2.2784, "step": 13120 }, { "epoch": 1.9316489214983175, "grad_norm": 2.676394594322696, "learning_rate": 6.782464655130757e-06, "loss": 2.2899, "step": 13130 }, { "epoch": 1.9331200235376327, "grad_norm": 2.6917615058323605, "learning_rate": 6.766260268788515e-06, "loss": 2.3342, "step": 13140 }, { "epoch": 1.9345911255769477, "grad_norm": 2.7925062385745067, "learning_rate": 6.750065359759432e-06, "loss": 2.3106, "step": 13150 }, { "epoch": 1.936062227616263, "grad_norm": 2.745421801111188, "learning_rate": 6.733879975506897e-06, "loss": 2.2655, "step": 13160 }, { "epoch": 1.9375333296555781, "grad_norm": 2.615787101056556, "learning_rate": 6.717704163466376e-06, "loss": 2.2626, "step": 13170 }, { "epoch": 1.9390044316948933, "grad_norm": 2.454817585599468, "learning_rate": 6.7015379710452814e-06, "loss": 2.2837, "step": 13180 }, { "epoch": 1.9404755337342086, "grad_norm": 2.660428077582701, "learning_rate": 6.685381445622834e-06, "loss": 2.3653, "step": 13190 }, { "epoch": 1.9419466357735238, "grad_norm": 2.717052850771079, "learning_rate": 6.669234634549928e-06, "loss": 2.318, "step": 13200 }, { "epoch": 1.943417737812839, "grad_norm": 2.568328929106495, "learning_rate": 6.653097585148972e-06, "loss": 2.3058, "step": 13210 }, { "epoch": 1.9448888398521542, "grad_norm": 2.7301621722249845, "learning_rate": 6.636970344713782e-06, "loss": 2.3407, "step": 13220 }, { "epoch": 1.9463599418914694, "grad_norm": 2.8283813016010884, "learning_rate": 6.620852960509423e-06, "loss": 2.356, "step": 13230 }, { "epoch": 1.9478310439307847, "grad_norm": 3.1249113669107067, "learning_rate": 6.604745479772061e-06, "loss": 2.2477, "step": 13240 }, { "epoch": 1.9493021459700999, "grad_norm": 2.473722612384119, "learning_rate": 6.588647949708853e-06, "loss": 2.2684, "step": 13250 }, { "epoch": 1.950773248009415, "grad_norm": 2.740152401142695, "learning_rate": 6.572560417497786e-06, "loss": 2.3308, "step": 13260 }, { "epoch": 1.9522443500487303, "grad_norm": 2.414689272912095, "learning_rate": 6.556482930287545e-06, "loss": 2.3877, "step": 13270 }, { "epoch": 1.9537154520880455, "grad_norm": 2.4900575933765863, "learning_rate": 6.540415535197382e-06, "loss": 2.3183, "step": 13280 }, { "epoch": 1.9551865541273608, "grad_norm": 2.8812319461990867, "learning_rate": 6.52435827931696e-06, "loss": 2.3252, "step": 13290 }, { "epoch": 1.956657656166676, "grad_norm": 2.740621555428388, "learning_rate": 6.508311209706235e-06, "loss": 2.2855, "step": 13300 }, { "epoch": 1.9581287582059912, "grad_norm": 2.74464183474487, "learning_rate": 6.492274373395308e-06, "loss": 2.3222, "step": 13310 }, { "epoch": 1.9595998602453064, "grad_norm": 2.6540055543020182, "learning_rate": 6.476247817384292e-06, "loss": 2.3111, "step": 13320 }, { "epoch": 1.9610709622846216, "grad_norm": 2.483245319705813, "learning_rate": 6.460231588643159e-06, "loss": 2.3005, "step": 13330 }, { "epoch": 1.9625420643239366, "grad_norm": 2.471406468628288, "learning_rate": 6.444225734111626e-06, "loss": 2.2868, "step": 13340 }, { "epoch": 1.9640131663632518, "grad_norm": 2.5262992754263562, "learning_rate": 6.428230300699005e-06, "loss": 2.3179, "step": 13350 }, { "epoch": 1.965484268402567, "grad_norm": 2.803115698394696, "learning_rate": 6.412245335284057e-06, "loss": 2.2773, "step": 13360 }, { "epoch": 1.9669553704418823, "grad_norm": 2.9327160270181127, "learning_rate": 6.396270884714871e-06, "loss": 2.2943, "step": 13370 }, { "epoch": 1.9684264724811975, "grad_norm": 2.4393078394396484, "learning_rate": 6.380306995808725e-06, "loss": 2.2724, "step": 13380 }, { "epoch": 1.9698975745205127, "grad_norm": 2.5341212532481654, "learning_rate": 6.364353715351927e-06, "loss": 2.2653, "step": 13390 }, { "epoch": 1.9713686765598277, "grad_norm": 3.054010349290827, "learning_rate": 6.3484110900997155e-06, "loss": 2.293, "step": 13400 }, { "epoch": 1.972839778599143, "grad_norm": 2.672973889687425, "learning_rate": 6.332479166776078e-06, "loss": 2.2921, "step": 13410 }, { "epoch": 1.9743108806384582, "grad_norm": 2.6814432125871392, "learning_rate": 6.316557992073653e-06, "loss": 2.3221, "step": 13420 }, { "epoch": 1.9757819826777734, "grad_norm": 2.505588897700807, "learning_rate": 6.300647612653579e-06, "loss": 2.2407, "step": 13430 }, { "epoch": 1.9772530847170886, "grad_norm": 2.9788284813771013, "learning_rate": 6.284748075145342e-06, "loss": 2.2792, "step": 13440 }, { "epoch": 1.9787241867564038, "grad_norm": 2.924806639559489, "learning_rate": 6.268859426146663e-06, "loss": 2.2448, "step": 13450 }, { "epoch": 1.980195288795719, "grad_norm": 2.8632214872698394, "learning_rate": 6.252981712223354e-06, "loss": 2.298, "step": 13460 }, { "epoch": 1.9816663908350343, "grad_norm": 2.8181315644323788, "learning_rate": 6.237114979909168e-06, "loss": 2.2574, "step": 13470 }, { "epoch": 1.9831374928743495, "grad_norm": 2.7289705644095585, "learning_rate": 6.221259275705684e-06, "loss": 2.2768, "step": 13480 }, { "epoch": 1.9846085949136647, "grad_norm": 2.907854546689904, "learning_rate": 6.205414646082162e-06, "loss": 2.2488, "step": 13490 }, { "epoch": 1.98607969695298, "grad_norm": 2.6181151781966956, "learning_rate": 6.189581137475389e-06, "loss": 2.2983, "step": 13500 }, { "epoch": 1.9875507989922951, "grad_norm": 2.846805862098562, "learning_rate": 6.1737587962895736e-06, "loss": 2.3201, "step": 13510 }, { "epoch": 1.9890219010316104, "grad_norm": 2.627238765999436, "learning_rate": 6.157947668896196e-06, "loss": 2.3394, "step": 13520 }, { "epoch": 1.9904930030709256, "grad_norm": 2.7694935126989093, "learning_rate": 6.1421478016338586e-06, "loss": 2.2845, "step": 13530 }, { "epoch": 1.9919641051102408, "grad_norm": 2.6464196669869273, "learning_rate": 6.126359240808175e-06, "loss": 2.3395, "step": 13540 }, { "epoch": 1.993435207149556, "grad_norm": 2.996433081384349, "learning_rate": 6.110582032691622e-06, "loss": 2.3122, "step": 13550 }, { "epoch": 1.9949063091888712, "grad_norm": 2.9082332045972974, "learning_rate": 6.094816223523393e-06, "loss": 2.317, "step": 13560 }, { "epoch": 1.9963774112281865, "grad_norm": 2.716245741980571, "learning_rate": 6.079061859509287e-06, "loss": 2.2539, "step": 13570 }, { "epoch": 1.9978485132675017, "grad_norm": 2.679474322162246, "learning_rate": 6.0633189868215534e-06, "loss": 2.2403, "step": 13580 }, { "epoch": 1.9993196153068167, "grad_norm": 3.0298230744477728, "learning_rate": 6.047587651598766e-06, "loss": 2.2941, "step": 13590 }, { "epoch": 2.000882661223589, "grad_norm": 2.442363173742773, "learning_rate": 6.03186789994568e-06, "loss": 2.3957, "step": 13600 }, { "epoch": 2.0023537632629043, "grad_norm": 2.58181417237534, "learning_rate": 6.016159777933111e-06, "loss": 1.9488, "step": 13610 }, { "epoch": 2.0038248653022195, "grad_norm": 2.386800352488038, "learning_rate": 6.0004633315977804e-06, "loss": 2.0183, "step": 13620 }, { "epoch": 2.0052959673415347, "grad_norm": 2.691844915135655, "learning_rate": 5.984778606942197e-06, "loss": 1.958, "step": 13630 }, { "epoch": 2.00676706938085, "grad_norm": 2.809754677221641, "learning_rate": 5.969105649934521e-06, "loss": 1.9551, "step": 13640 }, { "epoch": 2.008238171420165, "grad_norm": 2.9618920197068856, "learning_rate": 5.95344450650841e-06, "loss": 1.9715, "step": 13650 }, { "epoch": 2.0097092734594804, "grad_norm": 2.6028200177199214, "learning_rate": 5.937795222562911e-06, "loss": 1.9838, "step": 13660 }, { "epoch": 2.0111803754987956, "grad_norm": 3.0162059992078687, "learning_rate": 5.922157843962313e-06, "loss": 1.9436, "step": 13670 }, { "epoch": 2.012651477538111, "grad_norm": 2.755262489318435, "learning_rate": 5.906532416536006e-06, "loss": 1.9561, "step": 13680 }, { "epoch": 2.014122579577426, "grad_norm": 2.5608775609125574, "learning_rate": 5.8909189860783665e-06, "loss": 1.9718, "step": 13690 }, { "epoch": 2.0155936816167412, "grad_norm": 2.812035547091045, "learning_rate": 5.875317598348593e-06, "loss": 1.9941, "step": 13700 }, { "epoch": 2.0170647836560565, "grad_norm": 2.8997980967032433, "learning_rate": 5.859728299070603e-06, "loss": 1.9977, "step": 13710 }, { "epoch": 2.0185358856953717, "grad_norm": 2.8928677965716854, "learning_rate": 5.844151133932887e-06, "loss": 1.95, "step": 13720 }, { "epoch": 2.020006987734687, "grad_norm": 3.0386752691656476, "learning_rate": 5.828586148588359e-06, "loss": 1.9365, "step": 13730 }, { "epoch": 2.021478089774002, "grad_norm": 2.9837077165184094, "learning_rate": 5.813033388654255e-06, "loss": 1.9595, "step": 13740 }, { "epoch": 2.0229491918133173, "grad_norm": 2.85147243702891, "learning_rate": 5.79749289971197e-06, "loss": 1.9658, "step": 13750 }, { "epoch": 2.0244202938526326, "grad_norm": 2.9376766344708325, "learning_rate": 5.781964727306937e-06, "loss": 1.9314, "step": 13760 }, { "epoch": 2.0258913958919473, "grad_norm": 3.077039506269238, "learning_rate": 5.766448916948491e-06, "loss": 1.9877, "step": 13770 }, { "epoch": 2.0273624979312626, "grad_norm": 3.1987646382996906, "learning_rate": 5.750945514109749e-06, "loss": 1.9682, "step": 13780 }, { "epoch": 2.0288335999705778, "grad_norm": 2.975483034396534, "learning_rate": 5.7354545642274405e-06, "loss": 1.957, "step": 13790 }, { "epoch": 2.030304702009893, "grad_norm": 2.718603951792644, "learning_rate": 5.719976112701824e-06, "loss": 1.9492, "step": 13800 }, { "epoch": 2.031775804049208, "grad_norm": 2.735809325646884, "learning_rate": 5.704510204896514e-06, "loss": 1.8996, "step": 13810 }, { "epoch": 2.0332469060885234, "grad_norm": 2.8650627279852476, "learning_rate": 5.689056886138362e-06, "loss": 1.9741, "step": 13820 }, { "epoch": 2.0347180081278387, "grad_norm": 2.8394174170510804, "learning_rate": 5.6736162017173336e-06, "loss": 1.8806, "step": 13830 }, { "epoch": 2.036189110167154, "grad_norm": 2.963350260992524, "learning_rate": 5.658188196886356e-06, "loss": 1.9906, "step": 13840 }, { "epoch": 2.037660212206469, "grad_norm": 3.048107683933938, "learning_rate": 5.642772916861201e-06, "loss": 1.9396, "step": 13850 }, { "epoch": 2.0391313142457843, "grad_norm": 2.8220050876069833, "learning_rate": 5.6273704068203405e-06, "loss": 1.955, "step": 13860 }, { "epoch": 2.0406024162850995, "grad_norm": 2.835322830914968, "learning_rate": 5.611980711904839e-06, "loss": 1.9544, "step": 13870 }, { "epoch": 2.0420735183244147, "grad_norm": 3.024302326376593, "learning_rate": 5.596603877218174e-06, "loss": 1.9649, "step": 13880 }, { "epoch": 2.04354462036373, "grad_norm": 2.8060862105300752, "learning_rate": 5.581239947826158e-06, "loss": 1.93, "step": 13890 }, { "epoch": 2.045015722403045, "grad_norm": 2.5818275596604083, "learning_rate": 5.565888968756771e-06, "loss": 1.9501, "step": 13900 }, { "epoch": 2.0464868244423604, "grad_norm": 3.04950323238599, "learning_rate": 5.5505509850000375e-06, "loss": 1.9417, "step": 13910 }, { "epoch": 2.0479579264816756, "grad_norm": 2.6386523100028345, "learning_rate": 5.535226041507893e-06, "loss": 1.9201, "step": 13920 }, { "epoch": 2.049429028520991, "grad_norm": 3.2157594036964063, "learning_rate": 5.51991418319407e-06, "loss": 1.9486, "step": 13930 }, { "epoch": 2.050900130560306, "grad_norm": 2.8460685436951745, "learning_rate": 5.504615454933937e-06, "loss": 1.98, "step": 13940 }, { "epoch": 2.0523712325996213, "grad_norm": 2.6101053113347383, "learning_rate": 5.489329901564381e-06, "loss": 1.9424, "step": 13950 }, { "epoch": 2.0538423346389365, "grad_norm": 2.5386226269968915, "learning_rate": 5.474057567883694e-06, "loss": 1.9146, "step": 13960 }, { "epoch": 2.0553134366782517, "grad_norm": 3.043056789674242, "learning_rate": 5.458798498651395e-06, "loss": 1.9618, "step": 13970 }, { "epoch": 2.056784538717567, "grad_norm": 3.0056572563877153, "learning_rate": 5.443552738588163e-06, "loss": 1.9862, "step": 13980 }, { "epoch": 2.058255640756882, "grad_norm": 2.694425904705627, "learning_rate": 5.4283203323756335e-06, "loss": 1.9599, "step": 13990 }, { "epoch": 2.0597267427961974, "grad_norm": 3.0965339193276558, "learning_rate": 5.413101324656338e-06, "loss": 1.9338, "step": 14000 }, { "epoch": 2.0597267427961974, "eval_loss": 2.3935444355010986, "eval_runtime": 514.0159, "eval_samples_per_second": 178.183, "eval_steps_per_second": 5.57, "step": 14000 }, { "epoch": 2.0611978448355126, "grad_norm": 2.821769243931729, "learning_rate": 5.397895760033525e-06, "loss": 1.9433, "step": 14010 }, { "epoch": 2.062668946874828, "grad_norm": 3.4667657759430672, "learning_rate": 5.382703683071037e-06, "loss": 1.927, "step": 14020 }, { "epoch": 2.0641400489141426, "grad_norm": 3.001944346771378, "learning_rate": 5.36752513829321e-06, "loss": 1.9364, "step": 14030 }, { "epoch": 2.065611150953458, "grad_norm": 3.53460936721, "learning_rate": 5.3523601701847005e-06, "loss": 1.915, "step": 14040 }, { "epoch": 2.067082252992773, "grad_norm": 2.9902414011151364, "learning_rate": 5.3372088231903815e-06, "loss": 1.977, "step": 14050 }, { "epoch": 2.0685533550320883, "grad_norm": 2.9685601484924913, "learning_rate": 5.322071141715203e-06, "loss": 1.9315, "step": 14060 }, { "epoch": 2.0700244570714035, "grad_norm": 2.745437000418037, "learning_rate": 5.306947170124077e-06, "loss": 1.9579, "step": 14070 }, { "epoch": 2.0714955591107187, "grad_norm": 3.080935162630564, "learning_rate": 5.291836952741712e-06, "loss": 1.9308, "step": 14080 }, { "epoch": 2.072966661150034, "grad_norm": 2.8371233020352116, "learning_rate": 5.276740533852531e-06, "loss": 1.9806, "step": 14090 }, { "epoch": 2.074437763189349, "grad_norm": 3.119969349903189, "learning_rate": 5.261657957700504e-06, "loss": 1.9792, "step": 14100 }, { "epoch": 2.0759088652286644, "grad_norm": 3.026866324794435, "learning_rate": 5.246589268489029e-06, "loss": 1.9673, "step": 14110 }, { "epoch": 2.0773799672679796, "grad_norm": 3.417645602446801, "learning_rate": 5.2315345103808055e-06, "loss": 1.955, "step": 14120 }, { "epoch": 2.078851069307295, "grad_norm": 3.0595458513005545, "learning_rate": 5.216493727497718e-06, "loss": 1.9346, "step": 14130 }, { "epoch": 2.08032217134661, "grad_norm": 2.809635683953166, "learning_rate": 5.201466963920678e-06, "loss": 1.9279, "step": 14140 }, { "epoch": 2.0817932733859252, "grad_norm": 2.789811024893517, "learning_rate": 5.1864542636895086e-06, "loss": 1.9595, "step": 14150 }, { "epoch": 2.0832643754252405, "grad_norm": 2.8188656327093398, "learning_rate": 5.171455670802837e-06, "loss": 1.8944, "step": 14160 }, { "epoch": 2.0847354774645557, "grad_norm": 2.8908302925846603, "learning_rate": 5.156471229217912e-06, "loss": 1.9313, "step": 14170 }, { "epoch": 2.086206579503871, "grad_norm": 3.0674865364393566, "learning_rate": 5.14150098285054e-06, "loss": 1.939, "step": 14180 }, { "epoch": 2.087677681543186, "grad_norm": 3.088477583326289, "learning_rate": 5.126544975574908e-06, "loss": 1.9594, "step": 14190 }, { "epoch": 2.0891487835825013, "grad_norm": 3.0260231291268544, "learning_rate": 5.111603251223474e-06, "loss": 1.9424, "step": 14200 }, { "epoch": 2.0906198856218166, "grad_norm": 3.3181904343298516, "learning_rate": 5.096675853586834e-06, "loss": 1.9729, "step": 14210 }, { "epoch": 2.0920909876611318, "grad_norm": 3.1430621592880916, "learning_rate": 5.081762826413608e-06, "loss": 1.9375, "step": 14220 }, { "epoch": 2.093562089700447, "grad_norm": 2.894332950554654, "learning_rate": 5.066864213410283e-06, "loss": 1.9552, "step": 14230 }, { "epoch": 2.095033191739762, "grad_norm": 3.0585309437415535, "learning_rate": 5.051980058241115e-06, "loss": 1.96, "step": 14240 }, { "epoch": 2.0965042937790774, "grad_norm": 2.8321641606177854, "learning_rate": 5.037110404527977e-06, "loss": 1.9522, "step": 14250 }, { "epoch": 2.0979753958183927, "grad_norm": 2.7220261387075326, "learning_rate": 5.022255295850247e-06, "loss": 1.9514, "step": 14260 }, { "epoch": 2.0994464978577074, "grad_norm": 3.197629929256624, "learning_rate": 5.007414775744678e-06, "loss": 1.9814, "step": 14270 }, { "epoch": 2.1009175998970226, "grad_norm": 3.0861914493855127, "learning_rate": 4.992588887705263e-06, "loss": 1.9348, "step": 14280 }, { "epoch": 2.102388701936338, "grad_norm": 2.741703019490809, "learning_rate": 4.977777675183113e-06, "loss": 1.9264, "step": 14290 }, { "epoch": 2.103859803975653, "grad_norm": 2.9599066968861685, "learning_rate": 4.962981181586327e-06, "loss": 1.9399, "step": 14300 }, { "epoch": 2.1053309060149683, "grad_norm": 3.518620979033253, "learning_rate": 4.948199450279869e-06, "loss": 1.9316, "step": 14310 }, { "epoch": 2.1068020080542835, "grad_norm": 3.014628693863006, "learning_rate": 4.9334325245854335e-06, "loss": 1.9147, "step": 14320 }, { "epoch": 2.1082731100935987, "grad_norm": 2.8331997774660684, "learning_rate": 4.918680447781334e-06, "loss": 1.9287, "step": 14330 }, { "epoch": 2.109744212132914, "grad_norm": 2.79123354730972, "learning_rate": 4.903943263102354e-06, "loss": 1.9571, "step": 14340 }, { "epoch": 2.111215314172229, "grad_norm": 2.922552849392064, "learning_rate": 4.8892210137396355e-06, "loss": 1.9073, "step": 14350 }, { "epoch": 2.1126864162115444, "grad_norm": 3.0806639773428124, "learning_rate": 4.874513742840558e-06, "loss": 1.9632, "step": 14360 }, { "epoch": 2.1141575182508596, "grad_norm": 2.8431158930019853, "learning_rate": 4.8598214935085785e-06, "loss": 1.9156, "step": 14370 }, { "epoch": 2.115628620290175, "grad_norm": 3.3546057936096494, "learning_rate": 4.845144308803159e-06, "loss": 1.9509, "step": 14380 }, { "epoch": 2.11709972232949, "grad_norm": 3.074935327392745, "learning_rate": 4.830482231739591e-06, "loss": 1.959, "step": 14390 }, { "epoch": 2.1185708243688053, "grad_norm": 2.694300408965986, "learning_rate": 4.815835305288896e-06, "loss": 1.9093, "step": 14400 }, { "epoch": 2.1200419264081205, "grad_norm": 2.9244381850453047, "learning_rate": 4.8012035723776875e-06, "loss": 1.9372, "step": 14410 }, { "epoch": 2.1215130284474357, "grad_norm": 3.4603402109792953, "learning_rate": 4.786587075888061e-06, "loss": 1.9628, "step": 14420 }, { "epoch": 2.122984130486751, "grad_norm": 3.501148386311287, "learning_rate": 4.7719858586574496e-06, "loss": 1.8882, "step": 14430 }, { "epoch": 2.124455232526066, "grad_norm": 3.137049958988607, "learning_rate": 4.757399963478506e-06, "loss": 1.9714, "step": 14440 }, { "epoch": 2.1259263345653814, "grad_norm": 2.88659007148771, "learning_rate": 4.742829433098981e-06, "loss": 1.8905, "step": 14450 }, { "epoch": 2.1273974366046966, "grad_norm": 3.2113846539911934, "learning_rate": 4.72827431022159e-06, "loss": 1.9599, "step": 14460 }, { "epoch": 2.128868538644012, "grad_norm": 3.380694689982377, "learning_rate": 4.713734637503904e-06, "loss": 1.9527, "step": 14470 }, { "epoch": 2.130339640683327, "grad_norm": 2.892647755912065, "learning_rate": 4.699210457558201e-06, "loss": 1.9726, "step": 14480 }, { "epoch": 2.1318107427226423, "grad_norm": 2.9657709517831052, "learning_rate": 4.684701812951359e-06, "loss": 1.9393, "step": 14490 }, { "epoch": 2.1332818447619575, "grad_norm": 3.0394352560422138, "learning_rate": 4.670208746204719e-06, "loss": 1.96, "step": 14500 }, { "epoch": 2.1347529468012727, "grad_norm": 3.5951230204741758, "learning_rate": 4.655731299793989e-06, "loss": 1.9497, "step": 14510 }, { "epoch": 2.136224048840588, "grad_norm": 2.855531892986216, "learning_rate": 4.641269516149061e-06, "loss": 1.979, "step": 14520 }, { "epoch": 2.137695150879903, "grad_norm": 3.134185148538862, "learning_rate": 4.626823437653957e-06, "loss": 1.9009, "step": 14530 }, { "epoch": 2.139166252919218, "grad_norm": 3.2812368806708863, "learning_rate": 4.612393106646656e-06, "loss": 1.9126, "step": 14540 }, { "epoch": 2.140637354958533, "grad_norm": 3.1996628600229293, "learning_rate": 4.597978565418979e-06, "loss": 1.9599, "step": 14550 }, { "epoch": 2.1421084569978484, "grad_norm": 3.015467520012759, "learning_rate": 4.583579856216487e-06, "loss": 1.9288, "step": 14560 }, { "epoch": 2.1435795590371636, "grad_norm": 3.3804883547653395, "learning_rate": 4.56919702123833e-06, "loss": 1.9479, "step": 14570 }, { "epoch": 2.145050661076479, "grad_norm": 3.218356734470636, "learning_rate": 4.554830102637137e-06, "loss": 1.9096, "step": 14580 }, { "epoch": 2.146521763115794, "grad_norm": 2.9614467987113153, "learning_rate": 4.540479142518888e-06, "loss": 1.947, "step": 14590 }, { "epoch": 2.1479928651551092, "grad_norm": 2.985626880157704, "learning_rate": 4.526144182942797e-06, "loss": 1.9941, "step": 14600 }, { "epoch": 2.1494639671944245, "grad_norm": 2.9761943184666695, "learning_rate": 4.511825265921176e-06, "loss": 1.9111, "step": 14610 }, { "epoch": 2.1509350692337397, "grad_norm": 2.8980294609836164, "learning_rate": 4.497522433419335e-06, "loss": 1.9434, "step": 14620 }, { "epoch": 2.152406171273055, "grad_norm": 3.015203600207727, "learning_rate": 4.48323572735543e-06, "loss": 1.8911, "step": 14630 }, { "epoch": 2.15387727331237, "grad_norm": 2.9537768518940055, "learning_rate": 4.468965189600355e-06, "loss": 1.9211, "step": 14640 }, { "epoch": 2.1553483753516853, "grad_norm": 3.4651186141248327, "learning_rate": 4.4547108619776355e-06, "loss": 1.926, "step": 14650 }, { "epoch": 2.1568194773910006, "grad_norm": 2.6468701510759964, "learning_rate": 4.440472786263262e-06, "loss": 1.881, "step": 14660 }, { "epoch": 2.1582905794303158, "grad_norm": 3.3021748781230738, "learning_rate": 4.426251004185619e-06, "loss": 1.9259, "step": 14670 }, { "epoch": 2.159761681469631, "grad_norm": 3.088994850587568, "learning_rate": 4.412045557425324e-06, "loss": 1.9481, "step": 14680 }, { "epoch": 2.161232783508946, "grad_norm": 3.3924157969537583, "learning_rate": 4.397856487615126e-06, "loss": 1.9147, "step": 14690 }, { "epoch": 2.1627038855482614, "grad_norm": 3.1999809116178253, "learning_rate": 4.383683836339768e-06, "loss": 1.9526, "step": 14700 }, { "epoch": 2.1641749875875766, "grad_norm": 2.8655205629233493, "learning_rate": 4.369527645135896e-06, "loss": 1.9808, "step": 14710 }, { "epoch": 2.165646089626892, "grad_norm": 2.9641983059870753, "learning_rate": 4.3553879554918825e-06, "loss": 1.9962, "step": 14720 }, { "epoch": 2.167117191666207, "grad_norm": 3.3284426664302176, "learning_rate": 4.3412648088477695e-06, "loss": 1.9145, "step": 14730 }, { "epoch": 2.1685882937055223, "grad_norm": 3.155642810679341, "learning_rate": 4.327158246595097e-06, "loss": 1.9413, "step": 14740 }, { "epoch": 2.1700593957448375, "grad_norm": 3.228900153138567, "learning_rate": 4.313068310076801e-06, "loss": 1.9648, "step": 14750 }, { "epoch": 2.1715304977841527, "grad_norm": 3.3049468870445287, "learning_rate": 4.298995040587105e-06, "loss": 1.9714, "step": 14760 }, { "epoch": 2.1730015998234675, "grad_norm": 3.1718967339213755, "learning_rate": 4.284938479371371e-06, "loss": 1.9454, "step": 14770 }, { "epoch": 2.1744727018627827, "grad_norm": 3.250515191269502, "learning_rate": 4.2708986676259976e-06, "loss": 1.9392, "step": 14780 }, { "epoch": 2.175943803902098, "grad_norm": 3.142034435850978, "learning_rate": 4.256875646498293e-06, "loss": 1.9464, "step": 14790 }, { "epoch": 2.177414905941413, "grad_norm": 3.3922385552973573, "learning_rate": 4.2428694570863695e-06, "loss": 1.9218, "step": 14800 }, { "epoch": 2.1788860079807284, "grad_norm": 3.0505722677298674, "learning_rate": 4.228880140438981e-06, "loss": 1.9318, "step": 14810 }, { "epoch": 2.1803571100200436, "grad_norm": 3.250914974509269, "learning_rate": 4.2149077375554635e-06, "loss": 1.8759, "step": 14820 }, { "epoch": 2.181828212059359, "grad_norm": 3.4429311612165527, "learning_rate": 4.200952289385564e-06, "loss": 1.9197, "step": 14830 }, { "epoch": 2.183299314098674, "grad_norm": 3.036088316201077, "learning_rate": 4.187013836829339e-06, "loss": 1.9245, "step": 14840 }, { "epoch": 2.1847704161379893, "grad_norm": 3.1707436194376832, "learning_rate": 4.173092420737046e-06, "loss": 1.9355, "step": 14850 }, { "epoch": 2.1862415181773045, "grad_norm": 3.1074540207303287, "learning_rate": 4.159188081909005e-06, "loss": 1.9049, "step": 14860 }, { "epoch": 2.1877126202166197, "grad_norm": 2.9113805696029225, "learning_rate": 4.145300861095486e-06, "loss": 1.953, "step": 14870 }, { "epoch": 2.189183722255935, "grad_norm": 3.14312700859708, "learning_rate": 4.131430798996593e-06, "loss": 1.9719, "step": 14880 }, { "epoch": 2.19065482429525, "grad_norm": 2.9103161250363385, "learning_rate": 4.11757793626214e-06, "loss": 1.9016, "step": 14890 }, { "epoch": 2.1921259263345654, "grad_norm": 3.4200341399688163, "learning_rate": 4.103742313491535e-06, "loss": 1.9733, "step": 14900 }, { "epoch": 2.1935970283738806, "grad_norm": 3.516005205154003, "learning_rate": 4.0899239712336645e-06, "loss": 1.9822, "step": 14910 }, { "epoch": 2.195068130413196, "grad_norm": 3.054740712570947, "learning_rate": 4.076122949986756e-06, "loss": 1.9246, "step": 14920 }, { "epoch": 2.196539232452511, "grad_norm": 2.905628776493795, "learning_rate": 4.0623392901982905e-06, "loss": 1.97, "step": 14930 }, { "epoch": 2.1980103344918263, "grad_norm": 3.370156050604696, "learning_rate": 4.048573032264856e-06, "loss": 1.9438, "step": 14940 }, { "epoch": 2.1994814365311415, "grad_norm": 2.7146323988250063, "learning_rate": 4.03482421653204e-06, "loss": 1.8925, "step": 14950 }, { "epoch": 2.2009525385704567, "grad_norm": 3.0679311830945757, "learning_rate": 4.021092883294318e-06, "loss": 1.9452, "step": 14960 }, { "epoch": 2.202423640609772, "grad_norm": 3.1739226494969235, "learning_rate": 4.007379072794922e-06, "loss": 1.9419, "step": 14970 }, { "epoch": 2.203894742649087, "grad_norm": 3.3834678130034574, "learning_rate": 3.993682825225733e-06, "loss": 1.9067, "step": 14980 }, { "epoch": 2.2053658446884024, "grad_norm": 3.097858475331377, "learning_rate": 3.980004180727152e-06, "loss": 1.8909, "step": 14990 }, { "epoch": 2.2068369467277176, "grad_norm": 3.3375933384866148, "learning_rate": 3.966343179388006e-06, "loss": 1.9587, "step": 15000 }, { "epoch": 2.2068369467277176, "eval_loss": 2.3911972045898438, "eval_runtime": 339.9109, "eval_samples_per_second": 269.45, "eval_steps_per_second": 8.423, "step": 15000 }, { "epoch": 2.208308048767033, "grad_norm": 2.6234784334154666, "learning_rate": 3.952699861245389e-06, "loss": 1.8913, "step": 15010 }, { "epoch": 2.209779150806348, "grad_norm": 2.7412611441735097, "learning_rate": 3.939074266284596e-06, "loss": 1.9235, "step": 15020 }, { "epoch": 2.2112502528456632, "grad_norm": 3.3785269262259714, "learning_rate": 3.925466434438964e-06, "loss": 1.9225, "step": 15030 }, { "epoch": 2.212721354884978, "grad_norm": 3.0739486110231886, "learning_rate": 3.911876405589768e-06, "loss": 1.9102, "step": 15040 }, { "epoch": 2.2141924569242932, "grad_norm": 2.962956397664508, "learning_rate": 3.8983042195661235e-06, "loss": 1.9329, "step": 15050 }, { "epoch": 2.2156635589636084, "grad_norm": 3.347816653543782, "learning_rate": 3.884749916144838e-06, "loss": 1.9147, "step": 15060 }, { "epoch": 2.2171346610029237, "grad_norm": 3.233236189311224, "learning_rate": 3.871213535050312e-06, "loss": 1.918, "step": 15070 }, { "epoch": 2.218605763042239, "grad_norm": 3.125316535035012, "learning_rate": 3.85769511595442e-06, "loss": 1.934, "step": 15080 }, { "epoch": 2.220076865081554, "grad_norm": 3.3646695117784264, "learning_rate": 3.844194698476408e-06, "loss": 1.9427, "step": 15090 }, { "epoch": 2.2215479671208693, "grad_norm": 2.9905897904344694, "learning_rate": 3.830712322182735e-06, "loss": 1.9477, "step": 15100 }, { "epoch": 2.2230190691601845, "grad_norm": 2.938382696361462, "learning_rate": 3.817248026587014e-06, "loss": 1.9325, "step": 15110 }, { "epoch": 2.2244901711994998, "grad_norm": 3.1005766463138826, "learning_rate": 3.803801851149852e-06, "loss": 1.9295, "step": 15120 }, { "epoch": 2.225961273238815, "grad_norm": 3.037986727258032, "learning_rate": 3.7903738352787545e-06, "loss": 1.8851, "step": 15130 }, { "epoch": 2.22743237527813, "grad_norm": 2.9515435330567925, "learning_rate": 3.776964018328002e-06, "loss": 1.9357, "step": 15140 }, { "epoch": 2.2289034773174454, "grad_norm": 3.229085920795769, "learning_rate": 3.7635724395985494e-06, "loss": 1.9589, "step": 15150 }, { "epoch": 2.2303745793567606, "grad_norm": 3.86416419012076, "learning_rate": 3.7501991383378875e-06, "loss": 1.9228, "step": 15160 }, { "epoch": 2.231845681396076, "grad_norm": 3.290199822841689, "learning_rate": 3.7368441537399425e-06, "loss": 1.9207, "step": 15170 }, { "epoch": 2.233316783435391, "grad_norm": 3.2009159391243043, "learning_rate": 3.7235075249449713e-06, "loss": 1.9447, "step": 15180 }, { "epoch": 2.2347878854747063, "grad_norm": 3.2411945457413665, "learning_rate": 3.7101892910394098e-06, "loss": 1.934, "step": 15190 }, { "epoch": 2.2362589875140215, "grad_norm": 3.37605096650466, "learning_rate": 3.6968894910558117e-06, "loss": 1.9363, "step": 15200 }, { "epoch": 2.2377300895533367, "grad_norm": 3.7407033323100056, "learning_rate": 3.6836081639726783e-06, "loss": 1.9456, "step": 15210 }, { "epoch": 2.239201191592652, "grad_norm": 3.7054204754113362, "learning_rate": 3.6703453487143927e-06, "loss": 1.9597, "step": 15220 }, { "epoch": 2.240672293631967, "grad_norm": 2.7940333200052083, "learning_rate": 3.6571010841510724e-06, "loss": 1.9282, "step": 15230 }, { "epoch": 2.2421433956712824, "grad_norm": 2.8057419176993488, "learning_rate": 3.643875409098467e-06, "loss": 1.9343, "step": 15240 }, { "epoch": 2.2436144977105976, "grad_norm": 3.036881540572663, "learning_rate": 3.630668362317857e-06, "loss": 1.9494, "step": 15250 }, { "epoch": 2.245085599749913, "grad_norm": 2.84624102143619, "learning_rate": 3.6174799825159135e-06, "loss": 1.917, "step": 15260 }, { "epoch": 2.246556701789228, "grad_norm": 3.002673750354174, "learning_rate": 3.6043103083446053e-06, "loss": 1.9059, "step": 15270 }, { "epoch": 2.248027803828543, "grad_norm": 3.165768239773281, "learning_rate": 3.5911593784010754e-06, "loss": 1.9229, "step": 15280 }, { "epoch": 2.249498905867858, "grad_norm": 3.414534735427826, "learning_rate": 3.5780272312275466e-06, "loss": 1.9224, "step": 15290 }, { "epoch": 2.2509700079071733, "grad_norm": 3.0188752569034336, "learning_rate": 3.564913905311168e-06, "loss": 1.9428, "step": 15300 }, { "epoch": 2.2524411099464885, "grad_norm": 2.913079306924071, "learning_rate": 3.5518194390839544e-06, "loss": 1.8912, "step": 15310 }, { "epoch": 2.2539122119858037, "grad_norm": 2.950015795405563, "learning_rate": 3.5387438709226317e-06, "loss": 1.9734, "step": 15320 }, { "epoch": 2.255383314025119, "grad_norm": 3.0318289906841045, "learning_rate": 3.5256872391485454e-06, "loss": 1.9434, "step": 15330 }, { "epoch": 2.256854416064434, "grad_norm": 3.0844742781032433, "learning_rate": 3.5126495820275376e-06, "loss": 1.9225, "step": 15340 }, { "epoch": 2.2583255181037494, "grad_norm": 3.0494832726722993, "learning_rate": 3.4996309377698524e-06, "loss": 1.9384, "step": 15350 }, { "epoch": 2.2597966201430646, "grad_norm": 3.2713654057922614, "learning_rate": 3.486631344529999e-06, "loss": 1.9454, "step": 15360 }, { "epoch": 2.26126772218238, "grad_norm": 3.3279278844108116, "learning_rate": 3.473650840406656e-06, "loss": 1.948, "step": 15370 }, { "epoch": 2.262738824221695, "grad_norm": 3.2787091100208774, "learning_rate": 3.460689463442568e-06, "loss": 1.913, "step": 15380 }, { "epoch": 2.2642099262610103, "grad_norm": 2.83908644296446, "learning_rate": 3.447747251624398e-06, "loss": 1.9327, "step": 15390 }, { "epoch": 2.2656810283003255, "grad_norm": 3.2355074271344915, "learning_rate": 3.434824242882666e-06, "loss": 1.9138, "step": 15400 }, { "epoch": 2.2671521303396407, "grad_norm": 2.7931280607183355, "learning_rate": 3.421920475091599e-06, "loss": 1.9303, "step": 15410 }, { "epoch": 2.268623232378956, "grad_norm": 3.0598541850438457, "learning_rate": 3.4090359860690347e-06, "loss": 1.9488, "step": 15420 }, { "epoch": 2.270094334418271, "grad_norm": 2.9310342697457736, "learning_rate": 3.3961708135763073e-06, "loss": 1.9726, "step": 15430 }, { "epoch": 2.2715654364575864, "grad_norm": 3.150438629718247, "learning_rate": 3.383324995318149e-06, "loss": 1.9018, "step": 15440 }, { "epoch": 2.2730365384969016, "grad_norm": 3.383247183863827, "learning_rate": 3.3704985689425605e-06, "loss": 1.9504, "step": 15450 }, { "epoch": 2.274507640536217, "grad_norm": 3.0588723770375075, "learning_rate": 3.3576915720407067e-06, "loss": 1.9022, "step": 15460 }, { "epoch": 2.275978742575532, "grad_norm": 3.0382860544876378, "learning_rate": 3.3449040421468247e-06, "loss": 1.916, "step": 15470 }, { "epoch": 2.2774498446148472, "grad_norm": 2.7575971690362673, "learning_rate": 3.3321360167380757e-06, "loss": 1.9121, "step": 15480 }, { "epoch": 2.2789209466541624, "grad_norm": 3.1188870029201277, "learning_rate": 3.3193875332344828e-06, "loss": 1.9226, "step": 15490 }, { "epoch": 2.2803920486934777, "grad_norm": 3.1235940411781256, "learning_rate": 3.306658628998771e-06, "loss": 1.9132, "step": 15500 }, { "epoch": 2.281863150732793, "grad_norm": 3.3315296326684307, "learning_rate": 3.2939493413363065e-06, "loss": 1.9211, "step": 15510 }, { "epoch": 2.283334252772108, "grad_norm": 3.306015226586635, "learning_rate": 3.2812597074949505e-06, "loss": 1.9526, "step": 15520 }, { "epoch": 2.2848053548114233, "grad_norm": 3.280961163891983, "learning_rate": 3.2685897646649667e-06, "loss": 1.9416, "step": 15530 }, { "epoch": 2.2862764568507385, "grad_norm": 3.243436857580898, "learning_rate": 3.2559395499789048e-06, "loss": 1.9363, "step": 15540 }, { "epoch": 2.2877475588900533, "grad_norm": 3.174882254164854, "learning_rate": 3.243309100511509e-06, "loss": 1.9207, "step": 15550 }, { "epoch": 2.2892186609293685, "grad_norm": 3.198913250585862, "learning_rate": 3.230698453279585e-06, "loss": 1.9637, "step": 15560 }, { "epoch": 2.2906897629686838, "grad_norm": 3.161878778871648, "learning_rate": 3.2181076452419e-06, "loss": 1.9373, "step": 15570 }, { "epoch": 2.292160865007999, "grad_norm": 3.192793466092462, "learning_rate": 3.2055367132990955e-06, "loss": 1.9518, "step": 15580 }, { "epoch": 2.293631967047314, "grad_norm": 2.773809020337288, "learning_rate": 3.1929856942935343e-06, "loss": 1.9425, "step": 15590 }, { "epoch": 2.2951030690866294, "grad_norm": 3.618192755694921, "learning_rate": 3.180454625009245e-06, "loss": 1.928, "step": 15600 }, { "epoch": 2.2965741711259446, "grad_norm": 2.8220340456455544, "learning_rate": 3.1679435421717706e-06, "loss": 1.8963, "step": 15610 }, { "epoch": 2.29804527316526, "grad_norm": 3.173441492026203, "learning_rate": 3.155452482448088e-06, "loss": 1.9168, "step": 15620 }, { "epoch": 2.299516375204575, "grad_norm": 3.364871196380989, "learning_rate": 3.142981482446483e-06, "loss": 1.9412, "step": 15630 }, { "epoch": 2.3009874772438903, "grad_norm": 2.900589890836024, "learning_rate": 3.130530578716464e-06, "loss": 1.9125, "step": 15640 }, { "epoch": 2.3024585792832055, "grad_norm": 2.886071456878504, "learning_rate": 3.1180998077486303e-06, "loss": 1.9277, "step": 15650 }, { "epoch": 2.3039296813225207, "grad_norm": 3.2248421365780535, "learning_rate": 3.105689205974578e-06, "loss": 1.9407, "step": 15660 }, { "epoch": 2.305400783361836, "grad_norm": 3.0695401847654824, "learning_rate": 3.0932988097668036e-06, "loss": 1.9725, "step": 15670 }, { "epoch": 2.306871885401151, "grad_norm": 2.9800524302904936, "learning_rate": 3.0809286554385664e-06, "loss": 1.8841, "step": 15680 }, { "epoch": 2.3083429874404664, "grad_norm": 3.6077173785711385, "learning_rate": 3.0685787792438213e-06, "loss": 1.9329, "step": 15690 }, { "epoch": 2.3098140894797816, "grad_norm": 3.3516603977369646, "learning_rate": 3.056249217377082e-06, "loss": 1.9703, "step": 15700 }, { "epoch": 2.311285191519097, "grad_norm": 3.1580112634593447, "learning_rate": 3.043940005973326e-06, "loss": 1.9045, "step": 15710 }, { "epoch": 2.312756293558412, "grad_norm": 3.283283637406713, "learning_rate": 3.0316511811078895e-06, "loss": 1.9014, "step": 15720 }, { "epoch": 2.3142273955977273, "grad_norm": 3.0665443375732115, "learning_rate": 3.0193827787963705e-06, "loss": 1.9359, "step": 15730 }, { "epoch": 2.3156984976370425, "grad_norm": 3.30601770038876, "learning_rate": 3.0071348349944907e-06, "loss": 1.9168, "step": 15740 }, { "epoch": 2.3171695996763577, "grad_norm": 3.077309538853398, "learning_rate": 2.994907385598038e-06, "loss": 1.9398, "step": 15750 }, { "epoch": 2.318640701715673, "grad_norm": 3.400797299282931, "learning_rate": 2.9827004664427204e-06, "loss": 1.9754, "step": 15760 }, { "epoch": 2.3201118037549877, "grad_norm": 3.063787482704412, "learning_rate": 2.970514113304078e-06, "loss": 1.8986, "step": 15770 }, { "epoch": 2.321582905794303, "grad_norm": 3.0824041778188733, "learning_rate": 2.9583483618973917e-06, "loss": 1.9489, "step": 15780 }, { "epoch": 2.323054007833618, "grad_norm": 3.0066938802589878, "learning_rate": 2.946203247877537e-06, "loss": 1.9353, "step": 15790 }, { "epoch": 2.3245251098729334, "grad_norm": 3.0036551273383867, "learning_rate": 2.9340788068389302e-06, "loss": 1.9075, "step": 15800 }, { "epoch": 2.3259962119122486, "grad_norm": 3.1861929534409708, "learning_rate": 2.9219750743153907e-06, "loss": 1.9122, "step": 15810 }, { "epoch": 2.327467313951564, "grad_norm": 3.1518556617655036, "learning_rate": 2.909892085780044e-06, "loss": 1.9192, "step": 15820 }, { "epoch": 2.328938415990879, "grad_norm": 2.882849796840585, "learning_rate": 2.8978298766452197e-06, "loss": 1.8863, "step": 15830 }, { "epoch": 2.3304095180301942, "grad_norm": 3.066954173617023, "learning_rate": 2.8857884822623573e-06, "loss": 1.9257, "step": 15840 }, { "epoch": 2.3318806200695095, "grad_norm": 3.07323558338563, "learning_rate": 2.8737679379218852e-06, "loss": 1.9067, "step": 15850 }, { "epoch": 2.3333517221088247, "grad_norm": 3.3037418148966267, "learning_rate": 2.8617682788531208e-06, "loss": 1.946, "step": 15860 }, { "epoch": 2.33482282414814, "grad_norm": 3.2097869574001945, "learning_rate": 2.8497895402241905e-06, "loss": 1.9077, "step": 15870 }, { "epoch": 2.336293926187455, "grad_norm": 3.0007112813279337, "learning_rate": 2.83783175714188e-06, "loss": 1.8812, "step": 15880 }, { "epoch": 2.3377650282267703, "grad_norm": 3.5100013992967987, "learning_rate": 2.8258949646515867e-06, "loss": 1.9402, "step": 15890 }, { "epoch": 2.3392361302660856, "grad_norm": 3.522067335908392, "learning_rate": 2.8139791977371722e-06, "loss": 1.96, "step": 15900 }, { "epoch": 2.340707232305401, "grad_norm": 3.338728009833454, "learning_rate": 2.802084491320883e-06, "loss": 1.9121, "step": 15910 }, { "epoch": 2.342178334344716, "grad_norm": 3.078054596006625, "learning_rate": 2.79021088026324e-06, "loss": 1.9401, "step": 15920 }, { "epoch": 2.3436494363840312, "grad_norm": 2.9373859196024217, "learning_rate": 2.778358399362946e-06, "loss": 1.9121, "step": 15930 }, { "epoch": 2.3451205384233464, "grad_norm": 2.8731304505663773, "learning_rate": 2.766527083356767e-06, "loss": 1.9506, "step": 15940 }, { "epoch": 2.3465916404626617, "grad_norm": 2.999326917307762, "learning_rate": 2.754716966919445e-06, "loss": 1.9059, "step": 15950 }, { "epoch": 2.348062742501977, "grad_norm": 3.019570201810584, "learning_rate": 2.742928084663591e-06, "loss": 1.9193, "step": 15960 }, { "epoch": 2.349533844541292, "grad_norm": 3.1300053912939028, "learning_rate": 2.7311604711395767e-06, "loss": 1.9454, "step": 15970 }, { "epoch": 2.3510049465806073, "grad_norm": 2.827640778859149, "learning_rate": 2.719414160835454e-06, "loss": 1.8755, "step": 15980 }, { "epoch": 2.3524760486199225, "grad_norm": 2.9016920029457918, "learning_rate": 2.7076891881768297e-06, "loss": 1.9097, "step": 15990 }, { "epoch": 2.3539471506592378, "grad_norm": 2.989835505677974, "learning_rate": 2.6959855875267783e-06, "loss": 1.9198, "step": 16000 }, { "epoch": 2.3539471506592378, "eval_loss": 2.3836171627044678, "eval_runtime": 454.6344, "eval_samples_per_second": 201.456, "eval_steps_per_second": 6.297, "step": 16000 }, { "epoch": 2.355418252698553, "grad_norm": 2.8773499995320235, "learning_rate": 2.684303393185732e-06, "loss": 1.9353, "step": 16010 }, { "epoch": 2.356889354737868, "grad_norm": 3.174886049210558, "learning_rate": 2.672642639391404e-06, "loss": 1.8759, "step": 16020 }, { "epoch": 2.3583604567771834, "grad_norm": 2.8238483665318888, "learning_rate": 2.6610033603186436e-06, "loss": 1.8979, "step": 16030 }, { "epoch": 2.3598315588164986, "grad_norm": 3.007205239779479, "learning_rate": 2.6493855900793886e-06, "loss": 1.9208, "step": 16040 }, { "epoch": 2.3613026608558134, "grad_norm": 3.1433548100703184, "learning_rate": 2.6377893627225247e-06, "loss": 1.9087, "step": 16050 }, { "epoch": 2.3627737628951286, "grad_norm": 3.0254134370610197, "learning_rate": 2.626214712233801e-06, "loss": 1.9124, "step": 16060 }, { "epoch": 2.364244864934444, "grad_norm": 2.806080671861943, "learning_rate": 2.6146616725357433e-06, "loss": 1.8715, "step": 16070 }, { "epoch": 2.365715966973759, "grad_norm": 3.041682999627559, "learning_rate": 2.6031302774875176e-06, "loss": 1.9336, "step": 16080 }, { "epoch": 2.3671870690130743, "grad_norm": 3.055640517538353, "learning_rate": 2.591620560884881e-06, "loss": 1.9187, "step": 16090 }, { "epoch": 2.3686581710523895, "grad_norm": 2.8217232882163814, "learning_rate": 2.5801325564600367e-06, "loss": 1.9656, "step": 16100 }, { "epoch": 2.3701292730917047, "grad_norm": 3.230440081685476, "learning_rate": 2.5686662978815645e-06, "loss": 1.9131, "step": 16110 }, { "epoch": 2.37160037513102, "grad_norm": 3.365212436920359, "learning_rate": 2.557221818754306e-06, "loss": 1.9197, "step": 16120 }, { "epoch": 2.373071477170335, "grad_norm": 3.161453969191289, "learning_rate": 2.545799152619283e-06, "loss": 1.9064, "step": 16130 }, { "epoch": 2.3745425792096504, "grad_norm": 3.3659700344640107, "learning_rate": 2.534398332953577e-06, "loss": 1.918, "step": 16140 }, { "epoch": 2.3760136812489656, "grad_norm": 3.10686341719646, "learning_rate": 2.5230193931702483e-06, "loss": 1.9156, "step": 16150 }, { "epoch": 2.377484783288281, "grad_norm": 3.210901748157881, "learning_rate": 2.5116623666182325e-06, "loss": 1.916, "step": 16160 }, { "epoch": 2.378955885327596, "grad_norm": 3.2882730021569104, "learning_rate": 2.500327286582239e-06, "loss": 1.9409, "step": 16170 }, { "epoch": 2.3804269873669113, "grad_norm": 3.380030888218211, "learning_rate": 2.4890141862826657e-06, "loss": 1.9084, "step": 16180 }, { "epoch": 2.3818980894062265, "grad_norm": 3.6318486263495635, "learning_rate": 2.4777230988754854e-06, "loss": 1.9969, "step": 16190 }, { "epoch": 2.3833691914455417, "grad_norm": 3.1419605682967733, "learning_rate": 2.4664540574521577e-06, "loss": 1.9093, "step": 16200 }, { "epoch": 2.384840293484857, "grad_norm": 2.814285574565318, "learning_rate": 2.45520709503953e-06, "loss": 1.9375, "step": 16210 }, { "epoch": 2.386311395524172, "grad_norm": 3.067933686937432, "learning_rate": 2.443982244599753e-06, "loss": 1.9239, "step": 16220 }, { "epoch": 2.3877824975634874, "grad_norm": 3.3382484853712677, "learning_rate": 2.432779539030149e-06, "loss": 1.9556, "step": 16230 }, { "epoch": 2.3892535996028026, "grad_norm": 3.6017584805542495, "learning_rate": 2.421599011163163e-06, "loss": 1.9514, "step": 16240 }, { "epoch": 2.390724701642118, "grad_norm": 3.014429499635156, "learning_rate": 2.4104406937662285e-06, "loss": 1.8805, "step": 16250 }, { "epoch": 2.392195803681433, "grad_norm": 3.02218949090589, "learning_rate": 2.399304619541687e-06, "loss": 1.8949, "step": 16260 }, { "epoch": 2.393666905720748, "grad_norm": 3.2717010775690674, "learning_rate": 2.388190821126698e-06, "loss": 1.9763, "step": 16270 }, { "epoch": 2.395138007760063, "grad_norm": 2.848532300995941, "learning_rate": 2.377099331093129e-06, "loss": 1.9439, "step": 16280 }, { "epoch": 2.3966091097993782, "grad_norm": 2.8719064027142065, "learning_rate": 2.3660301819474673e-06, "loss": 1.8745, "step": 16290 }, { "epoch": 2.3980802118386935, "grad_norm": 3.275226891963252, "learning_rate": 2.354983406130725e-06, "loss": 1.8734, "step": 16300 }, { "epoch": 2.3995513138780087, "grad_norm": 3.1393251392525183, "learning_rate": 2.3439590360183526e-06, "loss": 1.9519, "step": 16310 }, { "epoch": 2.401022415917324, "grad_norm": 2.964691225182827, "learning_rate": 2.332957103920114e-06, "loss": 1.9576, "step": 16320 }, { "epoch": 2.402493517956639, "grad_norm": 3.132484774037708, "learning_rate": 2.3219776420800344e-06, "loss": 1.9811, "step": 16330 }, { "epoch": 2.4039646199959543, "grad_norm": 3.135405286719422, "learning_rate": 2.3110206826762714e-06, "loss": 1.9293, "step": 16340 }, { "epoch": 2.4054357220352696, "grad_norm": 3.1137490781553128, "learning_rate": 2.300086257821039e-06, "loss": 1.9482, "step": 16350 }, { "epoch": 2.406906824074585, "grad_norm": 3.3405094656526977, "learning_rate": 2.2891743995605008e-06, "loss": 1.9585, "step": 16360 }, { "epoch": 2.4083779261139, "grad_norm": 3.3544972977636176, "learning_rate": 2.278285139874695e-06, "loss": 1.9242, "step": 16370 }, { "epoch": 2.4098490281532152, "grad_norm": 3.073542867978393, "learning_rate": 2.2674185106774194e-06, "loss": 1.9452, "step": 16380 }, { "epoch": 2.4113201301925304, "grad_norm": 2.7544447600046955, "learning_rate": 2.256574543816148e-06, "loss": 1.9423, "step": 16390 }, { "epoch": 2.4127912322318457, "grad_norm": 3.278389145344879, "learning_rate": 2.245753271071941e-06, "loss": 1.9538, "step": 16400 }, { "epoch": 2.414262334271161, "grad_norm": 3.008016006532432, "learning_rate": 2.2349547241593407e-06, "loss": 1.9334, "step": 16410 }, { "epoch": 2.415733436310476, "grad_norm": 3.100136760002732, "learning_rate": 2.2241789347262997e-06, "loss": 1.891, "step": 16420 }, { "epoch": 2.4172045383497913, "grad_norm": 3.225094360437125, "learning_rate": 2.213425934354053e-06, "loss": 1.9611, "step": 16430 }, { "epoch": 2.4186756403891065, "grad_norm": 3.3510824242105937, "learning_rate": 2.2026957545570658e-06, "loss": 1.9318, "step": 16440 }, { "epoch": 2.4201467424284218, "grad_norm": 3.1959720145748363, "learning_rate": 2.191988426782912e-06, "loss": 1.8983, "step": 16450 }, { "epoch": 2.421617844467737, "grad_norm": 3.176740219344757, "learning_rate": 2.1813039824121874e-06, "loss": 1.9447, "step": 16460 }, { "epoch": 2.423088946507052, "grad_norm": 3.3232882374295274, "learning_rate": 2.170642452758437e-06, "loss": 1.9267, "step": 16470 }, { "epoch": 2.4245600485463674, "grad_norm": 3.3312105175648434, "learning_rate": 2.1600038690680346e-06, "loss": 1.9135, "step": 16480 }, { "epoch": 2.4260311505856826, "grad_norm": 3.596296522966716, "learning_rate": 2.149388262520109e-06, "loss": 1.9063, "step": 16490 }, { "epoch": 2.427502252624998, "grad_norm": 2.965222001940806, "learning_rate": 2.1387956642264484e-06, "loss": 1.9099, "step": 16500 }, { "epoch": 2.428973354664313, "grad_norm": 2.712377451126322, "learning_rate": 2.128226105231417e-06, "loss": 1.9576, "step": 16510 }, { "epoch": 2.4304444567036283, "grad_norm": 3.3693911147490776, "learning_rate": 2.1176796165118373e-06, "loss": 1.9471, "step": 16520 }, { "epoch": 2.4319155587429435, "grad_norm": 3.0712499155107276, "learning_rate": 2.107156228976941e-06, "loss": 1.9039, "step": 16530 }, { "epoch": 2.4333866607822587, "grad_norm": 3.2417458462387083, "learning_rate": 2.0966559734682403e-06, "loss": 1.9384, "step": 16540 }, { "epoch": 2.434857762821574, "grad_norm": 2.8835445515220113, "learning_rate": 2.0861788807594587e-06, "loss": 1.9328, "step": 16550 }, { "epoch": 2.4363288648608887, "grad_norm": 3.38703460430818, "learning_rate": 2.075724981556432e-06, "loss": 1.9275, "step": 16560 }, { "epoch": 2.437799966900204, "grad_norm": 3.1440563821412955, "learning_rate": 2.0652943064970278e-06, "loss": 1.8886, "step": 16570 }, { "epoch": 2.439271068939519, "grad_norm": 3.1858696502629287, "learning_rate": 2.054886886151044e-06, "loss": 1.9491, "step": 16580 }, { "epoch": 2.4407421709788344, "grad_norm": 3.0800830102917005, "learning_rate": 2.0445027510201234e-06, "loss": 1.931, "step": 16590 }, { "epoch": 2.4422132730181496, "grad_norm": 3.174479208528788, "learning_rate": 2.0341419315376744e-06, "loss": 1.9037, "step": 16600 }, { "epoch": 2.443684375057465, "grad_norm": 3.3545658673333287, "learning_rate": 2.0238044580687565e-06, "loss": 1.9263, "step": 16610 }, { "epoch": 2.44515547709678, "grad_norm": 2.986840111728175, "learning_rate": 2.0134903609100243e-06, "loss": 1.9029, "step": 16620 }, { "epoch": 2.4466265791360953, "grad_norm": 3.6414883446504596, "learning_rate": 2.003199670289614e-06, "loss": 1.92, "step": 16630 }, { "epoch": 2.4480976811754105, "grad_norm": 2.925720210393143, "learning_rate": 1.9929324163670637e-06, "loss": 1.8985, "step": 16640 }, { "epoch": 2.4495687832147257, "grad_norm": 3.225655952781597, "learning_rate": 1.98268862923322e-06, "loss": 1.9044, "step": 16650 }, { "epoch": 2.451039885254041, "grad_norm": 3.1464766970559617, "learning_rate": 1.972468338910166e-06, "loss": 1.92, "step": 16660 }, { "epoch": 2.452510987293356, "grad_norm": 2.8853956127394698, "learning_rate": 1.9622715753511114e-06, "loss": 1.9352, "step": 16670 }, { "epoch": 2.4539820893326714, "grad_norm": 3.449751427031657, "learning_rate": 1.952098368440315e-06, "loss": 1.9364, "step": 16680 }, { "epoch": 2.4554531913719866, "grad_norm": 3.2145839199707065, "learning_rate": 1.941948747993e-06, "loss": 1.9068, "step": 16690 }, { "epoch": 2.456924293411302, "grad_norm": 2.8552853606694435, "learning_rate": 1.9318227437552594e-06, "loss": 1.9271, "step": 16700 }, { "epoch": 2.458395395450617, "grad_norm": 3.487165850090067, "learning_rate": 1.9217203854039825e-06, "loss": 1.9104, "step": 16710 }, { "epoch": 2.4598664974899322, "grad_norm": 3.1290488935220013, "learning_rate": 1.9116417025467415e-06, "loss": 1.9279, "step": 16720 }, { "epoch": 2.4613375995292475, "grad_norm": 3.2811849508345072, "learning_rate": 1.901586724721739e-06, "loss": 1.949, "step": 16730 }, { "epoch": 2.4628087015685627, "grad_norm": 3.551660578581461, "learning_rate": 1.8915554813976932e-06, "loss": 1.9414, "step": 16740 }, { "epoch": 2.464279803607878, "grad_norm": 2.874929529192133, "learning_rate": 1.881548001973762e-06, "loss": 1.9124, "step": 16750 }, { "epoch": 2.465750905647193, "grad_norm": 3.102697302407939, "learning_rate": 1.871564315779466e-06, "loss": 1.9102, "step": 16760 }, { "epoch": 2.4672220076865083, "grad_norm": 3.2599692891720795, "learning_rate": 1.8616044520745835e-06, "loss": 1.9073, "step": 16770 }, { "epoch": 2.468693109725823, "grad_norm": 3.2939112050745143, "learning_rate": 1.8516684400490793e-06, "loss": 1.8641, "step": 16780 }, { "epoch": 2.4701642117651383, "grad_norm": 3.395052713680924, "learning_rate": 1.8417563088230128e-06, "loss": 1.9725, "step": 16790 }, { "epoch": 2.4716353138044536, "grad_norm": 3.082709369104829, "learning_rate": 1.8318680874464623e-06, "loss": 1.9203, "step": 16800 }, { "epoch": 2.473106415843769, "grad_norm": 2.971797156641281, "learning_rate": 1.822003804899416e-06, "loss": 1.9423, "step": 16810 }, { "epoch": 2.474577517883084, "grad_norm": 3.0615871614569024, "learning_rate": 1.8121634900917196e-06, "loss": 1.8796, "step": 16820 }, { "epoch": 2.476048619922399, "grad_norm": 3.230026031371955, "learning_rate": 1.8023471718629671e-06, "loss": 1.9392, "step": 16830 }, { "epoch": 2.4775197219617144, "grad_norm": 3.3368899231559683, "learning_rate": 1.7925548789824255e-06, "loss": 1.9492, "step": 16840 }, { "epoch": 2.4789908240010297, "grad_norm": 3.0270397198465853, "learning_rate": 1.7827866401489458e-06, "loss": 1.9008, "step": 16850 }, { "epoch": 2.480461926040345, "grad_norm": 3.1779882272564604, "learning_rate": 1.7730424839908922e-06, "loss": 1.888, "step": 16860 }, { "epoch": 2.48193302807966, "grad_norm": 3.028364990617449, "learning_rate": 1.7633224390660386e-06, "loss": 1.94, "step": 16870 }, { "epoch": 2.4834041301189753, "grad_norm": 3.130967209512705, "learning_rate": 1.7536265338614967e-06, "loss": 1.9443, "step": 16880 }, { "epoch": 2.4848752321582905, "grad_norm": 2.9457441313682042, "learning_rate": 1.7439547967936378e-06, "loss": 1.9308, "step": 16890 }, { "epoch": 2.4863463341976058, "grad_norm": 3.0480812434203925, "learning_rate": 1.7343072562079889e-06, "loss": 1.9355, "step": 16900 }, { "epoch": 2.487817436236921, "grad_norm": 2.942201040879157, "learning_rate": 1.7246839403791749e-06, "loss": 1.8874, "step": 16910 }, { "epoch": 2.489288538276236, "grad_norm": 3.28467037352017, "learning_rate": 1.7150848775108187e-06, "loss": 1.9268, "step": 16920 }, { "epoch": 2.4907596403155514, "grad_norm": 3.2409274364448715, "learning_rate": 1.7055100957354642e-06, "loss": 1.9206, "step": 16930 }, { "epoch": 2.4922307423548666, "grad_norm": 3.147522805453268, "learning_rate": 1.6959596231144904e-06, "loss": 1.9336, "step": 16940 }, { "epoch": 2.493701844394182, "grad_norm": 3.274155138470018, "learning_rate": 1.6864334876380395e-06, "loss": 1.9273, "step": 16950 }, { "epoch": 2.495172946433497, "grad_norm": 3.344148580511051, "learning_rate": 1.67693171722492e-06, "loss": 1.9234, "step": 16960 }, { "epoch": 2.4966440484728123, "grad_norm": 3.220432787079356, "learning_rate": 1.6674543397225362e-06, "loss": 1.8997, "step": 16970 }, { "epoch": 2.4981151505121275, "grad_norm": 3.1128416734034454, "learning_rate": 1.6580013829068008e-06, "loss": 1.9286, "step": 16980 }, { "epoch": 2.4995862525514427, "grad_norm": 2.9774686143006175, "learning_rate": 1.6485728744820529e-06, "loss": 1.895, "step": 16990 }, { "epoch": 2.501057354590758, "grad_norm": 3.152616345235875, "learning_rate": 1.6391688420809903e-06, "loss": 1.9226, "step": 17000 }, { "epoch": 2.501057354590758, "eval_loss": 2.3770883083343506, "eval_runtime": 343.1483, "eval_samples_per_second": 266.908, "eval_steps_per_second": 8.343, "step": 17000 }, { "epoch": 2.502528456630073, "grad_norm": 3.5602430734748873, "learning_rate": 1.6297893132645603e-06, "loss": 1.9402, "step": 17010 }, { "epoch": 2.5039995586693884, "grad_norm": 2.7992256331290055, "learning_rate": 1.6204343155219115e-06, "loss": 1.8941, "step": 17020 }, { "epoch": 2.5054706607087036, "grad_norm": 2.965618954520359, "learning_rate": 1.6111038762702901e-06, "loss": 1.9106, "step": 17030 }, { "epoch": 2.506941762748019, "grad_norm": 2.8504531178440473, "learning_rate": 1.601798022854969e-06, "loss": 1.9367, "step": 17040 }, { "epoch": 2.508412864787334, "grad_norm": 2.8321280548695373, "learning_rate": 1.5925167825491616e-06, "loss": 1.9295, "step": 17050 }, { "epoch": 2.5098839668266493, "grad_norm": 3.340110142652952, "learning_rate": 1.5832601825539563e-06, "loss": 1.8928, "step": 17060 }, { "epoch": 2.5113550688659645, "grad_norm": 3.136311845685044, "learning_rate": 1.5740282499982186e-06, "loss": 1.9116, "step": 17070 }, { "epoch": 2.5128261709052793, "grad_norm": 3.3763789627830167, "learning_rate": 1.5648210119385188e-06, "loss": 1.9496, "step": 17080 }, { "epoch": 2.5142972729445945, "grad_norm": 3.3764481018163877, "learning_rate": 1.5556384953590642e-06, "loss": 1.8957, "step": 17090 }, { "epoch": 2.5157683749839097, "grad_norm": 2.9348831290769057, "learning_rate": 1.5464807271715909e-06, "loss": 1.8845, "step": 17100 }, { "epoch": 2.517239477023225, "grad_norm": 3.4974431717562258, "learning_rate": 1.5373477342153232e-06, "loss": 1.9088, "step": 17110 }, { "epoch": 2.51871057906254, "grad_norm": 3.463997491989295, "learning_rate": 1.5282395432568632e-06, "loss": 1.9269, "step": 17120 }, { "epoch": 2.5201816811018554, "grad_norm": 3.322311189976147, "learning_rate": 1.5191561809901267e-06, "loss": 1.9327, "step": 17130 }, { "epoch": 2.5216527831411706, "grad_norm": 3.0954891408356056, "learning_rate": 1.5100976740362604e-06, "loss": 1.8925, "step": 17140 }, { "epoch": 2.523123885180486, "grad_norm": 3.1221204348713947, "learning_rate": 1.5010640489435758e-06, "loss": 1.9521, "step": 17150 }, { "epoch": 2.524594987219801, "grad_norm": 3.135084341277902, "learning_rate": 1.4920553321874497e-06, "loss": 1.9224, "step": 17160 }, { "epoch": 2.5260660892591162, "grad_norm": 2.9801808508884116, "learning_rate": 1.4830715501702652e-06, "loss": 1.9432, "step": 17170 }, { "epoch": 2.5275371912984315, "grad_norm": 2.9584830705024427, "learning_rate": 1.4741127292213264e-06, "loss": 1.9115, "step": 17180 }, { "epoch": 2.5290082933377467, "grad_norm": 3.2397107686662117, "learning_rate": 1.4651788955967783e-06, "loss": 1.9015, "step": 17190 }, { "epoch": 2.530479395377062, "grad_norm": 3.247515639271438, "learning_rate": 1.4562700754795434e-06, "loss": 1.9262, "step": 17200 }, { "epoch": 2.531950497416377, "grad_norm": 2.9039298446229402, "learning_rate": 1.4473862949792272e-06, "loss": 1.8856, "step": 17210 }, { "epoch": 2.5334215994556923, "grad_norm": 3.205693380047616, "learning_rate": 1.438527580132054e-06, "loss": 1.8963, "step": 17220 }, { "epoch": 2.5348927014950076, "grad_norm": 3.198478153615264, "learning_rate": 1.4296939569007828e-06, "loss": 1.9117, "step": 17230 }, { "epoch": 2.536363803534323, "grad_norm": 3.002921255265118, "learning_rate": 1.4208854511746451e-06, "loss": 1.9258, "step": 17240 }, { "epoch": 2.537834905573638, "grad_norm": 3.242120971368771, "learning_rate": 1.4121020887692427e-06, "loss": 1.9291, "step": 17250 }, { "epoch": 2.539306007612953, "grad_norm": 2.939354541138073, "learning_rate": 1.4033438954265055e-06, "loss": 1.9389, "step": 17260 }, { "epoch": 2.540777109652268, "grad_norm": 2.8859791246128577, "learning_rate": 1.3946108968145866e-06, "loss": 1.8831, "step": 17270 }, { "epoch": 2.542248211691583, "grad_norm": 2.7862681442770083, "learning_rate": 1.3859031185278038e-06, "loss": 1.9267, "step": 17280 }, { "epoch": 2.5437193137308984, "grad_norm": 3.396790626060677, "learning_rate": 1.3772205860865662e-06, "loss": 1.9253, "step": 17290 }, { "epoch": 2.5451904157702137, "grad_norm": 3.2405359158636107, "learning_rate": 1.3685633249372787e-06, "loss": 1.9357, "step": 17300 }, { "epoch": 2.546661517809529, "grad_norm": 2.8837741206896252, "learning_rate": 1.359931360452298e-06, "loss": 1.9012, "step": 17310 }, { "epoch": 2.548132619848844, "grad_norm": 3.4358592699584913, "learning_rate": 1.3513247179298328e-06, "loss": 1.9115, "step": 17320 }, { "epoch": 2.5496037218881593, "grad_norm": 3.201329086946682, "learning_rate": 1.3427434225938807e-06, "loss": 1.9, "step": 17330 }, { "epoch": 2.5510748239274745, "grad_norm": 3.1307410923101684, "learning_rate": 1.3341874995941529e-06, "loss": 1.9282, "step": 17340 }, { "epoch": 2.5525459259667898, "grad_norm": 3.426287266239598, "learning_rate": 1.3256569740060043e-06, "loss": 1.9164, "step": 17350 }, { "epoch": 2.554017028006105, "grad_norm": 3.187291274688601, "learning_rate": 1.3171518708303531e-06, "loss": 1.896, "step": 17360 }, { "epoch": 2.55548813004542, "grad_norm": 3.3315961755202936, "learning_rate": 1.3086722149936083e-06, "loss": 1.8955, "step": 17370 }, { "epoch": 2.5569592320847354, "grad_norm": 2.868859870943429, "learning_rate": 1.3002180313476021e-06, "loss": 1.9071, "step": 17380 }, { "epoch": 2.5584303341240506, "grad_norm": 2.747715831334886, "learning_rate": 1.2917893446695107e-06, "loss": 1.8616, "step": 17390 }, { "epoch": 2.559901436163366, "grad_norm": 3.0397268420279513, "learning_rate": 1.283386179661792e-06, "loss": 1.9306, "step": 17400 }, { "epoch": 2.561372538202681, "grad_norm": 2.8156782127842015, "learning_rate": 1.2750085609520968e-06, "loss": 1.9014, "step": 17410 }, { "epoch": 2.5628436402419963, "grad_norm": 3.144511918092508, "learning_rate": 1.2666565130932117e-06, "loss": 1.9385, "step": 17420 }, { "epoch": 2.5643147422813115, "grad_norm": 3.0278553058061686, "learning_rate": 1.2583300605629767e-06, "loss": 1.9204, "step": 17430 }, { "epoch": 2.5657858443206267, "grad_norm": 3.220230842577206, "learning_rate": 1.2500292277642267e-06, "loss": 1.9557, "step": 17440 }, { "epoch": 2.567256946359942, "grad_norm": 3.2002279348195573, "learning_rate": 1.2417540390246996e-06, "loss": 1.912, "step": 17450 }, { "epoch": 2.568728048399257, "grad_norm": 3.4876039609899974, "learning_rate": 1.2335045185969863e-06, "loss": 1.9263, "step": 17460 }, { "epoch": 2.5701991504385724, "grad_norm": 3.24022117390648, "learning_rate": 1.2252806906584457e-06, "loss": 1.9398, "step": 17470 }, { "epoch": 2.5716702524778876, "grad_norm": 3.186814929659408, "learning_rate": 1.2170825793111364e-06, "loss": 1.9229, "step": 17480 }, { "epoch": 2.573141354517203, "grad_norm": 3.2814928579510507, "learning_rate": 1.208910208581755e-06, "loss": 1.9431, "step": 17490 }, { "epoch": 2.574612456556518, "grad_norm": 3.4383279870752794, "learning_rate": 1.2007636024215519e-06, "loss": 1.8985, "step": 17500 }, { "epoch": 2.5760835585958333, "grad_norm": 3.3873529843883605, "learning_rate": 1.1926427847062694e-06, "loss": 1.9436, "step": 17510 }, { "epoch": 2.5775546606351485, "grad_norm": 2.9083794707695225, "learning_rate": 1.1845477792360681e-06, "loss": 1.9091, "step": 17520 }, { "epoch": 2.5790257626744637, "grad_norm": 3.2580172304863524, "learning_rate": 1.1764786097354697e-06, "loss": 1.9265, "step": 17530 }, { "epoch": 2.580496864713779, "grad_norm": 3.124057673845212, "learning_rate": 1.1684352998532577e-06, "loss": 1.9444, "step": 17540 }, { "epoch": 2.581967966753094, "grad_norm": 3.5170913370192847, "learning_rate": 1.160417873162446e-06, "loss": 1.9268, "step": 17550 }, { "epoch": 2.5834390687924094, "grad_norm": 2.9092873130947248, "learning_rate": 1.1524263531601809e-06, "loss": 1.9544, "step": 17560 }, { "epoch": 2.5849101708317246, "grad_norm": 3.133927423805342, "learning_rate": 1.144460763267683e-06, "loss": 1.9274, "step": 17570 }, { "epoch": 2.5863812728710394, "grad_norm": 3.005904689712739, "learning_rate": 1.1365211268301857e-06, "loss": 1.8869, "step": 17580 }, { "epoch": 2.5878523749103546, "grad_norm": 3.43711147740001, "learning_rate": 1.1286074671168456e-06, "loss": 1.9194, "step": 17590 }, { "epoch": 2.58932347694967, "grad_norm": 3.585701617855294, "learning_rate": 1.1207198073207016e-06, "loss": 1.9014, "step": 17600 }, { "epoch": 2.590794578988985, "grad_norm": 3.1180366994965256, "learning_rate": 1.112858170558584e-06, "loss": 1.8908, "step": 17610 }, { "epoch": 2.5922656810283002, "grad_norm": 3.025567328628804, "learning_rate": 1.1050225798710602e-06, "loss": 1.9076, "step": 17620 }, { "epoch": 2.5937367830676155, "grad_norm": 3.3937973926428486, "learning_rate": 1.0972130582223572e-06, "loss": 1.9208, "step": 17630 }, { "epoch": 2.5952078851069307, "grad_norm": 3.323651980206702, "learning_rate": 1.0894296285003125e-06, "loss": 1.8946, "step": 17640 }, { "epoch": 2.596678987146246, "grad_norm": 3.041045017908416, "learning_rate": 1.0816723135162765e-06, "loss": 1.9175, "step": 17650 }, { "epoch": 2.598150089185561, "grad_norm": 3.0566514985000346, "learning_rate": 1.073941136005079e-06, "loss": 1.9201, "step": 17660 }, { "epoch": 2.5996211912248763, "grad_norm": 3.283924286290687, "learning_rate": 1.0662361186249403e-06, "loss": 1.9162, "step": 17670 }, { "epoch": 2.6010922932641916, "grad_norm": 3.6037577254567186, "learning_rate": 1.0585572839574099e-06, "loss": 1.9395, "step": 17680 }, { "epoch": 2.6025633953035068, "grad_norm": 3.4019792965561773, "learning_rate": 1.0509046545073098e-06, "loss": 1.9073, "step": 17690 }, { "epoch": 2.604034497342822, "grad_norm": 2.846285412867, "learning_rate": 1.0432782527026531e-06, "loss": 1.9244, "step": 17700 }, { "epoch": 2.605505599382137, "grad_norm": 2.9778075366366084, "learning_rate": 1.0356781008945882e-06, "loss": 1.9368, "step": 17710 }, { "epoch": 2.6069767014214524, "grad_norm": 3.594257448396519, "learning_rate": 1.02810422135733e-06, "loss": 1.9728, "step": 17720 }, { "epoch": 2.6084478034607677, "grad_norm": 3.242946774237679, "learning_rate": 1.0205566362881024e-06, "loss": 1.8859, "step": 17730 }, { "epoch": 2.609918905500083, "grad_norm": 3.3682089472281236, "learning_rate": 1.013035367807056e-06, "loss": 1.9547, "step": 17740 }, { "epoch": 2.611390007539398, "grad_norm": 3.6596376295508453, "learning_rate": 1.005540437957223e-06, "loss": 1.9679, "step": 17750 }, { "epoch": 2.6128611095787133, "grad_norm": 3.1309695564260314, "learning_rate": 9.980718687044377e-07, "loss": 1.9237, "step": 17760 }, { "epoch": 2.614332211618028, "grad_norm": 3.209908156614619, "learning_rate": 9.906296819372785e-07, "loss": 1.9309, "step": 17770 }, { "epoch": 2.6158033136573433, "grad_norm": 3.016025141831291, "learning_rate": 9.83213899467008e-07, "loss": 1.9022, "step": 17780 }, { "epoch": 2.6172744156966585, "grad_norm": 3.076711978298668, "learning_rate": 9.758245430274994e-07, "loss": 1.9044, "step": 17790 }, { "epoch": 2.6187455177359737, "grad_norm": 2.9031905338200223, "learning_rate": 9.684616342751763e-07, "loss": 1.8995, "step": 17800 }, { "epoch": 2.620216619775289, "grad_norm": 3.399345534090669, "learning_rate": 9.611251947889533e-07, "loss": 1.9182, "step": 17810 }, { "epoch": 2.621687721814604, "grad_norm": 3.356328312163814, "learning_rate": 9.538152460701745e-07, "loss": 1.8936, "step": 17820 }, { "epoch": 2.6231588238539194, "grad_norm": 2.8555026554071143, "learning_rate": 9.465318095425335e-07, "loss": 1.9004, "step": 17830 }, { "epoch": 2.6246299258932346, "grad_norm": 3.076037003558339, "learning_rate": 9.392749065520357e-07, "loss": 1.9034, "step": 17840 }, { "epoch": 2.62610102793255, "grad_norm": 3.3755400593178075, "learning_rate": 9.320445583669157e-07, "loss": 1.8553, "step": 17850 }, { "epoch": 2.627572129971865, "grad_norm": 3.0323788010295787, "learning_rate": 9.248407861775843e-07, "loss": 1.924, "step": 17860 }, { "epoch": 2.6290432320111803, "grad_norm": 3.178669838891407, "learning_rate": 9.176636110965665e-07, "loss": 1.8888, "step": 17870 }, { "epoch": 2.6305143340504955, "grad_norm": 3.0774036223248413, "learning_rate": 9.105130541584328e-07, "loss": 1.8889, "step": 17880 }, { "epoch": 2.6319854360898107, "grad_norm": 3.1460011600276654, "learning_rate": 9.033891363197478e-07, "loss": 1.9216, "step": 17890 }, { "epoch": 2.633456538129126, "grad_norm": 3.028717205392919, "learning_rate": 8.962918784590013e-07, "loss": 1.9055, "step": 17900 }, { "epoch": 2.634927640168441, "grad_norm": 3.3854126751035234, "learning_rate": 8.892213013765472e-07, "loss": 1.9259, "step": 17910 }, { "epoch": 2.6363987422077564, "grad_norm": 3.286680259119932, "learning_rate": 8.821774257945436e-07, "loss": 1.9002, "step": 17920 }, { "epoch": 2.6378698442470716, "grad_norm": 2.992020957824625, "learning_rate": 8.751602723569008e-07, "loss": 1.901, "step": 17930 }, { "epoch": 2.639340946286387, "grad_norm": 3.562186348114735, "learning_rate": 8.681698616292011e-07, "loss": 1.8654, "step": 17940 }, { "epoch": 2.640812048325702, "grad_norm": 3.480379241166342, "learning_rate": 8.61206214098661e-07, "loss": 1.9044, "step": 17950 }, { "epoch": 2.6422831503650173, "grad_norm": 2.7276680333579595, "learning_rate": 8.542693501740551e-07, "loss": 1.8875, "step": 17960 }, { "epoch": 2.6437542524043325, "grad_norm": 2.963633804826828, "learning_rate": 8.473592901856597e-07, "loss": 1.8975, "step": 17970 }, { "epoch": 2.6452253544436477, "grad_norm": 3.2697672785016363, "learning_rate": 8.404760543852031e-07, "loss": 1.8913, "step": 17980 }, { "epoch": 2.646696456482963, "grad_norm": 3.1472042103318354, "learning_rate": 8.336196629457916e-07, "loss": 1.9337, "step": 17990 }, { "epoch": 2.648167558522278, "grad_norm": 3.2612365294476473, "learning_rate": 8.267901359618569e-07, "loss": 1.9083, "step": 18000 }, { "epoch": 2.648167558522278, "eval_loss": 2.372968912124634, "eval_runtime": 579.2622, "eval_samples_per_second": 158.113, "eval_steps_per_second": 4.942, "step": 18000 }, { "epoch": 2.6496386605615934, "grad_norm": 3.263886248253258, "learning_rate": 8.199874934491003e-07, "loss": 1.8997, "step": 18010 }, { "epoch": 2.6511097626009086, "grad_norm": 2.7384375235336043, "learning_rate": 8.132117553444341e-07, "loss": 1.9186, "step": 18020 }, { "epoch": 2.652580864640224, "grad_norm": 3.359796471547076, "learning_rate": 8.0646294150591e-07, "loss": 1.9293, "step": 18030 }, { "epoch": 2.654051966679539, "grad_norm": 2.934780826080653, "learning_rate": 7.997410717126819e-07, "loss": 1.8752, "step": 18040 }, { "epoch": 2.6555230687188542, "grad_norm": 3.095515881057229, "learning_rate": 7.930461656649335e-07, "loss": 1.9313, "step": 18050 }, { "epoch": 2.6569941707581695, "grad_norm": 3.3254244973097467, "learning_rate": 7.863782429838229e-07, "loss": 1.8678, "step": 18060 }, { "epoch": 2.6584652727974847, "grad_norm": 3.3447799815867714, "learning_rate": 7.797373232114235e-07, "loss": 1.9021, "step": 18070 }, { "epoch": 2.6599363748367995, "grad_norm": 3.036511336230227, "learning_rate": 7.731234258106801e-07, "loss": 1.9232, "step": 18080 }, { "epoch": 2.6614074768761147, "grad_norm": 3.380355238207467, "learning_rate": 7.665365701653304e-07, "loss": 1.904, "step": 18090 }, { "epoch": 2.66287857891543, "grad_norm": 3.1021307015920585, "learning_rate": 7.599767755798637e-07, "loss": 1.8664, "step": 18100 }, { "epoch": 2.664349680954745, "grad_norm": 3.099495758856401, "learning_rate": 7.534440612794647e-07, "loss": 1.8696, "step": 18110 }, { "epoch": 2.6658207829940603, "grad_norm": 3.113721904453489, "learning_rate": 7.469384464099394e-07, "loss": 1.9076, "step": 18120 }, { "epoch": 2.6672918850333756, "grad_norm": 3.59279975361112, "learning_rate": 7.404599500376853e-07, "loss": 1.9305, "step": 18130 }, { "epoch": 2.6687629870726908, "grad_norm": 3.2718736938361577, "learning_rate": 7.340085911496154e-07, "loss": 1.9106, "step": 18140 }, { "epoch": 2.670234089112006, "grad_norm": 3.03092352167849, "learning_rate": 7.275843886531108e-07, "loss": 1.8994, "step": 18150 }, { "epoch": 2.671705191151321, "grad_norm": 3.134785958537421, "learning_rate": 7.211873613759623e-07, "loss": 1.924, "step": 18160 }, { "epoch": 2.6731762931906364, "grad_norm": 2.8635251945477513, "learning_rate": 7.148175280663173e-07, "loss": 1.8901, "step": 18170 }, { "epoch": 2.6746473952299517, "grad_norm": 3.0184367088040944, "learning_rate": 7.084749073926267e-07, "loss": 1.9024, "step": 18180 }, { "epoch": 2.676118497269267, "grad_norm": 3.1877192485732313, "learning_rate": 7.021595179435858e-07, "loss": 1.8869, "step": 18190 }, { "epoch": 2.677589599308582, "grad_norm": 3.0304616275915444, "learning_rate": 6.958713782280802e-07, "loss": 1.9138, "step": 18200 }, { "epoch": 2.6790607013478973, "grad_norm": 3.375883518091937, "learning_rate": 6.896105066751336e-07, "loss": 1.903, "step": 18210 }, { "epoch": 2.6805318033872125, "grad_norm": 3.399771120677976, "learning_rate": 6.833769216338592e-07, "loss": 1.9434, "step": 18220 }, { "epoch": 2.6820029054265278, "grad_norm": 3.464654753384636, "learning_rate": 6.771706413733891e-07, "loss": 1.8955, "step": 18230 }, { "epoch": 2.683474007465843, "grad_norm": 3.40497974881624, "learning_rate": 6.709916840828412e-07, "loss": 1.8956, "step": 18240 }, { "epoch": 2.684945109505158, "grad_norm": 3.2489402539724113, "learning_rate": 6.64840067871253e-07, "loss": 1.8824, "step": 18250 }, { "epoch": 2.6864162115444734, "grad_norm": 2.9372707950380494, "learning_rate": 6.587158107675296e-07, "loss": 1.8995, "step": 18260 }, { "epoch": 2.687887313583788, "grad_norm": 3.409684199085107, "learning_rate": 6.526189307203934e-07, "loss": 1.9396, "step": 18270 }, { "epoch": 2.6893584156231034, "grad_norm": 3.039825882687274, "learning_rate": 6.46549445598339e-07, "loss": 1.9147, "step": 18280 }, { "epoch": 2.6908295176624186, "grad_norm": 2.824694059025011, "learning_rate": 6.405073731895628e-07, "loss": 1.8602, "step": 18290 }, { "epoch": 2.692300619701734, "grad_norm": 2.740844742975129, "learning_rate": 6.344927312019244e-07, "loss": 1.8901, "step": 18300 }, { "epoch": 2.693771721741049, "grad_norm": 3.0864547689197757, "learning_rate": 6.285055372628979e-07, "loss": 1.9077, "step": 18310 }, { "epoch": 2.6952428237803643, "grad_norm": 2.8625290108807904, "learning_rate": 6.225458089195013e-07, "loss": 1.9485, "step": 18320 }, { "epoch": 2.6967139258196795, "grad_norm": 3.220872501247995, "learning_rate": 6.166135636382697e-07, "loss": 1.9445, "step": 18330 }, { "epoch": 2.6981850278589947, "grad_norm": 3.079749937272982, "learning_rate": 6.107088188051858e-07, "loss": 1.9098, "step": 18340 }, { "epoch": 2.69965612989831, "grad_norm": 3.0314425842801502, "learning_rate": 6.048315917256342e-07, "loss": 1.8891, "step": 18350 }, { "epoch": 2.701127231937625, "grad_norm": 3.303538482850898, "learning_rate": 5.989818996243535e-07, "loss": 1.8809, "step": 18360 }, { "epoch": 2.7025983339769404, "grad_norm": 3.355391495622468, "learning_rate": 5.931597596453876e-07, "loss": 1.972, "step": 18370 }, { "epoch": 2.7040694360162556, "grad_norm": 3.089107663830533, "learning_rate": 5.873651888520238e-07, "loss": 1.9415, "step": 18380 }, { "epoch": 2.705540538055571, "grad_norm": 3.0952898144550467, "learning_rate": 5.815982042267565e-07, "loss": 1.9161, "step": 18390 }, { "epoch": 2.707011640094886, "grad_norm": 3.0662767830215785, "learning_rate": 5.758588226712336e-07, "loss": 1.9295, "step": 18400 }, { "epoch": 2.7084827421342013, "grad_norm": 3.1589083479233424, "learning_rate": 5.701470610061954e-07, "loss": 1.865, "step": 18410 }, { "epoch": 2.7099538441735165, "grad_norm": 3.2068775671699887, "learning_rate": 5.644629359714449e-07, "loss": 1.9004, "step": 18420 }, { "epoch": 2.7114249462128317, "grad_norm": 3.145448585204527, "learning_rate": 5.588064642257851e-07, "loss": 1.8946, "step": 18430 }, { "epoch": 2.712896048252147, "grad_norm": 3.0825364947555194, "learning_rate": 5.531776623469731e-07, "loss": 1.901, "step": 18440 }, { "epoch": 2.714367150291462, "grad_norm": 2.8496223355545998, "learning_rate": 5.475765468316718e-07, "loss": 1.8991, "step": 18450 }, { "epoch": 2.7158382523307774, "grad_norm": 3.056409622936921, "learning_rate": 5.42003134095408e-07, "loss": 1.9186, "step": 18460 }, { "epoch": 2.7173093543700926, "grad_norm": 2.967921089459645, "learning_rate": 5.364574404725053e-07, "loss": 1.8772, "step": 18470 }, { "epoch": 2.718780456409408, "grad_norm": 3.318839786469856, "learning_rate": 5.309394822160651e-07, "loss": 1.9018, "step": 18480 }, { "epoch": 2.720251558448723, "grad_norm": 3.1514662895969314, "learning_rate": 5.254492754978924e-07, "loss": 1.909, "step": 18490 }, { "epoch": 2.7217226604880382, "grad_norm": 3.1703519745485953, "learning_rate": 5.199868364084615e-07, "loss": 1.9588, "step": 18500 }, { "epoch": 2.7231937625273535, "grad_norm": 3.1805228066343516, "learning_rate": 5.14552180956871e-07, "loss": 1.9164, "step": 18510 }, { "epoch": 2.7246648645666687, "grad_norm": 3.027896976648664, "learning_rate": 5.091453250707823e-07, "loss": 1.8507, "step": 18520 }, { "epoch": 2.726135966605984, "grad_norm": 3.1300480429481166, "learning_rate": 5.037662845963942e-07, "loss": 1.9107, "step": 18530 }, { "epoch": 2.727607068645299, "grad_norm": 2.915515464952128, "learning_rate": 4.984150752983785e-07, "loss": 1.9086, "step": 18540 }, { "epoch": 2.7290781706846143, "grad_norm": 3.2784747771445293, "learning_rate": 4.930917128598434e-07, "loss": 1.9205, "step": 18550 }, { "epoch": 2.7305492727239296, "grad_norm": 3.474015303411599, "learning_rate": 4.877962128822799e-07, "loss": 1.9115, "step": 18560 }, { "epoch": 2.7320203747632448, "grad_norm": 3.307568608819601, "learning_rate": 4.825285908855281e-07, "loss": 1.8862, "step": 18570 }, { "epoch": 2.73349147680256, "grad_norm": 3.591086778431778, "learning_rate": 4.772888623077209e-07, "loss": 1.8685, "step": 18580 }, { "epoch": 2.7349625788418748, "grad_norm": 3.1352576605769378, "learning_rate": 4.72077042505239e-07, "loss": 1.9107, "step": 18590 }, { "epoch": 2.73643368088119, "grad_norm": 3.09464577382086, "learning_rate": 4.668931467526772e-07, "loss": 1.8879, "step": 18600 }, { "epoch": 2.737904782920505, "grad_norm": 3.1212308668245337, "learning_rate": 4.617371902427803e-07, "loss": 1.9553, "step": 18610 }, { "epoch": 2.7393758849598204, "grad_norm": 3.8194847495262314, "learning_rate": 4.5660918808642206e-07, "loss": 1.8552, "step": 18620 }, { "epoch": 2.7408469869991356, "grad_norm": 3.193647205307879, "learning_rate": 4.515091553125406e-07, "loss": 1.9044, "step": 18630 }, { "epoch": 2.742318089038451, "grad_norm": 3.5096869718481205, "learning_rate": 4.464371068681062e-07, "loss": 1.9449, "step": 18640 }, { "epoch": 2.743789191077766, "grad_norm": 3.718879143329132, "learning_rate": 4.413930576180692e-07, "loss": 1.9313, "step": 18650 }, { "epoch": 2.7452602931170813, "grad_norm": 2.7831300081066015, "learning_rate": 4.363770223453301e-07, "loss": 1.8805, "step": 18660 }, { "epoch": 2.7467313951563965, "grad_norm": 3.325672498751171, "learning_rate": 4.3138901575067615e-07, "loss": 1.9024, "step": 18670 }, { "epoch": 2.7482024971957117, "grad_norm": 3.353572963355487, "learning_rate": 4.264290524527581e-07, "loss": 1.9294, "step": 18680 }, { "epoch": 2.749673599235027, "grad_norm": 3.6002541788269355, "learning_rate": 4.214971469880336e-07, "loss": 1.9338, "step": 18690 }, { "epoch": 2.751144701274342, "grad_norm": 3.480447658335368, "learning_rate": 4.1659331381073033e-07, "loss": 1.9091, "step": 18700 }, { "epoch": 2.7526158033136574, "grad_norm": 3.1428581594382567, "learning_rate": 4.117175672928053e-07, "loss": 1.8724, "step": 18710 }, { "epoch": 2.7540869053529726, "grad_norm": 2.6982019239355637, "learning_rate": 4.0686992172389804e-07, "loss": 1.9211, "step": 18720 }, { "epoch": 2.755558007392288, "grad_norm": 3.5654923432165435, "learning_rate": 4.0205039131129167e-07, "loss": 1.9238, "step": 18730 }, { "epoch": 2.757029109431603, "grad_norm": 3.1694958561850877, "learning_rate": 3.9725899017986733e-07, "loss": 1.9164, "step": 18740 }, { "epoch": 2.7585002114709183, "grad_norm": 3.1116180665974227, "learning_rate": 3.924957323720746e-07, "loss": 1.8935, "step": 18750 }, { "epoch": 2.7599713135102335, "grad_norm": 2.9872699083203, "learning_rate": 3.8776063184787107e-07, "loss": 1.9095, "step": 18760 }, { "epoch": 2.7614424155495483, "grad_norm": 2.9603153799524646, "learning_rate": 3.830537024846992e-07, "loss": 1.87, "step": 18770 }, { "epoch": 2.7629135175888635, "grad_norm": 2.9239190556422034, "learning_rate": 3.783749580774376e-07, "loss": 1.9098, "step": 18780 }, { "epoch": 2.7643846196281787, "grad_norm": 2.802641134816102, "learning_rate": 3.7372441233835764e-07, "loss": 1.8574, "step": 18790 }, { "epoch": 2.765855721667494, "grad_norm": 2.983750165825885, "learning_rate": 3.691020788970934e-07, "loss": 1.9068, "step": 18800 }, { "epoch": 2.767326823706809, "grad_norm": 3.181647278728009, "learning_rate": 3.6450797130058947e-07, "loss": 1.9092, "step": 18810 }, { "epoch": 2.7687979257461244, "grad_norm": 3.323994377369704, "learning_rate": 3.599421030130712e-07, "loss": 1.8758, "step": 18820 }, { "epoch": 2.7702690277854396, "grad_norm": 3.051189159524044, "learning_rate": 3.554044874160001e-07, "loss": 1.8789, "step": 18830 }, { "epoch": 2.771740129824755, "grad_norm": 3.0832439673344267, "learning_rate": 3.5089513780803607e-07, "loss": 1.9258, "step": 18840 }, { "epoch": 2.77321123186407, "grad_norm": 3.019565215034097, "learning_rate": 3.464140674049954e-07, "loss": 1.9153, "step": 18850 }, { "epoch": 2.7746823339033853, "grad_norm": 3.2089836647741588, "learning_rate": 3.4196128933982056e-07, "loss": 1.8938, "step": 18860 }, { "epoch": 2.7761534359427005, "grad_norm": 3.1982126365402515, "learning_rate": 3.3753681666253147e-07, "loss": 1.8989, "step": 18870 }, { "epoch": 2.7776245379820157, "grad_norm": 3.389619696977613, "learning_rate": 3.331406623401945e-07, "loss": 1.9232, "step": 18880 }, { "epoch": 2.779095640021331, "grad_norm": 3.3169573075454712, "learning_rate": 3.2877283925687786e-07, "loss": 1.8634, "step": 18890 }, { "epoch": 2.780566742060646, "grad_norm": 3.4955507476534464, "learning_rate": 3.244333602136196e-07, "loss": 1.8958, "step": 18900 }, { "epoch": 2.7820378440999614, "grad_norm": 3.4896967736180025, "learning_rate": 3.2012223792839305e-07, "loss": 1.8913, "step": 18910 }, { "epoch": 2.7835089461392766, "grad_norm": 3.2480851324950852, "learning_rate": 3.1583948503605686e-07, "loss": 1.9707, "step": 18920 }, { "epoch": 2.784980048178592, "grad_norm": 3.220582437408986, "learning_rate": 3.1158511408833083e-07, "loss": 1.9279, "step": 18930 }, { "epoch": 2.786451150217907, "grad_norm": 3.238961129041908, "learning_rate": 3.0735913755375103e-07, "loss": 1.8959, "step": 18940 }, { "epoch": 2.7879222522572222, "grad_norm": 3.316772104847186, "learning_rate": 3.031615678176425e-07, "loss": 1.9041, "step": 18950 }, { "epoch": 2.7893933542965375, "grad_norm": 2.90863551143816, "learning_rate": 2.9899241718206575e-07, "loss": 1.9217, "step": 18960 }, { "epoch": 2.7908644563358527, "grad_norm": 2.8160804538082695, "learning_rate": 2.9485169786580223e-07, "loss": 1.8983, "step": 18970 }, { "epoch": 2.792335558375168, "grad_norm": 3.157770484990326, "learning_rate": 2.907394220043047e-07, "loss": 1.8939, "step": 18980 }, { "epoch": 2.793806660414483, "grad_norm": 3.3288210978024653, "learning_rate": 2.866556016496602e-07, "loss": 1.9036, "step": 18990 }, { "epoch": 2.7952777624537983, "grad_norm": 3.341837351728601, "learning_rate": 2.826002487705692e-07, "loss": 1.9092, "step": 19000 }, { "epoch": 2.7952777624537983, "eval_loss": 2.370347023010254, "eval_runtime": 635.3714, "eval_samples_per_second": 144.15, "eval_steps_per_second": 4.506, "step": 19000 }, { "epoch": 2.7967488644931136, "grad_norm": 3.266721961558169, "learning_rate": 2.785733752522912e-07, "loss": 1.8686, "step": 19010 }, { "epoch": 2.7982199665324288, "grad_norm": 3.370966740538115, "learning_rate": 2.7457499289662573e-07, "loss": 1.928, "step": 19020 }, { "epoch": 2.799691068571744, "grad_norm": 2.9281577771527494, "learning_rate": 2.706051134218679e-07, "loss": 1.9282, "step": 19030 }, { "epoch": 2.801162170611059, "grad_norm": 2.8701755747475666, "learning_rate": 2.666637484627832e-07, "loss": 1.9145, "step": 19040 }, { "epoch": 2.8026332726503744, "grad_norm": 2.990144451162258, "learning_rate": 2.627509095705583e-07, "loss": 1.9121, "step": 19050 }, { "epoch": 2.8041043746896896, "grad_norm": 3.235960045412991, "learning_rate": 2.5886660821278666e-07, "loss": 1.9934, "step": 19060 }, { "epoch": 2.805575476729005, "grad_norm": 3.243284989651956, "learning_rate": 2.550108557734199e-07, "loss": 1.9162, "step": 19070 }, { "epoch": 2.80704657876832, "grad_norm": 3.1333410765018255, "learning_rate": 2.5118366355274093e-07, "loss": 1.8636, "step": 19080 }, { "epoch": 2.808517680807635, "grad_norm": 2.8819144142987594, "learning_rate": 2.4738504276732964e-07, "loss": 1.9461, "step": 19090 }, { "epoch": 2.80998878284695, "grad_norm": 3.1456186986472296, "learning_rate": 2.436150045500274e-07, "loss": 1.9145, "step": 19100 }, { "epoch": 2.8114598848862653, "grad_norm": 3.063851686997199, "learning_rate": 2.3987355994991355e-07, "loss": 1.8579, "step": 19110 }, { "epoch": 2.8129309869255805, "grad_norm": 2.9414334427341613, "learning_rate": 2.3616071993225797e-07, "loss": 1.9105, "step": 19120 }, { "epoch": 2.8144020889648957, "grad_norm": 3.286279572838423, "learning_rate": 2.3247649537850413e-07, "loss": 1.8895, "step": 19130 }, { "epoch": 2.815873191004211, "grad_norm": 3.4911486203180115, "learning_rate": 2.28820897086226e-07, "loss": 1.9001, "step": 19140 }, { "epoch": 2.817344293043526, "grad_norm": 3.118668408364586, "learning_rate": 2.2519393576910576e-07, "loss": 1.901, "step": 19150 }, { "epoch": 2.8188153950828414, "grad_norm": 3.2378894605790967, "learning_rate": 2.2159562205689044e-07, "loss": 1.8983, "step": 19160 }, { "epoch": 2.8202864971221566, "grad_norm": 3.129498234527386, "learning_rate": 2.1802596649537322e-07, "loss": 1.9051, "step": 19170 }, { "epoch": 2.821757599161472, "grad_norm": 3.0785432987068098, "learning_rate": 2.1448497954635773e-07, "loss": 1.9119, "step": 19180 }, { "epoch": 2.823228701200787, "grad_norm": 3.264068054106131, "learning_rate": 2.1097267158762147e-07, "loss": 1.9054, "step": 19190 }, { "epoch": 2.8246998032401023, "grad_norm": 3.2302976603090814, "learning_rate": 2.07489052912897e-07, "loss": 1.9157, "step": 19200 }, { "epoch": 2.8261709052794175, "grad_norm": 2.8617521716798358, "learning_rate": 2.040341337318319e-07, "loss": 1.9184, "step": 19210 }, { "epoch": 2.8276420073187327, "grad_norm": 3.359733183307331, "learning_rate": 2.0060792416996212e-07, "loss": 1.8808, "step": 19220 }, { "epoch": 2.829113109358048, "grad_norm": 3.4567818664221233, "learning_rate": 1.9721043426868313e-07, "loss": 1.8742, "step": 19230 }, { "epoch": 2.830584211397363, "grad_norm": 3.4749699735046966, "learning_rate": 1.9384167398522336e-07, "loss": 1.9323, "step": 19240 }, { "epoch": 2.8320553134366784, "grad_norm": 3.288311167014025, "learning_rate": 1.9050165319260516e-07, "loss": 1.851, "step": 19250 }, { "epoch": 2.8335264154759936, "grad_norm": 3.272538945973311, "learning_rate": 1.8719038167962832e-07, "loss": 1.8781, "step": 19260 }, { "epoch": 2.834997517515309, "grad_norm": 3.137406589362868, "learning_rate": 1.8390786915083114e-07, "loss": 1.9176, "step": 19270 }, { "epoch": 2.8364686195546236, "grad_norm": 3.5226449482445346, "learning_rate": 1.806541252264704e-07, "loss": 1.8975, "step": 19280 }, { "epoch": 2.837939721593939, "grad_norm": 3.1350514062680075, "learning_rate": 1.7742915944248484e-07, "loss": 1.8991, "step": 19290 }, { "epoch": 2.839410823633254, "grad_norm": 2.826684809690263, "learning_rate": 1.7423298125047395e-07, "loss": 1.9073, "step": 19300 }, { "epoch": 2.8408819256725693, "grad_norm": 3.2276796167960815, "learning_rate": 1.7106560001766692e-07, "loss": 1.8996, "step": 19310 }, { "epoch": 2.8423530277118845, "grad_norm": 3.222274824882022, "learning_rate": 1.6792702502689605e-07, "loss": 1.9149, "step": 19320 }, { "epoch": 2.8438241297511997, "grad_norm": 3.094686494584861, "learning_rate": 1.6481726547657228e-07, "loss": 1.9094, "step": 19330 }, { "epoch": 2.845295231790515, "grad_norm": 3.149124830709097, "learning_rate": 1.617363304806474e-07, "loss": 1.9117, "step": 19340 }, { "epoch": 2.84676633382983, "grad_norm": 3.18183619516169, "learning_rate": 1.5868422906860525e-07, "loss": 1.9295, "step": 19350 }, { "epoch": 2.8482374358691454, "grad_norm": 3.0053926099680193, "learning_rate": 1.5566097018541726e-07, "loss": 1.8948, "step": 19360 }, { "epoch": 2.8497085379084606, "grad_norm": 2.6334978063458485, "learning_rate": 1.5266656269153024e-07, "loss": 1.8932, "step": 19370 }, { "epoch": 2.851179639947776, "grad_norm": 3.495938105577525, "learning_rate": 1.49701015362832e-07, "loss": 1.8732, "step": 19380 }, { "epoch": 2.852650741987091, "grad_norm": 2.886515000075108, "learning_rate": 1.4676433689062685e-07, "loss": 1.8863, "step": 19390 }, { "epoch": 2.8541218440264062, "grad_norm": 3.2838669869990142, "learning_rate": 1.4385653588161463e-07, "loss": 1.9182, "step": 19400 }, { "epoch": 2.8555929460657214, "grad_norm": 3.465179255442228, "learning_rate": 1.409776208578595e-07, "loss": 1.9217, "step": 19410 }, { "epoch": 2.8570640481050367, "grad_norm": 3.0556581922010286, "learning_rate": 1.3812760025676997e-07, "loss": 1.8516, "step": 19420 }, { "epoch": 2.858535150144352, "grad_norm": 3.18239439196624, "learning_rate": 1.3530648243106791e-07, "loss": 1.87, "step": 19430 }, { "epoch": 2.860006252183667, "grad_norm": 3.4113111151813147, "learning_rate": 1.3251427564877407e-07, "loss": 1.8843, "step": 19440 }, { "epoch": 2.8614773542229823, "grad_norm": 3.347884253831917, "learning_rate": 1.2975098809317242e-07, "loss": 1.8621, "step": 19450 }, { "epoch": 2.8629484562622975, "grad_norm": 3.077698474066412, "learning_rate": 1.27016627862796e-07, "loss": 1.877, "step": 19460 }, { "epoch": 2.8644195583016128, "grad_norm": 3.3046780416897787, "learning_rate": 1.2431120297139554e-07, "loss": 1.9293, "step": 19470 }, { "epoch": 2.865890660340928, "grad_norm": 3.1160978711751985, "learning_rate": 1.2163472134792186e-07, "loss": 1.8938, "step": 19480 }, { "epoch": 2.867361762380243, "grad_norm": 3.269162161125302, "learning_rate": 1.189871908364959e-07, "loss": 1.8614, "step": 19490 }, { "epoch": 2.8688328644195584, "grad_norm": 3.0790630309128404, "learning_rate": 1.163686191963953e-07, "loss": 1.8874, "step": 19500 }, { "epoch": 2.8703039664588736, "grad_norm": 3.1735674762711863, "learning_rate": 1.137790141020223e-07, "loss": 1.9168, "step": 19510 }, { "epoch": 2.871775068498189, "grad_norm": 3.0140110525659067, "learning_rate": 1.1121838314288702e-07, "loss": 1.9169, "step": 19520 }, { "epoch": 2.873246170537504, "grad_norm": 3.035510855463414, "learning_rate": 1.0868673382358308e-07, "loss": 1.9254, "step": 19530 }, { "epoch": 2.8747172725768193, "grad_norm": 3.086664197454496, "learning_rate": 1.0618407356376314e-07, "loss": 1.8967, "step": 19540 }, { "epoch": 2.8761883746161345, "grad_norm": 3.022527755772944, "learning_rate": 1.0371040969812562e-07, "loss": 1.857, "step": 19550 }, { "epoch": 2.8776594766554497, "grad_norm": 3.5037092142895228, "learning_rate": 1.0126574947638245e-07, "loss": 1.934, "step": 19560 }, { "epoch": 2.879130578694765, "grad_norm": 3.148421508846889, "learning_rate": 9.885010006324469e-08, "loss": 1.8999, "step": 19570 }, { "epoch": 2.88060168073408, "grad_norm": 3.2006304024842636, "learning_rate": 9.646346853840028e-08, "loss": 1.8867, "step": 19580 }, { "epoch": 2.882072782773395, "grad_norm": 3.3464946659170964, "learning_rate": 9.410586189649073e-08, "loss": 1.9628, "step": 19590 }, { "epoch": 2.88354388481271, "grad_norm": 3.099328447964002, "learning_rate": 9.177728704709454e-08, "loss": 1.9075, "step": 19600 }, { "epoch": 2.8850149868520254, "grad_norm": 3.089114519538958, "learning_rate": 8.947775081470488e-08, "loss": 1.8885, "step": 19610 }, { "epoch": 2.8864860888913406, "grad_norm": 2.8236452757288713, "learning_rate": 8.720725993870971e-08, "loss": 1.8814, "step": 19620 }, { "epoch": 2.887957190930656, "grad_norm": 3.1169138894685915, "learning_rate": 8.496582107336948e-08, "loss": 1.9307, "step": 19630 }, { "epoch": 2.889428292969971, "grad_norm": 3.19476275744617, "learning_rate": 8.27534407878039e-08, "loss": 1.9053, "step": 19640 }, { "epoch": 2.8908993950092863, "grad_norm": 3.4483760279709927, "learning_rate": 8.057012556596743e-08, "loss": 1.9153, "step": 19650 }, { "epoch": 2.8923704970486015, "grad_norm": 3.194827053693801, "learning_rate": 7.84158818066305e-08, "loss": 1.9316, "step": 19660 }, { "epoch": 2.8938415990879167, "grad_norm": 3.096702387651515, "learning_rate": 7.62907158233639e-08, "loss": 1.8924, "step": 19670 }, { "epoch": 2.895312701127232, "grad_norm": 3.235902797600817, "learning_rate": 7.419463384451763e-08, "loss": 1.9059, "step": 19680 }, { "epoch": 2.896783803166547, "grad_norm": 3.299594648209659, "learning_rate": 7.212764201320443e-08, "loss": 1.8706, "step": 19690 }, { "epoch": 2.8982549052058624, "grad_norm": 3.0326281244189484, "learning_rate": 7.008974638727962e-08, "loss": 1.9417, "step": 19700 }, { "epoch": 2.8997260072451776, "grad_norm": 3.279571985631714, "learning_rate": 6.80809529393256e-08, "loss": 1.9228, "step": 19710 }, { "epoch": 2.901197109284493, "grad_norm": 2.9346283027549904, "learning_rate": 6.610126755663082e-08, "loss": 1.8997, "step": 19720 }, { "epoch": 2.902668211323808, "grad_norm": 3.357131783931639, "learning_rate": 6.415069604117974e-08, "loss": 1.8802, "step": 19730 }, { "epoch": 2.9041393133631233, "grad_norm": 3.2216799123546096, "learning_rate": 6.222924410962505e-08, "loss": 1.9132, "step": 19740 }, { "epoch": 2.9056104154024385, "grad_norm": 2.965808327196904, "learning_rate": 6.033691739328107e-08, "loss": 1.931, "step": 19750 }, { "epoch": 2.9070815174417537, "grad_norm": 3.26880519517392, "learning_rate": 5.84737214381037e-08, "loss": 1.9408, "step": 19760 }, { "epoch": 2.908552619481069, "grad_norm": 3.696753629856464, "learning_rate": 5.6639661704671614e-08, "loss": 1.9042, "step": 19770 }, { "epoch": 2.9100237215203837, "grad_norm": 2.99345629548633, "learning_rate": 5.4834743568170645e-08, "loss": 1.9152, "step": 19780 }, { "epoch": 2.911494823559699, "grad_norm": 3.4361084999284257, "learning_rate": 5.305897231838386e-08, "loss": 1.8983, "step": 19790 }, { "epoch": 2.912965925599014, "grad_norm": 3.4224027830652064, "learning_rate": 5.1312353159668205e-08, "loss": 1.9231, "step": 19800 }, { "epoch": 2.9144370276383293, "grad_norm": 3.2951751991272618, "learning_rate": 4.959489121094452e-08, "loss": 1.9018, "step": 19810 }, { "epoch": 2.9159081296776446, "grad_norm": 3.076483966696931, "learning_rate": 4.7906591505680885e-08, "loss": 1.8967, "step": 19820 }, { "epoch": 2.91737923171696, "grad_norm": 2.827326401375073, "learning_rate": 4.624745899187821e-08, "loss": 1.9089, "step": 19830 }, { "epoch": 2.918850333756275, "grad_norm": 3.191416747719613, "learning_rate": 4.461749853205244e-08, "loss": 1.9263, "step": 19840 }, { "epoch": 2.9203214357955902, "grad_norm": 3.0948673275899226, "learning_rate": 4.3016714903229006e-08, "loss": 1.9326, "step": 19850 }, { "epoch": 2.9217925378349054, "grad_norm": 3.3221975202120397, "learning_rate": 4.144511279691621e-08, "loss": 1.9228, "step": 19860 }, { "epoch": 2.9232636398742207, "grad_norm": 3.0894225480437068, "learning_rate": 3.990269681910297e-08, "loss": 1.89, "step": 19870 }, { "epoch": 2.924734741913536, "grad_norm": 2.9469864577997527, "learning_rate": 3.838947149024108e-08, "loss": 1.8989, "step": 19880 }, { "epoch": 2.926205843952851, "grad_norm": 3.148151047617243, "learning_rate": 3.690544124522744e-08, "loss": 1.9463, "step": 19890 }, { "epoch": 2.9276769459921663, "grad_norm": 2.8341475182340377, "learning_rate": 3.545061043339848e-08, "loss": 1.9077, "step": 19900 }, { "epoch": 2.9291480480314815, "grad_norm": 3.19449546350283, "learning_rate": 3.402498331851467e-08, "loss": 1.9191, "step": 19910 }, { "epoch": 2.9306191500707968, "grad_norm": 3.1286108365246963, "learning_rate": 3.2628564078743816e-08, "loss": 1.8735, "step": 19920 }, { "epoch": 2.932090252110112, "grad_norm": 3.183972596943143, "learning_rate": 3.126135680665665e-08, "loss": 1.886, "step": 19930 }, { "epoch": 2.933561354149427, "grad_norm": 3.1526471749687746, "learning_rate": 2.992336550920682e-08, "loss": 1.8893, "step": 19940 }, { "epoch": 2.9350324561887424, "grad_norm": 3.4625080442384375, "learning_rate": 2.86145941077276e-08, "loss": 1.8941, "step": 19950 }, { "epoch": 2.9365035582280576, "grad_norm": 3.48716430775079, "learning_rate": 2.733504643791074e-08, "loss": 1.9105, "step": 19960 }, { "epoch": 2.937974660267373, "grad_norm": 3.2558895029346724, "learning_rate": 2.6084726249804294e-08, "loss": 1.9018, "step": 19970 }, { "epoch": 2.939445762306688, "grad_norm": 3.2399651719125813, "learning_rate": 2.4863637207793723e-08, "loss": 1.8919, "step": 19980 }, { "epoch": 2.9409168643460033, "grad_norm": 3.02241554099019, "learning_rate": 2.3671782890599682e-08, "loss": 1.8694, "step": 19990 }, { "epoch": 2.9423879663853185, "grad_norm": 3.188573260812069, "learning_rate": 2.250916679125914e-08, "loss": 1.8372, "step": 20000 }, { "epoch": 2.9423879663853185, "eval_loss": 2.3699159622192383, "eval_runtime": 345.7941, "eval_samples_per_second": 264.866, "eval_steps_per_second": 8.279, "step": 20000 }, { "epoch": 2.9438590684246337, "grad_norm": 3.0550667149257946, "learning_rate": 2.1375792317119837e-08, "loss": 1.8792, "step": 20010 }, { "epoch": 2.945330170463949, "grad_norm": 3.1006762610657606, "learning_rate": 2.027166278983028e-08, "loss": 1.9191, "step": 20020 }, { "epoch": 2.946801272503264, "grad_norm": 3.5116396288689566, "learning_rate": 1.919678144532866e-08, "loss": 1.9294, "step": 20030 }, { "epoch": 2.9482723745425794, "grad_norm": 3.406392232292829, "learning_rate": 1.8151151433833947e-08, "loss": 1.8863, "step": 20040 }, { "epoch": 2.9497434765818946, "grad_norm": 3.4189010532308957, "learning_rate": 1.7134775819834804e-08, "loss": 1.9184, "step": 20050 }, { "epoch": 2.95121457862121, "grad_norm": 3.266936548228511, "learning_rate": 1.614765758208514e-08, "loss": 1.9078, "step": 20060 }, { "epoch": 2.952685680660525, "grad_norm": 3.53549964712295, "learning_rate": 1.5189799613589685e-08, "loss": 1.9164, "step": 20070 }, { "epoch": 2.9541567826998403, "grad_norm": 3.486399243245059, "learning_rate": 1.426120472160064e-08, "loss": 1.9025, "step": 20080 }, { "epoch": 2.955627884739155, "grad_norm": 2.7680373578939768, "learning_rate": 1.3361875627605491e-08, "loss": 1.8919, "step": 20090 }, { "epoch": 2.9570989867784703, "grad_norm": 3.0395334843845276, "learning_rate": 1.2491814967322547e-08, "loss": 1.9146, "step": 20100 }, { "epoch": 2.9585700888177855, "grad_norm": 3.278574030698983, "learning_rate": 1.1651025290688733e-08, "loss": 1.903, "step": 20110 }, { "epoch": 2.9600411908571007, "grad_norm": 3.2677813296136806, "learning_rate": 1.0839509061857379e-08, "loss": 1.9406, "step": 20120 }, { "epoch": 2.961512292896416, "grad_norm": 3.147148290612745, "learning_rate": 1.0057268659187102e-08, "loss": 1.9182, "step": 20130 }, { "epoch": 2.962983394935731, "grad_norm": 2.845917898388034, "learning_rate": 9.304306375236272e-09, "loss": 1.936, "step": 20140 }, { "epoch": 2.9644544969750464, "grad_norm": 3.1371514521278323, "learning_rate": 8.580624416757444e-09, "loss": 1.8761, "step": 20150 }, { "epoch": 2.9659255990143616, "grad_norm": 3.1086560186691874, "learning_rate": 7.886224904685158e-09, "loss": 1.9107, "step": 20160 }, { "epoch": 2.967396701053677, "grad_norm": 3.229566546822201, "learning_rate": 7.221109874140375e-09, "loss": 1.9254, "step": 20170 }, { "epoch": 2.968867803092992, "grad_norm": 3.201446515701826, "learning_rate": 6.585281274413824e-09, "loss": 1.894, "step": 20180 }, { "epoch": 2.9703389051323073, "grad_norm": 3.427567051776931, "learning_rate": 5.978740968967112e-09, "loss": 1.89, "step": 20190 }, { "epoch": 2.9718100071716225, "grad_norm": 3.002016828884272, "learning_rate": 5.401490735422732e-09, "loss": 1.9335, "step": 20200 }, { "epoch": 2.9732811092109377, "grad_norm": 3.0861974238708925, "learning_rate": 4.853532265562955e-09, "loss": 1.9214, "step": 20210 }, { "epoch": 2.974752211250253, "grad_norm": 3.766406692071242, "learning_rate": 4.3348671653220545e-09, "loss": 1.9373, "step": 20220 }, { "epoch": 2.976223313289568, "grad_norm": 2.878461337915158, "learning_rate": 3.845496954782979e-09, "loss": 1.8589, "step": 20230 }, { "epoch": 2.9776944153288833, "grad_norm": 3.2089383751612246, "learning_rate": 3.3854230681718005e-09, "loss": 1.9149, "step": 20240 }, { "epoch": 2.9791655173681986, "grad_norm": 2.7636183013817908, "learning_rate": 2.9546468538532713e-09, "loss": 1.8826, "step": 20250 }, { "epoch": 2.980636619407514, "grad_norm": 3.369600292651696, "learning_rate": 2.553169574330827e-09, "loss": 1.9318, "step": 20260 }, { "epoch": 2.982107721446829, "grad_norm": 3.084292390780035, "learning_rate": 2.180992406237703e-09, "loss": 1.9477, "step": 20270 }, { "epoch": 2.983578823486144, "grad_norm": 3.6743768646555286, "learning_rate": 1.8381164403336037e-09, "loss": 1.877, "step": 20280 }, { "epoch": 2.985049925525459, "grad_norm": 3.148716878150122, "learning_rate": 1.5245426815069243e-09, "loss": 1.8868, "step": 20290 }, { "epoch": 2.9865210275647742, "grad_norm": 3.125200006637892, "learning_rate": 1.2402720487680875e-09, "loss": 1.8751, "step": 20300 }, { "epoch": 2.9879921296040894, "grad_norm": 2.9860191355985526, "learning_rate": 9.853053752428844e-10, "loss": 1.9006, "step": 20310 }, { "epoch": 2.9894632316434047, "grad_norm": 3.2555494412639807, "learning_rate": 7.596434081791337e-10, "loss": 1.8643, "step": 20320 }, { "epoch": 2.99093433368272, "grad_norm": 3.1110804587742433, "learning_rate": 5.63286808938912e-10, "loss": 1.8808, "step": 20330 }, { "epoch": 2.992405435722035, "grad_norm": 3.311129640253521, "learning_rate": 3.962361529941117e-10, "loss": 1.8743, "step": 20340 }, { "epoch": 2.9938765377613503, "grad_norm": 2.9009700937516127, "learning_rate": 2.584919299308819e-10, "loss": 1.8845, "step": 20350 }, { "epoch": 2.9953476398006655, "grad_norm": 3.5141995415283622, "learning_rate": 1.500545434440781e-10, "loss": 1.9369, "step": 20360 }, { "epoch": 2.9968187418399808, "grad_norm": 3.2499174592461544, "learning_rate": 7.092431133726152e-11, "loss": 1.896, "step": 20370 }, { "epoch": 2.998289843879296, "grad_norm": 3.2623930062470685, "learning_rate": 2.110146552380954e-11, "loss": 1.9042, "step": 20380 }, { "epoch": 2.999760945918611, "grad_norm": 3.4086324334077345, "learning_rate": 5.861520202543603e-13, "loss": 1.9238, "step": 20390 }, { "epoch": 2.9999080561225426, "step": 20391, "total_flos": 1019423904694272.0, "train_loss": 1.16662765904398, "train_runtime": 43868.1517, "train_samples_per_second": 119.006, "train_steps_per_second": 0.465 } ], "logging_steps": 10, "max_steps": 20391, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3000, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1019423904694272.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }