{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 25314, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011851851851851852, "grad_norm": 16.64023253730519, "learning_rate": 3.5545023696682464e-08, "loss": 1.3476, "step": 10 }, { "epoch": 0.0023703703703703703, "grad_norm": 16.002172660344485, "learning_rate": 7.50394944707741e-08, "loss": 1.3427, "step": 20 }, { "epoch": 0.0035555555555555557, "grad_norm": 15.506725945818424, "learning_rate": 1.1453396524486573e-07, "loss": 1.3183, "step": 30 }, { "epoch": 0.004740740740740741, "grad_norm": 16.92291845738296, "learning_rate": 1.5402843601895734e-07, "loss": 1.3252, "step": 40 }, { "epoch": 0.005925925925925926, "grad_norm": 14.292359902439756, "learning_rate": 1.93522906793049e-07, "loss": 1.2769, "step": 50 }, { "epoch": 0.0071111111111111115, "grad_norm": 14.435536849846146, "learning_rate": 2.3301737756714062e-07, "loss": 1.2532, "step": 60 }, { "epoch": 0.008296296296296296, "grad_norm": 12.052509163020323, "learning_rate": 2.7251184834123223e-07, "loss": 1.1041, "step": 70 }, { "epoch": 0.009481481481481481, "grad_norm": 12.68662755756317, "learning_rate": 3.1200631911532384e-07, "loss": 1.0185, "step": 80 }, { "epoch": 0.010666666666666666, "grad_norm": 6.810926008102391, "learning_rate": 3.515007898894155e-07, "loss": 0.7842, "step": 90 }, { "epoch": 0.011851851851851851, "grad_norm": 3.355654756862213, "learning_rate": 3.9099526066350717e-07, "loss": 0.6063, "step": 100 }, { "epoch": 0.013037037037037036, "grad_norm": 3.633581940082918, "learning_rate": 4.304897314375988e-07, "loss": 0.5658, "step": 110 }, { "epoch": 0.014222222222222223, "grad_norm": 2.6701078724792904, "learning_rate": 4.699842022116904e-07, "loss": 0.5108, "step": 120 }, { "epoch": 0.015407407407407408, "grad_norm": 2.62643529952277, "learning_rate": 5.09478672985782e-07, "loss": 0.4698, "step": 130 }, { "epoch": 0.016592592592592593, "grad_norm": 2.6826669411135797, "learning_rate": 5.489731437598736e-07, "loss": 0.4291, "step": 140 }, { "epoch": 0.017777777777777778, "grad_norm": 2.355112607468174, "learning_rate": 5.884676145339653e-07, "loss": 0.3999, "step": 150 }, { "epoch": 0.018962962962962963, "grad_norm": 2.4239938265335796, "learning_rate": 6.279620853080568e-07, "loss": 0.3843, "step": 160 }, { "epoch": 0.020148148148148148, "grad_norm": 2.5663214184292253, "learning_rate": 6.674565560821486e-07, "loss": 0.3748, "step": 170 }, { "epoch": 0.021333333333333333, "grad_norm": 2.1224331420495797, "learning_rate": 7.069510268562402e-07, "loss": 0.3596, "step": 180 }, { "epoch": 0.022518518518518518, "grad_norm": 2.071802554656357, "learning_rate": 7.464454976303318e-07, "loss": 0.3474, "step": 190 }, { "epoch": 0.023703703703703703, "grad_norm": 2.0743830333800615, "learning_rate": 7.859399684044235e-07, "loss": 0.3394, "step": 200 }, { "epoch": 0.024888888888888887, "grad_norm": 2.4818874394855936, "learning_rate": 8.25434439178515e-07, "loss": 0.3304, "step": 210 }, { "epoch": 0.026074074074074072, "grad_norm": 2.30314973212301, "learning_rate": 8.649289099526067e-07, "loss": 0.3142, "step": 220 }, { "epoch": 0.02725925925925926, "grad_norm": 2.9283982412163843, "learning_rate": 9.044233807266983e-07, "loss": 0.3068, "step": 230 }, { "epoch": 0.028444444444444446, "grad_norm": 2.5348809727151957, "learning_rate": 9.4391785150079e-07, "loss": 0.2993, "step": 240 }, { "epoch": 0.02962962962962963, "grad_norm": 2.502806735266231, "learning_rate": 9.834123222748817e-07, "loss": 0.2972, "step": 250 }, { "epoch": 0.030814814814814816, "grad_norm": 2.4575479996591376, "learning_rate": 1.0229067930489733e-06, "loss": 0.2997, "step": 260 }, { "epoch": 0.032, "grad_norm": 3.0224786024371637, "learning_rate": 1.0624012638230649e-06, "loss": 0.3, "step": 270 }, { "epoch": 0.033185185185185186, "grad_norm": 2.3998759346501, "learning_rate": 1.1018957345971565e-06, "loss": 0.2874, "step": 280 }, { "epoch": 0.03437037037037037, "grad_norm": 2.8989319371077937, "learning_rate": 1.1413902053712481e-06, "loss": 0.2792, "step": 290 }, { "epoch": 0.035555555555555556, "grad_norm": 2.977853180171724, "learning_rate": 1.1808846761453397e-06, "loss": 0.2738, "step": 300 }, { "epoch": 0.03674074074074074, "grad_norm": 2.6417383181651575, "learning_rate": 1.2203791469194313e-06, "loss": 0.2677, "step": 310 }, { "epoch": 0.037925925925925925, "grad_norm": 2.728345528086855, "learning_rate": 1.2598736176935232e-06, "loss": 0.274, "step": 320 }, { "epoch": 0.03911111111111111, "grad_norm": 3.309747933278777, "learning_rate": 1.2993680884676146e-06, "loss": 0.2595, "step": 330 }, { "epoch": 0.040296296296296295, "grad_norm": 2.8164913457778065, "learning_rate": 1.3388625592417062e-06, "loss": 0.2646, "step": 340 }, { "epoch": 0.04148148148148148, "grad_norm": 2.1205658345135885, "learning_rate": 1.3783570300157978e-06, "loss": 0.2613, "step": 350 }, { "epoch": 0.042666666666666665, "grad_norm": 2.8823508159607307, "learning_rate": 1.4178515007898896e-06, "loss": 0.2506, "step": 360 }, { "epoch": 0.04385185185185185, "grad_norm": 2.4601971464607777, "learning_rate": 1.4573459715639812e-06, "loss": 0.2585, "step": 370 }, { "epoch": 0.045037037037037035, "grad_norm": 2.4232969328222387, "learning_rate": 1.4968404423380728e-06, "loss": 0.2534, "step": 380 }, { "epoch": 0.04622222222222222, "grad_norm": 2.937713487379996, "learning_rate": 1.5363349131121644e-06, "loss": 0.2452, "step": 390 }, { "epoch": 0.047407407407407405, "grad_norm": 2.2762489237517314, "learning_rate": 1.5758293838862558e-06, "loss": 0.2448, "step": 400 }, { "epoch": 0.04859259259259259, "grad_norm": 2.4510732920620466, "learning_rate": 1.6153238546603479e-06, "loss": 0.2446, "step": 410 }, { "epoch": 0.049777777777777775, "grad_norm": 2.2204344270714134, "learning_rate": 1.6548183254344393e-06, "loss": 0.236, "step": 420 }, { "epoch": 0.05096296296296296, "grad_norm": 2.6698538935764695, "learning_rate": 1.694312796208531e-06, "loss": 0.2339, "step": 430 }, { "epoch": 0.052148148148148145, "grad_norm": 2.267477161633137, "learning_rate": 1.7338072669826225e-06, "loss": 0.2427, "step": 440 }, { "epoch": 0.05333333333333334, "grad_norm": 2.596053948214697, "learning_rate": 1.7733017377567141e-06, "loss": 0.2286, "step": 450 }, { "epoch": 0.05451851851851852, "grad_norm": 2.3770694649836237, "learning_rate": 1.812796208530806e-06, "loss": 0.2357, "step": 460 }, { "epoch": 0.05570370370370371, "grad_norm": 2.2607282274650626, "learning_rate": 1.8522906793048976e-06, "loss": 0.2398, "step": 470 }, { "epoch": 0.05688888888888889, "grad_norm": 2.904477627992424, "learning_rate": 1.8917851500789892e-06, "loss": 0.2229, "step": 480 }, { "epoch": 0.05807407407407408, "grad_norm": 2.619241089181291, "learning_rate": 1.9312796208530806e-06, "loss": 0.2302, "step": 490 }, { "epoch": 0.05925925925925926, "grad_norm": 2.058787571644914, "learning_rate": 1.9707740916271724e-06, "loss": 0.2252, "step": 500 }, { "epoch": 0.060444444444444446, "grad_norm": 2.221434893935856, "learning_rate": 2.0102685624012642e-06, "loss": 0.2219, "step": 510 }, { "epoch": 0.06162962962962963, "grad_norm": 3.0717598858918174, "learning_rate": 2.0497630331753556e-06, "loss": 0.2147, "step": 520 }, { "epoch": 0.06281481481481481, "grad_norm": 2.2735367678402123, "learning_rate": 2.0892575039494474e-06, "loss": 0.2165, "step": 530 }, { "epoch": 0.064, "grad_norm": 2.1138430179189736, "learning_rate": 2.128751974723539e-06, "loss": 0.225, "step": 540 }, { "epoch": 0.06518518518518518, "grad_norm": 2.107474761283569, "learning_rate": 2.1682464454976302e-06, "loss": 0.2102, "step": 550 }, { "epoch": 0.06637037037037037, "grad_norm": 2.1363084310333305, "learning_rate": 2.207740916271722e-06, "loss": 0.2074, "step": 560 }, { "epoch": 0.06755555555555555, "grad_norm": 2.2676901135162533, "learning_rate": 2.247235387045814e-06, "loss": 0.1997, "step": 570 }, { "epoch": 0.06874074074074074, "grad_norm": 1.893328201073285, "learning_rate": 2.2867298578199053e-06, "loss": 0.2189, "step": 580 }, { "epoch": 0.06992592592592592, "grad_norm": 2.208257761729857, "learning_rate": 2.326224328593997e-06, "loss": 0.2095, "step": 590 }, { "epoch": 0.07111111111111111, "grad_norm": 2.1481277137516543, "learning_rate": 2.3657187993680885e-06, "loss": 0.2124, "step": 600 }, { "epoch": 0.0722962962962963, "grad_norm": 2.281329003543564, "learning_rate": 2.4052132701421803e-06, "loss": 0.2073, "step": 610 }, { "epoch": 0.07348148148148148, "grad_norm": 1.969363240524975, "learning_rate": 2.444707740916272e-06, "loss": 0.2113, "step": 620 }, { "epoch": 0.07466666666666667, "grad_norm": 1.9322228994582784, "learning_rate": 2.4842022116903636e-06, "loss": 0.2064, "step": 630 }, { "epoch": 0.07585185185185185, "grad_norm": 2.3569096925956483, "learning_rate": 2.5236966824644554e-06, "loss": 0.1946, "step": 640 }, { "epoch": 0.07703703703703704, "grad_norm": 2.0544195464466735, "learning_rate": 2.5631911532385468e-06, "loss": 0.1905, "step": 650 }, { "epoch": 0.07822222222222222, "grad_norm": 1.872188884004776, "learning_rate": 2.6026856240126386e-06, "loss": 0.201, "step": 660 }, { "epoch": 0.07940740740740741, "grad_norm": 2.11573615376644, "learning_rate": 2.64218009478673e-06, "loss": 0.1893, "step": 670 }, { "epoch": 0.08059259259259259, "grad_norm": 1.8642448049772182, "learning_rate": 2.681674565560822e-06, "loss": 0.1908, "step": 680 }, { "epoch": 0.08177777777777778, "grad_norm": 2.2303010127242233, "learning_rate": 2.7211690363349137e-06, "loss": 0.2022, "step": 690 }, { "epoch": 0.08296296296296296, "grad_norm": 1.8511356928408458, "learning_rate": 2.760663507109005e-06, "loss": 0.2021, "step": 700 }, { "epoch": 0.08414814814814815, "grad_norm": 1.794649953795861, "learning_rate": 2.800157977883097e-06, "loss": 0.1906, "step": 710 }, { "epoch": 0.08533333333333333, "grad_norm": 1.9641261846944857, "learning_rate": 2.839652448657188e-06, "loss": 0.1941, "step": 720 }, { "epoch": 0.08651851851851852, "grad_norm": 1.950674535791457, "learning_rate": 2.8791469194312797e-06, "loss": 0.1939, "step": 730 }, { "epoch": 0.0877037037037037, "grad_norm": 1.8244665847120771, "learning_rate": 2.918641390205372e-06, "loss": 0.1854, "step": 740 }, { "epoch": 0.08888888888888889, "grad_norm": 1.7697939041359319, "learning_rate": 2.958135860979463e-06, "loss": 0.1954, "step": 750 }, { "epoch": 0.09007407407407407, "grad_norm": 2.0286467559855352, "learning_rate": 2.9976303317535547e-06, "loss": 0.1789, "step": 760 }, { "epoch": 0.09125925925925926, "grad_norm": 1.5836584212702782, "learning_rate": 3.037124802527646e-06, "loss": 0.186, "step": 770 }, { "epoch": 0.09244444444444444, "grad_norm": 1.4971744594996326, "learning_rate": 3.076619273301738e-06, "loss": 0.1871, "step": 780 }, { "epoch": 0.09362962962962963, "grad_norm": 1.7487066886646125, "learning_rate": 3.1161137440758298e-06, "loss": 0.1871, "step": 790 }, { "epoch": 0.09481481481481481, "grad_norm": 1.569162729193313, "learning_rate": 3.155608214849921e-06, "loss": 0.1923, "step": 800 }, { "epoch": 0.096, "grad_norm": 1.623522453907034, "learning_rate": 3.195102685624013e-06, "loss": 0.1743, "step": 810 }, { "epoch": 0.09718518518518518, "grad_norm": 1.7221906617688856, "learning_rate": 3.2345971563981044e-06, "loss": 0.1845, "step": 820 }, { "epoch": 0.09837037037037037, "grad_norm": 1.661367161486284, "learning_rate": 3.2740916271721962e-06, "loss": 0.1817, "step": 830 }, { "epoch": 0.09955555555555555, "grad_norm": 1.7890071283233029, "learning_rate": 3.313586097946288e-06, "loss": 0.1918, "step": 840 }, { "epoch": 0.10074074074074074, "grad_norm": 1.6622433475069756, "learning_rate": 3.3530805687203794e-06, "loss": 0.1852, "step": 850 }, { "epoch": 0.10192592592592592, "grad_norm": 1.5056266056981837, "learning_rate": 3.3925750394944713e-06, "loss": 0.1795, "step": 860 }, { "epoch": 0.10311111111111111, "grad_norm": 1.5592611887308574, "learning_rate": 3.4320695102685627e-06, "loss": 0.1728, "step": 870 }, { "epoch": 0.10429629629629629, "grad_norm": 2.0188330296033175, "learning_rate": 3.4715639810426545e-06, "loss": 0.1824, "step": 880 }, { "epoch": 0.10548148148148148, "grad_norm": 1.6454136306594946, "learning_rate": 3.5110584518167463e-06, "loss": 0.1826, "step": 890 }, { "epoch": 0.10666666666666667, "grad_norm": 1.8182282975412345, "learning_rate": 3.5505529225908373e-06, "loss": 0.1758, "step": 900 }, { "epoch": 0.10785185185185185, "grad_norm": 1.5929448694005304, "learning_rate": 3.5900473933649295e-06, "loss": 0.1703, "step": 910 }, { "epoch": 0.10903703703703704, "grad_norm": 1.4689256663083547, "learning_rate": 3.6295418641390205e-06, "loss": 0.183, "step": 920 }, { "epoch": 0.11022222222222222, "grad_norm": 1.8805410830998628, "learning_rate": 3.6690363349131123e-06, "loss": 0.1815, "step": 930 }, { "epoch": 0.11140740740740741, "grad_norm": 1.3905979671328816, "learning_rate": 3.708530805687204e-06, "loss": 0.1713, "step": 940 }, { "epoch": 0.11259259259259259, "grad_norm": 1.6578556556364272, "learning_rate": 3.7480252764612956e-06, "loss": 0.1801, "step": 950 }, { "epoch": 0.11377777777777778, "grad_norm": 1.6044003852546482, "learning_rate": 3.7875197472353874e-06, "loss": 0.1776, "step": 960 }, { "epoch": 0.11496296296296296, "grad_norm": 1.4839235623764366, "learning_rate": 3.827014218009479e-06, "loss": 0.1712, "step": 970 }, { "epoch": 0.11614814814814815, "grad_norm": 1.5546392911649811, "learning_rate": 3.866508688783571e-06, "loss": 0.1705, "step": 980 }, { "epoch": 0.11733333333333333, "grad_norm": 1.4719840642069977, "learning_rate": 3.9060031595576624e-06, "loss": 0.1799, "step": 990 }, { "epoch": 0.11851851851851852, "grad_norm": 1.5771030943684798, "learning_rate": 3.945497630331754e-06, "loss": 0.1746, "step": 1000 }, { "epoch": 0.1197037037037037, "grad_norm": 1.5899978185236583, "learning_rate": 3.984992101105846e-06, "loss": 0.1791, "step": 1010 }, { "epoch": 0.12088888888888889, "grad_norm": 1.564653106856945, "learning_rate": 4.024486571879937e-06, "loss": 0.1798, "step": 1020 }, { "epoch": 0.12207407407407407, "grad_norm": 1.4171146569353363, "learning_rate": 4.063981042654029e-06, "loss": 0.1764, "step": 1030 }, { "epoch": 0.12325925925925926, "grad_norm": 1.5775944887678808, "learning_rate": 4.10347551342812e-06, "loss": 0.1723, "step": 1040 }, { "epoch": 0.12444444444444444, "grad_norm": 1.6191537721728222, "learning_rate": 4.142969984202212e-06, "loss": 0.1698, "step": 1050 }, { "epoch": 0.12562962962962962, "grad_norm": 1.5728409510091588, "learning_rate": 4.182464454976304e-06, "loss": 0.1757, "step": 1060 }, { "epoch": 0.12681481481481482, "grad_norm": 1.654955467094043, "learning_rate": 4.221958925750395e-06, "loss": 0.1736, "step": 1070 }, { "epoch": 0.128, "grad_norm": 1.4365683818481625, "learning_rate": 4.261453396524487e-06, "loss": 0.1735, "step": 1080 }, { "epoch": 0.12918518518518518, "grad_norm": 1.3488449107612757, "learning_rate": 4.300947867298579e-06, "loss": 0.1678, "step": 1090 }, { "epoch": 0.13037037037037036, "grad_norm": 1.5644538661334613, "learning_rate": 4.34044233807267e-06, "loss": 0.169, "step": 1100 }, { "epoch": 0.13155555555555556, "grad_norm": 1.5397382467072402, "learning_rate": 4.379936808846762e-06, "loss": 0.1842, "step": 1110 }, { "epoch": 0.13274074074074074, "grad_norm": 1.4898755866787456, "learning_rate": 4.419431279620853e-06, "loss": 0.1772, "step": 1120 }, { "epoch": 0.13392592592592592, "grad_norm": 1.6439184747645121, "learning_rate": 4.4589257503949454e-06, "loss": 0.1673, "step": 1130 }, { "epoch": 0.1351111111111111, "grad_norm": 1.7264795439164726, "learning_rate": 4.498420221169037e-06, "loss": 0.161, "step": 1140 }, { "epoch": 0.1362962962962963, "grad_norm": 1.3230022083211253, "learning_rate": 4.537914691943128e-06, "loss": 0.1655, "step": 1150 }, { "epoch": 0.13748148148148148, "grad_norm": 1.5862506442915425, "learning_rate": 4.5774091627172205e-06, "loss": 0.1678, "step": 1160 }, { "epoch": 0.13866666666666666, "grad_norm": 1.5855849177374566, "learning_rate": 4.616903633491311e-06, "loss": 0.1647, "step": 1170 }, { "epoch": 0.13985185185185184, "grad_norm": 1.358603935034976, "learning_rate": 4.656398104265403e-06, "loss": 0.1679, "step": 1180 }, { "epoch": 0.14103703703703704, "grad_norm": 1.6553313082529357, "learning_rate": 4.695892575039495e-06, "loss": 0.1572, "step": 1190 }, { "epoch": 0.14222222222222222, "grad_norm": 1.3478085969007612, "learning_rate": 4.735387045813586e-06, "loss": 0.1614, "step": 1200 }, { "epoch": 0.1434074074074074, "grad_norm": 1.2945814520336956, "learning_rate": 4.774881516587678e-06, "loss": 0.17, "step": 1210 }, { "epoch": 0.1445925925925926, "grad_norm": 1.574385234075967, "learning_rate": 4.81437598736177e-06, "loss": 0.1694, "step": 1220 }, { "epoch": 0.14577777777777778, "grad_norm": 1.2930518937611122, "learning_rate": 4.853870458135861e-06, "loss": 0.1554, "step": 1230 }, { "epoch": 0.14696296296296296, "grad_norm": 1.2904231842371265, "learning_rate": 4.8933649289099525e-06, "loss": 0.1653, "step": 1240 }, { "epoch": 0.14814814814814814, "grad_norm": 1.5730789203066062, "learning_rate": 4.932859399684045e-06, "loss": 0.1701, "step": 1250 }, { "epoch": 0.14933333333333335, "grad_norm": 1.2769542182085247, "learning_rate": 4.972353870458136e-06, "loss": 0.1676, "step": 1260 }, { "epoch": 0.15051851851851852, "grad_norm": 1.2571410788560955, "learning_rate": 5.011848341232228e-06, "loss": 0.1656, "step": 1270 }, { "epoch": 0.1517037037037037, "grad_norm": 1.3873958490581901, "learning_rate": 5.051342812006319e-06, "loss": 0.1635, "step": 1280 }, { "epoch": 0.15288888888888888, "grad_norm": 1.1159404546222491, "learning_rate": 5.090837282780411e-06, "loss": 0.1575, "step": 1290 }, { "epoch": 0.15407407407407409, "grad_norm": 1.27725876926721, "learning_rate": 5.130331753554503e-06, "loss": 0.1587, "step": 1300 }, { "epoch": 0.15525925925925926, "grad_norm": 1.5545501003527686, "learning_rate": 5.169826224328595e-06, "loss": 0.1509, "step": 1310 }, { "epoch": 0.15644444444444444, "grad_norm": 1.3114722736322373, "learning_rate": 5.209320695102686e-06, "loss": 0.1628, "step": 1320 }, { "epoch": 0.15762962962962962, "grad_norm": 1.3618760732486652, "learning_rate": 5.248815165876777e-06, "loss": 0.1663, "step": 1330 }, { "epoch": 0.15881481481481483, "grad_norm": 1.455841330090113, "learning_rate": 5.288309636650869e-06, "loss": 0.163, "step": 1340 }, { "epoch": 0.16, "grad_norm": 1.2663524840500726, "learning_rate": 5.327804107424961e-06, "loss": 0.157, "step": 1350 }, { "epoch": 0.16118518518518518, "grad_norm": 1.4394970583573017, "learning_rate": 5.367298578199053e-06, "loss": 0.1652, "step": 1360 }, { "epoch": 0.16237037037037036, "grad_norm": 1.625893248662234, "learning_rate": 5.406793048973145e-06, "loss": 0.1627, "step": 1370 }, { "epoch": 0.16355555555555557, "grad_norm": 1.246933955283402, "learning_rate": 5.4462875197472355e-06, "loss": 0.1628, "step": 1380 }, { "epoch": 0.16474074074074074, "grad_norm": 1.3894317077738516, "learning_rate": 5.485781990521327e-06, "loss": 0.1542, "step": 1390 }, { "epoch": 0.16592592592592592, "grad_norm": 1.3397970821643086, "learning_rate": 5.525276461295419e-06, "loss": 0.1534, "step": 1400 }, { "epoch": 0.1671111111111111, "grad_norm": 1.3787584428667217, "learning_rate": 5.5647709320695106e-06, "loss": 0.1606, "step": 1410 }, { "epoch": 0.1682962962962963, "grad_norm": 1.1039480094467116, "learning_rate": 5.604265402843603e-06, "loss": 0.1656, "step": 1420 }, { "epoch": 0.16948148148148148, "grad_norm": 1.5110637918564995, "learning_rate": 5.643759873617693e-06, "loss": 0.1592, "step": 1430 }, { "epoch": 0.17066666666666666, "grad_norm": 1.1598226907603657, "learning_rate": 5.683254344391786e-06, "loss": 0.1553, "step": 1440 }, { "epoch": 0.17185185185185184, "grad_norm": 1.266111525853398, "learning_rate": 5.722748815165877e-06, "loss": 0.1611, "step": 1450 }, { "epoch": 0.17303703703703704, "grad_norm": 1.4369534873980916, "learning_rate": 5.762243285939969e-06, "loss": 0.1586, "step": 1460 }, { "epoch": 0.17422222222222222, "grad_norm": 1.349225066305025, "learning_rate": 5.801737756714061e-06, "loss": 0.155, "step": 1470 }, { "epoch": 0.1754074074074074, "grad_norm": 1.3245587999340904, "learning_rate": 5.841232227488152e-06, "loss": 0.1554, "step": 1480 }, { "epoch": 0.17659259259259258, "grad_norm": 1.3818406938882097, "learning_rate": 5.8807266982622435e-06, "loss": 0.1507, "step": 1490 }, { "epoch": 0.17777777777777778, "grad_norm": 1.2162085401765121, "learning_rate": 5.920221169036336e-06, "loss": 0.1614, "step": 1500 }, { "epoch": 0.17896296296296296, "grad_norm": 1.4420588683251592, "learning_rate": 5.959715639810427e-06, "loss": 0.1512, "step": 1510 }, { "epoch": 0.18014814814814814, "grad_norm": 1.4710421203909305, "learning_rate": 5.999210110584519e-06, "loss": 0.1601, "step": 1520 }, { "epoch": 0.18133333333333335, "grad_norm": 1.4041070078830804, "learning_rate": 6.03870458135861e-06, "loss": 0.1564, "step": 1530 }, { "epoch": 0.18251851851851852, "grad_norm": 1.2990259277105651, "learning_rate": 6.078199052132701e-06, "loss": 0.1474, "step": 1540 }, { "epoch": 0.1837037037037037, "grad_norm": 1.1290654594876044, "learning_rate": 6.1176935229067936e-06, "loss": 0.1524, "step": 1550 }, { "epoch": 0.18488888888888888, "grad_norm": 1.3655761919766516, "learning_rate": 6.157187993680885e-06, "loss": 0.1516, "step": 1560 }, { "epoch": 0.1860740740740741, "grad_norm": 1.13197310444901, "learning_rate": 6.196682464454977e-06, "loss": 0.1482, "step": 1570 }, { "epoch": 0.18725925925925926, "grad_norm": 1.2967230650182364, "learning_rate": 6.236176935229068e-06, "loss": 0.1466, "step": 1580 }, { "epoch": 0.18844444444444444, "grad_norm": 1.2858221652544513, "learning_rate": 6.27567140600316e-06, "loss": 0.14, "step": 1590 }, { "epoch": 0.18962962962962962, "grad_norm": 1.4770672800010265, "learning_rate": 6.315165876777251e-06, "loss": 0.1423, "step": 1600 }, { "epoch": 0.19081481481481483, "grad_norm": 1.364540759886777, "learning_rate": 6.354660347551344e-06, "loss": 0.1573, "step": 1610 }, { "epoch": 0.192, "grad_norm": 1.15436213273915, "learning_rate": 6.394154818325435e-06, "loss": 0.1477, "step": 1620 }, { "epoch": 0.19318518518518518, "grad_norm": 1.302266257733314, "learning_rate": 6.4336492890995265e-06, "loss": 0.15, "step": 1630 }, { "epoch": 0.19437037037037036, "grad_norm": 1.359792431900611, "learning_rate": 6.473143759873618e-06, "loss": 0.1497, "step": 1640 }, { "epoch": 0.19555555555555557, "grad_norm": 1.4472703938936013, "learning_rate": 6.51263823064771e-06, "loss": 0.1451, "step": 1650 }, { "epoch": 0.19674074074074074, "grad_norm": 1.2473977675102152, "learning_rate": 6.5521327014218015e-06, "loss": 0.1492, "step": 1660 }, { "epoch": 0.19792592592592592, "grad_norm": 1.4162855282869764, "learning_rate": 6.591627172195894e-06, "loss": 0.1415, "step": 1670 }, { "epoch": 0.1991111111111111, "grad_norm": 1.3796721675173294, "learning_rate": 6.631121642969984e-06, "loss": 0.1443, "step": 1680 }, { "epoch": 0.2002962962962963, "grad_norm": 1.3461006140955094, "learning_rate": 6.6706161137440765e-06, "loss": 0.1515, "step": 1690 }, { "epoch": 0.20148148148148148, "grad_norm": 1.4963650991939834, "learning_rate": 6.710110584518168e-06, "loss": 0.147, "step": 1700 }, { "epoch": 0.20266666666666666, "grad_norm": 1.2716150586537698, "learning_rate": 6.74960505529226e-06, "loss": 0.1478, "step": 1710 }, { "epoch": 0.20385185185185184, "grad_norm": 1.6329558161465296, "learning_rate": 6.789099526066352e-06, "loss": 0.1488, "step": 1720 }, { "epoch": 0.20503703703703705, "grad_norm": 1.3662584849275496, "learning_rate": 6.828593996840442e-06, "loss": 0.149, "step": 1730 }, { "epoch": 0.20622222222222222, "grad_norm": 1.351762035054952, "learning_rate": 6.868088467614534e-06, "loss": 0.1485, "step": 1740 }, { "epoch": 0.2074074074074074, "grad_norm": 1.2917761077413694, "learning_rate": 6.907582938388626e-06, "loss": 0.1395, "step": 1750 }, { "epoch": 0.20859259259259258, "grad_norm": 1.1078583538981257, "learning_rate": 6.947077409162718e-06, "loss": 0.142, "step": 1760 }, { "epoch": 0.20977777777777779, "grad_norm": 1.1952738983680355, "learning_rate": 6.9865718799368094e-06, "loss": 0.1413, "step": 1770 }, { "epoch": 0.21096296296296296, "grad_norm": 1.2337951524972826, "learning_rate": 7.026066350710901e-06, "loss": 0.1491, "step": 1780 }, { "epoch": 0.21214814814814814, "grad_norm": 1.2214404384776358, "learning_rate": 7.065560821484992e-06, "loss": 0.146, "step": 1790 }, { "epoch": 0.21333333333333335, "grad_norm": 1.2144547821049632, "learning_rate": 7.1050552922590845e-06, "loss": 0.1471, "step": 1800 }, { "epoch": 0.21451851851851853, "grad_norm": 1.2690692704734767, "learning_rate": 7.144549763033176e-06, "loss": 0.1394, "step": 1810 }, { "epoch": 0.2157037037037037, "grad_norm": 1.2528799209121506, "learning_rate": 7.184044233807268e-06, "loss": 0.1391, "step": 1820 }, { "epoch": 0.21688888888888888, "grad_norm": 1.088421676313826, "learning_rate": 7.223538704581359e-06, "loss": 0.1376, "step": 1830 }, { "epoch": 0.2180740740740741, "grad_norm": 1.2532401792769972, "learning_rate": 7.263033175355451e-06, "loss": 0.1387, "step": 1840 }, { "epoch": 0.21925925925925926, "grad_norm": 1.171126057113268, "learning_rate": 7.302527646129542e-06, "loss": 0.1457, "step": 1850 }, { "epoch": 0.22044444444444444, "grad_norm": 1.1517859213369401, "learning_rate": 7.342022116903635e-06, "loss": 0.139, "step": 1860 }, { "epoch": 0.22162962962962962, "grad_norm": 1.4497812790100033, "learning_rate": 7.381516587677726e-06, "loss": 0.1452, "step": 1870 }, { "epoch": 0.22281481481481483, "grad_norm": 1.3681419086556943, "learning_rate": 7.4210110584518165e-06, "loss": 0.1338, "step": 1880 }, { "epoch": 0.224, "grad_norm": 1.239630120430343, "learning_rate": 7.460505529225909e-06, "loss": 0.1444, "step": 1890 }, { "epoch": 0.22518518518518518, "grad_norm": 1.2751504568085517, "learning_rate": 7.500000000000001e-06, "loss": 0.152, "step": 1900 }, { "epoch": 0.22637037037037036, "grad_norm": 1.3398101737423465, "learning_rate": 7.5394944707740924e-06, "loss": 0.1416, "step": 1910 }, { "epoch": 0.22755555555555557, "grad_norm": 1.30370206995981, "learning_rate": 7.578988941548185e-06, "loss": 0.1381, "step": 1920 }, { "epoch": 0.22874074074074074, "grad_norm": 1.4307225892256046, "learning_rate": 7.618483412322275e-06, "loss": 0.1364, "step": 1930 }, { "epoch": 0.22992592592592592, "grad_norm": 1.3831585238138127, "learning_rate": 7.657977883096367e-06, "loss": 0.1349, "step": 1940 }, { "epoch": 0.2311111111111111, "grad_norm": 1.1393824594697461, "learning_rate": 7.697472353870459e-06, "loss": 0.1451, "step": 1950 }, { "epoch": 0.2322962962962963, "grad_norm": 1.3942292177077196, "learning_rate": 7.736966824644551e-06, "loss": 0.1367, "step": 1960 }, { "epoch": 0.23348148148148148, "grad_norm": 1.2433683525066, "learning_rate": 7.776461295418642e-06, "loss": 0.15, "step": 1970 }, { "epoch": 0.23466666666666666, "grad_norm": 1.306693696683265, "learning_rate": 7.815955766192734e-06, "loss": 0.1344, "step": 1980 }, { "epoch": 0.23585185185185184, "grad_norm": 1.15389110039777, "learning_rate": 7.855450236966824e-06, "loss": 0.143, "step": 1990 }, { "epoch": 0.23703703703703705, "grad_norm": 1.249327703738097, "learning_rate": 7.894944707740917e-06, "loss": 0.1326, "step": 2000 }, { "epoch": 0.23822222222222222, "grad_norm": 1.4396758038789688, "learning_rate": 7.934439178515009e-06, "loss": 0.1336, "step": 2010 }, { "epoch": 0.2394074074074074, "grad_norm": 1.0244329804035475, "learning_rate": 7.973933649289101e-06, "loss": 0.1494, "step": 2020 }, { "epoch": 0.24059259259259258, "grad_norm": 1.3655168966767948, "learning_rate": 8.013428120063192e-06, "loss": 0.1458, "step": 2030 }, { "epoch": 0.24177777777777779, "grad_norm": 1.0813431554623738, "learning_rate": 8.052922590837284e-06, "loss": 0.1313, "step": 2040 }, { "epoch": 0.24296296296296296, "grad_norm": 1.1019447318703963, "learning_rate": 8.092417061611375e-06, "loss": 0.1278, "step": 2050 }, { "epoch": 0.24414814814814814, "grad_norm": 1.161541112028708, "learning_rate": 8.131911532385467e-06, "loss": 0.1388, "step": 2060 }, { "epoch": 0.24533333333333332, "grad_norm": 1.2121096685200743, "learning_rate": 8.171406003159559e-06, "loss": 0.1426, "step": 2070 }, { "epoch": 0.24651851851851853, "grad_norm": 1.1943833198302782, "learning_rate": 8.21090047393365e-06, "loss": 0.1448, "step": 2080 }, { "epoch": 0.2477037037037037, "grad_norm": 1.2385176940275673, "learning_rate": 8.250394944707742e-06, "loss": 0.137, "step": 2090 }, { "epoch": 0.24888888888888888, "grad_norm": 1.2399375043986598, "learning_rate": 8.289889415481832e-06, "loss": 0.1332, "step": 2100 }, { "epoch": 0.25007407407407406, "grad_norm": 1.0532024622755551, "learning_rate": 8.329383886255925e-06, "loss": 0.1367, "step": 2110 }, { "epoch": 0.25125925925925924, "grad_norm": 1.1112223938639674, "learning_rate": 8.368878357030017e-06, "loss": 0.122, "step": 2120 }, { "epoch": 0.25244444444444447, "grad_norm": 1.1215548658435044, "learning_rate": 8.408372827804107e-06, "loss": 0.1309, "step": 2130 }, { "epoch": 0.25362962962962965, "grad_norm": 0.9889435324709056, "learning_rate": 8.4478672985782e-06, "loss": 0.1318, "step": 2140 }, { "epoch": 0.2548148148148148, "grad_norm": 1.1190303690533154, "learning_rate": 8.487361769352292e-06, "loss": 0.1372, "step": 2150 }, { "epoch": 0.256, "grad_norm": 1.0821026540815322, "learning_rate": 8.526856240126383e-06, "loss": 0.1394, "step": 2160 }, { "epoch": 0.2571851851851852, "grad_norm": 1.4273902320172085, "learning_rate": 8.566350710900475e-06, "loss": 0.1403, "step": 2170 }, { "epoch": 0.25837037037037036, "grad_norm": 1.1725459842861718, "learning_rate": 8.605845181674565e-06, "loss": 0.1368, "step": 2180 }, { "epoch": 0.25955555555555554, "grad_norm": 1.0347236868665117, "learning_rate": 8.645339652448658e-06, "loss": 0.1322, "step": 2190 }, { "epoch": 0.2607407407407407, "grad_norm": 1.147336853830699, "learning_rate": 8.68483412322275e-06, "loss": 0.1385, "step": 2200 }, { "epoch": 0.26192592592592595, "grad_norm": 1.1195736709673731, "learning_rate": 8.724328593996842e-06, "loss": 0.1299, "step": 2210 }, { "epoch": 0.26311111111111113, "grad_norm": 1.19703340109771, "learning_rate": 8.763823064770933e-06, "loss": 0.1326, "step": 2220 }, { "epoch": 0.2642962962962963, "grad_norm": 1.0370850841770547, "learning_rate": 8.803317535545023e-06, "loss": 0.1256, "step": 2230 }, { "epoch": 0.2654814814814815, "grad_norm": 1.0458152573599093, "learning_rate": 8.842812006319115e-06, "loss": 0.1324, "step": 2240 }, { "epoch": 0.26666666666666666, "grad_norm": 1.1578620236562867, "learning_rate": 8.882306477093208e-06, "loss": 0.1337, "step": 2250 }, { "epoch": 0.26785185185185184, "grad_norm": 1.2400311727023643, "learning_rate": 8.9218009478673e-06, "loss": 0.1389, "step": 2260 }, { "epoch": 0.269037037037037, "grad_norm": 1.0772480700085616, "learning_rate": 8.961295418641392e-06, "loss": 0.1279, "step": 2270 }, { "epoch": 0.2702222222222222, "grad_norm": 1.0559092770169978, "learning_rate": 9.000789889415483e-06, "loss": 0.1272, "step": 2280 }, { "epoch": 0.27140740740740743, "grad_norm": 1.048995973752882, "learning_rate": 9.040284360189573e-06, "loss": 0.1243, "step": 2290 }, { "epoch": 0.2725925925925926, "grad_norm": 1.3451391343777819, "learning_rate": 9.079778830963666e-06, "loss": 0.1307, "step": 2300 }, { "epoch": 0.2737777777777778, "grad_norm": 1.060469009618305, "learning_rate": 9.119273301737758e-06, "loss": 0.1292, "step": 2310 }, { "epoch": 0.27496296296296296, "grad_norm": 1.1680308595423996, "learning_rate": 9.15876777251185e-06, "loss": 0.1291, "step": 2320 }, { "epoch": 0.27614814814814814, "grad_norm": 1.0245403988039696, "learning_rate": 9.19826224328594e-06, "loss": 0.1364, "step": 2330 }, { "epoch": 0.2773333333333333, "grad_norm": 1.3540771871606279, "learning_rate": 9.237756714060033e-06, "loss": 0.1211, "step": 2340 }, { "epoch": 0.2785185185185185, "grad_norm": 1.0477791266593481, "learning_rate": 9.277251184834123e-06, "loss": 0.1256, "step": 2350 }, { "epoch": 0.2797037037037037, "grad_norm": 1.074397627115856, "learning_rate": 9.316745655608216e-06, "loss": 0.1368, "step": 2360 }, { "epoch": 0.2808888888888889, "grad_norm": 1.096762754427759, "learning_rate": 9.356240126382308e-06, "loss": 0.127, "step": 2370 }, { "epoch": 0.2820740740740741, "grad_norm": 1.1152387422710077, "learning_rate": 9.395734597156398e-06, "loss": 0.1157, "step": 2380 }, { "epoch": 0.28325925925925927, "grad_norm": 1.15185809148128, "learning_rate": 9.43522906793049e-06, "loss": 0.1369, "step": 2390 }, { "epoch": 0.28444444444444444, "grad_norm": 1.0649016460434935, "learning_rate": 9.474723538704583e-06, "loss": 0.1267, "step": 2400 }, { "epoch": 0.2856296296296296, "grad_norm": 1.1080080820382023, "learning_rate": 9.514218009478673e-06, "loss": 0.1276, "step": 2410 }, { "epoch": 0.2868148148148148, "grad_norm": 1.1752464726050098, "learning_rate": 9.553712480252766e-06, "loss": 0.13, "step": 2420 }, { "epoch": 0.288, "grad_norm": 1.1902570725844221, "learning_rate": 9.593206951026856e-06, "loss": 0.1206, "step": 2430 }, { "epoch": 0.2891851851851852, "grad_norm": 1.0348767296473844, "learning_rate": 9.632701421800949e-06, "loss": 0.1314, "step": 2440 }, { "epoch": 0.2903703703703704, "grad_norm": 1.5099894055106093, "learning_rate": 9.67219589257504e-06, "loss": 0.1276, "step": 2450 }, { "epoch": 0.29155555555555557, "grad_norm": 1.385132963146712, "learning_rate": 9.711690363349133e-06, "loss": 0.1228, "step": 2460 }, { "epoch": 0.29274074074074075, "grad_norm": 1.288744232081011, "learning_rate": 9.751184834123224e-06, "loss": 0.1281, "step": 2470 }, { "epoch": 0.2939259259259259, "grad_norm": 1.0941276266402564, "learning_rate": 9.790679304897314e-06, "loss": 0.1303, "step": 2480 }, { "epoch": 0.2951111111111111, "grad_norm": 0.945296978599026, "learning_rate": 9.830173775671406e-06, "loss": 0.1259, "step": 2490 }, { "epoch": 0.2962962962962963, "grad_norm": 1.028557604163525, "learning_rate": 9.869668246445499e-06, "loss": 0.1243, "step": 2500 }, { "epoch": 0.29748148148148146, "grad_norm": 1.0961718771978575, "learning_rate": 9.909162717219591e-06, "loss": 0.125, "step": 2510 }, { "epoch": 0.2986666666666667, "grad_norm": 1.2259138889872725, "learning_rate": 9.948657187993681e-06, "loss": 0.1234, "step": 2520 }, { "epoch": 0.29985185185185187, "grad_norm": 1.0222087312043195, "learning_rate": 9.988151658767774e-06, "loss": 0.1315, "step": 2530 }, { "epoch": 0.30103703703703705, "grad_norm": 1.1985118048432004, "learning_rate": 9.999997670556908e-06, "loss": 0.127, "step": 2540 }, { "epoch": 0.3022222222222222, "grad_norm": 0.9286185383611775, "learning_rate": 9.999986261044944e-06, "loss": 0.1266, "step": 2550 }, { "epoch": 0.3034074074074074, "grad_norm": 1.111662361529288, "learning_rate": 9.999965343628881e-06, "loss": 0.1184, "step": 2560 }, { "epoch": 0.3045925925925926, "grad_norm": 1.273986447150661, "learning_rate": 9.9999349183485e-06, "loss": 0.1188, "step": 2570 }, { "epoch": 0.30577777777777776, "grad_norm": 1.1492983257913492, "learning_rate": 9.999894985261652e-06, "loss": 0.1162, "step": 2580 }, { "epoch": 0.30696296296296294, "grad_norm": 0.9259384772003547, "learning_rate": 9.999845544444276e-06, "loss": 0.1208, "step": 2590 }, { "epoch": 0.30814814814814817, "grad_norm": 1.201242801046621, "learning_rate": 9.999786595990388e-06, "loss": 0.1275, "step": 2600 }, { "epoch": 0.30933333333333335, "grad_norm": 1.0816319437512862, "learning_rate": 9.999718140012084e-06, "loss": 0.129, "step": 2610 }, { "epoch": 0.3105185185185185, "grad_norm": 1.106029967405303, "learning_rate": 9.999640176639537e-06, "loss": 0.1214, "step": 2620 }, { "epoch": 0.3117037037037037, "grad_norm": 1.0193732867232608, "learning_rate": 9.999552706021003e-06, "loss": 0.1285, "step": 2630 }, { "epoch": 0.3128888888888889, "grad_norm": 1.1046176513500003, "learning_rate": 9.999455728322813e-06, "loss": 0.1171, "step": 2640 }, { "epoch": 0.31407407407407406, "grad_norm": 1.0910449790973566, "learning_rate": 9.999349243729379e-06, "loss": 0.1191, "step": 2650 }, { "epoch": 0.31525925925925924, "grad_norm": 1.0083266380537457, "learning_rate": 9.999233252443192e-06, "loss": 0.1204, "step": 2660 }, { "epoch": 0.3164444444444444, "grad_norm": 1.0642460379585044, "learning_rate": 9.999107754684817e-06, "loss": 0.1168, "step": 2670 }, { "epoch": 0.31762962962962965, "grad_norm": 1.1298694219636254, "learning_rate": 9.998972750692904e-06, "loss": 0.1194, "step": 2680 }, { "epoch": 0.31881481481481483, "grad_norm": 0.9613338001649993, "learning_rate": 9.998828240724168e-06, "loss": 0.1155, "step": 2690 }, { "epoch": 0.32, "grad_norm": 1.173601854354163, "learning_rate": 9.99867422505341e-06, "loss": 0.1214, "step": 2700 }, { "epoch": 0.3211851851851852, "grad_norm": 0.9733768496559912, "learning_rate": 9.998510703973506e-06, "loss": 0.1227, "step": 2710 }, { "epoch": 0.32237037037037036, "grad_norm": 1.0607585805234887, "learning_rate": 9.998337677795402e-06, "loss": 0.1146, "step": 2720 }, { "epoch": 0.32355555555555554, "grad_norm": 1.080581949156037, "learning_rate": 9.998155146848124e-06, "loss": 0.1074, "step": 2730 }, { "epoch": 0.3247407407407407, "grad_norm": 0.9948035768360823, "learning_rate": 9.99796311147877e-06, "loss": 0.1222, "step": 2740 }, { "epoch": 0.32592592592592595, "grad_norm": 1.1381012144365488, "learning_rate": 9.997761572052513e-06, "loss": 0.1136, "step": 2750 }, { "epoch": 0.32711111111111113, "grad_norm": 1.1019137219716368, "learning_rate": 9.997550528952596e-06, "loss": 0.1197, "step": 2760 }, { "epoch": 0.3282962962962963, "grad_norm": 1.0614332229110073, "learning_rate": 9.997329982580334e-06, "loss": 0.1119, "step": 2770 }, { "epoch": 0.3294814814814815, "grad_norm": 1.019300129700046, "learning_rate": 9.997099933355119e-06, "loss": 0.1122, "step": 2780 }, { "epoch": 0.33066666666666666, "grad_norm": 0.8703414717801001, "learning_rate": 9.996860381714406e-06, "loss": 0.1154, "step": 2790 }, { "epoch": 0.33185185185185184, "grad_norm": 0.9927668006373571, "learning_rate": 9.996611328113725e-06, "loss": 0.1114, "step": 2800 }, { "epoch": 0.333037037037037, "grad_norm": 1.0649223066896096, "learning_rate": 9.996352773026672e-06, "loss": 0.1143, "step": 2810 }, { "epoch": 0.3342222222222222, "grad_norm": 0.945061933888762, "learning_rate": 9.996084716944913e-06, "loss": 0.1165, "step": 2820 }, { "epoch": 0.33540740740740743, "grad_norm": 0.9053165256161341, "learning_rate": 9.995807160378176e-06, "loss": 0.1138, "step": 2830 }, { "epoch": 0.3365925925925926, "grad_norm": 0.983673776932675, "learning_rate": 9.995520103854265e-06, "loss": 0.1126, "step": 2840 }, { "epoch": 0.3377777777777778, "grad_norm": 0.9094831050296318, "learning_rate": 9.995223547919037e-06, "loss": 0.1131, "step": 2850 }, { "epoch": 0.33896296296296297, "grad_norm": 1.1980737495235771, "learning_rate": 9.99491749313642e-06, "loss": 0.1153, "step": 2860 }, { "epoch": 0.34014814814814814, "grad_norm": 1.1642543523703357, "learning_rate": 9.994601940088407e-06, "loss": 0.1069, "step": 2870 }, { "epoch": 0.3413333333333333, "grad_norm": 1.3061713011905183, "learning_rate": 9.994276889375043e-06, "loss": 0.1129, "step": 2880 }, { "epoch": 0.3425185185185185, "grad_norm": 1.1159869136842975, "learning_rate": 9.993942341614445e-06, "loss": 0.1152, "step": 2890 }, { "epoch": 0.3437037037037037, "grad_norm": 0.9868651619155405, "learning_rate": 9.993598297442782e-06, "loss": 0.1111, "step": 2900 }, { "epoch": 0.3448888888888889, "grad_norm": 0.9111232398594167, "learning_rate": 9.993244757514284e-06, "loss": 0.1086, "step": 2910 }, { "epoch": 0.3460740740740741, "grad_norm": 1.1352059022498266, "learning_rate": 9.99288172250124e-06, "loss": 0.1101, "step": 2920 }, { "epoch": 0.34725925925925927, "grad_norm": 1.1092536863833993, "learning_rate": 9.992509193093989e-06, "loss": 0.1138, "step": 2930 }, { "epoch": 0.34844444444444445, "grad_norm": 1.0456678813902056, "learning_rate": 9.992127170000928e-06, "loss": 0.1085, "step": 2940 }, { "epoch": 0.3496296296296296, "grad_norm": 1.05233397677223, "learning_rate": 9.99173565394851e-06, "loss": 0.1137, "step": 2950 }, { "epoch": 0.3508148148148148, "grad_norm": 0.8989290814616561, "learning_rate": 9.99133464568123e-06, "loss": 0.1011, "step": 2960 }, { "epoch": 0.352, "grad_norm": 1.1959494139981524, "learning_rate": 9.990924145961648e-06, "loss": 0.1069, "step": 2970 }, { "epoch": 0.35318518518518516, "grad_norm": 1.0223587977454955, "learning_rate": 9.990504155570358e-06, "loss": 0.1104, "step": 2980 }, { "epoch": 0.3543703703703704, "grad_norm": 1.0324014376102113, "learning_rate": 9.990074675306011e-06, "loss": 0.1089, "step": 2990 }, { "epoch": 0.35555555555555557, "grad_norm": 0.9142730471073823, "learning_rate": 9.989635705985301e-06, "loss": 0.1149, "step": 3000 }, { "epoch": 0.35674074074074075, "grad_norm": 0.9477791096263312, "learning_rate": 9.989187248442965e-06, "loss": 0.1074, "step": 3010 }, { "epoch": 0.3579259259259259, "grad_norm": 1.029917738649383, "learning_rate": 9.98872930353178e-06, "loss": 0.1049, "step": 3020 }, { "epoch": 0.3591111111111111, "grad_norm": 1.0958732449477262, "learning_rate": 9.988261872122575e-06, "loss": 0.1062, "step": 3030 }, { "epoch": 0.3602962962962963, "grad_norm": 0.9739346786531066, "learning_rate": 9.987784955104205e-06, "loss": 0.1043, "step": 3040 }, { "epoch": 0.36148148148148146, "grad_norm": 0.9125268614665308, "learning_rate": 9.987298553383571e-06, "loss": 0.1018, "step": 3050 }, { "epoch": 0.3626666666666667, "grad_norm": 0.9883935739843853, "learning_rate": 9.986802667885609e-06, "loss": 0.1096, "step": 3060 }, { "epoch": 0.36385185185185187, "grad_norm": 1.153554291921888, "learning_rate": 9.986297299553286e-06, "loss": 0.1104, "step": 3070 }, { "epoch": 0.36503703703703705, "grad_norm": 0.8661052521750611, "learning_rate": 9.985782449347605e-06, "loss": 0.0986, "step": 3080 }, { "epoch": 0.3662222222222222, "grad_norm": 1.0611023764230756, "learning_rate": 9.985258118247596e-06, "loss": 0.0945, "step": 3090 }, { "epoch": 0.3674074074074074, "grad_norm": 0.9468505288641946, "learning_rate": 9.984724307250319e-06, "loss": 0.11, "step": 3100 }, { "epoch": 0.3685925925925926, "grad_norm": 1.1929655399531534, "learning_rate": 9.984181017370867e-06, "loss": 0.1071, "step": 3110 }, { "epoch": 0.36977777777777776, "grad_norm": 1.073270532431986, "learning_rate": 9.983628249642345e-06, "loss": 0.106, "step": 3120 }, { "epoch": 0.37096296296296294, "grad_norm": 1.0505725853702017, "learning_rate": 9.983066005115894e-06, "loss": 0.0999, "step": 3130 }, { "epoch": 0.3721481481481482, "grad_norm": 1.0782358716715486, "learning_rate": 9.982494284860668e-06, "loss": 0.1111, "step": 3140 }, { "epoch": 0.37333333333333335, "grad_norm": 1.0557516128878732, "learning_rate": 9.981913089963841e-06, "loss": 0.107, "step": 3150 }, { "epoch": 0.37451851851851853, "grad_norm": 1.0578331773078369, "learning_rate": 9.98132242153061e-06, "loss": 0.1014, "step": 3160 }, { "epoch": 0.3757037037037037, "grad_norm": 0.9900733877629715, "learning_rate": 9.980722280684177e-06, "loss": 0.0985, "step": 3170 }, { "epoch": 0.3768888888888889, "grad_norm": 0.9827346791836639, "learning_rate": 9.980112668565762e-06, "loss": 0.1141, "step": 3180 }, { "epoch": 0.37807407407407406, "grad_norm": 1.0585445080498543, "learning_rate": 9.979493586334596e-06, "loss": 0.1013, "step": 3190 }, { "epoch": 0.37925925925925924, "grad_norm": 0.9483754543855847, "learning_rate": 9.97886503516792e-06, "loss": 0.1018, "step": 3200 }, { "epoch": 0.3804444444444444, "grad_norm": 1.1323394195148195, "learning_rate": 9.978227016260974e-06, "loss": 0.1036, "step": 3210 }, { "epoch": 0.38162962962962965, "grad_norm": 1.014762440048781, "learning_rate": 9.977579530827003e-06, "loss": 0.0959, "step": 3220 }, { "epoch": 0.38281481481481483, "grad_norm": 0.843820063614635, "learning_rate": 9.976922580097266e-06, "loss": 0.0974, "step": 3230 }, { "epoch": 0.384, "grad_norm": 0.9945169322602301, "learning_rate": 9.976256165321002e-06, "loss": 0.1008, "step": 3240 }, { "epoch": 0.3851851851851852, "grad_norm": 0.9422603028606186, "learning_rate": 9.975580287765461e-06, "loss": 0.1016, "step": 3250 }, { "epoch": 0.38637037037037036, "grad_norm": 1.0428131747987188, "learning_rate": 9.974894948715882e-06, "loss": 0.1043, "step": 3260 }, { "epoch": 0.38755555555555554, "grad_norm": 1.0106390080031689, "learning_rate": 9.974200149475494e-06, "loss": 0.0986, "step": 3270 }, { "epoch": 0.3887407407407407, "grad_norm": 1.0909067437339077, "learning_rate": 9.973495891365518e-06, "loss": 0.0975, "step": 3280 }, { "epoch": 0.38992592592592595, "grad_norm": 0.9509979371656362, "learning_rate": 9.972782175725163e-06, "loss": 0.1011, "step": 3290 }, { "epoch": 0.39111111111111113, "grad_norm": 0.9460805917169135, "learning_rate": 9.97205900391162e-06, "loss": 0.1001, "step": 3300 }, { "epoch": 0.3922962962962963, "grad_norm": 1.0433150340303297, "learning_rate": 9.971326377300062e-06, "loss": 0.1028, "step": 3310 }, { "epoch": 0.3934814814814815, "grad_norm": 0.8522650880102192, "learning_rate": 9.970584297283643e-06, "loss": 0.0936, "step": 3320 }, { "epoch": 0.39466666666666667, "grad_norm": 1.041438417145197, "learning_rate": 9.96983276527349e-06, "loss": 0.0969, "step": 3330 }, { "epoch": 0.39585185185185184, "grad_norm": 0.9513231505108326, "learning_rate": 9.969071782698704e-06, "loss": 0.0958, "step": 3340 }, { "epoch": 0.397037037037037, "grad_norm": 1.113472336755939, "learning_rate": 9.968301351006366e-06, "loss": 0.1008, "step": 3350 }, { "epoch": 0.3982222222222222, "grad_norm": 0.9170457331942715, "learning_rate": 9.967521471661511e-06, "loss": 0.0937, "step": 3360 }, { "epoch": 0.39940740740740743, "grad_norm": 0.9925503933520124, "learning_rate": 9.96673214614715e-06, "loss": 0.1018, "step": 3370 }, { "epoch": 0.4005925925925926, "grad_norm": 1.208167708969931, "learning_rate": 9.965933375964252e-06, "loss": 0.1015, "step": 3380 }, { "epoch": 0.4017777777777778, "grad_norm": 0.9111547528969846, "learning_rate": 9.965125162631748e-06, "loss": 0.0977, "step": 3390 }, { "epoch": 0.40296296296296297, "grad_norm": 1.0594293752341197, "learning_rate": 9.964307507686525e-06, "loss": 0.0928, "step": 3400 }, { "epoch": 0.40414814814814815, "grad_norm": 1.02941041489853, "learning_rate": 9.963480412683424e-06, "loss": 0.0933, "step": 3410 }, { "epoch": 0.4053333333333333, "grad_norm": 0.9864449378353636, "learning_rate": 9.96264387919524e-06, "loss": 0.0917, "step": 3420 }, { "epoch": 0.4065185185185185, "grad_norm": 0.783215057552359, "learning_rate": 9.961797908812708e-06, "loss": 0.0908, "step": 3430 }, { "epoch": 0.4077037037037037, "grad_norm": 1.046952903947465, "learning_rate": 9.960942503144518e-06, "loss": 0.0978, "step": 3440 }, { "epoch": 0.4088888888888889, "grad_norm": 0.9699815953951667, "learning_rate": 9.960077663817295e-06, "loss": 0.0964, "step": 3450 }, { "epoch": 0.4100740740740741, "grad_norm": 1.1811835472073076, "learning_rate": 9.959203392475609e-06, "loss": 0.0989, "step": 3460 }, { "epoch": 0.41125925925925927, "grad_norm": 0.7810165988147362, "learning_rate": 9.958319690781956e-06, "loss": 0.0919, "step": 3470 }, { "epoch": 0.41244444444444445, "grad_norm": 0.9336680156088368, "learning_rate": 9.957426560416776e-06, "loss": 0.0918, "step": 3480 }, { "epoch": 0.4136296296296296, "grad_norm": 0.8919293395755065, "learning_rate": 9.956524003078432e-06, "loss": 0.0981, "step": 3490 }, { "epoch": 0.4148148148148148, "grad_norm": 0.8925853133374531, "learning_rate": 9.955612020483215e-06, "loss": 0.0918, "step": 3500 }, { "epoch": 0.416, "grad_norm": 0.9238403024531354, "learning_rate": 9.954690614365337e-06, "loss": 0.0967, "step": 3510 }, { "epoch": 0.41718518518518516, "grad_norm": 0.9155565229092409, "learning_rate": 9.95375978647693e-06, "loss": 0.0959, "step": 3520 }, { "epoch": 0.4183703703703704, "grad_norm": 0.929056384983427, "learning_rate": 9.952819538588045e-06, "loss": 0.0934, "step": 3530 }, { "epoch": 0.41955555555555557, "grad_norm": 0.9734398329498284, "learning_rate": 9.951869872486644e-06, "loss": 0.0907, "step": 3540 }, { "epoch": 0.42074074074074075, "grad_norm": 0.9646738159673482, "learning_rate": 9.950910789978599e-06, "loss": 0.0941, "step": 3550 }, { "epoch": 0.4219259259259259, "grad_norm": 0.815005018709483, "learning_rate": 9.949942292887689e-06, "loss": 0.094, "step": 3560 }, { "epoch": 0.4231111111111111, "grad_norm": 0.8944551210796629, "learning_rate": 9.948964383055592e-06, "loss": 0.0941, "step": 3570 }, { "epoch": 0.4242962962962963, "grad_norm": 1.1965964638289905, "learning_rate": 9.94797706234189e-06, "loss": 0.0939, "step": 3580 }, { "epoch": 0.42548148148148146, "grad_norm": 0.8975630609016324, "learning_rate": 9.946980332624057e-06, "loss": 0.0964, "step": 3590 }, { "epoch": 0.4266666666666667, "grad_norm": 0.9429045517527573, "learning_rate": 9.94597419579746e-06, "loss": 0.0913, "step": 3600 }, { "epoch": 0.42785185185185187, "grad_norm": 0.9952465782659273, "learning_rate": 9.944958653775356e-06, "loss": 0.0898, "step": 3610 }, { "epoch": 0.42903703703703705, "grad_norm": 0.980068236567743, "learning_rate": 9.943933708488883e-06, "loss": 0.0881, "step": 3620 }, { "epoch": 0.43022222222222223, "grad_norm": 0.9972782331025953, "learning_rate": 9.942899361887066e-06, "loss": 0.087, "step": 3630 }, { "epoch": 0.4314074074074074, "grad_norm": 0.893079670752483, "learning_rate": 9.941855615936803e-06, "loss": 0.0891, "step": 3640 }, { "epoch": 0.4325925925925926, "grad_norm": 0.8515826738127601, "learning_rate": 9.940802472622865e-06, "loss": 0.0811, "step": 3650 }, { "epoch": 0.43377777777777776, "grad_norm": 1.000710629615497, "learning_rate": 9.939739933947898e-06, "loss": 0.0906, "step": 3660 }, { "epoch": 0.43496296296296294, "grad_norm": 0.8992833245157053, "learning_rate": 9.938668001932408e-06, "loss": 0.0873, "step": 3670 }, { "epoch": 0.4361481481481482, "grad_norm": 0.9765751087128967, "learning_rate": 9.937586678614765e-06, "loss": 0.0865, "step": 3680 }, { "epoch": 0.43733333333333335, "grad_norm": 0.9354682815023043, "learning_rate": 9.936495966051204e-06, "loss": 0.0927, "step": 3690 }, { "epoch": 0.43851851851851853, "grad_norm": 0.7584790594891406, "learning_rate": 9.9353958663158e-06, "loss": 0.0945, "step": 3700 }, { "epoch": 0.4397037037037037, "grad_norm": 1.0239273300550789, "learning_rate": 9.934286381500494e-06, "loss": 0.0875, "step": 3710 }, { "epoch": 0.4408888888888889, "grad_norm": 0.95376512919744, "learning_rate": 9.933167513715065e-06, "loss": 0.0872, "step": 3720 }, { "epoch": 0.44207407407407406, "grad_norm": 0.9280934572790283, "learning_rate": 9.932039265087137e-06, "loss": 0.0946, "step": 3730 }, { "epoch": 0.44325925925925924, "grad_norm": 0.9442662624740906, "learning_rate": 9.93090163776217e-06, "loss": 0.0995, "step": 3740 }, { "epoch": 0.4444444444444444, "grad_norm": 0.8801583739667498, "learning_rate": 9.929754633903461e-06, "loss": 0.0894, "step": 3750 }, { "epoch": 0.44562962962962965, "grad_norm": 0.8510209114055628, "learning_rate": 9.92859825569214e-06, "loss": 0.0847, "step": 3760 }, { "epoch": 0.44681481481481483, "grad_norm": 1.023114735895868, "learning_rate": 9.927432505327153e-06, "loss": 0.0849, "step": 3770 }, { "epoch": 0.448, "grad_norm": 1.1407286029311243, "learning_rate": 9.92625738502528e-06, "loss": 0.0889, "step": 3780 }, { "epoch": 0.4491851851851852, "grad_norm": 0.8609055870236172, "learning_rate": 9.925072897021111e-06, "loss": 0.091, "step": 3790 }, { "epoch": 0.45037037037037037, "grad_norm": 0.8582580844008023, "learning_rate": 9.923879043567052e-06, "loss": 0.0918, "step": 3800 }, { "epoch": 0.45155555555555554, "grad_norm": 0.9980105984965385, "learning_rate": 9.922675826933319e-06, "loss": 0.0869, "step": 3810 }, { "epoch": 0.4527407407407407, "grad_norm": 0.8638435969966359, "learning_rate": 9.921463249407932e-06, "loss": 0.0862, "step": 3820 }, { "epoch": 0.4539259259259259, "grad_norm": 0.8725754025573971, "learning_rate": 9.92024131329671e-06, "loss": 0.0847, "step": 3830 }, { "epoch": 0.45511111111111113, "grad_norm": 1.0397721165449088, "learning_rate": 9.919010020923269e-06, "loss": 0.0825, "step": 3840 }, { "epoch": 0.4562962962962963, "grad_norm": 1.0489467179584535, "learning_rate": 9.917769374629022e-06, "loss": 0.0828, "step": 3850 }, { "epoch": 0.4574814814814815, "grad_norm": 0.7953383373238736, "learning_rate": 9.916519376773161e-06, "loss": 0.0837, "step": 3860 }, { "epoch": 0.45866666666666667, "grad_norm": 0.9340192162940949, "learning_rate": 9.915260029732664e-06, "loss": 0.083, "step": 3870 }, { "epoch": 0.45985185185185184, "grad_norm": 0.9650531833970395, "learning_rate": 9.913991335902292e-06, "loss": 0.0895, "step": 3880 }, { "epoch": 0.461037037037037, "grad_norm": 0.8063007229691221, "learning_rate": 9.912713297694569e-06, "loss": 0.0843, "step": 3890 }, { "epoch": 0.4622222222222222, "grad_norm": 1.178073999705309, "learning_rate": 9.911425917539798e-06, "loss": 0.0833, "step": 3900 }, { "epoch": 0.46340740740740743, "grad_norm": 1.1009253004923545, "learning_rate": 9.910129197886044e-06, "loss": 0.0766, "step": 3910 }, { "epoch": 0.4645925925925926, "grad_norm": 1.0402442425262375, "learning_rate": 9.90882314119913e-06, "loss": 0.0832, "step": 3920 }, { "epoch": 0.4657777777777778, "grad_norm": 1.1830787967537757, "learning_rate": 9.907507749962636e-06, "loss": 0.0836, "step": 3930 }, { "epoch": 0.46696296296296297, "grad_norm": 1.0321106423065325, "learning_rate": 9.90618302667789e-06, "loss": 0.0861, "step": 3940 }, { "epoch": 0.46814814814814815, "grad_norm": 0.8422245987541108, "learning_rate": 9.90484897386397e-06, "loss": 0.0848, "step": 3950 }, { "epoch": 0.4693333333333333, "grad_norm": 1.149098291761147, "learning_rate": 9.90350559405769e-06, "loss": 0.0825, "step": 3960 }, { "epoch": 0.4705185185185185, "grad_norm": 0.8207440542003209, "learning_rate": 9.902152889813602e-06, "loss": 0.0856, "step": 3970 }, { "epoch": 0.4717037037037037, "grad_norm": 0.849458620432329, "learning_rate": 9.90079086370399e-06, "loss": 0.0785, "step": 3980 }, { "epoch": 0.4728888888888889, "grad_norm": 0.9602453060621781, "learning_rate": 9.899419518318865e-06, "loss": 0.0826, "step": 3990 }, { "epoch": 0.4740740740740741, "grad_norm": 0.9699078220349924, "learning_rate": 9.898038856265957e-06, "loss": 0.0765, "step": 4000 }, { "epoch": 0.47525925925925927, "grad_norm": 0.9269160394434455, "learning_rate": 9.896648880170713e-06, "loss": 0.0761, "step": 4010 }, { "epoch": 0.47644444444444445, "grad_norm": 0.9595874361318213, "learning_rate": 9.895249592676294e-06, "loss": 0.0866, "step": 4020 }, { "epoch": 0.4776296296296296, "grad_norm": 1.013893515884627, "learning_rate": 9.893840996443565e-06, "loss": 0.0813, "step": 4030 }, { "epoch": 0.4788148148148148, "grad_norm": 0.9863207818786465, "learning_rate": 9.892423094151093e-06, "loss": 0.0794, "step": 4040 }, { "epoch": 0.48, "grad_norm": 0.9412206085311899, "learning_rate": 9.890995888495141e-06, "loss": 0.0852, "step": 4050 }, { "epoch": 0.48118518518518516, "grad_norm": 1.0081837206717523, "learning_rate": 9.889559382189662e-06, "loss": 0.0796, "step": 4060 }, { "epoch": 0.4823703703703704, "grad_norm": 0.8860577605037484, "learning_rate": 9.8881135779663e-06, "loss": 0.0789, "step": 4070 }, { "epoch": 0.48355555555555557, "grad_norm": 1.033554009759029, "learning_rate": 9.88665847857437e-06, "loss": 0.08, "step": 4080 }, { "epoch": 0.48474074074074075, "grad_norm": 0.7724560658073316, "learning_rate": 9.885194086780875e-06, "loss": 0.0776, "step": 4090 }, { "epoch": 0.48592592592592593, "grad_norm": 0.9190066206516513, "learning_rate": 9.88372040537048e-06, "loss": 0.0792, "step": 4100 }, { "epoch": 0.4871111111111111, "grad_norm": 0.9470868574408026, "learning_rate": 9.882237437145515e-06, "loss": 0.0813, "step": 4110 }, { "epoch": 0.4882962962962963, "grad_norm": 0.9132243251555825, "learning_rate": 9.880745184925974e-06, "loss": 0.0802, "step": 4120 }, { "epoch": 0.48948148148148146, "grad_norm": 0.7498355906497279, "learning_rate": 9.879243651549501e-06, "loss": 0.0826, "step": 4130 }, { "epoch": 0.49066666666666664, "grad_norm": 0.9813564916356462, "learning_rate": 9.877732839871393e-06, "loss": 0.0836, "step": 4140 }, { "epoch": 0.4918518518518519, "grad_norm": 1.0202496294432721, "learning_rate": 9.876212752764587e-06, "loss": 0.0827, "step": 4150 }, { "epoch": 0.49303703703703705, "grad_norm": 0.6887806634020146, "learning_rate": 9.87468339311966e-06, "loss": 0.0814, "step": 4160 }, { "epoch": 0.49422222222222223, "grad_norm": 0.9911837485493776, "learning_rate": 9.873144763844822e-06, "loss": 0.0828, "step": 4170 }, { "epoch": 0.4954074074074074, "grad_norm": 0.8535687053230085, "learning_rate": 9.871596867865907e-06, "loss": 0.0805, "step": 4180 }, { "epoch": 0.4965925925925926, "grad_norm": 0.8389560557766649, "learning_rate": 9.870039708126371e-06, "loss": 0.0843, "step": 4190 }, { "epoch": 0.49777777777777776, "grad_norm": 0.9831913727447303, "learning_rate": 9.868473287587293e-06, "loss": 0.0738, "step": 4200 }, { "epoch": 0.49896296296296294, "grad_norm": 1.0868062927283855, "learning_rate": 9.86689760922735e-06, "loss": 0.0734, "step": 4210 }, { "epoch": 0.5001481481481481, "grad_norm": 0.9553313952317861, "learning_rate": 9.865312676042835e-06, "loss": 0.0761, "step": 4220 }, { "epoch": 0.5013333333333333, "grad_norm": 0.9784945464770984, "learning_rate": 9.863718491047632e-06, "loss": 0.0734, "step": 4230 }, { "epoch": 0.5025185185185185, "grad_norm": 0.9822855413933417, "learning_rate": 9.86211505727322e-06, "loss": 0.0793, "step": 4240 }, { "epoch": 0.5037037037037037, "grad_norm": 1.002768570268345, "learning_rate": 9.86050237776867e-06, "loss": 0.0829, "step": 4250 }, { "epoch": 0.5048888888888889, "grad_norm": 1.03445820054938, "learning_rate": 9.858880455600628e-06, "loss": 0.079, "step": 4260 }, { "epoch": 0.5060740740740741, "grad_norm": 0.7330098398348979, "learning_rate": 9.857249293853319e-06, "loss": 0.0736, "step": 4270 }, { "epoch": 0.5072592592592593, "grad_norm": 0.8999801573580161, "learning_rate": 9.855608895628538e-06, "loss": 0.0765, "step": 4280 }, { "epoch": 0.5084444444444445, "grad_norm": 0.7743613315019119, "learning_rate": 9.853959264045642e-06, "loss": 0.077, "step": 4290 }, { "epoch": 0.5096296296296297, "grad_norm": 0.8167829336455701, "learning_rate": 9.852300402241551e-06, "loss": 0.0764, "step": 4300 }, { "epoch": 0.5108148148148148, "grad_norm": 0.873016590690606, "learning_rate": 9.85063231337073e-06, "loss": 0.081, "step": 4310 }, { "epoch": 0.512, "grad_norm": 1.0481225878861797, "learning_rate": 9.848955000605192e-06, "loss": 0.0758, "step": 4320 }, { "epoch": 0.5131851851851852, "grad_norm": 0.883091147782801, "learning_rate": 9.847268467134497e-06, "loss": 0.0756, "step": 4330 }, { "epoch": 0.5143703703703704, "grad_norm": 0.8675043761813808, "learning_rate": 9.845572716165728e-06, "loss": 0.0801, "step": 4340 }, { "epoch": 0.5155555555555555, "grad_norm": 0.883382820831632, "learning_rate": 9.843867750923506e-06, "loss": 0.0787, "step": 4350 }, { "epoch": 0.5167407407407407, "grad_norm": 0.9241815851948217, "learning_rate": 9.842153574649966e-06, "loss": 0.0834, "step": 4360 }, { "epoch": 0.5179259259259259, "grad_norm": 0.9695709706938929, "learning_rate": 9.840430190604761e-06, "loss": 0.0792, "step": 4370 }, { "epoch": 0.5191111111111111, "grad_norm": 0.8993586706182487, "learning_rate": 9.838697602065059e-06, "loss": 0.0792, "step": 4380 }, { "epoch": 0.5202962962962963, "grad_norm": 0.9858558254838163, "learning_rate": 9.836955812325521e-06, "loss": 0.0799, "step": 4390 }, { "epoch": 0.5214814814814814, "grad_norm": 0.9147900647179666, "learning_rate": 9.835204824698313e-06, "loss": 0.0767, "step": 4400 }, { "epoch": 0.5226666666666666, "grad_norm": 0.9007220208807277, "learning_rate": 9.833444642513086e-06, "loss": 0.0804, "step": 4410 }, { "epoch": 0.5238518518518519, "grad_norm": 0.9657440658284951, "learning_rate": 9.831675269116981e-06, "loss": 0.0766, "step": 4420 }, { "epoch": 0.5250370370370371, "grad_norm": 1.0775364005766082, "learning_rate": 9.829896707874612e-06, "loss": 0.077, "step": 4430 }, { "epoch": 0.5262222222222223, "grad_norm": 0.9246103362627596, "learning_rate": 9.828108962168066e-06, "loss": 0.0751, "step": 4440 }, { "epoch": 0.5274074074074074, "grad_norm": 0.7364814793955189, "learning_rate": 9.826312035396896e-06, "loss": 0.0746, "step": 4450 }, { "epoch": 0.5285925925925926, "grad_norm": 0.8317457616605274, "learning_rate": 9.824505930978113e-06, "loss": 0.0747, "step": 4460 }, { "epoch": 0.5297777777777778, "grad_norm": 0.919623028819872, "learning_rate": 9.822690652346178e-06, "loss": 0.072, "step": 4470 }, { "epoch": 0.530962962962963, "grad_norm": 0.9798604849866279, "learning_rate": 9.820866202953004e-06, "loss": 0.0812, "step": 4480 }, { "epoch": 0.5321481481481481, "grad_norm": 0.776329098381582, "learning_rate": 9.819032586267933e-06, "loss": 0.0739, "step": 4490 }, { "epoch": 0.5333333333333333, "grad_norm": 0.8161182743896854, "learning_rate": 9.81718980577775e-06, "loss": 0.0722, "step": 4500 }, { "epoch": 0.5345185185185185, "grad_norm": 0.7671593687404197, "learning_rate": 9.815337864986656e-06, "loss": 0.0691, "step": 4510 }, { "epoch": 0.5357037037037037, "grad_norm": 0.848051450203711, "learning_rate": 9.813476767416278e-06, "loss": 0.082, "step": 4520 }, { "epoch": 0.5368888888888889, "grad_norm": 0.7336951432286606, "learning_rate": 9.811606516605655e-06, "loss": 0.0739, "step": 4530 }, { "epoch": 0.538074074074074, "grad_norm": 0.8032350179251622, "learning_rate": 9.809727116111225e-06, "loss": 0.0677, "step": 4540 }, { "epoch": 0.5392592592592592, "grad_norm": 0.8493424748321329, "learning_rate": 9.807838569506834e-06, "loss": 0.0816, "step": 4550 }, { "epoch": 0.5404444444444444, "grad_norm": 1.1252355367667275, "learning_rate": 9.805940880383716e-06, "loss": 0.0709, "step": 4560 }, { "epoch": 0.5416296296296297, "grad_norm": 0.8202722453794303, "learning_rate": 9.804034052350488e-06, "loss": 0.0705, "step": 4570 }, { "epoch": 0.5428148148148149, "grad_norm": 0.9841648666857978, "learning_rate": 9.802118089033147e-06, "loss": 0.0796, "step": 4580 }, { "epoch": 0.544, "grad_norm": 0.9170390653163952, "learning_rate": 9.800192994075064e-06, "loss": 0.0779, "step": 4590 }, { "epoch": 0.5451851851851852, "grad_norm": 0.9349672850027937, "learning_rate": 9.798258771136973e-06, "loss": 0.0701, "step": 4600 }, { "epoch": 0.5463703703703704, "grad_norm": 0.8796945469030365, "learning_rate": 9.796315423896963e-06, "loss": 0.0714, "step": 4610 }, { "epoch": 0.5475555555555556, "grad_norm": 0.8088398284880879, "learning_rate": 9.794362956050479e-06, "loss": 0.0739, "step": 4620 }, { "epoch": 0.5487407407407408, "grad_norm": 0.8677937210696124, "learning_rate": 9.792401371310305e-06, "loss": 0.0697, "step": 4630 }, { "epoch": 0.5499259259259259, "grad_norm": 0.7349109114121267, "learning_rate": 9.79043067340656e-06, "loss": 0.0696, "step": 4640 }, { "epoch": 0.5511111111111111, "grad_norm": 0.9034556703048897, "learning_rate": 9.788450866086702e-06, "loss": 0.0696, "step": 4650 }, { "epoch": 0.5522962962962963, "grad_norm": 0.8167429360416598, "learning_rate": 9.786461953115503e-06, "loss": 0.0695, "step": 4660 }, { "epoch": 0.5534814814814815, "grad_norm": 0.9066277909499421, "learning_rate": 9.784463938275048e-06, "loss": 0.0725, "step": 4670 }, { "epoch": 0.5546666666666666, "grad_norm": 0.9478972121779468, "learning_rate": 9.78245682536474e-06, "loss": 0.0675, "step": 4680 }, { "epoch": 0.5558518518518518, "grad_norm": 0.9942604137795122, "learning_rate": 9.780440618201272e-06, "loss": 0.0732, "step": 4690 }, { "epoch": 0.557037037037037, "grad_norm": 0.7786745953110245, "learning_rate": 9.778415320618637e-06, "loss": 0.0734, "step": 4700 }, { "epoch": 0.5582222222222222, "grad_norm": 0.8077620341914629, "learning_rate": 9.776380936468116e-06, "loss": 0.0696, "step": 4710 }, { "epoch": 0.5594074074074074, "grad_norm": 0.8404086072361571, "learning_rate": 9.77433746961826e-06, "loss": 0.0714, "step": 4720 }, { "epoch": 0.5605925925925926, "grad_norm": 0.7837106722079612, "learning_rate": 9.7722849239549e-06, "loss": 0.0687, "step": 4730 }, { "epoch": 0.5617777777777778, "grad_norm": 0.7414209083076281, "learning_rate": 9.770223303381128e-06, "loss": 0.0756, "step": 4740 }, { "epoch": 0.562962962962963, "grad_norm": 0.9782075391281517, "learning_rate": 9.768152611817293e-06, "loss": 0.0708, "step": 4750 }, { "epoch": 0.5641481481481482, "grad_norm": 0.9312208904384014, "learning_rate": 9.76607285320099e-06, "loss": 0.0671, "step": 4760 }, { "epoch": 0.5653333333333334, "grad_norm": 0.9306729907565611, "learning_rate": 9.763984031487065e-06, "loss": 0.066, "step": 4770 }, { "epoch": 0.5665185185185185, "grad_norm": 0.8345756945549867, "learning_rate": 9.761886150647588e-06, "loss": 0.0668, "step": 4780 }, { "epoch": 0.5677037037037037, "grad_norm": 0.7504187948360455, "learning_rate": 9.759779214671861e-06, "loss": 0.0704, "step": 4790 }, { "epoch": 0.5688888888888889, "grad_norm": 0.9164332260694077, "learning_rate": 9.757663227566404e-06, "loss": 0.0688, "step": 4800 }, { "epoch": 0.5700740740740741, "grad_norm": 0.9344952704716404, "learning_rate": 9.755538193354949e-06, "loss": 0.0657, "step": 4810 }, { "epoch": 0.5712592592592592, "grad_norm": 0.8379251538993587, "learning_rate": 9.753404116078432e-06, "loss": 0.0654, "step": 4820 }, { "epoch": 0.5724444444444444, "grad_norm": 0.8247241787909821, "learning_rate": 9.751260999794982e-06, "loss": 0.0707, "step": 4830 }, { "epoch": 0.5736296296296296, "grad_norm": 0.8336991393348129, "learning_rate": 9.74910884857992e-06, "loss": 0.0674, "step": 4840 }, { "epoch": 0.5748148148148148, "grad_norm": 0.9535592029047301, "learning_rate": 9.74694766652575e-06, "loss": 0.0719, "step": 4850 }, { "epoch": 0.576, "grad_norm": 0.9352346617374387, "learning_rate": 9.74477745774214e-06, "loss": 0.073, "step": 4860 }, { "epoch": 0.5771851851851851, "grad_norm": 0.6953612203906957, "learning_rate": 9.742598226355933e-06, "loss": 0.0666, "step": 4870 }, { "epoch": 0.5783703703703704, "grad_norm": 0.8851143259469123, "learning_rate": 9.740409976511126e-06, "loss": 0.0691, "step": 4880 }, { "epoch": 0.5795555555555556, "grad_norm": 1.162132947056599, "learning_rate": 9.738212712368858e-06, "loss": 0.0737, "step": 4890 }, { "epoch": 0.5807407407407408, "grad_norm": 0.842069037597675, "learning_rate": 9.736006438107422e-06, "loss": 0.0615, "step": 4900 }, { "epoch": 0.581925925925926, "grad_norm": 1.0245882027540885, "learning_rate": 9.733791157922234e-06, "loss": 0.0687, "step": 4910 }, { "epoch": 0.5831111111111111, "grad_norm": 0.7621347191636244, "learning_rate": 9.731566876025844e-06, "loss": 0.0681, "step": 4920 }, { "epoch": 0.5842962962962963, "grad_norm": 0.8124096002928971, "learning_rate": 9.729333596647915e-06, "loss": 0.0672, "step": 4930 }, { "epoch": 0.5854814814814815, "grad_norm": 0.8258468290284263, "learning_rate": 9.727091324035216e-06, "loss": 0.0611, "step": 4940 }, { "epoch": 0.5866666666666667, "grad_norm": 1.0141042479424183, "learning_rate": 9.724840062451624e-06, "loss": 0.0678, "step": 4950 }, { "epoch": 0.5878518518518518, "grad_norm": 0.8893839668094795, "learning_rate": 9.722579816178107e-06, "loss": 0.0715, "step": 4960 }, { "epoch": 0.589037037037037, "grad_norm": 0.8344641638577025, "learning_rate": 9.720310589512715e-06, "loss": 0.0664, "step": 4970 }, { "epoch": 0.5902222222222222, "grad_norm": 0.7879889786909607, "learning_rate": 9.718032386770582e-06, "loss": 0.0688, "step": 4980 }, { "epoch": 0.5914074074074074, "grad_norm": 0.7925280765836924, "learning_rate": 9.715745212283904e-06, "loss": 0.0675, "step": 4990 }, { "epoch": 0.5925925925925926, "grad_norm": 0.9260270033769327, "learning_rate": 9.713449070401941e-06, "loss": 0.0664, "step": 5000 }, { "epoch": 0.5937777777777777, "grad_norm": 0.9446444431575735, "learning_rate": 9.711143965491003e-06, "loss": 0.0703, "step": 5010 }, { "epoch": 0.5949629629629629, "grad_norm": 1.017346089777206, "learning_rate": 9.708829901934447e-06, "loss": 0.063, "step": 5020 }, { "epoch": 0.5961481481481481, "grad_norm": 0.8271272833663433, "learning_rate": 9.70650688413266e-06, "loss": 0.0664, "step": 5030 }, { "epoch": 0.5973333333333334, "grad_norm": 0.6636146814498104, "learning_rate": 9.704174916503068e-06, "loss": 0.0593, "step": 5040 }, { "epoch": 0.5985185185185186, "grad_norm": 0.8554269321734951, "learning_rate": 9.701834003480101e-06, "loss": 0.0659, "step": 5050 }, { "epoch": 0.5997037037037037, "grad_norm": 0.786425376868664, "learning_rate": 9.699484149515209e-06, "loss": 0.0623, "step": 5060 }, { "epoch": 0.6008888888888889, "grad_norm": 0.9158010551102281, "learning_rate": 9.697125359076842e-06, "loss": 0.0655, "step": 5070 }, { "epoch": 0.6020740740740741, "grad_norm": 0.8378371030609353, "learning_rate": 9.69475763665044e-06, "loss": 0.0653, "step": 5080 }, { "epoch": 0.6032592592592593, "grad_norm": 0.8771324341656606, "learning_rate": 9.692380986738437e-06, "loss": 0.0679, "step": 5090 }, { "epoch": 0.6044444444444445, "grad_norm": 0.890772082009701, "learning_rate": 9.689995413860232e-06, "loss": 0.0681, "step": 5100 }, { "epoch": 0.6056296296296296, "grad_norm": 0.6840219596828194, "learning_rate": 9.6876009225522e-06, "loss": 0.0602, "step": 5110 }, { "epoch": 0.6068148148148148, "grad_norm": 0.8918993932105872, "learning_rate": 9.68519751736767e-06, "loss": 0.0668, "step": 5120 }, { "epoch": 0.608, "grad_norm": 0.7424429000679742, "learning_rate": 9.682785202876926e-06, "loss": 0.0688, "step": 5130 }, { "epoch": 0.6091851851851852, "grad_norm": 0.706883039822154, "learning_rate": 9.680363983667188e-06, "loss": 0.0618, "step": 5140 }, { "epoch": 0.6103703703703703, "grad_norm": 0.7982272736582623, "learning_rate": 9.677933864342617e-06, "loss": 0.0613, "step": 5150 }, { "epoch": 0.6115555555555555, "grad_norm": 0.8670836748387671, "learning_rate": 9.67549484952429e-06, "loss": 0.0631, "step": 5160 }, { "epoch": 0.6127407407407407, "grad_norm": 0.7421959033877474, "learning_rate": 9.673046943850209e-06, "loss": 0.0618, "step": 5170 }, { "epoch": 0.6139259259259259, "grad_norm": 0.8029090636060535, "learning_rate": 9.67059015197527e-06, "loss": 0.064, "step": 5180 }, { "epoch": 0.6151111111111112, "grad_norm": 0.8685429063722038, "learning_rate": 9.66812447857128e-06, "loss": 0.0643, "step": 5190 }, { "epoch": 0.6162962962962963, "grad_norm": 0.860826831107572, "learning_rate": 9.665649928326928e-06, "loss": 0.0674, "step": 5200 }, { "epoch": 0.6174814814814815, "grad_norm": 0.9326389724348948, "learning_rate": 9.663166505947782e-06, "loss": 0.0631, "step": 5210 }, { "epoch": 0.6186666666666667, "grad_norm": 0.8840921628765807, "learning_rate": 9.660674216156285e-06, "loss": 0.0644, "step": 5220 }, { "epoch": 0.6198518518518519, "grad_norm": 0.8092167255352415, "learning_rate": 9.65817306369174e-06, "loss": 0.0649, "step": 5230 }, { "epoch": 0.621037037037037, "grad_norm": 1.0602310809354665, "learning_rate": 9.655663053310304e-06, "loss": 0.0673, "step": 5240 }, { "epoch": 0.6222222222222222, "grad_norm": 0.8539612776032047, "learning_rate": 9.653144189784977e-06, "loss": 0.0652, "step": 5250 }, { "epoch": 0.6234074074074074, "grad_norm": 0.9071711891564866, "learning_rate": 9.650616477905595e-06, "loss": 0.068, "step": 5260 }, { "epoch": 0.6245925925925926, "grad_norm": 0.9274017159065672, "learning_rate": 9.648079922478822e-06, "loss": 0.0641, "step": 5270 }, { "epoch": 0.6257777777777778, "grad_norm": 0.7406516271738932, "learning_rate": 9.645534528328131e-06, "loss": 0.0641, "step": 5280 }, { "epoch": 0.6269629629629629, "grad_norm": 0.7465702088105352, "learning_rate": 9.642980300293814e-06, "loss": 0.0588, "step": 5290 }, { "epoch": 0.6281481481481481, "grad_norm": 0.9777056163743169, "learning_rate": 9.640417243232951e-06, "loss": 0.0613, "step": 5300 }, { "epoch": 0.6293333333333333, "grad_norm": 0.6294669971038668, "learning_rate": 9.637845362019418e-06, "loss": 0.0651, "step": 5310 }, { "epoch": 0.6305185185185185, "grad_norm": 0.8809505962495335, "learning_rate": 9.635264661543867e-06, "loss": 0.0643, "step": 5320 }, { "epoch": 0.6317037037037037, "grad_norm": 0.8631695011826251, "learning_rate": 9.632675146713723e-06, "loss": 0.0689, "step": 5330 }, { "epoch": 0.6328888888888888, "grad_norm": 0.7907244282209989, "learning_rate": 9.630076822453171e-06, "loss": 0.0616, "step": 5340 }, { "epoch": 0.6340740740740741, "grad_norm": 0.8973103523088067, "learning_rate": 9.627469693703149e-06, "loss": 0.0617, "step": 5350 }, { "epoch": 0.6352592592592593, "grad_norm": 0.9475571187902337, "learning_rate": 9.624853765421334e-06, "loss": 0.0656, "step": 5360 }, { "epoch": 0.6364444444444445, "grad_norm": 1.0157216339964799, "learning_rate": 9.62222904258214e-06, "loss": 0.0601, "step": 5370 }, { "epoch": 0.6376296296296297, "grad_norm": 0.9974306627276354, "learning_rate": 9.619595530176707e-06, "loss": 0.0679, "step": 5380 }, { "epoch": 0.6388148148148148, "grad_norm": 0.7794751178790458, "learning_rate": 9.61695323321288e-06, "loss": 0.0597, "step": 5390 }, { "epoch": 0.64, "grad_norm": 0.8625675136062335, "learning_rate": 9.614302156715214e-06, "loss": 0.0629, "step": 5400 }, { "epoch": 0.6411851851851852, "grad_norm": 0.9282341396493997, "learning_rate": 9.611642305724965e-06, "loss": 0.0643, "step": 5410 }, { "epoch": 0.6423703703703704, "grad_norm": 1.065934769988668, "learning_rate": 9.608973685300063e-06, "loss": 0.0632, "step": 5420 }, { "epoch": 0.6435555555555555, "grad_norm": 0.8388865553494513, "learning_rate": 9.606296300515122e-06, "loss": 0.0664, "step": 5430 }, { "epoch": 0.6447407407407407, "grad_norm": 0.8548715708167248, "learning_rate": 9.603610156461415e-06, "loss": 0.0627, "step": 5440 }, { "epoch": 0.6459259259259259, "grad_norm": 0.854245229233928, "learning_rate": 9.600915258246884e-06, "loss": 0.0626, "step": 5450 }, { "epoch": 0.6471111111111111, "grad_norm": 0.8890221944599039, "learning_rate": 9.598211610996104e-06, "loss": 0.0604, "step": 5460 }, { "epoch": 0.6482962962962963, "grad_norm": 0.8734622308738257, "learning_rate": 9.595499219850295e-06, "loss": 0.0616, "step": 5470 }, { "epoch": 0.6494814814814814, "grad_norm": 0.7953487650138661, "learning_rate": 9.5927780899673e-06, "loss": 0.0606, "step": 5480 }, { "epoch": 0.6506666666666666, "grad_norm": 0.6840634039323507, "learning_rate": 9.590048226521587e-06, "loss": 0.0657, "step": 5490 }, { "epoch": 0.6518518518518519, "grad_norm": 0.744927913566696, "learning_rate": 9.587309634704219e-06, "loss": 0.0605, "step": 5500 }, { "epoch": 0.6530370370370371, "grad_norm": 0.8445654490129404, "learning_rate": 9.584562319722868e-06, "loss": 0.0603, "step": 5510 }, { "epoch": 0.6542222222222223, "grad_norm": 0.8275336436590583, "learning_rate": 9.58180628680179e-06, "loss": 0.0601, "step": 5520 }, { "epoch": 0.6554074074074074, "grad_norm": 0.9520322283595853, "learning_rate": 9.579041541181816e-06, "loss": 0.0604, "step": 5530 }, { "epoch": 0.6565925925925926, "grad_norm": 0.816709186599619, "learning_rate": 9.576268088120354e-06, "loss": 0.0585, "step": 5540 }, { "epoch": 0.6577777777777778, "grad_norm": 0.7421032800977814, "learning_rate": 9.573485932891356e-06, "loss": 0.062, "step": 5550 }, { "epoch": 0.658962962962963, "grad_norm": 0.7457383791808052, "learning_rate": 9.570695080785333e-06, "loss": 0.0557, "step": 5560 }, { "epoch": 0.6601481481481482, "grad_norm": 0.738277067476009, "learning_rate": 9.567895537109331e-06, "loss": 0.0633, "step": 5570 }, { "epoch": 0.6613333333333333, "grad_norm": 0.7311474578253814, "learning_rate": 9.56508730718692e-06, "loss": 0.0618, "step": 5580 }, { "epoch": 0.6625185185185185, "grad_norm": 0.6122012090202148, "learning_rate": 9.562270396358196e-06, "loss": 0.0582, "step": 5590 }, { "epoch": 0.6637037037037037, "grad_norm": 0.6528632820523848, "learning_rate": 9.559444809979754e-06, "loss": 0.0589, "step": 5600 }, { "epoch": 0.6648888888888889, "grad_norm": 0.7232679190902909, "learning_rate": 9.556610553424692e-06, "loss": 0.061, "step": 5610 }, { "epoch": 0.666074074074074, "grad_norm": 0.7414024599892046, "learning_rate": 9.553767632082588e-06, "loss": 0.0622, "step": 5620 }, { "epoch": 0.6672592592592592, "grad_norm": 0.965476370726474, "learning_rate": 9.550916051359506e-06, "loss": 0.0598, "step": 5630 }, { "epoch": 0.6684444444444444, "grad_norm": 0.8379623119712973, "learning_rate": 9.548055816677971e-06, "loss": 0.0557, "step": 5640 }, { "epoch": 0.6696296296296296, "grad_norm": 0.8585776947199584, "learning_rate": 9.545186933476964e-06, "loss": 0.062, "step": 5650 }, { "epoch": 0.6708148148148149, "grad_norm": 0.7795756023660245, "learning_rate": 9.542309407211914e-06, "loss": 0.0527, "step": 5660 }, { "epoch": 0.672, "grad_norm": 0.778102865818906, "learning_rate": 9.539423243354687e-06, "loss": 0.063, "step": 5670 }, { "epoch": 0.6731851851851852, "grad_norm": 0.7211796374650427, "learning_rate": 9.536528447393568e-06, "loss": 0.0607, "step": 5680 }, { "epoch": 0.6743703703703704, "grad_norm": 0.9290785924371383, "learning_rate": 9.533625024833264e-06, "loss": 0.0592, "step": 5690 }, { "epoch": 0.6755555555555556, "grad_norm": 0.8015010809685191, "learning_rate": 9.53071298119488e-06, "loss": 0.0584, "step": 5700 }, { "epoch": 0.6767407407407408, "grad_norm": 0.7634909938889708, "learning_rate": 9.527792322015918e-06, "loss": 0.0632, "step": 5710 }, { "epoch": 0.6779259259259259, "grad_norm": 0.7005879890355755, "learning_rate": 9.524863052850266e-06, "loss": 0.0596, "step": 5720 }, { "epoch": 0.6791111111111111, "grad_norm": 0.7967970526813604, "learning_rate": 9.521925179268178e-06, "loss": 0.0607, "step": 5730 }, { "epoch": 0.6802962962962963, "grad_norm": 0.9061828273780305, "learning_rate": 9.518978706856275e-06, "loss": 0.064, "step": 5740 }, { "epoch": 0.6814814814814815, "grad_norm": 0.8325329934176016, "learning_rate": 9.516023641217527e-06, "loss": 0.054, "step": 5750 }, { "epoch": 0.6826666666666666, "grad_norm": 0.9647904060386737, "learning_rate": 9.513059987971245e-06, "loss": 0.058, "step": 5760 }, { "epoch": 0.6838518518518518, "grad_norm": 0.6361162714731206, "learning_rate": 9.510087752753073e-06, "loss": 0.0598, "step": 5770 }, { "epoch": 0.685037037037037, "grad_norm": 0.6651870206764534, "learning_rate": 9.507106941214968e-06, "loss": 0.0602, "step": 5780 }, { "epoch": 0.6862222222222222, "grad_norm": 0.879620483371766, "learning_rate": 9.504117559025204e-06, "loss": 0.0607, "step": 5790 }, { "epoch": 0.6874074074074074, "grad_norm": 0.6384981281798937, "learning_rate": 9.501119611868346e-06, "loss": 0.0552, "step": 5800 }, { "epoch": 0.6885925925925926, "grad_norm": 0.8148090652077672, "learning_rate": 9.49811310544525e-06, "loss": 0.0517, "step": 5810 }, { "epoch": 0.6897777777777778, "grad_norm": 0.6261831834625272, "learning_rate": 9.495098045473043e-06, "loss": 0.0576, "step": 5820 }, { "epoch": 0.690962962962963, "grad_norm": 0.750739861082205, "learning_rate": 9.492074437685126e-06, "loss": 0.0593, "step": 5830 }, { "epoch": 0.6921481481481482, "grad_norm": 0.6235390264038503, "learning_rate": 9.489042287831147e-06, "loss": 0.0601, "step": 5840 }, { "epoch": 0.6933333333333334, "grad_norm": 0.6721873408161235, "learning_rate": 9.486001601677e-06, "loss": 0.0567, "step": 5850 }, { "epoch": 0.6945185185185185, "grad_norm": 0.7956788166241592, "learning_rate": 9.482952385004809e-06, "loss": 0.0613, "step": 5860 }, { "epoch": 0.6957037037037037, "grad_norm": 0.7240587880855817, "learning_rate": 9.479894643612926e-06, "loss": 0.0586, "step": 5870 }, { "epoch": 0.6968888888888889, "grad_norm": 0.5945392493921163, "learning_rate": 9.476828383315907e-06, "loss": 0.0562, "step": 5880 }, { "epoch": 0.6980740740740741, "grad_norm": 0.7210743449557837, "learning_rate": 9.47375360994451e-06, "loss": 0.054, "step": 5890 }, { "epoch": 0.6992592592592592, "grad_norm": 0.8592775365194965, "learning_rate": 9.470670329345682e-06, "loss": 0.0602, "step": 5900 }, { "epoch": 0.7004444444444444, "grad_norm": 0.7680147220283731, "learning_rate": 9.467578547382545e-06, "loss": 0.0604, "step": 5910 }, { "epoch": 0.7016296296296296, "grad_norm": 0.930965909733838, "learning_rate": 9.464478269934391e-06, "loss": 0.0597, "step": 5920 }, { "epoch": 0.7028148148148148, "grad_norm": 0.8465943226365384, "learning_rate": 9.46136950289666e-06, "loss": 0.0564, "step": 5930 }, { "epoch": 0.704, "grad_norm": 0.8230315120793787, "learning_rate": 9.458252252180944e-06, "loss": 0.0568, "step": 5940 }, { "epoch": 0.7051851851851851, "grad_norm": 0.7940851445240408, "learning_rate": 9.455126523714962e-06, "loss": 0.0571, "step": 5950 }, { "epoch": 0.7063703703703703, "grad_norm": 0.9072359763710295, "learning_rate": 9.451992323442557e-06, "loss": 0.062, "step": 5960 }, { "epoch": 0.7075555555555556, "grad_norm": 0.8995851727063644, "learning_rate": 9.448849657323675e-06, "loss": 0.0586, "step": 5970 }, { "epoch": 0.7087407407407408, "grad_norm": 0.7970966212102009, "learning_rate": 9.445698531334374e-06, "loss": 0.0583, "step": 5980 }, { "epoch": 0.709925925925926, "grad_norm": 1.0101655604060642, "learning_rate": 9.442538951466786e-06, "loss": 0.0535, "step": 5990 }, { "epoch": 0.7111111111111111, "grad_norm": 0.8269495414481755, "learning_rate": 9.439370923729124e-06, "loss": 0.0565, "step": 6000 }, { "epoch": 0.7122962962962963, "grad_norm": 0.9010994255221992, "learning_rate": 9.43619445414567e-06, "loss": 0.0604, "step": 6010 }, { "epoch": 0.7134814814814815, "grad_norm": 0.6082660417089522, "learning_rate": 9.433009548756746e-06, "loss": 0.0578, "step": 6020 }, { "epoch": 0.7146666666666667, "grad_norm": 0.9597698816797973, "learning_rate": 9.429816213618732e-06, "loss": 0.0576, "step": 6030 }, { "epoch": 0.7158518518518519, "grad_norm": 0.7970908735161462, "learning_rate": 9.426614454804026e-06, "loss": 0.0608, "step": 6040 }, { "epoch": 0.717037037037037, "grad_norm": 0.7567449036108198, "learning_rate": 9.423404278401047e-06, "loss": 0.0542, "step": 6050 }, { "epoch": 0.7182222222222222, "grad_norm": 0.8624178532393584, "learning_rate": 9.420185690514222e-06, "loss": 0.0542, "step": 6060 }, { "epoch": 0.7194074074074074, "grad_norm": 0.6619442585433717, "learning_rate": 9.416958697263976e-06, "loss": 0.0592, "step": 6070 }, { "epoch": 0.7205925925925926, "grad_norm": 0.8855994863229733, "learning_rate": 9.413723304786709e-06, "loss": 0.0579, "step": 6080 }, { "epoch": 0.7217777777777777, "grad_norm": 0.8505929203748652, "learning_rate": 9.410479519234803e-06, "loss": 0.0537, "step": 6090 }, { "epoch": 0.7229629629629629, "grad_norm": 0.9112279837425967, "learning_rate": 9.407227346776592e-06, "loss": 0.0554, "step": 6100 }, { "epoch": 0.7241481481481481, "grad_norm": 1.0305418820556183, "learning_rate": 9.403966793596363e-06, "loss": 0.0582, "step": 6110 }, { "epoch": 0.7253333333333334, "grad_norm": 0.7249236387943062, "learning_rate": 9.40069786589434e-06, "loss": 0.0602, "step": 6120 }, { "epoch": 0.7265185185185186, "grad_norm": 0.8867392695144658, "learning_rate": 9.397420569886666e-06, "loss": 0.0596, "step": 6130 }, { "epoch": 0.7277037037037037, "grad_norm": 0.666491885927328, "learning_rate": 9.394134911805406e-06, "loss": 0.0565, "step": 6140 }, { "epoch": 0.7288888888888889, "grad_norm": 0.7448308217658117, "learning_rate": 9.390840897898519e-06, "loss": 0.0547, "step": 6150 }, { "epoch": 0.7300740740740741, "grad_norm": 0.8831469556013009, "learning_rate": 9.387538534429856e-06, "loss": 0.0596, "step": 6160 }, { "epoch": 0.7312592592592593, "grad_norm": 0.7929783753785901, "learning_rate": 9.384227827679147e-06, "loss": 0.0566, "step": 6170 }, { "epoch": 0.7324444444444445, "grad_norm": 0.7885426096978996, "learning_rate": 9.380908783941985e-06, "loss": 0.0593, "step": 6180 }, { "epoch": 0.7336296296296296, "grad_norm": 0.757031908528628, "learning_rate": 9.377581409529814e-06, "loss": 0.0557, "step": 6190 }, { "epoch": 0.7348148148148148, "grad_norm": 0.663216098080713, "learning_rate": 9.37424571076993e-06, "loss": 0.053, "step": 6200 }, { "epoch": 0.736, "grad_norm": 1.0290396618979418, "learning_rate": 9.370901694005444e-06, "loss": 0.0617, "step": 6210 }, { "epoch": 0.7371851851851852, "grad_norm": 0.69489688132048, "learning_rate": 9.367549365595294e-06, "loss": 0.052, "step": 6220 }, { "epoch": 0.7383703703703703, "grad_norm": 0.6671955448491652, "learning_rate": 9.36418873191422e-06, "loss": 0.0535, "step": 6230 }, { "epoch": 0.7395555555555555, "grad_norm": 0.6981340903029717, "learning_rate": 9.36081979935276e-06, "loss": 0.0558, "step": 6240 }, { "epoch": 0.7407407407407407, "grad_norm": 0.8665073691852571, "learning_rate": 9.357442574317227e-06, "loss": 0.0608, "step": 6250 }, { "epoch": 0.7419259259259259, "grad_norm": 0.6724644707186217, "learning_rate": 9.354057063229703e-06, "loss": 0.0609, "step": 6260 }, { "epoch": 0.7431111111111111, "grad_norm": 0.8985662275435259, "learning_rate": 9.350663272528032e-06, "loss": 0.0576, "step": 6270 }, { "epoch": 0.7442962962962963, "grad_norm": 0.7712484506703242, "learning_rate": 9.347261208665795e-06, "loss": 0.0558, "step": 6280 }, { "epoch": 0.7454814814814815, "grad_norm": 0.7735773295388982, "learning_rate": 9.343850878112313e-06, "loss": 0.0533, "step": 6290 }, { "epoch": 0.7466666666666667, "grad_norm": 0.7736002632612686, "learning_rate": 9.340432287352621e-06, "loss": 0.0541, "step": 6300 }, { "epoch": 0.7478518518518519, "grad_norm": 0.6405293657691001, "learning_rate": 9.337005442887464e-06, "loss": 0.0551, "step": 6310 }, { "epoch": 0.7490370370370371, "grad_norm": 0.8045092278431043, "learning_rate": 9.33357035123328e-06, "loss": 0.055, "step": 6320 }, { "epoch": 0.7502222222222222, "grad_norm": 0.7938763565949336, "learning_rate": 9.330127018922195e-06, "loss": 0.0524, "step": 6330 }, { "epoch": 0.7514074074074074, "grad_norm": 0.684880227632773, "learning_rate": 9.326675452501997e-06, "loss": 0.0508, "step": 6340 }, { "epoch": 0.7525925925925926, "grad_norm": 0.8629082246317131, "learning_rate": 9.323215658536141e-06, "loss": 0.0567, "step": 6350 }, { "epoch": 0.7537777777777778, "grad_norm": 0.9056361754163381, "learning_rate": 9.319747643603721e-06, "loss": 0.0521, "step": 6360 }, { "epoch": 0.754962962962963, "grad_norm": 1.0007224812329896, "learning_rate": 9.316271414299464e-06, "loss": 0.0551, "step": 6370 }, { "epoch": 0.7561481481481481, "grad_norm": 0.8027475866803074, "learning_rate": 9.312786977233722e-06, "loss": 0.0567, "step": 6380 }, { "epoch": 0.7573333333333333, "grad_norm": 0.8213424296886762, "learning_rate": 9.309294339032451e-06, "loss": 0.0568, "step": 6390 }, { "epoch": 0.7585185185185185, "grad_norm": 0.7220031603090468, "learning_rate": 9.305793506337205e-06, "loss": 0.0542, "step": 6400 }, { "epoch": 0.7597037037037037, "grad_norm": 0.7961328175054693, "learning_rate": 9.302284485805114e-06, "loss": 0.0539, "step": 6410 }, { "epoch": 0.7608888888888888, "grad_norm": 0.8014918176680914, "learning_rate": 9.298767284108884e-06, "loss": 0.0501, "step": 6420 }, { "epoch": 0.7620740740740741, "grad_norm": 0.6541465638138857, "learning_rate": 9.295241907936779e-06, "loss": 0.0546, "step": 6430 }, { "epoch": 0.7632592592592593, "grad_norm": 0.7442494535751223, "learning_rate": 9.291708363992602e-06, "loss": 0.0565, "step": 6440 }, { "epoch": 0.7644444444444445, "grad_norm": 0.7306920048504332, "learning_rate": 9.288166658995694e-06, "loss": 0.0564, "step": 6450 }, { "epoch": 0.7656296296296297, "grad_norm": 0.8457138315981726, "learning_rate": 9.284616799680912e-06, "loss": 0.0561, "step": 6460 }, { "epoch": 0.7668148148148148, "grad_norm": 0.8152511601451743, "learning_rate": 9.281058792798615e-06, "loss": 0.0521, "step": 6470 }, { "epoch": 0.768, "grad_norm": 0.9140146783731141, "learning_rate": 9.277492645114662e-06, "loss": 0.055, "step": 6480 }, { "epoch": 0.7691851851851852, "grad_norm": 0.7887229450791052, "learning_rate": 9.273918363410391e-06, "loss": 0.0573, "step": 6490 }, { "epoch": 0.7703703703703704, "grad_norm": 0.7957294730122749, "learning_rate": 9.270335954482601e-06, "loss": 0.05, "step": 6500 }, { "epoch": 0.7715555555555556, "grad_norm": 0.6453171802853228, "learning_rate": 9.266745425143556e-06, "loss": 0.0557, "step": 6510 }, { "epoch": 0.7727407407407407, "grad_norm": 0.6905322452593332, "learning_rate": 9.263146782220956e-06, "loss": 0.0529, "step": 6520 }, { "epoch": 0.7739259259259259, "grad_norm": 1.015803321745992, "learning_rate": 9.259540032557927e-06, "loss": 0.0555, "step": 6530 }, { "epoch": 0.7751111111111111, "grad_norm": 0.8156885650789713, "learning_rate": 9.255925183013016e-06, "loss": 0.0549, "step": 6540 }, { "epoch": 0.7762962962962963, "grad_norm": 0.8153343743055421, "learning_rate": 9.25230224046017e-06, "loss": 0.0541, "step": 6550 }, { "epoch": 0.7774814814814814, "grad_norm": 0.8836847487251168, "learning_rate": 9.248671211788727e-06, "loss": 0.0548, "step": 6560 }, { "epoch": 0.7786666666666666, "grad_norm": 0.6607179937708231, "learning_rate": 9.2450321039034e-06, "loss": 0.0514, "step": 6570 }, { "epoch": 0.7798518518518519, "grad_norm": 0.6578535058879177, "learning_rate": 9.241384923724263e-06, "loss": 0.0543, "step": 6580 }, { "epoch": 0.7810370370370371, "grad_norm": 0.9149304817403267, "learning_rate": 9.237729678186747e-06, "loss": 0.0548, "step": 6590 }, { "epoch": 0.7822222222222223, "grad_norm": 0.6708209980140788, "learning_rate": 9.234066374241615e-06, "loss": 0.0483, "step": 6600 }, { "epoch": 0.7834074074074074, "grad_norm": 0.6969816513837376, "learning_rate": 9.230395018854952e-06, "loss": 0.0512, "step": 6610 }, { "epoch": 0.7845925925925926, "grad_norm": 0.7745192902944739, "learning_rate": 9.22671561900816e-06, "loss": 0.0566, "step": 6620 }, { "epoch": 0.7857777777777778, "grad_norm": 0.7785851625639385, "learning_rate": 9.22302818169793e-06, "loss": 0.0524, "step": 6630 }, { "epoch": 0.786962962962963, "grad_norm": 0.7498071517671575, "learning_rate": 9.219332713936247e-06, "loss": 0.0509, "step": 6640 }, { "epoch": 0.7881481481481482, "grad_norm": 0.6689264312381505, "learning_rate": 9.215629222750356e-06, "loss": 0.0556, "step": 6650 }, { "epoch": 0.7893333333333333, "grad_norm": 1.0352593385270017, "learning_rate": 9.211917715182766e-06, "loss": 0.0536, "step": 6660 }, { "epoch": 0.7905185185185185, "grad_norm": 0.7877134132861506, "learning_rate": 9.208198198291227e-06, "loss": 0.0502, "step": 6670 }, { "epoch": 0.7917037037037037, "grad_norm": 0.7863702253596683, "learning_rate": 9.204470679148721e-06, "loss": 0.055, "step": 6680 }, { "epoch": 0.7928888888888889, "grad_norm": 0.7892872655036743, "learning_rate": 9.200735164843447e-06, "loss": 0.0542, "step": 6690 }, { "epoch": 0.794074074074074, "grad_norm": 0.9191341736448989, "learning_rate": 9.196991662478807e-06, "loss": 0.0488, "step": 6700 }, { "epoch": 0.7952592592592592, "grad_norm": 0.8580919800959579, "learning_rate": 9.193240179173392e-06, "loss": 0.0568, "step": 6710 }, { "epoch": 0.7964444444444444, "grad_norm": 0.6818725359696742, "learning_rate": 9.18948072206097e-06, "loss": 0.0555, "step": 6720 }, { "epoch": 0.7976296296296296, "grad_norm": 0.7314317034608175, "learning_rate": 9.185713298290475e-06, "loss": 0.0529, "step": 6730 }, { "epoch": 0.7988148148148149, "grad_norm": 0.7009943012989422, "learning_rate": 9.181937915025985e-06, "loss": 0.0521, "step": 6740 }, { "epoch": 0.8, "grad_norm": 0.6782388236944448, "learning_rate": 9.178154579446713e-06, "loss": 0.0528, "step": 6750 }, { "epoch": 0.8011851851851852, "grad_norm": 0.593549728311959, "learning_rate": 9.174363298747005e-06, "loss": 0.05, "step": 6760 }, { "epoch": 0.8023703703703704, "grad_norm": 0.5511569981075917, "learning_rate": 9.170564080136301e-06, "loss": 0.0535, "step": 6770 }, { "epoch": 0.8035555555555556, "grad_norm": 0.8915113836903726, "learning_rate": 9.166756930839144e-06, "loss": 0.0539, "step": 6780 }, { "epoch": 0.8047407407407408, "grad_norm": 0.8222043889389753, "learning_rate": 9.162941858095156e-06, "loss": 0.0519, "step": 6790 }, { "epoch": 0.8059259259259259, "grad_norm": 0.8015643043748327, "learning_rate": 9.159118869159028e-06, "loss": 0.049, "step": 6800 }, { "epoch": 0.8071111111111111, "grad_norm": 0.676674015435569, "learning_rate": 9.155287971300498e-06, "loss": 0.0525, "step": 6810 }, { "epoch": 0.8082962962962963, "grad_norm": 0.6182134346053748, "learning_rate": 9.151449171804353e-06, "loss": 0.0495, "step": 6820 }, { "epoch": 0.8094814814814815, "grad_norm": 0.753963849192247, "learning_rate": 9.147602477970396e-06, "loss": 0.0529, "step": 6830 }, { "epoch": 0.8106666666666666, "grad_norm": 1.2623181497669773, "learning_rate": 9.143747897113449e-06, "loss": 0.0499, "step": 6840 }, { "epoch": 0.8118518518518518, "grad_norm": 0.606596694809615, "learning_rate": 9.139885436563328e-06, "loss": 0.0527, "step": 6850 }, { "epoch": 0.813037037037037, "grad_norm": 0.8800625178574711, "learning_rate": 9.136015103664835e-06, "loss": 0.0486, "step": 6860 }, { "epoch": 0.8142222222222222, "grad_norm": 0.5807539928925268, "learning_rate": 9.132136905777742e-06, "loss": 0.0493, "step": 6870 }, { "epoch": 0.8154074074074074, "grad_norm": 0.6103744082962793, "learning_rate": 9.128250850276774e-06, "loss": 0.0549, "step": 6880 }, { "epoch": 0.8165925925925926, "grad_norm": 0.8239434055661777, "learning_rate": 9.1243569445516e-06, "loss": 0.0491, "step": 6890 }, { "epoch": 0.8177777777777778, "grad_norm": 0.7240930108958012, "learning_rate": 9.120455196006816e-06, "loss": 0.0555, "step": 6900 }, { "epoch": 0.818962962962963, "grad_norm": 0.87815896354831, "learning_rate": 9.116545612061935e-06, "loss": 0.0518, "step": 6910 }, { "epoch": 0.8201481481481482, "grad_norm": 0.605418961984683, "learning_rate": 9.112628200151366e-06, "loss": 0.0519, "step": 6920 }, { "epoch": 0.8213333333333334, "grad_norm": 0.8333390981607123, "learning_rate": 9.108702967724407e-06, "loss": 0.0502, "step": 6930 }, { "epoch": 0.8225185185185185, "grad_norm": 0.8088889987293015, "learning_rate": 9.10476992224522e-06, "loss": 0.0533, "step": 6940 }, { "epoch": 0.8237037037037037, "grad_norm": 0.8696568790843807, "learning_rate": 9.100829071192837e-06, "loss": 0.0529, "step": 6950 }, { "epoch": 0.8248888888888889, "grad_norm": 0.7485286739334551, "learning_rate": 9.096880422061116e-06, "loss": 0.0555, "step": 6960 }, { "epoch": 0.8260740740740741, "grad_norm": 0.7312342760803509, "learning_rate": 9.09292398235876e-06, "loss": 0.0487, "step": 6970 }, { "epoch": 0.8272592592592592, "grad_norm": 0.9295771493515899, "learning_rate": 9.088959759609278e-06, "loss": 0.0516, "step": 6980 }, { "epoch": 0.8284444444444444, "grad_norm": 0.7163715080606454, "learning_rate": 9.08498776135098e-06, "loss": 0.0515, "step": 6990 }, { "epoch": 0.8296296296296296, "grad_norm": 0.7376926223849972, "learning_rate": 9.081007995136964e-06, "loss": 0.0546, "step": 7000 }, { "epoch": 0.8308148148148148, "grad_norm": 0.7977251816096612, "learning_rate": 9.077020468535093e-06, "loss": 0.0512, "step": 7010 }, { "epoch": 0.832, "grad_norm": 0.7462915664446347, "learning_rate": 9.073025189128e-06, "loss": 0.0557, "step": 7020 }, { "epoch": 0.8331851851851851, "grad_norm": 0.7898958529684861, "learning_rate": 9.069022164513044e-06, "loss": 0.0542, "step": 7030 }, { "epoch": 0.8343703703703703, "grad_norm": 0.8555896594741408, "learning_rate": 9.065011402302327e-06, "loss": 0.0538, "step": 7040 }, { "epoch": 0.8355555555555556, "grad_norm": 0.6064430590889784, "learning_rate": 9.060992910122656e-06, "loss": 0.0487, "step": 7050 }, { "epoch": 0.8367407407407408, "grad_norm": 0.7421301453000255, "learning_rate": 9.05696669561554e-06, "loss": 0.0499, "step": 7060 }, { "epoch": 0.837925925925926, "grad_norm": 0.7454771545342748, "learning_rate": 9.052932766437173e-06, "loss": 0.0484, "step": 7070 }, { "epoch": 0.8391111111111111, "grad_norm": 0.5933470949265227, "learning_rate": 9.048891130258417e-06, "loss": 0.0503, "step": 7080 }, { "epoch": 0.8402962962962963, "grad_norm": 0.8018042199892028, "learning_rate": 9.044841794764791e-06, "loss": 0.0523, "step": 7090 }, { "epoch": 0.8414814814814815, "grad_norm": 0.8384064614127551, "learning_rate": 9.040784767656456e-06, "loss": 0.0543, "step": 7100 }, { "epoch": 0.8426666666666667, "grad_norm": 0.8268764329768402, "learning_rate": 9.036720056648197e-06, "loss": 0.0498, "step": 7110 }, { "epoch": 0.8438518518518519, "grad_norm": 0.6490970755638987, "learning_rate": 9.032647669469413e-06, "loss": 0.055, "step": 7120 }, { "epoch": 0.845037037037037, "grad_norm": 0.7124008450897088, "learning_rate": 9.028567613864098e-06, "loss": 0.0485, "step": 7130 }, { "epoch": 0.8462222222222222, "grad_norm": 0.6883514513585333, "learning_rate": 9.024479897590828e-06, "loss": 0.0491, "step": 7140 }, { "epoch": 0.8474074074074074, "grad_norm": 0.7562654756583114, "learning_rate": 9.020384528422748e-06, "loss": 0.0505, "step": 7150 }, { "epoch": 0.8485925925925926, "grad_norm": 0.6423243603145317, "learning_rate": 9.016281514147556e-06, "loss": 0.0507, "step": 7160 }, { "epoch": 0.8497777777777777, "grad_norm": 0.6163777727950482, "learning_rate": 9.012170862567485e-06, "loss": 0.0514, "step": 7170 }, { "epoch": 0.8509629629629629, "grad_norm": 0.7436762834591052, "learning_rate": 9.008052581499294e-06, "loss": 0.0511, "step": 7180 }, { "epoch": 0.8521481481481481, "grad_norm": 0.6176416871335711, "learning_rate": 9.003926678774246e-06, "loss": 0.0474, "step": 7190 }, { "epoch": 0.8533333333333334, "grad_norm": 0.7159250656813492, "learning_rate": 8.999793162238105e-06, "loss": 0.0501, "step": 7200 }, { "epoch": 0.8545185185185186, "grad_norm": 0.7201514984437982, "learning_rate": 8.995652039751103e-06, "loss": 0.0523, "step": 7210 }, { "epoch": 0.8557037037037037, "grad_norm": 0.9470419359503877, "learning_rate": 8.991503319187944e-06, "loss": 0.0512, "step": 7220 }, { "epoch": 0.8568888888888889, "grad_norm": 0.6849997793505873, "learning_rate": 8.987347008437776e-06, "loss": 0.048, "step": 7230 }, { "epoch": 0.8580740740740741, "grad_norm": 0.8198002348707787, "learning_rate": 8.983183115404181e-06, "loss": 0.0489, "step": 7240 }, { "epoch": 0.8592592592592593, "grad_norm": 0.7173565076463503, "learning_rate": 8.979011648005163e-06, "loss": 0.0538, "step": 7250 }, { "epoch": 0.8604444444444445, "grad_norm": 0.7974066627644381, "learning_rate": 8.97483261417312e-06, "loss": 0.0464, "step": 7260 }, { "epoch": 0.8616296296296296, "grad_norm": 0.6868280015809256, "learning_rate": 8.970646021854854e-06, "loss": 0.0482, "step": 7270 }, { "epoch": 0.8628148148148148, "grad_norm": 0.5837185009688398, "learning_rate": 8.96645187901152e-06, "loss": 0.0529, "step": 7280 }, { "epoch": 0.864, "grad_norm": 0.5882804649176211, "learning_rate": 8.962250193618649e-06, "loss": 0.0505, "step": 7290 }, { "epoch": 0.8651851851851852, "grad_norm": 0.7250899181586955, "learning_rate": 8.958040973666102e-06, "loss": 0.0464, "step": 7300 }, { "epoch": 0.8663703703703703, "grad_norm": 0.7274661399152923, "learning_rate": 8.95382422715808e-06, "loss": 0.0523, "step": 7310 }, { "epoch": 0.8675555555555555, "grad_norm": 0.6305577005737798, "learning_rate": 8.94959996211308e-06, "loss": 0.0501, "step": 7320 }, { "epoch": 0.8687407407407407, "grad_norm": 0.7384466431817232, "learning_rate": 8.945368186563913e-06, "loss": 0.0481, "step": 7330 }, { "epoch": 0.8699259259259259, "grad_norm": 1.0061465676366388, "learning_rate": 8.94112890855766e-06, "loss": 0.0484, "step": 7340 }, { "epoch": 0.8711111111111111, "grad_norm": 0.9107812967480566, "learning_rate": 8.936882136155676e-06, "loss": 0.053, "step": 7350 }, { "epoch": 0.8722962962962963, "grad_norm": 0.7831745304784927, "learning_rate": 8.932627877433561e-06, "loss": 0.0476, "step": 7360 }, { "epoch": 0.8734814814814815, "grad_norm": 0.5933278625535517, "learning_rate": 8.928366140481159e-06, "loss": 0.0459, "step": 7370 }, { "epoch": 0.8746666666666667, "grad_norm": 0.7309816650853034, "learning_rate": 8.924096933402524e-06, "loss": 0.0518, "step": 7380 }, { "epoch": 0.8758518518518519, "grad_norm": 0.619390213132588, "learning_rate": 8.919820264315922e-06, "loss": 0.049, "step": 7390 }, { "epoch": 0.8770370370370371, "grad_norm": 0.5938431253568528, "learning_rate": 8.915536141353808e-06, "loss": 0.05, "step": 7400 }, { "epoch": 0.8782222222222222, "grad_norm": 0.5661113537053553, "learning_rate": 8.911244572662813e-06, "loss": 0.0468, "step": 7410 }, { "epoch": 0.8794074074074074, "grad_norm": 0.742574427628141, "learning_rate": 8.90694556640372e-06, "loss": 0.0474, "step": 7420 }, { "epoch": 0.8805925925925926, "grad_norm": 0.7639686698143588, "learning_rate": 8.90263913075146e-06, "loss": 0.0476, "step": 7430 }, { "epoch": 0.8817777777777778, "grad_norm": 0.6757283735940369, "learning_rate": 8.898325273895094e-06, "loss": 0.0448, "step": 7440 }, { "epoch": 0.882962962962963, "grad_norm": 0.49264777167027235, "learning_rate": 8.894004004037788e-06, "loss": 0.0519, "step": 7450 }, { "epoch": 0.8841481481481481, "grad_norm": 0.6126128955995369, "learning_rate": 8.889675329396812e-06, "loss": 0.0477, "step": 7460 }, { "epoch": 0.8853333333333333, "grad_norm": 0.7592480310161118, "learning_rate": 8.885339258203511e-06, "loss": 0.0489, "step": 7470 }, { "epoch": 0.8865185185185185, "grad_norm": 0.8556814725564063, "learning_rate": 8.880995798703299e-06, "loss": 0.0501, "step": 7480 }, { "epoch": 0.8877037037037037, "grad_norm": 0.6562343067417394, "learning_rate": 8.876644959155635e-06, "loss": 0.0488, "step": 7490 }, { "epoch": 0.8888888888888888, "grad_norm": 0.5545104797264164, "learning_rate": 8.872286747834018e-06, "loss": 0.0497, "step": 7500 }, { "epoch": 0.8900740740740741, "grad_norm": 0.6434601123386668, "learning_rate": 8.867921173025959e-06, "loss": 0.0459, "step": 7510 }, { "epoch": 0.8912592592592593, "grad_norm": 0.7344293502208636, "learning_rate": 8.863548243032977e-06, "loss": 0.0438, "step": 7520 }, { "epoch": 0.8924444444444445, "grad_norm": 0.715815454504353, "learning_rate": 8.859167966170574e-06, "loss": 0.0438, "step": 7530 }, { "epoch": 0.8936296296296297, "grad_norm": 0.5894308972856338, "learning_rate": 8.854780350768225e-06, "loss": 0.0453, "step": 7540 }, { "epoch": 0.8948148148148148, "grad_norm": 0.7591298649958523, "learning_rate": 8.850385405169352e-06, "loss": 0.0498, "step": 7550 }, { "epoch": 0.896, "grad_norm": 0.682421297422223, "learning_rate": 8.845983137731326e-06, "loss": 0.0517, "step": 7560 }, { "epoch": 0.8971851851851852, "grad_norm": 0.7864052816922432, "learning_rate": 8.84157355682544e-06, "loss": 0.0517, "step": 7570 }, { "epoch": 0.8983703703703704, "grad_norm": 0.6096071342284556, "learning_rate": 8.837156670836888e-06, "loss": 0.0488, "step": 7580 }, { "epoch": 0.8995555555555556, "grad_norm": 0.6836862920275314, "learning_rate": 8.832732488164761e-06, "loss": 0.0461, "step": 7590 }, { "epoch": 0.9007407407407407, "grad_norm": 0.8022104617752468, "learning_rate": 8.82830101722202e-06, "loss": 0.0493, "step": 7600 }, { "epoch": 0.9019259259259259, "grad_norm": 0.6591289899788176, "learning_rate": 8.823862266435492e-06, "loss": 0.0485, "step": 7610 }, { "epoch": 0.9031111111111111, "grad_norm": 0.7142607526573105, "learning_rate": 8.819416244245841e-06, "loss": 0.0462, "step": 7620 }, { "epoch": 0.9042962962962963, "grad_norm": 0.6654038512617686, "learning_rate": 8.81496295910756e-06, "loss": 0.0476, "step": 7630 }, { "epoch": 0.9054814814814814, "grad_norm": 0.8158569675100661, "learning_rate": 8.810502419488958e-06, "loss": 0.0531, "step": 7640 }, { "epoch": 0.9066666666666666, "grad_norm": 0.6565042200381191, "learning_rate": 8.80603463387213e-06, "loss": 0.0435, "step": 7650 }, { "epoch": 0.9078518518518518, "grad_norm": 0.792387147204678, "learning_rate": 8.80155961075296e-06, "loss": 0.0457, "step": 7660 }, { "epoch": 0.9090370370370371, "grad_norm": 0.6400679949422363, "learning_rate": 8.797077358641081e-06, "loss": 0.0491, "step": 7670 }, { "epoch": 0.9102222222222223, "grad_norm": 0.8197301724041891, "learning_rate": 8.792587886059891e-06, "loss": 0.0524, "step": 7680 }, { "epoch": 0.9114074074074074, "grad_norm": 0.6477466730781138, "learning_rate": 8.788091201546503e-06, "loss": 0.0464, "step": 7690 }, { "epoch": 0.9125925925925926, "grad_norm": 0.6329813257018595, "learning_rate": 8.78358731365175e-06, "loss": 0.0435, "step": 7700 }, { "epoch": 0.9137777777777778, "grad_norm": 0.6254414507963543, "learning_rate": 8.779076230940163e-06, "loss": 0.0458, "step": 7710 }, { "epoch": 0.914962962962963, "grad_norm": 0.6207345887037139, "learning_rate": 8.774557961989955e-06, "loss": 0.0491, "step": 7720 }, { "epoch": 0.9161481481481482, "grad_norm": 0.55946268737263, "learning_rate": 8.770032515393e-06, "loss": 0.0472, "step": 7730 }, { "epoch": 0.9173333333333333, "grad_norm": 0.6772035858898395, "learning_rate": 8.765499899754827e-06, "loss": 0.0452, "step": 7740 }, { "epoch": 0.9185185185185185, "grad_norm": 0.8106168155540253, "learning_rate": 8.760960123694595e-06, "loss": 0.046, "step": 7750 }, { "epoch": 0.9197037037037037, "grad_norm": 0.655821818432701, "learning_rate": 8.756413195845075e-06, "loss": 0.0471, "step": 7760 }, { "epoch": 0.9208888888888889, "grad_norm": 0.7343075910923443, "learning_rate": 8.751859124852646e-06, "loss": 0.0498, "step": 7770 }, { "epoch": 0.922074074074074, "grad_norm": 0.9536952913734272, "learning_rate": 8.747297919377262e-06, "loss": 0.0498, "step": 7780 }, { "epoch": 0.9232592592592592, "grad_norm": 0.6033899003523572, "learning_rate": 8.74272958809245e-06, "loss": 0.0447, "step": 7790 }, { "epoch": 0.9244444444444444, "grad_norm": 0.8694089163500168, "learning_rate": 8.738154139685287e-06, "loss": 0.047, "step": 7800 }, { "epoch": 0.9256296296296296, "grad_norm": 0.8019148399219386, "learning_rate": 8.733571582856378e-06, "loss": 0.0488, "step": 7810 }, { "epoch": 0.9268148148148149, "grad_norm": 0.6246197373968206, "learning_rate": 8.728981926319851e-06, "loss": 0.0444, "step": 7820 }, { "epoch": 0.928, "grad_norm": 0.7051854547586471, "learning_rate": 8.72438517880333e-06, "loss": 0.0481, "step": 7830 }, { "epoch": 0.9291851851851852, "grad_norm": 0.4319130166694029, "learning_rate": 8.719781349047931e-06, "loss": 0.0445, "step": 7840 }, { "epoch": 0.9303703703703704, "grad_norm": 0.8021777134565091, "learning_rate": 8.715170445808228e-06, "loss": 0.0479, "step": 7850 }, { "epoch": 0.9315555555555556, "grad_norm": 0.5835486406209588, "learning_rate": 8.710552477852253e-06, "loss": 0.0471, "step": 7860 }, { "epoch": 0.9327407407407408, "grad_norm": 0.6531671902591658, "learning_rate": 8.705927453961468e-06, "loss": 0.0474, "step": 7870 }, { "epoch": 0.9339259259259259, "grad_norm": 0.7264751248983842, "learning_rate": 8.701295382930755e-06, "loss": 0.0491, "step": 7880 }, { "epoch": 0.9351111111111111, "grad_norm": 0.9254258478111342, "learning_rate": 8.696656273568393e-06, "loss": 0.0496, "step": 7890 }, { "epoch": 0.9362962962962963, "grad_norm": 0.4753487468866643, "learning_rate": 8.692010134696052e-06, "loss": 0.0404, "step": 7900 }, { "epoch": 0.9374814814814815, "grad_norm": 0.8017191576342043, "learning_rate": 8.687356975148761e-06, "loss": 0.0528, "step": 7910 }, { "epoch": 0.9386666666666666, "grad_norm": 0.6077130308401523, "learning_rate": 8.682696803774903e-06, "loss": 0.0482, "step": 7920 }, { "epoch": 0.9398518518518518, "grad_norm": 0.5674166803836139, "learning_rate": 8.6780296294362e-06, "loss": 0.0503, "step": 7930 }, { "epoch": 0.941037037037037, "grad_norm": 0.6766821768704477, "learning_rate": 8.673355461007679e-06, "loss": 0.0485, "step": 7940 }, { "epoch": 0.9422222222222222, "grad_norm": 0.6210290727879508, "learning_rate": 8.66867430737768e-06, "loss": 0.0468, "step": 7950 }, { "epoch": 0.9434074074074074, "grad_norm": 0.7572269163731946, "learning_rate": 8.663986177447812e-06, "loss": 0.0473, "step": 7960 }, { "epoch": 0.9445925925925925, "grad_norm": 0.6710397213819798, "learning_rate": 8.659291080132963e-06, "loss": 0.0434, "step": 7970 }, { "epoch": 0.9457777777777778, "grad_norm": 0.5927732791729379, "learning_rate": 8.654589024361264e-06, "loss": 0.0464, "step": 7980 }, { "epoch": 0.946962962962963, "grad_norm": 0.8351921135158358, "learning_rate": 8.649880019074078e-06, "loss": 0.0449, "step": 7990 }, { "epoch": 0.9481481481481482, "grad_norm": 0.5841419481155213, "learning_rate": 8.64516407322598e-06, "loss": 0.0426, "step": 8000 }, { "epoch": 0.9493333333333334, "grad_norm": 0.9127769457819565, "learning_rate": 8.640441195784752e-06, "loss": 0.0473, "step": 8010 }, { "epoch": 0.9505185185185185, "grad_norm": 0.7226985293448518, "learning_rate": 8.63571139573135e-06, "loss": 0.0419, "step": 8020 }, { "epoch": 0.9517037037037037, "grad_norm": 0.5348585822335743, "learning_rate": 8.630974682059896e-06, "loss": 0.0487, "step": 8030 }, { "epoch": 0.9528888888888889, "grad_norm": 0.6563316829016852, "learning_rate": 8.626231063777658e-06, "loss": 0.0495, "step": 8040 }, { "epoch": 0.9540740740740741, "grad_norm": 0.7845822751322598, "learning_rate": 8.621480549905035e-06, "loss": 0.0497, "step": 8050 }, { "epoch": 0.9552592592592593, "grad_norm": 0.471510654237178, "learning_rate": 8.616723149475536e-06, "loss": 0.0436, "step": 8060 }, { "epoch": 0.9564444444444444, "grad_norm": 0.9184220100048154, "learning_rate": 8.61195887153577e-06, "loss": 0.045, "step": 8070 }, { "epoch": 0.9576296296296296, "grad_norm": 0.7117067218902718, "learning_rate": 8.60718772514542e-06, "loss": 0.0467, "step": 8080 }, { "epoch": 0.9588148148148148, "grad_norm": 0.7229676332504311, "learning_rate": 8.602409719377232e-06, "loss": 0.0482, "step": 8090 }, { "epoch": 0.96, "grad_norm": 0.6510765194228915, "learning_rate": 8.597624863316996e-06, "loss": 0.0447, "step": 8100 }, { "epoch": 0.9611851851851851, "grad_norm": 0.6372489162445778, "learning_rate": 8.592833166063525e-06, "loss": 0.0442, "step": 8110 }, { "epoch": 0.9623703703703703, "grad_norm": 0.6806232734100223, "learning_rate": 8.588034636728644e-06, "loss": 0.0493, "step": 8120 }, { "epoch": 0.9635555555555556, "grad_norm": 0.8375858027466179, "learning_rate": 8.58322928443717e-06, "loss": 0.0478, "step": 8130 }, { "epoch": 0.9647407407407408, "grad_norm": 0.7239728236966126, "learning_rate": 8.578417118326897e-06, "loss": 0.0466, "step": 8140 }, { "epoch": 0.965925925925926, "grad_norm": 0.8749584726704697, "learning_rate": 8.573598147548567e-06, "loss": 0.0505, "step": 8150 }, { "epoch": 0.9671111111111111, "grad_norm": 0.6653107548608621, "learning_rate": 8.568772381265872e-06, "loss": 0.0472, "step": 8160 }, { "epoch": 0.9682962962962963, "grad_norm": 0.8257121480511849, "learning_rate": 8.56393982865542e-06, "loss": 0.0453, "step": 8170 }, { "epoch": 0.9694814814814815, "grad_norm": 0.8205803321412547, "learning_rate": 8.559100498906727e-06, "loss": 0.0463, "step": 8180 }, { "epoch": 0.9706666666666667, "grad_norm": 0.784346843530516, "learning_rate": 8.554254401222193e-06, "loss": 0.044, "step": 8190 }, { "epoch": 0.9718518518518519, "grad_norm": 0.7513727776757206, "learning_rate": 8.549401544817092e-06, "loss": 0.0415, "step": 8200 }, { "epoch": 0.973037037037037, "grad_norm": 0.6800157649806621, "learning_rate": 8.544541938919544e-06, "loss": 0.0471, "step": 8210 }, { "epoch": 0.9742222222222222, "grad_norm": 0.5533104339950371, "learning_rate": 8.539675592770513e-06, "loss": 0.0461, "step": 8220 }, { "epoch": 0.9754074074074074, "grad_norm": 0.5840384445204609, "learning_rate": 8.53480251562377e-06, "loss": 0.0471, "step": 8230 }, { "epoch": 0.9765925925925926, "grad_norm": 0.7191931437171187, "learning_rate": 8.529922716745895e-06, "loss": 0.0432, "step": 8240 }, { "epoch": 0.9777777777777777, "grad_norm": 0.6647808639290215, "learning_rate": 8.525036205416244e-06, "loss": 0.042, "step": 8250 }, { "epoch": 0.9789629629629629, "grad_norm": 0.8439395557915452, "learning_rate": 8.520142990926936e-06, "loss": 0.0468, "step": 8260 }, { "epoch": 0.9801481481481481, "grad_norm": 0.7561599265628417, "learning_rate": 8.515243082582843e-06, "loss": 0.0482, "step": 8270 }, { "epoch": 0.9813333333333333, "grad_norm": 0.4856741634382977, "learning_rate": 8.51033648970156e-06, "loss": 0.0448, "step": 8280 }, { "epoch": 0.9825185185185186, "grad_norm": 0.7599107869385829, "learning_rate": 8.505423221613395e-06, "loss": 0.0441, "step": 8290 }, { "epoch": 0.9837037037037037, "grad_norm": 0.6708983626248244, "learning_rate": 8.500503287661353e-06, "loss": 0.0447, "step": 8300 }, { "epoch": 0.9848888888888889, "grad_norm": 0.9174835152187154, "learning_rate": 8.495576697201111e-06, "loss": 0.0462, "step": 8310 }, { "epoch": 0.9860740740740741, "grad_norm": 0.8428263231922415, "learning_rate": 8.490643459601003e-06, "loss": 0.0459, "step": 8320 }, { "epoch": 0.9872592592592593, "grad_norm": 0.8080013338882717, "learning_rate": 8.485703584242006e-06, "loss": 0.0442, "step": 8330 }, { "epoch": 0.9884444444444445, "grad_norm": 0.7738391502885379, "learning_rate": 8.480757080517716e-06, "loss": 0.0445, "step": 8340 }, { "epoch": 0.9896296296296296, "grad_norm": 0.5652129230088224, "learning_rate": 8.47580395783434e-06, "loss": 0.0427, "step": 8350 }, { "epoch": 0.9908148148148148, "grad_norm": 0.6101592040655527, "learning_rate": 8.470844225610662e-06, "loss": 0.042, "step": 8360 }, { "epoch": 0.992, "grad_norm": 0.6039351651956629, "learning_rate": 8.465877893278041e-06, "loss": 0.0425, "step": 8370 }, { "epoch": 0.9931851851851852, "grad_norm": 0.6585164098220578, "learning_rate": 8.460904970280386e-06, "loss": 0.0433, "step": 8380 }, { "epoch": 0.9943703703703703, "grad_norm": 0.6323305714822836, "learning_rate": 8.45592546607414e-06, "loss": 0.0458, "step": 8390 }, { "epoch": 0.9955555555555555, "grad_norm": 0.747018084505675, "learning_rate": 8.450939390128255e-06, "loss": 0.0492, "step": 8400 }, { "epoch": 0.9967407407407407, "grad_norm": 0.6155438710759833, "learning_rate": 8.445946751924184e-06, "loss": 0.0438, "step": 8410 }, { "epoch": 0.9979259259259259, "grad_norm": 0.6477807074361096, "learning_rate": 8.44094756095586e-06, "loss": 0.0388, "step": 8420 }, { "epoch": 0.9991111111111111, "grad_norm": 0.6519925886499779, "learning_rate": 8.435941826729675e-06, "loss": 0.049, "step": 8430 }, { "epoch": 1.000237037037037, "grad_norm": 0.6875558762383998, "learning_rate": 8.43092955876446e-06, "loss": 0.0428, "step": 8440 }, { "epoch": 1.0014222222222222, "grad_norm": 0.5978645689971666, "learning_rate": 8.42591076659148e-06, "loss": 0.0436, "step": 8450 }, { "epoch": 1.0026074074074074, "grad_norm": 0.6314301118033531, "learning_rate": 8.420885459754391e-06, "loss": 0.0405, "step": 8460 }, { "epoch": 1.0037925925925926, "grad_norm": 0.9306637451578884, "learning_rate": 8.415853647809256e-06, "loss": 0.0422, "step": 8470 }, { "epoch": 1.0049777777777777, "grad_norm": 0.592090625214825, "learning_rate": 8.410815340324493e-06, "loss": 0.0393, "step": 8480 }, { "epoch": 1.006162962962963, "grad_norm": 0.5813559493664977, "learning_rate": 8.405770546880876e-06, "loss": 0.0393, "step": 8490 }, { "epoch": 1.007348148148148, "grad_norm": 0.7538927839729614, "learning_rate": 8.400719277071517e-06, "loss": 0.04, "step": 8500 }, { "epoch": 1.0085333333333333, "grad_norm": 0.7724810550854977, "learning_rate": 8.395661540501839e-06, "loss": 0.0401, "step": 8510 }, { "epoch": 1.0097185185185185, "grad_norm": 0.6131732282221035, "learning_rate": 8.390597346789564e-06, "loss": 0.042, "step": 8520 }, { "epoch": 1.0109037037037036, "grad_norm": 0.7320126138462696, "learning_rate": 8.38552670556469e-06, "loss": 0.039, "step": 8530 }, { "epoch": 1.0120888888888888, "grad_norm": 0.6807010096821413, "learning_rate": 8.380449626469482e-06, "loss": 0.0459, "step": 8540 }, { "epoch": 1.013274074074074, "grad_norm": 0.7853944466067139, "learning_rate": 8.375366119158438e-06, "loss": 0.0422, "step": 8550 }, { "epoch": 1.0144592592592592, "grad_norm": 0.6312963470455242, "learning_rate": 8.370276193298288e-06, "loss": 0.0404, "step": 8560 }, { "epoch": 1.0156444444444443, "grad_norm": 0.7801641181641211, "learning_rate": 8.36517985856796e-06, "loss": 0.042, "step": 8570 }, { "epoch": 1.0168296296296295, "grad_norm": 0.6457328648944454, "learning_rate": 8.360077124658576e-06, "loss": 0.0438, "step": 8580 }, { "epoch": 1.0180148148148147, "grad_norm": 0.6397954566901006, "learning_rate": 8.354968001273424e-06, "loss": 0.0388, "step": 8590 }, { "epoch": 1.0192, "grad_norm": 0.8453574899616856, "learning_rate": 8.34985249812794e-06, "loss": 0.0434, "step": 8600 }, { "epoch": 1.0203851851851853, "grad_norm": 0.8247010678271285, "learning_rate": 8.344730624949696e-06, "loss": 0.0389, "step": 8610 }, { "epoch": 1.0215703703703705, "grad_norm": 0.7492583855084065, "learning_rate": 8.33960239147837e-06, "loss": 0.0397, "step": 8620 }, { "epoch": 1.0227555555555556, "grad_norm": 0.7267511819682018, "learning_rate": 8.334467807465742e-06, "loss": 0.0436, "step": 8630 }, { "epoch": 1.0239407407407408, "grad_norm": 0.7815443582365968, "learning_rate": 8.329326882675668e-06, "loss": 0.04, "step": 8640 }, { "epoch": 1.025125925925926, "grad_norm": 0.6115446334258033, "learning_rate": 8.324179626884053e-06, "loss": 0.039, "step": 8650 }, { "epoch": 1.0263111111111112, "grad_norm": 0.6031131121660421, "learning_rate": 8.31902604987885e-06, "loss": 0.0376, "step": 8660 }, { "epoch": 1.0274962962962964, "grad_norm": 0.558765123568777, "learning_rate": 8.313866161460032e-06, "loss": 0.0392, "step": 8670 }, { "epoch": 1.0286814814814815, "grad_norm": 0.6405308284644844, "learning_rate": 8.308699971439564e-06, "loss": 0.0425, "step": 8680 }, { "epoch": 1.0298666666666667, "grad_norm": 0.651610298269465, "learning_rate": 8.303527489641408e-06, "loss": 0.0369, "step": 8690 }, { "epoch": 1.0310518518518519, "grad_norm": 0.6343074308453518, "learning_rate": 8.298348725901477e-06, "loss": 0.0414, "step": 8700 }, { "epoch": 1.032237037037037, "grad_norm": 0.6970569849251086, "learning_rate": 8.29316369006764e-06, "loss": 0.0388, "step": 8710 }, { "epoch": 1.0334222222222222, "grad_norm": 0.6659543188112809, "learning_rate": 8.287972391999686e-06, "loss": 0.0442, "step": 8720 }, { "epoch": 1.0346074074074074, "grad_norm": 0.5944956303577671, "learning_rate": 8.282774841569317e-06, "loss": 0.0379, "step": 8730 }, { "epoch": 1.0357925925925926, "grad_norm": 0.6953084045289076, "learning_rate": 8.277571048660123e-06, "loss": 0.0427, "step": 8740 }, { "epoch": 1.0369777777777778, "grad_norm": 0.8277179186475379, "learning_rate": 8.272361023167561e-06, "loss": 0.0432, "step": 8750 }, { "epoch": 1.038162962962963, "grad_norm": 0.697480224511074, "learning_rate": 8.267144774998946e-06, "loss": 0.0404, "step": 8760 }, { "epoch": 1.0393481481481481, "grad_norm": 0.6068637519355321, "learning_rate": 8.26192231407342e-06, "loss": 0.0392, "step": 8770 }, { "epoch": 1.0405333333333333, "grad_norm": 0.7119865011070537, "learning_rate": 8.256693650321943e-06, "loss": 0.0408, "step": 8780 }, { "epoch": 1.0417185185185185, "grad_norm": 0.7349031071286162, "learning_rate": 8.25145879368727e-06, "loss": 0.0381, "step": 8790 }, { "epoch": 1.0429037037037037, "grad_norm": 0.6683093412262165, "learning_rate": 8.246217754123928e-06, "loss": 0.037, "step": 8800 }, { "epoch": 1.0440888888888888, "grad_norm": 0.8382151548469626, "learning_rate": 8.24097054159821e-06, "loss": 0.0385, "step": 8810 }, { "epoch": 1.045274074074074, "grad_norm": 1.026748020374135, "learning_rate": 8.23571716608814e-06, "loss": 0.0414, "step": 8820 }, { "epoch": 1.0464592592592592, "grad_norm": 0.7786452765257076, "learning_rate": 8.23045763758346e-06, "loss": 0.0402, "step": 8830 }, { "epoch": 1.0476444444444444, "grad_norm": 0.7085828297847706, "learning_rate": 8.225191966085618e-06, "loss": 0.0411, "step": 8840 }, { "epoch": 1.0488296296296296, "grad_norm": 0.6957962289106565, "learning_rate": 8.219920161607744e-06, "loss": 0.0378, "step": 8850 }, { "epoch": 1.0500148148148147, "grad_norm": 0.6711957474321917, "learning_rate": 8.214642234174626e-06, "loss": 0.0425, "step": 8860 }, { "epoch": 1.0512, "grad_norm": 0.8539013627124864, "learning_rate": 8.209358193822697e-06, "loss": 0.041, "step": 8870 }, { "epoch": 1.052385185185185, "grad_norm": 0.7151822535307056, "learning_rate": 8.204068050600014e-06, "loss": 0.0422, "step": 8880 }, { "epoch": 1.0535703703703703, "grad_norm": 0.8329165699353477, "learning_rate": 8.19877181456624e-06, "loss": 0.0415, "step": 8890 }, { "epoch": 1.0547555555555554, "grad_norm": 0.5986366421572213, "learning_rate": 8.193469495792623e-06, "loss": 0.0436, "step": 8900 }, { "epoch": 1.0559407407407408, "grad_norm": 0.7175233673468983, "learning_rate": 8.18816110436198e-06, "loss": 0.0425, "step": 8910 }, { "epoch": 1.057125925925926, "grad_norm": 0.5585316483925973, "learning_rate": 8.182846650368673e-06, "loss": 0.0412, "step": 8920 }, { "epoch": 1.0583111111111112, "grad_norm": 0.8265422060060924, "learning_rate": 8.177526143918594e-06, "loss": 0.0426, "step": 8930 }, { "epoch": 1.0594962962962964, "grad_norm": 0.7195104966522088, "learning_rate": 8.172199595129142e-06, "loss": 0.0399, "step": 8940 }, { "epoch": 1.0606814814814816, "grad_norm": 0.720887371849215, "learning_rate": 8.16686701412921e-06, "loss": 0.0402, "step": 8950 }, { "epoch": 1.0618666666666667, "grad_norm": 0.4879757415552275, "learning_rate": 8.16152841105916e-06, "loss": 0.0397, "step": 8960 }, { "epoch": 1.063051851851852, "grad_norm": 0.6018869225316273, "learning_rate": 8.156183796070802e-06, "loss": 0.0399, "step": 8970 }, { "epoch": 1.064237037037037, "grad_norm": 0.4590675570536953, "learning_rate": 8.150833179327383e-06, "loss": 0.0381, "step": 8980 }, { "epoch": 1.0654222222222223, "grad_norm": 0.5947256104468551, "learning_rate": 8.145476571003564e-06, "loss": 0.0357, "step": 8990 }, { "epoch": 1.0666074074074074, "grad_norm": 0.6652676492436808, "learning_rate": 8.140113981285393e-06, "loss": 0.0365, "step": 9000 }, { "epoch": 1.0677925925925926, "grad_norm": 0.63057459670751, "learning_rate": 8.134745420370295e-06, "loss": 0.0381, "step": 9010 }, { "epoch": 1.0689777777777778, "grad_norm": 0.6430405073543456, "learning_rate": 8.129370898467055e-06, "loss": 0.0398, "step": 9020 }, { "epoch": 1.070162962962963, "grad_norm": 0.6433143868260934, "learning_rate": 8.123990425795785e-06, "loss": 0.0391, "step": 9030 }, { "epoch": 1.0713481481481482, "grad_norm": 0.6851161432417389, "learning_rate": 8.118604012587916e-06, "loss": 0.0381, "step": 9040 }, { "epoch": 1.0725333333333333, "grad_norm": 0.754137037257858, "learning_rate": 8.113211669086181e-06, "loss": 0.0411, "step": 9050 }, { "epoch": 1.0737185185185185, "grad_norm": 0.5380594431387927, "learning_rate": 8.10781340554458e-06, "loss": 0.0389, "step": 9060 }, { "epoch": 1.0749037037037037, "grad_norm": 0.5587904692509513, "learning_rate": 8.102409232228379e-06, "loss": 0.0395, "step": 9070 }, { "epoch": 1.0760888888888889, "grad_norm": 0.6338775033663401, "learning_rate": 8.096999159414077e-06, "loss": 0.0405, "step": 9080 }, { "epoch": 1.077274074074074, "grad_norm": 0.6610530556255558, "learning_rate": 8.091583197389393e-06, "loss": 0.043, "step": 9090 }, { "epoch": 1.0784592592592592, "grad_norm": 0.6434838113502723, "learning_rate": 8.086161356453244e-06, "loss": 0.0399, "step": 9100 }, { "epoch": 1.0796444444444444, "grad_norm": 0.6795600846469724, "learning_rate": 8.08073364691573e-06, "loss": 0.0394, "step": 9110 }, { "epoch": 1.0808296296296296, "grad_norm": 0.5218978794261384, "learning_rate": 8.075300079098105e-06, "loss": 0.0421, "step": 9120 }, { "epoch": 1.0820148148148148, "grad_norm": 0.5995148508472578, "learning_rate": 8.069860663332768e-06, "loss": 0.0387, "step": 9130 }, { "epoch": 1.0832, "grad_norm": 0.638991294714415, "learning_rate": 8.064415409963233e-06, "loss": 0.0424, "step": 9140 }, { "epoch": 1.0843851851851851, "grad_norm": 0.7192740919002207, "learning_rate": 8.058964329344121e-06, "loss": 0.0393, "step": 9150 }, { "epoch": 1.0855703703703703, "grad_norm": 0.607902444473679, "learning_rate": 8.05350743184113e-06, "loss": 0.0372, "step": 9160 }, { "epoch": 1.0867555555555555, "grad_norm": 0.46471787613003696, "learning_rate": 8.048044727831017e-06, "loss": 0.0369, "step": 9170 }, { "epoch": 1.0879407407407407, "grad_norm": 0.5568509234745769, "learning_rate": 8.042576227701588e-06, "loss": 0.0364, "step": 9180 }, { "epoch": 1.0891259259259258, "grad_norm": 0.5453653534218361, "learning_rate": 8.037101941851664e-06, "loss": 0.0396, "step": 9190 }, { "epoch": 1.090311111111111, "grad_norm": 0.672450863643448, "learning_rate": 8.031621880691072e-06, "loss": 0.0399, "step": 9200 }, { "epoch": 1.0914962962962962, "grad_norm": 0.7112324325072343, "learning_rate": 8.026136054640621e-06, "loss": 0.0393, "step": 9210 }, { "epoch": 1.0926814814814816, "grad_norm": 0.4861430639312091, "learning_rate": 8.020644474132075e-06, "loss": 0.0408, "step": 9220 }, { "epoch": 1.0938666666666668, "grad_norm": 0.5925067945784065, "learning_rate": 8.015147149608156e-06, "loss": 0.0437, "step": 9230 }, { "epoch": 1.095051851851852, "grad_norm": 0.7525583291909363, "learning_rate": 8.009644091522493e-06, "loss": 0.0391, "step": 9240 }, { "epoch": 1.0962370370370371, "grad_norm": 0.5702289189899421, "learning_rate": 8.004135310339625e-06, "loss": 0.0386, "step": 9250 }, { "epoch": 1.0974222222222223, "grad_norm": 0.6484215046382384, "learning_rate": 7.998620816534975e-06, "loss": 0.04, "step": 9260 }, { "epoch": 1.0986074074074075, "grad_norm": 0.4710333408744621, "learning_rate": 7.99310062059483e-06, "loss": 0.0363, "step": 9270 }, { "epoch": 1.0997925925925927, "grad_norm": 0.5414923294611799, "learning_rate": 7.987574733016312e-06, "loss": 0.038, "step": 9280 }, { "epoch": 1.1009777777777778, "grad_norm": 0.6676447250579922, "learning_rate": 7.982043164307377e-06, "loss": 0.0408, "step": 9290 }, { "epoch": 1.102162962962963, "grad_norm": 0.636057341718801, "learning_rate": 7.976505924986774e-06, "loss": 0.0394, "step": 9300 }, { "epoch": 1.1033481481481482, "grad_norm": 0.7685832106758209, "learning_rate": 7.970963025584043e-06, "loss": 0.0382, "step": 9310 }, { "epoch": 1.1045333333333334, "grad_norm": 0.5665516393443645, "learning_rate": 7.965414476639483e-06, "loss": 0.0391, "step": 9320 }, { "epoch": 1.1057185185185185, "grad_norm": 0.6762872755854069, "learning_rate": 7.95986028870414e-06, "loss": 0.0404, "step": 9330 }, { "epoch": 1.1069037037037037, "grad_norm": 0.769882492552106, "learning_rate": 7.954300472339776e-06, "loss": 0.0383, "step": 9340 }, { "epoch": 1.108088888888889, "grad_norm": 0.6018450242649039, "learning_rate": 7.948735038118863e-06, "loss": 0.0402, "step": 9350 }, { "epoch": 1.109274074074074, "grad_norm": 0.6182257708007806, "learning_rate": 7.943163996624552e-06, "loss": 0.0354, "step": 9360 }, { "epoch": 1.1104592592592593, "grad_norm": 0.5062675711388913, "learning_rate": 7.937587358450658e-06, "loss": 0.0369, "step": 9370 }, { "epoch": 1.1116444444444444, "grad_norm": 0.6754262725438573, "learning_rate": 7.932005134201639e-06, "loss": 0.0365, "step": 9380 }, { "epoch": 1.1128296296296296, "grad_norm": 0.6640636272467217, "learning_rate": 7.926417334492575e-06, "loss": 0.0407, "step": 9390 }, { "epoch": 1.1140148148148148, "grad_norm": 0.5789209573506124, "learning_rate": 7.920823969949146e-06, "loss": 0.0388, "step": 9400 }, { "epoch": 1.1152, "grad_norm": 0.5976374351360487, "learning_rate": 7.915225051207616e-06, "loss": 0.0369, "step": 9410 }, { "epoch": 1.1163851851851851, "grad_norm": 0.646752887611876, "learning_rate": 7.909620588914813e-06, "loss": 0.0338, "step": 9420 }, { "epoch": 1.1175703703703703, "grad_norm": 0.7553811218787913, "learning_rate": 7.904010593728102e-06, "loss": 0.0428, "step": 9430 }, { "epoch": 1.1187555555555555, "grad_norm": 0.745592403957942, "learning_rate": 7.898395076315375e-06, "loss": 0.0384, "step": 9440 }, { "epoch": 1.1199407407407407, "grad_norm": 0.6189339136937886, "learning_rate": 7.892774047355016e-06, "loss": 0.0395, "step": 9450 }, { "epoch": 1.1211259259259259, "grad_norm": 0.9305371851260256, "learning_rate": 7.8871475175359e-06, "loss": 0.041, "step": 9460 }, { "epoch": 1.122311111111111, "grad_norm": 0.5263504716116539, "learning_rate": 7.881515497557357e-06, "loss": 0.037, "step": 9470 }, { "epoch": 1.1234962962962962, "grad_norm": 0.5730107225067484, "learning_rate": 7.875877998129155e-06, "loss": 0.0417, "step": 9480 }, { "epoch": 1.1246814814814814, "grad_norm": 0.6688827316920198, "learning_rate": 7.870235029971485e-06, "loss": 0.0356, "step": 9490 }, { "epoch": 1.1258666666666666, "grad_norm": 0.5982568853047519, "learning_rate": 7.864586603814939e-06, "loss": 0.0355, "step": 9500 }, { "epoch": 1.127051851851852, "grad_norm": 0.6742697423097298, "learning_rate": 7.858932730400484e-06, "loss": 0.0373, "step": 9510 }, { "epoch": 1.128237037037037, "grad_norm": 0.6575350956719821, "learning_rate": 7.853273420479446e-06, "loss": 0.0401, "step": 9520 }, { "epoch": 1.1294222222222223, "grad_norm": 0.5377286807869309, "learning_rate": 7.84760868481349e-06, "loss": 0.0388, "step": 9530 }, { "epoch": 1.1306074074074073, "grad_norm": 0.7690644000266748, "learning_rate": 7.841938534174599e-06, "loss": 0.0416, "step": 9540 }, { "epoch": 1.1317925925925927, "grad_norm": 0.6870845907636579, "learning_rate": 7.836262979345051e-06, "loss": 0.0387, "step": 9550 }, { "epoch": 1.1329777777777779, "grad_norm": 0.6809896455441627, "learning_rate": 7.830582031117402e-06, "loss": 0.0375, "step": 9560 }, { "epoch": 1.134162962962963, "grad_norm": 0.6128673986902426, "learning_rate": 7.824895700294465e-06, "loss": 0.035, "step": 9570 }, { "epoch": 1.1353481481481482, "grad_norm": 1.074993039372169, "learning_rate": 7.819203997689288e-06, "loss": 0.0381, "step": 9580 }, { "epoch": 1.1365333333333334, "grad_norm": 0.8103569592071879, "learning_rate": 7.813506934125131e-06, "loss": 0.0389, "step": 9590 }, { "epoch": 1.1377185185185186, "grad_norm": 0.6990667147841416, "learning_rate": 7.807804520435453e-06, "loss": 0.0381, "step": 9600 }, { "epoch": 1.1389037037037038, "grad_norm": 0.726995888548818, "learning_rate": 7.802096767463882e-06, "loss": 0.0371, "step": 9610 }, { "epoch": 1.140088888888889, "grad_norm": 0.7627251603117009, "learning_rate": 7.796383686064202e-06, "loss": 0.0374, "step": 9620 }, { "epoch": 1.141274074074074, "grad_norm": 0.6214008591225787, "learning_rate": 7.790665287100329e-06, "loss": 0.0354, "step": 9630 }, { "epoch": 1.1424592592592593, "grad_norm": 0.5952464595022666, "learning_rate": 7.784941581446293e-06, "loss": 0.0381, "step": 9640 }, { "epoch": 1.1436444444444445, "grad_norm": 0.6897070252186435, "learning_rate": 7.779212579986208e-06, "loss": 0.0416, "step": 9650 }, { "epoch": 1.1448296296296296, "grad_norm": 0.4699599349679626, "learning_rate": 7.773478293614268e-06, "loss": 0.037, "step": 9660 }, { "epoch": 1.1460148148148148, "grad_norm": 0.6197041099717513, "learning_rate": 7.76773873323471e-06, "loss": 0.0382, "step": 9670 }, { "epoch": 1.1472, "grad_norm": 0.5461518902608712, "learning_rate": 7.7619939097618e-06, "loss": 0.0385, "step": 9680 }, { "epoch": 1.1483851851851852, "grad_norm": 0.6694001495956893, "learning_rate": 7.756243834119818e-06, "loss": 0.0401, "step": 9690 }, { "epoch": 1.1495703703703704, "grad_norm": 0.5717942388761149, "learning_rate": 7.750488517243024e-06, "loss": 0.0399, "step": 9700 }, { "epoch": 1.1507555555555555, "grad_norm": 0.7816410856533546, "learning_rate": 7.744727970075651e-06, "loss": 0.04, "step": 9710 }, { "epoch": 1.1519407407407407, "grad_norm": 0.5178801735122833, "learning_rate": 7.738962203571873e-06, "loss": 0.0387, "step": 9720 }, { "epoch": 1.1531259259259259, "grad_norm": 0.8818509388942666, "learning_rate": 7.733191228695792e-06, "loss": 0.036, "step": 9730 }, { "epoch": 1.154311111111111, "grad_norm": 0.5854175089363209, "learning_rate": 7.727415056421414e-06, "loss": 0.0367, "step": 9740 }, { "epoch": 1.1554962962962962, "grad_norm": 0.8611802059904105, "learning_rate": 7.721633697732627e-06, "loss": 0.0444, "step": 9750 }, { "epoch": 1.1566814814814814, "grad_norm": 0.48286765978521046, "learning_rate": 7.71584716362318e-06, "loss": 0.0366, "step": 9760 }, { "epoch": 1.1578666666666666, "grad_norm": 0.7231301188202791, "learning_rate": 7.710055465096668e-06, "loss": 0.0347, "step": 9770 }, { "epoch": 1.1590518518518518, "grad_norm": 1.2864862372203607, "learning_rate": 7.704258613166507e-06, "loss": 0.0406, "step": 9780 }, { "epoch": 1.160237037037037, "grad_norm": 0.7086904892292608, "learning_rate": 7.698456618855902e-06, "loss": 0.0397, "step": 9790 }, { "epoch": 1.1614222222222221, "grad_norm": 0.6210945745921319, "learning_rate": 7.69264949319785e-06, "loss": 0.038, "step": 9800 }, { "epoch": 1.1626074074074073, "grad_norm": 0.5273749308272552, "learning_rate": 7.686837247235099e-06, "loss": 0.0403, "step": 9810 }, { "epoch": 1.1637925925925927, "grad_norm": 0.5627276833700177, "learning_rate": 7.681019892020134e-06, "loss": 0.0382, "step": 9820 }, { "epoch": 1.1649777777777777, "grad_norm": 0.5723449974644869, "learning_rate": 7.675197438615159e-06, "loss": 0.0391, "step": 9830 }, { "epoch": 1.166162962962963, "grad_norm": 0.5884098948345482, "learning_rate": 7.669369898092065e-06, "loss": 0.0378, "step": 9840 }, { "epoch": 1.167348148148148, "grad_norm": 0.7781525540790897, "learning_rate": 7.663537281532427e-06, "loss": 0.0397, "step": 9850 }, { "epoch": 1.1685333333333334, "grad_norm": 0.5655562245777895, "learning_rate": 7.657699600027466e-06, "loss": 0.0392, "step": 9860 }, { "epoch": 1.1697185185185186, "grad_norm": 0.7211204620013965, "learning_rate": 7.651856864678033e-06, "loss": 0.0364, "step": 9870 }, { "epoch": 1.1709037037037038, "grad_norm": 0.6655214829188295, "learning_rate": 7.646009086594595e-06, "loss": 0.0388, "step": 9880 }, { "epoch": 1.172088888888889, "grad_norm": 0.7355463899195125, "learning_rate": 7.640156276897203e-06, "loss": 0.0379, "step": 9890 }, { "epoch": 1.1732740740740741, "grad_norm": 0.510158638519037, "learning_rate": 7.63429844671548e-06, "loss": 0.0386, "step": 9900 }, { "epoch": 1.1744592592592593, "grad_norm": 0.4019337706765291, "learning_rate": 7.628435607188593e-06, "loss": 0.0381, "step": 9910 }, { "epoch": 1.1756444444444445, "grad_norm": 0.5379729732599965, "learning_rate": 7.622567769465237e-06, "loss": 0.0381, "step": 9920 }, { "epoch": 1.1768296296296297, "grad_norm": 0.7265356833741914, "learning_rate": 7.61669494470361e-06, "loss": 0.0403, "step": 9930 }, { "epoch": 1.1780148148148148, "grad_norm": 0.5686521688173402, "learning_rate": 7.610817144071392e-06, "loss": 0.0363, "step": 9940 }, { "epoch": 1.1792, "grad_norm": 0.5918067970383514, "learning_rate": 7.604934378745728e-06, "loss": 0.0387, "step": 9950 }, { "epoch": 1.1803851851851852, "grad_norm": 0.5750460930620956, "learning_rate": 7.599046659913203e-06, "loss": 0.0369, "step": 9960 }, { "epoch": 1.1815703703703704, "grad_norm": 0.546769146314586, "learning_rate": 7.59315399876982e-06, "loss": 0.0358, "step": 9970 }, { "epoch": 1.1827555555555556, "grad_norm": 0.5048140405756063, "learning_rate": 7.587256406520981e-06, "loss": 0.0348, "step": 9980 }, { "epoch": 1.1839407407407407, "grad_norm": 0.7396906565248186, "learning_rate": 7.581353894381466e-06, "loss": 0.035, "step": 9990 }, { "epoch": 1.185125925925926, "grad_norm": 0.5378734359421847, "learning_rate": 7.575446473575409e-06, "loss": 0.0384, "step": 10000 }, { "epoch": 1.186311111111111, "grad_norm": 0.6936285923165727, "learning_rate": 7.56953415533628e-06, "loss": 0.0396, "step": 10010 }, { "epoch": 1.1874962962962963, "grad_norm": 0.5537682677410906, "learning_rate": 7.5636169509068595e-06, "loss": 0.0395, "step": 10020 }, { "epoch": 1.1886814814814815, "grad_norm": 0.5203848138992757, "learning_rate": 7.5576948715392205e-06, "loss": 0.036, "step": 10030 }, { "epoch": 1.1898666666666666, "grad_norm": 0.44365860174400173, "learning_rate": 7.551767928494709e-06, "loss": 0.0398, "step": 10040 }, { "epoch": 1.1910518518518518, "grad_norm": 0.5906111121468769, "learning_rate": 7.545836133043916e-06, "loss": 0.0361, "step": 10050 }, { "epoch": 1.192237037037037, "grad_norm": 0.5964498325699477, "learning_rate": 7.539899496466659e-06, "loss": 0.0382, "step": 10060 }, { "epoch": 1.1934222222222222, "grad_norm": 0.5966852144925037, "learning_rate": 7.533958030051964e-06, "loss": 0.0375, "step": 10070 }, { "epoch": 1.1946074074074073, "grad_norm": 0.5669702200578985, "learning_rate": 7.528011745098043e-06, "loss": 0.0366, "step": 10080 }, { "epoch": 1.1957925925925925, "grad_norm": 0.5781264952994672, "learning_rate": 7.522060652912268e-06, "loss": 0.0373, "step": 10090 }, { "epoch": 1.1969777777777777, "grad_norm": 0.6508011966249394, "learning_rate": 7.516104764811151e-06, "loss": 0.0392, "step": 10100 }, { "epoch": 1.1981629629629629, "grad_norm": 0.6527186831348761, "learning_rate": 7.510144092120326e-06, "loss": 0.0368, "step": 10110 }, { "epoch": 1.199348148148148, "grad_norm": 0.7350903522049982, "learning_rate": 7.504178646174526e-06, "loss": 0.0375, "step": 10120 }, { "epoch": 1.2005333333333335, "grad_norm": 0.6744182650640396, "learning_rate": 7.498208438317559e-06, "loss": 0.0409, "step": 10130 }, { "epoch": 1.2017185185185184, "grad_norm": 0.435977993508131, "learning_rate": 7.492233479902289e-06, "loss": 0.038, "step": 10140 }, { "epoch": 1.2029037037037038, "grad_norm": 0.5884682298301195, "learning_rate": 7.486253782290614e-06, "loss": 0.0376, "step": 10150 }, { "epoch": 1.2040888888888888, "grad_norm": 0.5543585381243312, "learning_rate": 7.480269356853444e-06, "loss": 0.0377, "step": 10160 }, { "epoch": 1.2052740740740742, "grad_norm": 0.7363678889770158, "learning_rate": 7.474280214970677e-06, "loss": 0.0376, "step": 10170 }, { "epoch": 1.2064592592592593, "grad_norm": 0.8060719295523754, "learning_rate": 7.4682863680311825e-06, "loss": 0.0393, "step": 10180 }, { "epoch": 1.2076444444444445, "grad_norm": 0.7798286505427975, "learning_rate": 7.462287827432777e-06, "loss": 0.0386, "step": 10190 }, { "epoch": 1.2088296296296297, "grad_norm": 0.5806037574379604, "learning_rate": 7.456284604582203e-06, "loss": 0.0345, "step": 10200 }, { "epoch": 1.2100148148148149, "grad_norm": 0.77013070831697, "learning_rate": 7.450276710895101e-06, "loss": 0.0374, "step": 10210 }, { "epoch": 1.2112, "grad_norm": 0.6185004535725842, "learning_rate": 7.4442641577959996e-06, "loss": 0.033, "step": 10220 }, { "epoch": 1.2123851851851852, "grad_norm": 0.507809037896187, "learning_rate": 7.438246956718288e-06, "loss": 0.039, "step": 10230 }, { "epoch": 1.2135703703703704, "grad_norm": 0.6192477058489974, "learning_rate": 7.432225119104191e-06, "loss": 0.0368, "step": 10240 }, { "epoch": 1.2147555555555556, "grad_norm": 0.606240126842413, "learning_rate": 7.426198656404748e-06, "loss": 0.0334, "step": 10250 }, { "epoch": 1.2159407407407408, "grad_norm": 0.6594304830390676, "learning_rate": 7.4201675800798e-06, "loss": 0.0415, "step": 10260 }, { "epoch": 1.217125925925926, "grad_norm": 0.4799155254654633, "learning_rate": 7.4141319015979564e-06, "loss": 0.036, "step": 10270 }, { "epoch": 1.2183111111111111, "grad_norm": 0.6255204291523627, "learning_rate": 7.408091632436578e-06, "loss": 0.0335, "step": 10280 }, { "epoch": 1.2194962962962963, "grad_norm": 0.521121063974648, "learning_rate": 7.402046784081758e-06, "loss": 0.0352, "step": 10290 }, { "epoch": 1.2206814814814815, "grad_norm": 0.7004544592173146, "learning_rate": 7.395997368028294e-06, "loss": 0.0354, "step": 10300 }, { "epoch": 1.2218666666666667, "grad_norm": 0.5440553864960688, "learning_rate": 7.389943395779673e-06, "loss": 0.0355, "step": 10310 }, { "epoch": 1.2230518518518518, "grad_norm": 0.7544039443013161, "learning_rate": 7.383884878848042e-06, "loss": 0.0389, "step": 10320 }, { "epoch": 1.224237037037037, "grad_norm": 0.7070267814964819, "learning_rate": 7.377821828754195e-06, "loss": 0.0342, "step": 10330 }, { "epoch": 1.2254222222222222, "grad_norm": 0.6609842767241245, "learning_rate": 7.371754257027541e-06, "loss": 0.0355, "step": 10340 }, { "epoch": 1.2266074074074074, "grad_norm": 0.6715580646753436, "learning_rate": 7.365682175206091e-06, "loss": 0.0368, "step": 10350 }, { "epoch": 1.2277925925925925, "grad_norm": 0.5885854475008646, "learning_rate": 7.359605594836431e-06, "loss": 0.0372, "step": 10360 }, { "epoch": 1.2289777777777777, "grad_norm": 0.5101294574648217, "learning_rate": 7.3535245274737e-06, "loss": 0.0375, "step": 10370 }, { "epoch": 1.230162962962963, "grad_norm": 0.6295392425350629, "learning_rate": 7.347438984681572e-06, "loss": 0.0368, "step": 10380 }, { "epoch": 1.231348148148148, "grad_norm": 0.7233676371913077, "learning_rate": 7.341348978032231e-06, "loss": 0.0399, "step": 10390 }, { "epoch": 1.2325333333333333, "grad_norm": 0.6411075309401609, "learning_rate": 7.335254519106348e-06, "loss": 0.0361, "step": 10400 }, { "epoch": 1.2337185185185184, "grad_norm": 0.5243013444225256, "learning_rate": 7.3291556194930605e-06, "loss": 0.0376, "step": 10410 }, { "epoch": 1.2349037037037036, "grad_norm": 0.7982930057892746, "learning_rate": 7.323052290789951e-06, "loss": 0.0376, "step": 10420 }, { "epoch": 1.2360888888888888, "grad_norm": 0.7984123162275314, "learning_rate": 7.3169445446030265e-06, "loss": 0.0373, "step": 10430 }, { "epoch": 1.2372740740740742, "grad_norm": 0.6960294780009021, "learning_rate": 7.310832392546687e-06, "loss": 0.0373, "step": 10440 }, { "epoch": 1.2384592592592591, "grad_norm": 0.4921509400661577, "learning_rate": 7.304715846243719e-06, "loss": 0.0367, "step": 10450 }, { "epoch": 1.2396444444444445, "grad_norm": 0.5673279478922701, "learning_rate": 7.2985949173252615e-06, "loss": 0.0388, "step": 10460 }, { "epoch": 1.2408296296296297, "grad_norm": 0.42769975524903464, "learning_rate": 7.2924696174307885e-06, "loss": 0.0358, "step": 10470 }, { "epoch": 1.242014814814815, "grad_norm": 0.6195045246300867, "learning_rate": 7.286339958208082e-06, "loss": 0.035, "step": 10480 }, { "epoch": 1.2432, "grad_norm": 0.8212876452206117, "learning_rate": 7.280205951313217e-06, "loss": 0.0422, "step": 10490 }, { "epoch": 1.2443851851851853, "grad_norm": 0.5898929393304833, "learning_rate": 7.274067608410536e-06, "loss": 0.0391, "step": 10500 }, { "epoch": 1.2455703703703704, "grad_norm": 0.6162423799805311, "learning_rate": 7.26792494117263e-06, "loss": 0.0352, "step": 10510 }, { "epoch": 1.2467555555555556, "grad_norm": 0.6237237671475605, "learning_rate": 7.2617779612803015e-06, "loss": 0.0394, "step": 10520 }, { "epoch": 1.2479407407407408, "grad_norm": 0.6817710998682491, "learning_rate": 7.255626680422568e-06, "loss": 0.0368, "step": 10530 }, { "epoch": 1.249125925925926, "grad_norm": 0.5662550624455125, "learning_rate": 7.249471110296615e-06, "loss": 0.0356, "step": 10540 }, { "epoch": 1.2503111111111112, "grad_norm": 0.7201143853816148, "learning_rate": 7.243311262607794e-06, "loss": 0.0362, "step": 10550 }, { "epoch": 1.2514962962962963, "grad_norm": 0.8630063421470188, "learning_rate": 7.237147149069581e-06, "loss": 0.0333, "step": 10560 }, { "epoch": 1.2526814814814815, "grad_norm": 0.6061336978531989, "learning_rate": 7.23097878140357e-06, "loss": 0.0369, "step": 10570 }, { "epoch": 1.2538666666666667, "grad_norm": 0.5148449774448415, "learning_rate": 7.22480617133944e-06, "loss": 0.0347, "step": 10580 }, { "epoch": 1.2550518518518519, "grad_norm": 0.4327161338223404, "learning_rate": 7.218629330614946e-06, "loss": 0.0397, "step": 10590 }, { "epoch": 1.256237037037037, "grad_norm": 0.515041847001187, "learning_rate": 7.212448270975878e-06, "loss": 0.0356, "step": 10600 }, { "epoch": 1.2574222222222222, "grad_norm": 0.5306563008821455, "learning_rate": 7.206263004176053e-06, "loss": 0.0368, "step": 10610 }, { "epoch": 1.2586074074074074, "grad_norm": 0.5630591444218594, "learning_rate": 7.2000735419772875e-06, "loss": 0.0346, "step": 10620 }, { "epoch": 1.2597925925925926, "grad_norm": 0.5450281381204137, "learning_rate": 7.193879896149379e-06, "loss": 0.0402, "step": 10630 }, { "epoch": 1.2609777777777778, "grad_norm": 0.5744760203183716, "learning_rate": 7.187682078470076e-06, "loss": 0.0329, "step": 10640 }, { "epoch": 1.262162962962963, "grad_norm": 0.6840546645209985, "learning_rate": 7.181480100725062e-06, "loss": 0.0359, "step": 10650 }, { "epoch": 1.263348148148148, "grad_norm": 0.5898317513632808, "learning_rate": 7.175273974707933e-06, "loss": 0.0401, "step": 10660 }, { "epoch": 1.2645333333333333, "grad_norm": 0.46707345659001853, "learning_rate": 7.16906371222017e-06, "loss": 0.0342, "step": 10670 }, { "epoch": 1.2657185185185185, "grad_norm": 0.588399829806492, "learning_rate": 7.1628493250711215e-06, "loss": 0.0342, "step": 10680 }, { "epoch": 1.2669037037037036, "grad_norm": 0.486637643895772, "learning_rate": 7.156630825077982e-06, "loss": 0.0336, "step": 10690 }, { "epoch": 1.2680888888888888, "grad_norm": 0.766020600943266, "learning_rate": 7.150408224065759e-06, "loss": 0.0346, "step": 10700 }, { "epoch": 1.269274074074074, "grad_norm": 0.6636639967654908, "learning_rate": 7.144181533867269e-06, "loss": 0.0355, "step": 10710 }, { "epoch": 1.2704592592592592, "grad_norm": 0.7306386817598991, "learning_rate": 7.137950766323098e-06, "loss": 0.0381, "step": 10720 }, { "epoch": 1.2716444444444446, "grad_norm": 0.6309264065300677, "learning_rate": 7.131715933281583e-06, "loss": 0.0389, "step": 10730 }, { "epoch": 1.2728296296296295, "grad_norm": 0.5297392547162115, "learning_rate": 7.125477046598801e-06, "loss": 0.0382, "step": 10740 }, { "epoch": 1.274014814814815, "grad_norm": 0.6055137668449215, "learning_rate": 7.119234118138527e-06, "loss": 0.0367, "step": 10750 }, { "epoch": 1.2752, "grad_norm": 0.6366523175764957, "learning_rate": 7.112987159772229e-06, "loss": 0.0366, "step": 10760 }, { "epoch": 1.2763851851851853, "grad_norm": 0.5066931729198593, "learning_rate": 7.106736183379036e-06, "loss": 0.0362, "step": 10770 }, { "epoch": 1.2775703703703702, "grad_norm": 0.6975281275596105, "learning_rate": 7.100481200845718e-06, "loss": 0.0363, "step": 10780 }, { "epoch": 1.2787555555555556, "grad_norm": 0.651262721556952, "learning_rate": 7.0942222240666606e-06, "loss": 0.0372, "step": 10790 }, { "epoch": 1.2799407407407408, "grad_norm": 0.6715872302861878, "learning_rate": 7.0879592649438465e-06, "loss": 0.0381, "step": 10800 }, { "epoch": 1.281125925925926, "grad_norm": 0.5139258765911803, "learning_rate": 7.081692335386834e-06, "loss": 0.0352, "step": 10810 }, { "epoch": 1.2823111111111112, "grad_norm": 0.5810841016085941, "learning_rate": 7.075421447312728e-06, "loss": 0.0363, "step": 10820 }, { "epoch": 1.2834962962962964, "grad_norm": 0.5051955731845852, "learning_rate": 7.06914661264616e-06, "loss": 0.036, "step": 10830 }, { "epoch": 1.2846814814814815, "grad_norm": 0.538763562288868, "learning_rate": 7.062867843319269e-06, "loss": 0.0343, "step": 10840 }, { "epoch": 1.2858666666666667, "grad_norm": 0.6027292423237165, "learning_rate": 7.056585151271675e-06, "loss": 0.0345, "step": 10850 }, { "epoch": 1.287051851851852, "grad_norm": 0.813088142723815, "learning_rate": 7.050298548450459e-06, "loss": 0.0402, "step": 10860 }, { "epoch": 1.288237037037037, "grad_norm": 0.7169273672142574, "learning_rate": 7.044008046810136e-06, "loss": 0.0341, "step": 10870 }, { "epoch": 1.2894222222222222, "grad_norm": 0.6791794741661862, "learning_rate": 7.0377136583126345e-06, "loss": 0.0366, "step": 10880 }, { "epoch": 1.2906074074074074, "grad_norm": 0.5849762133650925, "learning_rate": 7.031415394927279e-06, "loss": 0.0344, "step": 10890 }, { "epoch": 1.2917925925925926, "grad_norm": 0.5842419721747307, "learning_rate": 7.025113268630758e-06, "loss": 0.0371, "step": 10900 }, { "epoch": 1.2929777777777778, "grad_norm": 0.6191734545664108, "learning_rate": 7.018807291407106e-06, "loss": 0.0358, "step": 10910 }, { "epoch": 1.294162962962963, "grad_norm": 0.5264925699241213, "learning_rate": 7.012497475247681e-06, "loss": 0.0329, "step": 10920 }, { "epoch": 1.2953481481481481, "grad_norm": 0.554559917607714, "learning_rate": 7.0061838321511434e-06, "loss": 0.0377, "step": 10930 }, { "epoch": 1.2965333333333333, "grad_norm": 0.6265356610213807, "learning_rate": 6.999866374123429e-06, "loss": 0.0369, "step": 10940 }, { "epoch": 1.2977185185185185, "grad_norm": 0.6389940271189815, "learning_rate": 6.993545113177724e-06, "loss": 0.036, "step": 10950 }, { "epoch": 1.2989037037037037, "grad_norm": 0.6591286121830231, "learning_rate": 6.987220061334453e-06, "loss": 0.0363, "step": 10960 }, { "epoch": 1.3000888888888888, "grad_norm": 0.5927388685743068, "learning_rate": 6.980891230621247e-06, "loss": 0.0364, "step": 10970 }, { "epoch": 1.301274074074074, "grad_norm": 0.553730384114029, "learning_rate": 6.9745586330729205e-06, "loss": 0.0363, "step": 10980 }, { "epoch": 1.3024592592592592, "grad_norm": 0.7396188774134861, "learning_rate": 6.968222280731454e-06, "loss": 0.036, "step": 10990 }, { "epoch": 1.3036444444444444, "grad_norm": 0.5584086682473048, "learning_rate": 6.961882185645964e-06, "loss": 0.0333, "step": 11000 }, { "epoch": 1.3048296296296296, "grad_norm": 0.49887749085338706, "learning_rate": 6.955538359872689e-06, "loss": 0.0357, "step": 11010 }, { "epoch": 1.3060148148148147, "grad_norm": 0.7369840938642377, "learning_rate": 6.94919081547496e-06, "loss": 0.0381, "step": 11020 }, { "epoch": 1.3072, "grad_norm": 0.4800393514610276, "learning_rate": 6.942839564523178e-06, "loss": 0.0355, "step": 11030 }, { "epoch": 1.3083851851851853, "grad_norm": 0.5497586039588198, "learning_rate": 6.936484619094792e-06, "loss": 0.0335, "step": 11040 }, { "epoch": 1.3095703703703703, "grad_norm": 0.6018994519614677, "learning_rate": 6.930125991274281e-06, "loss": 0.0362, "step": 11050 }, { "epoch": 1.3107555555555557, "grad_norm": 0.5800669025095517, "learning_rate": 6.923763693153118e-06, "loss": 0.0377, "step": 11060 }, { "epoch": 1.3119407407407406, "grad_norm": 0.8188165683940332, "learning_rate": 6.917397736829765e-06, "loss": 0.0374, "step": 11070 }, { "epoch": 1.313125925925926, "grad_norm": 0.4804822712980326, "learning_rate": 6.911028134409633e-06, "loss": 0.0357, "step": 11080 }, { "epoch": 1.314311111111111, "grad_norm": 0.5956443799773388, "learning_rate": 6.90465489800507e-06, "loss": 0.036, "step": 11090 }, { "epoch": 1.3154962962962964, "grad_norm": 0.604125706240739, "learning_rate": 6.898278039735333e-06, "loss": 0.0353, "step": 11100 }, { "epoch": 1.3166814814814816, "grad_norm": 0.6787861254042028, "learning_rate": 6.891897571726567e-06, "loss": 0.0349, "step": 11110 }, { "epoch": 1.3178666666666667, "grad_norm": 0.8674508225537952, "learning_rate": 6.8855135061117804e-06, "loss": 0.0356, "step": 11120 }, { "epoch": 1.319051851851852, "grad_norm": 0.5699340089694086, "learning_rate": 6.879125855030825e-06, "loss": 0.0327, "step": 11130 }, { "epoch": 1.320237037037037, "grad_norm": 0.6177549684184535, "learning_rate": 6.872734630630367e-06, "loss": 0.0347, "step": 11140 }, { "epoch": 1.3214222222222223, "grad_norm": 0.7388509361195605, "learning_rate": 6.866339845063868e-06, "loss": 0.0347, "step": 11150 }, { "epoch": 1.3226074074074075, "grad_norm": 0.5873748021014992, "learning_rate": 6.859941510491568e-06, "loss": 0.0345, "step": 11160 }, { "epoch": 1.3237925925925926, "grad_norm": 0.49526389412533656, "learning_rate": 6.853539639080448e-06, "loss": 0.0337, "step": 11170 }, { "epoch": 1.3249777777777778, "grad_norm": 0.7153363063673579, "learning_rate": 6.8471342430042155e-06, "loss": 0.0337, "step": 11180 }, { "epoch": 1.326162962962963, "grad_norm": 0.718296176708374, "learning_rate": 6.840725334443283e-06, "loss": 0.0336, "step": 11190 }, { "epoch": 1.3273481481481482, "grad_norm": 0.6398640873266896, "learning_rate": 6.834312925584745e-06, "loss": 0.0364, "step": 11200 }, { "epoch": 1.3285333333333333, "grad_norm": 0.5794143394656014, "learning_rate": 6.827897028622346e-06, "loss": 0.0355, "step": 11210 }, { "epoch": 1.3297185185185185, "grad_norm": 0.5010500380583615, "learning_rate": 6.821477655756465e-06, "loss": 0.0358, "step": 11220 }, { "epoch": 1.3309037037037037, "grad_norm": 0.5096253170101601, "learning_rate": 6.815054819194095e-06, "loss": 0.0343, "step": 11230 }, { "epoch": 1.3320888888888889, "grad_norm": 0.8089976407775299, "learning_rate": 6.808628531148809e-06, "loss": 0.038, "step": 11240 }, { "epoch": 1.333274074074074, "grad_norm": 0.5171425901613221, "learning_rate": 6.80219880384075e-06, "loss": 0.0343, "step": 11250 }, { "epoch": 1.3344592592592592, "grad_norm": 0.5042780596733507, "learning_rate": 6.795765649496594e-06, "loss": 0.0375, "step": 11260 }, { "epoch": 1.3356444444444444, "grad_norm": 0.5660800758481741, "learning_rate": 6.789329080349542e-06, "loss": 0.0356, "step": 11270 }, { "epoch": 1.3368296296296296, "grad_norm": 0.5269164315045531, "learning_rate": 6.78288910863928e-06, "loss": 0.0383, "step": 11280 }, { "epoch": 1.3380148148148148, "grad_norm": 0.5212949347715897, "learning_rate": 6.77644574661197e-06, "loss": 0.0375, "step": 11290 }, { "epoch": 1.3392, "grad_norm": 0.6101840800275271, "learning_rate": 6.76999900652022e-06, "loss": 0.0366, "step": 11300 }, { "epoch": 1.3403851851851851, "grad_norm": 0.661408418068933, "learning_rate": 6.76354890062306e-06, "loss": 0.0368, "step": 11310 }, { "epoch": 1.3415703703703703, "grad_norm": 0.57865126463542, "learning_rate": 6.757095441185921e-06, "loss": 0.0368, "step": 11320 }, { "epoch": 1.3427555555555555, "grad_norm": 0.5862153482932908, "learning_rate": 6.750638640480613e-06, "loss": 0.0381, "step": 11330 }, { "epoch": 1.3439407407407407, "grad_norm": 0.6043664158827985, "learning_rate": 6.744178510785296e-06, "loss": 0.036, "step": 11340 }, { "epoch": 1.345125925925926, "grad_norm": 0.6308929261719946, "learning_rate": 6.737715064384464e-06, "loss": 0.0369, "step": 11350 }, { "epoch": 1.346311111111111, "grad_norm": 0.6332895047899424, "learning_rate": 6.731248313568917e-06, "loss": 0.0359, "step": 11360 }, { "epoch": 1.3474962962962964, "grad_norm": 0.5334268499617741, "learning_rate": 6.724778270635737e-06, "loss": 0.0323, "step": 11370 }, { "epoch": 1.3486814814814814, "grad_norm": 0.6097177784025062, "learning_rate": 6.7183049478882665e-06, "loss": 0.035, "step": 11380 }, { "epoch": 1.3498666666666668, "grad_norm": 0.461946623934123, "learning_rate": 6.711828357636088e-06, "loss": 0.0375, "step": 11390 }, { "epoch": 1.3510518518518517, "grad_norm": 0.47279748379846404, "learning_rate": 6.7053485121949935e-06, "loss": 0.0366, "step": 11400 }, { "epoch": 1.3522370370370371, "grad_norm": 0.4479543484283015, "learning_rate": 6.698865423886966e-06, "loss": 0.0332, "step": 11410 }, { "epoch": 1.3534222222222223, "grad_norm": 0.5954274040705463, "learning_rate": 6.692379105040157e-06, "loss": 0.0334, "step": 11420 }, { "epoch": 1.3546074074074075, "grad_norm": 0.5473474005216107, "learning_rate": 6.68588956798886e-06, "loss": 0.0329, "step": 11430 }, { "epoch": 1.3557925925925927, "grad_norm": 0.6173287603685763, "learning_rate": 6.67939682507349e-06, "loss": 0.0356, "step": 11440 }, { "epoch": 1.3569777777777778, "grad_norm": 0.5949392287948896, "learning_rate": 6.672900888640551e-06, "loss": 0.0387, "step": 11450 }, { "epoch": 1.358162962962963, "grad_norm": 0.7887002415849497, "learning_rate": 6.6664017710426295e-06, "loss": 0.0354, "step": 11460 }, { "epoch": 1.3593481481481482, "grad_norm": 0.5107077269167135, "learning_rate": 6.659899484638354e-06, "loss": 0.0334, "step": 11470 }, { "epoch": 1.3605333333333334, "grad_norm": 0.6023363328697606, "learning_rate": 6.653394041792386e-06, "loss": 0.0359, "step": 11480 }, { "epoch": 1.3617185185185186, "grad_norm": 0.5705655715727445, "learning_rate": 6.64688545487538e-06, "loss": 0.0379, "step": 11490 }, { "epoch": 1.3629037037037037, "grad_norm": 0.5675378303398613, "learning_rate": 6.6403737362639765e-06, "loss": 0.0381, "step": 11500 }, { "epoch": 1.364088888888889, "grad_norm": 0.5507533395326385, "learning_rate": 6.633858898340767e-06, "loss": 0.0348, "step": 11510 }, { "epoch": 1.365274074074074, "grad_norm": 0.5505772830940984, "learning_rate": 6.627340953494279e-06, "loss": 0.0367, "step": 11520 }, { "epoch": 1.3664592592592593, "grad_norm": 0.5488499941719669, "learning_rate": 6.620819914118943e-06, "loss": 0.0329, "step": 11530 }, { "epoch": 1.3676444444444444, "grad_norm": 0.631397900626461, "learning_rate": 6.614295792615078e-06, "loss": 0.037, "step": 11540 }, { "epoch": 1.3688296296296296, "grad_norm": 0.5632186279781408, "learning_rate": 6.607768601388862e-06, "loss": 0.0372, "step": 11550 }, { "epoch": 1.3700148148148148, "grad_norm": 0.582559158072894, "learning_rate": 6.6012383528523114e-06, "loss": 0.0374, "step": 11560 }, { "epoch": 1.3712, "grad_norm": 0.8006342054054756, "learning_rate": 6.5947050594232534e-06, "loss": 0.0368, "step": 11570 }, { "epoch": 1.3723851851851852, "grad_norm": 0.6156227656750647, "learning_rate": 6.588168733525311e-06, "loss": 0.0368, "step": 11580 }, { "epoch": 1.3735703703703703, "grad_norm": 0.5106679994915344, "learning_rate": 6.581629387587867e-06, "loss": 0.0374, "step": 11590 }, { "epoch": 1.3747555555555555, "grad_norm": 0.640148816293237, "learning_rate": 6.5750870340460525e-06, "loss": 0.0374, "step": 11600 }, { "epoch": 1.3759407407407407, "grad_norm": 0.4613254860891459, "learning_rate": 6.568541685340715e-06, "loss": 0.0357, "step": 11610 }, { "epoch": 1.3771259259259259, "grad_norm": 0.527150437229008, "learning_rate": 6.561993353918398e-06, "loss": 0.0366, "step": 11620 }, { "epoch": 1.378311111111111, "grad_norm": 0.518361073536929, "learning_rate": 6.555442052231317e-06, "loss": 0.038, "step": 11630 }, { "epoch": 1.3794962962962962, "grad_norm": 0.5838883535494714, "learning_rate": 6.548887792737337e-06, "loss": 0.0335, "step": 11640 }, { "epoch": 1.3806814814814814, "grad_norm": 0.44379312855301606, "learning_rate": 6.5423305878999455e-06, "loss": 0.0337, "step": 11650 }, { "epoch": 1.3818666666666668, "grad_norm": 0.6556064765057338, "learning_rate": 6.535770450188232e-06, "loss": 0.0349, "step": 11660 }, { "epoch": 1.3830518518518518, "grad_norm": 0.4198980372163158, "learning_rate": 6.529207392076863e-06, "loss": 0.0347, "step": 11670 }, { "epoch": 1.3842370370370372, "grad_norm": 0.5902701113711034, "learning_rate": 6.522641426046058e-06, "loss": 0.0325, "step": 11680 }, { "epoch": 1.385422222222222, "grad_norm": 0.9062230632178103, "learning_rate": 6.516072564581566e-06, "loss": 0.0346, "step": 11690 }, { "epoch": 1.3866074074074075, "grad_norm": 0.5313649595064155, "learning_rate": 6.509500820174642e-06, "loss": 0.0341, "step": 11700 }, { "epoch": 1.3877925925925925, "grad_norm": 0.5933460651755964, "learning_rate": 6.502926205322025e-06, "loss": 0.0362, "step": 11710 }, { "epoch": 1.3889777777777779, "grad_norm": 0.5369814455484064, "learning_rate": 6.49634873252591e-06, "loss": 0.0302, "step": 11720 }, { "epoch": 1.390162962962963, "grad_norm": 0.5889066684156642, "learning_rate": 6.4897684142939264e-06, "loss": 0.0358, "step": 11730 }, { "epoch": 1.3913481481481482, "grad_norm": 0.6795712463233264, "learning_rate": 6.483185263139117e-06, "loss": 0.0388, "step": 11740 }, { "epoch": 1.3925333333333334, "grad_norm": 0.8610744253662693, "learning_rate": 6.47659929157991e-06, "loss": 0.0374, "step": 11750 }, { "epoch": 1.3937185185185186, "grad_norm": 0.6826603695638273, "learning_rate": 6.470010512140096e-06, "loss": 0.0355, "step": 11760 }, { "epoch": 1.3949037037037038, "grad_norm": 0.6029143831844743, "learning_rate": 6.463418937348807e-06, "loss": 0.0344, "step": 11770 }, { "epoch": 1.396088888888889, "grad_norm": 0.6194314162789141, "learning_rate": 6.456824579740488e-06, "loss": 0.0366, "step": 11780 }, { "epoch": 1.3972740740740741, "grad_norm": 0.535701724273535, "learning_rate": 6.450227451854883e-06, "loss": 0.0355, "step": 11790 }, { "epoch": 1.3984592592592593, "grad_norm": 0.6780961270832723, "learning_rate": 6.443627566236989e-06, "loss": 0.0388, "step": 11800 }, { "epoch": 1.3996444444444445, "grad_norm": 0.5865124627611724, "learning_rate": 6.437024935437064e-06, "loss": 0.0342, "step": 11810 }, { "epoch": 1.4008296296296296, "grad_norm": 0.650873468055745, "learning_rate": 6.430419572010576e-06, "loss": 0.0324, "step": 11820 }, { "epoch": 1.4020148148148148, "grad_norm": 0.54115654862498, "learning_rate": 6.423811488518192e-06, "loss": 0.0349, "step": 11830 }, { "epoch": 1.4032, "grad_norm": 0.6903482739650146, "learning_rate": 6.4172006975257496e-06, "loss": 0.0345, "step": 11840 }, { "epoch": 1.4043851851851852, "grad_norm": 0.48197496990573035, "learning_rate": 6.41058721160424e-06, "loss": 0.0352, "step": 11850 }, { "epoch": 1.4055703703703704, "grad_norm": 0.41977157751683486, "learning_rate": 6.403971043329774e-06, "loss": 0.0338, "step": 11860 }, { "epoch": 1.4067555555555555, "grad_norm": 0.5142761840609447, "learning_rate": 6.3973522052835656e-06, "loss": 0.0326, "step": 11870 }, { "epoch": 1.4079407407407407, "grad_norm": 0.5981393420492328, "learning_rate": 6.390730710051902e-06, "loss": 0.0344, "step": 11880 }, { "epoch": 1.409125925925926, "grad_norm": 0.41128395801819617, "learning_rate": 6.384106570226131e-06, "loss": 0.035, "step": 11890 }, { "epoch": 1.410311111111111, "grad_norm": 0.6166904240536913, "learning_rate": 6.37747979840262e-06, "loss": 0.0341, "step": 11900 }, { "epoch": 1.4114962962962962, "grad_norm": 0.6313770279628215, "learning_rate": 6.3708504071827495e-06, "loss": 0.0337, "step": 11910 }, { "epoch": 1.4126814814814814, "grad_norm": 0.6558176060861862, "learning_rate": 6.364218409172873e-06, "loss": 0.0323, "step": 11920 }, { "epoch": 1.4138666666666666, "grad_norm": 0.557870935338801, "learning_rate": 6.3575838169843095e-06, "loss": 0.0348, "step": 11930 }, { "epoch": 1.4150518518518518, "grad_norm": 0.48126417512548564, "learning_rate": 6.3509466432333054e-06, "loss": 0.0331, "step": 11940 }, { "epoch": 1.416237037037037, "grad_norm": 0.45904037179735924, "learning_rate": 6.344306900541017e-06, "loss": 0.0364, "step": 11950 }, { "epoch": 1.4174222222222221, "grad_norm": 0.5741902140473147, "learning_rate": 6.337664601533488e-06, "loss": 0.0336, "step": 11960 }, { "epoch": 1.4186074074074075, "grad_norm": 0.6869152852529073, "learning_rate": 6.331019758841619e-06, "loss": 0.0362, "step": 11970 }, { "epoch": 1.4197925925925925, "grad_norm": 0.6920543841466743, "learning_rate": 6.324372385101151e-06, "loss": 0.0347, "step": 11980 }, { "epoch": 1.420977777777778, "grad_norm": 0.4582770578269367, "learning_rate": 6.31772249295264e-06, "loss": 0.0349, "step": 11990 }, { "epoch": 1.4221629629629629, "grad_norm": 0.48242390390604495, "learning_rate": 6.3110700950414225e-06, "loss": 0.0345, "step": 12000 }, { "epoch": 1.4233481481481483, "grad_norm": 0.5741789245553095, "learning_rate": 6.304415204017611e-06, "loss": 0.0361, "step": 12010 }, { "epoch": 1.4245333333333332, "grad_norm": 0.527190754710843, "learning_rate": 6.297757832536051e-06, "loss": 0.0319, "step": 12020 }, { "epoch": 1.4257185185185186, "grad_norm": 0.5823014658369792, "learning_rate": 6.2910979932563075e-06, "loss": 0.0336, "step": 12030 }, { "epoch": 1.4269037037037038, "grad_norm": 0.9204610506104669, "learning_rate": 6.284435698842637e-06, "loss": 0.0324, "step": 12040 }, { "epoch": 1.428088888888889, "grad_norm": 0.689396900420848, "learning_rate": 6.277770961963967e-06, "loss": 0.0335, "step": 12050 }, { "epoch": 1.4292740740740741, "grad_norm": 0.49710991494760887, "learning_rate": 6.271103795293868e-06, "loss": 0.0335, "step": 12060 }, { "epoch": 1.4304592592592593, "grad_norm": 0.6559924748545927, "learning_rate": 6.264434211510528e-06, "loss": 0.0359, "step": 12070 }, { "epoch": 1.4316444444444445, "grad_norm": 0.6221518618651144, "learning_rate": 6.2577622232967405e-06, "loss": 0.0346, "step": 12080 }, { "epoch": 1.4328296296296297, "grad_norm": 0.499462205585944, "learning_rate": 6.25108784333986e-06, "loss": 0.0314, "step": 12090 }, { "epoch": 1.4340148148148149, "grad_norm": 0.5319447691270714, "learning_rate": 6.244411084331797e-06, "loss": 0.0338, "step": 12100 }, { "epoch": 1.4352, "grad_norm": 0.5526359434374736, "learning_rate": 6.237731958968981e-06, "loss": 0.0337, "step": 12110 }, { "epoch": 1.4363851851851852, "grad_norm": 0.655313459886929, "learning_rate": 6.231050479952346e-06, "loss": 0.0359, "step": 12120 }, { "epoch": 1.4375703703703704, "grad_norm": 0.554788460737137, "learning_rate": 6.224366659987298e-06, "loss": 0.034, "step": 12130 }, { "epoch": 1.4387555555555556, "grad_norm": 0.5659188001685495, "learning_rate": 6.217680511783696e-06, "loss": 0.032, "step": 12140 }, { "epoch": 1.4399407407407407, "grad_norm": 0.973286737871544, "learning_rate": 6.210992048055824e-06, "loss": 0.0363, "step": 12150 }, { "epoch": 1.441125925925926, "grad_norm": 0.3651541450859842, "learning_rate": 6.204301281522376e-06, "loss": 0.0312, "step": 12160 }, { "epoch": 1.442311111111111, "grad_norm": 0.6785911952934957, "learning_rate": 6.197608224906416e-06, "loss": 0.0362, "step": 12170 }, { "epoch": 1.4434962962962963, "grad_norm": 0.4826625798954932, "learning_rate": 6.19091289093537e-06, "loss": 0.0326, "step": 12180 }, { "epoch": 1.4446814814814815, "grad_norm": 0.5575053452007372, "learning_rate": 6.184215292340988e-06, "loss": 0.0357, "step": 12190 }, { "epoch": 1.4458666666666666, "grad_norm": 0.6325374702945982, "learning_rate": 6.1775154418593346e-06, "loss": 0.0349, "step": 12200 }, { "epoch": 1.4470518518518518, "grad_norm": 0.6225928278867026, "learning_rate": 6.170813352230749e-06, "loss": 0.0362, "step": 12210 }, { "epoch": 1.448237037037037, "grad_norm": 0.6831371902504493, "learning_rate": 6.164109036199832e-06, "loss": 0.0346, "step": 12220 }, { "epoch": 1.4494222222222222, "grad_norm": 0.4862788692053201, "learning_rate": 6.157402506515416e-06, "loss": 0.0326, "step": 12230 }, { "epoch": 1.4506074074074073, "grad_norm": 0.5745574919262131, "learning_rate": 6.150693775930547e-06, "loss": 0.0367, "step": 12240 }, { "epoch": 1.4517925925925925, "grad_norm": 0.5733819586191993, "learning_rate": 6.143982857202452e-06, "loss": 0.0325, "step": 12250 }, { "epoch": 1.4529777777777777, "grad_norm": 0.6617345579986803, "learning_rate": 6.137269763092522e-06, "loss": 0.0319, "step": 12260 }, { "epoch": 1.4541629629629629, "grad_norm": 0.6118097823085019, "learning_rate": 6.130554506366278e-06, "loss": 0.0336, "step": 12270 }, { "epoch": 1.4553481481481483, "grad_norm": 0.5350180924024459, "learning_rate": 6.123837099793365e-06, "loss": 0.0345, "step": 12280 }, { "epoch": 1.4565333333333332, "grad_norm": 0.5396770825445341, "learning_rate": 6.117117556147505e-06, "loss": 0.0335, "step": 12290 }, { "epoch": 1.4577185185185186, "grad_norm": 0.4845975079978553, "learning_rate": 6.110395888206492e-06, "loss": 0.0349, "step": 12300 }, { "epoch": 1.4589037037037036, "grad_norm": 0.4999282856662726, "learning_rate": 6.103672108752152e-06, "loss": 0.0335, "step": 12310 }, { "epoch": 1.460088888888889, "grad_norm": 0.4249204434313072, "learning_rate": 6.096946230570332e-06, "loss": 0.032, "step": 12320 }, { "epoch": 1.461274074074074, "grad_norm": 0.4887537144845807, "learning_rate": 6.09021826645087e-06, "loss": 0.0344, "step": 12330 }, { "epoch": 1.4624592592592593, "grad_norm": 0.5014685169268319, "learning_rate": 6.083488229187565e-06, "loss": 0.0336, "step": 12340 }, { "epoch": 1.4636444444444445, "grad_norm": 0.5412856573017294, "learning_rate": 6.076756131578165e-06, "loss": 0.0338, "step": 12350 }, { "epoch": 1.4648296296296297, "grad_norm": 0.5704363530858065, "learning_rate": 6.070021986424332e-06, "loss": 0.0349, "step": 12360 }, { "epoch": 1.4660148148148149, "grad_norm": 0.6142571307425699, "learning_rate": 6.063285806531623e-06, "loss": 0.0328, "step": 12370 }, { "epoch": 1.4672, "grad_norm": 0.5428976640304795, "learning_rate": 6.056547604709461e-06, "loss": 0.0337, "step": 12380 }, { "epoch": 1.4683851851851852, "grad_norm": 0.4504843161243152, "learning_rate": 6.04980739377112e-06, "loss": 0.0341, "step": 12390 }, { "epoch": 1.4695703703703704, "grad_norm": 0.5593867800158661, "learning_rate": 6.043065186533688e-06, "loss": 0.0322, "step": 12400 }, { "epoch": 1.4707555555555556, "grad_norm": 0.512364210972295, "learning_rate": 6.036320995818056e-06, "loss": 0.0334, "step": 12410 }, { "epoch": 1.4719407407407408, "grad_norm": 0.5433841483492066, "learning_rate": 6.029574834448877e-06, "loss": 0.032, "step": 12420 }, { "epoch": 1.473125925925926, "grad_norm": 0.48420023671213425, "learning_rate": 6.022826715254564e-06, "loss": 0.0359, "step": 12430 }, { "epoch": 1.4743111111111111, "grad_norm": 0.5984474055200714, "learning_rate": 6.016076651067242e-06, "loss": 0.0338, "step": 12440 }, { "epoch": 1.4754962962962963, "grad_norm": 0.5813230092492384, "learning_rate": 6.009324654722741e-06, "loss": 0.0333, "step": 12450 }, { "epoch": 1.4766814814814815, "grad_norm": 0.5234600809537441, "learning_rate": 6.00257073906056e-06, "loss": 0.0353, "step": 12460 }, { "epoch": 1.4778666666666667, "grad_norm": 0.5723866809326428, "learning_rate": 5.995814916923855e-06, "loss": 0.0332, "step": 12470 }, { "epoch": 1.4790518518518518, "grad_norm": 0.45896484561524986, "learning_rate": 5.989057201159401e-06, "loss": 0.0323, "step": 12480 }, { "epoch": 1.480237037037037, "grad_norm": 0.43787060826429475, "learning_rate": 5.982297604617575e-06, "loss": 0.0327, "step": 12490 }, { "epoch": 1.4814222222222222, "grad_norm": 0.4293122281582217, "learning_rate": 5.975536140152331e-06, "loss": 0.0337, "step": 12500 }, { "epoch": 1.4826074074074074, "grad_norm": 0.45263164731419336, "learning_rate": 5.9687728206211805e-06, "loss": 0.0319, "step": 12510 }, { "epoch": 1.4837925925925926, "grad_norm": 0.6026734931124957, "learning_rate": 5.9620076588851514e-06, "loss": 0.0348, "step": 12520 }, { "epoch": 1.4849777777777777, "grad_norm": 0.5815538382704218, "learning_rate": 5.955240667808785e-06, "loss": 0.0329, "step": 12530 }, { "epoch": 1.486162962962963, "grad_norm": 0.4022565493846757, "learning_rate": 5.948471860260093e-06, "loss": 0.0352, "step": 12540 }, { "epoch": 1.487348148148148, "grad_norm": 0.5666152365852863, "learning_rate": 5.94170124911055e-06, "loss": 0.0322, "step": 12550 }, { "epoch": 1.4885333333333333, "grad_norm": 0.5394228797071462, "learning_rate": 5.934928847235053e-06, "loss": 0.0353, "step": 12560 }, { "epoch": 1.4897185185185184, "grad_norm": 0.5115183626748209, "learning_rate": 5.928154667511908e-06, "loss": 0.0345, "step": 12570 }, { "epoch": 1.4909037037037036, "grad_norm": 0.5272183873058901, "learning_rate": 5.9213787228228e-06, "loss": 0.0338, "step": 12580 }, { "epoch": 1.492088888888889, "grad_norm": 0.5952295525046291, "learning_rate": 5.914601026052769e-06, "loss": 0.0349, "step": 12590 }, { "epoch": 1.493274074074074, "grad_norm": 0.7236059494972624, "learning_rate": 5.907821590090191e-06, "loss": 0.0358, "step": 12600 }, { "epoch": 1.4944592592592594, "grad_norm": 0.5009202830839249, "learning_rate": 5.9010404278267475e-06, "loss": 0.0346, "step": 12610 }, { "epoch": 1.4956444444444443, "grad_norm": 0.6375866231808818, "learning_rate": 5.8942575521574005e-06, "loss": 0.0327, "step": 12620 }, { "epoch": 1.4968296296296297, "grad_norm": 0.6448423065195115, "learning_rate": 5.887472975980372e-06, "loss": 0.0354, "step": 12630 }, { "epoch": 1.4980148148148147, "grad_norm": 0.5160262111575215, "learning_rate": 5.880686712197117e-06, "loss": 0.0318, "step": 12640 }, { "epoch": 1.4992, "grad_norm": 0.4143541144775653, "learning_rate": 5.8738987737123e-06, "loss": 0.031, "step": 12650 }, { "epoch": 1.500385185185185, "grad_norm": 0.3538080001350191, "learning_rate": 5.867109173433772e-06, "loss": 0.0337, "step": 12660 }, { "epoch": 1.5015703703703704, "grad_norm": 0.46170497963820933, "learning_rate": 5.8603179242725395e-06, "loss": 0.0309, "step": 12670 }, { "epoch": 1.5027555555555554, "grad_norm": 0.7078203234460463, "learning_rate": 5.85352503914275e-06, "loss": 0.0351, "step": 12680 }, { "epoch": 1.5039407407407408, "grad_norm": 0.6078356295083309, "learning_rate": 5.846730530961654e-06, "loss": 0.0315, "step": 12690 }, { "epoch": 1.505125925925926, "grad_norm": 0.5113595872934816, "learning_rate": 5.8399344126496e-06, "loss": 0.0334, "step": 12700 }, { "epoch": 1.5063111111111112, "grad_norm": 0.5442291307023379, "learning_rate": 5.833136697129987e-06, "loss": 0.0323, "step": 12710 }, { "epoch": 1.5074962962962963, "grad_norm": 0.7009128371808973, "learning_rate": 5.826337397329259e-06, "loss": 0.0325, "step": 12720 }, { "epoch": 1.5086814814814815, "grad_norm": 0.8788712357533189, "learning_rate": 5.819536526176865e-06, "loss": 0.0321, "step": 12730 }, { "epoch": 1.5098666666666667, "grad_norm": 0.8178648648830867, "learning_rate": 5.812734096605253e-06, "loss": 0.0339, "step": 12740 }, { "epoch": 1.5110518518518519, "grad_norm": 0.7361560613112331, "learning_rate": 5.8059301215498236e-06, "loss": 0.0362, "step": 12750 }, { "epoch": 1.512237037037037, "grad_norm": 0.6016599075578817, "learning_rate": 5.799124613948923e-06, "loss": 0.0318, "step": 12760 }, { "epoch": 1.5134222222222222, "grad_norm": 0.5408668261916741, "learning_rate": 5.792317586743806e-06, "loss": 0.0326, "step": 12770 }, { "epoch": 1.5146074074074074, "grad_norm": 0.4982805394078052, "learning_rate": 5.7855090528786266e-06, "loss": 0.0318, "step": 12780 }, { "epoch": 1.5157925925925926, "grad_norm": 0.5149272105546082, "learning_rate": 5.778699025300391e-06, "loss": 0.0306, "step": 12790 }, { "epoch": 1.5169777777777778, "grad_norm": 0.6609594016579912, "learning_rate": 5.77188751695896e-06, "loss": 0.0325, "step": 12800 }, { "epoch": 1.518162962962963, "grad_norm": 0.48397098756621043, "learning_rate": 5.765074540806994e-06, "loss": 0.0352, "step": 12810 }, { "epoch": 1.5193481481481481, "grad_norm": 0.5934881688537705, "learning_rate": 5.758260109799962e-06, "loss": 0.0339, "step": 12820 }, { "epoch": 1.5205333333333333, "grad_norm": 0.5789006476519347, "learning_rate": 5.751444236896085e-06, "loss": 0.0334, "step": 12830 }, { "epoch": 1.5217185185185185, "grad_norm": 0.5683388999786305, "learning_rate": 5.744626935056335e-06, "loss": 0.0349, "step": 12840 }, { "epoch": 1.5229037037037036, "grad_norm": 0.5070308237342998, "learning_rate": 5.737808217244396e-06, "loss": 0.0326, "step": 12850 }, { "epoch": 1.524088888888889, "grad_norm": 0.5091743032835572, "learning_rate": 5.730988096426649e-06, "loss": 0.0327, "step": 12860 }, { "epoch": 1.525274074074074, "grad_norm": 0.43571818250867006, "learning_rate": 5.724166585572137e-06, "loss": 0.0308, "step": 12870 }, { "epoch": 1.5264592592592594, "grad_norm": 0.5166481812065432, "learning_rate": 5.717343697652552e-06, "loss": 0.0325, "step": 12880 }, { "epoch": 1.5276444444444444, "grad_norm": 0.6338275492879064, "learning_rate": 5.710519445642203e-06, "loss": 0.0354, "step": 12890 }, { "epoch": 1.5288296296296298, "grad_norm": 0.5597685545984137, "learning_rate": 5.703693842517993e-06, "loss": 0.0365, "step": 12900 }, { "epoch": 1.5300148148148147, "grad_norm": 0.498291021479039, "learning_rate": 5.696866901259392e-06, "loss": 0.0322, "step": 12910 }, { "epoch": 1.5312000000000001, "grad_norm": 0.6230397076757928, "learning_rate": 5.690038634848415e-06, "loss": 0.0325, "step": 12920 }, { "epoch": 1.532385185185185, "grad_norm": 0.4620149579603233, "learning_rate": 5.683209056269601e-06, "loss": 0.0325, "step": 12930 }, { "epoch": 1.5335703703703705, "grad_norm": 0.6229466618263108, "learning_rate": 5.6763781785099806e-06, "loss": 0.0339, "step": 12940 }, { "epoch": 1.5347555555555554, "grad_norm": 0.6097269669831146, "learning_rate": 5.669546014559053e-06, "loss": 0.0323, "step": 12950 }, { "epoch": 1.5359407407407408, "grad_norm": 0.5561868163069095, "learning_rate": 5.662712577408769e-06, "loss": 0.0336, "step": 12960 }, { "epoch": 1.5371259259259258, "grad_norm": 0.47437352647284287, "learning_rate": 5.6558778800534975e-06, "loss": 0.0314, "step": 12970 }, { "epoch": 1.5383111111111112, "grad_norm": 0.5161057320874354, "learning_rate": 5.649041935490001e-06, "loss": 0.0344, "step": 12980 }, { "epoch": 1.5394962962962961, "grad_norm": 0.5571655285737502, "learning_rate": 5.642204756717419e-06, "loss": 0.0308, "step": 12990 }, { "epoch": 1.5406814814814815, "grad_norm": 0.5523295288576924, "learning_rate": 5.635366356737231e-06, "loss": 0.0316, "step": 13000 }, { "epoch": 1.5418666666666667, "grad_norm": 0.5400102128403003, "learning_rate": 5.628526748553248e-06, "loss": 0.0314, "step": 13010 }, { "epoch": 1.543051851851852, "grad_norm": 0.7644640708120826, "learning_rate": 5.62168594517157e-06, "loss": 0.0332, "step": 13020 }, { "epoch": 1.544237037037037, "grad_norm": 0.7127615461894612, "learning_rate": 5.614843959600577e-06, "loss": 0.0314, "step": 13030 }, { "epoch": 1.5454222222222223, "grad_norm": 0.5754727400557247, "learning_rate": 5.608000804850887e-06, "loss": 0.0313, "step": 13040 }, { "epoch": 1.5466074074074074, "grad_norm": 0.5599260186744512, "learning_rate": 5.601156493935355e-06, "loss": 0.0344, "step": 13050 }, { "epoch": 1.5477925925925926, "grad_norm": 0.3826711882836975, "learning_rate": 5.594311039869022e-06, "loss": 0.0315, "step": 13060 }, { "epoch": 1.5489777777777778, "grad_norm": 0.42317400660031274, "learning_rate": 5.587464455669109e-06, "loss": 0.0314, "step": 13070 }, { "epoch": 1.550162962962963, "grad_norm": 0.4086030442509822, "learning_rate": 5.580616754354985e-06, "loss": 0.0315, "step": 13080 }, { "epoch": 1.5513481481481481, "grad_norm": 0.47338628038441094, "learning_rate": 5.573767948948146e-06, "loss": 0.0317, "step": 13090 }, { "epoch": 1.5525333333333333, "grad_norm": 0.7169833634684311, "learning_rate": 5.56691805247218e-06, "loss": 0.0342, "step": 13100 }, { "epoch": 1.5537185185185185, "grad_norm": 0.5407255083897317, "learning_rate": 5.56006707795276e-06, "loss": 0.0303, "step": 13110 }, { "epoch": 1.5549037037037037, "grad_norm": 0.8122389937260033, "learning_rate": 5.553215038417597e-06, "loss": 0.0313, "step": 13120 }, { "epoch": 1.5560888888888889, "grad_norm": 0.6922836748841144, "learning_rate": 5.546361946896439e-06, "loss": 0.036, "step": 13130 }, { "epoch": 1.557274074074074, "grad_norm": 0.3190103522451645, "learning_rate": 5.539507816421027e-06, "loss": 0.0278, "step": 13140 }, { "epoch": 1.5584592592592592, "grad_norm": 0.473287497333893, "learning_rate": 5.532652660025081e-06, "loss": 0.031, "step": 13150 }, { "epoch": 1.5596444444444444, "grad_norm": 0.5588835513188126, "learning_rate": 5.525796490744269e-06, "loss": 0.0332, "step": 13160 }, { "epoch": 1.5608296296296298, "grad_norm": 0.44401173743418904, "learning_rate": 5.518939321616189e-06, "loss": 0.0315, "step": 13170 }, { "epoch": 1.5620148148148147, "grad_norm": 0.5440112495679429, "learning_rate": 5.512081165680336e-06, "loss": 0.0351, "step": 13180 }, { "epoch": 1.5632000000000001, "grad_norm": 0.5845981732775536, "learning_rate": 5.5052220359780865e-06, "loss": 0.0347, "step": 13190 }, { "epoch": 1.564385185185185, "grad_norm": 0.4455173070571021, "learning_rate": 5.498361945552662e-06, "loss": 0.0346, "step": 13200 }, { "epoch": 1.5655703703703705, "grad_norm": 0.4408979763576873, "learning_rate": 5.491500907449118e-06, "loss": 0.0335, "step": 13210 }, { "epoch": 1.5667555555555555, "grad_norm": 0.8791662408943192, "learning_rate": 5.484638934714307e-06, "loss": 0.0282, "step": 13220 }, { "epoch": 1.5679407407407409, "grad_norm": 0.7316584611770932, "learning_rate": 5.47777604039686e-06, "loss": 0.031, "step": 13230 }, { "epoch": 1.5691259259259258, "grad_norm": 0.4964055070794261, "learning_rate": 5.4709122375471645e-06, "loss": 0.0321, "step": 13240 }, { "epoch": 1.5703111111111112, "grad_norm": 0.5630139537326069, "learning_rate": 5.464047539217329e-06, "loss": 0.0294, "step": 13250 }, { "epoch": 1.5714962962962962, "grad_norm": 0.5507139644896905, "learning_rate": 5.457181958461167e-06, "loss": 0.0322, "step": 13260 }, { "epoch": 1.5726814814814816, "grad_norm": 0.4677531128584071, "learning_rate": 5.450315508334174e-06, "loss": 0.032, "step": 13270 }, { "epoch": 1.5738666666666665, "grad_norm": 0.6262982988055855, "learning_rate": 5.443448201893496e-06, "loss": 0.032, "step": 13280 }, { "epoch": 1.575051851851852, "grad_norm": 0.8777731943478929, "learning_rate": 5.436580052197905e-06, "loss": 0.032, "step": 13290 }, { "epoch": 1.5762370370370369, "grad_norm": 0.7150436176075989, "learning_rate": 5.42971107230778e-06, "loss": 0.0354, "step": 13300 }, { "epoch": 1.5774222222222223, "grad_norm": 0.5825965324224895, "learning_rate": 5.422841275285075e-06, "loss": 0.0338, "step": 13310 }, { "epoch": 1.5786074074074075, "grad_norm": 0.5536348026035459, "learning_rate": 5.415970674193303e-06, "loss": 0.0316, "step": 13320 }, { "epoch": 1.5797925925925926, "grad_norm": 0.43967616397879106, "learning_rate": 5.409099282097502e-06, "loss": 0.0334, "step": 13330 }, { "epoch": 1.5809777777777778, "grad_norm": 0.5732369856067626, "learning_rate": 5.402227112064216e-06, "loss": 0.0304, "step": 13340 }, { "epoch": 1.582162962962963, "grad_norm": 0.4543688416741047, "learning_rate": 5.395354177161464e-06, "loss": 0.0304, "step": 13350 }, { "epoch": 1.5833481481481482, "grad_norm": 0.6300372642789709, "learning_rate": 5.388480490458729e-06, "loss": 0.0328, "step": 13360 }, { "epoch": 1.5845333333333333, "grad_norm": 0.6261021967067414, "learning_rate": 5.381606065026913e-06, "loss": 0.0348, "step": 13370 }, { "epoch": 1.5857185185185185, "grad_norm": 0.6890944257762874, "learning_rate": 5.374730913938331e-06, "loss": 0.0342, "step": 13380 }, { "epoch": 1.5869037037037037, "grad_norm": 0.383277010276292, "learning_rate": 5.367855050266671e-06, "loss": 0.0285, "step": 13390 }, { "epoch": 1.5880888888888889, "grad_norm": 0.5100025539483825, "learning_rate": 5.360978487086982e-06, "loss": 0.0283, "step": 13400 }, { "epoch": 1.589274074074074, "grad_norm": 0.4436995345159667, "learning_rate": 5.354101237475638e-06, "loss": 0.0344, "step": 13410 }, { "epoch": 1.5904592592592592, "grad_norm": 0.4429829601313009, "learning_rate": 5.347223314510324e-06, "loss": 0.0326, "step": 13420 }, { "epoch": 1.5916444444444444, "grad_norm": 0.4971954541232553, "learning_rate": 5.3403447312699995e-06, "loss": 0.0333, "step": 13430 }, { "epoch": 1.5928296296296296, "grad_norm": 0.4750617880508376, "learning_rate": 5.333465500834885e-06, "loss": 0.0311, "step": 13440 }, { "epoch": 1.5940148148148148, "grad_norm": 0.516993521346015, "learning_rate": 5.3265856362864275e-06, "loss": 0.0318, "step": 13450 }, { "epoch": 1.5952, "grad_norm": 0.5913079570692042, "learning_rate": 5.319705150707281e-06, "loss": 0.036, "step": 13460 }, { "epoch": 1.5963851851851851, "grad_norm": 0.4179441199818623, "learning_rate": 5.312824057181282e-06, "loss": 0.0319, "step": 13470 }, { "epoch": 1.5975703703703705, "grad_norm": 0.520202667237722, "learning_rate": 5.3059423687934215e-06, "loss": 0.0348, "step": 13480 }, { "epoch": 1.5987555555555555, "grad_norm": 0.5875177871794808, "learning_rate": 5.299060098629822e-06, "loss": 0.0312, "step": 13490 }, { "epoch": 1.5999407407407409, "grad_norm": 0.42173946676094465, "learning_rate": 5.292177259777712e-06, "loss": 0.0336, "step": 13500 }, { "epoch": 1.6011259259259258, "grad_norm": 0.4960348121315329, "learning_rate": 5.285293865325403e-06, "loss": 0.0342, "step": 13510 }, { "epoch": 1.6023111111111112, "grad_norm": 0.6682583846044446, "learning_rate": 5.278409928362261e-06, "loss": 0.0325, "step": 13520 }, { "epoch": 1.6034962962962962, "grad_norm": 0.4409770099559712, "learning_rate": 5.271525461978685e-06, "loss": 0.0323, "step": 13530 }, { "epoch": 1.6046814814814816, "grad_norm": 0.46424600824419054, "learning_rate": 5.264640479266079e-06, "loss": 0.0339, "step": 13540 }, { "epoch": 1.6058666666666666, "grad_norm": 0.527160755063716, "learning_rate": 5.257754993316831e-06, "loss": 0.0264, "step": 13550 }, { "epoch": 1.607051851851852, "grad_norm": 0.628985681715465, "learning_rate": 5.250869017224284e-06, "loss": 0.0333, "step": 13560 }, { "epoch": 1.608237037037037, "grad_norm": 0.44631646827315336, "learning_rate": 5.243982564082716e-06, "loss": 0.033, "step": 13570 }, { "epoch": 1.6094222222222223, "grad_norm": 0.6095352401098801, "learning_rate": 5.237095646987308e-06, "loss": 0.0319, "step": 13580 }, { "epoch": 1.6106074074074073, "grad_norm": 0.5683086177600853, "learning_rate": 5.230208279034128e-06, "loss": 0.0308, "step": 13590 }, { "epoch": 1.6117925925925927, "grad_norm": 0.4649675799365343, "learning_rate": 5.223320473320095e-06, "loss": 0.0321, "step": 13600 }, { "epoch": 1.6129777777777776, "grad_norm": 0.5165314027749358, "learning_rate": 5.216432242942969e-06, "loss": 0.0306, "step": 13610 }, { "epoch": 1.614162962962963, "grad_norm": 0.6363078505636841, "learning_rate": 5.209543601001307e-06, "loss": 0.0312, "step": 13620 }, { "epoch": 1.6153481481481482, "grad_norm": 0.5388796574454523, "learning_rate": 5.20265456059446e-06, "loss": 0.029, "step": 13630 }, { "epoch": 1.6165333333333334, "grad_norm": 0.5813676482541661, "learning_rate": 5.195765134822528e-06, "loss": 0.0312, "step": 13640 }, { "epoch": 1.6177185185185186, "grad_norm": 0.4924277892767643, "learning_rate": 5.188875336786349e-06, "loss": 0.0326, "step": 13650 }, { "epoch": 1.6189037037037037, "grad_norm": 0.4839135957845775, "learning_rate": 5.181985179587463e-06, "loss": 0.0323, "step": 13660 }, { "epoch": 1.620088888888889, "grad_norm": 0.3535677135763709, "learning_rate": 5.1750946763281e-06, "loss": 0.0317, "step": 13670 }, { "epoch": 1.621274074074074, "grad_norm": 0.5514640615117711, "learning_rate": 5.1682038401111446e-06, "loss": 0.0295, "step": 13680 }, { "epoch": 1.6224592592592593, "grad_norm": 0.5094425767604717, "learning_rate": 5.161312684040114e-06, "loss": 0.0335, "step": 13690 }, { "epoch": 1.6236444444444444, "grad_norm": 0.5582975954470566, "learning_rate": 5.154421221219135e-06, "loss": 0.033, "step": 13700 }, { "epoch": 1.6248296296296296, "grad_norm": 0.46751325802838417, "learning_rate": 5.147529464752916e-06, "loss": 0.0318, "step": 13710 }, { "epoch": 1.6260148148148148, "grad_norm": 0.5851422023563263, "learning_rate": 5.140637427746726e-06, "loss": 0.0328, "step": 13720 }, { "epoch": 1.6272, "grad_norm": 0.531886778556515, "learning_rate": 5.133745123306366e-06, "loss": 0.0331, "step": 13730 }, { "epoch": 1.6283851851851852, "grad_norm": 0.542386697371137, "learning_rate": 5.126852564538145e-06, "loss": 0.0311, "step": 13740 }, { "epoch": 1.6295703703703703, "grad_norm": 0.6884539166308733, "learning_rate": 5.11995976454886e-06, "loss": 0.0329, "step": 13750 }, { "epoch": 1.6307555555555555, "grad_norm": 0.4259484145215916, "learning_rate": 5.1130667364457585e-06, "loss": 0.0314, "step": 13760 }, { "epoch": 1.6319407407407407, "grad_norm": 0.3449300478568714, "learning_rate": 5.10617349333653e-06, "loss": 0.0289, "step": 13770 }, { "epoch": 1.6331259259259259, "grad_norm": 0.6387429519380369, "learning_rate": 5.099280048329268e-06, "loss": 0.0328, "step": 13780 }, { "epoch": 1.6343111111111113, "grad_norm": 0.5000192532252813, "learning_rate": 5.092386414532452e-06, "loss": 0.0313, "step": 13790 }, { "epoch": 1.6354962962962962, "grad_norm": 0.4543401858549053, "learning_rate": 5.085492605054919e-06, "loss": 0.0317, "step": 13800 }, { "epoch": 1.6366814814814816, "grad_norm": 0.3808911842435508, "learning_rate": 5.0785986330058415e-06, "loss": 0.0292, "step": 13810 }, { "epoch": 1.6378666666666666, "grad_norm": 0.641061286823941, "learning_rate": 5.0717045114946995e-06, "loss": 0.0334, "step": 13820 }, { "epoch": 1.639051851851852, "grad_norm": 0.5358972255901302, "learning_rate": 5.064810253631261e-06, "loss": 0.0279, "step": 13830 }, { "epoch": 1.640237037037037, "grad_norm": 0.5497119658997647, "learning_rate": 5.057915872525546e-06, "loss": 0.0306, "step": 13840 }, { "epoch": 1.6414222222222223, "grad_norm": 0.5254691133999501, "learning_rate": 5.0510213812878175e-06, "loss": 0.0297, "step": 13850 }, { "epoch": 1.6426074074074073, "grad_norm": 0.7330691921751293, "learning_rate": 5.044126793028543e-06, "loss": 0.0337, "step": 13860 }, { "epoch": 1.6437925925925927, "grad_norm": 0.5483217602392962, "learning_rate": 5.037232120858374e-06, "loss": 0.0305, "step": 13870 }, { "epoch": 1.6449777777777776, "grad_norm": 0.6661446428335243, "learning_rate": 5.030337377888124e-06, "loss": 0.0304, "step": 13880 }, { "epoch": 1.646162962962963, "grad_norm": 0.31071784886065973, "learning_rate": 5.0234425772287385e-06, "loss": 0.0312, "step": 13890 }, { "epoch": 1.647348148148148, "grad_norm": 0.44037140857046925, "learning_rate": 5.016547731991277e-06, "loss": 0.0329, "step": 13900 }, { "epoch": 1.6485333333333334, "grad_norm": 0.42685646270216976, "learning_rate": 5.009652855286878e-06, "loss": 0.0296, "step": 13910 }, { "epoch": 1.6497185185185184, "grad_norm": 0.4170962141617505, "learning_rate": 5.002757960226744e-06, "loss": 0.0297, "step": 13920 }, { "epoch": 1.6509037037037038, "grad_norm": 0.44864544262814066, "learning_rate": 4.995863059922111e-06, "loss": 0.033, "step": 13930 }, { "epoch": 1.652088888888889, "grad_norm": 0.5355523913822217, "learning_rate": 4.988968167484227e-06, "loss": 0.0295, "step": 13940 }, { "epoch": 1.6532740740740741, "grad_norm": 0.5123179332709443, "learning_rate": 4.982073296024321e-06, "loss": 0.0304, "step": 13950 }, { "epoch": 1.6544592592592593, "grad_norm": 0.7933900045484795, "learning_rate": 4.975178458653586e-06, "loss": 0.0293, "step": 13960 }, { "epoch": 1.6556444444444445, "grad_norm": 0.4222212429222394, "learning_rate": 4.968283668483146e-06, "loss": 0.0303, "step": 13970 }, { "epoch": 1.6568296296296297, "grad_norm": 0.7353778216738837, "learning_rate": 4.961388938624038e-06, "loss": 0.032, "step": 13980 }, { "epoch": 1.6580148148148148, "grad_norm": 0.5278524829550586, "learning_rate": 4.9544942821871875e-06, "loss": 0.031, "step": 13990 }, { "epoch": 1.6592, "grad_norm": 0.4757237246191436, "learning_rate": 4.947599712283375e-06, "loss": 0.0296, "step": 14000 }, { "epoch": 1.6603851851851852, "grad_norm": 0.46821950454696787, "learning_rate": 4.940705242023219e-06, "loss": 0.0276, "step": 14010 }, { "epoch": 1.6615703703703704, "grad_norm": 0.4402682731768157, "learning_rate": 4.933810884517148e-06, "loss": 0.0279, "step": 14020 }, { "epoch": 1.6627555555555555, "grad_norm": 0.5489239763857653, "learning_rate": 4.926916652875373e-06, "loss": 0.0306, "step": 14030 }, { "epoch": 1.6639407407407407, "grad_norm": 0.5936036416875581, "learning_rate": 4.920022560207873e-06, "loss": 0.0311, "step": 14040 }, { "epoch": 1.665125925925926, "grad_norm": 0.4778677822228148, "learning_rate": 4.913128619624355e-06, "loss": 0.0321, "step": 14050 }, { "epoch": 1.666311111111111, "grad_norm": 0.459358995507567, "learning_rate": 4.9062348442342405e-06, "loss": 0.0307, "step": 14060 }, { "epoch": 1.6674962962962963, "grad_norm": 0.520021202362464, "learning_rate": 4.899341247146639e-06, "loss": 0.0294, "step": 14070 }, { "epoch": 1.6686814814814814, "grad_norm": 0.4650761047088053, "learning_rate": 4.892447841470318e-06, "loss": 0.0296, "step": 14080 }, { "epoch": 1.6698666666666666, "grad_norm": 0.4188820057598217, "learning_rate": 4.885554640313679e-06, "loss": 0.0308, "step": 14090 }, { "epoch": 1.671051851851852, "grad_norm": 0.6553845274366292, "learning_rate": 4.8786616567847415e-06, "loss": 0.0306, "step": 14100 }, { "epoch": 1.672237037037037, "grad_norm": 0.42579030631098574, "learning_rate": 4.871768903991102e-06, "loss": 0.0312, "step": 14110 }, { "epoch": 1.6734222222222224, "grad_norm": 0.4372859940183096, "learning_rate": 4.864876395039926e-06, "loss": 0.0299, "step": 14120 }, { "epoch": 1.6746074074074073, "grad_norm": 0.5449410951853673, "learning_rate": 4.857984143037911e-06, "loss": 0.0285, "step": 14130 }, { "epoch": 1.6757925925925927, "grad_norm": 0.4035070881722527, "learning_rate": 4.851092161091267e-06, "loss": 0.0304, "step": 14140 }, { "epoch": 1.6769777777777777, "grad_norm": 0.38621528763695523, "learning_rate": 4.844200462305693e-06, "loss": 0.0303, "step": 14150 }, { "epoch": 1.678162962962963, "grad_norm": 0.48185406252429064, "learning_rate": 4.837309059786344e-06, "loss": 0.0313, "step": 14160 }, { "epoch": 1.679348148148148, "grad_norm": 0.5947634110300325, "learning_rate": 4.830417966637817e-06, "loss": 0.0371, "step": 14170 }, { "epoch": 1.6805333333333334, "grad_norm": 0.4826138196658728, "learning_rate": 4.823527195964119e-06, "loss": 0.0304, "step": 14180 }, { "epoch": 1.6817185185185184, "grad_norm": 0.7322907119535944, "learning_rate": 4.816636760868642e-06, "loss": 0.0296, "step": 14190 }, { "epoch": 1.6829037037037038, "grad_norm": 0.47679496814141087, "learning_rate": 4.809746674454142e-06, "loss": 0.033, "step": 14200 }, { "epoch": 1.6840888888888887, "grad_norm": 0.460173285349719, "learning_rate": 4.802856949822709e-06, "loss": 0.0292, "step": 14210 }, { "epoch": 1.6852740740740741, "grad_norm": 0.7931355404195776, "learning_rate": 4.79596760007575e-06, "loss": 0.0285, "step": 14220 }, { "epoch": 1.686459259259259, "grad_norm": 0.6092037703130151, "learning_rate": 4.789078638313956e-06, "loss": 0.0318, "step": 14230 }, { "epoch": 1.6876444444444445, "grad_norm": 0.5072650049144025, "learning_rate": 4.78219007763728e-06, "loss": 0.0317, "step": 14240 }, { "epoch": 1.6888296296296297, "grad_norm": 0.4029340535377157, "learning_rate": 4.775301931144913e-06, "loss": 0.0281, "step": 14250 }, { "epoch": 1.6900148148148149, "grad_norm": 0.48497410162524296, "learning_rate": 4.7684142119352564e-06, "loss": 0.0311, "step": 14260 }, { "epoch": 1.6912, "grad_norm": 0.43522761213890987, "learning_rate": 4.761526933105905e-06, "loss": 0.0325, "step": 14270 }, { "epoch": 1.6923851851851852, "grad_norm": 0.5016689344640952, "learning_rate": 4.754640107753607e-06, "loss": 0.0324, "step": 14280 }, { "epoch": 1.6935703703703704, "grad_norm": 0.4937025258400182, "learning_rate": 4.747753748974256e-06, "loss": 0.0336, "step": 14290 }, { "epoch": 1.6947555555555556, "grad_norm": 0.5057012342095206, "learning_rate": 4.7408678698628555e-06, "loss": 0.0301, "step": 14300 }, { "epoch": 1.6959407407407407, "grad_norm": 0.6057372860962584, "learning_rate": 4.733982483513499e-06, "loss": 0.0297, "step": 14310 }, { "epoch": 1.697125925925926, "grad_norm": 0.6106725729801042, "learning_rate": 4.727097603019339e-06, "loss": 0.0315, "step": 14320 }, { "epoch": 1.698311111111111, "grad_norm": 0.530901331449317, "learning_rate": 4.72021324147257e-06, "loss": 0.0282, "step": 14330 }, { "epoch": 1.6994962962962963, "grad_norm": 0.4935798770006407, "learning_rate": 4.713329411964395e-06, "loss": 0.0295, "step": 14340 }, { "epoch": 1.7006814814814815, "grad_norm": 0.5634254836080385, "learning_rate": 4.706446127585011e-06, "loss": 0.0309, "step": 14350 }, { "epoch": 1.7018666666666666, "grad_norm": 0.4046089753188776, "learning_rate": 4.699563401423572e-06, "loss": 0.0305, "step": 14360 }, { "epoch": 1.7030518518518518, "grad_norm": 0.46720466293233187, "learning_rate": 4.692681246568175e-06, "loss": 0.0327, "step": 14370 }, { "epoch": 1.704237037037037, "grad_norm": 0.4284975562781869, "learning_rate": 4.685799676105833e-06, "loss": 0.0327, "step": 14380 }, { "epoch": 1.7054222222222222, "grad_norm": 0.4335668602538958, "learning_rate": 4.678918703122443e-06, "loss": 0.0332, "step": 14390 }, { "epoch": 1.7066074074074074, "grad_norm": 0.4443969529333878, "learning_rate": 4.672038340702765e-06, "loss": 0.0333, "step": 14400 }, { "epoch": 1.7077925925925928, "grad_norm": 0.41635457749946203, "learning_rate": 4.665158601930402e-06, "loss": 0.03, "step": 14410 }, { "epoch": 1.7089777777777777, "grad_norm": 0.532917600779119, "learning_rate": 4.658279499887769e-06, "loss": 0.029, "step": 14420 }, { "epoch": 1.710162962962963, "grad_norm": 0.6427531171748737, "learning_rate": 4.6514010476560695e-06, "loss": 0.0317, "step": 14430 }, { "epoch": 1.711348148148148, "grad_norm": 0.47754549180960465, "learning_rate": 4.644523258315273e-06, "loss": 0.0273, "step": 14440 }, { "epoch": 1.7125333333333335, "grad_norm": 0.51966123237056, "learning_rate": 4.637646144944086e-06, "loss": 0.0277, "step": 14450 }, { "epoch": 1.7137185185185184, "grad_norm": 0.6344384371740579, "learning_rate": 4.630769720619935e-06, "loss": 0.0304, "step": 14460 }, { "epoch": 1.7149037037037038, "grad_norm": 0.5124649297628256, "learning_rate": 4.62389399841893e-06, "loss": 0.0309, "step": 14470 }, { "epoch": 1.7160888888888888, "grad_norm": 0.7370031495170535, "learning_rate": 4.617018991415849e-06, "loss": 0.0313, "step": 14480 }, { "epoch": 1.7172740740740742, "grad_norm": 0.6802078608873966, "learning_rate": 4.61014471268411e-06, "loss": 0.0337, "step": 14490 }, { "epoch": 1.7184592592592591, "grad_norm": 0.5294116180956786, "learning_rate": 4.603271175295745e-06, "loss": 0.0307, "step": 14500 }, { "epoch": 1.7196444444444445, "grad_norm": 0.5790305278482072, "learning_rate": 4.596398392321376e-06, "loss": 0.0305, "step": 14510 }, { "epoch": 1.7208296296296295, "grad_norm": 0.3545145887216605, "learning_rate": 4.5895263768301895e-06, "loss": 0.0291, "step": 14520 }, { "epoch": 1.7220148148148149, "grad_norm": 0.5870242012913149, "learning_rate": 4.582655141889918e-06, "loss": 0.0278, "step": 14530 }, { "epoch": 1.7231999999999998, "grad_norm": 0.530338737581341, "learning_rate": 4.575784700566805e-06, "loss": 0.034, "step": 14540 }, { "epoch": 1.7243851851851852, "grad_norm": 0.3864531510757058, "learning_rate": 4.568915065925585e-06, "loss": 0.0303, "step": 14550 }, { "epoch": 1.7255703703703704, "grad_norm": 0.700862334642734, "learning_rate": 4.562046251029461e-06, "loss": 0.0323, "step": 14560 }, { "epoch": 1.7267555555555556, "grad_norm": 0.45647606396128376, "learning_rate": 4.555178268940073e-06, "loss": 0.0309, "step": 14570 }, { "epoch": 1.7279407407407408, "grad_norm": 0.4039825537426591, "learning_rate": 4.548311132717482e-06, "loss": 0.0291, "step": 14580 }, { "epoch": 1.729125925925926, "grad_norm": 0.5180726000904167, "learning_rate": 4.541444855420136e-06, "loss": 0.0313, "step": 14590 }, { "epoch": 1.7303111111111111, "grad_norm": 0.43671016883425606, "learning_rate": 4.534579450104854e-06, "loss": 0.0309, "step": 14600 }, { "epoch": 1.7314962962962963, "grad_norm": 0.4953216633105977, "learning_rate": 4.527714929826793e-06, "loss": 0.027, "step": 14610 }, { "epoch": 1.7326814814814815, "grad_norm": 0.4875449748426298, "learning_rate": 4.5208513076394335e-06, "loss": 0.0308, "step": 14620 }, { "epoch": 1.7338666666666667, "grad_norm": 0.47029719396474, "learning_rate": 4.513988596594539e-06, "loss": 0.0328, "step": 14630 }, { "epoch": 1.7350518518518518, "grad_norm": 0.48243692535173677, "learning_rate": 4.507126809742148e-06, "loss": 0.03, "step": 14640 }, { "epoch": 1.736237037037037, "grad_norm": 0.5764337703138601, "learning_rate": 4.500265960130537e-06, "loss": 0.0324, "step": 14650 }, { "epoch": 1.7374222222222222, "grad_norm": 0.6105999092443823, "learning_rate": 4.493406060806202e-06, "loss": 0.0277, "step": 14660 }, { "epoch": 1.7386074074074074, "grad_norm": 0.5344248733212823, "learning_rate": 4.486547124813832e-06, "loss": 0.0292, "step": 14670 }, { "epoch": 1.7397925925925926, "grad_norm": 0.5438780644211062, "learning_rate": 4.479689165196283e-06, "loss": 0.0263, "step": 14680 }, { "epoch": 1.7409777777777777, "grad_norm": 0.5625317091725303, "learning_rate": 4.472832194994557e-06, "loss": 0.0325, "step": 14690 }, { "epoch": 1.742162962962963, "grad_norm": 0.5911177408897746, "learning_rate": 4.465976227247773e-06, "loss": 0.0319, "step": 14700 }, { "epoch": 1.743348148148148, "grad_norm": 0.5688399100122186, "learning_rate": 4.459121274993141e-06, "loss": 0.0309, "step": 14710 }, { "epoch": 1.7445333333333335, "grad_norm": 0.568522407943515, "learning_rate": 4.452267351265947e-06, "loss": 0.0281, "step": 14720 }, { "epoch": 1.7457185185185184, "grad_norm": 0.5196838385210767, "learning_rate": 4.445414469099512e-06, "loss": 0.03, "step": 14730 }, { "epoch": 1.7469037037037038, "grad_norm": 0.5563854041950367, "learning_rate": 4.438562641525184e-06, "loss": 0.0292, "step": 14740 }, { "epoch": 1.7480888888888888, "grad_norm": 0.4929953062910534, "learning_rate": 4.4317118815723e-06, "loss": 0.0303, "step": 14750 }, { "epoch": 1.7492740740740742, "grad_norm": 0.6030489423102072, "learning_rate": 4.424862202268172e-06, "loss": 0.0303, "step": 14760 }, { "epoch": 1.7504592592592592, "grad_norm": 0.6562596332834234, "learning_rate": 4.418013616638056e-06, "loss": 0.0302, "step": 14770 }, { "epoch": 1.7516444444444446, "grad_norm": 0.6551357962548602, "learning_rate": 4.411166137705122e-06, "loss": 0.0309, "step": 14780 }, { "epoch": 1.7528296296296295, "grad_norm": 0.5385225980834302, "learning_rate": 4.404319778490445e-06, "loss": 0.0312, "step": 14790 }, { "epoch": 1.754014814814815, "grad_norm": 0.3758759556508297, "learning_rate": 4.397474552012964e-06, "loss": 0.033, "step": 14800 }, { "epoch": 1.7551999999999999, "grad_norm": 0.41694548874861553, "learning_rate": 4.390630471289465e-06, "loss": 0.0301, "step": 14810 }, { "epoch": 1.7563851851851853, "grad_norm": 0.5754747471394601, "learning_rate": 4.38378754933456e-06, "loss": 0.0281, "step": 14820 }, { "epoch": 1.7575703703703702, "grad_norm": 0.5915191091531806, "learning_rate": 4.376945799160649e-06, "loss": 0.0318, "step": 14830 }, { "epoch": 1.7587555555555556, "grad_norm": 0.4968711750275647, "learning_rate": 4.370105233777912e-06, "loss": 0.035, "step": 14840 }, { "epoch": 1.7599407407407406, "grad_norm": 0.5308736395041633, "learning_rate": 4.363265866194274e-06, "loss": 0.0276, "step": 14850 }, { "epoch": 1.761125925925926, "grad_norm": 0.5743095281000444, "learning_rate": 4.356427709415378e-06, "loss": 0.0343, "step": 14860 }, { "epoch": 1.7623111111111112, "grad_norm": 0.46070132136944636, "learning_rate": 4.349590776444569e-06, "loss": 0.0319, "step": 14870 }, { "epoch": 1.7634962962962963, "grad_norm": 0.39401651982755276, "learning_rate": 4.342755080282861e-06, "loss": 0.0312, "step": 14880 }, { "epoch": 1.7646814814814815, "grad_norm": 0.5720650300000568, "learning_rate": 4.335920633928922e-06, "loss": 0.028, "step": 14890 }, { "epoch": 1.7658666666666667, "grad_norm": 0.4041693252361372, "learning_rate": 4.329087450379038e-06, "loss": 0.0297, "step": 14900 }, { "epoch": 1.7670518518518519, "grad_norm": 0.5166965234748857, "learning_rate": 4.322255542627093e-06, "loss": 0.0288, "step": 14910 }, { "epoch": 1.768237037037037, "grad_norm": 0.49562874888174113, "learning_rate": 4.315424923664552e-06, "loss": 0.0298, "step": 14920 }, { "epoch": 1.7694222222222222, "grad_norm": 0.4374031735110688, "learning_rate": 4.308595606480423e-06, "loss": 0.0325, "step": 14930 }, { "epoch": 1.7706074074074074, "grad_norm": 0.4812305306996144, "learning_rate": 4.301767604061239e-06, "loss": 0.0292, "step": 14940 }, { "epoch": 1.7717925925925926, "grad_norm": 0.40303043536958244, "learning_rate": 4.294940929391035e-06, "loss": 0.0302, "step": 14950 }, { "epoch": 1.7729777777777778, "grad_norm": 0.4200106880557695, "learning_rate": 4.288115595451321e-06, "loss": 0.0251, "step": 14960 }, { "epoch": 1.774162962962963, "grad_norm": 0.3344483051302518, "learning_rate": 4.281291615221056e-06, "loss": 0.0307, "step": 14970 }, { "epoch": 1.7753481481481481, "grad_norm": 0.4306614667374063, "learning_rate": 4.274469001676625e-06, "loss": 0.0283, "step": 14980 }, { "epoch": 1.7765333333333333, "grad_norm": 0.5685891809532609, "learning_rate": 4.267647767791815e-06, "loss": 0.0355, "step": 14990 }, { "epoch": 1.7777185185185185, "grad_norm": 0.5403635494746096, "learning_rate": 4.260827926537789e-06, "loss": 0.0308, "step": 15000 }, { "epoch": 1.7789037037037037, "grad_norm": 0.4040489065668533, "learning_rate": 4.254009490883065e-06, "loss": 0.0263, "step": 15010 }, { "epoch": 1.7800888888888888, "grad_norm": 0.4964353797283434, "learning_rate": 4.24719247379348e-06, "loss": 0.0335, "step": 15020 }, { "epoch": 1.7812740740740742, "grad_norm": 0.39068385349187273, "learning_rate": 4.240376888232183e-06, "loss": 0.0281, "step": 15030 }, { "epoch": 1.7824592592592592, "grad_norm": 0.4711320437734779, "learning_rate": 4.233562747159593e-06, "loss": 0.0281, "step": 15040 }, { "epoch": 1.7836444444444446, "grad_norm": 0.7219182584901922, "learning_rate": 4.226750063533388e-06, "loss": 0.0314, "step": 15050 }, { "epoch": 1.7848296296296295, "grad_norm": 0.3974088633545491, "learning_rate": 4.21993885030847e-06, "loss": 0.0312, "step": 15060 }, { "epoch": 1.786014814814815, "grad_norm": 0.44801645854078387, "learning_rate": 4.213129120436949e-06, "loss": 0.0296, "step": 15070 }, { "epoch": 1.7872, "grad_norm": 0.510863185058965, "learning_rate": 4.206320886868112e-06, "loss": 0.0313, "step": 15080 }, { "epoch": 1.7883851851851853, "grad_norm": 0.5678227657074751, "learning_rate": 4.1995141625484e-06, "loss": 0.03, "step": 15090 }, { "epoch": 1.7895703703703703, "grad_norm": 0.5242516926779672, "learning_rate": 4.192708960421385e-06, "loss": 0.0275, "step": 15100 }, { "epoch": 1.7907555555555557, "grad_norm": 0.5199210470998465, "learning_rate": 4.185905293427745e-06, "loss": 0.0312, "step": 15110 }, { "epoch": 1.7919407407407406, "grad_norm": 0.40367957397348764, "learning_rate": 4.1791031745052384e-06, "loss": 0.0273, "step": 15120 }, { "epoch": 1.793125925925926, "grad_norm": 0.5915136124073064, "learning_rate": 4.1723026165886794e-06, "loss": 0.0314, "step": 15130 }, { "epoch": 1.794311111111111, "grad_norm": 0.6569441500677481, "learning_rate": 4.165503632609913e-06, "loss": 0.0289, "step": 15140 }, { "epoch": 1.7954962962962964, "grad_norm": 0.4809613279072172, "learning_rate": 4.158706235497792e-06, "loss": 0.0299, "step": 15150 }, { "epoch": 1.7966814814814813, "grad_norm": 0.5399371293626984, "learning_rate": 4.1519104381781556e-06, "loss": 0.029, "step": 15160 }, { "epoch": 1.7978666666666667, "grad_norm": 0.5692173171210713, "learning_rate": 4.1451162535737936e-06, "loss": 0.0274, "step": 15170 }, { "epoch": 1.799051851851852, "grad_norm": 0.531974701043535, "learning_rate": 4.138323694604434e-06, "loss": 0.029, "step": 15180 }, { "epoch": 1.800237037037037, "grad_norm": 0.5121615947338684, "learning_rate": 4.1315327741867105e-06, "loss": 0.0327, "step": 15190 }, { "epoch": 1.8014222222222223, "grad_norm": 0.4578239908458243, "learning_rate": 4.124743505234144e-06, "loss": 0.0302, "step": 15200 }, { "epoch": 1.8026074074074074, "grad_norm": 0.6727969400600277, "learning_rate": 4.117955900657114e-06, "loss": 0.0283, "step": 15210 }, { "epoch": 1.8037925925925926, "grad_norm": 0.45732990864355993, "learning_rate": 4.1111699733628324e-06, "loss": 0.0281, "step": 15220 }, { "epoch": 1.8049777777777778, "grad_norm": 0.35424286856821796, "learning_rate": 4.104385736255326e-06, "loss": 0.0246, "step": 15230 }, { "epoch": 1.806162962962963, "grad_norm": 0.4214838579827263, "learning_rate": 4.097603202235407e-06, "loss": 0.0275, "step": 15240 }, { "epoch": 1.8073481481481481, "grad_norm": 0.45429656595212553, "learning_rate": 4.090822384200643e-06, "loss": 0.0289, "step": 15250 }, { "epoch": 1.8085333333333333, "grad_norm": 0.4547850141275008, "learning_rate": 4.084043295045348e-06, "loss": 0.0309, "step": 15260 }, { "epoch": 1.8097185185185185, "grad_norm": 0.8654630582377518, "learning_rate": 4.0772659476605385e-06, "loss": 0.0288, "step": 15270 }, { "epoch": 1.8109037037037037, "grad_norm": 0.5430544170251327, "learning_rate": 4.0704903549339264e-06, "loss": 0.0277, "step": 15280 }, { "epoch": 1.8120888888888889, "grad_norm": 0.507855494833691, "learning_rate": 4.063716529749881e-06, "loss": 0.029, "step": 15290 }, { "epoch": 1.813274074074074, "grad_norm": 0.4564793922324654, "learning_rate": 4.056944484989419e-06, "loss": 0.0273, "step": 15300 }, { "epoch": 1.8144592592592592, "grad_norm": 0.3624531191320748, "learning_rate": 4.050174233530164e-06, "loss": 0.029, "step": 15310 }, { "epoch": 1.8156444444444444, "grad_norm": 0.5539100728280977, "learning_rate": 4.043405788246331e-06, "loss": 0.0285, "step": 15320 }, { "epoch": 1.8168296296296296, "grad_norm": 0.4810497292397226, "learning_rate": 4.036639162008701e-06, "loss": 0.0294, "step": 15330 }, { "epoch": 1.818014814814815, "grad_norm": 0.4975033519659776, "learning_rate": 4.0298743676845975e-06, "loss": 0.0299, "step": 15340 }, { "epoch": 1.8192, "grad_norm": 0.5337195365254993, "learning_rate": 4.0231114181378565e-06, "loss": 0.0293, "step": 15350 }, { "epoch": 1.8203851851851853, "grad_norm": 0.4548057233761515, "learning_rate": 4.016350326228811e-06, "loss": 0.0318, "step": 15360 }, { "epoch": 1.8215703703703703, "grad_norm": 0.4607108088569288, "learning_rate": 4.009591104814256e-06, "loss": 0.0285, "step": 15370 }, { "epoch": 1.8227555555555557, "grad_norm": 0.6479755682874064, "learning_rate": 4.002833766747436e-06, "loss": 0.0292, "step": 15380 }, { "epoch": 1.8239407407407406, "grad_norm": 0.4326748796763295, "learning_rate": 3.996078324878009e-06, "loss": 0.0321, "step": 15390 }, { "epoch": 1.825125925925926, "grad_norm": 0.5465986554709112, "learning_rate": 3.989324792052029e-06, "loss": 0.0308, "step": 15400 }, { "epoch": 1.826311111111111, "grad_norm": 0.4885014862256522, "learning_rate": 3.982573181111921e-06, "loss": 0.0265, "step": 15410 }, { "epoch": 1.8274962962962964, "grad_norm": 0.5637981051697301, "learning_rate": 3.975823504896453e-06, "loss": 0.028, "step": 15420 }, { "epoch": 1.8286814814814814, "grad_norm": 0.4954193235187059, "learning_rate": 3.969075776240715e-06, "loss": 0.0293, "step": 15430 }, { "epoch": 1.8298666666666668, "grad_norm": 0.3858236316602962, "learning_rate": 3.962330007976095e-06, "loss": 0.0282, "step": 15440 }, { "epoch": 1.8310518518518517, "grad_norm": 0.6469037388758082, "learning_rate": 3.955586212930247e-06, "loss": 0.0316, "step": 15450 }, { "epoch": 1.832237037037037, "grad_norm": 0.5161181257082362, "learning_rate": 3.948844403927084e-06, "loss": 0.0287, "step": 15460 }, { "epoch": 1.833422222222222, "grad_norm": 0.4957757715541177, "learning_rate": 3.942104593786734e-06, "loss": 0.0241, "step": 15470 }, { "epoch": 1.8346074074074075, "grad_norm": 0.4798218550765448, "learning_rate": 3.935366795325524e-06, "loss": 0.0277, "step": 15480 }, { "epoch": 1.8357925925925926, "grad_norm": 0.3362899784693629, "learning_rate": 3.928631021355959e-06, "loss": 0.0269, "step": 15490 }, { "epoch": 1.8369777777777778, "grad_norm": 0.5565080911334147, "learning_rate": 3.921897284686692e-06, "loss": 0.0263, "step": 15500 }, { "epoch": 1.838162962962963, "grad_norm": 0.5322127301741044, "learning_rate": 3.915165598122503e-06, "loss": 0.0286, "step": 15510 }, { "epoch": 1.8393481481481482, "grad_norm": 0.5881755059371034, "learning_rate": 3.908435974464274e-06, "loss": 0.0271, "step": 15520 }, { "epoch": 1.8405333333333334, "grad_norm": 0.5841021082618638, "learning_rate": 3.901708426508961e-06, "loss": 0.0322, "step": 15530 }, { "epoch": 1.8417185185185185, "grad_norm": 0.49861153593008484, "learning_rate": 3.894982967049578e-06, "loss": 0.0299, "step": 15540 }, { "epoch": 1.8429037037037037, "grad_norm": 0.519470148674024, "learning_rate": 3.888259608875165e-06, "loss": 0.0269, "step": 15550 }, { "epoch": 1.8440888888888889, "grad_norm": 0.5111672382070884, "learning_rate": 3.881538364770764e-06, "loss": 0.0284, "step": 15560 }, { "epoch": 1.845274074074074, "grad_norm": 0.4161033822798717, "learning_rate": 3.874819247517401e-06, "loss": 0.0266, "step": 15570 }, { "epoch": 1.8464592592592592, "grad_norm": 0.4722318850809975, "learning_rate": 3.8681022698920535e-06, "loss": 0.0297, "step": 15580 }, { "epoch": 1.8476444444444444, "grad_norm": 0.6447393156446577, "learning_rate": 3.8613874446676345e-06, "loss": 0.0282, "step": 15590 }, { "epoch": 1.8488296296296296, "grad_norm": 0.6406703830306794, "learning_rate": 3.854674784612958e-06, "loss": 0.0266, "step": 15600 }, { "epoch": 1.8500148148148148, "grad_norm": 0.5038505156329683, "learning_rate": 3.84796430249273e-06, "loss": 0.0306, "step": 15610 }, { "epoch": 1.8512, "grad_norm": 0.5057400059062028, "learning_rate": 3.8412560110675066e-06, "loss": 0.0325, "step": 15620 }, { "epoch": 1.8523851851851851, "grad_norm": 0.4155202517038341, "learning_rate": 3.834549923093683e-06, "loss": 0.029, "step": 15630 }, { "epoch": 1.8535703703703703, "grad_norm": 0.5002648922617754, "learning_rate": 3.82784605132346e-06, "loss": 0.0305, "step": 15640 }, { "epoch": 1.8547555555555557, "grad_norm": 0.7098206156922805, "learning_rate": 3.821144408504829e-06, "loss": 0.0301, "step": 15650 }, { "epoch": 1.8559407407407407, "grad_norm": 0.46588244230059567, "learning_rate": 3.8144450073815385e-06, "loss": 0.0317, "step": 15660 }, { "epoch": 1.857125925925926, "grad_norm": 0.5992782297199127, "learning_rate": 3.8077478606930783e-06, "loss": 0.0331, "step": 15670 }, { "epoch": 1.858311111111111, "grad_norm": 0.44376204875494346, "learning_rate": 3.8010529811746454e-06, "loss": 0.027, "step": 15680 }, { "epoch": 1.8594962962962964, "grad_norm": 0.526217394541863, "learning_rate": 3.794360381557133e-06, "loss": 0.029, "step": 15690 }, { "epoch": 1.8606814814814814, "grad_norm": 0.41960226025451475, "learning_rate": 3.787670074567095e-06, "loss": 0.0277, "step": 15700 }, { "epoch": 1.8618666666666668, "grad_norm": 0.5910047693090584, "learning_rate": 3.780982072926723e-06, "loss": 0.0275, "step": 15710 }, { "epoch": 1.8630518518518517, "grad_norm": 0.4809660511390173, "learning_rate": 3.7742963893538297e-06, "loss": 0.0299, "step": 15720 }, { "epoch": 1.8642370370370371, "grad_norm": 0.4609492306276235, "learning_rate": 3.7676130365618187e-06, "loss": 0.0297, "step": 15730 }, { "epoch": 1.865422222222222, "grad_norm": 0.5745527059713748, "learning_rate": 3.760932027259657e-06, "loss": 0.0335, "step": 15740 }, { "epoch": 1.8666074074074075, "grad_norm": 0.4485573868820827, "learning_rate": 3.7542533741518623e-06, "loss": 0.0251, "step": 15750 }, { "epoch": 1.8677925925925924, "grad_norm": 0.40369696111070064, "learning_rate": 3.747577089938464e-06, "loss": 0.0293, "step": 15760 }, { "epoch": 1.8689777777777778, "grad_norm": 0.4243320066064984, "learning_rate": 3.740903187314994e-06, "loss": 0.0269, "step": 15770 }, { "epoch": 1.8701629629629628, "grad_norm": 0.43761623252645204, "learning_rate": 3.7342316789724532e-06, "loss": 0.0258, "step": 15780 }, { "epoch": 1.8713481481481482, "grad_norm": 0.4706054857045525, "learning_rate": 3.7275625775972868e-06, "loss": 0.0269, "step": 15790 }, { "epoch": 1.8725333333333334, "grad_norm": 0.6190183432777634, "learning_rate": 3.720895895871366e-06, "loss": 0.0308, "step": 15800 }, { "epoch": 1.8737185185185186, "grad_norm": 0.5334940190844957, "learning_rate": 3.7142316464719585e-06, "loss": 0.0295, "step": 15810 }, { "epoch": 1.8749037037037037, "grad_norm": 0.5439994890483385, "learning_rate": 3.7075698420717076e-06, "loss": 0.0275, "step": 15820 }, { "epoch": 1.876088888888889, "grad_norm": 0.41694810142074284, "learning_rate": 3.7009104953386087e-06, "loss": 0.0283, "step": 15830 }, { "epoch": 1.877274074074074, "grad_norm": 0.6126828394160406, "learning_rate": 3.6942536189359846e-06, "loss": 0.0292, "step": 15840 }, { "epoch": 1.8784592592592593, "grad_norm": 0.40628786409431195, "learning_rate": 3.6875992255224547e-06, "loss": 0.031, "step": 15850 }, { "epoch": 1.8796444444444445, "grad_norm": 0.5598379107304569, "learning_rate": 3.6809473277519243e-06, "loss": 0.0281, "step": 15860 }, { "epoch": 1.8808296296296296, "grad_norm": 0.36756466696688483, "learning_rate": 3.6742979382735455e-06, "loss": 0.0255, "step": 15870 }, { "epoch": 1.8820148148148148, "grad_norm": 0.5931365049812046, "learning_rate": 3.6676510697317085e-06, "loss": 0.03, "step": 15880 }, { "epoch": 1.8832, "grad_norm": 0.4538241610352921, "learning_rate": 3.6610067347660026e-06, "loss": 0.0283, "step": 15890 }, { "epoch": 1.8843851851851852, "grad_norm": 0.46624277914023465, "learning_rate": 3.654364946011205e-06, "loss": 0.0306, "step": 15900 }, { "epoch": 1.8855703703703703, "grad_norm": 0.4820046871128, "learning_rate": 3.6477257160972435e-06, "loss": 0.0291, "step": 15910 }, { "epoch": 1.8867555555555555, "grad_norm": 0.6964191362619795, "learning_rate": 3.641089057649192e-06, "loss": 0.0303, "step": 15920 }, { "epoch": 1.8879407407407407, "grad_norm": 0.47869732567934326, "learning_rate": 3.6344549832872233e-06, "loss": 0.0267, "step": 15930 }, { "epoch": 1.8891259259259259, "grad_norm": 0.5529243475762332, "learning_rate": 3.627823505626603e-06, "loss": 0.029, "step": 15940 }, { "epoch": 1.890311111111111, "grad_norm": 0.46900237813063694, "learning_rate": 3.6211946372776537e-06, "loss": 0.0318, "step": 15950 }, { "epoch": 1.8914962962962965, "grad_norm": 0.5898620179837001, "learning_rate": 3.614568390845741e-06, "loss": 0.0264, "step": 15960 }, { "epoch": 1.8926814814814814, "grad_norm": 0.39819637014363, "learning_rate": 3.607944778931242e-06, "loss": 0.03, "step": 15970 }, { "epoch": 1.8938666666666668, "grad_norm": 0.46274651599760835, "learning_rate": 3.601323814129525e-06, "loss": 0.0258, "step": 15980 }, { "epoch": 1.8950518518518518, "grad_norm": 0.5451947596958668, "learning_rate": 3.5947055090309223e-06, "loss": 0.027, "step": 15990 }, { "epoch": 1.8962370370370372, "grad_norm": 0.5647378171574232, "learning_rate": 3.5880898762207128e-06, "loss": 0.0291, "step": 16000 }, { "epoch": 1.8974222222222221, "grad_norm": 0.6051927793816031, "learning_rate": 3.5814769282790907e-06, "loss": 0.0303, "step": 16010 }, { "epoch": 1.8986074074074075, "grad_norm": 0.5687503417822197, "learning_rate": 3.5748666777811473e-06, "loss": 0.027, "step": 16020 }, { "epoch": 1.8997925925925925, "grad_norm": 0.5407824615906663, "learning_rate": 3.56825913729684e-06, "loss": 0.0268, "step": 16030 }, { "epoch": 1.9009777777777779, "grad_norm": 0.46859862600170465, "learning_rate": 3.5616543193909783e-06, "loss": 0.0274, "step": 16040 }, { "epoch": 1.9021629629629628, "grad_norm": 0.45405283019784876, "learning_rate": 3.5550522366231876e-06, "loss": 0.0282, "step": 16050 }, { "epoch": 1.9033481481481482, "grad_norm": 0.5174868254718724, "learning_rate": 3.548452901547901e-06, "loss": 0.0309, "step": 16060 }, { "epoch": 1.9045333333333332, "grad_norm": 0.45250526811771735, "learning_rate": 3.541856326714318e-06, "loss": 0.026, "step": 16070 }, { "epoch": 1.9057185185185186, "grad_norm": 0.6017336351020726, "learning_rate": 3.5352625246663954e-06, "loss": 0.0285, "step": 16080 }, { "epoch": 1.9069037037037035, "grad_norm": 0.3395513036241249, "learning_rate": 3.528671507942816e-06, "loss": 0.0303, "step": 16090 }, { "epoch": 1.908088888888889, "grad_norm": 0.5038084307863366, "learning_rate": 3.522083289076964e-06, "loss": 0.0269, "step": 16100 }, { "epoch": 1.9092740740740741, "grad_norm": 0.5163753542883514, "learning_rate": 3.515497880596905e-06, "loss": 0.0278, "step": 16110 }, { "epoch": 1.9104592592592593, "grad_norm": 0.42022863658206705, "learning_rate": 3.508915295025358e-06, "loss": 0.0279, "step": 16120 }, { "epoch": 1.9116444444444445, "grad_norm": 0.40692186116901474, "learning_rate": 3.5023355448796777e-06, "loss": 0.0305, "step": 16130 }, { "epoch": 1.9128296296296297, "grad_norm": 0.5668476685494173, "learning_rate": 3.495758642671824e-06, "loss": 0.0306, "step": 16140 }, { "epoch": 1.9140148148148148, "grad_norm": 0.4424793546433323, "learning_rate": 3.489184600908344e-06, "loss": 0.0258, "step": 16150 }, { "epoch": 1.9152, "grad_norm": 0.554147578183282, "learning_rate": 3.4826134320903428e-06, "loss": 0.0288, "step": 16160 }, { "epoch": 1.9163851851851852, "grad_norm": 0.44569460616605566, "learning_rate": 3.4760451487134645e-06, "loss": 0.0285, "step": 16170 }, { "epoch": 1.9175703703703704, "grad_norm": 0.37719323990982606, "learning_rate": 3.4694797632678627e-06, "loss": 0.0261, "step": 16180 }, { "epoch": 1.9187555555555555, "grad_norm": 0.5111246144797355, "learning_rate": 3.462917288238185e-06, "loss": 0.0299, "step": 16190 }, { "epoch": 1.9199407407407407, "grad_norm": 0.43678865887098206, "learning_rate": 3.4563577361035405e-06, "loss": 0.0261, "step": 16200 }, { "epoch": 1.921125925925926, "grad_norm": 0.40706931567139115, "learning_rate": 3.4498011193374837e-06, "loss": 0.0276, "step": 16210 }, { "epoch": 1.922311111111111, "grad_norm": 0.5716168770877608, "learning_rate": 3.4432474504079818e-06, "loss": 0.0286, "step": 16220 }, { "epoch": 1.9234962962962963, "grad_norm": 0.44597959357535155, "learning_rate": 3.436696741777407e-06, "loss": 0.029, "step": 16230 }, { "epoch": 1.9246814814814814, "grad_norm": 0.418352530258196, "learning_rate": 3.430149005902489e-06, "loss": 0.0265, "step": 16240 }, { "epoch": 1.9258666666666666, "grad_norm": 0.44231181686031346, "learning_rate": 3.423604255234315e-06, "loss": 0.0298, "step": 16250 }, { "epoch": 1.9270518518518518, "grad_norm": 0.3500491976233091, "learning_rate": 3.417062502218289e-06, "loss": 0.0266, "step": 16260 }, { "epoch": 1.9282370370370372, "grad_norm": 0.5466099206429552, "learning_rate": 3.41052375929412e-06, "loss": 0.0292, "step": 16270 }, { "epoch": 1.9294222222222221, "grad_norm": 0.4477341461260506, "learning_rate": 3.4039880388957864e-06, "loss": 0.0291, "step": 16280 }, { "epoch": 1.9306074074074075, "grad_norm": 0.48970004609394685, "learning_rate": 3.3974553534515264e-06, "loss": 0.0289, "step": 16290 }, { "epoch": 1.9317925925925925, "grad_norm": 0.7261947237022754, "learning_rate": 3.3909257153838006e-06, "loss": 0.0301, "step": 16300 }, { "epoch": 1.932977777777778, "grad_norm": 0.5628159186222998, "learning_rate": 3.3843991371092794e-06, "loss": 0.0287, "step": 16310 }, { "epoch": 1.9341629629629629, "grad_norm": 0.42375554178250247, "learning_rate": 3.377875631038813e-06, "loss": 0.0288, "step": 16320 }, { "epoch": 1.9353481481481483, "grad_norm": 0.562442321595791, "learning_rate": 3.3713552095774106e-06, "loss": 0.0293, "step": 16330 }, { "epoch": 1.9365333333333332, "grad_norm": 0.4826177073652702, "learning_rate": 3.3648378851242115e-06, "loss": 0.0277, "step": 16340 }, { "epoch": 1.9377185185185186, "grad_norm": 0.4105968087946979, "learning_rate": 3.3583236700724723e-06, "loss": 0.0288, "step": 16350 }, { "epoch": 1.9389037037037036, "grad_norm": 0.48715253857179075, "learning_rate": 3.351812576809531e-06, "loss": 0.0273, "step": 16360 }, { "epoch": 1.940088888888889, "grad_norm": 0.4959529198340603, "learning_rate": 3.3453046177167907e-06, "loss": 0.0284, "step": 16370 }, { "epoch": 1.941274074074074, "grad_norm": 0.49236835007377455, "learning_rate": 3.3387998051697002e-06, "loss": 0.0276, "step": 16380 }, { "epoch": 1.9424592592592593, "grad_norm": 0.4955130152472539, "learning_rate": 3.332298151537716e-06, "loss": 0.0269, "step": 16390 }, { "epoch": 1.9436444444444443, "grad_norm": 0.7023422206575342, "learning_rate": 3.325799669184295e-06, "loss": 0.028, "step": 16400 }, { "epoch": 1.9448296296296297, "grad_norm": 0.3956179947762618, "learning_rate": 3.319304370466857e-06, "loss": 0.029, "step": 16410 }, { "epoch": 1.9460148148148149, "grad_norm": 0.4501613805232214, "learning_rate": 3.3128122677367747e-06, "loss": 0.0217, "step": 16420 }, { "epoch": 1.9472, "grad_norm": 0.556450555200919, "learning_rate": 3.306323373339338e-06, "loss": 0.0291, "step": 16430 }, { "epoch": 1.9483851851851852, "grad_norm": 0.5214643811890484, "learning_rate": 3.2998376996137383e-06, "loss": 0.0275, "step": 16440 }, { "epoch": 1.9495703703703704, "grad_norm": 0.4489831157563276, "learning_rate": 3.293355258893042e-06, "loss": 0.0286, "step": 16450 }, { "epoch": 1.9507555555555556, "grad_norm": 0.5067054659676723, "learning_rate": 3.2868760635041696e-06, "loss": 0.0277, "step": 16460 }, { "epoch": 1.9519407407407408, "grad_norm": 0.4097933595328039, "learning_rate": 3.2804001257678674e-06, "loss": 0.0278, "step": 16470 }, { "epoch": 1.953125925925926, "grad_norm": 0.5213929502472637, "learning_rate": 3.273927457998689e-06, "loss": 0.0288, "step": 16480 }, { "epoch": 1.954311111111111, "grad_norm": 0.4969659426454915, "learning_rate": 3.267458072504967e-06, "loss": 0.0308, "step": 16490 }, { "epoch": 1.9554962962962963, "grad_norm": 0.523825208177789, "learning_rate": 3.2609919815887974e-06, "loss": 0.0288, "step": 16500 }, { "epoch": 1.9566814814814815, "grad_norm": 0.43342427931537186, "learning_rate": 3.2545291975460058e-06, "loss": 0.0277, "step": 16510 }, { "epoch": 1.9578666666666666, "grad_norm": 0.4444642806632608, "learning_rate": 3.248069732666135e-06, "loss": 0.0268, "step": 16520 }, { "epoch": 1.9590518518518518, "grad_norm": 0.42095491273050895, "learning_rate": 3.2416135992324084e-06, "loss": 0.0296, "step": 16530 }, { "epoch": 1.960237037037037, "grad_norm": 0.49850627441237044, "learning_rate": 3.2351608095217244e-06, "loss": 0.0306, "step": 16540 }, { "epoch": 1.9614222222222222, "grad_norm": 0.5249168466760042, "learning_rate": 3.228711375804616e-06, "loss": 0.0252, "step": 16550 }, { "epoch": 1.9626074074074074, "grad_norm": 0.4617493468704232, "learning_rate": 3.2222653103452377e-06, "loss": 0.0287, "step": 16560 }, { "epoch": 1.9637925925925925, "grad_norm": 0.48353472232940464, "learning_rate": 3.215822625401335e-06, "loss": 0.0261, "step": 16570 }, { "epoch": 1.964977777777778, "grad_norm": 0.5018899958874573, "learning_rate": 3.2093833332242297e-06, "loss": 0.0253, "step": 16580 }, { "epoch": 1.966162962962963, "grad_norm": 0.44293497278832766, "learning_rate": 3.2029474460587886e-06, "loss": 0.0302, "step": 16590 }, { "epoch": 1.9673481481481483, "grad_norm": 0.5289736580903666, "learning_rate": 3.1965149761434056e-06, "loss": 0.0266, "step": 16600 }, { "epoch": 1.9685333333333332, "grad_norm": 0.4432894958891086, "learning_rate": 3.1900859357099734e-06, "loss": 0.0292, "step": 16610 }, { "epoch": 1.9697185185185186, "grad_norm": 0.6122164685823401, "learning_rate": 3.1836603369838697e-06, "loss": 0.0316, "step": 16620 }, { "epoch": 1.9709037037037036, "grad_norm": 0.44770167815912215, "learning_rate": 3.1772381921839212e-06, "loss": 0.0254, "step": 16630 }, { "epoch": 1.972088888888889, "grad_norm": 0.6081499764545403, "learning_rate": 3.1708195135223895e-06, "loss": 0.0303, "step": 16640 }, { "epoch": 1.973274074074074, "grad_norm": 0.5069386007266375, "learning_rate": 3.164404313204944e-06, "loss": 0.0233, "step": 16650 }, { "epoch": 1.9744592592592594, "grad_norm": 0.561366928292409, "learning_rate": 3.15799260343064e-06, "loss": 0.0304, "step": 16660 }, { "epoch": 1.9756444444444443, "grad_norm": 0.558200172607736, "learning_rate": 3.1515843963918952e-06, "loss": 0.0273, "step": 16670 }, { "epoch": 1.9768296296296297, "grad_norm": 0.6094887586344587, "learning_rate": 3.1451797042744654e-06, "loss": 0.0277, "step": 16680 }, { "epoch": 1.9780148148148147, "grad_norm": 0.4939130530614874, "learning_rate": 3.138778539257427e-06, "loss": 0.028, "step": 16690 }, { "epoch": 1.9792, "grad_norm": 0.4498344520360258, "learning_rate": 3.132380913513143e-06, "loss": 0.0273, "step": 16700 }, { "epoch": 1.980385185185185, "grad_norm": 0.5145262397259287, "learning_rate": 3.1259868392072525e-06, "loss": 0.0276, "step": 16710 }, { "epoch": 1.9815703703703704, "grad_norm": 0.5488739293687596, "learning_rate": 3.1195963284986343e-06, "loss": 0.0276, "step": 16720 }, { "epoch": 1.9827555555555556, "grad_norm": 0.4824153911950187, "learning_rate": 3.113209393539396e-06, "loss": 0.0267, "step": 16730 }, { "epoch": 1.9839407407407408, "grad_norm": 0.3240425216116917, "learning_rate": 3.1068260464748453e-06, "loss": 0.0269, "step": 16740 }, { "epoch": 1.985125925925926, "grad_norm": 0.4918205581945271, "learning_rate": 3.1004462994434636e-06, "loss": 0.0286, "step": 16750 }, { "epoch": 1.9863111111111111, "grad_norm": 0.5092021418663703, "learning_rate": 3.0940701645768882e-06, "loss": 0.0282, "step": 16760 }, { "epoch": 1.9874962962962963, "grad_norm": 0.43239668671736603, "learning_rate": 3.0876976539998927e-06, "loss": 0.0281, "step": 16770 }, { "epoch": 1.9886814814814815, "grad_norm": 0.48101851927752226, "learning_rate": 3.0813287798303493e-06, "loss": 0.0294, "step": 16780 }, { "epoch": 1.9898666666666667, "grad_norm": 0.5123373247203269, "learning_rate": 3.0749635541792245e-06, "loss": 0.0295, "step": 16790 }, { "epoch": 1.9910518518518519, "grad_norm": 0.5201502334866416, "learning_rate": 3.0686019891505386e-06, "loss": 0.0268, "step": 16800 }, { "epoch": 1.992237037037037, "grad_norm": 0.43273612199401407, "learning_rate": 3.062244096841358e-06, "loss": 0.0265, "step": 16810 }, { "epoch": 1.9934222222222222, "grad_norm": 0.42623948593609784, "learning_rate": 3.05588988934176e-06, "loss": 0.0287, "step": 16820 }, { "epoch": 1.9946074074074074, "grad_norm": 0.6180521272782323, "learning_rate": 3.049539378734818e-06, "loss": 0.0253, "step": 16830 }, { "epoch": 1.9957925925925926, "grad_norm": 0.4820479106144094, "learning_rate": 3.043192577096571e-06, "loss": 0.0293, "step": 16840 }, { "epoch": 1.9969777777777777, "grad_norm": 0.41628943844623667, "learning_rate": 3.0368494964960147e-06, "loss": 0.0257, "step": 16850 }, { "epoch": 1.998162962962963, "grad_norm": 0.47298114938576746, "learning_rate": 3.0305101489950583e-06, "loss": 0.0254, "step": 16860 }, { "epoch": 1.999348148148148, "grad_norm": 0.559686936226428, "learning_rate": 3.0241745466485185e-06, "loss": 0.0281, "step": 16870 }, { "epoch": 2.000474074074074, "grad_norm": 0.3482473401349315, "learning_rate": 3.0178427015040858e-06, "loss": 0.0229, "step": 16880 }, { "epoch": 2.0016592592592595, "grad_norm": 0.46580309230335915, "learning_rate": 3.011514625602312e-06, "loss": 0.021, "step": 16890 }, { "epoch": 2.0028444444444444, "grad_norm": 0.40274055242316353, "learning_rate": 3.005190330976574e-06, "loss": 0.0215, "step": 16900 }, { "epoch": 2.00402962962963, "grad_norm": 0.6303959964339435, "learning_rate": 2.998869829653064e-06, "loss": 0.0229, "step": 16910 }, { "epoch": 2.005214814814815, "grad_norm": 0.5138781974516277, "learning_rate": 2.9925531336507607e-06, "loss": 0.0186, "step": 16920 }, { "epoch": 2.0064, "grad_norm": 0.44224945268347166, "learning_rate": 2.9862402549814033e-06, "loss": 0.0191, "step": 16930 }, { "epoch": 2.007585185185185, "grad_norm": 0.47402190837384184, "learning_rate": 2.9799312056494744e-06, "loss": 0.0207, "step": 16940 }, { "epoch": 2.0087703703703705, "grad_norm": 0.4512227152680803, "learning_rate": 2.9736259976521743e-06, "loss": 0.0183, "step": 16950 }, { "epoch": 2.0099555555555555, "grad_norm": 0.39109377769125514, "learning_rate": 2.9673246429793977e-06, "loss": 0.0203, "step": 16960 }, { "epoch": 2.011140740740741, "grad_norm": 0.46949579253078194, "learning_rate": 2.9610271536137137e-06, "loss": 0.0206, "step": 16970 }, { "epoch": 2.012325925925926, "grad_norm": 0.7052053510510133, "learning_rate": 2.954733541530339e-06, "loss": 0.0232, "step": 16980 }, { "epoch": 2.0135111111111113, "grad_norm": 0.5301340846313625, "learning_rate": 2.948443818697118e-06, "loss": 0.0207, "step": 16990 }, { "epoch": 2.014696296296296, "grad_norm": 0.43845805140236066, "learning_rate": 2.9421579970745033e-06, "loss": 0.0193, "step": 17000 }, { "epoch": 2.0158814814814816, "grad_norm": 0.637056439108828, "learning_rate": 2.9358760886155225e-06, "loss": 0.0233, "step": 17010 }, { "epoch": 2.0170666666666666, "grad_norm": 0.5642052007571374, "learning_rate": 2.9295981052657664e-06, "loss": 0.02, "step": 17020 }, { "epoch": 2.018251851851852, "grad_norm": 0.5030801354899268, "learning_rate": 2.9233240589633592e-06, "loss": 0.0201, "step": 17030 }, { "epoch": 2.019437037037037, "grad_norm": 0.6592069503241937, "learning_rate": 2.917053961638942e-06, "loss": 0.0191, "step": 17040 }, { "epoch": 2.0206222222222223, "grad_norm": 0.6891333301586693, "learning_rate": 2.9107878252156405e-06, "loss": 0.0209, "step": 17050 }, { "epoch": 2.0218074074074073, "grad_norm": 0.5843514128740532, "learning_rate": 2.904525661609057e-06, "loss": 0.0212, "step": 17060 }, { "epoch": 2.0229925925925927, "grad_norm": 0.57460003385646, "learning_rate": 2.8982674827272306e-06, "loss": 0.0194, "step": 17070 }, { "epoch": 2.0241777777777776, "grad_norm": 0.5048643098953294, "learning_rate": 2.8920133004706297e-06, "loss": 0.0211, "step": 17080 }, { "epoch": 2.025362962962963, "grad_norm": 0.7024965213733743, "learning_rate": 2.8857631267321196e-06, "loss": 0.0219, "step": 17090 }, { "epoch": 2.026548148148148, "grad_norm": 0.5432895602355671, "learning_rate": 2.8795169733969397e-06, "loss": 0.0199, "step": 17100 }, { "epoch": 2.0277333333333334, "grad_norm": 0.49977345022966446, "learning_rate": 2.8732748523426934e-06, "loss": 0.0229, "step": 17110 }, { "epoch": 2.0289185185185183, "grad_norm": 0.5138196441980594, "learning_rate": 2.8670367754393093e-06, "loss": 0.0212, "step": 17120 }, { "epoch": 2.0301037037037037, "grad_norm": 0.5205940883171687, "learning_rate": 2.860802754549026e-06, "loss": 0.0187, "step": 17130 }, { "epoch": 2.0312888888888887, "grad_norm": 0.6485691519856771, "learning_rate": 2.8545728015263692e-06, "loss": 0.0227, "step": 17140 }, { "epoch": 2.032474074074074, "grad_norm": 0.46190546081380174, "learning_rate": 2.848346928218133e-06, "loss": 0.0197, "step": 17150 }, { "epoch": 2.033659259259259, "grad_norm": 0.5374744840579662, "learning_rate": 2.8421251464633527e-06, "loss": 0.0223, "step": 17160 }, { "epoch": 2.0348444444444445, "grad_norm": 0.5035594297525493, "learning_rate": 2.8359074680932797e-06, "loss": 0.0221, "step": 17170 }, { "epoch": 2.0360296296296294, "grad_norm": 0.6218404427421674, "learning_rate": 2.8296939049313632e-06, "loss": 0.0212, "step": 17180 }, { "epoch": 2.037214814814815, "grad_norm": 0.5627208622442489, "learning_rate": 2.8234844687932304e-06, "loss": 0.0229, "step": 17190 }, { "epoch": 2.0384, "grad_norm": 0.5894673881803788, "learning_rate": 2.8172791714866586e-06, "loss": 0.0203, "step": 17200 }, { "epoch": 2.039585185185185, "grad_norm": 0.652610746033946, "learning_rate": 2.8110780248115533e-06, "loss": 0.0202, "step": 17210 }, { "epoch": 2.0407703703703706, "grad_norm": 0.6797262603338378, "learning_rate": 2.8048810405599268e-06, "loss": 0.0224, "step": 17220 }, { "epoch": 2.0419555555555555, "grad_norm": 0.44534648584222053, "learning_rate": 2.7986882305158798e-06, "loss": 0.0195, "step": 17230 }, { "epoch": 2.043140740740741, "grad_norm": 0.42191356985030354, "learning_rate": 2.7924996064555754e-06, "loss": 0.0196, "step": 17240 }, { "epoch": 2.044325925925926, "grad_norm": 0.5084660524003956, "learning_rate": 2.7863151801472125e-06, "loss": 0.0209, "step": 17250 }, { "epoch": 2.0455111111111113, "grad_norm": 0.5181340879304304, "learning_rate": 2.780134963351009e-06, "loss": 0.0188, "step": 17260 }, { "epoch": 2.0466962962962962, "grad_norm": 0.4645642370564578, "learning_rate": 2.773958967819181e-06, "loss": 0.022, "step": 17270 }, { "epoch": 2.0478814814814816, "grad_norm": 0.5798027491530987, "learning_rate": 2.7677872052959153e-06, "loss": 0.0207, "step": 17280 }, { "epoch": 2.0490666666666666, "grad_norm": 0.3604404015266568, "learning_rate": 2.7616196875173486e-06, "loss": 0.0202, "step": 17290 }, { "epoch": 2.050251851851852, "grad_norm": 0.5321509327667007, "learning_rate": 2.7554564262115433e-06, "loss": 0.0207, "step": 17300 }, { "epoch": 2.051437037037037, "grad_norm": 0.8777408808180764, "learning_rate": 2.7492974330984756e-06, "loss": 0.0207, "step": 17310 }, { "epoch": 2.0526222222222223, "grad_norm": 0.5048830394161766, "learning_rate": 2.7431427198900018e-06, "loss": 0.0212, "step": 17320 }, { "epoch": 2.0538074074074073, "grad_norm": 0.506975039542351, "learning_rate": 2.7369922982898356e-06, "loss": 0.0208, "step": 17330 }, { "epoch": 2.0549925925925927, "grad_norm": 0.6546830075558804, "learning_rate": 2.730846179993535e-06, "loss": 0.0197, "step": 17340 }, { "epoch": 2.0561777777777777, "grad_norm": 0.6130693163162522, "learning_rate": 2.7247043766884685e-06, "loss": 0.0227, "step": 17350 }, { "epoch": 2.057362962962963, "grad_norm": 0.6165697629279787, "learning_rate": 2.718566900053809e-06, "loss": 0.0234, "step": 17360 }, { "epoch": 2.058548148148148, "grad_norm": 0.5902572278545075, "learning_rate": 2.7124337617604933e-06, "loss": 0.0193, "step": 17370 }, { "epoch": 2.0597333333333334, "grad_norm": 0.46237748365897946, "learning_rate": 2.7063049734712116e-06, "loss": 0.0222, "step": 17380 }, { "epoch": 2.0609185185185184, "grad_norm": 0.39759154277465686, "learning_rate": 2.700180546840382e-06, "loss": 0.0207, "step": 17390 }, { "epoch": 2.0621037037037038, "grad_norm": 0.6183978214522016, "learning_rate": 2.6940604935141324e-06, "loss": 0.0203, "step": 17400 }, { "epoch": 2.0632888888888887, "grad_norm": 0.517093237139377, "learning_rate": 2.6879448251302677e-06, "loss": 0.0203, "step": 17410 }, { "epoch": 2.064474074074074, "grad_norm": 0.5622185910580356, "learning_rate": 2.6818335533182573e-06, "loss": 0.0206, "step": 17420 }, { "epoch": 2.065659259259259, "grad_norm": 0.5225085519598541, "learning_rate": 2.6757266896992094e-06, "loss": 0.0207, "step": 17430 }, { "epoch": 2.0668444444444445, "grad_norm": 0.5220970224663188, "learning_rate": 2.669624245885854e-06, "loss": 0.0223, "step": 17440 }, { "epoch": 2.0680296296296294, "grad_norm": 0.48375196632405615, "learning_rate": 2.6635262334825095e-06, "loss": 0.0216, "step": 17450 }, { "epoch": 2.069214814814815, "grad_norm": 0.5559496639426671, "learning_rate": 2.6574326640850744e-06, "loss": 0.0238, "step": 17460 }, { "epoch": 2.0704, "grad_norm": 0.5681141804682247, "learning_rate": 2.6513435492809924e-06, "loss": 0.0198, "step": 17470 }, { "epoch": 2.071585185185185, "grad_norm": 0.43538899827280736, "learning_rate": 2.6452589006492426e-06, "loss": 0.02, "step": 17480 }, { "epoch": 2.07277037037037, "grad_norm": 0.580886786888115, "learning_rate": 2.639178729760306e-06, "loss": 0.0224, "step": 17490 }, { "epoch": 2.0739555555555556, "grad_norm": 0.4613400997382292, "learning_rate": 2.6331030481761505e-06, "loss": 0.022, "step": 17500 }, { "epoch": 2.075140740740741, "grad_norm": 0.5427349090105272, "learning_rate": 2.627031867450206e-06, "loss": 0.0235, "step": 17510 }, { "epoch": 2.076325925925926, "grad_norm": 0.55239594231661, "learning_rate": 2.6209651991273476e-06, "loss": 0.0223, "step": 17520 }, { "epoch": 2.0775111111111113, "grad_norm": 0.45400213551288154, "learning_rate": 2.6149030547438648e-06, "loss": 0.0223, "step": 17530 }, { "epoch": 2.0786962962962963, "grad_norm": 0.47364076124826127, "learning_rate": 2.6088454458274503e-06, "loss": 0.021, "step": 17540 }, { "epoch": 2.0798814814814817, "grad_norm": 0.6660565862409713, "learning_rate": 2.602792383897164e-06, "loss": 0.0189, "step": 17550 }, { "epoch": 2.0810666666666666, "grad_norm": 0.510240814833039, "learning_rate": 2.596743880463429e-06, "loss": 0.0202, "step": 17560 }, { "epoch": 2.082251851851852, "grad_norm": 0.5914970747154663, "learning_rate": 2.5906999470279927e-06, "loss": 0.0217, "step": 17570 }, { "epoch": 2.083437037037037, "grad_norm": 0.6248497425366787, "learning_rate": 2.5846605950839133e-06, "loss": 0.0205, "step": 17580 }, { "epoch": 2.0846222222222224, "grad_norm": 0.6278589590491476, "learning_rate": 2.578625836115538e-06, "loss": 0.0196, "step": 17590 }, { "epoch": 2.0858074074074073, "grad_norm": 0.41741915572365507, "learning_rate": 2.572595681598483e-06, "loss": 0.0201, "step": 17600 }, { "epoch": 2.0869925925925927, "grad_norm": 0.5656720612902089, "learning_rate": 2.5665701429996027e-06, "loss": 0.0217, "step": 17610 }, { "epoch": 2.0881777777777777, "grad_norm": 0.6889995847332994, "learning_rate": 2.560549231776981e-06, "loss": 0.0216, "step": 17620 }, { "epoch": 2.089362962962963, "grad_norm": 0.4923895380185985, "learning_rate": 2.5545329593798973e-06, "loss": 0.0207, "step": 17630 }, { "epoch": 2.090548148148148, "grad_norm": 0.6384375039735288, "learning_rate": 2.5485213372488075e-06, "loss": 0.0219, "step": 17640 }, { "epoch": 2.0917333333333334, "grad_norm": 0.5602545664645805, "learning_rate": 2.542514376815334e-06, "loss": 0.0224, "step": 17650 }, { "epoch": 2.0929185185185184, "grad_norm": 0.5108298103386538, "learning_rate": 2.536512089502226e-06, "loss": 0.0202, "step": 17660 }, { "epoch": 2.094103703703704, "grad_norm": 0.6829735094554921, "learning_rate": 2.530514486723348e-06, "loss": 0.0233, "step": 17670 }, { "epoch": 2.0952888888888888, "grad_norm": 0.5213175703053962, "learning_rate": 2.524521579883659e-06, "loss": 0.0216, "step": 17680 }, { "epoch": 2.096474074074074, "grad_norm": 0.49732457335620184, "learning_rate": 2.5185333803791896e-06, "loss": 0.0205, "step": 17690 }, { "epoch": 2.097659259259259, "grad_norm": 0.5454009761276458, "learning_rate": 2.512549899597014e-06, "loss": 0.0213, "step": 17700 }, { "epoch": 2.0988444444444445, "grad_norm": 0.554626102113036, "learning_rate": 2.5065711489152363e-06, "loss": 0.021, "step": 17710 }, { "epoch": 2.1000296296296295, "grad_norm": 0.4848245125944637, "learning_rate": 2.5005971397029625e-06, "loss": 0.0224, "step": 17720 }, { "epoch": 2.101214814814815, "grad_norm": 0.6457509046815929, "learning_rate": 2.4946278833202897e-06, "loss": 0.0195, "step": 17730 }, { "epoch": 2.1024, "grad_norm": 0.42380878103800584, "learning_rate": 2.488663391118271e-06, "loss": 0.0188, "step": 17740 }, { "epoch": 2.1035851851851852, "grad_norm": 0.4105428902709983, "learning_rate": 2.4827036744389007e-06, "loss": 0.0202, "step": 17750 }, { "epoch": 2.10477037037037, "grad_norm": 0.43751033770369263, "learning_rate": 2.4767487446150896e-06, "loss": 0.0196, "step": 17760 }, { "epoch": 2.1059555555555556, "grad_norm": 0.5666900006161766, "learning_rate": 2.4707986129706563e-06, "loss": 0.0205, "step": 17770 }, { "epoch": 2.1071407407407405, "grad_norm": 0.6226424044192759, "learning_rate": 2.464853290820284e-06, "loss": 0.0212, "step": 17780 }, { "epoch": 2.108325925925926, "grad_norm": 0.5324995744190012, "learning_rate": 2.458912789469516e-06, "loss": 0.0203, "step": 17790 }, { "epoch": 2.109511111111111, "grad_norm": 0.6628981567148068, "learning_rate": 2.452977120214723e-06, "loss": 0.0203, "step": 17800 }, { "epoch": 2.1106962962962963, "grad_norm": 0.6104277124955798, "learning_rate": 2.4470462943430954e-06, "loss": 0.0195, "step": 17810 }, { "epoch": 2.1118814814814817, "grad_norm": 0.45996499379702915, "learning_rate": 2.4411203231326076e-06, "loss": 0.0234, "step": 17820 }, { "epoch": 2.1130666666666666, "grad_norm": 0.5682831803009347, "learning_rate": 2.4351992178520025e-06, "loss": 0.0202, "step": 17830 }, { "epoch": 2.114251851851852, "grad_norm": 0.5364567179377799, "learning_rate": 2.42928298976077e-06, "loss": 0.0197, "step": 17840 }, { "epoch": 2.115437037037037, "grad_norm": 0.5352850842233184, "learning_rate": 2.4233716501091294e-06, "loss": 0.0212, "step": 17850 }, { "epoch": 2.1166222222222224, "grad_norm": 0.5781701329699696, "learning_rate": 2.417465210138002e-06, "loss": 0.0195, "step": 17860 }, { "epoch": 2.1178074074074074, "grad_norm": 0.5151015814679271, "learning_rate": 2.411563681078991e-06, "loss": 0.0185, "step": 17870 }, { "epoch": 2.1189925925925928, "grad_norm": 0.5031295769562851, "learning_rate": 2.4056670741543598e-06, "loss": 0.021, "step": 17880 }, { "epoch": 2.1201777777777777, "grad_norm": 0.5490496818347299, "learning_rate": 2.3997754005770175e-06, "loss": 0.0189, "step": 17890 }, { "epoch": 2.121362962962963, "grad_norm": 0.562956163704708, "learning_rate": 2.3938886715504856e-06, "loss": 0.0191, "step": 17900 }, { "epoch": 2.122548148148148, "grad_norm": 0.5580659425582958, "learning_rate": 2.388006898268887e-06, "loss": 0.0198, "step": 17910 }, { "epoch": 2.1237333333333335, "grad_norm": 0.49905480307253824, "learning_rate": 2.382130091916917e-06, "loss": 0.0209, "step": 17920 }, { "epoch": 2.1249185185185184, "grad_norm": 0.5349599214785156, "learning_rate": 2.376258263669831e-06, "loss": 0.0206, "step": 17930 }, { "epoch": 2.126103703703704, "grad_norm": 0.48181259176056973, "learning_rate": 2.370391424693417e-06, "loss": 0.0203, "step": 17940 }, { "epoch": 2.127288888888889, "grad_norm": 0.703787396056072, "learning_rate": 2.364529586143973e-06, "loss": 0.0223, "step": 17950 }, { "epoch": 2.128474074074074, "grad_norm": 0.4809162493123757, "learning_rate": 2.3586727591682867e-06, "loss": 0.0206, "step": 17960 }, { "epoch": 2.129659259259259, "grad_norm": 0.5506637606674155, "learning_rate": 2.352820954903623e-06, "loss": 0.0172, "step": 17970 }, { "epoch": 2.1308444444444445, "grad_norm": 0.39490121962813945, "learning_rate": 2.346974184477689e-06, "loss": 0.0208, "step": 17980 }, { "epoch": 2.1320296296296295, "grad_norm": 0.5460609055203705, "learning_rate": 2.3411324590086194e-06, "loss": 0.0193, "step": 17990 }, { "epoch": 2.133214814814815, "grad_norm": 0.7225464713084386, "learning_rate": 2.3352957896049626e-06, "loss": 0.0218, "step": 18000 }, { "epoch": 2.1344, "grad_norm": 0.507862371964674, "learning_rate": 2.329464187365643e-06, "loss": 0.0217, "step": 18010 }, { "epoch": 2.1355851851851853, "grad_norm": 0.5153060922390552, "learning_rate": 2.3236376633799582e-06, "loss": 0.0219, "step": 18020 }, { "epoch": 2.13677037037037, "grad_norm": 0.6253823099089397, "learning_rate": 2.317816228727543e-06, "loss": 0.0193, "step": 18030 }, { "epoch": 2.1379555555555556, "grad_norm": 0.5404761498282122, "learning_rate": 2.3119998944783562e-06, "loss": 0.0183, "step": 18040 }, { "epoch": 2.1391407407407406, "grad_norm": 0.4335318956345966, "learning_rate": 2.3061886716926562e-06, "loss": 0.0201, "step": 18050 }, { "epoch": 2.140325925925926, "grad_norm": 0.5610753444629021, "learning_rate": 2.3003825714209873e-06, "loss": 0.0212, "step": 18060 }, { "epoch": 2.141511111111111, "grad_norm": 0.6260871102303472, "learning_rate": 2.2945816047041438e-06, "loss": 0.0197, "step": 18070 }, { "epoch": 2.1426962962962963, "grad_norm": 0.5835361641829071, "learning_rate": 2.2887857825731676e-06, "loss": 0.0222, "step": 18080 }, { "epoch": 2.1438814814814813, "grad_norm": 0.526656348906993, "learning_rate": 2.2829951160493092e-06, "loss": 0.0227, "step": 18090 }, { "epoch": 2.1450666666666667, "grad_norm": 0.5026505854718598, "learning_rate": 2.277209616144023e-06, "loss": 0.0181, "step": 18100 }, { "epoch": 2.1462518518518516, "grad_norm": 0.6648280571177739, "learning_rate": 2.2714292938589327e-06, "loss": 0.0212, "step": 18110 }, { "epoch": 2.147437037037037, "grad_norm": 0.4827403542009834, "learning_rate": 2.2656541601858195e-06, "loss": 0.0186, "step": 18120 }, { "epoch": 2.1486222222222224, "grad_norm": 0.4104864640118743, "learning_rate": 2.2598842261065943e-06, "loss": 0.0204, "step": 18130 }, { "epoch": 2.1498074074074074, "grad_norm": 0.531122536135586, "learning_rate": 2.2541195025932877e-06, "loss": 0.0198, "step": 18140 }, { "epoch": 2.150992592592593, "grad_norm": 0.5562995099275271, "learning_rate": 2.2483600006080126e-06, "loss": 0.0196, "step": 18150 }, { "epoch": 2.1521777777777777, "grad_norm": 0.42433929016876354, "learning_rate": 2.242605731102962e-06, "loss": 0.02, "step": 18160 }, { "epoch": 2.153362962962963, "grad_norm": 0.5370964002455985, "learning_rate": 2.236856705020371e-06, "loss": 0.0204, "step": 18170 }, { "epoch": 2.154548148148148, "grad_norm": 0.4995445837538678, "learning_rate": 2.231112933292511e-06, "loss": 0.019, "step": 18180 }, { "epoch": 2.1557333333333335, "grad_norm": 0.7250541553433348, "learning_rate": 2.2253744268416557e-06, "loss": 0.0227, "step": 18190 }, { "epoch": 2.1569185185185185, "grad_norm": 0.5341142665020637, "learning_rate": 2.219641196580069e-06, "loss": 0.0219, "step": 18200 }, { "epoch": 2.158103703703704, "grad_norm": 0.5540512775000475, "learning_rate": 2.2139132534099807e-06, "loss": 0.0197, "step": 18210 }, { "epoch": 2.159288888888889, "grad_norm": 0.5720923139855493, "learning_rate": 2.208190608223568e-06, "loss": 0.0207, "step": 18220 }, { "epoch": 2.160474074074074, "grad_norm": 0.5573024949298367, "learning_rate": 2.202473271902936e-06, "loss": 0.0199, "step": 18230 }, { "epoch": 2.161659259259259, "grad_norm": 0.46947015664990394, "learning_rate": 2.19676125532009e-06, "loss": 0.0202, "step": 18240 }, { "epoch": 2.1628444444444446, "grad_norm": 0.5848449267707536, "learning_rate": 2.19105456933692e-06, "loss": 0.0221, "step": 18250 }, { "epoch": 2.1640296296296295, "grad_norm": 0.7057389535726815, "learning_rate": 2.1853532248051794e-06, "loss": 0.0219, "step": 18260 }, { "epoch": 2.165214814814815, "grad_norm": 0.6746444885619608, "learning_rate": 2.179657232566469e-06, "loss": 0.0209, "step": 18270 }, { "epoch": 2.1664, "grad_norm": 0.5431912367483681, "learning_rate": 2.1739666034522054e-06, "loss": 0.02, "step": 18280 }, { "epoch": 2.1675851851851853, "grad_norm": 0.4652152296806473, "learning_rate": 2.1682813482836092e-06, "loss": 0.0198, "step": 18290 }, { "epoch": 2.1687703703703702, "grad_norm": 0.6062192440874989, "learning_rate": 2.162601477871683e-06, "loss": 0.021, "step": 18300 }, { "epoch": 2.1699555555555556, "grad_norm": 0.6923259295522756, "learning_rate": 2.1569270030171912e-06, "loss": 0.018, "step": 18310 }, { "epoch": 2.1711407407407406, "grad_norm": 0.383772317909671, "learning_rate": 2.151257934510634e-06, "loss": 0.0199, "step": 18320 }, { "epoch": 2.172325925925926, "grad_norm": 0.533245204040971, "learning_rate": 2.1455942831322337e-06, "loss": 0.0201, "step": 18330 }, { "epoch": 2.173511111111111, "grad_norm": 0.5702816356505999, "learning_rate": 2.139936059651908e-06, "loss": 0.0202, "step": 18340 }, { "epoch": 2.1746962962962963, "grad_norm": 0.41103437915339863, "learning_rate": 2.13428327482926e-06, "loss": 0.02, "step": 18350 }, { "epoch": 2.1758814814814813, "grad_norm": 0.47711776383543486, "learning_rate": 2.128635939413544e-06, "loss": 0.0204, "step": 18360 }, { "epoch": 2.1770666666666667, "grad_norm": 0.6101847075721979, "learning_rate": 2.1229940641436525e-06, "loss": 0.0208, "step": 18370 }, { "epoch": 2.1782518518518517, "grad_norm": 0.4549123352331804, "learning_rate": 2.117357659748099e-06, "loss": 0.0181, "step": 18380 }, { "epoch": 2.179437037037037, "grad_norm": 0.3284452989493554, "learning_rate": 2.111726736944994e-06, "loss": 0.0189, "step": 18390 }, { "epoch": 2.180622222222222, "grad_norm": 0.5823265552863844, "learning_rate": 2.106101306442018e-06, "loss": 0.0206, "step": 18400 }, { "epoch": 2.1818074074074074, "grad_norm": 0.6740901089629598, "learning_rate": 2.1004813789364128e-06, "loss": 0.0204, "step": 18410 }, { "epoch": 2.1829925925925924, "grad_norm": 0.5418554750031419, "learning_rate": 2.0948669651149512e-06, "loss": 0.0193, "step": 18420 }, { "epoch": 2.1841777777777778, "grad_norm": 0.4896848300833659, "learning_rate": 2.089258075653928e-06, "loss": 0.0197, "step": 18430 }, { "epoch": 2.185362962962963, "grad_norm": 0.5520432689675453, "learning_rate": 2.0836547212191283e-06, "loss": 0.0186, "step": 18440 }, { "epoch": 2.186548148148148, "grad_norm": 0.6224336923265456, "learning_rate": 2.0780569124658114e-06, "loss": 0.0209, "step": 18450 }, { "epoch": 2.1877333333333335, "grad_norm": 0.6011837329328497, "learning_rate": 2.0724646600386893e-06, "loss": 0.0217, "step": 18460 }, { "epoch": 2.1889185185185185, "grad_norm": 0.7273667529581211, "learning_rate": 2.0668779745719188e-06, "loss": 0.0206, "step": 18470 }, { "epoch": 2.190103703703704, "grad_norm": 0.7029829492058258, "learning_rate": 2.0612968666890583e-06, "loss": 0.0204, "step": 18480 }, { "epoch": 2.191288888888889, "grad_norm": 0.7805906621478219, "learning_rate": 2.055721347003065e-06, "loss": 0.0197, "step": 18490 }, { "epoch": 2.1924740740740742, "grad_norm": 0.6074290614751168, "learning_rate": 2.0501514261162685e-06, "loss": 0.0207, "step": 18500 }, { "epoch": 2.193659259259259, "grad_norm": 0.4782578294360636, "learning_rate": 2.0445871146203554e-06, "loss": 0.0178, "step": 18510 }, { "epoch": 2.1948444444444446, "grad_norm": 0.8116631849011151, "learning_rate": 2.0390284230963413e-06, "loss": 0.0197, "step": 18520 }, { "epoch": 2.1960296296296296, "grad_norm": 0.4698401755295345, "learning_rate": 2.0334753621145547e-06, "loss": 0.0198, "step": 18530 }, { "epoch": 2.197214814814815, "grad_norm": 0.5394019591374424, "learning_rate": 2.0279279422346214e-06, "loss": 0.018, "step": 18540 }, { "epoch": 2.1984, "grad_norm": 0.6298320623550064, "learning_rate": 2.0223861740054358e-06, "loss": 0.02, "step": 18550 }, { "epoch": 2.1995851851851853, "grad_norm": 0.5557470409692559, "learning_rate": 2.016850067965149e-06, "loss": 0.0236, "step": 18560 }, { "epoch": 2.2007703703703703, "grad_norm": 0.47740034710081647, "learning_rate": 2.0113196346411425e-06, "loss": 0.0202, "step": 18570 }, { "epoch": 2.2019555555555557, "grad_norm": 0.5071439018196295, "learning_rate": 2.0057948845500086e-06, "loss": 0.0197, "step": 18580 }, { "epoch": 2.2031407407407406, "grad_norm": 0.5926803996636547, "learning_rate": 2.0002758281975384e-06, "loss": 0.0209, "step": 18590 }, { "epoch": 2.204325925925926, "grad_norm": 0.5431350843715361, "learning_rate": 1.994762476078691e-06, "loss": 0.0185, "step": 18600 }, { "epoch": 2.205511111111111, "grad_norm": 0.7418902778752838, "learning_rate": 1.9892548386775777e-06, "loss": 0.0222, "step": 18610 }, { "epoch": 2.2066962962962964, "grad_norm": 0.538031244062795, "learning_rate": 1.983752926467449e-06, "loss": 0.0186, "step": 18620 }, { "epoch": 2.2078814814814813, "grad_norm": 0.4917170815245571, "learning_rate": 1.9782567499106607e-06, "loss": 0.0187, "step": 18630 }, { "epoch": 2.2090666666666667, "grad_norm": 0.5831414690924662, "learning_rate": 1.9727663194586686e-06, "loss": 0.0196, "step": 18640 }, { "epoch": 2.2102518518518517, "grad_norm": 0.5300137465156955, "learning_rate": 1.967281645551998e-06, "loss": 0.0206, "step": 18650 }, { "epoch": 2.211437037037037, "grad_norm": 0.4390825423197908, "learning_rate": 1.9618027386202267e-06, "loss": 0.0196, "step": 18660 }, { "epoch": 2.212622222222222, "grad_norm": 0.6589359054812434, "learning_rate": 1.9563296090819665e-06, "loss": 0.0174, "step": 18670 }, { "epoch": 2.2138074074074074, "grad_norm": 0.6486500159314164, "learning_rate": 1.950862267344848e-06, "loss": 0.0186, "step": 18680 }, { "epoch": 2.2149925925925924, "grad_norm": 0.5219787693622737, "learning_rate": 1.9454007238054883e-06, "loss": 0.0195, "step": 18690 }, { "epoch": 2.216177777777778, "grad_norm": 0.4806237457949874, "learning_rate": 1.9399449888494855e-06, "loss": 0.0199, "step": 18700 }, { "epoch": 2.2173629629629628, "grad_norm": 0.5748413766392373, "learning_rate": 1.934495072851386e-06, "loss": 0.0182, "step": 18710 }, { "epoch": 2.218548148148148, "grad_norm": 0.4851993483355934, "learning_rate": 1.9290509861746774e-06, "loss": 0.0203, "step": 18720 }, { "epoch": 2.219733333333333, "grad_norm": 0.4144299966291897, "learning_rate": 1.923612739171757e-06, "loss": 0.0197, "step": 18730 }, { "epoch": 2.2209185185185185, "grad_norm": 0.48060856122098367, "learning_rate": 1.9181803421839194e-06, "loss": 0.0186, "step": 18740 }, { "epoch": 2.222103703703704, "grad_norm": 0.5062301767390581, "learning_rate": 1.9127538055413334e-06, "loss": 0.0187, "step": 18750 }, { "epoch": 2.223288888888889, "grad_norm": 0.4595103341480192, "learning_rate": 1.9073331395630274e-06, "loss": 0.0228, "step": 18760 }, { "epoch": 2.2244740740740743, "grad_norm": 0.6514335085910568, "learning_rate": 1.9019183545568653e-06, "loss": 0.0214, "step": 18770 }, { "epoch": 2.2256592592592592, "grad_norm": 0.6109574387758372, "learning_rate": 1.8965094608195251e-06, "loss": 0.0205, "step": 18780 }, { "epoch": 2.2268444444444446, "grad_norm": 0.5820956836800809, "learning_rate": 1.891106468636482e-06, "loss": 0.02, "step": 18790 }, { "epoch": 2.2280296296296296, "grad_norm": 0.5413395837266808, "learning_rate": 1.8857093882819944e-06, "loss": 0.0211, "step": 18800 }, { "epoch": 2.229214814814815, "grad_norm": 0.4719920671745342, "learning_rate": 1.8803182300190725e-06, "loss": 0.0187, "step": 18810 }, { "epoch": 2.2304, "grad_norm": 0.5768584490371501, "learning_rate": 1.8749330040994678e-06, "loss": 0.0217, "step": 18820 }, { "epoch": 2.2315851851851853, "grad_norm": 0.8473451199821922, "learning_rate": 1.8695537207636494e-06, "loss": 0.0192, "step": 18830 }, { "epoch": 2.2327703703703703, "grad_norm": 0.44361653478048213, "learning_rate": 1.864180390240789e-06, "loss": 0.0192, "step": 18840 }, { "epoch": 2.2339555555555557, "grad_norm": 0.40599631041569073, "learning_rate": 1.8588130227487383e-06, "loss": 0.0182, "step": 18850 }, { "epoch": 2.2351407407407407, "grad_norm": 0.6909213136326204, "learning_rate": 1.8534516284940074e-06, "loss": 0.0197, "step": 18860 }, { "epoch": 2.236325925925926, "grad_norm": 0.3922244118276811, "learning_rate": 1.8480962176717482e-06, "loss": 0.0183, "step": 18870 }, { "epoch": 2.237511111111111, "grad_norm": 0.5463617420415016, "learning_rate": 1.8427468004657333e-06, "loss": 0.0185, "step": 18880 }, { "epoch": 2.2386962962962964, "grad_norm": 0.5461376183609467, "learning_rate": 1.8374033870483443e-06, "loss": 0.0182, "step": 18890 }, { "epoch": 2.2398814814814814, "grad_norm": 0.5604828248991386, "learning_rate": 1.8320659875805392e-06, "loss": 0.0205, "step": 18900 }, { "epoch": 2.2410666666666668, "grad_norm": 0.46302276044294616, "learning_rate": 1.8267346122118402e-06, "loss": 0.0182, "step": 18910 }, { "epoch": 2.2422518518518517, "grad_norm": 0.5732434540872019, "learning_rate": 1.8214092710803183e-06, "loss": 0.0181, "step": 18920 }, { "epoch": 2.243437037037037, "grad_norm": 0.6068526861550686, "learning_rate": 1.8160899743125699e-06, "loss": 0.0211, "step": 18930 }, { "epoch": 2.244622222222222, "grad_norm": 0.5417544123354159, "learning_rate": 1.8107767320236936e-06, "loss": 0.0198, "step": 18940 }, { "epoch": 2.2458074074074075, "grad_norm": 0.5191164732118095, "learning_rate": 1.8054695543172763e-06, "loss": 0.0178, "step": 18950 }, { "epoch": 2.2469925925925924, "grad_norm": 0.5061386324089637, "learning_rate": 1.8001684512853723e-06, "loss": 0.0204, "step": 18960 }, { "epoch": 2.248177777777778, "grad_norm": 0.6364838745026915, "learning_rate": 1.7948734330084882e-06, "loss": 0.0182, "step": 18970 }, { "epoch": 2.249362962962963, "grad_norm": 0.6514254175406258, "learning_rate": 1.7895845095555547e-06, "loss": 0.0199, "step": 18980 }, { "epoch": 2.250548148148148, "grad_norm": 0.34277694503495626, "learning_rate": 1.7843016909839135e-06, "loss": 0.0199, "step": 18990 }, { "epoch": 2.251733333333333, "grad_norm": 0.4903627282589065, "learning_rate": 1.7790249873393006e-06, "loss": 0.021, "step": 19000 }, { "epoch": 2.2529185185185185, "grad_norm": 0.6614042165954529, "learning_rate": 1.7737544086558234e-06, "loss": 0.0192, "step": 19010 }, { "epoch": 2.254103703703704, "grad_norm": 0.5624150614018596, "learning_rate": 1.768489964955939e-06, "loss": 0.0187, "step": 19020 }, { "epoch": 2.255288888888889, "grad_norm": 0.46672634499401694, "learning_rate": 1.7632316662504401e-06, "loss": 0.0198, "step": 19030 }, { "epoch": 2.256474074074074, "grad_norm": 0.4534239884903897, "learning_rate": 1.7579795225384328e-06, "loss": 0.0202, "step": 19040 }, { "epoch": 2.2576592592592593, "grad_norm": 0.5612354919044851, "learning_rate": 1.7527335438073234e-06, "loss": 0.0201, "step": 19050 }, { "epoch": 2.2588444444444447, "grad_norm": 0.5078202278505786, "learning_rate": 1.74749374003279e-06, "loss": 0.0207, "step": 19060 }, { "epoch": 2.2600296296296296, "grad_norm": 0.48327683668384314, "learning_rate": 1.7422601211787687e-06, "loss": 0.0186, "step": 19070 }, { "epoch": 2.2612148148148146, "grad_norm": 0.6444750291437481, "learning_rate": 1.7370326971974383e-06, "loss": 0.0204, "step": 19080 }, { "epoch": 2.2624, "grad_norm": 0.7859311222785264, "learning_rate": 1.7318114780291966e-06, "loss": 0.0195, "step": 19090 }, { "epoch": 2.2635851851851854, "grad_norm": 0.6199362599944687, "learning_rate": 1.726596473602639e-06, "loss": 0.0202, "step": 19100 }, { "epoch": 2.2647703703703703, "grad_norm": 0.45617328690617087, "learning_rate": 1.7213876938345459e-06, "loss": 0.0193, "step": 19110 }, { "epoch": 2.2659555555555557, "grad_norm": 0.7439906679979321, "learning_rate": 1.7161851486298576e-06, "loss": 0.0204, "step": 19120 }, { "epoch": 2.2671407407407407, "grad_norm": 0.4159851361012525, "learning_rate": 1.7109888478816655e-06, "loss": 0.0206, "step": 19130 }, { "epoch": 2.268325925925926, "grad_norm": 0.5041915086825026, "learning_rate": 1.7057988014711812e-06, "loss": 0.0182, "step": 19140 }, { "epoch": 2.269511111111111, "grad_norm": 0.5866946063645181, "learning_rate": 1.7006150192677224e-06, "loss": 0.0184, "step": 19150 }, { "epoch": 2.2706962962962964, "grad_norm": 0.5343813177009338, "learning_rate": 1.6954375111286998e-06, "loss": 0.0199, "step": 19160 }, { "epoch": 2.2718814814814814, "grad_norm": 0.450962320628448, "learning_rate": 1.6902662868995884e-06, "loss": 0.0178, "step": 19170 }, { "epoch": 2.273066666666667, "grad_norm": 0.531722771994754, "learning_rate": 1.6851013564139185e-06, "loss": 0.0193, "step": 19180 }, { "epoch": 2.2742518518518517, "grad_norm": 0.5961588257650573, "learning_rate": 1.6799427294932486e-06, "loss": 0.0184, "step": 19190 }, { "epoch": 2.275437037037037, "grad_norm": 0.56687367750425, "learning_rate": 1.67479041594715e-06, "loss": 0.0204, "step": 19200 }, { "epoch": 2.276622222222222, "grad_norm": 0.42683201091368494, "learning_rate": 1.6696444255731935e-06, "loss": 0.018, "step": 19210 }, { "epoch": 2.2778074074074075, "grad_norm": 0.5748915577969967, "learning_rate": 1.6645047681569203e-06, "loss": 0.0187, "step": 19220 }, { "epoch": 2.2789925925925925, "grad_norm": 0.5817547915169919, "learning_rate": 1.6593714534718309e-06, "loss": 0.0206, "step": 19230 }, { "epoch": 2.280177777777778, "grad_norm": 0.5542328170124374, "learning_rate": 1.654244491279367e-06, "loss": 0.0182, "step": 19240 }, { "epoch": 2.281362962962963, "grad_norm": 0.6450951716958563, "learning_rate": 1.6491238913288855e-06, "loss": 0.0204, "step": 19250 }, { "epoch": 2.282548148148148, "grad_norm": 0.49517173325562236, "learning_rate": 1.6440096633576508e-06, "loss": 0.0183, "step": 19260 }, { "epoch": 2.283733333333333, "grad_norm": 0.5335860006965933, "learning_rate": 1.6389018170908066e-06, "loss": 0.0183, "step": 19270 }, { "epoch": 2.2849185185185186, "grad_norm": 0.5659322809392169, "learning_rate": 1.6338003622413612e-06, "loss": 0.0193, "step": 19280 }, { "epoch": 2.2861037037037035, "grad_norm": 0.618359230890751, "learning_rate": 1.6287053085101683e-06, "loss": 0.0182, "step": 19290 }, { "epoch": 2.287288888888889, "grad_norm": 0.4467925544095055, "learning_rate": 1.6236166655859137e-06, "loss": 0.0184, "step": 19300 }, { "epoch": 2.288474074074074, "grad_norm": 0.46540818654384486, "learning_rate": 1.61853444314509e-06, "loss": 0.0186, "step": 19310 }, { "epoch": 2.2896592592592593, "grad_norm": 0.49748673558436396, "learning_rate": 1.61345865085198e-06, "loss": 0.0185, "step": 19320 }, { "epoch": 2.2908444444444447, "grad_norm": 0.5120088700070919, "learning_rate": 1.6083892983586368e-06, "loss": 0.0192, "step": 19330 }, { "epoch": 2.2920296296296296, "grad_norm": 0.5187240607761581, "learning_rate": 1.6033263953048744e-06, "loss": 0.021, "step": 19340 }, { "epoch": 2.2932148148148146, "grad_norm": 0.5964695573469528, "learning_rate": 1.598269951318237e-06, "loss": 0.018, "step": 19350 }, { "epoch": 2.2944, "grad_norm": 0.46062815626209913, "learning_rate": 1.5932199760139871e-06, "loss": 0.0182, "step": 19360 }, { "epoch": 2.2955851851851854, "grad_norm": 0.5543293887546755, "learning_rate": 1.5881764789950866e-06, "loss": 0.02, "step": 19370 }, { "epoch": 2.2967703703703704, "grad_norm": 0.7314322654316134, "learning_rate": 1.5831394698521802e-06, "loss": 0.0219, "step": 19380 }, { "epoch": 2.2979555555555553, "grad_norm": 0.5000687215613968, "learning_rate": 1.5781089581635761e-06, "loss": 0.0188, "step": 19390 }, { "epoch": 2.2991407407407407, "grad_norm": 0.5545642583791672, "learning_rate": 1.573084953495223e-06, "loss": 0.0208, "step": 19400 }, { "epoch": 2.300325925925926, "grad_norm": 0.47756079154830733, "learning_rate": 1.5680674654006967e-06, "loss": 0.019, "step": 19410 }, { "epoch": 2.301511111111111, "grad_norm": 0.4781451441295527, "learning_rate": 1.5630565034211859e-06, "loss": 0.0183, "step": 19420 }, { "epoch": 2.3026962962962965, "grad_norm": 0.5016522216835004, "learning_rate": 1.558052077085464e-06, "loss": 0.0215, "step": 19430 }, { "epoch": 2.3038814814814814, "grad_norm": 0.7425326239908316, "learning_rate": 1.5530541959098787e-06, "loss": 0.0184, "step": 19440 }, { "epoch": 2.305066666666667, "grad_norm": 0.6834690391818072, "learning_rate": 1.5480628693983297e-06, "loss": 0.0199, "step": 19450 }, { "epoch": 2.3062518518518518, "grad_norm": 0.583272257862348, "learning_rate": 1.5430781070422546e-06, "loss": 0.0199, "step": 19460 }, { "epoch": 2.307437037037037, "grad_norm": 0.5233940029356275, "learning_rate": 1.5380999183206097e-06, "loss": 0.0204, "step": 19470 }, { "epoch": 2.308622222222222, "grad_norm": 0.5448020340360619, "learning_rate": 1.5331283126998487e-06, "loss": 0.0199, "step": 19480 }, { "epoch": 2.3098074074074075, "grad_norm": 0.5544604627719499, "learning_rate": 1.5281632996339046e-06, "loss": 0.02, "step": 19490 }, { "epoch": 2.3109925925925925, "grad_norm": 0.4639834453316954, "learning_rate": 1.5232048885641803e-06, "loss": 0.02, "step": 19500 }, { "epoch": 2.312177777777778, "grad_norm": 0.5715526445810782, "learning_rate": 1.5182530889195201e-06, "loss": 0.0198, "step": 19510 }, { "epoch": 2.313362962962963, "grad_norm": 0.6745161045879382, "learning_rate": 1.5133079101161973e-06, "loss": 0.0169, "step": 19520 }, { "epoch": 2.3145481481481482, "grad_norm": 0.6139144226970469, "learning_rate": 1.508369361557892e-06, "loss": 0.0192, "step": 19530 }, { "epoch": 2.315733333333333, "grad_norm": 0.4963197356365394, "learning_rate": 1.5034374526356825e-06, "loss": 0.0178, "step": 19540 }, { "epoch": 2.3169185185185186, "grad_norm": 0.49984660239649953, "learning_rate": 1.4985121927280184e-06, "loss": 0.0189, "step": 19550 }, { "epoch": 2.3181037037037036, "grad_norm": 0.424350799088494, "learning_rate": 1.4935935912007037e-06, "loss": 0.0185, "step": 19560 }, { "epoch": 2.319288888888889, "grad_norm": 0.5169508957159925, "learning_rate": 1.4886816574068823e-06, "loss": 0.0189, "step": 19570 }, { "epoch": 2.320474074074074, "grad_norm": 0.4227018390009985, "learning_rate": 1.4837764006870187e-06, "loss": 0.0196, "step": 19580 }, { "epoch": 2.3216592592592593, "grad_norm": 0.8490205401988372, "learning_rate": 1.4788778303688822e-06, "loss": 0.0199, "step": 19590 }, { "epoch": 2.3228444444444443, "grad_norm": 0.5883017405852737, "learning_rate": 1.4739859557675245e-06, "loss": 0.0202, "step": 19600 }, { "epoch": 2.3240296296296297, "grad_norm": 0.5231568361461372, "learning_rate": 1.469100786185265e-06, "loss": 0.0199, "step": 19610 }, { "epoch": 2.3252148148148146, "grad_norm": 0.6094023317693572, "learning_rate": 1.4642223309116753e-06, "loss": 0.018, "step": 19620 }, { "epoch": 2.3264, "grad_norm": 0.6763932563549391, "learning_rate": 1.4593505992235602e-06, "loss": 0.0201, "step": 19630 }, { "epoch": 2.3275851851851854, "grad_norm": 0.5978441020272818, "learning_rate": 1.454485600384934e-06, "loss": 0.0189, "step": 19640 }, { "epoch": 2.3287703703703704, "grad_norm": 0.48939865171789004, "learning_rate": 1.4496273436470104e-06, "loss": 0.0181, "step": 19650 }, { "epoch": 2.3299555555555553, "grad_norm": 0.6865010857472132, "learning_rate": 1.4447758382481825e-06, "loss": 0.0222, "step": 19660 }, { "epoch": 2.3311407407407407, "grad_norm": 0.44258967685648043, "learning_rate": 1.439931093414007e-06, "loss": 0.0178, "step": 19670 }, { "epoch": 2.332325925925926, "grad_norm": 0.5330345150623879, "learning_rate": 1.435093118357182e-06, "loss": 0.0202, "step": 19680 }, { "epoch": 2.333511111111111, "grad_norm": 0.308049161587547, "learning_rate": 1.430261922277532e-06, "loss": 0.0179, "step": 19690 }, { "epoch": 2.334696296296296, "grad_norm": 0.5235514846875791, "learning_rate": 1.4254375143619936e-06, "loss": 0.019, "step": 19700 }, { "epoch": 2.3358814814814814, "grad_norm": 0.6449493209413945, "learning_rate": 1.4206199037845953e-06, "loss": 0.0198, "step": 19710 }, { "epoch": 2.337066666666667, "grad_norm": 0.531810202050383, "learning_rate": 1.4158090997064356e-06, "loss": 0.0194, "step": 19720 }, { "epoch": 2.338251851851852, "grad_norm": 0.5200387615016707, "learning_rate": 1.4110051112756734e-06, "loss": 0.017, "step": 19730 }, { "epoch": 2.339437037037037, "grad_norm": 0.5733044729616007, "learning_rate": 1.4062079476275041e-06, "loss": 0.0194, "step": 19740 }, { "epoch": 2.340622222222222, "grad_norm": 0.5881944095852142, "learning_rate": 1.4014176178841505e-06, "loss": 0.0203, "step": 19750 }, { "epoch": 2.3418074074074076, "grad_norm": 0.5204774438575377, "learning_rate": 1.3966341311548348e-06, "loss": 0.0196, "step": 19760 }, { "epoch": 2.3429925925925925, "grad_norm": 0.5075643738321208, "learning_rate": 1.3918574965357673e-06, "loss": 0.0182, "step": 19770 }, { "epoch": 2.344177777777778, "grad_norm": 0.6542323545731978, "learning_rate": 1.3870877231101326e-06, "loss": 0.0162, "step": 19780 }, { "epoch": 2.345362962962963, "grad_norm": 0.6855244280424457, "learning_rate": 1.3823248199480632e-06, "loss": 0.0174, "step": 19790 }, { "epoch": 2.3465481481481483, "grad_norm": 0.5520478393210825, "learning_rate": 1.377568796106631e-06, "loss": 0.0194, "step": 19800 }, { "epoch": 2.3477333333333332, "grad_norm": 0.4414258845402759, "learning_rate": 1.3728196606298238e-06, "loss": 0.0199, "step": 19810 }, { "epoch": 2.3489185185185186, "grad_norm": 0.5550926084482453, "learning_rate": 1.3680774225485293e-06, "loss": 0.0186, "step": 19820 }, { "epoch": 2.3501037037037036, "grad_norm": 0.6693271388860823, "learning_rate": 1.363342090880525e-06, "loss": 0.0195, "step": 19830 }, { "epoch": 2.351288888888889, "grad_norm": 0.6086432568859389, "learning_rate": 1.358613674630448e-06, "loss": 0.02, "step": 19840 }, { "epoch": 2.352474074074074, "grad_norm": 0.5725342898704174, "learning_rate": 1.3538921827897922e-06, "loss": 0.0182, "step": 19850 }, { "epoch": 2.3536592592592593, "grad_norm": 0.6380906332845099, "learning_rate": 1.3491776243368782e-06, "loss": 0.0182, "step": 19860 }, { "epoch": 2.3548444444444443, "grad_norm": 0.5021172832151709, "learning_rate": 1.3444700082368434e-06, "loss": 0.019, "step": 19870 }, { "epoch": 2.3560296296296297, "grad_norm": 0.5801743272886423, "learning_rate": 1.3397693434416287e-06, "loss": 0.0219, "step": 19880 }, { "epoch": 2.3572148148148147, "grad_norm": 0.6263972576829657, "learning_rate": 1.3350756388899499e-06, "loss": 0.0192, "step": 19890 }, { "epoch": 2.3584, "grad_norm": 0.645875775540546, "learning_rate": 1.3303889035072892e-06, "loss": 0.0202, "step": 19900 }, { "epoch": 2.359585185185185, "grad_norm": 0.5873097139859446, "learning_rate": 1.3257091462058807e-06, "loss": 0.0191, "step": 19910 }, { "epoch": 2.3607703703703704, "grad_norm": 0.5594482704350012, "learning_rate": 1.3210363758846817e-06, "loss": 0.0183, "step": 19920 }, { "epoch": 2.3619555555555554, "grad_norm": 0.5172918686875249, "learning_rate": 1.3163706014293703e-06, "loss": 0.018, "step": 19930 }, { "epoch": 2.3631407407407408, "grad_norm": 0.7315382363360655, "learning_rate": 1.3117118317123167e-06, "loss": 0.0184, "step": 19940 }, { "epoch": 2.364325925925926, "grad_norm": 0.6329775198479737, "learning_rate": 1.3070600755925712e-06, "loss": 0.0202, "step": 19950 }, { "epoch": 2.365511111111111, "grad_norm": 0.829793080870299, "learning_rate": 1.3024153419158509e-06, "loss": 0.0194, "step": 19960 }, { "epoch": 2.366696296296296, "grad_norm": 0.5805543670629139, "learning_rate": 1.2977776395145147e-06, "loss": 0.0222, "step": 19970 }, { "epoch": 2.3678814814814815, "grad_norm": 0.6373240288302964, "learning_rate": 1.2931469772075534e-06, "loss": 0.0208, "step": 19980 }, { "epoch": 2.369066666666667, "grad_norm": 0.3793110225414597, "learning_rate": 1.2885233638005679e-06, "loss": 0.0176, "step": 19990 }, { "epoch": 2.370251851851852, "grad_norm": 0.4356969281611011, "learning_rate": 1.2839068080857591e-06, "loss": 0.016, "step": 20000 }, { "epoch": 2.371437037037037, "grad_norm": 0.6181439744287168, "learning_rate": 1.2792973188419056e-06, "loss": 0.0182, "step": 20010 }, { "epoch": 2.372622222222222, "grad_norm": 0.4833865287062214, "learning_rate": 1.2746949048343465e-06, "loss": 0.0183, "step": 20020 }, { "epoch": 2.3738074074074076, "grad_norm": 0.4609952543836555, "learning_rate": 1.2700995748149675e-06, "loss": 0.0181, "step": 20030 }, { "epoch": 2.3749925925925925, "grad_norm": 0.5896606908275395, "learning_rate": 1.2655113375221856e-06, "loss": 0.018, "step": 20040 }, { "epoch": 2.376177777777778, "grad_norm": 0.6982786492693911, "learning_rate": 1.2609302016809277e-06, "loss": 0.0209, "step": 20050 }, { "epoch": 2.377362962962963, "grad_norm": 0.5067000949243737, "learning_rate": 1.2563561760026188e-06, "loss": 0.0186, "step": 20060 }, { "epoch": 2.3785481481481483, "grad_norm": 0.5775924887833543, "learning_rate": 1.2517892691851597e-06, "loss": 0.0204, "step": 20070 }, { "epoch": 2.3797333333333333, "grad_norm": 0.5103803660557463, "learning_rate": 1.2472294899129184e-06, "loss": 0.0192, "step": 20080 }, { "epoch": 2.3809185185185187, "grad_norm": 0.5376253046623067, "learning_rate": 1.242676846856709e-06, "loss": 0.0191, "step": 20090 }, { "epoch": 2.3821037037037036, "grad_norm": 0.5813296518527333, "learning_rate": 1.2381313486737728e-06, "loss": 0.0201, "step": 20100 }, { "epoch": 2.383288888888889, "grad_norm": 0.4687920349022994, "learning_rate": 1.2335930040077643e-06, "loss": 0.016, "step": 20110 }, { "epoch": 2.384474074074074, "grad_norm": 0.8056837676198028, "learning_rate": 1.229061821488739e-06, "loss": 0.0181, "step": 20120 }, { "epoch": 2.3856592592592594, "grad_norm": 0.530935296628858, "learning_rate": 1.224537809733129e-06, "loss": 0.0178, "step": 20130 }, { "epoch": 2.3868444444444443, "grad_norm": 0.42222411654219644, "learning_rate": 1.2200209773437316e-06, "loss": 0.0171, "step": 20140 }, { "epoch": 2.3880296296296297, "grad_norm": 0.6151064347479412, "learning_rate": 1.2155113329096912e-06, "loss": 0.0195, "step": 20150 }, { "epoch": 2.3892148148148147, "grad_norm": 0.5238278595648099, "learning_rate": 1.2110088850064867e-06, "loss": 0.018, "step": 20160 }, { "epoch": 2.3904, "grad_norm": 0.5902010228891127, "learning_rate": 1.20651364219591e-06, "loss": 0.0199, "step": 20170 }, { "epoch": 2.391585185185185, "grad_norm": 0.5296221771444523, "learning_rate": 1.2020256130260521e-06, "loss": 0.018, "step": 20180 }, { "epoch": 2.3927703703703704, "grad_norm": 0.7476656151780144, "learning_rate": 1.1975448060312867e-06, "loss": 0.0192, "step": 20190 }, { "epoch": 2.3939555555555554, "grad_norm": 0.6148062697199904, "learning_rate": 1.193071229732251e-06, "loss": 0.0176, "step": 20200 }, { "epoch": 2.395140740740741, "grad_norm": 0.6390239970535004, "learning_rate": 1.1886048926358396e-06, "loss": 0.0195, "step": 20210 }, { "epoch": 2.3963259259259257, "grad_norm": 0.4152743139619699, "learning_rate": 1.184145803235175e-06, "loss": 0.0175, "step": 20220 }, { "epoch": 2.397511111111111, "grad_norm": 0.5160938383689863, "learning_rate": 1.1796939700095971e-06, "loss": 0.0167, "step": 20230 }, { "epoch": 2.398696296296296, "grad_norm": 0.5336357654447675, "learning_rate": 1.1752494014246523e-06, "loss": 0.0184, "step": 20240 }, { "epoch": 2.3998814814814815, "grad_norm": 0.48292619340509424, "learning_rate": 1.1708121059320709e-06, "loss": 0.0181, "step": 20250 }, { "epoch": 2.401066666666667, "grad_norm": 0.6358836808865118, "learning_rate": 1.1663820919697516e-06, "loss": 0.0166, "step": 20260 }, { "epoch": 2.402251851851852, "grad_norm": 0.6887077522557343, "learning_rate": 1.1619593679617457e-06, "loss": 0.0203, "step": 20270 }, { "epoch": 2.403437037037037, "grad_norm": 0.39823512228946023, "learning_rate": 1.1575439423182433e-06, "loss": 0.0181, "step": 20280 }, { "epoch": 2.404622222222222, "grad_norm": 0.5429115888833044, "learning_rate": 1.1531358234355588e-06, "loss": 0.0188, "step": 20290 }, { "epoch": 2.4058074074074076, "grad_norm": 0.4520706804232499, "learning_rate": 1.1487350196961078e-06, "loss": 0.0158, "step": 20300 }, { "epoch": 2.4069925925925926, "grad_norm": 0.4650095784590796, "learning_rate": 1.1443415394683955e-06, "loss": 0.0168, "step": 20310 }, { "epoch": 2.4081777777777775, "grad_norm": 0.6489649052322313, "learning_rate": 1.139955391107005e-06, "loss": 0.0183, "step": 20320 }, { "epoch": 2.409362962962963, "grad_norm": 0.4500355397020522, "learning_rate": 1.1355765829525755e-06, "loss": 0.0172, "step": 20330 }, { "epoch": 2.4105481481481483, "grad_norm": 0.5583578151689336, "learning_rate": 1.1312051233317861e-06, "loss": 0.0193, "step": 20340 }, { "epoch": 2.4117333333333333, "grad_norm": 0.5672274555016643, "learning_rate": 1.1268410205573438e-06, "loss": 0.0179, "step": 20350 }, { "epoch": 2.4129185185185187, "grad_norm": 0.5523373177581667, "learning_rate": 1.1224842829279636e-06, "loss": 0.0191, "step": 20360 }, { "epoch": 2.4141037037037036, "grad_norm": 0.6326189869299843, "learning_rate": 1.1181349187283602e-06, "loss": 0.0198, "step": 20370 }, { "epoch": 2.415288888888889, "grad_norm": 0.4857763939803879, "learning_rate": 1.1137929362292211e-06, "loss": 0.016, "step": 20380 }, { "epoch": 2.416474074074074, "grad_norm": 0.5786082447947875, "learning_rate": 1.109458343687202e-06, "loss": 0.0187, "step": 20390 }, { "epoch": 2.4176592592592594, "grad_norm": 0.5609634800891323, "learning_rate": 1.1051311493449023e-06, "loss": 0.0193, "step": 20400 }, { "epoch": 2.4188444444444444, "grad_norm": 0.52716932764828, "learning_rate": 1.1008113614308536e-06, "loss": 0.0174, "step": 20410 }, { "epoch": 2.4200296296296298, "grad_norm": 0.5480009233089214, "learning_rate": 1.0964989881595068e-06, "loss": 0.0194, "step": 20420 }, { "epoch": 2.4212148148148147, "grad_norm": 0.5680722319281621, "learning_rate": 1.0921940377312086e-06, "loss": 0.0182, "step": 20430 }, { "epoch": 2.4224, "grad_norm": 0.5247975363708455, "learning_rate": 1.0878965183321922e-06, "loss": 0.0191, "step": 20440 }, { "epoch": 2.423585185185185, "grad_norm": 0.6196770658196401, "learning_rate": 1.083606438134563e-06, "loss": 0.0208, "step": 20450 }, { "epoch": 2.4247703703703705, "grad_norm": 0.6681509574607049, "learning_rate": 1.0793238052962746e-06, "loss": 0.0186, "step": 20460 }, { "epoch": 2.4259555555555554, "grad_norm": 0.5390470692480126, "learning_rate": 1.0750486279611245e-06, "loss": 0.0182, "step": 20470 }, { "epoch": 2.427140740740741, "grad_norm": 0.5738117947414857, "learning_rate": 1.0707809142587294e-06, "loss": 0.0179, "step": 20480 }, { "epoch": 2.4283259259259258, "grad_norm": 0.49800061206180446, "learning_rate": 1.0665206723045125e-06, "loss": 0.0203, "step": 20490 }, { "epoch": 2.429511111111111, "grad_norm": 0.4078708010768559, "learning_rate": 1.0622679101996936e-06, "loss": 0.0155, "step": 20500 }, { "epoch": 2.430696296296296, "grad_norm": 0.5796668537692614, "learning_rate": 1.0580226360312634e-06, "loss": 0.0178, "step": 20510 }, { "epoch": 2.4318814814814815, "grad_norm": 0.6701549779120091, "learning_rate": 1.0537848578719755e-06, "loss": 0.0171, "step": 20520 }, { "epoch": 2.4330666666666665, "grad_norm": 0.6190968495996059, "learning_rate": 1.0495545837803323e-06, "loss": 0.0177, "step": 20530 }, { "epoch": 2.434251851851852, "grad_norm": 0.6537220843523749, "learning_rate": 1.0453318218005614e-06, "loss": 0.0192, "step": 20540 }, { "epoch": 2.435437037037037, "grad_norm": 0.6003166181757401, "learning_rate": 1.04111657996261e-06, "loss": 0.0187, "step": 20550 }, { "epoch": 2.4366222222222222, "grad_norm": 0.34957326165006, "learning_rate": 1.0369088662821225e-06, "loss": 0.0178, "step": 20560 }, { "epoch": 2.4378074074074076, "grad_norm": 0.6648562584701454, "learning_rate": 1.032708688760427e-06, "loss": 0.019, "step": 20570 }, { "epoch": 2.4389925925925926, "grad_norm": 0.6624831197497233, "learning_rate": 1.0285160553845253e-06, "loss": 0.0165, "step": 20580 }, { "epoch": 2.4401777777777776, "grad_norm": 0.5150473724953625, "learning_rate": 1.0243309741270691e-06, "loss": 0.0172, "step": 20590 }, { "epoch": 2.441362962962963, "grad_norm": 0.6223207998858351, "learning_rate": 1.0201534529463502e-06, "loss": 0.0189, "step": 20600 }, { "epoch": 2.4425481481481484, "grad_norm": 0.5211990941835191, "learning_rate": 1.0159834997862834e-06, "loss": 0.0179, "step": 20610 }, { "epoch": 2.4437333333333333, "grad_norm": 0.5815406834050634, "learning_rate": 1.0118211225763947e-06, "loss": 0.017, "step": 20620 }, { "epoch": 2.4449185185185183, "grad_norm": 0.6294226134977733, "learning_rate": 1.007666329231804e-06, "loss": 0.0191, "step": 20630 }, { "epoch": 2.4461037037037037, "grad_norm": 0.466524384875676, "learning_rate": 1.0035191276532075e-06, "loss": 0.0155, "step": 20640 }, { "epoch": 2.447288888888889, "grad_norm": 0.48680948326900636, "learning_rate": 9.993795257268634e-07, "loss": 0.0185, "step": 20650 }, { "epoch": 2.448474074074074, "grad_norm": 0.6282117162838744, "learning_rate": 9.952475313245847e-07, "loss": 0.0184, "step": 20660 }, { "epoch": 2.4496592592592594, "grad_norm": 0.5557566609818801, "learning_rate": 9.911231523037124e-07, "loss": 0.0171, "step": 20670 }, { "epoch": 2.4508444444444444, "grad_norm": 0.6363297164808771, "learning_rate": 9.87006396507108e-07, "loss": 0.0176, "step": 20680 }, { "epoch": 2.45202962962963, "grad_norm": 0.6418935685042145, "learning_rate": 9.828972717631357e-07, "loss": 0.0199, "step": 20690 }, { "epoch": 2.4532148148148147, "grad_norm": 0.4677998700735346, "learning_rate": 9.787957858856512e-07, "loss": 0.0182, "step": 20700 }, { "epoch": 2.4544, "grad_norm": 0.5943617063586708, "learning_rate": 9.747019466739837e-07, "loss": 0.0173, "step": 20710 }, { "epoch": 2.455585185185185, "grad_norm": 0.4774857337426721, "learning_rate": 9.706157619129202e-07, "loss": 0.0176, "step": 20720 }, { "epoch": 2.4567703703703705, "grad_norm": 0.6658074825090965, "learning_rate": 9.665372393726908e-07, "loss": 0.0202, "step": 20730 }, { "epoch": 2.4579555555555554, "grad_norm": 0.5770231594772989, "learning_rate": 9.624663868089596e-07, "loss": 0.0175, "step": 20740 }, { "epoch": 2.459140740740741, "grad_norm": 0.6103775137267466, "learning_rate": 9.584032119628017e-07, "loss": 0.0176, "step": 20750 }, { "epoch": 2.460325925925926, "grad_norm": 0.5057995711538233, "learning_rate": 9.54347722560694e-07, "loss": 0.0165, "step": 20760 }, { "epoch": 2.461511111111111, "grad_norm": 0.6415131055077778, "learning_rate": 9.502999263144969e-07, "loss": 0.019, "step": 20770 }, { "epoch": 2.462696296296296, "grad_norm": 0.6091025567475564, "learning_rate": 9.462598309214449e-07, "loss": 0.019, "step": 20780 }, { "epoch": 2.4638814814814816, "grad_norm": 0.6544818226473474, "learning_rate": 9.422274440641277e-07, "loss": 0.0184, "step": 20790 }, { "epoch": 2.4650666666666665, "grad_norm": 0.5133113890220043, "learning_rate": 9.382027734104754e-07, "loss": 0.0172, "step": 20800 }, { "epoch": 2.466251851851852, "grad_norm": 0.5838971367930164, "learning_rate": 9.341858266137449e-07, "loss": 0.0166, "step": 20810 }, { "epoch": 2.467437037037037, "grad_norm": 0.6277586066184481, "learning_rate": 9.301766113125055e-07, "loss": 0.0184, "step": 20820 }, { "epoch": 2.4686222222222223, "grad_norm": 0.395363789973833, "learning_rate": 9.261751351306269e-07, "loss": 0.0169, "step": 20830 }, { "epoch": 2.4698074074074072, "grad_norm": 0.5393064163374834, "learning_rate": 9.221814056772599e-07, "loss": 0.019, "step": 20840 }, { "epoch": 2.4709925925925926, "grad_norm": 0.5508832993128663, "learning_rate": 9.181954305468221e-07, "loss": 0.0167, "step": 20850 }, { "epoch": 2.4721777777777776, "grad_norm": 0.6446638153025966, "learning_rate": 9.142172173189912e-07, "loss": 0.0212, "step": 20860 }, { "epoch": 2.473362962962963, "grad_norm": 0.5644175929859583, "learning_rate": 9.102467735586823e-07, "loss": 0.0173, "step": 20870 }, { "epoch": 2.4745481481481484, "grad_norm": 0.7049009211193161, "learning_rate": 9.062841068160338e-07, "loss": 0.0181, "step": 20880 }, { "epoch": 2.4757333333333333, "grad_norm": 0.48408214004509653, "learning_rate": 9.023292246263992e-07, "loss": 0.017, "step": 20890 }, { "epoch": 2.4769185185185183, "grad_norm": 0.4484831048629232, "learning_rate": 8.98382134510325e-07, "loss": 0.0181, "step": 20900 }, { "epoch": 2.4781037037037037, "grad_norm": 0.509910222167136, "learning_rate": 8.944428439735448e-07, "loss": 0.0172, "step": 20910 }, { "epoch": 2.479288888888889, "grad_norm": 0.454247380281483, "learning_rate": 8.905113605069571e-07, "loss": 0.0192, "step": 20920 }, { "epoch": 2.480474074074074, "grad_norm": 0.5069856836367086, "learning_rate": 8.865876915866178e-07, "loss": 0.0178, "step": 20930 }, { "epoch": 2.4816592592592595, "grad_norm": 0.5715235756999294, "learning_rate": 8.826718446737181e-07, "loss": 0.0176, "step": 20940 }, { "epoch": 2.4828444444444444, "grad_norm": 0.48336785931290466, "learning_rate": 8.787638272145815e-07, "loss": 0.0185, "step": 20950 }, { "epoch": 2.48402962962963, "grad_norm": 0.5901594529833316, "learning_rate": 8.748636466406374e-07, "loss": 0.0188, "step": 20960 }, { "epoch": 2.4852148148148148, "grad_norm": 0.6908066539999371, "learning_rate": 8.709713103684142e-07, "loss": 0.0185, "step": 20970 }, { "epoch": 2.4864, "grad_norm": 0.6924979337226718, "learning_rate": 8.670868257995247e-07, "loss": 0.0199, "step": 20980 }, { "epoch": 2.487585185185185, "grad_norm": 0.7002005115330637, "learning_rate": 8.632102003206511e-07, "loss": 0.0187, "step": 20990 }, { "epoch": 2.4887703703703705, "grad_norm": 0.48501133777228894, "learning_rate": 8.593414413035294e-07, "loss": 0.016, "step": 21000 }, { "epoch": 2.4899555555555555, "grad_norm": 0.5651002456756551, "learning_rate": 8.554805561049395e-07, "loss": 0.0179, "step": 21010 }, { "epoch": 2.491140740740741, "grad_norm": 0.6454325498863057, "learning_rate": 8.516275520666839e-07, "loss": 0.0173, "step": 21020 }, { "epoch": 2.492325925925926, "grad_norm": 0.5595164186336209, "learning_rate": 8.477824365155851e-07, "loss": 0.0175, "step": 21030 }, { "epoch": 2.4935111111111112, "grad_norm": 0.5262821279548828, "learning_rate": 8.439452167634587e-07, "loss": 0.0169, "step": 21040 }, { "epoch": 2.494696296296296, "grad_norm": 0.45082946246916983, "learning_rate": 8.401159001071086e-07, "loss": 0.018, "step": 21050 }, { "epoch": 2.4958814814814816, "grad_norm": 0.538520539920005, "learning_rate": 8.362944938283085e-07, "loss": 0.0209, "step": 21060 }, { "epoch": 2.4970666666666665, "grad_norm": 0.6109965390008959, "learning_rate": 8.324810051937942e-07, "loss": 0.0163, "step": 21070 }, { "epoch": 2.498251851851852, "grad_norm": 0.7242181627637891, "learning_rate": 8.2867544145524e-07, "loss": 0.02, "step": 21080 }, { "epoch": 2.499437037037037, "grad_norm": 0.4948397091771907, "learning_rate": 8.248778098492549e-07, "loss": 0.0181, "step": 21090 }, { "epoch": 2.5006222222222223, "grad_norm": 0.7489629904051068, "learning_rate": 8.210881175973611e-07, "loss": 0.0181, "step": 21100 }, { "epoch": 2.5018074074074073, "grad_norm": 0.41749780441090956, "learning_rate": 8.173063719059832e-07, "loss": 0.0146, "step": 21110 }, { "epoch": 2.5029925925925927, "grad_norm": 0.4212418413036383, "learning_rate": 8.135325799664384e-07, "loss": 0.0157, "step": 21120 }, { "epoch": 2.5041777777777776, "grad_norm": 0.43280182167680353, "learning_rate": 8.097667489549161e-07, "loss": 0.0186, "step": 21130 }, { "epoch": 2.505362962962963, "grad_norm": 0.4316497829943779, "learning_rate": 8.060088860324661e-07, "loss": 0.0197, "step": 21140 }, { "epoch": 2.5065481481481484, "grad_norm": 0.6521403724281614, "learning_rate": 8.022589983449908e-07, "loss": 0.0168, "step": 21150 }, { "epoch": 2.5077333333333334, "grad_norm": 0.47889010429565065, "learning_rate": 7.985170930232216e-07, "loss": 0.0169, "step": 21160 }, { "epoch": 2.5089185185185183, "grad_norm": 0.507653225728503, "learning_rate": 7.947831771827164e-07, "loss": 0.0176, "step": 21170 }, { "epoch": 2.5101037037037037, "grad_norm": 0.4547264446765598, "learning_rate": 7.910572579238357e-07, "loss": 0.0166, "step": 21180 }, { "epoch": 2.511288888888889, "grad_norm": 0.6308250895773334, "learning_rate": 7.873393423317349e-07, "loss": 0.0184, "step": 21190 }, { "epoch": 2.512474074074074, "grad_norm": 0.5262251487920135, "learning_rate": 7.83629437476352e-07, "loss": 0.0179, "step": 21200 }, { "epoch": 2.513659259259259, "grad_norm": 0.46133876070066765, "learning_rate": 7.799275504123904e-07, "loss": 0.0187, "step": 21210 }, { "epoch": 2.5148444444444444, "grad_norm": 0.7015140106863733, "learning_rate": 7.762336881793053e-07, "loss": 0.0186, "step": 21220 }, { "epoch": 2.51602962962963, "grad_norm": 0.5958186357381842, "learning_rate": 7.725478578012929e-07, "loss": 0.0181, "step": 21230 }, { "epoch": 2.517214814814815, "grad_norm": 0.5900887500158141, "learning_rate": 7.6887006628728e-07, "loss": 0.0194, "step": 21240 }, { "epoch": 2.5183999999999997, "grad_norm": 0.45510057858318664, "learning_rate": 7.652003206309022e-07, "loss": 0.0148, "step": 21250 }, { "epoch": 2.519585185185185, "grad_norm": 0.41252168881070916, "learning_rate": 7.615386278104964e-07, "loss": 0.0163, "step": 21260 }, { "epoch": 2.5207703703703706, "grad_norm": 0.6395488212258044, "learning_rate": 7.578849947890848e-07, "loss": 0.018, "step": 21270 }, { "epoch": 2.5219555555555555, "grad_norm": 0.5389037633987491, "learning_rate": 7.542394285143684e-07, "loss": 0.0193, "step": 21280 }, { "epoch": 2.523140740740741, "grad_norm": 0.5573306194458528, "learning_rate": 7.506019359187034e-07, "loss": 0.02, "step": 21290 }, { "epoch": 2.524325925925926, "grad_norm": 0.5403072811635534, "learning_rate": 7.469725239190945e-07, "loss": 0.0166, "step": 21300 }, { "epoch": 2.5255111111111113, "grad_norm": 0.5884307506421887, "learning_rate": 7.433511994171805e-07, "loss": 0.0172, "step": 21310 }, { "epoch": 2.526696296296296, "grad_norm": 0.4863037969486824, "learning_rate": 7.397379692992224e-07, "loss": 0.0162, "step": 21320 }, { "epoch": 2.5278814814814816, "grad_norm": 0.5473849752231416, "learning_rate": 7.361328404360896e-07, "loss": 0.0186, "step": 21330 }, { "epoch": 2.5290666666666666, "grad_norm": 0.5552595876953792, "learning_rate": 7.325358196832432e-07, "loss": 0.0185, "step": 21340 }, { "epoch": 2.530251851851852, "grad_norm": 0.705056990597905, "learning_rate": 7.289469138807265e-07, "loss": 0.0184, "step": 21350 }, { "epoch": 2.531437037037037, "grad_norm": 0.43833826213742755, "learning_rate": 7.253661298531555e-07, "loss": 0.0162, "step": 21360 }, { "epoch": 2.5326222222222223, "grad_norm": 0.5135562918746531, "learning_rate": 7.217934744096977e-07, "loss": 0.0182, "step": 21370 }, { "epoch": 2.5338074074074073, "grad_norm": 0.3904585435870939, "learning_rate": 7.182289543440652e-07, "loss": 0.0169, "step": 21380 }, { "epoch": 2.5349925925925927, "grad_norm": 0.6922550774775605, "learning_rate": 7.146725764344975e-07, "loss": 0.019, "step": 21390 }, { "epoch": 2.5361777777777776, "grad_norm": 0.7244832875800394, "learning_rate": 7.111243474437551e-07, "loss": 0.0182, "step": 21400 }, { "epoch": 2.537362962962963, "grad_norm": 0.5821610690014783, "learning_rate": 7.075842741191019e-07, "loss": 0.0171, "step": 21410 }, { "epoch": 2.538548148148148, "grad_norm": 0.5620961385311694, "learning_rate": 7.040523631922907e-07, "loss": 0.0196, "step": 21420 }, { "epoch": 2.5397333333333334, "grad_norm": 0.48953769431157984, "learning_rate": 7.005286213795537e-07, "loss": 0.016, "step": 21430 }, { "epoch": 2.5409185185185184, "grad_norm": 0.34011777249262254, "learning_rate": 6.970130553815884e-07, "loss": 0.0149, "step": 21440 }, { "epoch": 2.5421037037037038, "grad_norm": 0.47990807369537847, "learning_rate": 6.935056718835487e-07, "loss": 0.0176, "step": 21450 }, { "epoch": 2.543288888888889, "grad_norm": 0.5630613010515627, "learning_rate": 6.900064775550225e-07, "loss": 0.0181, "step": 21460 }, { "epoch": 2.544474074074074, "grad_norm": 0.5257591948589118, "learning_rate": 6.865154790500323e-07, "loss": 0.0174, "step": 21470 }, { "epoch": 2.545659259259259, "grad_norm": 0.6363239781576507, "learning_rate": 6.8303268300701e-07, "loss": 0.0177, "step": 21480 }, { "epoch": 2.5468444444444445, "grad_norm": 0.4752475882989525, "learning_rate": 6.79558096048793e-07, "loss": 0.0164, "step": 21490 }, { "epoch": 2.54802962962963, "grad_norm": 0.6317766217623786, "learning_rate": 6.760917247826076e-07, "loss": 0.0191, "step": 21500 }, { "epoch": 2.549214814814815, "grad_norm": 0.6121659917330085, "learning_rate": 6.726335758000562e-07, "loss": 0.0176, "step": 21510 }, { "epoch": 2.5504, "grad_norm": 0.6697636341281199, "learning_rate": 6.691836556771058e-07, "loss": 0.0174, "step": 21520 }, { "epoch": 2.551585185185185, "grad_norm": 0.50710880388447, "learning_rate": 6.657419709740787e-07, "loss": 0.0178, "step": 21530 }, { "epoch": 2.5527703703703706, "grad_norm": 0.5110129681647883, "learning_rate": 6.623085282356323e-07, "loss": 0.0164, "step": 21540 }, { "epoch": 2.5539555555555555, "grad_norm": 0.5453865599967119, "learning_rate": 6.588833339907552e-07, "loss": 0.0171, "step": 21550 }, { "epoch": 2.5551407407407405, "grad_norm": 0.5300781123461664, "learning_rate": 6.554663947527474e-07, "loss": 0.0184, "step": 21560 }, { "epoch": 2.556325925925926, "grad_norm": 0.4783806902147297, "learning_rate": 6.520577170192144e-07, "loss": 0.0161, "step": 21570 }, { "epoch": 2.5575111111111113, "grad_norm": 0.44039738005484225, "learning_rate": 6.486573072720493e-07, "loss": 0.0179, "step": 21580 }, { "epoch": 2.5586962962962962, "grad_norm": 0.6792744994409649, "learning_rate": 6.452651719774239e-07, "loss": 0.0175, "step": 21590 }, { "epoch": 2.5598814814814816, "grad_norm": 0.5605031491370238, "learning_rate": 6.418813175857747e-07, "loss": 0.018, "step": 21600 }, { "epoch": 2.5610666666666666, "grad_norm": 0.8015951623495515, "learning_rate": 6.385057505317932e-07, "loss": 0.0189, "step": 21610 }, { "epoch": 2.562251851851852, "grad_norm": 0.6331864010093162, "learning_rate": 6.351384772344094e-07, "loss": 0.0168, "step": 21620 }, { "epoch": 2.563437037037037, "grad_norm": 0.6184904424231238, "learning_rate": 6.317795040967844e-07, "loss": 0.0187, "step": 21630 }, { "epoch": 2.5646222222222224, "grad_norm": 0.7186390751573959, "learning_rate": 6.284288375062936e-07, "loss": 0.017, "step": 21640 }, { "epoch": 2.5658074074074073, "grad_norm": 0.7152679327537902, "learning_rate": 6.250864838345195e-07, "loss": 0.016, "step": 21650 }, { "epoch": 2.5669925925925927, "grad_norm": 0.5581145556379897, "learning_rate": 6.217524494372334e-07, "loss": 0.0181, "step": 21660 }, { "epoch": 2.5681777777777777, "grad_norm": 0.4855017977285862, "learning_rate": 6.184267406543898e-07, "loss": 0.0157, "step": 21670 }, { "epoch": 2.569362962962963, "grad_norm": 0.3412304786470055, "learning_rate": 6.151093638101086e-07, "loss": 0.0175, "step": 21680 }, { "epoch": 2.570548148148148, "grad_norm": 0.6497795456381428, "learning_rate": 6.118003252126686e-07, "loss": 0.0177, "step": 21690 }, { "epoch": 2.5717333333333334, "grad_norm": 0.6205662322167813, "learning_rate": 6.084996311544905e-07, "loss": 0.0168, "step": 21700 }, { "epoch": 2.5729185185185184, "grad_norm": 0.5641759402797629, "learning_rate": 6.052072879121296e-07, "loss": 0.0175, "step": 21710 }, { "epoch": 2.574103703703704, "grad_norm": 0.6385352737210143, "learning_rate": 6.019233017462589e-07, "loss": 0.0164, "step": 21720 }, { "epoch": 2.5752888888888887, "grad_norm": 0.621744765216219, "learning_rate": 5.986476789016598e-07, "loss": 0.0187, "step": 21730 }, { "epoch": 2.576474074074074, "grad_norm": 0.5552101013667966, "learning_rate": 5.953804256072127e-07, "loss": 0.0177, "step": 21740 }, { "epoch": 2.577659259259259, "grad_norm": 0.5573477007551515, "learning_rate": 5.921215480758796e-07, "loss": 0.0181, "step": 21750 }, { "epoch": 2.5788444444444445, "grad_norm": 0.42414849129449866, "learning_rate": 5.888710525046964e-07, "loss": 0.0177, "step": 21760 }, { "epoch": 2.58002962962963, "grad_norm": 0.699740183589654, "learning_rate": 5.856289450747604e-07, "loss": 0.0179, "step": 21770 }, { "epoch": 2.581214814814815, "grad_norm": 0.5647578247178993, "learning_rate": 5.823952319512194e-07, "loss": 0.0179, "step": 21780 }, { "epoch": 2.5824, "grad_norm": 0.5556553113987365, "learning_rate": 5.791699192832556e-07, "loss": 0.0177, "step": 21790 }, { "epoch": 2.583585185185185, "grad_norm": 0.7924041878845579, "learning_rate": 5.759530132040791e-07, "loss": 0.0185, "step": 21800 }, { "epoch": 2.5847703703703706, "grad_norm": 0.6544991124887737, "learning_rate": 5.727445198309118e-07, "loss": 0.0184, "step": 21810 }, { "epoch": 2.5859555555555556, "grad_norm": 0.5375126802594471, "learning_rate": 5.695444452649829e-07, "loss": 0.0172, "step": 21820 }, { "epoch": 2.5871407407407405, "grad_norm": 0.4190573312393564, "learning_rate": 5.663527955915083e-07, "loss": 0.0178, "step": 21830 }, { "epoch": 2.588325925925926, "grad_norm": 0.5744162874932179, "learning_rate": 5.631695768796836e-07, "loss": 0.0167, "step": 21840 }, { "epoch": 2.5895111111111113, "grad_norm": 0.6786082923151243, "learning_rate": 5.599947951826718e-07, "loss": 0.0175, "step": 21850 }, { "epoch": 2.5906962962962963, "grad_norm": 0.4806957075973286, "learning_rate": 5.568284565375975e-07, "loss": 0.0182, "step": 21860 }, { "epoch": 2.5918814814814812, "grad_norm": 0.44272063982081145, "learning_rate": 5.536705669655245e-07, "loss": 0.0161, "step": 21870 }, { "epoch": 2.5930666666666666, "grad_norm": 0.6462430155813553, "learning_rate": 5.505211324714505e-07, "loss": 0.0169, "step": 21880 }, { "epoch": 2.594251851851852, "grad_norm": 0.658140325371253, "learning_rate": 5.473801590442957e-07, "loss": 0.0177, "step": 21890 }, { "epoch": 2.595437037037037, "grad_norm": 0.5055702278737, "learning_rate": 5.442476526568935e-07, "loss": 0.0179, "step": 21900 }, { "epoch": 2.5966222222222224, "grad_norm": 0.5905771934930385, "learning_rate": 5.41123619265973e-07, "loss": 0.0188, "step": 21910 }, { "epoch": 2.5978074074074073, "grad_norm": 0.6105603733912472, "learning_rate": 5.380080648121533e-07, "loss": 0.0169, "step": 21920 }, { "epoch": 2.5989925925925927, "grad_norm": 0.6795130059992837, "learning_rate": 5.349009952199269e-07, "loss": 0.0153, "step": 21930 }, { "epoch": 2.6001777777777777, "grad_norm": 0.6455111369860846, "learning_rate": 5.318024163976559e-07, "loss": 0.0172, "step": 21940 }, { "epoch": 2.601362962962963, "grad_norm": 0.5253832429760392, "learning_rate": 5.287123342375555e-07, "loss": 0.0182, "step": 21950 }, { "epoch": 2.602548148148148, "grad_norm": 0.7139651011349015, "learning_rate": 5.256307546156813e-07, "loss": 0.0165, "step": 21960 }, { "epoch": 2.6037333333333335, "grad_norm": 0.5856089789662409, "learning_rate": 5.225576833919221e-07, "loss": 0.0175, "step": 21970 }, { "epoch": 2.6049185185185184, "grad_norm": 0.5570914749299625, "learning_rate": 5.194931264099884e-07, "loss": 0.0193, "step": 21980 }, { "epoch": 2.606103703703704, "grad_norm": 0.47859589843313183, "learning_rate": 5.16437089497398e-07, "loss": 0.0159, "step": 21990 }, { "epoch": 2.6072888888888888, "grad_norm": 0.638222679336556, "learning_rate": 5.133895784654674e-07, "loss": 0.0195, "step": 22000 }, { "epoch": 2.608474074074074, "grad_norm": 0.5675545872739362, "learning_rate": 5.103505991093027e-07, "loss": 0.0174, "step": 22010 }, { "epoch": 2.609659259259259, "grad_norm": 0.5828340663934573, "learning_rate": 5.073201572077835e-07, "loss": 0.0179, "step": 22020 }, { "epoch": 2.6108444444444445, "grad_norm": 0.5984679910764497, "learning_rate": 5.042982585235562e-07, "loss": 0.0163, "step": 22030 }, { "epoch": 2.6120296296296295, "grad_norm": 0.6353571844924941, "learning_rate": 5.012849088030219e-07, "loss": 0.0176, "step": 22040 }, { "epoch": 2.613214814814815, "grad_norm": 0.5140429949902446, "learning_rate": 4.982801137763227e-07, "loss": 0.0161, "step": 22050 }, { "epoch": 2.6144, "grad_norm": 0.7124805537751441, "learning_rate": 4.952838791573361e-07, "loss": 0.0189, "step": 22060 }, { "epoch": 2.6155851851851852, "grad_norm": 0.6454204924765827, "learning_rate": 4.922962106436602e-07, "loss": 0.0176, "step": 22070 }, { "epoch": 2.6167703703703706, "grad_norm": 0.5497005684727574, "learning_rate": 4.893171139166026e-07, "loss": 0.0162, "step": 22080 }, { "epoch": 2.6179555555555556, "grad_norm": 0.4619831978468954, "learning_rate": 4.863465946411733e-07, "loss": 0.0165, "step": 22090 }, { "epoch": 2.6191407407407405, "grad_norm": 0.6178746309084407, "learning_rate": 4.83384658466069e-07, "loss": 0.0178, "step": 22100 }, { "epoch": 2.620325925925926, "grad_norm": 0.5588492409793266, "learning_rate": 4.804313110236674e-07, "loss": 0.0156, "step": 22110 }, { "epoch": 2.6215111111111113, "grad_norm": 0.6636435517100956, "learning_rate": 4.774865579300131e-07, "loss": 0.0176, "step": 22120 }, { "epoch": 2.6226962962962963, "grad_norm": 0.5201496617212275, "learning_rate": 4.745504047848065e-07, "loss": 0.0156, "step": 22130 }, { "epoch": 2.6238814814814813, "grad_norm": 0.6693252769821623, "learning_rate": 4.7162285717139434e-07, "loss": 0.0182, "step": 22140 }, { "epoch": 2.6250666666666667, "grad_norm": 0.62998544290871, "learning_rate": 4.6870392065676286e-07, "loss": 0.0177, "step": 22150 }, { "epoch": 2.626251851851852, "grad_norm": 0.5642750873920185, "learning_rate": 4.657936007915187e-07, "loss": 0.0146, "step": 22160 }, { "epoch": 2.627437037037037, "grad_norm": 0.7338902563262278, "learning_rate": 4.6289190310988776e-07, "loss": 0.0171, "step": 22170 }, { "epoch": 2.628622222222222, "grad_norm": 0.6315221885905348, "learning_rate": 4.599988331296956e-07, "loss": 0.0177, "step": 22180 }, { "epoch": 2.6298074074074074, "grad_norm": 0.5901158298625131, "learning_rate": 4.5711439635236555e-07, "loss": 0.0174, "step": 22190 }, { "epoch": 2.6309925925925928, "grad_norm": 0.6136960373187711, "learning_rate": 4.5423859826290216e-07, "loss": 0.0158, "step": 22200 }, { "epoch": 2.6321777777777777, "grad_norm": 0.7967018906772484, "learning_rate": 4.513714443298817e-07, "loss": 0.018, "step": 22210 }, { "epoch": 2.633362962962963, "grad_norm": 0.47205781155001864, "learning_rate": 4.485129400054444e-07, "loss": 0.0157, "step": 22220 }, { "epoch": 2.634548148148148, "grad_norm": 0.8096455963544056, "learning_rate": 4.456630907252829e-07, "loss": 0.0187, "step": 22230 }, { "epoch": 2.6357333333333335, "grad_norm": 0.5486232504177728, "learning_rate": 4.4282190190862993e-07, "loss": 0.0174, "step": 22240 }, { "epoch": 2.6369185185185184, "grad_norm": 0.4635635046714287, "learning_rate": 4.399893789582516e-07, "loss": 0.0176, "step": 22250 }, { "epoch": 2.638103703703704, "grad_norm": 0.5545216509684127, "learning_rate": 4.3716552726043305e-07, "loss": 0.0177, "step": 22260 }, { "epoch": 2.639288888888889, "grad_norm": 0.5796119638666397, "learning_rate": 4.343503521849718e-07, "loss": 0.017, "step": 22270 }, { "epoch": 2.640474074074074, "grad_norm": 0.6595236578032794, "learning_rate": 4.315438590851662e-07, "loss": 0.0165, "step": 22280 }, { "epoch": 2.641659259259259, "grad_norm": 0.5193183251460862, "learning_rate": 4.287460532978027e-07, "loss": 0.0179, "step": 22290 }, { "epoch": 2.6428444444444446, "grad_norm": 0.569834937754304, "learning_rate": 4.2595694014315016e-07, "loss": 0.0173, "step": 22300 }, { "epoch": 2.6440296296296295, "grad_norm": 0.5638331280692132, "learning_rate": 4.2317652492494754e-07, "loss": 0.017, "step": 22310 }, { "epoch": 2.645214814814815, "grad_norm": 0.46501048855239424, "learning_rate": 4.2040481293039445e-07, "loss": 0.0163, "step": 22320 }, { "epoch": 2.6464, "grad_norm": 0.5455056357115358, "learning_rate": 4.1764180943013876e-07, "loss": 0.0167, "step": 22330 }, { "epoch": 2.6475851851851853, "grad_norm": 0.671931031025325, "learning_rate": 4.148875196782698e-07, "loss": 0.0176, "step": 22340 }, { "epoch": 2.64877037037037, "grad_norm": 0.6138632752931288, "learning_rate": 4.1214194891230574e-07, "loss": 0.0171, "step": 22350 }, { "epoch": 2.6499555555555556, "grad_norm": 0.5863915206209542, "learning_rate": 4.09405102353187e-07, "loss": 0.0167, "step": 22360 }, { "epoch": 2.6511407407407406, "grad_norm": 0.6375120865575559, "learning_rate": 4.0667698520526155e-07, "loss": 0.0195, "step": 22370 }, { "epoch": 2.652325925925926, "grad_norm": 0.45812451800245535, "learning_rate": 4.039576026562786e-07, "loss": 0.0182, "step": 22380 }, { "epoch": 2.6535111111111114, "grad_norm": 0.5964665685487398, "learning_rate": 4.012469598773788e-07, "loss": 0.0191, "step": 22390 }, { "epoch": 2.6546962962962963, "grad_norm": 0.5435465170232701, "learning_rate": 3.985450620230841e-07, "loss": 0.018, "step": 22400 }, { "epoch": 2.6558814814814813, "grad_norm": 0.7120547530871477, "learning_rate": 3.958519142312839e-07, "loss": 0.0169, "step": 22410 }, { "epoch": 2.6570666666666667, "grad_norm": 0.4859189935609332, "learning_rate": 3.9316752162323056e-07, "loss": 0.016, "step": 22420 }, { "epoch": 2.658251851851852, "grad_norm": 0.4352345686757695, "learning_rate": 3.9049188930352697e-07, "loss": 0.0165, "step": 22430 }, { "epoch": 2.659437037037037, "grad_norm": 0.609760821369753, "learning_rate": 3.8782502236012045e-07, "loss": 0.0176, "step": 22440 }, { "epoch": 2.660622222222222, "grad_norm": 0.49217010291880653, "learning_rate": 3.851669258642865e-07, "loss": 0.0158, "step": 22450 }, { "epoch": 2.6618074074074074, "grad_norm": 0.5329534770695694, "learning_rate": 3.825176048706231e-07, "loss": 0.0186, "step": 22460 }, { "epoch": 2.662992592592593, "grad_norm": 0.5336945644131692, "learning_rate": 3.7987706441704406e-07, "loss": 0.0183, "step": 22470 }, { "epoch": 2.6641777777777778, "grad_norm": 0.4577740337806618, "learning_rate": 3.772453095247641e-07, "loss": 0.0172, "step": 22480 }, { "epoch": 2.6653629629629627, "grad_norm": 0.6782765453670982, "learning_rate": 3.7462234519829167e-07, "loss": 0.0175, "step": 22490 }, { "epoch": 2.666548148148148, "grad_norm": 0.513927387748197, "learning_rate": 3.7200817642541796e-07, "loss": 0.0175, "step": 22500 }, { "epoch": 2.6677333333333335, "grad_norm": 0.6202874387819673, "learning_rate": 3.6940280817720997e-07, "loss": 0.0177, "step": 22510 }, { "epoch": 2.6689185185185185, "grad_norm": 0.4571232350885583, "learning_rate": 3.668062454080007e-07, "loss": 0.0163, "step": 22520 }, { "epoch": 2.670103703703704, "grad_norm": 0.6740831615413266, "learning_rate": 3.6421849305537716e-07, "loss": 0.0167, "step": 22530 }, { "epoch": 2.671288888888889, "grad_norm": 0.5103458560212962, "learning_rate": 3.61639556040172e-07, "loss": 0.0183, "step": 22540 }, { "epoch": 2.6724740740740742, "grad_norm": 0.5499439087301622, "learning_rate": 3.5906943926645674e-07, "loss": 0.0167, "step": 22550 }, { "epoch": 2.673659259259259, "grad_norm": 0.5628079092199192, "learning_rate": 3.56508147621531e-07, "loss": 0.0185, "step": 22560 }, { "epoch": 2.6748444444444446, "grad_norm": 0.8432895281819219, "learning_rate": 3.539556859759097e-07, "loss": 0.0177, "step": 22570 }, { "epoch": 2.6760296296296295, "grad_norm": 0.7208958963241069, "learning_rate": 3.514120591833187e-07, "loss": 0.0171, "step": 22580 }, { "epoch": 2.677214814814815, "grad_norm": 0.5127175885306958, "learning_rate": 3.488772720806821e-07, "loss": 0.0155, "step": 22590 }, { "epoch": 2.6784, "grad_norm": 0.48825468926756144, "learning_rate": 3.463513294881171e-07, "loss": 0.0157, "step": 22600 }, { "epoch": 2.6795851851851853, "grad_norm": 0.6257502274449918, "learning_rate": 3.438342362089209e-07, "loss": 0.0192, "step": 22610 }, { "epoch": 2.6807703703703702, "grad_norm": 0.48148993324565625, "learning_rate": 3.413259970295613e-07, "loss": 0.0166, "step": 22620 }, { "epoch": 2.6819555555555556, "grad_norm": 0.6838219625893894, "learning_rate": 3.38826616719673e-07, "loss": 0.0193, "step": 22630 }, { "epoch": 2.6831407407407406, "grad_norm": 0.8094063260874216, "learning_rate": 3.3633610003204087e-07, "loss": 0.0173, "step": 22640 }, { "epoch": 2.684325925925926, "grad_norm": 0.547582980239583, "learning_rate": 3.338544517025982e-07, "loss": 0.0166, "step": 22650 }, { "epoch": 2.685511111111111, "grad_norm": 0.5256113925858257, "learning_rate": 3.313816764504124e-07, "loss": 0.0179, "step": 22660 }, { "epoch": 2.6866962962962964, "grad_norm": 0.5529086920972679, "learning_rate": 3.289177789776776e-07, "loss": 0.0153, "step": 22670 }, { "epoch": 2.6878814814814813, "grad_norm": 0.4559626979943232, "learning_rate": 3.2646276396970824e-07, "loss": 0.0155, "step": 22680 }, { "epoch": 2.6890666666666667, "grad_norm": 0.6207917578822382, "learning_rate": 3.240166360949254e-07, "loss": 0.0182, "step": 22690 }, { "epoch": 2.690251851851852, "grad_norm": 0.4603030834764436, "learning_rate": 3.2157940000485164e-07, "loss": 0.0163, "step": 22700 }, { "epoch": 2.691437037037037, "grad_norm": 0.43191313817098037, "learning_rate": 3.191510603341025e-07, "loss": 0.0173, "step": 22710 }, { "epoch": 2.692622222222222, "grad_norm": 0.5963780564991046, "learning_rate": 3.1673162170037243e-07, "loss": 0.0171, "step": 22720 }, { "epoch": 2.6938074074074074, "grad_norm": 0.5993675036379672, "learning_rate": 3.143210887044351e-07, "loss": 0.0147, "step": 22730 }, { "epoch": 2.694992592592593, "grad_norm": 0.45612814100773097, "learning_rate": 3.1191946593012447e-07, "loss": 0.0148, "step": 22740 }, { "epoch": 2.696177777777778, "grad_norm": 0.5689891280497064, "learning_rate": 3.0952675794433393e-07, "loss": 0.0181, "step": 22750 }, { "epoch": 2.6973629629629627, "grad_norm": 0.5200536521008651, "learning_rate": 3.0714296929700184e-07, "loss": 0.0185, "step": 22760 }, { "epoch": 2.698548148148148, "grad_norm": 0.5322954740029764, "learning_rate": 3.0476810452110817e-07, "loss": 0.0177, "step": 22770 }, { "epoch": 2.6997333333333335, "grad_norm": 0.6285266638084286, "learning_rate": 3.0240216813266446e-07, "loss": 0.0182, "step": 22780 }, { "epoch": 2.7009185185185185, "grad_norm": 0.556186447296804, "learning_rate": 3.0004516463070065e-07, "loss": 0.0187, "step": 22790 }, { "epoch": 2.7021037037037035, "grad_norm": 0.6181022789062538, "learning_rate": 2.976970984972616e-07, "loss": 0.0168, "step": 22800 }, { "epoch": 2.703288888888889, "grad_norm": 0.5306929150152081, "learning_rate": 2.953579741973983e-07, "loss": 0.0162, "step": 22810 }, { "epoch": 2.7044740740740743, "grad_norm": 0.49216528502252066, "learning_rate": 2.9302779617915554e-07, "loss": 0.0166, "step": 22820 }, { "epoch": 2.705659259259259, "grad_norm": 0.32464298100312095, "learning_rate": 2.907065688735683e-07, "loss": 0.0181, "step": 22830 }, { "epoch": 2.7068444444444446, "grad_norm": 0.5581773424887209, "learning_rate": 2.8839429669464846e-07, "loss": 0.0164, "step": 22840 }, { "epoch": 2.7080296296296296, "grad_norm": 0.48897977988184105, "learning_rate": 2.8609098403938164e-07, "loss": 0.0174, "step": 22850 }, { "epoch": 2.709214814814815, "grad_norm": 0.4781983191944791, "learning_rate": 2.837966352877164e-07, "loss": 0.0158, "step": 22860 }, { "epoch": 2.7104, "grad_norm": 0.6328193715799723, "learning_rate": 2.8151125480255226e-07, "loss": 0.0189, "step": 22870 }, { "epoch": 2.7115851851851853, "grad_norm": 0.6355616843512417, "learning_rate": 2.7923484692973735e-07, "loss": 0.0145, "step": 22880 }, { "epoch": 2.7127703703703703, "grad_norm": 0.5384442575779633, "learning_rate": 2.769674159980579e-07, "loss": 0.0137, "step": 22890 }, { "epoch": 2.7139555555555557, "grad_norm": 0.5257990711711436, "learning_rate": 2.7470896631922815e-07, "loss": 0.0176, "step": 22900 }, { "epoch": 2.7151407407407406, "grad_norm": 0.6028946351137432, "learning_rate": 2.7245950218788455e-07, "loss": 0.0199, "step": 22910 }, { "epoch": 2.716325925925926, "grad_norm": 0.6096999928901118, "learning_rate": 2.702190278815764e-07, "loss": 0.0181, "step": 22920 }, { "epoch": 2.717511111111111, "grad_norm": 0.5619062276042769, "learning_rate": 2.679875476607591e-07, "loss": 0.0172, "step": 22930 }, { "epoch": 2.7186962962962964, "grad_norm": 0.7190051502794982, "learning_rate": 2.657650657687844e-07, "loss": 0.0158, "step": 22940 }, { "epoch": 2.7198814814814813, "grad_norm": 0.5399562713131539, "learning_rate": 2.635515864318922e-07, "loss": 0.0166, "step": 22950 }, { "epoch": 2.7210666666666667, "grad_norm": 0.6350567185639987, "learning_rate": 2.613471138592044e-07, "loss": 0.0183, "step": 22960 }, { "epoch": 2.7222518518518517, "grad_norm": 0.44368713994260967, "learning_rate": 2.5915165224271454e-07, "loss": 0.0188, "step": 22970 }, { "epoch": 2.723437037037037, "grad_norm": 0.6828455752911726, "learning_rate": 2.569652057572825e-07, "loss": 0.016, "step": 22980 }, { "epoch": 2.724622222222222, "grad_norm": 0.6120963844436584, "learning_rate": 2.5478777856062454e-07, "loss": 0.0181, "step": 22990 }, { "epoch": 2.7258074074074075, "grad_norm": 0.557913942818718, "learning_rate": 2.526193747933048e-07, "loss": 0.0201, "step": 23000 }, { "epoch": 2.726992592592593, "grad_norm": 0.5931411968363004, "learning_rate": 2.5045999857873036e-07, "loss": 0.0157, "step": 23010 }, { "epoch": 2.728177777777778, "grad_norm": 0.5589712722631065, "learning_rate": 2.483096540231417e-07, "loss": 0.0175, "step": 23020 }, { "epoch": 2.7293629629629628, "grad_norm": 0.6251667845871217, "learning_rate": 2.461683452156033e-07, "loss": 0.0196, "step": 23030 }, { "epoch": 2.730548148148148, "grad_norm": 0.5955504426850127, "learning_rate": 2.440360762279975e-07, "loss": 0.0178, "step": 23040 }, { "epoch": 2.7317333333333336, "grad_norm": 0.6011971122442907, "learning_rate": 2.4191285111501706e-07, "loss": 0.0166, "step": 23050 }, { "epoch": 2.7329185185185185, "grad_norm": 0.5364478580154213, "learning_rate": 2.397986739141589e-07, "loss": 0.0199, "step": 23060 }, { "epoch": 2.7341037037037035, "grad_norm": 0.6608510493743801, "learning_rate": 2.376935486457116e-07, "loss": 0.017, "step": 23070 }, { "epoch": 2.735288888888889, "grad_norm": 0.5779420989406509, "learning_rate": 2.3559747931275189e-07, "loss": 0.0163, "step": 23080 }, { "epoch": 2.7364740740740743, "grad_norm": 0.4827757716026622, "learning_rate": 2.3351046990113647e-07, "loss": 0.017, "step": 23090 }, { "epoch": 2.7376592592592592, "grad_norm": 0.6263917517490031, "learning_rate": 2.314325243794935e-07, "loss": 0.0177, "step": 23100 }, { "epoch": 2.738844444444444, "grad_norm": 0.49606188057862205, "learning_rate": 2.2936364669921495e-07, "loss": 0.0185, "step": 23110 }, { "epoch": 2.7400296296296296, "grad_norm": 0.5680872282032834, "learning_rate": 2.2730384079444944e-07, "loss": 0.0158, "step": 23120 }, { "epoch": 2.741214814814815, "grad_norm": 0.5087943299401656, "learning_rate": 2.2525311058209487e-07, "loss": 0.0169, "step": 23130 }, { "epoch": 2.7424, "grad_norm": 0.6026711741849468, "learning_rate": 2.2321145996179238e-07, "loss": 0.0168, "step": 23140 }, { "epoch": 2.7435851851851853, "grad_norm": 0.584706616518232, "learning_rate": 2.2117889281591587e-07, "loss": 0.017, "step": 23150 }, { "epoch": 2.7447703703703703, "grad_norm": 0.5196194189377914, "learning_rate": 2.1915541300956522e-07, "loss": 0.0161, "step": 23160 }, { "epoch": 2.7459555555555557, "grad_norm": 0.4751551154335021, "learning_rate": 2.1714102439056306e-07, "loss": 0.0158, "step": 23170 }, { "epoch": 2.7471407407407407, "grad_norm": 0.6840118273639366, "learning_rate": 2.151357307894425e-07, "loss": 0.0159, "step": 23180 }, { "epoch": 2.748325925925926, "grad_norm": 0.5714794365600924, "learning_rate": 2.131395360194416e-07, "loss": 0.0178, "step": 23190 }, { "epoch": 2.749511111111111, "grad_norm": 0.5051383517587212, "learning_rate": 2.111524438764967e-07, "loss": 0.0158, "step": 23200 }, { "epoch": 2.7506962962962964, "grad_norm": 0.5589906619467518, "learning_rate": 2.0917445813923298e-07, "loss": 0.0172, "step": 23210 }, { "epoch": 2.7518814814814814, "grad_norm": 0.3092318250707791, "learning_rate": 2.0720558256896283e-07, "loss": 0.0163, "step": 23220 }, { "epoch": 2.7530666666666668, "grad_norm": 0.6604218732894236, "learning_rate": 2.0524582090967137e-07, "loss": 0.017, "step": 23230 }, { "epoch": 2.7542518518518517, "grad_norm": 0.40989607364580555, "learning_rate": 2.032951768880137e-07, "loss": 0.0185, "step": 23240 }, { "epoch": 2.755437037037037, "grad_norm": 0.6234290041495085, "learning_rate": 2.0135365421330765e-07, "loss": 0.0186, "step": 23250 }, { "epoch": 2.756622222222222, "grad_norm": 0.5508048860616999, "learning_rate": 1.9942125657752554e-07, "loss": 0.0156, "step": 23260 }, { "epoch": 2.7578074074074075, "grad_norm": 0.41925814638682407, "learning_rate": 1.974979876552885e-07, "loss": 0.0154, "step": 23270 }, { "epoch": 2.7589925925925924, "grad_norm": 0.5887599680600968, "learning_rate": 1.955838511038577e-07, "loss": 0.0177, "step": 23280 }, { "epoch": 2.760177777777778, "grad_norm": 0.817375285032308, "learning_rate": 1.9367885056312652e-07, "loss": 0.0184, "step": 23290 }, { "epoch": 2.761362962962963, "grad_norm": 0.4650271384049185, "learning_rate": 1.9178298965562002e-07, "loss": 0.015, "step": 23300 }, { "epoch": 2.762548148148148, "grad_norm": 0.6200371376448504, "learning_rate": 1.8989627198647942e-07, "loss": 0.0179, "step": 23310 }, { "epoch": 2.7637333333333336, "grad_norm": 0.45420199429625285, "learning_rate": 1.8801870114346143e-07, "loss": 0.0146, "step": 23320 }, { "epoch": 2.7649185185185186, "grad_norm": 0.6152694272512292, "learning_rate": 1.8615028069692788e-07, "loss": 0.0158, "step": 23330 }, { "epoch": 2.7661037037037035, "grad_norm": 0.751203655151455, "learning_rate": 1.8429101419984108e-07, "loss": 0.0167, "step": 23340 }, { "epoch": 2.767288888888889, "grad_norm": 0.5925673251198694, "learning_rate": 1.8244090518775736e-07, "loss": 0.0183, "step": 23350 }, { "epoch": 2.7684740740740743, "grad_norm": 0.603436719080171, "learning_rate": 1.8059995717881696e-07, "loss": 0.0167, "step": 23360 }, { "epoch": 2.7696592592592593, "grad_norm": 0.5048056625816433, "learning_rate": 1.7876817367374122e-07, "loss": 0.017, "step": 23370 }, { "epoch": 2.770844444444444, "grad_norm": 0.5046099810218734, "learning_rate": 1.7694555815582382e-07, "loss": 0.0146, "step": 23380 }, { "epoch": 2.7720296296296296, "grad_norm": 0.6519202532330443, "learning_rate": 1.7513211409092512e-07, "loss": 0.0165, "step": 23390 }, { "epoch": 2.773214814814815, "grad_norm": 0.8475286050857711, "learning_rate": 1.7332784492746613e-07, "loss": 0.0169, "step": 23400 }, { "epoch": 2.7744, "grad_norm": 0.49356891024625155, "learning_rate": 1.7153275409641846e-07, "loss": 0.0183, "step": 23410 }, { "epoch": 2.775585185185185, "grad_norm": 0.6423043758056829, "learning_rate": 1.6974684501130213e-07, "loss": 0.0162, "step": 23420 }, { "epoch": 2.7767703703703703, "grad_norm": 0.6958336868335069, "learning_rate": 1.6797012106817835e-07, "loss": 0.0173, "step": 23430 }, { "epoch": 2.7779555555555557, "grad_norm": 0.6514600118533722, "learning_rate": 1.662025856456384e-07, "loss": 0.0164, "step": 23440 }, { "epoch": 2.7791407407407407, "grad_norm": 0.4595749847427966, "learning_rate": 1.644442421048048e-07, "loss": 0.0153, "step": 23450 }, { "epoch": 2.780325925925926, "grad_norm": 0.7014126172602996, "learning_rate": 1.6269509378931735e-07, "loss": 0.0191, "step": 23460 }, { "epoch": 2.781511111111111, "grad_norm": 0.4983839899893056, "learning_rate": 1.6095514402533263e-07, "loss": 0.0162, "step": 23470 }, { "epoch": 2.7826962962962964, "grad_norm": 0.5959014383056567, "learning_rate": 1.592243961215162e-07, "loss": 0.0168, "step": 23480 }, { "epoch": 2.7838814814814814, "grad_norm": 0.5668109210434008, "learning_rate": 1.5750285336903314e-07, "loss": 0.0161, "step": 23490 }, { "epoch": 2.785066666666667, "grad_norm": 0.5751938851139909, "learning_rate": 1.557905190415443e-07, "loss": 0.017, "step": 23500 }, { "epoch": 2.7862518518518518, "grad_norm": 0.5617253930658811, "learning_rate": 1.540873963952022e-07, "loss": 0.0176, "step": 23510 }, { "epoch": 2.787437037037037, "grad_norm": 0.44058201089023974, "learning_rate": 1.5239348866864067e-07, "loss": 0.015, "step": 23520 }, { "epoch": 2.788622222222222, "grad_norm": 0.5411531210936671, "learning_rate": 1.5070879908297086e-07, "loss": 0.0182, "step": 23530 }, { "epoch": 2.7898074074074075, "grad_norm": 0.4955496049266531, "learning_rate": 1.4903333084177352e-07, "loss": 0.0173, "step": 23540 }, { "epoch": 2.7909925925925925, "grad_norm": 0.5460026211959926, "learning_rate": 1.4736708713109783e-07, "loss": 0.0143, "step": 23550 }, { "epoch": 2.792177777777778, "grad_norm": 0.5586886441930353, "learning_rate": 1.4571007111944924e-07, "loss": 0.016, "step": 23560 }, { "epoch": 2.793362962962963, "grad_norm": 0.5241678615965204, "learning_rate": 1.440622859577856e-07, "loss": 0.0166, "step": 23570 }, { "epoch": 2.7945481481481482, "grad_norm": 0.48602589301351173, "learning_rate": 1.4242373477951155e-07, "loss": 0.0171, "step": 23580 }, { "epoch": 2.795733333333333, "grad_norm": 0.69943218249404, "learning_rate": 1.4079442070047523e-07, "loss": 0.0171, "step": 23590 }, { "epoch": 2.7969185185185186, "grad_norm": 0.6482562437170073, "learning_rate": 1.3917434681895548e-07, "loss": 0.0177, "step": 23600 }, { "epoch": 2.7981037037037035, "grad_norm": 0.6282925922368169, "learning_rate": 1.3756351621566355e-07, "loss": 0.0179, "step": 23610 }, { "epoch": 2.799288888888889, "grad_norm": 0.6625097023367825, "learning_rate": 1.359619319537314e-07, "loss": 0.0177, "step": 23620 }, { "epoch": 2.8004740740740743, "grad_norm": 0.4700158500845828, "learning_rate": 1.3436959707870956e-07, "loss": 0.0173, "step": 23630 }, { "epoch": 2.8016592592592593, "grad_norm": 0.6880727786582241, "learning_rate": 1.3278651461856084e-07, "loss": 0.0162, "step": 23640 }, { "epoch": 2.8028444444444442, "grad_norm": 0.7156278628541548, "learning_rate": 1.3121268758365224e-07, "loss": 0.0189, "step": 23650 }, { "epoch": 2.8040296296296296, "grad_norm": 0.5556669593471263, "learning_rate": 1.2964811896675034e-07, "loss": 0.0158, "step": 23660 }, { "epoch": 2.805214814814815, "grad_norm": 0.5781255382339048, "learning_rate": 1.2809281174301747e-07, "loss": 0.0165, "step": 23670 }, { "epoch": 2.8064, "grad_norm": 0.8316892911189332, "learning_rate": 1.2654676887000504e-07, "loss": 0.0178, "step": 23680 }, { "epoch": 2.807585185185185, "grad_norm": 0.572177601547204, "learning_rate": 1.2500999328764586e-07, "loss": 0.0167, "step": 23690 }, { "epoch": 2.8087703703703704, "grad_norm": 0.5618758606010577, "learning_rate": 1.2348248791825e-07, "loss": 0.0188, "step": 23700 }, { "epoch": 2.8099555555555558, "grad_norm": 0.7582104032629382, "learning_rate": 1.2196425566650184e-07, "loss": 0.0193, "step": 23710 }, { "epoch": 2.8111407407407407, "grad_norm": 0.5130479103641089, "learning_rate": 1.2045529941945077e-07, "loss": 0.0155, "step": 23720 }, { "epoch": 2.8123259259259257, "grad_norm": 0.6502772297409193, "learning_rate": 1.1895562204650546e-07, "loss": 0.0184, "step": 23730 }, { "epoch": 2.813511111111111, "grad_norm": 0.6072052992417779, "learning_rate": 1.1746522639943304e-07, "loss": 0.0165, "step": 23740 }, { "epoch": 2.8146962962962965, "grad_norm": 0.5420393938088142, "learning_rate": 1.1598411531234755e-07, "loss": 0.0168, "step": 23750 }, { "epoch": 2.8158814814814814, "grad_norm": 0.46995870490733455, "learning_rate": 1.1451229160171051e-07, "loss": 0.0164, "step": 23760 }, { "epoch": 2.817066666666667, "grad_norm": 0.6229276720892111, "learning_rate": 1.130497580663209e-07, "loss": 0.0173, "step": 23770 }, { "epoch": 2.818251851851852, "grad_norm": 0.4008196510899579, "learning_rate": 1.1159651748731126e-07, "loss": 0.0151, "step": 23780 }, { "epoch": 2.819437037037037, "grad_norm": 0.5797283202018212, "learning_rate": 1.1015257262814493e-07, "loss": 0.0162, "step": 23790 }, { "epoch": 2.820622222222222, "grad_norm": 0.46855512418386264, "learning_rate": 1.0871792623460664e-07, "loss": 0.0178, "step": 23800 }, { "epoch": 2.8218074074074075, "grad_norm": 0.5587214472542532, "learning_rate": 1.0729258103479967e-07, "loss": 0.0169, "step": 23810 }, { "epoch": 2.8229925925925925, "grad_norm": 0.5863900549217356, "learning_rate": 1.0587653973914147e-07, "loss": 0.0177, "step": 23820 }, { "epoch": 2.824177777777778, "grad_norm": 0.5710151595819755, "learning_rate": 1.0446980504035476e-07, "loss": 0.015, "step": 23830 }, { "epoch": 2.825362962962963, "grad_norm": 0.5318866091985665, "learning_rate": 1.0307237961346861e-07, "loss": 0.0166, "step": 23840 }, { "epoch": 2.8265481481481483, "grad_norm": 0.5280222084130962, "learning_rate": 1.0168426611580629e-07, "loss": 0.0184, "step": 23850 }, { "epoch": 2.827733333333333, "grad_norm": 0.6034509847198821, "learning_rate": 1.0030546718698575e-07, "loss": 0.0157, "step": 23860 }, { "epoch": 2.8289185185185186, "grad_norm": 0.5683031124940099, "learning_rate": 9.893598544891192e-08, "loss": 0.0164, "step": 23870 }, { "epoch": 2.8301037037037036, "grad_norm": 0.6889455979805238, "learning_rate": 9.757582350577111e-08, "loss": 0.0171, "step": 23880 }, { "epoch": 2.831288888888889, "grad_norm": 0.5635138635299503, "learning_rate": 9.622498394402934e-08, "loss": 0.0167, "step": 23890 }, { "epoch": 2.832474074074074, "grad_norm": 0.6020099574580935, "learning_rate": 9.48834693324241e-08, "loss": 0.0159, "step": 23900 }, { "epoch": 2.8336592592592593, "grad_norm": 0.6375362753351225, "learning_rate": 9.35512822219603e-08, "loss": 0.0166, "step": 23910 }, { "epoch": 2.8348444444444443, "grad_norm": 0.49489541474359744, "learning_rate": 9.222842514590713e-08, "loss": 0.0158, "step": 23920 }, { "epoch": 2.8360296296296297, "grad_norm": 0.7064656880315627, "learning_rate": 9.091490061979014e-08, "loss": 0.0162, "step": 23930 }, { "epoch": 2.837214814814815, "grad_norm": 0.5192228424462412, "learning_rate": 8.961071114139075e-08, "loss": 0.0151, "step": 23940 }, { "epoch": 2.8384, "grad_norm": 0.624949159483972, "learning_rate": 8.831585919073627e-08, "loss": 0.017, "step": 23950 }, { "epoch": 2.839585185185185, "grad_norm": 0.3495923820761073, "learning_rate": 8.703034723009873e-08, "loss": 0.0161, "step": 23960 }, { "epoch": 2.8407703703703704, "grad_norm": 0.5710223194340618, "learning_rate": 8.575417770399109e-08, "loss": 0.0156, "step": 23970 }, { "epoch": 2.841955555555556, "grad_norm": 0.8247218018631985, "learning_rate": 8.448735303915879e-08, "loss": 0.0172, "step": 23980 }, { "epoch": 2.8431407407407407, "grad_norm": 0.3861681904504029, "learning_rate": 8.322987564457818e-08, "loss": 0.0144, "step": 23990 }, { "epoch": 2.8443259259259257, "grad_norm": 0.5909122223748874, "learning_rate": 8.198174791144986e-08, "loss": 0.0155, "step": 24000 }, { "epoch": 2.845511111111111, "grad_norm": 0.5328819807185622, "learning_rate": 8.074297221319694e-08, "loss": 0.0163, "step": 24010 }, { "epoch": 2.8466962962962965, "grad_norm": 0.5598500490202744, "learning_rate": 7.951355090545787e-08, "loss": 0.0163, "step": 24020 }, { "epoch": 2.8478814814814815, "grad_norm": 0.6768782769491176, "learning_rate": 7.829348632608314e-08, "loss": 0.0167, "step": 24030 }, { "epoch": 2.8490666666666664, "grad_norm": 0.6116674476283963, "learning_rate": 7.708278079513021e-08, "loss": 0.0181, "step": 24040 }, { "epoch": 2.850251851851852, "grad_norm": 0.5858764888773094, "learning_rate": 7.588143661486025e-08, "loss": 0.016, "step": 24050 }, { "epoch": 2.851437037037037, "grad_norm": 0.5092432128430017, "learning_rate": 7.468945606973254e-08, "loss": 0.0173, "step": 24060 }, { "epoch": 2.852622222222222, "grad_norm": 0.6061761093394289, "learning_rate": 7.350684142640008e-08, "loss": 0.0154, "step": 24070 }, { "epoch": 2.8538074074074076, "grad_norm": 0.6773936782955857, "learning_rate": 7.233359493370673e-08, "loss": 0.0159, "step": 24080 }, { "epoch": 2.8549925925925925, "grad_norm": 0.8124479635404589, "learning_rate": 7.11697188226812e-08, "loss": 0.0191, "step": 24090 }, { "epoch": 2.856177777777778, "grad_norm": 0.5976595288941503, "learning_rate": 7.001521530653533e-08, "loss": 0.0179, "step": 24100 }, { "epoch": 2.857362962962963, "grad_norm": 0.49519100271791844, "learning_rate": 6.887008658065631e-08, "loss": 0.0162, "step": 24110 }, { "epoch": 2.8585481481481483, "grad_norm": 0.6200909844159384, "learning_rate": 6.773433482260394e-08, "loss": 0.0178, "step": 24120 }, { "epoch": 2.8597333333333332, "grad_norm": 0.6527282945023417, "learning_rate": 6.660796219210897e-08, "loss": 0.0163, "step": 24130 }, { "epoch": 2.8609185185185186, "grad_norm": 0.5283155539315686, "learning_rate": 6.549097083106582e-08, "loss": 0.0156, "step": 24140 }, { "epoch": 2.8621037037037036, "grad_norm": 0.5314188447916525, "learning_rate": 6.43833628635293e-08, "loss": 0.0152, "step": 24150 }, { "epoch": 2.863288888888889, "grad_norm": 0.5165313753051062, "learning_rate": 6.328514039571133e-08, "loss": 0.015, "step": 24160 }, { "epoch": 2.864474074074074, "grad_norm": 0.6981501133944453, "learning_rate": 6.219630551597633e-08, "loss": 0.017, "step": 24170 }, { "epoch": 2.8656592592592594, "grad_norm": 0.5857384358510265, "learning_rate": 6.11168602948381e-08, "loss": 0.0167, "step": 24180 }, { "epoch": 2.8668444444444443, "grad_norm": 0.4979088167867476, "learning_rate": 6.004680678495412e-08, "loss": 0.0172, "step": 24190 }, { "epoch": 2.8680296296296297, "grad_norm": 0.46960980353248033, "learning_rate": 5.898614702112282e-08, "loss": 0.0154, "step": 24200 }, { "epoch": 2.8692148148148147, "grad_norm": 0.4839037710464162, "learning_rate": 5.7934883020281385e-08, "loss": 0.0162, "step": 24210 }, { "epoch": 2.8704, "grad_norm": 0.5710498559029004, "learning_rate": 5.689301678149739e-08, "loss": 0.0174, "step": 24220 }, { "epoch": 2.871585185185185, "grad_norm": 0.7715704542703485, "learning_rate": 5.5860550285969925e-08, "loss": 0.0155, "step": 24230 }, { "epoch": 2.8727703703703704, "grad_norm": 0.6549835873997798, "learning_rate": 5.4837485497021836e-08, "loss": 0.0175, "step": 24240 }, { "epoch": 2.873955555555556, "grad_norm": 0.6368264840592855, "learning_rate": 5.3823824360099695e-08, "loss": 0.0164, "step": 24250 }, { "epoch": 2.8751407407407408, "grad_norm": 0.5693220519768152, "learning_rate": 5.281956880276773e-08, "loss": 0.0158, "step": 24260 }, { "epoch": 2.8763259259259257, "grad_norm": 0.451065093785236, "learning_rate": 5.1824720734703904e-08, "loss": 0.0159, "step": 24270 }, { "epoch": 2.877511111111111, "grad_norm": 0.6882340036966536, "learning_rate": 5.0839282047697166e-08, "loss": 0.0179, "step": 24280 }, { "epoch": 2.8786962962962965, "grad_norm": 0.6484915264109841, "learning_rate": 4.9863254615643567e-08, "loss": 0.018, "step": 24290 }, { "epoch": 2.8798814814814815, "grad_norm": 0.7041913458246905, "learning_rate": 4.889664029454455e-08, "loss": 0.0171, "step": 24300 }, { "epoch": 2.8810666666666664, "grad_norm": 0.6484233749915392, "learning_rate": 4.7939440922499246e-08, "loss": 0.0168, "step": 24310 }, { "epoch": 2.882251851851852, "grad_norm": 0.5761611738707825, "learning_rate": 4.699165831970498e-08, "loss": 0.0154, "step": 24320 }, { "epoch": 2.8834370370370372, "grad_norm": 0.5150081576364847, "learning_rate": 4.605329428845229e-08, "loss": 0.0172, "step": 24330 }, { "epoch": 2.884622222222222, "grad_norm": 0.736701280742616, "learning_rate": 4.512435061312104e-08, "loss": 0.0192, "step": 24340 }, { "epoch": 2.885807407407407, "grad_norm": 0.5374835822662828, "learning_rate": 4.420482906017709e-08, "loss": 0.0144, "step": 24350 }, { "epoch": 2.8869925925925926, "grad_norm": 0.5026084270087791, "learning_rate": 4.3294731378170084e-08, "loss": 0.0164, "step": 24360 }, { "epoch": 2.888177777777778, "grad_norm": 0.5409240192705762, "learning_rate": 4.2394059297728995e-08, "loss": 0.0163, "step": 24370 }, { "epoch": 2.889362962962963, "grad_norm": 0.6043113925441805, "learning_rate": 4.1502814531559356e-08, "loss": 0.0159, "step": 24380 }, { "epoch": 2.8905481481481483, "grad_norm": 0.5175666423448382, "learning_rate": 4.0620998774439924e-08, "loss": 0.0163, "step": 24390 }, { "epoch": 2.8917333333333333, "grad_norm": 0.5385813279903557, "learning_rate": 3.974861370321881e-08, "loss": 0.0162, "step": 24400 }, { "epoch": 2.8929185185185187, "grad_norm": 0.5580944229468465, "learning_rate": 3.888566097681123e-08, "loss": 0.0164, "step": 24410 }, { "epoch": 2.8941037037037036, "grad_norm": 0.49545452069407303, "learning_rate": 3.803214223619733e-08, "loss": 0.0147, "step": 24420 }, { "epoch": 2.895288888888889, "grad_norm": 0.5715448334029454, "learning_rate": 3.718805910441492e-08, "loss": 0.0148, "step": 24430 }, { "epoch": 2.896474074074074, "grad_norm": 0.6057299346285372, "learning_rate": 3.635341318656116e-08, "loss": 0.016, "step": 24440 }, { "epoch": 2.8976592592592594, "grad_norm": 0.5284054439199934, "learning_rate": 3.552820606978757e-08, "loss": 0.0177, "step": 24450 }, { "epoch": 2.8988444444444443, "grad_norm": 0.5129698528135394, "learning_rate": 3.47124393232956e-08, "loss": 0.0161, "step": 24460 }, { "epoch": 2.9000296296296297, "grad_norm": 0.5146787976503923, "learning_rate": 3.3906114498336584e-08, "loss": 0.0154, "step": 24470 }, { "epoch": 2.9012148148148147, "grad_norm": 0.5140974899889613, "learning_rate": 3.3109233128206795e-08, "loss": 0.0172, "step": 24480 }, { "epoch": 2.9024, "grad_norm": 0.6767296358528045, "learning_rate": 3.232179672824409e-08, "loss": 0.0166, "step": 24490 }, { "epoch": 2.903585185185185, "grad_norm": 0.8378371116054654, "learning_rate": 3.154380679582625e-08, "loss": 0.0172, "step": 24500 }, { "epoch": 2.9047703703703704, "grad_norm": 0.6451680075290084, "learning_rate": 3.077526481036874e-08, "loss": 0.0164, "step": 24510 }, { "epoch": 2.9059555555555554, "grad_norm": 0.3349324791713931, "learning_rate": 3.0016172233320874e-08, "loss": 0.0173, "step": 24520 }, { "epoch": 2.907140740740741, "grad_norm": 0.578570728807988, "learning_rate": 2.926653050816075e-08, "loss": 0.0182, "step": 24530 }, { "epoch": 2.9083259259259258, "grad_norm": 0.5000505966884319, "learning_rate": 2.8526341060398088e-08, "loss": 0.016, "step": 24540 }, { "epoch": 2.909511111111111, "grad_norm": 0.5381103410094421, "learning_rate": 2.779560529756642e-08, "loss": 0.0162, "step": 24550 }, { "epoch": 2.9106962962962966, "grad_norm": 0.3671637354012208, "learning_rate": 2.7074324609222547e-08, "loss": 0.0157, "step": 24560 }, { "epoch": 2.9118814814814815, "grad_norm": 0.4515150195230591, "learning_rate": 2.6362500366943767e-08, "loss": 0.0151, "step": 24570 }, { "epoch": 2.9130666666666665, "grad_norm": 0.5705747626911236, "learning_rate": 2.5660133924324537e-08, "loss": 0.0192, "step": 24580 }, { "epoch": 2.914251851851852, "grad_norm": 0.5437557684872543, "learning_rate": 2.496722661697648e-08, "loss": 0.0146, "step": 24590 }, { "epoch": 2.9154370370370373, "grad_norm": 0.6602699080095603, "learning_rate": 2.428377976252172e-08, "loss": 0.016, "step": 24600 }, { "epoch": 2.9166222222222222, "grad_norm": 0.6104992900519459, "learning_rate": 2.3609794660592877e-08, "loss": 0.0177, "step": 24610 }, { "epoch": 2.917807407407407, "grad_norm": 0.5525277045559269, "learning_rate": 2.2945272592830858e-08, "loss": 0.0166, "step": 24620 }, { "epoch": 2.9189925925925926, "grad_norm": 0.5886038288281357, "learning_rate": 2.229021482288152e-08, "loss": 0.0193, "step": 24630 }, { "epoch": 2.920177777777778, "grad_norm": 0.6711393203152322, "learning_rate": 2.1644622596393994e-08, "loss": 0.0162, "step": 24640 }, { "epoch": 2.921362962962963, "grad_norm": 0.45041506476044624, "learning_rate": 2.1008497141017382e-08, "loss": 0.016, "step": 24650 }, { "epoch": 2.922548148148148, "grad_norm": 0.6798601293739935, "learning_rate": 2.0381839666398508e-08, "loss": 0.0167, "step": 24660 }, { "epoch": 2.9237333333333333, "grad_norm": 0.47277047034364483, "learning_rate": 1.976465136418082e-08, "loss": 0.016, "step": 24670 }, { "epoch": 2.9249185185185187, "grad_norm": 0.5141826359565288, "learning_rate": 1.9156933408001066e-08, "loss": 0.0157, "step": 24680 }, { "epoch": 2.9261037037037037, "grad_norm": 0.5528612372626126, "learning_rate": 1.8558686953486503e-08, "loss": 0.0147, "step": 24690 }, { "epoch": 2.927288888888889, "grad_norm": 0.5659139736954499, "learning_rate": 1.796991313825491e-08, "loss": 0.0161, "step": 24700 }, { "epoch": 2.928474074074074, "grad_norm": 0.6033772942545749, "learning_rate": 1.7390613081910702e-08, "loss": 0.017, "step": 24710 }, { "epoch": 2.9296592592592594, "grad_norm": 0.6439271250362896, "learning_rate": 1.6820787886042134e-08, "loss": 0.0168, "step": 24720 }, { "epoch": 2.9308444444444444, "grad_norm": 0.7152694904135216, "learning_rate": 1.6260438634220775e-08, "loss": 0.0167, "step": 24730 }, { "epoch": 2.9320296296296298, "grad_norm": 0.6656290893537492, "learning_rate": 1.5709566391999275e-08, "loss": 0.0159, "step": 24740 }, { "epoch": 2.9332148148148147, "grad_norm": 0.5699462608892579, "learning_rate": 1.5168172206908582e-08, "loss": 0.0168, "step": 24750 }, { "epoch": 2.9344, "grad_norm": 0.4624562892237229, "learning_rate": 1.4636257108456286e-08, "loss": 0.0168, "step": 24760 }, { "epoch": 2.935585185185185, "grad_norm": 0.5308880655766132, "learning_rate": 1.4113822108124953e-08, "loss": 0.0167, "step": 24770 }, { "epoch": 2.9367703703703705, "grad_norm": 0.6806466306029003, "learning_rate": 1.3600868199369344e-08, "loss": 0.0179, "step": 24780 }, { "epoch": 2.9379555555555554, "grad_norm": 0.5719645729463473, "learning_rate": 1.309739635761531e-08, "loss": 0.0159, "step": 24790 }, { "epoch": 2.939140740740741, "grad_norm": 0.6523860091106538, "learning_rate": 1.2603407540258127e-08, "loss": 0.0165, "step": 24800 }, { "epoch": 2.940325925925926, "grad_norm": 0.5894060850266579, "learning_rate": 1.2118902686659717e-08, "loss": 0.0174, "step": 24810 }, { "epoch": 2.941511111111111, "grad_norm": 0.5738391637533752, "learning_rate": 1.1643882718148648e-08, "loss": 0.0177, "step": 24820 }, { "epoch": 2.942696296296296, "grad_norm": 0.6638923418593473, "learning_rate": 1.1178348538015138e-08, "loss": 0.0159, "step": 24830 }, { "epoch": 2.9438814814814815, "grad_norm": 0.5647052149594103, "learning_rate": 1.0722301031513282e-08, "loss": 0.0166, "step": 24840 }, { "epoch": 2.9450666666666665, "grad_norm": 0.5233472219687685, "learning_rate": 1.0275741065856604e-08, "loss": 0.0169, "step": 24850 }, { "epoch": 2.946251851851852, "grad_norm": 0.7171188069712527, "learning_rate": 9.838669490216945e-09, "loss": 0.0182, "step": 24860 }, { "epoch": 2.9474370370370373, "grad_norm": 0.600770659012363, "learning_rate": 9.411087135723362e-09, "loss": 0.017, "step": 24870 }, { "epoch": 2.9486222222222223, "grad_norm": 0.5633870929194631, "learning_rate": 8.99299481546101e-09, "loss": 0.0173, "step": 24880 }, { "epoch": 2.949807407407407, "grad_norm": 0.4735586805047879, "learning_rate": 8.584393324468365e-09, "loss": 0.0161, "step": 24890 }, { "epoch": 2.9509925925925926, "grad_norm": 0.7390374651538125, "learning_rate": 8.185283439735569e-09, "loss": 0.0159, "step": 24900 }, { "epoch": 2.952177777777778, "grad_norm": 0.6411091422246836, "learning_rate": 7.795665920205531e-09, "loss": 0.018, "step": 24910 }, { "epoch": 2.953362962962963, "grad_norm": 0.6960313565978423, "learning_rate": 7.415541506768931e-09, "loss": 0.0188, "step": 24920 }, { "epoch": 2.954548148148148, "grad_norm": 0.5757497526401953, "learning_rate": 7.044910922264781e-09, "loss": 0.0158, "step": 24930 }, { "epoch": 2.9557333333333333, "grad_norm": 0.5141356521997997, "learning_rate": 6.6837748714793095e-09, "loss": 0.0154, "step": 24940 }, { "epoch": 2.9569185185185187, "grad_norm": 0.544039733775537, "learning_rate": 6.332134041143745e-09, "loss": 0.0173, "step": 24950 }, { "epoch": 2.9581037037037037, "grad_norm": 0.573620916072217, "learning_rate": 5.989989099933757e-09, "loss": 0.0167, "step": 24960 }, { "epoch": 2.9592888888888886, "grad_norm": 0.7128676484272533, "learning_rate": 5.657340698466684e-09, "loss": 0.0174, "step": 24970 }, { "epoch": 2.960474074074074, "grad_norm": 0.635586370758074, "learning_rate": 5.334189469302642e-09, "loss": 0.0162, "step": 24980 }, { "epoch": 2.9616592592592594, "grad_norm": 0.5069301117432712, "learning_rate": 5.0205360269411916e-09, "loss": 0.0152, "step": 24990 }, { "epoch": 2.9628444444444444, "grad_norm": 0.5774614011438758, "learning_rate": 4.716380967821344e-09, "loss": 0.0183, "step": 25000 }, { "epoch": 2.96402962962963, "grad_norm": 0.563634931843291, "learning_rate": 4.421724870320443e-09, "loss": 0.0166, "step": 25010 }, { "epoch": 2.9652148148148147, "grad_norm": 0.6115473574967998, "learning_rate": 4.1365682947525074e-09, "loss": 0.0145, "step": 25020 }, { "epoch": 2.9664, "grad_norm": 0.5380579078625547, "learning_rate": 3.860911783366561e-09, "loss": 0.0189, "step": 25030 }, { "epoch": 2.967585185185185, "grad_norm": 0.5822851026345084, "learning_rate": 3.594755860347743e-09, "loss": 0.0167, "step": 25040 }, { "epoch": 2.9687703703703705, "grad_norm": 0.4950849801867146, "learning_rate": 3.3381010318139783e-09, "loss": 0.0182, "step": 25050 }, { "epoch": 2.9699555555555555, "grad_norm": 0.5265732260876559, "learning_rate": 3.090947785817089e-09, "loss": 0.0149, "step": 25060 }, { "epoch": 2.971140740740741, "grad_norm": 0.5169416930697963, "learning_rate": 2.8532965923400158e-09, "loss": 0.0149, "step": 25070 }, { "epoch": 2.972325925925926, "grad_norm": 0.5200164411496547, "learning_rate": 2.625147903297376e-09, "loss": 0.0167, "step": 25080 }, { "epoch": 2.973511111111111, "grad_norm": 0.5693745990291242, "learning_rate": 2.4065021525326858e-09, "loss": 0.018, "step": 25090 }, { "epoch": 2.974696296296296, "grad_norm": 0.6617362942114778, "learning_rate": 2.1973597558200278e-09, "loss": 0.0173, "step": 25100 }, { "epoch": 2.9758814814814816, "grad_norm": 0.6414046448930827, "learning_rate": 1.9977211108612726e-09, "loss": 0.0162, "step": 25110 }, { "epoch": 2.9770666666666665, "grad_norm": 0.6120131032260693, "learning_rate": 1.807586597287747e-09, "loss": 0.0186, "step": 25120 }, { "epoch": 2.978251851851852, "grad_norm": 0.44101965724272346, "learning_rate": 1.6269565766552365e-09, "loss": 0.0147, "step": 25130 }, { "epoch": 2.979437037037037, "grad_norm": 0.5910797793064148, "learning_rate": 1.4558313924478705e-09, "loss": 0.0165, "step": 25140 }, { "epoch": 2.9806222222222223, "grad_norm": 0.5246736687335397, "learning_rate": 1.2942113700747938e-09, "loss": 0.0162, "step": 25150 }, { "epoch": 2.9818074074074072, "grad_norm": 0.5785101327575917, "learning_rate": 1.142096816870164e-09, "loss": 0.0149, "step": 25160 }, { "epoch": 2.9829925925925926, "grad_norm": 0.7034997441291818, "learning_rate": 9.994880220937086e-10, "loss": 0.0156, "step": 25170 }, { "epoch": 2.984177777777778, "grad_norm": 0.48543867906840543, "learning_rate": 8.663852569273934e-10, "loss": 0.0163, "step": 25180 }, { "epoch": 2.985362962962963, "grad_norm": 0.5269357130671872, "learning_rate": 7.42788774477643e-10, "loss": 0.0176, "step": 25190 }, { "epoch": 2.986548148148148, "grad_norm": 0.6630914633220515, "learning_rate": 6.286988097747859e-10, "loss": 0.0176, "step": 25200 }, { "epoch": 2.9877333333333334, "grad_norm": 0.748656324329734, "learning_rate": 5.241155797691688e-10, "loss": 0.0165, "step": 25210 }, { "epoch": 2.9889185185185188, "grad_norm": 0.5430103134833101, "learning_rate": 4.290392833361523e-10, "loss": 0.0156, "step": 25220 }, { "epoch": 2.9901037037037037, "grad_norm": 0.5716313983233334, "learning_rate": 3.4347010127111504e-10, "loss": 0.0151, "step": 25230 }, { "epoch": 2.9912888888888887, "grad_norm": 0.6888188004556484, "learning_rate": 2.674081962905639e-10, "loss": 0.016, "step": 25240 }, { "epoch": 2.992474074074074, "grad_norm": 0.7073514899610959, "learning_rate": 2.0085371303379953e-10, "loss": 0.0183, "step": 25250 }, { "epoch": 2.9936592592592595, "grad_norm": 0.5561703242395853, "learning_rate": 1.438067780590302e-10, "loss": 0.0166, "step": 25260 }, { "epoch": 2.9948444444444444, "grad_norm": 0.5512080757144833, "learning_rate": 9.62674998467028e-11, "loss": 0.0178, "step": 25270 }, { "epoch": 2.9960296296296294, "grad_norm": 0.6813051570043652, "learning_rate": 5.8235968796172e-11, "loss": 0.0167, "step": 25280 }, { "epoch": 2.9972148148148148, "grad_norm": 0.46741913471786667, "learning_rate": 2.9712257227920704e-11, "loss": 0.0166, "step": 25290 }, { "epoch": 2.9984, "grad_norm": 0.6586403960498411, "learning_rate": 1.0696419381894807e-11, "loss": 0.0173, "step": 25300 }, { "epoch": 2.999585185185185, "grad_norm": 0.6044512529157666, "learning_rate": 1.188491419168436e-12, "loss": 0.0168, "step": 25310 }, { "epoch": 3.0, "step": 25314, "total_flos": 1062096120446976.0, "train_loss": 0.055281539885637, "train_runtime": 108497.1176, "train_samples_per_second": 7.465, "train_steps_per_second": 0.233 } ], "logging_steps": 10, "max_steps": 25314, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1062096120446976.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }