diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17760 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 25314, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011851851851851852, + "grad_norm": 16.64023253730519, + "learning_rate": 3.5545023696682464e-08, + "loss": 1.3476, + "step": 10 + }, + { + "epoch": 0.0023703703703703703, + "grad_norm": 16.002172660344485, + "learning_rate": 7.50394944707741e-08, + "loss": 1.3427, + "step": 20 + }, + { + "epoch": 0.0035555555555555557, + "grad_norm": 15.506725945818424, + "learning_rate": 1.1453396524486573e-07, + "loss": 1.3183, + "step": 30 + }, + { + "epoch": 0.004740740740740741, + "grad_norm": 16.92291845738296, + "learning_rate": 1.5402843601895734e-07, + "loss": 1.3252, + "step": 40 + }, + { + "epoch": 0.005925925925925926, + "grad_norm": 14.292359902439756, + "learning_rate": 1.93522906793049e-07, + "loss": 1.2769, + "step": 50 + }, + { + "epoch": 0.0071111111111111115, + "grad_norm": 14.435536849846146, + "learning_rate": 2.3301737756714062e-07, + "loss": 1.2532, + "step": 60 + }, + { + "epoch": 0.008296296296296296, + "grad_norm": 12.052509163020323, + "learning_rate": 2.7251184834123223e-07, + "loss": 1.1041, + "step": 70 + }, + { + "epoch": 0.009481481481481481, + "grad_norm": 12.68662755756317, + "learning_rate": 3.1200631911532384e-07, + "loss": 1.0185, + "step": 80 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 6.810926008102391, + "learning_rate": 3.515007898894155e-07, + "loss": 0.7842, + "step": 90 + }, + { + "epoch": 0.011851851851851851, + "grad_norm": 3.355654756862213, + "learning_rate": 3.9099526066350717e-07, + "loss": 0.6063, + "step": 100 + }, + { + "epoch": 0.013037037037037036, + "grad_norm": 3.633581940082918, + "learning_rate": 4.304897314375988e-07, + "loss": 0.5658, + "step": 110 + }, + { + "epoch": 0.014222222222222223, + "grad_norm": 2.6701078724792904, + "learning_rate": 4.699842022116904e-07, + "loss": 0.5108, + "step": 120 + }, + { + "epoch": 0.015407407407407408, + "grad_norm": 2.62643529952277, + "learning_rate": 5.09478672985782e-07, + "loss": 0.4698, + "step": 130 + }, + { + "epoch": 0.016592592592592593, + "grad_norm": 2.6826669411135797, + "learning_rate": 5.489731437598736e-07, + "loss": 0.4291, + "step": 140 + }, + { + "epoch": 0.017777777777777778, + "grad_norm": 2.355112607468174, + "learning_rate": 5.884676145339653e-07, + "loss": 0.3999, + "step": 150 + }, + { + "epoch": 0.018962962962962963, + "grad_norm": 2.4239938265335796, + "learning_rate": 6.279620853080568e-07, + "loss": 0.3843, + "step": 160 + }, + { + "epoch": 0.020148148148148148, + "grad_norm": 2.5663214184292253, + "learning_rate": 6.674565560821486e-07, + "loss": 0.3748, + "step": 170 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 2.1224331420495797, + "learning_rate": 7.069510268562402e-07, + "loss": 0.3596, + "step": 180 + }, + { + "epoch": 0.022518518518518518, + "grad_norm": 2.071802554656357, + "learning_rate": 7.464454976303318e-07, + "loss": 0.3474, + "step": 190 + }, + { + "epoch": 0.023703703703703703, + "grad_norm": 2.0743830333800615, + "learning_rate": 7.859399684044235e-07, + "loss": 0.3394, + "step": 200 + }, + { + "epoch": 0.024888888888888887, + "grad_norm": 2.4818874394855936, + "learning_rate": 8.25434439178515e-07, + "loss": 0.3304, + "step": 210 + }, + { + "epoch": 0.026074074074074072, + "grad_norm": 2.30314973212301, + "learning_rate": 8.649289099526067e-07, + "loss": 0.3142, + "step": 220 + }, + { + "epoch": 0.02725925925925926, + "grad_norm": 2.9283982412163843, + "learning_rate": 9.044233807266983e-07, + "loss": 0.3068, + "step": 230 + }, + { + "epoch": 0.028444444444444446, + "grad_norm": 2.5348809727151957, + "learning_rate": 9.4391785150079e-07, + "loss": 0.2993, + "step": 240 + }, + { + "epoch": 0.02962962962962963, + "grad_norm": 2.502806735266231, + "learning_rate": 9.834123222748817e-07, + "loss": 0.2972, + "step": 250 + }, + { + "epoch": 0.030814814814814816, + "grad_norm": 2.4575479996591376, + "learning_rate": 1.0229067930489733e-06, + "loss": 0.2997, + "step": 260 + }, + { + "epoch": 0.032, + "grad_norm": 3.0224786024371637, + "learning_rate": 1.0624012638230649e-06, + "loss": 0.3, + "step": 270 + }, + { + "epoch": 0.033185185185185186, + "grad_norm": 2.3998759346501, + "learning_rate": 1.1018957345971565e-06, + "loss": 0.2874, + "step": 280 + }, + { + "epoch": 0.03437037037037037, + "grad_norm": 2.8989319371077937, + "learning_rate": 1.1413902053712481e-06, + "loss": 0.2792, + "step": 290 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 2.977853180171724, + "learning_rate": 1.1808846761453397e-06, + "loss": 0.2738, + "step": 300 + }, + { + "epoch": 0.03674074074074074, + "grad_norm": 2.6417383181651575, + "learning_rate": 1.2203791469194313e-06, + "loss": 0.2677, + "step": 310 + }, + { + "epoch": 0.037925925925925925, + "grad_norm": 2.728345528086855, + "learning_rate": 1.2598736176935232e-06, + "loss": 0.274, + "step": 320 + }, + { + "epoch": 0.03911111111111111, + "grad_norm": 3.309747933278777, + "learning_rate": 1.2993680884676146e-06, + "loss": 0.2595, + "step": 330 + }, + { + "epoch": 0.040296296296296295, + "grad_norm": 2.8164913457778065, + "learning_rate": 1.3388625592417062e-06, + "loss": 0.2646, + "step": 340 + }, + { + "epoch": 0.04148148148148148, + "grad_norm": 2.1205658345135885, + "learning_rate": 1.3783570300157978e-06, + "loss": 0.2613, + "step": 350 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 2.8823508159607307, + "learning_rate": 1.4178515007898896e-06, + "loss": 0.2506, + "step": 360 + }, + { + "epoch": 0.04385185185185185, + "grad_norm": 2.4601971464607777, + "learning_rate": 1.4573459715639812e-06, + "loss": 0.2585, + "step": 370 + }, + { + "epoch": 0.045037037037037035, + "grad_norm": 2.4232969328222387, + "learning_rate": 1.4968404423380728e-06, + "loss": 0.2534, + "step": 380 + }, + { + "epoch": 0.04622222222222222, + "grad_norm": 2.937713487379996, + "learning_rate": 1.5363349131121644e-06, + "loss": 0.2452, + "step": 390 + }, + { + "epoch": 0.047407407407407405, + "grad_norm": 2.2762489237517314, + "learning_rate": 1.5758293838862558e-06, + "loss": 0.2448, + "step": 400 + }, + { + "epoch": 0.04859259259259259, + "grad_norm": 2.4510732920620466, + "learning_rate": 1.6153238546603479e-06, + "loss": 0.2446, + "step": 410 + }, + { + "epoch": 0.049777777777777775, + "grad_norm": 2.2204344270714134, + "learning_rate": 1.6548183254344393e-06, + "loss": 0.236, + "step": 420 + }, + { + "epoch": 0.05096296296296296, + "grad_norm": 2.6698538935764695, + "learning_rate": 1.694312796208531e-06, + "loss": 0.2339, + "step": 430 + }, + { + "epoch": 0.052148148148148145, + "grad_norm": 2.267477161633137, + "learning_rate": 1.7338072669826225e-06, + "loss": 0.2427, + "step": 440 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 2.596053948214697, + "learning_rate": 1.7733017377567141e-06, + "loss": 0.2286, + "step": 450 + }, + { + "epoch": 0.05451851851851852, + "grad_norm": 2.3770694649836237, + "learning_rate": 1.812796208530806e-06, + "loss": 0.2357, + "step": 460 + }, + { + "epoch": 0.05570370370370371, + "grad_norm": 2.2607282274650626, + "learning_rate": 1.8522906793048976e-06, + "loss": 0.2398, + "step": 470 + }, + { + "epoch": 0.05688888888888889, + "grad_norm": 2.904477627992424, + "learning_rate": 1.8917851500789892e-06, + "loss": 0.2229, + "step": 480 + }, + { + "epoch": 0.05807407407407408, + "grad_norm": 2.619241089181291, + "learning_rate": 1.9312796208530806e-06, + "loss": 0.2302, + "step": 490 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 2.058787571644914, + "learning_rate": 1.9707740916271724e-06, + "loss": 0.2252, + "step": 500 + }, + { + "epoch": 0.060444444444444446, + "grad_norm": 2.221434893935856, + "learning_rate": 2.0102685624012642e-06, + "loss": 0.2219, + "step": 510 + }, + { + "epoch": 0.06162962962962963, + "grad_norm": 3.0717598858918174, + "learning_rate": 2.0497630331753556e-06, + "loss": 0.2147, + "step": 520 + }, + { + "epoch": 0.06281481481481481, + "grad_norm": 2.2735367678402123, + "learning_rate": 2.0892575039494474e-06, + "loss": 0.2165, + "step": 530 + }, + { + "epoch": 0.064, + "grad_norm": 2.1138430179189736, + "learning_rate": 2.128751974723539e-06, + "loss": 0.225, + "step": 540 + }, + { + "epoch": 0.06518518518518518, + "grad_norm": 2.107474761283569, + "learning_rate": 2.1682464454976302e-06, + "loss": 0.2102, + "step": 550 + }, + { + "epoch": 0.06637037037037037, + "grad_norm": 2.1363084310333305, + "learning_rate": 2.207740916271722e-06, + "loss": 0.2074, + "step": 560 + }, + { + "epoch": 0.06755555555555555, + "grad_norm": 2.2676901135162533, + "learning_rate": 2.247235387045814e-06, + "loss": 0.1997, + "step": 570 + }, + { + "epoch": 0.06874074074074074, + "grad_norm": 1.893328201073285, + "learning_rate": 2.2867298578199053e-06, + "loss": 0.2189, + "step": 580 + }, + { + "epoch": 0.06992592592592592, + "grad_norm": 2.208257761729857, + "learning_rate": 2.326224328593997e-06, + "loss": 0.2095, + "step": 590 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 2.1481277137516543, + "learning_rate": 2.3657187993680885e-06, + "loss": 0.2124, + "step": 600 + }, + { + "epoch": 0.0722962962962963, + "grad_norm": 2.281329003543564, + "learning_rate": 2.4052132701421803e-06, + "loss": 0.2073, + "step": 610 + }, + { + "epoch": 0.07348148148148148, + "grad_norm": 1.969363240524975, + "learning_rate": 2.444707740916272e-06, + "loss": 0.2113, + "step": 620 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 1.9322228994582784, + "learning_rate": 2.4842022116903636e-06, + "loss": 0.2064, + "step": 630 + }, + { + "epoch": 0.07585185185185185, + "grad_norm": 2.3569096925956483, + "learning_rate": 2.5236966824644554e-06, + "loss": 0.1946, + "step": 640 + }, + { + "epoch": 0.07703703703703704, + "grad_norm": 2.0544195464466735, + "learning_rate": 2.5631911532385468e-06, + "loss": 0.1905, + "step": 650 + }, + { + "epoch": 0.07822222222222222, + "grad_norm": 1.872188884004776, + "learning_rate": 2.6026856240126386e-06, + "loss": 0.201, + "step": 660 + }, + { + "epoch": 0.07940740740740741, + "grad_norm": 2.11573615376644, + "learning_rate": 2.64218009478673e-06, + "loss": 0.1893, + "step": 670 + }, + { + "epoch": 0.08059259259259259, + "grad_norm": 1.8642448049772182, + "learning_rate": 2.681674565560822e-06, + "loss": 0.1908, + "step": 680 + }, + { + "epoch": 0.08177777777777778, + "grad_norm": 2.2303010127242233, + "learning_rate": 2.7211690363349137e-06, + "loss": 0.2022, + "step": 690 + }, + { + "epoch": 0.08296296296296296, + "grad_norm": 1.8511356928408458, + "learning_rate": 2.760663507109005e-06, + "loss": 0.2021, + "step": 700 + }, + { + "epoch": 0.08414814814814815, + "grad_norm": 1.794649953795861, + "learning_rate": 2.800157977883097e-06, + "loss": 0.1906, + "step": 710 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 1.9641261846944857, + "learning_rate": 2.839652448657188e-06, + "loss": 0.1941, + "step": 720 + }, + { + "epoch": 0.08651851851851852, + "grad_norm": 1.950674535791457, + "learning_rate": 2.8791469194312797e-06, + "loss": 0.1939, + "step": 730 + }, + { + "epoch": 0.0877037037037037, + "grad_norm": 1.8244665847120771, + "learning_rate": 2.918641390205372e-06, + "loss": 0.1854, + "step": 740 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 1.7697939041359319, + "learning_rate": 2.958135860979463e-06, + "loss": 0.1954, + "step": 750 + }, + { + "epoch": 0.09007407407407407, + "grad_norm": 2.0286467559855352, + "learning_rate": 2.9976303317535547e-06, + "loss": 0.1789, + "step": 760 + }, + { + "epoch": 0.09125925925925926, + "grad_norm": 1.5836584212702782, + "learning_rate": 3.037124802527646e-06, + "loss": 0.186, + "step": 770 + }, + { + "epoch": 0.09244444444444444, + "grad_norm": 1.4971744594996326, + "learning_rate": 3.076619273301738e-06, + "loss": 0.1871, + "step": 780 + }, + { + "epoch": 0.09362962962962963, + "grad_norm": 1.7487066886646125, + "learning_rate": 3.1161137440758298e-06, + "loss": 0.1871, + "step": 790 + }, + { + "epoch": 0.09481481481481481, + "grad_norm": 1.569162729193313, + "learning_rate": 3.155608214849921e-06, + "loss": 0.1923, + "step": 800 + }, + { + "epoch": 0.096, + "grad_norm": 1.623522453907034, + "learning_rate": 3.195102685624013e-06, + "loss": 0.1743, + "step": 810 + }, + { + "epoch": 0.09718518518518518, + "grad_norm": 1.7221906617688856, + "learning_rate": 3.2345971563981044e-06, + "loss": 0.1845, + "step": 820 + }, + { + "epoch": 0.09837037037037037, + "grad_norm": 1.661367161486284, + "learning_rate": 3.2740916271721962e-06, + "loss": 0.1817, + "step": 830 + }, + { + "epoch": 0.09955555555555555, + "grad_norm": 1.7890071283233029, + "learning_rate": 3.313586097946288e-06, + "loss": 0.1918, + "step": 840 + }, + { + "epoch": 0.10074074074074074, + "grad_norm": 1.6622433475069756, + "learning_rate": 3.3530805687203794e-06, + "loss": 0.1852, + "step": 850 + }, + { + "epoch": 0.10192592592592592, + "grad_norm": 1.5056266056981837, + "learning_rate": 3.3925750394944713e-06, + "loss": 0.1795, + "step": 860 + }, + { + "epoch": 0.10311111111111111, + "grad_norm": 1.5592611887308574, + "learning_rate": 3.4320695102685627e-06, + "loss": 0.1728, + "step": 870 + }, + { + "epoch": 0.10429629629629629, + "grad_norm": 2.0188330296033175, + "learning_rate": 3.4715639810426545e-06, + "loss": 0.1824, + "step": 880 + }, + { + "epoch": 0.10548148148148148, + "grad_norm": 1.6454136306594946, + "learning_rate": 3.5110584518167463e-06, + "loss": 0.1826, + "step": 890 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 1.8182282975412345, + "learning_rate": 3.5505529225908373e-06, + "loss": 0.1758, + "step": 900 + }, + { + "epoch": 0.10785185185185185, + "grad_norm": 1.5929448694005304, + "learning_rate": 3.5900473933649295e-06, + "loss": 0.1703, + "step": 910 + }, + { + "epoch": 0.10903703703703704, + "grad_norm": 1.4689256663083547, + "learning_rate": 3.6295418641390205e-06, + "loss": 0.183, + "step": 920 + }, + { + "epoch": 0.11022222222222222, + "grad_norm": 1.8805410830998628, + "learning_rate": 3.6690363349131123e-06, + "loss": 0.1815, + "step": 930 + }, + { + "epoch": 0.11140740740740741, + "grad_norm": 1.3905979671328816, + "learning_rate": 3.708530805687204e-06, + "loss": 0.1713, + "step": 940 + }, + { + "epoch": 0.11259259259259259, + "grad_norm": 1.6578556556364272, + "learning_rate": 3.7480252764612956e-06, + "loss": 0.1801, + "step": 950 + }, + { + "epoch": 0.11377777777777778, + "grad_norm": 1.6044003852546482, + "learning_rate": 3.7875197472353874e-06, + "loss": 0.1776, + "step": 960 + }, + { + "epoch": 0.11496296296296296, + "grad_norm": 1.4839235623764366, + "learning_rate": 3.827014218009479e-06, + "loss": 0.1712, + "step": 970 + }, + { + "epoch": 0.11614814814814815, + "grad_norm": 1.5546392911649811, + "learning_rate": 3.866508688783571e-06, + "loss": 0.1705, + "step": 980 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 1.4719840642069977, + "learning_rate": 3.9060031595576624e-06, + "loss": 0.1799, + "step": 990 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 1.5771030943684798, + "learning_rate": 3.945497630331754e-06, + "loss": 0.1746, + "step": 1000 + }, + { + "epoch": 0.1197037037037037, + "grad_norm": 1.5899978185236583, + "learning_rate": 3.984992101105846e-06, + "loss": 0.1791, + "step": 1010 + }, + { + "epoch": 0.12088888888888889, + "grad_norm": 1.564653106856945, + "learning_rate": 4.024486571879937e-06, + "loss": 0.1798, + "step": 1020 + }, + { + "epoch": 0.12207407407407407, + "grad_norm": 1.4171146569353363, + "learning_rate": 4.063981042654029e-06, + "loss": 0.1764, + "step": 1030 + }, + { + "epoch": 0.12325925925925926, + "grad_norm": 1.5775944887678808, + "learning_rate": 4.10347551342812e-06, + "loss": 0.1723, + "step": 1040 + }, + { + "epoch": 0.12444444444444444, + "grad_norm": 1.6191537721728222, + "learning_rate": 4.142969984202212e-06, + "loss": 0.1698, + "step": 1050 + }, + { + "epoch": 0.12562962962962962, + "grad_norm": 1.5728409510091588, + "learning_rate": 4.182464454976304e-06, + "loss": 0.1757, + "step": 1060 + }, + { + "epoch": 0.12681481481481482, + "grad_norm": 1.654955467094043, + "learning_rate": 4.221958925750395e-06, + "loss": 0.1736, + "step": 1070 + }, + { + "epoch": 0.128, + "grad_norm": 1.4365683818481625, + "learning_rate": 4.261453396524487e-06, + "loss": 0.1735, + "step": 1080 + }, + { + "epoch": 0.12918518518518518, + "grad_norm": 1.3488449107612757, + "learning_rate": 4.300947867298579e-06, + "loss": 0.1678, + "step": 1090 + }, + { + "epoch": 0.13037037037037036, + "grad_norm": 1.5644538661334613, + "learning_rate": 4.34044233807267e-06, + "loss": 0.169, + "step": 1100 + }, + { + "epoch": 0.13155555555555556, + "grad_norm": 1.5397382467072402, + "learning_rate": 4.379936808846762e-06, + "loss": 0.1842, + "step": 1110 + }, + { + "epoch": 0.13274074074074074, + "grad_norm": 1.4898755866787456, + "learning_rate": 4.419431279620853e-06, + "loss": 0.1772, + "step": 1120 + }, + { + "epoch": 0.13392592592592592, + "grad_norm": 1.6439184747645121, + "learning_rate": 4.4589257503949454e-06, + "loss": 0.1673, + "step": 1130 + }, + { + "epoch": 0.1351111111111111, + "grad_norm": 1.7264795439164726, + "learning_rate": 4.498420221169037e-06, + "loss": 0.161, + "step": 1140 + }, + { + "epoch": 0.1362962962962963, + "grad_norm": 1.3230022083211253, + "learning_rate": 4.537914691943128e-06, + "loss": 0.1655, + "step": 1150 + }, + { + "epoch": 0.13748148148148148, + "grad_norm": 1.5862506442915425, + "learning_rate": 4.5774091627172205e-06, + "loss": 0.1678, + "step": 1160 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 1.5855849177374566, + "learning_rate": 4.616903633491311e-06, + "loss": 0.1647, + "step": 1170 + }, + { + "epoch": 0.13985185185185184, + "grad_norm": 1.358603935034976, + "learning_rate": 4.656398104265403e-06, + "loss": 0.1679, + "step": 1180 + }, + { + "epoch": 0.14103703703703704, + "grad_norm": 1.6553313082529357, + "learning_rate": 4.695892575039495e-06, + "loss": 0.1572, + "step": 1190 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 1.3478085969007612, + "learning_rate": 4.735387045813586e-06, + "loss": 0.1614, + "step": 1200 + }, + { + "epoch": 0.1434074074074074, + "grad_norm": 1.2945814520336956, + "learning_rate": 4.774881516587678e-06, + "loss": 0.17, + "step": 1210 + }, + { + "epoch": 0.1445925925925926, + "grad_norm": 1.574385234075967, + "learning_rate": 4.81437598736177e-06, + "loss": 0.1694, + "step": 1220 + }, + { + "epoch": 0.14577777777777778, + "grad_norm": 1.2930518937611122, + "learning_rate": 4.853870458135861e-06, + "loss": 0.1554, + "step": 1230 + }, + { + "epoch": 0.14696296296296296, + "grad_norm": 1.2904231842371265, + "learning_rate": 4.8933649289099525e-06, + "loss": 0.1653, + "step": 1240 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 1.5730789203066062, + "learning_rate": 4.932859399684045e-06, + "loss": 0.1701, + "step": 1250 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 1.2769542182085247, + "learning_rate": 4.972353870458136e-06, + "loss": 0.1676, + "step": 1260 + }, + { + "epoch": 0.15051851851851852, + "grad_norm": 1.2571410788560955, + "learning_rate": 5.011848341232228e-06, + "loss": 0.1656, + "step": 1270 + }, + { + "epoch": 0.1517037037037037, + "grad_norm": 1.3873958490581901, + "learning_rate": 5.051342812006319e-06, + "loss": 0.1635, + "step": 1280 + }, + { + "epoch": 0.15288888888888888, + "grad_norm": 1.1159404546222491, + "learning_rate": 5.090837282780411e-06, + "loss": 0.1575, + "step": 1290 + }, + { + "epoch": 0.15407407407407409, + "grad_norm": 1.27725876926721, + "learning_rate": 5.130331753554503e-06, + "loss": 0.1587, + "step": 1300 + }, + { + "epoch": 0.15525925925925926, + "grad_norm": 1.5545501003527686, + "learning_rate": 5.169826224328595e-06, + "loss": 0.1509, + "step": 1310 + }, + { + "epoch": 0.15644444444444444, + "grad_norm": 1.3114722736322373, + "learning_rate": 5.209320695102686e-06, + "loss": 0.1628, + "step": 1320 + }, + { + "epoch": 0.15762962962962962, + "grad_norm": 1.3618760732486652, + "learning_rate": 5.248815165876777e-06, + "loss": 0.1663, + "step": 1330 + }, + { + "epoch": 0.15881481481481483, + "grad_norm": 1.455841330090113, + "learning_rate": 5.288309636650869e-06, + "loss": 0.163, + "step": 1340 + }, + { + "epoch": 0.16, + "grad_norm": 1.2663524840500726, + "learning_rate": 5.327804107424961e-06, + "loss": 0.157, + "step": 1350 + }, + { + "epoch": 0.16118518518518518, + "grad_norm": 1.4394970583573017, + "learning_rate": 5.367298578199053e-06, + "loss": 0.1652, + "step": 1360 + }, + { + "epoch": 0.16237037037037036, + "grad_norm": 1.625893248662234, + "learning_rate": 5.406793048973145e-06, + "loss": 0.1627, + "step": 1370 + }, + { + "epoch": 0.16355555555555557, + "grad_norm": 1.246933955283402, + "learning_rate": 5.4462875197472355e-06, + "loss": 0.1628, + "step": 1380 + }, + { + "epoch": 0.16474074074074074, + "grad_norm": 1.3894317077738516, + "learning_rate": 5.485781990521327e-06, + "loss": 0.1542, + "step": 1390 + }, + { + "epoch": 0.16592592592592592, + "grad_norm": 1.3397970821643086, + "learning_rate": 5.525276461295419e-06, + "loss": 0.1534, + "step": 1400 + }, + { + "epoch": 0.1671111111111111, + "grad_norm": 1.3787584428667217, + "learning_rate": 5.5647709320695106e-06, + "loss": 0.1606, + "step": 1410 + }, + { + "epoch": 0.1682962962962963, + "grad_norm": 1.1039480094467116, + "learning_rate": 5.604265402843603e-06, + "loss": 0.1656, + "step": 1420 + }, + { + "epoch": 0.16948148148148148, + "grad_norm": 1.5110637918564995, + "learning_rate": 5.643759873617693e-06, + "loss": 0.1592, + "step": 1430 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 1.1598226907603657, + "learning_rate": 5.683254344391786e-06, + "loss": 0.1553, + "step": 1440 + }, + { + "epoch": 0.17185185185185184, + "grad_norm": 1.266111525853398, + "learning_rate": 5.722748815165877e-06, + "loss": 0.1611, + "step": 1450 + }, + { + "epoch": 0.17303703703703704, + "grad_norm": 1.4369534873980916, + "learning_rate": 5.762243285939969e-06, + "loss": 0.1586, + "step": 1460 + }, + { + "epoch": 0.17422222222222222, + "grad_norm": 1.349225066305025, + "learning_rate": 5.801737756714061e-06, + "loss": 0.155, + "step": 1470 + }, + { + "epoch": 0.1754074074074074, + "grad_norm": 1.3245587999340904, + "learning_rate": 5.841232227488152e-06, + "loss": 0.1554, + "step": 1480 + }, + { + "epoch": 0.17659259259259258, + "grad_norm": 1.3818406938882097, + "learning_rate": 5.8807266982622435e-06, + "loss": 0.1507, + "step": 1490 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 1.2162085401765121, + "learning_rate": 5.920221169036336e-06, + "loss": 0.1614, + "step": 1500 + }, + { + "epoch": 0.17896296296296296, + "grad_norm": 1.4420588683251592, + "learning_rate": 5.959715639810427e-06, + "loss": 0.1512, + "step": 1510 + }, + { + "epoch": 0.18014814814814814, + "grad_norm": 1.4710421203909305, + "learning_rate": 5.999210110584519e-06, + "loss": 0.1601, + "step": 1520 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 1.4041070078830804, + "learning_rate": 6.03870458135861e-06, + "loss": 0.1564, + "step": 1530 + }, + { + "epoch": 0.18251851851851852, + "grad_norm": 1.2990259277105651, + "learning_rate": 6.078199052132701e-06, + "loss": 0.1474, + "step": 1540 + }, + { + "epoch": 0.1837037037037037, + "grad_norm": 1.1290654594876044, + "learning_rate": 6.1176935229067936e-06, + "loss": 0.1524, + "step": 1550 + }, + { + "epoch": 0.18488888888888888, + "grad_norm": 1.3655761919766516, + "learning_rate": 6.157187993680885e-06, + "loss": 0.1516, + "step": 1560 + }, + { + "epoch": 0.1860740740740741, + "grad_norm": 1.13197310444901, + "learning_rate": 6.196682464454977e-06, + "loss": 0.1482, + "step": 1570 + }, + { + "epoch": 0.18725925925925926, + "grad_norm": 1.2967230650182364, + "learning_rate": 6.236176935229068e-06, + "loss": 0.1466, + "step": 1580 + }, + { + "epoch": 0.18844444444444444, + "grad_norm": 1.2858221652544513, + "learning_rate": 6.27567140600316e-06, + "loss": 0.14, + "step": 1590 + }, + { + "epoch": 0.18962962962962962, + "grad_norm": 1.4770672800010265, + "learning_rate": 6.315165876777251e-06, + "loss": 0.1423, + "step": 1600 + }, + { + "epoch": 0.19081481481481483, + "grad_norm": 1.364540759886777, + "learning_rate": 6.354660347551344e-06, + "loss": 0.1573, + "step": 1610 + }, + { + "epoch": 0.192, + "grad_norm": 1.15436213273915, + "learning_rate": 6.394154818325435e-06, + "loss": 0.1477, + "step": 1620 + }, + { + "epoch": 0.19318518518518518, + "grad_norm": 1.302266257733314, + "learning_rate": 6.4336492890995265e-06, + "loss": 0.15, + "step": 1630 + }, + { + "epoch": 0.19437037037037036, + "grad_norm": 1.359792431900611, + "learning_rate": 6.473143759873618e-06, + "loss": 0.1497, + "step": 1640 + }, + { + "epoch": 0.19555555555555557, + "grad_norm": 1.4472703938936013, + "learning_rate": 6.51263823064771e-06, + "loss": 0.1451, + "step": 1650 + }, + { + "epoch": 0.19674074074074074, + "grad_norm": 1.2473977675102152, + "learning_rate": 6.5521327014218015e-06, + "loss": 0.1492, + "step": 1660 + }, + { + "epoch": 0.19792592592592592, + "grad_norm": 1.4162855282869764, + "learning_rate": 6.591627172195894e-06, + "loss": 0.1415, + "step": 1670 + }, + { + "epoch": 0.1991111111111111, + "grad_norm": 1.3796721675173294, + "learning_rate": 6.631121642969984e-06, + "loss": 0.1443, + "step": 1680 + }, + { + "epoch": 0.2002962962962963, + "grad_norm": 1.3461006140955094, + "learning_rate": 6.6706161137440765e-06, + "loss": 0.1515, + "step": 1690 + }, + { + "epoch": 0.20148148148148148, + "grad_norm": 1.4963650991939834, + "learning_rate": 6.710110584518168e-06, + "loss": 0.147, + "step": 1700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 1.2716150586537698, + "learning_rate": 6.74960505529226e-06, + "loss": 0.1478, + "step": 1710 + }, + { + "epoch": 0.20385185185185184, + "grad_norm": 1.6329558161465296, + "learning_rate": 6.789099526066352e-06, + "loss": 0.1488, + "step": 1720 + }, + { + "epoch": 0.20503703703703705, + "grad_norm": 1.3662584849275496, + "learning_rate": 6.828593996840442e-06, + "loss": 0.149, + "step": 1730 + }, + { + "epoch": 0.20622222222222222, + "grad_norm": 1.351762035054952, + "learning_rate": 6.868088467614534e-06, + "loss": 0.1485, + "step": 1740 + }, + { + "epoch": 0.2074074074074074, + "grad_norm": 1.2917761077413694, + "learning_rate": 6.907582938388626e-06, + "loss": 0.1395, + "step": 1750 + }, + { + "epoch": 0.20859259259259258, + "grad_norm": 1.1078583538981257, + "learning_rate": 6.947077409162718e-06, + "loss": 0.142, + "step": 1760 + }, + { + "epoch": 0.20977777777777779, + "grad_norm": 1.1952738983680355, + "learning_rate": 6.9865718799368094e-06, + "loss": 0.1413, + "step": 1770 + }, + { + "epoch": 0.21096296296296296, + "grad_norm": 1.2337951524972826, + "learning_rate": 7.026066350710901e-06, + "loss": 0.1491, + "step": 1780 + }, + { + "epoch": 0.21214814814814814, + "grad_norm": 1.2214404384776358, + "learning_rate": 7.065560821484992e-06, + "loss": 0.146, + "step": 1790 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 1.2144547821049632, + "learning_rate": 7.1050552922590845e-06, + "loss": 0.1471, + "step": 1800 + }, + { + "epoch": 0.21451851851851853, + "grad_norm": 1.2690692704734767, + "learning_rate": 7.144549763033176e-06, + "loss": 0.1394, + "step": 1810 + }, + { + "epoch": 0.2157037037037037, + "grad_norm": 1.2528799209121506, + "learning_rate": 7.184044233807268e-06, + "loss": 0.1391, + "step": 1820 + }, + { + "epoch": 0.21688888888888888, + "grad_norm": 1.088421676313826, + "learning_rate": 7.223538704581359e-06, + "loss": 0.1376, + "step": 1830 + }, + { + "epoch": 0.2180740740740741, + "grad_norm": 1.2532401792769972, + "learning_rate": 7.263033175355451e-06, + "loss": 0.1387, + "step": 1840 + }, + { + "epoch": 0.21925925925925926, + "grad_norm": 1.171126057113268, + "learning_rate": 7.302527646129542e-06, + "loss": 0.1457, + "step": 1850 + }, + { + "epoch": 0.22044444444444444, + "grad_norm": 1.1517859213369401, + "learning_rate": 7.342022116903635e-06, + "loss": 0.139, + "step": 1860 + }, + { + "epoch": 0.22162962962962962, + "grad_norm": 1.4497812790100033, + "learning_rate": 7.381516587677726e-06, + "loss": 0.1452, + "step": 1870 + }, + { + "epoch": 0.22281481481481483, + "grad_norm": 1.3681419086556943, + "learning_rate": 7.4210110584518165e-06, + "loss": 0.1338, + "step": 1880 + }, + { + "epoch": 0.224, + "grad_norm": 1.239630120430343, + "learning_rate": 7.460505529225909e-06, + "loss": 0.1444, + "step": 1890 + }, + { + "epoch": 0.22518518518518518, + "grad_norm": 1.2751504568085517, + "learning_rate": 7.500000000000001e-06, + "loss": 0.152, + "step": 1900 + }, + { + "epoch": 0.22637037037037036, + "grad_norm": 1.3398101737423465, + "learning_rate": 7.5394944707740924e-06, + "loss": 0.1416, + "step": 1910 + }, + { + "epoch": 0.22755555555555557, + "grad_norm": 1.30370206995981, + "learning_rate": 7.578988941548185e-06, + "loss": 0.1381, + "step": 1920 + }, + { + "epoch": 0.22874074074074074, + "grad_norm": 1.4307225892256046, + "learning_rate": 7.618483412322275e-06, + "loss": 0.1364, + "step": 1930 + }, + { + "epoch": 0.22992592592592592, + "grad_norm": 1.3831585238138127, + "learning_rate": 7.657977883096367e-06, + "loss": 0.1349, + "step": 1940 + }, + { + "epoch": 0.2311111111111111, + "grad_norm": 1.1393824594697461, + "learning_rate": 7.697472353870459e-06, + "loss": 0.1451, + "step": 1950 + }, + { + "epoch": 0.2322962962962963, + "grad_norm": 1.3942292177077196, + "learning_rate": 7.736966824644551e-06, + "loss": 0.1367, + "step": 1960 + }, + { + "epoch": 0.23348148148148148, + "grad_norm": 1.2433683525066, + "learning_rate": 7.776461295418642e-06, + "loss": 0.15, + "step": 1970 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 1.306693696683265, + "learning_rate": 7.815955766192734e-06, + "loss": 0.1344, + "step": 1980 + }, + { + "epoch": 0.23585185185185184, + "grad_norm": 1.15389110039777, + "learning_rate": 7.855450236966824e-06, + "loss": 0.143, + "step": 1990 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 1.249327703738097, + "learning_rate": 7.894944707740917e-06, + "loss": 0.1326, + "step": 2000 + }, + { + "epoch": 0.23822222222222222, + "grad_norm": 1.4396758038789688, + "learning_rate": 7.934439178515009e-06, + "loss": 0.1336, + "step": 2010 + }, + { + "epoch": 0.2394074074074074, + "grad_norm": 1.0244329804035475, + "learning_rate": 7.973933649289101e-06, + "loss": 0.1494, + "step": 2020 + }, + { + "epoch": 0.24059259259259258, + "grad_norm": 1.3655168966767948, + "learning_rate": 8.013428120063192e-06, + "loss": 0.1458, + "step": 2030 + }, + { + "epoch": 0.24177777777777779, + "grad_norm": 1.0813431554623738, + "learning_rate": 8.052922590837284e-06, + "loss": 0.1313, + "step": 2040 + }, + { + "epoch": 0.24296296296296296, + "grad_norm": 1.1019447318703963, + "learning_rate": 8.092417061611375e-06, + "loss": 0.1278, + "step": 2050 + }, + { + "epoch": 0.24414814814814814, + "grad_norm": 1.161541112028708, + "learning_rate": 8.131911532385467e-06, + "loss": 0.1388, + "step": 2060 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 1.2121096685200743, + "learning_rate": 8.171406003159559e-06, + "loss": 0.1426, + "step": 2070 + }, + { + "epoch": 0.24651851851851853, + "grad_norm": 1.1943833198302782, + "learning_rate": 8.21090047393365e-06, + "loss": 0.1448, + "step": 2080 + }, + { + "epoch": 0.2477037037037037, + "grad_norm": 1.2385176940275673, + "learning_rate": 8.250394944707742e-06, + "loss": 0.137, + "step": 2090 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 1.2399375043986598, + "learning_rate": 8.289889415481832e-06, + "loss": 0.1332, + "step": 2100 + }, + { + "epoch": 0.25007407407407406, + "grad_norm": 1.0532024622755551, + "learning_rate": 8.329383886255925e-06, + "loss": 0.1367, + "step": 2110 + }, + { + "epoch": 0.25125925925925924, + "grad_norm": 1.1112223938639674, + "learning_rate": 8.368878357030017e-06, + "loss": 0.122, + "step": 2120 + }, + { + "epoch": 0.25244444444444447, + "grad_norm": 1.1215548658435044, + "learning_rate": 8.408372827804107e-06, + "loss": 0.1309, + "step": 2130 + }, + { + "epoch": 0.25362962962962965, + "grad_norm": 0.9889435324709056, + "learning_rate": 8.4478672985782e-06, + "loss": 0.1318, + "step": 2140 + }, + { + "epoch": 0.2548148148148148, + "grad_norm": 1.1190303690533154, + "learning_rate": 8.487361769352292e-06, + "loss": 0.1372, + "step": 2150 + }, + { + "epoch": 0.256, + "grad_norm": 1.0821026540815322, + "learning_rate": 8.526856240126383e-06, + "loss": 0.1394, + "step": 2160 + }, + { + "epoch": 0.2571851851851852, + "grad_norm": 1.4273902320172085, + "learning_rate": 8.566350710900475e-06, + "loss": 0.1403, + "step": 2170 + }, + { + "epoch": 0.25837037037037036, + "grad_norm": 1.1725459842861718, + "learning_rate": 8.605845181674565e-06, + "loss": 0.1368, + "step": 2180 + }, + { + "epoch": 0.25955555555555554, + "grad_norm": 1.0347236868665117, + "learning_rate": 8.645339652448658e-06, + "loss": 0.1322, + "step": 2190 + }, + { + "epoch": 0.2607407407407407, + "grad_norm": 1.147336853830699, + "learning_rate": 8.68483412322275e-06, + "loss": 0.1385, + "step": 2200 + }, + { + "epoch": 0.26192592592592595, + "grad_norm": 1.1195736709673731, + "learning_rate": 8.724328593996842e-06, + "loss": 0.1299, + "step": 2210 + }, + { + "epoch": 0.26311111111111113, + "grad_norm": 1.19703340109771, + "learning_rate": 8.763823064770933e-06, + "loss": 0.1326, + "step": 2220 + }, + { + "epoch": 0.2642962962962963, + "grad_norm": 1.0370850841770547, + "learning_rate": 8.803317535545023e-06, + "loss": 0.1256, + "step": 2230 + }, + { + "epoch": 0.2654814814814815, + "grad_norm": 1.0458152573599093, + "learning_rate": 8.842812006319115e-06, + "loss": 0.1324, + "step": 2240 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.1578620236562867, + "learning_rate": 8.882306477093208e-06, + "loss": 0.1337, + "step": 2250 + }, + { + "epoch": 0.26785185185185184, + "grad_norm": 1.2400311727023643, + "learning_rate": 8.9218009478673e-06, + "loss": 0.1389, + "step": 2260 + }, + { + "epoch": 0.269037037037037, + "grad_norm": 1.0772480700085616, + "learning_rate": 8.961295418641392e-06, + "loss": 0.1279, + "step": 2270 + }, + { + "epoch": 0.2702222222222222, + "grad_norm": 1.0559092770169978, + "learning_rate": 9.000789889415483e-06, + "loss": 0.1272, + "step": 2280 + }, + { + "epoch": 0.27140740740740743, + "grad_norm": 1.048995973752882, + "learning_rate": 9.040284360189573e-06, + "loss": 0.1243, + "step": 2290 + }, + { + "epoch": 0.2725925925925926, + "grad_norm": 1.3451391343777819, + "learning_rate": 9.079778830963666e-06, + "loss": 0.1307, + "step": 2300 + }, + { + "epoch": 0.2737777777777778, + "grad_norm": 1.060469009618305, + "learning_rate": 9.119273301737758e-06, + "loss": 0.1292, + "step": 2310 + }, + { + "epoch": 0.27496296296296296, + "grad_norm": 1.1680308595423996, + "learning_rate": 9.15876777251185e-06, + "loss": 0.1291, + "step": 2320 + }, + { + "epoch": 0.27614814814814814, + "grad_norm": 1.0245403988039696, + "learning_rate": 9.19826224328594e-06, + "loss": 0.1364, + "step": 2330 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 1.3540771871606279, + "learning_rate": 9.237756714060033e-06, + "loss": 0.1211, + "step": 2340 + }, + { + "epoch": 0.2785185185185185, + "grad_norm": 1.0477791266593481, + "learning_rate": 9.277251184834123e-06, + "loss": 0.1256, + "step": 2350 + }, + { + "epoch": 0.2797037037037037, + "grad_norm": 1.074397627115856, + "learning_rate": 9.316745655608216e-06, + "loss": 0.1368, + "step": 2360 + }, + { + "epoch": 0.2808888888888889, + "grad_norm": 1.096762754427759, + "learning_rate": 9.356240126382308e-06, + "loss": 0.127, + "step": 2370 + }, + { + "epoch": 0.2820740740740741, + "grad_norm": 1.1152387422710077, + "learning_rate": 9.395734597156398e-06, + "loss": 0.1157, + "step": 2380 + }, + { + "epoch": 0.28325925925925927, + "grad_norm": 1.15185809148128, + "learning_rate": 9.43522906793049e-06, + "loss": 0.1369, + "step": 2390 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 1.0649016460434935, + "learning_rate": 9.474723538704583e-06, + "loss": 0.1267, + "step": 2400 + }, + { + "epoch": 0.2856296296296296, + "grad_norm": 1.1080080820382023, + "learning_rate": 9.514218009478673e-06, + "loss": 0.1276, + "step": 2410 + }, + { + "epoch": 0.2868148148148148, + "grad_norm": 1.1752464726050098, + "learning_rate": 9.553712480252766e-06, + "loss": 0.13, + "step": 2420 + }, + { + "epoch": 0.288, + "grad_norm": 1.1902570725844221, + "learning_rate": 9.593206951026856e-06, + "loss": 0.1206, + "step": 2430 + }, + { + "epoch": 0.2891851851851852, + "grad_norm": 1.0348767296473844, + "learning_rate": 9.632701421800949e-06, + "loss": 0.1314, + "step": 2440 + }, + { + "epoch": 0.2903703703703704, + "grad_norm": 1.5099894055106093, + "learning_rate": 9.67219589257504e-06, + "loss": 0.1276, + "step": 2450 + }, + { + "epoch": 0.29155555555555557, + "grad_norm": 1.385132963146712, + "learning_rate": 9.711690363349133e-06, + "loss": 0.1228, + "step": 2460 + }, + { + "epoch": 0.29274074074074075, + "grad_norm": 1.288744232081011, + "learning_rate": 9.751184834123224e-06, + "loss": 0.1281, + "step": 2470 + }, + { + "epoch": 0.2939259259259259, + "grad_norm": 1.0941276266402564, + "learning_rate": 9.790679304897314e-06, + "loss": 0.1303, + "step": 2480 + }, + { + "epoch": 0.2951111111111111, + "grad_norm": 0.945296978599026, + "learning_rate": 9.830173775671406e-06, + "loss": 0.1259, + "step": 2490 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 1.028557604163525, + "learning_rate": 9.869668246445499e-06, + "loss": 0.1243, + "step": 2500 + }, + { + "epoch": 0.29748148148148146, + "grad_norm": 1.0961718771978575, + "learning_rate": 9.909162717219591e-06, + "loss": 0.125, + "step": 2510 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 1.2259138889872725, + "learning_rate": 9.948657187993681e-06, + "loss": 0.1234, + "step": 2520 + }, + { + "epoch": 0.29985185185185187, + "grad_norm": 1.0222087312043195, + "learning_rate": 9.988151658767774e-06, + "loss": 0.1315, + "step": 2530 + }, + { + "epoch": 0.30103703703703705, + "grad_norm": 1.1985118048432004, + "learning_rate": 9.999997670556908e-06, + "loss": 0.127, + "step": 2540 + }, + { + "epoch": 0.3022222222222222, + "grad_norm": 0.9286185383611775, + "learning_rate": 9.999986261044944e-06, + "loss": 0.1266, + "step": 2550 + }, + { + "epoch": 0.3034074074074074, + "grad_norm": 1.111662361529288, + "learning_rate": 9.999965343628881e-06, + "loss": 0.1184, + "step": 2560 + }, + { + "epoch": 0.3045925925925926, + "grad_norm": 1.273986447150661, + "learning_rate": 9.9999349183485e-06, + "loss": 0.1188, + "step": 2570 + }, + { + "epoch": 0.30577777777777776, + "grad_norm": 1.1492983257913492, + "learning_rate": 9.999894985261652e-06, + "loss": 0.1162, + "step": 2580 + }, + { + "epoch": 0.30696296296296294, + "grad_norm": 0.9259384772003547, + "learning_rate": 9.999845544444276e-06, + "loss": 0.1208, + "step": 2590 + }, + { + "epoch": 0.30814814814814817, + "grad_norm": 1.201242801046621, + "learning_rate": 9.999786595990388e-06, + "loss": 0.1275, + "step": 2600 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 1.0816319437512862, + "learning_rate": 9.999718140012084e-06, + "loss": 0.129, + "step": 2610 + }, + { + "epoch": 0.3105185185185185, + "grad_norm": 1.106029967405303, + "learning_rate": 9.999640176639537e-06, + "loss": 0.1214, + "step": 2620 + }, + { + "epoch": 0.3117037037037037, + "grad_norm": 1.0193732867232608, + "learning_rate": 9.999552706021003e-06, + "loss": 0.1285, + "step": 2630 + }, + { + "epoch": 0.3128888888888889, + "grad_norm": 1.1046176513500003, + "learning_rate": 9.999455728322813e-06, + "loss": 0.1171, + "step": 2640 + }, + { + "epoch": 0.31407407407407406, + "grad_norm": 1.0910449790973566, + "learning_rate": 9.999349243729379e-06, + "loss": 0.1191, + "step": 2650 + }, + { + "epoch": 0.31525925925925924, + "grad_norm": 1.0083266380537457, + "learning_rate": 9.999233252443192e-06, + "loss": 0.1204, + "step": 2660 + }, + { + "epoch": 0.3164444444444444, + "grad_norm": 1.0642460379585044, + "learning_rate": 9.999107754684817e-06, + "loss": 0.1168, + "step": 2670 + }, + { + "epoch": 0.31762962962962965, + "grad_norm": 1.1298694219636254, + "learning_rate": 9.998972750692904e-06, + "loss": 0.1194, + "step": 2680 + }, + { + "epoch": 0.31881481481481483, + "grad_norm": 0.9613338001649993, + "learning_rate": 9.998828240724168e-06, + "loss": 0.1155, + "step": 2690 + }, + { + "epoch": 0.32, + "grad_norm": 1.173601854354163, + "learning_rate": 9.99867422505341e-06, + "loss": 0.1214, + "step": 2700 + }, + { + "epoch": 0.3211851851851852, + "grad_norm": 0.9733768496559912, + "learning_rate": 9.998510703973506e-06, + "loss": 0.1227, + "step": 2710 + }, + { + "epoch": 0.32237037037037036, + "grad_norm": 1.0607585805234887, + "learning_rate": 9.998337677795402e-06, + "loss": 0.1146, + "step": 2720 + }, + { + "epoch": 0.32355555555555554, + "grad_norm": 1.080581949156037, + "learning_rate": 9.998155146848124e-06, + "loss": 0.1074, + "step": 2730 + }, + { + "epoch": 0.3247407407407407, + "grad_norm": 0.9948035768360823, + "learning_rate": 9.99796311147877e-06, + "loss": 0.1222, + "step": 2740 + }, + { + "epoch": 0.32592592592592595, + "grad_norm": 1.1381012144365488, + "learning_rate": 9.997761572052513e-06, + "loss": 0.1136, + "step": 2750 + }, + { + "epoch": 0.32711111111111113, + "grad_norm": 1.1019137219716368, + "learning_rate": 9.997550528952596e-06, + "loss": 0.1197, + "step": 2760 + }, + { + "epoch": 0.3282962962962963, + "grad_norm": 1.0614332229110073, + "learning_rate": 9.997329982580334e-06, + "loss": 0.1119, + "step": 2770 + }, + { + "epoch": 0.3294814814814815, + "grad_norm": 1.019300129700046, + "learning_rate": 9.997099933355119e-06, + "loss": 0.1122, + "step": 2780 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.8703414717801001, + "learning_rate": 9.996860381714406e-06, + "loss": 0.1154, + "step": 2790 + }, + { + "epoch": 0.33185185185185184, + "grad_norm": 0.9927668006373571, + "learning_rate": 9.996611328113725e-06, + "loss": 0.1114, + "step": 2800 + }, + { + "epoch": 0.333037037037037, + "grad_norm": 1.0649223066896096, + "learning_rate": 9.996352773026672e-06, + "loss": 0.1143, + "step": 2810 + }, + { + "epoch": 0.3342222222222222, + "grad_norm": 0.945061933888762, + "learning_rate": 9.996084716944913e-06, + "loss": 0.1165, + "step": 2820 + }, + { + "epoch": 0.33540740740740743, + "grad_norm": 0.9053165256161341, + "learning_rate": 9.995807160378176e-06, + "loss": 0.1138, + "step": 2830 + }, + { + "epoch": 0.3365925925925926, + "grad_norm": 0.983673776932675, + "learning_rate": 9.995520103854265e-06, + "loss": 0.1126, + "step": 2840 + }, + { + "epoch": 0.3377777777777778, + "grad_norm": 0.9094831050296318, + "learning_rate": 9.995223547919037e-06, + "loss": 0.1131, + "step": 2850 + }, + { + "epoch": 0.33896296296296297, + "grad_norm": 1.1980737495235771, + "learning_rate": 9.99491749313642e-06, + "loss": 0.1153, + "step": 2860 + }, + { + "epoch": 0.34014814814814814, + "grad_norm": 1.1642543523703357, + "learning_rate": 9.994601940088407e-06, + "loss": 0.1069, + "step": 2870 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 1.3061713011905183, + "learning_rate": 9.994276889375043e-06, + "loss": 0.1129, + "step": 2880 + }, + { + "epoch": 0.3425185185185185, + "grad_norm": 1.1159869136842975, + "learning_rate": 9.993942341614445e-06, + "loss": 0.1152, + "step": 2890 + }, + { + "epoch": 0.3437037037037037, + "grad_norm": 0.9868651619155405, + "learning_rate": 9.993598297442782e-06, + "loss": 0.1111, + "step": 2900 + }, + { + "epoch": 0.3448888888888889, + "grad_norm": 0.9111232398594167, + "learning_rate": 9.993244757514284e-06, + "loss": 0.1086, + "step": 2910 + }, + { + "epoch": 0.3460740740740741, + "grad_norm": 1.1352059022498266, + "learning_rate": 9.99288172250124e-06, + "loss": 0.1101, + "step": 2920 + }, + { + "epoch": 0.34725925925925927, + "grad_norm": 1.1092536863833993, + "learning_rate": 9.992509193093989e-06, + "loss": 0.1138, + "step": 2930 + }, + { + "epoch": 0.34844444444444445, + "grad_norm": 1.0456678813902056, + "learning_rate": 9.992127170000928e-06, + "loss": 0.1085, + "step": 2940 + }, + { + "epoch": 0.3496296296296296, + "grad_norm": 1.05233397677223, + "learning_rate": 9.99173565394851e-06, + "loss": 0.1137, + "step": 2950 + }, + { + "epoch": 0.3508148148148148, + "grad_norm": 0.8989290814616561, + "learning_rate": 9.99133464568123e-06, + "loss": 0.1011, + "step": 2960 + }, + { + "epoch": 0.352, + "grad_norm": 1.1959494139981524, + "learning_rate": 9.990924145961648e-06, + "loss": 0.1069, + "step": 2970 + }, + { + "epoch": 0.35318518518518516, + "grad_norm": 1.0223587977454955, + "learning_rate": 9.990504155570358e-06, + "loss": 0.1104, + "step": 2980 + }, + { + "epoch": 0.3543703703703704, + "grad_norm": 1.0324014376102113, + "learning_rate": 9.990074675306011e-06, + "loss": 0.1089, + "step": 2990 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.9142730471073823, + "learning_rate": 9.989635705985301e-06, + "loss": 0.1149, + "step": 3000 + }, + { + "epoch": 0.35674074074074075, + "grad_norm": 0.9477791096263312, + "learning_rate": 9.989187248442965e-06, + "loss": 0.1074, + "step": 3010 + }, + { + "epoch": 0.3579259259259259, + "grad_norm": 1.029917738649383, + "learning_rate": 9.98872930353178e-06, + "loss": 0.1049, + "step": 3020 + }, + { + "epoch": 0.3591111111111111, + "grad_norm": 1.0958732449477262, + "learning_rate": 9.988261872122575e-06, + "loss": 0.1062, + "step": 3030 + }, + { + "epoch": 0.3602962962962963, + "grad_norm": 0.9739346786531066, + "learning_rate": 9.987784955104205e-06, + "loss": 0.1043, + "step": 3040 + }, + { + "epoch": 0.36148148148148146, + "grad_norm": 0.9125268614665308, + "learning_rate": 9.987298553383571e-06, + "loss": 0.1018, + "step": 3050 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.9883935739843853, + "learning_rate": 9.986802667885609e-06, + "loss": 0.1096, + "step": 3060 + }, + { + "epoch": 0.36385185185185187, + "grad_norm": 1.153554291921888, + "learning_rate": 9.986297299553286e-06, + "loss": 0.1104, + "step": 3070 + }, + { + "epoch": 0.36503703703703705, + "grad_norm": 0.8661052521750611, + "learning_rate": 9.985782449347605e-06, + "loss": 0.0986, + "step": 3080 + }, + { + "epoch": 0.3662222222222222, + "grad_norm": 1.0611023764230756, + "learning_rate": 9.985258118247596e-06, + "loss": 0.0945, + "step": 3090 + }, + { + "epoch": 0.3674074074074074, + "grad_norm": 0.9468505288641946, + "learning_rate": 9.984724307250319e-06, + "loss": 0.11, + "step": 3100 + }, + { + "epoch": 0.3685925925925926, + "grad_norm": 1.1929655399531534, + "learning_rate": 9.984181017370867e-06, + "loss": 0.1071, + "step": 3110 + }, + { + "epoch": 0.36977777777777776, + "grad_norm": 1.073270532431986, + "learning_rate": 9.983628249642345e-06, + "loss": 0.106, + "step": 3120 + }, + { + "epoch": 0.37096296296296294, + "grad_norm": 1.0505725853702017, + "learning_rate": 9.983066005115894e-06, + "loss": 0.0999, + "step": 3130 + }, + { + "epoch": 0.3721481481481482, + "grad_norm": 1.0782358716715486, + "learning_rate": 9.982494284860668e-06, + "loss": 0.1111, + "step": 3140 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 1.0557516128878732, + "learning_rate": 9.981913089963841e-06, + "loss": 0.107, + "step": 3150 + }, + { + "epoch": 0.37451851851851853, + "grad_norm": 1.0578331773078369, + "learning_rate": 9.98132242153061e-06, + "loss": 0.1014, + "step": 3160 + }, + { + "epoch": 0.3757037037037037, + "grad_norm": 0.9900733877629715, + "learning_rate": 9.980722280684177e-06, + "loss": 0.0985, + "step": 3170 + }, + { + "epoch": 0.3768888888888889, + "grad_norm": 0.9827346791836639, + "learning_rate": 9.980112668565762e-06, + "loss": 0.1141, + "step": 3180 + }, + { + "epoch": 0.37807407407407406, + "grad_norm": 1.0585445080498543, + "learning_rate": 9.979493586334596e-06, + "loss": 0.1013, + "step": 3190 + }, + { + "epoch": 0.37925925925925924, + "grad_norm": 0.9483754543855847, + "learning_rate": 9.97886503516792e-06, + "loss": 0.1018, + "step": 3200 + }, + { + "epoch": 0.3804444444444444, + "grad_norm": 1.1323394195148195, + "learning_rate": 9.978227016260974e-06, + "loss": 0.1036, + "step": 3210 + }, + { + "epoch": 0.38162962962962965, + "grad_norm": 1.014762440048781, + "learning_rate": 9.977579530827003e-06, + "loss": 0.0959, + "step": 3220 + }, + { + "epoch": 0.38281481481481483, + "grad_norm": 0.843820063614635, + "learning_rate": 9.976922580097266e-06, + "loss": 0.0974, + "step": 3230 + }, + { + "epoch": 0.384, + "grad_norm": 0.9945169322602301, + "learning_rate": 9.976256165321002e-06, + "loss": 0.1008, + "step": 3240 + }, + { + "epoch": 0.3851851851851852, + "grad_norm": 0.9422603028606186, + "learning_rate": 9.975580287765461e-06, + "loss": 0.1016, + "step": 3250 + }, + { + "epoch": 0.38637037037037036, + "grad_norm": 1.0428131747987188, + "learning_rate": 9.974894948715882e-06, + "loss": 0.1043, + "step": 3260 + }, + { + "epoch": 0.38755555555555554, + "grad_norm": 1.0106390080031689, + "learning_rate": 9.974200149475494e-06, + "loss": 0.0986, + "step": 3270 + }, + { + "epoch": 0.3887407407407407, + "grad_norm": 1.0909067437339077, + "learning_rate": 9.973495891365518e-06, + "loss": 0.0975, + "step": 3280 + }, + { + "epoch": 0.38992592592592595, + "grad_norm": 0.9509979371656362, + "learning_rate": 9.972782175725163e-06, + "loss": 0.1011, + "step": 3290 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 0.9460805917169135, + "learning_rate": 9.97205900391162e-06, + "loss": 0.1001, + "step": 3300 + }, + { + "epoch": 0.3922962962962963, + "grad_norm": 1.0433150340303297, + "learning_rate": 9.971326377300062e-06, + "loss": 0.1028, + "step": 3310 + }, + { + "epoch": 0.3934814814814815, + "grad_norm": 0.8522650880102192, + "learning_rate": 9.970584297283643e-06, + "loss": 0.0936, + "step": 3320 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 1.041438417145197, + "learning_rate": 9.96983276527349e-06, + "loss": 0.0969, + "step": 3330 + }, + { + "epoch": 0.39585185185185184, + "grad_norm": 0.9513231505108326, + "learning_rate": 9.969071782698704e-06, + "loss": 0.0958, + "step": 3340 + }, + { + "epoch": 0.397037037037037, + "grad_norm": 1.113472336755939, + "learning_rate": 9.968301351006366e-06, + "loss": 0.1008, + "step": 3350 + }, + { + "epoch": 0.3982222222222222, + "grad_norm": 0.9170457331942715, + "learning_rate": 9.967521471661511e-06, + "loss": 0.0937, + "step": 3360 + }, + { + "epoch": 0.39940740740740743, + "grad_norm": 0.9925503933520124, + "learning_rate": 9.96673214614715e-06, + "loss": 0.1018, + "step": 3370 + }, + { + "epoch": 0.4005925925925926, + "grad_norm": 1.208167708969931, + "learning_rate": 9.965933375964252e-06, + "loss": 0.1015, + "step": 3380 + }, + { + "epoch": 0.4017777777777778, + "grad_norm": 0.9111547528969846, + "learning_rate": 9.965125162631748e-06, + "loss": 0.0977, + "step": 3390 + }, + { + "epoch": 0.40296296296296297, + "grad_norm": 1.0594293752341197, + "learning_rate": 9.964307507686525e-06, + "loss": 0.0928, + "step": 3400 + }, + { + "epoch": 0.40414814814814815, + "grad_norm": 1.02941041489853, + "learning_rate": 9.963480412683424e-06, + "loss": 0.0933, + "step": 3410 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.9864449378353636, + "learning_rate": 9.96264387919524e-06, + "loss": 0.0917, + "step": 3420 + }, + { + "epoch": 0.4065185185185185, + "grad_norm": 0.783215057552359, + "learning_rate": 9.961797908812708e-06, + "loss": 0.0908, + "step": 3430 + }, + { + "epoch": 0.4077037037037037, + "grad_norm": 1.046952903947465, + "learning_rate": 9.960942503144518e-06, + "loss": 0.0978, + "step": 3440 + }, + { + "epoch": 0.4088888888888889, + "grad_norm": 0.9699815953951667, + "learning_rate": 9.960077663817295e-06, + "loss": 0.0964, + "step": 3450 + }, + { + "epoch": 0.4100740740740741, + "grad_norm": 1.1811835472073076, + "learning_rate": 9.959203392475609e-06, + "loss": 0.0989, + "step": 3460 + }, + { + "epoch": 0.41125925925925927, + "grad_norm": 0.7810165988147362, + "learning_rate": 9.958319690781956e-06, + "loss": 0.0919, + "step": 3470 + }, + { + "epoch": 0.41244444444444445, + "grad_norm": 0.9336680156088368, + "learning_rate": 9.957426560416776e-06, + "loss": 0.0918, + "step": 3480 + }, + { + "epoch": 0.4136296296296296, + "grad_norm": 0.8919293395755065, + "learning_rate": 9.956524003078432e-06, + "loss": 0.0981, + "step": 3490 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 0.8925853133374531, + "learning_rate": 9.955612020483215e-06, + "loss": 0.0918, + "step": 3500 + }, + { + "epoch": 0.416, + "grad_norm": 0.9238403024531354, + "learning_rate": 9.954690614365337e-06, + "loss": 0.0967, + "step": 3510 + }, + { + "epoch": 0.41718518518518516, + "grad_norm": 0.9155565229092409, + "learning_rate": 9.95375978647693e-06, + "loss": 0.0959, + "step": 3520 + }, + { + "epoch": 0.4183703703703704, + "grad_norm": 0.929056384983427, + "learning_rate": 9.952819538588045e-06, + "loss": 0.0934, + "step": 3530 + }, + { + "epoch": 0.41955555555555557, + "grad_norm": 0.9734398329498284, + "learning_rate": 9.951869872486644e-06, + "loss": 0.0907, + "step": 3540 + }, + { + "epoch": 0.42074074074074075, + "grad_norm": 0.9646738159673482, + "learning_rate": 9.950910789978599e-06, + "loss": 0.0941, + "step": 3550 + }, + { + "epoch": 0.4219259259259259, + "grad_norm": 0.815005018709483, + "learning_rate": 9.949942292887689e-06, + "loss": 0.094, + "step": 3560 + }, + { + "epoch": 0.4231111111111111, + "grad_norm": 0.8944551210796629, + "learning_rate": 9.948964383055592e-06, + "loss": 0.0941, + "step": 3570 + }, + { + "epoch": 0.4242962962962963, + "grad_norm": 1.1965964638289905, + "learning_rate": 9.94797706234189e-06, + "loss": 0.0939, + "step": 3580 + }, + { + "epoch": 0.42548148148148146, + "grad_norm": 0.8975630609016324, + "learning_rate": 9.946980332624057e-06, + "loss": 0.0964, + "step": 3590 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.9429045517527573, + "learning_rate": 9.94597419579746e-06, + "loss": 0.0913, + "step": 3600 + }, + { + "epoch": 0.42785185185185187, + "grad_norm": 0.9952465782659273, + "learning_rate": 9.944958653775356e-06, + "loss": 0.0898, + "step": 3610 + }, + { + "epoch": 0.42903703703703705, + "grad_norm": 0.980068236567743, + "learning_rate": 9.943933708488883e-06, + "loss": 0.0881, + "step": 3620 + }, + { + "epoch": 0.43022222222222223, + "grad_norm": 0.9972782331025953, + "learning_rate": 9.942899361887066e-06, + "loss": 0.087, + "step": 3630 + }, + { + "epoch": 0.4314074074074074, + "grad_norm": 0.893079670752483, + "learning_rate": 9.941855615936803e-06, + "loss": 0.0891, + "step": 3640 + }, + { + "epoch": 0.4325925925925926, + "grad_norm": 0.8515826738127601, + "learning_rate": 9.940802472622865e-06, + "loss": 0.0811, + "step": 3650 + }, + { + "epoch": 0.43377777777777776, + "grad_norm": 1.000710629615497, + "learning_rate": 9.939739933947898e-06, + "loss": 0.0906, + "step": 3660 + }, + { + "epoch": 0.43496296296296294, + "grad_norm": 0.8992833245157053, + "learning_rate": 9.938668001932408e-06, + "loss": 0.0873, + "step": 3670 + }, + { + "epoch": 0.4361481481481482, + "grad_norm": 0.9765751087128967, + "learning_rate": 9.937586678614765e-06, + "loss": 0.0865, + "step": 3680 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.9354682815023043, + "learning_rate": 9.936495966051204e-06, + "loss": 0.0927, + "step": 3690 + }, + { + "epoch": 0.43851851851851853, + "grad_norm": 0.7584790594891406, + "learning_rate": 9.9353958663158e-06, + "loss": 0.0945, + "step": 3700 + }, + { + "epoch": 0.4397037037037037, + "grad_norm": 1.0239273300550789, + "learning_rate": 9.934286381500494e-06, + "loss": 0.0875, + "step": 3710 + }, + { + "epoch": 0.4408888888888889, + "grad_norm": 0.95376512919744, + "learning_rate": 9.933167513715065e-06, + "loss": 0.0872, + "step": 3720 + }, + { + "epoch": 0.44207407407407406, + "grad_norm": 0.9280934572790283, + "learning_rate": 9.932039265087137e-06, + "loss": 0.0946, + "step": 3730 + }, + { + "epoch": 0.44325925925925924, + "grad_norm": 0.9442662624740906, + "learning_rate": 9.93090163776217e-06, + "loss": 0.0995, + "step": 3740 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.8801583739667498, + "learning_rate": 9.929754633903461e-06, + "loss": 0.0894, + "step": 3750 + }, + { + "epoch": 0.44562962962962965, + "grad_norm": 0.8510209114055628, + "learning_rate": 9.92859825569214e-06, + "loss": 0.0847, + "step": 3760 + }, + { + "epoch": 0.44681481481481483, + "grad_norm": 1.023114735895868, + "learning_rate": 9.927432505327153e-06, + "loss": 0.0849, + "step": 3770 + }, + { + "epoch": 0.448, + "grad_norm": 1.1407286029311243, + "learning_rate": 9.92625738502528e-06, + "loss": 0.0889, + "step": 3780 + }, + { + "epoch": 0.4491851851851852, + "grad_norm": 0.8609055870236172, + "learning_rate": 9.925072897021111e-06, + "loss": 0.091, + "step": 3790 + }, + { + "epoch": 0.45037037037037037, + "grad_norm": 0.8582580844008023, + "learning_rate": 9.923879043567052e-06, + "loss": 0.0918, + "step": 3800 + }, + { + "epoch": 0.45155555555555554, + "grad_norm": 0.9980105984965385, + "learning_rate": 9.922675826933319e-06, + "loss": 0.0869, + "step": 3810 + }, + { + "epoch": 0.4527407407407407, + "grad_norm": 0.8638435969966359, + "learning_rate": 9.921463249407932e-06, + "loss": 0.0862, + "step": 3820 + }, + { + "epoch": 0.4539259259259259, + "grad_norm": 0.8725754025573971, + "learning_rate": 9.92024131329671e-06, + "loss": 0.0847, + "step": 3830 + }, + { + "epoch": 0.45511111111111113, + "grad_norm": 1.0397721165449088, + "learning_rate": 9.919010020923269e-06, + "loss": 0.0825, + "step": 3840 + }, + { + "epoch": 0.4562962962962963, + "grad_norm": 1.0489467179584535, + "learning_rate": 9.917769374629022e-06, + "loss": 0.0828, + "step": 3850 + }, + { + "epoch": 0.4574814814814815, + "grad_norm": 0.7953383373238736, + "learning_rate": 9.916519376773161e-06, + "loss": 0.0837, + "step": 3860 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.9340192162940949, + "learning_rate": 9.915260029732664e-06, + "loss": 0.083, + "step": 3870 + }, + { + "epoch": 0.45985185185185184, + "grad_norm": 0.9650531833970395, + "learning_rate": 9.913991335902292e-06, + "loss": 0.0895, + "step": 3880 + }, + { + "epoch": 0.461037037037037, + "grad_norm": 0.8063007229691221, + "learning_rate": 9.912713297694569e-06, + "loss": 0.0843, + "step": 3890 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 1.178073999705309, + "learning_rate": 9.911425917539798e-06, + "loss": 0.0833, + "step": 3900 + }, + { + "epoch": 0.46340740740740743, + "grad_norm": 1.1009253004923545, + "learning_rate": 9.910129197886044e-06, + "loss": 0.0766, + "step": 3910 + }, + { + "epoch": 0.4645925925925926, + "grad_norm": 1.0402442425262375, + "learning_rate": 9.90882314119913e-06, + "loss": 0.0832, + "step": 3920 + }, + { + "epoch": 0.4657777777777778, + "grad_norm": 1.1830787967537757, + "learning_rate": 9.907507749962636e-06, + "loss": 0.0836, + "step": 3930 + }, + { + "epoch": 0.46696296296296297, + "grad_norm": 1.0321106423065325, + "learning_rate": 9.90618302667789e-06, + "loss": 0.0861, + "step": 3940 + }, + { + "epoch": 0.46814814814814815, + "grad_norm": 0.8422245987541108, + "learning_rate": 9.90484897386397e-06, + "loss": 0.0848, + "step": 3950 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 1.149098291761147, + "learning_rate": 9.90350559405769e-06, + "loss": 0.0825, + "step": 3960 + }, + { + "epoch": 0.4705185185185185, + "grad_norm": 0.8207440542003209, + "learning_rate": 9.902152889813602e-06, + "loss": 0.0856, + "step": 3970 + }, + { + "epoch": 0.4717037037037037, + "grad_norm": 0.849458620432329, + "learning_rate": 9.90079086370399e-06, + "loss": 0.0785, + "step": 3980 + }, + { + "epoch": 0.4728888888888889, + "grad_norm": 0.9602453060621781, + "learning_rate": 9.899419518318865e-06, + "loss": 0.0826, + "step": 3990 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 0.9699078220349924, + "learning_rate": 9.898038856265957e-06, + "loss": 0.0765, + "step": 4000 + }, + { + "epoch": 0.47525925925925927, + "grad_norm": 0.9269160394434455, + "learning_rate": 9.896648880170713e-06, + "loss": 0.0761, + "step": 4010 + }, + { + "epoch": 0.47644444444444445, + "grad_norm": 0.9595874361318213, + "learning_rate": 9.895249592676294e-06, + "loss": 0.0866, + "step": 4020 + }, + { + "epoch": 0.4776296296296296, + "grad_norm": 1.013893515884627, + "learning_rate": 9.893840996443565e-06, + "loss": 0.0813, + "step": 4030 + }, + { + "epoch": 0.4788148148148148, + "grad_norm": 0.9863207818786465, + "learning_rate": 9.892423094151093e-06, + "loss": 0.0794, + "step": 4040 + }, + { + "epoch": 0.48, + "grad_norm": 0.9412206085311899, + "learning_rate": 9.890995888495141e-06, + "loss": 0.0852, + "step": 4050 + }, + { + "epoch": 0.48118518518518516, + "grad_norm": 1.0081837206717523, + "learning_rate": 9.889559382189662e-06, + "loss": 0.0796, + "step": 4060 + }, + { + "epoch": 0.4823703703703704, + "grad_norm": 0.8860577605037484, + "learning_rate": 9.8881135779663e-06, + "loss": 0.0789, + "step": 4070 + }, + { + "epoch": 0.48355555555555557, + "grad_norm": 1.033554009759029, + "learning_rate": 9.88665847857437e-06, + "loss": 0.08, + "step": 4080 + }, + { + "epoch": 0.48474074074074075, + "grad_norm": 0.7724560658073316, + "learning_rate": 9.885194086780875e-06, + "loss": 0.0776, + "step": 4090 + }, + { + "epoch": 0.48592592592592593, + "grad_norm": 0.9190066206516513, + "learning_rate": 9.88372040537048e-06, + "loss": 0.0792, + "step": 4100 + }, + { + "epoch": 0.4871111111111111, + "grad_norm": 0.9470868574408026, + "learning_rate": 9.882237437145515e-06, + "loss": 0.0813, + "step": 4110 + }, + { + "epoch": 0.4882962962962963, + "grad_norm": 0.9132243251555825, + "learning_rate": 9.880745184925974e-06, + "loss": 0.0802, + "step": 4120 + }, + { + "epoch": 0.48948148148148146, + "grad_norm": 0.7498355906497279, + "learning_rate": 9.879243651549501e-06, + "loss": 0.0826, + "step": 4130 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.9813564916356462, + "learning_rate": 9.877732839871393e-06, + "loss": 0.0836, + "step": 4140 + }, + { + "epoch": 0.4918518518518519, + "grad_norm": 1.0202496294432721, + "learning_rate": 9.876212752764587e-06, + "loss": 0.0827, + "step": 4150 + }, + { + "epoch": 0.49303703703703705, + "grad_norm": 0.6887806634020146, + "learning_rate": 9.87468339311966e-06, + "loss": 0.0814, + "step": 4160 + }, + { + "epoch": 0.49422222222222223, + "grad_norm": 0.9911837485493776, + "learning_rate": 9.873144763844822e-06, + "loss": 0.0828, + "step": 4170 + }, + { + "epoch": 0.4954074074074074, + "grad_norm": 0.8535687053230085, + "learning_rate": 9.871596867865907e-06, + "loss": 0.0805, + "step": 4180 + }, + { + "epoch": 0.4965925925925926, + "grad_norm": 0.8389560557766649, + "learning_rate": 9.870039708126371e-06, + "loss": 0.0843, + "step": 4190 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 0.9831913727447303, + "learning_rate": 9.868473287587293e-06, + "loss": 0.0738, + "step": 4200 + }, + { + "epoch": 0.49896296296296294, + "grad_norm": 1.0868062927283855, + "learning_rate": 9.86689760922735e-06, + "loss": 0.0734, + "step": 4210 + }, + { + "epoch": 0.5001481481481481, + "grad_norm": 0.9553313952317861, + "learning_rate": 9.865312676042835e-06, + "loss": 0.0761, + "step": 4220 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.9784945464770984, + "learning_rate": 9.863718491047632e-06, + "loss": 0.0734, + "step": 4230 + }, + { + "epoch": 0.5025185185185185, + "grad_norm": 0.9822855413933417, + "learning_rate": 9.86211505727322e-06, + "loss": 0.0793, + "step": 4240 + }, + { + "epoch": 0.5037037037037037, + "grad_norm": 1.002768570268345, + "learning_rate": 9.86050237776867e-06, + "loss": 0.0829, + "step": 4250 + }, + { + "epoch": 0.5048888888888889, + "grad_norm": 1.03445820054938, + "learning_rate": 9.858880455600628e-06, + "loss": 0.079, + "step": 4260 + }, + { + "epoch": 0.5060740740740741, + "grad_norm": 0.7330098398348979, + "learning_rate": 9.857249293853319e-06, + "loss": 0.0736, + "step": 4270 + }, + { + "epoch": 0.5072592592592593, + "grad_norm": 0.8999801573580161, + "learning_rate": 9.855608895628538e-06, + "loss": 0.0765, + "step": 4280 + }, + { + "epoch": 0.5084444444444445, + "grad_norm": 0.7743613315019119, + "learning_rate": 9.853959264045642e-06, + "loss": 0.077, + "step": 4290 + }, + { + "epoch": 0.5096296296296297, + "grad_norm": 0.8167829336455701, + "learning_rate": 9.852300402241551e-06, + "loss": 0.0764, + "step": 4300 + }, + { + "epoch": 0.5108148148148148, + "grad_norm": 0.873016590690606, + "learning_rate": 9.85063231337073e-06, + "loss": 0.081, + "step": 4310 + }, + { + "epoch": 0.512, + "grad_norm": 1.0481225878861797, + "learning_rate": 9.848955000605192e-06, + "loss": 0.0758, + "step": 4320 + }, + { + "epoch": 0.5131851851851852, + "grad_norm": 0.883091147782801, + "learning_rate": 9.847268467134497e-06, + "loss": 0.0756, + "step": 4330 + }, + { + "epoch": 0.5143703703703704, + "grad_norm": 0.8675043761813808, + "learning_rate": 9.845572716165728e-06, + "loss": 0.0801, + "step": 4340 + }, + { + "epoch": 0.5155555555555555, + "grad_norm": 0.883382820831632, + "learning_rate": 9.843867750923506e-06, + "loss": 0.0787, + "step": 4350 + }, + { + "epoch": 0.5167407407407407, + "grad_norm": 0.9241815851948217, + "learning_rate": 9.842153574649966e-06, + "loss": 0.0834, + "step": 4360 + }, + { + "epoch": 0.5179259259259259, + "grad_norm": 0.9695709706938929, + "learning_rate": 9.840430190604761e-06, + "loss": 0.0792, + "step": 4370 + }, + { + "epoch": 0.5191111111111111, + "grad_norm": 0.8993586706182487, + "learning_rate": 9.838697602065059e-06, + "loss": 0.0792, + "step": 4380 + }, + { + "epoch": 0.5202962962962963, + "grad_norm": 0.9858558254838163, + "learning_rate": 9.836955812325521e-06, + "loss": 0.0799, + "step": 4390 + }, + { + "epoch": 0.5214814814814814, + "grad_norm": 0.9147900647179666, + "learning_rate": 9.835204824698313e-06, + "loss": 0.0767, + "step": 4400 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.9007220208807277, + "learning_rate": 9.833444642513086e-06, + "loss": 0.0804, + "step": 4410 + }, + { + "epoch": 0.5238518518518519, + "grad_norm": 0.9657440658284951, + "learning_rate": 9.831675269116981e-06, + "loss": 0.0766, + "step": 4420 + }, + { + "epoch": 0.5250370370370371, + "grad_norm": 1.0775364005766082, + "learning_rate": 9.829896707874612e-06, + "loss": 0.077, + "step": 4430 + }, + { + "epoch": 0.5262222222222223, + "grad_norm": 0.9246103362627596, + "learning_rate": 9.828108962168066e-06, + "loss": 0.0751, + "step": 4440 + }, + { + "epoch": 0.5274074074074074, + "grad_norm": 0.7364814793955189, + "learning_rate": 9.826312035396896e-06, + "loss": 0.0746, + "step": 4450 + }, + { + "epoch": 0.5285925925925926, + "grad_norm": 0.8317457616605274, + "learning_rate": 9.824505930978113e-06, + "loss": 0.0747, + "step": 4460 + }, + { + "epoch": 0.5297777777777778, + "grad_norm": 0.919623028819872, + "learning_rate": 9.822690652346178e-06, + "loss": 0.072, + "step": 4470 + }, + { + "epoch": 0.530962962962963, + "grad_norm": 0.9798604849866279, + "learning_rate": 9.820866202953004e-06, + "loss": 0.0812, + "step": 4480 + }, + { + "epoch": 0.5321481481481481, + "grad_norm": 0.776329098381582, + "learning_rate": 9.819032586267933e-06, + "loss": 0.0739, + "step": 4490 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.8161182743896854, + "learning_rate": 9.81718980577775e-06, + "loss": 0.0722, + "step": 4500 + }, + { + "epoch": 0.5345185185185185, + "grad_norm": 0.7671593687404197, + "learning_rate": 9.815337864986656e-06, + "loss": 0.0691, + "step": 4510 + }, + { + "epoch": 0.5357037037037037, + "grad_norm": 0.848051450203711, + "learning_rate": 9.813476767416278e-06, + "loss": 0.082, + "step": 4520 + }, + { + "epoch": 0.5368888888888889, + "grad_norm": 0.7336951432286606, + "learning_rate": 9.811606516605655e-06, + "loss": 0.0739, + "step": 4530 + }, + { + "epoch": 0.538074074074074, + "grad_norm": 0.8032350179251622, + "learning_rate": 9.809727116111225e-06, + "loss": 0.0677, + "step": 4540 + }, + { + "epoch": 0.5392592592592592, + "grad_norm": 0.8493424748321329, + "learning_rate": 9.807838569506834e-06, + "loss": 0.0816, + "step": 4550 + }, + { + "epoch": 0.5404444444444444, + "grad_norm": 1.1252355367667275, + "learning_rate": 9.805940880383716e-06, + "loss": 0.0709, + "step": 4560 + }, + { + "epoch": 0.5416296296296297, + "grad_norm": 0.8202722453794303, + "learning_rate": 9.804034052350488e-06, + "loss": 0.0705, + "step": 4570 + }, + { + "epoch": 0.5428148148148149, + "grad_norm": 0.9841648666857978, + "learning_rate": 9.802118089033147e-06, + "loss": 0.0796, + "step": 4580 + }, + { + "epoch": 0.544, + "grad_norm": 0.9170390653163952, + "learning_rate": 9.800192994075064e-06, + "loss": 0.0779, + "step": 4590 + }, + { + "epoch": 0.5451851851851852, + "grad_norm": 0.9349672850027937, + "learning_rate": 9.798258771136973e-06, + "loss": 0.0701, + "step": 4600 + }, + { + "epoch": 0.5463703703703704, + "grad_norm": 0.8796945469030365, + "learning_rate": 9.796315423896963e-06, + "loss": 0.0714, + "step": 4610 + }, + { + "epoch": 0.5475555555555556, + "grad_norm": 0.8088398284880879, + "learning_rate": 9.794362956050479e-06, + "loss": 0.0739, + "step": 4620 + }, + { + "epoch": 0.5487407407407408, + "grad_norm": 0.8677937210696124, + "learning_rate": 9.792401371310305e-06, + "loss": 0.0697, + "step": 4630 + }, + { + "epoch": 0.5499259259259259, + "grad_norm": 0.7349109114121267, + "learning_rate": 9.79043067340656e-06, + "loss": 0.0696, + "step": 4640 + }, + { + "epoch": 0.5511111111111111, + "grad_norm": 0.9034556703048897, + "learning_rate": 9.788450866086702e-06, + "loss": 0.0696, + "step": 4650 + }, + { + "epoch": 0.5522962962962963, + "grad_norm": 0.8167429360416598, + "learning_rate": 9.786461953115503e-06, + "loss": 0.0695, + "step": 4660 + }, + { + "epoch": 0.5534814814814815, + "grad_norm": 0.9066277909499421, + "learning_rate": 9.784463938275048e-06, + "loss": 0.0725, + "step": 4670 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.9478972121779468, + "learning_rate": 9.78245682536474e-06, + "loss": 0.0675, + "step": 4680 + }, + { + "epoch": 0.5558518518518518, + "grad_norm": 0.9942604137795122, + "learning_rate": 9.780440618201272e-06, + "loss": 0.0732, + "step": 4690 + }, + { + "epoch": 0.557037037037037, + "grad_norm": 0.7786745953110245, + "learning_rate": 9.778415320618637e-06, + "loss": 0.0734, + "step": 4700 + }, + { + "epoch": 0.5582222222222222, + "grad_norm": 0.8077620341914629, + "learning_rate": 9.776380936468116e-06, + "loss": 0.0696, + "step": 4710 + }, + { + "epoch": 0.5594074074074074, + "grad_norm": 0.8404086072361571, + "learning_rate": 9.77433746961826e-06, + "loss": 0.0714, + "step": 4720 + }, + { + "epoch": 0.5605925925925926, + "grad_norm": 0.7837106722079612, + "learning_rate": 9.7722849239549e-06, + "loss": 0.0687, + "step": 4730 + }, + { + "epoch": 0.5617777777777778, + "grad_norm": 0.7414209083076281, + "learning_rate": 9.770223303381128e-06, + "loss": 0.0756, + "step": 4740 + }, + { + "epoch": 0.562962962962963, + "grad_norm": 0.9782075391281517, + "learning_rate": 9.768152611817293e-06, + "loss": 0.0708, + "step": 4750 + }, + { + "epoch": 0.5641481481481482, + "grad_norm": 0.9312208904384014, + "learning_rate": 9.76607285320099e-06, + "loss": 0.0671, + "step": 4760 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.9306729907565611, + "learning_rate": 9.763984031487065e-06, + "loss": 0.066, + "step": 4770 + }, + { + "epoch": 0.5665185185185185, + "grad_norm": 0.8345756945549867, + "learning_rate": 9.761886150647588e-06, + "loss": 0.0668, + "step": 4780 + }, + { + "epoch": 0.5677037037037037, + "grad_norm": 0.7504187948360455, + "learning_rate": 9.759779214671861e-06, + "loss": 0.0704, + "step": 4790 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 0.9164332260694077, + "learning_rate": 9.757663227566404e-06, + "loss": 0.0688, + "step": 4800 + }, + { + "epoch": 0.5700740740740741, + "grad_norm": 0.9344952704716404, + "learning_rate": 9.755538193354949e-06, + "loss": 0.0657, + "step": 4810 + }, + { + "epoch": 0.5712592592592592, + "grad_norm": 0.8379251538993587, + "learning_rate": 9.753404116078432e-06, + "loss": 0.0654, + "step": 4820 + }, + { + "epoch": 0.5724444444444444, + "grad_norm": 0.8247241787909821, + "learning_rate": 9.751260999794982e-06, + "loss": 0.0707, + "step": 4830 + }, + { + "epoch": 0.5736296296296296, + "grad_norm": 0.8336991393348129, + "learning_rate": 9.74910884857992e-06, + "loss": 0.0674, + "step": 4840 + }, + { + "epoch": 0.5748148148148148, + "grad_norm": 0.9535592029047301, + "learning_rate": 9.74694766652575e-06, + "loss": 0.0719, + "step": 4850 + }, + { + "epoch": 0.576, + "grad_norm": 0.9352346617374387, + "learning_rate": 9.74477745774214e-06, + "loss": 0.073, + "step": 4860 + }, + { + "epoch": 0.5771851851851851, + "grad_norm": 0.6953612203906957, + "learning_rate": 9.742598226355933e-06, + "loss": 0.0666, + "step": 4870 + }, + { + "epoch": 0.5783703703703704, + "grad_norm": 0.8851143259469123, + "learning_rate": 9.740409976511126e-06, + "loss": 0.0691, + "step": 4880 + }, + { + "epoch": 0.5795555555555556, + "grad_norm": 1.162132947056599, + "learning_rate": 9.738212712368858e-06, + "loss": 0.0737, + "step": 4890 + }, + { + "epoch": 0.5807407407407408, + "grad_norm": 0.842069037597675, + "learning_rate": 9.736006438107422e-06, + "loss": 0.0615, + "step": 4900 + }, + { + "epoch": 0.581925925925926, + "grad_norm": 1.0245882027540885, + "learning_rate": 9.733791157922234e-06, + "loss": 0.0687, + "step": 4910 + }, + { + "epoch": 0.5831111111111111, + "grad_norm": 0.7621347191636244, + "learning_rate": 9.731566876025844e-06, + "loss": 0.0681, + "step": 4920 + }, + { + "epoch": 0.5842962962962963, + "grad_norm": 0.8124096002928971, + "learning_rate": 9.729333596647915e-06, + "loss": 0.0672, + "step": 4930 + }, + { + "epoch": 0.5854814814814815, + "grad_norm": 0.8258468290284263, + "learning_rate": 9.727091324035216e-06, + "loss": 0.0611, + "step": 4940 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 1.0141042479424183, + "learning_rate": 9.724840062451624e-06, + "loss": 0.0678, + "step": 4950 + }, + { + "epoch": 0.5878518518518518, + "grad_norm": 0.8893839668094795, + "learning_rate": 9.722579816178107e-06, + "loss": 0.0715, + "step": 4960 + }, + { + "epoch": 0.589037037037037, + "grad_norm": 0.8344641638577025, + "learning_rate": 9.720310589512715e-06, + "loss": 0.0664, + "step": 4970 + }, + { + "epoch": 0.5902222222222222, + "grad_norm": 0.7879889786909607, + "learning_rate": 9.718032386770582e-06, + "loss": 0.0688, + "step": 4980 + }, + { + "epoch": 0.5914074074074074, + "grad_norm": 0.7925280765836924, + "learning_rate": 9.715745212283904e-06, + "loss": 0.0675, + "step": 4990 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.9260270033769327, + "learning_rate": 9.713449070401941e-06, + "loss": 0.0664, + "step": 5000 + }, + { + "epoch": 0.5937777777777777, + "grad_norm": 0.9446444431575735, + "learning_rate": 9.711143965491003e-06, + "loss": 0.0703, + "step": 5010 + }, + { + "epoch": 0.5949629629629629, + "grad_norm": 1.017346089777206, + "learning_rate": 9.708829901934447e-06, + "loss": 0.063, + "step": 5020 + }, + { + "epoch": 0.5961481481481481, + "grad_norm": 0.8271272833663433, + "learning_rate": 9.70650688413266e-06, + "loss": 0.0664, + "step": 5030 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.6636146814498104, + "learning_rate": 9.704174916503068e-06, + "loss": 0.0593, + "step": 5040 + }, + { + "epoch": 0.5985185185185186, + "grad_norm": 0.8554269321734951, + "learning_rate": 9.701834003480101e-06, + "loss": 0.0659, + "step": 5050 + }, + { + "epoch": 0.5997037037037037, + "grad_norm": 0.786425376868664, + "learning_rate": 9.699484149515209e-06, + "loss": 0.0623, + "step": 5060 + }, + { + "epoch": 0.6008888888888889, + "grad_norm": 0.9158010551102281, + "learning_rate": 9.697125359076842e-06, + "loss": 0.0655, + "step": 5070 + }, + { + "epoch": 0.6020740740740741, + "grad_norm": 0.8378371030609353, + "learning_rate": 9.69475763665044e-06, + "loss": 0.0653, + "step": 5080 + }, + { + "epoch": 0.6032592592592593, + "grad_norm": 0.8771324341656606, + "learning_rate": 9.692380986738437e-06, + "loss": 0.0679, + "step": 5090 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 0.890772082009701, + "learning_rate": 9.689995413860232e-06, + "loss": 0.0681, + "step": 5100 + }, + { + "epoch": 0.6056296296296296, + "grad_norm": 0.6840219596828194, + "learning_rate": 9.6876009225522e-06, + "loss": 0.0602, + "step": 5110 + }, + { + "epoch": 0.6068148148148148, + "grad_norm": 0.8918993932105872, + "learning_rate": 9.68519751736767e-06, + "loss": 0.0668, + "step": 5120 + }, + { + "epoch": 0.608, + "grad_norm": 0.7424429000679742, + "learning_rate": 9.682785202876926e-06, + "loss": 0.0688, + "step": 5130 + }, + { + "epoch": 0.6091851851851852, + "grad_norm": 0.706883039822154, + "learning_rate": 9.680363983667188e-06, + "loss": 0.0618, + "step": 5140 + }, + { + "epoch": 0.6103703703703703, + "grad_norm": 0.7982272736582623, + "learning_rate": 9.677933864342617e-06, + "loss": 0.0613, + "step": 5150 + }, + { + "epoch": 0.6115555555555555, + "grad_norm": 0.8670836748387671, + "learning_rate": 9.67549484952429e-06, + "loss": 0.0631, + "step": 5160 + }, + { + "epoch": 0.6127407407407407, + "grad_norm": 0.7421959033877474, + "learning_rate": 9.673046943850209e-06, + "loss": 0.0618, + "step": 5170 + }, + { + "epoch": 0.6139259259259259, + "grad_norm": 0.8029090636060535, + "learning_rate": 9.67059015197527e-06, + "loss": 0.064, + "step": 5180 + }, + { + "epoch": 0.6151111111111112, + "grad_norm": 0.8685429063722038, + "learning_rate": 9.66812447857128e-06, + "loss": 0.0643, + "step": 5190 + }, + { + "epoch": 0.6162962962962963, + "grad_norm": 0.860826831107572, + "learning_rate": 9.665649928326928e-06, + "loss": 0.0674, + "step": 5200 + }, + { + "epoch": 0.6174814814814815, + "grad_norm": 0.9326389724348948, + "learning_rate": 9.663166505947782e-06, + "loss": 0.0631, + "step": 5210 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.8840921628765807, + "learning_rate": 9.660674216156285e-06, + "loss": 0.0644, + "step": 5220 + }, + { + "epoch": 0.6198518518518519, + "grad_norm": 0.8092167255352415, + "learning_rate": 9.65817306369174e-06, + "loss": 0.0649, + "step": 5230 + }, + { + "epoch": 0.621037037037037, + "grad_norm": 1.0602310809354665, + "learning_rate": 9.655663053310304e-06, + "loss": 0.0673, + "step": 5240 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.8539612776032047, + "learning_rate": 9.653144189784977e-06, + "loss": 0.0652, + "step": 5250 + }, + { + "epoch": 0.6234074074074074, + "grad_norm": 0.9071711891564866, + "learning_rate": 9.650616477905595e-06, + "loss": 0.068, + "step": 5260 + }, + { + "epoch": 0.6245925925925926, + "grad_norm": 0.9274017159065672, + "learning_rate": 9.648079922478822e-06, + "loss": 0.0641, + "step": 5270 + }, + { + "epoch": 0.6257777777777778, + "grad_norm": 0.7406516271738932, + "learning_rate": 9.645534528328131e-06, + "loss": 0.0641, + "step": 5280 + }, + { + "epoch": 0.6269629629629629, + "grad_norm": 0.7465702088105352, + "learning_rate": 9.642980300293814e-06, + "loss": 0.0588, + "step": 5290 + }, + { + "epoch": 0.6281481481481481, + "grad_norm": 0.9777056163743169, + "learning_rate": 9.640417243232951e-06, + "loss": 0.0613, + "step": 5300 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.6294669971038668, + "learning_rate": 9.637845362019418e-06, + "loss": 0.0651, + "step": 5310 + }, + { + "epoch": 0.6305185185185185, + "grad_norm": 0.8809505962495335, + "learning_rate": 9.635264661543867e-06, + "loss": 0.0643, + "step": 5320 + }, + { + "epoch": 0.6317037037037037, + "grad_norm": 0.8631695011826251, + "learning_rate": 9.632675146713723e-06, + "loss": 0.0689, + "step": 5330 + }, + { + "epoch": 0.6328888888888888, + "grad_norm": 0.7907244282209989, + "learning_rate": 9.630076822453171e-06, + "loss": 0.0616, + "step": 5340 + }, + { + "epoch": 0.6340740740740741, + "grad_norm": 0.8973103523088067, + "learning_rate": 9.627469693703149e-06, + "loss": 0.0617, + "step": 5350 + }, + { + "epoch": 0.6352592592592593, + "grad_norm": 0.9475571187902337, + "learning_rate": 9.624853765421334e-06, + "loss": 0.0656, + "step": 5360 + }, + { + "epoch": 0.6364444444444445, + "grad_norm": 1.0157216339964799, + "learning_rate": 9.62222904258214e-06, + "loss": 0.0601, + "step": 5370 + }, + { + "epoch": 0.6376296296296297, + "grad_norm": 0.9974306627276354, + "learning_rate": 9.619595530176707e-06, + "loss": 0.0679, + "step": 5380 + }, + { + "epoch": 0.6388148148148148, + "grad_norm": 0.7794751178790458, + "learning_rate": 9.61695323321288e-06, + "loss": 0.0597, + "step": 5390 + }, + { + "epoch": 0.64, + "grad_norm": 0.8625675136062335, + "learning_rate": 9.614302156715214e-06, + "loss": 0.0629, + "step": 5400 + }, + { + "epoch": 0.6411851851851852, + "grad_norm": 0.9282341396493997, + "learning_rate": 9.611642305724965e-06, + "loss": 0.0643, + "step": 5410 + }, + { + "epoch": 0.6423703703703704, + "grad_norm": 1.065934769988668, + "learning_rate": 9.608973685300063e-06, + "loss": 0.0632, + "step": 5420 + }, + { + "epoch": 0.6435555555555555, + "grad_norm": 0.8388865553494513, + "learning_rate": 9.606296300515122e-06, + "loss": 0.0664, + "step": 5430 + }, + { + "epoch": 0.6447407407407407, + "grad_norm": 0.8548715708167248, + "learning_rate": 9.603610156461415e-06, + "loss": 0.0627, + "step": 5440 + }, + { + "epoch": 0.6459259259259259, + "grad_norm": 0.854245229233928, + "learning_rate": 9.600915258246884e-06, + "loss": 0.0626, + "step": 5450 + }, + { + "epoch": 0.6471111111111111, + "grad_norm": 0.8890221944599039, + "learning_rate": 9.598211610996104e-06, + "loss": 0.0604, + "step": 5460 + }, + { + "epoch": 0.6482962962962963, + "grad_norm": 0.8734622308738257, + "learning_rate": 9.595499219850295e-06, + "loss": 0.0616, + "step": 5470 + }, + { + "epoch": 0.6494814814814814, + "grad_norm": 0.7953487650138661, + "learning_rate": 9.5927780899673e-06, + "loss": 0.0606, + "step": 5480 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.6840634039323507, + "learning_rate": 9.590048226521587e-06, + "loss": 0.0657, + "step": 5490 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 0.744927913566696, + "learning_rate": 9.587309634704219e-06, + "loss": 0.0605, + "step": 5500 + }, + { + "epoch": 0.6530370370370371, + "grad_norm": 0.8445654490129404, + "learning_rate": 9.584562319722868e-06, + "loss": 0.0603, + "step": 5510 + }, + { + "epoch": 0.6542222222222223, + "grad_norm": 0.8275336436590583, + "learning_rate": 9.58180628680179e-06, + "loss": 0.0601, + "step": 5520 + }, + { + "epoch": 0.6554074074074074, + "grad_norm": 0.9520322283595853, + "learning_rate": 9.579041541181816e-06, + "loss": 0.0604, + "step": 5530 + }, + { + "epoch": 0.6565925925925926, + "grad_norm": 0.816709186599619, + "learning_rate": 9.576268088120354e-06, + "loss": 0.0585, + "step": 5540 + }, + { + "epoch": 0.6577777777777778, + "grad_norm": 0.7421032800977814, + "learning_rate": 9.573485932891356e-06, + "loss": 0.062, + "step": 5550 + }, + { + "epoch": 0.658962962962963, + "grad_norm": 0.7457383791808052, + "learning_rate": 9.570695080785333e-06, + "loss": 0.0557, + "step": 5560 + }, + { + "epoch": 0.6601481481481482, + "grad_norm": 0.738277067476009, + "learning_rate": 9.567895537109331e-06, + "loss": 0.0633, + "step": 5570 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.7311474578253814, + "learning_rate": 9.56508730718692e-06, + "loss": 0.0618, + "step": 5580 + }, + { + "epoch": 0.6625185185185185, + "grad_norm": 0.6122012090202148, + "learning_rate": 9.562270396358196e-06, + "loss": 0.0582, + "step": 5590 + }, + { + "epoch": 0.6637037037037037, + "grad_norm": 0.6528632820523848, + "learning_rate": 9.559444809979754e-06, + "loss": 0.0589, + "step": 5600 + }, + { + "epoch": 0.6648888888888889, + "grad_norm": 0.7232679190902909, + "learning_rate": 9.556610553424692e-06, + "loss": 0.061, + "step": 5610 + }, + { + "epoch": 0.666074074074074, + "grad_norm": 0.7414024599892046, + "learning_rate": 9.553767632082588e-06, + "loss": 0.0622, + "step": 5620 + }, + { + "epoch": 0.6672592592592592, + "grad_norm": 0.965476370726474, + "learning_rate": 9.550916051359506e-06, + "loss": 0.0598, + "step": 5630 + }, + { + "epoch": 0.6684444444444444, + "grad_norm": 0.8379623119712973, + "learning_rate": 9.548055816677971e-06, + "loss": 0.0557, + "step": 5640 + }, + { + "epoch": 0.6696296296296296, + "grad_norm": 0.8585776947199584, + "learning_rate": 9.545186933476964e-06, + "loss": 0.062, + "step": 5650 + }, + { + "epoch": 0.6708148148148149, + "grad_norm": 0.7795756023660245, + "learning_rate": 9.542309407211914e-06, + "loss": 0.0527, + "step": 5660 + }, + { + "epoch": 0.672, + "grad_norm": 0.778102865818906, + "learning_rate": 9.539423243354687e-06, + "loss": 0.063, + "step": 5670 + }, + { + "epoch": 0.6731851851851852, + "grad_norm": 0.7211796374650427, + "learning_rate": 9.536528447393568e-06, + "loss": 0.0607, + "step": 5680 + }, + { + "epoch": 0.6743703703703704, + "grad_norm": 0.9290785924371383, + "learning_rate": 9.533625024833264e-06, + "loss": 0.0592, + "step": 5690 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 0.8015010809685191, + "learning_rate": 9.53071298119488e-06, + "loss": 0.0584, + "step": 5700 + }, + { + "epoch": 0.6767407407407408, + "grad_norm": 0.7634909938889708, + "learning_rate": 9.527792322015918e-06, + "loss": 0.0632, + "step": 5710 + }, + { + "epoch": 0.6779259259259259, + "grad_norm": 0.7005879890355755, + "learning_rate": 9.524863052850266e-06, + "loss": 0.0596, + "step": 5720 + }, + { + "epoch": 0.6791111111111111, + "grad_norm": 0.7967970526813604, + "learning_rate": 9.521925179268178e-06, + "loss": 0.0607, + "step": 5730 + }, + { + "epoch": 0.6802962962962963, + "grad_norm": 0.9061828273780305, + "learning_rate": 9.518978706856275e-06, + "loss": 0.064, + "step": 5740 + }, + { + "epoch": 0.6814814814814815, + "grad_norm": 0.8325329934176016, + "learning_rate": 9.516023641217527e-06, + "loss": 0.054, + "step": 5750 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.9647904060386737, + "learning_rate": 9.513059987971245e-06, + "loss": 0.058, + "step": 5760 + }, + { + "epoch": 0.6838518518518518, + "grad_norm": 0.6361162714731206, + "learning_rate": 9.510087752753073e-06, + "loss": 0.0598, + "step": 5770 + }, + { + "epoch": 0.685037037037037, + "grad_norm": 0.6651870206764534, + "learning_rate": 9.507106941214968e-06, + "loss": 0.0602, + "step": 5780 + }, + { + "epoch": 0.6862222222222222, + "grad_norm": 0.879620483371766, + "learning_rate": 9.504117559025204e-06, + "loss": 0.0607, + "step": 5790 + }, + { + "epoch": 0.6874074074074074, + "grad_norm": 0.6384981281798937, + "learning_rate": 9.501119611868346e-06, + "loss": 0.0552, + "step": 5800 + }, + { + "epoch": 0.6885925925925926, + "grad_norm": 0.8148090652077672, + "learning_rate": 9.49811310544525e-06, + "loss": 0.0517, + "step": 5810 + }, + { + "epoch": 0.6897777777777778, + "grad_norm": 0.6261831834625272, + "learning_rate": 9.495098045473043e-06, + "loss": 0.0576, + "step": 5820 + }, + { + "epoch": 0.690962962962963, + "grad_norm": 0.750739861082205, + "learning_rate": 9.492074437685126e-06, + "loss": 0.0593, + "step": 5830 + }, + { + "epoch": 0.6921481481481482, + "grad_norm": 0.6235390264038503, + "learning_rate": 9.489042287831147e-06, + "loss": 0.0601, + "step": 5840 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.6721873408161235, + "learning_rate": 9.486001601677e-06, + "loss": 0.0567, + "step": 5850 + }, + { + "epoch": 0.6945185185185185, + "grad_norm": 0.7956788166241592, + "learning_rate": 9.482952385004809e-06, + "loss": 0.0613, + "step": 5860 + }, + { + "epoch": 0.6957037037037037, + "grad_norm": 0.7240587880855817, + "learning_rate": 9.479894643612926e-06, + "loss": 0.0586, + "step": 5870 + }, + { + "epoch": 0.6968888888888889, + "grad_norm": 0.5945392493921163, + "learning_rate": 9.476828383315907e-06, + "loss": 0.0562, + "step": 5880 + }, + { + "epoch": 0.6980740740740741, + "grad_norm": 0.7210743449557837, + "learning_rate": 9.47375360994451e-06, + "loss": 0.054, + "step": 5890 + }, + { + "epoch": 0.6992592592592592, + "grad_norm": 0.8592775365194965, + "learning_rate": 9.470670329345682e-06, + "loss": 0.0602, + "step": 5900 + }, + { + "epoch": 0.7004444444444444, + "grad_norm": 0.7680147220283731, + "learning_rate": 9.467578547382545e-06, + "loss": 0.0604, + "step": 5910 + }, + { + "epoch": 0.7016296296296296, + "grad_norm": 0.930965909733838, + "learning_rate": 9.464478269934391e-06, + "loss": 0.0597, + "step": 5920 + }, + { + "epoch": 0.7028148148148148, + "grad_norm": 0.8465943226365384, + "learning_rate": 9.46136950289666e-06, + "loss": 0.0564, + "step": 5930 + }, + { + "epoch": 0.704, + "grad_norm": 0.8230315120793787, + "learning_rate": 9.458252252180944e-06, + "loss": 0.0568, + "step": 5940 + }, + { + "epoch": 0.7051851851851851, + "grad_norm": 0.7940851445240408, + "learning_rate": 9.455126523714962e-06, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.7063703703703703, + "grad_norm": 0.9072359763710295, + "learning_rate": 9.451992323442557e-06, + "loss": 0.062, + "step": 5960 + }, + { + "epoch": 0.7075555555555556, + "grad_norm": 0.8995851727063644, + "learning_rate": 9.448849657323675e-06, + "loss": 0.0586, + "step": 5970 + }, + { + "epoch": 0.7087407407407408, + "grad_norm": 0.7970966212102009, + "learning_rate": 9.445698531334374e-06, + "loss": 0.0583, + "step": 5980 + }, + { + "epoch": 0.709925925925926, + "grad_norm": 1.0101655604060642, + "learning_rate": 9.442538951466786e-06, + "loss": 0.0535, + "step": 5990 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.8269495414481755, + "learning_rate": 9.439370923729124e-06, + "loss": 0.0565, + "step": 6000 + }, + { + "epoch": 0.7122962962962963, + "grad_norm": 0.9010994255221992, + "learning_rate": 9.43619445414567e-06, + "loss": 0.0604, + "step": 6010 + }, + { + "epoch": 0.7134814814814815, + "grad_norm": 0.6082660417089522, + "learning_rate": 9.433009548756746e-06, + "loss": 0.0578, + "step": 6020 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.9597698816797973, + "learning_rate": 9.429816213618732e-06, + "loss": 0.0576, + "step": 6030 + }, + { + "epoch": 0.7158518518518519, + "grad_norm": 0.7970908735161462, + "learning_rate": 9.426614454804026e-06, + "loss": 0.0608, + "step": 6040 + }, + { + "epoch": 0.717037037037037, + "grad_norm": 0.7567449036108198, + "learning_rate": 9.423404278401047e-06, + "loss": 0.0542, + "step": 6050 + }, + { + "epoch": 0.7182222222222222, + "grad_norm": 0.8624178532393584, + "learning_rate": 9.420185690514222e-06, + "loss": 0.0542, + "step": 6060 + }, + { + "epoch": 0.7194074074074074, + "grad_norm": 0.6619442585433717, + "learning_rate": 9.416958697263976e-06, + "loss": 0.0592, + "step": 6070 + }, + { + "epoch": 0.7205925925925926, + "grad_norm": 0.8855994863229733, + "learning_rate": 9.413723304786709e-06, + "loss": 0.0579, + "step": 6080 + }, + { + "epoch": 0.7217777777777777, + "grad_norm": 0.8505929203748652, + "learning_rate": 9.410479519234803e-06, + "loss": 0.0537, + "step": 6090 + }, + { + "epoch": 0.7229629629629629, + "grad_norm": 0.9112279837425967, + "learning_rate": 9.407227346776592e-06, + "loss": 0.0554, + "step": 6100 + }, + { + "epoch": 0.7241481481481481, + "grad_norm": 1.0305418820556183, + "learning_rate": 9.403966793596363e-06, + "loss": 0.0582, + "step": 6110 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.7249236387943062, + "learning_rate": 9.40069786589434e-06, + "loss": 0.0602, + "step": 6120 + }, + { + "epoch": 0.7265185185185186, + "grad_norm": 0.8867392695144658, + "learning_rate": 9.397420569886666e-06, + "loss": 0.0596, + "step": 6130 + }, + { + "epoch": 0.7277037037037037, + "grad_norm": 0.666491885927328, + "learning_rate": 9.394134911805406e-06, + "loss": 0.0565, + "step": 6140 + }, + { + "epoch": 0.7288888888888889, + "grad_norm": 0.7448308217658117, + "learning_rate": 9.390840897898519e-06, + "loss": 0.0547, + "step": 6150 + }, + { + "epoch": 0.7300740740740741, + "grad_norm": 0.8831469556013009, + "learning_rate": 9.387538534429856e-06, + "loss": 0.0596, + "step": 6160 + }, + { + "epoch": 0.7312592592592593, + "grad_norm": 0.7929783753785901, + "learning_rate": 9.384227827679147e-06, + "loss": 0.0566, + "step": 6170 + }, + { + "epoch": 0.7324444444444445, + "grad_norm": 0.7885426096978996, + "learning_rate": 9.380908783941985e-06, + "loss": 0.0593, + "step": 6180 + }, + { + "epoch": 0.7336296296296296, + "grad_norm": 0.757031908528628, + "learning_rate": 9.377581409529814e-06, + "loss": 0.0557, + "step": 6190 + }, + { + "epoch": 0.7348148148148148, + "grad_norm": 0.663216098080713, + "learning_rate": 9.37424571076993e-06, + "loss": 0.053, + "step": 6200 + }, + { + "epoch": 0.736, + "grad_norm": 1.0290396618979418, + "learning_rate": 9.370901694005444e-06, + "loss": 0.0617, + "step": 6210 + }, + { + "epoch": 0.7371851851851852, + "grad_norm": 0.69489688132048, + "learning_rate": 9.367549365595294e-06, + "loss": 0.052, + "step": 6220 + }, + { + "epoch": 0.7383703703703703, + "grad_norm": 0.6671955448491652, + "learning_rate": 9.36418873191422e-06, + "loss": 0.0535, + "step": 6230 + }, + { + "epoch": 0.7395555555555555, + "grad_norm": 0.6981340903029717, + "learning_rate": 9.36081979935276e-06, + "loss": 0.0558, + "step": 6240 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.8665073691852571, + "learning_rate": 9.357442574317227e-06, + "loss": 0.0608, + "step": 6250 + }, + { + "epoch": 0.7419259259259259, + "grad_norm": 0.6724644707186217, + "learning_rate": 9.354057063229703e-06, + "loss": 0.0609, + "step": 6260 + }, + { + "epoch": 0.7431111111111111, + "grad_norm": 0.8985662275435259, + "learning_rate": 9.350663272528032e-06, + "loss": 0.0576, + "step": 6270 + }, + { + "epoch": 0.7442962962962963, + "grad_norm": 0.7712484506703242, + "learning_rate": 9.347261208665795e-06, + "loss": 0.0558, + "step": 6280 + }, + { + "epoch": 0.7454814814814815, + "grad_norm": 0.7735773295388982, + "learning_rate": 9.343850878112313e-06, + "loss": 0.0533, + "step": 6290 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.7736002632612686, + "learning_rate": 9.340432287352621e-06, + "loss": 0.0541, + "step": 6300 + }, + { + "epoch": 0.7478518518518519, + "grad_norm": 0.6405293657691001, + "learning_rate": 9.337005442887464e-06, + "loss": 0.0551, + "step": 6310 + }, + { + "epoch": 0.7490370370370371, + "grad_norm": 0.8045092278431043, + "learning_rate": 9.33357035123328e-06, + "loss": 0.055, + "step": 6320 + }, + { + "epoch": 0.7502222222222222, + "grad_norm": 0.7938763565949336, + "learning_rate": 9.330127018922195e-06, + "loss": 0.0524, + "step": 6330 + }, + { + "epoch": 0.7514074074074074, + "grad_norm": 0.684880227632773, + "learning_rate": 9.326675452501997e-06, + "loss": 0.0508, + "step": 6340 + }, + { + "epoch": 0.7525925925925926, + "grad_norm": 0.8629082246317131, + "learning_rate": 9.323215658536141e-06, + "loss": 0.0567, + "step": 6350 + }, + { + "epoch": 0.7537777777777778, + "grad_norm": 0.9056361754163381, + "learning_rate": 9.319747643603721e-06, + "loss": 0.0521, + "step": 6360 + }, + { + "epoch": 0.754962962962963, + "grad_norm": 1.0007224812329896, + "learning_rate": 9.316271414299464e-06, + "loss": 0.0551, + "step": 6370 + }, + { + "epoch": 0.7561481481481481, + "grad_norm": 0.8027475866803074, + "learning_rate": 9.312786977233722e-06, + "loss": 0.0567, + "step": 6380 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.8213424296886762, + "learning_rate": 9.309294339032451e-06, + "loss": 0.0568, + "step": 6390 + }, + { + "epoch": 0.7585185185185185, + "grad_norm": 0.7220031603090468, + "learning_rate": 9.305793506337205e-06, + "loss": 0.0542, + "step": 6400 + }, + { + "epoch": 0.7597037037037037, + "grad_norm": 0.7961328175054693, + "learning_rate": 9.302284485805114e-06, + "loss": 0.0539, + "step": 6410 + }, + { + "epoch": 0.7608888888888888, + "grad_norm": 0.8014918176680914, + "learning_rate": 9.298767284108884e-06, + "loss": 0.0501, + "step": 6420 + }, + { + "epoch": 0.7620740740740741, + "grad_norm": 0.6541465638138857, + "learning_rate": 9.295241907936779e-06, + "loss": 0.0546, + "step": 6430 + }, + { + "epoch": 0.7632592592592593, + "grad_norm": 0.7442494535751223, + "learning_rate": 9.291708363992602e-06, + "loss": 0.0565, + "step": 6440 + }, + { + "epoch": 0.7644444444444445, + "grad_norm": 0.7306920048504332, + "learning_rate": 9.288166658995694e-06, + "loss": 0.0564, + "step": 6450 + }, + { + "epoch": 0.7656296296296297, + "grad_norm": 0.8457138315981726, + "learning_rate": 9.284616799680912e-06, + "loss": 0.0561, + "step": 6460 + }, + { + "epoch": 0.7668148148148148, + "grad_norm": 0.8152511601451743, + "learning_rate": 9.281058792798615e-06, + "loss": 0.0521, + "step": 6470 + }, + { + "epoch": 0.768, + "grad_norm": 0.9140146783731141, + "learning_rate": 9.277492645114662e-06, + "loss": 0.055, + "step": 6480 + }, + { + "epoch": 0.7691851851851852, + "grad_norm": 0.7887229450791052, + "learning_rate": 9.273918363410391e-06, + "loss": 0.0573, + "step": 6490 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 0.7957294730122749, + "learning_rate": 9.270335954482601e-06, + "loss": 0.05, + "step": 6500 + }, + { + "epoch": 0.7715555555555556, + "grad_norm": 0.6453171802853228, + "learning_rate": 9.266745425143556e-06, + "loss": 0.0557, + "step": 6510 + }, + { + "epoch": 0.7727407407407407, + "grad_norm": 0.6905322452593332, + "learning_rate": 9.263146782220956e-06, + "loss": 0.0529, + "step": 6520 + }, + { + "epoch": 0.7739259259259259, + "grad_norm": 1.015803321745992, + "learning_rate": 9.259540032557927e-06, + "loss": 0.0555, + "step": 6530 + }, + { + "epoch": 0.7751111111111111, + "grad_norm": 0.8156885650789713, + "learning_rate": 9.255925183013016e-06, + "loss": 0.0549, + "step": 6540 + }, + { + "epoch": 0.7762962962962963, + "grad_norm": 0.8153343743055421, + "learning_rate": 9.25230224046017e-06, + "loss": 0.0541, + "step": 6550 + }, + { + "epoch": 0.7774814814814814, + "grad_norm": 0.8836847487251168, + "learning_rate": 9.248671211788727e-06, + "loss": 0.0548, + "step": 6560 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.6607179937708231, + "learning_rate": 9.2450321039034e-06, + "loss": 0.0514, + "step": 6570 + }, + { + "epoch": 0.7798518518518519, + "grad_norm": 0.6578535058879177, + "learning_rate": 9.241384923724263e-06, + "loss": 0.0543, + "step": 6580 + }, + { + "epoch": 0.7810370370370371, + "grad_norm": 0.9149304817403267, + "learning_rate": 9.237729678186747e-06, + "loss": 0.0548, + "step": 6590 + }, + { + "epoch": 0.7822222222222223, + "grad_norm": 0.6708209980140788, + "learning_rate": 9.234066374241615e-06, + "loss": 0.0483, + "step": 6600 + }, + { + "epoch": 0.7834074074074074, + "grad_norm": 0.6969816513837376, + "learning_rate": 9.230395018854952e-06, + "loss": 0.0512, + "step": 6610 + }, + { + "epoch": 0.7845925925925926, + "grad_norm": 0.7745192902944739, + "learning_rate": 9.22671561900816e-06, + "loss": 0.0566, + "step": 6620 + }, + { + "epoch": 0.7857777777777778, + "grad_norm": 0.7785851625639385, + "learning_rate": 9.22302818169793e-06, + "loss": 0.0524, + "step": 6630 + }, + { + "epoch": 0.786962962962963, + "grad_norm": 0.7498071517671575, + "learning_rate": 9.219332713936247e-06, + "loss": 0.0509, + "step": 6640 + }, + { + "epoch": 0.7881481481481482, + "grad_norm": 0.6689264312381505, + "learning_rate": 9.215629222750356e-06, + "loss": 0.0556, + "step": 6650 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 1.0352593385270017, + "learning_rate": 9.211917715182766e-06, + "loss": 0.0536, + "step": 6660 + }, + { + "epoch": 0.7905185185185185, + "grad_norm": 0.7877134132861506, + "learning_rate": 9.208198198291227e-06, + "loss": 0.0502, + "step": 6670 + }, + { + "epoch": 0.7917037037037037, + "grad_norm": 0.7863702253596683, + "learning_rate": 9.204470679148721e-06, + "loss": 0.055, + "step": 6680 + }, + { + "epoch": 0.7928888888888889, + "grad_norm": 0.7892872655036743, + "learning_rate": 9.200735164843447e-06, + "loss": 0.0542, + "step": 6690 + }, + { + "epoch": 0.794074074074074, + "grad_norm": 0.9191341736448989, + "learning_rate": 9.196991662478807e-06, + "loss": 0.0488, + "step": 6700 + }, + { + "epoch": 0.7952592592592592, + "grad_norm": 0.8580919800959579, + "learning_rate": 9.193240179173392e-06, + "loss": 0.0568, + "step": 6710 + }, + { + "epoch": 0.7964444444444444, + "grad_norm": 0.6818725359696742, + "learning_rate": 9.18948072206097e-06, + "loss": 0.0555, + "step": 6720 + }, + { + "epoch": 0.7976296296296296, + "grad_norm": 0.7314317034608175, + "learning_rate": 9.185713298290475e-06, + "loss": 0.0529, + "step": 6730 + }, + { + "epoch": 0.7988148148148149, + "grad_norm": 0.7009943012989422, + "learning_rate": 9.181937915025985e-06, + "loss": 0.0521, + "step": 6740 + }, + { + "epoch": 0.8, + "grad_norm": 0.6782388236944448, + "learning_rate": 9.178154579446713e-06, + "loss": 0.0528, + "step": 6750 + }, + { + "epoch": 0.8011851851851852, + "grad_norm": 0.593549728311959, + "learning_rate": 9.174363298747005e-06, + "loss": 0.05, + "step": 6760 + }, + { + "epoch": 0.8023703703703704, + "grad_norm": 0.5511569981075917, + "learning_rate": 9.170564080136301e-06, + "loss": 0.0535, + "step": 6770 + }, + { + "epoch": 0.8035555555555556, + "grad_norm": 0.8915113836903726, + "learning_rate": 9.166756930839144e-06, + "loss": 0.0539, + "step": 6780 + }, + { + "epoch": 0.8047407407407408, + "grad_norm": 0.8222043889389753, + "learning_rate": 9.162941858095156e-06, + "loss": 0.0519, + "step": 6790 + }, + { + "epoch": 0.8059259259259259, + "grad_norm": 0.8015643043748327, + "learning_rate": 9.159118869159028e-06, + "loss": 0.049, + "step": 6800 + }, + { + "epoch": 0.8071111111111111, + "grad_norm": 0.676674015435569, + "learning_rate": 9.155287971300498e-06, + "loss": 0.0525, + "step": 6810 + }, + { + "epoch": 0.8082962962962963, + "grad_norm": 0.6182134346053748, + "learning_rate": 9.151449171804353e-06, + "loss": 0.0495, + "step": 6820 + }, + { + "epoch": 0.8094814814814815, + "grad_norm": 0.753963849192247, + "learning_rate": 9.147602477970396e-06, + "loss": 0.0529, + "step": 6830 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 1.2623181497669773, + "learning_rate": 9.143747897113449e-06, + "loss": 0.0499, + "step": 6840 + }, + { + "epoch": 0.8118518518518518, + "grad_norm": 0.606596694809615, + "learning_rate": 9.139885436563328e-06, + "loss": 0.0527, + "step": 6850 + }, + { + "epoch": 0.813037037037037, + "grad_norm": 0.8800625178574711, + "learning_rate": 9.136015103664835e-06, + "loss": 0.0486, + "step": 6860 + }, + { + "epoch": 0.8142222222222222, + "grad_norm": 0.5807539928925268, + "learning_rate": 9.132136905777742e-06, + "loss": 0.0493, + "step": 6870 + }, + { + "epoch": 0.8154074074074074, + "grad_norm": 0.6103744082962793, + "learning_rate": 9.128250850276774e-06, + "loss": 0.0549, + "step": 6880 + }, + { + "epoch": 0.8165925925925926, + "grad_norm": 0.8239434055661777, + "learning_rate": 9.1243569445516e-06, + "loss": 0.0491, + "step": 6890 + }, + { + "epoch": 0.8177777777777778, + "grad_norm": 0.7240930108958012, + "learning_rate": 9.120455196006816e-06, + "loss": 0.0555, + "step": 6900 + }, + { + "epoch": 0.818962962962963, + "grad_norm": 0.87815896354831, + "learning_rate": 9.116545612061935e-06, + "loss": 0.0518, + "step": 6910 + }, + { + "epoch": 0.8201481481481482, + "grad_norm": 0.605418961984683, + "learning_rate": 9.112628200151366e-06, + "loss": 0.0519, + "step": 6920 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.8333390981607123, + "learning_rate": 9.108702967724407e-06, + "loss": 0.0502, + "step": 6930 + }, + { + "epoch": 0.8225185185185185, + "grad_norm": 0.8088889987293015, + "learning_rate": 9.10476992224522e-06, + "loss": 0.0533, + "step": 6940 + }, + { + "epoch": 0.8237037037037037, + "grad_norm": 0.8696568790843807, + "learning_rate": 9.100829071192837e-06, + "loss": 0.0529, + "step": 6950 + }, + { + "epoch": 0.8248888888888889, + "grad_norm": 0.7485286739334551, + "learning_rate": 9.096880422061116e-06, + "loss": 0.0555, + "step": 6960 + }, + { + "epoch": 0.8260740740740741, + "grad_norm": 0.7312342760803509, + "learning_rate": 9.09292398235876e-06, + "loss": 0.0487, + "step": 6970 + }, + { + "epoch": 0.8272592592592592, + "grad_norm": 0.9295771493515899, + "learning_rate": 9.088959759609278e-06, + "loss": 0.0516, + "step": 6980 + }, + { + "epoch": 0.8284444444444444, + "grad_norm": 0.7163715080606454, + "learning_rate": 9.08498776135098e-06, + "loss": 0.0515, + "step": 6990 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 0.7376926223849972, + "learning_rate": 9.081007995136964e-06, + "loss": 0.0546, + "step": 7000 + }, + { + "epoch": 0.8308148148148148, + "grad_norm": 0.7977251816096612, + "learning_rate": 9.077020468535093e-06, + "loss": 0.0512, + "step": 7010 + }, + { + "epoch": 0.832, + "grad_norm": 0.7462915664446347, + "learning_rate": 9.073025189128e-06, + "loss": 0.0557, + "step": 7020 + }, + { + "epoch": 0.8331851851851851, + "grad_norm": 0.7898958529684861, + "learning_rate": 9.069022164513044e-06, + "loss": 0.0542, + "step": 7030 + }, + { + "epoch": 0.8343703703703703, + "grad_norm": 0.8555896594741408, + "learning_rate": 9.065011402302327e-06, + "loss": 0.0538, + "step": 7040 + }, + { + "epoch": 0.8355555555555556, + "grad_norm": 0.6064430590889784, + "learning_rate": 9.060992910122656e-06, + "loss": 0.0487, + "step": 7050 + }, + { + "epoch": 0.8367407407407408, + "grad_norm": 0.7421301453000255, + "learning_rate": 9.05696669561554e-06, + "loss": 0.0499, + "step": 7060 + }, + { + "epoch": 0.837925925925926, + "grad_norm": 0.7454771545342748, + "learning_rate": 9.052932766437173e-06, + "loss": 0.0484, + "step": 7070 + }, + { + "epoch": 0.8391111111111111, + "grad_norm": 0.5933470949265227, + "learning_rate": 9.048891130258417e-06, + "loss": 0.0503, + "step": 7080 + }, + { + "epoch": 0.8402962962962963, + "grad_norm": 0.8018042199892028, + "learning_rate": 9.044841794764791e-06, + "loss": 0.0523, + "step": 7090 + }, + { + "epoch": 0.8414814814814815, + "grad_norm": 0.8384064614127551, + "learning_rate": 9.040784767656456e-06, + "loss": 0.0543, + "step": 7100 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.8268764329768402, + "learning_rate": 9.036720056648197e-06, + "loss": 0.0498, + "step": 7110 + }, + { + "epoch": 0.8438518518518519, + "grad_norm": 0.6490970755638987, + "learning_rate": 9.032647669469413e-06, + "loss": 0.055, + "step": 7120 + }, + { + "epoch": 0.845037037037037, + "grad_norm": 0.7124008450897088, + "learning_rate": 9.028567613864098e-06, + "loss": 0.0485, + "step": 7130 + }, + { + "epoch": 0.8462222222222222, + "grad_norm": 0.6883514513585333, + "learning_rate": 9.024479897590828e-06, + "loss": 0.0491, + "step": 7140 + }, + { + "epoch": 0.8474074074074074, + "grad_norm": 0.7562654756583114, + "learning_rate": 9.020384528422748e-06, + "loss": 0.0505, + "step": 7150 + }, + { + "epoch": 0.8485925925925926, + "grad_norm": 0.6423243603145317, + "learning_rate": 9.016281514147556e-06, + "loss": 0.0507, + "step": 7160 + }, + { + "epoch": 0.8497777777777777, + "grad_norm": 0.6163777727950482, + "learning_rate": 9.012170862567485e-06, + "loss": 0.0514, + "step": 7170 + }, + { + "epoch": 0.8509629629629629, + "grad_norm": 0.7436762834591052, + "learning_rate": 9.008052581499294e-06, + "loss": 0.0511, + "step": 7180 + }, + { + "epoch": 0.8521481481481481, + "grad_norm": 0.6176416871335711, + "learning_rate": 9.003926678774246e-06, + "loss": 0.0474, + "step": 7190 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.7159250656813492, + "learning_rate": 8.999793162238105e-06, + "loss": 0.0501, + "step": 7200 + }, + { + "epoch": 0.8545185185185186, + "grad_norm": 0.7201514984437982, + "learning_rate": 8.995652039751103e-06, + "loss": 0.0523, + "step": 7210 + }, + { + "epoch": 0.8557037037037037, + "grad_norm": 0.9470419359503877, + "learning_rate": 8.991503319187944e-06, + "loss": 0.0512, + "step": 7220 + }, + { + "epoch": 0.8568888888888889, + "grad_norm": 0.6849997793505873, + "learning_rate": 8.987347008437776e-06, + "loss": 0.048, + "step": 7230 + }, + { + "epoch": 0.8580740740740741, + "grad_norm": 0.8198002348707787, + "learning_rate": 8.983183115404181e-06, + "loss": 0.0489, + "step": 7240 + }, + { + "epoch": 0.8592592592592593, + "grad_norm": 0.7173565076463503, + "learning_rate": 8.979011648005163e-06, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.8604444444444445, + "grad_norm": 0.7974066627644381, + "learning_rate": 8.97483261417312e-06, + "loss": 0.0464, + "step": 7260 + }, + { + "epoch": 0.8616296296296296, + "grad_norm": 0.6868280015809256, + "learning_rate": 8.970646021854854e-06, + "loss": 0.0482, + "step": 7270 + }, + { + "epoch": 0.8628148148148148, + "grad_norm": 0.5837185009688398, + "learning_rate": 8.96645187901152e-06, + "loss": 0.0529, + "step": 7280 + }, + { + "epoch": 0.864, + "grad_norm": 0.5882804649176211, + "learning_rate": 8.962250193618649e-06, + "loss": 0.0505, + "step": 7290 + }, + { + "epoch": 0.8651851851851852, + "grad_norm": 0.7250899181586955, + "learning_rate": 8.958040973666102e-06, + "loss": 0.0464, + "step": 7300 + }, + { + "epoch": 0.8663703703703703, + "grad_norm": 0.7274661399152923, + "learning_rate": 8.95382422715808e-06, + "loss": 0.0523, + "step": 7310 + }, + { + "epoch": 0.8675555555555555, + "grad_norm": 0.6305577005737798, + "learning_rate": 8.94959996211308e-06, + "loss": 0.0501, + "step": 7320 + }, + { + "epoch": 0.8687407407407407, + "grad_norm": 0.7384466431817232, + "learning_rate": 8.945368186563913e-06, + "loss": 0.0481, + "step": 7330 + }, + { + "epoch": 0.8699259259259259, + "grad_norm": 1.0061465676366388, + "learning_rate": 8.94112890855766e-06, + "loss": 0.0484, + "step": 7340 + }, + { + "epoch": 0.8711111111111111, + "grad_norm": 0.9107812967480566, + "learning_rate": 8.936882136155676e-06, + "loss": 0.053, + "step": 7350 + }, + { + "epoch": 0.8722962962962963, + "grad_norm": 0.7831745304784927, + "learning_rate": 8.932627877433561e-06, + "loss": 0.0476, + "step": 7360 + }, + { + "epoch": 0.8734814814814815, + "grad_norm": 0.5933278625535517, + "learning_rate": 8.928366140481159e-06, + "loss": 0.0459, + "step": 7370 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.7309816650853034, + "learning_rate": 8.924096933402524e-06, + "loss": 0.0518, + "step": 7380 + }, + { + "epoch": 0.8758518518518519, + "grad_norm": 0.619390213132588, + "learning_rate": 8.919820264315922e-06, + "loss": 0.049, + "step": 7390 + }, + { + "epoch": 0.8770370370370371, + "grad_norm": 0.5938431253568528, + "learning_rate": 8.915536141353808e-06, + "loss": 0.05, + "step": 7400 + }, + { + "epoch": 0.8782222222222222, + "grad_norm": 0.5661113537053553, + "learning_rate": 8.911244572662813e-06, + "loss": 0.0468, + "step": 7410 + }, + { + "epoch": 0.8794074074074074, + "grad_norm": 0.742574427628141, + "learning_rate": 8.90694556640372e-06, + "loss": 0.0474, + "step": 7420 + }, + { + "epoch": 0.8805925925925926, + "grad_norm": 0.7639686698143588, + "learning_rate": 8.90263913075146e-06, + "loss": 0.0476, + "step": 7430 + }, + { + "epoch": 0.8817777777777778, + "grad_norm": 0.6757283735940369, + "learning_rate": 8.898325273895094e-06, + "loss": 0.0448, + "step": 7440 + }, + { + "epoch": 0.882962962962963, + "grad_norm": 0.49264777167027235, + "learning_rate": 8.894004004037788e-06, + "loss": 0.0519, + "step": 7450 + }, + { + "epoch": 0.8841481481481481, + "grad_norm": 0.6126128955995369, + "learning_rate": 8.889675329396812e-06, + "loss": 0.0477, + "step": 7460 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.7592480310161118, + "learning_rate": 8.885339258203511e-06, + "loss": 0.0489, + "step": 7470 + }, + { + "epoch": 0.8865185185185185, + "grad_norm": 0.8556814725564063, + "learning_rate": 8.880995798703299e-06, + "loss": 0.0501, + "step": 7480 + }, + { + "epoch": 0.8877037037037037, + "grad_norm": 0.6562343067417394, + "learning_rate": 8.876644959155635e-06, + "loss": 0.0488, + "step": 7490 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.5545104797264164, + "learning_rate": 8.872286747834018e-06, + "loss": 0.0497, + "step": 7500 + }, + { + "epoch": 0.8900740740740741, + "grad_norm": 0.6434601123386668, + "learning_rate": 8.867921173025959e-06, + "loss": 0.0459, + "step": 7510 + }, + { + "epoch": 0.8912592592592593, + "grad_norm": 0.7344293502208636, + "learning_rate": 8.863548243032977e-06, + "loss": 0.0438, + "step": 7520 + }, + { + "epoch": 0.8924444444444445, + "grad_norm": 0.715815454504353, + "learning_rate": 8.859167966170574e-06, + "loss": 0.0438, + "step": 7530 + }, + { + "epoch": 0.8936296296296297, + "grad_norm": 0.5894308972856338, + "learning_rate": 8.854780350768225e-06, + "loss": 0.0453, + "step": 7540 + }, + { + "epoch": 0.8948148148148148, + "grad_norm": 0.7591298649958523, + "learning_rate": 8.850385405169352e-06, + "loss": 0.0498, + "step": 7550 + }, + { + "epoch": 0.896, + "grad_norm": 0.682421297422223, + "learning_rate": 8.845983137731326e-06, + "loss": 0.0517, + "step": 7560 + }, + { + "epoch": 0.8971851851851852, + "grad_norm": 0.7864052816922432, + "learning_rate": 8.84157355682544e-06, + "loss": 0.0517, + "step": 7570 + }, + { + "epoch": 0.8983703703703704, + "grad_norm": 0.6096071342284556, + "learning_rate": 8.837156670836888e-06, + "loss": 0.0488, + "step": 7580 + }, + { + "epoch": 0.8995555555555556, + "grad_norm": 0.6836862920275314, + "learning_rate": 8.832732488164761e-06, + "loss": 0.0461, + "step": 7590 + }, + { + "epoch": 0.9007407407407407, + "grad_norm": 0.8022104617752468, + "learning_rate": 8.82830101722202e-06, + "loss": 0.0493, + "step": 7600 + }, + { + "epoch": 0.9019259259259259, + "grad_norm": 0.6591289899788176, + "learning_rate": 8.823862266435492e-06, + "loss": 0.0485, + "step": 7610 + }, + { + "epoch": 0.9031111111111111, + "grad_norm": 0.7142607526573105, + "learning_rate": 8.819416244245841e-06, + "loss": 0.0462, + "step": 7620 + }, + { + "epoch": 0.9042962962962963, + "grad_norm": 0.6654038512617686, + "learning_rate": 8.81496295910756e-06, + "loss": 0.0476, + "step": 7630 + }, + { + "epoch": 0.9054814814814814, + "grad_norm": 0.8158569675100661, + "learning_rate": 8.810502419488958e-06, + "loss": 0.0531, + "step": 7640 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.6565042200381191, + "learning_rate": 8.80603463387213e-06, + "loss": 0.0435, + "step": 7650 + }, + { + "epoch": 0.9078518518518518, + "grad_norm": 0.792387147204678, + "learning_rate": 8.80155961075296e-06, + "loss": 0.0457, + "step": 7660 + }, + { + "epoch": 0.9090370370370371, + "grad_norm": 0.6400679949422363, + "learning_rate": 8.797077358641081e-06, + "loss": 0.0491, + "step": 7670 + }, + { + "epoch": 0.9102222222222223, + "grad_norm": 0.8197301724041891, + "learning_rate": 8.792587886059891e-06, + "loss": 0.0524, + "step": 7680 + }, + { + "epoch": 0.9114074074074074, + "grad_norm": 0.6477466730781138, + "learning_rate": 8.788091201546503e-06, + "loss": 0.0464, + "step": 7690 + }, + { + "epoch": 0.9125925925925926, + "grad_norm": 0.6329813257018595, + "learning_rate": 8.78358731365175e-06, + "loss": 0.0435, + "step": 7700 + }, + { + "epoch": 0.9137777777777778, + "grad_norm": 0.6254414507963543, + "learning_rate": 8.779076230940163e-06, + "loss": 0.0458, + "step": 7710 + }, + { + "epoch": 0.914962962962963, + "grad_norm": 0.6207345887037139, + "learning_rate": 8.774557961989955e-06, + "loss": 0.0491, + "step": 7720 + }, + { + "epoch": 0.9161481481481482, + "grad_norm": 0.55946268737263, + "learning_rate": 8.770032515393e-06, + "loss": 0.0472, + "step": 7730 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.6772035858898395, + "learning_rate": 8.765499899754827e-06, + "loss": 0.0452, + "step": 7740 + }, + { + "epoch": 0.9185185185185185, + "grad_norm": 0.8106168155540253, + "learning_rate": 8.760960123694595e-06, + "loss": 0.046, + "step": 7750 + }, + { + "epoch": 0.9197037037037037, + "grad_norm": 0.655821818432701, + "learning_rate": 8.756413195845075e-06, + "loss": 0.0471, + "step": 7760 + }, + { + "epoch": 0.9208888888888889, + "grad_norm": 0.7343075910923443, + "learning_rate": 8.751859124852646e-06, + "loss": 0.0498, + "step": 7770 + }, + { + "epoch": 0.922074074074074, + "grad_norm": 0.9536952913734272, + "learning_rate": 8.747297919377262e-06, + "loss": 0.0498, + "step": 7780 + }, + { + "epoch": 0.9232592592592592, + "grad_norm": 0.6033899003523572, + "learning_rate": 8.74272958809245e-06, + "loss": 0.0447, + "step": 7790 + }, + { + "epoch": 0.9244444444444444, + "grad_norm": 0.8694089163500168, + "learning_rate": 8.738154139685287e-06, + "loss": 0.047, + "step": 7800 + }, + { + "epoch": 0.9256296296296296, + "grad_norm": 0.8019148399219386, + "learning_rate": 8.733571582856378e-06, + "loss": 0.0488, + "step": 7810 + }, + { + "epoch": 0.9268148148148149, + "grad_norm": 0.6246197373968206, + "learning_rate": 8.728981926319851e-06, + "loss": 0.0444, + "step": 7820 + }, + { + "epoch": 0.928, + "grad_norm": 0.7051854547586471, + "learning_rate": 8.72438517880333e-06, + "loss": 0.0481, + "step": 7830 + }, + { + "epoch": 0.9291851851851852, + "grad_norm": 0.4319130166694029, + "learning_rate": 8.719781349047931e-06, + "loss": 0.0445, + "step": 7840 + }, + { + "epoch": 0.9303703703703704, + "grad_norm": 0.8021777134565091, + "learning_rate": 8.715170445808228e-06, + "loss": 0.0479, + "step": 7850 + }, + { + "epoch": 0.9315555555555556, + "grad_norm": 0.5835486406209588, + "learning_rate": 8.710552477852253e-06, + "loss": 0.0471, + "step": 7860 + }, + { + "epoch": 0.9327407407407408, + "grad_norm": 0.6531671902591658, + "learning_rate": 8.705927453961468e-06, + "loss": 0.0474, + "step": 7870 + }, + { + "epoch": 0.9339259259259259, + "grad_norm": 0.7264751248983842, + "learning_rate": 8.701295382930755e-06, + "loss": 0.0491, + "step": 7880 + }, + { + "epoch": 0.9351111111111111, + "grad_norm": 0.9254258478111342, + "learning_rate": 8.696656273568393e-06, + "loss": 0.0496, + "step": 7890 + }, + { + "epoch": 0.9362962962962963, + "grad_norm": 0.4753487468866643, + "learning_rate": 8.692010134696052e-06, + "loss": 0.0404, + "step": 7900 + }, + { + "epoch": 0.9374814814814815, + "grad_norm": 0.8017191576342043, + "learning_rate": 8.687356975148761e-06, + "loss": 0.0528, + "step": 7910 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.6077130308401523, + "learning_rate": 8.682696803774903e-06, + "loss": 0.0482, + "step": 7920 + }, + { + "epoch": 0.9398518518518518, + "grad_norm": 0.5674166803836139, + "learning_rate": 8.6780296294362e-06, + "loss": 0.0503, + "step": 7930 + }, + { + "epoch": 0.941037037037037, + "grad_norm": 0.6766821768704477, + "learning_rate": 8.673355461007679e-06, + "loss": 0.0485, + "step": 7940 + }, + { + "epoch": 0.9422222222222222, + "grad_norm": 0.6210290727879508, + "learning_rate": 8.66867430737768e-06, + "loss": 0.0468, + "step": 7950 + }, + { + "epoch": 0.9434074074074074, + "grad_norm": 0.7572269163731946, + "learning_rate": 8.663986177447812e-06, + "loss": 0.0473, + "step": 7960 + }, + { + "epoch": 0.9445925925925925, + "grad_norm": 0.6710397213819798, + "learning_rate": 8.659291080132963e-06, + "loss": 0.0434, + "step": 7970 + }, + { + "epoch": 0.9457777777777778, + "grad_norm": 0.5927732791729379, + "learning_rate": 8.654589024361264e-06, + "loss": 0.0464, + "step": 7980 + }, + { + "epoch": 0.946962962962963, + "grad_norm": 0.8351921135158358, + "learning_rate": 8.649880019074078e-06, + "loss": 0.0449, + "step": 7990 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 0.5841419481155213, + "learning_rate": 8.64516407322598e-06, + "loss": 0.0426, + "step": 8000 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.9127769457819565, + "learning_rate": 8.640441195784752e-06, + "loss": 0.0473, + "step": 8010 + }, + { + "epoch": 0.9505185185185185, + "grad_norm": 0.7226985293448518, + "learning_rate": 8.63571139573135e-06, + "loss": 0.0419, + "step": 8020 + }, + { + "epoch": 0.9517037037037037, + "grad_norm": 0.5348585822335743, + "learning_rate": 8.630974682059896e-06, + "loss": 0.0487, + "step": 8030 + }, + { + "epoch": 0.9528888888888889, + "grad_norm": 0.6563316829016852, + "learning_rate": 8.626231063777658e-06, + "loss": 0.0495, + "step": 8040 + }, + { + "epoch": 0.9540740740740741, + "grad_norm": 0.7845822751322598, + "learning_rate": 8.621480549905035e-06, + "loss": 0.0497, + "step": 8050 + }, + { + "epoch": 0.9552592592592593, + "grad_norm": 0.471510654237178, + "learning_rate": 8.616723149475536e-06, + "loss": 0.0436, + "step": 8060 + }, + { + "epoch": 0.9564444444444444, + "grad_norm": 0.9184220100048154, + "learning_rate": 8.61195887153577e-06, + "loss": 0.045, + "step": 8070 + }, + { + "epoch": 0.9576296296296296, + "grad_norm": 0.7117067218902718, + "learning_rate": 8.60718772514542e-06, + "loss": 0.0467, + "step": 8080 + }, + { + "epoch": 0.9588148148148148, + "grad_norm": 0.7229676332504311, + "learning_rate": 8.602409719377232e-06, + "loss": 0.0482, + "step": 8090 + }, + { + "epoch": 0.96, + "grad_norm": 0.6510765194228915, + "learning_rate": 8.597624863316996e-06, + "loss": 0.0447, + "step": 8100 + }, + { + "epoch": 0.9611851851851851, + "grad_norm": 0.6372489162445778, + "learning_rate": 8.592833166063525e-06, + "loss": 0.0442, + "step": 8110 + }, + { + "epoch": 0.9623703703703703, + "grad_norm": 0.6806232734100223, + "learning_rate": 8.588034636728644e-06, + "loss": 0.0493, + "step": 8120 + }, + { + "epoch": 0.9635555555555556, + "grad_norm": 0.8375858027466179, + "learning_rate": 8.58322928443717e-06, + "loss": 0.0478, + "step": 8130 + }, + { + "epoch": 0.9647407407407408, + "grad_norm": 0.7239728236966126, + "learning_rate": 8.578417118326897e-06, + "loss": 0.0466, + "step": 8140 + }, + { + "epoch": 0.965925925925926, + "grad_norm": 0.8749584726704697, + "learning_rate": 8.573598147548567e-06, + "loss": 0.0505, + "step": 8150 + }, + { + "epoch": 0.9671111111111111, + "grad_norm": 0.6653107548608621, + "learning_rate": 8.568772381265872e-06, + "loss": 0.0472, + "step": 8160 + }, + { + "epoch": 0.9682962962962963, + "grad_norm": 0.8257121480511849, + "learning_rate": 8.56393982865542e-06, + "loss": 0.0453, + "step": 8170 + }, + { + "epoch": 0.9694814814814815, + "grad_norm": 0.8205803321412547, + "learning_rate": 8.559100498906727e-06, + "loss": 0.0463, + "step": 8180 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.784346843530516, + "learning_rate": 8.554254401222193e-06, + "loss": 0.044, + "step": 8190 + }, + { + "epoch": 0.9718518518518519, + "grad_norm": 0.7513727776757206, + "learning_rate": 8.549401544817092e-06, + "loss": 0.0415, + "step": 8200 + }, + { + "epoch": 0.973037037037037, + "grad_norm": 0.6800157649806621, + "learning_rate": 8.544541938919544e-06, + "loss": 0.0471, + "step": 8210 + }, + { + "epoch": 0.9742222222222222, + "grad_norm": 0.5533104339950371, + "learning_rate": 8.539675592770513e-06, + "loss": 0.0461, + "step": 8220 + }, + { + "epoch": 0.9754074074074074, + "grad_norm": 0.5840384445204609, + "learning_rate": 8.53480251562377e-06, + "loss": 0.0471, + "step": 8230 + }, + { + "epoch": 0.9765925925925926, + "grad_norm": 0.7191931437171187, + "learning_rate": 8.529922716745895e-06, + "loss": 0.0432, + "step": 8240 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.6647808639290215, + "learning_rate": 8.525036205416244e-06, + "loss": 0.042, + "step": 8250 + }, + { + "epoch": 0.9789629629629629, + "grad_norm": 0.8439395557915452, + "learning_rate": 8.520142990926936e-06, + "loss": 0.0468, + "step": 8260 + }, + { + "epoch": 0.9801481481481481, + "grad_norm": 0.7561599265628417, + "learning_rate": 8.515243082582843e-06, + "loss": 0.0482, + "step": 8270 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.4856741634382977, + "learning_rate": 8.51033648970156e-06, + "loss": 0.0448, + "step": 8280 + }, + { + "epoch": 0.9825185185185186, + "grad_norm": 0.7599107869385829, + "learning_rate": 8.505423221613395e-06, + "loss": 0.0441, + "step": 8290 + }, + { + "epoch": 0.9837037037037037, + "grad_norm": 0.6708983626248244, + "learning_rate": 8.500503287661353e-06, + "loss": 0.0447, + "step": 8300 + }, + { + "epoch": 0.9848888888888889, + "grad_norm": 0.9174835152187154, + "learning_rate": 8.495576697201111e-06, + "loss": 0.0462, + "step": 8310 + }, + { + "epoch": 0.9860740740740741, + "grad_norm": 0.8428263231922415, + "learning_rate": 8.490643459601003e-06, + "loss": 0.0459, + "step": 8320 + }, + { + "epoch": 0.9872592592592593, + "grad_norm": 0.8080013338882717, + "learning_rate": 8.485703584242006e-06, + "loss": 0.0442, + "step": 8330 + }, + { + "epoch": 0.9884444444444445, + "grad_norm": 0.7738391502885379, + "learning_rate": 8.480757080517716e-06, + "loss": 0.0445, + "step": 8340 + }, + { + "epoch": 0.9896296296296296, + "grad_norm": 0.5652129230088224, + "learning_rate": 8.47580395783434e-06, + "loss": 0.0427, + "step": 8350 + }, + { + "epoch": 0.9908148148148148, + "grad_norm": 0.6101592040655527, + "learning_rate": 8.470844225610662e-06, + "loss": 0.042, + "step": 8360 + }, + { + "epoch": 0.992, + "grad_norm": 0.6039351651956629, + "learning_rate": 8.465877893278041e-06, + "loss": 0.0425, + "step": 8370 + }, + { + "epoch": 0.9931851851851852, + "grad_norm": 0.6585164098220578, + "learning_rate": 8.460904970280386e-06, + "loss": 0.0433, + "step": 8380 + }, + { + "epoch": 0.9943703703703703, + "grad_norm": 0.6323305714822836, + "learning_rate": 8.45592546607414e-06, + "loss": 0.0458, + "step": 8390 + }, + { + "epoch": 0.9955555555555555, + "grad_norm": 0.747018084505675, + "learning_rate": 8.450939390128255e-06, + "loss": 0.0492, + "step": 8400 + }, + { + "epoch": 0.9967407407407407, + "grad_norm": 0.6155438710759833, + "learning_rate": 8.445946751924184e-06, + "loss": 0.0438, + "step": 8410 + }, + { + "epoch": 0.9979259259259259, + "grad_norm": 0.6477807074361096, + "learning_rate": 8.44094756095586e-06, + "loss": 0.0388, + "step": 8420 + }, + { + "epoch": 0.9991111111111111, + "grad_norm": 0.6519925886499779, + "learning_rate": 8.435941826729675e-06, + "loss": 0.049, + "step": 8430 + }, + { + "epoch": 1.000237037037037, + "grad_norm": 0.6875558762383998, + "learning_rate": 8.43092955876446e-06, + "loss": 0.0428, + "step": 8440 + }, + { + "epoch": 1.0014222222222222, + "grad_norm": 0.5978645689971666, + "learning_rate": 8.42591076659148e-06, + "loss": 0.0436, + "step": 8450 + }, + { + "epoch": 1.0026074074074074, + "grad_norm": 0.6314301118033531, + "learning_rate": 8.420885459754391e-06, + "loss": 0.0405, + "step": 8460 + }, + { + "epoch": 1.0037925925925926, + "grad_norm": 0.9306637451578884, + "learning_rate": 8.415853647809256e-06, + "loss": 0.0422, + "step": 8470 + }, + { + "epoch": 1.0049777777777777, + "grad_norm": 0.592090625214825, + "learning_rate": 8.410815340324493e-06, + "loss": 0.0393, + "step": 8480 + }, + { + "epoch": 1.006162962962963, + "grad_norm": 0.5813559493664977, + "learning_rate": 8.405770546880876e-06, + "loss": 0.0393, + "step": 8490 + }, + { + "epoch": 1.007348148148148, + "grad_norm": 0.7538927839729614, + "learning_rate": 8.400719277071517e-06, + "loss": 0.04, + "step": 8500 + }, + { + "epoch": 1.0085333333333333, + "grad_norm": 0.7724810550854977, + "learning_rate": 8.395661540501839e-06, + "loss": 0.0401, + "step": 8510 + }, + { + "epoch": 1.0097185185185185, + "grad_norm": 0.6131732282221035, + "learning_rate": 8.390597346789564e-06, + "loss": 0.042, + "step": 8520 + }, + { + "epoch": 1.0109037037037036, + "grad_norm": 0.7320126138462696, + "learning_rate": 8.38552670556469e-06, + "loss": 0.039, + "step": 8530 + }, + { + "epoch": 1.0120888888888888, + "grad_norm": 0.6807010096821413, + "learning_rate": 8.380449626469482e-06, + "loss": 0.0459, + "step": 8540 + }, + { + "epoch": 1.013274074074074, + "grad_norm": 0.7853944466067139, + "learning_rate": 8.375366119158438e-06, + "loss": 0.0422, + "step": 8550 + }, + { + "epoch": 1.0144592592592592, + "grad_norm": 0.6312963470455242, + "learning_rate": 8.370276193298288e-06, + "loss": 0.0404, + "step": 8560 + }, + { + "epoch": 1.0156444444444443, + "grad_norm": 0.7801641181641211, + "learning_rate": 8.36517985856796e-06, + "loss": 0.042, + "step": 8570 + }, + { + "epoch": 1.0168296296296295, + "grad_norm": 0.6457328648944454, + "learning_rate": 8.360077124658576e-06, + "loss": 0.0438, + "step": 8580 + }, + { + "epoch": 1.0180148148148147, + "grad_norm": 0.6397954566901006, + "learning_rate": 8.354968001273424e-06, + "loss": 0.0388, + "step": 8590 + }, + { + "epoch": 1.0192, + "grad_norm": 0.8453574899616856, + "learning_rate": 8.34985249812794e-06, + "loss": 0.0434, + "step": 8600 + }, + { + "epoch": 1.0203851851851853, + "grad_norm": 0.8247010678271285, + "learning_rate": 8.344730624949696e-06, + "loss": 0.0389, + "step": 8610 + }, + { + "epoch": 1.0215703703703705, + "grad_norm": 0.7492583855084065, + "learning_rate": 8.33960239147837e-06, + "loss": 0.0397, + "step": 8620 + }, + { + "epoch": 1.0227555555555556, + "grad_norm": 0.7267511819682018, + "learning_rate": 8.334467807465742e-06, + "loss": 0.0436, + "step": 8630 + }, + { + "epoch": 1.0239407407407408, + "grad_norm": 0.7815443582365968, + "learning_rate": 8.329326882675668e-06, + "loss": 0.04, + "step": 8640 + }, + { + "epoch": 1.025125925925926, + "grad_norm": 0.6115446334258033, + "learning_rate": 8.324179626884053e-06, + "loss": 0.039, + "step": 8650 + }, + { + "epoch": 1.0263111111111112, + "grad_norm": 0.6031131121660421, + "learning_rate": 8.31902604987885e-06, + "loss": 0.0376, + "step": 8660 + }, + { + "epoch": 1.0274962962962964, + "grad_norm": 0.558765123568777, + "learning_rate": 8.313866161460032e-06, + "loss": 0.0392, + "step": 8670 + }, + { + "epoch": 1.0286814814814815, + "grad_norm": 0.6405308284644844, + "learning_rate": 8.308699971439564e-06, + "loss": 0.0425, + "step": 8680 + }, + { + "epoch": 1.0298666666666667, + "grad_norm": 0.651610298269465, + "learning_rate": 8.303527489641408e-06, + "loss": 0.0369, + "step": 8690 + }, + { + "epoch": 1.0310518518518519, + "grad_norm": 0.6343074308453518, + "learning_rate": 8.298348725901477e-06, + "loss": 0.0414, + "step": 8700 + }, + { + "epoch": 1.032237037037037, + "grad_norm": 0.6970569849251086, + "learning_rate": 8.29316369006764e-06, + "loss": 0.0388, + "step": 8710 + }, + { + "epoch": 1.0334222222222222, + "grad_norm": 0.6659543188112809, + "learning_rate": 8.287972391999686e-06, + "loss": 0.0442, + "step": 8720 + }, + { + "epoch": 1.0346074074074074, + "grad_norm": 0.5944956303577671, + "learning_rate": 8.282774841569317e-06, + "loss": 0.0379, + "step": 8730 + }, + { + "epoch": 1.0357925925925926, + "grad_norm": 0.6953084045289076, + "learning_rate": 8.277571048660123e-06, + "loss": 0.0427, + "step": 8740 + }, + { + "epoch": 1.0369777777777778, + "grad_norm": 0.8277179186475379, + "learning_rate": 8.272361023167561e-06, + "loss": 0.0432, + "step": 8750 + }, + { + "epoch": 1.038162962962963, + "grad_norm": 0.697480224511074, + "learning_rate": 8.267144774998946e-06, + "loss": 0.0404, + "step": 8760 + }, + { + "epoch": 1.0393481481481481, + "grad_norm": 0.6068637519355321, + "learning_rate": 8.26192231407342e-06, + "loss": 0.0392, + "step": 8770 + }, + { + "epoch": 1.0405333333333333, + "grad_norm": 0.7119865011070537, + "learning_rate": 8.256693650321943e-06, + "loss": 0.0408, + "step": 8780 + }, + { + "epoch": 1.0417185185185185, + "grad_norm": 0.7349031071286162, + "learning_rate": 8.25145879368727e-06, + "loss": 0.0381, + "step": 8790 + }, + { + "epoch": 1.0429037037037037, + "grad_norm": 0.6683093412262165, + "learning_rate": 8.246217754123928e-06, + "loss": 0.037, + "step": 8800 + }, + { + "epoch": 1.0440888888888888, + "grad_norm": 0.8382151548469626, + "learning_rate": 8.24097054159821e-06, + "loss": 0.0385, + "step": 8810 + }, + { + "epoch": 1.045274074074074, + "grad_norm": 1.026748020374135, + "learning_rate": 8.23571716608814e-06, + "loss": 0.0414, + "step": 8820 + }, + { + "epoch": 1.0464592592592592, + "grad_norm": 0.7786452765257076, + "learning_rate": 8.23045763758346e-06, + "loss": 0.0402, + "step": 8830 + }, + { + "epoch": 1.0476444444444444, + "grad_norm": 0.7085828297847706, + "learning_rate": 8.225191966085618e-06, + "loss": 0.0411, + "step": 8840 + }, + { + "epoch": 1.0488296296296296, + "grad_norm": 0.6957962289106565, + "learning_rate": 8.219920161607744e-06, + "loss": 0.0378, + "step": 8850 + }, + { + "epoch": 1.0500148148148147, + "grad_norm": 0.6711957474321917, + "learning_rate": 8.214642234174626e-06, + "loss": 0.0425, + "step": 8860 + }, + { + "epoch": 1.0512, + "grad_norm": 0.8539013627124864, + "learning_rate": 8.209358193822697e-06, + "loss": 0.041, + "step": 8870 + }, + { + "epoch": 1.052385185185185, + "grad_norm": 0.7151822535307056, + "learning_rate": 8.204068050600014e-06, + "loss": 0.0422, + "step": 8880 + }, + { + "epoch": 1.0535703703703703, + "grad_norm": 0.8329165699353477, + "learning_rate": 8.19877181456624e-06, + "loss": 0.0415, + "step": 8890 + }, + { + "epoch": 1.0547555555555554, + "grad_norm": 0.5986366421572213, + "learning_rate": 8.193469495792623e-06, + "loss": 0.0436, + "step": 8900 + }, + { + "epoch": 1.0559407407407408, + "grad_norm": 0.7175233673468983, + "learning_rate": 8.18816110436198e-06, + "loss": 0.0425, + "step": 8910 + }, + { + "epoch": 1.057125925925926, + "grad_norm": 0.5585316483925973, + "learning_rate": 8.182846650368673e-06, + "loss": 0.0412, + "step": 8920 + }, + { + "epoch": 1.0583111111111112, + "grad_norm": 0.8265422060060924, + "learning_rate": 8.177526143918594e-06, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 1.0594962962962964, + "grad_norm": 0.7195104966522088, + "learning_rate": 8.172199595129142e-06, + "loss": 0.0399, + "step": 8940 + }, + { + "epoch": 1.0606814814814816, + "grad_norm": 0.720887371849215, + "learning_rate": 8.16686701412921e-06, + "loss": 0.0402, + "step": 8950 + }, + { + "epoch": 1.0618666666666667, + "grad_norm": 0.4879757415552275, + "learning_rate": 8.16152841105916e-06, + "loss": 0.0397, + "step": 8960 + }, + { + "epoch": 1.063051851851852, + "grad_norm": 0.6018869225316273, + "learning_rate": 8.156183796070802e-06, + "loss": 0.0399, + "step": 8970 + }, + { + "epoch": 1.064237037037037, + "grad_norm": 0.4590675570536953, + "learning_rate": 8.150833179327383e-06, + "loss": 0.0381, + "step": 8980 + }, + { + "epoch": 1.0654222222222223, + "grad_norm": 0.5947256104468551, + "learning_rate": 8.145476571003564e-06, + "loss": 0.0357, + "step": 8990 + }, + { + "epoch": 1.0666074074074074, + "grad_norm": 0.6652676492436808, + "learning_rate": 8.140113981285393e-06, + "loss": 0.0365, + "step": 9000 + }, + { + "epoch": 1.0677925925925926, + "grad_norm": 0.63057459670751, + "learning_rate": 8.134745420370295e-06, + "loss": 0.0381, + "step": 9010 + }, + { + "epoch": 1.0689777777777778, + "grad_norm": 0.6430405073543456, + "learning_rate": 8.129370898467055e-06, + "loss": 0.0398, + "step": 9020 + }, + { + "epoch": 1.070162962962963, + "grad_norm": 0.6433143868260934, + "learning_rate": 8.123990425795785e-06, + "loss": 0.0391, + "step": 9030 + }, + { + "epoch": 1.0713481481481482, + "grad_norm": 0.6851161432417389, + "learning_rate": 8.118604012587916e-06, + "loss": 0.0381, + "step": 9040 + }, + { + "epoch": 1.0725333333333333, + "grad_norm": 0.754137037257858, + "learning_rate": 8.113211669086181e-06, + "loss": 0.0411, + "step": 9050 + }, + { + "epoch": 1.0737185185185185, + "grad_norm": 0.5380594431387927, + "learning_rate": 8.10781340554458e-06, + "loss": 0.0389, + "step": 9060 + }, + { + "epoch": 1.0749037037037037, + "grad_norm": 0.5587904692509513, + "learning_rate": 8.102409232228379e-06, + "loss": 0.0395, + "step": 9070 + }, + { + "epoch": 1.0760888888888889, + "grad_norm": 0.6338775033663401, + "learning_rate": 8.096999159414077e-06, + "loss": 0.0405, + "step": 9080 + }, + { + "epoch": 1.077274074074074, + "grad_norm": 0.6610530556255558, + "learning_rate": 8.091583197389393e-06, + "loss": 0.043, + "step": 9090 + }, + { + "epoch": 1.0784592592592592, + "grad_norm": 0.6434838113502723, + "learning_rate": 8.086161356453244e-06, + "loss": 0.0399, + "step": 9100 + }, + { + "epoch": 1.0796444444444444, + "grad_norm": 0.6795600846469724, + "learning_rate": 8.08073364691573e-06, + "loss": 0.0394, + "step": 9110 + }, + { + "epoch": 1.0808296296296296, + "grad_norm": 0.5218978794261384, + "learning_rate": 8.075300079098105e-06, + "loss": 0.0421, + "step": 9120 + }, + { + "epoch": 1.0820148148148148, + "grad_norm": 0.5995148508472578, + "learning_rate": 8.069860663332768e-06, + "loss": 0.0387, + "step": 9130 + }, + { + "epoch": 1.0832, + "grad_norm": 0.638991294714415, + "learning_rate": 8.064415409963233e-06, + "loss": 0.0424, + "step": 9140 + }, + { + "epoch": 1.0843851851851851, + "grad_norm": 0.7192740919002207, + "learning_rate": 8.058964329344121e-06, + "loss": 0.0393, + "step": 9150 + }, + { + "epoch": 1.0855703703703703, + "grad_norm": 0.607902444473679, + "learning_rate": 8.05350743184113e-06, + "loss": 0.0372, + "step": 9160 + }, + { + "epoch": 1.0867555555555555, + "grad_norm": 0.46471787613003696, + "learning_rate": 8.048044727831017e-06, + "loss": 0.0369, + "step": 9170 + }, + { + "epoch": 1.0879407407407407, + "grad_norm": 0.5568509234745769, + "learning_rate": 8.042576227701588e-06, + "loss": 0.0364, + "step": 9180 + }, + { + "epoch": 1.0891259259259258, + "grad_norm": 0.5453653534218361, + "learning_rate": 8.037101941851664e-06, + "loss": 0.0396, + "step": 9190 + }, + { + "epoch": 1.090311111111111, + "grad_norm": 0.672450863643448, + "learning_rate": 8.031621880691072e-06, + "loss": 0.0399, + "step": 9200 + }, + { + "epoch": 1.0914962962962962, + "grad_norm": 0.7112324325072343, + "learning_rate": 8.026136054640621e-06, + "loss": 0.0393, + "step": 9210 + }, + { + "epoch": 1.0926814814814816, + "grad_norm": 0.4861430639312091, + "learning_rate": 8.020644474132075e-06, + "loss": 0.0408, + "step": 9220 + }, + { + "epoch": 1.0938666666666668, + "grad_norm": 0.5925067945784065, + "learning_rate": 8.015147149608156e-06, + "loss": 0.0437, + "step": 9230 + }, + { + "epoch": 1.095051851851852, + "grad_norm": 0.7525583291909363, + "learning_rate": 8.009644091522493e-06, + "loss": 0.0391, + "step": 9240 + }, + { + "epoch": 1.0962370370370371, + "grad_norm": 0.5702289189899421, + "learning_rate": 8.004135310339625e-06, + "loss": 0.0386, + "step": 9250 + }, + { + "epoch": 1.0974222222222223, + "grad_norm": 0.6484215046382384, + "learning_rate": 7.998620816534975e-06, + "loss": 0.04, + "step": 9260 + }, + { + "epoch": 1.0986074074074075, + "grad_norm": 0.4710333408744621, + "learning_rate": 7.99310062059483e-06, + "loss": 0.0363, + "step": 9270 + }, + { + "epoch": 1.0997925925925927, + "grad_norm": 0.5414923294611799, + "learning_rate": 7.987574733016312e-06, + "loss": 0.038, + "step": 9280 + }, + { + "epoch": 1.1009777777777778, + "grad_norm": 0.6676447250579922, + "learning_rate": 7.982043164307377e-06, + "loss": 0.0408, + "step": 9290 + }, + { + "epoch": 1.102162962962963, + "grad_norm": 0.636057341718801, + "learning_rate": 7.976505924986774e-06, + "loss": 0.0394, + "step": 9300 + }, + { + "epoch": 1.1033481481481482, + "grad_norm": 0.7685832106758209, + "learning_rate": 7.970963025584043e-06, + "loss": 0.0382, + "step": 9310 + }, + { + "epoch": 1.1045333333333334, + "grad_norm": 0.5665516393443645, + "learning_rate": 7.965414476639483e-06, + "loss": 0.0391, + "step": 9320 + }, + { + "epoch": 1.1057185185185185, + "grad_norm": 0.6762872755854069, + "learning_rate": 7.95986028870414e-06, + "loss": 0.0404, + "step": 9330 + }, + { + "epoch": 1.1069037037037037, + "grad_norm": 0.769882492552106, + "learning_rate": 7.954300472339776e-06, + "loss": 0.0383, + "step": 9340 + }, + { + "epoch": 1.108088888888889, + "grad_norm": 0.6018450242649039, + "learning_rate": 7.948735038118863e-06, + "loss": 0.0402, + "step": 9350 + }, + { + "epoch": 1.109274074074074, + "grad_norm": 0.6182257708007806, + "learning_rate": 7.943163996624552e-06, + "loss": 0.0354, + "step": 9360 + }, + { + "epoch": 1.1104592592592593, + "grad_norm": 0.5062675711388913, + "learning_rate": 7.937587358450658e-06, + "loss": 0.0369, + "step": 9370 + }, + { + "epoch": 1.1116444444444444, + "grad_norm": 0.6754262725438573, + "learning_rate": 7.932005134201639e-06, + "loss": 0.0365, + "step": 9380 + }, + { + "epoch": 1.1128296296296296, + "grad_norm": 0.6640636272467217, + "learning_rate": 7.926417334492575e-06, + "loss": 0.0407, + "step": 9390 + }, + { + "epoch": 1.1140148148148148, + "grad_norm": 0.5789209573506124, + "learning_rate": 7.920823969949146e-06, + "loss": 0.0388, + "step": 9400 + }, + { + "epoch": 1.1152, + "grad_norm": 0.5976374351360487, + "learning_rate": 7.915225051207616e-06, + "loss": 0.0369, + "step": 9410 + }, + { + "epoch": 1.1163851851851851, + "grad_norm": 0.646752887611876, + "learning_rate": 7.909620588914813e-06, + "loss": 0.0338, + "step": 9420 + }, + { + "epoch": 1.1175703703703703, + "grad_norm": 0.7553811218787913, + "learning_rate": 7.904010593728102e-06, + "loss": 0.0428, + "step": 9430 + }, + { + "epoch": 1.1187555555555555, + "grad_norm": 0.745592403957942, + "learning_rate": 7.898395076315375e-06, + "loss": 0.0384, + "step": 9440 + }, + { + "epoch": 1.1199407407407407, + "grad_norm": 0.6189339136937886, + "learning_rate": 7.892774047355016e-06, + "loss": 0.0395, + "step": 9450 + }, + { + "epoch": 1.1211259259259259, + "grad_norm": 0.9305371851260256, + "learning_rate": 7.8871475175359e-06, + "loss": 0.041, + "step": 9460 + }, + { + "epoch": 1.122311111111111, + "grad_norm": 0.5263504716116539, + "learning_rate": 7.881515497557357e-06, + "loss": 0.037, + "step": 9470 + }, + { + "epoch": 1.1234962962962962, + "grad_norm": 0.5730107225067484, + "learning_rate": 7.875877998129155e-06, + "loss": 0.0417, + "step": 9480 + }, + { + "epoch": 1.1246814814814814, + "grad_norm": 0.6688827316920198, + "learning_rate": 7.870235029971485e-06, + "loss": 0.0356, + "step": 9490 + }, + { + "epoch": 1.1258666666666666, + "grad_norm": 0.5982568853047519, + "learning_rate": 7.864586603814939e-06, + "loss": 0.0355, + "step": 9500 + }, + { + "epoch": 1.127051851851852, + "grad_norm": 0.6742697423097298, + "learning_rate": 7.858932730400484e-06, + "loss": 0.0373, + "step": 9510 + }, + { + "epoch": 1.128237037037037, + "grad_norm": 0.6575350956719821, + "learning_rate": 7.853273420479446e-06, + "loss": 0.0401, + "step": 9520 + }, + { + "epoch": 1.1294222222222223, + "grad_norm": 0.5377286807869309, + "learning_rate": 7.84760868481349e-06, + "loss": 0.0388, + "step": 9530 + }, + { + "epoch": 1.1306074074074073, + "grad_norm": 0.7690644000266748, + "learning_rate": 7.841938534174599e-06, + "loss": 0.0416, + "step": 9540 + }, + { + "epoch": 1.1317925925925927, + "grad_norm": 0.6870845907636579, + "learning_rate": 7.836262979345051e-06, + "loss": 0.0387, + "step": 9550 + }, + { + "epoch": 1.1329777777777779, + "grad_norm": 0.6809896455441627, + "learning_rate": 7.830582031117402e-06, + "loss": 0.0375, + "step": 9560 + }, + { + "epoch": 1.134162962962963, + "grad_norm": 0.6128673986902426, + "learning_rate": 7.824895700294465e-06, + "loss": 0.035, + "step": 9570 + }, + { + "epoch": 1.1353481481481482, + "grad_norm": 1.074993039372169, + "learning_rate": 7.819203997689288e-06, + "loss": 0.0381, + "step": 9580 + }, + { + "epoch": 1.1365333333333334, + "grad_norm": 0.8103569592071879, + "learning_rate": 7.813506934125131e-06, + "loss": 0.0389, + "step": 9590 + }, + { + "epoch": 1.1377185185185186, + "grad_norm": 0.6990667147841416, + "learning_rate": 7.807804520435453e-06, + "loss": 0.0381, + "step": 9600 + }, + { + "epoch": 1.1389037037037038, + "grad_norm": 0.726995888548818, + "learning_rate": 7.802096767463882e-06, + "loss": 0.0371, + "step": 9610 + }, + { + "epoch": 1.140088888888889, + "grad_norm": 0.7627251603117009, + "learning_rate": 7.796383686064202e-06, + "loss": 0.0374, + "step": 9620 + }, + { + "epoch": 1.141274074074074, + "grad_norm": 0.6214008591225787, + "learning_rate": 7.790665287100329e-06, + "loss": 0.0354, + "step": 9630 + }, + { + "epoch": 1.1424592592592593, + "grad_norm": 0.5952464595022666, + "learning_rate": 7.784941581446293e-06, + "loss": 0.0381, + "step": 9640 + }, + { + "epoch": 1.1436444444444445, + "grad_norm": 0.6897070252186435, + "learning_rate": 7.779212579986208e-06, + "loss": 0.0416, + "step": 9650 + }, + { + "epoch": 1.1448296296296296, + "grad_norm": 0.4699599349679626, + "learning_rate": 7.773478293614268e-06, + "loss": 0.037, + "step": 9660 + }, + { + "epoch": 1.1460148148148148, + "grad_norm": 0.6197041099717513, + "learning_rate": 7.76773873323471e-06, + "loss": 0.0382, + "step": 9670 + }, + { + "epoch": 1.1472, + "grad_norm": 0.5461518902608712, + "learning_rate": 7.7619939097618e-06, + "loss": 0.0385, + "step": 9680 + }, + { + "epoch": 1.1483851851851852, + "grad_norm": 0.6694001495956893, + "learning_rate": 7.756243834119818e-06, + "loss": 0.0401, + "step": 9690 + }, + { + "epoch": 1.1495703703703704, + "grad_norm": 0.5717942388761149, + "learning_rate": 7.750488517243024e-06, + "loss": 0.0399, + "step": 9700 + }, + { + "epoch": 1.1507555555555555, + "grad_norm": 0.7816410856533546, + "learning_rate": 7.744727970075651e-06, + "loss": 0.04, + "step": 9710 + }, + { + "epoch": 1.1519407407407407, + "grad_norm": 0.5178801735122833, + "learning_rate": 7.738962203571873e-06, + "loss": 0.0387, + "step": 9720 + }, + { + "epoch": 1.1531259259259259, + "grad_norm": 0.8818509388942666, + "learning_rate": 7.733191228695792e-06, + "loss": 0.036, + "step": 9730 + }, + { + "epoch": 1.154311111111111, + "grad_norm": 0.5854175089363209, + "learning_rate": 7.727415056421414e-06, + "loss": 0.0367, + "step": 9740 + }, + { + "epoch": 1.1554962962962962, + "grad_norm": 0.8611802059904105, + "learning_rate": 7.721633697732627e-06, + "loss": 0.0444, + "step": 9750 + }, + { + "epoch": 1.1566814814814814, + "grad_norm": 0.48286765978521046, + "learning_rate": 7.71584716362318e-06, + "loss": 0.0366, + "step": 9760 + }, + { + "epoch": 1.1578666666666666, + "grad_norm": 0.7231301188202791, + "learning_rate": 7.710055465096668e-06, + "loss": 0.0347, + "step": 9770 + }, + { + "epoch": 1.1590518518518518, + "grad_norm": 1.2864862372203607, + "learning_rate": 7.704258613166507e-06, + "loss": 0.0406, + "step": 9780 + }, + { + "epoch": 1.160237037037037, + "grad_norm": 0.7086904892292608, + "learning_rate": 7.698456618855902e-06, + "loss": 0.0397, + "step": 9790 + }, + { + "epoch": 1.1614222222222221, + "grad_norm": 0.6210945745921319, + "learning_rate": 7.69264949319785e-06, + "loss": 0.038, + "step": 9800 + }, + { + "epoch": 1.1626074074074073, + "grad_norm": 0.5273749308272552, + "learning_rate": 7.686837247235099e-06, + "loss": 0.0403, + "step": 9810 + }, + { + "epoch": 1.1637925925925927, + "grad_norm": 0.5627276833700177, + "learning_rate": 7.681019892020134e-06, + "loss": 0.0382, + "step": 9820 + }, + { + "epoch": 1.1649777777777777, + "grad_norm": 0.5723449974644869, + "learning_rate": 7.675197438615159e-06, + "loss": 0.0391, + "step": 9830 + }, + { + "epoch": 1.166162962962963, + "grad_norm": 0.5884098948345482, + "learning_rate": 7.669369898092065e-06, + "loss": 0.0378, + "step": 9840 + }, + { + "epoch": 1.167348148148148, + "grad_norm": 0.7781525540790897, + "learning_rate": 7.663537281532427e-06, + "loss": 0.0397, + "step": 9850 + }, + { + "epoch": 1.1685333333333334, + "grad_norm": 0.5655562245777895, + "learning_rate": 7.657699600027466e-06, + "loss": 0.0392, + "step": 9860 + }, + { + "epoch": 1.1697185185185186, + "grad_norm": 0.7211204620013965, + "learning_rate": 7.651856864678033e-06, + "loss": 0.0364, + "step": 9870 + }, + { + "epoch": 1.1709037037037038, + "grad_norm": 0.6655214829188295, + "learning_rate": 7.646009086594595e-06, + "loss": 0.0388, + "step": 9880 + }, + { + "epoch": 1.172088888888889, + "grad_norm": 0.7355463899195125, + "learning_rate": 7.640156276897203e-06, + "loss": 0.0379, + "step": 9890 + }, + { + "epoch": 1.1732740740740741, + "grad_norm": 0.510158638519037, + "learning_rate": 7.63429844671548e-06, + "loss": 0.0386, + "step": 9900 + }, + { + "epoch": 1.1744592592592593, + "grad_norm": 0.4019337706765291, + "learning_rate": 7.628435607188593e-06, + "loss": 0.0381, + "step": 9910 + }, + { + "epoch": 1.1756444444444445, + "grad_norm": 0.5379729732599965, + "learning_rate": 7.622567769465237e-06, + "loss": 0.0381, + "step": 9920 + }, + { + "epoch": 1.1768296296296297, + "grad_norm": 0.7265356833741914, + "learning_rate": 7.61669494470361e-06, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 1.1780148148148148, + "grad_norm": 0.5686521688173402, + "learning_rate": 7.610817144071392e-06, + "loss": 0.0363, + "step": 9940 + }, + { + "epoch": 1.1792, + "grad_norm": 0.5918067970383514, + "learning_rate": 7.604934378745728e-06, + "loss": 0.0387, + "step": 9950 + }, + { + "epoch": 1.1803851851851852, + "grad_norm": 0.5750460930620956, + "learning_rate": 7.599046659913203e-06, + "loss": 0.0369, + "step": 9960 + }, + { + "epoch": 1.1815703703703704, + "grad_norm": 0.546769146314586, + "learning_rate": 7.59315399876982e-06, + "loss": 0.0358, + "step": 9970 + }, + { + "epoch": 1.1827555555555556, + "grad_norm": 0.5048140405756063, + "learning_rate": 7.587256406520981e-06, + "loss": 0.0348, + "step": 9980 + }, + { + "epoch": 1.1839407407407407, + "grad_norm": 0.7396906565248186, + "learning_rate": 7.581353894381466e-06, + "loss": 0.035, + "step": 9990 + }, + { + "epoch": 1.185125925925926, + "grad_norm": 0.5378734359421847, + "learning_rate": 7.575446473575409e-06, + "loss": 0.0384, + "step": 10000 + }, + { + "epoch": 1.186311111111111, + "grad_norm": 0.6936285923165727, + "learning_rate": 7.56953415533628e-06, + "loss": 0.0396, + "step": 10010 + }, + { + "epoch": 1.1874962962962963, + "grad_norm": 0.5537682677410906, + "learning_rate": 7.5636169509068595e-06, + "loss": 0.0395, + "step": 10020 + }, + { + "epoch": 1.1886814814814815, + "grad_norm": 0.5203848138992757, + "learning_rate": 7.5576948715392205e-06, + "loss": 0.036, + "step": 10030 + }, + { + "epoch": 1.1898666666666666, + "grad_norm": 0.44365860174400173, + "learning_rate": 7.551767928494709e-06, + "loss": 0.0398, + "step": 10040 + }, + { + "epoch": 1.1910518518518518, + "grad_norm": 0.5906111121468769, + "learning_rate": 7.545836133043916e-06, + "loss": 0.0361, + "step": 10050 + }, + { + "epoch": 1.192237037037037, + "grad_norm": 0.5964498325699477, + "learning_rate": 7.539899496466659e-06, + "loss": 0.0382, + "step": 10060 + }, + { + "epoch": 1.1934222222222222, + "grad_norm": 0.5966852144925037, + "learning_rate": 7.533958030051964e-06, + "loss": 0.0375, + "step": 10070 + }, + { + "epoch": 1.1946074074074073, + "grad_norm": 0.5669702200578985, + "learning_rate": 7.528011745098043e-06, + "loss": 0.0366, + "step": 10080 + }, + { + "epoch": 1.1957925925925925, + "grad_norm": 0.5781264952994672, + "learning_rate": 7.522060652912268e-06, + "loss": 0.0373, + "step": 10090 + }, + { + "epoch": 1.1969777777777777, + "grad_norm": 0.6508011966249394, + "learning_rate": 7.516104764811151e-06, + "loss": 0.0392, + "step": 10100 + }, + { + "epoch": 1.1981629629629629, + "grad_norm": 0.6527186831348761, + "learning_rate": 7.510144092120326e-06, + "loss": 0.0368, + "step": 10110 + }, + { + "epoch": 1.199348148148148, + "grad_norm": 0.7350903522049982, + "learning_rate": 7.504178646174526e-06, + "loss": 0.0375, + "step": 10120 + }, + { + "epoch": 1.2005333333333335, + "grad_norm": 0.6744182650640396, + "learning_rate": 7.498208438317559e-06, + "loss": 0.0409, + "step": 10130 + }, + { + "epoch": 1.2017185185185184, + "grad_norm": 0.435977993508131, + "learning_rate": 7.492233479902289e-06, + "loss": 0.038, + "step": 10140 + }, + { + "epoch": 1.2029037037037038, + "grad_norm": 0.5884682298301195, + "learning_rate": 7.486253782290614e-06, + "loss": 0.0376, + "step": 10150 + }, + { + "epoch": 1.2040888888888888, + "grad_norm": 0.5543585381243312, + "learning_rate": 7.480269356853444e-06, + "loss": 0.0377, + "step": 10160 + }, + { + "epoch": 1.2052740740740742, + "grad_norm": 0.7363678889770158, + "learning_rate": 7.474280214970677e-06, + "loss": 0.0376, + "step": 10170 + }, + { + "epoch": 1.2064592592592593, + "grad_norm": 0.8060719295523754, + "learning_rate": 7.4682863680311825e-06, + "loss": 0.0393, + "step": 10180 + }, + { + "epoch": 1.2076444444444445, + "grad_norm": 0.7798286505427975, + "learning_rate": 7.462287827432777e-06, + "loss": 0.0386, + "step": 10190 + }, + { + "epoch": 1.2088296296296297, + "grad_norm": 0.5806037574379604, + "learning_rate": 7.456284604582203e-06, + "loss": 0.0345, + "step": 10200 + }, + { + "epoch": 1.2100148148148149, + "grad_norm": 0.77013070831697, + "learning_rate": 7.450276710895101e-06, + "loss": 0.0374, + "step": 10210 + }, + { + "epoch": 1.2112, + "grad_norm": 0.6185004535725842, + "learning_rate": 7.4442641577959996e-06, + "loss": 0.033, + "step": 10220 + }, + { + "epoch": 1.2123851851851852, + "grad_norm": 0.507809037896187, + "learning_rate": 7.438246956718288e-06, + "loss": 0.039, + "step": 10230 + }, + { + "epoch": 1.2135703703703704, + "grad_norm": 0.6192477058489974, + "learning_rate": 7.432225119104191e-06, + "loss": 0.0368, + "step": 10240 + }, + { + "epoch": 1.2147555555555556, + "grad_norm": 0.606240126842413, + "learning_rate": 7.426198656404748e-06, + "loss": 0.0334, + "step": 10250 + }, + { + "epoch": 1.2159407407407408, + "grad_norm": 0.6594304830390676, + "learning_rate": 7.4201675800798e-06, + "loss": 0.0415, + "step": 10260 + }, + { + "epoch": 1.217125925925926, + "grad_norm": 0.4799155254654633, + "learning_rate": 7.4141319015979564e-06, + "loss": 0.036, + "step": 10270 + }, + { + "epoch": 1.2183111111111111, + "grad_norm": 0.6255204291523627, + "learning_rate": 7.408091632436578e-06, + "loss": 0.0335, + "step": 10280 + }, + { + "epoch": 1.2194962962962963, + "grad_norm": 0.521121063974648, + "learning_rate": 7.402046784081758e-06, + "loss": 0.0352, + "step": 10290 + }, + { + "epoch": 1.2206814814814815, + "grad_norm": 0.7004544592173146, + "learning_rate": 7.395997368028294e-06, + "loss": 0.0354, + "step": 10300 + }, + { + "epoch": 1.2218666666666667, + "grad_norm": 0.5440553864960688, + "learning_rate": 7.389943395779673e-06, + "loss": 0.0355, + "step": 10310 + }, + { + "epoch": 1.2230518518518518, + "grad_norm": 0.7544039443013161, + "learning_rate": 7.383884878848042e-06, + "loss": 0.0389, + "step": 10320 + }, + { + "epoch": 1.224237037037037, + "grad_norm": 0.7070267814964819, + "learning_rate": 7.377821828754195e-06, + "loss": 0.0342, + "step": 10330 + }, + { + "epoch": 1.2254222222222222, + "grad_norm": 0.6609842767241245, + "learning_rate": 7.371754257027541e-06, + "loss": 0.0355, + "step": 10340 + }, + { + "epoch": 1.2266074074074074, + "grad_norm": 0.6715580646753436, + "learning_rate": 7.365682175206091e-06, + "loss": 0.0368, + "step": 10350 + }, + { + "epoch": 1.2277925925925925, + "grad_norm": 0.5885854475008646, + "learning_rate": 7.359605594836431e-06, + "loss": 0.0372, + "step": 10360 + }, + { + "epoch": 1.2289777777777777, + "grad_norm": 0.5101294574648217, + "learning_rate": 7.3535245274737e-06, + "loss": 0.0375, + "step": 10370 + }, + { + "epoch": 1.230162962962963, + "grad_norm": 0.6295392425350629, + "learning_rate": 7.347438984681572e-06, + "loss": 0.0368, + "step": 10380 + }, + { + "epoch": 1.231348148148148, + "grad_norm": 0.7233676371913077, + "learning_rate": 7.341348978032231e-06, + "loss": 0.0399, + "step": 10390 + }, + { + "epoch": 1.2325333333333333, + "grad_norm": 0.6411075309401609, + "learning_rate": 7.335254519106348e-06, + "loss": 0.0361, + "step": 10400 + }, + { + "epoch": 1.2337185185185184, + "grad_norm": 0.5243013444225256, + "learning_rate": 7.3291556194930605e-06, + "loss": 0.0376, + "step": 10410 + }, + { + "epoch": 1.2349037037037036, + "grad_norm": 0.7982930057892746, + "learning_rate": 7.323052290789951e-06, + "loss": 0.0376, + "step": 10420 + }, + { + "epoch": 1.2360888888888888, + "grad_norm": 0.7984123162275314, + "learning_rate": 7.3169445446030265e-06, + "loss": 0.0373, + "step": 10430 + }, + { + "epoch": 1.2372740740740742, + "grad_norm": 0.6960294780009021, + "learning_rate": 7.310832392546687e-06, + "loss": 0.0373, + "step": 10440 + }, + { + "epoch": 1.2384592592592591, + "grad_norm": 0.4921509400661577, + "learning_rate": 7.304715846243719e-06, + "loss": 0.0367, + "step": 10450 + }, + { + "epoch": 1.2396444444444445, + "grad_norm": 0.5673279478922701, + "learning_rate": 7.2985949173252615e-06, + "loss": 0.0388, + "step": 10460 + }, + { + "epoch": 1.2408296296296297, + "grad_norm": 0.42769975524903464, + "learning_rate": 7.2924696174307885e-06, + "loss": 0.0358, + "step": 10470 + }, + { + "epoch": 1.242014814814815, + "grad_norm": 0.6195045246300867, + "learning_rate": 7.286339958208082e-06, + "loss": 0.035, + "step": 10480 + }, + { + "epoch": 1.2432, + "grad_norm": 0.8212876452206117, + "learning_rate": 7.280205951313217e-06, + "loss": 0.0422, + "step": 10490 + }, + { + "epoch": 1.2443851851851853, + "grad_norm": 0.5898929393304833, + "learning_rate": 7.274067608410536e-06, + "loss": 0.0391, + "step": 10500 + }, + { + "epoch": 1.2455703703703704, + "grad_norm": 0.6162423799805311, + "learning_rate": 7.26792494117263e-06, + "loss": 0.0352, + "step": 10510 + }, + { + "epoch": 1.2467555555555556, + "grad_norm": 0.6237237671475605, + "learning_rate": 7.2617779612803015e-06, + "loss": 0.0394, + "step": 10520 + }, + { + "epoch": 1.2479407407407408, + "grad_norm": 0.6817710998682491, + "learning_rate": 7.255626680422568e-06, + "loss": 0.0368, + "step": 10530 + }, + { + "epoch": 1.249125925925926, + "grad_norm": 0.5662550624455125, + "learning_rate": 7.249471110296615e-06, + "loss": 0.0356, + "step": 10540 + }, + { + "epoch": 1.2503111111111112, + "grad_norm": 0.7201143853816148, + "learning_rate": 7.243311262607794e-06, + "loss": 0.0362, + "step": 10550 + }, + { + "epoch": 1.2514962962962963, + "grad_norm": 0.8630063421470188, + "learning_rate": 7.237147149069581e-06, + "loss": 0.0333, + "step": 10560 + }, + { + "epoch": 1.2526814814814815, + "grad_norm": 0.6061336978531989, + "learning_rate": 7.23097878140357e-06, + "loss": 0.0369, + "step": 10570 + }, + { + "epoch": 1.2538666666666667, + "grad_norm": 0.5148449774448415, + "learning_rate": 7.22480617133944e-06, + "loss": 0.0347, + "step": 10580 + }, + { + "epoch": 1.2550518518518519, + "grad_norm": 0.4327161338223404, + "learning_rate": 7.218629330614946e-06, + "loss": 0.0397, + "step": 10590 + }, + { + "epoch": 1.256237037037037, + "grad_norm": 0.515041847001187, + "learning_rate": 7.212448270975878e-06, + "loss": 0.0356, + "step": 10600 + }, + { + "epoch": 1.2574222222222222, + "grad_norm": 0.5306563008821455, + "learning_rate": 7.206263004176053e-06, + "loss": 0.0368, + "step": 10610 + }, + { + "epoch": 1.2586074074074074, + "grad_norm": 0.5630591444218594, + "learning_rate": 7.2000735419772875e-06, + "loss": 0.0346, + "step": 10620 + }, + { + "epoch": 1.2597925925925926, + "grad_norm": 0.5450281381204137, + "learning_rate": 7.193879896149379e-06, + "loss": 0.0402, + "step": 10630 + }, + { + "epoch": 1.2609777777777778, + "grad_norm": 0.5744760203183716, + "learning_rate": 7.187682078470076e-06, + "loss": 0.0329, + "step": 10640 + }, + { + "epoch": 1.262162962962963, + "grad_norm": 0.6840546645209985, + "learning_rate": 7.181480100725062e-06, + "loss": 0.0359, + "step": 10650 + }, + { + "epoch": 1.263348148148148, + "grad_norm": 0.5898317513632808, + "learning_rate": 7.175273974707933e-06, + "loss": 0.0401, + "step": 10660 + }, + { + "epoch": 1.2645333333333333, + "grad_norm": 0.46707345659001853, + "learning_rate": 7.16906371222017e-06, + "loss": 0.0342, + "step": 10670 + }, + { + "epoch": 1.2657185185185185, + "grad_norm": 0.588399829806492, + "learning_rate": 7.1628493250711215e-06, + "loss": 0.0342, + "step": 10680 + }, + { + "epoch": 1.2669037037037036, + "grad_norm": 0.486637643895772, + "learning_rate": 7.156630825077982e-06, + "loss": 0.0336, + "step": 10690 + }, + { + "epoch": 1.2680888888888888, + "grad_norm": 0.766020600943266, + "learning_rate": 7.150408224065759e-06, + "loss": 0.0346, + "step": 10700 + }, + { + "epoch": 1.269274074074074, + "grad_norm": 0.6636639967654908, + "learning_rate": 7.144181533867269e-06, + "loss": 0.0355, + "step": 10710 + }, + { + "epoch": 1.2704592592592592, + "grad_norm": 0.7306386817598991, + "learning_rate": 7.137950766323098e-06, + "loss": 0.0381, + "step": 10720 + }, + { + "epoch": 1.2716444444444446, + "grad_norm": 0.6309264065300677, + "learning_rate": 7.131715933281583e-06, + "loss": 0.0389, + "step": 10730 + }, + { + "epoch": 1.2728296296296295, + "grad_norm": 0.5297392547162115, + "learning_rate": 7.125477046598801e-06, + "loss": 0.0382, + "step": 10740 + }, + { + "epoch": 1.274014814814815, + "grad_norm": 0.6055137668449215, + "learning_rate": 7.119234118138527e-06, + "loss": 0.0367, + "step": 10750 + }, + { + "epoch": 1.2752, + "grad_norm": 0.6366523175764957, + "learning_rate": 7.112987159772229e-06, + "loss": 0.0366, + "step": 10760 + }, + { + "epoch": 1.2763851851851853, + "grad_norm": 0.5066931729198593, + "learning_rate": 7.106736183379036e-06, + "loss": 0.0362, + "step": 10770 + }, + { + "epoch": 1.2775703703703702, + "grad_norm": 0.6975281275596105, + "learning_rate": 7.100481200845718e-06, + "loss": 0.0363, + "step": 10780 + }, + { + "epoch": 1.2787555555555556, + "grad_norm": 0.651262721556952, + "learning_rate": 7.0942222240666606e-06, + "loss": 0.0372, + "step": 10790 + }, + { + "epoch": 1.2799407407407408, + "grad_norm": 0.6715872302861878, + "learning_rate": 7.0879592649438465e-06, + "loss": 0.0381, + "step": 10800 + }, + { + "epoch": 1.281125925925926, + "grad_norm": 0.5139258765911803, + "learning_rate": 7.081692335386834e-06, + "loss": 0.0352, + "step": 10810 + }, + { + "epoch": 1.2823111111111112, + "grad_norm": 0.5810841016085941, + "learning_rate": 7.075421447312728e-06, + "loss": 0.0363, + "step": 10820 + }, + { + "epoch": 1.2834962962962964, + "grad_norm": 0.5051955731845852, + "learning_rate": 7.06914661264616e-06, + "loss": 0.036, + "step": 10830 + }, + { + "epoch": 1.2846814814814815, + "grad_norm": 0.538763562288868, + "learning_rate": 7.062867843319269e-06, + "loss": 0.0343, + "step": 10840 + }, + { + "epoch": 1.2858666666666667, + "grad_norm": 0.6027292423237165, + "learning_rate": 7.056585151271675e-06, + "loss": 0.0345, + "step": 10850 + }, + { + "epoch": 1.287051851851852, + "grad_norm": 0.813088142723815, + "learning_rate": 7.050298548450459e-06, + "loss": 0.0402, + "step": 10860 + }, + { + "epoch": 1.288237037037037, + "grad_norm": 0.7169273672142574, + "learning_rate": 7.044008046810136e-06, + "loss": 0.0341, + "step": 10870 + }, + { + "epoch": 1.2894222222222222, + "grad_norm": 0.6791794741661862, + "learning_rate": 7.0377136583126345e-06, + "loss": 0.0366, + "step": 10880 + }, + { + "epoch": 1.2906074074074074, + "grad_norm": 0.5849762133650925, + "learning_rate": 7.031415394927279e-06, + "loss": 0.0344, + "step": 10890 + }, + { + "epoch": 1.2917925925925926, + "grad_norm": 0.5842419721747307, + "learning_rate": 7.025113268630758e-06, + "loss": 0.0371, + "step": 10900 + }, + { + "epoch": 1.2929777777777778, + "grad_norm": 0.6191734545664108, + "learning_rate": 7.018807291407106e-06, + "loss": 0.0358, + "step": 10910 + }, + { + "epoch": 1.294162962962963, + "grad_norm": 0.5264925699241213, + "learning_rate": 7.012497475247681e-06, + "loss": 0.0329, + "step": 10920 + }, + { + "epoch": 1.2953481481481481, + "grad_norm": 0.554559917607714, + "learning_rate": 7.0061838321511434e-06, + "loss": 0.0377, + "step": 10930 + }, + { + "epoch": 1.2965333333333333, + "grad_norm": 0.6265356610213807, + "learning_rate": 6.999866374123429e-06, + "loss": 0.0369, + "step": 10940 + }, + { + "epoch": 1.2977185185185185, + "grad_norm": 0.6389940271189815, + "learning_rate": 6.993545113177724e-06, + "loss": 0.036, + "step": 10950 + }, + { + "epoch": 1.2989037037037037, + "grad_norm": 0.6591286121830231, + "learning_rate": 6.987220061334453e-06, + "loss": 0.0363, + "step": 10960 + }, + { + "epoch": 1.3000888888888888, + "grad_norm": 0.5927388685743068, + "learning_rate": 6.980891230621247e-06, + "loss": 0.0364, + "step": 10970 + }, + { + "epoch": 1.301274074074074, + "grad_norm": 0.553730384114029, + "learning_rate": 6.9745586330729205e-06, + "loss": 0.0363, + "step": 10980 + }, + { + "epoch": 1.3024592592592592, + "grad_norm": 0.7396188774134861, + "learning_rate": 6.968222280731454e-06, + "loss": 0.036, + "step": 10990 + }, + { + "epoch": 1.3036444444444444, + "grad_norm": 0.5584086682473048, + "learning_rate": 6.961882185645964e-06, + "loss": 0.0333, + "step": 11000 + }, + { + "epoch": 1.3048296296296296, + "grad_norm": 0.49887749085338706, + "learning_rate": 6.955538359872689e-06, + "loss": 0.0357, + "step": 11010 + }, + { + "epoch": 1.3060148148148147, + "grad_norm": 0.7369840938642377, + "learning_rate": 6.94919081547496e-06, + "loss": 0.0381, + "step": 11020 + }, + { + "epoch": 1.3072, + "grad_norm": 0.4800393514610276, + "learning_rate": 6.942839564523178e-06, + "loss": 0.0355, + "step": 11030 + }, + { + "epoch": 1.3083851851851853, + "grad_norm": 0.5497586039588198, + "learning_rate": 6.936484619094792e-06, + "loss": 0.0335, + "step": 11040 + }, + { + "epoch": 1.3095703703703703, + "grad_norm": 0.6018994519614677, + "learning_rate": 6.930125991274281e-06, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 1.3107555555555557, + "grad_norm": 0.5800669025095517, + "learning_rate": 6.923763693153118e-06, + "loss": 0.0377, + "step": 11060 + }, + { + "epoch": 1.3119407407407406, + "grad_norm": 0.8188165683940332, + "learning_rate": 6.917397736829765e-06, + "loss": 0.0374, + "step": 11070 + }, + { + "epoch": 1.313125925925926, + "grad_norm": 0.4804822712980326, + "learning_rate": 6.911028134409633e-06, + "loss": 0.0357, + "step": 11080 + }, + { + "epoch": 1.314311111111111, + "grad_norm": 0.5956443799773388, + "learning_rate": 6.90465489800507e-06, + "loss": 0.036, + "step": 11090 + }, + { + "epoch": 1.3154962962962964, + "grad_norm": 0.604125706240739, + "learning_rate": 6.898278039735333e-06, + "loss": 0.0353, + "step": 11100 + }, + { + "epoch": 1.3166814814814816, + "grad_norm": 0.6787861254042028, + "learning_rate": 6.891897571726567e-06, + "loss": 0.0349, + "step": 11110 + }, + { + "epoch": 1.3178666666666667, + "grad_norm": 0.8674508225537952, + "learning_rate": 6.8855135061117804e-06, + "loss": 0.0356, + "step": 11120 + }, + { + "epoch": 1.319051851851852, + "grad_norm": 0.5699340089694086, + "learning_rate": 6.879125855030825e-06, + "loss": 0.0327, + "step": 11130 + }, + { + "epoch": 1.320237037037037, + "grad_norm": 0.6177549684184535, + "learning_rate": 6.872734630630367e-06, + "loss": 0.0347, + "step": 11140 + }, + { + "epoch": 1.3214222222222223, + "grad_norm": 0.7388509361195605, + "learning_rate": 6.866339845063868e-06, + "loss": 0.0347, + "step": 11150 + }, + { + "epoch": 1.3226074074074075, + "grad_norm": 0.5873748021014992, + "learning_rate": 6.859941510491568e-06, + "loss": 0.0345, + "step": 11160 + }, + { + "epoch": 1.3237925925925926, + "grad_norm": 0.49526389412533656, + "learning_rate": 6.853539639080448e-06, + "loss": 0.0337, + "step": 11170 + }, + { + "epoch": 1.3249777777777778, + "grad_norm": 0.7153363063673579, + "learning_rate": 6.8471342430042155e-06, + "loss": 0.0337, + "step": 11180 + }, + { + "epoch": 1.326162962962963, + "grad_norm": 0.718296176708374, + "learning_rate": 6.840725334443283e-06, + "loss": 0.0336, + "step": 11190 + }, + { + "epoch": 1.3273481481481482, + "grad_norm": 0.6398640873266896, + "learning_rate": 6.834312925584745e-06, + "loss": 0.0364, + "step": 11200 + }, + { + "epoch": 1.3285333333333333, + "grad_norm": 0.5794143394656014, + "learning_rate": 6.827897028622346e-06, + "loss": 0.0355, + "step": 11210 + }, + { + "epoch": 1.3297185185185185, + "grad_norm": 0.5010500380583615, + "learning_rate": 6.821477655756465e-06, + "loss": 0.0358, + "step": 11220 + }, + { + "epoch": 1.3309037037037037, + "grad_norm": 0.5096253170101601, + "learning_rate": 6.815054819194095e-06, + "loss": 0.0343, + "step": 11230 + }, + { + "epoch": 1.3320888888888889, + "grad_norm": 0.8089976407775299, + "learning_rate": 6.808628531148809e-06, + "loss": 0.038, + "step": 11240 + }, + { + "epoch": 1.333274074074074, + "grad_norm": 0.5171425901613221, + "learning_rate": 6.80219880384075e-06, + "loss": 0.0343, + "step": 11250 + }, + { + "epoch": 1.3344592592592592, + "grad_norm": 0.5042780596733507, + "learning_rate": 6.795765649496594e-06, + "loss": 0.0375, + "step": 11260 + }, + { + "epoch": 1.3356444444444444, + "grad_norm": 0.5660800758481741, + "learning_rate": 6.789329080349542e-06, + "loss": 0.0356, + "step": 11270 + }, + { + "epoch": 1.3368296296296296, + "grad_norm": 0.5269164315045531, + "learning_rate": 6.78288910863928e-06, + "loss": 0.0383, + "step": 11280 + }, + { + "epoch": 1.3380148148148148, + "grad_norm": 0.5212949347715897, + "learning_rate": 6.77644574661197e-06, + "loss": 0.0375, + "step": 11290 + }, + { + "epoch": 1.3392, + "grad_norm": 0.6101840800275271, + "learning_rate": 6.76999900652022e-06, + "loss": 0.0366, + "step": 11300 + }, + { + "epoch": 1.3403851851851851, + "grad_norm": 0.661408418068933, + "learning_rate": 6.76354890062306e-06, + "loss": 0.0368, + "step": 11310 + }, + { + "epoch": 1.3415703703703703, + "grad_norm": 0.57865126463542, + "learning_rate": 6.757095441185921e-06, + "loss": 0.0368, + "step": 11320 + }, + { + "epoch": 1.3427555555555555, + "grad_norm": 0.5862153482932908, + "learning_rate": 6.750638640480613e-06, + "loss": 0.0381, + "step": 11330 + }, + { + "epoch": 1.3439407407407407, + "grad_norm": 0.6043664158827985, + "learning_rate": 6.744178510785296e-06, + "loss": 0.036, + "step": 11340 + }, + { + "epoch": 1.345125925925926, + "grad_norm": 0.6308929261719946, + "learning_rate": 6.737715064384464e-06, + "loss": 0.0369, + "step": 11350 + }, + { + "epoch": 1.346311111111111, + "grad_norm": 0.6332895047899424, + "learning_rate": 6.731248313568917e-06, + "loss": 0.0359, + "step": 11360 + }, + { + "epoch": 1.3474962962962964, + "grad_norm": 0.5334268499617741, + "learning_rate": 6.724778270635737e-06, + "loss": 0.0323, + "step": 11370 + }, + { + "epoch": 1.3486814814814814, + "grad_norm": 0.6097177784025062, + "learning_rate": 6.7183049478882665e-06, + "loss": 0.035, + "step": 11380 + }, + { + "epoch": 1.3498666666666668, + "grad_norm": 0.461946623934123, + "learning_rate": 6.711828357636088e-06, + "loss": 0.0375, + "step": 11390 + }, + { + "epoch": 1.3510518518518517, + "grad_norm": 0.47279748379846404, + "learning_rate": 6.7053485121949935e-06, + "loss": 0.0366, + "step": 11400 + }, + { + "epoch": 1.3522370370370371, + "grad_norm": 0.4479543484283015, + "learning_rate": 6.698865423886966e-06, + "loss": 0.0332, + "step": 11410 + }, + { + "epoch": 1.3534222222222223, + "grad_norm": 0.5954274040705463, + "learning_rate": 6.692379105040157e-06, + "loss": 0.0334, + "step": 11420 + }, + { + "epoch": 1.3546074074074075, + "grad_norm": 0.5473474005216107, + "learning_rate": 6.68588956798886e-06, + "loss": 0.0329, + "step": 11430 + }, + { + "epoch": 1.3557925925925927, + "grad_norm": 0.6173287603685763, + "learning_rate": 6.67939682507349e-06, + "loss": 0.0356, + "step": 11440 + }, + { + "epoch": 1.3569777777777778, + "grad_norm": 0.5949392287948896, + "learning_rate": 6.672900888640551e-06, + "loss": 0.0387, + "step": 11450 + }, + { + "epoch": 1.358162962962963, + "grad_norm": 0.7887002415849497, + "learning_rate": 6.6664017710426295e-06, + "loss": 0.0354, + "step": 11460 + }, + { + "epoch": 1.3593481481481482, + "grad_norm": 0.5107077269167135, + "learning_rate": 6.659899484638354e-06, + "loss": 0.0334, + "step": 11470 + }, + { + "epoch": 1.3605333333333334, + "grad_norm": 0.6023363328697606, + "learning_rate": 6.653394041792386e-06, + "loss": 0.0359, + "step": 11480 + }, + { + "epoch": 1.3617185185185186, + "grad_norm": 0.5705655715727445, + "learning_rate": 6.64688545487538e-06, + "loss": 0.0379, + "step": 11490 + }, + { + "epoch": 1.3629037037037037, + "grad_norm": 0.5675378303398613, + "learning_rate": 6.6403737362639765e-06, + "loss": 0.0381, + "step": 11500 + }, + { + "epoch": 1.364088888888889, + "grad_norm": 0.5507533395326385, + "learning_rate": 6.633858898340767e-06, + "loss": 0.0348, + "step": 11510 + }, + { + "epoch": 1.365274074074074, + "grad_norm": 0.5505772830940984, + "learning_rate": 6.627340953494279e-06, + "loss": 0.0367, + "step": 11520 + }, + { + "epoch": 1.3664592592592593, + "grad_norm": 0.5488499941719669, + "learning_rate": 6.620819914118943e-06, + "loss": 0.0329, + "step": 11530 + }, + { + "epoch": 1.3676444444444444, + "grad_norm": 0.631397900626461, + "learning_rate": 6.614295792615078e-06, + "loss": 0.037, + "step": 11540 + }, + { + "epoch": 1.3688296296296296, + "grad_norm": 0.5632186279781408, + "learning_rate": 6.607768601388862e-06, + "loss": 0.0372, + "step": 11550 + }, + { + "epoch": 1.3700148148148148, + "grad_norm": 0.582559158072894, + "learning_rate": 6.6012383528523114e-06, + "loss": 0.0374, + "step": 11560 + }, + { + "epoch": 1.3712, + "grad_norm": 0.8006342054054756, + "learning_rate": 6.5947050594232534e-06, + "loss": 0.0368, + "step": 11570 + }, + { + "epoch": 1.3723851851851852, + "grad_norm": 0.6156227656750647, + "learning_rate": 6.588168733525311e-06, + "loss": 0.0368, + "step": 11580 + }, + { + "epoch": 1.3735703703703703, + "grad_norm": 0.5106679994915344, + "learning_rate": 6.581629387587867e-06, + "loss": 0.0374, + "step": 11590 + }, + { + "epoch": 1.3747555555555555, + "grad_norm": 0.640148816293237, + "learning_rate": 6.5750870340460525e-06, + "loss": 0.0374, + "step": 11600 + }, + { + "epoch": 1.3759407407407407, + "grad_norm": 0.4613254860891459, + "learning_rate": 6.568541685340715e-06, + "loss": 0.0357, + "step": 11610 + }, + { + "epoch": 1.3771259259259259, + "grad_norm": 0.527150437229008, + "learning_rate": 6.561993353918398e-06, + "loss": 0.0366, + "step": 11620 + }, + { + "epoch": 1.378311111111111, + "grad_norm": 0.518361073536929, + "learning_rate": 6.555442052231317e-06, + "loss": 0.038, + "step": 11630 + }, + { + "epoch": 1.3794962962962962, + "grad_norm": 0.5838883535494714, + "learning_rate": 6.548887792737337e-06, + "loss": 0.0335, + "step": 11640 + }, + { + "epoch": 1.3806814814814814, + "grad_norm": 0.44379312855301606, + "learning_rate": 6.5423305878999455e-06, + "loss": 0.0337, + "step": 11650 + }, + { + "epoch": 1.3818666666666668, + "grad_norm": 0.6556064765057338, + "learning_rate": 6.535770450188232e-06, + "loss": 0.0349, + "step": 11660 + }, + { + "epoch": 1.3830518518518518, + "grad_norm": 0.4198980372163158, + "learning_rate": 6.529207392076863e-06, + "loss": 0.0347, + "step": 11670 + }, + { + "epoch": 1.3842370370370372, + "grad_norm": 0.5902701113711034, + "learning_rate": 6.522641426046058e-06, + "loss": 0.0325, + "step": 11680 + }, + { + "epoch": 1.385422222222222, + "grad_norm": 0.9062230632178103, + "learning_rate": 6.516072564581566e-06, + "loss": 0.0346, + "step": 11690 + }, + { + "epoch": 1.3866074074074075, + "grad_norm": 0.5313649595064155, + "learning_rate": 6.509500820174642e-06, + "loss": 0.0341, + "step": 11700 + }, + { + "epoch": 1.3877925925925925, + "grad_norm": 0.5933460651755964, + "learning_rate": 6.502926205322025e-06, + "loss": 0.0362, + "step": 11710 + }, + { + "epoch": 1.3889777777777779, + "grad_norm": 0.5369814455484064, + "learning_rate": 6.49634873252591e-06, + "loss": 0.0302, + "step": 11720 + }, + { + "epoch": 1.390162962962963, + "grad_norm": 0.5889066684156642, + "learning_rate": 6.4897684142939264e-06, + "loss": 0.0358, + "step": 11730 + }, + { + "epoch": 1.3913481481481482, + "grad_norm": 0.6795712463233264, + "learning_rate": 6.483185263139117e-06, + "loss": 0.0388, + "step": 11740 + }, + { + "epoch": 1.3925333333333334, + "grad_norm": 0.8610744253662693, + "learning_rate": 6.47659929157991e-06, + "loss": 0.0374, + "step": 11750 + }, + { + "epoch": 1.3937185185185186, + "grad_norm": 0.6826603695638273, + "learning_rate": 6.470010512140096e-06, + "loss": 0.0355, + "step": 11760 + }, + { + "epoch": 1.3949037037037038, + "grad_norm": 0.6029143831844743, + "learning_rate": 6.463418937348807e-06, + "loss": 0.0344, + "step": 11770 + }, + { + "epoch": 1.396088888888889, + "grad_norm": 0.6194314162789141, + "learning_rate": 6.456824579740488e-06, + "loss": 0.0366, + "step": 11780 + }, + { + "epoch": 1.3972740740740741, + "grad_norm": 0.535701724273535, + "learning_rate": 6.450227451854883e-06, + "loss": 0.0355, + "step": 11790 + }, + { + "epoch": 1.3984592592592593, + "grad_norm": 0.6780961270832723, + "learning_rate": 6.443627566236989e-06, + "loss": 0.0388, + "step": 11800 + }, + { + "epoch": 1.3996444444444445, + "grad_norm": 0.5865124627611724, + "learning_rate": 6.437024935437064e-06, + "loss": 0.0342, + "step": 11810 + }, + { + "epoch": 1.4008296296296296, + "grad_norm": 0.650873468055745, + "learning_rate": 6.430419572010576e-06, + "loss": 0.0324, + "step": 11820 + }, + { + "epoch": 1.4020148148148148, + "grad_norm": 0.54115654862498, + "learning_rate": 6.423811488518192e-06, + "loss": 0.0349, + "step": 11830 + }, + { + "epoch": 1.4032, + "grad_norm": 0.6903482739650146, + "learning_rate": 6.4172006975257496e-06, + "loss": 0.0345, + "step": 11840 + }, + { + "epoch": 1.4043851851851852, + "grad_norm": 0.48197496990573035, + "learning_rate": 6.41058721160424e-06, + "loss": 0.0352, + "step": 11850 + }, + { + "epoch": 1.4055703703703704, + "grad_norm": 0.41977157751683486, + "learning_rate": 6.403971043329774e-06, + "loss": 0.0338, + "step": 11860 + }, + { + "epoch": 1.4067555555555555, + "grad_norm": 0.5142761840609447, + "learning_rate": 6.3973522052835656e-06, + "loss": 0.0326, + "step": 11870 + }, + { + "epoch": 1.4079407407407407, + "grad_norm": 0.5981393420492328, + "learning_rate": 6.390730710051902e-06, + "loss": 0.0344, + "step": 11880 + }, + { + "epoch": 1.409125925925926, + "grad_norm": 0.41128395801819617, + "learning_rate": 6.384106570226131e-06, + "loss": 0.035, + "step": 11890 + }, + { + "epoch": 1.410311111111111, + "grad_norm": 0.6166904240536913, + "learning_rate": 6.37747979840262e-06, + "loss": 0.0341, + "step": 11900 + }, + { + "epoch": 1.4114962962962962, + "grad_norm": 0.6313770279628215, + "learning_rate": 6.3708504071827495e-06, + "loss": 0.0337, + "step": 11910 + }, + { + "epoch": 1.4126814814814814, + "grad_norm": 0.6558176060861862, + "learning_rate": 6.364218409172873e-06, + "loss": 0.0323, + "step": 11920 + }, + { + "epoch": 1.4138666666666666, + "grad_norm": 0.557870935338801, + "learning_rate": 6.3575838169843095e-06, + "loss": 0.0348, + "step": 11930 + }, + { + "epoch": 1.4150518518518518, + "grad_norm": 0.48126417512548564, + "learning_rate": 6.3509466432333054e-06, + "loss": 0.0331, + "step": 11940 + }, + { + "epoch": 1.416237037037037, + "grad_norm": 0.45904037179735924, + "learning_rate": 6.344306900541017e-06, + "loss": 0.0364, + "step": 11950 + }, + { + "epoch": 1.4174222222222221, + "grad_norm": 0.5741902140473147, + "learning_rate": 6.337664601533488e-06, + "loss": 0.0336, + "step": 11960 + }, + { + "epoch": 1.4186074074074075, + "grad_norm": 0.6869152852529073, + "learning_rate": 6.331019758841619e-06, + "loss": 0.0362, + "step": 11970 + }, + { + "epoch": 1.4197925925925925, + "grad_norm": 0.6920543841466743, + "learning_rate": 6.324372385101151e-06, + "loss": 0.0347, + "step": 11980 + }, + { + "epoch": 1.420977777777778, + "grad_norm": 0.4582770578269367, + "learning_rate": 6.31772249295264e-06, + "loss": 0.0349, + "step": 11990 + }, + { + "epoch": 1.4221629629629629, + "grad_norm": 0.48242390390604495, + "learning_rate": 6.3110700950414225e-06, + "loss": 0.0345, + "step": 12000 + }, + { + "epoch": 1.4233481481481483, + "grad_norm": 0.5741789245553095, + "learning_rate": 6.304415204017611e-06, + "loss": 0.0361, + "step": 12010 + }, + { + "epoch": 1.4245333333333332, + "grad_norm": 0.527190754710843, + "learning_rate": 6.297757832536051e-06, + "loss": 0.0319, + "step": 12020 + }, + { + "epoch": 1.4257185185185186, + "grad_norm": 0.5823014658369792, + "learning_rate": 6.2910979932563075e-06, + "loss": 0.0336, + "step": 12030 + }, + { + "epoch": 1.4269037037037038, + "grad_norm": 0.9204610506104669, + "learning_rate": 6.284435698842637e-06, + "loss": 0.0324, + "step": 12040 + }, + { + "epoch": 1.428088888888889, + "grad_norm": 0.689396900420848, + "learning_rate": 6.277770961963967e-06, + "loss": 0.0335, + "step": 12050 + }, + { + "epoch": 1.4292740740740741, + "grad_norm": 0.49710991494760887, + "learning_rate": 6.271103795293868e-06, + "loss": 0.0335, + "step": 12060 + }, + { + "epoch": 1.4304592592592593, + "grad_norm": 0.6559924748545927, + "learning_rate": 6.264434211510528e-06, + "loss": 0.0359, + "step": 12070 + }, + { + "epoch": 1.4316444444444445, + "grad_norm": 0.6221518618651144, + "learning_rate": 6.2577622232967405e-06, + "loss": 0.0346, + "step": 12080 + }, + { + "epoch": 1.4328296296296297, + "grad_norm": 0.499462205585944, + "learning_rate": 6.25108784333986e-06, + "loss": 0.0314, + "step": 12090 + }, + { + "epoch": 1.4340148148148149, + "grad_norm": 0.5319447691270714, + "learning_rate": 6.244411084331797e-06, + "loss": 0.0338, + "step": 12100 + }, + { + "epoch": 1.4352, + "grad_norm": 0.5526359434374736, + "learning_rate": 6.237731958968981e-06, + "loss": 0.0337, + "step": 12110 + }, + { + "epoch": 1.4363851851851852, + "grad_norm": 0.655313459886929, + "learning_rate": 6.231050479952346e-06, + "loss": 0.0359, + "step": 12120 + }, + { + "epoch": 1.4375703703703704, + "grad_norm": 0.554788460737137, + "learning_rate": 6.224366659987298e-06, + "loss": 0.034, + "step": 12130 + }, + { + "epoch": 1.4387555555555556, + "grad_norm": 0.5659188001685495, + "learning_rate": 6.217680511783696e-06, + "loss": 0.032, + "step": 12140 + }, + { + "epoch": 1.4399407407407407, + "grad_norm": 0.973286737871544, + "learning_rate": 6.210992048055824e-06, + "loss": 0.0363, + "step": 12150 + }, + { + "epoch": 1.441125925925926, + "grad_norm": 0.3651541450859842, + "learning_rate": 6.204301281522376e-06, + "loss": 0.0312, + "step": 12160 + }, + { + "epoch": 1.442311111111111, + "grad_norm": 0.6785911952934957, + "learning_rate": 6.197608224906416e-06, + "loss": 0.0362, + "step": 12170 + }, + { + "epoch": 1.4434962962962963, + "grad_norm": 0.4826625798954932, + "learning_rate": 6.19091289093537e-06, + "loss": 0.0326, + "step": 12180 + }, + { + "epoch": 1.4446814814814815, + "grad_norm": 0.5575053452007372, + "learning_rate": 6.184215292340988e-06, + "loss": 0.0357, + "step": 12190 + }, + { + "epoch": 1.4458666666666666, + "grad_norm": 0.6325374702945982, + "learning_rate": 6.1775154418593346e-06, + "loss": 0.0349, + "step": 12200 + }, + { + "epoch": 1.4470518518518518, + "grad_norm": 0.6225928278867026, + "learning_rate": 6.170813352230749e-06, + "loss": 0.0362, + "step": 12210 + }, + { + "epoch": 1.448237037037037, + "grad_norm": 0.6831371902504493, + "learning_rate": 6.164109036199832e-06, + "loss": 0.0346, + "step": 12220 + }, + { + "epoch": 1.4494222222222222, + "grad_norm": 0.4862788692053201, + "learning_rate": 6.157402506515416e-06, + "loss": 0.0326, + "step": 12230 + }, + { + "epoch": 1.4506074074074073, + "grad_norm": 0.5745574919262131, + "learning_rate": 6.150693775930547e-06, + "loss": 0.0367, + "step": 12240 + }, + { + "epoch": 1.4517925925925925, + "grad_norm": 0.5733819586191993, + "learning_rate": 6.143982857202452e-06, + "loss": 0.0325, + "step": 12250 + }, + { + "epoch": 1.4529777777777777, + "grad_norm": 0.6617345579986803, + "learning_rate": 6.137269763092522e-06, + "loss": 0.0319, + "step": 12260 + }, + { + "epoch": 1.4541629629629629, + "grad_norm": 0.6118097823085019, + "learning_rate": 6.130554506366278e-06, + "loss": 0.0336, + "step": 12270 + }, + { + "epoch": 1.4553481481481483, + "grad_norm": 0.5350180924024459, + "learning_rate": 6.123837099793365e-06, + "loss": 0.0345, + "step": 12280 + }, + { + "epoch": 1.4565333333333332, + "grad_norm": 0.5396770825445341, + "learning_rate": 6.117117556147505e-06, + "loss": 0.0335, + "step": 12290 + }, + { + "epoch": 1.4577185185185186, + "grad_norm": 0.4845975079978553, + "learning_rate": 6.110395888206492e-06, + "loss": 0.0349, + "step": 12300 + }, + { + "epoch": 1.4589037037037036, + "grad_norm": 0.4999282856662726, + "learning_rate": 6.103672108752152e-06, + "loss": 0.0335, + "step": 12310 + }, + { + "epoch": 1.460088888888889, + "grad_norm": 0.4249204434313072, + "learning_rate": 6.096946230570332e-06, + "loss": 0.032, + "step": 12320 + }, + { + "epoch": 1.461274074074074, + "grad_norm": 0.4887537144845807, + "learning_rate": 6.09021826645087e-06, + "loss": 0.0344, + "step": 12330 + }, + { + "epoch": 1.4624592592592593, + "grad_norm": 0.5014685169268319, + "learning_rate": 6.083488229187565e-06, + "loss": 0.0336, + "step": 12340 + }, + { + "epoch": 1.4636444444444445, + "grad_norm": 0.5412856573017294, + "learning_rate": 6.076756131578165e-06, + "loss": 0.0338, + "step": 12350 + }, + { + "epoch": 1.4648296296296297, + "grad_norm": 0.5704363530858065, + "learning_rate": 6.070021986424332e-06, + "loss": 0.0349, + "step": 12360 + }, + { + "epoch": 1.4660148148148149, + "grad_norm": 0.6142571307425699, + "learning_rate": 6.063285806531623e-06, + "loss": 0.0328, + "step": 12370 + }, + { + "epoch": 1.4672, + "grad_norm": 0.5428976640304795, + "learning_rate": 6.056547604709461e-06, + "loss": 0.0337, + "step": 12380 + }, + { + "epoch": 1.4683851851851852, + "grad_norm": 0.4504843161243152, + "learning_rate": 6.04980739377112e-06, + "loss": 0.0341, + "step": 12390 + }, + { + "epoch": 1.4695703703703704, + "grad_norm": 0.5593867800158661, + "learning_rate": 6.043065186533688e-06, + "loss": 0.0322, + "step": 12400 + }, + { + "epoch": 1.4707555555555556, + "grad_norm": 0.512364210972295, + "learning_rate": 6.036320995818056e-06, + "loss": 0.0334, + "step": 12410 + }, + { + "epoch": 1.4719407407407408, + "grad_norm": 0.5433841483492066, + "learning_rate": 6.029574834448877e-06, + "loss": 0.032, + "step": 12420 + }, + { + "epoch": 1.473125925925926, + "grad_norm": 0.48420023671213425, + "learning_rate": 6.022826715254564e-06, + "loss": 0.0359, + "step": 12430 + }, + { + "epoch": 1.4743111111111111, + "grad_norm": 0.5984474055200714, + "learning_rate": 6.016076651067242e-06, + "loss": 0.0338, + "step": 12440 + }, + { + "epoch": 1.4754962962962963, + "grad_norm": 0.5813230092492384, + "learning_rate": 6.009324654722741e-06, + "loss": 0.0333, + "step": 12450 + }, + { + "epoch": 1.4766814814814815, + "grad_norm": 0.5234600809537441, + "learning_rate": 6.00257073906056e-06, + "loss": 0.0353, + "step": 12460 + }, + { + "epoch": 1.4778666666666667, + "grad_norm": 0.5723866809326428, + "learning_rate": 5.995814916923855e-06, + "loss": 0.0332, + "step": 12470 + }, + { + "epoch": 1.4790518518518518, + "grad_norm": 0.45896484561524986, + "learning_rate": 5.989057201159401e-06, + "loss": 0.0323, + "step": 12480 + }, + { + "epoch": 1.480237037037037, + "grad_norm": 0.43787060826429475, + "learning_rate": 5.982297604617575e-06, + "loss": 0.0327, + "step": 12490 + }, + { + "epoch": 1.4814222222222222, + "grad_norm": 0.4293122281582217, + "learning_rate": 5.975536140152331e-06, + "loss": 0.0337, + "step": 12500 + }, + { + "epoch": 1.4826074074074074, + "grad_norm": 0.45263164731419336, + "learning_rate": 5.9687728206211805e-06, + "loss": 0.0319, + "step": 12510 + }, + { + "epoch": 1.4837925925925926, + "grad_norm": 0.6026734931124957, + "learning_rate": 5.9620076588851514e-06, + "loss": 0.0348, + "step": 12520 + }, + { + "epoch": 1.4849777777777777, + "grad_norm": 0.5815538382704218, + "learning_rate": 5.955240667808785e-06, + "loss": 0.0329, + "step": 12530 + }, + { + "epoch": 1.486162962962963, + "grad_norm": 0.4022565493846757, + "learning_rate": 5.948471860260093e-06, + "loss": 0.0352, + "step": 12540 + }, + { + "epoch": 1.487348148148148, + "grad_norm": 0.5666152365852863, + "learning_rate": 5.94170124911055e-06, + "loss": 0.0322, + "step": 12550 + }, + { + "epoch": 1.4885333333333333, + "grad_norm": 0.5394228797071462, + "learning_rate": 5.934928847235053e-06, + "loss": 0.0353, + "step": 12560 + }, + { + "epoch": 1.4897185185185184, + "grad_norm": 0.5115183626748209, + "learning_rate": 5.928154667511908e-06, + "loss": 0.0345, + "step": 12570 + }, + { + "epoch": 1.4909037037037036, + "grad_norm": 0.5272183873058901, + "learning_rate": 5.9213787228228e-06, + "loss": 0.0338, + "step": 12580 + }, + { + "epoch": 1.492088888888889, + "grad_norm": 0.5952295525046291, + "learning_rate": 5.914601026052769e-06, + "loss": 0.0349, + "step": 12590 + }, + { + "epoch": 1.493274074074074, + "grad_norm": 0.7236059494972624, + "learning_rate": 5.907821590090191e-06, + "loss": 0.0358, + "step": 12600 + }, + { + "epoch": 1.4944592592592594, + "grad_norm": 0.5009202830839249, + "learning_rate": 5.9010404278267475e-06, + "loss": 0.0346, + "step": 12610 + }, + { + "epoch": 1.4956444444444443, + "grad_norm": 0.6375866231808818, + "learning_rate": 5.8942575521574005e-06, + "loss": 0.0327, + "step": 12620 + }, + { + "epoch": 1.4968296296296297, + "grad_norm": 0.6448423065195115, + "learning_rate": 5.887472975980372e-06, + "loss": 0.0354, + "step": 12630 + }, + { + "epoch": 1.4980148148148147, + "grad_norm": 0.5160262111575215, + "learning_rate": 5.880686712197117e-06, + "loss": 0.0318, + "step": 12640 + }, + { + "epoch": 1.4992, + "grad_norm": 0.4143541144775653, + "learning_rate": 5.8738987737123e-06, + "loss": 0.031, + "step": 12650 + }, + { + "epoch": 1.500385185185185, + "grad_norm": 0.3538080001350191, + "learning_rate": 5.867109173433772e-06, + "loss": 0.0337, + "step": 12660 + }, + { + "epoch": 1.5015703703703704, + "grad_norm": 0.46170497963820933, + "learning_rate": 5.8603179242725395e-06, + "loss": 0.0309, + "step": 12670 + }, + { + "epoch": 1.5027555555555554, + "grad_norm": 0.7078203234460463, + "learning_rate": 5.85352503914275e-06, + "loss": 0.0351, + "step": 12680 + }, + { + "epoch": 1.5039407407407408, + "grad_norm": 0.6078356295083309, + "learning_rate": 5.846730530961654e-06, + "loss": 0.0315, + "step": 12690 + }, + { + "epoch": 1.505125925925926, + "grad_norm": 0.5113595872934816, + "learning_rate": 5.8399344126496e-06, + "loss": 0.0334, + "step": 12700 + }, + { + "epoch": 1.5063111111111112, + "grad_norm": 0.5442291307023379, + "learning_rate": 5.833136697129987e-06, + "loss": 0.0323, + "step": 12710 + }, + { + "epoch": 1.5074962962962963, + "grad_norm": 0.7009128371808973, + "learning_rate": 5.826337397329259e-06, + "loss": 0.0325, + "step": 12720 + }, + { + "epoch": 1.5086814814814815, + "grad_norm": 0.8788712357533189, + "learning_rate": 5.819536526176865e-06, + "loss": 0.0321, + "step": 12730 + }, + { + "epoch": 1.5098666666666667, + "grad_norm": 0.8178648648830867, + "learning_rate": 5.812734096605253e-06, + "loss": 0.0339, + "step": 12740 + }, + { + "epoch": 1.5110518518518519, + "grad_norm": 0.7361560613112331, + "learning_rate": 5.8059301215498236e-06, + "loss": 0.0362, + "step": 12750 + }, + { + "epoch": 1.512237037037037, + "grad_norm": 0.6016599075578817, + "learning_rate": 5.799124613948923e-06, + "loss": 0.0318, + "step": 12760 + }, + { + "epoch": 1.5134222222222222, + "grad_norm": 0.5408668261916741, + "learning_rate": 5.792317586743806e-06, + "loss": 0.0326, + "step": 12770 + }, + { + "epoch": 1.5146074074074074, + "grad_norm": 0.4982805394078052, + "learning_rate": 5.7855090528786266e-06, + "loss": 0.0318, + "step": 12780 + }, + { + "epoch": 1.5157925925925926, + "grad_norm": 0.5149272105546082, + "learning_rate": 5.778699025300391e-06, + "loss": 0.0306, + "step": 12790 + }, + { + "epoch": 1.5169777777777778, + "grad_norm": 0.6609594016579912, + "learning_rate": 5.77188751695896e-06, + "loss": 0.0325, + "step": 12800 + }, + { + "epoch": 1.518162962962963, + "grad_norm": 0.48397098756621043, + "learning_rate": 5.765074540806994e-06, + "loss": 0.0352, + "step": 12810 + }, + { + "epoch": 1.5193481481481481, + "grad_norm": 0.5934881688537705, + "learning_rate": 5.758260109799962e-06, + "loss": 0.0339, + "step": 12820 + }, + { + "epoch": 1.5205333333333333, + "grad_norm": 0.5789006476519347, + "learning_rate": 5.751444236896085e-06, + "loss": 0.0334, + "step": 12830 + }, + { + "epoch": 1.5217185185185185, + "grad_norm": 0.5683388999786305, + "learning_rate": 5.744626935056335e-06, + "loss": 0.0349, + "step": 12840 + }, + { + "epoch": 1.5229037037037036, + "grad_norm": 0.5070308237342998, + "learning_rate": 5.737808217244396e-06, + "loss": 0.0326, + "step": 12850 + }, + { + "epoch": 1.524088888888889, + "grad_norm": 0.5091743032835572, + "learning_rate": 5.730988096426649e-06, + "loss": 0.0327, + "step": 12860 + }, + { + "epoch": 1.525274074074074, + "grad_norm": 0.43571818250867006, + "learning_rate": 5.724166585572137e-06, + "loss": 0.0308, + "step": 12870 + }, + { + "epoch": 1.5264592592592594, + "grad_norm": 0.5166481812065432, + "learning_rate": 5.717343697652552e-06, + "loss": 0.0325, + "step": 12880 + }, + { + "epoch": 1.5276444444444444, + "grad_norm": 0.6338275492879064, + "learning_rate": 5.710519445642203e-06, + "loss": 0.0354, + "step": 12890 + }, + { + "epoch": 1.5288296296296298, + "grad_norm": 0.5597685545984137, + "learning_rate": 5.703693842517993e-06, + "loss": 0.0365, + "step": 12900 + }, + { + "epoch": 1.5300148148148147, + "grad_norm": 0.498291021479039, + "learning_rate": 5.696866901259392e-06, + "loss": 0.0322, + "step": 12910 + }, + { + "epoch": 1.5312000000000001, + "grad_norm": 0.6230397076757928, + "learning_rate": 5.690038634848415e-06, + "loss": 0.0325, + "step": 12920 + }, + { + "epoch": 1.532385185185185, + "grad_norm": 0.4620149579603233, + "learning_rate": 5.683209056269601e-06, + "loss": 0.0325, + "step": 12930 + }, + { + "epoch": 1.5335703703703705, + "grad_norm": 0.6229466618263108, + "learning_rate": 5.6763781785099806e-06, + "loss": 0.0339, + "step": 12940 + }, + { + "epoch": 1.5347555555555554, + "grad_norm": 0.6097269669831146, + "learning_rate": 5.669546014559053e-06, + "loss": 0.0323, + "step": 12950 + }, + { + "epoch": 1.5359407407407408, + "grad_norm": 0.5561868163069095, + "learning_rate": 5.662712577408769e-06, + "loss": 0.0336, + "step": 12960 + }, + { + "epoch": 1.5371259259259258, + "grad_norm": 0.47437352647284287, + "learning_rate": 5.6558778800534975e-06, + "loss": 0.0314, + "step": 12970 + }, + { + "epoch": 1.5383111111111112, + "grad_norm": 0.5161057320874354, + "learning_rate": 5.649041935490001e-06, + "loss": 0.0344, + "step": 12980 + }, + { + "epoch": 1.5394962962962961, + "grad_norm": 0.5571655285737502, + "learning_rate": 5.642204756717419e-06, + "loss": 0.0308, + "step": 12990 + }, + { + "epoch": 1.5406814814814815, + "grad_norm": 0.5523295288576924, + "learning_rate": 5.635366356737231e-06, + "loss": 0.0316, + "step": 13000 + }, + { + "epoch": 1.5418666666666667, + "grad_norm": 0.5400102128403003, + "learning_rate": 5.628526748553248e-06, + "loss": 0.0314, + "step": 13010 + }, + { + "epoch": 1.543051851851852, + "grad_norm": 0.7644640708120826, + "learning_rate": 5.62168594517157e-06, + "loss": 0.0332, + "step": 13020 + }, + { + "epoch": 1.544237037037037, + "grad_norm": 0.7127615461894612, + "learning_rate": 5.614843959600577e-06, + "loss": 0.0314, + "step": 13030 + }, + { + "epoch": 1.5454222222222223, + "grad_norm": 0.5754727400557247, + "learning_rate": 5.608000804850887e-06, + "loss": 0.0313, + "step": 13040 + }, + { + "epoch": 1.5466074074074074, + "grad_norm": 0.5599260186744512, + "learning_rate": 5.601156493935355e-06, + "loss": 0.0344, + "step": 13050 + }, + { + "epoch": 1.5477925925925926, + "grad_norm": 0.3826711882836975, + "learning_rate": 5.594311039869022e-06, + "loss": 0.0315, + "step": 13060 + }, + { + "epoch": 1.5489777777777778, + "grad_norm": 0.42317400660031274, + "learning_rate": 5.587464455669109e-06, + "loss": 0.0314, + "step": 13070 + }, + { + "epoch": 1.550162962962963, + "grad_norm": 0.4086030442509822, + "learning_rate": 5.580616754354985e-06, + "loss": 0.0315, + "step": 13080 + }, + { + "epoch": 1.5513481481481481, + "grad_norm": 0.47338628038441094, + "learning_rate": 5.573767948948146e-06, + "loss": 0.0317, + "step": 13090 + }, + { + "epoch": 1.5525333333333333, + "grad_norm": 0.7169833634684311, + "learning_rate": 5.56691805247218e-06, + "loss": 0.0342, + "step": 13100 + }, + { + "epoch": 1.5537185185185185, + "grad_norm": 0.5407255083897317, + "learning_rate": 5.56006707795276e-06, + "loss": 0.0303, + "step": 13110 + }, + { + "epoch": 1.5549037037037037, + "grad_norm": 0.8122389937260033, + "learning_rate": 5.553215038417597e-06, + "loss": 0.0313, + "step": 13120 + }, + { + "epoch": 1.5560888888888889, + "grad_norm": 0.6922836748841144, + "learning_rate": 5.546361946896439e-06, + "loss": 0.036, + "step": 13130 + }, + { + "epoch": 1.557274074074074, + "grad_norm": 0.3190103522451645, + "learning_rate": 5.539507816421027e-06, + "loss": 0.0278, + "step": 13140 + }, + { + "epoch": 1.5584592592592592, + "grad_norm": 0.473287497333893, + "learning_rate": 5.532652660025081e-06, + "loss": 0.031, + "step": 13150 + }, + { + "epoch": 1.5596444444444444, + "grad_norm": 0.5588835513188126, + "learning_rate": 5.525796490744269e-06, + "loss": 0.0332, + "step": 13160 + }, + { + "epoch": 1.5608296296296298, + "grad_norm": 0.44401173743418904, + "learning_rate": 5.518939321616189e-06, + "loss": 0.0315, + "step": 13170 + }, + { + "epoch": 1.5620148148148147, + "grad_norm": 0.5440112495679429, + "learning_rate": 5.512081165680336e-06, + "loss": 0.0351, + "step": 13180 + }, + { + "epoch": 1.5632000000000001, + "grad_norm": 0.5845981732775536, + "learning_rate": 5.5052220359780865e-06, + "loss": 0.0347, + "step": 13190 + }, + { + "epoch": 1.564385185185185, + "grad_norm": 0.4455173070571021, + "learning_rate": 5.498361945552662e-06, + "loss": 0.0346, + "step": 13200 + }, + { + "epoch": 1.5655703703703705, + "grad_norm": 0.4408979763576873, + "learning_rate": 5.491500907449118e-06, + "loss": 0.0335, + "step": 13210 + }, + { + "epoch": 1.5667555555555555, + "grad_norm": 0.8791662408943192, + "learning_rate": 5.484638934714307e-06, + "loss": 0.0282, + "step": 13220 + }, + { + "epoch": 1.5679407407407409, + "grad_norm": 0.7316584611770932, + "learning_rate": 5.47777604039686e-06, + "loss": 0.031, + "step": 13230 + }, + { + "epoch": 1.5691259259259258, + "grad_norm": 0.4964055070794261, + "learning_rate": 5.4709122375471645e-06, + "loss": 0.0321, + "step": 13240 + }, + { + "epoch": 1.5703111111111112, + "grad_norm": 0.5630139537326069, + "learning_rate": 5.464047539217329e-06, + "loss": 0.0294, + "step": 13250 + }, + { + "epoch": 1.5714962962962962, + "grad_norm": 0.5507139644896905, + "learning_rate": 5.457181958461167e-06, + "loss": 0.0322, + "step": 13260 + }, + { + "epoch": 1.5726814814814816, + "grad_norm": 0.4677531128584071, + "learning_rate": 5.450315508334174e-06, + "loss": 0.032, + "step": 13270 + }, + { + "epoch": 1.5738666666666665, + "grad_norm": 0.6262982988055855, + "learning_rate": 5.443448201893496e-06, + "loss": 0.032, + "step": 13280 + }, + { + "epoch": 1.575051851851852, + "grad_norm": 0.8777731943478929, + "learning_rate": 5.436580052197905e-06, + "loss": 0.032, + "step": 13290 + }, + { + "epoch": 1.5762370370370369, + "grad_norm": 0.7150436176075989, + "learning_rate": 5.42971107230778e-06, + "loss": 0.0354, + "step": 13300 + }, + { + "epoch": 1.5774222222222223, + "grad_norm": 0.5825965324224895, + "learning_rate": 5.422841275285075e-06, + "loss": 0.0338, + "step": 13310 + }, + { + "epoch": 1.5786074074074075, + "grad_norm": 0.5536348026035459, + "learning_rate": 5.415970674193303e-06, + "loss": 0.0316, + "step": 13320 + }, + { + "epoch": 1.5797925925925926, + "grad_norm": 0.43967616397879106, + "learning_rate": 5.409099282097502e-06, + "loss": 0.0334, + "step": 13330 + }, + { + "epoch": 1.5809777777777778, + "grad_norm": 0.5732369856067626, + "learning_rate": 5.402227112064216e-06, + "loss": 0.0304, + "step": 13340 + }, + { + "epoch": 1.582162962962963, + "grad_norm": 0.4543688416741047, + "learning_rate": 5.395354177161464e-06, + "loss": 0.0304, + "step": 13350 + }, + { + "epoch": 1.5833481481481482, + "grad_norm": 0.6300372642789709, + "learning_rate": 5.388480490458729e-06, + "loss": 0.0328, + "step": 13360 + }, + { + "epoch": 1.5845333333333333, + "grad_norm": 0.6261021967067414, + "learning_rate": 5.381606065026913e-06, + "loss": 0.0348, + "step": 13370 + }, + { + "epoch": 1.5857185185185185, + "grad_norm": 0.6890944257762874, + "learning_rate": 5.374730913938331e-06, + "loss": 0.0342, + "step": 13380 + }, + { + "epoch": 1.5869037037037037, + "grad_norm": 0.383277010276292, + "learning_rate": 5.367855050266671e-06, + "loss": 0.0285, + "step": 13390 + }, + { + "epoch": 1.5880888888888889, + "grad_norm": 0.5100025539483825, + "learning_rate": 5.360978487086982e-06, + "loss": 0.0283, + "step": 13400 + }, + { + "epoch": 1.589274074074074, + "grad_norm": 0.4436995345159667, + "learning_rate": 5.354101237475638e-06, + "loss": 0.0344, + "step": 13410 + }, + { + "epoch": 1.5904592592592592, + "grad_norm": 0.4429829601313009, + "learning_rate": 5.347223314510324e-06, + "loss": 0.0326, + "step": 13420 + }, + { + "epoch": 1.5916444444444444, + "grad_norm": 0.4971954541232553, + "learning_rate": 5.3403447312699995e-06, + "loss": 0.0333, + "step": 13430 + }, + { + "epoch": 1.5928296296296296, + "grad_norm": 0.4750617880508376, + "learning_rate": 5.333465500834885e-06, + "loss": 0.0311, + "step": 13440 + }, + { + "epoch": 1.5940148148148148, + "grad_norm": 0.516993521346015, + "learning_rate": 5.3265856362864275e-06, + "loss": 0.0318, + "step": 13450 + }, + { + "epoch": 1.5952, + "grad_norm": 0.5913079570692042, + "learning_rate": 5.319705150707281e-06, + "loss": 0.036, + "step": 13460 + }, + { + "epoch": 1.5963851851851851, + "grad_norm": 0.4179441199818623, + "learning_rate": 5.312824057181282e-06, + "loss": 0.0319, + "step": 13470 + }, + { + "epoch": 1.5975703703703705, + "grad_norm": 0.520202667237722, + "learning_rate": 5.3059423687934215e-06, + "loss": 0.0348, + "step": 13480 + }, + { + "epoch": 1.5987555555555555, + "grad_norm": 0.5875177871794808, + "learning_rate": 5.299060098629822e-06, + "loss": 0.0312, + "step": 13490 + }, + { + "epoch": 1.5999407407407409, + "grad_norm": 0.42173946676094465, + "learning_rate": 5.292177259777712e-06, + "loss": 0.0336, + "step": 13500 + }, + { + "epoch": 1.6011259259259258, + "grad_norm": 0.4960348121315329, + "learning_rate": 5.285293865325403e-06, + "loss": 0.0342, + "step": 13510 + }, + { + "epoch": 1.6023111111111112, + "grad_norm": 0.6682583846044446, + "learning_rate": 5.278409928362261e-06, + "loss": 0.0325, + "step": 13520 + }, + { + "epoch": 1.6034962962962962, + "grad_norm": 0.4409770099559712, + "learning_rate": 5.271525461978685e-06, + "loss": 0.0323, + "step": 13530 + }, + { + "epoch": 1.6046814814814816, + "grad_norm": 0.46424600824419054, + "learning_rate": 5.264640479266079e-06, + "loss": 0.0339, + "step": 13540 + }, + { + "epoch": 1.6058666666666666, + "grad_norm": 0.527160755063716, + "learning_rate": 5.257754993316831e-06, + "loss": 0.0264, + "step": 13550 + }, + { + "epoch": 1.607051851851852, + "grad_norm": 0.628985681715465, + "learning_rate": 5.250869017224284e-06, + "loss": 0.0333, + "step": 13560 + }, + { + "epoch": 1.608237037037037, + "grad_norm": 0.44631646827315336, + "learning_rate": 5.243982564082716e-06, + "loss": 0.033, + "step": 13570 + }, + { + "epoch": 1.6094222222222223, + "grad_norm": 0.6095352401098801, + "learning_rate": 5.237095646987308e-06, + "loss": 0.0319, + "step": 13580 + }, + { + "epoch": 1.6106074074074073, + "grad_norm": 0.5683086177600853, + "learning_rate": 5.230208279034128e-06, + "loss": 0.0308, + "step": 13590 + }, + { + "epoch": 1.6117925925925927, + "grad_norm": 0.4649675799365343, + "learning_rate": 5.223320473320095e-06, + "loss": 0.0321, + "step": 13600 + }, + { + "epoch": 1.6129777777777776, + "grad_norm": 0.5165314027749358, + "learning_rate": 5.216432242942969e-06, + "loss": 0.0306, + "step": 13610 + }, + { + "epoch": 1.614162962962963, + "grad_norm": 0.6363078505636841, + "learning_rate": 5.209543601001307e-06, + "loss": 0.0312, + "step": 13620 + }, + { + "epoch": 1.6153481481481482, + "grad_norm": 0.5388796574454523, + "learning_rate": 5.20265456059446e-06, + "loss": 0.029, + "step": 13630 + }, + { + "epoch": 1.6165333333333334, + "grad_norm": 0.5813676482541661, + "learning_rate": 5.195765134822528e-06, + "loss": 0.0312, + "step": 13640 + }, + { + "epoch": 1.6177185185185186, + "grad_norm": 0.4924277892767643, + "learning_rate": 5.188875336786349e-06, + "loss": 0.0326, + "step": 13650 + }, + { + "epoch": 1.6189037037037037, + "grad_norm": 0.4839135957845775, + "learning_rate": 5.181985179587463e-06, + "loss": 0.0323, + "step": 13660 + }, + { + "epoch": 1.620088888888889, + "grad_norm": 0.3535677135763709, + "learning_rate": 5.1750946763281e-06, + "loss": 0.0317, + "step": 13670 + }, + { + "epoch": 1.621274074074074, + "grad_norm": 0.5514640615117711, + "learning_rate": 5.1682038401111446e-06, + "loss": 0.0295, + "step": 13680 + }, + { + "epoch": 1.6224592592592593, + "grad_norm": 0.5094425767604717, + "learning_rate": 5.161312684040114e-06, + "loss": 0.0335, + "step": 13690 + }, + { + "epoch": 1.6236444444444444, + "grad_norm": 0.5582975954470566, + "learning_rate": 5.154421221219135e-06, + "loss": 0.033, + "step": 13700 + }, + { + "epoch": 1.6248296296296296, + "grad_norm": 0.46751325802838417, + "learning_rate": 5.147529464752916e-06, + "loss": 0.0318, + "step": 13710 + }, + { + "epoch": 1.6260148148148148, + "grad_norm": 0.5851422023563263, + "learning_rate": 5.140637427746726e-06, + "loss": 0.0328, + "step": 13720 + }, + { + "epoch": 1.6272, + "grad_norm": 0.531886778556515, + "learning_rate": 5.133745123306366e-06, + "loss": 0.0331, + "step": 13730 + }, + { + "epoch": 1.6283851851851852, + "grad_norm": 0.542386697371137, + "learning_rate": 5.126852564538145e-06, + "loss": 0.0311, + "step": 13740 + }, + { + "epoch": 1.6295703703703703, + "grad_norm": 0.6884539166308733, + "learning_rate": 5.11995976454886e-06, + "loss": 0.0329, + "step": 13750 + }, + { + "epoch": 1.6307555555555555, + "grad_norm": 0.4259484145215916, + "learning_rate": 5.1130667364457585e-06, + "loss": 0.0314, + "step": 13760 + }, + { + "epoch": 1.6319407407407407, + "grad_norm": 0.3449300478568714, + "learning_rate": 5.10617349333653e-06, + "loss": 0.0289, + "step": 13770 + }, + { + "epoch": 1.6331259259259259, + "grad_norm": 0.6387429519380369, + "learning_rate": 5.099280048329268e-06, + "loss": 0.0328, + "step": 13780 + }, + { + "epoch": 1.6343111111111113, + "grad_norm": 0.5000192532252813, + "learning_rate": 5.092386414532452e-06, + "loss": 0.0313, + "step": 13790 + }, + { + "epoch": 1.6354962962962962, + "grad_norm": 0.4543401858549053, + "learning_rate": 5.085492605054919e-06, + "loss": 0.0317, + "step": 13800 + }, + { + "epoch": 1.6366814814814816, + "grad_norm": 0.3808911842435508, + "learning_rate": 5.0785986330058415e-06, + "loss": 0.0292, + "step": 13810 + }, + { + "epoch": 1.6378666666666666, + "grad_norm": 0.641061286823941, + "learning_rate": 5.0717045114946995e-06, + "loss": 0.0334, + "step": 13820 + }, + { + "epoch": 1.639051851851852, + "grad_norm": 0.5358972255901302, + "learning_rate": 5.064810253631261e-06, + "loss": 0.0279, + "step": 13830 + }, + { + "epoch": 1.640237037037037, + "grad_norm": 0.5497119658997647, + "learning_rate": 5.057915872525546e-06, + "loss": 0.0306, + "step": 13840 + }, + { + "epoch": 1.6414222222222223, + "grad_norm": 0.5254691133999501, + "learning_rate": 5.0510213812878175e-06, + "loss": 0.0297, + "step": 13850 + }, + { + "epoch": 1.6426074074074073, + "grad_norm": 0.7330691921751293, + "learning_rate": 5.044126793028543e-06, + "loss": 0.0337, + "step": 13860 + }, + { + "epoch": 1.6437925925925927, + "grad_norm": 0.5483217602392962, + "learning_rate": 5.037232120858374e-06, + "loss": 0.0305, + "step": 13870 + }, + { + "epoch": 1.6449777777777776, + "grad_norm": 0.6661446428335243, + "learning_rate": 5.030337377888124e-06, + "loss": 0.0304, + "step": 13880 + }, + { + "epoch": 1.646162962962963, + "grad_norm": 0.31071784886065973, + "learning_rate": 5.0234425772287385e-06, + "loss": 0.0312, + "step": 13890 + }, + { + "epoch": 1.647348148148148, + "grad_norm": 0.44037140857046925, + "learning_rate": 5.016547731991277e-06, + "loss": 0.0329, + "step": 13900 + }, + { + "epoch": 1.6485333333333334, + "grad_norm": 0.42685646270216976, + "learning_rate": 5.009652855286878e-06, + "loss": 0.0296, + "step": 13910 + }, + { + "epoch": 1.6497185185185184, + "grad_norm": 0.4170962141617505, + "learning_rate": 5.002757960226744e-06, + "loss": 0.0297, + "step": 13920 + }, + { + "epoch": 1.6509037037037038, + "grad_norm": 0.44864544262814066, + "learning_rate": 4.995863059922111e-06, + "loss": 0.033, + "step": 13930 + }, + { + "epoch": 1.652088888888889, + "grad_norm": 0.5355523913822217, + "learning_rate": 4.988968167484227e-06, + "loss": 0.0295, + "step": 13940 + }, + { + "epoch": 1.6532740740740741, + "grad_norm": 0.5123179332709443, + "learning_rate": 4.982073296024321e-06, + "loss": 0.0304, + "step": 13950 + }, + { + "epoch": 1.6544592592592593, + "grad_norm": 0.7933900045484795, + "learning_rate": 4.975178458653586e-06, + "loss": 0.0293, + "step": 13960 + }, + { + "epoch": 1.6556444444444445, + "grad_norm": 0.4222212429222394, + "learning_rate": 4.968283668483146e-06, + "loss": 0.0303, + "step": 13970 + }, + { + "epoch": 1.6568296296296297, + "grad_norm": 0.7353778216738837, + "learning_rate": 4.961388938624038e-06, + "loss": 0.032, + "step": 13980 + }, + { + "epoch": 1.6580148148148148, + "grad_norm": 0.5278524829550586, + "learning_rate": 4.9544942821871875e-06, + "loss": 0.031, + "step": 13990 + }, + { + "epoch": 1.6592, + "grad_norm": 0.4757237246191436, + "learning_rate": 4.947599712283375e-06, + "loss": 0.0296, + "step": 14000 + }, + { + "epoch": 1.6603851851851852, + "grad_norm": 0.46821950454696787, + "learning_rate": 4.940705242023219e-06, + "loss": 0.0276, + "step": 14010 + }, + { + "epoch": 1.6615703703703704, + "grad_norm": 0.4402682731768157, + "learning_rate": 4.933810884517148e-06, + "loss": 0.0279, + "step": 14020 + }, + { + "epoch": 1.6627555555555555, + "grad_norm": 0.5489239763857653, + "learning_rate": 4.926916652875373e-06, + "loss": 0.0306, + "step": 14030 + }, + { + "epoch": 1.6639407407407407, + "grad_norm": 0.5936036416875581, + "learning_rate": 4.920022560207873e-06, + "loss": 0.0311, + "step": 14040 + }, + { + "epoch": 1.665125925925926, + "grad_norm": 0.4778677822228148, + "learning_rate": 4.913128619624355e-06, + "loss": 0.0321, + "step": 14050 + }, + { + "epoch": 1.666311111111111, + "grad_norm": 0.459358995507567, + "learning_rate": 4.9062348442342405e-06, + "loss": 0.0307, + "step": 14060 + }, + { + "epoch": 1.6674962962962963, + "grad_norm": 0.520021202362464, + "learning_rate": 4.899341247146639e-06, + "loss": 0.0294, + "step": 14070 + }, + { + "epoch": 1.6686814814814814, + "grad_norm": 0.4650761047088053, + "learning_rate": 4.892447841470318e-06, + "loss": 0.0296, + "step": 14080 + }, + { + "epoch": 1.6698666666666666, + "grad_norm": 0.4188820057598217, + "learning_rate": 4.885554640313679e-06, + "loss": 0.0308, + "step": 14090 + }, + { + "epoch": 1.671051851851852, + "grad_norm": 0.6553845274366292, + "learning_rate": 4.8786616567847415e-06, + "loss": 0.0306, + "step": 14100 + }, + { + "epoch": 1.672237037037037, + "grad_norm": 0.42579030631098574, + "learning_rate": 4.871768903991102e-06, + "loss": 0.0312, + "step": 14110 + }, + { + "epoch": 1.6734222222222224, + "grad_norm": 0.4372859940183096, + "learning_rate": 4.864876395039926e-06, + "loss": 0.0299, + "step": 14120 + }, + { + "epoch": 1.6746074074074073, + "grad_norm": 0.5449410951853673, + "learning_rate": 4.857984143037911e-06, + "loss": 0.0285, + "step": 14130 + }, + { + "epoch": 1.6757925925925927, + "grad_norm": 0.4035070881722527, + "learning_rate": 4.851092161091267e-06, + "loss": 0.0304, + "step": 14140 + }, + { + "epoch": 1.6769777777777777, + "grad_norm": 0.38621528763695523, + "learning_rate": 4.844200462305693e-06, + "loss": 0.0303, + "step": 14150 + }, + { + "epoch": 1.678162962962963, + "grad_norm": 0.48185406252429064, + "learning_rate": 4.837309059786344e-06, + "loss": 0.0313, + "step": 14160 + }, + { + "epoch": 1.679348148148148, + "grad_norm": 0.5947634110300325, + "learning_rate": 4.830417966637817e-06, + "loss": 0.0371, + "step": 14170 + }, + { + "epoch": 1.6805333333333334, + "grad_norm": 0.4826138196658728, + "learning_rate": 4.823527195964119e-06, + "loss": 0.0304, + "step": 14180 + }, + { + "epoch": 1.6817185185185184, + "grad_norm": 0.7322907119535944, + "learning_rate": 4.816636760868642e-06, + "loss": 0.0296, + "step": 14190 + }, + { + "epoch": 1.6829037037037038, + "grad_norm": 0.47679496814141087, + "learning_rate": 4.809746674454142e-06, + "loss": 0.033, + "step": 14200 + }, + { + "epoch": 1.6840888888888887, + "grad_norm": 0.460173285349719, + "learning_rate": 4.802856949822709e-06, + "loss": 0.0292, + "step": 14210 + }, + { + "epoch": 1.6852740740740741, + "grad_norm": 0.7931355404195776, + "learning_rate": 4.79596760007575e-06, + "loss": 0.0285, + "step": 14220 + }, + { + "epoch": 1.686459259259259, + "grad_norm": 0.6092037703130151, + "learning_rate": 4.789078638313956e-06, + "loss": 0.0318, + "step": 14230 + }, + { + "epoch": 1.6876444444444445, + "grad_norm": 0.5072650049144025, + "learning_rate": 4.78219007763728e-06, + "loss": 0.0317, + "step": 14240 + }, + { + "epoch": 1.6888296296296297, + "grad_norm": 0.4029340535377157, + "learning_rate": 4.775301931144913e-06, + "loss": 0.0281, + "step": 14250 + }, + { + "epoch": 1.6900148148148149, + "grad_norm": 0.48497410162524296, + "learning_rate": 4.7684142119352564e-06, + "loss": 0.0311, + "step": 14260 + }, + { + "epoch": 1.6912, + "grad_norm": 0.43522761213890987, + "learning_rate": 4.761526933105905e-06, + "loss": 0.0325, + "step": 14270 + }, + { + "epoch": 1.6923851851851852, + "grad_norm": 0.5016689344640952, + "learning_rate": 4.754640107753607e-06, + "loss": 0.0324, + "step": 14280 + }, + { + "epoch": 1.6935703703703704, + "grad_norm": 0.4937025258400182, + "learning_rate": 4.747753748974256e-06, + "loss": 0.0336, + "step": 14290 + }, + { + "epoch": 1.6947555555555556, + "grad_norm": 0.5057012342095206, + "learning_rate": 4.7408678698628555e-06, + "loss": 0.0301, + "step": 14300 + }, + { + "epoch": 1.6959407407407407, + "grad_norm": 0.6057372860962584, + "learning_rate": 4.733982483513499e-06, + "loss": 0.0297, + "step": 14310 + }, + { + "epoch": 1.697125925925926, + "grad_norm": 0.6106725729801042, + "learning_rate": 4.727097603019339e-06, + "loss": 0.0315, + "step": 14320 + }, + { + "epoch": 1.698311111111111, + "grad_norm": 0.530901331449317, + "learning_rate": 4.72021324147257e-06, + "loss": 0.0282, + "step": 14330 + }, + { + "epoch": 1.6994962962962963, + "grad_norm": 0.4935798770006407, + "learning_rate": 4.713329411964395e-06, + "loss": 0.0295, + "step": 14340 + }, + { + "epoch": 1.7006814814814815, + "grad_norm": 0.5634254836080385, + "learning_rate": 4.706446127585011e-06, + "loss": 0.0309, + "step": 14350 + }, + { + "epoch": 1.7018666666666666, + "grad_norm": 0.4046089753188776, + "learning_rate": 4.699563401423572e-06, + "loss": 0.0305, + "step": 14360 + }, + { + "epoch": 1.7030518518518518, + "grad_norm": 0.46720466293233187, + "learning_rate": 4.692681246568175e-06, + "loss": 0.0327, + "step": 14370 + }, + { + "epoch": 1.704237037037037, + "grad_norm": 0.4284975562781869, + "learning_rate": 4.685799676105833e-06, + "loss": 0.0327, + "step": 14380 + }, + { + "epoch": 1.7054222222222222, + "grad_norm": 0.4335668602538958, + "learning_rate": 4.678918703122443e-06, + "loss": 0.0332, + "step": 14390 + }, + { + "epoch": 1.7066074074074074, + "grad_norm": 0.4443969529333878, + "learning_rate": 4.672038340702765e-06, + "loss": 0.0333, + "step": 14400 + }, + { + "epoch": 1.7077925925925928, + "grad_norm": 0.41635457749946203, + "learning_rate": 4.665158601930402e-06, + "loss": 0.03, + "step": 14410 + }, + { + "epoch": 1.7089777777777777, + "grad_norm": 0.532917600779119, + "learning_rate": 4.658279499887769e-06, + "loss": 0.029, + "step": 14420 + }, + { + "epoch": 1.710162962962963, + "grad_norm": 0.6427531171748737, + "learning_rate": 4.6514010476560695e-06, + "loss": 0.0317, + "step": 14430 + }, + { + "epoch": 1.711348148148148, + "grad_norm": 0.47754549180960465, + "learning_rate": 4.644523258315273e-06, + "loss": 0.0273, + "step": 14440 + }, + { + "epoch": 1.7125333333333335, + "grad_norm": 0.51966123237056, + "learning_rate": 4.637646144944086e-06, + "loss": 0.0277, + "step": 14450 + }, + { + "epoch": 1.7137185185185184, + "grad_norm": 0.6344384371740579, + "learning_rate": 4.630769720619935e-06, + "loss": 0.0304, + "step": 14460 + }, + { + "epoch": 1.7149037037037038, + "grad_norm": 0.5124649297628256, + "learning_rate": 4.62389399841893e-06, + "loss": 0.0309, + "step": 14470 + }, + { + "epoch": 1.7160888888888888, + "grad_norm": 0.7370031495170535, + "learning_rate": 4.617018991415849e-06, + "loss": 0.0313, + "step": 14480 + }, + { + "epoch": 1.7172740740740742, + "grad_norm": 0.6802078608873966, + "learning_rate": 4.61014471268411e-06, + "loss": 0.0337, + "step": 14490 + }, + { + "epoch": 1.7184592592592591, + "grad_norm": 0.5294116180956786, + "learning_rate": 4.603271175295745e-06, + "loss": 0.0307, + "step": 14500 + }, + { + "epoch": 1.7196444444444445, + "grad_norm": 0.5790305278482072, + "learning_rate": 4.596398392321376e-06, + "loss": 0.0305, + "step": 14510 + }, + { + "epoch": 1.7208296296296295, + "grad_norm": 0.3545145887216605, + "learning_rate": 4.5895263768301895e-06, + "loss": 0.0291, + "step": 14520 + }, + { + "epoch": 1.7220148148148149, + "grad_norm": 0.5870242012913149, + "learning_rate": 4.582655141889918e-06, + "loss": 0.0278, + "step": 14530 + }, + { + "epoch": 1.7231999999999998, + "grad_norm": 0.530338737581341, + "learning_rate": 4.575784700566805e-06, + "loss": 0.034, + "step": 14540 + }, + { + "epoch": 1.7243851851851852, + "grad_norm": 0.3864531510757058, + "learning_rate": 4.568915065925585e-06, + "loss": 0.0303, + "step": 14550 + }, + { + "epoch": 1.7255703703703704, + "grad_norm": 0.700862334642734, + "learning_rate": 4.562046251029461e-06, + "loss": 0.0323, + "step": 14560 + }, + { + "epoch": 1.7267555555555556, + "grad_norm": 0.45647606396128376, + "learning_rate": 4.555178268940073e-06, + "loss": 0.0309, + "step": 14570 + }, + { + "epoch": 1.7279407407407408, + "grad_norm": 0.4039825537426591, + "learning_rate": 4.548311132717482e-06, + "loss": 0.0291, + "step": 14580 + }, + { + "epoch": 1.729125925925926, + "grad_norm": 0.5180726000904167, + "learning_rate": 4.541444855420136e-06, + "loss": 0.0313, + "step": 14590 + }, + { + "epoch": 1.7303111111111111, + "grad_norm": 0.43671016883425606, + "learning_rate": 4.534579450104854e-06, + "loss": 0.0309, + "step": 14600 + }, + { + "epoch": 1.7314962962962963, + "grad_norm": 0.4953216633105977, + "learning_rate": 4.527714929826793e-06, + "loss": 0.027, + "step": 14610 + }, + { + "epoch": 1.7326814814814815, + "grad_norm": 0.4875449748426298, + "learning_rate": 4.5208513076394335e-06, + "loss": 0.0308, + "step": 14620 + }, + { + "epoch": 1.7338666666666667, + "grad_norm": 0.47029719396474, + "learning_rate": 4.513988596594539e-06, + "loss": 0.0328, + "step": 14630 + }, + { + "epoch": 1.7350518518518518, + "grad_norm": 0.48243692535173677, + "learning_rate": 4.507126809742148e-06, + "loss": 0.03, + "step": 14640 + }, + { + "epoch": 1.736237037037037, + "grad_norm": 0.5764337703138601, + "learning_rate": 4.500265960130537e-06, + "loss": 0.0324, + "step": 14650 + }, + { + "epoch": 1.7374222222222222, + "grad_norm": 0.6105999092443823, + "learning_rate": 4.493406060806202e-06, + "loss": 0.0277, + "step": 14660 + }, + { + "epoch": 1.7386074074074074, + "grad_norm": 0.5344248733212823, + "learning_rate": 4.486547124813832e-06, + "loss": 0.0292, + "step": 14670 + }, + { + "epoch": 1.7397925925925926, + "grad_norm": 0.5438780644211062, + "learning_rate": 4.479689165196283e-06, + "loss": 0.0263, + "step": 14680 + }, + { + "epoch": 1.7409777777777777, + "grad_norm": 0.5625317091725303, + "learning_rate": 4.472832194994557e-06, + "loss": 0.0325, + "step": 14690 + }, + { + "epoch": 1.742162962962963, + "grad_norm": 0.5911177408897746, + "learning_rate": 4.465976227247773e-06, + "loss": 0.0319, + "step": 14700 + }, + { + "epoch": 1.743348148148148, + "grad_norm": 0.5688399100122186, + "learning_rate": 4.459121274993141e-06, + "loss": 0.0309, + "step": 14710 + }, + { + "epoch": 1.7445333333333335, + "grad_norm": 0.568522407943515, + "learning_rate": 4.452267351265947e-06, + "loss": 0.0281, + "step": 14720 + }, + { + "epoch": 1.7457185185185184, + "grad_norm": 0.5196838385210767, + "learning_rate": 4.445414469099512e-06, + "loss": 0.03, + "step": 14730 + }, + { + "epoch": 1.7469037037037038, + "grad_norm": 0.5563854041950367, + "learning_rate": 4.438562641525184e-06, + "loss": 0.0292, + "step": 14740 + }, + { + "epoch": 1.7480888888888888, + "grad_norm": 0.4929953062910534, + "learning_rate": 4.4317118815723e-06, + "loss": 0.0303, + "step": 14750 + }, + { + "epoch": 1.7492740740740742, + "grad_norm": 0.6030489423102072, + "learning_rate": 4.424862202268172e-06, + "loss": 0.0303, + "step": 14760 + }, + { + "epoch": 1.7504592592592592, + "grad_norm": 0.6562596332834234, + "learning_rate": 4.418013616638056e-06, + "loss": 0.0302, + "step": 14770 + }, + { + "epoch": 1.7516444444444446, + "grad_norm": 0.6551357962548602, + "learning_rate": 4.411166137705122e-06, + "loss": 0.0309, + "step": 14780 + }, + { + "epoch": 1.7528296296296295, + "grad_norm": 0.5385225980834302, + "learning_rate": 4.404319778490445e-06, + "loss": 0.0312, + "step": 14790 + }, + { + "epoch": 1.754014814814815, + "grad_norm": 0.3758759556508297, + "learning_rate": 4.397474552012964e-06, + "loss": 0.033, + "step": 14800 + }, + { + "epoch": 1.7551999999999999, + "grad_norm": 0.41694548874861553, + "learning_rate": 4.390630471289465e-06, + "loss": 0.0301, + "step": 14810 + }, + { + "epoch": 1.7563851851851853, + "grad_norm": 0.5754747471394601, + "learning_rate": 4.38378754933456e-06, + "loss": 0.0281, + "step": 14820 + }, + { + "epoch": 1.7575703703703702, + "grad_norm": 0.5915191091531806, + "learning_rate": 4.376945799160649e-06, + "loss": 0.0318, + "step": 14830 + }, + { + "epoch": 1.7587555555555556, + "grad_norm": 0.4968711750275647, + "learning_rate": 4.370105233777912e-06, + "loss": 0.035, + "step": 14840 + }, + { + "epoch": 1.7599407407407406, + "grad_norm": 0.5308736395041633, + "learning_rate": 4.363265866194274e-06, + "loss": 0.0276, + "step": 14850 + }, + { + "epoch": 1.761125925925926, + "grad_norm": 0.5743095281000444, + "learning_rate": 4.356427709415378e-06, + "loss": 0.0343, + "step": 14860 + }, + { + "epoch": 1.7623111111111112, + "grad_norm": 0.46070132136944636, + "learning_rate": 4.349590776444569e-06, + "loss": 0.0319, + "step": 14870 + }, + { + "epoch": 1.7634962962962963, + "grad_norm": 0.39401651982755276, + "learning_rate": 4.342755080282861e-06, + "loss": 0.0312, + "step": 14880 + }, + { + "epoch": 1.7646814814814815, + "grad_norm": 0.5720650300000568, + "learning_rate": 4.335920633928922e-06, + "loss": 0.028, + "step": 14890 + }, + { + "epoch": 1.7658666666666667, + "grad_norm": 0.4041693252361372, + "learning_rate": 4.329087450379038e-06, + "loss": 0.0297, + "step": 14900 + }, + { + "epoch": 1.7670518518518519, + "grad_norm": 0.5166965234748857, + "learning_rate": 4.322255542627093e-06, + "loss": 0.0288, + "step": 14910 + }, + { + "epoch": 1.768237037037037, + "grad_norm": 0.49562874888174113, + "learning_rate": 4.315424923664552e-06, + "loss": 0.0298, + "step": 14920 + }, + { + "epoch": 1.7694222222222222, + "grad_norm": 0.4374031735110688, + "learning_rate": 4.308595606480423e-06, + "loss": 0.0325, + "step": 14930 + }, + { + "epoch": 1.7706074074074074, + "grad_norm": 0.4812305306996144, + "learning_rate": 4.301767604061239e-06, + "loss": 0.0292, + "step": 14940 + }, + { + "epoch": 1.7717925925925926, + "grad_norm": 0.40303043536958244, + "learning_rate": 4.294940929391035e-06, + "loss": 0.0302, + "step": 14950 + }, + { + "epoch": 1.7729777777777778, + "grad_norm": 0.4200106880557695, + "learning_rate": 4.288115595451321e-06, + "loss": 0.0251, + "step": 14960 + }, + { + "epoch": 1.774162962962963, + "grad_norm": 0.3344483051302518, + "learning_rate": 4.281291615221056e-06, + "loss": 0.0307, + "step": 14970 + }, + { + "epoch": 1.7753481481481481, + "grad_norm": 0.4306614667374063, + "learning_rate": 4.274469001676625e-06, + "loss": 0.0283, + "step": 14980 + }, + { + "epoch": 1.7765333333333333, + "grad_norm": 0.5685891809532609, + "learning_rate": 4.267647767791815e-06, + "loss": 0.0355, + "step": 14990 + }, + { + "epoch": 1.7777185185185185, + "grad_norm": 0.5403635494746096, + "learning_rate": 4.260827926537789e-06, + "loss": 0.0308, + "step": 15000 + }, + { + "epoch": 1.7789037037037037, + "grad_norm": 0.4040489065668533, + "learning_rate": 4.254009490883065e-06, + "loss": 0.0263, + "step": 15010 + }, + { + "epoch": 1.7800888888888888, + "grad_norm": 0.4964353797283434, + "learning_rate": 4.24719247379348e-06, + "loss": 0.0335, + "step": 15020 + }, + { + "epoch": 1.7812740740740742, + "grad_norm": 0.39068385349187273, + "learning_rate": 4.240376888232183e-06, + "loss": 0.0281, + "step": 15030 + }, + { + "epoch": 1.7824592592592592, + "grad_norm": 0.4711320437734779, + "learning_rate": 4.233562747159593e-06, + "loss": 0.0281, + "step": 15040 + }, + { + "epoch": 1.7836444444444446, + "grad_norm": 0.7219182584901922, + "learning_rate": 4.226750063533388e-06, + "loss": 0.0314, + "step": 15050 + }, + { + "epoch": 1.7848296296296295, + "grad_norm": 0.3974088633545491, + "learning_rate": 4.21993885030847e-06, + "loss": 0.0312, + "step": 15060 + }, + { + "epoch": 1.786014814814815, + "grad_norm": 0.44801645854078387, + "learning_rate": 4.213129120436949e-06, + "loss": 0.0296, + "step": 15070 + }, + { + "epoch": 1.7872, + "grad_norm": 0.510863185058965, + "learning_rate": 4.206320886868112e-06, + "loss": 0.0313, + "step": 15080 + }, + { + "epoch": 1.7883851851851853, + "grad_norm": 0.5678227657074751, + "learning_rate": 4.1995141625484e-06, + "loss": 0.03, + "step": 15090 + }, + { + "epoch": 1.7895703703703703, + "grad_norm": 0.5242516926779672, + "learning_rate": 4.192708960421385e-06, + "loss": 0.0275, + "step": 15100 + }, + { + "epoch": 1.7907555555555557, + "grad_norm": 0.5199210470998465, + "learning_rate": 4.185905293427745e-06, + "loss": 0.0312, + "step": 15110 + }, + { + "epoch": 1.7919407407407406, + "grad_norm": 0.40367957397348764, + "learning_rate": 4.1791031745052384e-06, + "loss": 0.0273, + "step": 15120 + }, + { + "epoch": 1.793125925925926, + "grad_norm": 0.5915136124073064, + "learning_rate": 4.1723026165886794e-06, + "loss": 0.0314, + "step": 15130 + }, + { + "epoch": 1.794311111111111, + "grad_norm": 0.6569441500677481, + "learning_rate": 4.165503632609913e-06, + "loss": 0.0289, + "step": 15140 + }, + { + "epoch": 1.7954962962962964, + "grad_norm": 0.4809613279072172, + "learning_rate": 4.158706235497792e-06, + "loss": 0.0299, + "step": 15150 + }, + { + "epoch": 1.7966814814814813, + "grad_norm": 0.5399371293626984, + "learning_rate": 4.1519104381781556e-06, + "loss": 0.029, + "step": 15160 + }, + { + "epoch": 1.7978666666666667, + "grad_norm": 0.5692173171210713, + "learning_rate": 4.1451162535737936e-06, + "loss": 0.0274, + "step": 15170 + }, + { + "epoch": 1.799051851851852, + "grad_norm": 0.531974701043535, + "learning_rate": 4.138323694604434e-06, + "loss": 0.029, + "step": 15180 + }, + { + "epoch": 1.800237037037037, + "grad_norm": 0.5121615947338684, + "learning_rate": 4.1315327741867105e-06, + "loss": 0.0327, + "step": 15190 + }, + { + "epoch": 1.8014222222222223, + "grad_norm": 0.4578239908458243, + "learning_rate": 4.124743505234144e-06, + "loss": 0.0302, + "step": 15200 + }, + { + "epoch": 1.8026074074074074, + "grad_norm": 0.6727969400600277, + "learning_rate": 4.117955900657114e-06, + "loss": 0.0283, + "step": 15210 + }, + { + "epoch": 1.8037925925925926, + "grad_norm": 0.45732990864355993, + "learning_rate": 4.1111699733628324e-06, + "loss": 0.0281, + "step": 15220 + }, + { + "epoch": 1.8049777777777778, + "grad_norm": 0.35424286856821796, + "learning_rate": 4.104385736255326e-06, + "loss": 0.0246, + "step": 15230 + }, + { + "epoch": 1.806162962962963, + "grad_norm": 0.4214838579827263, + "learning_rate": 4.097603202235407e-06, + "loss": 0.0275, + "step": 15240 + }, + { + "epoch": 1.8073481481481481, + "grad_norm": 0.45429656595212553, + "learning_rate": 4.090822384200643e-06, + "loss": 0.0289, + "step": 15250 + }, + { + "epoch": 1.8085333333333333, + "grad_norm": 0.4547850141275008, + "learning_rate": 4.084043295045348e-06, + "loss": 0.0309, + "step": 15260 + }, + { + "epoch": 1.8097185185185185, + "grad_norm": 0.8654630582377518, + "learning_rate": 4.0772659476605385e-06, + "loss": 0.0288, + "step": 15270 + }, + { + "epoch": 1.8109037037037037, + "grad_norm": 0.5430544170251327, + "learning_rate": 4.0704903549339264e-06, + "loss": 0.0277, + "step": 15280 + }, + { + "epoch": 1.8120888888888889, + "grad_norm": 0.507855494833691, + "learning_rate": 4.063716529749881e-06, + "loss": 0.029, + "step": 15290 + }, + { + "epoch": 1.813274074074074, + "grad_norm": 0.4564793922324654, + "learning_rate": 4.056944484989419e-06, + "loss": 0.0273, + "step": 15300 + }, + { + "epoch": 1.8144592592592592, + "grad_norm": 0.3624531191320748, + "learning_rate": 4.050174233530164e-06, + "loss": 0.029, + "step": 15310 + }, + { + "epoch": 1.8156444444444444, + "grad_norm": 0.5539100728280977, + "learning_rate": 4.043405788246331e-06, + "loss": 0.0285, + "step": 15320 + }, + { + "epoch": 1.8168296296296296, + "grad_norm": 0.4810497292397226, + "learning_rate": 4.036639162008701e-06, + "loss": 0.0294, + "step": 15330 + }, + { + "epoch": 1.818014814814815, + "grad_norm": 0.4975033519659776, + "learning_rate": 4.0298743676845975e-06, + "loss": 0.0299, + "step": 15340 + }, + { + "epoch": 1.8192, + "grad_norm": 0.5337195365254993, + "learning_rate": 4.0231114181378565e-06, + "loss": 0.0293, + "step": 15350 + }, + { + "epoch": 1.8203851851851853, + "grad_norm": 0.4548057233761515, + "learning_rate": 4.016350326228811e-06, + "loss": 0.0318, + "step": 15360 + }, + { + "epoch": 1.8215703703703703, + "grad_norm": 0.4607108088569288, + "learning_rate": 4.009591104814256e-06, + "loss": 0.0285, + "step": 15370 + }, + { + "epoch": 1.8227555555555557, + "grad_norm": 0.6479755682874064, + "learning_rate": 4.002833766747436e-06, + "loss": 0.0292, + "step": 15380 + }, + { + "epoch": 1.8239407407407406, + "grad_norm": 0.4326748796763295, + "learning_rate": 3.996078324878009e-06, + "loss": 0.0321, + "step": 15390 + }, + { + "epoch": 1.825125925925926, + "grad_norm": 0.5465986554709112, + "learning_rate": 3.989324792052029e-06, + "loss": 0.0308, + "step": 15400 + }, + { + "epoch": 1.826311111111111, + "grad_norm": 0.4885014862256522, + "learning_rate": 3.982573181111921e-06, + "loss": 0.0265, + "step": 15410 + }, + { + "epoch": 1.8274962962962964, + "grad_norm": 0.5637981051697301, + "learning_rate": 3.975823504896453e-06, + "loss": 0.028, + "step": 15420 + }, + { + "epoch": 1.8286814814814814, + "grad_norm": 0.4954193235187059, + "learning_rate": 3.969075776240715e-06, + "loss": 0.0293, + "step": 15430 + }, + { + "epoch": 1.8298666666666668, + "grad_norm": 0.3858236316602962, + "learning_rate": 3.962330007976095e-06, + "loss": 0.0282, + "step": 15440 + }, + { + "epoch": 1.8310518518518517, + "grad_norm": 0.6469037388758082, + "learning_rate": 3.955586212930247e-06, + "loss": 0.0316, + "step": 15450 + }, + { + "epoch": 1.832237037037037, + "grad_norm": 0.5161181257082362, + "learning_rate": 3.948844403927084e-06, + "loss": 0.0287, + "step": 15460 + }, + { + "epoch": 1.833422222222222, + "grad_norm": 0.4957757715541177, + "learning_rate": 3.942104593786734e-06, + "loss": 0.0241, + "step": 15470 + }, + { + "epoch": 1.8346074074074075, + "grad_norm": 0.4798218550765448, + "learning_rate": 3.935366795325524e-06, + "loss": 0.0277, + "step": 15480 + }, + { + "epoch": 1.8357925925925926, + "grad_norm": 0.3362899784693629, + "learning_rate": 3.928631021355959e-06, + "loss": 0.0269, + "step": 15490 + }, + { + "epoch": 1.8369777777777778, + "grad_norm": 0.5565080911334147, + "learning_rate": 3.921897284686692e-06, + "loss": 0.0263, + "step": 15500 + }, + { + "epoch": 1.838162962962963, + "grad_norm": 0.5322127301741044, + "learning_rate": 3.915165598122503e-06, + "loss": 0.0286, + "step": 15510 + }, + { + "epoch": 1.8393481481481482, + "grad_norm": 0.5881755059371034, + "learning_rate": 3.908435974464274e-06, + "loss": 0.0271, + "step": 15520 + }, + { + "epoch": 1.8405333333333334, + "grad_norm": 0.5841021082618638, + "learning_rate": 3.901708426508961e-06, + "loss": 0.0322, + "step": 15530 + }, + { + "epoch": 1.8417185185185185, + "grad_norm": 0.49861153593008484, + "learning_rate": 3.894982967049578e-06, + "loss": 0.0299, + "step": 15540 + }, + { + "epoch": 1.8429037037037037, + "grad_norm": 0.519470148674024, + "learning_rate": 3.888259608875165e-06, + "loss": 0.0269, + "step": 15550 + }, + { + "epoch": 1.8440888888888889, + "grad_norm": 0.5111672382070884, + "learning_rate": 3.881538364770764e-06, + "loss": 0.0284, + "step": 15560 + }, + { + "epoch": 1.845274074074074, + "grad_norm": 0.4161033822798717, + "learning_rate": 3.874819247517401e-06, + "loss": 0.0266, + "step": 15570 + }, + { + "epoch": 1.8464592592592592, + "grad_norm": 0.4722318850809975, + "learning_rate": 3.8681022698920535e-06, + "loss": 0.0297, + "step": 15580 + }, + { + "epoch": 1.8476444444444444, + "grad_norm": 0.6447393156446577, + "learning_rate": 3.8613874446676345e-06, + "loss": 0.0282, + "step": 15590 + }, + { + "epoch": 1.8488296296296296, + "grad_norm": 0.6406703830306794, + "learning_rate": 3.854674784612958e-06, + "loss": 0.0266, + "step": 15600 + }, + { + "epoch": 1.8500148148148148, + "grad_norm": 0.5038505156329683, + "learning_rate": 3.84796430249273e-06, + "loss": 0.0306, + "step": 15610 + }, + { + "epoch": 1.8512, + "grad_norm": 0.5057400059062028, + "learning_rate": 3.8412560110675066e-06, + "loss": 0.0325, + "step": 15620 + }, + { + "epoch": 1.8523851851851851, + "grad_norm": 0.4155202517038341, + "learning_rate": 3.834549923093683e-06, + "loss": 0.029, + "step": 15630 + }, + { + "epoch": 1.8535703703703703, + "grad_norm": 0.5002648922617754, + "learning_rate": 3.82784605132346e-06, + "loss": 0.0305, + "step": 15640 + }, + { + "epoch": 1.8547555555555557, + "grad_norm": 0.7098206156922805, + "learning_rate": 3.821144408504829e-06, + "loss": 0.0301, + "step": 15650 + }, + { + "epoch": 1.8559407407407407, + "grad_norm": 0.46588244230059567, + "learning_rate": 3.8144450073815385e-06, + "loss": 0.0317, + "step": 15660 + }, + { + "epoch": 1.857125925925926, + "grad_norm": 0.5992782297199127, + "learning_rate": 3.8077478606930783e-06, + "loss": 0.0331, + "step": 15670 + }, + { + "epoch": 1.858311111111111, + "grad_norm": 0.44376204875494346, + "learning_rate": 3.8010529811746454e-06, + "loss": 0.027, + "step": 15680 + }, + { + "epoch": 1.8594962962962964, + "grad_norm": 0.526217394541863, + "learning_rate": 3.794360381557133e-06, + "loss": 0.029, + "step": 15690 + }, + { + "epoch": 1.8606814814814814, + "grad_norm": 0.41960226025451475, + "learning_rate": 3.787670074567095e-06, + "loss": 0.0277, + "step": 15700 + }, + { + "epoch": 1.8618666666666668, + "grad_norm": 0.5910047693090584, + "learning_rate": 3.780982072926723e-06, + "loss": 0.0275, + "step": 15710 + }, + { + "epoch": 1.8630518518518517, + "grad_norm": 0.4809660511390173, + "learning_rate": 3.7742963893538297e-06, + "loss": 0.0299, + "step": 15720 + }, + { + "epoch": 1.8642370370370371, + "grad_norm": 0.4609492306276235, + "learning_rate": 3.7676130365618187e-06, + "loss": 0.0297, + "step": 15730 + }, + { + "epoch": 1.865422222222222, + "grad_norm": 0.5745527059713748, + "learning_rate": 3.760932027259657e-06, + "loss": 0.0335, + "step": 15740 + }, + { + "epoch": 1.8666074074074075, + "grad_norm": 0.4485573868820827, + "learning_rate": 3.7542533741518623e-06, + "loss": 0.0251, + "step": 15750 + }, + { + "epoch": 1.8677925925925924, + "grad_norm": 0.40369696111070064, + "learning_rate": 3.747577089938464e-06, + "loss": 0.0293, + "step": 15760 + }, + { + "epoch": 1.8689777777777778, + "grad_norm": 0.4243320066064984, + "learning_rate": 3.740903187314994e-06, + "loss": 0.0269, + "step": 15770 + }, + { + "epoch": 1.8701629629629628, + "grad_norm": 0.43761623252645204, + "learning_rate": 3.7342316789724532e-06, + "loss": 0.0258, + "step": 15780 + }, + { + "epoch": 1.8713481481481482, + "grad_norm": 0.4706054857045525, + "learning_rate": 3.7275625775972868e-06, + "loss": 0.0269, + "step": 15790 + }, + { + "epoch": 1.8725333333333334, + "grad_norm": 0.6190183432777634, + "learning_rate": 3.720895895871366e-06, + "loss": 0.0308, + "step": 15800 + }, + { + "epoch": 1.8737185185185186, + "grad_norm": 0.5334940190844957, + "learning_rate": 3.7142316464719585e-06, + "loss": 0.0295, + "step": 15810 + }, + { + "epoch": 1.8749037037037037, + "grad_norm": 0.5439994890483385, + "learning_rate": 3.7075698420717076e-06, + "loss": 0.0275, + "step": 15820 + }, + { + "epoch": 1.876088888888889, + "grad_norm": 0.41694810142074284, + "learning_rate": 3.7009104953386087e-06, + "loss": 0.0283, + "step": 15830 + }, + { + "epoch": 1.877274074074074, + "grad_norm": 0.6126828394160406, + "learning_rate": 3.6942536189359846e-06, + "loss": 0.0292, + "step": 15840 + }, + { + "epoch": 1.8784592592592593, + "grad_norm": 0.40628786409431195, + "learning_rate": 3.6875992255224547e-06, + "loss": 0.031, + "step": 15850 + }, + { + "epoch": 1.8796444444444445, + "grad_norm": 0.5598379107304569, + "learning_rate": 3.6809473277519243e-06, + "loss": 0.0281, + "step": 15860 + }, + { + "epoch": 1.8808296296296296, + "grad_norm": 0.36756466696688483, + "learning_rate": 3.6742979382735455e-06, + "loss": 0.0255, + "step": 15870 + }, + { + "epoch": 1.8820148148148148, + "grad_norm": 0.5931365049812046, + "learning_rate": 3.6676510697317085e-06, + "loss": 0.03, + "step": 15880 + }, + { + "epoch": 1.8832, + "grad_norm": 0.4538241610352921, + "learning_rate": 3.6610067347660026e-06, + "loss": 0.0283, + "step": 15890 + }, + { + "epoch": 1.8843851851851852, + "grad_norm": 0.46624277914023465, + "learning_rate": 3.654364946011205e-06, + "loss": 0.0306, + "step": 15900 + }, + { + "epoch": 1.8855703703703703, + "grad_norm": 0.4820046871128, + "learning_rate": 3.6477257160972435e-06, + "loss": 0.0291, + "step": 15910 + }, + { + "epoch": 1.8867555555555555, + "grad_norm": 0.6964191362619795, + "learning_rate": 3.641089057649192e-06, + "loss": 0.0303, + "step": 15920 + }, + { + "epoch": 1.8879407407407407, + "grad_norm": 0.47869732567934326, + "learning_rate": 3.6344549832872233e-06, + "loss": 0.0267, + "step": 15930 + }, + { + "epoch": 1.8891259259259259, + "grad_norm": 0.5529243475762332, + "learning_rate": 3.627823505626603e-06, + "loss": 0.029, + "step": 15940 + }, + { + "epoch": 1.890311111111111, + "grad_norm": 0.46900237813063694, + "learning_rate": 3.6211946372776537e-06, + "loss": 0.0318, + "step": 15950 + }, + { + "epoch": 1.8914962962962965, + "grad_norm": 0.5898620179837001, + "learning_rate": 3.614568390845741e-06, + "loss": 0.0264, + "step": 15960 + }, + { + "epoch": 1.8926814814814814, + "grad_norm": 0.39819637014363, + "learning_rate": 3.607944778931242e-06, + "loss": 0.03, + "step": 15970 + }, + { + "epoch": 1.8938666666666668, + "grad_norm": 0.46274651599760835, + "learning_rate": 3.601323814129525e-06, + "loss": 0.0258, + "step": 15980 + }, + { + "epoch": 1.8950518518518518, + "grad_norm": 0.5451947596958668, + "learning_rate": 3.5947055090309223e-06, + "loss": 0.027, + "step": 15990 + }, + { + "epoch": 1.8962370370370372, + "grad_norm": 0.5647378171574232, + "learning_rate": 3.5880898762207128e-06, + "loss": 0.0291, + "step": 16000 + }, + { + "epoch": 1.8974222222222221, + "grad_norm": 0.6051927793816031, + "learning_rate": 3.5814769282790907e-06, + "loss": 0.0303, + "step": 16010 + }, + { + "epoch": 1.8986074074074075, + "grad_norm": 0.5687503417822197, + "learning_rate": 3.5748666777811473e-06, + "loss": 0.027, + "step": 16020 + }, + { + "epoch": 1.8997925925925925, + "grad_norm": 0.5407824615906663, + "learning_rate": 3.56825913729684e-06, + "loss": 0.0268, + "step": 16030 + }, + { + "epoch": 1.9009777777777779, + "grad_norm": 0.46859862600170465, + "learning_rate": 3.5616543193909783e-06, + "loss": 0.0274, + "step": 16040 + }, + { + "epoch": 1.9021629629629628, + "grad_norm": 0.45405283019784876, + "learning_rate": 3.5550522366231876e-06, + "loss": 0.0282, + "step": 16050 + }, + { + "epoch": 1.9033481481481482, + "grad_norm": 0.5174868254718724, + "learning_rate": 3.548452901547901e-06, + "loss": 0.0309, + "step": 16060 + }, + { + "epoch": 1.9045333333333332, + "grad_norm": 0.45250526811771735, + "learning_rate": 3.541856326714318e-06, + "loss": 0.026, + "step": 16070 + }, + { + "epoch": 1.9057185185185186, + "grad_norm": 0.6017336351020726, + "learning_rate": 3.5352625246663954e-06, + "loss": 0.0285, + "step": 16080 + }, + { + "epoch": 1.9069037037037035, + "grad_norm": 0.3395513036241249, + "learning_rate": 3.528671507942816e-06, + "loss": 0.0303, + "step": 16090 + }, + { + "epoch": 1.908088888888889, + "grad_norm": 0.5038084307863366, + "learning_rate": 3.522083289076964e-06, + "loss": 0.0269, + "step": 16100 + }, + { + "epoch": 1.9092740740740741, + "grad_norm": 0.5163753542883514, + "learning_rate": 3.515497880596905e-06, + "loss": 0.0278, + "step": 16110 + }, + { + "epoch": 1.9104592592592593, + "grad_norm": 0.42022863658206705, + "learning_rate": 3.508915295025358e-06, + "loss": 0.0279, + "step": 16120 + }, + { + "epoch": 1.9116444444444445, + "grad_norm": 0.40692186116901474, + "learning_rate": 3.5023355448796777e-06, + "loss": 0.0305, + "step": 16130 + }, + { + "epoch": 1.9128296296296297, + "grad_norm": 0.5668476685494173, + "learning_rate": 3.495758642671824e-06, + "loss": 0.0306, + "step": 16140 + }, + { + "epoch": 1.9140148148148148, + "grad_norm": 0.4424793546433323, + "learning_rate": 3.489184600908344e-06, + "loss": 0.0258, + "step": 16150 + }, + { + "epoch": 1.9152, + "grad_norm": 0.554147578183282, + "learning_rate": 3.4826134320903428e-06, + "loss": 0.0288, + "step": 16160 + }, + { + "epoch": 1.9163851851851852, + "grad_norm": 0.44569460616605566, + "learning_rate": 3.4760451487134645e-06, + "loss": 0.0285, + "step": 16170 + }, + { + "epoch": 1.9175703703703704, + "grad_norm": 0.37719323990982606, + "learning_rate": 3.4694797632678627e-06, + "loss": 0.0261, + "step": 16180 + }, + { + "epoch": 1.9187555555555555, + "grad_norm": 0.5111246144797355, + "learning_rate": 3.462917288238185e-06, + "loss": 0.0299, + "step": 16190 + }, + { + "epoch": 1.9199407407407407, + "grad_norm": 0.43678865887098206, + "learning_rate": 3.4563577361035405e-06, + "loss": 0.0261, + "step": 16200 + }, + { + "epoch": 1.921125925925926, + "grad_norm": 0.40706931567139115, + "learning_rate": 3.4498011193374837e-06, + "loss": 0.0276, + "step": 16210 + }, + { + "epoch": 1.922311111111111, + "grad_norm": 0.5716168770877608, + "learning_rate": 3.4432474504079818e-06, + "loss": 0.0286, + "step": 16220 + }, + { + "epoch": 1.9234962962962963, + "grad_norm": 0.44597959357535155, + "learning_rate": 3.436696741777407e-06, + "loss": 0.029, + "step": 16230 + }, + { + "epoch": 1.9246814814814814, + "grad_norm": 0.418352530258196, + "learning_rate": 3.430149005902489e-06, + "loss": 0.0265, + "step": 16240 + }, + { + "epoch": 1.9258666666666666, + "grad_norm": 0.44231181686031346, + "learning_rate": 3.423604255234315e-06, + "loss": 0.0298, + "step": 16250 + }, + { + "epoch": 1.9270518518518518, + "grad_norm": 0.3500491976233091, + "learning_rate": 3.417062502218289e-06, + "loss": 0.0266, + "step": 16260 + }, + { + "epoch": 1.9282370370370372, + "grad_norm": 0.5466099206429552, + "learning_rate": 3.41052375929412e-06, + "loss": 0.0292, + "step": 16270 + }, + { + "epoch": 1.9294222222222221, + "grad_norm": 0.4477341461260506, + "learning_rate": 3.4039880388957864e-06, + "loss": 0.0291, + "step": 16280 + }, + { + "epoch": 1.9306074074074075, + "grad_norm": 0.48970004609394685, + "learning_rate": 3.3974553534515264e-06, + "loss": 0.0289, + "step": 16290 + }, + { + "epoch": 1.9317925925925925, + "grad_norm": 0.7261947237022754, + "learning_rate": 3.3909257153838006e-06, + "loss": 0.0301, + "step": 16300 + }, + { + "epoch": 1.932977777777778, + "grad_norm": 0.5628159186222998, + "learning_rate": 3.3843991371092794e-06, + "loss": 0.0287, + "step": 16310 + }, + { + "epoch": 1.9341629629629629, + "grad_norm": 0.42375554178250247, + "learning_rate": 3.377875631038813e-06, + "loss": 0.0288, + "step": 16320 + }, + { + "epoch": 1.9353481481481483, + "grad_norm": 0.562442321595791, + "learning_rate": 3.3713552095774106e-06, + "loss": 0.0293, + "step": 16330 + }, + { + "epoch": 1.9365333333333332, + "grad_norm": 0.4826177073652702, + "learning_rate": 3.3648378851242115e-06, + "loss": 0.0277, + "step": 16340 + }, + { + "epoch": 1.9377185185185186, + "grad_norm": 0.4105968087946979, + "learning_rate": 3.3583236700724723e-06, + "loss": 0.0288, + "step": 16350 + }, + { + "epoch": 1.9389037037037036, + "grad_norm": 0.48715253857179075, + "learning_rate": 3.351812576809531e-06, + "loss": 0.0273, + "step": 16360 + }, + { + "epoch": 1.940088888888889, + "grad_norm": 0.4959529198340603, + "learning_rate": 3.3453046177167907e-06, + "loss": 0.0284, + "step": 16370 + }, + { + "epoch": 1.941274074074074, + "grad_norm": 0.49236835007377455, + "learning_rate": 3.3387998051697002e-06, + "loss": 0.0276, + "step": 16380 + }, + { + "epoch": 1.9424592592592593, + "grad_norm": 0.4955130152472539, + "learning_rate": 3.332298151537716e-06, + "loss": 0.0269, + "step": 16390 + }, + { + "epoch": 1.9436444444444443, + "grad_norm": 0.7023422206575342, + "learning_rate": 3.325799669184295e-06, + "loss": 0.028, + "step": 16400 + }, + { + "epoch": 1.9448296296296297, + "grad_norm": 0.3956179947762618, + "learning_rate": 3.319304370466857e-06, + "loss": 0.029, + "step": 16410 + }, + { + "epoch": 1.9460148148148149, + "grad_norm": 0.4501613805232214, + "learning_rate": 3.3128122677367747e-06, + "loss": 0.0217, + "step": 16420 + }, + { + "epoch": 1.9472, + "grad_norm": 0.556450555200919, + "learning_rate": 3.306323373339338e-06, + "loss": 0.0291, + "step": 16430 + }, + { + "epoch": 1.9483851851851852, + "grad_norm": 0.5214643811890484, + "learning_rate": 3.2998376996137383e-06, + "loss": 0.0275, + "step": 16440 + }, + { + "epoch": 1.9495703703703704, + "grad_norm": 0.4489831157563276, + "learning_rate": 3.293355258893042e-06, + "loss": 0.0286, + "step": 16450 + }, + { + "epoch": 1.9507555555555556, + "grad_norm": 0.5067054659676723, + "learning_rate": 3.2868760635041696e-06, + "loss": 0.0277, + "step": 16460 + }, + { + "epoch": 1.9519407407407408, + "grad_norm": 0.4097933595328039, + "learning_rate": 3.2804001257678674e-06, + "loss": 0.0278, + "step": 16470 + }, + { + "epoch": 1.953125925925926, + "grad_norm": 0.5213929502472637, + "learning_rate": 3.273927457998689e-06, + "loss": 0.0288, + "step": 16480 + }, + { + "epoch": 1.954311111111111, + "grad_norm": 0.4969659426454915, + "learning_rate": 3.267458072504967e-06, + "loss": 0.0308, + "step": 16490 + }, + { + "epoch": 1.9554962962962963, + "grad_norm": 0.523825208177789, + "learning_rate": 3.2609919815887974e-06, + "loss": 0.0288, + "step": 16500 + }, + { + "epoch": 1.9566814814814815, + "grad_norm": 0.43342427931537186, + "learning_rate": 3.2545291975460058e-06, + "loss": 0.0277, + "step": 16510 + }, + { + "epoch": 1.9578666666666666, + "grad_norm": 0.4444642806632608, + "learning_rate": 3.248069732666135e-06, + "loss": 0.0268, + "step": 16520 + }, + { + "epoch": 1.9590518518518518, + "grad_norm": 0.42095491273050895, + "learning_rate": 3.2416135992324084e-06, + "loss": 0.0296, + "step": 16530 + }, + { + "epoch": 1.960237037037037, + "grad_norm": 0.49850627441237044, + "learning_rate": 3.2351608095217244e-06, + "loss": 0.0306, + "step": 16540 + }, + { + "epoch": 1.9614222222222222, + "grad_norm": 0.5249168466760042, + "learning_rate": 3.228711375804616e-06, + "loss": 0.0252, + "step": 16550 + }, + { + "epoch": 1.9626074074074074, + "grad_norm": 0.4617493468704232, + "learning_rate": 3.2222653103452377e-06, + "loss": 0.0287, + "step": 16560 + }, + { + "epoch": 1.9637925925925925, + "grad_norm": 0.48353472232940464, + "learning_rate": 3.215822625401335e-06, + "loss": 0.0261, + "step": 16570 + }, + { + "epoch": 1.964977777777778, + "grad_norm": 0.5018899958874573, + "learning_rate": 3.2093833332242297e-06, + "loss": 0.0253, + "step": 16580 + }, + { + "epoch": 1.966162962962963, + "grad_norm": 0.44293497278832766, + "learning_rate": 3.2029474460587886e-06, + "loss": 0.0302, + "step": 16590 + }, + { + "epoch": 1.9673481481481483, + "grad_norm": 0.5289736580903666, + "learning_rate": 3.1965149761434056e-06, + "loss": 0.0266, + "step": 16600 + }, + { + "epoch": 1.9685333333333332, + "grad_norm": 0.4432894958891086, + "learning_rate": 3.1900859357099734e-06, + "loss": 0.0292, + "step": 16610 + }, + { + "epoch": 1.9697185185185186, + "grad_norm": 0.6122164685823401, + "learning_rate": 3.1836603369838697e-06, + "loss": 0.0316, + "step": 16620 + }, + { + "epoch": 1.9709037037037036, + "grad_norm": 0.44770167815912215, + "learning_rate": 3.1772381921839212e-06, + "loss": 0.0254, + "step": 16630 + }, + { + "epoch": 1.972088888888889, + "grad_norm": 0.6081499764545403, + "learning_rate": 3.1708195135223895e-06, + "loss": 0.0303, + "step": 16640 + }, + { + "epoch": 1.973274074074074, + "grad_norm": 0.5069386007266375, + "learning_rate": 3.164404313204944e-06, + "loss": 0.0233, + "step": 16650 + }, + { + "epoch": 1.9744592592592594, + "grad_norm": 0.561366928292409, + "learning_rate": 3.15799260343064e-06, + "loss": 0.0304, + "step": 16660 + }, + { + "epoch": 1.9756444444444443, + "grad_norm": 0.558200172607736, + "learning_rate": 3.1515843963918952e-06, + "loss": 0.0273, + "step": 16670 + }, + { + "epoch": 1.9768296296296297, + "grad_norm": 0.6094887586344587, + "learning_rate": 3.1451797042744654e-06, + "loss": 0.0277, + "step": 16680 + }, + { + "epoch": 1.9780148148148147, + "grad_norm": 0.4939130530614874, + "learning_rate": 3.138778539257427e-06, + "loss": 0.028, + "step": 16690 + }, + { + "epoch": 1.9792, + "grad_norm": 0.4498344520360258, + "learning_rate": 3.132380913513143e-06, + "loss": 0.0273, + "step": 16700 + }, + { + "epoch": 1.980385185185185, + "grad_norm": 0.5145262397259287, + "learning_rate": 3.1259868392072525e-06, + "loss": 0.0276, + "step": 16710 + }, + { + "epoch": 1.9815703703703704, + "grad_norm": 0.5488739293687596, + "learning_rate": 3.1195963284986343e-06, + "loss": 0.0276, + "step": 16720 + }, + { + "epoch": 1.9827555555555556, + "grad_norm": 0.4824153911950187, + "learning_rate": 3.113209393539396e-06, + "loss": 0.0267, + "step": 16730 + }, + { + "epoch": 1.9839407407407408, + "grad_norm": 0.3240425216116917, + "learning_rate": 3.1068260464748453e-06, + "loss": 0.0269, + "step": 16740 + }, + { + "epoch": 1.985125925925926, + "grad_norm": 0.4918205581945271, + "learning_rate": 3.1004462994434636e-06, + "loss": 0.0286, + "step": 16750 + }, + { + "epoch": 1.9863111111111111, + "grad_norm": 0.5092021418663703, + "learning_rate": 3.0940701645768882e-06, + "loss": 0.0282, + "step": 16760 + }, + { + "epoch": 1.9874962962962963, + "grad_norm": 0.43239668671736603, + "learning_rate": 3.0876976539998927e-06, + "loss": 0.0281, + "step": 16770 + }, + { + "epoch": 1.9886814814814815, + "grad_norm": 0.48101851927752226, + "learning_rate": 3.0813287798303493e-06, + "loss": 0.0294, + "step": 16780 + }, + { + "epoch": 1.9898666666666667, + "grad_norm": 0.5123373247203269, + "learning_rate": 3.0749635541792245e-06, + "loss": 0.0295, + "step": 16790 + }, + { + "epoch": 1.9910518518518519, + "grad_norm": 0.5201502334866416, + "learning_rate": 3.0686019891505386e-06, + "loss": 0.0268, + "step": 16800 + }, + { + "epoch": 1.992237037037037, + "grad_norm": 0.43273612199401407, + "learning_rate": 3.062244096841358e-06, + "loss": 0.0265, + "step": 16810 + }, + { + "epoch": 1.9934222222222222, + "grad_norm": 0.42623948593609784, + "learning_rate": 3.05588988934176e-06, + "loss": 0.0287, + "step": 16820 + }, + { + "epoch": 1.9946074074074074, + "grad_norm": 0.6180521272782323, + "learning_rate": 3.049539378734818e-06, + "loss": 0.0253, + "step": 16830 + }, + { + "epoch": 1.9957925925925926, + "grad_norm": 0.4820479106144094, + "learning_rate": 3.043192577096571e-06, + "loss": 0.0293, + "step": 16840 + }, + { + "epoch": 1.9969777777777777, + "grad_norm": 0.41628943844623667, + "learning_rate": 3.0368494964960147e-06, + "loss": 0.0257, + "step": 16850 + }, + { + "epoch": 1.998162962962963, + "grad_norm": 0.47298114938576746, + "learning_rate": 3.0305101489950583e-06, + "loss": 0.0254, + "step": 16860 + }, + { + "epoch": 1.999348148148148, + "grad_norm": 0.559686936226428, + "learning_rate": 3.0241745466485185e-06, + "loss": 0.0281, + "step": 16870 + }, + { + "epoch": 2.000474074074074, + "grad_norm": 0.3482473401349315, + "learning_rate": 3.0178427015040858e-06, + "loss": 0.0229, + "step": 16880 + }, + { + "epoch": 2.0016592592592595, + "grad_norm": 0.46580309230335915, + "learning_rate": 3.011514625602312e-06, + "loss": 0.021, + "step": 16890 + }, + { + "epoch": 2.0028444444444444, + "grad_norm": 0.40274055242316353, + "learning_rate": 3.005190330976574e-06, + "loss": 0.0215, + "step": 16900 + }, + { + "epoch": 2.00402962962963, + "grad_norm": 0.6303959964339435, + "learning_rate": 2.998869829653064e-06, + "loss": 0.0229, + "step": 16910 + }, + { + "epoch": 2.005214814814815, + "grad_norm": 0.5138781974516277, + "learning_rate": 2.9925531336507607e-06, + "loss": 0.0186, + "step": 16920 + }, + { + "epoch": 2.0064, + "grad_norm": 0.44224945268347166, + "learning_rate": 2.9862402549814033e-06, + "loss": 0.0191, + "step": 16930 + }, + { + "epoch": 2.007585185185185, + "grad_norm": 0.47402190837384184, + "learning_rate": 2.9799312056494744e-06, + "loss": 0.0207, + "step": 16940 + }, + { + "epoch": 2.0087703703703705, + "grad_norm": 0.4512227152680803, + "learning_rate": 2.9736259976521743e-06, + "loss": 0.0183, + "step": 16950 + }, + { + "epoch": 2.0099555555555555, + "grad_norm": 0.39109377769125514, + "learning_rate": 2.9673246429793977e-06, + "loss": 0.0203, + "step": 16960 + }, + { + "epoch": 2.011140740740741, + "grad_norm": 0.46949579253078194, + "learning_rate": 2.9610271536137137e-06, + "loss": 0.0206, + "step": 16970 + }, + { + "epoch": 2.012325925925926, + "grad_norm": 0.7052053510510133, + "learning_rate": 2.954733541530339e-06, + "loss": 0.0232, + "step": 16980 + }, + { + "epoch": 2.0135111111111113, + "grad_norm": 0.5301340846313625, + "learning_rate": 2.948443818697118e-06, + "loss": 0.0207, + "step": 16990 + }, + { + "epoch": 2.014696296296296, + "grad_norm": 0.43845805140236066, + "learning_rate": 2.9421579970745033e-06, + "loss": 0.0193, + "step": 17000 + }, + { + "epoch": 2.0158814814814816, + "grad_norm": 0.637056439108828, + "learning_rate": 2.9358760886155225e-06, + "loss": 0.0233, + "step": 17010 + }, + { + "epoch": 2.0170666666666666, + "grad_norm": 0.5642052007571374, + "learning_rate": 2.9295981052657664e-06, + "loss": 0.02, + "step": 17020 + }, + { + "epoch": 2.018251851851852, + "grad_norm": 0.5030801354899268, + "learning_rate": 2.9233240589633592e-06, + "loss": 0.0201, + "step": 17030 + }, + { + "epoch": 2.019437037037037, + "grad_norm": 0.6592069503241937, + "learning_rate": 2.917053961638942e-06, + "loss": 0.0191, + "step": 17040 + }, + { + "epoch": 2.0206222222222223, + "grad_norm": 0.6891333301586693, + "learning_rate": 2.9107878252156405e-06, + "loss": 0.0209, + "step": 17050 + }, + { + "epoch": 2.0218074074074073, + "grad_norm": 0.5843514128740532, + "learning_rate": 2.904525661609057e-06, + "loss": 0.0212, + "step": 17060 + }, + { + "epoch": 2.0229925925925927, + "grad_norm": 0.57460003385646, + "learning_rate": 2.8982674827272306e-06, + "loss": 0.0194, + "step": 17070 + }, + { + "epoch": 2.0241777777777776, + "grad_norm": 0.5048643098953294, + "learning_rate": 2.8920133004706297e-06, + "loss": 0.0211, + "step": 17080 + }, + { + "epoch": 2.025362962962963, + "grad_norm": 0.7024965213733743, + "learning_rate": 2.8857631267321196e-06, + "loss": 0.0219, + "step": 17090 + }, + { + "epoch": 2.026548148148148, + "grad_norm": 0.5432895602355671, + "learning_rate": 2.8795169733969397e-06, + "loss": 0.0199, + "step": 17100 + }, + { + "epoch": 2.0277333333333334, + "grad_norm": 0.49977345022966446, + "learning_rate": 2.8732748523426934e-06, + "loss": 0.0229, + "step": 17110 + }, + { + "epoch": 2.0289185185185183, + "grad_norm": 0.5138196441980594, + "learning_rate": 2.8670367754393093e-06, + "loss": 0.0212, + "step": 17120 + }, + { + "epoch": 2.0301037037037037, + "grad_norm": 0.5205940883171687, + "learning_rate": 2.860802754549026e-06, + "loss": 0.0187, + "step": 17130 + }, + { + "epoch": 2.0312888888888887, + "grad_norm": 0.6485691519856771, + "learning_rate": 2.8545728015263692e-06, + "loss": 0.0227, + "step": 17140 + }, + { + "epoch": 2.032474074074074, + "grad_norm": 0.46190546081380174, + "learning_rate": 2.848346928218133e-06, + "loss": 0.0197, + "step": 17150 + }, + { + "epoch": 2.033659259259259, + "grad_norm": 0.5374744840579662, + "learning_rate": 2.8421251464633527e-06, + "loss": 0.0223, + "step": 17160 + }, + { + "epoch": 2.0348444444444445, + "grad_norm": 0.5035594297525493, + "learning_rate": 2.8359074680932797e-06, + "loss": 0.0221, + "step": 17170 + }, + { + "epoch": 2.0360296296296294, + "grad_norm": 0.6218404427421674, + "learning_rate": 2.8296939049313632e-06, + "loss": 0.0212, + "step": 17180 + }, + { + "epoch": 2.037214814814815, + "grad_norm": 0.5627208622442489, + "learning_rate": 2.8234844687932304e-06, + "loss": 0.0229, + "step": 17190 + }, + { + "epoch": 2.0384, + "grad_norm": 0.5894673881803788, + "learning_rate": 2.8172791714866586e-06, + "loss": 0.0203, + "step": 17200 + }, + { + "epoch": 2.039585185185185, + "grad_norm": 0.652610746033946, + "learning_rate": 2.8110780248115533e-06, + "loss": 0.0202, + "step": 17210 + }, + { + "epoch": 2.0407703703703706, + "grad_norm": 0.6797262603338378, + "learning_rate": 2.8048810405599268e-06, + "loss": 0.0224, + "step": 17220 + }, + { + "epoch": 2.0419555555555555, + "grad_norm": 0.44534648584222053, + "learning_rate": 2.7986882305158798e-06, + "loss": 0.0195, + "step": 17230 + }, + { + "epoch": 2.043140740740741, + "grad_norm": 0.42191356985030354, + "learning_rate": 2.7924996064555754e-06, + "loss": 0.0196, + "step": 17240 + }, + { + "epoch": 2.044325925925926, + "grad_norm": 0.5084660524003956, + "learning_rate": 2.7863151801472125e-06, + "loss": 0.0209, + "step": 17250 + }, + { + "epoch": 2.0455111111111113, + "grad_norm": 0.5181340879304304, + "learning_rate": 2.780134963351009e-06, + "loss": 0.0188, + "step": 17260 + }, + { + "epoch": 2.0466962962962962, + "grad_norm": 0.4645642370564578, + "learning_rate": 2.773958967819181e-06, + "loss": 0.022, + "step": 17270 + }, + { + "epoch": 2.0478814814814816, + "grad_norm": 0.5798027491530987, + "learning_rate": 2.7677872052959153e-06, + "loss": 0.0207, + "step": 17280 + }, + { + "epoch": 2.0490666666666666, + "grad_norm": 0.3604404015266568, + "learning_rate": 2.7616196875173486e-06, + "loss": 0.0202, + "step": 17290 + }, + { + "epoch": 2.050251851851852, + "grad_norm": 0.5321509327667007, + "learning_rate": 2.7554564262115433e-06, + "loss": 0.0207, + "step": 17300 + }, + { + "epoch": 2.051437037037037, + "grad_norm": 0.8777408808180764, + "learning_rate": 2.7492974330984756e-06, + "loss": 0.0207, + "step": 17310 + }, + { + "epoch": 2.0526222222222223, + "grad_norm": 0.5048830394161766, + "learning_rate": 2.7431427198900018e-06, + "loss": 0.0212, + "step": 17320 + }, + { + "epoch": 2.0538074074074073, + "grad_norm": 0.506975039542351, + "learning_rate": 2.7369922982898356e-06, + "loss": 0.0208, + "step": 17330 + }, + { + "epoch": 2.0549925925925927, + "grad_norm": 0.6546830075558804, + "learning_rate": 2.730846179993535e-06, + "loss": 0.0197, + "step": 17340 + }, + { + "epoch": 2.0561777777777777, + "grad_norm": 0.6130693163162522, + "learning_rate": 2.7247043766884685e-06, + "loss": 0.0227, + "step": 17350 + }, + { + "epoch": 2.057362962962963, + "grad_norm": 0.6165697629279787, + "learning_rate": 2.718566900053809e-06, + "loss": 0.0234, + "step": 17360 + }, + { + "epoch": 2.058548148148148, + "grad_norm": 0.5902572278545075, + "learning_rate": 2.7124337617604933e-06, + "loss": 0.0193, + "step": 17370 + }, + { + "epoch": 2.0597333333333334, + "grad_norm": 0.46237748365897946, + "learning_rate": 2.7063049734712116e-06, + "loss": 0.0222, + "step": 17380 + }, + { + "epoch": 2.0609185185185184, + "grad_norm": 0.39759154277465686, + "learning_rate": 2.700180546840382e-06, + "loss": 0.0207, + "step": 17390 + }, + { + "epoch": 2.0621037037037038, + "grad_norm": 0.6183978214522016, + "learning_rate": 2.6940604935141324e-06, + "loss": 0.0203, + "step": 17400 + }, + { + "epoch": 2.0632888888888887, + "grad_norm": 0.517093237139377, + "learning_rate": 2.6879448251302677e-06, + "loss": 0.0203, + "step": 17410 + }, + { + "epoch": 2.064474074074074, + "grad_norm": 0.5622185910580356, + "learning_rate": 2.6818335533182573e-06, + "loss": 0.0206, + "step": 17420 + }, + { + "epoch": 2.065659259259259, + "grad_norm": 0.5225085519598541, + "learning_rate": 2.6757266896992094e-06, + "loss": 0.0207, + "step": 17430 + }, + { + "epoch": 2.0668444444444445, + "grad_norm": 0.5220970224663188, + "learning_rate": 2.669624245885854e-06, + "loss": 0.0223, + "step": 17440 + }, + { + "epoch": 2.0680296296296294, + "grad_norm": 0.48375196632405615, + "learning_rate": 2.6635262334825095e-06, + "loss": 0.0216, + "step": 17450 + }, + { + "epoch": 2.069214814814815, + "grad_norm": 0.5559496639426671, + "learning_rate": 2.6574326640850744e-06, + "loss": 0.0238, + "step": 17460 + }, + { + "epoch": 2.0704, + "grad_norm": 0.5681141804682247, + "learning_rate": 2.6513435492809924e-06, + "loss": 0.0198, + "step": 17470 + }, + { + "epoch": 2.071585185185185, + "grad_norm": 0.43538899827280736, + "learning_rate": 2.6452589006492426e-06, + "loss": 0.02, + "step": 17480 + }, + { + "epoch": 2.07277037037037, + "grad_norm": 0.580886786888115, + "learning_rate": 2.639178729760306e-06, + "loss": 0.0224, + "step": 17490 + }, + { + "epoch": 2.0739555555555556, + "grad_norm": 0.4613400997382292, + "learning_rate": 2.6331030481761505e-06, + "loss": 0.022, + "step": 17500 + }, + { + "epoch": 2.075140740740741, + "grad_norm": 0.5427349090105272, + "learning_rate": 2.627031867450206e-06, + "loss": 0.0235, + "step": 17510 + }, + { + "epoch": 2.076325925925926, + "grad_norm": 0.55239594231661, + "learning_rate": 2.6209651991273476e-06, + "loss": 0.0223, + "step": 17520 + }, + { + "epoch": 2.0775111111111113, + "grad_norm": 0.45400213551288154, + "learning_rate": 2.6149030547438648e-06, + "loss": 0.0223, + "step": 17530 + }, + { + "epoch": 2.0786962962962963, + "grad_norm": 0.47364076124826127, + "learning_rate": 2.6088454458274503e-06, + "loss": 0.021, + "step": 17540 + }, + { + "epoch": 2.0798814814814817, + "grad_norm": 0.6660565862409713, + "learning_rate": 2.602792383897164e-06, + "loss": 0.0189, + "step": 17550 + }, + { + "epoch": 2.0810666666666666, + "grad_norm": 0.510240814833039, + "learning_rate": 2.596743880463429e-06, + "loss": 0.0202, + "step": 17560 + }, + { + "epoch": 2.082251851851852, + "grad_norm": 0.5914970747154663, + "learning_rate": 2.5906999470279927e-06, + "loss": 0.0217, + "step": 17570 + }, + { + "epoch": 2.083437037037037, + "grad_norm": 0.6248497425366787, + "learning_rate": 2.5846605950839133e-06, + "loss": 0.0205, + "step": 17580 + }, + { + "epoch": 2.0846222222222224, + "grad_norm": 0.6278589590491476, + "learning_rate": 2.578625836115538e-06, + "loss": 0.0196, + "step": 17590 + }, + { + "epoch": 2.0858074074074073, + "grad_norm": 0.41741915572365507, + "learning_rate": 2.572595681598483e-06, + "loss": 0.0201, + "step": 17600 + }, + { + "epoch": 2.0869925925925927, + "grad_norm": 0.5656720612902089, + "learning_rate": 2.5665701429996027e-06, + "loss": 0.0217, + "step": 17610 + }, + { + "epoch": 2.0881777777777777, + "grad_norm": 0.6889995847332994, + "learning_rate": 2.560549231776981e-06, + "loss": 0.0216, + "step": 17620 + }, + { + "epoch": 2.089362962962963, + "grad_norm": 0.4923895380185985, + "learning_rate": 2.5545329593798973e-06, + "loss": 0.0207, + "step": 17630 + }, + { + "epoch": 2.090548148148148, + "grad_norm": 0.6384375039735288, + "learning_rate": 2.5485213372488075e-06, + "loss": 0.0219, + "step": 17640 + }, + { + "epoch": 2.0917333333333334, + "grad_norm": 0.5602545664645805, + "learning_rate": 2.542514376815334e-06, + "loss": 0.0224, + "step": 17650 + }, + { + "epoch": 2.0929185185185184, + "grad_norm": 0.5108298103386538, + "learning_rate": 2.536512089502226e-06, + "loss": 0.0202, + "step": 17660 + }, + { + "epoch": 2.094103703703704, + "grad_norm": 0.6829735094554921, + "learning_rate": 2.530514486723348e-06, + "loss": 0.0233, + "step": 17670 + }, + { + "epoch": 2.0952888888888888, + "grad_norm": 0.5213175703053962, + "learning_rate": 2.524521579883659e-06, + "loss": 0.0216, + "step": 17680 + }, + { + "epoch": 2.096474074074074, + "grad_norm": 0.49732457335620184, + "learning_rate": 2.5185333803791896e-06, + "loss": 0.0205, + "step": 17690 + }, + { + "epoch": 2.097659259259259, + "grad_norm": 0.5454009761276458, + "learning_rate": 2.512549899597014e-06, + "loss": 0.0213, + "step": 17700 + }, + { + "epoch": 2.0988444444444445, + "grad_norm": 0.554626102113036, + "learning_rate": 2.5065711489152363e-06, + "loss": 0.021, + "step": 17710 + }, + { + "epoch": 2.1000296296296295, + "grad_norm": 0.4848245125944637, + "learning_rate": 2.5005971397029625e-06, + "loss": 0.0224, + "step": 17720 + }, + { + "epoch": 2.101214814814815, + "grad_norm": 0.6457509046815929, + "learning_rate": 2.4946278833202897e-06, + "loss": 0.0195, + "step": 17730 + }, + { + "epoch": 2.1024, + "grad_norm": 0.42380878103800584, + "learning_rate": 2.488663391118271e-06, + "loss": 0.0188, + "step": 17740 + }, + { + "epoch": 2.1035851851851852, + "grad_norm": 0.4105428902709983, + "learning_rate": 2.4827036744389007e-06, + "loss": 0.0202, + "step": 17750 + }, + { + "epoch": 2.10477037037037, + "grad_norm": 0.43751033770369263, + "learning_rate": 2.4767487446150896e-06, + "loss": 0.0196, + "step": 17760 + }, + { + "epoch": 2.1059555555555556, + "grad_norm": 0.5666900006161766, + "learning_rate": 2.4707986129706563e-06, + "loss": 0.0205, + "step": 17770 + }, + { + "epoch": 2.1071407407407405, + "grad_norm": 0.6226424044192759, + "learning_rate": 2.464853290820284e-06, + "loss": 0.0212, + "step": 17780 + }, + { + "epoch": 2.108325925925926, + "grad_norm": 0.5324995744190012, + "learning_rate": 2.458912789469516e-06, + "loss": 0.0203, + "step": 17790 + }, + { + "epoch": 2.109511111111111, + "grad_norm": 0.6628981567148068, + "learning_rate": 2.452977120214723e-06, + "loss": 0.0203, + "step": 17800 + }, + { + "epoch": 2.1106962962962963, + "grad_norm": 0.6104277124955798, + "learning_rate": 2.4470462943430954e-06, + "loss": 0.0195, + "step": 17810 + }, + { + "epoch": 2.1118814814814817, + "grad_norm": 0.45996499379702915, + "learning_rate": 2.4411203231326076e-06, + "loss": 0.0234, + "step": 17820 + }, + { + "epoch": 2.1130666666666666, + "grad_norm": 0.5682831803009347, + "learning_rate": 2.4351992178520025e-06, + "loss": 0.0202, + "step": 17830 + }, + { + "epoch": 2.114251851851852, + "grad_norm": 0.5364567179377799, + "learning_rate": 2.42928298976077e-06, + "loss": 0.0197, + "step": 17840 + }, + { + "epoch": 2.115437037037037, + "grad_norm": 0.5352850842233184, + "learning_rate": 2.4233716501091294e-06, + "loss": 0.0212, + "step": 17850 + }, + { + "epoch": 2.1166222222222224, + "grad_norm": 0.5781701329699696, + "learning_rate": 2.417465210138002e-06, + "loss": 0.0195, + "step": 17860 + }, + { + "epoch": 2.1178074074074074, + "grad_norm": 0.5151015814679271, + "learning_rate": 2.411563681078991e-06, + "loss": 0.0185, + "step": 17870 + }, + { + "epoch": 2.1189925925925928, + "grad_norm": 0.5031295769562851, + "learning_rate": 2.4056670741543598e-06, + "loss": 0.021, + "step": 17880 + }, + { + "epoch": 2.1201777777777777, + "grad_norm": 0.5490496818347299, + "learning_rate": 2.3997754005770175e-06, + "loss": 0.0189, + "step": 17890 + }, + { + "epoch": 2.121362962962963, + "grad_norm": 0.562956163704708, + "learning_rate": 2.3938886715504856e-06, + "loss": 0.0191, + "step": 17900 + }, + { + "epoch": 2.122548148148148, + "grad_norm": 0.5580659425582958, + "learning_rate": 2.388006898268887e-06, + "loss": 0.0198, + "step": 17910 + }, + { + "epoch": 2.1237333333333335, + "grad_norm": 0.49905480307253824, + "learning_rate": 2.382130091916917e-06, + "loss": 0.0209, + "step": 17920 + }, + { + "epoch": 2.1249185185185184, + "grad_norm": 0.5349599214785156, + "learning_rate": 2.376258263669831e-06, + "loss": 0.0206, + "step": 17930 + }, + { + "epoch": 2.126103703703704, + "grad_norm": 0.48181259176056973, + "learning_rate": 2.370391424693417e-06, + "loss": 0.0203, + "step": 17940 + }, + { + "epoch": 2.127288888888889, + "grad_norm": 0.703787396056072, + "learning_rate": 2.364529586143973e-06, + "loss": 0.0223, + "step": 17950 + }, + { + "epoch": 2.128474074074074, + "grad_norm": 0.4809162493123757, + "learning_rate": 2.3586727591682867e-06, + "loss": 0.0206, + "step": 17960 + }, + { + "epoch": 2.129659259259259, + "grad_norm": 0.5506637606674155, + "learning_rate": 2.352820954903623e-06, + "loss": 0.0172, + "step": 17970 + }, + { + "epoch": 2.1308444444444445, + "grad_norm": 0.39490121962813945, + "learning_rate": 2.346974184477689e-06, + "loss": 0.0208, + "step": 17980 + }, + { + "epoch": 2.1320296296296295, + "grad_norm": 0.5460609055203705, + "learning_rate": 2.3411324590086194e-06, + "loss": 0.0193, + "step": 17990 + }, + { + "epoch": 2.133214814814815, + "grad_norm": 0.7225464713084386, + "learning_rate": 2.3352957896049626e-06, + "loss": 0.0218, + "step": 18000 + }, + { + "epoch": 2.1344, + "grad_norm": 0.507862371964674, + "learning_rate": 2.329464187365643e-06, + "loss": 0.0217, + "step": 18010 + }, + { + "epoch": 2.1355851851851853, + "grad_norm": 0.5153060922390552, + "learning_rate": 2.3236376633799582e-06, + "loss": 0.0219, + "step": 18020 + }, + { + "epoch": 2.13677037037037, + "grad_norm": 0.6253823099089397, + "learning_rate": 2.317816228727543e-06, + "loss": 0.0193, + "step": 18030 + }, + { + "epoch": 2.1379555555555556, + "grad_norm": 0.5404761498282122, + "learning_rate": 2.3119998944783562e-06, + "loss": 0.0183, + "step": 18040 + }, + { + "epoch": 2.1391407407407406, + "grad_norm": 0.4335318956345966, + "learning_rate": 2.3061886716926562e-06, + "loss": 0.0201, + "step": 18050 + }, + { + "epoch": 2.140325925925926, + "grad_norm": 0.5610753444629021, + "learning_rate": 2.3003825714209873e-06, + "loss": 0.0212, + "step": 18060 + }, + { + "epoch": 2.141511111111111, + "grad_norm": 0.6260871102303472, + "learning_rate": 2.2945816047041438e-06, + "loss": 0.0197, + "step": 18070 + }, + { + "epoch": 2.1426962962962963, + "grad_norm": 0.5835361641829071, + "learning_rate": 2.2887857825731676e-06, + "loss": 0.0222, + "step": 18080 + }, + { + "epoch": 2.1438814814814813, + "grad_norm": 0.526656348906993, + "learning_rate": 2.2829951160493092e-06, + "loss": 0.0227, + "step": 18090 + }, + { + "epoch": 2.1450666666666667, + "grad_norm": 0.5026505854718598, + "learning_rate": 2.277209616144023e-06, + "loss": 0.0181, + "step": 18100 + }, + { + "epoch": 2.1462518518518516, + "grad_norm": 0.6648280571177739, + "learning_rate": 2.2714292938589327e-06, + "loss": 0.0212, + "step": 18110 + }, + { + "epoch": 2.147437037037037, + "grad_norm": 0.4827403542009834, + "learning_rate": 2.2656541601858195e-06, + "loss": 0.0186, + "step": 18120 + }, + { + "epoch": 2.1486222222222224, + "grad_norm": 0.4104864640118743, + "learning_rate": 2.2598842261065943e-06, + "loss": 0.0204, + "step": 18130 + }, + { + "epoch": 2.1498074074074074, + "grad_norm": 0.531122536135586, + "learning_rate": 2.2541195025932877e-06, + "loss": 0.0198, + "step": 18140 + }, + { + "epoch": 2.150992592592593, + "grad_norm": 0.5562995099275271, + "learning_rate": 2.2483600006080126e-06, + "loss": 0.0196, + "step": 18150 + }, + { + "epoch": 2.1521777777777777, + "grad_norm": 0.42433929016876354, + "learning_rate": 2.242605731102962e-06, + "loss": 0.02, + "step": 18160 + }, + { + "epoch": 2.153362962962963, + "grad_norm": 0.5370964002455985, + "learning_rate": 2.236856705020371e-06, + "loss": 0.0204, + "step": 18170 + }, + { + "epoch": 2.154548148148148, + "grad_norm": 0.4995445837538678, + "learning_rate": 2.231112933292511e-06, + "loss": 0.019, + "step": 18180 + }, + { + "epoch": 2.1557333333333335, + "grad_norm": 0.7250541553433348, + "learning_rate": 2.2253744268416557e-06, + "loss": 0.0227, + "step": 18190 + }, + { + "epoch": 2.1569185185185185, + "grad_norm": 0.5341142665020637, + "learning_rate": 2.219641196580069e-06, + "loss": 0.0219, + "step": 18200 + }, + { + "epoch": 2.158103703703704, + "grad_norm": 0.5540512775000475, + "learning_rate": 2.2139132534099807e-06, + "loss": 0.0197, + "step": 18210 + }, + { + "epoch": 2.159288888888889, + "grad_norm": 0.5720923139855493, + "learning_rate": 2.208190608223568e-06, + "loss": 0.0207, + "step": 18220 + }, + { + "epoch": 2.160474074074074, + "grad_norm": 0.5573024949298367, + "learning_rate": 2.202473271902936e-06, + "loss": 0.0199, + "step": 18230 + }, + { + "epoch": 2.161659259259259, + "grad_norm": 0.46947015664990394, + "learning_rate": 2.19676125532009e-06, + "loss": 0.0202, + "step": 18240 + }, + { + "epoch": 2.1628444444444446, + "grad_norm": 0.5848449267707536, + "learning_rate": 2.19105456933692e-06, + "loss": 0.0221, + "step": 18250 + }, + { + "epoch": 2.1640296296296295, + "grad_norm": 0.7057389535726815, + "learning_rate": 2.1853532248051794e-06, + "loss": 0.0219, + "step": 18260 + }, + { + "epoch": 2.165214814814815, + "grad_norm": 0.6746444885619608, + "learning_rate": 2.179657232566469e-06, + "loss": 0.0209, + "step": 18270 + }, + { + "epoch": 2.1664, + "grad_norm": 0.5431912367483681, + "learning_rate": 2.1739666034522054e-06, + "loss": 0.02, + "step": 18280 + }, + { + "epoch": 2.1675851851851853, + "grad_norm": 0.4652152296806473, + "learning_rate": 2.1682813482836092e-06, + "loss": 0.0198, + "step": 18290 + }, + { + "epoch": 2.1687703703703702, + "grad_norm": 0.6062192440874989, + "learning_rate": 2.162601477871683e-06, + "loss": 0.021, + "step": 18300 + }, + { + "epoch": 2.1699555555555556, + "grad_norm": 0.6923259295522756, + "learning_rate": 2.1569270030171912e-06, + "loss": 0.018, + "step": 18310 + }, + { + "epoch": 2.1711407407407406, + "grad_norm": 0.383772317909671, + "learning_rate": 2.151257934510634e-06, + "loss": 0.0199, + "step": 18320 + }, + { + "epoch": 2.172325925925926, + "grad_norm": 0.533245204040971, + "learning_rate": 2.1455942831322337e-06, + "loss": 0.0201, + "step": 18330 + }, + { + "epoch": 2.173511111111111, + "grad_norm": 0.5702816356505999, + "learning_rate": 2.139936059651908e-06, + "loss": 0.0202, + "step": 18340 + }, + { + "epoch": 2.1746962962962963, + "grad_norm": 0.41103437915339863, + "learning_rate": 2.13428327482926e-06, + "loss": 0.02, + "step": 18350 + }, + { + "epoch": 2.1758814814814813, + "grad_norm": 0.47711776383543486, + "learning_rate": 2.128635939413544e-06, + "loss": 0.0204, + "step": 18360 + }, + { + "epoch": 2.1770666666666667, + "grad_norm": 0.6101847075721979, + "learning_rate": 2.1229940641436525e-06, + "loss": 0.0208, + "step": 18370 + }, + { + "epoch": 2.1782518518518517, + "grad_norm": 0.4549123352331804, + "learning_rate": 2.117357659748099e-06, + "loss": 0.0181, + "step": 18380 + }, + { + "epoch": 2.179437037037037, + "grad_norm": 0.3284452989493554, + "learning_rate": 2.111726736944994e-06, + "loss": 0.0189, + "step": 18390 + }, + { + "epoch": 2.180622222222222, + "grad_norm": 0.5823265552863844, + "learning_rate": 2.106101306442018e-06, + "loss": 0.0206, + "step": 18400 + }, + { + "epoch": 2.1818074074074074, + "grad_norm": 0.6740901089629598, + "learning_rate": 2.1004813789364128e-06, + "loss": 0.0204, + "step": 18410 + }, + { + "epoch": 2.1829925925925924, + "grad_norm": 0.5418554750031419, + "learning_rate": 2.0948669651149512e-06, + "loss": 0.0193, + "step": 18420 + }, + { + "epoch": 2.1841777777777778, + "grad_norm": 0.4896848300833659, + "learning_rate": 2.089258075653928e-06, + "loss": 0.0197, + "step": 18430 + }, + { + "epoch": 2.185362962962963, + "grad_norm": 0.5520432689675453, + "learning_rate": 2.0836547212191283e-06, + "loss": 0.0186, + "step": 18440 + }, + { + "epoch": 2.186548148148148, + "grad_norm": 0.6224336923265456, + "learning_rate": 2.0780569124658114e-06, + "loss": 0.0209, + "step": 18450 + }, + { + "epoch": 2.1877333333333335, + "grad_norm": 0.6011837329328497, + "learning_rate": 2.0724646600386893e-06, + "loss": 0.0217, + "step": 18460 + }, + { + "epoch": 2.1889185185185185, + "grad_norm": 0.7273667529581211, + "learning_rate": 2.0668779745719188e-06, + "loss": 0.0206, + "step": 18470 + }, + { + "epoch": 2.190103703703704, + "grad_norm": 0.7029829492058258, + "learning_rate": 2.0612968666890583e-06, + "loss": 0.0204, + "step": 18480 + }, + { + "epoch": 2.191288888888889, + "grad_norm": 0.7805906621478219, + "learning_rate": 2.055721347003065e-06, + "loss": 0.0197, + "step": 18490 + }, + { + "epoch": 2.1924740740740742, + "grad_norm": 0.6074290614751168, + "learning_rate": 2.0501514261162685e-06, + "loss": 0.0207, + "step": 18500 + }, + { + "epoch": 2.193659259259259, + "grad_norm": 0.4782578294360636, + "learning_rate": 2.0445871146203554e-06, + "loss": 0.0178, + "step": 18510 + }, + { + "epoch": 2.1948444444444446, + "grad_norm": 0.8116631849011151, + "learning_rate": 2.0390284230963413e-06, + "loss": 0.0197, + "step": 18520 + }, + { + "epoch": 2.1960296296296296, + "grad_norm": 0.4698401755295345, + "learning_rate": 2.0334753621145547e-06, + "loss": 0.0198, + "step": 18530 + }, + { + "epoch": 2.197214814814815, + "grad_norm": 0.5394019591374424, + "learning_rate": 2.0279279422346214e-06, + "loss": 0.018, + "step": 18540 + }, + { + "epoch": 2.1984, + "grad_norm": 0.6298320623550064, + "learning_rate": 2.0223861740054358e-06, + "loss": 0.02, + "step": 18550 + }, + { + "epoch": 2.1995851851851853, + "grad_norm": 0.5557470409692559, + "learning_rate": 2.016850067965149e-06, + "loss": 0.0236, + "step": 18560 + }, + { + "epoch": 2.2007703703703703, + "grad_norm": 0.47740034710081647, + "learning_rate": 2.0113196346411425e-06, + "loss": 0.0202, + "step": 18570 + }, + { + "epoch": 2.2019555555555557, + "grad_norm": 0.5071439018196295, + "learning_rate": 2.0057948845500086e-06, + "loss": 0.0197, + "step": 18580 + }, + { + "epoch": 2.2031407407407406, + "grad_norm": 0.5926803996636547, + "learning_rate": 2.0002758281975384e-06, + "loss": 0.0209, + "step": 18590 + }, + { + "epoch": 2.204325925925926, + "grad_norm": 0.5431350843715361, + "learning_rate": 1.994762476078691e-06, + "loss": 0.0185, + "step": 18600 + }, + { + "epoch": 2.205511111111111, + "grad_norm": 0.7418902778752838, + "learning_rate": 1.9892548386775777e-06, + "loss": 0.0222, + "step": 18610 + }, + { + "epoch": 2.2066962962962964, + "grad_norm": 0.538031244062795, + "learning_rate": 1.983752926467449e-06, + "loss": 0.0186, + "step": 18620 + }, + { + "epoch": 2.2078814814814813, + "grad_norm": 0.4917170815245571, + "learning_rate": 1.9782567499106607e-06, + "loss": 0.0187, + "step": 18630 + }, + { + "epoch": 2.2090666666666667, + "grad_norm": 0.5831414690924662, + "learning_rate": 1.9727663194586686e-06, + "loss": 0.0196, + "step": 18640 + }, + { + "epoch": 2.2102518518518517, + "grad_norm": 0.5300137465156955, + "learning_rate": 1.967281645551998e-06, + "loss": 0.0206, + "step": 18650 + }, + { + "epoch": 2.211437037037037, + "grad_norm": 0.4390825423197908, + "learning_rate": 1.9618027386202267e-06, + "loss": 0.0196, + "step": 18660 + }, + { + "epoch": 2.212622222222222, + "grad_norm": 0.6589359054812434, + "learning_rate": 1.9563296090819665e-06, + "loss": 0.0174, + "step": 18670 + }, + { + "epoch": 2.2138074074074074, + "grad_norm": 0.6486500159314164, + "learning_rate": 1.950862267344848e-06, + "loss": 0.0186, + "step": 18680 + }, + { + "epoch": 2.2149925925925924, + "grad_norm": 0.5219787693622737, + "learning_rate": 1.9454007238054883e-06, + "loss": 0.0195, + "step": 18690 + }, + { + "epoch": 2.216177777777778, + "grad_norm": 0.4806237457949874, + "learning_rate": 1.9399449888494855e-06, + "loss": 0.0199, + "step": 18700 + }, + { + "epoch": 2.2173629629629628, + "grad_norm": 0.5748413766392373, + "learning_rate": 1.934495072851386e-06, + "loss": 0.0182, + "step": 18710 + }, + { + "epoch": 2.218548148148148, + "grad_norm": 0.4851993483355934, + "learning_rate": 1.9290509861746774e-06, + "loss": 0.0203, + "step": 18720 + }, + { + "epoch": 2.219733333333333, + "grad_norm": 0.4144299966291897, + "learning_rate": 1.923612739171757e-06, + "loss": 0.0197, + "step": 18730 + }, + { + "epoch": 2.2209185185185185, + "grad_norm": 0.48060856122098367, + "learning_rate": 1.9181803421839194e-06, + "loss": 0.0186, + "step": 18740 + }, + { + "epoch": 2.222103703703704, + "grad_norm": 0.5062301767390581, + "learning_rate": 1.9127538055413334e-06, + "loss": 0.0187, + "step": 18750 + }, + { + "epoch": 2.223288888888889, + "grad_norm": 0.4595103341480192, + "learning_rate": 1.9073331395630274e-06, + "loss": 0.0228, + "step": 18760 + }, + { + "epoch": 2.2244740740740743, + "grad_norm": 0.6514335085910568, + "learning_rate": 1.9019183545568653e-06, + "loss": 0.0214, + "step": 18770 + }, + { + "epoch": 2.2256592592592592, + "grad_norm": 0.6109574387758372, + "learning_rate": 1.8965094608195251e-06, + "loss": 0.0205, + "step": 18780 + }, + { + "epoch": 2.2268444444444446, + "grad_norm": 0.5820956836800809, + "learning_rate": 1.891106468636482e-06, + "loss": 0.02, + "step": 18790 + }, + { + "epoch": 2.2280296296296296, + "grad_norm": 0.5413395837266808, + "learning_rate": 1.8857093882819944e-06, + "loss": 0.0211, + "step": 18800 + }, + { + "epoch": 2.229214814814815, + "grad_norm": 0.4719920671745342, + "learning_rate": 1.8803182300190725e-06, + "loss": 0.0187, + "step": 18810 + }, + { + "epoch": 2.2304, + "grad_norm": 0.5768584490371501, + "learning_rate": 1.8749330040994678e-06, + "loss": 0.0217, + "step": 18820 + }, + { + "epoch": 2.2315851851851853, + "grad_norm": 0.8473451199821922, + "learning_rate": 1.8695537207636494e-06, + "loss": 0.0192, + "step": 18830 + }, + { + "epoch": 2.2327703703703703, + "grad_norm": 0.44361653478048213, + "learning_rate": 1.864180390240789e-06, + "loss": 0.0192, + "step": 18840 + }, + { + "epoch": 2.2339555555555557, + "grad_norm": 0.40599631041569073, + "learning_rate": 1.8588130227487383e-06, + "loss": 0.0182, + "step": 18850 + }, + { + "epoch": 2.2351407407407407, + "grad_norm": 0.6909213136326204, + "learning_rate": 1.8534516284940074e-06, + "loss": 0.0197, + "step": 18860 + }, + { + "epoch": 2.236325925925926, + "grad_norm": 0.3922244118276811, + "learning_rate": 1.8480962176717482e-06, + "loss": 0.0183, + "step": 18870 + }, + { + "epoch": 2.237511111111111, + "grad_norm": 0.5463617420415016, + "learning_rate": 1.8427468004657333e-06, + "loss": 0.0185, + "step": 18880 + }, + { + "epoch": 2.2386962962962964, + "grad_norm": 0.5461376183609467, + "learning_rate": 1.8374033870483443e-06, + "loss": 0.0182, + "step": 18890 + }, + { + "epoch": 2.2398814814814814, + "grad_norm": 0.5604828248991386, + "learning_rate": 1.8320659875805392e-06, + "loss": 0.0205, + "step": 18900 + }, + { + "epoch": 2.2410666666666668, + "grad_norm": 0.46302276044294616, + "learning_rate": 1.8267346122118402e-06, + "loss": 0.0182, + "step": 18910 + }, + { + "epoch": 2.2422518518518517, + "grad_norm": 0.5732434540872019, + "learning_rate": 1.8214092710803183e-06, + "loss": 0.0181, + "step": 18920 + }, + { + "epoch": 2.243437037037037, + "grad_norm": 0.6068526861550686, + "learning_rate": 1.8160899743125699e-06, + "loss": 0.0211, + "step": 18930 + }, + { + "epoch": 2.244622222222222, + "grad_norm": 0.5417544123354159, + "learning_rate": 1.8107767320236936e-06, + "loss": 0.0198, + "step": 18940 + }, + { + "epoch": 2.2458074074074075, + "grad_norm": 0.5191164732118095, + "learning_rate": 1.8054695543172763e-06, + "loss": 0.0178, + "step": 18950 + }, + { + "epoch": 2.2469925925925924, + "grad_norm": 0.5061386324089637, + "learning_rate": 1.8001684512853723e-06, + "loss": 0.0204, + "step": 18960 + }, + { + "epoch": 2.248177777777778, + "grad_norm": 0.6364838745026915, + "learning_rate": 1.7948734330084882e-06, + "loss": 0.0182, + "step": 18970 + }, + { + "epoch": 2.249362962962963, + "grad_norm": 0.6514254175406258, + "learning_rate": 1.7895845095555547e-06, + "loss": 0.0199, + "step": 18980 + }, + { + "epoch": 2.250548148148148, + "grad_norm": 0.34277694503495626, + "learning_rate": 1.7843016909839135e-06, + "loss": 0.0199, + "step": 18990 + }, + { + "epoch": 2.251733333333333, + "grad_norm": 0.4903627282589065, + "learning_rate": 1.7790249873393006e-06, + "loss": 0.021, + "step": 19000 + }, + { + "epoch": 2.2529185185185185, + "grad_norm": 0.6614042165954529, + "learning_rate": 1.7737544086558234e-06, + "loss": 0.0192, + "step": 19010 + }, + { + "epoch": 2.254103703703704, + "grad_norm": 0.5624150614018596, + "learning_rate": 1.768489964955939e-06, + "loss": 0.0187, + "step": 19020 + }, + { + "epoch": 2.255288888888889, + "grad_norm": 0.46672634499401694, + "learning_rate": 1.7632316662504401e-06, + "loss": 0.0198, + "step": 19030 + }, + { + "epoch": 2.256474074074074, + "grad_norm": 0.4534239884903897, + "learning_rate": 1.7579795225384328e-06, + "loss": 0.0202, + "step": 19040 + }, + { + "epoch": 2.2576592592592593, + "grad_norm": 0.5612354919044851, + "learning_rate": 1.7527335438073234e-06, + "loss": 0.0201, + "step": 19050 + }, + { + "epoch": 2.2588444444444447, + "grad_norm": 0.5078202278505786, + "learning_rate": 1.74749374003279e-06, + "loss": 0.0207, + "step": 19060 + }, + { + "epoch": 2.2600296296296296, + "grad_norm": 0.48327683668384314, + "learning_rate": 1.7422601211787687e-06, + "loss": 0.0186, + "step": 19070 + }, + { + "epoch": 2.2612148148148146, + "grad_norm": 0.6444750291437481, + "learning_rate": 1.7370326971974383e-06, + "loss": 0.0204, + "step": 19080 + }, + { + "epoch": 2.2624, + "grad_norm": 0.7859311222785264, + "learning_rate": 1.7318114780291966e-06, + "loss": 0.0195, + "step": 19090 + }, + { + "epoch": 2.2635851851851854, + "grad_norm": 0.6199362599944687, + "learning_rate": 1.726596473602639e-06, + "loss": 0.0202, + "step": 19100 + }, + { + "epoch": 2.2647703703703703, + "grad_norm": 0.45617328690617087, + "learning_rate": 1.7213876938345459e-06, + "loss": 0.0193, + "step": 19110 + }, + { + "epoch": 2.2659555555555557, + "grad_norm": 0.7439906679979321, + "learning_rate": 1.7161851486298576e-06, + "loss": 0.0204, + "step": 19120 + }, + { + "epoch": 2.2671407407407407, + "grad_norm": 0.4159851361012525, + "learning_rate": 1.7109888478816655e-06, + "loss": 0.0206, + "step": 19130 + }, + { + "epoch": 2.268325925925926, + "grad_norm": 0.5041915086825026, + "learning_rate": 1.7057988014711812e-06, + "loss": 0.0182, + "step": 19140 + }, + { + "epoch": 2.269511111111111, + "grad_norm": 0.5866946063645181, + "learning_rate": 1.7006150192677224e-06, + "loss": 0.0184, + "step": 19150 + }, + { + "epoch": 2.2706962962962964, + "grad_norm": 0.5343813177009338, + "learning_rate": 1.6954375111286998e-06, + "loss": 0.0199, + "step": 19160 + }, + { + "epoch": 2.2718814814814814, + "grad_norm": 0.450962320628448, + "learning_rate": 1.6902662868995884e-06, + "loss": 0.0178, + "step": 19170 + }, + { + "epoch": 2.273066666666667, + "grad_norm": 0.531722771994754, + "learning_rate": 1.6851013564139185e-06, + "loss": 0.0193, + "step": 19180 + }, + { + "epoch": 2.2742518518518517, + "grad_norm": 0.5961588257650573, + "learning_rate": 1.6799427294932486e-06, + "loss": 0.0184, + "step": 19190 + }, + { + "epoch": 2.275437037037037, + "grad_norm": 0.56687367750425, + "learning_rate": 1.67479041594715e-06, + "loss": 0.0204, + "step": 19200 + }, + { + "epoch": 2.276622222222222, + "grad_norm": 0.42683201091368494, + "learning_rate": 1.6696444255731935e-06, + "loss": 0.018, + "step": 19210 + }, + { + "epoch": 2.2778074074074075, + "grad_norm": 0.5748915577969967, + "learning_rate": 1.6645047681569203e-06, + "loss": 0.0187, + "step": 19220 + }, + { + "epoch": 2.2789925925925925, + "grad_norm": 0.5817547915169919, + "learning_rate": 1.6593714534718309e-06, + "loss": 0.0206, + "step": 19230 + }, + { + "epoch": 2.280177777777778, + "grad_norm": 0.5542328170124374, + "learning_rate": 1.654244491279367e-06, + "loss": 0.0182, + "step": 19240 + }, + { + "epoch": 2.281362962962963, + "grad_norm": 0.6450951716958563, + "learning_rate": 1.6491238913288855e-06, + "loss": 0.0204, + "step": 19250 + }, + { + "epoch": 2.282548148148148, + "grad_norm": 0.49517173325562236, + "learning_rate": 1.6440096633576508e-06, + "loss": 0.0183, + "step": 19260 + }, + { + "epoch": 2.283733333333333, + "grad_norm": 0.5335860006965933, + "learning_rate": 1.6389018170908066e-06, + "loss": 0.0183, + "step": 19270 + }, + { + "epoch": 2.2849185185185186, + "grad_norm": 0.5659322809392169, + "learning_rate": 1.6338003622413612e-06, + "loss": 0.0193, + "step": 19280 + }, + { + "epoch": 2.2861037037037035, + "grad_norm": 0.618359230890751, + "learning_rate": 1.6287053085101683e-06, + "loss": 0.0182, + "step": 19290 + }, + { + "epoch": 2.287288888888889, + "grad_norm": 0.4467925544095055, + "learning_rate": 1.6236166655859137e-06, + "loss": 0.0184, + "step": 19300 + }, + { + "epoch": 2.288474074074074, + "grad_norm": 0.46540818654384486, + "learning_rate": 1.61853444314509e-06, + "loss": 0.0186, + "step": 19310 + }, + { + "epoch": 2.2896592592592593, + "grad_norm": 0.49748673558436396, + "learning_rate": 1.61345865085198e-06, + "loss": 0.0185, + "step": 19320 + }, + { + "epoch": 2.2908444444444447, + "grad_norm": 0.5120088700070919, + "learning_rate": 1.6083892983586368e-06, + "loss": 0.0192, + "step": 19330 + }, + { + "epoch": 2.2920296296296296, + "grad_norm": 0.5187240607761581, + "learning_rate": 1.6033263953048744e-06, + "loss": 0.021, + "step": 19340 + }, + { + "epoch": 2.2932148148148146, + "grad_norm": 0.5964695573469528, + "learning_rate": 1.598269951318237e-06, + "loss": 0.018, + "step": 19350 + }, + { + "epoch": 2.2944, + "grad_norm": 0.46062815626209913, + "learning_rate": 1.5932199760139871e-06, + "loss": 0.0182, + "step": 19360 + }, + { + "epoch": 2.2955851851851854, + "grad_norm": 0.5543293887546755, + "learning_rate": 1.5881764789950866e-06, + "loss": 0.02, + "step": 19370 + }, + { + "epoch": 2.2967703703703704, + "grad_norm": 0.7314322654316134, + "learning_rate": 1.5831394698521802e-06, + "loss": 0.0219, + "step": 19380 + }, + { + "epoch": 2.2979555555555553, + "grad_norm": 0.5000687215613968, + "learning_rate": 1.5781089581635761e-06, + "loss": 0.0188, + "step": 19390 + }, + { + "epoch": 2.2991407407407407, + "grad_norm": 0.5545642583791672, + "learning_rate": 1.573084953495223e-06, + "loss": 0.0208, + "step": 19400 + }, + { + "epoch": 2.300325925925926, + "grad_norm": 0.47756079154830733, + "learning_rate": 1.5680674654006967e-06, + "loss": 0.019, + "step": 19410 + }, + { + "epoch": 2.301511111111111, + "grad_norm": 0.4781451441295527, + "learning_rate": 1.5630565034211859e-06, + "loss": 0.0183, + "step": 19420 + }, + { + "epoch": 2.3026962962962965, + "grad_norm": 0.5016522216835004, + "learning_rate": 1.558052077085464e-06, + "loss": 0.0215, + "step": 19430 + }, + { + "epoch": 2.3038814814814814, + "grad_norm": 0.7425326239908316, + "learning_rate": 1.5530541959098787e-06, + "loss": 0.0184, + "step": 19440 + }, + { + "epoch": 2.305066666666667, + "grad_norm": 0.6834690391818072, + "learning_rate": 1.5480628693983297e-06, + "loss": 0.0199, + "step": 19450 + }, + { + "epoch": 2.3062518518518518, + "grad_norm": 0.583272257862348, + "learning_rate": 1.5430781070422546e-06, + "loss": 0.0199, + "step": 19460 + }, + { + "epoch": 2.307437037037037, + "grad_norm": 0.5233940029356275, + "learning_rate": 1.5380999183206097e-06, + "loss": 0.0204, + "step": 19470 + }, + { + "epoch": 2.308622222222222, + "grad_norm": 0.5448020340360619, + "learning_rate": 1.5331283126998487e-06, + "loss": 0.0199, + "step": 19480 + }, + { + "epoch": 2.3098074074074075, + "grad_norm": 0.5544604627719499, + "learning_rate": 1.5281632996339046e-06, + "loss": 0.02, + "step": 19490 + }, + { + "epoch": 2.3109925925925925, + "grad_norm": 0.4639834453316954, + "learning_rate": 1.5232048885641803e-06, + "loss": 0.02, + "step": 19500 + }, + { + "epoch": 2.312177777777778, + "grad_norm": 0.5715526445810782, + "learning_rate": 1.5182530889195201e-06, + "loss": 0.0198, + "step": 19510 + }, + { + "epoch": 2.313362962962963, + "grad_norm": 0.6745161045879382, + "learning_rate": 1.5133079101161973e-06, + "loss": 0.0169, + "step": 19520 + }, + { + "epoch": 2.3145481481481482, + "grad_norm": 0.6139144226970469, + "learning_rate": 1.508369361557892e-06, + "loss": 0.0192, + "step": 19530 + }, + { + "epoch": 2.315733333333333, + "grad_norm": 0.4963197356365394, + "learning_rate": 1.5034374526356825e-06, + "loss": 0.0178, + "step": 19540 + }, + { + "epoch": 2.3169185185185186, + "grad_norm": 0.49984660239649953, + "learning_rate": 1.4985121927280184e-06, + "loss": 0.0189, + "step": 19550 + }, + { + "epoch": 2.3181037037037036, + "grad_norm": 0.424350799088494, + "learning_rate": 1.4935935912007037e-06, + "loss": 0.0185, + "step": 19560 + }, + { + "epoch": 2.319288888888889, + "grad_norm": 0.5169508957159925, + "learning_rate": 1.4886816574068823e-06, + "loss": 0.0189, + "step": 19570 + }, + { + "epoch": 2.320474074074074, + "grad_norm": 0.4227018390009985, + "learning_rate": 1.4837764006870187e-06, + "loss": 0.0196, + "step": 19580 + }, + { + "epoch": 2.3216592592592593, + "grad_norm": 0.8490205401988372, + "learning_rate": 1.4788778303688822e-06, + "loss": 0.0199, + "step": 19590 + }, + { + "epoch": 2.3228444444444443, + "grad_norm": 0.5883017405852737, + "learning_rate": 1.4739859557675245e-06, + "loss": 0.0202, + "step": 19600 + }, + { + "epoch": 2.3240296296296297, + "grad_norm": 0.5231568361461372, + "learning_rate": 1.469100786185265e-06, + "loss": 0.0199, + "step": 19610 + }, + { + "epoch": 2.3252148148148146, + "grad_norm": 0.6094023317693572, + "learning_rate": 1.4642223309116753e-06, + "loss": 0.018, + "step": 19620 + }, + { + "epoch": 2.3264, + "grad_norm": 0.6763932563549391, + "learning_rate": 1.4593505992235602e-06, + "loss": 0.0201, + "step": 19630 + }, + { + "epoch": 2.3275851851851854, + "grad_norm": 0.5978441020272818, + "learning_rate": 1.454485600384934e-06, + "loss": 0.0189, + "step": 19640 + }, + { + "epoch": 2.3287703703703704, + "grad_norm": 0.48939865171789004, + "learning_rate": 1.4496273436470104e-06, + "loss": 0.0181, + "step": 19650 + }, + { + "epoch": 2.3299555555555553, + "grad_norm": 0.6865010857472132, + "learning_rate": 1.4447758382481825e-06, + "loss": 0.0222, + "step": 19660 + }, + { + "epoch": 2.3311407407407407, + "grad_norm": 0.44258967685648043, + "learning_rate": 1.439931093414007e-06, + "loss": 0.0178, + "step": 19670 + }, + { + "epoch": 2.332325925925926, + "grad_norm": 0.5330345150623879, + "learning_rate": 1.435093118357182e-06, + "loss": 0.0202, + "step": 19680 + }, + { + "epoch": 2.333511111111111, + "grad_norm": 0.308049161587547, + "learning_rate": 1.430261922277532e-06, + "loss": 0.0179, + "step": 19690 + }, + { + "epoch": 2.334696296296296, + "grad_norm": 0.5235514846875791, + "learning_rate": 1.4254375143619936e-06, + "loss": 0.019, + "step": 19700 + }, + { + "epoch": 2.3358814814814814, + "grad_norm": 0.6449493209413945, + "learning_rate": 1.4206199037845953e-06, + "loss": 0.0198, + "step": 19710 + }, + { + "epoch": 2.337066666666667, + "grad_norm": 0.531810202050383, + "learning_rate": 1.4158090997064356e-06, + "loss": 0.0194, + "step": 19720 + }, + { + "epoch": 2.338251851851852, + "grad_norm": 0.5200387615016707, + "learning_rate": 1.4110051112756734e-06, + "loss": 0.017, + "step": 19730 + }, + { + "epoch": 2.339437037037037, + "grad_norm": 0.5733044729616007, + "learning_rate": 1.4062079476275041e-06, + "loss": 0.0194, + "step": 19740 + }, + { + "epoch": 2.340622222222222, + "grad_norm": 0.5881944095852142, + "learning_rate": 1.4014176178841505e-06, + "loss": 0.0203, + "step": 19750 + }, + { + "epoch": 2.3418074074074076, + "grad_norm": 0.5204774438575377, + "learning_rate": 1.3966341311548348e-06, + "loss": 0.0196, + "step": 19760 + }, + { + "epoch": 2.3429925925925925, + "grad_norm": 0.5075643738321208, + "learning_rate": 1.3918574965357673e-06, + "loss": 0.0182, + "step": 19770 + }, + { + "epoch": 2.344177777777778, + "grad_norm": 0.6542323545731978, + "learning_rate": 1.3870877231101326e-06, + "loss": 0.0162, + "step": 19780 + }, + { + "epoch": 2.345362962962963, + "grad_norm": 0.6855244280424457, + "learning_rate": 1.3823248199480632e-06, + "loss": 0.0174, + "step": 19790 + }, + { + "epoch": 2.3465481481481483, + "grad_norm": 0.5520478393210825, + "learning_rate": 1.377568796106631e-06, + "loss": 0.0194, + "step": 19800 + }, + { + "epoch": 2.3477333333333332, + "grad_norm": 0.4414258845402759, + "learning_rate": 1.3728196606298238e-06, + "loss": 0.0199, + "step": 19810 + }, + { + "epoch": 2.3489185185185186, + "grad_norm": 0.5550926084482453, + "learning_rate": 1.3680774225485293e-06, + "loss": 0.0186, + "step": 19820 + }, + { + "epoch": 2.3501037037037036, + "grad_norm": 0.6693271388860823, + "learning_rate": 1.363342090880525e-06, + "loss": 0.0195, + "step": 19830 + }, + { + "epoch": 2.351288888888889, + "grad_norm": 0.6086432568859389, + "learning_rate": 1.358613674630448e-06, + "loss": 0.02, + "step": 19840 + }, + { + "epoch": 2.352474074074074, + "grad_norm": 0.5725342898704174, + "learning_rate": 1.3538921827897922e-06, + "loss": 0.0182, + "step": 19850 + }, + { + "epoch": 2.3536592592592593, + "grad_norm": 0.6380906332845099, + "learning_rate": 1.3491776243368782e-06, + "loss": 0.0182, + "step": 19860 + }, + { + "epoch": 2.3548444444444443, + "grad_norm": 0.5021172832151709, + "learning_rate": 1.3444700082368434e-06, + "loss": 0.019, + "step": 19870 + }, + { + "epoch": 2.3560296296296297, + "grad_norm": 0.5801743272886423, + "learning_rate": 1.3397693434416287e-06, + "loss": 0.0219, + "step": 19880 + }, + { + "epoch": 2.3572148148148147, + "grad_norm": 0.6263972576829657, + "learning_rate": 1.3350756388899499e-06, + "loss": 0.0192, + "step": 19890 + }, + { + "epoch": 2.3584, + "grad_norm": 0.645875775540546, + "learning_rate": 1.3303889035072892e-06, + "loss": 0.0202, + "step": 19900 + }, + { + "epoch": 2.359585185185185, + "grad_norm": 0.5873097139859446, + "learning_rate": 1.3257091462058807e-06, + "loss": 0.0191, + "step": 19910 + }, + { + "epoch": 2.3607703703703704, + "grad_norm": 0.5594482704350012, + "learning_rate": 1.3210363758846817e-06, + "loss": 0.0183, + "step": 19920 + }, + { + "epoch": 2.3619555555555554, + "grad_norm": 0.5172918686875249, + "learning_rate": 1.3163706014293703e-06, + "loss": 0.018, + "step": 19930 + }, + { + "epoch": 2.3631407407407408, + "grad_norm": 0.7315382363360655, + "learning_rate": 1.3117118317123167e-06, + "loss": 0.0184, + "step": 19940 + }, + { + "epoch": 2.364325925925926, + "grad_norm": 0.6329775198479737, + "learning_rate": 1.3070600755925712e-06, + "loss": 0.0202, + "step": 19950 + }, + { + "epoch": 2.365511111111111, + "grad_norm": 0.829793080870299, + "learning_rate": 1.3024153419158509e-06, + "loss": 0.0194, + "step": 19960 + }, + { + "epoch": 2.366696296296296, + "grad_norm": 0.5805543670629139, + "learning_rate": 1.2977776395145147e-06, + "loss": 0.0222, + "step": 19970 + }, + { + "epoch": 2.3678814814814815, + "grad_norm": 0.6373240288302964, + "learning_rate": 1.2931469772075534e-06, + "loss": 0.0208, + "step": 19980 + }, + { + "epoch": 2.369066666666667, + "grad_norm": 0.3793110225414597, + "learning_rate": 1.2885233638005679e-06, + "loss": 0.0176, + "step": 19990 + }, + { + "epoch": 2.370251851851852, + "grad_norm": 0.4356969281611011, + "learning_rate": 1.2839068080857591e-06, + "loss": 0.016, + "step": 20000 + }, + { + "epoch": 2.371437037037037, + "grad_norm": 0.6181439744287168, + "learning_rate": 1.2792973188419056e-06, + "loss": 0.0182, + "step": 20010 + }, + { + "epoch": 2.372622222222222, + "grad_norm": 0.4833865287062214, + "learning_rate": 1.2746949048343465e-06, + "loss": 0.0183, + "step": 20020 + }, + { + "epoch": 2.3738074074074076, + "grad_norm": 0.4609952543836555, + "learning_rate": 1.2700995748149675e-06, + "loss": 0.0181, + "step": 20030 + }, + { + "epoch": 2.3749925925925925, + "grad_norm": 0.5896606908275395, + "learning_rate": 1.2655113375221856e-06, + "loss": 0.018, + "step": 20040 + }, + { + "epoch": 2.376177777777778, + "grad_norm": 0.6982786492693911, + "learning_rate": 1.2609302016809277e-06, + "loss": 0.0209, + "step": 20050 + }, + { + "epoch": 2.377362962962963, + "grad_norm": 0.5067000949243737, + "learning_rate": 1.2563561760026188e-06, + "loss": 0.0186, + "step": 20060 + }, + { + "epoch": 2.3785481481481483, + "grad_norm": 0.5775924887833543, + "learning_rate": 1.2517892691851597e-06, + "loss": 0.0204, + "step": 20070 + }, + { + "epoch": 2.3797333333333333, + "grad_norm": 0.5103803660557463, + "learning_rate": 1.2472294899129184e-06, + "loss": 0.0192, + "step": 20080 + }, + { + "epoch": 2.3809185185185187, + "grad_norm": 0.5376253046623067, + "learning_rate": 1.242676846856709e-06, + "loss": 0.0191, + "step": 20090 + }, + { + "epoch": 2.3821037037037036, + "grad_norm": 0.5813296518527333, + "learning_rate": 1.2381313486737728e-06, + "loss": 0.0201, + "step": 20100 + }, + { + "epoch": 2.383288888888889, + "grad_norm": 0.4687920349022994, + "learning_rate": 1.2335930040077643e-06, + "loss": 0.016, + "step": 20110 + }, + { + "epoch": 2.384474074074074, + "grad_norm": 0.8056837676198028, + "learning_rate": 1.229061821488739e-06, + "loss": 0.0181, + "step": 20120 + }, + { + "epoch": 2.3856592592592594, + "grad_norm": 0.530935296628858, + "learning_rate": 1.224537809733129e-06, + "loss": 0.0178, + "step": 20130 + }, + { + "epoch": 2.3868444444444443, + "grad_norm": 0.42222411654219644, + "learning_rate": 1.2200209773437316e-06, + "loss": 0.0171, + "step": 20140 + }, + { + "epoch": 2.3880296296296297, + "grad_norm": 0.6151064347479412, + "learning_rate": 1.2155113329096912e-06, + "loss": 0.0195, + "step": 20150 + }, + { + "epoch": 2.3892148148148147, + "grad_norm": 0.5238278595648099, + "learning_rate": 1.2110088850064867e-06, + "loss": 0.018, + "step": 20160 + }, + { + "epoch": 2.3904, + "grad_norm": 0.5902010228891127, + "learning_rate": 1.20651364219591e-06, + "loss": 0.0199, + "step": 20170 + }, + { + "epoch": 2.391585185185185, + "grad_norm": 0.5296221771444523, + "learning_rate": 1.2020256130260521e-06, + "loss": 0.018, + "step": 20180 + }, + { + "epoch": 2.3927703703703704, + "grad_norm": 0.7476656151780144, + "learning_rate": 1.1975448060312867e-06, + "loss": 0.0192, + "step": 20190 + }, + { + "epoch": 2.3939555555555554, + "grad_norm": 0.6148062697199904, + "learning_rate": 1.193071229732251e-06, + "loss": 0.0176, + "step": 20200 + }, + { + "epoch": 2.395140740740741, + "grad_norm": 0.6390239970535004, + "learning_rate": 1.1886048926358396e-06, + "loss": 0.0195, + "step": 20210 + }, + { + "epoch": 2.3963259259259257, + "grad_norm": 0.4152743139619699, + "learning_rate": 1.184145803235175e-06, + "loss": 0.0175, + "step": 20220 + }, + { + "epoch": 2.397511111111111, + "grad_norm": 0.5160938383689863, + "learning_rate": 1.1796939700095971e-06, + "loss": 0.0167, + "step": 20230 + }, + { + "epoch": 2.398696296296296, + "grad_norm": 0.5336357654447675, + "learning_rate": 1.1752494014246523e-06, + "loss": 0.0184, + "step": 20240 + }, + { + "epoch": 2.3998814814814815, + "grad_norm": 0.48292619340509424, + "learning_rate": 1.1708121059320709e-06, + "loss": 0.0181, + "step": 20250 + }, + { + "epoch": 2.401066666666667, + "grad_norm": 0.6358836808865118, + "learning_rate": 1.1663820919697516e-06, + "loss": 0.0166, + "step": 20260 + }, + { + "epoch": 2.402251851851852, + "grad_norm": 0.6887077522557343, + "learning_rate": 1.1619593679617457e-06, + "loss": 0.0203, + "step": 20270 + }, + { + "epoch": 2.403437037037037, + "grad_norm": 0.39823512228946023, + "learning_rate": 1.1575439423182433e-06, + "loss": 0.0181, + "step": 20280 + }, + { + "epoch": 2.404622222222222, + "grad_norm": 0.5429115888833044, + "learning_rate": 1.1531358234355588e-06, + "loss": 0.0188, + "step": 20290 + }, + { + "epoch": 2.4058074074074076, + "grad_norm": 0.4520706804232499, + "learning_rate": 1.1487350196961078e-06, + "loss": 0.0158, + "step": 20300 + }, + { + "epoch": 2.4069925925925926, + "grad_norm": 0.4650095784590796, + "learning_rate": 1.1443415394683955e-06, + "loss": 0.0168, + "step": 20310 + }, + { + "epoch": 2.4081777777777775, + "grad_norm": 0.6489649052322313, + "learning_rate": 1.139955391107005e-06, + "loss": 0.0183, + "step": 20320 + }, + { + "epoch": 2.409362962962963, + "grad_norm": 0.4500355397020522, + "learning_rate": 1.1355765829525755e-06, + "loss": 0.0172, + "step": 20330 + }, + { + "epoch": 2.4105481481481483, + "grad_norm": 0.5583578151689336, + "learning_rate": 1.1312051233317861e-06, + "loss": 0.0193, + "step": 20340 + }, + { + "epoch": 2.4117333333333333, + "grad_norm": 0.5672274555016643, + "learning_rate": 1.1268410205573438e-06, + "loss": 0.0179, + "step": 20350 + }, + { + "epoch": 2.4129185185185187, + "grad_norm": 0.5523373177581667, + "learning_rate": 1.1224842829279636e-06, + "loss": 0.0191, + "step": 20360 + }, + { + "epoch": 2.4141037037037036, + "grad_norm": 0.6326189869299843, + "learning_rate": 1.1181349187283602e-06, + "loss": 0.0198, + "step": 20370 + }, + { + "epoch": 2.415288888888889, + "grad_norm": 0.4857763939803879, + "learning_rate": 1.1137929362292211e-06, + "loss": 0.016, + "step": 20380 + }, + { + "epoch": 2.416474074074074, + "grad_norm": 0.5786082447947875, + "learning_rate": 1.109458343687202e-06, + "loss": 0.0187, + "step": 20390 + }, + { + "epoch": 2.4176592592592594, + "grad_norm": 0.5609634800891323, + "learning_rate": 1.1051311493449023e-06, + "loss": 0.0193, + "step": 20400 + }, + { + "epoch": 2.4188444444444444, + "grad_norm": 0.52716932764828, + "learning_rate": 1.1008113614308536e-06, + "loss": 0.0174, + "step": 20410 + }, + { + "epoch": 2.4200296296296298, + "grad_norm": 0.5480009233089214, + "learning_rate": 1.0964989881595068e-06, + "loss": 0.0194, + "step": 20420 + }, + { + "epoch": 2.4212148148148147, + "grad_norm": 0.5680722319281621, + "learning_rate": 1.0921940377312086e-06, + "loss": 0.0182, + "step": 20430 + }, + { + "epoch": 2.4224, + "grad_norm": 0.5247975363708455, + "learning_rate": 1.0878965183321922e-06, + "loss": 0.0191, + "step": 20440 + }, + { + "epoch": 2.423585185185185, + "grad_norm": 0.6196770658196401, + "learning_rate": 1.083606438134563e-06, + "loss": 0.0208, + "step": 20450 + }, + { + "epoch": 2.4247703703703705, + "grad_norm": 0.6681509574607049, + "learning_rate": 1.0793238052962746e-06, + "loss": 0.0186, + "step": 20460 + }, + { + "epoch": 2.4259555555555554, + "grad_norm": 0.5390470692480126, + "learning_rate": 1.0750486279611245e-06, + "loss": 0.0182, + "step": 20470 + }, + { + "epoch": 2.427140740740741, + "grad_norm": 0.5738117947414857, + "learning_rate": 1.0707809142587294e-06, + "loss": 0.0179, + "step": 20480 + }, + { + "epoch": 2.4283259259259258, + "grad_norm": 0.49800061206180446, + "learning_rate": 1.0665206723045125e-06, + "loss": 0.0203, + "step": 20490 + }, + { + "epoch": 2.429511111111111, + "grad_norm": 0.4078708010768559, + "learning_rate": 1.0622679101996936e-06, + "loss": 0.0155, + "step": 20500 + }, + { + "epoch": 2.430696296296296, + "grad_norm": 0.5796668537692614, + "learning_rate": 1.0580226360312634e-06, + "loss": 0.0178, + "step": 20510 + }, + { + "epoch": 2.4318814814814815, + "grad_norm": 0.6701549779120091, + "learning_rate": 1.0537848578719755e-06, + "loss": 0.0171, + "step": 20520 + }, + { + "epoch": 2.4330666666666665, + "grad_norm": 0.6190968495996059, + "learning_rate": 1.0495545837803323e-06, + "loss": 0.0177, + "step": 20530 + }, + { + "epoch": 2.434251851851852, + "grad_norm": 0.6537220843523749, + "learning_rate": 1.0453318218005614e-06, + "loss": 0.0192, + "step": 20540 + }, + { + "epoch": 2.435437037037037, + "grad_norm": 0.6003166181757401, + "learning_rate": 1.04111657996261e-06, + "loss": 0.0187, + "step": 20550 + }, + { + "epoch": 2.4366222222222222, + "grad_norm": 0.34957326165006, + "learning_rate": 1.0369088662821225e-06, + "loss": 0.0178, + "step": 20560 + }, + { + "epoch": 2.4378074074074076, + "grad_norm": 0.6648562584701454, + "learning_rate": 1.032708688760427e-06, + "loss": 0.019, + "step": 20570 + }, + { + "epoch": 2.4389925925925926, + "grad_norm": 0.6624831197497233, + "learning_rate": 1.0285160553845253e-06, + "loss": 0.0165, + "step": 20580 + }, + { + "epoch": 2.4401777777777776, + "grad_norm": 0.5150473724953625, + "learning_rate": 1.0243309741270691e-06, + "loss": 0.0172, + "step": 20590 + }, + { + "epoch": 2.441362962962963, + "grad_norm": 0.6223207998858351, + "learning_rate": 1.0201534529463502e-06, + "loss": 0.0189, + "step": 20600 + }, + { + "epoch": 2.4425481481481484, + "grad_norm": 0.5211990941835191, + "learning_rate": 1.0159834997862834e-06, + "loss": 0.0179, + "step": 20610 + }, + { + "epoch": 2.4437333333333333, + "grad_norm": 0.5815406834050634, + "learning_rate": 1.0118211225763947e-06, + "loss": 0.017, + "step": 20620 + }, + { + "epoch": 2.4449185185185183, + "grad_norm": 0.6294226134977733, + "learning_rate": 1.007666329231804e-06, + "loss": 0.0191, + "step": 20630 + }, + { + "epoch": 2.4461037037037037, + "grad_norm": 0.466524384875676, + "learning_rate": 1.0035191276532075e-06, + "loss": 0.0155, + "step": 20640 + }, + { + "epoch": 2.447288888888889, + "grad_norm": 0.48680948326900636, + "learning_rate": 9.993795257268634e-07, + "loss": 0.0185, + "step": 20650 + }, + { + "epoch": 2.448474074074074, + "grad_norm": 0.6282117162838744, + "learning_rate": 9.952475313245847e-07, + "loss": 0.0184, + "step": 20660 + }, + { + "epoch": 2.4496592592592594, + "grad_norm": 0.5557566609818801, + "learning_rate": 9.911231523037124e-07, + "loss": 0.0171, + "step": 20670 + }, + { + "epoch": 2.4508444444444444, + "grad_norm": 0.6363297164808771, + "learning_rate": 9.87006396507108e-07, + "loss": 0.0176, + "step": 20680 + }, + { + "epoch": 2.45202962962963, + "grad_norm": 0.6418935685042145, + "learning_rate": 9.828972717631357e-07, + "loss": 0.0199, + "step": 20690 + }, + { + "epoch": 2.4532148148148147, + "grad_norm": 0.4677998700735346, + "learning_rate": 9.787957858856512e-07, + "loss": 0.0182, + "step": 20700 + }, + { + "epoch": 2.4544, + "grad_norm": 0.5943617063586708, + "learning_rate": 9.747019466739837e-07, + "loss": 0.0173, + "step": 20710 + }, + { + "epoch": 2.455585185185185, + "grad_norm": 0.4774857337426721, + "learning_rate": 9.706157619129202e-07, + "loss": 0.0176, + "step": 20720 + }, + { + "epoch": 2.4567703703703705, + "grad_norm": 0.6658074825090965, + "learning_rate": 9.665372393726908e-07, + "loss": 0.0202, + "step": 20730 + }, + { + "epoch": 2.4579555555555554, + "grad_norm": 0.5770231594772989, + "learning_rate": 9.624663868089596e-07, + "loss": 0.0175, + "step": 20740 + }, + { + "epoch": 2.459140740740741, + "grad_norm": 0.6103775137267466, + "learning_rate": 9.584032119628017e-07, + "loss": 0.0176, + "step": 20750 + }, + { + "epoch": 2.460325925925926, + "grad_norm": 0.5057995711538233, + "learning_rate": 9.54347722560694e-07, + "loss": 0.0165, + "step": 20760 + }, + { + "epoch": 2.461511111111111, + "grad_norm": 0.6415131055077778, + "learning_rate": 9.502999263144969e-07, + "loss": 0.019, + "step": 20770 + }, + { + "epoch": 2.462696296296296, + "grad_norm": 0.6091025567475564, + "learning_rate": 9.462598309214449e-07, + "loss": 0.019, + "step": 20780 + }, + { + "epoch": 2.4638814814814816, + "grad_norm": 0.6544818226473474, + "learning_rate": 9.422274440641277e-07, + "loss": 0.0184, + "step": 20790 + }, + { + "epoch": 2.4650666666666665, + "grad_norm": 0.5133113890220043, + "learning_rate": 9.382027734104754e-07, + "loss": 0.0172, + "step": 20800 + }, + { + "epoch": 2.466251851851852, + "grad_norm": 0.5838971367930164, + "learning_rate": 9.341858266137449e-07, + "loss": 0.0166, + "step": 20810 + }, + { + "epoch": 2.467437037037037, + "grad_norm": 0.6277586066184481, + "learning_rate": 9.301766113125055e-07, + "loss": 0.0184, + "step": 20820 + }, + { + "epoch": 2.4686222222222223, + "grad_norm": 0.395363789973833, + "learning_rate": 9.261751351306269e-07, + "loss": 0.0169, + "step": 20830 + }, + { + "epoch": 2.4698074074074072, + "grad_norm": 0.5393064163374834, + "learning_rate": 9.221814056772599e-07, + "loss": 0.019, + "step": 20840 + }, + { + "epoch": 2.4709925925925926, + "grad_norm": 0.5508832993128663, + "learning_rate": 9.181954305468221e-07, + "loss": 0.0167, + "step": 20850 + }, + { + "epoch": 2.4721777777777776, + "grad_norm": 0.6446638153025966, + "learning_rate": 9.142172173189912e-07, + "loss": 0.0212, + "step": 20860 + }, + { + "epoch": 2.473362962962963, + "grad_norm": 0.5644175929859583, + "learning_rate": 9.102467735586823e-07, + "loss": 0.0173, + "step": 20870 + }, + { + "epoch": 2.4745481481481484, + "grad_norm": 0.7049009211193161, + "learning_rate": 9.062841068160338e-07, + "loss": 0.0181, + "step": 20880 + }, + { + "epoch": 2.4757333333333333, + "grad_norm": 0.48408214004509653, + "learning_rate": 9.023292246263992e-07, + "loss": 0.017, + "step": 20890 + }, + { + "epoch": 2.4769185185185183, + "grad_norm": 0.4484831048629232, + "learning_rate": 8.98382134510325e-07, + "loss": 0.0181, + "step": 20900 + }, + { + "epoch": 2.4781037037037037, + "grad_norm": 0.509910222167136, + "learning_rate": 8.944428439735448e-07, + "loss": 0.0172, + "step": 20910 + }, + { + "epoch": 2.479288888888889, + "grad_norm": 0.454247380281483, + "learning_rate": 8.905113605069571e-07, + "loss": 0.0192, + "step": 20920 + }, + { + "epoch": 2.480474074074074, + "grad_norm": 0.5069856836367086, + "learning_rate": 8.865876915866178e-07, + "loss": 0.0178, + "step": 20930 + }, + { + "epoch": 2.4816592592592595, + "grad_norm": 0.5715235756999294, + "learning_rate": 8.826718446737181e-07, + "loss": 0.0176, + "step": 20940 + }, + { + "epoch": 2.4828444444444444, + "grad_norm": 0.48336785931290466, + "learning_rate": 8.787638272145815e-07, + "loss": 0.0185, + "step": 20950 + }, + { + "epoch": 2.48402962962963, + "grad_norm": 0.5901594529833316, + "learning_rate": 8.748636466406374e-07, + "loss": 0.0188, + "step": 20960 + }, + { + "epoch": 2.4852148148148148, + "grad_norm": 0.6908066539999371, + "learning_rate": 8.709713103684142e-07, + "loss": 0.0185, + "step": 20970 + }, + { + "epoch": 2.4864, + "grad_norm": 0.6924979337226718, + "learning_rate": 8.670868257995247e-07, + "loss": 0.0199, + "step": 20980 + }, + { + "epoch": 2.487585185185185, + "grad_norm": 0.7002005115330637, + "learning_rate": 8.632102003206511e-07, + "loss": 0.0187, + "step": 20990 + }, + { + "epoch": 2.4887703703703705, + "grad_norm": 0.48501133777228894, + "learning_rate": 8.593414413035294e-07, + "loss": 0.016, + "step": 21000 + }, + { + "epoch": 2.4899555555555555, + "grad_norm": 0.5651002456756551, + "learning_rate": 8.554805561049395e-07, + "loss": 0.0179, + "step": 21010 + }, + { + "epoch": 2.491140740740741, + "grad_norm": 0.6454325498863057, + "learning_rate": 8.516275520666839e-07, + "loss": 0.0173, + "step": 21020 + }, + { + "epoch": 2.492325925925926, + "grad_norm": 0.5595164186336209, + "learning_rate": 8.477824365155851e-07, + "loss": 0.0175, + "step": 21030 + }, + { + "epoch": 2.4935111111111112, + "grad_norm": 0.5262821279548828, + "learning_rate": 8.439452167634587e-07, + "loss": 0.0169, + "step": 21040 + }, + { + "epoch": 2.494696296296296, + "grad_norm": 0.45082946246916983, + "learning_rate": 8.401159001071086e-07, + "loss": 0.018, + "step": 21050 + }, + { + "epoch": 2.4958814814814816, + "grad_norm": 0.538520539920005, + "learning_rate": 8.362944938283085e-07, + "loss": 0.0209, + "step": 21060 + }, + { + "epoch": 2.4970666666666665, + "grad_norm": 0.6109965390008959, + "learning_rate": 8.324810051937942e-07, + "loss": 0.0163, + "step": 21070 + }, + { + "epoch": 2.498251851851852, + "grad_norm": 0.7242181627637891, + "learning_rate": 8.2867544145524e-07, + "loss": 0.02, + "step": 21080 + }, + { + "epoch": 2.499437037037037, + "grad_norm": 0.4948397091771907, + "learning_rate": 8.248778098492549e-07, + "loss": 0.0181, + "step": 21090 + }, + { + "epoch": 2.5006222222222223, + "grad_norm": 0.7489629904051068, + "learning_rate": 8.210881175973611e-07, + "loss": 0.0181, + "step": 21100 + }, + { + "epoch": 2.5018074074074073, + "grad_norm": 0.41749780441090956, + "learning_rate": 8.173063719059832e-07, + "loss": 0.0146, + "step": 21110 + }, + { + "epoch": 2.5029925925925927, + "grad_norm": 0.4212418413036383, + "learning_rate": 8.135325799664384e-07, + "loss": 0.0157, + "step": 21120 + }, + { + "epoch": 2.5041777777777776, + "grad_norm": 0.43280182167680353, + "learning_rate": 8.097667489549161e-07, + "loss": 0.0186, + "step": 21130 + }, + { + "epoch": 2.505362962962963, + "grad_norm": 0.4316497829943779, + "learning_rate": 8.060088860324661e-07, + "loss": 0.0197, + "step": 21140 + }, + { + "epoch": 2.5065481481481484, + "grad_norm": 0.6521403724281614, + "learning_rate": 8.022589983449908e-07, + "loss": 0.0168, + "step": 21150 + }, + { + "epoch": 2.5077333333333334, + "grad_norm": 0.47889010429565065, + "learning_rate": 7.985170930232216e-07, + "loss": 0.0169, + "step": 21160 + }, + { + "epoch": 2.5089185185185183, + "grad_norm": 0.507653225728503, + "learning_rate": 7.947831771827164e-07, + "loss": 0.0176, + "step": 21170 + }, + { + "epoch": 2.5101037037037037, + "grad_norm": 0.4547264446765598, + "learning_rate": 7.910572579238357e-07, + "loss": 0.0166, + "step": 21180 + }, + { + "epoch": 2.511288888888889, + "grad_norm": 0.6308250895773334, + "learning_rate": 7.873393423317349e-07, + "loss": 0.0184, + "step": 21190 + }, + { + "epoch": 2.512474074074074, + "grad_norm": 0.5262251487920135, + "learning_rate": 7.83629437476352e-07, + "loss": 0.0179, + "step": 21200 + }, + { + "epoch": 2.513659259259259, + "grad_norm": 0.46133876070066765, + "learning_rate": 7.799275504123904e-07, + "loss": 0.0187, + "step": 21210 + }, + { + "epoch": 2.5148444444444444, + "grad_norm": 0.7015140106863733, + "learning_rate": 7.762336881793053e-07, + "loss": 0.0186, + "step": 21220 + }, + { + "epoch": 2.51602962962963, + "grad_norm": 0.5958186357381842, + "learning_rate": 7.725478578012929e-07, + "loss": 0.0181, + "step": 21230 + }, + { + "epoch": 2.517214814814815, + "grad_norm": 0.5900887500158141, + "learning_rate": 7.6887006628728e-07, + "loss": 0.0194, + "step": 21240 + }, + { + "epoch": 2.5183999999999997, + "grad_norm": 0.45510057858318664, + "learning_rate": 7.652003206309022e-07, + "loss": 0.0148, + "step": 21250 + }, + { + "epoch": 2.519585185185185, + "grad_norm": 0.41252168881070916, + "learning_rate": 7.615386278104964e-07, + "loss": 0.0163, + "step": 21260 + }, + { + "epoch": 2.5207703703703706, + "grad_norm": 0.6395488212258044, + "learning_rate": 7.578849947890848e-07, + "loss": 0.018, + "step": 21270 + }, + { + "epoch": 2.5219555555555555, + "grad_norm": 0.5389037633987491, + "learning_rate": 7.542394285143684e-07, + "loss": 0.0193, + "step": 21280 + }, + { + "epoch": 2.523140740740741, + "grad_norm": 0.5573306194458528, + "learning_rate": 7.506019359187034e-07, + "loss": 0.02, + "step": 21290 + }, + { + "epoch": 2.524325925925926, + "grad_norm": 0.5403072811635534, + "learning_rate": 7.469725239190945e-07, + "loss": 0.0166, + "step": 21300 + }, + { + "epoch": 2.5255111111111113, + "grad_norm": 0.5884307506421887, + "learning_rate": 7.433511994171805e-07, + "loss": 0.0172, + "step": 21310 + }, + { + "epoch": 2.526696296296296, + "grad_norm": 0.4863037969486824, + "learning_rate": 7.397379692992224e-07, + "loss": 0.0162, + "step": 21320 + }, + { + "epoch": 2.5278814814814816, + "grad_norm": 0.5473849752231416, + "learning_rate": 7.361328404360896e-07, + "loss": 0.0186, + "step": 21330 + }, + { + "epoch": 2.5290666666666666, + "grad_norm": 0.5552595876953792, + "learning_rate": 7.325358196832432e-07, + "loss": 0.0185, + "step": 21340 + }, + { + "epoch": 2.530251851851852, + "grad_norm": 0.705056990597905, + "learning_rate": 7.289469138807265e-07, + "loss": 0.0184, + "step": 21350 + }, + { + "epoch": 2.531437037037037, + "grad_norm": 0.43833826213742755, + "learning_rate": 7.253661298531555e-07, + "loss": 0.0162, + "step": 21360 + }, + { + "epoch": 2.5326222222222223, + "grad_norm": 0.5135562918746531, + "learning_rate": 7.217934744096977e-07, + "loss": 0.0182, + "step": 21370 + }, + { + "epoch": 2.5338074074074073, + "grad_norm": 0.3904585435870939, + "learning_rate": 7.182289543440652e-07, + "loss": 0.0169, + "step": 21380 + }, + { + "epoch": 2.5349925925925927, + "grad_norm": 0.6922550774775605, + "learning_rate": 7.146725764344975e-07, + "loss": 0.019, + "step": 21390 + }, + { + "epoch": 2.5361777777777776, + "grad_norm": 0.7244832875800394, + "learning_rate": 7.111243474437551e-07, + "loss": 0.0182, + "step": 21400 + }, + { + "epoch": 2.537362962962963, + "grad_norm": 0.5821610690014783, + "learning_rate": 7.075842741191019e-07, + "loss": 0.0171, + "step": 21410 + }, + { + "epoch": 2.538548148148148, + "grad_norm": 0.5620961385311694, + "learning_rate": 7.040523631922907e-07, + "loss": 0.0196, + "step": 21420 + }, + { + "epoch": 2.5397333333333334, + "grad_norm": 0.48953769431157984, + "learning_rate": 7.005286213795537e-07, + "loss": 0.016, + "step": 21430 + }, + { + "epoch": 2.5409185185185184, + "grad_norm": 0.34011777249262254, + "learning_rate": 6.970130553815884e-07, + "loss": 0.0149, + "step": 21440 + }, + { + "epoch": 2.5421037037037038, + "grad_norm": 0.47990807369537847, + "learning_rate": 6.935056718835487e-07, + "loss": 0.0176, + "step": 21450 + }, + { + "epoch": 2.543288888888889, + "grad_norm": 0.5630613010515627, + "learning_rate": 6.900064775550225e-07, + "loss": 0.0181, + "step": 21460 + }, + { + "epoch": 2.544474074074074, + "grad_norm": 0.5257591948589118, + "learning_rate": 6.865154790500323e-07, + "loss": 0.0174, + "step": 21470 + }, + { + "epoch": 2.545659259259259, + "grad_norm": 0.6363239781576507, + "learning_rate": 6.8303268300701e-07, + "loss": 0.0177, + "step": 21480 + }, + { + "epoch": 2.5468444444444445, + "grad_norm": 0.4752475882989525, + "learning_rate": 6.79558096048793e-07, + "loss": 0.0164, + "step": 21490 + }, + { + "epoch": 2.54802962962963, + "grad_norm": 0.6317766217623786, + "learning_rate": 6.760917247826076e-07, + "loss": 0.0191, + "step": 21500 + }, + { + "epoch": 2.549214814814815, + "grad_norm": 0.6121659917330085, + "learning_rate": 6.726335758000562e-07, + "loss": 0.0176, + "step": 21510 + }, + { + "epoch": 2.5504, + "grad_norm": 0.6697636341281199, + "learning_rate": 6.691836556771058e-07, + "loss": 0.0174, + "step": 21520 + }, + { + "epoch": 2.551585185185185, + "grad_norm": 0.50710880388447, + "learning_rate": 6.657419709740787e-07, + "loss": 0.0178, + "step": 21530 + }, + { + "epoch": 2.5527703703703706, + "grad_norm": 0.5110129681647883, + "learning_rate": 6.623085282356323e-07, + "loss": 0.0164, + "step": 21540 + }, + { + "epoch": 2.5539555555555555, + "grad_norm": 0.5453865599967119, + "learning_rate": 6.588833339907552e-07, + "loss": 0.0171, + "step": 21550 + }, + { + "epoch": 2.5551407407407405, + "grad_norm": 0.5300781123461664, + "learning_rate": 6.554663947527474e-07, + "loss": 0.0184, + "step": 21560 + }, + { + "epoch": 2.556325925925926, + "grad_norm": 0.4783806902147297, + "learning_rate": 6.520577170192144e-07, + "loss": 0.0161, + "step": 21570 + }, + { + "epoch": 2.5575111111111113, + "grad_norm": 0.44039738005484225, + "learning_rate": 6.486573072720493e-07, + "loss": 0.0179, + "step": 21580 + }, + { + "epoch": 2.5586962962962962, + "grad_norm": 0.6792744994409649, + "learning_rate": 6.452651719774239e-07, + "loss": 0.0175, + "step": 21590 + }, + { + "epoch": 2.5598814814814816, + "grad_norm": 0.5605031491370238, + "learning_rate": 6.418813175857747e-07, + "loss": 0.018, + "step": 21600 + }, + { + "epoch": 2.5610666666666666, + "grad_norm": 0.8015951623495515, + "learning_rate": 6.385057505317932e-07, + "loss": 0.0189, + "step": 21610 + }, + { + "epoch": 2.562251851851852, + "grad_norm": 0.6331864010093162, + "learning_rate": 6.351384772344094e-07, + "loss": 0.0168, + "step": 21620 + }, + { + "epoch": 2.563437037037037, + "grad_norm": 0.6184904424231238, + "learning_rate": 6.317795040967844e-07, + "loss": 0.0187, + "step": 21630 + }, + { + "epoch": 2.5646222222222224, + "grad_norm": 0.7186390751573959, + "learning_rate": 6.284288375062936e-07, + "loss": 0.017, + "step": 21640 + }, + { + "epoch": 2.5658074074074073, + "grad_norm": 0.7152679327537902, + "learning_rate": 6.250864838345195e-07, + "loss": 0.016, + "step": 21650 + }, + { + "epoch": 2.5669925925925927, + "grad_norm": 0.5581145556379897, + "learning_rate": 6.217524494372334e-07, + "loss": 0.0181, + "step": 21660 + }, + { + "epoch": 2.5681777777777777, + "grad_norm": 0.4855017977285862, + "learning_rate": 6.184267406543898e-07, + "loss": 0.0157, + "step": 21670 + }, + { + "epoch": 2.569362962962963, + "grad_norm": 0.3412304786470055, + "learning_rate": 6.151093638101086e-07, + "loss": 0.0175, + "step": 21680 + }, + { + "epoch": 2.570548148148148, + "grad_norm": 0.6497795456381428, + "learning_rate": 6.118003252126686e-07, + "loss": 0.0177, + "step": 21690 + }, + { + "epoch": 2.5717333333333334, + "grad_norm": 0.6205662322167813, + "learning_rate": 6.084996311544905e-07, + "loss": 0.0168, + "step": 21700 + }, + { + "epoch": 2.5729185185185184, + "grad_norm": 0.5641759402797629, + "learning_rate": 6.052072879121296e-07, + "loss": 0.0175, + "step": 21710 + }, + { + "epoch": 2.574103703703704, + "grad_norm": 0.6385352737210143, + "learning_rate": 6.019233017462589e-07, + "loss": 0.0164, + "step": 21720 + }, + { + "epoch": 2.5752888888888887, + "grad_norm": 0.621744765216219, + "learning_rate": 5.986476789016598e-07, + "loss": 0.0187, + "step": 21730 + }, + { + "epoch": 2.576474074074074, + "grad_norm": 0.5552101013667966, + "learning_rate": 5.953804256072127e-07, + "loss": 0.0177, + "step": 21740 + }, + { + "epoch": 2.577659259259259, + "grad_norm": 0.5573477007551515, + "learning_rate": 5.921215480758796e-07, + "loss": 0.0181, + "step": 21750 + }, + { + "epoch": 2.5788444444444445, + "grad_norm": 0.42414849129449866, + "learning_rate": 5.888710525046964e-07, + "loss": 0.0177, + "step": 21760 + }, + { + "epoch": 2.58002962962963, + "grad_norm": 0.699740183589654, + "learning_rate": 5.856289450747604e-07, + "loss": 0.0179, + "step": 21770 + }, + { + "epoch": 2.581214814814815, + "grad_norm": 0.5647578247178993, + "learning_rate": 5.823952319512194e-07, + "loss": 0.0179, + "step": 21780 + }, + { + "epoch": 2.5824, + "grad_norm": 0.5556553113987365, + "learning_rate": 5.791699192832556e-07, + "loss": 0.0177, + "step": 21790 + }, + { + "epoch": 2.583585185185185, + "grad_norm": 0.7924041878845579, + "learning_rate": 5.759530132040791e-07, + "loss": 0.0185, + "step": 21800 + }, + { + "epoch": 2.5847703703703706, + "grad_norm": 0.6544991124887737, + "learning_rate": 5.727445198309118e-07, + "loss": 0.0184, + "step": 21810 + }, + { + "epoch": 2.5859555555555556, + "grad_norm": 0.5375126802594471, + "learning_rate": 5.695444452649829e-07, + "loss": 0.0172, + "step": 21820 + }, + { + "epoch": 2.5871407407407405, + "grad_norm": 0.4190573312393564, + "learning_rate": 5.663527955915083e-07, + "loss": 0.0178, + "step": 21830 + }, + { + "epoch": 2.588325925925926, + "grad_norm": 0.5744162874932179, + "learning_rate": 5.631695768796836e-07, + "loss": 0.0167, + "step": 21840 + }, + { + "epoch": 2.5895111111111113, + "grad_norm": 0.6786082923151243, + "learning_rate": 5.599947951826718e-07, + "loss": 0.0175, + "step": 21850 + }, + { + "epoch": 2.5906962962962963, + "grad_norm": 0.4806957075973286, + "learning_rate": 5.568284565375975e-07, + "loss": 0.0182, + "step": 21860 + }, + { + "epoch": 2.5918814814814812, + "grad_norm": 0.44272063982081145, + "learning_rate": 5.536705669655245e-07, + "loss": 0.0161, + "step": 21870 + }, + { + "epoch": 2.5930666666666666, + "grad_norm": 0.6462430155813553, + "learning_rate": 5.505211324714505e-07, + "loss": 0.0169, + "step": 21880 + }, + { + "epoch": 2.594251851851852, + "grad_norm": 0.658140325371253, + "learning_rate": 5.473801590442957e-07, + "loss": 0.0177, + "step": 21890 + }, + { + "epoch": 2.595437037037037, + "grad_norm": 0.5055702278737, + "learning_rate": 5.442476526568935e-07, + "loss": 0.0179, + "step": 21900 + }, + { + "epoch": 2.5966222222222224, + "grad_norm": 0.5905771934930385, + "learning_rate": 5.41123619265973e-07, + "loss": 0.0188, + "step": 21910 + }, + { + "epoch": 2.5978074074074073, + "grad_norm": 0.6105603733912472, + "learning_rate": 5.380080648121533e-07, + "loss": 0.0169, + "step": 21920 + }, + { + "epoch": 2.5989925925925927, + "grad_norm": 0.6795130059992837, + "learning_rate": 5.349009952199269e-07, + "loss": 0.0153, + "step": 21930 + }, + { + "epoch": 2.6001777777777777, + "grad_norm": 0.6455111369860846, + "learning_rate": 5.318024163976559e-07, + "loss": 0.0172, + "step": 21940 + }, + { + "epoch": 2.601362962962963, + "grad_norm": 0.5253832429760392, + "learning_rate": 5.287123342375555e-07, + "loss": 0.0182, + "step": 21950 + }, + { + "epoch": 2.602548148148148, + "grad_norm": 0.7139651011349015, + "learning_rate": 5.256307546156813e-07, + "loss": 0.0165, + "step": 21960 + }, + { + "epoch": 2.6037333333333335, + "grad_norm": 0.5856089789662409, + "learning_rate": 5.225576833919221e-07, + "loss": 0.0175, + "step": 21970 + }, + { + "epoch": 2.6049185185185184, + "grad_norm": 0.5570914749299625, + "learning_rate": 5.194931264099884e-07, + "loss": 0.0193, + "step": 21980 + }, + { + "epoch": 2.606103703703704, + "grad_norm": 0.47859589843313183, + "learning_rate": 5.16437089497398e-07, + "loss": 0.0159, + "step": 21990 + }, + { + "epoch": 2.6072888888888888, + "grad_norm": 0.638222679336556, + "learning_rate": 5.133895784654674e-07, + "loss": 0.0195, + "step": 22000 + }, + { + "epoch": 2.608474074074074, + "grad_norm": 0.5675545872739362, + "learning_rate": 5.103505991093027e-07, + "loss": 0.0174, + "step": 22010 + }, + { + "epoch": 2.609659259259259, + "grad_norm": 0.5828340663934573, + "learning_rate": 5.073201572077835e-07, + "loss": 0.0179, + "step": 22020 + }, + { + "epoch": 2.6108444444444445, + "grad_norm": 0.5984679910764497, + "learning_rate": 5.042982585235562e-07, + "loss": 0.0163, + "step": 22030 + }, + { + "epoch": 2.6120296296296295, + "grad_norm": 0.6353571844924941, + "learning_rate": 5.012849088030219e-07, + "loss": 0.0176, + "step": 22040 + }, + { + "epoch": 2.613214814814815, + "grad_norm": 0.5140429949902446, + "learning_rate": 4.982801137763227e-07, + "loss": 0.0161, + "step": 22050 + }, + { + "epoch": 2.6144, + "grad_norm": 0.7124805537751441, + "learning_rate": 4.952838791573361e-07, + "loss": 0.0189, + "step": 22060 + }, + { + "epoch": 2.6155851851851852, + "grad_norm": 0.6454204924765827, + "learning_rate": 4.922962106436602e-07, + "loss": 0.0176, + "step": 22070 + }, + { + "epoch": 2.6167703703703706, + "grad_norm": 0.5497005684727574, + "learning_rate": 4.893171139166026e-07, + "loss": 0.0162, + "step": 22080 + }, + { + "epoch": 2.6179555555555556, + "grad_norm": 0.4619831978468954, + "learning_rate": 4.863465946411733e-07, + "loss": 0.0165, + "step": 22090 + }, + { + "epoch": 2.6191407407407405, + "grad_norm": 0.6178746309084407, + "learning_rate": 4.83384658466069e-07, + "loss": 0.0178, + "step": 22100 + }, + { + "epoch": 2.620325925925926, + "grad_norm": 0.5588492409793266, + "learning_rate": 4.804313110236674e-07, + "loss": 0.0156, + "step": 22110 + }, + { + "epoch": 2.6215111111111113, + "grad_norm": 0.6636435517100956, + "learning_rate": 4.774865579300131e-07, + "loss": 0.0176, + "step": 22120 + }, + { + "epoch": 2.6226962962962963, + "grad_norm": 0.5201496617212275, + "learning_rate": 4.745504047848065e-07, + "loss": 0.0156, + "step": 22130 + }, + { + "epoch": 2.6238814814814813, + "grad_norm": 0.6693252769821623, + "learning_rate": 4.7162285717139434e-07, + "loss": 0.0182, + "step": 22140 + }, + { + "epoch": 2.6250666666666667, + "grad_norm": 0.62998544290871, + "learning_rate": 4.6870392065676286e-07, + "loss": 0.0177, + "step": 22150 + }, + { + "epoch": 2.626251851851852, + "grad_norm": 0.5642750873920185, + "learning_rate": 4.657936007915187e-07, + "loss": 0.0146, + "step": 22160 + }, + { + "epoch": 2.627437037037037, + "grad_norm": 0.7338902563262278, + "learning_rate": 4.6289190310988776e-07, + "loss": 0.0171, + "step": 22170 + }, + { + "epoch": 2.628622222222222, + "grad_norm": 0.6315221885905348, + "learning_rate": 4.599988331296956e-07, + "loss": 0.0177, + "step": 22180 + }, + { + "epoch": 2.6298074074074074, + "grad_norm": 0.5901158298625131, + "learning_rate": 4.5711439635236555e-07, + "loss": 0.0174, + "step": 22190 + }, + { + "epoch": 2.6309925925925928, + "grad_norm": 0.6136960373187711, + "learning_rate": 4.5423859826290216e-07, + "loss": 0.0158, + "step": 22200 + }, + { + "epoch": 2.6321777777777777, + "grad_norm": 0.7967018906772484, + "learning_rate": 4.513714443298817e-07, + "loss": 0.018, + "step": 22210 + }, + { + "epoch": 2.633362962962963, + "grad_norm": 0.47205781155001864, + "learning_rate": 4.485129400054444e-07, + "loss": 0.0157, + "step": 22220 + }, + { + "epoch": 2.634548148148148, + "grad_norm": 0.8096455963544056, + "learning_rate": 4.456630907252829e-07, + "loss": 0.0187, + "step": 22230 + }, + { + "epoch": 2.6357333333333335, + "grad_norm": 0.5486232504177728, + "learning_rate": 4.4282190190862993e-07, + "loss": 0.0174, + "step": 22240 + }, + { + "epoch": 2.6369185185185184, + "grad_norm": 0.4635635046714287, + "learning_rate": 4.399893789582516e-07, + "loss": 0.0176, + "step": 22250 + }, + { + "epoch": 2.638103703703704, + "grad_norm": 0.5545216509684127, + "learning_rate": 4.3716552726043305e-07, + "loss": 0.0177, + "step": 22260 + }, + { + "epoch": 2.639288888888889, + "grad_norm": 0.5796119638666397, + "learning_rate": 4.343503521849718e-07, + "loss": 0.017, + "step": 22270 + }, + { + "epoch": 2.640474074074074, + "grad_norm": 0.6595236578032794, + "learning_rate": 4.315438590851662e-07, + "loss": 0.0165, + "step": 22280 + }, + { + "epoch": 2.641659259259259, + "grad_norm": 0.5193183251460862, + "learning_rate": 4.287460532978027e-07, + "loss": 0.0179, + "step": 22290 + }, + { + "epoch": 2.6428444444444446, + "grad_norm": 0.569834937754304, + "learning_rate": 4.2595694014315016e-07, + "loss": 0.0173, + "step": 22300 + }, + { + "epoch": 2.6440296296296295, + "grad_norm": 0.5638331280692132, + "learning_rate": 4.2317652492494754e-07, + "loss": 0.017, + "step": 22310 + }, + { + "epoch": 2.645214814814815, + "grad_norm": 0.46501048855239424, + "learning_rate": 4.2040481293039445e-07, + "loss": 0.0163, + "step": 22320 + }, + { + "epoch": 2.6464, + "grad_norm": 0.5455056357115358, + "learning_rate": 4.1764180943013876e-07, + "loss": 0.0167, + "step": 22330 + }, + { + "epoch": 2.6475851851851853, + "grad_norm": 0.671931031025325, + "learning_rate": 4.148875196782698e-07, + "loss": 0.0176, + "step": 22340 + }, + { + "epoch": 2.64877037037037, + "grad_norm": 0.6138632752931288, + "learning_rate": 4.1214194891230574e-07, + "loss": 0.0171, + "step": 22350 + }, + { + "epoch": 2.6499555555555556, + "grad_norm": 0.5863915206209542, + "learning_rate": 4.09405102353187e-07, + "loss": 0.0167, + "step": 22360 + }, + { + "epoch": 2.6511407407407406, + "grad_norm": 0.6375120865575559, + "learning_rate": 4.0667698520526155e-07, + "loss": 0.0195, + "step": 22370 + }, + { + "epoch": 2.652325925925926, + "grad_norm": 0.45812451800245535, + "learning_rate": 4.039576026562786e-07, + "loss": 0.0182, + "step": 22380 + }, + { + "epoch": 2.6535111111111114, + "grad_norm": 0.5964665685487398, + "learning_rate": 4.012469598773788e-07, + "loss": 0.0191, + "step": 22390 + }, + { + "epoch": 2.6546962962962963, + "grad_norm": 0.5435465170232701, + "learning_rate": 3.985450620230841e-07, + "loss": 0.018, + "step": 22400 + }, + { + "epoch": 2.6558814814814813, + "grad_norm": 0.7120547530871477, + "learning_rate": 3.958519142312839e-07, + "loss": 0.0169, + "step": 22410 + }, + { + "epoch": 2.6570666666666667, + "grad_norm": 0.4859189935609332, + "learning_rate": 3.9316752162323056e-07, + "loss": 0.016, + "step": 22420 + }, + { + "epoch": 2.658251851851852, + "grad_norm": 0.4352345686757695, + "learning_rate": 3.9049188930352697e-07, + "loss": 0.0165, + "step": 22430 + }, + { + "epoch": 2.659437037037037, + "grad_norm": 0.609760821369753, + "learning_rate": 3.8782502236012045e-07, + "loss": 0.0176, + "step": 22440 + }, + { + "epoch": 2.660622222222222, + "grad_norm": 0.49217010291880653, + "learning_rate": 3.851669258642865e-07, + "loss": 0.0158, + "step": 22450 + }, + { + "epoch": 2.6618074074074074, + "grad_norm": 0.5329534770695694, + "learning_rate": 3.825176048706231e-07, + "loss": 0.0186, + "step": 22460 + }, + { + "epoch": 2.662992592592593, + "grad_norm": 0.5336945644131692, + "learning_rate": 3.7987706441704406e-07, + "loss": 0.0183, + "step": 22470 + }, + { + "epoch": 2.6641777777777778, + "grad_norm": 0.4577740337806618, + "learning_rate": 3.772453095247641e-07, + "loss": 0.0172, + "step": 22480 + }, + { + "epoch": 2.6653629629629627, + "grad_norm": 0.6782765453670982, + "learning_rate": 3.7462234519829167e-07, + "loss": 0.0175, + "step": 22490 + }, + { + "epoch": 2.666548148148148, + "grad_norm": 0.513927387748197, + "learning_rate": 3.7200817642541796e-07, + "loss": 0.0175, + "step": 22500 + }, + { + "epoch": 2.6677333333333335, + "grad_norm": 0.6202874387819673, + "learning_rate": 3.6940280817720997e-07, + "loss": 0.0177, + "step": 22510 + }, + { + "epoch": 2.6689185185185185, + "grad_norm": 0.4571232350885583, + "learning_rate": 3.668062454080007e-07, + "loss": 0.0163, + "step": 22520 + }, + { + "epoch": 2.670103703703704, + "grad_norm": 0.6740831615413266, + "learning_rate": 3.6421849305537716e-07, + "loss": 0.0167, + "step": 22530 + }, + { + "epoch": 2.671288888888889, + "grad_norm": 0.5103458560212962, + "learning_rate": 3.61639556040172e-07, + "loss": 0.0183, + "step": 22540 + }, + { + "epoch": 2.6724740740740742, + "grad_norm": 0.5499439087301622, + "learning_rate": 3.5906943926645674e-07, + "loss": 0.0167, + "step": 22550 + }, + { + "epoch": 2.673659259259259, + "grad_norm": 0.5628079092199192, + "learning_rate": 3.56508147621531e-07, + "loss": 0.0185, + "step": 22560 + }, + { + "epoch": 2.6748444444444446, + "grad_norm": 0.8432895281819219, + "learning_rate": 3.539556859759097e-07, + "loss": 0.0177, + "step": 22570 + }, + { + "epoch": 2.6760296296296295, + "grad_norm": 0.7208958963241069, + "learning_rate": 3.514120591833187e-07, + "loss": 0.0171, + "step": 22580 + }, + { + "epoch": 2.677214814814815, + "grad_norm": 0.5127175885306958, + "learning_rate": 3.488772720806821e-07, + "loss": 0.0155, + "step": 22590 + }, + { + "epoch": 2.6784, + "grad_norm": 0.48825468926756144, + "learning_rate": 3.463513294881171e-07, + "loss": 0.0157, + "step": 22600 + }, + { + "epoch": 2.6795851851851853, + "grad_norm": 0.6257502274449918, + "learning_rate": 3.438342362089209e-07, + "loss": 0.0192, + "step": 22610 + }, + { + "epoch": 2.6807703703703702, + "grad_norm": 0.48148993324565625, + "learning_rate": 3.413259970295613e-07, + "loss": 0.0166, + "step": 22620 + }, + { + "epoch": 2.6819555555555556, + "grad_norm": 0.6838219625893894, + "learning_rate": 3.38826616719673e-07, + "loss": 0.0193, + "step": 22630 + }, + { + "epoch": 2.6831407407407406, + "grad_norm": 0.8094063260874216, + "learning_rate": 3.3633610003204087e-07, + "loss": 0.0173, + "step": 22640 + }, + { + "epoch": 2.684325925925926, + "grad_norm": 0.547582980239583, + "learning_rate": 3.338544517025982e-07, + "loss": 0.0166, + "step": 22650 + }, + { + "epoch": 2.685511111111111, + "grad_norm": 0.5256113925858257, + "learning_rate": 3.313816764504124e-07, + "loss": 0.0179, + "step": 22660 + }, + { + "epoch": 2.6866962962962964, + "grad_norm": 0.5529086920972679, + "learning_rate": 3.289177789776776e-07, + "loss": 0.0153, + "step": 22670 + }, + { + "epoch": 2.6878814814814813, + "grad_norm": 0.4559626979943232, + "learning_rate": 3.2646276396970824e-07, + "loss": 0.0155, + "step": 22680 + }, + { + "epoch": 2.6890666666666667, + "grad_norm": 0.6207917578822382, + "learning_rate": 3.240166360949254e-07, + "loss": 0.0182, + "step": 22690 + }, + { + "epoch": 2.690251851851852, + "grad_norm": 0.4603030834764436, + "learning_rate": 3.2157940000485164e-07, + "loss": 0.0163, + "step": 22700 + }, + { + "epoch": 2.691437037037037, + "grad_norm": 0.43191313817098037, + "learning_rate": 3.191510603341025e-07, + "loss": 0.0173, + "step": 22710 + }, + { + "epoch": 2.692622222222222, + "grad_norm": 0.5963780564991046, + "learning_rate": 3.1673162170037243e-07, + "loss": 0.0171, + "step": 22720 + }, + { + "epoch": 2.6938074074074074, + "grad_norm": 0.5993675036379672, + "learning_rate": 3.143210887044351e-07, + "loss": 0.0147, + "step": 22730 + }, + { + "epoch": 2.694992592592593, + "grad_norm": 0.45612814100773097, + "learning_rate": 3.1191946593012447e-07, + "loss": 0.0148, + "step": 22740 + }, + { + "epoch": 2.696177777777778, + "grad_norm": 0.5689891280497064, + "learning_rate": 3.0952675794433393e-07, + "loss": 0.0181, + "step": 22750 + }, + { + "epoch": 2.6973629629629627, + "grad_norm": 0.5200536521008651, + "learning_rate": 3.0714296929700184e-07, + "loss": 0.0185, + "step": 22760 + }, + { + "epoch": 2.698548148148148, + "grad_norm": 0.5322954740029764, + "learning_rate": 3.0476810452110817e-07, + "loss": 0.0177, + "step": 22770 + }, + { + "epoch": 2.6997333333333335, + "grad_norm": 0.6285266638084286, + "learning_rate": 3.0240216813266446e-07, + "loss": 0.0182, + "step": 22780 + }, + { + "epoch": 2.7009185185185185, + "grad_norm": 0.556186447296804, + "learning_rate": 3.0004516463070065e-07, + "loss": 0.0187, + "step": 22790 + }, + { + "epoch": 2.7021037037037035, + "grad_norm": 0.6181022789062538, + "learning_rate": 2.976970984972616e-07, + "loss": 0.0168, + "step": 22800 + }, + { + "epoch": 2.703288888888889, + "grad_norm": 0.5306929150152081, + "learning_rate": 2.953579741973983e-07, + "loss": 0.0162, + "step": 22810 + }, + { + "epoch": 2.7044740740740743, + "grad_norm": 0.49216528502252066, + "learning_rate": 2.9302779617915554e-07, + "loss": 0.0166, + "step": 22820 + }, + { + "epoch": 2.705659259259259, + "grad_norm": 0.32464298100312095, + "learning_rate": 2.907065688735683e-07, + "loss": 0.0181, + "step": 22830 + }, + { + "epoch": 2.7068444444444446, + "grad_norm": 0.5581773424887209, + "learning_rate": 2.8839429669464846e-07, + "loss": 0.0164, + "step": 22840 + }, + { + "epoch": 2.7080296296296296, + "grad_norm": 0.48897977988184105, + "learning_rate": 2.8609098403938164e-07, + "loss": 0.0174, + "step": 22850 + }, + { + "epoch": 2.709214814814815, + "grad_norm": 0.4781983191944791, + "learning_rate": 2.837966352877164e-07, + "loss": 0.0158, + "step": 22860 + }, + { + "epoch": 2.7104, + "grad_norm": 0.6328193715799723, + "learning_rate": 2.8151125480255226e-07, + "loss": 0.0189, + "step": 22870 + }, + { + "epoch": 2.7115851851851853, + "grad_norm": 0.6355616843512417, + "learning_rate": 2.7923484692973735e-07, + "loss": 0.0145, + "step": 22880 + }, + { + "epoch": 2.7127703703703703, + "grad_norm": 0.5384442575779633, + "learning_rate": 2.769674159980579e-07, + "loss": 0.0137, + "step": 22890 + }, + { + "epoch": 2.7139555555555557, + "grad_norm": 0.5257990711711436, + "learning_rate": 2.7470896631922815e-07, + "loss": 0.0176, + "step": 22900 + }, + { + "epoch": 2.7151407407407406, + "grad_norm": 0.6028946351137432, + "learning_rate": 2.7245950218788455e-07, + "loss": 0.0199, + "step": 22910 + }, + { + "epoch": 2.716325925925926, + "grad_norm": 0.6096999928901118, + "learning_rate": 2.702190278815764e-07, + "loss": 0.0181, + "step": 22920 + }, + { + "epoch": 2.717511111111111, + "grad_norm": 0.5619062276042769, + "learning_rate": 2.679875476607591e-07, + "loss": 0.0172, + "step": 22930 + }, + { + "epoch": 2.7186962962962964, + "grad_norm": 0.7190051502794982, + "learning_rate": 2.657650657687844e-07, + "loss": 0.0158, + "step": 22940 + }, + { + "epoch": 2.7198814814814813, + "grad_norm": 0.5399562713131539, + "learning_rate": 2.635515864318922e-07, + "loss": 0.0166, + "step": 22950 + }, + { + "epoch": 2.7210666666666667, + "grad_norm": 0.6350567185639987, + "learning_rate": 2.613471138592044e-07, + "loss": 0.0183, + "step": 22960 + }, + { + "epoch": 2.7222518518518517, + "grad_norm": 0.44368713994260967, + "learning_rate": 2.5915165224271454e-07, + "loss": 0.0188, + "step": 22970 + }, + { + "epoch": 2.723437037037037, + "grad_norm": 0.6828455752911726, + "learning_rate": 2.569652057572825e-07, + "loss": 0.016, + "step": 22980 + }, + { + "epoch": 2.724622222222222, + "grad_norm": 0.6120963844436584, + "learning_rate": 2.5478777856062454e-07, + "loss": 0.0181, + "step": 22990 + }, + { + "epoch": 2.7258074074074075, + "grad_norm": 0.557913942818718, + "learning_rate": 2.526193747933048e-07, + "loss": 0.0201, + "step": 23000 + }, + { + "epoch": 2.726992592592593, + "grad_norm": 0.5931411968363004, + "learning_rate": 2.5045999857873036e-07, + "loss": 0.0157, + "step": 23010 + }, + { + "epoch": 2.728177777777778, + "grad_norm": 0.5589712722631065, + "learning_rate": 2.483096540231417e-07, + "loss": 0.0175, + "step": 23020 + }, + { + "epoch": 2.7293629629629628, + "grad_norm": 0.6251667845871217, + "learning_rate": 2.461683452156033e-07, + "loss": 0.0196, + "step": 23030 + }, + { + "epoch": 2.730548148148148, + "grad_norm": 0.5955504426850127, + "learning_rate": 2.440360762279975e-07, + "loss": 0.0178, + "step": 23040 + }, + { + "epoch": 2.7317333333333336, + "grad_norm": 0.6011971122442907, + "learning_rate": 2.4191285111501706e-07, + "loss": 0.0166, + "step": 23050 + }, + { + "epoch": 2.7329185185185185, + "grad_norm": 0.5364478580154213, + "learning_rate": 2.397986739141589e-07, + "loss": 0.0199, + "step": 23060 + }, + { + "epoch": 2.7341037037037035, + "grad_norm": 0.6608510493743801, + "learning_rate": 2.376935486457116e-07, + "loss": 0.017, + "step": 23070 + }, + { + "epoch": 2.735288888888889, + "grad_norm": 0.5779420989406509, + "learning_rate": 2.3559747931275189e-07, + "loss": 0.0163, + "step": 23080 + }, + { + "epoch": 2.7364740740740743, + "grad_norm": 0.4827757716026622, + "learning_rate": 2.3351046990113647e-07, + "loss": 0.017, + "step": 23090 + }, + { + "epoch": 2.7376592592592592, + "grad_norm": 0.6263917517490031, + "learning_rate": 2.314325243794935e-07, + "loss": 0.0177, + "step": 23100 + }, + { + "epoch": 2.738844444444444, + "grad_norm": 0.49606188057862205, + "learning_rate": 2.2936364669921495e-07, + "loss": 0.0185, + "step": 23110 + }, + { + "epoch": 2.7400296296296296, + "grad_norm": 0.5680872282032834, + "learning_rate": 2.2730384079444944e-07, + "loss": 0.0158, + "step": 23120 + }, + { + "epoch": 2.741214814814815, + "grad_norm": 0.5087943299401656, + "learning_rate": 2.2525311058209487e-07, + "loss": 0.0169, + "step": 23130 + }, + { + "epoch": 2.7424, + "grad_norm": 0.6026711741849468, + "learning_rate": 2.2321145996179238e-07, + "loss": 0.0168, + "step": 23140 + }, + { + "epoch": 2.7435851851851853, + "grad_norm": 0.584706616518232, + "learning_rate": 2.2117889281591587e-07, + "loss": 0.017, + "step": 23150 + }, + { + "epoch": 2.7447703703703703, + "grad_norm": 0.5196194189377914, + "learning_rate": 2.1915541300956522e-07, + "loss": 0.0161, + "step": 23160 + }, + { + "epoch": 2.7459555555555557, + "grad_norm": 0.4751551154335021, + "learning_rate": 2.1714102439056306e-07, + "loss": 0.0158, + "step": 23170 + }, + { + "epoch": 2.7471407407407407, + "grad_norm": 0.6840118273639366, + "learning_rate": 2.151357307894425e-07, + "loss": 0.0159, + "step": 23180 + }, + { + "epoch": 2.748325925925926, + "grad_norm": 0.5714794365600924, + "learning_rate": 2.131395360194416e-07, + "loss": 0.0178, + "step": 23190 + }, + { + "epoch": 2.749511111111111, + "grad_norm": 0.5051383517587212, + "learning_rate": 2.111524438764967e-07, + "loss": 0.0158, + "step": 23200 + }, + { + "epoch": 2.7506962962962964, + "grad_norm": 0.5589906619467518, + "learning_rate": 2.0917445813923298e-07, + "loss": 0.0172, + "step": 23210 + }, + { + "epoch": 2.7518814814814814, + "grad_norm": 0.3092318250707791, + "learning_rate": 2.0720558256896283e-07, + "loss": 0.0163, + "step": 23220 + }, + { + "epoch": 2.7530666666666668, + "grad_norm": 0.6604218732894236, + "learning_rate": 2.0524582090967137e-07, + "loss": 0.017, + "step": 23230 + }, + { + "epoch": 2.7542518518518517, + "grad_norm": 0.40989607364580555, + "learning_rate": 2.032951768880137e-07, + "loss": 0.0185, + "step": 23240 + }, + { + "epoch": 2.755437037037037, + "grad_norm": 0.6234290041495085, + "learning_rate": 2.0135365421330765e-07, + "loss": 0.0186, + "step": 23250 + }, + { + "epoch": 2.756622222222222, + "grad_norm": 0.5508048860616999, + "learning_rate": 1.9942125657752554e-07, + "loss": 0.0156, + "step": 23260 + }, + { + "epoch": 2.7578074074074075, + "grad_norm": 0.41925814638682407, + "learning_rate": 1.974979876552885e-07, + "loss": 0.0154, + "step": 23270 + }, + { + "epoch": 2.7589925925925924, + "grad_norm": 0.5887599680600968, + "learning_rate": 1.955838511038577e-07, + "loss": 0.0177, + "step": 23280 + }, + { + "epoch": 2.760177777777778, + "grad_norm": 0.817375285032308, + "learning_rate": 1.9367885056312652e-07, + "loss": 0.0184, + "step": 23290 + }, + { + "epoch": 2.761362962962963, + "grad_norm": 0.4650271384049185, + "learning_rate": 1.9178298965562002e-07, + "loss": 0.015, + "step": 23300 + }, + { + "epoch": 2.762548148148148, + "grad_norm": 0.6200371376448504, + "learning_rate": 1.8989627198647942e-07, + "loss": 0.0179, + "step": 23310 + }, + { + "epoch": 2.7637333333333336, + "grad_norm": 0.45420199429625285, + "learning_rate": 1.8801870114346143e-07, + "loss": 0.0146, + "step": 23320 + }, + { + "epoch": 2.7649185185185186, + "grad_norm": 0.6152694272512292, + "learning_rate": 1.8615028069692788e-07, + "loss": 0.0158, + "step": 23330 + }, + { + "epoch": 2.7661037037037035, + "grad_norm": 0.751203655151455, + "learning_rate": 1.8429101419984108e-07, + "loss": 0.0167, + "step": 23340 + }, + { + "epoch": 2.767288888888889, + "grad_norm": 0.5925673251198694, + "learning_rate": 1.8244090518775736e-07, + "loss": 0.0183, + "step": 23350 + }, + { + "epoch": 2.7684740740740743, + "grad_norm": 0.603436719080171, + "learning_rate": 1.8059995717881696e-07, + "loss": 0.0167, + "step": 23360 + }, + { + "epoch": 2.7696592592592593, + "grad_norm": 0.5048056625816433, + "learning_rate": 1.7876817367374122e-07, + "loss": 0.017, + "step": 23370 + }, + { + "epoch": 2.770844444444444, + "grad_norm": 0.5046099810218734, + "learning_rate": 1.7694555815582382e-07, + "loss": 0.0146, + "step": 23380 + }, + { + "epoch": 2.7720296296296296, + "grad_norm": 0.6519202532330443, + "learning_rate": 1.7513211409092512e-07, + "loss": 0.0165, + "step": 23390 + }, + { + "epoch": 2.773214814814815, + "grad_norm": 0.8475286050857711, + "learning_rate": 1.7332784492746613e-07, + "loss": 0.0169, + "step": 23400 + }, + { + "epoch": 2.7744, + "grad_norm": 0.49356891024625155, + "learning_rate": 1.7153275409641846e-07, + "loss": 0.0183, + "step": 23410 + }, + { + "epoch": 2.775585185185185, + "grad_norm": 0.6423043758056829, + "learning_rate": 1.6974684501130213e-07, + "loss": 0.0162, + "step": 23420 + }, + { + "epoch": 2.7767703703703703, + "grad_norm": 0.6958336868335069, + "learning_rate": 1.6797012106817835e-07, + "loss": 0.0173, + "step": 23430 + }, + { + "epoch": 2.7779555555555557, + "grad_norm": 0.6514600118533722, + "learning_rate": 1.662025856456384e-07, + "loss": 0.0164, + "step": 23440 + }, + { + "epoch": 2.7791407407407407, + "grad_norm": 0.4595749847427966, + "learning_rate": 1.644442421048048e-07, + "loss": 0.0153, + "step": 23450 + }, + { + "epoch": 2.780325925925926, + "grad_norm": 0.7014126172602996, + "learning_rate": 1.6269509378931735e-07, + "loss": 0.0191, + "step": 23460 + }, + { + "epoch": 2.781511111111111, + "grad_norm": 0.4983839899893056, + "learning_rate": 1.6095514402533263e-07, + "loss": 0.0162, + "step": 23470 + }, + { + "epoch": 2.7826962962962964, + "grad_norm": 0.5959014383056567, + "learning_rate": 1.592243961215162e-07, + "loss": 0.0168, + "step": 23480 + }, + { + "epoch": 2.7838814814814814, + "grad_norm": 0.5668109210434008, + "learning_rate": 1.5750285336903314e-07, + "loss": 0.0161, + "step": 23490 + }, + { + "epoch": 2.785066666666667, + "grad_norm": 0.5751938851139909, + "learning_rate": 1.557905190415443e-07, + "loss": 0.017, + "step": 23500 + }, + { + "epoch": 2.7862518518518518, + "grad_norm": 0.5617253930658811, + "learning_rate": 1.540873963952022e-07, + "loss": 0.0176, + "step": 23510 + }, + { + "epoch": 2.787437037037037, + "grad_norm": 0.44058201089023974, + "learning_rate": 1.5239348866864067e-07, + "loss": 0.015, + "step": 23520 + }, + { + "epoch": 2.788622222222222, + "grad_norm": 0.5411531210936671, + "learning_rate": 1.5070879908297086e-07, + "loss": 0.0182, + "step": 23530 + }, + { + "epoch": 2.7898074074074075, + "grad_norm": 0.4955496049266531, + "learning_rate": 1.4903333084177352e-07, + "loss": 0.0173, + "step": 23540 + }, + { + "epoch": 2.7909925925925925, + "grad_norm": 0.5460026211959926, + "learning_rate": 1.4736708713109783e-07, + "loss": 0.0143, + "step": 23550 + }, + { + "epoch": 2.792177777777778, + "grad_norm": 0.5586886441930353, + "learning_rate": 1.4571007111944924e-07, + "loss": 0.016, + "step": 23560 + }, + { + "epoch": 2.793362962962963, + "grad_norm": 0.5241678615965204, + "learning_rate": 1.440622859577856e-07, + "loss": 0.0166, + "step": 23570 + }, + { + "epoch": 2.7945481481481482, + "grad_norm": 0.48602589301351173, + "learning_rate": 1.4242373477951155e-07, + "loss": 0.0171, + "step": 23580 + }, + { + "epoch": 2.795733333333333, + "grad_norm": 0.69943218249404, + "learning_rate": 1.4079442070047523e-07, + "loss": 0.0171, + "step": 23590 + }, + { + "epoch": 2.7969185185185186, + "grad_norm": 0.6482562437170073, + "learning_rate": 1.3917434681895548e-07, + "loss": 0.0177, + "step": 23600 + }, + { + "epoch": 2.7981037037037035, + "grad_norm": 0.6282925922368169, + "learning_rate": 1.3756351621566355e-07, + "loss": 0.0179, + "step": 23610 + }, + { + "epoch": 2.799288888888889, + "grad_norm": 0.6625097023367825, + "learning_rate": 1.359619319537314e-07, + "loss": 0.0177, + "step": 23620 + }, + { + "epoch": 2.8004740740740743, + "grad_norm": 0.4700158500845828, + "learning_rate": 1.3436959707870956e-07, + "loss": 0.0173, + "step": 23630 + }, + { + "epoch": 2.8016592592592593, + "grad_norm": 0.6880727786582241, + "learning_rate": 1.3278651461856084e-07, + "loss": 0.0162, + "step": 23640 + }, + { + "epoch": 2.8028444444444442, + "grad_norm": 0.7156278628541548, + "learning_rate": 1.3121268758365224e-07, + "loss": 0.0189, + "step": 23650 + }, + { + "epoch": 2.8040296296296296, + "grad_norm": 0.5556669593471263, + "learning_rate": 1.2964811896675034e-07, + "loss": 0.0158, + "step": 23660 + }, + { + "epoch": 2.805214814814815, + "grad_norm": 0.5781255382339048, + "learning_rate": 1.2809281174301747e-07, + "loss": 0.0165, + "step": 23670 + }, + { + "epoch": 2.8064, + "grad_norm": 0.8316892911189332, + "learning_rate": 1.2654676887000504e-07, + "loss": 0.0178, + "step": 23680 + }, + { + "epoch": 2.807585185185185, + "grad_norm": 0.572177601547204, + "learning_rate": 1.2500999328764586e-07, + "loss": 0.0167, + "step": 23690 + }, + { + "epoch": 2.8087703703703704, + "grad_norm": 0.5618758606010577, + "learning_rate": 1.2348248791825e-07, + "loss": 0.0188, + "step": 23700 + }, + { + "epoch": 2.8099555555555558, + "grad_norm": 0.7582104032629382, + "learning_rate": 1.2196425566650184e-07, + "loss": 0.0193, + "step": 23710 + }, + { + "epoch": 2.8111407407407407, + "grad_norm": 0.5130479103641089, + "learning_rate": 1.2045529941945077e-07, + "loss": 0.0155, + "step": 23720 + }, + { + "epoch": 2.8123259259259257, + "grad_norm": 0.6502772297409193, + "learning_rate": 1.1895562204650546e-07, + "loss": 0.0184, + "step": 23730 + }, + { + "epoch": 2.813511111111111, + "grad_norm": 0.6072052992417779, + "learning_rate": 1.1746522639943304e-07, + "loss": 0.0165, + "step": 23740 + }, + { + "epoch": 2.8146962962962965, + "grad_norm": 0.5420393938088142, + "learning_rate": 1.1598411531234755e-07, + "loss": 0.0168, + "step": 23750 + }, + { + "epoch": 2.8158814814814814, + "grad_norm": 0.46995870490733455, + "learning_rate": 1.1451229160171051e-07, + "loss": 0.0164, + "step": 23760 + }, + { + "epoch": 2.817066666666667, + "grad_norm": 0.6229276720892111, + "learning_rate": 1.130497580663209e-07, + "loss": 0.0173, + "step": 23770 + }, + { + "epoch": 2.818251851851852, + "grad_norm": 0.4008196510899579, + "learning_rate": 1.1159651748731126e-07, + "loss": 0.0151, + "step": 23780 + }, + { + "epoch": 2.819437037037037, + "grad_norm": 0.5797283202018212, + "learning_rate": 1.1015257262814493e-07, + "loss": 0.0162, + "step": 23790 + }, + { + "epoch": 2.820622222222222, + "grad_norm": 0.46855512418386264, + "learning_rate": 1.0871792623460664e-07, + "loss": 0.0178, + "step": 23800 + }, + { + "epoch": 2.8218074074074075, + "grad_norm": 0.5587214472542532, + "learning_rate": 1.0729258103479967e-07, + "loss": 0.0169, + "step": 23810 + }, + { + "epoch": 2.8229925925925925, + "grad_norm": 0.5863900549217356, + "learning_rate": 1.0587653973914147e-07, + "loss": 0.0177, + "step": 23820 + }, + { + "epoch": 2.824177777777778, + "grad_norm": 0.5710151595819755, + "learning_rate": 1.0446980504035476e-07, + "loss": 0.015, + "step": 23830 + }, + { + "epoch": 2.825362962962963, + "grad_norm": 0.5318866091985665, + "learning_rate": 1.0307237961346861e-07, + "loss": 0.0166, + "step": 23840 + }, + { + "epoch": 2.8265481481481483, + "grad_norm": 0.5280222084130962, + "learning_rate": 1.0168426611580629e-07, + "loss": 0.0184, + "step": 23850 + }, + { + "epoch": 2.827733333333333, + "grad_norm": 0.6034509847198821, + "learning_rate": 1.0030546718698575e-07, + "loss": 0.0157, + "step": 23860 + }, + { + "epoch": 2.8289185185185186, + "grad_norm": 0.5683031124940099, + "learning_rate": 9.893598544891192e-08, + "loss": 0.0164, + "step": 23870 + }, + { + "epoch": 2.8301037037037036, + "grad_norm": 0.6889455979805238, + "learning_rate": 9.757582350577111e-08, + "loss": 0.0171, + "step": 23880 + }, + { + "epoch": 2.831288888888889, + "grad_norm": 0.5635138635299503, + "learning_rate": 9.622498394402934e-08, + "loss": 0.0167, + "step": 23890 + }, + { + "epoch": 2.832474074074074, + "grad_norm": 0.6020099574580935, + "learning_rate": 9.48834693324241e-08, + "loss": 0.0159, + "step": 23900 + }, + { + "epoch": 2.8336592592592593, + "grad_norm": 0.6375362753351225, + "learning_rate": 9.35512822219603e-08, + "loss": 0.0166, + "step": 23910 + }, + { + "epoch": 2.8348444444444443, + "grad_norm": 0.49489541474359744, + "learning_rate": 9.222842514590713e-08, + "loss": 0.0158, + "step": 23920 + }, + { + "epoch": 2.8360296296296297, + "grad_norm": 0.7064656880315627, + "learning_rate": 9.091490061979014e-08, + "loss": 0.0162, + "step": 23930 + }, + { + "epoch": 2.837214814814815, + "grad_norm": 0.5192228424462412, + "learning_rate": 8.961071114139075e-08, + "loss": 0.0151, + "step": 23940 + }, + { + "epoch": 2.8384, + "grad_norm": 0.624949159483972, + "learning_rate": 8.831585919073627e-08, + "loss": 0.017, + "step": 23950 + }, + { + "epoch": 2.839585185185185, + "grad_norm": 0.3495923820761073, + "learning_rate": 8.703034723009873e-08, + "loss": 0.0161, + "step": 23960 + }, + { + "epoch": 2.8407703703703704, + "grad_norm": 0.5710223194340618, + "learning_rate": 8.575417770399109e-08, + "loss": 0.0156, + "step": 23970 + }, + { + "epoch": 2.841955555555556, + "grad_norm": 0.8247218018631985, + "learning_rate": 8.448735303915879e-08, + "loss": 0.0172, + "step": 23980 + }, + { + "epoch": 2.8431407407407407, + "grad_norm": 0.3861681904504029, + "learning_rate": 8.322987564457818e-08, + "loss": 0.0144, + "step": 23990 + }, + { + "epoch": 2.8443259259259257, + "grad_norm": 0.5909122223748874, + "learning_rate": 8.198174791144986e-08, + "loss": 0.0155, + "step": 24000 + }, + { + "epoch": 2.845511111111111, + "grad_norm": 0.5328819807185622, + "learning_rate": 8.074297221319694e-08, + "loss": 0.0163, + "step": 24010 + }, + { + "epoch": 2.8466962962962965, + "grad_norm": 0.5598500490202744, + "learning_rate": 7.951355090545787e-08, + "loss": 0.0163, + "step": 24020 + }, + { + "epoch": 2.8478814814814815, + "grad_norm": 0.6768782769491176, + "learning_rate": 7.829348632608314e-08, + "loss": 0.0167, + "step": 24030 + }, + { + "epoch": 2.8490666666666664, + "grad_norm": 0.6116674476283963, + "learning_rate": 7.708278079513021e-08, + "loss": 0.0181, + "step": 24040 + }, + { + "epoch": 2.850251851851852, + "grad_norm": 0.5858764888773094, + "learning_rate": 7.588143661486025e-08, + "loss": 0.016, + "step": 24050 + }, + { + "epoch": 2.851437037037037, + "grad_norm": 0.5092432128430017, + "learning_rate": 7.468945606973254e-08, + "loss": 0.0173, + "step": 24060 + }, + { + "epoch": 2.852622222222222, + "grad_norm": 0.6061761093394289, + "learning_rate": 7.350684142640008e-08, + "loss": 0.0154, + "step": 24070 + }, + { + "epoch": 2.8538074074074076, + "grad_norm": 0.6773936782955857, + "learning_rate": 7.233359493370673e-08, + "loss": 0.0159, + "step": 24080 + }, + { + "epoch": 2.8549925925925925, + "grad_norm": 0.8124479635404589, + "learning_rate": 7.11697188226812e-08, + "loss": 0.0191, + "step": 24090 + }, + { + "epoch": 2.856177777777778, + "grad_norm": 0.5976595288941503, + "learning_rate": 7.001521530653533e-08, + "loss": 0.0179, + "step": 24100 + }, + { + "epoch": 2.857362962962963, + "grad_norm": 0.49519100271791844, + "learning_rate": 6.887008658065631e-08, + "loss": 0.0162, + "step": 24110 + }, + { + "epoch": 2.8585481481481483, + "grad_norm": 0.6200909844159384, + "learning_rate": 6.773433482260394e-08, + "loss": 0.0178, + "step": 24120 + }, + { + "epoch": 2.8597333333333332, + "grad_norm": 0.6527282945023417, + "learning_rate": 6.660796219210897e-08, + "loss": 0.0163, + "step": 24130 + }, + { + "epoch": 2.8609185185185186, + "grad_norm": 0.5283155539315686, + "learning_rate": 6.549097083106582e-08, + "loss": 0.0156, + "step": 24140 + }, + { + "epoch": 2.8621037037037036, + "grad_norm": 0.5314188447916525, + "learning_rate": 6.43833628635293e-08, + "loss": 0.0152, + "step": 24150 + }, + { + "epoch": 2.863288888888889, + "grad_norm": 0.5165313753051062, + "learning_rate": 6.328514039571133e-08, + "loss": 0.015, + "step": 24160 + }, + { + "epoch": 2.864474074074074, + "grad_norm": 0.6981501133944453, + "learning_rate": 6.219630551597633e-08, + "loss": 0.017, + "step": 24170 + }, + { + "epoch": 2.8656592592592594, + "grad_norm": 0.5857384358510265, + "learning_rate": 6.11168602948381e-08, + "loss": 0.0167, + "step": 24180 + }, + { + "epoch": 2.8668444444444443, + "grad_norm": 0.4979088167867476, + "learning_rate": 6.004680678495412e-08, + "loss": 0.0172, + "step": 24190 + }, + { + "epoch": 2.8680296296296297, + "grad_norm": 0.46960980353248033, + "learning_rate": 5.898614702112282e-08, + "loss": 0.0154, + "step": 24200 + }, + { + "epoch": 2.8692148148148147, + "grad_norm": 0.4839037710464162, + "learning_rate": 5.7934883020281385e-08, + "loss": 0.0162, + "step": 24210 + }, + { + "epoch": 2.8704, + "grad_norm": 0.5710498559029004, + "learning_rate": 5.689301678149739e-08, + "loss": 0.0174, + "step": 24220 + }, + { + "epoch": 2.871585185185185, + "grad_norm": 0.7715704542703485, + "learning_rate": 5.5860550285969925e-08, + "loss": 0.0155, + "step": 24230 + }, + { + "epoch": 2.8727703703703704, + "grad_norm": 0.6549835873997798, + "learning_rate": 5.4837485497021836e-08, + "loss": 0.0175, + "step": 24240 + }, + { + "epoch": 2.873955555555556, + "grad_norm": 0.6368264840592855, + "learning_rate": 5.3823824360099695e-08, + "loss": 0.0164, + "step": 24250 + }, + { + "epoch": 2.8751407407407408, + "grad_norm": 0.5693220519768152, + "learning_rate": 5.281956880276773e-08, + "loss": 0.0158, + "step": 24260 + }, + { + "epoch": 2.8763259259259257, + "grad_norm": 0.451065093785236, + "learning_rate": 5.1824720734703904e-08, + "loss": 0.0159, + "step": 24270 + }, + { + "epoch": 2.877511111111111, + "grad_norm": 0.6882340036966536, + "learning_rate": 5.0839282047697166e-08, + "loss": 0.0179, + "step": 24280 + }, + { + "epoch": 2.8786962962962965, + "grad_norm": 0.6484915264109841, + "learning_rate": 4.9863254615643567e-08, + "loss": 0.018, + "step": 24290 + }, + { + "epoch": 2.8798814814814815, + "grad_norm": 0.7041913458246905, + "learning_rate": 4.889664029454455e-08, + "loss": 0.0171, + "step": 24300 + }, + { + "epoch": 2.8810666666666664, + "grad_norm": 0.6484233749915392, + "learning_rate": 4.7939440922499246e-08, + "loss": 0.0168, + "step": 24310 + }, + { + "epoch": 2.882251851851852, + "grad_norm": 0.5761611738707825, + "learning_rate": 4.699165831970498e-08, + "loss": 0.0154, + "step": 24320 + }, + { + "epoch": 2.8834370370370372, + "grad_norm": 0.5150081576364847, + "learning_rate": 4.605329428845229e-08, + "loss": 0.0172, + "step": 24330 + }, + { + "epoch": 2.884622222222222, + "grad_norm": 0.736701280742616, + "learning_rate": 4.512435061312104e-08, + "loss": 0.0192, + "step": 24340 + }, + { + "epoch": 2.885807407407407, + "grad_norm": 0.5374835822662828, + "learning_rate": 4.420482906017709e-08, + "loss": 0.0144, + "step": 24350 + }, + { + "epoch": 2.8869925925925926, + "grad_norm": 0.5026084270087791, + "learning_rate": 4.3294731378170084e-08, + "loss": 0.0164, + "step": 24360 + }, + { + "epoch": 2.888177777777778, + "grad_norm": 0.5409240192705762, + "learning_rate": 4.2394059297728995e-08, + "loss": 0.0163, + "step": 24370 + }, + { + "epoch": 2.889362962962963, + "grad_norm": 0.6043113925441805, + "learning_rate": 4.1502814531559356e-08, + "loss": 0.0159, + "step": 24380 + }, + { + "epoch": 2.8905481481481483, + "grad_norm": 0.5175666423448382, + "learning_rate": 4.0620998774439924e-08, + "loss": 0.0163, + "step": 24390 + }, + { + "epoch": 2.8917333333333333, + "grad_norm": 0.5385813279903557, + "learning_rate": 3.974861370321881e-08, + "loss": 0.0162, + "step": 24400 + }, + { + "epoch": 2.8929185185185187, + "grad_norm": 0.5580944229468465, + "learning_rate": 3.888566097681123e-08, + "loss": 0.0164, + "step": 24410 + }, + { + "epoch": 2.8941037037037036, + "grad_norm": 0.49545452069407303, + "learning_rate": 3.803214223619733e-08, + "loss": 0.0147, + "step": 24420 + }, + { + "epoch": 2.895288888888889, + "grad_norm": 0.5715448334029454, + "learning_rate": 3.718805910441492e-08, + "loss": 0.0148, + "step": 24430 + }, + { + "epoch": 2.896474074074074, + "grad_norm": 0.6057299346285372, + "learning_rate": 3.635341318656116e-08, + "loss": 0.016, + "step": 24440 + }, + { + "epoch": 2.8976592592592594, + "grad_norm": 0.5284054439199934, + "learning_rate": 3.552820606978757e-08, + "loss": 0.0177, + "step": 24450 + }, + { + "epoch": 2.8988444444444443, + "grad_norm": 0.5129698528135394, + "learning_rate": 3.47124393232956e-08, + "loss": 0.0161, + "step": 24460 + }, + { + "epoch": 2.9000296296296297, + "grad_norm": 0.5146787976503923, + "learning_rate": 3.3906114498336584e-08, + "loss": 0.0154, + "step": 24470 + }, + { + "epoch": 2.9012148148148147, + "grad_norm": 0.5140974899889613, + "learning_rate": 3.3109233128206795e-08, + "loss": 0.0172, + "step": 24480 + }, + { + "epoch": 2.9024, + "grad_norm": 0.6767296358528045, + "learning_rate": 3.232179672824409e-08, + "loss": 0.0166, + "step": 24490 + }, + { + "epoch": 2.903585185185185, + "grad_norm": 0.8378371116054654, + "learning_rate": 3.154380679582625e-08, + "loss": 0.0172, + "step": 24500 + }, + { + "epoch": 2.9047703703703704, + "grad_norm": 0.6451680075290084, + "learning_rate": 3.077526481036874e-08, + "loss": 0.0164, + "step": 24510 + }, + { + "epoch": 2.9059555555555554, + "grad_norm": 0.3349324791713931, + "learning_rate": 3.0016172233320874e-08, + "loss": 0.0173, + "step": 24520 + }, + { + "epoch": 2.907140740740741, + "grad_norm": 0.578570728807988, + "learning_rate": 2.926653050816075e-08, + "loss": 0.0182, + "step": 24530 + }, + { + "epoch": 2.9083259259259258, + "grad_norm": 0.5000505966884319, + "learning_rate": 2.8526341060398088e-08, + "loss": 0.016, + "step": 24540 + }, + { + "epoch": 2.909511111111111, + "grad_norm": 0.5381103410094421, + "learning_rate": 2.779560529756642e-08, + "loss": 0.0162, + "step": 24550 + }, + { + "epoch": 2.9106962962962966, + "grad_norm": 0.3671637354012208, + "learning_rate": 2.7074324609222547e-08, + "loss": 0.0157, + "step": 24560 + }, + { + "epoch": 2.9118814814814815, + "grad_norm": 0.4515150195230591, + "learning_rate": 2.6362500366943767e-08, + "loss": 0.0151, + "step": 24570 + }, + { + "epoch": 2.9130666666666665, + "grad_norm": 0.5705747626911236, + "learning_rate": 2.5660133924324537e-08, + "loss": 0.0192, + "step": 24580 + }, + { + "epoch": 2.914251851851852, + "grad_norm": 0.5437557684872543, + "learning_rate": 2.496722661697648e-08, + "loss": 0.0146, + "step": 24590 + }, + { + "epoch": 2.9154370370370373, + "grad_norm": 0.6602699080095603, + "learning_rate": 2.428377976252172e-08, + "loss": 0.016, + "step": 24600 + }, + { + "epoch": 2.9166222222222222, + "grad_norm": 0.6104992900519459, + "learning_rate": 2.3609794660592877e-08, + "loss": 0.0177, + "step": 24610 + }, + { + "epoch": 2.917807407407407, + "grad_norm": 0.5525277045559269, + "learning_rate": 2.2945272592830858e-08, + "loss": 0.0166, + "step": 24620 + }, + { + "epoch": 2.9189925925925926, + "grad_norm": 0.5886038288281357, + "learning_rate": 2.229021482288152e-08, + "loss": 0.0193, + "step": 24630 + }, + { + "epoch": 2.920177777777778, + "grad_norm": 0.6711393203152322, + "learning_rate": 2.1644622596393994e-08, + "loss": 0.0162, + "step": 24640 + }, + { + "epoch": 2.921362962962963, + "grad_norm": 0.45041506476044624, + "learning_rate": 2.1008497141017382e-08, + "loss": 0.016, + "step": 24650 + }, + { + "epoch": 2.922548148148148, + "grad_norm": 0.6798601293739935, + "learning_rate": 2.0381839666398508e-08, + "loss": 0.0167, + "step": 24660 + }, + { + "epoch": 2.9237333333333333, + "grad_norm": 0.47277047034364483, + "learning_rate": 1.976465136418082e-08, + "loss": 0.016, + "step": 24670 + }, + { + "epoch": 2.9249185185185187, + "grad_norm": 0.5141826359565288, + "learning_rate": 1.9156933408001066e-08, + "loss": 0.0157, + "step": 24680 + }, + { + "epoch": 2.9261037037037037, + "grad_norm": 0.5528612372626126, + "learning_rate": 1.8558686953486503e-08, + "loss": 0.0147, + "step": 24690 + }, + { + "epoch": 2.927288888888889, + "grad_norm": 0.5659139736954499, + "learning_rate": 1.796991313825491e-08, + "loss": 0.0161, + "step": 24700 + }, + { + "epoch": 2.928474074074074, + "grad_norm": 0.6033772942545749, + "learning_rate": 1.7390613081910702e-08, + "loss": 0.017, + "step": 24710 + }, + { + "epoch": 2.9296592592592594, + "grad_norm": 0.6439271250362896, + "learning_rate": 1.6820787886042134e-08, + "loss": 0.0168, + "step": 24720 + }, + { + "epoch": 2.9308444444444444, + "grad_norm": 0.7152694904135216, + "learning_rate": 1.6260438634220775e-08, + "loss": 0.0167, + "step": 24730 + }, + { + "epoch": 2.9320296296296298, + "grad_norm": 0.6656290893537492, + "learning_rate": 1.5709566391999275e-08, + "loss": 0.0159, + "step": 24740 + }, + { + "epoch": 2.9332148148148147, + "grad_norm": 0.5699462608892579, + "learning_rate": 1.5168172206908582e-08, + "loss": 0.0168, + "step": 24750 + }, + { + "epoch": 2.9344, + "grad_norm": 0.4624562892237229, + "learning_rate": 1.4636257108456286e-08, + "loss": 0.0168, + "step": 24760 + }, + { + "epoch": 2.935585185185185, + "grad_norm": 0.5308880655766132, + "learning_rate": 1.4113822108124953e-08, + "loss": 0.0167, + "step": 24770 + }, + { + "epoch": 2.9367703703703705, + "grad_norm": 0.6806466306029003, + "learning_rate": 1.3600868199369344e-08, + "loss": 0.0179, + "step": 24780 + }, + { + "epoch": 2.9379555555555554, + "grad_norm": 0.5719645729463473, + "learning_rate": 1.309739635761531e-08, + "loss": 0.0159, + "step": 24790 + }, + { + "epoch": 2.939140740740741, + "grad_norm": 0.6523860091106538, + "learning_rate": 1.2603407540258127e-08, + "loss": 0.0165, + "step": 24800 + }, + { + "epoch": 2.940325925925926, + "grad_norm": 0.5894060850266579, + "learning_rate": 1.2118902686659717e-08, + "loss": 0.0174, + "step": 24810 + }, + { + "epoch": 2.941511111111111, + "grad_norm": 0.5738391637533752, + "learning_rate": 1.1643882718148648e-08, + "loss": 0.0177, + "step": 24820 + }, + { + "epoch": 2.942696296296296, + "grad_norm": 0.6638923418593473, + "learning_rate": 1.1178348538015138e-08, + "loss": 0.0159, + "step": 24830 + }, + { + "epoch": 2.9438814814814815, + "grad_norm": 0.5647052149594103, + "learning_rate": 1.0722301031513282e-08, + "loss": 0.0166, + "step": 24840 + }, + { + "epoch": 2.9450666666666665, + "grad_norm": 0.5233472219687685, + "learning_rate": 1.0275741065856604e-08, + "loss": 0.0169, + "step": 24850 + }, + { + "epoch": 2.946251851851852, + "grad_norm": 0.7171188069712527, + "learning_rate": 9.838669490216945e-09, + "loss": 0.0182, + "step": 24860 + }, + { + "epoch": 2.9474370370370373, + "grad_norm": 0.600770659012363, + "learning_rate": 9.411087135723362e-09, + "loss": 0.017, + "step": 24870 + }, + { + "epoch": 2.9486222222222223, + "grad_norm": 0.5633870929194631, + "learning_rate": 8.99299481546101e-09, + "loss": 0.0173, + "step": 24880 + }, + { + "epoch": 2.949807407407407, + "grad_norm": 0.4735586805047879, + "learning_rate": 8.584393324468365e-09, + "loss": 0.0161, + "step": 24890 + }, + { + "epoch": 2.9509925925925926, + "grad_norm": 0.7390374651538125, + "learning_rate": 8.185283439735569e-09, + "loss": 0.0159, + "step": 24900 + }, + { + "epoch": 2.952177777777778, + "grad_norm": 0.6411091422246836, + "learning_rate": 7.795665920205531e-09, + "loss": 0.018, + "step": 24910 + }, + { + "epoch": 2.953362962962963, + "grad_norm": 0.6960313565978423, + "learning_rate": 7.415541506768931e-09, + "loss": 0.0188, + "step": 24920 + }, + { + "epoch": 2.954548148148148, + "grad_norm": 0.5757497526401953, + "learning_rate": 7.044910922264781e-09, + "loss": 0.0158, + "step": 24930 + }, + { + "epoch": 2.9557333333333333, + "grad_norm": 0.5141356521997997, + "learning_rate": 6.6837748714793095e-09, + "loss": 0.0154, + "step": 24940 + }, + { + "epoch": 2.9569185185185187, + "grad_norm": 0.544039733775537, + "learning_rate": 6.332134041143745e-09, + "loss": 0.0173, + "step": 24950 + }, + { + "epoch": 2.9581037037037037, + "grad_norm": 0.573620916072217, + "learning_rate": 5.989989099933757e-09, + "loss": 0.0167, + "step": 24960 + }, + { + "epoch": 2.9592888888888886, + "grad_norm": 0.7128676484272533, + "learning_rate": 5.657340698466684e-09, + "loss": 0.0174, + "step": 24970 + }, + { + "epoch": 2.960474074074074, + "grad_norm": 0.635586370758074, + "learning_rate": 5.334189469302642e-09, + "loss": 0.0162, + "step": 24980 + }, + { + "epoch": 2.9616592592592594, + "grad_norm": 0.5069301117432712, + "learning_rate": 5.0205360269411916e-09, + "loss": 0.0152, + "step": 24990 + }, + { + "epoch": 2.9628444444444444, + "grad_norm": 0.5774614011438758, + "learning_rate": 4.716380967821344e-09, + "loss": 0.0183, + "step": 25000 + }, + { + "epoch": 2.96402962962963, + "grad_norm": 0.563634931843291, + "learning_rate": 4.421724870320443e-09, + "loss": 0.0166, + "step": 25010 + }, + { + "epoch": 2.9652148148148147, + "grad_norm": 0.6115473574967998, + "learning_rate": 4.1365682947525074e-09, + "loss": 0.0145, + "step": 25020 + }, + { + "epoch": 2.9664, + "grad_norm": 0.5380579078625547, + "learning_rate": 3.860911783366561e-09, + "loss": 0.0189, + "step": 25030 + }, + { + "epoch": 2.967585185185185, + "grad_norm": 0.5822851026345084, + "learning_rate": 3.594755860347743e-09, + "loss": 0.0167, + "step": 25040 + }, + { + "epoch": 2.9687703703703705, + "grad_norm": 0.4950849801867146, + "learning_rate": 3.3381010318139783e-09, + "loss": 0.0182, + "step": 25050 + }, + { + "epoch": 2.9699555555555555, + "grad_norm": 0.5265732260876559, + "learning_rate": 3.090947785817089e-09, + "loss": 0.0149, + "step": 25060 + }, + { + "epoch": 2.971140740740741, + "grad_norm": 0.5169416930697963, + "learning_rate": 2.8532965923400158e-09, + "loss": 0.0149, + "step": 25070 + }, + { + "epoch": 2.972325925925926, + "grad_norm": 0.5200164411496547, + "learning_rate": 2.625147903297376e-09, + "loss": 0.0167, + "step": 25080 + }, + { + "epoch": 2.973511111111111, + "grad_norm": 0.5693745990291242, + "learning_rate": 2.4065021525326858e-09, + "loss": 0.018, + "step": 25090 + }, + { + "epoch": 2.974696296296296, + "grad_norm": 0.6617362942114778, + "learning_rate": 2.1973597558200278e-09, + "loss": 0.0173, + "step": 25100 + }, + { + "epoch": 2.9758814814814816, + "grad_norm": 0.6414046448930827, + "learning_rate": 1.9977211108612726e-09, + "loss": 0.0162, + "step": 25110 + }, + { + "epoch": 2.9770666666666665, + "grad_norm": 0.6120131032260693, + "learning_rate": 1.807586597287747e-09, + "loss": 0.0186, + "step": 25120 + }, + { + "epoch": 2.978251851851852, + "grad_norm": 0.44101965724272346, + "learning_rate": 1.6269565766552365e-09, + "loss": 0.0147, + "step": 25130 + }, + { + "epoch": 2.979437037037037, + "grad_norm": 0.5910797793064148, + "learning_rate": 1.4558313924478705e-09, + "loss": 0.0165, + "step": 25140 + }, + { + "epoch": 2.9806222222222223, + "grad_norm": 0.5246736687335397, + "learning_rate": 1.2942113700747938e-09, + "loss": 0.0162, + "step": 25150 + }, + { + "epoch": 2.9818074074074072, + "grad_norm": 0.5785101327575917, + "learning_rate": 1.142096816870164e-09, + "loss": 0.0149, + "step": 25160 + }, + { + "epoch": 2.9829925925925926, + "grad_norm": 0.7034997441291818, + "learning_rate": 9.994880220937086e-10, + "loss": 0.0156, + "step": 25170 + }, + { + "epoch": 2.984177777777778, + "grad_norm": 0.48543867906840543, + "learning_rate": 8.663852569273934e-10, + "loss": 0.0163, + "step": 25180 + }, + { + "epoch": 2.985362962962963, + "grad_norm": 0.5269357130671872, + "learning_rate": 7.42788774477643e-10, + "loss": 0.0176, + "step": 25190 + }, + { + "epoch": 2.986548148148148, + "grad_norm": 0.6630914633220515, + "learning_rate": 6.286988097747859e-10, + "loss": 0.0176, + "step": 25200 + }, + { + "epoch": 2.9877333333333334, + "grad_norm": 0.748656324329734, + "learning_rate": 5.241155797691688e-10, + "loss": 0.0165, + "step": 25210 + }, + { + "epoch": 2.9889185185185188, + "grad_norm": 0.5430103134833101, + "learning_rate": 4.290392833361523e-10, + "loss": 0.0156, + "step": 25220 + }, + { + "epoch": 2.9901037037037037, + "grad_norm": 0.5716313983233334, + "learning_rate": 3.4347010127111504e-10, + "loss": 0.0151, + "step": 25230 + }, + { + "epoch": 2.9912888888888887, + "grad_norm": 0.6888188004556484, + "learning_rate": 2.674081962905639e-10, + "loss": 0.016, + "step": 25240 + }, + { + "epoch": 2.992474074074074, + "grad_norm": 0.7073514899610959, + "learning_rate": 2.0085371303379953e-10, + "loss": 0.0183, + "step": 25250 + }, + { + "epoch": 2.9936592592592595, + "grad_norm": 0.5561703242395853, + "learning_rate": 1.438067780590302e-10, + "loss": 0.0166, + "step": 25260 + }, + { + "epoch": 2.9948444444444444, + "grad_norm": 0.5512080757144833, + "learning_rate": 9.62674998467028e-11, + "loss": 0.0178, + "step": 25270 + }, + { + "epoch": 2.9960296296296294, + "grad_norm": 0.6813051570043652, + "learning_rate": 5.8235968796172e-11, + "loss": 0.0167, + "step": 25280 + }, + { + "epoch": 2.9972148148148148, + "grad_norm": 0.46741913471786667, + "learning_rate": 2.9712257227920704e-11, + "loss": 0.0166, + "step": 25290 + }, + { + "epoch": 2.9984, + "grad_norm": 0.6586403960498411, + "learning_rate": 1.0696419381894807e-11, + "loss": 0.0173, + "step": 25300 + }, + { + "epoch": 2.999585185185185, + "grad_norm": 0.6044512529157666, + "learning_rate": 1.188491419168436e-12, + "loss": 0.0168, + "step": 25310 + }, + { + "epoch": 3.0, + "step": 25314, + "total_flos": 1062096120446976.0, + "train_loss": 0.055281539885637, + "train_runtime": 108497.1176, + "train_samples_per_second": 7.465, + "train_steps_per_second": 0.233 + } + ], + "logging_steps": 10, + "max_steps": 25314, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1062096120446976.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}