diff --git "a/PA_0.3_seed0_30k/trainer_state.json" "b/PA_0.3_seed0_30k/trainer_state.json" new file mode 100644--- /dev/null +++ "b/PA_0.3_seed0_30k/trainer_state.json" @@ -0,0 +1,21034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.408328826392645, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0018027762754642149, + "grad_norm": 5.927395343780518, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.5086, + "step": 10 + }, + { + "epoch": 0.0036055525509284298, + "grad_norm": 5.1692328453063965, + "learning_rate": 6.333333333333333e-07, + "loss": 0.4987, + "step": 20 + }, + { + "epoch": 0.005408328826392645, + "grad_norm": 4.2610626220703125, + "learning_rate": 9.666666666666668e-07, + "loss": 0.4692, + "step": 30 + }, + { + "epoch": 0.0072111051018568595, + "grad_norm": 3.4567067623138428, + "learning_rate": 1.3e-06, + "loss": 0.3896, + "step": 40 + }, + { + "epoch": 0.009013881377321075, + "grad_norm": 1.1518325805664062, + "learning_rate": 1.6333333333333333e-06, + "loss": 0.2379, + "step": 50 + }, + { + "epoch": 0.01081665765278529, + "grad_norm": 0.6859740018844604, + "learning_rate": 1.9666666666666668e-06, + "loss": 0.1762, + "step": 60 + }, + { + "epoch": 0.012619433928249504, + "grad_norm": 0.6898670792579651, + "learning_rate": 2.3e-06, + "loss": 0.1374, + "step": 70 + }, + { + "epoch": 0.014422210203713719, + "grad_norm": 0.35002022981643677, + "learning_rate": 2.6333333333333337e-06, + "loss": 0.1055, + "step": 80 + }, + { + "epoch": 0.016224986479177934, + "grad_norm": 0.35441234707832336, + "learning_rate": 2.966666666666667e-06, + "loss": 0.0942, + "step": 90 + }, + { + "epoch": 0.01802776275464215, + "grad_norm": 0.4481620490550995, + "learning_rate": 3.3e-06, + "loss": 0.0991, + "step": 100 + }, + { + "epoch": 0.019830539030106363, + "grad_norm": 0.24252600967884064, + "learning_rate": 3.633333333333334e-06, + "loss": 0.0737, + "step": 110 + }, + { + "epoch": 0.02163331530557058, + "grad_norm": 0.292210191488266, + "learning_rate": 3.966666666666667e-06, + "loss": 0.0758, + "step": 120 + }, + { + "epoch": 0.023436091581034792, + "grad_norm": 0.16601531207561493, + "learning_rate": 4.2999999999999995e-06, + "loss": 0.069, + "step": 130 + }, + { + "epoch": 0.02523886785649901, + "grad_norm": 0.17904335260391235, + "learning_rate": 4.633333333333334e-06, + "loss": 0.0727, + "step": 140 + }, + { + "epoch": 0.02704164413196322, + "grad_norm": 0.19262968003749847, + "learning_rate": 4.966666666666667e-06, + "loss": 0.0698, + "step": 150 + }, + { + "epoch": 0.028844420407427438, + "grad_norm": 0.2338973879814148, + "learning_rate": 5.3e-06, + "loss": 0.0657, + "step": 160 + }, + { + "epoch": 0.030647196682891654, + "grad_norm": 0.1479610800743103, + "learning_rate": 5.633333333333333e-06, + "loss": 0.0668, + "step": 170 + }, + { + "epoch": 0.03244997295835587, + "grad_norm": 0.27506181597709656, + "learning_rate": 5.9666666666666666e-06, + "loss": 0.0666, + "step": 180 + }, + { + "epoch": 0.03425274923382008, + "grad_norm": 0.2965821921825409, + "learning_rate": 6.300000000000001e-06, + "loss": 0.068, + "step": 190 + }, + { + "epoch": 0.0360555255092843, + "grad_norm": 0.19121438264846802, + "learning_rate": 6.633333333333333e-06, + "loss": 0.065, + "step": 200 + }, + { + "epoch": 0.03785830178474851, + "grad_norm": 0.2759195864200592, + "learning_rate": 6.966666666666667e-06, + "loss": 0.0622, + "step": 210 + }, + { + "epoch": 0.039661078060212726, + "grad_norm": 0.19436152279376984, + "learning_rate": 7.2999999999999996e-06, + "loss": 0.0602, + "step": 220 + }, + { + "epoch": 0.041463854335676946, + "grad_norm": 0.23938709497451782, + "learning_rate": 7.633333333333334e-06, + "loss": 0.0616, + "step": 230 + }, + { + "epoch": 0.04326663061114116, + "grad_norm": 0.22256684303283691, + "learning_rate": 7.966666666666666e-06, + "loss": 0.0589, + "step": 240 + }, + { + "epoch": 0.04506940688660537, + "grad_norm": 0.1885102242231369, + "learning_rate": 8.3e-06, + "loss": 0.0576, + "step": 250 + }, + { + "epoch": 0.046872183162069585, + "grad_norm": 0.1881362944841385, + "learning_rate": 8.633333333333334e-06, + "loss": 0.0548, + "step": 260 + }, + { + "epoch": 0.048674959437533805, + "grad_norm": 0.20400169491767883, + "learning_rate": 8.966666666666668e-06, + "loss": 0.0584, + "step": 270 + }, + { + "epoch": 0.05047773571299802, + "grad_norm": 0.1886519491672516, + "learning_rate": 9.3e-06, + "loss": 0.0604, + "step": 280 + }, + { + "epoch": 0.05228051198846223, + "grad_norm": 0.17705675959587097, + "learning_rate": 9.633333333333335e-06, + "loss": 0.054, + "step": 290 + }, + { + "epoch": 0.05408328826392644, + "grad_norm": 0.200830340385437, + "learning_rate": 9.966666666666667e-06, + "loss": 0.0612, + "step": 300 + }, + { + "epoch": 0.05588606453939066, + "grad_norm": 0.14939028024673462, + "learning_rate": 1.03e-05, + "loss": 0.0548, + "step": 310 + }, + { + "epoch": 0.057688840814854876, + "grad_norm": 0.17654134333133698, + "learning_rate": 1.0633333333333334e-05, + "loss": 0.057, + "step": 320 + }, + { + "epoch": 0.05949161709031909, + "grad_norm": 0.19793210923671722, + "learning_rate": 1.0966666666666666e-05, + "loss": 0.0563, + "step": 330 + }, + { + "epoch": 0.06129439336578331, + "grad_norm": 0.14409703016281128, + "learning_rate": 1.13e-05, + "loss": 0.052, + "step": 340 + }, + { + "epoch": 0.06309716964124752, + "grad_norm": 0.2149503231048584, + "learning_rate": 1.1633333333333334e-05, + "loss": 0.0513, + "step": 350 + }, + { + "epoch": 0.06489994591671173, + "grad_norm": 0.17241068184375763, + "learning_rate": 1.1966666666666668e-05, + "loss": 0.0541, + "step": 360 + }, + { + "epoch": 0.06670272219217595, + "grad_norm": 0.22884050011634827, + "learning_rate": 1.23e-05, + "loss": 0.055, + "step": 370 + }, + { + "epoch": 0.06850549846764016, + "grad_norm": 0.3367147445678711, + "learning_rate": 1.2633333333333333e-05, + "loss": 0.0532, + "step": 380 + }, + { + "epoch": 0.07030827474310439, + "grad_norm": 0.21108222007751465, + "learning_rate": 1.2966666666666669e-05, + "loss": 0.0534, + "step": 390 + }, + { + "epoch": 0.0721110510185686, + "grad_norm": 0.17822010815143585, + "learning_rate": 1.3300000000000001e-05, + "loss": 0.0515, + "step": 400 + }, + { + "epoch": 0.07391382729403281, + "grad_norm": 0.26001372933387756, + "learning_rate": 1.3633333333333334e-05, + "loss": 0.0504, + "step": 410 + }, + { + "epoch": 0.07571660356949703, + "grad_norm": 0.19004106521606445, + "learning_rate": 1.3966666666666666e-05, + "loss": 0.0539, + "step": 420 + }, + { + "epoch": 0.07751937984496124, + "grad_norm": 0.151622474193573, + "learning_rate": 1.43e-05, + "loss": 0.0531, + "step": 430 + }, + { + "epoch": 0.07932215612042545, + "grad_norm": 0.20369409024715424, + "learning_rate": 1.4633333333333334e-05, + "loss": 0.0492, + "step": 440 + }, + { + "epoch": 0.08112493239588967, + "grad_norm": 0.23420299589633942, + "learning_rate": 1.4966666666666668e-05, + "loss": 0.0516, + "step": 450 + }, + { + "epoch": 0.08292770867135389, + "grad_norm": 0.21566835045814514, + "learning_rate": 1.53e-05, + "loss": 0.0495, + "step": 460 + }, + { + "epoch": 0.0847304849468181, + "grad_norm": 0.1883263885974884, + "learning_rate": 1.563333333333333e-05, + "loss": 0.0498, + "step": 470 + }, + { + "epoch": 0.08653326122228232, + "grad_norm": 0.1479886770248413, + "learning_rate": 1.5966666666666667e-05, + "loss": 0.052, + "step": 480 + }, + { + "epoch": 0.08833603749774653, + "grad_norm": 0.14047309756278992, + "learning_rate": 1.63e-05, + "loss": 0.0495, + "step": 490 + }, + { + "epoch": 0.09013881377321074, + "grad_norm": 0.17417068779468536, + "learning_rate": 1.6633333333333336e-05, + "loss": 0.0468, + "step": 500 + }, + { + "epoch": 0.09194159004867496, + "grad_norm": 0.17884397506713867, + "learning_rate": 1.6966666666666668e-05, + "loss": 0.0541, + "step": 510 + }, + { + "epoch": 0.09374436632413917, + "grad_norm": 0.2234535962343216, + "learning_rate": 1.73e-05, + "loss": 0.0474, + "step": 520 + }, + { + "epoch": 0.09554714259960338, + "grad_norm": 0.18611693382263184, + "learning_rate": 1.7633333333333336e-05, + "loss": 0.0458, + "step": 530 + }, + { + "epoch": 0.09734991887506761, + "grad_norm": 0.2371469885110855, + "learning_rate": 1.796666666666667e-05, + "loss": 0.0462, + "step": 540 + }, + { + "epoch": 0.09915269515053182, + "grad_norm": 0.18843497335910797, + "learning_rate": 1.83e-05, + "loss": 0.0468, + "step": 550 + }, + { + "epoch": 0.10095547142599604, + "grad_norm": 0.16880261898040771, + "learning_rate": 1.8633333333333333e-05, + "loss": 0.0443, + "step": 560 + }, + { + "epoch": 0.10275824770146025, + "grad_norm": 0.16339348256587982, + "learning_rate": 1.896666666666667e-05, + "loss": 0.0457, + "step": 570 + }, + { + "epoch": 0.10456102397692446, + "grad_norm": 0.18269579112529755, + "learning_rate": 1.93e-05, + "loss": 0.0457, + "step": 580 + }, + { + "epoch": 0.10636380025238867, + "grad_norm": 0.14606955647468567, + "learning_rate": 1.9633333333333334e-05, + "loss": 0.0446, + "step": 590 + }, + { + "epoch": 0.10816657652785289, + "grad_norm": 0.16745400428771973, + "learning_rate": 1.9966666666666666e-05, + "loss": 0.0462, + "step": 600 + }, + { + "epoch": 0.10996935280331711, + "grad_norm": 0.14538270235061646, + "learning_rate": 2.0300000000000002e-05, + "loss": 0.0441, + "step": 610 + }, + { + "epoch": 0.11177212907878133, + "grad_norm": 0.1557837873697281, + "learning_rate": 2.0633333333333335e-05, + "loss": 0.0445, + "step": 620 + }, + { + "epoch": 0.11357490535424554, + "grad_norm": 0.21569928526878357, + "learning_rate": 2.0966666666666667e-05, + "loss": 0.0451, + "step": 630 + }, + { + "epoch": 0.11537768162970975, + "grad_norm": 0.17741309106349945, + "learning_rate": 2.13e-05, + "loss": 0.046, + "step": 640 + }, + { + "epoch": 0.11718045790517397, + "grad_norm": 0.19033050537109375, + "learning_rate": 2.1633333333333332e-05, + "loss": 0.0434, + "step": 650 + }, + { + "epoch": 0.11898323418063818, + "grad_norm": 0.14259305596351624, + "learning_rate": 2.1966666666666668e-05, + "loss": 0.0415, + "step": 660 + }, + { + "epoch": 0.12078601045610239, + "grad_norm": 0.15615609288215637, + "learning_rate": 2.23e-05, + "loss": 0.0431, + "step": 670 + }, + { + "epoch": 0.12258878673156662, + "grad_norm": 0.16624978184700012, + "learning_rate": 2.2633333333333336e-05, + "loss": 0.0419, + "step": 680 + }, + { + "epoch": 0.12439156300703083, + "grad_norm": 0.12572228908538818, + "learning_rate": 2.2966666666666668e-05, + "loss": 0.0447, + "step": 690 + }, + { + "epoch": 0.12619433928249504, + "grad_norm": 0.15061785280704498, + "learning_rate": 2.3300000000000004e-05, + "loss": 0.0419, + "step": 700 + }, + { + "epoch": 0.12799711555795926, + "grad_norm": 0.1646706610918045, + "learning_rate": 2.3633333333333336e-05, + "loss": 0.0439, + "step": 710 + }, + { + "epoch": 0.12979989183342347, + "grad_norm": 0.1438985913991928, + "learning_rate": 2.396666666666667e-05, + "loss": 0.0426, + "step": 720 + }, + { + "epoch": 0.13160266810888768, + "grad_norm": 0.13277369737625122, + "learning_rate": 2.43e-05, + "loss": 0.0399, + "step": 730 + }, + { + "epoch": 0.1334054443843519, + "grad_norm": 0.15727126598358154, + "learning_rate": 2.4633333333333334e-05, + "loss": 0.0413, + "step": 740 + }, + { + "epoch": 0.1352082206598161, + "grad_norm": 0.13605494797229767, + "learning_rate": 2.496666666666667e-05, + "loss": 0.0415, + "step": 750 + }, + { + "epoch": 0.13701099693528032, + "grad_norm": 0.17520026862621307, + "learning_rate": 2.5300000000000002e-05, + "loss": 0.0419, + "step": 760 + }, + { + "epoch": 0.13881377321074453, + "grad_norm": 0.12359524518251419, + "learning_rate": 2.5633333333333338e-05, + "loss": 0.0429, + "step": 770 + }, + { + "epoch": 0.14061654948620878, + "grad_norm": 0.16951821744441986, + "learning_rate": 2.5966666666666667e-05, + "loss": 0.0403, + "step": 780 + }, + { + "epoch": 0.142419325761673, + "grad_norm": 0.15725772082805634, + "learning_rate": 2.6300000000000002e-05, + "loss": 0.0412, + "step": 790 + }, + { + "epoch": 0.1442221020371372, + "grad_norm": 0.1215682104229927, + "learning_rate": 2.663333333333333e-05, + "loss": 0.0393, + "step": 800 + }, + { + "epoch": 0.1460248783126014, + "grad_norm": 0.15753105282783508, + "learning_rate": 2.6966666666666667e-05, + "loss": 0.041, + "step": 810 + }, + { + "epoch": 0.14782765458806563, + "grad_norm": 0.15297013521194458, + "learning_rate": 2.7300000000000003e-05, + "loss": 0.0404, + "step": 820 + }, + { + "epoch": 0.14963043086352984, + "grad_norm": 0.2108885496854782, + "learning_rate": 2.7633333333333332e-05, + "loss": 0.0424, + "step": 830 + }, + { + "epoch": 0.15143320713899405, + "grad_norm": 0.11800061166286469, + "learning_rate": 2.7966666666666668e-05, + "loss": 0.0413, + "step": 840 + }, + { + "epoch": 0.15323598341445827, + "grad_norm": 0.12303537130355835, + "learning_rate": 2.83e-05, + "loss": 0.0418, + "step": 850 + }, + { + "epoch": 0.15503875968992248, + "grad_norm": 0.16320499777793884, + "learning_rate": 2.8633333333333336e-05, + "loss": 0.042, + "step": 860 + }, + { + "epoch": 0.1568415359653867, + "grad_norm": 0.11843526363372803, + "learning_rate": 2.8966666666666668e-05, + "loss": 0.036, + "step": 870 + }, + { + "epoch": 0.1586443122408509, + "grad_norm": 0.1414240598678589, + "learning_rate": 2.93e-05, + "loss": 0.0413, + "step": 880 + }, + { + "epoch": 0.16044708851631512, + "grad_norm": 0.11786221712827682, + "learning_rate": 2.9633333333333336e-05, + "loss": 0.0415, + "step": 890 + }, + { + "epoch": 0.16224986479177933, + "grad_norm": 0.1784365326166153, + "learning_rate": 2.9966666666666672e-05, + "loss": 0.0416, + "step": 900 + }, + { + "epoch": 0.16405264106724354, + "grad_norm": 0.16295653581619263, + "learning_rate": 3.03e-05, + "loss": 0.0383, + "step": 910 + }, + { + "epoch": 0.16585541734270778, + "grad_norm": 0.16384652256965637, + "learning_rate": 3.063333333333334e-05, + "loss": 0.0397, + "step": 920 + }, + { + "epoch": 0.167658193618172, + "grad_norm": 0.14619915187358856, + "learning_rate": 3.096666666666666e-05, + "loss": 0.0381, + "step": 930 + }, + { + "epoch": 0.1694609698936362, + "grad_norm": 0.17324869334697723, + "learning_rate": 3.13e-05, + "loss": 0.04, + "step": 940 + }, + { + "epoch": 0.17126374616910042, + "grad_norm": 0.23121756315231323, + "learning_rate": 3.1633333333333334e-05, + "loss": 0.0418, + "step": 950 + }, + { + "epoch": 0.17306652244456464, + "grad_norm": 0.12106212973594666, + "learning_rate": 3.196666666666667e-05, + "loss": 0.0386, + "step": 960 + }, + { + "epoch": 0.17486929872002885, + "grad_norm": 0.13935351371765137, + "learning_rate": 3.2300000000000006e-05, + "loss": 0.0396, + "step": 970 + }, + { + "epoch": 0.17667207499549306, + "grad_norm": 0.16403697431087494, + "learning_rate": 3.263333333333333e-05, + "loss": 0.0417, + "step": 980 + }, + { + "epoch": 0.17847485127095727, + "grad_norm": 0.12613044679164886, + "learning_rate": 3.296666666666667e-05, + "loss": 0.0424, + "step": 990 + }, + { + "epoch": 0.1802776275464215, + "grad_norm": 0.16847999393939972, + "learning_rate": 3.33e-05, + "loss": 0.0402, + "step": 1000 + }, + { + "epoch": 0.1820804038218857, + "grad_norm": 0.1335449069738388, + "learning_rate": 3.3633333333333335e-05, + "loss": 0.0393, + "step": 1010 + }, + { + "epoch": 0.1838831800973499, + "grad_norm": 0.18727445602416992, + "learning_rate": 3.396666666666667e-05, + "loss": 0.0383, + "step": 1020 + }, + { + "epoch": 0.18568595637281413, + "grad_norm": 0.14235800504684448, + "learning_rate": 3.430000000000001e-05, + "loss": 0.0415, + "step": 1030 + }, + { + "epoch": 0.18748873264827834, + "grad_norm": 0.17644183337688446, + "learning_rate": 3.463333333333333e-05, + "loss": 0.038, + "step": 1040 + }, + { + "epoch": 0.18929150892374255, + "grad_norm": 0.12054650485515594, + "learning_rate": 3.496666666666667e-05, + "loss": 0.0389, + "step": 1050 + }, + { + "epoch": 0.19109428519920676, + "grad_norm": 0.14751891791820526, + "learning_rate": 3.53e-05, + "loss": 0.0401, + "step": 1060 + }, + { + "epoch": 0.192897061474671, + "grad_norm": 0.14929774403572083, + "learning_rate": 3.563333333333334e-05, + "loss": 0.0387, + "step": 1070 + }, + { + "epoch": 0.19469983775013522, + "grad_norm": 0.16710111498832703, + "learning_rate": 3.596666666666667e-05, + "loss": 0.039, + "step": 1080 + }, + { + "epoch": 0.19650261402559943, + "grad_norm": 0.16452960669994354, + "learning_rate": 3.63e-05, + "loss": 0.0397, + "step": 1090 + }, + { + "epoch": 0.19830539030106364, + "grad_norm": 0.13411708176136017, + "learning_rate": 3.6633333333333334e-05, + "loss": 0.0364, + "step": 1100 + }, + { + "epoch": 0.20010816657652786, + "grad_norm": 0.11768806725740433, + "learning_rate": 3.6966666666666666e-05, + "loss": 0.035, + "step": 1110 + }, + { + "epoch": 0.20191094285199207, + "grad_norm": 0.15497079491615295, + "learning_rate": 3.73e-05, + "loss": 0.0379, + "step": 1120 + }, + { + "epoch": 0.20371371912745628, + "grad_norm": 0.18066664040088654, + "learning_rate": 3.763333333333334e-05, + "loss": 0.0361, + "step": 1130 + }, + { + "epoch": 0.2055164954029205, + "grad_norm": 0.10206460952758789, + "learning_rate": 3.796666666666667e-05, + "loss": 0.0351, + "step": 1140 + }, + { + "epoch": 0.2073192716783847, + "grad_norm": 0.13023562729358673, + "learning_rate": 3.83e-05, + "loss": 0.0348, + "step": 1150 + }, + { + "epoch": 0.20912204795384892, + "grad_norm": 0.1664484590291977, + "learning_rate": 3.8633333333333335e-05, + "loss": 0.0361, + "step": 1160 + }, + { + "epoch": 0.21092482422931313, + "grad_norm": 0.1117839366197586, + "learning_rate": 3.896666666666667e-05, + "loss": 0.04, + "step": 1170 + }, + { + "epoch": 0.21272760050477735, + "grad_norm": 0.15310372412204742, + "learning_rate": 3.9300000000000007e-05, + "loss": 0.0384, + "step": 1180 + }, + { + "epoch": 0.21453037678024156, + "grad_norm": 0.13535067439079285, + "learning_rate": 3.963333333333333e-05, + "loss": 0.0382, + "step": 1190 + }, + { + "epoch": 0.21633315305570577, + "grad_norm": 0.19715803861618042, + "learning_rate": 3.996666666666667e-05, + "loss": 0.0366, + "step": 1200 + }, + { + "epoch": 0.21813592933117001, + "grad_norm": 0.1468590795993805, + "learning_rate": 4.0300000000000004e-05, + "loss": 0.0389, + "step": 1210 + }, + { + "epoch": 0.21993870560663423, + "grad_norm": 0.10836129635572433, + "learning_rate": 4.0633333333333336e-05, + "loss": 0.0358, + "step": 1220 + }, + { + "epoch": 0.22174148188209844, + "grad_norm": 0.11808973550796509, + "learning_rate": 4.096666666666667e-05, + "loss": 0.0393, + "step": 1230 + }, + { + "epoch": 0.22354425815756265, + "grad_norm": 0.14516021311283112, + "learning_rate": 4.13e-05, + "loss": 0.0342, + "step": 1240 + }, + { + "epoch": 0.22534703443302687, + "grad_norm": 0.12221723049879074, + "learning_rate": 4.1633333333333333e-05, + "loss": 0.0359, + "step": 1250 + }, + { + "epoch": 0.22714981070849108, + "grad_norm": 0.12157319486141205, + "learning_rate": 4.196666666666667e-05, + "loss": 0.0362, + "step": 1260 + }, + { + "epoch": 0.2289525869839553, + "grad_norm": 0.16886430978775024, + "learning_rate": 4.23e-05, + "loss": 0.0365, + "step": 1270 + }, + { + "epoch": 0.2307553632594195, + "grad_norm": 0.17078934609889984, + "learning_rate": 4.263333333333334e-05, + "loss": 0.0386, + "step": 1280 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 0.11399675905704498, + "learning_rate": 4.296666666666666e-05, + "loss": 0.0387, + "step": 1290 + }, + { + "epoch": 0.23436091581034793, + "grad_norm": 0.11997851729393005, + "learning_rate": 4.33e-05, + "loss": 0.0359, + "step": 1300 + }, + { + "epoch": 0.23616369208581214, + "grad_norm": 0.10852163285017014, + "learning_rate": 4.3633333333333335e-05, + "loss": 0.0354, + "step": 1310 + }, + { + "epoch": 0.23796646836127636, + "grad_norm": 0.09702097624540329, + "learning_rate": 4.396666666666667e-05, + "loss": 0.0353, + "step": 1320 + }, + { + "epoch": 0.23976924463674057, + "grad_norm": 0.11853215098381042, + "learning_rate": 4.43e-05, + "loss": 0.0369, + "step": 1330 + }, + { + "epoch": 0.24157202091220478, + "grad_norm": 0.12991459667682648, + "learning_rate": 4.463333333333334e-05, + "loss": 0.035, + "step": 1340 + }, + { + "epoch": 0.24337479718766902, + "grad_norm": 0.12072013318538666, + "learning_rate": 4.496666666666667e-05, + "loss": 0.0366, + "step": 1350 + }, + { + "epoch": 0.24517757346313324, + "grad_norm": 0.10771793872117996, + "learning_rate": 4.53e-05, + "loss": 0.0382, + "step": 1360 + }, + { + "epoch": 0.24698034973859745, + "grad_norm": 0.14169855415821075, + "learning_rate": 4.5633333333333336e-05, + "loss": 0.0316, + "step": 1370 + }, + { + "epoch": 0.24878312601406166, + "grad_norm": 0.12666484713554382, + "learning_rate": 4.596666666666667e-05, + "loss": 0.0369, + "step": 1380 + }, + { + "epoch": 0.2505859022895259, + "grad_norm": 0.13861031830310822, + "learning_rate": 4.630000000000001e-05, + "loss": 0.0367, + "step": 1390 + }, + { + "epoch": 0.2523886785649901, + "grad_norm": 0.15022532641887665, + "learning_rate": 4.663333333333333e-05, + "loss": 0.0364, + "step": 1400 + }, + { + "epoch": 0.2541914548404543, + "grad_norm": 0.11663172394037247, + "learning_rate": 4.696666666666667e-05, + "loss": 0.0339, + "step": 1410 + }, + { + "epoch": 0.2559942311159185, + "grad_norm": 0.11992368847131729, + "learning_rate": 4.73e-05, + "loss": 0.035, + "step": 1420 + }, + { + "epoch": 0.2577970073913827, + "grad_norm": 0.10864418745040894, + "learning_rate": 4.763333333333334e-05, + "loss": 0.032, + "step": 1430 + }, + { + "epoch": 0.25959978366684694, + "grad_norm": 0.0882345661520958, + "learning_rate": 4.796666666666667e-05, + "loss": 0.0339, + "step": 1440 + }, + { + "epoch": 0.26140255994231115, + "grad_norm": 0.09131108224391937, + "learning_rate": 4.83e-05, + "loss": 0.0353, + "step": 1450 + }, + { + "epoch": 0.26320533621777537, + "grad_norm": 0.11899872869253159, + "learning_rate": 4.8633333333333334e-05, + "loss": 0.0337, + "step": 1460 + }, + { + "epoch": 0.2650081124932396, + "grad_norm": 0.10595448315143585, + "learning_rate": 4.8966666666666667e-05, + "loss": 0.0332, + "step": 1470 + }, + { + "epoch": 0.2668108887687038, + "grad_norm": 0.12537339329719543, + "learning_rate": 4.93e-05, + "loss": 0.0355, + "step": 1480 + }, + { + "epoch": 0.268613665044168, + "grad_norm": 0.0927019864320755, + "learning_rate": 4.963333333333334e-05, + "loss": 0.0342, + "step": 1490 + }, + { + "epoch": 0.2704164413196322, + "grad_norm": 0.1698562353849411, + "learning_rate": 4.996666666666667e-05, + "loss": 0.0365, + "step": 1500 + }, + { + "epoch": 0.27221921759509643, + "grad_norm": 0.12412947416305542, + "learning_rate": 5.03e-05, + "loss": 0.0346, + "step": 1510 + }, + { + "epoch": 0.27402199387056064, + "grad_norm": 0.09103080630302429, + "learning_rate": 5.0633333333333335e-05, + "loss": 0.0333, + "step": 1520 + }, + { + "epoch": 0.27582477014602486, + "grad_norm": 0.14483073353767395, + "learning_rate": 5.0966666666666674e-05, + "loss": 0.0349, + "step": 1530 + }, + { + "epoch": 0.27762754642148907, + "grad_norm": 0.10964769870042801, + "learning_rate": 5.130000000000001e-05, + "loss": 0.0325, + "step": 1540 + }, + { + "epoch": 0.2794303226969533, + "grad_norm": 0.13196596503257751, + "learning_rate": 5.163333333333333e-05, + "loss": 0.0331, + "step": 1550 + }, + { + "epoch": 0.28123309897241755, + "grad_norm": 0.1630561351776123, + "learning_rate": 5.196666666666667e-05, + "loss": 0.0348, + "step": 1560 + }, + { + "epoch": 0.28303587524788176, + "grad_norm": 0.09885693341493607, + "learning_rate": 5.2300000000000004e-05, + "loss": 0.0327, + "step": 1570 + }, + { + "epoch": 0.284838651523346, + "grad_norm": 0.08820267766714096, + "learning_rate": 5.2633333333333336e-05, + "loss": 0.0326, + "step": 1580 + }, + { + "epoch": 0.2866414277988102, + "grad_norm": 0.10790671408176422, + "learning_rate": 5.296666666666666e-05, + "loss": 0.0316, + "step": 1590 + }, + { + "epoch": 0.2884442040742744, + "grad_norm": 0.15723896026611328, + "learning_rate": 5.330000000000001e-05, + "loss": 0.0334, + "step": 1600 + }, + { + "epoch": 0.2902469803497386, + "grad_norm": 0.08661948889493942, + "learning_rate": 5.3633333333333334e-05, + "loss": 0.0327, + "step": 1610 + }, + { + "epoch": 0.2920497566252028, + "grad_norm": 0.12468354403972626, + "learning_rate": 5.3966666666666666e-05, + "loss": 0.0355, + "step": 1620 + }, + { + "epoch": 0.29385253290066704, + "grad_norm": 0.14620402455329895, + "learning_rate": 5.4300000000000005e-05, + "loss": 0.0354, + "step": 1630 + }, + { + "epoch": 0.29565530917613125, + "grad_norm": 0.10252387821674347, + "learning_rate": 5.463333333333334e-05, + "loss": 0.035, + "step": 1640 + }, + { + "epoch": 0.29745808545159547, + "grad_norm": 0.12978729605674744, + "learning_rate": 5.496666666666666e-05, + "loss": 0.033, + "step": 1650 + }, + { + "epoch": 0.2992608617270597, + "grad_norm": 0.18542738258838654, + "learning_rate": 5.530000000000001e-05, + "loss": 0.0358, + "step": 1660 + }, + { + "epoch": 0.3010636380025239, + "grad_norm": 0.08660759031772614, + "learning_rate": 5.5633333333333335e-05, + "loss": 0.0325, + "step": 1670 + }, + { + "epoch": 0.3028664142779881, + "grad_norm": 0.12007088959217072, + "learning_rate": 5.596666666666667e-05, + "loss": 0.0342, + "step": 1680 + }, + { + "epoch": 0.3046691905534523, + "grad_norm": 0.10123857855796814, + "learning_rate": 5.63e-05, + "loss": 0.0339, + "step": 1690 + }, + { + "epoch": 0.30647196682891653, + "grad_norm": 0.13549436628818512, + "learning_rate": 5.663333333333334e-05, + "loss": 0.0331, + "step": 1700 + }, + { + "epoch": 0.30827474310438074, + "grad_norm": 0.1028088703751564, + "learning_rate": 5.696666666666667e-05, + "loss": 0.0333, + "step": 1710 + }, + { + "epoch": 0.31007751937984496, + "grad_norm": 0.09804537147283554, + "learning_rate": 5.73e-05, + "loss": 0.0335, + "step": 1720 + }, + { + "epoch": 0.31188029565530917, + "grad_norm": 0.1170876994729042, + "learning_rate": 5.7633333333333336e-05, + "loss": 0.0316, + "step": 1730 + }, + { + "epoch": 0.3136830719307734, + "grad_norm": 0.08236849308013916, + "learning_rate": 5.796666666666667e-05, + "loss": 0.0327, + "step": 1740 + }, + { + "epoch": 0.3154858482062376, + "grad_norm": 0.15237100422382355, + "learning_rate": 5.83e-05, + "loss": 0.0364, + "step": 1750 + }, + { + "epoch": 0.3172886244817018, + "grad_norm": 0.1300020068883896, + "learning_rate": 5.863333333333334e-05, + "loss": 0.0334, + "step": 1760 + }, + { + "epoch": 0.319091400757166, + "grad_norm": 0.13514597713947296, + "learning_rate": 5.896666666666667e-05, + "loss": 0.0342, + "step": 1770 + }, + { + "epoch": 0.32089417703263023, + "grad_norm": 0.12431147694587708, + "learning_rate": 5.93e-05, + "loss": 0.0329, + "step": 1780 + }, + { + "epoch": 0.32269695330809445, + "grad_norm": 0.13690216839313507, + "learning_rate": 5.9633333333333344e-05, + "loss": 0.0349, + "step": 1790 + }, + { + "epoch": 0.32449972958355866, + "grad_norm": 0.12583042681217194, + "learning_rate": 5.996666666666667e-05, + "loss": 0.0329, + "step": 1800 + }, + { + "epoch": 0.3263025058590229, + "grad_norm": 0.08628161251544952, + "learning_rate": 6.03e-05, + "loss": 0.0338, + "step": 1810 + }, + { + "epoch": 0.3281052821344871, + "grad_norm": 0.11665333807468414, + "learning_rate": 6.063333333333333e-05, + "loss": 0.0368, + "step": 1820 + }, + { + "epoch": 0.3299080584099513, + "grad_norm": 0.10734157264232635, + "learning_rate": 6.0966666666666674e-05, + "loss": 0.0351, + "step": 1830 + }, + { + "epoch": 0.33171083468541557, + "grad_norm": 0.10755439847707748, + "learning_rate": 6.13e-05, + "loss": 0.0336, + "step": 1840 + }, + { + "epoch": 0.3335136109608798, + "grad_norm": 0.15677402913570404, + "learning_rate": 6.163333333333333e-05, + "loss": 0.036, + "step": 1850 + }, + { + "epoch": 0.335316387236344, + "grad_norm": 0.1073877289891243, + "learning_rate": 6.196666666666668e-05, + "loss": 0.0338, + "step": 1860 + }, + { + "epoch": 0.3371191635118082, + "grad_norm": 0.0918564572930336, + "learning_rate": 6.23e-05, + "loss": 0.0325, + "step": 1870 + }, + { + "epoch": 0.3389219397872724, + "grad_norm": 0.13792933523654938, + "learning_rate": 6.263333333333333e-05, + "loss": 0.0324, + "step": 1880 + }, + { + "epoch": 0.34072471606273663, + "grad_norm": 0.14770172536373138, + "learning_rate": 6.296666666666667e-05, + "loss": 0.0327, + "step": 1890 + }, + { + "epoch": 0.34252749233820085, + "grad_norm": 0.10930487513542175, + "learning_rate": 6.330000000000001e-05, + "loss": 0.0337, + "step": 1900 + }, + { + "epoch": 0.34433026861366506, + "grad_norm": 0.13039787113666534, + "learning_rate": 6.363333333333334e-05, + "loss": 0.035, + "step": 1910 + }, + { + "epoch": 0.34613304488912927, + "grad_norm": 0.1425892561674118, + "learning_rate": 6.396666666666667e-05, + "loss": 0.0332, + "step": 1920 + }, + { + "epoch": 0.3479358211645935, + "grad_norm": 0.1457022875547409, + "learning_rate": 6.43e-05, + "loss": 0.0365, + "step": 1930 + }, + { + "epoch": 0.3497385974400577, + "grad_norm": 0.11419204622507095, + "learning_rate": 6.463333333333334e-05, + "loss": 0.0324, + "step": 1940 + }, + { + "epoch": 0.3515413737155219, + "grad_norm": 0.13262632489204407, + "learning_rate": 6.496666666666667e-05, + "loss": 0.0336, + "step": 1950 + }, + { + "epoch": 0.3533441499909861, + "grad_norm": 0.1058313250541687, + "learning_rate": 6.53e-05, + "loss": 0.0306, + "step": 1960 + }, + { + "epoch": 0.35514692626645034, + "grad_norm": 0.12586258351802826, + "learning_rate": 6.563333333333333e-05, + "loss": 0.0334, + "step": 1970 + }, + { + "epoch": 0.35694970254191455, + "grad_norm": 0.1120859757065773, + "learning_rate": 6.596666666666667e-05, + "loss": 0.0315, + "step": 1980 + }, + { + "epoch": 0.35875247881737876, + "grad_norm": 0.08368799835443497, + "learning_rate": 6.630000000000001e-05, + "loss": 0.0311, + "step": 1990 + }, + { + "epoch": 0.360555255092843, + "grad_norm": 0.12724414467811584, + "learning_rate": 6.663333333333333e-05, + "loss": 0.0345, + "step": 2000 + }, + { + "epoch": 0.3623580313683072, + "grad_norm": 0.08646780997514725, + "learning_rate": 6.696666666666666e-05, + "loss": 0.0317, + "step": 2010 + }, + { + "epoch": 0.3641608076437714, + "grad_norm": 0.0852169543504715, + "learning_rate": 6.730000000000001e-05, + "loss": 0.0328, + "step": 2020 + }, + { + "epoch": 0.3659635839192356, + "grad_norm": 0.09177013486623764, + "learning_rate": 6.763333333333334e-05, + "loss": 0.0295, + "step": 2030 + }, + { + "epoch": 0.3677663601946998, + "grad_norm": 0.15836720168590546, + "learning_rate": 6.796666666666666e-05, + "loss": 0.0336, + "step": 2040 + }, + { + "epoch": 0.36956913647016404, + "grad_norm": 0.1220734566450119, + "learning_rate": 6.83e-05, + "loss": 0.0298, + "step": 2050 + }, + { + "epoch": 0.37137191274562825, + "grad_norm": 0.10233849287033081, + "learning_rate": 6.863333333333334e-05, + "loss": 0.0321, + "step": 2060 + }, + { + "epoch": 0.37317468902109246, + "grad_norm": 0.08485671132802963, + "learning_rate": 6.896666666666667e-05, + "loss": 0.0316, + "step": 2070 + }, + { + "epoch": 0.3749774652965567, + "grad_norm": 0.12895233929157257, + "learning_rate": 6.93e-05, + "loss": 0.0314, + "step": 2080 + }, + { + "epoch": 0.3767802415720209, + "grad_norm": 0.12180645763874054, + "learning_rate": 6.963333333333334e-05, + "loss": 0.034, + "step": 2090 + }, + { + "epoch": 0.3785830178474851, + "grad_norm": 0.10040601342916489, + "learning_rate": 6.996666666666667e-05, + "loss": 0.0329, + "step": 2100 + }, + { + "epoch": 0.3803857941229493, + "grad_norm": 0.14184916019439697, + "learning_rate": 7.03e-05, + "loss": 0.0338, + "step": 2110 + }, + { + "epoch": 0.38218857039841353, + "grad_norm": 0.11596868932247162, + "learning_rate": 7.063333333333333e-05, + "loss": 0.0337, + "step": 2120 + }, + { + "epoch": 0.3839913466738778, + "grad_norm": 0.13311274349689484, + "learning_rate": 7.096666666666667e-05, + "loss": 0.0342, + "step": 2130 + }, + { + "epoch": 0.385794122949342, + "grad_norm": 0.10001257807016373, + "learning_rate": 7.13e-05, + "loss": 0.0336, + "step": 2140 + }, + { + "epoch": 0.3875968992248062, + "grad_norm": 0.1242021694779396, + "learning_rate": 7.163333333333334e-05, + "loss": 0.0332, + "step": 2150 + }, + { + "epoch": 0.38939967550027044, + "grad_norm": 0.10378447920084, + "learning_rate": 7.196666666666668e-05, + "loss": 0.0328, + "step": 2160 + }, + { + "epoch": 0.39120245177573465, + "grad_norm": 0.09639541804790497, + "learning_rate": 7.23e-05, + "loss": 0.0336, + "step": 2170 + }, + { + "epoch": 0.39300522805119886, + "grad_norm": 0.13110215961933136, + "learning_rate": 7.263333333333334e-05, + "loss": 0.0353, + "step": 2180 + }, + { + "epoch": 0.3948080043266631, + "grad_norm": 0.08413747698068619, + "learning_rate": 7.296666666666667e-05, + "loss": 0.0331, + "step": 2190 + }, + { + "epoch": 0.3966107806021273, + "grad_norm": 0.1678888499736786, + "learning_rate": 7.33e-05, + "loss": 0.031, + "step": 2200 + }, + { + "epoch": 0.3984135568775915, + "grad_norm": 0.09157788008451462, + "learning_rate": 7.363333333333334e-05, + "loss": 0.0316, + "step": 2210 + }, + { + "epoch": 0.4002163331530557, + "grad_norm": 0.16726191341876984, + "learning_rate": 7.396666666666667e-05, + "loss": 0.0325, + "step": 2220 + }, + { + "epoch": 0.4020191094285199, + "grad_norm": 0.11160159111022949, + "learning_rate": 7.43e-05, + "loss": 0.0312, + "step": 2230 + }, + { + "epoch": 0.40382188570398414, + "grad_norm": 0.164637953042984, + "learning_rate": 7.463333333333334e-05, + "loss": 0.031, + "step": 2240 + }, + { + "epoch": 0.40562466197944835, + "grad_norm": 0.16714324057102203, + "learning_rate": 7.496666666666667e-05, + "loss": 0.0328, + "step": 2250 + }, + { + "epoch": 0.40742743825491257, + "grad_norm": 0.0966704934835434, + "learning_rate": 7.53e-05, + "loss": 0.0328, + "step": 2260 + }, + { + "epoch": 0.4092302145303768, + "grad_norm": 0.15929150581359863, + "learning_rate": 7.563333333333333e-05, + "loss": 0.0319, + "step": 2270 + }, + { + "epoch": 0.411032990805841, + "grad_norm": 0.09980643540620804, + "learning_rate": 7.596666666666668e-05, + "loss": 0.0332, + "step": 2280 + }, + { + "epoch": 0.4128357670813052, + "grad_norm": 0.10436840355396271, + "learning_rate": 7.630000000000001e-05, + "loss": 0.0303, + "step": 2290 + }, + { + "epoch": 0.4146385433567694, + "grad_norm": 0.09922856092453003, + "learning_rate": 7.663333333333333e-05, + "loss": 0.0302, + "step": 2300 + }, + { + "epoch": 0.41644131963223363, + "grad_norm": 0.09414553642272949, + "learning_rate": 7.696666666666668e-05, + "loss": 0.0335, + "step": 2310 + }, + { + "epoch": 0.41824409590769784, + "grad_norm": 0.0749678835272789, + "learning_rate": 7.730000000000001e-05, + "loss": 0.0324, + "step": 2320 + }, + { + "epoch": 0.42004687218316206, + "grad_norm": 0.10473085194826126, + "learning_rate": 7.763333333333334e-05, + "loss": 0.0318, + "step": 2330 + }, + { + "epoch": 0.42184964845862627, + "grad_norm": 0.09854334592819214, + "learning_rate": 7.796666666666666e-05, + "loss": 0.0317, + "step": 2340 + }, + { + "epoch": 0.4236524247340905, + "grad_norm": 0.10060928761959076, + "learning_rate": 7.83e-05, + "loss": 0.0304, + "step": 2350 + }, + { + "epoch": 0.4254552010095547, + "grad_norm": 0.10017639398574829, + "learning_rate": 7.863333333333334e-05, + "loss": 0.0326, + "step": 2360 + }, + { + "epoch": 0.4272579772850189, + "grad_norm": 0.08367866277694702, + "learning_rate": 7.896666666666667e-05, + "loss": 0.0313, + "step": 2370 + }, + { + "epoch": 0.4290607535604831, + "grad_norm": 0.10204141587018967, + "learning_rate": 7.93e-05, + "loss": 0.0333, + "step": 2380 + }, + { + "epoch": 0.43086352983594733, + "grad_norm": 0.1103813424706459, + "learning_rate": 7.963333333333334e-05, + "loss": 0.0326, + "step": 2390 + }, + { + "epoch": 0.43266630611141155, + "grad_norm": 0.09394291788339615, + "learning_rate": 7.996666666666667e-05, + "loss": 0.0311, + "step": 2400 + }, + { + "epoch": 0.4344690823868758, + "grad_norm": 0.09699404984712601, + "learning_rate": 8.030000000000001e-05, + "loss": 0.0324, + "step": 2410 + }, + { + "epoch": 0.43627185866234003, + "grad_norm": 0.08242392539978027, + "learning_rate": 8.063333333333333e-05, + "loss": 0.029, + "step": 2420 + }, + { + "epoch": 0.43807463493780424, + "grad_norm": 0.07868364453315735, + "learning_rate": 8.096666666666667e-05, + "loss": 0.0297, + "step": 2430 + }, + { + "epoch": 0.43987741121326845, + "grad_norm": 0.12898984551429749, + "learning_rate": 8.13e-05, + "loss": 0.0307, + "step": 2440 + }, + { + "epoch": 0.44168018748873267, + "grad_norm": 0.10092326253652573, + "learning_rate": 8.163333333333334e-05, + "loss": 0.0312, + "step": 2450 + }, + { + "epoch": 0.4434829637641969, + "grad_norm": 0.08375825732946396, + "learning_rate": 8.196666666666668e-05, + "loss": 0.031, + "step": 2460 + }, + { + "epoch": 0.4452857400396611, + "grad_norm": 0.11443053185939789, + "learning_rate": 8.23e-05, + "loss": 0.0326, + "step": 2470 + }, + { + "epoch": 0.4470885163151253, + "grad_norm": 0.08297625929117203, + "learning_rate": 8.263333333333334e-05, + "loss": 0.0331, + "step": 2480 + }, + { + "epoch": 0.4488912925905895, + "grad_norm": 0.08914212137460709, + "learning_rate": 8.296666666666667e-05, + "loss": 0.0304, + "step": 2490 + }, + { + "epoch": 0.45069406886605373, + "grad_norm": 0.12118199467658997, + "learning_rate": 8.33e-05, + "loss": 0.0297, + "step": 2500 + }, + { + "epoch": 0.45249684514151794, + "grad_norm": 0.11329261213541031, + "learning_rate": 8.363333333333334e-05, + "loss": 0.0318, + "step": 2510 + }, + { + "epoch": 0.45429962141698216, + "grad_norm": 0.0965680405497551, + "learning_rate": 8.396666666666667e-05, + "loss": 0.0334, + "step": 2520 + }, + { + "epoch": 0.45610239769244637, + "grad_norm": 0.17904628813266754, + "learning_rate": 8.43e-05, + "loss": 0.0317, + "step": 2530 + }, + { + "epoch": 0.4579051739679106, + "grad_norm": 0.12222342938184738, + "learning_rate": 8.463333333333335e-05, + "loss": 0.0356, + "step": 2540 + }, + { + "epoch": 0.4597079502433748, + "grad_norm": 0.07297693938016891, + "learning_rate": 8.496666666666667e-05, + "loss": 0.032, + "step": 2550 + }, + { + "epoch": 0.461510726518839, + "grad_norm": 0.1024240180850029, + "learning_rate": 8.53e-05, + "loss": 0.0313, + "step": 2560 + }, + { + "epoch": 0.4633135027943032, + "grad_norm": 0.1282806247472763, + "learning_rate": 8.563333333333333e-05, + "loss": 0.0325, + "step": 2570 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 0.14574198424816132, + "learning_rate": 8.596666666666668e-05, + "loss": 0.0331, + "step": 2580 + }, + { + "epoch": 0.46691905534523165, + "grad_norm": 0.09403903782367706, + "learning_rate": 8.63e-05, + "loss": 0.0305, + "step": 2590 + }, + { + "epoch": 0.46872183162069586, + "grad_norm": 0.11817971616983414, + "learning_rate": 8.663333333333333e-05, + "loss": 0.0329, + "step": 2600 + }, + { + "epoch": 0.4705246078961601, + "grad_norm": 0.11116228997707367, + "learning_rate": 8.696666666666668e-05, + "loss": 0.0326, + "step": 2610 + }, + { + "epoch": 0.4723273841716243, + "grad_norm": 0.08909476548433304, + "learning_rate": 8.730000000000001e-05, + "loss": 0.0299, + "step": 2620 + }, + { + "epoch": 0.4741301604470885, + "grad_norm": 0.09449451416730881, + "learning_rate": 8.763333333333334e-05, + "loss": 0.0313, + "step": 2630 + }, + { + "epoch": 0.4759329367225527, + "grad_norm": 0.08118786662817001, + "learning_rate": 8.796666666666667e-05, + "loss": 0.03, + "step": 2640 + }, + { + "epoch": 0.4777357129980169, + "grad_norm": 0.08677814155817032, + "learning_rate": 8.83e-05, + "loss": 0.031, + "step": 2650 + }, + { + "epoch": 0.47953848927348114, + "grad_norm": 0.07146529853343964, + "learning_rate": 8.863333333333334e-05, + "loss": 0.0303, + "step": 2660 + }, + { + "epoch": 0.48134126554894535, + "grad_norm": 0.10365787148475647, + "learning_rate": 8.896666666666667e-05, + "loss": 0.0299, + "step": 2670 + }, + { + "epoch": 0.48314404182440956, + "grad_norm": 0.08889545500278473, + "learning_rate": 8.93e-05, + "loss": 0.0294, + "step": 2680 + }, + { + "epoch": 0.48494681809987383, + "grad_norm": 0.07847946882247925, + "learning_rate": 8.963333333333333e-05, + "loss": 0.0317, + "step": 2690 + }, + { + "epoch": 0.48674959437533805, + "grad_norm": 0.10679551959037781, + "learning_rate": 8.996666666666667e-05, + "loss": 0.0298, + "step": 2700 + }, + { + "epoch": 0.48855237065080226, + "grad_norm": 0.10610055178403854, + "learning_rate": 9.030000000000001e-05, + "loss": 0.0286, + "step": 2710 + }, + { + "epoch": 0.49035514692626647, + "grad_norm": 0.08055806159973145, + "learning_rate": 9.063333333333333e-05, + "loss": 0.0319, + "step": 2720 + }, + { + "epoch": 0.4921579232017307, + "grad_norm": 0.0732615515589714, + "learning_rate": 9.096666666666666e-05, + "loss": 0.029, + "step": 2730 + }, + { + "epoch": 0.4939606994771949, + "grad_norm": 0.07683437317609787, + "learning_rate": 9.130000000000001e-05, + "loss": 0.0286, + "step": 2740 + }, + { + "epoch": 0.4957634757526591, + "grad_norm": 0.11705199629068375, + "learning_rate": 9.163333333333334e-05, + "loss": 0.0304, + "step": 2750 + }, + { + "epoch": 0.4975662520281233, + "grad_norm": 0.09369435906410217, + "learning_rate": 9.196666666666666e-05, + "loss": 0.0318, + "step": 2760 + }, + { + "epoch": 0.49936902830358754, + "grad_norm": 0.08877081423997879, + "learning_rate": 9.230000000000001e-05, + "loss": 0.0305, + "step": 2770 + }, + { + "epoch": 0.5011718045790517, + "grad_norm": 0.09463613480329514, + "learning_rate": 9.263333333333334e-05, + "loss": 0.0301, + "step": 2780 + }, + { + "epoch": 0.502974580854516, + "grad_norm": 0.06763426959514618, + "learning_rate": 9.296666666666667e-05, + "loss": 0.0282, + "step": 2790 + }, + { + "epoch": 0.5047773571299802, + "grad_norm": 0.1521163284778595, + "learning_rate": 9.33e-05, + "loss": 0.0319, + "step": 2800 + }, + { + "epoch": 0.5065801334054444, + "grad_norm": 0.09530220180749893, + "learning_rate": 9.363333333333334e-05, + "loss": 0.0311, + "step": 2810 + }, + { + "epoch": 0.5083829096809086, + "grad_norm": 0.10000351816415787, + "learning_rate": 9.396666666666667e-05, + "loss": 0.0284, + "step": 2820 + }, + { + "epoch": 0.5101856859563728, + "grad_norm": 0.12066725641489029, + "learning_rate": 9.43e-05, + "loss": 0.0313, + "step": 2830 + }, + { + "epoch": 0.511988462231837, + "grad_norm": 0.08989167213439941, + "learning_rate": 9.463333333333333e-05, + "loss": 0.0285, + "step": 2840 + }, + { + "epoch": 0.5137912385073012, + "grad_norm": 0.07139872759580612, + "learning_rate": 9.496666666666667e-05, + "loss": 0.0312, + "step": 2850 + }, + { + "epoch": 0.5155940147827655, + "grad_norm": 0.07135288417339325, + "learning_rate": 9.53e-05, + "loss": 0.0311, + "step": 2860 + }, + { + "epoch": 0.5173967910582297, + "grad_norm": 0.08209555596113205, + "learning_rate": 9.563333333333334e-05, + "loss": 0.0316, + "step": 2870 + }, + { + "epoch": 0.5191995673336939, + "grad_norm": 0.07630390673875809, + "learning_rate": 9.596666666666668e-05, + "loss": 0.0285, + "step": 2880 + }, + { + "epoch": 0.5210023436091581, + "grad_norm": 0.10467585921287537, + "learning_rate": 9.63e-05, + "loss": 0.0304, + "step": 2890 + }, + { + "epoch": 0.5228051198846223, + "grad_norm": 0.09707807749509811, + "learning_rate": 9.663333333333334e-05, + "loss": 0.0306, + "step": 2900 + }, + { + "epoch": 0.5246078961600865, + "grad_norm": 0.07005254924297333, + "learning_rate": 9.696666666666667e-05, + "loss": 0.0313, + "step": 2910 + }, + { + "epoch": 0.5264106724355507, + "grad_norm": 0.08306580036878586, + "learning_rate": 9.730000000000001e-05, + "loss": 0.0298, + "step": 2920 + }, + { + "epoch": 0.5282134487110149, + "grad_norm": 0.0919666588306427, + "learning_rate": 9.763333333333334e-05, + "loss": 0.0323, + "step": 2930 + }, + { + "epoch": 0.5300162249864792, + "grad_norm": 0.09779714047908783, + "learning_rate": 9.796666666666667e-05, + "loss": 0.0286, + "step": 2940 + }, + { + "epoch": 0.5318190012619434, + "grad_norm": 0.08522796630859375, + "learning_rate": 9.83e-05, + "loss": 0.0294, + "step": 2950 + }, + { + "epoch": 0.5336217775374076, + "grad_norm": 0.07817700505256653, + "learning_rate": 9.863333333333334e-05, + "loss": 0.0292, + "step": 2960 + }, + { + "epoch": 0.5354245538128718, + "grad_norm": 0.08175903558731079, + "learning_rate": 9.896666666666667e-05, + "loss": 0.0305, + "step": 2970 + }, + { + "epoch": 0.537227330088336, + "grad_norm": 0.12900342047214508, + "learning_rate": 9.93e-05, + "loss": 0.0302, + "step": 2980 + }, + { + "epoch": 0.5390301063638002, + "grad_norm": 0.07554749399423599, + "learning_rate": 9.963333333333333e-05, + "loss": 0.0315, + "step": 2990 + }, + { + "epoch": 0.5408328826392644, + "grad_norm": 0.0798017829656601, + "learning_rate": 9.996666666666668e-05, + "loss": 0.0302, + "step": 3000 + }, + { + "epoch": 0.5426356589147286, + "grad_norm": 0.08450128138065338, + "learning_rate": 9.999999384858465e-05, + "loss": 0.0319, + "step": 3010 + }, + { + "epoch": 0.5444384351901929, + "grad_norm": 0.1123519018292427, + "learning_rate": 9.999997258443473e-05, + "loss": 0.0308, + "step": 3020 + }, + { + "epoch": 0.5462412114656571, + "grad_norm": 0.09831847995519638, + "learning_rate": 9.999993613161331e-05, + "loss": 0.0319, + "step": 3030 + }, + { + "epoch": 0.5480439877411213, + "grad_norm": 0.09247203171253204, + "learning_rate": 9.999988449013146e-05, + "loss": 0.0317, + "step": 3040 + }, + { + "epoch": 0.5498467640165855, + "grad_norm": 0.08455559611320496, + "learning_rate": 9.99998176600049e-05, + "loss": 0.0301, + "step": 3050 + }, + { + "epoch": 0.5516495402920497, + "grad_norm": 0.0746980756521225, + "learning_rate": 9.999973564125389e-05, + "loss": 0.029, + "step": 3060 + }, + { + "epoch": 0.5534523165675139, + "grad_norm": 0.08219433575868607, + "learning_rate": 9.999963843390335e-05, + "loss": 0.0302, + "step": 3070 + }, + { + "epoch": 0.5552550928429781, + "grad_norm": 0.06905438750982285, + "learning_rate": 9.999952603798282e-05, + "loss": 0.0314, + "step": 3080 + }, + { + "epoch": 0.5570578691184424, + "grad_norm": 0.08999866247177124, + "learning_rate": 9.999939845352646e-05, + "loss": 0.029, + "step": 3090 + }, + { + "epoch": 0.5588606453939066, + "grad_norm": 0.0865178257226944, + "learning_rate": 9.999925568057298e-05, + "loss": 0.0292, + "step": 3100 + }, + { + "epoch": 0.5606634216693708, + "grad_norm": 0.09611177444458008, + "learning_rate": 9.999909771916578e-05, + "loss": 0.0305, + "step": 3110 + }, + { + "epoch": 0.5624661979448351, + "grad_norm": 0.10748101025819778, + "learning_rate": 9.999892456935285e-05, + "loss": 0.0278, + "step": 3120 + }, + { + "epoch": 0.5642689742202993, + "grad_norm": 0.11239656805992126, + "learning_rate": 9.999873623118679e-05, + "loss": 0.0305, + "step": 3130 + }, + { + "epoch": 0.5660717504957635, + "grad_norm": 0.09102374315261841, + "learning_rate": 9.999853270472479e-05, + "loss": 0.0279, + "step": 3140 + }, + { + "epoch": 0.5678745267712277, + "grad_norm": 0.09112105518579483, + "learning_rate": 9.999831399002871e-05, + "loss": 0.0286, + "step": 3150 + }, + { + "epoch": 0.569677303046692, + "grad_norm": 0.08242996782064438, + "learning_rate": 9.999808008716494e-05, + "loss": 0.0295, + "step": 3160 + }, + { + "epoch": 0.5714800793221562, + "grad_norm": 0.07456495612859726, + "learning_rate": 9.999783099620459e-05, + "loss": 0.0301, + "step": 3170 + }, + { + "epoch": 0.5732828555976204, + "grad_norm": 0.10104014724493027, + "learning_rate": 9.999756671722328e-05, + "loss": 0.0297, + "step": 3180 + }, + { + "epoch": 0.5750856318730846, + "grad_norm": 0.0868121087551117, + "learning_rate": 9.99972872503013e-05, + "loss": 0.0265, + "step": 3190 + }, + { + "epoch": 0.5768884081485488, + "grad_norm": 0.09674925357103348, + "learning_rate": 9.999699259552359e-05, + "loss": 0.0304, + "step": 3200 + }, + { + "epoch": 0.578691184424013, + "grad_norm": 0.11101334542036057, + "learning_rate": 9.99966827529796e-05, + "loss": 0.0297, + "step": 3210 + }, + { + "epoch": 0.5804939606994772, + "grad_norm": 0.0662771686911583, + "learning_rate": 9.999635772276348e-05, + "loss": 0.0292, + "step": 3220 + }, + { + "epoch": 0.5822967369749414, + "grad_norm": 0.09066713601350784, + "learning_rate": 9.999601750497396e-05, + "loss": 0.0296, + "step": 3230 + }, + { + "epoch": 0.5840995132504057, + "grad_norm": 0.07284525781869888, + "learning_rate": 9.99956620997144e-05, + "loss": 0.03, + "step": 3240 + }, + { + "epoch": 0.5859022895258699, + "grad_norm": 0.1161518543958664, + "learning_rate": 9.999529150709275e-05, + "loss": 0.0296, + "step": 3250 + }, + { + "epoch": 0.5877050658013341, + "grad_norm": 0.09038016200065613, + "learning_rate": 9.999490572722158e-05, + "loss": 0.0305, + "step": 3260 + }, + { + "epoch": 0.5895078420767983, + "grad_norm": 0.06842294335365295, + "learning_rate": 9.99945047602181e-05, + "loss": 0.0276, + "step": 3270 + }, + { + "epoch": 0.5913106183522625, + "grad_norm": 0.1004544049501419, + "learning_rate": 9.99940886062041e-05, + "loss": 0.033, + "step": 3280 + }, + { + "epoch": 0.5931133946277267, + "grad_norm": 0.06948916614055634, + "learning_rate": 9.999365726530599e-05, + "loss": 0.0294, + "step": 3290 + }, + { + "epoch": 0.5949161709031909, + "grad_norm": 0.08625806123018265, + "learning_rate": 9.999321073765481e-05, + "loss": 0.0287, + "step": 3300 + }, + { + "epoch": 0.5967189471786551, + "grad_norm": 0.07781235873699188, + "learning_rate": 9.99927490233862e-05, + "loss": 0.0288, + "step": 3310 + }, + { + "epoch": 0.5985217234541194, + "grad_norm": 0.0724525973200798, + "learning_rate": 9.999227212264043e-05, + "loss": 0.0294, + "step": 3320 + }, + { + "epoch": 0.6003244997295836, + "grad_norm": 0.08471264690160751, + "learning_rate": 9.999178003556236e-05, + "loss": 0.0305, + "step": 3330 + }, + { + "epoch": 0.6021272760050478, + "grad_norm": 0.08040895313024521, + "learning_rate": 9.999127276230146e-05, + "loss": 0.0292, + "step": 3340 + }, + { + "epoch": 0.603930052280512, + "grad_norm": 0.07737747579813004, + "learning_rate": 9.999075030301184e-05, + "loss": 0.0299, + "step": 3350 + }, + { + "epoch": 0.6057328285559762, + "grad_norm": 0.061475593596696854, + "learning_rate": 9.999021265785221e-05, + "loss": 0.028, + "step": 3360 + }, + { + "epoch": 0.6075356048314404, + "grad_norm": 0.08520469814538956, + "learning_rate": 9.998965982698589e-05, + "loss": 0.0291, + "step": 3370 + }, + { + "epoch": 0.6093383811069046, + "grad_norm": 0.08050555735826492, + "learning_rate": 9.998909181058082e-05, + "loss": 0.0291, + "step": 3380 + }, + { + "epoch": 0.6111411573823688, + "grad_norm": 0.08225930482149124, + "learning_rate": 9.998850860880953e-05, + "loss": 0.0285, + "step": 3390 + }, + { + "epoch": 0.6129439336578331, + "grad_norm": 0.07121815532445908, + "learning_rate": 9.998791022184922e-05, + "loss": 0.0285, + "step": 3400 + }, + { + "epoch": 0.6147467099332973, + "grad_norm": 0.08943697810173035, + "learning_rate": 9.99872966498816e-05, + "loss": 0.0277, + "step": 3410 + }, + { + "epoch": 0.6165494862087615, + "grad_norm": 0.11874101310968399, + "learning_rate": 9.998666789309313e-05, + "loss": 0.0285, + "step": 3420 + }, + { + "epoch": 0.6183522624842257, + "grad_norm": 0.07991812378168106, + "learning_rate": 9.998602395167475e-05, + "loss": 0.0292, + "step": 3430 + }, + { + "epoch": 0.6201550387596899, + "grad_norm": 0.07335567474365234, + "learning_rate": 9.998536482582213e-05, + "loss": 0.0269, + "step": 3440 + }, + { + "epoch": 0.6219578150351541, + "grad_norm": 0.07487010210752487, + "learning_rate": 9.998469051573544e-05, + "loss": 0.028, + "step": 3450 + }, + { + "epoch": 0.6237605913106183, + "grad_norm": 0.08838813006877899, + "learning_rate": 9.998400102161954e-05, + "loss": 0.0302, + "step": 3460 + }, + { + "epoch": 0.6255633675860826, + "grad_norm": 0.08146977424621582, + "learning_rate": 9.998329634368388e-05, + "loss": 0.0281, + "step": 3470 + }, + { + "epoch": 0.6273661438615468, + "grad_norm": 0.09117871522903442, + "learning_rate": 9.998257648214253e-05, + "loss": 0.03, + "step": 3480 + }, + { + "epoch": 0.629168920137011, + "grad_norm": 0.09138143807649612, + "learning_rate": 9.998184143721417e-05, + "loss": 0.0277, + "step": 3490 + }, + { + "epoch": 0.6309716964124752, + "grad_norm": 0.0891147255897522, + "learning_rate": 9.998109120912206e-05, + "loss": 0.0288, + "step": 3500 + }, + { + "epoch": 0.6327744726879394, + "grad_norm": 0.102154940366745, + "learning_rate": 9.998032579809411e-05, + "loss": 0.0288, + "step": 3510 + }, + { + "epoch": 0.6345772489634036, + "grad_norm": 0.09600833803415298, + "learning_rate": 9.997954520436286e-05, + "loss": 0.0305, + "step": 3520 + }, + { + "epoch": 0.6363800252388678, + "grad_norm": 0.1062265932559967, + "learning_rate": 9.997874942816538e-05, + "loss": 0.0294, + "step": 3530 + }, + { + "epoch": 0.638182801514332, + "grad_norm": 0.07692592591047287, + "learning_rate": 9.997793846974345e-05, + "loss": 0.0296, + "step": 3540 + }, + { + "epoch": 0.6399855777897963, + "grad_norm": 0.07363329827785492, + "learning_rate": 9.997711232934341e-05, + "loss": 0.0287, + "step": 3550 + }, + { + "epoch": 0.6417883540652605, + "grad_norm": 0.09241073578596115, + "learning_rate": 9.99762710072162e-05, + "loss": 0.0288, + "step": 3560 + }, + { + "epoch": 0.6435911303407247, + "grad_norm": 0.08219496160745621, + "learning_rate": 9.997541450361743e-05, + "loss": 0.0275, + "step": 3570 + }, + { + "epoch": 0.6453939066161889, + "grad_norm": 0.1015632376074791, + "learning_rate": 9.997454281880723e-05, + "loss": 0.0275, + "step": 3580 + }, + { + "epoch": 0.6471966828916531, + "grad_norm": 0.0938531830906868, + "learning_rate": 9.997365595305044e-05, + "loss": 0.0261, + "step": 3590 + }, + { + "epoch": 0.6489994591671173, + "grad_norm": 0.08830977976322174, + "learning_rate": 9.997275390661644e-05, + "loss": 0.0284, + "step": 3600 + }, + { + "epoch": 0.6508022354425815, + "grad_norm": 0.13054096698760986, + "learning_rate": 9.997183667977926e-05, + "loss": 0.0305, + "step": 3610 + }, + { + "epoch": 0.6526050117180457, + "grad_norm": 0.08058064430952072, + "learning_rate": 9.997090427281752e-05, + "loss": 0.0279, + "step": 3620 + }, + { + "epoch": 0.65440778799351, + "grad_norm": 0.08787910640239716, + "learning_rate": 9.996995668601448e-05, + "loss": 0.0276, + "step": 3630 + }, + { + "epoch": 0.6562105642689742, + "grad_norm": 0.06614172458648682, + "learning_rate": 9.996899391965798e-05, + "loss": 0.0287, + "step": 3640 + }, + { + "epoch": 0.6580133405444384, + "grad_norm": 0.07113706320524216, + "learning_rate": 9.996801597404048e-05, + "loss": 0.028, + "step": 3650 + }, + { + "epoch": 0.6598161168199026, + "grad_norm": 0.07325347512960434, + "learning_rate": 9.996702284945905e-05, + "loss": 0.0248, + "step": 3660 + }, + { + "epoch": 0.6616188930953668, + "grad_norm": 0.09069879353046417, + "learning_rate": 9.996601454621539e-05, + "loss": 0.029, + "step": 3670 + }, + { + "epoch": 0.6634216693708311, + "grad_norm": 0.07090281695127487, + "learning_rate": 9.996499106461577e-05, + "loss": 0.0278, + "step": 3680 + }, + { + "epoch": 0.6652244456462953, + "grad_norm": 0.08800269663333893, + "learning_rate": 9.996395240497112e-05, + "loss": 0.0268, + "step": 3690 + }, + { + "epoch": 0.6670272219217596, + "grad_norm": 0.08696693927049637, + "learning_rate": 9.996289856759696e-05, + "loss": 0.0273, + "step": 3700 + }, + { + "epoch": 0.6688299981972238, + "grad_norm": 0.0698804259300232, + "learning_rate": 9.996182955281342e-05, + "loss": 0.0285, + "step": 3710 + }, + { + "epoch": 0.670632774472688, + "grad_norm": 0.09495078027248383, + "learning_rate": 9.996074536094519e-05, + "loss": 0.0281, + "step": 3720 + }, + { + "epoch": 0.6724355507481522, + "grad_norm": 0.07790614664554596, + "learning_rate": 9.995964599232168e-05, + "loss": 0.0283, + "step": 3730 + }, + { + "epoch": 0.6742383270236164, + "grad_norm": 0.07828075438737869, + "learning_rate": 9.995853144727683e-05, + "loss": 0.029, + "step": 3740 + }, + { + "epoch": 0.6760411032990806, + "grad_norm": 0.07826725393533707, + "learning_rate": 9.99574017261492e-05, + "loss": 0.0307, + "step": 3750 + }, + { + "epoch": 0.6778438795745448, + "grad_norm": 0.08438368141651154, + "learning_rate": 9.995625682928198e-05, + "loss": 0.029, + "step": 3760 + }, + { + "epoch": 0.679646655850009, + "grad_norm": 0.07491648197174072, + "learning_rate": 9.995509675702295e-05, + "loss": 0.0258, + "step": 3770 + }, + { + "epoch": 0.6814494321254733, + "grad_norm": 0.06878925114870071, + "learning_rate": 9.995392150972451e-05, + "loss": 0.0262, + "step": 3780 + }, + { + "epoch": 0.6832522084009375, + "grad_norm": 0.07941044121980667, + "learning_rate": 9.995273108774366e-05, + "loss": 0.0248, + "step": 3790 + }, + { + "epoch": 0.6850549846764017, + "grad_norm": 0.07930217683315277, + "learning_rate": 9.995152549144205e-05, + "loss": 0.0284, + "step": 3800 + }, + { + "epoch": 0.6868577609518659, + "grad_norm": 0.07347506284713745, + "learning_rate": 9.995030472118587e-05, + "loss": 0.0258, + "step": 3810 + }, + { + "epoch": 0.6886605372273301, + "grad_norm": 0.07248572260141373, + "learning_rate": 9.9949068777346e-05, + "loss": 0.0271, + "step": 3820 + }, + { + "epoch": 0.6904633135027943, + "grad_norm": 0.06957680732011795, + "learning_rate": 9.994781766029786e-05, + "loss": 0.0256, + "step": 3830 + }, + { + "epoch": 0.6922660897782585, + "grad_norm": 0.07835078239440918, + "learning_rate": 9.994655137042151e-05, + "loss": 0.0275, + "step": 3840 + }, + { + "epoch": 0.6940688660537228, + "grad_norm": 0.09615061432123184, + "learning_rate": 9.99452699081016e-05, + "loss": 0.0279, + "step": 3850 + }, + { + "epoch": 0.695871642329187, + "grad_norm": 0.0577404759824276, + "learning_rate": 9.994397327372743e-05, + "loss": 0.0281, + "step": 3860 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 0.05039632320404053, + "learning_rate": 9.994266146769286e-05, + "loss": 0.0265, + "step": 3870 + }, + { + "epoch": 0.6994771948801154, + "grad_norm": 0.07332111150026321, + "learning_rate": 9.994133449039642e-05, + "loss": 0.0269, + "step": 3880 + }, + { + "epoch": 0.7012799711555796, + "grad_norm": 0.06438911706209183, + "learning_rate": 9.993999234224118e-05, + "loss": 0.0276, + "step": 3890 + }, + { + "epoch": 0.7030827474310438, + "grad_norm": 0.09786788374185562, + "learning_rate": 9.993863502363485e-05, + "loss": 0.0286, + "step": 3900 + }, + { + "epoch": 0.704885523706508, + "grad_norm": 0.06998310983181, + "learning_rate": 9.993726253498976e-05, + "loss": 0.0284, + "step": 3910 + }, + { + "epoch": 0.7066882999819722, + "grad_norm": 0.07971611618995667, + "learning_rate": 9.993587487672282e-05, + "loss": 0.0279, + "step": 3920 + }, + { + "epoch": 0.7084910762574365, + "grad_norm": 0.09115764498710632, + "learning_rate": 9.993447204925558e-05, + "loss": 0.0268, + "step": 3930 + }, + { + "epoch": 0.7102938525329007, + "grad_norm": 0.07438243180513382, + "learning_rate": 9.993305405301416e-05, + "loss": 0.0272, + "step": 3940 + }, + { + "epoch": 0.7120966288083649, + "grad_norm": 0.09995409846305847, + "learning_rate": 9.993162088842935e-05, + "loss": 0.0296, + "step": 3950 + }, + { + "epoch": 0.7138994050838291, + "grad_norm": 0.07293233275413513, + "learning_rate": 9.993017255593646e-05, + "loss": 0.0264, + "step": 3960 + }, + { + "epoch": 0.7157021813592933, + "grad_norm": 0.07003296166658401, + "learning_rate": 9.992870905597548e-05, + "loss": 0.0261, + "step": 3970 + }, + { + "epoch": 0.7175049576347575, + "grad_norm": 0.0757652074098587, + "learning_rate": 9.9927230388991e-05, + "loss": 0.0282, + "step": 3980 + }, + { + "epoch": 0.7193077339102217, + "grad_norm": 0.073615662753582, + "learning_rate": 9.992573655543215e-05, + "loss": 0.0277, + "step": 3990 + }, + { + "epoch": 0.721110510185686, + "grad_norm": 0.08365189284086227, + "learning_rate": 9.992422755575277e-05, + "loss": 0.0269, + "step": 4000 + }, + { + "epoch": 0.7229132864611502, + "grad_norm": 0.05377136915922165, + "learning_rate": 9.992270339041123e-05, + "loss": 0.0257, + "step": 4010 + }, + { + "epoch": 0.7247160627366144, + "grad_norm": 0.08722290396690369, + "learning_rate": 9.992116405987053e-05, + "loss": 0.0269, + "step": 4020 + }, + { + "epoch": 0.7265188390120786, + "grad_norm": 0.0616290383040905, + "learning_rate": 9.991960956459828e-05, + "loss": 0.0258, + "step": 4030 + }, + { + "epoch": 0.7283216152875428, + "grad_norm": 0.06201528385281563, + "learning_rate": 9.991803990506669e-05, + "loss": 0.0278, + "step": 4040 + }, + { + "epoch": 0.730124391563007, + "grad_norm": 0.0831390842795372, + "learning_rate": 9.991645508175258e-05, + "loss": 0.0264, + "step": 4050 + }, + { + "epoch": 0.7319271678384712, + "grad_norm": 0.06965024024248123, + "learning_rate": 9.99148550951374e-05, + "loss": 0.0276, + "step": 4060 + }, + { + "epoch": 0.7337299441139354, + "grad_norm": 0.07673170417547226, + "learning_rate": 9.991323994570716e-05, + "loss": 0.027, + "step": 4070 + }, + { + "epoch": 0.7355327203893997, + "grad_norm": 0.06916692107915878, + "learning_rate": 9.99116096339525e-05, + "loss": 0.0243, + "step": 4080 + }, + { + "epoch": 0.7373354966648639, + "grad_norm": 0.07095610350370407, + "learning_rate": 9.990996416036869e-05, + "loss": 0.0275, + "step": 4090 + }, + { + "epoch": 0.7391382729403281, + "grad_norm": 0.07902040332555771, + "learning_rate": 9.990830352545555e-05, + "loss": 0.0278, + "step": 4100 + }, + { + "epoch": 0.7409410492157923, + "grad_norm": 0.07931259274482727, + "learning_rate": 9.990662772971756e-05, + "loss": 0.0248, + "step": 4110 + }, + { + "epoch": 0.7427438254912565, + "grad_norm": 0.10176528990268707, + "learning_rate": 9.990493677366376e-05, + "loss": 0.0261, + "step": 4120 + }, + { + "epoch": 0.7445466017667207, + "grad_norm": 0.0816539004445076, + "learning_rate": 9.990323065780786e-05, + "loss": 0.0247, + "step": 4130 + }, + { + "epoch": 0.7463493780421849, + "grad_norm": 0.1051865890622139, + "learning_rate": 9.990150938266808e-05, + "loss": 0.027, + "step": 4140 + }, + { + "epoch": 0.7481521543176491, + "grad_norm": 0.06166602671146393, + "learning_rate": 9.989977294876733e-05, + "loss": 0.0253, + "step": 4150 + }, + { + "epoch": 0.7499549305931134, + "grad_norm": 0.08050011098384857, + "learning_rate": 9.989802135663308e-05, + "loss": 0.0276, + "step": 4160 + }, + { + "epoch": 0.7517577068685776, + "grad_norm": 0.08514092862606049, + "learning_rate": 9.989625460679743e-05, + "loss": 0.0266, + "step": 4170 + }, + { + "epoch": 0.7535604831440418, + "grad_norm": 0.09616975486278534, + "learning_rate": 9.989447269979706e-05, + "loss": 0.0281, + "step": 4180 + }, + { + "epoch": 0.755363259419506, + "grad_norm": 0.0882219672203064, + "learning_rate": 9.989267563617328e-05, + "loss": 0.0273, + "step": 4190 + }, + { + "epoch": 0.7571660356949702, + "grad_norm": 0.09132757037878036, + "learning_rate": 9.989086341647198e-05, + "loss": 0.0267, + "step": 4200 + }, + { + "epoch": 0.7589688119704344, + "grad_norm": 0.09457577764987946, + "learning_rate": 9.988903604124366e-05, + "loss": 0.0263, + "step": 4210 + }, + { + "epoch": 0.7607715882458986, + "grad_norm": 0.08002500981092453, + "learning_rate": 9.988719351104343e-05, + "loss": 0.0276, + "step": 4220 + }, + { + "epoch": 0.7625743645213628, + "grad_norm": 0.09656794369220734, + "learning_rate": 9.9885335826431e-05, + "loss": 0.0269, + "step": 4230 + }, + { + "epoch": 0.7643771407968271, + "grad_norm": 0.1038774624466896, + "learning_rate": 9.988346298797071e-05, + "loss": 0.0266, + "step": 4240 + }, + { + "epoch": 0.7661799170722914, + "grad_norm": 0.07056058943271637, + "learning_rate": 9.988157499623146e-05, + "loss": 0.0283, + "step": 4250 + }, + { + "epoch": 0.7679826933477556, + "grad_norm": 0.07456755638122559, + "learning_rate": 9.987967185178677e-05, + "loss": 0.0278, + "step": 4260 + }, + { + "epoch": 0.7697854696232198, + "grad_norm": 0.09168928116559982, + "learning_rate": 9.987775355521476e-05, + "loss": 0.0255, + "step": 4270 + }, + { + "epoch": 0.771588245898684, + "grad_norm": 0.09599215537309647, + "learning_rate": 9.987582010709817e-05, + "loss": 0.0266, + "step": 4280 + }, + { + "epoch": 0.7733910221741482, + "grad_norm": 0.05535329133272171, + "learning_rate": 9.987387150802431e-05, + "loss": 0.0268, + "step": 4290 + }, + { + "epoch": 0.7751937984496124, + "grad_norm": 0.05708804354071617, + "learning_rate": 9.987190775858517e-05, + "loss": 0.0267, + "step": 4300 + }, + { + "epoch": 0.7769965747250767, + "grad_norm": 0.06368128955364227, + "learning_rate": 9.98699288593772e-05, + "loss": 0.0253, + "step": 4310 + }, + { + "epoch": 0.7787993510005409, + "grad_norm": 0.07643580436706543, + "learning_rate": 9.986793481100161e-05, + "loss": 0.0267, + "step": 4320 + }, + { + "epoch": 0.7806021272760051, + "grad_norm": 0.0521041564643383, + "learning_rate": 9.986592561406412e-05, + "loss": 0.0267, + "step": 4330 + }, + { + "epoch": 0.7824049035514693, + "grad_norm": 0.06993807852268219, + "learning_rate": 9.986390126917503e-05, + "loss": 0.0265, + "step": 4340 + }, + { + "epoch": 0.7842076798269335, + "grad_norm": 0.0634775161743164, + "learning_rate": 9.986186177694933e-05, + "loss": 0.0257, + "step": 4350 + }, + { + "epoch": 0.7860104561023977, + "grad_norm": 0.061332568526268005, + "learning_rate": 9.985980713800656e-05, + "loss": 0.027, + "step": 4360 + }, + { + "epoch": 0.7878132323778619, + "grad_norm": 0.05653019994497299, + "learning_rate": 9.985773735297084e-05, + "loss": 0.0279, + "step": 4370 + }, + { + "epoch": 0.7896160086533262, + "grad_norm": 0.062008339911699295, + "learning_rate": 9.985565242247092e-05, + "loss": 0.0253, + "step": 4380 + }, + { + "epoch": 0.7914187849287904, + "grad_norm": 0.08481565862894058, + "learning_rate": 9.985355234714016e-05, + "loss": 0.0266, + "step": 4390 + }, + { + "epoch": 0.7932215612042546, + "grad_norm": 0.07227197289466858, + "learning_rate": 9.985143712761652e-05, + "loss": 0.0251, + "step": 4400 + }, + { + "epoch": 0.7950243374797188, + "grad_norm": 0.06665899604558945, + "learning_rate": 9.984930676454252e-05, + "loss": 0.0266, + "step": 4410 + }, + { + "epoch": 0.796827113755183, + "grad_norm": 0.07347960770130157, + "learning_rate": 9.984716125856532e-05, + "loss": 0.0273, + "step": 4420 + }, + { + "epoch": 0.7986298900306472, + "grad_norm": 0.0625268965959549, + "learning_rate": 9.984500061033667e-05, + "loss": 0.0254, + "step": 4430 + }, + { + "epoch": 0.8004326663061114, + "grad_norm": 0.07932467758655548, + "learning_rate": 9.984282482051293e-05, + "loss": 0.0273, + "step": 4440 + }, + { + "epoch": 0.8022354425815756, + "grad_norm": 0.10024110227823257, + "learning_rate": 9.9840633889755e-05, + "loss": 0.0264, + "step": 4450 + }, + { + "epoch": 0.8040382188570399, + "grad_norm": 0.07264421135187149, + "learning_rate": 9.983842781872848e-05, + "loss": 0.0277, + "step": 4460 + }, + { + "epoch": 0.8058409951325041, + "grad_norm": 0.05864323303103447, + "learning_rate": 9.98362066081035e-05, + "loss": 0.0253, + "step": 4470 + }, + { + "epoch": 0.8076437714079683, + "grad_norm": 0.07349126785993576, + "learning_rate": 9.983397025855479e-05, + "loss": 0.0249, + "step": 4480 + }, + { + "epoch": 0.8094465476834325, + "grad_norm": 0.06676533818244934, + "learning_rate": 9.983171877076171e-05, + "loss": 0.0236, + "step": 4490 + }, + { + "epoch": 0.8112493239588967, + "grad_norm": 0.08150319010019302, + "learning_rate": 9.98294521454082e-05, + "loss": 0.027, + "step": 4500 + }, + { + "epoch": 0.8130521002343609, + "grad_norm": 0.07100415974855423, + "learning_rate": 9.98271703831828e-05, + "loss": 0.0257, + "step": 4510 + }, + { + "epoch": 0.8148548765098251, + "grad_norm": 0.060749687254428864, + "learning_rate": 9.982487348477865e-05, + "loss": 0.0248, + "step": 4520 + }, + { + "epoch": 0.8166576527852893, + "grad_norm": 0.07577101141214371, + "learning_rate": 9.982256145089347e-05, + "loss": 0.0244, + "step": 4530 + }, + { + "epoch": 0.8184604290607536, + "grad_norm": 0.05672326311469078, + "learning_rate": 9.982023428222962e-05, + "loss": 0.0255, + "step": 4540 + }, + { + "epoch": 0.8202632053362178, + "grad_norm": 0.0860847607254982, + "learning_rate": 9.981789197949403e-05, + "loss": 0.0257, + "step": 4550 + }, + { + "epoch": 0.822065981611682, + "grad_norm": 0.06932077556848526, + "learning_rate": 9.98155345433982e-05, + "loss": 0.0255, + "step": 4560 + }, + { + "epoch": 0.8238687578871462, + "grad_norm": 0.06181086599826813, + "learning_rate": 9.981316197465831e-05, + "loss": 0.0249, + "step": 4570 + }, + { + "epoch": 0.8256715341626104, + "grad_norm": 0.06864579766988754, + "learning_rate": 9.981077427399504e-05, + "loss": 0.0252, + "step": 4580 + }, + { + "epoch": 0.8274743104380746, + "grad_norm": 0.08584438264369965, + "learning_rate": 9.980837144213371e-05, + "loss": 0.0267, + "step": 4590 + }, + { + "epoch": 0.8292770867135388, + "grad_norm": 0.07758577913045883, + "learning_rate": 9.980595347980426e-05, + "loss": 0.0255, + "step": 4600 + }, + { + "epoch": 0.831079862989003, + "grad_norm": 0.06732701510190964, + "learning_rate": 9.980352038774119e-05, + "loss": 0.0265, + "step": 4610 + }, + { + "epoch": 0.8328826392644673, + "grad_norm": 0.07011505216360092, + "learning_rate": 9.98010721666836e-05, + "loss": 0.0265, + "step": 4620 + }, + { + "epoch": 0.8346854155399315, + "grad_norm": 0.07638919353485107, + "learning_rate": 9.979860881737523e-05, + "loss": 0.0254, + "step": 4630 + }, + { + "epoch": 0.8364881918153957, + "grad_norm": 0.07176736742258072, + "learning_rate": 9.979613034056434e-05, + "loss": 0.0258, + "step": 4640 + }, + { + "epoch": 0.8382909680908599, + "grad_norm": 0.06857623904943466, + "learning_rate": 9.979363673700386e-05, + "loss": 0.0251, + "step": 4650 + }, + { + "epoch": 0.8400937443663241, + "grad_norm": 0.07225841283798218, + "learning_rate": 9.979112800745124e-05, + "loss": 0.0282, + "step": 4660 + }, + { + "epoch": 0.8418965206417883, + "grad_norm": 0.0744149386882782, + "learning_rate": 9.978860415266861e-05, + "loss": 0.0262, + "step": 4670 + }, + { + "epoch": 0.8436992969172525, + "grad_norm": 0.06240631267428398, + "learning_rate": 9.978606517342262e-05, + "loss": 0.0266, + "step": 4680 + }, + { + "epoch": 0.8455020731927168, + "grad_norm": 0.05723218247294426, + "learning_rate": 9.978351107048456e-05, + "loss": 0.0253, + "step": 4690 + }, + { + "epoch": 0.847304849468181, + "grad_norm": 0.07528171688318253, + "learning_rate": 9.978094184463029e-05, + "loss": 0.0252, + "step": 4700 + }, + { + "epoch": 0.8491076257436452, + "grad_norm": 0.09852490574121475, + "learning_rate": 9.977835749664029e-05, + "loss": 0.0258, + "step": 4710 + }, + { + "epoch": 0.8509104020191094, + "grad_norm": 0.10970467329025269, + "learning_rate": 9.97757580272996e-05, + "loss": 0.0285, + "step": 4720 + }, + { + "epoch": 0.8527131782945736, + "grad_norm": 0.10292208939790726, + "learning_rate": 9.977314343739786e-05, + "loss": 0.0275, + "step": 4730 + }, + { + "epoch": 0.8545159545700378, + "grad_norm": 0.06727108359336853, + "learning_rate": 9.977051372772934e-05, + "loss": 0.0256, + "step": 4740 + }, + { + "epoch": 0.856318730845502, + "grad_norm": 0.06983084976673126, + "learning_rate": 9.976786889909286e-05, + "loss": 0.026, + "step": 4750 + }, + { + "epoch": 0.8581215071209662, + "grad_norm": 0.07255283743143082, + "learning_rate": 9.976520895229185e-05, + "loss": 0.0259, + "step": 4760 + }, + { + "epoch": 0.8599242833964305, + "grad_norm": 0.06273597478866577, + "learning_rate": 9.976253388813433e-05, + "loss": 0.0243, + "step": 4770 + }, + { + "epoch": 0.8617270596718947, + "grad_norm": 0.06409291923046112, + "learning_rate": 9.975984370743293e-05, + "loss": 0.0258, + "step": 4780 + }, + { + "epoch": 0.8635298359473589, + "grad_norm": 0.053686097264289856, + "learning_rate": 9.975713841100485e-05, + "loss": 0.0248, + "step": 4790 + }, + { + "epoch": 0.8653326122228231, + "grad_norm": 0.08964944630861282, + "learning_rate": 9.975441799967187e-05, + "loss": 0.0244, + "step": 4800 + }, + { + "epoch": 0.8671353884982873, + "grad_norm": 0.07845624536275864, + "learning_rate": 9.975168247426039e-05, + "loss": 0.025, + "step": 4810 + }, + { + "epoch": 0.8689381647737516, + "grad_norm": 0.1253553330898285, + "learning_rate": 9.974893183560139e-05, + "loss": 0.0255, + "step": 4820 + }, + { + "epoch": 0.8707409410492158, + "grad_norm": 0.07251527160406113, + "learning_rate": 9.974616608453045e-05, + "loss": 0.0257, + "step": 4830 + }, + { + "epoch": 0.8725437173246801, + "grad_norm": 0.09129400551319122, + "learning_rate": 9.974338522188772e-05, + "loss": 0.0264, + "step": 4840 + }, + { + "epoch": 0.8743464936001443, + "grad_norm": 0.0834803432226181, + "learning_rate": 9.974058924851797e-05, + "loss": 0.0262, + "step": 4850 + }, + { + "epoch": 0.8761492698756085, + "grad_norm": 0.06712009757757187, + "learning_rate": 9.973777816527051e-05, + "loss": 0.0244, + "step": 4860 + }, + { + "epoch": 0.8779520461510727, + "grad_norm": 0.05435571447014809, + "learning_rate": 9.973495197299931e-05, + "loss": 0.0245, + "step": 4870 + }, + { + "epoch": 0.8797548224265369, + "grad_norm": 0.07041464000940323, + "learning_rate": 9.973211067256287e-05, + "loss": 0.0264, + "step": 4880 + }, + { + "epoch": 0.8815575987020011, + "grad_norm": 0.06801621615886688, + "learning_rate": 9.97292542648243e-05, + "loss": 0.025, + "step": 4890 + }, + { + "epoch": 0.8833603749774653, + "grad_norm": 0.06179453060030937, + "learning_rate": 9.972638275065131e-05, + "loss": 0.0231, + "step": 4900 + }, + { + "epoch": 0.8851631512529295, + "grad_norm": 0.08400911837816238, + "learning_rate": 9.972349613091621e-05, + "loss": 0.0244, + "step": 4910 + }, + { + "epoch": 0.8869659275283938, + "grad_norm": 0.0753193274140358, + "learning_rate": 9.972059440649584e-05, + "loss": 0.0239, + "step": 4920 + }, + { + "epoch": 0.888768703803858, + "grad_norm": 0.06970420479774475, + "learning_rate": 9.971767757827168e-05, + "loss": 0.0259, + "step": 4930 + }, + { + "epoch": 0.8905714800793222, + "grad_norm": 0.07199428975582123, + "learning_rate": 9.971474564712982e-05, + "loss": 0.025, + "step": 4940 + }, + { + "epoch": 0.8923742563547864, + "grad_norm": 0.05893219634890556, + "learning_rate": 9.971179861396084e-05, + "loss": 0.0263, + "step": 4950 + }, + { + "epoch": 0.8941770326302506, + "grad_norm": 0.06161533296108246, + "learning_rate": 9.970883647966003e-05, + "loss": 0.0246, + "step": 4960 + }, + { + "epoch": 0.8959798089057148, + "grad_norm": 0.07222472131252289, + "learning_rate": 9.970585924512717e-05, + "loss": 0.0275, + "step": 4970 + }, + { + "epoch": 0.897782585181179, + "grad_norm": 0.07035624235868454, + "learning_rate": 9.970286691126669e-05, + "loss": 0.0253, + "step": 4980 + }, + { + "epoch": 0.8995853614566433, + "grad_norm": 0.07016517221927643, + "learning_rate": 9.969985947898756e-05, + "loss": 0.0231, + "step": 4990 + }, + { + "epoch": 0.9013881377321075, + "grad_norm": 0.06871749460697174, + "learning_rate": 9.969683694920337e-05, + "loss": 0.0264, + "step": 5000 + }, + { + "epoch": 0.9031909140075717, + "grad_norm": 0.06089041382074356, + "learning_rate": 9.969379932283228e-05, + "loss": 0.0245, + "step": 5010 + }, + { + "epoch": 0.9049936902830359, + "grad_norm": 0.0656936913728714, + "learning_rate": 9.969074660079704e-05, + "loss": 0.0251, + "step": 5020 + }, + { + "epoch": 0.9067964665585001, + "grad_norm": 0.0718400627374649, + "learning_rate": 9.968767878402501e-05, + "loss": 0.0258, + "step": 5030 + }, + { + "epoch": 0.9085992428339643, + "grad_norm": 0.06520472466945648, + "learning_rate": 9.968459587344808e-05, + "loss": 0.0241, + "step": 5040 + }, + { + "epoch": 0.9104020191094285, + "grad_norm": 0.06498883664608002, + "learning_rate": 9.968149787000278e-05, + "loss": 0.024, + "step": 5050 + }, + { + "epoch": 0.9122047953848927, + "grad_norm": 0.05527251213788986, + "learning_rate": 9.967838477463018e-05, + "loss": 0.0241, + "step": 5060 + }, + { + "epoch": 0.914007571660357, + "grad_norm": 0.05390998721122742, + "learning_rate": 9.967525658827597e-05, + "loss": 0.0238, + "step": 5070 + }, + { + "epoch": 0.9158103479358212, + "grad_norm": 0.06456433236598969, + "learning_rate": 9.967211331189042e-05, + "loss": 0.0222, + "step": 5080 + }, + { + "epoch": 0.9176131242112854, + "grad_norm": 0.07540162652730942, + "learning_rate": 9.966895494642834e-05, + "loss": 0.0256, + "step": 5090 + }, + { + "epoch": 0.9194159004867496, + "grad_norm": 0.0686672255396843, + "learning_rate": 9.96657814928492e-05, + "loss": 0.0256, + "step": 5100 + }, + { + "epoch": 0.9212186767622138, + "grad_norm": 0.06787993013858795, + "learning_rate": 9.966259295211697e-05, + "loss": 0.0273, + "step": 5110 + }, + { + "epoch": 0.923021453037678, + "grad_norm": 0.07052889466285706, + "learning_rate": 9.965938932520028e-05, + "loss": 0.024, + "step": 5120 + }, + { + "epoch": 0.9248242293131422, + "grad_norm": 0.07397779822349548, + "learning_rate": 9.965617061307229e-05, + "loss": 0.0259, + "step": 5130 + }, + { + "epoch": 0.9266270055886064, + "grad_norm": 0.050168476998806, + "learning_rate": 9.965293681671077e-05, + "loss": 0.0242, + "step": 5140 + }, + { + "epoch": 0.9284297818640707, + "grad_norm": 0.07748370617628098, + "learning_rate": 9.964968793709804e-05, + "loss": 0.0261, + "step": 5150 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.06576280295848846, + "learning_rate": 9.964642397522106e-05, + "loss": 0.0251, + "step": 5160 + }, + { + "epoch": 0.9320353344149991, + "grad_norm": 0.07645824551582336, + "learning_rate": 9.96431449320713e-05, + "loss": 0.026, + "step": 5170 + }, + { + "epoch": 0.9338381106904633, + "grad_norm": 0.05054342746734619, + "learning_rate": 9.963985080864486e-05, + "loss": 0.026, + "step": 5180 + }, + { + "epoch": 0.9356408869659275, + "grad_norm": 0.08960756659507751, + "learning_rate": 9.96365416059424e-05, + "loss": 0.0266, + "step": 5190 + }, + { + "epoch": 0.9374436632413917, + "grad_norm": 0.0542869046330452, + "learning_rate": 9.963321732496919e-05, + "loss": 0.0253, + "step": 5200 + }, + { + "epoch": 0.9392464395168559, + "grad_norm": 0.07120395451784134, + "learning_rate": 9.962987796673506e-05, + "loss": 0.0242, + "step": 5210 + }, + { + "epoch": 0.9410492157923201, + "grad_norm": 0.08881355822086334, + "learning_rate": 9.962652353225438e-05, + "loss": 0.0266, + "step": 5220 + }, + { + "epoch": 0.9428519920677844, + "grad_norm": 0.0686221644282341, + "learning_rate": 9.962315402254619e-05, + "loss": 0.0248, + "step": 5230 + }, + { + "epoch": 0.9446547683432486, + "grad_norm": 0.04925746098160744, + "learning_rate": 9.9619769438634e-05, + "loss": 0.0236, + "step": 5240 + }, + { + "epoch": 0.9464575446187128, + "grad_norm": 0.072658970952034, + "learning_rate": 9.9616369781546e-05, + "loss": 0.0245, + "step": 5250 + }, + { + "epoch": 0.948260320894177, + "grad_norm": 0.062444351613521576, + "learning_rate": 9.961295505231491e-05, + "loss": 0.0239, + "step": 5260 + }, + { + "epoch": 0.9500630971696412, + "grad_norm": 0.07348624616861343, + "learning_rate": 9.960952525197804e-05, + "loss": 0.0234, + "step": 5270 + }, + { + "epoch": 0.9518658734451054, + "grad_norm": 0.08872135728597641, + "learning_rate": 9.960608038157724e-05, + "loss": 0.0261, + "step": 5280 + }, + { + "epoch": 0.9536686497205696, + "grad_norm": 0.06726624816656113, + "learning_rate": 9.960262044215901e-05, + "loss": 0.0265, + "step": 5290 + }, + { + "epoch": 0.9554714259960339, + "grad_norm": 0.07350218296051025, + "learning_rate": 9.959914543477435e-05, + "loss": 0.0254, + "step": 5300 + }, + { + "epoch": 0.9572742022714981, + "grad_norm": 0.08027441799640656, + "learning_rate": 9.959565536047892e-05, + "loss": 0.0265, + "step": 5310 + }, + { + "epoch": 0.9590769785469623, + "grad_norm": 0.05731677636504173, + "learning_rate": 9.959215022033288e-05, + "loss": 0.0232, + "step": 5320 + }, + { + "epoch": 0.9608797548224265, + "grad_norm": 0.07399944961071014, + "learning_rate": 9.9588630015401e-05, + "loss": 0.0246, + "step": 5330 + }, + { + "epoch": 0.9626825310978907, + "grad_norm": 0.054215654730796814, + "learning_rate": 9.958509474675264e-05, + "loss": 0.0243, + "step": 5340 + }, + { + "epoch": 0.9644853073733549, + "grad_norm": 0.07770421355962753, + "learning_rate": 9.958154441546171e-05, + "loss": 0.0249, + "step": 5350 + }, + { + "epoch": 0.9662880836488191, + "grad_norm": 0.062185995280742645, + "learning_rate": 9.957797902260673e-05, + "loss": 0.0246, + "step": 5360 + }, + { + "epoch": 0.9680908599242833, + "grad_norm": 0.05245919153094292, + "learning_rate": 9.957439856927073e-05, + "loss": 0.0249, + "step": 5370 + }, + { + "epoch": 0.9698936361997477, + "grad_norm": 0.0618879459798336, + "learning_rate": 9.957080305654139e-05, + "loss": 0.0232, + "step": 5380 + }, + { + "epoch": 0.9716964124752119, + "grad_norm": 0.08252725005149841, + "learning_rate": 9.956719248551092e-05, + "loss": 0.0266, + "step": 5390 + }, + { + "epoch": 0.9734991887506761, + "grad_norm": 0.057428184896707535, + "learning_rate": 9.956356685727612e-05, + "loss": 0.0243, + "step": 5400 + }, + { + "epoch": 0.9753019650261403, + "grad_norm": 0.06664905697107315, + "learning_rate": 9.955992617293836e-05, + "loss": 0.0252, + "step": 5410 + }, + { + "epoch": 0.9771047413016045, + "grad_norm": 0.05850714445114136, + "learning_rate": 9.955627043360358e-05, + "loss": 0.0241, + "step": 5420 + }, + { + "epoch": 0.9789075175770687, + "grad_norm": 0.056620508432388306, + "learning_rate": 9.955259964038231e-05, + "loss": 0.0234, + "step": 5430 + }, + { + "epoch": 0.9807102938525329, + "grad_norm": 0.07022842019796371, + "learning_rate": 9.954891379438962e-05, + "loss": 0.0233, + "step": 5440 + }, + { + "epoch": 0.9825130701279972, + "grad_norm": 0.07692418247461319, + "learning_rate": 9.954521289674519e-05, + "loss": 0.0257, + "step": 5450 + }, + { + "epoch": 0.9843158464034614, + "grad_norm": 0.07468161731958389, + "learning_rate": 9.954149694857325e-05, + "loss": 0.0254, + "step": 5460 + }, + { + "epoch": 0.9861186226789256, + "grad_norm": 0.06700682640075684, + "learning_rate": 9.953776595100258e-05, + "loss": 0.0223, + "step": 5470 + }, + { + "epoch": 0.9879213989543898, + "grad_norm": 0.06106293573975563, + "learning_rate": 9.95340199051666e-05, + "loss": 0.0253, + "step": 5480 + }, + { + "epoch": 0.989724175229854, + "grad_norm": 0.061947569251060486, + "learning_rate": 9.953025881220325e-05, + "loss": 0.0241, + "step": 5490 + }, + { + "epoch": 0.9915269515053182, + "grad_norm": 0.06343217939138412, + "learning_rate": 9.952648267325504e-05, + "loss": 0.024, + "step": 5500 + }, + { + "epoch": 0.9933297277807824, + "grad_norm": 0.07381337136030197, + "learning_rate": 9.952269148946905e-05, + "loss": 0.0261, + "step": 5510 + }, + { + "epoch": 0.9951325040562466, + "grad_norm": 0.09609223902225494, + "learning_rate": 9.951888526199697e-05, + "loss": 0.0242, + "step": 5520 + }, + { + "epoch": 0.9969352803317109, + "grad_norm": 0.07787443697452545, + "learning_rate": 9.951506399199501e-05, + "loss": 0.0253, + "step": 5530 + }, + { + "epoch": 0.9987380566071751, + "grad_norm": 0.05880453437566757, + "learning_rate": 9.951122768062399e-05, + "loss": 0.0252, + "step": 5540 + }, + { + "epoch": 1.0005408328826393, + "grad_norm": 0.08632911741733551, + "learning_rate": 9.950737632904927e-05, + "loss": 0.0246, + "step": 5550 + }, + { + "epoch": 1.0023436091581035, + "grad_norm": 0.06685912609100342, + "learning_rate": 9.950350993844077e-05, + "loss": 0.026, + "step": 5560 + }, + { + "epoch": 1.0041463854335677, + "grad_norm": 0.059555165469646454, + "learning_rate": 9.949962850997303e-05, + "loss": 0.0244, + "step": 5570 + }, + { + "epoch": 1.005949161709032, + "grad_norm": 0.07208523899316788, + "learning_rate": 9.949573204482512e-05, + "loss": 0.0252, + "step": 5580 + }, + { + "epoch": 1.0077519379844961, + "grad_norm": 0.062134839594364166, + "learning_rate": 9.949182054418064e-05, + "loss": 0.023, + "step": 5590 + }, + { + "epoch": 1.0095547142599604, + "grad_norm": 0.07274772226810455, + "learning_rate": 9.948789400922787e-05, + "loss": 0.0229, + "step": 5600 + }, + { + "epoch": 1.0113574905354246, + "grad_norm": 0.05015425384044647, + "learning_rate": 9.948395244115953e-05, + "loss": 0.0239, + "step": 5610 + }, + { + "epoch": 1.0131602668108888, + "grad_norm": 0.06573361903429031, + "learning_rate": 9.9479995841173e-05, + "loss": 0.024, + "step": 5620 + }, + { + "epoch": 1.014963043086353, + "grad_norm": 0.04703957214951515, + "learning_rate": 9.947602421047017e-05, + "loss": 0.0242, + "step": 5630 + }, + { + "epoch": 1.0167658193618172, + "grad_norm": 0.05187300965189934, + "learning_rate": 9.947203755025753e-05, + "loss": 0.024, + "step": 5640 + }, + { + "epoch": 1.0185685956372814, + "grad_norm": 0.06612180918455124, + "learning_rate": 9.946803586174611e-05, + "loss": 0.0232, + "step": 5650 + }, + { + "epoch": 1.0203713719127456, + "grad_norm": 0.05845790356397629, + "learning_rate": 9.946401914615151e-05, + "loss": 0.0236, + "step": 5660 + }, + { + "epoch": 1.0221741481882098, + "grad_norm": 0.07651075720787048, + "learning_rate": 9.945998740469394e-05, + "loss": 0.0232, + "step": 5670 + }, + { + "epoch": 1.023976924463674, + "grad_norm": 0.07721281796693802, + "learning_rate": 9.945594063859809e-05, + "loss": 0.0247, + "step": 5680 + }, + { + "epoch": 1.0257797007391383, + "grad_norm": 0.09050562977790833, + "learning_rate": 9.94518788490933e-05, + "loss": 0.0263, + "step": 5690 + }, + { + "epoch": 1.0275824770146025, + "grad_norm": 0.05043366178870201, + "learning_rate": 9.944780203741341e-05, + "loss": 0.0236, + "step": 5700 + }, + { + "epoch": 1.0293852532900667, + "grad_norm": 0.07070080190896988, + "learning_rate": 9.944371020479686e-05, + "loss": 0.0249, + "step": 5710 + }, + { + "epoch": 1.031188029565531, + "grad_norm": 0.0642051249742508, + "learning_rate": 9.943960335248662e-05, + "loss": 0.026, + "step": 5720 + }, + { + "epoch": 1.0329908058409951, + "grad_norm": 0.05914174020290375, + "learning_rate": 9.943548148173027e-05, + "loss": 0.0242, + "step": 5730 + }, + { + "epoch": 1.0347935821164593, + "grad_norm": 0.053341593593358994, + "learning_rate": 9.943134459377992e-05, + "loss": 0.0249, + "step": 5740 + }, + { + "epoch": 1.0365963583919235, + "grad_norm": 0.05190033093094826, + "learning_rate": 9.942719268989222e-05, + "loss": 0.0217, + "step": 5750 + }, + { + "epoch": 1.0383991346673878, + "grad_norm": 0.06284834444522858, + "learning_rate": 9.942302577132844e-05, + "loss": 0.0229, + "step": 5760 + }, + { + "epoch": 1.040201910942852, + "grad_norm": 0.06831235438585281, + "learning_rate": 9.941884383935438e-05, + "loss": 0.0224, + "step": 5770 + }, + { + "epoch": 1.0420046872183162, + "grad_norm": 0.056083980947732925, + "learning_rate": 9.941464689524039e-05, + "loss": 0.0226, + "step": 5780 + }, + { + "epoch": 1.0438074634937804, + "grad_norm": 0.05605960264801979, + "learning_rate": 9.941043494026139e-05, + "loss": 0.0237, + "step": 5790 + }, + { + "epoch": 1.0456102397692446, + "grad_norm": 0.07108170539140701, + "learning_rate": 9.940620797569685e-05, + "loss": 0.0225, + "step": 5800 + }, + { + "epoch": 1.0474130160447088, + "grad_norm": 0.054758355021476746, + "learning_rate": 9.940196600283082e-05, + "loss": 0.0248, + "step": 5810 + }, + { + "epoch": 1.049215792320173, + "grad_norm": 0.04980951175093651, + "learning_rate": 9.939770902295192e-05, + "loss": 0.0241, + "step": 5820 + }, + { + "epoch": 1.0510185685956372, + "grad_norm": 0.05114983767271042, + "learning_rate": 9.939343703735329e-05, + "loss": 0.0229, + "step": 5830 + }, + { + "epoch": 1.0528213448711015, + "grad_norm": 0.050313256680965424, + "learning_rate": 9.938915004733264e-05, + "loss": 0.0224, + "step": 5840 + }, + { + "epoch": 1.0546241211465657, + "grad_norm": 0.07128613442182541, + "learning_rate": 9.938484805419224e-05, + "loss": 0.026, + "step": 5850 + }, + { + "epoch": 1.0564268974220299, + "grad_norm": 0.05058193579316139, + "learning_rate": 9.938053105923894e-05, + "loss": 0.0234, + "step": 5860 + }, + { + "epoch": 1.058229673697494, + "grad_norm": 0.04647611081600189, + "learning_rate": 9.937619906378413e-05, + "loss": 0.0221, + "step": 5870 + }, + { + "epoch": 1.0600324499729583, + "grad_norm": 0.062001656740903854, + "learning_rate": 9.937185206914374e-05, + "loss": 0.0254, + "step": 5880 + }, + { + "epoch": 1.0618352262484225, + "grad_norm": 0.07533343136310577, + "learning_rate": 9.936749007663829e-05, + "loss": 0.0251, + "step": 5890 + }, + { + "epoch": 1.0636380025238867, + "grad_norm": 0.06452707201242447, + "learning_rate": 9.93631130875928e-05, + "loss": 0.0242, + "step": 5900 + }, + { + "epoch": 1.065440778799351, + "grad_norm": 0.07372333109378815, + "learning_rate": 9.935872110333692e-05, + "loss": 0.0246, + "step": 5910 + }, + { + "epoch": 1.0672435550748152, + "grad_norm": 0.07187105715274811, + "learning_rate": 9.935431412520484e-05, + "loss": 0.0257, + "step": 5920 + }, + { + "epoch": 1.0690463313502794, + "grad_norm": 0.07282919436693192, + "learning_rate": 9.934989215453523e-05, + "loss": 0.0238, + "step": 5930 + }, + { + "epoch": 1.0708491076257436, + "grad_norm": 0.05027014762163162, + "learning_rate": 9.934545519267139e-05, + "loss": 0.023, + "step": 5940 + }, + { + "epoch": 1.0726518839012078, + "grad_norm": 0.05004817992448807, + "learning_rate": 9.934100324096117e-05, + "loss": 0.0235, + "step": 5950 + }, + { + "epoch": 1.074454660176672, + "grad_norm": 0.06548458337783813, + "learning_rate": 9.933653630075692e-05, + "loss": 0.0239, + "step": 5960 + }, + { + "epoch": 1.0762574364521362, + "grad_norm": 0.06985263526439667, + "learning_rate": 9.93320543734156e-05, + "loss": 0.0225, + "step": 5970 + }, + { + "epoch": 1.0780602127276004, + "grad_norm": 0.07312323898077011, + "learning_rate": 9.932755746029871e-05, + "loss": 0.0249, + "step": 5980 + }, + { + "epoch": 1.0798629890030647, + "grad_norm": 0.07657743245363235, + "learning_rate": 9.932304556277228e-05, + "loss": 0.0252, + "step": 5990 + }, + { + "epoch": 1.0816657652785289, + "grad_norm": 0.05853865295648575, + "learning_rate": 9.93185186822069e-05, + "loss": 0.0238, + "step": 6000 + }, + { + "epoch": 1.083468541553993, + "grad_norm": 0.08346471935510635, + "learning_rate": 9.931397681997773e-05, + "loss": 0.0249, + "step": 6010 + }, + { + "epoch": 1.0852713178294573, + "grad_norm": 0.07303805649280548, + "learning_rate": 9.930941997746446e-05, + "loss": 0.0246, + "step": 6020 + }, + { + "epoch": 1.0870740941049215, + "grad_norm": 0.06027507409453392, + "learning_rate": 9.930484815605134e-05, + "loss": 0.0239, + "step": 6030 + }, + { + "epoch": 1.0888768703803857, + "grad_norm": 0.056215740740299225, + "learning_rate": 9.930026135712717e-05, + "loss": 0.0227, + "step": 6040 + }, + { + "epoch": 1.09067964665585, + "grad_norm": 0.10820568352937698, + "learning_rate": 9.92956595820853e-05, + "loss": 0.0247, + "step": 6050 + }, + { + "epoch": 1.0924824229313141, + "grad_norm": 0.07962211221456528, + "learning_rate": 9.929104283232362e-05, + "loss": 0.022, + "step": 6060 + }, + { + "epoch": 1.0942851992067784, + "grad_norm": 0.05570071190595627, + "learning_rate": 9.92864111092446e-05, + "loss": 0.0249, + "step": 6070 + }, + { + "epoch": 1.0960879754822426, + "grad_norm": 0.0750889927148819, + "learning_rate": 9.92817644142552e-05, + "loss": 0.0235, + "step": 6080 + }, + { + "epoch": 1.0978907517577068, + "grad_norm": 0.0528804287314415, + "learning_rate": 9.927710274876698e-05, + "loss": 0.0228, + "step": 6090 + }, + { + "epoch": 1.099693528033171, + "grad_norm": 0.05412602052092552, + "learning_rate": 9.927242611419603e-05, + "loss": 0.0239, + "step": 6100 + }, + { + "epoch": 1.1014963043086352, + "grad_norm": 0.07128166407346725, + "learning_rate": 9.926773451196301e-05, + "loss": 0.0235, + "step": 6110 + }, + { + "epoch": 1.1032990805840994, + "grad_norm": 0.07863874733448029, + "learning_rate": 9.926302794349306e-05, + "loss": 0.0223, + "step": 6120 + }, + { + "epoch": 1.1051018568595636, + "grad_norm": 0.06284967064857483, + "learning_rate": 9.925830641021594e-05, + "loss": 0.0222, + "step": 6130 + }, + { + "epoch": 1.1069046331350278, + "grad_norm": 0.06793460994958878, + "learning_rate": 9.925356991356593e-05, + "loss": 0.0228, + "step": 6140 + }, + { + "epoch": 1.108707409410492, + "grad_norm": 0.07556314766407013, + "learning_rate": 9.924881845498184e-05, + "loss": 0.0242, + "step": 6150 + }, + { + "epoch": 1.1105101856859563, + "grad_norm": 0.05997014045715332, + "learning_rate": 9.924405203590705e-05, + "loss": 0.0228, + "step": 6160 + }, + { + "epoch": 1.1123129619614205, + "grad_norm": 0.10712309926748276, + "learning_rate": 9.923927065778946e-05, + "loss": 0.0239, + "step": 6170 + }, + { + "epoch": 1.1141157382368847, + "grad_norm": 0.06244373694062233, + "learning_rate": 9.923447432208154e-05, + "loss": 0.0233, + "step": 6180 + }, + { + "epoch": 1.115918514512349, + "grad_norm": 0.058667201548814774, + "learning_rate": 9.922966303024027e-05, + "loss": 0.0241, + "step": 6190 + }, + { + "epoch": 1.1177212907878133, + "grad_norm": 0.09554725885391235, + "learning_rate": 9.922483678372721e-05, + "loss": 0.0231, + "step": 6200 + }, + { + "epoch": 1.1195240670632773, + "grad_norm": 0.049072351306676865, + "learning_rate": 9.921999558400845e-05, + "loss": 0.0213, + "step": 6210 + }, + { + "epoch": 1.1213268433387418, + "grad_norm": 0.07896696031093597, + "learning_rate": 9.92151394325546e-05, + "loss": 0.0217, + "step": 6220 + }, + { + "epoch": 1.1231296196142058, + "grad_norm": 0.08703551441431046, + "learning_rate": 9.921026833084084e-05, + "loss": 0.0232, + "step": 6230 + }, + { + "epoch": 1.1249323958896702, + "grad_norm": 0.052096832543611526, + "learning_rate": 9.920538228034689e-05, + "loss": 0.0232, + "step": 6240 + }, + { + "epoch": 1.1267351721651342, + "grad_norm": 0.06702762097120285, + "learning_rate": 9.920048128255699e-05, + "loss": 0.0246, + "step": 6250 + }, + { + "epoch": 1.1285379484405986, + "grad_norm": 0.04883360490202904, + "learning_rate": 9.919556533895995e-05, + "loss": 0.023, + "step": 6260 + }, + { + "epoch": 1.1303407247160626, + "grad_norm": 0.05246064066886902, + "learning_rate": 9.919063445104907e-05, + "loss": 0.0225, + "step": 6270 + }, + { + "epoch": 1.132143500991527, + "grad_norm": 0.07630882412195206, + "learning_rate": 9.918568862032227e-05, + "loss": 0.0221, + "step": 6280 + }, + { + "epoch": 1.133946277266991, + "grad_norm": 0.0659480169415474, + "learning_rate": 9.918072784828194e-05, + "loss": 0.0226, + "step": 6290 + }, + { + "epoch": 1.1357490535424555, + "grad_norm": 0.08457189053297043, + "learning_rate": 9.917575213643501e-05, + "loss": 0.0227, + "step": 6300 + }, + { + "epoch": 1.1375518298179197, + "grad_norm": 0.06392435729503632, + "learning_rate": 9.917076148629302e-05, + "loss": 0.0216, + "step": 6310 + }, + { + "epoch": 1.139354606093384, + "grad_norm": 0.07318608462810516, + "learning_rate": 9.916575589937196e-05, + "loss": 0.0228, + "step": 6320 + }, + { + "epoch": 1.1411573823688481, + "grad_norm": 0.07366244494915009, + "learning_rate": 9.916073537719239e-05, + "loss": 0.0244, + "step": 6330 + }, + { + "epoch": 1.1429601586443123, + "grad_norm": 0.04482867568731308, + "learning_rate": 9.915569992127944e-05, + "loss": 0.0229, + "step": 6340 + }, + { + "epoch": 1.1447629349197765, + "grad_norm": 0.054469019174575806, + "learning_rate": 9.915064953316273e-05, + "loss": 0.0233, + "step": 6350 + }, + { + "epoch": 1.1465657111952408, + "grad_norm": 0.07274210453033447, + "learning_rate": 9.914558421437645e-05, + "loss": 0.0214, + "step": 6360 + }, + { + "epoch": 1.148368487470705, + "grad_norm": 0.06188854202628136, + "learning_rate": 9.914050396645929e-05, + "loss": 0.0231, + "step": 6370 + }, + { + "epoch": 1.1501712637461692, + "grad_norm": 0.051505692303180695, + "learning_rate": 9.913540879095452e-05, + "loss": 0.0235, + "step": 6380 + }, + { + "epoch": 1.1519740400216334, + "grad_norm": 0.047527775168418884, + "learning_rate": 9.913029868940987e-05, + "loss": 0.0217, + "step": 6390 + }, + { + "epoch": 1.1537768162970976, + "grad_norm": 0.05871560424566269, + "learning_rate": 9.912517366337772e-05, + "loss": 0.0229, + "step": 6400 + }, + { + "epoch": 1.1555795925725618, + "grad_norm": 0.08393415808677673, + "learning_rate": 9.912003371441487e-05, + "loss": 0.0237, + "step": 6410 + }, + { + "epoch": 1.157382368848026, + "grad_norm": 0.07711070030927658, + "learning_rate": 9.911487884408271e-05, + "loss": 0.0235, + "step": 6420 + }, + { + "epoch": 1.1591851451234902, + "grad_norm": 0.05195716768503189, + "learning_rate": 9.910970905394719e-05, + "loss": 0.0209, + "step": 6430 + }, + { + "epoch": 1.1609879213989545, + "grad_norm": 0.04700459539890289, + "learning_rate": 9.91045243455787e-05, + "loss": 0.0224, + "step": 6440 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 0.05725035071372986, + "learning_rate": 9.909932472055225e-05, + "loss": 0.0218, + "step": 6450 + }, + { + "epoch": 1.1645934739498829, + "grad_norm": 0.06793183088302612, + "learning_rate": 9.909411018044734e-05, + "loss": 0.0224, + "step": 6460 + }, + { + "epoch": 1.166396250225347, + "grad_norm": 0.05550156533718109, + "learning_rate": 9.908888072684802e-05, + "loss": 0.0239, + "step": 6470 + }, + { + "epoch": 1.1681990265008113, + "grad_norm": 0.06550948321819305, + "learning_rate": 9.908363636134285e-05, + "loss": 0.0228, + "step": 6480 + }, + { + "epoch": 1.1700018027762755, + "grad_norm": 0.08333262801170349, + "learning_rate": 9.907837708552493e-05, + "loss": 0.0231, + "step": 6490 + }, + { + "epoch": 1.1718045790517397, + "grad_norm": 0.0609498992562294, + "learning_rate": 9.90731029009919e-05, + "loss": 0.0208, + "step": 6500 + }, + { + "epoch": 1.173607355327204, + "grad_norm": 0.06906290352344513, + "learning_rate": 9.906781380934589e-05, + "loss": 0.0237, + "step": 6510 + }, + { + "epoch": 1.1754101316026682, + "grad_norm": 0.043959613889455795, + "learning_rate": 9.906250981219362e-05, + "loss": 0.0217, + "step": 6520 + }, + { + "epoch": 1.1772129078781324, + "grad_norm": 0.06650874763727188, + "learning_rate": 9.905719091114628e-05, + "loss": 0.0235, + "step": 6530 + }, + { + "epoch": 1.1790156841535966, + "grad_norm": 0.06053738668560982, + "learning_rate": 9.905185710781964e-05, + "loss": 0.0225, + "step": 6540 + }, + { + "epoch": 1.1808184604290608, + "grad_norm": 0.06357341259717941, + "learning_rate": 9.904650840383392e-05, + "loss": 0.026, + "step": 6550 + }, + { + "epoch": 1.182621236704525, + "grad_norm": 0.07409552484750748, + "learning_rate": 9.904114480081397e-05, + "loss": 0.0242, + "step": 6560 + }, + { + "epoch": 1.1844240129799892, + "grad_norm": 0.06452928483486176, + "learning_rate": 9.903576630038906e-05, + "loss": 0.0214, + "step": 6570 + }, + { + "epoch": 1.1862267892554534, + "grad_norm": 0.07412322610616684, + "learning_rate": 9.903037290419309e-05, + "loss": 0.0235, + "step": 6580 + }, + { + "epoch": 1.1880295655309177, + "grad_norm": 0.06648721545934677, + "learning_rate": 9.902496461386439e-05, + "loss": 0.0218, + "step": 6590 + }, + { + "epoch": 1.1898323418063819, + "grad_norm": 0.06602685898542404, + "learning_rate": 9.901954143104588e-05, + "loss": 0.0236, + "step": 6600 + }, + { + "epoch": 1.191635118081846, + "grad_norm": 0.08214517682790756, + "learning_rate": 9.901410335738496e-05, + "loss": 0.025, + "step": 6610 + }, + { + "epoch": 1.1934378943573103, + "grad_norm": 0.057222992181777954, + "learning_rate": 9.900865039453358e-05, + "loss": 0.0228, + "step": 6620 + }, + { + "epoch": 1.1952406706327745, + "grad_norm": 0.04528526961803436, + "learning_rate": 9.900318254414821e-05, + "loss": 0.023, + "step": 6630 + }, + { + "epoch": 1.1970434469082387, + "grad_norm": 0.0676465779542923, + "learning_rate": 9.899769980788985e-05, + "loss": 0.0211, + "step": 6640 + }, + { + "epoch": 1.198846223183703, + "grad_norm": 0.05191970616579056, + "learning_rate": 9.899220218742398e-05, + "loss": 0.0221, + "step": 6650 + }, + { + "epoch": 1.2006489994591671, + "grad_norm": 0.0540454238653183, + "learning_rate": 9.898668968442066e-05, + "loss": 0.0234, + "step": 6660 + }, + { + "epoch": 1.2024517757346314, + "grad_norm": 0.059702739119529724, + "learning_rate": 9.898116230055443e-05, + "loss": 0.0212, + "step": 6670 + }, + { + "epoch": 1.2042545520100956, + "grad_norm": 0.06443289667367935, + "learning_rate": 9.897562003750437e-05, + "loss": 0.0223, + "step": 6680 + }, + { + "epoch": 1.2060573282855598, + "grad_norm": 0.057754285633563995, + "learning_rate": 9.897006289695407e-05, + "loss": 0.0222, + "step": 6690 + }, + { + "epoch": 1.207860104561024, + "grad_norm": 0.056729335337877274, + "learning_rate": 9.896449088059164e-05, + "loss": 0.0237, + "step": 6700 + }, + { + "epoch": 1.2096628808364882, + "grad_norm": 0.06436558812856674, + "learning_rate": 9.89589039901097e-05, + "loss": 0.0233, + "step": 6710 + }, + { + "epoch": 1.2114656571119524, + "grad_norm": 0.06533046066761017, + "learning_rate": 9.895330222720542e-05, + "loss": 0.0234, + "step": 6720 + }, + { + "epoch": 1.2132684333874166, + "grad_norm": 0.054747357964515686, + "learning_rate": 9.894768559358047e-05, + "loss": 0.024, + "step": 6730 + }, + { + "epoch": 1.2150712096628808, + "grad_norm": 0.06782015413045883, + "learning_rate": 9.894205409094101e-05, + "loss": 0.0236, + "step": 6740 + }, + { + "epoch": 1.216873985938345, + "grad_norm": 0.09383301436901093, + "learning_rate": 9.893640772099777e-05, + "loss": 0.0252, + "step": 6750 + }, + { + "epoch": 1.2186767622138093, + "grad_norm": 0.05178625136613846, + "learning_rate": 9.893074648546595e-05, + "loss": 0.0233, + "step": 6760 + }, + { + "epoch": 1.2204795384892735, + "grad_norm": 0.08305904269218445, + "learning_rate": 9.892507038606528e-05, + "loss": 0.0232, + "step": 6770 + }, + { + "epoch": 1.2222823147647377, + "grad_norm": 0.06044602766633034, + "learning_rate": 9.891937942452003e-05, + "loss": 0.0236, + "step": 6780 + }, + { + "epoch": 1.224085091040202, + "grad_norm": 0.062212105840444565, + "learning_rate": 9.891367360255895e-05, + "loss": 0.0231, + "step": 6790 + }, + { + "epoch": 1.2258878673156661, + "grad_norm": 0.06624992191791534, + "learning_rate": 9.890795292191532e-05, + "loss": 0.0225, + "step": 6800 + }, + { + "epoch": 1.2276906435911303, + "grad_norm": 0.06972295790910721, + "learning_rate": 9.890221738432694e-05, + "loss": 0.0231, + "step": 6810 + }, + { + "epoch": 1.2294934198665946, + "grad_norm": 0.06491313129663467, + "learning_rate": 9.88964669915361e-05, + "loss": 0.0231, + "step": 6820 + }, + { + "epoch": 1.2312961961420588, + "grad_norm": 0.09444902837276459, + "learning_rate": 9.889070174528963e-05, + "loss": 0.0258, + "step": 6830 + }, + { + "epoch": 1.233098972417523, + "grad_norm": 0.05975182354450226, + "learning_rate": 9.888492164733883e-05, + "loss": 0.0234, + "step": 6840 + }, + { + "epoch": 1.2349017486929872, + "grad_norm": 0.07824773341417313, + "learning_rate": 9.88791266994396e-05, + "loss": 0.0233, + "step": 6850 + }, + { + "epoch": 1.2367045249684514, + "grad_norm": 0.06507127732038498, + "learning_rate": 9.887331690335223e-05, + "loss": 0.0253, + "step": 6860 + }, + { + "epoch": 1.2385073012439156, + "grad_norm": 0.06631851941347122, + "learning_rate": 9.886749226084163e-05, + "loss": 0.0216, + "step": 6870 + }, + { + "epoch": 1.2403100775193798, + "grad_norm": 0.055613912642002106, + "learning_rate": 9.886165277367714e-05, + "loss": 0.0239, + "step": 6880 + }, + { + "epoch": 1.242112853794844, + "grad_norm": 0.05648408085107803, + "learning_rate": 9.885579844363265e-05, + "loss": 0.022, + "step": 6890 + }, + { + "epoch": 1.2439156300703083, + "grad_norm": 0.05523424595594406, + "learning_rate": 9.884992927248656e-05, + "loss": 0.0218, + "step": 6900 + }, + { + "epoch": 1.2457184063457725, + "grad_norm": 0.065706267952919, + "learning_rate": 9.884404526202178e-05, + "loss": 0.0226, + "step": 6910 + }, + { + "epoch": 1.2475211826212367, + "grad_norm": 0.04601127281785011, + "learning_rate": 9.883814641402568e-05, + "loss": 0.0223, + "step": 6920 + }, + { + "epoch": 1.249323958896701, + "grad_norm": 0.06544718146324158, + "learning_rate": 9.88322327302902e-05, + "loss": 0.0229, + "step": 6930 + }, + { + "epoch": 1.251126735172165, + "grad_norm": 0.05945868045091629, + "learning_rate": 9.882630421261176e-05, + "loss": 0.0215, + "step": 6940 + }, + { + "epoch": 1.2529295114476293, + "grad_norm": 0.044865578413009644, + "learning_rate": 9.88203608627913e-05, + "loss": 0.0232, + "step": 6950 + }, + { + "epoch": 1.2547322877230935, + "grad_norm": 0.06627567857503891, + "learning_rate": 9.881440268263422e-05, + "loss": 0.0209, + "step": 6960 + }, + { + "epoch": 1.2565350639985577, + "grad_norm": 0.05480595678091049, + "learning_rate": 9.880842967395048e-05, + "loss": 0.0243, + "step": 6970 + }, + { + "epoch": 1.258337840274022, + "grad_norm": 0.04346103221178055, + "learning_rate": 9.880244183855452e-05, + "loss": 0.0221, + "step": 6980 + }, + { + "epoch": 1.2601406165494862, + "grad_norm": 0.05963980033993721, + "learning_rate": 9.879643917826527e-05, + "loss": 0.0223, + "step": 6990 + }, + { + "epoch": 1.2619433928249504, + "grad_norm": 0.060017772018909454, + "learning_rate": 9.87904216949062e-05, + "loss": 0.0235, + "step": 7000 + }, + { + "epoch": 1.2637461691004146, + "grad_norm": 0.05154283344745636, + "learning_rate": 9.878438939030526e-05, + "loss": 0.0218, + "step": 7010 + }, + { + "epoch": 1.2655489453758788, + "grad_norm": 0.054108135402202606, + "learning_rate": 9.877834226629489e-05, + "loss": 0.0242, + "step": 7020 + }, + { + "epoch": 1.267351721651343, + "grad_norm": 0.07720384001731873, + "learning_rate": 9.877228032471206e-05, + "loss": 0.0229, + "step": 7030 + }, + { + "epoch": 1.2691544979268072, + "grad_norm": 0.05591970309615135, + "learning_rate": 9.876620356739823e-05, + "loss": 0.0229, + "step": 7040 + }, + { + "epoch": 1.2709572742022714, + "grad_norm": 0.07427584379911423, + "learning_rate": 9.876011199619935e-05, + "loss": 0.023, + "step": 7050 + }, + { + "epoch": 1.2727600504777357, + "grad_norm": 0.06238323450088501, + "learning_rate": 9.875400561296589e-05, + "loss": 0.023, + "step": 7060 + }, + { + "epoch": 1.2745628267531999, + "grad_norm": 0.06671527773141861, + "learning_rate": 9.874788441955278e-05, + "loss": 0.0217, + "step": 7070 + }, + { + "epoch": 1.276365603028664, + "grad_norm": 0.0695948377251625, + "learning_rate": 9.874174841781951e-05, + "loss": 0.0228, + "step": 7080 + }, + { + "epoch": 1.2781683793041283, + "grad_norm": 0.0653127059340477, + "learning_rate": 9.873559760963003e-05, + "loss": 0.0231, + "step": 7090 + }, + { + "epoch": 1.2799711555795925, + "grad_norm": 0.050567515194416046, + "learning_rate": 9.872943199685278e-05, + "loss": 0.022, + "step": 7100 + }, + { + "epoch": 1.2817739318550567, + "grad_norm": 0.06827295571565628, + "learning_rate": 9.872325158136071e-05, + "loss": 0.0247, + "step": 7110 + }, + { + "epoch": 1.283576708130521, + "grad_norm": 0.0659274309873581, + "learning_rate": 9.871705636503128e-05, + "loss": 0.0213, + "step": 7120 + }, + { + "epoch": 1.2853794844059852, + "grad_norm": 0.05960702523589134, + "learning_rate": 9.871084634974641e-05, + "loss": 0.0221, + "step": 7130 + }, + { + "epoch": 1.2871822606814494, + "grad_norm": 0.06706929951906204, + "learning_rate": 9.870462153739257e-05, + "loss": 0.0226, + "step": 7140 + }, + { + "epoch": 1.2889850369569136, + "grad_norm": 0.05996851623058319, + "learning_rate": 9.869838192986067e-05, + "loss": 0.0225, + "step": 7150 + }, + { + "epoch": 1.2907878132323778, + "grad_norm": 0.050380248576402664, + "learning_rate": 9.869212752904616e-05, + "loss": 0.021, + "step": 7160 + }, + { + "epoch": 1.292590589507842, + "grad_norm": 0.0626763105392456, + "learning_rate": 9.868585833684894e-05, + "loss": 0.0203, + "step": 7170 + }, + { + "epoch": 1.2943933657833062, + "grad_norm": 0.06992917507886887, + "learning_rate": 9.867957435517342e-05, + "loss": 0.0222, + "step": 7180 + }, + { + "epoch": 1.2961961420587704, + "grad_norm": 0.05216652899980545, + "learning_rate": 9.867327558592854e-05, + "loss": 0.0219, + "step": 7190 + }, + { + "epoch": 1.2979989183342346, + "grad_norm": 0.04966970160603523, + "learning_rate": 9.866696203102766e-05, + "loss": 0.0204, + "step": 7200 + }, + { + "epoch": 1.2998016946096989, + "grad_norm": 0.05236019566655159, + "learning_rate": 9.86606336923887e-05, + "loss": 0.0236, + "step": 7210 + }, + { + "epoch": 1.301604470885163, + "grad_norm": 0.10576257854700089, + "learning_rate": 9.865429057193403e-05, + "loss": 0.0236, + "step": 7220 + }, + { + "epoch": 1.3034072471606275, + "grad_norm": 0.07995510846376419, + "learning_rate": 9.864793267159053e-05, + "loss": 0.0208, + "step": 7230 + }, + { + "epoch": 1.3052100234360915, + "grad_norm": 0.054266635328531265, + "learning_rate": 9.864155999328957e-05, + "loss": 0.0234, + "step": 7240 + }, + { + "epoch": 1.307012799711556, + "grad_norm": 0.08857008814811707, + "learning_rate": 9.8635172538967e-05, + "loss": 0.0226, + "step": 7250 + }, + { + "epoch": 1.30881557598702, + "grad_norm": 0.04663212224841118, + "learning_rate": 9.862877031056312e-05, + "loss": 0.0216, + "step": 7260 + }, + { + "epoch": 1.3106183522624844, + "grad_norm": 0.0556655116379261, + "learning_rate": 9.862235331002279e-05, + "loss": 0.0208, + "step": 7270 + }, + { + "epoch": 1.3124211285379483, + "grad_norm": 0.061788491904735565, + "learning_rate": 9.861592153929533e-05, + "loss": 0.0206, + "step": 7280 + }, + { + "epoch": 1.3142239048134128, + "grad_norm": 0.05449859797954559, + "learning_rate": 9.860947500033455e-05, + "loss": 0.0213, + "step": 7290 + }, + { + "epoch": 1.3160266810888768, + "grad_norm": 0.07799364626407623, + "learning_rate": 9.86030136950987e-05, + "loss": 0.0229, + "step": 7300 + }, + { + "epoch": 1.3178294573643412, + "grad_norm": 0.06935471296310425, + "learning_rate": 9.85965376255506e-05, + "loss": 0.0232, + "step": 7310 + }, + { + "epoch": 1.3196322336398052, + "grad_norm": 0.0648791566491127, + "learning_rate": 9.859004679365747e-05, + "loss": 0.0252, + "step": 7320 + }, + { + "epoch": 1.3214350099152696, + "grad_norm": 0.05866130441427231, + "learning_rate": 9.858354120139108e-05, + "loss": 0.0234, + "step": 7330 + }, + { + "epoch": 1.3232377861907336, + "grad_norm": 0.06059086322784424, + "learning_rate": 9.857702085072764e-05, + "loss": 0.022, + "step": 7340 + }, + { + "epoch": 1.325040562466198, + "grad_norm": 0.06630909442901611, + "learning_rate": 9.857048574364787e-05, + "loss": 0.0234, + "step": 7350 + }, + { + "epoch": 1.326843338741662, + "grad_norm": 0.05548308789730072, + "learning_rate": 9.856393588213698e-05, + "loss": 0.0225, + "step": 7360 + }, + { + "epoch": 1.3286461150171265, + "grad_norm": 0.05664735659956932, + "learning_rate": 9.855737126818458e-05, + "loss": 0.0216, + "step": 7370 + }, + { + "epoch": 1.3304488912925905, + "grad_norm": 0.04946962371468544, + "learning_rate": 9.855079190378491e-05, + "loss": 0.0222, + "step": 7380 + }, + { + "epoch": 1.332251667568055, + "grad_norm": 0.0632878765463829, + "learning_rate": 9.854419779093655e-05, + "loss": 0.0215, + "step": 7390 + }, + { + "epoch": 1.334054443843519, + "grad_norm": 0.06933576613664627, + "learning_rate": 9.853758893164264e-05, + "loss": 0.0219, + "step": 7400 + }, + { + "epoch": 1.3358572201189833, + "grad_norm": 0.046792659908533096, + "learning_rate": 9.853096532791078e-05, + "loss": 0.0208, + "step": 7410 + }, + { + "epoch": 1.3376599963944473, + "grad_norm": 0.048459798097610474, + "learning_rate": 9.852432698175304e-05, + "loss": 0.0218, + "step": 7420 + }, + { + "epoch": 1.3394627726699118, + "grad_norm": 0.07089771330356598, + "learning_rate": 9.851767389518597e-05, + "loss": 0.0218, + "step": 7430 + }, + { + "epoch": 1.3412655489453758, + "grad_norm": 0.048038680106401443, + "learning_rate": 9.85110060702306e-05, + "loss": 0.022, + "step": 7440 + }, + { + "epoch": 1.3430683252208402, + "grad_norm": 0.0466218963265419, + "learning_rate": 9.850432350891245e-05, + "loss": 0.0234, + "step": 7450 + }, + { + "epoch": 1.3448711014963042, + "grad_norm": 0.06478673964738846, + "learning_rate": 9.84976262132615e-05, + "loss": 0.0223, + "step": 7460 + }, + { + "epoch": 1.3466738777717686, + "grad_norm": 0.07917436212301254, + "learning_rate": 9.849091418531222e-05, + "loss": 0.0218, + "step": 7470 + }, + { + "epoch": 1.3484766540472326, + "grad_norm": 0.09599833190441132, + "learning_rate": 9.848418742710353e-05, + "loss": 0.024, + "step": 7480 + }, + { + "epoch": 1.350279430322697, + "grad_norm": 0.06834573298692703, + "learning_rate": 9.847744594067885e-05, + "loss": 0.0227, + "step": 7490 + }, + { + "epoch": 1.3520822065981613, + "grad_norm": 0.06003880128264427, + "learning_rate": 9.847068972808607e-05, + "loss": 0.0245, + "step": 7500 + }, + { + "epoch": 1.3538849828736255, + "grad_norm": 0.05793830752372742, + "learning_rate": 9.846391879137756e-05, + "loss": 0.0239, + "step": 7510 + }, + { + "epoch": 1.3556877591490897, + "grad_norm": 0.06008558347821236, + "learning_rate": 9.845713313261012e-05, + "loss": 0.0213, + "step": 7520 + }, + { + "epoch": 1.357490535424554, + "grad_norm": 0.06253988295793533, + "learning_rate": 9.845033275384505e-05, + "loss": 0.0234, + "step": 7530 + }, + { + "epoch": 1.359293311700018, + "grad_norm": 0.04976519197225571, + "learning_rate": 9.844351765714818e-05, + "loss": 0.0217, + "step": 7540 + }, + { + "epoch": 1.3610960879754823, + "grad_norm": 0.05400625616312027, + "learning_rate": 9.843668784458971e-05, + "loss": 0.0219, + "step": 7550 + }, + { + "epoch": 1.3628988642509465, + "grad_norm": 0.05366935953497887, + "learning_rate": 9.842984331824437e-05, + "loss": 0.0222, + "step": 7560 + }, + { + "epoch": 1.3647016405264107, + "grad_norm": 0.05530263110995293, + "learning_rate": 9.842298408019133e-05, + "loss": 0.0206, + "step": 7570 + }, + { + "epoch": 1.366504416801875, + "grad_norm": 0.06823855638504028, + "learning_rate": 9.841611013251429e-05, + "loss": 0.0213, + "step": 7580 + }, + { + "epoch": 1.3683071930773392, + "grad_norm": 0.07395543903112411, + "learning_rate": 9.840922147730133e-05, + "loss": 0.0234, + "step": 7590 + }, + { + "epoch": 1.3701099693528034, + "grad_norm": 0.04946191981434822, + "learning_rate": 9.840231811664506e-05, + "loss": 0.0209, + "step": 7600 + }, + { + "epoch": 1.3719127456282676, + "grad_norm": 0.06046136096119881, + "learning_rate": 9.839540005264252e-05, + "loss": 0.0219, + "step": 7610 + }, + { + "epoch": 1.3737155219037318, + "grad_norm": 0.07256844639778137, + "learning_rate": 9.838846728739527e-05, + "loss": 0.024, + "step": 7620 + }, + { + "epoch": 1.375518298179196, + "grad_norm": 0.04503656551241875, + "learning_rate": 9.838151982300927e-05, + "loss": 0.0227, + "step": 7630 + }, + { + "epoch": 1.3773210744546602, + "grad_norm": 0.05757710337638855, + "learning_rate": 9.8374557661595e-05, + "loss": 0.0221, + "step": 7640 + }, + { + "epoch": 1.3791238507301244, + "grad_norm": 0.05113358795642853, + "learning_rate": 9.836758080526735e-05, + "loss": 0.02, + "step": 7650 + }, + { + "epoch": 1.3809266270055887, + "grad_norm": 0.05416606366634369, + "learning_rate": 9.836058925614575e-05, + "loss": 0.021, + "step": 7660 + }, + { + "epoch": 1.3827294032810529, + "grad_norm": 0.06790699809789658, + "learning_rate": 9.8353583016354e-05, + "loss": 0.0217, + "step": 7670 + }, + { + "epoch": 1.384532179556517, + "grad_norm": 0.058047086000442505, + "learning_rate": 9.834656208802044e-05, + "loss": 0.0228, + "step": 7680 + }, + { + "epoch": 1.3863349558319813, + "grad_norm": 0.07217604666948318, + "learning_rate": 9.833952647327784e-05, + "loss": 0.0227, + "step": 7690 + }, + { + "epoch": 1.3881377321074455, + "grad_norm": 0.05192763730883598, + "learning_rate": 9.833247617426342e-05, + "loss": 0.0216, + "step": 7700 + }, + { + "epoch": 1.3899405083829097, + "grad_norm": 0.055417779833078384, + "learning_rate": 9.832541119311889e-05, + "loss": 0.0203, + "step": 7710 + }, + { + "epoch": 1.391743284658374, + "grad_norm": 0.06576554477214813, + "learning_rate": 9.83183315319904e-05, + "loss": 0.0201, + "step": 7720 + }, + { + "epoch": 1.3935460609338381, + "grad_norm": 0.05679874122142792, + "learning_rate": 9.831123719302855e-05, + "loss": 0.021, + "step": 7730 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 0.04365033656358719, + "learning_rate": 9.830412817838842e-05, + "loss": 0.0199, + "step": 7740 + }, + { + "epoch": 1.3971516134847666, + "grad_norm": 0.04930594936013222, + "learning_rate": 9.829700449022956e-05, + "loss": 0.0226, + "step": 7750 + }, + { + "epoch": 1.3989543897602308, + "grad_norm": 0.06492602825164795, + "learning_rate": 9.828986613071593e-05, + "loss": 0.0205, + "step": 7760 + }, + { + "epoch": 1.400757166035695, + "grad_norm": 0.052412085235118866, + "learning_rate": 9.828271310201601e-05, + "loss": 0.0217, + "step": 7770 + }, + { + "epoch": 1.4025599423111592, + "grad_norm": 0.06131676211953163, + "learning_rate": 9.827554540630268e-05, + "loss": 0.0216, + "step": 7780 + }, + { + "epoch": 1.4043627185866234, + "grad_norm": 0.04985653609037399, + "learning_rate": 9.826836304575329e-05, + "loss": 0.0198, + "step": 7790 + }, + { + "epoch": 1.4061654948620876, + "grad_norm": 0.057291802018880844, + "learning_rate": 9.826116602254966e-05, + "loss": 0.0225, + "step": 7800 + }, + { + "epoch": 1.4079682711375519, + "grad_norm": 0.05043026804924011, + "learning_rate": 9.825395433887805e-05, + "loss": 0.0189, + "step": 7810 + }, + { + "epoch": 1.409771047413016, + "grad_norm": 0.05380875989794731, + "learning_rate": 9.824672799692917e-05, + "loss": 0.02, + "step": 7820 + }, + { + "epoch": 1.4115738236884803, + "grad_norm": 0.060714781284332275, + "learning_rate": 9.823948699889823e-05, + "loss": 0.0214, + "step": 7830 + }, + { + "epoch": 1.4133765999639445, + "grad_norm": 0.04232145845890045, + "learning_rate": 9.823223134698483e-05, + "loss": 0.0187, + "step": 7840 + }, + { + "epoch": 1.4151793762394087, + "grad_norm": 0.06872662901878357, + "learning_rate": 9.822496104339303e-05, + "loss": 0.0202, + "step": 7850 + }, + { + "epoch": 1.416982152514873, + "grad_norm": 0.060074470937252045, + "learning_rate": 9.821767609033138e-05, + "loss": 0.0208, + "step": 7860 + }, + { + "epoch": 1.4187849287903371, + "grad_norm": 0.06381816416978836, + "learning_rate": 9.821037649001284e-05, + "loss": 0.0216, + "step": 7870 + }, + { + "epoch": 1.4205877050658013, + "grad_norm": 0.07376859337091446, + "learning_rate": 9.820306224465486e-05, + "loss": 0.0223, + "step": 7880 + }, + { + "epoch": 1.4223904813412656, + "grad_norm": 0.06658012419939041, + "learning_rate": 9.819573335647928e-05, + "loss": 0.0214, + "step": 7890 + }, + { + "epoch": 1.4241932576167298, + "grad_norm": 0.05955755338072777, + "learning_rate": 9.818838982771246e-05, + "loss": 0.0215, + "step": 7900 + }, + { + "epoch": 1.425996033892194, + "grad_norm": 0.05632449686527252, + "learning_rate": 9.818103166058514e-05, + "loss": 0.0221, + "step": 7910 + }, + { + "epoch": 1.4277988101676582, + "grad_norm": 0.0776543989777565, + "learning_rate": 9.817365885733254e-05, + "loss": 0.0202, + "step": 7920 + }, + { + "epoch": 1.4296015864431224, + "grad_norm": 0.06531548500061035, + "learning_rate": 9.816627142019434e-05, + "loss": 0.0199, + "step": 7930 + }, + { + "epoch": 1.4314043627185866, + "grad_norm": 0.06074118614196777, + "learning_rate": 9.815886935141463e-05, + "loss": 0.0214, + "step": 7940 + }, + { + "epoch": 1.4332071389940508, + "grad_norm": 0.044885337352752686, + "learning_rate": 9.8151452653242e-05, + "loss": 0.0206, + "step": 7950 + }, + { + "epoch": 1.435009915269515, + "grad_norm": 0.0658380463719368, + "learning_rate": 9.814402132792939e-05, + "loss": 0.0205, + "step": 7960 + }, + { + "epoch": 1.4368126915449793, + "grad_norm": 0.05069758743047714, + "learning_rate": 9.813657537773428e-05, + "loss": 0.0206, + "step": 7970 + }, + { + "epoch": 1.4386154678204435, + "grad_norm": 0.051645245403051376, + "learning_rate": 9.812911480491854e-05, + "loss": 0.0209, + "step": 7980 + }, + { + "epoch": 1.4404182440959077, + "grad_norm": 0.03922301530838013, + "learning_rate": 9.81216396117485e-05, + "loss": 0.0192, + "step": 7990 + }, + { + "epoch": 1.442221020371372, + "grad_norm": 0.06504575908184052, + "learning_rate": 9.811414980049491e-05, + "loss": 0.0207, + "step": 8000 + }, + { + "epoch": 1.444023796646836, + "grad_norm": 0.0448056198656559, + "learning_rate": 9.810664537343301e-05, + "loss": 0.0237, + "step": 8010 + }, + { + "epoch": 1.4458265729223003, + "grad_norm": 0.05820690467953682, + "learning_rate": 9.809912633284243e-05, + "loss": 0.0208, + "step": 8020 + }, + { + "epoch": 1.4476293491977645, + "grad_norm": 0.057242486625909805, + "learning_rate": 9.809159268100725e-05, + "loss": 0.0212, + "step": 8030 + }, + { + "epoch": 1.4494321254732287, + "grad_norm": 0.05151813104748726, + "learning_rate": 9.808404442021599e-05, + "loss": 0.0235, + "step": 8040 + }, + { + "epoch": 1.451234901748693, + "grad_norm": 0.04928862303495407, + "learning_rate": 9.807648155276163e-05, + "loss": 0.0213, + "step": 8050 + }, + { + "epoch": 1.4530376780241572, + "grad_norm": 0.07684013992547989, + "learning_rate": 9.806890408094156e-05, + "loss": 0.0208, + "step": 8060 + }, + { + "epoch": 1.4548404542996214, + "grad_norm": 0.04427684098482132, + "learning_rate": 9.806131200705761e-05, + "loss": 0.0209, + "step": 8070 + }, + { + "epoch": 1.4566432305750856, + "grad_norm": 0.04601654037833214, + "learning_rate": 9.805370533341605e-05, + "loss": 0.0218, + "step": 8080 + }, + { + "epoch": 1.4584460068505498, + "grad_norm": 0.08766639977693558, + "learning_rate": 9.804608406232762e-05, + "loss": 0.0227, + "step": 8090 + }, + { + "epoch": 1.460248783126014, + "grad_norm": 0.05103563889861107, + "learning_rate": 9.803844819610741e-05, + "loss": 0.0214, + "step": 8100 + }, + { + "epoch": 1.4620515594014782, + "grad_norm": 0.05936671048402786, + "learning_rate": 9.803079773707504e-05, + "loss": 0.0212, + "step": 8110 + }, + { + "epoch": 1.4638543356769425, + "grad_norm": 0.047964539378881454, + "learning_rate": 9.802313268755447e-05, + "loss": 0.0222, + "step": 8120 + }, + { + "epoch": 1.4656571119524067, + "grad_norm": 0.06973028928041458, + "learning_rate": 9.801545304987419e-05, + "loss": 0.0214, + "step": 8130 + }, + { + "epoch": 1.4674598882278709, + "grad_norm": 0.05382600054144859, + "learning_rate": 9.800775882636704e-05, + "loss": 0.021, + "step": 8140 + }, + { + "epoch": 1.469262664503335, + "grad_norm": 0.057859499007463455, + "learning_rate": 9.800005001937034e-05, + "loss": 0.0219, + "step": 8150 + }, + { + "epoch": 1.4710654407787993, + "grad_norm": 0.0806669294834137, + "learning_rate": 9.79923266312258e-05, + "loss": 0.0225, + "step": 8160 + }, + { + "epoch": 1.4728682170542635, + "grad_norm": 0.07759104669094086, + "learning_rate": 9.79845886642796e-05, + "loss": 0.0229, + "step": 8170 + }, + { + "epoch": 1.4746709933297277, + "grad_norm": 0.08552855253219604, + "learning_rate": 9.797683612088233e-05, + "loss": 0.0209, + "step": 8180 + }, + { + "epoch": 1.476473769605192, + "grad_norm": 0.04938585311174393, + "learning_rate": 9.796906900338898e-05, + "loss": 0.0214, + "step": 8190 + }, + { + "epoch": 1.4782765458806562, + "grad_norm": 0.06465645879507065, + "learning_rate": 9.796128731415903e-05, + "loss": 0.0237, + "step": 8200 + }, + { + "epoch": 1.4800793221561204, + "grad_norm": 0.07902093231678009, + "learning_rate": 9.795349105555634e-05, + "loss": 0.0214, + "step": 8210 + }, + { + "epoch": 1.4818820984315846, + "grad_norm": 0.053144242614507675, + "learning_rate": 9.794568022994922e-05, + "loss": 0.0229, + "step": 8220 + }, + { + "epoch": 1.4836848747070488, + "grad_norm": 0.06802098453044891, + "learning_rate": 9.793785483971034e-05, + "loss": 0.0217, + "step": 8230 + }, + { + "epoch": 1.485487650982513, + "grad_norm": 0.05871633067727089, + "learning_rate": 9.793001488721691e-05, + "loss": 0.0221, + "step": 8240 + }, + { + "epoch": 1.4872904272579772, + "grad_norm": 0.06375337392091751, + "learning_rate": 9.792216037485047e-05, + "loss": 0.0216, + "step": 8250 + }, + { + "epoch": 1.4890932035334414, + "grad_norm": 0.07021962106227875, + "learning_rate": 9.791429130499704e-05, + "loss": 0.0216, + "step": 8260 + }, + { + "epoch": 1.4908959798089056, + "grad_norm": 0.05153977498412132, + "learning_rate": 9.790640768004698e-05, + "loss": 0.0221, + "step": 8270 + }, + { + "epoch": 1.4926987560843699, + "grad_norm": 0.046286292374134064, + "learning_rate": 9.789850950239518e-05, + "loss": 0.0202, + "step": 8280 + }, + { + "epoch": 1.494501532359834, + "grad_norm": 0.06688359379768372, + "learning_rate": 9.789059677444089e-05, + "loss": 0.0218, + "step": 8290 + }, + { + "epoch": 1.4963043086352983, + "grad_norm": 0.06116340309381485, + "learning_rate": 9.788266949858776e-05, + "loss": 0.0213, + "step": 8300 + }, + { + "epoch": 1.4981070849107625, + "grad_norm": 0.049556270241737366, + "learning_rate": 9.787472767724392e-05, + "loss": 0.0205, + "step": 8310 + }, + { + "epoch": 1.4999098611862267, + "grad_norm": 0.060772232711315155, + "learning_rate": 9.786677131282185e-05, + "loss": 0.02, + "step": 8320 + }, + { + "epoch": 1.501712637461691, + "grad_norm": 0.055699124932289124, + "learning_rate": 9.785880040773853e-05, + "loss": 0.0216, + "step": 8330 + }, + { + "epoch": 1.5035154137371554, + "grad_norm": 0.0725230947136879, + "learning_rate": 9.785081496441527e-05, + "loss": 0.0218, + "step": 8340 + }, + { + "epoch": 1.5053181900126194, + "grad_norm": 0.07469064742326736, + "learning_rate": 9.784281498527785e-05, + "loss": 0.0201, + "step": 8350 + }, + { + "epoch": 1.5071209662880838, + "grad_norm": 0.0679037943482399, + "learning_rate": 9.783480047275646e-05, + "loss": 0.0214, + "step": 8360 + }, + { + "epoch": 1.5089237425635478, + "grad_norm": 0.06035342812538147, + "learning_rate": 9.78267714292857e-05, + "loss": 0.0225, + "step": 8370 + }, + { + "epoch": 1.5107265188390122, + "grad_norm": 0.05299760773777962, + "learning_rate": 9.781872785730454e-05, + "loss": 0.0214, + "step": 8380 + }, + { + "epoch": 1.5125292951144762, + "grad_norm": 0.062331847846508026, + "learning_rate": 9.781066975925646e-05, + "loss": 0.0231, + "step": 8390 + }, + { + "epoch": 1.5143320713899406, + "grad_norm": 0.05355817824602127, + "learning_rate": 9.780259713758928e-05, + "loss": 0.0202, + "step": 8400 + }, + { + "epoch": 1.5161348476654046, + "grad_norm": 0.05499721318483353, + "learning_rate": 9.779450999475524e-05, + "loss": 0.0197, + "step": 8410 + }, + { + "epoch": 1.517937623940869, + "grad_norm": 0.07212535291910172, + "learning_rate": 9.7786408333211e-05, + "loss": 0.0233, + "step": 8420 + }, + { + "epoch": 1.519740400216333, + "grad_norm": 0.060439545661211014, + "learning_rate": 9.777829215541764e-05, + "loss": 0.0221, + "step": 8430 + }, + { + "epoch": 1.5215431764917975, + "grad_norm": 0.062395866960287094, + "learning_rate": 9.777016146384064e-05, + "loss": 0.0225, + "step": 8440 + }, + { + "epoch": 1.5233459527672615, + "grad_norm": 0.0595727376639843, + "learning_rate": 9.776201626094988e-05, + "loss": 0.0217, + "step": 8450 + }, + { + "epoch": 1.525148729042726, + "grad_norm": 0.05581018701195717, + "learning_rate": 9.775385654921965e-05, + "loss": 0.0219, + "step": 8460 + }, + { + "epoch": 1.52695150531819, + "grad_norm": 0.05444534122943878, + "learning_rate": 9.774568233112868e-05, + "loss": 0.0217, + "step": 8470 + }, + { + "epoch": 1.5287542815936543, + "grad_norm": 0.05438929423689842, + "learning_rate": 9.773749360916007e-05, + "loss": 0.0225, + "step": 8480 + }, + { + "epoch": 1.5305570578691183, + "grad_norm": 0.052092067897319794, + "learning_rate": 9.772929038580134e-05, + "loss": 0.0206, + "step": 8490 + }, + { + "epoch": 1.5323598341445828, + "grad_norm": 0.054888077080249786, + "learning_rate": 9.772107266354439e-05, + "loss": 0.0196, + "step": 8500 + }, + { + "epoch": 1.5341626104200468, + "grad_norm": 0.037543389946222305, + "learning_rate": 9.77128404448856e-05, + "loss": 0.0209, + "step": 8510 + }, + { + "epoch": 1.5359653866955112, + "grad_norm": 0.06900222599506378, + "learning_rate": 9.770459373232565e-05, + "loss": 0.0208, + "step": 8520 + }, + { + "epoch": 1.5377681629709752, + "grad_norm": 0.058152928948402405, + "learning_rate": 9.769633252836969e-05, + "loss": 0.0211, + "step": 8530 + }, + { + "epoch": 1.5395709392464396, + "grad_norm": 0.05421449989080429, + "learning_rate": 9.768805683552724e-05, + "loss": 0.0211, + "step": 8540 + }, + { + "epoch": 1.5413737155219036, + "grad_norm": 0.05740150436758995, + "learning_rate": 9.767976665631228e-05, + "loss": 0.0224, + "step": 8550 + }, + { + "epoch": 1.543176491797368, + "grad_norm": 0.05205458402633667, + "learning_rate": 9.767146199324311e-05, + "loss": 0.021, + "step": 8560 + }, + { + "epoch": 1.544979268072832, + "grad_norm": 0.042440202087163925, + "learning_rate": 9.766314284884249e-05, + "loss": 0.0188, + "step": 8570 + }, + { + "epoch": 1.5467820443482965, + "grad_norm": 0.05061357095837593, + "learning_rate": 9.765480922563752e-05, + "loss": 0.0201, + "step": 8580 + }, + { + "epoch": 1.5485848206237605, + "grad_norm": 0.07106569409370422, + "learning_rate": 9.764646112615978e-05, + "loss": 0.0206, + "step": 8590 + }, + { + "epoch": 1.550387596899225, + "grad_norm": 0.06931182742118835, + "learning_rate": 9.763809855294517e-05, + "loss": 0.0213, + "step": 8600 + }, + { + "epoch": 1.5521903731746889, + "grad_norm": 0.05047661438584328, + "learning_rate": 9.762972150853404e-05, + "loss": 0.0217, + "step": 8610 + }, + { + "epoch": 1.5539931494501533, + "grad_norm": 0.07393460720777512, + "learning_rate": 9.762132999547111e-05, + "loss": 0.0206, + "step": 8620 + }, + { + "epoch": 1.5557959257256173, + "grad_norm": 0.054030176252126694, + "learning_rate": 9.761292401630549e-05, + "loss": 0.0202, + "step": 8630 + }, + { + "epoch": 1.5575987020010817, + "grad_norm": 0.06852346658706665, + "learning_rate": 9.76045035735907e-05, + "loss": 0.0205, + "step": 8640 + }, + { + "epoch": 1.5594014782765457, + "grad_norm": 0.05895696580410004, + "learning_rate": 9.759606866988464e-05, + "loss": 0.0202, + "step": 8650 + }, + { + "epoch": 1.5612042545520102, + "grad_norm": 0.0662684217095375, + "learning_rate": 9.758761930774963e-05, + "loss": 0.0204, + "step": 8660 + }, + { + "epoch": 1.5630070308274742, + "grad_norm": 0.05740590766072273, + "learning_rate": 9.757915548975235e-05, + "loss": 0.0205, + "step": 8670 + }, + { + "epoch": 1.5648098071029386, + "grad_norm": 0.05237175524234772, + "learning_rate": 9.757067721846389e-05, + "loss": 0.0202, + "step": 8680 + }, + { + "epoch": 1.5666125833784026, + "grad_norm": 0.059741709381341934, + "learning_rate": 9.756218449645971e-05, + "loss": 0.0206, + "step": 8690 + }, + { + "epoch": 1.568415359653867, + "grad_norm": 0.05884997919201851, + "learning_rate": 9.75536773263197e-05, + "loss": 0.0204, + "step": 8700 + }, + { + "epoch": 1.570218135929331, + "grad_norm": 0.035665128380060196, + "learning_rate": 9.75451557106281e-05, + "loss": 0.0196, + "step": 8710 + }, + { + "epoch": 1.5720209122047955, + "grad_norm": 0.060937728732824326, + "learning_rate": 9.753661965197354e-05, + "loss": 0.0212, + "step": 8720 + }, + { + "epoch": 1.5738236884802594, + "grad_norm": 0.09048791974782944, + "learning_rate": 9.752806915294908e-05, + "loss": 0.0208, + "step": 8730 + }, + { + "epoch": 1.5756264647557239, + "grad_norm": 0.09490035474300385, + "learning_rate": 9.75195042161521e-05, + "loss": 0.0232, + "step": 8740 + }, + { + "epoch": 1.5774292410311879, + "grad_norm": 0.04616469889879227, + "learning_rate": 9.751092484418442e-05, + "loss": 0.0194, + "step": 8750 + }, + { + "epoch": 1.5792320173066523, + "grad_norm": 0.058614976704120636, + "learning_rate": 9.750233103965224e-05, + "loss": 0.0214, + "step": 8760 + }, + { + "epoch": 1.5810347935821163, + "grad_norm": 0.059064578264951706, + "learning_rate": 9.749372280516611e-05, + "loss": 0.0224, + "step": 8770 + }, + { + "epoch": 1.5828375698575807, + "grad_norm": 0.06468500196933746, + "learning_rate": 9.748510014334097e-05, + "loss": 0.023, + "step": 8780 + }, + { + "epoch": 1.584640346133045, + "grad_norm": 0.05198678746819496, + "learning_rate": 9.747646305679621e-05, + "loss": 0.02, + "step": 8790 + }, + { + "epoch": 1.5864431224085092, + "grad_norm": 0.05554482713341713, + "learning_rate": 9.74678115481555e-05, + "loss": 0.0215, + "step": 8800 + }, + { + "epoch": 1.5882458986839734, + "grad_norm": 0.05362630635499954, + "learning_rate": 9.745914562004696e-05, + "loss": 0.0213, + "step": 8810 + }, + { + "epoch": 1.5900486749594376, + "grad_norm": 0.05238635465502739, + "learning_rate": 9.745046527510307e-05, + "loss": 0.0217, + "step": 8820 + }, + { + "epoch": 1.5918514512349018, + "grad_norm": 0.05024407058954239, + "learning_rate": 9.744177051596068e-05, + "loss": 0.0216, + "step": 8830 + }, + { + "epoch": 1.593654227510366, + "grad_norm": 0.07376491278409958, + "learning_rate": 9.743306134526105e-05, + "loss": 0.0215, + "step": 8840 + }, + { + "epoch": 1.5954570037858302, + "grad_norm": 0.0438753105700016, + "learning_rate": 9.742433776564977e-05, + "loss": 0.0199, + "step": 8850 + }, + { + "epoch": 1.5972597800612944, + "grad_norm": 0.059425536543130875, + "learning_rate": 9.741559977977683e-05, + "loss": 0.0215, + "step": 8860 + }, + { + "epoch": 1.5990625563367586, + "grad_norm": 0.05588122084736824, + "learning_rate": 9.740684739029661e-05, + "loss": 0.0219, + "step": 8870 + }, + { + "epoch": 1.6008653326122229, + "grad_norm": 0.07067468017339706, + "learning_rate": 9.739808059986789e-05, + "loss": 0.0209, + "step": 8880 + }, + { + "epoch": 1.602668108887687, + "grad_norm": 0.10182754695415497, + "learning_rate": 9.738929941115373e-05, + "loss": 0.0215, + "step": 8890 + }, + { + "epoch": 1.6044708851631513, + "grad_norm": 0.06524571031332016, + "learning_rate": 9.738050382682167e-05, + "loss": 0.0217, + "step": 8900 + }, + { + "epoch": 1.6062736614386155, + "grad_norm": 0.05922749638557434, + "learning_rate": 9.737169384954355e-05, + "loss": 0.022, + "step": 8910 + }, + { + "epoch": 1.6080764377140797, + "grad_norm": 0.07438024133443832, + "learning_rate": 9.736286948199562e-05, + "loss": 0.0215, + "step": 8920 + }, + { + "epoch": 1.609879213989544, + "grad_norm": 0.052409227937459946, + "learning_rate": 9.735403072685848e-05, + "loss": 0.0222, + "step": 8930 + }, + { + "epoch": 1.6116819902650081, + "grad_norm": 0.055033013224601746, + "learning_rate": 9.734517758681712e-05, + "loss": 0.0201, + "step": 8940 + }, + { + "epoch": 1.6134847665404723, + "grad_norm": 0.04354807734489441, + "learning_rate": 9.733631006456088e-05, + "loss": 0.021, + "step": 8950 + }, + { + "epoch": 1.6152875428159366, + "grad_norm": 0.048277027904987335, + "learning_rate": 9.732742816278348e-05, + "loss": 0.0188, + "step": 8960 + }, + { + "epoch": 1.6170903190914008, + "grad_norm": 0.047624099999666214, + "learning_rate": 9.731853188418302e-05, + "loss": 0.0212, + "step": 8970 + }, + { + "epoch": 1.618893095366865, + "grad_norm": 0.07004883885383606, + "learning_rate": 9.730962123146194e-05, + "loss": 0.0216, + "step": 8980 + }, + { + "epoch": 1.6206958716423292, + "grad_norm": 0.06963780522346497, + "learning_rate": 9.730069620732709e-05, + "loss": 0.0212, + "step": 8990 + }, + { + "epoch": 1.6224986479177934, + "grad_norm": 0.06966513395309448, + "learning_rate": 9.72917568144896e-05, + "loss": 0.0199, + "step": 9000 + }, + { + "epoch": 1.6243014241932576, + "grad_norm": 0.05438746511936188, + "learning_rate": 9.728280305566509e-05, + "loss": 0.0196, + "step": 9010 + }, + { + "epoch": 1.6261042004687218, + "grad_norm": 0.04657372459769249, + "learning_rate": 9.727383493357343e-05, + "loss": 0.0209, + "step": 9020 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 0.052959416061639786, + "learning_rate": 9.726485245093891e-05, + "loss": 0.0209, + "step": 9030 + }, + { + "epoch": 1.6297097530196503, + "grad_norm": 0.04753457009792328, + "learning_rate": 9.725585561049018e-05, + "loss": 0.0215, + "step": 9040 + }, + { + "epoch": 1.6315125292951145, + "grad_norm": 0.06262840330600739, + "learning_rate": 9.724684441496022e-05, + "loss": 0.0217, + "step": 9050 + }, + { + "epoch": 1.6333153055705787, + "grad_norm": 0.0694810301065445, + "learning_rate": 9.72378188670864e-05, + "loss": 0.0225, + "step": 9060 + }, + { + "epoch": 1.635118081846043, + "grad_norm": 0.07077062129974365, + "learning_rate": 9.722877896961047e-05, + "loss": 0.0238, + "step": 9070 + }, + { + "epoch": 1.6369208581215071, + "grad_norm": 0.06684151291847229, + "learning_rate": 9.721972472527848e-05, + "loss": 0.0198, + "step": 9080 + }, + { + "epoch": 1.6387236343969713, + "grad_norm": 0.06978391110897064, + "learning_rate": 9.721065613684089e-05, + "loss": 0.0197, + "step": 9090 + }, + { + "epoch": 1.6405264106724355, + "grad_norm": 0.04421510174870491, + "learning_rate": 9.72015732070525e-05, + "loss": 0.0191, + "step": 9100 + }, + { + "epoch": 1.6423291869478998, + "grad_norm": 0.052172355353832245, + "learning_rate": 9.719247593867244e-05, + "loss": 0.021, + "step": 9110 + }, + { + "epoch": 1.644131963223364, + "grad_norm": 0.06798746436834335, + "learning_rate": 9.718336433446423e-05, + "loss": 0.022, + "step": 9120 + }, + { + "epoch": 1.6459347394988282, + "grad_norm": 0.04945148900151253, + "learning_rate": 9.717423839719574e-05, + "loss": 0.0192, + "step": 9130 + }, + { + "epoch": 1.6477375157742924, + "grad_norm": 0.05155215412378311, + "learning_rate": 9.71650981296392e-05, + "loss": 0.0228, + "step": 9140 + }, + { + "epoch": 1.6495402920497566, + "grad_norm": 0.054424948990345, + "learning_rate": 9.715594353457118e-05, + "loss": 0.0209, + "step": 9150 + }, + { + "epoch": 1.6513430683252208, + "grad_norm": 0.0566774420440197, + "learning_rate": 9.714677461477257e-05, + "loss": 0.0202, + "step": 9160 + }, + { + "epoch": 1.653145844600685, + "grad_norm": 0.050159987062215805, + "learning_rate": 9.713759137302869e-05, + "loss": 0.0189, + "step": 9170 + }, + { + "epoch": 1.6549486208761492, + "grad_norm": 0.06640554964542389, + "learning_rate": 9.712839381212914e-05, + "loss": 0.0202, + "step": 9180 + }, + { + "epoch": 1.6567513971516135, + "grad_norm": 0.06312596052885056, + "learning_rate": 9.71191819348679e-05, + "loss": 0.0214, + "step": 9190 + }, + { + "epoch": 1.6585541734270777, + "grad_norm": 0.07939982414245605, + "learning_rate": 9.710995574404331e-05, + "loss": 0.0211, + "step": 9200 + }, + { + "epoch": 1.6603569497025419, + "grad_norm": 0.054040759801864624, + "learning_rate": 9.710071524245802e-05, + "loss": 0.0202, + "step": 9210 + }, + { + "epoch": 1.662159725978006, + "grad_norm": 0.06506036967039108, + "learning_rate": 9.709146043291906e-05, + "loss": 0.0214, + "step": 9220 + }, + { + "epoch": 1.6639625022534703, + "grad_norm": 0.067658931016922, + "learning_rate": 9.70821913182378e-05, + "loss": 0.0208, + "step": 9230 + }, + { + "epoch": 1.6657652785289345, + "grad_norm": 0.05525829270482063, + "learning_rate": 9.707290790122995e-05, + "loss": 0.021, + "step": 9240 + }, + { + "epoch": 1.6675680548043987, + "grad_norm": 0.051908284425735474, + "learning_rate": 9.706361018471557e-05, + "loss": 0.02, + "step": 9250 + }, + { + "epoch": 1.669370831079863, + "grad_norm": 0.048801615834236145, + "learning_rate": 9.705429817151906e-05, + "loss": 0.0187, + "step": 9260 + }, + { + "epoch": 1.6711736073553272, + "grad_norm": 0.0648244321346283, + "learning_rate": 9.704497186446917e-05, + "loss": 0.0208, + "step": 9270 + }, + { + "epoch": 1.6729763836307914, + "grad_norm": 0.046664994210004807, + "learning_rate": 9.703563126639896e-05, + "loss": 0.0189, + "step": 9280 + }, + { + "epoch": 1.6747791599062556, + "grad_norm": 0.05976800620555878, + "learning_rate": 9.70262763801459e-05, + "loss": 0.0194, + "step": 9290 + }, + { + "epoch": 1.6765819361817198, + "grad_norm": 0.04056289419531822, + "learning_rate": 9.701690720855171e-05, + "loss": 0.0201, + "step": 9300 + }, + { + "epoch": 1.678384712457184, + "grad_norm": 0.0553470216691494, + "learning_rate": 9.700752375446253e-05, + "loss": 0.0197, + "step": 9310 + }, + { + "epoch": 1.6801874887326482, + "grad_norm": 0.048293586820364, + "learning_rate": 9.69981260207288e-05, + "loss": 0.0193, + "step": 9320 + }, + { + "epoch": 1.6819902650081124, + "grad_norm": 0.06013356149196625, + "learning_rate": 9.698871401020529e-05, + "loss": 0.0188, + "step": 9330 + }, + { + "epoch": 1.6837930412835767, + "grad_norm": 0.055767722427845, + "learning_rate": 9.697928772575112e-05, + "loss": 0.0202, + "step": 9340 + }, + { + "epoch": 1.685595817559041, + "grad_norm": 0.10477222502231598, + "learning_rate": 9.696984717022976e-05, + "loss": 0.0209, + "step": 9350 + }, + { + "epoch": 1.687398593834505, + "grad_norm": 0.05171753466129303, + "learning_rate": 9.6960392346509e-05, + "loss": 0.0198, + "step": 9360 + }, + { + "epoch": 1.6892013701099695, + "grad_norm": 0.05773494392633438, + "learning_rate": 9.695092325746097e-05, + "loss": 0.0196, + "step": 9370 + }, + { + "epoch": 1.6910041463854335, + "grad_norm": 0.06796815246343613, + "learning_rate": 9.694143990596211e-05, + "loss": 0.0212, + "step": 9380 + }, + { + "epoch": 1.692806922660898, + "grad_norm": 0.058455824851989746, + "learning_rate": 9.693194229489325e-05, + "loss": 0.0199, + "step": 9390 + }, + { + "epoch": 1.694609698936362, + "grad_norm": 0.04718489199876785, + "learning_rate": 9.692243042713944e-05, + "loss": 0.0203, + "step": 9400 + }, + { + "epoch": 1.6964124752118264, + "grad_norm": 0.05397193133831024, + "learning_rate": 9.691290430559022e-05, + "loss": 0.0208, + "step": 9410 + }, + { + "epoch": 1.6982152514872904, + "grad_norm": 0.06605379283428192, + "learning_rate": 9.690336393313932e-05, + "loss": 0.0205, + "step": 9420 + }, + { + "epoch": 1.7000180277627548, + "grad_norm": 0.05248812586069107, + "learning_rate": 9.689380931268487e-05, + "loss": 0.0223, + "step": 9430 + }, + { + "epoch": 1.7018208040382188, + "grad_norm": 0.044598281383514404, + "learning_rate": 9.688424044712932e-05, + "loss": 0.0213, + "step": 9440 + }, + { + "epoch": 1.7036235803136832, + "grad_norm": 0.03743177279829979, + "learning_rate": 9.687465733937942e-05, + "loss": 0.0192, + "step": 9450 + }, + { + "epoch": 1.7054263565891472, + "grad_norm": 0.0685172975063324, + "learning_rate": 9.686505999234627e-05, + "loss": 0.0204, + "step": 9460 + }, + { + "epoch": 1.7072291328646116, + "grad_norm": 0.07564239948987961, + "learning_rate": 9.685544840894529e-05, + "loss": 0.0235, + "step": 9470 + }, + { + "epoch": 1.7090319091400756, + "grad_norm": 0.05405229330062866, + "learning_rate": 9.684582259209624e-05, + "loss": 0.0214, + "step": 9480 + }, + { + "epoch": 1.71083468541554, + "grad_norm": 0.06509434431791306, + "learning_rate": 9.683618254472317e-05, + "loss": 0.0207, + "step": 9490 + }, + { + "epoch": 1.712637461691004, + "grad_norm": 0.040974050760269165, + "learning_rate": 9.682652826975449e-05, + "loss": 0.0208, + "step": 9500 + }, + { + "epoch": 1.7144402379664685, + "grad_norm": 0.04697747901082039, + "learning_rate": 9.681685977012291e-05, + "loss": 0.0196, + "step": 9510 + }, + { + "epoch": 1.7162430142419325, + "grad_norm": 0.061163127422332764, + "learning_rate": 9.680717704876546e-05, + "loss": 0.0204, + "step": 9520 + }, + { + "epoch": 1.718045790517397, + "grad_norm": 0.04037218540906906, + "learning_rate": 9.679748010862349e-05, + "loss": 0.0209, + "step": 9530 + }, + { + "epoch": 1.719848566792861, + "grad_norm": 0.07055612653493881, + "learning_rate": 9.678776895264267e-05, + "loss": 0.0185, + "step": 9540 + }, + { + "epoch": 1.7216513430683253, + "grad_norm": 0.04670809581875801, + "learning_rate": 9.6778043583773e-05, + "loss": 0.0202, + "step": 9550 + }, + { + "epoch": 1.7234541193437893, + "grad_norm": 0.04538438469171524, + "learning_rate": 9.67683040049688e-05, + "loss": 0.0202, + "step": 9560 + }, + { + "epoch": 1.7252568956192538, + "grad_norm": 0.05957496911287308, + "learning_rate": 9.675855021918869e-05, + "loss": 0.0197, + "step": 9570 + }, + { + "epoch": 1.7270596718947178, + "grad_norm": 0.0546918623149395, + "learning_rate": 9.674878222939561e-05, + "loss": 0.02, + "step": 9580 + }, + { + "epoch": 1.7288624481701822, + "grad_norm": 0.06339256465435028, + "learning_rate": 9.673900003855681e-05, + "loss": 0.0196, + "step": 9590 + }, + { + "epoch": 1.7306652244456462, + "grad_norm": 0.052605751901865005, + "learning_rate": 9.672920364964389e-05, + "loss": 0.0193, + "step": 9600 + }, + { + "epoch": 1.7324680007211106, + "grad_norm": 0.06613392382860184, + "learning_rate": 9.671939306563269e-05, + "loss": 0.0199, + "step": 9610 + }, + { + "epoch": 1.7342707769965746, + "grad_norm": 0.036477938294410706, + "learning_rate": 9.670956828950345e-05, + "loss": 0.0195, + "step": 9620 + }, + { + "epoch": 1.736073553272039, + "grad_norm": 0.04745618253946304, + "learning_rate": 9.669972932424065e-05, + "loss": 0.0192, + "step": 9630 + }, + { + "epoch": 1.737876329547503, + "grad_norm": 0.0595737025141716, + "learning_rate": 9.668987617283312e-05, + "loss": 0.019, + "step": 9640 + }, + { + "epoch": 1.7396791058229675, + "grad_norm": 0.05174412950873375, + "learning_rate": 9.668000883827397e-05, + "loss": 0.0218, + "step": 9650 + }, + { + "epoch": 1.7414818820984315, + "grad_norm": 0.05835159868001938, + "learning_rate": 9.667012732356067e-05, + "loss": 0.0201, + "step": 9660 + }, + { + "epoch": 1.743284658373896, + "grad_norm": 0.0459693968296051, + "learning_rate": 9.666023163169493e-05, + "loss": 0.0189, + "step": 9670 + }, + { + "epoch": 1.74508743464936, + "grad_norm": 0.0838942900300026, + "learning_rate": 9.665032176568281e-05, + "loss": 0.0207, + "step": 9680 + }, + { + "epoch": 1.7468902109248243, + "grad_norm": 0.057598162442445755, + "learning_rate": 9.664039772853469e-05, + "loss": 0.0206, + "step": 9690 + }, + { + "epoch": 1.7486929872002883, + "grad_norm": 0.04592104256153107, + "learning_rate": 9.663045952326518e-05, + "loss": 0.0206, + "step": 9700 + }, + { + "epoch": 1.7504957634757528, + "grad_norm": 0.05846831575036049, + "learning_rate": 9.662050715289328e-05, + "loss": 0.0198, + "step": 9710 + }, + { + "epoch": 1.7522985397512167, + "grad_norm": 0.04357026517391205, + "learning_rate": 9.661054062044226e-05, + "loss": 0.0206, + "step": 9720 + }, + { + "epoch": 1.7541013160266812, + "grad_norm": 0.04666745290160179, + "learning_rate": 9.660055992893968e-05, + "loss": 0.0207, + "step": 9730 + }, + { + "epoch": 1.7559040923021452, + "grad_norm": 0.06873369216918945, + "learning_rate": 9.659056508141739e-05, + "loss": 0.021, + "step": 9740 + }, + { + "epoch": 1.7577068685776096, + "grad_norm": 0.06254009902477264, + "learning_rate": 9.658055608091161e-05, + "loss": 0.0202, + "step": 9750 + }, + { + "epoch": 1.7595096448530736, + "grad_norm": 0.040738534182310104, + "learning_rate": 9.657053293046276e-05, + "loss": 0.0186, + "step": 9760 + }, + { + "epoch": 1.761312421128538, + "grad_norm": 0.0527578704059124, + "learning_rate": 9.656049563311564e-05, + "loss": 0.0192, + "step": 9770 + }, + { + "epoch": 1.763115197404002, + "grad_norm": 0.056363604962825775, + "learning_rate": 9.655044419191929e-05, + "loss": 0.0195, + "step": 9780 + }, + { + "epoch": 1.7649179736794665, + "grad_norm": 0.04523983970284462, + "learning_rate": 9.654037860992711e-05, + "loss": 0.0183, + "step": 9790 + }, + { + "epoch": 1.7667207499549304, + "grad_norm": 0.04990294203162193, + "learning_rate": 9.653029889019672e-05, + "loss": 0.0205, + "step": 9800 + }, + { + "epoch": 1.7685235262303949, + "grad_norm": 0.053742073476314545, + "learning_rate": 9.65202050357901e-05, + "loss": 0.0182, + "step": 9810 + }, + { + "epoch": 1.7703263025058589, + "grad_norm": 0.04176913946866989, + "learning_rate": 9.651009704977347e-05, + "loss": 0.0201, + "step": 9820 + }, + { + "epoch": 1.7721290787813233, + "grad_norm": 0.05364402011036873, + "learning_rate": 9.649997493521738e-05, + "loss": 0.0199, + "step": 9830 + }, + { + "epoch": 1.7739318550567873, + "grad_norm": 0.04094744473695755, + "learning_rate": 9.64898386951967e-05, + "loss": 0.0185, + "step": 9840 + }, + { + "epoch": 1.7757346313322517, + "grad_norm": 0.05297229066491127, + "learning_rate": 9.647968833279049e-05, + "loss": 0.02, + "step": 9850 + }, + { + "epoch": 1.7775374076077157, + "grad_norm": 0.05192141607403755, + "learning_rate": 9.646952385108218e-05, + "loss": 0.0202, + "step": 9860 + }, + { + "epoch": 1.7793401838831802, + "grad_norm": 0.05682383105158806, + "learning_rate": 9.645934525315951e-05, + "loss": 0.0188, + "step": 9870 + }, + { + "epoch": 1.7811429601586442, + "grad_norm": 0.05003898963332176, + "learning_rate": 9.644915254211442e-05, + "loss": 0.0197, + "step": 9880 + }, + { + "epoch": 1.7829457364341086, + "grad_norm": 0.05703689903020859, + "learning_rate": 9.643894572104321e-05, + "loss": 0.0197, + "step": 9890 + }, + { + "epoch": 1.7847485127095726, + "grad_norm": 0.045938435941934586, + "learning_rate": 9.642872479304644e-05, + "loss": 0.0209, + "step": 9900 + }, + { + "epoch": 1.786551288985037, + "grad_norm": 0.04441182315349579, + "learning_rate": 9.641848976122895e-05, + "loss": 0.0199, + "step": 9910 + }, + { + "epoch": 1.7883540652605012, + "grad_norm": 0.07210560142993927, + "learning_rate": 9.64082406286999e-05, + "loss": 0.0211, + "step": 9920 + }, + { + "epoch": 1.7901568415359654, + "grad_norm": 0.04115978628396988, + "learning_rate": 9.639797739857269e-05, + "loss": 0.0208, + "step": 9930 + }, + { + "epoch": 1.7919596178114297, + "grad_norm": 0.04438093677163124, + "learning_rate": 9.638770007396498e-05, + "loss": 0.0198, + "step": 9940 + }, + { + "epoch": 1.7937623940868939, + "grad_norm": 0.05782434344291687, + "learning_rate": 9.63774086579988e-05, + "loss": 0.0191, + "step": 9950 + }, + { + "epoch": 1.795565170362358, + "grad_norm": 0.05490140616893768, + "learning_rate": 9.63671031538004e-05, + "loss": 0.0202, + "step": 9960 + }, + { + "epoch": 1.7973679466378223, + "grad_norm": 0.046709831804037094, + "learning_rate": 9.635678356450031e-05, + "loss": 0.0198, + "step": 9970 + }, + { + "epoch": 1.7991707229132865, + "grad_norm": 0.04540036618709564, + "learning_rate": 9.634644989323336e-05, + "loss": 0.0206, + "step": 9980 + }, + { + "epoch": 1.8009734991887507, + "grad_norm": 0.05456742271780968, + "learning_rate": 9.633610214313861e-05, + "loss": 0.0195, + "step": 9990 + }, + { + "epoch": 1.802776275464215, + "grad_norm": 0.07791728526353836, + "learning_rate": 9.632574031735951e-05, + "loss": 0.0207, + "step": 10000 + }, + { + "epoch": 1.8045790517396791, + "grad_norm": 0.061024244874715805, + "learning_rate": 9.631536441904364e-05, + "loss": 0.0189, + "step": 10010 + }, + { + "epoch": 1.8063818280151434, + "grad_norm": 0.04530148580670357, + "learning_rate": 9.630497445134293e-05, + "loss": 0.0213, + "step": 10020 + }, + { + "epoch": 1.8081846042906076, + "grad_norm": 0.05428197979927063, + "learning_rate": 9.62945704174136e-05, + "loss": 0.0201, + "step": 10030 + }, + { + "epoch": 1.8099873805660718, + "grad_norm": 0.08402746915817261, + "learning_rate": 9.628415232041612e-05, + "loss": 0.0204, + "step": 10040 + }, + { + "epoch": 1.811790156841536, + "grad_norm": 0.06065186858177185, + "learning_rate": 9.627372016351524e-05, + "loss": 0.0188, + "step": 10050 + }, + { + "epoch": 1.8135929331170002, + "grad_norm": 0.055774275213479996, + "learning_rate": 9.626327394987995e-05, + "loss": 0.0199, + "step": 10060 + }, + { + "epoch": 1.8153957093924644, + "grad_norm": 0.05548562854528427, + "learning_rate": 9.625281368268355e-05, + "loss": 0.0197, + "step": 10070 + }, + { + "epoch": 1.8171984856679286, + "grad_norm": 0.04034167528152466, + "learning_rate": 9.624233936510357e-05, + "loss": 0.0188, + "step": 10080 + }, + { + "epoch": 1.8190012619433928, + "grad_norm": 0.05677448585629463, + "learning_rate": 9.623185100032187e-05, + "loss": 0.0202, + "step": 10090 + }, + { + "epoch": 1.820804038218857, + "grad_norm": 0.06716450303792953, + "learning_rate": 9.62213485915245e-05, + "loss": 0.0201, + "step": 10100 + }, + { + "epoch": 1.8226068144943213, + "grad_norm": 0.0505664236843586, + "learning_rate": 9.621083214190186e-05, + "loss": 0.0202, + "step": 10110 + }, + { + "epoch": 1.8244095907697855, + "grad_norm": 0.057667702436447144, + "learning_rate": 9.62003016546485e-05, + "loss": 0.0201, + "step": 10120 + }, + { + "epoch": 1.8262123670452497, + "grad_norm": 0.05325646325945854, + "learning_rate": 9.618975713296339e-05, + "loss": 0.021, + "step": 10130 + }, + { + "epoch": 1.828015143320714, + "grad_norm": 0.037869781255722046, + "learning_rate": 9.61791985800496e-05, + "loss": 0.0195, + "step": 10140 + }, + { + "epoch": 1.8298179195961781, + "grad_norm": 0.04395619034767151, + "learning_rate": 9.616862599911458e-05, + "loss": 0.0204, + "step": 10150 + }, + { + "epoch": 1.8316206958716423, + "grad_norm": 0.05924016609787941, + "learning_rate": 9.615803939337e-05, + "loss": 0.0204, + "step": 10160 + }, + { + "epoch": 1.8334234721471065, + "grad_norm": 0.03749702125787735, + "learning_rate": 9.614743876603178e-05, + "loss": 0.02, + "step": 10170 + }, + { + "epoch": 1.8352262484225708, + "grad_norm": 0.04175048694014549, + "learning_rate": 9.613682412032013e-05, + "loss": 0.0196, + "step": 10180 + }, + { + "epoch": 1.837029024698035, + "grad_norm": 0.06096727401018143, + "learning_rate": 9.612619545945947e-05, + "loss": 0.0194, + "step": 10190 + }, + { + "epoch": 1.8388318009734992, + "grad_norm": 0.05051444098353386, + "learning_rate": 9.611555278667852e-05, + "loss": 0.021, + "step": 10200 + }, + { + "epoch": 1.8406345772489634, + "grad_norm": 0.07645382732152939, + "learning_rate": 9.610489610521024e-05, + "loss": 0.0189, + "step": 10210 + }, + { + "epoch": 1.8424373535244276, + "grad_norm": 0.05622135475277901, + "learning_rate": 9.609422541829187e-05, + "loss": 0.0185, + "step": 10220 + }, + { + "epoch": 1.8442401297998918, + "grad_norm": 0.06744464486837387, + "learning_rate": 9.608354072916486e-05, + "loss": 0.0195, + "step": 10230 + }, + { + "epoch": 1.846042906075356, + "grad_norm": 0.050015565007925034, + "learning_rate": 9.607284204107493e-05, + "loss": 0.021, + "step": 10240 + }, + { + "epoch": 1.8478456823508203, + "grad_norm": 0.06415226310491562, + "learning_rate": 9.606212935727208e-05, + "loss": 0.0201, + "step": 10250 + }, + { + "epoch": 1.8496484586262845, + "grad_norm": 0.04375769942998886, + "learning_rate": 9.605140268101052e-05, + "loss": 0.0188, + "step": 10260 + }, + { + "epoch": 1.8514512349017487, + "grad_norm": 0.058428410440683365, + "learning_rate": 9.604066201554875e-05, + "loss": 0.0204, + "step": 10270 + }, + { + "epoch": 1.853254011177213, + "grad_norm": 0.05927153304219246, + "learning_rate": 9.60299073641495e-05, + "loss": 0.0199, + "step": 10280 + }, + { + "epoch": 1.855056787452677, + "grad_norm": 0.055333442986011505, + "learning_rate": 9.601913873007974e-05, + "loss": 0.0211, + "step": 10290 + }, + { + "epoch": 1.8568595637281413, + "grad_norm": 0.06076036021113396, + "learning_rate": 9.60083561166107e-05, + "loss": 0.0196, + "step": 10300 + }, + { + "epoch": 1.8586623400036055, + "grad_norm": 0.04843183979392052, + "learning_rate": 9.599755952701783e-05, + "loss": 0.019, + "step": 10310 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 0.08332542330026627, + "learning_rate": 9.598674896458089e-05, + "loss": 0.0191, + "step": 10320 + }, + { + "epoch": 1.862267892554534, + "grad_norm": 0.054583530873060226, + "learning_rate": 9.597592443258383e-05, + "loss": 0.0184, + "step": 10330 + }, + { + "epoch": 1.8640706688299982, + "grad_norm": 0.034477148205041885, + "learning_rate": 9.596508593431483e-05, + "loss": 0.0179, + "step": 10340 + }, + { + "epoch": 1.8658734451054624, + "grad_norm": 0.0649976134300232, + "learning_rate": 9.59542334730664e-05, + "loss": 0.0198, + "step": 10350 + }, + { + "epoch": 1.8676762213809266, + "grad_norm": 0.03918307274580002, + "learning_rate": 9.594336705213516e-05, + "loss": 0.019, + "step": 10360 + }, + { + "epoch": 1.8694789976563908, + "grad_norm": 0.037134163081645966, + "learning_rate": 9.593248667482208e-05, + "loss": 0.0205, + "step": 10370 + }, + { + "epoch": 1.871281773931855, + "grad_norm": 0.060363538563251495, + "learning_rate": 9.592159234443233e-05, + "loss": 0.0202, + "step": 10380 + }, + { + "epoch": 1.8730845502073192, + "grad_norm": 0.06142058223485947, + "learning_rate": 9.59106840642753e-05, + "loss": 0.0198, + "step": 10390 + }, + { + "epoch": 1.8748873264827834, + "grad_norm": 0.039658207446336746, + "learning_rate": 9.589976183766467e-05, + "loss": 0.0201, + "step": 10400 + }, + { + "epoch": 1.8766901027582477, + "grad_norm": 0.05468040704727173, + "learning_rate": 9.58888256679183e-05, + "loss": 0.0197, + "step": 10410 + }, + { + "epoch": 1.8784928790337119, + "grad_norm": 0.0647147074341774, + "learning_rate": 9.587787555835832e-05, + "loss": 0.0185, + "step": 10420 + }, + { + "epoch": 1.880295655309176, + "grad_norm": 0.058429423719644547, + "learning_rate": 9.586691151231107e-05, + "loss": 0.0194, + "step": 10430 + }, + { + "epoch": 1.8820984315846403, + "grad_norm": 0.041789717972278595, + "learning_rate": 9.585593353310715e-05, + "loss": 0.0201, + "step": 10440 + }, + { + "epoch": 1.8839012078601045, + "grad_norm": 0.06954993307590485, + "learning_rate": 9.58449416240814e-05, + "loss": 0.0188, + "step": 10450 + }, + { + "epoch": 1.8857039841355687, + "grad_norm": 0.06261508911848068, + "learning_rate": 9.583393578857283e-05, + "loss": 0.0199, + "step": 10460 + }, + { + "epoch": 1.887506760411033, + "grad_norm": 0.07021066546440125, + "learning_rate": 9.582291602992474e-05, + "loss": 0.0214, + "step": 10470 + }, + { + "epoch": 1.8893095366864974, + "grad_norm": 0.06923937797546387, + "learning_rate": 9.581188235148466e-05, + "loss": 0.0191, + "step": 10480 + }, + { + "epoch": 1.8911123129619614, + "grad_norm": 0.05351436138153076, + "learning_rate": 9.58008347566043e-05, + "loss": 0.0209, + "step": 10490 + }, + { + "epoch": 1.8929150892374258, + "grad_norm": 0.04541342332959175, + "learning_rate": 9.578977324863965e-05, + "loss": 0.0187, + "step": 10500 + }, + { + "epoch": 1.8947178655128898, + "grad_norm": 0.06443334370851517, + "learning_rate": 9.577869783095089e-05, + "loss": 0.0199, + "step": 10510 + }, + { + "epoch": 1.8965206417883542, + "grad_norm": 0.04308365657925606, + "learning_rate": 9.576760850690245e-05, + "loss": 0.0186, + "step": 10520 + }, + { + "epoch": 1.8983234180638182, + "grad_norm": 0.04976684972643852, + "learning_rate": 9.575650527986298e-05, + "loss": 0.0197, + "step": 10530 + }, + { + "epoch": 1.9001261943392826, + "grad_norm": 0.04966283217072487, + "learning_rate": 9.574538815320531e-05, + "loss": 0.0183, + "step": 10540 + }, + { + "epoch": 1.9019289706147466, + "grad_norm": 0.04736045375466347, + "learning_rate": 9.573425713030656e-05, + "loss": 0.0189, + "step": 10550 + }, + { + "epoch": 1.903731746890211, + "grad_norm": 0.04477149620652199, + "learning_rate": 9.572311221454806e-05, + "loss": 0.0208, + "step": 10560 + }, + { + "epoch": 1.905534523165675, + "grad_norm": 0.04618890583515167, + "learning_rate": 9.57119534093153e-05, + "loss": 0.0172, + "step": 10570 + }, + { + "epoch": 1.9073372994411395, + "grad_norm": 0.03977326303720474, + "learning_rate": 9.570078071799806e-05, + "loss": 0.02, + "step": 10580 + }, + { + "epoch": 1.9091400757166035, + "grad_norm": 0.06925521790981293, + "learning_rate": 9.568959414399028e-05, + "loss": 0.0202, + "step": 10590 + }, + { + "epoch": 1.910942851992068, + "grad_norm": 0.061184804886579514, + "learning_rate": 9.567839369069018e-05, + "loss": 0.0192, + "step": 10600 + }, + { + "epoch": 1.912745628267532, + "grad_norm": 0.0442969910800457, + "learning_rate": 9.566717936150013e-05, + "loss": 0.0199, + "step": 10610 + }, + { + "epoch": 1.9145484045429964, + "grad_norm": 0.05663587525486946, + "learning_rate": 9.565595115982678e-05, + "loss": 0.0188, + "step": 10620 + }, + { + "epoch": 1.9163511808184603, + "grad_norm": 0.05110369250178337, + "learning_rate": 9.564470908908094e-05, + "loss": 0.0197, + "step": 10630 + }, + { + "epoch": 1.9181539570939248, + "grad_norm": 0.054244786500930786, + "learning_rate": 9.563345315267764e-05, + "loss": 0.0208, + "step": 10640 + }, + { + "epoch": 1.9199567333693888, + "grad_norm": 0.0678684189915657, + "learning_rate": 9.562218335403616e-05, + "loss": 0.019, + "step": 10650 + }, + { + "epoch": 1.9217595096448532, + "grad_norm": 0.03580731898546219, + "learning_rate": 9.561089969657999e-05, + "loss": 0.0188, + "step": 10660 + }, + { + "epoch": 1.9235622859203172, + "grad_norm": 0.06203586980700493, + "learning_rate": 9.559960218373673e-05, + "loss": 0.0203, + "step": 10670 + }, + { + "epoch": 1.9253650621957816, + "grad_norm": 0.050029970705509186, + "learning_rate": 9.558829081893836e-05, + "loss": 0.0206, + "step": 10680 + }, + { + "epoch": 1.9271678384712456, + "grad_norm": 0.04566832631826401, + "learning_rate": 9.55769656056209e-05, + "loss": 0.018, + "step": 10690 + }, + { + "epoch": 1.92897061474671, + "grad_norm": 0.048688698559999466, + "learning_rate": 9.556562654722469e-05, + "loss": 0.0195, + "step": 10700 + }, + { + "epoch": 1.930773391022174, + "grad_norm": 0.044818002730607986, + "learning_rate": 9.555427364719422e-05, + "loss": 0.0189, + "step": 10710 + }, + { + "epoch": 1.9325761672976385, + "grad_norm": 0.06032998487353325, + "learning_rate": 9.55429069089782e-05, + "loss": 0.0193, + "step": 10720 + }, + { + "epoch": 1.9343789435731025, + "grad_norm": 0.055361032485961914, + "learning_rate": 9.553152633602956e-05, + "loss": 0.0194, + "step": 10730 + }, + { + "epoch": 1.936181719848567, + "grad_norm": 0.06796430796384811, + "learning_rate": 9.552013193180543e-05, + "loss": 0.0192, + "step": 10740 + }, + { + "epoch": 1.937984496124031, + "grad_norm": 0.04104098305106163, + "learning_rate": 9.550872369976707e-05, + "loss": 0.0192, + "step": 10750 + }, + { + "epoch": 1.9397872723994953, + "grad_norm": 0.048406004905700684, + "learning_rate": 9.549730164338007e-05, + "loss": 0.0204, + "step": 10760 + }, + { + "epoch": 1.9415900486749593, + "grad_norm": 0.06462853401899338, + "learning_rate": 9.548586576611408e-05, + "loss": 0.0208, + "step": 10770 + }, + { + "epoch": 1.9433928249504238, + "grad_norm": 0.06952539086341858, + "learning_rate": 9.54744160714431e-05, + "loss": 0.0195, + "step": 10780 + }, + { + "epoch": 1.9451956012258877, + "grad_norm": 0.06650444865226746, + "learning_rate": 9.546295256284516e-05, + "loss": 0.0199, + "step": 10790 + }, + { + "epoch": 1.9469983775013522, + "grad_norm": 0.06242068111896515, + "learning_rate": 9.545147524380265e-05, + "loss": 0.0191, + "step": 10800 + }, + { + "epoch": 1.9488011537768162, + "grad_norm": 0.05579899996519089, + "learning_rate": 9.543998411780201e-05, + "loss": 0.0207, + "step": 10810 + }, + { + "epoch": 1.9506039300522806, + "grad_norm": 0.06585012376308441, + "learning_rate": 9.542847918833397e-05, + "loss": 0.0202, + "step": 10820 + }, + { + "epoch": 1.9524067063277446, + "grad_norm": 0.049776770174503326, + "learning_rate": 9.541696045889343e-05, + "loss": 0.0191, + "step": 10830 + }, + { + "epoch": 1.954209482603209, + "grad_norm": 0.04830382764339447, + "learning_rate": 9.540542793297947e-05, + "loss": 0.0184, + "step": 10840 + }, + { + "epoch": 1.956012258878673, + "grad_norm": 0.047542523592710495, + "learning_rate": 9.539388161409537e-05, + "loss": 0.0193, + "step": 10850 + }, + { + "epoch": 1.9578150351541375, + "grad_norm": 0.08047202229499817, + "learning_rate": 9.538232150574857e-05, + "loss": 0.0188, + "step": 10860 + }, + { + "epoch": 1.9596178114296015, + "grad_norm": 0.07332579046487808, + "learning_rate": 9.537074761145076e-05, + "loss": 0.021, + "step": 10870 + }, + { + "epoch": 1.9614205877050659, + "grad_norm": 0.0493595190346241, + "learning_rate": 9.535915993471778e-05, + "loss": 0.021, + "step": 10880 + }, + { + "epoch": 1.9632233639805299, + "grad_norm": 0.06225757673382759, + "learning_rate": 9.534755847906964e-05, + "loss": 0.0201, + "step": 10890 + }, + { + "epoch": 1.9650261402559943, + "grad_norm": 0.04433118551969528, + "learning_rate": 9.533594324803057e-05, + "loss": 0.02, + "step": 10900 + }, + { + "epoch": 1.9668289165314583, + "grad_norm": 0.06525418907403946, + "learning_rate": 9.532431424512895e-05, + "loss": 0.0182, + "step": 10910 + }, + { + "epoch": 1.9686316928069227, + "grad_norm": 0.05069471895694733, + "learning_rate": 9.531267147389741e-05, + "loss": 0.0187, + "step": 10920 + }, + { + "epoch": 1.9704344690823867, + "grad_norm": 0.05775514990091324, + "learning_rate": 9.530101493787266e-05, + "loss": 0.0192, + "step": 10930 + }, + { + "epoch": 1.9722372453578512, + "grad_norm": 0.06447020173072815, + "learning_rate": 9.528934464059571e-05, + "loss": 0.0197, + "step": 10940 + }, + { + "epoch": 1.9740400216333152, + "grad_norm": 0.0452558770775795, + "learning_rate": 9.527766058561163e-05, + "loss": 0.0182, + "step": 10950 + }, + { + "epoch": 1.9758427979087796, + "grad_norm": 0.05321752652525902, + "learning_rate": 9.526596277646976e-05, + "loss": 0.0213, + "step": 10960 + }, + { + "epoch": 1.9776455741842436, + "grad_norm": 0.06405824422836304, + "learning_rate": 9.525425121672358e-05, + "loss": 0.0205, + "step": 10970 + }, + { + "epoch": 1.979448350459708, + "grad_norm": 0.048017099499702454, + "learning_rate": 9.524252590993074e-05, + "loss": 0.0182, + "step": 10980 + }, + { + "epoch": 1.981251126735172, + "grad_norm": 0.06408586353063583, + "learning_rate": 9.523078685965309e-05, + "loss": 0.0197, + "step": 10990 + }, + { + "epoch": 1.9830539030106364, + "grad_norm": 0.08012508600950241, + "learning_rate": 9.521903406945664e-05, + "loss": 0.0206, + "step": 11000 + }, + { + "epoch": 1.9848566792861004, + "grad_norm": 0.053936880081892014, + "learning_rate": 9.520726754291158e-05, + "loss": 0.0196, + "step": 11010 + }, + { + "epoch": 1.9866594555615649, + "grad_norm": 0.058085717260837555, + "learning_rate": 9.519548728359227e-05, + "loss": 0.0185, + "step": 11020 + }, + { + "epoch": 1.9884622318370289, + "grad_norm": 0.05170876905322075, + "learning_rate": 9.518369329507726e-05, + "loss": 0.0196, + "step": 11030 + }, + { + "epoch": 1.9902650081124933, + "grad_norm": 0.042415618896484375, + "learning_rate": 9.51718855809492e-05, + "loss": 0.0196, + "step": 11040 + }, + { + "epoch": 1.9920677843879575, + "grad_norm": 0.06280050426721573, + "learning_rate": 9.516006414479502e-05, + "loss": 0.0207, + "step": 11050 + }, + { + "epoch": 1.9938705606634217, + "grad_norm": 0.04251417890191078, + "learning_rate": 9.514822899020572e-05, + "loss": 0.0191, + "step": 11060 + }, + { + "epoch": 1.995673336938886, + "grad_norm": 0.07592503726482391, + "learning_rate": 9.513638012077654e-05, + "loss": 0.0198, + "step": 11070 + }, + { + "epoch": 1.9974761132143501, + "grad_norm": 0.043099720031023026, + "learning_rate": 9.512451754010683e-05, + "loss": 0.0193, + "step": 11080 + }, + { + "epoch": 1.9992788894898144, + "grad_norm": 0.0629250556230545, + "learning_rate": 9.511264125180013e-05, + "loss": 0.019, + "step": 11090 + }, + { + "epoch": 2.0010816657652786, + "grad_norm": 0.060784902423620224, + "learning_rate": 9.510075125946414e-05, + "loss": 0.0185, + "step": 11100 + }, + { + "epoch": 2.0028844420407426, + "grad_norm": 0.04950178042054176, + "learning_rate": 9.508884756671075e-05, + "loss": 0.0186, + "step": 11110 + }, + { + "epoch": 2.004687218316207, + "grad_norm": 0.04764701798558235, + "learning_rate": 9.507693017715596e-05, + "loss": 0.0182, + "step": 11120 + }, + { + "epoch": 2.006489994591671, + "grad_norm": 0.03921446576714516, + "learning_rate": 9.506499909441997e-05, + "loss": 0.0198, + "step": 11130 + }, + { + "epoch": 2.0082927708671354, + "grad_norm": 0.04284612089395523, + "learning_rate": 9.505305432212713e-05, + "loss": 0.0183, + "step": 11140 + }, + { + "epoch": 2.0100955471425994, + "grad_norm": 0.040782131254673004, + "learning_rate": 9.504109586390595e-05, + "loss": 0.0188, + "step": 11150 + }, + { + "epoch": 2.011898323418064, + "grad_norm": 0.03780316933989525, + "learning_rate": 9.502912372338908e-05, + "loss": 0.0201, + "step": 11160 + }, + { + "epoch": 2.013701099693528, + "grad_norm": 0.05147184059023857, + "learning_rate": 9.501713790421335e-05, + "loss": 0.0189, + "step": 11170 + }, + { + "epoch": 2.0155038759689923, + "grad_norm": 0.04785027727484703, + "learning_rate": 9.500513841001974e-05, + "loss": 0.0191, + "step": 11180 + }, + { + "epoch": 2.0173066522444563, + "grad_norm": 0.04233533516526222, + "learning_rate": 9.499312524445336e-05, + "loss": 0.0188, + "step": 11190 + }, + { + "epoch": 2.0191094285199207, + "grad_norm": 0.054973054677248, + "learning_rate": 9.498109841116351e-05, + "loss": 0.0192, + "step": 11200 + }, + { + "epoch": 2.0209122047953847, + "grad_norm": 0.05098136141896248, + "learning_rate": 9.496905791380363e-05, + "loss": 0.0202, + "step": 11210 + }, + { + "epoch": 2.022714981070849, + "grad_norm": 0.06053069233894348, + "learning_rate": 9.495700375603129e-05, + "loss": 0.0188, + "step": 11220 + }, + { + "epoch": 2.024517757346313, + "grad_norm": 0.04603096470236778, + "learning_rate": 9.494493594150822e-05, + "loss": 0.0175, + "step": 11230 + }, + { + "epoch": 2.0263205336217776, + "grad_norm": 0.03734096884727478, + "learning_rate": 9.493285447390032e-05, + "loss": 0.0181, + "step": 11240 + }, + { + "epoch": 2.0281233098972415, + "grad_norm": 0.046233274042606354, + "learning_rate": 9.492075935687761e-05, + "loss": 0.0186, + "step": 11250 + }, + { + "epoch": 2.029926086172706, + "grad_norm": 0.049352407455444336, + "learning_rate": 9.490865059411427e-05, + "loss": 0.0185, + "step": 11260 + }, + { + "epoch": 2.03172886244817, + "grad_norm": 0.05269843712449074, + "learning_rate": 9.489652818928863e-05, + "loss": 0.0194, + "step": 11270 + }, + { + "epoch": 2.0335316387236344, + "grad_norm": 0.05059188976883888, + "learning_rate": 9.488439214608315e-05, + "loss": 0.0185, + "step": 11280 + }, + { + "epoch": 2.0353344149990984, + "grad_norm": 0.046580202877521515, + "learning_rate": 9.487224246818444e-05, + "loss": 0.0182, + "step": 11290 + }, + { + "epoch": 2.037137191274563, + "grad_norm": 0.054148849099874496, + "learning_rate": 9.486007915928325e-05, + "loss": 0.0195, + "step": 11300 + }, + { + "epoch": 2.038939967550027, + "grad_norm": 0.038010403513908386, + "learning_rate": 9.484790222307448e-05, + "loss": 0.0202, + "step": 11310 + }, + { + "epoch": 2.0407427438254913, + "grad_norm": 0.04993676766753197, + "learning_rate": 9.483571166325716e-05, + "loss": 0.0173, + "step": 11320 + }, + { + "epoch": 2.0425455201009557, + "grad_norm": 0.04767480492591858, + "learning_rate": 9.482350748353444e-05, + "loss": 0.0179, + "step": 11330 + }, + { + "epoch": 2.0443482963764197, + "grad_norm": 0.04710247740149498, + "learning_rate": 9.481128968761363e-05, + "loss": 0.0194, + "step": 11340 + }, + { + "epoch": 2.046151072651884, + "grad_norm": 0.044514454901218414, + "learning_rate": 9.479905827920621e-05, + "loss": 0.0207, + "step": 11350 + }, + { + "epoch": 2.047953848927348, + "grad_norm": 0.04501241818070412, + "learning_rate": 9.478681326202773e-05, + "loss": 0.0203, + "step": 11360 + }, + { + "epoch": 2.0497566252028125, + "grad_norm": 0.056132253259420395, + "learning_rate": 9.477455463979791e-05, + "loss": 0.0188, + "step": 11370 + }, + { + "epoch": 2.0515594014782765, + "grad_norm": 0.049749210476875305, + "learning_rate": 9.476228241624059e-05, + "loss": 0.019, + "step": 11380 + }, + { + "epoch": 2.053362177753741, + "grad_norm": 0.044695429503917694, + "learning_rate": 9.474999659508374e-05, + "loss": 0.0187, + "step": 11390 + }, + { + "epoch": 2.055164954029205, + "grad_norm": 0.055507369339466095, + "learning_rate": 9.47376971800595e-05, + "loss": 0.0182, + "step": 11400 + }, + { + "epoch": 2.0569677303046694, + "grad_norm": 0.03703399747610092, + "learning_rate": 9.472538417490409e-05, + "loss": 0.0184, + "step": 11410 + }, + { + "epoch": 2.0587705065801334, + "grad_norm": 0.04755253717303276, + "learning_rate": 9.471305758335784e-05, + "loss": 0.0185, + "step": 11420 + }, + { + "epoch": 2.060573282855598, + "grad_norm": 0.07012926787137985, + "learning_rate": 9.47007174091653e-05, + "loss": 0.0187, + "step": 11430 + }, + { + "epoch": 2.062376059131062, + "grad_norm": 0.038705386221408844, + "learning_rate": 9.468836365607507e-05, + "loss": 0.0173, + "step": 11440 + }, + { + "epoch": 2.0641788354065262, + "grad_norm": 0.044470157474279404, + "learning_rate": 9.467599632783988e-05, + "loss": 0.0176, + "step": 11450 + }, + { + "epoch": 2.0659816116819902, + "grad_norm": 0.05963541939854622, + "learning_rate": 9.466361542821662e-05, + "loss": 0.019, + "step": 11460 + }, + { + "epoch": 2.0677843879574547, + "grad_norm": 0.04195240139961243, + "learning_rate": 9.465122096096625e-05, + "loss": 0.0193, + "step": 11470 + }, + { + "epoch": 2.0695871642329187, + "grad_norm": 0.04791637882590294, + "learning_rate": 9.463881292985391e-05, + "loss": 0.0175, + "step": 11480 + }, + { + "epoch": 2.071389940508383, + "grad_norm": 0.053187914192676544, + "learning_rate": 9.462639133864881e-05, + "loss": 0.018, + "step": 11490 + }, + { + "epoch": 2.073192716783847, + "grad_norm": 0.05147776007652283, + "learning_rate": 9.461395619112432e-05, + "loss": 0.019, + "step": 11500 + }, + { + "epoch": 2.0749954930593115, + "grad_norm": 0.05834876745939255, + "learning_rate": 9.460150749105791e-05, + "loss": 0.0177, + "step": 11510 + }, + { + "epoch": 2.0767982693347755, + "grad_norm": 0.05278666317462921, + "learning_rate": 9.458904524223116e-05, + "loss": 0.0178, + "step": 11520 + }, + { + "epoch": 2.07860104561024, + "grad_norm": 0.03935845568776131, + "learning_rate": 9.457656944842976e-05, + "loss": 0.018, + "step": 11530 + }, + { + "epoch": 2.080403821885704, + "grad_norm": 0.04231879487633705, + "learning_rate": 9.456408011344353e-05, + "loss": 0.019, + "step": 11540 + }, + { + "epoch": 2.0822065981611684, + "grad_norm": 0.04070485010743141, + "learning_rate": 9.455157724106643e-05, + "loss": 0.0185, + "step": 11550 + }, + { + "epoch": 2.0840093744366324, + "grad_norm": 0.04201081395149231, + "learning_rate": 9.453906083509647e-05, + "loss": 0.0172, + "step": 11560 + }, + { + "epoch": 2.085812150712097, + "grad_norm": 0.05136219784617424, + "learning_rate": 9.45265308993358e-05, + "loss": 0.0196, + "step": 11570 + }, + { + "epoch": 2.087614926987561, + "grad_norm": 0.0852644070982933, + "learning_rate": 9.451398743759071e-05, + "loss": 0.0182, + "step": 11580 + }, + { + "epoch": 2.0894177032630252, + "grad_norm": 0.04370466247200966, + "learning_rate": 9.450143045367156e-05, + "loss": 0.019, + "step": 11590 + }, + { + "epoch": 2.091220479538489, + "grad_norm": 0.0548919178545475, + "learning_rate": 9.448885995139283e-05, + "loss": 0.019, + "step": 11600 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 0.04408176615834236, + "learning_rate": 9.44762759345731e-05, + "loss": 0.0182, + "step": 11610 + }, + { + "epoch": 2.0948260320894176, + "grad_norm": 0.04217104613780975, + "learning_rate": 9.446367840703509e-05, + "loss": 0.0188, + "step": 11620 + }, + { + "epoch": 2.096628808364882, + "grad_norm": 0.0407659076154232, + "learning_rate": 9.445106737260556e-05, + "loss": 0.0178, + "step": 11630 + }, + { + "epoch": 2.098431584640346, + "grad_norm": 0.06806162744760513, + "learning_rate": 9.443844283511543e-05, + "loss": 0.0188, + "step": 11640 + }, + { + "epoch": 2.1002343609158105, + "grad_norm": 0.03603404015302658, + "learning_rate": 9.442580479839968e-05, + "loss": 0.0175, + "step": 11650 + }, + { + "epoch": 2.1020371371912745, + "grad_norm": 0.06573912501335144, + "learning_rate": 9.441315326629745e-05, + "loss": 0.0206, + "step": 11660 + }, + { + "epoch": 2.103839913466739, + "grad_norm": 0.05624518543481827, + "learning_rate": 9.44004882426519e-05, + "loss": 0.0191, + "step": 11670 + }, + { + "epoch": 2.105642689742203, + "grad_norm": 0.047574449330568314, + "learning_rate": 9.438780973131037e-05, + "loss": 0.0183, + "step": 11680 + }, + { + "epoch": 2.1074454660176674, + "grad_norm": 0.05878095328807831, + "learning_rate": 9.437511773612423e-05, + "loss": 0.0188, + "step": 11690 + }, + { + "epoch": 2.1092482422931313, + "grad_norm": 0.051653750240802765, + "learning_rate": 9.436241226094896e-05, + "loss": 0.0177, + "step": 11700 + }, + { + "epoch": 2.111051018568596, + "grad_norm": 0.04776541888713837, + "learning_rate": 9.434969330964418e-05, + "loss": 0.018, + "step": 11710 + }, + { + "epoch": 2.1128537948440598, + "grad_norm": 0.04655572399497032, + "learning_rate": 9.433696088607356e-05, + "loss": 0.0181, + "step": 11720 + }, + { + "epoch": 2.114656571119524, + "grad_norm": 0.0545085184276104, + "learning_rate": 9.432421499410486e-05, + "loss": 0.0172, + "step": 11730 + }, + { + "epoch": 2.116459347394988, + "grad_norm": 0.052100300788879395, + "learning_rate": 9.431145563760998e-05, + "loss": 0.0175, + "step": 11740 + }, + { + "epoch": 2.1182621236704526, + "grad_norm": 0.049410104751586914, + "learning_rate": 9.429868282046484e-05, + "loss": 0.019, + "step": 11750 + }, + { + "epoch": 2.1200648999459166, + "grad_norm": 0.04482901096343994, + "learning_rate": 9.428589654654951e-05, + "loss": 0.0197, + "step": 11760 + }, + { + "epoch": 2.121867676221381, + "grad_norm": 0.05914926156401634, + "learning_rate": 9.42730968197481e-05, + "loss": 0.0197, + "step": 11770 + }, + { + "epoch": 2.123670452496845, + "grad_norm": 0.07121001183986664, + "learning_rate": 9.426028364394883e-05, + "loss": 0.0192, + "step": 11780 + }, + { + "epoch": 2.1254732287723095, + "grad_norm": 0.050578322261571884, + "learning_rate": 9.424745702304402e-05, + "loss": 0.0204, + "step": 11790 + }, + { + "epoch": 2.1272760050477735, + "grad_norm": 0.05994211137294769, + "learning_rate": 9.423461696093006e-05, + "loss": 0.0178, + "step": 11800 + }, + { + "epoch": 2.129078781323238, + "grad_norm": 0.04912007600069046, + "learning_rate": 9.422176346150741e-05, + "loss": 0.0193, + "step": 11810 + }, + { + "epoch": 2.130881557598702, + "grad_norm": 0.05401720106601715, + "learning_rate": 9.420889652868063e-05, + "loss": 0.0194, + "step": 11820 + }, + { + "epoch": 2.1326843338741663, + "grad_norm": 0.07447097450494766, + "learning_rate": 9.419601616635836e-05, + "loss": 0.0195, + "step": 11830 + }, + { + "epoch": 2.1344871101496303, + "grad_norm": 0.05088619142770767, + "learning_rate": 9.418312237845331e-05, + "loss": 0.0195, + "step": 11840 + }, + { + "epoch": 2.1362898864250948, + "grad_norm": 0.05147622525691986, + "learning_rate": 9.417021516888225e-05, + "loss": 0.0191, + "step": 11850 + }, + { + "epoch": 2.1380926627005588, + "grad_norm": 0.061257313936948776, + "learning_rate": 9.415729454156608e-05, + "loss": 0.0184, + "step": 11860 + }, + { + "epoch": 2.139895438976023, + "grad_norm": 0.04510786011815071, + "learning_rate": 9.414436050042973e-05, + "loss": 0.0179, + "step": 11870 + }, + { + "epoch": 2.141698215251487, + "grad_norm": 0.037306588143110275, + "learning_rate": 9.413141304940223e-05, + "loss": 0.0188, + "step": 11880 + }, + { + "epoch": 2.1435009915269516, + "grad_norm": 0.04815402999520302, + "learning_rate": 9.411845219241666e-05, + "loss": 0.0165, + "step": 11890 + }, + { + "epoch": 2.1453037678024156, + "grad_norm": 0.04068652167916298, + "learning_rate": 9.410547793341021e-05, + "loss": 0.0189, + "step": 11900 + }, + { + "epoch": 2.14710654407788, + "grad_norm": 0.04820746183395386, + "learning_rate": 9.409249027632408e-05, + "loss": 0.0174, + "step": 11910 + }, + { + "epoch": 2.148909320353344, + "grad_norm": 0.04752184823155403, + "learning_rate": 9.407948922510362e-05, + "loss": 0.0189, + "step": 11920 + }, + { + "epoch": 2.1507120966288085, + "grad_norm": 0.04815623164176941, + "learning_rate": 9.406647478369817e-05, + "loss": 0.0171, + "step": 11930 + }, + { + "epoch": 2.1525148729042725, + "grad_norm": 0.04531407356262207, + "learning_rate": 9.405344695606118e-05, + "loss": 0.0189, + "step": 11940 + }, + { + "epoch": 2.154317649179737, + "grad_norm": 0.05846435949206352, + "learning_rate": 9.404040574615018e-05, + "loss": 0.0181, + "step": 11950 + }, + { + "epoch": 2.156120425455201, + "grad_norm": 0.05158604308962822, + "learning_rate": 9.402735115792674e-05, + "loss": 0.0178, + "step": 11960 + }, + { + "epoch": 2.1579232017306653, + "grad_norm": 0.04581731557846069, + "learning_rate": 9.401428319535649e-05, + "loss": 0.0186, + "step": 11970 + }, + { + "epoch": 2.1597259780061293, + "grad_norm": 0.06311949342489243, + "learning_rate": 9.400120186240912e-05, + "loss": 0.0192, + "step": 11980 + }, + { + "epoch": 2.1615287542815937, + "grad_norm": 0.04585142806172371, + "learning_rate": 9.398810716305844e-05, + "loss": 0.0191, + "step": 11990 + }, + { + "epoch": 2.1633315305570577, + "grad_norm": 0.044545140117406845, + "learning_rate": 9.397499910128222e-05, + "loss": 0.0182, + "step": 12000 + }, + { + "epoch": 2.165134306832522, + "grad_norm": 0.054683320224285126, + "learning_rate": 9.396187768106237e-05, + "loss": 0.0196, + "step": 12010 + }, + { + "epoch": 2.166937083107986, + "grad_norm": 0.07583874464035034, + "learning_rate": 9.394874290638482e-05, + "loss": 0.0198, + "step": 12020 + }, + { + "epoch": 2.1687398593834506, + "grad_norm": 0.044878263026475906, + "learning_rate": 9.393559478123959e-05, + "loss": 0.0186, + "step": 12030 + }, + { + "epoch": 2.1705426356589146, + "grad_norm": 0.046777017414569855, + "learning_rate": 9.39224333096207e-05, + "loss": 0.0208, + "step": 12040 + }, + { + "epoch": 2.172345411934379, + "grad_norm": 0.04405393823981285, + "learning_rate": 9.390925849552629e-05, + "loss": 0.0177, + "step": 12050 + }, + { + "epoch": 2.174148188209843, + "grad_norm": 0.0696743056178093, + "learning_rate": 9.389607034295849e-05, + "loss": 0.0183, + "step": 12060 + }, + { + "epoch": 2.1759509644853074, + "grad_norm": 0.06206822395324707, + "learning_rate": 9.388286885592355e-05, + "loss": 0.0185, + "step": 12070 + }, + { + "epoch": 2.1777537407607714, + "grad_norm": 0.045508287847042084, + "learning_rate": 9.386965403843168e-05, + "loss": 0.0186, + "step": 12080 + }, + { + "epoch": 2.179556517036236, + "grad_norm": 0.05979624018073082, + "learning_rate": 9.385642589449726e-05, + "loss": 0.0183, + "step": 12090 + }, + { + "epoch": 2.1813592933117, + "grad_norm": 0.054192204028367996, + "learning_rate": 9.38431844281386e-05, + "loss": 0.0188, + "step": 12100 + }, + { + "epoch": 2.1831620695871643, + "grad_norm": 0.061764203011989594, + "learning_rate": 9.38299296433781e-05, + "loss": 0.0212, + "step": 12110 + }, + { + "epoch": 2.1849648458626283, + "grad_norm": 0.04798909276723862, + "learning_rate": 9.381666154424226e-05, + "loss": 0.0173, + "step": 12120 + }, + { + "epoch": 2.1867676221380927, + "grad_norm": 0.04615325480699539, + "learning_rate": 9.380338013476157e-05, + "loss": 0.0176, + "step": 12130 + }, + { + "epoch": 2.1885703984135567, + "grad_norm": 0.04360948130488396, + "learning_rate": 9.379008541897054e-05, + "loss": 0.0202, + "step": 12140 + }, + { + "epoch": 2.190373174689021, + "grad_norm": 0.042484354227781296, + "learning_rate": 9.377677740090777e-05, + "loss": 0.0173, + "step": 12150 + }, + { + "epoch": 2.192175950964485, + "grad_norm": 0.042886603623628616, + "learning_rate": 9.376345608461588e-05, + "loss": 0.0178, + "step": 12160 + }, + { + "epoch": 2.1939787272399496, + "grad_norm": 0.047171350568532944, + "learning_rate": 9.375012147414155e-05, + "loss": 0.0172, + "step": 12170 + }, + { + "epoch": 2.1957815035154136, + "grad_norm": 0.04297267273068428, + "learning_rate": 9.373677357353545e-05, + "loss": 0.018, + "step": 12180 + }, + { + "epoch": 2.197584279790878, + "grad_norm": 0.04194411635398865, + "learning_rate": 9.372341238685237e-05, + "loss": 0.0186, + "step": 12190 + }, + { + "epoch": 2.199387056066342, + "grad_norm": 0.04618703946471214, + "learning_rate": 9.371003791815102e-05, + "loss": 0.0186, + "step": 12200 + }, + { + "epoch": 2.2011898323418064, + "grad_norm": 0.054092105478048325, + "learning_rate": 9.369665017149429e-05, + "loss": 0.0193, + "step": 12210 + }, + { + "epoch": 2.2029926086172704, + "grad_norm": 0.04316537454724312, + "learning_rate": 9.368324915094895e-05, + "loss": 0.0182, + "step": 12220 + }, + { + "epoch": 2.204795384892735, + "grad_norm": 0.05260886624455452, + "learning_rate": 9.366983486058591e-05, + "loss": 0.0183, + "step": 12230 + }, + { + "epoch": 2.206598161168199, + "grad_norm": 0.04849545285105705, + "learning_rate": 9.365640730448009e-05, + "loss": 0.0198, + "step": 12240 + }, + { + "epoch": 2.2084009374436633, + "grad_norm": 0.04381267726421356, + "learning_rate": 9.36429664867104e-05, + "loss": 0.0185, + "step": 12250 + }, + { + "epoch": 2.2102037137191273, + "grad_norm": 0.040854621678590775, + "learning_rate": 9.362951241135982e-05, + "loss": 0.0171, + "step": 12260 + }, + { + "epoch": 2.2120064899945917, + "grad_norm": 0.046341340988874435, + "learning_rate": 9.361604508251534e-05, + "loss": 0.0177, + "step": 12270 + }, + { + "epoch": 2.2138092662700557, + "grad_norm": 0.04370513930916786, + "learning_rate": 9.360256450426799e-05, + "loss": 0.0183, + "step": 12280 + }, + { + "epoch": 2.21561204254552, + "grad_norm": 0.05271437019109726, + "learning_rate": 9.358907068071279e-05, + "loss": 0.0174, + "step": 12290 + }, + { + "epoch": 2.217414818820984, + "grad_norm": 0.04402655363082886, + "learning_rate": 9.357556361594882e-05, + "loss": 0.0169, + "step": 12300 + }, + { + "epoch": 2.2192175950964486, + "grad_norm": 0.03955942764878273, + "learning_rate": 9.356204331407917e-05, + "loss": 0.017, + "step": 12310 + }, + { + "epoch": 2.2210203713719125, + "grad_norm": 0.058840155601501465, + "learning_rate": 9.354850977921094e-05, + "loss": 0.0184, + "step": 12320 + }, + { + "epoch": 2.222823147647377, + "grad_norm": 0.05294778198003769, + "learning_rate": 9.353496301545529e-05, + "loss": 0.0183, + "step": 12330 + }, + { + "epoch": 2.224625923922841, + "grad_norm": 0.049787264317274094, + "learning_rate": 9.352140302692733e-05, + "loss": 0.0173, + "step": 12340 + }, + { + "epoch": 2.2264287001983054, + "grad_norm": 0.055396515876054764, + "learning_rate": 9.350782981774627e-05, + "loss": 0.0178, + "step": 12350 + }, + { + "epoch": 2.2282314764737694, + "grad_norm": 0.040666721761226654, + "learning_rate": 9.349424339203526e-05, + "loss": 0.0164, + "step": 12360 + }, + { + "epoch": 2.230034252749234, + "grad_norm": 0.03773987665772438, + "learning_rate": 9.34806437539215e-05, + "loss": 0.0178, + "step": 12370 + }, + { + "epoch": 2.231837029024698, + "grad_norm": 0.0412016361951828, + "learning_rate": 9.346703090753622e-05, + "loss": 0.016, + "step": 12380 + }, + { + "epoch": 2.2336398053001623, + "grad_norm": 0.06248285993933678, + "learning_rate": 9.345340485701461e-05, + "loss": 0.0167, + "step": 12390 + }, + { + "epoch": 2.2354425815756267, + "grad_norm": 0.04911290854215622, + "learning_rate": 9.343976560649595e-05, + "loss": 0.0178, + "step": 12400 + }, + { + "epoch": 2.2372453578510907, + "grad_norm": 0.0369994156062603, + "learning_rate": 9.342611316012344e-05, + "loss": 0.0188, + "step": 12410 + }, + { + "epoch": 2.2390481341265547, + "grad_norm": 0.05140696093440056, + "learning_rate": 9.341244752204437e-05, + "loss": 0.0173, + "step": 12420 + }, + { + "epoch": 2.240850910402019, + "grad_norm": 0.04239685460925102, + "learning_rate": 9.339876869640995e-05, + "loss": 0.0178, + "step": 12430 + }, + { + "epoch": 2.2426536866774835, + "grad_norm": 0.041075970977544785, + "learning_rate": 9.33850766873755e-05, + "loss": 0.0179, + "step": 12440 + }, + { + "epoch": 2.2444564629529475, + "grad_norm": 0.04543808102607727, + "learning_rate": 9.337137149910028e-05, + "loss": 0.0176, + "step": 12450 + }, + { + "epoch": 2.2462592392284115, + "grad_norm": 0.04644192382693291, + "learning_rate": 9.335765313574753e-05, + "loss": 0.0195, + "step": 12460 + }, + { + "epoch": 2.248062015503876, + "grad_norm": 0.04792995750904083, + "learning_rate": 9.334392160148457e-05, + "loss": 0.017, + "step": 12470 + }, + { + "epoch": 2.2498647917793404, + "grad_norm": 0.0529826283454895, + "learning_rate": 9.333017690048264e-05, + "loss": 0.0161, + "step": 12480 + }, + { + "epoch": 2.2516675680548044, + "grad_norm": 0.062125179916620255, + "learning_rate": 9.331641903691706e-05, + "loss": 0.0176, + "step": 12490 + }, + { + "epoch": 2.2534703443302684, + "grad_norm": 0.05746845901012421, + "learning_rate": 9.330264801496707e-05, + "loss": 0.018, + "step": 12500 + }, + { + "epoch": 2.255273120605733, + "grad_norm": 0.06177973002195358, + "learning_rate": 9.328886383881594e-05, + "loss": 0.0182, + "step": 12510 + }, + { + "epoch": 2.2570758968811973, + "grad_norm": 0.05217922478914261, + "learning_rate": 9.327506651265095e-05, + "loss": 0.0194, + "step": 12520 + }, + { + "epoch": 2.2588786731566612, + "grad_norm": 0.04634609445929527, + "learning_rate": 9.326125604066338e-05, + "loss": 0.0165, + "step": 12530 + }, + { + "epoch": 2.2606814494321252, + "grad_norm": 0.06264201551675797, + "learning_rate": 9.324743242704847e-05, + "loss": 0.018, + "step": 12540 + }, + { + "epoch": 2.2624842257075897, + "grad_norm": 0.05730437859892845, + "learning_rate": 9.323359567600546e-05, + "loss": 0.0187, + "step": 12550 + }, + { + "epoch": 2.264287001983054, + "grad_norm": 0.044798873364925385, + "learning_rate": 9.321974579173761e-05, + "loss": 0.017, + "step": 12560 + }, + { + "epoch": 2.266089778258518, + "grad_norm": 0.059193242341279984, + "learning_rate": 9.320588277845213e-05, + "loss": 0.0179, + "step": 12570 + }, + { + "epoch": 2.267892554533982, + "grad_norm": 0.04247918352484703, + "learning_rate": 9.319200664036026e-05, + "loss": 0.018, + "step": 12580 + }, + { + "epoch": 2.2696953308094465, + "grad_norm": 0.06215371936559677, + "learning_rate": 9.31781173816772e-05, + "loss": 0.0188, + "step": 12590 + }, + { + "epoch": 2.271498107084911, + "grad_norm": 0.038454875349998474, + "learning_rate": 9.316421500662212e-05, + "loss": 0.0183, + "step": 12600 + }, + { + "epoch": 2.273300883360375, + "grad_norm": 0.051428843289613724, + "learning_rate": 9.31502995194182e-05, + "loss": 0.0176, + "step": 12610 + }, + { + "epoch": 2.2751036596358394, + "grad_norm": 0.039957646280527115, + "learning_rate": 9.31363709242926e-05, + "loss": 0.0187, + "step": 12620 + }, + { + "epoch": 2.2769064359113034, + "grad_norm": 0.07360216230154037, + "learning_rate": 9.312242922547647e-05, + "loss": 0.0185, + "step": 12630 + }, + { + "epoch": 2.278709212186768, + "grad_norm": 0.044161684811115265, + "learning_rate": 9.310847442720492e-05, + "loss": 0.0173, + "step": 12640 + }, + { + "epoch": 2.280511988462232, + "grad_norm": 0.04912401735782623, + "learning_rate": 9.309450653371706e-05, + "loss": 0.018, + "step": 12650 + }, + { + "epoch": 2.2823147647376962, + "grad_norm": 0.054999157786369324, + "learning_rate": 9.308052554925595e-05, + "loss": 0.0188, + "step": 12660 + }, + { + "epoch": 2.2841175410131602, + "grad_norm": 0.035899825394153595, + "learning_rate": 9.306653147806867e-05, + "loss": 0.0175, + "step": 12670 + }, + { + "epoch": 2.2859203172886247, + "grad_norm": 0.042770326137542725, + "learning_rate": 9.305252432440622e-05, + "loss": 0.0171, + "step": 12680 + }, + { + "epoch": 2.2877230935640886, + "grad_norm": 0.052225347608327866, + "learning_rate": 9.303850409252361e-05, + "loss": 0.0183, + "step": 12690 + }, + { + "epoch": 2.289525869839553, + "grad_norm": 0.05941121652722359, + "learning_rate": 9.302447078667985e-05, + "loss": 0.0168, + "step": 12700 + }, + { + "epoch": 2.291328646115017, + "grad_norm": 0.04308955743908882, + "learning_rate": 9.301042441113783e-05, + "loss": 0.018, + "step": 12710 + }, + { + "epoch": 2.2931314223904815, + "grad_norm": 0.06348423659801483, + "learning_rate": 9.299636497016451e-05, + "loss": 0.0187, + "step": 12720 + }, + { + "epoch": 2.2949341986659455, + "grad_norm": 0.0393378809094429, + "learning_rate": 9.298229246803076e-05, + "loss": 0.0199, + "step": 12730 + }, + { + "epoch": 2.29673697494141, + "grad_norm": 0.06101982295513153, + "learning_rate": 9.296820690901144e-05, + "loss": 0.0177, + "step": 12740 + }, + { + "epoch": 2.298539751216874, + "grad_norm": 0.04529323801398277, + "learning_rate": 9.295410829738539e-05, + "loss": 0.0197, + "step": 12750 + }, + { + "epoch": 2.3003425274923384, + "grad_norm": 0.07504298537969589, + "learning_rate": 9.293999663743535e-05, + "loss": 0.0188, + "step": 12760 + }, + { + "epoch": 2.3021453037678024, + "grad_norm": 0.05081866681575775, + "learning_rate": 9.292587193344813e-05, + "loss": 0.0177, + "step": 12770 + }, + { + "epoch": 2.303948080043267, + "grad_norm": 0.05035237595438957, + "learning_rate": 9.291173418971437e-05, + "loss": 0.0198, + "step": 12780 + }, + { + "epoch": 2.305750856318731, + "grad_norm": 0.060294557362794876, + "learning_rate": 9.28975834105288e-05, + "loss": 0.0161, + "step": 12790 + }, + { + "epoch": 2.307553632594195, + "grad_norm": 0.037165142595767975, + "learning_rate": 9.288341960019004e-05, + "loss": 0.0182, + "step": 12800 + }, + { + "epoch": 2.309356408869659, + "grad_norm": 0.06348995119333267, + "learning_rate": 9.286924276300067e-05, + "loss": 0.0179, + "step": 12810 + }, + { + "epoch": 2.3111591851451236, + "grad_norm": 0.05800136178731918, + "learning_rate": 9.285505290326726e-05, + "loss": 0.0185, + "step": 12820 + }, + { + "epoch": 2.3129619614205876, + "grad_norm": 0.0504094660282135, + "learning_rate": 9.284085002530027e-05, + "loss": 0.0166, + "step": 12830 + }, + { + "epoch": 2.314764737696052, + "grad_norm": 0.0532107912003994, + "learning_rate": 9.282663413341422e-05, + "loss": 0.0171, + "step": 12840 + }, + { + "epoch": 2.316567513971516, + "grad_norm": 0.03987735137343407, + "learning_rate": 9.281240523192747e-05, + "loss": 0.0166, + "step": 12850 + }, + { + "epoch": 2.3183702902469805, + "grad_norm": 0.04715529829263687, + "learning_rate": 9.279816332516242e-05, + "loss": 0.0182, + "step": 12860 + }, + { + "epoch": 2.3201730665224445, + "grad_norm": 0.0592104010283947, + "learning_rate": 9.278390841744536e-05, + "loss": 0.0174, + "step": 12870 + }, + { + "epoch": 2.321975842797909, + "grad_norm": 0.05467063933610916, + "learning_rate": 9.276964051310658e-05, + "loss": 0.0176, + "step": 12880 + }, + { + "epoch": 2.323778619073373, + "grad_norm": 0.06170383095741272, + "learning_rate": 9.275535961648027e-05, + "loss": 0.019, + "step": 12890 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 0.038287121802568436, + "learning_rate": 9.274106573190459e-05, + "loss": 0.0165, + "step": 12900 + }, + { + "epoch": 2.3273841716243013, + "grad_norm": 0.04920131713151932, + "learning_rate": 9.272675886372168e-05, + "loss": 0.0177, + "step": 12910 + }, + { + "epoch": 2.3291869478997658, + "grad_norm": 0.04114221781492233, + "learning_rate": 9.271243901627754e-05, + "loss": 0.0183, + "step": 12920 + }, + { + "epoch": 2.3309897241752298, + "grad_norm": 0.051664285361766815, + "learning_rate": 9.269810619392219e-05, + "loss": 0.0179, + "step": 12930 + }, + { + "epoch": 2.332792500450694, + "grad_norm": 0.05725456774234772, + "learning_rate": 9.268376040100955e-05, + "loss": 0.019, + "step": 12940 + }, + { + "epoch": 2.334595276726158, + "grad_norm": 0.044263362884521484, + "learning_rate": 9.266940164189752e-05, + "loss": 0.0192, + "step": 12950 + }, + { + "epoch": 2.3363980530016226, + "grad_norm": 0.05342969670891762, + "learning_rate": 9.265502992094787e-05, + "loss": 0.0187, + "step": 12960 + }, + { + "epoch": 2.3382008292770866, + "grad_norm": 0.044457849115133286, + "learning_rate": 9.264064524252638e-05, + "loss": 0.0169, + "step": 12970 + }, + { + "epoch": 2.340003605552551, + "grad_norm": 0.06075321510434151, + "learning_rate": 9.262624761100271e-05, + "loss": 0.0181, + "step": 12980 + }, + { + "epoch": 2.341806381828015, + "grad_norm": 0.04136817529797554, + "learning_rate": 9.261183703075051e-05, + "loss": 0.0169, + "step": 12990 + }, + { + "epoch": 2.3436091581034795, + "grad_norm": 0.0574321448802948, + "learning_rate": 9.259741350614733e-05, + "loss": 0.0178, + "step": 13000 + }, + { + "epoch": 2.3454119343789435, + "grad_norm": 0.04999484494328499, + "learning_rate": 9.258297704157464e-05, + "loss": 0.0178, + "step": 13010 + }, + { + "epoch": 2.347214710654408, + "grad_norm": 0.05150268226861954, + "learning_rate": 9.256852764141786e-05, + "loss": 0.0182, + "step": 13020 + }, + { + "epoch": 2.349017486929872, + "grad_norm": 0.04244760796427727, + "learning_rate": 9.255406531006634e-05, + "loss": 0.0166, + "step": 13030 + }, + { + "epoch": 2.3508202632053363, + "grad_norm": 0.043575964868068695, + "learning_rate": 9.253959005191335e-05, + "loss": 0.017, + "step": 13040 + }, + { + "epoch": 2.3526230394808003, + "grad_norm": 0.0853356122970581, + "learning_rate": 9.25251018713561e-05, + "loss": 0.0188, + "step": 13050 + }, + { + "epoch": 2.3544258157562648, + "grad_norm": 0.03906095400452614, + "learning_rate": 9.251060077279571e-05, + "loss": 0.0175, + "step": 13060 + }, + { + "epoch": 2.3562285920317287, + "grad_norm": 0.04045063629746437, + "learning_rate": 9.249608676063724e-05, + "loss": 0.0177, + "step": 13070 + }, + { + "epoch": 2.358031368307193, + "grad_norm": 0.05732389912009239, + "learning_rate": 9.248155983928964e-05, + "loss": 0.017, + "step": 13080 + }, + { + "epoch": 2.359834144582657, + "grad_norm": 0.03386378660798073, + "learning_rate": 9.246702001316583e-05, + "loss": 0.0166, + "step": 13090 + }, + { + "epoch": 2.3616369208581216, + "grad_norm": 0.0417235866189003, + "learning_rate": 9.245246728668262e-05, + "loss": 0.0173, + "step": 13100 + }, + { + "epoch": 2.3634396971335856, + "grad_norm": 0.04871385917067528, + "learning_rate": 9.243790166426073e-05, + "loss": 0.0178, + "step": 13110 + }, + { + "epoch": 2.36524247340905, + "grad_norm": 0.0322159044444561, + "learning_rate": 9.242332315032484e-05, + "loss": 0.0173, + "step": 13120 + }, + { + "epoch": 2.367045249684514, + "grad_norm": 0.06107112020254135, + "learning_rate": 9.240873174930349e-05, + "loss": 0.0182, + "step": 13130 + }, + { + "epoch": 2.3688480259599785, + "grad_norm": 0.05051233246922493, + "learning_rate": 9.239412746562917e-05, + "loss": 0.0171, + "step": 13140 + }, + { + "epoch": 2.3706508022354424, + "grad_norm": 0.04905249923467636, + "learning_rate": 9.237951030373828e-05, + "loss": 0.0185, + "step": 13150 + }, + { + "epoch": 2.372453578510907, + "grad_norm": 0.05133669078350067, + "learning_rate": 9.236488026807113e-05, + "loss": 0.0181, + "step": 13160 + }, + { + "epoch": 2.374256354786371, + "grad_norm": 0.04691055417060852, + "learning_rate": 9.235023736307193e-05, + "loss": 0.0177, + "step": 13170 + }, + { + "epoch": 2.3760591310618353, + "grad_norm": 0.043550554662942886, + "learning_rate": 9.233558159318881e-05, + "loss": 0.0166, + "step": 13180 + }, + { + "epoch": 2.3778619073372993, + "grad_norm": 0.03385986015200615, + "learning_rate": 9.232091296287382e-05, + "loss": 0.0172, + "step": 13190 + }, + { + "epoch": 2.3796646836127637, + "grad_norm": 0.04919940605759621, + "learning_rate": 9.230623147658288e-05, + "loss": 0.0191, + "step": 13200 + }, + { + "epoch": 2.3814674598882277, + "grad_norm": 0.05682046711444855, + "learning_rate": 9.229153713877586e-05, + "loss": 0.0188, + "step": 13210 + }, + { + "epoch": 2.383270236163692, + "grad_norm": 0.03219263628125191, + "learning_rate": 9.227682995391649e-05, + "loss": 0.0187, + "step": 13220 + }, + { + "epoch": 2.385073012439156, + "grad_norm": 0.03755882382392883, + "learning_rate": 9.226210992647243e-05, + "loss": 0.0178, + "step": 13230 + }, + { + "epoch": 2.3868757887146206, + "grad_norm": 0.055419981479644775, + "learning_rate": 9.224737706091525e-05, + "loss": 0.0175, + "step": 13240 + }, + { + "epoch": 2.3886785649900846, + "grad_norm": 0.045903146266937256, + "learning_rate": 9.223263136172039e-05, + "loss": 0.0181, + "step": 13250 + }, + { + "epoch": 2.390481341265549, + "grad_norm": 0.03570179641246796, + "learning_rate": 9.22178728333672e-05, + "loss": 0.0164, + "step": 13260 + }, + { + "epoch": 2.392284117541013, + "grad_norm": 0.040554024279117584, + "learning_rate": 9.220310148033897e-05, + "loss": 0.0171, + "step": 13270 + }, + { + "epoch": 2.3940868938164774, + "grad_norm": 0.04486890882253647, + "learning_rate": 9.21883173071228e-05, + "loss": 0.0184, + "step": 13280 + }, + { + "epoch": 2.3958896700919414, + "grad_norm": 0.057587604969739914, + "learning_rate": 9.217352031820976e-05, + "loss": 0.0181, + "step": 13290 + }, + { + "epoch": 2.397692446367406, + "grad_norm": 0.0410863496363163, + "learning_rate": 9.215871051809477e-05, + "loss": 0.0171, + "step": 13300 + }, + { + "epoch": 2.39949522264287, + "grad_norm": 0.0463516041636467, + "learning_rate": 9.214388791127666e-05, + "loss": 0.0158, + "step": 13310 + }, + { + "epoch": 2.4012979989183343, + "grad_norm": 0.03586406260728836, + "learning_rate": 9.212905250225814e-05, + "loss": 0.0162, + "step": 13320 + }, + { + "epoch": 2.4031007751937983, + "grad_norm": 0.0435759611427784, + "learning_rate": 9.211420429554583e-05, + "loss": 0.0172, + "step": 13330 + }, + { + "epoch": 2.4049035514692627, + "grad_norm": 0.04795432090759277, + "learning_rate": 9.209934329565022e-05, + "loss": 0.0179, + "step": 13340 + }, + { + "epoch": 2.4067063277447267, + "grad_norm": 0.03454621136188507, + "learning_rate": 9.208446950708568e-05, + "loss": 0.0169, + "step": 13350 + }, + { + "epoch": 2.408509104020191, + "grad_norm": 0.05931982770562172, + "learning_rate": 9.20695829343705e-05, + "loss": 0.0176, + "step": 13360 + }, + { + "epoch": 2.410311880295655, + "grad_norm": 0.039950549602508545, + "learning_rate": 9.205468358202678e-05, + "loss": 0.0174, + "step": 13370 + }, + { + "epoch": 2.4121146565711196, + "grad_norm": 0.03795453906059265, + "learning_rate": 9.203977145458059e-05, + "loss": 0.0165, + "step": 13380 + }, + { + "epoch": 2.4139174328465836, + "grad_norm": 0.05222458764910698, + "learning_rate": 9.202484655656182e-05, + "loss": 0.0181, + "step": 13390 + }, + { + "epoch": 2.415720209122048, + "grad_norm": 0.05632326379418373, + "learning_rate": 9.200990889250427e-05, + "loss": 0.018, + "step": 13400 + }, + { + "epoch": 2.417522985397512, + "grad_norm": 0.048623740673065186, + "learning_rate": 9.19949584669456e-05, + "loss": 0.0173, + "step": 13410 + }, + { + "epoch": 2.4193257616729764, + "grad_norm": 0.04223509877920151, + "learning_rate": 9.197999528442738e-05, + "loss": 0.0175, + "step": 13420 + }, + { + "epoch": 2.4211285379484404, + "grad_norm": 0.05728420987725258, + "learning_rate": 9.196501934949499e-05, + "loss": 0.0175, + "step": 13430 + }, + { + "epoch": 2.422931314223905, + "grad_norm": 0.04311821237206459, + "learning_rate": 9.195003066669776e-05, + "loss": 0.0176, + "step": 13440 + }, + { + "epoch": 2.424734090499369, + "grad_norm": 0.040687426924705505, + "learning_rate": 9.193502924058884e-05, + "loss": 0.0173, + "step": 13450 + }, + { + "epoch": 2.4265368667748333, + "grad_norm": 0.04123549163341522, + "learning_rate": 9.192001507572526e-05, + "loss": 0.0166, + "step": 13460 + }, + { + "epoch": 2.4283396430502977, + "grad_norm": 0.03770912438631058, + "learning_rate": 9.190498817666793e-05, + "loss": 0.0154, + "step": 13470 + }, + { + "epoch": 2.4301424193257617, + "grad_norm": 0.04967711865901947, + "learning_rate": 9.188994854798163e-05, + "loss": 0.0187, + "step": 13480 + }, + { + "epoch": 2.4319451956012257, + "grad_norm": 0.03689836338162422, + "learning_rate": 9.187489619423499e-05, + "loss": 0.0197, + "step": 13490 + }, + { + "epoch": 2.43374797187669, + "grad_norm": 0.048009488731622696, + "learning_rate": 9.185983112000056e-05, + "loss": 0.0173, + "step": 13500 + }, + { + "epoch": 2.4355507481521546, + "grad_norm": 0.05793960392475128, + "learning_rate": 9.184475332985464e-05, + "loss": 0.0167, + "step": 13510 + }, + { + "epoch": 2.4373535244276185, + "grad_norm": 0.055282026529312134, + "learning_rate": 9.182966282837754e-05, + "loss": 0.0156, + "step": 13520 + }, + { + "epoch": 2.4391563007030825, + "grad_norm": 0.077792689204216, + "learning_rate": 9.18145596201533e-05, + "loss": 0.0194, + "step": 13530 + }, + { + "epoch": 2.440959076978547, + "grad_norm": 0.057654235512018204, + "learning_rate": 9.179944370976991e-05, + "loss": 0.0171, + "step": 13540 + }, + { + "epoch": 2.4427618532540114, + "grad_norm": 0.05850673466920853, + "learning_rate": 9.178431510181918e-05, + "loss": 0.017, + "step": 13550 + }, + { + "epoch": 2.4445646295294754, + "grad_norm": 0.04258791357278824, + "learning_rate": 9.176917380089675e-05, + "loss": 0.0186, + "step": 13560 + }, + { + "epoch": 2.4463674058049394, + "grad_norm": 0.035005491226911545, + "learning_rate": 9.175401981160219e-05, + "loss": 0.0179, + "step": 13570 + }, + { + "epoch": 2.448170182080404, + "grad_norm": 0.053044676780700684, + "learning_rate": 9.173885313853885e-05, + "loss": 0.0172, + "step": 13580 + }, + { + "epoch": 2.4499729583558683, + "grad_norm": 0.04435984790325165, + "learning_rate": 9.172367378631398e-05, + "loss": 0.0176, + "step": 13590 + }, + { + "epoch": 2.4517757346313322, + "grad_norm": 0.04213058575987816, + "learning_rate": 9.170848175953866e-05, + "loss": 0.0183, + "step": 13600 + }, + { + "epoch": 2.4535785109067962, + "grad_norm": 0.06019243597984314, + "learning_rate": 9.169327706282784e-05, + "loss": 0.017, + "step": 13610 + }, + { + "epoch": 2.4553812871822607, + "grad_norm": 0.046822547912597656, + "learning_rate": 9.167805970080029e-05, + "loss": 0.017, + "step": 13620 + }, + { + "epoch": 2.457184063457725, + "grad_norm": 0.0517612025141716, + "learning_rate": 9.166282967807864e-05, + "loss": 0.0176, + "step": 13630 + }, + { + "epoch": 2.458986839733189, + "grad_norm": 0.04472601041197777, + "learning_rate": 9.16475869992894e-05, + "loss": 0.0169, + "step": 13640 + }, + { + "epoch": 2.460789616008653, + "grad_norm": 0.042681388556957245, + "learning_rate": 9.163233166906284e-05, + "loss": 0.0169, + "step": 13650 + }, + { + "epoch": 2.4625923922841175, + "grad_norm": 0.04039390757679939, + "learning_rate": 9.161706369203317e-05, + "loss": 0.0168, + "step": 13660 + }, + { + "epoch": 2.464395168559582, + "grad_norm": 0.0496927872300148, + "learning_rate": 9.16017830728384e-05, + "loss": 0.0171, + "step": 13670 + }, + { + "epoch": 2.466197944835046, + "grad_norm": 0.044946908950805664, + "learning_rate": 9.158648981612035e-05, + "loss": 0.0164, + "step": 13680 + }, + { + "epoch": 2.46800072111051, + "grad_norm": 0.040972787886857986, + "learning_rate": 9.157118392652472e-05, + "loss": 0.0163, + "step": 13690 + }, + { + "epoch": 2.4698034973859744, + "grad_norm": 0.04120957851409912, + "learning_rate": 9.155586540870104e-05, + "loss": 0.0165, + "step": 13700 + }, + { + "epoch": 2.471606273661439, + "grad_norm": 0.04621298238635063, + "learning_rate": 9.154053426730267e-05, + "loss": 0.0167, + "step": 13710 + }, + { + "epoch": 2.473409049936903, + "grad_norm": 0.04061366617679596, + "learning_rate": 9.15251905069868e-05, + "loss": 0.0163, + "step": 13720 + }, + { + "epoch": 2.4752118262123672, + "grad_norm": 0.05270979180932045, + "learning_rate": 9.150983413241446e-05, + "loss": 0.0182, + "step": 13730 + }, + { + "epoch": 2.4770146024878312, + "grad_norm": 0.03702928498387337, + "learning_rate": 9.149446514825051e-05, + "loss": 0.0179, + "step": 13740 + }, + { + "epoch": 2.4788173787632957, + "grad_norm": 0.05120924115180969, + "learning_rate": 9.147908355916365e-05, + "loss": 0.0177, + "step": 13750 + }, + { + "epoch": 2.4806201550387597, + "grad_norm": 0.04374690726399422, + "learning_rate": 9.146368936982642e-05, + "loss": 0.0189, + "step": 13760 + }, + { + "epoch": 2.482422931314224, + "grad_norm": 0.04099708050489426, + "learning_rate": 9.144828258491511e-05, + "loss": 0.0175, + "step": 13770 + }, + { + "epoch": 2.484225707589688, + "grad_norm": 0.039554934948682785, + "learning_rate": 9.143286320910996e-05, + "loss": 0.0169, + "step": 13780 + }, + { + "epoch": 2.4860284838651525, + "grad_norm": 0.03700094670057297, + "learning_rate": 9.141743124709491e-05, + "loss": 0.0169, + "step": 13790 + }, + { + "epoch": 2.4878312601406165, + "grad_norm": 0.05056154355406761, + "learning_rate": 9.140198670355784e-05, + "loss": 0.0188, + "step": 13800 + }, + { + "epoch": 2.489634036416081, + "grad_norm": 0.04803236201405525, + "learning_rate": 9.138652958319034e-05, + "loss": 0.0184, + "step": 13810 + }, + { + "epoch": 2.491436812691545, + "grad_norm": 0.045031968504190445, + "learning_rate": 9.137105989068791e-05, + "loss": 0.0173, + "step": 13820 + }, + { + "epoch": 2.4932395889670094, + "grad_norm": 0.03556693345308304, + "learning_rate": 9.135557763074983e-05, + "loss": 0.0187, + "step": 13830 + }, + { + "epoch": 2.4950423652424734, + "grad_norm": 0.05333074554800987, + "learning_rate": 9.13400828080792e-05, + "loss": 0.018, + "step": 13840 + }, + { + "epoch": 2.496845141517938, + "grad_norm": 0.05641833320260048, + "learning_rate": 9.132457542738292e-05, + "loss": 0.0173, + "step": 13850 + }, + { + "epoch": 2.498647917793402, + "grad_norm": 0.034765373915433884, + "learning_rate": 9.130905549337174e-05, + "loss": 0.0169, + "step": 13860 + }, + { + "epoch": 2.500450694068866, + "grad_norm": 0.041158437728881836, + "learning_rate": 9.129352301076021e-05, + "loss": 0.0188, + "step": 13870 + }, + { + "epoch": 2.50225347034433, + "grad_norm": 0.04101722687482834, + "learning_rate": 9.127797798426668e-05, + "loss": 0.0179, + "step": 13880 + }, + { + "epoch": 2.5040562466197946, + "grad_norm": 0.04524582251906395, + "learning_rate": 9.126242041861333e-05, + "loss": 0.0168, + "step": 13890 + }, + { + "epoch": 2.5058590228952586, + "grad_norm": 0.04225512593984604, + "learning_rate": 9.124685031852611e-05, + "loss": 0.0171, + "step": 13900 + }, + { + "epoch": 2.507661799170723, + "grad_norm": 0.037549637258052826, + "learning_rate": 9.123126768873482e-05, + "loss": 0.017, + "step": 13910 + }, + { + "epoch": 2.509464575446187, + "grad_norm": 0.04935866966843605, + "learning_rate": 9.121567253397308e-05, + "loss": 0.0175, + "step": 13920 + }, + { + "epoch": 2.5112673517216515, + "grad_norm": 0.043871212750673294, + "learning_rate": 9.120006485897824e-05, + "loss": 0.0181, + "step": 13930 + }, + { + "epoch": 2.5130701279971155, + "grad_norm": 0.046266715973615646, + "learning_rate": 9.118444466849152e-05, + "loss": 0.0164, + "step": 13940 + }, + { + "epoch": 2.51487290427258, + "grad_norm": 0.04460478946566582, + "learning_rate": 9.116881196725793e-05, + "loss": 0.018, + "step": 13950 + }, + { + "epoch": 2.516675680548044, + "grad_norm": 0.055234260857105255, + "learning_rate": 9.115316676002627e-05, + "loss": 0.0172, + "step": 13960 + }, + { + "epoch": 2.5184784568235083, + "grad_norm": 0.055523887276649475, + "learning_rate": 9.113750905154911e-05, + "loss": 0.0175, + "step": 13970 + }, + { + "epoch": 2.5202812330989723, + "grad_norm": 0.05026349052786827, + "learning_rate": 9.112183884658289e-05, + "loss": 0.0161, + "step": 13980 + }, + { + "epoch": 2.5220840093744368, + "grad_norm": 0.044692546129226685, + "learning_rate": 9.11061561498878e-05, + "loss": 0.0172, + "step": 13990 + }, + { + "epoch": 2.5238867856499008, + "grad_norm": 0.039555624127388, + "learning_rate": 9.109046096622779e-05, + "loss": 0.0181, + "step": 14000 + }, + { + "epoch": 2.525689561925365, + "grad_norm": 0.05490704998373985, + "learning_rate": 9.107475330037069e-05, + "loss": 0.0182, + "step": 14010 + }, + { + "epoch": 2.527492338200829, + "grad_norm": 0.04604006186127663, + "learning_rate": 9.105903315708806e-05, + "loss": 0.018, + "step": 14020 + }, + { + "epoch": 2.5292951144762936, + "grad_norm": 0.05318626016378403, + "learning_rate": 9.104330054115524e-05, + "loss": 0.0182, + "step": 14030 + }, + { + "epoch": 2.5310978907517576, + "grad_norm": 0.05853656306862831, + "learning_rate": 9.102755545735141e-05, + "loss": 0.0178, + "step": 14040 + }, + { + "epoch": 2.532900667027222, + "grad_norm": 0.051720548421144485, + "learning_rate": 9.10117979104595e-05, + "loss": 0.0168, + "step": 14050 + }, + { + "epoch": 2.534703443302686, + "grad_norm": 0.06257446855306625, + "learning_rate": 9.099602790526624e-05, + "loss": 0.0172, + "step": 14060 + }, + { + "epoch": 2.5365062195781505, + "grad_norm": 0.046041857451200485, + "learning_rate": 9.098024544656212e-05, + "loss": 0.0175, + "step": 14070 + }, + { + "epoch": 2.5383089958536145, + "grad_norm": 0.06653042882680893, + "learning_rate": 9.096445053914148e-05, + "loss": 0.0175, + "step": 14080 + }, + { + "epoch": 2.540111772129079, + "grad_norm": 0.04360395669937134, + "learning_rate": 9.094864318780236e-05, + "loss": 0.0174, + "step": 14090 + }, + { + "epoch": 2.541914548404543, + "grad_norm": 0.05256301537156105, + "learning_rate": 9.093282339734663e-05, + "loss": 0.0174, + "step": 14100 + }, + { + "epoch": 2.5437173246800073, + "grad_norm": 0.049962710589170456, + "learning_rate": 9.091699117257992e-05, + "loss": 0.0177, + "step": 14110 + }, + { + "epoch": 2.5455201009554713, + "grad_norm": 0.04457296431064606, + "learning_rate": 9.090114651831163e-05, + "loss": 0.0172, + "step": 14120 + }, + { + "epoch": 2.5473228772309358, + "grad_norm": 0.037270840257406235, + "learning_rate": 9.088528943935497e-05, + "loss": 0.0168, + "step": 14130 + }, + { + "epoch": 2.5491256535063997, + "grad_norm": 0.054548218846321106, + "learning_rate": 9.086941994052689e-05, + "loss": 0.0182, + "step": 14140 + }, + { + "epoch": 2.550928429781864, + "grad_norm": 0.04297526553273201, + "learning_rate": 9.085353802664813e-05, + "loss": 0.0159, + "step": 14150 + }, + { + "epoch": 2.552731206057328, + "grad_norm": 0.0438675582408905, + "learning_rate": 9.08376437025432e-05, + "loss": 0.0172, + "step": 14160 + }, + { + "epoch": 2.5545339823327926, + "grad_norm": 0.05884156748652458, + "learning_rate": 9.082173697304035e-05, + "loss": 0.0166, + "step": 14170 + }, + { + "epoch": 2.5563367586082566, + "grad_norm": 0.03903186321258545, + "learning_rate": 9.080581784297166e-05, + "loss": 0.0175, + "step": 14180 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 0.0390704870223999, + "learning_rate": 9.078988631717291e-05, + "loss": 0.0167, + "step": 14190 + }, + { + "epoch": 2.559942311159185, + "grad_norm": 0.05178023874759674, + "learning_rate": 9.077394240048369e-05, + "loss": 0.0168, + "step": 14200 + }, + { + "epoch": 2.5617450874346495, + "grad_norm": 0.05695466324687004, + "learning_rate": 9.075798609774736e-05, + "loss": 0.0168, + "step": 14210 + }, + { + "epoch": 2.5635478637101135, + "grad_norm": 0.036688923835754395, + "learning_rate": 9.0742017413811e-05, + "loss": 0.0168, + "step": 14220 + }, + { + "epoch": 2.565350639985578, + "grad_norm": 0.045753076672554016, + "learning_rate": 9.072603635352548e-05, + "loss": 0.017, + "step": 14230 + }, + { + "epoch": 2.567153416261042, + "grad_norm": 0.03992784023284912, + "learning_rate": 9.071004292174541e-05, + "loss": 0.0161, + "step": 14240 + }, + { + "epoch": 2.5689561925365063, + "grad_norm": 0.0392080657184124, + "learning_rate": 9.06940371233292e-05, + "loss": 0.0183, + "step": 14250 + }, + { + "epoch": 2.5707589688119703, + "grad_norm": 0.05191511660814285, + "learning_rate": 9.067801896313898e-05, + "loss": 0.0177, + "step": 14260 + }, + { + "epoch": 2.5725617450874347, + "grad_norm": 0.04834394156932831, + "learning_rate": 9.066198844604064e-05, + "loss": 0.017, + "step": 14270 + }, + { + "epoch": 2.5743645213628987, + "grad_norm": 0.04389265552163124, + "learning_rate": 9.06459455769038e-05, + "loss": 0.017, + "step": 14280 + }, + { + "epoch": 2.576167297638363, + "grad_norm": 0.058147575706243515, + "learning_rate": 9.062989036060193e-05, + "loss": 0.0165, + "step": 14290 + }, + { + "epoch": 2.577970073913827, + "grad_norm": 0.06400105357170105, + "learning_rate": 9.061382280201212e-05, + "loss": 0.0184, + "step": 14300 + }, + { + "epoch": 2.5797728501892916, + "grad_norm": 0.04148991405963898, + "learning_rate": 9.059774290601528e-05, + "loss": 0.0183, + "step": 14310 + }, + { + "epoch": 2.5815756264647556, + "grad_norm": 0.04167164862155914, + "learning_rate": 9.058165067749606e-05, + "loss": 0.0182, + "step": 14320 + }, + { + "epoch": 2.58337840274022, + "grad_norm": 0.03471371531486511, + "learning_rate": 9.056554612134288e-05, + "loss": 0.0167, + "step": 14330 + }, + { + "epoch": 2.585181179015684, + "grad_norm": 0.05262066051363945, + "learning_rate": 9.054942924244785e-05, + "loss": 0.0165, + "step": 14340 + }, + { + "epoch": 2.5869839552911484, + "grad_norm": 0.0339302234351635, + "learning_rate": 9.053330004570686e-05, + "loss": 0.0173, + "step": 14350 + }, + { + "epoch": 2.5887867315666124, + "grad_norm": 0.04472887143492699, + "learning_rate": 9.051715853601955e-05, + "loss": 0.0163, + "step": 14360 + }, + { + "epoch": 2.590589507842077, + "grad_norm": 0.04895560443401337, + "learning_rate": 9.050100471828926e-05, + "loss": 0.0167, + "step": 14370 + }, + { + "epoch": 2.592392284117541, + "grad_norm": 0.06670967489480972, + "learning_rate": 9.048483859742311e-05, + "loss": 0.0164, + "step": 14380 + }, + { + "epoch": 2.5941950603930053, + "grad_norm": 0.04145402833819389, + "learning_rate": 9.046866017833193e-05, + "loss": 0.0155, + "step": 14390 + }, + { + "epoch": 2.5959978366684693, + "grad_norm": 0.04420650377869606, + "learning_rate": 9.045246946593029e-05, + "loss": 0.0167, + "step": 14400 + }, + { + "epoch": 2.5978006129439337, + "grad_norm": 0.0462857186794281, + "learning_rate": 9.043626646513652e-05, + "loss": 0.0166, + "step": 14410 + }, + { + "epoch": 2.5996033892193977, + "grad_norm": 0.038459621369838715, + "learning_rate": 9.042005118087267e-05, + "loss": 0.0165, + "step": 14420 + }, + { + "epoch": 2.601406165494862, + "grad_norm": 0.06416602432727814, + "learning_rate": 9.040382361806448e-05, + "loss": 0.018, + "step": 14430 + }, + { + "epoch": 2.603208941770326, + "grad_norm": 0.048398666083812714, + "learning_rate": 9.038758378164148e-05, + "loss": 0.0164, + "step": 14440 + }, + { + "epoch": 2.6050117180457906, + "grad_norm": 0.038480594754219055, + "learning_rate": 9.037133167653691e-05, + "loss": 0.0167, + "step": 14450 + }, + { + "epoch": 2.606814494321255, + "grad_norm": 0.049998581409454346, + "learning_rate": 9.035506730768771e-05, + "loss": 0.0175, + "step": 14460 + }, + { + "epoch": 2.608617270596719, + "grad_norm": 0.04714805260300636, + "learning_rate": 9.033879068003458e-05, + "loss": 0.0163, + "step": 14470 + }, + { + "epoch": 2.610420046872183, + "grad_norm": 0.05297728627920151, + "learning_rate": 9.032250179852193e-05, + "loss": 0.018, + "step": 14480 + }, + { + "epoch": 2.6122228231476474, + "grad_norm": 0.048020247370004654, + "learning_rate": 9.030620066809787e-05, + "loss": 0.0174, + "step": 14490 + }, + { + "epoch": 2.614025599423112, + "grad_norm": 0.042564716190099716, + "learning_rate": 9.028988729371428e-05, + "loss": 0.0157, + "step": 14500 + }, + { + "epoch": 2.615828375698576, + "grad_norm": 0.04534224048256874, + "learning_rate": 9.027356168032673e-05, + "loss": 0.0164, + "step": 14510 + }, + { + "epoch": 2.61763115197404, + "grad_norm": 0.05507701635360718, + "learning_rate": 9.02572238328945e-05, + "loss": 0.0161, + "step": 14520 + }, + { + "epoch": 2.6194339282495043, + "grad_norm": 0.06633414328098297, + "learning_rate": 9.02408737563806e-05, + "loss": 0.0173, + "step": 14530 + }, + { + "epoch": 2.6212367045249687, + "grad_norm": 0.04391641914844513, + "learning_rate": 9.022451145575174e-05, + "loss": 0.0165, + "step": 14540 + }, + { + "epoch": 2.6230394808004327, + "grad_norm": 0.04832244664430618, + "learning_rate": 9.02081369359784e-05, + "loss": 0.0178, + "step": 14550 + }, + { + "epoch": 2.6248422570758967, + "grad_norm": 0.058936040848493576, + "learning_rate": 9.019175020203465e-05, + "loss": 0.0182, + "step": 14560 + }, + { + "epoch": 2.626645033351361, + "grad_norm": 0.048114318400621414, + "learning_rate": 9.017535125889842e-05, + "loss": 0.0163, + "step": 14570 + }, + { + "epoch": 2.6284478096268256, + "grad_norm": 0.039158448576927185, + "learning_rate": 9.015894011155124e-05, + "loss": 0.0172, + "step": 14580 + }, + { + "epoch": 2.6302505859022896, + "grad_norm": 0.06885010749101639, + "learning_rate": 9.014251676497838e-05, + "loss": 0.0169, + "step": 14590 + }, + { + "epoch": 2.6320533621777535, + "grad_norm": 0.0327608585357666, + "learning_rate": 9.012608122416884e-05, + "loss": 0.0165, + "step": 14600 + }, + { + "epoch": 2.633856138453218, + "grad_norm": 0.06452928483486176, + "learning_rate": 9.010963349411529e-05, + "loss": 0.0167, + "step": 14610 + }, + { + "epoch": 2.6356589147286824, + "grad_norm": 0.04850171506404877, + "learning_rate": 9.00931735798141e-05, + "loss": 0.0165, + "step": 14620 + }, + { + "epoch": 2.6374616910041464, + "grad_norm": 0.048164863139390945, + "learning_rate": 9.00767014862654e-05, + "loss": 0.0163, + "step": 14630 + }, + { + "epoch": 2.6392644672796104, + "grad_norm": 0.053627338260412216, + "learning_rate": 9.006021721847295e-05, + "loss": 0.0167, + "step": 14640 + }, + { + "epoch": 2.641067243555075, + "grad_norm": 0.036188118159770966, + "learning_rate": 9.004372078144423e-05, + "loss": 0.0168, + "step": 14650 + }, + { + "epoch": 2.6428700198305393, + "grad_norm": 0.06933654099702835, + "learning_rate": 9.002721218019043e-05, + "loss": 0.0179, + "step": 14660 + }, + { + "epoch": 2.6446727961060033, + "grad_norm": 0.038835715502500534, + "learning_rate": 9.001069141972642e-05, + "loss": 0.017, + "step": 14670 + }, + { + "epoch": 2.6464755723814672, + "grad_norm": 0.04626325145363808, + "learning_rate": 8.99941585050708e-05, + "loss": 0.018, + "step": 14680 + }, + { + "epoch": 2.6482783486569317, + "grad_norm": 0.05248226597905159, + "learning_rate": 8.997761344124578e-05, + "loss": 0.0174, + "step": 14690 + }, + { + "epoch": 2.650081124932396, + "grad_norm": 0.04428350180387497, + "learning_rate": 8.996105623327737e-05, + "loss": 0.0165, + "step": 14700 + }, + { + "epoch": 2.65188390120786, + "grad_norm": 0.03900173678994179, + "learning_rate": 8.994448688619517e-05, + "loss": 0.0164, + "step": 14710 + }, + { + "epoch": 2.653686677483324, + "grad_norm": 0.03940004110336304, + "learning_rate": 8.992790540503253e-05, + "loss": 0.0152, + "step": 14720 + }, + { + "epoch": 2.6554894537587885, + "grad_norm": 0.04216473549604416, + "learning_rate": 8.991131179482648e-05, + "loss": 0.0169, + "step": 14730 + }, + { + "epoch": 2.657292230034253, + "grad_norm": 0.0676785334944725, + "learning_rate": 8.989470606061768e-05, + "loss": 0.0177, + "step": 14740 + }, + { + "epoch": 2.659095006309717, + "grad_norm": 0.05371207371354103, + "learning_rate": 8.987808820745056e-05, + "loss": 0.0164, + "step": 14750 + }, + { + "epoch": 2.660897782585181, + "grad_norm": 0.07214975357055664, + "learning_rate": 8.986145824037315e-05, + "loss": 0.0173, + "step": 14760 + }, + { + "epoch": 2.6627005588606454, + "grad_norm": 0.05552534759044647, + "learning_rate": 8.984481616443721e-05, + "loss": 0.0177, + "step": 14770 + }, + { + "epoch": 2.66450333513611, + "grad_norm": 0.04334055632352829, + "learning_rate": 8.982816198469815e-05, + "loss": 0.0157, + "step": 14780 + }, + { + "epoch": 2.666306111411574, + "grad_norm": 0.05048564821481705, + "learning_rate": 8.98114957062151e-05, + "loss": 0.0165, + "step": 14790 + }, + { + "epoch": 2.668108887687038, + "grad_norm": 0.03810162842273712, + "learning_rate": 8.97948173340508e-05, + "loss": 0.0158, + "step": 14800 + }, + { + "epoch": 2.6699116639625022, + "grad_norm": 0.03617116063833237, + "learning_rate": 8.977812687327172e-05, + "loss": 0.0163, + "step": 14810 + }, + { + "epoch": 2.6717144402379667, + "grad_norm": 0.028971081599593163, + "learning_rate": 8.976142432894798e-05, + "loss": 0.0168, + "step": 14820 + }, + { + "epoch": 2.6735172165134307, + "grad_norm": 0.05767348036170006, + "learning_rate": 8.974470970615336e-05, + "loss": 0.0168, + "step": 14830 + }, + { + "epoch": 2.6753199927888947, + "grad_norm": 0.060421500355005264, + "learning_rate": 8.972798300996534e-05, + "loss": 0.0167, + "step": 14840 + }, + { + "epoch": 2.677122769064359, + "grad_norm": 0.06846911460161209, + "learning_rate": 8.971124424546504e-05, + "loss": 0.0165, + "step": 14850 + }, + { + "epoch": 2.6789255453398235, + "grad_norm": 0.05561932176351547, + "learning_rate": 8.969449341773724e-05, + "loss": 0.016, + "step": 14860 + }, + { + "epoch": 2.6807283216152875, + "grad_norm": 0.04259176179766655, + "learning_rate": 8.967773053187042e-05, + "loss": 0.0159, + "step": 14870 + }, + { + "epoch": 2.6825310978907515, + "grad_norm": 0.051128387451171875, + "learning_rate": 8.966095559295668e-05, + "loss": 0.0173, + "step": 14880 + }, + { + "epoch": 2.684333874166216, + "grad_norm": 0.04228396713733673, + "learning_rate": 8.964416860609184e-05, + "loss": 0.0179, + "step": 14890 + }, + { + "epoch": 2.6861366504416804, + "grad_norm": 0.037773486226797104, + "learning_rate": 8.962736957637532e-05, + "loss": 0.019, + "step": 14900 + }, + { + "epoch": 2.6879394267171444, + "grad_norm": 0.04876551404595375, + "learning_rate": 8.96105585089102e-05, + "loss": 0.0176, + "step": 14910 + }, + { + "epoch": 2.6897422029926084, + "grad_norm": 0.0459638349711895, + "learning_rate": 8.959373540880329e-05, + "loss": 0.0182, + "step": 14920 + }, + { + "epoch": 2.691544979268073, + "grad_norm": 0.04058060795068741, + "learning_rate": 8.957690028116495e-05, + "loss": 0.016, + "step": 14930 + }, + { + "epoch": 2.6933477555435372, + "grad_norm": 0.053622737526893616, + "learning_rate": 8.956005313110928e-05, + "loss": 0.0183, + "step": 14940 + }, + { + "epoch": 2.695150531819001, + "grad_norm": 0.058955464512109756, + "learning_rate": 8.9543193963754e-05, + "loss": 0.0163, + "step": 14950 + }, + { + "epoch": 2.696953308094465, + "grad_norm": 0.03861239179968834, + "learning_rate": 8.952632278422048e-05, + "loss": 0.0181, + "step": 14960 + }, + { + "epoch": 2.6987560843699296, + "grad_norm": 0.03944086283445358, + "learning_rate": 8.95094395976337e-05, + "loss": 0.017, + "step": 14970 + }, + { + "epoch": 2.700558860645394, + "grad_norm": 0.04173293709754944, + "learning_rate": 8.949254440912239e-05, + "loss": 0.0159, + "step": 14980 + }, + { + "epoch": 2.702361636920858, + "grad_norm": 0.05249183624982834, + "learning_rate": 8.94756372238188e-05, + "loss": 0.0164, + "step": 14990 + }, + { + "epoch": 2.7041644131963225, + "grad_norm": 0.03918549418449402, + "learning_rate": 8.945871804685892e-05, + "loss": 0.0172, + "step": 15000 + }, + { + "epoch": 2.7059671894717865, + "grad_norm": 0.04800887033343315, + "learning_rate": 8.944178688338236e-05, + "loss": 0.0159, + "step": 15010 + }, + { + "epoch": 2.707769965747251, + "grad_norm": 0.049170203506946564, + "learning_rate": 8.942484373853233e-05, + "loss": 0.0169, + "step": 15020 + }, + { + "epoch": 2.709572742022715, + "grad_norm": 0.04204993695020676, + "learning_rate": 8.940788861745572e-05, + "loss": 0.016, + "step": 15030 + }, + { + "epoch": 2.7113755182981794, + "grad_norm": 0.046589914709329605, + "learning_rate": 8.939092152530308e-05, + "loss": 0.0158, + "step": 15040 + }, + { + "epoch": 2.7131782945736433, + "grad_norm": 0.06050195172429085, + "learning_rate": 8.937394246722853e-05, + "loss": 0.016, + "step": 15050 + }, + { + "epoch": 2.714981070849108, + "grad_norm": 0.05370452255010605, + "learning_rate": 8.935695144838984e-05, + "loss": 0.0166, + "step": 15060 + }, + { + "epoch": 2.7167838471245718, + "grad_norm": 0.04950425401329994, + "learning_rate": 8.933994847394849e-05, + "loss": 0.017, + "step": 15070 + }, + { + "epoch": 2.718586623400036, + "grad_norm": 0.0474478118121624, + "learning_rate": 8.932293354906949e-05, + "loss": 0.0154, + "step": 15080 + }, + { + "epoch": 2.7203893996755, + "grad_norm": 0.058557674288749695, + "learning_rate": 8.930590667892153e-05, + "loss": 0.017, + "step": 15090 + }, + { + "epoch": 2.7221921759509646, + "grad_norm": 0.039800044149160385, + "learning_rate": 8.928886786867696e-05, + "loss": 0.0167, + "step": 15100 + }, + { + "epoch": 2.7239949522264286, + "grad_norm": 0.052354078739881516, + "learning_rate": 8.927181712351168e-05, + "loss": 0.0162, + "step": 15110 + }, + { + "epoch": 2.725797728501893, + "grad_norm": 0.046517278999090195, + "learning_rate": 8.925475444860527e-05, + "loss": 0.0158, + "step": 15120 + }, + { + "epoch": 2.727600504777357, + "grad_norm": 0.04440392181277275, + "learning_rate": 8.923767984914092e-05, + "loss": 0.018, + "step": 15130 + }, + { + "epoch": 2.7294032810528215, + "grad_norm": 0.040797486901283264, + "learning_rate": 8.922059333030545e-05, + "loss": 0.0169, + "step": 15140 + }, + { + "epoch": 2.7312060573282855, + "grad_norm": 0.057930342853069305, + "learning_rate": 8.920349489728928e-05, + "loss": 0.0169, + "step": 15150 + }, + { + "epoch": 2.73300883360375, + "grad_norm": 0.04987657815217972, + "learning_rate": 8.918638455528646e-05, + "loss": 0.0164, + "step": 15160 + }, + { + "epoch": 2.734811609879214, + "grad_norm": 0.03821483999490738, + "learning_rate": 8.916926230949468e-05, + "loss": 0.017, + "step": 15170 + }, + { + "epoch": 2.7366143861546783, + "grad_norm": 0.05900193005800247, + "learning_rate": 8.915212816511522e-05, + "loss": 0.016, + "step": 15180 + }, + { + "epoch": 2.7384171624301423, + "grad_norm": 0.04943274334073067, + "learning_rate": 8.913498212735296e-05, + "loss": 0.0161, + "step": 15190 + }, + { + "epoch": 2.7402199387056068, + "grad_norm": 0.05522455647587776, + "learning_rate": 8.911782420141643e-05, + "loss": 0.017, + "step": 15200 + }, + { + "epoch": 2.7420227149810708, + "grad_norm": 0.03871415555477142, + "learning_rate": 8.910065439251775e-05, + "loss": 0.0165, + "step": 15210 + }, + { + "epoch": 2.743825491256535, + "grad_norm": 0.03693372756242752, + "learning_rate": 8.908347270587268e-05, + "loss": 0.0163, + "step": 15220 + }, + { + "epoch": 2.745628267531999, + "grad_norm": 0.04821274057030678, + "learning_rate": 8.906627914670054e-05, + "loss": 0.0174, + "step": 15230 + }, + { + "epoch": 2.7474310438074636, + "grad_norm": 0.03264054283499718, + "learning_rate": 8.904907372022427e-05, + "loss": 0.0182, + "step": 15240 + }, + { + "epoch": 2.7492338200829276, + "grad_norm": 0.05962604656815529, + "learning_rate": 8.903185643167042e-05, + "loss": 0.0154, + "step": 15250 + }, + { + "epoch": 2.751036596358392, + "grad_norm": 0.04314010217785835, + "learning_rate": 8.901462728626919e-05, + "loss": 0.0185, + "step": 15260 + }, + { + "epoch": 2.752839372633856, + "grad_norm": 0.045566167682409286, + "learning_rate": 8.899738628925429e-05, + "loss": 0.0159, + "step": 15270 + }, + { + "epoch": 2.7546421489093205, + "grad_norm": 0.04919673129916191, + "learning_rate": 8.898013344586312e-05, + "loss": 0.0155, + "step": 15280 + }, + { + "epoch": 2.7564449251847845, + "grad_norm": 0.048412807285785675, + "learning_rate": 8.896286876133661e-05, + "loss": 0.0152, + "step": 15290 + }, + { + "epoch": 2.758247701460249, + "grad_norm": 0.0495666041970253, + "learning_rate": 8.894559224091933e-05, + "loss": 0.0157, + "step": 15300 + }, + { + "epoch": 2.760050477735713, + "grad_norm": 0.055834151804447174, + "learning_rate": 8.892830388985942e-05, + "loss": 0.0191, + "step": 15310 + }, + { + "epoch": 2.7618532540111773, + "grad_norm": 0.050143733620643616, + "learning_rate": 8.891100371340864e-05, + "loss": 0.0166, + "step": 15320 + }, + { + "epoch": 2.7636560302866413, + "grad_norm": 0.05090256407856941, + "learning_rate": 8.889369171682231e-05, + "loss": 0.0164, + "step": 15330 + }, + { + "epoch": 2.7654588065621057, + "grad_norm": 0.03812038153409958, + "learning_rate": 8.887636790535936e-05, + "loss": 0.0171, + "step": 15340 + }, + { + "epoch": 2.7672615828375697, + "grad_norm": 0.04982345178723335, + "learning_rate": 8.885903228428231e-05, + "loss": 0.0161, + "step": 15350 + }, + { + "epoch": 2.769064359113034, + "grad_norm": 0.05485037714242935, + "learning_rate": 8.884168485885727e-05, + "loss": 0.0174, + "step": 15360 + }, + { + "epoch": 2.770867135388498, + "grad_norm": 0.0498759001493454, + "learning_rate": 8.882432563435393e-05, + "loss": 0.0166, + "step": 15370 + }, + { + "epoch": 2.7726699116639626, + "grad_norm": 0.07219842076301575, + "learning_rate": 8.880695461604556e-05, + "loss": 0.0173, + "step": 15380 + }, + { + "epoch": 2.7744726879394266, + "grad_norm": 0.045516978949308395, + "learning_rate": 8.878957180920901e-05, + "loss": 0.0148, + "step": 15390 + }, + { + "epoch": 2.776275464214891, + "grad_norm": 0.03794407099485397, + "learning_rate": 8.877217721912473e-05, + "loss": 0.0175, + "step": 15400 + }, + { + "epoch": 2.778078240490355, + "grad_norm": 0.0390416756272316, + "learning_rate": 8.875477085107673e-05, + "loss": 0.0167, + "step": 15410 + }, + { + "epoch": 2.7798810167658194, + "grad_norm": 0.039373625069856644, + "learning_rate": 8.87373527103526e-05, + "loss": 0.0172, + "step": 15420 + }, + { + "epoch": 2.7816837930412834, + "grad_norm": 0.060816604644060135, + "learning_rate": 8.871992280224353e-05, + "loss": 0.0171, + "step": 15430 + }, + { + "epoch": 2.783486569316748, + "grad_norm": 0.04233274981379509, + "learning_rate": 8.870248113204422e-05, + "loss": 0.0169, + "step": 15440 + }, + { + "epoch": 2.785289345592212, + "grad_norm": 0.042403239756822586, + "learning_rate": 8.868502770505306e-05, + "loss": 0.0179, + "step": 15450 + }, + { + "epoch": 2.7870921218676763, + "grad_norm": 0.047547437250614166, + "learning_rate": 8.86675625265719e-05, + "loss": 0.0163, + "step": 15460 + }, + { + "epoch": 2.7888948981431403, + "grad_norm": 0.04658415541052818, + "learning_rate": 8.865008560190618e-05, + "loss": 0.0167, + "step": 15470 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 0.040401119738817215, + "learning_rate": 8.863259693636496e-05, + "loss": 0.0172, + "step": 15480 + }, + { + "epoch": 2.7925004506940687, + "grad_norm": 0.04091029614210129, + "learning_rate": 8.861509653526083e-05, + "loss": 0.0157, + "step": 15490 + }, + { + "epoch": 2.794303226969533, + "grad_norm": 0.046944037079811096, + "learning_rate": 8.859758440390993e-05, + "loss": 0.0154, + "step": 15500 + }, + { + "epoch": 2.796106003244997, + "grad_norm": 0.04554378241300583, + "learning_rate": 8.858006054763202e-05, + "loss": 0.0157, + "step": 15510 + }, + { + "epoch": 2.7979087795204616, + "grad_norm": 0.048771537840366364, + "learning_rate": 8.856252497175035e-05, + "loss": 0.0151, + "step": 15520 + }, + { + "epoch": 2.7997115557959256, + "grad_norm": 0.03943654149770737, + "learning_rate": 8.854497768159178e-05, + "loss": 0.0151, + "step": 15530 + }, + { + "epoch": 2.80151433207139, + "grad_norm": 0.04568270593881607, + "learning_rate": 8.852741868248671e-05, + "loss": 0.0163, + "step": 15540 + }, + { + "epoch": 2.803317108346854, + "grad_norm": 0.05998268350958824, + "learning_rate": 8.85098479797691e-05, + "loss": 0.0163, + "step": 15550 + }, + { + "epoch": 2.8051198846223184, + "grad_norm": 0.03589946776628494, + "learning_rate": 8.849226557877646e-05, + "loss": 0.0168, + "step": 15560 + }, + { + "epoch": 2.806922660897783, + "grad_norm": 0.06253862380981445, + "learning_rate": 8.84746714848499e-05, + "loss": 0.0173, + "step": 15570 + }, + { + "epoch": 2.808725437173247, + "grad_norm": 0.052717480808496475, + "learning_rate": 8.845706570333397e-05, + "loss": 0.0171, + "step": 15580 + }, + { + "epoch": 2.810528213448711, + "grad_norm": 0.05572878569364548, + "learning_rate": 8.84394482395769e-05, + "loss": 0.0166, + "step": 15590 + }, + { + "epoch": 2.8123309897241753, + "grad_norm": 0.04193023964762688, + "learning_rate": 8.842181909893038e-05, + "loss": 0.0154, + "step": 15600 + }, + { + "epoch": 2.8141337659996397, + "grad_norm": 0.0631871148943901, + "learning_rate": 8.840417828674969e-05, + "loss": 0.0153, + "step": 15610 + }, + { + "epoch": 2.8159365422751037, + "grad_norm": 0.042419809848070145, + "learning_rate": 8.838652580839364e-05, + "loss": 0.0174, + "step": 15620 + }, + { + "epoch": 2.8177393185505677, + "grad_norm": 0.04645229130983353, + "learning_rate": 8.836886166922458e-05, + "loss": 0.0175, + "step": 15630 + }, + { + "epoch": 2.819542094826032, + "grad_norm": 0.03820360079407692, + "learning_rate": 8.835118587460844e-05, + "loss": 0.0156, + "step": 15640 + }, + { + "epoch": 2.8213448711014966, + "grad_norm": 0.03719200938940048, + "learning_rate": 8.83334984299146e-05, + "loss": 0.0155, + "step": 15650 + }, + { + "epoch": 2.8231476473769606, + "grad_norm": 0.04647526144981384, + "learning_rate": 8.83157993405161e-05, + "loss": 0.0162, + "step": 15660 + }, + { + "epoch": 2.8249504236524245, + "grad_norm": 0.04952094331383705, + "learning_rate": 8.829808861178943e-05, + "loss": 0.0162, + "step": 15670 + }, + { + "epoch": 2.826753199927889, + "grad_norm": 0.058213476091623306, + "learning_rate": 8.828036624911464e-05, + "loss": 0.0156, + "step": 15680 + }, + { + "epoch": 2.8285559762033534, + "grad_norm": 0.036599744111299515, + "learning_rate": 8.826263225787532e-05, + "loss": 0.0159, + "step": 15690 + }, + { + "epoch": 2.8303587524788174, + "grad_norm": 0.05007516220211983, + "learning_rate": 8.824488664345858e-05, + "loss": 0.0177, + "step": 15700 + }, + { + "epoch": 2.8321615287542814, + "grad_norm": 0.04846655949950218, + "learning_rate": 8.822712941125508e-05, + "loss": 0.016, + "step": 15710 + }, + { + "epoch": 2.833964305029746, + "grad_norm": 0.044363707304000854, + "learning_rate": 8.820936056665898e-05, + "loss": 0.016, + "step": 15720 + }, + { + "epoch": 2.8357670813052103, + "grad_norm": 0.05205946043133736, + "learning_rate": 8.819158011506801e-05, + "loss": 0.0152, + "step": 15730 + }, + { + "epoch": 2.8375698575806743, + "grad_norm": 0.056792374700307846, + "learning_rate": 8.81737880618834e-05, + "loss": 0.0156, + "step": 15740 + }, + { + "epoch": 2.8393726338561383, + "grad_norm": 0.052838411182165146, + "learning_rate": 8.815598441250987e-05, + "loss": 0.018, + "step": 15750 + }, + { + "epoch": 2.8411754101316027, + "grad_norm": 0.04456179961562157, + "learning_rate": 8.813816917235576e-05, + "loss": 0.0157, + "step": 15760 + }, + { + "epoch": 2.842978186407067, + "grad_norm": 0.06999032944440842, + "learning_rate": 8.812034234683282e-05, + "loss": 0.0165, + "step": 15770 + }, + { + "epoch": 2.844780962682531, + "grad_norm": 0.04564972594380379, + "learning_rate": 8.810250394135637e-05, + "loss": 0.0175, + "step": 15780 + }, + { + "epoch": 2.846583738957995, + "grad_norm": 0.04521859809756279, + "learning_rate": 8.808465396134529e-05, + "loss": 0.0173, + "step": 15790 + }, + { + "epoch": 2.8483865152334595, + "grad_norm": 0.03548445552587509, + "learning_rate": 8.806679241222189e-05, + "loss": 0.0167, + "step": 15800 + }, + { + "epoch": 2.850189291508924, + "grad_norm": 0.04952651634812355, + "learning_rate": 8.804891929941203e-05, + "loss": 0.0173, + "step": 15810 + }, + { + "epoch": 2.851992067784388, + "grad_norm": 0.04779171571135521, + "learning_rate": 8.803103462834514e-05, + "loss": 0.0164, + "step": 15820 + }, + { + "epoch": 2.853794844059852, + "grad_norm": 0.04811057075858116, + "learning_rate": 8.801313840445408e-05, + "loss": 0.0176, + "step": 15830 + }, + { + "epoch": 2.8555976203353164, + "grad_norm": 0.058642640709877014, + "learning_rate": 8.799523063317524e-05, + "loss": 0.0166, + "step": 15840 + }, + { + "epoch": 2.857400396610781, + "grad_norm": 0.046281490474939346, + "learning_rate": 8.797731131994854e-05, + "loss": 0.0162, + "step": 15850 + }, + { + "epoch": 2.859203172886245, + "grad_norm": 0.05486986041069031, + "learning_rate": 8.795938047021739e-05, + "loss": 0.0158, + "step": 15860 + }, + { + "epoch": 2.861005949161709, + "grad_norm": 0.0317499004304409, + "learning_rate": 8.794143808942872e-05, + "loss": 0.016, + "step": 15870 + }, + { + "epoch": 2.8628087254371732, + "grad_norm": 0.041959505528211594, + "learning_rate": 8.792348418303296e-05, + "loss": 0.0179, + "step": 15880 + }, + { + "epoch": 2.8646115017126377, + "grad_norm": 0.03803468495607376, + "learning_rate": 8.790551875648398e-05, + "loss": 0.0165, + "step": 15890 + }, + { + "epoch": 2.8664142779881017, + "grad_norm": 0.039478663355112076, + "learning_rate": 8.788754181523926e-05, + "loss": 0.0165, + "step": 15900 + }, + { + "epoch": 2.8682170542635657, + "grad_norm": 0.042266376316547394, + "learning_rate": 8.78695533647597e-05, + "loss": 0.0157, + "step": 15910 + }, + { + "epoch": 2.87001983053903, + "grad_norm": 0.04523250088095665, + "learning_rate": 8.785155341050972e-05, + "loss": 0.0174, + "step": 15920 + }, + { + "epoch": 2.8718226068144945, + "grad_norm": 0.047450292855501175, + "learning_rate": 8.783354195795721e-05, + "loss": 0.0171, + "step": 15930 + }, + { + "epoch": 2.8736253830899585, + "grad_norm": 0.060113970190286636, + "learning_rate": 8.78155190125736e-05, + "loss": 0.0179, + "step": 15940 + }, + { + "epoch": 2.8754281593654225, + "grad_norm": 0.03856213763356209, + "learning_rate": 8.779748457983378e-05, + "loss": 0.0158, + "step": 15950 + }, + { + "epoch": 2.877230935640887, + "grad_norm": 0.038677748292684555, + "learning_rate": 8.777943866521612e-05, + "loss": 0.0171, + "step": 15960 + }, + { + "epoch": 2.8790337119163514, + "grad_norm": 0.05248156189918518, + "learning_rate": 8.77613812742025e-05, + "loss": 0.0158, + "step": 15970 + }, + { + "epoch": 2.8808364881918154, + "grad_norm": 0.04974060878157616, + "learning_rate": 8.774331241227829e-05, + "loss": 0.0157, + "step": 15980 + }, + { + "epoch": 2.8826392644672794, + "grad_norm": 0.054751407355070114, + "learning_rate": 8.772523208493232e-05, + "loss": 0.0169, + "step": 15990 + }, + { + "epoch": 2.884442040742744, + "grad_norm": 0.04187297075986862, + "learning_rate": 8.770714029765692e-05, + "loss": 0.0168, + "step": 16000 + }, + { + "epoch": 2.8862448170182082, + "grad_norm": 0.04502842575311661, + "learning_rate": 8.768903705594789e-05, + "loss": 0.0158, + "step": 16010 + }, + { + "epoch": 2.888047593293672, + "grad_norm": 0.051413025707006454, + "learning_rate": 8.767092236530453e-05, + "loss": 0.0157, + "step": 16020 + }, + { + "epoch": 2.889850369569136, + "grad_norm": 0.06610484421253204, + "learning_rate": 8.76527962312296e-05, + "loss": 0.0176, + "step": 16030 + }, + { + "epoch": 2.8916531458446006, + "grad_norm": 0.04775267094373703, + "learning_rate": 8.763465865922934e-05, + "loss": 0.0156, + "step": 16040 + }, + { + "epoch": 2.893455922120065, + "grad_norm": 0.039712365716695786, + "learning_rate": 8.761650965481347e-05, + "loss": 0.0158, + "step": 16050 + }, + { + "epoch": 2.895258698395529, + "grad_norm": 0.04802672937512398, + "learning_rate": 8.759834922349516e-05, + "loss": 0.0162, + "step": 16060 + }, + { + "epoch": 2.897061474670993, + "grad_norm": 0.040346723049879074, + "learning_rate": 8.758017737079108e-05, + "loss": 0.0157, + "step": 16070 + }, + { + "epoch": 2.8988642509464575, + "grad_norm": 0.04415149986743927, + "learning_rate": 8.756199410222137e-05, + "loss": 0.0179, + "step": 16080 + }, + { + "epoch": 2.900667027221922, + "grad_norm": 0.06237140670418739, + "learning_rate": 8.754379942330963e-05, + "loss": 0.0178, + "step": 16090 + }, + { + "epoch": 2.902469803497386, + "grad_norm": 0.0479973629117012, + "learning_rate": 8.75255933395829e-05, + "loss": 0.0165, + "step": 16100 + }, + { + "epoch": 2.90427257977285, + "grad_norm": 0.04457530379295349, + "learning_rate": 8.750737585657171e-05, + "loss": 0.0154, + "step": 16110 + }, + { + "epoch": 2.9060753560483144, + "grad_norm": 0.0456351637840271, + "learning_rate": 8.748914697981008e-05, + "loss": 0.016, + "step": 16120 + }, + { + "epoch": 2.907878132323779, + "grad_norm": 0.044084761291742325, + "learning_rate": 8.747090671483542e-05, + "loss": 0.0155, + "step": 16130 + }, + { + "epoch": 2.9096809085992428, + "grad_norm": 0.05112861096858978, + "learning_rate": 8.745265506718869e-05, + "loss": 0.0157, + "step": 16140 + }, + { + "epoch": 2.911483684874707, + "grad_norm": 0.047242894768714905, + "learning_rate": 8.74343920424142e-05, + "loss": 0.0173, + "step": 16150 + }, + { + "epoch": 2.913286461150171, + "grad_norm": 0.056666936725378036, + "learning_rate": 8.741611764605982e-05, + "loss": 0.0155, + "step": 16160 + }, + { + "epoch": 2.9150892374256356, + "grad_norm": 0.045878056436777115, + "learning_rate": 8.739783188367682e-05, + "loss": 0.0176, + "step": 16170 + }, + { + "epoch": 2.9168920137010996, + "grad_norm": 0.05163092166185379, + "learning_rate": 8.737953476081991e-05, + "loss": 0.0153, + "step": 16180 + }, + { + "epoch": 2.918694789976564, + "grad_norm": 0.03790448606014252, + "learning_rate": 8.73612262830473e-05, + "loss": 0.0171, + "step": 16190 + }, + { + "epoch": 2.920497566252028, + "grad_norm": 0.07003279030323029, + "learning_rate": 8.734290645592061e-05, + "loss": 0.0176, + "step": 16200 + }, + { + "epoch": 2.9223003425274925, + "grad_norm": 0.05325955152511597, + "learning_rate": 8.732457528500493e-05, + "loss": 0.0167, + "step": 16210 + }, + { + "epoch": 2.9241031188029565, + "grad_norm": 0.03651580959558487, + "learning_rate": 8.730623277586875e-05, + "loss": 0.0169, + "step": 16220 + }, + { + "epoch": 2.925905895078421, + "grad_norm": 0.05708346143364906, + "learning_rate": 8.72878789340841e-05, + "loss": 0.0155, + "step": 16230 + }, + { + "epoch": 2.927708671353885, + "grad_norm": 0.04567921161651611, + "learning_rate": 8.726951376522635e-05, + "loss": 0.0155, + "step": 16240 + }, + { + "epoch": 2.9295114476293493, + "grad_norm": 0.0497337207198143, + "learning_rate": 8.725113727487435e-05, + "loss": 0.0169, + "step": 16250 + }, + { + "epoch": 2.9313142239048133, + "grad_norm": 0.040554072707891464, + "learning_rate": 8.723274946861042e-05, + "loss": 0.0157, + "step": 16260 + }, + { + "epoch": 2.9331170001802778, + "grad_norm": 0.041318465024232864, + "learning_rate": 8.721435035202026e-05, + "loss": 0.0161, + "step": 16270 + }, + { + "epoch": 2.9349197764557418, + "grad_norm": 0.03856276720762253, + "learning_rate": 8.719593993069306e-05, + "loss": 0.0166, + "step": 16280 + }, + { + "epoch": 2.936722552731206, + "grad_norm": 0.05225542560219765, + "learning_rate": 8.717751821022139e-05, + "loss": 0.015, + "step": 16290 + }, + { + "epoch": 2.93852532900667, + "grad_norm": 0.0430973581969738, + "learning_rate": 8.715908519620134e-05, + "loss": 0.0151, + "step": 16300 + }, + { + "epoch": 2.9403281052821346, + "grad_norm": 0.05401335284113884, + "learning_rate": 8.71406408942323e-05, + "loss": 0.0157, + "step": 16310 + }, + { + "epoch": 2.9421308815575986, + "grad_norm": 0.04430680349469185, + "learning_rate": 8.712218530991723e-05, + "loss": 0.0152, + "step": 16320 + }, + { + "epoch": 2.943933657833063, + "grad_norm": 0.04452795535326004, + "learning_rate": 8.710371844886241e-05, + "loss": 0.0163, + "step": 16330 + }, + { + "epoch": 2.945736434108527, + "grad_norm": 0.03391064703464508, + "learning_rate": 8.708524031667758e-05, + "loss": 0.0169, + "step": 16340 + }, + { + "epoch": 2.9475392103839915, + "grad_norm": 0.0571090392768383, + "learning_rate": 8.706675091897592e-05, + "loss": 0.0166, + "step": 16350 + }, + { + "epoch": 2.9493419866594555, + "grad_norm": 0.045564670115709305, + "learning_rate": 8.704825026137404e-05, + "loss": 0.0156, + "step": 16360 + }, + { + "epoch": 2.95114476293492, + "grad_norm": 0.05303395539522171, + "learning_rate": 8.702973834949192e-05, + "loss": 0.0155, + "step": 16370 + }, + { + "epoch": 2.952947539210384, + "grad_norm": 0.038878437131643295, + "learning_rate": 8.701121518895301e-05, + "loss": 0.015, + "step": 16380 + }, + { + "epoch": 2.9547503154858483, + "grad_norm": 0.03865298256278038, + "learning_rate": 8.699268078538414e-05, + "loss": 0.0167, + "step": 16390 + }, + { + "epoch": 2.9565530917613123, + "grad_norm": 0.042044658213853836, + "learning_rate": 8.69741351444156e-05, + "loss": 0.0151, + "step": 16400 + }, + { + "epoch": 2.9583558680367767, + "grad_norm": 0.04329704865813255, + "learning_rate": 8.695557827168101e-05, + "loss": 0.0155, + "step": 16410 + }, + { + "epoch": 2.9601586443122407, + "grad_norm": 0.06257437914609909, + "learning_rate": 8.693701017281753e-05, + "loss": 0.0155, + "step": 16420 + }, + { + "epoch": 2.961961420587705, + "grad_norm": 0.05071616917848587, + "learning_rate": 8.691843085346563e-05, + "loss": 0.0148, + "step": 16430 + }, + { + "epoch": 2.963764196863169, + "grad_norm": 0.0462367907166481, + "learning_rate": 8.689984031926919e-05, + "loss": 0.0163, + "step": 16440 + }, + { + "epoch": 2.9655669731386336, + "grad_norm": 0.041529521346092224, + "learning_rate": 8.688123857587555e-05, + "loss": 0.0179, + "step": 16450 + }, + { + "epoch": 2.9673697494140976, + "grad_norm": 0.043393246829509735, + "learning_rate": 8.686262562893544e-05, + "loss": 0.0147, + "step": 16460 + }, + { + "epoch": 2.969172525689562, + "grad_norm": 0.045178283005952835, + "learning_rate": 8.684400148410294e-05, + "loss": 0.0165, + "step": 16470 + }, + { + "epoch": 2.970975301965026, + "grad_norm": 0.03619108349084854, + "learning_rate": 8.682536614703562e-05, + "loss": 0.0162, + "step": 16480 + }, + { + "epoch": 2.9727780782404905, + "grad_norm": 0.047426674515008926, + "learning_rate": 8.680671962339437e-05, + "loss": 0.0157, + "step": 16490 + }, + { + "epoch": 2.9745808545159544, + "grad_norm": 0.04116211086511612, + "learning_rate": 8.678806191884352e-05, + "loss": 0.0165, + "step": 16500 + }, + { + "epoch": 2.976383630791419, + "grad_norm": 0.05003083497285843, + "learning_rate": 8.67693930390508e-05, + "loss": 0.0161, + "step": 16510 + }, + { + "epoch": 2.978186407066883, + "grad_norm": 0.05268760398030281, + "learning_rate": 8.67507129896873e-05, + "loss": 0.0172, + "step": 16520 + }, + { + "epoch": 2.9799891833423473, + "grad_norm": 0.04245224967598915, + "learning_rate": 8.673202177642757e-05, + "loss": 0.0144, + "step": 16530 + }, + { + "epoch": 2.9817919596178113, + "grad_norm": 0.041886135935783386, + "learning_rate": 8.671331940494945e-05, + "loss": 0.0169, + "step": 16540 + }, + { + "epoch": 2.9835947358932757, + "grad_norm": 0.04381066560745239, + "learning_rate": 8.669460588093427e-05, + "loss": 0.0177, + "step": 16550 + }, + { + "epoch": 2.9853975121687397, + "grad_norm": 0.04021643102169037, + "learning_rate": 8.667588121006667e-05, + "loss": 0.0172, + "step": 16560 + }, + { + "epoch": 2.987200288444204, + "grad_norm": 0.03552757203578949, + "learning_rate": 8.665714539803475e-05, + "loss": 0.0147, + "step": 16570 + }, + { + "epoch": 2.989003064719668, + "grad_norm": 0.046464648097753525, + "learning_rate": 8.663839845052993e-05, + "loss": 0.016, + "step": 16580 + }, + { + "epoch": 2.9908058409951326, + "grad_norm": 0.049261607229709625, + "learning_rate": 8.661964037324703e-05, + "loss": 0.0166, + "step": 16590 + }, + { + "epoch": 2.9926086172705966, + "grad_norm": 0.05617135763168335, + "learning_rate": 8.660087117188427e-05, + "loss": 0.016, + "step": 16600 + }, + { + "epoch": 2.994411393546061, + "grad_norm": 0.03767121955752373, + "learning_rate": 8.658209085214325e-05, + "loss": 0.016, + "step": 16610 + }, + { + "epoch": 2.996214169821525, + "grad_norm": 0.033543169498443604, + "learning_rate": 8.656329941972891e-05, + "loss": 0.017, + "step": 16620 + }, + { + "epoch": 2.9980169460969894, + "grad_norm": 0.049744825810194016, + "learning_rate": 8.654449688034963e-05, + "loss": 0.0159, + "step": 16630 + }, + { + "epoch": 2.9998197223724534, + "grad_norm": 0.0352543406188488, + "learning_rate": 8.652568323971706e-05, + "loss": 0.0164, + "step": 16640 + }, + { + "epoch": 3.001622498647918, + "grad_norm": 0.04713529720902443, + "learning_rate": 8.650685850354636e-05, + "loss": 0.0156, + "step": 16650 + }, + { + "epoch": 3.003425274923382, + "grad_norm": 0.04870638623833656, + "learning_rate": 8.648802267755593e-05, + "loss": 0.0177, + "step": 16660 + }, + { + "epoch": 3.0052280511988463, + "grad_norm": 0.05205162614583969, + "learning_rate": 8.646917576746764e-05, + "loss": 0.0166, + "step": 16670 + }, + { + "epoch": 3.0070308274743103, + "grad_norm": 0.045704178512096405, + "learning_rate": 8.645031777900666e-05, + "loss": 0.0157, + "step": 16680 + }, + { + "epoch": 3.0088336037497747, + "grad_norm": 0.048011843115091324, + "learning_rate": 8.643144871790154e-05, + "loss": 0.0153, + "step": 16690 + }, + { + "epoch": 3.0106363800252387, + "grad_norm": 0.03749169781804085, + "learning_rate": 8.641256858988424e-05, + "loss": 0.0152, + "step": 16700 + }, + { + "epoch": 3.012439156300703, + "grad_norm": 0.03587783873081207, + "learning_rate": 8.639367740069e-05, + "loss": 0.0158, + "step": 16710 + }, + { + "epoch": 3.014241932576167, + "grad_norm": 0.04209086298942566, + "learning_rate": 8.63747751560575e-05, + "loss": 0.0162, + "step": 16720 + }, + { + "epoch": 3.0160447088516316, + "grad_norm": 0.046153757721185684, + "learning_rate": 8.635586186172871e-05, + "loss": 0.0161, + "step": 16730 + }, + { + "epoch": 3.0178474851270956, + "grad_norm": 0.038535188883543015, + "learning_rate": 8.633693752344902e-05, + "loss": 0.0159, + "step": 16740 + }, + { + "epoch": 3.01965026140256, + "grad_norm": 0.05276139825582504, + "learning_rate": 8.631800214696713e-05, + "loss": 0.0149, + "step": 16750 + }, + { + "epoch": 3.021453037678024, + "grad_norm": 0.0456915907561779, + "learning_rate": 8.629905573803511e-05, + "loss": 0.0177, + "step": 16760 + }, + { + "epoch": 3.0232558139534884, + "grad_norm": 0.050260212272405624, + "learning_rate": 8.628009830240839e-05, + "loss": 0.0161, + "step": 16770 + }, + { + "epoch": 3.0250585902289524, + "grad_norm": 0.05257803946733475, + "learning_rate": 8.626112984584571e-05, + "loss": 0.0187, + "step": 16780 + }, + { + "epoch": 3.026861366504417, + "grad_norm": 0.03641931340098381, + "learning_rate": 8.62421503741092e-05, + "loss": 0.0155, + "step": 16790 + }, + { + "epoch": 3.028664142779881, + "grad_norm": 0.04046864062547684, + "learning_rate": 8.622315989296432e-05, + "loss": 0.0155, + "step": 16800 + }, + { + "epoch": 3.0304669190553453, + "grad_norm": 0.04733973369002342, + "learning_rate": 8.62041584081799e-05, + "loss": 0.0154, + "step": 16810 + }, + { + "epoch": 3.0322696953308093, + "grad_norm": 0.03949728608131409, + "learning_rate": 8.618514592552807e-05, + "loss": 0.0158, + "step": 16820 + }, + { + "epoch": 3.0340724716062737, + "grad_norm": 0.04058030620217323, + "learning_rate": 8.616612245078431e-05, + "loss": 0.0158, + "step": 16830 + }, + { + "epoch": 3.0358752478817377, + "grad_norm": 0.042276713997125626, + "learning_rate": 8.614708798972746e-05, + "loss": 0.017, + "step": 16840 + }, + { + "epoch": 3.037678024157202, + "grad_norm": 0.043269913643598557, + "learning_rate": 8.61280425481397e-05, + "loss": 0.0154, + "step": 16850 + }, + { + "epoch": 3.039480800432666, + "grad_norm": 0.05188237130641937, + "learning_rate": 8.61089861318065e-05, + "loss": 0.0157, + "step": 16860 + }, + { + "epoch": 3.0412835767081305, + "grad_norm": 0.048696234822273254, + "learning_rate": 8.608991874651673e-05, + "loss": 0.0167, + "step": 16870 + }, + { + "epoch": 3.0430863529835945, + "grad_norm": 0.055615995079278946, + "learning_rate": 8.607084039806255e-05, + "loss": 0.0162, + "step": 16880 + }, + { + "epoch": 3.044889129259059, + "grad_norm": 0.05035218596458435, + "learning_rate": 8.605175109223944e-05, + "loss": 0.0149, + "step": 16890 + }, + { + "epoch": 3.046691905534523, + "grad_norm": 0.03534025698900223, + "learning_rate": 8.603265083484624e-05, + "loss": 0.015, + "step": 16900 + }, + { + "epoch": 3.0484946818099874, + "grad_norm": 0.032134875655174255, + "learning_rate": 8.60135396316851e-05, + "loss": 0.0157, + "step": 16910 + }, + { + "epoch": 3.0502974580854514, + "grad_norm": 0.06041080877184868, + "learning_rate": 8.599441748856152e-05, + "loss": 0.0166, + "step": 16920 + }, + { + "epoch": 3.052100234360916, + "grad_norm": 0.04301208630204201, + "learning_rate": 8.597528441128427e-05, + "loss": 0.0158, + "step": 16930 + }, + { + "epoch": 3.05390301063638, + "grad_norm": 0.041871558874845505, + "learning_rate": 8.595614040566549e-05, + "loss": 0.0146, + "step": 16940 + }, + { + "epoch": 3.0557057869118442, + "grad_norm": 0.04523066431283951, + "learning_rate": 8.593698547752063e-05, + "loss": 0.0151, + "step": 16950 + }, + { + "epoch": 3.0575085631873082, + "grad_norm": 0.031436722725629807, + "learning_rate": 8.591781963266843e-05, + "loss": 0.0142, + "step": 16960 + }, + { + "epoch": 3.0593113394627727, + "grad_norm": 0.05896392837166786, + "learning_rate": 8.5898642876931e-05, + "loss": 0.0151, + "step": 16970 + }, + { + "epoch": 3.0611141157382367, + "grad_norm": 0.04080561175942421, + "learning_rate": 8.587945521613369e-05, + "loss": 0.015, + "step": 16980 + }, + { + "epoch": 3.062916892013701, + "grad_norm": 0.03276459872722626, + "learning_rate": 8.586025665610524e-05, + "loss": 0.0154, + "step": 16990 + }, + { + "epoch": 3.0647196682891655, + "grad_norm": 0.03930079564452171, + "learning_rate": 8.584104720267765e-05, + "loss": 0.0148, + "step": 17000 + }, + { + "epoch": 3.0665224445646295, + "grad_norm": 0.03830758109688759, + "learning_rate": 8.582182686168625e-05, + "loss": 0.0148, + "step": 17010 + }, + { + "epoch": 3.068325220840094, + "grad_norm": 0.06841686367988586, + "learning_rate": 8.580259563896967e-05, + "loss": 0.0162, + "step": 17020 + }, + { + "epoch": 3.070127997115558, + "grad_norm": 0.04648241400718689, + "learning_rate": 8.578335354036983e-05, + "loss": 0.0148, + "step": 17030 + }, + { + "epoch": 3.0719307733910224, + "grad_norm": 0.037829216569662094, + "learning_rate": 8.576410057173201e-05, + "loss": 0.0145, + "step": 17040 + }, + { + "epoch": 3.0737335496664864, + "grad_norm": 0.044005218893289566, + "learning_rate": 8.574483673890474e-05, + "loss": 0.0162, + "step": 17050 + }, + { + "epoch": 3.075536325941951, + "grad_norm": 0.04921894520521164, + "learning_rate": 8.572556204773983e-05, + "loss": 0.0166, + "step": 17060 + }, + { + "epoch": 3.077339102217415, + "grad_norm": 0.034047238528728485, + "learning_rate": 8.570627650409246e-05, + "loss": 0.0152, + "step": 17070 + }, + { + "epoch": 3.0791418784928792, + "grad_norm": 0.03530779480934143, + "learning_rate": 8.568698011382107e-05, + "loss": 0.0168, + "step": 17080 + }, + { + "epoch": 3.0809446547683432, + "grad_norm": 0.046679895371198654, + "learning_rate": 8.566767288278738e-05, + "loss": 0.0132, + "step": 17090 + }, + { + "epoch": 3.0827474310438077, + "grad_norm": 0.03837157413363457, + "learning_rate": 8.56483548168564e-05, + "loss": 0.0145, + "step": 17100 + }, + { + "epoch": 3.0845502073192717, + "grad_norm": 0.043510422110557556, + "learning_rate": 8.562902592189648e-05, + "loss": 0.0155, + "step": 17110 + }, + { + "epoch": 3.086352983594736, + "grad_norm": 0.04849274829030037, + "learning_rate": 8.560968620377921e-05, + "loss": 0.0155, + "step": 17120 + }, + { + "epoch": 3.0881557598702, + "grad_norm": 0.0444086454808712, + "learning_rate": 8.559033566837951e-05, + "loss": 0.0164, + "step": 17130 + }, + { + "epoch": 3.0899585361456645, + "grad_norm": 0.046112582087516785, + "learning_rate": 8.557097432157551e-05, + "loss": 0.016, + "step": 17140 + }, + { + "epoch": 3.0917613124211285, + "grad_norm": 0.04161524400115013, + "learning_rate": 8.555160216924872e-05, + "loss": 0.0163, + "step": 17150 + }, + { + "epoch": 3.093564088696593, + "grad_norm": 0.07475914061069489, + "learning_rate": 8.55322192172839e-05, + "loss": 0.0168, + "step": 17160 + }, + { + "epoch": 3.095366864972057, + "grad_norm": 0.05291390046477318, + "learning_rate": 8.551282547156902e-05, + "loss": 0.0158, + "step": 17170 + }, + { + "epoch": 3.0971696412475214, + "grad_norm": 0.04373344033956528, + "learning_rate": 8.549342093799544e-05, + "loss": 0.0162, + "step": 17180 + }, + { + "epoch": 3.0989724175229854, + "grad_norm": 0.04885878413915634, + "learning_rate": 8.547400562245773e-05, + "loss": 0.015, + "step": 17190 + }, + { + "epoch": 3.10077519379845, + "grad_norm": 0.043733030557632446, + "learning_rate": 8.545457953085374e-05, + "loss": 0.0158, + "step": 17200 + }, + { + "epoch": 3.102577970073914, + "grad_norm": 0.03102247603237629, + "learning_rate": 8.543514266908463e-05, + "loss": 0.0164, + "step": 17210 + }, + { + "epoch": 3.104380746349378, + "grad_norm": 0.03258621320128441, + "learning_rate": 8.541569504305478e-05, + "loss": 0.0153, + "step": 17220 + }, + { + "epoch": 3.106183522624842, + "grad_norm": 0.032520685344934464, + "learning_rate": 8.539623665867187e-05, + "loss": 0.0144, + "step": 17230 + }, + { + "epoch": 3.1079862989003066, + "grad_norm": 0.043426766991615295, + "learning_rate": 8.537676752184685e-05, + "loss": 0.0153, + "step": 17240 + }, + { + "epoch": 3.1097890751757706, + "grad_norm": 0.04469803720712662, + "learning_rate": 8.53572876384939e-05, + "loss": 0.0168, + "step": 17250 + }, + { + "epoch": 3.111591851451235, + "grad_norm": 0.0530916228890419, + "learning_rate": 8.533779701453056e-05, + "loss": 0.0168, + "step": 17260 + }, + { + "epoch": 3.113394627726699, + "grad_norm": 0.05200677365064621, + "learning_rate": 8.53182956558775e-05, + "loss": 0.017, + "step": 17270 + }, + { + "epoch": 3.1151974040021635, + "grad_norm": 0.04070226848125458, + "learning_rate": 8.529878356845877e-05, + "loss": 0.0155, + "step": 17280 + }, + { + "epoch": 3.1170001802776275, + "grad_norm": 0.04604024067521095, + "learning_rate": 8.527926075820158e-05, + "loss": 0.0158, + "step": 17290 + }, + { + "epoch": 3.118802956553092, + "grad_norm": 0.04929487407207489, + "learning_rate": 8.525972723103648e-05, + "loss": 0.0153, + "step": 17300 + }, + { + "epoch": 3.120605732828556, + "grad_norm": 0.04314001649618149, + "learning_rate": 8.524018299289722e-05, + "loss": 0.0155, + "step": 17310 + }, + { + "epoch": 3.1224085091040203, + "grad_norm": 0.04341772571206093, + "learning_rate": 8.522062804972083e-05, + "loss": 0.0159, + "step": 17320 + }, + { + "epoch": 3.1242112853794843, + "grad_norm": 0.04496905580163002, + "learning_rate": 8.520106240744759e-05, + "loss": 0.0144, + "step": 17330 + }, + { + "epoch": 3.1260140616549488, + "grad_norm": 0.060409065335989, + "learning_rate": 8.518148607202102e-05, + "loss": 0.0155, + "step": 17340 + }, + { + "epoch": 3.1278168379304128, + "grad_norm": 0.048636678606271744, + "learning_rate": 8.51618990493879e-05, + "loss": 0.0153, + "step": 17350 + }, + { + "epoch": 3.129619614205877, + "grad_norm": 0.03617498651146889, + "learning_rate": 8.514230134549823e-05, + "loss": 0.0154, + "step": 17360 + }, + { + "epoch": 3.131422390481341, + "grad_norm": 0.03944437578320503, + "learning_rate": 8.51226929663053e-05, + "loss": 0.0152, + "step": 17370 + }, + { + "epoch": 3.1332251667568056, + "grad_norm": 0.03378669545054436, + "learning_rate": 8.51030739177656e-05, + "loss": 0.0153, + "step": 17380 + }, + { + "epoch": 3.1350279430322696, + "grad_norm": 0.04237424582242966, + "learning_rate": 8.508344420583889e-05, + "loss": 0.0153, + "step": 17390 + }, + { + "epoch": 3.136830719307734, + "grad_norm": 0.05295997112989426, + "learning_rate": 8.506380383648816e-05, + "loss": 0.0154, + "step": 17400 + }, + { + "epoch": 3.138633495583198, + "grad_norm": 0.03904615342617035, + "learning_rate": 8.504415281567963e-05, + "loss": 0.0146, + "step": 17410 + }, + { + "epoch": 3.1404362718586625, + "grad_norm": 0.03903757780790329, + "learning_rate": 8.502449114938275e-05, + "loss": 0.0148, + "step": 17420 + }, + { + "epoch": 3.1422390481341265, + "grad_norm": 0.05069727450609207, + "learning_rate": 8.500481884357025e-05, + "loss": 0.0149, + "step": 17430 + }, + { + "epoch": 3.144041824409591, + "grad_norm": 0.044495221227407455, + "learning_rate": 8.498513590421801e-05, + "loss": 0.0158, + "step": 17440 + }, + { + "epoch": 3.145844600685055, + "grad_norm": 0.03845270723104477, + "learning_rate": 8.496544233730522e-05, + "loss": 0.0148, + "step": 17450 + }, + { + "epoch": 3.1476473769605193, + "grad_norm": 0.04301605746150017, + "learning_rate": 8.494573814881426e-05, + "loss": 0.0158, + "step": 17460 + }, + { + "epoch": 3.1494501532359833, + "grad_norm": 0.054183632135391235, + "learning_rate": 8.492602334473074e-05, + "loss": 0.0157, + "step": 17470 + }, + { + "epoch": 3.1512529295114478, + "grad_norm": 0.04242841899394989, + "learning_rate": 8.49062979310435e-05, + "loss": 0.0159, + "step": 17480 + }, + { + "epoch": 3.1530557057869117, + "grad_norm": 0.04051610827445984, + "learning_rate": 8.488656191374458e-05, + "loss": 0.0157, + "step": 17490 + }, + { + "epoch": 3.154858482062376, + "grad_norm": 0.05127168819308281, + "learning_rate": 8.48668152988293e-05, + "loss": 0.0174, + "step": 17500 + }, + { + "epoch": 3.15666125833784, + "grad_norm": 0.04696068912744522, + "learning_rate": 8.484705809229612e-05, + "loss": 0.0161, + "step": 17510 + }, + { + "epoch": 3.1584640346133046, + "grad_norm": 0.036171332001686096, + "learning_rate": 8.482729030014677e-05, + "loss": 0.0153, + "step": 17520 + }, + { + "epoch": 3.1602668108887686, + "grad_norm": 0.030939970165491104, + "learning_rate": 8.48075119283862e-05, + "loss": 0.0161, + "step": 17530 + }, + { + "epoch": 3.162069587164233, + "grad_norm": 0.035582710057497025, + "learning_rate": 8.478772298302254e-05, + "loss": 0.0162, + "step": 17540 + }, + { + "epoch": 3.163872363439697, + "grad_norm": 0.036424536257982254, + "learning_rate": 8.476792347006716e-05, + "loss": 0.017, + "step": 17550 + }, + { + "epoch": 3.1656751397151615, + "grad_norm": 0.04097399860620499, + "learning_rate": 8.474811339553462e-05, + "loss": 0.0171, + "step": 17560 + }, + { + "epoch": 3.1674779159906254, + "grad_norm": 0.05418233945965767, + "learning_rate": 8.47282927654427e-05, + "loss": 0.015, + "step": 17570 + }, + { + "epoch": 3.16928069226609, + "grad_norm": 0.025247249752283096, + "learning_rate": 8.470846158581238e-05, + "loss": 0.0159, + "step": 17580 + }, + { + "epoch": 3.171083468541554, + "grad_norm": 0.04734804481267929, + "learning_rate": 8.468861986266787e-05, + "loss": 0.0155, + "step": 17590 + }, + { + "epoch": 3.1728862448170183, + "grad_norm": 0.03765108808875084, + "learning_rate": 8.466876760203654e-05, + "loss": 0.0148, + "step": 17600 + }, + { + "epoch": 3.1746890210924823, + "grad_norm": 0.05470295622944832, + "learning_rate": 8.464890480994898e-05, + "loss": 0.0146, + "step": 17610 + }, + { + "epoch": 3.1764917973679467, + "grad_norm": 0.03746388852596283, + "learning_rate": 8.462903149243899e-05, + "loss": 0.0153, + "step": 17620 + }, + { + "epoch": 3.1782945736434107, + "grad_norm": 0.03942546248435974, + "learning_rate": 8.460914765554357e-05, + "loss": 0.0166, + "step": 17630 + }, + { + "epoch": 3.180097349918875, + "grad_norm": 0.04653691500425339, + "learning_rate": 8.458925330530288e-05, + "loss": 0.0156, + "step": 17640 + }, + { + "epoch": 3.181900126194339, + "grad_norm": 0.03797975555062294, + "learning_rate": 8.456934844776032e-05, + "loss": 0.0151, + "step": 17650 + }, + { + "epoch": 3.1837029024698036, + "grad_norm": 0.05821456387639046, + "learning_rate": 8.454943308896246e-05, + "loss": 0.0172, + "step": 17660 + }, + { + "epoch": 3.1855056787452676, + "grad_norm": 0.040300581604242325, + "learning_rate": 8.452950723495905e-05, + "loss": 0.0156, + "step": 17670 + }, + { + "epoch": 3.187308455020732, + "grad_norm": 0.03464846685528755, + "learning_rate": 8.450957089180303e-05, + "loss": 0.0141, + "step": 17680 + }, + { + "epoch": 3.189111231296196, + "grad_norm": 0.059073545038700104, + "learning_rate": 8.448962406555055e-05, + "loss": 0.0158, + "step": 17690 + }, + { + "epoch": 3.1909140075716604, + "grad_norm": 0.06402917206287384, + "learning_rate": 8.446966676226093e-05, + "loss": 0.0156, + "step": 17700 + }, + { + "epoch": 3.1927167838471244, + "grad_norm": 0.046270597726106644, + "learning_rate": 8.444969898799667e-05, + "loss": 0.0161, + "step": 17710 + }, + { + "epoch": 3.194519560122589, + "grad_norm": 0.05054490640759468, + "learning_rate": 8.442972074882343e-05, + "loss": 0.017, + "step": 17720 + }, + { + "epoch": 3.196322336398053, + "grad_norm": 0.04157906025648117, + "learning_rate": 8.44097320508101e-05, + "loss": 0.0144, + "step": 17730 + }, + { + "epoch": 3.1981251126735173, + "grad_norm": 0.03261534124612808, + "learning_rate": 8.43897329000287e-05, + "loss": 0.0159, + "step": 17740 + }, + { + "epoch": 3.1999278889489813, + "grad_norm": 0.033596914261579514, + "learning_rate": 8.436972330255448e-05, + "loss": 0.014, + "step": 17750 + }, + { + "epoch": 3.2017306652244457, + "grad_norm": 0.054582636803388596, + "learning_rate": 8.434970326446579e-05, + "loss": 0.0155, + "step": 17760 + }, + { + "epoch": 3.2035334414999097, + "grad_norm": 0.038620155304670334, + "learning_rate": 8.432967279184418e-05, + "loss": 0.0164, + "step": 17770 + }, + { + "epoch": 3.205336217775374, + "grad_norm": 0.04735894501209259, + "learning_rate": 8.430963189077441e-05, + "loss": 0.0152, + "step": 17780 + }, + { + "epoch": 3.207138994050838, + "grad_norm": 0.04374168813228607, + "learning_rate": 8.428958056734437e-05, + "loss": 0.015, + "step": 17790 + }, + { + "epoch": 3.2089417703263026, + "grad_norm": 0.035431478172540665, + "learning_rate": 8.426951882764513e-05, + "loss": 0.0162, + "step": 17800 + }, + { + "epoch": 3.2107445466017666, + "grad_norm": 0.04569089785218239, + "learning_rate": 8.424944667777089e-05, + "loss": 0.0161, + "step": 17810 + }, + { + "epoch": 3.212547322877231, + "grad_norm": 0.05176321789622307, + "learning_rate": 8.422936412381905e-05, + "loss": 0.0144, + "step": 17820 + }, + { + "epoch": 3.214350099152695, + "grad_norm": 0.05529508739709854, + "learning_rate": 8.420927117189017e-05, + "loss": 0.0171, + "step": 17830 + }, + { + "epoch": 3.2161528754281594, + "grad_norm": 0.03316926211118698, + "learning_rate": 8.418916782808795e-05, + "loss": 0.0148, + "step": 17840 + }, + { + "epoch": 3.2179556517036234, + "grad_norm": 0.04696822166442871, + "learning_rate": 8.416905409851926e-05, + "loss": 0.0154, + "step": 17850 + }, + { + "epoch": 3.219758427979088, + "grad_norm": 0.030431784689426422, + "learning_rate": 8.41489299892941e-05, + "loss": 0.0157, + "step": 17860 + }, + { + "epoch": 3.221561204254552, + "grad_norm": 0.04402593523263931, + "learning_rate": 8.412879550652566e-05, + "loss": 0.0146, + "step": 17870 + }, + { + "epoch": 3.2233639805300163, + "grad_norm": 0.033782489597797394, + "learning_rate": 8.410865065633029e-05, + "loss": 0.0156, + "step": 17880 + }, + { + "epoch": 3.2251667568054803, + "grad_norm": 0.05338328331708908, + "learning_rate": 8.408849544482742e-05, + "loss": 0.0153, + "step": 17890 + }, + { + "epoch": 3.2269695330809447, + "grad_norm": 0.036839358508586884, + "learning_rate": 8.406832987813968e-05, + "loss": 0.0153, + "step": 17900 + }, + { + "epoch": 3.2287723093564087, + "grad_norm": 0.04889962449669838, + "learning_rate": 8.404815396239286e-05, + "loss": 0.0164, + "step": 17910 + }, + { + "epoch": 3.230575085631873, + "grad_norm": 0.046515870839357376, + "learning_rate": 8.402796770371587e-05, + "loss": 0.0157, + "step": 17920 + }, + { + "epoch": 3.232377861907337, + "grad_norm": 0.04423726722598076, + "learning_rate": 8.400777110824071e-05, + "loss": 0.0155, + "step": 17930 + }, + { + "epoch": 3.2341806381828015, + "grad_norm": 0.03224209323525429, + "learning_rate": 8.398756418210263e-05, + "loss": 0.0154, + "step": 17940 + }, + { + "epoch": 3.2359834144582655, + "grad_norm": 0.039227407425642014, + "learning_rate": 8.396734693143993e-05, + "loss": 0.0153, + "step": 17950 + }, + { + "epoch": 3.23778619073373, + "grad_norm": 0.05793439969420433, + "learning_rate": 8.39471193623941e-05, + "loss": 0.0151, + "step": 17960 + }, + { + "epoch": 3.239588967009194, + "grad_norm": 0.046244360506534576, + "learning_rate": 8.392688148110974e-05, + "loss": 0.0155, + "step": 17970 + }, + { + "epoch": 3.2413917432846584, + "grad_norm": 0.04787372797727585, + "learning_rate": 8.390663329373456e-05, + "loss": 0.0161, + "step": 17980 + }, + { + "epoch": 3.2431945195601224, + "grad_norm": 0.039434079080820084, + "learning_rate": 8.388637480641944e-05, + "loss": 0.0147, + "step": 17990 + }, + { + "epoch": 3.244997295835587, + "grad_norm": 0.034184861928224564, + "learning_rate": 8.386610602531837e-05, + "loss": 0.0161, + "step": 18000 + }, + { + "epoch": 3.246800072111051, + "grad_norm": 0.04426391050219536, + "learning_rate": 8.384582695658847e-05, + "loss": 0.0152, + "step": 18010 + }, + { + "epoch": 3.2486028483865153, + "grad_norm": 0.03570328652858734, + "learning_rate": 8.382553760638999e-05, + "loss": 0.0152, + "step": 18020 + }, + { + "epoch": 3.2504056246619797, + "grad_norm": 0.03266839310526848, + "learning_rate": 8.380523798088631e-05, + "loss": 0.0153, + "step": 18030 + }, + { + "epoch": 3.2522084009374437, + "grad_norm": 0.03823930025100708, + "learning_rate": 8.378492808624389e-05, + "loss": 0.015, + "step": 18040 + }, + { + "epoch": 3.2540111772129077, + "grad_norm": 0.04266727343201637, + "learning_rate": 8.376460792863237e-05, + "loss": 0.0161, + "step": 18050 + }, + { + "epoch": 3.255813953488372, + "grad_norm": 0.05183505639433861, + "learning_rate": 8.374427751422444e-05, + "loss": 0.0148, + "step": 18060 + }, + { + "epoch": 3.2576167297638365, + "grad_norm": 0.04032457619905472, + "learning_rate": 8.3723936849196e-05, + "loss": 0.0155, + "step": 18070 + }, + { + "epoch": 3.2594195060393005, + "grad_norm": 0.04840516299009323, + "learning_rate": 8.370358593972595e-05, + "loss": 0.0162, + "step": 18080 + }, + { + "epoch": 3.2612222823147645, + "grad_norm": 0.052178606390953064, + "learning_rate": 8.36832247919964e-05, + "loss": 0.0161, + "step": 18090 + }, + { + "epoch": 3.263025058590229, + "grad_norm": 0.04207688942551613, + "learning_rate": 8.36628534121925e-05, + "loss": 0.0172, + "step": 18100 + }, + { + "epoch": 3.2648278348656934, + "grad_norm": 0.03454183414578438, + "learning_rate": 8.364247180650254e-05, + "loss": 0.016, + "step": 18110 + }, + { + "epoch": 3.2666306111411574, + "grad_norm": 0.04386918619275093, + "learning_rate": 8.362207998111794e-05, + "loss": 0.0156, + "step": 18120 + }, + { + "epoch": 3.2684333874166214, + "grad_norm": 0.0436563640832901, + "learning_rate": 8.360167794223318e-05, + "loss": 0.0168, + "step": 18130 + }, + { + "epoch": 3.270236163692086, + "grad_norm": 0.030179345980286598, + "learning_rate": 8.358126569604586e-05, + "loss": 0.0157, + "step": 18140 + }, + { + "epoch": 3.2720389399675502, + "grad_norm": 0.04435411095619202, + "learning_rate": 8.356084324875668e-05, + "loss": 0.0156, + "step": 18150 + }, + { + "epoch": 3.2738417162430142, + "grad_norm": 0.042664382606744766, + "learning_rate": 8.354041060656945e-05, + "loss": 0.0148, + "step": 18160 + }, + { + "epoch": 3.2756444925184782, + "grad_norm": 0.046798501163721085, + "learning_rate": 8.351996777569106e-05, + "loss": 0.0167, + "step": 18170 + }, + { + "epoch": 3.2774472687939427, + "grad_norm": 0.052082668989896774, + "learning_rate": 8.349951476233148e-05, + "loss": 0.0138, + "step": 18180 + }, + { + "epoch": 3.279250045069407, + "grad_norm": 0.03868875652551651, + "learning_rate": 8.347905157270386e-05, + "loss": 0.0145, + "step": 18190 + }, + { + "epoch": 3.281052821344871, + "grad_norm": 0.04976166784763336, + "learning_rate": 8.345857821302432e-05, + "loss": 0.0157, + "step": 18200 + }, + { + "epoch": 3.282855597620335, + "grad_norm": 0.03541598096489906, + "learning_rate": 8.343809468951213e-05, + "loss": 0.015, + "step": 18210 + }, + { + "epoch": 3.2846583738957995, + "grad_norm": 0.04596628248691559, + "learning_rate": 8.341760100838965e-05, + "loss": 0.0147, + "step": 18220 + }, + { + "epoch": 3.286461150171264, + "grad_norm": 0.044841449707746506, + "learning_rate": 8.339709717588233e-05, + "loss": 0.0151, + "step": 18230 + }, + { + "epoch": 3.288263926446728, + "grad_norm": 0.0335744246840477, + "learning_rate": 8.33765831982187e-05, + "loss": 0.0154, + "step": 18240 + }, + { + "epoch": 3.290066702722192, + "grad_norm": 0.03940792381763458, + "learning_rate": 8.335605908163035e-05, + "loss": 0.0168, + "step": 18250 + }, + { + "epoch": 3.2918694789976564, + "grad_norm": 0.044453833252191544, + "learning_rate": 8.333552483235196e-05, + "loss": 0.0168, + "step": 18260 + }, + { + "epoch": 3.293672255273121, + "grad_norm": 0.04369669035077095, + "learning_rate": 8.33149804566213e-05, + "loss": 0.0157, + "step": 18270 + }, + { + "epoch": 3.295475031548585, + "grad_norm": 0.04471375048160553, + "learning_rate": 8.329442596067921e-05, + "loss": 0.015, + "step": 18280 + }, + { + "epoch": 3.2972778078240492, + "grad_norm": 0.03485496714711189, + "learning_rate": 8.32738613507696e-05, + "loss": 0.0179, + "step": 18290 + }, + { + "epoch": 3.299080584099513, + "grad_norm": 0.048750292509794235, + "learning_rate": 8.325328663313946e-05, + "loss": 0.0163, + "step": 18300 + }, + { + "epoch": 3.3008833603749776, + "grad_norm": 0.05034734681248665, + "learning_rate": 8.323270181403884e-05, + "loss": 0.0156, + "step": 18310 + }, + { + "epoch": 3.3026861366504416, + "grad_norm": 0.0483270026743412, + "learning_rate": 8.321210689972086e-05, + "loss": 0.0166, + "step": 18320 + }, + { + "epoch": 3.304488912925906, + "grad_norm": 0.03686535730957985, + "learning_rate": 8.319150189644174e-05, + "loss": 0.0165, + "step": 18330 + }, + { + "epoch": 3.30629168920137, + "grad_norm": 0.04839479923248291, + "learning_rate": 8.31708868104607e-05, + "loss": 0.0146, + "step": 18340 + }, + { + "epoch": 3.3080944654768345, + "grad_norm": 0.0390094593167305, + "learning_rate": 8.315026164804007e-05, + "loss": 0.0147, + "step": 18350 + }, + { + "epoch": 3.3098972417522985, + "grad_norm": 0.03898933529853821, + "learning_rate": 8.312962641544524e-05, + "loss": 0.0148, + "step": 18360 + }, + { + "epoch": 3.311700018027763, + "grad_norm": 0.0496540367603302, + "learning_rate": 8.310898111894465e-05, + "loss": 0.0148, + "step": 18370 + }, + { + "epoch": 3.313502794303227, + "grad_norm": 0.05227912217378616, + "learning_rate": 8.308832576480977e-05, + "loss": 0.0144, + "step": 18380 + }, + { + "epoch": 3.3153055705786914, + "grad_norm": 0.039459872990846634, + "learning_rate": 8.306766035931519e-05, + "loss": 0.015, + "step": 18390 + }, + { + "epoch": 3.3171083468541553, + "grad_norm": 0.04014031961560249, + "learning_rate": 8.304698490873847e-05, + "loss": 0.0154, + "step": 18400 + }, + { + "epoch": 3.3189111231296198, + "grad_norm": 0.03413144126534462, + "learning_rate": 8.30262994193603e-05, + "loss": 0.0164, + "step": 18410 + }, + { + "epoch": 3.3207138994050838, + "grad_norm": 0.045919422060251236, + "learning_rate": 8.300560389746438e-05, + "loss": 0.0147, + "step": 18420 + }, + { + "epoch": 3.322516675680548, + "grad_norm": 0.04062395542860031, + "learning_rate": 8.298489834933745e-05, + "loss": 0.0158, + "step": 18430 + }, + { + "epoch": 3.324319451956012, + "grad_norm": 0.03182253614068031, + "learning_rate": 8.296418278126934e-05, + "loss": 0.0165, + "step": 18440 + }, + { + "epoch": 3.3261222282314766, + "grad_norm": 0.061315566301345825, + "learning_rate": 8.294345719955284e-05, + "loss": 0.016, + "step": 18450 + }, + { + "epoch": 3.3279250045069406, + "grad_norm": 0.044311199337244034, + "learning_rate": 8.29227216104839e-05, + "loss": 0.015, + "step": 18460 + }, + { + "epoch": 3.329727780782405, + "grad_norm": 0.049143046140670776, + "learning_rate": 8.290197602036137e-05, + "loss": 0.0146, + "step": 18470 + }, + { + "epoch": 3.331530557057869, + "grad_norm": 0.03165849298238754, + "learning_rate": 8.288122043548725e-05, + "loss": 0.0142, + "step": 18480 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.03787924721837044, + "learning_rate": 8.286045486216657e-05, + "loss": 0.0153, + "step": 18490 + }, + { + "epoch": 3.3351361096087975, + "grad_norm": 0.038495466113090515, + "learning_rate": 8.283967930670733e-05, + "loss": 0.0167, + "step": 18500 + }, + { + "epoch": 3.336938885884262, + "grad_norm": 0.048476848751306534, + "learning_rate": 8.281889377542058e-05, + "loss": 0.0162, + "step": 18510 + }, + { + "epoch": 3.338741662159726, + "grad_norm": 0.04408656060695648, + "learning_rate": 8.279809827462045e-05, + "loss": 0.0143, + "step": 18520 + }, + { + "epoch": 3.3405444384351903, + "grad_norm": 0.04140849784016609, + "learning_rate": 8.277729281062402e-05, + "loss": 0.0147, + "step": 18530 + }, + { + "epoch": 3.3423472147106543, + "grad_norm": 0.04301726445555687, + "learning_rate": 8.27564773897515e-05, + "loss": 0.0146, + "step": 18540 + }, + { + "epoch": 3.3441499909861188, + "grad_norm": 0.0386323444545269, + "learning_rate": 8.273565201832602e-05, + "loss": 0.0153, + "step": 18550 + }, + { + "epoch": 3.3459527672615827, + "grad_norm": 0.04478060454130173, + "learning_rate": 8.27148167026738e-05, + "loss": 0.0153, + "step": 18560 + }, + { + "epoch": 3.347755543537047, + "grad_norm": 0.03527615964412689, + "learning_rate": 8.269397144912405e-05, + "loss": 0.0137, + "step": 18570 + }, + { + "epoch": 3.349558319812511, + "grad_norm": 0.03739015385508537, + "learning_rate": 8.267311626400899e-05, + "loss": 0.0149, + "step": 18580 + }, + { + "epoch": 3.3513610960879756, + "grad_norm": 0.032162945717573166, + "learning_rate": 8.26522511536639e-05, + "loss": 0.0148, + "step": 18590 + }, + { + "epoch": 3.3531638723634396, + "grad_norm": 0.04199276119470596, + "learning_rate": 8.263137612442706e-05, + "loss": 0.0157, + "step": 18600 + }, + { + "epoch": 3.354966648638904, + "grad_norm": 0.034036945551633835, + "learning_rate": 8.261049118263971e-05, + "loss": 0.0157, + "step": 18610 + }, + { + "epoch": 3.356769424914368, + "grad_norm": 0.04772842302918434, + "learning_rate": 8.258959633464619e-05, + "loss": 0.0167, + "step": 18620 + }, + { + "epoch": 3.3585722011898325, + "grad_norm": 0.04918771609663963, + "learning_rate": 8.256869158679377e-05, + "loss": 0.0151, + "step": 18630 + }, + { + "epoch": 3.3603749774652965, + "grad_norm": 0.039065614342689514, + "learning_rate": 8.254777694543278e-05, + "loss": 0.0151, + "step": 18640 + }, + { + "epoch": 3.362177753740761, + "grad_norm": 0.053486429154872894, + "learning_rate": 8.252685241691651e-05, + "loss": 0.0148, + "step": 18650 + }, + { + "epoch": 3.363980530016225, + "grad_norm": 0.04167837277054787, + "learning_rate": 8.250591800760133e-05, + "loss": 0.0138, + "step": 18660 + }, + { + "epoch": 3.3657833062916893, + "grad_norm": 0.04774099215865135, + "learning_rate": 8.248497372384649e-05, + "loss": 0.0151, + "step": 18670 + }, + { + "epoch": 3.3675860825671533, + "grad_norm": 0.04595177248120308, + "learning_rate": 8.246401957201437e-05, + "loss": 0.0143, + "step": 18680 + }, + { + "epoch": 3.3693888588426177, + "grad_norm": 0.04253553971648216, + "learning_rate": 8.244305555847027e-05, + "loss": 0.0177, + "step": 18690 + }, + { + "epoch": 3.3711916351180817, + "grad_norm": 0.035030659288167953, + "learning_rate": 8.24220816895825e-05, + "loss": 0.0157, + "step": 18700 + }, + { + "epoch": 3.372994411393546, + "grad_norm": 0.026178395375609398, + "learning_rate": 8.240109797172237e-05, + "loss": 0.0158, + "step": 18710 + }, + { + "epoch": 3.37479718766901, + "grad_norm": 0.051227863878011703, + "learning_rate": 8.238010441126416e-05, + "loss": 0.014, + "step": 18720 + }, + { + "epoch": 3.3765999639444746, + "grad_norm": 0.03194168210029602, + "learning_rate": 8.23591010145852e-05, + "loss": 0.0138, + "step": 18730 + }, + { + "epoch": 3.3784027402199386, + "grad_norm": 0.057198453694581985, + "learning_rate": 8.233808778806571e-05, + "loss": 0.0153, + "step": 18740 + }, + { + "epoch": 3.380205516495403, + "grad_norm": 0.035380154848098755, + "learning_rate": 8.231706473808903e-05, + "loss": 0.0142, + "step": 18750 + }, + { + "epoch": 3.382008292770867, + "grad_norm": 0.04328013211488724, + "learning_rate": 8.229603187104133e-05, + "loss": 0.015, + "step": 18760 + }, + { + "epoch": 3.3838110690463314, + "grad_norm": 0.05469929054379463, + "learning_rate": 8.22749891933119e-05, + "loss": 0.0146, + "step": 18770 + }, + { + "epoch": 3.3856138453217954, + "grad_norm": 0.04030578210949898, + "learning_rate": 8.225393671129291e-05, + "loss": 0.0162, + "step": 18780 + }, + { + "epoch": 3.38741662159726, + "grad_norm": 0.03206414356827736, + "learning_rate": 8.223287443137957e-05, + "loss": 0.0139, + "step": 18790 + }, + { + "epoch": 3.389219397872724, + "grad_norm": 0.046366751194000244, + "learning_rate": 8.221180235997004e-05, + "loss": 0.0142, + "step": 18800 + }, + { + "epoch": 3.3910221741481883, + "grad_norm": 0.04616618901491165, + "learning_rate": 8.219072050346544e-05, + "loss": 0.0151, + "step": 18810 + }, + { + "epoch": 3.3928249504236523, + "grad_norm": 0.03186355531215668, + "learning_rate": 8.216962886826992e-05, + "loss": 0.0153, + "step": 18820 + }, + { + "epoch": 3.3946277266991167, + "grad_norm": 0.0446510910987854, + "learning_rate": 8.214852746079054e-05, + "loss": 0.0146, + "step": 18830 + }, + { + "epoch": 3.3964305029745807, + "grad_norm": 0.0341116264462471, + "learning_rate": 8.212741628743732e-05, + "loss": 0.0158, + "step": 18840 + }, + { + "epoch": 3.398233279250045, + "grad_norm": 0.055387869477272034, + "learning_rate": 8.210629535462333e-05, + "loss": 0.016, + "step": 18850 + }, + { + "epoch": 3.400036055525509, + "grad_norm": 0.02793276496231556, + "learning_rate": 8.208516466876453e-05, + "loss": 0.015, + "step": 18860 + }, + { + "epoch": 3.4018388318009736, + "grad_norm": 0.043695736676454544, + "learning_rate": 8.206402423627986e-05, + "loss": 0.0153, + "step": 18870 + }, + { + "epoch": 3.4036416080764376, + "grad_norm": 0.04137837141752243, + "learning_rate": 8.204287406359124e-05, + "loss": 0.0148, + "step": 18880 + }, + { + "epoch": 3.405444384351902, + "grad_norm": 0.041937507688999176, + "learning_rate": 8.20217141571235e-05, + "loss": 0.0141, + "step": 18890 + }, + { + "epoch": 3.407247160627366, + "grad_norm": 0.04385535418987274, + "learning_rate": 8.200054452330449e-05, + "loss": 0.0139, + "step": 18900 + }, + { + "epoch": 3.4090499369028304, + "grad_norm": 0.04438767954707146, + "learning_rate": 8.197936516856499e-05, + "loss": 0.0137, + "step": 18910 + }, + { + "epoch": 3.4108527131782944, + "grad_norm": 0.03926495835185051, + "learning_rate": 8.195817609933871e-05, + "loss": 0.014, + "step": 18920 + }, + { + "epoch": 3.412655489453759, + "grad_norm": 0.03984593600034714, + "learning_rate": 8.193697732206233e-05, + "loss": 0.0149, + "step": 18930 + }, + { + "epoch": 3.414458265729223, + "grad_norm": 0.040331099182367325, + "learning_rate": 8.19157688431755e-05, + "loss": 0.0129, + "step": 18940 + }, + { + "epoch": 3.4162610420046873, + "grad_norm": 0.04140309616923332, + "learning_rate": 8.189455066912077e-05, + "loss": 0.0139, + "step": 18950 + }, + { + "epoch": 3.4180638182801513, + "grad_norm": 0.052788883447647095, + "learning_rate": 8.187332280634369e-05, + "loss": 0.0147, + "step": 18960 + }, + { + "epoch": 3.4198665945556157, + "grad_norm": 0.03218856826424599, + "learning_rate": 8.18520852612927e-05, + "loss": 0.0146, + "step": 18970 + }, + { + "epoch": 3.4216693708310797, + "grad_norm": 0.034763772040605545, + "learning_rate": 8.183083804041921e-05, + "loss": 0.0164, + "step": 18980 + }, + { + "epoch": 3.423472147106544, + "grad_norm": 0.0383109413087368, + "learning_rate": 8.180958115017757e-05, + "loss": 0.0164, + "step": 18990 + }, + { + "epoch": 3.425274923382008, + "grad_norm": 0.041905343532562256, + "learning_rate": 8.178831459702505e-05, + "loss": 0.0147, + "step": 19000 + }, + { + "epoch": 3.4270776996574726, + "grad_norm": 0.03813144192099571, + "learning_rate": 8.17670383874219e-05, + "loss": 0.0149, + "step": 19010 + }, + { + "epoch": 3.4288804759329365, + "grad_norm": 0.06741496175527573, + "learning_rate": 8.174575252783124e-05, + "loss": 0.0148, + "step": 19020 + }, + { + "epoch": 3.430683252208401, + "grad_norm": 0.03418548405170441, + "learning_rate": 8.172445702471914e-05, + "loss": 0.0145, + "step": 19030 + }, + { + "epoch": 3.432486028483865, + "grad_norm": 0.040689241141080856, + "learning_rate": 8.170315188455466e-05, + "loss": 0.0147, + "step": 19040 + }, + { + "epoch": 3.4342888047593294, + "grad_norm": 0.053933627903461456, + "learning_rate": 8.168183711380969e-05, + "loss": 0.0152, + "step": 19050 + }, + { + "epoch": 3.4360915810347934, + "grad_norm": 0.03319588303565979, + "learning_rate": 8.166051271895913e-05, + "loss": 0.0146, + "step": 19060 + }, + { + "epoch": 3.437894357310258, + "grad_norm": 0.047280509024858475, + "learning_rate": 8.163917870648075e-05, + "loss": 0.015, + "step": 19070 + }, + { + "epoch": 3.439697133585722, + "grad_norm": 0.04009801521897316, + "learning_rate": 8.161783508285526e-05, + "loss": 0.0154, + "step": 19080 + }, + { + "epoch": 3.4414999098611863, + "grad_norm": 0.030150217935442924, + "learning_rate": 8.159648185456628e-05, + "loss": 0.0143, + "step": 19090 + }, + { + "epoch": 3.4433026861366502, + "grad_norm": 0.03477521613240242, + "learning_rate": 8.157511902810038e-05, + "loss": 0.0145, + "step": 19100 + }, + { + "epoch": 3.4451054624121147, + "grad_norm": 0.03518941253423691, + "learning_rate": 8.155374660994701e-05, + "loss": 0.014, + "step": 19110 + }, + { + "epoch": 3.4469082386875787, + "grad_norm": 0.03950391709804535, + "learning_rate": 8.153236460659857e-05, + "loss": 0.0162, + "step": 19120 + }, + { + "epoch": 3.448711014963043, + "grad_norm": 0.04165796563029289, + "learning_rate": 8.151097302455031e-05, + "loss": 0.015, + "step": 19130 + }, + { + "epoch": 3.4505137912385075, + "grad_norm": 0.04732467979192734, + "learning_rate": 8.148957187030044e-05, + "loss": 0.0143, + "step": 19140 + }, + { + "epoch": 3.4523165675139715, + "grad_norm": 0.039161551743745804, + "learning_rate": 8.146816115035006e-05, + "loss": 0.0148, + "step": 19150 + }, + { + "epoch": 3.4541193437894355, + "grad_norm": 0.034740742295980453, + "learning_rate": 8.14467408712032e-05, + "loss": 0.0149, + "step": 19160 + }, + { + "epoch": 3.4559221200649, + "grad_norm": 0.05445292964577675, + "learning_rate": 8.142531103936678e-05, + "loss": 0.0151, + "step": 19170 + }, + { + "epoch": 3.4577248963403644, + "grad_norm": 0.03598875179886818, + "learning_rate": 8.14038716613506e-05, + "loss": 0.0156, + "step": 19180 + }, + { + "epoch": 3.4595276726158284, + "grad_norm": 0.06519860774278641, + "learning_rate": 8.138242274366736e-05, + "loss": 0.0157, + "step": 19190 + }, + { + "epoch": 3.4613304488912924, + "grad_norm": 0.038065992295742035, + "learning_rate": 8.136096429283271e-05, + "loss": 0.0149, + "step": 19200 + }, + { + "epoch": 3.463133225166757, + "grad_norm": 0.04890502989292145, + "learning_rate": 8.133949631536515e-05, + "loss": 0.014, + "step": 19210 + }, + { + "epoch": 3.4649360014422212, + "grad_norm": 0.047623224556446075, + "learning_rate": 8.131801881778607e-05, + "loss": 0.0151, + "step": 19220 + }, + { + "epoch": 3.4667387777176852, + "grad_norm": 0.03559454530477524, + "learning_rate": 8.129653180661978e-05, + "loss": 0.0139, + "step": 19230 + }, + { + "epoch": 3.4685415539931492, + "grad_norm": 0.04051755741238594, + "learning_rate": 8.127503528839346e-05, + "loss": 0.0153, + "step": 19240 + }, + { + "epoch": 3.4703443302686137, + "grad_norm": 0.03824333846569061, + "learning_rate": 8.125352926963721e-05, + "loss": 0.0152, + "step": 19250 + }, + { + "epoch": 3.472147106544078, + "grad_norm": 0.04621855169534683, + "learning_rate": 8.123201375688395e-05, + "loss": 0.0158, + "step": 19260 + }, + { + "epoch": 3.473949882819542, + "grad_norm": 0.05880158767104149, + "learning_rate": 8.121048875666954e-05, + "loss": 0.0144, + "step": 19270 + }, + { + "epoch": 3.475752659095006, + "grad_norm": 0.07258862257003784, + "learning_rate": 8.118895427553274e-05, + "loss": 0.0172, + "step": 19280 + }, + { + "epoch": 3.4775554353704705, + "grad_norm": 0.046844080090522766, + "learning_rate": 8.116741032001511e-05, + "loss": 0.0154, + "step": 19290 + }, + { + "epoch": 3.479358211645935, + "grad_norm": 0.050082698464393616, + "learning_rate": 8.114585689666114e-05, + "loss": 0.0155, + "step": 19300 + }, + { + "epoch": 3.481160987921399, + "grad_norm": 0.039941903203725815, + "learning_rate": 8.112429401201821e-05, + "loss": 0.0151, + "step": 19310 + }, + { + "epoch": 3.482963764196863, + "grad_norm": 0.03121318109333515, + "learning_rate": 8.110272167263656e-05, + "loss": 0.0147, + "step": 19320 + }, + { + "epoch": 3.4847665404723274, + "grad_norm": 0.03897875174880028, + "learning_rate": 8.108113988506929e-05, + "loss": 0.0146, + "step": 19330 + }, + { + "epoch": 3.486569316747792, + "grad_norm": 0.038085777312517166, + "learning_rate": 8.105954865587235e-05, + "loss": 0.0142, + "step": 19340 + }, + { + "epoch": 3.488372093023256, + "grad_norm": 0.03362308442592621, + "learning_rate": 8.103794799160463e-05, + "loss": 0.0147, + "step": 19350 + }, + { + "epoch": 3.49017486929872, + "grad_norm": 0.050041403621435165, + "learning_rate": 8.101633789882781e-05, + "loss": 0.0158, + "step": 19360 + }, + { + "epoch": 3.491977645574184, + "grad_norm": 0.049553874880075455, + "learning_rate": 8.099471838410648e-05, + "loss": 0.0149, + "step": 19370 + }, + { + "epoch": 3.4937804218496487, + "grad_norm": 0.040743015706539154, + "learning_rate": 8.097308945400806e-05, + "loss": 0.015, + "step": 19380 + }, + { + "epoch": 3.4955831981251126, + "grad_norm": 0.0337827205657959, + "learning_rate": 8.095145111510288e-05, + "loss": 0.0138, + "step": 19390 + }, + { + "epoch": 3.497385974400577, + "grad_norm": 0.049499817192554474, + "learning_rate": 8.092980337396406e-05, + "loss": 0.0146, + "step": 19400 + }, + { + "epoch": 3.499188750676041, + "grad_norm": 0.039193812757730484, + "learning_rate": 8.090814623716763e-05, + "loss": 0.0139, + "step": 19410 + }, + { + "epoch": 3.5009915269515055, + "grad_norm": 0.04294576495885849, + "learning_rate": 8.088647971129246e-05, + "loss": 0.0149, + "step": 19420 + }, + { + "epoch": 3.5027943032269695, + "grad_norm": 0.039534565061330795, + "learning_rate": 8.086480380292026e-05, + "loss": 0.0142, + "step": 19430 + }, + { + "epoch": 3.5045970795024335, + "grad_norm": 0.041202276945114136, + "learning_rate": 8.084311851863562e-05, + "loss": 0.0143, + "step": 19440 + }, + { + "epoch": 3.506399855777898, + "grad_norm": 0.054129671305418015, + "learning_rate": 8.082142386502591e-05, + "loss": 0.0157, + "step": 19450 + }, + { + "epoch": 3.5082026320533624, + "grad_norm": 0.05114090442657471, + "learning_rate": 8.079971984868145e-05, + "loss": 0.0163, + "step": 19460 + }, + { + "epoch": 3.5100054083288263, + "grad_norm": 0.031046036630868912, + "learning_rate": 8.077800647619532e-05, + "loss": 0.0151, + "step": 19470 + }, + { + "epoch": 3.5118081846042903, + "grad_norm": 0.050280485302209854, + "learning_rate": 8.075628375416345e-05, + "loss": 0.0157, + "step": 19480 + }, + { + "epoch": 3.5136109608797548, + "grad_norm": 0.04369664937257767, + "learning_rate": 8.073455168918464e-05, + "loss": 0.0158, + "step": 19490 + }, + { + "epoch": 3.515413737155219, + "grad_norm": 0.044924430549144745, + "learning_rate": 8.071281028786055e-05, + "loss": 0.0152, + "step": 19500 + }, + { + "epoch": 3.517216513430683, + "grad_norm": 0.032675743103027344, + "learning_rate": 8.069105955679562e-05, + "loss": 0.0148, + "step": 19510 + }, + { + "epoch": 3.519019289706147, + "grad_norm": 0.05602207034826279, + "learning_rate": 8.066929950259713e-05, + "loss": 0.0154, + "step": 19520 + }, + { + "epoch": 3.5208220659816116, + "grad_norm": 0.04105401411652565, + "learning_rate": 8.064753013187522e-05, + "loss": 0.0142, + "step": 19530 + }, + { + "epoch": 3.522624842257076, + "grad_norm": 0.035338144749403, + "learning_rate": 8.062575145124289e-05, + "loss": 0.0165, + "step": 19540 + }, + { + "epoch": 3.52442761853254, + "grad_norm": 0.04058060795068741, + "learning_rate": 8.060396346731587e-05, + "loss": 0.0151, + "step": 19550 + }, + { + "epoch": 3.5262303948080045, + "grad_norm": 0.040193576365709305, + "learning_rate": 8.058216618671281e-05, + "loss": 0.0173, + "step": 19560 + }, + { + "epoch": 3.5280331710834685, + "grad_norm": 0.037627872079610825, + "learning_rate": 8.056035961605514e-05, + "loss": 0.0144, + "step": 19570 + }, + { + "epoch": 3.529835947358933, + "grad_norm": 0.04159519448876381, + "learning_rate": 8.05385437619671e-05, + "loss": 0.0151, + "step": 19580 + }, + { + "epoch": 3.531638723634397, + "grad_norm": 0.04496466740965843, + "learning_rate": 8.05167186310758e-05, + "loss": 0.0158, + "step": 19590 + }, + { + "epoch": 3.5334414999098613, + "grad_norm": 0.03798772022128105, + "learning_rate": 8.049488423001113e-05, + "loss": 0.0156, + "step": 19600 + }, + { + "epoch": 3.5352442761853253, + "grad_norm": 0.038500335067510605, + "learning_rate": 8.047304056540581e-05, + "loss": 0.0159, + "step": 19610 + }, + { + "epoch": 3.5370470524607898, + "grad_norm": 0.04387033358216286, + "learning_rate": 8.045118764389534e-05, + "loss": 0.0146, + "step": 19620 + }, + { + "epoch": 3.5388498287362538, + "grad_norm": 0.03716866672039032, + "learning_rate": 8.042932547211809e-05, + "loss": 0.0147, + "step": 19630 + }, + { + "epoch": 3.540652605011718, + "grad_norm": 0.03922867029905319, + "learning_rate": 8.04074540567152e-05, + "loss": 0.0148, + "step": 19640 + }, + { + "epoch": 3.542455381287182, + "grad_norm": 0.05910570174455643, + "learning_rate": 8.038557340433063e-05, + "loss": 0.016, + "step": 19650 + }, + { + "epoch": 3.5442581575626466, + "grad_norm": 0.04391755908727646, + "learning_rate": 8.036368352161115e-05, + "loss": 0.0141, + "step": 19660 + }, + { + "epoch": 3.5460609338381106, + "grad_norm": 0.039540257304906845, + "learning_rate": 8.034178441520633e-05, + "loss": 0.0148, + "step": 19670 + }, + { + "epoch": 3.547863710113575, + "grad_norm": 0.039864297956228256, + "learning_rate": 8.031987609176852e-05, + "loss": 0.0149, + "step": 19680 + }, + { + "epoch": 3.549666486389039, + "grad_norm": 0.048623815178871155, + "learning_rate": 8.02979585579529e-05, + "loss": 0.0149, + "step": 19690 + }, + { + "epoch": 3.5514692626645035, + "grad_norm": 0.037327881902456284, + "learning_rate": 8.027603182041745e-05, + "loss": 0.014, + "step": 19700 + }, + { + "epoch": 3.5532720389399675, + "grad_norm": 0.04462967440485954, + "learning_rate": 8.025409588582292e-05, + "loss": 0.0141, + "step": 19710 + }, + { + "epoch": 3.555074815215432, + "grad_norm": 0.04640517011284828, + "learning_rate": 8.023215076083288e-05, + "loss": 0.0149, + "step": 19720 + }, + { + "epoch": 3.556877591490896, + "grad_norm": 0.046785250306129456, + "learning_rate": 8.021019645211367e-05, + "loss": 0.0156, + "step": 19730 + }, + { + "epoch": 3.5586803677663603, + "grad_norm": 0.02823708765208721, + "learning_rate": 8.018823296633441e-05, + "loss": 0.0145, + "step": 19740 + }, + { + "epoch": 3.5604831440418243, + "grad_norm": 0.042899079620838165, + "learning_rate": 8.016626031016708e-05, + "loss": 0.0145, + "step": 19750 + }, + { + "epoch": 3.5622859203172887, + "grad_norm": 0.046258144080638885, + "learning_rate": 8.014427849028636e-05, + "loss": 0.0144, + "step": 19760 + }, + { + "epoch": 3.5640886965927527, + "grad_norm": 0.042662426829338074, + "learning_rate": 8.012228751336974e-05, + "loss": 0.0159, + "step": 19770 + }, + { + "epoch": 3.565891472868217, + "grad_norm": 0.058372534811496735, + "learning_rate": 8.01002873860975e-05, + "loss": 0.0155, + "step": 19780 + }, + { + "epoch": 3.567694249143681, + "grad_norm": 0.07163384556770325, + "learning_rate": 8.00782781151527e-05, + "loss": 0.0158, + "step": 19790 + }, + { + "epoch": 3.5694970254191456, + "grad_norm": 0.03941011801362038, + "learning_rate": 8.005625970722119e-05, + "loss": 0.0144, + "step": 19800 + }, + { + "epoch": 3.5712998016946096, + "grad_norm": 0.033405859023332596, + "learning_rate": 8.003423216899158e-05, + "loss": 0.0149, + "step": 19810 + }, + { + "epoch": 3.573102577970074, + "grad_norm": 0.05415824428200722, + "learning_rate": 8.001219550715522e-05, + "loss": 0.0144, + "step": 19820 + }, + { + "epoch": 3.574905354245538, + "grad_norm": 0.03169209137558937, + "learning_rate": 7.999014972840632e-05, + "loss": 0.0157, + "step": 19830 + }, + { + "epoch": 3.5767081305210024, + "grad_norm": 0.035620275884866714, + "learning_rate": 7.996809483944174e-05, + "loss": 0.0147, + "step": 19840 + }, + { + "epoch": 3.5785109067964664, + "grad_norm": 0.04325123876333237, + "learning_rate": 7.994603084696124e-05, + "loss": 0.0145, + "step": 19850 + }, + { + "epoch": 3.580313683071931, + "grad_norm": 0.03547627851366997, + "learning_rate": 7.992395775766724e-05, + "loss": 0.0154, + "step": 19860 + }, + { + "epoch": 3.582116459347395, + "grad_norm": 0.05444241687655449, + "learning_rate": 7.990187557826497e-05, + "loss": 0.0147, + "step": 19870 + }, + { + "epoch": 3.5839192356228593, + "grad_norm": 0.0524296797811985, + "learning_rate": 7.987978431546242e-05, + "loss": 0.0162, + "step": 19880 + }, + { + "epoch": 3.5857220118983233, + "grad_norm": 0.04514648765325546, + "learning_rate": 7.985768397597031e-05, + "loss": 0.0158, + "step": 19890 + }, + { + "epoch": 3.5875247881737877, + "grad_norm": 0.038352470844984055, + "learning_rate": 7.983557456650216e-05, + "loss": 0.015, + "step": 19900 + }, + { + "epoch": 3.5893275644492517, + "grad_norm": 0.03553672507405281, + "learning_rate": 7.981345609377422e-05, + "loss": 0.0139, + "step": 19910 + }, + { + "epoch": 3.591130340724716, + "grad_norm": 0.02750222571194172, + "learning_rate": 7.97913285645055e-05, + "loss": 0.0141, + "step": 19920 + }, + { + "epoch": 3.59293311700018, + "grad_norm": 0.03494933247566223, + "learning_rate": 7.976919198541776e-05, + "loss": 0.0148, + "step": 19930 + }, + { + "epoch": 3.5947358932756446, + "grad_norm": 0.0474669486284256, + "learning_rate": 7.974704636323548e-05, + "loss": 0.0142, + "step": 19940 + }, + { + "epoch": 3.5965386695511086, + "grad_norm": 0.04354124888777733, + "learning_rate": 7.972489170468597e-05, + "loss": 0.0141, + "step": 19950 + }, + { + "epoch": 3.598341445826573, + "grad_norm": 0.03839286044239998, + "learning_rate": 7.970272801649918e-05, + "loss": 0.0142, + "step": 19960 + }, + { + "epoch": 3.600144222102037, + "grad_norm": 0.04027426987886429, + "learning_rate": 7.96805553054079e-05, + "loss": 0.0143, + "step": 19970 + }, + { + "epoch": 3.6019469983775014, + "grad_norm": 0.03818195313215256, + "learning_rate": 7.965837357814756e-05, + "loss": 0.014, + "step": 19980 + }, + { + "epoch": 3.6037497746529654, + "grad_norm": 0.034888673573732376, + "learning_rate": 7.963618284145643e-05, + "loss": 0.014, + "step": 19990 + }, + { + "epoch": 3.60555255092843, + "grad_norm": 0.05059482529759407, + "learning_rate": 7.961398310207544e-05, + "loss": 0.0153, + "step": 20000 + }, + { + "epoch": 3.607355327203894, + "grad_norm": 0.051929086446762085, + "learning_rate": 7.95917743667483e-05, + "loss": 0.0162, + "step": 20010 + }, + { + "epoch": 3.6091581034793583, + "grad_norm": 0.03973250836133957, + "learning_rate": 7.956955664222144e-05, + "loss": 0.0142, + "step": 20020 + }, + { + "epoch": 3.6109608797548223, + "grad_norm": 0.04374244809150696, + "learning_rate": 7.954732993524399e-05, + "loss": 0.0146, + "step": 20030 + }, + { + "epoch": 3.6127636560302867, + "grad_norm": 0.03251856938004494, + "learning_rate": 7.952509425256786e-05, + "loss": 0.0155, + "step": 20040 + }, + { + "epoch": 3.6145664323057507, + "grad_norm": 0.03911960870027542, + "learning_rate": 7.950284960094767e-05, + "loss": 0.0148, + "step": 20050 + }, + { + "epoch": 3.616369208581215, + "grad_norm": 0.04027637466788292, + "learning_rate": 7.948059598714076e-05, + "loss": 0.0159, + "step": 20060 + }, + { + "epoch": 3.618171984856679, + "grad_norm": 0.029174070805311203, + "learning_rate": 7.945833341790717e-05, + "loss": 0.0152, + "step": 20070 + }, + { + "epoch": 3.6199747611321436, + "grad_norm": 0.03697417303919792, + "learning_rate": 7.94360619000097e-05, + "loss": 0.016, + "step": 20080 + }, + { + "epoch": 3.6217775374076076, + "grad_norm": 0.0647302195429802, + "learning_rate": 7.941378144021381e-05, + "loss": 0.0164, + "step": 20090 + }, + { + "epoch": 3.623580313683072, + "grad_norm": 0.038463521748781204, + "learning_rate": 7.939149204528777e-05, + "loss": 0.0148, + "step": 20100 + }, + { + "epoch": 3.625383089958536, + "grad_norm": 0.06270292401313782, + "learning_rate": 7.936919372200246e-05, + "loss": 0.0143, + "step": 20110 + }, + { + "epoch": 3.6271858662340004, + "grad_norm": 0.049879152327775955, + "learning_rate": 7.934688647713158e-05, + "loss": 0.0147, + "step": 20120 + }, + { + "epoch": 3.628988642509465, + "grad_norm": 0.039216913282871246, + "learning_rate": 7.932457031745143e-05, + "loss": 0.0162, + "step": 20130 + }, + { + "epoch": 3.630791418784929, + "grad_norm": 0.04754387587308884, + "learning_rate": 7.930224524974108e-05, + "loss": 0.0155, + "step": 20140 + }, + { + "epoch": 3.632594195060393, + "grad_norm": 0.04411318525671959, + "learning_rate": 7.927991128078232e-05, + "loss": 0.0145, + "step": 20150 + }, + { + "epoch": 3.6343969713358573, + "grad_norm": 0.033972788602113724, + "learning_rate": 7.925756841735958e-05, + "loss": 0.0146, + "step": 20160 + }, + { + "epoch": 3.6361997476113217, + "grad_norm": 0.03916715458035469, + "learning_rate": 7.923521666626008e-05, + "loss": 0.0149, + "step": 20170 + }, + { + "epoch": 3.6380025238867857, + "grad_norm": 0.057748373597860336, + "learning_rate": 7.921285603427366e-05, + "loss": 0.0153, + "step": 20180 + }, + { + "epoch": 3.6398053001622497, + "grad_norm": 0.039206746965646744, + "learning_rate": 7.91904865281929e-05, + "loss": 0.0159, + "step": 20190 + }, + { + "epoch": 3.641608076437714, + "grad_norm": 0.044377319514751434, + "learning_rate": 7.916810815481307e-05, + "loss": 0.0167, + "step": 20200 + }, + { + "epoch": 3.6434108527131785, + "grad_norm": 0.03270238637924194, + "learning_rate": 7.914572092093211e-05, + "loss": 0.0153, + "step": 20210 + }, + { + "epoch": 3.6452136289886425, + "grad_norm": 0.047995951026678085, + "learning_rate": 7.912332483335068e-05, + "loss": 0.0145, + "step": 20220 + }, + { + "epoch": 3.6470164052641065, + "grad_norm": 0.03722942993044853, + "learning_rate": 7.910091989887213e-05, + "loss": 0.0144, + "step": 20230 + }, + { + "epoch": 3.648819181539571, + "grad_norm": 0.03343651071190834, + "learning_rate": 7.907850612430248e-05, + "loss": 0.0144, + "step": 20240 + }, + { + "epoch": 3.6506219578150354, + "grad_norm": 0.057356301695108414, + "learning_rate": 7.905608351645044e-05, + "loss": 0.0157, + "step": 20250 + }, + { + "epoch": 3.6524247340904994, + "grad_norm": 0.04138507321476936, + "learning_rate": 7.90336520821274e-05, + "loss": 0.014, + "step": 20260 + }, + { + "epoch": 3.6542275103659634, + "grad_norm": 0.03338415175676346, + "learning_rate": 7.901121182814746e-05, + "loss": 0.0146, + "step": 20270 + }, + { + "epoch": 3.656030286641428, + "grad_norm": 0.026249967515468597, + "learning_rate": 7.898876276132736e-05, + "loss": 0.0131, + "step": 20280 + }, + { + "epoch": 3.6578330629168923, + "grad_norm": 0.042465727776288986, + "learning_rate": 7.896630488848654e-05, + "loss": 0.0146, + "step": 20290 + }, + { + "epoch": 3.6596358391923562, + "grad_norm": 0.03768191486597061, + "learning_rate": 7.89438382164471e-05, + "loss": 0.0148, + "step": 20300 + }, + { + "epoch": 3.6614386154678202, + "grad_norm": 0.04162950441241264, + "learning_rate": 7.892136275203383e-05, + "loss": 0.0147, + "step": 20310 + }, + { + "epoch": 3.6632413917432847, + "grad_norm": 0.047552138566970825, + "learning_rate": 7.889887850207418e-05, + "loss": 0.0144, + "step": 20320 + }, + { + "epoch": 3.665044168018749, + "grad_norm": 0.05277705565094948, + "learning_rate": 7.887638547339827e-05, + "loss": 0.0144, + "step": 20330 + }, + { + "epoch": 3.666846944294213, + "grad_norm": 0.045763008296489716, + "learning_rate": 7.885388367283891e-05, + "loss": 0.0145, + "step": 20340 + }, + { + "epoch": 3.668649720569677, + "grad_norm": 0.0361752025783062, + "learning_rate": 7.88313731072315e-05, + "loss": 0.0147, + "step": 20350 + }, + { + "epoch": 3.6704524968451415, + "grad_norm": 0.04635787010192871, + "learning_rate": 7.88088537834142e-05, + "loss": 0.0143, + "step": 20360 + }, + { + "epoch": 3.672255273120606, + "grad_norm": 0.04749641939997673, + "learning_rate": 7.878632570822778e-05, + "loss": 0.0148, + "step": 20370 + }, + { + "epoch": 3.67405804939607, + "grad_norm": 0.043958891183137894, + "learning_rate": 7.876378888851567e-05, + "loss": 0.0142, + "step": 20380 + }, + { + "epoch": 3.675860825671534, + "grad_norm": 0.03590122237801552, + "learning_rate": 7.874124333112396e-05, + "loss": 0.0135, + "step": 20390 + }, + { + "epoch": 3.6776636019469984, + "grad_norm": 0.046476226300001144, + "learning_rate": 7.871868904290138e-05, + "loss": 0.0139, + "step": 20400 + }, + { + "epoch": 3.679466378222463, + "grad_norm": 0.051242485642433167, + "learning_rate": 7.869612603069935e-05, + "loss": 0.0157, + "step": 20410 + }, + { + "epoch": 3.681269154497927, + "grad_norm": 0.036561571061611176, + "learning_rate": 7.867355430137192e-05, + "loss": 0.0136, + "step": 20420 + }, + { + "epoch": 3.683071930773391, + "grad_norm": 0.037889521569013596, + "learning_rate": 7.865097386177577e-05, + "loss": 0.0143, + "step": 20430 + }, + { + "epoch": 3.6848747070488552, + "grad_norm": 0.05961771309375763, + "learning_rate": 7.862838471877023e-05, + "loss": 0.014, + "step": 20440 + }, + { + "epoch": 3.6866774833243197, + "grad_norm": 0.03327816724777222, + "learning_rate": 7.860578687921731e-05, + "loss": 0.0151, + "step": 20450 + }, + { + "epoch": 3.6884802595997837, + "grad_norm": 0.03874890133738518, + "learning_rate": 7.858318034998164e-05, + "loss": 0.0153, + "step": 20460 + }, + { + "epoch": 3.6902830358752476, + "grad_norm": 0.04550622031092644, + "learning_rate": 7.856056513793046e-05, + "loss": 0.0143, + "step": 20470 + }, + { + "epoch": 3.692085812150712, + "grad_norm": 0.0385313555598259, + "learning_rate": 7.85379412499337e-05, + "loss": 0.0145, + "step": 20480 + }, + { + "epoch": 3.6938885884261765, + "grad_norm": 0.03136172518134117, + "learning_rate": 7.851530869286389e-05, + "loss": 0.0141, + "step": 20490 + }, + { + "epoch": 3.6956913647016405, + "grad_norm": 0.03777362033724785, + "learning_rate": 7.849266747359619e-05, + "loss": 0.0141, + "step": 20500 + }, + { + "epoch": 3.6974941409771045, + "grad_norm": 0.03591597452759743, + "learning_rate": 7.847001759900843e-05, + "loss": 0.014, + "step": 20510 + }, + { + "epoch": 3.699296917252569, + "grad_norm": 0.04810404032468796, + "learning_rate": 7.844735907598102e-05, + "loss": 0.0151, + "step": 20520 + }, + { + "epoch": 3.7010996935280334, + "grad_norm": 0.03622191399335861, + "learning_rate": 7.842469191139703e-05, + "loss": 0.0137, + "step": 20530 + }, + { + "epoch": 3.7029024698034974, + "grad_norm": 0.06614887714385986, + "learning_rate": 7.840201611214215e-05, + "loss": 0.0146, + "step": 20540 + }, + { + "epoch": 3.7047052460789613, + "grad_norm": 0.03529386222362518, + "learning_rate": 7.837933168510469e-05, + "loss": 0.015, + "step": 20550 + }, + { + "epoch": 3.706508022354426, + "grad_norm": 0.041570164263248444, + "learning_rate": 7.835663863717559e-05, + "loss": 0.0149, + "step": 20560 + }, + { + "epoch": 3.70831079862989, + "grad_norm": 0.032178789377212524, + "learning_rate": 7.833393697524838e-05, + "loss": 0.0141, + "step": 20570 + }, + { + "epoch": 3.710113574905354, + "grad_norm": 0.032469525933265686, + "learning_rate": 7.831122670621922e-05, + "loss": 0.0141, + "step": 20580 + }, + { + "epoch": 3.711916351180818, + "grad_norm": 0.04391586035490036, + "learning_rate": 7.82885078369869e-05, + "loss": 0.0137, + "step": 20590 + }, + { + "epoch": 3.7137191274562826, + "grad_norm": 0.049191899597644806, + "learning_rate": 7.826578037445283e-05, + "loss": 0.0146, + "step": 20600 + }, + { + "epoch": 3.715521903731747, + "grad_norm": 0.05421588197350502, + "learning_rate": 7.824304432552097e-05, + "loss": 0.0156, + "step": 20610 + }, + { + "epoch": 3.717324680007211, + "grad_norm": 0.050945352762937546, + "learning_rate": 7.822029969709798e-05, + "loss": 0.015, + "step": 20620 + }, + { + "epoch": 3.719127456282675, + "grad_norm": 0.03762253746390343, + "learning_rate": 7.819754649609306e-05, + "loss": 0.0143, + "step": 20630 + }, + { + "epoch": 3.7209302325581395, + "grad_norm": 0.050063569098711014, + "learning_rate": 7.817478472941802e-05, + "loss": 0.014, + "step": 20640 + }, + { + "epoch": 3.722733008833604, + "grad_norm": 0.04281818866729736, + "learning_rate": 7.815201440398727e-05, + "loss": 0.015, + "step": 20650 + }, + { + "epoch": 3.724535785109068, + "grad_norm": 0.06540575623512268, + "learning_rate": 7.812923552671789e-05, + "loss": 0.0147, + "step": 20660 + }, + { + "epoch": 3.7263385613845323, + "grad_norm": 0.04596049338579178, + "learning_rate": 7.810644810452945e-05, + "loss": 0.0129, + "step": 20670 + }, + { + "epoch": 3.7281413376599963, + "grad_norm": 0.03492709994316101, + "learning_rate": 7.808365214434417e-05, + "loss": 0.0138, + "step": 20680 + }, + { + "epoch": 3.7299441139354608, + "grad_norm": 0.051774825900793076, + "learning_rate": 7.80608476530869e-05, + "loss": 0.0159, + "step": 20690 + }, + { + "epoch": 3.7317468902109248, + "grad_norm": 0.04082994535565376, + "learning_rate": 7.8038034637685e-05, + "loss": 0.0145, + "step": 20700 + }, + { + "epoch": 3.733549666486389, + "grad_norm": 0.041404783725738525, + "learning_rate": 7.801521310506848e-05, + "loss": 0.0142, + "step": 20710 + }, + { + "epoch": 3.735352442761853, + "grad_norm": 0.042407602071762085, + "learning_rate": 7.799238306216994e-05, + "loss": 0.0153, + "step": 20720 + }, + { + "epoch": 3.7371552190373176, + "grad_norm": 0.03460095822811127, + "learning_rate": 7.796954451592448e-05, + "loss": 0.0127, + "step": 20730 + }, + { + "epoch": 3.7389579953127816, + "grad_norm": 0.035136278718709946, + "learning_rate": 7.794669747326992e-05, + "loss": 0.0138, + "step": 20740 + }, + { + "epoch": 3.740760771588246, + "grad_norm": 0.03732938691973686, + "learning_rate": 7.792384194114654e-05, + "loss": 0.0147, + "step": 20750 + }, + { + "epoch": 3.74256354786371, + "grad_norm": 0.03188316524028778, + "learning_rate": 7.790097792649729e-05, + "loss": 0.014, + "step": 20760 + }, + { + "epoch": 3.7443663241391745, + "grad_norm": 0.03245721012353897, + "learning_rate": 7.787810543626762e-05, + "loss": 0.0129, + "step": 20770 + }, + { + "epoch": 3.7461691004146385, + "grad_norm": 0.0357922725379467, + "learning_rate": 7.785522447740558e-05, + "loss": 0.0154, + "step": 20780 + }, + { + "epoch": 3.747971876690103, + "grad_norm": 0.035208266228437424, + "learning_rate": 7.783233505686182e-05, + "loss": 0.0141, + "step": 20790 + }, + { + "epoch": 3.749774652965567, + "grad_norm": 0.032315921038389206, + "learning_rate": 7.780943718158955e-05, + "loss": 0.0137, + "step": 20800 + }, + { + "epoch": 3.7515774292410313, + "grad_norm": 0.049639370292425156, + "learning_rate": 7.778653085854453e-05, + "loss": 0.0135, + "step": 20810 + }, + { + "epoch": 3.7533802055164953, + "grad_norm": 0.03865642100572586, + "learning_rate": 7.77636160946851e-05, + "loss": 0.0141, + "step": 20820 + }, + { + "epoch": 3.7551829817919598, + "grad_norm": 0.03565587103366852, + "learning_rate": 7.774069289697215e-05, + "loss": 0.0143, + "step": 20830 + }, + { + "epoch": 3.7569857580674237, + "grad_norm": 0.036383070051670074, + "learning_rate": 7.771776127236913e-05, + "loss": 0.0142, + "step": 20840 + }, + { + "epoch": 3.758788534342888, + "grad_norm": 0.05762959271669388, + "learning_rate": 7.769482122784212e-05, + "loss": 0.0144, + "step": 20850 + }, + { + "epoch": 3.760591310618352, + "grad_norm": 0.054863493889570236, + "learning_rate": 7.767187277035963e-05, + "loss": 0.0156, + "step": 20860 + }, + { + "epoch": 3.7623940868938166, + "grad_norm": 0.04642057046294212, + "learning_rate": 7.764891590689285e-05, + "loss": 0.0136, + "step": 20870 + }, + { + "epoch": 3.7641968631692806, + "grad_norm": 0.03445636108517647, + "learning_rate": 7.762595064441542e-05, + "loss": 0.014, + "step": 20880 + }, + { + "epoch": 3.765999639444745, + "grad_norm": 0.03290683031082153, + "learning_rate": 7.760297698990362e-05, + "loss": 0.0155, + "step": 20890 + }, + { + "epoch": 3.767802415720209, + "grad_norm": 0.04270542785525322, + "learning_rate": 7.757999495033623e-05, + "loss": 0.0158, + "step": 20900 + }, + { + "epoch": 3.7696051919956735, + "grad_norm": 0.03870920464396477, + "learning_rate": 7.755700453269456e-05, + "loss": 0.0144, + "step": 20910 + }, + { + "epoch": 3.7714079682711374, + "grad_norm": 0.03391006588935852, + "learning_rate": 7.753400574396254e-05, + "loss": 0.0146, + "step": 20920 + }, + { + "epoch": 3.773210744546602, + "grad_norm": 0.043533071875572205, + "learning_rate": 7.751099859112655e-05, + "loss": 0.0161, + "step": 20930 + }, + { + "epoch": 3.775013520822066, + "grad_norm": 0.03380133956670761, + "learning_rate": 7.748798308117557e-05, + "loss": 0.0135, + "step": 20940 + }, + { + "epoch": 3.7768162970975303, + "grad_norm": 0.029777873307466507, + "learning_rate": 7.746495922110112e-05, + "loss": 0.0141, + "step": 20950 + }, + { + "epoch": 3.7786190733729943, + "grad_norm": 0.02880527824163437, + "learning_rate": 7.744192701789723e-05, + "loss": 0.0144, + "step": 20960 + }, + { + "epoch": 3.7804218496484587, + "grad_norm": 0.03469603881239891, + "learning_rate": 7.741888647856046e-05, + "loss": 0.0138, + "step": 20970 + }, + { + "epoch": 3.7822246259239227, + "grad_norm": 0.034528348594903946, + "learning_rate": 7.739583761008994e-05, + "loss": 0.0136, + "step": 20980 + }, + { + "epoch": 3.784027402199387, + "grad_norm": 0.046810802072286606, + "learning_rate": 7.73727804194873e-05, + "loss": 0.0145, + "step": 20990 + }, + { + "epoch": 3.785830178474851, + "grad_norm": 0.04276028275489807, + "learning_rate": 7.734971491375671e-05, + "loss": 0.0159, + "step": 21000 + }, + { + "epoch": 3.7876329547503156, + "grad_norm": 0.04444903880357742, + "learning_rate": 7.732664109990485e-05, + "loss": 0.0151, + "step": 21010 + }, + { + "epoch": 3.7894357310257796, + "grad_norm": 0.03682350739836693, + "learning_rate": 7.730355898494095e-05, + "loss": 0.014, + "step": 21020 + }, + { + "epoch": 3.791238507301244, + "grad_norm": 0.07245072722434998, + "learning_rate": 7.728046857587673e-05, + "loss": 0.0161, + "step": 21030 + }, + { + "epoch": 3.793041283576708, + "grad_norm": 0.039790619164705276, + "learning_rate": 7.725736987972647e-05, + "loss": 0.015, + "step": 21040 + }, + { + "epoch": 3.7948440598521724, + "grad_norm": 0.038255952298641205, + "learning_rate": 7.723426290350691e-05, + "loss": 0.0149, + "step": 21050 + }, + { + "epoch": 3.7966468361276364, + "grad_norm": 0.05688577517867088, + "learning_rate": 7.721114765423736e-05, + "loss": 0.0143, + "step": 21060 + }, + { + "epoch": 3.798449612403101, + "grad_norm": 0.043230485171079636, + "learning_rate": 7.718802413893963e-05, + "loss": 0.0138, + "step": 21070 + }, + { + "epoch": 3.800252388678565, + "grad_norm": 0.03950490057468414, + "learning_rate": 7.716489236463802e-05, + "loss": 0.0151, + "step": 21080 + }, + { + "epoch": 3.8020551649540293, + "grad_norm": 0.03643106669187546, + "learning_rate": 7.714175233835936e-05, + "loss": 0.0128, + "step": 21090 + }, + { + "epoch": 3.8038579412294933, + "grad_norm": 0.032499685883522034, + "learning_rate": 7.711860406713299e-05, + "loss": 0.0144, + "step": 21100 + }, + { + "epoch": 3.8056607175049577, + "grad_norm": 0.03343929350376129, + "learning_rate": 7.70954475579907e-05, + "loss": 0.0135, + "step": 21110 + }, + { + "epoch": 3.8074634937804217, + "grad_norm": 0.04751326143741608, + "learning_rate": 7.707228281796688e-05, + "loss": 0.0133, + "step": 21120 + }, + { + "epoch": 3.809266270055886, + "grad_norm": 0.04745428264141083, + "learning_rate": 7.704910985409833e-05, + "loss": 0.0146, + "step": 21130 + }, + { + "epoch": 3.81106904633135, + "grad_norm": 0.03743601590394974, + "learning_rate": 7.702592867342439e-05, + "loss": 0.0131, + "step": 21140 + }, + { + "epoch": 3.8128718226068146, + "grad_norm": 0.036620549857616425, + "learning_rate": 7.700273928298691e-05, + "loss": 0.0146, + "step": 21150 + }, + { + "epoch": 3.8146745988822786, + "grad_norm": 0.0405939556658268, + "learning_rate": 7.697954168983021e-05, + "loss": 0.0144, + "step": 21160 + }, + { + "epoch": 3.816477375157743, + "grad_norm": 0.04257440194487572, + "learning_rate": 7.695633590100109e-05, + "loss": 0.0162, + "step": 21170 + }, + { + "epoch": 3.818280151433207, + "grad_norm": 0.05292891710996628, + "learning_rate": 7.693312192354886e-05, + "loss": 0.0142, + "step": 21180 + }, + { + "epoch": 3.8200829277086714, + "grad_norm": 0.04125504568219185, + "learning_rate": 7.690989976452532e-05, + "loss": 0.0153, + "step": 21190 + }, + { + "epoch": 3.8218857039841354, + "grad_norm": 0.04959399625658989, + "learning_rate": 7.688666943098475e-05, + "loss": 0.0137, + "step": 21200 + }, + { + "epoch": 3.8236884802596, + "grad_norm": 0.053475942462682724, + "learning_rate": 7.686343092998389e-05, + "loss": 0.0151, + "step": 21210 + }, + { + "epoch": 3.825491256535064, + "grad_norm": 0.05238006263971329, + "learning_rate": 7.684018426858202e-05, + "loss": 0.0143, + "step": 21220 + }, + { + "epoch": 3.8272940328105283, + "grad_norm": 0.0386255607008934, + "learning_rate": 7.681692945384084e-05, + "loss": 0.0146, + "step": 21230 + }, + { + "epoch": 3.8290968090859927, + "grad_norm": 0.03828830644488335, + "learning_rate": 7.679366649282456e-05, + "loss": 0.0154, + "step": 21240 + }, + { + "epoch": 3.8308995853614567, + "grad_norm": 0.035669535398483276, + "learning_rate": 7.677039539259983e-05, + "loss": 0.0147, + "step": 21250 + }, + { + "epoch": 3.8327023616369207, + "grad_norm": 0.02728315070271492, + "learning_rate": 7.674711616023581e-05, + "loss": 0.0142, + "step": 21260 + }, + { + "epoch": 3.834505137912385, + "grad_norm": 0.05024132505059242, + "learning_rate": 7.672382880280413e-05, + "loss": 0.0147, + "step": 21270 + }, + { + "epoch": 3.8363079141878496, + "grad_norm": 0.032903701066970825, + "learning_rate": 7.670053332737885e-05, + "loss": 0.014, + "step": 21280 + }, + { + "epoch": 3.8381106904633135, + "grad_norm": 0.04273885488510132, + "learning_rate": 7.667722974103654e-05, + "loss": 0.0144, + "step": 21290 + }, + { + "epoch": 3.8399134667387775, + "grad_norm": 0.049315180629491806, + "learning_rate": 7.66539180508562e-05, + "loss": 0.0138, + "step": 21300 + }, + { + "epoch": 3.841716243014242, + "grad_norm": 0.06252292543649673, + "learning_rate": 7.663059826391932e-05, + "loss": 0.0141, + "step": 21310 + }, + { + "epoch": 3.8435190192897064, + "grad_norm": 0.02612433210015297, + "learning_rate": 7.660727038730981e-05, + "loss": 0.0141, + "step": 21320 + }, + { + "epoch": 3.8453217955651704, + "grad_norm": 0.04726696014404297, + "learning_rate": 7.65839344281141e-05, + "loss": 0.0144, + "step": 21330 + }, + { + "epoch": 3.8471245718406344, + "grad_norm": 0.0374397337436676, + "learning_rate": 7.656059039342101e-05, + "loss": 0.0143, + "step": 21340 + }, + { + "epoch": 3.848927348116099, + "grad_norm": 0.03789371997117996, + "learning_rate": 7.653723829032187e-05, + "loss": 0.0149, + "step": 21350 + }, + { + "epoch": 3.8507301243915633, + "grad_norm": 0.041026607155799866, + "learning_rate": 7.65138781259104e-05, + "loss": 0.0145, + "step": 21360 + }, + { + "epoch": 3.8525329006670272, + "grad_norm": 0.04241277649998665, + "learning_rate": 7.649050990728279e-05, + "loss": 0.0154, + "step": 21370 + }, + { + "epoch": 3.8543356769424912, + "grad_norm": 0.043245624750852585, + "learning_rate": 7.646713364153774e-05, + "loss": 0.0138, + "step": 21380 + }, + { + "epoch": 3.8561384532179557, + "grad_norm": 0.04744493216276169, + "learning_rate": 7.64437493357763e-05, + "loss": 0.0155, + "step": 21390 + }, + { + "epoch": 3.85794122949342, + "grad_norm": 0.03769084811210632, + "learning_rate": 7.642035699710202e-05, + "loss": 0.014, + "step": 21400 + }, + { + "epoch": 3.859744005768884, + "grad_norm": 0.034416742622852325, + "learning_rate": 7.639695663262089e-05, + "loss": 0.0139, + "step": 21410 + }, + { + "epoch": 3.861546782044348, + "grad_norm": 0.051613930612802505, + "learning_rate": 7.637354824944128e-05, + "loss": 0.0149, + "step": 21420 + }, + { + "epoch": 3.8633495583198125, + "grad_norm": 0.03892466798424721, + "learning_rate": 7.635013185467408e-05, + "loss": 0.0146, + "step": 21430 + }, + { + "epoch": 3.865152334595277, + "grad_norm": 0.03753797709941864, + "learning_rate": 7.632670745543256e-05, + "loss": 0.013, + "step": 21440 + }, + { + "epoch": 3.866955110870741, + "grad_norm": 0.04707101732492447, + "learning_rate": 7.630327505883242e-05, + "loss": 0.0138, + "step": 21450 + }, + { + "epoch": 3.868757887146205, + "grad_norm": 0.030584491789340973, + "learning_rate": 7.627983467199182e-05, + "loss": 0.0133, + "step": 21460 + }, + { + "epoch": 3.8705606634216694, + "grad_norm": 0.039972029626369476, + "learning_rate": 7.625638630203132e-05, + "loss": 0.013, + "step": 21470 + }, + { + "epoch": 3.872363439697134, + "grad_norm": 0.04151765629649162, + "learning_rate": 7.623292995607394e-05, + "loss": 0.0141, + "step": 21480 + }, + { + "epoch": 3.874166215972598, + "grad_norm": 0.0492335706949234, + "learning_rate": 7.620946564124507e-05, + "loss": 0.0147, + "step": 21490 + }, + { + "epoch": 3.875968992248062, + "grad_norm": 0.030168794095516205, + "learning_rate": 7.618599336467256e-05, + "loss": 0.0156, + "step": 21500 + }, + { + "epoch": 3.8777717685235262, + "grad_norm": 0.038039032369852066, + "learning_rate": 7.616251313348666e-05, + "loss": 0.0142, + "step": 21510 + }, + { + "epoch": 3.8795745447989907, + "grad_norm": 0.03929544985294342, + "learning_rate": 7.613902495482005e-05, + "loss": 0.0145, + "step": 21520 + }, + { + "epoch": 3.8813773210744547, + "grad_norm": 0.03458899259567261, + "learning_rate": 7.611552883580784e-05, + "loss": 0.0135, + "step": 21530 + }, + { + "epoch": 3.8831800973499186, + "grad_norm": 0.03565128147602081, + "learning_rate": 7.609202478358748e-05, + "loss": 0.0134, + "step": 21540 + }, + { + "epoch": 3.884982873625383, + "grad_norm": 0.05891553312540054, + "learning_rate": 7.606851280529895e-05, + "loss": 0.0136, + "step": 21550 + }, + { + "epoch": 3.8867856499008475, + "grad_norm": 0.03950006142258644, + "learning_rate": 7.604499290808449e-05, + "loss": 0.0147, + "step": 21560 + }, + { + "epoch": 3.8885884261763115, + "grad_norm": 0.03491316735744476, + "learning_rate": 7.602146509908888e-05, + "loss": 0.0138, + "step": 21570 + }, + { + "epoch": 3.8903912024517755, + "grad_norm": 0.03439300134778023, + "learning_rate": 7.599792938545921e-05, + "loss": 0.0142, + "step": 21580 + }, + { + "epoch": 3.89219397872724, + "grad_norm": 0.037623994052410126, + "learning_rate": 7.597438577434506e-05, + "loss": 0.0149, + "step": 21590 + }, + { + "epoch": 3.8939967550027044, + "grad_norm": 0.032007526606321335, + "learning_rate": 7.595083427289831e-05, + "loss": 0.0139, + "step": 21600 + }, + { + "epoch": 3.8957995312781684, + "grad_norm": 0.04174208641052246, + "learning_rate": 7.59272748882733e-05, + "loss": 0.0149, + "step": 21610 + }, + { + "epoch": 3.8976023075536324, + "grad_norm": 0.03820587322115898, + "learning_rate": 7.590370762762675e-05, + "loss": 0.0148, + "step": 21620 + }, + { + "epoch": 3.899405083829097, + "grad_norm": 0.04396595433354378, + "learning_rate": 7.588013249811777e-05, + "loss": 0.0146, + "step": 21630 + }, + { + "epoch": 3.901207860104561, + "grad_norm": 0.036074329167604446, + "learning_rate": 7.585654950690786e-05, + "loss": 0.0142, + "step": 21640 + }, + { + "epoch": 3.903010636380025, + "grad_norm": 0.04310118407011032, + "learning_rate": 7.583295866116091e-05, + "loss": 0.0137, + "step": 21650 + }, + { + "epoch": 3.904813412655489, + "grad_norm": 0.046662814915180206, + "learning_rate": 7.580935996804321e-05, + "loss": 0.0129, + "step": 21660 + }, + { + "epoch": 3.9066161889309536, + "grad_norm": 0.028659194707870483, + "learning_rate": 7.57857534347234e-05, + "loss": 0.0133, + "step": 21670 + }, + { + "epoch": 3.908418965206418, + "grad_norm": 0.035923805087804794, + "learning_rate": 7.576213906837254e-05, + "loss": 0.0143, + "step": 21680 + }, + { + "epoch": 3.910221741481882, + "grad_norm": 0.032716307789087296, + "learning_rate": 7.573851687616403e-05, + "loss": 0.0137, + "step": 21690 + }, + { + "epoch": 3.912024517757346, + "grad_norm": 0.047444578260183334, + "learning_rate": 7.571488686527368e-05, + "loss": 0.0139, + "step": 21700 + }, + { + "epoch": 3.9138272940328105, + "grad_norm": 0.03697536140680313, + "learning_rate": 7.569124904287968e-05, + "loss": 0.014, + "step": 21710 + }, + { + "epoch": 3.915630070308275, + "grad_norm": 0.030477941036224365, + "learning_rate": 7.566760341616254e-05, + "loss": 0.0149, + "step": 21720 + }, + { + "epoch": 3.917432846583739, + "grad_norm": 0.04234546422958374, + "learning_rate": 7.564394999230519e-05, + "loss": 0.0135, + "step": 21730 + }, + { + "epoch": 3.919235622859203, + "grad_norm": 0.0433921292424202, + "learning_rate": 7.562028877849294e-05, + "loss": 0.0135, + "step": 21740 + }, + { + "epoch": 3.9210383991346673, + "grad_norm": 0.033926889300346375, + "learning_rate": 7.559661978191341e-05, + "loss": 0.013, + "step": 21750 + }, + { + "epoch": 3.9228411754101318, + "grad_norm": 0.03760771453380585, + "learning_rate": 7.557294300975664e-05, + "loss": 0.0145, + "step": 21760 + }, + { + "epoch": 3.9246439516855958, + "grad_norm": 0.04612240940332413, + "learning_rate": 7.554925846921499e-05, + "loss": 0.0137, + "step": 21770 + }, + { + "epoch": 3.9264467279610598, + "grad_norm": 0.03939961642026901, + "learning_rate": 7.552556616748321e-05, + "loss": 0.013, + "step": 21780 + }, + { + "epoch": 3.928249504236524, + "grad_norm": 0.03510933369398117, + "learning_rate": 7.550186611175838e-05, + "loss": 0.0132, + "step": 21790 + }, + { + "epoch": 3.9300522805119886, + "grad_norm": 0.042671557515859604, + "learning_rate": 7.547815830923998e-05, + "loss": 0.0133, + "step": 21800 + }, + { + "epoch": 3.9318550567874526, + "grad_norm": 0.04043770954012871, + "learning_rate": 7.54544427671298e-05, + "loss": 0.014, + "step": 21810 + }, + { + "epoch": 3.933657833062917, + "grad_norm": 0.03787252679467201, + "learning_rate": 7.543071949263198e-05, + "loss": 0.0137, + "step": 21820 + }, + { + "epoch": 3.935460609338381, + "grad_norm": 0.038864389061927795, + "learning_rate": 7.540698849295305e-05, + "loss": 0.0143, + "step": 21830 + }, + { + "epoch": 3.9372633856138455, + "grad_norm": 0.03555576130747795, + "learning_rate": 7.538324977530183e-05, + "loss": 0.0143, + "step": 21840 + }, + { + "epoch": 3.9390661618893095, + "grad_norm": 0.03683760017156601, + "learning_rate": 7.535950334688955e-05, + "loss": 0.0146, + "step": 21850 + }, + { + "epoch": 3.940868938164774, + "grad_norm": 0.03490287810564041, + "learning_rate": 7.533574921492972e-05, + "loss": 0.014, + "step": 21860 + }, + { + "epoch": 3.942671714440238, + "grad_norm": 0.043082982301712036, + "learning_rate": 7.531198738663824e-05, + "loss": 0.0137, + "step": 21870 + }, + { + "epoch": 3.9444744907157023, + "grad_norm": 0.04301317036151886, + "learning_rate": 7.528821786923333e-05, + "loss": 0.0137, + "step": 21880 + }, + { + "epoch": 3.9462772669911663, + "grad_norm": 0.04977113753557205, + "learning_rate": 7.52644406699355e-05, + "loss": 0.0159, + "step": 21890 + }, + { + "epoch": 3.9480800432666308, + "grad_norm": 0.04279617220163345, + "learning_rate": 7.524065579596766e-05, + "loss": 0.0132, + "step": 21900 + }, + { + "epoch": 3.9498828195420947, + "grad_norm": 0.040784817188978195, + "learning_rate": 7.521686325455506e-05, + "loss": 0.0133, + "step": 21910 + }, + { + "epoch": 3.951685595817559, + "grad_norm": 0.03131188824772835, + "learning_rate": 7.51930630529252e-05, + "loss": 0.0143, + "step": 21920 + }, + { + "epoch": 3.953488372093023, + "grad_norm": 0.03541479632258415, + "learning_rate": 7.516925519830797e-05, + "loss": 0.0132, + "step": 21930 + }, + { + "epoch": 3.9552911483684876, + "grad_norm": 0.03323213383555412, + "learning_rate": 7.514543969793557e-05, + "loss": 0.0142, + "step": 21940 + }, + { + "epoch": 3.9570939246439516, + "grad_norm": 0.04218602180480957, + "learning_rate": 7.512161655904251e-05, + "loss": 0.0151, + "step": 21950 + }, + { + "epoch": 3.958896700919416, + "grad_norm": 0.03165538236498833, + "learning_rate": 7.509778578886563e-05, + "loss": 0.0144, + "step": 21960 + }, + { + "epoch": 3.96069947719488, + "grad_norm": 0.04161103442311287, + "learning_rate": 7.507394739464412e-05, + "loss": 0.0125, + "step": 21970 + }, + { + "epoch": 3.9625022534703445, + "grad_norm": 0.042151253670454025, + "learning_rate": 7.50501013836194e-05, + "loss": 0.0136, + "step": 21980 + }, + { + "epoch": 3.9643050297458085, + "grad_norm": 0.037161558866500854, + "learning_rate": 7.50262477630353e-05, + "loss": 0.0133, + "step": 21990 + }, + { + "epoch": 3.966107806021273, + "grad_norm": 0.031059609726071358, + "learning_rate": 7.500238654013794e-05, + "loss": 0.0134, + "step": 22000 + }, + { + "epoch": 3.967910582296737, + "grad_norm": 0.035748086869716644, + "learning_rate": 7.497851772217566e-05, + "loss": 0.016, + "step": 22010 + }, + { + "epoch": 3.9697133585722013, + "grad_norm": 0.05742967873811722, + "learning_rate": 7.495464131639924e-05, + "loss": 0.0139, + "step": 22020 + }, + { + "epoch": 3.9715161348476653, + "grad_norm": 0.034976325929164886, + "learning_rate": 7.493075733006166e-05, + "loss": 0.015, + "step": 22030 + }, + { + "epoch": 3.9733189111231297, + "grad_norm": 0.0505264513194561, + "learning_rate": 7.490686577041828e-05, + "loss": 0.015, + "step": 22040 + }, + { + "epoch": 3.9751216873985937, + "grad_norm": 0.052405256778001785, + "learning_rate": 7.488296664472668e-05, + "loss": 0.0142, + "step": 22050 + }, + { + "epoch": 3.976924463674058, + "grad_norm": 0.028941497206687927, + "learning_rate": 7.485905996024682e-05, + "loss": 0.0127, + "step": 22060 + }, + { + "epoch": 3.978727239949522, + "grad_norm": 0.04543226212263107, + "learning_rate": 7.483514572424093e-05, + "loss": 0.0136, + "step": 22070 + }, + { + "epoch": 3.9805300162249866, + "grad_norm": 0.03413500264286995, + "learning_rate": 7.481122394397349e-05, + "loss": 0.0138, + "step": 22080 + }, + { + "epoch": 3.9823327925004506, + "grad_norm": 0.03391009941697121, + "learning_rate": 7.478729462671131e-05, + "loss": 0.014, + "step": 22090 + }, + { + "epoch": 3.984135568775915, + "grad_norm": 0.05387495830655098, + "learning_rate": 7.47633577797235e-05, + "loss": 0.0134, + "step": 22100 + }, + { + "epoch": 3.985938345051379, + "grad_norm": 0.05054774507880211, + "learning_rate": 7.473941341028144e-05, + "loss": 0.0148, + "step": 22110 + }, + { + "epoch": 3.9877411213268434, + "grad_norm": 0.02909245155751705, + "learning_rate": 7.471546152565879e-05, + "loss": 0.0136, + "step": 22120 + }, + { + "epoch": 3.9895438976023074, + "grad_norm": 0.04121488332748413, + "learning_rate": 7.46915021331315e-05, + "loss": 0.0148, + "step": 22130 + }, + { + "epoch": 3.991346673877772, + "grad_norm": 0.04284793138504028, + "learning_rate": 7.466753523997778e-05, + "loss": 0.0135, + "step": 22140 + }, + { + "epoch": 3.993149450153236, + "grad_norm": 0.03672342002391815, + "learning_rate": 7.464356085347819e-05, + "loss": 0.0141, + "step": 22150 + }, + { + "epoch": 3.9949522264287003, + "grad_norm": 0.03719167411327362, + "learning_rate": 7.461957898091548e-05, + "loss": 0.0139, + "step": 22160 + }, + { + "epoch": 3.9967550027041643, + "grad_norm": 0.03357776254415512, + "learning_rate": 7.459558962957473e-05, + "loss": 0.0142, + "step": 22170 + }, + { + "epoch": 3.9985577789796287, + "grad_norm": 0.02791198156774044, + "learning_rate": 7.457159280674326e-05, + "loss": 0.0139, + "step": 22180 + }, + { + "epoch": 4.000360555255093, + "grad_norm": 0.03303958475589752, + "learning_rate": 7.454758851971066e-05, + "loss": 0.0132, + "step": 22190 + }, + { + "epoch": 4.002163331530557, + "grad_norm": 0.039240967482328415, + "learning_rate": 7.45235767757688e-05, + "loss": 0.0133, + "step": 22200 + }, + { + "epoch": 4.003966107806021, + "grad_norm": 0.04239935055375099, + "learning_rate": 7.449955758221183e-05, + "loss": 0.0136, + "step": 22210 + }, + { + "epoch": 4.005768884081485, + "grad_norm": 0.031132958829402924, + "learning_rate": 7.447553094633615e-05, + "loss": 0.0141, + "step": 22220 + }, + { + "epoch": 4.00757166035695, + "grad_norm": 0.039666034281253815, + "learning_rate": 7.445149687544039e-05, + "loss": 0.0129, + "step": 22230 + }, + { + "epoch": 4.009374436632414, + "grad_norm": 0.03486225754022598, + "learning_rate": 7.44274553768255e-05, + "loss": 0.0124, + "step": 22240 + }, + { + "epoch": 4.011177212907878, + "grad_norm": 0.04990850016474724, + "learning_rate": 7.440340645779464e-05, + "loss": 0.0136, + "step": 22250 + }, + { + "epoch": 4.012979989183342, + "grad_norm": 0.04421483725309372, + "learning_rate": 7.437935012565322e-05, + "loss": 0.0134, + "step": 22260 + }, + { + "epoch": 4.014782765458807, + "grad_norm": 0.023700745776295662, + "learning_rate": 7.435528638770893e-05, + "loss": 0.013, + "step": 22270 + }, + { + "epoch": 4.016585541734271, + "grad_norm": 0.0390113927423954, + "learning_rate": 7.433121525127171e-05, + "loss": 0.014, + "step": 22280 + }, + { + "epoch": 4.018388318009735, + "grad_norm": 0.034398555755615234, + "learning_rate": 7.430713672365371e-05, + "loss": 0.0139, + "step": 22290 + }, + { + "epoch": 4.020191094285199, + "grad_norm": 0.05531187355518341, + "learning_rate": 7.428305081216938e-05, + "loss": 0.0137, + "step": 22300 + }, + { + "epoch": 4.021993870560664, + "grad_norm": 0.04105549678206444, + "learning_rate": 7.425895752413536e-05, + "loss": 0.0134, + "step": 22310 + }, + { + "epoch": 4.023796646836128, + "grad_norm": 0.05799393728375435, + "learning_rate": 7.423485686687057e-05, + "loss": 0.0137, + "step": 22320 + }, + { + "epoch": 4.025599423111592, + "grad_norm": 0.03267175704240799, + "learning_rate": 7.421074884769616e-05, + "loss": 0.0134, + "step": 22330 + }, + { + "epoch": 4.027402199387056, + "grad_norm": 0.043391238898038864, + "learning_rate": 7.418663347393548e-05, + "loss": 0.0141, + "step": 22340 + }, + { + "epoch": 4.029204975662521, + "grad_norm": 0.03020276129245758, + "learning_rate": 7.416251075291418e-05, + "loss": 0.0146, + "step": 22350 + }, + { + "epoch": 4.0310077519379846, + "grad_norm": 0.03513885289430618, + "learning_rate": 7.413838069196007e-05, + "loss": 0.0131, + "step": 22360 + }, + { + "epoch": 4.0328105282134485, + "grad_norm": 0.0435139462351799, + "learning_rate": 7.411424329840324e-05, + "loss": 0.0148, + "step": 22370 + }, + { + "epoch": 4.0346133044889125, + "grad_norm": 0.04793265089392662, + "learning_rate": 7.409009857957601e-05, + "loss": 0.0139, + "step": 22380 + }, + { + "epoch": 4.036416080764377, + "grad_norm": 0.03465655818581581, + "learning_rate": 7.40659465428129e-05, + "loss": 0.0147, + "step": 22390 + }, + { + "epoch": 4.038218857039841, + "grad_norm": 0.027396971359848976, + "learning_rate": 7.404178719545063e-05, + "loss": 0.0139, + "step": 22400 + }, + { + "epoch": 4.040021633315305, + "grad_norm": 0.02715551294386387, + "learning_rate": 7.401762054482822e-05, + "loss": 0.014, + "step": 22410 + }, + { + "epoch": 4.041824409590769, + "grad_norm": 0.036325473338365555, + "learning_rate": 7.39934465982868e-05, + "loss": 0.0133, + "step": 22420 + }, + { + "epoch": 4.043627185866234, + "grad_norm": 0.04957028850913048, + "learning_rate": 7.396926536316984e-05, + "loss": 0.014, + "step": 22430 + }, + { + "epoch": 4.045429962141698, + "grad_norm": 0.03010491654276848, + "learning_rate": 7.394507684682293e-05, + "loss": 0.0135, + "step": 22440 + }, + { + "epoch": 4.047232738417162, + "grad_norm": 0.033221032470464706, + "learning_rate": 7.392088105659393e-05, + "loss": 0.0131, + "step": 22450 + }, + { + "epoch": 4.049035514692626, + "grad_norm": 0.03006628155708313, + "learning_rate": 7.389667799983284e-05, + "loss": 0.0132, + "step": 22460 + }, + { + "epoch": 4.050838290968091, + "grad_norm": 0.04060186818242073, + "learning_rate": 7.387246768389193e-05, + "loss": 0.0128, + "step": 22470 + }, + { + "epoch": 4.052641067243555, + "grad_norm": 0.03714563325047493, + "learning_rate": 7.384825011612563e-05, + "loss": 0.0165, + "step": 22480 + }, + { + "epoch": 4.054443843519019, + "grad_norm": 0.039576925337314606, + "learning_rate": 7.382402530389066e-05, + "loss": 0.014, + "step": 22490 + }, + { + "epoch": 4.056246619794483, + "grad_norm": 0.03522757440805435, + "learning_rate": 7.379979325454582e-05, + "loss": 0.0136, + "step": 22500 + }, + { + "epoch": 4.058049396069948, + "grad_norm": 0.0423310250043869, + "learning_rate": 7.37755539754522e-05, + "loss": 0.0121, + "step": 22510 + }, + { + "epoch": 4.059852172345412, + "grad_norm": 0.04022989049553871, + "learning_rate": 7.375130747397302e-05, + "loss": 0.0139, + "step": 22520 + }, + { + "epoch": 4.061654948620876, + "grad_norm": 0.030878273770213127, + "learning_rate": 7.372705375747377e-05, + "loss": 0.0136, + "step": 22530 + }, + { + "epoch": 4.06345772489634, + "grad_norm": 0.03871462121605873, + "learning_rate": 7.370279283332205e-05, + "loss": 0.0149, + "step": 22540 + }, + { + "epoch": 4.065260501171805, + "grad_norm": 0.033704329282045364, + "learning_rate": 7.36785247088877e-05, + "loss": 0.0138, + "step": 22550 + }, + { + "epoch": 4.067063277447269, + "grad_norm": 0.030195321887731552, + "learning_rate": 7.365424939154275e-05, + "loss": 0.0137, + "step": 22560 + }, + { + "epoch": 4.068866053722733, + "grad_norm": 0.03183944523334503, + "learning_rate": 7.362996688866138e-05, + "loss": 0.0131, + "step": 22570 + }, + { + "epoch": 4.070668829998197, + "grad_norm": 0.030865347012877464, + "learning_rate": 7.360567720761999e-05, + "loss": 0.0131, + "step": 22580 + }, + { + "epoch": 4.072471606273662, + "grad_norm": 0.047139350324869156, + "learning_rate": 7.358138035579711e-05, + "loss": 0.0134, + "step": 22590 + }, + { + "epoch": 4.074274382549126, + "grad_norm": 0.053456466645002365, + "learning_rate": 7.355707634057354e-05, + "loss": 0.0136, + "step": 22600 + }, + { + "epoch": 4.07607715882459, + "grad_norm": 0.04839158430695534, + "learning_rate": 7.353276516933215e-05, + "loss": 0.0145, + "step": 22610 + }, + { + "epoch": 4.077879935100054, + "grad_norm": 0.04287507385015488, + "learning_rate": 7.350844684945806e-05, + "loss": 0.0141, + "step": 22620 + }, + { + "epoch": 4.0796827113755185, + "grad_norm": 0.023447690531611443, + "learning_rate": 7.348412138833851e-05, + "loss": 0.0132, + "step": 22630 + }, + { + "epoch": 4.0814854876509825, + "grad_norm": 0.03515647351741791, + "learning_rate": 7.345978879336295e-05, + "loss": 0.0142, + "step": 22640 + }, + { + "epoch": 4.0832882639264465, + "grad_norm": 0.03605928272008896, + "learning_rate": 7.343544907192296e-05, + "loss": 0.0151, + "step": 22650 + }, + { + "epoch": 4.085091040201911, + "grad_norm": 0.044174376875162125, + "learning_rate": 7.341110223141235e-05, + "loss": 0.0156, + "step": 22660 + }, + { + "epoch": 4.086893816477375, + "grad_norm": 0.0559733621776104, + "learning_rate": 7.3386748279227e-05, + "loss": 0.0146, + "step": 22670 + }, + { + "epoch": 4.088696592752839, + "grad_norm": 0.03258093446493149, + "learning_rate": 7.336238722276501e-05, + "loss": 0.0143, + "step": 22680 + }, + { + "epoch": 4.090499369028303, + "grad_norm": 0.032111089676618576, + "learning_rate": 7.333801906942663e-05, + "loss": 0.0139, + "step": 22690 + }, + { + "epoch": 4.092302145303768, + "grad_norm": 0.030940134078264236, + "learning_rate": 7.331364382661428e-05, + "loss": 0.0132, + "step": 22700 + }, + { + "epoch": 4.094104921579232, + "grad_norm": 0.03804563358426094, + "learning_rate": 7.328926150173248e-05, + "loss": 0.0131, + "step": 22710 + }, + { + "epoch": 4.095907697854696, + "grad_norm": 0.033908527344465256, + "learning_rate": 7.326487210218795e-05, + "loss": 0.0146, + "step": 22720 + }, + { + "epoch": 4.09771047413016, + "grad_norm": 0.04434509202837944, + "learning_rate": 7.324047563538955e-05, + "loss": 0.013, + "step": 22730 + }, + { + "epoch": 4.099513250405625, + "grad_norm": 0.04704449325799942, + "learning_rate": 7.321607210874828e-05, + "loss": 0.0138, + "step": 22740 + }, + { + "epoch": 4.101316026681089, + "grad_norm": 0.0517672561109066, + "learning_rate": 7.31916615296773e-05, + "loss": 0.0135, + "step": 22750 + }, + { + "epoch": 4.103118802956553, + "grad_norm": 0.04142811521887779, + "learning_rate": 7.316724390559188e-05, + "loss": 0.0128, + "step": 22760 + }, + { + "epoch": 4.104921579232017, + "grad_norm": 0.031558796763420105, + "learning_rate": 7.314281924390946e-05, + "loss": 0.0127, + "step": 22770 + }, + { + "epoch": 4.106724355507482, + "grad_norm": 0.03703946992754936, + "learning_rate": 7.311838755204959e-05, + "loss": 0.0137, + "step": 22780 + }, + { + "epoch": 4.108527131782946, + "grad_norm": 0.032142702490091324, + "learning_rate": 7.3093948837434e-05, + "loss": 0.0133, + "step": 22790 + }, + { + "epoch": 4.11032990805841, + "grad_norm": 0.03694407269358635, + "learning_rate": 7.306950310748651e-05, + "loss": 0.0134, + "step": 22800 + }, + { + "epoch": 4.112132684333874, + "grad_norm": 0.03176625072956085, + "learning_rate": 7.304505036963311e-05, + "loss": 0.0137, + "step": 22810 + }, + { + "epoch": 4.113935460609339, + "grad_norm": 0.035751499235630035, + "learning_rate": 7.302059063130186e-05, + "loss": 0.0132, + "step": 22820 + }, + { + "epoch": 4.115738236884803, + "grad_norm": 0.02915801852941513, + "learning_rate": 7.2996123899923e-05, + "loss": 0.0134, + "step": 22830 + }, + { + "epoch": 4.117541013160267, + "grad_norm": 0.02822216972708702, + "learning_rate": 7.297165018292886e-05, + "loss": 0.0135, + "step": 22840 + }, + { + "epoch": 4.119343789435731, + "grad_norm": 0.03226885944604874, + "learning_rate": 7.294716948775396e-05, + "loss": 0.0138, + "step": 22850 + }, + { + "epoch": 4.121146565711196, + "grad_norm": 0.042486947029829025, + "learning_rate": 7.292268182183484e-05, + "loss": 0.0129, + "step": 22860 + }, + { + "epoch": 4.12294934198666, + "grad_norm": 0.040508706122636795, + "learning_rate": 7.28981871926102e-05, + "loss": 0.0141, + "step": 22870 + }, + { + "epoch": 4.124752118262124, + "grad_norm": 0.05572642385959625, + "learning_rate": 7.28736856075209e-05, + "loss": 0.0152, + "step": 22880 + }, + { + "epoch": 4.126554894537588, + "grad_norm": 0.033819980919361115, + "learning_rate": 7.284917707400985e-05, + "loss": 0.0145, + "step": 22890 + }, + { + "epoch": 4.1283576708130525, + "grad_norm": 0.030018070712685585, + "learning_rate": 7.282466159952212e-05, + "loss": 0.0131, + "step": 22900 + }, + { + "epoch": 4.1301604470885165, + "grad_norm": 0.03599739819765091, + "learning_rate": 7.280013919150483e-05, + "loss": 0.0127, + "step": 22910 + }, + { + "epoch": 4.1319632233639805, + "grad_norm": 0.061725202947854996, + "learning_rate": 7.277560985740728e-05, + "loss": 0.0132, + "step": 22920 + }, + { + "epoch": 4.1337659996394445, + "grad_norm": 0.054199568927288055, + "learning_rate": 7.275107360468079e-05, + "loss": 0.0134, + "step": 22930 + }, + { + "epoch": 4.135568775914909, + "grad_norm": 0.038784127682447433, + "learning_rate": 7.272653044077885e-05, + "loss": 0.0152, + "step": 22940 + }, + { + "epoch": 4.137371552190373, + "grad_norm": 0.06466848403215408, + "learning_rate": 7.270198037315703e-05, + "loss": 0.014, + "step": 22950 + }, + { + "epoch": 4.139174328465837, + "grad_norm": 0.032977573573589325, + "learning_rate": 7.267742340927297e-05, + "loss": 0.0145, + "step": 22960 + }, + { + "epoch": 4.140977104741301, + "grad_norm": 0.0348464660346508, + "learning_rate": 7.265285955658645e-05, + "loss": 0.0132, + "step": 22970 + }, + { + "epoch": 4.142779881016766, + "grad_norm": 0.027925482019782066, + "learning_rate": 7.26282888225593e-05, + "loss": 0.0126, + "step": 22980 + }, + { + "epoch": 4.14458265729223, + "grad_norm": 0.03869510814547539, + "learning_rate": 7.260371121465548e-05, + "loss": 0.0126, + "step": 22990 + }, + { + "epoch": 4.146385433567694, + "grad_norm": 0.03665947541594505, + "learning_rate": 7.2579126740341e-05, + "loss": 0.0135, + "step": 23000 + }, + { + "epoch": 4.148188209843158, + "grad_norm": 0.042169876396656036, + "learning_rate": 7.2554535407084e-05, + "loss": 0.0141, + "step": 23010 + }, + { + "epoch": 4.149990986118623, + "grad_norm": 0.030814573168754578, + "learning_rate": 7.252993722235464e-05, + "loss": 0.0146, + "step": 23020 + }, + { + "epoch": 4.151793762394087, + "grad_norm": 0.03737280145287514, + "learning_rate": 7.250533219362523e-05, + "loss": 0.0131, + "step": 23030 + }, + { + "epoch": 4.153596538669551, + "grad_norm": 0.028718456625938416, + "learning_rate": 7.248072032837012e-05, + "loss": 0.0131, + "step": 23040 + }, + { + "epoch": 4.155399314945015, + "grad_norm": 0.035821519792079926, + "learning_rate": 7.245610163406575e-05, + "loss": 0.0133, + "step": 23050 + }, + { + "epoch": 4.15720209122048, + "grad_norm": 0.02674536034464836, + "learning_rate": 7.243147611819061e-05, + "loss": 0.0128, + "step": 23060 + }, + { + "epoch": 4.159004867495944, + "grad_norm": 0.045630019158124924, + "learning_rate": 7.240684378822531e-05, + "loss": 0.0143, + "step": 23070 + }, + { + "epoch": 4.160807643771408, + "grad_norm": 0.043843742460012436, + "learning_rate": 7.238220465165248e-05, + "loss": 0.0149, + "step": 23080 + }, + { + "epoch": 4.162610420046872, + "grad_norm": 0.03368871659040451, + "learning_rate": 7.235755871595684e-05, + "loss": 0.0138, + "step": 23090 + }, + { + "epoch": 4.164413196322337, + "grad_norm": 0.04520809277892113, + "learning_rate": 7.233290598862517e-05, + "loss": 0.0138, + "step": 23100 + }, + { + "epoch": 4.166215972597801, + "grad_norm": 0.040257591754198074, + "learning_rate": 7.230824647714635e-05, + "loss": 0.0133, + "step": 23110 + }, + { + "epoch": 4.168018748873265, + "grad_norm": 0.03836894407868385, + "learning_rate": 7.228358018901124e-05, + "loss": 0.0137, + "step": 23120 + }, + { + "epoch": 4.169821525148729, + "grad_norm": 0.03831867128610611, + "learning_rate": 7.225890713171286e-05, + "loss": 0.0122, + "step": 23130 + }, + { + "epoch": 4.171624301424194, + "grad_norm": 0.04877988249063492, + "learning_rate": 7.223422731274618e-05, + "loss": 0.0153, + "step": 23140 + }, + { + "epoch": 4.173427077699658, + "grad_norm": 0.04745005443692207, + "learning_rate": 7.220954073960832e-05, + "loss": 0.0133, + "step": 23150 + }, + { + "epoch": 4.175229853975122, + "grad_norm": 0.043991994112730026, + "learning_rate": 7.218484741979838e-05, + "loss": 0.0138, + "step": 23160 + }, + { + "epoch": 4.177032630250586, + "grad_norm": 0.03735299035906792, + "learning_rate": 7.216014736081756e-05, + "loss": 0.0142, + "step": 23170 + }, + { + "epoch": 4.1788354065260505, + "grad_norm": 0.03316495195031166, + "learning_rate": 7.213544057016906e-05, + "loss": 0.0126, + "step": 23180 + }, + { + "epoch": 4.1806381828015144, + "grad_norm": 0.031231414526700974, + "learning_rate": 7.211072705535819e-05, + "loss": 0.0128, + "step": 23190 + }, + { + "epoch": 4.182440959076978, + "grad_norm": 0.028650620952248573, + "learning_rate": 7.208600682389224e-05, + "loss": 0.013, + "step": 23200 + }, + { + "epoch": 4.184243735352442, + "grad_norm": 0.02910509705543518, + "learning_rate": 7.206127988328055e-05, + "loss": 0.0127, + "step": 23210 + }, + { + "epoch": 4.186046511627907, + "grad_norm": 0.04619140177965164, + "learning_rate": 7.203654624103453e-05, + "loss": 0.0145, + "step": 23220 + }, + { + "epoch": 4.187849287903371, + "grad_norm": 0.03180632367730141, + "learning_rate": 7.201180590466761e-05, + "loss": 0.0134, + "step": 23230 + }, + { + "epoch": 4.189652064178835, + "grad_norm": 0.031323306262493134, + "learning_rate": 7.198705888169523e-05, + "loss": 0.0156, + "step": 23240 + }, + { + "epoch": 4.191454840454299, + "grad_norm": 0.0353715755045414, + "learning_rate": 7.196230517963491e-05, + "loss": 0.0128, + "step": 23250 + }, + { + "epoch": 4.193257616729764, + "grad_norm": 0.029487207531929016, + "learning_rate": 7.193754480600615e-05, + "loss": 0.0135, + "step": 23260 + }, + { + "epoch": 4.195060393005228, + "grad_norm": 0.05338752269744873, + "learning_rate": 7.19127777683305e-05, + "loss": 0.0154, + "step": 23270 + }, + { + "epoch": 4.196863169280692, + "grad_norm": 0.041044559329748154, + "learning_rate": 7.188800407413156e-05, + "loss": 0.0132, + "step": 23280 + }, + { + "epoch": 4.198665945556156, + "grad_norm": 0.046917278319597244, + "learning_rate": 7.186322373093489e-05, + "loss": 0.0155, + "step": 23290 + }, + { + "epoch": 4.200468721831621, + "grad_norm": 0.034714143723249435, + "learning_rate": 7.18384367462681e-05, + "loss": 0.0129, + "step": 23300 + }, + { + "epoch": 4.202271498107085, + "grad_norm": 0.05646158754825592, + "learning_rate": 7.181364312766085e-05, + "loss": 0.0132, + "step": 23310 + }, + { + "epoch": 4.204074274382549, + "grad_norm": 0.043013107031583786, + "learning_rate": 7.178884288264477e-05, + "loss": 0.0131, + "step": 23320 + }, + { + "epoch": 4.205877050658013, + "grad_norm": 0.04811820387840271, + "learning_rate": 7.176403601875353e-05, + "loss": 0.0149, + "step": 23330 + }, + { + "epoch": 4.207679826933478, + "grad_norm": 0.04444929212331772, + "learning_rate": 7.173922254352279e-05, + "loss": 0.0141, + "step": 23340 + }, + { + "epoch": 4.209482603208942, + "grad_norm": 0.039410550147295, + "learning_rate": 7.171440246449024e-05, + "loss": 0.0146, + "step": 23350 + }, + { + "epoch": 4.211285379484406, + "grad_norm": 0.039911624044179916, + "learning_rate": 7.168957578919555e-05, + "loss": 0.0145, + "step": 23360 + }, + { + "epoch": 4.21308815575987, + "grad_norm": 0.03958724066615105, + "learning_rate": 7.16647425251804e-05, + "loss": 0.014, + "step": 23370 + }, + { + "epoch": 4.214890932035335, + "grad_norm": 0.056061238050460815, + "learning_rate": 7.163990267998852e-05, + "loss": 0.014, + "step": 23380 + }, + { + "epoch": 4.216693708310799, + "grad_norm": 0.03773448243737221, + "learning_rate": 7.161505626116556e-05, + "loss": 0.014, + "step": 23390 + }, + { + "epoch": 4.218496484586263, + "grad_norm": 0.04029657691717148, + "learning_rate": 7.159020327625923e-05, + "loss": 0.015, + "step": 23400 + }, + { + "epoch": 4.220299260861727, + "grad_norm": 0.0447051003575325, + "learning_rate": 7.15653437328192e-05, + "loss": 0.0139, + "step": 23410 + }, + { + "epoch": 4.222102037137192, + "grad_norm": 0.041529905050992966, + "learning_rate": 7.154047763839713e-05, + "loss": 0.0141, + "step": 23420 + }, + { + "epoch": 4.223904813412656, + "grad_norm": 0.045106153935194016, + "learning_rate": 7.15156050005467e-05, + "loss": 0.0147, + "step": 23430 + }, + { + "epoch": 4.2257075896881195, + "grad_norm": 0.037049345672130585, + "learning_rate": 7.149072582682357e-05, + "loss": 0.0133, + "step": 23440 + }, + { + "epoch": 4.2275103659635835, + "grad_norm": 0.04189861938357353, + "learning_rate": 7.146584012478535e-05, + "loss": 0.0149, + "step": 23450 + }, + { + "epoch": 4.229313142239048, + "grad_norm": 0.03496585786342621, + "learning_rate": 7.144094790199169e-05, + "loss": 0.0134, + "step": 23460 + }, + { + "epoch": 4.231115918514512, + "grad_norm": 0.04067397490143776, + "learning_rate": 7.141604916600415e-05, + "loss": 0.0146, + "step": 23470 + }, + { + "epoch": 4.232918694789976, + "grad_norm": 0.03375764936208725, + "learning_rate": 7.139114392438635e-05, + "loss": 0.0136, + "step": 23480 + }, + { + "epoch": 4.23472147106544, + "grad_norm": 0.05133841186761856, + "learning_rate": 7.136623218470382e-05, + "loss": 0.0132, + "step": 23490 + }, + { + "epoch": 4.236524247340905, + "grad_norm": 0.035877346992492676, + "learning_rate": 7.13413139545241e-05, + "loss": 0.0117, + "step": 23500 + }, + { + "epoch": 4.238327023616369, + "grad_norm": 0.03918047621846199, + "learning_rate": 7.131638924141668e-05, + "loss": 0.0143, + "step": 23510 + }, + { + "epoch": 4.240129799891833, + "grad_norm": 0.035442277789115906, + "learning_rate": 7.129145805295304e-05, + "loss": 0.0129, + "step": 23520 + }, + { + "epoch": 4.241932576167297, + "grad_norm": 0.038123827427625656, + "learning_rate": 7.126652039670661e-05, + "loss": 0.0139, + "step": 23530 + }, + { + "epoch": 4.243735352442762, + "grad_norm": 0.03399790823459625, + "learning_rate": 7.124157628025278e-05, + "loss": 0.0142, + "step": 23540 + }, + { + "epoch": 4.245538128718226, + "grad_norm": 0.03715805336833, + "learning_rate": 7.121662571116894e-05, + "loss": 0.0153, + "step": 23550 + }, + { + "epoch": 4.24734090499369, + "grad_norm": 0.03900928795337677, + "learning_rate": 7.119166869703441e-05, + "loss": 0.0133, + "step": 23560 + }, + { + "epoch": 4.249143681269154, + "grad_norm": 0.03971189260482788, + "learning_rate": 7.116670524543044e-05, + "loss": 0.014, + "step": 23570 + }, + { + "epoch": 4.250946457544619, + "grad_norm": 0.026273401454091072, + "learning_rate": 7.114173536394032e-05, + "loss": 0.0125, + "step": 23580 + }, + { + "epoch": 4.252749233820083, + "grad_norm": 0.04847210273146629, + "learning_rate": 7.111675906014917e-05, + "loss": 0.0131, + "step": 23590 + }, + { + "epoch": 4.254552010095547, + "grad_norm": 0.04952195659279823, + "learning_rate": 7.109177634164421e-05, + "loss": 0.0145, + "step": 23600 + }, + { + "epoch": 4.256354786371011, + "grad_norm": 0.032949626445770264, + "learning_rate": 7.106678721601449e-05, + "loss": 0.0139, + "step": 23610 + }, + { + "epoch": 4.258157562646476, + "grad_norm": 0.034873995929956436, + "learning_rate": 7.104179169085103e-05, + "loss": 0.0126, + "step": 23620 + }, + { + "epoch": 4.25996033892194, + "grad_norm": 0.04450983926653862, + "learning_rate": 7.101678977374683e-05, + "loss": 0.0146, + "step": 23630 + }, + { + "epoch": 4.261763115197404, + "grad_norm": 0.02221265248954296, + "learning_rate": 7.099178147229685e-05, + "loss": 0.0127, + "step": 23640 + }, + { + "epoch": 4.263565891472869, + "grad_norm": 0.034467991441488266, + "learning_rate": 7.096676679409789e-05, + "loss": 0.013, + "step": 23650 + }, + { + "epoch": 4.265368667748333, + "grad_norm": 0.0356123149394989, + "learning_rate": 7.094174574674877e-05, + "loss": 0.0131, + "step": 23660 + }, + { + "epoch": 4.267171444023797, + "grad_norm": 0.030092235654592514, + "learning_rate": 7.091671833785025e-05, + "loss": 0.0134, + "step": 23670 + }, + { + "epoch": 4.268974220299261, + "grad_norm": 0.0496356301009655, + "learning_rate": 7.089168457500493e-05, + "loss": 0.0145, + "step": 23680 + }, + { + "epoch": 4.270776996574725, + "grad_norm": 0.033346716314554214, + "learning_rate": 7.086664446581747e-05, + "loss": 0.013, + "step": 23690 + }, + { + "epoch": 4.2725797728501895, + "grad_norm": 0.026919804513454437, + "learning_rate": 7.084159801789438e-05, + "loss": 0.0138, + "step": 23700 + }, + { + "epoch": 4.2743825491256535, + "grad_norm": 0.034788746386766434, + "learning_rate": 7.081654523884411e-05, + "loss": 0.0131, + "step": 23710 + }, + { + "epoch": 4.2761853254011175, + "grad_norm": 0.048982955515384674, + "learning_rate": 7.0791486136277e-05, + "loss": 0.0144, + "step": 23720 + }, + { + "epoch": 4.277988101676582, + "grad_norm": 0.03243011608719826, + "learning_rate": 7.07664207178054e-05, + "loss": 0.0131, + "step": 23730 + }, + { + "epoch": 4.279790877952046, + "grad_norm": 0.045829616487026215, + "learning_rate": 7.074134899104345e-05, + "loss": 0.0133, + "step": 23740 + }, + { + "epoch": 4.28159365422751, + "grad_norm": 0.03971477225422859, + "learning_rate": 7.071627096360735e-05, + "loss": 0.0134, + "step": 23750 + }, + { + "epoch": 4.283396430502974, + "grad_norm": 0.029917417094111443, + "learning_rate": 7.069118664311511e-05, + "loss": 0.0129, + "step": 23760 + }, + { + "epoch": 4.285199206778438, + "grad_norm": 0.029169609770178795, + "learning_rate": 7.06660960371867e-05, + "loss": 0.0131, + "step": 23770 + }, + { + "epoch": 4.287001983053903, + "grad_norm": 0.026813210919499397, + "learning_rate": 7.064099915344396e-05, + "loss": 0.0131, + "step": 23780 + }, + { + "epoch": 4.288804759329367, + "grad_norm": 0.030124317854642868, + "learning_rate": 7.061589599951066e-05, + "loss": 0.0135, + "step": 23790 + }, + { + "epoch": 4.290607535604831, + "grad_norm": 0.027020709589123726, + "learning_rate": 7.05907865830125e-05, + "loss": 0.0127, + "step": 23800 + }, + { + "epoch": 4.292410311880296, + "grad_norm": 0.02451074868440628, + "learning_rate": 7.056567091157703e-05, + "loss": 0.0123, + "step": 23810 + }, + { + "epoch": 4.29421308815576, + "grad_norm": 0.040313348174095154, + "learning_rate": 7.054054899283375e-05, + "loss": 0.013, + "step": 23820 + }, + { + "epoch": 4.296015864431224, + "grad_norm": 0.03962858393788338, + "learning_rate": 7.051542083441403e-05, + "loss": 0.0134, + "step": 23830 + }, + { + "epoch": 4.297818640706688, + "grad_norm": 0.030218910425901413, + "learning_rate": 7.049028644395113e-05, + "loss": 0.0146, + "step": 23840 + }, + { + "epoch": 4.299621416982152, + "grad_norm": 0.031599052250385284, + "learning_rate": 7.046514582908024e-05, + "loss": 0.0137, + "step": 23850 + }, + { + "epoch": 4.301424193257617, + "grad_norm": 0.030295444652438164, + "learning_rate": 7.043999899743838e-05, + "loss": 0.0123, + "step": 23860 + }, + { + "epoch": 4.303226969533081, + "grad_norm": 0.039118580520153046, + "learning_rate": 7.041484595666451e-05, + "loss": 0.0131, + "step": 23870 + }, + { + "epoch": 4.305029745808545, + "grad_norm": 0.03890449181199074, + "learning_rate": 7.038968671439948e-05, + "loss": 0.0129, + "step": 23880 + }, + { + "epoch": 4.30683252208401, + "grad_norm": 0.03126204013824463, + "learning_rate": 7.036452127828596e-05, + "loss": 0.0136, + "step": 23890 + }, + { + "epoch": 4.308635298359474, + "grad_norm": 0.04461554437875748, + "learning_rate": 7.033934965596859e-05, + "loss": 0.0128, + "step": 23900 + }, + { + "epoch": 4.310438074634938, + "grad_norm": 0.044284481555223465, + "learning_rate": 7.031417185509381e-05, + "loss": 0.0125, + "step": 23910 + }, + { + "epoch": 4.312240850910402, + "grad_norm": 0.02741781435906887, + "learning_rate": 7.028898788331e-05, + "loss": 0.0144, + "step": 23920 + }, + { + "epoch": 4.314043627185867, + "grad_norm": 0.038282811641693115, + "learning_rate": 7.026379774826736e-05, + "loss": 0.0128, + "step": 23930 + }, + { + "epoch": 4.315846403461331, + "grad_norm": 0.0391233004629612, + "learning_rate": 7.0238601457618e-05, + "loss": 0.0132, + "step": 23940 + }, + { + "epoch": 4.317649179736795, + "grad_norm": 0.04443211480975151, + "learning_rate": 7.02133990190159e-05, + "loss": 0.0133, + "step": 23950 + }, + { + "epoch": 4.319451956012259, + "grad_norm": 0.041531383991241455, + "learning_rate": 7.018819044011687e-05, + "loss": 0.0143, + "step": 23960 + }, + { + "epoch": 4.3212547322877235, + "grad_norm": 0.04156087711453438, + "learning_rate": 7.016297572857863e-05, + "loss": 0.014, + "step": 23970 + }, + { + "epoch": 4.3230575085631875, + "grad_norm": 0.0273751150816679, + "learning_rate": 7.013775489206072e-05, + "loss": 0.0126, + "step": 23980 + }, + { + "epoch": 4.3248602848386515, + "grad_norm": 0.03763411194086075, + "learning_rate": 7.01125279382246e-05, + "loss": 0.0133, + "step": 23990 + }, + { + "epoch": 4.3266630611141155, + "grad_norm": 0.047649085521698, + "learning_rate": 7.008729487473351e-05, + "loss": 0.0143, + "step": 24000 + }, + { + "epoch": 4.32846583738958, + "grad_norm": 0.03396891430020332, + "learning_rate": 7.006205570925263e-05, + "loss": 0.0124, + "step": 24010 + }, + { + "epoch": 4.330268613665044, + "grad_norm": 0.038264382630586624, + "learning_rate": 7.003681044944892e-05, + "loss": 0.0133, + "step": 24020 + }, + { + "epoch": 4.332071389940508, + "grad_norm": 0.03571972995996475, + "learning_rate": 7.001155910299126e-05, + "loss": 0.0138, + "step": 24030 + }, + { + "epoch": 4.333874166215972, + "grad_norm": 0.04013395681977272, + "learning_rate": 6.99863016775503e-05, + "loss": 0.0123, + "step": 24040 + }, + { + "epoch": 4.335676942491437, + "grad_norm": 0.04519449174404144, + "learning_rate": 6.996103818079859e-05, + "loss": 0.0135, + "step": 24050 + }, + { + "epoch": 4.337479718766901, + "grad_norm": 0.03955865278840065, + "learning_rate": 6.993576862041054e-05, + "loss": 0.0139, + "step": 24060 + }, + { + "epoch": 4.339282495042365, + "grad_norm": 0.03203146159648895, + "learning_rate": 6.991049300406235e-05, + "loss": 0.0133, + "step": 24070 + }, + { + "epoch": 4.341085271317829, + "grad_norm": 0.033373042941093445, + "learning_rate": 6.988521133943209e-05, + "loss": 0.0128, + "step": 24080 + }, + { + "epoch": 4.342888047593294, + "grad_norm": 0.03725710138678551, + "learning_rate": 6.985992363419966e-05, + "loss": 0.0138, + "step": 24090 + }, + { + "epoch": 4.344690823868758, + "grad_norm": 0.02722766250371933, + "learning_rate": 6.983462989604682e-05, + "loss": 0.0132, + "step": 24100 + }, + { + "epoch": 4.346493600144222, + "grad_norm": 0.03839974105358124, + "learning_rate": 6.980933013265709e-05, + "loss": 0.0154, + "step": 24110 + }, + { + "epoch": 4.348296376419686, + "grad_norm": 0.035053595900535583, + "learning_rate": 6.978402435171592e-05, + "loss": 0.0127, + "step": 24120 + }, + { + "epoch": 4.350099152695151, + "grad_norm": 0.046052757650613785, + "learning_rate": 6.975871256091052e-05, + "loss": 0.0137, + "step": 24130 + }, + { + "epoch": 4.351901928970615, + "grad_norm": 0.031888447701931, + "learning_rate": 6.973339476792995e-05, + "loss": 0.0123, + "step": 24140 + }, + { + "epoch": 4.353704705246079, + "grad_norm": 0.035097233951091766, + "learning_rate": 6.970807098046505e-05, + "loss": 0.014, + "step": 24150 + }, + { + "epoch": 4.355507481521543, + "grad_norm": 0.02791309542953968, + "learning_rate": 6.968274120620858e-05, + "loss": 0.012, + "step": 24160 + }, + { + "epoch": 4.357310257797008, + "grad_norm": 0.044373344630002975, + "learning_rate": 6.965740545285499e-05, + "loss": 0.0129, + "step": 24170 + }, + { + "epoch": 4.359113034072472, + "grad_norm": 0.030218664556741714, + "learning_rate": 6.963206372810068e-05, + "loss": 0.0135, + "step": 24180 + }, + { + "epoch": 4.360915810347936, + "grad_norm": 0.03418146073818207, + "learning_rate": 6.960671603964375e-05, + "loss": 0.0138, + "step": 24190 + }, + { + "epoch": 4.3627185866234, + "grad_norm": 0.0373779758810997, + "learning_rate": 6.958136239518418e-05, + "loss": 0.0144, + "step": 24200 + }, + { + "epoch": 4.364521362898865, + "grad_norm": 0.04759407415986061, + "learning_rate": 6.955600280242371e-05, + "loss": 0.0139, + "step": 24210 + }, + { + "epoch": 4.366324139174329, + "grad_norm": 0.049419913440942764, + "learning_rate": 6.953063726906596e-05, + "loss": 0.0152, + "step": 24220 + }, + { + "epoch": 4.368126915449793, + "grad_norm": 0.030391819775104523, + "learning_rate": 6.950526580281626e-05, + "loss": 0.0138, + "step": 24230 + }, + { + "epoch": 4.369929691725257, + "grad_norm": 0.04242231696844101, + "learning_rate": 6.947988841138184e-05, + "loss": 0.0127, + "step": 24240 + }, + { + "epoch": 4.3717324680007215, + "grad_norm": 0.029161255806684494, + "learning_rate": 6.945450510247165e-05, + "loss": 0.0159, + "step": 24250 + }, + { + "epoch": 4.3735352442761855, + "grad_norm": 0.034745898097753525, + "learning_rate": 6.942911588379647e-05, + "loss": 0.0144, + "step": 24260 + }, + { + "epoch": 4.375338020551649, + "grad_norm": 0.03155764192342758, + "learning_rate": 6.940372076306888e-05, + "loss": 0.013, + "step": 24270 + }, + { + "epoch": 4.377140796827113, + "grad_norm": 0.04190724343061447, + "learning_rate": 6.937831974800326e-05, + "loss": 0.0143, + "step": 24280 + }, + { + "epoch": 4.378943573102578, + "grad_norm": 0.04105806723237038, + "learning_rate": 6.935291284631574e-05, + "loss": 0.0145, + "step": 24290 + }, + { + "epoch": 4.380746349378042, + "grad_norm": 0.03805764392018318, + "learning_rate": 6.932750006572428e-05, + "loss": 0.0137, + "step": 24300 + }, + { + "epoch": 4.382549125653506, + "grad_norm": 0.03581907972693443, + "learning_rate": 6.930208141394863e-05, + "loss": 0.0127, + "step": 24310 + }, + { + "epoch": 4.38435190192897, + "grad_norm": 0.04119705781340599, + "learning_rate": 6.927665689871026e-05, + "loss": 0.0123, + "step": 24320 + }, + { + "epoch": 4.386154678204435, + "grad_norm": 0.039610881358385086, + "learning_rate": 6.925122652773253e-05, + "loss": 0.0145, + "step": 24330 + }, + { + "epoch": 4.387957454479899, + "grad_norm": 0.04330465942621231, + "learning_rate": 6.922579030874046e-05, + "loss": 0.0137, + "step": 24340 + }, + { + "epoch": 4.389760230755363, + "grad_norm": 0.049097999930381775, + "learning_rate": 6.920034824946093e-05, + "loss": 0.0133, + "step": 24350 + }, + { + "epoch": 4.391563007030827, + "grad_norm": 0.027609724551439285, + "learning_rate": 6.917490035762255e-05, + "loss": 0.0134, + "step": 24360 + }, + { + "epoch": 4.393365783306292, + "grad_norm": 0.03320369869470596, + "learning_rate": 6.914944664095573e-05, + "loss": 0.0147, + "step": 24370 + }, + { + "epoch": 4.395168559581756, + "grad_norm": 0.04329666122794151, + "learning_rate": 6.912398710719264e-05, + "loss": 0.0123, + "step": 24380 + }, + { + "epoch": 4.39697133585722, + "grad_norm": 0.030638007447123528, + "learning_rate": 6.90985217640672e-05, + "loss": 0.0136, + "step": 24390 + }, + { + "epoch": 4.398774112132684, + "grad_norm": 0.035177428275346756, + "learning_rate": 6.90730506193151e-05, + "loss": 0.0141, + "step": 24400 + }, + { + "epoch": 4.400576888408149, + "grad_norm": 0.029672667384147644, + "learning_rate": 6.904757368067384e-05, + "loss": 0.0144, + "step": 24410 + }, + { + "epoch": 4.402379664683613, + "grad_norm": 0.030391806736588478, + "learning_rate": 6.90220909558826e-05, + "loss": 0.013, + "step": 24420 + }, + { + "epoch": 4.404182440959077, + "grad_norm": 0.039163392037153244, + "learning_rate": 6.899660245268237e-05, + "loss": 0.0132, + "step": 24430 + }, + { + "epoch": 4.405985217234541, + "grad_norm": 0.03646654263138771, + "learning_rate": 6.897110817881592e-05, + "loss": 0.0131, + "step": 24440 + }, + { + "epoch": 4.407787993510006, + "grad_norm": 0.03784643113613129, + "learning_rate": 6.894560814202769e-05, + "loss": 0.0127, + "step": 24450 + }, + { + "epoch": 4.40959076978547, + "grad_norm": 0.04635323956608772, + "learning_rate": 6.892010235006394e-05, + "loss": 0.0154, + "step": 24460 + }, + { + "epoch": 4.411393546060934, + "grad_norm": 0.03800544887781143, + "learning_rate": 6.889459081067264e-05, + "loss": 0.0134, + "step": 24470 + }, + { + "epoch": 4.413196322336398, + "grad_norm": 0.04739215224981308, + "learning_rate": 6.886907353160356e-05, + "loss": 0.0132, + "step": 24480 + }, + { + "epoch": 4.414999098611863, + "grad_norm": 0.03493911772966385, + "learning_rate": 6.884355052060814e-05, + "loss": 0.0142, + "step": 24490 + }, + { + "epoch": 4.416801874887327, + "grad_norm": 0.04199066758155823, + "learning_rate": 6.88180217854396e-05, + "loss": 0.0123, + "step": 24500 + }, + { + "epoch": 4.4186046511627906, + "grad_norm": 0.03663840889930725, + "learning_rate": 6.87924873338529e-05, + "loss": 0.0128, + "step": 24510 + }, + { + "epoch": 4.4204074274382545, + "grad_norm": 0.039667822420597076, + "learning_rate": 6.876694717360475e-05, + "loss": 0.0144, + "step": 24520 + }, + { + "epoch": 4.422210203713719, + "grad_norm": 0.03431324660778046, + "learning_rate": 6.874140131245355e-05, + "loss": 0.0138, + "step": 24530 + }, + { + "epoch": 4.424012979989183, + "grad_norm": 0.04779832810163498, + "learning_rate": 6.871584975815948e-05, + "loss": 0.0133, + "step": 24540 + }, + { + "epoch": 4.425815756264647, + "grad_norm": 0.037548258900642395, + "learning_rate": 6.86902925184844e-05, + "loss": 0.0134, + "step": 24550 + }, + { + "epoch": 4.427618532540111, + "grad_norm": 0.037038855254650116, + "learning_rate": 6.866472960119195e-05, + "loss": 0.0121, + "step": 24560 + }, + { + "epoch": 4.429421308815576, + "grad_norm": 0.03201065957546234, + "learning_rate": 6.863916101404748e-05, + "loss": 0.0129, + "step": 24570 + }, + { + "epoch": 4.43122408509104, + "grad_norm": 0.041653260588645935, + "learning_rate": 6.8613586764818e-05, + "loss": 0.013, + "step": 24580 + }, + { + "epoch": 4.433026861366504, + "grad_norm": 0.0262505691498518, + "learning_rate": 6.858800686127233e-05, + "loss": 0.0143, + "step": 24590 + }, + { + "epoch": 4.434829637641968, + "grad_norm": 0.04060445353388786, + "learning_rate": 6.856242131118097e-05, + "loss": 0.0127, + "step": 24600 + }, + { + "epoch": 4.436632413917433, + "grad_norm": 0.04151453077793121, + "learning_rate": 6.853683012231614e-05, + "loss": 0.0132, + "step": 24610 + }, + { + "epoch": 4.438435190192897, + "grad_norm": 0.04969179630279541, + "learning_rate": 6.851123330245173e-05, + "loss": 0.014, + "step": 24620 + }, + { + "epoch": 4.440237966468361, + "grad_norm": 0.04438858479261398, + "learning_rate": 6.848563085936343e-05, + "loss": 0.013, + "step": 24630 + }, + { + "epoch": 4.442040742743825, + "grad_norm": 0.03033212199807167, + "learning_rate": 6.846002280082853e-05, + "loss": 0.0138, + "step": 24640 + }, + { + "epoch": 4.44384351901929, + "grad_norm": 0.03370277211070061, + "learning_rate": 6.843440913462614e-05, + "loss": 0.0134, + "step": 24650 + }, + { + "epoch": 4.445646295294754, + "grad_norm": 0.039064306765794754, + "learning_rate": 6.840878986853698e-05, + "loss": 0.0125, + "step": 24660 + }, + { + "epoch": 4.447449071570218, + "grad_norm": 0.041290052235126495, + "learning_rate": 6.838316501034352e-05, + "loss": 0.0131, + "step": 24670 + }, + { + "epoch": 4.449251847845682, + "grad_norm": 0.03734714910387993, + "learning_rate": 6.83575345678299e-05, + "loss": 0.0128, + "step": 24680 + }, + { + "epoch": 4.451054624121147, + "grad_norm": 0.035124070942401886, + "learning_rate": 6.833189854878196e-05, + "loss": 0.0128, + "step": 24690 + }, + { + "epoch": 4.452857400396611, + "grad_norm": 0.03838062658905983, + "learning_rate": 6.83062569609873e-05, + "loss": 0.0131, + "step": 24700 + }, + { + "epoch": 4.454660176672075, + "grad_norm": 0.043373074382543564, + "learning_rate": 6.828060981223512e-05, + "loss": 0.0134, + "step": 24710 + }, + { + "epoch": 4.456462952947539, + "grad_norm": 0.052878301590681076, + "learning_rate": 6.825495711031634e-05, + "loss": 0.0132, + "step": 24720 + }, + { + "epoch": 4.458265729223004, + "grad_norm": 0.042500048875808716, + "learning_rate": 6.822929886302359e-05, + "loss": 0.0124, + "step": 24730 + }, + { + "epoch": 4.460068505498468, + "grad_norm": 0.04317609593272209, + "learning_rate": 6.820363507815116e-05, + "loss": 0.0133, + "step": 24740 + }, + { + "epoch": 4.461871281773932, + "grad_norm": 0.043672747910022736, + "learning_rate": 6.817796576349501e-05, + "loss": 0.0135, + "step": 24750 + }, + { + "epoch": 4.463674058049396, + "grad_norm": 0.035121746361255646, + "learning_rate": 6.815229092685285e-05, + "loss": 0.0141, + "step": 24760 + }, + { + "epoch": 4.4654768343248605, + "grad_norm": 0.03785579279065132, + "learning_rate": 6.812661057602399e-05, + "loss": 0.0139, + "step": 24770 + }, + { + "epoch": 4.4672796106003245, + "grad_norm": 0.04604469612240791, + "learning_rate": 6.810092471880943e-05, + "loss": 0.0137, + "step": 24780 + }, + { + "epoch": 4.4690823868757885, + "grad_norm": 0.03169671446084976, + "learning_rate": 6.807523336301187e-05, + "loss": 0.0118, + "step": 24790 + }, + { + "epoch": 4.470885163151253, + "grad_norm": 0.047851208597421646, + "learning_rate": 6.804953651643566e-05, + "loss": 0.0143, + "step": 24800 + }, + { + "epoch": 4.472687939426717, + "grad_norm": 0.059442270547151566, + "learning_rate": 6.802383418688685e-05, + "loss": 0.0142, + "step": 24810 + }, + { + "epoch": 4.474490715702181, + "grad_norm": 0.035749901086091995, + "learning_rate": 6.799812638217309e-05, + "loss": 0.0122, + "step": 24820 + }, + { + "epoch": 4.476293491977645, + "grad_norm": 0.030107785016298294, + "learning_rate": 6.797241311010373e-05, + "loss": 0.013, + "step": 24830 + }, + { + "epoch": 4.478096268253109, + "grad_norm": 0.02892523817718029, + "learning_rate": 6.794669437848982e-05, + "loss": 0.0139, + "step": 24840 + }, + { + "epoch": 4.479899044528574, + "grad_norm": 0.023501595482230186, + "learning_rate": 6.792097019514402e-05, + "loss": 0.0124, + "step": 24850 + }, + { + "epoch": 4.481701820804038, + "grad_norm": 0.026345157995820045, + "learning_rate": 6.789524056788064e-05, + "loss": 0.0129, + "step": 24860 + }, + { + "epoch": 4.483504597079502, + "grad_norm": 0.03263472393155098, + "learning_rate": 6.786950550451567e-05, + "loss": 0.013, + "step": 24870 + }, + { + "epoch": 4.485307373354967, + "grad_norm": 0.040566831827163696, + "learning_rate": 6.784376501286676e-05, + "loss": 0.0135, + "step": 24880 + }, + { + "epoch": 4.487110149630431, + "grad_norm": 0.04070296511054039, + "learning_rate": 6.781801910075316e-05, + "loss": 0.014, + "step": 24890 + }, + { + "epoch": 4.488912925905895, + "grad_norm": 0.039350975304841995, + "learning_rate": 6.779226777599581e-05, + "loss": 0.0138, + "step": 24900 + }, + { + "epoch": 4.490715702181359, + "grad_norm": 0.05601748079061508, + "learning_rate": 6.776651104641729e-05, + "loss": 0.0154, + "step": 24910 + }, + { + "epoch": 4.492518478456823, + "grad_norm": 0.03998206928372383, + "learning_rate": 6.774074891984183e-05, + "loss": 0.0129, + "step": 24920 + }, + { + "epoch": 4.494321254732288, + "grad_norm": 0.037668805569410324, + "learning_rate": 6.771498140409526e-05, + "loss": 0.0138, + "step": 24930 + }, + { + "epoch": 4.496124031007752, + "grad_norm": 0.03887096419930458, + "learning_rate": 6.768920850700506e-05, + "loss": 0.0135, + "step": 24940 + }, + { + "epoch": 4.497926807283216, + "grad_norm": 0.04310167208313942, + "learning_rate": 6.766343023640039e-05, + "loss": 0.0132, + "step": 24950 + }, + { + "epoch": 4.499729583558681, + "grad_norm": 0.04405100643634796, + "learning_rate": 6.763764660011198e-05, + "loss": 0.0123, + "step": 24960 + }, + { + "epoch": 4.501532359834145, + "grad_norm": 0.04597846791148186, + "learning_rate": 6.761185760597223e-05, + "loss": 0.0131, + "step": 24970 + }, + { + "epoch": 4.503335136109609, + "grad_norm": 0.05299341678619385, + "learning_rate": 6.758606326181515e-05, + "loss": 0.0127, + "step": 24980 + }, + { + "epoch": 4.505137912385073, + "grad_norm": 0.03005286306142807, + "learning_rate": 6.75602635754764e-05, + "loss": 0.0129, + "step": 24990 + }, + { + "epoch": 4.506940688660537, + "grad_norm": 0.04608207195997238, + "learning_rate": 6.75344585547932e-05, + "loss": 0.0142, + "step": 25000 + }, + { + "epoch": 4.508743464936002, + "grad_norm": 0.03853101655840874, + "learning_rate": 6.750864820760449e-05, + "loss": 0.013, + "step": 25010 + }, + { + "epoch": 4.510546241211466, + "grad_norm": 0.032372280955314636, + "learning_rate": 6.748283254175072e-05, + "loss": 0.0142, + "step": 25020 + }, + { + "epoch": 4.51234901748693, + "grad_norm": 0.03529133275151253, + "learning_rate": 6.745701156507404e-05, + "loss": 0.0133, + "step": 25030 + }, + { + "epoch": 4.5141517937623945, + "grad_norm": 0.03886482119560242, + "learning_rate": 6.743118528541818e-05, + "loss": 0.0156, + "step": 25040 + }, + { + "epoch": 4.5159545700378585, + "grad_norm": 0.04149775952100754, + "learning_rate": 6.740535371062846e-05, + "loss": 0.0124, + "step": 25050 + }, + { + "epoch": 4.5177573463133225, + "grad_norm": 0.04035187512636185, + "learning_rate": 6.737951684855185e-05, + "loss": 0.0125, + "step": 25060 + }, + { + "epoch": 4.5195601225887865, + "grad_norm": 0.03269817680120468, + "learning_rate": 6.735367470703691e-05, + "loss": 0.0125, + "step": 25070 + }, + { + "epoch": 4.5213628988642505, + "grad_norm": 0.03520066291093826, + "learning_rate": 6.732782729393379e-05, + "loss": 0.0128, + "step": 25080 + }, + { + "epoch": 4.523165675139715, + "grad_norm": 0.03378160297870636, + "learning_rate": 6.730197461709425e-05, + "loss": 0.0126, + "step": 25090 + }, + { + "epoch": 4.524968451415179, + "grad_norm": 0.04209718108177185, + "learning_rate": 6.727611668437164e-05, + "loss": 0.0137, + "step": 25100 + }, + { + "epoch": 4.526771227690643, + "grad_norm": 0.045656491070985794, + "learning_rate": 6.725025350362094e-05, + "loss": 0.013, + "step": 25110 + }, + { + "epoch": 4.528574003966108, + "grad_norm": 0.03615185245871544, + "learning_rate": 6.72243850826987e-05, + "loss": 0.0133, + "step": 25120 + }, + { + "epoch": 4.530376780241572, + "grad_norm": 0.026036594063043594, + "learning_rate": 6.719851142946305e-05, + "loss": 0.0138, + "step": 25130 + }, + { + "epoch": 4.532179556517036, + "grad_norm": 0.03818904608488083, + "learning_rate": 6.717263255177372e-05, + "loss": 0.0132, + "step": 25140 + }, + { + "epoch": 4.5339823327925, + "grad_norm": 0.03461351618170738, + "learning_rate": 6.714674845749205e-05, + "loss": 0.013, + "step": 25150 + }, + { + "epoch": 4.535785109067964, + "grad_norm": 0.03881440311670303, + "learning_rate": 6.712085915448092e-05, + "loss": 0.0142, + "step": 25160 + }, + { + "epoch": 4.537587885343429, + "grad_norm": 0.03757309541106224, + "learning_rate": 6.709496465060486e-05, + "loss": 0.0127, + "step": 25170 + }, + { + "epoch": 4.539390661618893, + "grad_norm": 0.03815339878201485, + "learning_rate": 6.706906495372987e-05, + "loss": 0.0137, + "step": 25180 + }, + { + "epoch": 4.541193437894357, + "grad_norm": 0.04739398881793022, + "learning_rate": 6.704316007172365e-05, + "loss": 0.0142, + "step": 25190 + }, + { + "epoch": 4.542996214169822, + "grad_norm": 0.038298364728689194, + "learning_rate": 6.701725001245539e-05, + "loss": 0.0144, + "step": 25200 + }, + { + "epoch": 4.544798990445286, + "grad_norm": 0.03373422473669052, + "learning_rate": 6.699133478379588e-05, + "loss": 0.0131, + "step": 25210 + }, + { + "epoch": 4.54660176672075, + "grad_norm": 0.03441242873668671, + "learning_rate": 6.69654143936175e-05, + "loss": 0.0125, + "step": 25220 + }, + { + "epoch": 4.548404542996214, + "grad_norm": 0.042042944580316544, + "learning_rate": 6.693948884979419e-05, + "loss": 0.0128, + "step": 25230 + }, + { + "epoch": 4.550207319271679, + "grad_norm": 0.05151601508259773, + "learning_rate": 6.691355816020142e-05, + "loss": 0.0133, + "step": 25240 + }, + { + "epoch": 4.552010095547143, + "grad_norm": 0.03845982253551483, + "learning_rate": 6.688762233271624e-05, + "loss": 0.0134, + "step": 25250 + }, + { + "epoch": 4.553812871822607, + "grad_norm": 0.03997104614973068, + "learning_rate": 6.68616813752173e-05, + "loss": 0.0141, + "step": 25260 + }, + { + "epoch": 4.555615648098071, + "grad_norm": 0.03335548937320709, + "learning_rate": 6.683573529558477e-05, + "loss": 0.0126, + "step": 25270 + }, + { + "epoch": 4.557418424373536, + "grad_norm": 0.03985786810517311, + "learning_rate": 6.680978410170037e-05, + "loss": 0.0141, + "step": 25280 + }, + { + "epoch": 4.559221200649, + "grad_norm": 0.03882386162877083, + "learning_rate": 6.678382780144741e-05, + "loss": 0.0137, + "step": 25290 + }, + { + "epoch": 4.561023976924464, + "grad_norm": 0.024783367291092873, + "learning_rate": 6.675786640271071e-05, + "loss": 0.0124, + "step": 25300 + }, + { + "epoch": 4.562826753199928, + "grad_norm": 0.036469802260398865, + "learning_rate": 6.673189991337665e-05, + "loss": 0.0129, + "step": 25310 + }, + { + "epoch": 4.5646295294753925, + "grad_norm": 0.037719037383794785, + "learning_rate": 6.670592834133317e-05, + "loss": 0.012, + "step": 25320 + }, + { + "epoch": 4.5664323057508565, + "grad_norm": 0.025829510763287544, + "learning_rate": 6.667995169446979e-05, + "loss": 0.0131, + "step": 25330 + }, + { + "epoch": 4.5682350820263204, + "grad_norm": 0.027123980224132538, + "learning_rate": 6.665396998067747e-05, + "loss": 0.0128, + "step": 25340 + }, + { + "epoch": 4.570037858301784, + "grad_norm": 0.046093400567770004, + "learning_rate": 6.66279832078488e-05, + "loss": 0.0134, + "step": 25350 + }, + { + "epoch": 4.571840634577249, + "grad_norm": 0.03250535577535629, + "learning_rate": 6.660199138387786e-05, + "loss": 0.0125, + "step": 25360 + }, + { + "epoch": 4.573643410852713, + "grad_norm": 0.025955503806471825, + "learning_rate": 6.65759945166603e-05, + "loss": 0.0118, + "step": 25370 + }, + { + "epoch": 4.575446187128177, + "grad_norm": 0.026923254132270813, + "learning_rate": 6.654999261409326e-05, + "loss": 0.0123, + "step": 25380 + }, + { + "epoch": 4.577248963403641, + "grad_norm": 0.033437494188547134, + "learning_rate": 6.652398568407544e-05, + "loss": 0.0125, + "step": 25390 + }, + { + "epoch": 4.579051739679106, + "grad_norm": 0.033924318850040436, + "learning_rate": 6.649797373450707e-05, + "loss": 0.0127, + "step": 25400 + }, + { + "epoch": 4.58085451595457, + "grad_norm": 0.03335762396454811, + "learning_rate": 6.647195677328988e-05, + "loss": 0.0133, + "step": 25410 + }, + { + "epoch": 4.582657292230034, + "grad_norm": 0.028120582923293114, + "learning_rate": 6.644593480832712e-05, + "loss": 0.0138, + "step": 25420 + }, + { + "epoch": 4.584460068505498, + "grad_norm": 0.03576704487204552, + "learning_rate": 6.641990784752363e-05, + "loss": 0.0129, + "step": 25430 + }, + { + "epoch": 4.586262844780963, + "grad_norm": 0.044179245829582214, + "learning_rate": 6.639387589878566e-05, + "loss": 0.0123, + "step": 25440 + }, + { + "epoch": 4.588065621056427, + "grad_norm": 0.050211094319820404, + "learning_rate": 6.636783897002103e-05, + "loss": 0.0126, + "step": 25450 + }, + { + "epoch": 4.589868397331891, + "grad_norm": 0.0414290651679039, + "learning_rate": 6.63417970691391e-05, + "loss": 0.0132, + "step": 25460 + }, + { + "epoch": 4.591671173607355, + "grad_norm": 0.040584441274404526, + "learning_rate": 6.63157502040507e-05, + "loss": 0.0132, + "step": 25470 + }, + { + "epoch": 4.59347394988282, + "grad_norm": 0.02771291323006153, + "learning_rate": 6.628969838266819e-05, + "loss": 0.0124, + "step": 25480 + }, + { + "epoch": 4.595276726158284, + "grad_norm": 0.02972092665731907, + "learning_rate": 6.626364161290541e-05, + "loss": 0.0121, + "step": 25490 + }, + { + "epoch": 4.597079502433748, + "grad_norm": 0.059547778218984604, + "learning_rate": 6.623757990267774e-05, + "loss": 0.0135, + "step": 25500 + }, + { + "epoch": 4.598882278709212, + "grad_norm": 0.047441303730010986, + "learning_rate": 6.621151325990201e-05, + "loss": 0.0145, + "step": 25510 + }, + { + "epoch": 4.600685054984677, + "grad_norm": 0.03956645354628563, + "learning_rate": 6.618544169249657e-05, + "loss": 0.0134, + "step": 25520 + }, + { + "epoch": 4.602487831260141, + "grad_norm": 0.03562502562999725, + "learning_rate": 6.615936520838133e-05, + "loss": 0.0134, + "step": 25530 + }, + { + "epoch": 4.604290607535605, + "grad_norm": 0.026830781251192093, + "learning_rate": 6.613328381547759e-05, + "loss": 0.0139, + "step": 25540 + }, + { + "epoch": 4.606093383811069, + "grad_norm": 0.03395737335085869, + "learning_rate": 6.610719752170821e-05, + "loss": 0.0132, + "step": 25550 + }, + { + "epoch": 4.607896160086534, + "grad_norm": 0.04019602760672569, + "learning_rate": 6.60811063349975e-05, + "loss": 0.0125, + "step": 25560 + }, + { + "epoch": 4.609698936361998, + "grad_norm": 0.03538639098405838, + "learning_rate": 6.605501026327127e-05, + "loss": 0.0131, + "step": 25570 + }, + { + "epoch": 4.611501712637462, + "grad_norm": 0.04015187919139862, + "learning_rate": 6.602890931445685e-05, + "loss": 0.0134, + "step": 25580 + }, + { + "epoch": 4.6133044889129255, + "grad_norm": 0.03596242144703865, + "learning_rate": 6.6002803496483e-05, + "loss": 0.0135, + "step": 25590 + }, + { + "epoch": 4.61510726518839, + "grad_norm": 0.03974773734807968, + "learning_rate": 6.597669281727997e-05, + "loss": 0.0137, + "step": 25600 + }, + { + "epoch": 4.616910041463854, + "grad_norm": 0.03930271416902542, + "learning_rate": 6.595057728477949e-05, + "loss": 0.0119, + "step": 25610 + }, + { + "epoch": 4.618712817739318, + "grad_norm": 0.03932708874344826, + "learning_rate": 6.59244569069148e-05, + "loss": 0.0122, + "step": 25620 + }, + { + "epoch": 4.620515594014782, + "grad_norm": 0.024542829021811485, + "learning_rate": 6.589833169162054e-05, + "loss": 0.0127, + "step": 25630 + }, + { + "epoch": 4.622318370290247, + "grad_norm": 0.040778398513793945, + "learning_rate": 6.587220164683291e-05, + "loss": 0.0122, + "step": 25640 + }, + { + "epoch": 4.624121146565711, + "grad_norm": 0.03504985570907593, + "learning_rate": 6.58460667804895e-05, + "loss": 0.0134, + "step": 25650 + }, + { + "epoch": 4.625923922841175, + "grad_norm": 0.033236000686883926, + "learning_rate": 6.581992710052938e-05, + "loss": 0.0124, + "step": 25660 + }, + { + "epoch": 4.627726699116639, + "grad_norm": 0.03422967344522476, + "learning_rate": 6.579378261489311e-05, + "loss": 0.0131, + "step": 25670 + }, + { + "epoch": 4.629529475392104, + "grad_norm": 0.028461921960115433, + "learning_rate": 6.576763333152268e-05, + "loss": 0.0133, + "step": 25680 + }, + { + "epoch": 4.631332251667568, + "grad_norm": 0.031004978343844414, + "learning_rate": 6.574147925836159e-05, + "loss": 0.0137, + "step": 25690 + }, + { + "epoch": 4.633135027943032, + "grad_norm": 0.046135418117046356, + "learning_rate": 6.571532040335472e-05, + "loss": 0.0145, + "step": 25700 + }, + { + "epoch": 4.634937804218496, + "grad_norm": 0.05924753099679947, + "learning_rate": 6.568915677444845e-05, + "loss": 0.0129, + "step": 25710 + }, + { + "epoch": 4.636740580493961, + "grad_norm": 0.027211302891373634, + "learning_rate": 6.56629883795906e-05, + "loss": 0.0128, + "step": 25720 + }, + { + "epoch": 4.638543356769425, + "grad_norm": 0.032857637852430344, + "learning_rate": 6.563681522673043e-05, + "loss": 0.0127, + "step": 25730 + }, + { + "epoch": 4.640346133044889, + "grad_norm": 0.03805747255682945, + "learning_rate": 6.561063732381867e-05, + "loss": 0.013, + "step": 25740 + }, + { + "epoch": 4.642148909320353, + "grad_norm": 0.04527513310313225, + "learning_rate": 6.558445467880745e-05, + "loss": 0.014, + "step": 25750 + }, + { + "epoch": 4.643951685595818, + "grad_norm": 0.031657543033361435, + "learning_rate": 6.55582672996504e-05, + "loss": 0.0139, + "step": 25760 + }, + { + "epoch": 4.645754461871282, + "grad_norm": 0.03227533400058746, + "learning_rate": 6.553207519430253e-05, + "loss": 0.0129, + "step": 25770 + }, + { + "epoch": 4.647557238146746, + "grad_norm": 0.03152766078710556, + "learning_rate": 6.550587837072032e-05, + "loss": 0.0121, + "step": 25780 + }, + { + "epoch": 4.649360014422211, + "grad_norm": 0.039321210235357285, + "learning_rate": 6.547967683686166e-05, + "loss": 0.0135, + "step": 25790 + }, + { + "epoch": 4.651162790697675, + "grad_norm": 0.0370330885052681, + "learning_rate": 6.545347060068591e-05, + "loss": 0.0129, + "step": 25800 + }, + { + "epoch": 4.652965566973139, + "grad_norm": 0.030718553811311722, + "learning_rate": 6.542725967015382e-05, + "loss": 0.0151, + "step": 25810 + }, + { + "epoch": 4.654768343248603, + "grad_norm": 0.029691452160477638, + "learning_rate": 6.540104405322757e-05, + "loss": 0.0123, + "step": 25820 + }, + { + "epoch": 4.656571119524067, + "grad_norm": 0.034178346395492554, + "learning_rate": 6.537482375787077e-05, + "loss": 0.0136, + "step": 25830 + }, + { + "epoch": 4.6583738957995315, + "grad_norm": 0.02834976650774479, + "learning_rate": 6.534859879204845e-05, + "loss": 0.0128, + "step": 25840 + }, + { + "epoch": 4.6601766720749955, + "grad_norm": 0.043037623167037964, + "learning_rate": 6.532236916372709e-05, + "loss": 0.0119, + "step": 25850 + }, + { + "epoch": 4.6619794483504595, + "grad_norm": 0.03363450989127159, + "learning_rate": 6.529613488087454e-05, + "loss": 0.013, + "step": 25860 + }, + { + "epoch": 4.663782224625924, + "grad_norm": 0.040549539029598236, + "learning_rate": 6.526989595146009e-05, + "loss": 0.0126, + "step": 25870 + }, + { + "epoch": 4.665585000901388, + "grad_norm": 0.02952057495713234, + "learning_rate": 6.524365238345441e-05, + "loss": 0.0131, + "step": 25880 + }, + { + "epoch": 4.667387777176852, + "grad_norm": 0.03283972665667534, + "learning_rate": 6.521740418482964e-05, + "loss": 0.0139, + "step": 25890 + }, + { + "epoch": 4.669190553452316, + "grad_norm": 0.0337519533932209, + "learning_rate": 6.519115136355925e-05, + "loss": 0.0118, + "step": 25900 + }, + { + "epoch": 4.67099332972778, + "grad_norm": 0.03552308678627014, + "learning_rate": 6.51648939276182e-05, + "loss": 0.0121, + "step": 25910 + }, + { + "epoch": 4.672796106003245, + "grad_norm": 0.033700015395879745, + "learning_rate": 6.513863188498277e-05, + "loss": 0.0129, + "step": 25920 + }, + { + "epoch": 4.674598882278709, + "grad_norm": 0.034070007503032684, + "learning_rate": 6.511236524363068e-05, + "loss": 0.0129, + "step": 25930 + }, + { + "epoch": 4.676401658554173, + "grad_norm": 0.04618152603507042, + "learning_rate": 6.508609401154104e-05, + "loss": 0.0132, + "step": 25940 + }, + { + "epoch": 4.678204434829638, + "grad_norm": 0.02468423917889595, + "learning_rate": 6.505981819669439e-05, + "loss": 0.0116, + "step": 25950 + }, + { + "epoch": 4.680007211105102, + "grad_norm": 0.032005369663238525, + "learning_rate": 6.503353780707258e-05, + "loss": 0.0123, + "step": 25960 + }, + { + "epoch": 4.681809987380566, + "grad_norm": 0.033916763961315155, + "learning_rate": 6.500725285065895e-05, + "loss": 0.0141, + "step": 25970 + }, + { + "epoch": 4.68361276365603, + "grad_norm": 0.03805190697312355, + "learning_rate": 6.498096333543813e-05, + "loss": 0.0131, + "step": 25980 + }, + { + "epoch": 4.685415539931494, + "grad_norm": 0.03806079924106598, + "learning_rate": 6.49546692693962e-05, + "loss": 0.0134, + "step": 25990 + }, + { + "epoch": 4.687218316206959, + "grad_norm": 0.033169955015182495, + "learning_rate": 6.492837066052059e-05, + "loss": 0.0126, + "step": 26000 + }, + { + "epoch": 4.689021092482423, + "grad_norm": 0.03455241397023201, + "learning_rate": 6.490206751680014e-05, + "loss": 0.0123, + "step": 26010 + }, + { + "epoch": 4.690823868757887, + "grad_norm": 0.027812698855996132, + "learning_rate": 6.487575984622505e-05, + "loss": 0.0127, + "step": 26020 + }, + { + "epoch": 4.692626645033352, + "grad_norm": 0.03218118101358414, + "learning_rate": 6.484944765678689e-05, + "loss": 0.0132, + "step": 26030 + }, + { + "epoch": 4.694429421308816, + "grad_norm": 0.03887471556663513, + "learning_rate": 6.482313095647861e-05, + "loss": 0.0133, + "step": 26040 + }, + { + "epoch": 4.69623219758428, + "grad_norm": 0.03402502462267876, + "learning_rate": 6.479680975329451e-05, + "loss": 0.0137, + "step": 26050 + }, + { + "epoch": 4.698034973859744, + "grad_norm": 0.04980213940143585, + "learning_rate": 6.477048405523031e-05, + "loss": 0.0134, + "step": 26060 + }, + { + "epoch": 4.699837750135208, + "grad_norm": 0.037422411143779755, + "learning_rate": 6.474415387028304e-05, + "loss": 0.0123, + "step": 26070 + }, + { + "epoch": 4.701640526410673, + "grad_norm": 0.040100786834955215, + "learning_rate": 6.471781920645114e-05, + "loss": 0.0125, + "step": 26080 + }, + { + "epoch": 4.703443302686137, + "grad_norm": 0.03751080855727196, + "learning_rate": 6.469148007173434e-05, + "loss": 0.0138, + "step": 26090 + }, + { + "epoch": 4.705246078961601, + "grad_norm": 0.039573997259140015, + "learning_rate": 6.466513647413381e-05, + "loss": 0.0131, + "step": 26100 + }, + { + "epoch": 4.7070488552370655, + "grad_norm": 0.0415896400809288, + "learning_rate": 6.463878842165203e-05, + "loss": 0.0141, + "step": 26110 + }, + { + "epoch": 4.7088516315125295, + "grad_norm": 0.029678981751203537, + "learning_rate": 6.461243592229286e-05, + "loss": 0.0123, + "step": 26120 + }, + { + "epoch": 4.7106544077879935, + "grad_norm": 0.04141012579202652, + "learning_rate": 6.458607898406146e-05, + "loss": 0.0127, + "step": 26130 + }, + { + "epoch": 4.7124571840634575, + "grad_norm": 0.027870172634720802, + "learning_rate": 6.455971761496439e-05, + "loss": 0.0125, + "step": 26140 + }, + { + "epoch": 4.7142599603389215, + "grad_norm": 0.02918057143688202, + "learning_rate": 6.453335182300953e-05, + "loss": 0.0139, + "step": 26150 + }, + { + "epoch": 4.716062736614386, + "grad_norm": 0.029551664367318153, + "learning_rate": 6.450698161620612e-05, + "loss": 0.0126, + "step": 26160 + }, + { + "epoch": 4.71786551288985, + "grad_norm": 0.04677426815032959, + "learning_rate": 6.448060700256473e-05, + "loss": 0.0129, + "step": 26170 + }, + { + "epoch": 4.719668289165314, + "grad_norm": 0.05065417289733887, + "learning_rate": 6.445422799009726e-05, + "loss": 0.0149, + "step": 26180 + }, + { + "epoch": 4.721471065440779, + "grad_norm": 0.04624687507748604, + "learning_rate": 6.442784458681699e-05, + "loss": 0.0128, + "step": 26190 + }, + { + "epoch": 4.723273841716243, + "grad_norm": 0.04547148942947388, + "learning_rate": 6.440145680073847e-05, + "loss": 0.0132, + "step": 26200 + }, + { + "epoch": 4.725076617991707, + "grad_norm": 0.038907915353775024, + "learning_rate": 6.437506463987762e-05, + "loss": 0.0134, + "step": 26210 + }, + { + "epoch": 4.726879394267171, + "grad_norm": 0.027501314878463745, + "learning_rate": 6.434866811225168e-05, + "loss": 0.0121, + "step": 26220 + }, + { + "epoch": 4.728682170542635, + "grad_norm": 0.031148727983236313, + "learning_rate": 6.432226722587923e-05, + "loss": 0.0119, + "step": 26230 + }, + { + "epoch": 4.7304849468181, + "grad_norm": 0.030809873715043068, + "learning_rate": 6.429586198878015e-05, + "loss": 0.0129, + "step": 26240 + }, + { + "epoch": 4.732287723093564, + "grad_norm": 0.033697377890348434, + "learning_rate": 6.426945240897566e-05, + "loss": 0.013, + "step": 26250 + }, + { + "epoch": 4.734090499369028, + "grad_norm": 0.029761215671896935, + "learning_rate": 6.424303849448829e-05, + "loss": 0.0131, + "step": 26260 + }, + { + "epoch": 4.735893275644493, + "grad_norm": 0.04188040271401405, + "learning_rate": 6.42166202533419e-05, + "loss": 0.0133, + "step": 26270 + }, + { + "epoch": 4.737696051919957, + "grad_norm": 0.04635777324438095, + "learning_rate": 6.419019769356164e-05, + "loss": 0.0143, + "step": 26280 + }, + { + "epoch": 4.739498828195421, + "grad_norm": 0.03785382956266403, + "learning_rate": 6.416377082317398e-05, + "loss": 0.0135, + "step": 26290 + }, + { + "epoch": 4.741301604470885, + "grad_norm": 0.03654355928301811, + "learning_rate": 6.413733965020674e-05, + "loss": 0.0121, + "step": 26300 + }, + { + "epoch": 4.743104380746349, + "grad_norm": 0.04500211775302887, + "learning_rate": 6.411090418268896e-05, + "loss": 0.0129, + "step": 26310 + }, + { + "epoch": 4.744907157021814, + "grad_norm": 0.0421280637383461, + "learning_rate": 6.408446442865109e-05, + "loss": 0.0122, + "step": 26320 + }, + { + "epoch": 4.746709933297278, + "grad_norm": 0.03203197568655014, + "learning_rate": 6.405802039612479e-05, + "loss": 0.0127, + "step": 26330 + }, + { + "epoch": 4.748512709572742, + "grad_norm": 0.033095914870500565, + "learning_rate": 6.403157209314308e-05, + "loss": 0.0136, + "step": 26340 + }, + { + "epoch": 4.750315485848207, + "grad_norm": 0.03345852717757225, + "learning_rate": 6.400511952774024e-05, + "loss": 0.0141, + "step": 26350 + }, + { + "epoch": 4.752118262123671, + "grad_norm": 0.03772794455289841, + "learning_rate": 6.397866270795187e-05, + "loss": 0.0136, + "step": 26360 + }, + { + "epoch": 4.753921038399135, + "grad_norm": 0.034663040190935135, + "learning_rate": 6.395220164181489e-05, + "loss": 0.0122, + "step": 26370 + }, + { + "epoch": 4.755723814674599, + "grad_norm": 0.03888862580060959, + "learning_rate": 6.39257363373674e-05, + "loss": 0.0129, + "step": 26380 + }, + { + "epoch": 4.7575265909500635, + "grad_norm": 0.029735418036580086, + "learning_rate": 6.389926680264892e-05, + "loss": 0.0146, + "step": 26390 + }, + { + "epoch": 4.7593293672255275, + "grad_norm": 0.03543306514620781, + "learning_rate": 6.387279304570017e-05, + "loss": 0.0131, + "step": 26400 + }, + { + "epoch": 4.7611321435009915, + "grad_norm": 0.05978789180517197, + "learning_rate": 6.384631507456319e-05, + "loss": 0.0132, + "step": 26410 + }, + { + "epoch": 4.7629349197764554, + "grad_norm": 0.029082154855132103, + "learning_rate": 6.381983289728126e-05, + "loss": 0.0124, + "step": 26420 + }, + { + "epoch": 4.76473769605192, + "grad_norm": 0.034611333161592484, + "learning_rate": 6.3793346521899e-05, + "loss": 0.012, + "step": 26430 + }, + { + "epoch": 4.766540472327384, + "grad_norm": 0.05795871838927269, + "learning_rate": 6.376685595646226e-05, + "loss": 0.0134, + "step": 26440 + }, + { + "epoch": 4.768343248602848, + "grad_norm": 0.02745138667523861, + "learning_rate": 6.374036120901816e-05, + "loss": 0.0134, + "step": 26450 + }, + { + "epoch": 4.770146024878312, + "grad_norm": 0.04285449534654617, + "learning_rate": 6.371386228761514e-05, + "loss": 0.0143, + "step": 26460 + }, + { + "epoch": 4.771948801153777, + "grad_norm": 0.04301370680332184, + "learning_rate": 6.368735920030283e-05, + "loss": 0.0127, + "step": 26470 + }, + { + "epoch": 4.773751577429241, + "grad_norm": 0.029428930953145027, + "learning_rate": 6.366085195513218e-05, + "loss": 0.013, + "step": 26480 + }, + { + "epoch": 4.775554353704705, + "grad_norm": 0.056381016969680786, + "learning_rate": 6.363434056015543e-05, + "loss": 0.0131, + "step": 26490 + }, + { + "epoch": 4.777357129980169, + "grad_norm": 0.04035766050219536, + "learning_rate": 6.360782502342599e-05, + "loss": 0.0125, + "step": 26500 + }, + { + "epoch": 4.779159906255634, + "grad_norm": 0.050049684941768646, + "learning_rate": 6.358130535299862e-05, + "loss": 0.0142, + "step": 26510 + }, + { + "epoch": 4.780962682531098, + "grad_norm": 0.04519528150558472, + "learning_rate": 6.355478155692926e-05, + "loss": 0.0126, + "step": 26520 + }, + { + "epoch": 4.782765458806562, + "grad_norm": 0.026955561712384224, + "learning_rate": 6.352825364327517e-05, + "loss": 0.0133, + "step": 26530 + }, + { + "epoch": 4.784568235082026, + "grad_norm": 0.055101510137319565, + "learning_rate": 6.350172162009482e-05, + "loss": 0.0124, + "step": 26540 + }, + { + "epoch": 4.786371011357491, + "grad_norm": 0.03820546716451645, + "learning_rate": 6.347518549544793e-05, + "loss": 0.0135, + "step": 26550 + }, + { + "epoch": 4.788173787632955, + "grad_norm": 0.03437512740492821, + "learning_rate": 6.344864527739547e-05, + "loss": 0.0133, + "step": 26560 + }, + { + "epoch": 4.789976563908419, + "grad_norm": 0.03252657502889633, + "learning_rate": 6.342210097399966e-05, + "loss": 0.0124, + "step": 26570 + }, + { + "epoch": 4.791779340183883, + "grad_norm": 0.04321519657969475, + "learning_rate": 6.339555259332398e-05, + "loss": 0.0127, + "step": 26580 + }, + { + "epoch": 4.793582116459348, + "grad_norm": 0.02965615876019001, + "learning_rate": 6.33690001434331e-05, + "loss": 0.0117, + "step": 26590 + }, + { + "epoch": 4.795384892734812, + "grad_norm": 0.043717510998249054, + "learning_rate": 6.334244363239296e-05, + "loss": 0.0124, + "step": 26600 + }, + { + "epoch": 4.797187669010276, + "grad_norm": 0.05073258653283119, + "learning_rate": 6.331588306827073e-05, + "loss": 0.0126, + "step": 26610 + }, + { + "epoch": 4.79899044528574, + "grad_norm": 0.0345473513007164, + "learning_rate": 6.328931845913483e-05, + "loss": 0.0133, + "step": 26620 + }, + { + "epoch": 4.800793221561205, + "grad_norm": 0.03530116006731987, + "learning_rate": 6.326274981305484e-05, + "loss": 0.0137, + "step": 26630 + }, + { + "epoch": 4.802595997836669, + "grad_norm": 0.029969383031129837, + "learning_rate": 6.323617713810166e-05, + "loss": 0.0123, + "step": 26640 + }, + { + "epoch": 4.804398774112133, + "grad_norm": 0.027851708233356476, + "learning_rate": 6.320960044234734e-05, + "loss": 0.0127, + "step": 26650 + }, + { + "epoch": 4.8062015503875966, + "grad_norm": 0.03338292986154556, + "learning_rate": 6.318301973386518e-05, + "loss": 0.0146, + "step": 26660 + }, + { + "epoch": 4.808004326663061, + "grad_norm": 0.0396399050951004, + "learning_rate": 6.315643502072971e-05, + "loss": 0.0134, + "step": 26670 + }, + { + "epoch": 4.809807102938525, + "grad_norm": 0.033591076731681824, + "learning_rate": 6.312984631101667e-05, + "loss": 0.0143, + "step": 26680 + }, + { + "epoch": 4.811609879213989, + "grad_norm": 0.03000384382903576, + "learning_rate": 6.310325361280297e-05, + "loss": 0.0129, + "step": 26690 + }, + { + "epoch": 4.813412655489453, + "grad_norm": 0.05446794256567955, + "learning_rate": 6.30766569341668e-05, + "loss": 0.0143, + "step": 26700 + }, + { + "epoch": 4.815215431764918, + "grad_norm": 0.03709074854850769, + "learning_rate": 6.305005628318753e-05, + "loss": 0.0122, + "step": 26710 + }, + { + "epoch": 4.817018208040382, + "grad_norm": 0.0391797199845314, + "learning_rate": 6.302345166794572e-05, + "loss": 0.0143, + "step": 26720 + }, + { + "epoch": 4.818820984315846, + "grad_norm": 0.036784786731004715, + "learning_rate": 6.299684309652316e-05, + "loss": 0.0131, + "step": 26730 + }, + { + "epoch": 4.82062376059131, + "grad_norm": 0.0460968017578125, + "learning_rate": 6.297023057700283e-05, + "loss": 0.0138, + "step": 26740 + }, + { + "epoch": 4.822426536866775, + "grad_norm": 0.03746715933084488, + "learning_rate": 6.294361411746891e-05, + "loss": 0.014, + "step": 26750 + }, + { + "epoch": 4.824229313142239, + "grad_norm": 0.030767880380153656, + "learning_rate": 6.291699372600677e-05, + "loss": 0.0133, + "step": 26760 + }, + { + "epoch": 4.826032089417703, + "grad_norm": 0.03748341649770737, + "learning_rate": 6.2890369410703e-05, + "loss": 0.0122, + "step": 26770 + }, + { + "epoch": 4.827834865693167, + "grad_norm": 0.03458026051521301, + "learning_rate": 6.286374117964534e-05, + "loss": 0.0123, + "step": 26780 + }, + { + "epoch": 4.829637641968632, + "grad_norm": 0.03462429344654083, + "learning_rate": 6.283710904092277e-05, + "loss": 0.012, + "step": 26790 + }, + { + "epoch": 4.831440418244096, + "grad_norm": 0.04094312712550163, + "learning_rate": 6.281047300262542e-05, + "loss": 0.0127, + "step": 26800 + }, + { + "epoch": 4.83324319451956, + "grad_norm": 0.029383892193436623, + "learning_rate": 6.278383307284461e-05, + "loss": 0.0132, + "step": 26810 + }, + { + "epoch": 4.835045970795024, + "grad_norm": 0.043613508343696594, + "learning_rate": 6.275718925967284e-05, + "loss": 0.0133, + "step": 26820 + }, + { + "epoch": 4.836848747070489, + "grad_norm": 0.02362663298845291, + "learning_rate": 6.273054157120382e-05, + "loss": 0.0121, + "step": 26830 + }, + { + "epoch": 4.838651523345953, + "grad_norm": 0.038084905594587326, + "learning_rate": 6.270389001553238e-05, + "loss": 0.0123, + "step": 26840 + }, + { + "epoch": 4.840454299621417, + "grad_norm": 0.034768253564834595, + "learning_rate": 6.26772346007546e-05, + "loss": 0.0129, + "step": 26850 + }, + { + "epoch": 4.842257075896881, + "grad_norm": 0.027081511914730072, + "learning_rate": 6.265057533496767e-05, + "loss": 0.0113, + "step": 26860 + }, + { + "epoch": 4.844059852172346, + "grad_norm": 0.0270563755184412, + "learning_rate": 6.262391222626997e-05, + "loss": 0.0127, + "step": 26870 + }, + { + "epoch": 4.84586262844781, + "grad_norm": 0.026155179366469383, + "learning_rate": 6.259724528276106e-05, + "loss": 0.0113, + "step": 26880 + }, + { + "epoch": 4.847665404723274, + "grad_norm": 0.03443409875035286, + "learning_rate": 6.257057451254162e-05, + "loss": 0.013, + "step": 26890 + }, + { + "epoch": 4.849468180998738, + "grad_norm": 0.0626431331038475, + "learning_rate": 6.254389992371357e-05, + "loss": 0.0128, + "step": 26900 + }, + { + "epoch": 4.8512709572742025, + "grad_norm": 0.030597640201449394, + "learning_rate": 6.25172215243799e-05, + "loss": 0.0124, + "step": 26910 + }, + { + "epoch": 4.8530737335496665, + "grad_norm": 0.042359549552202225, + "learning_rate": 6.249053932264486e-05, + "loss": 0.0134, + "step": 26920 + }, + { + "epoch": 4.8548765098251305, + "grad_norm": 0.04962105304002762, + "learning_rate": 6.246385332661376e-05, + "loss": 0.0137, + "step": 26930 + }, + { + "epoch": 4.856679286100595, + "grad_norm": 0.036272354423999786, + "learning_rate": 6.24371635443931e-05, + "loss": 0.0124, + "step": 26940 + }, + { + "epoch": 4.858482062376059, + "grad_norm": 0.03511037304997444, + "learning_rate": 6.241046998409054e-05, + "loss": 0.0127, + "step": 26950 + }, + { + "epoch": 4.860284838651523, + "grad_norm": 0.03409293666481972, + "learning_rate": 6.238377265381489e-05, + "loss": 0.0135, + "step": 26960 + }, + { + "epoch": 4.862087614926987, + "grad_norm": 0.041253212839365005, + "learning_rate": 6.235707156167607e-05, + "loss": 0.0127, + "step": 26970 + }, + { + "epoch": 4.863890391202451, + "grad_norm": 0.04034771770238876, + "learning_rate": 6.233036671578519e-05, + "loss": 0.0121, + "step": 26980 + }, + { + "epoch": 4.865693167477916, + "grad_norm": 0.04137939214706421, + "learning_rate": 6.230365812425445e-05, + "loss": 0.0119, + "step": 26990 + }, + { + "epoch": 4.86749594375338, + "grad_norm": 0.028815321624279022, + "learning_rate": 6.227694579519724e-05, + "loss": 0.0123, + "step": 27000 + }, + { + "epoch": 4.869298720028844, + "grad_norm": 0.046638958156108856, + "learning_rate": 6.225022973672805e-05, + "loss": 0.0126, + "step": 27010 + }, + { + "epoch": 4.871101496304309, + "grad_norm": 0.04523523896932602, + "learning_rate": 6.222350995696253e-05, + "loss": 0.0117, + "step": 27020 + }, + { + "epoch": 4.872904272579773, + "grad_norm": 0.03522033616900444, + "learning_rate": 6.21967864640174e-05, + "loss": 0.0142, + "step": 27030 + }, + { + "epoch": 4.874707048855237, + "grad_norm": 0.024224597960710526, + "learning_rate": 6.217005926601059e-05, + "loss": 0.0124, + "step": 27040 + }, + { + "epoch": 4.876509825130701, + "grad_norm": 0.03828557953238487, + "learning_rate": 6.214332837106111e-05, + "loss": 0.0121, + "step": 27050 + }, + { + "epoch": 4.878312601406165, + "grad_norm": 0.0358375608921051, + "learning_rate": 6.21165937872891e-05, + "loss": 0.0124, + "step": 27060 + }, + { + "epoch": 4.88011537768163, + "grad_norm": 0.02597762458026409, + "learning_rate": 6.208985552281582e-05, + "loss": 0.0119, + "step": 27070 + }, + { + "epoch": 4.881918153957094, + "grad_norm": 0.03602936118841171, + "learning_rate": 6.206311358576364e-05, + "loss": 0.0132, + "step": 27080 + }, + { + "epoch": 4.883720930232558, + "grad_norm": 0.03175625205039978, + "learning_rate": 6.203636798425608e-05, + "loss": 0.0132, + "step": 27090 + }, + { + "epoch": 4.885523706508023, + "grad_norm": 0.04793572425842285, + "learning_rate": 6.20096187264177e-05, + "loss": 0.0148, + "step": 27100 + }, + { + "epoch": 4.887326482783487, + "grad_norm": 0.03504430130124092, + "learning_rate": 6.198286582037425e-05, + "loss": 0.0125, + "step": 27110 + }, + { + "epoch": 4.889129259058951, + "grad_norm": 0.03222380205988884, + "learning_rate": 6.195610927425256e-05, + "loss": 0.0125, + "step": 27120 + }, + { + "epoch": 4.890932035334415, + "grad_norm": 0.0324045792222023, + "learning_rate": 6.192934909618056e-05, + "loss": 0.0117, + "step": 27130 + }, + { + "epoch": 4.892734811609879, + "grad_norm": 0.033244505524635315, + "learning_rate": 6.190258529428728e-05, + "loss": 0.0121, + "step": 27140 + }, + { + "epoch": 4.894537587885344, + "grad_norm": 0.04442642629146576, + "learning_rate": 6.187581787670285e-05, + "loss": 0.0142, + "step": 27150 + }, + { + "epoch": 4.896340364160808, + "grad_norm": 0.03551334887742996, + "learning_rate": 6.184904685155852e-05, + "loss": 0.0119, + "step": 27160 + }, + { + "epoch": 4.898143140436272, + "grad_norm": 0.03370974212884903, + "learning_rate": 6.18222722269866e-05, + "loss": 0.012, + "step": 27170 + }, + { + "epoch": 4.8999459167117365, + "grad_norm": 0.026604624465107918, + "learning_rate": 6.179549401112053e-05, + "loss": 0.0124, + "step": 27180 + }, + { + "epoch": 4.9017486929872005, + "grad_norm": 0.031832318753004074, + "learning_rate": 6.176871221209482e-05, + "loss": 0.0119, + "step": 27190 + }, + { + "epoch": 4.9035514692626645, + "grad_norm": 0.038619354367256165, + "learning_rate": 6.174192683804508e-05, + "loss": 0.0123, + "step": 27200 + }, + { + "epoch": 4.9053542455381285, + "grad_norm": 0.05523184314370155, + "learning_rate": 6.1715137897108e-05, + "loss": 0.0123, + "step": 27210 + }, + { + "epoch": 4.9071570218135925, + "grad_norm": 0.03690990060567856, + "learning_rate": 6.168834539742134e-05, + "loss": 0.0116, + "step": 27220 + }, + { + "epoch": 4.908959798089057, + "grad_norm": 0.036614011973142624, + "learning_rate": 6.166154934712397e-05, + "loss": 0.013, + "step": 27230 + }, + { + "epoch": 4.910762574364521, + "grad_norm": 0.020940544083714485, + "learning_rate": 6.163474975435581e-05, + "loss": 0.0119, + "step": 27240 + }, + { + "epoch": 4.912565350639985, + "grad_norm": 0.028566615656018257, + "learning_rate": 6.160794662725787e-05, + "loss": 0.0131, + "step": 27250 + }, + { + "epoch": 4.91436812691545, + "grad_norm": 0.048897840082645416, + "learning_rate": 6.158113997397222e-05, + "loss": 0.0126, + "step": 27260 + }, + { + "epoch": 4.916170903190914, + "grad_norm": 0.0354856513440609, + "learning_rate": 6.155432980264205e-05, + "loss": 0.012, + "step": 27270 + }, + { + "epoch": 4.917973679466378, + "grad_norm": 0.03143368288874626, + "learning_rate": 6.152751612141156e-05, + "loss": 0.013, + "step": 27280 + }, + { + "epoch": 4.919776455741842, + "grad_norm": 0.04854616895318031, + "learning_rate": 6.150069893842602e-05, + "loss": 0.0137, + "step": 27290 + }, + { + "epoch": 4.921579232017306, + "grad_norm": 0.03600136190652847, + "learning_rate": 6.147387826183182e-05, + "loss": 0.0123, + "step": 27300 + }, + { + "epoch": 4.923382008292771, + "grad_norm": 0.03550811856985092, + "learning_rate": 6.144705409977635e-05, + "loss": 0.0129, + "step": 27310 + }, + { + "epoch": 4.925184784568235, + "grad_norm": 0.04855992645025253, + "learning_rate": 6.142022646040808e-05, + "loss": 0.0132, + "step": 27320 + }, + { + "epoch": 4.926987560843699, + "grad_norm": 0.03572499752044678, + "learning_rate": 6.139339535187653e-05, + "loss": 0.012, + "step": 27330 + }, + { + "epoch": 4.928790337119164, + "grad_norm": 0.054072313010692596, + "learning_rate": 6.136656078233232e-05, + "loss": 0.0137, + "step": 27340 + }, + { + "epoch": 4.930593113394628, + "grad_norm": 0.03761845454573631, + "learning_rate": 6.133972275992707e-05, + "loss": 0.0119, + "step": 27350 + }, + { + "epoch": 4.932395889670092, + "grad_norm": 0.03017565608024597, + "learning_rate": 6.131288129281342e-05, + "loss": 0.0134, + "step": 27360 + }, + { + "epoch": 4.934198665945556, + "grad_norm": 0.038365982472896576, + "learning_rate": 6.128603638914516e-05, + "loss": 0.0132, + "step": 27370 + }, + { + "epoch": 4.93600144222102, + "grad_norm": 0.03447599336504936, + "learning_rate": 6.125918805707704e-05, + "loss": 0.0124, + "step": 27380 + }, + { + "epoch": 4.937804218496485, + "grad_norm": 0.025905100628733635, + "learning_rate": 6.123233630476485e-05, + "loss": 0.0117, + "step": 27390 + }, + { + "epoch": 4.939606994771949, + "grad_norm": 0.032403137534856796, + "learning_rate": 6.120548114036547e-05, + "loss": 0.0122, + "step": 27400 + }, + { + "epoch": 4.941409771047413, + "grad_norm": 0.030900051817297935, + "learning_rate": 6.117862257203679e-05, + "loss": 0.0122, + "step": 27410 + }, + { + "epoch": 4.943212547322878, + "grad_norm": 0.034499336034059525, + "learning_rate": 6.115176060793771e-05, + "loss": 0.0133, + "step": 27420 + }, + { + "epoch": 4.945015323598342, + "grad_norm": 0.03711819276213646, + "learning_rate": 6.112489525622822e-05, + "loss": 0.0135, + "step": 27430 + }, + { + "epoch": 4.946818099873806, + "grad_norm": 0.03200267255306244, + "learning_rate": 6.109802652506928e-05, + "loss": 0.0125, + "step": 27440 + }, + { + "epoch": 4.94862087614927, + "grad_norm": 0.03376936540007591, + "learning_rate": 6.107115442262291e-05, + "loss": 0.0111, + "step": 27450 + }, + { + "epoch": 4.9504236524247345, + "grad_norm": 0.03497292846441269, + "learning_rate": 6.104427895705214e-05, + "loss": 0.0125, + "step": 27460 + }, + { + "epoch": 4.9522264287001985, + "grad_norm": 0.032362375408411026, + "learning_rate": 6.101740013652103e-05, + "loss": 0.0126, + "step": 27470 + }, + { + "epoch": 4.9540292049756625, + "grad_norm": 0.03494282066822052, + "learning_rate": 6.099051796919465e-05, + "loss": 0.0125, + "step": 27480 + }, + { + "epoch": 4.9558319812511265, + "grad_norm": 0.05428651347756386, + "learning_rate": 6.096363246323911e-05, + "loss": 0.0124, + "step": 27490 + }, + { + "epoch": 4.957634757526591, + "grad_norm": 0.04408297315239906, + "learning_rate": 6.0936743626821504e-05, + "loss": 0.0137, + "step": 27500 + }, + { + "epoch": 4.959437533802055, + "grad_norm": 0.03198946267366409, + "learning_rate": 6.090985146810996e-05, + "loss": 0.0124, + "step": 27510 + }, + { + "epoch": 4.961240310077519, + "grad_norm": 0.028737051412463188, + "learning_rate": 6.088295599527357e-05, + "loss": 0.0117, + "step": 27520 + }, + { + "epoch": 4.963043086352983, + "grad_norm": 0.03804490342736244, + "learning_rate": 6.085605721648252e-05, + "loss": 0.0134, + "step": 27530 + }, + { + "epoch": 4.964845862628448, + "grad_norm": 0.034807540476322174, + "learning_rate": 6.082915513990792e-05, + "loss": 0.0141, + "step": 27540 + }, + { + "epoch": 4.966648638903912, + "grad_norm": 0.03627427667379379, + "learning_rate": 6.080224977372192e-05, + "loss": 0.0118, + "step": 27550 + }, + { + "epoch": 4.968451415179376, + "grad_norm": 0.033917635679244995, + "learning_rate": 6.0775341126097666e-05, + "loss": 0.0132, + "step": 27560 + }, + { + "epoch": 4.97025419145484, + "grad_norm": 0.03293414041399956, + "learning_rate": 6.074842920520926e-05, + "loss": 0.0123, + "step": 27570 + }, + { + "epoch": 4.972056967730305, + "grad_norm": 0.026023337617516518, + "learning_rate": 6.072151401923186e-05, + "loss": 0.0126, + "step": 27580 + }, + { + "epoch": 4.973859744005769, + "grad_norm": 0.03041137009859085, + "learning_rate": 6.069459557634159e-05, + "loss": 0.0136, + "step": 27590 + }, + { + "epoch": 4.975662520281233, + "grad_norm": 0.03221767395734787, + "learning_rate": 6.066767388471557e-05, + "loss": 0.0116, + "step": 27600 + }, + { + "epoch": 4.977465296556697, + "grad_norm": 0.043770525604486465, + "learning_rate": 6.064074895253188e-05, + "loss": 0.0132, + "step": 27610 + }, + { + "epoch": 4.979268072832162, + "grad_norm": 0.031290099024772644, + "learning_rate": 6.061382078796961e-05, + "loss": 0.0123, + "step": 27620 + }, + { + "epoch": 4.981070849107626, + "grad_norm": 0.031303923577070236, + "learning_rate": 6.0586889399208814e-05, + "loss": 0.0127, + "step": 27630 + }, + { + "epoch": 4.98287362538309, + "grad_norm": 0.052274394780397415, + "learning_rate": 6.0559954794430565e-05, + "loss": 0.0129, + "step": 27640 + }, + { + "epoch": 4.984676401658554, + "grad_norm": 0.03706236183643341, + "learning_rate": 6.053301698181687e-05, + "loss": 0.0135, + "step": 27650 + }, + { + "epoch": 4.986479177934019, + "grad_norm": 0.03283451870083809, + "learning_rate": 6.0506075969550725e-05, + "loss": 0.0125, + "step": 27660 + }, + { + "epoch": 4.988281954209483, + "grad_norm": 0.03406360000371933, + "learning_rate": 6.047913176581609e-05, + "loss": 0.0114, + "step": 27670 + }, + { + "epoch": 4.990084730484947, + "grad_norm": 0.03508257120847702, + "learning_rate": 6.0452184378797904e-05, + "loss": 0.013, + "step": 27680 + }, + { + "epoch": 4.991887506760411, + "grad_norm": 0.03802734240889549, + "learning_rate": 6.042523381668209e-05, + "loss": 0.0131, + "step": 27690 + }, + { + "epoch": 4.993690283035876, + "grad_norm": 0.030670730397105217, + "learning_rate": 6.03982800876555e-05, + "loss": 0.012, + "step": 27700 + }, + { + "epoch": 4.99549305931134, + "grad_norm": 0.031061161309480667, + "learning_rate": 6.0371323199905975e-05, + "loss": 0.0139, + "step": 27710 + }, + { + "epoch": 4.997295835586804, + "grad_norm": 0.02597402036190033, + "learning_rate": 6.03443631616223e-05, + "loss": 0.0122, + "step": 27720 + }, + { + "epoch": 4.999098611862268, + "grad_norm": 0.03564894571900368, + "learning_rate": 6.031739998099421e-05, + "loss": 0.0134, + "step": 27730 + }, + { + "epoch": 5.000901388137732, + "grad_norm": 0.029789695516228676, + "learning_rate": 6.029043366621243e-05, + "loss": 0.0115, + "step": 27740 + }, + { + "epoch": 5.002704164413196, + "grad_norm": 0.03893318399786949, + "learning_rate": 6.0263464225468615e-05, + "loss": 0.0138, + "step": 27750 + }, + { + "epoch": 5.00450694068866, + "grad_norm": 0.03659945726394653, + "learning_rate": 6.023649166695534e-05, + "loss": 0.0131, + "step": 27760 + }, + { + "epoch": 5.006309716964124, + "grad_norm": 0.04202880337834358, + "learning_rate": 6.0209515998866186e-05, + "loss": 0.0121, + "step": 27770 + }, + { + "epoch": 5.008112493239589, + "grad_norm": 0.03039456158876419, + "learning_rate": 6.018253722939563e-05, + "loss": 0.0132, + "step": 27780 + }, + { + "epoch": 5.009915269515053, + "grad_norm": 0.03260527551174164, + "learning_rate": 6.015555536673914e-05, + "loss": 0.0118, + "step": 27790 + }, + { + "epoch": 5.011718045790517, + "grad_norm": 0.03421454504132271, + "learning_rate": 6.0128570419093054e-05, + "loss": 0.0122, + "step": 27800 + }, + { + "epoch": 5.013520822065981, + "grad_norm": 0.030443711206316948, + "learning_rate": 6.010158239465471e-05, + "loss": 0.0123, + "step": 27810 + }, + { + "epoch": 5.015323598341446, + "grad_norm": 0.028168484568595886, + "learning_rate": 6.007459130162235e-05, + "loss": 0.0115, + "step": 27820 + }, + { + "epoch": 5.01712637461691, + "grad_norm": 0.031856387853622437, + "learning_rate": 6.004759714819516e-05, + "loss": 0.0141, + "step": 27830 + }, + { + "epoch": 5.018929150892374, + "grad_norm": 0.040466636419296265, + "learning_rate": 6.002059994257323e-05, + "loss": 0.0135, + "step": 27840 + }, + { + "epoch": 5.020731927167838, + "grad_norm": 0.029774270951747894, + "learning_rate": 5.999359969295764e-05, + "loss": 0.0124, + "step": 27850 + }, + { + "epoch": 5.022534703443303, + "grad_norm": 0.028157807886600494, + "learning_rate": 5.9966596407550314e-05, + "loss": 0.0119, + "step": 27860 + }, + { + "epoch": 5.024337479718767, + "grad_norm": 0.032195307314395905, + "learning_rate": 5.993959009455416e-05, + "loss": 0.0126, + "step": 27870 + }, + { + "epoch": 5.026140255994231, + "grad_norm": 0.027672218158841133, + "learning_rate": 5.991258076217298e-05, + "loss": 0.0127, + "step": 27880 + }, + { + "epoch": 5.027943032269695, + "grad_norm": 0.033116698265075684, + "learning_rate": 5.988556841861147e-05, + "loss": 0.0124, + "step": 27890 + }, + { + "epoch": 5.02974580854516, + "grad_norm": 0.03327102214097977, + "learning_rate": 5.985855307207531e-05, + "loss": 0.0137, + "step": 27900 + }, + { + "epoch": 5.031548584820624, + "grad_norm": 0.029689036309719086, + "learning_rate": 5.9831534730771e-05, + "loss": 0.0122, + "step": 27910 + }, + { + "epoch": 5.033351361096088, + "grad_norm": 0.039038002490997314, + "learning_rate": 5.980451340290605e-05, + "loss": 0.012, + "step": 27920 + }, + { + "epoch": 5.035154137371552, + "grad_norm": 0.03737068176269531, + "learning_rate": 5.97774890966888e-05, + "loss": 0.0134, + "step": 27930 + }, + { + "epoch": 5.036956913647017, + "grad_norm": 0.03169916197657585, + "learning_rate": 5.975046182032851e-05, + "loss": 0.0112, + "step": 27940 + }, + { + "epoch": 5.038759689922481, + "grad_norm": 0.0420771948993206, + "learning_rate": 5.972343158203537e-05, + "loss": 0.013, + "step": 27950 + }, + { + "epoch": 5.040562466197945, + "grad_norm": 0.0674474760890007, + "learning_rate": 5.969639839002045e-05, + "loss": 0.0134, + "step": 27960 + }, + { + "epoch": 5.042365242473409, + "grad_norm": 0.026297645643353462, + "learning_rate": 5.966936225249572e-05, + "loss": 0.0121, + "step": 27970 + }, + { + "epoch": 5.0441680187488735, + "grad_norm": 0.03044302389025688, + "learning_rate": 5.9642323177674044e-05, + "loss": 0.0132, + "step": 27980 + }, + { + "epoch": 5.0459707950243375, + "grad_norm": 0.023887572810053825, + "learning_rate": 5.9615281173769154e-05, + "loss": 0.0122, + "step": 27990 + }, + { + "epoch": 5.0477735712998015, + "grad_norm": 0.03779621794819832, + "learning_rate": 5.958823624899574e-05, + "loss": 0.0126, + "step": 28000 + }, + { + "epoch": 5.0495763475752655, + "grad_norm": 0.03737788274884224, + "learning_rate": 5.956118841156933e-05, + "loss": 0.0129, + "step": 28010 + }, + { + "epoch": 5.05137912385073, + "grad_norm": 0.03048078715801239, + "learning_rate": 5.953413766970631e-05, + "loss": 0.0118, + "step": 28020 + }, + { + "epoch": 5.053181900126194, + "grad_norm": 0.03554223105311394, + "learning_rate": 5.9507084031624e-05, + "loss": 0.0133, + "step": 28030 + }, + { + "epoch": 5.054984676401658, + "grad_norm": 0.034310661256313324, + "learning_rate": 5.948002750554058e-05, + "loss": 0.0117, + "step": 28040 + }, + { + "epoch": 5.056787452677122, + "grad_norm": 0.029481856152415276, + "learning_rate": 5.9452968099675124e-05, + "loss": 0.0121, + "step": 28050 + }, + { + "epoch": 5.058590228952587, + "grad_norm": 0.04133079573512077, + "learning_rate": 5.9425905822247527e-05, + "loss": 0.0116, + "step": 28060 + }, + { + "epoch": 5.060393005228051, + "grad_norm": 0.04458780959248543, + "learning_rate": 5.939884068147864e-05, + "loss": 0.0127, + "step": 28070 + }, + { + "epoch": 5.062195781503515, + "grad_norm": 0.03420090302824974, + "learning_rate": 5.937177268559011e-05, + "loss": 0.0121, + "step": 28080 + }, + { + "epoch": 5.063998557778979, + "grad_norm": 0.036862507462501526, + "learning_rate": 5.934470184280448e-05, + "loss": 0.0122, + "step": 28090 + }, + { + "epoch": 5.065801334054444, + "grad_norm": 0.038637567311525345, + "learning_rate": 5.931762816134516e-05, + "loss": 0.0143, + "step": 28100 + }, + { + "epoch": 5.067604110329908, + "grad_norm": 0.033395882695913315, + "learning_rate": 5.9290551649436434e-05, + "loss": 0.0124, + "step": 28110 + }, + { + "epoch": 5.069406886605372, + "grad_norm": 0.027463359758257866, + "learning_rate": 5.9263472315303416e-05, + "loss": 0.0125, + "step": 28120 + }, + { + "epoch": 5.071209662880836, + "grad_norm": 0.028686290606856346, + "learning_rate": 5.9236390167172096e-05, + "loss": 0.0125, + "step": 28130 + }, + { + "epoch": 5.073012439156301, + "grad_norm": 0.03756026178598404, + "learning_rate": 5.920930521326932e-05, + "loss": 0.012, + "step": 28140 + }, + { + "epoch": 5.074815215431765, + "grad_norm": 0.035746995359659195, + "learning_rate": 5.918221746182276e-05, + "loss": 0.0123, + "step": 28150 + }, + { + "epoch": 5.076617991707229, + "grad_norm": 0.031080571934580803, + "learning_rate": 5.9155126921061e-05, + "loss": 0.0128, + "step": 28160 + }, + { + "epoch": 5.078420767982693, + "grad_norm": 0.03374682739377022, + "learning_rate": 5.91280335992134e-05, + "loss": 0.012, + "step": 28170 + }, + { + "epoch": 5.080223544258158, + "grad_norm": 0.03387926146388054, + "learning_rate": 5.91009375045102e-05, + "loss": 0.0129, + "step": 28180 + }, + { + "epoch": 5.082026320533622, + "grad_norm": 0.02966706082224846, + "learning_rate": 5.9073838645182476e-05, + "loss": 0.0118, + "step": 28190 + }, + { + "epoch": 5.083829096809086, + "grad_norm": 0.030686788260936737, + "learning_rate": 5.904673702946217e-05, + "loss": 0.0113, + "step": 28200 + }, + { + "epoch": 5.08563187308455, + "grad_norm": 0.03864374756813049, + "learning_rate": 5.9019632665582004e-05, + "loss": 0.0124, + "step": 28210 + }, + { + "epoch": 5.087434649360015, + "grad_norm": 0.029510820284485817, + "learning_rate": 5.899252556177559e-05, + "loss": 0.0116, + "step": 28220 + }, + { + "epoch": 5.089237425635479, + "grad_norm": 0.02825387753546238, + "learning_rate": 5.896541572627735e-05, + "loss": 0.0111, + "step": 28230 + }, + { + "epoch": 5.091040201910943, + "grad_norm": 0.023953892290592194, + "learning_rate": 5.893830316732253e-05, + "loss": 0.0122, + "step": 28240 + }, + { + "epoch": 5.092842978186407, + "grad_norm": 0.023725682869553566, + "learning_rate": 5.8911187893147214e-05, + "loss": 0.0112, + "step": 28250 + }, + { + "epoch": 5.0946457544618715, + "grad_norm": 0.04563118517398834, + "learning_rate": 5.888406991198828e-05, + "loss": 0.0139, + "step": 28260 + }, + { + "epoch": 5.0964485307373355, + "grad_norm": 0.03235441818833351, + "learning_rate": 5.885694923208349e-05, + "loss": 0.0131, + "step": 28270 + }, + { + "epoch": 5.0982513070127995, + "grad_norm": 0.035668693482875824, + "learning_rate": 5.882982586167138e-05, + "loss": 0.0119, + "step": 28280 + }, + { + "epoch": 5.1000540832882635, + "grad_norm": 0.04168926179409027, + "learning_rate": 5.880269980899131e-05, + "loss": 0.0118, + "step": 28290 + }, + { + "epoch": 5.101856859563728, + "grad_norm": 0.035474225878715515, + "learning_rate": 5.8775571082283465e-05, + "loss": 0.0118, + "step": 28300 + }, + { + "epoch": 5.103659635839192, + "grad_norm": 0.035137079656124115, + "learning_rate": 5.8748439689788824e-05, + "loss": 0.0134, + "step": 28310 + }, + { + "epoch": 5.105462412114656, + "grad_norm": 0.02779713086783886, + "learning_rate": 5.87213056397492e-05, + "loss": 0.0119, + "step": 28320 + }, + { + "epoch": 5.107265188390121, + "grad_norm": 0.030110465362668037, + "learning_rate": 5.869416894040719e-05, + "loss": 0.0129, + "step": 28330 + }, + { + "epoch": 5.109067964665585, + "grad_norm": 0.04724188521504402, + "learning_rate": 5.866702960000621e-05, + "loss": 0.0139, + "step": 28340 + }, + { + "epoch": 5.110870740941049, + "grad_norm": 0.05827842652797699, + "learning_rate": 5.863988762679048e-05, + "loss": 0.0123, + "step": 28350 + }, + { + "epoch": 5.112673517216513, + "grad_norm": 0.03689935803413391, + "learning_rate": 5.8612743029005e-05, + "loss": 0.012, + "step": 28360 + }, + { + "epoch": 5.114476293491978, + "grad_norm": 0.0318593792617321, + "learning_rate": 5.858559581489561e-05, + "loss": 0.0127, + "step": 28370 + }, + { + "epoch": 5.116279069767442, + "grad_norm": 0.03333774581551552, + "learning_rate": 5.85584459927089e-05, + "loss": 0.0118, + "step": 28380 + }, + { + "epoch": 5.118081846042906, + "grad_norm": 0.026336301118135452, + "learning_rate": 5.853129357069227e-05, + "loss": 0.0119, + "step": 28390 + }, + { + "epoch": 5.11988462231837, + "grad_norm": 0.03407811000943184, + "learning_rate": 5.8504138557093913e-05, + "loss": 0.0124, + "step": 28400 + }, + { + "epoch": 5.121687398593835, + "grad_norm": 0.03073226474225521, + "learning_rate": 5.8476980960162784e-05, + "loss": 0.0125, + "step": 28410 + }, + { + "epoch": 5.123490174869299, + "grad_norm": 0.030720625072717667, + "learning_rate": 5.844982078814868e-05, + "loss": 0.0124, + "step": 28420 + }, + { + "epoch": 5.125292951144763, + "grad_norm": 0.0375964418053627, + "learning_rate": 5.842265804930211e-05, + "loss": 0.0124, + "step": 28430 + }, + { + "epoch": 5.127095727420227, + "grad_norm": 0.02812187932431698, + "learning_rate": 5.839549275187444e-05, + "loss": 0.0127, + "step": 28440 + }, + { + "epoch": 5.128898503695692, + "grad_norm": 0.037454620003700256, + "learning_rate": 5.836832490411771e-05, + "loss": 0.0127, + "step": 28450 + }, + { + "epoch": 5.130701279971156, + "grad_norm": 0.05118027701973915, + "learning_rate": 5.834115451428485e-05, + "loss": 0.012, + "step": 28460 + }, + { + "epoch": 5.13250405624662, + "grad_norm": 0.02471534162759781, + "learning_rate": 5.831398159062946e-05, + "loss": 0.0121, + "step": 28470 + }, + { + "epoch": 5.134306832522084, + "grad_norm": 0.029935011640191078, + "learning_rate": 5.828680614140599e-05, + "loss": 0.0131, + "step": 28480 + }, + { + "epoch": 5.136109608797549, + "grad_norm": 0.031013231724500656, + "learning_rate": 5.825962817486962e-05, + "loss": 0.0128, + "step": 28490 + }, + { + "epoch": 5.137912385073013, + "grad_norm": 0.03063436783850193, + "learning_rate": 5.823244769927629e-05, + "loss": 0.0137, + "step": 28500 + }, + { + "epoch": 5.139715161348477, + "grad_norm": 0.038766708225011826, + "learning_rate": 5.8205264722882716e-05, + "loss": 0.0132, + "step": 28510 + }, + { + "epoch": 5.141517937623941, + "grad_norm": 0.030862826853990555, + "learning_rate": 5.817807925394636e-05, + "loss": 0.0127, + "step": 28520 + }, + { + "epoch": 5.1433207138994055, + "grad_norm": 0.03169157728552818, + "learning_rate": 5.815089130072546e-05, + "loss": 0.0118, + "step": 28530 + }, + { + "epoch": 5.1451234901748695, + "grad_norm": 0.02879260666668415, + "learning_rate": 5.8123700871479e-05, + "loss": 0.0136, + "step": 28540 + }, + { + "epoch": 5.1469262664503335, + "grad_norm": 0.029133619740605354, + "learning_rate": 5.809650797446671e-05, + "loss": 0.0117, + "step": 28550 + }, + { + "epoch": 5.1487290427257975, + "grad_norm": 0.02353537082672119, + "learning_rate": 5.806931261794907e-05, + "loss": 0.0123, + "step": 28560 + }, + { + "epoch": 5.150531819001262, + "grad_norm": 0.029341885820031166, + "learning_rate": 5.804211481018731e-05, + "loss": 0.0129, + "step": 28570 + }, + { + "epoch": 5.152334595276726, + "grad_norm": 0.031569406390190125, + "learning_rate": 5.801491455944341e-05, + "loss": 0.0124, + "step": 28580 + }, + { + "epoch": 5.15413737155219, + "grad_norm": 0.04413260519504547, + "learning_rate": 5.79877118739801e-05, + "loss": 0.0123, + "step": 28590 + }, + { + "epoch": 5.155940147827654, + "grad_norm": 0.030583176761865616, + "learning_rate": 5.7960506762060816e-05, + "loss": 0.0116, + "step": 28600 + }, + { + "epoch": 5.157742924103119, + "grad_norm": 0.0472293421626091, + "learning_rate": 5.793329923194977e-05, + "loss": 0.0123, + "step": 28610 + }, + { + "epoch": 5.159545700378583, + "grad_norm": 0.02850320190191269, + "learning_rate": 5.790608929191187e-05, + "loss": 0.0117, + "step": 28620 + }, + { + "epoch": 5.161348476654047, + "grad_norm": 0.03608838468790054, + "learning_rate": 5.78788769502128e-05, + "loss": 0.0117, + "step": 28630 + }, + { + "epoch": 5.163151252929511, + "grad_norm": 0.031599078327417374, + "learning_rate": 5.785166221511894e-05, + "loss": 0.0118, + "step": 28640 + }, + { + "epoch": 5.164954029204976, + "grad_norm": 0.027840670198202133, + "learning_rate": 5.7824445094897415e-05, + "loss": 0.013, + "step": 28650 + }, + { + "epoch": 5.16675680548044, + "grad_norm": 0.025738004595041275, + "learning_rate": 5.7797225597816065e-05, + "loss": 0.0117, + "step": 28660 + }, + { + "epoch": 5.168559581755904, + "grad_norm": 0.03966360539197922, + "learning_rate": 5.777000373214345e-05, + "loss": 0.0129, + "step": 28670 + }, + { + "epoch": 5.170362358031368, + "grad_norm": 0.03199717402458191, + "learning_rate": 5.774277950614885e-05, + "loss": 0.0126, + "step": 28680 + }, + { + "epoch": 5.172165134306833, + "grad_norm": 0.03687558323144913, + "learning_rate": 5.771555292810227e-05, + "loss": 0.0127, + "step": 28690 + }, + { + "epoch": 5.173967910582297, + "grad_norm": 0.02827855385839939, + "learning_rate": 5.768832400627444e-05, + "loss": 0.0112, + "step": 28700 + }, + { + "epoch": 5.175770686857761, + "grad_norm": 0.036785490810871124, + "learning_rate": 5.7661092748936775e-05, + "loss": 0.0118, + "step": 28710 + }, + { + "epoch": 5.177573463133225, + "grad_norm": 0.03999381884932518, + "learning_rate": 5.76338591643614e-05, + "loss": 0.0121, + "step": 28720 + }, + { + "epoch": 5.17937623940869, + "grad_norm": 0.03653955087065697, + "learning_rate": 5.760662326082118e-05, + "loss": 0.012, + "step": 28730 + }, + { + "epoch": 5.181179015684154, + "grad_norm": 0.04651894420385361, + "learning_rate": 5.757938504658965e-05, + "loss": 0.0112, + "step": 28740 + }, + { + "epoch": 5.182981791959618, + "grad_norm": 0.04207458719611168, + "learning_rate": 5.755214452994107e-05, + "loss": 0.013, + "step": 28750 + }, + { + "epoch": 5.184784568235082, + "grad_norm": 0.04022625461220741, + "learning_rate": 5.752490171915039e-05, + "loss": 0.013, + "step": 28760 + }, + { + "epoch": 5.186587344510547, + "grad_norm": 0.03379439935088158, + "learning_rate": 5.749765662249324e-05, + "loss": 0.0118, + "step": 28770 + }, + { + "epoch": 5.188390120786011, + "grad_norm": 0.05236967280507088, + "learning_rate": 5.747040924824596e-05, + "loss": 0.0139, + "step": 28780 + }, + { + "epoch": 5.190192897061475, + "grad_norm": 0.04127820208668709, + "learning_rate": 5.7443159604685613e-05, + "loss": 0.0124, + "step": 28790 + }, + { + "epoch": 5.191995673336939, + "grad_norm": 0.0369475819170475, + "learning_rate": 5.74159077000899e-05, + "loss": 0.0119, + "step": 28800 + }, + { + "epoch": 5.1937984496124034, + "grad_norm": 0.04646328464150429, + "learning_rate": 5.7388653542737235e-05, + "loss": 0.0117, + "step": 28810 + }, + { + "epoch": 5.195601225887867, + "grad_norm": 0.036706943064928055, + "learning_rate": 5.736139714090672e-05, + "loss": 0.0123, + "step": 28820 + }, + { + "epoch": 5.197404002163331, + "grad_norm": 0.03286447376012802, + "learning_rate": 5.73341385028781e-05, + "loss": 0.013, + "step": 28830 + }, + { + "epoch": 5.199206778438795, + "grad_norm": 0.02809976413846016, + "learning_rate": 5.7306877636931855e-05, + "loss": 0.011, + "step": 28840 + }, + { + "epoch": 5.20100955471426, + "grad_norm": 0.03087390400469303, + "learning_rate": 5.7279614551349125e-05, + "loss": 0.0126, + "step": 28850 + }, + { + "epoch": 5.202812330989724, + "grad_norm": 0.03772282227873802, + "learning_rate": 5.725234925441169e-05, + "loss": 0.0127, + "step": 28860 + }, + { + "epoch": 5.204615107265188, + "grad_norm": 0.04853463172912598, + "learning_rate": 5.7225081754402044e-05, + "loss": 0.013, + "step": 28870 + }, + { + "epoch": 5.206417883540652, + "grad_norm": 0.04236651957035065, + "learning_rate": 5.7197812059603326e-05, + "loss": 0.0125, + "step": 28880 + }, + { + "epoch": 5.208220659816117, + "grad_norm": 0.04262363538146019, + "learning_rate": 5.717054017829934e-05, + "loss": 0.012, + "step": 28890 + }, + { + "epoch": 5.210023436091581, + "grad_norm": 0.040022969245910645, + "learning_rate": 5.7143266118774584e-05, + "loss": 0.0123, + "step": 28900 + }, + { + "epoch": 5.211826212367045, + "grad_norm": 0.04652688652276993, + "learning_rate": 5.711598988931418e-05, + "loss": 0.014, + "step": 28910 + }, + { + "epoch": 5.213628988642509, + "grad_norm": 0.05275602638721466, + "learning_rate": 5.7088711498203954e-05, + "loss": 0.0136, + "step": 28920 + }, + { + "epoch": 5.215431764917974, + "grad_norm": 0.03163760155439377, + "learning_rate": 5.706143095373033e-05, + "loss": 0.012, + "step": 28930 + }, + { + "epoch": 5.217234541193438, + "grad_norm": 0.03394268825650215, + "learning_rate": 5.703414826418042e-05, + "loss": 0.0116, + "step": 28940 + }, + { + "epoch": 5.219037317468902, + "grad_norm": 0.03242777660489082, + "learning_rate": 5.7006863437842007e-05, + "loss": 0.0124, + "step": 28950 + }, + { + "epoch": 5.220840093744366, + "grad_norm": 0.027294114232063293, + "learning_rate": 5.697957648300348e-05, + "loss": 0.0117, + "step": 28960 + }, + { + "epoch": 5.222642870019831, + "grad_norm": 0.033308982849121094, + "learning_rate": 5.695228740795391e-05, + "loss": 0.0119, + "step": 28970 + }, + { + "epoch": 5.224445646295295, + "grad_norm": 0.029074421152472496, + "learning_rate": 5.6924996220982985e-05, + "loss": 0.0126, + "step": 28980 + }, + { + "epoch": 5.226248422570759, + "grad_norm": 0.048360977321863174, + "learning_rate": 5.6897702930381045e-05, + "loss": 0.013, + "step": 28990 + }, + { + "epoch": 5.228051198846223, + "grad_norm": 0.036591239273548126, + "learning_rate": 5.687040754443908e-05, + "loss": 0.0117, + "step": 29000 + }, + { + "epoch": 5.229853975121688, + "grad_norm": 0.031699858605861664, + "learning_rate": 5.6843110071448725e-05, + "loss": 0.0129, + "step": 29010 + }, + { + "epoch": 5.231656751397152, + "grad_norm": 0.03304695338010788, + "learning_rate": 5.6815810519702194e-05, + "loss": 0.0125, + "step": 29020 + }, + { + "epoch": 5.233459527672616, + "grad_norm": 0.026115689426660538, + "learning_rate": 5.6788508897492396e-05, + "loss": 0.0118, + "step": 29030 + }, + { + "epoch": 5.23526230394808, + "grad_norm": 0.03414340317249298, + "learning_rate": 5.676120521311282e-05, + "loss": 0.0131, + "step": 29040 + }, + { + "epoch": 5.2370650802235446, + "grad_norm": 0.03219788894057274, + "learning_rate": 5.6733899474857634e-05, + "loss": 0.0127, + "step": 29050 + }, + { + "epoch": 5.2388678564990085, + "grad_norm": 0.03453666344285011, + "learning_rate": 5.670659169102157e-05, + "loss": 0.0122, + "step": 29060 + }, + { + "epoch": 5.2406706327744725, + "grad_norm": 0.03021911159157753, + "learning_rate": 5.6679281869900044e-05, + "loss": 0.0136, + "step": 29070 + }, + { + "epoch": 5.2424734090499365, + "grad_norm": 0.025377530604600906, + "learning_rate": 5.6651970019789045e-05, + "loss": 0.012, + "step": 29080 + }, + { + "epoch": 5.244276185325401, + "grad_norm": 0.031561125069856644, + "learning_rate": 5.662465614898519e-05, + "loss": 0.0124, + "step": 29090 + }, + { + "epoch": 5.246078961600865, + "grad_norm": 0.032976265996694565, + "learning_rate": 5.6597340265785695e-05, + "loss": 0.0119, + "step": 29100 + }, + { + "epoch": 5.247881737876329, + "grad_norm": 0.03317051753401756, + "learning_rate": 5.657002237848843e-05, + "loss": 0.0125, + "step": 29110 + }, + { + "epoch": 5.249684514151793, + "grad_norm": 0.03329414129257202, + "learning_rate": 5.654270249539183e-05, + "loss": 0.0118, + "step": 29120 + }, + { + "epoch": 5.251487290427258, + "grad_norm": 0.03199499100446701, + "learning_rate": 5.651538062479498e-05, + "loss": 0.0116, + "step": 29130 + }, + { + "epoch": 5.253290066702722, + "grad_norm": 0.02604283206164837, + "learning_rate": 5.648805677499751e-05, + "loss": 0.0122, + "step": 29140 + }, + { + "epoch": 5.255092842978186, + "grad_norm": 0.02844548225402832, + "learning_rate": 5.646073095429969e-05, + "loss": 0.0117, + "step": 29150 + }, + { + "epoch": 5.25689561925365, + "grad_norm": 0.02529033087193966, + "learning_rate": 5.643340317100241e-05, + "loss": 0.012, + "step": 29160 + }, + { + "epoch": 5.258698395529115, + "grad_norm": 0.047921642661094666, + "learning_rate": 5.64060734334071e-05, + "loss": 0.013, + "step": 29170 + }, + { + "epoch": 5.260501171804579, + "grad_norm": 0.02573951706290245, + "learning_rate": 5.637874174981583e-05, + "loss": 0.0129, + "step": 29180 + }, + { + "epoch": 5.262303948080043, + "grad_norm": 0.03397450968623161, + "learning_rate": 5.635140812853124e-05, + "loss": 0.0119, + "step": 29190 + }, + { + "epoch": 5.264106724355507, + "grad_norm": 0.04451403394341469, + "learning_rate": 5.6324072577856544e-05, + "loss": 0.0121, + "step": 29200 + }, + { + "epoch": 5.265909500630972, + "grad_norm": 0.027296623215079308, + "learning_rate": 5.629673510609559e-05, + "loss": 0.0121, + "step": 29210 + }, + { + "epoch": 5.267712276906436, + "grad_norm": 0.03864866495132446, + "learning_rate": 5.626939572155276e-05, + "loss": 0.0113, + "step": 29220 + }, + { + "epoch": 5.2695150531819, + "grad_norm": 0.02468748949468136, + "learning_rate": 5.6242054432533054e-05, + "loss": 0.0119, + "step": 29230 + }, + { + "epoch": 5.271317829457364, + "grad_norm": 0.04279787093400955, + "learning_rate": 5.621471124734201e-05, + "loss": 0.0128, + "step": 29240 + }, + { + "epoch": 5.273120605732829, + "grad_norm": 0.02931690402328968, + "learning_rate": 5.6187366174285794e-05, + "loss": 0.0107, + "step": 29250 + }, + { + "epoch": 5.274923382008293, + "grad_norm": 0.029513049870729446, + "learning_rate": 5.616001922167109e-05, + "loss": 0.0122, + "step": 29260 + }, + { + "epoch": 5.276726158283757, + "grad_norm": 0.023820960894227028, + "learning_rate": 5.61326703978052e-05, + "loss": 0.0125, + "step": 29270 + }, + { + "epoch": 5.278528934559221, + "grad_norm": 0.030634528025984764, + "learning_rate": 5.6105319710995964e-05, + "loss": 0.0124, + "step": 29280 + }, + { + "epoch": 5.280331710834686, + "grad_norm": 0.03769797086715698, + "learning_rate": 5.60779671695518e-05, + "loss": 0.0114, + "step": 29290 + }, + { + "epoch": 5.28213448711015, + "grad_norm": 0.022610878571867943, + "learning_rate": 5.6050612781781684e-05, + "loss": 0.0115, + "step": 29300 + }, + { + "epoch": 5.283937263385614, + "grad_norm": 0.03346133977174759, + "learning_rate": 5.602325655599516e-05, + "loss": 0.0114, + "step": 29310 + }, + { + "epoch": 5.2857400396610785, + "grad_norm": 0.04937040060758591, + "learning_rate": 5.599589850050234e-05, + "loss": 0.0132, + "step": 29320 + }, + { + "epoch": 5.2875428159365425, + "grad_norm": 0.03571958839893341, + "learning_rate": 5.5968538623613874e-05, + "loss": 0.0121, + "step": 29330 + }, + { + "epoch": 5.2893455922120065, + "grad_norm": 0.03229149803519249, + "learning_rate": 5.594117693364095e-05, + "loss": 0.013, + "step": 29340 + }, + { + "epoch": 5.2911483684874705, + "grad_norm": 0.023779766634106636, + "learning_rate": 5.591381343889535e-05, + "loss": 0.0115, + "step": 29350 + }, + { + "epoch": 5.2929511447629345, + "grad_norm": 0.030868684872984886, + "learning_rate": 5.5886448147689355e-05, + "loss": 0.0109, + "step": 29360 + }, + { + "epoch": 5.294753921038399, + "grad_norm": 0.04091461002826691, + "learning_rate": 5.585908106833585e-05, + "loss": 0.0124, + "step": 29370 + }, + { + "epoch": 5.296556697313863, + "grad_norm": 0.03383997827768326, + "learning_rate": 5.5831712209148226e-05, + "loss": 0.0115, + "step": 29380 + }, + { + "epoch": 5.298359473589327, + "grad_norm": 0.04055332764983177, + "learning_rate": 5.58043415784404e-05, + "loss": 0.0122, + "step": 29390 + }, + { + "epoch": 5.300162249864792, + "grad_norm": 0.032377395778894424, + "learning_rate": 5.577696918452686e-05, + "loss": 0.013, + "step": 29400 + }, + { + "epoch": 5.301965026140256, + "grad_norm": 0.032375603914260864, + "learning_rate": 5.5749595035722604e-05, + "loss": 0.0115, + "step": 29410 + }, + { + "epoch": 5.30376780241572, + "grad_norm": 0.04060519114136696, + "learning_rate": 5.5722219140343193e-05, + "loss": 0.0132, + "step": 29420 + }, + { + "epoch": 5.305570578691184, + "grad_norm": 0.040074899792671204, + "learning_rate": 5.56948415067047e-05, + "loss": 0.0129, + "step": 29430 + }, + { + "epoch": 5.307373354966648, + "grad_norm": 0.0320710614323616, + "learning_rate": 5.5667462143123704e-05, + "loss": 0.0128, + "step": 29440 + }, + { + "epoch": 5.309176131242113, + "grad_norm": 0.02577672153711319, + "learning_rate": 5.564008105791737e-05, + "loss": 0.0126, + "step": 29450 + }, + { + "epoch": 5.310978907517577, + "grad_norm": 0.03490905463695526, + "learning_rate": 5.5612698259403316e-05, + "loss": 0.0121, + "step": 29460 + }, + { + "epoch": 5.312781683793041, + "grad_norm": 0.03948046639561653, + "learning_rate": 5.5585313755899724e-05, + "loss": 0.0122, + "step": 29470 + }, + { + "epoch": 5.314584460068506, + "grad_norm": 0.03000693768262863, + "learning_rate": 5.5557927555725285e-05, + "loss": 0.0106, + "step": 29480 + }, + { + "epoch": 5.31638723634397, + "grad_norm": 0.02530750446021557, + "learning_rate": 5.55305396671992e-05, + "loss": 0.0122, + "step": 29490 + }, + { + "epoch": 5.318190012619434, + "grad_norm": 0.026811150833964348, + "learning_rate": 5.55031500986412e-05, + "loss": 0.0116, + "step": 29500 + }, + { + "epoch": 5.319992788894898, + "grad_norm": 0.03169972077012062, + "learning_rate": 5.547575885837149e-05, + "loss": 0.012, + "step": 29510 + }, + { + "epoch": 5.321795565170362, + "grad_norm": 0.036693647503852844, + "learning_rate": 5.5448365954710825e-05, + "loss": 0.0113, + "step": 29520 + }, + { + "epoch": 5.323598341445827, + "grad_norm": 0.02540874481201172, + "learning_rate": 5.5420971395980446e-05, + "loss": 0.0114, + "step": 29530 + }, + { + "epoch": 5.325401117721291, + "grad_norm": 0.03143942356109619, + "learning_rate": 5.539357519050209e-05, + "loss": 0.011, + "step": 29540 + }, + { + "epoch": 5.327203893996755, + "grad_norm": 0.031543608754873276, + "learning_rate": 5.536617734659799e-05, + "loss": 0.0125, + "step": 29550 + }, + { + "epoch": 5.32900667027222, + "grad_norm": 0.04808424413204193, + "learning_rate": 5.533877787259091e-05, + "loss": 0.0126, + "step": 29560 + }, + { + "epoch": 5.330809446547684, + "grad_norm": 0.037209440022706985, + "learning_rate": 5.5311376776804044e-05, + "loss": 0.0116, + "step": 29570 + }, + { + "epoch": 5.332612222823148, + "grad_norm": 0.02935945987701416, + "learning_rate": 5.528397406756118e-05, + "loss": 0.011, + "step": 29580 + }, + { + "epoch": 5.334414999098612, + "grad_norm": 0.02853682078421116, + "learning_rate": 5.525656975318652e-05, + "loss": 0.0137, + "step": 29590 + }, + { + "epoch": 5.3362177753740765, + "grad_norm": 0.03490554541349411, + "learning_rate": 5.522916384200474e-05, + "loss": 0.0108, + "step": 29600 + }, + { + "epoch": 5.3380205516495405, + "grad_norm": 0.04060240089893341, + "learning_rate": 5.520175634234106e-05, + "loss": 0.0126, + "step": 29610 + }, + { + "epoch": 5.3398233279250045, + "grad_norm": 0.03848614916205406, + "learning_rate": 5.517434726252113e-05, + "loss": 0.0114, + "step": 29620 + }, + { + "epoch": 5.3416261042004685, + "grad_norm": 0.05165014788508415, + "learning_rate": 5.514693661087113e-05, + "loss": 0.0103, + "step": 29630 + }, + { + "epoch": 5.343428880475933, + "grad_norm": 0.02856101095676422, + "learning_rate": 5.511952439571769e-05, + "loss": 0.0132, + "step": 29640 + }, + { + "epoch": 5.345231656751397, + "grad_norm": 0.02860461361706257, + "learning_rate": 5.509211062538791e-05, + "loss": 0.0117, + "step": 29650 + }, + { + "epoch": 5.347034433026861, + "grad_norm": 0.030598053708672523, + "learning_rate": 5.506469530820939e-05, + "loss": 0.012, + "step": 29660 + }, + { + "epoch": 5.348837209302325, + "grad_norm": 0.030391210690140724, + "learning_rate": 5.503727845251014e-05, + "loss": 0.0126, + "step": 29670 + }, + { + "epoch": 5.35063998557779, + "grad_norm": 0.031918514519929886, + "learning_rate": 5.50098600666187e-05, + "loss": 0.0113, + "step": 29680 + }, + { + "epoch": 5.352442761853254, + "grad_norm": 0.039375778287649155, + "learning_rate": 5.498244015886406e-05, + "loss": 0.0123, + "step": 29690 + }, + { + "epoch": 5.354245538128718, + "grad_norm": 0.03443246707320213, + "learning_rate": 5.495501873757565e-05, + "loss": 0.0119, + "step": 29700 + }, + { + "epoch": 5.356048314404182, + "grad_norm": 0.026564963161945343, + "learning_rate": 5.492759581108336e-05, + "loss": 0.0123, + "step": 29710 + }, + { + "epoch": 5.357851090679647, + "grad_norm": 0.035584643483161926, + "learning_rate": 5.490017138771759e-05, + "loss": 0.0115, + "step": 29720 + }, + { + "epoch": 5.359653866955111, + "grad_norm": 0.028256235644221306, + "learning_rate": 5.487274547580912e-05, + "loss": 0.012, + "step": 29730 + }, + { + "epoch": 5.361456643230575, + "grad_norm": 0.025452515110373497, + "learning_rate": 5.484531808368923e-05, + "loss": 0.0123, + "step": 29740 + }, + { + "epoch": 5.363259419506039, + "grad_norm": 0.02809840999543667, + "learning_rate": 5.4817889219689656e-05, + "loss": 0.0126, + "step": 29750 + }, + { + "epoch": 5.365062195781504, + "grad_norm": 0.026933996006846428, + "learning_rate": 5.4790458892142536e-05, + "loss": 0.0119, + "step": 29760 + }, + { + "epoch": 5.366864972056968, + "grad_norm": 0.025124521926045418, + "learning_rate": 5.476302710938048e-05, + "loss": 0.0112, + "step": 29770 + }, + { + "epoch": 5.368667748332432, + "grad_norm": 0.04775701090693474, + "learning_rate": 5.473559387973657e-05, + "loss": 0.0121, + "step": 29780 + }, + { + "epoch": 5.370470524607896, + "grad_norm": 0.02980572171509266, + "learning_rate": 5.470815921154425e-05, + "loss": 0.0117, + "step": 29790 + }, + { + "epoch": 5.372273300883361, + "grad_norm": 0.04844120889902115, + "learning_rate": 5.468072311313749e-05, + "loss": 0.013, + "step": 29800 + }, + { + "epoch": 5.374076077158825, + "grad_norm": 0.031243251636624336, + "learning_rate": 5.465328559285063e-05, + "loss": 0.0116, + "step": 29810 + }, + { + "epoch": 5.375878853434289, + "grad_norm": 0.03926122561097145, + "learning_rate": 5.462584665901849e-05, + "loss": 0.0121, + "step": 29820 + }, + { + "epoch": 5.377681629709753, + "grad_norm": 0.03392299637198448, + "learning_rate": 5.4598406319976235e-05, + "loss": 0.0109, + "step": 29830 + }, + { + "epoch": 5.379484405985218, + "grad_norm": 0.04330969229340553, + "learning_rate": 5.457096458405958e-05, + "loss": 0.0122, + "step": 29840 + }, + { + "epoch": 5.381287182260682, + "grad_norm": 0.026129353791475296, + "learning_rate": 5.454352145960457e-05, + "loss": 0.0125, + "step": 29850 + }, + { + "epoch": 5.383089958536146, + "grad_norm": 0.040702350437641144, + "learning_rate": 5.4516076954947715e-05, + "loss": 0.0132, + "step": 29860 + }, + { + "epoch": 5.38489273481161, + "grad_norm": 0.031122533604502678, + "learning_rate": 5.448863107842591e-05, + "loss": 0.0128, + "step": 29870 + }, + { + "epoch": 5.3866955110870745, + "grad_norm": 0.04613840579986572, + "learning_rate": 5.446118383837651e-05, + "loss": 0.0125, + "step": 29880 + }, + { + "epoch": 5.388498287362538, + "grad_norm": 0.03470383211970329, + "learning_rate": 5.443373524313722e-05, + "loss": 0.0126, + "step": 29890 + }, + { + "epoch": 5.390301063638002, + "grad_norm": 0.029422368854284286, + "learning_rate": 5.440628530104626e-05, + "loss": 0.0116, + "step": 29900 + }, + { + "epoch": 5.392103839913466, + "grad_norm": 0.028123416006565094, + "learning_rate": 5.4378834020442146e-05, + "loss": 0.0122, + "step": 29910 + }, + { + "epoch": 5.393906616188931, + "grad_norm": 0.04049869254231453, + "learning_rate": 5.4351381409663884e-05, + "loss": 0.0134, + "step": 29920 + }, + { + "epoch": 5.395709392464395, + "grad_norm": 0.027555856853723526, + "learning_rate": 5.432392747705084e-05, + "loss": 0.0123, + "step": 29930 + }, + { + "epoch": 5.397512168739859, + "grad_norm": 0.03256784379482269, + "learning_rate": 5.429647223094278e-05, + "loss": 0.0112, + "step": 29940 + }, + { + "epoch": 5.399314945015323, + "grad_norm": 0.030601628124713898, + "learning_rate": 5.4269015679679924e-05, + "loss": 0.0121, + "step": 29950 + }, + { + "epoch": 5.401117721290788, + "grad_norm": 0.044355109333992004, + "learning_rate": 5.424155783160281e-05, + "loss": 0.0113, + "step": 29960 + }, + { + "epoch": 5.402920497566252, + "grad_norm": 0.035637736320495605, + "learning_rate": 5.4214098695052415e-05, + "loss": 0.0125, + "step": 29970 + }, + { + "epoch": 5.404723273841716, + "grad_norm": 0.041722025722265244, + "learning_rate": 5.418663827837012e-05, + "loss": 0.0124, + "step": 29980 + }, + { + "epoch": 5.40652605011718, + "grad_norm": 0.032297536730766296, + "learning_rate": 5.415917658989763e-05, + "loss": 0.0118, + "step": 29990 + }, + { + "epoch": 5.408328826392645, + "grad_norm": 0.0383053794503212, + "learning_rate": 5.413171363797713e-05, + "loss": 0.0128, + "step": 30000 + } + ], + "logging_steps": 10, + "max_steps": 60000, + "num_input_tokens_seen": 0, + "num_train_epochs": 11, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}