diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,3571 +1,7039 @@ { - "best_metric": 12.256741523742676, - "best_model_checkpoint": "miner_id_24/checkpoint-500", - "epoch": 0.0818732601932209, - "eval_steps": 100, - "global_step": 500, + "best_metric": 12.25197696685791, + "best_model_checkpoint": "miner_id_24/checkpoint-1000", + "epoch": 0.1637465203864418, + "eval_steps": 1000, + "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001637465203864418, - "grad_norm": 0.04208580031991005, - "learning_rate": 6.666666649834951e-06, + "grad_norm": 0.04071168601512909, + "learning_rate": 6.666666666666667e-07, "loss": 12.4513, "step": 1 }, { "epoch": 0.0001637465203864418, - "eval_loss": 12.441868782043457, - "eval_runtime": 7.3093, - "eval_samples_per_second": 33.656, - "eval_steps_per_second": 16.828, + "eval_loss": 12.441837310791016, + "eval_runtime": 7.2884, + "eval_samples_per_second": 33.752, + "eval_steps_per_second": 16.876, "step": 1 }, { "epoch": 0.0003274930407728836, - "grad_norm": 0.04489905759692192, - "learning_rate": 1.3333333299669903e-05, + "grad_norm": 0.042674656957387924, + "learning_rate": 1.3333333333333334e-06, "loss": 12.449, "step": 2 }, { "epoch": 0.0004912395611593254, - "grad_norm": 0.04625147953629494, - "learning_rate": 1.9999999494757503e-05, - "loss": 12.4494, + "grad_norm": 0.04358048737049103, + "learning_rate": 2.0000000000000003e-06, + "loss": 12.4493, "step": 3 }, { "epoch": 0.0006549860815457672, - "grad_norm": 0.043086178600788116, - "learning_rate": 2.6666666599339806e-05, + "grad_norm": 0.04197787120938301, + "learning_rate": 2.666666666666667e-06, "loss": 12.4376, "step": 4 }, { "epoch": 0.0008187326019322089, - "grad_norm": 0.04618668928742409, - "learning_rate": 3.333333370392211e-05, + "grad_norm": 0.04088578745722771, + "learning_rate": 3.3333333333333333e-06, "loss": 12.4541, "step": 5 }, { "epoch": 0.0009824791223186507, - "grad_norm": 0.03436649218201637, - "learning_rate": 3.9999998989515007e-05, + "grad_norm": 0.03298024460673332, + "learning_rate": 4.000000000000001e-06, "loss": 12.4441, "step": 6 }, { "epoch": 0.0011462256427050926, - "grad_norm": 0.032714761793613434, - "learning_rate": 4.6666664275107905e-05, - "loss": 12.4419, + "grad_norm": 0.03118324838578701, + "learning_rate": 4.666666666666667e-06, + "loss": 12.442, "step": 7 }, { "epoch": 0.0013099721630915344, - "grad_norm": 0.051821667701005936, - "learning_rate": 5.333333319867961e-05, - "loss": 12.4438, + "grad_norm": 0.04715936258435249, + "learning_rate": 5.333333333333334e-06, + "loss": 12.4439, "step": 8 }, { "epoch": 0.001473718683477976, - "grad_norm": 0.038592468947172165, - "learning_rate": 6.000000212225132e-05, - "loss": 12.4529, + "grad_norm": 0.034939300268888474, + "learning_rate": 6e-06, + "loss": 12.453, "step": 9 }, { "epoch": 0.0016374652038644178, - "grad_norm": 0.03184358403086662, - "learning_rate": 6.666666740784422e-05, - "loss": 12.4582, + "grad_norm": 0.028914228081703186, + "learning_rate": 6.666666666666667e-06, + "loss": 12.4583, "step": 10 }, { "epoch": 0.0018012117242508596, - "grad_norm": 0.036762818694114685, - "learning_rate": 7.333333633141592e-05, - "loss": 12.438, + "grad_norm": 0.03148627653717995, + "learning_rate": 7.333333333333334e-06, + "loss": 12.4382, "step": 11 }, { "epoch": 0.0019649582446373015, - "grad_norm": 0.04558774083852768, - "learning_rate": 7.999999797903001e-05, - "loss": 12.4414, + "grad_norm": 0.03997644782066345, + "learning_rate": 8.000000000000001e-06, + "loss": 12.4416, "step": 12 }, { "epoch": 0.002128704765023743, - "grad_norm": 0.057379335165023804, - "learning_rate": 8.666666690260172e-05, - "loss": 12.4501, + "grad_norm": 0.05061323940753937, + "learning_rate": 8.666666666666668e-06, + "loss": 12.4505, "step": 13 }, { "epoch": 0.002292451285410185, - "grad_norm": 0.04371897876262665, - "learning_rate": 9.333332855021581e-05, - "loss": 12.4428, + "grad_norm": 0.03674255311489105, + "learning_rate": 9.333333333333334e-06, + "loss": 12.4432, "step": 14 }, { "epoch": 0.0024561978057966267, - "grad_norm": 0.048772815614938736, - "learning_rate": 9.999999747378752e-05, - "loss": 12.4444, + "grad_norm": 0.040007635951042175, + "learning_rate": 1e-05, + "loss": 12.4448, "step": 15 }, { "epoch": 0.0026199443261830688, - "grad_norm": 0.0438830703496933, - "learning_rate": 0.00010666666639735922, - "loss": 12.4454, + "grad_norm": 0.035397134721279144, + "learning_rate": 1.0666666666666667e-05, + "loss": 12.4459, "step": 16 }, { "epoch": 0.0027836908465695104, - "grad_norm": 0.04073454812169075, - "learning_rate": 0.00011333332804497331, - "loss": 12.4374, + "grad_norm": 0.03148770332336426, + "learning_rate": 1.1333333333333334e-05, + "loss": 12.438, "step": 17 }, { "epoch": 0.002947437366955952, - "grad_norm": 0.05522220581769943, - "learning_rate": 0.00012000000424450263, - "loss": 12.4235, + "grad_norm": 0.03974079713225365, + "learning_rate": 1.2e-05, + "loss": 12.4245, "step": 18 }, { "epoch": 0.003111183887342394, - "grad_norm": 0.056581761687994, - "learning_rate": 0.00012666666589211673, - "loss": 12.4398, + "grad_norm": 0.04220546409487724, + "learning_rate": 1.2666666666666668e-05, + "loss": 12.4408, "step": 19 }, { "epoch": 0.0032749304077288356, - "grad_norm": 0.054074015468358994, - "learning_rate": 0.00013333333481568843, - "loss": 12.4549, + "grad_norm": 0.037504322826862335, + "learning_rate": 1.3333333333333333e-05, + "loss": 12.456, "step": 20 }, { "epoch": 0.0034386769281152777, - "grad_norm": 0.06291481107473373, - "learning_rate": 0.0001399999891873449, - "loss": 12.444, + "grad_norm": 0.03901657089591026, + "learning_rate": 1.4000000000000001e-05, + "loss": 12.4453, "step": 21 }, { "epoch": 0.0036024234485017193, - "grad_norm": 0.05257013067603111, - "learning_rate": 0.00014666667266283184, - "loss": 12.4552, + "grad_norm": 0.03178351745009422, + "learning_rate": 1.4666666666666668e-05, + "loss": 12.4564, "step": 22 }, { "epoch": 0.0037661699688881613, - "grad_norm": 0.05741506442427635, - "learning_rate": 0.00015333332703448832, - "loss": 12.4378, + "grad_norm": 0.033598098903894424, + "learning_rate": 1.5333333333333334e-05, + "loss": 12.4394, "step": 23 }, { "epoch": 0.003929916489274603, - "grad_norm": 0.07213739305734634, - "learning_rate": 0.00015999999595806003, - "loss": 12.439, + "grad_norm": 0.039266929030418396, + "learning_rate": 1.6000000000000003e-05, + "loss": 12.4412, "step": 24 }, { "epoch": 0.0040936630096610445, - "grad_norm": 0.06334943324327469, - "learning_rate": 0.00016666666488163173, - "loss": 12.4353, + "grad_norm": 0.03318433836102486, + "learning_rate": 1.6666666666666667e-05, + "loss": 12.4374, "step": 25 }, { "epoch": 0.004257409530047486, - "grad_norm": 0.08387371152639389, - "learning_rate": 0.00017333333380520344, - "loss": 12.44, + "grad_norm": 0.041068192571401596, + "learning_rate": 1.7333333333333336e-05, + "loss": 12.4429, "step": 26 }, { "epoch": 0.004421156050433929, - "grad_norm": 0.08301780372858047, - "learning_rate": 0.00017999998817685992, - "loss": 12.4327, + "grad_norm": 0.038254860788583755, + "learning_rate": 1.8e-05, + "loss": 12.4358, "step": 27 }, { "epoch": 0.00458490257082037, - "grad_norm": 0.09270280599594116, - "learning_rate": 0.00018666665710043162, - "loss": 12.4285, + "grad_norm": 0.0400465689599514, + "learning_rate": 1.866666666666667e-05, + "loss": 12.4324, "step": 28 }, { "epoch": 0.004748649091206812, - "grad_norm": 0.10819040238857269, - "learning_rate": 0.00019333332602400333, - "loss": 12.4291, + "grad_norm": 0.04147111251950264, + "learning_rate": 1.9333333333333333e-05, + "loss": 12.4338, "step": 29 }, { "epoch": 0.004912395611593253, - "grad_norm": 0.10528005659580231, - "learning_rate": 0.00019999999494757503, - "loss": 12.4423, + "grad_norm": 0.03947514668107033, + "learning_rate": 2e-05, + "loss": 12.4472, "step": 30 }, { "epoch": 0.005076142131979695, - "grad_norm": 0.13457149267196655, - "learning_rate": 0.00019999999494757503, - "loss": 12.4323, + "grad_norm": 0.04782761260867119, + "learning_rate": 2.0666666666666666e-05, + "loss": 12.4389, "step": 31 }, { "epoch": 0.0052398886523661375, - "grad_norm": 0.13698327541351318, - "learning_rate": 0.00019999999494757503, - "loss": 12.4182, + "grad_norm": 0.044334013015031815, + "learning_rate": 2.1333333333333335e-05, + "loss": 12.4259, "step": 32 }, { "epoch": 0.005403635172752579, - "grad_norm": 0.13750460743904114, - "learning_rate": 0.0001999999803956598, - "loss": 12.4365, + "grad_norm": 0.03857624903321266, + "learning_rate": 2.2000000000000003e-05, + "loss": 12.4445, "step": 33 }, { "epoch": 0.005567381693139021, - "grad_norm": 0.1702355593442917, - "learning_rate": 0.00019999996584374458, - "loss": 12.4517, + "grad_norm": 0.05013938993215561, + "learning_rate": 2.2666666666666668e-05, + "loss": 12.4622, "step": 34 }, { "epoch": 0.005731128213525462, - "grad_norm": 0.14421802759170532, - "learning_rate": 0.00019999996584374458, - "loss": 12.4439, + "grad_norm": 0.03651641681790352, + "learning_rate": 2.3333333333333336e-05, + "loss": 12.453, "step": 35 }, { "epoch": 0.005894874733911904, - "grad_norm": 0.17617714405059814, - "learning_rate": 0.00019999995129182935, - "loss": 12.4248, + "grad_norm": 0.04444118216633797, + "learning_rate": 2.4e-05, + "loss": 12.437, "step": 36 }, { "epoch": 0.006058621254298346, - "grad_norm": 0.1948043555021286, - "learning_rate": 0.0001999999221879989, - "loss": 12.4319, + "grad_norm": 0.0415070615708828, + "learning_rate": 2.466666666666667e-05, + "loss": 12.4461, "step": 37 }, { "epoch": 0.006222367774684788, - "grad_norm": 0.2024512141942978, - "learning_rate": 0.00019999989308416843, - "loss": 12.4267, + "grad_norm": 0.04257901757955551, + "learning_rate": 2.5333333333333337e-05, + "loss": 12.4423, "step": 38 }, { "epoch": 0.00638611429507123, - "grad_norm": 0.21583795547485352, - "learning_rate": 0.0001999998785322532, - "loss": 12.4148, + "grad_norm": 0.04246218129992485, + "learning_rate": 2.6000000000000002e-05, + "loss": 12.4324, "step": 39 }, { "epoch": 0.006549860815457671, - "grad_norm": 0.212106391787529, - "learning_rate": 0.00019999984942842275, - "loss": 12.4077, + "grad_norm": 0.0474301353096962, + "learning_rate": 2.6666666666666667e-05, + "loss": 12.4268, "step": 40 }, { "epoch": 0.006713607335844114, - "grad_norm": 0.25522929430007935, - "learning_rate": 0.0001999998203245923, - "loss": 12.4199, + "grad_norm": 0.04999005049467087, + "learning_rate": 2.733333333333333e-05, + "loss": 12.4445, "step": 41 }, { "epoch": 0.006877353856230555, - "grad_norm": 0.29123276472091675, - "learning_rate": 0.0001999997766688466, - "loss": 12.3987, + "grad_norm": 0.04916905239224434, + "learning_rate": 2.8000000000000003e-05, + "loss": 12.4276, "step": 42 }, { "epoch": 0.007041100376616997, - "grad_norm": 0.31058281660079956, - "learning_rate": 0.00019999974756501615, - "loss": 12.4021, + "grad_norm": 0.05572114139795303, + "learning_rate": 2.8666666666666668e-05, + "loss": 12.4357, "step": 43 }, { "epoch": 0.0072048468970034385, - "grad_norm": 0.22949866950511932, - "learning_rate": 0.00019999970390927047, - "loss": 12.4186, + "grad_norm": 0.03746430575847626, + "learning_rate": 2.9333333333333336e-05, + "loss": 12.4436, "step": 44 }, { "epoch": 0.00736859341738988, - "grad_norm": 0.27713820338249207, - "learning_rate": 0.00019999966025352478, - "loss": 12.4255, + "grad_norm": 0.03692767024040222, + "learning_rate": 3e-05, + "loss": 12.4552, "step": 45 }, { "epoch": 0.007532339937776323, - "grad_norm": 0.27647095918655396, - "learning_rate": 0.0001999996165977791, - "loss": 12.3932, + "grad_norm": 0.04242353141307831, + "learning_rate": 3.066666666666667e-05, + "loss": 12.4279, "step": 46 }, { "epoch": 0.007696086458162764, - "grad_norm": 0.3024648427963257, - "learning_rate": 0.0001999995729420334, - "loss": 12.3947, + "grad_norm": 0.05116885527968407, + "learning_rate": 3.1333333333333334e-05, + "loss": 12.4373, "step": 47 }, { "epoch": 0.007859832978549206, - "grad_norm": 0.2551914155483246, - "learning_rate": 0.0001999995147343725, - "loss": 12.4186, + "grad_norm": 0.039842814207077026, + "learning_rate": 3.2000000000000005e-05, + "loss": 12.4554, "step": 48 }, { "epoch": 0.008023579498935647, - "grad_norm": 0.25191551446914673, - "learning_rate": 0.00019999945652671158, - "loss": 12.4031, + "grad_norm": 0.0379764698445797, + "learning_rate": 3.266666666666667e-05, + "loss": 12.4418, "step": 49 }, { "epoch": 0.008187326019322089, - "grad_norm": 0.2916131913661957, - "learning_rate": 0.0001999994128709659, - "loss": 12.3902, + "grad_norm": 0.048301611095666885, + "learning_rate": 3.3333333333333335e-05, + "loss": 12.4398, "step": 50 }, { "epoch": 0.00835107253970853, - "grad_norm": 0.3026813566684723, - "learning_rate": 0.00019999934011138976, - "loss": 12.3681, + "grad_norm": 0.056472256779670715, + "learning_rate": 3.4000000000000007e-05, + "loss": 12.4247, "step": 51 }, { "epoch": 0.008514819060094972, - "grad_norm": 0.3356999456882477, - "learning_rate": 0.00019999928190372884, - "loss": 12.3927, + "grad_norm": 0.0549093633890152, + "learning_rate": 3.466666666666667e-05, + "loss": 12.4551, "step": 52 }, { "epoch": 0.008678565580481416, - "grad_norm": 0.2360207736492157, - "learning_rate": 0.00019999922369606793, - "loss": 12.351, + "grad_norm": 0.04894556105136871, + "learning_rate": 3.5333333333333336e-05, + "loss": 12.4026, "step": 53 }, { "epoch": 0.008842312100867857, - "grad_norm": 0.28673455119132996, - "learning_rate": 0.0001999991509364918, - "loss": 12.3748, + "grad_norm": 0.0566958487033844, + "learning_rate": 3.6e-05, + "loss": 12.4368, "step": 54 }, { "epoch": 0.009006058621254299, - "grad_norm": 0.308011919260025, - "learning_rate": 0.00019999907817691565, - "loss": 12.3639, + "grad_norm": 0.06052306294441223, + "learning_rate": 3.6666666666666666e-05, + "loss": 12.4367, "step": 55 }, { "epoch": 0.00916980514164074, - "grad_norm": 0.3026260435581207, - "learning_rate": 0.00019999899086542428, - "loss": 12.3605, + "grad_norm": 0.05914003401994705, + "learning_rate": 3.733333333333334e-05, + "loss": 12.4328, "step": 56 }, { "epoch": 0.009333551662027182, - "grad_norm": 0.22466889023780823, - "learning_rate": 0.00019999891810584813, - "loss": 12.3757, + "grad_norm": 0.050145380198955536, + "learning_rate": 3.8e-05, + "loss": 12.4385, "step": 57 }, { "epoch": 0.009497298182413624, - "grad_norm": 0.29599490761756897, - "learning_rate": 0.000199998845346272, - "loss": 12.372, + "grad_norm": 0.07536210119724274, + "learning_rate": 3.866666666666667e-05, + "loss": 12.4527, "step": 58 }, { "epoch": 0.009661044702800065, - "grad_norm": 0.23513752222061157, - "learning_rate": 0.00019999875803478062, - "loss": 12.3508, + "grad_norm": 0.06089041382074356, + "learning_rate": 3.933333333333333e-05, + "loss": 12.4182, "step": 59 }, { "epoch": 0.009824791223186507, - "grad_norm": 0.2927089035511017, - "learning_rate": 0.00019999867072328925, - "loss": 12.3443, + "grad_norm": 0.06450305879116058, + "learning_rate": 4e-05, + "loss": 12.4279, "step": 60 }, { "epoch": 0.009988537743572948, - "grad_norm": 0.23235340416431427, - "learning_rate": 0.00019999858341179788, - "loss": 12.3343, + "grad_norm": 0.05715927854180336, + "learning_rate": 4.066666666666667e-05, + "loss": 12.4077, "step": 61 }, { "epoch": 0.01015228426395939, - "grad_norm": 0.2948826253414154, - "learning_rate": 0.00019999848154839128, - "loss": 12.3566, + "grad_norm": 0.06323696672916412, + "learning_rate": 4.133333333333333e-05, + "loss": 12.4455, "step": 62 }, { "epoch": 0.010316030784345833, - "grad_norm": 0.29048776626586914, - "learning_rate": 0.00019999837968498468, - "loss": 12.3419, + "grad_norm": 0.06708288937807083, + "learning_rate": 4.2e-05, + "loss": 12.4392, "step": 63 }, { "epoch": 0.010479777304732275, - "grad_norm": 0.27304956316947937, - "learning_rate": 0.00019999829237349331, - "loss": 12.3437, + "grad_norm": 0.08000550419092178, + "learning_rate": 4.266666666666667e-05, + "loss": 12.442, "step": 64 }, { "epoch": 0.010643523825118717, - "grad_norm": 0.2460082769393921, - "learning_rate": 0.0001999981759581715, - "loss": 12.3449, + "grad_norm": 0.0591249093413353, + "learning_rate": 4.3333333333333334e-05, + "loss": 12.4374, "step": 65 }, { "epoch": 0.010807270345505158, - "grad_norm": 0.23197799921035767, - "learning_rate": 0.00019999808864668012, - "loss": 12.3434, + "grad_norm": 0.06522256880998611, + "learning_rate": 4.4000000000000006e-05, + "loss": 12.4319, "step": 66 }, { "epoch": 0.0109710168658916, - "grad_norm": 0.21384094655513763, - "learning_rate": 0.00019999798678327352, - "loss": 12.3519, + "grad_norm": 0.07051816582679749, + "learning_rate": 4.466666666666667e-05, + "loss": 12.4403, "step": 67 }, { "epoch": 0.011134763386278041, - "grad_norm": 0.26949459314346313, - "learning_rate": 0.00019999785581603646, - "loss": 12.34, + "grad_norm": 0.07400926947593689, + "learning_rate": 4.5333333333333335e-05, + "loss": 12.4411, "step": 68 }, { "epoch": 0.011298509906664483, - "grad_norm": 0.21784676611423492, - "learning_rate": 0.00019999775395262986, - "loss": 12.3514, + "grad_norm": 0.05660618841648102, + "learning_rate": 4.600000000000001e-05, + "loss": 12.4406, "step": 69 }, { "epoch": 0.011462256427050925, - "grad_norm": 0.23999345302581787, - "learning_rate": 0.00019999763753730804, - "loss": 12.3387, + "grad_norm": 0.08654368668794632, + "learning_rate": 4.666666666666667e-05, + "loss": 12.4398, "step": 70 }, { "epoch": 0.011626002947437366, - "grad_norm": 0.2197384089231491, - "learning_rate": 0.0001999975211219862, - "loss": 12.3307, + "grad_norm": 0.07927855104207993, + "learning_rate": 4.7333333333333336e-05, + "loss": 12.4291, "step": 71 }, { "epoch": 0.011789749467823808, - "grad_norm": 0.24359112977981567, - "learning_rate": 0.00019999739015474916, - "loss": 12.331, + "grad_norm": 0.08644130080938339, + "learning_rate": 4.8e-05, + "loss": 12.4459, "step": 72 }, { "epoch": 0.011953495988210251, - "grad_norm": 0.22217059135437012, - "learning_rate": 0.0001999972591875121, - "loss": 12.3253, + "grad_norm": 0.07535272091627121, + "learning_rate": 4.866666666666667e-05, + "loss": 12.4458, "step": 73 }, { "epoch": 0.012117242508596693, - "grad_norm": 0.22041912376880646, - "learning_rate": 0.00019999712822027504, - "loss": 12.3309, + "grad_norm": 0.07552432268857956, + "learning_rate": 4.933333333333334e-05, + "loss": 12.4359, "step": 74 }, { "epoch": 0.012280989028983134, - "grad_norm": 0.2732013165950775, - "learning_rate": 0.000199996997253038, - "loss": 12.3279, + "grad_norm": 0.07036083936691284, + "learning_rate": 5e-05, + "loss": 12.4276, "step": 75 }, { "epoch": 0.012444735549369576, - "grad_norm": 0.1881040781736374, - "learning_rate": 0.00019999686628580093, - "loss": 12.3237, + "grad_norm": 0.09128473699092865, + "learning_rate": 5.0666666666666674e-05, + "loss": 12.4385, "step": 76 }, { "epoch": 0.012608482069756018, - "grad_norm": 0.14977961778640747, - "learning_rate": 0.00019999673531856388, - "loss": 12.3332, + "grad_norm": 0.08544669300317764, + "learning_rate": 5.133333333333333e-05, + "loss": 12.4398, "step": 77 }, { "epoch": 0.01277222859014246, - "grad_norm": 0.18099509179592133, - "learning_rate": 0.00019999660435132682, - "loss": 12.321, + "grad_norm": 0.09351212531328201, + "learning_rate": 5.2000000000000004e-05, + "loss": 12.4355, "step": 78 }, { "epoch": 0.0129359751105289, - "grad_norm": 0.2314033955335617, - "learning_rate": 0.00019999645883217454, - "loss": 12.3326, + "grad_norm": 0.09390752017498016, + "learning_rate": 5.266666666666666e-05, + "loss": 12.4411, "step": 79 }, { "epoch": 0.013099721630915342, - "grad_norm": 0.18064385652542114, - "learning_rate": 0.00019999631331302226, - "loss": 12.3228, + "grad_norm": 0.10644028335809708, + "learning_rate": 5.333333333333333e-05, + "loss": 12.4441, "step": 80 }, { "epoch": 0.013263468151301784, - "grad_norm": 0.35007038712501526, - "learning_rate": 0.00019999615324195474, - "loss": 12.3499, + "grad_norm": 0.0856703445315361, + "learning_rate": 5.4000000000000005e-05, + "loss": 12.4478, "step": 81 }, { "epoch": 0.013427214671688227, - "grad_norm": 0.4421752691268921, - "learning_rate": 0.00019999600772280246, - "loss": 12.3257, + "grad_norm": 0.09133117645978928, + "learning_rate": 5.466666666666666e-05, + "loss": 12.4175, "step": 82 }, { "epoch": 0.013590961192074669, - "grad_norm": 0.2741035223007202, - "learning_rate": 0.00019999584765173495, - "loss": 12.3004, + "grad_norm": 0.11033530533313751, + "learning_rate": 5.5333333333333334e-05, + "loss": 12.4325, "step": 83 }, { "epoch": 0.01375470771246111, - "grad_norm": 0.1526937633752823, - "learning_rate": 0.00019999568758066744, - "loss": 12.3203, + "grad_norm": 0.10031059384346008, + "learning_rate": 5.6000000000000006e-05, + "loss": 12.4367, "step": 84 }, { "epoch": 0.013918454232847552, - "grad_norm": 0.15117084980010986, - "learning_rate": 0.00019999554206151515, - "loss": 12.317, + "grad_norm": 0.10192529857158661, + "learning_rate": 5.666666666666667e-05, + "loss": 12.4213, "step": 85 }, { "epoch": 0.014082200753233994, - "grad_norm": 0.3051150441169739, - "learning_rate": 0.0001999953674385324, - "loss": 12.3093, + "grad_norm": 0.1395905762910843, + "learning_rate": 5.7333333333333336e-05, + "loss": 12.43, "step": 86 }, { "epoch": 0.014245947273620435, - "grad_norm": 0.21714192628860474, - "learning_rate": 0.0001999952073674649, - "loss": 12.2923, + "grad_norm": 0.12898190319538116, + "learning_rate": 5.8e-05, + "loss": 12.4351, "step": 87 }, { "epoch": 0.014409693794006877, - "grad_norm": 0.29008936882019043, - "learning_rate": 0.00019999503274448216, - "loss": 12.3135, + "grad_norm": 0.125532329082489, + "learning_rate": 5.866666666666667e-05, + "loss": 12.4293, "step": 88 }, { "epoch": 0.014573440314393319, - "grad_norm": 0.15054282546043396, - "learning_rate": 0.00019999485812149942, - "loss": 12.337, + "grad_norm": 0.10696202516555786, + "learning_rate": 5.9333333333333343e-05, + "loss": 12.4444, "step": 89 }, { "epoch": 0.01473718683477976, - "grad_norm": 0.29189133644104004, - "learning_rate": 0.00019999468349851668, - "loss": 12.3169, + "grad_norm": 0.1154562458395958, + "learning_rate": 6e-05, + "loss": 12.4223, "step": 90 }, { "epoch": 0.014900933355166202, - "grad_norm": 0.16853569447994232, - "learning_rate": 0.00019999450887553394, - "loss": 12.3087, + "grad_norm": 0.13437619805335999, + "learning_rate": 6.066666666666667e-05, + "loss": 12.4324, "step": 91 }, { "epoch": 0.015064679875552645, - "grad_norm": 0.24085475504398346, - "learning_rate": 0.00019999431970063597, - "loss": 12.3309, + "grad_norm": 0.13228319585323334, + "learning_rate": 6.133333333333334e-05, + "loss": 12.4311, "step": 92 }, { "epoch": 0.015228426395939087, - "grad_norm": 0.14460758864879608, - "learning_rate": 0.00019999414507765323, - "loss": 12.3043, + "grad_norm": 0.14434094727039337, + "learning_rate": 6.2e-05, + "loss": 12.4221, "step": 93 }, { "epoch": 0.015392172916325528, - "grad_norm": 0.20345059037208557, - "learning_rate": 0.00019999395590275526, - "loss": 12.3139, + "grad_norm": 0.1585177183151245, + "learning_rate": 6.266666666666667e-05, + "loss": 12.4432, "step": 94 }, { "epoch": 0.01555591943671197, - "grad_norm": 0.30965539813041687, - "learning_rate": 0.0001999937667278573, - "loss": 12.2988, + "grad_norm": 0.1559607982635498, + "learning_rate": 6.333333333333333e-05, + "loss": 12.4328, "step": 95 }, { "epoch": 0.01571966595709841, - "grad_norm": 0.28346818685531616, - "learning_rate": 0.0001999935630010441, - "loss": 12.3345, + "grad_norm": 0.132478266954422, + "learning_rate": 6.400000000000001e-05, + "loss": 12.4274, "step": 96 }, { "epoch": 0.015883412477484853, - "grad_norm": 0.1707168072462082, - "learning_rate": 0.00019999337382614613, - "loss": 12.3099, + "grad_norm": 0.17377401888370514, + "learning_rate": 6.466666666666666e-05, + "loss": 12.4283, "step": 97 }, { "epoch": 0.016047158997871295, - "grad_norm": 0.20409435033798218, - "learning_rate": 0.00019999317009933293, - "loss": 12.338, + "grad_norm": 0.13973486423492432, + "learning_rate": 6.533333333333334e-05, + "loss": 12.4373, "step": 98 }, { "epoch": 0.016210905518257736, - "grad_norm": 0.35684144496917725, - "learning_rate": 0.00019999296637251973, - "loss": 12.2916, + "grad_norm": 0.17785918712615967, + "learning_rate": 6.6e-05, + "loss": 12.4219, "step": 99 }, { "epoch": 0.016374652038644178, - "grad_norm": 0.24036715924739838, - "learning_rate": 0.00019999277719762176, - "loss": 12.3097, - "step": 100 - }, - { - "epoch": 0.016374652038644178, - "eval_loss": 12.311452865600586, - "eval_runtime": 7.4253, - "eval_samples_per_second": 33.13, - "eval_steps_per_second": 16.565, + "grad_norm": 0.16729354858398438, + "learning_rate": 6.666666666666667e-05, + "loss": 12.4167, "step": 100 }, { "epoch": 0.01653839855903062, - "grad_norm": 0.14290913939476013, - "learning_rate": 0.00019999255891889334, - "loss": 12.3244, + "grad_norm": 0.16612979769706726, + "learning_rate": 6.733333333333333e-05, + "loss": 12.4189, "step": 101 }, { "epoch": 0.01670214507941706, - "grad_norm": 0.3147849440574646, - "learning_rate": 0.00019999235519208014, - "loss": 12.3063, + "grad_norm": 0.17767584323883057, + "learning_rate": 6.800000000000001e-05, + "loss": 12.43, "step": 102 }, { "epoch": 0.016865891599803503, - "grad_norm": 0.20907671749591827, - "learning_rate": 0.00019999213691335171, - "loss": 12.2983, + "grad_norm": 0.20716869831085205, + "learning_rate": 6.866666666666666e-05, + "loss": 12.4229, "step": 103 }, { "epoch": 0.017029638120189945, - "grad_norm": 0.12480394542217255, - "learning_rate": 0.0001999919186346233, - "loss": 12.3012, + "grad_norm": 0.21526587009429932, + "learning_rate": 6.933333333333334e-05, + "loss": 12.4282, "step": 104 }, { "epoch": 0.017193384640576386, - "grad_norm": 0.18896125257015228, - "learning_rate": 0.00019999170035589486, - "loss": 12.3152, + "grad_norm": 0.19571231305599213, + "learning_rate": 7e-05, + "loss": 12.4137, "step": 105 }, { "epoch": 0.01735713116096283, - "grad_norm": 0.34390348196029663, - "learning_rate": 0.0001999914675252512, - "loss": 12.3272, + "grad_norm": 0.14979496598243713, + "learning_rate": 7.066666666666667e-05, + "loss": 12.4119, "step": 106 }, { "epoch": 0.017520877681349273, - "grad_norm": 0.22107726335525513, - "learning_rate": 0.00019999124924652278, - "loss": 12.3076, + "grad_norm": 0.21473410725593567, + "learning_rate": 7.133333333333334e-05, + "loss": 12.4146, "step": 107 }, { "epoch": 0.017684624201735714, - "grad_norm": 0.22771164774894714, - "learning_rate": 0.00019999101641587913, - "loss": 12.3141, + "grad_norm": 0.19255532324314117, + "learning_rate": 7.2e-05, + "loss": 12.4214, "step": 108 }, { "epoch": 0.017848370722122156, - "grad_norm": 0.23079240322113037, - "learning_rate": 0.0001999907981371507, - "loss": 12.3073, + "grad_norm": 0.23225137591362, + "learning_rate": 7.266666666666667e-05, + "loss": 12.4152, "step": 109 }, { "epoch": 0.018012117242508598, - "grad_norm": 0.25316718220710754, - "learning_rate": 0.00019999055075459182, - "loss": 12.31, + "grad_norm": 0.2821487486362457, + "learning_rate": 7.333333333333333e-05, + "loss": 12.4242, "step": 110 }, { "epoch": 0.01817586376289504, - "grad_norm": 0.16608233749866486, - "learning_rate": 0.00019999031792394817, - "loss": 12.297, + "grad_norm": 0.2615848183631897, + "learning_rate": 7.4e-05, + "loss": 12.4051, "step": 111 }, { "epoch": 0.01833961028328148, - "grad_norm": 0.15613137185573578, - "learning_rate": 0.0001999900705413893, - "loss": 12.2899, + "grad_norm": 0.2540196478366852, + "learning_rate": 7.466666666666667e-05, + "loss": 12.4187, "step": 112 }, { "epoch": 0.018503356803667922, - "grad_norm": 0.4086350202560425, - "learning_rate": 0.00019998983771074563, - "loss": 12.3246, + "grad_norm": 0.20726901292800903, + "learning_rate": 7.533333333333334e-05, + "loss": 12.4194, "step": 113 }, { "epoch": 0.018667103324054364, - "grad_norm": 0.22173292934894562, - "learning_rate": 0.00019998959032818675, - "loss": 12.3183, + "grad_norm": 0.2292231023311615, + "learning_rate": 7.6e-05, + "loss": 12.4186, "step": 114 }, { "epoch": 0.018830849844440806, - "grad_norm": 0.15508519113063812, - "learning_rate": 0.00019998934294562787, - "loss": 12.3032, + "grad_norm": 0.24483346939086914, + "learning_rate": 7.666666666666667e-05, + "loss": 12.4136, "step": 115 }, { "epoch": 0.018994596364827247, - "grad_norm": 0.13017487525939941, - "learning_rate": 0.00019998908101115376, - "loss": 12.3025, + "grad_norm": 0.25352075695991516, + "learning_rate": 7.733333333333333e-05, + "loss": 12.3991, "step": 116 }, { "epoch": 0.01915834288521369, - "grad_norm": 0.18207354843616486, - "learning_rate": 0.00019998881907667965, - "loss": 12.2849, + "grad_norm": 0.28832969069480896, + "learning_rate": 7.800000000000001e-05, + "loss": 12.403, "step": 117 }, { "epoch": 0.01932208940560013, - "grad_norm": 0.14748769998550415, - "learning_rate": 0.00019998857169412076, - "loss": 12.2891, + "grad_norm": 0.28145042061805725, + "learning_rate": 7.866666666666666e-05, + "loss": 12.4107, "step": 118 }, { "epoch": 0.019485835925986572, - "grad_norm": 0.141549214720726, - "learning_rate": 0.00019998830975964665, - "loss": 12.294, + "grad_norm": 0.29081061482429504, + "learning_rate": 7.933333333333334e-05, + "loss": 12.3983, "step": 119 }, { "epoch": 0.019649582446373014, - "grad_norm": 0.14569465816020966, - "learning_rate": 0.00019998804782517254, - "loss": 12.304, + "grad_norm": 0.24011318385601044, + "learning_rate": 8e-05, + "loss": 12.3979, "step": 120 }, { "epoch": 0.019813328966759455, - "grad_norm": 0.39608556032180786, - "learning_rate": 0.0001999877713387832, - "loss": 12.3186, + "grad_norm": 0.2432175874710083, + "learning_rate": 8.066666666666667e-05, + "loss": 12.4146, "step": 121 }, { "epoch": 0.019977075487145897, - "grad_norm": 0.12429912388324738, - "learning_rate": 0.0001999875094043091, - "loss": 12.2826, + "grad_norm": 0.3161214292049408, + "learning_rate": 8.133333333333334e-05, + "loss": 12.3937, "step": 122 }, { "epoch": 0.02014082200753234, - "grad_norm": 0.22286245226860046, - "learning_rate": 0.00019998723291791975, - "loss": 12.3161, + "grad_norm": 0.23684079945087433, + "learning_rate": 8.2e-05, + "loss": 12.4068, "step": 123 }, { "epoch": 0.02030456852791878, - "grad_norm": 0.4403093755245209, - "learning_rate": 0.00019998697098344564, - "loss": 12.331, + "grad_norm": 0.20811216533184052, + "learning_rate": 8.266666666666667e-05, + "loss": 12.4116, "step": 124 }, { "epoch": 0.020468315048305225, - "grad_norm": 0.21915563941001892, - "learning_rate": 0.00019998667994514108, - "loss": 12.2953, + "grad_norm": 0.25938355922698975, + "learning_rate": 8.333333333333334e-05, + "loss": 12.3906, "step": 125 }, { "epoch": 0.020632061568691667, - "grad_norm": 0.13325762748718262, - "learning_rate": 0.00019998640345875174, - "loss": 12.3046, + "grad_norm": 0.26399001479148865, + "learning_rate": 8.4e-05, + "loss": 12.3827, "step": 126 }, { "epoch": 0.02079580808907811, - "grad_norm": 0.1221589520573616, - "learning_rate": 0.00019998611242044717, - "loss": 12.3158, + "grad_norm": 0.2413434386253357, + "learning_rate": 8.466666666666667e-05, + "loss": 12.4046, "step": 127 }, { "epoch": 0.02095955460946455, - "grad_norm": 0.2719859480857849, - "learning_rate": 0.00019998583593405783, - "loss": 12.3273, + "grad_norm": 0.21343113481998444, + "learning_rate": 8.533333333333334e-05, + "loss": 12.3997, "step": 128 }, { "epoch": 0.02112330112985099, - "grad_norm": 0.14147429168224335, - "learning_rate": 0.00019998553034383804, - "loss": 12.295, + "grad_norm": 0.2558322548866272, + "learning_rate": 8.6e-05, + "loss": 12.3871, "step": 129 }, { "epoch": 0.021287047650237433, - "grad_norm": 0.3306241035461426, - "learning_rate": 0.0001999852538574487, - "loss": 12.3274, + "grad_norm": 0.2511197626590729, + "learning_rate": 8.666666666666667e-05, + "loss": 12.4011, "step": 130 }, { "epoch": 0.021450794170623875, - "grad_norm": 0.1614050716161728, - "learning_rate": 0.0001999849482672289, - "loss": 12.3168, + "grad_norm": 0.2675906717777252, + "learning_rate": 8.733333333333333e-05, + "loss": 12.4074, "step": 131 }, { "epoch": 0.021614540691010316, - "grad_norm": 0.2571965754032135, - "learning_rate": 0.00019998465722892433, - "loss": 12.3033, + "grad_norm": 0.25766870379447937, + "learning_rate": 8.800000000000001e-05, + "loss": 12.3809, "step": 132 }, { "epoch": 0.021778287211396758, - "grad_norm": 0.3647140562534332, - "learning_rate": 0.0001999843370867893, - "loss": 12.3138, + "grad_norm": 0.25887688994407654, + "learning_rate": 8.866666666666668e-05, + "loss": 12.3937, "step": 133 }, { "epoch": 0.0219420337317832, - "grad_norm": 0.22893981635570526, - "learning_rate": 0.00019998404604848474, - "loss": 12.3124, + "grad_norm": 0.2538568079471588, + "learning_rate": 8.933333333333334e-05, + "loss": 12.3999, "step": 134 }, { "epoch": 0.02210578025216964, - "grad_norm": 0.18172000348567963, - "learning_rate": 0.00019998374045826495, - "loss": 12.2969, + "grad_norm": 0.22949941456317902, + "learning_rate": 9e-05, + "loss": 12.3719, "step": 135 }, { "epoch": 0.022269526772556083, - "grad_norm": 0.2715581953525543, - "learning_rate": 0.00019998342031612992, - "loss": 12.2789, + "grad_norm": 0.2809057831764221, + "learning_rate": 9.066666666666667e-05, + "loss": 12.3547, "step": 136 }, { "epoch": 0.022433273292942525, - "grad_norm": 0.2520585358142853, - "learning_rate": 0.0001999831001739949, - "loss": 12.2753, + "grad_norm": 0.29336753487586975, + "learning_rate": 9.133333333333334e-05, + "loss": 12.3573, "step": 137 }, { "epoch": 0.022597019813328966, - "grad_norm": 0.20995964109897614, - "learning_rate": 0.0001999827945837751, - "loss": 12.2933, + "grad_norm": 0.26571160554885864, + "learning_rate": 9.200000000000001e-05, + "loss": 12.3756, "step": 138 }, { "epoch": 0.022760766333715408, - "grad_norm": 0.18961293995380402, - "learning_rate": 0.00019998247444164008, - "loss": 12.2929, + "grad_norm": 0.27329036593437195, + "learning_rate": 9.266666666666666e-05, + "loss": 12.3841, "step": 139 }, { "epoch": 0.02292451285410185, - "grad_norm": 0.15688307583332062, - "learning_rate": 0.00019998215429950505, - "loss": 12.292, + "grad_norm": 0.2753359079360962, + "learning_rate": 9.333333333333334e-05, + "loss": 12.3678, "step": 140 }, { "epoch": 0.02308825937448829, - "grad_norm": 0.09857089072465897, - "learning_rate": 0.00019998183415737003, - "loss": 12.2883, + "grad_norm": 0.26319000124931335, + "learning_rate": 9.4e-05, + "loss": 12.3615, "step": 141 }, { "epoch": 0.023252005894874733, - "grad_norm": 0.2050275206565857, - "learning_rate": 0.00019998149946331978, - "loss": 12.2957, + "grad_norm": 0.23538266122341156, + "learning_rate": 9.466666666666667e-05, + "loss": 12.3522, "step": 142 }, { "epoch": 0.023415752415261174, - "grad_norm": 0.43624162673950195, - "learning_rate": 0.00019998116476926953, - "loss": 12.3091, + "grad_norm": 0.21864941716194153, + "learning_rate": 9.533333333333334e-05, + "loss": 12.362, "step": 143 }, { "epoch": 0.023579498935647616, - "grad_norm": 0.1448792666196823, - "learning_rate": 0.00019998083007521927, - "loss": 12.2973, + "grad_norm": 0.2649800479412079, + "learning_rate": 9.6e-05, + "loss": 12.3644, "step": 144 }, { "epoch": 0.02374324545603406, - "grad_norm": 0.1774253100156784, - "learning_rate": 0.00019998049538116902, - "loss": 12.3051, + "grad_norm": 0.254098504781723, + "learning_rate": 9.666666666666667e-05, + "loss": 12.3645, "step": 145 }, { "epoch": 0.023906991976420502, - "grad_norm": 0.13856734335422516, - "learning_rate": 0.00019998014613520354, - "loss": 12.2838, + "grad_norm": 0.26626890897750854, + "learning_rate": 9.733333333333335e-05, + "loss": 12.3378, "step": 146 }, { "epoch": 0.024070738496806944, - "grad_norm": 0.14116550981998444, - "learning_rate": 0.00019997979688923806, - "loss": 12.2834, + "grad_norm": 0.266643762588501, + "learning_rate": 9.8e-05, + "loss": 12.3436, "step": 147 }, { "epoch": 0.024234485017193386, - "grad_norm": 0.13662639260292053, - "learning_rate": 0.0001999794621951878, - "loss": 12.2885, + "grad_norm": 0.24790844321250916, + "learning_rate": 9.866666666666668e-05, + "loss": 12.3534, "step": 148 }, { "epoch": 0.024398231537579827, - "grad_norm": 0.0898972824215889, - "learning_rate": 0.00019997911294922233, - "loss": 12.2985, + "grad_norm": 0.24089303612709045, + "learning_rate": 9.933333333333334e-05, + "loss": 12.3588, "step": 149 }, { "epoch": 0.02456197805796627, - "grad_norm": 0.18935908377170563, - "learning_rate": 0.00019997874915134162, - "loss": 12.3198, + "grad_norm": 0.20637933909893036, + "learning_rate": 0.0001, + "loss": 12.3666, "step": 150 }, { "epoch": 0.02472572457835271, - "grad_norm": 0.14787761867046356, - "learning_rate": 0.00019997839990537614, - "loss": 12.3134, + "grad_norm": 0.18257121741771698, + "learning_rate": 0.00010066666666666667, + "loss": 12.3432, "step": 151 }, { "epoch": 0.024889471098739152, - "grad_norm": 0.1418771892786026, - "learning_rate": 0.00019997803610749543, - "loss": 12.292, + "grad_norm": 0.24397976696491241, + "learning_rate": 0.00010133333333333335, + "loss": 12.3509, "step": 152 }, { "epoch": 0.025053217619125594, - "grad_norm": 0.11932875216007233, - "learning_rate": 0.00019997767230961472, - "loss": 12.2779, + "grad_norm": 0.19030074775218964, + "learning_rate": 0.00010200000000000001, + "loss": 12.3322, "step": 153 }, { "epoch": 0.025216964139512035, - "grad_norm": 0.22217413783073425, - "learning_rate": 0.00019997732306364924, - "loss": 12.3222, + "grad_norm": 0.25341999530792236, + "learning_rate": 0.00010266666666666666, + "loss": 12.3715, "step": 154 }, { "epoch": 0.025380710659898477, - "grad_norm": 0.17280636727809906, - "learning_rate": 0.0001999769447138533, - "loss": 12.3, + "grad_norm": 0.1844940036535263, + "learning_rate": 0.00010333333333333334, + "loss": 12.3434, "step": 155 }, { "epoch": 0.02554445718028492, - "grad_norm": 0.11617791652679443, - "learning_rate": 0.0001999765809159726, - "loss": 12.2993, + "grad_norm": 0.1969732791185379, + "learning_rate": 0.00010400000000000001, + "loss": 12.3491, "step": 156 }, { "epoch": 0.02570820370067136, - "grad_norm": 0.21635393798351288, - "learning_rate": 0.00019997620256617665, - "loss": 12.2918, + "grad_norm": 0.21235470473766327, + "learning_rate": 0.00010466666666666667, + "loss": 12.3378, "step": 157 }, { "epoch": 0.0258719502210578, - "grad_norm": 0.11997334659099579, - "learning_rate": 0.00019997582421638072, - "loss": 12.2956, + "grad_norm": 0.21899423003196716, + "learning_rate": 0.00010533333333333332, + "loss": 12.346, "step": 158 }, { "epoch": 0.026035696741444243, - "grad_norm": 0.19266721606254578, - "learning_rate": 0.00019997544586658478, - "loss": 12.3138, + "grad_norm": 0.19178782403469086, + "learning_rate": 0.00010600000000000002, + "loss": 12.3623, "step": 159 }, { "epoch": 0.026199443261830685, - "grad_norm": 0.19289498031139374, - "learning_rate": 0.00019997506751678884, - "loss": 12.3125, + "grad_norm": 0.21276162564754486, + "learning_rate": 0.00010666666666666667, + "loss": 12.3637, "step": 160 }, { "epoch": 0.026363189782217127, - "grad_norm": 0.25989240407943726, - "learning_rate": 0.0001999746891669929, - "loss": 12.2826, + "grad_norm": 0.2185026854276657, + "learning_rate": 0.00010733333333333333, + "loss": 12.3458, "step": 161 }, { "epoch": 0.026526936302603568, - "grad_norm": 0.15681873261928558, - "learning_rate": 0.00019997429626528174, - "loss": 12.2928, + "grad_norm": 0.1894225925207138, + "learning_rate": 0.00010800000000000001, + "loss": 12.3417, "step": 162 }, { "epoch": 0.02669068282299001, - "grad_norm": 0.19333164393901825, - "learning_rate": 0.00019997390336357057, - "loss": 12.272, + "grad_norm": 0.2355530709028244, + "learning_rate": 0.00010866666666666667, + "loss": 12.3253, "step": 163 }, { "epoch": 0.026854429343376455, - "grad_norm": 0.21432405710220337, - "learning_rate": 0.0001999735104618594, - "loss": 12.3191, + "grad_norm": 0.17933063209056854, + "learning_rate": 0.00010933333333333333, + "loss": 12.3522, "step": 164 }, { "epoch": 0.027018175863762896, - "grad_norm": 0.14616520702838898, - "learning_rate": 0.00019997311756014824, - "loss": 12.2888, + "grad_norm": 0.20666790008544922, + "learning_rate": 0.00011000000000000002, + "loss": 12.325, "step": 165 }, { "epoch": 0.027181922384149338, - "grad_norm": 0.2478683441877365, - "learning_rate": 0.00019997272465843707, - "loss": 12.3073, + "grad_norm": 0.19132888317108154, + "learning_rate": 0.00011066666666666667, + "loss": 12.344, "step": 166 }, { "epoch": 0.02734566890453578, - "grad_norm": 0.10278715938329697, - "learning_rate": 0.00019997231720481068, - "loss": 12.2835, + "grad_norm": 0.16811159253120422, + "learning_rate": 0.00011133333333333333, + "loss": 12.3294, "step": 167 }, { "epoch": 0.02750941542492222, - "grad_norm": 0.15840058028697968, - "learning_rate": 0.00019997190975118428, - "loss": 12.2941, + "grad_norm": 0.1825079768896103, + "learning_rate": 0.00011200000000000001, + "loss": 12.3343, "step": 168 }, { "epoch": 0.027673161945308663, - "grad_norm": 0.12881392240524292, - "learning_rate": 0.00019997148774564266, - "loss": 12.3144, + "grad_norm": 0.18791979551315308, + "learning_rate": 0.00011266666666666668, + "loss": 12.3431, "step": 169 }, { "epoch": 0.027836908465695104, - "grad_norm": 0.12443830817937851, - "learning_rate": 0.00019997108029201627, - "loss": 12.2907, + "grad_norm": 0.17354759573936462, + "learning_rate": 0.00011333333333333334, + "loss": 12.3344, "step": 170 }, { "epoch": 0.028000654986081546, - "grad_norm": 0.22190840542316437, - "learning_rate": 0.00019997067283838987, - "loss": 12.2963, + "grad_norm": 0.18769007921218872, + "learning_rate": 0.00011399999999999999, + "loss": 12.3321, "step": 171 }, { "epoch": 0.028164401506467988, - "grad_norm": 0.1679324209690094, - "learning_rate": 0.00019997025083284825, - "loss": 12.276, + "grad_norm": 0.2163514792919159, + "learning_rate": 0.00011466666666666667, + "loss": 12.323, "step": 172 }, { "epoch": 0.02832814802685443, - "grad_norm": 0.14645697176456451, - "learning_rate": 0.00019996982882730663, - "loss": 12.2968, + "grad_norm": 0.18447071313858032, + "learning_rate": 0.00011533333333333334, + "loss": 12.337, "step": 173 }, { "epoch": 0.02849189454724087, - "grad_norm": 0.14275029301643372, - "learning_rate": 0.000199969406821765, - "loss": 12.2698, + "grad_norm": 0.2503637969493866, + "learning_rate": 0.000116, + "loss": 12.3102, "step": 174 }, { "epoch": 0.028655641067627313, - "grad_norm": 0.167531818151474, - "learning_rate": 0.00019996898481622338, - "loss": 12.2998, + "grad_norm": 0.1710224151611328, + "learning_rate": 0.00011666666666666668, + "loss": 12.323, "step": 175 }, { "epoch": 0.028819387588013754, - "grad_norm": 0.20161862671375275, - "learning_rate": 0.00019996856281068176, - "loss": 12.3161, + "grad_norm": 0.18545013666152954, + "learning_rate": 0.00011733333333333334, + "loss": 12.3386, "step": 176 }, { "epoch": 0.028983134108400196, - "grad_norm": 0.1790657937526703, - "learning_rate": 0.00019996811170130968, - "loss": 12.2802, + "grad_norm": 0.1715422123670578, + "learning_rate": 0.000118, + "loss": 12.3175, "step": 177 }, { "epoch": 0.029146880628786637, - "grad_norm": 0.13983246684074402, - "learning_rate": 0.00019996768969576806, - "loss": 12.2937, + "grad_norm": 0.21719999611377716, + "learning_rate": 0.00011866666666666669, + "loss": 12.3342, "step": 178 }, { "epoch": 0.02931062714917308, - "grad_norm": 0.09113561362028122, - "learning_rate": 0.0001999672531383112, - "loss": 12.2749, + "grad_norm": 0.14283528923988342, + "learning_rate": 0.00011933333333333334, + "loss": 12.3114, "step": 179 }, { "epoch": 0.02947437366955952, - "grad_norm": 0.15219911932945251, - "learning_rate": 0.00019996680202893913, - "loss": 12.291, + "grad_norm": 0.14593714475631714, + "learning_rate": 0.00012, + "loss": 12.3237, "step": 180 }, { "epoch": 0.029638120189945962, - "grad_norm": 0.15494637191295624, - "learning_rate": 0.00019996636547148228, - "loss": 12.307, + "grad_norm": 0.2272740751504898, + "learning_rate": 0.00012066666666666668, + "loss": 12.3285, "step": 181 }, { "epoch": 0.029801866710332404, - "grad_norm": 0.10758625715970993, - "learning_rate": 0.0001999659143621102, - "loss": 12.3137, + "grad_norm": 0.1673334389925003, + "learning_rate": 0.00012133333333333335, + "loss": 12.3374, "step": 182 }, { "epoch": 0.02996561323071885, - "grad_norm": 0.11248419433832169, - "learning_rate": 0.00019996547780465335, - "loss": 12.3021, + "grad_norm": 0.1368860900402069, + "learning_rate": 0.000122, + "loss": 12.335, "step": 183 }, { "epoch": 0.03012935975110529, - "grad_norm": 0.262703537940979, - "learning_rate": 0.00019996501214336604, - "loss": 12.3134, + "grad_norm": 0.2395031452178955, + "learning_rate": 0.00012266666666666668, + "loss": 12.3387, "step": 184 }, { "epoch": 0.030293106271491732, - "grad_norm": 0.16971834003925323, - "learning_rate": 0.00019996456103399396, - "loss": 12.2921, + "grad_norm": 0.16532330214977264, + "learning_rate": 0.00012333333333333334, + "loss": 12.3234, "step": 185 }, { "epoch": 0.030456852791878174, - "grad_norm": 0.21756646037101746, - "learning_rate": 0.00019996409537270665, - "loss": 12.3007, + "grad_norm": 0.18451902270317078, + "learning_rate": 0.000124, + "loss": 12.3312, "step": 186 }, { "epoch": 0.030620599312264615, - "grad_norm": 0.16931568086147308, - "learning_rate": 0.00019996362971141934, - "loss": 12.2984, + "grad_norm": 0.14678575098514557, + "learning_rate": 0.00012466666666666667, + "loss": 12.3127, "step": 187 }, { "epoch": 0.030784345832651057, - "grad_norm": 0.1374356746673584, - "learning_rate": 0.00019996316405013204, - "loss": 12.2771, + "grad_norm": 0.1310257613658905, + "learning_rate": 0.00012533333333333334, + "loss": 12.3053, "step": 188 }, { "epoch": 0.0309480923530375, - "grad_norm": 0.19620707631111145, - "learning_rate": 0.00019996271294075996, - "loss": 12.3238, + "grad_norm": 0.17656965553760529, + "learning_rate": 0.000126, + "loss": 12.3509, "step": 189 }, { "epoch": 0.03111183887342394, - "grad_norm": 0.15874610841274261, - "learning_rate": 0.00019996224727947265, - "loss": 12.281, + "grad_norm": 0.18695133924484253, + "learning_rate": 0.00012666666666666666, + "loss": 12.3122, "step": 190 }, { "epoch": 0.03127558539381038, - "grad_norm": 0.1111041009426117, - "learning_rate": 0.0001999617670662701, - "loss": 12.2995, + "grad_norm": 0.1660790890455246, + "learning_rate": 0.00012733333333333336, + "loss": 12.3267, "step": 191 }, { "epoch": 0.03143933191419682, - "grad_norm": 0.18783725798130035, - "learning_rate": 0.00019996128685306758, - "loss": 12.2759, + "grad_norm": 0.16188572347164154, + "learning_rate": 0.00012800000000000002, + "loss": 12.306, "step": 192 }, { "epoch": 0.031603078434583265, - "grad_norm": 0.2878400385379791, - "learning_rate": 0.00019996080663986504, - "loss": 12.2879, + "grad_norm": 0.4507922828197479, + "learning_rate": 0.00012866666666666666, + "loss": 12.312, "step": 193 }, { "epoch": 0.03176682495496971, - "grad_norm": 0.15140052139759064, - "learning_rate": 0.0001999603264266625, - "loss": 12.2966, + "grad_norm": 0.1829795241355896, + "learning_rate": 0.00012933333333333332, + "loss": 12.3141, "step": 194 }, { "epoch": 0.03193057147535615, - "grad_norm": 0.18027496337890625, - "learning_rate": 0.00019995984621345997, - "loss": 12.3031, + "grad_norm": 0.16686825454235077, + "learning_rate": 0.00013000000000000002, + "loss": 12.3271, "step": 195 }, { "epoch": 0.03209431799574259, - "grad_norm": 0.28657597303390503, - "learning_rate": 0.0001999593514483422, - "loss": 12.2774, + "grad_norm": 0.26499465107917786, + "learning_rate": 0.00013066666666666668, + "loss": 12.3014, "step": 196 }, { "epoch": 0.03225806451612903, - "grad_norm": 0.22153787314891815, - "learning_rate": 0.00019995885668322444, - "loss": 12.3025, + "grad_norm": 0.1288941651582718, + "learning_rate": 0.00013133333333333332, + "loss": 12.3216, "step": 197 }, { "epoch": 0.03242181103651547, - "grad_norm": 0.19516243040561676, - "learning_rate": 0.00019995836191810668, - "loss": 12.2923, + "grad_norm": 0.19364045560359955, + "learning_rate": 0.000132, + "loss": 12.3043, "step": 198 }, { "epoch": 0.032585557556901915, - "grad_norm": 0.17245976626873016, - "learning_rate": 0.0001999578671529889, - "loss": 12.265, + "grad_norm": 0.1700012981891632, + "learning_rate": 0.00013266666666666667, + "loss": 12.2953, "step": 199 }, { "epoch": 0.032749304077288356, - "grad_norm": 0.1579219251871109, - "learning_rate": 0.00019995737238787115, - "loss": 12.2812, - "step": 200 - }, - { - "epoch": 0.032749304077288356, - "eval_loss": 12.291393280029297, - "eval_runtime": 7.4374, - "eval_samples_per_second": 33.076, - "eval_steps_per_second": 16.538, + "grad_norm": 0.16173534095287323, + "learning_rate": 0.00013333333333333334, + "loss": 12.3054, "step": 200 }, { "epoch": 0.0329130505976748, - "grad_norm": 0.161886066198349, - "learning_rate": 0.00019995686307083815, - "loss": 12.2984, + "grad_norm": 0.1652362197637558, + "learning_rate": 0.000134, + "loss": 12.3216, "step": 201 }, { "epoch": 0.03307679711806124, - "grad_norm": 0.221256822347641, - "learning_rate": 0.0001999563683057204, - "loss": 12.3126, + "grad_norm": 0.18516595661640167, + "learning_rate": 0.00013466666666666667, + "loss": 12.3394, "step": 202 }, { "epoch": 0.03324054363844768, - "grad_norm": 0.12891951203346252, - "learning_rate": 0.0001999558589886874, - "loss": 12.3078, + "grad_norm": 0.16532324254512787, + "learning_rate": 0.00013533333333333333, + "loss": 12.3365, "step": 203 }, { "epoch": 0.03340429015883412, - "grad_norm": 0.1363368183374405, - "learning_rate": 0.00019995533511973917, - "loss": 12.2684, + "grad_norm": 0.1639377474784851, + "learning_rate": 0.00013600000000000003, + "loss": 12.2997, "step": 204 }, { "epoch": 0.033568036679220564, - "grad_norm": 0.12029746174812317, - "learning_rate": 0.00019995482580270618, - "loss": 12.283, + "grad_norm": 0.13720141351222992, + "learning_rate": 0.00013666666666666666, + "loss": 12.3105, "step": 205 }, { "epoch": 0.033731783199607006, - "grad_norm": 0.13670730590820312, - "learning_rate": 0.0001999543164856732, - "loss": 12.2871, + "grad_norm": 0.116838738322258, + "learning_rate": 0.00013733333333333333, + "loss": 12.3063, "step": 206 }, { "epoch": 0.03389552971999345, - "grad_norm": 0.12571097910404205, - "learning_rate": 0.00019995379261672497, - "loss": 12.297, + "grad_norm": 0.22012831270694733, + "learning_rate": 0.000138, + "loss": 12.32, "step": 207 }, { "epoch": 0.03405927624037989, - "grad_norm": 0.20929580926895142, - "learning_rate": 0.00019995326874777675, - "loss": 12.2853, + "grad_norm": 0.29010009765625, + "learning_rate": 0.00013866666666666669, + "loss": 12.3054, "step": 208 }, { "epoch": 0.03422302276076633, - "grad_norm": 0.13680557906627655, - "learning_rate": 0.00019995274487882853, - "loss": 12.3088, + "grad_norm": 0.13133449852466583, + "learning_rate": 0.00013933333333333335, + "loss": 12.329, "step": 209 }, { "epoch": 0.03438676928115277, - "grad_norm": 0.24794039130210876, - "learning_rate": 0.00019995220645796508, - "loss": 12.3039, + "grad_norm": 0.18425866961479187, + "learning_rate": 0.00014, + "loss": 12.3166, "step": 210 }, { "epoch": 0.034550515801539214, - "grad_norm": 0.220923513174057, - "learning_rate": 0.00019995168258901685, - "loss": 12.2739, + "grad_norm": 0.21036991477012634, + "learning_rate": 0.00014066666666666668, + "loss": 12.3058, "step": 211 }, { "epoch": 0.03471426232192566, - "grad_norm": 0.18528875708580017, - "learning_rate": 0.0001999511441681534, - "loss": 12.2915, + "grad_norm": 0.13942180573940277, + "learning_rate": 0.00014133333333333334, + "loss": 12.3181, "step": 212 }, { "epoch": 0.034878008842312104, - "grad_norm": 0.10424947738647461, - "learning_rate": 0.00019995060574728996, - "loss": 12.3008, + "grad_norm": 0.13669800758361816, + "learning_rate": 0.000142, + "loss": 12.3243, "step": 213 }, { "epoch": 0.035041755362698546, - "grad_norm": 0.10950024425983429, - "learning_rate": 0.00019995005277451128, - "loss": 12.2888, + "grad_norm": 0.11211808025836945, + "learning_rate": 0.00014266666666666667, + "loss": 12.3052, "step": 214 }, { "epoch": 0.03520550188308499, - "grad_norm": 0.1325610876083374, - "learning_rate": 0.00019994952890556306, - "loss": 12.2658, + "grad_norm": 0.2188958078622818, + "learning_rate": 0.00014333333333333334, + "loss": 12.2904, "step": 215 }, { "epoch": 0.03536924840347143, - "grad_norm": 0.21740667521953583, - "learning_rate": 0.00019994897593278438, - "loss": 12.2997, + "grad_norm": 0.20076972246170044, + "learning_rate": 0.000144, + "loss": 12.3283, "step": 216 }, { "epoch": 0.03553299492385787, - "grad_norm": 0.14994339644908905, - "learning_rate": 0.0001999484229600057, - "loss": 12.285, + "grad_norm": 0.1893579661846161, + "learning_rate": 0.0001446666666666667, + "loss": 12.3036, "step": 217 }, { "epoch": 0.03569674144424431, - "grad_norm": 0.1752844899892807, - "learning_rate": 0.00019994786998722702, - "loss": 12.2739, + "grad_norm": 0.12186531722545624, + "learning_rate": 0.00014533333333333333, + "loss": 12.2949, "step": 218 }, { "epoch": 0.035860487964630754, - "grad_norm": 0.16935011744499207, - "learning_rate": 0.00019994731701444834, - "loss": 12.2662, + "grad_norm": 0.19035090506076813, + "learning_rate": 0.000146, + "loss": 12.2976, "step": 219 }, { "epoch": 0.036024234485017195, - "grad_norm": 0.15907202661037445, - "learning_rate": 0.00019994674948975444, - "loss": 12.2754, + "grad_norm": 0.17918401956558228, + "learning_rate": 0.00014666666666666666, + "loss": 12.299, "step": 220 }, { "epoch": 0.03618798100540364, - "grad_norm": 0.1968131810426712, - "learning_rate": 0.00019994619651697576, - "loss": 12.2901, + "grad_norm": 0.22605666518211365, + "learning_rate": 0.00014733333333333335, + "loss": 12.3179, "step": 221 }, { "epoch": 0.03635172752579008, - "grad_norm": 0.15830907225608826, - "learning_rate": 0.00019994562899228185, - "loss": 12.279, + "grad_norm": 0.17605355381965637, + "learning_rate": 0.000148, + "loss": 12.3032, "step": 222 }, { "epoch": 0.03651547404617652, - "grad_norm": 0.13416405022144318, - "learning_rate": 0.00019994504691567272, - "loss": 12.2651, + "grad_norm": 0.1425754278898239, + "learning_rate": 0.00014866666666666666, + "loss": 12.2915, "step": 223 }, { "epoch": 0.03667922056656296, - "grad_norm": 0.2023450881242752, - "learning_rate": 0.00019994449394289404, - "loss": 12.2948, + "grad_norm": 0.17672143876552582, + "learning_rate": 0.00014933333333333335, + "loss": 12.3145, "step": 224 }, { "epoch": 0.0368429670869494, - "grad_norm": 0.11039238423109055, - "learning_rate": 0.0001999439118662849, - "loss": 12.2881, + "grad_norm": 0.12379948049783707, + "learning_rate": 0.00015000000000000001, + "loss": 12.3129, "step": 225 }, { "epoch": 0.037006713607335845, - "grad_norm": 0.2013334035873413, - "learning_rate": 0.00019994332978967577, - "loss": 12.3025, + "grad_norm": 0.1985873281955719, + "learning_rate": 0.00015066666666666668, + "loss": 12.3372, "step": 226 }, { "epoch": 0.037170460127722287, - "grad_norm": 0.20877353847026825, - "learning_rate": 0.00019994276226498187, - "loss": 12.3149, + "grad_norm": 0.22402392327785492, + "learning_rate": 0.00015133333333333334, + "loss": 12.3404, "step": 227 }, { "epoch": 0.03733420664810873, - "grad_norm": 0.12196000665426254, - "learning_rate": 0.00019994218018837273, - "loss": 12.2965, + "grad_norm": 0.14348964393138885, + "learning_rate": 0.000152, + "loss": 12.317, "step": 228 }, { "epoch": 0.03749795316849517, - "grad_norm": 0.18351589143276215, - "learning_rate": 0.0001999415981117636, - "loss": 12.3101, + "grad_norm": 0.19029805064201355, + "learning_rate": 0.00015266666666666667, + "loss": 12.3274, "step": 229 }, { "epoch": 0.03766169968888161, - "grad_norm": 0.3522573411464691, - "learning_rate": 0.00019994100148323923, - "loss": 12.3185, + "grad_norm": 0.3693244457244873, + "learning_rate": 0.00015333333333333334, + "loss": 12.336, "step": 230 }, { "epoch": 0.03782544620926805, - "grad_norm": 0.1347028911113739, - "learning_rate": 0.0001999404194066301, - "loss": 12.2782, + "grad_norm": 0.15163873136043549, + "learning_rate": 0.000154, + "loss": 12.2995, "step": 231 }, { "epoch": 0.037989192729654495, - "grad_norm": 0.1606866717338562, - "learning_rate": 0.00019993982277810574, - "loss": 12.2837, + "grad_norm": 0.18613851070404053, + "learning_rate": 0.00015466666666666667, + "loss": 12.3056, "step": 232 }, { "epoch": 0.038152939250040936, - "grad_norm": 0.18167780339717865, - "learning_rate": 0.00019993922614958137, - "loss": 12.305, + "grad_norm": 0.2206224650144577, + "learning_rate": 0.00015533333333333333, + "loss": 12.3205, "step": 233 }, { "epoch": 0.03831668577042738, - "grad_norm": 0.1324174553155899, - "learning_rate": 0.00019993861496914178, - "loss": 12.2859, + "grad_norm": 0.11398882418870926, + "learning_rate": 0.00015600000000000002, + "loss": 12.3021, "step": 234 }, { "epoch": 0.03848043229081382, - "grad_norm": 0.16425751149654388, - "learning_rate": 0.00019993801834061742, - "loss": 12.2773, + "grad_norm": 0.1618189513683319, + "learning_rate": 0.00015666666666666666, + "loss": 12.2969, "step": 235 }, { "epoch": 0.03864417881120026, - "grad_norm": 0.21324995160102844, - "learning_rate": 0.00019993740716017783, - "loss": 12.2713, + "grad_norm": 0.1825104057788849, + "learning_rate": 0.00015733333333333333, + "loss": 12.2849, "step": 236 }, { "epoch": 0.0388079253315867, - "grad_norm": 0.22252927720546722, - "learning_rate": 0.00019993679597973824, - "loss": 12.3141, + "grad_norm": 0.2700357139110565, + "learning_rate": 0.00015800000000000002, + "loss": 12.3281, "step": 237 }, { "epoch": 0.038971671851973144, - "grad_norm": 0.12647491693496704, - "learning_rate": 0.00019993618479929864, - "loss": 12.2744, + "grad_norm": 0.12693750858306885, + "learning_rate": 0.00015866666666666668, + "loss": 12.2915, "step": 238 }, { "epoch": 0.039135418372359586, - "grad_norm": 0.23671850562095642, - "learning_rate": 0.00019993557361885905, - "loss": 12.2572, + "grad_norm": 0.20626111328601837, + "learning_rate": 0.00015933333333333332, + "loss": 12.2844, "step": 239 }, { "epoch": 0.03929916489274603, - "grad_norm": 0.14135394990444183, - "learning_rate": 0.00019993494788650423, - "loss": 12.2701, + "grad_norm": 0.28300392627716064, + "learning_rate": 0.00016, + "loss": 12.2941, "step": 240 }, { "epoch": 0.03946291141313247, - "grad_norm": 0.19470906257629395, - "learning_rate": 0.00019993433670606464, - "loss": 12.2932, + "grad_norm": 0.2612934112548828, + "learning_rate": 0.00016066666666666668, + "loss": 12.3069, "step": 241 }, { "epoch": 0.03962665793351891, - "grad_norm": 0.21060970425605774, - "learning_rate": 0.00019993371097370982, - "loss": 12.2607, + "grad_norm": 0.16001901030540466, + "learning_rate": 0.00016133333333333334, + "loss": 12.28, "step": 242 }, { "epoch": 0.03979040445390535, - "grad_norm": 0.1116245836019516, - "learning_rate": 0.000199933085241355, - "loss": 12.2812, + "grad_norm": 0.13632474839687347, + "learning_rate": 0.000162, + "loss": 12.3005, "step": 243 }, { "epoch": 0.039954150974291794, - "grad_norm": 0.20140965282917023, - "learning_rate": 0.00019993244495708495, - "loss": 12.2896, + "grad_norm": 0.30553674697875977, + "learning_rate": 0.00016266666666666667, + "loss": 12.3069, "step": 244 }, { "epoch": 0.040117897494678235, - "grad_norm": 0.11436797678470612, - "learning_rate": 0.00019993181922473013, - "loss": 12.2738, + "grad_norm": 0.14686566591262817, + "learning_rate": 0.00016333333333333334, + "loss": 12.2975, "step": 245 }, { "epoch": 0.04028164401506468, - "grad_norm": 0.1359335482120514, - "learning_rate": 0.00019993119349237531, - "loss": 12.2512, + "grad_norm": 0.15335270762443542, + "learning_rate": 0.000164, + "loss": 12.2738, "step": 246 }, { "epoch": 0.04044539053545112, - "grad_norm": 0.1482047438621521, - "learning_rate": 0.00019993053865619004, - "loss": 12.2944, + "grad_norm": 0.15902471542358398, + "learning_rate": 0.00016466666666666667, + "loss": 12.3114, "step": 247 }, { "epoch": 0.04060913705583756, - "grad_norm": 0.12831032276153564, - "learning_rate": 0.00019992989837192, - "loss": 12.304, + "grad_norm": 0.13151733577251434, + "learning_rate": 0.00016533333333333333, + "loss": 12.3157, "step": 248 }, { "epoch": 0.040772883576224, - "grad_norm": 0.18362678587436676, - "learning_rate": 0.00019992925808764994, - "loss": 12.2673, + "grad_norm": 0.2536516785621643, + "learning_rate": 0.000166, + "loss": 12.2878, "step": 249 }, { "epoch": 0.04093663009661045, - "grad_norm": 0.13624298572540283, - "learning_rate": 0.0001999286178033799, - "loss": 12.2556, + "grad_norm": 0.21861162781715393, + "learning_rate": 0.0001666666666666667, + "loss": 12.2746, "step": 250 }, { "epoch": 0.04110037661699689, - "grad_norm": 0.1984298676252365, - "learning_rate": 0.00019992796296719462, - "loss": 12.3274, + "grad_norm": 0.19811271131038666, + "learning_rate": 0.00016733333333333335, + "loss": 12.3352, "step": 251 }, { "epoch": 0.041264123137383334, - "grad_norm": 0.19891658425331116, - "learning_rate": 0.00019992730813100934, - "loss": 12.2671, + "grad_norm": 0.4501516819000244, + "learning_rate": 0.000168, + "loss": 12.2949, "step": 252 }, { "epoch": 0.041427869657769775, - "grad_norm": 0.16791290044784546, - "learning_rate": 0.00019992665329482406, - "loss": 12.2687, + "grad_norm": 0.17889365553855896, + "learning_rate": 0.00016866666666666668, + "loss": 12.2884, "step": 253 }, { "epoch": 0.04159161617815622, - "grad_norm": 0.3288877606391907, - "learning_rate": 0.00019992598390672356, - "loss": 12.2904, + "grad_norm": 0.22105363011360168, + "learning_rate": 0.00016933333333333335, + "loss": 12.3049, "step": 254 }, { "epoch": 0.04175536269854266, - "grad_norm": 0.2074000984430313, - "learning_rate": 0.00019992532907053828, - "loss": 12.2962, + "grad_norm": 0.2042873650789261, + "learning_rate": 0.00017, + "loss": 12.3225, "step": 255 }, { "epoch": 0.0419191092189291, - "grad_norm": 0.16552244126796722, - "learning_rate": 0.00019992465968243778, - "loss": 12.2825, + "grad_norm": 0.13782532513141632, + "learning_rate": 0.00017066666666666668, + "loss": 12.3041, "step": 256 }, { "epoch": 0.04208285573931554, - "grad_norm": 0.0921081155538559, - "learning_rate": 0.0001999240048462525, - "loss": 12.2688, + "grad_norm": 0.11734238266944885, + "learning_rate": 0.00017133333333333334, + "loss": 12.2912, "step": 257 }, { "epoch": 0.04224660225970198, - "grad_norm": 0.1811661273241043, - "learning_rate": 0.000199923335458152, - "loss": 12.2645, + "grad_norm": 0.2625914216041565, + "learning_rate": 0.000172, + "loss": 12.2863, "step": 258 }, { "epoch": 0.042410348780088425, - "grad_norm": 0.1614493876695633, - "learning_rate": 0.00019992265151813626, - "loss": 12.3029, + "grad_norm": 0.15754052996635437, + "learning_rate": 0.00017266666666666667, + "loss": 12.3181, "step": 259 }, { "epoch": 0.042574095300474867, - "grad_norm": 0.1890016794204712, - "learning_rate": 0.00019992196757812053, - "loss": 12.2787, + "grad_norm": 0.21560034155845642, + "learning_rate": 0.00017333333333333334, + "loss": 12.3165, "step": 260 }, { "epoch": 0.04273784182086131, - "grad_norm": 0.137629434466362, - "learning_rate": 0.00019992129819002002, - "loss": 12.2516, + "grad_norm": 0.12006810307502747, + "learning_rate": 0.000174, + "loss": 12.2814, "step": 261 }, { "epoch": 0.04290158834124775, - "grad_norm": 0.2103978395462036, - "learning_rate": 0.0001999206142500043, - "loss": 12.2785, + "grad_norm": 0.19567811489105225, + "learning_rate": 0.00017466666666666667, + "loss": 12.303, "step": 262 }, { "epoch": 0.04306533486163419, - "grad_norm": 0.1392078995704651, - "learning_rate": 0.00019991993030998856, - "loss": 12.2816, + "grad_norm": 0.15239214897155762, + "learning_rate": 0.00017533333333333336, + "loss": 12.3033, "step": 263 }, { "epoch": 0.04322908138202063, - "grad_norm": 0.15243275463581085, - "learning_rate": 0.00019991924636997283, - "loss": 12.2646, + "grad_norm": 0.14320224523544312, + "learning_rate": 0.00017600000000000002, + "loss": 12.2838, "step": 264 }, { "epoch": 0.043392827902407075, - "grad_norm": 0.155501127243042, - "learning_rate": 0.00019991854787804186, - "loss": 12.2842, + "grad_norm": 0.1736345887184143, + "learning_rate": 0.00017666666666666666, + "loss": 12.3115, "step": 265 }, { "epoch": 0.043556574422793516, - "grad_norm": 0.1776060163974762, - "learning_rate": 0.00019991786393802613, - "loss": 12.2972, + "grad_norm": 0.16777797043323517, + "learning_rate": 0.00017733333333333335, + "loss": 12.3189, "step": 266 }, { "epoch": 0.04372032094317996, - "grad_norm": 0.1182047501206398, - "learning_rate": 0.00019991715089417994, - "loss": 12.2812, + "grad_norm": 0.1118602603673935, + "learning_rate": 0.00017800000000000002, + "loss": 12.2967, "step": 267 }, { "epoch": 0.0438840674635664, - "grad_norm": 0.10008183866739273, - "learning_rate": 0.00019991645240224898, - "loss": 12.2747, + "grad_norm": 0.14121738076210022, + "learning_rate": 0.00017866666666666668, + "loss": 12.2935, "step": 268 }, { "epoch": 0.04404781398395284, - "grad_norm": 0.2416013479232788, - "learning_rate": 0.00019991575391031802, - "loss": 12.2916, + "grad_norm": 0.2797190546989441, + "learning_rate": 0.00017933333333333332, + "loss": 12.3174, "step": 269 }, { "epoch": 0.04421156050433928, - "grad_norm": 0.15756867825984955, - "learning_rate": 0.00019991504086647183, - "loss": 12.259, + "grad_norm": 0.2606465220451355, + "learning_rate": 0.00018, + "loss": 12.283, "step": 270 }, { "epoch": 0.044375307024725724, - "grad_norm": 0.17258231341838837, - "learning_rate": 0.00019991434237454087, - "loss": 12.3201, + "grad_norm": 0.24559257924556732, + "learning_rate": 0.00018066666666666668, + "loss": 12.3289, "step": 271 }, { "epoch": 0.044539053545112166, - "grad_norm": 0.19413842260837555, - "learning_rate": 0.00019991362933069468, - "loss": 12.2855, + "grad_norm": 0.2690947949886322, + "learning_rate": 0.00018133333333333334, + "loss": 12.3016, "step": 272 }, { "epoch": 0.04470280006549861, - "grad_norm": 0.14625532925128937, - "learning_rate": 0.00019991291628684849, - "loss": 12.2546, + "grad_norm": 0.14213307201862335, + "learning_rate": 0.000182, + "loss": 12.2888, "step": 273 }, { "epoch": 0.04486654658588505, - "grad_norm": 0.2281506359577179, - "learning_rate": 0.0001999122032430023, - "loss": 12.2741, + "grad_norm": 0.23441320657730103, + "learning_rate": 0.00018266666666666667, + "loss": 12.2924, "step": 274 }, { "epoch": 0.04503029310627149, - "grad_norm": 0.207796111702919, - "learning_rate": 0.00019991147564724088, - "loss": 12.2596, + "grad_norm": 0.1753687560558319, + "learning_rate": 0.00018333333333333334, + "loss": 12.2822, "step": 275 }, { "epoch": 0.04519403962665793, - "grad_norm": 0.1390552818775177, - "learning_rate": 0.00019991074805147946, - "loss": 12.2327, + "grad_norm": 0.13752730190753937, + "learning_rate": 0.00018400000000000003, + "loss": 12.2564, "step": 276 }, { "epoch": 0.045357786147044374, - "grad_norm": 0.1534932404756546, - "learning_rate": 0.00019991002045571804, - "loss": 12.2644, + "grad_norm": 0.09531578421592712, + "learning_rate": 0.00018466666666666666, + "loss": 12.2847, "step": 277 }, { "epoch": 0.045521532667430815, - "grad_norm": 0.142225444316864, - "learning_rate": 0.00019990929285995662, - "loss": 12.2712, + "grad_norm": 0.13418786227703094, + "learning_rate": 0.00018533333333333333, + "loss": 12.2926, "step": 278 }, { "epoch": 0.04568527918781726, - "grad_norm": 0.1784576177597046, - "learning_rate": 0.0001999085652641952, - "loss": 12.2751, + "grad_norm": 0.12498784065246582, + "learning_rate": 0.00018600000000000002, + "loss": 12.2995, "step": 279 }, { "epoch": 0.0458490257082037, - "grad_norm": 0.13212628662586212, - "learning_rate": 0.00019990782311651856, - "loss": 12.2734, + "grad_norm": 0.1601720005273819, + "learning_rate": 0.0001866666666666667, + "loss": 12.2968, "step": 280 }, { "epoch": 0.04601277222859014, - "grad_norm": 0.12999513745307922, - "learning_rate": 0.0001999070809688419, - "loss": 12.2971, + "grad_norm": 0.14898695051670074, + "learning_rate": 0.00018733333333333335, + "loss": 12.3042, "step": 281 }, { "epoch": 0.04617651874897658, - "grad_norm": 0.14513814449310303, - "learning_rate": 0.00019990633882116526, - "loss": 12.257, + "grad_norm": 0.13938522338867188, + "learning_rate": 0.000188, + "loss": 12.2859, "step": 282 }, { "epoch": 0.046340265269363023, - "grad_norm": 0.1383667141199112, - "learning_rate": 0.00019990559667348862, - "loss": 12.2826, + "grad_norm": 0.17422890663146973, + "learning_rate": 0.00018866666666666668, + "loss": 12.3031, "step": 283 }, { "epoch": 0.046504011789749465, - "grad_norm": 0.16239354014396667, - "learning_rate": 0.00019990485452581197, - "loss": 12.2747, + "grad_norm": 0.20269818603992462, + "learning_rate": 0.00018933333333333335, + "loss": 12.2896, "step": 284 }, { "epoch": 0.04666775831013591, - "grad_norm": 0.14773111045360565, - "learning_rate": 0.0001999040978262201, - "loss": 12.2803, + "grad_norm": 0.16185009479522705, + "learning_rate": 0.00019, + "loss": 12.3044, "step": 285 }, { "epoch": 0.04683150483052235, - "grad_norm": 0.20306552946567535, - "learning_rate": 0.00019990334112662822, - "loss": 12.307, + "grad_norm": 0.26534372568130493, + "learning_rate": 0.00019066666666666668, + "loss": 12.3174, "step": 286 }, { "epoch": 0.04699525135090879, - "grad_norm": 0.1623079627752304, - "learning_rate": 0.00019990258442703635, - "loss": 12.3055, + "grad_norm": 0.19738201797008514, + "learning_rate": 0.00019133333333333334, + "loss": 12.3236, "step": 287 }, { "epoch": 0.04715899787129523, - "grad_norm": 0.17853973805904388, - "learning_rate": 0.00019990182772744447, - "loss": 12.3012, + "grad_norm": 0.28078433871269226, + "learning_rate": 0.000192, + "loss": 12.3156, "step": 288 }, { "epoch": 0.04732274439168168, - "grad_norm": 0.207291841506958, - "learning_rate": 0.00019990105647593737, - "loss": 12.2876, + "grad_norm": 0.2100210189819336, + "learning_rate": 0.0001926666666666667, + "loss": 12.2992, "step": 289 }, { "epoch": 0.04748649091206812, - "grad_norm": 0.15179339051246643, - "learning_rate": 0.0001999002997763455, - "loss": 12.2683, + "grad_norm": 0.15641532838344574, + "learning_rate": 0.00019333333333333333, + "loss": 12.2878, "step": 290 }, { "epoch": 0.04765023743245456, - "grad_norm": 0.18993042409420013, - "learning_rate": 0.00019989954307675362, - "loss": 12.2622, + "grad_norm": 0.2536814212799072, + "learning_rate": 0.000194, + "loss": 12.2886, "step": 291 }, { "epoch": 0.047813983952841005, - "grad_norm": 0.1102190762758255, - "learning_rate": 0.00019989875727333128, - "loss": 12.2814, + "grad_norm": 0.11571547389030457, + "learning_rate": 0.0001946666666666667, + "loss": 12.3017, "step": 292 }, { "epoch": 0.047977730473227446, - "grad_norm": 0.1876610964536667, - "learning_rate": 0.00019989798602182418, - "loss": 12.2715, + "grad_norm": 0.16971823573112488, + "learning_rate": 0.00019533333333333336, + "loss": 12.2832, "step": 293 }, { "epoch": 0.04814147699361389, - "grad_norm": 0.1995493471622467, - "learning_rate": 0.00019989721477031708, - "loss": 12.2674, + "grad_norm": 0.2792551815509796, + "learning_rate": 0.000196, + "loss": 12.3043, "step": 294 }, { "epoch": 0.04830522351400033, - "grad_norm": 0.11874358355998993, - "learning_rate": 0.00019989642896689475, - "loss": 12.3595, + "grad_norm": 0.21620360016822815, + "learning_rate": 0.00019666666666666666, + "loss": 12.3755, "step": 295 }, { "epoch": 0.04846897003438677, - "grad_norm": 0.2674122452735901, - "learning_rate": 0.00019989565771538764, - "loss": 12.3017, + "grad_norm": 0.24013638496398926, + "learning_rate": 0.00019733333333333335, + "loss": 12.3221, "step": 296 }, { "epoch": 0.04863271655477321, - "grad_norm": 0.15029270946979523, - "learning_rate": 0.00019989485736005008, - "loss": 12.268, + "grad_norm": 0.2197066694498062, + "learning_rate": 0.00019800000000000002, + "loss": 12.2921, "step": 297 }, { "epoch": 0.048796463075159655, - "grad_norm": 0.17938172817230225, - "learning_rate": 0.00019989407155662775, - "loss": 12.2819, + "grad_norm": 0.20803050696849823, + "learning_rate": 0.00019866666666666668, + "loss": 12.3009, "step": 298 }, { "epoch": 0.048960209595546096, - "grad_norm": 0.19023428857326508, - "learning_rate": 0.00019989328575320542, - "loss": 12.2706, + "grad_norm": 0.1672317534685135, + "learning_rate": 0.00019933333333333334, + "loss": 12.2926, "step": 299 }, { "epoch": 0.04912395611593254, - "grad_norm": 0.21805034577846527, - "learning_rate": 0.00019989248539786786, - "loss": 12.278, - "step": 300 - }, - { - "epoch": 0.04912395611593254, - "eval_loss": 12.275172233581543, - "eval_runtime": 7.4494, - "eval_samples_per_second": 33.023, - "eval_steps_per_second": 16.511, + "grad_norm": 0.1740259975194931, + "learning_rate": 0.0002, + "loss": 12.3002, "step": 300 }, { "epoch": 0.04928770263631898, - "grad_norm": 0.1406513899564743, - "learning_rate": 0.00019989169959444553, - "loss": 12.2771, + "grad_norm": 0.13165144622325897, + "learning_rate": 0.00019999999986637393, + "loss": 12.3066, "step": 301 }, { "epoch": 0.04945144915670542, - "grad_norm": 0.14678552746772766, - "learning_rate": 0.00019989088468719274, - "loss": 12.2657, + "grad_norm": 0.23863595724105835, + "learning_rate": 0.00019999999946549563, + "loss": 12.2885, "step": 302 }, { "epoch": 0.04961519567709186, - "grad_norm": 0.14424839615821838, - "learning_rate": 0.00019989008433185518, - "loss": 12.2738, + "grad_norm": 0.12646012008190155, + "learning_rate": 0.00019999999879736518, + "loss": 12.2915, "step": 303 }, { "epoch": 0.049778942197478304, - "grad_norm": 0.2018839716911316, - "learning_rate": 0.0001998892694246024, - "loss": 12.2814, + "grad_norm": 0.2852458953857422, + "learning_rate": 0.00019999999786198252, + "loss": 12.3148, "step": 304 }, { "epoch": 0.049942688717864746, - "grad_norm": 0.20368194580078125, - "learning_rate": 0.00019988846906926483, - "loss": 12.2506, + "grad_norm": 0.21312469244003296, + "learning_rate": 0.00019999999665934766, + "loss": 12.2737, "step": 305 }, { "epoch": 0.05010643523825119, - "grad_norm": 0.21352413296699524, - "learning_rate": 0.00019988765416201204, - "loss": 12.288, + "grad_norm": 0.21012352406978607, + "learning_rate": 0.00019999999518946062, + "loss": 12.3127, "step": 306 }, { "epoch": 0.05027018175863763, - "grad_norm": 0.11416824907064438, - "learning_rate": 0.00019988683925475925, - "loss": 12.2772, + "grad_norm": 0.12454304844141006, + "learning_rate": 0.00019999999345232143, + "loss": 12.2986, "step": 307 }, { "epoch": 0.05043392827902407, - "grad_norm": 0.18459837138652802, - "learning_rate": 0.00019988602434750646, - "loss": 12.3078, + "grad_norm": 0.20746897161006927, + "learning_rate": 0.00019999999144793007, + "loss": 12.3273, "step": 308 }, { "epoch": 0.05059767479941051, - "grad_norm": 0.1840011328458786, - "learning_rate": 0.00019988519488833845, - "loss": 12.2627, + "grad_norm": 0.17081008851528168, + "learning_rate": 0.0001999999891762865, + "loss": 12.2905, "step": 309 }, { "epoch": 0.050761421319796954, - "grad_norm": 0.14293089509010315, - "learning_rate": 0.00019988437998108566, - "loss": 12.2646, + "grad_norm": 0.1272573620080948, + "learning_rate": 0.0001999999866373908, + "loss": 12.2907, "step": 310 }, { "epoch": 0.050925167840183395, - "grad_norm": 0.1082555428147316, - "learning_rate": 0.00019988355052191764, - "loss": 12.2839, + "grad_norm": 0.1364651918411255, + "learning_rate": 0.00019999998383124298, + "loss": 12.2988, "step": 311 }, { "epoch": 0.05108891436056984, - "grad_norm": 0.5544154047966003, - "learning_rate": 0.00019988272106274962, - "loss": 12.2913, + "grad_norm": 0.4523601531982422, + "learning_rate": 0.00019999998075784293, + "loss": 12.2987, "step": 312 }, { "epoch": 0.05125266088095628, - "grad_norm": 0.1190885454416275, - "learning_rate": 0.00019988187705166638, - "loss": 12.2789, + "grad_norm": 0.1788499653339386, + "learning_rate": 0.0001999999774171908, + "loss": 12.3047, "step": 313 }, { "epoch": 0.05141640740134272, - "grad_norm": 0.14885587990283966, - "learning_rate": 0.00019988104759249836, - "loss": 12.2705, + "grad_norm": 0.21642278134822845, + "learning_rate": 0.00019999997380928654, + "loss": 12.2901, "step": 314 }, { "epoch": 0.05158015392172916, - "grad_norm": 0.10743315517902374, - "learning_rate": 0.00019988021813333035, - "loss": 12.2572, + "grad_norm": 0.12551110982894897, + "learning_rate": 0.00019999996993413013, + "loss": 12.2777, "step": 315 }, { "epoch": 0.0517439004421156, - "grad_norm": 0.10766734182834625, - "learning_rate": 0.0001998793741222471, - "loss": 12.274, + "grad_norm": 0.08959595859050751, + "learning_rate": 0.00019999996579172166, + "loss": 12.2922, "step": 316 }, { "epoch": 0.051907646962502045, - "grad_norm": 0.14055076241493225, - "learning_rate": 0.00019987851555924863, - "loss": 12.2629, + "grad_norm": 0.17989438772201538, + "learning_rate": 0.00019999996138206103, + "loss": 12.2858, "step": 317 }, { "epoch": 0.05207139348288849, - "grad_norm": 0.13312725722789764, - "learning_rate": 0.00019987767154816538, - "loss": 12.2709, + "grad_norm": 0.11149830371141434, + "learning_rate": 0.00019999995670514834, + "loss": 12.2864, "step": 318 }, { "epoch": 0.05223514000327493, - "grad_norm": 0.3041207790374756, - "learning_rate": 0.00019987682753708214, - "loss": 12.3052, + "grad_norm": 0.37905353307724, + "learning_rate": 0.00019999995176098358, + "loss": 12.3123, "step": 319 }, { "epoch": 0.05239888652366137, - "grad_norm": 0.14904162287712097, - "learning_rate": 0.00019987596897408366, - "loss": 12.2843, + "grad_norm": 0.1828998625278473, + "learning_rate": 0.00019999994654956678, + "loss": 12.2975, "step": 320 }, { "epoch": 0.05256263304404781, - "grad_norm": 0.21144121885299683, - "learning_rate": 0.0001998751104110852, - "loss": 12.2775, + "grad_norm": 0.2369372397661209, + "learning_rate": 0.00019999994107089793, + "loss": 12.2982, "step": 321 }, { "epoch": 0.05272637956443425, - "grad_norm": 0.1579839438199997, - "learning_rate": 0.00019987425184808671, - "loss": 12.2626, + "grad_norm": 0.21192120015621185, + "learning_rate": 0.000199999935324977, + "loss": 12.2798, "step": 322 }, { "epoch": 0.052890126084820695, - "grad_norm": 0.15492835640907288, - "learning_rate": 0.00019987339328508824, - "loss": 12.2489, + "grad_norm": 0.13581956923007965, + "learning_rate": 0.0001999999293118041, + "loss": 12.2714, "step": 323 }, { "epoch": 0.053053872605207136, - "grad_norm": 0.14935417473316193, - "learning_rate": 0.00019987252017017454, - "loss": 12.2721, + "grad_norm": 0.11289360374212265, + "learning_rate": 0.00019999992303137916, + "loss": 12.2879, "step": 324 }, { "epoch": 0.05321761912559358, - "grad_norm": 0.16872552037239075, - "learning_rate": 0.00019987164705526084, - "loss": 12.2601, + "grad_norm": 0.12893497943878174, + "learning_rate": 0.00019999991648370226, + "loss": 12.2844, "step": 325 }, { "epoch": 0.05338136564598002, - "grad_norm": 0.14731909334659576, - "learning_rate": 0.00019987078849226236, - "loss": 12.2728, + "grad_norm": 0.12965922057628632, + "learning_rate": 0.0001999999096687734, + "loss": 12.2866, "step": 326 }, { "epoch": 0.05354511216636647, - "grad_norm": 0.28716838359832764, - "learning_rate": 0.00019986991537734866, - "loss": 12.2734, + "grad_norm": 0.2003125697374344, + "learning_rate": 0.0001999999025865926, + "loss": 12.2908, "step": 327 }, { "epoch": 0.05370885868675291, - "grad_norm": 0.145146906375885, - "learning_rate": 0.00019986902771051973, - "loss": 12.2719, + "grad_norm": 0.1551949828863144, + "learning_rate": 0.00019999989523715984, + "loss": 12.2969, "step": 328 }, { "epoch": 0.05387260520713935, - "grad_norm": 0.12571807205677032, - "learning_rate": 0.00019986815459560603, - "loss": 12.2818, + "grad_norm": 0.11883686482906342, + "learning_rate": 0.00019999988762047516, + "loss": 12.3078, "step": 329 }, { "epoch": 0.05403635172752579, - "grad_norm": 0.1811470240354538, - "learning_rate": 0.00019986728148069233, - "loss": 12.2653, + "grad_norm": 0.14822140336036682, + "learning_rate": 0.00019999987973653863, + "loss": 12.2868, "step": 330 }, { "epoch": 0.054200098247912235, - "grad_norm": 0.18098489940166473, - "learning_rate": 0.00019986637926194817, - "loss": 12.2768, + "grad_norm": 0.16526789963245392, + "learning_rate": 0.00019999987158535022, + "loss": 12.3022, "step": 331 }, { "epoch": 0.054363844768298676, - "grad_norm": 0.1682722270488739, - "learning_rate": 0.00019986550614703447, - "loss": 12.2737, + "grad_norm": 0.22066651284694672, + "learning_rate": 0.00019999986316690996, + "loss": 12.2937, "step": 332 }, { "epoch": 0.05452759128868512, - "grad_norm": 0.18899443745613098, - "learning_rate": 0.0001998646039282903, - "loss": 12.2599, + "grad_norm": 0.23073990643024445, + "learning_rate": 0.00019999985448121787, + "loss": 12.2761, "step": 333 }, { "epoch": 0.05469133780907156, - "grad_norm": 0.10069891810417175, - "learning_rate": 0.00019986371626146138, - "loss": 12.2526, + "grad_norm": 0.1183820590376854, + "learning_rate": 0.000199999845528274, + "loss": 12.2706, "step": 334 }, { "epoch": 0.054855084329458, - "grad_norm": 0.14798307418823242, - "learning_rate": 0.00019986281404271722, - "loss": 12.2724, + "grad_norm": 0.15629184246063232, + "learning_rate": 0.0001999998363080783, + "loss": 12.2955, "step": 335 }, { "epoch": 0.05501883084984444, - "grad_norm": 0.15758149325847626, - "learning_rate": 0.00019986191182397306, - "loss": 12.2839, + "grad_norm": 0.21205270290374756, + "learning_rate": 0.0001999998268206309, + "loss": 12.2933, "step": 336 }, { "epoch": 0.055182577370230884, - "grad_norm": 0.2598915100097656, - "learning_rate": 0.0001998610096052289, - "loss": 12.2593, + "grad_norm": 0.27476778626441956, + "learning_rate": 0.00019999981706593173, + "loss": 12.2768, "step": 337 }, { "epoch": 0.055346323890617326, - "grad_norm": 0.22992730140686035, - "learning_rate": 0.00019986010738648474, - "loss": 12.2799, + "grad_norm": 0.23682667315006256, + "learning_rate": 0.0001999998070439809, + "loss": 12.3049, "step": 338 }, { "epoch": 0.05551007041100377, - "grad_norm": 0.14556151628494263, - "learning_rate": 0.00019985919061582536, - "loss": 12.2907, + "grad_norm": 0.1238231286406517, + "learning_rate": 0.00019999979675477839, + "loss": 12.3007, "step": 339 }, { "epoch": 0.05567381693139021, - "grad_norm": 0.11856741458177567, - "learning_rate": 0.00019985827384516597, - "loss": 12.2717, + "grad_norm": 0.10456238687038422, + "learning_rate": 0.0001999997861983242, + "loss": 12.2874, "step": 340 }, { "epoch": 0.05583756345177665, - "grad_norm": 0.14395655691623688, - "learning_rate": 0.00019985735707450658, - "loss": 12.2652, + "grad_norm": 0.11022137105464935, + "learning_rate": 0.00019999977537461843, + "loss": 12.2897, "step": 341 }, { "epoch": 0.05600130997216309, - "grad_norm": 0.09158793836832047, - "learning_rate": 0.0001998564403038472, - "loss": 12.261, + "grad_norm": 0.11375322192907333, + "learning_rate": 0.00019999976428366104, + "loss": 12.2778, "step": 342 }, { "epoch": 0.056165056492549534, - "grad_norm": 0.16784970462322235, - "learning_rate": 0.0001998555235331878, - "loss": 12.2369, + "grad_norm": 0.23204772174358368, + "learning_rate": 0.00019999975292545212, + "loss": 12.2546, "step": 343 }, { "epoch": 0.056328803012935975, - "grad_norm": 0.24563707411289215, - "learning_rate": 0.0001998545922106132, - "loss": 12.2968, + "grad_norm": 0.2459680140018463, + "learning_rate": 0.00019999974129999165, + "loss": 12.3037, "step": 344 }, { "epoch": 0.05649254953332242, - "grad_norm": 0.18693608045578003, - "learning_rate": 0.0001998536754399538, - "loss": 12.2752, + "grad_norm": 0.15076689422130585, + "learning_rate": 0.0001999997294072797, + "loss": 12.2902, "step": 345 }, { "epoch": 0.05665629605370886, - "grad_norm": 0.16343215107917786, - "learning_rate": 0.0001998527441173792, - "loss": 12.2639, + "grad_norm": 0.18627764284610748, + "learning_rate": 0.00019999971724731625, + "loss": 12.2869, "step": 346 }, { "epoch": 0.0568200425740953, - "grad_norm": 0.17143461108207703, - "learning_rate": 0.00019985181279480457, - "loss": 12.2966, + "grad_norm": 0.17207014560699463, + "learning_rate": 0.0001999997048201014, + "loss": 12.3093, "step": 347 }, { "epoch": 0.05698378909448174, - "grad_norm": 0.20530003309249878, - "learning_rate": 0.00019985088147222996, - "loss": 12.263, + "grad_norm": 0.24697749316692352, + "learning_rate": 0.00019999969212563512, + "loss": 12.2833, "step": 348 }, { "epoch": 0.05714753561486818, - "grad_norm": 0.15935033559799194, - "learning_rate": 0.00019984993559774011, - "loss": 12.2844, + "grad_norm": 0.1407640129327774, + "learning_rate": 0.0001999996791639175, + "loss": 12.3039, "step": 349 }, { "epoch": 0.057311282135254625, - "grad_norm": 0.22338813543319702, - "learning_rate": 0.00019984898972325027, - "loss": 12.288, + "grad_norm": 0.15878401696681976, + "learning_rate": 0.0001999996659349485, + "loss": 12.3057, "step": 350 }, { "epoch": 0.05747502865564107, - "grad_norm": 0.1451302468776703, - "learning_rate": 0.00019984804384876043, - "loss": 12.273, + "grad_norm": 0.4590788781642914, + "learning_rate": 0.00019999965243872828, + "loss": 12.2928, "step": 351 }, { "epoch": 0.05763877517602751, - "grad_norm": 0.21671491861343384, - "learning_rate": 0.00019984709797427058, - "loss": 12.2899, + "grad_norm": 0.1928272247314453, + "learning_rate": 0.00019999963867525675, + "loss": 12.3032, "step": 352 }, { "epoch": 0.05780252169641395, - "grad_norm": 0.1976754367351532, - "learning_rate": 0.0001998461375478655, - "loss": 12.2657, + "grad_norm": 0.22454416751861572, + "learning_rate": 0.000199999624644534, + "loss": 12.2843, "step": 353 }, { "epoch": 0.05796626821680039, - "grad_norm": 0.12198355793952942, - "learning_rate": 0.00019984519167337567, - "loss": 12.2725, + "grad_norm": 0.12228063493967056, + "learning_rate": 0.00019999961034656004, + "loss": 12.2829, "step": 354 }, { "epoch": 0.05813001473718683, - "grad_norm": 0.16143521666526794, - "learning_rate": 0.0001998442312469706, - "loss": 12.2536, + "grad_norm": 0.17021706700325012, + "learning_rate": 0.00019999959578133499, + "loss": 12.2823, "step": 355 }, { "epoch": 0.058293761257573275, - "grad_norm": 0.19851113855838776, - "learning_rate": 0.00019984328537248075, - "loss": 12.2631, + "grad_norm": 0.13477741181850433, + "learning_rate": 0.0001999995809488588, + "loss": 12.2836, "step": 356 }, { "epoch": 0.058457507777959716, - "grad_norm": 0.2403157353401184, - "learning_rate": 0.00019984231039416045, - "loss": 12.2417, + "grad_norm": 0.2285226583480835, + "learning_rate": 0.00019999956584913154, + "loss": 12.2625, "step": 357 }, { "epoch": 0.05862125429834616, - "grad_norm": 0.14061123132705688, - "learning_rate": 0.00019984134996775538, - "loss": 12.2621, + "grad_norm": 0.1142636239528656, + "learning_rate": 0.00019999955048215324, + "loss": 12.2804, "step": 358 }, { "epoch": 0.0587850008187326, - "grad_norm": 0.4004269540309906, - "learning_rate": 0.0001998403895413503, - "loss": 12.2795, + "grad_norm": 0.37811943888664246, + "learning_rate": 0.00019999953484792395, + "loss": 12.2842, "step": 359 }, { "epoch": 0.05894874733911904, - "grad_norm": 0.14407266676425934, - "learning_rate": 0.00019983941456303, - "loss": 12.2585, + "grad_norm": 0.14085888862609863, + "learning_rate": 0.00019999951894644373, + "loss": 12.2805, "step": 360 }, { "epoch": 0.05911249385950548, - "grad_norm": 0.1705065369606018, - "learning_rate": 0.00019983842503279448, - "loss": 12.2578, + "grad_norm": 0.19372887909412384, + "learning_rate": 0.0001999995027777126, + "loss": 12.2667, "step": 361 }, { "epoch": 0.059276240379891924, - "grad_norm": 0.19344770908355713, - "learning_rate": 0.00019983745005447417, - "loss": 12.2567, + "grad_norm": 0.2030063271522522, + "learning_rate": 0.00019999948634173063, + "loss": 12.2733, "step": 362 }, { "epoch": 0.059439986900278366, - "grad_norm": 0.2022704929113388, - "learning_rate": 0.00019983647507615387, - "loss": 12.2933, + "grad_norm": 0.19858768582344055, + "learning_rate": 0.0001999994696384978, + "loss": 12.3034, "step": 363 }, { "epoch": 0.05960373342066481, - "grad_norm": 0.1704675704240799, - "learning_rate": 0.00019983550009783357, - "loss": 12.2529, + "grad_norm": 0.15349508821964264, + "learning_rate": 0.00019999945266801423, + "loss": 12.2819, "step": 364 }, { "epoch": 0.059767479941051256, - "grad_norm": 0.2408781498670578, - "learning_rate": 0.00019983451056759804, - "loss": 12.2733, + "grad_norm": 0.17492233216762543, + "learning_rate": 0.0001999994354302799, + "loss": 12.2909, "step": 365 }, { "epoch": 0.0599312264614377, - "grad_norm": 0.12872779369354248, - "learning_rate": 0.00019983352103736252, - "loss": 12.2573, + "grad_norm": 0.15136563777923584, + "learning_rate": 0.0001999994179252949, + "loss": 12.2796, "step": 366 }, { "epoch": 0.06009497298182414, - "grad_norm": 0.12638714909553528, - "learning_rate": 0.000199832531507127, - "loss": 12.2392, + "grad_norm": 0.17633739113807678, + "learning_rate": 0.00019999940015305928, + "loss": 12.2582, "step": 367 }, { "epoch": 0.06025871950221058, - "grad_norm": 0.19075384736061096, - "learning_rate": 0.00019983152742497623, - "loss": 12.2769, + "grad_norm": 0.2000981867313385, + "learning_rate": 0.0001999993821135731, + "loss": 12.2918, "step": 368 }, { "epoch": 0.06042246602259702, - "grad_norm": 0.3625982999801636, - "learning_rate": 0.0001998305378947407, - "loss": 12.3061, + "grad_norm": 0.3609507381916046, + "learning_rate": 0.00019999936380683632, + "loss": 12.3054, "step": 369 }, { "epoch": 0.060586212542983464, - "grad_norm": 0.1546419858932495, - "learning_rate": 0.00019982953381258994, - "loss": 12.256, + "grad_norm": 0.1492471545934677, + "learning_rate": 0.00019999934523284908, + "loss": 12.2646, "step": 370 }, { "epoch": 0.060749959063369906, - "grad_norm": 0.13882146775722504, - "learning_rate": 0.00019982852973043919, - "loss": 12.268, + "grad_norm": 0.14887075126171112, + "learning_rate": 0.0001999993263916114, + "loss": 12.2835, "step": 371 }, { "epoch": 0.06091370558375635, - "grad_norm": 0.12971815466880798, - "learning_rate": 0.00019982752564828843, - "loss": 12.276, + "grad_norm": 0.1308159977197647, + "learning_rate": 0.00019999930728312334, + "loss": 12.287, "step": 372 }, { "epoch": 0.06107745210414279, - "grad_norm": 0.15311676263809204, - "learning_rate": 0.00019982650701422244, - "loss": 12.2588, + "grad_norm": 0.14279422163963318, + "learning_rate": 0.00019999928790738494, + "loss": 12.2745, "step": 373 }, { "epoch": 0.06124119862452923, - "grad_norm": 0.18194103240966797, - "learning_rate": 0.00019982550293207169, - "loss": 12.281, + "grad_norm": 0.13700923323631287, + "learning_rate": 0.00019999926826439623, + "loss": 12.3028, "step": 374 }, { "epoch": 0.06140494514491567, - "grad_norm": 0.24688465893268585, - "learning_rate": 0.0001998244842980057, - "loss": 12.2534, + "grad_norm": 0.17791202664375305, + "learning_rate": 0.0001999992483541573, + "loss": 12.2664, "step": 375 }, { "epoch": 0.061568691665302114, - "grad_norm": 0.17040275037288666, - "learning_rate": 0.00019982346566393971, - "loss": 12.2575, + "grad_norm": 0.28744640946388245, + "learning_rate": 0.00019999922817666817, + "loss": 12.2859, "step": 376 }, { "epoch": 0.061732438185688555, - "grad_norm": 0.16558203101158142, - "learning_rate": 0.00019982244702987373, - "loss": 12.2771, + "grad_norm": 0.1286764144897461, + "learning_rate": 0.00019999920773192896, + "loss": 12.2844, "step": 377 }, { "epoch": 0.061896184706075, - "grad_norm": 0.1640801578760147, - "learning_rate": 0.00019982141384389251, - "loss": 12.2924, + "grad_norm": 0.16422635316848755, + "learning_rate": 0.00019999918701993965, + "loss": 12.2979, "step": 378 }, { "epoch": 0.06205993122646144, - "grad_norm": 0.1380811631679535, - "learning_rate": 0.00019982039520982653, - "loss": 12.2528, + "grad_norm": 0.18279294669628143, + "learning_rate": 0.00019999916604070033, + "loss": 12.2773, "step": 379 }, { "epoch": 0.06222367774684788, - "grad_norm": 0.14711108803749084, - "learning_rate": 0.00019981936202384531, - "loss": 12.2581, + "grad_norm": 0.11040064692497253, + "learning_rate": 0.000199999144794211, + "loss": 12.2776, "step": 380 }, { "epoch": 0.06238742426723432, - "grad_norm": 0.10983294993638992, - "learning_rate": 0.0001998183288378641, - "loss": 12.2904, + "grad_norm": 0.11583856493234634, + "learning_rate": 0.00019999912328047184, + "loss": 12.2984, "step": 381 }, { "epoch": 0.06255117078762076, - "grad_norm": 0.1386169046163559, - "learning_rate": 0.0001998172956518829, - "loss": 12.2774, + "grad_norm": 0.15946663916110992, + "learning_rate": 0.0001999991014994828, + "loss": 12.2942, "step": 382 }, { "epoch": 0.0627149173080072, - "grad_norm": 0.22444948554039001, - "learning_rate": 0.00019981626246590167, - "loss": 12.2621, + "grad_norm": 0.1915806531906128, + "learning_rate": 0.000199999079451244, + "loss": 12.2742, "step": 383 }, { "epoch": 0.06287866382839365, - "grad_norm": 0.20805659890174866, - "learning_rate": 0.00019981521472800523, - "loss": 12.2589, + "grad_norm": 0.19050228595733643, + "learning_rate": 0.00019999905713575543, + "loss": 12.2752, "step": 384 }, { "epoch": 0.0630424103487801, - "grad_norm": 0.13287568092346191, - "learning_rate": 0.00019981415243819356, - "loss": 12.2598, + "grad_norm": 0.1742391288280487, + "learning_rate": 0.00019999903455301718, + "loss": 12.2773, "step": 385 }, { "epoch": 0.06320615686916653, - "grad_norm": 0.11777439713478088, - "learning_rate": 0.00019981310470029712, - "loss": 12.2512, + "grad_norm": 0.1563049852848053, + "learning_rate": 0.00019999901170302935, + "loss": 12.2585, "step": 386 }, { "epoch": 0.06336990338955298, - "grad_norm": 0.15062777698040009, - "learning_rate": 0.00019981205696240067, - "loss": 12.2682, + "grad_norm": 0.1489725112915039, + "learning_rate": 0.00019999898858579195, + "loss": 12.2925, "step": 387 }, { "epoch": 0.06353364990993941, - "grad_norm": 0.2239951193332672, - "learning_rate": 0.00019981100922450423, - "loss": 12.2345, + "grad_norm": 0.17745822668075562, + "learning_rate": 0.0001999989652013051, + "loss": 12.2546, "step": 388 }, { "epoch": 0.06369739643032586, - "grad_norm": 0.3479856252670288, - "learning_rate": 0.00019980994693469256, - "loss": 12.2865, + "grad_norm": 0.29051586985588074, + "learning_rate": 0.00019999894154956879, + "loss": 12.2911, "step": 389 }, { "epoch": 0.0638611429507123, - "grad_norm": 0.16244293749332428, - "learning_rate": 0.0001998088846448809, - "loss": 12.2478, + "grad_norm": 0.12695226073265076, + "learning_rate": 0.00019999891763058312, + "loss": 12.2646, "step": 390 }, { "epoch": 0.06402488947109874, - "grad_norm": 0.14563092589378357, - "learning_rate": 0.00019980782235506922, - "loss": 12.2693, + "grad_norm": 0.153972327709198, + "learning_rate": 0.00019999889344434819, + "loss": 12.2832, "step": 391 }, { "epoch": 0.06418863599148518, - "grad_norm": 0.11829731613397598, - "learning_rate": 0.00019980677461717278, - "loss": 12.2523, + "grad_norm": 0.13381552696228027, + "learning_rate": 0.00019999886899086397, + "loss": 12.261, "step": 392 }, { "epoch": 0.06435238251187163, - "grad_norm": 0.18642814457416534, - "learning_rate": 0.00019980569777544588, - "loss": 12.2565, + "grad_norm": 0.19647999107837677, + "learning_rate": 0.00019999884427013062, + "loss": 12.2679, "step": 393 }, { "epoch": 0.06451612903225806, - "grad_norm": 0.1708955317735672, - "learning_rate": 0.00019980462093371898, - "loss": 12.2734, + "grad_norm": 0.1712377965450287, + "learning_rate": 0.00019999881928214818, + "loss": 12.2874, "step": 394 }, { "epoch": 0.06467987555264451, - "grad_norm": 0.1829138845205307, - "learning_rate": 0.00019980354409199208, - "loss": 12.2634, + "grad_norm": 0.1798669546842575, + "learning_rate": 0.00019999879402691668, + "loss": 12.2939, "step": 395 }, { "epoch": 0.06484362207303095, - "grad_norm": 0.22328996658325195, - "learning_rate": 0.0001998024818021804, - "loss": 12.2646, + "grad_norm": 0.2037380337715149, + "learning_rate": 0.00019999876850443623, + "loss": 12.2857, "step": 396 }, { "epoch": 0.0650073685934174, - "grad_norm": 0.30070918798446655, - "learning_rate": 0.00019980139040853828, - "loss": 12.2608, + "grad_norm": 0.24441306293010712, + "learning_rate": 0.00019999874271470685, + "loss": 12.2994, "step": 397 }, { "epoch": 0.06517111511380383, - "grad_norm": 0.21575552225112915, - "learning_rate": 0.00019980031356681138, - "loss": 12.2592, + "grad_norm": 0.16837072372436523, + "learning_rate": 0.00019999871665772866, + "loss": 12.2756, "step": 398 }, { "epoch": 0.06533486163419028, - "grad_norm": 0.1523241251707077, - "learning_rate": 0.00019979922217316926, - "loss": 12.2764, + "grad_norm": 0.132755309343338, + "learning_rate": 0.00019999869033350174, + "loss": 12.2839, "step": 399 }, { "epoch": 0.06549860815457671, - "grad_norm": 0.13779282569885254, - "learning_rate": 0.00019979813077952713, - "loss": 12.282, - "step": 400 - }, - { - "epoch": 0.06549860815457671, - "eval_loss": 12.26255989074707, - "eval_runtime": 7.4384, - "eval_samples_per_second": 33.072, - "eval_steps_per_second": 16.536, + "grad_norm": 0.16452300548553467, + "learning_rate": 0.00019999866374202608, + "loss": 12.2943, "step": 400 }, { "epoch": 0.06566235467496316, - "grad_norm": 0.29484638571739197, - "learning_rate": 0.000199797039385885, - "loss": 12.2572, + "grad_norm": 0.2534669041633606, + "learning_rate": 0.00019999863688330183, + "loss": 12.2687, "step": 401 }, { "epoch": 0.0658261011953496, - "grad_norm": 0.19167940318584442, - "learning_rate": 0.00019979594799224287, - "loss": 12.2659, + "grad_norm": 0.19958250224590302, + "learning_rate": 0.00019999860975732903, + "loss": 12.2845, "step": 402 }, { "epoch": 0.06598984771573604, - "grad_norm": 0.20377042889595032, - "learning_rate": 0.00019979484204668552, - "loss": 12.2456, + "grad_norm": 0.13449449837207794, + "learning_rate": 0.00019999858236410776, + "loss": 12.2794, "step": 403 }, { "epoch": 0.06615359423612248, - "grad_norm": 0.1689855009317398, - "learning_rate": 0.0001997937506530434, - "loss": 12.2753, + "grad_norm": 0.17238451540470123, + "learning_rate": 0.0001999985547036381, + "loss": 12.3065, "step": 404 }, { "epoch": 0.06631734075650893, - "grad_norm": 0.22132129967212677, - "learning_rate": 0.00019979264470748603, - "loss": 12.2987, + "grad_norm": 0.15234483778476715, + "learning_rate": 0.0001999985267759201, + "loss": 12.3007, "step": 405 }, { "epoch": 0.06648108727689536, - "grad_norm": 0.15825076401233673, - "learning_rate": 0.00019979153876192868, - "loss": 12.256, + "grad_norm": 0.17298907041549683, + "learning_rate": 0.00019999849858095382, + "loss": 12.2824, "step": 406 }, { "epoch": 0.06664483379728181, - "grad_norm": 0.19967804849147797, - "learning_rate": 0.0001997904182644561, - "loss": 12.2423, + "grad_norm": 0.1439378559589386, + "learning_rate": 0.00019999847011873938, + "loss": 12.2567, "step": 407 }, { "epoch": 0.06680858031766825, - "grad_norm": 0.16729998588562012, - "learning_rate": 0.00019978931231889874, - "loss": 12.2698, + "grad_norm": 0.19181743264198303, + "learning_rate": 0.00019999844138927683, + "loss": 12.2789, "step": 408 }, { "epoch": 0.0669723268380547, - "grad_norm": 0.2205231636762619, - "learning_rate": 0.00019978819182142615, - "loss": 12.2751, + "grad_norm": 0.2083439975976944, + "learning_rate": 0.00019999841239256628, + "loss": 12.2807, "step": 409 }, { "epoch": 0.06713607335844113, - "grad_norm": 0.30310294032096863, - "learning_rate": 0.00019978707132395357, - "loss": 12.2969, + "grad_norm": 0.3232664167881012, + "learning_rate": 0.00019999838312860776, + "loss": 12.3084, "step": 410 }, { "epoch": 0.06729981987882758, - "grad_norm": 0.1694662868976593, - "learning_rate": 0.00019978595082648098, - "loss": 12.2553, + "grad_norm": 0.14184612035751343, + "learning_rate": 0.0001999983535974014, + "loss": 12.272, "step": 411 }, { "epoch": 0.06746356639921401, - "grad_norm": 0.21690568327903748, - "learning_rate": 0.0001997848303290084, - "loss": 12.2774, + "grad_norm": 0.30172014236450195, + "learning_rate": 0.00019999832379894721, + "loss": 12.2954, "step": 412 }, { "epoch": 0.06762731291960046, - "grad_norm": 0.12437091767787933, - "learning_rate": 0.0001997836952796206, - "loss": 12.254, + "grad_norm": 0.16100570559501648, + "learning_rate": 0.00019999829373324533, + "loss": 12.2742, "step": 413 }, { "epoch": 0.0677910594399869, - "grad_norm": 0.21863418817520142, - "learning_rate": 0.000199782574782148, - "loss": 12.3004, + "grad_norm": 0.15702582895755768, + "learning_rate": 0.00019999826340029583, + "loss": 12.3039, "step": 414 }, { "epoch": 0.06795480596037334, - "grad_norm": 0.14826977252960205, - "learning_rate": 0.0001997814397327602, - "loss": 12.2813, + "grad_norm": 0.1590198427438736, + "learning_rate": 0.00019999823280009878, + "loss": 12.2853, "step": 415 }, { "epoch": 0.06811855248075978, - "grad_norm": 0.2234913855791092, - "learning_rate": 0.00019978030468337238, - "loss": 12.277, + "grad_norm": 0.1716587245464325, + "learning_rate": 0.00019999820193265425, + "loss": 12.2875, "step": 416 }, { "epoch": 0.06828229900114623, - "grad_norm": 0.3348887264728546, - "learning_rate": 0.00019977915508206934, - "loss": 12.2771, + "grad_norm": 0.27681276202201843, + "learning_rate": 0.00019999817079796234, + "loss": 12.2874, "step": 417 }, { "epoch": 0.06844604552153266, - "grad_norm": 0.14655959606170654, - "learning_rate": 0.00019977802003268152, - "loss": 12.2347, + "grad_norm": 0.13572442531585693, + "learning_rate": 0.00019999813939602313, + "loss": 12.2638, "step": 418 }, { "epoch": 0.06860979204191911, - "grad_norm": 0.12391780316829681, - "learning_rate": 0.0001997768849832937, - "loss": 12.2504, + "grad_norm": 0.1282089799642563, + "learning_rate": 0.0001999981077268367, + "loss": 12.2649, "step": 419 }, { "epoch": 0.06877353856230554, - "grad_norm": 0.23915357887744904, - "learning_rate": 0.00019977572083007544, - "loss": 12.2661, + "grad_norm": 0.17424018681049347, + "learning_rate": 0.00019999807579040315, + "loss": 12.2719, "step": 420 }, { "epoch": 0.068937285082692, - "grad_norm": 0.10997791588306427, - "learning_rate": 0.00019977458578068763, - "loss": 12.2542, + "grad_norm": 0.13602033257484436, + "learning_rate": 0.00019999804358672253, + "loss": 12.2614, "step": 421 }, { "epoch": 0.06910103160307843, - "grad_norm": 0.1418733149766922, - "learning_rate": 0.00019977342162746936, - "loss": 12.2543, + "grad_norm": 0.14215390384197235, + "learning_rate": 0.00019999801111579498, + "loss": 12.2676, "step": 422 }, { "epoch": 0.06926477812346488, - "grad_norm": 0.3322834372520447, - "learning_rate": 0.00019977227202616632, - "loss": 12.3091, + "grad_norm": 0.3541161119937897, + "learning_rate": 0.00019999797837762053, + "loss": 12.3134, "step": 423 }, { "epoch": 0.06942852464385132, - "grad_norm": 0.20287387073040009, - "learning_rate": 0.00019977110787294805, - "loss": 12.2878, + "grad_norm": 0.2593739628791809, + "learning_rate": 0.00019999794537219932, + "loss": 12.2915, "step": 424 }, { "epoch": 0.06959227116423776, - "grad_norm": 0.19083108007907867, - "learning_rate": 0.00019976994371972978, - "loss": 12.24, + "grad_norm": 0.20327959954738617, + "learning_rate": 0.00019999791209953136, + "loss": 12.276, "step": 425 }, { "epoch": 0.06975601768462421, - "grad_norm": 0.24128662049770355, - "learning_rate": 0.0001997687795665115, - "loss": 12.2481, + "grad_norm": 0.18611687421798706, + "learning_rate": 0.00019999787855961684, + "loss": 12.265, "step": 426 }, { "epoch": 0.06991976420501064, - "grad_norm": 0.12994429469108582, - "learning_rate": 0.00019976761541329324, - "loss": 12.2503, + "grad_norm": 0.13483114540576935, + "learning_rate": 0.00019999784475245577, + "loss": 12.2595, "step": 427 }, { "epoch": 0.07008351072539709, - "grad_norm": 0.2344224601984024, - "learning_rate": 0.00019976643670815974, - "loss": 12.2626, + "grad_norm": 0.3187796473503113, + "learning_rate": 0.00019999781067804828, + "loss": 12.2764, "step": 428 }, { "epoch": 0.07024725724578353, - "grad_norm": 0.21825483441352844, - "learning_rate": 0.00019976525800302625, - "loss": 12.2574, + "grad_norm": 0.3139594793319702, + "learning_rate": 0.00019999777633639444, + "loss": 12.2634, "step": 429 }, { "epoch": 0.07041100376616997, - "grad_norm": 0.2866182327270508, - "learning_rate": 0.00019976409384980798, - "loss": 12.2815, + "grad_norm": 0.1972879022359848, + "learning_rate": 0.00019999774172749438, + "loss": 12.2803, "step": 430 }, { "epoch": 0.07057475028655641, - "grad_norm": 0.15483808517456055, - "learning_rate": 0.00019976291514467448, - "loss": 12.2937, + "grad_norm": 0.1937691867351532, + "learning_rate": 0.00019999770685134817, + "loss": 12.3016, "step": 431 }, { "epoch": 0.07073849680694286, - "grad_norm": 0.2085985690355301, - "learning_rate": 0.00019976172188762575, - "loss": 12.2691, + "grad_norm": 0.195434108376503, + "learning_rate": 0.00019999767170795588, + "loss": 12.2838, "step": 432 }, { "epoch": 0.07090224332732929, - "grad_norm": 0.1951405107975006, - "learning_rate": 0.00019976054318249226, - "loss": 12.2768, + "grad_norm": 0.19516132771968842, + "learning_rate": 0.00019999763629731761, + "loss": 12.2821, "step": 433 }, { "epoch": 0.07106598984771574, - "grad_norm": 0.14644214510917664, - "learning_rate": 0.00019975934992544353, - "loss": 12.267, + "grad_norm": 0.2725695073604584, + "learning_rate": 0.00019999760061943346, + "loss": 12.2809, "step": 434 }, { "epoch": 0.07122973636810218, - "grad_norm": 0.19653159379959106, - "learning_rate": 0.0001997581566683948, - "loss": 12.2755, + "grad_norm": 0.27861475944519043, + "learning_rate": 0.00019999756467430356, + "loss": 12.2857, "step": 435 }, { "epoch": 0.07139348288848862, - "grad_norm": 0.2027158886194229, - "learning_rate": 0.00019975696341134608, - "loss": 12.2935, + "grad_norm": 0.19172325730323792, + "learning_rate": 0.00019999752846192796, + "loss": 12.2979, "step": 436 }, { "epoch": 0.07155722940887506, - "grad_norm": 0.14168038964271545, - "learning_rate": 0.00019975575560238212, - "loss": 12.2662, + "grad_norm": 0.21028302609920502, + "learning_rate": 0.0001999974919823068, + "loss": 12.2816, "step": 437 }, { "epoch": 0.07172097592926151, - "grad_norm": 0.21052201092243195, - "learning_rate": 0.00019975454779341817, - "loss": 12.2358, + "grad_norm": 0.2027662843465805, + "learning_rate": 0.00019999745523544013, + "loss": 12.2401, "step": 438 }, { "epoch": 0.07188472244964794, - "grad_norm": 0.19395172595977783, - "learning_rate": 0.00019975335453636944, - "loss": 12.26, + "grad_norm": 0.20287398993968964, + "learning_rate": 0.00019999741822132808, + "loss": 12.2591, "step": 439 }, { "epoch": 0.07204846897003439, - "grad_norm": 0.19278693199157715, - "learning_rate": 0.0001997521467274055, - "loss": 12.258, + "grad_norm": 0.22502224147319794, + "learning_rate": 0.00019999738093997075, + "loss": 12.2593, "step": 440 }, { "epoch": 0.07221221549042083, - "grad_norm": 0.22367942333221436, - "learning_rate": 0.00019975093891844153, - "loss": 12.2787, + "grad_norm": 0.23881785571575165, + "learning_rate": 0.00019999734339136823, + "loss": 12.2821, "step": 441 }, { "epoch": 0.07237596201080727, - "grad_norm": 0.22102567553520203, - "learning_rate": 0.00019974973110947758, - "loss": 12.2644, + "grad_norm": 0.1868753433227539, + "learning_rate": 0.00019999730557552058, + "loss": 12.2635, "step": 442 }, { "epoch": 0.07253970853119371, - "grad_norm": 0.16479933261871338, - "learning_rate": 0.0001997485087485984, - "loss": 12.2687, + "grad_norm": 0.19476260244846344, + "learning_rate": 0.00019999726749242796, + "loss": 12.2884, "step": 443 }, { "epoch": 0.07270345505158016, - "grad_norm": 0.1651541292667389, - "learning_rate": 0.00019974730093963444, - "loss": 12.2566, + "grad_norm": 0.15796904265880585, + "learning_rate": 0.00019999722914209046, + "loss": 12.2677, "step": 444 }, { "epoch": 0.07286720157196659, - "grad_norm": 0.12695051729679108, - "learning_rate": 0.00019974606402684003, - "loss": 12.2622, + "grad_norm": 0.3154045045375824, + "learning_rate": 0.00019999719052450821, + "loss": 12.2752, "step": 445 }, { "epoch": 0.07303094809235304, - "grad_norm": 0.1750200092792511, - "learning_rate": 0.00019974484166596085, - "loss": 12.2779, + "grad_norm": 0.25553789734840393, + "learning_rate": 0.00019999715163968122, + "loss": 12.2866, "step": 446 }, { "epoch": 0.07319469461273947, - "grad_norm": 0.1328081488609314, - "learning_rate": 0.00019974360475316644, - "loss": 12.2475, + "grad_norm": 0.28748029470443726, + "learning_rate": 0.00019999711248760968, + "loss": 12.25, "step": 447 }, { "epoch": 0.07335844113312592, - "grad_norm": 0.16384023427963257, - "learning_rate": 0.00019974238239228725, - "loss": 12.3052, + "grad_norm": 0.18394634127616882, + "learning_rate": 0.00019999707306829367, + "loss": 12.3081, "step": 448 }, { "epoch": 0.07352218765351236, - "grad_norm": 0.26911428570747375, - "learning_rate": 0.00019974116003140807, - "loss": 12.2702, + "grad_norm": 0.19124747812747955, + "learning_rate": 0.00019999703338173327, + "loss": 12.2823, "step": 449 }, { "epoch": 0.0736859341738988, - "grad_norm": 0.16559433937072754, - "learning_rate": 0.00019973992311861366, - "loss": 12.2486, + "grad_norm": 0.19708351790905, + "learning_rate": 0.00019999699342792862, + "loss": 12.2666, "step": 450 }, { "epoch": 0.07384968069428524, - "grad_norm": 0.13143081963062286, - "learning_rate": 0.00019973867165390402, - "loss": 12.2532, + "grad_norm": 0.12283062934875488, + "learning_rate": 0.00019999695320687983, + "loss": 12.2661, "step": 451 }, { "epoch": 0.07401342721467169, - "grad_norm": 0.12418577075004578, - "learning_rate": 0.0001997374347411096, - "loss": 12.2817, + "grad_norm": 0.13629688322544098, + "learning_rate": 0.00019999691271858702, + "loss": 12.2952, "step": 452 }, { "epoch": 0.07417717373505812, - "grad_norm": 0.17400050163269043, - "learning_rate": 0.00019973618327639997, - "loss": 12.2573, + "grad_norm": 0.176702082157135, + "learning_rate": 0.00019999687196305019, + "loss": 12.266, "step": 453 }, { "epoch": 0.07434092025544457, - "grad_norm": 0.21183834969997406, - "learning_rate": 0.00019973494636360556, - "loss": 12.2938, + "grad_norm": 0.21978086233139038, + "learning_rate": 0.0001999968309402696, + "loss": 12.2909, "step": 454 }, { "epoch": 0.07450466677583101, - "grad_norm": 0.14846572279930115, - "learning_rate": 0.00019973369489889592, - "loss": 12.2649, + "grad_norm": 0.18183192610740662, + "learning_rate": 0.00019999678965024526, + "loss": 12.2789, "step": 455 }, { "epoch": 0.07466841329621746, - "grad_norm": 0.14805927872657776, - "learning_rate": 0.00019973242888227105, - "loss": 12.2494, + "grad_norm": 0.2290555238723755, + "learning_rate": 0.0001999967480929773, + "loss": 12.257, "step": 456 }, { "epoch": 0.07483215981660389, - "grad_norm": 0.174244225025177, - "learning_rate": 0.0001997311774175614, - "loss": 12.249, + "grad_norm": 0.22575747966766357, + "learning_rate": 0.00019999670626846589, + "loss": 12.2811, "step": 457 }, { "epoch": 0.07499590633699034, - "grad_norm": 0.21895098686218262, - "learning_rate": 0.00019972991140093654, - "loss": 12.2541, + "grad_norm": 0.2953977882862091, + "learning_rate": 0.00019999666417671103, + "loss": 12.2646, "step": 458 }, { "epoch": 0.07515965285737677, - "grad_norm": 0.15302148461341858, - "learning_rate": 0.00019972864538431168, - "loss": 12.2487, + "grad_norm": 0.18846358358860016, + "learning_rate": 0.0001999966218177129, + "loss": 12.2522, "step": 459 }, { "epoch": 0.07532339937776322, - "grad_norm": 0.16815854609012604, - "learning_rate": 0.00019972739391960204, - "loss": 12.2604, + "grad_norm": 0.3193408250808716, + "learning_rate": 0.00019999657919147167, + "loss": 12.2645, "step": 460 }, { "epoch": 0.07548714589814967, - "grad_norm": 0.16916708648204803, - "learning_rate": 0.00019972612790297717, - "loss": 12.2466, + "grad_norm": 0.27576708793640137, + "learning_rate": 0.00019999653629798733, + "loss": 12.2474, "step": 461 }, { "epoch": 0.0756508924185361, - "grad_norm": 0.22577545046806335, - "learning_rate": 0.00019972484733443707, - "loss": 12.2672, + "grad_norm": 0.19615419209003448, + "learning_rate": 0.00019999649313726009, + "loss": 12.2696, "step": 462 }, { "epoch": 0.07581463893892255, - "grad_norm": 0.3655492961406708, - "learning_rate": 0.00019972356676589698, - "loss": 12.264, + "grad_norm": 0.1791532337665558, + "learning_rate": 0.00019999644970929002, + "loss": 12.2756, "step": 463 }, { "epoch": 0.07597838545930899, - "grad_norm": 0.19828443229198456, - "learning_rate": 0.0001997223007492721, - "loss": 12.2637, + "grad_norm": 0.3186524510383606, + "learning_rate": 0.00019999640601407723, + "loss": 12.2628, "step": 464 }, { "epoch": 0.07614213197969544, - "grad_norm": 0.17368406057357788, - "learning_rate": 0.000199721020180732, + "grad_norm": 0.19734470546245575, + "learning_rate": 0.0001999963620516219, "loss": 12.2741, "step": 465 }, { "epoch": 0.07630587850008187, - "grad_norm": 0.18248553574085236, - "learning_rate": 0.00019971973961219192, - "loss": 12.2579, + "grad_norm": 0.28205740451812744, + "learning_rate": 0.00019999631782192405, + "loss": 12.2534, "step": 466 }, { "epoch": 0.07646962502046832, - "grad_norm": 0.2927585542201996, - "learning_rate": 0.0001997184444917366, - "loss": 12.2693, + "grad_norm": 0.2390183061361313, + "learning_rate": 0.0001999962733249839, + "loss": 12.2746, "step": 467 }, { "epoch": 0.07663337154085476, - "grad_norm": 0.2267952412366867, - "learning_rate": 0.0001997171639231965, - "loss": 12.2665, + "grad_norm": 0.19656427204608917, + "learning_rate": 0.0001999962285608015, + "loss": 12.2728, "step": 468 }, { "epoch": 0.0767971180612412, - "grad_norm": 0.18384003639221191, - "learning_rate": 0.00019971585425082594, - "loss": 12.2695, + "grad_norm": 0.14260216057300568, + "learning_rate": 0.00019999618352937697, + "loss": 12.2769, "step": 469 }, { "epoch": 0.07696086458162764, - "grad_norm": 0.19173267483711243, - "learning_rate": 0.00019971457368228585, - "loss": 12.2794, + "grad_norm": 0.1668563187122345, + "learning_rate": 0.0001999961382307105, + "loss": 12.2899, "step": 470 }, { "epoch": 0.07712461110201409, - "grad_norm": 0.16286374628543854, - "learning_rate": 0.00019971327856183052, - "loss": 12.2504, + "grad_norm": 0.186238631606102, + "learning_rate": 0.0001999960926648021, + "loss": 12.2611, "step": 471 }, { "epoch": 0.07728835762240052, - "grad_norm": 0.15662431716918945, - "learning_rate": 0.00019971196888945997, - "loss": 12.28, + "grad_norm": 0.16535009443759918, + "learning_rate": 0.00019999604683165197, + "loss": 12.2853, "step": 472 }, { "epoch": 0.07745210414278697, - "grad_norm": 0.26529473066329956, - "learning_rate": 0.00019971065921708941, - "loss": 12.3013, + "grad_norm": 0.23412054777145386, + "learning_rate": 0.00019999600073126024, + "loss": 12.3178, "step": 473 }, { "epoch": 0.0776158506631734, - "grad_norm": 0.2134474515914917, - "learning_rate": 0.0001997093640966341, - "loss": 12.2591, + "grad_norm": 0.21077093482017517, + "learning_rate": 0.00019999595436362698, + "loss": 12.2663, "step": 474 }, { "epoch": 0.07777959718355985, - "grad_norm": 0.16773010790348053, - "learning_rate": 0.00019970805442426354, - "loss": 12.2776, + "grad_norm": 0.1960388571023941, + "learning_rate": 0.00019999590772875236, + "loss": 12.2773, "step": 475 }, { "epoch": 0.07794334370394629, - "grad_norm": 0.1953994333744049, - "learning_rate": 0.00019970674475189298, - "loss": 12.2714, + "grad_norm": 0.16811984777450562, + "learning_rate": 0.0001999958608266365, + "loss": 12.2815, "step": 476 }, { "epoch": 0.07810709022433274, - "grad_norm": 0.23002862930297852, - "learning_rate": 0.0001997054205276072, - "loss": 12.289, + "grad_norm": 0.30715492367744446, + "learning_rate": 0.00019999581365727948, + "loss": 12.2967, "step": 477 }, { "epoch": 0.07827083674471917, - "grad_norm": 0.17535601556301117, - "learning_rate": 0.00019970411085523665, - "loss": 12.2659, + "grad_norm": 0.17825697362422943, + "learning_rate": 0.00019999576622068148, + "loss": 12.2716, "step": 478 }, { "epoch": 0.07843458326510562, - "grad_norm": 0.16866976022720337, - "learning_rate": 0.00019970278663095087, - "loss": 12.2542, + "grad_norm": 0.18646034598350525, + "learning_rate": 0.0001999957185168426, + "loss": 12.2571, "step": 479 }, { "epoch": 0.07859832978549205, - "grad_norm": 0.2173595428466797, - "learning_rate": 0.0001997014624066651, - "loss": 12.2807, + "grad_norm": 0.2670734226703644, + "learning_rate": 0.00019999567054576297, + "loss": 12.2827, "step": 480 }, { "epoch": 0.0787620763058785, - "grad_norm": 0.1581399142742157, - "learning_rate": 0.00019970012363046408, - "loss": 12.25, + "grad_norm": 0.17917779088020325, + "learning_rate": 0.0001999956223074427, + "loss": 12.2706, "step": 481 }, { "epoch": 0.07892582282626494, - "grad_norm": 0.41668128967285156, - "learning_rate": 0.0001996987994061783, - "loss": 12.3047, + "grad_norm": 0.3791571259498596, + "learning_rate": 0.000199995573801882, + "loss": 12.3039, "step": 482 }, { "epoch": 0.07908956934665139, - "grad_norm": 0.1310993880033493, - "learning_rate": 0.00019969746062997729, - "loss": 12.2899, + "grad_norm": 0.2250203788280487, + "learning_rate": 0.00019999552502908088, + "loss": 12.2971, "step": 483 }, { "epoch": 0.07925331586703782, - "grad_norm": 0.17166323959827423, - "learning_rate": 0.0001996961364056915, - "loss": 12.2605, + "grad_norm": 0.21701763570308685, + "learning_rate": 0.00019999547598903954, + "loss": 12.2741, "step": 484 }, { "epoch": 0.07941706238742427, - "grad_norm": 0.18003998696804047, - "learning_rate": 0.00019969478307757527, - "loss": 12.2674, + "grad_norm": 0.22662462294101715, + "learning_rate": 0.00019999542668175813, + "loss": 12.2691, "step": 485 }, { "epoch": 0.0795808089078107, - "grad_norm": 0.1621619164943695, - "learning_rate": 0.00019969344430137426, - "loss": 12.2738, + "grad_norm": 0.1721991002559662, + "learning_rate": 0.00019999537710723673, + "loss": 12.2777, "step": 486 }, { "epoch": 0.07974455542819715, - "grad_norm": 0.17165407538414001, - "learning_rate": 0.00019969210552517325, - "loss": 12.2644, + "grad_norm": 0.2554982900619507, + "learning_rate": 0.0001999953272654755, + "loss": 12.272, "step": 487 }, { "epoch": 0.07990830194858359, - "grad_norm": 0.2042619287967682, - "learning_rate": 0.000199690752197057, - "loss": 12.2444, + "grad_norm": 0.16333116590976715, + "learning_rate": 0.00019999527715647454, + "loss": 12.2473, "step": 488 }, { "epoch": 0.08007204846897004, - "grad_norm": 0.17771846055984497, - "learning_rate": 0.00019968939886894077, - "loss": 12.2738, + "grad_norm": 0.2844250798225403, + "learning_rate": 0.00019999522678023404, + "loss": 12.2779, "step": 489 }, { "epoch": 0.08023579498935647, - "grad_norm": 0.1405569314956665, - "learning_rate": 0.00019968804554082453, - "loss": 12.2692, + "grad_norm": 0.15430493652820587, + "learning_rate": 0.00019999517613675412, + "loss": 12.2759, "step": 490 }, { "epoch": 0.08039954150974292, - "grad_norm": 0.1747368574142456, - "learning_rate": 0.0001996866922127083, - "loss": 12.2532, + "grad_norm": 0.14520032703876495, + "learning_rate": 0.0001999951252260349, + "loss": 12.2643, "step": 491 }, { "epoch": 0.08056328803012935, - "grad_norm": 0.21995700895786285, - "learning_rate": 0.00019968532433267683, - "loss": 12.2536, + "grad_norm": 0.20975728332996368, + "learning_rate": 0.0001999950740480765, + "loss": 12.2562, "step": 492 }, { "epoch": 0.0807270345505158, - "grad_norm": 0.2268180549144745, - "learning_rate": 0.0001996839710045606, - "loss": 12.2637, + "grad_norm": 0.24926480650901794, + "learning_rate": 0.00019999502260287905, + "loss": 12.2656, "step": 493 }, { "epoch": 0.08089078107090224, - "grad_norm": 0.18887090682983398, - "learning_rate": 0.00019968260312452912, - "loss": 12.2552, + "grad_norm": 0.1939597725868225, + "learning_rate": 0.00019999497089044277, + "loss": 12.2797, "step": 494 }, { "epoch": 0.08105452759128869, - "grad_norm": 0.14496204257011414, - "learning_rate": 0.00019968123524449766, - "loss": 12.2786, + "grad_norm": 0.14060169458389282, + "learning_rate": 0.0001999949189107677, + "loss": 12.2859, "step": 495 }, { "epoch": 0.08121827411167512, - "grad_norm": 0.2236098051071167, - "learning_rate": 0.00019967985281255096, - "loss": 12.2744, + "grad_norm": 0.21786072850227356, + "learning_rate": 0.00019999486666385404, + "loss": 12.2858, "step": 496 }, { "epoch": 0.08138202063206157, - "grad_norm": 0.15895530581474304, - "learning_rate": 0.0001996784849325195, - "loss": 12.2531, + "grad_norm": 0.2889421880245209, + "learning_rate": 0.0001999948141497019, + "loss": 12.2572, "step": 497 }, { "epoch": 0.081545767152448, - "grad_norm": 0.17476806044578552, - "learning_rate": 0.0001996771025005728, - "loss": 12.2172, + "grad_norm": 0.291425496339798, + "learning_rate": 0.0001999947613683114, + "loss": 12.2282, "step": 498 }, { "epoch": 0.08170951367283445, - "grad_norm": 0.19695644080638885, - "learning_rate": 0.0001996757200686261, - "loss": 12.2513, + "grad_norm": 0.25705039501190186, + "learning_rate": 0.00019999470831968274, + "loss": 12.2507, "step": 499 }, { "epoch": 0.0818732601932209, - "grad_norm": 0.1432948112487793, - "learning_rate": 0.0001996743376366794, - "loss": 12.2683, + "grad_norm": 0.1913681924343109, + "learning_rate": 0.00019999465500381605, + "loss": 12.2724, "step": 500 }, { - "epoch": 0.0818732601932209, - "eval_loss": 12.256741523742676, - "eval_runtime": 7.3749, - "eval_samples_per_second": 33.356, - "eval_steps_per_second": 16.678, - "step": 500 + "epoch": 0.08203700671360734, + "grad_norm": 0.2260674387216568, + "learning_rate": 0.00019999460142071143, + "loss": 12.2539, + "step": 501 + }, + { + "epoch": 0.08220075323399378, + "grad_norm": 0.17089596390724182, + "learning_rate": 0.00019999454757036906, + "loss": 12.2639, + "step": 502 + }, + { + "epoch": 0.08236449975438022, + "grad_norm": 0.21298366785049438, + "learning_rate": 0.00019999449345278904, + "loss": 12.2546, + "step": 503 + }, + { + "epoch": 0.08252824627476667, + "grad_norm": 0.1498648226261139, + "learning_rate": 0.0001999944390679716, + "loss": 12.2882, + "step": 504 + }, + { + "epoch": 0.0826919927951531, + "grad_norm": 0.2605227530002594, + "learning_rate": 0.0001999943844159168, + "loss": 12.2627, + "step": 505 + }, + { + "epoch": 0.08285573931553955, + "grad_norm": 0.2724616229534149, + "learning_rate": 0.00019999432949662483, + "loss": 12.261, + "step": 506 + }, + { + "epoch": 0.08301948583592599, + "grad_norm": 0.17513243854045868, + "learning_rate": 0.00019999427431009582, + "loss": 12.2554, + "step": 507 + }, + { + "epoch": 0.08318323235631243, + "grad_norm": 0.3738495409488678, + "learning_rate": 0.00019999421885632992, + "loss": 12.2488, + "step": 508 + }, + { + "epoch": 0.08334697887669887, + "grad_norm": 0.17396804690361023, + "learning_rate": 0.00019999416313532726, + "loss": 12.2296, + "step": 509 + }, + { + "epoch": 0.08351072539708532, + "grad_norm": 0.2673768699169159, + "learning_rate": 0.00019999410714708802, + "loss": 12.2929, + "step": 510 + }, + { + "epoch": 0.08367447191747175, + "grad_norm": 0.126251682639122, + "learning_rate": 0.00019999405089161237, + "loss": 12.2418, + "step": 511 + }, + { + "epoch": 0.0838382184378582, + "grad_norm": 0.13301542401313782, + "learning_rate": 0.00019999399436890038, + "loss": 12.2771, + "step": 512 + }, + { + "epoch": 0.08400196495824463, + "grad_norm": 0.36997753381729126, + "learning_rate": 0.0001999939375789523, + "loss": 12.282, + "step": 513 + }, + { + "epoch": 0.08416571147863108, + "grad_norm": 0.1922840178012848, + "learning_rate": 0.00019999388052176822, + "loss": 12.2512, + "step": 514 + }, + { + "epoch": 0.08432945799901752, + "grad_norm": 0.1612626612186432, + "learning_rate": 0.00019999382319734827, + "loss": 12.2564, + "step": 515 + }, + { + "epoch": 0.08449320451940397, + "grad_norm": 0.3599528670310974, + "learning_rate": 0.00019999376560569267, + "loss": 12.3231, + "step": 516 + }, + { + "epoch": 0.0846569510397904, + "grad_norm": 0.2174975723028183, + "learning_rate": 0.00019999370774680152, + "loss": 12.2482, + "step": 517 + }, + { + "epoch": 0.08482069756017685, + "grad_norm": 0.16101105511188507, + "learning_rate": 0.00019999364962067502, + "loss": 12.2453, + "step": 518 + }, + { + "epoch": 0.08498444408056328, + "grad_norm": 0.16098256409168243, + "learning_rate": 0.0001999935912273133, + "loss": 12.2844, + "step": 519 + }, + { + "epoch": 0.08514819060094973, + "grad_norm": 0.21801027655601501, + "learning_rate": 0.0001999935325667165, + "loss": 12.2693, + "step": 520 + }, + { + "epoch": 0.08531193712133617, + "grad_norm": 0.14086532592773438, + "learning_rate": 0.0001999934736388848, + "loss": 12.2613, + "step": 521 + }, + { + "epoch": 0.08547568364172262, + "grad_norm": 0.20445305109024048, + "learning_rate": 0.00019999341444381833, + "loss": 12.2395, + "step": 522 + }, + { + "epoch": 0.08563943016210905, + "grad_norm": 0.21074581146240234, + "learning_rate": 0.00019999335498151733, + "loss": 12.2754, + "step": 523 + }, + { + "epoch": 0.0858031766824955, + "grad_norm": 0.1751207858324051, + "learning_rate": 0.00019999329525198186, + "loss": 12.2557, + "step": 524 + }, + { + "epoch": 0.08596692320288193, + "grad_norm": 0.19772769510746002, + "learning_rate": 0.00019999323525521213, + "loss": 12.2841, + "step": 525 + }, + { + "epoch": 0.08613066972326838, + "grad_norm": 0.1892375349998474, + "learning_rate": 0.00019999317499120827, + "loss": 12.2425, + "step": 526 + }, + { + "epoch": 0.08629441624365482, + "grad_norm": 0.13868379592895508, + "learning_rate": 0.00019999311445997045, + "loss": 12.2425, + "step": 527 + }, + { + "epoch": 0.08645816276404127, + "grad_norm": 0.19545531272888184, + "learning_rate": 0.00019999305366149886, + "loss": 12.2506, + "step": 528 + }, + { + "epoch": 0.0866219092844277, + "grad_norm": 0.16759023070335388, + "learning_rate": 0.00019999299259579366, + "loss": 12.2702, + "step": 529 + }, + { + "epoch": 0.08678565580481415, + "grad_norm": 0.1422002762556076, + "learning_rate": 0.000199992931262855, + "loss": 12.2798, + "step": 530 + }, + { + "epoch": 0.08694940232520058, + "grad_norm": 0.1551176756620407, + "learning_rate": 0.00019999286966268303, + "loss": 12.2362, + "step": 531 + }, + { + "epoch": 0.08711314884558703, + "grad_norm": 0.19540736079216003, + "learning_rate": 0.00019999280779527793, + "loss": 12.2627, + "step": 532 + }, + { + "epoch": 0.08727689536597347, + "grad_norm": 0.1344379335641861, + "learning_rate": 0.00019999274566063983, + "loss": 12.2668, + "step": 533 + }, + { + "epoch": 0.08744064188635992, + "grad_norm": 0.1458573043346405, + "learning_rate": 0.00019999268325876895, + "loss": 12.2599, + "step": 534 + }, + { + "epoch": 0.08760438840674635, + "grad_norm": 0.2072553187608719, + "learning_rate": 0.00019999262058966542, + "loss": 12.2642, + "step": 535 + }, + { + "epoch": 0.0877681349271328, + "grad_norm": 0.14165370166301727, + "learning_rate": 0.00019999255765332946, + "loss": 12.2656, + "step": 536 + }, + { + "epoch": 0.08793188144751923, + "grad_norm": 0.1711515188217163, + "learning_rate": 0.00019999249444976118, + "loss": 12.2662, + "step": 537 + }, + { + "epoch": 0.08809562796790568, + "grad_norm": 0.1479502171278, + "learning_rate": 0.00019999243097896076, + "loss": 12.2564, + "step": 538 + }, + { + "epoch": 0.08825937448829213, + "grad_norm": 0.32752886414527893, + "learning_rate": 0.0001999923672409284, + "loss": 12.3072, + "step": 539 + }, + { + "epoch": 0.08842312100867857, + "grad_norm": 0.19929148256778717, + "learning_rate": 0.00019999230323566422, + "loss": 12.298, + "step": 540 + }, + { + "epoch": 0.08858686752906501, + "grad_norm": 0.33046862483024597, + "learning_rate": 0.00019999223896316845, + "loss": 12.248, + "step": 541 + }, + { + "epoch": 0.08875061404945145, + "grad_norm": 0.13765937089920044, + "learning_rate": 0.0001999921744234412, + "loss": 12.2542, + "step": 542 + }, + { + "epoch": 0.0889143605698379, + "grad_norm": 0.19766588509082794, + "learning_rate": 0.00019999210961648269, + "loss": 12.2662, + "step": 543 + }, + { + "epoch": 0.08907810709022433, + "grad_norm": 0.23486214876174927, + "learning_rate": 0.00019999204454229308, + "loss": 12.2628, + "step": 544 + }, + { + "epoch": 0.08924185361061078, + "grad_norm": 0.33818957209587097, + "learning_rate": 0.00019999197920087254, + "loss": 12.2958, + "step": 545 + }, + { + "epoch": 0.08940560013099721, + "grad_norm": 0.17376133799552917, + "learning_rate": 0.00019999191359222125, + "loss": 12.2765, + "step": 546 + }, + { + "epoch": 0.08956934665138366, + "grad_norm": 0.20563820004463196, + "learning_rate": 0.00019999184771633938, + "loss": 12.2809, + "step": 547 + }, + { + "epoch": 0.0897330931717701, + "grad_norm": 0.23043566942214966, + "learning_rate": 0.00019999178157322707, + "loss": 12.2716, + "step": 548 + }, + { + "epoch": 0.08989683969215655, + "grad_norm": 0.17227397859096527, + "learning_rate": 0.00019999171516288456, + "loss": 12.2736, + "step": 549 + }, + { + "epoch": 0.09006058621254298, + "grad_norm": 0.20385326445102692, + "learning_rate": 0.00019999164848531203, + "loss": 12.2394, + "step": 550 + }, + { + "epoch": 0.09022433273292943, + "grad_norm": 0.13971176743507385, + "learning_rate": 0.0001999915815405096, + "loss": 12.2582, + "step": 551 + }, + { + "epoch": 0.09038807925331586, + "grad_norm": 0.15711809694766998, + "learning_rate": 0.0001999915143284775, + "loss": 12.2369, + "step": 552 + }, + { + "epoch": 0.09055182577370231, + "grad_norm": 0.20646773278713226, + "learning_rate": 0.00019999144684921584, + "loss": 12.2469, + "step": 553 + }, + { + "epoch": 0.09071557229408875, + "grad_norm": 0.1829550713300705, + "learning_rate": 0.00019999137910272488, + "loss": 12.2541, + "step": 554 + }, + { + "epoch": 0.0908793188144752, + "grad_norm": 0.14914344251155853, + "learning_rate": 0.0001999913110890048, + "loss": 12.2867, + "step": 555 + }, + { + "epoch": 0.09104306533486163, + "grad_norm": 0.1926674097776413, + "learning_rate": 0.00019999124280805573, + "loss": 12.2441, + "step": 556 + }, + { + "epoch": 0.09120681185524808, + "grad_norm": 0.12586253881454468, + "learning_rate": 0.00019999117425987785, + "loss": 12.2711, + "step": 557 + }, + { + "epoch": 0.09137055837563451, + "grad_norm": 0.2592686414718628, + "learning_rate": 0.00019999110544447138, + "loss": 12.238, + "step": 558 + }, + { + "epoch": 0.09153430489602096, + "grad_norm": 0.1788288652896881, + "learning_rate": 0.0001999910363618365, + "loss": 12.2804, + "step": 559 + }, + { + "epoch": 0.0916980514164074, + "grad_norm": 0.15797553956508636, + "learning_rate": 0.00019999096701197339, + "loss": 12.2578, + "step": 560 + }, + { + "epoch": 0.09186179793679385, + "grad_norm": 0.155698761343956, + "learning_rate": 0.00019999089739488221, + "loss": 12.2505, + "step": 561 + }, + { + "epoch": 0.09202554445718028, + "grad_norm": 0.201356902718544, + "learning_rate": 0.00019999082751056318, + "loss": 12.2666, + "step": 562 + }, + { + "epoch": 0.09218929097756673, + "grad_norm": 0.4484696090221405, + "learning_rate": 0.00019999075735901647, + "loss": 12.2981, + "step": 563 + }, + { + "epoch": 0.09235303749795316, + "grad_norm": 0.23958955705165863, + "learning_rate": 0.00019999068694024228, + "loss": 12.3007, + "step": 564 + }, + { + "epoch": 0.09251678401833961, + "grad_norm": 0.20138677954673767, + "learning_rate": 0.0001999906162542408, + "loss": 12.2547, + "step": 565 + }, + { + "epoch": 0.09268053053872605, + "grad_norm": 0.2650439739227295, + "learning_rate": 0.00019999054530101218, + "loss": 12.2488, + "step": 566 + }, + { + "epoch": 0.0928442770591125, + "grad_norm": 0.16292192041873932, + "learning_rate": 0.00019999047408055665, + "loss": 12.2734, + "step": 567 + }, + { + "epoch": 0.09300802357949893, + "grad_norm": 0.16789360344409943, + "learning_rate": 0.0001999904025928744, + "loss": 12.2769, + "step": 568 + }, + { + "epoch": 0.09317177009988538, + "grad_norm": 0.18500173091888428, + "learning_rate": 0.0001999903308379656, + "loss": 12.2368, + "step": 569 + }, + { + "epoch": 0.09333551662027181, + "grad_norm": 0.166462704539299, + "learning_rate": 0.00019999025881583044, + "loss": 12.2278, + "step": 570 + }, + { + "epoch": 0.09349926314065826, + "grad_norm": 0.16099901497364044, + "learning_rate": 0.00019999018652646912, + "loss": 12.2619, + "step": 571 + }, + { + "epoch": 0.0936630096610447, + "grad_norm": 0.1871333122253418, + "learning_rate": 0.00019999011396988186, + "loss": 12.2631, + "step": 572 + }, + { + "epoch": 0.09382675618143115, + "grad_norm": 0.16076967120170593, + "learning_rate": 0.0001999900411460688, + "loss": 12.2759, + "step": 573 + }, + { + "epoch": 0.09399050270181758, + "grad_norm": 0.1936284452676773, + "learning_rate": 0.0001999899680550302, + "loss": 12.2962, + "step": 574 + }, + { + "epoch": 0.09415424922220403, + "grad_norm": 0.15248098969459534, + "learning_rate": 0.00019998989469676622, + "loss": 12.2729, + "step": 575 + }, + { + "epoch": 0.09431799574259046, + "grad_norm": 0.2867307960987091, + "learning_rate": 0.00019998982107127702, + "loss": 12.256, + "step": 576 + }, + { + "epoch": 0.09448174226297691, + "grad_norm": 0.23930412530899048, + "learning_rate": 0.00019998974717856286, + "loss": 12.2622, + "step": 577 + }, + { + "epoch": 0.09464548878336336, + "grad_norm": 0.23292279243469238, + "learning_rate": 0.0001999896730186239, + "loss": 12.2749, + "step": 578 + }, + { + "epoch": 0.0948092353037498, + "grad_norm": 0.20313963294029236, + "learning_rate": 0.00019998959859146036, + "loss": 12.2894, + "step": 579 + }, + { + "epoch": 0.09497298182413624, + "grad_norm": 0.18153154850006104, + "learning_rate": 0.00019998952389707242, + "loss": 12.2516, + "step": 580 + }, + { + "epoch": 0.09513672834452268, + "grad_norm": 0.15528979897499084, + "learning_rate": 0.0001999894489354603, + "loss": 12.2622, + "step": 581 + }, + { + "epoch": 0.09530047486490913, + "grad_norm": 0.15820947289466858, + "learning_rate": 0.00019998937370662416, + "loss": 12.2784, + "step": 582 + }, + { + "epoch": 0.09546422138529556, + "grad_norm": 0.1906697005033493, + "learning_rate": 0.00019998929821056426, + "loss": 12.2721, + "step": 583 + }, + { + "epoch": 0.09562796790568201, + "grad_norm": 0.16757844388484955, + "learning_rate": 0.00019998922244728076, + "loss": 12.2518, + "step": 584 + }, + { + "epoch": 0.09579171442606844, + "grad_norm": 0.1887599378824234, + "learning_rate": 0.0001999891464167739, + "loss": 12.2657, + "step": 585 + }, + { + "epoch": 0.09595546094645489, + "grad_norm": 0.2148314267396927, + "learning_rate": 0.0001999890701190438, + "loss": 12.2641, + "step": 586 + }, + { + "epoch": 0.09611920746684133, + "grad_norm": 0.2525731921195984, + "learning_rate": 0.00019998899355409076, + "loss": 12.2726, + "step": 587 + }, + { + "epoch": 0.09628295398722778, + "grad_norm": 0.25445112586021423, + "learning_rate": 0.00019998891672191494, + "loss": 12.2299, + "step": 588 + }, + { + "epoch": 0.09644670050761421, + "grad_norm": 0.20000016689300537, + "learning_rate": 0.00019998883962251654, + "loss": 12.2536, + "step": 589 + }, + { + "epoch": 0.09661044702800066, + "grad_norm": 0.20457811653614044, + "learning_rate": 0.00019998876225589578, + "loss": 12.2577, + "step": 590 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 0.17679806053638458, + "learning_rate": 0.00019998868462205285, + "loss": 12.2519, + "step": 591 + }, + { + "epoch": 0.09693794006877354, + "grad_norm": 0.1711779534816742, + "learning_rate": 0.000199988606720988, + "loss": 12.2187, + "step": 592 + }, + { + "epoch": 0.09710168658915998, + "grad_norm": 0.2674786448478699, + "learning_rate": 0.0001999885285527014, + "loss": 12.2609, + "step": 593 + }, + { + "epoch": 0.09726543310954643, + "grad_norm": 0.14570537209510803, + "learning_rate": 0.00019998845011719326, + "loss": 12.2463, + "step": 594 + }, + { + "epoch": 0.09742917962993286, + "grad_norm": 0.17581459879875183, + "learning_rate": 0.00019998837141446378, + "loss": 12.2543, + "step": 595 + }, + { + "epoch": 0.09759292615031931, + "grad_norm": 0.1694401651620865, + "learning_rate": 0.0001999882924445132, + "loss": 12.2882, + "step": 596 + }, + { + "epoch": 0.09775667267070574, + "grad_norm": 0.18744853138923645, + "learning_rate": 0.00019998821320734177, + "loss": 12.2401, + "step": 597 + }, + { + "epoch": 0.09792041919109219, + "grad_norm": 0.26006919145584106, + "learning_rate": 0.00019998813370294957, + "loss": 12.2708, + "step": 598 + }, + { + "epoch": 0.09808416571147863, + "grad_norm": 0.17062494158744812, + "learning_rate": 0.00019998805393133692, + "loss": 12.2635, + "step": 599 + }, + { + "epoch": 0.09824791223186508, + "grad_norm": 0.19302265346050262, + "learning_rate": 0.00019998797389250404, + "loss": 12.2255, + "step": 600 + }, + { + "epoch": 0.09841165875225151, + "grad_norm": 0.1742558777332306, + "learning_rate": 0.00019998789358645106, + "loss": 12.2327, + "step": 601 + }, + { + "epoch": 0.09857540527263796, + "grad_norm": 0.21890263259410858, + "learning_rate": 0.0001999878130131783, + "loss": 12.2857, + "step": 602 + }, + { + "epoch": 0.0987391517930244, + "grad_norm": 0.16165485978126526, + "learning_rate": 0.00019998773217268586, + "loss": 12.2459, + "step": 603 + }, + { + "epoch": 0.09890289831341084, + "grad_norm": 0.2369285672903061, + "learning_rate": 0.0001999876510649741, + "loss": 12.2954, + "step": 604 + }, + { + "epoch": 0.09906664483379728, + "grad_norm": 0.1774766743183136, + "learning_rate": 0.00019998756969004307, + "loss": 12.2627, + "step": 605 + }, + { + "epoch": 0.09923039135418373, + "grad_norm": 0.14726974070072174, + "learning_rate": 0.00019998748804789308, + "loss": 12.2593, + "step": 606 + }, + { + "epoch": 0.09939413787457016, + "grad_norm": 0.20986835658550262, + "learning_rate": 0.0001999874061385244, + "loss": 12.2769, + "step": 607 + }, + { + "epoch": 0.09955788439495661, + "grad_norm": 0.15654174983501434, + "learning_rate": 0.0001999873239619371, + "loss": 12.2858, + "step": 608 + }, + { + "epoch": 0.09972163091534304, + "grad_norm": 0.2046964168548584, + "learning_rate": 0.00019998724151813155, + "loss": 12.2597, + "step": 609 + }, + { + "epoch": 0.09988537743572949, + "grad_norm": 0.1860288828611374, + "learning_rate": 0.0001999871588071079, + "loss": 12.2755, + "step": 610 + }, + { + "epoch": 0.10004912395611593, + "grad_norm": 0.17672719061374664, + "learning_rate": 0.00019998707582886635, + "loss": 12.2937, + "step": 611 + }, + { + "epoch": 0.10021287047650237, + "grad_norm": 0.25083354115486145, + "learning_rate": 0.00019998699258340718, + "loss": 12.2747, + "step": 612 + }, + { + "epoch": 0.10037661699688881, + "grad_norm": 0.2215733677148819, + "learning_rate": 0.00019998690907073056, + "loss": 12.2694, + "step": 613 + }, + { + "epoch": 0.10054036351727526, + "grad_norm": 0.2608155310153961, + "learning_rate": 0.00019998682529083676, + "loss": 12.2544, + "step": 614 + }, + { + "epoch": 0.1007041100376617, + "grad_norm": 0.25067582726478577, + "learning_rate": 0.00019998674124372596, + "loss": 12.258, + "step": 615 + }, + { + "epoch": 0.10086785655804814, + "grad_norm": 0.21041239798069, + "learning_rate": 0.00019998665692939842, + "loss": 12.2686, + "step": 616 + }, + { + "epoch": 0.10103160307843459, + "grad_norm": 0.1990176886320114, + "learning_rate": 0.00019998657234785434, + "loss": 12.2596, + "step": 617 + }, + { + "epoch": 0.10119534959882102, + "grad_norm": 0.3006889820098877, + "learning_rate": 0.00019998648749909397, + "loss": 12.2786, + "step": 618 + }, + { + "epoch": 0.10135909611920747, + "grad_norm": 0.19156216084957123, + "learning_rate": 0.0001999864023831175, + "loss": 12.2623, + "step": 619 + }, + { + "epoch": 0.10152284263959391, + "grad_norm": 0.24337586760520935, + "learning_rate": 0.0001999863169999252, + "loss": 12.2687, + "step": 620 + }, + { + "epoch": 0.10168658915998036, + "grad_norm": 0.2112797200679779, + "learning_rate": 0.00019998623134951727, + "loss": 12.2371, + "step": 621 + }, + { + "epoch": 0.10185033568036679, + "grad_norm": 0.3540230095386505, + "learning_rate": 0.00019998614543189394, + "loss": 12.2661, + "step": 622 + }, + { + "epoch": 0.10201408220075324, + "grad_norm": 0.21008434891700745, + "learning_rate": 0.00019998605924705546, + "loss": 12.2417, + "step": 623 + }, + { + "epoch": 0.10217782872113967, + "grad_norm": 0.3391290307044983, + "learning_rate": 0.000199985972795002, + "loss": 12.2789, + "step": 624 + }, + { + "epoch": 0.10234157524152612, + "grad_norm": 0.22646580636501312, + "learning_rate": 0.0001999858860757339, + "loss": 12.2695, + "step": 625 + }, + { + "epoch": 0.10250532176191256, + "grad_norm": 0.20563830435276031, + "learning_rate": 0.0001999857990892513, + "loss": 12.2392, + "step": 626 + }, + { + "epoch": 0.102669068282299, + "grad_norm": 0.14008118212223053, + "learning_rate": 0.00019998571183555447, + "loss": 12.2677, + "step": 627 + }, + { + "epoch": 0.10283281480268544, + "grad_norm": 0.18549852073192596, + "learning_rate": 0.00019998562431464365, + "loss": 12.2683, + "step": 628 + }, + { + "epoch": 0.10299656132307189, + "grad_norm": 0.1716676652431488, + "learning_rate": 0.00019998553652651903, + "loss": 12.2592, + "step": 629 + }, + { + "epoch": 0.10316030784345832, + "grad_norm": 0.18004333972930908, + "learning_rate": 0.0001999854484711809, + "loss": 12.2507, + "step": 630 + }, + { + "epoch": 0.10332405436384477, + "grad_norm": 0.1485266536474228, + "learning_rate": 0.00019998536014862944, + "loss": 12.2692, + "step": 631 + }, + { + "epoch": 0.1034878008842312, + "grad_norm": 0.16121861338615417, + "learning_rate": 0.00019998527155886496, + "loss": 12.2907, + "step": 632 + }, + { + "epoch": 0.10365154740461766, + "grad_norm": 0.2284204214811325, + "learning_rate": 0.00019998518270188763, + "loss": 12.2752, + "step": 633 + }, + { + "epoch": 0.10381529392500409, + "grad_norm": 0.21021804213523865, + "learning_rate": 0.0001999850935776977, + "loss": 12.2439, + "step": 634 + }, + { + "epoch": 0.10397904044539054, + "grad_norm": 0.18460194766521454, + "learning_rate": 0.0001999850041862954, + "loss": 12.2636, + "step": 635 + }, + { + "epoch": 0.10414278696577697, + "grad_norm": 0.235703706741333, + "learning_rate": 0.00019998491452768102, + "loss": 12.2853, + "step": 636 + }, + { + "epoch": 0.10430653348616342, + "grad_norm": 0.1810152679681778, + "learning_rate": 0.00019998482460185474, + "loss": 12.273, + "step": 637 + }, + { + "epoch": 0.10447028000654986, + "grad_norm": 0.26445272564888, + "learning_rate": 0.00019998473440881686, + "loss": 12.2545, + "step": 638 + }, + { + "epoch": 0.1046340265269363, + "grad_norm": 0.47541436553001404, + "learning_rate": 0.00019998464394856757, + "loss": 12.3134, + "step": 639 + }, + { + "epoch": 0.10479777304732274, + "grad_norm": 0.20061855018138885, + "learning_rate": 0.00019998455322110714, + "loss": 12.259, + "step": 640 + }, + { + "epoch": 0.10496151956770919, + "grad_norm": 0.19952458143234253, + "learning_rate": 0.00019998446222643579, + "loss": 12.2633, + "step": 641 + }, + { + "epoch": 0.10512526608809562, + "grad_norm": 0.18077236413955688, + "learning_rate": 0.00019998437096455375, + "loss": 12.2649, + "step": 642 + }, + { + "epoch": 0.10528901260848207, + "grad_norm": 0.19510169327259064, + "learning_rate": 0.00019998427943546134, + "loss": 12.2537, + "step": 643 + }, + { + "epoch": 0.1054527591288685, + "grad_norm": 0.1354212760925293, + "learning_rate": 0.00019998418763915872, + "loss": 12.2671, + "step": 644 + }, + { + "epoch": 0.10561650564925495, + "grad_norm": 0.15665219724178314, + "learning_rate": 0.0001999840955756462, + "loss": 12.2936, + "step": 645 + }, + { + "epoch": 0.10578025216964139, + "grad_norm": 0.17888222634792328, + "learning_rate": 0.00019998400324492394, + "loss": 12.2707, + "step": 646 + }, + { + "epoch": 0.10594399869002784, + "grad_norm": 0.20793989300727844, + "learning_rate": 0.0001999839106469923, + "loss": 12.263, + "step": 647 + }, + { + "epoch": 0.10610774521041427, + "grad_norm": 0.2504526674747467, + "learning_rate": 0.00019998381778185142, + "loss": 12.2566, + "step": 648 + }, + { + "epoch": 0.10627149173080072, + "grad_norm": 0.2541908621788025, + "learning_rate": 0.00019998372464950163, + "loss": 12.2473, + "step": 649 + }, + { + "epoch": 0.10643523825118716, + "grad_norm": 0.1839291751384735, + "learning_rate": 0.00019998363124994314, + "loss": 12.2588, + "step": 650 + }, + { + "epoch": 0.1065989847715736, + "grad_norm": 0.1613001674413681, + "learning_rate": 0.00019998353758317618, + "loss": 12.2363, + "step": 651 + }, + { + "epoch": 0.10676273129196004, + "grad_norm": 0.29961979389190674, + "learning_rate": 0.00019998344364920108, + "loss": 12.2315, + "step": 652 + }, + { + "epoch": 0.10692647781234649, + "grad_norm": 0.1734955608844757, + "learning_rate": 0.000199983349448018, + "loss": 12.2566, + "step": 653 + }, + { + "epoch": 0.10709022433273294, + "grad_norm": 0.22714467346668243, + "learning_rate": 0.00019998325497962721, + "loss": 12.2773, + "step": 654 + }, + { + "epoch": 0.10725397085311937, + "grad_norm": 0.18775632977485657, + "learning_rate": 0.00019998316024402902, + "loss": 12.2499, + "step": 655 + }, + { + "epoch": 0.10741771737350582, + "grad_norm": 0.14911091327667236, + "learning_rate": 0.00019998306524122365, + "loss": 12.2585, + "step": 656 + }, + { + "epoch": 0.10758146389389225, + "grad_norm": 0.153729647397995, + "learning_rate": 0.00019998296997121134, + "loss": 12.2517, + "step": 657 + }, + { + "epoch": 0.1077452104142787, + "grad_norm": 0.12172205001115799, + "learning_rate": 0.00019998287443399233, + "loss": 12.2633, + "step": 658 + }, + { + "epoch": 0.10790895693466514, + "grad_norm": 0.2324371337890625, + "learning_rate": 0.0001999827786295669, + "loss": 12.2705, + "step": 659 + }, + { + "epoch": 0.10807270345505159, + "grad_norm": 0.14951297640800476, + "learning_rate": 0.00019998268255793533, + "loss": 12.2622, + "step": 660 + }, + { + "epoch": 0.10823644997543802, + "grad_norm": 0.2147216498851776, + "learning_rate": 0.0001999825862190978, + "loss": 12.2956, + "step": 661 + }, + { + "epoch": 0.10840019649582447, + "grad_norm": 0.3348814845085144, + "learning_rate": 0.00019998248961305471, + "loss": 12.2714, + "step": 662 + }, + { + "epoch": 0.1085639430162109, + "grad_norm": 0.2494417130947113, + "learning_rate": 0.00019998239273980617, + "loss": 12.2685, + "step": 663 + }, + { + "epoch": 0.10872768953659735, + "grad_norm": 0.20005349814891815, + "learning_rate": 0.00019998229559935247, + "loss": 12.2377, + "step": 664 + }, + { + "epoch": 0.10889143605698379, + "grad_norm": 0.2329307198524475, + "learning_rate": 0.00019998219819169396, + "loss": 12.2541, + "step": 665 + }, + { + "epoch": 0.10905518257737024, + "grad_norm": 0.3226028382778168, + "learning_rate": 0.0001999821005168308, + "loss": 12.2794, + "step": 666 + }, + { + "epoch": 0.10921892909775667, + "grad_norm": 0.21378913521766663, + "learning_rate": 0.00019998200257476326, + "loss": 12.2466, + "step": 667 + }, + { + "epoch": 0.10938267561814312, + "grad_norm": 0.18209829926490784, + "learning_rate": 0.00019998190436549166, + "loss": 12.246, + "step": 668 + }, + { + "epoch": 0.10954642213852955, + "grad_norm": 0.1942809522151947, + "learning_rate": 0.00019998180588901625, + "loss": 12.2976, + "step": 669 + }, + { + "epoch": 0.109710168658916, + "grad_norm": 0.2823975384235382, + "learning_rate": 0.00019998170714533725, + "loss": 12.2609, + "step": 670 + }, + { + "epoch": 0.10987391517930244, + "grad_norm": 0.1592366099357605, + "learning_rate": 0.00019998160813445494, + "loss": 12.2825, + "step": 671 + }, + { + "epoch": 0.11003766169968889, + "grad_norm": 0.16642484068870544, + "learning_rate": 0.0001999815088563696, + "loss": 12.2322, + "step": 672 + }, + { + "epoch": 0.11020140822007532, + "grad_norm": 0.3897826373577118, + "learning_rate": 0.00019998140931108148, + "loss": 12.2343, + "step": 673 + }, + { + "epoch": 0.11036515474046177, + "grad_norm": 0.14695951342582703, + "learning_rate": 0.00019998130949859088, + "loss": 12.2712, + "step": 674 + }, + { + "epoch": 0.1105289012608482, + "grad_norm": 0.28502780199050903, + "learning_rate": 0.00019998120941889803, + "loss": 12.2653, + "step": 675 + }, + { + "epoch": 0.11069264778123465, + "grad_norm": 0.20605143904685974, + "learning_rate": 0.0001999811090720032, + "loss": 12.2505, + "step": 676 + }, + { + "epoch": 0.11085639430162109, + "grad_norm": 0.17322997748851776, + "learning_rate": 0.00019998100845790667, + "loss": 12.2404, + "step": 677 + }, + { + "epoch": 0.11102014082200753, + "grad_norm": 0.22566843032836914, + "learning_rate": 0.00019998090757660872, + "loss": 12.2536, + "step": 678 + }, + { + "epoch": 0.11118388734239397, + "grad_norm": 0.2020934671163559, + "learning_rate": 0.00019998080642810959, + "loss": 12.2919, + "step": 679 + }, + { + "epoch": 0.11134763386278042, + "grad_norm": 0.20571619272232056, + "learning_rate": 0.00019998070501240958, + "loss": 12.2496, + "step": 680 + }, + { + "epoch": 0.11151138038316685, + "grad_norm": 0.23394832015037537, + "learning_rate": 0.0001999806033295089, + "loss": 12.3042, + "step": 681 + }, + { + "epoch": 0.1116751269035533, + "grad_norm": 0.37446317076683044, + "learning_rate": 0.00019998050137940793, + "loss": 12.2232, + "step": 682 + }, + { + "epoch": 0.11183887342393974, + "grad_norm": 0.24774238467216492, + "learning_rate": 0.00019998039916210684, + "loss": 12.2569, + "step": 683 + }, + { + "epoch": 0.11200261994432618, + "grad_norm": 0.18902882933616638, + "learning_rate": 0.00019998029667760595, + "loss": 12.2678, + "step": 684 + }, + { + "epoch": 0.11216636646471262, + "grad_norm": 0.24386021494865417, + "learning_rate": 0.00019998019392590552, + "loss": 12.2515, + "step": 685 + }, + { + "epoch": 0.11233011298509907, + "grad_norm": 0.1919771432876587, + "learning_rate": 0.00019998009090700585, + "loss": 12.2585, + "step": 686 + }, + { + "epoch": 0.1124938595054855, + "grad_norm": 0.2406018227338791, + "learning_rate": 0.0001999799876209072, + "loss": 12.2547, + "step": 687 + }, + { + "epoch": 0.11265760602587195, + "grad_norm": 0.18448154628276825, + "learning_rate": 0.00019997988406760984, + "loss": 12.2652, + "step": 688 + }, + { + "epoch": 0.11282135254625839, + "grad_norm": 0.26016390323638916, + "learning_rate": 0.00019997978024711405, + "loss": 12.2443, + "step": 689 + }, + { + "epoch": 0.11298509906664483, + "grad_norm": 0.22282028198242188, + "learning_rate": 0.0001999796761594201, + "loss": 12.2521, + "step": 690 + }, + { + "epoch": 0.11314884558703127, + "grad_norm": 0.19451838731765747, + "learning_rate": 0.0001999795718045283, + "loss": 12.2511, + "step": 691 + }, + { + "epoch": 0.11331259210741772, + "grad_norm": 0.13449028134346008, + "learning_rate": 0.00019997946718243887, + "loss": 12.2649, + "step": 692 + }, + { + "epoch": 0.11347633862780417, + "grad_norm": 0.16171801090240479, + "learning_rate": 0.00019997936229315216, + "loss": 12.2522, + "step": 693 + }, + { + "epoch": 0.1136400851481906, + "grad_norm": 0.2671642601490021, + "learning_rate": 0.00019997925713666835, + "loss": 12.2789, + "step": 694 + }, + { + "epoch": 0.11380383166857705, + "grad_norm": 0.20152407884597778, + "learning_rate": 0.00019997915171298788, + "loss": 12.2889, + "step": 695 + }, + { + "epoch": 0.11396757818896348, + "grad_norm": 0.14468339085578918, + "learning_rate": 0.00019997904602211085, + "loss": 12.2395, + "step": 696 + }, + { + "epoch": 0.11413132470934993, + "grad_norm": 0.12371987104415894, + "learning_rate": 0.00019997894006403767, + "loss": 12.2567, + "step": 697 + }, + { + "epoch": 0.11429507122973637, + "grad_norm": 0.2297051101922989, + "learning_rate": 0.00019997883383876858, + "loss": 12.2663, + "step": 698 + }, + { + "epoch": 0.11445881775012282, + "grad_norm": 0.24164938926696777, + "learning_rate": 0.0001999787273463039, + "loss": 12.2613, + "step": 699 + }, + { + "epoch": 0.11462256427050925, + "grad_norm": 0.12142845243215561, + "learning_rate": 0.00019997862058664383, + "loss": 12.2554, + "step": 700 + }, + { + "epoch": 0.1147863107908957, + "grad_norm": 0.27234968543052673, + "learning_rate": 0.00019997851355978873, + "loss": 12.2556, + "step": 701 + }, + { + "epoch": 0.11495005731128213, + "grad_norm": 0.2992333173751831, + "learning_rate": 0.00019997840626573887, + "loss": 12.2827, + "step": 702 + }, + { + "epoch": 0.11511380383166858, + "grad_norm": 0.16893671452999115, + "learning_rate": 0.0001999782987044945, + "loss": 12.2516, + "step": 703 + }, + { + "epoch": 0.11527755035205502, + "grad_norm": 0.15710364282131195, + "learning_rate": 0.00019997819087605597, + "loss": 12.2315, + "step": 704 + }, + { + "epoch": 0.11544129687244147, + "grad_norm": 0.1487736850976944, + "learning_rate": 0.0001999780827804235, + "loss": 12.2508, + "step": 705 + }, + { + "epoch": 0.1156050433928279, + "grad_norm": 0.19448354840278625, + "learning_rate": 0.00019997797441759745, + "loss": 12.2404, + "step": 706 + }, + { + "epoch": 0.11576878991321435, + "grad_norm": 0.2013469785451889, + "learning_rate": 0.00019997786578757808, + "loss": 12.2744, + "step": 707 + }, + { + "epoch": 0.11593253643360078, + "grad_norm": 0.2695065438747406, + "learning_rate": 0.00019997775689036565, + "loss": 12.2469, + "step": 708 + }, + { + "epoch": 0.11609628295398723, + "grad_norm": 0.1316147893667221, + "learning_rate": 0.00019997764772596046, + "loss": 12.2832, + "step": 709 + }, + { + "epoch": 0.11626002947437367, + "grad_norm": 0.22008389234542847, + "learning_rate": 0.00019997753829436286, + "loss": 12.2447, + "step": 710 + }, + { + "epoch": 0.11642377599476011, + "grad_norm": 0.14515216648578644, + "learning_rate": 0.00019997742859557307, + "loss": 12.2219, + "step": 711 + }, + { + "epoch": 0.11658752251514655, + "grad_norm": 0.25737878680229187, + "learning_rate": 0.0001999773186295914, + "loss": 12.2293, + "step": 712 + }, + { + "epoch": 0.116751269035533, + "grad_norm": 0.22486746311187744, + "learning_rate": 0.0001999772083964182, + "loss": 12.2661, + "step": 713 + }, + { + "epoch": 0.11691501555591943, + "grad_norm": 0.17334704101085663, + "learning_rate": 0.0001999770978960537, + "loss": 12.2547, + "step": 714 + }, + { + "epoch": 0.11707876207630588, + "grad_norm": 0.1596781462430954, + "learning_rate": 0.00019997698712849823, + "loss": 12.2811, + "step": 715 + }, + { + "epoch": 0.11724250859669232, + "grad_norm": 0.2053455412387848, + "learning_rate": 0.00019997687609375203, + "loss": 12.2605, + "step": 716 + }, + { + "epoch": 0.11740625511707876, + "grad_norm": 0.15111824870109558, + "learning_rate": 0.00019997676479181547, + "loss": 12.2554, + "step": 717 + }, + { + "epoch": 0.1175700016374652, + "grad_norm": 0.14962074160575867, + "learning_rate": 0.00019997665322268881, + "loss": 12.2361, + "step": 718 + }, + { + "epoch": 0.11773374815785165, + "grad_norm": 0.18393626809120178, + "learning_rate": 0.00019997654138637238, + "loss": 12.237, + "step": 719 + }, + { + "epoch": 0.11789749467823808, + "grad_norm": 0.22599177062511444, + "learning_rate": 0.0001999764292828664, + "loss": 12.2619, + "step": 720 + }, + { + "epoch": 0.11806124119862453, + "grad_norm": 0.20265071094036102, + "learning_rate": 0.00019997631691217127, + "loss": 12.2589, + "step": 721 + }, + { + "epoch": 0.11822498771901097, + "grad_norm": 0.2219390720129013, + "learning_rate": 0.00019997620427428722, + "loss": 12.244, + "step": 722 + }, + { + "epoch": 0.11838873423939741, + "grad_norm": 0.2106860876083374, + "learning_rate": 0.0001999760913692146, + "loss": 12.2522, + "step": 723 + }, + { + "epoch": 0.11855248075978385, + "grad_norm": 0.19955146312713623, + "learning_rate": 0.00019997597819695364, + "loss": 12.2614, + "step": 724 + }, + { + "epoch": 0.1187162272801703, + "grad_norm": 0.24155764281749725, + "learning_rate": 0.00019997586475750475, + "loss": 12.2692, + "step": 725 + }, + { + "epoch": 0.11887997380055673, + "grad_norm": 0.18328611552715302, + "learning_rate": 0.0001999757510508681, + "loss": 12.2532, + "step": 726 + }, + { + "epoch": 0.11904372032094318, + "grad_norm": 0.28729379177093506, + "learning_rate": 0.0001999756370770441, + "loss": 12.2833, + "step": 727 + }, + { + "epoch": 0.11920746684132962, + "grad_norm": 0.13738590478897095, + "learning_rate": 0.00019997552283603306, + "loss": 12.2574, + "step": 728 + }, + { + "epoch": 0.11937121336171606, + "grad_norm": 0.2414904534816742, + "learning_rate": 0.00019997540832783522, + "loss": 12.3013, + "step": 729 + }, + { + "epoch": 0.11953495988210251, + "grad_norm": 0.1890541911125183, + "learning_rate": 0.0001999752935524509, + "loss": 12.351, + "step": 730 + }, + { + "epoch": 0.11969870640248895, + "grad_norm": 0.2413313090801239, + "learning_rate": 0.0001999751785098804, + "loss": 12.2582, + "step": 731 + }, + { + "epoch": 0.1198624529228754, + "grad_norm": 0.18245859444141388, + "learning_rate": 0.00019997506320012408, + "loss": 12.2701, + "step": 732 + }, + { + "epoch": 0.12002619944326183, + "grad_norm": 0.17451536655426025, + "learning_rate": 0.00019997494762318221, + "loss": 12.2617, + "step": 733 + }, + { + "epoch": 0.12018994596364828, + "grad_norm": 0.31427887082099915, + "learning_rate": 0.0001999748317790551, + "loss": 12.2727, + "step": 734 + }, + { + "epoch": 0.12035369248403471, + "grad_norm": 0.22707721590995789, + "learning_rate": 0.00019997471566774304, + "loss": 12.2636, + "step": 735 + }, + { + "epoch": 0.12051743900442116, + "grad_norm": 0.16001681983470917, + "learning_rate": 0.00019997459928924638, + "loss": 12.2485, + "step": 736 + }, + { + "epoch": 0.1206811855248076, + "grad_norm": 0.2794734239578247, + "learning_rate": 0.00019997448264356545, + "loss": 12.2922, + "step": 737 + }, + { + "epoch": 0.12084493204519405, + "grad_norm": 0.18110767006874084, + "learning_rate": 0.00019997436573070048, + "loss": 12.2591, + "step": 738 + }, + { + "epoch": 0.12100867856558048, + "grad_norm": 0.20784614980220795, + "learning_rate": 0.00019997424855065183, + "loss": 12.2375, + "step": 739 + }, + { + "epoch": 0.12117242508596693, + "grad_norm": 0.3365764617919922, + "learning_rate": 0.00019997413110341982, + "loss": 12.297, + "step": 740 + }, + { + "epoch": 0.12133617160635336, + "grad_norm": 0.17696967720985413, + "learning_rate": 0.00019997401338900476, + "loss": 12.2253, + "step": 741 + }, + { + "epoch": 0.12149991812673981, + "grad_norm": 0.1352595090866089, + "learning_rate": 0.00019997389540740693, + "loss": 12.2659, + "step": 742 + }, + { + "epoch": 0.12166366464712625, + "grad_norm": 0.268904447555542, + "learning_rate": 0.00019997377715862672, + "loss": 12.2673, + "step": 743 + }, + { + "epoch": 0.1218274111675127, + "grad_norm": 0.22296397387981415, + "learning_rate": 0.00019997365864266438, + "loss": 12.2469, + "step": 744 + }, + { + "epoch": 0.12199115768789913, + "grad_norm": 0.22052326798439026, + "learning_rate": 0.00019997353985952025, + "loss": 12.2621, + "step": 745 + }, + { + "epoch": 0.12215490420828558, + "grad_norm": 0.3132556974887848, + "learning_rate": 0.00019997342080919466, + "loss": 12.2708, + "step": 746 + }, + { + "epoch": 0.12231865072867201, + "grad_norm": 0.1781475841999054, + "learning_rate": 0.0001999733014916879, + "loss": 12.22, + "step": 747 + }, + { + "epoch": 0.12248239724905846, + "grad_norm": 0.21517692506313324, + "learning_rate": 0.0001999731819070003, + "loss": 12.257, + "step": 748 + }, + { + "epoch": 0.1226461437694449, + "grad_norm": 0.22570882737636566, + "learning_rate": 0.00019997306205513218, + "loss": 12.2332, + "step": 749 + }, + { + "epoch": 0.12280989028983134, + "grad_norm": 0.17157399654388428, + "learning_rate": 0.00019997294193608388, + "loss": 12.2591, + "step": 750 + }, + { + "epoch": 0.12297363681021778, + "grad_norm": 0.2372014820575714, + "learning_rate": 0.00019997282154985568, + "loss": 12.2546, + "step": 751 + }, + { + "epoch": 0.12313738333060423, + "grad_norm": 0.2063087373971939, + "learning_rate": 0.00019997270089644792, + "loss": 12.2637, + "step": 752 + }, + { + "epoch": 0.12330112985099066, + "grad_norm": 0.18634669482707977, + "learning_rate": 0.00019997257997586093, + "loss": 12.2695, + "step": 753 + }, + { + "epoch": 0.12346487637137711, + "grad_norm": 0.231937438249588, + "learning_rate": 0.00019997245878809508, + "loss": 12.2457, + "step": 754 + }, + { + "epoch": 0.12362862289176355, + "grad_norm": 0.17607882618904114, + "learning_rate": 0.0001999723373331506, + "loss": 12.2468, + "step": 755 + }, + { + "epoch": 0.12379236941215, + "grad_norm": 0.2614947259426117, + "learning_rate": 0.00019997221561102787, + "loss": 12.2423, + "step": 756 + }, + { + "epoch": 0.12395611593253643, + "grad_norm": 0.30709508061408997, + "learning_rate": 0.0001999720936217272, + "loss": 12.3144, + "step": 757 + }, + { + "epoch": 0.12411986245292288, + "grad_norm": 0.13760733604431152, + "learning_rate": 0.0001999719713652489, + "loss": 12.2767, + "step": 758 + }, + { + "epoch": 0.12428360897330931, + "grad_norm": 0.19334371387958527, + "learning_rate": 0.00019997184884159336, + "loss": 12.2511, + "step": 759 + }, + { + "epoch": 0.12444735549369576, + "grad_norm": 0.24581773579120636, + "learning_rate": 0.00019997172605076084, + "loss": 12.2743, + "step": 760 + }, + { + "epoch": 0.1246111020140822, + "grad_norm": 0.19460995495319366, + "learning_rate": 0.0001999716029927517, + "loss": 12.2359, + "step": 761 + }, + { + "epoch": 0.12477484853446864, + "grad_norm": 0.25555893778800964, + "learning_rate": 0.00019997147966756623, + "loss": 12.2551, + "step": 762 + }, + { + "epoch": 0.12493859505485508, + "grad_norm": 0.2933256924152374, + "learning_rate": 0.00019997135607520482, + "loss": 12.2914, + "step": 763 + }, + { + "epoch": 0.12510234157524153, + "grad_norm": 0.14119243621826172, + "learning_rate": 0.0001999712322156678, + "loss": 12.2373, + "step": 764 + }, + { + "epoch": 0.12526608809562798, + "grad_norm": 0.18366780877113342, + "learning_rate": 0.00019997110808895542, + "loss": 12.2386, + "step": 765 + }, + { + "epoch": 0.1254298346160144, + "grad_norm": 0.19770678877830505, + "learning_rate": 0.0001999709836950681, + "loss": 12.3011, + "step": 766 + }, + { + "epoch": 0.12559358113640084, + "grad_norm": 0.17909152805805206, + "learning_rate": 0.00019997085903400614, + "loss": 12.2668, + "step": 767 + }, + { + "epoch": 0.1257573276567873, + "grad_norm": 0.1801517903804779, + "learning_rate": 0.00019997073410576985, + "loss": 12.2533, + "step": 768 + }, + { + "epoch": 0.12592107417717374, + "grad_norm": 0.1824531853199005, + "learning_rate": 0.0001999706089103596, + "loss": 12.2619, + "step": 769 + }, + { + "epoch": 0.1260848206975602, + "grad_norm": 0.2011738419532776, + "learning_rate": 0.00019997048344777568, + "loss": 12.2424, + "step": 770 + }, + { + "epoch": 0.1262485672179466, + "grad_norm": 0.17008745670318604, + "learning_rate": 0.00019997035771801848, + "loss": 12.2381, + "step": 771 + }, + { + "epoch": 0.12641231373833306, + "grad_norm": 0.13051459193229675, + "learning_rate": 0.00019997023172108828, + "loss": 12.2417, + "step": 772 + }, + { + "epoch": 0.1265760602587195, + "grad_norm": 0.18956008553504944, + "learning_rate": 0.00019997010545698548, + "loss": 12.2951, + "step": 773 + }, + { + "epoch": 0.12673980677910596, + "grad_norm": 0.17199958860874176, + "learning_rate": 0.00019996997892571036, + "loss": 12.2622, + "step": 774 + }, + { + "epoch": 0.12690355329949238, + "grad_norm": 0.23833072185516357, + "learning_rate": 0.00019996985212726332, + "loss": 12.2614, + "step": 775 + }, + { + "epoch": 0.12706729981987883, + "grad_norm": 0.1904534250497818, + "learning_rate": 0.00019996972506164463, + "loss": 12.2211, + "step": 776 + }, + { + "epoch": 0.12723104634026527, + "grad_norm": 0.22543828189373016, + "learning_rate": 0.00019996959772885468, + "loss": 12.2184, + "step": 777 + }, + { + "epoch": 0.12739479286065172, + "grad_norm": 0.15488769114017487, + "learning_rate": 0.00019996947012889375, + "loss": 12.249, + "step": 778 + }, + { + "epoch": 0.12755853938103814, + "grad_norm": 0.21749526262283325, + "learning_rate": 0.00019996934226176226, + "loss": 12.2726, + "step": 779 + }, + { + "epoch": 0.1277222859014246, + "grad_norm": 0.2531960904598236, + "learning_rate": 0.00019996921412746048, + "loss": 12.2587, + "step": 780 + }, + { + "epoch": 0.12788603242181104, + "grad_norm": 0.17422235012054443, + "learning_rate": 0.0001999690857259888, + "loss": 12.2422, + "step": 781 + }, + { + "epoch": 0.1280497789421975, + "grad_norm": 0.3590430021286011, + "learning_rate": 0.00019996895705734756, + "loss": 12.2198, + "step": 782 + }, + { + "epoch": 0.1282135254625839, + "grad_norm": 0.24437008798122406, + "learning_rate": 0.0001999688281215371, + "loss": 12.2169, + "step": 783 + }, + { + "epoch": 0.12837727198297036, + "grad_norm": 0.23471295833587646, + "learning_rate": 0.00019996869891855773, + "loss": 12.2544, + "step": 784 + }, + { + "epoch": 0.1285410185033568, + "grad_norm": 0.1604817807674408, + "learning_rate": 0.00019996856944840986, + "loss": 12.2623, + "step": 785 + }, + { + "epoch": 0.12870476502374326, + "grad_norm": 0.3521434962749481, + "learning_rate": 0.00019996843971109378, + "loss": 12.2066, + "step": 786 + }, + { + "epoch": 0.12886851154412968, + "grad_norm": 0.1714937835931778, + "learning_rate": 0.00019996830970660985, + "loss": 12.306, + "step": 787 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 0.19912664592266083, + "learning_rate": 0.00019996817943495842, + "loss": 12.2221, + "step": 788 + }, + { + "epoch": 0.12919600458490257, + "grad_norm": 0.16007201373577118, + "learning_rate": 0.00019996804889613984, + "loss": 12.2631, + "step": 789 + }, + { + "epoch": 0.12935975110528902, + "grad_norm": 0.18096835911273956, + "learning_rate": 0.00019996791809015447, + "loss": 12.2448, + "step": 790 + }, + { + "epoch": 0.12952349762567544, + "grad_norm": 0.11847981065511703, + "learning_rate": 0.00019996778701700265, + "loss": 12.2355, + "step": 791 + }, + { + "epoch": 0.1296872441460619, + "grad_norm": 0.20610485970973969, + "learning_rate": 0.00019996765567668475, + "loss": 12.2543, + "step": 792 + }, + { + "epoch": 0.12985099066644834, + "grad_norm": 0.18722973763942719, + "learning_rate": 0.00019996752406920107, + "loss": 12.2384, + "step": 793 + }, + { + "epoch": 0.1300147371868348, + "grad_norm": 0.11530886590480804, + "learning_rate": 0.00019996739219455199, + "loss": 12.2489, + "step": 794 + }, + { + "epoch": 0.1301784837072212, + "grad_norm": 0.20853202044963837, + "learning_rate": 0.00019996726005273787, + "loss": 12.2275, + "step": 795 + }, + { + "epoch": 0.13034223022760766, + "grad_norm": 0.16415412724018097, + "learning_rate": 0.00019996712764375907, + "loss": 12.2659, + "step": 796 + }, + { + "epoch": 0.1305059767479941, + "grad_norm": 0.2464807629585266, + "learning_rate": 0.00019996699496761594, + "loss": 12.278, + "step": 797 + }, + { + "epoch": 0.13066972326838056, + "grad_norm": 0.16332204639911652, + "learning_rate": 0.00019996686202430878, + "loss": 12.2765, + "step": 798 + }, + { + "epoch": 0.13083346978876698, + "grad_norm": 0.14092905819416046, + "learning_rate": 0.00019996672881383805, + "loss": 12.2648, + "step": 799 + }, + { + "epoch": 0.13099721630915342, + "grad_norm": 0.15404187142848969, + "learning_rate": 0.00019996659533620404, + "loss": 12.2341, + "step": 800 + }, + { + "epoch": 0.13116096282953987, + "grad_norm": 0.26723140478134155, + "learning_rate": 0.0001999664615914071, + "loss": 12.2548, + "step": 801 + }, + { + "epoch": 0.13132470934992632, + "grad_norm": 0.16581694781780243, + "learning_rate": 0.00019996632757944758, + "loss": 12.2801, + "step": 802 + }, + { + "epoch": 0.13148845587031274, + "grad_norm": 0.15285004675388336, + "learning_rate": 0.00019996619330032588, + "loss": 12.25, + "step": 803 + }, + { + "epoch": 0.1316522023906992, + "grad_norm": 0.25647279620170593, + "learning_rate": 0.00019996605875404234, + "loss": 12.2543, + "step": 804 + }, + { + "epoch": 0.13181594891108564, + "grad_norm": 0.17818157374858856, + "learning_rate": 0.00019996592394059732, + "loss": 12.269, + "step": 805 + }, + { + "epoch": 0.1319796954314721, + "grad_norm": 0.1777380108833313, + "learning_rate": 0.00019996578885999117, + "loss": 12.2547, + "step": 806 + }, + { + "epoch": 0.13214344195185854, + "grad_norm": 0.1474923938512802, + "learning_rate": 0.0001999656535122243, + "loss": 12.2643, + "step": 807 + }, + { + "epoch": 0.13230718847224496, + "grad_norm": 0.24534763395786285, + "learning_rate": 0.000199965517897297, + "loss": 12.2275, + "step": 808 + }, + { + "epoch": 0.1324709349926314, + "grad_norm": 0.148993581533432, + "learning_rate": 0.00019996538201520964, + "loss": 12.244, + "step": 809 + }, + { + "epoch": 0.13263468151301785, + "grad_norm": 0.20542854070663452, + "learning_rate": 0.00019996524586596263, + "loss": 12.2345, + "step": 810 + }, + { + "epoch": 0.1327984280334043, + "grad_norm": 0.1916378140449524, + "learning_rate": 0.0001999651094495563, + "loss": 12.2586, + "step": 811 + }, + { + "epoch": 0.13296217455379072, + "grad_norm": 0.153681218624115, + "learning_rate": 0.00019996497276599103, + "loss": 12.2098, + "step": 812 + }, + { + "epoch": 0.13312592107417717, + "grad_norm": 0.19748173654079437, + "learning_rate": 0.00019996483581526722, + "loss": 12.2717, + "step": 813 + }, + { + "epoch": 0.13328966759456362, + "grad_norm": 0.1617303192615509, + "learning_rate": 0.0001999646985973852, + "loss": 12.2432, + "step": 814 + }, + { + "epoch": 0.13345341411495007, + "grad_norm": 0.16775915026664734, + "learning_rate": 0.00019996456111234527, + "loss": 12.261, + "step": 815 + }, + { + "epoch": 0.1336171606353365, + "grad_norm": 0.36850646138191223, + "learning_rate": 0.0001999644233601479, + "loss": 12.223, + "step": 816 + }, + { + "epoch": 0.13378090715572294, + "grad_norm": 0.3190401792526245, + "learning_rate": 0.00019996428534079338, + "loss": 12.2746, + "step": 817 + }, + { + "epoch": 0.1339446536761094, + "grad_norm": 0.16210894286632538, + "learning_rate": 0.00019996414705428217, + "loss": 12.2498, + "step": 818 + }, + { + "epoch": 0.13410840019649584, + "grad_norm": 0.21445512771606445, + "learning_rate": 0.00019996400850061456, + "loss": 12.2344, + "step": 819 + }, + { + "epoch": 0.13427214671688226, + "grad_norm": 0.3667347729206085, + "learning_rate": 0.00019996386967979096, + "loss": 12.2759, + "step": 820 + }, + { + "epoch": 0.1344358932372687, + "grad_norm": 0.2724533975124359, + "learning_rate": 0.00019996373059181174, + "loss": 12.2529, + "step": 821 + }, + { + "epoch": 0.13459963975765515, + "grad_norm": 0.15931159257888794, + "learning_rate": 0.00019996359123667726, + "loss": 12.2872, + "step": 822 + }, + { + "epoch": 0.1347633862780416, + "grad_norm": 0.12415755540132523, + "learning_rate": 0.00019996345161438786, + "loss": 12.2219, + "step": 823 + }, + { + "epoch": 0.13492713279842802, + "grad_norm": 0.16354034841060638, + "learning_rate": 0.00019996331172494395, + "loss": 12.2623, + "step": 824 + }, + { + "epoch": 0.13509087931881447, + "grad_norm": 0.17267750203609467, + "learning_rate": 0.00019996317156834593, + "loss": 12.2492, + "step": 825 + }, + { + "epoch": 0.13525462583920092, + "grad_norm": 0.1510535180568695, + "learning_rate": 0.00019996303114459414, + "loss": 12.2511, + "step": 826 + }, + { + "epoch": 0.13541837235958737, + "grad_norm": 0.13352897763252258, + "learning_rate": 0.00019996289045368895, + "loss": 12.2484, + "step": 827 + }, + { + "epoch": 0.1355821188799738, + "grad_norm": 0.2510572075843811, + "learning_rate": 0.00019996274949563074, + "loss": 12.2706, + "step": 828 + }, + { + "epoch": 0.13574586540036024, + "grad_norm": 0.1663379818201065, + "learning_rate": 0.0001999626082704199, + "loss": 12.2767, + "step": 829 + }, + { + "epoch": 0.1359096119207467, + "grad_norm": 0.22475042939186096, + "learning_rate": 0.0001999624667780568, + "loss": 12.2486, + "step": 830 + }, + { + "epoch": 0.13607335844113314, + "grad_norm": 0.17449508607387543, + "learning_rate": 0.00019996232501854182, + "loss": 12.2641, + "step": 831 + }, + { + "epoch": 0.13623710496151956, + "grad_norm": 0.19512225687503815, + "learning_rate": 0.00019996218299187532, + "loss": 12.2233, + "step": 832 + }, + { + "epoch": 0.136400851481906, + "grad_norm": 0.2455431967973709, + "learning_rate": 0.00019996204069805772, + "loss": 12.2716, + "step": 833 + }, + { + "epoch": 0.13656459800229245, + "grad_norm": 0.18555228412151337, + "learning_rate": 0.00019996189813708938, + "loss": 12.2554, + "step": 834 + }, + { + "epoch": 0.1367283445226789, + "grad_norm": 0.17858745157718658, + "learning_rate": 0.0001999617553089707, + "loss": 12.2599, + "step": 835 + }, + { + "epoch": 0.13689209104306532, + "grad_norm": 0.23524503409862518, + "learning_rate": 0.00019996161221370198, + "loss": 12.2692, + "step": 836 + }, + { + "epoch": 0.13705583756345177, + "grad_norm": 0.1710844486951828, + "learning_rate": 0.0001999614688512837, + "loss": 12.2646, + "step": 837 + }, + { + "epoch": 0.13721958408383822, + "grad_norm": 0.35525500774383545, + "learning_rate": 0.00019996132522171617, + "loss": 12.3043, + "step": 838 + }, + { + "epoch": 0.13738333060422467, + "grad_norm": 0.13505268096923828, + "learning_rate": 0.00019996118132499985, + "loss": 12.2324, + "step": 839 + }, + { + "epoch": 0.1375470771246111, + "grad_norm": 0.23420777916908264, + "learning_rate": 0.00019996103716113508, + "loss": 12.262, + "step": 840 + }, + { + "epoch": 0.13771082364499754, + "grad_norm": 0.22859814763069153, + "learning_rate": 0.00019996089273012225, + "loss": 12.2815, + "step": 841 + }, + { + "epoch": 0.137874570165384, + "grad_norm": 0.23992104828357697, + "learning_rate": 0.0001999607480319617, + "loss": 12.2568, + "step": 842 + }, + { + "epoch": 0.13803831668577043, + "grad_norm": 0.2638041377067566, + "learning_rate": 0.0001999606030666539, + "loss": 12.2447, + "step": 843 + }, + { + "epoch": 0.13820206320615686, + "grad_norm": 0.15055051445960999, + "learning_rate": 0.00019996045783419919, + "loss": 12.2772, + "step": 844 + }, + { + "epoch": 0.1383658097265433, + "grad_norm": 0.31120771169662476, + "learning_rate": 0.00019996031233459797, + "loss": 12.2348, + "step": 845 + }, + { + "epoch": 0.13852955624692975, + "grad_norm": 0.14593814313411713, + "learning_rate": 0.00019996016656785063, + "loss": 12.2528, + "step": 846 + }, + { + "epoch": 0.1386933027673162, + "grad_norm": 0.28545406460762024, + "learning_rate": 0.00019996002053395757, + "loss": 12.2506, + "step": 847 + }, + { + "epoch": 0.13885704928770265, + "grad_norm": 0.1730160117149353, + "learning_rate": 0.00019995987423291914, + "loss": 12.2506, + "step": 848 + }, + { + "epoch": 0.13902079580808907, + "grad_norm": 0.17055395245552063, + "learning_rate": 0.00019995972766473577, + "loss": 12.246, + "step": 849 + }, + { + "epoch": 0.13918454232847552, + "grad_norm": 0.14731362462043762, + "learning_rate": 0.00019995958082940783, + "loss": 12.2518, + "step": 850 + }, + { + "epoch": 0.13934828884886197, + "grad_norm": 0.3664889633655548, + "learning_rate": 0.00019995943372693573, + "loss": 12.291, + "step": 851 + }, + { + "epoch": 0.13951203536924842, + "grad_norm": 0.23178958892822266, + "learning_rate": 0.00019995928635731987, + "loss": 12.2564, + "step": 852 + }, + { + "epoch": 0.13967578188963484, + "grad_norm": 0.2134544551372528, + "learning_rate": 0.0001999591387205606, + "loss": 12.2453, + "step": 853 + }, + { + "epoch": 0.13983952841002129, + "grad_norm": 0.24148419499397278, + "learning_rate": 0.00019995899081665835, + "loss": 12.2928, + "step": 854 + }, + { + "epoch": 0.14000327493040773, + "grad_norm": 0.18369217216968536, + "learning_rate": 0.00019995884264561352, + "loss": 12.2615, + "step": 855 + }, + { + "epoch": 0.14016702145079418, + "grad_norm": 0.1529937982559204, + "learning_rate": 0.0001999586942074265, + "loss": 12.2502, + "step": 856 + }, + { + "epoch": 0.1403307679711806, + "grad_norm": 0.21245932579040527, + "learning_rate": 0.00019995854550209762, + "loss": 12.2318, + "step": 857 + }, + { + "epoch": 0.14049451449156705, + "grad_norm": 0.19340169429779053, + "learning_rate": 0.0001999583965296274, + "loss": 12.2613, + "step": 858 + }, + { + "epoch": 0.1406582610119535, + "grad_norm": 0.16757652163505554, + "learning_rate": 0.00019995824729001615, + "loss": 12.2642, + "step": 859 + }, + { + "epoch": 0.14082200753233995, + "grad_norm": 0.1938764750957489, + "learning_rate": 0.0001999580977832643, + "loss": 12.263, + "step": 860 + }, + { + "epoch": 0.14098575405272637, + "grad_norm": 0.15432967245578766, + "learning_rate": 0.00019995794800937227, + "loss": 12.2581, + "step": 861 + }, + { + "epoch": 0.14114950057311282, + "grad_norm": 0.17737895250320435, + "learning_rate": 0.00019995779796834042, + "loss": 12.2488, + "step": 862 + }, + { + "epoch": 0.14131324709349927, + "grad_norm": 0.21752485632896423, + "learning_rate": 0.00019995764766016913, + "loss": 12.2289, + "step": 863 + }, + { + "epoch": 0.14147699361388572, + "grad_norm": 0.21756303310394287, + "learning_rate": 0.00019995749708485888, + "loss": 12.2532, + "step": 864 + }, + { + "epoch": 0.14164074013427214, + "grad_norm": 0.18429669737815857, + "learning_rate": 0.00019995734624241002, + "loss": 12.2623, + "step": 865 + }, + { + "epoch": 0.14180448665465858, + "grad_norm": 0.26883766055107117, + "learning_rate": 0.00019995719513282297, + "loss": 12.246, + "step": 866 + }, + { + "epoch": 0.14196823317504503, + "grad_norm": 0.16113273799419403, + "learning_rate": 0.00019995704375609812, + "loss": 12.2727, + "step": 867 + }, + { + "epoch": 0.14213197969543148, + "grad_norm": 0.18849468231201172, + "learning_rate": 0.00019995689211223589, + "loss": 12.2752, + "step": 868 + }, + { + "epoch": 0.1422957262158179, + "grad_norm": 0.18139471113681793, + "learning_rate": 0.00019995674020123664, + "loss": 12.2799, + "step": 869 + }, + { + "epoch": 0.14245947273620435, + "grad_norm": 0.1637876331806183, + "learning_rate": 0.00019995658802310085, + "loss": 12.2431, + "step": 870 + }, + { + "epoch": 0.1426232192565908, + "grad_norm": 0.2694747745990753, + "learning_rate": 0.00019995643557782887, + "loss": 12.2353, + "step": 871 + }, + { + "epoch": 0.14278696577697725, + "grad_norm": 0.18601356446743011, + "learning_rate": 0.00019995628286542113, + "loss": 12.2567, + "step": 872 + }, + { + "epoch": 0.14295071229736367, + "grad_norm": 0.15186317265033722, + "learning_rate": 0.00019995612988587803, + "loss": 12.2468, + "step": 873 + }, + { + "epoch": 0.14311445881775012, + "grad_norm": 0.1920851171016693, + "learning_rate": 0.00019995597663920002, + "loss": 12.2523, + "step": 874 + }, + { + "epoch": 0.14327820533813657, + "grad_norm": 0.16891133785247803, + "learning_rate": 0.00019995582312538744, + "loss": 12.258, + "step": 875 + }, + { + "epoch": 0.14344195185852301, + "grad_norm": 0.20723503828048706, + "learning_rate": 0.00019995566934444075, + "loss": 12.261, + "step": 876 + }, + { + "epoch": 0.14360569837890944, + "grad_norm": 0.1818312555551529, + "learning_rate": 0.00019995551529636033, + "loss": 12.2568, + "step": 877 + }, + { + "epoch": 0.14376944489929588, + "grad_norm": 0.2277580350637436, + "learning_rate": 0.0001999553609811466, + "loss": 12.2225, + "step": 878 + }, + { + "epoch": 0.14393319141968233, + "grad_norm": 0.2139032483100891, + "learning_rate": 0.00019995520639880002, + "loss": 12.2652, + "step": 879 + }, + { + "epoch": 0.14409693794006878, + "grad_norm": 0.14729733765125275, + "learning_rate": 0.00019995505154932092, + "loss": 12.2582, + "step": 880 + }, + { + "epoch": 0.1442606844604552, + "grad_norm": 0.2961966395378113, + "learning_rate": 0.00019995489643270976, + "loss": 12.256, + "step": 881 + }, + { + "epoch": 0.14442443098084165, + "grad_norm": 0.25167980790138245, + "learning_rate": 0.00019995474104896697, + "loss": 12.2299, + "step": 882 + }, + { + "epoch": 0.1445881775012281, + "grad_norm": 0.19484156370162964, + "learning_rate": 0.00019995458539809292, + "loss": 12.2448, + "step": 883 + }, + { + "epoch": 0.14475192402161455, + "grad_norm": 0.28314441442489624, + "learning_rate": 0.0001999544294800881, + "loss": 12.2692, + "step": 884 + }, + { + "epoch": 0.144915670542001, + "grad_norm": 0.2943701446056366, + "learning_rate": 0.00019995427329495282, + "loss": 12.2277, + "step": 885 + }, + { + "epoch": 0.14507941706238742, + "grad_norm": 0.30075713992118835, + "learning_rate": 0.0001999541168426876, + "loss": 12.2477, + "step": 886 + }, + { + "epoch": 0.14524316358277387, + "grad_norm": 0.19734449684619904, + "learning_rate": 0.0001999539601232928, + "loss": 12.249, + "step": 887 + }, + { + "epoch": 0.14540691010316031, + "grad_norm": 0.23548541963100433, + "learning_rate": 0.00019995380313676884, + "loss": 12.267, + "step": 888 + }, + { + "epoch": 0.14557065662354676, + "grad_norm": 0.12990887463092804, + "learning_rate": 0.00019995364588311617, + "loss": 12.2397, + "step": 889 + }, + { + "epoch": 0.14573440314393318, + "grad_norm": 0.1529313325881958, + "learning_rate": 0.00019995348836233516, + "loss": 12.2598, + "step": 890 + }, + { + "epoch": 0.14589814966431963, + "grad_norm": 0.17717255651950836, + "learning_rate": 0.0001999533305744263, + "loss": 12.2697, + "step": 891 + }, + { + "epoch": 0.14606189618470608, + "grad_norm": 0.1448763757944107, + "learning_rate": 0.00019995317251938994, + "loss": 12.2429, + "step": 892 + }, + { + "epoch": 0.14622564270509253, + "grad_norm": 0.18021667003631592, + "learning_rate": 0.00019995301419722657, + "loss": 12.2432, + "step": 893 + }, + { + "epoch": 0.14638938922547895, + "grad_norm": 0.2443927526473999, + "learning_rate": 0.00019995285560793656, + "loss": 12.2451, + "step": 894 + }, + { + "epoch": 0.1465531357458654, + "grad_norm": 0.2532816231250763, + "learning_rate": 0.00019995269675152037, + "loss": 12.2565, + "step": 895 + }, + { + "epoch": 0.14671688226625185, + "grad_norm": 0.24490387737751007, + "learning_rate": 0.00019995253762797842, + "loss": 12.2618, + "step": 896 + }, + { + "epoch": 0.1468806287866383, + "grad_norm": 0.16988995671272278, + "learning_rate": 0.00019995237823731109, + "loss": 12.2267, + "step": 897 + }, + { + "epoch": 0.14704437530702472, + "grad_norm": 0.1722571700811386, + "learning_rate": 0.00019995221857951884, + "loss": 12.2419, + "step": 898 + }, + { + "epoch": 0.14720812182741116, + "grad_norm": 0.26967161893844604, + "learning_rate": 0.00019995205865460213, + "loss": 12.2608, + "step": 899 + }, + { + "epoch": 0.1473718683477976, + "grad_norm": 0.16842836141586304, + "learning_rate": 0.00019995189846256132, + "loss": 12.2694, + "step": 900 + }, + { + "epoch": 0.14753561486818406, + "grad_norm": 0.16113752126693726, + "learning_rate": 0.00019995173800339692, + "loss": 12.2416, + "step": 901 + }, + { + "epoch": 0.14769936138857048, + "grad_norm": 0.23641365766525269, + "learning_rate": 0.00019995157727710928, + "loss": 12.2583, + "step": 902 + }, + { + "epoch": 0.14786310790895693, + "grad_norm": 0.15737037360668182, + "learning_rate": 0.00019995141628369883, + "loss": 12.2499, + "step": 903 + }, + { + "epoch": 0.14802685442934338, + "grad_norm": 0.17378929257392883, + "learning_rate": 0.0001999512550231661, + "loss": 12.2601, + "step": 904 + }, + { + "epoch": 0.14819060094972983, + "grad_norm": 0.16688664257526398, + "learning_rate": 0.00019995109349551143, + "loss": 12.2777, + "step": 905 + }, + { + "epoch": 0.14835434747011625, + "grad_norm": 0.1975651979446411, + "learning_rate": 0.00019995093170073523, + "loss": 12.2414, + "step": 906 + }, + { + "epoch": 0.1485180939905027, + "grad_norm": 0.16175058484077454, + "learning_rate": 0.00019995076963883802, + "loss": 12.2685, + "step": 907 + }, + { + "epoch": 0.14868184051088915, + "grad_norm": 0.21172209084033966, + "learning_rate": 0.00019995060730982017, + "loss": 12.2506, + "step": 908 + }, + { + "epoch": 0.1488455870312756, + "grad_norm": 0.16677246987819672, + "learning_rate": 0.00019995044471368215, + "loss": 12.2602, + "step": 909 + }, + { + "epoch": 0.14900933355166202, + "grad_norm": 0.1518072932958603, + "learning_rate": 0.00019995028185042438, + "loss": 12.2631, + "step": 910 + }, + { + "epoch": 0.14917308007204846, + "grad_norm": 0.17491476237773895, + "learning_rate": 0.00019995011872004726, + "loss": 12.2561, + "step": 911 + }, + { + "epoch": 0.1493368265924349, + "grad_norm": 0.2581184208393097, + "learning_rate": 0.00019994995532255128, + "loss": 12.252, + "step": 912 + }, + { + "epoch": 0.14950057311282136, + "grad_norm": 0.15219415724277496, + "learning_rate": 0.00019994979165793684, + "loss": 12.2523, + "step": 913 + }, + { + "epoch": 0.14966431963320778, + "grad_norm": 0.16342870891094208, + "learning_rate": 0.00019994962772620442, + "loss": 12.2635, + "step": 914 + }, + { + "epoch": 0.14982806615359423, + "grad_norm": 0.25702103972435, + "learning_rate": 0.00019994946352735443, + "loss": 12.2455, + "step": 915 + }, + { + "epoch": 0.14999181267398068, + "grad_norm": 0.27089154720306396, + "learning_rate": 0.00019994929906138727, + "loss": 12.2748, + "step": 916 + }, + { + "epoch": 0.15015555919436713, + "grad_norm": 0.22161947190761566, + "learning_rate": 0.00019994913432830344, + "loss": 12.2711, + "step": 917 + }, + { + "epoch": 0.15031930571475355, + "grad_norm": 0.21250496804714203, + "learning_rate": 0.00019994896932810338, + "loss": 12.2435, + "step": 918 + }, + { + "epoch": 0.15048305223514, + "grad_norm": 0.21829836070537567, + "learning_rate": 0.00019994880406078752, + "loss": 12.2737, + "step": 919 + }, + { + "epoch": 0.15064679875552645, + "grad_norm": 0.2482108175754547, + "learning_rate": 0.00019994863852635625, + "loss": 12.2545, + "step": 920 + }, + { + "epoch": 0.1508105452759129, + "grad_norm": 0.20951540768146515, + "learning_rate": 0.00019994847272481007, + "loss": 12.2355, + "step": 921 + }, + { + "epoch": 0.15097429179629934, + "grad_norm": 0.18365690112113953, + "learning_rate": 0.00019994830665614943, + "loss": 12.2399, + "step": 922 + }, + { + "epoch": 0.15113803831668576, + "grad_norm": 0.22276079654693604, + "learning_rate": 0.0001999481403203747, + "loss": 12.3016, + "step": 923 + }, + { + "epoch": 0.1513017848370722, + "grad_norm": 0.2162826955318451, + "learning_rate": 0.00019994797371748643, + "loss": 12.2524, + "step": 924 + }, + { + "epoch": 0.15146553135745866, + "grad_norm": 0.21627525985240936, + "learning_rate": 0.000199947806847485, + "loss": 12.2235, + "step": 925 + }, + { + "epoch": 0.1516292778778451, + "grad_norm": 0.1927967220544815, + "learning_rate": 0.00019994763971037087, + "loss": 12.2738, + "step": 926 + }, + { + "epoch": 0.15179302439823153, + "grad_norm": 0.16526252031326294, + "learning_rate": 0.0001999474723061445, + "loss": 12.2413, + "step": 927 + }, + { + "epoch": 0.15195677091861798, + "grad_norm": 0.23166383802890778, + "learning_rate": 0.00019994730463480625, + "loss": 12.2622, + "step": 928 + }, + { + "epoch": 0.15212051743900443, + "grad_norm": 0.14684942364692688, + "learning_rate": 0.0001999471366963567, + "loss": 12.3034, + "step": 929 + }, + { + "epoch": 0.15228426395939088, + "grad_norm": 0.20183612406253815, + "learning_rate": 0.0001999469684907962, + "loss": 12.2465, + "step": 930 + }, + { + "epoch": 0.1524480104797773, + "grad_norm": 0.2674098312854767, + "learning_rate": 0.00019994680001812526, + "loss": 12.2674, + "step": 931 + }, + { + "epoch": 0.15261175700016374, + "grad_norm": 0.25609856843948364, + "learning_rate": 0.00019994663127834432, + "loss": 12.2579, + "step": 932 + }, + { + "epoch": 0.1527755035205502, + "grad_norm": 0.208163782954216, + "learning_rate": 0.0001999464622714538, + "loss": 12.2191, + "step": 933 + }, + { + "epoch": 0.15293925004093664, + "grad_norm": 0.23667578399181366, + "learning_rate": 0.0001999462929974542, + "loss": 12.2729, + "step": 934 + }, + { + "epoch": 0.15310299656132306, + "grad_norm": 0.1899147778749466, + "learning_rate": 0.00019994612345634592, + "loss": 12.2478, + "step": 935 + }, + { + "epoch": 0.1532667430817095, + "grad_norm": 0.2460227757692337, + "learning_rate": 0.00019994595364812944, + "loss": 12.2444, + "step": 936 + }, + { + "epoch": 0.15343048960209596, + "grad_norm": 0.27262449264526367, + "learning_rate": 0.0001999457835728052, + "loss": 12.239, + "step": 937 + }, + { + "epoch": 0.1535942361224824, + "grad_norm": 0.25000107288360596, + "learning_rate": 0.0001999456132303737, + "loss": 12.258, + "step": 938 + }, + { + "epoch": 0.15375798264286883, + "grad_norm": 0.21081086993217468, + "learning_rate": 0.0001999454426208353, + "loss": 12.2568, + "step": 939 + }, + { + "epoch": 0.15392172916325528, + "grad_norm": 0.27827292680740356, + "learning_rate": 0.00019994527174419056, + "loss": 12.2283, + "step": 940 + }, + { + "epoch": 0.15408547568364173, + "grad_norm": 0.26305484771728516, + "learning_rate": 0.00019994510060043988, + "loss": 12.2749, + "step": 941 + }, + { + "epoch": 0.15424922220402817, + "grad_norm": 0.21291232109069824, + "learning_rate": 0.00019994492918958372, + "loss": 12.2384, + "step": 942 + }, + { + "epoch": 0.1544129687244146, + "grad_norm": 0.17280226945877075, + "learning_rate": 0.00019994475751162257, + "loss": 12.2729, + "step": 943 + }, + { + "epoch": 0.15457671524480104, + "grad_norm": 0.16324912011623383, + "learning_rate": 0.00019994458556655685, + "loss": 12.2481, + "step": 944 + }, + { + "epoch": 0.1547404617651875, + "grad_norm": 0.18047533929347992, + "learning_rate": 0.00019994441335438704, + "loss": 12.2624, + "step": 945 + }, + { + "epoch": 0.15490420828557394, + "grad_norm": 0.1873340606689453, + "learning_rate": 0.0001999442408751136, + "loss": 12.2531, + "step": 946 + }, + { + "epoch": 0.15506795480596036, + "grad_norm": 0.30801185965538025, + "learning_rate": 0.000199944068128737, + "loss": 12.3044, + "step": 947 + }, + { + "epoch": 0.1552317013263468, + "grad_norm": 0.23735474050045013, + "learning_rate": 0.0001999438951152577, + "loss": 12.2282, + "step": 948 + }, + { + "epoch": 0.15539544784673326, + "grad_norm": 0.18847191333770752, + "learning_rate": 0.0001999437218346761, + "loss": 12.2607, + "step": 949 + }, + { + "epoch": 0.1555591943671197, + "grad_norm": 0.2875593900680542, + "learning_rate": 0.00019994354828699275, + "loss": 12.2203, + "step": 950 + }, + { + "epoch": 0.15572294088750613, + "grad_norm": 0.20091582834720612, + "learning_rate": 0.00019994337447220808, + "loss": 12.2733, + "step": 951 + }, + { + "epoch": 0.15588668740789258, + "grad_norm": 0.20508666336536407, + "learning_rate": 0.00019994320039032253, + "loss": 12.2486, + "step": 952 + }, + { + "epoch": 0.15605043392827903, + "grad_norm": 0.20233626663684845, + "learning_rate": 0.0001999430260413366, + "loss": 12.2954, + "step": 953 + }, + { + "epoch": 0.15621418044866547, + "grad_norm": 0.15181776881217957, + "learning_rate": 0.00019994285142525074, + "loss": 12.239, + "step": 954 + }, + { + "epoch": 0.1563779269690519, + "grad_norm": 0.1466662883758545, + "learning_rate": 0.00019994267654206544, + "loss": 12.2513, + "step": 955 + }, + { + "epoch": 0.15654167348943834, + "grad_norm": 0.3135926127433777, + "learning_rate": 0.00019994250139178114, + "loss": 12.2292, + "step": 956 + }, + { + "epoch": 0.1567054200098248, + "grad_norm": 0.23405830562114716, + "learning_rate": 0.00019994232597439832, + "loss": 12.2669, + "step": 957 + }, + { + "epoch": 0.15686916653021124, + "grad_norm": 0.20362041890621185, + "learning_rate": 0.00019994215028991744, + "loss": 12.2616, + "step": 958 + }, + { + "epoch": 0.15703291305059766, + "grad_norm": 0.1632128357887268, + "learning_rate": 0.00019994197433833896, + "loss": 12.222, + "step": 959 + }, + { + "epoch": 0.1571966595709841, + "grad_norm": 0.1670769453048706, + "learning_rate": 0.00019994179811966335, + "loss": 12.2788, + "step": 960 + }, + { + "epoch": 0.15736040609137056, + "grad_norm": 0.26559993624687195, + "learning_rate": 0.00019994162163389115, + "loss": 12.2452, + "step": 961 + }, + { + "epoch": 0.157524152611757, + "grad_norm": 0.26576483249664307, + "learning_rate": 0.00019994144488102276, + "loss": 12.2494, + "step": 962 + }, + { + "epoch": 0.15768789913214346, + "grad_norm": 0.13807259500026703, + "learning_rate": 0.00019994126786105864, + "loss": 12.2752, + "step": 963 + }, + { + "epoch": 0.15785164565252988, + "grad_norm": 0.155136376619339, + "learning_rate": 0.0001999410905739993, + "loss": 12.3317, + "step": 964 + }, + { + "epoch": 0.15801539217291632, + "grad_norm": 0.19782254099845886, + "learning_rate": 0.00019994091301984526, + "loss": 12.2419, + "step": 965 + }, + { + "epoch": 0.15817913869330277, + "grad_norm": 0.2084280252456665, + "learning_rate": 0.0001999407351985969, + "loss": 12.2583, + "step": 966 + }, + { + "epoch": 0.15834288521368922, + "grad_norm": 0.21387748420238495, + "learning_rate": 0.00019994055711025472, + "loss": 12.2722, + "step": 967 + }, + { + "epoch": 0.15850663173407564, + "grad_norm": 0.45252978801727295, + "learning_rate": 0.00019994037875481924, + "loss": 12.2957, + "step": 968 + }, + { + "epoch": 0.1586703782544621, + "grad_norm": 0.19610193371772766, + "learning_rate": 0.00019994020013229088, + "loss": 12.2351, + "step": 969 + }, + { + "epoch": 0.15883412477484854, + "grad_norm": 0.12732064723968506, + "learning_rate": 0.00019994002124267018, + "loss": 12.2691, + "step": 970 + }, + { + "epoch": 0.158997871295235, + "grad_norm": 0.1697031557559967, + "learning_rate": 0.00019993984208595758, + "loss": 12.2162, + "step": 971 + }, + { + "epoch": 0.1591616178156214, + "grad_norm": 0.3649829030036926, + "learning_rate": 0.00019993966266215355, + "loss": 12.265, + "step": 972 + }, + { + "epoch": 0.15932536433600786, + "grad_norm": 0.26839733123779297, + "learning_rate": 0.0001999394829712586, + "loss": 12.2342, + "step": 973 + }, + { + "epoch": 0.1594891108563943, + "grad_norm": 0.22787222266197205, + "learning_rate": 0.00019993930301327316, + "loss": 12.2668, + "step": 974 + }, + { + "epoch": 0.15965285737678075, + "grad_norm": 0.16727082431316376, + "learning_rate": 0.00019993912278819777, + "loss": 12.25, + "step": 975 + }, + { + "epoch": 0.15981660389716718, + "grad_norm": 0.17583709955215454, + "learning_rate": 0.00019993894229603288, + "loss": 12.2442, + "step": 976 + }, + { + "epoch": 0.15998035041755362, + "grad_norm": 0.16142424941062927, + "learning_rate": 0.00019993876153677899, + "loss": 12.2763, + "step": 977 + }, + { + "epoch": 0.16014409693794007, + "grad_norm": 0.17407581210136414, + "learning_rate": 0.00019993858051043655, + "loss": 12.2548, + "step": 978 + }, + { + "epoch": 0.16030784345832652, + "grad_norm": 0.27629950642585754, + "learning_rate": 0.0001999383992170061, + "loss": 12.2643, + "step": 979 + }, + { + "epoch": 0.16047158997871294, + "grad_norm": 0.2096289098262787, + "learning_rate": 0.00019993821765648803, + "loss": 12.2526, + "step": 980 + }, + { + "epoch": 0.1606353364990994, + "grad_norm": 0.2241704910993576, + "learning_rate": 0.00019993803582888292, + "loss": 12.283, + "step": 981 + }, + { + "epoch": 0.16079908301948584, + "grad_norm": 0.1575346291065216, + "learning_rate": 0.00019993785373419122, + "loss": 12.2503, + "step": 982 + }, + { + "epoch": 0.1609628295398723, + "grad_norm": 0.3286789655685425, + "learning_rate": 0.0001999376713724134, + "loss": 12.2153, + "step": 983 + }, + { + "epoch": 0.1611265760602587, + "grad_norm": 0.17322884500026703, + "learning_rate": 0.00019993748874355, + "loss": 12.2841, + "step": 984 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 0.18421022593975067, + "learning_rate": 0.00019993730584760147, + "loss": 12.2642, + "step": 985 + }, + { + "epoch": 0.1614540691010316, + "grad_norm": 0.2500661611557007, + "learning_rate": 0.00019993712268456828, + "loss": 12.2554, + "step": 986 + }, + { + "epoch": 0.16161781562141805, + "grad_norm": 0.13105358183383942, + "learning_rate": 0.00019993693925445094, + "loss": 12.261, + "step": 987 + }, + { + "epoch": 0.16178156214180447, + "grad_norm": 0.1763312965631485, + "learning_rate": 0.00019993675555724996, + "loss": 12.2281, + "step": 988 + }, + { + "epoch": 0.16194530866219092, + "grad_norm": 0.14316731691360474, + "learning_rate": 0.00019993657159296578, + "loss": 12.2557, + "step": 989 + }, + { + "epoch": 0.16210905518257737, + "grad_norm": 0.19045111536979675, + "learning_rate": 0.00019993638736159896, + "loss": 12.2593, + "step": 990 + }, + { + "epoch": 0.16227280170296382, + "grad_norm": 0.15188392996788025, + "learning_rate": 0.00019993620286314995, + "loss": 12.2705, + "step": 991 + }, + { + "epoch": 0.16243654822335024, + "grad_norm": 0.26927465200424194, + "learning_rate": 0.00019993601809761922, + "loss": 12.3048, + "step": 992 + }, + { + "epoch": 0.1626002947437367, + "grad_norm": 0.2550506591796875, + "learning_rate": 0.00019993583306500732, + "loss": 12.2609, + "step": 993 + }, + { + "epoch": 0.16276404126412314, + "grad_norm": 0.27006834745407104, + "learning_rate": 0.0001999356477653147, + "loss": 12.2305, + "step": 994 + }, + { + "epoch": 0.1629277877845096, + "grad_norm": 0.1495426744222641, + "learning_rate": 0.00019993546219854188, + "loss": 12.2397, + "step": 995 + }, + { + "epoch": 0.163091534304896, + "grad_norm": 0.1674230992794037, + "learning_rate": 0.00019993527636468937, + "loss": 12.2468, + "step": 996 + }, + { + "epoch": 0.16325528082528246, + "grad_norm": 0.1745332032442093, + "learning_rate": 0.0001999350902637576, + "loss": 12.2406, + "step": 997 + }, + { + "epoch": 0.1634190273456689, + "grad_norm": 0.1422467827796936, + "learning_rate": 0.00019993490389574715, + "loss": 12.2305, + "step": 998 + }, + { + "epoch": 0.16358277386605535, + "grad_norm": 0.21741586923599243, + "learning_rate": 0.0001999347172606585, + "loss": 12.2306, + "step": 999 + }, + { + "epoch": 0.1637465203864418, + "grad_norm": 0.15799422562122345, + "learning_rate": 0.00019993453035849207, + "loss": 12.2591, + "step": 1000 + }, + { + "epoch": 0.1637465203864418, + "eval_loss": 12.25197696685791, + "eval_runtime": 7.2766, + "eval_samples_per_second": 33.807, + "eval_steps_per_second": 16.903, + "step": 1000 } ], "logging_steps": 1, - "max_steps": 18321, + "max_steps": 61070, "num_input_tokens_seen": 0, - "num_train_epochs": 3, - "save_steps": 100, + "num_train_epochs": 10, + "save_steps": 1000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { - "early_stopping_patience": 4, + "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { @@ -3583,7 +7051,7 @@ "attributes": {} } }, - "total_flos": 320946634752.0, + "total_flos": 641772748800.0, "train_batch_size": 2, "trial_name": null, "trial_params": null